def getPersonNames(string):
    # Tag the text with two Stanford NER servers (ports 9199 and 9198).
    s = string.replace("\n", " . ")
    tagger = Ner(host='localhost', port=9199)
    taggedEng = tagger.get_entities(s)
    tagger = Ner(host='localhost', port=9198)
    taggedInd = tagger.get_entities(s)

    namesList = []
    name = None
    # Collect runs of consecutive PERSON tokens into full names.
    for word in taggedEng:
        if name is not None and word[1] == "PERSON":
            name += " " + str(word[0])
        elif name is None and word[1] == "PERSON":
            name = str(word[0])
        elif name is not None:
            namesList.append(name.lower().replace("\n", ""))
            name = None
    if name is not None:
        namesList.append(name.lower().replace("\n", ""))
        name = None

    for word in taggedInd:
        if name is not None and word[1] == "PERSON":
            name += " " + str(word[0])
        elif name is None and word[1] == "PERSON":
            name = str(word[0])
        elif name is not None:
            namesList.append(name.lower().replace("\n", ""))
            name = None
    if name is not None:
        namesList.append(name.lower().replace("\n", ""))

    # Drop names that are substrings of another collected name
    # (e.g. "obama" is removed when "barack obama" is also present).
    n = len(namesList)
    i = 0
    while i < n:
        name = namesList[i]
        j = 0
        while j < n:
            if name in namesList[j] and i != j:
                namesList.pop(i)
                n -= 1
                i -= 1
                break
            j += 1
        i += 1
    return namesList
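A minimal driver for getPersonNames() might look like the sketch below. It assumes the sner client is installed (pip install sner) and that the two NER servers hard-coded in the function (ports 9199 and 9198) are already running; the sample text is illustrative only.

from sner import Ner  # client class used by getPersonNames(); requires running NER servers

if __name__ == "__main__":
    # Illustrative input; any text with newline-separated sentences works.
    sample = "Barack Obama met Angela Merkel in Berlin.\nObama left the next day."
    # Partial names such as "obama" are dropped when a longer match
    # ("barack obama") is also collected.
    print(getPersonNames(sample))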
def extract_unknown_ner(sentences_df, TEXT_COL='sentences', NER_COL='named_entities', ner_port=9199):
    '''
    Extract named entities using Stanford's NER. Requires a Java server to be
    launched already.

    sentences_df: pandas dataframe with one column that contains non-lowercased sentences
    TEXT_COL: name of column with sentences
    NER_COL: str name of column for output
    '''
    # To run this, you need to set up an SNER local server:
    # download Stanford NER (a zip file of the form stanford-ner-YYYY-MM-DD,
    # e.g. from https://nlp.stanford.edu/software/CRF-NER.shtml#Download),
    # then start the Java server:
    #   cd C:\ProgramData\Anaconda3\Lib\site-packages\sner\stanford-ner-2018-02-27
    #   java -Djava.ext.dirs=./lib -cp stanford-ner.jar edu.stanford.nlp.ie.NERServer -port 9199 -loadClassifier ./classifiers/english.all.3class.distsim.crf.ser.gz -tokenizerFactory edu.stanford.nlp.process.WhitespaceTokenizer -tokenizerOptions tokenizeNLs=false

    # Filter to sentences long enough to carry sentiment and a player name.
    min_length = 10  # characters
    sentences_df = sentences_df[sentences_df[TEXT_COL].str.len() >= min_length]

    # Tag using the Java server.
    pos_tagger = Ner(host='localhost', port=ner_port)

    # Would love to parallelize this, as it takes ~2-3 hours per year of data.
    # ddf = dd.from_pandas(sentences_df)
    sner_entities = lambda text: [
        token for token, part in pos_tagger.get_entities(text)
        if part in {'PERSON', 'ORGANIZATION', 'LOCATION'}
    ]
    sentences_df[NER_COL] = sentences_df[TEXT_COL].apply(
        lambda doc: sner_entities(doc))
    return sentences_df
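A hedged usage sketch for extract_unknown_ner(): it assumes pandas is available and that the Java NER server described in the comments above is listening on localhost:9199; the sentences are made up for illustration.

import pandas as pd

if __name__ == "__main__":
    df = pd.DataFrame({"sentences": [
        "LeBron James scored 40 points for the Los Angeles Lakers.",
        "Short.",  # dropped by the 10-character min_length filter
    ]})
    tagged = extract_unknown_ner(df, TEXT_COL="sentences", NER_COL="named_entities")
    print(tagged[["sentences", "named_entities"]])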
class StanfordNERClient(NER):
    """
    To run the server see: pythonmodules/ner/run_stanford.sh
    """

    def __init__(self, host=None, port=None):
        self.host = 'localhost' if host is None else host
        self.port = 9001 if port is None else port
        self.server = None

    def connect(self):
        self.server = Ner(host=self.host, port=self.port)

    def tag(self, text, **kwargs):
        # Remove "untokenizable" characters to avoid warnings from the NER server.
        text = bytes(text, 'utf-8').decode('utf-8', 'ignore')
        text = text.replace('\ufffd', '')  # strip Unicode replacement characters
        text = str(text).splitlines()
        if self.server is None:
            self.connect()
        try:
            return self._run(text)
        except ConnectionResetError:
            # Server dropped the connection; reconnect and retry once below.
            self.connect()
        except ConnectionRefusedError:
            raise ConnectionRefusedError(
                "Connection refused, is the server running at %s:%d? Check run_stanford.sh..."
                % (self.host, self.port))
        return self._run(text)

    def _run(self, text):
        return list(
            itertools.chain(*[self.server.get_entities(line) for line in text]))
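The client above can be exercised as in the sketch below, assuming the server started by run_stanford.sh is reachable on the default localhost:9001; the input text is illustrative.

client = StanfordNERClient()  # defaults to localhost:9001; connects lazily on first tag()
tags = client.tag("Angela Merkel visited Paris.\nEmmanuel Macron met her there.")
# tag() splits the text into lines and flattens the per-line (token, label)
# pairs into a single list, e.g. [('Angela', 'PERSON'), ('Merkel', 'PERSON'), ...]
print(tags)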
class STF_NER:

    def __init__(self):
        self.tagger = Ner(host='localhost', port=8046)

    def get_ner(self, l):
        res = self.tagger.get_entities(l)
        return res
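A one-line usage sketch for STF_NER, assuming a Stanford NER server is listening on port 8046 as hard-coded in the constructor; the sentence and the sample output in the comment are illustrative.

ner = STF_NER()
# get_ner() returns the raw (token, label) pairs from the server, e.g.
# [('Sundar', 'PERSON'), ('Pichai', 'PERSON'), ('works', 'O'), ...]
print(ner.get_ner("Sundar Pichai works at Google in Mountain View"))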
def use_stanford_ner(data):
    entities = []
    helper = Helper
    tagger = Ner(host='localhost', port=9199)
    for row in data:
        sentences = helper.get_sentences(row['supposed_string'])
        ne_sentences = [
            tagger.get_entities(sentence) for sentence in sentences
        ]
        tagged_sentences = [
            helper.transform_stanford_name_entity_to_tree(sentence)
            for sentence in ne_sentences
        ]
        grammar = helper.get_grammar('stanford_ner')
        parsed_sentences = helper.get_parsed_sentences(grammar, tagged_sentences)
        entities.extend(helper.extract_entities(parsed_sentences, row))
    return entities
def com_ner(data_type: str, rp: str):
    """
    Gets the NER tags of each sentence using the sequential model pre-trained
    by the Stanford NLP tools.

    :param data_type: String, either `training` or `test`
    :param rp: Absolute path of the root directory of the project

    :return:
        boolean_flag: True for successful operation.
        all_ners: List of NER tags of each line
    """
    # Initialize the tagger corresponding to the Stanford NER server.
    tagger = Ner(host="localhost", port=9199)
    all_ners = []
    read_flag, file = read_file("raw_sentence_{0}".format(data_type), rp)
    if read_flag:
        for line in file:
            word_tags = tagger.get_entities(line)
            ner_tags = [x[1] for x in word_tags]
            all_ners.append(" ".join(ner_tags))
        return True, all_ners
    else:
        return False, []  # keep the (flag, list) return shape on failure too
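com_ner() depends on the project's own read_file() helper, so the call below is only a sketch; the root path is hypothetical, and the printed strings stand in for the space-joined NER label sequences the docstring promises.

# Hypothetical call; "/path/to/project" stands in for the real project root
# expected by read_file("raw_sentence_training", rp).
ok, ner_lines = com_ner("training", rp="/path/to/project")
if ok:
    # e.g. ["PERSON PERSON O O LOCATION", "O O ORGANIZATION", ...]
    print(ner_lines[:3])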
cand_file = " "
# output
outputfile = " "
k = 0

with open(cand_file, "r") as input, open(outputfile, "w") as output:
    reader = csv.reader(input, delimiter="\t")
    writer = csv.writer(output, delimiter="\t", quoting=csv.QUOTE_ALL)
    for row in reader:
        articleid = row[0]
        source = row[1]
        sent = row[2]
        # using the server from stanford ner tagger
        tagger = Ner(host="localhost", port=9199)
        # tagging the candidate sentence
        tag = tagger.get_entities(sent)
        # check if all source words are tagged with "PERSON"
        for i in range(len(tag)):
            flag = False
            if tag[i][0] == first_pattern:
                source_person = []
                for j in range(1, min(6, len(tag) - i)):
                    if (tag[i + j][1] != "PERSON"
                            and tag[i + j][0] != second_pattern
                            and tag[i + j][0] != first_pattern):
                        break
                    elif tag[i + j][0] == first_pattern:
                        source_person = []
                    elif (tag[i + j][0] == second_pattern and source_person != []
def newsinfo(request):
    # nltk.download('vader_lexicon')
    context = {}
    form = StockForm(request.GET)
    form.is_valid()
    beta = 0.8
    text = form.cleaned_data['news_input']
    text = re.sub(r"[^\w\s]", '', text)
    st = Ner(host='localhost', port=9199)
    sid = SentimentIntensityAnalyzer()
    ss = sid.polarity_scores(text)
    sentimentResult = ""
    for k in sorted(ss):
        sentimentResult += '{0}: {1}, '.format(k, ss[k])
    sentimentResult = sentimentResult[:-2]
    context['sentimentresult'] = sentimentResult

    # first part - find pronoun
    start = time.time()
    tokens = nltk.word_tokenize(text)
    tagged = nltk.pos_tag(tokens)
    entities = nltk.chunk.ne_chunk(tagged)
    wordcounts = Counter(tokens)
    organizationList = []
    for tag in tagged:
        if tag[1] == 'NNP' and tag[0] != '' and tag[0] not in organizationList:
            organizationList.append(tag[0])

    # second part - stanford NER
    newOrganizationList = st.get_entities(text)
    for org in newOrganizationList:
        if org[1] == 'ORGANIZATION' and org[0] not in organizationList:
            organizationList.append(org[0])

    # third part - nltk NER
    organizationList = get_continuous_chunks(text, organizationList)
    for org in organizationList:
        print(org)

    # use google search to check whether each term is an organization/corporation
    newOrganizationList = []
    for org in organizationList:
        if checkOrganization(org):
            newOrganizationList.append(org)
    # print(newOrganizationList)

    tickerList = []
    filteredOrgList = []
    wordCountList = []
    # find ticker
    for orgName in newOrganizationList:
        ticker, tickerOrgName = findTicker(orgName)
        if ticker is not None and ticker not in tickerList:
            print(orgName)
            print(wordcounts[orgName])
            wordCountList.append(wordcounts[orgName])
            filteredOrgList.append(tickerOrgName)
            tickerList.append(ticker)

    probDict = {}
    totalWordCount = sum(wordCountList)
    for idx, org in enumerate(filteredOrgList):
        competitorName = []
        marketCap = []
        percentage = 1.0 / totalWordCount * wordCountList[idx]
        if org not in probDict:
            probDict[org] = percentage * beta
        else:
            probDict[org] += percentage * beta
        getCompetitorInfo(tickerList[idx], org, competitorName, marketCap)
        if competitorName != []:
            marketCap.pop(0)
            totalMarketCap = sum(marketCap)
            for i, competitor in enumerate(competitorName):
                if i < len(marketCap):
                    if competitor not in probDict:
                        probDict[competitor] = percentage * (1 - beta) * 1.0 / totalMarketCap * marketCap[i]
                    else:
                        probDict[competitor] += percentage * (1 - beta) * 1.0 / totalMarketCap * marketCap[i]
    print(probDict)

    labels = []
    values = []
    for key, value in probDict.iteritems():
        labels.append(key)
        values.append(value)
    if len(labels) != 0:
        trace = graph_objs.Pie(labels=labels, values=values, textinfo='none')
        fig = Figure(data=Data([trace]))
        context['piechart'] = plot(fig, auto_open=False, output_type='div')
        top10String = ""
        sortList = numpy.argsort(values)[::-1]
        for idx, value in enumerate(sortList):
            if idx == 10:
                break
            top10String += labels[value] + ':' + str(values[value] * 100) + '%,'
        top10String = top10String[:-1]
        context['top10'] = top10String

    orgString = ""
    for org in filteredOrgList:
        orgString += org + ','
    orgString = orgString[:-1]
    tickerString = ""
    for ticker in tickerList:
        tickerString += ticker + ','
    tickerString = tickerString[:-1]
    context['foundentities'] = orgString
    context['foundtickers'] = tickerString
    context['result'] = urllib2.unquote(text)
    print(context['result'])
    print(context['foundentities'])
    print(context['foundtickers'])
    return render(request, 'news/newsinfo.html', context)
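The "second part - stanford NER" step of the view above can be reproduced in isolation with the sketch below; it assumes a server on localhost:9199 and simply collects the tokens labelled ORGANIZATION for an example sentence.

from sner import Ner

st = Ner(host='localhost', port=9199)
# Keep only ORGANIZATION tokens, mirroring the loop over newOrganizationList above.
orgs = [token for token, label in st.get_entities("Apple sued Samsung in California")
        if label == 'ORGANIZATION']
print(orgs)  # e.g. ['Apple', 'Samsung'], depending on the classifier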
class landslide_location_extractor(utils.AssedMessageProcessor.AssedMessageProcessor):

    def __init__(self, debug=False):
        self.debug = debug
        self.time = time.time()
        pool = redis.ConnectionPool(host='localhost', port=6379, db=0)
        self.r = redis.Redis(connection_pool=pool)
        self.timecheck = 600
        self.locations = {}
        self.update_location_store()
        self.NER = Ner(host="localhost", port=9199)
        self.counter = 0
        self.memory = {}
        config = load_config("./config/assed_config.json")
        self.APIKEY = config["APIKEYS"]["googlemaps"]
        self.stream_tracker = {}

    def process(self, message):
        if message["streamtype"] not in self.stream_tracker:
            self.stream_tracker[message["streamtype"]] = {}
            self.stream_tracker[message["streamtype"]]["bad_location"] = 0
            self.stream_tracker[message["streamtype"]]["good_location"] = 0
            self.stream_tracker[message["streamtype"]]["totalcounter"] = 0

        if time.time() - self.time > self.timecheck:
            utils.helper_utils.std_flush("[%s] -- Updating news location store." % utils.helper_utils.readable_time())
            self.update_location_store()
            self.time = time.time()
            for _streamtype in self.stream_tracker:
                utils.helper_utils.std_flush(
                    "[%s] -- Processed %i elements from %s with %i good locations and %i bad locations"
                    % (utils.helper_utils.readable_time(),
                       self.stream_tracker[_streamtype]["totalcounter"], _streamtype,
                       self.stream_tracker[_streamtype]["good_location"],
                       self.stream_tracker[_streamtype]["bad_location"]))
                self.stream_tracker[_streamtype]["totalcounter"] = 0
                self.stream_tracker[_streamtype]["good_location"] = 0
                self.stream_tracker[_streamtype]["bad_location"] = 0
            if self.debug:
                utils.helper_utils.std_flush(
                    "Processed %i elements from %s with %i good locations and %i bad locations"
                    % (self.stream_tracker[_streamtype]["totalcounter"], _streamtype,
                       self.stream_tracker[_streamtype]["good_location"],
                       self.stream_tracker[_streamtype]["bad_location"]))

        self.stream_tracker[message["streamtype"]]["totalcounter"] += 1

        # Check if location exists
        latitude = None
        longitude = None
        if "location" in message and message["location"] is not None and len(message["location"]) > 0:
            # already have a location
            pass
        else:
            # First location tagging to get locations...
            cleaned_message = str(message["text"].encode("utf-8"))[2:-2]
            cleaned_message = " ".join(nltk.tokenize.word_tokenize(cleaned_message))
            loc_tags = self.NER.get_entities(cleaned_message)
            desc_locations = self.extractLocations(loc_tags)
            locations = " ".join(desc_locations) if len(desc_locations) > 0 else None
            if locations is None:
                # Attempt match...
                for sublocations in self.locations:
                    if sublocations in cleaned_message:
                        locations = sublocations
                        latitude = self.locations[sublocations][0]
                        longitude = self.locations[sublocations][1]
                        break
            else:
                # This is number of location items...
                pass
                # utils.helper_utils.std_flush(self.counter)
            if locations is None:
                self.stream_tracker[message["streamtype"]]["bad_location"] += 1
                return (False, message)
            # location is there, we will attempt geocoding right here... right now... right on this ship
            # With sublocations...
            if latitude is None or longitude is None:
                standardized_location = utils.helper_utils.location_standardize(locations)
                for sublocation in standardized_location.split(":"):
                    if sublocation in self.locations:
                        latitude = self.locations[sublocation][0]
                        longitude = self.locations[sublocation][1]
            message["location"] = locations

        # check if coords already in message
        if message["latitude"] is not None and message["longitude"] is not None:
            pass
        else:
            if latitude is not None and longitude is not None:
                message["latitude"] = str(latitude)
                message["longitude"] = str(longitude)
            else:
                # Attempt to get location from extractor memory (assed:extractor...)
                # First normalize...
                extractor_locations = utils.helper_utils.location_standardize(message["location"])
                # Then attempt retrieve
                coordinates = None
                for extractor_sublocation in extractor_locations.split(":"):
                    r_key = utils.helper_utils.extractor_sublocation_key(extractor_sublocation)
                    coordinates = self.r.get(r_key)
                    if coordinates is not None:
                        latlng = coordinates.decode("utf-8").split(",")
                        latitude = float(latlng[0])
                        longitude = float(latlng[1])
                        break
                if coordinates is None:
                    # no sublocation exists. We are gonna have to geocode
                    utils.helper_utils.std_flush(
                        "[%s] -- Performing geolocation for %s using googlemaps"
                        % (utils.helper_utils.readable_time(), message["location"]))
                    latitude = False
                    while latitude == False:
                        latitude, longitude = utils.helper_utils.lookup_address_only(message["location"], self.APIKEY, self.r)
                        if latitude == False:
                            warnings.warn(
                                "[%s] -- WARNING -- Maps API Expired for %s. Trying after 2 hours."
                                % (utils.helper_utils.readable_time(), time.time()))
                            time.sleep(7200)
                    if latitude is not None and longitude is not None:
                        coordinates = str(latitude) + "," + str(longitude)
                        for extractor_sublocation in extractor_locations.split(":"):
                            r_key = utils.helper_utils.extractor_sublocation_key(extractor_sublocation)
                            # TODO ADD TO MEMORY AS WELL
                            self.r.set(r_key, coordinates, ex=259200)
                if latitude is not None and longitude is not None:
                    message["latitude"] = str(latitude)
                    message["longitude"] = str(longitude)
                else:
                    self.stream_tracker[message["streamtype"]]["bad_location"] += 1
                    return (False, message)

        self.stream_tracker[message["streamtype"]]["good_location"] += 1
        return (True, message)

    def update_location_store(self, ):
        self.locations = {}
        for _key in self.r.scan_iter(match="assed:sublocation:*", count=500):
            # keep only the first key location
            key_location = _key.decode("utf-8").split("assed:sublocation:")[1]
            if key_location.strip():
                key_coords = self.r.get(_key).decode("utf-8").split(",")
                latitude = float(key_coords[0])
                longitude = float(key_coords[1])
                self.locations[key_location] = (latitude, longitude)

    def extractLocations(self, temp_loc_tags):
        locations = []
        temp_loc = []
        if temp_loc_tags[0][1] == 'LOCATION':
            temp_loc.append(temp_loc_tags[0][0])
        for entry in temp_loc_tags[1:]:
            if entry[1] == 'LOCATION':
                temp_loc.append(entry[0])
            else:
                if temp_loc:
                    locations.append(' '.join(temp_loc))
                    temp_loc = []
        if temp_loc:
            locations.append(' '.join(temp_loc))
        return locations
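Because extractLocations() never touches instance state, its grouping of consecutive LOCATION tokens can be checked without the Redis and config setup that the constructor needs; the tag tuples below simply mimic the shape of Ner.get_entities() output.

sample_tags = [("Flooding", "O"), ("hit", "O"),
               ("San", "LOCATION"), ("Jose", "LOCATION"),
               ("and", "O"), ("Manila", "LOCATION")]
# Calling the method through the class avoids __init__ (which needs Redis and
# ./config/assed_config.json); expected output: ['San Jose', 'Manila']
print(landslide_location_extractor.extractLocations(None, sample_tags))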
class News(multiprocessing.Process):

    def __init__(self, assed_config, root_name, errorQueue, messageQueue, **kwargs):
        multiprocessing.Process.__init__(self)
        # set up DB connections
        self.DB_CONN = get_db_connection(assed_config)
        self.client = NewsApiClient(api_key="f715251d799140f793e63a1aec194920")
        self.root_name = root_name
        self.errorQueue = errorQueue
        self.messageQueue = messageQueue
        # No cached list because we are getting new stuff every day...
        self.config = kwargs["config"]
        self.NER = Ner(host='localhost', port=9199)
        pool = redis.ConnectionPool(host='localhost', port=6379, db=0)
        self.r = redis.Redis(connection_pool=pool)

    def run(self, ):
        try:
            for event_topic in self.config["topic_names"]:
                if not self.config["topic_names"][event_topic]["high_confidence"]["valid"]:
                    continue
                self.messageQueue.put("News downloader - working on %s" % event_topic)
                event_topic_key = str(self.config["topic_names"][event_topic]["index"])
                self.cached_list = self.getCachedList(event_topic_key)
                stopwords = self.config["topic_names"][event_topic]["stopwords"]
                keyword_set = self.config["topic_names"][event_topic]["high_confidence"]["keywords"]
                articles = []
                for keyword in keyword_set:
                    try:
                        response = self.client.get_everything(q=keyword, page_size=100)
                        articles += response["articles"]
                    except Exception as e:
                        self.messageQueue.put("NewsAPI for %s-%s failed with error: %s" % (event_topic, keyword, repr(e)))
                article_content, article_location = self.getArticleDetails(articles, stopwords)
                self.insertNews(article_content, event_topic_key)
                self.updateRedisLocations(article_location)
            self.DB_CONN.close()
            self.messageQueue.put("Completed News download successfully at %s." % readable_time())
        except Exception as e:
            traceback.print_exc()
            self.errorQueue.put((self.root_name, str(e)))

    def getArticleDetails(self, articles, stopwords):
        article_content = []
        article_location = []
        exist_skip, stop_skip, location_skip, coordinate_skip = 0, 0, 0, 0
        for article in articles:
            item = {}
            item["id"] = base64.b64encode(str.encode(article["url"])).decode()
            if item["id"] in self.cached_list:
                exist_skip += 1
                continue
            item["source"] = article["source"]["name"]
            item["url"] = article["url"]
            item["time"] = dateutil.parser.parse(article["publishedAt"]).replace(
                tzinfo=tz.gettz('UTC')).astimezone(
                    tz=tz.gettz('EDT')).strftime("%Y-%m-%d %H:%M:%S")
            item["title"] = article["title"]
            item["text"] = article["description"]
            # We are doing an extremely basic lookup <-- if it has a landslide keyword, we accept.
            # Since this is already a landslide feed, google/whatever has better classifiers.
            # We exploit those to create a super simple keyword filter.
            search_flag = False
            search_counter = 0
            rText = item["text"]
            if "content" in article and article["content"] is not None and len(article["content"]) > 0:
                rText += article["content"]
            while not search_flag and search_counter < len(stopwords):
                if stopwords[search_counter] in rText:
                    search_flag = True
                search_counter += 1
            if search_flag:
                stop_skip += 1
                continue
            # Description based location
            temp_loc_tags = self.NER.get_entities(item["text"])
            desc_locations = self.extractLocations(temp_loc_tags)
            content_locations = []
            try:
                temp_loc_tags = self.NER.get_entities(" ".join(
                    nltk.tokenize.word_tokenize(article["content"])))
                content_locations = self.extractLocations(temp_loc_tags)
            except (TypeError, IndexError):
                # TypeError -- if content is empty in article.
                # IndexError -- if content is not None, but still empty.
                pass
            # create location set - take unique from both desc and content_location, after normalization...
            item["description_location"] = [
                location_normalize(item) for item in desc_locations
            ]
            item["content_location"] = [
                location_normalize(item) for item in content_locations
            ]
            final_locations = list(
                set(item["description_location"] + item["content_location"]))
            if len(final_locations) == 0:
                location_skip += 1
                continue
            item["locations"] = final_locations
            lat, lng = lookup_address_only(desc_locations, self.config["APIKEYS"]["googlemaps"], self.r)
            if lat == False:
                raise ValueError("Ran out of GoogleMaps daily keys")
            if lat is None or lng is None:
                coordinate_skip += 1
                continue
            item["latitude"] = lat
            item["longitude"] = lng
            item["cell"] = generate_cell(lat, lng)
            article_content.append(item)
            article_location.append({
                "name": final_locations,
                "lat": lat,
                "lng": lng
            })
        self.messageQueue.put(
            "Obtained News with: %i items and skipped \n\texisting %i items\n\tstopword %i items, \n\tmissing location %i items \n\tmissing coordinates %i items"
            % (len(article_content), exist_skip, stop_skip, location_skip, coordinate_skip))
        return article_content, article_location

    def extractLocations(self, temp_loc_tags):
        locations = []
        temp_loc = []
        if temp_loc_tags[0][1] == 'LOCATION':
            temp_loc.append(temp_loc_tags[0][0])
        for entry in temp_loc_tags[1:]:
            if entry[1] == 'LOCATION':
                temp_loc.append(entry[0])
            else:
                if temp_loc:
                    locations.append(' '.join(temp_loc))
                    temp_loc = []
        if temp_loc:
            locations.append(' '.join(temp_loc))
        return locations

    def convertDateFromTime(self, tm):
        '''
        Convert a time structure to MySQL's datetime string format.
        '''
        return time.strftime("%Y-%m-%d %H:%M:%S", tm)

    def getCachedList(self, event_topic):
        cachedlist = set()
        cursor = self.DB_CONN.cursor()
        select = "SELECT item_id FROM HCS_News where timestamp > %s and topic_name = %s" % (
            (datetime.now() - timedelta(days=5)).strftime("%Y-%m-%d"), event_topic)
        cursor.execute(select)
        results = cursor.fetchall()
        cursor.close()
        for row in results:
            cachedlist.add(row[0])
        self.messageQueue.put("News cachedlist has %i items in last 5 days" % (len(cachedlist)))
        return cachedlist

    def insertNews(self, article_items, event_topic_key):
        event_topic_key = int(event_topic_key)
        cursor = self.DB_CONN.cursor()
        for item in article_items:
            insert = 'INSERT INTO HCS_News ( \
                item_id, link, \
                cell, latitude, longitude, timestamp, location, news_src, text, topic_name) \
                VALUES (%s,%s,%s,%s,%s,%s,%s,%s, %s,%s)'
            params = (item['id'], item['url'], item['cell'],
                      item['latitude'], item['longitude'], item['time'],
                      ",".join(item["locations"]), item['source'], item['text'],
                      event_topic_key)
            try:
                cursor.execute(insert, params)
                self.DB_CONN.commit()
            except Exception as e:
                traceback.print_exc()
                self.messageQueue.put('Failed to insert %s with error %s' % (item["id"], repr(e)))
        cursor.close()

    def updateRedisLocations(self, article_location):
        # get REDIS connection
        totalLocations = len(article_location)
        sublocations = 0
        for location in article_location:
            converted_location = " ".join(location["name"])
            location_std = location_standardize(converted_location)
            location_key = high_confidence_streamer_key("news:location:" + location_std)
            self.r.set(location_key, converted_location, ex=259200)
            point_str = str(location["lat"]) + "," + str(location["lng"])
            for sublocation in location_std.split(":"):
                sublocationkey = sublocation_key(sublocation)
                self.r.set(sublocationkey, point_str, ex=259200)
                sublocations += 1
        self.messageQueue.put(
            "Completed News with: %i locations and %i sublocations"
            % (totalLocations, sublocations))
isdigit()) for s in directors]
directors = [s.title() for s in directors]
# print(directors)
cleaned = []
peoples = []
for element in directors:
    tags = st.get_entities(element)
    # print(tokens, tags)
    if all(tag[1] == 'PERSON' for tag in tags):
        name = ' '.join(tag[0] for tag in tags)