def extract_unknown_ner(sentences_df, TEXT_COL='sentences', NER_COL='named_entities', ner_port=9199):
    '''
    Extract named entities using Stanford's NER. Requires a Java NER server to already be running.

    sentences_df: pandas dataframe with one column that contains non-lowercased sentences
    TEXT_COL: name of column with sentences
    NER_COL: name of column for output
    '''
    # To run this, you need to set up a local SNER server:
    # download Stanford CoreNLP NER (a zip file named stanford-ner-YYYY-MM-DD),
    # e.g. from https://nlp.stanford.edu/software/CRF-NER.shtml#Download,
    # then start the Java server:
    #   cd C:\ProgramData\Anaconda3\Lib\site-packages\sner\stanford-ner-2018-02-27
    #   java -Djava.ext.dirs=./lib -cp stanford-ner.jar edu.stanford.nlp.ie.NERServer -port 9199 -loadClassifier ./classifiers/english.all.3class.distsim.crf.ser.gz -tokenizerFactory edu.stanford.nlp.process.WhitespaceTokenizer -tokenizerOptions tokenizeNLs=false

    # filter to sentences long enough to have sentiment and player name
    min_length = 10  # characters
    sentences_df = sentences_df[sentences_df[TEXT_COL].str.len() >= min_length]

    # tag using the Java server
    pos_tagger = Ner(host='localhost', port=ner_port)

    # would love to parallelize this, as it takes ~2-3 hours per year of data
    # ddf = dd.from_pandas(sentences_df)
    sner_entities = lambda text: [
        token for token, part in pos_tagger.get_entities(text)
        if part in {'PERSON', 'ORGANIZATION', 'LOCATION'}
    ]
    sentences_df[NER_COL] = sentences_df[TEXT_COL].apply(
        lambda doc: sner_entities(doc))
    return sentences_df
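# A minimal usage sketch for the function above, assuming pandas is installed,
# the `sner` package is importable in the enclosing module, and a Stanford NER
# server is already listening on localhost:9199. The data below is illustrative.
import pandas as pd

df = pd.DataFrame({'sentences': [
    "Barack Obama visited Microsoft headquarters in Seattle.",
    "Hi.",  # shorter than min_length, so it is filtered out
]})
tagged = extract_unknown_ner(df, TEXT_COL='sentences', NER_COL='named_entities')
print(tagged['named_entities'].tolist())
# e.g. [['Barack', 'Obama', 'Microsoft', 'Seattle']]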
def start_sner(self):
    # Build the Stanford NER server command; the jar and classifier paths are
    # configured on the instance.
    command = "java -Djava.ext.dirs=./lib -cp {0} edu.stanford.nlp.ie.NERServer " \
              "-port 9199 -loadClassifier {1} -tokenizerFactory edu.stanford.nlp.process.WhitespaceTokenizer " \
              "-tokenizerOptions tokenizeNLs=false".format(self.sner_jar_path, self.sner_class_path)
    self.stop()
    # Popen expects an argument list when no shell is used, so split the command string
    # (requires `import shlex` at module level).
    self.proc_obj = Popen(args=shlex.split(command))
    self.tagger = Ner(host='localhost', port=9199)
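# Optional companion sketch (not part of the original class): Popen returns before
# the Java server is actually listening, so callers may want to poll the port before
# creating the Ner client. `wait_for_sner` is a hypothetical helper name.
import socket
import time

def wait_for_sner(host='localhost', port=9199, timeout=30.0):
    """Block until the NER server accepts TCP connections or the timeout expires."""
    deadline = time.time() + timeout
    while time.time() < deadline:
        try:
            with socket.create_connection((host, port), timeout=1.0):
                return True
        except OSError:
            time.sleep(0.5)
    return False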
class StanfordNERClient(NER):
    """
    To run the server see: pythonmodules/ner/run_stanford.sh
    """
    def __init__(self, host=None, port=None):
        self.host = 'localhost' if host is None else host
        self.port = 9001 if port is None else port
        self.server = None

    def connect(self):
        self.server = Ner(host=self.host, port=self.port)

    def tag(self, text, **kwargs):
        # remove "untokenizable" characters to avoid warnings from the NER server
        text = bytes(text, 'utf-8').decode('utf-8', 'ignore')
        text = text.replace('\xFF\xFD', '')
        text = str(text).splitlines()

        if self.server is None:
            self.connect()
        try:
            return self._run(text)
        except ConnectionResetError:
            self.connect()
        except ConnectionRefusedError:
            raise ConnectionRefusedError(
                "Connection refused, is the server running at %s:%d? Check run_stanford.sh..."
                % (self.host, self.port))
        return self._run(text)

    def _run(self, text):
        return list(
            itertools.chain(*[self.server.get_entities(line) for line in text]))
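# A minimal usage sketch for the client above, assuming the module's imports
# (the NER base class, itertools, sner) are available and a Stanford NER server
# is listening on the class defaults (localhost:9001).
client = StanfordNERClient()
tags = client.tag("Angela Merkel spoke in Berlin.\nNothing notable here.")
print(tags)
# e.g. [('Angela', 'PERSON'), ('Merkel', 'PERSON'), ('spoke', 'O'), ...]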
class STF_NER:
    def __init__(self):
        self.tagger = Ner(host='localhost', port=8046)

    def get_ner(self, l):
        res = self.tagger.get_entities(l)
        return res
def startNERServer():
    # os.chdir('./StanfordNER/')
    # subprocess.Popen(['./run_stanfordner_server.sh'])  # start the server up
    # os.chdir('../')
    # get the NER object
    ner_tagger = Ner(host='localhost', port=9199)
    return ner_tagger
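# Usage sketch: despite its name, the function above only builds a client, so a
# Stanford NER server must already be running on localhost:9199.
tagger = startNERServer()
print(tagger.get_entities("Tim Cook runs Apple"))
# e.g. [('Tim', 'PERSON'), ('Cook', 'PERSON'), ('runs', 'O'), ('Apple', 'ORGANIZATION')]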
def use_stanford_ner(data):
    entities = []
    helper = Helper
    tagger = Ner(host='localhost', port=9199)

    for row in data:
        sentences = helper.get_sentences(row['supposed_string'])
        ne_sentences = [
            tagger.get_entities(sentence) for sentence in sentences
        ]
        tagged_sentences = [
            helper.transform_stanford_name_entity_to_tree(sentence)
            for sentence in ne_sentences
        ]
        grammar = helper.get_grammar('stanford_ner')
        parsed_sentences = helper.get_parsed_sentences(grammar, tagged_sentences)
        entities.extend(helper.extract_entities(parsed_sentences, row))

    return entities
def com_ner(data_type: str, rp: str):
    """
    Gets the NER tags of each sentence using the sequential model pre-trained by
    the Stanford NLP tools (served by a local StanfordNER server).

    :param data_type: String, either `training` or `test`
    :param rp: Absolute path of the root directory of the project
    :return:
        boolean_flag: True for a successful operation.
        all_ners: List of NER tags for each line.
    """
    # initialize the tagger that talks to the StanfordNER server
    tagger = Ner(host="localhost", port=9199)
    all_ners = []
    read_flag, file = read_file("raw_sentence_{0}".format(data_type), rp)
    if read_flag:
        for line in file:
            word_tags = tagger.get_entities(line)
            ner_tags = [x[1] for x in word_tags]
            all_ners.append(" ".join(ner_tags))
        return True, all_ners
    else:
        # keep the return shape consistent with the success branch
        return False, []
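# Usage sketch, assuming the project layout expected by `read_file` and a running
# NER server on localhost:9199; the root path below is purely illustrative.
ok, ner_lines = com_ner("training", "/path/to/project")
if ok:
    print(ner_lines[0])  # e.g. "PERSON PERSON O O ORGANIZATION"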
def main(args):
    client = MongoClient(args.connection_string)
    db = client["dax_gcp"]
    collection = db["all_news"]
    tagger = Ner(host='localhost', port=9199)
    operations = []
    records = 0
    start_time = time.time()

    for doc in collection.find({}, no_cursor_timeout=True):
        tags_temp = get_tags(doc["NEWS_TITLE_NewsDim"], tagger)
        if not tags_temp:
            continue
        tags = process_tags(tags_temp)

        new_values = {}
        new_values["tag_header_LOCATION"] = list()
        new_values["tag_header_PERSON"] = list()
        new_values["tag_header_ORGANIZATION"] = list()
        new_values["tag_header_MONEY"] = list()
        new_values["tag_header_PERCENT"] = list()
        new_values["tag_header_DATE"] = list()
        new_values["tag_header_TIME"] = list()

        for word, tag in tags:
            if tag != "O":
                new_values["tag_header_" + tag].append(word)

        operations.append(UpdateOne({"_id": doc["_id"]}, {"$set": new_values}))

        # Send once every 1000 in batch
        if len(operations) == 1000:
            print("Performing bulk write")
            collection.bulk_write(operations, ordered=False)
            operations = []
            records += 1000
            print("Write done. Saved {} records".format(records))

    if len(operations) > 0:
        collection.bulk_write(operations, ordered=False)

    print("--- %s seconds ---" % (time.time() - start_time))
    print("Processed {} records".format(records))
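# The helpers `get_tags` and `process_tags` are not shown in the snippet above.
# A plausible minimal sketch (an assumption, not the original implementation):
# `get_tags` runs the headline through the NER client, and `process_tags` merges
# consecutive tokens that share the same non-"O" entity tag.
def get_tags(title, tagger):
    if not title:
        return []
    return tagger.get_entities(title)

def process_tags(token_tag_pairs):
    merged, current_words, current_tag = [], [], None
    for word, tag in token_tag_pairs:
        if tag == current_tag and tag != "O":
            current_words.append(word)
        else:
            if current_words:
                merged.append((" ".join(current_words), current_tag))
            current_words, current_tag = [word], tag
    if current_words:
        merged.append((" ".join(current_words), current_tag))
    return merged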
def getPersonNames(string):
    s = string.replace("\n", " . ")
    # query two NER servers (ports 9199 and 9198) and combine their PERSON tags
    tagger = Ner(host='localhost', port=9199)
    taggedEng = tagger.get_entities(s)
    tagger = Ner(host='localhost', port=9198)
    taggedInd = tagger.get_entities(s)

    namesList = []
    name = None
    for word in taggedEng:
        if name != None and word[1] == "PERSON":
            name += " " + str(word[0])
        elif name == None and word[1] == "PERSON":
            name = str(word[0])
        elif name != None:
            namesList.append(name.lower().replace("\n", ""))
            name = None
    if name != None:
        namesList.append(name.lower().replace("\n", ""))
        name = None

    for word in taggedInd:
        if name != None and word[1] == "PERSON":
            name += " " + str(word[0])
        elif name == None and word[1] == "PERSON":
            name = str(word[0])
        elif name != None:
            namesList.append(name.lower().replace("\n", ""))
            name = None
    if name != None:
        namesList.append(name.lower().replace("\n", ""))

    # drop names that are substrings of other collected names
    n = len(namesList)
    i = 0
    while i < n:
        name = namesList[i]
        j = 0
        while j < n:
            if name in namesList[j] and i != j:
                namesList.pop(i)
                n -= 1
                i -= 1
                break
            j += 1
        i += 1
    return namesList
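# Usage sketch, assuming two NER servers are running on ports 9199 and 9198 as
# the function expects. Because substring names are dropped, a bare "john" is
# removed when "john smith" is also found; results are lowercased.
names = getPersonNames("John Smith met Jane Doe.\nJohn was late.")
print(names)  # e.g. ['john smith', 'jane doe']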
def newsinfo(request):
    # nltk.download('vader_lexicon')
    context = {}
    form = StockForm(request.GET)
    form.is_valid()
    beta = 0.8
    text = form.cleaned_data['news_input']
    text = re.sub(r"[^\w\s]", '', text)
    st = Ner(host='localhost', port=9199)
    sid = SentimentIntensityAnalyzer()
    ss = sid.polarity_scores(text)
    sentimentResult = ""
    for k in sorted(ss):
        sentimentResult += '{0}: {1}, '.format(k, ss[k])
    sentimentResult = sentimentResult[:-2]
    context['sentimentresult'] = sentimentResult

    # first part - find pronoun
    start = time.time()
    tokens = nltk.word_tokenize(text)
    tagged = nltk.pos_tag(tokens)
    entities = nltk.chunk.ne_chunk(tagged)
    wordcounts = Counter(tokens)
    organizationList = []
    for tag in tagged:
        if tag[1] == 'NNP' and tag[0] != '' and tag[0] not in organizationList:
            organizationList.append(tag[0])

    # second part - stanford NER
    newOrganizationList = st.get_entities(text)
    for org in newOrganizationList:
        if org[1] == 'ORGANIZATION' and org[0] not in organizationList:
            organizationList.append(org[0])

    # third part - nltk NER
    organizationList = get_continuous_chunks(text, organizationList)
    for org in organizationList:
        print(org)

    # use google search to check whether the term is an organization/corporation
    newOrganizationList = []
    for org in organizationList:
        if checkOrganization(org):
            newOrganizationList.append(org)
    # print(newOrganizationList)

    tickerList = []
    filteredOrgList = []
    wordCountList = []
    # find ticker
    for orgName in newOrganizationList:
        ticker, tickerOrgName = findTicker(orgName)
        if ticker != None and ticker not in tickerList:
            print(orgName)
            print(wordcounts[orgName])
            wordCountList.append(wordcounts[orgName])
            filteredOrgList.append(tickerOrgName)
            tickerList.append(ticker)

    probDict = {}
    totalWordCount = sum(wordCountList)
    for idx, org in enumerate(filteredOrgList):
        competitorName = []
        marketCap = []
        percentage = 1.0 / totalWordCount * wordCountList[idx]
        if org not in probDict:
            probDict[org] = percentage * beta
        else:
            probDict[org] += percentage * beta
        getCompetitorInfo(tickerList[idx], org, competitorName, marketCap)
        if competitorName != []:
            marketCap.pop(0)
            totalMarketCap = sum(marketCap)
            for i, competitor in enumerate(competitorName):
                if i < len(marketCap):
                    if competitor not in probDict:
                        probDict[competitor] = percentage * (1 - beta) * 1.0 / totalMarketCap * marketCap[i]
                    else:
                        probDict[competitor] += percentage * (1 - beta) * 1.0 / totalMarketCap * marketCap[i]
    print(probDict)

    labels = []
    values = []
    for key, value in probDict.iteritems():
        labels.append(key)
        values.append(value)
    if len(labels) != 0:
        trace = graph_objs.Pie(labels=labels, values=values, textinfo='none')
        fig = Figure(data=Data([trace]))
        context['piechart'] = plot(fig, auto_open=False, output_type='div')

        top10String = ""
        sortList = numpy.argsort(values)[::-1]
        for idx, value in enumerate(sortList):
            if idx == 10:
                break
            top10String += labels[value] + ':' + str(values[value] * 100) + '%,'
        top10String = top10String[:-1]
        context['top10'] = top10String

    orgString = ""
    for org in filteredOrgList:
        orgString += org + ','
    orgString = orgString[:-1]
    tickerString = ""
    for ticker in tickerList:
        tickerString += ticker + ','
    tickerString = tickerString[:-1]
    context['foundentities'] = orgString
    context['foundtickers'] = tickerString
    context['result'] = urllib2.unquote(text)
    print(context['result'])
    print(context['foundentities'])
    print(context['foundtickers'])
    return render(request, 'news/newsinfo.html', context)
urls = ('/', 'SimpleIndexSearchPage',
        '/entityAwareSearchPage', 'EntityAwareSearch',
        '/searchSimpleIndex', 'SearchSimpleIndex',
        '/searchEntityAwareIndex', 'SearchEntityAwareIndex')

CATEGORY = {
    'b': 'Business',
    'e': 'Entertainment',
    't': 'Science and Technology',
    'm': 'Health'
}

render = web.template.render('templates/', base='layout')

SOLR_SIMPLEINDEX = pysolr.Solr('http://localhost:8983/solr/simpleindex')
SOLR_ENTITYAWAREINDEX = pysolr.Solr(
    'http://localhost:8983/solr/entityawareindex')
STANFORD_NER_SERVER = Ner(host='localhost', port=9199)


def get_web_input(web_input):
    draw = web_input['draw']
    query = web_input['search[value]']
    if len(query) == 0:
        query = '*:*'
    offset = web_input['start']
    count = web_input['length']
    return draw, query, offset, count


def search_simple_index(query, offset, count, draw):
    """
    This function is responsible for hitting the solr endpoint
from nltk.tokenize import sent_tokenize, word_tokenize
from nltk.corpus import stopwords as nltk_stopwords
from nltk.stem import PorterStemmer
from nltk import pos_tag, ne_chunk
from nltk.tree import Tree
from nltk import data as nltk_data
from sner import Ner
from stemming.porter2 import stem
from string import digits, punctuation
import unicodedata

_logger = logging.getLogger(__name__)

if config.NLTK_DATA_PATH is not None:
    nltk_data.path.append(config.NLTK_DATA_PATH)

_stanford_ner_tagger = Ner(host=config.STANFORD_NER_TAGGER_SERVER['host'],
                           port=config.STANFORD_NER_TAGGER_SERVER['port'])

_set_of_stopwords = set(nltk_stopwords.words("english"))
# the following is very useful, but shouldn't be done for the purpose of
# comparing to SOTA with a standard preprocessing
# for sw in ["BBC", "TheJournal", "ie", "Al", "Jazeera", "News"]:
#     set_of_stopwords.add(sw)


def ner_tokenize(text):
    stemmer = PorterStemmer()
    # unicode categories can be found here:
    # http://www.unicode.org/reports/tr44/#General_Category_Values
    chunked = ne_chunk(
        pos_tag(
            word_tokenize("".join(c for c in unicodedata.normalize('NFD', text)
INDEX_MAP = [
    "ID", "TITLE", "URL", "PUBLISHER", "CATEGORY", "STORY", "HOSTNAME",
    "TIMESTAMP"
]

# Location, Time, Person, Organization, Money, Percent, Date
CASELESS_CLASSIFIER = '/usr/share/stanford-ner/classifiers/english.muc.7class.caseless.distsim.crf.ser.gz'

# To use the Stanford NER server run the following command in the stanford-ner directory
'''
java -Xmx3g -Djava.ext.dirs=./lib -cp stanford-ner.jar edu.stanford.nlp.ie.NERServer -port 9199 -loadClassifier ./classifiers/english.muc.7class.distsim.crf.ser.gz -tokenizerFactory edu.stanford.nlp.process.WhitespaceTokenizer -tokenizerOptions tokenizeNLs=false
'''

STANFORD_NER_HANDLER = Ner(host='localhost', port=9199)


def accumulate(list_of_tuples):
    tokens, entities = zip(*list_of_tuples)
    recognised = defaultdict(set)
    duplicates = defaultdict(list)
    for i, item in enumerate(entities):
        duplicates[item].append(i)
    for key, value in duplicates.items():
        for k, g in groupby(enumerate(value), lambda x: x[0] - x[1]):
            indices = list(map(itemgetter(1), g))
            recognised[key].add(' '.join(tokens[index] for index in indices))
    recognised.pop('O', None)
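# Illustration of how `accumulate` groups contiguous tokens that share a tag,
# assuming the truncated function above ends by returning `recognised`.
pairs = [('Barack', 'PERSON'), ('Obama', 'PERSON'),
         ('visited', 'O'), ('Google', 'ORGANIZATION')]
print(accumulate(pairs))
# e.g. defaultdict(<class 'set'>, {'PERSON': {'Barack Obama'}, 'ORGANIZATION': {'Google'}})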
class landslide_location_extractor(utils.AssedMessageProcessor.AssedMessageProcessor):
    def __init__(self, debug=False):
        self.debug = debug
        self.time = time.time()
        pool = redis.ConnectionPool(host='localhost', port=6379, db=0)
        self.r = redis.Redis(connection_pool=pool)
        self.timecheck = 600
        self.locations = {}
        self.update_location_store()
        self.NER = Ner(host="localhost", port=9199)
        self.counter = 0
        self.memory = {}
        config = load_config("./config/assed_config.json")
        self.APIKEY = config["APIKEYS"]["googlemaps"]
        self.stream_tracker = {}

    def process(self, message):
        if message["streamtype"] not in self.stream_tracker:
            self.stream_tracker[message["streamtype"]] = {}
            self.stream_tracker[message["streamtype"]]["bad_location"] = 0
            self.stream_tracker[message["streamtype"]]["good_location"] = 0
            self.stream_tracker[message["streamtype"]]["totalcounter"] = 0
        if time.time() - self.time > self.timecheck:
            utils.helper_utils.std_flush("[%s] -- Updating news location store." % utils.helper_utils.readable_time())
            self.update_location_store()
            self.time = time.time()
            for _streamtype in self.stream_tracker:
                utils.helper_utils.std_flush("[%s] -- Processed %i elements from %s with %i good locations and %i bad locations" % (
                    utils.helper_utils.readable_time(),
                    self.stream_tracker[_streamtype]["totalcounter"], _streamtype,
                    self.stream_tracker[_streamtype]["good_location"],
                    self.stream_tracker[_streamtype]["bad_location"]))
                self.stream_tracker[_streamtype]["totalcounter"] = 0
                self.stream_tracker[_streamtype]["good_location"] = 0
                self.stream_tracker[_streamtype]["bad_location"] = 0
                if self.debug:
                    utils.helper_utils.std_flush("Processed %i elements from %s with %i good locations and %i bad locations" % (
                        self.stream_tracker[_streamtype]["totalcounter"], _streamtype,
                        self.stream_tracker[_streamtype]["good_location"],
                        self.stream_tracker[_streamtype]["bad_location"]))
        self.stream_tracker[message["streamtype"]]["totalcounter"] += 1

        # Check if location exists
        latitude = None
        longitude = None
        if "location" in message and message["location"] is not None and len(message["location"]) > 0:
            # already have a location
            pass
        else:
            # First location tagging to get locations...
            cleaned_message = str(message["text"].encode("utf-8"))[2:-2]
            cleaned_message = " ".join(nltk.tokenize.word_tokenize(cleaned_message))
            loc_tags = self.NER.get_entities(cleaned_message)
            desc_locations = self.extractLocations(loc_tags)
            locations = " ".join(desc_locations) if len(desc_locations) > 0 else None
            if locations is None:
                # Attempt match...
                for sublocations in self.locations:
                    if sublocations in cleaned_message:
                        locations = sublocations
                        latitude = self.locations[sublocations][0]
                        longitude = self.locations[sublocations][1]
                        break
            else:
                # This is number of location items...
                pass
                # utils.helper_utils.std_flush(self.counter)
            if locations is None:
                self.stream_tracker[message["streamtype"]]["bad_location"] += 1
                return (False, message)
            # location is there, we will attempt geocoding right here... right now...
            # With sublocations...
            if latitude is None or longitude is None:
                standardized_location = utils.helper_utils.location_standardize(locations)
                for sublocation in standardized_location.split(":"):
                    if sublocation in self.locations:
                        latitude = self.locations[sublocation][0]
                        longitude = self.locations[sublocation][1]
            message["location"] = locations

        # check if coords already in message
        if message["latitude"] is not None and message["longitude"] is not None:
            pass
        else:
            if latitude is not None and longitude is not None:
                message["latitude"] = str(latitude)
                message["longitude"] = str(longitude)
            else:
                # Attempt to get location from extractor memory (assed:extractor...)
                # First normalize...
                extractor_locations = utils.helper_utils.location_standardize(message["location"])
                # Then attempt retrieve
                coordinates = None
                for extractor_sublocation in extractor_locations.split(":"):
                    r_key = utils.helper_utils.extractor_sublocation_key(extractor_sublocation)
                    coordinates = self.r.get(r_key)
                    if coordinates is not None:
                        latlng = coordinates.decode("utf-8").split(",")
                        latitude = float(latlng[0])
                        longitude = float(latlng[1])
                        break
                if coordinates is None:
                    # no sublocation exists. We are gonna have to geocode
                    utils.helper_utils.std_flush("[%s] -- Performing geolocation for %s using googlemaps" % (utils.helper_utils.readable_time(), message["location"]))
                    latitude = False
                    while latitude == False:
                        latitude, longitude = utils.helper_utils.lookup_address_only(message["location"], self.APIKEY, self.r)
                        if latitude == False:
                            warnings.warn("[%s] -- WARNING -- Maps API Expired for %s. Trying after 2 hours." % (utils.helper_utils.readable_time(), time.time()))
                            time.sleep(7200)
                    if latitude is not None and longitude is not None:
                        coordinates = str(latitude) + "," + str(longitude)
                        for extractor_sublocation in extractor_locations.split(":"):
                            r_key = utils.helper_utils.extractor_sublocation_key(extractor_sublocation)
                            # TODO ADD TO MEMORY AS WELL
                            self.r.set(r_key, coordinates, ex=259200)
                if latitude is not None and longitude is not None:
                    message["latitude"] = str(latitude)
                    message["longitude"] = str(longitude)
                else:
                    self.stream_tracker[message["streamtype"]]["bad_location"] += 1
                    return (False, message)

        self.stream_tracker[message["streamtype"]]["good_location"] += 1
        return (True, message)

    def update_location_store(self,):
        self.locations = {}
        for _key in self.r.scan_iter(match="assed:sublocation:*", count=500):
            # keep only the first key location
            key_location = _key.decode("utf-8").split("assed:sublocation:")[1]
            if key_location.strip():
                key_coords = self.r.get(_key).decode("utf-8").split(",")
                latitude = float(key_coords[0])
                longitude = float(key_coords[1])
                self.locations[key_location] = (latitude, longitude)

    def extractLocations(self, temp_loc_tags):
        locations = []
        temp_loc = []
        if temp_loc_tags[0][1] == 'LOCATION':
            temp_loc.append(temp_loc_tags[0][0])
        for entry in temp_loc_tags[1:]:
            if entry[1] == 'LOCATION':
                temp_loc.append(entry[0])
            else:
                if temp_loc:
                    locations.append(' '.join(temp_loc))
                    temp_loc = []
        if temp_loc:
            locations.append(' '.join(temp_loc))
        return locations
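# Illustration of the LOCATION grouping performed by extractLocations above
# (contiguous LOCATION tokens are joined into one location string). Assumes the
# class and its dependencies are importable; extractLocations does not use self,
# so it can be called unbound here for demonstration.
sample_tags = [('Flooding', 'O'), ('in', 'O'), ('San', 'LOCATION'), ('Jose', 'LOCATION'),
               ('and', 'O'), ('Manila', 'LOCATION')]
print(landslide_location_extractor.extractLocations(None, sample_tags))
# -> ['San Jose', 'Manila']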
class News(multiprocessing.Process):
    def __init__(self, assed_config, root_name, errorQueue, messageQueue, **kwargs):
        multiprocessing.Process.__init__(self)
        # set up DB connections
        self.DB_CONN = get_db_connection(assed_config)
        self.client = NewsApiClient(api_key="f715251d799140f793e63a1aec194920")
        self.root_name = root_name
        self.errorQueue = errorQueue
        self.messageQueue = messageQueue
        # No cached list because we are getting new stuff every day...
        self.config = kwargs["config"]
        self.NER = Ner(host='localhost', port=9199)
        pool = redis.ConnectionPool(host='localhost', port=6379, db=0)
        self.r = redis.Redis(connection_pool=pool)

    def run(self, ):
        try:
            for event_topic in self.config["topic_names"]:
                if not self.config["topic_names"][event_topic]["high_confidence"]["valid"]:
                    continue
                self.messageQueue.put("News downloader - working on %s" % event_topic)
                event_topic_key = str(self.config["topic_names"][event_topic]["index"])
                self.cached_list = self.getCachedList(event_topic_key)
                stopwords = self.config["topic_names"][event_topic]["stopwords"]
                keyword_set = self.config["topic_names"][event_topic]["high_confidence"]["keywords"]
                articles = []
                for keyword in keyword_set:
                    try:
                        response = self.client.get_everything(q=keyword, page_size=100)
                        articles += response["articles"]
                    except Exception as e:
                        self.messageQueue.put("NewsAPI for %s-%s failed with error: %s" % (event_topic, keyword, repr(e)))
                article_content, article_location = self.getArticleDetails(articles, stopwords)
                self.insertNews(article_content, event_topic_key)
                self.updateRedisLocations(article_location)
            self.DB_CONN.close()
            self.messageQueue.put("Completed News download successfully at %s." % readable_time())
        except Exception as e:
            traceback.print_exc()
            self.errorQueue.put((self.root_name, str(e)))

    def getArticleDetails(self, articles, stopwords):
        article_content = []
        article_location = []
        exist_skip, stop_skip, location_skip, coordinate_skip = 0, 0, 0, 0
        for article in articles:
            item = {}
            item["id"] = base64.b64encode(str.encode(article["url"])).decode()
            if item["id"] in self.cached_list:
                exist_skip += 1
                continue
            item["source"] = article["source"]["name"]
            item["url"] = article["url"]
            item["time"] = dateutil.parser.parse(article["publishedAt"]).replace(
                tzinfo=tz.gettz('UTC')).astimezone(tz=tz.gettz('EDT')).strftime("%Y-%m-%d %H:%M:%S")
            item["title"] = article["title"]
            item["text"] = article["description"]
            # We are doing an extremely basic lookup <-- if it has a landslide keyword, we accept.
            # Since this is already a landslide feed, google/whatever has better classifiers.
            # We exploit those to create a super simple keyword filter.
            search_flag = False
            search_counter = 0
            rText = item["text"]
            if "content" in article and article["content"] is not None and len(article["content"]) > 0:
                rText += article["content"]
            while not search_flag and search_counter < len(stopwords):
                if stopwords[search_counter] in rText:
                    search_flag = True
                search_counter += 1
            if search_flag:
                stop_skip += 1
                continue
            # Description based location
            temp_loc_tags = self.NER.get_entities(item["text"])
            desc_locations = self.extractLocations(temp_loc_tags)
            content_locations = []
            try:
                temp_loc_tags = self.NER.get_entities(" ".join(
                    nltk.tokenize.word_tokenize(article["content"])))
                content_locations = self.extractLocations(temp_loc_tags)
            except (TypeError, IndexError):
                # TypeError -- if content is empty in article.
                # IndexError -- if content is not None, but still empty
                pass
            # create location set - take unique from both desc and content_location, after normalization...
            item["description_location"] = [
                location_normalize(item) for item in desc_locations
            ]
            item["content_location"] = [
                location_normalize(item) for item in content_locations
            ]
            final_locations = list(
                set(item["description_location"] + item["content_location"]))
            if len(final_locations) == 0:
                location_skip += 1
                continue
            item["locations"] = final_locations
            lat, lng = lookup_address_only(
                desc_locations, self.config["APIKEYS"]["googlemaps"], self.r)
            if lat == False:
                raise ValueError("Ran out of GoogleMaps daily keys")
            if lat is None or lng is None:
                coordinate_skip += 1
                continue
            item["latitude"] = lat
            item["longitude"] = lng
            item["cell"] = generate_cell(lat, lng)
            article_content.append(item)
            article_location.append({
                "name": final_locations,
                "lat": lat,
                "lng": lng
            })
        self.messageQueue.put(
            "Obtained News with: %i items and skipped \n\texisting %i items\n\tstopword %i items, \n\tmissing location %i items \n\tmissing coordinates %i items"
            % (len(article_content), exist_skip, stop_skip, location_skip, coordinate_skip))
        return article_content, article_location

    def extractLocations(self, temp_loc_tags):
        locations = []
        temp_loc = []
        if temp_loc_tags[0][1] == 'LOCATION':
            temp_loc.append(temp_loc_tags[0][0])
        for entry in temp_loc_tags[1:]:
            if entry[1] == 'LOCATION':
                temp_loc.append(entry[0])
            else:
                if temp_loc:
                    locations.append(' '.join(temp_loc))
                    temp_loc = []
        if temp_loc:
            locations.append(' '.join(temp_loc))
        return locations

    def convertDateFromTime(self, tm):
        '''
        Convert datetime to MySQL's datetime from time structure.
        '''
        return time.strftime("%Y-%m-%d %H:%M:%S", tm)

    def getCachedList(self, event_topic):
        cachedlist = set()
        cursor = self.DB_CONN.cursor()
        select = "SELECT item_id FROM HCS_News where timestamp > %s and topic_name = %s" % (
            (datetime.now() - timedelta(days=5)).strftime("%Y-%m-%d"), event_topic)
        cursor.execute(select)
        results = cursor.fetchall()
        cursor.close()
        for row in results:
            cachedlist.add(row[0])
        self.messageQueue.put("News cachedlist has %i items in last 5 days" % (len(cachedlist)))
        return cachedlist

    def insertNews(self, article_items, event_topic_key):
        event_topic_key = int(event_topic_key)
        cursor = self.DB_CONN.cursor()
        for item in article_items:
            insert = 'INSERT INTO HCS_News ( \
                item_id, link, \
                cell, latitude, longitude, timestamp, location, news_src, text, topic_name) \
                VALUES (%s,%s,%s,%s,%s,%s,%s,%s, %s,%s)'
            params = (item['id'], item['url'], item['cell'],
                      item['latitude'], item['longitude'], item['time'],
                      ",".join(item["locations"]), item['source'], item['text'], event_topic_key)
            try:
                cursor.execute(insert, params)
                self.DB_CONN.commit()
            except Exception as e:
                traceback.print_exc()
                self.messageQueue.put('Failed to insert %s with error %s' % (item["id"], repr(e)))
        cursor.close()

    def updateRedisLocations(self, article_location):
        # get REDIS connection
        totalLocations = len(article_location)
        sublocations = 0
        for location in article_location:
            converted_location = " ".join(location["name"])
            location_std = location_standardize(converted_location)
            location_key = high_confidence_streamer_key("news:location:" + location_std)
            self.r.set(location_key, converted_location, ex=259200)
            point_str = str(location["lat"]) + "," + str(location["lng"])
            for sublocation in location_std.split(":"):
                sublocationkey = sublocation_key(sublocation)
                self.r.set(sublocationkey, point_str, ex=259200)
                sublocations += 1
        self.messageQueue.put(
            "Completed News with: %i locations and %i sublocations"
            % (totalLocations, sublocations))
import re
import requests
import os
import unicodedata
from bs4 import BeautifulSoup
from IPython.display import display
import mysql.connector, requests, os, os.path
from mysql.connector import Error, errorcode
from datetime import datetime
import sys
from sner import Ner

# os.environ['CLASSPATH'] = 'stanford-ner-4.0.0/stanford-ner.jar'
# from nltk.tag import StanfordNERTagger
# os.getenv('CLASSPATH') = '../Downloads/stanford-postagger.jar'
# nltk.download()

st = Ner(host='172.104.7.112', port=9199)
# print(st.tag('Rami Eid is studying at Stony Brook University in NY'.split()))

sys.setrecursionlimit(1000)

# rootdir = os.getcwd() + '/mnt'
# for subdir, dirs, files in os.walk(rootdir):
#     for file in files:
#         print(dirs, file)


def restore_windows_1252_characters(restore_string):
    """
    Replace C1 control characters in the Unicode string s by the characters
    at the corresponding code points in Windows-1252, where possible.
    """
    def to_windows_1252(match):
def get_stanford_ner_client():
    """
    Get an instance of the Ner http client.
    :return:
    """
    return Ner(host=STANFORD_NER_URL, port=STANFORD_NER_PORT)
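# Usage sketch, assuming STANFORD_NER_URL and STANFORD_NER_PORT are configured
# elsewhere in the module and point at a running Stanford NER server.
ner_client = get_stanford_ner_client()
print(ner_client.get_entities("Satya Nadella leads Microsoft in Redmond"))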
from nltk.tag import StanfordNERTagger
from nltk.internals import find_jars_within_path
import labdatascript as lb
import re
import unicodedata
from sner import Ner
from nltk import tokenize

st = Ner(host='localhost', port=9199)  # 7-class
# stanford_dir = st._stanford_jar.rpartition('/')[0]  # these lines are just a hack to get around the java problem
# from nltk.internals import find_jars_within_path
# stanford_jars = find_jars_within_path(stanford_dir)
# st._stanford_jar = ':'.join(stanford_jars)


def analyze_texts(dict, top_num=5):  # takes soups_dict of texts gathered, creates corpus
    '''
    Draws from labdatascript functions for NLP analysis of gathered texts.
    Input: result dictionary from web crawler with gathered text as values,
           top n to slice from analysis results
    Output: dict with wordcounts, bi + tri + quad grams, topic model
    Further: takes top x values from ngrams, passed in as parameter in function call
    '''
    corpus = []
    for i in dict.values():
        i = re.sub('[0-9]+', '', i)
        corpus.append(i)
# file from theof.py
cand_file = " "
# output
outputfile = " "

k = 0
with open(cand_file, "r") as input, open(outputfile, "w") as output:
    reader = csv.reader(input, delimiter="\t")
    writer = csv.writer(output, delimiter="\t", quoting=csv.QUOTE_ALL)
    for row in reader:
        articleid = row[0]
        source = row[1]
        sent = row[2]
        # using the server from stanford ner tagger
        tagger = Ner(host="localhost", port=9199)
        # tagging the candidate sentence
        tag = tagger.get_entities(sent)
        # check if all source words are tagged with "PERSON"
        for i in range(len(tag)):
            flag = False
            if tag[i][0] == first_pattern:
                source_person = []
                for j in range(1, min(6, len(tag) - i)):
                    if (tag[i + j][1] != "PERSON"
                            and tag[i + j][0] != second_pattern
                            and tag[i + j][0] != first_pattern):
                        break
                    elif tag[i + j][0] == first_pattern:
                        source_person = []
                source_person = []
import datetime
import re
import numpy as np
from nltk import word_tokenize
import pandas as pd
# for the named entity recognition
from sner import Ner

tagger = Ner(host="localhost", port=9199)


# checks if the user's full name is included in the meeting title;
# if either username or title is blank, returns 0
def users_fullname(meeting):
    if pd.isnull(meeting["username"]) or pd.isnull(meeting["title"]):
        return 0
    if str(meeting["username"]).lower() in str(meeting["title"]).lower():
        return 1
    else:
        return 0


# returns 1 if the meeting occurs on a non-workday; returns 0 if the starttime is blank
def not_workday(meeting):
    if pd.isnull(meeting["starttime"]):
        return 0
    if meeting["starttime"].isocalendar()[2] >= 6:
        return 1
    else:
        return 0
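# The module above creates `tagger` but the feature functions shown do not use it.
# A hypothetical companion feature (an assumption, not part of the original code)
# could flag meeting titles that mention a PERSON entity:
def title_mentions_person(meeting):
    if pd.isnull(meeting["title"]):
        return 0
    tags = tagger.get_entities(" ".join(word_tokenize(str(meeting["title"]))))
    return int(any(tag == "PERSON" for _, tag in tags))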