def trial2():
    """
    Let's try using nltk and one of the readability texts.
    :return:
    """
    pretrained_model_path = '/Users/mayankkejriwal/ubuntu-vm-stuff/home/mayankkejriwal/tmp/www-experiments/stanford-ner-2015-12-09/'
    all3class = pretrained_model_path + 'classifiers/english.all.3class.distsim.crf.ser.gz'
    conll4class = pretrained_model_path + 'classifiers/english.conll.4class.distsim.crf.ser.gz'
    muc7class = pretrained_model_path + 'classifiers/english.muc.7class.distsim.crf.ser.gz'
    st_muc = StanfordNERTagger(muc7class, pretrained_model_path + 'stanford-ner.jar', encoding='utf-8')
    st_conll = StanfordNERTagger(conll4class, pretrained_model_path + 'stanford-ner.jar', encoding='utf-8')
    st_3class = StanfordNERTagger(all3class, pretrained_model_path + 'stanford-ner.jar', encoding='utf-8')
    annotated_cities_file = '/Users/mayankkejriwal/datasets/memex-evaluation-november/annotated-cities/ann_city_title_state_1_50.txt'
    TP = 0
    FP = 0
    FN = 0
    with codecs.open(annotated_cities_file, 'r', 'utf-8') as f:
        for line in f:
            obj = json.loads(line)
            text = obj['high_recall_readability_text']
            tokenized_text = word_tokenize(text)
            classified_text_muc = st_muc.tag(tokenized_text)
            classified_text_conll = st_conll.tag(tokenized_text)
            classified_text_3class = st_3class.tag(tokenized_text)
            tagged_locations = set()
            correct_locations = _build_locations_true_positives_set(
                obj, ['correct_cities', 'correct_states', 'correct_cities_title'])
            for i in range(0, len(classified_text_muc)):
                tag_muc = classified_text_muc[i]
                tag_conll = classified_text_conll[i]
                tag_3class = classified_text_3class[i]
                # Alternative: accept a token if any of the three models tags it:
                # if str(tag_muc[1]) == 'LOCATION' or str(tag_conll[1]) == 'LOCATION' or str(tag_3class[1]) == 'LOCATION':
                if str(tag_3class[1]) == 'LOCATION':
                    tagged_locations.add(tag_3class[0].lower())
            TP += len(tagged_locations.intersection(correct_locations))
            FP += (len(tagged_locations) - len(tagged_locations.intersection(correct_locations)))
            FN += (len(correct_locations) - len(tagged_locations.intersection(correct_locations)))
    print('TP, FP, FN are...')
    print(TP)
    print(FP)
    print(FN)
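# trial2() above only prints raw TP/FP/FN counts. A minimal helper (not part of
# the original code) to turn those counts into precision, recall, and F1:
def precision_recall_f1(tp, fp, fn):
    """Standard precision/recall/F1 from raw true-positive/false-positive/false-negative counts."""
    precision = float(tp) / (tp + fp) if (tp + fp) else 0.0
    recall = float(tp) / (tp + fn) if (tp + fn) else 0.0
    f1 = 2 * precision * recall / (precision + recall) if (precision + recall) else 0.0
    return precision, recall, f1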
def get_location(loc):
    """
    Currently working only on my computer.
    English model: english.muc.7class.distsim.crf.ser.gz
    German models: german.dewac_175m_600.crf.ser.gz
                   german.hgc_175m_600.crf.ser.gz
    """
    # Named Entity Recognizer: recognizes named entities and assigns types
    # like location, person, organization to each entity.
    st = StanfordNERTagger('stanford-ner-2015-12-09/classifiers/english.muc.7class.distsim.crf.ser.gz',
                           'stanford-ner-2015-12-09/stanford-ner-3.6.0.jar')
    loc_ner = st.tag(loc)
    # Might be faster searching from back to front.
    # 'LOCATION' for English, 'I-LOC' for German.
    # Glue named entities like 'New York' back together.
    loc_tuples = [item[0] for item in loc_ner if 'LOCATION' in item]
    try:
        location = loc_tuples[0]
        if len(loc_tuples) > 1:
            for i in range(1, len(loc_tuples)):
                location += ' ' + loc_tuples[i]
    except IndexError:
        # No location was specified.
        return None
    return location
def extract_named_entities(threadName, output_collection, fetchedTweets):
    st = StanfordNERTagger('english.all.3class.distsim.crf.ser.gz')
    try:
        counter = 0
        mongo_list = []
        for fetchedTweet in fetchedTweets:
            counter += 1
            named_entities = []
            sentence = fetchedTweet['cleaned_text']
            neList = st.tag(sentence.split())
            for ne in neList:
                if ne[1] in ['PERSON', 'ORGANIZATION', 'LOCATION']:
                    named_entities.append((ne[0], ne[1]))
            fetchedTweet['named_entities'] = named_entities
            mongo_list.append(fetchedTweet)
            # Write to MongoDB in batches of 100 tweets.
            if counter % 100 == 0:
                logging.info("{}: Tweets processed: {} tweets".format(threadName, counter))
                write_mongo(threadName, output_collection, mongo_list)
                mongo_list = []
        # Flush any remaining tweets.
        if len(mongo_list) > 0:
            write_mongo(threadName, output_collection, mongo_list)
            mongo_list = []
    except Exception as e:
        print(e)
        sys.exit()
def test_model_in_mem(stanford_ner_path, model_name, sent_obj, type):
    stanford_tagger = StanfordNERTagger(
        model_name,
        stanford_ner_path,
        encoding='utf-8')

    text = sent_obj.sentence
    tokenized_text = list()
    spans = list()
    # Recover token spans here.
    for match in re.finditer(r"\S+", text):
        start = match.start()
        end = match.end()
        word = match.group(0)
        tokenized_text.append(word.rstrip(",.;:"))
        spans.append((start, end))

    tokenized_text = strip_sec_headers_tokenized_text(tokenized_text)
    classified_text = stanford_tagger.tag(tokenized_text)

    # Expand each tuple to carry its span as well. Headers were stripped in the
    # previous step, so if that occurred we have to account for the offset.
    len_diff = len(spans) - len(classified_text)
    final_class_and_span = list()
    for idx, tup in enumerate(classified_text):
        combined = (classified_text[idx][0], classified_text[idx][1],
                    spans[idx + len_diff][0], spans[idx + len_diff][1])
        final_class_and_span.append(combined)

    sent_obj.tok_sent_with_crf_predicted_attribs[type] = final_class_and_span
    return sent_obj
def pretag(self):
    text = self.text
    st = StanfordNERTagger("/Users/victorstorchan/Downloads/stanford-ner-2014-06-16/classifiers/english.all.3class.distsim.crf.ser.gz",
                           "/Users/victorstorchan/Downloads/stanford-ner-2014-06-16/stanford-ner.jar")
    paragraphs = []
    paragraphs_string = ''
    for x in text:
        paragraphs.append(str(x))
    paragraphs_string = ' '.join(paragraphs)
    tagging = st.tag(paragraphs_string.split())
    symlist = ['company', 'corporation', 'multinational', 'Corporation',
               'open-source', 'social', 'network', 'software', 'system']
    badlist = ['integrated', 'first', 'check', 'computer', 'linear',
               'solution', 'services', 'limited', 'tech', 'solutions',
               'technology', 'open', 'model', 'on', 'applied', 'network',
               'pricing', 'customers', 'social', 'big', 'subscribe', 'social',
               'sign', 'monitor', 'software', 'machine', 'learning', 'compute',
               'management', 'up']
    badlist_stem = []
    self.badlist = badlist
    self.symlist = symlist
    for i in range(len(badlist)):
        badlist_stem.append(stemmer.stem(badlist[i]))
    self.badlist_stem = badlist_stem
    pretag1 = [tag for (tag, label) in tagging
               if label in set(("ORGANIZATION", "PERSON"))
               or (count_upper(tag) >= 2 and len(tag) < 11)]
    pretag2 = [tag for (tag, label) in tagging
               if tag.lower() in dict_1m or tag in dict_apps]
    pretag3 = [tag for (tag, label) in tagging if tag.lower() in dict_tech]
    pretag = pretag1 + pretag2 + pretag3
    domain2synsets = defaultdict(list)
    synset2domains = defaultdict(list)
    self.pretag = pretag
def ner():
    os.environ['STANFORD_NER_PATH'] = '/Users/CHOON/Desktop/choon94.github.io/week5/nlp2/stanfordNer'
    os.environ['STANFORD_POSTAGGER_PATH'] = '/Users/CHOON/Desktop/choon94.github.io/week5/nlp2/stanford-postagger-full-2014-08-27'
    os.environ['CLASSPATH'] = '/Users/CHOON/Desktop/choon94.github.io/week5/nlp2/stanfordNer/stanford-ner.jar'
    os.environ['STANFORD_POSTAGGER'] = os.environ['CLASSPATH']

    eng_tagger = StanfordNERTagger('/Users/CHOON/Desktop/choon94.github.io/week5/nlp2/stanfordNer/classifiers/english.all.3class.distsim.crf.ser.gz')
    for x in content:
        print(eng_tagger.tag(x.split()))
def NERTagging(text):
    log_file = open("Dump/log/Main_output.txt", "a")
    st = StanfordNERTagger('resources/ner/classifiers/english.all.3class.distsim.crf.ser.gz',
                           'resources/ner/stanford-ner.jar',
                           encoding='utf-8')
    tokenized_text = word_tokenize(text)
    classified_text = st.tag(tokenized_text)
    log_file.write('NER \n %s \n' % classified_text)
    print(classified_text)
    log_file.close()
    return
def getEntityCount(tweet):
    # Use the Stanford NER tagger.
    st = StanfordNERTagger('english.all.3class.distsim.crf.ser.gz')
    # Tokenize the tweet.
    tokenized_text = word_tokenize(tweet)
    classified_text = st.tag(tokenized_text)
    countPerson = 0
    for text in classified_text:
        if "PERSON" in text[1]:
            countPerson += 1
    return countPerson
def nltk_ner(remainders):
    st = StanfordNERTagger('../stanford-ner/english.all.3class.distsim.crf.ser.gz',
                           '../stanford-ner/stanford-ner.jar')
    for item in remainders:
        name = ""
        tagged = st.tag(item.split())
        for entity in tagged:
            if entity[1] == u'PERSON':
                name += (entity[0].title() + ' ')
        # Return as soon as a PERSON entity is found.
        if name:
            return True, name, item
    # No PERSON entity was found in any remainder.
    return False, "", None
def getEntities(self, sentence):
    st = StanfordNERTagger(
        '/home/ubuntu/english.all.3class.distsim.crf.ser.gz',
        '/home/ubuntu/stanford-ner.jar',
        encoding='utf-8')
    tokenized_text = word_tokenize(sentence)
    tagged = st.tag(tokenized_text)  # tag once and reuse the result
    classified_text = [i[0] for i in tagged if i[1] != "O"]
    stanfordNames = [i[0] for i in tagged if i[1] == "PERSON"]

    # Cross-check spaCy's PERSON entities against the Stanford tags.
    nlp = spacy.load('en')
    text = nlp(sentence.decode("utf-8"))
    res = set([])
    for X in text.ents:
        if X.label_ == "PERSON":
            name = X.text.split()
            if len(name) > 1:
                if name[0] in classified_text and name[1] in classified_text:
                    res.add(X.text)
                elif name[0] in classified_text:
                    res.add(name[0])
                elif name[1] in classified_text:
                    res.add(name[1])
            else:
                res.add(X.text)
    for name in stanfordNames:
        res.add(name)
    # Treat @mentions as names as well.
    for word in sentence.split():
        if word[0] == "@":
            res.add(word[1:])
    return res
def main():
    f = open("ada_lovelace.txt")
    raw = f.read()
    tokens = nltk.word_tokenize(raw)
    tagged = nltk.pos_tag(tokens)
    nouns = [token for token, pos in tagged if pos.startswith('N')]
    lemmatizer = WordNetLemmatizer()
    noun_lemmas = []
    st = StanfordNERTagger(
        '/home/thomas/Downloads/stanford-ner-2018-02-27/classifiers/english.conll.4class.distsim.crf.ser.gz',
        '/home/thomas/Downloads/stanford-ner-2018-02-27/stanford-ner.jar')
    l = st.tag(raw.split())
    sorted_l = sorted(l, key=lambda x: x[1])
    print(sorted_l)

    # Exercise 2.3
    print("Exercise 2.3, nouns: ")
    n = st.tag(nouns)
    # Non-entities are tagged 'O' (the letter), not '0' (zero).
    new = [tuple(s if s != "O" else "MISC" for s in tup) for tup in n]
    print(new)
def sf_ner_tagger(self, para):
    stanford_ner_dir = '/Users/Rena/StandfordParserData/stanford-ner-2018-02-27/'
    eng_model_filename = stanford_ner_dir + 'classifiers/english.all.3class.distsim.crf.ser.gz'
    my_path_to_jar = stanford_ner_dir + 'stanford-ner.jar'
    tagger = StanfordNERTagger(model_filename=eng_model_filename,
                               path_to_jar=my_path_to_jar)
    # Keep only purely alphabetic tokens.
    pattern = '^[A-Za-z]+$'
    tokens = [t for t in para.split() if re.match(pattern, t)]
    return tagger.tag(tokens)
def entity_recognition_stanford(tokens, base_path):
    '''Uses the Stanford NER model wrapped in NLTK.'''
    classifier_model_path = base_path + '/english.muc.7class.distsim.crf.ser.gz'
    ner_jar_path = base_path + '/stanford-ner.jar'
    stanford_tagger = StanfordNERTagger(classifier_model_path,
                                        ner_jar_path,
                                        encoding='utf-8')
    ner_tagged = stanford_tagger.tag(tokens)
    # Replace every tagged entity with its type, e.g. 'Paris' -> '|LOCATION|'.
    replaced_text_list = [word[0] if word[1] == "O" else str('|' + word[1] + '|')
                          for word in ner_tagged]
    return ' '.join(replaced_text_list)
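# Example use of entity_recognition_stanford. The base_path below is a
# hypothetical install location, and the output is what the 7-class model
# would be expected to produce for this input:
# tokens = word_tokenize("Angela Merkel visited Paris in 2017.")
# entity_recognition_stanford(tokens, '/opt/stanford-ner')
# -> '|PERSON| |PERSON| visited |LOCATION| in |DATE| .'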
def find_NER_type(tokenized_text):
    """
    Named Entity Recognition. Finds the NER type of each token.
    :param tokenized_text:
    :return:
    """
    from nltk.tag import StanfordNERTagger
    st = StanfordNERTagger('../exist-stanford-ner/resources/classifiers/english.all.3class.distsim.crf.ser.gz',
                           '../exist-stanford-ner/java/lib/stanford-ner-2015-04-20.jar',
                           encoding='utf-8')
    classified_text = st.tag(tokenized_text)
    return classified_text
class NerAnalyzer:
    def __init__(self, modelFileFile, nerJarFile):
        self.st = StanfordNERTagger(modelFileFile, nerJarFile)

    def analyze(self, text):
        result = {}
        if text:
            tags = self.st.tag(str(text).split())
            personTags = [item[0] for item in tags if item[1] == 'PERSON']
            if personTags:
                result['Person tags by NER'] = personTags
        return result
def recognizeEntities(line):
    try:
        tagger = StanfordNERTagger(
            "Stanford-NER/english.muc.7class.distsim.crf.ser.gz",
            "Stanford-NER/stanford-ner.jar",
            encoding="utf-8")
    except LookupError as e:
        print(e)
        raise SystemExit
    tokens = word_tokenize(line)
    taggedTokens = tagger.tag(tokens)
    return taggedTokens
def start(raw_text, username):
    # Stanford classifier and Stanford NER paths.
    stanford_classifier = r'..\stanford-ner-2017-06-09\classifiers\english.muc.7class.distsim.crf.ser.gz'
    stanford_ner_path = r'..\stanford-ner-2017-06-09\stanford-ner.jar'

    # Creating the tagger object.
    st = StanfordNERTagger(stanford_classifier, stanford_ner_path, encoding='utf-8')

    # Tokenize the sentences and the words.
    sentence_tokenized = nltk.sent_tokenize(raw_text)
    word_tokenized = [nltk.word_tokenize(sent) for sent in sentence_tokenized]

    # Create a list of lists holding the chunked words (with entities) for each
    # tokenized sentence.
    all_chunked_words = []
    for words in word_tokenized:
        chunked_words = GetChunkWords(st.tag(words))
        all_chunked_words.append(chunked_words)

    namedentities = []
    for words in word_tokenized:
        namedentities.append(WriteChunkedWordsToFile(st.tag(words)))
    with open('output\\namedentities.txt', 'w') as outfile:
        json.dump(namedentities, outfile)

    # Create a list of sentiment values for each sentence in the raw text.
    sentiment_list = GetListOfSentimentsforeachSentence(raw_text)

    # Create a dictionary where the key is the token and the value is the
    # average sentiment.
    all_tokens_sentiment = GetListOfSentimentsforeachToken(all_chunked_words, sentiment_list)

    with open('output\\' + username + '.txt', 'w+') as outfile:
        json.dump(all_tokens_sentiment, outfile)
def detection(document):
    # Stanford's 3-class NER tagger: PERSON, LOCATION, ORGANIZATION, O (other).
    stan_ner = StanfordNERTagger(
        '/home/iraklis/Desktop/PhDLocal/Tools/stanford-ner-2017-06-09/classifiers/'
        'english.all.3class.distsim.crf.ser.gz',
        '/home/iraklis/Desktop/PhDLocal/Tools/stanford-ner-2017-06-09/'
        'stanford-ner.jar',
        encoding='utf-8')

    tokenized_text = word_tokenize(document)
    classified_text = stan_ner.tag(tokenized_text)
    write_on_file(classified_text)

    # Chunk entities together (for example a first name with a last name).
    previous_tuple = classified_text[0]
    entity_str = ""
    if previous_tuple[1] != 'O':
        entity_str += previous_tuple[0]
    store_entities = dict()
    store_entities["P"] = []
    store_entities["L"] = []
    store_entities["O"] = []
    for txt in classified_text[1:-1]:
        if txt[1] != 'O':
            if txt[1] == previous_tuple[1]:
                entity_str += " " + txt[0]
            else:
                entity_str = txt[0]
        else:
            if entity_str != "":
                store_entities[previous_tuple[1][0]].append(entity_str)
                entity_str = ""
        previous_tuple = txt

    # OrderedDict removes duplicates while preserving insertion order.
    store_entities["P"] = remove_entities(
        list(OrderedDict((x, True) for x in store_entities["P"]).keys()))
    store_entities["L"] = remove_entities(
        list(OrderedDict((x, True) for x in store_entities["L"]).keys()))
    store_entities["O"] = remove_entities(
        list(OrderedDict((x, True) for x in store_entities["O"]).keys()))

    # Transform the result so it contains the entities of each article
    # sentence. The article-level entities are calculated when needed.
    sentence_entities = entity_tools.transform_article_dict(
        document, store_entities)
    return sentence_entities
def remove_names(text):
    meaningful_words = []
    tagged_word = []
    tags = ['LOCATION', 'ORGANIZATION', 'PERSON']
    st = StanfordNERTagger('english.all.3class.distsim.crf.ser.gz')
    for sentence in text:
        tagged_word.append(st.tag(sentence))
    for words in tagged_word:
        for word in words:
            if word[1] in tags:
                meaningful_words.append(word[0])
    return meaningful_words
def orgs(text):
    st = StanfordNERTagger('trained_model/stanford_ner/classifiers/english.all.3class.distsim.crf.ser.gz',
                           'trained_model/stanford_ner/stanford-ner.jar',
                           encoding='utf-8')
    tokenized_text = word_tokenize(text)
    classified_text = st.tag(tokenized_text)
    names_list = []
    for tag_tuple in classified_text:
        if tag_tuple[1] == 'ORGANIZATION':
            names_list.append(tag_tuple[0])
    return names_list
def text_tokinization(text):
    # Change the paths according to your system.
    stanford_classifier = r'C:\Farshid\programming\Java\stanford-ner-2016-10-31\classifiers\english.all.3class.distsim.crf.ser.gz'
    stanford_ner_path = r'C:\Farshid\programming\Java\stanford-ner-2016-10-31\stanford-ner.jar'

    # Creating the tagger object.
    st = StanfordNERTagger(stanford_classifier, stanford_ner_path, encoding='utf-8')
    tokenizedText = word_tokenize(text)
    classifiedText = st.tag(tokenizedText)
    return classifiedText
def reader():
    # Text mode so plain strings can be read and written.
    f = open("../chat_history/q.txt", 'r')
    file = open("../chat_history/ner_train2.txt", 'a')
    tagger = StanfordNERTagger('/Users/vishnuchopra/Project/stanford-ner-2017-06-09/classifiers/english.conll.4class.distsim.crf.ser.gz',
                               '/Users/vishnuchopra/Project/stanford-ner-2017-06-09/stanford-ner.jar')
    for line in f:
        print(line)
        line = line.strip()
        tags = tagger.tag(line.split())
        for tag in tags:
            nert = tag[1]
            print(nert + ' ')
            file.write(nert + ' ')
        file.write('\n')
        print('\n')
def stanford_ner(words, args):
    start = time.time()
    """
    3 class: Location, Person, Organization
    4 class: Location, Person, Organization, Misc
    7 class: Location, Person, Organization, Money, Percent, Date, Time
    """
    ner_classifier_path = 'english.all.3class.distsim.crf.ser.gz'  # default: 3 class
    if args.ner_class == 7:
        ner_classifier_path = 'english.muc.7class.distsim.crf.ser.gz'
    elif args.ner_class == 4:
        ner_classifier_path = 'english.conll.4class.distsim.crf.ser.gz'
    ner_classifier_full_path = os.path.join(stanford_ner_directory_path,
                                            'classifiers', ner_classifier_path)
    ner_jar_path = os.path.join(stanford_ner_directory_path, 'stanford-ner.jar')

    s_ner_tagger = StanfordNERTagger(ner_classifier_full_path, ner_jar_path,
                                     encoding='UTF-8')
    _tagged = s_ner_tagger.tag(words)

    # BIO tag processing (B = beginning of NE, I = inside NE, O = outside NE).
    bio_tagged = []
    prev_tag = "O"
    for token, tag in _tagged:
        if tag == "O":  # O
            bio_tagged.append((token, tag))
            prev_tag = tag
            continue
        if tag != "O" and prev_tag == "O":  # Begin NE
            bio_tagged.append((token, "B-" + tag))
            prev_tag = tag
        elif prev_tag != "O" and prev_tag == tag:  # Inside NE
            bio_tagged.append((token, "I-" + tag))
            prev_tag = tag
        elif prev_tag != "O" and prev_tag != tag:  # Adjacent NE
            bio_tagged.append((token, "B-" + tag))
            prev_tag = tag

    # Convert the BIO tags to NLTK's tree format.
    tokens, ne_tags = zip(*bio_tagged)
    pos_tags = [pos for token, pos in get_pos_tags(tokens, args)]
    conlltags = [(token, pos, ne) for token, pos, ne in zip(tokens, pos_tags, ne_tags)]
    ne_tree = conlltags2tree(conlltags)

    print('Stanford NER took %.3f sec, NEs are:\n %s\n' % (
        time.time() - start, structure_ne(ne_tree)))
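# Worked example of the BIO conversion above: the tagger's flat output
#   [('Stony', 'ORGANIZATION'), ('Brook', 'ORGANIZATION'), ('University', 'ORGANIZATION'),
#    ('in', 'O'), ('NY', 'LOCATION')]
# becomes
#   [('Stony', 'B-ORGANIZATION'), ('Brook', 'I-ORGANIZATION'), ('University', 'I-ORGANIZATION'),
#    ('in', 'O'), ('NY', 'B-LOCATION')]
# so that conlltags2tree() can recover the entity boundaries.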
def Locations_in_text(text):
    # Tag every word in the text and return the words tagged "LOCATION" as a list.
    st = StanfordNERTagger(NER_FOLDER + 'english.all.3class.distsim.crf.ser.gz')
    TagedNamedEntity = st.tag(text.split())
    Locations = []
    for NamedEntity in TagedNamedEntity:
        name = NamedEntity[0]
        tag = NamedEntity[1]
        if tag == "LOCATION":
            Locations.append(name)
    return Locations
def extractNER(sentence):
    model = '/Users/pranavr/anaconda/lib/python3.5/site-packages/stanford-ner-2017-06-09/classifiers/english.muc.7class.distsim.crf.ser.gz'
    jar = '/Users/pranavr/anaconda/lib/python3.5/site-packages/stanford-ner-2017-06-09/stanford-ner.jar'
    st = StanfordNERTagger(model, jar, encoding='utf-8')
    tokenized_text = word_tokenize(sentence)
    classified_text = st.tag(tokenized_text)
    taggedWords = []
    for tup in classified_text:
        if tup[1] != 'O':
            taggedWords.append(tup)
    return taggedWords
def ret_loc_ner(tweet):
    st = StanfordNERTagger('stanford_ner/english.all.3class.distsim.crf.ser.gz',
                           'stanford_ner/stanford-ner.jar',
                           encoding='utf-8')
    tokenized_text = word_tokenize(tweet)
    # Note: dict() collapses repeated tokens, keeping only the last tag of each.
    classified_text = dict(st.tag(tokenized_text))
    list_of_locations = []
    for word, entity in classified_text.items():
        if entity == "LOCATION":
            list_of_locations.append(word)
    return list_of_locations
def get_ner_document_tags(self, document_id):
    ner = StanfordNERTagger(
        'C:/Users/1/James/stanford-ner-2015-12-09/classifiers/english.all.3class.distsim.crf.ser.gz',
        'C:/Users/1/James/stanford-ner-2015-12-09/stanford-ner.jar')
    for document in self.corpus[document_id]:
        for line in document:
            # tag() expects a list of tokens (assuming each line is a token list);
            # joining into a single string would tag individual characters.
            tags = ner.tag(line)
            entity_words = []
            for (word, tag) in tags:
                if tag != 'O':
                    entity_words.append(word)
            self.ner_corpus_tags.extend(entity_words)
    return self.ner_corpus_tags
class named_entity:
    def __init__(self):
        self.tagger = StanfordNERTagger(
            'stanford-ner-2018-10-16/classifiers/english.all.3class.distsim.crf.ser.gz',
            'stanford-ner-2018-10-16/stanford-ner.jar',
            encoding='utf-8')
        self.named_entity_count = {}

    def get_named_entity(self, tweet):
        per, loc, org = 0, 0, 0
        text = tweet[1]
        tokenized_tweet = word_tokenize(text)
        taggs = self.tagger.tag(tokenized_tweet)
        _dic = {}
        _dic["PERSON"] = ""
        _dic["LOCATION"] = ""
        _dic["ORGANIZATION"] = ""
        # Group consecutive tokens that share a tag into one entity chunk.
        for tag, chunk in groupby(taggs, lambda x: x[1]):
            ans = " ".join(w for w, t in chunk)
            if tag in _dic:
                _dic[tag] += " " + str(ans.encode('utf-8'))
            if str(tag) == "O":
                continue
            if str(tag) == "PERSON":
                per += 1
            elif str(tag) == "LOCATION":
                loc += 1
            elif str(tag) == "ORGANIZATION":
                org += 1
            if str(ans.encode('utf-8')).lower() in self.named_entity_count:
                self.named_entity_count[str(ans.encode('utf-8')).lower()] += 1
            else:
                self.named_entity_count[str(ans.encode('utf-8')).lower()] = 1
        entry = []
        entry.append(tweet[0])
        entry.append(tweet[2])
        entry.append(tweet[1])
        entry.extend([per, loc, org])
        return entry, _dic
def doc_ents(filename, f):
    ner = StanfordNERTagger(
        'C:/Users/1/James/stanford-ner-2015-12-09/classifiers/english.all.3class.distsim.crf.ser.gz',
        'C:/Users/1/James/stanford-ner-2015-12-09/stanford-ner.jar')
    document = []
    dd = {}
    with open(f, 'r') as myfile:
        text = myfile.readlines()
    for i, (word, tag) in enumerate(ner.tag(text)):
        if tag != 'O':
            document.append(word)
    dd[remove_extenstion(filename).title()] = set(document)
    return dd
def NER(text, tagName):
    import nltk
    from nltk.tag import StanfordNERTagger

    st = StanfordNERTagger('./english.all.3class.distsim.crf.ser.gz',
                           './stanford-ner.jar',
                           encoding='utf-8')
    name = []
    for sent in nltk.sent_tokenize(text):
        tokens = nltk.tokenize.word_tokenize(sent)
        tags = st.tag(tokens)
        for tag in tags:
            if tag[1] == tagName:
                name.append(tag[0])
    # Collect matches from every sentence before joining.
    return " ".join(name)
def namedEntityRecog(x):
    stanford_ner_tagger = StanfordNERTagger(
        'Documents/projects/praveenProject/sparkNLP/stanford_ner/classifiers/english.muc.7class.distsim.crf.ser.gz',
        'Documents/projects/praveenProject/sparkNLP/stanford_ner/stanford-ner-3.9.2.jar')
    results = stanford_ner_tagger.tag(x)
    list1 = []
    for result in results:
        tag_value = result[0]
        tag_type = result[1]
        if tag_type != 'O':
            list1.append('Type: %s, Value: %s' % (tag_type, tag_value))
    filtered_list = [s for s in list1 if s != ""]
    return filtered_list
def get_NER(text):
    st = StanfordNERTagger('english.all.3class.distsim.crf.ser.gz',
                           'stanford-ner.jar',
                           encoding='UTF8')
    tokens = word_tokenize(text)
    NER_text = st.tag(tokens)
    # 'w' creates the file if it does not exist, so no exists() check is needed.
    with open('NER_log.txt', 'w', encoding='UTF8') as f:
        for i in NER_text:
            if i[1] != 'O':
                f.write(str(i))
    return NER_text
class NERClassifier(object):
    def __init__(self):
        from nltk.tag import StanfordNERTagger
        parentdir = osp.join(osp.abspath(osp.join(os.getcwd(), os.pardir)),
                             'QuoraQuestionPairs')
        jarfile = osp.join(parentdir, 'data', 'stanford-ner-2015-04-20',
                           'stanford-ner-3.5.2.jar')
        modelfile = osp.join(parentdir, 'data', 'stanford-ner-2015-04-20',
                             'classifiers',
                             'english.all.3class.distsim.crf.ser.gz')
        self.tagger = StanfordNERTagger(modelfile, path_to_jar=jarfile)

    def __call__(self, sentence):
        if isinstance(sentence, list):
            return self.tagger.tag(sentence)
def trial1():
    """
    Just to make sure we're not screwing everything up.
    :return:
    """
    st = StanfordNERTagger('/Users/mayankkejriwal/ubuntu-vm-stuff/home/mayankkejriwal/tmp/stanford-ner-2015-12-09/annotated-cities-model.ser.gz',
                           '/Users/mayankkejriwal/ubuntu-vm-stuff/home/mayankkejriwal/tmp/stanford-ner-2015-12-09/stanford-ner.jar',
                           encoding='utf-8')
    text = 'While in France, Mrs. Christine Lagarde discussed short-term stimulus efforts in a recent interview with the Wall Street Journal.'
    tokenized_text = word_tokenize(text)
    classified_text = st.tag(tokenized_text)
    print(classified_text)
def test_chinese_ner():
    # sent = u'小明硕士毕业于中国科学院计算所后在日本京都大学深造'
    sent = u'习近平于1978年至1982年在江苏工学院农业机械工程系农业机械专业学习,获工学学士学位'
    tokens = jieba.cut(sent)
    new_tokens = []
    for x in tokens:
        print(x)
        new_tokens.append(x)
    new_sent = u" ".join(new_tokens)
    print(new_sent)
    chi_tagger = StanfordNERTagger('chinese.misc.distsim.crf.ser.gz')
    # sent = u'北海 已 成为 中国 对外开放 中 升起 的 一 颗 明星'
    for word, tag in chi_tagger.tag(new_sent.split()):
        print(word, tag)
def stanford_ne_tagger(tokens):
    st = StanfordNERTagger('english.all.3class.distsim.crf.ser.gz')
    # Add every jar in the Stanford directory to the tagger's classpath.
    stanford_dir = st._stanford_jar.rpartition('/')[0]
    stanford_jars = find_jars_within_path(stanford_dir)
    st._stanford_jar = ':'.join(stanford_jars)

    tags = st.tag(tokens)
    continuous_chunks = get_continuous_chunks(tags)
    named_entities_str_tag = set()
    for ne in continuous_chunks:
        if ne[0][1] == u'LOCATION':
            named_entities_str_tag.add(u' '.join(token for token, tag in ne).lower())
    return named_entities_str_tag
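# Several snippets here call get_continuous_chunks() without defining it. A
# minimal sketch of such a helper (an assumption, not the original code):
# group consecutive (token, tag) pairs that share a non-'O' tag into chunks.
from itertools import groupby

def get_continuous_chunks(tagged_tokens):
    """Group adjacent (token, tag) pairs with the same non-'O' tag into entity chunks."""
    return [list(group)
            for tag, group in groupby(tagged_tokens, key=lambda pair: pair[1])
            if tag != 'O']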
def get_namedentities(text):
    """Returns named entities in text using StanfordNERTagger."""
    st = StanfordNERTagger('utils/english.conll.4class.caseless.distsim.crf.ser.gz',
                           'utils/stanford-ner.jar')
    ner_tagged = st.tag(text.lower().split())
    named_entities = []
    if len(ner_tagged) > 0:
        for n in ner_tagged:
            if n[1] != 'O':
                named_entities.append(remove_punctuation(n[0]))
    named_entities = [n for n in named_entities if n]
    return named_entities
def get_relations(articles):
    relations = {
        'NODE1': [],
        'NODE2': [],
        'TYPE': [],
        'DATE': [],
        'SOURCEINTEXT': [],
        'SOURCE': [],
        'CONTEXT': []
    }
    gz_path = "stanford-ner-2018-10-16/classifiers/english.all.3class.distsim.crf.ser.gz"
    jar_path = 'stanford-ner-2018-10-16/stanford-ner.jar'
    st = StanfordNERTagger(gz_path, jar_path, encoding='utf-8')

    for text in range(len(articles['TEXT'])):
        print('Analysing for sentiment and relations in:\n', articles['SOURCE'][text])
        sentences = sentence(articles['TEXT'][text])
        for sent in sentences:
            # Sentiment analysis.
            sentiment = TextBlob(sent).sentiment[0]
            if sentiment > 0:
                sentiment = 'positive'
            elif sentiment < 0:
                sentiment = 'negative'
            else:
                sentiment = 'neutral'

            # Relation extraction.
            tags = st.tag(sent.split())
            uniques = []
            for t in tags:
                if t not in uniques:
                    uniques.append(t)
            relate = combinations(grouptags(uniques), 2)

            # Add to the output.
            for r in relate:
                relations['NODE1'].append(strip_punct(r[0][1]))
                relations['NODE2'].append(strip_punct(r[1][1]))
                relations['TYPE'].append(sentiment)
                relations['DATE'].append(articles['Date'][text])
                try:
                    relations['SOURCEINTEXT'].append(','.join(articles['Authors'][text]))
                except TypeError:
                    relations['SOURCEINTEXT'].append(None)
                relations['SOURCE'].append(articles['SOURCE'][text])
                relations['CONTEXT'].append(sent)
    print('Finished analysing all news sources.')
    return relations
def stanford_entities(model, jar, fileids=None, corpus=kddcorpus, section=None):
    """
    Extract entities using the Stanford NER tagger.
    Must pass in the path to the tagging model and jar as downloaded from the
    Stanford Core NLP website.
    """
    results = defaultdict(lambda: defaultdict(list))
    fileids = fileids or corpus.fileids()
    tagger = StanfordNERTagger(model, jar)

    for fileid in fileids:
        if section is not None:
            text = nltk.word_tokenize(
                list(sectpull([fileid], section=section))[0][1])
        else:
            text = corpus.words(fileid)

        chunk = []
        for token, tag in tagger.tag(text):
            if tag == 'O':
                if chunk:
                    # Flush the current chunk.
                    etext = " ".join([c[0] for c in chunk])
                    etag = chunk[0][1]
                    chunk = []
                    if etag == 'PERSON':
                        key = 'persons'
                    elif etag == 'ORGANIZATION':
                        key = 'organizations'
                    elif etag == 'LOCATION':
                        key = 'locations'
                    else:
                        key = 'other'
                    results[fileid][key].append(etext)
            else:
                # Build the chunk from tags.
                chunk.append((token, tag))
    return results
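# Usage sketch for stanford_entities (paths and fileid are illustrative):
# results = stanford_entities('classifiers/english.all.3class.distsim.crf.ser.gz',
#                             'stanford-ner.jar', fileids=['kdd_paper_01.txt'])
# results['kdd_paper_01.txt']['persons']  # list of PERSON chunks found in that file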
class StanfordNER(NER):
    def __init__(self, path=None):
        if path is None:
            path = os.path.dirname(os.path.realpath(__file__)) + '/'
        self.ner = StanfordNERTagger(
            path + 'stanford/classifiers/english.all.3class.distsim.crf.ser.gz',
            path + 'stanford/stanford-ner.jar',
            encoding='utf-8')
        self.set = NER.allowed_tags

    def tag(self, text, language=None, **kwargs):
        tokenized_text = word_tokenize(text)
        classified_text = self.ner.tag(tokenized_text)
        return classified_text
class Dictionary:
    def __init__(self):
        self.dictionary = PyDictionary()
        self.st = StanfordNERTagger(
            r'D:\Python\stanford-ner-2016-10-31\classifiers\english.all.3class.distsim.crf.ser.gz',
            r'D:\Python\stanford-ner-2016-10-31\stanford-ner.jar')

    def getMeaning(self, word):
        tokenized_text = word_tokenize(word)
        classified_text = self.st.tag(tokenized_text)
        print(classified_text)
        for i in classified_text:
            # Stanford NER tags non-entities as 'O'; plain dictionary words
            # fall in that class ('WORD' is not a tag the tagger emits).
            if i[1] == 'O':
                return self.dictionary.meaning(i[0])
def classify_text(text):
    """Using the 3-class Stanford Named Entity Recognition model, classify
    each word in the input text as PERSON, LOCATION, ORGANIZATION, or O
    (for other)."""
    directory = "C:/Users/liabbott/Documents/Projects/CBP OIT/stanford_ner/"
    mod = "classifiers/english.all.3class.distsim.crf.ser.gz"
    tag = "stanford-ner.jar"
    path_to_model = os.path.normpath(directory + mod)
    path_to_tagger = os.path.normpath(directory + tag)
    st = StanfordNERTagger(path_to_model, path_to_tagger, encoding='utf-8')

    tokenized_text = word_tokenize(text)
    classified_text = st.tag(tokenized_text)
    return classified_text
def stanford_entities(model, jar, fileids=None, corpus=kddcorpus, section=None):
    """
    Extract entities using the Stanford NER tagger.
    Must pass in the path to the tagging model and jar as downloaded from the
    Stanford Core NLP website. This variant keeps only LOCATION entities;
    everything else is filed under 'other'.
    """
    results = defaultdict(lambda: defaultdict(list))
    fileids = fileids or corpus.fileids()
    tagger = StanfordNERTagger(model, jar)

    for fileid in fileids:
        if section is not None:
            text = nltk.word_tokenize(
                list(sectpull([fileid], section=section))[0][1])
        else:
            text = corpus.words(fileid)

        chunk = []
        for token, tag in tagger.tag(text):
            if tag == 'O':
                if chunk:
                    # Flush the current chunk.
                    etext = " ".join([c[0] for c in chunk])
                    etag = chunk[0][1]
                    chunk = []
                    if etag == 'LOCATION':
                        key = 'locations'
                    else:
                        key = 'other'
                    results[fileid][key].append(etext)
            else:
                # Build the chunk from tags.
                chunk.append((token, tag))
    return results
def html_ner(content):
    st = StanfordNERTagger(
        './lib/classifiers/english.all.3class.distsim.crf.ser.gz',
        './lib/stanford-ner-3.5.2.jar')
    soup = BeautifulSoup(content, "html.parser")
    # Drop scripts, styles, and superscripts before extracting text.
    for script in soup(["script", "style", "sup"]):
        script.extract()
    tokenised_sents = list(soup.stripped_strings)
    tokenised_words = [wordpunct_tokenize(sent) for sent in tokenised_sents]
    tagged_sents = [st.tag(sent) for sent in tokenised_words]

    result = list()
    for sent in tagged_sents:
        # Group consecutive tokens that share a tag into one entity chunk.
        for tag, chunk in groupby(sent, lambda x: x[1]):
            if tag != 'O':
                result.append((tag, ' '.join(w for w, t in chunk).encode('utf-8').strip()))
    return result
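# Example call for html_ner (hypothetical HTML; requires the ./lib paths above):
# html = "<html><body><p>Rami Eid studies at Stony Brook University.</p></body></html>"
# html_ner(html)
# e.g. [('PERSON', b'Rami Eid'), ('ORGANIZATION', b'Stony Brook University')]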
def sanitize_result(self, text):
    st = StanfordNERTagger(r'C:\Python27\stanford_ner\classifiers\english.all.3class.distsim.crf.ser.gz',
                           r'C:\Python27\stanford_ner\stanford-ner.jar',
                           encoding='utf-8')
    tokenized_text = word_tokenize(self.capitalize_first_letter(text))
    classified_text = st.tag(tokenized_text)
    named_entities = self.get_continuous_chunks(classified_text)
    named_entities_str = [" ".join([token for token, tag in ne]) for ne in named_entities]
    named_entities_str_tag = [(" ".join([token for token, tag in ne]), ne[0][1]) for ne in named_entities]

    for tag, chunk in groupby(named_entities_str_tag, lambda x: x[1]):
        if tag == "PERSON":
            name = " ".join(w for w, t in chunk)
            return name
def main():
    # Load the Stanford NER tagger.
    st = StanfordNERTagger(
        "/home/viswanath/Downloads/stanford-ner-2014-08-27/classifiers/english.conll.4class.distsim.crf.ser.gz",
        "/home/viswanath/Downloads/stanford-ner-2014-08-27/stanford-ner.jar",
        encoding="utf-8",
    )
    fname = "/home/viswanath/data/resume/test_data/01.txt"
    fp = open(fname, "r")
    text = fp.read()
    lstemp = cleanse_data(text)
    list_ner_out = st.tag(lstemp.split())

    fp = open("ner_temp.txt", "w")
    for item in list_ner_out:
        fp.write("{0}\n".format(item))
    fp.close()

    ne_tagged_sent = list_ner_out
    ne_tree = stanfordNE2tree(ne_tagged_sent)
    print(ne_tree)

    ne_in_sent = []
    for subtree in ne_tree:
        if type(subtree) == Tree:  # if subtree is a noun chunk, i.e. NE != "O"
            ne_label = subtree.label()
            ne_string = " ".join([token for token, pos in subtree.leaves()])
            ne_in_sent.append((ne_string, ne_label))
    print(ne_in_sent)
conn = psycopg2.connect(database="NewsSource", user="******", password="******",
                        host="newdb.cnceaogjppz8.us-west-2.rds.amazonaws.com", port="5432")
cur = conn.cursor()
updateq = "update articlestable set wikilinks= %s where id=%s"
while True:
    cur.execute("select articletext,id from articlestable where COALESCE(wikilinks,'') = '' "
                "order by crawleddatetime desc limit 10")
    rows = cur.fetchall()
    for row in rows:
        print(row[0])
        print()
        py = st.tag(row[0].split())
        ne_tagged_sent = py
        named_entities = get_continuous_chunks(ne_tagged_sent)
        named_entities_str = [" ".join([token for token, tag in ne]) for ne in named_entities]
        named_entities_str_tag = [(" ".join([token for token, tag in ne]), ne[0][1]) for ne in named_entities]
        ner = named_entities_str_tag
        people = []
        places = []
        organ = []
        print(named_entities_str_tag)
        print()
        for entity in ner:
class PhotoStream(object):
    def __init__(self):
        self.store_list = []
        self.st = StanfordNERTagger(r'english.all.3class.nodistsim.crf.ser.gz', r'stanford-ner.jar')

    def request_pop_photo_stream(self, category, attempts):
        if attempts == 3:
            return None
        payload = {'consumer_key': CONSUMER_KEY, 'rpp': 100, 'feature': 'popular',
                   'only': category, 'tags': 1, 'sort': 'rating', 'image_size': 3}
        results = []
        for page in range(1, 6):
            payload['page'] = page
            try:
                response = requests.get('https://api.500px.com/v1/photos', params=payload)
                result_page = response.json()
                results.extend(result_page['photos'])
            except:
                self.request_pop_photo_stream(category, attempts + 1)
        return results

    def get_located_photos(self, photo_list, category):
        for photo in photo_list:
            if photo_manager.photo_seen(photo['id']):
                self.store_list.append(photo)
            else:
                photo_manager.add_photo(photo['id'])
                if photo['latitude'] is not None:
                    photo['exact location'] = True
                    self.store_list.append(photo)
                elif category != 'People':
                    self.extract_location(photo, category)
        return self.store_list

    def extract_location(self, photo, category):
        tags = photo['tags']
        possible_locations = []
        tag_pos_array = []
        pos = 0
        total_tags = ''
        for tag in tags:
            tag = tag.title()
            if tag in location_manager.known_locations:
                possible_locations.append(tag)
            elif tag in location_manager.not_locations:
                continue
            elif tag not in EXCLUDED_LOCATIONS:
                separated_subtags = tag.split()
                # Record the end position of each tag in total_tags.
                pos = pos + len(separated_subtags)
                tag_pos_array.append(pos)
                total_tags = total_tags + tag + ' '
        total_tags = total_tags[:-1]
        possible_locations.extend(self.nlp_analyze(total_tags, tag_pos_array))
        possible_locations = [loc for loc in possible_locations if loc not in EXCLUDED_LOCATIONS]
        if len(possible_locations) == 0:
            return None
        location_dic = {}
        for i in range(len(possible_locations)):
            if possible_locations[i] in location_dic:
                location_dic[possible_locations[i]] = location_dic[possible_locations[i]] + 1
            else:
                location_dic[possible_locations[i]] = 1
        sorted_location = sorted(location_dic.items(), key=operator.itemgetter(1))
        sorted_location.reverse()
        # Use the location that appears most frequently in the tags.
        location = sorted_location[0]
        lat, lng = self.request_latlng(location[0], category)
        if lat is not None:
            photo['latitude'] = lat
            photo['longitude'] = lng
            photo['exact location'] = False
            self.store_list.append(photo)

    def nlp_analyze(self, text, tag_pos_array):
        if text is None or text == "":
            return []
        possible_locations = []
        splitted_text = text.split()
        total_length = len(splitted_text)
        result = self.st.tag(splitted_text)
        start = 0
        for i in range(0, len(tag_pos_array)):
            end = tag_pos_array[i]
            conseq = False
            loc_tmp = ""
            for j in range(start, end):
                if result[j][1] == 'LOCATION':
                    if conseq:
                        loc_tmp = loc_tmp + ' ' + result[j][0]
                    else:
                        loc_tmp = result[j][0]
                        conseq = True
                else:
                    if loc_tmp != "":
                        location_manager.known_locations.add(loc_tmp)
                        possible_locations.append(loc_tmp)
                        loc_tmp = ""
                    conseq = False
                    location_manager.not_locations.add(result[j][0])
            if loc_tmp != "":
                possible_locations.append(loc_tmp)
            start = end
        return possible_locations

    def request_latlng(self, location, category):
        # Use three geonames.org accounts to avoid request limits.
        geoname_account = GEONAME_ACCOUNTS[PHOTO_CATEGORIES.index(category)]
        payload = {'q': location, 'maxRows': 1, 'username': geoname_account}
        try:
            response = requests.get('http://api.geonames.org/searchJSON', params=payload)
            result = response.json()
            return float(result['geonames'][0]['lat']), float(result['geonames'][0]['lng'])
        except:
            return None, None

    def save_photo_stream_to_db(self, photo_list, category):
        if category == 'City and Architecture':
            photo_collection = mydb.city
        elif category == 'Landscapes':
            photo_collection = mydb.landscape
        elif category == 'People':
            photo_collection = mydb.people
        for photo in photo_list:
            # Check whether the photo is already in the database.
            if photo_collection.find_one({'id': photo['id']}) is None:
                if photo['latitude'] is not None:
                    photo_collection.insert(photo)
            else:
                photo_collection.update_one({'id': photo['id']},
                                            {"$set": {"rating": photo['rating']}})
StanfordNERPath = './stanford-ner'
st = StanfordNERTagger(StanfordNERPath + '/classifiers/english.all.3class.distsim.crf.ser.gz',
                       StanfordNERPath + '/stanford-ner.jar')

indicator = set([u'is', u'was', u'are', u'were'])
noun_tag = set(['NN', 'NNS', 'NNP', 'NNPS'])

q = open("withDoc.rq").read()
results = G.query(q)
outfile = open("../output/stadium.tsv", "w")
count = 0
for row in results:
    count += 1
    if count % 10 == 0:
        print "%d documents have been processed" % count
    text = row[1]
    tags = st.tag(word_tokenize(text))
    # Find the first sentence: a period followed by whitespace (or end of text)
    # that is not preceded by an uppercase letter (to skip abbreviations).
    firstSentence = ''
    for j in range(1, len(text)):
        if (text[j] == '.') and ((j + 1 == len(text)) or (text[j + 1] in string.whitespace)):
            if text[j - 1] not in string.uppercase:
                firstSentence = text[:j]
                break
    if firstSentence == '':
        firstSentence = text.split('.')[0]
    postags = pos_tag(word_tokenize(firstSentence))
    posLen = len(postags)
    noun = ''
    idx = 0
    for j in range(posLen):
        if postags[j][0] in indicator:
            idx = j
def namedEntityRecognize(self, sentence):
    # Perform NER on the sentence; returns a list of (word, NER tag) tuples.
    st = StanfordNERTagger(self.modelPath)
    tagged = st.tag(sentence.split())  # tag once instead of tagging twice
    print(tagged)
    return tagged
def get_entities(content):
    # Raw string: '\U' in a plain string literal is an invalid unicode escape.
    st = StanfordNERTagger(r'C:\Users\Philippe\Downloads\stanford-ner-2015-04-20\stanford-ner-2015-04-20\classifiers\english.all.3class.distsim.crf.ser.gz')
    entity_list = st.tag(content.split())
    return entity_list
jar_folder = "/Users/nishantagarwal/stanford-ner-2015-04-20/stanford-ner.jar"
os.environ['CLASSPATH'] = jar_folder

from nltk.tag import StanfordNERTagger

path_to_jar = "/Users/nishantagarwal/stanford-ner-2015-04-20/stanford-ner.jar"
path_to_model = "/Users/nishantagarwal/stanford-ner-2015-04-20/classifiers/english.all.3class.distsim.crf.ser.gz"  # or english.muc.7class.distsim.crf.ser.gz

l = 'When did the boys realize they were lost?'

# Chunk grammar: a LOCATION, optionally an 'O' token, then one or more LOCATIONs.
grammar = "NP: {<LOCATION><O>?<LOCATION>+}"

qetag = StanfordNERTagger(path_to_model, path_to_jar)
l = nltk.word_tokenize(l)
ne_tagged = qetag.tag(l)
tagged = nltk.pos_tag(l)
print(l)
print(tagged)
print(ne_tagged)

cp = nltk.RegexpParser(grammar)
tre = cp.parse(ne_tagged)
for subtree in tre.subtrees():
    if subtree.label() == 'NP':
        break
answer = ' '.join(word for word, pos in subtree.leaves())
print(answer)
"""
A big benefit of the Stanford NER tagger is that it provides several models
for pulling out named entities. We can use any of the following:

3 class model for recognizing locations, persons, and organizations
4 class model for recognizing locations, persons, organizations, and miscellaneous entities
7 class model for recognizing locations, persons, organizations, times, money, percents, and dates

The parameters passed to the StanfordNERTagger class include:

Classification model path (3 class model used below)
Stanford tagger jar file path
Training data encoding (default of ASCII)
"""
from nltk.tag import StanfordNERTagger
from nltk.tokenize import word_tokenize

st = StanfordNERTagger('/usr/share/stanford-ner/classifiers/english.all.3class.distsim.crf.ser.gz',
                       '/usr/share/stanford-ner/stanford-ner.jar',
                       encoding='utf-8')

text = 'While in France, Christine Lagarde discussed short-term stimulus efforts in a recent interview with the Wall Street Journal.'
tokenized_text = word_tokenize(text)
classified_text = st.tag(tokenized_text)
print(classified_text)
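# Instead of hard-coding both paths, NLTK can also locate the jar and the model
# through the CLASSPATH and STANFORD_MODELS environment variables, which is why
# several snippets above construct StanfordNERTagger with only a model filename.
# A sketch, assuming the same /usr/share/stanford-ner install as above:
# import os
# os.environ['CLASSPATH'] = '/usr/share/stanford-ner/stanford-ner.jar'
# os.environ['STANFORD_MODELS'] = '/usr/share/stanford-ner/classifiers'
# st = StanfordNERTagger('english.all.3class.distsim.crf.ser.gz', encoding='utf-8')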
from nltk.tag import StanfordNERTagger
from nltk import word_tokenize
import string
import sys

reload(sys)
sys.setdefaultencoding('utf-8')

st = StanfordNERTagger('english.all.3class.distsim.crf.ser.gz')
outfile = open("../output/entity.txt", "w")
doc_count = 826
entitySet = {}

print "Stage 1: NER starts!"
for i in range(doc_count):
    infile = open("../data/%d.txt" % i)
    tags = st.tag(word_tokenize(unicode(infile.read())))
    infile.close()
    transform_file = open("../data/transformed_%d.txt" % i, "w")
    entity = u''
    label = u''
    transform_text = u''
    if i % 10 == 0:
        print "%d files have been processed." % i
    for tag in tags:
        if tag[1] == u'O':
            if label != u'':
                # Assign each new (entity, label) pair a running id and log it.
                if not entitySet.has_key((entity, label)):
                    entitySet[(entity, label)] = len(entitySet)
                    outfile.write('%s%d\t%s\n' % (label, entitySet[(entity, label)], entity))
                transform_text += ' %s%d' % (label, entitySet[(entity, label)])
            transform_text += ' ' + tag[0]
from nltk.tag import StanfordNERTagger

st = StanfordNERTagger("english.all.3class.distsim.crf.ser.gz")
print(st.tag("Rami Eid is studying at Stony Brook University in NY. And he wants to work at CERN in Switzerland in Europe .".split()))
class StanfordTagger(object):
    """
    Wrapper for the Stanford NER Tagger
    """
    __currentDirectory = os.path.dirname(os.path.realpath(__file__))  # current directory
    __classifier = "%s/dist/classifiers/english.all.3class.distsim.crf.ser.gz"
    __stanfordJar = "%s/dist/stanford-ner.jar"

    def __init__(self, language="en"):
        from nltk.tag import StanfordNERTagger
        self.__stanfordJar = "%s/dist/stanford-ner.jar" % self.__currentDirectory
        self.__classifier = "%s/dist/classifiers/english.all.3class.distsim.crf.ser.gz" % (self.__currentDirectory,)
        self.__tagger = StanfordNERTagger(self.__classifier,
                                          self.__stanfordJar,
                                          encoding="utf-8")
        self.__namedEntitiesFinder = NERFinder(language=language)

    def __tags(self, raw_text):
        """
        Return the named-entity tokens for a raw text.
        :raw_text: Raw text
        """
        from nltk.tokenize import word_tokenize
        if isinstance(raw_text, str):
            # Decode to utf-8 (Python 2).
            raw_text = raw_text.decode('utf-8')
        # Tokenize the string.
        token_text = word_tokenize(raw_text)
        # Retrieve the named entities from the tokens.
        ne_tags = self.__tagger.tag(token_text)
        return ne_tags

    def __bio_tagger(self, ne_tagged):
        """
        Return BIO tags from named entities.
        :ne_tagged: named-entity tagged tokens
        """
        bio_tagged = []
        prev_tag = "O"
        for token, tag in ne_tagged:
            if tag == "O":  # O
                bio_tagged.append((token, tag))
                prev_tag = tag
                continue
            if tag != "O" and prev_tag == "O":  # Begin NE
                bio_tagged.append((token, "B-" + tag))
                prev_tag = tag
            elif prev_tag != "O" and prev_tag == tag:  # Inside NE
                bio_tagged.append((token, "I-" + tag))
                prev_tag = tag
            elif prev_tag != "O" and prev_tag != tag:  # Adjacent NE
                bio_tagged.append((token, "B-" + tag))
                prev_tag = tag
        return bio_tagged

    def __generate_tree(self, bio_tagged):
        """
        Transform a list of tags into a tree.
        """
        from nltk import pos_tag
        from nltk.chunk import conlltags2tree
        tokens, ne_tags = zip(*bio_tagged)
        pos_tags = [pos for token, pos in pos_tag(tokens)]
        conlltags = [(token, pos, ne) for token, pos, ne in zip(tokens, pos_tags, ne_tags)]
        ne_tree = conlltags2tree(conlltags)
        return ne_tree

    def __getEntities(self, taggedWords):
        """
        Return the entities from a list of tagged words (NER or POS)
        after generating the syntax tree.
        """
        bio_tagged = self.__bio_tagger(taggedWords)
        stanford_tree = self.__generate_tree(bio_tagged=bio_tagged)
        entities = self.__namedEntitiesFinder.getEntities(stanford_tree)
        return entities

    def getEntitiesByTags(self, pos_tagged_words):
        """
        Get entities from a list of words tagged with POS tags.
        """
        entities = self.__getEntities(taggedWords=pos_tagged_words)
        return entities

    def getEntities(self, raw_text):
        """
        Get the entities from a raw text.
        """
        ne_entities = self.__tags(raw_text=raw_text)
        entities = self.__getEntities(taggedWords=ne_entities)
        return entities
class PhotoStream(object):
    def __init__(self):
        self.client = api.FiveHundredPx(CONSUMER_KEY, CONSUMER_SECRET)
        self.store_list = []
        self.st = StanfordNERTagger(r"english.all.3class.nodistsim.crf.ser.gz", r"stanford-ner.jar")

    def request_pop_photo_stream(self, category, attempts):
        if attempts == 3:
            return None
        results = None
        try:
            results = self.client.get_photos(rpp=2, feature="popular", only=category,
                                             sort="rating", tags=1)
        except:
            self.request_pop_photo_stream(category, attempts + 1)
        return results

    def get_located_photos(self, photo_list, category):
        for photo in photo_list:
            if photo_manager.photo_seen(photo["id"]):
                self.store_list.append(photo)
            else:
                photo_manager.add_photo(photo["id"])
                if photo["latitude"] is not None:
                    photo["exact location"] = True
                    self.store_list.append(photo)
                elif category != "People":
                    self.extract_location(photo, category)
        return self.store_list

    def extract_location(self, photo, category):
        tags = photo["tags"]
        possible_locations = []
        q = Queue()
        job_list = []
        for tag in tags:
            tag = tag.title()
            if tag in location_manager.known_locations:
                possible_locations.append(tag)
            elif tag in location_manager.not_locations:
                continue
            elif tag not in EXCLUDED_LOCATIONS:
                # Analyze unknown tags in parallel worker processes.
                p = Process(target=self.nlp_analyze, args=(tag, q))
                job_list.append(p)
                p.start()
        for job in job_list:
            job.join()
        while not q.empty():
            possible_locations.extend(q.get())
        if len(possible_locations) == 0:
            return None
        location_dic = {}
        for i in range(len(possible_locations)):
            if possible_locations[i] in location_dic:
                location_dic[possible_locations[i]] = location_dic[possible_locations[i]] + 1
            else:
                location_dic[possible_locations[i]] = 1
        sorted_location = sorted(location_dic.items(), key=operator.itemgetter(1))
        sorted_location.reverse()
        # Use the location that appears most frequently in the tags.
        location = sorted_location[0]
        print(photo["name"])
        print(location[0])
        print("\n")
        lat, lng = self.request_latlng(location[0], category)
        if lat is not None:
            photo["latitude"] = lat
            photo["longitude"] = lng
            photo["exact location"] = False
            self.store_list.append(photo)

    def nlp_analyze(self, text, q):
        if text is None or text == "":
            return []
        possible_locations = []
        result = self.st.tag(text.split())
        loc_tmp = ""
        conseq = False
        for r in result:
            if r[1] == "LOCATION":
                if conseq:
                    loc_tmp = loc_tmp + " " + r[0]
                else:
                    loc_tmp = r[0]
                    conseq = True
            else:
                if loc_tmp != "":
                    location_manager.known_locations.add(loc_tmp)
                    possible_locations.append(loc_tmp)
                    loc_tmp = ""
                conseq = False
                location_manager.not_locations.add(r[0])
        if loc_tmp != "":
            possible_locations.append(loc_tmp)
        q.put(possible_locations)

    def request_latlng(self, location, category):
        # Use three geonames.org accounts to avoid request limits.
        geoname_account = GEONAME_ACCOUNTS[PHOTO_CATEGORIES.index(category)]
        payload = {"q": location, "maxRows": 1, "username": geoname_account}
        try:
            response = requests.get("http://api.geonames.org/searchJSON", params=payload)
            result = response.json()
            return float(result["geonames"][0]["lat"]), float(result["geonames"][0]["lng"])
        except:
            return None, None

    def save_photo_stream_to_db(self, photo_list, category):
        if category == "City and Architecture":
            photo_collection = mydb.city
        elif category == "Landscapes":
            photo_collection = mydb.landscape
        elif category == "People":
            photo_collection = mydb.people
        for photo in photo_list:
            # Check whether the photo is already in the database.
            if photo_collection.find_one({"id": photo["id"]}) is None:
                if photo["latitude"] is not None:
                    photo_collection.insert(photo)
            else:
                photo_collection.update_one(
                    {"id": photo["id"]},
                    {"$set": {"rating": photo["rating"]},
                     "$currentDate": {"lastModified": True}})
import ner
from nltk.tag import StanfordNERTagger

stanford_ner_dir = '/home/will/packages/stanfordNER/'
eng_model_filename = stanford_ner_dir + 'classifiers/english.all.3class.distsim.crf.ser.gz'
my_path_to_jar = stanford_ner_dir + 'stanford-ner.jar'

st = StanfordNERTagger(model_filename=eng_model_filename, path_to_jar=my_path_to_jar)
st.tag('Rami Eid is studying at Stony Brook University in NY'.split())

# Alternative: query a running Stanford NER HTTP server with the ner package.
# tagger = ner.HttpNER(host='localhost', port=8080)
# tagger.get_entities("University of California is located in California, United States")
def main(argv):
    # Open the file to read and the file to write, and prepare the StanfordNERTagger.
    st = StanfordNERTagger('/home/sietse/Desktop/Project Tekstanalyse/Project/stanford-ner-2015-12-09/classifiers/english.all.3class.distsim.crf.ser.gz')
    f = open("{0}/en.tok.off.pos".format(argv[1])).readlines()
    g = open("en.tok.off.pos.ent.{0}.{1}".format(argv[1][-9:-6], argv[1][-5:]), "w")
    context = []
    lines = []
    text = []
    sets = []
    bigrams = []
    lines_updated = []
    NER_tags = ["COU", "CIT", "NAT", "PER", "ORG", "ANI", "SPO", "ENT"]

    # Create the context for the lesk function and NER- and POS-tag the words.
    print("[Tagging words...]")
    for line in f:
        line = line.split()
        lines.append(line)
        context.append(line[3])
        text.append((line[3], line[4]))
    chunk = nltk.ne_chunk(text)
    stanford = st.tag(context)

    # Prepare the words to be written to the output file (NER tag added to the line).
    print("[Applying tags to lines...]")
    for line in f:
        lemmas = []
        names = []
        line = line.split()
        if line[4] in ("NNPS", "NNS"):
            for i in stanford:
                if line[3] in i:
                    if i[1] == "PERSON":
                        line.append("PER")
                    if i[1] == "ORGANIZATION":
                        line.append("ORG")
                    if i[1] == "LOCATION":
                        line.append("LOCATION")
        if len(wordnet.synsets(line[3], 'n')) == 0:
            hyper = []
        if len(wordnet.synsets(line[3], 'n')) == 1:
            synset = wordnet.synsets(line[3], 'n')[0]
            hyper = synset.hypernym_paths()
        if len(wordnet.synsets(line[3], 'n')) > 1:
            synset = lesk(context, line[3], 'n')
            hyper = synset.hypernym_paths()
        for i in hyper:
            for e in i:
                lemmas.append(e.lemmas())
        for i in lemmas:
            for e in i:
                names.append(e.name())
        if "country" in names and line[3][0].isupper():
            line.append("COU")
        elif "government" in names and line[3][0].isupper():
            line.append("COU")
        elif "province" in names and line[3][0].isupper():
            line.append("COU")
        elif "state" in names and line[3][0].isupper():
            line.append("COU")
        elif "city" in names and line[3][0].isupper():
            line.append("CIT")
        elif "sport" in names:
            line.append("SPO")
        elif "animal" in names:
            line.append("ANI")
        elif "entertainment" in names and line[3][0].isupper():
            line.append("ENT")
        elif "amusement" in names and line[3][0].isupper():
            line.append("ENT")
        elif "island" in names:
            line.append("NAT")
        elif "water" in names:
            line.append("NAT")
        elif "mountain" in names:
            line.append("NAT")
        # Resolve the generic LOCATION tag: CIT unless a COU tag is already present.
        if "LOCATION" in line and "COU" not in line:
            line.append("CIT")
        while "LOCATION" in line:
            line.remove("LOCATION")
        lines_updated.append(line)

    # Bigrams are put in a list to make the urls more precise.
    for i, j in zip(lines_updated, lines_updated[1:]):
        if len(i) > 5 and len(j) > 5:
            if i[5] in NER_tags and j[5] in NER_tags:
                bigrams.append(i[3] + " " + j[3])

    # Words are assigned Wikipedia pages and the result is written to the output file.
    print("[Attaching Wikipedia urls and writing output file...]")
    for line in lines_updated:
        try:
            if len(line) > 5:
                if bigrams:
                    for i in bigrams:
                        if line[3] in i:
                            page = wikipedia.page(i)
                            break
                    else:
                        page = wikipedia.page(line[3])
                else:
                    page = wikipedia.page(line[3])
                if len(line) >= 6:
                    g.write("{0} {1} {2} {3} {4} {5} {6}\n".format(line[0], line[1], line[2], line[3], line[4], line[5], page.url))
                else:
                    g.write("{0} {1} {2} {3} {4}\n".format(line[0], line[1], line[2], line[3], line[4]))
            else:
                g.write("{0} {1} {2} {3} {4}\n".format(line[0], line[1], line[2], line[3], line[4]))
        # If the wikipedia module does not know which url to attach, a custom url
        # is added which (probably) leads to a disambiguation page.
        except wikipedia.exceptions.DisambiguationError:
            for i in bigrams:
                if line[3] in i:
                    page = "https://en.wikipedia.org/wiki/{0}".format(i.replace(" ", "_"))
                    break
            else:
                page = "https://en.wikipedia.org/wiki/{0}".format(line[3])
            if len(line) >= 6:
                g.write("{0} {1} {2} {3} {4} {5} {6}\n".format(line[0], line[1], line[2], line[3], line[4], line[5], page))
            else:
                g.write("{0} {1} {2} {3} {4}\n".format(line[0], line[1], line[2], line[3], line[4]))
            continue
        # If another error occurs, the line is written to the file with or without
        # NER tag and without url.
        except wikipedia.exceptions.PageError:
            if len(line) >= 6:
                g.write("{0} {1} {2} {3} {4} {5}\n".format(line[0], line[1], line[2], line[3], line[4], line[5]))
            else:
                g.write("{0} {1} {2} {3} {4}\n".format(line[0], line[1], line[2], line[3], line[4]))
            continue
    print("[Processed file...]")
class getWikiInfo:
    # Retrieves the family from wiki text and infoboxes.
    def __init__(self, person):
        self.tokenizer = nltk.data.load('tokenizers/punkt/english.pickle')
        classifier = "ner/classifiers/" + "english.all.3class.distsim.crf.ser.gz"
        jar = "ner/stanford-ner-3.4.jar"
        self.tagger = StanfordNERTagger(classifier, jar)
        self.ap = []
        self.person = person
        self.query = Sparql(person)
        self.setSpouse()
        self.setMother()
        self.setFather()
        self.setFullName()
        self.setAbstract()
        self.setAbstractInfo()

    def setAbstractInfo(self):
        try:
            conObj = wikipedia.page(self.person)
            content = conObj.content
        except wikipedia.exceptions.DisambiguationError:
            content = None
        except wikipedia.exceptions.PageError:
            content = None
        if content:
            for sentence in self.tokenizer.tokenize(content):
                # Look for "daughter of", "son of", or "child of" phrases.
                if 'daughter of' in sentence:
                    sentence = sentence[sentence.find('daughter of'):]
                elif 'son of' in sentence:
                    sentence = sentence[sentence.find('son of'):]
                elif 'child of' in sentence:
                    sentence = sentence[sentence.find('child of'):]
                else:
                    sentence = False
                if sentence is not False:
                    person = ''
                    for tag in self.tagger.tag(sentence.split()):
                        if tag[1] == 'PERSON':
                            person = person + " " + tag[0]
                        else:
                            if person != '':
                                self.ap.append(person)
                                person = ''

    def getAbstractParents(self):
        if len(self.ap) > 0:
            return ", ".join(set(self.ap))
        else:
            return "Unknown"

    def setSpouse(self):
        if 'spouse' in self.query.result:
            self.spouse = list(self.query.result['spouse'])
        else:
            self.spouse = ['Unknown', 'literal']

    def setMother(self):
        if 'mother' in self.query.result:
            self.mother = list(self.query.result['mother'])
        else:
            self.mother = ['Unknown', 'literal']

    def setFather(self):
        if 'father' in self.query.result:
            self.father = list(self.query.result['father'])
        else:
            self.father = ['Unknown', 'literal']

    def setFullName(self):
        if 'fullName' in self.query.result:
            self.fullName = list(self.query.result['fullName'])
        else:
            self.fullName = [self.person, 'literal']

    def setAbstract(self):
        if 'abstract' in self.query.result:
            self.abstract = list(self.query.result['abstract'])
        else:
            self.abstract = ['Unknown', 'literal']

    def getSpouse(self):
        return self.spouse

    def getMother(self):
        return self.mother

    def getFather(self):
        return self.father

    def getFullName(self):
        return self.fullName

    def getAbstract(self):
        return self.abstract