def ner_tag(sents, silent=True):
    """Run Stanford NER over sentences.

    Keyword arguments:
    sents -- a raw string, a list of sentence strings, or a list of
             token lists.  Empty input yields [].
    silent -- when False, print the tagged output for debugging.

    Returns: list of tagged sentences of (word, ner-tag) pairs.
    """
    # Nothing to tag.
    if sents == '' or sents == []:
        return []
    # Cache the tagger as a global so it is not recreated every time
    # ner_tag is executed (each NERTagger spawns a JVM, which is slow).
    if 'ner_tagger' not in globals():
        global ner_tagger
        ner_tagger = NERTagger(conf.stanford_ner_classifier, conf.stanford_ner)
    # A bare string: split into tokenized sentences.
    if type(sents) in [str, unicode]:
        sents = tokenize(sents, 'sw')
    # A list of strings: tokenize each sentence, or treat the whole list
    # as one pre-tokenized sentence when the items contain no spaces.
    elif type(sents[0]) in [str, unicode]:
        if ' ' in sents[0]:
            sents = [tokenize(s, 'w') for s in sents]
        else:
            sents = [sents]
    tagged = ner_tagger.tag_sents(sents)
    if not silent:
        # print() call for consistency with the sibling ner_tag variant.
        print('ner-tags:', tagged)
    return tagged
def ner_tag(sents, silent=True):
    """Named Entity Recognition for sentences.

    Keyword arguments:
    sents -- Sentence, list of sentences or list of tokens.
    silent -- when False, print the tagged output for debugging.

    Returns: List of (word, ner-tag) pairs that aims to preserve the
    structure of the sents input argument.
    """
    # Empty string or empty list: nothing to tag.
    if len(sents) == 0:
        return []
    # Cache the tagger as a global so it is not recreated every time
    # ner_tag is executed (each NERTagger spawns a JVM, which is slow).
    if 'ner_tagger' not in globals():
        global ner_tagger
        ner_tagger = NERTagger(stanford_ner_classifier, stanford_ner)
    # A bare string: split into tokenized sentences.
    if type(sents) in [str, unicode]:
        sents = tokenize(sents, 'sw')
    # A list of strings: tokenize each sentence, or treat the whole list
    # as one pre-tokenized sentence when the items contain no spaces.
    elif type(sents[0]) in [str, unicode]:
        if ' ' in sents[0]:
            sents = [tokenize(s, 'w') for s in sents]
        else:
            sents = [sents]
    tagged = ner_tagger.tag_sents(sents)
    if not silent:
        print('ner-tags:', tagged)
    return tagged
def entityTagger():
    """Tag nouns read from "pos.tagged" and write them to "entity.tagged".

    Each input line is whitespace-split; column 5 holds the POS tag and
    column 4 the word.  NN/NNP words go through the Stanford 3-class NER
    model; words it leaves untagged (tag shorter than 3 chars, i.e. 'O')
    fall back to wordNetTagger().  Non-nouns get "-".

    :rtype : object
    """
    class3 = NERTagger('stanford-ner/classifiers/english.all.3class.distsim.crf.ser.gz',
                       'stanford-ner/stanford-ner.jar')
    row_format = "{:8}{:8}{:8}{:8}{:60}{:6}{:13}"
    # `with` closes both files even on error (the original leaked the
    # output handle when an exception occurred mid-loop).
    with open("entity.tagged", "w") as output, open("pos.tagged", "r") as inp_file:
        for l in inp_file:
            line = l.split()
            print(line)
            # If word is a noun, go tag it!
            if line[5] == "NN" or line[5] == "NNP":
                ner_tagged = class3.tag([line[4]])
                for t in ner_tagged[0]:
                    # No NER tag? Check WordNet tagging instead.
                    if len(t[1]) < 3:
                        tag = wordNetTagger(t[0])
                    else:
                        tag = t[1]
                    output.write(row_format.format(line[0], line[1], line[2],
                                                   line[3], line[4], line[5], tag) + "\n")
            else:
                output.write(row_format.format(line[0], line[1], line[2],
                                               line[3], line[4], line[5], "-") + "\n")
def ngramTagger(l):
    """Create bigrams from `l` and entity-tag them.

    :param l: input must be a list of bigrams, formed in tuples
    :return: None -- prints a list of tagged words.  (For example,
        "El Salvador" would yield [("El", "LOCATION"), ("Salvador", "LOCATION")])
    """
    bigrams_ner = []  # space-joined form, fed to the Stanford tagger
    bigrams = []      # (space-joined, underscore-joined) pairs for WordNet
    tb = []
    for w1, w2 in l:
        ngram_ner = w1 + " " + w2
        ngram_wn = w1 + "_" + w2
        bigrams_ner.append(ngram_ner)
        bigrams.append((ngram_ner, ngram_wn))
    class3 = NERTagger('stanford-ner/classifiers/english.all.3class.distsim.crf.ser.gz',
                       'stanford-ner/stanford-ner.jar')
    tagged_bigrams = class3.tag(bigrams_ner)
    # NOTE: the original shadowed the parameter `l` with this loop
    # variable; renamed for clarity.
    for sent in tagged_bigrams:
        for t in sent:
            # Keep only genuinely tagged tokens (tag longer than 'O'),
            # and leave LOCATIONs to the WordNet pass below.
            if len(t[1]) > 3:
                if t[1] != "LOCATION":
                    tb.append(t)
    for bg in bigrams:
        tag_bg = bgWordNetTagger(bg[0], bg[1])
        if tag_bg == "COUNTRY" or tag_bg == "STATE" or tag_bg == "CITY" or tag_bg == "TOWN":
            words = bg[0].split()
            tb.extend([(words[0], tag_bg), (words[1], tag_bg)])
    print(tb)
def main():
    """NER-tag ada_lovelace.txt with the MUC 7-class Stanford model and
    print the answers to assignment questions 2.1 - 2.3."""
    #os.environ['JAVAHOME'] = "C:\Program Files\Java\jdk1.8.0_45/bin"
    path="ner"
    classifier = path + "/classifiers/" + "english.muc.7class.distsim.crf.ser.gz"
    jar = path + "/stanford-ner-3.4.jar"
    tagger = NERTagger(classifier, jar)
    tokens = tokenize('ada_lovelace.txt')
    taggedText = tagger.tag(tokens)
    countList=[]   # every NER tag, for frequency counting below
    nounList = []  # words carrying a real entity tag (not 'O')
    for word, tag in taggedText:
        countList.append(tag)
        if tag != 'O':
            nounList.append(word)
    print("Answer to 2.1: \n{} \nThey certainly aren't all correct.".format(Counter(countList)))
    print()
    print("Answer to 2.2: The other classifiers seem to achieve similar results,\nbut because of the multiple categories it is more interesting to read.")
    # Re-tag the lemmatised entity words to compare with the raw run.
    lemmas = lemmatize(nounList)
    taggedLemmas = tagger.tag(lemmas)
    print("Answer to 2.3:\n", taggedLemmas)
def sdfprocess(rawexpr):
    """Return the first PERSON entity found in `rawexpr`, or the
    preprocessed expression itself when no person is recognised."""
    # java_options raises the JVM heap to 2 GB for the large model.
    parser=NERTagger(path_to_model='/home/cosmo/Dropbox/Purdue/nlp/stanford-corenlp-full-2014-08-27/english.all.3class.distsim.crf.ser.gz', path_to_jar='/home/cosmo/Dropbox/Purdue/nlp/stanford-corenlp-full-2014-08-27/stanford-corenlp-3.4.1.jar', java_options='-mx2000m')
    expr = preprocess(rawexpr)
    # NOTE(review): rechunk() presumably merges adjacent same-tag tokens
    # into entities -- confirm against its definition.
    named_expr = rechunk(parser.tag(word_tokenize(expr)))
    for t in named_expr:
        if t[1] == 'PERSON':
            return t[0]
    return expr
def ngramTagger(l):
    """Create bigrams, tag them via Stanford NER or WordNet, and search
    links for wiki pages.

    :param l: input must be a list of bigrams, formed in tuples
    :return: list of (word, tag, wiki-link) triples, two per kept bigram
    """
    print("checking ngrams")
    # First, create words which are suited as input for NERTagger.
    nerts = []
    for i in l:
        ngram_ner = i[0] + " " + i[1]
        nerts.append(ngram_ner)
    # Input the suitable bigrams into the NERTagger and reshape the output
    # to [bigram, tag1, tag2] rows with nerToBG().
    class3 = NERTagger('stanford-ner/classifiers/english.all.3class.distsim.crf.ser.gz',
                       'stanford-ner/stanford-ner.jar')
    ner_result = class3.tag(nerts)
    bigramsAndTags = nerToBG(ner_result)
    for t in bigramsAndTags:
        # If tagged as location, get rid of location via the same technique
        # as locationTagger(), but for bigrams, using getRidOfLocation().
        if t[1] == "LOCATION" or t[2] == "LOCATION":
            wn_bg = t[0].split()[0] + "_" + t[0].split()[1]
            wAndTag = getRidOfLocation(wn_bg)
            t[1] = wAndTag[1]
            t[2] = wAndTag[1]
    # One loop over the rows replaces the original's redundant counter
    # pair (an unused `for j in range(...)` plus a hand-incremented `a`).
    final_list = []
    for entry in bigramsAndTags:
        # Both words tagged the same: keep the bigram with that tag.
        if entry[1] == entry[2]:
            final_list.extend([(entry[0], entry[1])])
        # Word 1 untagged: look it up in the development set; keep the
        # bigram when that tag agrees with word 2's tag.
        elif checkBGTag(entry[0].split()[0]) == entry[2]:
            final_list.extend([(entry[0], entry[2])])
        # Word 2 untagged: same fallback, compared against word 1's tag.
        elif checkBGTag(entry[0].split()[1]) == entry[1]:
            final_list.extend([(entry[0], entry[1])])
    taglink_bigrams = []
    for bgs in final_list[:]:
        # If bigrams are still not tagged, remove them from the list.
        if len(bgs[1]) < 4:
            final_list.remove(bgs)
        else:
            # If they are tagged, look up wikipedia links.
            links = wiki_lookup(bgs[0], bgs[1])
            words = bgs[0].split(" ")
            taglink_bigrams.extend([(words[0], bgs[1], links), (words[1], bgs[1], links)])
    return taglink_bigrams
def queryForEntity2(expectedEntity,passage):
    """Return the (word, tag) pairs of `passage` whose NER tag occurs in
    the list `expectedEntity`.  (Python 2 code -- print statement.)"""
    st = NERTagger('/Users/srinisha/Downloads/stanford-ner-2014-06-16/classifiers/english.all.3class.distsim.crf.ser.gz','/Users/srinisha/Downloads/stanford-ner-2014-06-16/stanford-ner.jar')
    answer=st.tag(passage.split())
    print answer
    answers=[]
    # Collect every tagged token whose tag matches any expected entity.
    for j,currentExpectedEntity in enumerate(expectedEntity):
        for i,pair in enumerate(answer):
            if(pair[1]==currentExpectedEntity):
                answers.append(answer[i])
    return answers
def tagger(data):
    """NER-tag the whitespace-split tokens of `data`.

    Returns ret_success(tagged pairs), or ret_failure(705) when the
    Stanford tagger cannot be initialised.
    """
    try:
        st = NERTagger('./nltk-data/StanfordNER/english.all.3class.distsim.crf.ser.gz',
                       './nltk-data/StanfordNER/stanford-ner.jar')
    except Exception:
        # Narrowed from a bare `except:` so KeyboardInterrupt/SystemExit
        # are no longer swallowed; 705 = tagger initialisation failure.
        return ret_failure(705)
    #try:
    tag = st.tag(data.split())
    #except:
    #    return ret_failure(702)
    return ret_success(tag)
def main():
    """Tag a fixed word list two ways -- Stanford 3-class NER and a
    WordNet-hypernym scheme -- and print both results."""
    words = ["Barack Obama", "Holland", "Government", "Tennis", "happiness"]
    noun_lemmas = []
    nouns = []
    final_ner_tagged = []
    not_ner_tagged = []
    pos_tags = nltk.pos_tag(words)
    lemmatizer = WordNetLemmatizer()
    class3 = NERTagger('stanford-ner/classifiers/english.all.3class.distsim.crf.ser.gz',
                       'stanford-ner/stanford-ner.jar')
    # STANFORD NERTAGGING HAPPENS HERE
    for tag in pos_tags:
        if tag[1] == 'NNP':
            nouns.append(tag[0])
        elif tag[1] == 'NN':
            nouns.append(tag[0])
    ner_tagged = class3.tag(nouns)
    # class3.tag returns the nouns as one tagged "sentence"; split the
    # recognised entities from the 'O' (untagged) tokens.
    for t in ner_tagged[0]:
        if t[1] == u'O':
            not_ner_tagged.append(t[0])
        else:
            final_ner_tagged.append(t)
    print("NERTagged:")
    print(final_ner_tagged)
    # Reference synsets per target entity label for the hypernym test.
    entities = {
        "COUNTRY": wordnet.synsets("country", pos='n'),
        "STATE": wordnet.synsets("state", pos='n'),
        "CITY": wordnet.synsets("city", pos='n'),
        "TOWN": wordnet.synsets("town", pos='n'),
        "NAT": wordnet.synsets("natural places", pos='n'),
        "PER": wordnet.synsets("person", pos='n'),
        "ORG": wordnet.synsets("organisation", pos='n'),
        "ANI": wordnet.synsets("animal", pos='n'),
        "SPO": wordnet.synsets("sport", pos='n'),
        "ENT": wordnet.synsets("entertainment", pos='n'),
    }
    tagged_top_entities = defaultdict(list)
    for word in pos_tags:
        if word[1] == "NN" or word[1] == "NNP":
            noun_lemmas.append(lemmatizer.lemmatize(word[0], wordnet.NOUN))
            # A word gets a label when the label's first synset is a
            # hypernym of the word's first synset.
            word_synset = wordnet.synsets(word[0], pos="n")
            for e in list(entities.keys()):
                if len(word_synset) != 0 and len(entities[e]) != 0:
                    if hypernymOf(word_synset[0], entities[e][0]):
                        tagged_top_entities[word[0]].append(e)
    print("WordNet tagged:")
    for w in tagged_top_entities:
        print("{:15}{:15}".format(w, tagged_top_entities[w]))
def compute_NER(corpus):
    """Return one space-terminated string of NER tags per sentence.

    :param corpus: iterable of sentence strings.
    :return: list of strings, e.g. "O PERSON O " (trailing space kept to
        match the original concatenation loop).
    """
    NER = []
    st = NERTagger(read_property('StanfordNerClassifier'), read_property('StanfordNerJarPath'))
    for sentence in corpus:
        ner = st.tag(sentence.split())
        # join replaces the quadratic `ner_tag = ner_tag + n[1] + " "`.
        ner_tag = "".join(n[1] + " " for n in ner)
        NER.append(ner_tag)
    return NER
def german_ner(text):
    """Run the German Stanford NER model over the words of `text`."""
    # UTF-8 encode before splitting; the model is configured for utf8.
    encoded = text.encode('utf8')
    tagger = NERTagger(
        '/Users/Lena/src/context/stanford-ner/classifiers/german/dewac_175m_600.crf.ser.gz',
        '/Users/Lena/src/context/stanford-ner/stanford-ner.jar',
        'utf8')
    return tagger.tag(encoded.split())
def spanish_ner(text):
    """Run the Spanish (AnCora) Stanford NER model over the words of `text`."""
    tagger = NERTagger(
        '/Users/Lena/src/context/stanford-ner/edu/stanford/nlp/models/ner/spanish.ancora.distsim.s512.crf.ser.gz',
        '/Users/Lena/src/context/stanford-ner/stanford-ner.jar',
        'utf8')
    # UTF-8 encode before splitting; the model is configured for utf8.
    return tagger.tag(text.encode('utf8').split())
def queryForEntity2(expectedEntity, passage):
    """Return the (word, tag) pairs of `passage` whose NER tag occurs in
    the list `expectedEntity`.  (Python 2 code -- print statement.)"""
    st = NERTagger(
        '/Users/srinisha/Downloads/stanford-ner-2014-06-16/classifiers/english.all.3class.distsim.crf.ser.gz',
        '/Users/srinisha/Downloads/stanford-ner-2014-06-16/stanford-ner.jar')
    answer = st.tag(passage.split())
    print answer
    answers = []
    # Collect every tagged token whose tag matches any expected entity.
    for j, currentExpectedEntity in enumerate(expectedEntity):
        for i, pair in enumerate(answer):
            if (pair[1] == currentExpectedEntity):
                answers.append(answer[i])
    return answers
def standfordtagger(words):
    """Run the Stanford NER tagger over the token list `words`.

    NOTE(review): `classifier` is an empty path here, so NERTagger will
    fail until it is filled in; on any failure the raw tokens are printed
    and None is (implicitly) returned.
    """
    try:
        os.environ['JAVAHOME'] = ''
        path = ""
        classifier = path + ""
        jar = path + "/stanford-ner-3.4.jar"
        st = NERTagger(classifier, jar)
        stanford_tagger = st.tag(words)
        return stanford_tagger
    except Exception:
        # Narrowed from a bare `except:`; keeps the original best-effort
        # fallback of printing the input.
        print(words)
def compute_NER(corpus):
    """Compute one NER-tag string per sentence of `corpus`.

    Each returned string is the sentence's tags separated (and
    terminated) by single spaces, matching the original concatenation.
    """
    NER = []
    st = NERTagger(read_property('StanfordNerClassifier'),
                   read_property('StanfordNerJarPath'))
    for sentence in corpus:
        ner = st.tag(sentence.split())
        # join replaces the quadratic string concatenation in a loop.
        NER.append("".join(n[1] + " " for n in ner))
    return NER
def findWord(self):
    """Return the first token of self.question whose NER tag equals
    self.queryType, or -1 when no token matches."""
    st = NERTagger('stanford-ner-2014-01-04/classifiers/english.muc.7class.distsim.crf.ser.gz',
                   'stanford-ner-2014-01-04/stanford-ner.jar')
    for word, tag in st.tag(self.question.split()):
        if tag == self.queryType:
            #print word
            return word
    return -1
def tagger(data):
    """Tag the whitespace-split tokens of `data` with Stanford NER.

    Returns ret_success(tagged pairs) on success, ret_failure(705) when
    the tagger cannot be built.
    """
    try:
        st = NERTagger(
            './nltk-data/StanfordNER/english.all.3class.distsim.crf.ser.gz',
            './nltk-data/StanfordNER/stanford-ner.jar')
    except Exception:
        # Narrowed from a bare `except:` (which also caught SystemExit
        # and KeyboardInterrupt); 705 = initialisation failure.
        return ret_failure(705)
    #try:
    tag = st.tag(data.split())
    #except:
    #    return ret_failure(702)
    return ret_success(tag)
def standfordtagger(words):
    """Run the Stanford 3-class NER tagger over the token list `words`.

    Returns the tagged output; on failure the raw tokens are printed and
    None is (implicitly) returned.
    """
    try:
        os.environ['JAVAHOME'] = '/usr/lib/jvm/java-1.7.0-openjdk-amd64'
        path = "/home/guido/PTA/stanford-ner-2014-06-16"
        classifier = path + "/classifiers/" + "english.all.3class.distsim.crf.ser.gz"
        jar = path + "/stanford-ner-3.4.jar"
        st = NERTagger(classifier, jar)
        stanford_tagger = st.tag(words)
        return stanford_tagger
    except Exception:
        # Narrowed from a bare `except:`; keeps the original best-effort
        # fallback of printing the input.
        print(words)
def add_ner(self, target):
    """NER-tag the tokenised sentences of `target` and write one token
    per line, with a blank line between sentences, to 'ner_<target>'.
    (Python 2 code -- print statement.)"""
    all_token = self.get_token(target)
    st = \
        NERTagger('../stanford-ner-2015-04-20/classifiers/english.all.3class.distsim.crf.ser.gz','../stanford-ner-2015-04-20/stanford-ner.jar')
    ner_result = st.tag_sents(all_token)
    w = open('ner_%s' % target, 'wb')
    for num, row in enumerate(ner_result):
        for item in row:
            # NOTE(review): only the token text (item[0]) is written;
            # the NER tag itself is discarded -- confirm intended.
            w.write(item[0] + '\n')
        # Blank line separates sentences.
        w.write('\n')
    #end for
    print len(ner_result), len(all_token)
    return
def run_tagger(self, payload):
    """Runs :py:meth:`nltk.tag.stanford.NERTagger.tag_sents` on the provided text
    (http://www.nltk.org/api/nltk.tag.html#nltk.tag.stanford.NERTagger.tag_sents)

    :param payload: Fulltext payload.
    :type payload: string
    :return: List of parsed sentences.
    """
    # The NERTagger import may have failed at module load; bail out.
    if NERTagger is None:
        return None
    words = payload.encode('ascii', 'ignore').split()
    return NERTagger(self.classifier, self.jarfile).tag_sents([words])
def extract_entities_stanford(sample, stanfordPath, model):
    """Return [entity, tag] pairs for every non-'O' token of `sample`.

    :param sample: raw text; tagged after a whitespace split.
    :param stanfordPath: directory holding the Stanford NER install.
    :param model: model identifier resolved via get_model_name().
    """
    from nltk.tag.stanford import NERTagger
    st = NERTagger(stanfordPath + get_model_name(model),
                   stanfordPath + '/stanford-ner-2014-01-04.jar')
    entity_names = st.tag(sample.split())
    # `tag != "O"` replaces the Python-2-only cmp(tag, "O") != 0.
    return [[entity, tag] for entity, tag in entity_names if tag != "O"]
def add_ner(self,target): all_token = self.get_token(target); st = \ NERTagger('../stanford-ner-2015-04-20/classifiers/english.all.3class.distsim.crf.ser.gz','../stanford-ner-2015-04-20/stanford-ner.jar'); ner_result = st.tag_sents(all_token); w = open('ner_%s'%target,'wb'); for num,row in enumerate(ner_result): for item in row: w.write(item[0]+'\n'); w.write('\n'); #end for print len(ner_result),len(all_token); return;
def test_main(request):
    """Django view: smoke-test the Stanford 7-class NER tagger and return
    the tagged output in an HttpResponse.  (Python 2 code.)"""
    #Java imports
    from nltk.tag.stanford import NERTagger
    java_path="C:/Program Files/Java/jre1.8.0_31/bin/java.exe"
    os.environ['JAVAHOME']=java_path
    stanford_jar=settings.BASE_DIR+'/../nltk_data/stanford-ner-2015-01-30/stanford-ner.jar'
    stanford_trained=settings.BASE_DIR+'/../nltk_data/stanford-ner-2015-01-30/classifiers/english.all.7class.distsim.crf.ser.gz'
    NER_Tagger = NERTagger(stanford_trained, stanford_jar)
    phrases="once upon a midnight dreary"
    # NOTE(review): tag() receives the raw string rather than a token
    # list -- confirm it is not tagging individual characters.
    tags=NER_Tagger.tag(phrases)
    #Above imported
    print "Got "+str(tags)
    return HttpResponse(str(tags))
def main():
    """Fetch the work selected on the command line, NER-tag each sentence
    and print the set of named entities found."""
    parser = get_argparser()
    args = parser.parse_args()
    ner = NERTagger('lib/english.all.3class.distsim.crf.ser.gz',
                    'lib/stanford-ner-2013-06-20.jar',
                    encoding='utf-8')
    text = get_text(args.workid)
    sentences = nltk.sent_tokenize(text)
    tokenized_sentences = [nltk.word_tokenize(s) for s in sentences]
    # batch_tag: older NLTK API for tagging a list of token lists.
    tagged_sentences = ner.batch_tag(tokenized_sentences)
    print(set_of_named_entities(tagged_sentences))
def whoQuestion(tokens):
    """Turn a statement that starts with a PERSON proper noun into a
    'Who ...' question.

    Returns (True, question) on success; falls through (returning None)
    otherwise."""
    st = NERTagger(
        '../stanford-ner/classifiers/english.all.3class.distsim.crf.ser.gz',
        '../stanford-ner/stanford-ner.jar')
    posTags = nltk.pos_tag(tokens)
    ner = st.tag(tokens)
    if posTags[0][1] == 'NNP' and ner[0][1] == 'PERSON':
        # We have a PERSON
        i = 0
        # Skip the whole leading name.  NOTE(review): no bounds check --
        # a sentence that is entirely NNP/PERSON would raise IndexError.
        while (posTags[i][1] == 'NNP' and ner[i][1] == 'PERSON'):
            i = i + 1
        # Only rewrite when the name is followed by an existence verb.
        if tokens[i] in EXIST:
            tokens = changeToQuestionMark(tokens)
            tokens = ['Who'] + tokens[i:]
            return (True, ' '.join(tokens[:-1]) + tokens[-1])
def get_names(self, sentence):
    """Produce NER tags for `sentence` with whichever backend
    self.tagger selects, then extract names from them."""
    if self.tagger == 'NLTK':
        # NLTK pipeline: tokenize -> POS tag -> named-entity chunk.
        words = nltk.tokenize.word_tokenize(sentence)
        tagged = nltk.pos_tag(words)
        ner_tags = nltk.ne_chunk(tagged)
    elif self.tagger == 'Stanford':
        # Stanford NER tagger instead of the NLTK default.
        stanford = NERTagger('/usr/share/stanford-ner/classifiers/english.all.3class.distsim.crf.ser.gz',
                             '/usr/share/stanford-ner/stanford-ner.jar')
        ner_tags = stanford.tag(sentence.split())
    return self.get_names_from_tags(ner_tags)
def get_names(self, sentence):
    """Dispatch `sentence` to the configured NER backend (NLTK chain or
    Stanford NER) and feed the tags to get_names_from_tags."""
    if self.tagger == 'NLTK':
        ner_tags = nltk.ne_chunk(
            nltk.pos_tag(nltk.tokenize.word_tokenize(sentence)))
    elif self.tagger == 'Stanford':
        ner_tags = NERTagger(
            '/usr/share/stanford-ner/classifiers/english.all.3class.distsim.crf.ser.gz',
            '/usr/share/stanford-ner/stanford-ner.jar').tag(sentence.split())
    return self.get_names_from_tags(ner_tags)
class NERParser (object):
    """Collects LOCATION and ORGANIZATION entities from text using the
    Stanford MUC 7-class NER model.  (Python 2 code.)"""

    def __init__(self):
        self.st = NERTagger("/Users/trentniemeyer/nltk_data/stanford-ner-2014-06-16/classifiers/english.muc.7class.distsim.crf.ser.gz",
                            "/Users/trentniemeyer/nltk_data/stanford-ner-2014-06-16/stanford-ner.jar")
        self.locations = []       # multi-word LOCATION entities found so far
        self.organizations = []   # multi-word ORGANIZATION entities found so far

    def parse (self, text):
        """Tag `text` and append each maximal run of LOCATION /
        ORGANIZATION words to the matching list."""
        ne = self.st.tag(nltk.word_tokenize(text))
        for sentence in ne:
            # Track the entity run currently being assembled.
            lastwordwasentity = False
            lastentity = ''
            lasttype = ''
            for (word, entitytype) in sentence:
                if entitytype == 'ORGANIZATION' or entitytype == 'LOCATION':
                    # Extend the current run or start a new one.
                    if lastwordwasentity:
                        lastentity += ' ' + word
                    else:
                        lastentity = word
                    lastwordwasentity = True
                    lasttype = entitytype
                else:
                    # Run ended: flush it to the list for its type.
                    # NOTE(review): a run that reaches the end of the
                    # sentence is never flushed -- confirm intended.
                    if lastwordwasentity:
                        if lasttype == 'LOCATION':
                            self.locations.append(lastentity)
                        else:
                            self.organizations.append(lastentity)
                    lastentity = ''
                    lastwordwasentity = False

    def locationFrequencies (self):
        """Print a Counter of the collected locations."""
        print collections.Counter (self.locations)

    def organizationFrequencies (self):
        """Print a Counter of the collected organizations."""
        print collections.Counter (self.organizations)
def __init__(self):
    """Build the configured Stanford tagger once, at construction time.

    Example here: www.nltk.org/api/nltk.tag.html#module-nltk.tag.stanford
    """
    self.Tagger = NERTagger(app.config['SNER_CLASSIFIERS'],
                            app.config['SNER_JARFILE'],
                            encoding='utf-8')
class Parser(object):
    """Thin wrapper around the Stanford 3-class English NER model."""

    def __init__(self):
        self.st = NERTagger(os.path.join(STANFORD_PATH, 'classifiers/english.all.3class.distsim.crf.ser.gz'),
                            os.path.join(STANFORD_PATH, 'stanford-ner-3.4.jar'))

    def NER(self, s):
        """Tag the whitespace-separated tokens of `s`; dots are replaced
        with spaces and the text is UTF-8 encoded before splitting."""
        cleaned = s.replace('.', ' ').encode('utf-8')
        return self.st.tag(cleaned.split())
def main():
    """Read ada_lovelace.txt, NER-tag its NNP words with the Stanford
    3-class model, WordNet-tag its NN words, and print both results."""
    # `with` plus a non-builtin name fixes the original's shadowing of
    # the builtin `file` and its leaked file handle.
    with open("ada_lovelace.txt", 'r') as infile:
        raw = infile.read()
    raw = raw.decode('utf-8')  # Python-2 style bytes -> unicode decode
    text = nltk.word_tokenize(raw)
    # Location, Person, Organization
    class3 = NERTagger('stanford-ner/classifiers/english.all.3class.distsim.crf.ser.gz', 'stanford-ner/stanford-ner.jar')
    class3_nowiki = NERTagger('stanford-ner/classifiers/english.nowiki.3class.distsim.crf.ser.gz', 'stanford-ner/stanford-ner.jar')
    # Location, Person, Organization, Misc
    class4 = NERTagger('stanford-ner/classifiers/english.conll.4class.distsim.crf.ser.gz', 'stanford-ner/stanford-ner.jar')
    # Time, Location, Organization, Person, Money, Percent, Date
    class7 = NERTagger('stanford-ner/classifiers/english.muc.7class.distsim.crf.ser.gz', 'stanford-ner/stanford-ner.jar')
    #print(class3.tag(text))
    nnp_words = []
    nn_words = []
    not_tagged = []
    pos_tags = nltk.pos_tag(text)
    for tag in pos_tags:
        if tag[1] == 'NNP':
            nnp_words.append(tag[0])
        elif tag[1] == 'NN':
            nn_words.append(tag[0])
    print("NERTagged words:")
    ner_tagged = class3.tag(nnp_words)
    tagged = []
    # tag() returns the words as one tagged "sentence"; separate hits
    # from the 'O' (untagged) tokens.
    for t in ner_tagged[0]:
        if t[1] == u'O':
            not_tagged.append(t)
        else:
            tagged.append(t)
    print(tagged)
    print("WordNet Tagged Words:")
    print(WNtagger(nn_words))
    print("Not Tagged Words:")
    print(not_tagged)
def compute_NER(corpus):
    """Write one space-terminated line of NER tags per sentence of
    `corpus` to the coarse training-features file."""
    st = NERTagger(read_property('StanfordNerClassifier'), read_property('StanfordNerJarPath'))
    # `with` guarantees the feature file is closed even on error.
    with open(read_property('NER_features_train_coarse_path'), "w") as fi:
        for sentence in corpus:
            ner = st.tag(sentence.split())
            # Each tag followed by one space (trailing space kept to
            # match the original concatenation loop).
            ner_tag = "".join(n[1] + " " for n in ner)
            fi.write(ner_tag + "\n")
def NERTag(self, question):
    """
    input: query (keywords of query) as string
    output: NER tagged list of the snippets and title
    """
    snippets = self.getSnippets(question)
    taggedList = []
    start_time = time.time()
    # Construct the tagger once: the original rebuilt a NERTagger (and
    # its JVM subprocess) inside the loop for every snippet.
    st = NERTagger('stanford-ner-2014-01-04/classifiers/english.muc.7class.distsim.crf.ser.gz',
                   'stanford-ner-2014-01-04/stanford-ner.jar')
    for item in snippets:
        # Drop non-ASCII characters before tagging.
        temp = item.encode('ascii', 'ignore')
        tagged = st.tag(temp.split())
        taggedList.append(tagged)
    # print "NER tagged list: ", taggedList
    # print
    # print "Tagging: ", time.time() - start_time
    # print
    return taggedList
def compute_NER(corpus):
    """NER-tag every sentence of `corpus` and write its tags -- space
    separated, one sentence per line -- to the coarse feature file."""
    st = NERTagger(read_property('StanfordNerClassifier'),
                   read_property('StanfordNerJarPath'))
    # `with` closes the output file even if tagging raises.
    with open(read_property('NER_features_train_coarse_path'), "w") as fi:
        for sentence in corpus:
            ner = st.tag(sentence.split())
            # join replaces the quadratic string concatenation; each tag
            # is followed by a single space, as before.
            fi.write("".join(n[1] + " " for n in ner) + "\n")
def extract_persons_stanford(sample, stanfordPath, model):
    """Count PERSON-tagged tokens in `sample`.

    :return: list of (entity, count) pairs sorted by descending count.
    """
    from nltk.tag.stanford import NERTagger
    import operator
    st = NERTagger(stanfordPath + get_model_name(model),
                   stanfordPath + '/stanford-ner-2014-01-04.jar')
    entity_names = st.tag(sample.split())
    entity_count = {}
    for entity, tag in entity_names:
        # `tag == "PERSON"` replaces the Python-2-only cmp(...) == 0.
        if tag == "PERSON":
            entity_count[entity] = entity_count.get(entity, 0) + 1
    # .items() replaces the Python-2-only .iteritems().
    sorted_occurrences = sorted(entity_count.items(), reverse=True,
                                key=operator.itemgetter(1))
    return sorted_occurrences
def entityTagger():
    """Tag the NN/NNP tokens of "en.tok.off.test.pos" and append the
    entity tag (Stanford NER, falling back to wordNetTagger for 'O'
    results) to each row of "en.tok.off.test.pos.tagged"."""
    class3 = NERTagger('stanford-ner/classifiers/english.all.3class.distsim.crf.ser.gz',
                       'stanford-ner/stanford-ner.jar')
    row_format = "{:4}{:4}{:6}{:20}{:6}{:10}"
    # `with` closes both files even on error (the original leaked the
    # output handle on an exception).
    with open("en.tok.off.test.pos.tagged", "w") as output, \
            open("en.tok.off.test.pos", "r") as inp_file:
        for l in inp_file:
            line = l.split()
            # Only nouns are tagged; other rows are skipped entirely.
            if line[4] == "NN" or line[4] == "NNP":
                ner_tagged = class3.tag([line[3]])
                print("Nertagged:", ner_tagged)
                for t in ner_tagged[0]:
                    if len(t[1]) < 3:
                        # No NER tag ('O'): fall back to WordNet.
                        tag = wordNetTagger(t[0])
                        print("Wordnet tag:", tag)
                        output.write(row_format.format(line[0], line[1], line[2],
                                                       line[3], line[4], tag) + "\n")
                    else:
                        output.write(row_format.format(line[0], line[1], line[2],
                                                       line[3], line[4], t[1]) + "\n")
class EventDetectiveNer(EventDetective):
    """Event detective variant that NER-tags the tweets of each event
    cluster and embeds the entities in Google Maps markers."""

    def loadClassifier(self):
        """Attach a Stanford NER tagger trained on tweets."""
        classifier = "ner/classifiers/" + "tweets.ser.gz"
        jar = "ner/stanford-ner-3.4.jar"
        self.tagger = NERTagger(classifier, jar)

    def tagText(self, candidate):
        """Group the words of the candidate's tweets by NER tag."""
        result = defaultdict(list)
        text = " ".join([tweet['text'] for tweet in candidate]) #make one long text
        for line in self.tagger.tag(nltk.word_tokenize(text)):
            for word, tag in line:
                result[tag].append(word)
        return result

    def generateMarkers(self):
        """Write vis/map/js/markers.js: one Google Maps marker per event,
        annotated with the cluster's tweets and its named entities."""
        print("Creating Google Maps markers & add WIKI links...")
        js = open('vis/map/js/markers.js', 'w')
        js.write('var locations = [')
        for tweets, label in self.events:
            writableCluster = ''
            gh = []
            i = 0
            avgLon = 0
            avgLat = 0
            #tweets = sorted(tweets, key=itemgetter('unixTime'));
            for tweet in tweets:
                i = i + 1
                gh.append(tweet['geoHash'])
                avgLon += float(tweet["lon"])
                avgLat += float(tweet["lat"])
                # backslashes for multiline strings in Javascript
                writableCluster += "{} {} {} {}<br/><br/>".format(
                    tweet['localTime'], tweet['geoHash'], tweet['user'],
                    tweet['text']).replace("'", "\\'")
            # Compute the Cartesian (ordinary) mean of the coordinates; the
            # error (from the Earth's curvature) will likely be small since
            # this is a small patch of the Earth... in other words, we
            # pretend the Earth is flat ;-)
            avgLon /= i
            avgLat /= i
            nertags = self.tagText(tweets)
            for key in nertags:
                if key != 'O':
                    writableCluster += "</br> {} {}".format(
                        key, " ,".join(list(set(nertags[key]))).replace("'", "\\'"))
            js.write("['{}', {}, {}, '{}'],".format(writableCluster, avgLat, avgLon, label))
        js.write('];')
        js.close()
def main(word_transformation = None, result_path = None, n = 50):
    """Evaluate the Stanford CoNLL 4-class tagger on the first `n` CoNLL
    test sentences and dump (gold, predicted, sentences) to result_path.

    word_transformation -- optional callable applied to every word before
        tagging (e.g. case folding).
    (Python 2 code -- print statements.)"""
    tagged_corpus = CoNLLNERReader(TEST_DATA_PATH).read()[:n]
    tagger = NERTagger('/cs/fs/home/hxiao/code/stanford-ner-2015-01-30/classifiers/english.conll.4class.distsim.crf.ser.gz',
                       '/cs/fs/home/hxiao/code/stanford-ner-2015-01-30/stanford-ner.jar')
    print "extracting sentence words"
    if word_transformation and callable(word_transformation):
        tagged_corpus = [[(word_transformation(w), t) for w,t in sent] for sent in tagged_corpus]
    print "extracting sents/tags"
    # Generator of word lists; consumed in lockstep with correct_tags.
    sents = ([w for w,t in sent] for sent in tagged_corpus)
    correct_tags = [transform_labels([t for w,t in sent]) for sent in tagged_corpus]
    print "predicting"
    predicted_tags = []
    really_correct_tags = []
    # some sentence might be dropped
    sentences = []
    for i, (ctags, sent) in enumerate(zip(correct_tags, sents)):
        # Progress report every 5 sentences.
        if (i+1) % 5 == 0:
            print "%d finished" %(i+1)
        try:
            ptags = [t for w,t in tagger.tag(sent)]
            # Keep only sentences where gold and prediction lengths agree.
            if len(ctags) == len(ptags):
                predicted_tags.append(ptags)
                really_correct_tags.append(ctags)
                sentences.append(sent)
            else:
                print "tags length does not match for %r" %(sent)
        except UnicodeDecodeError:
            print "UnicodeDecodeError for ", sent
    assert len(really_correct_tags) == len(predicted_tags), "length inconsistent"
    print "%d finished" %(i+1)
    dump((really_correct_tags, predicted_tags, sentences), open(result_path, "w"))
def handleProperNoun(tokens, pos, position): st = NERTagger( '../stanford-ner/classifiers/english.all.3class.distsim.crf.ser.gz', '../stanford-ner/stanford-ner.jar') # get tokens & pos before verb bTokens = tokens[:position] bPos = pos[:position] ner = st.tag(bTokens) # reverse everything now ner = ner[::-1] bPos = bPos[::-1] person = False i = -1 if isProperNoun(bPos[0][1]) and isPerson(ner[0][1]): i = 0 person = True while (i < len(bPos) and isProperNoun(bPos[i][1]) and isPerson(ner[i][1])): i = i + 1 elif isProperNoun(bPos[0][1]): i = 0 while (i < len(bPos) and isProperNoun(bPos[i][1])): i = i + 1 # Reverse back and remove extra ner = ner[::-1] if (i > -1): for r in range(1, i): tokens.pop(len(bTokens) - i) pos.pop(len(bTokens) - i) position = position - 1 if person: tokens[position - 1] = 'who' else: tokens[position - 1] = 'what' return (tokens, pos, position)
def __init__(self, argv):
    """Set up the wikification NER tagger, HTML templates and measures,
    then run classification.

    :param argv: command-line argument list; argv[1] is the test file.

    Bug fix: the test file now comes from the `argv` parameter instead of
    reaching for the global sys.argv (the parameter was ignored).
    """
    classifier = "ner/classifiers/" + "wikification.ser.gz"
    jar = "ner/stanford-ner-3.4.jar"
    self.tagger = NERTagger(classifier, jar)
    self.testfile = open(argv[1])
    with open('html/htmlheader.txt', 'r') as h:
        self.htmlHeader = h.read()
    with open('html/htmlfooter.txt', 'r') as f:
        self.htmlFooter = f.read()
    self.measures = Measures()
    self.classify()
def generate(word):
    """Generate a question from the sentence `word`: try a yes/no form
    first, then a who/what form; return None when both fail."""
    tagger = NERTagger(
        '../stanford-ner/classifiers/english.all.3class.distsim.crf.ser.gz',
        '../stanford-ner/stanford-ner.jar')
    tokens = nltk.word_tokenize(word)
    pos = nltk.pos_tag(tokens)
    ner = tagger.tag(tokens)
    # TODO: Add in the question mark at the end of the sentence
    for strategy in (simpleYesNo, simpleWhoOrWhat):
        success, question = strategy(tokens, pos)
        if success:
            return question
    return None
def tagdata(refDict):
    """ Gives the data its NER Tags using our trained tagger.

    Reads the first tab column of testdata.tsv as tokens, tags them with
    the custom ner-pta model, and copies each tag onto every refDict
    entry whose first element matches the token text.

    :return: (taggedText, refDict) -- tagger output and updated dict.
    """
    #pbar = ProgressBar()
    tokens = []
    # `with` closes testdata.tsv (the original leaked the handle).
    with codecs.open('testdata.tsv', 'r') as testData:
        for line in testData:
            if len(line) > 1:
                token = line.strip().split('\t')
                tokens.append(token[0])
    #os.environ['JAVAHOME'] = "C:\Program Files\Java\jdk1.8.0_45/bin"
    # (unused `path` variable removed)
    classifier = "ner-pta.ser.gz"
    jar = "stanford-ner.jar"
    tagger = NERTagger(classifier, jar)
    taggedText = tagger.tag(tokens)
    # Propagate tags back onto refDict entries with matching token text.
    for line in taggedText:
        for tup in line:
            for key, value in refDict.items():
                if tup[0] == value[0]:
                    refDict[key] = [tup[0], tup[1]]
    return taggedText, refDict
def NERTag(self, question):
    """
    input: query (keywords of query) as string
    output: NER tagged list of the snippets and title
    """
    snippets = self.getSnippets(question)
    taggedList = []
    start_time = time.time()
    # Build the tagger once; the original constructed a fresh NERTagger
    # (spawning a JVM) for every snippet in the loop.
    st = NERTagger(
        'stanford-ner-2014-01-04/classifiers/english.muc.7class.distsim.crf.ser.gz',
        'stanford-ner-2014-01-04/stanford-ner.jar')
    for item in snippets:
        # Drop non-ASCII characters before tagging.
        temp = item.encode('ascii', 'ignore')
        tagged = st.tag(temp.split())
        taggedList.append(tagged)
    # print "NER tagged list: ", taggedList
    # print
    # print "Tagging: ", time.time() - start_time
    # print
    return taggedList
class EventDetectiveNer(EventDetective):
    """Event detective variant that NER-tags each event's tweets and adds
    the entities to the generated Google Maps markers."""

    def loadClassifier(self):
        """Attach a Stanford NER tagger trained on tweets."""
        classifier = "ner/classifiers/" + "tweets.ser.gz"
        jar = "ner/stanford-ner-3.4.jar"
        self.tagger = NERTagger(classifier, jar)

    def tagText(self, candidate):
        """Group the words of the candidate's tweets by NER tag."""
        result = defaultdict(list)
        text = " ".join([tweet['text'] for tweet in candidate]) #make one long text
        for line in self.tagger.tag(nltk.word_tokenize(text)):
            for word, tag in line:
                result[tag].append(word)
        return result

    def generateMarkers(self):
        """Write vis/map/js/markers.js: one marker per event cluster with
        its tweets, mean coordinates and named entities."""
        print("Creating Google Maps markers & add WIKI links...")
        js = open('vis/map/js/markers.js','w')
        js.write('var locations = [')
        for tweets,label in self.events:
            writableCluster = ''
            gh = []
            i = 0
            avgLon = 0
            avgLat = 0
            #tweets = sorted(tweets, key=itemgetter('unixTime'));
            for tweet in tweets:
                i = i + 1
                gh.append(tweet['geoHash'])
                avgLon += float(tweet["lon"])
                avgLat += float(tweet["lat"])
                # backslashes for multiline strings in Javascript
                writableCluster += "{} {} {} {}<br/><br/>".format(tweet['localTime'], tweet['geoHash'], tweet['user'], tweet['text']).replace("'", "\\'")
            # Compute the Cartesian (ordinary) mean of the coordinates; the
            # error (from the Earth's curvature) will likely be small since
            # this is a small patch of the Earth... in other words, we
            # pretend the Earth is flat ;-)
            avgLon /= i
            avgLat /= i
            nertags = self.tagText(tweets)
            for key in nertags:
                if key != 'O':
                    writableCluster += "</br> {} {}".format(key, " ,".join(list(set(nertags[key]))).replace("'", "\\'"))
            js.write("['{}', {}, {}, '{}'],".format(writableCluster,avgLat,avgLon,label))
        js.write('];')
        js.close()
class Ner():
    """Stanford NER wrapper using a tweet-trained model."""

    def __init__(self):
        classifier = "ner/classifiers/" + "ner-model-tweets.ser.gz"
        jar = "ner/stanford-ner-3.4.jar"
        self.tagger = NERTagger(classifier, jar)

    def tagText(self, candidate):
        """Group the words of the candidate's tweets by NER tag."""
        result = defaultdict(list)
        text = " ".join([tweet['text'] for tweet in candidate]) #make one long text
        # Bug fix: the original tagged `self.tokens` -- an attribute never
        # set anywhere in this class -- and ignored `text` entirely.  Tag
        # the joined tweet text, as the sibling EventDetectiveNer does.
        for line in self.tagger.tag(nltk.word_tokenize(text)):
            for word, tag in line:
                result[tag].append(word)
        return result
def print_symptoms_from_page(url='', model='', stanford_jar=''):
    """Scrape `url`, tag its sentences with a custom model, and print the
    distinct multi-word spans tagged 'SYMP'.  (Python 2 code.)"""
    html_reader = HTMLReader(url)
    cleaned_text = html_reader.get_text_from_page()
    symptoms = set()
    st = NERTagger(model, stanford_jar, encoding='utf-8')
    sentences = nltk.sent_tokenize(cleaned_text)
    for sentence in sentences:
        tags = st.tag(nltk.word_tokenize(sentence))
        tag_index = 0
        while tag_index < len(tags):
            if tags[tag_index][1] == 'SYMP':
                # Collect the whole run of non-'O' tokens as one symptom.
                symptom = []
                while tag_index < len(tags) and tags[tag_index][1] != 'O':
                    symptom.append(tags[tag_index][0])
                    tag_index += 1
                symptoms.add(' '.join(symptom))
            else:
                tag_index += 1
    print "Found %d symptoms:" % len(symptoms)
    for symptom in symptoms:
        print symptom
def tagger_init(ner_class=7):
    """Initialise the module-level `tagger` with a 4- or 7-class model.

    :param ner_class: 4 (CoNLL model) or 7 (MUC model).
    :return: True on success.
    :raises ValueError: for any other ner_class.  (The original merely
        printed a warning and then crashed with a NameError on the
        undefined `classifier`.)
    """
    global tagger
    if ner_class == 4:
        classifier = "english.conll.4class.distsim.crf.ser.gz"
    elif ner_class == 7:
        classifier = "english.muc.7class.distsim.crf.ser.gz"
    else:
        raise ValueError('Invalid ner_class, should be 4 or 7')
    NER_CLASSIFIER = os.path.join(stanford_path, "classifiers", classifier)
    tagger = NERTagger(NER_CLASSIFIER, NER_JAR)
    return True
def findName(line):
    """Extract PERSON names from `line`, joining consecutive name tokens
    with underscores; prints and returns the list of names."""
    st = NERTagger(
        '../poli_stanford_ner/stanford_ner/english.all.3class.distsim.crf.ser.gz',
        '../poli_stanford_ner/stanford_ner/stanford-ner-4.2.0.jar')
    pos = 0
    savedPos = -1
    multi_name = {}  # position -> (word, 'PERSON') pairs found
    ret_names = []
    # classifying if there are names in the sentence
    for sent in nltk.sent_tokenize(line):
        tokens = nltk.tokenize.word_tokenize(sent)
        tags = st.tag(tokens)
        for tag in tags:
            if tag[1] == 'PERSON':
                print(tag)
                multi_name[pos] = tag
                # NOTE(review): pos only advances for PERSON tokens here,
                # which makes the stored positions always consecutive --
                # confirm whether it should advance for every token.
                pos += 1
    # where it starts to see if there's first, middle, and last names
    keys = isConsecutive(multi_name)
    if keys:
        #print("Multi name!")
        # Join each consecutive run of name tokens with underscores.
        for keySet in keys:
            tmp = None
            for key in keySet:
                if tmp is None:
                    tmp = multi_name[key][0]
                else:
                    tmp += "_" + multi_name[key][0]
            #print("\t\t", tmp)
            ret_names.append(tmp)
    else:
        tmp = None
        # Fallback: emit single names at gaps between stored positions.
        for posInLine in multi_name:
            # if this is the first time through
            if savedPos == -1:
                savedPos = posInLine
            if savedPos + 1 != posInLine:
                tmp = multi_name[savedPos][0]
                ret_names.append(tmp)
                savedPos = posInLine
    print(ret_names)
    return ret_names
class StanfordNerTagger():
    """
    Wrapper class for the nltk.tag.stanford.NERTagger module.  Holds a
    single configured tagger instance and offers helper methods that
    streamline its use.
    """

    def __init__(self):
        # Example here: www.nltk.org/api/nltk.tag.html#module-nltk.tag.stanford
        self.Tagger = NERTagger(
            app.config['SNER_CLASSIFIERS'],
            app.config['SNER_JARFILE'],
            encoding='utf-8')

    def Tag(self, text):
        """
        Identify every entity mentioned in `text` and associate it with
        an entity type.

        Example:
            Input: "I am Jack and I live in Phoenix, Arizona."
            Tag Result: "[(I)]... TODO"

        :param str text: text to tokenize and tag
        :returns: list of tuples -- see above example
        """
        return self.Tagger.tag(text)

    def __repr__(self):
        return "<StanfordNerTagger(Tagger=%s)>" % (self.Tagger)
from nltk.tag.stanford import NERTagger

# Caseless 3-class English models; TRAINING_MOD selects the active one.
ALL_CASELESS = '/home/azureuser/edu/stanford/nlp/models/ner/english.all.3class.caseless.distsim.crf.ser.gz'
NOWIKI_CASELESS = '/home/azureuser/edu/stanford/nlp/models/ner/english.nowiki.3class.caseless.distsim.crf.ser.gz'
TRAINING_MOD = ALL_CASELESS
NER_JAR = '/home/azureuser/stanford-ner-2014-01-04/stanford-ner.jar'

# Module-level tagger, built once at import time.
st = NERTagger(TRAINING_MOD, NER_JAR)


def get_named_entities(text):
    """Return the (token, tag) pairs of `text` whose tag is not 'O'.

    Bug fix: the original filtered with `t[1] is not 'O'` -- an identity
    comparison against a string literal, which is implementation-defined
    and does not reliably exclude 'O' tags.  Use `!=` for value equality.
    """
    tagged = st.tag(text.split())
    return [t for t in tagged if t[1] != 'O']
# Python 2 script: NER-tag every line of one text file with the 7-class
# English model.  NOTE(review): the jar passed is a CoreNLP jar rather
# than the stanford-ner jar -- confirm it works with NERTagger.
from nltk.tag.stanford import NERTagger
import os
# Point the JVM used by NLTK at a local JDK install.
java_path = "C:/Program Files/Java/jdk1.8.0_45/bin/java.exe"
os.environ['JAVAHOME'] = java_path
st = NERTagger('./english.all.7class.distsim.crf.ser.gz', './stanford-corenlp-3.5.2.jar')
file = open("text/289007975")
# Read the file in ~100 KB batches of lines until exhausted.
while 1:
    lines = file.readlines(100000)
    if not lines:
        break
    for line in lines:
        # Decode permissively: undecodable bytes are dropped.
        print st.tag(unicode(line, errors='ignore').split())
def loadClassifier(self):
    """Attach a Stanford NER tagger trained on tweets to self.tagger."""
    model = "ner/classifiers/" + "tweets.ser.gz"
    ner_jar = "ner/stanford-ner-3.4.jar"
    self.tagger = NERTagger(model, ner_jar)
return "09" elif (month.lower() == "october"): return "10" elif (month.lower() == "november"): return "11" elif (month.lower() == "december"): return "12" #http://api.wunderground.com/api/4ab5a36ab8ce63df/history_19940625/q/CA/Santa_barbara.json #def stream(head, tail, *rest, **kwargs): # if kwargs.key("lazy") # # do something here # # if kwargs.key(""): # #stream(x, y, lazy = True) # #stream(x, y, 0, 0, 0, 0, x= "hello") st = NERTagger( '/Users/dspoka/Desktop/Afact/NLP/stanford-ner/classifiers/english.muc.7class.distsim.crf.ser.gz', '/Users/dspoka/Desktop/Afact/NLP/stanford-ner/stanford-ner-3.4.1.jar') _dateExtract( "I f****d a girl named May and it was really hot who was born on June 25th, 1994" ) print("Let's see if this works!")
#!/usr/bin/env python # -*- coding: utf-8 -* import numpy import nltk from nltk.tag.stanford import NERTagger ## Configure this to be your Java directory #nltk.internals.config_java(u"C:/Program Files/Java/jre7/bin/java.exe") chunk = u"妈妈带我去公园散步" #chunk = u"妈我" #tagger = POSTagger() #token_tags = tagger.tag(chunk) #for token,tag in token_tags: # print token,tag text = nltk.word_tokenize(chunk.encode('utf-8')) st = NERTagger('chinese-distsim.tagger', 'stanford-postagger-3.1.4.jar') poop = st.tag(text) print poop #tagger = pickle.load(open('sinica_treebank_brill_aubt.pickle')) #poop = tagger.tag(text) #print poop #poop2 = nltk.pos_tag(text) #print poop2
reload(sys) sys.setdefaultencoding('utf-8') pathtojava = "/usr/bin/java" #os.environ['JAVAHOME'] = pathtojava importer = zipimport.zipimporter('nltk.mod') nltk = importer.load_module('nltk') nltk.internals.config_java(pathtojava) nltk.data.path += ["./nltkData/"] from nltk.tag.stanford import NERTagger #nltk.internals.config_java(pathtojava); #stanfordTagge- = NERTagger('CollSmall-ner-model.ser.gz', 'stanford-ner.jar', 'utf-8') stanfordTagger = NERTagger('english.all.3class.distsim.crf.ser.gz', 'stanford-ner.jar', 'utf-8') #input = open('stanfordNER.pickle', 'rb'); #stanfordTagger = load(input) #input.close() # input is file with fullpath filenames for line in sys.stdin: #assume line is the full path for a file fname = line.rstrip('\n').split('\t')[0] text = '' try: with open('./eventData/' + fname, 'r') as f: text = f.read() except: continue