def main():
    """Tag ada_lovelace.txt with the Stanford MUC 7-class NER model and print answers 2.1-2.3."""
    #os.environ['JAVAHOME'] = "C:\Program Files\Java\jdk1.8.0_45/bin"
    path = "ner"
    classifier = path + "/classifiers/" + "english.muc.7class.distsim.crf.ser.gz"
    jar = path + "/stanford-ner-3.4.jar"
    tagger = NERTagger(classifier, jar)

    tokens = tokenize('ada_lovelace.txt')
    taggedText = tagger.tag(tokens)

    # Collect every tag for frequency counting, and every non-'O' word for lemmatization.
    countList = []
    nounList = []
    for word, tag in taggedText:
        countList.append(tag)
        if tag != 'O':
            nounList.append(word)

    print("Answer to 2.1: \n{} \nThey certainly aren't all correct.".format(Counter(countList)))
    print()
    print("Answer to 2.2: The other classifiers seem to achieve similar results,\nbut because of the multiple categories it is more interesting to read.")

    # Re-tag the lemmatized entity words.
    lemmas = lemmatize(nounList)
    taggedLemmas = tagger.tag(lemmas)
    print("Answer to 2.3:\n", taggedLemmas)
class NERParser (object):
    """Accumulates multi-word LOCATION and ORGANIZATION entities from tagged text."""

    def __init__(self):
        # Hard-coded model/jar paths from the author's machine.
        self.st = NERTagger(
            "/Users/trentniemeyer/nltk_data/stanford-ner-2014-06-16/classifiers/english.muc.7class.distsim.crf.ser.gz",
            "/Users/trentniemeyer/nltk_data/stanford-ner-2014-06-16/stanford-ner.jar")
        self.locations = []
        self.organizations = []

    def parse(self, text):
        """Tag *text* and append each contiguous LOCATION/ORGANIZATION span found."""
        ne = self.st.tag(nltk.word_tokenize(text))
        for sentence in ne:
            lastwordwasentity = False
            lastentity = ''
            lasttype = ''
            for (word, entitytype) in sentence:
                if entitytype == 'ORGANIZATION' or entitytype == 'LOCATION':
                    # Extend the current entity span, or start a new one.
                    if lastwordwasentity:
                        lastentity += ' ' + word
                    else:
                        lastentity = word
                    lastwordwasentity = True
                    lasttype = entitytype
                else:
                    if lastwordwasentity:
                        if lasttype == 'LOCATION':
                            self.locations.append(lastentity)
                        else:
                            self.organizations.append(lastentity)
                    lastentity = ''
                    lastwordwasentity = False
            # Fix: an entity ending at the last token of a sentence was
            # silently dropped; flush it here.
            if lastwordwasentity:
                if lasttype == 'LOCATION':
                    self.locations.append(lastentity)
                else:
                    self.organizations.append(lastentity)

    def locationFrequencies(self):
        # Fix: print() function (Python 2 print statement is a py3 SyntaxError).
        print(collections.Counter(self.locations))

    def organizationFrequencies(self):
        print(collections.Counter(self.organizations))
def entityTagger():
    """
    Tags nouns in given file, writes them to output file.

    Reads whitespace-split records from "pos.tagged"; for NN/NNP tokens
    (column 5) asks Stanford 3-class NER, falling back to wordNetTagger()
    when NER yields no tag; writes aligned rows to "entity.tagged".
    :rtype : object
    """
    class3 = NERTagger('stanford-ner/classifiers/english.all.3class.distsim.crf.ser.gz',
                       'stanford-ner/stanford-ner.jar')
    row_fmt = "{:8}{:8}{:8}{:8}{:60}{:6}{:13}"
    # Fix: manage BOTH files with `with`, so the output handle is closed
    # and flushed even if tagging raises mid-file (was open()/close()).
    with open("entity.tagged", "w") as output, open("pos.tagged", "r") as inp_file:
        for l in inp_file:
            line = l.split()
            # If word is a noun, go tag it!
            print(line)
            if line[5] == "NN" or line[5] == "NNP":
                ner_tagged = class3.tag([line[4]])
                for t in ner_tagged[0]:
                    # No nertag? Check wordnet tagging.
                    if len(t[1]) < 3:
                        tag = wordNetTagger(t[0])
                    else:
                        tag = t[1]
                    output.write(row_fmt.format(line[0], line[1], line[2],
                                                line[3], line[4], line[5], tag) + "\n")
            else:
                # Non-noun rows are written with a placeholder tag.
                output.write(row_fmt.format(line[0], line[1], line[2],
                                            line[3], line[4], line[5], "-") + "\n")
def ngramTagger(l):
    """
    This function takes a list of ngrams, creates bigrams and entity tags them.
    :param l: input must be a list of bigrams, formed in tuples
    :return: returns a list with words that are tagged. (For example, "El Salvador" would be
    [("El", "LOCATION"), ("Salvador", "LOCATION")]
    """
    bigrams_ner = []
    bigrams_wn = []
    bigrams = []
    tb = []
    # Build the space-joined (NER input) and underscore-joined (WordNet input) forms.
    for first, second in l:
        joined_ner = first + " " + second
        joined_wn = first + "_" + second
        bigrams_ner.append(joined_ner)
        bigrams_wn.append(joined_wn)
        bigrams.append((joined_ner, joined_wn))
    class3 = NERTagger('stanford-ner/classifiers/english.all.3class.distsim.crf.ser.gz',
                       'stanford-ner/stanford-ner.jar')
    # Keep NER hits, except LOCATION (those are re-checked via WordNet below).
    for sentence in class3.tag(bigrams_ner):
        for pair in sentence:
            if len(pair[1]) > 3 and pair[1] != "LOCATION":
                tb.append(pair)
    # WordNet pass: classify location-like bigrams into finer categories.
    for ner_form, wn_form in bigrams:
        tag_bg = bgWordNetTagger(ner_form, wn_form)
        if tag_bg == "COUNTRY" or tag_bg == "STATE" or tag_bg == "CITY" or tag_bg == "TOWN":
            words = ner_form.split()
            tb.extend([(words[0], tag_bg), (words[1], tag_bg)])
    print(tb)
class Parser(object):
    """Thin NER wrapper around the Stanford 3-class model under STANFORD_PATH."""

    def __init__(self):
        model_path = os.path.join(STANFORD_PATH, 'classifiers/english.all.3class.distsim.crf.ser.gz')
        jar_path = os.path.join(STANFORD_PATH, 'stanford-ner-3.4.jar')
        self.st = NERTagger(model_path, jar_path)

    def NER(self, s):
        """Strip periods, UTF-8-encode, split on whitespace and tag."""
        cleaned = s.replace('.', ' ')
        encoded = cleaned.encode('utf-8')
        return self.st.tag(encoded.split())
def sdfprocess(rawexpr):
    """Return the first PERSON entity found in *rawexpr*; fall back to the preprocessed text."""
    parser = NERTagger(
        path_to_model='/home/cosmo/Dropbox/Purdue/nlp/stanford-corenlp-full-2014-08-27/english.all.3class.distsim.crf.ser.gz',
        path_to_jar='/home/cosmo/Dropbox/Purdue/nlp/stanford-corenlp-full-2014-08-27/stanford-corenlp-3.4.1.jar',
        java_options='-mx2000m')
    expr = preprocess(rawexpr)
    named_expr = rechunk(parser.tag(word_tokenize(expr)))
    # First PERSON wins; otherwise hand back the preprocessed expression.
    for token, label in named_expr:
        if label == 'PERSON':
            return token
    return expr
def ngramTagger(l):
    """
    this function creates bigrams, tags them via Stanford NER or Word Net, and searches links for wiki pages.
    :param l: input must be a list of bigrams, formed in tuples
    :return: returns a list with words that are tagged and linked to wikipedia.
    """
    print("checking ngrams")
    nerts = []
    # First, create words which are suited as input for NERTagger.
    for i in l:
        ngram_ner = i[0] + " " + i[1]
        nerts.append(ngram_ner)
    # Input the list of suitable bigrams in the NERTagger, and form the output to a wanted format with nerToBG()
    class3 = NERTagger('stanford-ner/classifiers/english.all.3class.distsim.crf.ser.gz',
                       'stanford-ner/stanford-ner.jar')
    ner_result = class3.tag(nerts)
    bigramsAndTags = nerToBG(ner_result)
    for t in bigramsAndTags:
        # If tagged as location, get rid of location via the same technique as locationTagger(), but then for bigrams,
        # using getRidOfLocation()
        if t[1] == "LOCATION" or t[2] == "LOCATION":
            wn_bg = t[0].split()[0] + "_" + t[0].split()[1]
            wAndTag = getRidOfLocation(wn_bg)
            t[1] = wAndTag[1]
            t[2] = wAndTag[1]
    final_list = []
    a = 0
    for j in range(len(bigramsAndTags)):
        # If the 2 words of the bigram are tagged the same, append them to final_list.
        if bigramsAndTags[a][1] == bigramsAndTags[a][2]:
            final_list.extend([(bigramsAndTags[a][0], bigramsAndTags[a][1])])
        # If word 1 isn't tagged and word 2 is, check if word 1 is tagged in the development set.
        # If this tag is the same as the tag of word 2, append to final_list.
        elif checkBGTag(bigramsAndTags[a][0].split()[0]) == bigramsAndTags[a][2]:
            final_list.extend([(bigramsAndTags[a][0], bigramsAndTags[a][2])])
        # If word 2 isn't tagged and word 1 is, check if word 2 is tagged in the single word tagged development set.
        # If this tag is the same as the tag of word 1, append to final_list.
        elif checkBGTag(bigramsAndTags[a][0].split()[1]) == bigramsAndTags[a][1]:
            final_list.extend([(bigramsAndTags[a][0], bigramsAndTags[a][1])])
        # NOTE(review): `a` mirrors `j`; increment assumed at loop level (every
        # iteration), not inside the last elif — confirm against original source.
        a += 1
    taglink_bigrams = []
    # Iterate a copy since tagless bigrams are removed from final_list in-flight.
    for bgs in final_list[:]:
        # If bigrams are still not tagged, remove them from the list.
        if len(bgs[1]) < 4:
            final_list.remove(bgs)
        else:
            # If they are tagged, look up wikipedia links.
            links = wiki_lookup(bgs[0], bgs[1])
            words = bgs[0].split(" ")
            taglink_bigrams.extend([(words[0], bgs[1], links), (words[1], bgs[1], links)])
    return taglink_bigrams
def queryForEntity2(expectedEntity, passage):
    """Return the (word, tag) pairs from *passage* whose NER tag is in *expectedEntity*."""
    st = NERTagger(
        '/Users/srinisha/Downloads/stanford-ner-2014-06-16/classifiers/english.all.3class.distsim.crf.ser.gz',
        '/Users/srinisha/Downloads/stanford-ner-2014-06-16/stanford-ner.jar')
    answer = st.tag(passage.split())
    # Fix: print() function (Python 2 print statement is a py3 SyntaxError).
    print(answer)
    answers = []
    # Keep every tagged pair matching any of the expected entity types.
    for j, currentExpectedEntity in enumerate(expectedEntity):
        for i, pair in enumerate(answer):
            if pair[1] == currentExpectedEntity:
                answers.append(answer[i])
    return answers
def tagger(data):
    """NER-tag whitespace-split *data*; wrap the result in a success/failure payload."""
    # Loading the classifier can fail (missing jar/model) -> error code 705.
    try:
        engine = NERTagger(
            './nltk-data/StanfordNER/english.all.3class.distsim.crf.ser.gz',
            './nltk-data/StanfordNER/stanford-ner.jar')
    except:
        return ret_failure(705)
    #try:
    tag = engine.tag(data.split())
    #except:
    #    return ret_failure(702)
    return ret_success(tag)
def compute_NER(corpus):
    """Return one space-joined string of Stanford NER tags per input sentence."""
    NER = []
    #fi=open("NER_features_train.txt","w")
    st = NERTagger(read_property('StanfordNerClassifier'), read_property('StanfordNerJarPath'))
    for sentence in corpus:
        ner = st.tag(sentence.split())
        # Each tag is followed by a single space (trailing space kept for
        # compatibility with downstream consumers).
        tag_line = "".join(pair[1] + " " for pair in ner)
        NER.append(tag_line)
    return NER
def main():
    """NER-tag a fixed word list with Stanford NER, then with WordNet hypernym checks."""
    words = ["Barack Obama", "Holland", "Government", "Tennis", "happiness"]
    noun_lemmas = []
    nouns = []
    final_ner_tagged = []
    not_ner_tagged = []
    pos_tags = nltk.pos_tag(words)
    lemmatizer = WordNetLemmatizer()
    class3 = NERTagger('stanford-ner/classifiers/english.all.3class.distsim.crf.ser.gz',
                       'stanford-ner/stanford-ner.jar')
    # STANFORD NERTAGGING HAPPENS HERE
    for tag in pos_tags:
        if tag[1] in ('NNP', 'NN'):
            nouns.append(tag[0])
    ner_tagged = class3.tag(nouns)
    # Split tagger output into recognized entities and 'O' (untagged) words.
    for t in ner_tagged[0]:
        if t[1] == u'O':
            not_ner_tagged.append(t[0])
        else:
            final_ner_tagged.append(t)
    print("NERTagged:")
    print(final_ner_tagged)
    # Reference synsets for each custom entity label.
    entities = {
        "COUNTRY": wordnet.synsets("country", pos='n'),
        "STATE": wordnet.synsets("state", pos='n'),
        "CITY": wordnet.synsets("city", pos='n'),
        "TOWN": wordnet.synsets("town", pos='n'),
        "NAT": wordnet.synsets("natural places", pos='n'),
        "PER": wordnet.synsets("person", pos='n'),
        "ORG": wordnet.synsets("organisation", pos='n'),
        "ANI": wordnet.synsets("animal", pos='n'),
        "SPO": wordnet.synsets("sport", pos='n'),
        "ENT": wordnet.synsets("entertainment", pos='n'),
    }
    tagged_top_entities = defaultdict(list)
    for word in pos_tags:
        if word[1] == "NN" or word[1] == "NNP":
            noun_lemmas.append(lemmatizer.lemmatize(word[0], wordnet.NOUN))
            word_synset = wordnet.synsets(word[0], pos="n")
            # A word gets a label when its first synset has the label's
            # reference synset as hypernym.
            for e in list(entities.keys()):
                if len(word_synset) != 0 and len(entities[e]) != 0:
                    if hypernymOf(word_synset[0], entities[e][0]):
                        tagged_top_entities[word[0]].append(e)
    print("WordNet tagged:")
    for w in tagged_top_entities:
        print("{:15}{:15}".format(w, tagged_top_entities[w]))
class EventDetectiveNer(EventDetective):
    """Event detective that decorates Google-Maps markers with NER entities."""

    def loadClassifier(self):
        # Tweet-trained Stanford model + jar shipped under ner/.
        classifier = "ner/classifiers/" + "tweets.ser.gz"
        jar = "ner/stanford-ner-3.4.jar"
        self.tagger = NERTagger(classifier, jar)

    def tagText(self, candidate):
        """Group all words of a candidate's tweets by their NER tag."""
        result = defaultdict(list)
        text = " ".join([tweet['text'] for tweet in candidate])  # make one long text
        for line in self.tagger.tag(nltk.word_tokenize(text)):
            for word, tag in line:
                result[tag].append(word)
        return result

    def generateMarkers(self):
        """Write vis/map/js/markers.js: one marker per detected event cluster."""
        print("Creating Google Maps markers & add WIKI links...")
        js = open('vis/map/js/markers.js', 'w')
        js.write('var locations = [')
        for tweets, label in self.events:
            writableCluster = ''
            gh = []
            count = 0
            avgLon = 0
            avgLat = 0
            #tweets = sorted(tweets, key=itemgetter('unixTime'));
            for tweet in tweets:
                count = count + 1
                gh.append(tweet['geoHash'])
                avgLon += float(tweet["lon"])
                avgLat += float(tweet["lat"])
                # Backslashes needed for multiline strings in Javascript.
                writableCluster += "{} {} {} {}<br/><br/>".format(
                    tweet['localTime'], tweet['geoHash'], tweet['user'],
                    tweet['text']).replace("'", "\\'")
            # Plain Cartesian average of the coordinates; the error from the
            # earth's curvature is negligible over such a small area (we
            # pretend the earth is flat here).
            avgLon /= count
            avgLat /= count
            nertags = self.tagText(tweets)
            for key in nertags:
                if key != 'O':
                    writableCluster += "</br> {} {}".format(
                        key, " ,".join(list(set(nertags[key]))).replace("'", "\\'"))
            js.write("['{}', {}, {}, '{}'],".format(writableCluster, avgLat, avgLon, label))
        js.write('];')
        js.close()
def german_ner(text):
    """ Moves the list of words through the NER tagger"""
    encoded = text.encode('utf8')
    tagger = NERTagger(
        '/Users/Lena/src/context/stanford-ner/classifiers/german/dewac_175m_600.crf.ser.gz',
        '/Users/Lena/src/context/stanford-ner/stanford-ner.jar',
        'utf8')
    return tagger.tag(encoded.split())
def spanish_ner(text):
    """ Moves the list of words through the NER tagger"""
    encoded = text.encode('utf8')
    tagger = NERTagger(
        '/Users/Lena/src/context/stanford-ner/edu/stanford/nlp/models/ner/spanish.ancora.distsim.s512.crf.ser.gz',
        '/Users/Lena/src/context/stanford-ner/stanford-ner.jar',
        'utf8')
    return tagger.tag(encoded.split())
def queryForEntity2(expectedEntity, passage):
    """Return the (word, tag) pairs from *passage* whose NER tag is in *expectedEntity*."""
    st = NERTagger(
        '/Users/srinisha/Downloads/stanford-ner-2014-06-16/classifiers/english.all.3class.distsim.crf.ser.gz',
        '/Users/srinisha/Downloads/stanford-ner-2014-06-16/stanford-ner.jar')
    answer = st.tag(passage.split())
    # Fix: print() function (Python 2 print statement is a py3 SyntaxError).
    print(answer)
    answers = []
    # Keep every tagged pair matching any of the expected entity types.
    for j, currentExpectedEntity in enumerate(expectedEntity):
        for i, pair in enumerate(answer):
            if (pair[1] == currentExpectedEntity):
                answers.append(answer[i])
    return answers
def standfordtagger(words):
    """Run the Stanford NER tagger over *words*; on failure print them and return None.

    NOTE(review): paths are intentionally blank placeholders in this variant.
    """
    # Fix: bare `except` also swallowed SystemExit/KeyboardInterrupt;
    # `except Exception` keeps the best-effort behavior but lets those propagate.
    try:
        os.environ['JAVAHOME'] = ''
        path = ""
        classifier = path + ""
        jar = path + "/stanford-ner-3.4.jar"
        st = NERTagger(classifier, jar)
        stanford_tagger = st.tag(words)
        return stanford_tagger
    except Exception:
        print(words)
def compute_NER(corpus):
    """Return one space-joined string of Stanford NER tags per input sentence."""
    NER = []
    #fi=open("NER_features_train.txt","w")
    st = NERTagger(read_property('StanfordNerClassifier'),
                   read_property('StanfordNerJarPath'))
    for sentence in corpus:
        tagged = st.tag(sentence.split())
        # Each tag is followed by a single space (trailing space kept on purpose).
        NER.append("".join(pair[1] + " " for pair in tagged))
    return NER
def standfordtagger(words):
    """Run the Stanford all-3class NER tagger over *words*; on failure print them and return None."""
    # Fix: bare `except` also swallowed SystemExit/KeyboardInterrupt;
    # `except Exception` keeps the best-effort behavior but lets those propagate.
    try:
        os.environ['JAVAHOME'] = '/usr/lib/jvm/java-1.7.0-openjdk-amd64'
        path = "/home/guido/PTA/stanford-ner-2014-06-16"
        classifier = path + "/classifiers/" + "english.all.3class.distsim.crf.ser.gz"
        jar = path + "/stanford-ner-3.4.jar"
        st = NERTagger(classifier, jar)
        stanford_tagger = st.tag(words)
        return stanford_tagger
    except Exception:
        print(words)
def tagger(data):
    """NER-tag whitespace-split *data*; wrap the result in a success/failure payload."""
    # Classifier loading may fail (missing model/jar) -> error code 705.
    try:
        engine = NERTagger(
            './nltk-data/StanfordNER/english.all.3class.distsim.crf.ser.gz',
            './nltk-data/StanfordNER/stanford-ner.jar')
    except:
        return ret_failure(705)
    #try:
    tag = engine.tag(data.split())
    #except:
    #    return ret_failure(702)
    return ret_success(tag)
def findWord(self):
    """Return the first question token whose NER tag equals self.queryType, else -1."""
    tagger = NERTagger(
        'stanford-ner-2014-01-04/classifiers/english.muc.7class.distsim.crf.ser.gz',
        'stanford-ner-2014-01-04/stanford-ner.jar')
    tagged = tagger.tag(self.question.split())
    for item in tagged:
        if item[1] == self.queryType:
            #print item[0]
            return item[0]
    # No token carried the requested entity type.
    return -1
class EventDetectiveNer(EventDetective):
    """Event detective that decorates Google-Maps markers with NER entities."""

    def loadClassifier(self):
        # Tweet-trained Stanford model + jar shipped under ner/.
        classifier = "ner/classifiers/" + "tweets.ser.gz"
        jar = "ner/stanford-ner-3.4.jar"
        self.tagger = NERTagger(classifier, jar)

    def tagText(self, candidate):
        """Group all words of a candidate's tweets by their NER tag."""
        result = defaultdict(list)
        text = " ".join([tweet['text'] for tweet in candidate])  # make one long text
        for line in self.tagger.tag(nltk.word_tokenize(text)):
            for word, tag in line:
                result[tag].append(word)
        return result

    def generateMarkers(self):
        """Write vis/map/js/markers.js: one marker per detected event cluster."""
        print("Creating Google Maps markers & add WIKI links...")
        js = open('vis/map/js/markers.js', 'w')
        js.write('var locations = [')
        for tweets, label in self.events:
            writableCluster = ''
            gh = []
            tweet_count = 0
            avgLon = 0
            avgLat = 0
            #tweets = sorted(tweets, key=itemgetter('unixTime'));
            for tweet in tweets:
                tweet_count = tweet_count + 1
                gh.append(tweet['geoHash'])
                avgLon += float(tweet["lon"])
                avgLat += float(tweet["lat"])
                # Backslashes needed for multiline strings in Javascript.
                writableCluster += "{} {} {} {}<br/><br/>".format(
                    tweet['localTime'], tweet['geoHash'], tweet['user'],
                    tweet['text']).replace("'", "\\'")
            # Plain Cartesian average of the coordinates; curvature error is
            # negligible for such a small patch of the earth.
            avgLon /= tweet_count
            avgLat /= tweet_count
            nertags = self.tagText(tweets)
            for key in nertags:
                if key != 'O':
                    writableCluster += "</br> {} {}".format(
                        key, " ,".join(list(set(nertags[key]))).replace("'", "\\'"))
            js.write("['{}', {}, {}, '{}'],".format(writableCluster, avgLat, avgLon, label))
        js.write('];')
        js.close()
def extract_entities_stanford(sample, stanfordPath, model):
    """Return [entity, tag] pairs for every non-'O' NER tag in *sample*.

    :param sample: text to tag (split on whitespace)
    :param stanfordPath: directory holding the Stanford NER jar and models
    :param model: model identifier resolved via get_model_name()
    """
    from nltk.tag.stanford import NERTagger
    st = NERTagger(stanfordPath + get_model_name(model),
                   stanfordPath + '/stanford-ner-2014-01-04.jar')
    entity_names = st.tag(sample.split())
    entities = []
    for entity, tag in entity_names:
        # Fix: `cmp()` was removed in Python 3; direct inequality is equivalent.
        if tag != "O":
            entities.append([entity, tag])
    return entities
class Ner():
    # Thin wrapper around a tweet-trained Stanford NER model.

    def __init__(self):
        classifier = "ner/classifiers/" + "ner-model-tweets.ser.gz"
        jar = "ner/stanford-ner-3.4.jar"
        self.tagger = NERTagger(classifier, jar)

    def tagText(self, candidate):
        """Group words by NER tag into a defaultdict(list).

        NOTE(review): this builds `text` from the candidate's tweets but then
        tags `self.tokens`, which is never assigned anywhere in this class —
        `text` is unused. Presumably it should tag a tokenization of `text`
        (compare EventDetectiveNer.tagText elsewhere in this file); confirm
        with callers before relying on it.
        """
        result = defaultdict(list)
        text = " ".join([tweet['text'] for tweet in candidate]) #make one long text
        for line in self.tagger.tag(self.tokens):
            for word, tag in line:
                result[tag].append(word)
        return result
def get_names(self, sentence):
    """Extract NER tags from *sentence* with the configured backend ('NLTK' or 'Stanford')."""
    if self.tagger == 'NLTK':
        # NLTK pipeline: tokenize -> POS tag -> named-entity chunk.
        words = nltk.tokenize.word_tokenize(sentence)
        tagged = nltk.pos_tag(words)
        ner_tags = nltk.ne_chunk(tagged)
    elif self.tagger == 'Stanford':
        # Stanford NER over whitespace-split tokens.
        stanford = NERTagger(
            '/usr/share/stanford-ner/classifiers/english.all.3class.distsim.crf.ser.gz',
            '/usr/share/stanford-ner/stanford-ner.jar')
        ner_tags = stanford.tag(sentence.split())
    return self.get_names_from_tags(ner_tags)
def whoQuestion(tokens):
    """Rewrite a PERSON-led declarative sentence as a 'Who ...' question.

    If the sentence opens with a run of proper nouns (NNP) that Stanford NER
    tags PERSON, and the word following that run is in EXIST, the name run is
    replaced by 'Who' and (True, question_text) is returned.

    NOTE(review): falls through and implicitly returns None when the pattern
    does not match — callers that unpack a (success, text) tuple should be
    checked.
    """
    st = NERTagger(
        '../stanford-ner/classifiers/english.all.3class.distsim.crf.ser.gz',
        '../stanford-ner/stanford-ner.jar')
    posTags = nltk.pos_tag(tokens)
    ner = st.tag(tokens)
    if posTags[0][1] == 'NNP' and ner[0][1] == 'PERSON':
        # We have a PERSON: advance past the whole name run.
        i = 0
        while (posTags[i][1] == 'NNP' and ner[i][1] == 'PERSON'):
            i = i + 1
        if tokens[i] in EXIST:
            tokens = changeToQuestionMark(tokens)
            tokens = ['Who'] + tokens[i:]
            # Join with spaces except before the final punctuation token.
            return (True, ' '.join(tokens[:-1]) + tokens[-1])
def test_main(request):
    """Django view: smoke-test the Stanford NER tagger and return the tags as text."""
    #Java imports
    from nltk.tag.stanford import NERTagger
    java_path = "C:/Program Files/Java/jre1.8.0_31/bin/java.exe"
    os.environ['JAVAHOME'] = java_path
    stanford_jar = settings.BASE_DIR + '/../nltk_data/stanford-ner-2015-01-30/stanford-ner.jar'
    stanford_trained = settings.BASE_DIR + '/../nltk_data/stanford-ner-2015-01-30/classifiers/english.all.7class.distsim.crf.ser.gz'
    NER_Tagger = NERTagger(stanford_trained, stanford_jar)
    phrases = "once upon a midnight dreary"
    # NOTE(review): tag() receives a raw string, so it iterates characters,
    # not words — probably wants phrases.split(); confirm intent.
    tags = NER_Tagger.tag(phrases)
    #Above imported
    # Fix: print() function (Python 2 print statement is a py3 SyntaxError).
    print("Got " + str(tags))
    return HttpResponse(str(tags))
def get_names(self, sentence):
    """Extract NER tags from *sentence* with the configured backend ('NLTK' or 'Stanford')."""
    if self.tagger == 'NLTK':
        # NLTK pipeline: tokenize -> POS tag -> named-entity chunk.
        tokens = nltk.tokenize.word_tokenize(sentence)
        pos_tags = nltk.pos_tag(tokens)
        ner_tags = nltk.ne_chunk(pos_tags)
    elif self.tagger == 'Stanford':
        # Stanford NER over whitespace-split tokens.
        ner_tags = NERTagger(
            '/usr/share/stanford-ner/classifiers/english.all.3class.distsim.crf.ser.gz',
            '/usr/share/stanford-ner/stanford-ner.jar').tag(sentence.split())
    return self.get_names_from_tags(ner_tags)
def main():
    """Compare Stanford NER classifier variants on ada_lovelace.txt."""
    # Fix: don't shadow the builtin `file`; close the handle via `with`.
    with open("ada_lovelace.txt", 'r') as source:
        raw = source.read()
    # NOTE(review): Python-2 style bytes->unicode decode; on Python 3 pass
    # encoding='utf-8' to open() instead.
    raw = raw.decode('utf-8')
    text = nltk.word_tokenize(raw)
    # Location, Person, Organization
    class3 = NERTagger('stanford-ner/classifiers/english.all.3class.distsim.crf.ser.gz',
                       'stanford-ner/stanford-ner.jar')
    class3_nowiki = NERTagger('stanford-ner/classifiers/english.nowiki.3class.distsim.crf.ser.gz',
                              'stanford-ner/stanford-ner.jar')
    # Location, Person, Organization, Misc
    class4 = NERTagger('stanford-ner/classifiers/english.conll.4class.distsim.crf.ser.gz',
                       'stanford-ner/stanford-ner.jar')
    # Time, Location, Organization, Person, Money, Percent, Date
    class7 = NERTagger('stanford-ner/classifiers/english.muc.7class.distsim.crf.ser.gz',
                       'stanford-ner/stanford-ner.jar')
    #print(class3.tag(text))
    nnp_words = []
    nn_words = []
    not_tagged = []
    pos_tags = nltk.pos_tag(text)
    # Proper nouns go to Stanford NER, common nouns to the WordNet tagger.
    for tag in pos_tags:
        if tag[1] == 'NNP':
            nnp_words.append(tag[0])
        elif tag[1] == 'NN':
            nn_words.append(tag[0])
    print("NERTagged words:")
    ner_tagged = class3.tag(nnp_words)
    tagged = []
    for t in ner_tagged[0]:
        if t[1] == u'O':
            not_tagged.append(t)
        else:
            tagged.append(t)
    print(tagged)
    print("WordNet Tagged Words:")
    print(WNtagger(nn_words))
    print("Not Tagged Words:")
    print(not_tagged)
def compute_NER(corpus):
    """Write one line of space-joined Stanford NER tags per sentence to the coarse feature file."""
    #NER=[]
    st = NERTagger(read_property('StanfordNerClassifier'), read_property('StanfordNerJarPath'))
    # Fix: `with` guarantees the feature file is closed/flushed even when
    # tagging raises (was open()/close()).
    with open(read_property('NER_features_train_coarse_path'), "w") as fi:
        for sentence in corpus:
            ner = st.tag(sentence.split())
            #print ner
            ner_tag = ""
            for n in ner:
                # Append each tag with a trailing space (format kept as-is).
                ner_tag = ner_tag + n[1] + " "
            fi.write(ner_tag + "\n")
            #NER.append(ner_tag)
    #print "The bag of words of NER is ",NER
def NERTag(self, question):
    """
    input: query (keywords of query) as string
    output: NER tagged list of the snippets and title
    """
    snippets = self.getSnippets(question)
    taggedList = []
    start_time = time.time()
    # Fix: the tagger is loop-invariant — construct it once instead of
    # re-loading the classifier and jar for every snippet.
    st = NERTagger(
        'stanford-ner-2014-01-04/classifiers/english.muc.7class.distsim.crf.ser.gz',
        'stanford-ner-2014-01-04/stanford-ner.jar')
    for item in snippets:
        temp = item.encode('ascii', 'ignore')
        tagged = st.tag(temp.split())
        taggedList.append(tagged)
    # print "NER tagged list: ", taggedList
    # print
    # print "Tagging: ", time.time() - start_time
    # print
    return taggedList
def compute_NER(corpus):
    """Write one line of space-joined Stanford NER tags per sentence to the coarse feature file."""
    #NER=[]
    st = NERTagger(read_property('StanfordNerClassifier'),
                   read_property('StanfordNerJarPath'))
    # Fix: `with` guarantees the feature file is closed/flushed even when
    # tagging raises (was open()/close()).
    with open(read_property('NER_features_train_coarse_path'), "w") as fi:
        for sentence in corpus:
            ner = st.tag(sentence.split())
            #print ner
            ner_tag = ""
            for n in ner:
                # Append each tag with a trailing space (format kept as-is).
                ner_tag = ner_tag + n[1] + " "
            fi.write(ner_tag + "\n")
            #NER.append(ner_tag)
    #print "The bag of words of NER is ",NER
def extract_persons_stanford(sample, stanfordPath, model):
    """Count PERSON-tagged tokens in *sample*; return (entity, count) pairs, most frequent first."""
    from nltk.tag.stanford import NERTagger
    import operator
    st = NERTagger(stanfordPath + get_model_name(model),
                   stanfordPath + '/stanford-ner-2014-01-04.jar')
    entity_names = st.tag(sample.split())
    entity_count = {}
    for entity, tag in entity_names:
        # Fix: py2-only `cmp()` replaced with a plain equality test.
        if tag == "PERSON":
            entity_count[entity] = entity_count.get(entity, 0) + 1
    # Fix: py2-only iteritems() replaced with items() (works on 2 and 3).
    sorted_occurrences = sorted(entity_count.items(), reverse=True,
                                key=operator.itemgetter(1))
    return sorted_occurrences
def entityTagger():
    """NER-tag NN/NNP tokens from en.tok.off.test.pos, with WordNet fallback, writing aligned rows."""
    class3 = NERTagger('stanford-ner/classifiers/english.all.3class.distsim.crf.ser.gz',
                       'stanford-ner/stanford-ner.jar')
    row_fmt = "{:4}{:4}{:6}{:20}{:6}{:10}"
    # Fix: manage the output file with `with`, so it is closed/flushed even
    # if tagging raises mid-file (was open()/close()).
    with open("en.tok.off.test.pos.tagged", "w") as output, \
            open("en.tok.off.test.pos", "r") as inp_file:
        for l in inp_file:
            line = l.split()
            if line[4] == "NN" or line[4] == "NNP":
                ner_tagged = class3.tag([line[3]])
                print("Nertagged:", ner_tagged)
                for t in ner_tagged[0]:
                    # NER yielded no usable tag (e.g. 'O') -> WordNet fallback.
                    if len(t[1]) < 3:
                        tag = wordNetTagger(t[0])
                        print("Wordnet tag:", tag)
                    else:
                        tag = t[1]
                    output.write(row_fmt.format(line[0], line[1], line[2],
                                                line[3], line[4], tag) + "\n")
def main(word_transformation = None, result_path = None, n = 50):
    """Evaluate the Stanford CoNLL 4-class NER model on the first *n* test sentences.

    :param word_transformation: optional callable applied to every word first
    :param result_path: pickle destination for (gold_tags, predicted_tags, sentences)
    :param n: number of test sentences to evaluate
    """
    tagged_corpus = CoNLLNERReader(TEST_DATA_PATH).read()[:n]
    tagger = NERTagger('/cs/fs/home/hxiao/code/stanford-ner-2015-01-30/classifiers/english.conll.4class.distsim.crf.ser.gz',
                       '/cs/fs/home/hxiao/code/stanford-ner-2015-01-30/stanford-ner.jar')
    # Fix throughout: print() function (Python 2 print statements are py3 SyntaxErrors).
    print("extracting sentence words")
    if word_transformation and callable(word_transformation):
        tagged_corpus = [[(word_transformation(w), t) for w, t in sent]
                         for sent in tagged_corpus]
    print("extracting sents/tags")
    sents = ([w for w, t in sent] for sent in tagged_corpus)
    correct_tags = [transform_labels([t for w, t in sent]) for sent in tagged_corpus]
    print("predicting")
    predicted_tags = []
    really_correct_tags = []
    sentences = []  # some sentence might be dropped
    for i, (ctags, sent) in enumerate(zip(correct_tags, sents)):
        if (i + 1) % 5 == 0:
            print("%d finished" % (i + 1))
        try:
            ptags = [t for w, t in tagger.tag(sent)]
            # Keep only sentences where prediction and gold align 1:1.
            if len(ctags) == len(ptags):
                predicted_tags.append(ptags)
                really_correct_tags.append(ctags)
                sentences.append(sent)
            else:
                print("tags length does not match for %r" % (sent,))
        except UnicodeDecodeError:
            print("UnicodeDecodeError for ", sent)
    assert len(really_correct_tags) == len(predicted_tags), "length inconsistent"
    # NOTE(review): `i` is undefined when the corpus is empty — confirm n >= 1.
    print("%d finished" % (i + 1))
    # Fix: close the pickle file deterministically (was dump(..., open(...))).
    with open(result_path, "w") as result_file:
        dump((really_correct_tags, predicted_tags, sentences), result_file)
def handleProperNoun(tokens, pos, position): st = NERTagger( '../stanford-ner/classifiers/english.all.3class.distsim.crf.ser.gz', '../stanford-ner/stanford-ner.jar') # get tokens & pos before verb bTokens = tokens[:position] bPos = pos[:position] ner = st.tag(bTokens) # reverse everything now ner = ner[::-1] bPos = bPos[::-1] person = False i = -1 if isProperNoun(bPos[0][1]) and isPerson(ner[0][1]): i = 0 person = True while (i < len(bPos) and isProperNoun(bPos[i][1]) and isPerson(ner[i][1])): i = i + 1 elif isProperNoun(bPos[0][1]): i = 0 while (i < len(bPos) and isProperNoun(bPos[i][1])): i = i + 1 # Reverse back and remove extra ner = ner[::-1] if (i > -1): for r in range(1, i): tokens.pop(len(bTokens) - i) pos.pop(len(bTokens) - i) position = position - 1 if person: tokens[position - 1] = 'who' else: tokens[position - 1] = 'what' return (tokens, pos, position)
def NERTag(self, question):
    """
    input: query (keywords of query) as string
    output: NER tagged list of the snippets and title
    """
    snippets = self.getSnippets(question)
    taggedList = []
    start_time = time.time()
    # Fix: the tagger is loop-invariant — construct it once instead of
    # re-loading the classifier and jar for every snippet.
    st = NERTagger(
        'stanford-ner-2014-01-04/classifiers/english.muc.7class.distsim.crf.ser.gz',
        'stanford-ner-2014-01-04/stanford-ner.jar')
    for item in snippets:
        temp = item.encode('ascii', 'ignore')
        tagged = st.tag(temp.split())
        taggedList.append(tagged)
    # print "NER tagged list: ", taggedList
    # print
    # print "Tagging: ", time.time() - start_time
    # print
    return taggedList
def tagdata(refDict):
    """ Gives the data its NER Tags using our trained tagger.

    Reads tokens from testdata.tsv, tags them with the custom model, and
    updates refDict entries whose stored word matches a tagged token.
    Returns (taggedText, refDict).
    """
    #pbar = ProgressBar()
    tokens = []
    # Fix: `with` closes testdata.tsv deterministically (handle was leaked).
    with codecs.open('testdata.tsv', 'r') as testData:
        for line in testData:
            if len(line) > 1:
                token = line.strip().split('\t')
                tokens.append(token[0])
    #os.environ['JAVAHOME'] = "C:\Program Files\Java\jdk1.8.0_45/bin"
    # Fix: removed unused local `path` ("ner") — the model/jar names below
    # are used as-is.
    classifier = "ner-pta.ser.gz"
    jar = "stanford-ner.jar"
    tagger = NERTagger(classifier, jar)
    taggedText = tagger.tag(tokens)
    # Propagate tags back onto matching reference-dict entries.
    for line in taggedText:
        for tup in line:
            for key, value in refDict.items():
                if tup[0] == value[0]:
                    refDict[key] = [tup[0], tup[1]]
    return taggedText, refDict
def generate(word):
    """Generate a question from *word* (a sentence string); None when no rule applies."""
    sentence = word
    st = NERTagger(
        '../stanford-ner/classifiers/english.all.3class.distsim.crf.ser.gz',
        '../stanford-ner/stanford-ner.jar')
    tokens = nltk.word_tokenize(sentence)
    pos = nltk.pos_tag(tokens)
    ner = st.tag(tokens)
    # TODO: Add in the question mark at the end of the sentence
    # Try each question builder in order; first success wins.
    for builder in (simpleYesNo, simpleWhoOrWhat):
        success, question = builder(tokens, pos)
        if success:
            return question
    return None
def entityTaggertest(l):
    """
    function that entity tags a list of nouns
    :param l: list of nouns
    :return: list of tagged nouns, tuples (word, tag)
    """
    tagged = []
    class3 = NERTagger('stanford-ner/classifiers/english.all.3class.distsim.crf.ser.gz',
                       'stanford-ner/stanford-ner.jar')
    ner_tagged = class3.tag(l)
    # Fix: the inner loop no longer shadows the parameter `l`.
    for sentence in ner_tagged:
        for t in sentence:
            # If the word is tagged via NERTagger
            if len(t[1]) > 3:
                tagged.append(t)
            # If the word is not tagged, try to tag it via Word Net
            # (the two branches are mutually exclusive, hence elif).
            elif len(t[1]) < 3:
                tag = wordNetTagger(t[0])
                # If even Word Net can't tag it, return without tag.
                if tag != "-":
                    tagged.append((t[0], tag))
    return tagged
def print_symptoms_from_page(url='', model='', stanford_jar=''):
    """Scrape *url*, NER-tag its sentences, and print the unique SYMP entity spans found."""
    html_reader = HTMLReader(url)
    cleaned_text = html_reader.get_text_from_page()
    symptoms = set()
    st = NERTagger(model, stanford_jar, encoding='utf-8')
    sentences = nltk.sent_tokenize(cleaned_text)
    for sentence in sentences:
        tags = st.tag(nltk.word_tokenize(sentence))
        tag_index = 0
        while tag_index < len(tags):
            if tags[tag_index][1] == 'SYMP':
                # Collect the full multi-token symptom span (until the next 'O').
                symptom = []
                while tag_index < len(tags) and tags[tag_index][1] != 'O':
                    symptom.append(tags[tag_index][0])
                    tag_index += 1
                symptoms.add(' '.join(symptom))
            else:
                tag_index += 1
    # Fix: print() function (Python 2 print statements are py3 SyntaxErrors).
    print("Found %d symptoms:" % len(symptoms))
    for symptom in symptoms:
        print(symptom)
def select_names(cnx, business_id):
    """Return PERSON names mentioned in the reviews of *business_id*, de-duplicating adjacent name pairs."""
    # NOTE(review): string-built SQL is injection-prone — prefer the driver's
    # parameterized form, e.g. cur.execute(sql, (business_id,)).
    query = "SELECT * FROM hairVegas2 WHERE business_id = '%s' " % business_id
    cur.execute(query)
    raw = cur.fetchall()
    review = [row[4] for row in raw]
    #print type(review)
    unames = {}
    # Fix: NERTagger is loop-invariant — load classifier/jar once, not per review.
    st = NERTagger('/usr/share/stanford-ner/classifiers/english.all.3class.distsim.crf.ser.gz',
                   '/usr/share/stanford-ner/stanford-ner.jar')
    for i in range(0, len(review)):
        r = review[i]
        tokens = word_tokenize(r)
        tagged_token = pos_tag(tokens)
        nouns_only = [word for (word, tag) in tagged_token if tag.startswith('NNP')]
        nopunct_nouns = [word.replace(".", "") for word in nouns_only]
        persons = st.tag(nopunct_nouns)
        names = [n for n, p in persons if p == 'PERSON']
        unique = list(set(names))
        unames[i] = unique
    # Merge first-name/last-name pairs: drop the later token of names that
    # appear adjacent in the review text.
    for r in range(0, len(review)):
        aset = unames[r]
        for i in range(0, len(aset)):
            for j in range(i + 1, len(aset)):
                tokens = word_tokenize(review[r])
                nopunct_tokens = [word.replace(".", "") for word in tokens]
                a = nopunct_tokens.index(aset[i])
                b = nopunct_tokens.index(aset[j])
                #print r, a, b
                # NOTE(review): popping while ranging over the original length
                # can raise IndexError when multiple adjacent pairs match — confirm.
                if abs(b - a) == 1:
                    if b > a:
                        unames[r].pop(j)
                    else:
                        unames[r].pop(i)
    #print unames
    person = [name for sublist in unames.values() for name in sublist]
    #print person
    return person
class StanfordNerTagger():
    """
    Wrapper class for the nltk.tag.stanford.NERTagger module. Provides
    streamlined instantiation and helper methods to simplify the process
    of using the tagger.
    """

    def __init__(self):
        # Example here: www.nltk.org/api/nltk.tag.html#module-nltk.tag.stanford
        self.Tagger = NERTagger(
            app.config['SNER_CLASSIFIERS'],
            app.config['SNER_JARFILE'],
            encoding='utf-8')

    def Tag(self, text):
        """
        Identify every entity mentioned in *text* and associate it with an
        entity type.

        Example:
            Input: "I am Jack and I live in Phoenix, Arizona."
            Tag Result: "[(I)]... TODO"

        :param str text: text to tokenize and tag
        :returns: list of tuples -- see above example
        """
        return self.Tagger.tag(text)

    def __repr__(self):
        return "<StanfordNerTagger(Tagger=%s)>" % (self.Tagger)
class StanfordNerTagger():
    """
    Wrapper class for the nltk.tag.stanford.NERTagger module. Provides
    streamlined instantiation and helper methods to simplify the process
    of using the tagger.
    """

    def __init__(self):
        # Example here: www.nltk.org/api/nltk.tag.html#module-nltk.tag.stanford
        classifiers = app.config['SNER_CLASSIFIERS']
        jarfile = app.config['SNER_JARFILE']
        self.Tagger = NERTagger(classifiers, jarfile, encoding='utf-8')

    def Tag(self, text):
        """
        Identify every entity mentioned in *text* and associate it with an
        entity type.

        Example:
            Input: "I am Jack and I live in Phoenix, Arizona."
            Tag Result: "[(I)]... TODO"

        :param str text: text to tokenize and tag
        :returns: list of tuples -- see above example
        """
        entities = self.Tagger.tag(text)
        return entities

    def __repr__(self):
        return "<StanfordNerTagger(Tagger=%s)>" % (self.Tagger)
def bgWordNetTagger(ner_word, wn_word):
    """Classify a LOCATION bigram as CITY/STATE/COUNTRY/TOWN via WordNet path similarity.

    *ner_word* is the space-joined bigram fed to Stanford NER; *wn_word* is
    the underscore-joined form looked up in WordNet. Returns the best-scoring
    label, or "-" when NER does not see a location / WordNet has no synset.
    """
    class3 = NERTagger('stanford-ner/classifiers/english.all.3class.distsim.crf.ser.gz',
                       'stanford-ner/stanford-ner.jar')
    tag_bigram = class3.tag([ner_word])
    # Guard clauses replace the original nested ifs; same outcomes.
    if tag_bigram[0][0][1] != "LOCATION":
        return "-"
    synsets = wordnet.synsets(wn_word, pos="n")
    if len(synsets) == 0:
        return "-"
    word = synsets[0]
    references = [
        ("CITY", wordnet.synsets("City", pos="n")[0]),
        ("STATE", wordnet.synsets("State", pos="n")[0]),
        ("COUNTRY", wordnet.synsets("Country", pos="n")[1]),
        ("TOWN", wordnet.synsets("Town", pos='n')[0]),
    ]
    scores = [(label, word.path_similarity(ref)) for label, ref in references]
    # Stable sort keeps the original tie-breaking order (CITY, STATE, COUNTRY, TOWN).
    scores.sort(key=lambda tup: tup[1], reverse=True)
    return scores[0][0]
# Lemmatization demo: print the lemma of the first 20 tokens.
from nltk.stem import WordNetLemmatizer
wordnet_lemmatizer = WordNetLemmatizer()
print '\n'
print 'Lemmatization'
for w in word_data[:20]:
    print 'Actual: %s Lemm %s' % (w, wordnet_lemmatizer.lemmatize(w))

# Stanford Named Entity Recognizer
# http://nlp.stanford.edu
from nltk.tag.stanford import NERTagger
print '\nPerforming NER tagging: '
# 3-class model: PERSON / ORGANIZATION / LOCATION.
st = NERTagger(
    './stanford-ner-2014-06-16/classifiers/english.all.3class.distsim.crf.ser.gz',
    './stanford-ner-2014-06-16/stanford-ner.jar')
# Whitespace tokenization only -- the tagger expects a token list.
print st.tag(
    '''Barrack Obama is the president of the United States of America . His father is from Kenya and Mother from United States of America. He has two daughters with his wife. He has strong opposition in Congress due to Republicans'''
    .split())

#Please provide your keys here
TWITTER_APP_KEY = 'XXXXXXXXXXXXXX'
TWITTER_APP_KEY_SECRET = 'XXXXXXXXXXXXXX'
TWITTER_ACCESS_TOKEN = 'XXXXXXXXXXXXXXXXXXXXXX'
TWITTER_ACCESS_TOKEN_SECRET = 'XXXXXXXXXXXXXXXXXXXXX'
t = Twython(app_key=TWITTER_APP_KEY,
            app_secret=TWITTER_APP_KEY_SECRET,
            oauth_token=TWITTER_ACCESS_TOKEN,
            oauth_token_secret=TWITTER_ACCESS_TOKEN_SECRET)

# get access to tweets
# NOTE(review): the body of this `with` block is truncated in this chunk.
with open('../data/politician_tweets.json') as fp:
#!/usr/bin/env python # -*- coding: utf-8 -* import numpy import nltk from nltk.tag.stanford import NERTagger ## Configure this to be your Java directory #nltk.internals.config_java(u"C:/Program Files/Java/jre7/bin/java.exe") chunk = u"妈妈带我去公园散步" #chunk = u"妈我" #tagger = POSTagger() #token_tags = tagger.tag(chunk) #for token,tag in token_tags: # print token,tag text = nltk.word_tokenize(chunk.encode('utf-8')) st = NERTagger('chinese-distsim.tagger', 'stanford-postagger-3.1.4.jar') poop = st.tag(text) print poop #tagger = pickle.load(open('sinica_treebank_brill_aubt.pickle')) #poop = tagger.tag(text) #print poop #poop2 = nltk.pos_tag(text) #print poop2
# Earlier ad-hoc test sentences, kept for reference:
# text3 = "Kim thought that with her experience, she could convince Sandy to trust Chris."
# text3 = "He accepted the position of Chairman of Carlisle Group, a major banking company"
# text3 = "Alexander the Great conquered the Empire of Persia"

# `with` ensures the handle is closed (the original never closed it).
with open("classify_names.txt", "r") as f:
    lines = f.readlines()

for line in lines:
    text3 = line
    nameslist = []
    tags = st.tag(text3.split())
    # Group consecutive PERSON tokens into full names with an explicit
    # index walk. The original removed items from `tags` while iterating
    # it with `for tag in tags`, which skips tokens, and used
    # `tags.index(tag)`, which misfires on duplicate tokens.
    i = 0
    while i < len(tags):
        if tags[i][1] == "PERSON":
            name = []
            while i < len(tags) and tags[i][1] == "PERSON":
                name.append(tags[i][0])
                i += 1
            nameslist.append(name)
        else:
            i += 1
    # Join each token group into a single "First Last" string.
    # (Replaces the index()-based in-place rewrite, which broke when two
    # names shared the same token list.)
    nameslist = [' '.join(name) for name in nameslist]
importer = zipimport.zipimporter('nltk.mod') nltk = importer.load_module('nltk') nltk.internals.config_java(pathtojava) nltk.data.path += ["./nltkData/"] from nltk.tag.stanford import NERTagger #nltk.internals.config_java(pathtojava); #stanfordTagge- = NERTagger('CollSmall-ner-model.ser.gz', 'stanford-ner.jar', 'utf-8') stanfordTagger = NERTagger('english.all.3class.distsim.crf.ser.gz', 'stanford-ner.jar', 'utf-8') #input = open('stanfordNER.pickle', 'rb'); #stanfordTagger = load(input) #input.close() # input is file with fullpath filenames for line in sys.stdin: #assume line is the full path for a file fname = line.rstrip('\n').split('\t')[0] text = '' try: with open('./eventData/' + fname, 'r') as f: text = f.read() except: continue if len(text) > 0 and text != None: #print text.split(); for t in stanfordTagger.tag(text.split()): if len(t[0]) > 2 and t[1] != 'O': print '%s_%s\t%d' % (t[1], t[0].lower(), 1)
# NOTE(review): this chunk opens mid-expression -- `st = NERTagger(` and the
# classifier-path argument are in the previous chunk of the file.
    '/Users/HENGJIE/Desktop/trydjango18/stanford-ner/stanford-ner.jar')

#path = '/Users/HENGJIE/Desktop/text repo/Bloomberg/donald trump/donald trump -1w-2017-03-03.txt'
path = '/Users/HENGJIE/Desktop/text repo/test data/samsung.txt'
total_list = []  # the list to store entities with document frequency

with open(path, 'r') as f:
    lines = json.load(f)
# Keep only articles whose title mentions Samsung.
filtered_lines = [
    line for line in lines if line['title'].find('Samsung') >= 0
]
print len(filtered_lines)

for line in filtered_lines:
    content = line['content']
    content = content.encode('utf-8', 'ignore')
    # Tag the whole article; `sentences` is the tagger's token/label output.
    sentences = st.tag(content.split())
    article_list = [
    ]  # the list to store non-repeating entities within one article
    print len(sentences)
    for sentence in sentences:
        # `merge` is defined elsewhere in the file -- presumably joins
        # adjacent same-label tokens; TODO confirm.
        merge(sentence)
        for token in sentence:
            new = True
            i = 0
            # if token[0].lower() == 'ko' and token[1] == 'LOCATION':
            #     token = ('Korea', 'LOCATION')
            if token[1] != 'O':
                if (token[0] == 'Samsung Group'):
                    new = False
                    # use document frequency instead of term frequency
                else:
                    # NOTE(review): chunk truncated here -- the else-branch
                    # body continues in the next chunk of the file.
from nltk.tag.stanford import NERTagger import os java_path = "C:/Program Files/Java/jdk1.8.0_45/bin/java.exe" os.environ['JAVAHOME'] = java_path st = NERTagger('./english.all.7class.distsim.crf.ser.gz', './stanford-corenlp-3.5.2.jar') file = open("text/289007975") while 1: lines = file.readlines(100000) if not lines: break for line in lines: print st.tag(unicode(line, errors='ignore').split())
# Tag the command-line arguments and scan for ORGANIZATION entity runs.
from nltk.tag.stanford import NERTagger
import sys, json

st = NERTagger('english.muc.7class.distsim.crf.ser.gz', 'stanford-ner/stanford-ner.jar')
ret_list = st.tag(sys.argv[1:])
#temp_list = sys.argv[1:]
#print ret_list
ret_dict = {}
# i: how many ORGANIZATION runs found so far; cname/fname/aname collect
# the first, second (and presumably third) multi-word organization names.
i = 0
cname = ''
fname = ''
aname = ''
j = 0
l = len(ret_list)
while (j < l):
    #print j
    # First ORGANIZATION run -> cname (concatenate consecutive tokens).
    if ret_list[j][1] == 'ORGANIZATION' and i == 0 and not cname:
        cname = ret_list[j][0]
        j = j + 1
        while j < l and ret_list[j][1] == 'ORGANIZATION':
            #print "loop1"
            cname = cname + " " + ret_list[j][0]
            j = j + 1
        i = 1
    # Second ORGANIZATION run -> fname.
    elif ret_list[j][1] == 'ORGANIZATION' and i == 1 and not fname:
        fname = ret_list[j][0]
        j = j + 1
        while j < l and ret_list[j][1] == 'ORGANIZATION':
            #print "loop2"
            fname = fname + " " + ret_list[j][0]
            j = j + 1
            # NOTE(review): chunk truncated here -- the remaining elif
            # branches of this while loop continue in the next chunk.
def stanford(tweet):
    """Tag a tweet with the Stanford NER tagger.

    :param tweet: raw tweet text; split on whitespace before tagging
    :returns: the tagger's (token, entity label) output
    """
    tagger = NERTagger(STANFORD_NER, STANFORD_NER_JAR)
    tokens = tweet.split()
    return tagger.tag(tokens)
# Extract visible text from the parsed page and normalize whitespace.
m = soup.get_text()
m = m.encode('utf-8')
m = m.split("\n")
# Drop blank lines (decode back to unicode before stripping).
m[:] = [x.decode('utf-8').strip() for x in m if x.decode('utf-8').strip() != '']
# Replace non-breaking spaces with plain spaces.
m[:] = [x.replace(' ', u' ') for x in m]
m[:] = [x.replace(u'\xa0', u' ') for x in m]
m = " ".join(m)
#m = ''.join([i if ord(i) < 128 else ' ' for i in m])
m = m.decode('utf-8')
m=[m]
namefound = []
thisurl = (url, [])
########################### name finding #########################
st = NERTagger('C:/Users/Harshit Agarwal/Downloads/stanford-ner-2014-06-16/stanford-ner-2014-06-16/classifiers/english.all.3class.distsim.crf.ser.gz','C:/Users/Harshit Agarwal/Downloads/stanford-ner-2014-06-16/stanford-ner-2014-06-16/stanford-ner.jar')
string = m[0].split()
listname = st.tag(string)
# Collect 3-token, then 2-token, runs of consecutive PERSON labels.
for iii in xrange(len(listname)-3):
    if (listname[iii][1]=='PERSON' and listname[iii+1][1]=='PERSON' and listname[iii+2][1]=='PERSON'):
        name = listname[iii][0]+" " + listname[iii+1][0]+ " " +listname[iii+2][0]
        namefound.append(name)
        thisurl[1].append((name, [], [], []))
# print name
# print gender_features(name[0])
        # NOTE(review): incrementing the loop variable has no effect under
        # xrange iteration -- overlapping windows re-detect the same name.
        iii+=3
    elif (listname[iii][1]=='PERSON' and listname[iii+1][1]=='PERSON'):
        name = listname[iii][0]+" " + listname[iii+1][0]
        namefound.append(name)
        thisurl[1].append((name, [], [], []))
        iii+=2
##################################################################
print namefound
# NOTE(review): chunk truncated here -- the body of this nested loop
# continues in the next chunk of the file.
for kk in emails[it]:
    for count in range(len(namefound)):
from nltk.tag.stanford import NERTagger st = NERTagger('stanford-ner/english.all.3class.distsim.crf.ser.gz', 'stanford-ner/stanford-ner.jar') print st.tag('You can call me Billiy Bubu and I live in Amsterdam.'.split())
from nltk.tag.stanford import NERTagger

# 7-class MUC model plus the Stanford NER jar, both relative to ../ner.
ner_model = "../ner/english.muc.7class.distsim.crf.ser.gz"
ner_jar = "../ner/stanford-ner.jar"
st = NERTagger(ner_model, ner_jar)

# Tag a whitespace-tokenized sample sentence.
text = 'Rami Eid is studying at Stony Brook University in NY. He lives in United States of America'
st.tag(text.split())
    # NOTE(review): chunk starts inside download_ner() -- its `def` line is
    # above this view. Downloads and unpacks the Stanford NER distribution
    # into the dautil data dir, returning the extracted directory path.
    url = 'http://nlp.stanford.edu/software/stanford-ner-2015-04-20.zip'
    dir = os.path.join(dl.data.get_data_dir(), 'ner')
    if not os.path.exists(dir):
        os.mkdir(dir)
    fname = 'stanford-ner-2015-04-20.zip'
    out = os.path.join(dir, fname)
    # Download and extract only once; skip if the zip already exists.
    if not dl.conf.file_exists(out):
        dl.data.download(url, out)
        with ZipFile(out) as nerzip:
            nerzip.extractall(path=dir)
    return os.path.join(dir, fname.replace('.zip', ''))

dir = download_ner()
st = NERTagger(
    os.path.join(dir, 'classifiers', 'english.all.3class.distsim.crf.ser.gz'),
    os.path.join(dir, 'stanford-ner.jar'))
# Tag the first Brown 'news' file and print only real entities (label != 'O').
fid = brown.fileids(categories='news')[0]
printer = dl.log_api.Printer(nelems=9)
tagged = [
    pair for pair in dl.collect.flatten(st.tag(brown.words(fid)))
    if pair[1] != 'O'
]
printer.print(tagged)