def test_model_in_mem(stanford_ner_path, model_name, sent_obj, type):
    """Tag one sentence with a Stanford CRF model and store tokens with spans.

    The sentence is split on whitespace, trailing ``,.;:`` is stripped from
    every token, the tokens are passed through
    ``strip_sec_headers_tokenized_text`` and the result is tagged. Each
    tagged token is stored as ``(token, label, start, end)``, where the span
    is the character range of the whitespace-delimited word (before the
    punctuation strip).

    :param stanford_ner_path: passed as the jar path of StanfordNERTagger
    :param model_name: passed as the model path of StanfordNERTagger
    :param sent_obj: object providing ``sentence`` and the dict
        ``tok_sent_with_crf_predicted_attribs``
    :param type: key under which the tagged tokens are stored
    :return: ``sent_obj``, updated in place
    """
    stanford_tagger = StanfordNERTagger(
        model_name, stanford_ner_path, encoding='utf-8')

    tokenized_text = []
    spans = []
    # Recover spans here. Raw string: "\S" in a plain literal is an invalid
    # escape sequence.
    for match in re.finditer(r"\S+", sent_obj.sentence):
        tokenized_text.append(match.group(0).rstrip(",.;:"))
        spans.append((match.start(), match.end()))

    tokenized_text = strip_sec_headers_tokenized_text(tokenized_text)
    classified_text = stanford_tagger.tag(tokenized_text)

    # Headers may have been stripped in the previous step, so the tagged
    # tokens are aligned with the *last* len(classified_text) spans.
    len_diff = len(spans) - len(classified_text)
    final_class_and_span = [
        (token, label, spans[idx + len_diff][0], spans[idx + len_diff][1])
        for idx, (token, label) in enumerate(classified_text)
    ]

    sent_obj.tok_sent_with_crf_predicted_attribs[type] = final_class_and_span
    return sent_obj
def get_location(loc):
    """Return the location mentioned in *loc*, or ``None`` if there is none.

    *loc* is handed unchanged to ``StanfordNERTagger.tag``; all tokens tagged
    ``LOCATION`` are glued back together with single spaces (e.g.
    ``'New York'``).

    Currently works only with the relative model/jar paths below.
    English model: english.muc.7class.distsim.crf.ser.gz
    German models: german.dewac_175m_600.crf.ser.gz,
    german.hgc_175m_600.crf.ser.gz (these use the tag 'I-LOC').
    """
    # Named Entity Recognizer: assigns types like location, person,
    # organization to each token.
    st = StanfordNERTagger(
        'stanford-ner-2015-12-09/classifiers/english.muc.7class.distsim.crf.ser.gz',
        'stanford-ner-2015-12-09/stanford-ner-3.6.0.jar')
    loc_ner = st.tag(loc)

    # Compare the tag only; testing the whole (token, tag) tuple would also
    # match a token that is literally spelled "LOCATION".
    location_tokens = [token for token, tag in loc_ner if tag == 'LOCATION']
    if not location_tokens:
        # no location is specified
        return None
    return ' '.join(location_tokens)
def extract_named_entities(threadName, output_collection, fetchedTweets):
    """Tag tweets with named entities and write them out in batches of 100.

    Each tweet's ``'cleaned_text'`` is split on whitespace and tagged; the
    ``(token, label)`` pairs labelled PERSON, ORGANIZATION or LOCATION are
    stored under ``'named_entities'``. Tweets are handed to ``write_mongo``
    every 100 tweets, plus once at the end for the remainder.

    :param threadName: label used in log messages and passed to write_mongo
    :param output_collection: passed through to write_mongo
    :param fetchedTweets: iterable of dict-like tweets (mutated in place)
    :raises SystemExit: with status 1 if any step fails
    """
    st = StanfordNERTagger('english.all.3class.distsim.crf.ser.gz')
    wanted_labels = ('PERSON', 'ORGANIZATION', 'LOCATION')
    try:
        mongo_list = []
        for counter, fetchedTweet in enumerate(fetchedTweets, 1):
            tagged = st.tag(fetchedTweet['cleaned_text'].split())
            fetchedTweet['named_entities'] = [
                (ne[0], ne[1]) for ne in tagged if ne[1] in wanted_labels
            ]
            mongo_list.append(fetchedTweet)
            if counter % 100 == 0:
                logging.info("{}: Tweets processed: {} tweets".format(threadName, counter))
                write_mongo(threadName, output_collection, mongo_list)
                mongo_list = []
        if mongo_list:
            write_mongo(threadName, output_collection, mongo_list)
    except Exception as e:  # "except Exception, e" is a SyntaxError on Python 3
        print(e)
        # Non-zero status so a caller can tell the run failed.
        sys.exit(1)
def pretag(self):
    """Collect candidate tags from ``self.text`` and store them on ``self``.

    The items of ``self.text`` are stringified, joined with spaces, split on
    whitespace and NER-tagged. Three candidate lists are concatenated:

    1. tokens labelled ORGANIZATION or PERSON, or for which
       ``count_upper(token) >= 2`` and the token is shorter than 11
       characters;
    2. tokens whose lowercase form is in ``dict_1m`` or that are in
       ``dict_apps``;
    3. tokens whose lowercase form is in ``dict_tech``.

    Sets ``self.badlist``, ``self.symlist``, ``self.badlist_stem`` and
    ``self.pretag``.

    NOTE(review): assigning ``self.pretag`` shadows this method on the
    instance afterwards - confirm that is intended.
    """
    st = StanfordNERTagger(
        "/Users/victorstorchan/Downloads/stanford-ner-2014-06-16/classifiers/english.all.3class.distsim.crf.ser.gz",
        "/Users/victorstorchan/Downloads/stanford-ner-2014-06-16/stanford-ner.jar")
    paragraphs_string = ' '.join(str(x) for x in self.text)
    tagging = st.tag(paragraphs_string.split())

    symlist = ['company', 'corporation', 'multinational', 'Corporation',
               'open-source', 'social', 'network', 'software', 'system']
    badlist = ['integrated', 'first', 'check', 'computer', 'linear',
               'solution', 'services', 'limited', 'tech', 'solutions',
               'technology', 'open', 'model', 'on', 'applied', 'network',
               'pricing', 'customers', 'social', 'big', 'subscribe', 'social',
               'sign', 'monitor', 'software', 'machine', 'learning',
               'compute', 'management', 'up']
    self.badlist = badlist
    self.symlist = symlist
    self.badlist_stem = [stemmer.stem(word) for word in badlist]

    # Built once instead of once per token.
    entity_labels = {"ORGANIZATION", "PERSON"}
    pretag1 = [tag for (tag, label) in tagging
               if label in entity_labels
               or (count_upper(tag) >= 2 and len(tag) < 11)]
    pretag2 = [tag for (tag, label) in tagging
               if tag.lower() in dict_1m or tag in dict_apps]
    pretag3 = [tag for (tag, label) in tagging if tag.lower() in dict_tech]
    self.pretag = pretag1 + pretag2 + pretag3
def ner():
    """Point the Stanford environment variables at the local install and
    print the NER tags of every entry of the global ``content``.

    Each entry of ``content`` is split on whitespace before tagging.
    """
    ner_jar = '/Users/CHOON/Desktop/choon94.github.io/week5/nlp2/stanfordNer/stanford-ner.jar'
    os.environ.update({
        'STANFORD_NER_PATH': '/Users/CHOON/Desktop/choon94.github.io/week5/nlp2/stanfordNer',
        'STANFORD_POSTAGGER_PATH': '/Users/CHOON/Desktop/choon94.github.io/week5/nlp2/stanford-postagger-full-2014-08-27',
        'CLASSPATH': ner_jar,
        # Same value as CLASSPATH.
        'STANFORD_POSTAGGER': ner_jar,
    })

    model_path = '/Users/CHOON/Desktop/choon94.github.io/week5/nlp2/stanfordNer/classifiers/english.all.3class.distsim.crf.ser.gz'
    eng_tagger = StanfordNERTagger(model_path)
    for entry in content:
        tokens = entry.split()
        print(eng_tagger.tag(tokens))
def getEntityCount(tweet):
    """Return how many tokens of *tweet* carry a tag containing "PERSON".

    The tweet is tokenized with ``word_tokenize`` and tagged with the
    Stanford 3-class model.
    """
    tagger = StanfordNERTagger('english.all.3class.distsim.crf.ser.gz')
    tagged_tokens = tagger.tag(word_tokenize(tweet))
    return sum(1 for pair in tagged_tokens if "PERSON" in pair[1])
def NERTagging(text):
    """Tag *text* with the Stanford 3-class model, then log and print it.

    The tagged token list is appended to ``Dump/log/Main_output.txt`` and
    printed to stdout. Nothing is returned.
    """
    # The context manager closes the log even if tokenizing or tagging raises.
    with open("Dump/log/Main_output.txt", "a") as log_file:
        st = StanfordNERTagger(
            'resources/ner/classifiers/english.all.3class.distsim.crf.ser.gz',
            'resources/ner/stanford-ner.jar',
            encoding='utf-8')
        tokenized_text = word_tokenize(text)
        classified_text = st.tag(tokenized_text)
        log_file.write('NER \n %s \n' % classified_text)
        print(classified_text)
def nltk_ner(remainders):
    """Look for a person name in the first entry of *remainders*.

    The entry is split on whitespace and tagged; every PERSON token is
    title-cased and appended to the name followed by a space.

    :return: ``(found, name, item)`` for the first entry, where ``found`` is
        True when at least one PERSON token was seen; ``None`` when
        *remainders* is empty.
    """
    tagger = StanfordNERTagger('../stanford-ner/english.all.3class.distsim.crf.ser.gz',
                               '../stanford-ner/stanford-ner.jar')
    for item in remainders:
        person_parts = [
            token.title() + ' '
            for token, label in tagger.tag(item.split())
            if label == u'PERSON'
        ]
        name = "".join(person_parts)
        # Both outcomes return, so only the first entry is ever inspected.
        return bool(name), name, item
def get_namedentities(text):
    """Return the named entities of *text* found by StanfordNERTagger.

    The text is lowercased and split on whitespace (a caseless model is
    used). Every token whose tag is not 'O' is passed through
    ``remove_punctuation``; results that end up empty are dropped.
    """
    tagger = StanfordNERTagger('utils/english.conll.4class.caseless.distsim.crf.ser.gz',
                               'utils/stanford-ner.jar')
    tagged = tagger.tag(text.lower().split())
    cleaned = (remove_punctuation(token) for token, label in tagged if label != 'O')
    return [entity for entity in cleaned if entity]
def trial1():
    """Smoke test: tag one fixed sentence with the annotated-cities model
    and print the result.

    :return: None
    """
    model = '/Users/mayankkejriwal/ubuntu-vm-stuff/home/mayankkejriwal/tmp/stanford-ner-2015-12-09/annotated-cities-model.ser.gz'
    jar = '/Users/mayankkejriwal/ubuntu-vm-stuff/home/mayankkejriwal/tmp/stanford-ner-2015-12-09/stanford-ner.jar'
    tagger = StanfordNERTagger(model, jar, encoding='utf-8')

    sample = 'While in France, Mrs. Christine Lagarde discussed short-term stimulus efforts in a recent interview with the Wall Street Journal.'
    print(tagger.tag(word_tokenize(sample)))
def classify_text(text):
    """Classify every word of *text* with the 3-class Stanford NER model.

    Each token is labelled PERSON, LOCATION, ORGANIZATION or O (other).

    :return: the list produced by ``StanfordNERTagger.tag`` for the
        ``word_tokenize`` tokens of *text*
    """
    directory = "C:/Users/liabbott/Documents/Projects/CBP OIT/stanford_ner/"
    path_to_model = os.path.normpath(
        directory + "classifiers/english.all.3class.distsim.crf.ser.gz")
    path_to_tagger = os.path.normpath(directory + "stanford-ner.jar")

    tagger = StanfordNERTagger(path_to_model, path_to_tagger, encoding='utf-8')
    return tagger.tag(word_tokenize(text))
def __init__(self, use_stanford=False, NER_model=None, NER_tagger=None,
             POS_model=None, POS_tagger=None):
    """Set up the POS and NER tagging callables.

    With ``use_stanford`` false, ``self.post`` and ``self.nert`` are NLTK's
    ``pos_tag`` and ``ne_chunk``. Otherwise they are the ``tag`` methods of
    the Stanford POS and NER taggers built from the given paths.

    :param use_stanford: boolean, whether to use Stanford NER and POS tagging
    :param NER_model: NER model path
    :param NER_tagger: NER tagger path
    :param POS_model: POS model path
    :param POS_tagger: POS tagger path
    :raises SystemExit: if ``use_stanford`` is set and any path is ``None``
    """
    self.NER_model = NER_model
    self.NER_tagger = NER_tagger
    self.POS_model = POS_model
    self.POS_tagger = POS_tagger
    self.use_stanford = use_stanford

    if not use_stanford:
        self.post = nltk.pos_tag
        self.nert = nltk.ne_chunk
        return

    required_paths = (NER_model, NER_tagger, POS_model, POS_tagger)
    if any(path is None for path in required_paths):
        sys.exit("tagging initialization: Stanford models and taggers"
                 " have to be provided!")

    self.post = StanfordPOSTagger(self.POS_model, self.POS_tagger).tag
    self.nert = StanfordNERTagger(self.NER_model, self.NER_tagger).tag
def stanford_entities(model, jar, fileids=None, corpus=kddcorpus, section=None):
    """
    Extract entities using the Stanford NER tagger.

    Must pass in the path to the tagging model and jar as downloaded from
    the Stanford Core NLP website.

    Consecutive tokens whose tag is not 'O' form one entity; the entity is
    filed under ``'locations'`` when its first token is tagged LOCATION and
    under ``'other'`` otherwise.

    :param model: path to the tagging model
    :param jar: path to the Stanford NER jar
    :param fileids: file ids to process; defaults to ``corpus.fileids()``
    :param corpus: corpus providing ``fileids()`` and ``words(fileid)``
    :param section: if given, only the text returned by ``sectpull`` for
        that section is tokenized and tagged
    :return: ``{fileid: {key: [entity text, ...]}}`` as nested defaultdicts
    """
    results = defaultdict(lambda: defaultdict(list))
    fileids = fileids or corpus.fileids()
    tagger = StanfordNERTagger(model, jar)

    def flush(fileid, chunk):
        # Join the chunk's tokens and file them under the first token's tag.
        etext = " ".join(token for token, _ in chunk)
        key = 'locations' if chunk[0][1] == 'LOCATION' else 'other'
        results[fileid][key].append(etext)

    for fileid in fileids:
        if section is not None:
            text = nltk.word_tokenize(
                list(sectpull([fileid], section=section))[0][1])
        else:
            text = corpus.words(fileid)

        chunk = []
        for token, tag in tagger.tag(text):
            if tag != 'O':
                # Build chunk from tags
                chunk.append((token, tag))
            elif chunk:
                # Flush the current chunk
                flush(fileid, chunk)
                chunk = []
        if chunk:
            # An entity that ends the document has no trailing 'O' token to
            # trigger the flush above; without this it would be lost.
            flush(fileid, chunk)

    return results
def main():
    """Tag and parse one sample sentence and print both results.

    Everything after the bare ``return`` is unreachable work-in-progress
    code kept for reference.
    """
    parser = StanfordParser(path_to_jar=script_wrapper.stanford_parser_jar,
                            path_to_models_jar=script_wrapper.stanford_model_jar)
    st = StanfordNERTagger(
        model_filename='../lib/stanford-ner-2015-12-09/classifiers/english.all.3class.distsim.crf.ser.gz',
        path_to_jar="../lib/stanford-ner-2015-12-09/stanford-ner-3.6.0.jar")
    raw_sent = "Dempsey was drafted by Major League Soccer club New England Revolution."
    sent = word_tokenize(raw_sent)
    # NOTE(review): NLTK's StanfordNERTagger exposes tag()/tag_sents();
    # cur_tag() is presumably a local patch - confirm.
    # need write interface for tokenized sent
    # (http://nlp.stanford.edu/software/crf-faq.shtml#tokenized)
    ne_tuple = st.cur_tag(sent)
    # print()/next() work on both Python 2 and Python 3.
    print(ne_tuple)
    print(next(parser.raw_parse(raw_sent)))
    return

    # ---- unreachable from here on ----
    # find name entity
    f = 0
    ne_list = []
    for (ne, label) in ne_tuple:
        if label == 'PERSON':
            f = 1
        if f and label != 'PERSON':
            break
        if f:
            ne_list.append(ne)
    # print ne_list
    init_file(main_tree)
    # my issue here: 1. don't know how to get NP.
    #                2. is there a quicker way to find PERSON?
    # try head to ask who/what
    pattern = "S < NP=np"
    head = check_output(['bash',  # add bash !!!!
                         tregex_path, '-s', pattern, init_tree_file])
    print(head)


def get_main_verbs(tree):
    """Return the main verbs matched by tregex as a list of ``Tree`` objects.

    Runs the tregex script on the global ``init_tree_file``; the *tree*
    argument is not used.
    """
    pattern = '/(VB.?)/=main >+ (VP) (S > ROOT)'
    # universal_newlines=True yields text (not bytes) on Python 3, so the
    # split on '\n' below works on both Python versions.
    main_verbs = check_output(['bash',  # add bash !!!!
                               tregex_path, '-s', pattern, init_tree_file],
                              universal_newlines=True)
    print(main_verbs)
    # Drop the empty string after the final newline.
    main_verbs = main_verbs.split('\n')[:-1]
    main_verbs = [Tree.fromstring(main_verb) for main_verb in main_verbs]
    return main_verbs
def __init__(self, language="en"):
    """Build the Stanford NER tagger and the entity finder.

    The jar and classifier paths are derived from
    ``self.__currentDirectory``.

    :param language: forwarded to ``NERFinder``
    """
    from nltk.tag import StanfordNERTagger

    base_dir = self.__currentDirectory
    self.__stanfordJar = "%s/dist/stanford-ner.jar" % base_dir
    self.__classifier = (
        "%s/dist/classifiers/english.all.3class.distsim.crf.ser.gz" % (base_dir,)
    )
    self.__tagger = StanfordNERTagger(
        self.__classifier,
        self.__stanfordJar,
        encoding="utf-8",
    )
    self.__namedEntitiesFinder = NERFinder(language=language)
def html_ner(content):
    """Extract named entities from an HTML document.

    ``script``, ``style`` and ``sup`` elements are removed, every remaining
    stripped text string is tokenized with ``wordpunct_tokenize`` and tagged,
    and runs of consecutive tokens sharing a non-'O' tag are merged.

    :return: list of ``(tag, phrase)`` pairs, where ``phrase`` is the
        space-joined run, UTF-8 encoded and stripped
    """
    tagger = StanfordNERTagger(
        './lib/classifiers/english.all.3class.distsim.crf.ser.gz',
        './lib/stanford-ner-3.5.2.jar')
    soup = BeautifulSoup(content, "html.parser")
    for unwanted in soup(["script", "style", "sup"]):
        unwanted.extract()

    text_nodes = list(soup.stripped_strings)
    token_lists = [wordpunct_tokenize(node) for node in text_nodes]
    tagged_lists = [tagger.tag(tokens) for tokens in token_lists]

    entities = []
    for tagged in tagged_lists:
        for label, group in groupby(tagged, lambda pair: pair[1]):
            if label == 'O':
                continue
            phrase = ' '.join(word for word, _ in group)
            entities.append((label, phrase.encode('utf-8').strip()))
    return entities
def sanitize_result(self, text):
    """Return the first PERSON name found in *text*, or ``None``.

    The text is passed through ``self.capitalize_first_letter``, tokenized
    and tagged; ``self.get_continuous_chunks`` groups the tagged tokens into
    entities. Consecutive entities sharing the PERSON tag are joined with
    single spaces.
    """
    # Raw strings: the Windows paths contain backslashes, which must not be
    # interpreted as escape sequences.
    st = StanfordNERTagger(
        r'C:\Python27\stanford_ner\classifiers\english.all.3class.distsim.crf.ser.gz',
        r'C:\Python27\stanford_ner\stanford-ner.jar',
        encoding='utf-8')
    tokenized_text = word_tokenize(self.capitalize_first_letter(text))
    classified_text = st.tag(tokenized_text)
    named_entities = self.get_continuous_chunks(classified_text)

    # (entity text, tag of the entity's first token)
    named_entities_str_tag = [
        (" ".join(token for token, tag in ne), ne[0][1])
        for ne in named_entities
    ]
    for tag, chunk in groupby(named_entities_str_tag, lambda x: x[1]):
        if tag == "PERSON":
            return " ".join(w for w, t in chunk)
    # No PERSON entity in the text.
    return None
def __init__(self, model_num):
    """Create the Stanford tagger for the 3-, 4- or 7-class model.

    :param model_num: 3, 4 or 7; selects the model path from ``config``
    :raises Exception: for any other value
    """
    model_attributes = (
        (3, 'STANFORD_3CLASS'),
        (4, 'STANFORD_4CLASS'),
        (7, 'STANFORD_7CLASS'),
    )
    for number, attribute in model_attributes:
        if model_num == number:
            # Looked up lazily so only the selected setting must exist.
            pathname = getattr(config, attribute)
            break
    else:
        raise Exception('No model for:', model_num)

    self.tagger = StanfordNERTagger(pathname, config.STANFORD_NER_JAR)
def init_ner_mapper(self):
    """Load the German Stanford NER tagger into ``self.tagger``.

    Paths are those of the "earkdev" machine. An earlier setup kept the same
    files under /opt/Projects/nlp/stanford-ner-2015-04-20/ (model in
    classifiers/german/).
    """
    model_ger = "/home/janrn/ner/german.hgc_175m_600.crf.ser.gz"  # earkdev
    stanford_jar = "/home/janrn/ner/stanford-ner.jar"  # earkdev
    tagger_options = {
        'encoding': "utf-8",
        'java_options': '-mx4096m',
    }
    self.tagger = StanfordNERTagger(model_ger, stanford_jar, **tagger_options)
def main():
    """Tag one resume file and print its named entities.

    The file is read, cleaned with ``cleanse_data``, split on whitespace and
    tagged. The tagged tokens are dumped one per line to ``ner_temp.txt``,
    converted to a tree with ``stanfordNE2tree`` and every subtree is
    reported as an ``(entity text, label)`` pair.
    """
    # Stanford NER tagger (pre-trained 4-class CoNLL model)
    st = StanfordNERTagger(
        "/home/viswanath/Downloads/stanford-ner-2014-08-27/classifiers/english.conll.4class.distsim.crf.ser.gz",
        "/home/viswanath/Downloads/stanford-ner-2014-08-27/stanford-ner.jar",
        encoding="utf-8",
    )
    fname = "/home/viswanath/data/resume/test_data/01.txt"
    # "with" closes the input handle; it used to be rebound without a close.
    with open(fname, "r") as fp:
        text = fp.read()

    lstemp = cleanse_data(text)
    list_ner_out = st.tag(lstemp.split())

    with open("ner_temp.txt", "w") as fp:
        for item in list_ner_out:
            fp.write("{0}\n".format(item))

    ne_tree = stanfordNE2tree(list_ner_out)
    print(ne_tree)

    ne_in_sent = []
    for subtree in ne_tree:
        # A subtree (rather than a plain leaf) is a named-entity chunk,
        # i.e. NE != "O".
        if isinstance(subtree, Tree):
            ne_label = subtree.label()
            ne_string = " ".join(token for token, pos in subtree.leaves())
            ne_in_sent.append((ne_string, ne_label))
    print(ne_in_sent)
def __init__(self):
    '''
    Store the configured working areas, build the German NER tagger and
    start the scanning process.
    '''
    self.status = 0

    # Working areas taken from the configuration module.
    self.input_loc = config.input_area
    self.processing_loc = config.processing_area
    self.output_loc = config.output_area

    german_tagger = StanfordNERTagger(
        config.german_ner,
        config.stanford_jar,
        encoding='utf-8',
    )
    self.tagger = german_tagger

    self.scan()
def __init__(self, person):
    """Load the tokenizer and tagger, then fill in the data for *person*.

    :param person: stored on the instance and used to build the Sparql query
    """
    self.tokenizer = nltk.data.load('tokenizers/punkt/english.pickle')

    model_path = "ner/classifiers/" + "english.all.3class.distsim.crf.ser.gz"
    jar_path = "ner/stanford-ner-3.4.jar"
    self.tagger = StanfordNERTagger(model_path, jar_path)

    self.ap = []
    self.person = person
    self.query = Sparql(person)

    # Run the setters in their fixed order; each is looked up just before
    # it is called.
    setter_names = (
        'setSpouse',
        'setMother',
        'setFather',
        'setFullName',
        'setAbstract',
        'setAbstractInfo',
    )
    for setter_name in setter_names:
        getattr(self, setter_name)()
def trial2():
    """
    Evaluate the 3-class Stanford model on the annotated-cities file.

    Every line of the file is a JSON object. Its
    'high_recall_readability_text' is tokenized and tagged; the lowercased
    tokens tagged LOCATION are compared with the set returned by
    ``_build_locations_true_positives_set``. True positives, false positives
    and false negatives are summed over all lines and printed.

    :return: None
    """
    pretrained_model_path = '/Users/mayankkejriwal/ubuntu-vm-stuff/home/mayankkejriwal/tmp/www-experiments/stanford-ner-2015-12-09/'
    all3class = pretrained_model_path + 'classifiers/english.all.3class.distsim.crf.ser.gz'
    # The CoNLL 4-class and MUC 7-class models
    # (classifiers/english.conll.4class.distsim.crf.ser.gz,
    # classifiers/english.muc.7class.distsim.crf.ser.gz) used to be run on
    # every line as well, but their output was never used; those two extra
    # tagging passes per line are dropped.
    st_3class = StanfordNERTagger(all3class, pretrained_model_path + 'stanford-ner.jar', encoding='utf-8')
    annotated_cities_file = '/Users/mayankkejriwal/datasets/memex-evaluation-november/annotated-cities/ann_city_title_state_1_50.txt'

    TP = 0
    FP = 0
    FN = 0
    with codecs.open(annotated_cities_file, 'r', 'utf-8') as f:
        for line in f:
            obj = json.loads(line)
            tokenized_text = word_tokenize(obj['high_recall_readability_text'])
            tagged_locations = set(
                token.lower()
                for token, tag in st_3class.tag(tokenized_text)
                if str(tag) == 'LOCATION'
            )
            correct_locations = _build_locations_true_positives_set(
                obj, ['correct_cities', 'correct_states', 'correct_cities_title'])

            hits = len(tagged_locations.intersection(correct_locations))
            TP += hits
            FP += len(tagged_locations) - hits
            FN += len(correct_locations) - hits

    # print() with one argument behaves the same on Python 2 and Python 3.
    print('TP, FP, FN are...')
    print(TP)
    print(FP)
    print(FN)
class StanfordTagger(object):
    """ Wrapper for the Stanford NER Tagger """

    # Paths handed to StanfordNERTagger; empty here.
    __classifier = ""
    __stanfordJar = ""

    def __init__(self, data=None):
        """Create the underlying tagger.

        :data: not used
        """
        from nltk.tag import StanfordNERTagger
        self.__tagger = StanfordNERTagger(self.__classifier,
                                          self.__stanfordJar,
                                          encoding="utf-8")

    def tags(self, raw_text):
        """
        Extract named entities from a raw text

        :raw_text: The raw text
        :return: the tagger's output for the tokenized text
        """
        from nltk.tokenize import word_tokenize
        token_text = word_tokenize(raw_text)
        # StanfordNERTagger provides tag(), not tags(); the old call raised
        # AttributeError.
        ne_tags = self.__tagger.tag(token_text)
        return ne_tags
A big benefit of the Stanford NER tagger is that it provides us with a few different models for pulling out named entities. We can use any of the following: 3 class model for recognizing locations, persons, and organizations 4 class model for recognizing locations, persons, organizations, and miscellaneous entities 7 class model for recognizing locations, persons, organizations, times, money, percents, and dates ################################################################################################ The parameters passed to the StanfordNERTagger class include: Classification model path (3 class model used below) Stanford tagger jar file path Training data encoding (default of ASCII) """ from nltk.tag import StanfordNERTagger from nltk.tokenize import word_tokenize st = StanfordNERTagger('/usr/share/stanford-ner/classifiers/english.all.3class.distsim.crf.ser.gz', '/usr/share/stanford-ner/stanford-ner.jar', encoding='utf-8') text = 'While in France, Christine Lagarde discussed short-term stimulus efforts in a recent interview with the Wall Street Journal.' tokenized_text = word_tokenize(text) classified_text = st.tag(tokenized_text) print(classified_text)
# Tag a sample text with the Stanford 3-class model and print the result.
from nltk.tag import StanfordNERTagger

st = StanfordNERTagger("english.all.3class.distsim.crf.ser.gz")
# print(...) with a single argument works on both Python 2 and Python 3;
# the print statement is a SyntaxError on Python 3.
print(st.tag(
    "Rami Eid is studying at Stony Brook University in NY. And he wants to work at CERN in Switzerland in Europe .".split()
))
def stanford_tagger(token_text):
    """Return the Stanford 3-class NER tags for *token_text*.

    :param token_text: passed unchanged to ``StanfordNERTagger.tag``
    """
    model = 'english.all.3class.distsim.crf.ser.gz'
    jar = 'stanford-ner.jar'
    return StanfordNERTagger(model, jar).tag(token_text)
################################################################################
## This module contains functions that take sentences as input and return time
## and location information.
################################################################################
print(":::::::::::::Loading time and location tagging Libraries::::::::::::::\n")

# Time tagger libraries
import json
from sutime import SUTime

jar_files = "./Resources/python-sutime-master/jars/"
sutime = SUTime(jars=jar_files, mark_time_ranges=True)

# NER libraries
from nltk.tokenize import word_tokenize
from nltk.tag import StanfordNERTagger

st = StanfordNERTagger(
    './Resources/stanford-ner-2018-02-27/classifiers/english.all.3class.distsim.crf.ser.gz',
    './Resources/stanford-ner-2018-02-27/stanford-ner.jar',
    encoding='utf-8')
print("\n:::::::::::::All libraries loaded:::::::::::::\n\n")


def get_time(sentence):
    """Return the time expressions SUTime finds in *sentence*.

    :return: the string-valued ``'value'`` fields joined with ", ", or
        ``None`` when SUTime finds nothing
    """
    serialized = json.dumps(sutime.parse(sentence), sort_keys=True, indent=4)
    su_out = json.loads(serialized)
    if len(su_out) == 0:
        return None
    string_values = (
        entry['value'] for entry in su_out if isinstance(entry['value'], str)
    )
    return ", ".join(string_values)
class StanfordTagger(object):
    """ Wrapper for the Stanford NER Tagger """

    __currentDirectory = os.path.dirname(os.path.realpath(__file__))  # Current directory
    # Path templates; the instance attributes set in __init__ fill in the
    # directory.
    __classifier = "%s/dist/classifiers/english.all.3class.distsim.crf.ser.gz"
    __stanfordJar = "%s/dist/stanford-ner.jar"

    def __init__(self, language="en"):
        """Build the tagger from the files under ``<this dir>/dist``.

        :language: forwarded to ``NERFinder``
        """
        from nltk.tag import StanfordNERTagger
        self.__stanfordJar = "%s/dist/stanford-ner.jar" % self.__currentDirectory
        self.__classifier = "%s/dist/classifiers/english.all.3class.distsim.crf.ser.gz" % (self.__currentDirectory,)
        self.__tagger = StanfordNERTagger(self.__classifier,
                                          self.__stanfordJar,
                                          encoding="utf-8")
        self.__namedEntitiesFinder = NERFinder(language=language)

    def __tags(self, raw_text):
        """
        Return the named entities tokens given a raw text

        :raw_text: Raw text (text, or UTF-8 encoded bytes)
        """
        from nltk.tokenize import word_tokenize
        # Decode byte strings only. On Python 2 ``bytes`` is ``str``, so this
        # matches the old check; on Python 3 ``str`` has no decode().
        if isinstance(raw_text, bytes):
            raw_text = raw_text.decode('utf-8')
        # Tokenize the string
        token_text = word_tokenize(raw_text)
        # Retrieve the named entities from the tokens
        ne_tags = self.__tagger.tag(token_text)
        return ne_tags

    def __bio_tagger(self, ne_tagged):
        """
        Return BIO tags from named entities

        :ne_tagged: name_entities tokens, as (token, tag) pairs
        """
        bio_tagged = []
        prev_tag = "O"
        for token, tag in ne_tagged:
            if tag == "O":
                # Outside any entity
                bio_tagged.append((token, tag))
            elif tag == prev_tag:
                # Inside NE: same tag as the previous token
                bio_tagged.append((token, "I-" + tag))
            else:
                # Begin NE: after "O", or adjacent to an NE of another type
                bio_tagged.append((token, "B-" + tag))
            prev_tag = tag
        return bio_tagged

    def __generate_tree(self, bio_tagged):
        """ Transform a list of BIO-tagged tokens into a tree """
        from nltk import pos_tag
        from nltk.chunk import conlltags2tree
        if not bio_tagged:
            # zip(*[]) yields nothing to unpack; build the tree from no tags.
            return conlltags2tree([])
        tokens, ne_tags = zip(*bio_tagged)
        pos_tags = [pos for token, pos in pos_tag(tokens)]
        conlltags = list(zip(tokens, pos_tags, ne_tags))
        ne_tree = conlltags2tree(conlltags)
        return ne_tree

    def __getEntities(self, taggedWords):
        """
        It returns the entities from a list of tagged words (NER or POS)
        after generating the syntax tree
        """
        bio_tagged = self.__bio_tagger(taggedWords)
        stanford_tree = self.__generate_tree(bio_tagged=bio_tagged)
        entities = self.__namedEntitiesFinder.getEntities(stanford_tree)
        return entities

    def getEntitiesByTags(self, pos_tagged_words):
        """ Get entities from a list of word tagged with POS Tags. """
        entities = self.__getEntities(taggedWords=pos_tagged_words)
        return entities

    def getEntities(self, raw_text):
        """ Get the entities from a raw text """
        ne_entities = self.__tags(raw_text=raw_text)
        entities = self.__getEntities(taggedWords=ne_entities)
        return entities
def ch_nertagger(str):
    """Print every token of *str* with its tag from the Chinese NER model.

    :param str: text whose tokens are separated by whitespace
    """
    model_file = r'E:\tools\stanfordNLTK\jar\classifiers\chinese.misc.distsim.crf.ser.gz'
    jar_file = r'E:\tools\stanfordNLTK\jar\stanford-ner.jar'
    chi_tagger = StanfordNERTagger(model_filename=model_file, path_to_jar=jar_file)

    tokens = str.split()
    for pair in chi_tagger.tag(tokens):
        word, tag = pair
        print(word,tag)
# Demo script: tag one sample sentence with the Stanford 3-class NER model
# and print the (token, tag) pairs.
from nltk.tag import StanfordNERTagger
from nltk.tokenize import word_tokenize

# First argument: classifier model; second argument: Stanford NER jar.
st = StanfordNERTagger('C:\\Users\\Amlendu11\\Downloads\\stanford-ner-2015-12-09 (1)\\stanford-ner-2015-12-09\\classifiers\\english.all.3class.distsim.crf.ser.gz', 'C:\\Users\\Amlendu11\\Downloads\\stanford-ner-2015-12-09 (1)\\stanford-ner-2015-12-09\\stanford-ner-3.6.0.jar', encoding='utf-8')
text = 'While in France, Christine Lagarde discussed short-term stimulus efforts in a recent interview with the Wall Street Journal.'
# Tokenize first: the tagger is given a list of tokens, not a raw string.
tokenized_text = word_tokenize(text)
classified_text = st.tag(tokenized_text)
print(classified_text)
#!/usr/bin/env python # -*- encoding=utf-8 -*- import re from nltk.tag import StanfordNERTagger import jieba from extract_time import * chinese_ner = StanfordNERTagger('chinese.misc.distsim.crf.ser.gz') entity_class = {"PERSON": 0, "GPE": 1, "MISC": 2, "ORGANIZATION": 3, "O": 4} def extract_entity(s): tokens = list(jieba.cut(s)) r = chinese_ner.tag(tokens) entity_dict = {} pre_cls = "" terms = [] for token, cls in r: # print type(token) print "%s, %s" % (token, cls) if cls != pre_cls: if pre_cls != "": entity_dict.setdefault(pre_cls, []) entity_dict[pre_cls].append(terms) terms = [] terms.append(token)
from pymongo import MongoClient import json from bson import json_util import nltk import json from nltk.tag import StanfordNERTagger from nltk.tokenize import word_tokenize #Conexion a MongoDB cliente = MongoClient() #Inicializar objeto cliente = MongoClient('127.0.0.1', 27017) #Indicar parametros del servidor bd = cliente.taller4 #Seleccionar Schema coleccion = bd.body_respuestas #Seleccionar Coleccion st = StanfordNERTagger( '/home/xubuntu/Taller4/nueva/classifiers/english.muc.7class.distsim.crf.ser.gz', '/home/xubuntu/Taller4/nueva/stanford-ner.jar', encoding='utf-8') #consulta1= coleccion.find({"items.question_id":{"$gte":60000}}) pregunta = 0 p = 0 y = 0 k = 0 l = 0 m = 0 n = 0 entidades = [] try: bd = cliente.taller4 #Seleccionar Schema coleccion = bd.body_respuestas #Seleccionar Coleccion
from nltk.parse.stanford import StanfordDependencyParser, StanfordParser
from nltk.tag import StanfordNERTagger
import os

# Locations of the unpacked Stanford distributions.
path_ner = "/home/pongsakorn/Desktop/stanford-ner-2017-06-09"
path_parser = "/home/pongsakorn/Desktop/stanford-parser-full-2017-06-09"
path_postagger = "/home/pongsakorn/Desktop/stanford-postagger-full-2017-06-09"
# Java class path: current directory plus the three distributions.
class_path_cmd = ":".join([".", path_ner, path_parser, path_postagger])

path_postagger_model = "/home/pongsakorn/Desktop/stanford-postagger-full-2017-06-09/models"
path_ner_clf = "/home/pongsakorn/Desktop/stanford-ner-2017-06-09/classifiers"
# Directories searched for model files.
class_model_cmd = ":".join([path_postagger_model, path_parser, path_ner_clf])

os.environ['CLASSPATH'] = class_path_cmd
os.environ['STANFORD_MODELS'] = class_model_cmd

model_path = '/home/pongsakorn/Desktop/stanford-parser-full-2017-06-09/englishPCFG.ser.gz'
stanford_dependency_parser = StanfordDependencyParser(model_path=model_path)
stanford_parser = StanfordParser(model_path=model_path)
stanford_ne_tagger = StanfordNERTagger(
    '../../stanford-ner-2017-06-09/classifiers/english.all.3class.distsim.crf.ser.gz',
    path_to_jar='../../stanford-ner-2017-06-09/stanford-ner.jar')
# -*- coding: utf-8 -*- import sys from nltk.tag import StanfordNERTagger from nltk.tokenize import word_tokenize import nltk import json import scale import pickle st = StanfordNERTagger( '../../stanford-ner-2018-10-16/classifiers/english.muc.7class.distsim.crf.ser.gz', '../../stanford-ner-2018-10-16/stanford-ner.jar', encoding='utf-8') def tag_answers_tag(fw, ner, pos, tag1): words = [] tags = [] answer = [] i = 0 c = 0 a = 1 while i < len(ner): if ner[i][0][0].isupper(): y = "U" else: y = "L" if ner[i][1] == tag1 and a: z = 'A ' c = 1
print("Verb::->" + token.text) elif (token.pos_ == 'NOUN'): # print token print("NOUN-->" + token.text) elif (token.pos_ == 'PROPN'): # print token print("PROPN-->" + token.text) ############################################## article = "Attackers compromise Microsoft Exchange servers to hijack internal email chains Pakistan , India , US" import nltk from nltk.tag import StanfordNERTagger print('NTLK Version: %s' % nltk.__version__) stanford_ner_tagger = StanfordNERTagger( 'C:/Users/TahsinAsif/OneDrive - CYFIRMA INDIA PRIVATE LIMITED/antuitBackUp@3march/Asif/AI/NameEntityRecog/stanford_ner/' + 'classifiers/english.muc.7class.distsim.crf.ser.gz', 'C:/Users/TahsinAsif/OneDrive - CYFIRMA INDIA PRIVATE LIMITED/antuitBackUp@3march/Asif/AI/NameEntityRecog/stanford_ner/' + 'stanford-ner-3.9.2.jar') results = stanford_ner_tagger.tag(article.split()) print('Original Sentence: %s' % (article)) for result in results: tag_value = result[0] tag_type = result[1] if tag_type != 'O': print('Type: %s, Value: %s' % (tag_type, tag_value))
class Tagger:
    """Tags seminar announcements with paragraph, sentence, time, speaker
    and location markup and writes the result to ``out/``."""

    def __init__(self):
        """Train the backoff tagger, load Stanford NER and clear ``out/``."""
        self.backoff = self.backoff_tagger(backoff=DefaultTagger('NN'))
        self.st = StanfordNERTagger(
            'stanfordNERJars/classifiers/english.all.3class.distsim.crf.ser.gz',
            'stanfordNERJars/stanford-ner.jar',
            encoding='utf-8')
        # Start from an empty output directory.
        if os.path.exists("out/"):
            shutil.rmtree('out/')

    # Training data for the n-gram taggers built in backoff_tagger().
    train_sents = brown.tagged_sents()[:48000]

    def backoff_tagger(self, backoff=None):
        """
        Used to tag text using a more accurate backoff tagger

        :param backoff: the current backoff
        :return: a backoff tagger
        """
        # Each tagger falls back to the one built before it.
        for cls in [UnigramTagger, BigramTagger, TrigramTagger]:
            backoff = cls(self.train_sents, backoff=backoff)
        return backoff

    def ner_stanford(self, text, entity):
        """
        Gets a list of specific entities from text

        :param text: the text we want to search in
        :param entity: the entity to extract
        :return: the set of entities; consecutive tokens sharing the tag are
                 joined with single spaces
        """
        classified_text = self.st.tag(word_tokenize(text))
        return {
            " ".join(w for w, t in chunk)
            for tag, chunk in groupby(classified_text, lambda x: x[1])
            if tag == entity
        }

    @staticmethod
    def tag_paragraphs(text):
        """
        Tags paragraphs in text

        :param text: text to be tagged
        :return: the text with every paragraph matched by ``paragraphRegex``
                 wrapped in <paragraph> tags
        """
        text = '\n\n{}\n\n'.format(text.strip('\n'))
        para = re.compile(paragraphRegex)
        seen = set()
        # finditer keeps scanning the string it was given, so rebinding
        # ``text`` below does not disturb the iteration.
        for match in para.finditer(text):
            paragraph = match.group(1)
            # str.replace wraps every occurrence at once; a repeated
            # paragraph must be skipped or it would be wrapped twice.
            if paragraph and paragraph not in seen:
                seen.add(paragraph)
                text = text.replace(paragraph, '<paragraph>{}</paragraph>'.format(paragraph))
        return text.strip()

    def tag_sentences(self, text):
        """
        Tags sentences in the text

        :param text: text to be tagged
        :return: tagged text
        """
        text_parts = re.split(r'</?{}>'.format('paragraph'), text)
        sentences = []
        for part in text_parts:
            sentences.extend(sent_tokenize(part.strip()))
        seen = set()
        for sent in sentences:
            # Keep only strings matching not_sentence_regx_str, and wrap each
            # distinct one once (replace() handles all of its occurrences).
            if re.match(not_sentence_regx_str, sent) is None or sent in seen:
                continue
            seen.add(sent)
            text = text.replace(sent, '<sentence>{}</sentence>'.format(sent))
        return text

    @staticmethod
    def tag_times(stime, etime, text):
        """
        Tags times in the text

        :param stime: the start time
        :param etime: the end time
        :param text: the text to tag
        :return: the text tagged with times
        """
        if not etime and not stime:
            return text
        textHolder = text
        time_regx = re.compile(time_regx_str)
        for time_str in set(time_regx.findall(textHolder)):
            time = time_parser.parse(time_str).time()
            # stime may be missing when only an end time was extracted.
            if stime and time_parser.parse(stime).time() == time:
                textHolder = textHolder.replace(time_str, '<stime>{}</stime>'.format(time_str))
            elif etime and time_parser.parse(etime).time() == time:
                textHolder = textHolder.replace(time_str, '<etime>{}</etime>'.format(time_str))
        return textHolder

    @staticmethod
    def tag_locations(locations, text):
        """
        Tags locations in the text

        :param locations: locations to be tagged
        :param text: text to be tagged
        :return: the text with locations tagged; every case-insensitive
                 match is replaced by the location as spelled in
                 ``locations``
        """
        for loc in locations:
            compiled = re.compile(re.escape(loc), flags=re.IGNORECASE)
            replacement = '<location>' + loc + '</location>'
            # A callable replacement is inserted verbatim; a plain string
            # would have backslashes in ``loc`` processed as escapes.
            text = compiled.sub(lambda _match, rep=replacement: rep, text)
        return text

    @staticmethod
    def tag_speakers(text, speakers):
        """
        Tags speakers in the text

        :param text: text to tag
        :param speakers: speakers to tag
        :return: the tagged text
        """
        for spk in speakers:
            insensitive_spk = re.compile(
                r'(\b({0})\b|[.?!]({0})\b|\(({0})\))'.format(re.escape(spk)),
                re.IGNORECASE)
            match = insensitive_spk.search(text)
            if match is None:
                # Speaker does not occur in the text.
                continue
            name = match.group(1)
            text = text.replace(name, '<speaker>' + name.strip() + '</speaker>')
        return text

    def tag_seminar(self, path, directory, extractor):
        """
        Tags seminar with all previously found data and writes the data to a
        file.

        :param path: the path to the untagged files
        :param directory: the directory they are in
        :param extractor: the extractor class to extract data
        """
        for file in tqdm(os.listdir(directory)):
            filename = os.fsdecode(file)
            if not filename.endswith(".txt"):
                continue
            with open(path + filename, 'r', encoding='utf-8') as f:
                placeholder = f.read().strip('\n -*')
            # Splits the text into header and body; files that do not have
            # that shape are skipped.
            try:
                header, body = re.search(header_body_regx_str, placeholder).groups()
            except (AttributeError, ValueError):
                continue
            header = header.rstrip('\n')
            stime, etime = extractor.extract_time(header)
            locations = extractor.extract_location(header, body, self)
            speakers = extractor.extract_speaker(header, body, self)
            body = self.tag_paragraphs(body)
            body = self.tag_sentences(body)
            seminar = header + '\n\n' + body
            seminar = self.tag_times(stime, etime, seminar)
            seminar = self.tag_speakers(seminar, speakers)
            seminar = self.tag_locations(locations, seminar)
            out_location = "out/"
            Utils.mkdir_p(out_location)
            # Written with the same encoding the input was read with; the
            # context manager closes the file even if the write fails.
            with open(out_location + filename, "w+", encoding='utf-8') as out:
                out.write(seminar)
from nltk.tag import StanfordNERTagger
import string, sys
from settings import config, db

# Variables
# NOTE: these two assignments rebind the imported `config` and `db` module
# names to the objects returned by their getters.
config = config.getConfig()
db = db.getDB()
# Tagger built from a 4-class CoNLL model and the NER jar, both given as
# paths relative to the working directory.
st = StanfordNERTagger(
    "classifiers/english.conll.4class.distsim.crf.ser.gz",
    "classifiers/stanford-ner.jar"
)
#"/usr/share/stanford-ner/classifiers/all.3class.distsim.crf.ser.gz", "/usr/share/stanford-ner/stanford-ner.jar")

# Declare the collections
articles = db.articles
indicators = db.indicators
analysis = db.analysis
stanford_named_ents = db.stanford_named_entities


# Drops the stanford_named_entities collection, then iterates over every
# document in `articles`, reading its 'article_text' and 'url' fields.
# The rest of the body is not visible in this excerpt.
def extract():
    count = 0
    stanford_named_ents.drop()
    # presumably a pymongo cursor; no_cursor_timeout=True is passed to find()
    # -- TODO confirm the cursor is closed further down.
    for article in articles.find(no_cursor_timeout=True):
        try:
            text = article['article_text']
            url = article['url']
            # Only continue for urls that have no document yet.
            # NOTE(review): the collection was dropped above, so this can only
            # find documents inserted earlier in the same run.
            if not (stanford_named_ents.find_one({"url": url})):
# Lower-cases `text`, applies `remove_punctuation_map` via str.translate,
# tokenizes with nltk.word_tokenize and passes the tokens to `stem_tokens`
# (both helpers are defined outside this excerpt).
def normalize(text):
    return stem_tokens(
        nltk.word_tokenize(text.lower().translate(remove_punctuation_map)))


# Shared vectorizer that uses `normalize` as its tokenizer.
vectorizer = TfidfVectorizer(tokenizer=normalize, stop_words='english')


# Fits the shared vectorizer on the two texts and returns entry [0, 1] of
# tfidf * tfidf.T, i.e. the product of the first text's row with the second's.
# NOTE(review): presumably scikit-learn's TfidfVectorizer with its default
# normalization, which would make this the cosine similarity -- confirm.
def cosine_sim(text1, text2):
    tfidf = vectorizer.fit_transform([text1, text2])
    return ((tfidf * tfidf.T).A)[0, 1]


# Module-level tagger using the 3-class English model; paths are relative to
# the working directory.
st = StanfordNERTagger(
    './static/other_files/stanford-ner-2014-06-16/classifiers/english.all.3class.distsim.crf.ser.gz',
    './static/other_files/stanford-ner-2014-06-16/stanford-ner.jar',
    encoding='utf-8')


# Validates `tag_search`, reads the document at
# main_directory/text_document_name and tokenizes it. The remainder of the
# function is not visible in this excerpt.
def disambiguate(text_document_name, main_directory, download_directory,
                 tag_search):
    # tag_search is either PERSON or LOCATION
    if tag_search not in ('PERSON', 'LOCATION'):
        raise ValueError(
            "tag_search parameter can only have values 'PERSON' or 'LOCATION' "
        )
    tagged_persons = []
    # NOTE(review): codecs.open is called without an encoding argument,
    # although the tagger above is configured with encoding='utf-8' -- confirm
    # the input files decode correctly.
    with codecs.open(os.path.join(main_directory, text_document_name),
                     "r") as text_document:
        text = text_document.read()
    tokenized = nltk.word_tokenize(text)
def namedEntityRecognize(self, sentence):
    """Perform NER on the sentence.

    The sentence is split on whitespace and the tokens are tagged with a
    StanfordNERTagger built from ``self.modelPath``. The tagged tokens are
    printed and returned.

    :param sentence: the sentence to tag
    :return: a list of tuples of (word, ne-recognized tag)
    """
    st = StanfordNERTagger(self.modelPath)
    # Tag once and reuse the result; the original ran the tagger a second
    # time just to produce the return value.
    tagged = st.tag(sentence.split())
    print(tagged)
    return tagged
outfile1.close()
outfile2.close()

# Get the relations between team and coach
# Runs the query stored in coach.rq against graph G and writes the first two
# columns of every result row to ../output/coach.tsv, tab-separated.
# NOTE(review): this fragment uses Python 2 print statements.
print "Starts Extracting Coaches!"
# NOTE(review): the query file handle is never closed explicitly.
q = open("coach.rq").read()
results = G.query(q)
outfile = open("../output/coach.tsv", "w")
for row in results:
    outfile.write("%s\t%s\n" % (row[0], row[1]))
outfile.close()

# Get the entities of stadium
print "Start Identifying Stadiums!"
StanfordNERPath = './stanford-ner'
# Tagger using the 3-class English model found under StanfordNERPath.
st = StanfordNERTagger(
    StanfordNERPath + '/classifiers/english.all.3class.distsim.crf.ser.gz',
    StanfordNERPath + '/stanford-ner.jar')
# Copula forms and noun POS tags; their use lies beyond this excerpt.
indicator = set([u'is', u'was', u'are', u'were'])
noun_tag = set(['NN', 'NNS', 'NNP', 'NNPS'])
q = open("withDoc.rq").read()
results = G.query(q)
outfile = open("../output/stadium.tsv", "w")
count = 0
for row in results:
    count += 1
    # Progress message every 10 rows.
    if count % 10 == 0:
        print "%d documents has been processed" % count
    # The second column of the row is tokenized and NER-tagged.
    text = row[1]
    tags = st.tag(word_tokenize(text))
    firstSentence = ''
import ner
from nltk.tag import StanfordNERTagger

# Installation directory of Stanford NER on the author's machine; the model
# and jar paths below are built by plain string concatenation, so the trailing
# slash matters.
stanford_ner_dir = '/home/will/packages/stanfordNER/'
eng_model_filename = stanford_ner_dir + 'classifiers/english.all.3class.distsim.crf.ser.gz'
my_path_to_jar = stanford_ner_dir + 'stanford-ner.jar'

st = StanfordNERTagger(model_filename=eng_model_filename, path_to_jar=my_path_to_jar)
# Tags a whitespace-split sample sentence.
# NOTE(review): the result is not stored or printed, so running this as a
# script shows nothing -- presumably meant for an interactive session.
st.tag('Rami Eid is studying at Stony Brook University in NY'.split())

# Alternative (disabled): query an NER server over HTTP via the `ner` package.
# tagger = ner.HttpNER(host='localhost', port=8080)
# tagger.get_entities("University of California is located in California, United States")
tree = parse_trees[0] # get all NP trees and extract their leaves # Use help(nltk.tree.Tree) to find out which NLTK method you can use to do this for s in tree.subtrees(lambda tree: tree.label() == "NP"): print(s.leaves()) # In[29]: # Named Entity Recognition (Using Stanford NLP) from nltk.tag import StanfordNERTagger import os import pandas as pd java_path = 'D:/jdk-13.0.2/bin/java.exe' os.environ['JAVA_HOME'] = java_path sner = StanfordNERTagger( '/home/lzanella/stanford-ner-2018-10-16/classifiers/english.all.3class.distsim.crf.ser.gz', path_to_jar='/home/lzanella/stanford-ner-2018-10-16/stanford-ner.jar') named_entities = [] with open("/home/lzanella/ameliepoulain.txt", "r", encoding="utf8", errors='ignore') as infile: content = infile.read() sentences = nltk.sent_tokenize(content) counter = 0 for sentence in sentences[0:3]: # print("\n SENTENCE %i : %s \n \n NE: \n"%(counter,sentence))
from Liwc_Trie_Functions import create_trie, get_liwc_categories
from sklearn.preprocessing import OneHotEncoder
from sklearn.preprocessing import LabelEncoder
from nltk import word_tokenize
from empath import Empath

# Raw and pre-processed datasets, read from the working directory.
df = pd.read_csv('dataset.csv')
ppd = pd.read_csv('pre_processed_dataset.csv')
ohe = OneHotEncoder()
lb = LabelEncoder()

# Using Stanford NER Tagger API
jar_n = '/localhome/debarshi/sarcasm/stanford-ner-2018-10-16/stanford-ner-3.9.2.jar'
model_n = '/localhome/debarshi/sarcasm/stanford-ner-2018-10-16/classifiers/english.all.3class.distsim.crf.ser.gz'
ner_tagger = StanfordNERTagger(model_n, jar_n, encoding='utf8')

# Using Stanford POS Tagger API
jar = '/localhome/debarshi/sarcasm/stanford-postagger-2018-10-16/stanford-postagger-3.9.2.jar'
model = '/localhome/debarshi/sarcasm/stanford-postagger-2018-10-16/models/english-left3words-distsim.tagger'
pos_tagger = StanfordPOSTagger(model, jar, encoding='utf8')

# Extracting POS Features
# Tokenize every snippet by value (independent of the frame's index labels)
# and tag them all in one tag_sents() call instead of one tagger call per
# row; the per-snippet results are then flattened into a single list of
# (token, tag) pairs, in snippet order.
tokenized_snippets = [word_tokenize(snippet) for snippet in df['Snippet']]
POS_snippets = [
    tagged_token
    for tagged_snippet in pos_tagger.tag_sents(tokenized_snippets)
    for tagged_token in tagged_snippet
]

# Label-encode the tag of every token, then one-hot encode the labels as a
# single column and densify the result.
POS_snippets_type = [x[1] for x in POS_snippets]
POS_snippets_type = lb.fit_transform(POS_snippets_type)
pos_vec = ohe.fit_transform(np.reshape(POS_snippets_type, (-1, 1)))
pos_vec = pos_vec.todense()