def named_entities(sentence):
    """Extract named entities from *sentence* with the Stanford NER tagger.

    Returns a dict mapping tag name (e.g. "PERSON") to a list of entity
    strings; the non-entity tag 'O' is removed from the result.
    """
    st = StanfordNERTagger(
        model_filename='../lib/stanford-ner-2015-12-09/classifiers/english.all.3class.distsim.crf.ser.gz',
        path_to_jar="../lib/stanford-ner-2015-12-09/stanford-ner-3.6.0.jar")
    tags = st.tag(word_tokenize(sentence))
    if not tags:  # FIX: guard empty input; tags[0] below would raise IndexError
        return {}
    # clean up the result from the tagger: merge consecutive tokens
    # that share the same tag into one entity string
    prev_tag_name = str(tags[0][1])
    cur_entity = str(tags[0][0])
    entities = {}
    for i in range(1, len(tags)):
        cur_token = str(tags[i][0])
        cur_tag_name = str(tags[i][1])
        if cur_tag_name == prev_tag_name:
            cur_entity = cur_entity + " " + cur_token
        else:
            # change encoding, another way is to .encode('ascii','ignore')
            entities.setdefault(prev_tag_name, []).append(str(cur_entity))
            cur_entity = cur_token
            prev_tag_name = cur_tag_name
    # BUG FIX: the final run of tokens was previously never appended.
    entities.setdefault(prev_tag_name, []).append(str(cur_entity))
    if 'O' in entities:
        del entities['O']  # not needed, 'O' means not a named entity
    return entities
def get_ner_tags(self):
    """NER-tag every text in self.cleaned_data.

    Returns one item per text: a (phrase, tag) tuple built from the
    tagged tokens, or the string "N/A" when a text has no entities.
    """
    sys.path.append('../preprocess')
    from nltk.tag.stanford import StanfordNERTagger
    st = StanfordNERTagger(
        '../stanford-ner/classifiers/english.all.3class.distsim.crf.ser.gz',
        '../stanford-ner/stanford-ner.jar')
    tokenized_list = [ct.split() for ct in self.cleaned_data]
    # FIX: tag_sents already returns a list -- the extra copy was redundant.
    tags = st.tag_sents(tokenized_list)
    # Per text, the indexes of tokens carrying a real NER tag (not "O").
    ids = [[i for i, a in enumerate(t) if a[1] != "O"] for t in tags]
    phrases = []
    for i, t in zip(ids, tags):
        phrase = ""
        tt = "N/A"  # kept when a text has no tagged tokens
        for index, p in enumerate(i):
            if index == len(i) - 1:
                phrase += "{}".format(t[p][0])
                # last tagged token: pair the joined phrase with its tag
                tt = phrase, t[p][1]
            else:
                phrase += "{} ".format(t[p][0])
        phrases.append(tt)
    return phrases
def stanford_ner_tagger(stanford_dir, jarfile, modelfile, tag_this_file, output_file):
    """Tag the first token of each line of *tag_this_file* with Stanford NER
    and write token<TAB>tag lines to *output_file*.

    Blank input lines become the sentinel token "SENEND" so sentence
    boundaries survive the round trip (stripped again in post-processing).
    """
    jarfile = stanford_dir + jarfile
    modelfile = stanford_dir + modelfile
    tagger = StanfordNERTagger(model_filename=modelfile, path_to_jar=jarfile)
    line_count = 0
    tokens = []
    with open(tag_this_file, "r") as src:
        for raw_line in src:
            parts = raw_line.split()
            line_count += 1
            if parts:
                tokens.append(parts[0])
            else:
                # Remove the SENENDs from the output file afterwards. Needed to keep the format consistent
                # Keep in mind, that some "/" are still removed. Is replace in postprecessing step.
                tokens.append("SENEND")
    print(tokens)
    # Tag the file using Stanford NER
    tagged = tagger.tag(tokens)
    # Write the results to a tsv file
    with open(output_file, "w") as dst:
        for pair in tagged:
            dst.write(str(pair[0]) + "\t" + pair[1] + "\n")
def getNER(sent):
    """Return a dict mapping entity names to their NER tags for *sent*.

    Multi-token Stanford entities are joined with underscores.  Proper
    nouns (NNP) not already contained in some Stanford entity key are
    added with the tag "NNP".
    """
    st = StanfordNERTagger('english.conll.4class.distsim.crf.ser.gz', 'stanford-ner.jar')
    NerSen = st.tag(sent.split())
    Entities = {}
    EntityName = "";
    length = len(NerSen)
    for i in range(length):
        if NerSen[i][1] != 'O':
            # Continue a multi-token entity with an underscore separator.
            if i >0 and NerSen[i - 1][1] != 'O':
                EntityName += '_'
            EntityName += NerSen[i][0]
            # Flush when the entity ends: sentence end or next token is 'O'.
            if i == length-1 or (i < length-1 and NerSen[i + 1][1] == 'O'):
                Entities[EntityName] = NerSen[i][1]
                EntityName = ""
    words=nltk.word_tokenize(sent)
    pos_tags =nltk.pos_tag(words)
    EntityKey = []
    for key in Entities.keys():
        EntityKey.append(key)
    # Add proper nouns that are not substrings of an existing entity key.
    for token,tag in pos_tags:
        if tag == "NNP":
            isContain = False
            for key in EntityKey:
                if token in key:
                    isContain = True
            if not isContain:
                Entities[token] = "NNP"
    return Entities
def __init__(self, articleText): self.blob = textblob.TextBlob(articleText) #blob.tags keywords = [ x[0] for x in self.blob.tags if "NNP" in x[1] or "NN" in x[1] or "CD" in x[1] ] self.keywords = set(keywords) self.nounPhrases = Counter(self.blob.noun_phrases).most_common() st = StanfordNERTagger( 'D:/Source/Newspeek/Newspeek/news/stanford-ner-2014-06-16/classifiers/english.muc.7class.distsim.crf.ser.gz', 'D:/Source/Newspeek/Newspeek/news/stanford-ner-2014-06-16/stanford-corenlp-caseless-2015-04-20-models.jar' ) self.namedEntities = dict((a.lower(), b) for a, b in set( [x for x in st.tag(articleText.split()) if x[1] != 'O'])) self.namedPhrases = {} for np in self.nounPhrases: tags = [] for word in np[0].split(): tag = 'O' if word.lower() in self.namedEntities.keys(): tag = self.namedEntities[word.lower()] tags.append(tag) np_tag = Counter(tags).most_common(1)[0][0] if np_tag != 'O': self.namedPhrases[np[0].lower()] = np_tag pass
def stanfordNer():
    """Tag federer.txt with the Stanford 7-class NER model, print the
    recognized entities as a table, and save it to Rezultati_StanfordNER.txt."""
    # FIX: use a context manager so the input file is closed (it was leaked).
    with open("federer.txt", "r") as f:
        text = f.read()
    jar = './stanford-ner-tagger/stanford-ner.jar'
    model = './stanford-ner-tagger/english.muc.7class.distsim.crf.ser.gz'
    ner_tagger = StanfordNERTagger(model, jar, encoding='utf8')
    words = nltk.word_tokenize(text)
    color_print('\nStanfordNER', color='blue', bold=True, underline=True)
    color_print('\nPrepoznati entiteti:\n', color='yellow', bold=True, underline=True)
    table = PrettyTable(["Prepoznati entitet", "Tip entiteta"])
    for token, tag in ner_tagger.tag(words):
        if tag != "O":  # keep only real entities
            table.add_row([token, tag])
    print(table)
    with open("Rezultati_StanfordNER.txt", "w", encoding="utf8") as text_file:
        text_file.write("Prepoznati entiteti: \n\n%s" % table)
    print("Rezultati sačuvani u fajl Rezultati_StanfordNER.txt")
def out(self):
    """Return the ORGANIZATION names found in self.text, merging runs of
    consecutive ORGANIZATION-tagged tokens into a single company name."""
    stanford_dir = '/home/gary/stanford-ner-2015-04-20/'
    jarfile = stanford_dir + 'stanford-ner.jar'
    modelfile = stanford_dir + 'classifiers/english.all.3class.distsim.crf.ser.gz'
    st = StanfordNERTagger(model_filename=modelfile, path_to_jar=jarfile)
    tokenized_text = word_tokenize(self.text)
    classified_text = st.tag(tokenized_text)
    # return organization names
    org_list = list()
    i = 0
    while i < len(classified_text):
        # FIX: leftover debug print(i) removed; the regex search for a
        # fixed string is replaced by a plain substring test.
        item = classified_text[i]
        if 'ORGANIZATION' in item[1]:
            # Absorb the whole run of consecutive ORGANIZATION tokens.
            company = item[0]
            n = i + 1
            while n < len(classified_text) and 'ORGANIZATION' in classified_text[n][1]:
                company = company + " " + classified_text[n][0]
                n = n + 1
            i = n  # token n (if any) is non-ORG and already examined
            org_list.append(company)
        i = i + 1
    return org_list
def main():
    """Read sentences from the file named in argv[1], NER-tag each one,
    and write "[token, tag], " entries (one line per sentence) to output.txt."""
    try:
        filename = sys.argv[1]
        f = open(filename, "r")
    except IndexError:
        print(
            "You probably didn't specify an input file. Correct format python3 ass5.py <InputFileName>"
        )
        exit()
    except FileNotFoundError:
        print(
            "The file you specified does not exist. Please check and try again."
        )
        exit()
    # FIX: close the input file once read (it was left open).
    with f:
        inputs = f.readlines()
    jar = './stanford-ner.jar'
    model = './ner-model-english.ser.gz'
    ner_tagger = StanfordNERTagger(model, jar, encoding='utf8')
    # FIX: the output file was never closed before.
    with open("output.txt", "w") as file2:
        for sentence in inputs:
            words = nltk.word_tokenize(sentence)
            for x in ner_tagger.tag(words):
                file2.write("[" + x[0] + ", " + x[1] + "], ")
            file2.write('\n')
def tagSent(sent):
    """Extract candidate entities from *sent* by POS pattern, then promote
    any that Stanford NER recognizes as a person.

    NNP runs are joined with underscores; common nouns/numbers (optionally
    preceded by adjectives) get the tag "OTHER"; tokens matching a PERSON
    word from Stanford NER are re-tagged "PERSON".
    """
    words=nltk.word_tokenize(sent)
    pos_tags =nltk.pos_tag(words)
    EntityKey = []
    Entities = {}
    EntityName = ""
    length = len(pos_tags)
    NounTag = ["NN","NNS","CD"]
    for i in range(length):
        if pos_tags[i][1] == 'NNP':
            # Continue an NNP run with an underscore separator.
            if i >0 and pos_tags[i - 1][1] == 'NNP':
                EntityName += '_'
            EntityName += pos_tags[i][0]
            # Flush when the NNP run ends.
            if i == length-1 or (i < length-1 and pos_tags[i + 1][1] != 'NNP'):
                Entities[EntityName] = pos_tags[i][1]
                EntityName = ""
        if pos_tags[i][1] in NounTag:
            EntityName += pos_tags[i][0]
            # Prepend any immediately preceding adjectives (JJ).
            j=i-1
            while j>=0 and pos_tags[j][1] == "JJ":
                EntityName = pos_tags[j][0] +" "+EntityName
                j -= 1
            Entities[EntityName] = "OTHER"
            EntityName=""
    st = StanfordNERTagger('english.conll.4class.distsim.crf.ser.gz', 'stanford-ner.jar')
    NerSen = st.tag(sent.split())
    # Promote entities containing a Stanford-tagged PERSON word.
    for token,type in Entities.items():
        for word,tag in NerSen:
            if (word in token) and ("PERSON" == tag):
                Entities[token] = "PERSON"
    return Entities
def extract_character_names(book_contents):
    """Scan *book_contents* for PERSON entities, recording each character's
    first name, full name, and the sentence they appear in via CharacterData."""
    data = CharacterData()
    lines = sent_tokenize(book_contents)
    tagger = StanfordNERTagger(
        "./stanford-ner/classifiers/english.all.3class.distsim.crf.ser.gz",
        "./stanford-ner/stanford-ner.jar")
    discovered_names_in_line = set()

    def _flush(first_name, name, line):
        # Record a completed name run, at most once per first name per line.
        name = name.strip()
        if first_name not in discovered_names_in_line:
            data.add_line_for_name(first_name, name, line)
            discovered_names_in_line.add(first_name)

    for line in lines:
        words = word_tokenize(line)
        taggd_token = tagger.tag(words)
        name = ""
        first_name = ""
        discovered_names_in_line.clear()
        for word, tag in taggd_token:
            if tag == "PERSON":
                # First PERSON token of a run is remembered as the first name.
                if name == "":
                    first_name = word
                name += word + " "
            elif name != "":
                # A non-PERSON token ends the current run; flush it.
                _flush(first_name, name, line)
                name = ""
                first_name = ""
        # BUG FIX: a name run ending exactly at the end of the sentence
        # was previously dropped.
        if name != "":
            _flush(first_name, name, line)
    return data
def get_ner_tags(self):
    """NER-tag self.cleaned_data and return the de-duplicated list of
    entity phrases (runs of consecutive tagged tokens joined with spaces)."""
    sys.path.append('../preprocess')
    from nltk.tag.stanford import StanfordNERTagger
    st = StanfordNERTagger(
        '../stanford-ner/classifiers/english.all.3class.distsim.crf.ser.gz',
        '../stanford-ner/stanford-ner.jar')
    tokenized_list = [ct.split() for ct in self.cleaned_data]
    NERTags = st.tag_sents(tokenized_list)
    # Flatten the per-sentence tag lists into one token stream.
    n = []
    for nt in NERTags:
        n.extend(nt)
    # get the indexes of all words that have NER tags (anything but "O")
    ids = [i for i, a in enumerate(n) if a[1] != "O"]
    a = np.array(ids)
    # Split into groups of consecutive indexes: one group per entity.
    consecutive_ids = np.split(a, np.where(np.diff(a) != 1)[0] + 1)
    phrases = []
    for ci in consecutive_ids:
        # FIX: the old `tag` accumulator was built but never used; dropped.
        phrase = "".join("{} ".format(n[id_][0]) for id_ in ci)
        phrases.append(phrase)
    cleaned_phrases = self.del_repeat(phrases)
    return cleaned_phrases
def __init__(self, classifier_path=None, ner_path=None, sutime_jar_path=None):
    """Set up the Stanford NER tagger and SUTime, plus the keyword lists
    used to classify user utterances.

    All three path arguments default to hard-coded Windows install
    locations when None.
    """
    # Change the path according to your system.
    # FIX: raw strings so the backslashes in these Windows paths can never
    # be misread as escape sequences.
    if classifier_path is None:
        classifier_path = r"C:\stanford_corenlp\stanford-ner-2018-02-27\stanford-ner-2018-02-27\classifiers\english.muc.7class.distsim.crf.ser.gz"
    if ner_path is None:
        ner_path = r"C:\stanford_corenlp\stanford-ner-2018-02-27\stanford-ner-2018-02-27\stanford-ner.jar"
    if sutime_jar_path is None:
        sutime_jar_path = r"C:\stanford_corenlp\stanford-corenlp-full-2018-02-27\stanford-corenlp-full-2018-02-27"
    self.stanford_classifier = classifier_path
    self.stanford_ner_path = ner_path
    self.sutime_path = sutime_jar_path
    # Creating Tagger Object
    self.st = StanfordNERTagger(self.stanford_classifier, self.stanford_ner_path)
    self.su = SUTime(jars=self.sutime_path, mark_time_ranges=True, include_range=True)
    # Keyword lists used for simple intent / topic detection.
    self.weather_terms = ["weather", "climate", "precipitation", "sun", "rain", "cloud", "snow", "hot", "humid", "cold", "sunny", "windy", "cloudy", "rainy", "snowy", "misty", "foggy", "colder", "hotter", "warmer", "pleasant"]
    self.greet_terms = ["hello", "hey", "howdy", "hello", "hi", "yo", "yaw"]
    self.closure_terms = ["no", "nope", "thank you", "bye", "tata", "thanks", "that will be all", "that's it", "that'll be all"]
    self.day_terms = ["dawn", "dusk", "morning", "evening", "noon", "afternoon", "night", "tonight", "midnight", "midday"]  # , "hours"]
    self.date_terms = ["today", "tomorrow", "yesterday"]
def stanford_ner_tagger(stanford_dir, jarfile, modelfile, tag_this_file, output_file):
    """NER-tag the first token of each line of *tag_this_file* and write
    token<TAB>tag lines to *output_file*.

    NOTE(review): the stanford_dir/jarfile/modelfile arguments are computed
    but then ignored -- the tagger below uses hard-coded paths on the D:
    drive instead; confirm whether the parameters should be honored.
    """
    jarfile = stanford_dir + jarfile
    modelfile = stanford_dir + modelfile
    #st = StanfordNERTagger(model_filename=modelfile, path_to_jar=jarfile)
    stanford_classifier = 'D:\\NLP\\ner_evals\\classifiers\\stanford-ner-2018-02-27\\classifiers\\english.all.3class.distsim.crf.ser.gz'
    stanford_ner_path = 'D:\\NLP\\ner_evals\\classifiers\\stanford-ner-2018-02-27\\stanford-ner.jar'
    # Creating Tagger Object
    st = StanfordNERTagger(stanford_classifier, stanford_ner_path, encoding='utf-8')
    i = 0
    tagged_ne = []
    with open(tag_this_file, "r") as f:
        for line in f:
            line = line.split()
            i += 1
            if len(line) > 0:
                # Only the first whitespace-separated token of each line is kept.
                tagged_ne.append(line[0])
            else:
                # Remove the SENENDs from the output file afterwards. Needed to keep the format consistent
                # Keep in mind, that some "/" are still removed. Is replace in postprecessing step.
                tagged_ne.append("SENEND")
    print(tagged_ne)
    # Tag the file using Stanford NER
    out = st.tag(tagged_ne)
    # Write the results to a tsv file
    with open(output_file, "w") as f:
        for i in out:
            f.write(str(i[0]) + "\t" + i[1] + "\n")
def Name_Entity_recognition2(f2):
    """NER-tag the text *f2*, using the classifier and jar paths taken from
    the STANFORD_CLASSIFIER / STANFORD_NER_PATH environment variables."""
    tagger = StanfordNERTagger(os.environ['STANFORD_CLASSIFIER'],
                               os.environ['STANFORD_NER_PATH'],
                               encoding='utf-8')
    # Cap the tagger's JVM heap at 1000 MB.
    tagger.java_options = '-mx1000m'
    tokens = word_tokenize(f2)
    return tagger.tag(tokens)
def o_tag():
    """Returns a noun with a tag if the tag is unfindable or a location"""
    tagger = StanfordNERTagger(
        'stanford-ner-2014-06-16/classifiers/english.all.3class.distsim.crf.ser.gz',
        'stanford-ner-2014-06-16/stanford-ner-3.4.jar')
    wanted = ('O', 'LOCATION')
    return [pair for pair in tagger.tag(get_nouns()) if pair[1] in wanted]
def NERTag(text):
    """Tag *text* with the Stanford 3-class NER model and return the
    (token, tag) pairs; sets CLASSPATH / model / Java env vars first."""
    os.environ['CLASSPATH'] = "C:/Users/1/James/stanford-ner-2015-12-09/stanford-ner.jar"
    os.environ['STANFORD_MODELS'] = "C:/Users/1/James/stanford-parser-full-2015-12-09"
    os.environ['JAVAHOME'] = "C:/Program Files/Java/jdk1.8.0_102"
    tagger = StanfordNERTagger(
        'C:/Users/1/James/stanford-ner-2015-12-09/classifiers/english.all.3class.distsim.crf.ser.gz',
        'C:/Users/1/James/stanford-ner-2015-12-09/stanford-ner.jar')
    return tagger.tag(text)
def namedEntityMatch(row):
    """Score how well the named entities of row['question1'] and
    row['question2'] match: matched mentions / total mentions compared
    (0 when either question yields no entities)."""
    #os.environ["JAVA_HOME"]="C:\Program Files\Java\jdk1.8.0_151"
    if "JAVA_HOME" not in os.environ:
        print(
            "Please set the value of JAVA_HOME environment variable, or install java in your machine"
        )
        sys.exit(-1)
    ner = StanfordNERTagger(
        r"stanford-ner-2014-06-16\classifiers\english.all.3class.distsim.crf.ser.gz",
        r"stanford-ner-2014-06-16\stanford-ner.jar")
    ques1Entities = ner.tag(str(row['question1']).lower().split())
    ques2Entities = ner.tag(str(row['question2']).lower().split())

    def _group_by_tag(tagged):
        # {tag: [word, ...]} for every tagged (non-'O') token.
        grouped = {}
        for word, tag in tagged:
            if tag != "O":
                grouped.setdefault(tag, []).append(word)
        return grouped

    # BUG FIX: the original iterated the still-empty result dicts instead of
    # the tagger output, and compared tags against "0" (zero) rather than
    # the letter "O" -- so both dicts stayed empty and the score was always 0.
    entityDict1 = _group_by_tag(ques1Entities)
    entityDict2 = _group_by_tag(ques2Entities)
    if len(entityDict1) == 0 or len(entityDict2) == 0:
        return 0
    totalCount = 0
    matchCount = 0
    # Count, symmetrically, mentions whose word appears under the same tag
    # in the other question.
    for key in entityDict1:
        entityList1 = entityDict1[key]
        if key in entityDict2:
            entityList2 = entityDict2[key]
            for item in entityList1:
                if item in entityList2:
                    matchCount += 1
                totalCount += 1
    for key in entityDict2:
        entityList2 = entityDict2[key]
        if key in entityDict1:
            entityList1 = entityDict1[key]
            for item in entityList2:
                if item in entityList1:
                    matchCount += 1
                totalCount += 1
    return float(matchCount) / float(totalCount)
def NERFunc(file_path,fileName,G):
    """Add NER-entity nodes and weighted edges to graph G for each document.

    For every file, runs Stanford NER over the text, joins consecutive
    same-tag tokens into one entity string, normalises it (ASCII, lower
    case, letters only), then wires weighted edges in both directions
    between the document node and the entity node -- and, for multi-word
    entities, between the document and each word already present in G.
    Returns the mutated graph G.

    NOTE(review): Python 2 code (str.translate with a deletechars argument,
    G.node attribute access) -- confirm the target interpreter.
    """
    os.environ["STANFORD_MODELS"] = os.path.join(graphConstants.ROOT_FOLDER,"stanford-ner-2015-04-20")
    st = StanfordNERTagger(os.path.join(graphConstants.ROOT_FOLDER,"stanford-ner-2015-04-20","classifiers","english.all.3class.distsim.crf.ser.gz"),
                           os.path.join(graphConstants.ROOT_FOLDER,"stanford-ner-2015-04-20","stanford-ner.jar" ))
    #print "Java error path JAVAHOME=" + str(os.environ["JAVAHOME"])
    #print "Java error path JAVA_HOME=" + str(os.environ["JAVA_HOME"])
    for index, file_path in enumerate(file_path):
        node_name = fileName[index]
        data = readFromFile(file_path)
        if data is not None:
            netagged_words = st.tag(data.split())
            # Consecutive tokens sharing a tag form one entity chunk.
            for tag, chunk in groupby(netagged_words, lambda x:x[1]):
                if tag != "O":
                    entity = " ".join(w for w, t in chunk)
                    if entity != "":
                        # Normalise: ASCII only, lower case, strip punctuation
                        # and any remaining non-letters.
                        entity = entity.encode('utf-8')
                        entity = re.sub(r'[^\x00-\x7F]+', ' ', entity)
                        entity = entity.lower()
                        no_punctuation = entity.translate(None, string.punctuation)
                        entity=re.sub("[^a-zA-Z]+", " ", no_punctuation)
                        #print("Tag = "+ tag+" entity = "+ entity)
                        #If this topic doesn't exist as a node then add it
                        if entity not in G.nodes():
                            G.add_node(entity)
                            G.node[entity]['type'] = graphConstants.TYPE_NER
                        #If the edge between this doc and entity is already present or not
                        if G.has_edge(node_name,entity) is False:
                            G.add_edge(node_name,entity, weight = 1)
                        else:
                            G[node_name][entity]["weight"] = G[node_name][entity]["weight"] + 1
                        graphUtils.logger.info("entity topic entity = "+entity + " document ="+node_name)
                        # Mirror edge from the entity back to the document.
                        if G.has_edge(entity,node_name) is False:
                            G.add_edge(entity,node_name, weight = 1)
                        else:
                            G[entity][node_name]["weight"] = G[entity][node_name]["weight"] + 1
                        topics = entity.split()
                        if(len(topics) > 1):
                            # Also link the document to each individual word
                            # of a multi-word entity, if already a node.
                            for word in entity.split():
                                #Only change weight if this topic already exists
                                if word in G.nodes():
                                    #If the edge between this doc and topic is already present or not
                                    if G.has_edge(node_name,word) is False:
                                        G.add_edge(node_name,word, weight = 1)
                                    else:
                                        G[node_name][word]["weight"] = G[node_name][word]["weight"] + 1
                                    if G.has_edge(word,node_name) is False:
                                        G.add_edge(word,node_name, weight = 1)
                                    else:
                                        G[word][node_name]["weight"] = G[word][node_name]["weight"] + 1
                                    graphUtils.logger.info("entity topic word = "+word + " document ="+node_name)
    return G
def __init__(self, cslm, transitions, tags):
    """Store the model inputs and build one Stanford NER tagger per
    language (English 3-class, Spanish ANCORA)."""
    self.cslm = cslm
    self.transitions = transitions
    self.tags = tags
    ner_root = "../stanford-ner-2015-04-20/"
    ner_jar = ner_root + "stanford-ner.jar"
    self.engClassifier = StanfordNERTagger(
        ner_root + "classifiers/english.all.3class.distsim.crf.ser.gz", ner_jar)
    self.spanClassifier = StanfordNERTagger(
        ner_root + "classifiers/spanish.ancora.distsim.s512.crf.ser.gz", ner_jar)
def NERWithOldStanford(input_sample):
    """Tokenize *input_sample*, tag it with the Stanford 3-class model,
    and return the structure produced by formatted_entities()."""
    java_path = "C:\Program Files (x86)\Common Files\Oracle\Java\javapath\java.exe"  #"C:/Program Files/Java/jdk1.8.0_161/bin/java.exe"
    os.environ['JAVAHOME'] = java_path
    tagger = StanfordNERTagger('english.all.3class.distsim.crf.ser.gz',
                               'stanford-ner.jar',
                               encoding='utf-8')
    tokens = word_tokenize(input_sample)
    classified_paragraphs_list = tagger.tag_sents([tokens])
    return formatted_entities(classified_paragraphs_list)
def nermodel(jsondata):
    """Tag *jsondata* with a custom Stanford NER model and print the result
    to stdout as a JSON object of the form {"output": [...]}."""
    #download stanford NER from https://nlp.stanford.edu/software/CRF-NER.html#Download
    # go to stanford-ner.jar path in stanfordNER model
    tagger = StanfordNERTagger(
        "/home/ubuntu/Documents/node-python/simple-express/python_scripts/bcm-model.ser.gz",
        "/home/ubuntu/Documents/stanford-ner-2018-10-16/stanford-ner.jar",
        encoding='utf8')
    tagged = tagger.tag(jsondata.split())
    print(json.dumps({"output": tagged}))
def get_ner_tags(answer):
    """Return Stanford NER (token, tag) pairs for the first context.

    NOTE(review): the *answer* parameter is unused -- the function tags the
    module-level context_list[0] instead; confirm this is intentional.
    """
    # download the stanford ner zip file and extract it.
    # Change the directory location in the following path.
    st = StanfordNERTagger(
        '/Users/shubhambarhate/Desktop/project3/stanford-ner-2017-06-09/classifiers/english.all.3class.distsim.crf.ser.gz',
        '/Users/shubhambarhate/Desktop/project3/stanford-ner-2017-06-09/stanford-ner.jar'
    )
    return st.tag(context_list[0].split())
def ner_tag():
    """Returns a noun with a tag if the tag is person or organization"""
    tagger = StanfordNERTagger(
        'stanford-ner-2014-06-16/classifiers/english.all.3class.distsim.crf.ser.gz',
        'stanford-ner-2014-06-16/stanford-ner-3.4.jar')
    wanted = ("PERSON", "ORGANIZATION")
    return [pair for pair in tagger.tag(get_nouns()) if pair[1] in wanted]
def NERFunc(data, G, node_name):
    """Add weighted document<->entity edges to graph G for the NER entities
    found in *data*.

    Entities that are not already nodes in G are skipped (contrast: no
    new nodes are created here).  NOTE(review): Python 2 code (print
    statements, str.translate with a deletechars argument).
    """
    os.environ["STANFORD_MODELS"] = os.path.join(graphConstants.ROOT_FOLDER, "stanford-ner-2015-04-20")
    st = StanfordNERTagger(
        os.path.join(graphConstants.ROOT_FOLDER, "stanford-ner-2015-04-20", "classifiers", "english.all.3class.distsim.crf.ser.gz"),
        os.path.join(graphConstants.ROOT_FOLDER, "stanford-ner-2015-04-20", "stanford-ner.jar"))
    if data is not None:
        for sentence in nltk.sent_tokenize(data):
            netagged_words = st.tag(sentence.split())
            # Consecutive tokens with the same tag form one entity chunk.
            for tag, chunk in groupby(netagged_words, lambda x: x[1]):
                if tag != "O":
                    entity = " ".join(w for w, t in chunk)
                    if entity != "":
                        # Normalise: ASCII only, lower case, letters only.
                        entity = entity.encode('utf-8')
                        entity = re.sub(r'[^\x00-\x7F]+', ' ', entity)
                        entity = entity.lower()
                        no_punctuation = entity.translate(
                            None, string.punctuation)
                        entity = re.sub("[^a-zA-Z]+", " ", no_punctuation)
                        #print("Tag = "+ tag+" entity = "+ entity)
                        #If this topic doesn't exist as a node then add it
                        if entity not in G.nodes():
                            continue
                        #If the edge between this doc and entity is already present or not
                        if G.has_edge(node_name, entity) is False:
                            G.add_edge(node_name, entity, weight=1)
                        else:
                            G[node_name][entity][
                                "weight"] = G[node_name][entity]["weight"] + 1
                        print "Recomm entity = " + entity + " document =" + node_name
                        # Mirror edge from the entity back to the document.
                        if G.has_edge(entity, node_name) is False:
                            G.add_edge(entity, node_name, weight=1)
                        else:
                            G[entity][node_name][
                                "weight"] = G[entity][node_name]["weight"] + 1
                        topics = entity.split()
                        if (len(topics) > 1):
                            # Also link the document to each individual word
                            # of a multi-word entity, if already a node.
                            for word in entity.split():
                                #Only change weight if this topic already exists
                                if word in G.nodes():
                                    #If the edge between this doc and topic is already present or not
                                    if G.has_edge(node_name, word) is False:
                                        G.add_edge(node_name, word, weight=1)
                                    else:
                                        G[node_name][word]["weight"] = G[
                                            node_name][word]["weight"] + 1
                                    if G.has_edge(word, node_name) is False:
                                        G.add_edge(word, node_name, weight=1)
                                    else:
                                        G[word][node_name]["weight"] = G[word][
                                            node_name]["weight"] + 1
                                    print "Recomm entity topic word = " + word + " document =" + node_name
def __init__(self):
    """
    Open client for Stanford NERTagger
    :return: protocol open
    """
    lib_dir = get_project_path() + '/nltk_libs/'
    self.st = StanfordNERTagger(lib_dir + 'english.all.3class.distsim.crf.ser',
                                lib_dir + 'stanford-ner-3.8.0.jar')
def get_NER_Tagger(content):
    """Tokenize *content* and tag it with the Stanford 3-class model; the
    jar is located through the CLASSPATH environment variable."""
    NER_classifier = "/Users/aparnaghosh87/Downloads/stanford-ner-2014-06-16/classifiers/english.all.3class.distsim.crf.ser.gz"
    os.environ['CLASSPATH'] = "/Users/aparnaghosh87/Downloads/stanford-ner-2014-06-16"
    tagger = StanfordNERTagger(NER_classifier, encoding='utf-8')
    # output looks like this:
    # [('While', 'O'), ('in', 'O'), ('France', 'LOCATION'), (',', 'O'), ('Christine', 'PERSON'), ('Lagarde', 'PERSON'), ('discussed', 'O'), ...]
    return tagger.tag(word_tokenize(content))
def getTheNamedEntities(text):
    """Return {token-with-dots-removed: NER tag} for every token of *text*.

    NOTE(review): 'O' tokens are NOT filtered out (the filter below is
    commented out), and duplicate tokens keep only their last tag because
    they collide on the dict key.
    """
    st = StanfordNERTagger(StanfordNERClassifierPath, StanfordNERjarPath)
    lstTag = st.tag(text.split())
    result = {}
    for tag in lstTag:
        #if str(tag[1]).lower() != 'o': #### CAUTION - DROPPING the General Term 'Object' items
        # FIX: redundant nested str(str(...)) call collapsed to one str().
        result[str(tag[0]).replace('.', '')] = str(tag[1])
    return result
def __init__(self, penalty: float, threshold: float):
    """Build the Stanford NER tagger and dependency parser, and store the
    scoring parameters *penalty* and *threshold*."""
    self.ner = StanfordNERTagger(
        'libs/english.all.3class.distsim.crf.ser.gz',
        'libs/stanford-ner-3.9.1.jar')
    self.dependency_parser = StanfordDependencyParser(
        path_to_jar='libs/stanford-corenlp-3.9.1.jar',
        path_to_models_jar='libs/stanford-corenlp-3.9.1-models.jar')
    self.penalty = penalty
    self.threshold = threshold
def compute_NER(corpus):
    """Return a one-element list holding the space-joined sequence of NER
    tags (with a trailing space) for the whitespace tokens of *corpus*."""
    NER = []
    st = StanfordNERTagger(
        'stanford-ner-2014-06-16/classifiers/english.all.3class.distsim.crf.ser.gz',
        'stanford-ner-2014-06-16/stanford-ner.jar')
    ner = st.tag(corpus.split())
    # FIX: build the tag string with join instead of quadratic += in a loop.
    ner_tag = "".join(n[1] + " " for n in ner)
    NER.append(ner_tag)
    return NER
def main():
    """Parse --tag/--word, NER-tag a small built-in news corpus, and write
    the matching news items to News.csv."""
    # parse all the command line arguments
    args = parser.parse_args()
    args_is_tag = args.tag
    args_is_word = args.word
    # validate the path passed in the argument
    if not args_is_tag:
        # BUG FIX: `arge.error(...)` raised NameError (undefined name);
        # parser.error prints a usage message and exits.
        parser.error("--tag is missing")
    else:
        tagger = StanfordNERTagger(
            '/Users/Shared/stanford-ner/classifiers/english.all.3class.distsim.crf.ser.gz',
            '/Users/Shared/stanford-ner/stanford-ner.jar',
            encoding='utf-8')
        news_corpus = [
            'While in France, Christine Lagarde discussed short-term stimulus efforts in a recent interview with the Wall Street Journal.',
            "Apple Inc. is an American multinational technology company headquartered in Cupertino, California, that designs, develops, and sells consumer electronics, computer software, and online services. Its hardware products include the iPhone smartphone, the iPad tablet computer, the Mac personal computer, the iPod portable media player, the Apple Watch smartwatch, and the Apple TV digital media player. Apple's consumer software includes the OS X and iOS operating systems, the iTunes media player, the Safari web browser, and the iLife and iWork creativity and productivity suites. Its online services include the iTunes Store, the iOS App Store and Mac App Store, and iCloud. Apple was founded by Steve Jobs, Steve Wozniak, and Ronald Wayne on April 1, 1976, to develop and sell personal computers. It was incorporated as Apple Computer, Inc. on January 3, 1977, and was renamed as Apple Inc. on January 9, 2007, to reflect its shifted focus toward consumer electronics. Apple (NASDAQ: AAPL ) joined the Dow Jones Industrial Average on March 19, 2015.",
            "At least 100 security forces killed in fight for Afghan city",
            "Sonia Gandhi, Oscar Fernandes Move High Court In National Herald IT Case",
            "Musk talking to Saudi fund, others as he seeks Tesla buyout financing",
            "Volkswagen's Electrify America taps Flintstones, Jetsons for EV campaign",
            "Netflix finance chief David Wells to step down",
            "Independent labels urge EU to block Sony's $2.3 billion bid for EMI",
            "Samsung may suspend operations at China mobile phone plant - report",
            "Oil India's quarterly profit jumps 56 percent, but misses estimate",
            "SEBI proposes changes to consent settlement rules",
            "VF to spin off Lee and Wrangler jeans into public company",
            "Erdogan vows action against 'economic terrorists' over lira plunge",
            "Citigroup says global card chief Linville leaving in shakeup",
            "Tesla short sellers trim exposure but stay the course",
            # NOTE(review): missing comma after the next literal -- it is
            # implicitly concatenated with the line after it into ONE corpus
            # entry. Preserved as-is; confirm whether a comma was intended.
            "Facebook pages with large U.S. following to require more authorization"
            "Hackers at convention test voting systems for bugs"
        ]
        # index -> formatted entity structure for each news item
        news_dict = {}
        for i, each_news in enumerate(news_corpus):
            tokenized_list = word_tokenize(each_news)
            news_dict[i] = formatted_entities_for_tag(
                tagger.tag(tokenized_list))
        if args_is_word:
            search_news_key_ls = keyword_search(args_is_tag, args_is_word,
                                                news_corpus, news_dict, tagger)
        else:
            search_news_key_ls = tag_search(args_is_tag, news_dict)
        search_news_ls = []
        for each_key in search_news_key_ls:
            search_news_ls.append(news_corpus[each_key])
        news_df = pd.DataFrame({'News': search_news_ls})
        news_df.to_csv('News.csv', index=False)
def perpIndividual(inFile):
    """Demo: NER-tag a fixed example sentence and print the tagged tokens.
    (The *inFile* argument is currently unused.)"""
    tagger = StanfordNERTagger(
        'D:\PythonProjects\StanfordParser\stanford-ner-2017-06-09\classifiers\english.all.3class.distsim.crf.ser.gz',
        'D:\PythonProjects\StanfordParser\stanford-ner-2017-06-09\stanford-ner.jar',
        encoding='utf-8')
    text = 'While in France, Christine Lagarde discussed short-term stimulus efforts in a recent interview with the Wall Street Journal.'
    tokens = word_tokenize(text)
    tagged = tagger.tag(tokens)
    print(tagged)
def ner_tag(url):
    """Fetch text for *url* via nltk_toy, tag every sentence with the
    Stanford 7-class model, and return the non-'O' (token, tag) pairs.
    Returns None when no text could be fetched."""
    text = nltk_toy(url)
    if not text:
        return None
    #unicodedata.normalize('NFKD', text).encode('ascii','ignore')
    sentences = nltk.sent_tokenize(text)
    words = [nltk.word_tokenize(sentance) for sentance in sentences]
    tokens = [nltk.pos_tag(word) for word in words]
    tagger = StanfordNERTagger('ner/english.muc.7class.distsim.crf.ser.gz', 'ner/stanford-ner.jar')
    found = []
    # split at sentence boundaries and tag each sentence
    for sent in re.split('\. |! |\? ', text):
        for token, label in tagger.tag(sent.split()):
            # keep track of every tag which is not empty ('O')
            if label != 'O':
                found.append((token, label))
    print(found)
    return found
class NamedEntityTagger(object):
    """ Performs NER against a given document"""

    def __init__(self):
        classifier = '/stanford_ner/classifiers/english.all.3class.distsim.crf.ser.gz'
        jar = '/stanford_ner/stanford-ner.jar'
        self.tagger = StanfordNERTagger(classifier, jar)

    def perform_ner(self, text):
        """Return the Stanford (token, tag) pairs for *text*."""
        return self.tagger.tag(text)
def named_entity_extraction(self):
    """Extract persons, organizations, and locations from the plaintext
    metadata and enrich the locations with countries/places.

    Falls back to the legacy NERTagger API (whose result is wrapped in an
    extra list) when the StanfordNERTagger path fails.
    """
    def _tokens():
        # Normalise punctuation so the whitespace split yields clean tokens.
        return self.metadata["plaintext"].replace(".", " ").replace(",", " , ").replace("!", " ").replace("?", " ").replace("\n", " ").split()
    try:
        ner = StanfordNERTagger('../lib/stanford-lib/english.all.3class.distsim.crf.ser.gz',
                                '../lib/stanford-lib/stanford-ner.jar')
        extracted_ne = ner.tag(_tokens())
    # FIX: bare `except:` also swallowed SystemExit/KeyboardInterrupt.
    except Exception:
        ner = NERTagger('../lib/stanford-lib/english.all.3class.distsim.crf.ser.gz',
                        '../lib/stanford-lib/stanford-ner.jar')
        # The old NERTagger returns a list of sentences; take the first.
        extracted_ne = ner.tag(_tokens())[0]
    persons = self.process_named_entities(extracted_ne, "PERSON")
    organizations = self.process_named_entities(extracted_ne, "ORGANIZATION")
    locations = self.unify_locations(extracted_ne)
    self.metadata["persons"] = persons
    self.metadata["organizations"] = organizations
    self.metadata["locations"] = locations
    general_locations = self.enrich_location(locations)
    self.metadata["countries"] = general_locations[0]  # a list of countries
    self.metadata["places"] = general_locations[1]  # a list of places
def named_entities(sentence): st = StanfordNERTagger(model_filename='../lib/stanford-ner-2015-12-09/classifiers/english.all.3class.distsim.crf.ser.gz', path_to_jar="../lib/stanford-ner-2015-12-09/stanford-ner-3.6.0.jar") tags = st.tag(word_tokenize(sentence)) print tags # clean up the result from the tagger prev_tag_name = str(tags[0][1]) cur_entity = str(tags[0][0]) entities = {} for i in range(1, len(tags)): cur_tag = tags[i] cur_token = str(cur_tag[0]) cur_tag_name = str(cur_tag[1]) if cur_tag_name == prev_tag_name: cur_entity = cur_entity + " " + cur_token else: if not prev_tag_name in entities: entities[prev_tag_name] = [] # change encoding, another way is to .encode('ascii','ignore') entities[prev_tag_name].append(str(cur_entity)) cur_entity = cur_token prev_tag_name = cur_tag_name del entities['O'] # not needed, 'O' means not a named entity return entities
def main():
    """Pipeline: find every training/**/en.tok.off.pos file, NER-tag it,
    run the layered custom taggers, write results, then attach Wikipedia
    links to the NNP chunks."""
    file_paths = []
    for root, directories, files in os.walk("training"):
        for filename in files:
            if filename == "en.tok.off.pos":
                filepath = os.path.join(root, filename)
                file_paths.append(filepath)
    classifier = "stanford-ner-2014-06-16/classifiers/english.all.3class.distsim.crf.ser.gz"
    jar = "stanford-ner-2014-06-16/stanford-ner-3.4.jar"
    NERTagger = StanfordNERTagger(classifier, jar)
    for filesource in file_paths:
        words, lines = rawtext(filesource)
        NERlist = NERTagger.tag(words)
        # Layered tagging: location/org/person first, then city/country
        # from the locations, then animal/sport/natural-entity tags.
        LOClist, TAG1list = LOC_ORG_PERtagger(NERlist)
        TAG2list = CIT_COUtagger(LOClist)
        TAG3list = ANI_SPO_NAT_ENTtagger(words)
        ALLlist = [TAG1list,TAG2list,TAG3list]
        writeout(ALLlist, filesource, lines)
        NNPlist = read_data(filesource)
        #print(filesource," ",NNPlist)
        FINAL_NNPlist = nnp_checker(NNPlist)
        #print(FINAL_NNPlist,"\n")
        # NOTE(review): WIKIlinks is never used -- links are appended
        # directly to each nnp entry instead.
        WIKIlinks = []
        for nnp in FINAL_NNPlist:
            # The candidate n-gram is the last element of each entry.
            ngram = nnp[-1]
            link = link_checker(ngram)
            if link != -1:
                nnp.append(link)
        wiki_writeout(FINAL_NNPlist, lines, filesource)
def __init__(self, rawQueryFile, contentFile):
    """Initialise the answering-pipeline state, build the NER tagger, and
    immediately run Initialize() and Main()."""
    self.rawQuery = ''
    self.content = ''
    self.questionTypeWH = ["how many", "who", "what", "where", "when", "why", "which", "how"]
    self.questionTypeFactoid1 = ["do", "did", "does"]
    self.questionTypeFactoid2 = ["is", "are", "has", "have", "had", "was", "were", "would", "will", "should", "can", "could"]
    self.questionTypeOther = ["how", "list", "describe"]
    self.stopWords = ("the","a","an","am","of","by","at","be","on","or","any","in","to","as","its","it")
    self.negationWords = ("none", "not", "no", "can't", "couldn't", "don't", "won't","neither","nobody","nowhere","nothing")
    self.allTypes = ("WHType", "YesNo", "List", "None")
    self.cur_dir = os.getcwd()
    ner_base = self.cur_dir + '/PythonScripts/stanford-ner-2014-06-16/'
    self.NERTaggerObj = StanfordNERTagger(
        ner_base + 'classifiers/english.all.3class.distsim.crf.ser.gz',
        ner_base + 'stanford-ner.jar')
    self.qWord = ""
    self.Initialize()
    self.Main()
class PiiAnalyzer(object):
    """Scan a CSV file for personally identifiable information using regex
    patterns (CommonRegex) plus Stanford NER for people/places/organizations."""

    def __init__(self, filepath):
        self.filepath = filepath
        self.parser = CommonRegex()
        self.standford_ner = StanfordNERTagger('classifiers/english.conll.4class.distsim.crf.ser.gz')

    def analysis(self):
        """Return a dict of PII lists found in the CSV at self.filepath."""
        people = []
        organizations = []
        locations = []
        emails = []
        phone_numbers = []
        street_addresses = []
        credit_cards = []
        ips = []
        data = []
        # FIX: mode 'rU' was removed in Python 3.11; the csv docs say to
        # open the file with newline='' instead.
        with open(self.filepath, 'r', newline='') as filedata:
            reader = csv.reader(filedata)
            for row in reader:
                data.extend(row)
                for text in row:
                    emails.extend(self.parser.emails(text))
                    # Strip whitespace so split phone numbers still match.
                    phone_numbers.extend(self.parser.phones("".join(text.split())))
                    street_addresses.extend(self.parser.street_addresses(text))
                    credit_cards.extend(self.parser.credit_cards(text))
                    ips.extend(self.parser.ips(text))
        # Tag each unique cell once; bucket results by entity type.
        for title, tag in self.standford_ner.tag(set(data)):
            if tag == 'PERSON':
                people.append(title)
            if tag == 'LOCATION':
                locations.append(title)
            if tag == 'ORGANIZATION':
                organizations.append(title)
        return {'people': people, 'locations': locations,
                'organizations': organizations, 'emails': emails,
                'phone_numbers': phone_numbers,
                'street_addresses': street_addresses,
                'credit_cards': credit_cards, 'ips': ips
                }
#the path where you have downloaded and unziped the ner parser. sp_dir = '/home/sarah/nertagger/' model1 = sp_dir + 'classifiers/english.all.3class.distsim.crf.ser.gz' model2 = sp_dir + 'classifiers/english.conll.4class.distsim.crf.ser.gz' model3 = sp_dir + 'classifiers/english.muc.7class.distsim.crf.ser.gz' jar_path = sp_dir + 'stanford-ner.jar' #our test sentence eng_sent = 'Rami Eid has been studying at Stony Brook University in NY since 2007. He pays $30 daily' print eng_sent eng_tokens = word_tokenize(eng_sent) #for 3 classes-Location, Person, Organization print "\n\n 3 classes" st_3 = StanfordNERTagger(model_filename = model1, path_to_jar = jar_path) eng_tagged = st_3.tag(eng_tokens) for i in eng_tagged: print i #for 3 classes-Location, Person, Organization, Misc print "\n\n 4 classes" st_4 = StanfordNERTagger(model_filename = model2, path_to_jar = jar_path) eng_tagged = st_4.tag(eng_tokens) for i in eng_tagged: print i #for 7 classes-Time, Location, Organization, Person, Money, Percent, Date print "\n\n 7 classes" st_7 = StanfordNERTagger(model_filename = model3, path_to_jar = jar_path) eng_tagged = st_7.tag(eng_tokens) for i in eng_tagged:
def ner_tag():
    """Returns a noun with a tag if the tag is person or organization"""
    tagger = StanfordNERTagger(
        'stanford-ner-2014-06-16/classifiers/english.all.3class.distsim.crf.ser.gz',
        'stanford-ner-2014-06-16/stanford-ner-3.4.jar')
    tagged_nouns = tagger.tag(get_nouns())
    wanted = ("PERSON", "ORGANIZATION")
    return [pair for pair in tagged_nouns if pair[1] in wanted]
class AnsweringModule:
    """Question-answering pipeline (Python 2): classifies a question,
    strips stopwords, then scores content sentences for word/synonym
    overlap and prints the matching sentences as candidate answers."""

    def __init__(self, rawQueryFile, contentFile):
        self.rawQuery = ''
        self.content = ''
        # Question-word lexicons used to classify the question.
        self.questionTypeWH = ["how many", "who", "what", "where", "when", "why", "which", "how"]
        self.questionTypeFactoid1 = ["do", "did", "does"]
        self.questionTypeFactoid2 = ["is", "are", "has", "have", "had", "was", "were", "would", "will", "should", "can", "could"]
        self.questionTypeOther = ["how", "list", "describe"]
        self.stopWords = ("the","a","an","am","of","by","at","be","on","or","any","in","to","as","its","it")
        self.negationWords = ("none", "not", "no", "can't", "couldn't", "don't", "won't","neither","nobody","nowhere","nothing")
        # Question categories returned by DefineQuestionType.
        self.allTypes = ("WHType", "YesNo", "List", "None")
        self.cur_dir = os.getcwd()
        self.NERTaggerObj = StanfordNERTagger(self.cur_dir+'/PythonScripts/stanford-ner-2014-06-16/classifiers/english.all.3class.distsim.crf.ser.gz',self.cur_dir+'/PythonScripts/stanford-ner-2014-06-16/stanford-ner.jar')
        self.qWord = ""
        # The whole pipeline runs from the constructor.
        self.Initialize()
        self.Main()

    def Initialize(self):
        """Load and normalize the question and content text."""
        # STEP 0: Convert HTML to Raw text using Beautiful Soup
        # NOTE(review): rawQueryFile/contentFile are not attributes of self;
        # they resolve only if module-level names of the same name exist --
        # the constructor parameters are NOT stored. Verify against callers.
        self.rawQuery = convertHTMLtoRawText(rawQueryFile)
        self.content = convertHTMLtoRawText(contentFile)
        # Replace EOL character
        self.rawQuery = self.rawQuery.strip().replace("?","").replace('"','').replace(".", " ")
        self.campusLocations = {}

    # Replaces the stopwords and the qWord from the question
    def ReplaceStopQuestionWords(self, question):
        for stopWord in self.stopWords:
            pattern = r'\b%s\b'% stopWord
            question= re.sub(pattern,"",question,flags=re.IGNORECASE)
        # Remove the question word itself (e.g. "where").
        pattern = r'\b%s\b' % self.qWord
        questionWithOutJunk = re.sub(pattern,"",question,flags = re.IGNORECASE)
        # NOTE(review): this pattern is never used (and r'?$' would be an
        # invalid regex if compiled); the plain replace below does the work.
        pattern = r'?$'
        questionWithOutJunk = questionWithOutJunk.replace('?','')
        return questionWithOutJunk

    # Given the input question, this method returns the qWord and the type of question
    def DefineQuestionType(self, question):
        """Return (question type, question word) for *question*."""
        questionLC = question.lower()
        # Check first word in sentence
        wordsInSentence = questionLC.split()
        if wordsInSentence[0] in self.questionTypeWH:
            # Check for How Many type question
            if wordsInSentence[0]+" "+wordsInSentence[1] in self.questionTypeWH:
                return self.allTypes[0], "how many"
            # All other WH Questions
            else:
                return self.allTypes[0], wordsInSentence[0]
        elif wordsInSentence[0] in self.questionTypeFactoid2:
            return self.allTypes[1], wordsInSentence[0]
        elif wordsInSentence[0] in self.questionTypeFactoid1:
            return self.allTypes[1], wordsInSentence[0]
        elif wordsInSentence[0] in self.questionTypeOther:
            return self.allTypes[2], wordsInSentence[0]
        else:
            # For complex sentences, check for question words after comma
            if "," in questionLC:
                wordsInSentence = questionLC.split(",")[1].split()
                if wordsInSentence[0] in self.questionTypeWH:
                    # Check for How many
                    if wordsInSentence[0]+" "+wordsInSentence[1] in self.questionTypeWH:
                        return self.allTypes[0], "how many"
                    # All other WH Questions
                    else:
                        return self.allTypes[0], wordsInSentence[0]
                elif wordsInSentence[0] in self.questionTypeFactoid2:
                    return self.allTypes[1], wordsInSentence[0]
                elif wordsInSentence[0] in self.questionTypeFactoid1:
                    return self.allTypes[1], wordsInSentence[0]
                elif wordsInSentence[0] in self.questionTypeOther:
                    return self.allTypes[2], wordsInSentence[0]
                else:
                    return self.allTypes[3], ""
            #As a last resort, look for question word in the entire question sentence
            #We ignore edge cases where there are multiple question words
            else:
                for q in self.questionTypeWH:
                    if q in questionLC:
                        return self.allTypes[0], q
                for q in self.questionTypeFactoid2:
                    if q in questionLC:
                        return self.allTypes[1], q
                for q in self.questionTypeFactoid1:
                    if q in questionLC:
                        return self.allTypes[1], q
                for q in self.questionTypeOther:
                    if q in questionLC:
                        return self.allTypes[2], q
                return self.allTypes[3], ""

    # Given a sentence, checks whether it contains any GHC locations
    def CheckForWHEREAnswer(self, sentence):
        """Return True if *sentence* appears to contain a location, else None."""
        # NOTE(review): iterating a string yields characters, not tokens --
        # this gazetteer check likely intended sentence.split(); confirm.
        for token in sentence:
            if token.lower() in self.campusLocations:
                return True
        sentTokens = nltk.word_tokenize(sentence)
        NERtags = self.NERTaggerObj.tag(sentTokens)
        NERtags = chunkNEROutput(NERtags)
        countOfOccurence = 0
        for i in xrange(0,len(NERtags)):
            if 'LOCATION' in NERtags[i]:
                ans, tag = NERtags[i]
                if ans in sentence:
                    countOfOccurence += 1
                    continue
                return True
        if countOfOccurence > 1:
            return True
        return None

    # Given a sentence, checks whether it contains time stamps
    def CheckForWHENAnswer(self, sentence):
        """Return True if *sentence* contains a timestamp or era marker, else None."""
        timeStamp = getTimeStamp(sentence)
        if timeStamp is not None:
            return True
        # Covered Edge case for "second century AD"
        for timeEvent in {"AD", "BCE", "BC"}:
            # Era marker must be delimited by non-letters on both sides.
            pattern = r'[^a-zA-Z]%s[^a-zA-Z]' % timeEvent
            if re.search(pattern, sentence) is not None:
                return True
        return None

    # Given the content and question, this method extracts the matching sentences
    def CheckForMatch(self, content, question):
        """Score each content sentence by query-word/synonym overlap and
        print sentences whose overlap is at least 40% of the query length."""
        result = ''
        # Tokenize the question
        queryTokens, queryPosTags, queryMorphTokens = tokenize(question)
        lengthOfQuery = len(queryMorphTokens)
        # Split the entire content into Sentences (abbreviation-aware split).
        contentSentences = re.split(r'(?<!\w\.\w.)(?<![A-Z][a-z]\.)(?<=\.|\?|\n)\s', content)
        for idx, sentence_level1 in enumerate(contentSentences):
            for sentence in sentence_level1.split("\n"):
                sentence = sentence.replace("<p>","").replace("</p>","").replace(".","")
                sentence_processed = sentence
                sentence_processed = sentence_processed.replace(". ","").replace("<a href"," <a href")
                sentTokens, sentPosTags, sentMorphTokens = tokenize(sentence_processed)
                counter = 0.0
                for word in queryMorphTokens:
                    # Get list of synonyms of the query word
                    synonyms = synonyms_as_set(word, queryTokens[queryMorphTokens.index(word)])
                    # For each word in the query, we check if the word occurs in the sentence
                    if word in sentMorphTokens:
                        counter+=1.0
                    else:
                        # OR check if the word is a synonym of a word in the sentence
                        # i.e. one of the synonyms exist in the sentence
                        for syn in synonyms:
                            if syn in sentMorphTokens:
                                counter+=1.0
                                break
                matchPercent = float(counter/lengthOfQuery)
                if matchPercent >= 0.4:
                    # ADDITIONAL CHECKS:
                    # WHERE TYPE QUESTIONS SHOULD HAVE LOCATION IN SENTENCE:
                    if self.qWord == "where":
                        if self.CheckForWHEREAnswer(sentence_processed) == None:
                            # Check for neighboring sentences to extract the location
                            if (idx+1) <= len(contentSentences)-1:
                                fwdSentence = contentSentences[idx+1]
                                if self.CheckForWHEREAnswer(fwdSentence) == True:
                                    sentence = sentence + "\n" + fwdSentence
                            if (idx-1) >= 0:
                                bckSentence = contentSentences[idx-1]
                                if self.CheckForWHEREAnswer(bckSentence) == True:
                                    sentence = bckSentence + "\n" + sentence
                    # WHEN TYPE QUESTIONS SHOULD HAVE TIME IN SENTENCE:
                    if self.qWord == "when":
                        if self.CheckForWHENAnswer(sentence_processed) == None:
                            # If the current sentence doesn't have a date. We check for previous sentence
                            if (idx+1) <= len(contentSentences)-1:
                                fwdSentence = contentSentences[idx+1]
                                if self.CheckForWHENAnswer(fwdSentence) == True:
                                    sentence = sentence + "\n" + fwdSentence
                            if (idx-1) >= 0:
                                bckSentence = contentSentences[idx-1]
                                if self.CheckForWHENAnswer(bckSentence) == True:
                                    sentence = bckSentence + "\n" + sentence
                    result = result + sentence + "\n"
        print result

    def Main(self):
        """Run the full pipeline: classify, clean, then match."""
        # STEP 1: Define Question Type:
        qType, self.qWord = self.DefineQuestionType(self.rawQuery)
        if self.qWord == "where":
            self.campusLocations = getCampusLocation(self.cur_dir+"/PythonScripts/gazeteer/campusLocations.txt")
        # STEP 2: Remove stop words and junk from the question
        questionWithOutJunk = self.ReplaceStopQuestionWords(self.rawQuery)
        # STEP 3:
        self.CheckForMatch(self.content, questionWithOutJunk)
import wikipedia as wiki
from bs4 import BeautifulSoup
from nltk.tag.stanford import StanfordNERTagger
from nltk.tokenize import word_tokenize
from unidecode import unidecode
from datetime import datetime

from indexers import TextCollection, TextGroup

# NOTE(review): `os` and `re` are used below but not imported in this span;
# presumably imported earlier in the original file -- confirm.

# Path to java
jpath2 = 'C:/Program Files (x86)/Java/jre1.8.0_73/bin'
# Setting Java environment
os.environ['JAVAHOME'] = jpath2
# initializing stanford NER
# NOTE(review): backslashes in these Windows paths are unescaped; raw strings
# (r'...') or forward slashes would be safer.
st = StanfordNERTagger('C:\stanford-ner-2014-06-16\classifiers\english.muc.7class.distsim.crf.ser.gz', 'C:\stanford-ner-2014-06-16\stanford-ner.jar', encoding='UTF-8')

def clean(in_txt: str) -> str:
    """Normalize scraped text: strip pronunciation fragments, dashes and
    mis-encoded characters, then expand year ranges into 'YYYY to ...' form."""
    # drop '/.../; ' pronunciation fragments
    in_txt = re.sub("/.*/; ", "", in_txt)
    # en-dash -> plain hyphen
    in_txt = re.sub("–", "-", in_txt)
    # remove stray backslashes
    in_txt = re.sub(r"\\", "", in_txt)
    # mis-decoded non-breaking space -> regular space
    in_txt = re.sub("Â\xa0", " ", in_txt)
    # transliterate any remaining non-ASCII characters
    in_txt = unidecode(in_txt)
    # 'YYYY-YY' / 'YYYY/YY' -> 'YYYY to YYYY' (reuses the century digits)
    indices = [m.start() for m in re.finditer('(\d{4}(-|/)\d{2})', in_txt)]
    for i in reversed(indices):
        in_txt = in_txt[0:(i + 4)] + " to " + in_txt[i:i + 2] + in_txt[i + 5:]
    # 'YYYY-YYYY' / 'YYYY/YYYY' -> 'YYYY to YYYY'
    indices = [m.start() for m in re.finditer(r'\d{4}(-|/)\d{4}', in_txt)]
    for i in reversed(indices):
        in_txt = in_txt[:i + 4] + ' to ' + in_txt[i + 5:]
    # NOTE(review): function body appears truncated here (no return statement
    # visible); confirm against the original file.
def entitiy_rec(text):
    """Run Stanford NER over *text* and return the (token, tag) pairs."""
    model_path = "ner/classifiers/english.all.3class.distsim.crf.ser.gz"
    jar_path = "ner/stanford-ner.jar"
    tagger = StanfordNERTagger(model_path, jar_path)
    return tagger.tag(text)
def process(self, parameters=None, data=None):
    """Tag the document's text fields with Stanford NER and append the
    recognized entities to mapped facet fields in *data*.

    :param parameters: plugin configuration (mapping, classifier choice,
        jar path, java options, verbose flag)
    :param data: the document's fields; mutated in place
    :return: the (parameters, data) pair
    """
    # BUGFIX: mutable default arguments ({}) are shared across calls, and
    # this method writes into `data`; use None sentinels instead.
    if parameters is None:
        parameters = {}
    if data is None:
        data = {}

    verbose = False
    if 'verbose' in parameters:
        if parameters['verbose']:
            verbose = True

    # Mapping from NER class label to the Solr-style facet field name.
    if 'stanford_ner_mapping' in parameters:
        mapping = parameters['stanford_ner_mapping']
    else:
        # todo: extend mapping for models with more classes like dates
        mapping = {
            'PERSON': 'person_ss',
            'LOCATION': 'location_ss',
            'ORGANIZATION': 'organization_ss',
            'I-ORG': 'organization_ss',
            'I-PER': 'person_ss',
            'I-LOC': 'location_ss',
            'ORG': 'organization_ss',
            'PER': 'person_ss',
            'LOC': 'location_ss',
            'PERS': 'person_ss',
            'LUG': 'location_ss',
            'MONEY': 'money_ss',
        }

    # default classifier
    classifier = 'english.all.3class.distsim.crf.ser.gz'
    if 'stanford_ner_classifier_default' in parameters:
        classifier = parameters['stanford_ner_classifier_default']

    # set language specific classifier, if configured and document language detected
    if 'stanford_ner_classifiers' in parameters and 'language_s' in data:
        # is a language specific classifier there for the detected language?
        if data['language_s'] in parameters['stanford_ner_classifiers']:
            classifier = parameters['stanford_ner_classifiers'][data['language_s']]

    # if standard classifier configured to None and no classifier for detected language, exit the plugin
    if not classifier:
        return parameters, data

    kwargs = {}
    if 'stanford_ner_java_options' in parameters:
        kwargs['java_options'] = parameters['stanford_ner_java_options']
    if 'stanford_ner_path_to_jar' in parameters:
        kwargs['path_to_jar'] = parameters['stanford_ner_path_to_jar']

    # Concatenate all analyzable text fields into one blob.
    analyse_fields = ['title_txt', 'content_txt', 'description_txt', 'ocr_t', 'ocr_descew_t']
    text = ''
    for field in analyse_fields:
        if field in data:
            text = "{}{}\n".format(text, data[field])

    # classify/tag with class each word of the content
    st = StanfordNERTagger(classifier, encoding='utf8', verbose=verbose, **kwargs)
    entities = st.tag(text.split())

    # compound words of same class to multi word entities (result is a split by
    # class changes instead of split on single words/tokens)
    entities = self.multi_word_entities(entities)

    # if class of entity is mapped to a facet/field, append the entity to this facet/field
    for entity, entity_class in entities:
        if entity_class in mapping:
            if verbose:
                print ( "NER classified word(s)/name {} to {}. Appending to mapped facet {}".format(entity, entity_class, mapping[entity_class]) )
            etl.append(data, mapping[entity_class], entity)
        else:
            if verbose:
                print ( "Since Named Entity Recognition (NER) class {} not mapped to a field/facet, ignore entity/word(s): {}".format(entity_class, entity) )

    # mark the document, that it was analyzed by this plugin yet
    data['enhance_ner_stanford_b'] = "true"

    return parameters, data
class Evaluator:
    """Code-switching tagger/evaluator (Python 2): combines an HMM language
    tagger with English and Spanish Stanford NER classifiers, and can
    annotate a corpus or score output against a gold standard."""

    def __init__(self, cslm, transitions, tags):
        # cslm: character/word language model providing .prob(lang, word)
        self.cslm = cslm
        # transitions: HMM transition probabilities between languages
        self.transitions = transitions
        self.tags = tags
        self.engClassifier = StanfordNERTagger(
            "../stanford-ner-2015-04-20/classifiers/english.all.3class.distsim.crf.ser.gz",
            "../stanford-ner-2015-04-20/stanford-ner.jar")
        self.spanClassifier = StanfordNERTagger(
            "../stanford-ner-2015-04-20/classifiers/spanish.ancora.distsim.s512.crf.ser.gz",
            "../stanford-ner-2015-04-20/stanford-ner.jar")

    def tagger(self, text_list):
        """Tag every word with (language, NE flag, probabilities).

        Returns a list of 7-tuples:
        (word, lang, NE, engProb, spnProb, hmmProb, totalProb).
        """
        hmm = HiddenMarkovModel(text_list, self.tags, self.transitions, self.cslm)
        hmmtags = hmm.generateTags() # generate list of hmm tags
        words = hmm.words # generate list of words
        taggedTokens = []
        prevLang = "Eng"
        engTags = []
        spnTags = []
        engTag = ""
        spanTag = ""
        # Single non-word, non-space character => punctuation candidate.
        token = re.compile(ur'[^\w\s]', re.UNICODE)
        print "Tagging {} words".format(len(words))
        for k, word in enumerate(words):
            # check if punctuation else use hmmtag
            lang = 'Punct' if re.match(token, word) and not word[-1].isalpha() else hmmtags[k]
            lang = 'Num' if word.isdigit() else lang
            # check if word is NE
            if lang != "Punct":
                # NER is run in batches of 1000 words for speed; the batch is
                # refreshed when k crosses a multiple of 1000.
                # NOTE(review): if a punctuation token falls on a batch
                # boundary the refresh is skipped and engTags/spnTags may be
                # stale or empty (IndexError on the very first batch) -- verify.
                index = k % 1000
                if index == 0:
                    engTags = self.engClassifier.tag(words[k:k+1000])
                    spnTags = self.spanClassifier.tag(words[k:k+1000])
                engTag = engTags[index][1]
                spanTag = spnTags[index][1]
            else:
                engTag = "O"
                spanTag = "O"
            # mark as NE if either classifier identifies it
            if engTag != 'O' or spanTag != 'O':
                NE = "{}/{}".format(engTag, spanTag)
            else:
                NE = "O"
            # record probabilities
            if lang in ("Eng", "Spn"):
                hmmProb = round(hmm.transitions[prevLang][lang], 2)
                engProb = round(self.cslm.prob("Eng", word), 2)
                spnProb = round(self.cslm.prob("Spn", word), 2)
                totalProb = (hmmProb + engProb) if lang == "Eng" else (hmmProb + spnProb)
                prevLang = lang
            else:
                hmmProb = "N/A"
                engProb = "N/A"
                spnProb = "N/A"
                totalProb = "N/A"
            taggedTokens.append((word, lang, NE, str(engProb), str(spnProb), str(hmmProb), str(totalProb)))
            #taggedTokens.append((word, lang, NE))
            #print word, lang, NE
        return taggedTokens

    # Tag testCorpus and write to output file
    def annotate(self, testCorpus):
        """Tag *testCorpus* and write a TSV '<name>_annotated.txt' file."""
        print "Annotation Mode"
        with io.open(testCorpus.strip(".txt") + '_annotated.txt', 'w', encoding='utf8') as output:
            text = io.open(testCorpus).read()
            testWords = toWordsCaseSen(text)
            tagged_rows = self.tagger(testWords)
            output.write(u"Token\tLanguage\tNamed Entity\tEng-NGram Prob\tSpn-NGram Prob\tHMM Prob\tTotal Prob\n")
            for row in tagged_rows:
                csv_row = '\t'.join([unicode(s) for s in row]) + u"\n"
                print csv_row
                output.write(csv_row)
            print "Annotation file written"

    # Evaluate goldStandard and write to output file
    def evaluate(self, goldStandard):
        """Compare model tags against *goldStandard* (tab-separated
        token/tag lines) and write accuracies plus per-token verdicts."""
        print "Evaluation Mode"
        with io.open(goldStandard + '_outputwithHMM.txt', 'w', encoding='utf8') as output:
            #create list of text and tags
            lines = io.open(goldStandard, 'r', encoding='utf8').readlines()
            text, gold_tags = [], []
            for x in lines:
                columns = x.split("\t")
                # last two columns hold the token and its gold tag
                text.append(columns[-2].strip())
                gold_tags.append(columns[-1].strip())
            # annotate text with model
            annotated_output = self.tagger(text)
            #tokens, lang_tags, NE_tags = map(list, zip(*annotated_output))
            tokens, lang_tags, NE_tags, engProbs, spnProbs, hmmProbs, totalProbs = map(list, zip(*annotated_output))
            # set counters to 0
            langCorrect = langTotal = NECorrect = NETotal = 0
            evaluations = []
            # compare gold standard and model tags
            for lang, NE, gold in zip(lang_tags, NE_tags, gold_tags):
                if gold in ('Eng', 'Spn'):
                    #evaluate language tags
                    langTotal += 1
                    if gold == lang:
                        langCorrect += 1
                        evaluations.append("Correct")
                    else:
                        evaluations.append("Incorrect")
                # evaluate NE tags
                elif gold == "NamedEnt":
                    NETotal += 1
                    if NE != 'O':
                        NECorrect += 1
                        evaluations.append("Correct")
                    else:
                        evaluations.append("Incorrect")
                # don't evaluate punctuation
                else:
                    evaluations.append("NA")
            #write
            output.write(u"Language Accuracy: {}\n".format(langCorrect / float(langTotal)))
            output.write(u"NE Accuracy: {}\n".format(NECorrect / float(NETotal)))
            output.write(u"Token\tGold Standard\tTagged Language\tNamed Entity\tEvaluation\n")
            for all_columns in zip(text, gold_tags, lang_tags, NE_tags, evaluations):
                output.write(u"\t".join(all_columns) + u"\n")
            print "Evaluation file written"
def __init__(self):
    """Set up the Stanford NER tagger used by this instance."""
    model = '/stanford_ner/classifiers/english.all.3class.distsim.crf.ser.gz'
    jar = '/stanford_ner/stanford-ner.jar'
    self.tagger = StanfordNERTagger(model, jar)
def ner(datasetfile, format, language):
    """Run Stanford NER over a tweet dataset and return BIO-tagged text.

    :param datasetfile: path to an XML/NIF dataset, or the raw text itself
        when format == "text"
    :param format: 'xml', 'nif' or 'text'
    :param language: 'english' or 'spanish' (selects the classifier)
    :return: one line per tweet of 'word/TAG' tokens, with '||<tweetid>'
        appended when tweet ids are available
    """
    tweets = ""
    tweetids = []
    if language == 'english':
        st = StanfordNERTagger(BASEPATH+'/classifiers/english.all.3class.distsim.crf.ser.gz', BASEPATH+'/classifiers/stanford-ner.jar', encoding='utf8')
    elif language == 'spanish':
        st = StanfordNERTagger(BASEPATH+'/classifiers/spanish.ancora.distsim.s512.crf.ser.gz', BASEPATH+'/classifiers/stanford-ner.jar', encoding='utf8')
    if format == 'xml':
        dataset = etree.parse(datasetfile)
        for tweet in dataset.xpath('//Tweet'):
            tweetText = tweet.xpath('./TweetText/text()')[0]
            # keep word-ish tokens and selected punctuation as separate tokens
            tweets += ' '.join(re.findall(r"[\w:/!#$%&*+,\-:;?@^_`{|}~.]+|[\"'()[\]<=>]", tweetText))+"\n"
            tweetids.append(tweet.xpath('./TweetId/text()')[0])
        tweets = tweets.encode('utf-8')
    elif format == "nif":
        tweetdict = {}
        a = rdflib.Graph()
        a.parse(datasetfile, format='n3')
        for s, p, o in a:
            if s.endswith(',') and p.endswith('isString'):
                tweetid = s.split('#')[0].split('.xml/')[1]
                tweetdict[tweetid] = ' '.join(re.findall(r"[\w:/!#$%&*+,\-:;?@^_`{|}~.]+|[\"'()[\]<=>]", o))
        # emit tweets in sorted-id order so ids and lines stay aligned
        for key in sorted(tweetdict):
            tweetids.append(key)
            tweets += tweetdict[key]+'\n'
        tweets = tweets.encode('utf-8')
    elif format == "text":
        tweets = datasetfile
    tweetlist = []
    for t in tweets.splitlines():
        newtweet = []
        for word in t.split():
            # split trailing punctuation into its own token
            if word.endswith(",") or word.endswith(".") or word.endswith(")") or word.endswith("\'"):
                newtweet.append(word[:-1])
                newtweet.append(word[-1])
            else:
                newtweet.append(word)
        tweetlist.append(newtweet)
    results = ''
    tagged = []
    for tweet in tweetlist:
        tagged.append(st.tag(tweet))
    inEntity = False
    # BUGFIX: the original indexed tweetids[x] with an undefined name `x`
    # (NameError whenever tweetids was non-empty); enumerate supplies the
    # tweet index aligned with the tweetids list.
    for x, line in enumerate(tagged):
        for (word, entity) in line:
            # convert the flat tags to BIO: first token of a run gets B-,
            # the following tokens of the same run get I-.
            if entity != 'O' and inEntity:
                entity = 'I-'+entity
            elif entity != 'O' and inEntity == False:
                entity = 'B-'+entity
                inEntity = True
            else:
                inEntity = False
            results += word + '/' + entity + ' '
        if tweetids:
            results += "||"+tweetids[x]
        results += "\n"
    return results
from __future__ import print_function
from nltk.tag.stanford import StanfordNERTagger
from nltk.tokenize import word_tokenize
import nltk
import sys
import os

# Script: read the text file given as argv[1], collect runs of consecutive
# NER-tagged tokens (tag != 'O'), and write one entity per line to a
# sibling '<name>.ne' file as 'tok tok ..._TAG'.
tagger = StanfordNERTagger('stanford-ner-2017-06-09/classifiers/english.conll.4class.distsim.crf.ser.gz', path_to_jar='stanford-ner-2017-06-09/stanford-ner.jar')
print(sys.argv[1])
with open(sys.argv[1]) as fin:
    current_entity = []  # run of consecutive (token, tag) pairs currently open
    entities = []        # completed runs
    for line in fin:
        for token,tag in tagger.tag(word_tokenize(line)):
            if tag != 'O':
                current_entity.append((token,tag))
            else:
                # an untagged token closes the current run
                if current_entity != []:
                    entities.append(current_entity)
                current_entity = []
    # flush a run left open at end of input
    if current_entity != []:
        entities.append(current_entity)
with open(os.path.splitext(sys.argv[1])[0]+'.ne','w') as fout:
    for entity in entities:
        # label each entity with the tag of its first token
        print('%s_%s'%(' '.join([tok for tok,tag in entity]),entity[0][1]),file=fout)
#!/bin/env python3.5
# Interactive demo: prompt for a sentence, strip punctuation, and print
# Stanford NER tags for each sentence's tokens.
from nltk.tag.stanford import StanfordNERTagger
from nltk.internals import find_jars_within_path
from nltk.tokenize import sent_tokenize
import os

tagger = StanfordNERTagger('data/stanford-ner-2015-12-09/classifiers/english.all.3class.distsim.crf.ser.gz', 'data/stanford-ner-2015-12-09/stanford-ner.jar')
# BUGFIX: os.getcwd() + 'data/...' concatenated without a path separator,
# producing e.g. '/home/userdata/stanford-ner-2015-12-09'; os.path.join
# builds the intended path.
tagger._stanford_jar = ':'.join(find_jars_within_path(os.path.join(os.getcwd(), 'data/stanford-ner-2015-12-09')))
print(tagger.tag_sents([''.join([c for c in x if c not in '",:.?/!@#$%^&*()][{}~']).split() for x in sent_tokenize(input('Enter a sentence: '))]))
def main():
    """Annotate every test/*/*/*.pos file with NER tags plus a Wikipedia URL
    per entity, writing the result next to each input as '<file>.ent.aut'."""
    pathname = "test/*/*/*.pos"
    directory = glob.glob(pathname)
    for f in directory:
        with open(f) as readfile:
            print(f)
            print("Processing...")
            result = []    # rows of [word, tag, wikiurl]
            history = []   # trailing run of (word, tag) pairs sharing one tag
            chunk = False  # completed multi-word run awaiting a shared lookup
            # Collect words (4th whitespace column of each non-empty line)
            words = [line.split()[3] for line in readfile if len(line.split()) > 1]
            # NER tag using Stanford
            # NOTE(review): the tagger is re-created for every file; hoisting
            # it out of the loop would avoid restarting Java each time.
            stanford = StanfordNERTagger('stanford-ner-2014-06-16/classifiers/english.conll.4class.distsim.crf.ser.gz', 'stanford-ner-2014-06-16/stanford-ner-3.4.jar')
            tagged = stanford.tag(words)
            for word_tuple in tagged:
                word = word_tuple[0]
                tag = word_tuple[1]
                # Determine new tag
                new_tag = transform_tag(tag, word, words)
                # Determine chunk: a tag change closes the current run; runs of
                # 2+ tokens with a truthy tag become a multi-word chunk.
                if len(history) > 0 and history[-1][1] != new_tag:
                    if len(history) > 1 and history[-1][1]:
                        chunk = " ".join([tpl[0] for tpl in history])
                    history = []
                new_word_tuple = (word, new_tag)
                history.append(new_word_tuple)
                # Search wikipedia page
                # Process chunk: retro-fit the chunk's shared URL onto the
                # already-emitted rows of its tokens.
                if chunk:
                    wikiurl = wiki_search(chunk)
                    chunk_length = len(chunk.split())
                    old_combis = result[-chunk_length:]
                    result = result[:-chunk_length]
                    for old_combi in old_combis:
                        new_combi = old_combi[:-1]
                        new_combi.append(wikiurl)
                        result.append(new_combi)
                    chunk = False
                # Process word: tagged words get their own URL lookup
                if new_tag:
                    wikiurl = wiki_search(word)
                else:
                    wikiurl = ""
                result.append([word, new_tag, wikiurl])
        # Write results to .ent.aut file
        with open(f) as readfile2, open(f + ".ent.aut", "a") as writefile:
            print("Writing...")
            n = 0
            for line in readfile2:
                if len(line) > 1:
                    # append '<tag> <wikiurl>' to the original line
                    new_line = line.rstrip() + " " + result[n][1] + " " + result[n][2]
                    print(new_line, file=writefile)
                    n += 1
import os
from nltk.tag.stanford import StanfordNERTagger

# Optional Windows setup, kept for reference:
# java_path = "C:/Program Files/Java/jdk1.8.0_05/bin/java.exe"
# os.environ['JAVAHOME'] = java_path
# path2 = 'C:/Users/Pantelis/Desktop/stanford-ner'

# Python 2 script: tag a whole book, write all (token, tag) pairs, all
# PERSON tokens, and the list of unique PERSON tokens to separate files.
st = StanfordNERTagger('classifiers/english.all.3class.distsim.crf.ser.gz', 'stanford-ner.jar')
f1 = open('Dostoyevski_TheGambler.txt','r')
f2 = open('Dostoyevsky_TheGambler_Results.txt','w')
f3 = open('Dostoyevsky_TheGambler_PERSONS.txt','w')
f4 = open('Dostoyevsky_TheGambler_Unique_PERSONS.txt','w')
book=f1.read()
persons =[]
#print book
results2= st.tag(book.split())
for name,entity in results2:
    print name +" " + entity
    f2.write(name +" " + entity+"\n" )
    if entity == "PERSON":
        f3.write(name +"\n")
        # track first occurrence only for the unique-persons file
        if name not in persons:
            persons.append(name)
for k in persons:
    f4.write(k+"\n")
# NOTE(review): none of f1-f4 is ever closed; with-blocks would be safer.
def nertag(text):
    """Print Stanford NER tags for each whitespace token of *text*."""
    model = 'stanford-ner-2015-12-09/classifiers/english.conll.4class.distsim.crf.ser.gz'
    jar = 'stanford-ner-2015-12-09/stanford-ner-3.6.0.jar'
    tagger = StanfordNERTagger(model, jar)
    tokens = text.split()
    print(tagger.tag(tokens))
def o_tag():
    """Returns a noun with a tag if the tag is unfindable or a location"""
    tagger = StanfordNERTagger(
        'stanford-ner-2014-06-16/classifiers/english.all.3class.distsim.crf.ser.gz',
        'stanford-ner-2014-06-16/stanford-ner-3.4.jar')
    tagged_nouns = tagger.tag(get_nouns())
    return [pair for pair in tagged_nouns if pair[1] in ('O', "LOCATION")]
'''
Created on Apr 12, 2016

@author: zhongzhu
'''
from nltk.parse.stanford import StanfordParser
from nltk.tag.stanford import StanfordNERTagger
from nltk.tokenize import word_tokenize

import script_wrapper as stanford_parser

# Demo: tag one sentence and group consecutive tokens sharing the same NER
# tag into multi-word entity strings keyed by tag name.
sentence = "Dempsey was drafted by Major League Soccer club New England Revolution."
st = StanfordNERTagger(model_filename='../lib/stanford-ner-2015-12-09/classifiers/english.all.3class.distsim.crf.ser.gz', path_to_jar="../lib/stanford-ner-2015-12-09/stanford-ner-3.6.0.jar")
tags = st.tag(word_tokenize(sentence))
print(tags)

prev_tag_name = tags[0][1]  # tag of the entity currently being built
cur_entity = tags[0][0]     # tokens of the entity currently being built
entities = {}
for i in range(1, len(tags)):
    cur_tag = tags[i]
    cur_token = cur_tag[0]
    cur_tag_name = cur_tag[1]
    if cur_tag_name == prev_tag_name:
        # same tag: extend the current entity
        cur_entity = cur_entity + " " + cur_token
    else:
        # tag changed: flush the finished entity under its tag
        if not prev_tag_name in entities:
            entities[prev_tag_name] = []
        entities[prev_tag_name].append(cur_entity)
        cur_entity = cur_token
        # BUGFIX: without advancing prev_tag_name every later token is
        # compared against the first tag, so entities are grouped under the
        # wrong key (cf. the named_entities() helper elsewhere in this
        # project, which includes this assignment).
        prev_tag_name = cur_tag_name
def __init__(self, filepath):
    """Remember the target file and build the regex and NER extractors."""
    self.filepath = filepath
    self.parser = CommonRegex()
    classifier_path = 'classifiers/english.conll.4class.distsim.crf.ser.gz'
    self.standford_ner = StanfordNERTagger(classifier_path)
def stanfordNERExtractor(sentence):
    """Tag each whitespace token of *sentence* with Stanford NER labels."""
    model = baseLocation + 'english.muc.7class.distsim.crf.ser.gz'
    jar = baseLocation + 'stanford-ner.jar'
    tagger = StanfordNERTagger(model, jar)
    tokens = sentence.split()
    return tagger.tag(tokens)
class StanfordNERTaggerExtractor(object):
    """Wrapper around NLTK's StanfordNERTagger that tags raw text and
    extracts multi-word entity strings for a given NER class."""

    def __init__(self):
        self.st = StanfordNERTagger('intent_class_models/stanford-jars/english.all.3class.distsim.crf.ser.gz' , "intent_class_models/stanford-jars/stanford-ner.jar" )
        # self.st = StanfordNERTagger('english.all.3class.distsim.crf.ser.gz' ,
        #                             'stanford-ner.jar' )

    def tag_text_single(self,text):
        '''
        Tag a single piece of text.

        :param text: raw text to tokenize and tag
        :return: list of (word, NER tag) pairs
        '''
        # assert type(text) == str
        sents = self.st.tag(nltk.word_tokenize(text))
        return sents

    def identify_NER_tags_single(self,text_tag,tag_to_find):
        '''
        Collect entity strings of one class from tagged text.

        :param text_tag: Tagged text -- list of (word, tag) pairs
        :param tag_to_find: NER class to extract (e.g. 'PERSON')
        :return: de-duplicated list of entity strings; consecutive words
            with the target tag are joined into one space-separated string
        '''
        tag_strs = []
        prev_wrd_tag = False  # True while inside a run of the target tag
        for wrd,tag in text_tag:
            if tag == tag_to_find:
                if not prev_wrd_tag:
                    tag_strs.append(wrd)
                else:
                    # extend the entity built so far with this word
                    prev_wrd = tag_strs.pop()
                    new_wrd = prev_wrd+' '+wrd
                    tag_strs.append(new_wrd)
                prev_wrd_tag = True
            else:
                prev_wrd_tag = False
        # de-duplicate while preserving first-seen order
        tags_final = []
        for wrd in tag_strs:
            if wrd not in tags_final:
                tags_final.append(wrd)
        return tags_final

    def tag_text_multi(self,text):
        '''
        Sentence-split *text* and tag each sentence separately.

        :return: list of tagged sentences (one (word, tag) list per sentence)
        '''
        tokenized_sents = [nltk.word_tokenize(sent) for sent in nltk.sent_tokenize(text)]
        return self.st.tag_sents(tokenized_sents)

    def identify_NER_tags_multi(self,text_tag,tag_to_find):
        '''
        Collect entity strings of one class across several tagged sentences,
        de-duplicated over the whole input.
        '''
        tag_strs = []
        for sent_tag in text_tag:
            for wrd in self.identify_NER_tags_single(sent_tag,tag_to_find):
                if wrd not in tag_strs:
                    tag_strs.append(wrd)
        return tag_strs

    def tag_text_multi_from_single(self,ner_tags):
        ''' converting a huge single text tags into sentence based tags
        this is done because tagging sentence wise is slow. so we tag the
        entire text and split them after'''
        # Rebuild the raw text from the tagged words, then re-split it into
        # sentences and hand each sentence its slice of the original tags.
        sents = ''
        for wrd,_ in ner_tags:
            sents += wrd+' '
        sent_tags = [nltk.word_tokenize(sent) for sent in nltk.sent_tokenize(sents)]
        cnt = 0  # running index into ner_tags, advanced across sentences
        final_tags = []
        for sent_ind in range(len(sent_tags)):
            sent_tag_list = []
            for wrd_ind in range(len(sent_tags[sent_ind])):
                # NOTE(review): the bare except guards cnt running past the
                # end of ner_tags when re-tokenization yields more tokens
                # than the original tagging -- alignment is best-effort.
                try:
                    sent_tag_list.append(ner_tags[cnt])
                    cnt += 1
                except:
                    break
            final_tags.append(sent_tag_list)
        return final_tags
#!/usr/bin/python from nltk.tag.stanford import StanfordNERTagger import operator import re fileList = open("fileList.txt", "r") fileName = fileList.readlines() fileList.close() outfile = open("country.txt", "w") english_nertagger = StanfordNERTagger('/Users/stellamberv/Documents/stanford-ner-2014-08-27/classifiers/english.muc.7class.distsim.crf.ser.gz','/Users/stellamberv/Documents/stanford-ner-2014-08-27/stanford-ner.jar') for i in range(len(fileName)): oneFile = open(fileName[i].rstrip(), "r") oneFileContent = oneFile.read() oneFile.close() str_split = english_nertagger.tag(oneFileContent.split()) j = 0 country = "" while j < len(str_split): if str_split[j][1] == u'LOCATION': country = country + " " + (str_split[j][0]).encode("utf-8") j = j + 1 else: j = j + 1 if len(country) == 0: continue else:
def __init__(self):
    """Create the Stanford NER tagger backing this extractor."""
    classifier = 'intent_class_models/stanford-jars/english.all.3class.distsim.crf.ser.gz'
    ner_jar = "intent_class_models/stanford-jars/stanford-ner.jar"
    self.st = StanfordNERTagger(classifier, ner_jar)