import re

from nltk.tag import StanfordNERTagger


def test_model_in_mem(stanford_ner_path, model_name, sent_obj, type):
    stanford_tagger = StanfordNERTagger(
        model_name,
        stanford_ner_path,
        encoding='utf-8')

    text = sent_obj.text

    tokenized_text = list()
    spans = list()
    # Recover the character span of each whitespace-delimited token
    for match in re.finditer(r"\S+", text):
        start = match.start()
        end = match.end()
        word = match.group(0)
        tokenized_text.append(word.rstrip(",.;:)("))
        spans.append((start, end))

    tokenized_text = strip_sec_headers_tokenized_text(tokenized_text)
    classified_text = stanford_tagger.tag(tokenized_text)

    # Expand each (token, tag) tuple to carry its span as well.
    # Headers were stripped above, so if that occurred we have to
    # account for the offset between spans and classified tokens.
    len_diff = len(spans) - len(classified_text)
    final_class_and_span = list()
    for idx, tup in enumerate(classified_text):
        combined = (classified_text[idx][0], classified_text[idx][1],
                    spans[idx + len_diff][0], spans[idx + len_diff][1])
        final_class_and_span.append(combined)

    # print(classified_text)
    sent_obj.sentence_attribs.extend(get_attributes(final_class_and_span))
    return sent_obj
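# A minimal usage sketch for test_model_in_mem (it assumes the module's own
# strip_sec_headers_tokenized_text and get_attributes helpers are in scope;
# the Sentence class and the paths below are hypothetical placeholders).
class Sentence(object):
    def __init__(self, text):
        self.text = text
        self.sentence_attribs = []

sent = test_model_in_mem(
    '/path/to/stanford-ner.jar',                       # hypothetical jar path
    '/path/to/english.all.3class.distsim.crf.ser.gz',  # hypothetical model path
    Sentence('Barack Obama visited Paris in 2009.'),
    type=None)
print(sent.sentence_attribs)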
import os

import nltk
from nltk.tag import StanfordNERTagger


def ner_tag(questions):
    path = 'C:\\Users\\Martin\\PycharmProjects\\xserpy\\stanford-nlp\\'
    st_ner = StanfordNERTagger(
        path + 'classifiers\\english.all.3class.distsim.crf.ser.gz',
        path + 'stanford-ner.jar')
    java_path = "C:\\Program Files\\Java\\jdk1.8.0_65\\bin\\java.exe"
    os.environ['JAVAHOME'] = java_path
    tagged = []
    for q in questions:
        text = nltk.word_tokenize(q.utterance)
        tagged.append(st_ner.tag(text))
    return tagged
import os
import sys

from nltk.tag import StanfordNERTagger


class NERHandler(object):
    """ handler class for the Stanford NER """

    def __init__(self):
        """
        constructor for the NERHandler class
        :return:
        """
        if sys.platform.startswith('win'):
            os.environ['CLASSPATH'] = os.path.dirname(
                __file__) + "\\stanford-ner-2015-12-09\\"
            os.environ['STANFORD_MODELS'] = os.path.dirname(
                __file__) + "\\stanford-ner-2015-12-09\\classifiers\\"
            os.environ['JAVAHOME'] = "C:\\Program Files\\Java\\jre1.8.0_91"
        else:
            os.environ['CLASSPATH'] = os.path.dirname(
                __file__) + "/stanford-ner-2015-12-09/"
            os.environ['STANFORD_MODELS'] = os.path.dirname(
                __file__) + "/stanford-ner-2015-12-09/classifiers/"
        self.st = StanfordNERTagger('english.all.3class.distsim.crf.ser.gz')

    def tag(self, text):
        """
        search for named locations within text
        :param text: list of strings containing the text to be searched
        :return: list of locations
        """
        text = '. '.join(text)
        tags = self.st.tag(text.split())
        # If a token is tagged 'LOCATION', collect it; note that a single
        # location can span multiple consecutive tags.
        i = 0
        locations = []
        while i < len(tags):
            location = []
            if tags[i][1] == "LOCATION":
                location.append(tags[i][0])
                i += 1
                # Bounds check avoids an IndexError when a LOCATION run
                # ends the tag list.
                while i < len(tags) and tags[i][1] == "LOCATION":
                    location.append(tags[i][0])
                    i += 1
                locations.append(' '.join(location))
            else:
                i += 1
        locations = list(set(locations))
        return locations
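# A minimal usage sketch for NERHandler, assuming the stanford-ner-2015-12-09
# directory sits next to this module, as the constructor expects.
handler = NERHandler()
print(handler.tag(["I flew from New York to San Francisco", "Berlin was cold"]))
# e.g. ['New York', 'San Francisco', 'Berlin']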
import os

import nltk
from nltk.tag import StanfordNERTagger


def ner_tag(questions, path, java_path):
    """Tag each word in a given set of questions with an NER tag, then return a list of lists of tags

    Keyword arguments:
    questions -- list of Question objects
    path -- path to the Stanford NLP library
    java_path -- path to the Java executable

    """
    sep = os.path.sep
    # Uses the Stanford NER tagger with a dictionary
    st_ner = StanfordNERTagger(
        path + "classifiers" + sep + "english.all.3class.distsim.crf.ser.gz",
        path + "stanford-ner.jar")
    os.environ['JAVAHOME'] = java_path
    tagged = []
    for q in questions:
        text = nltk.word_tokenize(q.utterance)
        tagged.append(st_ner.tag(text))
    return tagged
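# A minimal usage sketch for ner_tag; Question is a stand-in for the caller's
# question class and the paths are hypothetical placeholders.
class Question(object):
    def __init__(self, utterance):
        self.utterance = utterance

tags = ner_tag([Question("Where was Albert Einstein born?")],
               "/path/to/stanford-ner/", "/usr/bin/java")
print(tags)  # e.g. [[('Where', 'O'), ..., ('Albert', 'PERSON'), ('Einstein', 'PERSON'), ...]]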
def text2graph(text):
    import os

    import networkx as nx
    import pandas as pd
    from nltk import StanfordNERTagger, word_tokenize

    os.environ['JAVAHOME'] = r"C:\Program Files (x86)\Java\jre1.8.0_181\bin\java.exe"
    st = StanfordNERTagger(r'..\..\..\stanford-ner-2018-10-16\classifiers\english.all.3class.distsim.crf.ser.gz',
                           r'..\..\..\stanford-ner-2018-10-16\stanford-ner.jar',
                           encoding='utf-8')

    # merge consecutive tokens with the same (non-'O') class into one entity
    classified_text = st.tag(word_tokenize(text))
    merged_classified_text = []
    full_word = [classified_text[0][0]]
    prev_class = classified_text[0][1]
    for current_word, current_class in classified_text[1:]:
        if current_class != prev_class or current_class == 'O':
            # flush the finished entity (or single 'O' token) and start a new one
            merged_classified_text.append((' '.join(full_word), prev_class))
            full_word = [current_word]
            prev_class = current_class
        else:
            full_word.append(current_word)
    merged_classified_text.append((' '.join(full_word), prev_class))

    # create dataframe of all edges in graph: link PERSON entities that
    # co-occur within a sliding window of win_size entities
    edges = []
    win_size = 20
    half_win_size = int(win_size / 2)
    for i in range(half_win_size, len(merged_classified_text) - half_win_size - 1):
        word, word_type = merged_classified_text[i]
        if word_type != 'PERSON':
            continue
        for neighbor, neighbor_type in merged_classified_text[i - half_win_size:i + half_win_size + 1]:
            if neighbor_type != 'PERSON':
                continue
            edges.append([word, neighbor, i])
    graph_df = pd.DataFrame(edges, columns=['from', 'to', 'time'])
    return nx.from_pandas_edgelist(graph_df, 'from', 'to', 'time', create_using=nx.MultiGraph())
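# A minimal usage sketch for text2graph; the input file name is a hypothetical
# placeholder for any long text containing PERSON entities.
text = open('novel.txt', encoding='utf-8').read()
g = text2graph(text)
print(g.number_of_nodes(), g.number_of_edges())
print(sorted(g.degree, key=lambda kv: kv[1], reverse=True)[:5])  # most-connected PERSON nodes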
# Fragment from a larger loop over events: `event`, `lngp`, `eventsentspr`,
# and the per-language taggers stes / sten / stde come from the enclosing code.
if any([ll not in event.keys() for ll in lngp]):
    continue
eventsentspr.append(
    [event[lngp[0]]['title'], event[lngp[1]]['title']])
# lngpair.append(lngp)

leftNE = []
rightNE = []
for pr in eventsentspr:
    timenow = time.time()
    # split on punctuation and whitespace, then keep every token the
    # language-specific tagger labels with a class other than 'O'
    wordslist = re.split(r'[`\-=~!@#$%^&*()_+\[\]{};\'\\:"|<,./<>? ]', pr[0])
    NElist = []
    if lngp[0] == 'es':
        classified_text = stes.tag(wordslist)
        NElist = [w[0] for w in classified_text if w[1] not in ['o', 'O']]
    if lngp[0] == 'en':
        classified_text = sten.tag(wordslist)
        NElist = [w[0] for w in classified_text if w[1] not in ['o', 'O']]
    if lngp[0] == 'de':
        classified_text = stde.tag(wordslist)
        NElist = [w[0] for w in classified_text if w[1] not in ['o', 'O']]
    leftNE.append(list(set(NElist)))
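# The fragment above assumes the per-language taggers stes, sten, and stde
# already exist. A plausible setup sketch; the directory is a hypothetical
# placeholder and the classifier file names are assumptions based on the
# models distributed with Stanford NER / CoreNLP language packages.
from nltk.tag import StanfordNERTagger

ner_dir = '/path/to/stanford-ner/'  # hypothetical
sten = StanfordNERTagger(ner_dir + 'classifiers/english.all.3class.distsim.crf.ser.gz',
                         ner_dir + 'stanford-ner.jar', encoding='utf-8')
stes = StanfordNERTagger(ner_dir + 'classifiers/spanish.ancora.distsim.s512.crf.ser.gz',
                         ner_dir + 'stanford-ner.jar', encoding='utf-8')
stde = StanfordNERTagger(ner_dir + 'classifiers/german.conll.germeval2014.hgc_175m_600.crf.ser.gz',
                         ner_dir + 'stanford-ner.jar', encoding='utf-8')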
import nltk
import corenlp

nltk.download('punkt')
from nltk import StanfordNERTagger

# NER using NLTK
st = StanfordNERTagger(
    '/home/abin/my_works/nlp/stanford-ner-4.0.0/ner-model.ser.gz',
    '/home/abin/my_works/nlp/stanford-ner-4.0.0/stanford-ner.jar',
    encoding='utf-8')

# The sample sentence must be defined and tokenized before tagging
text = 'Number of glucocorticoid receptors in lymphocytes and their sensitivity to hormone action.'
tokenized_text = nltk.word_tokenize(text)

classified_text = st.tag(tokenized_text)
print(classified_text)

# NER using the stanford-corenlp library.
# Make sure $CORENLP_HOME is set as an environment variable before starting the Stanford CoreNLPClient.
with corenlp.CoreNLPClient(annotators="ner".split(), memory='2G') as client:
    ann = client.annotate(text)
    print(ann)