import nltk
from nltk.tokenize import word_tokenize


def extract_entities_stanford(text):
    ne_tagger = nltk.StanfordNERTagger(
        "E:/Martin/Škola/KnowledgeBase/stanford-ner-2018-02-27/classifiers/english.all.3class.distsim.crf.ser.gz",
        "E:/Martin/Škola/KnowledgeBase/stanford-ner-2018-02-27/stanford-ner.jar")
    chunked = ne_tagger.tag(word_tokenize(text))
    current_chunk = []
    result = []
    ent_type = ""
    for token, tag in chunked:
        if tag != 'O':
            if tag == ent_type or not current_chunk:
                # Same entity type (or nothing buffered yet): keep building.
                current_chunk.append(token)
            else:
                # Entity type changed: flush the finished entity and start a
                # new chunk with the current token (the original dropped it).
                result = create_ne(current_chunk, result)
                current_chunk = [token]
            ent_type = tag
        elif current_chunk:
            # Left an entity span: flush the buffered tokens.
            result = create_ne(current_chunk, result)
            current_chunk = []
    if current_chunk:
        result = create_ne(current_chunk, result)
    return result
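The snippet above relies on a `create_ne` helper that is not shown. A minimal hypothetical sketch of what it might do, assuming it simply joins the buffered tokens into one entity string and appends it to the running result list:

def create_ne(current_chunk, result):
    # Hypothetical helper (not in the source): join the buffered tokens
    # into a single entity string and append it to the result list.
    named_entity = " ".join(current_chunk)
    if named_entity not in result:
        result.append(named_entity)
    return result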
import collections
import itertools
import os

import nltk
import yaml


def extract_entities_per_sentence(text, lib='NLTK'):
    assert lib in ('NLTK', 'StanfordNER'), \
        "The value of 'lib' must either be 'NLTK' or 'StanfordNER'."
    entities = []
    if lib == 'NLTK':
        for sent in nltk.sent_tokenize(text):
            entities_per_sentence = collections.defaultdict(int)
            for chunk in nltk.ne_chunk(nltk.pos_tag(nltk.word_tokenize(sent))):
                if hasattr(chunk, 'label'):
                    entities_per_sentence[
                        (chunk.label(), ' '.join(c[0] for c in chunk))] += 1
            entities.append(entities_per_sentence)
    elif lib == 'StanfordNER':
        # safe_load avoids the mandatory Loader argument of newer PyYAML.
        with open('config.yaml') as config_file:
            config = yaml.safe_load(config_file)
        stanford_ner_location = config.get('defaults', {}).get(
            'depend', {}).get('stanford-ner')
        assert stanford_ner_location, \
            "Please provide the location of the StanfordNER 2015-12-09 directory" \
            " in the defaults->depend->stanford-ner configuration section."
        tokenized_sents = []
        for sent in nltk.sent_tokenize(text):
            # Replace slashes, which can break the tagger's output parsing.
            tokenized_sents.append([token.replace('/', '-')
                                    for token in nltk.word_tokenize(sent)])
        stanford_tagger = nltk.StanfordNERTagger(
            model_filename=os.path.join(
                stanford_ner_location,
                'classifiers/english.all.3class.distsim.crf.ser.gz'),
            path_to_jar=os.path.join(stanford_ner_location,
                                     'stanford-ner-3.6.0.jar'),
            encoding='utf-8')
        ne_tagged_sentences = stanford_tagger.tag_sents(tokenized_sents)
        for ne_tagged_sentence in ne_tagged_sentences:
            entities_per_sentence = collections.defaultdict(int)
            # Group consecutive tokens that share a tag into one entity.
            for tag, chunk in itertools.groupby(ne_tagged_sentence,
                                                lambda x: x[1]):
                if tag != 'O':
                    entities_per_sentence[
                        (tag, ' '.join(w for w, t in chunk))] += 1
            entities.append(entities_per_sentence)
    return entities
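A hedged usage sketch; the `config.yaml` layout in the comment mirrors what the assert above expects and is an assumption, as is the sample text:

# config.yaml (assumed layout, matching the assert above):
# defaults:
#   depend:
#     stanford-ner: /opt/stanford-ner-2015-12-09

text = "Barack Obama visited Paris. Angela Merkel stayed in Berlin."
for sentence_counts in extract_entities_per_sentence(text, lib='NLTK'):
    for (label, entity), count in sentence_counts.items():
        print(label, entity, count)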
import nltk
import ner
import nltk.data
from nltk.corpus.reader import path_similarity, Synset
from nltk.stem import WordNetLemmatizer
import language_check
from nltk.corpus import wordnet as wn
import collections
from langdetect import detect

st = nltk.StanfordNERTagger(
    "nlp/stanford-ner-2015-12-09/classifiers/english.all.3class.distsim.crf.ser.gz",
    "nlp/stanford-ner-2015-12-09/stanford-ner.jar")
tool = language_check.LanguageTool('en-US')
sent_detector = nltk.data.load('tokenizers/punkt/english.pickle')
wordnet_lemmatizer = WordNetLemmatizer()


def check_grammar(text):
    # Find all LanguageTool rule matches and apply the suggested corrections.
    matches = tool.check(text)
    text = language_check.correct(text, matches)
    return text


def get_sentences(text):
    # Split the text into sentences with the pre-trained Punkt model.
    sentenced_text = sent_detector.tokenize(text.strip())
    return sentenced_text
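A hedged usage sketch chaining the two helpers above; the sample text is an assumption:

raw = "He go to school yesterday. She like apples."
corrected = check_grammar(raw)       # LanguageTool fixes the agreement errors
for sentence in get_sentences(corrected):
    print(sentence)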
target = "targetTitle" # "postText" or "targetTitle" prefix = "PT" if target == "postText" else "TA" NORMALIZE = False FEATURES_DATA_PATH = r"../features/pos_features_{}_{}_{}.csv".format( DATASET, target, 'normalized' if NORMALIZE else "no-normalized") print( f"Generating POS features... it might take a while :P\n Path: '{FEATURES_DATA_PATH}' | {target} | {prefix}" ) labeled_instances = get_labeled_instances( "../train_set/instances_converted_{}.pickle".format(DATASET), "../train_set/truth_converted_{}.pickle".format(DATASET)) tagger = nltk.StanfordNERTagger( '../ner/english.all.3class.distsim.crf.ser.gz', '../ner/stanford-ner.jar', encoding='utf-8') tagset = nltk.load("help/tagsets/upenn_tagset.pickle") possible_tags = list(tagset.keys()) ids = list(labeled_instances.id) if target == 'postText': texts = [txt[0] for txt in list(labeled_instances.postText)] else: texts = [txt for txt in list(labeled_instances.targetTitle)] features = [] for idx, txt in enumerate(texts, 1): print(f"Computing features for sample {idx} out of {len(texts)}...") features.append( generate_pos_features(txt,
import nltk


def nerTagger(tokens):
    # Tag a pre-tokenized Chinese sentence with the Stanford Chinese NER model.
    chi_tagger = nltk.StanfordNERTagger(
        model_filename=r'E:\03_tools\machine learning\stanfordnlp\3.7\stanford-chinese-corenlp-2016-10-31-models\edu\stanford\nlp\models\ner\chinese.misc.distsim.crf.ser.gz',
        path_to_jar=r'E:\03_tools\machine learning\stanfordnlp\3.7\stanford-ner-2016-10-31\stanford-ner.jar')
    for word, tag in chi_tagger.tag(tokens):
        print(word, tag)
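A hedged usage sketch; the tagger expects already-segmented tokens, so the hand-segmented list below stands in for the output of a real Chinese word segmenter such as jieba:

# Hand-segmented example input; a real pipeline would use a segmenter
# (e.g. jieba) to produce the token list.
nerTagger(['奥巴马', '访问', '了', '北京'])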
import os

import nltk


def search_for_location(line):
    """
    Verifies if passed in chunks are names of cities/locations

    Assumptions made: if the statement is about weather, then all named
    entities are treated as a location (this includes ORGANIZATION,
    PERSON tags).

    This function utilizes a combination of nltk's built-in pos_tag()
    function and the Stanford NER Tagger, and chooses whichever option
    gives the longer location string.

    @param line: the text to search through
    @return: a string of the found location; an empty string if none is found
    """
    # No path_to_jar is given, so NLTK looks the jar up via the CLASSPATH
    # environment variable.
    ner_tagger = nltk.StanfordNERTagger(
        "%s/dep/stanford-ner/classifiers/english.all.3class.distsim.crf.ser.gz"
        % (os.getcwd()))
    # tags to identify for nltk
    loc_labels = ["GPE", "ORGANIZATION", "PERSON"]
    tree = nltk.ne_chunk(nltk.pos_tag(nltk.word_tokenize(line)))
    # tree.draw()
    tags = ner_tagger.tag(line.split())
    # print(tags)

    # figure out the location from the Stanford tagger
    ner_location = ""
    for tag in tags:
        if tag[1] == "LOCATION":
            ner_location += (tag[0] + " ")
    ner_location = ner_location.strip()
    ner_location = ner_location.strip("?!., ")
    print("ner loc: %s" % (ner_location))

    # figure out the location from the nltk tagger
    location = ""
    # only the top level has the "S" label
    for subtree in tree.subtrees(lambda t: t.label() == "S"):
        for chunk in subtree.subtrees(lambda t: t.height() == 2):
            # some "chunks" are just strings apparently
            if isinstance(chunk, str):
                continue
            # each separate detected chunk will be separated with a comma
            if chunk.label() in loc_labels:
                location_elem = ""
                for word, pos in chunk:
                    location_elem += word + " "
                location_elem = location_elem.strip()
                location += location_elem + ", "
    location = location.strip()
    location = location.strip(" ,")
    print("nltk loc: %s" % (location))
    '''
    if location != "":
        print("found location %s" % (location))
    else:
        print("No location found")
    '''
    return location if len(location) > len(ner_location) else ner_location
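A hedged usage sketch; since the tagger above is constructed without `path_to_jar`, this assumes the Stanford NER jar is discoverable through the CLASSPATH environment variable, and the sample question and output are illustrative only:

# Assumes CLASSPATH points at stanford-ner.jar (see comment in the function).
loc = search_for_location("What is the weather like in New York City?")
print(loc)  # e.g. "New York City", whichever tagger found the longer match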