def parse_document(document, information):
    """Extract named entities from an HTML document with MITIE.

    The document is passed through ``clean_html``, tokenized, and run through
    the English MITIE NER model located under ``mitie_location``.  Progress
    messages are printed to stdout at each stage.

    Parameters
    ----------
    document:
        Raw document text handed to ``clean_html``.
    information:
        Not used by this function; kept so existing callers keep working.

    Returns
    -------
    list of (tag, words) tuples
        One tuple per extracted entity, where ``words`` is the entity's tokens
        joined by single spaces.  Entities whose text is empty are skipped.
    """
    print('Acquiring extractor')
    extractor = named_entity_extractor(
        os.path.join(mitie_location, 'MITIE-models/english/ner_model.dat'))

    print('Cleaning HTML')
    cleaned_document = clean_html(document)

    print('Tokenizing')
    # Build a real list: the tokens are indexed below, which a lazy
    # ``filter`` object (Python 3) would not support.
    tokens = [token for token in tokenize(cleaned_document) if token]

    print('Extracting NER Entities')
    entities = extractor.extract_entities(tokens)
    print('Done')

    normalized_entities = []
    for entity in entities:
        # Index instead of unpacking so that entity tuples carrying extra
        # trailing fields (e.g. a score) are accepted as well.
        position, tag = entity[0], entity[1]
        # Join every token index in the entity's range.  The previous slice
        # ``tokens[first:last]`` excluded the last token, which silently
        # dropped all single-token entities.
        # TODO: join how for other languages?
        words = ' '.join(tokens[index] for index in position)
        if words:
            normalized_entities.append((tag, words))
    return normalized_entities
def tokenize_with_offsets(self, text):
    """Tokenize ``text`` and locate each token in the original string.

    The text is UTF-8 encoded before being handed to ``tokenize`` and every
    returned token is decoded back, so offsets are character offsets into
    ``text`` itself.

    Returns
    -------
    (tokens, offsets)
        ``tokens`` is the list of decoded tokens; ``offsets[i]`` is the index
        in ``text`` at which ``tokens[i]`` starts.

    Raises
    ------
    ValueError
        If a token cannot be found in the not-yet-consumed part of ``text``.
    """
    encoded_text = text.encode('utf-8')
    tokens = [raw.decode('utf-8') for raw in tokenize(encoded_text)]

    offsets = []
    cursor = 0
    for token in tokens:
        # Escape the token so punctuation is matched literally.
        match = re.search(re.escape(token), text[cursor:], re.UNICODE)
        if match is None:
            # Fail with a descriptive error instead of an AttributeError on
            # ``None``; ``{!r}`` keeps the message safe for non-ASCII input.
            raise ValueError(
                "Invalid MITIE offset. Token {!r} in message {!r}.".format(
                    token, text))
        offsets.append(cursor + match.start())
        # Continue after this token so repeated tokens map to successive
        # occurrences.
        cursor += match.end()
    return tokens, offsets
def train(self, training_data, mitie_file, num_threads):
    # type: (TrainingData, str, Optional[int]) -> None
    """Train a MITIE text categorizer on the intent examples.

    Each entry of ``training_data.intent_examples`` is tokenized from its
    ``"text"`` and added to the trainer under its ``"intent"`` label.  The
    trained classifier is stored on ``self.clf``.

    When there are no intent examples the trainer is not run and
    ``self.clf`` is left untouched.
    """
    from mitie import tokenize, text_categorizer_trainer

    trainer = text_categorizer_trainer(mitie_file)
    trainer.num_threads = num_threads
    for example in training_data.intent_examples:
        tokens = tokenize(example["text"])
        trainer.add_labeled_text(tokens, example["intent"])

    # Training is only meaningful once at least one labeled text was added;
    # skip it for an empty training set instead of failing inside MITIE.
    if training_data.intent_examples:
        self.clf = trainer.train()
def tokenize_with_offsets(self, text):
    """Tokenize ``text`` and locate each token in the original string.

    The text is UTF-8 encoded before being handed to ``tokenize`` and every
    returned token is decoded back.

    Returns
    -------
    (tokens, offsets)
        ``tokens`` is the list of decoded tokens; ``offsets[i]`` is the
        character index in ``text`` at which ``tokens[i]`` starts.

    Raises
    ------
    ValueError
        If a token cannot be found in the not-yet-consumed part of ``text``.
    """
    encoded_text = text.encode('utf-8')
    tokens = [raw.decode('utf-8') for raw in tokenize(encoded_text)]

    offsets = []
    cursor = 0
    for token in tokens:
        # Search the decoded text, not the encoded bytes, so offsets count
        # characters; escape the token so characters such as "?" or "(" are
        # matched literally rather than interpreted as regex syntax.
        match = re.search(re.escape(token), text[cursor:], re.UNICODE)
        if match is None:
            raise ValueError(
                "Invalid MITIE offset. Token {!r} in message {!r}.".format(
                    token, text))
        offsets.append(cursor + match.start())
        # Advance past the matched token.  Stopping at its start made a
        # repeated token resolve to the same offset as its predecessor.
        cursor += match.end()
    return tokens, offsets
def features_for_sentences(self, sentences, feature_extractor):
    # type: (List[Text], mitie.total_word_feature_extractor) -> np.ndarray
    """Build a feature matrix with one row per sentence.

    Every sentence is tokenized with ``mitie.tokenize`` and converted to a
    feature vector by ``self.features_for_tokens``.  The matrix has
    ``len(sentences)`` rows and ``self.ndim(feature_extractor)`` columns.
    """
    import mitie
    import numpy as np

    row_count = len(sentences)
    column_count = self.ndim(feature_extractor)
    feature_matrix = np.zeros((row_count, column_count))

    for row, sentence in enumerate(sentences):
        sentence_tokens = mitie.tokenize(sentence)
        feature_matrix[row, :] = self.features_for_tokens(
            sentence_tokens, feature_extractor)
    return feature_matrix
def mitie_context(text, ner_model):
    """
    Send text to MITIE NER, format the results, and return them with up to
    3 words on either side of every extracted entity.

    The context words can be used to filter results (e.g., if it says
    "the province of Aleppo", look for an admin area rather than a city).
    This version does not produce any HTML marked up text.

    Parameters
    ----------
    text: string
        The text to have its entities extracted
    ner_model: MITIE named entity extractor
        The NER model produced by `setup_mitie`

    Returns
    -------
    named_entities: dictionary
        "entities" contains a list of dictionaries. Each of these dicts has
        keys "tag", "text", "score", and "context".  "context" lists the
        neighbouring tokens that exist, ordered from the farthest pair
        (3 before, 3 after) to the nearest pair (1 before, 1 after).
    """
    text = text.encode("utf-8")
    tokens = mitie.tokenize(text)
    # eventually, handle different NER models.
    entities = ner_model.extract_entities(tokens)
    token_count = len(tokens)

    out = []
    for entity in entities:
        token_range = entity[0]
        tag = entity[1]
        score = entity[2]
        entity_text = str(" ").join(tokens[i] for i in token_range)
        beg_token = min(token_range)
        end_token = max(token_range)

        context = []
        for distance in (3, 2, 1):
            before = beg_token - distance
            # Check the bound explicitly: a negative index does not raise,
            # it wraps around and would return tokens from the END of the
            # text as "context" for entities near the start.
            if before >= 0:
                context.append(tokens[before])
            after = end_token + distance
            if after < token_count:
                context.append(tokens[after])

        out.append({
            u'tag': unicode(tag),
            u'text': entity_text,
            u'score': score,
            u'context': context
        })
    return {"entities": out}
def _nlp_extract_metadata_core(text=None):
    """Extract regions and keywords from ``text`` using the module-level NER model.

    LOCATION and ORGANIZATION entities are collected with their scores and
    de-duplicated via ``removeDuplicateEntities``.  Locations scoring above
    ``settings.NLP_LOCATION_THRESHOLD`` are looked up as ``Region`` objects
    by name (``name__iexact``); organizations are looked up as ``Tag``
    objects the same way.  Lookups that fail are skipped.

    Returns
    -------
    dict or None
        ``{'regions': [Region, ...], 'keywords': [tag name, ...]}``, or
        ``None`` when ``text`` is empty or ``None``.
    """
    if not text:
        return None

    from mitie import tokenize

    tokens = tokenize(text)
    entities = ner.extract_entities(tokens)

    locations = []
    organizations = []
    for entity in entities:
        token_range = entity[0]
        tag = entity[1]
        score = entity[2]
        entity_text = " ".join(tokens[i] for i in token_range)
        if tag == "LOCATION":
            locations.append((entity_text, score))
        elif tag == "ORGANIZATION":
            organizations.append((entity_text, score))

    # Remove Duplicates
    locations = removeDuplicateEntities(locations)
    organizations = removeDuplicateEntities(organizations)

    # Resolve Locations to Regions
    regions = []
    for location_text, score in locations:
        if score > settings.NLP_LOCATION_THRESHOLD:
            try:
                region = Region.objects.get(name__iexact=location_text)
                if region:
                    regions.append(region)
            except Exception:
                # Best effort: an unresolvable location is simply skipped.
                # ``Exception`` (rather than a bare ``except``) lets
                # KeyboardInterrupt and SystemExit propagate.
                pass

    # Resolve organizations to Keywords/Tags
    keywords = []
    for organization_text, _score in organizations:
        try:
            keyword = Tag.objects.get(name__iexact=organization_text)
            if keyword:
                keywords.append(keyword.name)
        except Exception:
            # Best effort, as above.
            pass

    return {'regions': regions, 'keywords': keywords}
def _nlp_extract_metadata_core(text=None):
    """Extract regions and keywords from ``text`` using the module-level NER model.

    LOCATION and ORGANIZATION entities are collected with their scores and
    de-duplicated via ``removeDuplicateEntities``.  Locations scoring above
    ``settings.NLP_LOCATION_THRESHOLD`` are looked up as ``Region`` objects
    by name (``name__iexact``); organizations are looked up as ``Tag``
    objects the same way.  Lookups that fail are skipped.

    Returns
    -------
    dict or None
        ``{'regions': [Region, ...], 'keywords': [tag name, ...]}``, or
        ``None`` when ``text`` is empty or ``None``.
    """
    if not text:
        return None

    from mitie import tokenize

    tokens = tokenize(text)
    entities = ner.extract_entities(tokens)

    locations = []
    organizations = []
    for entity in entities:
        token_range = entity[0]
        tag = entity[1]
        score = entity[2]
        entity_text = " ".join(tokens[i] for i in token_range)
        if tag == "LOCATION":
            locations.append((entity_text, score))
        elif tag == "ORGANIZATION":
            organizations.append((entity_text, score))

    # Remove Duplicates
    locations = removeDuplicateEntities(locations)
    organizations = removeDuplicateEntities(organizations)

    # Resolve Locations to Regions
    regions = []
    for location_text, score in locations:
        if score > settings.NLP_LOCATION_THRESHOLD:
            try:
                region = Region.objects.get(name__iexact=location_text)
                if region:
                    regions.append(region)
            except Exception:
                # Best effort: an unresolvable location is simply skipped.
                # Catching ``Exception`` instead of ``BaseException`` lets
                # KeyboardInterrupt and SystemExit propagate.
                pass

    # Resolve organizations to Keywords/Tags
    keywords = []
    for organization_text, _score in organizations:
        try:
            keyword = Tag.objects.get(name__iexact=organization_text)
            if keyword:
                keywords.append(keyword.name)
        except Exception:
            # Best effort, as above.
            pass

    return {'regions': regions, 'keywords': keywords}
def find_entity(ent, text):
    """Translate an entity's character span into a token index range.

    ``ent["start"]`` must coincide with the starting offset of one of the
    tokens of ``text``; otherwise a ``ValueError`` is raised.

    Returns ``(start, end)`` where ``start`` is the index of the token that
    begins at ``ent["start"]`` and ``end`` is ``start`` plus the number of
    tokens obtained by tokenizing ``text[ent["start"]:ent["end"]]``.
    """
    tokenizer = MITIETokenizer()
    _, token_offsets = tokenizer.tokenize_with_offsets(text)

    entity_start = ent["start"]
    if entity_start not in token_offsets:
        raise ValueError(
            u"invalid entity {0} in example {1}:".format(ent, text)
            + u" entities must span whole tokens")

    first_token = token_offsets.index(entity_start)
    entity_tokens = tokenize(text[entity_start:ent["end"]])
    return first_token, first_token + len(entity_tokens)
def train_entity_extractor(entity_examples, fe_file, max_num_threads):
    """Train and return a MITIE named entity extractor.

    Each example contributes one training instance built from the tokens of
    its ``"text"``; every item of its ``"entities"`` is mapped to a token
    range with ``find_entity`` and registered under its ``"entity"`` label.
    """
    trainer = ner_trainer(fe_file)
    trainer.num_threads = max_num_threads

    for example in entity_examples:
        sentence = example["text"]
        instance = ner_training_instance(tokenize(sentence))
        for annotation in example["entities"]:
            first, stop = find_entity(annotation, sentence)
            instance.add_entity(xrange(first, stop), annotation["entity"])
        trainer.add(instance)

    trained_extractor = trainer.train()
    return trained_extractor
def train(self, training_data, mitie_file, num_threads):
    # type: (TrainingData, Text, Optional[int]) -> None
    """Train a MITIE text categorizer on the intent examples.

    Every intent example is tokenized from its ``"text"`` and added to the
    trainer under its ``"intent"`` label.  The resulting classifier is
    stored on ``self.clf``; nothing is stored when there are no examples.
    """
    import mitie

    trainer = mitie.text_categorizer_trainer(mitie_file)
    trainer.num_threads = num_threads

    for example in training_data.intent_examples:
        example_tokens = mitie.tokenize(example["text"])
        trainer.add_labeled_text(example_tokens, example["intent"])

    # we can not call train if there are no examples!
    if not training_data.intent_examples:
        return
    self.clf = trainer.train()
def find_entity(ent, text):
    """Translate an entity's character span into a token index range.

    ``ent["start"]`` must coincide with the starting offset of one of the
    tokens of ``text``; otherwise a ``ValueError`` is raised.

    Returns ``(start, end)`` where ``start`` is the index of the token that
    begins at ``ent["start"]`` and ``end`` is ``start`` plus the number of
    tokens obtained by tokenizing ``text[ent["start"]:ent["end"]]``.
    """
    import mitie

    tokenizer = MitieTokenizer()
    _, token_offsets = tokenizer.tokenize_with_offsets(text)

    entity_start = ent["start"]
    if entity_start not in token_offsets:
        raise ValueError(
            "Invalid entity {} in example '{}': entities must span whole tokens".format(
                ent, text))

    first_token = token_offsets.index(entity_start)
    covered_tokens = mitie.tokenize(text[entity_start:ent["end"]])
    return first_token, first_token + len(covered_tokens)
def tokenize_with_offsets(self, text):
    """Tokenize ``text`` and report where each token starts.

    The text is UTF-8 encoded for ``tokenize`` and each token is decoded
    again.  Tokens are then located one after another in ``text``, each
    search starting where the previous token ended.

    Returns ``(tokens, offsets)`` with ``offsets[i]`` being the index in
    ``text`` at which ``tokens[i]`` begins.  Raises ``ValueError`` when a
    token cannot be found in the remaining text.
    """
    utf8_text = text.encode('utf-8')
    tokens = [piece.decode('utf-8') for piece in tokenize(utf8_text)]

    offsets = []
    cursor = 0
    for token in tokens:
        remainder = text[cursor:]
        hit = re.search(re.escape(token), remainder, re.UNICODE)
        if hit is None:
            raise ValueError(
                "Invalid MITIE offset. Token '{}' in message '{}'.".format(
                    str(token), str(text.encode('utf-8'))))
        offsets.append(cursor + hit.start())
        cursor = cursor + hit.end()
    return tokens, offsets
def mitie_extract_entities(text):
    """Run the module-level MITIE extractor ``mt`` over ``text``.

    Unicode input is NFKD-normalized and reduced to ASCII (characters that
    cannot be represented are dropped) before tokenizing.

    Returns a list of ``{u'entity': text, u'type': tag}`` dicts, leaving out
    any record that appears in ``mitie_known_bad_entities()``.
    """
    if isinstance(text, unicode):
        # MITIE doesn't like unicode and can't set encoding yet
        text = unicodedata.normalize('NFKD', text).encode('ascii', 'ignore')

    tokens = mitie.tokenize(text)
    accepted = []
    for mitie_entity in mt.extract_entities(tokens):
        token_indices = mitie_entity[0]
        entity_type = mitie_entity[1]
        record = {
            u'entity': " ".join(tokens[index] for index in token_indices),
            u'type': entity_type,
        }
        if record in mitie_known_bad_entities():
            continue
        accepted.append(record)
    return accepted
def mitie_context(text, ner_model):
    """
    Send text to MITIE NER, format the results, and return them with up to
    3 words on either side of every extracted entity.

    The context words can be used to filter results (e.g., if it says
    "the province of Aleppo", look for an admin area rather than a city).
    This version does not produce any HTML marked up text.

    Parameters
    ----------
    text: string
        The text to have its entities extracted
    ner_model: MITIE named entity extractor
        The NER model produced by `setup_mitie`

    Returns
    -------
    named_entities: dictionary
        "entities" contains a list of dictionaries. Each of these dicts has
        keys "tag", "text", "score", and "context".  "context" lists the
        neighbouring tokens that exist, ordered from the farthest pair
        (3 before, 3 after) to the nearest pair (1 before, 1 after).
    """
    text = text.encode("utf-8")
    tokens = mitie.tokenize(text)
    # eventually, handle different NER models.
    entities = ner_model.extract_entities(tokens)
    token_count = len(tokens)

    out = []
    for entity in entities:
        token_range = entity[0]
        tag = entity[1]
        score = entity[2]
        entity_text = str(" ").join(tokens[i] for i in token_range)
        beg_token = min(token_range)
        end_token = max(token_range)

        context = []
        for distance in (3, 2, 1):
            before = beg_token - distance
            # Check the bound explicitly: a negative index does not raise,
            # it wraps around and would return tokens from the END of the
            # text as "context" for entities near the start.
            if before >= 0:
                context.append(tokens[before])
            after = end_token + distance
            if after < token_count:
                context.append(tokens[after])

        out.append({u'tag': unicode(tag),
                    u'text': entity_text,
                    u'score': score,
                    u'context': context})
    return {"entities": out}
def get_entities(text, count):
    """Return the ``count`` most frequent named entities found in ``text``.

    The English MITIE NER model is loaded from a path relative to the
    module-level ``parent`` directory on every call.

    Returns
    -------
    list of (entity_text, occurrences) tuples, as produced by
    ``collections.Counter.most_common(count)``.
    """
    ner = named_entity_extractor(parent + '/../../lib/MITIE/MITIE-models/english/ner_model.dat')

    # Convert the given text into a list of words.
    tokens = tokenize(text)
    entities = ner.extract_entities(tokens)

    # Decode explicitly as UTF-8: a bare ``.decode()`` uses ASCII on
    # Python 2 and raises on any non-ASCII token.  ``Counter`` does the
    # tallying that was previously hand-written.
    entity_count = collections.Counter(
        " ".join(tokens[i].decode('utf-8') for i in entity[0])
        for entity in entities
    )
    return entity_count.most_common(count)
def tokenize_with_offsets(self, text):
    # type: (Text) -> Tuple[List[Text], List[int]]
    """Tokenize ``text`` with MITIE and report where each token starts.

    The text is UTF-8 encoded for ``mitie.tokenize`` and each token is
    decoded again.  Tokens are then located one after another in ``text``,
    each search starting where the previous token ended.

    Returns ``(tokens, offsets)`` with ``offsets[i]`` being the index in
    ``text`` at which ``tokens[i]`` begins.  Raises ``ValueError`` when a
    token cannot be found in the remaining text.
    """
    from mitie import tokenize

    utf8_text = text.encode('utf-8')
    tokens = [piece.decode('utf-8') for piece in tokenize(utf8_text)]

    offsets = []
    cursor = 0
    for token in tokens:
        remainder = text[cursor:]
        hit = re.search(re.escape(token), remainder, re.UNICODE)
        if hit is None:
            raise ValueError(
                "Invalid MITIE offset. Token '{}' in message '{}'.".format(
                    str(token), str(text.encode('utf-8'))))
        offsets.append(cursor + hit.start())
        cursor = cursor + hit.end()
    return tokens, offsets
def talk_to_mitie(text, ner_model):
    """
    Send text to MITIE NER, format the results, and return them.

    Besides the entity list, an HTML rendering of the token stream is
    built in which every entity is wrapped in
    ``<span class="mitie-TAG">...</span>``. That output is not used in
    Mordecai.

    Returns
    -------
    named_entities: dictionary
        "entities" contains a list of dictionaries. Each of these dicts has
        keys "tag", "text", and "score". "html" holds the marked-up text.
    """
    text = text.encode("utf-8")
    tokens = mitie.tokenize(text)
    # A placeholder token is appended before extraction and removed again
    # before the HTML is assembled.
    tokens.append(' x ')
    # eventually, handle different NER models here.
    entities = ner_model.extract_entities(tokens)

    out = []
    for found in entities:
        span, label, confidence = found[0], found[1], found[2]
        out.append({
            u'tag': unicode(label),
            u'text': str(" ").join(tokens[i] for i in span),
            u'score': confidence
        })

    # Walk the entities back to front so that collapsing one entity into a
    # single element does not shift the indices of those still to come.
    for found in reversed(entities):
        span = found[0]
        label = found[1]
        first, last = span[0], span[-1]
        phrase = str(' ').join(tokens[i] for i in span)
        marked = (str('<span class="mitie-') + label + str('">') +
                  phrase + str('</span>'))
        tokens = tokens[:first] + [marked] + tokens[(last + 1):]

    tokens.pop()
    html = str(' ').join(tokens)
    htmlu = unicode(html.decode("utf-8"))
    return {"entities": out, "html": htmlu}
def talk_to_mitie(text, ner_model):
    """
    Send text to MITIE NER, format the results, and return them.

    Besides the entity list, an HTML rendering of the token stream is
    built in which every entity is wrapped in
    ``<span class="mitie-TAG">...</span>``. That output is not used in
    Mordecai.

    Returns
    -------
    named_entities: dictionary
        "entities" contains a list of dictionaries. Each of these dicts has
        keys "tag", "text", and "score". "html" holds the marked-up text.
    """
    text = text.encode("utf-8")
    tokens = mitie.tokenize(text)
    # A placeholder token is appended before extraction and removed again
    # before the HTML is assembled.
    tokens.append(' x ')
    # eventually, handle different NER models here.
    entities = ner_model.extract_entities(tokens)

    out = []
    for found in entities:
        span, label, confidence = found[0], found[1], found[2]
        out.append({u'tag': unicode(label),
                    u'text': str(" ").join(tokens[i] for i in span),
                    u'score': confidence})

    # Walk the entities back to front so that collapsing one entity into a
    # single element does not shift the indices of those still to come.
    for found in reversed(entities):
        span = found[0]
        label = found[1]
        first, last = span[0], span[-1]
        phrase = str(' ').join(tokens[i] for i in span)
        marked = (str('<span class="mitie-') + label + str('">') +
                  phrase + str('</span>'))
        tokens = tokens[:first] + [marked] + tokens[(last + 1):]

    tokens.pop()
    html = str(' ').join(tokens)
    htmlu = unicode(html.decode("utf-8"))
    return {"entities": out, "html": htmlu}
def train(self, training_data, mitie_file, num_threads):
    # type: (TrainingData, str, Optional[int]) -> None
    """Train a MITIE named entity extractor on the entity examples.

    Each entity example becomes one training instance built from the
    tokens of its ``"text"``; every item of its ``"entities"`` is mapped to
    token indices with ``MitieEntityExtractor.find_entity`` and registered
    under its ``"entity"`` label.  The trained extractor is stored on
    ``self.ner``; nothing is stored when no entity was tagged at all.
    """
    from mitie import ner_training_instance, ner_trainer, tokenize

    trainer = ner_trainer(mitie_file)
    trainer.num_threads = num_threads

    has_tagged_entity = False
    for example in training_data.entity_examples:
        sentence = example["text"]
        instance = ner_training_instance(tokenize(sentence))
        for annotation in example["entities"]:
            first, stop = MitieEntityExtractor.find_entity(annotation, sentence)
            instance.add_entity(list(range(first, stop)), annotation["entity"])
            has_tagged_entity = True
        trainer.add(instance)

    # Mitie will fail to train if there is not a single entity tagged
    if not has_tagged_entity:
        return
    self.ner = trainer.train()
def get_mitie_entities(original_tweets):
    """Run MITIE NER over newline-separated tweets and return entity spans.

    Each non-blank line of ``original_tweets`` is tokenized and passed to
    the English MITIE NER model.  Every token is then located in the full
    (cleaned) text so that entities can be reported as character offsets
    into that text rather than as token indices.

    Returns
    -------
    set of (start, end, tag) tuples
        ``start`` is the offset at which the entity's first token was found
        and ``end`` is the offset just past its last token, both measured in
        the cleaned text; ``tag`` is the second element of the MITIE entity.
    """
    system_entities = set()
    # Both positions refer to the whole cleaned text and carry over from one
    # tweet to the next, so the search always moves forward.
    original_index = 0
    previous_token_end = 0
    ner = mitie.named_entity_extractor(
        'MITIE/MITIE-models/english/ner_model.dat')
    # Replace the typographic apostrophe with an ASCII one.
    # NOTE(review): presumably a one-for-one character swap so offsets still
    # line up with the caller's text -- confirm the input is unicode.
    original_tweets_clean = original_tweets.replace(u"’", "'")
    for tweet in original_tweets_clean.split("\n"):
        # Offset where the currently open entity began; None while no entity
        # is open.
        entity_start = None
        stripped_tweet = tweet.strip()
        if not stripped_tweet:
            continue
        tokens = mitie.tokenize(stripped_tweet)
        entities = ner.extract_entities(tokens)
        # Entities are consumed front to back; current_entity is the next
        # one to open or close (None once all have been handled).
        if entities:
            current_entity = entities.pop(0)
        else:
            current_entity = None
        for i, token in enumerate(tokens):
            unicode_token = token.decode('utf-8')
            # Find this token in the full text, starting after the previous
            # token; raises ValueError if it cannot be found.
            original_index = original_tweets_clean.index(
                unicode_token, previous_token_end)
            # This token is the first one past the open entity's last token
            # index: record the entity, ending at the previous token's end,
            # then move on to the next entity.
            if entity_start is not None and i == current_entity[0][-1] + 1:
                system_entities.add(
                    (entity_start, previous_token_end, current_entity[1]))
                entity_start = None
                if entities:
                    current_entity = entities.pop(0)
                else:
                    current_entity = None
            # This token is the first token of the current entity: open it.
            if current_entity is not None and i == current_entity[0][0]:
                entity_start = original_index
            previous_token_end = original_index + len(unicode_token)
        # An entity that runs up to the tweet's last token is never closed
        # inside the loop, so record it here.
        if entity_start is not None:
            system_entities.add(
                (entity_start, previous_token_end, current_entity[1]))
    return system_entities
def parse_document(document, information):
    """Extract named entities from an HTML document with MITIE.

    The document is passed through ``clean_html``, tokenized, and run through
    the English MITIE NER model located under ``mitie_location``.  Progress
    messages are printed to stdout at each stage.

    Parameters
    ----------
    document:
        Raw document text handed to ``clean_html``.
    information:
        Not used by this function; kept so existing callers keep working.

    Returns
    -------
    list of (tag, words) tuples
        One tuple per extracted entity, where ``words`` is the entity's tokens
        joined by single spaces.  Entities whose text is empty are skipped.
    """
    print('Acquiring extractor')
    extractor = named_entity_extractor(
        os.path.join(mitie_location, 'MITIE-models/english/ner_model.dat'))

    print('Cleaning HTML')
    cleaned_document = clean_html(document)

    print('Tokenizing')
    # Build a real list: the tokens are indexed below, which a lazy
    # ``filter`` object (Python 3) would not support.
    tokens = [token for token in tokenize(cleaned_document) if token]

    print('Extracting NER Entities')
    entities = extractor.extract_entities(tokens)
    print('Done')

    normalized_entities = []
    for entity in entities:
        # Index instead of unpacking so that entity tuples carrying extra
        # trailing fields (e.g. a score) are accepted as well.
        position, tag = entity[0], entity[1]
        # Join every token index in the entity's range.  The previous slice
        # ``tokens[first:last]`` excluded the last token, which silently
        # dropped all single-token entities.
        # TODO: join how for other languages?
        words = ' '.join(tokens[index] for index in position)
        if words:
            normalized_entities.append((tag, words))
    return normalized_entities
def parse(self, text):
    """Tokenize ``text`` and return its intent and entities.

    The result is ``{'intent': ..., 'entities': ...}``, filled from
    ``self.get_intent`` and ``self.get_entities`` applied to the tokens.
    """
    words = tokenize(text)
    return {
        'intent': self.get_intent(words),
        'entities': self.get_entities(words),
    }
def tokenize(self, text):
    """Tokenize ``text`` and return the tokens decoded from UTF-8.

    The text is UTF-8 encoded before it is handed to the underlying
    ``tokenize`` function.
    """
    utf8_bytes = text.encode('utf-8')
    raw_tokens = tokenize(utf8_bytes)
    return [raw.decode('utf-8') for raw in raw_tokens]
def tokenize(self, text):
    # type: (Text) -> List[Text]
    """Tokenize ``text`` with MITIE and return the tokens decoded from UTF-8.

    The text is UTF-8 encoded before it is handed to ``mitie.tokenize``.
    """
    from mitie import tokenize

    utf8_bytes = text.encode('utf-8')
    raw_tokens = tokenize(utf8_bytes)
    return [raw.decode('utf-8') for raw in raw_tokens]
def tokenize(self, text):
    """Return the result of calling ``tokenize`` on ``text``, unchanged."""
    tokens = tokenize(text)
    return tokens
def test(text):
    """Tokenize ``text`` and return what ``mitie_extract_ner_series`` yields for the tokens."""
    return mitie_extract_ner_series(tokenize(text))
def tokenize(self, text):
    # type: (str) -> [str]
    """Tokenize ``text`` with MITIE and return the tokens decoded from UTF-8.

    The text is UTF-8 encoded before it is handed to ``mitie.tokenize``.
    """
    # NOTE(review): ``[str]`` in the type comment above is not a valid type
    # expression; ``List[str]`` is presumably what was meant -- confirm the
    # module imports ``List`` before changing it.
    from mitie import tokenize

    utf8_bytes = text.encode('utf-8')
    raw_tokens = tokenize(utf8_bytes)
    return [raw.decode('utf-8') for raw in raw_tokens]
def process(text, model):
    """Tokenize ``text`` and extract entities with ``model``.

    Returns ``(tokens, entities)``: the MITIE tokens and whatever
    ``model.extract_entities`` returns for them.
    """
    tokens = mitie.tokenize(text)
    return tokens, model.extract_entities(tokens)
# named entity recognition outputs. # import sys, os # Make sure you put the mitielib folder into the python search path. There are # a lot of ways to do this, here we do it programmatically with the following # two statements: parent = os.path.dirname(os.path.realpath(__file__)) sys.path.append(parent + '/lib/MITIE/mitielib') print parent import mitie print "loading NER model..." ner = mitie.named_entity_extractor('lib/MITIE/english/ner_model.dat') print "\nTags output by this NER model:", ner.get_possible_ner_tags() # Load a text file and convert it into a list of words. tokens = mitie.tokenize(mitie.load_entire_file('lib/MITIE/sample_text.txt')) print "Tokenized input:", tokens entities = ner.extract_entities(tokens) print "\nEntities found:", entities print "\nNumber of entities detected:", len(entities) for e in entities: range = e[0] tag = e[1] entity_text = " ".join(tokens[i] for i in range) print " " + tag + ": " + entity_text