def load(
    cls,
    config: Dict[Text, Any],
    model_storage: ModelStorage,
    resource: Resource,
    execution_context: ExecutionContext,
    **kwargs: Any,
) -> MitieEntityExtractor:
    """Loads trained component (see parent class for full docstring)."""
    import mitie

    try:
        with model_storage.read_from(resource) as model_path:
            ner_file = model_path / cls.MITIE_RESOURCE_FILE
            if not ner_file.exists():
                raise FileNotFoundError(
                    f"Expected a MITIE extractor file at {ner_file}."
                )
            ner = mitie.named_entity_extractor(str(ner_file))
            return cls(config, model_storage, resource, ner=ner)
    except (FileNotFoundError, ValueError) as e:
        # Fall back to an untrained component instead of crashing.
        # NOTE: the previous message talked about "regexes", which was
        # copy-pasted from a regex-based component and did not apply here.
        logger.debug(
            f"Failed to load {cls.__name__} from model storage. "
            f"This can happen if the component could not be trained (e.g. "
            f"no entity examples were present in the training data) and "
            f"hence no model file was persisted. Error: {e}."
        )
        return cls(config, model_storage, resource)
def parse_document(document, information):
    """Clean, tokenize and run MITIE NER over an HTML document.

    :param document: raw HTML text to process
    :param information: currently unused (kept for interface compatibility)
    :return: list of ``(tag, entity_text)`` tuples for every detected entity
    """
    # NOTE: Python-2-only `print` statements converted to the function form.
    print('Acquiring extractor')
    extractor = named_entity_extractor(
        os.path.join(mitie_location, 'MITIE-models/english/ner_model.dat'))
    print('Cleaning HTML')
    cleaned_document = clean_html(document)
    print('Tokenizing')
    # Materialize so the token list can be sliced below (py3 `filter` is lazy).
    tokens = [t for t in tokenize(cleaned_document) if t]
    print('Extracting NER Entities')
    entities = extractor.extract_entities(tokens)
    print('Done')
    normalized_entities = []
    for position, tag in entities:
        indices = list(position)
        # MITIE token ranges include the last index; the original slice
        # `[first:last]` dropped the final token of each entity and produced
        # '' for single-token entities, silently discarding them.
        words = ' '.join(tokens[indices[0]:indices[-1] + 1])
        if words:
            normalized_entities.append((tag, words))
    return normalized_entities
def __init__(self, intent_classifier_file=None, entity_extractor_file=None,
             feature_extractor_file=None, **kwargs):
    """Build the interpreter from persisted model files.

    :param intent_classifier_file: path to a cloudpickled intent classifier
    :param entity_extractor_file: path to a trained MITIE entity extractor
    :param feature_extractor_file: path to the MITIE feature extractor
    """
    # Always define the attributes so callers can safely test them,
    # even when the corresponding file was not provided.
    self.extractor = None
    self.classifier = None
    if entity_extractor_file:
        self.extractor = named_entity_extractor(entity_extractor_file)
    # Guard added: the original unconditionally called
    # `open(intent_classifier_file)`, which raised when the default
    # `None` was used.
    if intent_classifier_file:
        with open(intent_classifier_file, 'rb') as f:
            self.classifier = cloudpickle.load(f)
    self.featurizer = MITIEFeaturizer(feature_extractor_file)
    self.tokenizer = MITIETokenizer()
def load(self):
    """Load the pretrained English MITIE NER model from the user's home dir."""
    model_path = os.path.join(
        os.path.expanduser('~'), '.verbis/models/mitie', 'english',
        'ner_model.dat')
    return named_entity_extractor(model_path)
def load(meta, featurizer=None):
    """
    :type meta: rasa_nlu.model.Metadata
    :rtype: MITIESklearnInterpreter
    """
    extractor = None
    if meta.entity_extractor_path:
        extractor = named_entity_extractor(meta.entity_extractor_path)

    if featurizer is None:
        featurizer = MITIEFeaturizer(meta.feature_extractor_path)

    classifier = None
    if meta.intent_classifier_path:
        with open(meta.intent_classifier_path, 'rb') as f:
            classifier = cloudpickle.load(f)

    entity_synonyms = None
    if meta.entity_synonyms_path:
        entity_synonyms = Interpreter.load_synonyms(meta.entity_synonyms_path)

    return MITIESklearnInterpreter(classifier, extractor, featurizer,
                                   entity_synonyms)
def setup_mitie(mitie_directory, mitie_ner_model):
    """Create a ``named_entity_extractor`` given the MITIE install directory
    and the path to a trained model file."""
    sys.path.append(mitie_directory)
    return mitie.named_entity_extractor(mitie_ner_model)
def __init__(self, model="test_data/edr_ner_model_gigaword_embeddings.dat"):
    """Set up the MITIE-backed parser.

    :param model: filepath of the MITIE NER model to load
    :type model: str
    """
    extractor = named_entity_extractor(model)
    self.ner = extractor
    super(MITIEBasedParser, self).__init__()
def load(cls, model_dir, model_metadata, cached_component, **kwargs):
    # type: (Text, Metadata, Optional[MitieEntityExtractor], **Any) -> MitieEntityExtractor
    """Load a persisted MITIE extractor; fall back to an empty component."""
    import mitie

    if not (model_dir and model_metadata.get("entity_extractor_mitie")):
        return MitieEntityExtractor()

    extractor_path = os.path.join(
        model_dir, model_metadata.get("entity_extractor_mitie"))
    return MitieEntityExtractor(mitie.named_entity_extractor(extractor_path))
def load(cls, model_dir, entity_extractor):
    # type: (str, str) -> MitieEntityExtractor
    """Restore the extractor persisted at ``model_dir/entity_extractor``."""
    from mitie import named_entity_extractor

    if not (model_dir and entity_extractor):
        return MitieEntityExtractor()
    path = os.path.join(model_dir, entity_extractor)
    return MitieEntityExtractor(named_entity_extractor(path))
def _load_ner():
    """Lazily initialize the module-level MITIE ``ner`` model when enabled."""
    global ner
    if not settings.NLP_ENABLED:
        return
    try:
        if settings.NLP_LIBRARY_PATH not in sys.path:
            sys.path.append(settings.NLP_LIBRARY_PATH)
        if not ner:
            from mitie import named_entity_extractor
            ner = named_entity_extractor(settings.NLP_MODEL_PATH)
    except Exception as e:
        # `except BaseException` also swallowed KeyboardInterrupt and
        # SystemExit; narrow to Exception and report the actual cause.
        # (py2-only `print` statement converted to the function form.)
        print("Could not load the NLP NER: %s" % e)
def _load_ner():
    """Lazily initialize the module-level MITIE ``ner`` model when enabled."""
    global ner
    if not settings.NLP_ENABLED:
        return
    try:
        if settings.NLP_LIBRARY_PATH not in sys.path:
            sys.path.append(settings.NLP_LIBRARY_PATH)
        if not ner:
            from mitie import named_entity_extractor
            ner = named_entity_extractor(settings.NLP_MODEL_PATH)
    except Exception as e:
        # The original bare `except:` hid every failure, including
        # KeyboardInterrupt/SystemExit; narrow it and report the cause.
        # (py2-only `print` statement converted to the function form.)
        print("Could not load the NLP NER: %s" % e)
def load(cls, model_dir, entity_extractor_mitie):
    # type: (Text, Text) -> MitieEntityExtractor
    """Load a persisted MITIE extractor; return an empty component otherwise."""
    import mitie

    if not (model_dir and entity_extractor_mitie):
        return MitieEntityExtractor()
    extractor_path = os.path.join(model_dir, entity_extractor_mitie)
    return MitieEntityExtractor(mitie.named_entity_extractor(extractor_path))
def test_mitie_context():
    """End-to-end check of mitie_context() on a one-sentence example."""
    here = os.path.realpath(
        os.path.join(os.getcwd(), os.path.dirname(__file__)))
    parser = ConfigParser()
    parser.read(glob.glob(os.path.join(here, '../../config.ini')))
    mitie_directory = parser.get('Locations', 'mitie_directory')
    mitie_ner_model = parser.get('Locations', 'mitie_ner_model')
    sys.path.append(mitie_directory)
    ner_model = named_entity_extractor(mitie_ner_model)
    mc = mitie_context("The meeting happened in Ontario.", ner_model)
    expected = {u'entities': [{u'text': 'Ontario',
                               u'tag': u'LOCATION',
                               u'score': 1.3923831181343844,
                               u'context': ['meeting', 'happened', 'in', '.']}]}
    assert mc == expected
def get_entities(text, count):
    """Return the ``count`` most frequent named entities found in ``text``.

    :param text: raw text to tokenize and run NER over
    :param count: number of top entities to return
    :return: list of ``(entity_text, frequency)`` pairs, most frequent first
    """
    ner = named_entity_extractor(
        parent + '/../../lib/MITIE/MITIE-models/english/ner_model.dat')
    # Convert the text into a list of tokens.
    tokens = tokenize(text)
    entities = ner.extract_entities(tokens)
    entity_count = collections.Counter()
    for e in entities:
        # Renamed from `range`, which shadowed the builtin.
        token_range = e[0]
        entity_text = " ".join(tokens[i].decode() for i in token_range)
        # Counter defaults missing keys to 0, so the original
        # `if ... not in ...` membership check was redundant.
        entity_count[entity_text] += 1
    return entity_count.most_common(count)
def main():
    """Load the NER model, then serve HTTP requests until interrupted."""
    global NER
    try:
        log('Loading %s', MODEL)
        NER = mitie.named_entity_extractor(MODEL)
    except Exception as error:
        log('Can not load model: "%s"', error)
        return

    server = HTTPServer((HOST, PORT), HTTPHandler)
    try:
        log('Listening http://%s:%d', HOST, PORT)
        server.serve_forever()
    except KeyboardInterrupt:
        log('Quiting')
    finally:
        server.server_close()
def load(cls,
         meta: Dict[Text, Any],
         model_dir: Text = None,
         model_metadata: Metadata = None,
         cached_component: Optional['MitieEntityExtractor'] = None,
         **kwargs: Any) -> 'MitieEntityExtractor':
    """Restore the MITIE extractor persisted under ``meta["file"]``, if any."""
    import mitie

    file_name = meta.get("file")
    if not file_name:
        return cls(meta)
    classifier_file = os.path.join(model_dir, file_name)
    if not os.path.exists(classifier_file):
        return cls(meta)
    return cls(meta, mitie.named_entity_extractor(classifier_file))
def __init__(self, intent_classifier=None, entity_extractor=None,
             feature_extractor=None, entity_synonyms=None, **kwargs):
    """Build the interpreter from optional persisted model files."""
    self.extractor = None
    self.classifier = None
    self.ent_synonyms = None

    if entity_extractor:
        self.extractor = named_entity_extractor(entity_extractor,
                                                feature_extractor)
    if intent_classifier:
        with open(intent_classifier, 'rb') as f:
            self.classifier = cloudpickle.load(f)
    if entity_synonyms:
        self.ent_synonyms = Interpreter.load_synonyms(entity_synonyms)

    self.featurizer = MITIEFeaturizer(feature_extractor)
    self.tokenizer = MITIETokenizer()
def load(cls,
         model_dir: Text = None,
         model_metadata: Metadata = None,
         cached_component: Optional['MitieEntityExtractor'] = None,
         **kwargs: Any) -> 'MitieEntityExtractor':
    """Load this component's persisted MITIE entity model, if present."""
    import mitie

    meta = model_metadata.for_component(cls.name)
    file_name = meta.get("classifier_file", MITIE_ENTITY_MODEL_FILE_NAME)
    if not file_name:
        return cls(meta)
    model_file = os.path.join(model_dir, file_name)
    if not os.path.exists(model_file):
        return cls(meta)
    return cls(meta, mitie.named_entity_extractor(model_file))
def load(cls,
         meta: Dict[Text, Any],
         model_dir: Text = None,
         model_metadata: Metadata = None,
         cached_component: Optional['MitieEntityExtractor'] = None,
         **kwargs: Any
         ) -> 'MitieEntityExtractor':
    """Restore the MITIE extractor persisted under ``meta["file"]``."""
    import mitie

    file_name = meta.get("file")
    if file_name:
        model_file = os.path.join(model_dir, file_name)
        if os.path.exists(model_file):
            return cls(meta, mitie.named_entity_extractor(model_file))
    return cls(meta)
def test_mitie_context():
    """End-to-end check of mitie_context() on a one-sentence example."""
    test_dir = os.path.realpath(
        os.path.join(os.getcwd(), os.path.dirname(__file__)))
    config_file = glob.glob(os.path.join(test_dir, '../../config.ini'))
    parser = ConfigParser()
    parser.read(config_file)
    mitie_directory = parser.get('Locations', 'mitie_directory')
    mitie_ner_model = parser.get('Locations', 'mitie_ner_model')
    sys.path.append(mitie_directory)
    ner_model = named_entity_extractor(mitie_ner_model)
    result = mitie_context("The meeting happened in Ontario.", ner_model)
    assert result == {
        u'entities': [{
            u'text': 'Ontario',
            u'tag': u'LOCATION',
            u'score': 1.3923831181343844,
            u'context': ['meeting', 'happened', 'in', '.']
        }]
    }
def get_mitie_entities(original_tweets):
    """Run MITIE NER over newline-separated tweets and map entities back to
    character offsets in the original text.

    :param original_tweets: newline-separated tweet text (curly apostrophes
        are normalized to ``'`` before offsets are computed)
    :return: set of ``(start_offset, end_offset, tag)`` tuples, where offsets
        index into the normalized text and ``tag`` is the MITIE entity label
    """
    system_entities = set()
    original_index = 0
    previous_token_end = 0  # char offset just past the last matched token
    ner = mitie.named_entity_extractor(
        'MITIE/MITIE-models/english/ner_model.dat')
    original_tweets_clean = original_tweets.replace(u"’", "'")
    for tweet in original_tweets_clean.split("\n"):
        entity_start = None  # char offset where the current entity began
        stripped_tweet = tweet.strip()
        if not stripped_tweet:
            continue
        tokens = mitie.tokenize(stripped_tweet)
        entities = ner.extract_entities(tokens)
        # `current_entity` is the next (token_range, tag, ...) to match;
        # entities are consumed in order as the token index advances.
        if entities:
            current_entity = entities.pop(0)
        else:
            current_entity = None
        for i, token in enumerate(tokens):
            unicode_token = token.decode('utf-8')
            # Locate this token in the full text, searching forward from the
            # previous token so repeated words resolve to the right offset.
            original_index = original_tweets_clean.index(
                unicode_token, previous_token_end)
            # One past the entity's last token index closes the open entity.
            if entity_start is not None and i == current_entity[0][-1] + 1:
                system_entities.add(
                    (entity_start, previous_token_end, current_entity[1]))
                entity_start = None
                if entities:
                    current_entity = entities.pop(0)
                else:
                    current_entity = None
            # Reaching the entity's first token index opens a new entity.
            if current_entity is not None and i == current_entity[0][0]:
                entity_start = original_index
            previous_token_end = original_index + len(unicode_token)
        # Flush an entity that runs to the end of the tweet.
        if entity_start is not None:
            system_entities.add(
                (entity_start, previous_token_end, current_entity[1]))
    return system_entities
def load(
    cls,
    meta: Dict[Text, Any],
    model_dir: Text,
    model_metadata: Metadata = None,
    cached_component: Optional["MitieEntityExtractor"] = None,
    **kwargs: Any,
) -> "MitieEntityExtractor":
    """Loads trained component (see parent class for full docstring)."""
    import mitie

    file_name = meta.get("file")
    if not file_name:
        return cls(meta)
    classifier_file = os.path.join(model_dir, file_name)
    if not os.path.exists(classifier_file):
        return cls(meta)
    return cls(meta, mitie.named_entity_extractor(classifier_file))
def load(cls,
         model_dir=None,  # type: Text
         model_metadata=None,  # type: Metadata
         cached_component=None,  # type: Optional[MitieEntityExtractor]
         **kwargs  # type: **Any
         ):
    # type: (...) -> MitieEntityExtractor
    """Load this component's persisted MITIE entity model, if present."""
    import mitie

    meta = model_metadata.for_component(cls.name)
    file_name = meta.get("classifier_file", MITIE_ENTITY_MODEL_FILE_NAME)
    if not file_name:
        return cls(meta)
    model_file = os.path.join(model_dir, file_name)
    if not os.path.exists(model_file):
        return cls(meta)
    return cls(meta, mitie.named_entity_extractor(model_file))
def parse_document(document, information):
    """Clean, tokenize and run MITIE NER over an HTML document.

    :param document: raw HTML text to process
    :param information: currently unused (kept for interface compatibility)
    :return: list of ``(tag, entity_text)`` tuples for every detected entity
    """
    # NOTE: Python-2-only `print` statements converted to the function form.
    print('Acquiring extractor')
    extractor = named_entity_extractor(os.path.join(
        mitie_location, 'MITIE-models/english/ner_model.dat'))
    print('Cleaning HTML')
    cleaned_document = clean_html(document)
    print('Tokenizing')
    # Materialize so the token list can be sliced below (py3 `filter` is lazy).
    tokens = list(filter(None, tokenize(cleaned_document)))
    print('Extracting NER Entities')
    entities = extractor.extract_entities(tokens)
    print('Done')
    normalized_entities = []
    for position, tag in entities:
        indices = list(position)
        # MITIE token ranges include the last index; the original slice
        # `[first:last]` dropped the final token of each entity and produced
        # '' for single-token entities, silently discarding them.
        words = ' '.join(tokens[indices[0]:indices[-1] + 1])
        if words:
            normalized_entities.append((tag, words))
    return normalized_entities
def load_pretrained_model(self, modelname='pretrained-StanfordNER',
                          numclass=3):
    """
    Loads a pre-trained model.

    Sets ``self.NER`` plus ``self.model['entity_types']`` and
    ``self.model['training_corpus']`` according to the chosen backend.

    Parameters
    ----------
    modelname : str
        The name of the pre-trained model to use. The options are:

        * 'pretrained-StanfordNER': Used a CRF and word embeddings.
          See: https://nlp.stanford.edu/software/CRF-NER.shtml
        * 'pretrained-MITIE': Used Structural SVMs and word embeddings.
          Uses Dhillon et al's "eigenwords" word embeddings.
          See: https://github.com/mit-nlp/MITIE
        * 'pretrained-SENNA': Used multilayer perceptrons and the
          50-dimensional CW (2008) word embeddings.
          See: http://ml.nec-labs.com/senna/
        * 'pretrained-spacy': Used BILOU scheme; the algorithm is "a
          pastiche of well-known methods...a greedy transition-based
          parser guided by a linear model whose weights are learned using
          the averaged perceptron loss, via the dynamic oracle imitation
          strategy". See: https://spacy.io/docs/usage/entity-recognition.
          Using pre-trained model 'en_core_web_sm' here.
          NOTE: could try 'en_depent_web_md' instead.
    numclass : int
        The number of classes for the pre-trained classifier; this is
        relevant only when modelname is 'pretrained-StanfordNER'.

    Raises
    ------
    ValueError
        If ``modelname`` is unrecognized, or ``numclass`` is not 3/4/7
        for the StanfordNER backend.
    """
    self.pretrained_model = modelname
    # 'none' because a pre-trained model involves no transfer learning.
    self.transfer_method = 'none'
    if modelname == 'pretrained-StanfordNER':
        if numclass == 3:
            self.NER = StanfordNERTagger(
                STANFORD_MODEL_DIR +
                'english.all.3class.distsim.crf.ser.gz')
            # , STANFORD_CLASSPATH)  -- second argument intentionally disabled
            self.model['entity_types'] = ['LOC', 'ORG', 'PER']
            self.model['training_corpus'] = [
                'CONLL03 eng.train', 'MUC6 train', 'MUC7 train', 'ACE2002',
                'in-house data'
            ]
        elif numclass == 4:
            self.NER = StanfordNERTagger(
                STANFORD_MODEL_DIR +
                'english.conll.4class.distsim.crf.ser.gz')
            # , STANFORD_CLASSPATH)  -- second argument intentionally disabled
            self.model['entity_types'] = ['LOC', 'PER', 'ORG', 'MISC']
            self.model['training_corpus'] = ['CONLL03 eng.train']
        elif numclass == 7:
            self.NER = StanfordNERTagger(
                STANFORD_MODEL_DIR +
                'english.muc.7class.distsim.crf.ser.gz')
            # , STANFORD_CLASSPATH)  -- second argument intentionally disabled
            self.model['entity_types'] = [
                'LOC', 'ORG', 'PER', 'MISC',
                'MON',  # MONEY
                'PCT',  # PERCENT
                'DAT',  # DATE
                'TIM'   # TIME
            ]
            self.model['training_corpus'] = ['MUC6 train', 'MUC7 train']
        else:
            raise ValueError(
                'When using StanfordNER, numclass must be 3, 4 or 7.')
    elif modelname == 'pretrained-MITIE':
        self.NER = mitie.named_entity_extractor(MITIE_MODEL_DIR)
        self.model['entity_types'] = ['PER', 'LOC', 'ORG', 'MISC']
        # presumably the MITIE "eigenwords" corpus -- TODO confirm
        self.model['training_corpus'] = ['?']
    elif modelname == 'pretrained-SENNA':
        self.NER = SennaNERTagger(SENNA_DIR)
        self.model['entity_types'] = ['PER', 'LOC', 'ORG', 'MISC']
        self.model['training_corpus'] = ["?"]
    elif modelname == 'pretrained-spacy':
        # NOTE(review): the spaCy model itself is apparently loaded
        # elsewhere; only the entity-type table is recorded here.
        self.NER = None
        self.model['entity_types'] = [
            'PER',  # PERSON
            'NOR',  # NORP
            'FAC',  # FACILITY
            'ORG',  # ORGANIZATION
            'GPE',  # GEO-POLITICAL
            'LOC',  # LOCATION
            'PRO',  # PRODUCT
            'EVE',  # EVENT
            'WOR',  # WORK OF ART
            'LAN',  # LANGUAGE
            'DAT',  # DATE
            'TIM',  # TIME
            'PCT',  # PERCENT
            'MON',  # MONEY
            'QUA',  # QUANTITY
            'ORD',  # ORDINAL
            'CAR'   # CARDINAL
        ]
        self.model['training_corpus'] = ["?"]
    else:
        raise ValueError("Wrong modelname; must be 'pretrained-spacy',\
 'pretrained-SENNA', 'pretrained-MITIE',\
 or 'pretrained-StanfordNER'.")
def __init__(self):
    """Create the extractor with the bundled NE model and MITIE tokenizer."""
    self.tokenizer = tokenize
    self.extractor = named_entity_extractor(NamedEntityExtractor.NE_DATA)
def extract_entities(doc_iter, extract_field='body',
                     extracted_lang_field='body_lang',
                     extracted_translated_field="body_translated"):
    """Generator stage: annotate each document with MITIE entities.

    For every doc containing ``extract_field``, adds a ``doc["entities"]``
    dict with per-category entity lists; non-English docs additionally get
    entities extracted from the translated field. Every doc is yielded.
    """
    sys.path.append(".")
    from mitie import tokenize_with_offsets, named_entity_extractor
    # py2-only `print` statement converted to the function form.
    print("loading NER models...")
    ner_models = {}
    ner_models['en'] = named_entity_extractor('ner_model_english.dat')
    ner_models['es'] = named_entity_extractor('ner_model_spanish.dat')

    def entities(extracted_text, lang):
        # Strip non-ASCII so the text can be encoded for MITIE below.
        extracted_text = re.sub(r'[^\x00-\x7F]', ' ', extracted_text)
        extracted_text = extracted_text.replace("[:newline:]", " ")
        extracted_text = extracted_text.encode("ascii")
        tokens = tokenize_with_offsets(extracted_text)
        entities_markup = ner_models[lang].extract_entities(tokens)
        # results contains [(tag, entity, offsets, score)] with score kept
        # numeric -- the original stored "{0:.2f}".format(score) and then
        # compared that STRING against float thresholds (always-true in
        # py2, TypeError in py3).
        results = [(tag,
                    " ".join([tokens[i][0] for i in rng]),
                    ",".join([str(tokens[i][1]) for i in rng]),
                    score)
                   for rng, tag, score in entities_markup]
        entity_doc = {}
        entity_doc["entity_all"] = []
        entity_doc["entity_location"] = []
        entity_doc["entity_organization"] = []
        entity_doc["entity_person"] = []
        entity_doc["entity_misc"] = []
        for tag, entity, rng, score in results:
            if len(entity) > 30:
                continue
            entity_doc["entity_all"].append(entity)
            if tag == 'LOCATION' and score > 0.3:
                entity_doc["entity_location"].append(entity)
            elif tag == 'ORGANIZATION' and score > 0.5:
                entity_doc["entity_organization"].append(entity)
            elif tag == 'PERSON' and score > 0.3:
                entity_doc["entity_person"].append(entity)
            elif score > 0.5:
                entity_doc["entity_misc"].append(entity)
        return entity_doc

    for doc in doc_iter:
        doc_id = doc["id"]  # unused, but validates every doc carries an id
        if extract_field in doc:
            lang = doc.get(extracted_lang_field, 'en')
            # TODO Hack to ensure at least en is run
            lang = lang if lang in ner_models else 'en'
            mitie_entities = entities(doc[extract_field], lang)
            doc["entities"] = {extract_field + "_entities": mitie_entities}
            doc["entities"]["original_lang"] = lang
            doc["entities"][extract_field + "_entities_translated"] = {}
            # Now extract entities for any translated fields
            if not lang == 'en':
                mitie_entities = entities(doc[extracted_translated_field],
                                          'en')
                doc["entities"][extract_field +
                                "_entities_translated"] = mitie_entities
        # TODO do attachments here instead of in a seperate execution of this stage
        yield doc
def load(self, file_name):
    """Load a trained MITIE extractor from ``file_name`` into ``self.model``."""
    extractor = named_entity_extractor(file_name)
    self.model = extractor
def load(self):
    """Load the bundled MITIE NER model into ``self.ner``."""
    model_path = 'models/ner_model.dat'
    self.ner = mitie.named_entity_extractor(model_path)
def extract_entities(doc_iter, extract_field='body',
                     extracted_lang_field='body_lang',
                     extracted_translated_field="body_translated"):
    """Generator stage: annotate each document with MITIE entities.

    For every doc containing ``extract_field``, adds a ``doc["entities"]``
    dict with per-category entity lists; non-English docs additionally get
    entities extracted from the translated field. Every doc is yielded.
    """
    sys.path.append(".")
    from mitie import tokenize_with_offsets, named_entity_extractor
    # py2-only `print` statement converted to the function form.
    print("loading NER models...")
    ner_models = {}
    ner_models['en'] = named_entity_extractor(
        '/usr/src/app/static-data/ner_model_english.dat')
    ner_models['es'] = named_entity_extractor(
        '/usr/src/app/static-data/ner_model_spanish.dat')

    def entities(extracted_text, lang):
        # Strip non-ASCII so the text can be encoded for MITIE below.
        extracted_text = re.sub(r'[^\x00-\x7F]', ' ', extracted_text)
        extracted_text = extracted_text.replace("[:newline:]", " ")
        extracted_text = extracted_text.encode("ascii")
        tokens = tokenize_with_offsets(extracted_text)
        entities_markup = ner_models[lang].extract_entities(tokens)
        # results contains [(tag, entity, offsets, score)] with score kept
        # numeric -- the original stored "{0:.2f}".format(score) and then
        # compared that STRING against float thresholds (always-true in
        # py2, TypeError in py3).
        results = [(tag,
                    " ".join([tokens[i][0] for i in rng]),
                    ",".join([str(tokens[i][1]) for i in rng]),
                    score)
                   for rng, tag, score in entities_markup]
        entity_doc = {}
        entity_doc["entity_all"] = []
        entity_doc["entity_location"] = []
        entity_doc["entity_organization"] = []
        entity_doc["entity_person"] = []
        entity_doc["entity_misc"] = []
        for tag, entity, rng, score in results:
            if len(entity) > 30:
                continue
            entity_doc["entity_all"].append(entity)
            if tag == 'LOCATION' and score > 0.3:
                entity_doc["entity_location"].append(entity)
            elif tag == 'ORGANIZATION' and score > 0.5:
                entity_doc["entity_organization"].append(entity)
            elif tag == 'PERSON' and score > 0.3:
                entity_doc["entity_person"].append(entity)
            elif score > 0.5:
                entity_doc["entity_misc"].append(entity)
        return entity_doc

    for doc in doc_iter:
        doc_id = doc["id"]  # unused, but validates every doc carries an id
        if extract_field in doc:
            lang = doc.get(extracted_lang_field, 'en')
            # TODO Hack to ensure at least en is run
            lang = lang if lang in ner_models else 'en'
            mitie_entities = entities(doc[extract_field], lang)
            doc["entities"] = {extract_field + "_entities": mitie_entities}
            doc["entities"]["original_lang"] = lang
            doc["entities"][extract_field + "_entities_translated"] = {}
            # Now extract entities for any translated fields
            if not lang == 'en':
                mitie_entities = entities(doc[extracted_translated_field],
                                          'en')
                doc["entities"][extract_field +
                                "_entities_translated"] = mitie_entities
        # TODO do attachments here instead of in a seperate execution of this stage
        yield doc
def setup_mitie(mitie_directory, mitie_ner_model=None):
    """Given the location for MITIE and the model, create a
    named_entity_extractor object.

    :param mitie_directory: directory to append to ``sys.path`` so the
        ``mitie`` package can be imported
    :param mitie_ner_model: path to the trained NER model file. The original
        signature omitted this parameter and silently relied on a
        module-level ``mitie_ner_model`` global (NameError if absent); it is
        now accepted explicitly, with the global kept as a fallback for
        backward compatibility.
    """
    sys.path.append(mitie_directory)
    if mitie_ner_model is None:
        mitie_ner_model = globals().get('mitie_ner_model')
        if mitie_ner_model is None:
            raise ValueError(
                "mitie_ner_model was not provided and no module-level "
                "default is defined")
    ner_model = mitie.named_entity_extractor(mitie_ner_model)
    return ner_model
def __init__(self, metadata):
    """Build the MITIE extractor, intent classifier and tokenizer from the
    file paths recorded in ``metadata``."""
    extractor_path = metadata["entity_extractor"]
    classifier_path = metadata["intent_classifier"]
    self.extractor = named_entity_extractor(extractor_path)
    self.classifier = text_categorizer(classifier_path)
    self.tokenizer = MITIETokenizer()
# recognition and also how to run a binary relation detector on top of the
# named entity recognition outputs.
#
import sys, os
# Make sure you put the mitielib folder into the python search path. There are
# a lot of ways to do this, here we do it programmatically with the following
# two statements:
parent = os.path.dirname(os.path.realpath(__file__))
sys.path.append(parent + '/lib/MITIE/mitielib')
# py2-only `print` statements below converted to the function form so the
# script also runs on Python 3.
print(parent)

import mitie

print("loading NER model...")
ner = mitie.named_entity_extractor('lib/MITIE/english/ner_model.dat')
print("\nTags output by this NER model:", ner.get_possible_ner_tags())

# Load a text file and convert it into a list of words.
tokens = mitie.tokenize(mitie.load_entire_file('lib/MITIE/sample_text.txt'))
print("Tokenized input:", tokens)

entities = ner.extract_entities(tokens)
print("\nEntities found:", entities)
print("\nNumber of entities detected:", len(entities))
for e in entities:
    token_range = e[0]  # renamed from `range`, which shadowed the builtin
    tag = e[1]
    entity_text = " ".join(tokens[i] for i in token_range)
    print(" " + tag + ": " + entity_text)