def _prepare_mitie_sample(training_example: Message) -> Any:
    """Prepare a message so that it can be passed to a MITIE trainer.

    A ``mitie.ner_training_instance`` is built from the texts of the message's
    tokens. Each annotated entity is mapped onto token positions and added to
    the instance; an entity that cannot be mapped, or that MITIE refuses, is
    reported with a warning and left out.

    Args:
        training_example: Message providing the text, tokens and entities.

    Returns:
        The MITIE training instance with all usable entities attached.
    """
    import mitie

    text = training_example.get(TEXT)
    tokens = training_example.get(TOKENS_NAMES[TEXT])
    sample = mitie.ner_training_instance([t.text for t in tokens])

    for ent in training_example.get(ENTITIES, []):
        try:
            # if the token is not aligned an exception will be raised
            start, end = MitieEntityExtractor.find_entity(ent, text, tokens)
        except ValueError as e:
            rasa.shared.utils.io.raise_warning(
                f"Failed to use example '{text}' to train MITIE "
                f"entity extractor. Example will be skipped."
                f"Error: {e}"
            )
            continue

        try:
            # mitie will raise an exception on malicious
            # input - e.g. on overlapping entities
            sample.add_entity(list(range(start, end)), ent["entity"])
        except Exception as e:
            # Name the entity that was rejected; the exception text is
            # already reported as the reason.
            rasa.shared.utils.io.raise_warning(
                f"Failed to add entity example "
                f"'{str(ent)}' of sentence '{str(text)}'. "
                f"Example will be ignored. Reason: "
                f"{e}"
            )
            continue

    return sample
def _prepare_mitie_sample(self, training_example: Message) -> Any:
    """Convert one training message into a MITIE NER training instance.

    The instance is created from the texts of the tokens returned by
    ``self._tokens_without_cls``. Each annotated entity is mapped onto token
    positions and added; an entity that cannot be mapped, or that MITIE
    refuses, is reported with a warning and left out.

    Args:
        training_example: Message providing the text, tokens and entities.

    Returns:
        The MITIE training instance with all usable entities attached.
    """
    import mitie

    text = training_example.text
    tokens = self._tokens_without_cls(training_example)
    sample = mitie.ner_training_instance([t.text for t in tokens])

    for ent in training_example.get(ENTITIES_ATTRIBUTE, []):
        try:
            # if the token is not aligned an exception will be raised
            start, end = MitieEntityExtractor.find_entity(ent, text, tokens)
        except ValueError as e:
            raise_warning(
                f"Failed to use example '{text}' to train MITIE "
                f"entity extractor. Example will be skipped."
                f"Error: {e}"
            )
            continue

        try:
            # mitie will raise an exception on malicious
            # input - e.g. on overlapping entities
            sample.add_entity(list(range(start, end)), ent["entity"])
        except Exception as e:
            # Name the entity that was rejected; the exception text is
            # already reported as the reason.
            raise_warning(
                f"Failed to add entity example "
                f"'{str(ent)}' of sentence '{str(text)}'. "
                f"Example will be ignored. Reason: "
                f"{e}"
            )
            continue

    return sample
def get_sample(data):
    """Build a MITIE NER training instance from one training record.

    The case-converted utterance is split on whitespace to form the tokens.
    The JSON-encoded ``mapping`` entry supplies the tags; every tag whose
    label is not in ``predefined_tags``, ``patterns`` or ``phrases`` is added
    to the instance as an entity (upper-cased) and recorded in ``label_list``.

    Args:
        data: Mapping holding at least the ``mapping`` and ``utterance`` keys.
            ``data['ner_trained']`` is set to reflect the outcome.

    Returns:
        ``(sample, data)`` when the instance was built, otherwise
        ``(None, data)`` with ``data['ner_trained']`` set to ``False``.

    Raises:
        AssertionError: If ``mapping`` or ``utterance`` is missing from
            ``data``.
    """
    assert 'mapping' in data, "Token mapping missing from training data"
    assert "utterance" in data, "Utterance text missing from training data"
    try:
        utterance = get(data, "case_converted_utterance")
        logger.debug("Preparing utterance: %s", utterance)
        mapping = json.loads(get(data, "mapping"))
        assert "tags" in mapping, "Tags missing from training data"
        tags = get(mapping, 'tags')
        tokens = utterance.split()
        sample = ner_training_instance(tokens)
        for tag in tags:
            start = get(tag, 'start')
            end = get(tag, 'end')
            label = get(tag, 'tag')
            label = label.encode('utf-8')
            entity_label = label.upper()
            ignore_tag = (
                entity_label in predefined_tags
                or label in patterns
                or label in phrases
            )
            if ignore_tag:
                continue
            assert all(v is not None for v in [start, end, label]), \
                "Missing information for adding entities to training"
            logger.info("Adding entity: %s", label)
            logger.info("Start range: %s", start)
            logger.info("End range: %s", end)
            sample.add_entity(range(start, end), entity_label)
            if entity_label not in label_list:
                label_list.append(entity_label)
        logger.info("label_list %s", label_list)
        data['ner_trained'] = True
        return sample, data
    except Exception:
        # Best effort: a record that cannot be converted is flagged rather
        # than aborting the caller, but the cause is logged instead of being
        # dropped silently.
        logger.exception("Failed to build NER training sample")
        data['ner_trained'] = False
        return None, data
def train(self, training_data, config, **kwargs):
    # type: (TrainingData, RasaNLUConfig) -> None
    """Train a MITIE NER model on the given training examples.

    A MITIE training instance is created per example from its tokens. Each
    entity that can be mapped onto token positions and that MITIE accepts is
    added; the others are logged and skipped. ``self.ner`` is only assigned
    when at least one entity was added.

    Args:
        training_data: Provides ``training_examples`` to iterate over.
        config: Provides ``mitie_file`` (feature extractor path) and
            ``num_threads``.
        **kwargs: Unused.
    """
    import mitie

    trainer = mitie.ner_trainer(config["mitie_file"])
    trainer.num_threads = config["num_threads"]
    found_one_entity = False
    for example in training_data.training_examples:
        text = example.text
        tokens = example.get("tokens")
        sample = mitie.ner_training_instance([t.text for t in tokens])
        for ent in example.get("entities", []):
            try:
                # if the token is not aligned an exception will be raised
                start, end = MitieEntityExtractor.find_entity(
                    ent, text, tokens)
            except ValueError as e:
                logger.warning("Example skipped: {}".format(str(e)))
                continue
            try:
                # mitie will raise an exception on malicious input -
                # e.g. on overlapping entities
                sample.add_entity(list(range(start, end)), ent["entity"])
            except Exception as e:
                # Name the rejected entity; the exception is the reason.
                logger.warning(
                    "Failed to add entity example '{}' of sentence '{}'. "
                    "Reason: {}".format(str(ent), str(text), e))
                continue
            found_one_entity = True

        trainer.add(sample)

    # Mitie will fail to train if there is not a single entity tagged
    if found_one_entity:
        self.ner = trainer.train()
def train(self, training_data, config, **kwargs):
    # type: (TrainingData, RasaNLUConfig) -> None
    """Train a MITIE NER model on the entity examples.

    A MITIE training instance is created per example from its tokens. An
    entity that cannot be mapped onto token positions, or that MITIE refuses
    to add, is logged and skipped so that one bad annotation does not abort
    the whole run. ``self.ner`` is only assigned when at least one entity
    was added.

    Args:
        training_data: Provides ``entity_examples`` to iterate over.
        config: Provides ``mitie_file`` (feature extractor path) and
            ``num_threads``.
        **kwargs: Unused.
    """
    import mitie

    trainer = mitie.ner_trainer(config["mitie_file"])
    trainer.num_threads = config["num_threads"]
    found_one_entity = False
    for example in training_data.entity_examples:
        text = example.text
        tokens = example.get("tokens")
        sample = mitie.ner_training_instance([t.text for t in tokens])
        for ent in example.get("entities", []):
            try:
                start, end = MitieEntityExtractor.find_entity(
                    ent, text, tokens)
            except ValueError as e:
                logger.warning("Example skipped: {}".format(str(e)))
                continue
            try:
                # MITIE signals a rejected entity with a plain exception;
                # skip that entity and keep training.
                sample.add_entity(list(range(start, end)), ent["entity"])
            except Exception as e:
                logger.warning(
                    "Failed to add entity example '{}' of sentence '{}'. "
                    "Reason: {}".format(str(ent), str(text), e))
                continue
            found_one_entity = True

        trainer.add(sample)

    # Mitie will fail to train if there is not a single entity tagged
    if found_one_entity:
        self.ner = trainer.train()
def _prepare_mitie_sample(self, training_example) -> Any:
    """Convert one training example into a MITIE NER training instance.

    The instance is created from the texts of the example's tokens. Each
    annotated entity is mapped onto token positions and added; an entity
    that cannot be mapped, or that MITIE refuses, triggers a warning and is
    left out.

    Args:
        training_example: Example providing the text, tokens and entities.

    Returns:
        The MITIE training instance with all usable entities attached.
    """
    import mitie

    text = training_example.text
    tokens = training_example.get(TOKENS_NAMES[TEXT_ATTRIBUTE])
    sample = mitie.ner_training_instance([t.text for t in tokens])

    for ent in training_example.get(ENTITIES_ATTRIBUTE, []):
        try:
            # if the token is not aligned an exception will be raised
            start, end = MitieEntityExtractor.find_entity(ent, text, tokens)
        except ValueError as e:
            warnings.warn(f"Example skipped: {e}")
            continue

        try:
            # mitie will raise an exception on malicious
            # input - e.g. on overlapping entities
            sample.add_entity(list(range(start, end)), ent["entity"])
        except Exception as e:
            # Name the rejected entity; the exception is the reason.
            warnings.warn(
                "Failed to add entity example "
                f"'{str(ent)}' of sentence '{str(text)}'. Reason: "
                f"{e}"
            )
            continue

    return sample
def _prepare_mitie_sample(self, training_example):
    """Convert one training example into a MITIE NER training instance.

    The instance is created from the texts of the example's tokens. Each
    annotated entity is mapped onto token positions and added; an entity
    that cannot be mapped, or that MITIE refuses, is logged as a warning and
    left out.

    Args:
        training_example: Example providing the text, tokens and entities.

    Returns:
        The MITIE training instance with all usable entities attached.
    """
    import mitie

    text = training_example.text
    tokens = training_example.get("tokens")
    sample = mitie.ner_training_instance([t.text for t in tokens])

    for ent in training_example.get("entities", []):
        try:
            # if the token is not aligned an exception will be raised
            start, end = MitieEntityExtractor.find_entity(ent, text, tokens)
        except ValueError as e:
            logger.warning("Example skipped: {}".format(str(e)))
            continue

        try:
            # mitie will raise an exception on malicious
            # input - e.g. on overlapping entities
            sample.add_entity(list(range(start, end)), ent["entity"])
        except Exception as e:
            # Name the rejected entity; the exception is the reason.
            logger.warning(
                "Failed to add entity example "
                "'{}' of sentence '{}'. Reason: "
                "{}".format(str(ent), str(text), e)
            )
            continue

    return sample
def train(self, training_data, config, **kwargs):
    # type: (TrainingData, RasaNLUConfig) -> None
    """Train a MITIE NER model on the entity examples.

    A MITIE training instance is created per example from its tokens. Each
    entity that can be mapped onto token positions and that MITIE accepts is
    added; the others are logged and skipped. ``self.ner`` is only assigned
    when at least one entity was added.

    Args:
        training_data: Provides ``entity_examples`` to iterate over.
        config: Provides ``mitie_file`` (feature extractor path) and
            ``num_threads``.
        **kwargs: Unused.
    """
    import mitie

    trainer = mitie.ner_trainer(config["mitie_file"])
    trainer.num_threads = config["num_threads"]
    found_one_entity = False
    for example in training_data.entity_examples:
        text = example.text
        tokens = example.get("tokens")
        sample = mitie.ner_training_instance([t.text for t in tokens])
        for ent in example.get("entities", []):
            try:
                # if the token is not aligned an exception will be raised
                start, end = MitieEntityExtractor.find_entity(
                    ent, text, tokens)
            except ValueError as e:
                logger.warning("Example skipped: {}".format(str(e)))
                continue
            try:
                # mitie will raise an exception on malicious input -
                # e.g. on overlapping entities
                sample.add_entity(list(range(start, end)), ent["entity"])
            except Exception as e:
                # Name the rejected entity; the exception is the reason.
                logger.warning(
                    "Failed to add entity example '{}' of sentence '{}'. "
                    "Reason: {}".format(str(ent), str(text), e))
                continue
            found_one_entity = True

        trainer.add(sample)

    # Mitie will fail to train if there is not a single entity tagged
    if found_one_entity:
        self.ner = trainer.train()
def add_phrase(self, phrase):
    """Register ``phrase`` with the trainer as one MITIE training instance.

    The instance is built from ``phrase.tokens()``; every ``(index, tag)``
    pair yielded by ``phrase.entities()`` is added to it as an entity before
    the instance is handed to ``self.trainer``.
    """
    logging.info("add phrase %s" % phrase)
    instance = mitie.ner_training_instance(list(phrase.tokens()))
    for position, label in phrase.entities():
        logging.info("%s at position %s" % (label, position))
        instance.add_entity(position, label)
    self.trainer.add(instance)
def train_entity_extractor(entity_examples, fe_file, max_num_threads):
    """Train and return a MITIE entity extractor.

    Args:
        entity_examples: Iterable of dicts with a ``text`` string and an
            ``entities`` list; each entity dict carries an ``entity`` label.
        fe_file: Path passed to ``ner_trainer`` (the feature extractor file).
        max_num_threads: Value assigned to the trainer's ``num_threads``.

    Returns:
        The result of ``trainer.train()``.
    """
    trainer = ner_trainer(fe_file)
    trainer.num_threads = max_num_threads
    for example in entity_examples:
        text = example["text"]
        tokens = tokenize(text)
        sample = ner_training_instance(tokens)
        for ent in example["entities"]:
            start, end = find_entity(ent, text)
            # range() is available on both Python 2 and 3; xrange() is not.
            sample.add_entity(list(range(start, end)), ent["entity"])
        trainer.add(sample)
    return trainer.train()
def train(self, training_data, mitie_file, num_threads):
    # type: (TrainingData, str, Optional[int]) -> None
    """Fit a MITIE NER model on ``training_data.entity_examples``.

    Each example's text is tokenized with MITIE's ``tokenize`` and every
    entity is added to the example's training instance. ``self.ner`` is only
    assigned when at least one entity was added.

    Args:
        training_data: Provides ``entity_examples`` (dicts with ``text`` and
            ``entities``).
        mitie_file: Path passed to ``ner_trainer``.
        num_threads: Value assigned to the trainer's ``num_threads``.
    """
    from mitie import ner_training_instance, ner_trainer, tokenize

    mitie_trainer = ner_trainer(mitie_file)
    mitie_trainer.num_threads = num_threads

    has_entity = False
    for entry in training_data.entity_examples:
        sentence = entry["text"]
        instance = ner_training_instance(tokenize(sentence))
        for entity in entry["entities"]:
            first, stop = MitieEntityExtractor.find_entity(entity, sentence)
            token_positions = list(range(first, stop))
            instance.add_entity(token_positions, entity["entity"])
            has_entity = True
        mitie_trainer.add(instance)

    if not has_entity:
        # Mitie will fail to train if there is not a single entity tagged
        return
    self.ner = mitie_trainer.train()
def train(self, training_data, config, **kwargs):
    # type: (TrainingData, RasaNLUConfig) -> None
    """Fit a MITIE NER model on ``training_data.entity_examples``.

    A training instance is built per example from its tokens and every
    entity is added at the token range returned by ``find_entity``.
    ``self.ner`` is only assigned when at least one entity was added.

    Args:
        training_data: Provides ``entity_examples`` to iterate over.
        config: Provides ``mitie_file`` and ``num_threads``.
        **kwargs: Unused.
    """
    import mitie

    mitie_trainer = mitie.ner_trainer(config["mitie_file"])
    mitie_trainer.num_threads = config["num_threads"]

    has_entity = False
    for entry in training_data.entity_examples:
        sentence = entry.text
        entry_tokens = entry.get("tokens")
        words = [token.text for token in entry_tokens]
        instance = mitie.ner_training_instance(words)
        for entity in entry.get("entities", []):
            first, stop = MitieEntityExtractor.find_entity(
                entity, sentence, entry_tokens)
            token_positions = list(range(first, stop))
            instance.add_entity(token_positions, entity["entity"])
            has_entity = True
        mitie_trainer.add(instance)

    if not has_entity:
        # Mitie will fail to train if there is not a single entity tagged
        return
    self.ner = mitie_trainer.train()
def train(self):
    """Train a MITIE NER model from ``data/training.json`` and save it.

    Each entry of the file's ``samples`` list is tokenized with
    ``spacy_nlp`` and its entities are added over the token range
    ``start``..``stop`` with label ``type``. The trained model is stored in
    ``self.ner`` and written to ``models/ner_model.dat``.
    """
    with open('data/training.json') as training_file:
        training = json.load(training_file)

    examples = []
    for sample in training['samples']:
        example = mitie.ner_training_instance(
            [token.text for token in spacy_nlp(sample['text'])])
        for entity in sample['entities']:
            example.add_entity(range(entity['start'], entity['stop']),
                               entity['type'])
        examples.append(example)

    try:
        trainer = mitie.ner_trainer(
            "models/total_word_feature_extractor.dat")
    except Exception:
        # Fall back to the alternative location of the feature extractor.
        # Catching Exception (not a bare except) lets KeyboardInterrupt and
        # SystemExit propagate.
        trainer = mitie.ner_trainer(
            "botkit/models/total_word_feature_extractor.dat")
    trainer.num_threads = 2

    for example in examples:
        trainer.add(example)
    self.ner = trainer.train()

    if not os.path.exists('models'):
        os.mkdir('models')
    self.ner.save_to_disk("models/ner_model.dat")
def _prepare_mitie_sample(self, training_example):
    """Build the MITIE NER training instance for one training example.

    Token texts of the example form the instance. Each annotated entity is
    mapped onto token positions and added; an entity that cannot be mapped,
    or that MITIE refuses, is logged as a warning and left out.

    Args:
        training_example: Example providing the text, tokens and entities.

    Returns:
        The MITIE training instance with all usable entities attached.
    """
    import mitie

    text = training_example.text
    tokens = training_example.get("tokens")
    sample = mitie.ner_training_instance([t.text for t in tokens])

    for ent in training_example.get("entities", []):
        try:
            # if the token is not aligned an exception will be raised
            start, end = MitieEntityExtractor.find_entity(
                ent, text, tokens)
        except ValueError as e:
            logger.warning("Example skipped: {}".format(str(e)))
            continue

        try:
            # mitie will raise an exception on malicious
            # input - e.g. on overlapping entities
            sample.add_entity(list(range(start, end)), ent["entity"])
        except Exception as e:
            # Name the rejected entity; the exception is the reason.
            logger.warning("Failed to add entity example "
                           "'{}' of sentence '{}'. Reason: "
                           "{}".format(str(ent), str(text), e))
            continue

    return sample