def get_ir_dataset(serviceid, logger):
    """Build the intent-recognition (IR) training dataset for a service.

    Loads the service's corpus, marks each utterance as IR-trained, and —
    for utterances that already went through NER training — replaces each
    tagged token span with its upper-cased entity label so the IR model
    trains on entity placeholders instead of raw values.

    :param serviceid: id of the service whose corpus is loaded.
    :param logger: optional logger; falls back to the module logger.
    :return: tuple of (list of updated utterance rows,
             pandas.DataFrame with columns ["text", "intent"]).
    """
    logger = logger or logging.getLogger(__name__)
    data_manager = DatasourceManager()
    # Lazy %-style args: formatting is skipped when INFO is disabled.
    logger.info("Starting evaluation of service %s", serviceid)

    def get_training_data(row):
        # Per-row transform: returns (updated row, training text, intent).
        mapping = loads(get(row, "mapping"))
        intent = None
        if mapping is not None:
            intent = get(mapping, "intent")
        if intent is None:
            intent = "No intent"
        row["ir_trained"] = True
        text = get(row, "case_converted_utterance")
        rebuilt_tokens = []
        # Deliberately `== True` (not `is True`): also matches a stored 1.
        if get(row, "ner_trained") == True:
            tokens = text.split()
            tags = get(mapping, 'tags')
            prev_end = 0
            for tag in tags:
                start = get(tag, 'start')
                end = get(tag, 'end')
                label = get(tag, 'tag')
                for index, token_each in enumerate(tokens):
                    if ((index < start) and index >= prev_end):
                        # Untagged token between the previous span and this one.
                        rebuilt_tokens.append(token_each)
                    elif (index == start):
                        # Collapse the whole tagged span into its label.
                        rebuilt_tokens.append(label.upper())
                        prev_end = end
            if (prev_end < len(tokens)):
                # Trailing untagged tokens after the last span.
                rebuilt_tokens.extend(tokens[prev_end:len(tokens)])
            text = ' '.join(rebuilt_tokens)
            db_add_dict(serviceid, text)
        return row, text, intent

    query = {"serviceid": serviceid}
    corpus = data_manager.find_model(query)
    utterances = get(corpus, "utterances")
    results = list(map(get_training_data, utterances))
    trained_utterances = [items[0] for items in results]
    training_data = [(items[1], items[2]) for items in results]
    return trained_utterances, pd.DataFrame(training_data,
                                            columns=["text", "intent"])
def get_predefined_entities(serviceid):
    """Return the predefined entities configured for a service.

    :param serviceid: id of the service to look up.
    :return: list of predefined entities, or [] when none are configured.
    """
    datasource = DatasourceManager().find_model({"serviceid": serviceid})
    return get(datasource, "predefined_entities", default=[])
def train(self, train_intent): """ :param doc: :param n_test_percent: :return: """ manager = ProjectManager() query = {"serviceid": self.serviceid} config = manager.find_model(query) if config is not None: try: document = { "$set": { "ner.status": ProjectManager.STATUS_TRAINING, "ner.status_message": "Entity training is in progress.", "ner.last_trained": datetime.datetime.utcnow() } } if (train_intent is True): document = { "$set": { "ner.status": ProjectManager.STATUS_TRAINING, "ner.status_message": "Entity training is in progress.", "ir.status": ProjectManager.STATUS_HOLD, "ir.status_message": "Awaiting the completion of entity training.", "ner.last_trained": datetime.datetime.utcnow() } } manager.update_config(query, document) # starting actual training data_manager = DatasourceManager() self.logger.info("Starting training of service %s" % self.serviceid) corpus = data_manager.find_model(query) custom_entity_model = get(config, "custom_entity_model") entity_recognizer = self.instantiate_trainer( custom_entity_model) trained_utterances = entity_recognizer.train(corpus) if entity_recognizer.get_engine( ) not in get_all_corenlp_engines(): VerbisStore().save_ner(entity_recognizer, model_type=MODEL_TYPE_NER) ###############MINIOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOO################## # send file to minio server VerbisStore().save_ner_minio(entity_recognizer, model_type=MODEL_TYPE_NER) document = { "$set": { "utterances": trained_utterances, } } data_manager.update_datasource(query, document) document = { "$set": { "ner.status": ProjectManager.STATUS_TRAINED, "ner.status_message": "Entity training completed successfully.", "ner.logs.train": "" } } manager.update_config(query, document) self.logger.info( "Completed training entity recognizer for service %s" % self.serviceid) except (RuntimeError, Exception) as ex: self.logger.exception(ex, exc_info=True) self.logger.error(traceback.format_exc()) if ex == "Cannot have number of folds n_folds=3 greater than the number of 
samples: 2.": ex = "Add more utterances for entity training" document = { "$set": { "ner.status": ProjectManager.STATUS_TRAINING_FAILED, "ner.status_message": ex, "ner.logs.train": self.logger.handlers[-1].logs } } manager.update_config(query, document) else: description = 'Unable to find project_config with given id.' \ 'Please check your request params and retry' self.logger.error(description)