def encrypt_missed_utterance():
    """Encrypt every stored missed utterance across all datasource documents.

    Iterates over all model documents, encrypts each entry of the
    document's ``missedUtterances`` list and writes the encrypted list
    back keyed by the document's ``serviceid``.

    NOTE: the original docstring ("uncache model if their last date of
    access more than 60 days") described unrelated behavior and was
    replaced — the code below only encrypts missed utterances.
    """
    try:
        manager = DatasourceManager()
        for document in manager.find_all_model({}):
            try:
                missed_utterances = get(document, 'missedUtterances', [])
                serviceid = document['serviceid']
                if missed_utterances:
                    encrypted = [encrypt(item) for item in missed_utterances]
                    manager.encrypt_missed_utterances(serviceid, encrypted)
                    print('successfully updated')
                else:
                    print('no missed_utterances, empty list')
            except Exception as e:
                # Best-effort per document: one bad record must not stop the batch.
                print("exception : ", e)
    except Exception as e:
        print("exception : ", e)
def data_generator(serviceid=None):
    """Return the list of utterance strings stored for *serviceid*.

    Returns ``None`` (after logging) when the datasource is missing or
    malformed.
    """
    try:
        datasource = DatasourceManager().find_datasource_by_service_id(
            serviceid)
        return [entry.get('utterance')
                for entry in datasource.get('utterances')]
    except Exception as e:
        logger.exception(e)
def delete_project_if_exists(given_serviceid):
    """Delete the project and its datasource for *given_serviceid*, if present."""
    query = {"serviceid": given_serviceid}
    pr_manager = ProjectManager()
    if pr_manager.exists(query):
        pr_manager.delete(query)
        logger.info("project deleted from PR DB.")
    ds_manager = DatasourceManager()
    if ds_manager.exists(query):
        ds_manager.delete(query)
        logger.info("project deleted from DS DB.")
def get_datasource(project_id):
    """Fetch the datasource document linked to *project_id*.

    :param project_id: project identifier used to resolve the datasource id
    :return: the datasource document restricted to the training fields
    """
    datasource_id = get_datasource_id(project_id)['datasource']
    # Project only the fields consumers of this function need; drop _id.
    projection = {
        "_id": 0,
        "serviceid": 1,
        "utterances": 1,
        "entities": 1,
        "intents": 1,
        "patterns": 1,
        "phrases": 1,
        "synonyms": 1,
    }
    return DatasourceManager().find({"_id": datasource_id}, projection)
def db_add_dict(serviceid, text):
    """Merge the distinct tokens of *text* into the service's token list.

    Stop words and single-character tokens are removed before the merge;
    the stored ``distinct_token_list`` is de-duplicated afterwards.
    """
    ds_manager = DatasourceManager()
    project_config = ProjectManager().find_config_by_id(serviceid)
    language_code = get(project_config, "language", "EN")
    corpus = ds_manager.find_datasource_by_service_id(serviceid)
    token_list = get(corpus, "distinct_token_list")
    if token_list is None:
        token_list = []
    cleaned_text = stopword_remover(text, language_code)
    token_list.extend(set(remove_single_character_tokens(cleaned_text)))
    update_document = {
        "$set": {
            "distinct_token_list": list(set(token_list)),
        }
    }
    ds_manager.update_datasource_by_service_id(serviceid, update_document)
def get_ir_dataset(serviceid, logger):
    """Build the intent-recognition training set for *serviceid*.

    :param serviceid: service whose corpus is read from the datasource DB
    :param logger: logger to use; falls back to the module logger when falsy
    :return: ``(trained_utterances, dataframe)`` where the dataframe has
        columns ``text`` and ``intent``
    """
    logger = logger or logging.getLogger(__name__)
    data_manager = DatasourceManager()
    logger.info("Starting evaluation of service %s" % serviceid)

    def get_training_data(row):
        # Map one stored utterance row to (row, training_text, intent).
        mapping = loads(get(row, "mapping"))
        intent = None
        if mapping is not None:
            intent = get(mapping, "intent")
        if intent is None:
            intent = "No intent"
        row["ir_trained"] = True
        text = get(row, "case_converted_utterance")
        rebuilt = []
        # NOTE: deliberately `== True`, not `is True` — the stored flag may
        # be a truthy non-bool (e.g. 1) and equality preserves that behavior.
        if get(row, "ner_trained") == True:
            # Collapse each tagged token span into its upper-cased entity label.
            tokens = text.split()
            tags = get(mapping, 'tags')
            prev_end = 0
            for tag in tags:
                start = get(tag, 'start')
                end = get(tag, 'end')
                label = get(tag, 'tag')
                for index, token in enumerate(tokens):
                    if index < start and index >= prev_end:
                        rebuilt.append(token)
                    elif index == start:
                        rebuilt.append(label.upper())
                prev_end = end
            if prev_end < len(tokens):
                rebuilt.extend(tokens[prev_end:])
            text = ' '.join(rebuilt)
        db_add_dict(serviceid, text)
        return row, text, intent

    corpus = data_manager.find_model({"serviceid": serviceid})
    utterances = get(corpus, "utterances")
    results = [get_training_data(row) for row in utterances]
    trained_utterances = [item[0] for item in results]
    training_data = [(item[1], item[2]) for item in results]
    return trained_utterances, pd.DataFrame(training_data,
                                            columns=["text", "intent"])
def test_update_datasource_api1(client, mocker):
    """POST /api/parse/update_datasource answers 200 with a mocked manager."""
    mocker.patch('falcon.request.Request.__init__', mock_init)
    mocker.patch(
        'ice_rest.rest.services.parse.update_datasource.DatasourceManager',
        return_value=DatasourceManager())
    mocker.patch(
        'ice_rest.rest.services.parse.update_datasource.DatasourceManager.update_datasource',
        return_value=None)
    response = client.simulate_post(
        '/api/parse/update_datasource',
        headers={"Content-Type": "application/json"})
    assert response.status_code == 200
def on_post(self, req, resp):
    """Return paraphrases of ``doc['text']`` not already present as utterances.

    Calls the external variations service, strips out any paraphrase that
    matches an existing utterance or the input text itself, and writes the
    de-duplicated remainder to ``resp.data``.
    """
    doc = req.context['doc'] or {}
    try:
        config = DatasourceManager().find_datasource_by_service_id(
            doc['serviceid'])
        utterances = get(config, "utterances", [])
        response = requests.post(
            url=app_config['VARIATIONS_END_POINT'],
            json={'text': doc['text']},
            headers={'Content-type': 'application/json'})
        if response.status_code != 200:
            resp.data = []
        else:
            paraphrases = ast.literal_eval(response.text)
            # Drop the first occurrence of each known utterance.
            for entry in utterances:
                if entry['utterance'] in paraphrases:
                    paraphrases.remove(entry['utterance'])
            # Skip the first remaining element, then de-duplicate.
            paraphrases = list(set(paraphrases[1:]))
            if doc['text'] in paraphrases:
                paraphrases.remove(doc['text'])
            resp.data = paraphrases
    except requests.exceptions.RequestException:
        # Variations service unreachable: degrade to an empty result.
        resp.data = []
    except InsufficientDataError as ide:
        raise ide
    except AssertionError as ae:
        logger.exception(str(ae))
        raise falcon.HTTPBadRequest('Invalid Configuration', str(ae))
    except Exception as ex:
        logger.exception(str(ex))
        description = 'Internal Server Error, Please try again later'
        raise falcon.HTTPServiceUnavailable('Service Outage', description, 30)
    resp.set_header('X-Powered-By', 'USTGlobal Verbis')
    resp.status = falcon.HTTP_200
def get_predefined_entities(serviceid):
    """Return the service's predefined entities, or ``[]`` when absent."""
    document = DatasourceManager().find_model({"serviceid": serviceid})
    return get(document, "predefined_entities", default=[])
import logging from pydash import get from ice_commons.data.dl.manager import DatasourceManager, ProjectManager from ice_rest.rest.services.parse.impl.common.missed_utterances_impl import missedUtterences from ice_rest.rest.services.parse.impl.common.store_utils import get_model_store, get_requested_services from ice_commons.utility.custom_tokenizer import tokenize_utterance from ice_commons.core.model_utils import get_engine from ice_commons.utils import MODEL_TYPE_IR, MODEL_TYPE_NER import re from collections import OrderedDict logger = logging.getLogger(__name__) project_manager = ProjectManager() datasource_manager = DatasourceManager() def get_proba(intent_list): prob = {} for intent_each in intent_list: if intent_each['name'] != "No intent": prob[intent_each['name']] = "0.0%" prob["No intent"] = "100.0%" return prob def updateDatasource(serviceid, missed_text): """ :param serviceid: :param missed_text: :return:
def deploy(self):
    """Train and persist the intent-recognition (IR) model for this service.

    Loads the project config, builds the IR dataset, fits an
    ``IntentRecognizer``, saves it locally and to the minio object store,
    and records training progress/outcome in the project config.
    """
    manager = ProjectManager()
    query = {"serviceid": self.serviceid}
    config = manager.find_model(query)
    if config is None:
        description = 'Unable to find project_config with given id.' \
            'Please check your request params and retry'
        self.logger.error(description)
        return
    try:
        trained_data, df = get_ir_dataset(self.serviceid, self.logger)
        self.logger.info("Unique labels %s" % np.unique(df.intent.tolist()))
        group = df.groupby(['intent']).agg('count')
        stats = group.reset_index().to_json(orient="records")
        # A selector is only meaningful with more than one intent class.
        useSelector = len(group) > 1
        self.logger.info(stats)
        document = {
            "$set": {
                "ir.status": ProjectManager.STATUS_TRAINING,
                "ir.status_message": "Intent training is in progress.",
                "ir.dataset.stats": stats,
                "ir.last_trained": datetime.datetime.utcnow()
            }
        }
        manager.update_config(query, document)
        ir = IntentRecognizer(DEFAULT_CONFIG, serviceid=self.serviceid,
                              useSelector=useSelector)
        self.logger.info("Starting fitting for deployment")
        ir.fit(df, df.intent)
        self.logger.info("Fitting for deployment completed")
        VerbisStore().save_ir(ir)
        # Also push the serialized model (.dat, no engine suffix) to minio.
        VerbisStore().save_ir_minio(ir)
        data_manager = DatasourceManager()
        data_manager.update_datasource(query,
                                       {"$set": {"utterances": trained_data}})
        document = {
            "$set": {
                "ir.status": ProjectManager.STATUS_TRAINED,
                "ir.status_message": "The Intent model has been successfully trained",
                "ir.logs.deploy": ""
            }
        }
        manager.update_config(query, document)
    except (RuntimeError, ValueError, Exception) as e:
        self.logger.error(e)
        # BUG FIX: the original compared the exception OBJECT to strings
        # (always False), so the friendly messages were never used and the
        # raw exception object was written to the status document. Compare
        # and store the exception message instead.
        message = str(e)
        if message in (
                "After pruning, no terms remain. Try a lower min_df or a higher max_df.",
                "max_df corresponds to < documents than min_df"):
            message = "Sufficient vocabulary to build the model is not available. Please add more utterances."
        elif message == "Invalid type float for labels":
            message = "Add more intents for intent training"
        document = {
            "$set": {
                "ir.status": ProjectManager.STATUS_TRAINING_FAILED,
                "ir.status_message": message,
                "ir.logs.deploy": self.logger.handlers[-1].logs
            }
        }
        manager.update_config(query, document)
        traceback.print_exc()
def train(self, train_intent):
    """Train and persist the entity-recognition (NER) model for this service.

    :param train_intent: when True, intent training follows this run, so
        the IR status is put on hold while entities train.
    """
    manager = ProjectManager()
    query = {"serviceid": self.serviceid}
    config = manager.find_model(query)
    if config is None:
        description = 'Unable to find project_config with given id.' \
            'Please check your request params and retry'
        self.logger.error(description)
        return
    try:
        status_fields = {
            "ner.status": ProjectManager.STATUS_TRAINING,
            "ner.status_message": "Entity training is in progress.",
            "ner.last_trained": datetime.datetime.utcnow()
        }
        if train_intent is True:
            # Intent training waits until entity training completes.
            status_fields["ir.status"] = ProjectManager.STATUS_HOLD
            status_fields["ir.status_message"] = \
                "Awaiting the completion of entity training."
        manager.update_config(query, {"$set": status_fields})
        # Start the actual training run.
        data_manager = DatasourceManager()
        self.logger.info("Starting training of service %s" % self.serviceid)
        corpus = data_manager.find_model(query)
        custom_entity_model = get(config, "custom_entity_model")
        entity_recognizer = self.instantiate_trainer(custom_entity_model)
        trained_utterances = entity_recognizer.train(corpus)
        if entity_recognizer.get_engine() not in get_all_corenlp_engines():
            # CoreNLP engines manage their own persistence; only store
            # models for the other engines (locally and in minio).
            VerbisStore().save_ner(entity_recognizer,
                                   model_type=MODEL_TYPE_NER)
            VerbisStore().save_ner_minio(entity_recognizer,
                                         model_type=MODEL_TYPE_NER)
        data_manager.update_datasource(
            query, {"$set": {"utterances": trained_utterances}})
        document = {
            "$set": {
                "ner.status": ProjectManager.STATUS_TRAINED,
                "ner.status_message": "Entity training completed successfully.",
                "ner.logs.train": ""
            }
        }
        manager.update_config(query, document)
        self.logger.info(
            "Completed training entity recognizer for service %s"
            % self.serviceid)
    except (RuntimeError, Exception) as ex:
        self.logger.exception(ex, exc_info=True)
        self.logger.error(traceback.format_exc())
        # BUG FIX: the original compared the exception OBJECT to a string
        # (always False), so the friendly message was never substituted and
        # the raw exception object was written to the status document.
        message = str(ex)
        if message == ("Cannot have number of folds n_folds=3 greater "
                       "than the number of samples: 2."):
            message = "Add more utterances for entity training"
        document = {
            "$set": {
                "ner.status": ProjectManager.STATUS_TRAINING_FAILED,
                "ner.status_message": message,
                "ner.logs.train": self.logger.handlers[-1].logs
            }
        }
        manager.update_config(query, document)