def process_data(domain: Text, locale: Text) -> Tuple[List[Text], List[Text]]:
    global utterance
    global intent
    # clear the lists before loading
    utterance.clear()
    intent.clear()
    try:
        with codecs.open(
                os.path.join(datapath, domain + '_' + locale + ".md"), 'r',
                'utf-8') as file:
            lines = file.read().split("\n")
        log_util.loginfomsg(
            f"[MARKDOWN] received data, total lines: {len(lines)}")
        for line in lines:
            line = line.strip()
            header = find_section_header(line)
            if header:
                set_current_section(header[0], header[1])
            else:
                parse_item(line)
        return utterance, intent
    except FileNotFoundError:
        log_util.logerrormsg(
            f"[MARKDOWN] no file found for domain {domain}, ensure that the "
            "data is provided in .md format.")
        # the caller unpacks two lists and treats empty ones as a parse
        # failure, so return the (still empty) globals; the original error
        # payload put an f-string inside a JSON literal, which is never
        # interpolated, and the raise after the return was unreachable
        return utterance, intent
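# A hypothetical example of the markdown layout process_data expects,
# assuming find_section_header/parse_item (defined elsewhere in this
# module) recognize Rasa-style "## intent:<name>" headers followed by
# "-"-prefixed utterances:
#
#   ## intent:greet
#   - hello
#   - good morning
#
#   ## intent:check_balance
#   - what is my account balance
#
# Under that assumption, process_data('banking', 'en') would return
# (['hello', 'good morning', 'what is my account balance'],
#  ['greet', 'greet', 'check_balance']).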
def predict_query():
    # abort when either required parameter is missing; the original `or`
    # only fired when both were absent
    if not (request.args.get('domain') and request.args.get('userUtterance')):
        log_util.logerrormsg("[APP] missing parameters")
        abort(404)
    if request.args.get('locale'):
        locale = request.args.get('locale')
    else:
        locale = 'en'
    utter = request.args.get('userUtterance')
    if locale == 'en':
        # keep only letters and spaces for English input
        utterance = re.sub(r'[^a-zA-Z ]', '', utter)
    else:
        # non-English input passes through unchanged; without this branch
        # `utterance` would be undefined for other locales
        utterance = utter
    domain = request.args.get('domain')
    if nlp_config.getParameter('ALGORITHM') == 'TFIDF':
        model = 'TFIDF'  # the original assigned `md` here, leaving `model` undefined
    else:
        algo = os.path.splitext(nlp_config.getParameter('CONFIG_FILE'))[0]
        algo = algo.split("_")[1].upper()
        model = 'NLU:' + algo
    res = {
        "messageId": "PREDICT",
        "domain": domain,
        "locale": locale,
        "userUtterance": utterance,
        "model": model,
        "message": predict_model.predict(domain, locale, utterance)
    }
    return make_response(
        jsonify(res), 200,
        {'Content-Type': 'application/json; charset=utf-8'})
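# An illustrative exchange with this handler, assuming it is registered on
# a GET route such as /predict (route, host, and values below are
# assumptions, not taken from the project):
#
#   curl "http://localhost:5000/predict?domain=banking&locale=en&userUtterance=check+my+balance"
#
#   {"messageId": "PREDICT", "domain": "banking", "locale": "en",
#    "userUtterance": "check my balance", "model": "TFIDF", "message": "..."}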
def set_current_section(section: Text, title: Text) -> None:
    """Update parsing mode."""
    if section not in available_sections:
        msg = (f"[MARKDOWN] found markdown section {section} which is not in "
               f"the allowed sections {', '.join(available_sections)}.")
        log_util.logerrormsg(msg)
        raise ValueError(msg)
    global current_section, current_intent
    current_section = section
    current_intent = title
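# The two helpers process_data relies on are not part of this excerpt; a
# minimal sketch of what they might look like, assuming the Rasa-style
# markdown shown earlier (the regexes and bodies are illustrative, not the
# project's actual implementation):
import re
from typing import Optional, Text, Tuple

def find_section_header(line: Text) -> Optional[Tuple[Text, Text]]:
    """Return (section, title) for a header such as '## intent:greet'."""
    match = re.search(r'##\s*(.+?):(.+)', line)
    if match:
        return match.group(1), match.group(2)
    return None

def parse_item(line: Text) -> None:
    """Collect one '- <utterance>' item under the current intent."""
    match = re.match(r'-\s*(.+)', line)
    if match and current_section == 'intent':
        utterance.append(match.group(1))
        intent.append(current_intent)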
def predict(domain, locale, userUtterance):
    response = json.loads(
        '{"response":"ERROR: error during predicting the user utterance"}')
    # the check must be called; the bare method reference is always truthy
    if not nlp_config.checkDataAvaialble():
        log_util.logerrormsg(
            "[PREDICT_MODEL] no intent data found. Exiting...")
        return json.loads(
            '{"response":"ERROR: no intent data found. Exiting..."}')
    if nlp_config.getParameter('ALGORITHM') == 'TFIDF':
        response = predict_tfidf.predict(domain, locale, userUtterance)
    elif nlp_config.getParameter('ALGORITHM') == 'NLU':
        response = predict_nlu.predict(domain, locale, userUtterance)
    else:
        log_util.logerrormsg(
            "[PREDICT_MODEL] configured algorithm is not supported. Exiting...")
    return response
def train(domain, locale):
    response = json.loads(
        '{"response":"ERROR: error during training the data"}')
    # the check must be called; the bare method reference is always truthy
    if not nlp_config.checkDataAvaialble():
        log_util.logerrormsg("[TRAIN_MODEL] no intent data found. Exiting...")
        return response
    if nlp_config.getParameter('ALGORITHM') == 'TFIDF':
        response = train_tfidf.train(domain, locale,
                                     nlp_config.getProperties())
    elif nlp_config.getParameter('ALGORITHM') == 'NLU':
        response = train_nlu.train(domain, locale, nlp_config.getProperties())
    else:
        log_util.logerrormsg(
            "[TRAIN_MODEL] configured algorithm is not supported. Exiting...")
    return response
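# The nlp_config keys these modules read, with illustrative values; the
# values below are assumptions for the sake of the example, not the
# project's defaults:
#
#   ALGORITHM        = TFIDF               # or NLU
#   CONFIG_FILE      = config_sklearn.yml  # NLU pipeline; "_sklearn" yields the model label NLU:SKLEARN
#   FORMAT           = md                  # or json / yml
#   VECTOR_DIMENSION = 300                 # cap on TruncatedSVD components
#   ITERATION_NUMBER = 7                   # TruncatedSVD n_iter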
def trainDomain():
    if not request.args.get('domain'):
        log_util.logerrormsg("[APP] missing domain parameter")
        abort(404)
    if request.args.get('locale'):
        locale = request.args.get('locale')
    else:
        locale = 'en'
    domain = request.args.get('domain')
    res = train_model.train(domain, locale)
    n = int(json.loads(res)["utterances"])
    if nlp_config.getParameter('ALGORITHM') == 'TFIDF':
        md = 'TFIDF'
    else:
        algo = os.path.splitext(nlp_config.getParameter('CONFIG_FILE'))[0]
        algo = algo.split("_")[1].upper()
        md = 'NLU:' + algo
    response = {
        "messageId": "TRAIN_SUCCESS" if n > 0 else "TRAIN_FAIL",
        "domain": domain,
        "locale": locale,
        "message": res,
        "model": md
    }
    return make_response(
        jsonify(response), 200,
        {'Content-Type': 'application/json; charset=utf-8'})
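# An illustrative exchange with this handler, assuming it is registered on
# a GET route such as /train (route, host, and counts are assumptions):
#
#   curl "http://localhost:5000/train?domain=banking&locale=en"
#
#   {"messageId": "TRAIN_SUCCESS", "domain": "banking", "locale": "en",
#    "message": "{\"intents\": \"5\", \"utterances\": \"120\", \"model\": \"TFIDF\"}",
#    "model": "TFIDF"}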
def train(domain, locale, prop):
    datapath = os.path.join(scriptDir, '..', '..', '..', 'trainingData',
                            'intents')
    vectorDimension = int(nlp_config.getParameter('VECTOR_DIMENSION'))
    iterationNumbers = int(nlp_config.getParameter('ITERATION_NUMBER'))
    format = nlp_config.getParameter('FORMAT')
    utterance = []
    intent = []
    if format == 'md':
        utterance, intent = process_data(domain, locale)
        if not utterance or not intent:
            log_util.logerrormsg(
                "[TRAIN_TFIDF] could not parse the markdown data. Exiting...")
            res = {"intents": "-1", "utterances": "-1"}
            return json.dumps(res)
    elif format == 'json':
        fileData = os.path.join(datapath, domain + '_' + locale + '.json')
        with codecs.open(fileData, 'r', 'utf-8') as dataFile:
            data = json.load(dataFile)
        for nameUtterances in data['tasks']:
            for utt in nameUtterances['utterances']:
                utterance.append(utt)
                intent.append(nameUtterances['name'])
    else:
        log_util.logerrormsg("[TRAIN_TFIDF] unsupported format. Exiting...")
        res = {"intents": "-1", "utterances": "-1"}
        return json.dumps(res)
    mIntent = set(intent)
    # if neither the config nor the training data changed, keep the
    # previously trained model
    if nlp_config.is_config_stale(domain, locale, prop):
        log_util.loginfomsg(
            "[TRAIN_TFIDF] no changes found to training data, using pre-trained model"
        )
        res = {"intents": str(len(mIntent)), "utterances": str(len(intent))}
        return json.dumps(res)
    # merge the project's locale-specific stop word list...
    stopListFile = os.path.join(scriptDir, '..', '..', 'dictionary',
                                'stopwords_' + locale + '.txt')
    stopWords = []
    with codecs.open(stopListFile, 'r', 'utf-8') as f:
        for line in f.read().split("\n"):
            for s_word in line.split(','):
                if s_word.strip() != "":
                    stopWords.append(s_word)
    extraStopWords = set(stopWords)
    # ...with NLTK's list where one exists; Hindi and Marathi rely on the
    # project's list alone
    nltkLanguage = {
        'ar': 'arabic',
        'da': 'danish',
        'en': 'english',
        'es': 'spanish',
        'nl': 'dutch',
        'sv': 'swedish'
    }
    if locale in nltkLanguage:
        stops = set(stopwords.words(nltkLanguage[locale])) | extraStopWords
    elif locale in ('hi', 'mr'):
        stops = extraStopWords
    else:
        res = {"intents": "0", "utterances": "0"}
        return json.dumps(res)
    stemmer.setLocale(locale)
    # the corpus belongs in fit_transform, not the constructor: the first
    # positional argument of TfidfVectorizer is `input`, not the data
    tfidfVec = TfidfVectorizer(decode_error='ignore',
                               stop_words=stops,
                               ngram_range=(1, 5),
                               tokenizer=stemmer.stemTokenize)
    trainsetIdfVectorizer = tfidfVec.fit_transform(utterance).toarray()
    # cap the SVD dimension at the feature count
    vLength = trainsetIdfVectorizer.shape[1]
    nDimension = vectorDimension
    if vLength <= vectorDimension:
        nDimension = vLength - 1
    svd = TruncatedSVD(n_components=nDimension,
                       algorithm='randomized',
                       n_iter=iterationNumbers,
                       random_state=42)
    trainLSA = svd.fit_transform(trainsetIdfVectorizer)
    # persist every artefact the prediction side needs
    pickle_path = os.path.join(scriptDir, '..', '..', 'models', 'tfidf',
                               domain + '_' + locale + '_')
    artefacts = {
        'utterance.m': utterance,
        'intent.m': intent,
        'tfidfVec.m': tfidfVec,
        'svd.m': svd,
        'trainLSA.m': trainLSA
    }
    for fileName, artefact in artefacts.items():
        with open(pickle_path + fileName, 'wb') as fileObject:
            pickle.dump(artefact, fileObject)
    log_util.loginfomsg(f'[TRAIN_TFIDF] identified domain: {domain}')
    log_util.loginfomsg(f'[TRAIN_TFIDF] identified locale: {locale}')
    log_util.loginfomsg(
        f'[TRAIN_TFIDF] number of utterances for training: {len(intent)}')
    log_util.loginfomsg(
        f'[TRAIN_TFIDF] number of intents for training: {len(mIntent)}')
    res = {
        "intents": str(len(mIntent)),
        "utterances": str(len(intent)),
        "model": "TFIDF"
    }
    return json.dumps(res)  # serialize to a JSON string
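# predict_tfidf is referenced earlier but not part of this excerpt; a
# minimal sketch of the prediction side under the artefact layout train()
# persists above, assuming cosine similarity in the LSA space picks the
# closest training utterance (names and response fields are illustrative):
import json
import os
import pickle
from sklearn.metrics.pairwise import cosine_similarity

def predict(domain, locale, userUtterance):
    pickle_path = os.path.join(scriptDir, '..', '..', 'models', 'tfidf',
                               domain + '_' + locale + '_')
    with open(pickle_path + 'intent.m', 'rb') as f:
        intent = pickle.load(f)
    with open(pickle_path + 'tfidfVec.m', 'rb') as f:
        tfidfVec = pickle.load(f)
    with open(pickle_path + 'svd.m', 'rb') as f:
        svd = pickle.load(f)
    with open(pickle_path + 'trainLSA.m', 'rb') as f:
        trainLSA = pickle.load(f)
    # project the query into the same LSA space as the training set
    queryLSA = svd.transform(tfidfVec.transform([userUtterance]).toarray())
    scores = cosine_similarity(queryLSA, trainLSA)[0]
    best = scores.argmax()
    return json.dumps({"intent": intent[best], "score": str(scores[best])})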
def train(domain, locale, prop):
    format = nlp_config.getParameter('FORMAT')
    dataFile = os.path.join(scriptDir, '..', '..', '..', 'trainingData',
                            'intents', domain + '_' + locale + '.' + format)
    configFile = os.path.join(scriptDir, '..', '..', 'config',
                              nlp_config.getParameter('CONFIG_FILE'))
    modelFile = os.path.join(scriptDir, '..', '..', 'models', 'nlu')
    MODEL_NAME = domain + '_' + locale
    try:
        if format in ('md', 'json', 'yml'):
            training_data = load_data(dataFile)
            trainer = Trainer(config.load(configFile))
            if not nlp_config.is_config_stale(domain, locale, prop):
                trainer.train(training_data)
                # delete the previously persisted model if it exists; the
                # original concatenated the paths without a separator
                modelDir = os.path.join(modelFile, MODEL_NAME)
                if os.path.exists(modelDir):
                    shutil.rmtree(modelDir)
                trainer.persist(modelFile, fixed_model_name=MODEL_NAME)
            else:
                log_util.loginfomsg(
                    "[TRAIN_NLU] no changes found to training data, using pre-trained model"
                )
        else:
            log_util.logerrormsg("[TRAIN_NLU] unsupported format. Exiting...")
            res = {"intents": "-1", "utterances": "-1"}
            return json.dumps(res)
    except FileNotFoundError:
        log_util.logerrormsg("[TRAIN_NLU] could not locate the NLU config file")
        res = {"intents": "-1", "utterances": "-1"}
        return json.dumps(res)
    # group the parsed examples by intent to report training statistics
    training_examples = OrderedDict()
    INTENT = 'intent'
    for example in [e.as_dict_nlu() for e in training_data.training_examples]:
        intent = example[INTENT]
        training_examples.setdefault(intent, [])
        training_examples[intent].append(example)
    count = sum(len(examples) for examples in training_examples.values())
    log_util.loginfomsg(f'[TRAIN_NLU] identified domain: {domain}')
    log_util.loginfomsg(f'[TRAIN_NLU] identified locale: {locale}')
    log_util.loginfomsg(
        f'[TRAIN_NLU] number of utterances for training: {count}')
    log_util.loginfomsg(
        f'[TRAIN_NLU] number of intents for training: {len(training_examples)}')
    algo = os.path.splitext(nlp_config.getParameter('CONFIG_FILE'))[0]
    algo = algo.split("_")[1].upper()
    model = 'NLU:' + algo
    res = {
        "intents": str(len(training_examples)),
        "utterances": str(count),
        "model": model
    }
    return json.dumps(res)  # serialize to a JSON string
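# predict_nlu is likewise not part of this excerpt; a minimal sketch of
# what it might look like with the classic rasa_nlu API that load_data,
# Trainer, and config above come from (the model path and response fields
# are assumptions):
import json
import os
from rasa_nlu.model import Interpreter

def predict(domain, locale, userUtterance):
    modelDir = os.path.join(scriptDir, '..', '..', 'models', 'nlu',
                            domain + '_' + locale)
    interpreter = Interpreter.load(modelDir)
    result = interpreter.parse(userUtterance)
    return json.dumps({
        "intent": result['intent']['name'],
        "score": str(result['intent']['confidence'])
    })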