def predict(domain, locale, userUtterance):
    """Route a prediction request to the configured algorithm backend.

    Args:
        domain: Domain identifier, forwarded unchanged to the backend.
        locale: Locale identifier, forwarded unchanged to the backend.
        userUtterance: Utterance text to classify.

    Returns:
        The selected backend's ``predict`` result, or a dict of the form
        ``{"response": "ERROR: ..."}`` when no backend handled the request.
    """
    # NOTE(review): ``checkDataAvaialble`` is read as an attribute, not
    # called. If it is a function this guard can never fire -- confirm.
    if not nlp_config.checkDataAvaialble:
        log_util.logerrormsg("[PREDICT_MODEL] no intent data found. Exiting...")
        return json.loads('{"response":"ERROR: no intent data found. Exiting..."}')

    if nlp_config.getParameter('ALGORITHM') == 'TFIDF':
        return predict_tfidf.predict(domain, locale, userUtterance)
    if nlp_config.getParameter('ALGORITHM') == 'NLU':
        return predict_nlu.predict(domain, locale, userUtterance)

    # Neither known algorithm is configured: report the generic failure.
    log_util.logerrormsg("[PREDICT_MODEL] configured algorithm is not supported. Exiting...")
    return json.loads('{"response":"ERROR: error during predicting the user utterance"}')
def train(domain, locale):
    """Route a training request to the configured algorithm backend.

    Args:
        domain: Domain identifier, forwarded unchanged to the backend.
        locale: Locale identifier, forwarded unchanged to the backend.

    Returns:
        The selected backend's ``train`` result. When no intent data is
        available or the configured algorithm is unknown, the dict
        ``{"response": "ERROR: Error during training the data"}`` is returned.
    """
    failure = json.loads(
        '{"response":"ERROR: Error during training the data"}')

    # NOTE(review): ``checkDataAvaialble`` is read as an attribute, not
    # called. If it is a function this guard can never fire -- confirm.
    if not nlp_config.checkDataAvaialble:
        log_util.logerrormsg("[TRAIN_MODEL] no intent data found. Exiting...")
        return failure

    if nlp_config.getParameter('ALGORITHM') == 'TFIDF':
        return train_tfidf.train(domain, locale, nlp_config.getProperties())
    if nlp_config.getParameter('ALGORITHM') == 'NLU':
        return train_nlu.train(domain, locale, nlp_config.getProperties())

    log_util.logerrormsg(
        "[TRAIN_MODEL] configured algorithm is not supported. Exiting...")
    return failure
def sendMessgae(topicName, key, value):
    """Publish a JSON message through the module-level ``producer``.

    Args:
        topicName: Topic passed to ``producer.send``.
        key: Message key (str). It selects the partition and is sent
            UTF-8 encoded.
        value: JSON text. It is parsed first and the parsed object is what
            gets sent.
    """
    partition_count = int(nlp_config.getParameter('PARTITIONS'))
    partition = utils.getPartition(key, partition_count)
    payload = json.loads(value)
    encoded_key = key.encode('utf-8')

    producer.send(topicName, value=payload, key=encoded_key,
                  partition=partition)
    # Flush straight away so the message is not left sitting in a buffer.
    producer.flush()

    log_util.loginfomsg('[PRODUCER] sending message: "{}"'.format(value))
    log_util.loginfomsg(
        '[PRODUCER] message sent with key: "{}" to partition "{}"!'.format(
            key, partition))
def trainDomain():
    """Train the model for the requested domain/locale and report the outcome.

    Query parameters:
        domain: Required. The request is aborted with 404 when it is missing
            or empty.
        locale: Optional. Defaults to ``'en'``.

    Returns:
        A 200 JSON response with ``messageId`` (``TRAIN_SUCCESS`` when the
        reported utterance count is positive, ``TRAIN_FAIL`` otherwise),
        ``domain``, ``locale``, ``message`` (the raw training result) and
        ``model``.
    """
    domain = request.args.get('domain')
    if not domain:
        log_util.logerrormsg("[APP] missing domain parameter")
        abort(404)
    locale = request.args.get('locale') or 'en'

    res = train_model.train(domain, locale)

    # The training result is normally JSON text, but a failure may come back
    # as an already-parsed dict with no "utterances" entry. Treat anything
    # that cannot be read as a count as a failed run instead of crashing.
    try:
        parsed = json.loads(res) if isinstance(res, (str, bytes, bytearray)) else res
        n = int(parsed["utterances"])
    except (TypeError, ValueError, KeyError):
        n = -1

    if nlp_config.getParameter('ALGORITHM') == 'TFIDF':
        md = 'TFIDF'
    else:
        # Model label is derived from the config file name: the second
        # underscore-separated token of the stem, upper-cased.
        algo = os.path.splitext(nlp_config.getParameter('CONFIG_FILE'))[0]
        algo = algo.split("_")[1].upper()
        md = 'NLU:' + algo

    response = {
        "messageId": "TRAIN_SUCCESS" if n > 0 else "TRAIN_FAIL",
        "domain": domain,
        "locale": locale,
        "message": res,
        "model": md,
    }
    return make_response(
        jsonify(response), 200,
        {'Content-Type': 'application/json; charset=utf-8'})
# Locale -> NLTK stop-word corpus name. ``None`` marks locales that only use
# the project's own stop-word file.
_TFIDF_STOPWORD_CORPORA = {
    'ar': 'arabic',
    'da': 'danish',
    'en': 'english',
    'es': 'spanish',
    'hi': None,
    'mr': None,
    'nl': 'dutch',
    'sv': 'swedish',
}


def _tfidf_result(intents, utterances, model=None):
    """Serialise a training summary into the JSON text returned by ``train``."""
    res = {"intents": str(intents), "utterances": str(utterances)}
    if model is not None:
        res["model"] = model
    return json.dumps(res)


def _tfidf_read_stop_words(path):
    """Return the set of non-blank comma-separated entries in a stop-word file."""
    with codecs.open(path, 'r', 'utf-8') as handle:
        lines = handle.read().split("\n")
    return {
        word
        for line in lines
        for word in line.split(',')
        if word.replace(' ', '') != ""
    }


def _tfidf_dump(obj, path):
    """Pickle ``obj`` to ``path``, closing the file even if dumping fails."""
    with open(path, 'wb') as handle:
        pickle.dump(obj, handle)


def train(domain, locale, prop):
    """Fit the TF-IDF + truncated-SVD model for one domain/locale and persist it.

    Args:
        domain: Domain name; part of the training-data and model file names.
        locale: Locale code; selects the stop-word list and the stemmer locale.
        prop: Passed through to ``nlp_config.is_config_stale``.

    Returns:
        JSON text with string-valued ``intents`` and ``utterances`` counts,
        plus ``model`` after a full training run. The counts are ``"-1"``
        when the training data cannot be used (unparsable markdown or
        unsupported format) and ``"0"`` for an unsupported locale.
    """
    datapath = os.path.join(scriptDir, '..', '..', '..', 'trainingData',
                            'intents')
    vectorDimension = int(nlp_config.getParameter('VECTOR_DIMENSION'))
    iterationNumbers = int(nlp_config.getParameter('ITERATION_NUMBER'))
    data_format = nlp_config.getParameter('FORMAT')
    utterance = []
    intent = []

    if data_format == 'md':
        utterance, intent = process_data(domain, locale)
        if not utterance or not intent:
            log_util.logerrormsg(
                "[TRAIN_TFIDF] could not parse the markdown data. Exiting...")
            return _tfidf_result(-1, -1)
    elif data_format == 'json':
        # ``datapath`` already starts at scriptDir; joining scriptDir again
        # would produce a wrong path whenever scriptDir is relative.
        fileData = os.path.join(datapath, domain + '_' + locale + '.json')
        with codecs.open(fileData, 'r', 'utf-8') as dataFile:
            data = json.load(dataFile)
        for task in data['tasks']:
            for utt in task['utterances']:
                utterance.append(utt)
                intent.append(task['name'])
    else:
        log_util.logerrormsg("unsupported format. Exiting...")
        return _tfidf_result(-1, -1)

    mIntent = set(intent)

    # NOTE(review): a truthy ``is_config_stale`` is treated here as "nothing
    # changed, keep the existing model" -- confirm against nlp_config.
    if nlp_config.is_config_stale(domain, locale, prop):
        log_util.loginfomsg(
            "[TRAIN_TFIDF] no changes found to training data, using pre-trained model"
        )
        return _tfidf_result(len(mIntent), len(intent))

    stopListFile = os.path.join(scriptDir, '..', '..', 'dictionary',
                                'stopwords_' + locale + '.txt')
    extraStopWords = _tfidf_read_stop_words(stopListFile)

    if locale not in _TFIDF_STOPWORD_CORPORA:
        return _tfidf_result(0, 0)
    corpus = _TFIDF_STOPWORD_CORPORA[locale]
    if corpus is None:
        stops = extraStopWords
    else:
        stops = set(stopwords.words(corpus)) | extraStopWords

    stemmer.setLocale(locale)
    # Constructor arguments are keyword-only by design: the utterances go to
    # fit_transform, not into the ``input`` option. ``stop_words`` is passed
    # as a list, which is the documented type.
    tfidfVec = TfidfVectorizer(decode_error='ignore',
                               stop_words=sorted(stops),
                               ngram_range=(1, 5),
                               tokenizer=stemmer.stemTokenize)
    trainsetIdfVectorizer = tfidfVec.fit_transform(utterance).toarray()

    # Feature count comes from the matrix shape so that a single-utterance
    # training set does not index past the end of the array.
    vLength = trainsetIdfVectorizer.shape[1]
    nDimension = vectorDimension
    if vLength <= vectorDimension:
        # Stay below the feature count, but never ask for zero components.
        nDimension = max(1, vLength - 1)

    svd = TruncatedSVD(n_components=nDimension,
                       algorithm='randomized',
                       n_iter=iterationNumbers,
                       random_state=42)
    trainLSA = svd.fit_transform(trainsetIdfVectorizer)

    pickle_path = os.path.join(scriptDir, '..', '..', 'models', 'tfidf',
                               domain + '_' + locale + '_')
    artefacts = (
        ('utterance.m', utterance),
        ('intent.m', intent),
        ('tfidfVec.m', tfidfVec),
        ('svd.m', svd),
        ('trainLSA.m', trainLSA),
    )
    for suffix, obj in artefacts:
        _tfidf_dump(obj, pickle_path + suffix)

    log_util.loginfomsg(f'[TRAIN_TFIDF] identified domain: {domain}')
    log_util.loginfomsg(f'[TRAIN_TFIDF] identified locale: {locale}')
    log_util.loginfomsg(
        f'[TRAIN_TFIDF] number of utterances for training: {len(intent)}')
    log_util.loginfomsg(
        f'[TRAIN_TFIDF] number of intents for training: {len(mIntent)}')

    return _tfidf_result(len(mIntent), len(intent), model="TFIDF")
def train(domain, locale, prop):
    """Train and persist the NLU model for one domain/locale.

    Args:
        domain: Domain name; part of the training-data file and model names.
        locale: Locale code; part of the training-data file and model names.
        prop: Passed through to ``nlp_config.is_config_stale``.

    Returns:
        JSON text with string-valued ``intents`` and ``utterances`` counts and
        a ``model`` label. Both counts are ``"-1"`` (and ``model`` is absent)
        when the format is unsupported or a required file is missing.
    """
    import json  # local import: this block did not previously rely on json

    data_format = nlp_config.getParameter('FORMAT')
    dataFile = os.path.join(scriptDir, '..', '..', '..', 'trainingData',
                            'intents',
                            domain + '_' + locale + '.' + data_format)
    configFile = os.path.join(scriptDir, '..', '..', 'config',
                              nlp_config.getParameter('CONFIG_FILE'))
    modelFile = os.path.join(scriptDir, '..', '..', 'models', 'nlu')
    MODEL_NAME = domain + '_' + locale
    failure = json.dumps({"intents": "-1", "utterances": "-1"})

    try:
        if data_format not in ('md', 'json', 'yml'):
            log_util.logerrormsg("[TRAIN_NLU] unsupported format. Exiting...")
            return failure

        training_data = load_data(dataFile)
        trainer = Trainer(config.load(configFile))
        # NOTE(review): training runs only when ``is_config_stale`` is falsy;
        # a truthy value is treated as "nothing changed" -- confirm.
        if not nlp_config.is_config_stale(domain, locale, prop):
            trainer.train(training_data)
            # Delete the previous model folder if it exists. The path must be
            # joined with a separator, otherwise the folder is never found.
            model_dir = os.path.join(modelFile, MODEL_NAME)
            if os.path.exists(model_dir):
                shutil.rmtree(model_dir)
            trainer.persist(modelFile, fixed_model_name=MODEL_NAME)
        else:
            log_util.loginfomsg(
                "[TRAIN_NLU] no changes found to training data, using pre-trained model"
            )
    except FileNotFoundError:
        log_util.logerrormsg(
            "[TRAIN_NLU] could not locate the NLU config file")
        return failure

    # Group the examples by intent to report per-run statistics.
    training_examples = OrderedDict()
    for example in [e.as_dict_nlu() for e in training_data.training_examples]:
        training_examples.setdefault(example['intent'], []).append(example)
    count = sum(len(group) for group in training_examples.values())

    log_util.loginfomsg(f'[TRAIN_NLU] identified domain: {domain}')
    log_util.loginfomsg(f'[TRAIN_NLU] identified locale: {locale}')
    log_util.loginfomsg(
        f'[TRAIN_NLU] number of utterances for training: {count}')
    log_util.loginfomsg(
        f'[TRAIN_NLU] number of intents for training: {len(training_examples)}'
    )

    # Model label is derived from the config file name: the second
    # underscore-separated token of the stem, upper-cased.
    algo = os.path.splitext(nlp_config.getParameter('CONFIG_FILE'))[0]
    algo = algo.split("_")[1].upper()

    return json.dumps({
        "intents": str(len(training_examples)),
        "utterances": str(count),
        "model": 'NLU:' + algo,
    })
from utils import nlp_config
from utils import log_util
from core import train_model, predict_model
from pubsub import consumer
from pubsub import processMessage
from pubsub import producer as pr

scriptDir = os.path.dirname(__file__)
# ignore all warnings
simplefilter(action='ignore')
# load all the config parameters
nlp_config.loadParameters()

# Compare the flag as text. Using the configured value as a regex pattern
# against the literal 'true' would switch broker mode on for an empty value
# or for any fragment of the word.
_use_broker = str(
    nlp_config.getParameter('USE_BROKER')).strip().lower() == 'true'

if _use_broker:
    log_util.loginfomsg("[APP] broker based NLPEngine enabled")
    # initialise the producer
    pr.initialise()
    # Run the consumer listener on the BOT_TO_NLP topic; every incoming
    # message is handed to its own worker thread.
    consumer_ = consumer.initialise(
        nlp_config.getParameter('TOPIC_BOT_TO_NLP'))
    for msg in consumer_:
        log_util.loginfomsg(msg)
        t = threading.Thread(target=processMessage.process, args=(msg, ))
        t.start()
else:
    log_util.loginfomsg("[APP] REST API based NLPEngine enabled")
    app = flask.Flask(__name__)
    SERVER_HOST = '0.0.0.0'
def process(message):
    """Handle one incoming broker message and publish the reply.

    Messages whose key parses to 0 and contains ``DUMMY`` are control
    messages: ``messageId`` ``TRAIN`` triggers training and ``PREDICT``
    triggers a prediction. Any other key is handled as a prediction request.
    Replies are sent to the ``TOPIC_NLP_TO_BOT`` topic under the same key.

    Args:
        message: Consumed record exposing ``key`` (bytes) and ``value``
            (mapping with ``domain``, ``locale`` and, for predictions,
            ``userUtterance``).
    """
    key = message.key.decode('utf-8')
    payload = message.value
    log_util.loginfomsg('[PROCESS_MESSAGE]: message received with key: ' +
                        key + ' message: ' + str(payload))

    def reply(body):
        # Every answer goes back on the NLP -> BOT topic with the same key.
        producer.sendMessgae(nlp_config.getParameter('TOPIC_NLP_TO_BOT'),
                             key, json.dumps(body))

    def predict_and_reply(log_template):
        domain = payload['domain']
        locale = payload['locale']
        utterance = payload['userUtterance']
        log_util.loginfomsg(log_template.format(utterance, domain, locale))
        result = predict_model.predict(domain, locale, utterance)
        reply({
            "messageId": "PREDICT",
            "domain": domain,
            "locale": locale,
            "userUtterance": utterance,
            "message": result,
        })

    # NOTE(review): the final branch is taken to pair with this key check
    # (ordinary keys are prediction requests) -- confirm against the
    # original layout.
    if utils.parseKey(key) == 0 and 'DUMMY' in key:
        message_id = payload['messageId'] if 'messageId' in payload else None
        if message_id == 'TRAIN':
            domain = payload['domain']
            locale = payload['locale']
            log_util.loginfomsg(
                '[INTENT_ENGINE] training the NLP for domain:{} and locale:{}'.
                format(domain, locale))
            res = train_model.train(domain, locale)

            # The training result is normally JSON text, but a failure may
            # come back as an already-parsed dict with no "utterances" entry.
            # Report TRAIN_FAIL in that case instead of crashing the worker.
            try:
                parsed = (json.loads(res)
                          if isinstance(res, (str, bytes, bytearray)) else res)
                n = int(parsed["utterances"])
            except (TypeError, ValueError, KeyError):
                n = -1

            reply({
                "messageId": "TRAIN_SUCCESS" if n > 0 else "TRAIN_FAIL",
                "domain": domain,
                "locale": locale,
                "message": res,
            })
        elif message_id == 'PREDICT':
            predict_and_reply(
                '[PROCESS_MESSAGE] predicting the utterance:{} for domain:{} and locale:{}'
            )
    else:
        predict_and_reply(
            '[PROCESS_MESSAGE] processing the utterance:{} for domain:{} and locale:{}'
        )