Example #1
def process_data(domain: Text, locale: Text) -> Tuple[List[Text], List[Text]]:
    global utterance
    global intent
    # clear the lists before loading
    utterance.clear()
    intent.clear()
    try:
        with codecs.open(
                os.path.join(datapath, domain + '_' + locale + ".md"), 'r',
                'utf-8') as file:
            lines = file.read().split("\n")
        log_util.loginfomsg(
            f"[MARKDOWN] received data, total lines: {len(lines)}")
        for line in lines:
            line = line.strip()
            header = find_section_header(line)
            if header:
                set_current_section(header[0], header[1])
            else:
                parse_item(line)
        return utterance, intent
    except FileNotFoundError:
        msg = (f"no file found for given domain {domain}, "
               "ensure that data is given in .md format.")
        log_util.logerrormsg(f"[MARKDOWN] {msg}")
        raise ValueError(msg)
Example #2
    def predict_query():
        if not (request.args.get('domain')
                and request.args.get('userUtterance')):
            log_util.logerrormsg("[APP] missing parameters")
            abort(404)
        locale = request.args.get('locale', 'en')
        utter = request.args.get('userUtterance')
        if locale == 'en':
            # keep only letters and spaces for English input
            utterance = re.sub(r'[^a-zA-Z ]', '', utter)
        else:
            utterance = utter
        domain = request.args.get('domain')

        if nlp_config.getParameter('ALGORITHM') == 'TFIDF':
            model = 'TFIDF'
        else:
            algo = os.path.splitext(nlp_config.getParameter('CONFIG_FILE'))[0]
            algo = algo.split("_")[1].upper()
            model = 'NLU:' + algo
        res = {
            "messageId": "PREDICT",
            "domain": domain,
            "locale": locale,
            "userUtterance": utterance,
            "model": model,
            "message": predict_model.predict(domain, locale, utterance)
        }
        return make_response(
            jsonify(res), 200,
            {'Content-Type': 'application/json; charset=utf-8'})
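A client-side sketch of calling this handler; the URL rule and port are not visible in the snippet, so the '/predict' route on localhost:5000 is an assumption for illustration only:

import requests

resp = requests.get('http://localhost:5000/predict',  # hypothetical route
                    params={'domain': 'smalltalk',
                            'locale': 'en',
                            'userUtterance': 'hello there'})
print(resp.json()['message'])  # prediction payload from predict_model.predict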
Example #3
def set_current_section(section: Text, title: Text) -> None:
    """Update parsing mode."""
    if section not in available_sections:
        msg = (f"found markdown section {section} which is not in the "
               f"allowed sections {', '.join(available_sections)}.")
        log_util.logerrormsg(f"[MARKDOWN] {msg}")
        raise ValueError(msg)

    global current_section, current_intent
    current_section = section
    current_intent = title
Example #4
def predict(domain, locale, userUtterance):

    response = json.loads('{"response":"ERROR: error during predicting the user utterance"}')

    if not nlp_config.checkDataAvaialble():
        log_util.logerrormsg("[PREDICT_MODEL] no intent data found. Exiting...")
        return json.loads('{"response":"ERROR: no intent data found. Exiting..."}')

    if nlp_config.getParameter('ALGORITHM') == 'TFIDF':
        response = predict_tfidf.predict(domain, locale, userUtterance)
    elif nlp_config.getParameter('ALGORITHM') == 'NLU':
        response = predict_nlu.predict(domain, locale, userUtterance)
    else:
        log_util.logerrormsg("[PREDICT_MODEL] configured algorithm is not supported. Exiting...")
    return response
Example #5
def train(domain, locale):
    response = json.loads(
        '{"response":"ERROR: Error during training the data"}')

    if not nlp_config.checkDataAvaialble():
        log_util.logerrormsg("[TRAIN_MODEL] no intent data found. Exiting...")
        return response

    if nlp_config.getParameter('ALGORITHM') == 'TFIDF':
        response = train_tfidf.train(domain, locale,
                                     nlp_config.getProperties())
    elif nlp_config.getParameter('ALGORITHM') == 'NLU':
        response = train_nlu.train(domain, locale, nlp_config.getProperties())
    else:
        log_util.logerrormsg(
            "[TRAIN_MODEL] configured algorithm is not supported. Exiting...")
    return response
Example #6
    def trainDomain():
        if not request.args.get('domain'):
            log_util.logerrormsg("[APP] missing domain parameter")
            abort(404)
        locale = request.args.get('locale', 'en')
        domain = request.args.get('domain')
        res = train_model.train(domain, locale)
        n = int(json.loads(res)["utterances"])

        if nlp_config.getParameter('ALGORITHM') == 'TFIDF':
            md = 'TFIDF'
        else:
            algo = os.path.splitext(nlp_config.getParameter('CONFIG_FILE'))[0]
            algo = algo.split("_")[1].upper()
            md = 'NLU:' + algo

        # TRAIN_SUCCESS when at least one utterance was trained on
        response = {
            "messageId": "TRAIN_SUCCESS" if n > 0 else "TRAIN_FAIL",
            "domain": domain,
            "locale": locale,
            "message": res,
            "model": md
        }

        return make_response(
            jsonify(response), 200,
            {'Content-Type': 'application/json; charset=utf-8'})
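The matching client-side sketch for training; as with the prediction endpoint, the '/train' route is an assumption, since the decorator binding this handler is not shown:

import requests

resp = requests.get('http://localhost:5000/train',  # hypothetical route
                    params={'domain': 'smalltalk', 'locale': 'en'})
print(resp.json()['messageId'])  # TRAIN_SUCCESS or TRAIN_FAIL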
Example #7
def train(domain, locale, prop):
    datapath = os.path.join(scriptDir, '..', '..', '..', 'trainingData',
                            'intents')
    vectorDimension = int(nlp_config.getParameter('VECTOR_DIMENSION'))
    iterationNumbers = int(nlp_config.getParameter('ITERATION_NUMBER'))
    format = nlp_config.getParameter('FORMAT')

    utterance = []
    intent = []

    if format == 'md':
        utterance, intent = process_data(domain, locale)
        if not utterance or not intent:
            log_util.logerrormsg(
                "[TRAIN_TFIDF] could not parse the markdown data. Exiting...")
            res = {"intents": "-1", "utterances": "-1"}
            response = str(res).replace("'", '"').strip()
            return response
    elif format == 'json':
        # datapath is already anchored at scriptDir above
        fileData = os.path.join(datapath, domain + '_' + locale + '.json')
        with codecs.open(fileData, 'r', 'utf-8') as dataFile:
            data = json.load(dataFile)
        for nameUtterances in data['tasks']:
            for utt in nameUtterances['utterances']:
                utterance.append(utt)
                intent.append(nameUtterances['name'])
    else:
        log_util.logerrormsg("unsupported format. Exiting...")
        res = {"intents": "-1", "utterances": "-1"}
        response = str(res).replace("'", '"').strip()
        return response

    mIntent = set(intent)

    # reuse the pre-trained model when config and training data are unchanged
    if nlp_config.is_config_stale(domain, locale, prop):
        log_util.loginfomsg(
            "[TRAIN_TFIDF] no changes found to training data, using pre-trained model"
        )
        res = {"intents": str(len(mIntent)), "utterances": str(len(intent))}
        return json.dumps(res)

    stopListFile = os.path.join(scriptDir, '..', '..', 'dictionary',
                                'stopwords_' + locale + '.txt')
    arrayWords = []
    stopWords = []

    with codecs.open(stopListFile, 'r', 'utf-8') as f:
        lines = f.read().split("\n")
    for line in lines:
        if line != "":
            arrayWords.append(line.split(','))

    for a_word in arrayWords:
        for s_word in a_word:
            if (re.sub(' ', '', s_word)) != "":
                stopWords.append(s_word)

    extraStopWords = set(stopWords)
    # NLTK ships stopword corpora for these locales; Hindi (hi) and
    # Marathi (mr) have none, so only the custom stop list is used for them
    nltkCorpora = {'ar': 'arabic', 'da': 'danish', 'en': 'english',
                   'es': 'spanish', 'nl': 'dutch', 'sv': 'swedish'}
    if locale in nltkCorpora:
        stops = set(stopwords.words(nltkCorpora[locale])) | extraStopWords
    elif locale in ('hi', 'mr'):
        stops = extraStopWords
    else:
        res = {"intents": "0", "utterances": "0"}
        return json.dumps(res)

    stemmer.setLocale(locale)

    # note: the utterance list must not be passed to the constructor (it would
    # be bound to the 'input' parameter); it goes to fit_transform instead
    tfidfVec = TfidfVectorizer(decode_error='ignore',
                               stop_words=stops,
                               ngram_range=(1, 5),
                               tokenizer=stemmer.stemTokenize)
    trainsetIdfVectorizer = tfidfVec.fit_transform(utterance).toarray()
    vLength = trainsetIdfVectorizer.shape[1]
    nDimension = vectorDimension
    if vLength <= vectorDimension:
        nDimension = vLength - 1

    svd = TruncatedSVD(n_components=nDimension,
                       algorithm='randomized',
                       n_iter=iterationNumbers,
                       random_state=42)
    trainLSA = svd.fit_transform(trainsetIdfVectorizer)

    pickle_path = os.path.join(scriptDir, '..', '..', 'models', 'tfidf',
                               domain + '_' + locale + '_')
    # persist every artifact needed again at prediction time
    artifacts = {'utterance.m': utterance, 'intent.m': intent,
                 'tfidfVec.m': tfidfVec, 'svd.m': svd, 'trainLSA.m': trainLSA}
    for suffix, artifact in artifacts.items():
        with open(pickle_path + suffix, 'wb') as fileObject:
            pickle.dump(artifact, fileObject)

    log_util.loginfomsg(f'[TRAIN_TFIDF] identified domain: {domain}')
    log_util.loginfomsg(f'[TRAIN_TFIDF] identified locale: {locale}')
    log_util.loginfomsg(
        f'[TRAIN_TFIDF] number of utterances for training: {len(intent)}')
    log_util.loginfomsg(
        f'[TRAIN_TFIDF] number of intents for training: {len(mIntent)}')

    res = {
        "intents": str(len(mIntent)),
        "utterances": str(len(intent)),
        "model": "TFIDF"
    }
    return json.dumps(res)  # serialize to a JSON string
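For context, a minimal sketch of the prediction step these pickles enable; predict_tfidf's real implementation is not shown here, so this only illustrates how the persisted artifacts fit together (pickle_path built as in the training code above):

import pickle
from sklearn.metrics.pairwise import cosine_similarity

def load_artifact(suffix):
    with open(pickle_path + suffix, 'rb') as f:
        return pickle.load(f)

tfidfVec = load_artifact('tfidfVec.m')
svd = load_artifact('svd.m')
trainLSA = load_artifact('trainLSA.m')
intent = load_artifact('intent.m')

# project the query into the same LSA space and take the closest training row
queryLSA = svd.transform(tfidfVec.transform(['will it rain tomorrow']))
scores = cosine_similarity(queryLSA, trainLSA)[0]
print(intent[scores.argmax()], scores.max())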
Example #8
def train(domain, locale, prop):
    format = nlp_config.getParameter('FORMAT')
    dataFile = os.path.join(scriptDir, '..', '..', '..', 'trainingData',
                            'intents', domain + '_' + locale + '.' + format)
    configFile = os.path.join(scriptDir, '..', '..', 'config',
                              nlp_config.getParameter('CONFIG_FILE'))
    modelFile = os.path.join(scriptDir, '..', '..', 'models', 'nlu')
    MODEL_NAME = domain + '_' + locale

    try:
        if format in ('md', 'json', 'yml'):
            training_data = load_data(dataFile)
            trainer = Trainer(config.load(configFile))

            if not nlp_config.is_config_stale(domain, locale, prop):
                trainer.train(training_data)
                # delete the model folder if it already exists
                if os.path.exists(os.path.join(modelFile, MODEL_NAME)):
                    shutil.rmtree(os.path.join(modelFile, MODEL_NAME))
                trainer.persist(modelFile, fixed_model_name=MODEL_NAME)
            else:
                log_util.loginfomsg(
                    "[TRAIN_NLU] no changes found to training data, using pre-trained model"
                )
        else:
            log_util.logerrormsg("[TRAIN_NLU] unsupported format. Exiting...")
            res = {"intents": "-1", "utterances": "-1"}
            response = str(res).replace("'", '"').strip()
            return response
    except FileNotFoundError:
        log_util.logerrormsg(
            "[TRAIN_NLU] could not locate the NLU config file")
        res = {"intents": "-1", "utterances": "-1"}
        response = str(res).replace("'", '"').strip()
        return response

    training_examples = OrderedDict()
    INTENT = 'intent'
    for example in [e.as_dict_nlu() for e in training_data.training_examples]:
        intent = example[INTENT]
        training_examples.setdefault(intent, []).append(example)
    # total number of training utterances across all intents
    count = sum(len(examples) for examples in training_examples.values())

    log_util.loginfomsg(f'[TRAIN_NLU] identified domain: {domain}')
    log_util.loginfomsg(f'[TRAIN_NLU] identified locale: {locale}')
    log_util.loginfomsg(
        f'[TRAIN_NLU] number of utterances for training: {count}')
    log_util.loginfomsg(
        f'[TRAIN_NLU] number of intents for training: {len(training_examples)}'
    )

    algo = os.path.splitext(nlp_config.getParameter('CONFIG_FILE'))[0]
    algo = algo.split("_")[1].upper()
    model = 'NLU:' + algo

    res = {
        "intents": str(len(training_examples)),
        "utterances": str(count),
        "model": model
    }
    return json.dumps(res)  # serialize to a JSON string
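A minimal sketch of loading the persisted model for inference, assuming the legacy rasa_nlu 0.x API implied by Trainer, load_data, and config above (the exact directory layout under modelFile depends on the rasa_nlu version):

from rasa_nlu.model import Interpreter

model_dir = os.path.join(modelFile, MODEL_NAME)  # layout assumption
interpreter = Interpreter.load(model_dir)
result = interpreter.parse('hello there')
print(result['intent']['name'], result['intent']['confidence'])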