def test_cleaning(self):
    """Cleaning lowercases text and drops punctuation, yielding token lists."""
    raw = [
        "I am a sentence.",
        "you   are ?? a sentence",
        "WE ARE THE SENTENCES!!!",
    ]
    expected = [
        ["i", "am", "a", "sentence"],
        ["you", "are", "a", "sentence"],
        ["we", "are", "the", "sentences"],
    ]
    self.assertEqual(IntentClassifier().cleaning(raw), expected)
    def test_fit(self):
        """fit() must reject malformed arguments and accept valid ones.

        For every backend: a bare string for X or a bare int for y must
        raise ValueError; equal-length lists and numpy arrays must be
        accepted.  The original version wrapped both invalid calls in one
        try-block (so the second was never reached once the first raised)
        and signalled failure with a bare ``raise ValueError`` — replaced
        with one assertRaises per invalid call.
        """
        classifiers = [
            IntentClassifier('log_reg'),
            IntentClassifier('perceptron'),
            IntentClassifier('use'),
        ]
        for clf in classifiers:
            # Each invalid argument shape is checked independently.
            with self.assertRaises(ValueError):
                clf.fit('aaa', [1])
            with self.assertRaises(ValueError):
                clf.fit(['aaa'], 1)

            # Valid inputs: python lists and numpy arrays of equal length.
            clf.fit(['aaa'], [1])
            clf.fit(['aaa', 'sdfsd', 'sdfsdf'], [1, 2, 3])
            clf.fit(np.array(['aaa', 'sdfsd', 'sdfsdf']), np.array([1, 2, 3]))
Exemple #3
0
    def initialize(self):
        """Load (or bootstrap) the intent dataset, then build and train the model.

        If the pickled data file exists it is reloaded; otherwise the raw
        CSV "raw_intents.csv" is parsed into the internal record list.  In
        both cases a fresh IntentClassifier is compiled, trained on the
        whole dataset as one batch, and intent embeddings are computed.
        """
        RELOAD_DATA = os.path.exists(self.data_file_path)
        if RELOAD_DATA:
            logger.info("Reloading data from file {}".format(
                self.data_file_path))
            # NOTE(review): pickle.load is unsafe on untrusted files —
            # confirm this pickle is only ever produced locally.
            with open(self.data_file_path, 'rb') as f:
                self.data = pickle.load(f)
                self.update_intents()
        else:
            self.data = []
            with open("raw_intents.csv", 'r') as f:
                rows = f.readlines()
                for row in rows:
                    # NOTE(review): readlines() keeps the trailing '\n', so
                    # this check never fires; blank lines fall through to
                    # row.split(",") below — consider `if not row.strip()`.
                    if len(row) == 0:
                        continue
                    # Skip comment lines.
                    if row.startswith("#"):
                        continue
                    # Each row is "query,intent" (breaks if the query itself
                    # contains a comma — TODO confirm input format).
                    query, intent = row.split(",")
                    self.data.append({
                        'raw': {
                            'query': query.lower().strip(),
                            'intent': intent.lower().strip()
                        },
                        'nlu': {}
                    })
            self.update_intents()
            self.tokenize_data()
        print("Known intents:")
        print(self.known_intents)

        # Build the classifier and train on the full dataset as one batch.
        self.model = IntentClassifier(n_intents=self.n_intents)
        self.model.compile(optimizer='adam',
                           loss='categorical_crossentropy',
                           metrics=['categorical_accuracy'])
        X, y = self.make_training_dataset(self.data)
        print(X["input_ids"].shape, X['attention_masks'].shape, y.shape)
        self.model.train_on_batch(X, y)
        self.model.summary()
        self.make_intents_embeddings()
Exemple #4
0
class TestIntentClassifier(unittest.TestCase):
    """Test-suite skeleton for IntentClassifier; most cases are still stubs."""

    # Shared classifier instance, built once when the class body executes.
    ic_model = IntentClassifier()

    def test_load_dataset(self):
        # TODO: implement.
        pass

    def test_cleaning(self):
        # TODO: implement.
        pass

    def test_create_tokenizer(self):
        # TODO: implement.
        pass

    def test_get_max_length(self):
        # TODO: implement.
        pass

    def test_encoding_doc(self):
        # TODO: implement.
        pass

    def test_padding_doc(self):
        # TODO: implement.
        pass

    def test_one_hot(self):
        # TODO: implement.
        pass

    def test_create_model(self):
        # TODO: implement.
        pass

    def train_model(self):
        # NOTE(review): no "test_" prefix, so unittest never collects this
        # method — rename to test_train_model once implemented.
        pass

    def test_predictions(self):
        # TODO: implement.
        pass

    def test_tests(self):
        # Sanity check that the test harness itself runs.
        self.assertEqual(2, 2, "Should be 2")
 def test_creation(self):
     """Constructing a classifier for each supported backend must not raise."""
     for backend in ('log_reg', 'perceptron'):
         IntentClassifier(backend)
# NOTE(review): gevent.wsgi was renamed gevent.pywsgi in modern gevent —
# confirm the pinned gevent version still ships this module path.
from gevent.wsgi import WSGIServer
# Fix: Flask, request and jsonify were used below but never imported here.
from flask import Flask, request, jsonify
import pandas
import os

from intent_classifier import IntentClassifier

## Train Data Set File
DATA_PATH = "./data/train_set.csv"


app = Flask(__name__)

# Read the Train Set
data = pandas.read_csv(DATA_PATH, encoding='utf-8',
                       names=['intent', 'utterance'], header=1)

intent_classifier = IntentClassifier()
X = data['utterance']
y = data['intent']
# Train the Engine
intent_classifier.fit(X, y, cv=None)


@app.route('/predict', methods=['POST'])
def predict():
    """POST /predict: classify request.json['input'] and return the result."""
    # Renamed local from `input` to avoid shadowing the builtin.
    payload = request.json['input']
    results = intent_classifier.predict(payload)
    return jsonify(results)


if __name__ == '__main__':
    # Serve on PORT from the environment (default 5000), all interfaces.
    server = WSGIServer(('', int(os.environ.get('PORT', 5000))), app)
    server.serve_forever()
Exemple #7
0
"""
import flask
import logging
from intent_classifier import IntentClassifier
from decouple import config
# Key under which the shared classifier instance lives in app.config.
CLASSIFIER = "CLASSIFIER"
# Root logging config: level, timestamp with milliseconds, a fixed service
# tag, then the caller's file/function/line.
logging.basicConfig(
    format='%(levelname)s: [%(asctime)s.%(msecs)03d] {} %(name)s '
    '%(filename)s:%(funcName)s:%(lineno)s: %(message)s'.format(
        "CLASSIFIER_WEB_SERVICE"),
    datefmt='%Y-%m-%d %H:%M:%S',
    level="DEBUG")
log = logging.getLogger(__name__)

app = flask.Flask(__name__)
# One classifier instance shared by all request handlers.
app.config[CLASSIFIER] = IntentClassifier()

def get_settings():
    """Populate app.config from the IC_FLASK_* environment variables."""
    cfg = app.config
    cfg["HOST_ADDRESS"] = config('IC_FLASK_ADDRESS',
                                 default="0.0.0.0",
                                 cast=str)
    cfg["HOST_PORT"] = config('IC_FLASK_PORT', default="8585", cast=int)
    cfg["DEBUG"] = config('IC_FLASK_DEBUG', default=False, cast=bool)


@app.route("/get_event/<query>", methods=["GET"])
def get_event(query):
    log.info("get_event: IN")
    if not query:
        log.warning("get_event: empty query!")
#!/usr/bin/env python
# -*- coding: utf-8 -*-

from intent_classifier import IntentClassifier

# Smoke-test: classify one Russian utterance
# ("Waiting for the operator's reply").
model = IntentClassifier()
sentence = "Жду ответа оператора"
cls = model.predict(sentence)
Exemple #9
0
#!/usr/bin/env python

from flask import Flask, request, jsonify
from intent_classifier import IntentClassifier

app = Flask(__name__)
# Classifier backed by a TSV dataset and pre-trained GloVe 6B 100-d vectors.
intent_classifier = IntentClassifier(
    path_to_datafile='./data/data.tsv',
    path_to_embedding='./data/glove.6B.100d.txt'
)

@app.route("/get_intent", methods=['POST'])
def respond():
    """POST /get_intent: classify request.json['text'], return intent+score."""
    payload = request.json
    intent, score = _get_intent(payload['text'])
    return jsonify({'intent': intent, 'score': score})


def _get_intent(text):
    """Classify an utterance with the module-level classifier.

    Args:
        text: input utterance

    Returns:
        The best-matching intent (via knn) and its score.
    """
    all_scores = intent_classifier.get_scores(text)
    best_intent, best_score = intent_classifier.knn(text)
    # Debug trace of the full score vector alongside the winner.
    print(all_scores, best_intent, best_score)
    return best_intent, best_score
Exemple #10
0
class Engine:
    """Interactive intent-recognition engine.

    Holds the labelled (query, intent) dataset, a Tokenizer-based encoder
    and an IntentClassifier model; persists the dataset with pickle and can
    learn new intents interactively via loop().
    """

    def __init__(self, data_file_path="intents_db.pkl"):
        # Records of the form {'raw': {'query', 'intent'}, 'nlu': {...}}.
        self.data = []
        # Sorted list of distinct intent names.
        self.known_intents = []
        self.n_intents = 0
        # Mapping intent name -> integer class label.
        self.intents_labels = {}
        # Mapping intent name -> embedding vector (see
        # make_intents_embeddings()).
        self.intents_embeddings = {}
        # Pickle file used to persist self.data between runs.
        self.data_file_path = data_file_path
        # self.model_file_path = "intent_classifier.h5"
        self.encoder = Tokenizer()
        # Built lazily in initialize().
        self.model = None

    def initialize(self):
        """Load (or bootstrap) the dataset, then build and train the model."""
        RELOAD_DATA = os.path.exists(self.data_file_path)
        if RELOAD_DATA:
            logger.info("Reloading data from file {}".format(
                self.data_file_path))
            # NOTE(review): pickle.load is unsafe on untrusted files —
            # confirm this pickle is only ever written by write_out().
            with open(self.data_file_path, 'rb') as f:
                self.data = pickle.load(f)
                self.update_intents()
        else:
            self.data = []
            with open("raw_intents.csv", 'r') as f:
                rows = f.readlines()
                for row in rows:
                    # NOTE(review): readlines() keeps '\n', so this check
                    # never fires; blank lines fall through to split(",").
                    if len(row) == 0:
                        continue
                    # Skip comment lines.
                    if row.startswith("#"):
                        continue
                    # Each row is "query,intent" (breaks if the query itself
                    # contains a comma — TODO confirm input format).
                    query, intent = row.split(",")
                    self.data.append({
                        'raw': {
                            'query': query.lower().strip(),
                            'intent': intent.lower().strip()
                        },
                        'nlu': {}
                    })
            self.update_intents()
            self.tokenize_data()
        print("Known intents:")
        print(self.known_intents)

        # Build the classifier and train on the whole dataset as one batch.
        self.model = IntentClassifier(n_intents=self.n_intents)
        self.model.compile(optimizer='adam',
                           loss='categorical_crossentropy',
                           metrics=['categorical_accuracy'])
        X, y = self.make_training_dataset(self.data)
        print(X["input_ids"].shape, X['attention_masks'].shape, y.shape)
        self.model.train_on_batch(X, y)
        self.model.summary()
        self.make_intents_embeddings()

    def update_intents(self, new_intent=None):
        """Refresh known_intents, n_intents and intents_labels.

        With new_intent=None the intent set is rebuilt from self.data;
        otherwise the given intent is appended if not already known.
        """
        if new_intent is None:
            self.known_intents = list(
                set([
                    x['raw']['intent'] for x in self.data
                    if 'intent' in x['raw']
                ]))
        else:
            if new_intent not in self.known_intents:
                self.known_intents.append(new_intent)
        # Sorting keeps the name -> label mapping deterministic.
        # NOTE(review): inserting a new intent re-sorts and re-numbers the
        # labels, but 'label' values already stored under self.data entries
        # are not updated — verify downstream code tolerates the shift.
        self.known_intents.sort()
        self.n_intents = len(self.known_intents)
        self.intents_labels = {k: i for i, k in enumerate(self.known_intents)}

    def make_intents_embeddings(self):
        """Compute and cache an embedding vector per known intent name."""
        embeddings = self.model.get_embedding(self.known_intents)
        self.intents_embeddings = {
            k: emb
            for k, emb in zip(self.known_intents, embeddings)
        }
        print(self.known_intents)

    def tokenize_data(self):
        """Encode every raw query and attach ids/masks/label under 'nlu'."""
        query = [x['raw']['query'] for x in self.data]
        encoded = self.encoder.encode(query)
        for i in range(len(self.data)):
            x = self.data[i]
            x['nlu'] = {
                'input_ids': encoded['input_ids'][i],
                'attention_masks': encoded['attention_masks'][i],
                'label': self.intents_labels[x['raw']['intent']]
            }

    @staticmethod
    def make_training_dataset(batch):
        """Turn a list of tokenized records into (X, y) training arrays.

        Returns:
            X: dict with 'input_ids' and 'attention_masks' arrays.
            y: one-hot encoded label matrix.
        """
        X = {
            "input_ids":
            np.array([x['nlu']['input_ids'] for x in batch]),
            'attention_masks':
            np.array([x['nlu']['attention_masks'] for x in batch])
        }
        y = np.array([x['nlu']['label'] for x in batch], dtype=np.int64)
        # NOTE(review): to_categorical without num_classes infers the width
        # from the batch's max label, so a small batch (e.g. the single
        # entry trained in loop()) may not match the model's n_intents
        # outputs — consider to_categorical(y, num_classes=...).
        y = to_categorical(y)
        return X, y

    def write_out(self):
        # Write out to file
        print("Saving data to file {}".format(self.data_file_path))
        with open(self.data_file_path, 'wb') as f:
            pickle.dump(self.data, f)
        #model.save(model_file_path)

    def predict_intent(self, txt):
        """Return (closest_intent, confidence) for a free-text query.

        Confidence is the cosine similarity between the query embedding and
        the best-matching intent-name embedding.
        """
        this_embedding = self.model.get_embedding([txt])
        all_embeddings = [
            self.intents_embeddings[i] for i in self.known_intents
        ]
        scores = cosine_similarity(this_embedding, all_embeddings)
        k = np.argmax(scores[0])
        confidence = scores[0][k]
        closest_intent = self.known_intents[k]
        return closest_intent, confidence

    def loop(self):
        """Console dialogue loop: predict, confirm with the user, learn.

        Each confirmed (query, intent) pair is appended to self.data and the
        model is fine-tuned on that single example.  Typing q/quit/stop
        exits the loop.
        """
        while True:
            print("Tell me what you would like to do")
            txt = input()
            txt = txt.lower()
            if txt in ['q', 'quit', 'stop']:
                return

            intent, confidence = self.predict_intent(txt)
            print("Is this your purpose? {} (confidence={:.3f})".format(
                intent, confidence))
            reply = input().lower()
            if reply in ['n', 'no', 'nope']:
                # Prediction rejected: ask the user for the true intent.
                print("What is the purpose?")
                intent = input().lower()
                if intent not in self.known_intents:
                    # The typed intent may be a synonym of a known one.
                    closest_intent, confidence = self.predict_intent(intent)
                    print("Is this the same as {} (confidence={:.3f})? [y, n]".
                          format(closest_intent, confidence))
                    reply = input().lower()
                    if reply in ['y', 'yes']:
                        intent = closest_intent
                    else:
                        # Genuinely new intent: register it and refresh the
                        # intent-name embeddings.
                        print("This is a new intent to me")
                        self.update_intents(intent)
                        self.make_intents_embeddings()
            print("Ok, so you are asking for: {}".format(intent))

            # Store the confirmed example together with its encoded form...
            encoded = self.encoder.encode(txt)
            entry = {
                'raw': {
                    'query': txt,
                    'intent': intent
                },
                'nlu': {
                    'input_ids': encoded['input_ids'][0],
                    'attention_masks': encoded['attention_masks'][0],
                    'label': self.intents_labels[intent]
                }
            }
            self.data.append(entry)

            # ...and fine-tune the model on this single example.
            X, y = self.make_training_dataset([entry])
            self.model.train_on_batch(X, y)
 def test_get_max_length(self):
     """get_max_length returns the length of the longest word in the list."""
     tokens = ["this", "is", "a", "list", "of", "words"]
     # "words" is the longest token, at five characters.
     self.assertEqual(IntentClassifier().get_max_length(tokens), 5)
 def test_cleaning_empty_set(self):
     """Cleaning an empty sentence list yields an empty list."""
     self.assertEqual(IntentClassifier().cleaning([]), [])
Exemple #13
0
from intent_classifier import IntentClassifier

if __name__ == '__main__':
    # Exercise IntentClassifier on a few sample queries.
    classifier = IntentClassifier()
    for query in ('What is this text about?',
                  'I want text summary',
                  'I want you to ask me a question'):
        print(classifier.get_scores(query))
    for query in ('I want you to ask me a question',
                  'I want text summary',
                  'What is your name, man?'):
        print(classifier.knn(query))
import pandas

from intent_classifier import IntentClassifier

# Paths to the training and held-out evaluation CSVs.
DATA_PATH = "./data/train_set.csv"
TEST_DATA_PATH = "./data/test_set.csv"

# NOTE(review): header=1 uses the file's second line as the header (then
# overridden by `names`), so the first data rows are skipped, while the
# test set below uses header=0 — confirm this asymmetry is intentional.
data = pandas.read_csv(
    DATA_PATH,
    encoding='utf-8',
    #    names=['intent', 'utterance', 'instance_id', 'date', 'remarks'], header=0)
    names=['intent', 'utterance'],
    header=1)

test_data = pandas.read_csv(TEST_DATA_PATH,
                            encoding='utf-8',
                            names=['intent', 'utterance'],
                            header=0)

intent_classifier = IntentClassifier()
X = data['utterance']
y = data['intent']
# Train without cross-validation.
intent_classifier.fit(X, y, cv=None)

X_test = test_data['utterance']
y_test = test_data['intent']
# Evaluate on the held-out set.
intent_classifier.eval(X_test, y_test)