def test_cleaning(self):
    """cleaning() lowercases, strips punctuation, and tokenizes each sentence."""
    raw = [
        "I am a sentence.",
        "you are ?? a sentence",
        "WE ARE THE SENTENCES!!!",
    ]
    expected = [
        ["i", "am", "a", "sentence"],
        ["you", "are", "a", "sentence"],
        ["we", "are", "the", "sentences"],
    ]
    classifier = IntentClassifier()
    self.assertEqual(classifier.cleaning(raw), expected)
def test_fit(self):
    """fit() must reject malformed arguments and accept lists/arrays.

    Bug fix: the original wrapped both invalid calls in a single try
    block, so once ``clf.fit('aaa', [1])`` raised, the second invalid
    call ``clf.fit(['aaa'], 1)`` was never exercised.  Each invalid
    call now has its own assertRaises check, which also fails the test
    automatically when no ValueError is raised.
    """
    backends = ('log_reg', 'perceptron', 'use')
    for clf in (IntentClassifier(b) for b in backends):
        # X must be a sequence of strings, not a bare string.
        with self.assertRaises(ValueError):
            clf.fit('aaa', [1])
        # y must be a sequence of labels, not a scalar.
        with self.assertRaises(ValueError):
            clf.fit(['aaa'], 1)
        # Valid inputs: plain lists and numpy arrays of matching length.
        clf.fit(['aaa'], [1])
        clf.fit(['aaa', 'sdfsd', 'sdfsdf'], [1, 2, 3])
        clf.fit(np.array(['aaa', 'sdfsd', 'sdfsdf']), np.array([1, 2, 3]))
def initialize(self):
    """Load the cached dataset or bootstrap it from raw_intents.csv,
    then build the classifier and train it on one full-dataset batch.

    Side effects: populates self.data and self.known_intents, replaces
    self.model, and rebuilds the per-intent embeddings.
    """
    # Reuse the pickled dataset when it exists; otherwise parse the CSV.
    RELOAD_DATA = os.path.exists(self.data_file_path)
    if RELOAD_DATA:
        logger.info("Reloading data from file {}".format(
            self.data_file_path))
        # NOTE(review): pickle.load on a file is unsafe for untrusted
        # input — assumed fine here because the engine wrote it itself.
        with open(self.data_file_path, 'rb') as f:
            self.data = pickle.load(f)
        self.update_intents()
    else:
        self.data = []
        with open("raw_intents.csv", 'r') as f:
            rows = f.readlines()
        for row in rows:
            # NOTE(review): readlines() keeps the trailing '\n', so this
            # length-0 test never matches; a truly blank line would crash
            # the tuple unpack below — confirm the CSV has no blank lines.
            if len(row) == 0:
                continue
            # Lines starting with '#' are comments in the raw CSV.
            if row.startswith("#"):
                continue
            # Expected format: one "query,intent" pair per line.
            query, intent = row.split(",")
            self.data.append({
                'raw': {
                    'query': query.lower().strip(),
                    'intent': intent.lower().strip()
                },
                # NLU features (token ids, masks, label) filled in later.
                'nlu': {}
            })
        self.update_intents()
        self.tokenize_data()
    print("Known intents:")
    print(self.known_intents)
    # Fresh model sized to the current number of intents.
    self.model = IntentClassifier(n_intents=self.n_intents)
    self.model.compile(optimizer='adam',
                       loss='categorical_crossentropy',
                       metrics=['categorical_accuracy'])
    X, y = self.make_training_dataset(self.data)
    print(X["input_ids"].shape, X['attention_masks'].shape, y.shape)
    # Single gradient step over the whole dataset as one batch.
    self.model.train_on_batch(X, y)
    self.model.summary()
    self.make_intents_embeddings()
class TestIntentClassifier(unittest.TestCase):
    """Test suite skeleton for IntentClassifier; most cases are stubs."""

    # Shared classifier built once when the class body is executed.
    ic_model = IntentClassifier()

    def test_load_dataset(self):
        pass

    def test_cleaning(self):
        pass

    def test_create_tokenizer(self):
        pass

    def test_get_max_length(self):
        pass

    def test_encoding_doc(self):
        pass

    def test_padding_doc(self):
        pass

    def test_one_hot(self):
        pass

    def test_create_model(self):
        pass

    def train_model(self):
        # Not prefixed with test_, so unittest discovery skips it.
        pass

    def test_predictions(self):
        pass

    def test_tests(self):
        # Sanity check that the harness itself runs.
        self.assertEqual(2, 2, "Should be 2")
def test_creation(self):
    """Constructing classifiers with known backend names must not raise."""
    for backend in ('log_reg', 'perceptron'):
        IntentClassifier(backend)
"""Flask + gevent web service exposing IntentClassifier predictions."""
# BUG FIX: Flask, request and jsonify were used below but never imported.
from flask import Flask, request, jsonify
# NOTE(review): modern gevent moved WSGIServer to gevent.pywsgi; confirm
# the pinned gevent version before changing this import.
from gevent.wsgi import WSGIServer
import pandas
import os
from intent_classifier import IntentClassifier

## Train Data Set File
DATA_PATH = "./data/train_set.csv"

app = Flask(__name__)

# Read the Train Set.
# NOTE(review): header=1 treats the second row as the header, so the first
# data row is skipped — confirm the CSV really has a banner row.
data = pandas.read_csv(DATA_PATH,
                       encoding='utf-8',
                       names=['intent', 'utterance'],
                       header=1)

intent_classifier = IntentClassifier()
X = data['utterance']
y = data['intent']

# Train the Engine
intent_classifier.fit(X, y, cv=None)


@app.route('/predict', methods=['POST'])
def predict():
    """Predict the intent of the JSON payload {'input': <utterance>}."""
    # Renamed local: 'input' shadowed the builtin of the same name.
    utterance = request.json['input']
    results = intent_classifier.predict(utterance)
    return jsonify(results)


if __name__ == '__main__':
    server = WSGIServer(('', int(os.environ.get('PORT', 5000))), app)
    server.serve_forever()
""" import flask import logging from intent_classifier import IntentClassifier from decouple import config CLASSIFIER = "CLASSIFIER" logging.basicConfig( format='%(levelname)s: [%(asctime)s.%(msecs)03d] {} %(name)s ' '%(filename)s:%(funcName)s:%(lineno)s: %(message)s'.format( "CLASSIFIER_WEB_SERVICE"), datefmt='%Y-%m-%d %H:%M:%S', level="DEBUG") log = logging.getLogger(__name__) app = flask.Flask(__name__) app.config[CLASSIFIER] = IntentClassifier() def get_settings(): app.config["HOST_ADDRESS"] = config('IC_FLASK_ADDRESS', default="0.0.0.0", cast=str) app.config["HOST_PORT"] = config('IC_FLASK_PORT', default="8585", cast=int) app.config["DEBUG"] = config('IC_FLASK_DEBUG', default=False, cast=bool) @app.route("/get_event/<query>", methods=["GET"]) def get_event(query): log.info("get_event: IN") if not query: log.warning("get_event: empty query!")
#!/usr/bin/env python
# -*- coding: utf-8 -*-
"""Smoke test: run a single (Russian) utterance through the classifier."""
from intent_classifier import IntentClassifier

model = IntentClassifier()
# Example utterance: "Waiting for an operator's reply".
sentence = "Жду ответа оператора"
cls = model.predict(sentence)
#!/usr/bin/env python
"""HTTP endpoint returning the best-matching intent for a text query."""
from flask import Flask, request, jsonify
from intent_classifier import IntentClassifier

app = Flask(__name__)
intent_classifier = IntentClassifier(
    path_to_datafile='./data/data.tsv',
    path_to_embedding='./data/glove.6B.100d.txt'
)


@app.route("/get_intent", methods=['POST'])
def respond():
    """POST {'text': ...} -> JSON {'intent': ..., 'score': ...}."""
    text = request.json['text']
    intent, score = _get_intent(text)
    return jsonify({'intent': intent, 'score': score})


def _get_intent(text):
    """Return the (intent, score) pair the k-NN classifier picks for *text*.

    Args:
        text: input utterance

    Returns:
        intent and its score
    """
    scores = intent_classifier.get_scores(text)
    best_intent, best_score = intent_classifier.knn(text)
    print(scores, best_intent, best_score)  # debug trace of all scores
    return best_intent, best_score
class Engine:
    """Interactive intent-classification engine.

    Maintains the labelled query dataset, an IntentClassifier model fed
    with tokenizer output (input_ids / attention_masks), and one embedding
    per known intent used for cosine-similarity prediction.
    """

    def __init__(self, data_file_path="intents_db.pkl"):
        # List of {'raw': {'query', 'intent'}, 'nlu': {...}} records.
        self.data = []
        self.known_intents = []
        self.n_intents = 0
        # intent name -> integer class label
        self.intents_labels = {}
        # intent name -> embedding vector
        self.intents_embeddings = {}
        self.data_file_path = data_file_path
        # self.model_file_path = "intent_classifier.h5"
        self.encoder = Tokenizer()
        self.model = None

    def initialize(self):
        """Load the pickled dataset (or bootstrap from raw_intents.csv),
        then build the model and train it on one full-dataset batch."""
        RELOAD_DATA = os.path.exists(self.data_file_path)
        if RELOAD_DATA:
            logger.info("Reloading data from file {}".format(
                self.data_file_path))
            # NOTE(review): pickle.load is unsafe on untrusted files —
            # assumed fine here since the engine wrote this file itself.
            with open(self.data_file_path, 'rb') as f:
                self.data = pickle.load(f)
            self.update_intents()
        else:
            self.data = []
            with open("raw_intents.csv", 'r') as f:
                rows = f.readlines()
            for row in rows:
                # BUG FIX: readlines() keeps the trailing '\n', so the old
                # `len(row) == 0` test never matched and a blank line
                # crashed the tuple unpack below.  Skip whitespace-only
                # lines explicitly.
                if not row.strip():
                    continue
                # '#' marks a comment line in the raw CSV.
                if row.startswith("#"):
                    continue
                # Expected format: one "query,intent" pair per line.
                # NOTE(review): a comma inside the query still breaks this.
                query, intent = row.split(",")
                self.data.append({
                    'raw': {
                        'query': query.lower().strip(),
                        'intent': intent.lower().strip()
                    },
                    # NLU features (ids, masks, label) filled by tokenize_data.
                    'nlu': {}
                })
            self.update_intents()
            self.tokenize_data()
        print("Known intents:")
        print(self.known_intents)
        self.model = IntentClassifier(n_intents=self.n_intents)
        self.model.compile(optimizer='adam',
                           loss='categorical_crossentropy',
                           metrics=['categorical_accuracy'])
        X, y = self.make_training_dataset(self.data)
        print(X["input_ids"].shape, X['attention_masks'].shape, y.shape)
        # Single gradient step over the whole dataset as one batch.
        self.model.train_on_batch(X, y)
        self.model.summary()
        self.make_intents_embeddings()

    def update_intents(self, new_intent=None):
        """Refresh the sorted intent list and the name->label mapping.

        With no argument, rebuild the list from self.data; with
        *new_intent*, append it if unseen.  Labels are re-derived from the
        sorted order either way.
        """
        if new_intent is None:
            self.known_intents = list(
                set([
                    x['raw']['intent'] for x in self.data
                    if 'intent' in x['raw']
                ]))
        else:
            if new_intent not in self.known_intents:
                self.known_intents.append(new_intent)
        # Sorting keeps label assignment deterministic across runs.
        self.known_intents.sort()
        self.n_intents = len(self.known_intents)
        self.intents_labels = {k: i for i, k in enumerate(self.known_intents)}

    def make_intents_embeddings(self):
        """Embed every known intent name and cache the vectors by name."""
        embeddings = self.model.get_embedding(self.known_intents)
        self.intents_embeddings = {
            k: emb
            for k, emb in zip(self.known_intents, embeddings)
        }
        print(self.known_intents)

    def tokenize_data(self):
        """Encode every raw query and store ids/masks/label under 'nlu'."""
        query = [x['raw']['query'] for x in self.data]
        encoded = self.encoder.encode(query)
        for i in range(len(self.data)):
            x = self.data[i]
            x['nlu'] = {
                'input_ids': encoded['input_ids'][i],
                'attention_masks': encoded['attention_masks'][i],
                'label': self.intents_labels[x['raw']['intent']]
            }

    @staticmethod
    def make_training_dataset(batch):
        """Stack a batch of records into (X, y) arrays.

        Returns:
            X: dict with 'input_ids' and 'attention_masks' arrays.
            y: one-hot label matrix (via to_categorical).
        """
        X = {
            "input_ids": np.array([x['nlu']['input_ids'] for x in batch]),
            'attention_masks':
            np.array([x['nlu']['attention_masks'] for x in batch])
        }
        y = np.array([x['nlu']['label'] for x in batch], dtype=np.int64)
        y = to_categorical(y)
        return X, y

    def write_out(self):
        """Persist the dataset (not the model) to self.data_file_path."""
        print("Saving data to file {}".format(self.data_file_path))
        with open(self.data_file_path, 'wb') as f:
            pickle.dump(self.data, f)
        #model.save(model_file_path)

    def predict_intent(self, txt):
        """Return (closest_intent, confidence) for *txt* by cosine
        similarity between its embedding and the cached intent embeddings."""
        this_embedding = self.model.get_embedding([txt])
        all_embeddings = [
            self.intents_embeddings[i] for i in self.known_intents
        ]
        scores = cosine_similarity(this_embedding, all_embeddings)
        k = np.argmax(scores[0])
        confidence = scores[0][k]
        closest_intent = self.known_intents[k]
        return closest_intent, confidence

    def loop(self):
        """Interactive console loop: predict, confirm with the user,
        learn new intents, and fine-tune on each confirmed example."""
        while True:
            print("Tell me what you would like to do")
            txt = input()
            txt = txt.lower()
            if txt in ['q', 'quit', 'stop']:
                return
            intent, confidence = self.predict_intent(txt)
            print("Is this your purpose? {} (confidence={:.3f})".format(
                intent, confidence))
            reply = input().lower()
            if reply in ['n', 'no', 'nope']:
                print("What is the purpose?")
                intent = input().lower()
                if intent not in self.known_intents:
                    # Maybe the user typed a near-synonym of a known intent.
                    closest_intent, confidence = self.predict_intent(intent)
                    print("Is this the same as {} (confidence={:.3f})? [y, n]".
                          format(closest_intent, confidence))
                    reply = input().lower()
                    if reply in ['y', 'yes']:
                        intent = closest_intent
                    else:
                        print("This is a new intent to me")
                        self.update_intents(intent)
                        self.make_intents_embeddings()
            print("Ok, so you are asking for: {}".format(intent))
            # Record the confirmed example and fine-tune on it immediately.
            encoded = self.encoder.encode(txt)
            entry = {
                'raw': {
                    'query': txt,
                    'intent': intent
                },
                'nlu': {
                    'input_ids': encoded['input_ids'][0],
                    'attention_masks': encoded['attention_masks'][0],
                    'label': self.intents_labels[intent]
                }
            }
            self.data.append(entry)
            X, y = self.make_training_dataset([entry])
            self.model.train_on_batch(X, y)
def test_get_max_length(self):
    """get_max_length() returns the character length of the longest token."""
    tokens = ["this", "is", "a", "list", "of", "words"]
    # Longest token is "words" -> 5 characters.
    self.assertEqual(IntentClassifier().get_max_length(tokens), 5)
def test_cleaning_empty_set(self):
    """cleaning() applied to an empty corpus yields an empty list."""
    self.assertEqual(IntentClassifier().cleaning([]), [])
from intent_classifier import IntentClassifier

if __name__ == '__main__':
    """ Test IntentClassifier on some examples """
    # Ad-hoc smoke test: print raw scores for three utterances, then the
    # nearest-neighbour intent for three more.
    c = IntentClassifier()
    for utterance in ('What is this text about?',
                      'I want text summary',
                      'I want you to ask me a question'):
        print(c.get_scores(utterance))
    for utterance in ('I want you to ask me a question',
                      'I want text summary',
                      'What is your name, man?'):
        print(c.knn(utterance))
"""Train IntentClassifier on the train split and evaluate on the test split."""
import pandas
from intent_classifier import IntentClassifier

DATA_PATH = "./data/train_set.csv"
TEST_DATA_PATH = "./data/test_set.csv"

# Alternative schema kept from the original for reference:
# names=['intent', 'utterance', 'instance_id', 'date', 'remarks'], header=0)
data = pandas.read_csv(DATA_PATH,
                       encoding='utf-8',
                       names=['intent', 'utterance'],
                       header=1)
test_data = pandas.read_csv(TEST_DATA_PATH,
                            encoding='utf-8',
                            names=['intent', 'utterance'],
                            header=0)

intent_classifier = IntentClassifier()
X = data['utterance']
y = data['intent']
intent_classifier.fit(X, y, cv=None)

X_test = test_data['utterance']
y_test = test_data['intent']
intent_classifier.eval(X_test, y_test)