def answerQuestion(question):
    """Answer incoming FAQ question(s) with the top-2 most likely answers.

    Trains a TF-IDF + logistic-regression pipeline on the remote FAQ
    dataset on every call, then scores *question* against it.

    Args:
        question: the incoming question(s), in whatever form
            StreamSpacyTokenizer accepts (presumably a list of strings —
            TODO confirm against callers).

    Returns:
        Predicted answers (top 2 per question) from the trained classifier.
    """
    # Load (question, answer) pairs from the remote FAQ dataset.
    reader = FaqDatasetReader()
    faq_data = reader.read(data_url=FAQ_DATASET_URL,
                           x_col_name='Question',
                           y_col_name='Answer')
    iterator = DataLearningIterator(data=faq_data)
    x, y = iterator.get_instances()

    # Lemmatizing tokenizer. First call tokenizes the raw strings; the
    # second call (on lists of tokens) detokenizes, joining the lemmas
    # back into strings for the TF-IDF vectorizer.
    tokenizer = StreamSpacyTokenizer(lemmas=True)
    x_tokenized = tokenizer(x)
    x_tokens_joined = tokenizer(x_tokenized)

    # Fit TF-IDF vectorizer on train FAQ dataset.
    vectorizer = SklearnComponent(
        model_class="sklearn.feature_extraction.text:TfidfVectorizer",
        save_path='../model/tfidf.pkl',
        infer_method='transform')
    vectorizer.fit(x_tokens_joined)

    # Now collect (x, y) pairs: x_train - vectorized question,
    # y_train - answer from FAQ.
    x_train = vectorizer(x_tokens_joined)
    y_train = y

    # Use the top 2 answers for each incoming question (top_n param).
    # FIX: sklearn's LogisticRegression regularization parameter is
    # uppercase 'C' ('c' is unknown and would be silently dropped), and
    # the classifier gets its own save_path so persisting it cannot
    # clobber the vectorizer's tfidf.pkl.
    clf = SklearnComponent(
        model_class="sklearn.linear_model:LogisticRegression",
        top_n=2,
        C=1000,
        penalty='l2',
        save_path='../model/logreg.pkl',
        infer_method='predict')
    clf.fit(x_train, y_train)

    # Push the incoming question(s) through the identical preprocessing
    # pipeline (tokenize -> detokenize -> vectorize) before classifying.
    test_questions = question
    tokenized_test_questions = tokenizer(test_questions)
    joined_test_q_tokens = tokenizer(tokenized_test_questions)
    test_q_vectorized = vectorizer(joined_test_q_tokens)
    answers = clf(test_q_vectorized)
    return answers
def _fit(model: Estimator, iterator: DataLearningIterator, train_config) -> Estimator:
    """Fit *model* on the 'train' split of *iterator*, persist it, and return it.

    *train_config* is accepted for interface compatibility but not read here.
    """
    features, targets = iterator.get_instances('train')
    model.fit(features, targets)
    model.save()
    return model
# NOTE(review): this is a byte-for-byte duplicate of the `_fit` defined
# immediately above; at import time this second definition shadows the
# first. One of the two should be removed — confirm which callers expect.
def _fit(model: Estimator, iterator: DataLearningIterator, train_config) -> Estimator:
    """Fit *model* on the 'train' split of *iterator*, save it, and return it."""
    x, y = iterator.get_instances('train')
    model.fit(x, y)
    model.save()  # persist the fitted estimator to its configured save_path
    return model