def __read_test_data(self, file, num):
    """Load a 2D matrix of user questions vs actual questions from a CSV test file.

    Row 0 (from column 2 on) holds user-phrased questions; column 1 holds the
    canonical answer question for each row. A cell marked "i" (ideal) or "r"
    (reasonable) links a user question to an acceptable answer.

    Args:
        file: path to the CSV test matrix.
        num: number of user-question columns to load; None (or too large)
            means all of them.

    Returns:
        dict mapping sanitized user question -> list of sanitized acceptable
        answer questions.
    """
    # load 2D matrix of user questions vs actual questions
    # fix: use a context manager so the file handle is closed (was leaked)
    with open(file) as f:
        test_data = list(csv.reader(f))
    numrows = len(test_data)
    numcols = len(test_data[0])
    # number of questions to ask (default is all of them);
    # the first two columns are metadata, hence the -2
    if num is None or num > numcols - 2:
        num = numcols - 2
    # get user questions
    user_questions = {}
    for c in range(0, num):
        user_question = sanitize_string(test_data[0][c + 2])
        # get ideal ("i") and reasonable ("r") matches for this user question
        for r in range(1, numrows):
            match = sanitize_string(test_data[r][c + 2])
            if match == "i" or match == "r":
                answer = sanitize_string(test_data[r][1])
                user_questions.setdefault(user_question, []).append(answer)
    return user_questions
def evaluate(
    self,
    question: str,
    shared_root: str,
    canned_question_match_disabled=False,
) -> QuestionClassiferPredictionResult:
    """Answer a user question with this mentor's w2v classifier.

    First tries an exact/paraphrase match against the mentor's known question
    texts (unless disabled), then falls back to the trained classifier, and
    finally substitutes an off-topic response when confidence is below the
    configured threshold. Every prediction is recorded via
    create_user_question for feedback.
    """
    # Fast path: the question text (or a known paraphrase) matches directly.
    if not canned_question_match_disabled:
        sanitized = sanitize_string(question)
        if sanitized in self.mentor.questions_by_text:
            matched = self.mentor.questions_by_text[sanitized]
            match_type = (
                "EXACT"
                if sanitized == sanitize_string(matched["question_text"])
                else "PARAPHRASE"
            )
            feedback_id = create_user_question(
                self.mentor.id,
                question,
                matched["answer_id"],
                match_type,
                1.0,
            )
            return QuestionClassiferPredictionResult(
                matched["answer_id"],
                matched["answer"],
                matched["media"],
                1.0,
                feedback_id,
            )
    # Classifier path: embed the question and predict the best answer.
    processed_question = SpacyPreprocessor(shared_root).transform(question)
    w2v_vector, _lstm_vector = self.w2v_model.w2v_for_question(processed_question)
    threshold = get_off_topic_threshold()
    (
        answer_id,
        answer_text,
        answer_media,
        highest_confidence,
    ) = self.__get_prediction(w2v_vector)
    is_off_topic = highest_confidence < threshold
    # Record the classifier's raw prediction (even when it ends up off-topic).
    feedback_id = create_user_question(
        self.mentor.id,
        question,
        answer_id,
        "OFF_TOPIC" if is_off_topic else "CLASSIFIER",
        highest_confidence,
    )
    if is_off_topic:
        answer_id, answer_text, answer_media = self.__get_offtopic()
    return QuestionClassiferPredictionResult(
        answer_id, answer_text, answer_media, highest_confidence, feedback_id
    )
def __get_prediction(self, w2v_vector):
    """Predict the best answer for an embedded question vector.

    Args:
        w2v_vector: 1-D question embedding; reshaped to a single-sample batch.

    Returns:
        tuple of (answer_id, answer_text, answer_media, highest_confidence).

    Raises:
        Exception: if the model yields no prediction, or the predicted answer
            text has no id in classifier_data (data out of sync with model).
    """
    # lazy-load the trained model on first use
    if not self.model:
        self.model = joblib.load(self.model_file)
    test_vector = w2v_vector.reshape(1, -1)
    prediction = self.model.predict(test_vector)
    decision = self.model.decision_function(test_vector)
    # Only the top score is needed, so take max() instead of sorting all
    # scores (the previous full sort was O(n log n) for no benefit).
    scores = decision[0] if decision.ndim >= 2 else decision
    highest_confidence = max(scores)
    if not (prediction and prediction[0]):
        raise Exception(
            f"Prediction should be a list with at least one element (answer text) but found {prediction}"
        )
    answer_text = prediction[0]
    # Single lookup instead of three separate membership checks.
    entry = self.mentor.questions_by_answer.get(sanitize_string(answer_text))
    answer_id = entry.get("answer_id", "") if entry else ""
    answer_media = entry.get("media", []) if entry else []
    if not answer_id:
        raise Exception(
            f"No answer id found for answer text (classifier_data may be out of sync with trained model): {answer_text}"
        )
    return answer_id, answer_text, answer_media, highest_confidence
def test_accuracy(self, classifier, test_file, num=None):
    """Score *classifier* against a CSV test set and return its accuracy.

    Loads the test matrix from checkpoint/tests/<mentor-id>/<test_file>,
    asks each user question, counts an answer as correct when its sanitized
    text is among that question's ideal/reasonable matches, and prints a
    short report of every miss.

    Args:
        classifier: classifier under test (must expose .mentor and work with
            self.answer_confidence).
        test_file: test CSV filename under the mentor's tests directory.
        num: max number of questions to evaluate; None means all.

    Returns:
        float: fraction of questions answered correctly (0.0 for an empty
        test set).
    """
    mentor = classifier.mentor
    path = os.path.join("checkpoint", "tests", mentor.id, test_file)
    user_questions = self.__read_test_data(path, num)
    print("Loaded test set '{0}' of {1} questions for {2}".format(
        test_file, len(user_questions), mentor.id))
    correct_predictions = 0
    total_predictions = 0
    for q in user_questions:
        ID, text, confidence = self.answer_confidence(classifier, q)
        if sanitize_string(text) in user_questions[q]:
            correct_predictions += 1
        else:
            # report the miss: what we expected vs what the classifier said
            print("{0}. '{1}'".format(total_predictions + 1, q))
            print(" Expected:")
            for i in user_questions[q]:
                print(" - {0}".format(" ".join(i.split()[:15])))
            print(" Got:\n - {0}".format(" ".join(text.split()[:15])))
        total_predictions += 1
    # fix: guard against ZeroDivisionError when the test set is empty
    accuracy = (
        correct_predictions / total_predictions if total_predictions else 0.0
    )
    print("{0}/{1} ({2:.1f}%) questions answered correctly".format(
        correct_predictions, total_predictions, accuracy * 100,
    ))
    return accuracy
def evaluate(
    self, question: str, shared_root, canned_question_match_disabled: bool = False
) -> QuestionClassiferPredictionResult:
    """Answer a user question with this mentor's transformer classifier.

    Tries an exact/paraphrase text match first (unless disabled), then falls
    back to the embedding classifier, substituting an off-topic response when
    confidence falls below OFF_TOPIC_THRESHOLD_DEFAULT. Each prediction is
    logged via create_user_question for feedback.
    """
    sanitized_question = sanitize_string(question)
    # Fast path: direct lookup of the question text (or a known paraphrase).
    direct_match = (
        None
        if canned_question_match_disabled
        else self.mentor.questions_by_text.get(sanitized_question)
    )
    if direct_match is not None:
        match_kind = (
            "EXACT"
            if sanitized_question == sanitize_string(direct_match["question_text"])
            else "PARAPHRASE"
        )
        feedback_id = create_user_question(
            self.mentor.id,
            question,
            direct_match["answer_id"],
            match_kind,
            1.0,
        )
        return QuestionClassiferPredictionResult(
            direct_match["answer_id"],
            direct_match["answer"],
            direct_match["media"],
            1.0,
            feedback_id,
        )
    # Classifier path: embed the question and predict the best answer.
    embedded_question = self.transformer.get_embeddings(question)
    answer_id, answer, answer_media, highest_confidence = self.__get_prediction(
        embedded_question
    )
    off_topic = highest_confidence < OFF_TOPIC_THRESHOLD_DEFAULT
    # Record the raw classifier prediction (even when it ends up off-topic).
    feedback_id = create_user_question(
        self.mentor.id,
        question,
        answer_id,
        "OFF_TOPIC" if off_topic else "CLASSIFIER",
        highest_confidence,
    )
    if off_topic:
        answer_id, answer, answer_media = self.__get_offtopic()
    return QuestionClassiferPredictionResult(
        answer_id, answer, answer_media, highest_confidence, feedback_id
    )
def load(self):
    """Populate this mentor's lookup tables from its fetched data record.

    Fills self.topics, self.utterances_by_type, self.answer_id_by_answer,
    self.questions_by_id, self.questions_by_text and self.questions_by_answer
    from the result of fetch_mentor_data. Only COMPLETE answers are indexed;
    UTTERANCE-type questions are routed to utterances_by_type instead.
    """
    data = fetch_mentor_data(self.id)
    self.topics.extend(subject["name"] for subject in data.get("subjects", []))
    self.topics.extend(topic["name"] for topic in data.get("topics", []))
    for answer in data.get("answers", []):
        question = answer["question"]
        if answer["status"] != "COMPLETE":
            continue
        if question["type"] == "UTTERANCE":
            # utterances are grouped by name, not indexed as questions
            self.utterances_by_type.setdefault(question["name"], []).append(
                [answer["_id"], answer["transcript"], answer.get("media", [])]
            )
            continue
        record = {
            "id": question["_id"],
            "question_text": question["question"],
            "paraphrases": question["paraphrases"],
            "answer": answer["transcript"],
            "answer_id": answer["_id"],
            "media": answer.get("media", []),
            "topics": [],
        }
        self.answer_id_by_answer[answer["_id"]] = answer["transcript"]
        self.questions_by_id[question["_id"]] = record
    # second pass: attach topics and build the text/answer lookup indexes
    for question in data.get("questions", []):
        record = self.questions_by_id.get(question["question"]["_id"], None)
        if record is None:
            continue
        record["topics"].extend(topic["name"] for topic in question["topics"])
        self.questions_by_text[sanitize_string(record["question_text"])] = record
        for paraphrase in record["paraphrases"]:
            self.questions_by_text[sanitize_string(paraphrase)] = record
        self.questions_by_answer[sanitize_string(record["answer"])] = record
def __get_prediction(
    self, embedded_question
) -> Tuple[str, str, List[Media], float]:
    """Classify one embedded question and return its best answer.

    Returns:
        tuple of (answer_id, answer_text, answer_media, confidence), where
        confidence is the model's highest decision-function score.
    """
    batch = [embedded_question]
    answer_id = self.model.predict(batch)[0]
    confidence = max(self.model.decision_function(batch)[0])
    # the model predicts answer ids; resolve the id back to transcript text
    answer_text = self.mentor.answer_id_by_answer[answer_id]
    entry = self.mentor.questions_by_answer.get(sanitize_string(answer_text), {})
    answer_media = entry.get("media", [])
    return answer_id, answer_text, answer_media, float(confidence)
def get_answer(self, question, canned_question_match_disabled=False):
    """Answer *question*, preferring an exact canned match over the classifier.

    Returns either (answer_id, answer_question, 1.0) for a direct text match,
    or whatever __get_prediction returns for the classifier fallback.
    """
    if not canned_question_match_disabled:
        key = sanitize_string(question)
        if key in self.mentor.question_ids:
            # direct hit on a known question text: full confidence
            matched_id = self.mentor.question_ids[key]
            return matched_id, self.mentor.ids_answers[matched_id], 1.0
    # Classifier fallback: tokenize, embed, pad, then predict with topics.
    processed_question = NLTKPreprocessor().transform(question)
    w2v_vector, lstm_vector = self.w2v_model.w2v_for_question(processed_question)
    padded_vector = pad_sequences(
        [lstm_vector],
        maxlen=25,
        dtype="float32",
        padding="post",
        truncating="post",
        value=0.0,
    )
    topic_vector = self.__get_topic_vector(padded_vector)
    return self.__get_prediction(w2v_vector, topic_vector)
def load_ids_answers(self):
    """Build answer/question lookup tables from classifier_data.csv.

    Reads the mentor's classifier_data.csv (columns ID, text, question,
    topics; "question" holds newline-separated phrasings, the first being
    canonical), registers every question with its topics, and prunes topics
    that end up with no questions.

    Returns:
        tuple of (ids_answers, answer_ids, ids_questions, question_ids,
        questions_by_id).
    """
    classifier_data = pd.read_csv(
        self.mentor_data_path("classifier_data.csv"))
    corpus = classifier_data.fillna("")
    answer_ids = {}
    ids_answers = {}
    question_ids = {}
    ids_questions = {}
    questions_by_id = {}
    for row_idx in range(len(corpus)):
        row = corpus.iloc[row_idx]
        # fix: don't shadow the builtin `id`
        qid = row["ID"]
        topics_csv = row["topics"]
        # normalize non-breaking spaces that appear in exported CSV text
        answer = row["text"].replace("\u00a0", " ")
        answer_ids[answer] = qid
        ids_answers[qid] = answer
        questions = row["question"].split("\n")
        # fix: distinct inner loop variable (previously shadowed the row index)
        for q_idx, question in enumerate(questions):
            question_ids[sanitize_string(question)] = qid
            if q_idx == 0:
                # the first listed phrasing is the canonical question text
                questions_by_id[qid] = {"question_text": question}
            _add_question_to_topics(
                qid,
                topics_csv,
                self.topics_by_id,
                self.topic_id_default,
                topic_id_by_question_topic_id=self.
                topic_id_by_question_topic_id,
            )
        ids_questions[qid] = questions
    # iterate over a snapshot because empty topics are deleted in place
    for tid in list(self.topics_by_id.keys()):
        topic = self.topics_by_id[tid]
        if not topic["questions"]:
            del self.topics_by_id[tid]
            continue
        topic["questions"] = to_unique_sorted(topic["questions"])
    return ids_answers, answer_ids, ids_questions, question_ids, questions_by_id