Code example #1
    def __read_test_data(self, file, num):
        # load 2D matrix of user questions vs actual questions
        with open(file) as f:
            test_data = list(csv.reader(f))
        numrows = len(test_data)
        numcols = len(test_data[0])

        # number of questions to ask (default is all of them)
        if num is None or num > numcols - 2:
            num = numcols - 2

        # get user questions
        user_questions = {}
        for c in range(0, num):
            user_question = sanitize_string(test_data[0][c + 2])

            # get ideal and reasonable matches for user questions
            for r in range(1, numrows):
                match = sanitize_string(test_data[r][c + 2])
                if match == "i" or match == "r":
                    answer = sanitize_string(test_data[r][1])
                    try:
                        user_questions[user_question].append(answer)
                    except KeyError:
                        user_questions[user_question] = [answer]

        return user_questions
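
The reader above indexes a fixed layout: row 0 carries the user questions from column 2 onward, column 1 of each later row carries an answer text, and a cell holding "i" or "r" marks that answer as an ideal or reasonable match for that question. Below is a minimal sketch of such a file; the file name and contents are hypothetical, and the exact output keys depend on what sanitize_string does (here assumed to lower-case and strip punctuation).

    # Hypothetical test matrix; only the indexing scheme is taken from __read_test_data.
    import csv

    rows = [
        ["", "answer", "what do you do?", "where did you study?"],  # row 0: user questions from column 2
        ["q1", "I teach computer science.", "i", ""],               # "i" marks an ideal match
        ["q2", "I studied at USC.", "", "r"],                       # "r" marks a reasonable match
    ]
    with open("example_test.csv", "w", newline="") as f:
        csv.writer(f).writerows(rows)
    # __read_test_data would then map each sanitized user question to its matching
    # answer texts, e.g. {"what do you do": ["i teach computer science"], ...}.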
Code example #2
    def evaluate(
        self,
        question: str,
        shared_root: str,
        canned_question_match_disabled=False,
    ) -> QuestionClassiferPredictionResult:
        if not canned_question_match_disabled:
            sanitized_question = sanitize_string(question)
            if sanitized_question in self.mentor.questions_by_text:
                q = self.mentor.questions_by_text[sanitized_question]
                answer_id = q["answer_id"]
                answer = q["answer"]
                answer_media = q["media"]
                feedback_id = create_user_question(
                    self.mentor.id,
                    question,
                    answer_id,
                    "PARAPHRASE"
                    if sanitized_question != sanitize_string(q["question_text"])
                    else "EXACT",
                    1.0,
                )
                return QuestionClassiferPredictionResult(
                    answer_id, answer, answer_media, 1.0, feedback_id
                )

        preprocessor = SpacyPreprocessor(shared_root)
        processed_question = preprocessor.transform(question)
        w2v_vector, lstm_vector = self.w2v_model.w2v_for_question(processed_question)
        off_topic_threshold = get_off_topic_threshold()
        (
            answer_id,
            answer_text,
            answer_media,
            highest_confidence,
        ) = self.__get_prediction(w2v_vector)
        feedback_id = create_user_question(
            self.mentor.id,
            question,
            answer_id,
            "OFF_TOPIC" if highest_confidence < off_topic_threshold else "CLASSIFIER",
            highest_confidence,
        )
        if highest_confidence < off_topic_threshold:
            answer_id, answer_text, answer_media = self.__get_offtopic()
        return QuestionClassiferPredictionResult(
            answer_id, answer_text, answer_media, highest_confidence, feedback_id
        )
Code example #3
    def __get_prediction(self, w2v_vector):
        if not self.model:
            self.model = joblib.load(self.model_file)
        test_vector = w2v_vector.reshape(1, -1)
        prediction = self.model.predict(test_vector)
        decision = self.model.decision_function(test_vector)
        confidence_scores = (
            sorted(decision[0]) if decision.ndim >= 2 else sorted(decision)
        )
        highest_confidence = confidence_scores[-1]
        if not (prediction and prediction[0]):
            raise Exception(
                f"Prediction should be a list with at least one element (answer text) but found {prediction}"
            )
        answer_text = prediction[0]

        answer_key = sanitize_string(answer_text)
        answer_id = (
            self.mentor.questions_by_answer[answer_key].get("answer_id", "")
            if answer_key in self.mentor.questions_by_answer
            else ""
        )
        answer_media = (
            self.mentor.questions_by_answer[answer_key].get("media", [])
            if answer_key in self.mentor.questions_by_answer
            else []
        )
        if not answer_id:
            raise Exception(
                f"No answer id found for answer text (classifier_data may be out of sync with trained model): {answer_text}"
            )
        return answer_id, answer_text, answer_media, highest_confidence
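
The decision.ndim check reflects scikit-learn behavior: decision_function returns a 1-D array for binary classifiers and a 2-D (n_samples, n_classes) array for multiclass ones. A minimal sketch of the two shapes follows; LinearSVC is only an assumption here, since the excerpt does not show which estimator sits behind self.model.

    # Sketch of the two decision_function shapes the ndim check distinguishes.
    import numpy as np
    from sklearn.svm import LinearSVC

    X = np.random.rand(6, 4)
    binary = LinearSVC().fit(X, ["a", "a", "a", "b", "b", "b"])
    multi = LinearSVC().fit(X, ["a", "a", "b", "b", "c", "c"])
    print(binary.decision_function(X[:1]).ndim)  # 1 -> sorted(decision) branch
    print(multi.decision_function(X[:1]).ndim)   # 2 -> sorted(decision[0]) branch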
Code example #4
    def test_accuracy(self, classifier, test_file, num=None):
        mentor = classifier.mentor
        path = os.path.join("checkpoint", "tests", mentor.id, test_file)
        user_questions = self.__read_test_data(path, num)

        print("Loaded test set '{0}' of {1} questions for {2}".format(
            test_file, len(user_questions), mentor.id))

        correct_predictions = 0
        total_predictions = 0

        for q in user_questions:
            ID, text, confidence = self.answer_confidence(classifier, q)
            if sanitize_string(text) in user_questions[q]:
                correct_predictions += 1
            else:
                print("{0}. '{1}'".format(total_predictions + 1, q))
                print("   Expected:")
                for i in user_questions[q]:
                    print("    - {0}".format(" ".join(i.split()[:15])))
                print("   Got:\n    - {0}".format(" ".join(text.split()[:15])))
            total_predictions += 1

        print("{0}/{1} ({2:.1f}%) questions answered correctly".format(
            correct_predictions,
            total_predictions,
            (correct_predictions / total_predictions) * 100,
        ))

        return correct_predictions / total_predictions
Code example #5
    def evaluate(
        self, question: str, shared_root, canned_question_match_disabled: bool = False
    ) -> QuestionClassiferPredictionResult:

        sanitized_question = sanitize_string(question)
        if not canned_question_match_disabled:
            if sanitized_question in self.mentor.questions_by_text:
                q = self.mentor.questions_by_text[sanitized_question]
                answer_id = q["answer_id"]
                answer = q["answer"]
                answer_media = q["media"]
                feedback_id = create_user_question(
                    self.mentor.id,
                    question,
                    answer_id,
                    "PARAPHRASE"
                    if sanitized_question != sanitize_string(q["question_text"])
                    else "EXACT",
                    1.0,
                )
                return QuestionClassiferPredictionResult(
                    answer_id, answer, answer_media, 1.0, feedback_id
                )
        embedded_question = self.transformer.get_embeddings(question)
        answer_id, answer, answer_media, highest_confidence = self.__get_prediction(
            embedded_question
        )
        feedback_id = create_user_question(
            self.mentor.id,
            question,
            answer_id,
            "OFF_TOPIC"
            if highest_confidence < OFF_TOPIC_THRESHOLD_DEFAULT
            else "CLASSIFIER",
            highest_confidence,
        )
        if highest_confidence < OFF_TOPIC_THRESHOLD_DEFAULT:
            answer_id, answer, answer_media = self.__get_offtopic()
        return QuestionClassiferPredictionResult(
            answer_id, answer, answer_media, highest_confidence, feedback_id
        )
Code example #6
 def load(self):
     data = fetch_mentor_data(self.id)
     for subject in data.get("subjects", []):
         self.topics.append(subject["name"])
     for topic in data.get("topics", []):
         self.topics.append(topic["name"])
     for answer in data.get("answers", []):
         question = answer["question"]
         if answer["status"] != "COMPLETE":
             continue
         if question["type"] == "UTTERANCE":
             if question["name"] not in self.utterances_by_type:
                 self.utterances_by_type[question["name"]] = []
             self.utterances_by_type[question["name"]].append([
                 answer["_id"], answer["transcript"],
                 answer.get("media", [])
             ])
             continue
         q = {
             "id": question["_id"],
             "question_text": question["question"],
             "paraphrases": question["paraphrases"],
             "answer": answer["transcript"],
             "answer_id": answer["_id"],
             "media": answer.get("media", []),
             "topics": [],
         }
         self.answer_id_by_answer[answer["_id"]] = answer["transcript"]
         self.questions_by_id[question["_id"]] = q
     for question in data.get("questions", []):
         q = self.questions_by_id.get(question["question"]["_id"], None)
         if q is not None:
             for topic in question["topics"]:
                 self.questions_by_id[q["id"]]["topics"].append(
                     topic["name"])
             self.questions_by_text[sanitize_string(q["question_text"])] = q
             for paraphrase in q["paraphrases"]:
                 self.questions_by_text[sanitize_string(paraphrase)] = q
             self.questions_by_answer[sanitize_string(q["answer"])] = q
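
For reference, here is a minimal sketch of the payload shape load() expects from fetch_mentor_data, reconstructed purely from the field accesses above; every value is hypothetical.

    # Hypothetical payload; key names mirror the lookups made in load().
    data = {
        "subjects": [{"name": "Background"}],
        "topics": [{"name": "Advice"}],
        "answers": [
            {
                "_id": "a1",
                "status": "COMPLETE",          # anything else is skipped
                "transcript": "I teach computer science.",
                "media": [],
                "question": {
                    "_id": "q1",
                    "type": "QUESTION",        # "UTTERANCE" goes to utterances_by_type instead
                    "name": "",
                    "question": "What do you do for a living?",
                    "paraphrases": ["What is your job?"],
                },
            }
        ],
        "questions": [{"question": {"_id": "q1"}, "topics": [{"name": "Advice"}]}],
    }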
Code example #7
 def __get_prediction(
     self, embedded_question
 ) -> Tuple[str, str, List[Media], float]:
     prediction = self.model.predict([embedded_question])
     decision = self.model.decision_function([embedded_question])
     highest_confidence = max(decision[0])
     answer_text = self.mentor.answer_id_by_answer[prediction[0]]
     answer_key = sanitize_string(answer_text)
     answer_media = (
         self.mentor.questions_by_answer[answer_key].get("media", [])
         if answer_key in self.mentor.questions_by_answer
         else []
     )
     return prediction[0], answer_text, answer_media, float(highest_confidence)
Code example #8
 def get_answer(self, question, canned_question_match_disabled=False):
     if not canned_question_match_disabled:
         sanitized_question = sanitize_string(question)
         if sanitized_question in self.mentor.question_ids:
             answer_id = self.mentor.question_ids[sanitized_question]
             answer_question = self.mentor.ids_answers[answer_id]
             return answer_id, answer_question, 1.0
     preprocessor = NLTKPreprocessor()
     processed_question = preprocessor.transform(question)
     w2v_vector, lstm_vector = self.w2v_model.w2v_for_question(
         processed_question)
     padded_vector = pad_sequences(
         [lstm_vector],
         maxlen=25,
         dtype="float32",
         padding="post",
         truncating="post",
         value=0.0,
     )
     topic_vector = self.__get_topic_vector(padded_vector)
     predicted_answer = self.__get_prediction(w2v_vector, topic_vector)
     return predicted_answer
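
The pad_sequences call forces every question into a fixed 25-step sequence for the LSTM, zero-padding or truncating at the end. A minimal sketch of that behavior follows; the import path is an assumption (it varies across Keras/TensorFlow versions), and lstm_vector here is a hypothetical 3-timestep sequence of 2-dimensional word vectors.

    # Sketch of the padding step; import path and input data are assumptions.
    from tensorflow.keras.preprocessing.sequence import pad_sequences

    lstm_vector = [[0.1, 0.2], [0.3, 0.4], [0.5, 0.6]]  # 3 timesteps, 2-dim vectors
    padded = pad_sequences(
        [lstm_vector],
        maxlen=25,
        dtype="float32",
        padding="post",
        truncating="post",
        value=0.0,
    )
    print(padded.shape)  # (1, 25, 2): the sequence is zero-padded at the end to 25 steps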
Code example #9
 def load_ids_answers(self):
     classifier_data = pd.read_csv(
         self.mentor_data_path("classifier_data.csv"))
     corpus = classifier_data.fillna("")
     answer_ids = {}
     ids_answers = {}
     question_ids = {}
     ids_questions = {}
     questions_by_id = {}
     for i in range(0, len(corpus)):
         id = corpus.iloc[i]["ID"]
         answer = corpus.iloc[i]["text"]
         topics_csv = corpus.iloc[i]["topics"]
         answer = answer.replace("\u00a0", " ")
         answer_ids[answer] = id
         ids_answers[id] = answer
         questions = corpus.iloc[i]["question"].split("\n")
         for j, question in enumerate(questions):
             question_ids[sanitize_string(question)] = id
             if j == 0:
                 questions_by_id[id] = {"question_text": question}
             _add_question_to_topics(
                 id,
                 topics_csv,
                 self.topics_by_id,
                 self.topic_id_default,
                 topic_id_by_question_topic_id=self.topic_id_by_question_topic_id,
             )
         ids_questions[id] = questions
     for tid in list(self.topics_by_id.keys()):
         topic = self.topics_by_id[tid]
         if not topic["questions"]:
             del self.topics_by_id[tid]
             continue
         topic["questions"] = to_unique_sorted(topic["questions"])
     return ids_answers, answer_ids, ids_questions, question_ids, questions_by_id
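
Finally, a minimal sketch of the classifier_data.csv layout this loader assumes, inferred from the column lookups above: one row per answer, with newline-separated questions in the "question" column (only the first line per row becomes question_text; every line is mapped to the answer ID). The concrete values and the format of the "topics" column are hypothetical.

    # Hypothetical classifier_data.csv; column names mirror load_ids_answers().
    import pandas as pd

    pd.DataFrame([
        {
            "ID": "a1",
            "text": "I teach computer science.",
            "topics": "Background",  # whatever format _add_question_to_topics consumes (assumed)
            "question": "What do you do for a living?\nWhat is your job?",
        }
    ]).to_csv("classifier_data.csv", index=False)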