Beispiel #1
0
    def setUp(self):
        """Create the five-entry Q/A fixture, its DataStore, and the test file paths.

        The file paths all live next to this test module; they are stored on
        the instance so individual tests can point the model at them.
        """
        qa_pairs = (
            ("What do you like to eat?",
             "Brocolli is good to eat. My brother likes to eat good brocolli, but not my mother."),
            ("Tell me about your mother.",
             "My mother spends a lot of time driving my brother around to baseball practice."),
            ("Is driving safe?",
             "Some health experts suggest that driving may cause increased tension and blood pressure."),
            ("Does your mother love you?",
             "I often feel pressure to perform well at school, but my mother never seems to drive my brother to do better."),
            ("Want some brocolli?",
             "Health professionals say that brocolli is good for your health."),
        )
        self.data = [
            {"_question": question, "answer": answer}
            for question, answer in qa_pairs
        ]
        self.data_store = DataStore(self.data)

        # Every artifact path is <directory of this file>/<file name>.
        here = os.path.dirname(os.path.abspath(__file__))
        artifact_names = (
            ('test_md_file_path', 'test_tfidf.md'),
            ('test_dict_file_path', 'test_tfidf.dict'),
            ('test_q_simmx_file_path', 'test_tfidf_q.simmx'),
            ('test_a_simmx_file_path', 'test_tfidf_a.simmx'),
        )
        for attr_name, file_name in artifact_names:
            setattr(self, attr_name, os.path.join(here, file_name))
 def setUp(self):
     """Create a three-entry Q/A fixture with topic words, plus test file paths.

     Each entry carries the question, its topic words, the answer and the
     answer's topic words. The resulting list is wrapped in a DataStore.
     """
     # (question, question topic words, answer, answer topic words)
     rows = (
         ("tell me about a16z",
          ["a16z"],
          "a16z is a Silicon Valley-based venture capital firm with $2.7 billion under management. They invest from seed to growth.",
          ["a16z", "venture", "capital"]),
         ("who is the founder of a16z",
          ["founder", "a16z"],
          "Marc Andreessen and Ben Horowitz co-founded a16z.",
          ["co-founded", "a16z"]),
         ("who is Steven Sinofsky",
          ["Steven", "Sinofsky"],
          "Steven Sinofsky is a board partner at a16z.",
          ["Steven", "Sinofsky", "board", "partner", "a16z"]),
     )
     self.data = [
         {
             "_question": question,
             "_question_topic_words": question_words,
             "answer": answer,
             "answer_topic_words": answer_words,
         }
         for question, question_words, answer, answer_words in rows
     ]
     self.data_store = DataStore(self.data)

     # Artifacts are placed next to this test module.
     here = os.path.dirname(os.path.abspath(__file__))
     self.dir_path = here
     self.test_simmx_file_path = os.path.join(here, 'test_topic_words.simmx')
     self.test_dict_file_path = os.path.join(here, 'test_topic_words.dict')
Beispiel #3
0
    def test_add_doc(self):
        """Check ids, the 'added' flag and bookkeeping when adding documents.

        Covers three cases: a first document, the same document again, and a
        second distinct document.
        """
        store = DataStore({})
        first_doc = "a"
        second_doc = "b"

        # A new document gets id 0 and is recorded in both structures.
        doc_id, was_added = store._add_doc(first_doc)
        self.assertEqual(doc_id, 0)
        self.assertTrue(was_added)
        self.assertEqual(store.doc_set, [first_doc])
        self.assertEqual(store.doc_to_id, {first_doc: 0})

        # Adding the same document again returns the existing id, not added.
        doc_id, was_added = store._add_doc(first_doc)
        self.assertEqual(doc_id, 0)
        self.assertFalse(was_added)

        # A different document gets the next id.
        doc_id, was_added = store._add_doc(second_doc)
        self.assertEqual(doc_id, 1)
        self.assertTrue(was_added)
        self.assertEqual(store.doc_set, [first_doc, second_doc])
        self.assertEqual(store.doc_to_id, {first_doc: 0, second_doc: 1})
Beispiel #4
0
    def setUpClass(cls):
        """Build a lazily-loaded composite query engine with a scripted judge.

        NOTE(review): no ``@classmethod`` decorator is visible above this
        definition -- confirm it exists in the enclosing class.
        """

        class MockJudge(object):
            """Judge stand-in that replays a fixed sequence of predictions."""

            def __init__(self):
                scripted = [[0.8, 0.1, 0.1], [0.3, 0.5, 0.2], [0.3, 0.1, 0.6]]
                self.predict_iter = iter(scripted)

            def predict_proba(self, input):
                # The argument is ignored; each call yields the next scripted row.
                return next(self.predict_iter)

        cls.data_store = DataStore({})
        cls.qe = CompositeQueryEngine(cls.data_store, eager_loading=False)

        components = [
            TfIdfQueryEngineComponent(cls.data_store, eager_loading=False),
            RankBasedQueryEngineComponent(cls.data_store, eager_loading=False),
        ]
        judge = MockJudge()
        cls.qe.set_models(components, judge)
Beispiel #5
0
 def setUpClass(cls):
     """Build a five-entry Q/A fixture with contexts and a rank-based engine.

     Each entry has a question, an answer and a single-item context list.
     The engine is created with ``eager_loading=False`` and then handed
     freshly constructed model objects via ``set_models``.
     """
     brocolli_answer = "Brocolli is good to eat. My brother likes to eat good brocolli, but not my mother."
     # (question, answer, context key, context value)
     rows = (
         ("What do you like to eat?",
          brocolli_answer,
          "Object", "Brocolli"),
         ("Tell me about your mother.",
          "My mother spends a lot of time driving my brother around to baseball practice.",
          "Person", "Mother"),
         ("Is driving safe?",
          "Some health experts suggest that driving may cause increased tension and blood pressure.",
          "Concept", "Driving"),
         ("Does your mother love you?",
          "I often feel pressure to perform well at school, but my mother never seems to drive my brother to do better.",
          "Person", "Mother"),
         ("Want some brocolli?",
          brocolli_answer,
          "Object", "Brocolli"),
     )
     cls.data = [
         {
             "_question": question,
             "answer": answer,
             "context": [{context_key: context_value}],
         }
         for question, answer, context_key, context_value in rows
     ]
     cls.data_store = DataStore(cls.data)

     engine = RankBasedQueryEngineComponent(cls.data_store, eager_loading=False)
     cls.qe = engine

     # Models are constructed in the same order they are passed on.
     tfidf_model = TfIdfModelStruct()
     lda_model = LdaModelStruct()
     topic_word_model = TopicWordLookupModelStruct(None, None)
     word2vec_model = Word2VecModel(eager_loading=False)
     rank_model = LinearSVMRankModel()
     engine.set_models(tfidf_model, lda_model, topic_word_model,
                       word2vec_model, rank_model)
Beispiel #6
0
 def test_getters(self):
     """Check the question/answer getters and the position-to-doc-id lookups."""
     store = DataStore(self.data)

     expected_questions = [
         "tell me about a16z",
         "who is the founder of a16z",
         "Where is a16z?",
         "Where is the headquarter of 500px?",
         "who co-founded of a16z",
         "who is Steven Sinofsky",
     ]
     self.assertEqual(store.get_all_questions(), expected_questions)

     expected_answers = [
         "a16z is a Silicon Valley-based venture capital firm with $2.7 billion under management. They invest from seed to growth.",
         "Marc Andreessen and Ben Horowitz co-founded a16z.",
         "a16z is located in Menlo Park",
         "500px headquarter in san francisco",
         "Steven Sinofsky is a board partner at a16z.",
     ]
     self.assertEqual(store.get_all_answers(), expected_answers)

     # Position 0 in each list maps back to a document id.
     first_question_doc_id = store.get_doc_id_from_question_pos(0)
     self.assertEqual(first_question_doc_id, 0)
     first_answer_doc_id = store.get_doc_id_from_answer_pos(0)
     self.assertEqual(first_answer_doc_id, 1)
Beispiel #7
0
    def test_init_without_rank_training_data(self):
        """Check DataStore contents when built with load_rank_training_data=False.

        Only the two primary Q/A pairs are expected in the store, and
        ``rank_data`` is expected to be empty.
        """
        store = DataStore(self.data, load_rank_training_data=False)

        q_about = "tell me about a16z"
        a_about = "a16z is a Silicon Valley-based venture capital firm with $2.7 billion under management. They invest from seed to growth."
        q_founder = "who is the founder of a16z"
        a_founder = "Marc Andreessen and Ben Horowitz co-founded a16z."

        # Documents are expected in insertion order: question, answer, ...
        expected_doc_set = [q_about, a_about, q_founder, a_founder]
        self.assertEqual(store.doc_set, expected_doc_set)

        expected_doc_to_id = {
            doc: position for position, doc in enumerate(expected_doc_set)
        }
        self.assertEqual(store.doc_to_id, expected_doc_to_id)

        self.assertEqual(store.question_set, [0, 2])
        self.assertEqual([1, 3], store.answer_set)

        self.assertEqual(store.qid_to_qa_pair, {0: (0, 1), 2: (2, 3)})
        self.assertEqual(store.aid_to_qa_pairs, {1: [(0, 1)], 3: [(2, 3)]})

        expected_topic_word_docs = [
            (q_about, ["a16z"]),
            (a_about, ["a16z", "venture", "capital"]),
            (q_founder, ["founder", "a16z"]),
            (a_founder, ["co-founded", "a16z"]),
        ]
        self.assertEqual(store.topic_word_docs, expected_topic_word_docs)
        self.assertEqual(store.rank_data, {})

        expected_contexts = {
            0: [{'company': 'a16z'}],
            2: [{'Investor': 'Alex Rampell'}],
        }
        self.assertEqual(store.qa_context, expected_contexts)
Beispiel #8
0
def split_raw_data_k_fold(raw_data, num_folds):
    """Split ``raw_data`` into ``num_folds`` shuffled train/test folds.

    Args:
        raw_data: list of raw data pieces. It is not modified; shuffling is
            done on a copy.
        num_folds: number of folds handed to ``KFold``.

    Returns:
        A list with one tuple per fold:
        ``(training DataStore, TestDataSet over the training data,
        TestDataSet over the held-out test data)``. The DataStore is built
        with ``load_rank_training_data=False``.
    """
    assert isinstance(raw_data, list)
    # Since our generated raw_data has an inherent order, we must shuffle it
    # before k-folds. Shuffle a copy so the caller's list keeps its order.
    shuffled = list(raw_data)
    random.shuffle(shuffled)

    train_test_data_tuples = []
    kf = KFold(len(shuffled), n_folds=num_folds)
    for train_index, test_index in kf:
        train_data = list(sample_by_index_generator(train_index, shuffled))
        test_data = list(sample_by_index_generator(test_index, shuffled))

        train_test_data_tuples.append(
            (DataStore(train_data, load_rank_training_data=False),
             TestDataSet(train_data),
             TestDataSet(test_data))
        )

    return train_test_data_tuples
Beispiel #9
0
class TfIdfModelTestCase(unittest.TestCase):
    """Tests for ``TfIdfModelStruct`` against a small five-entry Q/A fixture.

    Each test patches the model's path getters so every artifact is written
    next to this test module; ``tearDown`` removes those files again.
    """

    def setUp(self):
        """Create the Q/A fixture, its DataStore, and the test artifact paths."""
        self.data = [
            {
                "_question": "What do you like to eat?",
                "answer": "Brocolli is good to eat. My brother likes to eat good brocolli, but not my mother.",
            },
            {
                "_question": "Tell me about your mother.",
                "answer": "My mother spends a lot of time driving my brother around to baseball practice.",
            },
            {
                "_question": "Is driving safe?",
                "answer": "Some health experts suggest that driving may cause increased tension and blood pressure.",
            },
            {
                "_question": "Does your mother love you?",
                "answer": "I often feel pressure to perform well at school, but my mother never seems to drive my brother to do better.",
            },
            {
                "_question": "Want some brocolli?",
                "answer": "Health professionals say that brocolli is good for your health.",
            }
        ]
        self.data_store = DataStore(self.data)

        dir_path = os.path.dirname(os.path.abspath(__file__))
        self.test_md_file_path = os.path.join(dir_path, 'test_tfidf.md')
        self.test_dict_file_path = os.path.join(dir_path, 'test_tfidf.dict')
        self.test_q_simmx_file_path = os.path.join(dir_path, 'test_tfidf_q.simmx')
        self.test_a_simmx_file_path = os.path.join(dir_path, 'test_tfidf_a.simmx')

    def tearDown(self):
        """Remove whichever test artifacts exist.

        Each file is checked on its own, so a test that failed after writing
        only some of the files neither leaks the rest nor raises here.
        """
        artifact_paths = (
            self.test_dict_file_path,
            self.test_md_file_path,
            self.test_q_simmx_file_path,
            self.test_a_simmx_file_path,
        )
        for path in artifact_paths:
            if os.path.isfile(path):
                os.remove(path)

    def _point_mocks_at_test_files(self, mock_q_simmx_file_path, mock_a_simmx_file_path,
                                   mock_dict_file_path, mock_md_file_path):
        """Make the patched path getters return this test's artifact paths."""
        mock_md_file_path.return_value = self.test_md_file_path
        mock_dict_file_path.return_value = self.test_dict_file_path
        mock_q_simmx_file_path.return_value = self.test_q_simmx_file_path
        mock_a_simmx_file_path.return_value = self.test_a_simmx_file_path

    # NOTE: stacked @patch decorators are applied bottom-up, so the mock for
    # get_q_simmx_path is the first mock argument and get_md_path the last.
    @patch('ir_query_engine.match_models.tfidf_model.get_md_path')
    @patch('ir_query_engine.match_models.tfidf_model.get_dict_path')
    @patch('ir_query_engine.match_models.tfidf_model.get_a_simmx_path')
    @patch('ir_query_engine.match_models.tfidf_model.get_q_simmx_path')
    def test_get_model(self, mock_q_simmx_file_path, mock_a_simmx_file_path, mock_dict_file_path, mock_md_file_path):
        """get_model saves its files, reloads an equal model, and can regenerate."""
        self._point_mocks_at_test_files(mock_q_simmx_file_path, mock_a_simmx_file_path,
                                        mock_dict_file_path, mock_md_file_path)

        new_model_struct = TfIdfModelStruct.get_model(data_store=self.data_store)
        # assert file saves
        self.assertTrue(os.path.isfile(self.test_md_file_path))
        self.assertTrue(os.path.isfile(self.test_dict_file_path))
        self.assertTrue(os.path.isfile(self.test_q_simmx_file_path))
        self.assertTrue(os.path.isfile(self.test_a_simmx_file_path))

        loaded_model_struct = TfIdfModelStruct.get_model()
        # the reloaded model must match the freshly built one
        self.assertEqual(new_model_struct.dictionary,
                         loaded_model_struct.dictionary)

        bow = [(0, 1), (4, 1)]
        self.assertEqual(new_model_struct.model[bow],
                         loaded_model_struct.model[bow])
        vec = new_model_struct.model[bow]
        self.assertEqual(new_model_struct.answer_sim_matrix[vec][0],
                         loaded_model_struct.answer_sim_matrix[vec][0])
        self.assertEqual(new_model_struct.question_sim_matrix[vec][0],
                         loaded_model_struct.question_sim_matrix[vec][0])

        # regen: after changing a document, regen=True must yield a different dictionary
        self.data_store.doc_set[0] = "Yo man"
        regen_model_struct = TfIdfModelStruct.get_model(data_store=self.data_store, regen=True)
        self.assertNotEqual(new_model_struct.dictionary,
                            regen_model_struct.dictionary)

    @patch('ir_query_engine.match_models.tfidf_model.get_md_path')
    @patch('ir_query_engine.match_models.tfidf_model.get_dict_path')
    @patch('ir_query_engine.match_models.tfidf_model.get_a_simmx_path')
    @patch('ir_query_engine.match_models.tfidf_model.get_q_simmx_path')
    def test_query(self, mock_q_simmx_file_path, mock_a_simmx_file_path, mock_dict_file_path, mock_md_file_path):
        """Querying answers and questions returns the expected top document."""
        self._point_mocks_at_test_files(mock_q_simmx_file_path, mock_a_simmx_file_path,
                                        mock_dict_file_path, mock_md_file_path)

        model_struct = TfIdfModelStruct.get_model(data_store=self.data_store)

        query_doc = "Is brocolli tasty to eat?"
        results = model_struct.query_answers(raw_doc=query_doc)
        # The ids in the results tuple are the positions in the answer corpus,
        # i.e. the index in the answer_set, translate back to doc_id
        results = self.data_store.translate_answer_query_results(results)

        # the most similar one is the first answer
        self.assertEqual(results[0][0], 1)

        results = model_struct.query_questions(raw_doc=query_doc)
        # Here the ids are presumably positions in the question corpus
        # (index in the question_set); translate back to doc_id
        results = self.data_store.translate_question_query_results(results)

        self.assertEqual(results[0][0], 0)

    @patch('ir_query_engine.match_models.tfidf_model.get_md_path')
    @patch('ir_query_engine.match_models.tfidf_model.get_dict_path')
    @patch('ir_query_engine.match_models.tfidf_model.get_a_simmx_path')
    @patch('ir_query_engine.match_models.tfidf_model.get_q_simmx_path')
    def test_td_idf(self, mock_q_simmx_file_path, mock_a_simmx_file_path, mock_dict_file_path, mock_md_file_path):
        """Check tf vectors, idf vectors, single-term tf/idf and co-occurrence features."""
        self._point_mocks_at_test_files(mock_q_simmx_file_path, mock_a_simmx_file_path,
                                        mock_dict_file_path, mock_md_file_path)

        model_struct = TfIdfModelStruct.get_model(data_store=self.data_store)

        query_doc = "Is brocolli tasty to eat?"
        lowered_doc = query_doc.lower()
        tf_vec = model_struct.get_tf_vec(query_doc)
        for termid, count in tf_vec:
            # every term of the query occurs exactly once
            self.assertEqual(count, 1)
            term = model_struct.dictionary.get(termid)
            self.assertIn(term, lowered_doc)

        idf_vec = model_struct.get_idf_vec(tf_vec=tf_vec)
        doc_count = len(self.data_store.doc_set)
        for termid, idf in idf_vec:
            term = model_struct.dictionary.get(termid)
            # document frequency by substring match over the lowered documents
            df = sum(1 for doc in self.data_store.doc_set if term in doc.lower())
            self.assertAlmostEqual(df2idf(df, doc_count), idf)

        tf, idf = model_struct.get_tf_and_idf(query_doc, 'to')
        self.assertEqual(tf, 1)
        self.assertAlmostEqual(idf, df2idf(4, doc_count))

        # a term the test expects to be unknown: tf 0 and an idf of 10.0
        tf, idf = model_struct.get_tf_and_idf(query_doc, 'AAA')
        self.assertEqual(tf, 0)
        self.assertEqual(idf, 10.0)

        # test co-occurrence features
        doc2 = self.data_store.doc_set[1]
        coocur_size, coocur_rate, cooccur_sum_idf, cooccur_avg_idf = \
            model_struct.get_cooccur_features(query_doc, doc2)

        self.assertEqual(coocur_size, 4)
        self.assertAlmostEqual(coocur_rate, float(4) / len(model_struct.get_tf_vec(doc2)))
        expected_sum = 0
        for t in model_struct.get_idf_vec(raw_doc=query_doc):
            expected_sum += t[1]

        self.assertAlmostEqual(cooccur_sum_idf, expected_sum)
        self.assertAlmostEqual(cooccur_avg_idf, expected_sum / 4)

    @patch('ir_query_engine.match_models.tfidf_model.get_md_path')
    @patch('ir_query_engine.match_models.tfidf_model.get_dict_path')
    @patch('ir_query_engine.match_models.tfidf_model.get_a_simmx_path')
    @patch('ir_query_engine.match_models.tfidf_model.get_q_simmx_path')
    def test_get_similarities(self, mock_q_simmx_file_path, mock_a_simmx_file_path, mock_dict_file_path, mock_md_file_path):
        """get_similarities matches cossim over the tf-idf vectors of each pair."""
        self._point_mocks_at_test_files(mock_q_simmx_file_path, mock_a_simmx_file_path,
                                        mock_dict_file_path, mock_md_file_path)

        model_struct = TfIdfModelStruct.get_model(data_store=self.data_store)

        query_doc = "Is brocolli tasty to eat?"
        compare_docs = self.data_store.doc_set

        sims = model_struct.get_similarities(query_doc, compare_docs)

        for idx, sim in enumerate(sims):
            expected_sim = cossim(
                model_struct.get_tfidf_vec(query_doc),
                model_struct.get_tfidf_vec(compare_docs[idx])
            )
            self.assertAlmostEqual(sim[1], expected_sim)
Beispiel #10
0
    def test_init_with_rank_training_data(self):
        """Check DataStore contents when rank training data is loaded (the default).

        Besides the primary Q/A pairs, the documents referenced by the rank
        training data are expected in the store, and ``rank_data`` is
        expected to map each ranked question to its scored Q/A pairs.
        """
        store = DataStore(self.data)

        q_about = "tell me about a16z"
        a_about = "a16z is a Silicon Valley-based venture capital firm with $2.7 billion under management. They invest from seed to growth."
        q_founder = "who is the founder of a16z"
        a_founder = "Marc Andreessen and Ben Horowitz co-founded a16z."
        q_where = "Where is a16z?"
        a_where = "a16z is located in Menlo Park"
        q_500px = "Where is the headquarter of 500px?"
        a_500px = "500px headquarter in san francisco"
        q_cofounded = "who co-founded of a16z"
        q_sinofsky = "who is Steven Sinofsky"
        a_sinofsky = "Steven Sinofsky is a board partner at a16z."

        expected_doc_set = [
            q_about, a_about,
            q_founder, a_founder,
            q_where, a_where,
            q_500px, a_500px,
            q_cofounded,
            q_sinofsky, a_sinofsky,
        ]
        self.assertEqual(store.doc_set, expected_doc_set)

        expected_doc_to_id = {
            doc: position for position, doc in enumerate(expected_doc_set)
        }
        self.assertEqual(store.doc_to_id, expected_doc_to_id)

        self.assertEqual(store.question_set, [0, 2, 4, 6, 8, 9])
        self.assertEqual([1, 3, 5, 7, 10], store.answer_set)

        # Question 8 is expected to share answer 3 with question 2.
        expected_qid_to_qa_pair = {
            0: (0, 1),
            2: (2, 3),
            4: (4, 5),
            6: (6, 7),
            8: (8, 3),
            9: (9, 10),
        }
        self.assertEqual(store.qid_to_qa_pair, expected_qid_to_qa_pair)

        expected_aid_to_qa_pairs = {
            1: [(0, 1)],
            3: [(2, 3), (8, 3)],
            5: [(4, 5)],
            7: [(6, 7)],
            10: [(9, 10)],
        }
        self.assertEqual(store.aid_to_qa_pairs, expected_aid_to_qa_pairs)

        expected_topic_word_docs = [
            (q_about, ["a16z"]),
            (a_about, ["a16z", "venture", "capital"]),
            (q_founder, ["founder", "a16z"]),
            (a_founder, ["co-founded", "a16z"]),
        ]
        self.assertEqual(store.topic_word_docs, expected_topic_word_docs)

        # Each entry is ((question, answer), score).
        expected_rank_data = {
            q_about: [
                ((q_about, a_about), 100),
                ((q_founder, a_founder), 50),
                ((q_where, a_where), 10),
                ((q_500px, a_500px), 0),
            ],
            q_founder: [
                ((q_founder, a_founder), 100),
                ((q_cofounded, a_founder), 100),
                ((q_sinofsky, a_sinofsky), 40),
                ((q_where, a_where), 5),
            ],
        }
        self.assertEqual(store.rank_data, expected_rank_data)

        expected_contexts = {
            0: [{'company': 'a16z'}],
            2: [{'Investor': 'Alex Rampell'}],
        }
        self.assertEqual(store.qa_context, expected_contexts)