Beispiel #1
0
    def test_collect_training_data(self):
        """Collect training data in batch and point by point and compare sizes.

        Integration test: it talks to the application served at
        ``https://api.cord19.vespa.ai``.

        Two labelled queries with two relevant documents each are used, and
        two additional documents are requested per relevant document.  Both
        the batch API (``collect_training_data``) and the per-point API
        (``collect_training_data_point``) are expected to produce 12 rows and
        more than 3 columns.
        """
        app = Vespa(url="https://api.cord19.vespa.ai")
        query_model = QueryModel(
            match_phase=OR(),
            rank_profile=Ranking(name="bm25", list_features=True),
        )
        labeled_data = [
            {
                "query_id": 0,
                "query": "Intrauterine virus infections and congenital heart disease",
                "relevant_docs": [
                    {"id": 0, "score": 1},
                    {"id": 3, "score": 1},
                ],
            },
            {
                "query_id": 1,
                "query": "Clinical and immunologic studies in identical twins discordant for systemic lupus erythematosus",
                "relevant_docs": [
                    {"id": 1, "score": 1},
                    {"id": 5, "score": 1},
                ],
            },
        ]

        # Batch collection: one call covering every labelled query.
        training_data_batch = app.collect_training_data(
            labeled_data=labeled_data,
            id_field="id",
            query_model=query_model,
            number_additional_docs=2,
            fields=["rankfeatures"],
        )
        self.assertEqual(training_data_batch.shape[0], 12)
        # At least one rank feature column is expected in addition to
        # document_id, query_id and label.
        self.assertGreater(training_data_batch.shape[1], 3)

        # Point-wise collection: one call per (query, relevant document) pair.
        training_records = []
        for query_data in labeled_data:
            for doc_data in query_data["relevant_docs"]:
                training_records.extend(
                    app.collect_training_data_point(
                        query=query_data["query"],
                        query_id=query_data["query_id"],
                        relevant_id=doc_data["id"],
                        id_field="id",
                        query_model=query_model,
                        number_additional_docs=2,
                        fields=["rankfeatures"],
                    )
                )
        training_data = DataFrame.from_records(training_records)

        self.assertEqual(training_data.shape[0], 12)
        # At least one rank feature column is expected in addition to
        # document_id, query_id and label.
        self.assertGreater(training_data.shape[1], 3)
Beispiel #2
0
class TestVespaCollectData(unittest.TestCase):
    """Tests for ``collect_training_data_point`` and ``collect_training_data``.

    Where results are inspected, ``app.query`` or
    ``app.collect_training_data_point`` is replaced by a ``Mock`` that hands
    back the canned payloads prepared in ``setUp``.
    """

    @staticmethod
    def _make_hit(doc_id, relevance, id_value, body_text, title, rank_features):
        """Build one entry for the ``children`` list of a raw result."""
        return {
            "id": doc_id,
            "relevance": relevance,
            "source": "content",
            "fields": {
                "vespa_id_field": id_value,
                "sddocname": "doc",
                "body_text": body_text,
                "title": title,
                "rankfeatures": rank_features,
            },
        }

    @staticmethod
    def _make_result(total_count, children=None):
        """Build a raw result payload; ``children`` is omitted when ``None``."""
        root = {
            "id": "toplevel",
            "relevance": 1.0,
            "fields": {"totalCount": total_count},
            "coverage": {
                "coverage": 100,
                "documents": 62529,
                "full": True,
                "nodes": 2,
                "results": 1,
                "resultsFull": 1,
            },
        }
        if children is not None:
            root["children"] = children
        return {"root": root}

    def setUp(self) -> None:
        """Create the app under test and the two canned raw results."""
        self.app = Vespa(url="http://localhost", port=8080)

        # Payload returned for the recall query: a single hit with id "abc".
        self.raw_vespa_result_recall = self._make_result(
            1083,
            [
                self._make_hit(
                    "id:covid-19:doc::40215",
                    30.368213170494712,
                    "abc",
                    "this is a body",
                    "this is a title",
                    {"a": 1, "b": 2},
                ),
            ],
        )

        # Payload returned for the additional-documents query: two hits.
        self.raw_vespa_result_additional = self._make_result(
            1083,
            [
                self._make_hit(
                    "id:covid-19:doc::40216",
                    10,
                    "def",
                    "this is a body 2",
                    "this is a title 2",
                    {"a": 3, "b": 4},
                ),
                self._make_hit(
                    "id:covid-19:doc::40217",
                    8,
                    "ghi",
                    "this is a body 3",
                    "this is a title 3",
                    {"a": 5, "b": 6},
                ),
            ],
        )

    def test_disable_rank_features(self):
        """A default ``Query()`` model is expected to raise ``AssertionError``."""
        point_kwargs = {
            "query": "this is a query",
            "query_id": "123",
            "relevant_id": "abc",
            "id_field": "vespa_id_field",
            "query_model": Query(),
            "number_additional_docs": 2,
        }
        with self.assertRaises(AssertionError):
            self.app.collect_training_data_point(**point_kwargs)

    def test_collect_training_data_point(self):
        """One recall query plus one additional query yield three rows."""
        canned_responses = [
            VespaResult(self.raw_vespa_result_recall),
            VespaResult(self.raw_vespa_result_additional),
        ]
        self.app.query = Mock(side_effect=canned_responses)
        query_model = Query(rank_profile=RankProfile(list_features=True))

        data = self.app.collect_training_data_point(
            query="this is a query",
            query_id="123",
            relevant_id="abc",
            id_field="vespa_id_field",
            query_model=query_model,
            number_additional_docs=2,
            timeout="15s",
        )

        # First the relevant document is recalled, then extra hits are fetched.
        recall_call = call(
            query="this is a query",
            query_model=query_model,
            recall=("vespa_id_field", ["abc"]),
            timeout="15s",
        )
        additional_call = call(
            query="this is a query",
            query_model=query_model,
            hits=2,
            timeout="15s",
        )
        self.assertEqual(self.app.query.call_count, 2)
        self.app.query.assert_has_calls([recall_call, additional_call])

        expected_data = [
            {"document_id": "abc", "query_id": "123", "relevant": 1, "a": 1, "b": 2},
            {"document_id": "def", "query_id": "123", "relevant": 0, "a": 3, "b": 4},
            {"document_id": "ghi", "query_id": "123", "relevant": 0, "a": 5, "b": 6},
        ]
        self.assertEqual(data, expected_data)

    def test_collect_training_data_point_0_recall_hits(self):
        """With no recall hit, only one query is issued and no rows come back."""
        # Recall payload without a "children" entry and a total count of zero.
        self.raw_vespa_result_recall = self._make_result(0)
        canned_responses = [
            VespaResult(self.raw_vespa_result_recall),
            VespaResult(self.raw_vespa_result_additional),
        ]
        self.app.query = Mock(side_effect=canned_responses)
        query_model = Query(rank_profile=RankProfile(list_features=True))

        data = self.app.collect_training_data_point(
            query="this is a query",
            query_id="123",
            relevant_id="abc",
            id_field="vespa_id_field",
            query_model=query_model,
            number_additional_docs=2,
            timeout="15s",
        )

        recall_call = call(
            query="this is a query",
            query_model=query_model,
            recall=("vespa_id_field", ["abc"]),
            timeout="15s",
        )
        self.assertEqual(self.app.query.call_count, 1)
        self.app.query.assert_has_calls([recall_call])
        self.assertEqual(data, [])

    def test_collect_training_data(self):
        """The batch API delegates to the per-point API and returns a frame."""
        canned_rows = [
            {"document_id": "abc", "query_id": "123", "relevant": 1, "a": 1, "b": 2},
            {"document_id": "def", "query_id": "123", "relevant": 0, "a": 3, "b": 4},
            {"document_id": "ghi", "query_id": "123", "relevant": 0, "a": 5, "b": 6},
        ]
        self.app.collect_training_data_point = Mock(return_value=canned_rows)
        query_model = Query(rank_profile=RankProfile(list_features=True))
        labelled_data = [
            {
                "query_id": 123,
                "query": "this is a query",
                "relevant_docs": [{"id": "abc", "score": 1}],
            }
        ]

        data = self.app.collect_training_data(
            labelled_data=labelled_data,
            id_field="vespa_id_field",
            query_model=query_model,
            number_additional_docs=2,
            timeout="15s",
        )

        expected_point_call = call(
            query="this is a query",
            query_id=123,
            relevant_id="abc",
            id_field="vespa_id_field",
            query_model=query_model,
            number_additional_docs=2,
            relevant_score=1,
            default_score=0,
            timeout="15s",
        )
        self.app.collect_training_data_point.assert_has_calls([expected_point_call])
        assert_frame_equal(data, DataFrame.from_records(canned_rows))
Beispiel #3
0
    def test_workflow(self):
        """Run a query / collect-training-data / evaluate workflow end to end.

        Integration test: it talks to the application served at
        ``https://api.cord19.vespa.ai``.  The steps are:

        1. define a query model combining ``WeakAnd`` and ``ANN`` matching
           with the ``bm25`` rank profile,
        2. run a single query and check the returned hits,
        3. collect training data from labelled queries,
        4. evaluate one or several query models, with labelled data given
           either as a list of dicts or as a ``DataFrame``.
        """
        # Connect to a running Vespa application.
        app = Vespa(url="https://api.cord19.vespa.ai")

        # Define a query model.
        match_phase = Union(
            WeakAnd(hits=10),
            ANN(
                doc_vector="title_embedding",
                query_vector="title_vector",
                hits=10,
                label="title",
            ),
        )
        rank_profile = Ranking(name="bm25", list_features=True)
        query_model = QueryModel(
            name="ANN_bm25",
            query_properties=[
                QueryRankingFeature(
                    name="title_vector",
                    # The argument is ignored: a random 768-element vector
                    # is produced on every call.
                    mapping=lambda x: [random() for _ in range(768)],
                )
            ],
            match_phase=match_phase,
            rank_profile=rank_profile,
        )

        # Query the Vespa app.
        query_result = app.query(
            query="Is remdesivir an effective treatment for COVID-19?",
            query_model=query_model,
        )
        self.assertGreater(query_result.number_documents_retrieved, 0)
        self.assertEqual(len(query_result.hits), 10)

        # Define labelled data.
        labeled_data = [
            {
                "query_id": 0,
                "query": "Intrauterine virus infections and congenital heart disease",
                "relevant_docs": [
                    {"id": 0, "score": 1},
                    {"id": 3, "score": 1},
                ],
            },
            {
                "query_id": 1,
                "query": "Clinical and immunologic studies in identical twins discordant for systemic lupus erythematosus",
                "relevant_docs": [
                    {"id": 1, "score": 1},
                    {"id": 5, "score": 1},
                ],
            },
        ]
        # Equivalent data in DataFrame format.
        labeled_data_df = DataFrame(
            data={
                "qid": [0, 0, 1, 1],
                "query": (
                    ["Intrauterine virus infections and congenital heart disease"] * 2
                    + [
                        "Clinical and immunologic studies in identical twins discordant for systemic lupus erythematosus"
                    ]
                    * 2
                ),
                "doc_id": [0, 3, 1, 5],
                "relevance": [1, 1, 1, 1],
            }
        )

        # Collect training data.
        training_data_batch = app.collect_training_data(
            labeled_data=labeled_data,
            id_field="id",
            query_model=query_model,
            number_additional_docs=2,
            fields=["rankfeatures"],
        )
        self.assertGreater(training_data_batch.shape[0], 0)
        # All three bookkeeping columns must be present; on failure the
        # message lists the ones that are missing.
        required_columns = {"document_id", "query_id", "label"}
        self.assertEqual(
            required_columns - set(training_data_batch.columns),
            set(),
        )

        # Evaluate a single query model.
        eval_metrics = [MatchRatio(), Recall(at=10), ReciprocalRank(at=10)]
        evaluation = app.evaluate(
            labeled_data=labeled_data,
            eval_metrics=eval_metrics,
            query_model=query_model,
            id_field="id",
        )
        self.assertEqual(evaluation.shape, (9, 1))

        # AssertionError - two models with the same name.
        with self.assertRaises(AssertionError):
            app.evaluate(
                labeled_data=labeled_data,
                eval_metrics=eval_metrics,
                query_model=[QueryModel(), QueryModel(), query_model],
                id_field="id",
            )

        # Evaluate two distinct query models at once.
        evaluation = app.evaluate(
            labeled_data=labeled_data,
            eval_metrics=eval_metrics,
            query_model=[QueryModel(), query_model],
            id_field="id",
        )
        self.assertEqual(evaluation.shape, (9, 2))

        # Labelled data supplied as a DataFrame, with detailed metrics.
        evaluation = app.evaluate(
            labeled_data=labeled_data_df,
            eval_metrics=eval_metrics,
            query_model=query_model,
            id_field="id",
            detailed_metrics=True,
        )
        self.assertEqual(evaluation.shape, (15, 1))

        # Same as above, but reported per query.
        evaluation = app.evaluate(
            labeled_data=labeled_data_df,
            eval_metrics=eval_metrics,
            query_model=query_model,
            id_field="id",
            detailed_metrics=True,
            per_query=True,
        )
        self.assertEqual(evaluation.shape, (2, 7))