Ejemplo n.º 1
0
 def test_weak_and(self):
     """Check the match filter string and query properties built by ``WeakAnd``."""
     weak_and = WeakAnd(hits=10, field="field_name")

     # One ``contains`` clause is expected per query term. The expected
     # output includes a clause for an empty term -- presumably because
     # ``self.query`` holds two consecutive spaces (confirm in the fixture).
     expected_filter = (
         '([{"targetNumHits": 10}]weakAnd(field_name contains "this", field_name contains "is", field_name contains "", '
         'field_name contains "a", field_name contains "test"))'
     )
     rendered_filter = weak_and.create_match_filter(query=self.query)
     self.assertEqual(rendered_filter, expected_filter)

     # No additional query properties are expected from this operator.
     extra_properties = weak_and.get_query_properties(query=self.query)
     self.assertDictEqual(extra_properties, {})
Ejemplo n.º 2
0
 def test_union(self):
     """Check that ``Union`` joins the ``WeakAnd`` and ``ANN`` filters with ``or``.

     No embedding model is passed to ``ANN`` here; the expected
     nearest-neighbor clause carries ``"approximate": true`` and the
     expected query properties are empty.
     """
     lexical_filter = WeakAnd(hits=10, field="field_name")
     vector_filter = ANN(
         doc_vector="doc_vector",
         query_vector="query_vector",
         hits=10,
         label="label",
     )
     combined = Union(lexical_filter, vector_filter)

     # Expected rendering: the weakAnd group, then " or ", then the
     # nearestNeighbor group.
     expected_filter = (
         '([{"targetNumHits": 10}]weakAnd(field_name contains "this", field_name contains "is", '
         'field_name contains "", '
         'field_name contains "a", field_name contains "test")) or '
         '([{"targetNumHits": 10, "label": "label", "approximate": true}]nearestNeighbor(doc_vector, query_vector))'
     )
     rendered_filter = combined.create_match_filter(query=self.query)
     self.assertEqual(rendered_filter, expected_filter)

     extra_properties = combined.get_query_properties(query=self.query)
     self.assertDictEqual(extra_properties, {})
Ejemplo n.º 3
0
 def test_union(self):
     """Check ``Union`` of ``WeakAnd`` and an ``ANN`` that has an embedding model.

     The stub embedding model always returns ``[1, 2, 3]``; the expected
     query properties expose that vector, as a string, under the
     ``ranking.features.query(query_vector)`` key.
     """
     lexical_filter = WeakAnd(hits=10, field="field_name")
     vector_filter = ANN(
         doc_vector="doc_vector",
         query_vector="query_vector",
         embedding_model=lambda x: [1, 2, 3],
         hits=10,
         label="label",
     )
     combined = Union(lexical_filter, vector_filter)

     # Expected rendering: the weakAnd group, then " or ", then the
     # nearestNeighbor group.
     expected_filter = (
         '([{"targetNumHits": 10}]weakAnd(field_name contains "this", field_name contains "is", '
         'field_name contains "", '
         'field_name contains "a", field_name contains "test")) or '
         '([{"targetNumHits": 10, "label": "label"}]nearestNeighbor(doc_vector, query_vector))'
     )
     rendered_filter = combined.create_match_filter(query=self.query)
     self.assertEqual(rendered_filter, expected_filter)

     expected_properties = {"ranking.features.query(query_vector)": "[1, 2, 3]"}
     actual_properties = combined.get_query_properties(query=self.query)
     self.assertDictEqual(actual_properties, expected_properties)
Ejemplo n.º 4
0
    def test_workflow(self):
        """Run a query / collect-training-data / evaluate workflow end to end.

        The test builds a ``Vespa`` client for ``https://api.cord19.vespa.ai``,
        defines a query model combining ``WeakAnd`` and ``ANN`` matching with
        a ``bm25`` rank profile, and then checks the shapes of the results
        returned by ``query``, ``collect_training_data`` and ``evaluate``.

        NOTE(review): this appears to depend on the remote endpoint being
        reachable and on its current content -- confirm before running in an
        isolated CI environment.
        """
        #
        # Connect to a running Vespa Application
        #
        app = Vespa(url="https://api.cord19.vespa.ai")
        #
        # Define a query model
        #
        match_phase = Union(
            WeakAnd(hits=10),
            ANN(
                doc_vector="title_embedding",
                query_vector="title_vector",
                hits=10,
                label="title",
            ),
        )
        rank_profile = Ranking(name="bm25", list_features=True)
        query_model = QueryModel(
            name="ANN_bm25",
            query_properties=[
                QueryRankingFeature(
                    name="title_vector",
                    # The query text is ignored; a random 768-element vector
                    # is generated instead. The loop variable is ``_`` so it
                    # no longer shadows the lambda parameter.
                    mapping=lambda x: [random() for _ in range(768)],
                )
            ],
            match_phase=match_phase,
            rank_profile=rank_profile,
        )
        #
        # Query Vespa app
        #
        query_result = app.query(
            query="Is remdesivir an effective treatment for COVID-19?",
            query_model=query_model,
        )
        # assertGreater reports the actual count on failure.
        self.assertGreater(query_result.number_documents_retrieved, 0)
        self.assertEqual(len(query_result.hits), 10)
        #
        # Define labelled data
        #
        # The two query strings are shared by the list-of-dicts and the
        # DataFrame fixtures so both stay in sync.
        first_query = "Intrauterine virus infections and congenital heart disease"
        second_query = (
            "Clinical and immunologic studies in identical twins discordant for systemic lupus erythematosus"
        )
        labeled_data = [
            {
                "query_id": 0,
                "query": first_query,
                "relevant_docs": [
                    {"id": 0, "score": 1},
                    {"id": 3, "score": 1},
                ],
            },
            {
                "query_id": 1,
                "query": second_query,
                "relevant_docs": [
                    {"id": 1, "score": 1},
                    {"id": 5, "score": 1},
                ],
            },
        ]
        # equivalent data in df format
        labeled_data_df = DataFrame(
            data={
                "qid": [0, 0, 1, 1],
                "query": [first_query] * 2 + [second_query] * 2,
                "doc_id": [0, 3, 1, 5],
                "relevance": [1, 1, 1, 1],
            }
        )

        #
        # Collect training data
        #
        training_data_batch = app.collect_training_data(
            labeled_data=labeled_data,
            id_field="id",
            query_model=query_model,
            number_additional_docs=2,
            fields=["rankfeatures"],
        )
        self.assertGreater(training_data_batch.shape[0], 0)
        # All three bookkeeping columns must be present; comparing the set
        # difference to the empty set names the missing column(s) on failure.
        required_columns = {"document_id", "query_id", "label"}
        missing_columns = required_columns - set(training_data_batch.columns)
        self.assertSetEqual(missing_columns, set())
        #
        # Evaluate a query model
        #
        eval_metrics = [MatchRatio(), Recall(at=10), ReciprocalRank(at=10)]
        evaluation = app.evaluate(
            labeled_data=labeled_data,
            eval_metrics=eval_metrics,
            query_model=query_model,
            id_field="id",
        )
        self.assertEqual(evaluation.shape, (9, 1))

        #
        # AssertionError - two models with the same name
        #
        with self.assertRaises(AssertionError):
            app.evaluate(
                labeled_data=labeled_data,
                eval_metrics=eval_metrics,
                query_model=[QueryModel(), QueryModel(), query_model],
                id_field="id",
            )

        # Two distinctly named models: the result has one column per model.
        evaluation = app.evaluate(
            labeled_data=labeled_data,
            eval_metrics=eval_metrics,
            query_model=[QueryModel(), query_model],
            id_field="id",
        )
        self.assertEqual(evaluation.shape, (9, 2))

        # Same evaluation from the DataFrame fixture, with detailed metrics.
        evaluation = app.evaluate(
            labeled_data=labeled_data_df,
            eval_metrics=eval_metrics,
            query_model=query_model,
            id_field="id",
            detailed_metrics=True,
        )
        self.assertEqual(evaluation.shape, (15, 1))

        # Per-query output: two rows are expected for the two labelled queries.
        evaluation = app.evaluate(
            labeled_data=labeled_data_df,
            eval_metrics=eval_metrics,
            query_model=query_model,
            id_field="id",
            detailed_metrics=True,
            per_query=True,
        )
        self.assertEqual(evaluation.shape, (2, 7))