class TestVespaEvaluate(unittest.TestCase):
    """Tests for ``Vespa.evaluate_query`` and ``Vespa.evaluate`` with mocked I/O."""

    def setUp(self) -> None:
        # App under test; never contacted because every query call is mocked.
        self.app = Vespa(url="http://localhost", port=8080)
        # One labelled query with two relevant documents.
        self.labelled_data = [
            {
                "query_id": 0,
                "query": "Intrauterine virus infections and congenital heart disease",
                "relevant_docs": [
                    {"id": "def", "score": 1},
                    {"id": "abc", "score": 1},
                ],
            },
        ]
        # Canned Vespa response payload with two hits (fixture data).
        self.query_results = {
            "root": {
                "id": "toplevel",
                "relevance": 1.0,
                "fields": {"totalCount": 1083},
                "coverage": {
                    "coverage": 100,
                    "documents": 62529,
                    "full": True,
                    "nodes": 2,
                    "results": 1,
                    "resultsFull": 1,
                },
                "children": [
                    {
                        "id": "id:covid-19:doc::40216",
                        "relevance": 10,
                        "source": "content",
                        "fields": {
                            "vespa_id_field": "ghi",
                            "sddocname": "doc",
                            "body_text": "this is a body 2",
                            "title": "this is a title 2",
                            "rankfeatures": {"a": 3, "b": 4},
                        },
                    },
                    {
                        "id": "id:covid-19:doc::40217",
                        "relevance": 8,
                        "source": "content",
                        "fields": {
                            "vespa_id_field": "def",
                            "sddocname": "doc",
                            "body_text": "this is a body 3",
                            "title": "this is a title 3",
                            "rankfeatures": {"a": 5, "b": 6},
                        },
                    },
                ],
            }
        }

    def test_evaluate_query(self):
        """evaluate_query issues one query and merges each metric's result dict."""
        self.app.query = Mock(return_value={})
        first_metric = Mock()
        first_metric.evaluate_query = Mock(return_value={"metric": 1})
        second_metric = Mock()
        second_metric.evaluate_query = Mock(return_value={"metric_2": 2})
        model = Query()
        result = self.app.evaluate_query(
            eval_metrics=[first_metric, second_metric],
            query_model=model,
            query_id="0",
            query="this is a test",
            id_field="vespa_id_field",
            relevant_docs=self.labelled_data[0]["relevant_docs"],
            default_score=0,
            hits=10,
        )
        # Exactly one backend query, issued with the expected arguments.
        self.assertEqual(self.app.query.call_count, 1)
        self.app.query.assert_has_calls(
            [call(query="this is a test", query_model=model, hits=10)]
        )
        # The metric is evaluated once against the (mocked) query response.
        self.assertEqual(first_metric.evaluate_query.call_count, 1)
        first_metric.evaluate_query.assert_has_calls(
            [call({}, self.labelled_data[0]["relevant_docs"], "vespa_id_field", 0)]
        )
        # Per-metric dicts are merged together with the query id.
        self.assertDictEqual(
            result, {"query_id": "0", "metric": 1, "metric_2": 2}
        )

    def test_evaluate(self):
        """evaluate collects the per-query result dicts into a DataFrame."""
        self.app.evaluate_query = Mock(
            side_effect=[{"query_id": "0", "metric": 1}]
        )
        result = self.app.evaluate(
            labelled_data=self.labelled_data,
            eval_metrics=[Mock()],
            query_model=Mock(),
            id_field="mock",
            default_score=0,
        )
        assert_frame_equal(
            result, DataFrame.from_records([{"query_id": "0", "metric": 1}])
        )
def test_workflow(self): # # Connect to a running Vespa Application # app = Vespa(url="https://api.cord19.vespa.ai") # # Define a query model # match_phase = Union( WeakAnd(hits=10), ANN( doc_vector="title_embedding", query_vector="title_vector", hits=10, label="title", ), ) rank_profile = Ranking(name="bm25", list_features=True) query_model = QueryModel( name="ANN_bm25", query_properties=[ QueryRankingFeature( name="title_vector", mapping=lambda x: [random() for x in range(768)], ) ], match_phase=match_phase, rank_profile=rank_profile, ) # # Query Vespa app # query_result = app.query( query="Is remdesivir an effective treatment for COVID-19?", query_model=query_model, ) self.assertTrue(query_result.number_documents_retrieved > 0) self.assertEqual(len(query_result.hits), 10) # # Define labelled data # labeled_data = [ { "query_id": 0, "query": "Intrauterine virus infections and congenital heart disease", "relevant_docs": [{ "id": 0, "score": 1 }, { "id": 3, "score": 1 }], }, { "query_id": 1, "query": "Clinical and immunologic studies in identical twins discordant for systemic lupus erythematosus", "relevant_docs": [{ "id": 1, "score": 1 }, { "id": 5, "score": 1 }], }, ] # equivalent data in df format labeled_data_df = DataFrame( data={ "qid": [0, 0, 1, 1], "query": ["Intrauterine virus infections and congenital heart disease"] * 2 + [ "Clinical and immunologic studies in identical twins discordant for systemic lupus erythematosus" ] * 2, "doc_id": [0, 3, 1, 5], "relevance": [1, 1, 1, 1], }) # # Collect training data # training_data_batch = app.collect_training_data( labeled_data=labeled_data, id_field="id", query_model=query_model, number_additional_docs=2, fields=["rankfeatures"], ) self.assertTrue(training_data_batch.shape[0] > 0) self.assertEqual( len({"document_id", "query_id", "label"}.intersection(set(training_data_batch.columns))), 3, ) # # Evaluate a query model # eval_metrics = [MatchRatio(), Recall(at=10), ReciprocalRank(at=10)] evaluation = app.evaluate( 
labeled_data=labeled_data, eval_metrics=eval_metrics, query_model=query_model, id_field="id", ) self.assertEqual(evaluation.shape, (9, 1)) # # AssertionError - two models with the same name # with self.assertRaises(AssertionError): _ = app.evaluate( labeled_data=labeled_data, eval_metrics=eval_metrics, query_model=[QueryModel(), QueryModel(), query_model], id_field="id", ) evaluation = app.evaluate( labeled_data=labeled_data, eval_metrics=eval_metrics, query_model=[QueryModel(), query_model], id_field="id", ) self.assertEqual(evaluation.shape, (9, 2)) evaluation = app.evaluate( labeled_data=labeled_data_df, eval_metrics=eval_metrics, query_model=query_model, id_field="id", detailed_metrics=True, ) self.assertEqual(evaluation.shape, (15, 1)) evaluation = app.evaluate( labeled_data=labeled_data_df, eval_metrics=eval_metrics, query_model=query_model, id_field="id", detailed_metrics=True, per_query=True, ) self.assertEqual(evaluation.shape, (2, 7))