def test_collect_training_data(self):
    """Collect training data in batch and point by point from the CORD-19 app.

    The batch call and the per-document calls are each expected to produce
    12 rows and more than three columns.
    """
    app = Vespa(url="https://api.cord19.vespa.ai")
    match_phase = OR()
    bm25_ranking = Ranking(name="bm25", list_features=True)
    query_model = QueryModel(match_phase=match_phase, rank_profile=bm25_ranking)

    labeled_queries = [
        {
            "query_id": 0,
            "query": "Intrauterine virus infections and congenital heart disease",
            "relevant_docs": [{"id": 0, "score": 1}, {"id": 3, "score": 1}],
        },
        {
            "query_id": 1,
            "query": "Clinical and immunologic studies in identical twins discordant for systemic lupus erythematosus",
            "relevant_docs": [{"id": 1, "score": 1}, {"id": 5, "score": 1}],
        },
    ]

    # Batch collection: one call covering every labeled query.
    batch_frame = app.collect_training_data(
        labeled_data=labeled_queries,
        id_field="id",
        query_model=query_model,
        number_additional_docs=2,
        fields=["rankfeatures"],
    )
    batch_shape = batch_frame.shape
    self.assertEqual(batch_shape[0], 12)
    # It should have at least one rank feature in addition to document_id, query_id and label
    self.assertTrue(batch_shape[1] > 3)

    # Point-wise collection: one call per (query, relevant document) pair,
    # flattened into a single list of records.
    point_records = [
        record
        for labeled_query in labeled_queries
        for relevant_doc in labeled_query["relevant_docs"]
        for record in app.collect_training_data_point(
            query=labeled_query["query"],
            query_id=labeled_query["query_id"],
            relevant_id=relevant_doc["id"],
            id_field="id",
            query_model=query_model,
            number_additional_docs=2,
            fields=["rankfeatures"],
        )
    ]
    point_frame = DataFrame.from_records(point_records)
    point_shape = point_frame.shape
    self.assertEqual(point_shape[0], 12)
    # It should have at least one rank feature in addition to document_id, query_id and label
    self.assertTrue(point_shape[1] > 3)
class TestVespaCollectData(unittest.TestCase):
    """Tests for training-data collection, using canned Vespa results and mocks."""

    def setUp(self) -> None:
        """Create the app under test and two canned raw query results."""

        def toplevel(children):
            # Build a fresh raw result envelope around the given hits.
            return {
                "root": {
                    "id": "toplevel",
                    "relevance": 1.0,
                    "fields": {"totalCount": 1083},
                    "coverage": {
                        "coverage": 100,
                        "documents": 62529,
                        "full": True,
                        "nodes": 2,
                        "results": 1,
                        "resultsFull": 1,
                    },
                    "children": children,
                }
            }

        self.app = Vespa(url="http://localhost", port=8080)

        # Single hit used as the result of the recall query.
        recall_hit = {
            "id": "id:covid-19:doc::40215",
            "relevance": 30.368213170494712,
            "source": "content",
            "fields": {
                "vespa_id_field": "abc",
                "sddocname": "doc",
                "body_text": "this is a body",
                "title": "this is a title",
                "rankfeatures": {"a": 1, "b": 2},
            },
        }
        self.raw_vespa_result_recall = toplevel([recall_hit])

        # Two hits used as the result of the additional-documents query.
        first_additional_hit = {
            "id": "id:covid-19:doc::40216",
            "relevance": 10,
            "source": "content",
            "fields": {
                "vespa_id_field": "def",
                "sddocname": "doc",
                "body_text": "this is a body 2",
                "title": "this is a title 2",
                "rankfeatures": {"a": 3, "b": 4},
            },
        }
        second_additional_hit = {
            "id": "id:covid-19:doc::40217",
            "relevance": 8,
            "source": "content",
            "fields": {
                "vespa_id_field": "ghi",
                "sddocname": "doc",
                "body_text": "this is a body 3",
                "title": "this is a title 3",
                "rankfeatures": {"a": 5, "b": 6},
            },
        }
        self.raw_vespa_result_additional = toplevel(
            [first_additional_hit, second_additional_hit]
        )

    def test_disable_rank_features(self):
        """Collecting a data point with a default ``Query()`` raises AssertionError."""
        with self.assertRaises(AssertionError):
            default_query_model = Query()
            self.app.collect_training_data_point(
                query="this is a query",
                query_id="123",
                relevant_id="abc",
                id_field="vespa_id_field",
                query_model=default_query_model,
                number_additional_docs=2,
            )

    def test_collect_training_data_point(self):
        """A data point combines the recall hit with the additional hits."""
        canned_results = [
            VespaResult(self.raw_vespa_result_recall),
            VespaResult(self.raw_vespa_result_additional),
        ]
        self.app.query = Mock(side_effect=canned_results)
        query_model = Query(rank_profile=RankProfile(list_features=True))

        data = self.app.collect_training_data_point(
            query="this is a query",
            query_id="123",
            relevant_id="abc",
            id_field="vespa_id_field",
            query_model=query_model,
            number_additional_docs=2,
            timeout="15s",
        )

        # One recall query followed by one query for the additional documents.
        self.assertEqual(self.app.query.call_count, 2)
        recall_call = call(
            query="this is a query",
            query_model=query_model,
            recall=("vespa_id_field", ["abc"]),
            timeout="15s",
        )
        additional_call = call(
            query="this is a query",
            query_model=query_model,
            hits=2,
            timeout="15s",
        )
        self.app.query.assert_has_calls([recall_call, additional_call])

        expected_data = [
            {
                "document_id": doc_id,
                "query_id": "123",
                "relevant": relevant,
                "a": feature_a,
                "b": feature_b,
            }
            for doc_id, relevant, feature_a, feature_b in (
                ("abc", 1, 1, 2),
                ("def", 0, 3, 4),
                ("ghi", 0, 5, 6),
            )
        ]
        self.assertEqual(data, expected_data)

    def test_collect_training_data_point_0_recall_hits(self):
        """A recall result without children yields no data and only one query call."""
        # Replace the recall fixture with a result that has no "children" entry.
        self.raw_vespa_result_recall = {
            "root": {
                "id": "toplevel",
                "relevance": 1.0,
                "fields": {"totalCount": 0},
                "coverage": {
                    "coverage": 100,
                    "documents": 62529,
                    "full": True,
                    "nodes": 2,
                    "results": 1,
                    "resultsFull": 1,
                },
            }
        }
        canned_results = [
            VespaResult(self.raw_vespa_result_recall),
            VespaResult(self.raw_vespa_result_additional),
        ]
        self.app.query = Mock(side_effect=canned_results)
        query_model = Query(rank_profile=RankProfile(list_features=True))

        data = self.app.collect_training_data_point(
            query="this is a query",
            query_id="123",
            relevant_id="abc",
            id_field="vespa_id_field",
            query_model=query_model,
            number_additional_docs=2,
            timeout="15s",
        )

        # Only the recall query is expected; the second canned result stays unused.
        self.assertEqual(self.app.query.call_count, 1)
        recall_call = call(
            query="this is a query",
            query_model=query_model,
            recall=("vespa_id_field", ["abc"]),
            timeout="15s",
        )
        self.app.query.assert_has_calls([recall_call])

        expected_data = []
        self.assertEqual(data, expected_data)

    def test_collect_training_data(self):
        """Batch collection delegates to the point collector and returns a DataFrame."""
        canned_rows = [
            {
                "document_id": doc_id,
                "query_id": "123",
                "relevant": relevant,
                "a": feature_a,
                "b": feature_b,
            }
            for doc_id, relevant, feature_a, feature_b in (
                ("abc", 1, 1, 2),
                ("def", 0, 3, 4),
                ("ghi", 0, 5, 6),
            )
        ]
        self.app.collect_training_data_point = Mock(return_value=canned_rows)

        labelled_queries = [
            {
                "query_id": 123,
                "query": "this is a query",
                "relevant_docs": [{"id": "abc", "score": 1}],
            }
        ]
        query_model = Query(rank_profile=RankProfile(list_features=True))

        data = self.app.collect_training_data(
            labelled_data=labelled_queries,
            id_field="vespa_id_field",
            query_model=query_model,
            number_additional_docs=2,
            timeout="15s",
        )

        expected_point_call = call(
            query="this is a query",
            query_id=123,
            relevant_id="abc",
            id_field="vespa_id_field",
            query_model=query_model,
            number_additional_docs=2,
            relevant_score=1,
            default_score=0,
            timeout="15s",
        )
        self.app.collect_training_data_point.assert_has_calls([expected_point_call])

        expected_frame = DataFrame.from_records(canned_rows)
        assert_frame_equal(data, expected_frame)
def test_workflow(self):
    """Run a query, collect training data and evaluate models against the CORD-19 app."""
    #
    # Connect to a running Vespa Application
    #
    app = Vespa(url="https://api.cord19.vespa.ai")

    #
    # Define a query model
    #
    weak_and = WeakAnd(hits=10)
    title_ann = ANN(
        doc_vector="title_embedding",
        query_vector="title_vector",
        hits=10,
        label="title",
    )
    match_phase = Union(weak_and, title_ann)
    rank_profile = Ranking(name="bm25", list_features=True)
    title_vector_feature = QueryRankingFeature(
        name="title_vector",
        # The query text is ignored; a random 768-element vector is generated.
        mapping=lambda x: [random() for _ in range(768)],
    )
    query_model = QueryModel(
        name="ANN_bm25",
        query_properties=[title_vector_feature],
        match_phase=match_phase,
        rank_profile=rank_profile,
    )

    #
    # Query Vespa app
    #
    query_result = app.query(
        query="Is remdesivir an effective treatment for COVID-19?",
        query_model=query_model,
    )
    self.assertTrue(query_result.number_documents_retrieved > 0)
    self.assertEqual(len(query_result.hits), 10)

    #
    # Define labelled data
    #
    first_query = "Intrauterine virus infections and congenital heart disease"
    second_query = "Clinical and immunologic studies in identical twins discordant for systemic lupus erythematosus"
    labeled_queries = [
        {
            "query_id": 0,
            "query": first_query,
            "relevant_docs": [{"id": 0, "score": 1}, {"id": 3, "score": 1}],
        },
        {
            "query_id": 1,
            "query": second_query,
            "relevant_docs": [{"id": 1, "score": 1}, {"id": 5, "score": 1}],
        },
    ]
    # equivalent data in df format
    labeled_queries_df = DataFrame(
        data={
            "qid": [0, 0, 1, 1],
            "query": [first_query, first_query, second_query, second_query],
            "doc_id": [0, 3, 1, 5],
            "relevance": [1, 1, 1, 1],
        }
    )

    #
    # Collect training data
    #
    training_frame = app.collect_training_data(
        labeled_data=labeled_queries,
        id_field="id",
        query_model=query_model,
        number_additional_docs=2,
        fields=["rankfeatures"],
    )
    self.assertTrue(training_frame.shape[0] > 0)
    required_columns = {"document_id", "query_id", "label"}
    present_columns = required_columns.intersection(set(training_frame.columns))
    self.assertEqual(len(present_columns), 3)

    #
    # Evaluate a query model
    #
    eval_metrics = [MatchRatio(), Recall(at=10), ReciprocalRank(at=10)]

    def run_evaluation(data, models, **options):
        # Every evaluation below shares the metrics and the id field.
        return app.evaluate(
            labeled_data=data,
            eval_metrics=eval_metrics,
            query_model=models,
            id_field="id",
            **options,
        )

    evaluation = run_evaluation(labeled_queries, query_model)
    self.assertEqual(evaluation.shape, (9, 1))

    #
    # AssertionError - two models with the same name
    #
    with self.assertRaises(AssertionError):
        run_evaluation(labeled_queries, [QueryModel(), QueryModel(), query_model])

    evaluation = run_evaluation(labeled_queries, [QueryModel(), query_model])
    self.assertEqual(evaluation.shape, (9, 2))

    evaluation = run_evaluation(
        labeled_queries_df, query_model, detailed_metrics=True
    )
    self.assertEqual(evaluation.shape, (15, 1))

    evaluation = run_evaluation(
        labeled_queries_df, query_model, detailed_metrics=True, per_query=True
    )
    self.assertEqual(evaluation.shape, (2, 7))