def test_weak_and(self):
    """Check the match filter and query properties produced by ``WeakAnd``."""
    weak_and = WeakAnd(hits=10, field="field_name")

    # One `contains` clause per token of self.query (set outside this block).
    # NOTE(review): the `contains ""` clause presumably comes from an empty
    # token in self.query -- confirm against setUp.
    expected_filter = (
        '([{"targetNumHits": 10}]weakAnd('
        'field_name contains "this", '
        'field_name contains "is", '
        'field_name contains "", '
        'field_name contains "a", '
        'field_name contains "test"))'
    )
    rendered_filter = weak_and.create_match_filter(query=self.query)
    self.assertEqual(rendered_filter, expected_filter)

    # The test expects no additional query properties from WeakAnd.
    query_properties = weak_and.get_query_properties(query=self.query)
    self.assertDictEqual(query_properties, {})
def test_union(self):
    """Check that ``Union`` joins a WeakAnd filter and an ANN filter with ``or``.

    NOTE(review): this chunk contains another method that is also named
    ``test_union``. If both live in the same class, the later definition
    replaces the earlier one and only one of them runs -- confirm.
    """
    weak_and = WeakAnd(hits=10, field="field_name")
    ann = ANN(
        doc_vector="doc_vector",
        query_vector="query_vector",
        hits=10,
        label="label",
    )
    union_filter = Union(weak_and, ann)

    # Expected output: the two operand filters, in order, separated by " or ".
    weak_and_clause = (
        '([{"targetNumHits": 10}]weakAnd('
        'field_name contains "this", '
        'field_name contains "is", '
        'field_name contains "", '
        'field_name contains "a", '
        'field_name contains "test"))'
    )
    ann_clause = (
        '([{"targetNumHits": 10, "label": "label", "approximate": true}]'
        "nearestNeighbor(doc_vector, query_vector))"
    )
    expected_filter = " or ".join([weak_and_clause, ann_clause])

    rendered_filter = union_filter.create_match_filter(query=self.query)
    self.assertEqual(rendered_filter, expected_filter)

    # The test expects no query properties from this combination.
    query_properties = union_filter.get_query_properties(query=self.query)
    self.assertDictEqual(query_properties, {})
def test_union(self):
    """Check ``Union`` of WeakAnd and an ANN that has an embedding model.

    NOTE(review): this chunk contains another method that is also named
    ``test_union``. If both live in the same class, the later definition
    replaces the earlier one and only one of them runs -- confirm.
    """
    weak_and = WeakAnd(hits=10, field="field_name")
    ann = ANN(
        doc_vector="doc_vector",
        query_vector="query_vector",
        # Stub embedding model: returns the same vector for any input.
        embedding_model=lambda x: [1, 2, 3],
        hits=10,
        label="label",
    )
    union_filter = Union(weak_and, ann)

    # Expected output: the two operand filters, in order, separated by " or ".
    weak_and_clause = (
        '([{"targetNumHits": 10}]weakAnd('
        'field_name contains "this", '
        'field_name contains "is", '
        'field_name contains "", '
        'field_name contains "a", '
        'field_name contains "test"))'
    )
    ann_clause = (
        '([{"targetNumHits": 10, "label": "label"}]'
        "nearestNeighbor(doc_vector, query_vector))"
    )
    expected_filter = " or ".join([weak_and_clause, ann_clause])

    rendered_filter = union_filter.create_match_filter(query=self.query)
    self.assertEqual(rendered_filter, expected_filter)

    # The stub's vector is expected back as a string-valued ranking feature.
    expected_properties = {"ranking.features.query(query_vector)": "[1, 2, 3]"}
    query_properties = union_filter.get_query_properties(query=self.query)
    self.assertDictEqual(query_properties, expected_properties)
def test_workflow(self):
    """Exercise query, training-data collection and evaluation end to end.

    NOTE(review): the app is created with the URL
    https://api.cord19.vespa.ai, so this test presumably needs network
    access and a reachable endpoint to pass -- confirm before running it
    in an isolated CI environment.
    """
    #
    # Connect to a running Vespa Application
    #
    app = Vespa(url="https://api.cord19.vespa.ai")

    #
    # Define a query model
    #
    match_phase = Union(
        WeakAnd(hits=10),
        ANN(
            doc_vector="title_embedding",
            query_vector="title_vector",
            hits=10,
            label="title",
        ),
    )
    rank_profile = Ranking(name="bm25", list_features=True)
    query_model = QueryModel(
        name="ANN_bm25",
        query_properties=[
            QueryRankingFeature(
                name="title_vector",
                # Random 768-element vector; the argument `x` is ignored.
                mapping=lambda x: [random() for _ in range(768)],
            )
        ],
        match_phase=match_phase,
        rank_profile=rank_profile,
    )

    #
    # Query Vespa app
    #
    query_result = app.query(
        query="Is remdesivir an effective treatment for COVID-19?",
        query_model=query_model,
    )
    # assertGreater reports the actual count on failure.
    self.assertGreater(query_result.number_documents_retrieved, 0)
    self.assertEqual(len(query_result.hits), 10)

    #
    # Define labelled data
    #
    heart_query = "Intrauterine virus infections and congenital heart disease"
    lupus_query = (
        "Clinical and immunologic studies in identical twins discordant "
        "for systemic lupus erythematosus"
    )
    labeled_data = [
        {
            "query_id": 0,
            "query": heart_query,
            "relevant_docs": [{"id": 0, "score": 1}, {"id": 3, "score": 1}],
        },
        {
            "query_id": 1,
            "query": lupus_query,
            "relevant_docs": [{"id": 1, "score": 1}, {"id": 5, "score": 1}],
        },
    ]
    # equivalent data in df format: one row per (query, relevant doc) pair
    labeled_data_df = DataFrame(
        data={
            "qid": [0, 0, 1, 1],
            "query": [heart_query] * 2 + [lupus_query] * 2,
            "doc_id": [0, 3, 1, 5],
            "relevance": [1, 1, 1, 1],
        }
    )

    #
    # Collect training data
    #
    training_data_batch = app.collect_training_data(
        labeled_data=labeled_data,
        id_field="id",
        query_model=query_model,
        number_additional_docs=2,
        fields=["rankfeatures"],
    )
    self.assertGreater(training_data_batch.shape[0], 0)
    # All three bookkeeping columns must be present; comparing the set
    # difference to the empty set names the missing columns on failure.
    required_columns = {"document_id", "query_id", "label"}
    self.assertSetEqual(
        required_columns - set(training_data_batch.columns), set()
    )

    #
    # Evaluate a query model
    #
    eval_metrics = [MatchRatio(), Recall(at=10), ReciprocalRank(at=10)]
    evaluation = app.evaluate(
        labeled_data=labeled_data,
        eval_metrics=eval_metrics,
        query_model=query_model,
        id_field="id",
    )
    self.assertEqual(evaluation.shape, (9, 1))

    #
    # AssertionError - two models with the same name
    #
    with self.assertRaises(AssertionError):
        app.evaluate(
            labeled_data=labeled_data,
            eval_metrics=eval_metrics,
            query_model=[QueryModel(), QueryModel(), query_model],
            id_field="id",
        )

    # Two query models are passed here; the shape gains a second column.
    evaluation = app.evaluate(
        labeled_data=labeled_data,
        eval_metrics=eval_metrics,
        query_model=[QueryModel(), query_model],
        id_field="id",
    )
    self.assertEqual(evaluation.shape, (9, 2))

    # Same evaluation driven by the DataFrame input, with detailed metrics.
    evaluation = app.evaluate(
        labeled_data=labeled_data_df,
        eval_metrics=eval_metrics,
        query_model=query_model,
        id_field="id",
        detailed_metrics=True,
    )
    self.assertEqual(evaluation.shape, (15, 1))

    # per_query=True: the expected shape has 2 rows, matching the two
    # labelled queries (presumably one row per query -- confirm).
    evaluation = app.evaluate(
        labeled_data=labeled_data_df,
        eval_metrics=eval_metrics,
        query_model=query_model,
        id_field="id",
        detailed_metrics=True,
        per_query=True,
    )
    self.assertEqual(evaluation.shape, (2, 7))