def test_or(self):
    """Check the YQL fragment and the query properties produced by ``OR``.

    The expected fragment embeds the text "this is a test", so ``self.query``
    is presumably set to that string by the fixture -- confirm in ``setUp``.
    """
    or_filter = OR()

    yql_fragment = or_filter.create_match_filter(query=self.query)
    self.assertEqual(
        yql_fragment,
        '([{"grammar": "any"}]userInput("this is a test"))',
    )

    # OR is expected to contribute no extra query properties.
    extra_properties = or_filter.get_query_properties(query=self.query)
    self.assertDictEqual(extra_properties, {})
def bert_model_input_and_output(self, app, schema_name, fields_to_send, model_config):
    """Feed one document, query it, and compare Vespa's BERT features with the local model.

    The document is fed through ``app``, then a fixed query is ranked with the
    "pretrained_bert_tiny" profile. The top hit's input tensors must equal the
    encodings from ``model_config.create_encodings`` and its two logits must
    match ``model_config.predict`` to 5 decimal places.

    :param app: object exposing ``feed_data_point`` and ``query``.
    :param schema_name: schema the document is fed to; also used to build the
        expected document id.
    :param fields_to_send: document fields; must contain "id" and "title".
    :param model_config: provides ``query_token_ids_name``,
        ``query_tensor_mapping``, ``create_encodings`` and ``predict``.
    """
    query_text = "this is a test"
    doc_id = fields_to_send["id"]

    #
    # Feed a data point
    #
    feed_response = app.feed_data_point(
        schema=schema_name,
        data_id=doc_id,
        fields=fields_to_send,
    )
    expected_doc_id = "id:{}:{}::{}".format(schema_name, schema_name, fields_to_send["id"])
    self.assertEqual(feed_response.json["id"], expected_doc_id)

    #
    # Run a test query
    #
    token_ids_feature = QueryRankingFeature(
        name=model_config.query_token_ids_name,
        mapping=model_config.query_tensor_mapping,
    )
    bert_query_model = QueryModel(
        query_properties=[token_ids_feature],
        match_phase=OR(),
        rank_profile=Ranking(name="pretrained_bert_tiny"),
    )
    result = app.query(query=query_text, query_model=bert_query_model)
    top_hit = result.hits[0]

    # (key in the expected encodings, feature name reported by Vespa)
    tensor_features = (
        ("input_ids", "rankingExpression(input_ids)"),
        ("attention_mask", "rankingExpression(attention_mask)"),
        ("token_type_ids", "rankingExpression(token_type_ids)"),
    )
    vespa_tensors = {
        key: self._parse_vespa_tensor(top_hit, feature_name)
        for key, feature_name in tensor_features
    }

    expected_inputs = model_config.create_encodings(
        queries=[query_text], docs=[fields_to_send["title"]]
    )
    for key, _ in tensor_features:
        self.assertEqual(vespa_tensors[key], expected_inputs[key][0])

    expected_logits = model_config.predict(
        queries=[query_text], docs=[fields_to_send["title"]]
    )
    summary_features = top_hit["fields"]["summaryfeatures"]
    logit_features = ("rankingExpression(logit0)", "rankingExpression(logit1)")
    for position, feature_name in enumerate(logit_features):
        self.assertAlmostEqual(
            summary_features[feature_name],
            expected_logits[0][position],
            5,
        )
def test_rank_input_output(self):
    """Feed one cord19 document and compare Vespa's BERT features with ``self.bert_config``.

    The top hit of a fixed query, ranked with the "pretrained_bert_tiny"
    profile, must report the same input tensors as
    ``self.bert_config.create_encodings`` and logits matching
    ``self.bert_config.predict`` to 5 decimal places.
    """
    query_text = "this is a test"
    title = "this is my first title"

    #
    # Feed a data point
    #
    fields = {
        "cord_uid": "1",
        "title": title,
    }
    # Add the fields the model configuration derives from the title text.
    fields.update(self.bert_config.doc_fields(text=str(fields["title"])))
    feed_response = self.app.feed_data_point(
        schema="cord19",
        data_id="1",
        fields=fields,
    )
    self.assertEqual(feed_response.json()["id"], "id:cord19:cord19::1")

    #
    # Run a test query
    #
    token_ids_feature = QueryRankingFeature(
        name=self.bert_config.query_token_ids_name,
        mapping=self.bert_config.query_tensor_mapping,
    )
    bert_query_model = QueryModel(
        query_properties=[token_ids_feature],
        match_phase=OR(),
        rank_profile=Ranking(name="pretrained_bert_tiny"),
    )
    result = self.app.query(query=query_text, query_model=bert_query_model)
    top_hit = result.hits[0]

    # (key in the expected encodings, feature name reported by Vespa)
    tensor_features = (
        ("input_ids", "rankingExpression(input_ids)"),
        ("attention_mask", "rankingExpression(attention_mask)"),
        ("token_type_ids", "rankingExpression(token_type_ids)"),
    )
    vespa_tensors = {
        key: self._parse_vespa_tensor(top_hit, feature_name)
        for key, feature_name in tensor_features
    }

    expected_inputs = self.bert_config.create_encodings(
        queries=[query_text], docs=[title]
    )
    for key, _ in tensor_features:
        self.assertEqual(vespa_tensors[key], expected_inputs[key][0])

    expected_logits = self.bert_config.predict(queries=[query_text], docs=[title])
    summary_features = top_hit["fields"]["summaryfeatures"]
    logit_features = ("rankingExpression(logit0)", "rankingExpression(logit1)")
    for position, feature_name in enumerate(logit_features):
        self.assertAlmostEqual(
            summary_features[feature_name],
            expected_logits[0][position],
            5,
        )
def test_collect_training_data(self):
    """Collect training data in batch and point by point and check the resulting shapes.

    Both ``collect_training_data`` and repeated ``collect_training_data_point``
    calls over the same labeled data must yield 12 rows and more than 3 columns.

    NOTE(review): this test queries the public endpoint
    https://api.cord19.vespa.ai, so it needs network access.
    """
    app = Vespa(url="https://api.cord19.vespa.ai")
    query_model = QueryModel(
        match_phase=OR(),
        rank_profile=Ranking(name="bm25", list_features=True),
    )
    labeled_data = [
        {
            "query_id": 0,
            "query": "Intrauterine virus infections and congenital heart disease",
            "relevant_docs": [{"id": 0, "score": 1}, {"id": 3, "score": 1}],
        },
        {
            "query_id": 1,
            "query": "Clinical and immunologic studies in identical twins discordant for systemic lupus erythematosus",
            "relevant_docs": [{"id": 1, "score": 1}, {"id": 5, "score": 1}],
        },
    ]

    #
    # Batch collection
    #
    training_data_batch = app.collect_training_data(
        labeled_data=labeled_data,
        id_field="id",
        query_model=query_model,
        number_additional_docs=2,
        fields=["rankfeatures"],
    )
    # Presumably 4 relevant (query, doc) pairs x (1 relevant + 2 additional docs) -- confirm.
    self.assertEqual(training_data_batch.shape[0], 12)
    # It should have at least one rank feature in addition to document_id, query_id and label.
    # assertGreater reports the actual column count when the check fails.
    self.assertGreater(training_data_batch.shape[1], 3)

    #
    # Point-by-point collection over the same labeled data
    #
    training_records = []
    for query_data in labeled_data:
        for doc_data in query_data["relevant_docs"]:
            training_data_point = app.collect_training_data_point(
                query=query_data["query"],
                query_id=query_data["query_id"],
                relevant_id=doc_data["id"],
                id_field="id",
                query_model=query_model,
                number_additional_docs=2,
                fields=["rankfeatures"],
            )
            training_records.extend(training_data_point)
    training_data = DataFrame.from_records(training_records)
    self.assertEqual(training_data.shape[0], 12)
    # It should have at least one rank feature in addition to document_id, query_id and label.
    self.assertGreater(training_data.shape[1], 3)
def test_query(self):
    """Check the request bodies built by ``Vespa.query`` in debug mode.

    Three cases are covered: a raw ``body`` passed through unchanged, a body
    built from a query model with ``hits``, and the same with ``recall``.

    NOTE(review): ``debug_request=True`` presumably avoids contacting the
    server and only exposes ``request_body`` -- confirm against the client.
    """
    app = Vespa(url="http://localhost", port=8080)

    # A user-supplied body is passed through as is.
    raw_body = {"yql": "select * from sources * where test"}
    echoed_body = app.query(body=raw_body, debug_request=True).request_body
    self.assertDictEqual(echoed_body, raw_body)

    # Body built from a query model, limited to 10 hits.
    model_body = app.query(
        query="this is a test",
        query_model=Query(match_phase=OR(), rank_profile=RankProfile()),
        debug_request=True,
        hits=10,
    ).request_body
    expected_model_body = {
        "yql": 'select * from sources * where ([{"grammar": "any"}]userInput("this is a test"));',
        "ranking": {"profile": "default", "listFeatures": "false"},
        "hits": 10,
    }
    self.assertDictEqual(model_body, expected_model_body)

    # Same query with a recall restriction on the "id" field.
    recall_body = app.query(
        query="this is a test",
        query_model=Query(match_phase=OR(), rank_profile=RankProfile()),
        debug_request=True,
        hits=10,
        recall=("id", [1, 5]),
    ).request_body
    expected_recall_body = {
        "yql": 'select * from sources * where ([{"grammar": "any"}]userInput("this is a test"));',
        "ranking": {"profile": "default", "listFeatures": "false"},
        "hits": 10,
        "recall": "+(id:1 id:5)",
    }
    self.assertDictEqual(recall_body, expected_recall_body)
def test_query_properties_match_and_rank(self):
    """Check ``QueryModel.create_body`` with query properties, a match phase and a rank profile.

    The same ranking feature and "bm25" rank profile are combined first with
    an ``OR`` match phase and then with an ``ANN`` match phase. Only the
    "yql" entry of the expected body differs between the two cases.
    """
    # OR match phase.
    or_model = QueryModel(
        query_properties=[
            QueryRankingFeature(name="query_vector", mapping=lambda x: [1, 2, 3])
        ],
        match_phase=OR(),
        rank_profile=RankProfile(name="bm25", list_features=True),
    )
    or_body = or_model.create_body(query=self.query)
    expected_or_body = {
        "yql": 'select * from sources * where ([{"grammar": "any"}]userInput("this is a test"));',
        "ranking": {"profile": "bm25", "listFeatures": "true"},
        "ranking.features.query(query_vector)": "[1, 2, 3]",
    }
    self.assertDictEqual(or_body, expected_or_body)

    # ANN match phase.
    ann_match_phase = ANN(
        doc_vector="doc_vector",
        query_vector="query_vector",
        hits=10,
        label="label",
    )
    ann_model = QueryModel(
        query_properties=[
            QueryRankingFeature(name="query_vector", mapping=lambda x: [1, 2, 3])
        ],
        match_phase=ann_match_phase,
        rank_profile=RankProfile(name="bm25", list_features=True),
    )
    ann_body = ann_model.create_body(query=self.query)
    expected_ann_body = {
        "yql": 'select * from sources * where ([{"targetNumHits": 10, "label": "label", "approximate": true}]nearestNeighbor(doc_vector, query_vector));',
        "ranking": {"profile": "bm25", "listFeatures": "true"},
        "ranking.features.query(query_vector)": "[1, 2, 3]",
    }
    self.assertDictEqual(ann_body, expected_ann_body)