def setUp(self) -> None:
    """Build the ``test_app`` package fixture around an ``msmarco`` schema.

    The schema holds an ``id`` attribute field, two BM25-enabled text
    fields, one default fieldset and two rank profiles. The resulting
    package is stored on ``self.app_package``.
    """
    id_field = Field(name="id", type="string", indexing=["attribute", "summary"])
    text_fields = [
        Field(
            name=field_name,
            type="string",
            indexing=["index", "summary"],
            index="enable-bm25",
        )
        for field_name in ("title", "body")
    ]
    native_rank_profile = RankProfile(
        name="default", first_phase="nativeRank(title, body)"
    )
    bm25_profile = RankProfile(
        name="bm25",
        first_phase="bm25(title) + bm25(body)",
        inherits="default",
    )
    test_schema = Schema(
        name="msmarco",
        document=Document(fields=[id_field, *text_fields]),
        fieldsets=[FieldSet(name="default", fields=["title", "body"])],
        rank_profiles=[native_rank_profile, bm25_profile],
    )
    self.app_package = ApplicationPackage(name="test_app", schema=test_schema)
def setUp(self) -> None:
    """Create the msmarco application package and deploy it on Vespa Cloud.

    Reads ``VESPA_CLOUD_USER_KEY`` and ``WORK_DIR`` from the environment.
    Sets ``self.vespa_cloud``, ``self.disk_folder``, ``self.instance_name``
    and ``self.app`` (the value returned by ``VespaCloud.deploy``).

    Raises:
        KeyError: if ``VESPA_CLOUD_USER_KEY`` or ``WORK_DIR`` is not set.
    """
    #
    # Create application package
    #
    document = Document(
        fields=[
            Field(name="id", type="string", indexing=["attribute", "summary"]),
            Field(
                name="title",
                type="string",
                indexing=["index", "summary"],
                index="enable-bm25",
            ),
            Field(
                name="body",
                type="string",
                indexing=["index", "summary"],
                index="enable-bm25",
            ),
            Field(
                name="metadata",
                type="string",
                indexing=["attribute", "summary"],
                attribute=["fast-search", "fast-access"],
            ),
            Field(
                name="tensor_field",
                type="tensor<float>(x[128])",
                indexing=["attribute"],
                ann=HNSW(
                    distance_metric="euclidean",
                    max_links_per_node=16,
                    neighbors_to_explore_at_insert=200,
                ),
            ),
        ]
    )
    msmarco_schema = Schema(
        name="msmarco",
        document=document,
        fieldsets=[FieldSet(name="default", fields=["title", "body"])],
        rank_profiles=[
            RankProfile(name="default", first_phase="nativeRank(title, body)")
        ],
    )
    app_package = ApplicationPackage(name="msmarco", schema=msmarco_schema)
    #
    # Deploy on Vespa Cloud
    #
    # os.environ[...] fails with a KeyError naming the missing variable,
    # instead of an AttributeError/TypeError on None further down.
    # Literal backslash-n sequences in the key are turned into real newlines.
    user_key = os.environ["VESPA_CLOUD_USER_KEY"].replace(r"\n", "\n")
    self.vespa_cloud = VespaCloud(
        tenant="vespa-team",
        application="pyvespa-integration",
        key_content=user_key,
        application_package=app_package,
    )
    self.disk_folder = os.path.join(os.environ["WORK_DIR"], "sample_application")
    self.instance_name = "test"
    self.app = self.vespa_cloud.deploy(
        instance=self.instance_name, disk_folder=self.disk_folder
    )
def test_field_name_type_indexing_index(self):
    """Check attributes, dict round-trip and indexing text of an indexed field."""

    def make_body_field():
        # Fresh instance per call so equality checks compare distinct objects.
        return Field(
            name="body",
            type="string",
            indexing=["index", "summary"],
            index="enable-bm25",
        )

    field = make_body_field()
    expected_dict = {
        "name": "body",
        "type": "string",
        "indexing": ["index", "summary"],
        "index": "enable-bm25",
    }

    self.assertEqual(field.name, "body")
    self.assertEqual(field.type, "string")
    self.assertEqual(field.indexing, ["index", "summary"])
    self.assertEqual(field.index, "enable-bm25")
    self.assertEqual(field.to_dict, expected_dict)
    self.assertEqual(field, make_body_field())
    self.assertEqual(field, Field.from_dict(field.to_dict))
    self.assertEqual(field.indexing_to_text, "index | summary")
def create_cord19_application_package():
    """Return a ``cord19`` application package with a BERT ranking model added.

    The single schema gets an ``id`` and a BM25-enabled ``title`` field, a
    default fieldset and a ``bm25`` rank profile before the model ranking
    is registered through ``add_model_ranking``.
    """
    app_package = ApplicationPackage(name="cord19")
    schema = app_package.schema

    id_field = Field(name="id", type="string", indexing=["attribute", "summary"])
    title_field = Field(
        name="title",
        type="string",
        indexing=["index", "summary"],
        index="enable-bm25",
    )
    schema.add_fields(id_field, title_field)
    schema.add_field_set(FieldSet(name="default", fields=["title"]))
    schema.add_rank_profile(RankProfile(name="bm25", first_phase="bm25(title)"))

    pretrained_name = "google/bert_uncased_L-2_H-128_A-2"
    bert_config = BertModelConfig(
        model_id="pretrained_bert_tiny",
        tokenizer=pretrained_name,
        model=pretrained_name,
        query_input_size=5,
        doc_input_size=10,
    )
    reranking = SecondPhaseRanking(rerank_count=10, expression="logit1")
    app_package.add_model_ranking(
        model_config=bert_config,
        include_model_summary_features=True,
        inherits="default",
        first_phase="bm25(title)",
        second_phase=reranking,
    )
    return app_package
def test_field_name_type(self):
    """Check a field defined only by name and type, including its dict round-trip."""
    name, field_type = "test_name", "string"
    field = Field(name=name, type=field_type)

    self.assertEqual(field.name, name)
    self.assertEqual(field.type, field_type)
    self.assertEqual(field.to_dict, {"name": name, "type": field_type})
    self.assertEqual(field, Field(name=name, type=field_type))
    self.assertEqual(field, Field.from_dict(field.to_dict))
    # No indexing was configured, so there is no indexing text.
    self.assertIsNone(field.indexing_to_text)
def create_qa_application_package():
    """Return a ``QuestionAnswering`` package whose schemas both have an ``id`` field."""
    app_package = QuestionAnswering()
    #
    # Our test suite requires that each schema has an 'id' field
    #
    for schema_name in ("sentence", "context"):
        # A new Field instance per schema; the two schemas do not share one.
        id_field = Field(name="id", type="string", indexing=["attribute", "summary"])
        app_package.get_schema(schema_name).add_fields(id_field)
    return app_package
def setUp(self) -> None:
    """Build the ``test_app`` package with query profiles and a BERT model ranking.

    The tokenizer and model paths are resolved under the directory named
    by the ``RESOURCES_DIR`` environment variable.
    """
    self.app_package = ApplicationPackage(name="test_app")
    schema = self.app_package.schema

    id_field = Field(name="id", type="string", indexing=["attribute", "summary"])
    title_field = Field(
        name="title",
        type="string",
        indexing=["index", "summary"],
        index="enable-bm25",
    )
    body_field = Field(
        name="body",
        type="string",
        indexing=["index", "summary"],
        index="enable-bm25",
    )
    schema.add_fields(id_field, title_field, body_field)
    schema.add_field_set(FieldSet(name="default", fields=["title", "body"]))

    rank_profiles = (
        RankProfile(name="default", first_phase="nativeRank(title, body)"),
        RankProfile(
            name="bm25",
            first_phase="bm25(title) + bm25(body)",
            inherits="default",
        ),
    )
    for rank_profile in rank_profiles:
        schema.add_rank_profile(rank_profile)

    query_bert_field = QueryTypeField(
        name="ranking.features.query(query_bert)",
        type="tensor<float>(x[768])",
    )
    self.app_package.query_profile_type.add_fields(query_bert_field)
    self.app_package.query_profile.add_fields(
        QueryField(name="maxHits", value=100),
        QueryField(name="anotherField", value="string_value"),
    )

    resources_dir = os.environ["RESOURCES_DIR"]
    bert_config = BertModelConfig(
        model_id="bert_tiny",
        query_input_size=4,
        doc_input_size=8,
        tokenizer=os.path.join(resources_dir, "bert_tiny_tokenizer"),
        model=os.path.join(resources_dir, "bert_tiny_model"),
    )
    self.app_package.add_model_ranking(
        model_config=bert_config,
        include_model_summary_features=True,
        inherits="default",
        first_phase="bm25(title)",
        second_phase=SecondPhaseRanking(rerank_count=10, expression="logit1"),
    )
def test_tensor_with_hsnw(self):
    """Check that a tensor field configured with HNSW survives a dict round-trip."""
    field = Field(
        name="tensor_field",
        type="tensor<float>(x[128])",
        indexing=["attribute"],
        attribute=["fast-search", "fast-access"],
        ann=HNSW(
            distance_metric="euclidean",
            max_links_per_node=16,
            neighbors_to_explore_at_insert=200,
        ),
    )
    self.assertEqual(field, Field.from_dict(field.to_dict))
def test_document_two_fields(self):
    """Check a document built by adding two fields in a single call."""
    field_1 = Field(name="test_name", type="string")
    field_2 = Field(
        name="body",
        type="string",
        indexing=["index", "summary"],
        index="enable-bm25",
    )

    document = Document()
    document.add_fields(field_1, field_2)

    self.assertEqual(document.fields, [field_1, field_2])
    round_tripped = Document.from_dict(document.to_dict)
    self.assertEqual(document, round_tripped)
    built_directly = Document([field_1, field_2])
    self.assertEqual(document, built_directly)
def setUp(self) -> None:
    """Build the ``test_app`` package with an HNSW tensor field and query profiles."""
    self.app_package = ApplicationPackage(name="test_app")
    schema = self.app_package.schema

    id_field = Field(name="id", type="string", indexing=["attribute", "summary"])
    title_field = Field(
        name="title",
        type="string",
        indexing=["index", "summary"],
        index="enable-bm25",
    )
    body_field = Field(
        name="body",
        type="string",
        indexing=["index", "summary"],
        index="enable-bm25",
    )
    hnsw_settings = HNSW(
        distance_metric="euclidean",
        max_links_per_node=16,
        neighbors_to_explore_at_insert=200,
    )
    tensor_field = Field(
        name="tensor_field",
        type="tensor<float>(x[128])",
        indexing=["attribute"],
        attribute=["fast-search", "fast-access"],
        ann=hnsw_settings,
    )
    schema.add_fields(id_field, title_field, body_field, tensor_field)
    schema.add_field_set(FieldSet(name="default", fields=["title", "body"]))

    rank_profiles = (
        RankProfile(name="default", first_phase="nativeRank(title, body)"),
        RankProfile(
            name="bm25",
            first_phase="bm25(title) + bm25(body)",
            inherits="default",
        ),
    )
    for rank_profile in rank_profiles:
        schema.add_rank_profile(rank_profile)

    query_bert_field = QueryTypeField(
        name="ranking.features.query(query_bert)",
        type="tensor<float>(x[768])",
    )
    self.app_package.query_profile_type.add_fields(query_bert_field)
    self.app_package.query_profile.add_fields(
        QueryField(name="maxHits", value=100),
        QueryField(name="anotherField", value="string_value"),
    )
def setUp(self) -> None:
    """Create the msmarco application package and the target disk folder path.

    Sets ``self.app_package`` and ``self.disk_folder``; the latter is
    ``sample_application`` under the directory named by ``WORK_DIR``.

    Raises:
        KeyError: if the ``WORK_DIR`` environment variable is not set.
    """
    #
    # Create application package
    #
    document = Document(
        fields=[
            Field(name="id", type="string", indexing=["attribute", "summary"]),
            Field(
                name="title",
                type="string",
                indexing=["index", "summary"],
                index="enable-bm25",
            ),
            Field(
                name="body",
                type="string",
                indexing=["index", "summary"],
                index="enable-bm25",
            ),
            Field(
                name="metadata",
                type="string",
                indexing=["attribute", "summary"],
                attribute=["fast-search", "fast-access"],
            ),
            Field(
                name="tensor_field",
                type="tensor<float>(x[128])",
                indexing=["attribute"],
                ann=HNSW(
                    distance_metric="euclidean",
                    max_links_per_node=16,
                    neighbors_to_explore_at_insert=200,
                ),
            ),
        ]
    )
    msmarco_schema = Schema(
        name="msmarco",
        document=document,
        fieldsets=[FieldSet(name="default", fields=["title", "body"])],
        rank_profiles=[
            RankProfile(name="default", first_phase="nativeRank(title, body)")
        ],
    )
    self.app_package = ApplicationPackage(name="msmarco", schema=msmarco_schema)
    # os.environ[...] raises a KeyError naming the variable when it is unset,
    # rather than handing None to os.path.join.
    self.disk_folder = os.path.join(os.environ["WORK_DIR"], "sample_application")
def setUp(self) -> None:
    """Create the cord19 package with a BERT ranking and deploy it on Vespa Cloud.

    Reads ``VESPA_CLOUD_USER_KEY`` and ``WORK_DIR`` from the environment.
    Sets ``self.app_package``, ``self.bert_config``, ``self.vespa_cloud``,
    ``self.disk_folder``, ``self.instance_name`` and ``self.app`` (the
    value returned by ``VespaCloud.deploy``).

    Raises:
        KeyError: if ``VESPA_CLOUD_USER_KEY`` or ``WORK_DIR`` is not set.
    """
    #
    # Create application package
    #
    self.app_package = ApplicationPackage(name="cord19")
    self.app_package.schema.add_fields(
        Field(name="cord_uid", type="string", indexing=["attribute", "summary"]),
        Field(
            name="title",
            type="string",
            indexing=["index", "summary"],
            index="enable-bm25",
        ),
    )
    self.app_package.schema.add_field_set(
        FieldSet(name="default", fields=["title"])
    )
    self.app_package.schema.add_rank_profile(
        RankProfile(name="bm25", first_phase="bm25(title)")
    )
    self.bert_config = BertModelConfig(
        model_id="pretrained_bert_tiny",
        tokenizer="google/bert_uncased_L-2_H-128_A-2",
        model="google/bert_uncased_L-2_H-128_A-2",
        query_input_size=5,
        doc_input_size=10,
    )
    self.app_package.add_model_ranking(
        model_config=self.bert_config,
        include_model_summary_features=True,
        inherits="default",
        first_phase="bm25(title)",
        second_phase=SecondPhaseRanking(rerank_count=10, expression="logit1"),
    )
    #
    # Deploy on Vespa Cloud
    #
    # os.environ[...] fails with a KeyError naming the missing variable,
    # instead of an AttributeError/TypeError on None further down.
    # Literal backslash-n sequences in the key are turned into real newlines.
    user_key = os.environ["VESPA_CLOUD_USER_KEY"].replace(r"\n", "\n")
    self.vespa_cloud = VespaCloud(
        tenant="vespa-team",
        application="pyvespa-integration",
        key_content=user_key,
        application_package=self.app_package,
    )
    self.disk_folder = os.path.join(os.environ["WORK_DIR"], "sample_application")
    self.instance_name = "test"
    self.app = self.vespa_cloud.deploy(
        instance=self.instance_name, disk_folder=self.disk_folder
    )
def test_document_one_field(self):
    """Check a document built by adding a single field."""
    field = Field(name="test_name", type="string")

    document = Document()
    document.add_fields(field)

    self.assertEqual(document.fields, [field])
    round_tripped = Document.from_dict(document.to_dict)
    self.assertEqual(document, round_tripped)
    built_directly = Document([field])
    self.assertEqual(document, built_directly)
def test_schema(self):
    """Check schema dict round-trip and the rank-profile mapping after an add."""

    # Factories return fresh instances so the dict comparisons rely on
    # RankProfile equality rather than object identity.
    def bm25_profile():
        return RankProfile(name="bm25", first_phase="bm25(title) + bm25(body)")

    def default_profile():
        return RankProfile(name="default", first_phase="NativeRank(title)")

    schema = Schema(
        name="test_schema",
        document=Document(fields=[Field(name="test_name", type="string")]),
        fieldsets=[FieldSet(name="default", fields=["title", "body"])],
        rank_profiles=[bm25_profile()],
    )
    self.assertEqual(schema, Schema.from_dict(schema.to_dict))
    self.assertDictEqual(schema.rank_profiles, {"bm25": bm25_profile()})

    schema.add_rank_profile(default_profile())
    expected_profiles = {
        "bm25": bm25_profile(),
        "default": default_profile(),
    }
    self.assertDictEqual(schema.rank_profiles, expected_profiles)
def setUp(self) -> None:
    """Deploy the QA application in Docker and load the sample feed data.

    Reads ``WORK_DIR`` and ``RESOURCES_DIR`` from the environment. Sets the
    deployed ``self.app`` plus the sentence/context payloads and the
    expected sentence documents used by the tests.

    Raises:
        KeyError: if ``WORK_DIR`` or ``RESOURCES_DIR`` is not set.
    """
    # create_qa_application_package() already adds the 'id' field to both
    # the "sentence" and "context" schemas, so it is not added again here.
    self.app_package = create_qa_application_package()
    # os.environ[...] raises a KeyError naming the variable when it is unset,
    # rather than handing None to os.path.join.
    self.disk_folder = os.path.join(os.environ["WORK_DIR"], "sample_application")
    self.vespa_docker = VespaDocker(port=8089, disk_folder=self.disk_folder)
    self.app = self.vespa_docker.deploy(application_package=self.app_package)

    resources_dir = os.environ["RESOURCES_DIR"]
    sentence_path = os.path.join(resources_dir, "qa_sample_sentence_data.json")
    with open(sentence_path, "r", encoding="utf-8") as f:
        sample_sentence_data = json.load(f)
    self.fields_to_send_sentence = sample_sentence_data
    # Each expected document repeats the sent values, with the embedding
    # values rewritten as a list of {"address": {"x": <index>}, "value": v}.
    self.expected_fields_from_sentence_get_operation = [
        {
            "id": d["id"],
            "text": d["text"],
            "dataset": d["dataset"],
            "questions": d["questions"],
            "context_id": d["context_id"],
            "sentence_embedding": {
                "cells": [
                    {"address": {"x": str(idx)}, "value": value}
                    for idx, value in enumerate(d["sentence_embedding"]["values"])
                ]
            },
        }
        for d in sample_sentence_data
    ]

    context_path = os.path.join(resources_dir, "qa_sample_context_data.json")
    with open(context_path, "r", encoding="utf-8") as f:
        sample_context_data = json.load(f)
    self.fields_to_send_context = sample_context_data
    self.fields_to_update = {"text": "this is my updated text"}
def create_msmarco_application_package():
    """Return an ``msmarco`` application package wrapping a single schema.

    The schema has ``id``, ``title``, ``body`` and ``metadata`` string
    fields plus an HNSW-configured ``tensor_field``; the schema is passed
    to ``ApplicationPackage`` as a one-element list.
    """
    #
    # Application package
    #
    id_field = Field(name="id", type="string", indexing=["attribute", "summary"])
    text_fields = [
        Field(
            name=field_name,
            type="string",
            indexing=["index", "summary"],
            index="enable-bm25",
        )
        for field_name in ("title", "body")
    ]
    metadata_field = Field(
        name="metadata",
        type="string",
        indexing=["attribute", "summary"],
        attribute=["fast-search", "fast-access"],
    )
    hnsw_settings = HNSW(
        distance_metric="euclidean",
        max_links_per_node=16,
        neighbors_to_explore_at_insert=200,
    )
    tensor_field = Field(
        name="tensor_field",
        type="tensor<float>(x[128])",
        indexing=["attribute", "index"],
        ann=hnsw_settings,
    )
    document = Document(
        fields=[id_field, *text_fields, metadata_field, tensor_field]
    )
    default_profile = RankProfile(
        name="default", first_phase="nativeRank(title, body)"
    )
    msmarco_schema = Schema(
        name="msmarco",
        document=document,
        fieldsets=[FieldSet(name="default", fields=["title", "body"])],
        rank_profiles=[default_profile],
    )
    return ApplicationPackage(name="msmarco", schema=[msmarco_schema])
def setUp(self) -> None:
    """Create the cord19 package with a BERT ranking and deploy it in Docker.

    Reads ``WORK_DIR`` from the environment. Sets ``self.app_package``,
    ``self.bert_config``, ``self.disk_folder``, ``self.vespa_docker`` and
    ``self.app`` (the value returned by ``VespaDocker.deploy``).

    Raises:
        KeyError: if the ``WORK_DIR`` environment variable is not set.
    """
    #
    # Create application package
    #
    self.app_package = ApplicationPackage(name="cord19")
    self.app_package.schema.add_fields(
        Field(name="cord_uid", type="string", indexing=["attribute", "summary"]),
        Field(
            name="title",
            type="string",
            indexing=["index", "summary"],
            index="enable-bm25",
        ),
    )
    self.app_package.schema.add_field_set(
        FieldSet(name="default", fields=["title"])
    )
    self.app_package.schema.add_rank_profile(
        RankProfile(name="bm25", first_phase="bm25(title)")
    )
    self.bert_config = BertModelConfig(
        model_id="pretrained_bert_tiny",
        tokenizer="google/bert_uncased_L-2_H-128_A-2",
        model="google/bert_uncased_L-2_H-128_A-2",
        query_input_size=5,
        doc_input_size=10,
    )
    self.app_package.add_model_ranking(
        model_config=self.bert_config,
        include_model_summary_features=True,
        inherits="default",
        first_phase="bm25(title)",
        second_phase=SecondPhaseRanking(rerank_count=10, expression="logit1"),
    )
    # os.environ[...] raises a KeyError naming the variable when it is unset,
    # rather than handing None to os.path.join.
    self.disk_folder = os.path.join(os.environ["WORK_DIR"], "sample_application")
    self.vespa_docker = VespaDocker(port=8089)
    self.app = self.vespa_docker.deploy(
        application_package=self.app_package, disk_folder=self.disk_folder
    )
def test_schema(self):
    """Check round-trip of a schema with an ONNX model, then the rank-profile mapping."""

    # Factories return fresh instances so the dict comparisons rely on
    # RankProfile equality rather than object identity.
    def bm25_profile():
        return RankProfile(name="bm25", first_phase="bm25(title) + bm25(body)")

    def default_profile():
        return RankProfile(name="default", first_phase="NativeRank(title)")

    bert_model = OnnxModel(
        model_name="bert",
        model_file_path="bert.onnx",
        inputs={
            "input_ids": "input_ids",
            "token_type_ids": "token_type_ids",
            "attention_mask": "attention_mask",
        },
        outputs={"logits": "logits"},
    )
    schema = Schema(
        name="test_schema",
        document=Document(fields=[Field(name="test_name", type="string")]),
        fieldsets=[FieldSet(name="default", fields=["title", "body"])],
        rank_profiles=[bm25_profile()],
        models=[bert_model],
    )
    self.assertEqual(schema, Schema.from_dict(schema.to_dict))
    self.assertDictEqual(schema.rank_profiles, {"bm25": bm25_profile()})

    schema.add_rank_profile(default_profile())
    expected_profiles = {
        "bm25": bm25_profile(),
        "default": default_profile(),
    }
    self.assertDictEqual(schema.rank_profiles, expected_profiles)
def setUp(self) -> None:
    """Build the ``test_app`` package with a BERT rank profile and an ONNX model.

    The ``msmarco`` schema gets four fields, three rank profiles (the
    ``bert`` one carrying constants, functions and summary features) and
    one ONNX model. A query profile and a query profile type are passed
    to the package alongside the schema.
    """
    document = Document(
        fields=[
            Field(name="id", type="string", indexing=["attribute", "summary"]),
            Field(
                name="title",
                type="string",
                indexing=["index", "summary"],
                index="enable-bm25",
            ),
            Field(
                name="body",
                type="string",
                indexing=["index", "summary"],
                index="enable-bm25",
            ),
            Field(
                name="embedding",
                type="tensor<float>(x[128])",
                indexing=["attribute", "summary"],
                attribute=["fast-search", "fast-access"],
            ),
        ]
    )

    # Ranking expressions for the two tensor-generating functions.
    input_ids_expression = (
        "tensor<float>(d0[1],d1[128])(\n"
        " if (d1 == 0,\n"
        " TOKEN_CLS,\n"
        " if (d1 < question_length + 1,\n"
        " query(query_token_ids){d0:(d1-1)},\n"
        " if (d1 == question_length + 1,\n"
        " TOKEN_SEP,\n"
        " if (d1 < question_length + doc_length + 2,\n"
        " attribute(doc_token_ids){d0:(d1-question_length-2)},\n"
        " if (d1 == question_length + doc_length + 2,\n"
        " TOKEN_SEP,\n"
        " TOKEN_NONE\n"
        " ))))))"
    )
    token_type_ids_expression = (
        "tensor<float>(d0[1],d1[128])(\n"
        " if (d1 < question_length,\n"
        " 0,\n"
        " if (d1 < question_length + doc_length,\n"
        " 1,\n"
        " TOKEN_NONE\n"
        " )))"
    )
    bert_functions = [
        Function(
            name="question_length",
            expression="sum(map(query(query_token_ids), f(a)(a > 0)))",
        ),
        Function(
            name="doc_length",
            expression="sum(map(attribute(doc_token_ids), f(a)(a > 0)))",
        ),
        Function(name="input_ids", expression=input_ids_expression),
        Function(
            name="attention_mask",
            expression="map(input_ids, f(a)(a > 0))",
        ),
        Function(name="token_type_ids", expression=token_type_ids_expression),
    ]

    default_profile = RankProfile(
        name="default", first_phase="nativeRank(title, body)"
    )
    bm25_profile = RankProfile(
        name="bm25",
        first_phase="bm25(title) + bm25(body)",
        inherits="default",
    )
    bert_profile = RankProfile(
        name="bert",
        first_phase="bm25(title) + bm25(body)",
        second_phase=SecondPhaseRanking(
            rerank_count=10,
            expression="sum(onnx(bert).logits{d0:0,d1:0})",
        ),
        inherits="default",
        constants={"TOKEN_NONE": 0, "TOKEN_CLS": 101, "TOKEN_SEP": 102},
        functions=bert_functions,
        summary_features=[
            "onnx(bert).logits",
            "input_ids",
            "attention_mask",
            "token_type_ids",
        ],
    )
    bert_model = OnnxModel(
        model_name="bert",
        model_file_path="bert.onnx",
        inputs={
            "input_ids": "input_ids",
            "token_type_ids": "token_type_ids",
            "attention_mask": "attention_mask",
        },
        outputs={"logits": "logits"},
    )

    test_schema = Schema(
        name="msmarco",
        document=document,
        fieldsets=[FieldSet(name="default", fields=["title", "body"])],
        rank_profiles=[default_profile, bm25_profile, bert_profile],
        models=[bert_model],
    )
    test_query_profile_type = QueryProfileType(
        fields=[
            QueryTypeField(
                name="ranking.features.query(query_bert)",
                type="tensor<float>(x[768])",
            )
        ]
    )
    test_query_profile = QueryProfile(
        fields=[
            QueryField(name="maxHits", value=100),
            QueryField(name="anotherField", value="string_value"),
        ]
    )
    self.app_package = ApplicationPackage(
        name="test_app",
        schema=test_schema,
        query_profile=test_query_profile,
        query_profile_type=test_query_profile_type,
    )
def __init__(self, name: str = "qa"):
    """Assemble the ``context`` and ``sentence`` schemas and initialise the parent.

    :param name: Application name forwarded to the parent initialiser.
    """
    # --- "context" schema ---------------------------------------------
    context_fields = [
        Field(
            name="questions",
            type="array<int>",
            indexing=["summary", "attribute"],
        ),
        Field(name="dataset", type="string", indexing=["summary", "attribute"]),
        Field(name="context_id", type="int", indexing=["summary", "attribute"]),
        Field(
            name="text",
            type="string",
            indexing=["summary", "index"],
            index="enable-bm25",
        ),
    ]
    context_rank_profiles = [
        RankProfile(name="bm25", inherits="default", first_phase="bm25(text)"),
        RankProfile(
            name="nativeRank",
            inherits="default",
            first_phase="nativeRank(text)",
        ),
    ]
    context_schema = Schema(
        name="context",
        document=Document(fields=context_fields),
        fieldsets=[FieldSet(name="default", fields=["text"])],
        rank_profiles=context_rank_profiles,
    )

    # --- "sentence" schema; its document inherits from "context" ------
    embedding_field = Field(
        name="sentence_embedding",
        type="tensor<float>(x[512])",
        indexing=["attribute", "index"],
        ann=HNSW(
            distance_metric="euclidean",
            max_links_per_node=16,
            neighbors_to_explore_at_insert=500,
        ),
    )
    sentence_rank_profiles = [
        RankProfile(
            name="semantic-similarity",
            inherits="default",
            first_phase="closeness(sentence_embedding)",
        ),
        RankProfile(name="bm25", inherits="default", first_phase="bm25(text)"),
        RankProfile(
            name="bm25-semantic-similarity",
            inherits="default",
            first_phase="bm25(text) + closeness(sentence_embedding)",
        ),
    ]
    sentence_schema = Schema(
        name="sentence",
        document=Document(inherits="context", fields=[embedding_field]),
        fieldsets=[FieldSet(name="default", fields=["text"])],
        rank_profiles=sentence_rank_profiles,
    )

    query_embedding_type = QueryProfileType(
        fields=[
            QueryTypeField(
                name="ranking.features.query(query_embedding)",
                type="tensor<float>(x[512])",
            )
        ]
    )
    super().__init__(
        name=name,
        schema=[context_schema, sentence_schema],
        query_profile=QueryProfile(),
        query_profile_type=query_embedding_type,
    )
from vespa.package import (
    ApplicationPackage,
    Document,
    Field,
    FieldSet,
    RankProfile,
    Schema,
    VespaDocker,
)

# Document with an id attribute and two BM25-enabled text fields.
id_field = Field(name="id", type="string", indexing=["attribute", "summary"])
title_field = Field(
    name="title",
    type="string",
    indexing=["index", "summary"],
    index="enable-bm25",
)
body_field = Field(
    name="body",
    type="string",
    indexing=["index", "summary"],
    index="enable-bm25",
)
document = Document(fields=[id_field, title_field, body_field])

# Schema with one default fieldset and one default rank profile.
default_fieldset = FieldSet(name="default", fields=["title", "body"])
default_rank_profile = RankProfile(
    name="default", first_phase="nativeRank(title, body)"
)
msmarco_schema = Schema(
    name="msmarco",
    document=document,
    fieldsets=[default_fieldset],
    rank_profiles=[default_rank_profile],
)

app_package = ApplicationPackage(name="msmarco", schema=msmarco_schema)

# NOTE(review): this path has no leading "/", so it resolves relative to the
# current working directory -- confirm that is intended.
path = "mnt/c/Users/User/OneDrive - NTNU/NTNU/Prosjekt oppgave NLP/"