def test_file_embed_lookup():
    """Index two single-sentence files and verify search and per-file listing.

    Each file is uploaded as Markdown, blockified, and tagged with its own
    freshly created ``test-tagger`` instance. Both files are then inserted
    into a temporary index; a question about each file must return exactly
    that file's content, and listing the index by file id must return one
    embedded item per file with embeddings of equal length.
    """
    steamship = get_steamship_client()

    content_a = "Ted likes to run."
    content_b = "Grace likes to bike."

    def _upload_and_tag(content):
        """Upload ``content`` as Markdown, blockify it, tag it, and return the file."""
        uploaded = steamship.upload(content=content, mime_type=MimeTypes.MKD).data

        blockify_res = uploaded.blockify(plugin_instance="markdown-blockifier-default-1.0")
        assert blockify_res.error is None
        blockify_res.wait()

        # A new tagger instance is created per file, as in the original flow.
        parser = PluginInstance.create(steamship, plugin_handle="test-tagger").data
        parse_res = uploaded.tag(plugin_instance=parser.handle)
        assert parse_res.error is None
        parse_res.wait()
        return uploaded

    file = _upload_and_tag(content_a)
    b = _upload_and_tag(content_b)

    embedder = PluginInstance.create(steamship, plugin_handle="test-embedder").data

    # Now we add the files to the index
    with random_index(steamship, embedder.handle) as index:
        index.insert_file(file.id, block_type="sentence", reindex=True)
        index.insert_file(b.id, block_type="sentence", reindex=True)

        expectations = (
            ("What does Ted like to do?", content_a),
            ("What does Grace like to do?", content_b),
        )
        for query, expected in expectations:
            res = index.search(query).data
            assert len(res.items) == 1
            assert res.items[0].value.value == expected

        # Now we list the items
        itemsa = index.list_items(file_id=file.id).data
        assert len(itemsa.items) == 1
        assert len(itemsa.items[0].embedding) > 0
        assert itemsa.items[0].value == content_a

        itemsb = index.list_items(file_id=b.id).data
        assert len(itemsb.items) == 1
        assert len(itemsb.items[0].embedding) > 0
        assert len(itemsb.items[0].embedding) == len(itemsa.items[0].embedding)
        assert itemsb.items[0].value == content_b

    # Remove the uploaded files, matching the cleanup the sibling file tests do.
    file.delete()
    b.delete()
def test_file_index():
    """Upload a two-section Markdown file and search it via ``file.index``.

    The file is blockified and tagged, after which six blocks are expected.
    An index is then created through the ``file.index`` shortcut and two
    questions are asked; each must return the full text of the matching
    paragraph block. The index and the file are deleted at the end.
    """
    steamship = get_steamship_client()

    t = "A nice poem"
    p1_1 = "Roses are red."
    p1_2 = "Violets are blue."
    p2_1 = "Sugar is sweet."
    p2_2 = "I love you."

    t2 = "A flavorful story"
    p3_1 = "Cake is made of flour."
    p3_2 = "Cake tastes good with milk."
    p4_1 = "Cake comes in chocolate and vanilla flavors."
    p4_2 = "Cake can be cut into mAny pieces and shared."

    content1 = f"# {t}\n\n{p1_1} {p1_2}\n\n{p2_1} {p2_2}"
    content2 = f"# {t2}\n\n{p3_1} {p3_2}\n\n{p4_1} {p4_2}"
    content = f"{content1}\n\n{content2}"

    file = steamship.upload(content=content, mime_type=MimeTypes.MKD).data
    assert file.id is not None
    assert file.mime_type == MimeTypes.MKD

    blockify_resp = file.blockify(plugin_instance="markdown-blockifier-default-1.0")
    assert blockify_resp.error is None
    blockify_resp.wait()

    # Tag the blockified file.
    tagger = PluginInstance.create(steamship, plugin_handle="test-tagger").data
    tag_resp = file.tag(plugin_instance=tagger.handle)
    assert tag_resp.error is None
    tag_resp.wait()

    # After tagging, the refreshed file should expose all six blocks.
    refreshed = file.refresh().data
    assert len(refreshed.blocks) == 6

    # Build the index through the file-level shortcut.
    embedder = PluginInstance.create(steamship, plugin_handle="test-embedder").data
    # noinspection PyUnresolvedReferences
    index = file.index(plugin_instance=embedder.handle)

    # The index stores whole blocks rather than sentences, so each hit is the
    # full paragraph text.
    expectations = (
        ("What color are roses?", " ".join([p1_1, p1_2])),
        ("What flavors does cake come in?", " ".join([p4_1, p4_2])),
    )
    for query, expected_block in expectations:
        res = index.search(query).data
        assert len(res.items) == 1
        assert res.items[0].value.value == expected_block

    index.delete()
    file.delete()
def test_plugin_instance_get():
    """Create a tagger instance under a unique handle and fetch it back by handle."""
    client = get_steamship_client()
    unique_handle = f"test_tagger_test_handle{uuid.uuid4()}"

    created = PluginInstance.create(
        client,
        plugin_handle="test-tagger",
        handle=unique_handle,
    ).data
    assert created.id is not None

    # Looking the instance up by handle must resolve to the same id.
    fetched = PluginInstance.get(client, handle=unique_handle).data
    assert created.id == fetched.id
def test_deploy_in_space():
    """Create a plugin instance in a non-default space and check its ``space_id``."""
    steamship = get_steamship_client()

    target_space = Space.create(steamship, handle="test-non-default-space").data
    deployed = PluginInstance.create(
        steamship,
        plugin_handle="test-tagger",
        space_id=target_space.id,
    ).data

    assert deployed.space_id == target_space.id
def basic_embeddings(plugin_instance: PluginInstance):
    """Run basic embedding checks against ``plugin_instance``.

    Tags a few short texts and asserts that every resulting file contains
    exactly one embedding (per ``count_embeddings``), that the embedding has
    more than one element, and that re-tagging the same text yields an
    embedding of the same length.
    """

    def _embedding(task):
        # The embedding is read from the first tag of the first block.
        return task.data.file.blocks[0].tags[0].value["embedding"]

    first = plugin_instance.tag("This is a test")
    banana = plugin_instance.tag("Banana")
    first.wait()
    banana.wait()
    assert count_embeddings(first.data.file) == 1
    assert count_embeddings(banana.data.file) == 1
    assert len(_embedding(first)) > 1

    repeat = plugin_instance.tag("This is a test")
    repeat.wait()
    assert count_embeddings(repeat.data.file) == 1
    assert len(_embedding(repeat)) == len(_embedding(first))

    another = plugin_instance.tag("This is a test")
    another.wait()
    assert count_embeddings(another.data.file) == 1
def test_parsing():
    """Tag a short string and check the single resulting block and its tag count."""
    client = get_steamship_client()
    tagger = PluginInstance.create(client, plugin_handle="test-tagger").data

    task = tagger.tag("This is a test")
    task.wait()
    result = task.data

    assert len(result.file.blocks) == 1
    block = result.file.blocks[0]
    assert block.text == "This is a test"
    assert len(block.tags) == 5
def test_file_parse():
    """Upload, blockify, tag, and index a Markdown file, then search it.

    The file is built from the module-level title and paragraph constants.
    After tagging, six blocks are expected. The file is inserted into a
    temporary index without reindexing, embedded explicitly, and a single
    search must return the first paragraph block. The file is deleted last.
    """
    steamship = get_steamship_client()

    section_template = "# {}\n\n{} {}\n\n{} {}"
    content1 = section_template.format(T, P1_1, P1_2, P2_1, P2_2)
    content2 = section_template.format(T2, P3_1, P3_2, P4_1, P4_2)
    content = "{}\n\n{}".format(content1, content2)

    file = steamship.upload(content=content, mime_type=MimeTypes.MKD).data
    assert file.id is not None
    assert file.mime_type == MimeTypes.MKD

    blockify_task = file.blockify(plugin_instance="markdown-blockifier-default-1.0")
    assert blockify_task.error is None
    blockify_task.wait()

    # Tag the blockified file.
    tagger = PluginInstance.create(steamship, plugin_handle="test-tagger").data
    tag_task = file.tag(plugin_instance=tagger.handle)
    assert tag_task.error is None
    tag_task.wait()

    # After tagging, the refreshed file should expose all six blocks.
    refreshed = file.refresh().data
    assert len(refreshed.blocks) == 6

    # Insert the file into a temporary index and embed it explicitly.
    plugin_instance = PluginInstance.create(steamship, plugin_handle=_TEST_EMBEDDER).data
    with random_index(steamship, plugin_instance=plugin_instance.handle) as index:
        index.insert_file(file.id, reindex=False)
        embed_task = index.embed()
        assert embed_task.error is None
        embed_task.wait()

        hits = index.search("What color are roses?").data
        assert len(hits.items) == 1
        # The index stores whole blocks rather than sentences, so the hit is
        # the full paragraph text.
        assert hits.items[0].value.value == " ".join([P1_1, P1_2])

    file.delete()
def test_task_comment_feedback_reporting():
    """Check that task comments can be aggregated by external group.

    We want to be able to generate reports like this:

    Select Across Group -- externalGroup
    Inputs Seen: XXX -- Distinct externalId
    Inputs Suggested: YYY -- Add to metadata
    Inputs Liked / Disliked / Used -- Add to metadata

    So really we just need to test the group aggregation.
    """
    client = get_steamship_client()
    embedder = PluginInstance.create(client, plugin_handle="test-embedder").data

    def _count(**filters):
        # Number of comments the client lists for the given filters.
        return len(client.list_comments(**filters).data.comments)

    with random_index(client, plugin_instance=embedder.handle) as index:
        item1 = EmbeddedItem(
            value="Pizza", external_id="pizza", external_type="food", metadata=[1, 2, 3]
        )

        group_name_1 = random_name()
        group_name_2 = random_name()

        index.insert(
            item1.value,
            external_id=item1.external_id,
            external_type=item1.external_type,
            metadata=item1.metadata,
        )
        embed_task = index.embed()
        embed_task.wait()

        res = index.search(item1.value, include_metadata=True, k=1)

        # Two comments land in group 1 and one in group 2.
        planned_comments = (
            ("Foo1", group_name_1),
            ("Foo2", group_name_1),
            ("Foo2", group_name_2),
        )
        for comment_id, group in planned_comments:
            res.task.add_comment(
                external_id=comment_id,
                external_type="Bar1",
                external_group=group,
                metadata=[1, 2, 3],
            )

        comments = res.task.list_comments()
        assert len(comments.data.comments) == 3

        # Filter by group only.
        assert _count(external_group=group_name_1) == 2
        assert _count(external_group=group_name_2) == 1

        # Filter by task and group.
        assert _count(task_id=res.task.task_id, external_group=group_name_1) == 2
        assert _count(task_id=res.task.task_id, external_group=group_name_2) == 1

        # Filter by task, external id, and group.
        assert (
            _count(task_id=res.task.task_id, external_id="Foo1", external_group=group_name_1)
            == 1
        )
        assert (
            _count(task_id=res.task.task_id, external_id="Foo1", external_group=group_name_2)
            == 0
        )

        # Delete all three comments; both groups should then be empty.
        for comment in comments.data.comments[:3]:
            comment.delete()

        assert _count(external_group=group_name_1) == 0
        assert _count(external_group=group_name_2) == 0
def test_basic_task_comment():
    """Add, list, and delete comments on search tasks.

    Two searches are run against the same index. A comment is attached to
    the first search task (``res2``) up front and only inspected at the end,
    so the test can confirm that comments on one task do not show up on the
    other task (``res``).
    """
    steamship = get_steamship_client()
    embedder = PluginInstance.create(steamship, plugin_handle="test-embedder").data
    with random_index(steamship, embedder.handle) as index:
        item1 = EmbeddedItem(
            value="Pizza", external_id="pizza", external_type="food", metadata=[1, 2, 3]
        )

        index.insert(
            item1.value,
            external_id=item1.external_id,
            external_type=item1.external_type,
            metadata=item1.metadata,
        )
        task = index.embed()
        task.wait()

        res2 = index.search(item1.value, include_metadata=True, k=1)
        res2.task.add_comment(external_id="Foo", external_type="Bar", metadata=[1, 2])
        # We don't return to res2 until the end to make sure we aren't co-mingling comments!

        res = index.search(item1.value, include_metadata=True, k=1)

        assert res.data.items is not None
        assert len(res.data.items) == 1
        assert res.data.items[0].value.value == item1.value
        assert res.data.items[0].value.external_id == item1.external_id
        assert res.data.items[0].value.external_type == item1.external_type
        # NOTE(review): the result of _list_equal is not asserted here;
        # presumably the helper asserts internally - confirm.
        _list_equal(res.data.items[0].value.metadata, item1.metadata)

        res.task.add_comment(external_id="Foo", external_type="Bar", metadata=[1, 2])

        comments = res.task.list_comments()
        assert len(comments.data.comments) == 1

        comment = comments.data.comments[0]
        assert comment.external_id == "Foo"
        assert comment.external_type == "Bar"
        _list_equal(comment.metadata, [1, 2])

        comment.delete()

        comments = res.task.list_comments()
        assert len(comments.data.comments) == 0

        # Now let's add two more
        res.task.add_comment(external_id="Foo1", external_type="Bar1", metadata=[1, 2, 3])
        res.task.add_comment(external_id="Foo2", external_type="Bar2", metadata=[1, 2, 3, 4])

        comments = res.task.list_comments()
        assert len(comments.data.comments) == 2

        comment = comments.data.comments[0]
        assert comment.external_id == "Foo1"
        assert comment.external_type == "Bar1"
        _list_equal(comment.metadata, [1, 2, 3])

        comment = comments.data.comments[1]
        assert comment.external_id == "Foo2"
        assert comment.external_type == "Bar2"
        _list_equal(comment.metadata, [1, 2, 3, 4])

        comments.data.comments[0].delete()
        comments.data.comments[1].delete()

        comments = res.task.list_comments()
        assert len(comments.data.comments) == 0

        # Now we handle res2
        comments = res2.task.list_comments()
        assert len(comments.data.comments) == 1
        comment = comments.data.comments[0]
        assert comment.external_id == "Foo"
        assert comment.external_type == "Bar"
        _list_equal(comment.metadata, [1, 2])

        comments.data.comments[0].delete()

        # Verify the deletion on res2 itself; the original only re-listed
        # res, which had already been checked, so this was never verified.
        comments = res2.task.list_comments()
        assert len(comments.data.comments) == 0

        # The other task must still have no comments.
        comments = res.task.list_comments()
        assert len(comments.data.comments) == 0
def test_snapshot_create():
    """Check that search results report ``index`` vs ``snapshot`` as their source.

    Part one inserts a sentence, searches (source ``index``), snapshots, and
    searches again (source ``snapshot``); this is repeated with a second
    sentence. Part two does the same on a fresh index holding fifteen similar
    sentences and snapshots with ``window_size=2``.
    """

    def _assert_single_hit(index, query, expected_source, expected_value):
        # Search with metadata and verify the one returned item field by field.
        search_results = index.search(query, include_metadata=True)
        assert len(search_results.data.items) == 1
        hit = search_results.data.items[0].value
        assert hit.index_source == expected_source
        assert hit.value == expected_value
        assert hit.external_id == "TestId"
        assert hit.external_type == "TestType"
        assert len(hit.metadata) == 3

    steamship = get_steamship_client()

    plugin_instance = PluginInstance.create(steamship, plugin_handle=_TEST_EMBEDDER).data
    index = steamship.create_index(plugin_instance=plugin_instance.handle).data

    _insert(index, ["Oranges are orange."])
    _assert_single_hit(index, "What color are oranges?", "index", "Oranges are orange.")

    _snapshot(index)
    _assert_single_hit(index, "What color are oranges?", "snapshot", "Oranges are orange.")

    _insert(index, ["Apples are red."])
    _assert_single_hit(index, "What color are apples?", "index", "Apples are red.")

    _snapshot(index)
    _assert_single_hit(index, "What color are apples?", "snapshot", "Apples are red.")

    index.delete()

    # Second scenario: a fresh client and index with many near-identical items.
    steamship = get_steamship_client()
    index = steamship.create_index(plugin_instance=plugin_instance.handle).data

    sentences = ["Orange number {} is as good as the last".format(i) for i in range(15)]
    sent = "Is orange number 13 Any good?"
    expected_sentence = "Orange number 13 is as good as the last"

    _insert(index, sentences)
    _assert_single_hit(index, sent, "index", expected_sentence)

    _snapshot(index, window_size=2)
    _assert_single_hit(index, sent, "snapshot", expected_sentence)

    index.delete()
def test_basic_embedding_search():
    """Run ``basic_embedding_search`` against a fresh test-embedder instance."""
    steamship = get_steamship_client()
    embedder = PluginInstance.create(steamship, plugin_handle=_TEST_EMBEDDER).data
    embedder_handle = embedder.handle
    basic_embedding_search(steamship, embedder_handle)
def test_parse_file():
    """Run ``tag_file`` against a fresh ``test-tagger`` instance."""
    client = get_steamship_client()
    tagger = PluginInstance.create(client, plugin_handle="test-tagger").data
    tagger_handle = tagger.handle
    tag_file(client, tagger_handle)
def test_e2e_parser():
    """Deploy the configurable tagger and verify tags reflect instance config.

    The plugin is deployed with one configuration and used to tag a short
    document; the single file-level tag must carry the configured kind, name,
    and values. A second instance of the same plugin version is then created
    with a different configuration and checked the same way.
    """
    client = get_steamship_client()
    tagger_plugin_path = PLUGINS_PATH / "taggers" / "plugin_configurable_tagger.py"

    config_template = {
        "tagKind": {"type": "string"},
        "tagName": {"type": "string"},
        "numberValue": {"type": "number"},
        "booleanValue": {"type": "boolean"},
    }
    instance_config1 = {
        "tagKind": "testTagKind",
        "tagName": "testTagName",
        "numberValue": 3,
        "booleanValue": True,
    }
    instance_config2 = {
        "tagKind": "testTagKind2",
        "tagName": "testTagName2",
        "numberValue": 4,
        "booleanValue": False,
    }
    test_doc = "Hi there"

    def _assert_tags_match(tagger, expected_config):
        # Tag the test document and compare the produced tag with the config.
        res = tagger.tag(doc=test_doc)
        res.wait()
        assert res.error is None
        assert res.data is not None
        assert len(res.data.file.blocks) == 1
        assert res.data.file.blocks[0].text == test_doc

        # Validate configured content
        assert len(res.data.file.tags) == 1
        tag = res.data.file.tags[0]
        assert tag.name == expected_config["tagName"]
        assert tag.kind == expected_config["tagKind"]
        tag_value = tag.value
        assert tag_value["numberValue"] == expected_config["numberValue"]
        assert tag_value["booleanValue"] == expected_config["booleanValue"]

    with deploy_plugin(
        client,
        tagger_plugin_path,
        "tagger",
        version_config_template=config_template,
        instance_config=instance_config1,
    ) as (plugin, version, instance):
        _assert_tags_match(instance, instance_config1)

        # Create a second instance of the same plugin version with new config.
        create_response = PluginInstance.create(
            client,
            plugin_id=plugin.id,
            plugin_version_id=version.id,
            config=instance_config2,
        )
        create_response.wait()
        assert create_response.error is None
        assert create_response.data is not None
        instance2 = create_response.data

        _assert_tags_match(instance2, instance_config2)