# Example 1
def test_e2e_csv_blockifier_plugin():
    """Deploy the CSV blockifier and verify it converts a CSV file into tagged blocks."""
    client = get_steamship_client()
    csv_blockifier_plugin_path = PLUGINS_PATH / "blockifiers" / "csv_blockifier.py"

    # TODO (enias): Derive this from Config
    version_config_template = {
        "text_column": {"type": "string"},
        "tag_columns": {"type": "string"},
        "tag_kind": {"type": "string"},
    }
    # Has to match up with the version config template above
    instance_config = {
        "text_column": "Message",
        "tag_columns": "Category",
        "tag_kind": "Intent",
    }
    with deploy_plugin(
        client,
        csv_blockifier_plugin_path,
        "blockifier",
        version_config_template=version_config_template,
        instance_config=instance_config,
    ) as (plugin, version, instance):
        with upload_file(client, "utterances.csv") as file:
            # Freshly uploaded file has no blocks until blockified.
            assert len(file.refresh().data.blocks) == 0
            file.blockify(plugin_instance=instance.handle).wait()
            # Check the number of blocks
            blocks = file.refresh().data.blocks
            assert len(blocks) == 5
            # Every block must carry at least one fully-populated tag.
            for block in blocks:
                assert block.tags is not None
                assert len(block.tags) > 0
                for tag in block.tags:
                    assert tag.name is not None
                    assert tag.kind is not None
            file.delete()
def test_get_training_parameters():
    """Any trainable plugin needs a Python+Lambda component that can report its trainable params.

    This tests that all the plumbing works for that to be returned.
    """
    client = get_steamship_client()
    tagger_path = PLUGINS_PATH / "taggers" / "plugin_trainable_tagger.py"
    # Now make a trainable tagger to train on those tags
    with deploy_plugin(
        client,
        tagger_path,
        "tagger",
        training_platform=HostingType.LAMBDA,
    ) as (tagger, tagger_version, tagger_instance):  # snake_case for PEP 8 / file consistency
        training_request = TrainingParameterPluginInput(plugin_instance=tagger_instance.handle)
        res = tagger_instance.get_training_parameters(
            training_request
        )  # TODO (enias): How is this working?
        assert res.data is not None
        params = res.data

        assert params.training_epochs is not None
        assert params.training_epochs == TRAINING_PARAMETERS.training_epochs
        # Compare the float with a tolerance to absorb serialization round-off.
        assert math.isclose(
            params.testing_holdout_percent,
            TRAINING_PARAMETERS.testing_holdout_percent,
            abs_tol=0.0001,
        )
        assert params.training_params == TRAINING_PARAMETERS.training_params
# Example 3
def test_e2e_corpus_importer():
    """Deploy file + corpus importers into a temp space, run an import, and check the files."""
    client = get_steamship_client()
    corpus_importer_path = PLUGINS_PATH / "importers" / "plugin_corpus_importer.py"
    file_importer_path = PLUGINS_PATH / "importers" / "plugin_file_importer.py"

    with temporary_space(client) as space:
        with deploy_plugin(
            client, file_importer_path, "fileImporter", space_id=space.id
        ) as (_, _, fi_instance):
            with deploy_plugin(
                client, corpus_importer_path, "corpusImporter", space_id=space.id
            ) as (plugin, version, instance):
                req = CorpusImportRequest(
                    type="file",
                    value="dummy-value",
                    plugin_instance=instance.handle,
                    file_importer_plugin_instance=fi_instance.handle,
                )
                res = client.post(
                    "plugin/instance/importCorpus",
                    req,
                    expect=CorpusImportResponse,
                    space_id=space.id,
                )
                res.wait()

                # We should now have two files!
                file_list = File.list(client, space_id=space.id).data
                assert file_list.files is not None
                assert len(file_list.files) == 2

                # Each imported file must contain exactly the test document.
                for file in file_list.files:
                    raw = file.raw().data
                    assert raw.decode("utf-8") == TEST_DOC
                    file.delete()
def test_e2e_blockifier_plugin():
    """Deploy the basic blockifier and confirm it yields the expected block count."""
    client = get_steamship_client()
    blockifier_path = PLUGINS_PATH / "blockifiers" / "blockifier.py"
    with deploy_plugin(
        client, blockifier_path, "blockifier"
    ) as (plugin, version, instance):
        file = File.create(client=client, content="This is a test.").data
        # No blocks before blockification, four after.
        assert len(file.refresh().data.blocks) == 0
        file.blockify(plugin_instance=instance.handle).wait()
        assert len(file.refresh().data.blocks) == 4
        file.delete()
# Example 5
def test_e2e_tagger():
    """Deploy the parser plugin as a tagger; verify tagging of a raw doc, then a file."""
    client = get_steamship_client()
    parser_path = PLUGINS_PATH / "taggers" / "plugin_parser.py"
    # TODO (enias): Use Enum for plugin type
    with deploy_plugin(client, parser_path, "tagger") as (plugin, version, instance):
        test_doc = "Hi there"
        res = instance.tag(doc=test_doc)
        res.wait()
        assert res.error is None
        assert res.data is not None
        blocks = res.data.file.blocks
        assert len(blocks) == 1
        assert blocks[0].text == test_doc

        # Let's try it on a file. This is the same test we run on the Swift test parser.
        # Since the python test parser is implemented to behave the same, we can reuse it!
        tag_file(client, instance.handle)
# Example 6
def test_e2e_importer(client: Steamship):
    """Deploy the test file importer and verify the imported file body matches TEST_DOC."""
    file_importer_path = PLUGINS_PATH / "importers" / "plugin_file_importer.py"
    with deploy_plugin(
        client, file_importer_path, "fileImporter"
    ) as (plugin, version, instance):
        # The test FileImporter should always return a string file with contents TEST_DOC
        file = File.create(
            client=client,
            content="This is a test.",
            plugin_instance=instance.handle,
        ).data

        # Now fetch the data from Steamship and assert that it is the SAME as the data the FileImporter creates
        raw = file.raw().data
        assert raw.decode("utf-8") == TEST_DOC

        file.delete()
# Example 7
def test_e2e_corpus_export(client: Steamship):
    """Blockify a CSV corpus, then run it through the exporter plugin."""
    # TODO (enias): Derive this from Config
    version_config_template = {
        "text_column": {"type": "string"},
        "tag_columns": {"type": "string"},
        "tag_kind": {"type": "string"},
    }
    # Has to match up with the version config template above
    instance_config = {
        "text_column": "Message",
        "tag_columns": "Category",
        "tag_kind": "Intent",
    }
    exporter_plugin_r = PluginInstance.create(
        client=client,
        handle=EXPORTER_HANDLE,
        plugin_handle=EXPORTER_HANDLE,
        upsert=True,
    )
    assert exporter_plugin_r.data is not None
    exporter_plugin = exporter_plugin_r.data
    assert exporter_plugin.handle is not None

    _input = ExportPluginInput(handle="default", type="file")

    csv_blockifier_path = PLUGINS_PATH / "blockifiers" / "csv_blockifier.py"

    # Make a blockifier which will generate our trainable corpus
    with deploy_plugin(
        client,
        csv_blockifier_path,
        "blockifier",
        version_config_template=version_config_template,
        instance_config=instance_config,
    ) as (plugin, version, instance):
        with upload_file(client, "utterances.csv") as file:
            assert len(file.refresh().data.blocks) == 0
            # Use the plugin we just registered
            file.blockify(plugin_instance=instance.handle).wait()
            assert len(file.refresh().data.blocks) == 5

            # Now export the corpus
            raw_data_r = exporter_plugin.export(_input)
            assert raw_data_r is not None

            # The results of a corpus exporter are MD5 encoded!
            _ = raw_data_r.data
def test_e2e_third_party_trainable_tagger_lambda_training():
    """Train the third-party trainable tagger on Lambda and verify the deployed model tags blocks.

    The test model ignores its training data, so training runs on an empty corpus for speed;
    afterwards the deployed instance should tag every block with ``MockClient.LABELS``.
    """
    import time  # hoisted from mid-function so the dependency is visible up front

    client = get_steamship_client()
    space_r = Space.get(client)  # TODO (enias): Remove  (snake_case for PEP 8 consistency)
    assert space_r.data is not None

    # Ensure the exporter plugin instance exists; training reads its exported data.
    exporter_plugin_r = PluginInstance.create(
        client=client,
        handle=EXPORTER_HANDLE,
        plugin_handle=EXPORTER_HANDLE,
        upsert=True,  # Don't care if it already exists
    )
    assert exporter_plugin_r.data is not None
    exporter_plugin = exporter_plugin_r.data
    assert exporter_plugin.handle is not None

    third_party_trainable_tagger_path = (
        PLUGINS_PATH / "taggers" / "plugin_third_party_trainable_tagger.py"
    )

    # Note that we're going to do the below training on ZERO data for simplicity.
    # The particular test model doesn't actually incorporate any data given to it at training time, so
    # it would just slow the test down to create, blockify, and export a training corpus.

    with deploy_plugin(
        client,
        third_party_trainable_tagger_path,
        "tagger",
        training_platform=HostingType.LAMBDA,
    ) as (tagger, tagger_version, tagger_instance):
        # Now train the plugin
        training_request = TrainingParameterPluginInput(
            plugin_instance=tagger_instance.handle,
            export_plugin_input=ExportPluginInput(
                plugin_instance=exporter_plugin.handle, type="file", query="all"
            ),
        )
        train_result = tagger_instance.train(training_request)
        train_result.wait()
        assert train_result.data is not None
        output = train_result.data
        assert output.training_complete
        assert output.training_reference_data is not None
        assert output.training_reference_data["num_checkins"] == 3

        logging.info("Waiting 15 seconds for instance to deploy.")
        time.sleep(15)

        # Now we'll attempt to USE this plugin. This plugin's behavior is to simply tag every block with
        # the parameters `MockClient.LABELS`

        # First we'll create a file
        test_doc = "Hi there"
        res = tagger_instance.tag(doc=test_doc)
        res.wait()
        assert res.error is None
        assert res.data is not None
        assert res.data.file is not None
        assert not res.data.file.tags
        assert res.data.file.blocks is not None
        assert len(res.data.file.blocks) > 0
        for block in res.data.file.blocks:
            assert block.tags is not None
            # sorted() accepts the generator directly; no intermediate list needed.
            assert sorted(tag.name for tag in block.tags) == sorted(MockClient.LABELS)
# Example 9
def test_e2e_trainable_tagger_lambda_training(client: Steamship):
    """End-to-end: blockify a CSV corpus, train the trainable tagger on Lambda,
    then verify both the persisted model checkpoint and the deployed instance's tagging."""
    import time  # hoisted from mid-function so the dependency is visible up front

    version_config_template = dict(
        text_column=dict(type="string"),
        tag_columns=dict(type="string"),
        tag_kind=dict(type="string"),
    )
    instance_config = dict(text_column="Message", tag_columns="Category", tag_kind="Intent")

    # Ensure the exporter plugin instance exists; training reads its exported data.
    exporter_plugin_r = PluginInstance.create(
        client=client,
        handle=EXPORTER_HANDLE,
        plugin_handle=EXPORTER_HANDLE,
        upsert=True,
    )
    assert exporter_plugin_r.data is not None
    exporter_plugin = exporter_plugin_r.data
    assert exporter_plugin.handle is not None

    csv_blockifier_path = PLUGINS_PATH / "blockifiers" / "csv_blockifier.py"
    trainable_tagger_path = PLUGINS_PATH / "taggers" / "plugin_trainable_tagger.py"

    # Make a blockifier which will generate our trainable corpus
    with deploy_plugin(
        client,
        csv_blockifier_path,
        "blockifier",
        version_config_template=version_config_template,
        instance_config=instance_config,
    ) as (plugin, version, instance):
        with upload_file(client, "utterances.csv") as file:
            assert len(file.refresh().data.blocks) == 0
            # Use the plugin we just registered
            file.blockify(plugin_instance=instance.handle).wait()
            assert len(file.refresh().data.blocks) == 5

            # Now make a trainable tagger to train on those tags
            with deploy_plugin(
                client, trainable_tagger_path, "tagger", training_platform=HostingType.LAMBDA
            ) as (tagger, tagger_version, tagger_instance):
                # Now train the plugin
                training_request = TrainingParameterPluginInput(
                    plugin_instance=tagger_instance.handle,
                    export_plugin_input=ExportPluginInput(
                        plugin_instance=EXPORTER_HANDLE, type="file", query='kind "foo1"'
                    ),
                    training_params=dict(
                        keyword_list=KEYWORDS  # This is a key defined by the test model we're training
                    ),
                )

                train_result = tagger_instance.train(training_request)
                train_result.wait()

                # At this point, the PluginInstance will have written a parameter file to disk. We should be able to
                # retrieve it since we know that it is tagged as the `default`.

                checkpoint = ModelCheckpoint(
                    client=client,
                    handle="default",
                    plugin_instance_id=tagger_instance.id,
                )
                checkpoint_path = checkpoint.download_model_bundle()
                assert checkpoint_path.exists()
                keyword_path = Path(checkpoint_path) / TestTrainableTaggerModel.KEYWORD_LIST_FILE
                assert keyword_path.exists()
                # json.load parses straight from the file handle (no intermediate string).
                with open(keyword_path, "r") as f:
                    params = json.load(f)
                assert params == KEYWORDS

                logging.info("Waiting 15 seconds for instance to deploy.")
                time.sleep(15)

                # If we're here, we have verified that the plugin instance has correctly recorded its parameters
                # into the pluginData bucket under a path unique to the PluginInstance/ModelCheckpoint.

                # Now we'll attempt to USE this plugin. This plugin's behavior is to simply tag any file with the
                # tags that parameter it. Since those tags are (see above) ["product", "coupon"] we should expect
                # this tagger to apply those tags to any file provided to it.

                # First we'll create a file
                test_doc = "Hi there"
                res = tagger_instance.tag(doc=test_doc)
                res.wait()
                assert res.error is None
                assert res.data is not None
                assert res.data.file is not None
                assert res.data.file.tags is not None
                assert len(res.data.file.tags) == len(KEYWORDS)
                # sorted() accepts the generator directly; no intermediate list needed.
                assert sorted(tag.name for tag in res.data.file.tags) == sorted(KEYWORDS)
# Example 10
def test_e2e_parser():
    """Deploy the configurable tagger with two different configs and verify that
    each instance applies the tag described by its own configuration."""
    client = get_steamship_client()
    tagger_plugin_path = PLUGINS_PATH / "taggers" / "plugin_configurable_tagger.py"
    config_template = {
        "tagKind": {"type": "string"},
        "tagName": {"type": "string"},
        "numberValue": {"type": "number"},
        "booleanValue": {"type": "boolean"},
    }
    instance_config1 = {
        "tagKind": "testTagKind",
        "tagName": "testTagName",
        "numberValue": 3,
        "booleanValue": True,
    }

    with deploy_plugin(
        client,
        tagger_plugin_path,
        "tagger",
        version_config_template=config_template,
        instance_config=instance_config1,
    ) as (plugin, version, instance):
        test_doc = "Hi there"
        res = instance.tag(doc=test_doc)
        res.wait()
        assert res.error is None
        assert res.data is not None
        assert len(res.data.file.blocks) == 1
        assert res.data.file.blocks[0].text == test_doc

        # Validate configured content
        assert len(res.data.file.tags) == 1
        first_tag = res.data.file.tags[0]
        assert first_tag.name == instance_config1["tagName"]
        assert first_tag.kind == instance_config1["tagKind"]
        first_value = first_tag.value
        assert first_value["numberValue"] == instance_config1["numberValue"]
        assert first_value["booleanValue"] == instance_config1["booleanValue"]

        instance_config2 = {
            "tagKind": "testTagKind2",
            "tagName": "testTagName2",
            "numberValue": 4,
            "booleanValue": False,
        }

        # A second instance of the same plugin version, configured differently.
        instance2 = PluginInstance.create(
            client,
            plugin_id=plugin.id,
            plugin_version_id=version.id,
            config=instance_config2,
        )
        instance2.wait()
        assert instance2.error is None
        assert instance2.data is not None
        instance2 = instance2.data

        res = instance2.tag(doc=test_doc)
        res.wait()
        assert res.error is None
        assert res.data is not None
        assert len(res.data.file.blocks) == 1
        assert res.data.file.blocks[0].text == test_doc

        # Validate configured content
        assert len(res.data.file.tags) == 1
        second_tag = res.data.file.tags[0]
        assert second_tag.name == instance_config2["tagName"]
        assert second_tag.kind == instance_config2["tagKind"]
        second_value = second_tag.value
        assert second_value["numberValue"] == instance_config2["numberValue"]
        assert second_value["booleanValue"] == instance_config2["booleanValue"]