def test_no_mapping(self):
    # A DataSource without a configured column mapping cannot produce a mapped dataframe
    ds = DataSource(
        format="json", source=os.path.join(FILES_PATH, "dataset_source.jsonl")
    )
    with pytest.raises(ValueError):
        ds.to_mapped_dataframe()
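
The test above only checks the failure mode. As a minimal sketch of the happy path, assuming the same DataSource constructor and the mapping attribute used in Example no. 3 below (the column names and mapping direction are illustrative, not taken from the original test data):

ds = DataSource(
    format="json", source=os.path.join(FILES_PATH, "dataset_source.jsonl")
)
# Hypothetical mapping from pipeline field names to raw column names
ds.mapping = {"text": "review", "label": "sentiment"}
df = ds.to_mapped_dataframe()  # no ValueError once a mapping is configured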
Example no. 2
def test_dataset_creation_with_partial_mapping(
    datasource_with_partial_mapping: DataSource, pipeline_test: Pipeline
):
    df = datasource_with_partial_mapping.to_mapped_dataframe()
    dataset = pipeline_test.create_dataset(datasource_with_partial_mapping)
    assert isinstance(dataset, AllennlpDataset)
    assert len(dataset) == len(df.text)

    for instance in dataset:
        assert isinstance(instance, Instance)
        assert "text" in instance.fields
        assert "label" in instance.fields
Example no. 3
    def create_dataset(self,
                       datasource: DataSource,
                       lazy: bool = False) -> InstancesDataset:
        """
        Creates an instances torch Dataset from an data source

        Parameters
        ----------
        datasource:
            The source of data
        lazy:
            If enabled, the returned dataset is a subclass of `torch.data.utils.IterableDataset`

        Returns
        -------

        A torch Dataset containing the instances collection

        """
        mapping = {k: k for k in self.inputs + [self.output] if k}
        mapping.update(datasource.mapping)

        datasource.mapping = mapping
        ddf = datasource.to_mapped_dataframe()
        instances_series: "dask.dataframe.core.Series" = ddf.map_partitions(
            lambda df: df.apply(
                lambda row: self.head.featurize(**row.to_dict()), axis=1),
            meta=object,
        ).persist()
        # We remove the non-featurizable examples from the dataset. The head should log a warning for them, though!
        instances_series = instances_series.dropna()

        def build_instance_generator(instances: DataFrame):
            """Configures an instance generator from DataFrame"""
            def instance_generator(path: str) -> Iterable[Instance]:
                yield from instances

            return instance_generator

        if lazy:
            return AllennlpLazyDataset(
                instance_generator=build_instance_generator(instances_series),
                file_path="dummy",
            )
        return AllennlpDataset(list(instances_series.compute()))
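
A short usage sketch of create_dataset, assuming a configured Pipeline and DataSource as in the tests above (pipeline and datasource are placeholder names; the lazy iteration is inferred from the docstring, not shown in the original examples):

# Eager dataset: all instances are materialized in memory
dataset = pipeline.create_dataset(datasource)
print(len(dataset))  # number of featurizable rows

# Lazy dataset: instances are generated on iteration
lazy_dataset = pipeline.create_dataset(datasource, lazy=True)
for instance in lazy_dataset:
    ...  # each item is an AllenNLP Instance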
Example no. 4
def _explore(
    pipeline: Pipeline,
    data_source: DataSource,
    config: ExploreConfiguration,
    elasticsearch: ElasticsearchExplore,
) -> dd.DataFrame:
    """
    Executes a pipeline prediction over a datasource and register results int a elasticsearch index

    Parameters
    ----------
    pipeline
    data_source
    config
    elasticsearch

    Returns
    -------

    """
    if config.prediction_cache > 0:
        pipeline.init_prediction_cache(config.prediction_cache)

    ddf_mapped = data_source.to_mapped_dataframe()
    # Stringify input data for better elasticsearch index mapping integration,
    # avoiding properties with multiple value types (string and long,...)
    for column in ddf_mapped.columns:
        ddf_mapped[column] = ddf_mapped[column].apply(helpers.stringify)

    # this only really makes sense when we have a predict_batch_json method implemented ...
    n_partitions = max(1, round(len(ddf_mapped) / config.batch_size))

    apply_func = pipeline.explain_batch if config.explain else pipeline.predict_batch

    def annotate_batch(df: pd.DataFrame):
        """Applies data annotation at batch level"""
        input_batch = df.to_dict(orient="records")
        predictions = apply_func(input_batch)
        return pd.Series(map(sanitize, predictions), index=df.index)

    # a persist is necessary here, otherwise it fails for n_partitions == 1
    # the reason is that with only 1 partition we pass on a generator to predict_batch_json
    ddf_mapped: dd.DataFrame = ddf_mapped.repartition(npartitions=n_partitions).persist()
    ddf_mapped["annotation"] = ddf_mapped.map_partitions(annotate_batch, meta=(None, object))

    ddf_source = data_source.to_dataframe().repartition(npartitions=n_partitions).persist()
    # Keep only the unused values/columns as metadata
    ddf_source = ddf_source[[c for c in ddf_source.columns if c not in ddf_mapped.columns]]
    ddf_mapped["metadata"] = ddf_source.map_partitions(
        lambda df: helpers.stringify(sanitize(df.to_dict(orient="records"))))

    ddf = DaskElasticClient(
        host=elasticsearch.es_host, retry_on_timeout=True, http_compress=True
    ).save(ddf_mapped, index=elasticsearch.es_index, doc_type=elasticsearch.es_doc)

    elasticsearch.create_explore_data_index(force_delete=config.force_delete)
    elasticsearch.create_explore_data_record({
        **(config.metadata or {}),
        "datasource": data_source.source,
        # TODO: This should change when ui is normalized (action detail and action link naming)
        "explore_name": elasticsearch.es_index,
        "model": pipeline.name,
        "columns": ddf.columns.values.tolist(),
        "metadata_columns": data_source.to_dataframe().columns.values.tolist(),
        "pipeline": pipeline.type_name,
        "output": pipeline.output,
        "inputs": pipeline.inputs,  # backward compatibility
        "signature": pipeline.inputs + [pipeline.output],
        "predict_signature": pipeline.inputs,
        "labels": pipeline.head.labels,
        "task": pipeline.head.task_name().as_string(),
    })
    return ddf.persist()
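
The batching pattern above (repartition so that each partition roughly matches one prediction batch, then map_partitions with an object meta) can be reproduced in isolation with plain dask and pandas. The predict_batch function below is a stand-in for pipeline.predict_batch, not the real method:

import dask.dataframe as dd
import pandas as pd


def predict_batch(records):
    # Stand-in predictor: returns one prediction dict per input record
    return [{"label": "positive", "score": 0.9} for _ in records]


df = pd.DataFrame({"text": ["good", "bad", "fine", "meh"]})
ddf = dd.from_pandas(df, npartitions=1)

batch_size = 2
n_partitions = max(1, round(len(ddf) / batch_size))


def annotate_batch(pdf: pd.DataFrame) -> pd.Series:
    # Applies the (stand-in) batch prediction at partition level
    predictions = predict_batch(pdf.to_dict(orient="records"))
    return pd.Series(predictions, index=pdf.index)


ddf = ddf.repartition(npartitions=n_partitions).persist()
ddf["annotation"] = ddf.map_partitions(annotate_batch, meta=(None, object))
print(ddf.compute())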