    def test_flatten_json(self):
        file_path = os.path.join(FILES_PATH, "to-be-flattened.jsonl")
        ds = DataSource(format="json", flatten=True, source=file_path)
        df = ds.to_dataframe().compute()

        for c in ["persons.*.lastName", "persons.*.name"]:
            self.assertIn(c, df.columns, f"Expected {c} as column name")
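For context, a record shaped like the following would produce those wildcard column names after flattening; this is only a sketch of a plausible fixture, not the actual to-be-flattened.jsonl file:

import json

# Hypothetical JSONL record: a list of person objects per line, so flattening
# yields wildcard columns such as "persons.*.name" and "persons.*.lastName".
record = {
    "persons": [
        {"name": "Ada", "lastName": "Lovelace"},
        {"name": "Alan", "lastName": "Turing"},
    ]
}

with open("to-be-flattened.jsonl", "w") as f:
    f.write(json.dumps(record) + "\n")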
Example #2
    def test_read_parquet(self):
        file_path = os.path.join(FILES_PATH, "test.parquet")
        ds = DataSource(format="parquet", source=file_path)

        df = ds.to_dataframe().compute()
        self.assertTrue("reviewerID" in df.columns)
        self.assertTrue("path" in df.columns)
    def test_read_csv(self):
        file_path = os.path.join(FILES_PATH, "dataset_source.csv")

        datasource = DataSource(format="csv", source=file_path)
        data_frame = datasource.to_dataframe().compute()

        assert len(data_frame) > 0
        self.assertTrue("path" in data_frame.columns)
    def test_reader_csv_with_leading_and_trailing_spaces_in_examples(self):
        ds = DataSource(
            format="csv",
            source=os.path.join(FILES_PATH, "trailing_coma_in_headers.csv"),
            sep=";",
        )
        df = ds.to_dataframe().compute()
        self.assertIn("name", df.columns)
Example #5
def test_lazy_dataset_creation(pipeline_test: Pipeline, datasource_test: DataSource):
    df = datasource_test.to_dataframe()
    dataset = pipeline_test.create_dataset(datasource_test, lazy=True)
    assert isinstance(dataset, AllennlpLazyDataset)
    assert len([x for x in dataset]) == len(df.text)

    for instance in dataset:
        assert isinstance(instance, Instance)
        assert "text" in instance.fields
        assert "label" in instance.fields
    def test_flatten_nested_list(self):
        file_path = os.path.join(FILES_PATH, "nested-list.jsonl")

        ds = DataSource(format="json", flatten=True, source=file_path)
        df = ds.to_dataframe().compute()

        for c in [
                "classification.*.origin.*.key",
                "classification.*.origin.*.source"
        ]:
            self.assertIn(c, df.columns, f"Expected {c} as data column")
Example #7
def _explore(
    pipeline: Pipeline,
    data_source: DataSource,
    config: ExploreConfiguration,
    elasticsearch: ElasticsearchExplore,
) -> dd.DataFrame:
    """
    Executes a pipeline prediction over a datasource and register results int a elasticsearch index

    Parameters
    ----------
    pipeline
    data_source
    config
    elasticsearch

    Returns
    -------

    """
    if config.prediction_cache > 0:
        pipeline.init_prediction_cache(config.prediction_cache)

    ddf_mapped = data_source.to_mapped_dataframe()
    # Stringify input data for better elasticsearch index mapping integration,
    # avoiding properties with multiple value types (string and long,...)
    for column in ddf_mapped.columns:
        ddf_mapped[column] = ddf_mapped[column].apply(helpers.stringify)

    # this only really makes sense when a predict_batch_json method is implemented ...
    n_partitions = max(1, round(len(ddf_mapped) / config.batch_size))

    apply_func = pipeline.explain_batch if config.explain else pipeline.predict_batch

    def annotate_batch(df: pd.DataFrame):
        """Applies data annotation at batch level"""
        input_batch = df.to_dict(orient="records")
        predictions = apply_func(input_batch)
        return pd.Series(map(sanitize, predictions), index=df.index)

    # a persist is necessary here, otherwise it fails for n_partitions == 1
    # the reason is that with only 1 partition we pass on a generator to predict_batch_json
    ddf_mapped: dd.DataFrame = ddf_mapped.repartition(npartitions=n_partitions).persist()
    ddf_mapped["annotation"] = ddf_mapped.map_partitions(annotate_batch, meta=(None, object))

    ddf_source = data_source.to_dataframe().repartition(npartitions=n_partitions).persist()
    # Keep only the columns that are not used as pipeline inputs as metadata
    ddf_source = ddf_source[[c for c in ddf_source.columns if c not in ddf_mapped.columns]]
    ddf_mapped["metadata"] = ddf_source.map_partitions(
        lambda df: helpers.stringify(sanitize(df.to_dict(orient="records")))
    )

    ddf = DaskElasticClient(
        host=elasticsearch.es_host, retry_on_timeout=True, http_compress=True
    ).save(ddf_mapped, index=elasticsearch.es_index, doc_type=elasticsearch.es_doc)

    elasticsearch.create_explore_data_index(force_delete=config.force_delete)
    elasticsearch.create_explore_data_record({
        **(config.metadata or {}),
        "datasource": data_source.source,
        # TODO: This should change when ui is normalized (action detail and action link naming)
        "explore_name": elasticsearch.es_index,
        "model": pipeline.name,
        "columns": ddf.columns.values.tolist(),
        "metadata_columns": data_source.to_dataframe().columns.values.tolist(),
        "pipeline": pipeline.type_name,
        "output": pipeline.output,
        "inputs": pipeline.inputs,  # backward compatibility
        "signature": pipeline.inputs + [pipeline.output],
        "predict_signature": pipeline.inputs,
        "labels": pipeline.head.labels,
        "task": pipeline.head.task_name().as_string(),
    })
    return ddf.persist()
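The annotate_batch / map_partitions pattern above can be illustrated in isolation with plain dask and pandas; the toy annotation below is a hypothetical stand-in for the real predict_batch call:

import dask.dataframe as dd
import pandas as pd

def annotate_batch(df: pd.DataFrame) -> pd.Series:
    """Turn each pandas partition into a Series of per-row results."""
    records = df.to_dict(orient="records")
    results = [{"length": len(str(r))} for r in records]  # stand-in for predict_batch output
    return pd.Series(results, index=df.index)

# meta=(None, object) declares an unnamed, object-dtype Series as the output.
ddf = dd.from_pandas(pd.DataFrame({"text": ["a", "bb", "ccc"]}), npartitions=2)
ddf["annotation"] = ddf.map_partitions(annotate_batch, meta=(None, object))
print(ddf.compute())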