Ejemplo n.º 1
0
def test_from_elasticsearch(dataset, default_pipeline_config):
    """Check ``Dataset.from_elasticsearch`` against an index built by ``explore.create``.

    Three reads of the same index are verified:

    1. a ``match_all`` query returns as many rows as ``dataset`` and exposes
       the ``_id``, ``_index``, ``_type`` and ``_score`` columns;
    2. an ``exists`` query on a field that is not in the index returns nothing;
    3. passing ``fields`` keeps the requested columns and leaves
       ``prediction`` out.
    """
    index_name = explore.create(
        Pipeline.from_config(default_pipeline_config),
        dataset,
        explore_id="test_index",
        show_explore=False,
    )
    client = Elasticsearch()
    # Do not query before the index is reported as created.
    __wait_for_index_creation__(client, index_name)

    # 1) Unfiltered read: every record plus the Elasticsearch metadata columns.
    match_all_query = {"query": {"match_all": {}}}
    everything = Dataset.from_elasticsearch(
        client, index=index_name, query=match_all_query
    )
    assert len(ds_rows := everything) == len(dataset)
    for meta_column in ("_id", "_index", "_type", "_score"):
        assert meta_column in ds_rows.column_names

    # 2) Query on a field that is absent from the index: empty dataset.
    missing_field_query = {"query": {"exists": {"field": "not_found.field"}}}
    nothing = Dataset.from_elasticsearch(
        client,
        index=index_name,
        query=missing_field_query,
    )
    assert len(nothing) == 0

    # 3) Column selection through ``fields``.
    requested = ["label", "text", "_id"]
    projected = Dataset.from_elasticsearch(
        client, index=index_name, fields=requested
    )
    assert len(projected) == len(dataset)
    assert "label" in projected.column_names
    assert "text" in projected.column_names
    assert "_id" in projected.column_names
    assert "prediction" not in projected.column_names
Ejemplo n.º 2
0
def dataset_from_path(path: str) -> Dataset:
    """Build a ``Dataset`` from *path*, dispatching on the file extension.

    The extension is compared case-insensitively, so ``data.CSV`` is treated
    like ``data.csv``.

    * ``.csv`` is loaded with ``Dataset.from_csv``.
    * ``.json`` / ``.jsonl`` are loaded with ``Dataset.from_json``.
    * ``.yaml`` / ``.yml`` are read with ``yaml_to_dict`` and describe an
      Elasticsearch source: ``client`` holds the keyword arguments for
      ``Elasticsearch``, ``index`` names the index, and ``query`` is optional.

    Args:
        path: Location of the data file or of the Elasticsearch YAML
            description.

    Returns:
        The dataset produced by the matching ``Dataset`` constructor.

    Raises:
        ValueError: If the extension is none of the supported ones.
        KeyError: Presumably when the YAML description lacks ``client`` or
            ``index`` (assumes ``yaml_to_dict`` returns a plain dict --
            TODO confirm).
    """
    # Normalize case so upper-case extensions (".CSV", ".JSON", ".YAML")
    # are not rejected as unsupported.
    file_extension = Path(path).suffix.lower()

    if file_extension == ".csv":
        return Dataset.from_csv(path)

    if file_extension in (".json", ".jsonl"):
        return Dataset.from_json(path)

    # yaml files are used for elasticsearch data
    if file_extension in (".yaml", ".yml"):
        from_es_kwargs = yaml_to_dict(path)
        client = Elasticsearch(**from_es_kwargs["client"])
        return Dataset.from_elasticsearch(
            client=client,
            index=from_es_kwargs["index"],
            query=from_es_kwargs.get("query"),
        )

    raise ValueError(f"Could not create a Dataset from '{path}'")