def test_from_elasticsearch(dataset, default_pipeline_config):
    """Round-trip a dataset through an explore index and read it back.

    Covers three read paths of ``Dataset.from_elasticsearch``:
    a match_all query, a query matching no documents, and a
    field-restricted read.
    """
    pipeline = Pipeline.from_config(default_pipeline_config)
    es_index = explore.create(
        pipeline, dataset, explore_id="test_index", show_explore=False
    )
    es_client = Elasticsearch()
    __wait_for_index_creation__(es_client, es_index)

    # match_all returns every record plus the ES metadata columns
    ds = Dataset.from_elasticsearch(
        es_client, index=es_index, query={"query": {"match_all": {}}}
    )
    assert len(ds) == len(dataset)
    for meta_column in ("_id", "_index", "_type", "_score"):
        assert meta_column in ds.column_names

    # a query matching no documents yields an empty dataset
    ds = Dataset.from_elasticsearch(
        es_client,
        index=es_index,
        query={"query": {"exists": {"field": "not_found.field"}}},
    )
    assert len(ds) == 0

    # restricting ``fields`` keeps only the requested columns
    ds = Dataset.from_elasticsearch(
        es_client, index=es_index, fields=["label", "text", "_id"]
    )
    assert len(ds) == len(dataset)
    for wanted_column in ("label", "text", "_id"):
        assert wanted_column in ds.column_names
    assert "prediction" not in ds.column_names
def dataset_from_path(path: str) -> Dataset:
    """Create a ``Dataset`` from a file path, dispatching on the extension.

    Supported extensions (matched case-insensitively, so ``.CSV`` works too):
      - ``.csv``: loaded via ``Dataset.from_csv``
      - ``.json`` / ``.jsonl``: loaded via ``Dataset.from_json``
      - ``.yaml`` / ``.yml``: a YAML spec describing an Elasticsearch source;
        it must provide a ``client`` mapping (``Elasticsearch`` constructor
        kwargs) and an ``index``, with an optional ``query``.

    Args:
        path: Path to the data file.

    Returns:
        The loaded ``Dataset``.

    Raises:
        ValueError: If the file extension is not one of the supported ones.
    """
    # Lower-case the suffix so upper-case extensions are accepted as well.
    file_extension = Path(path).suffix.lower()
    if file_extension == ".csv":
        return Dataset.from_csv(path)
    if file_extension in (".json", ".jsonl"):
        return Dataset.from_json(path)
    # yaml files are used for elasticsearch data
    if file_extension in (".yaml", ".yml"):
        from_es_kwargs = yaml_to_dict(path)
        client = Elasticsearch(**from_es_kwargs["client"])
        return Dataset.from_elasticsearch(
            client=client,
            index=from_es_kwargs["index"],
            query=from_es_kwargs.get("query"),
        )
    raise ValueError(f"Could not create a Dataset from '{path}'")