def test_from_json(resources_data_path):
    json_path = str(resources_data_path / "dataset_sequence.jsonl")
    ds = Dataset.from_json(paths=json_path)
    ds2 = Dataset.from_json(paths=[json_path, json_path])

    assert len(ds) == 4
    assert len(ds2) == 8

    json_path = str(resources_data_path / "dataset_sequence.json")
    ds = Dataset.from_json(paths=json_path, field="data")

    assert len(ds) == 4
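# A hedged illustration of the two JSON layouts exercised above; the concrete record
# contents are assumptions, only the shapes are implied by the test. The .jsonl file
# holds one record per line, while the .json file nests its records under a top-level
# "data" key, which is why from_json is called with field="data" in that case.
#
#   dataset_sequence.jsonl (4 lines, one record per line):
#     {"text": "...", "label": "..."}
#
#   dataset_sequence.json:
#     {"data": [{"text": "...", "label": "..."}, ...]}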
def test_flatten_json(resources_data_path):
    """Showcases the behavior of Dataset.flatten_"""
    file_path = str(resources_data_path / "to-be-flattened.jsonl")
    dataset_flatten_source = Dataset.from_json(paths=file_path)
    dataset_flatten_source.flatten_()

    for column in ["complexData.a", "complexData.b"]:
        assert column in dataset_flatten_source.column_names

    file_path = str(resources_data_path / "nested-list.jsonl")
    dataset_nested_list = Dataset.from_json(paths=file_path)
    dataset_nested_list.flatten_()

    assert len(dataset_nested_list) == 1
    assert dataset_nested_list.column_names == ["classification"]
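# A minimal, self-contained sketch of the flatten_ behavior asserted above, written
# against a pytest tmp_path because the original "to-be-flattened.jsonl" resource is
# not shown here; the record content is an assumption for illustration only.
import json


def test_flatten_sketch(tmp_path):
    file_path = str(tmp_path / "nested.jsonl")
    with open(file_path, "w") as f:
        f.write(json.dumps({"text": "hello", "complexData": {"a": 1, "b": 2}}) + "\n")

    ds = Dataset.from_json(paths=file_path)
    ds.flatten_()

    # the nested "complexData" dict is expanded into dotted column names
    assert {"complexData.a", "complexData.b"}.issubset(set(ds.column_names))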
def dataset_from_path(path: str) -> Dataset:
    file_extension = Path(path).suffix

    if file_extension in [".csv"]:
        return Dataset.from_csv(path)
    elif file_extension in [".json", ".jsonl"]:
        return Dataset.from_json(path)
    else:
        raise ValueError(
            f"Could not create a Dataset from '{path}'. "
            f"We only support the following formats: [csv, json, jsonl]"
        )
def dataset_from_path(path: str) -> Dataset:
    file_extension = Path(path).suffix

    if file_extension in [".csv"]:
        return Dataset.from_csv(path)
    elif file_extension in [".json", ".jsonl"]:
        return Dataset.from_json(path)
    # YAML files are used for Elasticsearch data
    elif file_extension in [".yaml", ".yml"]:
        from_es_kwargs = yaml_to_dict(path)
        client = Elasticsearch(**from_es_kwargs["client"])
        return Dataset.from_elasticsearch(
            client=client,
            index=from_es_kwargs["index"],
            query=from_es_kwargs.get("query"),
        )
    else:
        raise ValueError(f"Could not create a Dataset from '{path}'")
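# A hedged usage sketch for dataset_from_path. The paths below are hypothetical and
# must exist on disk (with a reachable Elasticsearch instance for the YAML case);
# the YAML layout mirrors the keys the Elasticsearch branch reads from from_es_kwargs
# ("client", "index", "query"), for example:
#
#   client:
#     hosts: ["http://localhost:9200"]
#   index: my-index
#   query:
#     query:
#       match_all: {}


def _usage_sketch_dataset_from_path():
    csv_ds = dataset_from_path("data/train.csv")           # hypothetical path
    jsonl_ds = dataset_from_path("data/train.jsonl")        # hypothetical path
    es_ds = dataset_from_path("configs/elasticsearch.yml")  # hypothetical path
    return csv_ds, jsonl_ds, es_ds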