예제 #1
0
def test_from_json(resources_data_path):
    """Check the row counts produced by Dataset.from_json for several inputs.

    Covers a single JSON Lines path, a list containing that path twice, and a
    plain JSON file read with ``field="data"``.
    """
    jsonl_file = str(resources_data_path / "dataset_sequence.jsonl")

    single = Dataset.from_json(paths=jsonl_file)
    doubled = Dataset.from_json(paths=[jsonl_file] * 2)

    # Passing the same file twice is expected to double the number of rows.
    assert len(single) == 4
    assert len(doubled) == 8

    json_file = str(resources_data_path / "dataset_sequence.json")
    from_field = Dataset.from_json(paths=json_file, field="data")

    assert len(from_field) == 4
예제 #2
0
def test_flatten_json(resources_data_path):
    """Showcase the behavior of Dataset.flatten_ on two JSON Lines fixtures."""
    source_path = str(resources_data_path / "to-be-flattened.jsonl")
    flattened = Dataset.from_json(paths=source_path)
    flattened.flatten_()

    # After flattening, the dotted column names must be present.
    assert "complexData.a" in flattened.column_names
    assert "complexData.b" in flattened.column_names

    nested_path = str(resources_data_path / "nested-list.jsonl")
    nested = Dataset.from_json(paths=nested_path)
    nested.flatten_()

    # The nested-list fixture keeps its single row and single column.
    assert len(nested) == 1
    assert nested.column_names == ["classification"]
예제 #3
0
def dataset_from_path(path: str) -> Dataset:
    """Create a Dataset from a file, choosing the loader by file extension.

    The extension is compared case-insensitively, so ``data.CSV`` is handled
    like ``data.csv``.

    Args:
        path: Path to a ``.csv``, ``.json`` or ``.jsonl`` file.

    Returns:
        The result of ``Dataset.from_csv`` for csv files, or of
        ``Dataset.from_json`` for json/jsonl files.

    Raises:
        ValueError: If the file extension is not one of the supported formats.
    """
    # Normalize the case so upper- or mixed-case extensions are not rejected.
    file_extension = Path(path).suffix.lower()

    if file_extension == ".csv":
        return Dataset.from_csv(path)
    if file_extension in (".json", ".jsonl"):
        return Dataset.from_json(path)

    raise ValueError(
        f"Could not create a Dataset from '{path}'. "
        "We only support following formats: [csv, json, jsonl]"
    )
예제 #4
0
def dataset_from_path(path: str) -> Dataset:
    """Create a Dataset from a file, choosing the loader by file extension.

    The extension is compared case-insensitively. Supported inputs:

    * ``.csv`` -> ``Dataset.from_csv``
    * ``.json`` / ``.jsonl`` -> ``Dataset.from_json``
    * ``.yaml`` / ``.yml`` -> the file is read with ``yaml_to_dict`` and used
      to query Elasticsearch. The resulting dict must contain a ``"client"``
      mapping (passed as keyword arguments to ``Elasticsearch``) and an
      ``"index"`` entry; ``"query"`` is optional.

    Args:
        path: Path to the data file or to the yaml Elasticsearch description.

    Returns:
        The Dataset produced by the matching ``Dataset.from_*`` constructor.

    Raises:
        ValueError: If the file extension is not one of the supported formats.
        KeyError: If the yaml content lacks the ``"client"`` or ``"index"`` key.
    """
    # Normalize the case so upper- or mixed-case extensions are not rejected.
    file_extension = Path(path).suffix.lower()

    if file_extension == ".csv":
        return Dataset.from_csv(path)
    if file_extension in (".json", ".jsonl"):
        return Dataset.from_json(path)
    # yaml files are used for elasticsearch data
    if file_extension in (".yaml", ".yml"):
        from_es_kwargs = yaml_to_dict(path)
        client = Elasticsearch(**from_es_kwargs["client"])
        return Dataset.from_elasticsearch(
            client=client,
            index=from_es_kwargs["index"],
            query=from_es_kwargs.get("query"),
        )

    raise ValueError(
        f"Could not create a Dataset from '{path}'. "
        "We only support following formats: [csv, json, jsonl, yaml, yml]"
    )