Example #1
0
def test_from_csv(resources_data_path):
    """`Dataset.from_csv` accepts a single path as well as a list of paths."""
    valid_csv = str(resources_data_path / "business.cat.2k.valid.csv")

    single = Dataset.from_csv(paths=valid_csv)
    doubled = Dataset.from_csv(paths=[valid_csv, valid_csv])

    # Passing the same file twice should simply concatenate the rows.
    assert len(single) == 400
    assert len(doubled) == 800
    assert single.dataset.column_names == ["label", "text"]
Example #2
0
def train_valid_dataset(resources_data_path) -> Tuple[Dataset, Dataset]:
    """Returns both training and validation datasets"""
    train_path = resources_data_path / "business.cat.2k.train.csv"
    valid_path = resources_data_path / "business.cat.2k.valid.csv"

    return (
        Dataset.from_csv(paths=str(train_path)),
        Dataset.from_csv(paths=str(valid_path)),
    )
Example #3
0
def train_data_source() -> Dataset:
    """Creates the training dataset"""
    # Test resources live two directories above this file.
    data_dir = Path(__file__).parent.parent / "resources" / "data"
    return Dataset.from_csv(paths=str(data_dir / "business.cat.2k.train.csv"))
Example #4
0
def dataset_from_path(path: str) -> Dataset:
    """Create a `Dataset` from a file, dispatching on its extension.

    Supports csv, json and jsonl files; raises ``ValueError`` otherwise.
    """
    suffix = Path(path).suffix
    if suffix == ".csv":
        return Dataset.from_csv(path)
    if suffix in (".json", ".jsonl"):
        return Dataset.from_json(path)
    raise ValueError(
        f"Could not create a Dataset from '{path}'. "
        f"We only support following formats: [csv, json, jsonl]"
    )
Example #5
0
def dataset(tmp_path_factory, request):
    """Build a tiny two-row csv-backed dataset cached in a temp directory."""
    cache_dir = tmp_path_factory.mktemp("test_instance_caching")
    csv_file = cache_dir / "data.csv"
    csv_file.write_text("text,label\ncheck,this\nand,that")

    ds = Dataset.from_csv(paths=str(csv_file), cache_dir=str(cache_dir))

    # inject to classes decorated with `pytest.mark.usefixtures`
    if request.cls:
        request.cls.dataset = ds

    return ds
Example #6
0
def training_dataset() -> Dataset:
    """Creates the training dataset and gives the structure"""
    data_dir = Path(__file__).parent.parent.parent / "tests" / "resources" / "data"
    ds = Dataset.from_csv(paths=str(data_dir / "dataset_source.csv"))

    # Keeping just 'label' and text 'category'
    def _relabel(row):
        return {"label": row["job"], "text": row["education"] + " " + row["marital"]}

    return ds.map(_relabel)
Example #7
0
def train_dataset() -> Dataset:
    """Creates the training dataset"""
    data_file = (
        Path(__file__).parent.parent / "resources" / "data"
        / "emotions_with_transformers.txt"
    )
    # The txt file is ';'-delimited and carries no header row, so the
    # column names are supplied explicitly.
    return Dataset.from_csv(
        paths=str(data_file), delimiter=";", column_names=["text", "label"]
    )
Example #8
0
def dataset_from_path(path: str) -> Dataset:
    """Create a `Dataset` from a file, dispatching on its extension.

    Supports csv, json/jsonl and yaml/yml (elasticsearch spec) files;
    raises ``ValueError`` otherwise.
    """
    suffix = Path(path).suffix
    if suffix == ".csv":
        return Dataset.from_csv(path)
    if suffix in (".json", ".jsonl"):
        return Dataset.from_json(path)
    # yaml files are used for elasticsearch data
    if suffix in (".yaml", ".yml"):
        es_kwargs = yaml_to_dict(path)
        return Dataset.from_elasticsearch(
            client=Elasticsearch(**es_kwargs["client"]),
            index=es_kwargs["index"],
            query=es_kwargs.get("query"),
        )
    raise ValueError(f"Could not create a Dataset from '{path}'")
Example #9
0
def dataset(resources_data_path) -> Dataset:
    """Load the 2k-row validation csv as a `Dataset`."""
    valid_csv = resources_data_path / "business.cat.2k.valid.csv"
    return Dataset.from_csv(paths=str(valid_csv))