Example #1
def test_fail_using_reserved_words():
    # This issue was reported and may get resolved: https://github.com/huggingface/datasets/issues/1110

    ds = Dataset.from_dict({
        "a": [{
            "a": 1,
            "b": "two"
        } for _ in range(0, 100)],
        "_type": ["whatever" for _ in range(0, 100)],
    })
    split = ds.train_test_split()
    new_ds = Dataset.from_datasets(split.values())

    assert len(new_ds) == len(ds)

    ds = ds.map(
        lambda example: {
            "new_field": {
                "c": str(example["a"]["a"]),
                "d": f"this {example['a']['b']}"
            }
        })
    split = ds.train_test_split()
    with pytest.raises(TypeError):
        Dataset.from_datasets(split.values())
Example #2
def test_load_dataset(resources_data_path):
    json_path = str(resources_data_path / "dataset_sequence.jsonl")

    with pytest.raises(TypeError):
        Dataset.load_dataset("json", data_files=[json_path])

    ds = Dataset.load_dataset("json", data_files=[json_path], split="train")
    assert len(ds) == 4
Example #3
def test_from_csv(resources_data_path):
    csv_path = str(resources_data_path / "business.cat.2k.valid.csv")
    ds = Dataset.from_csv(paths=csv_path)
    ds2 = Dataset.from_csv(paths=[csv_path, csv_path])

    assert len(ds) == 400
    assert len(ds2) == 800
    assert ds.dataset.column_names == ["label", "text"]
Example #4
def train_valid_dataset(resources_data_path) -> Tuple[Dataset, Dataset]:
    """Returns both training and validation datasets"""

    training_ds = Dataset.from_csv(paths=str(resources_data_path /
                                             "business.cat.2k.train.csv"))
    validation_ds = Dataset.from_csv(paths=str(resources_data_path /
                                               "business.cat.2k.valid.csv"))

    return training_ds, validation_ds
Example #5
def dataset_from_path(path: str) -> Dataset:
    file_extension = Path(path).suffix
    if file_extension in [".csv"]:
        return Dataset.from_csv(path)
    elif file_extension in [".json", ".jsonl"]:
        return Dataset.from_json(path)
    else:
        raise ValueError(
            f"Could not create a Dataset from '{path}'. "
            f"We only support following formats: [csv, json, jsonl]"
        )
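A minimal usage sketch of this helper, with hypothetical file paths (not part of the original examples):

train_ds = dataset_from_path("data/train.csv")    # dispatches to Dataset.from_csv
valid_ds = dataset_from_path("data/valid.jsonl")  # dispatches to Dataset.from_json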
Example #6
def test_from_json(resources_data_path):
    json_path = str(resources_data_path / "dataset_sequence.jsonl")
    ds = Dataset.from_json(paths=json_path)
    ds2 = Dataset.from_json(paths=[json_path, json_path])

    assert len(ds) == 4
    assert len(ds2) == 8

    json_path = str(resources_data_path / "dataset_sequence.json")
    ds = Dataset.from_json(paths=json_path, field="data")

    assert len(ds) == 4
Example #7
def dataset(tmp_path) -> Dataset:
    data = {
        "text":
        ["A common text", "This is why you get", "Seriosly?, I'm not sure"],
        "label": ["one", "zero", "zero"],
    }
    ds = Dataset.from_dict(data)

    # we save and load it here to be able to lazily read from it
    ds_path = tmp_path / "test_pipeline_datasets" / "dataset"
    ds.save_to_disk(str(ds_path))

    return Dataset.load_from_disk(str(ds_path))
Example #8
def test_flatten_json(resources_data_path):
    """Showcases the behavior of Dataset.flatten_"""
    file_path = str(resources_data_path / "to-be-flattened.jsonl")
    dataset_flatten_source = Dataset.from_json(paths=file_path)
    dataset_flatten_source.flatten_()

    for c in ["complexData.a", "complexData.b"]:
        assert c in dataset_flatten_source.column_names

    file_path = str(resources_data_path / "nested-list.jsonl")
    dataset_nested_list = Dataset.from_json(paths=file_path)
    dataset_nested_list.flatten_()

    assert len(dataset_nested_list) == 1
    assert dataset_nested_list.column_names == ["classification"]
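For reference, flatten_() promotes nested fields to dotted top-level columns. A minimal sketch of that behavior on an in-memory dataset, using hypothetical data and only calls already shown in these examples:

ds = Dataset.from_dict({"complexData": [{"a": 1, "b": "x"}]})
ds.flatten_()
assert sorted(ds.column_names) == ["complexData.a", "complexData.b"]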
Example #9
def train_data_source() -> Dataset:
    """Creates the training dataset"""
    resources_path = Path(__file__).parent.parent / "resources" / "data"
    training_ds = Dataset.from_csv(paths=str(resources_path /
                                             "business.cat.2k.train.csv"))

    return training_ds
Example #10
def test_extending_vocab_with_weights_file(pipeline_config, dataset, dataset2,
                                           deactivate_pipeline_trainer,
                                           caplog):
    pipeline = Pipeline.from_config(pipeline_config)
    # create vocab
    pipeline.train(
        output="dummy",
        training=dataset,
    )

    # extending the vocab with the weights file available should apply the pretrained weights
    pipeline.train(
        output="dummy",
        training=dataset2,
    )
    instance = pipeline.head.featurize("this")
    instance.index_fields(pipeline.vocab)

    assert_allclose(
        pipeline.backbone.embedder(instance.as_tensor_dict()["text"]),
        torch.tensor([[0.25, 0.75]]),
    )

    # extending the vocab with the weights file deleted should trigger a warning
    logging.captureWarnings(True)
    Path(pipeline_config["features"]["word"]["weights_file"]).unlink()
    pipeline.train(
        output="dummy",
        training=Dataset.from_dict({
            "text": ["that"],
            "label": ["good"]
        }),
    )
    assert caplog.records[0].module == "embedding"
    assert "cannot locate the pretrained_file" in caplog.records[0].message
Example #11
def test_from_pandas():
    df = pd.DataFrame({"a": [1, 2, 3], "b": [4, 5, 6]})
    ds = Dataset.from_pandas(df)

    assert ds.dataset.column_names == ["a", "b"]
    assert ds["a"] == [1, 2, 3]
    assert len(ds) == 3
Example #12
def dataset():
    return Dataset.from_dict(
        {
            "text": ["this is", "a test"],
            "label": ["a", "b"],
        }
    )
Example #13
def training_dataset() -> Dataset:
    """Creates the training dataset."""
    data = {
        "record1": [
            {
                "@first_name": "Hans",
                "@last_name": "Peter"
            },
            {
                "@first_name": "Heinrich",
                "@last_name": "Meier"
            },
            {
                "@first_name": "Hans",
                "@last_name": "Peter"
            },
        ],
        "record2": [
            {
                "@first_name": "Hans",
                "@last_name": "Petre"
            },
            {
                "@first_name": "Heinz",
                "@last_name": "Meier"
            },
            {
                "@first_name": "Hansel",
                "@last_name": "Peter"
            },
        ],
        "label": ["duplicate", "not_duplicate", "duplicate"],
    }

    return Dataset.from_dict(data)
Example #14
def dataset() -> Dataset:
    data = {
        "text": ["Test", "this", "shaight", "!"],
        "label": ["0", "1", "0", "0"]
    }

    return Dataset.from_dict(data)
Example #15
def test_from_parquet_file(resources_data_path):
    """This only shows an example of how one could read in a parquet file"""
    file_path = resources_data_path / "test.parquet"
    df = pd.read_parquet(file_path)
    dataset = Dataset.from_pandas(df)

    assert "reviewerID" in dataset.column_names
Example #16
def test_extending_vocab_with_weights_file(pipeline_config, dataset, dataset2,
                                           capsys, caplog):
    pipeline = Pipeline.from_config(pipeline_config)
    # create vocab
    pipeline.create_vocab([dataset.to_instances(pipeline)])

    # extending the vocab with the weights file available should apply the pretrained weights
    pipeline.create_vocab([dataset2.to_instances(pipeline)])

    instance = pipeline.head.featurize("this")
    instance.index_fields(pipeline.vocab)

    assert_allclose(
        pipeline.backbone.embedder(instance.as_tensor_dict()["text"]),
        torch.tensor([[0.25, 0.75]]),
    )

    # extending the vocab with the weights file deleted should trigger a warning
    Path(pipeline_config["features"]["word"]["weights_file"]).unlink()
    ds = Dataset.from_dict({"text": ["that"], "label": ["good"]})
    pipeline.create_vocab([ds.to_instances(pipeline)])

    assert caplog.record_tuples[-1][0] == "allennlp.modules.token_embedders.embedding"
    assert caplog.record_tuples[-1][1] == 30  # logging.WARNING
    assert (
        "Embedding at model_path, "
        "_head.backbone.embedder.token_embedder_word cannot locate the pretrained_file."
        in caplog.record_tuples[-1][2])
Example #17
def dataset_from_path(path: str) -> Dataset:
    file_extension = Path(path).suffix
    if file_extension in [".csv"]:
        return Dataset.from_csv(path)
    elif file_extension in [".json", ".jsonl"]:
        return Dataset.from_json(path)
    # yaml files are used for elasticsearch data
    elif file_extension in [".yaml", ".yml"]:
        from_es_kwargs = yaml_to_dict(path)
        client = Elasticsearch(**from_es_kwargs["client"])
        return Dataset.from_elasticsearch(
            client=client,
            index=from_es_kwargs["index"],
            query=from_es_kwargs.get("query"),
        )
    else:
        raise ValueError(f"Could not create a Dataset from '{path}'")
Example #18
def training_dataset() -> Dataset:
    """Creating the dataframe."""
    data = {
        "text": [
            "this is a text",
            "my name is dani",
            "this is a table",
            "my name is paco",
        ],
    }
    return Dataset.from_dict(data)
Example #19
def dataset(tmp_path_factory, request):
    cache_path = tmp_path_factory.mktemp("test_instance_caching")
    data_path = cache_path / "data.csv"
    data_path.write_text("text,label\ncheck,this\nand,that")

    dataset = Dataset.from_csv(paths=str(data_path), cache_dir=str(cache_path))

    # inject to classes decorated with `pytest.mark.usefixtures`
    if request.cls:
        request.cls.dataset = dataset

    return dataset
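As the comment notes, the fixture attaches itself to test classes marked with pytest.mark.usefixtures. A minimal sketch of such a consumer (the class name is hypothetical):

@pytest.mark.usefixtures("dataset")
class TestInstanceCaching:
    def test_row_count(self):
        # `self.dataset` was injected by the fixture above; data.csv holds two rows
        assert len(self.dataset) == 2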
Example #20
def train_dataset() -> Dataset:
    """Creates the training dataset"""
    source = (
        Path(__file__).parent.parent
        / "resources"
        / "data"
        / "emotions_with_transformers.txt"
    )

    train_dataset = Dataset.from_csv(
        paths=str(source), delimiter=";", column_names=["text", "label"]
    )
    return train_dataset
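Given delimiter=";" and the explicit column_names, the source file is presumably a headerless, semicolon-delimited text file. A minimal sketch that writes an equivalent file with hypothetical content and loads it the same way:

sample_path = Path("sample_emotions.txt")
sample_path.write_text("this made me happy;joy\nthis made me angry;anger\n")
sample_ds = Dataset.from_csv(
    paths=str(sample_path), delimiter=";", column_names=["text", "label"]
)
assert len(sample_ds) == 2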
Example #21
def training_dataset() -> Dataset:
    """Creates the training dataset and gives the structure"""
    resources_path = (
        Path(__file__).parent.parent.parent / "tests" / "resources" / "data"
    )
    training_ds = Dataset.from_csv(paths=str(resources_path / "dataset_source.csv"))

    # Keep only a 'label' column (from 'job') and a 'text' column (from 'education' and 'marital')
    training_ds = training_ds.map(
        lambda x: {"label": x["job"], "text": x["education"] + " " + x["marital"]},
    )

    return training_ds
Example #22
def test_from_elasticsearch(dataset, default_pipeline_config):
    pipeline = Pipeline.from_config(default_pipeline_config)
    es_index = explore.create(pipeline,
                              dataset,
                              explore_id="test_index",
                              show_explore=False)
    es_client = Elasticsearch()
    __wait_for_index_creation__(es_client, es_index)
    ds = Dataset.from_elasticsearch(es_client,
                                    index=es_index,
                                    query={"query": {
                                        "match_all": {}
                                    }})

    assert len(ds) == len(dataset)
    for key in ["_id", "_index", "_type", "_score"]:
        assert key in ds.column_names

    ds = Dataset.from_elasticsearch(
        es_client,
        index=es_index,
        query={"query": {
            "exists": {
                "field": "not_found.field"
            }
        }},
    )
    assert len(ds) == 0

    ds = Dataset.from_elasticsearch(es_client,
                                    index=es_index,
                                    fields=["label", "text", "_id"])
    assert len(ds) == len(dataset)
    assert "label" in ds.column_names
    assert "text" in ds.column_names
    assert "_id" in ds.column_names
    assert "prediction" not in ds.column_names
Example #23
def test_from_excel_file(resources_data_path):
    """This only shows an example of how one could read in an excel file"""
    str_value = Value("string")
    int_value = Value("int64")
    features = Features(Notification=int_value,
                        Type=str_value,
                        Plant=int_value,
                        Serial=str_value)

    file_path = resources_data_path / "test.xlsx"
    df = pd.read_excel(file_path)

    dataset = Dataset.from_pandas(df, features=features)

    assert len(dataset) > 0
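Features and Value here presumably come from the datasets library (the imports are not shown). Under that assumption, the same schema could also be declared from a mapping:

from datasets import Features, Value  # assumed source of these helpers

features = Features({
    "Notification": Value("int64"),
    "Type": Value("string"),
    "Plant": Value("int64"),
    "Serial": Value("string"),
})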
Example #24
def training_dataset() -> Dataset:
    df = pd.DataFrame(
        {
            "text": [
                "This is a simple NER test",
                "This is a simple NER test with misaligned spans",
                "No NER here",
            ],
            "entities": [
                [{"start": 17, "end": 20, "label": "NER"}],
                [{"start": 17, "end": 22, "label": "NER"}],
                [],
            ],
        }
    )

    return Dataset.from_pandas(df)
Example #25
def test_trainer_configs(configurations_path):
    configs = _read_configs(configurations_path, "Trainer")
    pipeline = Pipeline.from_config(
        {
            "name": "test",
            "head": {"type": "TextClassification", "labels": ["pos", "neg"]},
        }
    )
    dataset = Dataset.from_dict({"text": ["test"], "label": ["pos"]})
    linear = nn.Linear(2, 2)
    for config_name, config in configs.items():
        assert isinstance(config, TrainerConfiguration)

        trainer = Trainer(
            pipeline=pipeline, train_dataset=dataset, trainer_config=config
        )
        assert isinstance(trainer.trainer, pytorch_lightning.Trainer)
Example #26
def training_dataset() -> Dataset:
    """Creating the dataframe."""
    data = {
        "text": [
            "The most common audits were about waste and recycling.",
            "The company fabricates plastic chairs.",
        ],
        "entities": [
            [
                {
                    "start": 34,
                    "end": 39,
                    "label": "PN",
                    "text": "waste"
                },
                {
                    "start": 16,
                    "end": 22,
                    "label": "QTY",
                    "text": "audits"
                },
            ],
            [
                {
                    "start": 4,
                    "end": 11,
                    "label": "OBJECT",
                    "text": "company"
                },
                {
                    "start": 31,
                    "end": 37,
                    "label": "SUBJECT",
                    "text": "chairs"
                },
            ],
        ],
        "label": ["Message-Topic(e1,e2)", "Product-Producer(e2,e1)"],
    }

    return Dataset.from_dict(data)
Example #27
def dataset(resources_data_path) -> Dataset:
    return Dataset.from_csv(paths=str(resources_data_path /
                                      "business.cat.2k.valid.csv"))
Example #28
def dataset() -> Dataset:
    """Creating the dataset"""
    data = {"text": ["This is a simple test"], "label": ["a"]}
    return Dataset.from_dict(data)
Example #29
def dataset2() -> Dataset:
    data = {"text": ["this"], "label": ["good"]}
    return Dataset.from_dict(data)
Example #30
def test_from_dict():
    ds = Dataset.from_dict({"a": [1, 2, 3], "b": [4, 5, 6]})

    assert ds.dataset.column_names == ["a", "b"]
    assert ds["a"] == [1, 2, 3]
    assert len(ds) == 3