Example #1
def test_fail_using_reserved_words():
    # This issue was reported upstream and may get resolved: https://github.com/huggingface/datasets/issues/1110

    ds = Dataset.from_dict({
        "a": [{
            "a": 1,
            "b": "two"
        } for _ in range(0, 100)],
        "_type": ["whatever" for _ in range(0, 100)],
    })
    split = ds.train_test_split()
    new_ds = Dataset.from_datasets(split.values())

    assert len(new_ds) == len(ds)

    ds = ds.map(
        lambda example: {
            "new_field": {
                "c": str(example["a"]["a"]),
                "d": f"this {example['a']['b']}"
            }
        })
    split = ds.train_test_split()
    with pytest.raises(TypeError):
        Dataset.from_datasets(split.values())
Example #2
def dataset() -> Dataset:
    data = {
        "text": ["Test", "this", "shaight", "!"],
        "label": ["0", "1", "0", "0"]
    }

    return Dataset.from_dict(data)
def test_extending_vocab_with_weights_file(pipeline_config, dataset, dataset2,
                                           capsys, caplog):
    pipeline = Pipeline.from_config(pipeline_config)
    # create vocab
    pipeline.create_vocab([dataset.to_instances(pipeline)])

    # extending the vocab with the weights file available should apply the pretrained weights
    pipeline.create_vocab([dataset2.to_instances(pipeline)])

    instance = pipeline.head.featurize("this")
    instance.index_fields(pipeline.vocab)

    assert_allclose(
        pipeline.backbone.embedder(instance.as_tensor_dict()["text"]),
        torch.tensor([[0.25, 0.75]]),
    )

    # extending the vocab with the weights file deleted should trigger a warning
    Path(pipeline_config["features"]["word"]["weights_file"]).unlink()
    ds = Dataset.from_dict({"text": ["that"], "label": ["good"]})
    pipeline.create_vocab([ds.to_instances(pipeline)])

    assert caplog.record_tuples[-1][0] == "allennlp.modules.token_embedders.embedding"
    assert caplog.record_tuples[-1][1] == 30  # logging.WARNING
    assert (
        "Embedding at model_path, "
        "_head.backbone.embedder.token_embedder_word cannot locate the pretrained_file."
        in caplog.record_tuples[-1][2])
Example #4
def test_extending_vocab_with_weights_file(pipeline_config, dataset, dataset2,
                                           deactivate_pipeline_trainer,
                                           caplog):
    pipeline = Pipeline.from_config(pipeline_config)
    # create vocab
    pipeline.train(
        output="dummy",
        training=dataset,
    )

    # extending the vocab with the weights file available should apply the pretrained weights
    pipeline.train(
        output="dummy",
        training=dataset2,
    )
    instance = pipeline.head.featurize("this")
    instance.index_fields(pipeline.vocab)

    assert_allclose(
        pipeline.backbone.embedder(instance.as_tensor_dict()["text"]),
        torch.tensor([[0.25, 0.75]]),
    )

    # extending the vocab with the weights file deleted should trigger a warning
    logging.captureWarnings(True)
    Path(pipeline_config["features"]["word"]["weights_file"]).unlink()
    pipeline.train(
        output="dummy",
        training=Dataset.from_dict({
            "text": ["that"],
            "label": ["good"]
        }),
    )
    assert caplog.records[0].module == "embedding"
    assert "cannot locate the pretrained_file" in caplog.records[0].message
def dataset():
    return Dataset.from_dict(
        {
            "text": ["this is", "a test"],
            "label": ["a", "b"],
        }
    )
def training_dataset() -> Dataset:
    """Creating the dataframe."""
    data = {
        "record1": [
            {
                "@first_name": "Hans",
                "@last_name": "Peter"
            },
            {
                "@first_name": "Heinrich",
                "@last_name": "Meier"
            },
            {
                "@first_name": "Hans",
                "@last_name": "Peter"
            },
        ],
        "record2": [
            {
                "@first_name": "Hans",
                "@last_name": "Petre"
            },
            {
                "@first_name": "Heinz",
                "@last_name": "Meier"
            },
            {
                "@first_name": "Hansel",
                "@last_name": "Peter"
            },
        ],
        "label": ["duplicate", "not_duplicate", "duplicate"],
    }

    return Dataset.from_dict(data)
Example #7
def training_dataset() -> Dataset:
    """Creating the dataframe."""
    data = {
        "text": [
            "this is a text",
            "my name is dani",
            "this is a table",
            "my name is paco",
        ],
    }
    return Dataset.from_dict(data)
Example #8
def dataset(tmp_path) -> Dataset:
    data = {
        "text":
        ["A common text", "This is why you get", "Seriosly?, I'm not sure"],
        "label": ["one", "zero", "zero"],
    }
    ds = Dataset.from_dict(data)

    # we save and load it here to be able to lazily read from it
    ds_path = tmp_path / "test_pipeline_datasets" / "dataset"
    ds.save_to_disk(str(ds_path))

    return Dataset.load_from_disk(str(ds_path))
def test_trainer_configs(configurations_path):
    configs = _read_configs(configurations_path, "Trainer")
    pipeline = Pipeline.from_config(
        {
            "name": "test",
            "head": {"type": "TextClassification", "labels": ["pos", "neg"]},
        }
    )
    dataset = Dataset.from_dict({"text": ["test"], "label": ["pos"]})
    linear = nn.Linear(2, 2)
    for config_name, config in configs.items():
        assert isinstance(config, TrainerConfiguration)

        trainer = Trainer(
            pipeline=pipeline, train_dataset=dataset, trainer_config=config
        )
        assert isinstance(trainer.trainer, pytorch_lightning.Trainer)
def training_dataset() -> Dataset:
    """Creating the dataframe."""
    data = {
        "text": [
            "The most common audits were about waste and recycling.",
            "The company fabricates plastic chairs.",
        ],
        "entities": [
            [
                {
                    "start": 34,
                    "end": 39,
                    "label": "PN",
                    "text": "waste"
                },
                {
                    "start": 16,
                    "end": 22,
                    "label": "QTY",
                    "text": "audits"
                },
            ],
            [
                {
                    "start": 4,
                    "end": 11,
                    "label": "OBJECT",
                    "text": "company"
                },
                {
                    "start": 31,
                    "end": 37,
                    "label": "SUBJECT",
                    "text": "chairs"
                },
            ],
        ],
        "label": ["Message-Topic(e1,e2)", "Product-Producer(e2,e1)"],
    }

    return Dataset.from_dict(data)
Example #11
def test_from_dict():
    ds = Dataset.from_dict({"a": [1, 2, 3], "b": [4, 5, 6]})

    assert ds.dataset.column_names == ["a", "b"]
    assert ds["a"] == [1, 2, 3]
    assert len(ds) == 3
def dataset2() -> Dataset:
    data = {"text": ["this"], "label": ["good"]}
    return Dataset.from_dict(data)
Example #13
def dataset() -> Dataset:
    """Creating the dataset"""
    data = {"text": ["This is a simple test"], "label": ["a"]}
    return Dataset.from_dict(data)
Example #14
def dataset() -> Dataset:
    data = {
        "text": ["test", "this", "shaight", "good"],
        "label": ["good", "good", "bad", "good"],
    }
    return Dataset.from_dict(data)
Example #15
def valid_dataset():
    data = {
        "text": ["and what about the validation", "do not forget this one"],
        "label": ["bad", "good"],
    }
    return Dataset.from_dict(data)
Example #16
def train_dataset():
    data = {
        "text": ["this is a test", "and another one"],
        "label": ["good", "bad"]
    }
    return Dataset.from_dict(data)
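
Taken together, these examples share one pattern: build an in-memory Dataset with Dataset.from_dict, then hand it to a Pipeline, either via create_vocab(...to_instances(...)) or via train(...). Below is a minimal sketch of that flow; the biome.text import path and the exact pipeline config are assumptions based on the snippets above, not verified against the library.

# Sketch only: assumes Dataset and Pipeline are importable from biome.text and
# that the minimal config from the Trainer example above is sufficient.
from biome.text import Dataset, Pipeline

pipeline = Pipeline.from_config(
    {
        "name": "sketch",
        "head": {"type": "TextClassification", "labels": ["good", "bad"]},
    }
)

train_ds = Dataset.from_dict(
    {"text": ["this is a test", "and another one"], "label": ["good", "bad"]}
)

# Build the vocabulary from the dataset's instances, as the vocab tests do ...
pipeline.create_vocab([train_ds.to_instances(pipeline)])

# ... or train directly on the Dataset, as the training tests do.
pipeline.train(output="sketch_output", training=train_ds)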