Esempio n. 1
0
def test_apply_(example_data):
    """Exercise the in-place ``Dataset.apply_`` with the label-upcasing operation.

    Asserts that the sorted annotation-type keys reported by ``get_ner_stats``
    go from the mixed-case list to the upper-case-only list, and that exactly
    one completed operation with three ``EXAMPLE_CHANGED`` transformations is
    present on the dataset afterwards.
    """
    operation_name = "recon.v1.upcase_labels"
    dataset = Dataset("train", example_data["train"])

    stats_before: NERStats = cast(NERStats, dataset.apply(get_ner_stats))
    # The plain ``apply`` call above is expected to leave ``operations`` empty.
    assert len(dataset.operations) == 0

    dataset.apply_(operation_name)
    stats_after: NERStats = cast(NERStats, dataset.apply(get_ner_stats))

    labels_before = sorted(stats_before.n_annotations_per_type.keys())
    labels_after = sorted(stats_after.n_annotations_per_type.keys())

    assert labels_before != labels_after
    assert labels_before == ["JOB_ROLE", "PRODUCT", "SKILL", "product", "skill"]
    assert labels_after == ["JOB_ROLE", "PRODUCT", "SKILL"]

    # Exactly one operation should be recorded after the in-place call.
    assert len(dataset.operations) == 1
    tracked = dataset.operations[0]

    assert tracked.name == operation_name
    assert tracked.status == OperationStatus.COMPLETED
    assert len(tracked.transformations) == 3
    assert all(
        change.type == TransformationType.EXAMPLE_CHANGED
        for change in tracked.transformations
    )
Esempio n. 2
0
def main(data_file: Path, output_file: Path):
    """Print NER stats for a dataset before and after upcasing its labels.

    Args:
        data_file: Path handed to ``Dataset.from_disk`` to load the data.
        output_file: Accepted but not used anywhere in this function.
    """
    dataset = Dataset("train").from_disk(data_file)

    # Headings are printed before the stats are computed, so they appear even
    # if computing the stats fails.
    print("STATS BEFORE", "============", sep="\n")
    stats_before = dataset.apply(get_ner_stats, serialize=True)
    print(stats_before)

    dataset.apply_("recon.v1.upcase_labels")

    print("STATS AFTER", "===========", sep="\n")
    stats_after = dataset.apply(get_ner_stats, serialize=True)
    print(stats_after)
Esempio n. 3
0
def test_dataset_to_from_disk(example_data, tmp_path):
    """Round-trip a dataset through ``to_disk`` / ``from_disk``.

    Saves and reloads the dataset once before and once after running the
    label-upcasing operation in place, asserting each time that the reloaded
    copy carries the same number of operations and the same commit hash as
    the in-memory dataset, and that the two reloaded commit hashes differ.
    """
    operation_name = "recon.v1.upcase_labels"
    target = tmp_path / "train.jsonl"
    dataset = Dataset("train", example_data["train"])

    # The stats themselves are not inspected later in this test.
    ner_stats_pre: NERStats = cast(NERStats, dataset.apply(get_ner_stats))
    assert len(dataset.operations) == 0

    # Saving without ``force=True`` is expected to raise here.
    with pytest.raises(FileNotFoundError):
        dataset.to_disk(target)

    dataset.to_disk(target, force=True)
    reloaded_initial = Dataset("train").from_disk(target)
    assert len(reloaded_initial.operations) == 0
    assert reloaded_initial.commit_hash == dataset.commit_hash

    dataset.apply_(operation_name)

    dataset.to_disk(target, force=True)
    reloaded_updated = Dataset("train").from_disk(target)

    assert len(reloaded_updated.operations) == 1
    assert reloaded_updated.commit_hash == dataset.commit_hash
    assert reloaded_updated.commit_hash != reloaded_initial.commit_hash

    tracked = reloaded_updated.operations[0]

    assert tracked.name == operation_name
    assert tracked.status == OperationStatus.COMPLETED
    assert len(tracked.transformations) == 3
    assert all(
        change.type == TransformationType.EXAMPLE_CHANGED
        for change in tracked.transformations
    )
Esempio n. 4
0
def test_apply(example_data):
    """Check ``Dataset.apply`` against calling ``get_ner_stats`` directly.

    The stats obtained through ``train_dataset.apply(get_ner_stats)`` must
    equal, field by field, those from ``get_ner_stats(train_dataset.data)``.
    """
    dataset = Dataset("train", example_data["train"])

    via_apply: NERStats = cast(NERStats, dataset.apply(get_ner_stats))
    via_direct_call: NERStats = cast(NERStats, get_ner_stats(dataset.data))

    compared_fields = (
        "n_examples",
        "n_examples_no_entities",
        "n_annotations",
        "n_annotations_per_type",
    )
    for field_name in compared_fields:
        assert getattr(via_apply, field_name) == getattr(via_direct_call, field_name)