Example 1
def test_apply(example_data):
    train_dataset = Dataset("train", example_data["train"])

    ner_stats: NERStats = cast(NERStats, train_dataset.apply(get_ner_stats))
    ner_stats_apply: NERStats = cast(NERStats,
                                     get_ner_stats(train_dataset.data))

    assert ner_stats.n_examples == ner_stats_apply.n_examples
    assert ner_stats.n_examples_no_entities == ner_stats_apply.n_examples_no_entities
    assert ner_stats.n_annotations == ner_stats_apply.n_annotations
    assert ner_stats.n_annotations_per_type == ner_stats_apply.n_annotations_per_type
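
These tests lean on a pytest `example_data` fixture that is not shown here. A minimal sketch of what it presumably provides, with made-up texts and labels and assumed import paths (the real suite likely loads a larger corpus from disk):

import pytest
from recon.types import Example, Span  # import path inferred, not confirmed here

@pytest.fixture
def example_data():
    # Hypothetical data: just enough to satisfy the shape the tests expect,
    # a dict mapping split name -> list of recon Examples.
    return {
        "train": [
            Example(
                text="I am a Python developer",
                spans=[Span(text="Python", start=7, end=13, label="SKILL")],
            ),
        ],
        "dev": [
            Example(
                text="We hired a data engineer",
                spans=[Span(text="data engineer", start=11, end=24, label="JOB_ROLE")],
            ),
        ],
    }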
Example 2
def test_dataset_commit_hash(example_data):
    train_dataset = Dataset("train", example_data["train"][:-1])
    dev_dataset = Dataset("train", example_data["dev"])

    assert train_dataset.commit_hash != dev_dataset.commit_hash

    train_commit = train_dataset.commit_hash
    train_dataset.data.append(example_data["train"][-1])

    assert train_dataset.commit_hash != train_commit
    assert hash(train_dataset) == 1186038092183970443
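
Pinning `hash(train_dataset)` to a literal only works if `Dataset.__hash__` is deterministic across interpreter runs, i.e. content-based rather than Python's default identity hash. One possible scheme, purely a sketch (the library's actual implementation may differ):

import hashlib

def stable_hash_64(serialized_contents: str) -> int:
    # Reproducible 64-bit int derived from the dataset's serialized contents;
    # any change to the data changes the digest and therefore the hash.
    digest = hashlib.sha1(serialized_contents.encode("utf-8")).digest()
    return int.from_bytes(digest[:8], "big", signed=True)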
Example 3
def main(data_dir: Path):
    # Dataset.from_disk is an instance method in the other examples, and
    # apply() here returns a mapping of per-dataset results, so this is
    # presumably a Corpus (directory of train/dev/test datasets).
    corpus = Corpus.from_disk(data_dir)
    ds_stats = corpus.apply(get_ner_stats, serialize=True, no_print=True)
    for name, stats in ds_stats.items():
        print(f"{name}")
        print("=" * 50)
        print(stats)
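
The `main(data_dir: Path)` signature reads like a CLI entry point; scripts in this style are presumably wired up with something like typer:

import typer

if __name__ == "__main__":
    typer.run(main)  # typer parses data_dir from argv and converts it to a Path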
Example 4
def test_dataset_initialize(example_data):

    dataset = Dataset("train")
    assert dataset.name == "train"
    assert dataset.data == []
    assert dataset.example_store._map == {}
    assert dataset.commit_hash == "94efdd6f628eda9c1ae893467c9652808443ef3e"
    assert dataset.operations == []

    store = ExampleStore()
    dataset2 = Dataset("dev", example_data["dev"], [], store)
    assert dataset2.name == "dev"
    assert dataset2.data == example_data["dev"]
    assert dataset2.example_store == store
    assert dataset2.commit_hash == "dd05e54668c166d075bc4406bfee590e4c89a292"
    assert dataset2.operations == []
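
The two constructor calls above imply that `Dataset.__init__` takes a name plus optional data, operations, and an example store. A hypothetical reconstruction (parameter names beyond `name` are inferred, not confirmed):

from typing import Any, List, Optional

class DatasetSketch:
    def __init__(
        self,
        name: str,
        data: Optional[List[Any]] = None,
        operations: Optional[List[Any]] = None,
        example_store: Optional[Any] = None,
    ):
        self.name = name
        # Normalize defaults so a bare Dataset("train") starts empty,
        # matching the asserts in the first half of the test.
        self.data = data if data is not None else []
        self.operations = operations if operations is not None else []
        # The real class presumably creates a fresh ExampleStore when none is given.
        self.example_store = example_store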
Example 5
def ner_merge(
    dataset: str,
    recon_dataset: str,
    source: Union[str, Dataset],
    output_dir: Optional[str] = None,
    exclude: Optional[List[str]] = None,
):
    """
    Stream a List of `recon.types.HardestExample` instances to prodigy
    for review/correction. Uses the Prodigy blocks interface to display
    prediction error information along with ner view
    """
    log("RECIPE: Starting recipe recon.ner_merge", locals())
    # Keep the Prodigy dataset name (a str) separate from the recon Dataset,
    # so the DB lookups below still receive the name they expect.
    if isinstance(source, str):
        recon_ds = Dataset(recon_dataset).from_disk(source)
    else:
        recon_ds = source

    DB = connect()
    if dataset not in DB:
        msg.fail(f"Can't find dataset '{dataset}'", exits=1)

    prodigy_raw_examples = DB.get_dataset(dataset)
    prodigy_examples = [Example(**eg) for eg in prodigy_raw_examples if eg["answer"] == "accept"]
    prodigy_texts_to_examples = {e.text: e for e in prodigy_examples}

    prev_len = len(recon_ds)
    recon_ds.apply_("recon.v1.prodigy.merge_examples", prodigy_texts_to_examples)
    assert len(recon_ds) == prev_len

    if output_dir:
        log(f"RECIPE: Fixing {len(prodigy_examples)} examples in data")
        recon_ds.to_disk(output_dir)
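
As a Prodigy recipe this is normally launched from the prodigy CLI, but recipes are plain functions and can be sketched as a direct call; every name and path below is a placeholder:

ner_merge(
    dataset="my_prodigy_annotations",  # Prodigy dataset holding the reviewed answers
    recon_dataset="train",             # name given to the recon Dataset
    source="./data/train.jsonl",       # recon data to merge the corrections into
    output_dir="./corrected",          # optional: where the merged dataset is written
)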
Example 6
@pytest.fixture
def ds():
    ds = Dataset(
        name="test",
        data=[
            Example(
                text="this is a test example with something else",
                spans=[
                    Span(text="something",
                         start=28,
                         end=37,
                         label="TEST_ENTITY")
                ],
            )
        ],
    )
    ds.apply_("recon.v1.add_tokens")

    return ds
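
A test consuming this fixture might look like the following; that `recon.v1.add_tokens` populates a `tokens` attribute is an assumption based on the operation's name:

def test_add_tokens(ds):
    example = ds.data[0]
    # add_tokens presumably attaches tokenization output to each example
    assert example.tokens is not None
    assert example.spans[0].label == "TEST_ENTITY"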
Example 7
def main(data_file: Path, output_file: Path):
    ds = Dataset("train").from_disk(data_file)

    print("STATS BEFORE")
    print("============")
    print(ds.apply(get_ner_stats, serialize=True))

    ds.apply_("recon.v1.upcase_labels")

    print("STATS AFTER")
    print("===========")
    print(ds.apply(get_ner_stats, serialize=True))

    ds.to_disk(output_file, force=True)
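
Note the naming convention, presumably borrowed from PyTorch's trailing underscore: `apply` computes and returns a result without touching the data, while `apply_` mutates the dataset in place and (as the tests below show) records the operation:

stats = ds.apply(get_ner_stats)      # read-only: returns the computed NERStats
ds.apply_("recon.v1.upcase_labels")  # in-place: rewrites ds.data and appends
                                     # an entry to ds.operations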
Example 8
def test_apply_(example_data):
    train_dataset = Dataset("train", example_data["train"])
    ner_stats_pre: NERStats = cast(NERStats,
                                   train_dataset.apply(get_ner_stats))

    assert len(train_dataset.operations) == 0

    train_dataset.apply_("recon.v1.upcase_labels")

    ner_stats_post: NERStats = cast(NERStats,
                                    train_dataset.apply(get_ner_stats))

    pre_keys = sorted(ner_stats_pre.n_annotations_per_type.keys())
    post_keys = sorted(ner_stats_post.n_annotations_per_type.keys())

    assert pre_keys != post_keys

    assert pre_keys == ["JOB_ROLE", "PRODUCT", "SKILL", "product", "skill"]
    assert post_keys == ["JOB_ROLE", "PRODUCT", "SKILL"]

    assert len(train_dataset.operations) == 1

    op = train_dataset.operations[0]

    assert op.name == "recon.v1.upcase_labels"
    assert op.status == OperationStatus.COMPLETED
    assert len(op.transformations) == 3

    for t in op.transformations:
        assert t.type == TransformationType.EXAMPLE_CHANGED
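
The pre/post key sets show what the operation does: lowercase label variants ("skill", "product") are folded into their uppercase counterparts. A minimal sketch of the core transformation, not the library's registered implementation:

def upcase_labels_sketch(example):
    # Normalize span labels so "skill" and "SKILL" count as a single type.
    for span in example.spans:
        span.label = span.label.upper()
    return example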
Example 9
def test_dataset_to_from_disk(example_data, tmp_path):
    train_dataset = Dataset("train", example_data["train"])
    ner_stats_pre: NERStats = cast(NERStats,
                                   train_dataset.apply(get_ner_stats))

    assert len(train_dataset.operations) == 0

    # Without force=True the save fails; force (used below) creates
    # whatever on-disk layout to_disk expects.
    with pytest.raises(FileNotFoundError):
        train_dataset.to_disk(tmp_path / "train.jsonl")

    train_dataset.to_disk(tmp_path / "train.jsonl", force=True)
    train_dataset_loaded = Dataset("train").from_disk(tmp_path / "train.jsonl")
    assert len(train_dataset_loaded.operations) == 0
    assert train_dataset_loaded.commit_hash == train_dataset.commit_hash

    train_dataset.apply_("recon.v1.upcase_labels")

    train_dataset.to_disk(tmp_path / "train.jsonl", force=True)
    train_dataset_loaded_2 = Dataset("train").from_disk(tmp_path /
                                                        "train.jsonl")

    assert len(train_dataset_loaded_2.operations) == 1
    assert train_dataset_loaded_2.commit_hash == train_dataset.commit_hash
    assert train_dataset_loaded_2.commit_hash != train_dataset_loaded.commit_hash

    op = train_dataset_loaded_2.operations[0]

    assert op.name == "recon.v1.upcase_labels"
    assert op.status == OperationStatus.COMPLETED
    assert len(op.transformations) == 3

    for t in op.transformations:
        assert t.type == TransformationType.EXAMPLE_CHANGED
Example 10
def test_len(example_data):
    train_dataset = Dataset("train", example_data["train"])
    assert len(train_dataset) == len(example_data["train"])
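
Given this test, `__len__` presumably just delegates to the underlying example list:

def __len__(self) -> int:
    # On Dataset: length is simply the number of examples held.
    return len(self.data)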