def test_apply_(example_data):
    """Verify that in-place ``apply_`` upcases labels and logs the operation.

    Stats are gathered before and after running ``recon.v1.upcase_labels``
    on the "train" split of ``example_data``; the set of label names must
    shrink to the upper-cased ones, and exactly one completed operation with
    three ``EXAMPLE_CHANGED`` transformations must be recorded.
    """
    dataset = Dataset("train", example_data["train"])

    stats_before: NERStats = cast(NERStats, dataset.apply(get_ner_stats))
    # The non-mutating ``apply`` above must not have recorded an operation.
    assert len(dataset.operations) == 0

    dataset.apply_("recon.v1.upcase_labels")
    stats_after: NERStats = cast(NERStats, dataset.apply(get_ner_stats))

    labels_before = sorted(stats_before.n_annotations_per_type.keys())
    labels_after = sorted(stats_after.n_annotations_per_type.keys())

    assert labels_before != labels_after
    assert labels_before == ["JOB_ROLE", "PRODUCT", "SKILL", "product", "skill"]
    assert labels_after == ["JOB_ROLE", "PRODUCT", "SKILL"]

    # Exactly one operation is expected after the single ``apply_`` call.
    assert len(dataset.operations) == 1
    recorded = dataset.operations[0]

    assert recorded.name == "recon.v1.upcase_labels"
    assert recorded.status == OperationStatus.COMPLETED
    assert len(recorded.transformations) == 3
    assert all(
        change.type == TransformationType.EXAMPLE_CHANGED
        for change in recorded.transformations
    )
def main(data_file: Path, output_file: Path):
    """Print NER stats for a dataset before and after upcasing its labels.

    Args:
        data_file: Location the "train" dataset is loaded from via
            ``from_disk``.
        output_file: Accepted for the caller's benefit; this function does
            not read or write it.
    """
    dataset = Dataset("train").from_disk(data_file)

    def show_stats(heading: str, underline: str) -> None:
        # Heading lines go out before the stats are computed, so they are
        # printed even if computing the stats fails.
        print(heading)
        print(underline)
        print(dataset.apply(get_ner_stats, serialize=True))

    show_stats("STATS BEFORE", "============")

    dataset.apply_("recon.v1.upcase_labels")

    show_stats("STATS AFTER", "===========")
def test_dataset_to_from_disk(example_data, tmp_path):
    """Round-trip a dataset through disk before and after an operation.

    Covers three things: saving without ``force=True`` is expected to raise
    ``FileNotFoundError``; a freshly saved dataset reloads with no operations
    and the same commit hash; and after ``recon.v1.upcase_labels`` the
    reloaded dataset carries the recorded operation and a new commit hash.
    """
    target = tmp_path / "train.jsonl"
    source_ds = Dataset("train", example_data["train"])

    # ``apply`` is exercised up front; its result is not inspected here.
    stats_before: NERStats = cast(NERStats, source_ds.apply(get_ner_stats))
    assert len(source_ds.operations) == 0

    # Saving without ``force`` must fail.
    with pytest.raises(FileNotFoundError):
        source_ds.to_disk(target)
    source_ds.to_disk(target, force=True)

    first_reload = Dataset("train").from_disk(target)
    assert len(first_reload.operations) == 0
    assert first_reload.commit_hash == source_ds.commit_hash

    # Mutate, overwrite the same file, and load it again.
    source_ds.apply_("recon.v1.upcase_labels")
    source_ds.to_disk(target, force=True)
    second_reload = Dataset("train").from_disk(target)

    assert len(second_reload.operations) == 1
    assert second_reload.commit_hash == source_ds.commit_hash
    assert second_reload.commit_hash != first_reload.commit_hash

    recorded = second_reload.operations[0]
    assert recorded.name == "recon.v1.upcase_labels"
    assert recorded.status == OperationStatus.COMPLETED
    assert len(recorded.transformations) == 3
    assert all(
        change.type == TransformationType.EXAMPLE_CHANGED
        for change in recorded.transformations
    )
def test_apply(example_data):
    """``Dataset.apply`` must agree with calling the function on ``.data``.

    ``get_ner_stats`` is run once through ``apply`` and once directly on the
    dataset's ``data``; every compared stats field has to match.
    """
    dataset = Dataset("train", example_data["train"])

    via_apply: NERStats = cast(NERStats, dataset.apply(get_ner_stats))
    via_direct_call: NERStats = cast(NERStats, get_ner_stats(dataset.data))

    compared_fields = (
        "n_examples",
        "n_examples_no_entities",
        "n_annotations",
        "n_annotations_per_type",
    )
    for field_name in compared_fields:
        assert getattr(via_apply, field_name) == getattr(via_direct_call, field_name)