def test_fail_using_reserved_words():
    # This issue was reported and may get resolved:
    # https://github.com/huggingface/datasets/issues/1110
    ds = Dataset.from_dict(
        {
            "a": [{"a": 1, "b": "two"} for _ in range(0, 100)],
            "_type": ["whatever" for _ in range(0, 100)],
        }
    )
    split = ds.train_test_split()
    new_ds = Dataset.from_datasets(split.values())
    assert len(new_ds) == len(ds)

    ds = ds.map(
        lambda example: {
            "new_field": {
                "c": str(example["a"]["a"]),
                "d": f"this {example['a']['b']}",
            }
        }
    )
    split = ds.train_test_split()
    with pytest.raises(TypeError):
        Dataset.from_datasets(split.values())

def test_load_dataset(resources_data_path):
    json_path = str(resources_data_path / "dataset_sequence.jsonl")

    with pytest.raises(TypeError):
        Dataset.load_dataset("json", data_files=[json_path])

    ds = Dataset.load_dataset("json", data_files=[json_path], split="train")
    assert len(ds) == 4

def test_from_csv(resources_data_path):
    csv_path = str(resources_data_path / "business.cat.2k.valid.csv")
    ds = Dataset.from_csv(paths=csv_path)
    ds2 = Dataset.from_csv(paths=[csv_path, csv_path])

    assert len(ds) == 400
    assert len(ds2) == 800
    assert ds.dataset.column_names == ["label", "text"]

def train_valid_dataset(resources_data_path) -> Tuple[Dataset, Dataset]:
    """Returns both training and validation datasets"""
    training_ds = Dataset.from_csv(
        paths=str(resources_data_path / "business.cat.2k.train.csv")
    )
    validation_ds = Dataset.from_csv(
        paths=str(resources_data_path / "business.cat.2k.valid.csv")
    )
    return training_ds, validation_ds

def dataset_from_path(path: str) -> Dataset:
    file_extension = Path(path).suffix
    if file_extension in [".csv"]:
        return Dataset.from_csv(path)
    elif file_extension in [".json", ".jsonl"]:
        return Dataset.from_json(path)
    else:
        raise ValueError(
            f"Could not create a Dataset from '{path}'. "
            f"We only support the following formats: [csv, json, jsonl]"
        )

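# A minimal usage sketch for the dispatcher above; the file names are made up
# for illustration. The file extension alone decides which reader is used:
#
#     ds = dataset_from_path("train.csv")    # -> Dataset.from_csv
#     ds = dataset_from_path("train.jsonl")  # -> Dataset.from_json
#     ds = dataset_from_path("train.txt")    # -> raises ValueError
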
def test_from_json(resources_data_path):
    json_path = str(resources_data_path / "dataset_sequence.jsonl")
    ds = Dataset.from_json(paths=json_path)
    ds2 = Dataset.from_json(paths=[json_path, json_path])

    assert len(ds) == 4
    assert len(ds2) == 8

    json_path = str(resources_data_path / "dataset_sequence.json")
    ds = Dataset.from_json(paths=json_path, field="data")

    assert len(ds) == 4

def dataset(tmp_path) -> Dataset:
    data = {
        "text": ["A common text", "This is why you get", "Seriosly?, I'm not sure"],
        "label": ["one", "zero", "zero"],
    }
    ds = Dataset.from_dict(data)
    # we save and load it here to be able to lazily read from it
    ds_path = tmp_path / "test_pipeline_datasets" / "dataset"
    ds.save_to_disk(str(ds_path))
    return Dataset.load_from_disk(str(ds_path))

def test_flatten_json(resources_data_path):
    """Showcases the behavior of Dataset.flatten_"""
    file_path = str(resources_data_path / "to-be-flattened.jsonl")
    dataset_flatten_source = Dataset.from_json(paths=file_path)
    dataset_flatten_source.flatten_()

    for c in ["complexData.a", "complexData.b"]:
        assert c in dataset_flatten_source.column_names

    file_path = str(resources_data_path / "nested-list.jsonl")
    dataset_nested_list = Dataset.from_json(paths=file_path)
    dataset_nested_list.flatten_()

    assert len(dataset_nested_list) == 1
    assert dataset_nested_list.column_names == ["classification"]

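def test_flatten_dict_sketch():
    # A minimal sketch of the flattening behavior showcased above, assuming
    # flatten_ treats an in-memory dict the same way as the jsonl resources.
    # The "complexData" column and its values are made up for illustration:
    # each struct field becomes its own dotted column.
    ds = Dataset.from_dict({"complexData": [{"a": 1, "b": "two"}]})
    ds.flatten_()
    assert ds.column_names == ["complexData.a", "complexData.b"]
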
def train_data_source() -> Dataset:
    """Creates the training dataset"""
    resources_path = Path(__file__).parent.parent / "resources" / "data"
    training_ds = Dataset.from_csv(
        paths=str(resources_path / "business.cat.2k.train.csv")
    )
    return training_ds

def test_extending_vocab_with_weights_file(
    pipeline_config, dataset, dataset2, deactivate_pipeline_trainer, caplog
):
    pipeline = Pipeline.from_config(pipeline_config)

    # create vocab
    pipeline.train(
        output="dummy",
        training=dataset,
    )

    # extending the vocab with the weights file available should apply the pretrained weights
    pipeline.train(
        output="dummy",
        training=dataset2,
    )
    instance = pipeline.head.featurize("this")
    instance.index_fields(pipeline.vocab)
    assert_allclose(
        pipeline.backbone.embedder(instance.as_tensor_dict()["text"]),
        torch.tensor([[0.25, 0.75]]),
    )

    # extending the vocab with the weights file deleted should trigger a warning
    logging.captureWarnings(True)
    Path(pipeline_config["features"]["word"]["weights_file"]).unlink()
    pipeline.train(
        output="dummy",
        training=Dataset.from_dict({"text": ["that"], "label": ["good"]}),
    )
    assert caplog.records[0].module == "embedding"
    assert "cannot locate the pretrained_file" in caplog.records[0].message

def test_from_pandas():
    df = pd.DataFrame({"a": [1, 2, 3], "b": [4, 5, 6]})
    ds = Dataset.from_pandas(df)

    assert ds.dataset.column_names == ["a", "b"]
    assert ds["a"] == [1, 2, 3]
    assert len(ds) == 3

def dataset():
    return Dataset.from_dict(
        {
            "text": ["this is", "a test"],
            "label": ["a", "b"],
        }
    )

def training_dataset() -> Dataset:
    """Creates the dataset."""
    data = {
        "record1": [
            {"@first_name": "Hans", "@last_name": "Peter"},
            {"@first_name": "Heinrich", "@last_name": "Meier"},
            {"@first_name": "Hans", "@last_name": "Peter"},
        ],
        "record2": [
            {"@first_name": "Hans", "@last_name": "Petre"},
            {"@first_name": "Heinz", "@last_name": "Meier"},
            {"@first_name": "Hansel", "@last_name": "Peter"},
        ],
        "label": ["duplicate", "not_duplicate", "duplicate"],
    }
    return Dataset.from_dict(data)

def dataset() -> Dataset:
    data = {
        "text": ["Test", "this", "shaight", "!"],
        "label": ["0", "1", "0", "0"],
    }
    return Dataset.from_dict(data)

def test_from_parquet_file(resources_data_path):
    """This only shows an example of how one could read in a parquet file"""
    file_path = resources_data_path / "test.parquet"
    df = pd.read_parquet(file_path)
    dataset = Dataset.from_pandas(df)

    assert "reviewerID" in dataset.column_names

def test_extending_vocab_with_weights_file(
    pipeline_config, dataset, dataset2, capsys, caplog
):
    pipeline = Pipeline.from_config(pipeline_config)

    # create vocab
    pipeline.create_vocab([dataset.to_instances(pipeline)])

    # extending the vocab with the weights file available should apply the pretrained weights
    pipeline.create_vocab([dataset2.to_instances(pipeline)])
    instance = pipeline.head.featurize("this")
    instance.index_fields(pipeline.vocab)
    assert_allclose(
        pipeline.backbone.embedder(instance.as_tensor_dict()["text"]),
        torch.tensor([[0.25, 0.75]]),
    )

    # extending the vocab with the weights file deleted should trigger a warning
    Path(pipeline_config["features"]["word"]["weights_file"]).unlink()
    ds = Dataset.from_dict({"text": ["that"], "label": ["good"]})
    pipeline.create_vocab([ds.to_instances(pipeline)])

    assert caplog.record_tuples[-1][0] == "allennlp.modules.token_embedders.embedding"
    # 30 is the numeric value of logging.WARNING
    assert caplog.record_tuples[-1][1] == 30
    assert (
        "Embedding at model_path, "
        "_head.backbone.embedder.token_embedder_word cannot locate the pretrained_file."
        in caplog.record_tuples[-1][2]
    )

def dataset_from_path(path: str) -> Dataset:
    file_extension = Path(path).suffix
    if file_extension in [".csv"]:
        return Dataset.from_csv(path)
    elif file_extension in [".json", ".jsonl"]:
        return Dataset.from_json(path)
    # yaml files are used for elasticsearch data
    elif file_extension in [".yaml", ".yml"]:
        from_es_kwargs = yaml_to_dict(path)
        client = Elasticsearch(**from_es_kwargs["client"])
        return Dataset.from_elasticsearch(
            client=client,
            index=from_es_kwargs["index"],
            query=from_es_kwargs.get("query"),
        )
    else:
        raise ValueError(f"Could not create a Dataset from '{path}'")

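# A hypothetical sketch of the YAML layout the elasticsearch branch above
# expects: the "client" mapping is unpacked into the Elasticsearch constructor,
# "index" is required, and "query" is optional. The host and index name below
# are made up for illustration:
#
#     client:
#         hosts: "localhost:9200"
#     index: "my_index"
#     query:
#         query:
#             match_all: {}
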
def training_dataset() -> Dataset:
    """Creates the dataset."""
    data = {
        "text": [
            "this is a text",
            "my name is dani",
            "this is a table",
            "my name is paco",
        ],
    }
    return Dataset.from_dict(data)

def dataset(tmp_path_factory, request):
    cache_path = tmp_path_factory.mktemp("test_instance_caching")
    data_path = cache_path / "data.csv"
    data_path.write_text("text,label\ncheck,this\nand,that")

    dataset = Dataset.from_csv(paths=str(data_path), cache_dir=str(cache_path))

    # inject the dataset into classes decorated with `pytest.mark.usefixtures`
    if request.cls:
        request.cls.dataset = dataset

    return dataset

def train_dataset() -> Dataset:
    """Creates the training dataset"""
    source = (
        Path(__file__).parent.parent
        / "resources"
        / "data"
        / "emotions_with_transformers.txt"
    )
    train_dataset = Dataset.from_csv(
        paths=str(source), delimiter=";", column_names=["text", "label"]
    )
    return train_dataset

def training_dataset() -> Dataset:
    """Creates the training dataset and gives it the expected structure"""
    resources_path = (
        Path(__file__).parent.parent.parent / "tests" / "resources" / "data"
    )
    training_ds = Dataset.from_csv(paths=str(resources_path / "dataset_source.csv"))
    # keep only a 'label' column (from 'job') and a 'text' column (from 'education' + 'marital')
    training_ds = training_ds.map(
        lambda x: {"label": x["job"], "text": x["education"] + " " + x["marital"]},
    )
    return training_ds

def test_from_elasticsearch(dataset, default_pipeline_config):
    pipeline = Pipeline.from_config(default_pipeline_config)
    es_index = explore.create(
        pipeline, dataset, explore_id="test_index", show_explore=False
    )
    es_client = Elasticsearch()
    __wait_for_index_creation__(es_client, es_index)

    ds = Dataset.from_elasticsearch(
        es_client, index=es_index, query={"query": {"match_all": {}}}
    )
    assert len(ds) == len(dataset)
    for key in ["_id", "_index", "_type", "_score"]:
        assert key in ds.column_names

    ds = Dataset.from_elasticsearch(
        es_client,
        index=es_index,
        query={"query": {"exists": {"field": "not_found.field"}}},
    )
    assert len(ds) == 0

    ds = Dataset.from_elasticsearch(
        es_client, index=es_index, fields=["label", "text", "_id"]
    )
    assert len(ds) == len(dataset)
    assert "label" in ds.column_names
    assert "text" in ds.column_names
    assert "_id" in ds.column_names
    assert "prediction" not in ds.column_names

def test_from_excel_file(resources_data_path):
    """This only shows an example of how one could read in an excel file"""
    str_value = Value("string")
    int_value = Value("int64")
    features = Features(
        Notification=int_value, Type=str_value, Plant=int_value, Serial=str_value
    )

    file_path = resources_data_path / "test.xlsx"
    df = pd.read_excel(file_path)
    dataset = Dataset.from_pandas(df, features=features)

    assert len(dataset) > 0

def training_dataset() -> Dataset:
    df = pd.DataFrame(
        {
            "text": [
                "This is a simple NER test",
                "This is a simple NER test with misaligned spans",
                "No NER here",
            ],
            "entities": [
                [{"start": 17, "end": 20, "label": "NER"}],
                [{"start": 17, "end": 22, "label": "NER"}],
                [],
            ],
        }
    )
    return Dataset.from_pandas(df)

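# A quick sanity check of the character offsets above (illustrative, not part
# of the original fixture): the first span covers exactly the "NER" token,
# while the second overshoots it on purpose to produce a misaligned span.
#
#     "This is a simple NER test"[17:20]                       == "NER"
#     "This is a simple NER test with misaligned spans"[17:22] == "NER t"
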
def test_trainer_configs(configurations_path):
    configs = _read_configs(configurations_path, "Trainer")
    pipeline = Pipeline.from_config(
        {
            "name": "test",
            "head": {"type": "TextClassification", "labels": ["pos", "neg"]},
        }
    )
    dataset = Dataset.from_dict({"text": ["test"], "label": ["pos"]})
    linear = nn.Linear(2, 2)

    for config_name, config in configs.items():
        assert isinstance(config, TrainerConfiguration)
        trainer = Trainer(
            pipeline=pipeline, train_dataset=dataset, trainer_config=config
        )
        assert isinstance(trainer.trainer, pytorch_lightning.Trainer)

def training_dataset() -> Dataset:
    """Creates the dataset."""
    data = {
        "text": [
            "The most common audits were about waste and recycling.",
            "The company fabricates plastic chairs.",
        ],
        "entities": [
            [
                {"start": 34, "end": 39, "label": "PN", "text": "waste"},
                {"start": 16, "end": 22, "label": "QTY", "text": "audits"},
            ],
            [
                {"start": 4, "end": 11, "label": "OBJECT", "text": "company"},
                {"start": 31, "end": 37, "label": "SUBJECT", "text": "chairs"},
            ],
        ],
        "label": ["Message-Topic(e1,e2)", "Product-Producer(e2,e1)"],
    }
    return Dataset.from_dict(data)

def dataset(resources_data_path) -> Dataset:
    return Dataset.from_csv(
        paths=str(resources_data_path / "business.cat.2k.valid.csv")
    )

def dataset() -> Dataset:
    """Creates the dataset"""
    data = {"text": ["This is a simple test"], "label": ["a"]}
    return Dataset.from_dict(data)

def dataset2() -> Dataset:
    data = {"text": ["this"], "label": ["good"]}
    return Dataset.from_dict(data)

def test_from_dict():
    ds = Dataset.from_dict({"a": [1, 2, 3], "b": [4, 5, 6]})

    assert ds.dataset.column_names == ["a", "b"]
    assert ds["a"] == [1, 2, 3]
    assert len(ds) == 3