def test_from_csv(resources_data_path):
    """Check single- and multi-path CSV loading: row counts and column names."""
    valid_csv = str(resources_data_path / "business.cat.2k.valid.csv")
    single = Dataset.from_csv(paths=valid_csv)
    doubled = Dataset.from_csv(paths=[valid_csv, valid_csv])
    # Loading the same file twice simply concatenates the rows
    assert len(single) == 400
    assert len(doubled) == 800
    assert single.dataset.column_names == ["label", "text"]
def train_valid_dataset(resources_data_path) -> Tuple[Dataset, Dataset]:
    """Returns both training and validation datasets"""
    train = Dataset.from_csv(
        paths=str(resources_data_path / "business.cat.2k.train.csv")
    )
    valid = Dataset.from_csv(
        paths=str(resources_data_path / "business.cat.2k.valid.csv")
    )
    return train, valid
def train_data_source() -> Dataset:
    """Creates the training dataset"""
    data_dir = Path(__file__).parent.parent / "resources" / "data"
    # Resolve the CSV relative to this test module rather than the CWD
    return Dataset.from_csv(paths=str(data_dir / "business.cat.2k.train.csv"))
def dataset_from_path(path: str) -> Dataset: file_extension = Path(path).suffix if file_extension in [".csv"]: return Dataset.from_csv(path) elif file_extension in [".json", ".jsonl"]: return Dataset.from_json(path) else: raise ValueError( f"Could not create a Dataset from '{path}'. " f"We only support following formats: [csv, json, jsonl]" )
def dataset(tmp_path_factory, request):
    """Build a tiny CSV-backed Dataset in a temp dir, caching alongside it."""
    cache_dir = tmp_path_factory.mktemp("test_instance_caching")
    csv_file = cache_dir / "data.csv"
    csv_file.write_text("text,label\ncheck,this\nand,that")
    ds = Dataset.from_csv(paths=str(csv_file), cache_dir=str(cache_dir))
    # inject to classes decorated with `pytest.mark.usefixtures`
    if request.cls:
        request.cls.dataset = ds
    return ds
def training_dataset() -> Dataset:
    """Creates the training dataset and gives the structure"""
    data_dir = (
        Path(__file__).parent.parent.parent / "tests" / "resources" / "data"
    )
    ds = Dataset.from_csv(paths=str(data_dir / "dataset_source.csv"))
    # Keeping just 'label' and text 'category'
    ds = ds.map(
        lambda row: {
            "label": row["job"],
            "text": row["education"] + " " + row["marital"],
        },
    )
    return ds
def train_dataset() -> Dataset:
    """Creates the training dataset"""
    data_file = (
        Path(__file__).parent.parent
        / "resources"
        / "data"
        / "emotions_with_transformers.txt"
    )
    # The file is ';'-separated and headerless, so supply the column names
    return Dataset.from_csv(
        paths=str(data_file), delimiter=";", column_names=["text", "label"]
    )
def dataset_from_path(path: str) -> Dataset: file_extension = Path(path).suffix if file_extension in [".csv"]: return Dataset.from_csv(path) elif file_extension in [".json", ".jsonl"]: return Dataset.from_json(path) # yaml files are used for elasticsearch data elif file_extension in [".yaml", ".yml"]: from_es_kwargs = yaml_to_dict(path) client = Elasticsearch(**from_es_kwargs["client"]) return Dataset.from_elasticsearch( client=client, index=from_es_kwargs["index"], query=from_es_kwargs.get("query"), ) else: raise ValueError(f"Could not create a Dataset from '{path}'")
def dataset(resources_data_path) -> Dataset:
    """Load the 2k business-category validation CSV as a Dataset."""
    valid_csv = resources_data_path / "business.cat.2k.valid.csv"
    return Dataset.from_csv(paths=str(valid_csv))