def test_text_dataset_reader(path_type, split, features, keep_in_memory, text_path, tmp_path):
    """Read a text file via TextDatasetReader and check rows, columns, split, and feature dtypes."""
    # Build the path argument according to the parametrized container type.
    if issubclass(path_type, str):
        path = text_path
    elif issubclass(path_type, list):
        path = [text_path]
    cache_dir = tmp_path / "cache"
    expected_split = str(split) if split else "train"
    # Text files always produce a single string column named "text".
    expected_features = dict(features) if features else {"text": "string"}
    if features:
        features = Features({name: Value(dtype) for name, dtype in features.items()})
    else:
        features = None
    # keep_in_memory controls whether Arrow memory grows during the read.
    memory_check = assert_arrow_memory_increases if keep_in_memory else assert_arrow_memory_doesnt_increase
    with memory_check():
        dataset = TextDatasetReader(
            path, split=split, features=features, cache_dir=cache_dir, keep_in_memory=keep_in_memory
        ).read()
    assert isinstance(dataset, Dataset)
    assert dataset.num_rows == 4
    assert dataset.num_columns == 1
    assert dataset.column_names == ["text"]
    assert dataset.split == expected_split
    for name, expected_dtype in expected_features.items():
        assert dataset.features[name].dtype == expected_dtype
def test_text_datasetdict_reader(split, features, keep_in_memory, text_path, tmp_path):
    """Read text file(s) into a DatasetDict via TextDatasetReader and check the requested split."""
    # With an explicit split, read one file under that name; otherwise read
    # both "train" and "test" and verify the "train" entry.
    if split:
        path = {split: text_path}
    else:
        split = "train"
        path = {"train": text_path, "test": text_path}
    cache_dir = tmp_path / "cache"
    # Text files always produce a single string column named "text".
    expected_features = dict(features) if features else {"text": "string"}
    if features:
        features = Features({name: Value(dtype) for name, dtype in features.items()})
    else:
        features = None
    memory_check = assert_arrow_memory_increases if keep_in_memory else assert_arrow_memory_doesnt_increase
    with memory_check():
        dataset = TextDatasetReader(
            path, features=features, cache_dir=cache_dir, keep_in_memory=keep_in_memory
        ).read()
    assert isinstance(dataset, DatasetDict)
    dataset = dataset[split]
    assert dataset.num_rows == 4
    assert dataset.num_columns == 1
    assert dataset.column_names == ["text"]
    assert dataset.split == split
    for name, expected_dtype in expected_features.items():
        assert dataset.features[name].dtype == expected_dtype
def test_csv_dataset_reader(path_type, split, features, keep_in_memory, csv_path, tmp_path):
    """Read a CSV file via CsvDatasetReader and check rows, columns, split, and feature dtypes."""
    # Build the path argument according to the parametrized container type.
    if issubclass(path_type, str):
        path = csv_path
    elif issubclass(path_type, list):
        path = [csv_path]
    cache_dir = tmp_path / "cache"
    expected_split = str(split) if split else "train"
    # CSV file loses col_1 string dtype information: default now is "int64" instead of "string"
    expected_features = dict(features) if features else {
        "col_1": "int64",
        "col_2": "int64",
        "col_3": "float64",
    }
    if features:
        features = Features({name: Value(dtype) for name, dtype in features.items()})
    else:
        features = None
    memory_check = assert_arrow_memory_increases if keep_in_memory else assert_arrow_memory_doesnt_increase
    with memory_check():
        dataset = CsvDatasetReader(
            path, split=split, features=features, cache_dir=cache_dir, keep_in_memory=keep_in_memory
        ).read()
    assert isinstance(dataset, Dataset)
    assert dataset.num_rows == 4
    assert dataset.num_columns == 3
    assert dataset.column_names == ["col_1", "col_2", "col_3"]
    assert dataset.split == expected_split
    for name, expected_dtype in expected_features.items():
        assert dataset.features[name].dtype == expected_dtype
def test_datasetdict_from_csv(split, features, keep_in_memory, csv_path, tmp_path):
    """Build a DatasetDict with DatasetDict.from_csv and check the requested split."""
    # With an explicit split, read one file under that name; otherwise read
    # both "train" and "test" and verify the "train" entry.
    if split:
        path = {split: csv_path}
    else:
        split = "train"
        path = {"train": csv_path, "test": csv_path}
    cache_dir = tmp_path / "cache"
    # CSV file loses col_1 string dtype information: default now is "int64" instead of "string"
    expected_features = dict(features) if features else {
        "col_1": "int64",
        "col_2": "int64",
        "col_3": "float64",
    }
    if features:
        features = Features({name: Value(dtype) for name, dtype in features.items()})
    else:
        features = None
    memory_check = assert_arrow_memory_increases if keep_in_memory else assert_arrow_memory_doesnt_increase
    with memory_check():
        dataset = DatasetDict.from_csv(
            path, features=features, cache_dir=cache_dir, keep_in_memory=keep_in_memory
        )
    assert isinstance(dataset, DatasetDict)
    dataset = dataset[split]
    assert dataset.num_rows == 4
    assert dataset.num_columns == 3
    assert dataset.column_names == ["col_1", "col_2", "col_3"]
    assert dataset.split == split
    for name, expected_dtype in expected_features.items():
        assert dataset.features[name].dtype == expected_dtype
def test_datasetdict_from_json(split, features, keep_in_memory, jsonl_path, tmp_path):
    """Build a DatasetDict with DatasetDict.from_json and check the requested split."""
    file_path = jsonl_path
    # JSON Lines input: no top-level field to select.
    field = None
    # With an explicit split, read one file under that name; otherwise read
    # both "train" and "test" and verify the "train" entry.
    if split:
        path = {split: file_path}
    else:
        split = "train"
        path = {"train": file_path, "test": file_path}
    cache_dir = tmp_path / "cache"
    expected_features = dict(features) if features else {
        "col_1": "string",
        "col_2": "int64",
        "col_3": "float64",
    }
    if features:
        features = Features({name: Value(dtype) for name, dtype in features.items()})
    else:
        features = None
    memory_check = assert_arrow_memory_increases if keep_in_memory else assert_arrow_memory_doesnt_increase
    with memory_check():
        dataset = DatasetDict.from_json(
            path, features=features, cache_dir=cache_dir, keep_in_memory=keep_in_memory, field=field
        )
    assert isinstance(dataset, DatasetDict)
    dataset = dataset[split]
    assert dataset.num_rows == 4
    assert dataset.num_columns == 3
    assert dataset.column_names == ["col_1", "col_2", "col_3"]
    assert dataset.split == split
    for name, expected_dtype in expected_features.items():
        assert dataset.features[name].dtype == expected_dtype
def test_json_dataset_reader(path_type, split, features, keep_in_memory, jsonl_path, tmp_path):
    """Read a JSON Lines file via JsonDatasetReader and check rows, columns, split, and dtypes."""
    file_path = jsonl_path
    # JSON Lines input: no top-level field to select.
    field = None
    # Build the path argument according to the parametrized container type.
    if issubclass(path_type, str):
        path = file_path
    elif issubclass(path_type, list):
        path = [file_path]
    cache_dir = tmp_path / "cache"
    expected_split = str(split) if split else "train"
    expected_features = dict(features) if features else {
        "col_1": "string",
        "col_2": "int64",
        "col_3": "float64",
    }
    if features:
        features = Features({name: Value(dtype) for name, dtype in features.items()})
    else:
        features = None
    memory_check = assert_arrow_memory_increases if keep_in_memory else assert_arrow_memory_doesnt_increase
    with memory_check():
        dataset = JsonDatasetReader(
            path,
            split=split,
            features=features,
            cache_dir=cache_dir,
            keep_in_memory=keep_in_memory,
            field=field,
        ).read()
    assert isinstance(dataset, Dataset)
    assert dataset.num_rows == 4
    assert dataset.num_columns == 3
    assert dataset.column_names == ["col_1", "col_2", "col_3"]
    assert dataset.split == expected_split
    for name, expected_dtype in expected_features.items():
        assert dataset.features[name].dtype == expected_dtype
def test_csv_datasetdict_reader(split, features, keep_in_memory, csv_path, tmp_path):
    """Read CSV file(s) into a DatasetDict via CsvDatasetReader and check the requested split.

    Consistency fix: use the shared ``assert_arrow_memory_increases`` /
    ``assert_arrow_memory_doesnt_increase`` context managers — as every sibling
    test in this file does — instead of hand-rolling the memory check with
    ``pa.total_allocated_bytes()`` deltas. The hand-rolled version deferred its
    memory assertion until after all other assertions, and duplicated logic the
    helpers already encapsulate; the semantics (memory grows iff
    ``keep_in_memory``) are unchanged.
    """
    # With an explicit split, read one file under that name; otherwise read
    # both "train" and "test" and verify the "train" entry.
    if split:
        path = {split: csv_path}
    else:
        split = "train"
        path = {"train": csv_path, "test": csv_path}
    cache_dir = tmp_path / "cache"
    # CSV file loses col_1 string dtype information: default now is "int64" instead of "string"
    default_expected_features = {"col_1": "int64", "col_2": "int64", "col_3": "float64"}
    expected_features = features.copy() if features else default_expected_features
    features = (
        Features({feature: Value(dtype) for feature, dtype in features.items()}) if features else None
    )
    with assert_arrow_memory_increases() if keep_in_memory else assert_arrow_memory_doesnt_increase():
        dataset = CsvDatasetReader(
            path, features=features, cache_dir=cache_dir, keep_in_memory=keep_in_memory
        ).read()
    assert isinstance(dataset, DatasetDict)
    dataset = dataset[split]
    assert dataset.num_rows == 4
    assert dataset.num_columns == 3
    assert dataset.column_names == ["col_1", "col_2", "col_3"]
    assert dataset.split == split
    for feature, expected_dtype in expected_features.items():
        assert dataset.features[feature].dtype == expected_dtype