Example #1
0
    def check_load_dataset(self, dataset_name, configs, is_local=False, use_local_dummy_data=False):
        for config in configs:
            with tempfile.TemporaryDirectory() as processed_temp_dir, tempfile.TemporaryDirectory() as raw_temp_dir:

                # create config and dataset
                dataset_builder_cls = self.load_builder_class(dataset_name, is_local=is_local)
                name = config.name if config is not None else None
                dataset_builder = dataset_builder_cls(name=name, cache_dir=processed_temp_dir)

                # TODO: skip Beam datasets and datasets that lack dummy data for now
                if not dataset_builder.test_dummy_data:
                    logger.info("Skip tests for this dataset for now")
                    return

                if config is not None:
                    version = config.version
                else:
                    version = dataset_builder.VERSION

                def check_if_url_is_valid(url):
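                    # a backslash in a remote URL usually means a local Windows-style path was joined into it by mistake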
                    if is_remote_url(url) and "\\" in url:
                        raise ValueError(f"Bad remote url '{url} since it contains a backslash")

                # create mock data loader manager that has a special download_and_extract() method to download dummy data instead of real data
                mock_dl_manager = MockDownloadManager(
                    dataset_name=dataset_name,
                    config=config,
                    version=version,
                    cache_dir=raw_temp_dir,
                    use_local_dummy_data=use_local_dummy_data,
                    download_callbacks=[check_if_url_is_valid],
                )

                # packaged datasets like csv, text, json or pandas require some data files
                if dataset_builder.__class__.__name__.lower() in _PACKAGED_DATASETS_MODULES:
                    mock_dl_manager.download_dummy_data()
                    path_to_dummy_data = mock_dl_manager.dummy_file
                    dataset_builder.config.data_files = get_packaged_dataset_dummy_data_files(
                        dataset_builder.__class__.__name__.lower(), path_to_dummy_data
                    )

                # mock the size info so it matches the dummy data instead of the actual dataset
                if dataset_builder.info is not None:
                    # rough upper bound on the size of the dummy data files
                    one_mega_byte = 2 << 19
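                    # 2 << 19 equals 2 ** 20 bytes (1 MiB), comfortably above the size of any dummy data file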
                    dataset_builder.info.size_in_bytes = 2 * one_mega_byte
                    dataset_builder.info.download_size = one_mega_byte
                    dataset_builder.info.dataset_size = one_mega_byte

                # generate examples from dummy data
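                # FORCE_REDOWNLOAD makes the builder regenerate from the dummy data instead of reusing a cached copy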
                dataset_builder.download_and_prepare(
                    dl_manager=mock_dl_manager,
                    download_mode=GenerateMode.FORCE_REDOWNLOAD,
                    ignore_verifications=True,
                    try_from_hf_gcs=False,
                )

                # get dataset
                dataset = dataset_builder.as_dataset(ignore_verifications=True)

                # check that all declared splits were built and are not empty
                self.parent.assertListEqual(sorted(dataset_builder.info.splits.keys()), sorted(dataset))
                for split in dataset_builder.info.splits.keys():
                    # check that loaded dataset is not empty
                    self.parent.assertTrue(len(dataset[split]) > 0)
                del dataset
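
The helper get_packaged_dataset_dummy_data_files is only called above, never shown. Below is a minimal sketch of what it could look like, assuming it simply maps a packaged module name to a file extension and builds the same train/test/dev layout that the second snippet below spells out branch by branch; the extension table and the function body are assumptions, not the library's actual implementation.

import os

# hypothetical mapping from packaged module name to the file extension used by its dummy data
_DUMMY_DATA_EXTENSIONS = {"csv": "csv", "json": "json", "pandas": "pkl", "text": "txt"}


def get_packaged_dataset_dummy_data_files(module_name, path_to_dummy_data):
    # build a split -> file path mapping that points the packaged builder at the dummy files
    extension = _DUMMY_DATA_EXTENSIONS[module_name]
    return {
        split: os.path.join(path_to_dummy_data, f"{split}.{extension}")
        for split in ("train", "test", "dev")
    }

With such a helper, the four near-identical Csv/Json/Pandas/Text branches of the older version below collapse into the single lookup used above.
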
    def check_load_dataset(self, dataset_name, configs, is_local=False):
        # the caller is expected to pass only the first config, to speed up testing
        for config in configs:
            with tempfile.TemporaryDirectory() as processed_temp_dir, tempfile.TemporaryDirectory() as raw_temp_dir:

                # create config and dataset
                dataset_builder_cls = self.load_builder_class(
                    dataset_name, is_local=is_local)
                name = config.name if config is not None else None
                dataset_builder = dataset_builder_cls(
                    name=name, cache_dir=processed_temp_dir)

                # TODO: skip Beam datasets and datasets that lack dummy data for now
                if not dataset_builder.test_dummy_data:
                    logger.info("Skip tests for this dataset for now")
                    return

                if config is not None:
                    version = config.version
                else:
                    version = dataset_builder.VERSION

                # create mock data loader manager that has a special download_and_extract() method to download dummy data instead of real data
                mock_dl_manager = MockDownloadManager(
                    dataset_name=dataset_name,
                    config=config,
                    version=version,
                    cache_dir=raw_temp_dir,
                    is_local=is_local,
                )

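                # packaged builders like csv, json, pandas and text read data files directly, so their config must point at the dummy files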
                if dataset_builder.__class__.__name__ == "Csv":
                    # needs a slight adaptation for the csv dataset
                    mock_dl_manager.download_dummy_data()
                    path_to_dummy_data = mock_dl_manager.dummy_file
                    dataset_builder.config.data_files = {
                        "train": os.path.join(path_to_dummy_data, "train.csv"),
                        "test": os.path.join(path_to_dummy_data, "test.csv"),
                        "dev": os.path.join(path_to_dummy_data, "dev.csv"),
                    }
                elif dataset_builder.__class__.__name__ == "Json":
                    # needs a slight adaptation for the json dataset
                    mock_dl_manager.download_dummy_data()
                    path_to_dummy_data = mock_dl_manager.dummy_file
                    dataset_builder.config.data_files = {
                        "train": os.path.join(path_to_dummy_data,
                                              "train.json"),
                        "test": os.path.join(path_to_dummy_data, "test.json"),
                        "dev": os.path.join(path_to_dummy_data, "dev.json"),
                    }
                elif dataset_builder.__class__.__name__ == "Pandas":
                    # needs a slight adaptation for the pandas dataset
                    mock_dl_manager.download_dummy_data()
                    path_to_dummy_data = mock_dl_manager.dummy_file
                    dataset_builder.config.data_files = {
                        "train": os.path.join(path_to_dummy_data, "train.pkl"),
                        "test": os.path.join(path_to_dummy_data, "test.pkl"),
                        "dev": os.path.join(path_to_dummy_data, "dev.pkl"),
                    }
                elif dataset_builder.__class__.__name__ == "Text":
                    mock_dl_manager.download_dummy_data()
                    path_to_dummy_data = mock_dl_manager.dummy_file
                    dataset_builder.config.data_files = {
                        "train": os.path.join(path_to_dummy_data, "train.txt"),
                        "test": os.path.join(path_to_dummy_data, "test.txt"),
                        "dev": os.path.join(path_to_dummy_data, "dev.txt"),
                    }

                # mock the size info so it matches the dummy data instead of the actual dataset
                if dataset_builder.info is not None:
                    # rough upper bound on the size of the dummy data files
                    one_mega_byte = 2 << 19
                    dataset_builder.info.size_in_bytes = 2 * one_mega_byte
                    dataset_builder.info.download_size = one_mega_byte
                    dataset_builder.info.dataset_size = one_mega_byte

                # generate examples from dummy data
                dataset_builder.download_and_prepare(
                    dl_manager=mock_dl_manager,
                    download_mode=GenerateMode.FORCE_REDOWNLOAD,
                    ignore_verifications=True,
                    try_from_hf_gcs=False,
                )

                # get dataset
                dataset = dataset_builder.as_dataset()

                # check that dataset is not empty
                for split in dataset_builder.info.splits.keys():
                    # check that loaded dataset is not empty
                    self.parent.assertTrue(len(dataset[split]) > 0)
                del dataset
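
Both versions rely on MockDownloadManager, described in the comments as a download manager whose download_and_extract() hands back local dummy data instead of fetching anything. The snippet below is a simplified, hypothetical stand-in (not the real MockDownloadManager) illustrating that idea, including the download_callbacks hook the first version uses to validate URLs.

class TinyMockDownloadManager:
    # hypothetical, stripped-down illustration of the mock download manager idea
    def __init__(self, dummy_file, download_callbacks=None):
        self.dummy_file = dummy_file
        self.download_callbacks = download_callbacks or []

    def download_and_extract(self, url_or_urls):
        # run every registered callback (e.g. check_if_url_is_valid) on each requested URL
        urls = url_or_urls if isinstance(url_or_urls, (list, tuple)) else [url_or_urls]
        for url in urls:
            for callback in self.download_callbacks:
                callback(url)
        # never download anything: always hand back the path to the local dummy data
        return self.dummy_file

A builder's split-generation code can call download_and_extract() exactly as it would on the real download manager; every path it gets back simply points at the local dummy data.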