Example #1
0
    def test_compression_extractors(self, compressed_file, dataset_base_url,
                                    dataset_dir, gmb_schema, tmp_path):
        """Test compression extractors (gzip, bzip2, and lzma) on a single compressed file.

        The schema is pointed at ``extractables/<compressed_file>`` together with the SHA-512 digest of the local
        copy of that file, and a ``Dataset`` is created in ``DOWNLOAD_ONLY`` mode. ``is_downloaded()`` must then be
        ``True``, and must become ``False`` each time the ``contents`` entry of the stored file list is tampered with.
        """

        # The fixture dict is updated in place (no copy is taken).
        schema = gmb_schema
        schema['download_url'] = dataset_base_url + '/extractables/' + compressed_file
        local_archive = dataset_dir / ('extractables/' + compressed_file)
        digest = hashlib.sha512(local_archive.read_bytes())
        schema['sha512sum'] = digest.hexdigest()

        dataset = Dataset(schema,
                          data_dir=tmp_path,
                          mode=Dataset.InitializationMode.DOWNLOAD_ONLY)
        assert dataset.is_downloaded() is True

        # Keep the untouched file list around; every tampered variant is derived from it.
        with open(dataset._file_list_file, mode='r') as list_fh:
            pristine = json.load(list_fh)

        def expect_rejected(overrides: dict):
            """Overwrite the file list with ``overrides`` merged into ``contents`` and expect verification to fail."""

            tampered = copy.deepcopy(pristine)
            tampered['contents'].update(overrides)
            with open(dataset._file_list_file, mode='w') as list_fh:
                json.dump(tampered, list_fh)
            assert dataset.is_downloaded() is False

        # Case 1: the recorded file name does not exist
        expect_rejected({'filename': 'non-existing-file'})

        # Case 2: the recorded size is off by 100
        inflated = copy.deepcopy(pristine['contents'])
        inflated['size'] += 100
        expect_rejected(inflated)
Example #2
0
    def test_zip_extractor(self, dataset_base_url, dataset_dir, gmb_schema,
                           tmp_path):
        """Test _ZipExtractor to make sure zip datasets are properly extracted and verified.

        The schema is pointed at ``extractables/test.zip`` with the SHA-512 digest of the local copy, and a
        ``Dataset`` is created in ``DOWNLOAD_ONLY`` mode. ``is_downloaded()`` must then be ``True``, and must become
        ``False`` for each tampered variant of the stored file list.
        """

        # The fixture dict is updated in place (no copy is taken).
        schema = gmb_schema
        schema['download_url'] = dataset_base_url + '/extractables/test.zip'
        local_zip = dataset_dir / 'extractables/test.zip'
        schema['sha512sum'] = hashlib.sha512(local_zip.read_bytes()).hexdigest()

        zip_dataset = Dataset(schema,
                              data_dir=tmp_path,
                              mode=Dataset.InitializationMode.DOWNLOAD_ONLY)
        assert zip_dataset.is_downloaded() is True

        # Keep the untouched file list around; every tampered variant is derived from it.
        with open(zip_dataset._file_list_file, mode='r') as list_fh:
            pristine = json.load(list_fh)

        def expect_rejected(overrides: dict):
            """Overwrite the file list with ``overrides`` merged into ``contents`` and expect verification to fail."""

            tampered = copy.deepcopy(pristine)
            tampered['contents'].update(overrides)
            with open(zip_dataset._file_list_file, mode='w') as list_fh:
                json.dump(tampered, list_fh)
            assert zip_dataset.is_downloaded() is False

        # Case 1: an entry for a file that is not there
        expect_rejected({'non-existing-file': {'isdir': False}})

        # Case 2: an entry recorded with the wrong file type
        expect_rejected({'test-dir/test.csv': {'isdir': True}})

        # Case 3: an entry whose recorded size is off by 100
        inflated_entry = copy.deepcopy(pristine['contents']['test-dir/test.txt'])
        inflated_entry['size'] += 100
        expect_rejected({'test-dir/test.txt': inflated_entry})
Example #3
0
    def test_is_downloaded(self, tmp_path, gmb_schema):
        """Test is_downloaded method using a ``.tar.gz`` archive.

        Covers three states: before ``download()`` (expects ``False``), after ``download()`` (expects ``True``), and
        after the file list has been overwritten with text that is not JSON (expects ``JSONDecodeError``).
        """

        target_dir = tmp_path / 'non-existing-dir'
        # Sanity check: the target directory must be absent before the dataset is created
        assert not target_dir.exists()

        dataset = Dataset(gmb_schema,
                          data_dir=target_dir,
                          mode=Dataset.InitializationMode.LAZY)
        assert dataset.is_downloaded() is False

        dataset.download()
        assert dataset.is_downloaded() is True

        # Corrupt the file list so that it can no longer be parsed as JSON
        dataset._file_list_file.write_text("nonsense\n", encoding='utf-8')
        # The exception value is not inspected: we are only interested in ensuring that the file isn't decodable.
        with pytest.raises(JSONDecodeError):
            dataset.is_downloaded()
Example #4
0
    def test_supported_file_extensions(self, dataset_base_url, dataset_dir,
                                       extractable, extractable_type,
                                       gmb_schema, tmp_path):
        """Test extract_data_files and verify_data_files to make sure proper extractors are used for various datasets.

        The schema is pointed at ``extractables/<extractable>`` with the SHA-512 digest of the local copy. After the
        dataset is created in ``DOWNLOAD_ONLY`` mode, the ``type`` stored in the file list must equal
        ``extractable_type``.
        """

        # The fixture dict is updated in place (no copy is taken).
        schema = gmb_schema
        schema['download_url'] = dataset_base_url + '/extractables/' + extractable
        local_archive = dataset_dir / 'extractables' / extractable
        schema['sha512sum'] = hashlib.sha512(local_archive.read_bytes()).hexdigest()

        dataset = Dataset(schema,
                          data_dir=tmp_path,
                          mode=Dataset.InitializationMode.DOWNLOAD_ONLY)
        assert dataset.is_downloaded() is True

        # The stored file list records which kind of archive was handled
        with open(dataset._file_list_file, mode='r') as list_fh:
            recorded = json.load(list_fh)
        assert recorded['type'] == extractable_type
Example #5
0
    def test_deleting_data_dir(self, tmp_path, gmb_schema):
        """Test ``Dataset.delete()``.

        ``delete()`` is called twice: once while the data directory does not exist (it must not raise and must not
        create the directory), and once after ``download()`` (the directory must be gone afterwards).
        """

        # The tmp_sub_dir fixture is deliberately not used: data_dir has to be non-existing when the test starts.
        target_dir = tmp_path / 'data-dir'
        dataset = Dataset(gmb_schema,
                          data_dir=target_dir,
                          mode=Dataset.InitializationMode.LAZY)

        # Sanity check: nothing on disk yet
        assert not target_dir.exists()
        # Deleting a directory that is not there must not raise ...
        dataset.delete()
        # ... and must leave it absent
        assert not target_dir.exists()

        dataset.download()
        # Sanity check: the download is in place and the directory is not empty
        assert dataset.is_downloaded()
        assert os.listdir(target_dir)

        # Deleting now must remove the directory itself
        dataset.delete()
        assert not target_dir.exists()