Beispiel #1
0
def test_downloaded_dataset_duplicate_sources(monkeypatch):
    """Two sources sharing one filename must be rejected with ValueError.

    Both ``DownloadSourceFile`` instances use the filename ``'test.txt'``
    but point at different URLs.  The decoration itself happens outside
    the ``pytest.raises`` block, so the error is only expected once the
    decorated function is called.
    """
    from batchup.datasets import dataset
    import hashlib

    tdir = test_config._setup_batchup_temp_and_urlretrieve(monkeypatch)
    try:
        url_a = 'http://someplace.com/other.txt'
        url_b = 'http://someplace.com/somethingelse.txt'

        # The expected digest of each source is the SHA-256 of its URL text.
        # NOTE(review): presumably the patched urlretrieve writes the URL as
        # the file content -- confirm against test_config.
        expected_sha_a = hashlib.sha256(url_a.encode('ascii')).hexdigest()
        expected_sha_b = hashlib.sha256(url_b.encode('ascii')).hexdigest()

        f1 = dataset.DownloadSourceFile('test.txt', url=url_a,
                                        sha256=expected_sha_a)
        # Same filename as f1 on purpose: this is the duplicate under test.
        f2 = dataset.DownloadSourceFile('test.txt', url=url_b,
                                        sha256=expected_sha_b)

        @dataset.fetch_and_convert_dataset([f1, f2], 'ds.txt')
        def downloaded_dataset(source_paths, target_path):
            raise RuntimeError('Should not get here')

        with pytest.raises(ValueError):
            downloaded_dataset()
    finally:
        # Always remove the temporary directory, even if the check above
        # fails, so a failing test does not leak state into later tests.
        test_config._teardown_batchup_temp(tdir)
Beispiel #2
0
def test_downloaded_dataset(monkeypatch):
    """Exercise ``fetch_and_convert_dataset`` end to end.

    Checks that:

    * a non-string, non-callable target filename raises ``TypeError``;
    * a source list containing a plain string raises ``TypeError``;
    * a valid dataset function produces the target file, whose content is
      the two source files concatenated;
    * the downloaded source files no longer exist afterwards;
    * a second invocation returns the same destination path.
    """
    from batchup.datasets import dataset
    import hashlib

    tdir = test_config._setup_batchup_temp_and_urlretrieve(monkeypatch)
    try:
        url_a = 'http://someplace.com/other.txt'
        url_b = 'http://someplace.com/somethingelse.txt'

        # The expected digest of each source is the SHA-256 of its URL text.
        # NOTE(review): presumably the patched urlretrieve writes the URL as
        # the file content -- the content assertion below relies on that.
        expected_sha_a = hashlib.sha256(url_a.encode('ascii')).hexdigest()
        expected_sha_b = hashlib.sha256(url_b.encode('ascii')).hexdigest()

        f1 = dataset.DownloadSourceFile('test.txt', url=url_a,
                                        sha256=expected_sha_a)
        f2 = dataset.DownloadSourceFile('test2.txt', url=url_b,
                                        sha256=expected_sha_b)

        # Target filename (last arg) must be a string or a callable
        with pytest.raises(TypeError):

            @dataset.fetch_and_convert_dataset([f1, f2], 2)
            def downloaded_dataset(source_paths, target_path):
                raise RuntimeError('Should not get here')

        # Source files must contain `AbstractSourceFile` instances
        with pytest.raises(TypeError):

            @dataset.fetch_and_convert_dataset([f1, 'test2.txt'], 'ds.txt')
            def downloaded_dataset(source_paths, target_path):
                raise RuntimeError('Should not get here')

        @dataset.fetch_and_convert_dataset([f1, f2], 'ds.txt')
        def downloaded_dataset(source_paths, target_path):
            p1, p2 = source_paths
            with open(target_path, 'w') as f_out:
                # Close each source as soon as it has been copied so no
                # handle is still open when the sources are removed.
                for src_path in (p1, p2):
                    with open(src_path, 'r') as f_in:
                        f_out.write(f_in.read())
            return target_path

        dest = downloaded_dataset()

        # Check the resulting file
        assert os.path.exists(dest)
        with open(dest, 'r') as f_dest:
            assert f_dest.read() == (f1.url + f2.url)

        # Ensure that the temporary 'downloaded' files have been cleaned up
        assert not os.path.exists(f1.path)
        assert not os.path.exists(f2.path)

        # Invoking a second time should re-use the existing file
        dest2 = downloaded_dataset()
        assert dest2 == dest
    finally:
        # Always remove the temporary directory, even on assertion failure.
        test_config._teardown_batchup_temp(tdir)
Beispiel #3
0
def test_DownloadSourceFile_acquire(monkeypatch):
    """Check ``DownloadSourceFile`` attributes, ``acquire`` and ``clean_up``.

    After construction the filename, temp filename, full path, URL and
    string form are verified.  ``acquire()`` must return the path inside
    the ``data/temp`` directory of the temporary tree and the file must
    exist there; ``clean_up()`` must remove it again.
    """
    from batchup.datasets import dataset
    import hashlib

    tdir = test_config._setup_batchup_temp_and_urlretrieve(monkeypatch)
    try:
        url = 'http://someplace.com/other.txt'
        # The expected digest is the SHA-256 of the URL text.
        # NOTE(review): presumably the patched urlretrieve writes the URL as
        # the file content -- confirm against test_config.
        expected_sha = hashlib.sha256(url.encode('ascii')).hexdigest()
        expected_path = os.path.join(tdir, 'data', 'temp', 'test.txt')

        f1 = dataset.DownloadSourceFile('test.txt', url=url,
                                        sha256=expected_sha)
        assert f1.filename == 'test.txt'
        assert f1.temp_filename == os.path.join('temp', 'test.txt')
        assert f1.path == expected_path
        assert f1.url == 'http://someplace.com/other.txt'
        assert str(f1) == \
            'downloadable file test.txt from http://someplace.com/other.txt'

        dest = f1.acquire()
        assert dest == expected_path

        assert os.path.exists(dest)

        # clean up
        f1.clean_up()
        assert not os.path.exists(dest)
    finally:
        # Always remove the temporary directory, even on assertion failure.
        test_config._teardown_batchup_temp(tdir)
Beispiel #4
0
def test_delete_dataset_cache(monkeypatch):
    """``delete_dataset_cache`` must remove a previously converted dataset.

    A dataset is first built from two downloadable sources and its content
    verified; then ``delete_dataset_cache('ds.txt')`` is called and the
    converted file is expected to be gone.
    """
    from batchup.datasets import dataset
    import hashlib

    tdir = test_config._setup_batchup_temp_and_urlretrieve(monkeypatch)
    try:
        url_a = 'http://someplace.com/other.txt'
        url_b = 'http://someplace.com/somethingelse.txt'

        # The expected digest of each source is the SHA-256 of its URL text.
        # NOTE(review): presumably the patched urlretrieve writes the URL as
        # the file content -- the content assertion below relies on that.
        expected_sha_a = hashlib.sha256(url_a.encode('ascii')).hexdigest()
        expected_sha_b = hashlib.sha256(url_b.encode('ascii')).hexdigest()

        f1 = dataset.DownloadSourceFile('test.txt', url=url_a,
                                        sha256=expected_sha_a)
        f2 = dataset.DownloadSourceFile('test2.txt', url=url_b,
                                        sha256=expected_sha_b)

        @dataset.fetch_and_convert_dataset([f1, f2], 'ds.txt')
        def downloaded_dataset(source_paths, target_path):
            p1, p2 = source_paths
            with open(target_path, 'w') as f_out:
                # Close each source as soon as it has been copied so no
                # handle is still open when the sources are removed.
                for src_path in (p1, p2):
                    with open(src_path, 'r') as f_in:
                        f_out.write(f_in.read())
            return target_path

        dest = downloaded_dataset()

        # Check the resulting file
        assert os.path.exists(dest)
        with open(dest, 'r') as f_dest:
            assert f_dest.read() == (f1.url + f2.url)

        # Ensure that the temporary 'downloaded' files have been cleaned up
        assert not os.path.exists(f1.path)
        assert not os.path.exists(f2.path)

        # Delete the dataset cache; provide the filename
        dataset.delete_dataset_cache('ds.txt')
        assert not os.path.exists(dest)
    finally:
        # Always remove the temporary directory, even on assertion failure.
        test_config._teardown_batchup_temp(tdir)
Beispiel #5
0
def test_DownloadSourceFile_constructor(monkeypatch):
    """Check the attributes set by the ``DownloadSourceFile`` constructor.

    Covers three cases: an explicit ``url``, a ``base_url`` from which the
    URL is formed by appending the filename, and neither being given,
    which must raise ``TypeError``.
    """
    from batchup.datasets import dataset

    _patch_config_datadir(monkeypatch)

    # Explicit URL: used verbatim.
    f1 = dataset.DownloadSourceFile('test.txt',
                                    url='http://someplace.com/other.txt')
    assert f1.filename == 'test.txt'
    assert f1.temp_filename == os.path.join('temp', 'test.txt')
    assert f1.path == os.path.join(_get_data_dir(), f1.temp_filename)
    assert f1.url == 'http://someplace.com/other.txt'
    assert str(f1) == \
        'downloadable file test.txt from http://someplace.com/other.txt'

    # Base URL: the filename is appended to form the full URL.
    f2 = dataset.DownloadSourceFile('test.txt',
                                    base_url='http://someplace.com')
    assert f2.filename == 'test.txt'
    assert f2.temp_filename == os.path.join('temp', 'test.txt')
    # Compare against f2's own temp filename (previously f1's was used).
    assert f2.path == os.path.join(_get_data_dir(), f2.temp_filename)
    assert f2.url == 'http://someplace.com/test.txt'

    # Neither `url` nor `base_url` supplied.
    with pytest.raises(TypeError):
        dataset.DownloadSourceFile('test.txt')