Example #1
    def test_base_path(self, commit):
        """Components are keyed relative to `base_path`, so the base itself is not a valid download key."""
        reference_dir = "tiny-files/"
        os.mkdir(reference_dir)
        # three .file files in tiny-files/
        for filename in ["tiny{}.file".format(i) for i in range(3)]:
            with open(os.path.join(reference_dir, filename), 'wb') as f:
                f.write(os.urandom(2**16))

        sub_dir = "bin/"
        os.mkdir(os.path.join(reference_dir, sub_dir))
        # three .bin files in tiny-files/bin/
        for filename in ["tiny{}.bin".format(i) for i in range(3)]:
            with open(os.path.join(reference_dir, sub_dir, filename),
                      'wb') as f:
                f.write(os.urandom(2**16))

        # commit dataset blob
        blob_path = "data"
        dataset = verta.dataset.Path(
            reference_dir,
            base_path=reference_dir,
            enable_mdb_versioning=True,
        )
        commit.update(blob_path, dataset)
        commit.save("Version data.")
        dataset = commit.get(blob_path)

        # `reference_dir` was stripped as the base path, so it's not a valid component path
        with pytest.raises(KeyError):
            dataset.download(reference_dir)

        dirpath = dataset.download()
        assert os.path.abspath(dirpath) != os.path.abspath(reference_dir)
        assert_dirs_match(dirpath, reference_dir)
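These snippets are test methods excerpted from a larger suite, so they assume module-level imports (os, shutil, tempfile, pytest, pathlib2, verta, and the suite's own _dataset and utils modules) plus shared helpers. One such helper is assert_dirs_match, used above and throughout; a minimal sketch of what it plausibly does, assuming a recursive byte-for-byte comparison (the suite's real implementation may differ):

import filecmp
import os

def assert_dirs_match(dirpath1, dirpath2):
    """Recursively assert two directories contain identical files and contents."""
    cmp = filecmp.dircmp(dirpath1, dirpath2)
    assert not cmp.left_only   # no files unique to dirpath1
    assert not cmp.right_only  # no files unique to dirpath2
    # compare common files byte-for-byte (dircmp's own check is shallow)
    _, mismatches, errors = filecmp.cmpfiles(
        dirpath1, dirpath2, cmp.common_files, shallow=False,
    )
    assert not mismatches and not errors
    for subdir in cmp.common_dirs:  # recurse into shared subdirectories
        assert_dirs_match(os.path.join(dirpath1, subdir),
                          os.path.join(dirpath2, subdir))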
Example #2
    def test_mngd_ver_folder(self, commit):
        """Download a managed-versioned folder to implicit, non-colliding, and explicit paths."""
        reference_dir = "reference/"
        dirname = "tiny-files/"
        os.mkdir(dirname)
        for filename in ["tiny{}.bin".format(i) for i in range(3)]:
            with open(os.path.join(dirname, filename), 'wb') as f:
                f.write(os.urandom(2**16))

        blob_path = "data"
        dataset = verta.dataset.Path(dirname, enable_mdb_versioning=True)
        commit.update(blob_path, dataset)
        commit.save("Version data.")
        os.rename(dirname, reference_dir)  # move sources to avoid collision
        dataset = commit.get(blob_path)

        # download to implicit path
        dirpath = dataset.download(dirname)
        assert os.path.isdir(dirpath)
        assert dirpath == os.path.abspath(dirname)
        assert_dirs_match(dirpath, reference_dir)

        # download to implicit path without collision
        dirpath2 = dataset.download(dirname)
        assert os.path.isdir(dirpath2)
        assert dirpath2 != dirpath
        assert_dirs_match(dirpath2, reference_dir)

        # download to explicit path with overwrite
        last_updated = os.path.getmtime(dirpath)
        dirpath3 = dataset.download(dirname, dirpath)
        assert dirpath3 == dirpath
        assert_dirs_match(dirpath3, reference_dir)
        assert os.path.getmtime(dirpath) > last_updated
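The three-step pattern above recurs throughout these examples: download(component) with no destination materializes the data at the component's own path; calling it again detects the collision and returns a fresh sibling path rather than overwriting; passing an explicit destination overwrites in place, which the mtime comparison confirms.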
Example #3
    def test_mngd_ver_rollback(self, commit):
        """Recover a versioned file by loading a prior commit."""
        filename = "tiny1.bin"
        file1_contents = os.urandom(2**16)
        with open(filename, 'wb') as f:
            f.write(file1_contents)
        blob_path = "data"

        dataset = verta.dataset.Path(filename, enable_mdb_versioning=True)
        commit.update(blob_path, dataset)
        commit.save("First file.")

        # new file with same name
        os.remove(filename)
        file2_contents = os.urandom(2**16)
        with open(filename, 'wb') as f:
            f.write(file2_contents)

        dataset = verta.dataset.Path(filename, enable_mdb_versioning=True)
        commit.update(blob_path, dataset)
        commit.save("Second file.")

        # check latest commit's file
        dataset = commit.get(blob_path)
        new_filename = dataset.download(filename)
        with open(new_filename, 'rb') as f:
            assert f.read() == file2_contents

        # recover previous commit's file
        commit = commit.parent
        dataset = commit.get(blob_path)
        new_filename = dataset.download(filename)
        with open(new_filename, 'rb') as f:
            assert f.read() == file1_contents
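Every example receives a commit fixture that isn't shown here. A hypothetical sketch of what it might look like, assuming verta's legacy repository/commit API and an arbitrary repository name (the host, credentials, and name are all assumptions, not part of the original suite):

import pytest
import verta

@pytest.fixture
def commit():
    client = verta.Client("https://app.verta.ai")  # host and credentials assumed
    repo = client.get_or_create_repository("dataset-tests")  # hypothetical name
    return repo.get_commit(branch="master")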
Example #4
    def test_mngd_ver_file(self, commit):
        """Download a managed-versioned file to implicit, non-colliding, and explicit paths."""
        filename = "tiny1.bin"
        FILE_CONTENTS = os.urandom(2**16)
        with open(filename, 'wb') as f:
            f.write(FILE_CONTENTS)
        blob_path = "data"

        dataset = verta.dataset.Path(filename, enable_mdb_versioning=True)
        commit.update(blob_path, dataset)
        commit.save("Version data.")
        os.remove(filename)  # delete for first download test
        dataset = commit.get(blob_path)

        # download to implicit path
        filepath = dataset.download(filename)
        assert os.path.isfile(filepath)
        assert filepath == os.path.abspath(filename)
        with open(filepath, 'rb') as f:
            assert f.read() == FILE_CONTENTS

        # download to implicit path without collision
        filepath2 = dataset.download(filename)
        assert os.path.isfile(filepath2)
        assert filepath2 != filepath
        with open(filepath2, 'rb') as f:
            assert f.read() == FILE_CONTENTS

        # download to explicit path with overwrite
        last_updated = os.path.getmtime(filepath)
        filepath3 = dataset.download(filename, filepath)
        assert filepath3 == filepath
        with open(filepath3, 'rb') as f:
            assert f.read() == FILE_CONTENTS
        assert os.path.getmtime(filepath) > last_updated
Example #5
    def test_mngd_ver_folder(self, commit, in_tempdir):
        """Like the local-folder case, but versioning a folder from S3."""
        boto3 = pytest.importorskip("boto3")
        s3 = boto3.client('s3')

        bucket = "verta-versioned-bucket"
        dirname = "tiny-files/"
        s3_folder = "s3://{}/{}".format(bucket, dirname)
        blob_path = "data"

        # get files' contents directly from S3 for reference
        FILE_CONTENTS = dict()  # filename to contents
        for s3_obj in s3.list_objects_v2(Bucket=bucket, Prefix=dirname)['Contents']:
            with tempfile.NamedTemporaryFile('wb', delete=False) as tempf:
                s3.download_fileobj(bucket, s3_obj['Key'], tempf)
            with open(tempf.name, 'rb') as f:
                FILE_CONTENTS[os.path.basename(s3_obj['Key'])] = f.read()
            os.remove(tempf.name)
        assert FILE_CONTENTS

        # commit dataset blob
        dataset = verta.dataset.S3(s3_folder, enable_mdb_versioning=True)
        commit.update(blob_path, dataset)
        commit.save("Version data.")
        dataset = commit.get(blob_path)

        # download to implicit path
        dirpath = dataset.download(s3_folder)
        assert os.path.isdir(dirpath)
        assert dirpath == os.path.abspath(dirname)
        for filename in os.listdir(dirpath):
            with open(os.path.join(dirpath, filename), 'rb') as f:
                assert f.read() == FILE_CONTENTS[filename]

        # download to implicit path without collision
        dirpath2 = dataset.download(s3_folder)
        assert os.path.isdir(dirpath2)
        assert dirpath2 != dirpath
        for filename in os.listdir(dirpath2):
            with open(os.path.join(dirpath2, filename), 'rb') as f:
                assert f.read() == FILE_CONTENTS[filename]

        # download to explicit path with overwrite
        last_updated = os.path.getmtime(dirpath)
        dirpath3 = dataset.download(s3_folder, dirpath)
        assert dirpath3 == dirpath
        for filename in os.listdir(dirpath3):
            with open(os.path.join(dirpath3, filename), 'rb') as f:
                assert f.read() == FILE_CONTENTS[filename]
        assert os.path.getmtime(dirpath) > last_updated
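Some examples, like the one above, also take an in_tempdir fixture. A minimal sketch, assuming it simply runs the test from inside a fresh temporary directory using pytest's built-in fixtures:

import pytest

@pytest.fixture
def in_tempdir(tmp_path, monkeypatch):
    monkeypatch.chdir(tmp_path)  # cwd is restored automatically after the test
    return tmp_path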
Example #6
    def test_mngd_ver_to_sibling_dir(self, commit):
        """Download to sibling directory works as expected."""
        child_dirname = "child"
        os.mkdir(child_dirname)
        sibling_dirname = "sibling"
        os.mkdir(sibling_dirname)
        filename = "tiny1.bin"
        FILE_CONTENTS = os.urandom(2**16)

        with utils.chdir(child_dirname):
            with open(filename, 'wb') as f:
                f.write(FILE_CONTENTS)
            blob_path = "data"

            dataset = verta.dataset.Path(filename, enable_mdb_versioning=True)
            commit.update(blob_path, dataset)
            commit.save("First file.")
            dataset = commit.get(blob_path)

            # download to sibling dir
            download_to_path = os.path.join("..", sibling_dirname, filename)
            filepath = dataset.download(filename, download_to_path)
            assert os.path.isfile(filepath)
            assert filepath == os.path.abspath(download_to_path)
            with open(filepath, 'rb') as f:
                assert f.read() == FILE_CONTENTS
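utils.chdir above is another suite helper; a plausible sketch, assuming a standard save-and-restore context manager around os.chdir:

import contextlib
import os

@contextlib.contextmanager
def chdir(dirpath):
    """Temporarily change the working directory for the enclosed block."""
    original = os.getcwd()
    os.chdir(dirpath)
    try:
        yield dirpath
    finally:
        os.chdir(original)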
Example #7
    def test_download_all(self, commit):
        """`download()` with no arguments materializes the whole S3 blob under the default directory."""
        s3 = pytest.importorskip("boto3").client('s3')

        bucket = "verta-versioned-bucket"
        dirname = "tiny-files/"
        s3_folder = "s3://{}/{}".format(bucket, dirname)

        # get files' contents directly from S3 for reference
        reference_dir = "reference/"
        for s3_obj in s3.list_objects_v2(Bucket=bucket,
                                         Prefix=dirname)['Contents']:
            key = s3_obj['Key']
            filepath = os.path.join(reference_dir, bucket, key)
            pathlib2.Path(filepath).parent.mkdir(
                parents=True, exist_ok=True)  # create parent dirs

            s3.download_file(bucket, key, filepath)

        # commit dataset blob
        blob_path = "data"
        dataset = verta.dataset.S3(s3_folder, enable_mdb_versioning=True)
        commit.update(blob_path, dataset)
        commit.save("Version data.")
        dataset = commit.get(blob_path)

        dirpath = dataset.download()
        assert dirpath == os.path.abspath(_dataset.DEFAULT_DOWNLOAD_DIR)

        assert os.path.isdir(dirpath)
        assert_dirs_match(dirpath, reference_dir)
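Here _dataset.DEFAULT_DOWNLOAD_DIR is a constant from the verta client's dataset module: the fallback destination used when download() is called with no arguments, which is why the test compares against the constant rather than hardcoding a directory name.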
Example #8
    def test_concat(self, commit):
        """Concatenated Path blobs version and download as a single filetree."""
        reference_dir = "tiny-files/"
        os.mkdir(reference_dir)
        # two .file files in tiny-files/
        for filename in ["tiny{}.file".format(i) for i in range(2)]:
            with open(os.path.join(reference_dir, filename), 'wb') as f:
                f.write(os.urandom(2**16))

        # create and concatenate datasets
        dataset1 = verta.dataset.Path(
            "tiny-files/tiny0.file",
            enable_mdb_versioning=True,
        )
        dataset2 = verta.dataset.Path(
            "tiny-files/tiny1.file",
            enable_mdb_versioning=True,
        )
        dataset = dataset1 + dataset2

        blob_path = "data"
        commit.update(blob_path, dataset)
        commit.save("Version data.")
        dataset = commit.get(blob_path)

        dirpath = dataset.download()
        dirpath = os.path.join(
            dirpath, reference_dir)  # "tiny-files/" nested in new dir
        assert_dirs_match(dirpath, reference_dir)
Example #9
    def test_mngd_ver_folder(self, commit):
        """Download an S3-versioned folder, comparing whole trees against a local reference."""
        s3 = pytest.importorskip("boto3").client('s3')

        bucket = "verta-versioned-bucket"
        dirname = "tiny-files/"
        s3_folder = "s3://{}/{}".format(bucket, dirname)
        blob_path = "data"

        # get files' contents directly from S3 for reference
        reference_dir = "reference/"
        for s3_obj in s3.list_objects_v2(Bucket=bucket,
                                         Prefix=dirname)['Contents']:
            key = s3_obj['Key']
            filepath = os.path.join(reference_dir, key)
            pathlib2.Path(filepath).parent.mkdir(
                parents=True, exist_ok=True)  # create parent dirs

            s3.download_file(bucket, key, filepath)

        # The blob's components are keyed relative to the S3 prefix `dirname`, so the
        # downloaded tree won't nest another `dirname` level inside; descend into
        # `reference_dir` to compare like with like.
        reference_dir = os.path.join(reference_dir, dirname)

        # commit dataset blob
        dataset = verta.dataset.S3(s3_folder, enable_mdb_versioning=True)
        commit.update(blob_path, dataset)
        commit.save("Version data.")
        dataset = commit.get(blob_path)

        # download to implicit path
        dirpath = dataset.download(s3_folder)
        assert os.path.isdir(dirpath)
        assert dirpath == os.path.abspath(dirname)
        assert_dirs_match(dirpath, reference_dir)

        # download to implicit path without collision
        dirpath2 = dataset.download(s3_folder)
        assert os.path.isdir(dirpath2)
        assert dirpath2 != dirpath
        assert_dirs_match(dirpath2, reference_dir)

        # download to explicit path with overwrite
        last_updated = os.path.getmtime(dirpath)
        dirpath3 = dataset.download(s3_folder, dirpath)
        assert dirpath3 == dirpath
        assert_dirs_match(dirpath3, reference_dir)
        assert os.path.getmtime(dirpath) > last_updated
Example #10
    def test_mngd_ver_file(self, commit, in_tempdir):
        """Download a managed-versioned S3 file to implicit, non-colliding, and explicit paths."""
        boto3 = pytest.importorskip("boto3")
        s3 = boto3.client('s3')

        filename = "tiny1.bin"
        bucket = "verta-versioned-bucket"
        key = "tiny-files/{}".format(filename)
        s3_key = "s3://{}/{}".format(bucket, key)
        blob_path = "data"

        # get file contents directly from S3 for reference
        s3.download_file(bucket, key, filename)
        with open(filename, 'rb') as f:
            FILE_CONTENTS = f.read()
        os.remove(filename)

        # commit dataset blob
        dataset = verta.dataset.S3(s3_key, enable_mdb_versioning=True)
        commit.update(blob_path, dataset)
        commit.save("Version data.")
        dataset = commit.get(blob_path)

        # download to implicit path
        filepath = dataset.download(s3_key)
        assert os.path.isfile(filepath)
        assert filepath == os.path.abspath(filename)
        with open(filepath, 'rb') as f:
            assert f.read() == FILE_CONTENTS

        # download to implicit path without collision
        filepath2 = dataset.download(s3_key)
        assert os.path.isfile(filepath2)
        assert filepath2 != filepath
        with open(filepath2, 'rb') as f:
            assert f.read() == FILE_CONTENTS

        # download to explicit path with overwrite
        last_updated = os.path.getmtime(filepath)
        filepath3 = dataset.download(s3_key, filepath)
        assert filepath3 == filepath
        with open(filepath3, 'rb') as f:
            assert f.read() == FILE_CONTENTS
        assert os.path.getmtime(filepath) > last_updated
Example #11
    def test_mngd_ver_folder(self, commit, in_tempdir):
        """Download a managed-versioned local folder to implicit, non-colliding, and explicit paths."""
        dirname = "tiny-files/"
        os.mkdir(dirname)
        FILE_CONTENTS = {  # filename to contents
            "tiny{}.bin".format(i): os.urandom(2**16)
            for i in range(3)
        }
        for filename, contents in FILE_CONTENTS.items():
            with open(os.path.join(dirname, filename), 'wb') as f:
                f.write(contents)
        blob_path = "data"

        dataset = verta.dataset.Path(dirname, enable_mdb_versioning=True)
        commit.update(blob_path, dataset)
        commit.save("Version data.")
        shutil.rmtree(dirname)  # delete for first download test
        dataset = commit.get(blob_path)

        # download to implicit path
        dirpath = dataset.download(dirname)
        assert os.path.isdir(dirpath)
        assert dirpath == os.path.abspath(dirname)
        for filename in os.listdir(dirpath):
            with open(os.path.join(dirpath, filename), 'rb') as f:
                assert f.read() == FILE_CONTENTS[filename]

        # download to implicit path without collision
        dirpath2 = dataset.download(dirname)
        assert os.path.isdir(dirpath2)
        assert dirpath2 != dirpath
        for filename in os.listdir(dirpath2):
            with open(os.path.join(dirpath2, filename), 'rb') as f:
                assert f.read() == FILE_CONTENTS[filename]

        # download to explicit path with overwrite
        last_updated = os.path.getmtime(dirpath)
        dirpath3 = dataset.download(dirname, dirpath)
        assert dirpath3 == dirpath
        for filename in os.listdir(dirpath3):
            with open(os.path.join(dirpath3, filename), 'rb') as f:
                assert f.read() == FILE_CONTENTS[filename]
        assert os.path.getmtime(dirpath) > last_updated
Example #12
    def test_not_to_s3_dir(self, commit):
        """If the user specifies "s3://", things shouldn't go into an "s3:" dir."""
        bucket = "verta-versioned-bucket"
        dirname = "tiny-files/"
        s3_folder = "s3://{}/{}".format(bucket, dirname)
        blob_path = "data"

        # commit dataset blob
        dataset = verta.dataset.S3(s3_folder, enable_mdb_versioning=True)
        commit.update(blob_path, dataset)
        commit.save("Version data.")
        dataset = commit.get(blob_path)

        dirpath = dataset.download("s3://")
        assert "s3:" not in pathlib2.Path(dirpath).parts
Example #13
    def test_download_all(self, commit):
        """`download()` with no arguments recreates the local filetree under the default directory."""
        reference_dir = "tiny-files/"
        os.mkdir(reference_dir)
        for filename in ["tiny{}.bin".format(i) for i in range(3)]:
            with open(os.path.join(reference_dir, filename), 'wb') as f:
                f.write(os.urandom(2**16))

        # commit dataset blob
        blob_path = "data"
        dataset = verta.dataset.Path(reference_dir, enable_mdb_versioning=True)
        commit.update(blob_path, dataset)
        commit.save("Version data.")
        dataset = commit.get(blob_path)

        dirpath = dataset.download()
        assert dirpath == os.path.abspath(_dataset.DEFAULT_DOWNLOAD_DIR)

        # uploaded filetree was recreated within `DEFAULT_DOWNLOAD_DIR`
        destination_dir = os.path.join(_dataset.DEFAULT_DOWNLOAD_DIR,
                                       reference_dir)
        assert os.path.isdir(destination_dir)
        assert_dirs_match(destination_dir, reference_dir)
Example #14
    def test_concat(self, commit):
        """Concatenated S3 blobs version and download as a single filetree."""
        s3 = pytest.importorskip("boto3").client('s3')

        bucket1 = "verta-starter"
        key1 = "models/model.pkl"
        bucket2 = "verta-versioned-bucket"
        key2 = "tiny-files/tiny2.bin"

        # create dir for reference files
        reference_dir = "reference"
        filepath1 = os.path.join(reference_dir, bucket1, key1)
        pathlib2.Path(filepath1).parent.mkdir(parents=True, exist_ok=True)
        filepath2 = os.path.join(reference_dir, bucket2, key2)
        pathlib2.Path(filepath2).parent.mkdir(parents=True, exist_ok=True)

        # download files directly from S3 for reference
        s3.download_file(bucket1, key1, filepath1)
        s3.download_file(bucket2, key2, filepath2)

        # create and concatenate datasets
        dataset1 = verta.dataset.S3(
            "s3://{}/{}".format(bucket1, key1),
            enable_mdb_versioning=True,
        )
        dataset2 = verta.dataset.S3(
            "s3://{}/{}".format(bucket2, key2),
            enable_mdb_versioning=True,
        )
        dataset = dataset1 + dataset2

        blob_path = "data"
        commit.update(blob_path, dataset)
        commit.save("Version data.")
        dataset = commit.get(blob_path)

        dirpath = dataset.download()
        assert_dirs_match(dirpath, reference_dir)