def test_overwrite(self, client, created_entities, experiment_run, s3_bucket):
    dataset = client.set_dataset(type="local")
    created_entities.append(dataset)
    dataset_version = dataset.create_version(__file__)
    experiment_run.log_dataset_version('train', dataset_version)

    new_dataset_version = dataset.create_version("conftest.py")
    experiment_run.log_dataset_version('train', new_dataset_version, overwrite=True)

    retrieved_dataset_version = experiment_run.get_dataset_version('train')
    path = retrieved_dataset_version.dataset_version.path_dataset_version_info.base_path
    assert path.endswith("conftest.py")

def test_reincarnation(self, client, created_datasets):
    """Consecutive identical versions are assigned the same ID."""
    dataset = client.set_dataset(type="local")
    created_datasets.append(dataset)

    version1 = dataset.create_version(path=__file__)
    version2 = dataset.create_version(path=__file__)
    assert version1.id == version2.id

    versions = dataset.get_all_versions()
    assert len(versions) == 1

    version = dataset.get_latest_version(ascending=True)
    assert version.id == version1.id

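# NOTE: the `created_datasets`/`created_entities` fixtures used throughout this
# file are defined in conftest.py and are not shown in this section. A minimal
# sketch of such a cleanup fixture (hypothetical name and cleanup call, assuming
# registered entities expose a `delete()` method):
@pytest.fixture
def created_datasets_sketch():
    to_delete = []
    yield to_delete  # tests append the datasets they create
    for dataset in to_delete:
        dataset.delete()  # clean up once the test finishes
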
def test_mngd_ver_to_sibling_dir(self, dataset):
    """Download to sibling directory works as expected."""
    child_dirname = "child"
    os.mkdir(child_dirname)
    sibling_dirname = "sibling"
    os.mkdir(sibling_dirname)
    filename = "tiny1.bin"
    FILE_CONTENTS = os.urandom(2**16)

    with utils.chdir(child_dirname):
        with open(filename, 'wb') as f:
            f.write(FILE_CONTENTS)

        blob_path = "data"
        dataset_blob = verta.dataset.Path(filename, enable_mdb_versioning=True)
        dataset_blob = dataset.create_version(dataset_blob).get_content()

        # download to sibling dir
        download_to_path = os.path.join("..", sibling_dirname, filename)
        filepath = dataset_blob.download(filename, download_to_path)
        assert os.path.isfile(filepath)
        assert filepath == os.path.abspath(download_to_path)
        with open(filepath, 'rb') as f:
            assert f.read() == FILE_CONTENTS

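# NOTE: `utils.chdir` above is a test-suite helper not shown in this section.
# A minimal sketch of such a context manager (hypothetical implementation,
# assuming it simply swaps and restores the working directory):
import contextlib

@contextlib.contextmanager
def chdir_sketch(path):
    prev_dir = os.getcwd()
    os.chdir(path)
    try:
        yield
    finally:
        os.chdir(prev_dir)  # restore even if the body raises
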
def test_concat(self, dataset):
    s3 = pytest.importorskip("boto3").client('s3')

    bucket1 = "verta-starter"
    key1 = "models/model.pkl"
    bucket2 = "verta-versioned-bucket"
    key2 = "tiny-files/tiny2.bin"

    # create dir for reference files
    reference_dir = "reference"
    filepath1 = os.path.join(reference_dir, bucket1, key1)
    pathlib2.Path(filepath1).parent.mkdir(parents=True, exist_ok=True)
    filepath2 = os.path.join(reference_dir, bucket2, key2)
    pathlib2.Path(filepath2).parent.mkdir(parents=True, exist_ok=True)

    # download files directly from S3 for reference
    s3.download_file(bucket1, key1, filepath1)
    s3.download_file(bucket2, key2, filepath2)

    # create and concatenate datasets
    dataset1 = verta.dataset.S3(
        "s3://{}/{}".format(bucket1, key1),
        enable_mdb_versioning=True,
    )
    dataset2 = verta.dataset.S3(
        "s3://{}/{}".format(bucket2, key2),
        enable_mdb_versioning=True,
    )
    dataset_blob = dataset1 + dataset2
    dataset_blob = dataset.create_version(dataset_blob).get_content()

    dirpath = dataset_blob.download()
    assert_dirs_match(dirpath, reference_dir)

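# NOTE: `assert_dirs_match` above is a shared helper defined elsewhere in the
# test suite. A minimal sketch of what it is expected to check (hypothetical
# implementation: same relative filepaths, byte-identical contents):
def assert_dirs_match_sketch(dirpath, reference_dir):
    def relative_filepaths(dirname):
        return {
            os.path.relpath(os.path.join(root, name), dirname)
            for root, _, names in os.walk(dirname)
            for name in names
        }

    relpaths = relative_filepaths(dirpath)
    assert relpaths == relative_filepaths(reference_dir)
    for relpath in relpaths:  # compare contents byte-for-byte
        with open(os.path.join(dirpath, relpath), 'rb') as f:
            with open(os.path.join(reference_dir, relpath), 'rb') as reference_f:
                assert f.read() == reference_f.read()
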
def test_mngd_ver_folder(self, dataset):
    reference_dir = "reference/"
    dirname = "tiny-files/"
    os.mkdir(dirname)
    for filename in ["tiny{}.bin".format(i) for i in range(3)]:
        with open(os.path.join(dirname, filename), 'wb') as f:
            f.write(os.urandom(2**16))

    blob_path = "data"
    dataset_blob = verta.dataset.Path(dirname, enable_mdb_versioning=True)
    dataset_blob = dataset.create_version(dataset_blob).get_content()

    shutil.move(dirname, reference_dir)  # move sources to avoid collision

    # download to implicit path
    dirpath = dataset_blob.download(dirname)
    assert os.path.isdir(dirpath)
    assert dirpath == os.path.abspath(dirname)
    assert_dirs_match(dirpath, reference_dir)

    # download to implicit path without collision
    dirpath2 = dataset_blob.download(dirname)
    assert os.path.isdir(dirpath2)
    assert dirpath2 != dirpath
    assert_dirs_match(dirpath2, reference_dir)

    # download to explicit path with overwrite
    last_updated = os.path.getmtime(dirpath)
    dirpath3 = dataset_blob.download(dirname, dirpath)
    assert dirpath3 == dirpath
    assert_dirs_match(dirpath3, reference_dir)
    assert os.path.getmtime(dirpath) > last_updated

def test_base_path(self, dataset):
    reference_dir = "tiny-files/"
    os.mkdir(reference_dir)
    # three .file files in tiny-files/
    for filename in ["tiny{}.file".format(i) for i in range(3)]:
        with open(os.path.join(reference_dir, filename), 'wb') as f:
            f.write(os.urandom(2**16))
    sub_dir = "bin/"
    os.mkdir(os.path.join(reference_dir, sub_dir))
    # three .bin files in tiny-files/bin/
    for filename in ["tiny{}.bin".format(i) for i in range(3)]:
        with open(os.path.join(reference_dir, sub_dir, filename), 'wb') as f:
            f.write(os.urandom(2**16))

    # log & get dataset blob
    blob_path = "data"
    dataset_blob = verta.dataset.Path(
        reference_dir,
        base_path=reference_dir,
        enable_mdb_versioning=True,
    )
    dataset_blob = dataset.create_version(dataset_blob).get_content()

    # `reference_dir` was dropped as base path, so KeyError
    with pytest.raises(KeyError):
        dataset_blob.download(reference_dir)

    dirpath = dataset_blob.download()
    assert os.path.abspath(dirpath) != os.path.abspath(reference_dir)
    assert_dirs_match(dirpath, reference_dir)

def test_mngd_ver_file(self, dataset):
    filename = "tiny1.bin"
    FILE_CONTENTS = os.urandom(2**16)
    with open(filename, 'wb') as f:
        f.write(FILE_CONTENTS)

    blob_path = "data"
    dataset_blob = verta.dataset.Path(filename, enable_mdb_versioning=True)
    dataset_blob = dataset.create_version(dataset_blob).get_content()

    os.remove(filename)  # delete for first download test

    # download to implicit path
    filepath = dataset_blob.download(filename)
    assert os.path.isfile(filepath)
    assert filepath == os.path.abspath(filename)
    with open(filepath, 'rb') as f:
        assert f.read() == FILE_CONTENTS

    # download to implicit path without collision
    filepath2 = dataset_blob.download(filename)
    assert os.path.isfile(filepath2)
    assert filepath2 != filepath
    with open(filepath2, 'rb') as f:
        assert f.read() == FILE_CONTENTS

    # download to explicit path with overwrite
    last_updated = os.path.getmtime(filepath)
    filepath3 = dataset_blob.download(filename, filepath)
    assert filepath3 == filepath
    with open(filepath3, 'rb') as f:
        assert f.read() == FILE_CONTENTS
    assert os.path.getmtime(filepath) > last_updated

def test_download_all(self, dataset):
    s3 = pytest.importorskip("boto3").client('s3')

    bucket = "verta-versioned-bucket"
    dirname = "tiny-files/"
    s3_folder = "s3://{}/{}".format(bucket, dirname)

    # get files' contents directly from S3 for reference
    reference_dir = "reference/"
    for s3_obj in s3.list_objects_v2(Bucket=bucket, Prefix=dirname)['Contents']:
        key = s3_obj['Key']
        filepath = os.path.join(reference_dir, bucket, key)
        pathlib2.Path(filepath).parent.mkdir(parents=True, exist_ok=True)  # create parent dirs
        s3.download_file(bucket, key, filepath)

    # log & get dataset blob
    dataset_blob = verta.dataset.S3(s3_folder, enable_mdb_versioning=True)
    dataset_blob = dataset.create_version(dataset_blob).get_content()

    dirpath = dataset_blob.download()
    assert dirpath == os.path.abspath(_dataset.DEFAULT_DOWNLOAD_DIR)

    assert os.path.isdir(dirpath)
    assert_dirs_match(dirpath, reference_dir)

def test_creation_from_scratch(self, client, created_datasets):
    dataset = client.set_dataset(type="local")
    created_datasets.append(dataset)
    version = dataset.create_version(__file__)
    assert version._dataset_type == _DatasetService.DatasetTypeEnum.PATH
    assert version.id

def test_filesystem_dataset_version_creation(self, client, created_entities):
    dir_name, _ = self.create_dir_with_files(num_files=3)
    dataset = client.set_dataset(type="local")
    created_entities.append(dataset)

    dataset_version = dataset.create_version(dir_name)
    assert len(dataset_version.dataset_version_info.dataset_part_infos) == 3

    shutil.rmtree(dir_name)

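# NOTE: `create_dir_with_files` above is a helper on the test class, not shown
# in this section. A hypothetical sketch of what it might do (names and file
# sizes are assumptions):
def create_dir_with_files_sketch(num_files=10):
    import uuid
    dir_name = "dir-{}".format(uuid.uuid4())  # unique name to avoid collisions
    os.mkdir(dir_name)
    file_names = ["file{}.bin".format(i) for i in range(num_files)]
    for file_name in file_names:
        with open(os.path.join(dir_name, file_name), 'wb') as f:
            f.write(os.urandom(2**10))
    return dir_name, file_names
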
def test_log_dataset_version_diff_workspaces(self, client, organization, created_entities, experiment_run):
    dataset = client.set_dataset(type="local", workspace=organization.name)
    created_entities.append(dataset)
    dataset_version = dataset.create_version(__file__)

    experiment_run.log_dataset_version('train', dataset_version)

    retrieved_dataset_version = experiment_run.get_dataset_version('train')
    assert retrieved_dataset_version.id == dataset_version.id

def test_creation_by_id(self, client, created_entities):
    dataset = client.set_dataset(type="local")
    created_entities.append(dataset)
    version = dataset.create_version(__file__)
    assert version.id

    same_version = client.get_dataset_version(id=version.id)
    assert version.id == same_version.id

def test_get_versions(self, client, created_entities):
    dataset = client.set_dataset(type="local")
    created_entities.append(dataset)
    version1 = dataset.create_version(path=__file__)
    assert version1.id
    version2 = dataset.create_version(path=pytest.__file__)
    assert version2.id

    versions = dataset.get_all_versions()
    assert len(versions) == 2

    dataset_version1 = client.get_dataset_version(id=version1.id)
    assert dataset_version1.id == version1.id

    version = dataset.get_latest_version(ascending=True)
    assert version.id == version1.id

def test_get_latest_printing(self, client, created_entities, capsys):
    dataset = client.set_dataset(type="local")
    created_entities.append(dataset)
    version = dataset.create_version(path=__file__)

    dataset.get_latest_version(ascending=True)
    captured = capsys.readouterr()
    assert "got existing dataset version: {}".format(version.id) in captured.out

def test_creation_by_id(self, client, created_datasets):
    dataset = client.set_dataset(type="local")
    created_datasets.append(dataset)
    version = dataset.create_version(__file__)
    assert version._dataset_type == _DatasetService.DatasetTypeEnum.PATH
    assert version.id

    same_version = client.get_dataset_version(id=version.id)
    assert version.id == same_version.id

def test_log_dataset_version(self, client, created_entities, experiment_run):
    dataset = client.set_dataset(type="local")
    created_entities.append(dataset)
    dataset_version = dataset.create_version(__file__)

    experiment_run.log_dataset_version('train', dataset_version)

    retrieved_dataset_version = experiment_run.get_dataset_version('train')
    path = retrieved_dataset_version.dataset_version.path_dataset_version_info.base_path
    assert path.endswith(__file__)

def test_rdbms_version_creation(self, client, created_entities):
    dataset = client.set_dataset(type="postgres")
    created_entities.append(dataset)

    dataset_version = dataset.create_version(
        query="SELECT * FROM ner-table",
        db_connection_str="localhost:6543",
        num_records=100,
    )
    assert dataset_version.dataset_version_info.query == "SELECT * FROM ner-table"
    assert dataset_version.dataset_version_info.data_source_uri == "localhost:6543"
    assert dataset_version.dataset_version_info.num_records == 100

def test_s3_dataset_version_creation(self, client, s3_bucket, created_entities):
    botocore = pytest.importorskip("botocore")

    try:
        dataset = client.set_dataset(type="s3")
        created_entities.append(dataset)
        dataset_version = dataset.create_version(s3_bucket)
        assert len(dataset_version.dataset_version_info.dataset_part_infos) >= 1
    except botocore.exceptions.ClientError:
        pytest.skip("insufficient AWS credentials")

def test_tags_is_list_of_str(self, client, created_entities, tags):
    dataset = client.set_dataset(tags=tags)
    created_entities.append(dataset)
    version = dataset.create_version("conftest.py", tags=tags)

    endpoint = "{}://{}/api/v1/modeldb/dataset-version/getDatasetVersionTags".format(
        client._conn.scheme,
        client._conn.socket,
    )
    response = verta._internal_utils._utils.make_request("GET", endpoint, client._conn, params={'id': version.id})
    verta._internal_utils._utils.raise_for_http_error(response)
    # compare against the `tags` fixture the version was created with
    assert response.json().get('tags', []) == tags

def test_log_dataset_version(self, client, created_datasets, experiment_run):
    dataset = client.set_dataset(type="local")
    created_datasets.append(dataset)
    assert dataset._dataset_type == _DatasetService.DatasetTypeEnum.PATH
    dataset_version = dataset.create_version(__file__)

    experiment_run.log_dataset_version('train', dataset_version)

    retrieved_dataset_version = experiment_run.get_dataset_version('train')
    path = retrieved_dataset_version.dataset_version.path_dataset_version_info.base_path
    assert path.endswith(__file__)

def test_log_dataset_version_diff_workspaces_no_access_error(
        self, client_2, created_entities, experiment_run):
    dataset = client_2.set_dataset(type="local")
    created_entities.append(dataset)
    dataset_version = dataset.create_version(__file__)

    with pytest.raises(requests.HTTPError) as excinfo:
        experiment_run.log_dataset_version('train', dataset_version)

    excinfo_value = str(excinfo.value).strip()
    assert "403" in excinfo_value

def test_local_file(self, client, created_entities):
    filepath = "conftest.py"
    dataset = client.set_dataset(type="local")
    created_entities.append(dataset)
    version = dataset.create_version(filepath)
    retrieved = dataset.get_latest_version()
    assert version.id == retrieved.id  # of course, but just to be sure

    base_path = os.path.abspath(filepath)
    self.assert_base_path(version, base_path)
    self.assert_base_path(retrieved, base_path)

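# NOTE: `assert_base_path` above is a helper on the test class, not shown in
# this section. A hypothetical sketch, assuming it compares the version's
# recorded base path (via the proto field used elsewhere in this file) against
# the expected value:
def assert_base_path_sketch(version, expected_base_path):
    base_path = version.dataset_version.path_dataset_version_info.base_path
    assert base_path == expected_base_path
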
def test_not_to_s3_dir(self, dataset):
    """If the user specifies "s3://", things shouldn't go into an "s3:" dir."""
    bucket = "verta-versioned-bucket"
    dirname = "tiny-files/"
    s3_folder = "s3://{}/{}".format(bucket, dirname)
    blob_path = "data"

    # log & get dataset blob
    dataset_blob = verta.dataset.S3(s3_folder, enable_mdb_versioning=True)
    dataset_blob = dataset.create_version(dataset_blob).get_content()

    dirpath = dataset_blob.download("s3://")
    assert "s3:" not in pathlib2.Path(dirpath).parts

def test_local_dir(self, client, created_datasets):
    dirpath = "."
    dataset = client.set_dataset(type="local")
    created_datasets.append(dataset)
    version = dataset.create_version(dirpath)
    retrieved = dataset.get_latest_version()
    assert version.id == retrieved.id  # of course, but just to be sure

    base_path = os.path.abspath(dirpath)
    self.assert_base_path(version, base_path)
    self.assert_base_path(retrieved, base_path)

def test_s3_bucket(self, client, created_entities):
    bucket_name = "verta-starter"
    botocore = pytest.importorskip("botocore")

    try:
        dataset = client.set_dataset(type="s3")
        created_entities.append(dataset)
        version = dataset.create_version(bucket_name)
    except botocore.exceptions.ClientError:
        pytest.skip("insufficient AWS credentials")

    retrieved = dataset.get_latest_version()
    assert version.id == retrieved.id  # of course, but just to be sure

    self.assert_base_path(version, bucket_name)
    self.assert_base_path(retrieved, bucket_name)

def test_big_query_dataset_version_creation(self, client, bq_query, bq_location, created_entities):
    google = pytest.importorskip("google")
    bigquery = pytest.importorskip("google.cloud.bigquery")

    try:
        query_job = bigquery.Client().query(
            bq_query,
            # Location must match that of the dataset(s) referenced in the query.
            location=bq_location,
        )
        dataset = client.set_dataset(type="big query")
        created_entities.append(dataset)
        dataset_version = dataset.create_version(job_id=query_job.job_id, location=bq_location)
        assert dataset_version.dataset_version_info.query == bq_query
    except google.auth.exceptions.GoogleAuthError:
        pytest.skip("insufficient GCP credentials")

def test_mngd_ver_folder(self, dataset):
    s3 = pytest.importorskip("boto3").client('s3')

    bucket = "verta-versioned-bucket"
    dirname = "tiny-files/"
    s3_folder = "s3://{}/{}".format(bucket, dirname)
    blob_path = "data"

    # get files' contents directly from S3 for reference
    reference_dir = "reference/"
    for s3_obj in s3.list_objects_v2(Bucket=bucket, Prefix=dirname)['Contents']:
        key = s3_obj['Key']
        filepath = os.path.join(reference_dir, key)
        pathlib2.Path(filepath).parent.mkdir(parents=True, exist_ok=True)  # create parent dirs
        s3.download_file(bucket, key, filepath)

    # Since we're retrieving files with the S3 prefix `dirname`, the downloaded filetree won't
    # start with `dirname`, so we have to go deeper for `reference_dir` to account for that.
    reference_dir = os.path.join(reference_dir, dirname)

    # log & get dataset blob
    dataset_blob = verta.dataset.S3(s3_folder, enable_mdb_versioning=True)
    dataset_blob = dataset.create_version(dataset_blob).get_content()

    # download to implicit path
    dirpath = dataset_blob.download(s3_folder)
    assert os.path.isdir(dirpath)
    assert dirpath == os.path.abspath(dirname)
    assert_dirs_match(dirpath, reference_dir)

    # download to implicit path without collision
    dirpath2 = dataset_blob.download(s3_folder)
    assert os.path.isdir(dirpath2)
    assert dirpath2 != dirpath
    assert_dirs_match(dirpath2, reference_dir)

    # download to explicit path with overwrite
    last_updated = os.path.getmtime(dirpath)
    dirpath3 = dataset_blob.download(s3_folder, dirpath)
    assert dirpath3 == dirpath
    assert_dirs_match(dirpath3, reference_dir)
    assert os.path.getmtime(dirpath) > last_updated

def test_download_all(self, dataset):
    reference_dir = "tiny-files/"
    os.mkdir(reference_dir)
    for filename in ["tiny{}.bin".format(i) for i in range(3)]:
        with open(os.path.join(reference_dir, filename), 'wb') as f:
            f.write(os.urandom(2**16))

    # log & get dataset blob
    blob_path = "data"
    dataset_blob = verta.dataset.Path(reference_dir, enable_mdb_versioning=True)
    dataset_blob = dataset.create_version(dataset_blob).get_content()

    dirpath = dataset_blob.download()
    assert dirpath == os.path.abspath(_dataset.DEFAULT_DOWNLOAD_DIR)

    # uploaded filetree was recreated within `DEFAULT_DOWNLOAD_DIR`
    destination_dir = os.path.join(_dataset.DEFAULT_DOWNLOAD_DIR, reference_dir)
    assert os.path.isdir(destination_dir)
    assert_dirs_match(destination_dir, reference_dir)

def test_mngd_ver_file(self, dataset):
    s3 = pytest.importorskip("boto3").client('s3')

    filename = "tiny1.bin"
    bucket = "verta-versioned-bucket"
    key = "tiny-files/{}".format(filename)
    s3_key = "s3://{}/{}".format(bucket, key)
    blob_path = "data"

    # get file contents directly from S3 for reference
    s3.download_file(bucket, key, filename)
    with open(filename, 'rb') as f:
        FILE_CONTENTS = f.read()
    os.remove(filename)

    # log & get dataset blob
    dataset_blob = verta.dataset.S3(s3_key, enable_mdb_versioning=True)
    dataset_blob = dataset.create_version(dataset_blob).get_content()

    # download to implicit path
    filepath = dataset_blob.download(s3_key)
    assert os.path.isfile(filepath)
    assert filepath == os.path.abspath(filename)
    with open(filepath, 'rb') as f:
        assert f.read() == FILE_CONTENTS

    # download to implicit path without collision
    filepath2 = dataset_blob.download(s3_key)
    assert os.path.isfile(filepath2)
    assert filepath2 != filepath
    with open(filepath2, 'rb') as f:
        assert f.read() == FILE_CONTENTS

    # download to explicit path with overwrite
    last_updated = os.path.getmtime(filepath)
    filepath3 = dataset_blob.download(s3_key, filepath)
    assert filepath3 == filepath
    with open(filepath3, 'rb') as f:
        assert f.read() == FILE_CONTENTS
    assert os.path.getmtime(filepath) > last_updated

def test_concat(self, dataset):
    reference_dir = "tiny-files/"
    os.mkdir(reference_dir)
    # two .file files in tiny-files/
    for filename in ["tiny{}.file".format(i) for i in range(2)]:
        with open(os.path.join(reference_dir, filename), 'wb') as f:
            f.write(os.urandom(2**16))

    # create and concatenate datasets
    dataset1 = verta.dataset.Path(
        "tiny-files/tiny0.file",
        enable_mdb_versioning=True,
    )
    dataset2 = verta.dataset.Path(
        "tiny-files/tiny1.file",
        enable_mdb_versioning=True,
    )
    dataset_blob = dataset1 + dataset2
    dataset_blob = dataset.create_version(dataset_blob).get_content()

    dirpath = dataset_blob.download()
    dirpath = os.path.join(dirpath, reference_dir)  # "tiny-files/" nested in new dir
    assert_dirs_match(dirpath, reference_dir)