def test_dirpath(self):
    """A `Path` built from ``modelapi_hypothesis/`` lists several populated components."""
    blob = verta.dataset.Path("modelapi_hypothesis/")

    assert len(blob.list_components()) > 1

    for entry in blob.list_components():
        # each metadata field must differ from its empty/zero placeholder
        assert entry.path != ""
        assert entry.size != 0
        assert entry.last_modified != 0
        assert entry.md5 != ""
def test_s3_bucket(self):
    """An `S3` dataset built from a bucket URL lists several populated components."""
    # pylint: disable=no-member
    blob = verta.dataset.S3("s3://verta-starter")

    assert len(blob.list_components()) > 1

    for entry in blob.list_components():
        # each metadata field must differ from its empty/zero placeholder
        assert entry.path != ""
        assert entry.size != 0
        assert entry.last_modified != 0
        assert entry.md5 != ""
def test_filepath(self):
    """A `Path` built from one file lists exactly one populated component."""
    blob = verta.dataset.Path("modelapi_hypothesis/api_generator.py")

    assert len(blob.list_components()) == 1

    entry = blob.list_components()[0]
    # each metadata field must differ from its empty/zero placeholder
    assert entry.path != ""
    assert entry.size != 0
    assert entry.last_modified != 0
    assert entry.md5 != ""
def test_s3_key(self):
    """An `S3` dataset built from one object URL lists exactly one populated component."""
    # pylint: disable=no-member
    blob = verta.dataset.S3("s3://verta-starter/census-test.csv")

    assert len(blob.list_components()) == 1

    entry = blob.list_components()[0]
    # each metadata field must differ from its empty/zero placeholder
    assert entry.path != ""
    assert entry.size != 0
    assert entry.last_modified != 0
    assert entry.md5 != ""
def test_multiple_filepaths(self):
    """A `Path` built from a list of two files lists two populated components."""
    file_paths = [
        "modelapi_hypothesis/api_generator.py",
        "modelapi_hypothesis/test_modelapi.py",
    ]
    blob = verta.dataset.Path(file_paths)

    assert len(blob.list_components()) == 2

    for entry in blob.list_components():
        # each metadata field must differ from its empty/zero placeholder
        assert entry.path != ""
        assert entry.size != 0
        assert entry.last_modified != 0
        assert entry.md5 != ""
def test_versioned_object(self):
    """An `S3` dataset for one object records the version ID reported by S3.

    Skipped when ``boto3`` cannot be imported.
    """
    s3_client = pytest.importorskip("boto3").client('s3')

    bucket_name = "verta-versioned-bucket"
    object_key = "data/census-train.csv"

    # ask S3 directly for the object's version ID to compare against
    head = s3_client.head_object(Bucket=bucket_name, Key=object_key)
    expected_version_id = head['VersionId']

    blob = verta.dataset.S3("s3://{}/{}".format(bucket_name, object_key))

    assert len(blob.list_components()) == 1
    assert blob.list_components()[0].s3_version_id == expected_version_id
def test_s3_multiple_keys(self):
    """An `S3` dataset built from a list of two object URLs lists two populated components."""
    # pylint: disable=no-member
    object_urls = [
        "s3://verta-starter/census-test.csv",
        "s3://verta-starter/census-train.csv",
    ]
    blob = verta.dataset.S3(object_urls)

    assert len(blob.list_components()) == 2

    for entry in blob.list_components():
        # each metadata field must differ from its empty/zero placeholder
        assert entry.path != ""
        assert entry.size != 0
        assert entry.last_modified != 0
        assert entry.md5 != ""
def test_versioned_folder(self):
    """Components under a versioned S3 folder carry each object's latest version ID.

    The expected version IDs are read from S3 via ``list_object_versions``
    and keyed by the path format in ``verta.dataset.S3._S3_PATH``. Each
    component must match its expected version ID, must not have a path ending
    in ``/``, and must have a non-zero size.

    Skipped when ``boto3`` cannot be imported.
    """
    s3 = pytest.importorskip("boto3").client('s3')
    S3_PATH = verta.dataset.S3._S3_PATH

    bucket = "verta-versioned-bucket"
    folder = "data/"
    s3_url = "s3://{}/{}".format(bucket, folder)

    # collect latest versions of objects with folder as prefix;
    # S3 returns "null" in its API, but we handle that as empty string
    listing = s3.list_object_versions(Bucket=bucket, Prefix=folder)
    version_ids = {
        S3_PATH.format(bucket, obj['Key']):
            "" if obj['VersionId'] == "null" else obj['VersionId']
        for obj in listing['Versions']
        if obj['IsLatest']
    }

    dataset = verta.dataset.S3(s3_url)
    components = dataset.list_components()

    # without this guard, the per-component checks below would pass
    # vacuously if no components were found
    assert components, "no components found under {}".format(s3_url)

    for component in components:
        assert component.s3_version_id == version_ids[component.path]
        assert not component.path.endswith('/')
        assert component.size != 0
def test_concat(self):
    """Adding two `Path` datasets yields their combined components, sorted by path."""
    left = verta.dataset.Path("modelapi_hypothesis/")
    right = verta.dataset.Path("versioning/")

    # expected result: both component lists joined, then ordered by path
    expected = sorted(
        left.list_components() + right.list_components(),
        key=lambda item: item.path,
    )

    combined = left + right
    assert combined.list_components() == expected

    # commutative
    combined = right + left
    assert combined.list_components() == expected

    # assignment
    left += right
    assert left.list_components() == expected
def test_concat(self):
    """Adding two `S3` datasets yields their combined components, sorted by path."""
    left = verta.dataset.S3("s3://verta-starter/")
    right = verta.dataset.S3("s3://verta-versioned-bucket/")

    # expected result: both component lists joined, then ordered by path
    expected = sorted(
        left.list_components() + right.list_components(),
        key=lambda item: item.path,
    )

    combined = left + right
    assert combined.list_components() == expected

    # commutative
    combined = right + left
    assert combined.list_components() == expected

    # assignment
    left += right
    assert left.list_components() == expected
def test_versioned_object_by_id(self):
    """An `S3` dataset built from an `S3Location` with a version ID records that ID.

    A non-latest version of the object is looked up through
    ``list_object_versions`` and passed to ``S3Location``; the resulting
    dataset must contain exactly one component with that version ID.

    Skipped when ``boto3`` cannot be imported.
    """
    s3 = pytest.importorskip("boto3").client('s3')

    bucket = "verta-versioned-bucket"
    key = "data/census-train.csv"
    s3_url = "s3://{}/{}".format(bucket, key)

    # pick a version that's not the latest; Prefix restricts the listing to
    # keys starting with `key`, so this object's versions are not pushed out
    # of the (size-limited) response by other objects in the bucket
    versions = s3.list_object_versions(Bucket=bucket, Prefix=key)['Versions']
    version_ids = [
        obj['VersionId']
        for obj in versions
        # Prefix may also match longer keys, so still compare exactly
        if not obj['IsLatest'] and obj['Key'] == key
    ]
    # fail with a clear message rather than a bare IndexError below
    assert version_ids, \
        "no non-latest version of {} found in bucket {}".format(key, bucket)
    version_id = version_ids[0]

    s3_loc = verta.dataset._s3.S3Location(s3_url, version_id)
    dataset = verta.dataset.S3(s3_loc)

    assert len(dataset.list_components()) == 1
    assert dataset.list_components()[0].s3_version_id == version_id
def test_add(self):
    """`Path.add()` gives the same components as combining two single-file datasets."""
    first_path = "versioning/test_code.py"
    second_path = "versioning/test_dataset.py"

    blob = verta.dataset.Path(first_path)
    blob.add(second_path)

    # as if we had added two separate blobs together
    first_blob = verta.dataset.Path(first_path)
    second_blob = verta.dataset.Path(second_path)
    expected = sorted(
        first_blob.list_components() + second_blob.list_components(),
        key=lambda item: item.path,
    )

    assert blob.list_components() == expected
def test_concat_base_path(self):
    """Adding two `Path` datasets created with `base_path` combines their components."""
    left = verta.dataset.Path(
        "modelapi_hypothesis/",
        base_path="modelapi_hypothesis/",
    )
    right = verta.dataset.Path(
        "versioning/",
        base_path="versioning/",
    )

    # expected result: both component lists joined, then ordered by path
    expected = sorted(
        left.list_components() + right.list_components(),
        key=lambda item: item.path,
    )

    combined = left + right
    assert combined.list_components() == expected
def test_add(self):
    """`S3.add()` gives the same components as combining two single-object datasets."""
    first_url = "s3://verta-starter/census-train.csv"
    second_url = "s3://verta-starter/census-test.csv"

    blob = verta.dataset.S3(first_url)
    blob.add(second_url)

    # as if we had added two separate blobs together
    first_blob = verta.dataset.S3(first_url)
    second_blob = verta.dataset.S3(second_url)
    expected = sorted(
        first_blob.list_components() + second_blob.list_components(),
        key=lambda item: item.path,
    )

    assert blob.list_components() == expected
def test_versioned_bucket(self):
    """Components of a versioned S3 bucket carry each object's latest version ID.

    The expected version IDs are read from S3 via ``list_object_versions``
    and keyed by the path format in ``verta.dataset.S3._S3_PATH``.

    Skipped when ``boto3`` cannot be imported.
    """
    s3 = pytest.importorskip("boto3").client('s3')
    S3_PATH = verta.dataset.S3._S3_PATH

    bucket = "verta-versioned-bucket"
    s3_url = "s3://{}".format(bucket)

    # collect latest versions of objects;
    # S3 returns "null" in its API, but we handle that as empty string
    listing = s3.list_object_versions(Bucket=bucket)
    version_ids = {
        S3_PATH.format(bucket, obj['Key']):
            "" if obj['VersionId'] == "null" else obj['VersionId']
        for obj in listing['Versions']
        if obj['IsLatest']
    }

    dataset = verta.dataset.S3(s3_url)
    components = dataset.list_components()

    # without this guard, the per-component check below would pass
    # vacuously if no components were found
    assert components, "no components found in {}".format(s3_url)

    for component in components:
        assert component.s3_version_id == version_ids[component.path]