def test_validate_dataset_spec_hash(self):
    """Check validate_spec_hash against a valid sample spec and malformed variants.

    The spec is mutated cumulatively within each group of cases and rebuilt
    with get_sample_spec() whenever a pristine spec is needed again.
    """
    # Success case
    spec = get_sample_spec('somebucket')
    self.assertTrue(validate_spec_hash(spec))

    # None or empty cases
    self.assertFalse(validate_spec_hash(None))
    self.assertFalse(validate_spec_hash({}))

    # Non-integer version
    spec['dataset']['version'] = 'string'
    self.assertFalse(validate_spec_hash(spec))

    # Missing version
    spec['dataset'].pop('version')
    self.assertFalse(validate_spec_hash(spec))

    # Missing dataset
    spec.pop('dataset')
    self.assertFalse(validate_spec_hash(spec))

    # Empty category list
    spec = get_sample_spec('somebucket')
    spec['dataset']['categories'] = {}
    self.assertFalse(validate_spec_hash(spec))

    # Missing categories
    spec['dataset'].pop('categories')
    self.assertFalse(validate_spec_hash(spec))

    # Missing store
    spec = get_sample_spec('somebucket')
    spec['dataset']['manifest'].pop('store')
    self.assertFalse(validate_spec_hash(spec))

    # Missing manifest (this case previously mutated the spec without
    # asserting anything, so it was never actually verified)
    spec['dataset'].pop('manifest')
    self.assertFalse(validate_spec_hash(spec))

    # Bad bucket URL format
    spec = get_sample_spec('somebucket')
    spec['dataset']['manifest']['store'] = 'invalid'
    self.assertFalse(validate_spec_hash(spec))

    # Missing and empty dataset name
    spec = get_sample_spec('somebucket')
    spec['dataset']['name'] = ''
    self.assertFalse(validate_spec_hash(spec))
    spec['dataset'].pop('name')
    self.assertFalse(validate_spec_hash(spec))
def test_fetch(self):
    """Fetch a dataset through LocalRepository and compare the stored file names with ``hs``.

    A sample spec and a one-entry manifest are written under a temporary
    metadata directory, then ``fetch`` is invoked for spec
    'vision-computing__images__dataset-ex__5'. Every file name found under
    the object directory is collected and compared with the ``hs``
    collection defined outside this method.
    """
    metadata_dir = os.path.join(self.tmp_dir, 'metadata-test')
    bucket = os.getenv('MLGIT_TEST_BUCKET', 'ml-git-datasets')
    repo_config = get_sample_config_spec(bucket, testprofile, testregion)
    sample_spec = get_sample_spec(bucket)

    # Lay out the metadata tree: <metadata>/<categories...>/<dataset name>/
    spec_dir = os.path.join(metadata_dir, 'vision-computing', 'images', 'dataset-ex')
    ensure_path_exists(spec_dir)
    yaml_save(sample_spec, os.path.join(spec_dir, 'dataset-ex.spec'))

    manifest = {
        'zdj7WjdojNAZN53Wf29rPssZamfbC6MVerzcGwd9tNciMpsQh': {'imghires.jpg'}
    }
    yaml_save(manifest, os.path.join(spec_dir, 'MANIFEST.yaml'))

    objects_dir = os.path.join(self.tmp_dir, 'objects-test')
    repository = LocalRepository(repo_config, objects_dir)
    repository.fetch(metadata_dir, 'vision-computing__images__dataset-ex__5', None)

    # Gather the name of every file stored anywhere below the object directory.
    fetched = {
        filename
        for _root, _dirs, filenames in os.walk(objects_dir)
        for filename in filenames
    }

    self.assertEqual(len(hs), len(fetched))
    self.assertTrue(len(hs.difference(fetched)) == 0)
def test_remote_fsck(self):
    """Run remote_fsck after deleting an object from the bucket and check the object can be loaded again.

    Steps:
      1. Put HDATA_IMG_1 into a local MultihashFS store.
      2. Delete one object key from the test bucket and assert that
         ``self.check_delete`` raises ``ClientError``
         (presumably confirming the object is gone - verify against check_delete).
      3. Write a sample spec and manifest, then call ``remote_fsck``.
      4. Assert the call returns a truthy value and that ``load()`` on the
         previously deleted object returns None.
    """
    testbucketname = os.getenv('MLGIT_TEST_BUCKET', 'ml-git-datasets')
    hfspath = os.path.join(self.tmp_dir, 'objectsfs')
    ohfs = MultihashFS(hfspath)
    ohfs.put(HDATA_IMG_1)

    s3 = boto3.resource(
        's3',
        region_name='us-east-1',
        aws_access_key_id='fake_access_key',
        aws_secret_access_key='fake_secret_key',
    )

    # Key removed from the bucket before running fsck.
    # NOTE(review): presumably the key under which HDATA_IMG_1 is stored - confirm.
    object_key = 'zdj7WWsMkELZSGQGgpm5VieCWV8NxY5n5XEP73H4E7eeDMA3A'
    s3.Object(testbucketname, object_key).delete()
    self.assertRaises(botocore.exceptions.ClientError, self.check_delete, s3, testbucketname)

    mdpath = os.path.join(self.tmp_dir, 'metadata-test')
    dataset_spec = get_sample_spec(testbucketname)
    specpath = os.path.join(mdpath, 'vision-computing', 'images', 'dataset-ex')
    ensure_path_exists(specpath)
    yaml_save(dataset_spec, os.path.join(specpath, 'dataset-ex.spec'))

    manifestpath = os.path.join(specpath, 'MANIFEST.yaml')
    yaml_save({'zdj7WjdojNAZN53Wf29rPssZamfbC6MVerzcGwd9tNciMpsQh': {'imghires.jpg'}}, manifestpath)

    # A single join is enough. The previous nested join only produced the
    # right path when specpath was absolute; with a relative tmp_dir it
    # would have duplicated the directory prefix.
    fullspecpath = os.path.join(specpath, 'dataset-ex.spec')
    spec = 'vision-computing__images__dataset-ex__5'

    c = yaml_load('hdata/config.yaml')
    r = LocalRepository(c, hfspath)
    # NOTE(review): the meaning of the positional arguments (2, True, True)
    # is not visible here - confirm against LocalRepository.remote_fsck.
    ret = r.remote_fsck(mdpath, spec, fullspecpath, 2, True, True)
    self.assertTrue(ret)

    # load() raises if the object is absent, so a None return means it exists.
    self.assertIsNone(s3.Object(testbucketname, object_key).load())