Example #1
0
    def test_validate_dataset_spec_hash(self):
        """Check validate_spec_hash against the sample spec and broken variants.

        The unmodified sample spec must validate. Every other case mutates
        the spec in place and expects a falsy result; a fresh sample is
        fetched whenever a later case needs fields an earlier case removed.
        """
        # Success case
        spec = get_sample_spec('somebucket')
        self.assertTrue(validate_spec_hash(spec))

        # None or empty cases
        self.assertFalse(validate_spec_hash(None))
        self.assertFalse(validate_spec_hash({}))

        # Non-integer version
        spec['dataset']['version'] = 'string'
        self.assertFalse(validate_spec_hash(spec))

        # Missing version
        spec['dataset'].pop('version')
        self.assertFalse(validate_spec_hash(spec))

        # Missing dataset
        spec.pop('dataset')
        self.assertFalse(validate_spec_hash(spec))

        # Empty category list
        spec = get_sample_spec('somebucket')
        spec['dataset']['categories'] = {}
        self.assertFalse(validate_spec_hash(spec))

        # Missing categories
        spec['dataset'].pop('categories')
        self.assertFalse(validate_spec_hash(spec))

        # Missing store
        spec = get_sample_spec('somebucket')
        spec['dataset']['manifest'].pop('store')
        self.assertFalse(validate_spec_hash(spec))

        # Missing manifest
        spec['dataset'].pop('manifest')
        # This case previously mutated the spec without asserting anything,
        # so a missing manifest was never actually verified.
        self.assertFalse(validate_spec_hash(spec))

        # Bad bucket URL format
        spec = get_sample_spec('somebucket')
        spec['dataset']['manifest']['store'] = 'invalid'
        self.assertFalse(validate_spec_hash(spec))

        # Missing and empty dataset name
        spec = get_sample_spec('somebucket')
        spec['dataset']['name'] = ''
        self.assertFalse(validate_spec_hash(spec))
        spec['dataset'].pop('name')
        self.assertFalse(validate_spec_hash(spec))
Example #2
0
    def test_fetch(self):
        """Fetch a sample dataset and compare the stored file names with ``hs``.

        A spec file and a one-entry manifest are written under a temporary
        metadata tree, ``LocalRepository.fetch`` is run against it, and the
        names of all files found under the object directory are compared
        with the expected set ``hs``.
        """
        bucket = os.getenv('MLGIT_TEST_BUCKET', 'ml-git-datasets')
        metadata_root = os.path.join(self.tmp_dir, 'metadata-test')
        objects_root = os.path.join(self.tmp_dir, 'objects-test')

        repo_config = get_sample_config_spec(bucket, testprofile, testregion)
        sample_spec = get_sample_spec(bucket)

        # Metadata layout: <metadata>/vision-computing/images/dataset-ex/
        spec_dir = os.path.join(
            metadata_root, 'vision-computing', 'images', 'dataset-ex')
        ensure_path_exists(spec_dir)
        yaml_save(sample_spec, os.path.join(spec_dir, 'dataset-ex.spec'))

        manifest = {
            'zdj7WjdojNAZN53Wf29rPssZamfbC6MVerzcGwd9tNciMpsQh':
            {'imghires.jpg'}
        }
        yaml_save(manifest, os.path.join(spec_dir, 'MANIFEST.yaml'))

        repo = LocalRepository(repo_config, objects_root)
        repo.fetch(metadata_root, 'vision-computing__images__dataset-ex__5',
                   None)

        # Collect the bare names of every file below the object directory.
        stored = {
            name
            for _root, _dirs, names in os.walk(objects_root)
            for name in names
        }

        self.assertEqual(len(hs), len(stored))
        self.assertTrue(len(hs.difference(stored)) == 0)
Example #3
0
    def test_remote_fsck(self):
        """Run remote_fsck after deleting an object from the bucket.

        The object is stored locally through ``MultihashFS`` and deleted
        from the bucket, and ``check_delete`` is expected to raise
        ``ClientError``. After ``remote_fsck`` returns a truthy result,
        ``load()`` on the same key must return ``None`` instead of raising.
        """
        testbucketname = os.getenv('MLGIT_TEST_BUCKET', 'ml-git-datasets')
        hfspath = os.path.join(self.tmp_dir, 'objectsfs')
        ohfs = MultihashFS(hfspath)
        ohfs.put(HDATA_IMG_1)

        # NOTE(review): credentials are placeholders; presumably S3 is mocked
        # by the test fixture -- confirm against the class setup.
        s3 = boto3.resource(
            's3',
            region_name='us-east-1',
            aws_access_key_id='fake_access_key',
            aws_secret_access_key='fake_secret_key',
        )
        object_key = 'zdj7WWsMkELZSGQGgpm5VieCWV8NxY5n5XEP73H4E7eeDMA3A'

        s3.Object(testbucketname, object_key).delete()
        self.assertRaises(botocore.exceptions.ClientError,
                          self.check_delete, s3, testbucketname)
        mdpath = os.path.join(self.tmp_dir, 'metadata-test')

        dataset_spec = get_sample_spec(testbucketname)
        specpath = os.path.join(mdpath, 'vision-computing', 'images', 'dataset-ex')
        ensure_path_exists(specpath)

        # Build the spec file path once. The previous nested
        # os.path.join(specpath, os.path.join(specpath, ...)) was only correct
        # when specpath was absolute; a relative tmp_dir duplicated the prefix.
        fullspecpath = os.path.join(specpath, 'dataset-ex.spec')
        yaml_save(dataset_spec, fullspecpath)
        manifestpath = os.path.join(specpath, 'MANIFEST.yaml')

        yaml_save({'zdj7WjdojNAZN53Wf29rPssZamfbC6MVerzcGwd9tNciMpsQh': {'imghires.jpg'}}, manifestpath)
        spec = 'vision-computing__images__dataset-ex__5'
        c = yaml_load('hdata/config.yaml')
        r = LocalRepository(c, hfspath)
        ret = r.remote_fsck(mdpath, spec, fullspecpath, 2, True, True)
        self.assertTrue(ret)

        # load() raises if the object is absent, so a None return means the
        # key is present in the bucket again.
        self.assertIsNone(s3.Object(testbucketname, object_key).load())