Ejemplo n.º 1
0
    def test_get_update_links_wspace_with_duplicates(self):
        wspath = os.path.join(self.tmp_dir, 'wspace')

        hfspath = os.path.join(self.tmp_dir, 'objectsfs')
        ohfs = MultihashFS(hfspath)
        key = ohfs.put(HDATA_IMG_1)
        fidx = FullIndex(self.tmp_dir, self.tmp_dir)
        cachepath = os.path.join(self.tmp_dir, 'cachefs')
        cache = Cache(cachepath, '', '')

        testbucketname = os.getenv('MLGIT_TEST_BUCKET', 'ml-git-datasets')
        c = get_sample_config_spec(testbucketname, testprofile, testregion)

        r = LocalRepository(c, hfspath)
        r._update_cache(cache, key)

        mfiles = {}
        files = {DATA_IMG_1, DATA_IMG_2}
        r._update_links_wspace(cache, fidx, files, key, wspath, mfiles, Status.u.name, 'strict')

        wspace_file = os.path.join(wspath, DATA_IMG_1)
        self.assertTrue(os.path.exists(wspace_file))
        self.assertEqual(self.md5sum(HDATA_IMG_1), self.md5sum(wspace_file))

        wspace_file = os.path.join(wspath, DATA_IMG_2)
        self.assertTrue(os.path.exists(wspace_file))
        self.assertEqual(self.md5sum(HDATA_IMG_1), self.md5sum(wspace_file))
        st = os.stat(wspace_file)
        self.assertTrue(st.st_nlink == 3)
        self.assertEqual(mfiles, {DATA_IMG_1: 'zdj7WjdojNAZN53Wf29rPssZamfbC6MVerzcGwd9tNciMpsQh',
                                  DATA_IMG_2: 'zdj7WjdojNAZN53Wf29rPssZamfbC6MVerzcGwd9tNciMpsQh'})

        wspath = os.path.join(self.tmp_dir, 'wspace')
        ensure_path_exists(wspath)
        to_be_removed = os.path.join(wspath, 'to_be_removed')
        with open(to_be_removed, 'w') as f:
            f.write('DEAD\n')

        hfspath = os.path.join(self.tmp_dir, 'objectsfs')
        ohfs = MultihashFS(hfspath)
        key = ohfs.put(HDATA_IMG_1)
        fidx = FullIndex(self.tmp_dir, self.tmp_dir)
        cachepath = os.path.join(self.tmp_dir, 'cachefs')
        cache = Cache(cachepath, '', '')
        c = yaml_load('hdata/config.yaml')
        r = LocalRepository(c, hfspath)
        r._update_cache(cache, key)

        mfiles = {}
        files = {DATA_IMG_1, DATA_IMG_2}
        r._update_links_wspace(cache, fidx, files, key, wspath, mfiles, Status.u.name, 'strict')
        r._remove_unused_links_wspace(wspath, mfiles)
        self.assertFalse(os.path.exists(to_be_removed))
Ejemplo n.º 2
0
    def test_get_update_links_wspace(self):
        wspath = os.path.join(self.tmp_dir, 'wspace')

        hfspath = os.path.join(self.tmp_dir, 'objectsfs')
        ohfs = MultihashFS(hfspath)
        key = ohfs.put(HDATA_IMG_1)
        fidx = FullIndex(self.tmp_dir, self.tmp_dir)
        cachepath = os.path.join(self.tmp_dir, 'cachefs')
        cache = Cache(cachepath, '', '')

        testbucketname = os.getenv('MLGIT_TEST_BUCKET', 'ml-git-datasets')
        c = get_sample_config_spec(testbucketname, testprofile, testregion)

        r = LocalRepository(c, hfspath)
        r._update_cache(cache, key)

        mfiles = {}
        files = {DATA_IMG_1}
        r._update_links_wspace(cache, fidx, files, key, wspath, mfiles, Status.u.name, 'strict')

        wspace_file = os.path.join(wspath, DATA_IMG_1)
        set_write_read(wspace_file)
        self.assertTrue(os.path.exists(wspace_file))
        self.assertEqual(self.md5sum(HDATA_IMG_1), self.md5sum(wspace_file))
        st = os.stat(wspace_file)
        fi = fidx.get_index()
        for k, v in fi.items():
            self.assertEqual(k, os.path.join('data', 'imghires.jpg'))
            self.assertEqual(v['hash'], 'zdj7WjdojNAZN53Wf29rPssZamfbC6MVerzcGwd9tNciMpsQh')
            self.assertEqual(v['status'], 'u')
            self.assertEqual(v['ctime'], st.st_ctime)
            self.assertEqual(v['mtime'], st.st_mtime)
        self.assertTrue(st.st_nlink == 2)
        self.assertEqual(mfiles, {DATA_IMG_1: 'zdj7WjdojNAZN53Wf29rPssZamfbC6MVerzcGwd9tNciMpsQh'})
Ejemplo n.º 3
0
    def export(self, bucket, tag, retry):
        try:
            categories_path, spec_name, _ = spec_parse(tag)
            get_root_path()
            if not self._tag_exists(tag):
                return None, None
        except InvalidGitRepositoryError:
            log.error('You are not in an initialized ml-git repository.',
                      class_name=LOCAL_REPOSITORY_CLASS_NAME)
            return None, None
        except Exception as e:
            log.error(e, class_name=LOCAL_REPOSITORY_CLASS_NAME)
            return None, None

        try:
            self._checkout_ref(tag)
        except Exception:
            log.error('Unable to checkout to %s' % tag,
                      class_name=REPOSITORY_CLASS_NAME)
            return None, None

        local = LocalRepository(
            self.__config, get_objects_path(self.__config, self.__repo_type),
            self.__repo_type)
        local.export_tag(get_metadata_path(self.__config, self.__repo_type),
                         tag, bucket, retry)

        self._checkout_ref()
Ejemplo n.º 4
0
    def remote_fsck(self, spec, retries=2, thorough=False, paranoid=False):
        repo_type = self.__repo_type
        try:
            metadata_path = get_metadata_path(self.__config, repo_type)
            objects_path = get_objects_path(self.__config, repo_type)
            refs_path = get_refs_path(self.__config, repo_type)
            ref = Refs(refs_path, spec, repo_type)
            tag, sha = ref.branch()

            categories_path = get_path_with_categories(tag)

            self._checkout_ref(tag)
            spec_path, spec_file = search_spec_file(self.__repo_type, spec,
                                                    categories_path)

        except Exception as e:
            log.error(e, class_name=REPOSITORY_CLASS_NAME)
            return
        if spec_path is None:
            return

        full_spec_path = os.path.join(spec_path, spec_file)

        r = LocalRepository(self.__config, objects_path, repo_type)

        r.remote_fsck(metadata_path, tag, full_spec_path, retries, thorough,
                      paranoid)

        # ensure first we're on master !
        self._checkout_ref()
Ejemplo n.º 5
0
    def test_fetch(self):
        mdpath = os.path.join(self.tmp_dir, 'metadata-test')
        testbucketname = os.getenv('MLGIT_TEST_BUCKET', 'ml-git-datasets')
        config_spec = get_sample_config_spec(testbucketname, testprofile,
                                             testregion)
        dataset_spec = get_sample_spec(testbucketname)

        specpath = os.path.join(mdpath, 'vision-computing', 'images',
                                'dataset-ex')
        ensure_path_exists(specpath)
        yaml_save(dataset_spec, os.path.join(specpath, 'dataset-ex.spec'))

        manifestpath = os.path.join(specpath, 'MANIFEST.yaml')
        yaml_save(
            {
                'zdj7WjdojNAZN53Wf29rPssZamfbC6MVerzcGwd9tNciMpsQh':
                {'imghires.jpg'}
            }, manifestpath)

        objectpath = os.path.join(self.tmp_dir, 'objects-test')
        spec = 'vision-computing__images__dataset-ex__5'

        r = LocalRepository(config_spec, objectpath)
        r.fetch(mdpath, spec, None)

        fs = set()
        for root, dirs, files in os.walk(objectpath):
            for file in files:
                fs.add(file)

        self.assertEqual(len(hs), len(fs))
        self.assertTrue(len(hs.difference(fs)) == 0)
Ejemplo n.º 6
0
    def test_add_metrics_file(self):
        hashfs_path = os.path.join(self.tmp_dir, 'objectsfs')
        test_config = yaml_load('hdata/config.yaml')
        local_repo = LocalRepository(test_config,
                                     hashfs_path,
                                     repo_type=MODELS)
        spec_path = os.path.join(self.tmp_dir, 'model-ex.spec')
        shutil.copy('hdata/dataset-ex.spec', spec_path)
        spec_file = yaml_load(spec_path)
        model = spec_file[DATASET_SPEC_KEY].copy()
        del spec_file[DATASET_SPEC_KEY]
        spec_file[MODEL_SPEC_KEY] = model
        yaml_save(spec_file, spec_path)
        metrics_file_path = os.path.join(self.tmp_dir, 'metrics.csv')
        self.create_csv_file(metrics_file_path, {
            'metric_a': 10,
            'metric_b': 9
        })
        local_repo.add_metrics(spec_path, (), metrics_file_path)

        test_spec_file = yaml_load(spec_path)
        self.assertEqual(
            test_spec_file[MODEL_SPEC_KEY]['metrics'].get('metric_a', ''),
            10.0)
        self.assertEqual(
            test_spec_file[MODEL_SPEC_KEY]['metrics'].get('metric_b', ''), 9.0)
Ejemplo n.º 7
0
    def test_get_ipld(self):
        testbucketname = os.getenv('MLGIT_TEST_BUCKET', 'ml-git-datasets')
        hfspath = os.path.join(self.tmp_dir, 'objectsfs')

        s3 = boto3.resource(
            's3',
            region_name='eu-west-1',
            aws_access_key_id='fake_access_key',
            aws_secret_access_key='fake_secret_key',
        )

        keypath = 'zdj7WdjnTVfz5AhTavcpsDT62WiQo4AeQy6s4UC1BSEZYx4NP'
        file = os.path.join('hdata', keypath)

        with open(file, 'rb') as f:
            s3.Bucket(testbucketname).Object(keypath).put(file, Body=f)

        c = yaml_load('hdata/config.yaml')
        r = LocalRepository(c, hfspath)
        s3store = S3Store(testbucketname, bucket)

        links = {
            'Links': [{
                'Hash': 'zdj7WVyQ8wTdnDXsbg8wxwwFkt2Bzp95Tncsfg8PCgKXeLTye',
                'Size': 16822
            }]
        }

        self.assertEqual(links, r._get_ipld(s3store, keypath))
Ejemplo n.º 8
0
    def test_mount_blobs(self):
        testbucketname = os.getenv('MLGIT_TEST_BUCKET', 'ml-git-datasets')
        hfspath = os.path.join(self.tmp_dir, 'objectsfs')

        s3 = boto3.resource(
            's3',
            region_name='eu-west-1',
            aws_access_key_id='fake_access_key',
            aws_secret_access_key='fake_secret_key',
        )

        keypath = 'zdj7We7Je5MRECsZUF7uptseHHPY29zGoqFsVHw6sbgv1MbWS'
        file = os.path.join('hdata', keypath)

        with open(file, 'rb') as f:
            s3.Bucket(testbucketname).Object(keypath).put(file, Body=f)

        c = yaml_load('hdata/config.yaml')
        r = LocalRepository(c, hfspath)
        s3store = S3Store(testbucketname, bucket)

        links = {'Links': [{'Hash': keypath, 'Size': 16822}]}

        with open(file, 'rb') as f:
            self.assertEqual(f.read(), r._mount_blobs(s3store, links))
Ejemplo n.º 9
0
    def test_remote_fsck(self):
        testbucketname = os.getenv('MLGIT_TEST_BUCKET', 'ml-git-datasets')
        hfspath = os.path.join(self.tmp_dir, 'objectsfs')
        ohfs = MultihashFS(hfspath)
        ohfs.put(HDATA_IMG_1)

        s3 = boto3.resource(
            's3',
            region_name='us-east-1',
            aws_access_key_id='fake_access_key',
            aws_secret_access_key='fake_secret_key',
        )

        s3.Object(testbucketname, 'zdj7WWsMkELZSGQGgpm5VieCWV8NxY5n5XEP73H4E7eeDMA3A').delete()
        self.assertRaises(botocore.exceptions.ClientError, lambda: self.check_delete(s3, testbucketname))
        mdpath = os.path.join(self.tmp_dir, 'metadata-test')

        dataset_spec = get_sample_spec(testbucketname)
        specpath = os.path.join(mdpath, 'vision-computing', 'images', 'dataset-ex')
        ensure_path_exists(specpath)

        yaml_save(dataset_spec, os.path.join(specpath, 'dataset-ex.spec'))
        manifestpath = os.path.join(specpath, 'MANIFEST.yaml')

        yaml_save({'zdj7WjdojNAZN53Wf29rPssZamfbC6MVerzcGwd9tNciMpsQh': {'imghires.jpg'}}, manifestpath)
        fullspecpath = os.path.join(specpath, os.path.join(specpath, 'dataset-ex.spec'))
        spec = 'vision-computing__images__dataset-ex__5'
        c = yaml_load('hdata/config.yaml')
        r = LocalRepository(c, hfspath)
        ret = r.remote_fsck(mdpath, spec, fullspecpath, 2, True, True)
        self.assertTrue(ret)

        self.assertEqual(None, s3.Object(testbucketname, 'zdj7WWsMkELZSGQGgpm5VieCWV8NxY5n5XEP73H4E7eeDMA3A').load())
Ejemplo n.º 10
0
    def status(self, spec, full_option, status_directory):
        repo_type = self.__repo_type
        try:
            objects_path = get_objects_path(self.__config, repo_type)
            repo = LocalRepository(self.__config, objects_path, repo_type)
            log.info('%s: status of ml-git index for [%s]' % (repo_type, spec),
                     class_name=REPOSITORY_CLASS_NAME)
            new_files, deleted_files, untracked_files, corruped_files, changed_files = repo.status(
                spec, status_directory)
        except Exception as e:
            log.error(e, class_name=REPOSITORY_CLASS_NAME)
            return

        if new_files is not None and deleted_files is not None and untracked_files is not None:
            print('Changes to be committed:')
            self._print_files(new_files, full_option, 'New file: ')

            self._print_files(deleted_files, full_option, 'Deleted: ')

            print('\nUntracked files:')
            self._print_files(untracked_files, full_option)

            print('\nCorrupted files:')
            self._print_files(corruped_files, full_option)

            if changed_files and len(changed_files) > 0:
                print('\nChanges not staged for commit:')
                self._print_files(changed_files, full_option)
Ejemplo n.º 11
0
    def unlock_file(self, spec, file_path):
        repo_type = self.__repo_type

        if not validate_config_spec_hash(self.__config):
            log.error(
                '.ml-git/config.yaml invalid.  It should look something like this:\n%s'
                % get_yaml_str(
                    get_sample_config_spec('somebucket', 'someprofile',
                                           'someregion')),
                class_name=REPOSITORY_CLASS_NAME)
            return None

        path, file = None, None
        try:
            refs_path = get_refs_path(self.__config, repo_type)
            objects_path = get_objects_path(self.__config, repo_type)
            index_path = get_index_path(self.__config, repo_type)
            cache_path = get_cache_path(self.__config, repo_type)

            ref = Refs(refs_path, spec, repo_type)
            tag, sha = ref.branch()
            categories_path = get_path_with_categories(tag)

            path, file = search_spec_file(self.__repo_type, spec,
                                          categories_path)
        except Exception as e:
            log.error(e, class_name=REPOSITORY_CLASS_NAME)
            return

        if path is None:
            return

        spec_path = os.path.join(path, file)
        spec_file = yaml_load(spec_path)

        try:
            mutability = spec_file[repo_type]['mutability']
            if mutability not in Mutability.list():
                log.error('Invalid mutability type.',
                          class_name=REPOSITORY_CLASS_NAME)
                return
        except Exception:
            log.info(
                'The spec does not have the \'mutability\' property set. Default: strict.',
                class_name=REPOSITORY_CLASS_NAME)
            return

        if mutability != Mutability.STRICT.value:
            try:
                local = LocalRepository(self.__config, objects_path, repo_type)
                local.unlock_file(path, file_path, index_path, objects_path,
                                  spec, cache_path)
            except Exception as e:
                log.error(e, class_name=REPOSITORY_CLASS_NAME)
                return
        else:
            log.error(
                'You cannot use this command for this entity because mutability cannot be strict.',
                class_name=REPOSITORY_CLASS_NAME)
Ejemplo n.º 12
0
 def test_add_metrics_wrong_entity(self):
     hashfs_path = os.path.join(self.tmp_dir, 'objectsfs')
     test_config = yaml_load('hdata/config.yaml')
     local_repo = LocalRepository(test_config, hashfs_path)
     spec_path = os.path.join(self.tmp_dir, 'dataset-ex.spec')
     shutil.copy('hdata/dataset-ex.spec', spec_path)
     local_repo.add_metrics(spec_path,
                            (('metric_a', '10'), ('metric_b', '9')), None)
     test_spec_file = yaml_load(spec_path)
     self.assertFalse('metrics' in test_spec_file[DATASET_SPEC_KEY])
Ejemplo n.º 13
0
    def push(self, spec, retry=2, clear_on_fail=False):
        repo_type = self.__repo_type
        try:
            objects_path = get_objects_path(self.__config, repo_type)
            metadata_path = get_metadata_path(self.__config, repo_type)
            refs_path = get_refs_path(self.__config, repo_type)
        except Exception as e:
            log.error(e, class_name=REPOSITORY_CLASS_NAME)
            return

        met = Metadata(spec, metadata_path, self.__config, repo_type)
        fields = met.git_user_config()
        if None in fields.values():
            log.error(
                'Your name and email address need to be configured in git. '
                'Please see the commands below:',
                class_name=REPOSITORY_CLASS_NAME)

            log.error('git config --global user.name \'Your Name\'',
                      class_name=REPOSITORY_CLASS_NAME)
            log.error('git config --global user.email [email protected]',
                      class_name=REPOSITORY_CLASS_NAME)
            return
        if met.fetch() is False:
            return

        ref = Refs(refs_path, spec, repo_type)
        tag, sha = ref.branch()
        categories_path = get_path_with_categories(tag)

        spec_path, spec_file = None, None
        try:
            spec_path, spec_file = search_spec_file(self.__repo_type, spec,
                                                    categories_path)
        except Exception as e:
            log.error(e, class_name=REPOSITORY_CLASS_NAME)

        if spec_path is None:
            return

        full_spec_path = os.path.join(spec_path, spec_file)

        repo = LocalRepository(self.__config, objects_path, repo_type)
        ret = repo.push(objects_path, full_spec_path, retry, clear_on_fail)

        # ensure first we're on master !
        met.checkout()
        if ret == 0:
            # push metadata spec to LocalRepository git repository
            try:
                met.push()
            except Exception as e:
                log.error(e, class_name=REPOSITORY_CLASS_NAME)
                return
            MultihashFS(objects_path).reset_log()
Ejemplo n.º 14
0
 def _fetch(self, tag, samples, retries=2, bare=False):
     repo_type = self.__repo_type
     try:
         objects_path = get_objects_path(self.__config, repo_type)
         metadata_path = get_metadata_path(self.__config, repo_type)
         # check if no data left untracked/uncommitted. othrewise, stop.
         local_rep = LocalRepository(self.__config, objects_path, repo_type)
         return local_rep.fetch(metadata_path, tag, samples, retries, bare)
     except Exception as e:
         log.error(e, class_name=REPOSITORY_CLASS_NAME)
         return
Ejemplo n.º 15
0
    def test_import_files(self):
        path_obj = os.path.join(self.tmp_dir, 'objects')

        c = yaml_load('hdata/config.yaml')

        r = LocalRepository(c, path_obj)
        r.change_config_store(testprofile, testbucketname, 's3', region=None, endpoint_url=None)
        r.import_files(None, None, self.tmp_dir, 2, '{}://{}'.format('s3', testbucketname))

        for h in hs:
            file_path = os.path.join(self.tmp_dir, h)
            dir_file = os.path.join('hdata', h)
            self.assertTrue(os.path.exists(file_path))
            self.assertTrue(filecmp.cmp(dir_file, file_path))
Ejemplo n.º 16
0
    def test_get_update_cache(self):
        hfspath = os.path.join(self.tmp_dir, 'objectsfs')
        ohfs = MultihashFS(hfspath)
        key = ohfs.put(HDATA_IMG_1)

        cachepath = os.path.join(self.tmp_dir, 'cachefs')
        cache = Cache(cachepath, '', '')

        testbucketname = os.getenv('MLGIT_TEST_BUCKET', 'ml-git-datasets')
        c = get_sample_config_spec(testbucketname, testprofile, testregion)

        r = LocalRepository(c, hfspath)
        r._update_cache(cache, key)

        self.assertTrue(os.path.exists(cache.get_keypath(key)))
        self.assertEqual(self.md5sum(HDATA_IMG_1), self.md5sum(cache.get_keypath(key)))
Ejemplo n.º 17
0
    def test_add_metrics_with_none_metrics_options(self):
        hashfs_path = os.path.join(self.tmp_dir, 'objectsfs')
        test_config = yaml_load('hdata/config.yaml')
        local_repo = LocalRepository(test_config,
                                     hashfs_path,
                                     repo_type=MODELS)
        spec_path = os.path.join(self.tmp_dir, 'model-ex.spec')
        shutil.copy('hdata/dataset-ex.spec', spec_path)
        spec_file = yaml_load(spec_path)
        model = spec_file[DATASET_SPEC_KEY].copy()
        del spec_file[DATASET_SPEC_KEY]
        spec_file[MODEL_SPEC_KEY] = model
        yaml_save(spec_file, spec_path)
        local_repo.add_metrics(spec_path, (), None)

        test_spec_file = yaml_load(spec_path)
        self.assertFalse('metrics' in test_spec_file[MODEL_SPEC_KEY])
Ejemplo n.º 18
0
 def create(self, kwargs):
     artifact_name = kwargs['artifact_name']
     categories = list(kwargs['category'])
     version = int(kwargs['version_number'])
     imported_dir = kwargs['import']
     store_type = kwargs['store_type']
     bucket_name = kwargs['bucket_name']
     start_wizard = kwargs['wizard_config']
     import_url = kwargs['import_url']
     unzip_file = kwargs['unzip']
     credentials_path = kwargs['credentials_path']
     repo_type = self.__repo_type
     try:
         create_workspace_tree_structure(repo_type, artifact_name,
                                         categories, store_type,
                                         bucket_name, version, imported_dir,
                                         kwargs['mutability'])
         if start_wizard:
             has_new_store, store_type, bucket, profile, endpoint_url, git_repo = start_wizard_questions(
                 repo_type)
             if has_new_store:
                 store_add(store_type, bucket, profile, endpoint_url)
             update_store_spec(repo_type, artifact_name, store_type, bucket)
             remote_add(repo_type, git_repo)
         if import_url:
             self.create_config_store('gdrive', credentials_path)
             local = LocalRepository(
                 self.__config, get_objects_path(self.__config, repo_type))
             destine_path = os.path.join(repo_type, artifact_name, 'data')
             local.import_file_from_url(destine_path, import_url,
                                        StoreType.GDRIVE.value)
         if unzip_file:
             log.info('Unzipping files', CLASS_NAME=REPOSITORY_CLASS_NAME)
             data_path = os.path.join(get_root_path(), repo_type,
                                      artifact_name, 'data')
             unzip_files_in_directory(data_path)
         log.info("Project Created.", CLASS_NAME=REPOSITORY_CLASS_NAME)
     except Exception as e:
         if not isinstance(e, PermissionError):
             clear(os.path.join(repo_type, artifact_name))
         if isinstance(e, KeyboardInterrupt):
             log.info("Create command aborted!",
                      class_name=REPOSITORY_CLASS_NAME)
         else:
             log.error(e, CLASS_NAME=REPOSITORY_CLASS_NAME)
Ejemplo n.º 19
0
    def test_push(self):

        mlgit_dir = os.path.join(self.tmp_dir, '.ml-git')

        indexpath = os.path.join(mlgit_dir, 'index-test')
        mdpath = os.path.join(mlgit_dir, 'metadata-test')
        objectpath = os.path.join(mlgit_dir, 'objects-test')
        specpath = os.path.join(mdpath, 'vision-computing/images/dataset-ex')
        ensure_path_exists(specpath)
        ensure_path_exists(indexpath)
        shutil.copy('hdata/dataset-ex.spec', specpath + '/dataset-ex.spec')
        shutil.copy('hdata/config.yaml', mlgit_dir + '/config.yaml')
        manifestpath = os.path.join(specpath, 'MANIFEST.yaml')
        yaml_save(
            {
                'zdj7WjdojNAZN53Wf29rPssZamfbC6MVerzcGwd9tNciMpsQh':
                {'imghires.jpg'}
            }, manifestpath)

        # adds chunks to ml-git Index
        idx = MultihashIndex(specpath, indexpath, objectpath)
        idx.add('data-test-push/', manifestpath)

        fi = yaml_load(os.path.join(specpath, 'INDEX.yaml'))
        self.assertTrue(len(fi) > 0)
        self.assertTrue(os.path.exists(indexpath))

        o = Objects(specpath, objectpath)
        o.commit_index(indexpath, self.tmp_dir)

        self.assertTrue(os.path.exists(objectpath))
        c = yaml_load('hdata/config.yaml')
        r = LocalRepository(c, objectpath)
        r.push(objectpath, specpath + '/dataset-ex.spec')
        s3 = boto3.resource(
            's3',
            region_name='eu-west-1',
            aws_access_key_id='fake_access_key',
            aws_secret_access_key='fake_secret_key',
        )
        for key in idx.get_index():
            self.assertIsNotNone(s3.Object(testbucketname, key))
Ejemplo n.º 20
0
    def test_add_metrics(self):
        hashfs_path = os.path.join(self.tmp_dir, 'objectsfs')
        test_config = yaml_load('hdata/config.yaml')
        local_repo = LocalRepository(test_config,
                                     hashfs_path,
                                     repo_type=MODELS)
        spec_path = os.path.join(self.tmp_dir, 'model-ex.spec')
        shutil.copy('hdata/dataset-ex.spec', spec_path)
        spec_file = yaml_load(spec_path)
        model = spec_file[DATASET_SPEC_KEY].copy()
        del spec_file[DATASET_SPEC_KEY]
        spec_file[MODEL_SPEC_KEY] = model
        yaml_save(spec_file, spec_path)
        local_repo.add_metrics(spec_path,
                               (('metric_a', '10'), ('metric_b', '9')), None)

        test_spec_file = yaml_load(spec_path)
        self.assertTrue(test_spec_file[MODEL_SPEC_KEY]['metrics'].get(
            'metric_a', '') == 10.0)
        self.assertTrue(test_spec_file[MODEL_SPEC_KEY]['metrics'].get(
            'metric_b', '') == 9.0)
Ejemplo n.º 21
0
    def import_files(self, object, path, directory, retry, bucket):
        err_msg = 'Invalid ml-git project!'

        try:
            root = get_root_path()
            root_dir = os.path.join(root, directory)
        except Exception:
            log.error(err_msg, class_name=REPOSITORY_CLASS_NAME)
            return

        local = LocalRepository(
            self.__config, get_objects_path(self.__config, self.__repo_type),
            self.__repo_type)
        bucket_name = bucket['bucket_name']
        store_type = bucket['store_type']
        local.change_config_store(bucket['profile'],
                                  bucket_name,
                                  store_type,
                                  region=bucket['region'],
                                  endpoint_url=bucket['endpoint_url'])
        local.import_files(object, path, root_dir, retry,
                           '{}://{}'.format(store_type, bucket_name))
Ejemplo n.º 22
0
    def test_push(self):
        indexpath = os.path.join(self.tmp_dir, 'index-test')
        mdpath = os.path.join(self.tmp_dir, 'metadata-test')
        objectpath = os.path.join(self.tmp_dir, 'objects-test')
        specpath = os.path.join(mdpath, 'vision-computing/images/dataset-ex')
        ensure_path_exists(indexpath)
        ensure_path_exists(specpath)
        shutil.copy('hdata/dataset-ex.spec', specpath + '/dataset-ex.spec')
        manifestpath = os.path.join(specpath, 'MANIFEST.yaml')
        yaml_save(files_mock, manifestpath)
        idx = MultihashIndex(specpath, indexpath, objectpath)
        idx.add('data-test-push-1/', manifestpath)
        fidx = FullIndex(specpath, indexpath)

        self.assertTrue(os.path.exists(indexpath))
        c = yaml_load('hdata/config.yaml')
        o = Objects(specpath, objectpath)
        o.commit_index(indexpath, self.tmp_dir)

        self.assertTrue(os.path.exists(objectpath))

        r = LocalRepository(c, objectpath)
        self.assertTrue(r.push(objectpath, specpath + '/dataset-ex.spec') == 0)
        self.assertTrue(len(fidx.get_index()) == 1)
Ejemplo n.º 23
0
    def _checkout(self,
                  tag,
                  samples,
                  retries=2,
                  force_get=False,
                  dataset=False,
                  labels=False,
                  bare=False):
        repo_type = self.__repo_type
        try:
            cache_path = get_cache_path(self.__config, repo_type)
            metadata_path = get_metadata_path(self.__config, repo_type)
            objects_path = get_objects_path(self.__config, repo_type)
            refs_path = get_refs_path(self.__config, repo_type)
            # find out actual workspace path to save data
            if not self._tag_exists(tag):
                return None, None
            categories_path, spec_name, _ = spec_parse(tag)
            dataset_tag = None
            labels_tag = None
            root_path = get_root_path()
            ws_path = os.path.join(root_path,
                                   os.sep.join([repo_type, categories_path]))
            ensure_path_exists(ws_path)
        except Exception as e:
            log.error(e, class_name=LOCAL_REPOSITORY_CLASS_NAME)
            return None, None

        ref = Refs(refs_path, spec_name, repo_type)
        cur_tag, _ = ref.branch()

        if cur_tag == tag:
            log.info('already at tag [%s]' % tag,
                     class_name=REPOSITORY_CLASS_NAME)
            return None, None

        local_rep = LocalRepository(self.__config, objects_path, repo_type)
        # check if no data left untracked/uncommitted. otherwise, stop.
        if not force_get and local_rep.exist_local_changes(spec_name) is True:
            return None, None

        try:
            self._checkout_ref(tag)
        except Exception:
            log.error('Unable to checkout to %s' % tag,
                      class_name=REPOSITORY_CLASS_NAME)
            return None, None

        spec_path = os.path.join(metadata_path, categories_path,
                                 spec_name + '.spec')

        if dataset is True:
            dataset_tag = get_entity_tag(spec_path, repo_type, 'dataset')
        if labels is True:
            labels_tag = get_entity_tag(spec_path, repo_type, 'labels')

        fetch_success = self._fetch(tag, samples, retries, bare)

        if not fetch_success:
            objs = Objects('', objects_path)
            objs.fsck(remove_corrupted=True)
            self._checkout_ref('master')
            return None, None

        try:
            spec_index_path = os.path.join(
                get_index_metadata_path(self.__config, repo_type), spec_name)
        except Exception:
            return
        if os.path.exists(spec_index_path):
            if os.path.exists(
                    os.path.join(spec_index_path, spec_name + '.spec')):
                os.unlink(os.path.join(spec_index_path, spec_name + '.spec'))
            if os.path.exists(os.path.join(spec_index_path, 'README.md')):
                os.unlink(os.path.join(spec_index_path, 'README.md'))

        try:
            r = LocalRepository(self.__config, objects_path, repo_type)
            r.checkout(cache_path, metadata_path, objects_path, ws_path, tag,
                       samples, bare)
        except OSError as e:
            self._checkout_ref('master')
            if e.errno == errno.ENOSPC:
                log.error(
                    'There is not enough space in the disk. Remove some files and try again.',
                    class_name=REPOSITORY_CLASS_NAME)
            else:
                log.error(
                    'An error occurred while creating the files into workspace: %s \n.'
                    % e,
                    class_name=REPOSITORY_CLASS_NAME)
                return None, None
        except Exception as e:
            self._checkout_ref('master')
            log.error(
                'An error occurred while creating the files into workspace: %s \n.'
                % e,
                class_name=REPOSITORY_CLASS_NAME)
            return None, None

        m = Metadata('', metadata_path, self.__config, repo_type)
        sha = m.sha_from_tag(tag)
        ref.update_head(tag, sha)

        # restore to master/head
        self._checkout_ref('master')
        return dataset_tag, labels_tag
Ejemplo n.º 24
0
    def add(self, spec, file_path, bump_version=False, run_fsck=False):
        repo_type = self.__repo_type

        is_shared_objects = 'objects_path' in self.__config[repo_type]
        is_shared_cache = 'cache_path' in self.__config[repo_type]

        if not validate_config_spec_hash(self.__config):
            log.error(
                '.ml-git/config.yaml invalid. It should look something like this:\n%s'
                % get_yaml_str(
                    get_sample_config_spec('somebucket', 'someprofile',
                                           'someregion')),
                class_name=REPOSITORY_CLASS_NAME)
            return None

        path, file = None, None
        try:

            refs_path = get_refs_path(self.__config, repo_type)
            index_path = get_index_path(self.__config, repo_type)
            metadata_path = get_metadata_path(self.__config, repo_type)
            cache_path = get_cache_path(self.__config, repo_type)
            objects_path = get_objects_path(self.__config, repo_type)
            repo = LocalRepository(self.__config, objects_path, repo_type)
            mutability, check_mutability = repo.get_mutability_from_spec(
                spec, repo_type)
            sampling_flag = os.path.exists(
                os.path.join(index_path, 'metadata', spec, 'sampling'))
            if sampling_flag:
                log.error(
                    'You cannot add new data to an entity that is based on a checkout with the --sampling option.',
                    class_name=REPOSITORY_CLASS_NAME)
                return

            if not mutability:
                return

            if not check_mutability:
                log.error('Spec mutability cannot be changed.',
                          class_name=REPOSITORY_CLASS_NAME)
                return

            if not self._has_new_data(repo, spec):
                return None

            ref = Refs(refs_path, spec, repo_type)
            tag, sha = ref.branch()

            categories_path = get_path_with_categories(tag)

            path, file = search_spec_file(self.__repo_type, spec,
                                          categories_path)
        except Exception as e:
            log.error(e, class_name=REPOSITORY_CLASS_NAME)
            return

        if path is None:
            return
        spec_path = os.path.join(path, file)
        if not self._is_spec_valid(spec_path):
            return None

        # Check tag before anything to avoid creating unstable state
        log.debug('Repository: check if tag already exists',
                  class_name=REPOSITORY_CLASS_NAME)

        m = Metadata(spec, metadata_path, self.__config, repo_type)

        if not m.check_exists():
            log.error('The %s has not been initialized' % self.__repo_type,
                      class_name=REPOSITORY_CLASS_NAME)
            return

        try:
            m.update()
        except Exception:
            pass

        # get version of current manifest file
        manifest = self._get_current_manifest_file(m, tag)

        try:
            # adds chunks to ml-git Index
            log.info('%s adding path [%s] to ml-git index' % (repo_type, path),
                     class_name=REPOSITORY_CLASS_NAME)
            with change_mask_for_routine(is_shared_objects):
                idx = MultihashIndex(spec, index_path, objects_path,
                                     mutability, cache_path)
                idx.add(path, manifest, file_path)

            # create hard links in ml-git Cache
            self.create_hard_links_in_cache(cache_path, index_path,
                                            is_shared_cache, mutability, path,
                                            spec)
        except Exception as e:
            log.error(e, class_name=REPOSITORY_CLASS_NAME)
            return None

        if bump_version and not increment_version_in_spec(
                spec_path, self.__repo_type):
            return None

        idx.add_metadata(path, file)

        self._check_corrupted_files(spec, repo)

        # Run file check
        if run_fsck:
            self.fsck()
Ejemplo n.º 25
0
    def _checkout(self, tag, samples, options):
        dataset = options['with_dataset']
        labels = options['with_labels']
        retries = options['retry']
        force_get = options['force']
        bare = options['bare']
        version = options['version']
        repo_type = self.__repo_type
        try:
            cache_path = get_cache_path(self.__config, repo_type)
            metadata_path = get_metadata_path(self.__config, repo_type)
            objects_path = get_objects_path(self.__config, repo_type)
            refs_path = get_refs_path(self.__config, repo_type)

            if not re.search(RGX_TAG_FORMAT, tag):
                metadata_path = get_metadata_path(self.__config, repo_type)
                metadata = Metadata(tag, metadata_path, self.__config,
                                    repo_type)
                tag = metadata.get_tag(tag, version)
                if not tag:
                    return None, None
            elif not self._tag_exists(tag):
                return None, None
            categories_path, spec_name, _ = spec_parse(tag)
            root_path = get_root_path()
            ws_path = os.path.join(root_path,
                                   os.sep.join([repo_type, categories_path]))
        except Exception as e:
            log.error(e, class_name=LOCAL_REPOSITORY_CLASS_NAME)
            return None, None

        ref = Refs(refs_path, spec_name, repo_type)
        cur_tag, _ = ref.branch()

        if cur_tag == tag:
            log.info('already at tag [%s]' % tag,
                     class_name=REPOSITORY_CLASS_NAME)
            return None, None

        local_rep = LocalRepository(self.__config, objects_path, repo_type)
        # check if no data left untracked/uncommitted. otherwise, stop.
        if not force_get and local_rep.exist_local_changes(spec_name) is True:
            return None, None

        try:
            self._checkout_ref(tag)
        except Exception:
            log.error('Unable to checkout to %s' % tag,
                      class_name=REPOSITORY_CLASS_NAME)
            return None, None

        dataset_tag, labels_tag = self._get_related_tags(
            categories_path, dataset, labels, metadata_path, repo_type,
            spec_name)

        fetch_success = self._fetch(tag, samples, retries, bare)
        if not fetch_success:
            objs = Objects('', objects_path)
            objs.fsck(remove_corrupted=True)
            self._checkout_ref()
            return None, None
        ensure_path_exists(ws_path)

        try:
            spec_index_path = os.path.join(
                get_index_metadata_path(self.__config, repo_type), spec_name)
        except Exception:
            return
        self._delete_spec_and_readme(spec_index_path, spec_name)

        try:
            r = LocalRepository(self.__config, objects_path, repo_type)
            r.checkout(cache_path, metadata_path, ws_path, tag, samples, bare)
        except OSError as e:
            self._checkout_ref()
            if e.errno == errno.ENOSPC:
                log.error(
                    'There is not enough space in the disk. Remove some files and try again.',
                    class_name=REPOSITORY_CLASS_NAME)
            else:
                log.error(
                    'An error occurred while creating the files into workspace: %s \n.'
                    % e,
                    class_name=REPOSITORY_CLASS_NAME)
                return None, None
        except Exception as e:
            self._checkout_ref()
            log.error(
                'An error occurred while creating the files into workspace: %s \n.'
                % e,
                class_name=REPOSITORY_CLASS_NAME)
            return None, None

        m = Metadata('', metadata_path, self.__config, repo_type)
        sha = m.sha_from_tag(tag)
        ref.update_head(tag, sha)

        # restore to master/head
        self._checkout_ref()
        return dataset_tag, labels_tag
Ejemplo n.º 26
0
    def commit(self, spec, specs, version=None, run_fsck=False, msg=None):
        # Move chunks from index to .ml-git/objects
        repo_type = self.__repo_type
        try:
            index_path = get_index_path(self.__config, repo_type)
            objects_path = get_objects_path(self.__config, repo_type)
            metadata_path = get_metadata_path(self.__config, repo_type)
            refs_path = get_refs_path(self.__config, repo_type)
            repo = LocalRepository(self.__config, objects_path, repo_type)
            mutability, check_mutability = repo.get_mutability_from_spec(
                spec, repo_type)

            if not mutability:
                return

            if not check_mutability:
                log.error('Spec mutability cannot be changed.',
                          class_name=REPOSITORY_CLASS_NAME)
                return
        except Exception as e:
            log.error(e, class_name=REPOSITORY_CLASS_NAME)
            return

        ref = Refs(refs_path, spec, repo_type)

        tag, sha = ref.branch()
        categories_path = get_path_with_categories(tag)
        manifest_path = os.path.join(metadata_path, categories_path, spec,
                                     MANIFEST_FILE)
        path, file = None, None
        try:
            path, file = search_spec_file(self.__repo_type, spec,
                                          categories_path)
        except Exception as e:
            log.error(e, class_name=REPOSITORY_CLASS_NAME)

        if path is None:
            return None, None, None

        spec_path = os.path.join(path, file)
        idx = MultihashIndex(spec, index_path, objects_path)

        if version:
            set_version_in_spec(version, spec_path, self.__repo_type)
            idx.add_metadata(path, file)

        # Check tag before anything to avoid creating unstable state
        log.debug('Check if tag already exists',
                  class_name=REPOSITORY_CLASS_NAME)
        m = Metadata(spec, metadata_path, self.__config, repo_type)

        if not m.check_exists():
            log.error('The %s has not been initialized' % self.__repo_type,
                      class_name=REPOSITORY_CLASS_NAME)
            return

        full_metadata_path, categories_sub_path, metadata = m.tag_exists(
            index_path)
        if metadata is None:
            return None

        log.debug('%s -> %s' % (index_path, objects_path),
                  class_name=REPOSITORY_CLASS_NAME)
        # commit objects in index to ml-git objects
        o = Objects(spec, objects_path)
        changed_files, deleted_files = o.commit_index(index_path, path)

        bare_mode = os.path.exists(
            os.path.join(index_path, 'metadata', spec, 'bare'))

        if not bare_mode:
            manifest = m.get_metadata_manifest(manifest_path)
            self._remove_deleted_files(idx, index_path, m, manifest, spec,
                                       deleted_files)
            m.remove_files_added_after_base_tag(manifest, path)
        else:
            tag, _ = ref.branch()
            self._checkout_ref(tag)
        # update metadata spec & README.md
        # option --dataset-spec --labels-spec
        tag, sha = m.commit_metadata(index_path, specs, msg, changed_files,
                                     mutability, path)

        # update ml-git ref spec HEAD == to new SHA-1 / tag
        if tag is None:
            return None
        ref = Refs(refs_path, spec, repo_type)
        ref.update_head(tag, sha)

        # Run file check
        if run_fsck:
            self.fsck()

        return tag