Beispiel #1
0
    def test_get(self):
        idx = MultihashIndex('dataset-spec', self.tmp_dir, self.tmp_dir)
        idx.add('data', '')

        mf = idx.get('zdj7WgHSKJkoJST5GWGgS53ARqV7oqMGYVvWzEWku3MBfnQ9u', self.tmp_dir, 'think-hires.jpg')

        self.assertEqual(singlefile.get('index'), mf)
Beispiel #2
0
    def test_add_idmpotent(self):
        idx = MultihashIndex('dataset-spec', self.tmp_dir, self.tmp_dir)
        idx.add('data', '')
        idx.add('data', '')

        mf = os.path.join(self.tmp_dir, 'metadata', 'dataset-spec', 'MANIFEST.yaml')
        self.assertEqual(yaml_load(mf), singlefile['manifest'])
Beispiel #3
0
    def test_add_manifest(self):
        manifestfile = os.path.join(self.tmp_dir, 'MANIFEST.yaml')
        yaml_save(singlefile['manifest'], manifestfile)

        idx = MultihashIndex('dataset-spec', self.tmp_dir, self.tmp_dir)
        idx.add('data', manifestfile)

        self.assertFalse(os.path.exists(os.path.join(self.tmp_dir, 'files', 'dataset-spec', 'MANIFEST.yaml')))
Beispiel #4
0
    def test_put(self):
        idx = MultihashIndex('dataset-spec', self.tmp_dir, self.tmp_dir)
        idx.add('data', self.tmp_dir)

        mf = idx.get_index()
        self.assertTrue(mf.exists('zdj7WgHSKJkoJST5GWGgS53ARqV7oqMGYVvWzEWku3MBfnQ9u'))

        idx.add('image.jpg', self.tmp_dir)
        idx.update_index('zdj7WemKEtQMVL81UU6PSuYaoxvBQ6CiUMq1fMvoXBhPUsCK2', 'image.jpg')
        self.assertTrue(mf.exists('zdj7WemKEtQMVL81UU6PSuYaoxvBQ6CiUMq1fMvoXBhPUsCK2'))
Beispiel #5
0
 def _get_blobs_hashes(self, index_path, objects_path, repo_type):
     blobs_hashes = []
     for root, dirs, files in os.walk(os.path.join(index_path, 'metadata')):
         for spec in dirs:
             try:
                 self._check_is_valid_entity(repo_type, spec)
                 idx = MultihashIndex(spec, index_path, objects_path)
                 blobs_hashes.extend(idx.get_hashes_list())
             except Exception:
                 log.debug(output_messages['INFO_ENTITY_DELETED'] % spec,
                           class_name=REPOSITORY_CLASS_NAME)
     return blobs_hashes
Beispiel #6
0
    def test_add_full_index(self):
        manifestfile = os.path.join(self.tmp_dir, 'MANIFEST.yaml')
        yaml_save(singlefile['manifest'], manifestfile)

        idx = MultihashIndex('dataset-spec', self.tmp_dir, self.tmp_dir)
        idx.add('data', manifestfile)
        f_idx = yaml_load(os.path.join(self.tmp_dir, 'metadata', 'dataset-spec', 'INDEX.yaml'))
        self.assertTrue(len(f_idx) > 0)
        for k, v in f_idx.items():
            self.assertEqual(k, 'think-hires.jpg')
            self.assertEqual(v['hash'], 'zdj7WgHSKJkoJST5GWGgS53ARqV7oqMGYVvWzEWku3MBfnQ9u')
            self.assertEqual(v['status'], 'a')

        self.assertFalse(os.path.exists(os.path.join(self.tmp_dir, 'dataset-spec', 'INDEX.yaml')))
Beispiel #7
0
    def test_add(self):
        idx = MultihashIndex('dataset-spec', self.tmp_dir, self.tmp_dir)

        # TODO: there is incorrect behavior here.  During unit test runs, the link count can be > 1 in some cases
        # incorrectly, so the file doesn't get added to the index.  I think this is a design issue for index.py
        # add_file in general; for now we will allow the unit tests to not trust this data and add the file anyway
        # by adding a trust_links parameter that defaults to True and cascades its way through the calls.

        idx.add('data', '')

        mf = os.path.join(self.tmp_dir, 'metadata', 'dataset-spec', 'MANIFEST.yaml')
        self.assertEqual(yaml_load(mf), singlefile['manifest'])
        fi = yaml_load(os.path.join(self.tmp_dir, 'metadata', 'dataset-spec', 'INDEX.yaml'))
        for k, v in fi.items():
            self.assertEqual(v['hash'], singlefile['datastore'])
Beispiel #8
0
    def test_add2(self):
        idx = MultihashIndex('dataset-spec', self.tmp_dir, self.tmp_dir)
        idx.add('data', '')

        mf = os.path.join(self.tmp_dir, 'metadata', 'dataset-spec', 'MANIFEST.yaml')
        self.assertEqual(yaml_load(mf), singlefile['manifest'])
        fi = yaml_load(os.path.join(self.tmp_dir, 'metadata', 'dataset-spec', 'INDEX.yaml'))
        for k, v in fi.items():
            self.assertEqual(v['hash'], singlefile['datastore'])

        idx.add('data2', '')
        self.assertEqual(yaml_load(mf), secondfile['manifest'])
        fi = yaml_load(os.path.join(self.tmp_dir, 'metadata', 'dataset-spec', 'INDEX.yaml'))
        hashs = []
        for k, v in fi.items():
            hashs.append(v['hash'])
        self.assertIn(secondfile['datastore'], hashs)
Beispiel #9
0
    def test_remove_hash(self):

        idx = MultihashIndex('dataset-spec', self.tmp_dir, self.tmp_dir)
        data = str(self.test_dir / 'data')
        idx.add(data, '')
        idx.add(str(self.test_dir / 'data2'), '')
        hfs = HashFS(self.tmp_dir, blocksize=1024 * 1024)
        o = Objects('dataset-spec', self.tmp_dir)
        o.commit_index(self.tmp_dir, data)
        for h in hash_list:
            with open(os.path.join(self.tmp_dir, 'hashfs', 'log', STORAGE_LOG)) as f:
                self.assertTrue(h in f.read())

        for h in hash_list:
            hfs.remove_hash(h)

        for h in hash_list:
            with open(os.path.join(self.tmp_dir, 'hashfs', 'log', STORAGE_LOG)) as f:
                self.assertFalse(h in f.read())
Beispiel #10
0
    def test_push(self):

        mlgit_dir = os.path.join(self.tmp_dir, '.ml-git')

        indexpath = os.path.join(mlgit_dir, 'index-test')
        mdpath = os.path.join(mlgit_dir, 'metadata-test')
        objectpath = os.path.join(mlgit_dir, 'objects-test')
        specpath = os.path.join(mdpath, 'vision-computing/images/dataset-ex')
        ensure_path_exists(specpath)
        ensure_path_exists(indexpath)
        shutil.copy('hdata/dataset-ex.spec', specpath + '/dataset-ex.spec')
        shutil.copy('hdata/config.yaml', mlgit_dir + '/config.yaml')
        manifestpath = os.path.join(specpath, 'MANIFEST.yaml')
        yaml_save(
            {
                'zdj7WjdojNAZN53Wf29rPssZamfbC6MVerzcGwd9tNciMpsQh':
                {'imghires.jpg'}
            }, manifestpath)

        # adds chunks to ml-git Index
        idx = MultihashIndex(specpath, indexpath, objectpath)
        idx.add('data-test-push/', manifestpath)

        fi = yaml_load(os.path.join(specpath, 'INDEX.yaml'))
        self.assertTrue(len(fi) > 0)
        self.assertTrue(os.path.exists(indexpath))

        o = Objects(specpath, objectpath)
        o.commit_index(indexpath, self.tmp_dir)

        self.assertTrue(os.path.exists(objectpath))
        c = yaml_load('hdata/config.yaml')
        r = LocalRepository(c, objectpath)
        r.push(objectpath, specpath + '/dataset-ex.spec')
        s3 = boto3.resource(
            's3',
            region_name='eu-west-1',
            aws_access_key_id='fake_access_key',
            aws_secret_access_key='fake_secret_key',
        )
        for key in idx.get_index():
            self.assertIsNotNone(s3.Object(testbucketname, key))
Beispiel #11
0
    def fsck(self):
        repo_type = self.__repo_type
        try:
            objects_path = get_objects_path(self.__config, repo_type)
            index_path = get_index_path(self.__config, repo_type)
        except RootPathException:
            return
        o = Objects('', objects_path)
        corrupted_files_obj = o.fsck()
        corrupted_files_obj_len = len(corrupted_files_obj)

        idx = MultihashIndex('', index_path, objects_path)
        corrupted_files_idx = idx.fsck()
        corrupted_files_idx_len = len(corrupted_files_idx)

        print('[%d] corrupted file(s) in Local Repository: %s' %
              (corrupted_files_obj_len, corrupted_files_obj))
        print('[%d] corrupted file(s) in Index: %s' %
              (corrupted_files_idx_len, corrupted_files_idx))
        print('Total of corrupted files: %d' %
              (corrupted_files_obj_len + corrupted_files_idx_len))
Beispiel #12
0
    def test_push(self):
        indexpath = os.path.join(self.tmp_dir, 'index-test')
        mdpath = os.path.join(self.tmp_dir, 'metadata-test')
        objectpath = os.path.join(self.tmp_dir, 'objects-test')
        specpath = os.path.join(mdpath, 'vision-computing/images/dataset-ex')
        ensure_path_exists(indexpath)
        ensure_path_exists(specpath)
        shutil.copy('hdata/dataset-ex.spec', specpath + '/dataset-ex.spec')
        manifestpath = os.path.join(specpath, 'MANIFEST.yaml')
        yaml_save(files_mock, manifestpath)
        idx = MultihashIndex(specpath, indexpath, objectpath)
        idx.add('data-test-push-1/', manifestpath)
        fidx = FullIndex(specpath, indexpath)

        self.assertTrue(os.path.exists(indexpath))
        c = yaml_load('hdata/config.yaml')
        o = Objects(specpath, objectpath)
        o.commit_index(indexpath, self.tmp_dir)

        self.assertTrue(os.path.exists(objectpath))

        r = LocalRepository(c, objectpath)
        self.assertTrue(r.push(objectpath, specpath + '/dataset-ex.spec') == 0)
        self.assertTrue(len(fidx.get_index()) == 1)
Beispiel #13
0
    def reset(self, spec, reset_type, head):
        log.info(output_messages['INFO_INITIALIZING_RESET'] %
                 (reset_type, head),
                 class_name=REPOSITORY_CLASS_NAME)
        if (reset_type == '--soft'
                or reset_type == '--mixed') and head == HEAD:
            return
        try:
            repo_type = self.__repo_type
            metadata_path = get_metadata_path(self.__config, repo_type)
            index_path = get_index_path(self.__config, repo_type)
            refs_path = get_refs_path(self.__config, repo_type)
            object_path = get_objects_path(self.__config, repo_type)
            met = Metadata(spec, metadata_path, self.__config, repo_type)
            ref = Refs(refs_path, spec, repo_type)
            idx = MultihashIndex(spec, index_path, object_path)
            fidx = FullIndex(spec, index_path)
        except Exception as e:
            log.error(e, class_name=REPOSITORY_CLASS_NAME)
            return

        # get tag before reset
        tag = met.get_current_tag()
        categories_path = get_path_with_categories(str(tag))
        # current manifest file before reset
        manifest_path = os.path.join(metadata_path, categories_path, spec,
                                     MANIFEST_FILE)
        _manifest = Manifest(manifest_path).load()

        if head == HEAD_1:  # HEAD~1
            try:
                # reset the repo
                met.reset()
            except Exception:
                return

        # get tag after reset
        tag_after_reset = met.get_current_tag()
        sha = met.sha_from_tag(tag_after_reset)

        # update ml-git ref HEAD
        ref.update_head(str(tag_after_reset), sha)

        # # get path to reset workspace in case of --hard
        path, file = None, None
        try:
            path, file = search_spec_file(self.__repo_type, spec,
                                          categories_path)
        except Exception as e:
            log.error(e, class_name=REPOSITORY_CLASS_NAME)

        if reset_type == '--hard' and path is None:
            return

        # get manifest from metadata after reset
        _manifest_changed = Manifest(manifest_path)

        hash_files, file_names = _manifest_changed.get_diff(_manifest)
        idx_mf = idx.get_index().load()

        if reset_type == '--soft':
            # add in index/metadata/<entity-name>/MANIFEST
            idx.update_index_manifest(idx_mf)
            idx.update_index_manifest(hash_files)
            fidx.update_index_status(file_names, Status.a.name)

        else:  # --hard or --mixed
            # remove hash from index/hashsh/store.log
            file_names.update(*idx_mf.values())
            objs = MultihashFS(index_path)
            for key_hash in hash_files:
                objs.remove_hash(key_hash)
            idx.remove_manifest()
            fidx.remove_from_index_yaml(file_names)
            fidx.remove_uncommitted()

        if reset_type == '--hard':  # reset workspace
            remove_from_workspace(file_names, path, spec)
Beispiel #14
0
    def add(self, spec, file_path, bump_version=False, run_fsck=False):
        repo_type = self.__repo_type

        is_shared_objects = 'objects_path' in self.__config[repo_type]
        is_shared_cache = 'cache_path' in self.__config[repo_type]

        if not validate_config_spec_hash(self.__config):
            log.error(
                '.ml-git/config.yaml invalid. It should look something like this:\n%s'
                % get_yaml_str(
                    get_sample_config_spec('somebucket', 'someprofile',
                                           'someregion')),
                class_name=REPOSITORY_CLASS_NAME)
            return None

        path, file = None, None
        try:

            refs_path = get_refs_path(self.__config, repo_type)
            index_path = get_index_path(self.__config, repo_type)
            metadata_path = get_metadata_path(self.__config, repo_type)
            cache_path = get_cache_path(self.__config, repo_type)
            objects_path = get_objects_path(self.__config, repo_type)
            repo = LocalRepository(self.__config, objects_path, repo_type)
            mutability, check_mutability = repo.get_mutability_from_spec(
                spec, repo_type)
            sampling_flag = os.path.exists(
                os.path.join(index_path, 'metadata', spec, 'sampling'))
            if sampling_flag:
                log.error(
                    'You cannot add new data to an entity that is based on a checkout with the --sampling option.',
                    class_name=REPOSITORY_CLASS_NAME)
                return

            if not mutability:
                return

            if not check_mutability:
                log.error('Spec mutability cannot be changed.',
                          class_name=REPOSITORY_CLASS_NAME)
                return

            if not self._has_new_data(repo, spec):
                return None

            ref = Refs(refs_path, spec, repo_type)
            tag, sha = ref.branch()

            categories_path = get_path_with_categories(tag)

            path, file = search_spec_file(self.__repo_type, spec,
                                          categories_path)
        except Exception as e:
            log.error(e, class_name=REPOSITORY_CLASS_NAME)
            return

        if path is None:
            return
        spec_path = os.path.join(path, file)
        if not self._is_spec_valid(spec_path):
            return None

        # Check tag before anything to avoid creating unstable state
        log.debug('Repository: check if tag already exists',
                  class_name=REPOSITORY_CLASS_NAME)

        m = Metadata(spec, metadata_path, self.__config, repo_type)

        if not m.check_exists():
            log.error('The %s has not been initialized' % self.__repo_type,
                      class_name=REPOSITORY_CLASS_NAME)
            return

        try:
            m.update()
        except Exception:
            pass

        # get version of current manifest file
        manifest = self._get_current_manifest_file(m, tag)

        try:
            # adds chunks to ml-git Index
            log.info('%s adding path [%s] to ml-git index' % (repo_type, path),
                     class_name=REPOSITORY_CLASS_NAME)
            with change_mask_for_routine(is_shared_objects):
                idx = MultihashIndex(spec, index_path, objects_path,
                                     mutability, cache_path)
                idx.add(path, manifest, file_path)

            # create hard links in ml-git Cache
            self.create_hard_links_in_cache(cache_path, index_path,
                                            is_shared_cache, mutability, path,
                                            spec)
        except Exception as e:
            log.error(e, class_name=REPOSITORY_CLASS_NAME)
            return None

        if bump_version and not increment_version_in_spec(
                spec_path, self.__repo_type):
            return None

        idx.add_metadata(path, file)

        self._check_corrupted_files(spec, repo)

        # Run file check
        if run_fsck:
            self.fsck()
Beispiel #15
0
    def commit(self, spec, specs, version=None, run_fsck=False, msg=None):
        # Move chunks from index to .ml-git/objects
        repo_type = self.__repo_type
        try:
            index_path = get_index_path(self.__config, repo_type)
            objects_path = get_objects_path(self.__config, repo_type)
            metadata_path = get_metadata_path(self.__config, repo_type)
            refs_path = get_refs_path(self.__config, repo_type)
            repo = LocalRepository(self.__config, objects_path, repo_type)
            mutability, check_mutability = repo.get_mutability_from_spec(
                spec, repo_type)

            if not mutability:
                return

            if not check_mutability:
                log.error('Spec mutability cannot be changed.',
                          class_name=REPOSITORY_CLASS_NAME)
                return
        except Exception as e:
            log.error(e, class_name=REPOSITORY_CLASS_NAME)
            return

        ref = Refs(refs_path, spec, repo_type)

        tag, sha = ref.branch()
        categories_path = get_path_with_categories(tag)
        manifest_path = os.path.join(metadata_path, categories_path, spec,
                                     MANIFEST_FILE)
        path, file = None, None
        try:
            path, file = search_spec_file(self.__repo_type, spec,
                                          categories_path)
        except Exception as e:
            log.error(e, class_name=REPOSITORY_CLASS_NAME)

        if path is None:
            return None, None, None

        spec_path = os.path.join(path, file)
        idx = MultihashIndex(spec, index_path, objects_path)

        if version:
            set_version_in_spec(version, spec_path, self.__repo_type)
            idx.add_metadata(path, file)

        # Check tag before anything to avoid creating unstable state
        log.debug('Check if tag already exists',
                  class_name=REPOSITORY_CLASS_NAME)
        m = Metadata(spec, metadata_path, self.__config, repo_type)

        if not m.check_exists():
            log.error('The %s has not been initialized' % self.__repo_type,
                      class_name=REPOSITORY_CLASS_NAME)
            return

        full_metadata_path, categories_sub_path, metadata = m.tag_exists(
            index_path)
        if metadata is None:
            return None

        log.debug('%s -> %s' % (index_path, objects_path),
                  class_name=REPOSITORY_CLASS_NAME)
        # commit objects in index to ml-git objects
        o = Objects(spec, objects_path)
        changed_files, deleted_files = o.commit_index(index_path, path)

        bare_mode = os.path.exists(
            os.path.join(index_path, 'metadata', spec, 'bare'))

        if not bare_mode:
            manifest = m.get_metadata_manifest(manifest_path)
            self._remove_deleted_files(idx, index_path, m, manifest, spec,
                                       deleted_files)
            m.remove_files_added_after_base_tag(manifest, path)
        else:
            tag, _ = ref.branch()
            self._checkout_ref(tag)
        # update metadata spec & README.md
        # option --dataset-spec --labels-spec
        tag, sha = m.commit_metadata(index_path, specs, msg, changed_files,
                                     mutability, path)

        # update ml-git ref spec HEAD == to new SHA-1 / tag
        if tag is None:
            return None
        ref = Refs(refs_path, spec, repo_type)
        ref.update_head(tag, sha)

        # Run file check
        if run_fsck:
            self.fsck()

        return tag