Ejemplo n.º 1
0
 def checkout(self, tag, samples, options):
     try:
         metadata_path = get_metadata_path(self.__config)
     except RootPathException as e:
         log.warn(e, class_name=REPOSITORY_CLASS_NAME)
         metadata_path = self._initialize_repository_on_the_fly()
     dt_tag, lb_tag = self._checkout(tag, samples, options)
     options['with_dataset'] = False
     options['with_labels'] = False
     if dt_tag is not None:
         try:
             self.__repo_type = 'dataset'
             m = Metadata('', metadata_path, self.__config,
                          self.__repo_type)
             log.info('Initializing related dataset download',
                      class_name=REPOSITORY_CLASS_NAME)
             if not m.check_exists():
                 m.init()
             self._checkout(dt_tag, samples, options)
         except Exception as e:
             log.error('LocalRepository: [%s]' % e,
                       class_name=REPOSITORY_CLASS_NAME)
     if lb_tag is not None:
         try:
             self.__repo_type = 'labels'
             m = Metadata('', metadata_path, self.__config,
                          self.__repo_type)
             log.info('Initializing related labels download',
                      class_name=REPOSITORY_CLASS_NAME)
             if not m.check_exists():
                 m.init()
             self._checkout(lb_tag, samples, options)
         except Exception as e:
             log.error('LocalRepository: [%s]' % e,
                       class_name=REPOSITORY_CLASS_NAME)
Ejemplo n.º 2
0
def check_initialized_entity(context, entity_type, entity_name):
    config = merged_config_load()
    metadata_path = get_metadata_path(config, entity_type)
    metadata = Metadata(entity_name, metadata_path, config, entity_type)
    if not metadata.check_exists():
        log.error(output_messages['ERROR_NOT_INITIALIZED'] % entity_type)
        context.exit()
Ejemplo n.º 3
0
 def metadata_exists(self, entity):
     self.__repo_type = entity
     entity_metadata_path = get_metadata_path(self.__config,
                                              self.__repo_type)
     metadata = Metadata('', entity_metadata_path, self.__config,
                         self.__repo_type)
     return metadata.check_exists()
Ejemplo n.º 4
0
def get_last_entity_version(entity_type, entity_name):
    config = merged_config_load()
    metadata_path = get_metadata_path(config, entity_type)
    metadata = Metadata(entity_name, metadata_path, config, entity_type)
    if not metadata.check_exists():
        log.error(output_messages['ERROR_NOT_INITIALIZED'] % entity_type)
        return
    last_version = metadata.get_last_tag_version(entity_name)
    return last_version + 1
Ejemplo n.º 5
0
 def test_default_branch(self):
     default_branch_for_empty_repo = 'master'
     new_branch = 'main'
     m = Metadata('', self.test_dir, config, DATASETS)
     m.init()
     self.assertTrue(m.check_exists())
     self.assertEqual(m.get_default_branch(), default_branch_for_empty_repo)
     self.change_branch(m.path, new_branch)
     self.assertNotEqual(m.get_default_branch(), default_branch_for_empty_repo)
     self.assertEqual(m.get_default_branch(), new_branch)
     clear(m.path)
Ejemplo n.º 6
0
    def test_clone_empty_config_repo(self):
        config = {
            'mlgit_path': './mdata',
            'mlgit_conf': 'config.yaml',
            'verbose': 'info',
            DATASETS: {'git': '', },
            LABELS: {'git': '', },
            MODELS: {'git': '', }, }

        m = Metadata('', self.test_dir, config, DATASETS)
        m.clone_config_repo()
        self.assertFalse(m.check_exists())
Ejemplo n.º 7
0
 def list(self):
     repo_type = self.__repo_type
     try:
         metadata_path = get_metadata_path(self.__config, repo_type)
         m = Metadata('', metadata_path, self.__config, repo_type)
         if not m.check_exists():
             raise RuntimeError('The %s doesn\'t have been initialized.' %
                                self.__repo_type)
         m.checkout()
         m.list(title='ML ' + repo_type)
     except GitError as g:
         error_message = g.stderr
         if 'did not match any file(s) known' in error_message:
             error_message = 'You don\'t have any entity being managed.'
         log.error(error_message, class_name=REPOSITORY_CLASS_NAME)
         return
     except Exception as e:
         log.error(e, class_name=REPOSITORY_CLASS_NAME)
         return
Ejemplo n.º 8
0
    def test_clone_empty_config_repo(self):
        config = {
            'mlgit_path': './mdata',
            'mlgit_conf': 'config.yaml',
            'verbose': 'info',
            'dataset': {
                'git': '',
            },
            'labels': {
                'git': '',
            },
            'model': {
                'git': '',
            },
        }

        m = Metadata('', self.test_dir, config, repotype)
        m.clone_config_repo()
        self.assertFalse(m.check_exists())
Ejemplo n.º 9
0
    def add(self, spec, file_path, bump_version=False, run_fsck=False):
        repo_type = self.__repo_type

        is_shared_objects = 'objects_path' in self.__config[repo_type]
        is_shared_cache = 'cache_path' in self.__config[repo_type]

        if not validate_config_spec_hash(self.__config):
            log.error(
                '.ml-git/config.yaml invalid. It should look something like this:\n%s'
                % get_yaml_str(
                    get_sample_config_spec('somebucket', 'someprofile',
                                           'someregion')),
                class_name=REPOSITORY_CLASS_NAME)
            return None

        path, file = None, None
        try:

            refs_path = get_refs_path(self.__config, repo_type)
            index_path = get_index_path(self.__config, repo_type)
            metadata_path = get_metadata_path(self.__config, repo_type)
            cache_path = get_cache_path(self.__config, repo_type)
            objects_path = get_objects_path(self.__config, repo_type)
            repo = LocalRepository(self.__config, objects_path, repo_type)
            mutability, check_mutability = repo.get_mutability_from_spec(
                spec, repo_type)
            sampling_flag = os.path.exists(
                os.path.join(index_path, 'metadata', spec, 'sampling'))
            if sampling_flag:
                log.error(
                    'You cannot add new data to an entity that is based on a checkout with the --sampling option.',
                    class_name=REPOSITORY_CLASS_NAME)
                return

            if not mutability:
                return

            if not check_mutability:
                log.error('Spec mutability cannot be changed.',
                          class_name=REPOSITORY_CLASS_NAME)
                return

            if not self._has_new_data(repo, spec):
                return None

            ref = Refs(refs_path, spec, repo_type)
            tag, sha = ref.branch()

            categories_path = get_path_with_categories(tag)

            path, file = search_spec_file(self.__repo_type, spec,
                                          categories_path)
        except Exception as e:
            log.error(e, class_name=REPOSITORY_CLASS_NAME)
            return

        if path is None:
            return
        spec_path = os.path.join(path, file)
        if not self._is_spec_valid(spec_path):
            return None

        # Check tag before anything to avoid creating unstable state
        log.debug('Repository: check if tag already exists',
                  class_name=REPOSITORY_CLASS_NAME)

        m = Metadata(spec, metadata_path, self.__config, repo_type)

        if not m.check_exists():
            log.error('The %s has not been initialized' % self.__repo_type,
                      class_name=REPOSITORY_CLASS_NAME)
            return

        try:
            m.update()
        except Exception:
            pass

        # get version of current manifest file
        manifest = self._get_current_manifest_file(m, tag)

        try:
            # adds chunks to ml-git Index
            log.info('%s adding path [%s] to ml-git index' % (repo_type, path),
                     class_name=REPOSITORY_CLASS_NAME)
            with change_mask_for_routine(is_shared_objects):
                idx = MultihashIndex(spec, index_path, objects_path,
                                     mutability, cache_path)
                idx.add(path, manifest, file_path)

            # create hard links in ml-git Cache
            self.create_hard_links_in_cache(cache_path, index_path,
                                            is_shared_cache, mutability, path,
                                            spec)
        except Exception as e:
            log.error(e, class_name=REPOSITORY_CLASS_NAME)
            return None

        if bump_version and not increment_version_in_spec(
                spec_path, self.__repo_type):
            return None

        idx.add_metadata(path, file)

        self._check_corrupted_files(spec, repo)

        # Run file check
        if run_fsck:
            self.fsck()
Ejemplo n.º 10
0
    def commit(self, spec, specs, version=None, run_fsck=False, msg=None):
        # Move chunks from index to .ml-git/objects
        repo_type = self.__repo_type
        try:
            index_path = get_index_path(self.__config, repo_type)
            objects_path = get_objects_path(self.__config, repo_type)
            metadata_path = get_metadata_path(self.__config, repo_type)
            refs_path = get_refs_path(self.__config, repo_type)
            repo = LocalRepository(self.__config, objects_path, repo_type)
            mutability, check_mutability = repo.get_mutability_from_spec(
                spec, repo_type)

            if not mutability:
                return

            if not check_mutability:
                log.error('Spec mutability cannot be changed.',
                          class_name=REPOSITORY_CLASS_NAME)
                return
        except Exception as e:
            log.error(e, class_name=REPOSITORY_CLASS_NAME)
            return

        ref = Refs(refs_path, spec, repo_type)

        tag, sha = ref.branch()
        categories_path = get_path_with_categories(tag)
        manifest_path = os.path.join(metadata_path, categories_path, spec,
                                     MANIFEST_FILE)
        path, file = None, None
        try:
            path, file = search_spec_file(self.__repo_type, spec,
                                          categories_path)
        except Exception as e:
            log.error(e, class_name=REPOSITORY_CLASS_NAME)

        if path is None:
            return None, None, None

        spec_path = os.path.join(path, file)
        idx = MultihashIndex(spec, index_path, objects_path)

        if version:
            set_version_in_spec(version, spec_path, self.__repo_type)
            idx.add_metadata(path, file)

        # Check tag before anything to avoid creating unstable state
        log.debug('Check if tag already exists',
                  class_name=REPOSITORY_CLASS_NAME)
        m = Metadata(spec, metadata_path, self.__config, repo_type)

        if not m.check_exists():
            log.error('The %s has not been initialized' % self.__repo_type,
                      class_name=REPOSITORY_CLASS_NAME)
            return

        full_metadata_path, categories_sub_path, metadata = m.tag_exists(
            index_path)
        if metadata is None:
            return None

        log.debug('%s -> %s' % (index_path, objects_path),
                  class_name=REPOSITORY_CLASS_NAME)
        # commit objects in index to ml-git objects
        o = Objects(spec, objects_path)
        changed_files, deleted_files = o.commit_index(index_path, path)

        bare_mode = os.path.exists(
            os.path.join(index_path, 'metadata', spec, 'bare'))

        if not bare_mode:
            manifest = m.get_metadata_manifest(manifest_path)
            self._remove_deleted_files(idx, index_path, m, manifest, spec,
                                       deleted_files)
            m.remove_files_added_after_base_tag(manifest, path)
        else:
            tag, _ = ref.branch()
            self._checkout_ref(tag)
        # update metadata spec & README.md
        # option --dataset-spec --labels-spec
        tag, sha = m.commit_metadata(index_path, specs, msg, changed_files,
                                     mutability, path)

        # update ml-git ref spec HEAD == to new SHA-1 / tag
        if tag is None:
            return None
        ref = Refs(refs_path, spec, repo_type)
        ref.update_head(tag, sha)

        # Run file check
        if run_fsck:
            self.fsck()

        return tag
Ejemplo n.º 11
0
 def test_init(self):
     m = Metadata(spec, self.test_dir, config, repotype)
     m.init()
     self.assertTrue(m.check_exists())
     clear(m.path)
Ejemplo n.º 12
0
 def test_clone_config_repo(self):
     m = Metadata('', self.test_dir, config, repotype)
     m.clone_config_repo()
     self.assertTrue(m.check_exists())
Ejemplo n.º 13
0
 def test_init_local_repo(self):
     m = Metadata(spec, self.test_dir, config, DATASETS)
     m.init()
     self.assertTrue(m.check_exists())
     clear(m.path)