def checkout(self, tag, samples, options):
    """Check out ``tag`` and then any related dataset/labels tags.

    The first ``_checkout`` call returns a ``(dataset_tag, labels_tag)``
    pair. Each non-None tag of that pair is checked out in turn (dataset
    first, then labels) with ``with_dataset``/``with_labels`` switched off
    in ``options``. A failure in one related checkout is logged and does
    not prevent the next one from running.

    Note that ``options`` is mutated in place and ``self.__repo_type`` is
    left pointing at the last related entity type that was processed.
    """
    try:
        metadata_path = get_metadata_path(self.__config)
    except RootPathException as root_error:
        # No repository root was found: warn and create one on the fly.
        log.warn(root_error, class_name=REPOSITORY_CLASS_NAME)
        metadata_path = self._initialize_repository_on_the_fly()

    dataset_tag, labels_tag = self._checkout(tag, samples, options)

    # The related checkouts below must not look for further relations.
    options['with_dataset'] = False
    options['with_labels'] = False

    related_entities = (
        ('dataset', dataset_tag, 'Initializing related dataset download'),
        ('labels', labels_tag, 'Initializing related labels download'),
    )
    for related_type, related_tag, start_message in related_entities:
        if related_tag is None:
            continue
        try:
            self.__repo_type = related_type
            related_metadata = Metadata('', metadata_path, self.__config, self.__repo_type)
            log.info(start_message, class_name=REPOSITORY_CLASS_NAME)
            if not related_metadata.check_exists():
                related_metadata.init()
            self._checkout(related_tag, samples, options)
        except Exception as error:
            log.error('LocalRepository: [%s]' % error, class_name=REPOSITORY_CLASS_NAME)
def check_initialized_entity(context, entity_type, entity_name):
    """Exit through ``context`` when the entity's metadata does not exist.

    Loads the merged configuration, builds a ``Metadata`` for the entity
    and, if ``check_exists()`` is false, logs the ERROR_NOT_INITIALIZED
    message and calls ``context.exit()``. Does nothing otherwise.
    """
    config = merged_config_load()
    entity_metadata = Metadata(
        entity_name,
        get_metadata_path(config, entity_type),
        config,
        entity_type,
    )
    if entity_metadata.check_exists():
        return
    log.error(output_messages['ERROR_NOT_INITIALIZED'] % entity_type)
    context.exit()
def metadata_exists(self, entity):
    """Return whether the metadata for ``entity`` exists.

    Side effect: ``self.__repo_type`` is set to ``entity`` and stays so
    after the call.
    """
    self.__repo_type = entity
    path_for_entity = get_metadata_path(self.__config, self.__repo_type)
    return Metadata('', path_for_entity, self.__config, self.__repo_type).check_exists()
def get_last_entity_version(entity_type, entity_name):
    """Return the last tag version of the entity plus one.

    Returns ``None`` (after logging ERROR_NOT_INITIALIZED) when the
    entity's metadata does not exist.
    """
    config = merged_config_load()
    entity_metadata = Metadata(
        entity_name,
        get_metadata_path(config, entity_type),
        config,
        entity_type,
    )
    if entity_metadata.check_exists():
        return entity_metadata.get_last_tag_version(entity_name) + 1
    log.error(output_messages['ERROR_NOT_INITIALIZED'] % entity_type)
    return None
def test_default_branch(self):
    """get_default_branch() follows the branch the metadata repo is on."""
    branch_before = 'master'
    branch_after = 'main'

    metadata = Metadata('', self.test_dir, config, DATASETS)
    metadata.init()
    self.assertTrue(metadata.check_exists())
    # A freshly initialised repository reports the initial branch name.
    self.assertEqual(metadata.get_default_branch(), branch_before)

    self.change_branch(metadata.path, branch_after)
    # After switching, the old name is gone and the new one is reported.
    self.assertNotEqual(metadata.get_default_branch(), branch_before)
    self.assertEqual(metadata.get_default_branch(), branch_after)

    clear(metadata.path)
def test_clone_empty_config_repo(self):
    """Cloning with empty git URLs must not produce an existing metadata repo."""
    # Every entity type is configured with an empty git remote.
    empty_remotes = {entity: {'git': ''} for entity in (DATASETS, LABELS, MODELS)}
    empty_config = {
        'mlgit_path': './mdata',
        'mlgit_conf': 'config.yaml',
        'verbose': 'info',
        **empty_remotes,
    }

    metadata = Metadata('', self.test_dir, empty_config, DATASETS)
    metadata.clone_config_repo()
    self.assertFalse(metadata.check_exists())
def list(self):
    """List the entities managed for the current repository type.

    Checks out the metadata repository and delegates to ``Metadata.list``
    with the title ``'ML <repo_type>'``. Any failure is logged instead of
    raised, and the method always returns ``None``.

    A git failure whose text contains 'did not match any file(s) known' is
    reported as "no entity being managed"; other git failures are logged
    with their own text.
    """
    repo_type = self.__repo_type
    try:
        metadata_path = get_metadata_path(self.__config, repo_type)
        m = Metadata('', metadata_path, self.__config, repo_type)
        if not m.check_exists():
            raise RuntimeError('The %s doesn\'t have been initialized.' % self.__repo_type)
        m.checkout()
        m.list(title='ML ' + repo_type)
    except GitError as g:
        # Not every GitError carries a usable ``stderr`` (it can be missing,
        # None or empty); fall back to the exception text so this handler
        # cannot itself raise and hide the original failure.
        error_message = getattr(g, 'stderr', None) or str(g)
        if 'did not match any file(s) known' in error_message:
            error_message = 'You don\'t have any entity being managed.'
        log.error(error_message, class_name=REPOSITORY_CLASS_NAME)
    except Exception as e:
        log.error(e, class_name=REPOSITORY_CLASS_NAME)
def test_clone_empty_config_repo(self):
    """Cloning with empty git URLs must not produce an existing metadata repo."""
    # Every entity type is configured with an empty git remote.
    empty_remotes = {entity: {'git': ''} for entity in ('dataset', 'labels', 'model')}
    empty_config = {
        'mlgit_path': './mdata',
        'mlgit_conf': 'config.yaml',
        'verbose': 'info',
        **empty_remotes,
    }

    metadata = Metadata('', self.test_dir, empty_config, repotype)
    metadata.clone_config_repo()
    self.assertFalse(metadata.check_exists())
def add(self, spec, file_path, bump_version=False, run_fsck=False):
    """Add the files of entity ``spec`` to the ml-git index.

    Args:
        spec: Name of the entity whose workspace is being added.
        file_path: Passed through to ``MultihashIndex.add``.
        bump_version: When true, increment the version stored in the spec
            file; the add is aborted if that increment fails.
        run_fsck: When true, run ``self.fsck()`` once the add is done.

    Returns:
        Always ``None``. Every validation or processing failure is logged
        (where a message exists) and ends the call early.
    """
    repo_type = self.__repo_type
    is_shared_objects = 'objects_path' in self.__config[repo_type]
    is_shared_cache = 'cache_path' in self.__config[repo_type]

    if not validate_config_spec_hash(self.__config):
        log.error(
            '.ml-git/config.yaml invalid. It should look something like this:\n%s' % get_yaml_str(
                get_sample_config_spec('somebucket', 'someprofile', 'someregion')),
            class_name=REPOSITORY_CLASS_NAME)
        return None

    path, file = None, None
    try:
        refs_path = get_refs_path(self.__config, repo_type)
        index_path = get_index_path(self.__config, repo_type)
        metadata_path = get_metadata_path(self.__config, repo_type)
        cache_path = get_cache_path(self.__config, repo_type)
        objects_path = get_objects_path(self.__config, repo_type)
        repo = LocalRepository(self.__config, objects_path, repo_type)
        mutability, check_mutability = repo.get_mutability_from_spec(spec, repo_type)

        # A 'sampling' marker in the index means the workspace came from a
        # sampled checkout, which must not receive new data.
        sampling_flag = os.path.exists(os.path.join(index_path, 'metadata', spec, 'sampling'))
        if sampling_flag:
            log.error(
                'You cannot add new data to an entity that is based on a checkout with the --sampling option.',
                class_name=REPOSITORY_CLASS_NAME)
            return None

        if not mutability:
            return None

        if not check_mutability:
            log.error('Spec mutability cannot be changed.', class_name=REPOSITORY_CLASS_NAME)
            return None

        if not self._has_new_data(repo, spec):
            return None

        tag, _ = Refs(refs_path, spec, repo_type).branch()
        categories_path = get_path_with_categories(tag)
        path, file = search_spec_file(self.__repo_type, spec, categories_path)
    except Exception as e:
        log.error(e, class_name=REPOSITORY_CLASS_NAME)
        return None

    if path is None:
        return None

    spec_path = os.path.join(path, file)
    if not self._is_spec_valid(spec_path):
        return None

    # Check tag before anything to avoid creating unstable state
    log.debug('Repository: check if tag already exists', class_name=REPOSITORY_CLASS_NAME)
    m = Metadata(spec, metadata_path, self.__config, repo_type)
    if not m.check_exists():
        log.error('The %s has not been initialized' % self.__repo_type, class_name=REPOSITORY_CLASS_NAME)
        return None

    try:
        m.update()
    except Exception as e:
        # The update stays best-effort (the add proceeds regardless), but
        # the failure is recorded instead of being silently discarded.
        log.debug('Could not update metadata before add: [%s]' % e, class_name=REPOSITORY_CLASS_NAME)

    # get version of current manifest file
    manifest = self._get_current_manifest_file(m, tag)

    try:
        # adds chunks to ml-git Index
        log.info('%s adding path [%s] to ml-git index' % (repo_type, path), class_name=REPOSITORY_CLASS_NAME)
        with change_mask_for_routine(is_shared_objects):
            idx = MultihashIndex(spec, index_path, objects_path, mutability, cache_path)
            idx.add(path, manifest, file_path)

        # create hard links in ml-git Cache
        self.create_hard_links_in_cache(cache_path, index_path, is_shared_cache, mutability, path, spec)
    except Exception as e:
        log.error(e, class_name=REPOSITORY_CLASS_NAME)
        return None

    if bump_version and not increment_version_in_spec(spec_path, self.__repo_type):
        return None

    idx.add_metadata(path, file)

    self._check_corrupted_files(spec, repo)

    # Run file check
    if run_fsck:
        self.fsck()
    return None
def commit(self, spec, specs, version=None, run_fsck=False, msg=None):
    """Commit the indexed files of entity ``spec`` and tag the result.

    Moves chunks from the index to the objects store, commits the metadata
    and points the entity's ref HEAD at the new tag.

    Args:
        spec: Name of the entity being committed.
        specs: Passed through to ``Metadata.commit_metadata``.
        version: When truthy, written into the spec file before committing.
        run_fsck: When true, run ``self.fsck()`` after the commit.
        msg: Commit message passed to ``Metadata.commit_metadata``.

    Returns:
        The tag returned by ``Metadata.commit_metadata`` on success,
        otherwise ``None``. Failures are logged where a message exists.
    """
    # Move chunks from index to .ml-git/objects
    repo_type = self.__repo_type
    try:
        index_path = get_index_path(self.__config, repo_type)
        objects_path = get_objects_path(self.__config, repo_type)
        metadata_path = get_metadata_path(self.__config, repo_type)
        refs_path = get_refs_path(self.__config, repo_type)
        repo = LocalRepository(self.__config, objects_path, repo_type)
        mutability, check_mutability = repo.get_mutability_from_spec(spec, repo_type)

        if not mutability:
            return None

        if not check_mutability:
            log.error('Spec mutability cannot be changed.', class_name=REPOSITORY_CLASS_NAME)
            return None
    except Exception as e:
        log.error(e, class_name=REPOSITORY_CLASS_NAME)
        return None

    ref = Refs(refs_path, spec, repo_type)
    tag, _ = ref.branch()
    categories_path = get_path_with_categories(tag)
    manifest_path = os.path.join(metadata_path, categories_path, spec, MANIFEST_FILE)

    path, file = None, None
    try:
        path, file = search_spec_file(self.__repo_type, spec, categories_path)
    except Exception as e:
        log.error(e, class_name=REPOSITORY_CLASS_NAME)

    if path is None:
        # Every other failure path returns None; a (None, None, None) tuple
        # here would be truthy and look like a successful tag to callers.
        return None

    spec_path = os.path.join(path, file)
    idx = MultihashIndex(spec, index_path, objects_path)

    if version:
        set_version_in_spec(version, spec_path, self.__repo_type)
        # NOTE(review): the spec is re-added to the index only when the
        # version was rewritten - confirm this nesting against the file.
        idx.add_metadata(path, file)

    # Check tag before anything to avoid creating unstable state
    log.debug('Check if tag already exists', class_name=REPOSITORY_CLASS_NAME)
    m = Metadata(spec, metadata_path, self.__config, repo_type)
    if not m.check_exists():
        log.error('The %s has not been initialized' % self.__repo_type, class_name=REPOSITORY_CLASS_NAME)
        return None

    _, _, metadata = m.tag_exists(index_path)
    if metadata is None:
        return None

    log.debug('%s -> %s' % (index_path, objects_path), class_name=REPOSITORY_CLASS_NAME)
    # commit objects in index to ml-git objects
    o = Objects(spec, objects_path)
    changed_files, deleted_files = o.commit_index(index_path, path)

    # A 'bare' marker in the index selects the bare-mode branch below.
    bare_mode = os.path.exists(os.path.join(index_path, 'metadata', spec, 'bare'))
    if not bare_mode:
        manifest = m.get_metadata_manifest(manifest_path)
        self._remove_deleted_files(idx, index_path, m, manifest, spec, deleted_files)
        m.remove_files_added_after_base_tag(manifest, path)
    else:
        tag, _ = ref.branch()
        self._checkout_ref(tag)

    # update metadata spec & README.md
    # option --dataset-spec --labels-spec
    tag, sha = m.commit_metadata(index_path, specs, msg, changed_files, mutability, path)

    # update ml-git ref spec HEAD == to new SHA-1 / tag
    if tag is None:
        return None
    ref = Refs(refs_path, spec, repo_type)
    ref.update_head(tag, sha)

    # Run file check
    if run_fsck:
        self.fsck()

    return tag
def test_init(self):
    """After init(), the metadata repository is reported as existing."""
    metadata = Metadata(spec, self.test_dir, config, repotype)
    metadata.init()

    self.assertTrue(metadata.check_exists())

    # Remove what init() created under the metadata path.
    clear(metadata.path)
def test_clone_config_repo(self):
    """After clone_config_repo(), the metadata repository is reported as existing."""
    metadata = Metadata('', self.test_dir, config, repotype)
    metadata.clone_config_repo()

    self.assertTrue(metadata.check_exists())
def test_init_local_repo(self):
    """After init() for DATASETS, the metadata repository is reported as existing."""
    metadata = Metadata(spec, self.test_dir, config, DATASETS)
    metadata.init()

    self.assertTrue(metadata.check_exists())

    # Remove what init() created under the metadata path.
    clear(metadata.path)