def test_paths(self):
    """Check the repository paths resolved from the loaded configuration.

    The index, objects, cache and metadata paths must be non-empty, and
    the refs path must contain the substring ``.ml-git``.
    """
    cfg = config_load()

    non_empty_getters = (
        get_index_path,
        get_objects_path,
        get_cache_path,
        get_metadata_path,
    )
    for resolve_path in non_empty_getters:
        self.assertTrue(len(resolve_path(cfg)) > 0)

    self.assertTrue('.ml-git' in get_refs_path(cfg))
def garbage_collector(self):
    """Run garbage collection on the cache and objects of every entity type.

    For each ``EntityType`` whose metadata exists, the blob hashes obtained
    from ``_get_blobs_hashes`` are handed to the garbage collectors of the
    entity's ``Cache`` and ``Objects``; the removed-file counts and
    reclaimed space they report are accumulated and logged at the end.
    If no entity type has metadata, an error is logged and nothing else
    happens.
    """
    found_metadata = False
    total_removed = 0
    total_reclaimed = 0

    for entity in EntityType:
        repo_type = entity.value
        if not self.metadata_exists(repo_type):
            continue

        log.info(output_messages['INFO_STARTING_GC'] % repo_type,
                 class_name=REPOSITORY_CLASS_NAME)
        found_metadata = True

        index_path = get_index_path(self.__config, repo_type)
        objects_path = get_objects_path(self.__config, repo_type)
        blobs_hashes = self._get_blobs_hashes(index_path, objects_path, repo_type)

        cache_store = Cache(get_cache_path(self.__config, repo_type))
        cache_removed, cache_reclaimed = cache_store.garbage_collector(blobs_hashes)

        objects_store = Objects('', objects_path)
        objects_removed, objects_reclaimed = objects_store.garbage_collector(blobs_hashes)

        total_reclaimed += objects_reclaimed + cache_reclaimed
        total_removed += objects_removed + cache_removed

    if not found_metadata:
        log.error(output_messages['ERROR_UNINITIALIZED_METADATA'],
                  class_name=REPOSITORY_CLASS_NAME)
        return

    removed_template = output_messages['INFO_REMOVED_FILES']
    removed_count = humanize.intword(total_removed)
    ml_git_dir = os.path.join(get_root_path(), '.ml-git')
    log.info(removed_template % (removed_count, ml_git_dir),
             class_name=REPOSITORY_CLASS_NAME)

    reclaimed_template = output_messages['INFO_RECLAIMED_SPACE']
    log.info(reclaimed_template % humanize.naturalsize(total_reclaimed),
             class_name=REPOSITORY_CLASS_NAME)
def unlock_file(self, spec, file_path):
    """Unlock ``file_path`` of entity ``spec`` unless its mutability is strict.

    The configuration is validated, the spec file is located from the tag
    of the current ref, and its ``mutability`` property is read. For any
    non-strict mutability the work is delegated to
    ``LocalRepository.unlock_file``. Problems are reported through the
    logger; the method returns ``None`` on every path.
    """
    repo_type = self.__repo_type

    if not validate_config_spec_hash(self.__config):
        sample_config = get_yaml_str(
            get_sample_config_spec('somebucket', 'someprofile', 'someregion'))
        log.error(
            '.ml-git/config.yaml invalid. It should look something like this:\n%s' % sample_config,
            class_name=REPOSITORY_CLASS_NAME)
        return None

    path, file = None, None
    try:
        refs_path = get_refs_path(self.__config, repo_type)
        objects_path = get_objects_path(self.__config, repo_type)
        index_path = get_index_path(self.__config, repo_type)
        cache_path = get_cache_path(self.__config, repo_type)
        tag, _ = Refs(refs_path, spec, repo_type).branch()
        path, file = search_spec_file(
            self.__repo_type, spec, get_path_with_categories(tag))
    except Exception as e:
        log.error(e, class_name=REPOSITORY_CLASS_NAME)
        return

    # Nothing to do when the spec file could not be located.
    if path is None:
        return

    spec_contents = yaml_load(os.path.join(path, file))

    try:
        mutability = spec_contents[repo_type]['mutability']
        if mutability not in Mutability.list():
            log.error('Invalid mutability type.', class_name=REPOSITORY_CLASS_NAME)
            return
    except Exception:
        log.info(
            'The spec does not have the \'mutability\' property set. Default: strict.',
            class_name=REPOSITORY_CLASS_NAME)
        return

    # Strict entities may not be unlocked.
    if mutability == Mutability.STRICT.value:
        log.error(
            'You cannot use this command for this entity because mutability cannot be strict.',
            class_name=REPOSITORY_CLASS_NAME)
        return

    try:
        local_repo = LocalRepository(self.__config, objects_path, repo_type)
        local_repo.unlock_file(path, file_path, index_path, objects_path, spec, cache_path)
    except Exception as e:
        log.error(e, class_name=REPOSITORY_CLASS_NAME)
        return
def add(self, spec, file_path, bump_version=False, run_fsck=False):
    """Add the data of entity ``spec`` to the ml-git index and cache.

    Steps, in order: validate the configuration; resolve the repository
    paths and the spec's mutability; refuse to continue for sampled
    checkouts, changed mutability or when there is no new data; locate and
    validate the spec file; make sure the metadata repository is
    initialized; add the files to the index and create the cache hard
    links; optionally bump the spec version; store the spec metadata in
    the index and check for corrupted files.

    Args:
        spec: Name of the entity spec whose data is being added.
        file_path: Forwarded unchanged to ``MultihashIndex.add``.
        bump_version: When true, ``increment_version_in_spec`` is called
            on the spec file and the method stops if it reports failure.
        run_fsck: When true, ``self.fsck()`` is executed as the last step.

    Returns:
        None on every path; problems are reported through the logger.
    """
    repo_type = self.__repo_type

    # Validate before indexing into the config so that a malformed
    # configuration produces the help message below rather than a
    # lookup error on ``self.__config[repo_type]``.
    if not validate_config_spec_hash(self.__config):
        log.error(
            '.ml-git/config.yaml invalid. It should look something like this:\n%s' % get_yaml_str(
                get_sample_config_spec('somebucket', 'someprofile', 'someregion')),
            class_name=REPOSITORY_CLASS_NAME)
        return None

    is_shared_objects = 'objects_path' in self.__config[repo_type]
    is_shared_cache = 'cache_path' in self.__config[repo_type]

    path, file = None, None
    try:
        refs_path = get_refs_path(self.__config, repo_type)
        index_path = get_index_path(self.__config, repo_type)
        metadata_path = get_metadata_path(self.__config, repo_type)
        cache_path = get_cache_path(self.__config, repo_type)
        objects_path = get_objects_path(self.__config, repo_type)
        repo = LocalRepository(self.__config, objects_path, repo_type)
        mutability, check_mutability = repo.get_mutability_from_spec(spec, repo_type)

        # A 'sampling' marker in the index metadata blocks further adds.
        sampling_flag = os.path.exists(
            os.path.join(index_path, 'metadata', spec, 'sampling'))
        if sampling_flag:
            log.error(
                'You cannot add new data to an entity that is based on a checkout with the --sampling option.',
                class_name=REPOSITORY_CLASS_NAME)
            return

        if not mutability:
            return

        if not check_mutability:
            log.error('Spec mutability cannot be changed.',
                      class_name=REPOSITORY_CLASS_NAME)
            return

        if not self._has_new_data(repo, spec):
            return None

        ref = Refs(refs_path, spec, repo_type)
        tag, _ = ref.branch()
        categories_path = get_path_with_categories(tag)
        path, file = search_spec_file(self.__repo_type, spec, categories_path)
    except Exception as e:
        log.error(e, class_name=REPOSITORY_CLASS_NAME)
        return

    if path is None:
        return

    spec_path = os.path.join(path, file)
    if not self._is_spec_valid(spec_path):
        return None

    # Check tag before anything to avoid creating unstable state
    log.debug('Repository: check if tag already exists',
              class_name=REPOSITORY_CLASS_NAME)
    m = Metadata(spec, metadata_path, self.__config, repo_type)
    if not m.check_exists():
        log.error('The %s has not been initialized' % self.__repo_type,
                  class_name=REPOSITORY_CLASS_NAME)
        return

    # Updating the metadata is best-effort: a failure must not abort the
    # add, but it is recorded instead of being silently discarded.
    try:
        m.update()
    except Exception as e:
        log.debug('Could not update the metadata before add: %s' % e,
                  class_name=REPOSITORY_CLASS_NAME)

    # get version of current manifest file
    manifest = self._get_current_manifest_file(m, tag)

    try:
        # adds chunks to ml-git Index
        log.info('%s adding path [%s] to ml-git index' % (repo_type, path),
                 class_name=REPOSITORY_CLASS_NAME)
        with change_mask_for_routine(is_shared_objects):
            idx = MultihashIndex(spec, index_path, objects_path, mutability, cache_path)
            idx.add(path, manifest, file_path)

        # create hard links in ml-git Cache
        self.create_hard_links_in_cache(cache_path, index_path, is_shared_cache,
                                        mutability, path, spec)
    except Exception as e:
        log.error(e, class_name=REPOSITORY_CLASS_NAME)
        return None

    if bump_version and not increment_version_in_spec(spec_path, self.__repo_type):
        return None

    idx.add_metadata(path, file)

    self._check_corrupted_files(spec, repo)

    # Run file check
    if run_fsck:
        self.fsck()
def _checkout(self, tag, samples, options):
    """Check out ``tag`` of the current entity type into the workspace.

    Args:
        tag: A full tag, or, when it does not match ``RGX_TAG_FORMAT``,
            a value that is resolved to a tag through ``Metadata.get_tag``
            together with ``options['version']``.
        samples: Forwarded to ``_fetch`` and ``LocalRepository.checkout``.
        options: Mapping that must provide the keys ``with_dataset``,
            ``with_labels``, ``retry``, ``force``, ``bare`` and
            ``version``.

    Returns:
        The ``(dataset_tag, labels_tag)`` pair produced by
        ``_get_related_tags`` on success, or ``(None, None)`` whenever
        the checkout is skipped or fails. The result is always a 2-tuple
        so callers can unpack it unconditionally.
    """
    dataset = options['with_dataset']
    labels = options['with_labels']
    retries = options['retry']
    force_get = options['force']
    bare = options['bare']
    version = options['version']
    repo_type = self.__repo_type

    try:
        cache_path = get_cache_path(self.__config, repo_type)
        metadata_path = get_metadata_path(self.__config, repo_type)
        objects_path = get_objects_path(self.__config, repo_type)
        refs_path = get_refs_path(self.__config, repo_type)

        if not re.search(RGX_TAG_FORMAT, tag):
            # Not in tag format: resolve it to a concrete tag first.
            metadata = Metadata(tag, metadata_path, self.__config, repo_type)
            tag = metadata.get_tag(tag, version)
            if not tag:
                return None, None
        elif not self._tag_exists(tag):
            return None, None

        categories_path, spec_name, _ = spec_parse(tag)
        root_path = get_root_path()
        ws_path = os.path.join(root_path, os.sep.join([repo_type, categories_path]))
    except Exception as e:
        log.error(e, class_name=LOCAL_REPOSITORY_CLASS_NAME)
        return None, None

    ref = Refs(refs_path, spec_name, repo_type)
    cur_tag, _ = ref.branch()

    if cur_tag == tag:
        log.info('already at tag [%s]' % tag, class_name=REPOSITORY_CLASS_NAME)
        return None, None

    local_rep = LocalRepository(self.__config, objects_path, repo_type)

    # check if no data left untracked/uncommitted. otherwise, stop.
    if not force_get and local_rep.exist_local_changes(spec_name) is True:
        return None, None

    try:
        self._checkout_ref(tag)
    except Exception:
        log.error('Unable to checkout to %s' % tag, class_name=REPOSITORY_CLASS_NAME)
        return None, None

    dataset_tag, labels_tag = self._get_related_tags(
        categories_path, dataset, labels, metadata_path, repo_type, spec_name)

    fetch_success = self._fetch(tag, samples, retries, bare)
    if not fetch_success:
        objs = Objects('', objects_path)
        objs.fsck(remove_corrupted=True)
        self._checkout_ref()
        return None, None

    ensure_path_exists(ws_path)

    try:
        spec_index_path = os.path.join(
            get_index_metadata_path(self.__config, repo_type), spec_name)
    except Exception as e:
        # Same recovery as the other failures after the ref checkout:
        # report, restore the ref and return the usual 2-tuple.
        log.error(e, class_name=REPOSITORY_CLASS_NAME)
        self._checkout_ref()
        return None, None

    self._delete_spec_and_readme(spec_index_path, spec_name)

    try:
        r = LocalRepository(self.__config, objects_path, repo_type)
        r.checkout(cache_path, metadata_path, ws_path, tag, samples, bare)
    except OSError as e:
        self._checkout_ref()
        if e.errno == errno.ENOSPC:
            log.error(
                'There is not enough space in the disk. Remove some files and try again.',
                class_name=REPOSITORY_CLASS_NAME)
        else:
            log.error(
                'An error occurred while creating the files into workspace: %s \n.' % e,
                class_name=REPOSITORY_CLASS_NAME)
        return None, None
    except Exception as e:
        self._checkout_ref()
        log.error(
            'An error occurred while creating the files into workspace: %s \n.' % e,
            class_name=REPOSITORY_CLASS_NAME)
        return None, None

    m = Metadata('', metadata_path, self.__config, repo_type)
    sha = m.sha_from_tag(tag)
    ref.update_head(tag, sha)

    # restore to master/head
    self._checkout_ref()
    return dataset_tag, labels_tag
def _checkout(self, tag, samples, retries=2, force_get=False, dataset=False, labels=False, bare=False):
    """Check out ``tag`` of the current entity type into the workspace.

    Args:
        tag: Tag to check out; it must satisfy ``self._tag_exists``.
        samples: Forwarded to ``_fetch`` and ``LocalRepository.checkout``.
        retries: Forwarded to ``_fetch``.
        force_get: When true, the local-changes check is skipped.
        dataset: When ``True``, the related dataset tag is read from the
            spec with ``get_entity_tag``.
        labels: When ``True``, the related labels tag is read from the
            spec with ``get_entity_tag``.
        bare: Forwarded to ``_fetch`` and ``LocalRepository.checkout``.

    Returns:
        ``(dataset_tag, labels_tag)`` on success (each is ``None`` unless
        requested), or ``(None, None)`` whenever the checkout is skipped
        or fails. The result is always a 2-tuple so callers can unpack it
        unconditionally.
    """
    repo_type = self.__repo_type

    try:
        cache_path = get_cache_path(self.__config, repo_type)
        metadata_path = get_metadata_path(self.__config, repo_type)
        objects_path = get_objects_path(self.__config, repo_type)
        refs_path = get_refs_path(self.__config, repo_type)

        if not self._tag_exists(tag):
            return None, None

        # find out actual workspace path to save data
        categories_path, spec_name, _ = spec_parse(tag)
        dataset_tag = None
        labels_tag = None
        root_path = get_root_path()
        ws_path = os.path.join(root_path, os.sep.join([repo_type, categories_path]))
        ensure_path_exists(ws_path)
    except Exception as e:
        log.error(e, class_name=LOCAL_REPOSITORY_CLASS_NAME)
        return None, None

    ref = Refs(refs_path, spec_name, repo_type)
    cur_tag, _ = ref.branch()

    if cur_tag == tag:
        log.info('already at tag [%s]' % tag, class_name=REPOSITORY_CLASS_NAME)
        return None, None

    local_rep = LocalRepository(self.__config, objects_path, repo_type)

    # check if no data left untracked/uncommitted. otherwise, stop.
    if not force_get and local_rep.exist_local_changes(spec_name) is True:
        return None, None

    try:
        self._checkout_ref(tag)
    except Exception:
        log.error('Unable to checkout to %s' % tag, class_name=REPOSITORY_CLASS_NAME)
        return None, None

    spec_path = os.path.join(metadata_path, categories_path, spec_name + '.spec')
    if dataset is True:
        dataset_tag = get_entity_tag(spec_path, repo_type, 'dataset')
    if labels is True:
        labels_tag = get_entity_tag(spec_path, repo_type, 'labels')

    fetch_success = self._fetch(tag, samples, retries, bare)
    if not fetch_success:
        objs = Objects('', objects_path)
        objs.fsck(remove_corrupted=True)
        self._checkout_ref('master')
        return None, None

    try:
        spec_index_path = os.path.join(
            get_index_metadata_path(self.__config, repo_type), spec_name)
    except Exception as e:
        # Same recovery as the other failures after the ref checkout:
        # report, restore the ref and return the usual 2-tuple.
        log.error(e, class_name=REPOSITORY_CLASS_NAME)
        self._checkout_ref('master')
        return None, None

    # Remove the spec and README copies left in the index metadata, if any.
    if os.path.exists(spec_index_path):
        for stale_name in (spec_name + '.spec', 'README.md'):
            stale_path = os.path.join(spec_index_path, stale_name)
            if os.path.exists(stale_path):
                os.unlink(stale_path)

    try:
        r = LocalRepository(self.__config, objects_path, repo_type)
        r.checkout(cache_path, metadata_path, objects_path, ws_path, tag, samples, bare)
    except OSError as e:
        self._checkout_ref('master')
        if e.errno == errno.ENOSPC:
            log.error(
                'There is not enough space in the disk. Remove some files and try again.',
                class_name=REPOSITORY_CLASS_NAME)
        else:
            log.error(
                'An error occurred while creating the files into workspace: %s \n.' % e,
                class_name=REPOSITORY_CLASS_NAME)
        return None, None
    except Exception as e:
        self._checkout_ref('master')
        log.error(
            'An error occurred while creating the files into workspace: %s \n.' % e,
            class_name=REPOSITORY_CLASS_NAME)
        return None, None

    m = Metadata('', metadata_path, self.__config, repo_type)
    sha = m.sha_from_tag(tag)
    ref.update_head(tag, sha)

    # restore to master/head
    self._checkout_ref('master')
    return dataset_tag, labels_tag