def test_spec_parse(self):
    """spec_parse splits a well-formed tag into (categories path, spec, version)
    and raises SearchSpecException for an empty tag."""
    spec = 'imagenet8'
    version = '1'
    categories = ['computer-vision', 'images', spec]
    tag = 'computer-vision__images__imagenet8__1'

    # The categories are expected back joined with the OS path separator.
    expected = (os.sep.join(categories), spec, version)
    self.assertEqual(expected, spec_parse(tag))

    with self.assertRaises(SearchSpecException):
        spec_parse('')
def export(self, bucket, tag, retry):
    """Export the files of the entity version identified by ``tag`` to ``bucket``.

    The metadata repository is checked out at ``tag`` while the export runs
    and is moved back to its default ref afterwards, whether or not the
    export succeeds.

    Args:
        bucket: Export destination, forwarded unchanged to
            ``LocalRepository.export_tag``.
        tag (str): ml-git tag identifying the entity version to export.
        retry: Retry setting, forwarded unchanged to ``export_tag``.

    Returns:
        ``(None, None)`` when validation fails (the tag cannot be parsed, the
        tag does not exist, or the command runs outside an initialized
        repository) or when the tag cannot be checked out. ``None`` once the
        export has been attempted.
    """
    try:
        # Both calls are made only for validation; their results are unused.
        spec_parse(tag)
        get_root_path()
        if not self._tag_exists(tag):
            return None, None
    except InvalidGitRepositoryError:
        log.error('You are not in an initialized ml-git repository.', class_name=LOCAL_REPOSITORY_CLASS_NAME)
        return None, None
    except Exception as e:
        log.error(e, class_name=LOCAL_REPOSITORY_CLASS_NAME)
        return None, None

    try:
        self._checkout_ref(tag)
    except Exception:
        log.error('Unable to checkout to %s' % tag, class_name=REPOSITORY_CLASS_NAME)
        return None, None

    try:
        local = LocalRepository(
            self.__config,
            get_objects_path(self.__config, self.__repo_type),
            self.__repo_type)
        local.export_tag(get_metadata_path(self.__config, self.__repo_type), tag, bucket, retry)
    finally:
        # Never leave the metadata repository parked on the tag: restore the
        # default ref even when the export raises. The exception, if any,
        # still propagates to the caller.
        self._checkout_ref()
def _get_related_entity_info(spec_file, entity_type): related_entity = spec_file.get(entity_type, None) if related_entity: entity_tag = related_entity['tag'] _, entity_name, version = spec_parse(entity_tag) return entity_tag, '{} - ({})'.format(entity_name, version) return None, None
def checkout(entity, tag, sampling=None, retries=2, force=False, dataset=False, labels=False, version=-1):
    """Retrieve the data of a specific version of an ML entity.

    Example:
        checkout('datasets', 'computer-vision__images3__imagenet__1')

    Args:
        entity (str): The type of an ML entity. (datasets, labels or models)
        tag (str): An ml-git tag to identify a specific version of an ML entity.
        sampling (dict): Optional sample selection, using one of:
            group: <amount>:<group> The group sample option consists of
                amount and group used to download a sample.
            range: <start:stop:step> The range sample option consists of
                start, stop and step used to download a sample. The start
                parameter can be equal or greater than zero. The stop
                parameter can be 'all', -1 or any integer above zero.
            random: <amount:frequency> The random sample option consists of
                amount and frequency used to download a sample.
            seed: The seed is used to initialize the pseudorandom numbers.
        retries (int, optional): Number of retries to download the files from the storage [default: 2].
        force (bool, optional): Force checkout command to delete untracked/uncommitted files from the local repository [default: False].
        dataset (bool, optional): If a dataset is related to the model or labels, it must be downloaded too [default: False].
        labels (bool, optional): If labels are related to the model, they must be downloaded too [default: False].
        version (int, optional): Passed to the repository checkout as ``options['version']``;
            presumably selects the entity version when ``tag`` is not a full tag -- confirm
            against the repository implementation [default: -1].

    Returns:
        str: The path of the checked out data, relative to the project root.
        None when ``sampling`` is rejected by ``validate_sample`` or when that
        path does not exist after the checkout.
    """
    repo = get_repository_instance(entity)
    repo.update()

    # Reject an invalid sampling spec before touching the workspace.
    if sampling is not None and not validate_sample(sampling):
        return None

    options = {
        'with_dataset': dataset,
        'with_labels': labels,
        'retry': retries,
        'force': force,
        'bare': False,
        'version': version,
    }
    repo.checkout(tag, sampling, options)

    # A full tag embeds the spec name; anything else is used as the name itself.
    if re.search(RGX_TAG_FORMAT, tag):
        _, spec_name, _ = spec_parse(tag)
    else:
        spec_name = tag

    spec_path, _ = search_spec_file(entity, spec_name)
    data_path = os.path.relpath(spec_path, get_root_path())
    return data_path if os.path.exists(data_path) else None
def _checkout(self, tag, samples, options):
    """Check out the entity version identified by ``tag`` into the workspace.

    Args:
        tag (str): Either a full ml-git tag (matching ``RGX_TAG_FORMAT``) or
            another identifier, which is resolved to a tag through
            ``Metadata.get_tag`` together with ``options['version']``.
        samples: Sampling specification forwarded to the fetch and to the
            workspace checkout.
        options (dict): Must provide the keys ``'with_dataset'``,
            ``'with_labels'``, ``'retry'``, ``'force'``, ``'bare'`` and
            ``'version'``.

    Returns:
        tuple: ``(dataset_tag, labels_tag)`` as produced by
        ``_get_related_tags`` on success. ``(None, None)`` whenever the
        checkout is aborted (unknown tag, local changes without force, fetch
        or workspace errors) or the workspace is already at ``tag``.
    """
    dataset = options['with_dataset']
    labels = options['with_labels']
    retries = options['retry']
    force_get = options['force']
    bare = options['bare']
    version = options['version']
    repo_type = self.__repo_type

    try:
        cache_path = get_cache_path(self.__config, repo_type)
        metadata_path = get_metadata_path(self.__config, repo_type)
        objects_path = get_objects_path(self.__config, repo_type)
        refs_path = get_refs_path(self.__config, repo_type)

        if not re.search(RGX_TAG_FORMAT, tag):
            # Not a full tag: let the metadata resolve it for the requested version.
            metadata = Metadata(tag, metadata_path, self.__config, repo_type)
            tag = metadata.get_tag(tag, version)
            if not tag:
                return None, None
        elif not self._tag_exists(tag):
            return None, None

        categories_path, spec_name, _ = spec_parse(tag)
        root_path = get_root_path()
        ws_path = os.path.join(root_path, os.sep.join([repo_type, categories_path]))
    except Exception as e:
        log.error(e, class_name=LOCAL_REPOSITORY_CLASS_NAME)
        return None, None

    ref = Refs(refs_path, spec_name, repo_type)
    cur_tag, _ = ref.branch()
    if cur_tag == tag:
        log.info('already at tag [%s]' % tag, class_name=REPOSITORY_CLASS_NAME)
        return None, None

    local_rep = LocalRepository(self.__config, objects_path, repo_type)
    # check if no data left untracked/uncommitted. otherwise, stop.
    if not force_get and local_rep.exist_local_changes(spec_name) is True:
        return None, None

    try:
        self._checkout_ref(tag)
    except Exception:
        log.error('Unable to checkout to %s' % tag, class_name=REPOSITORY_CLASS_NAME)
        return None, None

    dataset_tag, labels_tag = self._get_related_tags(
        categories_path, dataset, labels, metadata_path, repo_type, spec_name)

    fetch_success = self._fetch(tag, samples, retries, bare)
    if not fetch_success:
        # Drop whatever was corrupted by the failed fetch, then restore the ref.
        objs = Objects('', objects_path)
        objs.fsck(remove_corrupted=True)
        self._checkout_ref()
        return None, None

    ensure_path_exists(ws_path)

    try:
        spec_index_path = os.path.join(
            get_index_metadata_path(self.__config, repo_type), spec_name)
    except Exception as e:
        # Fail like every other abort path: restore the ref, report the
        # error and hand back a 2-tuple so callers can always unpack.
        self._checkout_ref()
        log.error(e, class_name=REPOSITORY_CLASS_NAME)
        return None, None

    self._delete_spec_and_readme(spec_index_path, spec_name)

    try:
        r = LocalRepository(self.__config, objects_path, repo_type)
        r.checkout(cache_path, metadata_path, ws_path, tag, samples, bare)
    except OSError as e:
        self._checkout_ref()
        if e.errno == errno.ENOSPC:
            log.error(
                'There is not enough space in the disk. Remove some files and try again.',
                class_name=REPOSITORY_CLASS_NAME)
        else:
            log.error(
                'An error occurred while creating the files into workspace: %s \n.' % e,
                class_name=REPOSITORY_CLASS_NAME)
        return None, None
    except Exception as e:
        self._checkout_ref()
        log.error(
            'An error occurred while creating the files into workspace: %s \n.' % e,
            class_name=REPOSITORY_CLASS_NAME)
        return None, None

    m = Metadata('', metadata_path, self.__config, repo_type)
    sha = m.sha_from_tag(tag)
    ref.update_head(tag, sha)

    # restore to master/head
    self._checkout_ref()
    return dataset_tag, labels_tag
def _checkout(self, tag, samples, retries=2, force_get=False, dataset=False, labels=False, bare=False):
    """Check out the entity version identified by ``tag`` into the workspace.

    Args:
        tag (str): ml-git tag identifying the entity version.
        samples: Sampling specification forwarded to the fetch and to the
            workspace checkout.
        retries (int): Retry setting forwarded to ``_fetch`` [default: 2].
        force_get (bool): Skip the local-changes check [default: False].
        dataset (bool): Also look up the tag of the related dataset
            [default: False].
        labels (bool): Also look up the tag of the related labels
            [default: False].
        bare (bool): Forwarded to the fetch and to the workspace checkout
            [default: False].

    Returns:
        tuple: ``(dataset_tag, labels_tag)`` on success; each element is None
        unless the matching flag was set. ``(None, None)`` whenever the
        checkout is aborted (unknown tag, local changes without force, fetch
        or workspace errors) or the workspace is already at ``tag``.
    """
    repo_type = self.__repo_type
    try:
        cache_path = get_cache_path(self.__config, repo_type)
        metadata_path = get_metadata_path(self.__config, repo_type)
        objects_path = get_objects_path(self.__config, repo_type)
        refs_path = get_refs_path(self.__config, repo_type)

        # find out actual workspace path to save data
        if not self._tag_exists(tag):
            return None, None
        categories_path, spec_name, _ = spec_parse(tag)
        root_path = get_root_path()
        ws_path = os.path.join(root_path, os.sep.join([repo_type, categories_path]))
        ensure_path_exists(ws_path)
    except Exception as e:
        log.error(e, class_name=LOCAL_REPOSITORY_CLASS_NAME)
        return None, None

    ref = Refs(refs_path, spec_name, repo_type)
    cur_tag, _ = ref.branch()
    if cur_tag == tag:
        log.info('already at tag [%s]' % tag, class_name=REPOSITORY_CLASS_NAME)
        return None, None

    local_rep = LocalRepository(self.__config, objects_path, repo_type)
    # check if no data left untracked/uncommitted. otherwise, stop.
    if not force_get and local_rep.exist_local_changes(spec_name) is True:
        return None, None

    try:
        self._checkout_ref(tag)
    except Exception:
        log.error('Unable to checkout to %s' % tag, class_name=REPOSITORY_CLASS_NAME)
        return None, None

    # Related tags are read from the spec as it exists at the checked out tag.
    dataset_tag = None
    labels_tag = None
    spec_path = os.path.join(metadata_path, categories_path, spec_name + '.spec')
    if dataset is True:
        dataset_tag = get_entity_tag(spec_path, repo_type, 'dataset')
    if labels is True:
        labels_tag = get_entity_tag(spec_path, repo_type, 'labels')

    fetch_success = self._fetch(tag, samples, retries, bare)
    if not fetch_success:
        # Drop whatever was corrupted by the failed fetch, then restore the ref.
        objs = Objects('', objects_path)
        objs.fsck(remove_corrupted=True)
        self._checkout_ref('master')
        return None, None

    try:
        spec_index_path = os.path.join(
            get_index_metadata_path(self.__config, repo_type), spec_name)
    except Exception as e:
        # Fail like every other abort path: restore the ref, report the
        # error and hand back a 2-tuple so callers can always unpack.
        self._checkout_ref('master')
        log.error(e, class_name=REPOSITORY_CLASS_NAME)
        return None, None

    # Remove the spec and README left in the index for this entity, if any.
    if os.path.exists(spec_index_path):
        for stale_name in (spec_name + '.spec', 'README.md'):
            stale_path = os.path.join(spec_index_path, stale_name)
            if os.path.exists(stale_path):
                os.unlink(stale_path)

    try:
        r = LocalRepository(self.__config, objects_path, repo_type)
        r.checkout(cache_path, metadata_path, objects_path, ws_path, tag, samples, bare)
    except OSError as e:
        self._checkout_ref('master')
        if e.errno == errno.ENOSPC:
            log.error(
                'There is not enough space in the disk. Remove some files and try again.',
                class_name=REPOSITORY_CLASS_NAME)
        else:
            log.error(
                'An error occurred while creating the files into workspace: %s \n.' % e,
                class_name=REPOSITORY_CLASS_NAME)
        return None, None
    except Exception as e:
        self._checkout_ref('master')
        log.error(
            'An error occurred while creating the files into workspace: %s \n.' % e,
            class_name=REPOSITORY_CLASS_NAME)
        return None, None

    m = Metadata('', metadata_path, self.__config, repo_type)
    sha = m.sha_from_tag(tag)
    ref.update_head(tag, sha)

    # restore to master/head
    self._checkout_ref('master')
    return dataset_tag, labels_tag
def get_metadata_path(self, tag):
    """Return the metadata directory of the entity named in ``tag``.

    The spec name is parsed out of ``tag``, mapped to its entity directory
    for this repository type, and joined onto this object's base path.
    """
    _categories, entity_name, _version = spec_parse(tag)
    relative_dir = get_entity_dir(self.__repo_type, entity_name)
    return os.path.join(self.__path, relative_dir)