def clone(repository_url, folder=None, track=False):
    """Clone minimal configuration files from a git repository.

    Clones a repository containing a valid .ml-git/config.yaml and then
    initializes the metadata according to those configurations.

    Example:
        clone('https://[email protected]/mlgit-repository')

    Args:
        repository_url (str): The git repository that will be cloned.
        folder (str, optional): Directory that can be created to execute the clone command [default: current path].
        track (bool, optional): Set if the tracking of the cloned repository should be kept [default: False].
    """
    repo = Repository(config_load(), 'project')
    if folder is not None:
        repo.clone_config(repository_url, folder, track)
        return
    # No explicit folder: clone into a throwaway temp directory and keep only
    # the .ml-git configuration in the current working directory.
    starting_dir = os.getcwd()
    with tempfile.TemporaryDirectory(dir=starting_dir) as tmp_dir:
        clone_target = os.path.join(tmp_dir, 'mlgit')
        repo.clone_config(repository_url, clone_target, track)
        if not os.path.exists(os.path.join(starting_dir, '.ml-git')):
            shutil.move(os.path.join(clone_target, '.ml-git'), starting_dir)
        # clone_config may have changed the cwd into the temp tree; move back
        # before the TemporaryDirectory is cleaned up.
        os.chdir(starting_dir)
def get_repository_instance(repo_type):
    """Return a Repository configured for the given entity type.

    Args:
        repo_type (str): A known ML entity type or 'project'.

    Raises:
        RuntimeError: If repo_type is neither a valid entity type nor 'project'.
    """
    valid_types = EntityType.to_list()
    if repo_type != 'project' and repo_type not in valid_types:
        raise RuntimeError(output_messages['ERROR_INVALID_ENTITY_TYPE'] % valid_types)
    return Repository(config_load(), repo_type)
def create(entity, entity_name, categories, mutability, **kwargs):
    """This command will create the workspace structure with data and spec file for an entity and set the store configurations.

    Example:
        create('dataset', 'dataset-ex', categories=['computer-vision', 'images'], mutability='strict')

    Args:
        entity (str): The type of an ML entity. (dataset, labels or model).
        entity_name (str): An ml-git entity name to identify a ML entity.
        categories (list): Artifact's category name.
        mutability (str): Mutability type. The mutability options are strict, flexible and mutable.
        store_type (str, optional): Data store type [default: s3h].
        version (int, optional): Number of the artifact version [default: 1].
        import_path (str, optional): Path to be imported to the project.
        bucket_name (str, optional): Bucket name.
        import_url (str, optional): Import data from a google drive url.
        credentials_path (str, optional): Directory of credentials.json.
        unzip (bool, optional): Unzip imported zipped files [default: False].
    """
    # Translate the public keyword arguments into the argument dict the
    # Repository.create API expects.
    args = {'artifact_name': entity_name,
            'category': categories,
            'mutability': mutability,
            'version_number': kwargs.get('version', 1),
            'import': kwargs.get('import_path', None),
            'store_type': kwargs.get('store_type', StoreType.S3H.value),
            'bucket_name': kwargs.get('bucket_name', None),
            'unzip': kwargs.get('unzip', False),
            'import_url': kwargs.get('import_url', None),
            'credentials_path': kwargs.get('credentials_path', None),
            'wizard_config': False}
    repo = Repository(config_load(), entity)
    repo.create(args)
def test_paths(self):
    """Every repository path derived from the config must be non-empty,
    and the refs path must live under '.ml-git'."""
    config = config_load()
    for path_getter in (get_index_path, get_objects_path, get_cache_path, get_metadata_path):
        self.assertTrue(len(path_getter(config)) > 0)
    self.assertTrue('.ml-git' in get_refs_path(config))
def config(**kwargs):
    """Pretty-print the current ml-git configuration.

    Args:
        global (bool, optional): Show only the global configuration.
        local (bool, optional): Show only the local repository configuration.
    """
    config_file = config_load()
    # Use .get() so that omitting the flags entirely does not raise KeyError.
    if kwargs.get('global'):
        config_file = global_config_load()
    elif kwargs.get('local'):
        config_file = mlgit_config_load()
    print('config:')
    pprint(config_file)
def test_init_refs(self):
    """Constructing a Refs object must create the refs directory for the spec."""
    config = config_load()
    spec_name = 'dataset-ex'
    ml_dir = os.path.join(self.tmp_dir, config['mlgit_path'])
    os.mkdir(ml_dir)
    refs_dir = os.path.join(ml_dir, 'dataset', 'refs')
    refs = Refs(refs_dir, spec_name, 'dataset')
    self.assertIsNotNone(refs)
    self.assertTrue(os.path.exists(os.path.join(refs_dir, spec_name)))
def repo_remote_add(self, repo_type, mlgit_remote, global_conf=False):
    """Add a git remote for the given entity type and point the metadata at it.

    Any failure is logged instead of raised.
    """
    try:
        remote_add(repo_type, mlgit_remote, global_conf)
        # Reload the configuration so the freshly added remote is visible.
        self.__config = config_load()
        metadata = Metadata('', get_metadata_path(self.__config), self.__config, self.__repo_type)
        metadata.remote_set_url(mlgit_remote)
    except Exception as e:
        log.error(e, class_name=REPOSITORY_CLASS_NAME)
        return
def test_get_batch_size(self):
    """get_batch_size uses the default when unset and rejects invalid values."""
    config = config_load()
    self.assertEqual(get_batch_size(config), BATCH_SIZE_VALUE)
    # Zero and non-numeric values must both be rejected.
    for bad_value in (0, 'string'):
        config[BATCH_SIZE] = bad_value
        self.assertRaises(Exception, lambda: get_batch_size(config))
    del config[BATCH_SIZE]
    self.assertEqual(get_batch_size(config), BATCH_SIZE_VALUE)
def remote_add(entity, remote_url, global_configuration=False):
    """This command will add a remote to store the metadata from this ml-git project.

    Examples:
        remote_add('dataset', 'https://[email protected]/mlgit-datasets')

    Args:
        entity (str): The type of an ML entity. (repository, dataset, labels or model).
        remote_url(str): URL of an existing remote git repository.
        global_configuration (bool, optional): Use this option to set configuration at global level [default: False].
    """
    Repository(config_load(), entity).repo_remote_add(entity, remote_url, global_configuration)
def test_head(self):
    """head() returns the stored (tag, sha) and (None, None) once HEAD is gone."""
    config = config_load()
    spec_name = 'dataset-ex'
    ml_dir = os.path.join(self.tmp_dir, config['mlgit_path'])
    os.mkdir(ml_dir)
    refs_dir = os.path.join(ml_dir, 'dataset', 'refs')
    refs = Refs(refs_dir, spec_name)
    sha = 'b569b7e4cd82206b451315123669057ef5f1ac3b'
    tag = 'images__dataset_ex__1'
    refs.update_head(tag, sha)
    head_file = os.path.join(refs_dir, spec_name, 'HEAD')
    self.assertEqual((tag, sha), refs.head())
    os.remove(head_file)
    self.assertEqual((None, None), refs.head())
def test_update_head(self):
    """update_head() must write a HEAD file mapping the tag to its sha."""
    config = config_load()
    spec_name = 'dataset-ex'
    ml_dir = os.path.join(self.tmp_dir, config['mlgit_path'])
    os.mkdir(ml_dir)
    refs_dir = os.path.join(ml_dir, 'dataset', 'refs')
    refs = Refs(refs_dir, spec_name)
    sha = 'b569b7e4cd82206b451315123669057ef5f1ac3b'
    tag = 'images__dataset_ex__1'
    refs.update_head(tag, sha)
    head_file = os.path.join(refs_dir, spec_name, 'HEAD')
    self.assertTrue(os.path.exists(head_file))
    head_content = yaml_load(head_file)
    self.assertEqual(head_content[tag], sha)
def push(entity, entity_name, retries=2, clear_on_fail=False):
    """This command allows pushing the data of a specific version of an ML entity.

    Example:
        push('dataset', 'dataset-ex')

    Args:
        entity (str): The type of an ML entity. (dataset, labels or model).
        entity_name (str): An ml-git entity name to identify a ML entity.
        retries (int, optional): Number of retries to upload the files to the storage [default: 2].
        clear_on_fail (bool, optional): Remove the files from the store in case of failure during the push operation [default: False].
    """
    Repository(config_load(), entity).push(entity_name, retries, clear_on_fail)
def __init_manager(self, type_entity):
    """Build (and, if needed, initialize) the MetadataManager for an entity type.

    Logs a warning and bails out when the entity has no git remote configured;
    any other failure is logged instead of raised.
    """
    try:
        get_root_path()
        config = config_load()
        if not config[type_entity]['git']:
            log.warn(output_messages['WARN_REPOSITORY_NOT_FOUND_FOR_ENTITY'] % type_entity,
                     class_name=LocalEntityManager.__name__)
            return
        self._manager = MetadataManager(config, repo_type=type_entity)
        if not self._manager.check_exists():
            self._manager.init()
    except Exception as e:
        log.error(e, class_name=LocalEntityManager.__name__)
def add(entity_type, entity_name, bumpversion=False, fsck=False, file_path=None):
    """This command will add all the files under the directory into the ml-git index/staging area.

    Example:
        add('dataset', 'dataset-ex', bumpversion=True)

    Args:
        entity_type (str): The type of an ML entity. (dataset, labels or model)
        entity_name (str): The name of the ML entity you want to add the files.
        bumpversion (bool, optional): Increment the entity version number when adding more files [default: False].
        fsck (bool, optional): Run fsck after command execution [default: False].
        file_path (list, optional): List of files that must be added by the command [default: all files].
    """
    # Avoid the mutable-default-argument pitfall: the previous default of []
    # was shared across calls. None is the sentinel; [] keeps the old meaning.
    if file_path is None:
        file_path = []
    repo = Repository(config_load(), entity_type)
    repo.add(entity_name, file_path, bumpversion, fsck)
def init(entity):
    """This command will start the ml-git entity.

    Examples:
        init('repository')
        init('dataset')

    Args:
        entity (str): The type of entity that will be initialized (repository, dataset, labels or model).
    """
    if entity == 'repository':
        init_mlgit()
        return
    if entity in EntityType.to_list():
        Repository(config_load(), entity).init()
        return
    log.error('The type of entity entered is invalid. Valid types are: [repository, dataset, labels or model]')
def checkout(entity, tag, sampling=None, retries=2, force=False, dataset=False, labels=False, version=-1):
    """This command allows retrieving the data of a specific version of an ML entity.

    Example:
        checkout('dataset', 'computer-vision__images3__imagenet__1')

    Args:
        entity (str): The type of an ML entity. (dataset, labels or model)
        tag (str): An ml-git tag to identify a specific version of an ML entity.
        sampling (dict): group: <amount>:<group> The group sample option consists of amount and group used to download a sample.\n
            range: <start:stop:step> The range sample option consists of start, stop and step used to download a sample.
            The start parameter can be equal or greater than zero. The stop parameter can be 'all', -1 or
            any integer above zero.\n random: <amount:frequency> The random sample option consists of amount and frequency
            used to download a sample.
            seed: The seed is used to initialize the pseudorandom numbers.
        retries (int, optional): Number of retries to download the files from the storage [default: 2].
        force (bool, optional): Force checkout command to delete untracked/uncommitted files from the local repository [default: False].
        dataset (bool, optional): If exist a dataset related with the model or labels, this one must be downloaded [default: False].
        labels (bool, optional): If exist labels related with the model, they must be downloaded [default: False].
        version (int, optional): Number of the artifact version to be downloaded [default: latest version].

    Returns:
        str: Return the path where the data was checked out.
    """
    repo = Repository(config_load(), entity)
    repo.update()
    if sampling is not None and not validate_sample(sampling):
        return None
    options = {
        'with_dataset': dataset,
        'with_labels': labels,
        'retry': retries,
        'force': force,
        'bare': False,
        'version': version,
    }
    repo.checkout(tag, sampling, options)
    # The workspace path is derived from the tag: all segments but the version.
    data_path = os.path.join(entity, *tag.split('__')[:-1])
    if not os.path.exists(data_path):
        data_path = None
    return data_path
def init_repository(entity_type=DATASETS):
    """Build a Repository for the given entity type from the loaded configuration."""
    config = config_load()
    return Repository(config, entity_type)
def init_repository(entity_type='dataset'):
    """Build a Repository for the given entity type from the loaded configuration."""
    config = config_load()
    return Repository(config, entity_type)
def restore_config():
    """Fixture helper: snapshot the loaded config and restore it afterwards.

    Restores the original value of every key and also removes any key that
    was added while the caller was running, so no config state leaks between
    tests (the previous version only restored pre-existing keys).
    """
    config = config_load()
    snapshot = deepcopy(config)
    yield
    # Drop keys introduced during the test before restoring original values.
    for key in list(config.keys()):
        if key not in snapshot:
            del config[key]
    for key, value in snapshot.items():
        config[key] = value
def get_repository_instance(repo_type):
    """Return a Repository for the given entity type using the loaded configuration."""
    config = config_load()
    return Repository(config, repo_type)
index_path = get_index_path(self.__config, repo_type) log_info = metadata.get_log_info(spec, fullstat) except Exception as e: log.error(e, class_name=REPOSITORY_CLASS_NAME) return fidx = FullIndex(spec, index_path) if stat or fullstat: workspace_size = fidx.get_total_size() amount_message = 'Total of files: %s' % fidx.get_total_count() size_message = 'Workspace size: %s' % size(workspace_size, system=alternative) workspace_info = '------------------------------------------------- \n{}\t{}' \ .format(amount_message, size_message) log_info = '{}\n{}'.format(log_info, workspace_info) log.info(log_info, class_name=REPOSITORY_CLASS_NAME) if __name__ == '__main__': config = config_load() r = Repository(config) r.init() r.add('dataset-ex') r.commit('dataset-ex') r.status('dataset-ex')
def init_repository(entity_type=DATASETS):
    """Build a Repository for the given entity type, loading the config quietly."""
    config = config_load(hide_logs=True)
    return Repository(config, entity_type)
def clone_config(self, url, folder=None, track=False):
    """Clone the configuration repository and initialize its metadata.

    Does nothing when the underlying clone fails.
    """
    if not clone_config_repository(url, folder, track):
        return
    # Reload the configuration picked up from the cloned repository.
    self.__config = config_load()
    metadata = Metadata('', get_metadata_path(self.__config), self.__config)
    metadata.clone_config_repo()