def clone(repository_url, folder=None, track=False):
    """Clone the minimal ml-git configuration files from a remote repository.

    The repository at ``repository_url`` must contain a valid
    .ml-git/config.yaml; the metadata is then initialized according to
    those configurations.

    Example:
        clone('https://git@github.com/mlgit-repository')

    Args:
        repository_url (str): The git repository that will be cloned.
        folder (str, optional): Directory that can be created to execute the
            clone command [default: current path].
        track (bool, optional): Set if the tracking of the cloned repository
            should be kept [default: False].
    """
    repo = Repository(config_load(), 'project')
    if folder is not None:
        repo.clone_config(repository_url, folder, track)
        return

    # No target folder: clone into a throw-away directory created under the
    # current path and move only the '.ml-git' folder next to the caller.
    current_directory = os.getcwd()
    with tempfile.TemporaryDirectory(dir=current_directory) as tempdir:
        mlgit_path = os.path.join(tempdir, 'mlgit')
        try:
            repo.clone_config(repository_url, mlgit_path, track)
            # An already existing '.ml-git' in the current path is left untouched.
            if not os.path.exists(os.path.join(current_directory, '.ml-git')):
                shutil.move(os.path.join(mlgit_path, '.ml-git'), current_directory)
        finally:
            # Restore the working directory even when the clone fails, and do
            # it before the temporary directory is removed.
            # NOTE(review): presumably clone_config changes the cwd into the
            # cloned folder -- confirm against Repository.clone_config.
            os.chdir(current_directory)
def get_repository_instance(repo_type):
    """Build a Repository for ``repo_type`` after validating the type.

    Args:
        repo_type (str): Either 'project' or one of the values returned by
            EntityType.to_list().

    Returns:
        Repository: Instance created from the loaded configuration.

    Raises:
        RuntimeError: If ``repo_type`` is neither 'project' nor a listed
            entity type.
    """
    project_repo_type = 'project'
    is_entity_type = repo_type in EntityType.to_list()
    if is_entity_type or repo_type == project_repo_type:
        return Repository(config_load(), repo_type)

    # The error message is formatted with the list of valid entity types.
    raise RuntimeError(output_messages['ERROR_INVALID_ENTITY_TYPE'] % EntityType.to_list())
def create(entity, entity_name, categories, mutability, **kwargs):
    """Create the workspace structure for an entity.

    Collects the creation arguments into a dictionary and hands them to
    ``Repository.create`` for the given entity type.

    Example:
        create('dataset', 'dataset-ex', categories=['computer-vision', 'images'], mutability='strict')

    Args:
        entity (str): The type of an ML entity. (dataset, labels or model).
        entity_name (str): An ml-git entity name to identify a ML entity.
        categories (list): Artifact's category name.
        mutability (str): Mutability type. The mutability options are strict,
            flexible and mutable.
        store_type (str, optional): Data store type
            [default: StoreType.S3H.value].
        version (int, optional): Value forwarded as 'version_number'
            [default: 1].
        import_path (str, optional): Path to be imported to the project.
        bucket_name (str, optional): Bucket name.
        import_url (str, optional): Import data from a google drive url.
        credentials_path (str, optional): Directory of credentials.json.
        unzip (bool, optional): Unzip imported zipped files [default: False].
    """
    option = kwargs.get
    creation_args = {
        'artifact_name': entity_name,
        'category': categories,
        'mutability': mutability,
        'version_number': option('version', 1),
        'import': option('import_path', None),
        'store_type': option('store_type', StoreType.S3H.value),
        'bucket_name': option('bucket_name', None),
        'unzip': option('unzip', False),
        'import_url': option('import_url', None),
        'credentials_path': option('credentials_path', None),
        # The wizard is always disabled when called through this function.
        'wizard_config': False,
    }
    Repository(config_load(), entity).create(creation_args)
def remote_add(entity, remote_url, global_configuration=False):
    """Add a remote to store the metadata from this ml-git project.

    Examples:
        remote_add('dataset', 'https://git@github.com/mlgit-datasets')

    Args:
        entity (str): The type of an ML entity. (repository, dataset, labels
            or model).
        remote_url (str): URL of an existing remote git repository.
        global_configuration (bool, optional): Use this option to set
            configuration at global level [default: False].
    """
    configuration = config_load()
    entity_repository = Repository(configuration, entity)
    entity_repository.repo_remote_add(entity, remote_url, global_configuration)
def push(entity, entity_name, retries=2, clear_on_fail=False):
    """Push the data of a specific version of an ML entity.

    Example:
        push('dataset', 'dataset-ex')

    Args:
        entity (str): The type of an ML entity. (dataset, labels or model).
        entity_name (str): An ml-git entity name to identify a ML entity.
        retries (int, optional): Number of retries to upload the files to the
            storage [default: 2].
        clear_on_fail (bool, optional): Remove the files from the store in
            case of failure during the push operation [default: False].
    """
    configuration = config_load()
    entity_repository = Repository(configuration, entity)
    entity_repository.push(entity_name, retries, clear_on_fail)
def add(entity_type, entity_name, bumpversion=False, fsck=False, file_path=None):
    """Add the files under the entity directory into the ml-git index/staging area.

    Example:
        add('dataset', 'dataset-ex', bumpversion=True)

    Args:
        entity_type (str): The type of an ML entity. (dataset, labels or model)
        entity_name (str): The name of the ML entity you want to add the files.
        bumpversion (bool, optional): Increment the entity version number when
            adding more files [default: False].
        fsck (bool, optional): Run fsck after command execution
            [default: False].
        file_path (list, optional): List of files that must be added by the
            command. When omitted (None) an empty list is forwarded, which the
            original documentation describes as adding all files
            [default: None].
    """
    # A fresh list per call replaces the former shared mutable default ([]),
    # so a mutation made downstream can no longer leak into later calls.
    if file_path is None:
        file_path = []

    repo = Repository(config_load(), entity_type)
    repo.add(entity_name, file_path, bumpversion, fsck)
def test_tag_exist(self):
    """Assert that the third value returned by Metadata.tag_exists is not None.

    The fixture copies the sample spec and writes a MANIFEST.yaml into the
    entity's metadata directory, then initializes the repository before
    calling ``tag_exists`` with the test directory.
    """
    metadata_root = os.path.join(self.test_dir, 'metadata')
    entity_name = 'dataset-ex'
    entity_dir = os.path.join(metadata_root, entity_name)

    # Prepare the metadata directory with a spec file and a manifest.
    ensure_path_exists(entity_dir)
    shutil.copy('hdata/dataset-ex.spec', entity_dir + '/dataset-ex.spec')
    yaml_save(files_mock, os.path.join(entity_dir, 'MANIFEST.yaml'))
    config['mlgit_path'] = self.test_dir

    metadata_handler = Metadata(entity_name, metadata_root, config, repotype)
    repository = Repository(config, repotype)
    repository.init()

    # Only the third element of the returned triple is checked.
    _, _, metadata = metadata_handler.tag_exists(self.test_dir)
    self.assertFalse(metadata is None)
def test_get_tag(self):
    """Assert that Metadata.get_tag returns the single tag from list_tags.

    ``Metadata.list_tags`` is patched to return one tag, and ``get_tag`` is
    called with the entity name and -1. The metadata path is cleared at the
    end of the test.
    """
    metadata_root = os.path.join(self.test_dir, 'metadata')
    entity_name = 'dataset-ex'
    entity_dir = os.path.join(metadata_root, entity_name)

    # Prepare the metadata directory with a spec file and a manifest.
    ensure_path_exists(entity_dir)
    shutil.copy('hdata/dataset-ex.spec', entity_dir + '/dataset-ex.spec')
    yaml_save(files_mock, os.path.join(entity_dir, 'MANIFEST.yaml'))
    config['mlgit_path'] = self.test_dir

    metadata_handler = Metadata(entity_name, metadata_root, config, DATASETS)
    repository = Repository(config, DATASETS)
    repository.init()

    expected_tags = ['computer__images__dataset-ex__1']
    tags_patch = mock.patch('ml_git.metadata.Metadata.list_tags', return_value=expected_tags)
    with tags_patch:
        found_tag = metadata_handler.get_tag(entity_name, -1)

    self.assertEqual(found_tag, expected_tags[0])
    clear(metadata_handler.path)
def init(entity):
    """Start the ml-git repository or one of its entities.

    Examples:
        init('repository')
        init('dataset')

    Args:
        entity (str): The type of entity that will be initialized
            (repository, dataset, labels or model). Any other value only
            logs an error.
    """
    # The whole project is initialized through init_mlgit.
    if entity == 'repository':
        init_mlgit()
        return

    if entity not in EntityType.to_list():
        log.error('The type of entity entered is invalid. Valid types are: [repository, dataset, labels or model]')
        return

    Repository(config_load(), entity).init()
def checkout(entity, tag, sampling=None, retries=2, force=False, dataset=False, labels=False, version=-1):
    """Retrieve the data of a specific version of an ML entity.

    The repository is updated first; the sampling option, when given, is
    validated before the checkout is performed.

    Example:
        checkout('dataset', 'computer-vision__images3__imagenet__1')

    Args:
        entity (str): The type of an ML entity. (dataset, labels or model)
        tag (str): An ml-git tag to identify a specific version of an ML
            entity.
        sampling (dict): Sample option used to download a sample.
            group: <amount>:<group> The group sample option consists of
            amount and group.
            range: <start:stop:step> The range sample option consists of
            start, stop and step. The start parameter can be equal or greater
            than zero. The stop parameter can be 'all', -1 or any integer
            above zero.
            random: <amount:frequency> The random sample option consists of
            amount and frequency.
            seed: The seed is used to initialize the pseudorandom numbers.
        retries (int, optional): Number of retries to download the files from
            the storage [default: 2].
        force (bool, optional): Force checkout command to delete
            untracked/uncommitted files from the local repository
            [default: False].
        dataset (bool, optional): If exist a dataset related with the model or
            labels, this one must be downloaded [default: False].
        labels (bool, optional): If exist labels related with the model, they
            must be downloaded [default: False].
        version (int, optional): Value forwarded to the repository checkout
            as the 'version' option [default: -1].

    Returns:
        str: The path where the data was checked out, or None when the
        sampling option is invalid or the path does not exist.
    """
    repo = Repository(config_load(), entity)
    repo.update()

    if sampling is not None and not validate_sample(sampling):
        return None

    checkout_options = {
        'with_dataset': dataset,
        'with_labels': labels,
        'retry': retries,
        'force': force,
        'bare': False,
        'version': version,
    }
    repo.checkout(tag, sampling, checkout_options)

    # The path is the entity type plus every tag component except the last.
    tag_parts = tag.split('__')
    checked_out_path = os.path.join(entity, *tag_parts[:-1])
    return checked_out_path if os.path.exists(checked_out_path) else None
def get_repository_instance(repo_type):
    """Create a Repository for ``repo_type`` from the loaded configuration.

    Args:
        repo_type (str): Repository type passed straight to Repository; no
            validation is performed here.

    Returns:
        Repository: The newly created instance.
    """
    configuration = config_load()
    return Repository(configuration, repo_type)
def init_repository(entity_type=DATASETS):
    """Create a Repository for ``entity_type`` from the loaded configuration.

    Args:
        entity_type (str, optional): Entity type passed to Repository
            [default: DATASETS].

    Returns:
        Repository: The newly created instance.
    """
    configuration = config_load()
    return Repository(configuration, entity_type)
def init_repository(entity_type='dataset'):
    """Create a Repository for ``entity_type`` from the loaded configuration.

    Args:
        entity_type (str, optional): Entity type passed to Repository
            [default: 'dataset'].

    Returns:
        Repository: The newly created instance.
    """
    configuration = config_load()
    return Repository(configuration, entity_type)
def init_repository(entity_type=DATASETS):
    """Create a Repository for ``entity_type`` with configuration logs hidden.

    The configuration is loaded with ``hide_logs=True``.

    Args:
        entity_type (str, optional): Entity type passed to Repository
            [default: DATASETS].

    Returns:
        Repository: The newly created instance.
    """
    quiet_configuration = config_load(hide_logs=True)
    return Repository(quiet_configuration, entity_type)