def checkout(self, tag, samples, options):
    """Check out ``tag`` and then the dataset/labels tags related to it.

    The metadata path is read from the configuration; if that raises
    ``RootPathException`` the warning is logged and the repository is
    initialized on the fly instead.

    ``options`` is modified in place: after the main checkout both
    ``with_dataset`` and ``with_labels`` are set to ``False`` before the
    related tags are checked out. ``self.__repo_type`` is overwritten for
    every related checkout and is not restored afterwards.
    """
    try:
        metadata_path = get_metadata_path(self.__config)
    except RootPathException as e:
        log.warn(e, class_name=REPOSITORY_CLASS_NAME)
        metadata_path = self._initialize_repository_on_the_fly()

    dataset_tag, labels_tag = self._checkout(tag, samples, options)
    options['with_dataset'] = False
    options['with_labels'] = False

    # Dataset first, labels second; each one is attempted independently.
    related_checkouts = (
        ('dataset', dataset_tag, 'Initializing related dataset download'),
        ('labels', labels_tag, 'Initializing related labels download'),
    )
    for related_type, related_tag, start_message in related_checkouts:
        if related_tag is None:
            continue
        try:
            self.__repo_type = related_type
            related_metadata = Metadata('', metadata_path, self.__config, self.__repo_type)
            log.info(start_message, class_name=REPOSITORY_CLASS_NAME)
            if not related_metadata.check_exists():
                related_metadata.init()
            self._checkout(related_tag, samples, options)
        except Exception as e:
            # A failure here is only logged so the remaining work still runs.
            log.error('LocalRepository: [%s]' % e, class_name=REPOSITORY_CLASS_NAME)
def tag(self, spec, usr_tag):
    """Attach the user tag ``usr_tag`` to the current tag of ``spec``.

    Returns ``True`` when the tag was created and ``False`` on every
    failure path: refs/metadata lookup error, no current tag, user tag
    already present, or an error raised while adding the tag.
    """
    repo_type = self.__repo_type
    try:
        metadata_path = get_metadata_path(self.__config, repo_type)
        refs_path = get_refs_path(self.__config, repo_type)
        r = Refs(refs_path, spec, repo_type)
        curtag, _ = r.head()
    except Exception as e:
        log.error(e, class_name=REPOSITORY_CLASS_NAME)
        return False

    if curtag is None:
        log.error('No current tag for [%s]. commit first.' % spec, class_name=REPOSITORY_CLASS_NAME)
        return False

    utag = UsrTag(curtag, usr_tag)

    # Check if usrtag exists before creating it
    log.debug('Check if tag [%s] already exists' % utag, class_name=REPOSITORY_CLASS_NAME)
    m = Metadata(spec, metadata_path, self.__config, repo_type)
    if m._usrtag_exists(utag) is True:
        log.error('Tag [%s] already exists.' % utag, class_name=REPOSITORY_CLASS_NAME)
        return False

    # ensure metadata repository is at the current tag/sha version
    m = Metadata('', metadata_path, self.__config, repo_type)
    m.checkout(curtag)

    # TODO: format to something that could be used for a checkout:
    # format: _._user_.._ + curtag + _.._ + usrtag
    # at checkout with usrtag look for pattern _._ then find usrtag in the list (split on '_.._')
    # adds usrtag to the metadata repository
    m = Metadata(spec, metadata_path, self.__config, repo_type)
    try:
        m.tag_add(utag)
    except Exception as e:
        # Not every exception carries a git-style ``stderr`` and the pattern
        # may not match; fall back to logging the exception itself instead of
        # raising AttributeError from inside the handler.
        stderr = getattr(e, 'stderr', None)
        match = None
        if isinstance(stderr, str):
            match = re.search("stderr: 'fatal:(.*)'$", stderr)
        err = match.group(1) if match else e
        log.error(err, class_name=REPOSITORY_CLASS_NAME)
        # The final checkout below is skipped on this path.
        return False

    log.info('Create Tag Successfull', class_name=REPOSITORY_CLASS_NAME)

    # checkout at metadata repository at master version
    m.checkout()
    return True
def check_initialized_entity(context, entity_type, entity_name):
    """Log an error and call ``context.exit()`` when the entity metadata is missing."""
    config = merged_config_load()
    entity_metadata = Metadata(
        entity_name,
        get_metadata_path(config, entity_type),
        config,
        entity_type,
    )
    if entity_metadata.check_exists():
        return
    log.error(output_messages['ERROR_NOT_INITIALIZED'] % entity_type)
    context.exit()
def _checkout_ref(self, ref):
    """Check out ``ref`` in the metadata repository of the current repo type."""
    repo_type = self.__repo_type
    metadata = Metadata(
        '',
        get_metadata_path(self.__config, repo_type),
        self.__config,
        repo_type,
    )
    metadata.checkout(ref)
def metadata_exists(self, entity):
    """Switch the repo type to ``entity`` and return its metadata ``check_exists()`` result.

    Note that ``self.__repo_type`` stays set to ``entity`` after the call.
    """
    self.__repo_type = entity
    repo_type = self.__repo_type
    path = get_metadata_path(self.__config, repo_type)
    return Metadata('', path, self.__config, repo_type).check_exists()
def log(self, spec, stat=False, fullstat=False):
    """Log the metadata log info of ``spec``.

    When ``stat`` or ``fullstat`` is truthy, a line with the file count and
    the humanized workspace size from the full index is appended to the
    output. Any error while gathering the log info is logged and the
    method returns without output.
    """
    try:
        repo_type = self.__repo_type
        metadata_path = get_metadata_path(self.__config, repo_type)
        metadata = Metadata(spec, metadata_path, self.__config, repo_type)
        index_path = get_index_path(self.__config, repo_type)
        log_info = metadata.get_log_info(spec, fullstat)
    except Exception as e:
        log.error(e, class_name=REPOSITORY_CLASS_NAME)
        return

    full_index = FullIndex(spec, index_path)
    if stat or fullstat:
        total_size = full_index.get_total_size()
        summary = '------------------------------------------------- \n{}\t{}'.format(
            'Total of files: %s' % full_index.get_total_count(),
            'Workspace size: %s' % humanize.naturalsize(total_size),
        )
        log_info = '{}\n{}'.format(log_info, summary)

    log.info(log_info, class_name=REPOSITORY_CLASS_NAME)
def test_get_metrics(self):
    """``_get_metrics`` output equals a left-aligned Name/Value table of the spec metrics."""
    repo_type = MODELS
    entity = 'model-ex'
    specpath = os.path.join('vision-computer', 'images')
    mdpath = os.path.join(self.test_dir, 'mdata', repo_type, 'metadata')

    m = Metadata(entity, self.test_dir, config, repo_type)
    m.init()

    entity_dir = os.path.join(mdpath, specpath, entity)
    ensure_path_exists(entity_dir)
    spec_metadata_path = os.path.join(entity_dir, 'model-ex.spec')
    shutil.copy('hdata/dataset-ex.spec', spec_metadata_path)

    # Turn the dataset spec fixture into a model spec that carries metrics.
    spec_file = yaml_load(spec_metadata_path)
    spec_file[MODEL_SPEC_KEY] = deepcopy(spec_file[DATASET_SPEC_KEY])
    del spec_file[DATASET_SPEC_KEY]
    spec_file[MODEL_SPEC_KEY]['metrics'] = {'metric_1': 0, 'metric_2': 1}
    yaml_save(spec_file, spec_metadata_path)

    tag = 'vision-computer__images__model-ex__1'
    sha = m.commit(spec_metadata_path, specpath)
    m.tag_add(tag)
    metrics = m._get_metrics(entity, sha)

    expected_table = PrettyTable()
    expected_table.field_names = ['Name', 'Value']
    for column in ('Name', 'Value'):
        expected_table.align[column] = 'l'
    for row in (['metric_1', 0], ['metric_2', 1]):
        expected_table.add_row(row)
    expected_metrics = '\nmetrics:\n{}'.format(expected_table.get_string())

    self.assertEqual(metrics, expected_metrics)
def test_diff_refs_modified_file(self):
    """Replacing one manifest hash between two commits is reported as one modified file."""
    repo_type = DATASETS
    mdpath = os.path.join(self.test_dir, '.ml-git', repo_type, 'metadata')
    entity = 'dataset-ex'
    specpath = os.path.join('vision-computer', 'images', entity)
    config_test = deepcopy(config)
    config_test['mlgit_path'] = '.ml-git'
    m = Metadata(entity, mdpath, config_test, repo_type)
    m.init()

    ensure_path_exists(os.path.join(mdpath, specpath, entity))
    manifestpath = os.path.join(os.path.join(mdpath, specpath), 'MANIFEST.yaml')
    shutil.copy('hdata/dataset-ex.spec', os.path.join(mdpath, specpath, '{}.spec'.format(entity)))

    # First commit: the unmodified manifest mock.
    yaml_save(files_mock, manifestpath)
    sha1 = m.commit(manifestpath, 'test')

    # Second commit: one existing hash entry is dropped and a new hash is added.
    files_mock_copy = deepcopy(files_mock)
    del files_mock_copy['zdj7WZzR8Tw87Dx3dm76W5aehnT23GSbXbQ9qo73JgtwREGwB']
    files_mock_copy['NewHash'] = {'7.jpg'}
    yaml_save(files_mock_copy, manifestpath)
    sha2 = m.commit(manifestpath, 'test')

    added_files, deleted_files, modified_file = m.diff_refs_with_modified_files(
        entity, sha1, sha2)

    # assertEqual reports the actual length on failure, unlike assertTrue(len(...) == n).
    self.assertEqual(len(added_files), 0)
    self.assertEqual(len(deleted_files), 0)
    self.assertEqual(len(modified_file), 1)
def test_diff_refs_add_file(self):
    """Adding one manifest hash between two commits is reported as one added file."""
    repo_type = DATASETS
    mdpath = os.path.join(self.test_dir, '.ml-git', repo_type, 'metadata')
    entity = 'dataset-ex'
    specpath = os.path.join('vision-computer', 'images', entity)
    config_test = deepcopy(config)
    config_test['mlgit_path'] = '.ml-git'
    m = Metadata(entity, mdpath, config_test, repo_type)
    m.init()

    ensure_path_exists(os.path.join(mdpath, specpath, entity))
    manifestpath = os.path.join(os.path.join(mdpath, specpath), 'MANIFEST.yaml')
    shutil.copy('hdata/dataset-ex.spec', os.path.join(mdpath, specpath, '{}.spec'.format(entity)))

    # First commit: the unmodified manifest mock.
    yaml_save(files_mock, manifestpath)
    sha1 = m.commit(manifestpath, 'test')

    # Second commit: the same manifest plus one extra hash entry.
    files_mock_copy = deepcopy(files_mock)
    files_mock_copy['zPaksM5tNewHashQ2VABPvvfC3VW6wFRTWKvFhUW5QaDx6JMoma'] = {'11.jpg'}
    yaml_save(files_mock_copy, manifestpath)
    sha2 = m.commit(manifestpath, 'test')

    added_files, deleted_files, modified_file = m.diff_refs_with_modified_files(
        entity, sha1, sha2)

    # assertEqual reports the actual length on failure, unlike assertTrue(len(...) == n).
    self.assertEqual(len(added_files), 1)
    self.assertEqual(len(deleted_files), 0)
    self.assertEqual(len(modified_file), 0)
def init(self):
    """Initialize the metadata repository; any error is logged, not raised."""
    try:
        path = get_metadata_path(self.__config)
        Metadata('', path, self.__config, self.__repo_type).init()
    except Exception as e:
        log.error(e, class_name=REPOSITORY_CLASS_NAME)
def test_format_data_for_csv(self):
    """``_format_data_for_csv`` yields the fixed columns plus one column per metric."""
    entity_name = '{}-ex'.format(MODELS)
    metadata = Metadata(entity_name, self.test_dir, config, MODELS)
    metadata.init()

    csv_header, output_info = metadata._format_data_for_csv(
        [{PERFORMANCE_KEY: {'accuracy': 10.0}}]
    )

    self.assertEqual(
        [DATE, TAG, RELATED_DATASET_TABLE_INFO, RELATED_LABELS_TABLE_INFO, 'accuracy'],
        csv_header,
    )
    self.assertIn('accuracy', output_info[0])
def get_last_entity_version(entity_type, entity_name):
    """Return the last tag version of the entity plus one.

    Logs an error and returns ``None`` when the entity metadata does not exist.
    """
    config = merged_config_load()
    entity_metadata = Metadata(
        entity_name,
        get_metadata_path(config, entity_type),
        config,
        entity_type,
    )
    if entity_metadata.check_exists():
        return entity_metadata.get_last_tag_version(entity_name) + 1
    log.error(output_messages['ERROR_NOT_INITIALIZED'] % entity_type)
    return None
def _checkout_ref(self, ref=None):
    """Check out ``ref`` in the metadata repository, or its default branch when ``ref`` is None."""
    repo_type = self.__repo_type
    path = get_metadata_path(self.__config, repo_type)
    metadata = Metadata('', path, self.__config, repo_type)
    target = metadata.get_default_branch() if ref is None else ref
    metadata.checkout(target)
def test_get_related_entity_info(self):
    """``_get_related_entity_info`` returns the related tag and its formatted form."""
    related_tag = 'test__dataset-ex__1'
    spec = {
        DATASETS: {
            'tag': related_tag,
            'sha': '7f42830dbd035acb35f41359a5178c72d7cbc12c',
        }
    }

    metadata = Metadata('{}-ex'.format(MODELS), self.test_dir, config, MODELS)
    metadata.init()
    entity_tag, formatted_info = metadata._get_related_entity_info(spec, DATASETS)

    self.assertEqual(related_tag, entity_tag)
    self.assertEqual(formatted_info, 'dataset-ex - (1)')
def repo_remote_del(self, global_conf=False):
    """Delete the git reference of the metadata and, if that succeeds, the configured remote.

    Any error is logged instead of being raised.
    """
    try:
        path = get_metadata_path(self.__config)
        git_reference_deleted = Metadata(
            '', path, self.__config, self.__repo_type
        ).delete_git_reference()
        if git_reference_deleted:
            remote_del(self.__repo_type, global_conf)
    except Exception as e:
        log.error(e, class_name=REPOSITORY_CLASS_NAME)
def repo_remote_add(self, repo_type, mlgit_remote, global_conf=False):
    """Register ``mlgit_remote`` in the configuration and set it as the metadata remote URL.

    The configuration is reloaded into ``self.__config`` after the remote is
    added. Any error is logged instead of being raised.
    """
    try:
        remote_add(repo_type, mlgit_remote, global_conf)
        self.__config = config_load()
        # NOTE(review): the Metadata below is built with self.__repo_type, not
        # the ``repo_type`` argument - confirm callers always pass the same value.
        path = get_metadata_path(self.__config)
        Metadata('', path, self.__config, self.__repo_type).remote_set_url(mlgit_remote)
    except Exception as e:
        log.error(e, class_name=REPOSITORY_CLASS_NAME)
def list_tag(self, spec):
    """Print every tag that the metadata lists for ``spec``, one per line.

    Any error is logged instead of being raised.
    """
    repo_type = self.__repo_type
    try:
        path = get_metadata_path(self.__config, repo_type)
        metadata = Metadata(spec, path, self.__config, repo_type)
        for tag_name in metadata.list_tags(spec):
            print(tag_name)
    except Exception as e:
        log.error(e, class_name=REPOSITORY_CLASS_NAME)
def test_get_target_tag(self):
    """``_get_target_tag`` raises RuntimeError for three lookups and returns the tag for version 2."""
    tags = [
        'computer__images__dataset-ex__1',
        'computer__images__dataset-ex__2',
        'computer__videos__dataset-ex__1',
    ]
    m = Metadata('', self.test_dir, config, DATASETS)

    with self.assertRaises(RuntimeError):
        m._get_target_tag(tags, 'dataset-ex', -1)
    with self.assertRaises(RuntimeError):
        m._get_target_tag(tags, 'dataset-ex', 1)
    with self.assertRaises(RuntimeError):
        m._get_target_tag(tags, 'dataset-wrong', 1)

    found = m._get_target_tag(tags, 'dataset-ex', 2)
    self.assertEqual(found, 'computer__images__dataset-ex__2')
    clear(m.path)
def push(self, spec, retry=2, clear_on_fail=False):
    """Push the objects of ``spec`` and, if that succeeds, its metadata.

    The method returns ``None`` on every path. It stops early when a
    repository path cannot be resolved, when a git user config field is
    ``None``, when ``fetch()`` returns ``False``, or when the spec file
    cannot be located.

    :param spec: name of the entity spec to push.
    :param retry: forwarded to ``LocalRepository.push``.
    :param clear_on_fail: forwarded to ``LocalRepository.push``.
    """
    repo_type = self.__repo_type
    try:
        objects_path = get_objects_path(self.__config, repo_type)
        metadata_path = get_metadata_path(self.__config, repo_type)
        refs_path = get_refs_path(self.__config, repo_type)
    except Exception as e:
        log.error(e, class_name=REPOSITORY_CLASS_NAME)
        return

    met = Metadata(spec, metadata_path, self.__config, repo_type)
    fields = met.git_user_config()
    if None in fields.values():
        log.error(
            'Your name and email address need to be configured in git. '
            'Please see the commands below:', class_name=REPOSITORY_CLASS_NAME)
        log.error('git config --global user.name \'Your Name\'', class_name=REPOSITORY_CLASS_NAME)
        # The placeholder must be an e-mail address the user can substitute.
        log.error('git config --global user.email you@example.com', class_name=REPOSITORY_CLASS_NAME)
        return

    if met.fetch() is False:
        return

    ref = Refs(refs_path, spec, repo_type)
    tag, _ = ref.branch()
    categories_path = get_path_with_categories(tag)

    # A failed search is only logged; the None check below ends the push.
    spec_path, spec_file = None, None
    try:
        spec_path, spec_file = search_spec_file(self.__repo_type, spec, categories_path)
    except Exception as e:
        log.error(e, class_name=REPOSITORY_CLASS_NAME)
    if spec_path is None:
        return

    full_spec_path = os.path.join(spec_path, spec_file)
    repo = LocalRepository(self.__config, objects_path, repo_type)
    ret = repo.push(objects_path, full_spec_path, retry, clear_on_fail)

    # ensure first we're on master !
    met.checkout()
    if ret == 0:
        # push metadata spec to LocalRepository git repository
        try:
            met.push()
        except Exception as e:
            log.error(e, class_name=REPOSITORY_CLASS_NAME)
            return
        MultihashFS(objects_path).reset_log()
def test_delete_git_reference(self):
    """Remote URLs are non-empty after init and empty after ``delete_git_reference``."""
    metadata = Metadata(spec, self.test_dir, config, repotype)
    metadata.init()

    for remote_url in Repo(metadata.path).remote().urls:
        self.assertNotEqual(remote_url, '')

    self.assertTrue(metadata.delete_git_reference())

    for remote_url in Repo(metadata.path).remote().urls:
        self.assertEqual(remote_url, '')
def test_default_branch(self):
    """``get_default_branch`` is 'master' after init and 'main' after the branch change."""
    initial_branch = 'master'
    renamed_branch = 'main'

    metadata = Metadata('', self.test_dir, config, DATASETS)
    metadata.init()
    self.assertTrue(metadata.check_exists())
    self.assertEqual(metadata.get_default_branch(), initial_branch)

    self.change_branch(metadata.path, renamed_branch)
    self.assertNotEqual(metadata.get_default_branch(), initial_branch)
    self.assertEqual(metadata.get_default_branch(), renamed_branch)
    clear(metadata.path)
def test_clone_empty_config_repo(self):
    """With empty git URLs, ``clone_config_repo`` leaves ``check_exists()`` false."""
    empty_remote_config = {
        'mlgit_path': './mdata',
        'mlgit_conf': 'config.yaml',
        'verbose': 'info',
        DATASETS: {'git': ''},
        LABELS: {'git': ''},
        MODELS: {'git': ''},
    }
    metadata = Metadata('', self.test_dir, empty_remote_config, DATASETS)
    metadata.clone_config_repo()
    self.assertFalse(metadata.check_exists())
def test_export_metrics(self):
    """``export_metrics`` writes the CSV file and returns data holding the header and metric row."""
    entity_name = '{}-ex'.format(MODELS)
    metadata = Metadata(entity_name, self.test_dir, config, MODELS)
    metadata.init()

    tag_infos = [{PERFORMANCE_KEY: {'accuracy': 10.0}}]
    data = metadata.export_metrics(entity_name, self.test_dir, CSV, tag_infos)

    exported_name = '{}-{}.{}'.format(entity_name, PERFORMANCE_KEY, CSV)
    self.assertTrue(os.path.exists(os.path.join(self.test_dir, exported_name)))

    expected_header = '{},{},{},{},accuracy'.format(
        DATE, TAG, RELATED_DATASET_TABLE_INFO, RELATED_LABELS_TABLE_INFO)
    self.assertIn(expected_header, data.getvalue())
    self.assertIn(',,,,10.0', data.getvalue())
def update(self):
    """Update the metadata repository of the current repo type.

    A ``GitError`` is logged with a hint about the remote configuration and
    its ``stderr``; any other exception is logged as is.
    """
    repo_type = self.__repo_type
    try:
        path = get_metadata_path(self.__config, repo_type)
        Metadata('', path, self.__config, repo_type).update()
    except GitError as error:
        message = 'Could not update metadata. Check your remote configuration. %s' % error.stderr
        log.error(message, class_name=REPOSITORY_CLASS_NAME)
    except Exception as e:
        log.error(e, class_name=REPOSITORY_CLASS_NAME)
def _initialize_repository_on_the_fly(self):
    """Initialize the project from the global configuration and return the metadata path.

    Raises ``RootPathException`` when the global configuration file does not exist.
    """
    if not os.path.exists(get_global_config_path()):
        raise RootPathException(
            'You are not in an initialized ml-git repository and do not have a global configuration.'
        )

    log.info('Initializing the project with global settings', class_name=REPOSITORY_CLASS_NAME)
    init_mlgit()
    save_global_config_in_local()

    metadata_path = get_metadata_path(self.__config)
    metadata_missing = not os.path.exists(metadata_path)
    if metadata_missing:
        Metadata('', metadata_path, self.__config, self.__repo_type).init()
    return metadata_path
def test_get_spec_content_from_ref(self):
    """Spec content read back from a tagged commit equals the spec file on disk."""
    specpath = 'dataset-ex'
    mdpath = os.path.join(self.test_dir, 'mdata', DATASETS, 'metadata')

    metadata = Metadata(specpath, self.test_dir, config, DATASETS)
    metadata.init()

    spec_dir = os.path.join(mdpath, specpath)
    ensure_path_exists(spec_dir)
    spec_metadata_path = spec_dir + '/dataset-ex.spec'
    shutil.copy('hdata/dataset-ex.spec', spec_metadata_path)

    sha = metadata.commit(spec_metadata_path, specpath)
    tag = metadata.tag_add(sha)

    raw_content = metadata._get_spec_content_from_ref(tag.commit, 'dataset-ex/dataset-ex.spec')
    content = yaml_load_str(raw_content)
    self.assertEqual(content, yaml_load(spec_metadata_path))
def test_get_specs_to_compare(self):
    """Each pair from ``get_specs_to_compare`` starts with the manifest of the committed spec."""
    specpath = 'dataset-ex'
    mdpath = os.path.join(self.test_dir, 'mdata', DATASETS, 'metadata')

    metadata = Metadata(specpath, self.test_dir, config, DATASETS)
    metadata.init()

    spec_dir = os.path.join(mdpath, specpath)
    ensure_path_exists(spec_dir)
    spec_metadata_path = spec_dir + '/dataset-ex.spec'
    shutil.copy('hdata/dataset-ex.spec', spec_metadata_path)

    sha = metadata.commit(spec_metadata_path, specpath)
    metadata.tag_add(sha)

    specs = metadata.get_specs_to_compare(specpath)
    spec_file = yaml_load(spec_metadata_path)
    for current, other in specs:
        self.assertEqual(current, spec_file[DATASETS]['manifest'])
        # NOTE(review): the dict is passed as the failure message, not as an
        # expected value - confirm whether a comparison was intended here.
        self.assertIsNotNone(other, {DATASETS: {'manifest': {}}})
def test_create_tag_info_table(self):
    """``_create_tag_info_table`` lists the tag info rows followed by the metric rows."""
    expected_table = PrettyTable()
    expected_table.field_names = ['Name', 'Value']
    expected_rows = (
        [DATE, 'date'],
        [RELATED_DATASET_TABLE_INFO, '1'],
        [RELATED_LABELS_TABLE_INFO, '2'],
        ['accuracy', 10.0],
    )
    for row in expected_rows:
        expected_table.add_row(row)

    metadata = Metadata('{}-ex'.format(MODELS), self.test_dir, config, MODELS)
    metadata.init()

    tag_info = {
        DATE: 'date',
        RELATED_DATASET_TABLE_INFO: '1',
        RELATED_LABELS_TABLE_INFO: '2',
    }
    tag_table = metadata._create_tag_info_table(tag_info, {'accuracy': 10.0})

    self.assertEqual(expected_table.get_string(), tag_table.get_string())
def test_tag_exist(self):
    """``tag_exists`` returns a non-None metadata entry for the prepared spec directory."""
    specpath = 'dataset-ex'
    mdpath = os.path.join(self.test_dir, 'metadata')
    spec_dir = os.path.join(mdpath, specpath)

    ensure_path_exists(spec_dir)
    shutil.copy('hdata/dataset-ex.spec', spec_dir + '/dataset-ex.spec')
    yaml_save(files_mock, os.path.join(spec_dir, 'MANIFEST.yaml'))

    # This writes to the module-level config object shared with other tests.
    config['mlgit_path'] = self.test_dir
    m = Metadata(specpath, mdpath, config, repotype)
    Repository(config, repotype).init()

    _, _, metadata = m.tag_exists(self.test_dir)
    self.assertFalse(metadata is None)
def test_get_tag(self):
    """``get_tag`` with version -1 returns the only tag reported by the patched ``list_tags``."""
    specpath = 'dataset-ex'
    mdpath = os.path.join(self.test_dir, 'metadata')
    spec_dir = os.path.join(mdpath, specpath)

    ensure_path_exists(spec_dir)
    shutil.copy('hdata/dataset-ex.spec', spec_dir + '/dataset-ex.spec')
    yaml_save(files_mock, os.path.join(spec_dir, 'MANIFEST.yaml'))

    # This writes to the module-level config object shared with other tests.
    config['mlgit_path'] = self.test_dir
    m = Metadata(specpath, mdpath, config, DATASETS)
    Repository(config, DATASETS).init()

    tag_list = ['computer__images__dataset-ex__1']
    list_tags_patch = mock.patch('ml_git.metadata.Metadata.list_tags', return_value=tag_list)
    with list_tags_patch:
        target_tag = m.get_tag(specpath, -1)

    self.assertEqual(target_tag, tag_list[0])
    clear(m.path)