def test_remote_add_global_config(self):
    """remote_add with global_conf=True writes .mlgitconfig and leaves the local config alone."""
    remote_default = 'git_local_server.git'
    new_remote = 'git_local_server2.git'
    dataset = 'dataset'
    init_mlgit()
    # Point Path.home at the sandbox so the global config lands in tmp_dir.
    with mock.patch('pathlib.Path.home', return_value=self.tmp_dir):
        remote_add(dataset, new_remote, global_conf=True)
    self.assertTrue(os.path.exists('.mlgitconfig'))
    local_config = yaml_load('.ml-git/config.yaml')
    global_config = yaml_load('.mlgitconfig')
    self.assertEqual(global_config['dataset']['git'], new_remote)
    self.assertNotEqual(local_config['dataset']['git'], remote_default)
    # Clearing and then re-adding the remote must round-trip through the global file.
    for url in ('', new_remote):
        with mock.patch('pathlib.Path.home', return_value=self.tmp_dir):
            remote_add(dataset, url, global_conf=True)
        self.assertEqual(yaml_load('.mlgitconfig')['dataset']['git'], url)
def test_save_global_config_in_local(self):
    """Copying the global configuration into the local one overrides entity remotes."""
    remote_default = 'git_local_server.git'
    new_remote = 'git_local_server2.git'
    init_mlgit()
    self.assertTrue(os.path.isdir('.ml-git'))
    local_config = yaml_load('.ml-git/config.yaml')
    self.assertEqual(local_config[DATASETS]['git'], remote_default)
    # Fake global config whose labels remote differs from the local default.
    fake_global = {
        DATASETS: {'git': 'url'},
        MODELS: {'git': 'url'},
        LABELS: {'git': new_remote},
        STORAGE_CONFIG_KEY: {},
    }
    with mock.patch('ml_git.config.global_config_load', return_value=fake_global):
        save_global_config_in_local()
    merged = yaml_load('.ml-git/config.yaml')
    self.assertEqual(merged[LABELS]['git'], new_remote)
def test_add_metrics_file(self):
    """add_metrics must read metric values from a CSV file into the model spec."""
    hashfs_path = os.path.join(self.tmp_dir, 'objectsfs')
    repo = LocalRepository(yaml_load('hdata/config.yaml'), hashfs_path, repo_type=MODELS)
    spec_path = os.path.join(self.tmp_dir, 'model-ex.spec')
    shutil.copy('hdata/dataset-ex.spec', spec_path)
    # Rewrite the copied dataset spec as a model spec.
    spec_file = yaml_load(spec_path)
    spec_file[MODEL_SPEC_KEY] = spec_file.pop(DATASET_SPEC_KEY)
    yaml_save(spec_file, spec_path)
    metrics_path = os.path.join(self.tmp_dir, 'metrics.csv')
    self.create_csv_file(metrics_path, {'metric_a': 10, 'metric_b': 9})
    repo.add_metrics(spec_path, (), metrics_path)
    # Values from the CSV must land in the spec as floats.
    saved_metrics = yaml_load(spec_path)[MODEL_SPEC_KEY]['metrics']
    self.assertEqual(saved_metrics.get('metric_a', ''), 10.0)
    self.assertEqual(saved_metrics.get('metric_b', ''), 9.0)
def test_save_global_config_in_local(self):
    """Copying the global configuration into the local one overrides entity remotes."""
    remote_default = 'git_local_server.git'
    new_remote = 'git_local_server2.git'
    init_mlgit()
    self.assertTrue(os.path.isdir('.ml-git'))
    local_config = yaml_load('.ml-git/config.yaml')
    self.assertEqual(local_config['dataset']['git'], remote_default)
    # Fake global config whose labels remote differs from the local default.
    fake_global = {
        'dataset': {'git': 'url'},
        'model': {'git': 'url'},
        'labels': {'git': new_remote},
        'store': {},
    }
    with mock.patch('ml_git.config.global_config_load', return_value=fake_global):
        save_global_config_in_local()
    merged = yaml_load('.ml-git/config.yaml')
    self.assertEqual(merged['labels']['git'], new_remote)
def test_store_del(self):
    """store_del removes the bucket entry from the local configuration."""
    init_mlgit()
    store_add('s3', 'bucket_test', 'personal')
    before = yaml_load('.ml-git/config.yaml')
    self.assertEqual(before['store']['s3']['bucket_test']['aws-credentials']['profile'], 'personal')
    store_del('s3', 'bucket_test')
    after = yaml_load('.ml-git/config.yaml')
    # The bucket must be gone from the s3 section (or the section itself removed).
    self.assertFalse('s3' in after['store'] and 'bucket_test' in after['store']['s3'])
def test_store_add(self):
    """store_add records a bucket under a valid store type and rejects unknown types."""
    init_mlgit()
    store_add('s3', 'bucket_test', 'personal')
    bucket_conf = yaml_load('.ml-git/config.yaml')['store']['s3']['bucket_test']
    self.assertEqual(bucket_conf['aws-credentials']['profile'], 'personal')
    self.assertIsNone(bucket_conf['region'])
    # An invalid store type must be a no-op that returns None.
    self.assertIsNone(store_add('s4', 'bucket_test', 'personal'))
    self.assertTrue('s3' in yaml_load('.ml-git/config.yaml')['store'])
def test_add_metrics_wrong_entity(self):
    """Metrics passed to a non-model repository must not be written to the spec."""
    hashfs_path = os.path.join(self.tmp_dir, 'objectsfs')
    repo = LocalRepository(yaml_load('hdata/config.yaml'), hashfs_path)
    spec_path = os.path.join(self.tmp_dir, 'dataset-ex.spec')
    shutil.copy('hdata/dataset-ex.spec', spec_path)
    repo.add_metrics(spec_path, (('metric_a', '10'), ('metric_b', '9')), None)
    self.assertNotIn('metrics', yaml_load(spec_path)[DATASET_SPEC_KEY])
def test_storage_del(self):
    """storage_del removes the bucket entry from the local configuration."""
    init_mlgit()
    storage_add(S3, 'bucket_test', 'personal')
    before = yaml_load('.ml-git/config.yaml')
    self.assertEqual(before[STORAGE_CONFIG_KEY][S3]['bucket_test']['aws-credentials']['profile'],
                     'personal')
    storage_del(S3, 'bucket_test')
    after = yaml_load('.ml-git/config.yaml')
    # The bucket must be gone from the S3 section (or the section itself removed).
    self.assertFalse(S3 in after[STORAGE_CONFIG_KEY]
                     and 'bucket_test' in after[STORAGE_CONFIG_KEY][S3])
def test_remote_del(self):
    """remote_del clears the configured git remote for an entity."""
    remote_default = 'git_local_server.git'
    init_mlgit()
    self.assertEqual(yaml_load('.ml-git/config.yaml')[DATASETS]['git'], '')
    remote_add(DATASETS, remote_default)
    self.assertEqual(yaml_load('.ml-git/config.yaml')[DATASETS]['git'], remote_default)
    remote_del(DATASETS)
    self.assertEqual(yaml_load('.ml-git/config.yaml')[DATASETS]['git'], '')
def test_remote_del(self):
    """remote_del clears the configured git remote for an entity."""
    remote_default = 'git_local_server.git'
    dataset = 'dataset'
    init_mlgit()
    self.assertEqual(yaml_load('.ml-git/config.yaml')[dataset]['git'], '')
    remote_add(dataset, remote_default)
    self.assertEqual(yaml_load('.ml-git/config.yaml')[dataset]['git'], remote_default)
    remote_del(dataset)
    self.assertEqual(yaml_load('.ml-git/config.yaml')[dataset]['git'], '')
def test_storage_add(self):
    """storage_add records a bucket under a valid storage type and rejects unknown types."""
    init_mlgit()
    storage_add(S3, 'bucket_test', 'personal')
    bucket_conf = yaml_load('.ml-git/config.yaml')[STORAGE_CONFIG_KEY][S3]['bucket_test']
    self.assertEqual(bucket_conf['aws-credentials']['profile'], 'personal')
    self.assertIsNone(bucket_conf['region'])
    # An invalid storage type must be a no-op that returns None.
    self.assertIsNone(storage_add('s4', 'bucket_test', 'personal'))
    self.assertTrue(S3 in yaml_load('.ml-git/config.yaml')[STORAGE_CONFIG_KEY])
def test_store_del_global_config(self):
    """store_del with global_conf=True edits .mlgitconfig under the mocked home dir."""
    # Home is patched so the global config is created inside the sandbox.
    with mock.patch('pathlib.Path.home', return_value=self.tmp_dir):
        init_mlgit()
        store_add('s3', 'bucket_test', 'personal', global_conf=True)
    before = yaml_load('.mlgitconfig')
    self.assertEqual(before['store']['s3']['bucket_test']['aws-credentials']['profile'], 'personal')
    with mock.patch('pathlib.Path.home', return_value=self.tmp_dir):
        store_del('s3', 'bucket_test', global_conf=True)
    after = yaml_load('.mlgitconfig')
    self.assertFalse('s3' in after['store'] and 'bucket_test' in after['store']['s3'])
def test_add(self):
    """Adding a directory populates both MANIFEST.yaml and INDEX.yaml."""
    idx = MultihashIndex('dataset-spec', self.tmp_dir, self.tmp_dir)
    # NOTE: during unit-test runs link counts can incorrectly exceed 1, which would
    # keep add_file from indexing the file. The trust_links parameter (defaulting to
    # True and cascading through the calls) lets the tests add the file anyway; this
    # points at a design issue in index.py's add_file.
    idx.add('data', '')
    manifest_path = os.path.join(self.tmp_dir, 'metadata', 'dataset-spec', 'MANIFEST.yaml')
    self.assertEqual(yaml_load(manifest_path), singlefile['manifest'])
    index = yaml_load(os.path.join(self.tmp_dir, 'metadata', 'dataset-spec', 'INDEX.yaml'))
    # Every indexed entry must point at the single file's hash.
    for entry in index.values():
        self.assertEqual(entry['hash'], singlefile['datastore'])
def storage_del(storage_type, bucket, global_conf=False):
    """Delete *bucket* from *storage_type* in the (local or global) ml-git config.

    Silently returns on an invalid storage type; logs a warning when the entry
    is not present, and an info message once it has been removed and saved.
    """
    if not valid_storage_type(storage_type):
        return
    try:
        config_path = get_config_path(global_conf)
        conf = yaml_load(config_path)
    except Exception as e:
        log.error(e, class_name=ADMIN_CLASS_NAME)
        return
    storages = conf[STORAGE_CONFIG_KEY] if STORAGE_CONFIG_KEY in conf else {}
    if storage_type not in storages or bucket not in storages[storage_type]:
        log.warn(output_messages['WARN_STORAGE_NOT_IN_CONFIG'] % (storage_type, bucket),
                 class_name=ADMIN_CLASS_NAME)
        return
    # `storages` aliases conf[STORAGE_CONFIG_KEY], so this mutates the config in place.
    del storages[storage_type][bucket]
    log.info(output_messages['INFO_REMOVED_STORAGE'] % (storage_type, bucket),
             class_name=ADMIN_CLASS_NAME)
    yaml_save(conf, config_path)
def get_version(file, repotype='dataset'):
    """Return the version number recorded in a spec *file*.

    Args:
        file: path to the .spec YAML file.
        repotype: top-level key of the spec ('dataset', 'model', 'labels', ...).

    Returns:
        The version value from the spec, or -1 when the spec is invalid.
    """
    spec_hash = utils.yaml_load(file)
    if is_valid_version(spec_hash, repotype):
        # Fix: previously indexed the hard-coded 'dataset' key, which raised a
        # KeyError for any other repotype even though validation had passed.
        return spec_hash[repotype]['version']
    log.error('Invalid version, could not get. File:\n %s' % file,
              class_name=ML_GIT_PROJECT_NAME)
    return -1
def store_del(store_type, bucket, global_conf=False):
    """Delete *bucket* from *store_type* in the (local or global) ml-git config.

    Silently returns on an invalid store type; logs a warning when the entry
    is not present, and an info message once it has been removed and saved.
    """
    if not valid_store_type(store_type):
        return
    try:
        config_path = get_config_path(global_conf)
        conf = yaml_load(config_path)
    except Exception as e:
        log.error(e, class_name=ADMIN_CLASS_NAME)
        return
    stores = conf['store'] if 'store' in conf else {}
    if store_type not in stores or bucket not in stores[store_type]:
        log.warn('Store [%s://%s] not found in configuration file.' % (store_type, bucket),
                 class_name=ADMIN_CLASS_NAME)
        return
    # `stores` aliases conf['store'], so this mutates the config in place.
    del stores[store_type][bucket]
    log.info('Removed store [%s://%s] from configuration file.' % (store_type, bucket),
             class_name=ADMIN_CLASS_NAME)
    yaml_save(conf, config_path)
def test_yaml_save(self):
    """yaml_save persists modified values back into the YAML document."""
    with tempfile.TemporaryDirectory() as tmpdir:
        # Last path component of the temp dir (split on Windows separators).
        temp_var = tmpdir.split('\\').pop()
        yaml_path = os.path.join(tmpdir, 'data.yaml')
        shutil.copy('udata/data.yaml', yaml_path)
        yal = yaml_load(yaml_path)
        # Build a new git URL: drop the last two dotted parts, append temp_var and 'git'.
        parts = yal[DATASETS]['git'].split('.')
        parts.pop()
        parts.pop()
        parts.append(temp_var)
        parts.append('git')
        new_git_var = '.'.join(parts)
        self.assertFalse(yal[DATASETS]['git'] == new_git_var)
        yal[DATASETS]['git'] = new_git_var
        yaml_save(yal, yaml_path)
        self.assertTrue(yal[DATASETS]['git'] == new_git_var)
def test_change_keys_in_config(self):
    """change_keys_in_config migrates v1 config keys (dataset/model/store) to the new names."""
    # NOTE(review): this YAML literal was reconstructed from a whitespace-mangled
    # source; the nesting of 'region' under 'mlgit-datasets' is assumed — confirm.
    config = """
dataset:
  git: fake_git_repository
labels:
  git: fake_git_repository
model:
  git: fake_git_repository
store:
  s3:
    mlgit-datasets:
      aws-credentials:
        profile: mlgit
      region: us-east-1
"""
    config_path = os.path.join(self.tmp_dir, ROOT_FILE_NAME, 'config.yaml')
    os.makedirs(os.path.join(self.tmp_dir, ROOT_FILE_NAME), exist_ok=True)
    with open(config_path, 'w') as config_yaml:
        config_yaml.write(config)
    change_keys_in_config(self.tmp_dir)
    conf = yaml_load(config_path)
    # Old v1 keys must be gone, replaced by their current equivalents.
    self.assertNotIn(V1_DATASETS_KEY, conf)
    self.assertIn(DATASETS, conf)
    self.assertNotIn(V1_MODELS_KEY, conf)
    self.assertIn(MODELS, conf)
    self.assertNotIn(V1_STORAGE_KEY, conf)
    self.assertIn(STORAGE_CONFIG_KEY, conf)
def test_add_idmpotent(self):
    """Adding the same directory twice leaves a single, identical manifest."""
    index = MultihashIndex('dataset-spec', self.tmp_dir, self.tmp_dir)
    for _ in range(2):
        index.add('data', '')
    manifest_file = os.path.join(self.tmp_dir, 'metadata', 'dataset-spec', 'MANIFEST.yaml')
    self.assertEqual(yaml_load(manifest_file), singlefile['manifest'])
def test_yaml_load(self):
    """yaml_load returns a truthy mapping for an existing YAML file."""
    # Sanity check: an empty mapping is falsy.
    self.assertFalse(bool({}))
    loaded = yaml_load('./udata/data.yaml')
    self.assertTrue(bool(loaded))
    self.assertEqual(loaded['store']['s3']['mlgit-datasets']['region'], 'us-east-1')
def test_remote_add(self):
    """remote_add sets, clears and resets the git remote in the local config."""
    remote_default = 'git_local_server.git'
    new_remote = 'git_local_server2.git'
    init_mlgit()
    remote_add(DATASETS, new_remote)
    self.assertTrue(os.path.isdir('.ml-git'))
    self.assertEqual(yaml_load('.ml-git/config.yaml')[DATASETS]['git'], new_remote)
    self.assertNotEqual(remote_default, new_remote)
    # Clearing the remote stores an empty string.
    remote_add(DATASETS, '')
    self.assertEqual(yaml_load('.ml-git/config.yaml')[DATASETS]['git'], '')
    # Re-adding restores the URL.
    remote_add(DATASETS, new_remote)
    self.assertTrue(os.path.isdir('.ml-git'))
    self.assertEqual(yaml_load('.ml-git/config.yaml')[DATASETS]['git'], new_remote)
def remote_add(repotype, ml_git_remote, global_conf=False):
    """Set the git remote for *repotype* in the ml-git config and persist it.

    Args:
        repotype: entity section of the config (e.g. 'dataset').
        ml_git_remote: remote URL to store; an empty string clears the remote.
        global_conf: when True, edit the global config instead of the local one.
    """
    file = get_config_path(global_conf)
    conf = yaml_load(file)
    # Fix: the old code indexed conf[repotype]['git'] directly, raising an
    # uncaught KeyError when the section existed without a 'git' entry, and
    # used a broad try/except as control flow for the assignment below.
    section = conf.get(repotype)
    current = section.get('git') if isinstance(section, dict) else None
    if current:
        log.warn(output_messages['WARN_HAS_CONFIGURED_REMOTE'], class_name=ADMIN_CLASS_NAME)
        log.info(output_messages['INFO_CHANGING_REMOTE'] % (current, ml_git_remote, repotype),
                 class_name=ADMIN_CLASS_NAME)
    else:
        log.info(output_messages['INFO_ADD_REMOTE'] % (ml_git_remote, repotype),
                 class_name=ADMIN_CLASS_NAME)
    if not isinstance(section, dict):
        conf[repotype] = {}
    conf[repotype]['git'] = ml_git_remote
    yaml_save(conf, file)
def test_get_metrics(self):
    """_get_metrics renders a committed model spec's metrics as a pretty table."""
    repo_type = MODELS
    entity = 'model-ex'
    specpath = os.path.join('vision-computer', 'images')
    mdpath = os.path.join(self.test_dir, 'mdata', repo_type, 'metadata')
    metadata = Metadata(entity, self.test_dir, config, repo_type)
    metadata.init()
    entity_dir = os.path.join(mdpath, specpath, entity)
    ensure_path_exists(entity_dir)
    spec_metadata_path = os.path.join(entity_dir, 'model-ex.spec')
    shutil.copy('hdata/dataset-ex.spec', spec_metadata_path)
    # Convert the copied dataset spec into a model spec carrying two metrics.
    spec_file = yaml_load(spec_metadata_path)
    spec_file[MODEL_SPEC_KEY] = deepcopy(spec_file[DATASET_SPEC_KEY])
    del spec_file[DATASET_SPEC_KEY]
    spec_file[MODEL_SPEC_KEY]['metrics'] = {'metric_1': 0, 'metric_2': 1}
    yaml_save(spec_file, spec_metadata_path)
    sha = metadata.commit(spec_metadata_path, specpath)
    metadata.tag_add('vision-computer__images__model-ex__1')
    metrics = metadata._get_metrics(entity, sha)
    # Build the expected rendering with the same PrettyTable settings.
    expected_table = PrettyTable()
    expected_table.field_names = ['Name', 'Value']
    expected_table.align['Name'] = 'l'
    expected_table.align['Value'] = 'l'
    expected_table.add_row(['metric_1', 0])
    expected_table.add_row(['metric_2', 1])
    self.assertEqual(metrics, '\nmetrics:\n{}'.format(expected_table.get_string()))
def _add_dir(self, dir_path, manifest_path, file_path='', ignore_rules=None):
    """Walk *dir_path*/*file_path* and add every non-ignored file to the index.

    Args:
        dir_path: root directory of the entity's data.
        manifest_path: path of the MANIFEST.yaml loaded into self.manifestfiles.
        file_path: optional subdirectory restricting the walk.
        ignore_rules: compiled ignore rules; None disables filtering.

    Returns:
        False when the grouped indexing work fails; otherwise None after
        saving the index manifest.
    """
    self.manifestfiles = yaml_load(manifest_path)
    f_index_file = self._full_idx.get_index()
    all_files = []
    for root, dirs, files in os.walk(os.path.join(dir_path, file_path)):
        base_path = root[:len(dir_path) + 1:]
        relative_path = root[len(dir_path) + 1:]
        # Skip hidden roots and directories matched by the ignore rules.
        if '.' == root[0] or should_ignore_file(ignore_rules, '{}/'.format(relative_path)):
            continue
        for file in files:
            # Fix: this loop previously reassigned the `file_path` parameter,
            # shadowing it; a distinct local name makes the intent clear.
            rel_file_path = os.path.join(relative_path, file)
            if ignore_rules is None or not should_ignore_file(ignore_rules, rel_file_path):
                all_files.append(rel_file_path)
    self.wp.progress_bar_total_inc(len(all_files))
    args = {
        'wp': self.wp,
        'base_path': base_path,
        'f_index_file': f_index_file,
        'all_files': all_files,
        'dir_path': dir_path
    }
    result = run_function_per_group(range(len(all_files)), 10000,
                                    function=self._adding_dir_work, arguments=args)
    if not result:
        return False
    self._full_idx.save_manifest_index()
    self._mf.save()
def test_remote_fsck(self):
    """remote_fsck must restore a blob that was deleted from the remote bucket."""
    testbucketname = os.getenv('MLGIT_TEST_BUCKET', 'ml-git-datasets')
    missing_key = 'zdj7WWsMkELZSGQGgpm5VieCWV8NxY5n5XEP73H4E7eeDMA3A'
    hfspath = os.path.join(self.tmp_dir, 'objectsfs')
    MultihashFS(hfspath).put(HDATA_IMG_1)
    s3 = boto3.resource(
        's3',
        region_name='us-east-1',
        aws_access_key_id='fake_access_key',
        aws_secret_access_key='fake_secret_key',
    )
    # Remove one chunk from the bucket so fsck has something to repair.
    s3.Object(testbucketname, missing_key).delete()
    self.assertRaises(botocore.exceptions.ClientError,
                      lambda: self.check_delete(s3, testbucketname))
    mdpath = os.path.join(self.tmp_dir, 'metadata-test')
    specpath = os.path.join(mdpath, 'vision-computing', 'images', 'dataset-ex')
    ensure_path_exists(specpath)
    yaml_save(get_sample_spec(testbucketname), os.path.join(specpath, 'dataset-ex.spec'))
    manifestpath = os.path.join(specpath, 'MANIFEST.yaml')
    yaml_save({'zdj7WjdojNAZN53Wf29rPssZamfbC6MVerzcGwd9tNciMpsQh': {'imghires.jpg'}},
              manifestpath)
    # os.path.join discards its first argument when the second is absolute,
    # so this nested join matches the original expression.
    fullspecpath = os.path.join(specpath, os.path.join(specpath, 'dataset-ex.spec'))
    repo = LocalRepository(yaml_load('hdata/config.yaml'), hfspath)
    ret = repo.remote_fsck(mdpath, 'vision-computing__images__dataset-ex__5',
                           fullspecpath, 2, True, True)
    self.assertTrue(ret)
    # After fsck the deleted object must exist again (load() returns None).
    self.assertEqual(None, s3.Object(testbucketname, missing_key).load())
def _add_single_file(self, base_path, manifestpath, file_path):
    """Add one file to the index, routing metadata files past the worker pool.

    Spec files, READMEs and the ml-git ignore file are registered as metadata
    directly; everything else is hashed and indexed through the worker pool.
    On any worker failure, the partial index manifest is saved before returning.
    """
    self.manifestfiles = yaml_load(manifestpath)
    f_index_file = self._full_idx.get_index()
    if (SPEC_EXTENSION in file_path) or ('README' in file_path) or (
            MLGIT_IGNORE_FILE_NAME in file_path):
        # Metadata files skip hashing; undo the progress-bar slot reserved for them.
        self.wp.progress_bar_total_inc(-1)
        self.add_metadata(base_path, file_path)
    else:
        self.wp.submit(self._add_file, base_path, file_path, f_index_file)
    futures = self.wp.wait()
    for future in futures:
        try:
            scid, filepath, previous_hash = future.result()
            # Only files that produced a content id are recorded in the index.
            self.update_index(scid, filepath, previous_hash) if scid is not None else None
        except Exception as e:
            # Save the manifest of files added to the index so far before bailing out.
            self._full_idx.save_manifest_index()
            self._mf.save()
            log.error(output_messages['ERROR_ADDING_DIR'] % (base_path, e),
                      class_name=MULTI_HASH_CLASS_NAME)
            return
    self.wp.reset_futures()
    self._full_idx.save_manifest_index()
    self._mf.save()
def _add_dir(self, dirpath, manifestpath, file_path='', trust_links=True):
    """Queue every file under *dirpath*/*file_path* for indexing via the worker pool.

    Args:
        dirpath: root directory of the entity's data.
        manifestpath: path of the MANIFEST.yaml loaded into self.manifestfiles.
        file_path: optional subdirectory restricting the walk.
        trust_links: accepted for API compatibility; not referenced in this body.

    Returns:
        False when the grouped indexing work fails; otherwise None after
        saving the index manifest.
    """
    self.manifestfiles = yaml_load(manifestpath)
    f_index_file = self._full_idx.get_index()
    all_files = []
    for root, dirs, files in os.walk(os.path.join(dirpath, file_path)):
        # Skip hidden roots (walk paths starting with '.').
        if '.' == root[0]:
            continue
        basepath = root[:len(dirpath) + 1:]
        relativepath = root[len(dirpath) + 1:]
        for file in files:
            all_files.append(os.path.join(relativepath, file))
    self.wp.progress_bar_total_inc(len(all_files))
    # NOTE(review): basepath is only bound inside the loop; if os.walk yields no
    # usable root this raises NameError — confirm callers guarantee the dir exists.
    args = {
        'wp': self.wp,
        'basepath': basepath,
        'f_index_file': f_index_file,
        'all_files': all_files,
        'dirpath': dirpath
    }
    result = run_function_per_group(range(len(all_files)), 10000,
                                    function=self._adding_dir_work, arguments=args)
    if not result:
        return False
    self._full_idx.save_manifest_index()
    self._mf.save()
def test_mount_blobs(self):
    """_mount_blobs downloads the chunk listed in an IPLD links node byte-for-byte."""
    testbucketname = os.getenv('MLGIT_TEST_BUCKET', 'ml-git-datasets')
    hfspath = os.path.join(self.tmp_dir, 'objectsfs')
    s3 = boto3.resource(
        's3',
        region_name='eu-west-1',
        aws_access_key_id='fake_access_key',
        aws_secret_access_key='fake_secret_key',
    )
    keypath = 'zdj7We7Je5MRECsZUF7uptseHHPY29zGoqFsVHw6sbgv1MbWS'
    file = os.path.join('hdata', keypath)
    # Upload the fixture blob under its content-addressed key.
    with open(file, 'rb') as f:
        s3.Bucket(testbucketname).Object(keypath).put(file, Body=f)
    repo = LocalRepository(yaml_load('hdata/config.yaml'), hfspath)
    s3store = S3Store(testbucketname, bucket)
    links = {'Links': [{'Hash': keypath, 'Size': 16822}]}
    with open(file, 'rb') as f:
        self.assertEqual(f.read(), repo._mount_blobs(s3store, links))
def test_get_ipld(self):
    """_get_ipld fetches and decodes the IPLD node stored under a key."""
    testbucketname = os.getenv('MLGIT_TEST_BUCKET', 'ml-git-datasets')
    hfspath = os.path.join(self.tmp_dir, 'objectsfs')
    s3 = boto3.resource(
        's3',
        region_name='eu-west-1',
        aws_access_key_id='fake_access_key',
        aws_secret_access_key='fake_secret_key',
    )
    keypath = 'zdj7WdjnTVfz5AhTavcpsDT62WiQo4AeQy6s4UC1BSEZYx4NP'
    file = os.path.join('hdata', keypath)
    # Upload the fixture IPLD node under its content-addressed key.
    with open(file, 'rb') as f:
        s3.Bucket(testbucketname).Object(keypath).put(file, Body=f)
    repo = LocalRepository(yaml_load('hdata/config.yaml'), hfspath)
    s3store = S3Store(testbucketname, bucket)
    expected_links = {
        'Links': [{
            'Hash': 'zdj7WVyQ8wTdnDXsbg8wxwwFkt2Bzp95Tncsfg8PCgKXeLTye',
            'Size': 16822
        }]
    }
    self.assertEqual(expected_links, repo._get_ipld(s3store, keypath))
def unlock_file(self, spec, file_path):
    """Unlock *file_path* of entity *spec* so it can be modified.

    Only proceeds when the entity spec's mutability is a valid non-strict
    value; under 'strict' mutability the command is rejected with an error.

    Args:
        spec: entity (spec) name whose file is to be unlocked.
        file_path: path of the file to unlock, relative to the entity data.
    """
    repo_type = self.__repo_type
    if not validate_config_spec_hash(self.__config):
        log.error(
            '.ml-git/config.yaml invalid. It should look something like this:\n%s' % get_yaml_str(
                get_sample_config_spec('somebucket', 'someprofile', 'someregion')),
            class_name=REPOSITORY_CLASS_NAME)
        return None
    path, file = None, None
    try:
        # Resolve all repository paths and locate the spec file for the current tag.
        refs_path = get_refs_path(self.__config, repo_type)
        objects_path = get_objects_path(self.__config, repo_type)
        index_path = get_index_path(self.__config, repo_type)
        cache_path = get_cache_path(self.__config, repo_type)
        ref = Refs(refs_path, spec, repo_type)
        tag, sha = ref.branch()
        categories_path = get_path_with_categories(tag)
        path, file = search_spec_file(self.__repo_type, spec, categories_path)
    except Exception as e:
        log.error(e, class_name=REPOSITORY_CLASS_NAME)
        return
    if path is None:
        return
    spec_path = os.path.join(path, file)
    spec_file = yaml_load(spec_path)
    try:
        mutability = spec_file[repo_type]['mutability']
        if mutability not in Mutability.list():
            log.error('Invalid mutability type.', class_name=REPOSITORY_CLASS_NAME)
            return
    except Exception:
        # NOTE(review): the message says the default is strict, but this branch
        # returns here instead of falling through to the strict-mode error
        # below — confirm the early return is intended.
        log.info(
            'The spec does not have the \'mutability\' property set. Default: strict.',
            class_name=REPOSITORY_CLASS_NAME)
        return
    if mutability != Mutability.STRICT.value:
        try:
            local = LocalRepository(self.__config, objects_path, repo_type)
            local.unlock_file(path, file_path, index_path, objects_path, spec, cache_path)
        except Exception as e:
            log.error(e, class_name=REPOSITORY_CLASS_NAME)
            return
    else:
        log.error(
            'You cannot use this command for this entity because mutability cannot be strict.',
            class_name=REPOSITORY_CLASS_NAME)