def test_add_metrics_file(self):
    hashfs_path = os.path.join(self.tmp_dir, 'objectsfs')
    test_config = yaml_load('hdata/config.yaml')
    local_repo = LocalRepository(test_config, hashfs_path, repo_type=MODELS)
    spec_path = os.path.join(self.tmp_dir, 'model-ex.spec')
    shutil.copy('hdata/dataset-ex.spec', spec_path)
    spec_file = yaml_load(spec_path)
    model = spec_file[DATASET_SPEC_KEY].copy()
    del spec_file[DATASET_SPEC_KEY]
    spec_file[MODEL_SPEC_KEY] = model
    yaml_save(spec_file, spec_path)
    metrics_file_path = os.path.join(self.tmp_dir, 'metrics.csv')
    self.create_csv_file(metrics_file_path, {'metric_a': 10, 'metric_b': 9})
    local_repo.add_metrics(spec_path, (), metrics_file_path)
    test_spec_file = yaml_load(spec_path)
    self.assertEqual(test_spec_file[MODEL_SPEC_KEY]['metrics'].get('metric_a', ''), 10.0)
    self.assertEqual(test_spec_file[MODEL_SPEC_KEY]['metrics'].get('metric_b', ''), 9.0)

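# A minimal sketch of the `create_csv_file` helper used above, assuming it
# lives in the test base class and writes a one-row CSV whose header holds
# the metric names (the real helper may differ):
import csv

def create_csv_file(path, data):
    # `data` is a dict such as {'metric_a': 10, 'metric_b': 9}
    with open(path, 'w', newline='') as csv_file:
        writer = csv.DictWriter(csv_file, fieldnames=list(data.keys()))
        writer.writeheader()
        writer.writerow(data)
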
def export(self, bucket, tag, retry):
    try:
        categories_path, spec_name, _ = spec_parse(tag)
        get_root_path()
        if not self._tag_exists(tag):
            return None, None
    except InvalidGitRepositoryError:
        log.error('You are not in an initialized ml-git repository.',
                  class_name=LOCAL_REPOSITORY_CLASS_NAME)
        return None, None
    except Exception as e:
        log.error(e, class_name=LOCAL_REPOSITORY_CLASS_NAME)
        return None, None
    try:
        self._checkout_ref(tag)
    except Exception:
        log.error('Unable to checkout to %s' % tag, class_name=REPOSITORY_CLASS_NAME)
        return None, None
    local = LocalRepository(self.__config,
                            get_objects_path(self.__config, self.__repo_type),
                            self.__repo_type)
    local.export_tag(get_metadata_path(self.__config, self.__repo_type), tag, bucket, retry)
    self._checkout_ref()

def test_fetch(self):
    mdpath = os.path.join(self.tmp_dir, 'metadata-test')
    testbucketname = os.getenv('MLGIT_TEST_BUCKET', 'ml-git-datasets')
    config_spec = get_sample_config_spec(testbucketname, testprofile, testregion)
    dataset_spec = get_sample_spec(testbucketname)
    specpath = os.path.join(mdpath, 'vision-computing', 'images', 'dataset-ex')
    ensure_path_exists(specpath)
    yaml_save(dataset_spec, os.path.join(specpath, 'dataset-ex.spec'))
    manifestpath = os.path.join(specpath, 'MANIFEST.yaml')
    yaml_save({'zdj7WjdojNAZN53Wf29rPssZamfbC6MVerzcGwd9tNciMpsQh': {'imghires.jpg'}},
              manifestpath)
    objectpath = os.path.join(self.tmp_dir, 'objects-test')
    spec = 'vision-computing__images__dataset-ex__5'
    r = LocalRepository(config_spec, objectpath)
    r.fetch(mdpath, spec, None)
    fs = set()
    for root, dirs, files in os.walk(objectpath):
        for file in files:
            fs.add(file)
    self.assertEqual(len(hs), len(fs))
    self.assertEqual(len(hs.difference(fs)), 0)

def status(self, spec, full_option, status_directory):
    repo_type = self.__repo_type
    try:
        objects_path = get_objects_path(self.__config, repo_type)
        repo = LocalRepository(self.__config, objects_path, repo_type)
        log.info('%s: status of ml-git index for [%s]' % (repo_type, spec),
                 class_name=REPOSITORY_CLASS_NAME)
        new_files, deleted_files, untracked_files, corrupted_files, changed_files = \
            repo.status(spec, status_directory)
    except Exception as e:
        log.error(e, class_name=REPOSITORY_CLASS_NAME)
        return
    if new_files is not None and deleted_files is not None and untracked_files is not None:
        print('Changes to be committed:')
        self._print_files(new_files, full_option, 'New file: ')
        self._print_files(deleted_files, full_option, 'Deleted: ')
        print('\nUntracked files:')
        self._print_files(untracked_files, full_option)
        print('\nCorrupted files:')
        self._print_files(corrupted_files, full_option)
        if changed_files and len(changed_files) > 0:
            print('\nChanges not staged for commit:')
            self._print_files(changed_files, full_option)

def test_remote_fsck(self):
    testbucketname = os.getenv('MLGIT_TEST_BUCKET', 'ml-git-datasets')
    hfspath = os.path.join(self.tmp_dir, 'objectsfs')
    ohfs = MultihashFS(hfspath)
    ohfs.put(HDATA_IMG_1)
    s3 = boto3.resource(
        's3',
        region_name='us-east-1',
        aws_access_key_id='fake_access_key',
        aws_secret_access_key='fake_secret_key',
    )
    s3.Object(testbucketname, 'zdj7WWsMkELZSGQGgpm5VieCWV8NxY5n5XEP73H4E7eeDMA3A').delete()
    self.assertRaises(botocore.exceptions.ClientError,
                      lambda: self.check_delete(s3, testbucketname))
    mdpath = os.path.join(self.tmp_dir, 'metadata-test')
    dataset_spec = get_sample_spec(testbucketname)
    specpath = os.path.join(mdpath, 'vision-computing', 'images', 'dataset-ex')
    ensure_path_exists(specpath)
    yaml_save(dataset_spec, os.path.join(specpath, 'dataset-ex.spec'))
    manifestpath = os.path.join(specpath, 'MANIFEST.yaml')
    yaml_save({'zdj7WjdojNAZN53Wf29rPssZamfbC6MVerzcGwd9tNciMpsQh': {'imghires.jpg'}},
              manifestpath)
    fullspecpath = os.path.join(specpath, 'dataset-ex.spec')
    spec = 'vision-computing__images__dataset-ex__5'
    c = yaml_load('hdata/config.yaml')
    r = LocalRepository(c, hfspath)
    ret = r.remote_fsck(mdpath, spec, fullspecpath, 2, True, True)
    self.assertTrue(ret)
    self.assertEqual(None, s3.Object(testbucketname,
                                     'zdj7WWsMkELZSGQGgpm5VieCWV8NxY5n5XEP73H4E7eeDMA3A').load())

def remote_fsck(self, spec, retries=2, thorough=False, paranoid=False):
    repo_type = self.__repo_type
    try:
        metadata_path = get_metadata_path(self.__config, repo_type)
        objects_path = get_objects_path(self.__config, repo_type)
        refs_path = get_refs_path(self.__config, repo_type)
        ref = Refs(refs_path, spec, repo_type)
        tag, sha = ref.branch()
        categories_path = get_path_with_categories(tag)
        self._checkout_ref(tag)
        spec_path, spec_file = search_spec_file(self.__repo_type, spec, categories_path)
    except Exception as e:
        log.error(e, class_name=REPOSITORY_CLASS_NAME)
        return
    if spec_path is None:
        return
    full_spec_path = os.path.join(spec_path, spec_file)
    r = LocalRepository(self.__config, objects_path, repo_type)
    r.remote_fsck(metadata_path, tag, full_spec_path, retries, thorough, paranoid)
    # ensure first we're on master !
    self._checkout_ref()

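# Hypothetical driver-level invocation of the remote_fsck pipeline above
# (entity name and flag values are illustrative, not from the source):
#
#   repository.remote_fsck('dataset-ex', retries=3, thorough=True, paranoid=False)
#
# `retries`, `thorough`, and `paranoid` are forwarded unchanged to
# LocalRepository.remote_fsck, which performs the actual store checks.
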
def test_mount_blobs(self):
    testbucketname = os.getenv('MLGIT_TEST_BUCKET', 'ml-git-datasets')
    hfspath = os.path.join(self.tmp_dir, 'objectsfs')
    s3 = boto3.resource(
        's3',
        region_name='eu-west-1',
        aws_access_key_id='fake_access_key',
        aws_secret_access_key='fake_secret_key',
    )
    keypath = 'zdj7We7Je5MRECsZUF7uptseHHPY29zGoqFsVHw6sbgv1MbWS'
    file = os.path.join('hdata', keypath)
    with open(file, 'rb') as f:
        # boto3 Object.put only accepts keyword arguments
        s3.Bucket(testbucketname).Object(keypath).put(Body=f)
    c = yaml_load('hdata/config.yaml')
    r = LocalRepository(c, hfspath)
    s3store = S3Store(testbucketname, bucket)
    links = {'Links': [{'Hash': keypath, 'Size': 16822}]}
    with open(file, 'rb') as f:
        self.assertEqual(f.read(), r._mount_blobs(s3store, links))

def test_get_ipld(self):
    testbucketname = os.getenv('MLGIT_TEST_BUCKET', 'ml-git-datasets')
    hfspath = os.path.join(self.tmp_dir, 'objectsfs')
    s3 = boto3.resource(
        's3',
        region_name='eu-west-1',
        aws_access_key_id='fake_access_key',
        aws_secret_access_key='fake_secret_key',
    )
    keypath = 'zdj7WdjnTVfz5AhTavcpsDT62WiQo4AeQy6s4UC1BSEZYx4NP'
    file = os.path.join('hdata', keypath)
    with open(file, 'rb') as f:
        # boto3 Object.put only accepts keyword arguments
        s3.Bucket(testbucketname).Object(keypath).put(Body=f)
    c = yaml_load('hdata/config.yaml')
    r = LocalRepository(c, hfspath)
    s3store = S3Store(testbucketname, bucket)
    links = {'Links': [{'Hash': 'zdj7WVyQ8wTdnDXsbg8wxwwFkt2Bzp95Tncsfg8PCgKXeLTye',
                        'Size': 16822}]}
    self.assertEqual(links, r._get_ipld(s3store, keypath))

def unlock_file(self, spec, file_path):
    repo_type = self.__repo_type
    if not validate_config_spec_hash(self.__config):
        log.error(
            '.ml-git/config.yaml invalid. It should look something like this:\n%s'
            % get_yaml_str(get_sample_config_spec('somebucket', 'someprofile', 'someregion')),
            class_name=REPOSITORY_CLASS_NAME)
        return None
    path, file = None, None
    try:
        refs_path = get_refs_path(self.__config, repo_type)
        objects_path = get_objects_path(self.__config, repo_type)
        index_path = get_index_path(self.__config, repo_type)
        cache_path = get_cache_path(self.__config, repo_type)
        ref = Refs(refs_path, spec, repo_type)
        tag, sha = ref.branch()
        categories_path = get_path_with_categories(tag)
        path, file = search_spec_file(self.__repo_type, spec, categories_path)
    except Exception as e:
        log.error(e, class_name=REPOSITORY_CLASS_NAME)
        return
    if path is None:
        return
    spec_path = os.path.join(path, file)
    spec_file = yaml_load(spec_path)
    try:
        mutability = spec_file[repo_type]['mutability']
        if mutability not in Mutability.list():
            log.error('Invalid mutability type.', class_name=REPOSITORY_CLASS_NAME)
            return
    except Exception:
        log.info('The spec does not have the \'mutability\' property set. Default: strict.',
                 class_name=REPOSITORY_CLASS_NAME)
        return
    if mutability != Mutability.STRICT.value:
        try:
            local = LocalRepository(self.__config, objects_path, repo_type)
            local.unlock_file(path, file_path, index_path, objects_path, spec, cache_path)
        except Exception as e:
            log.error(e, class_name=REPOSITORY_CLASS_NAME)
            return
    else:
        log.error('You cannot use this command for this entity because mutability cannot be strict.',
                  class_name=REPOSITORY_CLASS_NAME)

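# For reference, a hypothetical spec with the `mutability` property that
# `unlock_file` reads above; the layout is inferred from the
# `spec_file[repo_type]['mutability']` access, not copied from a real spec:
#
#   dataset:
#     mutability: flexible
#     name: dataset-ex
#     version: 5
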
def test_add_metrics_wrong_entity(self):
    hashfs_path = os.path.join(self.tmp_dir, 'objectsfs')
    test_config = yaml_load('hdata/config.yaml')
    local_repo = LocalRepository(test_config, hashfs_path)
    spec_path = os.path.join(self.tmp_dir, 'dataset-ex.spec')
    shutil.copy('hdata/dataset-ex.spec', spec_path)
    local_repo.add_metrics(spec_path, (('metric_a', '10'), ('metric_b', '9')), None)
    test_spec_file = yaml_load(spec_path)
    self.assertNotIn('metrics', test_spec_file[DATASET_SPEC_KEY])

def push(self, spec, retry=2, clear_on_fail=False):
    repo_type = self.__repo_type
    try:
        objects_path = get_objects_path(self.__config, repo_type)
        metadata_path = get_metadata_path(self.__config, repo_type)
        refs_path = get_refs_path(self.__config, repo_type)
    except Exception as e:
        log.error(e, class_name=REPOSITORY_CLASS_NAME)
        return
    met = Metadata(spec, metadata_path, self.__config, repo_type)
    fields = met.git_user_config()
    if None in fields.values():
        log.error('Your name and email address need to be configured in git. '
                  'Please see the commands below:', class_name=REPOSITORY_CLASS_NAME)
        log.error('git config --global user.name \'Your Name\'', class_name=REPOSITORY_CLASS_NAME)
        log.error('git config --global user.email [email protected]', class_name=REPOSITORY_CLASS_NAME)
        return
    if met.fetch() is False:
        return
    ref = Refs(refs_path, spec, repo_type)
    tag, sha = ref.branch()
    categories_path = get_path_with_categories(tag)
    spec_path, spec_file = None, None
    try:
        spec_path, spec_file = search_spec_file(self.__repo_type, spec, categories_path)
    except Exception as e:
        log.error(e, class_name=REPOSITORY_CLASS_NAME)
    if spec_path is None:
        return
    full_spec_path = os.path.join(spec_path, spec_file)
    repo = LocalRepository(self.__config, objects_path, repo_type)
    ret = repo.push(objects_path, full_spec_path, retry, clear_on_fail)
    # ensure first we're on master !
    met.checkout()
    if ret == 0:
        # push metadata spec to LocalRepository git repository
        try:
            met.push()
        except Exception as e:
            log.error(e, class_name=REPOSITORY_CLASS_NAME)
            return
        MultihashFS(objects_path).reset_log()

def _fetch(self, tag, samples, retries=2, bare=False):
    repo_type = self.__repo_type
    try:
        objects_path = get_objects_path(self.__config, repo_type)
        metadata_path = get_metadata_path(self.__config, repo_type)
        # check if no data is left untracked/uncommitted. Otherwise, stop.
        local_rep = LocalRepository(self.__config, objects_path, repo_type)
        return local_rep.fetch(metadata_path, tag, samples, retries, bare)
    except Exception as e:
        log.error(e, class_name=REPOSITORY_CLASS_NAME)
        return

def test_get_update_cache(self):
    hfspath = os.path.join(self.tmp_dir, 'objectsfs')
    ohfs = MultihashFS(hfspath)
    key = ohfs.put(HDATA_IMG_1)
    cachepath = os.path.join(self.tmp_dir, 'cachefs')
    cache = Cache(cachepath, '', '')
    testbucketname = os.getenv('MLGIT_TEST_BUCKET', 'ml-git-datasets')
    c = get_sample_config_spec(testbucketname, testprofile, testregion)
    r = LocalRepository(c, hfspath)
    r._update_cache(cache, key)
    self.assertTrue(os.path.exists(cache.get_keypath(key)))
    self.assertEqual(self.md5sum(HDATA_IMG_1), self.md5sum(cache.get_keypath(key)))

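# A plausible sketch of the `md5sum` helper the cache tests rely on
# (hypothetical; mirrors what the test base class is assumed to provide):
import hashlib

def md5sum(path, chunk_size=1024 * 1024):
    md5 = hashlib.md5()
    with open(path, 'rb') as f:
        for chunk in iter(lambda: f.read(chunk_size), b''):
            md5.update(chunk)
    return md5.hexdigest()
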
def test_add_metrics_with_none_metrics_options(self):
    hashfs_path = os.path.join(self.tmp_dir, 'objectsfs')
    test_config = yaml_load('hdata/config.yaml')
    local_repo = LocalRepository(test_config, hashfs_path, repo_type=MODELS)
    spec_path = os.path.join(self.tmp_dir, 'model-ex.spec')
    shutil.copy('hdata/dataset-ex.spec', spec_path)
    spec_file = yaml_load(spec_path)
    model = spec_file[DATASET_SPEC_KEY].copy()
    del spec_file[DATASET_SPEC_KEY]
    spec_file[MODEL_SPEC_KEY] = model
    yaml_save(spec_file, spec_path)
    local_repo.add_metrics(spec_path, (), None)
    test_spec_file = yaml_load(spec_path)
    self.assertNotIn('metrics', test_spec_file[MODEL_SPEC_KEY])

def create(self, kwargs):
    artifact_name = kwargs['artifact_name']
    categories = list(kwargs['category'])
    version = int(kwargs['version_number'])
    imported_dir = kwargs['import']
    store_type = kwargs['store_type']
    bucket_name = kwargs['bucket_name']
    start_wizard = kwargs['wizard_config']
    import_url = kwargs['import_url']
    unzip_file = kwargs['unzip']
    credentials_path = kwargs['credentials_path']
    repo_type = self.__repo_type
    try:
        create_workspace_tree_structure(repo_type, artifact_name, categories, store_type,
                                        bucket_name, version, imported_dir, kwargs['mutability'])
        if start_wizard:
            has_new_store, store_type, bucket, profile, endpoint_url, git_repo = \
                start_wizard_questions(repo_type)
            if has_new_store:
                store_add(store_type, bucket, profile, endpoint_url)
            update_store_spec(repo_type, artifact_name, store_type, bucket)
            remote_add(repo_type, git_repo)
        if import_url:
            self.create_config_store('gdrive', credentials_path)
            local = LocalRepository(self.__config, get_objects_path(self.__config, repo_type))
            destine_path = os.path.join(repo_type, artifact_name, 'data')
            local.import_file_from_url(destine_path, import_url, StoreType.GDRIVE.value)
        if unzip_file:
            log.info('Unzipping files', class_name=REPOSITORY_CLASS_NAME)
            data_path = os.path.join(get_root_path(), repo_type, artifact_name, 'data')
            unzip_files_in_directory(data_path)
        log.info('Project Created.', class_name=REPOSITORY_CLASS_NAME)
    except Exception as e:
        if not isinstance(e, PermissionError):
            clear(os.path.join(repo_type, artifact_name))
        if isinstance(e, KeyboardInterrupt):
            log.info('Create command aborted!', class_name=REPOSITORY_CLASS_NAME)
        else:
            log.error(e, class_name=REPOSITORY_CLASS_NAME)

def test_push(self):
    mlgit_dir = os.path.join(self.tmp_dir, '.ml-git')
    indexpath = os.path.join(mlgit_dir, 'index-test')
    mdpath = os.path.join(mlgit_dir, 'metadata-test')
    objectpath = os.path.join(mlgit_dir, 'objects-test')
    specpath = os.path.join(mdpath, 'vision-computing/images/dataset-ex')
    ensure_path_exists(specpath)
    ensure_path_exists(indexpath)
    shutil.copy('hdata/dataset-ex.spec', specpath + '/dataset-ex.spec')
    shutil.copy('hdata/config.yaml', mlgit_dir + '/config.yaml')
    manifestpath = os.path.join(specpath, 'MANIFEST.yaml')
    yaml_save({'zdj7WjdojNAZN53Wf29rPssZamfbC6MVerzcGwd9tNciMpsQh': {'imghires.jpg'}},
              manifestpath)
    # adds chunks to ml-git Index
    idx = MultihashIndex(specpath, indexpath, objectpath)
    idx.add('data-test-push/', manifestpath)
    fi = yaml_load(os.path.join(specpath, 'INDEX.yaml'))
    self.assertTrue(len(fi) > 0)
    self.assertTrue(os.path.exists(indexpath))
    o = Objects(specpath, objectpath)
    o.commit_index(indexpath, self.tmp_dir)
    self.assertTrue(os.path.exists(objectpath))
    c = yaml_load('hdata/config.yaml')
    r = LocalRepository(c, objectpath)
    r.push(objectpath, specpath + '/dataset-ex.spec')
    s3 = boto3.resource(
        's3',
        region_name='eu-west-1',
        aws_access_key_id='fake_access_key',
        aws_secret_access_key='fake_secret_key',
    )
    for key in idx.get_index():
        self.assertIsNotNone(s3.Object(testbucketname, key))

def test_add_metrics(self):
    hashfs_path = os.path.join(self.tmp_dir, 'objectsfs')
    test_config = yaml_load('hdata/config.yaml')
    local_repo = LocalRepository(test_config, hashfs_path, repo_type=MODELS)
    spec_path = os.path.join(self.tmp_dir, 'model-ex.spec')
    shutil.copy('hdata/dataset-ex.spec', spec_path)
    spec_file = yaml_load(spec_path)
    model = spec_file[DATASET_SPEC_KEY].copy()
    del spec_file[DATASET_SPEC_KEY]
    spec_file[MODEL_SPEC_KEY] = model
    yaml_save(spec_file, spec_path)
    local_repo.add_metrics(spec_path, (('metric_a', '10'), ('metric_b', '9')), None)
    test_spec_file = yaml_load(spec_path)
    self.assertEqual(test_spec_file[MODEL_SPEC_KEY]['metrics'].get('metric_a', ''), 10.0)
    self.assertEqual(test_spec_file[MODEL_SPEC_KEY]['metrics'].get('metric_b', ''), 9.0)

def test_get_update_links_wspace(self):
    wspath = os.path.join(self.tmp_dir, 'wspace')
    hfspath = os.path.join(self.tmp_dir, 'objectsfs')
    ohfs = MultihashFS(hfspath)
    key = ohfs.put(HDATA_IMG_1)
    fidx = FullIndex(self.tmp_dir, self.tmp_dir)
    cachepath = os.path.join(self.tmp_dir, 'cachefs')
    cache = Cache(cachepath, '', '')
    testbucketname = os.getenv('MLGIT_TEST_BUCKET', 'ml-git-datasets')
    c = get_sample_config_spec(testbucketname, testprofile, testregion)
    r = LocalRepository(c, hfspath)
    r._update_cache(cache, key)
    mfiles = {}
    files = {DATA_IMG_1}
    r._update_links_wspace(cache, fidx, files, key, wspath, mfiles, Status.u.name, 'strict')
    wspace_file = os.path.join(wspath, DATA_IMG_1)
    set_write_read(wspace_file)
    self.assertTrue(os.path.exists(wspace_file))
    self.assertEqual(self.md5sum(HDATA_IMG_1), self.md5sum(wspace_file))
    st = os.stat(wspace_file)
    fi = fidx.get_index()
    for k, v in fi.items():
        self.assertEqual(k, os.path.join('data', 'imghires.jpg'))
        self.assertEqual(v['hash'], 'zdj7WjdojNAZN53Wf29rPssZamfbC6MVerzcGwd9tNciMpsQh')
        self.assertEqual(v['status'], 'u')
        self.assertEqual(v['ctime'], st.st_ctime)
        self.assertEqual(v['mtime'], st.st_mtime)
    self.assertEqual(st.st_nlink, 2)
    self.assertEqual(mfiles, {DATA_IMG_1: 'zdj7WjdojNAZN53Wf29rPssZamfbC6MVerzcGwd9tNciMpsQh'})

def test_push(self):
    indexpath = os.path.join(self.tmp_dir, 'index-test')
    mdpath = os.path.join(self.tmp_dir, 'metadata-test')
    objectpath = os.path.join(self.tmp_dir, 'objects-test')
    specpath = os.path.join(mdpath, 'vision-computing/images/dataset-ex')
    ensure_path_exists(indexpath)
    ensure_path_exists(specpath)
    shutil.copy('hdata/dataset-ex.spec', specpath + '/dataset-ex.spec')
    manifestpath = os.path.join(specpath, 'MANIFEST.yaml')
    yaml_save(files_mock, manifestpath)
    idx = MultihashIndex(specpath, indexpath, objectpath)
    idx.add('data-test-push-1/', manifestpath)
    fidx = FullIndex(specpath, indexpath)
    self.assertTrue(os.path.exists(indexpath))
    c = yaml_load('hdata/config.yaml')
    o = Objects(specpath, objectpath)
    o.commit_index(indexpath, self.tmp_dir)
    self.assertTrue(os.path.exists(objectpath))
    r = LocalRepository(c, objectpath)
    self.assertEqual(r.push(objectpath, specpath + '/dataset-ex.spec'), 0)
    self.assertEqual(len(fidx.get_index()), 1)

def test_import_files(self):
    path_obj = os.path.join(self.tmp_dir, 'objects')
    c = yaml_load('hdata/config.yaml')
    r = LocalRepository(c, path_obj)
    r.change_config_store(testprofile, testbucketname, 's3', region=None, endpoint_url=None)
    r.import_files(None, None, self.tmp_dir, 2, '{}://{}'.format('s3', testbucketname))
    for h in hs:
        file_path = os.path.join(self.tmp_dir, h)
        dir_file = os.path.join('hdata', h)
        self.assertTrue(os.path.exists(file_path))
        self.assertTrue(filecmp.cmp(dir_file, file_path))

def import_files(self, object, path, directory, retry, bucket):
    err_msg = 'Invalid ml-git project!'
    try:
        root = get_root_path()
        root_dir = os.path.join(root, directory)
    except Exception:
        log.error(err_msg, class_name=REPOSITORY_CLASS_NAME)
        return
    local = LocalRepository(self.__config,
                            get_objects_path(self.__config, self.__repo_type),
                            self.__repo_type)
    bucket_name = bucket['bucket_name']
    store_type = bucket['store_type']
    local.change_config_store(bucket['profile'], bucket_name, store_type,
                              region=bucket['region'], endpoint_url=bucket['endpoint_url'])
    local.import_files(object, path, root_dir, retry, '{}://{}'.format(store_type, bucket_name))

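# A minimal usage sketch (values are hypothetical) showing the bucket dict
# shape `import_files` expects, inferred from the key accesses above:
bucket = {
    'bucket_name': 'some-bucket',
    'store_type': 's3',
    'profile': 'some-profile',
    'region': None,
    'endpoint_url': None,
}
# repository.import_files(None, None, 'imported-data', 2, bucket)
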
def commit(self, spec, specs, version=None, run_fsck=False, msg=None):
    # Move chunks from index to .ml-git/objects
    repo_type = self.__repo_type
    try:
        index_path = get_index_path(self.__config, repo_type)
        objects_path = get_objects_path(self.__config, repo_type)
        metadata_path = get_metadata_path(self.__config, repo_type)
        refs_path = get_refs_path(self.__config, repo_type)
        repo = LocalRepository(self.__config, objects_path, repo_type)
        mutability, check_mutability = repo.get_mutability_from_spec(spec, repo_type)
        if not mutability:
            return
        if not check_mutability:
            log.error('Spec mutability cannot be changed.', class_name=REPOSITORY_CLASS_NAME)
            return
    except Exception as e:
        log.error(e, class_name=REPOSITORY_CLASS_NAME)
        return
    ref = Refs(refs_path, spec, repo_type)
    tag, sha = ref.branch()
    categories_path = get_path_with_categories(tag)
    manifest_path = os.path.join(metadata_path, categories_path, spec, MANIFEST_FILE)
    path, file = None, None
    try:
        path, file = search_spec_file(self.__repo_type, spec, categories_path)
    except Exception as e:
        log.error(e, class_name=REPOSITORY_CLASS_NAME)
    if path is None:
        return None, None, None
    spec_path = os.path.join(path, file)
    idx = MultihashIndex(spec, index_path, objects_path)
    if version:
        set_version_in_spec(version, spec_path, self.__repo_type)
        idx.add_metadata(path, file)
    # Check tag before anything to avoid creating unstable state
    log.debug('Check if tag already exists', class_name=REPOSITORY_CLASS_NAME)
    m = Metadata(spec, metadata_path, self.__config, repo_type)
    if not m.check_exists():
        log.error('The %s has not been initialized' % self.__repo_type,
                  class_name=REPOSITORY_CLASS_NAME)
        return
    full_metadata_path, categories_sub_path, metadata = m.tag_exists(index_path)
    if metadata is None:
        return None
    log.debug('%s -> %s' % (index_path, objects_path), class_name=REPOSITORY_CLASS_NAME)
    # commit objects in index to ml-git objects
    o = Objects(spec, objects_path)
    changed_files, deleted_files = o.commit_index(index_path, path)
    bare_mode = os.path.exists(os.path.join(index_path, 'metadata', spec, 'bare'))
    if not bare_mode:
        manifest = m.get_metadata_manifest(manifest_path)
        self._remove_deleted_files(idx, index_path, m, manifest, spec, deleted_files)
        m.remove_files_added_after_base_tag(manifest, path)
    else:
        tag, _ = ref.branch()
        self._checkout_ref(tag)
    # update metadata spec & README.md
    # option --dataset-spec --labels-spec
    tag, sha = m.commit_metadata(index_path, specs, msg, changed_files, mutability, path)
    # update ml-git ref spec HEAD == to new SHA-1 / tag
    if tag is None:
        return None
    ref = Refs(refs_path, spec, repo_type)
    ref.update_head(tag, sha)
    # Run file check
    if run_fsck:
        self.fsck()
    return tag

def _checkout(self, tag, samples, retries=2, force_get=False, dataset=False, labels=False, bare=False):
    repo_type = self.__repo_type
    try:
        cache_path = get_cache_path(self.__config, repo_type)
        metadata_path = get_metadata_path(self.__config, repo_type)
        objects_path = get_objects_path(self.__config, repo_type)
        refs_path = get_refs_path(self.__config, repo_type)
        # find out actual workspace path to save data
        if not self._tag_exists(tag):
            return None, None
        categories_path, spec_name, _ = spec_parse(tag)
        dataset_tag = None
        labels_tag = None
        root_path = get_root_path()
        ws_path = os.path.join(root_path, os.sep.join([repo_type, categories_path]))
        ensure_path_exists(ws_path)
    except Exception as e:
        log.error(e, class_name=LOCAL_REPOSITORY_CLASS_NAME)
        return None, None
    ref = Refs(refs_path, spec_name, repo_type)
    cur_tag, _ = ref.branch()
    if cur_tag == tag:
        log.info('already at tag [%s]' % tag, class_name=REPOSITORY_CLASS_NAME)
        return None, None
    local_rep = LocalRepository(self.__config, objects_path, repo_type)
    # check if no data left untracked/uncommitted. otherwise, stop.
    if not force_get and local_rep.exist_local_changes(spec_name) is True:
        return None, None
    try:
        self._checkout_ref(tag)
    except Exception:
        log.error('Unable to checkout to %s' % tag, class_name=REPOSITORY_CLASS_NAME)
        return None, None
    spec_path = os.path.join(metadata_path, categories_path, spec_name + '.spec')
    if dataset is True:
        dataset_tag = get_entity_tag(spec_path, repo_type, 'dataset')
    if labels is True:
        labels_tag = get_entity_tag(spec_path, repo_type, 'labels')
    fetch_success = self._fetch(tag, samples, retries, bare)
    if not fetch_success:
        objs = Objects('', objects_path)
        objs.fsck(remove_corrupted=True)
        self._checkout_ref('master')
        return None, None
    try:
        spec_index_path = os.path.join(get_index_metadata_path(self.__config, repo_type), spec_name)
    except Exception:
        return
    if os.path.exists(spec_index_path):
        if os.path.exists(os.path.join(spec_index_path, spec_name + '.spec')):
            os.unlink(os.path.join(spec_index_path, spec_name + '.spec'))
        if os.path.exists(os.path.join(spec_index_path, 'README.md')):
            os.unlink(os.path.join(spec_index_path, 'README.md'))
    try:
        r = LocalRepository(self.__config, objects_path, repo_type)
        r.checkout(cache_path, metadata_path, objects_path, ws_path, tag, samples, bare)
    except OSError as e:
        self._checkout_ref('master')
        if e.errno == errno.ENOSPC:
            log.error('There is not enough space in the disk. Remove some files and try again.',
                      class_name=REPOSITORY_CLASS_NAME)
        else:
            log.error('An error occurred while creating the files into workspace: %s \n.' % e,
                      class_name=REPOSITORY_CLASS_NAME)
        return None, None
    except Exception as e:
        self._checkout_ref('master')
        log.error('An error occurred while creating the files into workspace: %s \n.' % e,
                  class_name=REPOSITORY_CLASS_NAME)
        return None, None
    m = Metadata('', metadata_path, self.__config, repo_type)
    sha = m.sha_from_tag(tag)
    ref.update_head(tag, sha)
    # restore to master/head
    self._checkout_ref('master')
    return dataset_tag, labels_tag

def _checkout(self, tag, samples, options):
    dataset = options['with_dataset']
    labels = options['with_labels']
    retries = options['retry']
    force_get = options['force']
    bare = options['bare']
    version = options['version']
    repo_type = self.__repo_type
    try:
        cache_path = get_cache_path(self.__config, repo_type)
        metadata_path = get_metadata_path(self.__config, repo_type)
        objects_path = get_objects_path(self.__config, repo_type)
        refs_path = get_refs_path(self.__config, repo_type)
        if not re.search(RGX_TAG_FORMAT, tag):
            metadata = Metadata(tag, metadata_path, self.__config, repo_type)
            tag = metadata.get_tag(tag, version)
            if not tag:
                return None, None
        elif not self._tag_exists(tag):
            return None, None
        categories_path, spec_name, _ = spec_parse(tag)
        root_path = get_root_path()
        ws_path = os.path.join(root_path, os.sep.join([repo_type, categories_path]))
    except Exception as e:
        log.error(e, class_name=LOCAL_REPOSITORY_CLASS_NAME)
        return None, None
    ref = Refs(refs_path, spec_name, repo_type)
    cur_tag, _ = ref.branch()
    if cur_tag == tag:
        log.info('already at tag [%s]' % tag, class_name=REPOSITORY_CLASS_NAME)
        return None, None
    local_rep = LocalRepository(self.__config, objects_path, repo_type)
    # check if no data left untracked/uncommitted. otherwise, stop.
    if not force_get and local_rep.exist_local_changes(spec_name) is True:
        return None, None
    try:
        self._checkout_ref(tag)
    except Exception:
        log.error('Unable to checkout to %s' % tag, class_name=REPOSITORY_CLASS_NAME)
        return None, None
    dataset_tag, labels_tag = self._get_related_tags(categories_path, dataset, labels,
                                                     metadata_path, repo_type, spec_name)
    fetch_success = self._fetch(tag, samples, retries, bare)
    if not fetch_success:
        objs = Objects('', objects_path)
        objs.fsck(remove_corrupted=True)
        self._checkout_ref()
        return None, None
    ensure_path_exists(ws_path)
    try:
        spec_index_path = os.path.join(get_index_metadata_path(self.__config, repo_type), spec_name)
    except Exception:
        return
    self._delete_spec_and_readme(spec_index_path, spec_name)
    try:
        r = LocalRepository(self.__config, objects_path, repo_type)
        r.checkout(cache_path, metadata_path, ws_path, tag, samples, bare)
    except OSError as e:
        self._checkout_ref()
        if e.errno == errno.ENOSPC:
            log.error('There is not enough space in the disk. Remove some files and try again.',
                      class_name=REPOSITORY_CLASS_NAME)
        else:
            log.error('An error occurred while creating the files into workspace: %s \n.' % e,
                      class_name=REPOSITORY_CLASS_NAME)
        return None, None
    except Exception as e:
        self._checkout_ref()
        log.error('An error occurred while creating the files into workspace: %s \n.' % e,
                  class_name=REPOSITORY_CLASS_NAME)
        return None, None
    m = Metadata('', metadata_path, self.__config, repo_type)
    sha = m.sha_from_tag(tag)
    ref.update_head(tag, sha)
    # restore to master/head
    self._checkout_ref()
    return dataset_tag, labels_tag

def test_get_update_links_wspace_with_duplicates(self):
    wspath = os.path.join(self.tmp_dir, 'wspace')
    hfspath = os.path.join(self.tmp_dir, 'objectsfs')
    ohfs = MultihashFS(hfspath)
    key = ohfs.put(HDATA_IMG_1)
    fidx = FullIndex(self.tmp_dir, self.tmp_dir)
    cachepath = os.path.join(self.tmp_dir, 'cachefs')
    cache = Cache(cachepath, '', '')
    testbucketname = os.getenv('MLGIT_TEST_BUCKET', 'ml-git-datasets')
    c = get_sample_config_spec(testbucketname, testprofile, testregion)
    r = LocalRepository(c, hfspath)
    r._update_cache(cache, key)
    mfiles = {}
    files = {DATA_IMG_1, DATA_IMG_2}
    args = {'obj_files': {key: files}, 'key': key, 'mutability': 'strict',
            'mfiles': mfiles, 'ws_path': wspath, 'cache': cache, 'fidx': fidx}
    r._update_links_wspace(key, Status.u.name, args)
    wspace_file = os.path.join(wspath, DATA_IMG_1)
    self.assertTrue(os.path.exists(wspace_file))
    self.assertEqual(self.md5sum(HDATA_IMG_1), self.md5sum(wspace_file))
    wspace_file = os.path.join(wspath, DATA_IMG_2)
    self.assertTrue(os.path.exists(wspace_file))
    self.assertEqual(self.md5sum(HDATA_IMG_1), self.md5sum(wspace_file))
    st = os.stat(wspace_file)
    self.assertEqual(st.st_nlink, 3)
    self.assertEqual(mfiles, {DATA_IMG_1: 'zdj7WjdojNAZN53Wf29rPssZamfbC6MVerzcGwd9tNciMpsQh',
                              DATA_IMG_2: 'zdj7WjdojNAZN53Wf29rPssZamfbC6MVerzcGwd9tNciMpsQh'})

# The duplicated setup block that followed re-created the whole fixture from
# scratch, which indicates a second test whose boundary was lost; it is
# restored here as its own test (the reconstructed name follows the
# `_remove_unused_links_wspace` method it exercises).
def test_remove_unused_links_wspace(self):
    wspath = os.path.join(self.tmp_dir, 'wspace')
    ensure_path_exists(wspath)
    to_be_removed = os.path.join(wspath, 'to_be_removed')
    with open(to_be_removed, 'w') as f:
        f.write('DEAD\n')
    hfspath = os.path.join(self.tmp_dir, 'objectsfs')
    ohfs = MultihashFS(hfspath)
    key = ohfs.put(HDATA_IMG_1)
    fidx = FullIndex(self.tmp_dir, self.tmp_dir)
    cachepath = os.path.join(self.tmp_dir, 'cachefs')
    cache = Cache(cachepath, '', '')
    c = yaml_load('hdata/config.yaml')
    r = LocalRepository(c, hfspath)
    r._update_cache(cache, key)
    mfiles = {}
    files = {DATA_IMG_1, DATA_IMG_2}
    args = {'obj_files': {key: files}, 'key': key, 'mutability': 'strict',
            'mfiles': mfiles, 'ws_path': wspath, 'cache': cache, 'fidx': fidx}
    r._update_links_wspace(key, Status.u.name, args)
    r._remove_unused_links_wspace(wspath, mfiles)
    self.assertFalse(os.path.exists(to_be_removed))

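# Self-contained illustration (independent of ml-git) of why st_nlink is 3
# above: the cache entry and each workspace copy share one inode via hard links.
import os
import tempfile

tmp = tempfile.mkdtemp()
src = os.path.join(tmp, 'blob')
with open(src, 'w') as f:
    f.write('payload')
os.link(src, os.path.join(tmp, 'copy1'))
os.link(src, os.path.join(tmp, 'copy2'))
assert os.stat(src).st_nlink == 3  # one inode, three directory entries
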
def add(self, spec, file_path, bump_version=False, run_fsck=False):
    repo_type = self.__repo_type
    is_shared_objects = 'objects_path' in self.__config[repo_type]
    is_shared_cache = 'cache_path' in self.__config[repo_type]
    if not validate_config_spec_hash(self.__config):
        log.error(
            '.ml-git/config.yaml invalid. It should look something like this:\n%s'
            % get_yaml_str(get_sample_config_spec('somebucket', 'someprofile', 'someregion')),
            class_name=REPOSITORY_CLASS_NAME)
        return None
    path, file = None, None
    try:
        refs_path = get_refs_path(self.__config, repo_type)
        index_path = get_index_path(self.__config, repo_type)
        metadata_path = get_metadata_path(self.__config, repo_type)
        cache_path = get_cache_path(self.__config, repo_type)
        objects_path = get_objects_path(self.__config, repo_type)
        repo = LocalRepository(self.__config, objects_path, repo_type)
        mutability, check_mutability = repo.get_mutability_from_spec(spec, repo_type)
        sampling_flag = os.path.exists(os.path.join(index_path, 'metadata', spec, 'sampling'))
        if sampling_flag:
            log.error('You cannot add new data to an entity that is based on a checkout '
                      'with the --sampling option.', class_name=REPOSITORY_CLASS_NAME)
            return
        if not mutability:
            return
        if not check_mutability:
            log.error('Spec mutability cannot be changed.', class_name=REPOSITORY_CLASS_NAME)
            return
        if not self._has_new_data(repo, spec):
            return None
        ref = Refs(refs_path, spec, repo_type)
        tag, sha = ref.branch()
        categories_path = get_path_with_categories(tag)
        path, file = search_spec_file(self.__repo_type, spec, categories_path)
    except Exception as e:
        log.error(e, class_name=REPOSITORY_CLASS_NAME)
        return
    if path is None:
        return
    spec_path = os.path.join(path, file)
    if not self._is_spec_valid(spec_path):
        return None
    # Check tag before anything to avoid creating unstable state
    log.debug('Repository: check if tag already exists', class_name=REPOSITORY_CLASS_NAME)
    m = Metadata(spec, metadata_path, self.__config, repo_type)
    if not m.check_exists():
        log.error('The %s has not been initialized' % self.__repo_type,
                  class_name=REPOSITORY_CLASS_NAME)
        return
    try:
        m.update()
    except Exception:
        pass
    # get version of current manifest file
    manifest = self._get_current_manifest_file(m, tag)
    try:
        # adds chunks to ml-git Index
        log.info('%s adding path [%s] to ml-git index' % (repo_type, path),
                 class_name=REPOSITORY_CLASS_NAME)
        with change_mask_for_routine(is_shared_objects):
            idx = MultihashIndex(spec, index_path, objects_path, mutability, cache_path)
            idx.add(path, manifest, file_path)
        # create hard links in ml-git Cache
        self.create_hard_links_in_cache(cache_path, index_path, is_shared_cache,
                                        mutability, path, spec)
    except Exception as e:
        log.error(e, class_name=REPOSITORY_CLASS_NAME)
        return None
    if bump_version and not increment_version_in_spec(spec_path, self.__repo_type):
        return None
    idx.add_metadata(path, file)
    self._check_corrupted_files(spec, repo)
    # Run file check
    if run_fsck:
        self.fsck()