def test_remove_corrupted_files(self):
    hfs = MultihashFS(self.tmp_dir, blocksize=1024 * 1024)
    corrupted_file_path = os.path.join(self.tmp_dir, 'corrupted_file')
    open(corrupted_file_path, 'a').close()
    self.assertTrue(os.path.exists(corrupted_file_path))
    hfs._remove_corrupted_files([corrupted_file_path], True)
    self.assertFalse(os.path.exists(corrupted_file_path))
def test_put1024K_pathexistence_level3(self):
    hfs = MultihashFS(self.tmp_dir, blocksize=1024 * 1024, levels=3)
    hfs.put('data/think-hires.jpg')
    fullpath = os.path.join(
        self.tmp_dir, 'hashfs', 'aU', 'No', 'RA',
        'zdj7WaUNoRAzciw2JJi69s2HjfCyzWt39BHCucCV2CsAX6vSv')
    self.assertTrue(os.path.exists(fullpath))
def test_remote_fsck(self):
    testbucketname = os.getenv('MLGIT_TEST_BUCKET', 'ml-git-datasets')
    hfspath = os.path.join(self.tmp_dir, 'objectsfs')
    ohfs = MultihashFS(hfspath)
    ohfs.put(HDATA_IMG_1)
    s3 = boto3.resource(
        's3',
        region_name='us-east-1',
        aws_access_key_id='fake_access_key',
        aws_secret_access_key='fake_secret_key',
    )
    s3.Object(testbucketname, 'zdj7WWsMkELZSGQGgpm5VieCWV8NxY5n5XEP73H4E7eeDMA3A').delete()
    self.assertRaises(botocore.exceptions.ClientError, lambda: self.check_delete(s3, testbucketname))

    mdpath = os.path.join(self.tmp_dir, 'metadata-test')
    dataset_spec = get_sample_spec(testbucketname)
    specpath = os.path.join(mdpath, 'vision-computing', 'images', 'dataset-ex')
    ensure_path_exists(specpath)
    yaml_save(dataset_spec, os.path.join(specpath, 'dataset-ex.spec'))
    manifestpath = os.path.join(specpath, 'MANIFEST.yaml')
    yaml_save({'zdj7WjdojNAZN53Wf29rPssZamfbC6MVerzcGwd9tNciMpsQh': {'imghires.jpg'}}, manifestpath)
    fullspecpath = os.path.join(specpath, os.path.join(specpath, 'dataset-ex.spec'))
    spec = 'vision-computing__images__dataset-ex__5'
    c = yaml_load('hdata/config.yaml')
    r = LocalRepository(c, hfspath)
    ret = r.remote_fsck(mdpath, spec, fullspecpath, 2, True, True)
    self.assertTrue(ret)
    self.assertEqual(None, s3.Object(testbucketname, 'zdj7WWsMkELZSGQGgpm5VieCWV8NxY5n5XEP73H4E7eeDMA3A').load())
def __init__(self, spec, index_path, object_path, mutability=Mutability.STRICT.value, cache_path=None):
    self._spec = spec
    self._path = index_path
    self._hfs = MultihashFS(object_path)
    self._mf = self._get_index(index_path)
    self._full_idx = FullIndex(spec, index_path, mutability)
    self._cache = cache_path
def test_get_update_links_wspace(self):
    wspath = os.path.join(self.tmp_dir, 'wspace')
    hfspath = os.path.join(self.tmp_dir, 'objectsfs')
    ohfs = MultihashFS(hfspath)
    key = ohfs.put(HDATA_IMG_1)
    fidx = FullIndex(self.tmp_dir, self.tmp_dir)
    cachepath = os.path.join(self.tmp_dir, 'cachefs')
    cache = Cache(cachepath, '', '')
    testbucketname = os.getenv('MLGIT_TEST_BUCKET', 'ml-git-datasets')
    c = get_sample_config_spec(testbucketname, testprofile, testregion)
    r = LocalRepository(c, hfspath)
    r._update_cache(cache, key)
    mfiles = {}
    files = {DATA_IMG_1}
    r._update_links_wspace(cache, fidx, files, key, wspath, mfiles, Status.u.name, 'strict')

    wspace_file = os.path.join(wspath, DATA_IMG_1)
    set_write_read(wspace_file)
    self.assertTrue(os.path.exists(wspace_file))
    self.assertEqual(self.md5sum(HDATA_IMG_1), self.md5sum(wspace_file))
    st = os.stat(wspace_file)
    fi = fidx.get_index()
    for k, v in fi.items():
        self.assertEqual(k, os.path.join('data', 'imghires.jpg'))
        self.assertEqual(v['hash'], 'zdj7WjdojNAZN53Wf29rPssZamfbC6MVerzcGwd9tNciMpsQh')
        self.assertEqual(v['status'], 'u')
        self.assertEqual(v['ctime'], st.st_ctime)
        self.assertEqual(v['mtime'], st.st_mtime)
    self.assertTrue(st.st_nlink == 2)
    self.assertEqual(mfiles, {DATA_IMG_1: 'zdj7WjdojNAZN53Wf29rPssZamfbC6MVerzcGwd9tNciMpsQh'})
def test_get_simple(self):
    original_file = 'data/think-hires.jpg'
    dst_file = os.path.join(self.tmp_dir, 'think-hires.jpg')
    hfs = MultihashFS(self.tmp_dir, blocksize=1024 * 1024)
    objkey = hfs.put(original_file)
    hfs.get(objkey, dst_file)
    self.assertEqual(self.md5sum(original_file), self.md5sum(dst_file))
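A minimal usage sketch of the MultihashFS round trip exercised by the tests above: store a file with put(), read it back with get(), and compare checksums. The temporary directory, the sample file path, and the hashlib-based checksum helper are illustrative assumptions, not part of the library, and the import path for MultihashFS depends on the ml-git version.

import hashlib
import os
import tempfile

# Sketch only; MultihashFS is imported as in the surrounding tests (module path varies by ml-git version),
# and 'data/think-hires.jpg' plus the md5 helper below are assumptions for illustration.
def _md5(path):
    h = hashlib.md5()
    with open(path, 'rb') as f:
        for block in iter(lambda: f.read(1024 * 1024), b''):
            h.update(block)
    return h.hexdigest()

tmp_dir = tempfile.mkdtemp()
hfs = MultihashFS(tmp_dir, blocksize=1024 * 1024)   # chunked, content-addressed store
key = hfs.put('data/think-hires.jpg')               # returns the multihash key for the stored file
dst = os.path.join(tmp_dir, 'copy.jpg')
hfs.get(key, dst)                                   # reassembles the chunks into dst
assert _md5('data/think-hires.jpg') == _md5(dst)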
def test_put1024K_toomany_levels(self):
    hfs = MultihashFS(self.tmp_dir, blocksize=1024 * 1024, levels=23)
    hfs.put('data/think-hires.jpg')
    fullpath = os.path.join(
        self.tmp_dir, 'hashfs', 'aU', 'No', 'RA', 'zc', 'iw', '2J', 'Ji', '69', 's2',
        'Hj', 'fC', 'yz', 'Wt', '39', 'BH', 'Cu', 'cC', 'V2', 'Cs', 'AX', '6v', 'Sv',
        'zdj7WaUNoRAzciw2JJi69s2HjfCyzWt39BHCucCV2CsAX6vSv')
    self.assertTrue(os.path.exists(fullpath))
def test_corruption(self):
    original_file = 'data/think-hires.jpg'
    dst_file = os.path.join(self.tmp_dir, 'think-hires.jpg')
    hfs = MultihashFS(self.tmp_dir, blocksize=1024 * 1024)
    objkey = hfs.put(original_file)
    chunk = os.path.join(self.tmp_dir, 'hashfs', 'aU', 'No',
                         'zdj7WaUNoRAzciw2JJi69s2HjfCyzWt39BHCucCV2CsAX6vSv')
    with open(chunk, 'wb') as f:
        f.write(b'blabla')
    self.assertFalse(hfs.get(objkey, dst_file))
    self.assertFalse(os.path.exists(dst_file))
def test_get_update_cache(self):
    hfspath = os.path.join(self.tmp_dir, 'objectsfs')
    ohfs = MultihashFS(hfspath)
    key = ohfs.put(HDATA_IMG_1)
    cachepath = os.path.join(self.tmp_dir, 'cachefs')
    cache = Cache(cachepath, '', '')
    testbucketname = os.getenv('MLGIT_TEST_BUCKET', 'ml-git-datasets')
    c = get_sample_config_spec(testbucketname, testprofile, testregion)
    r = LocalRepository(c, hfspath)
    r._update_cache(cache, key)
    self.assertTrue(os.path.exists(cache.get_keypath(key)))
    self.assertEqual(self.md5sum(HDATA_IMG_1), self.md5sum(cache.get_keypath(key)))
def test_fsck(self):
    hfs = MultihashFS(self.tmp_dir, blocksize=1024 * 1024)
    original_file = 'data/think-hires.jpg'
    hfs.put(original_file)
    chunk = os.path.join(
        self.tmp_dir, 'hashfs', 'aU', 'No',
        'zdj7WaUNoRAzciw2JJi69s2HjfCyzWt39BHCucCV2CsAX6vSv')
    corrupted_files = hfs.fsck()
    self.assertTrue(len(corrupted_files) == 0)

    # Create a hard link placing the file on a wrong directory
    chunk_in_wrong_dir = os.path.join(
        self.tmp_dir, 'hashfs', 'aU', 'NB',
        'zdj7WaUNoRAzciw2JJi69s2HjfCyzWt39BHCucCV2CsAX6vSv')
    os.makedirs(os.path.join(self.tmp_dir, 'hashfs', 'aU', 'NB'))
    os.link(chunk, chunk_in_wrong_dir)
    corrupted_files = hfs.fsck()
    self.assertTrue(len(corrupted_files) == 1)
    self.assertTrue('zdj7WaUNoRAzciw2JJi69s2HjfCyzWt39BHCucCV2CsAX6vSv' in corrupted_files)

    with open(chunk, 'wb') as f:
        f.write(b'blabla')
    corrupted_files = hfs.fsck()
    self.assertTrue(len(corrupted_files) == 2)
    self.assertTrue('zdj7WaUNoRAzciw2JJi69s2HjfCyzWt39BHCucCV2CsAX6vSv' in corrupted_files)
def commit_objects(self, index_path, ws_path):
    added_files = []
    deleted_files = []
    idx = MultihashFS(self._objects_path)
    fidx = FullIndex(self.__spec, index_path)
    findex = fidx.get_index()
    log_path = os.path.join(self._logpath, 'store.log')
    with open(log_path, 'a') as log_file:
        for k, v in findex.items():
            if not os.path.exists(os.path.join(ws_path, k)):
                deleted_files.append(k)
            elif v['status'] == Status.a.name:
                idx.fetch_scid(v['hash'], log_file)
                v['status'] = Status.u.name
                if 'previous_hash' in v:
                    added_files.append((v['previous_hash'], k))
    fidx.get_manifest_index().save()
    return added_files, deleted_files
def push(self, spec, retry=2, clear_on_fail=False):
    repo_type = self.__repo_type
    try:
        objects_path = get_objects_path(self.__config, repo_type)
        metadata_path = get_metadata_path(self.__config, repo_type)
        refs_path = get_refs_path(self.__config, repo_type)
    except Exception as e:
        log.error(e, class_name=REPOSITORY_CLASS_NAME)
        return

    met = Metadata(spec, metadata_path, self.__config, repo_type)
    fields = met.git_user_config()
    if None in fields.values():
        log.error(
            'Your name and email address need to be configured in git. '
            'Please see the commands below:', class_name=REPOSITORY_CLASS_NAME)
        log.error('git config --global user.name \'Your Name\'', class_name=REPOSITORY_CLASS_NAME)
        log.error('git config --global user.email [email protected]', class_name=REPOSITORY_CLASS_NAME)
        return
    if met.fetch() is False:
        return

    ref = Refs(refs_path, spec, repo_type)
    tag, sha = ref.branch()
    categories_path = get_path_with_categories(tag)

    spec_path, spec_file = None, None
    try:
        spec_path, spec_file = search_spec_file(self.__repo_type, spec, categories_path)
    except Exception as e:
        log.error(e, class_name=REPOSITORY_CLASS_NAME)
    if spec_path is None:
        return
    full_spec_path = os.path.join(spec_path, spec_file)

    repo = LocalRepository(self.__config, objects_path, repo_type)
    ret = repo.push(objects_path, full_spec_path, retry, clear_on_fail)

    # ensure first we're on master !
    met.checkout()
    if ret == 0:
        # push metadata spec to LocalRepository git repository
        try:
            met.push()
        except Exception as e:
            log.error(e, class_name=REPOSITORY_CLASS_NAME)
            return
        MultihashFS(objects_path).reset_log()
def test_get_update_links_wspace_with_duplicates(self):
    wspath = os.path.join(self.tmp_dir, 'wspace')
    hfspath = os.path.join(self.tmp_dir, 'objectsfs')
    ohfs = MultihashFS(hfspath)
    key = ohfs.put(HDATA_IMG_1)
    fidx = FullIndex(self.tmp_dir, self.tmp_dir)
    cachepath = os.path.join(self.tmp_dir, 'cachefs')
    cache = Cache(cachepath, '', '')
    testbucketname = os.getenv('MLGIT_TEST_BUCKET', 'ml-git-datasets')
    c = get_sample_config_spec(testbucketname, testprofile, testregion)
    r = LocalRepository(c, hfspath)
    r._update_cache(cache, key)
    mfiles = {}
    files = {DATA_IMG_1, DATA_IMG_2}
    r._update_links_wspace(cache, fidx, files, key, wspath, mfiles, Status.u.name, 'strict')

    wspace_file = os.path.join(wspath, DATA_IMG_1)
    self.assertTrue(os.path.exists(wspace_file))
    self.assertEqual(self.md5sum(HDATA_IMG_1), self.md5sum(wspace_file))
    wspace_file = os.path.join(wspath, DATA_IMG_2)
    self.assertTrue(os.path.exists(wspace_file))
    self.assertEqual(self.md5sum(HDATA_IMG_1), self.md5sum(wspace_file))
    st = os.stat(wspace_file)
    self.assertTrue(st.st_nlink == 3)
    self.assertEqual(mfiles, {DATA_IMG_1: 'zdj7WjdojNAZN53Wf29rPssZamfbC6MVerzcGwd9tNciMpsQh',
                              DATA_IMG_2: 'zdj7WjdojNAZN53Wf29rPssZamfbC6MVerzcGwd9tNciMpsQh'})

    wspath = os.path.join(self.tmp_dir, 'wspace')
    ensure_path_exists(wspath)
    to_be_removed = os.path.join(wspath, 'to_be_removed')
    with open(to_be_removed, 'w') as f:
        f.write('DEAD\n')
    hfspath = os.path.join(self.tmp_dir, 'objectsfs')
    ohfs = MultihashFS(hfspath)
    key = ohfs.put(HDATA_IMG_1)
    fidx = FullIndex(self.tmp_dir, self.tmp_dir)
    cachepath = os.path.join(self.tmp_dir, 'cachefs')
    cache = Cache(cachepath, '', '')
    c = yaml_load('hdata/config.yaml')
    r = LocalRepository(c, hfspath)
    r._update_cache(cache, key)
    mfiles = {}
    files = {DATA_IMG_1, DATA_IMG_2}
    r._update_links_wspace(cache, fidx, files, key, wspath, mfiles, Status.u.name, 'strict')
    r._remove_unused_links_wspace(wspath, mfiles)
    self.assertFalse(os.path.exists(to_be_removed))
def test_fsck_with_remove_corrupted(self):
    hfs = MultihashFS(self.tmp_dir, blocksize=1024 * 1024)
    original_file = 'data/think-hires.jpg'
    hfs.put(original_file)
    chunk = os.path.join(self.tmp_dir, 'hashfs', 'aU', 'No',
                         'zdj7WaUNoRAzciw2JJi69s2HjfCyzWt39BHCucCV2CsAX6vSv')
    corrupted_files = hfs.fsck()
    self.assertTrue(len(corrupted_files) == 0)

    chunk_in_wrong_dir = os.path.join(self.tmp_dir, 'hashfs', 'aU', 'NB',
                                      'zdj7WaUNoRAzciw2JJi69s2HjfCyzWt39BHCucCV2CsAX6vSv')
    os.makedirs(os.path.join(self.tmp_dir, 'hashfs', 'aU', 'NB'))
    os.link(chunk, chunk_in_wrong_dir)
    corrupted_files = hfs.fsck(remove_corrupted=True)
    self.assertTrue(len(corrupted_files) == 1)
    self.assertTrue('zdj7WaUNoRAzciw2JJi69s2HjfCyzWt39BHCucCV2CsAX6vSv' in corrupted_files)
    self.assertFalse(os.path.exists(chunk_in_wrong_dir))
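The two fsck tests above suggest the following usage pattern: fsck() with no arguments only reports the keys of corrupted chunks, while fsck(remove_corrupted=True) also deletes the offending files. A minimal sketch under that assumption, reusing an hfs instance populated as in the tests:

# Sketch only; assumes hfs is a MultihashFS instance with objects already stored, as in the tests above.
corrupted = hfs.fsck()                 # report-only pass: returns the keys of corrupted chunks
if corrupted:
    hfs.fsck(remove_corrupted=True)    # second pass: drop the corrupted chunks from the local store
    # the affected objects can then be fetched again from the remote storage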
class MultihashIndex(object):

    def __init__(self, spec, index_path, object_path, mutability=MutabilityType.STRICT.value, cache_path=None):
        self._spec = spec
        self._path = index_path
        self._hfs = MultihashFS(object_path)
        self._mf = self._get_index(index_path)
        self._full_idx = FullIndex(spec, index_path, mutability)
        self._cache = cache_path

    def _get_index(self, idxpath):
        metadatapath = os.path.join(idxpath, 'metadata', self._spec)
        ensure_path_exists(metadatapath)
        mfpath = os.path.join(metadatapath, 'MANIFEST.yaml')
        return Manifest(mfpath)

    def _add_dir(self, dir_path, manifest_path, file_path='', ignore_rules=None):
        self.manifestfiles = yaml_load(manifest_path)
        f_index_file = self._full_idx.get_index()
        all_files = []
        for root, dirs, files in os.walk(os.path.join(dir_path, file_path)):
            base_path = root[:len(dir_path) + 1:]
            relative_path = root[len(dir_path) + 1:]
            if '.' == root[0] or should_ignore_file(ignore_rules, '{}/'.format(relative_path)):
                continue
            for file in files:
                file_path = os.path.join(relative_path, file)
                if ignore_rules is None or not should_ignore_file(ignore_rules, file_path):
                    all_files.append(file_path)
        self.wp.progress_bar_total_inc(len(all_files))
        args = {'wp': self.wp, 'base_path': base_path, 'f_index_file': f_index_file,
                'all_files': all_files, 'dir_path': dir_path}
        result = run_function_per_group(range(len(all_files)), 10000,
                                        function=self._adding_dir_work, arguments=args)
        if not result:
            return False
        self._full_idx.save_manifest_index()
        self._mf.save()

    def add(self, path, manifestpath, files=[]):
        self.wp = pool_factory(pb_elts=0, pb_desc='files')
        ignore_rules = get_ignore_rules(path)
        if len(files) > 0:
            single_files = filter(lambda x: os.path.isfile(os.path.join(path, x)), files)
            self.wp.progress_bar_total_inc(len(list(single_files)))
            for f in files:
                fullpath = os.path.join(path, f)
                if os.path.isdir(fullpath):
                    self._add_dir(path, manifestpath, f, ignore_rules=ignore_rules)
                elif os.path.isfile(fullpath):
                    if not should_ignore_file(ignore_rules, path):
                        self._add_single_file(path, manifestpath, f)
                else:
                    log.warn(output_messages['WARN_NOT_FOUND'] % fullpath, class_name=MULTI_HASH_CLASS_NAME)
        else:
            if os.path.isdir(path):
                self._add_dir(path, manifestpath, ignore_rules=ignore_rules)
        self.wp.progress_bar_close()

    def _adding_dir_work_future_process(self, futures, wp):
        for future in futures:
            scid, filepath, previous_hash = future.result()
            self.update_index(scid, filepath, previous_hash) if scid is not None else None
        wp.reset_futures()

    def _adding_dir_work(self, files, args):
        for k in files:
            file_path = args['all_files'][k]
            if (SPEC_EXTENSION in file_path) or (file_path == 'README.md') or (file_path == MLGIT_IGNORE_FILE_NAME):
                args['wp'].progress_bar_total_inc(-1)
                self.add_metadata(args['base_path'], file_path)
            else:
                args['wp'].submit(self._add_file, args['base_path'], file_path, args['f_index_file'])

        futures = self.wp.wait()
        try:
            self._adding_dir_work_future_process(futures, self.wp)
        except Exception as e:
            self._full_idx.save_manifest_index()
            self._mf.save()
            log.error(output_messages['ERROR_ADDING_DIR'] % (args['dir_path'], e), class_name=MULTI_HASH_CLASS_NAME)
            return False
        return True

    def _add_single_file(self, base_path, manifestpath, file_path):
        self.manifestfiles = yaml_load(manifestpath)
        f_index_file = self._full_idx.get_index()
        if (SPEC_EXTENSION in file_path) or ('README' in file_path) or (MLGIT_IGNORE_FILE_NAME in file_path):
            self.wp.progress_bar_total_inc(-1)
            self.add_metadata(base_path, file_path)
        else:
            self.wp.submit(self._add_file, base_path, file_path, f_index_file)
            futures = self.wp.wait()
            for future in futures:
                try:
                    scid, filepath, previous_hash = future.result()
                    self.update_index(scid, filepath, previous_hash) if scid is not None else None
                except Exception as e:
                    # save the manifest of files added to index so far
                    self._full_idx.save_manifest_index()
                    self._mf.save()
                    log.error(output_messages['ERROR_ADDING_DIR'] % (base_path, e), class_name=MULTI_HASH_CLASS_NAME)
                    return
            self.wp.reset_futures()
        self._full_idx.save_manifest_index()
        self._mf.save()

    def add_metadata(self, basepath, filepath, automatically_added=False):
        log.debug(output_messages['DEBUG_ADD_FILE'] % filepath, class_name=MULTI_HASH_CLASS_NAME)
        fullpath = os.path.join(basepath, filepath)
        metadatapath = os.path.join(self._path, 'metadata', self._spec)
        ensure_path_exists(metadatapath)
        dstpath = os.path.join(metadatapath, filepath)
        if not os.path.exists(dstpath):
            shutil.copy2(fullpath, dstpath)
        else:
            os.unlink(dstpath)
            shutil.copy2(fullpath, dstpath)
        if automatically_added:
            log.info(output_messages['INFO_FILE_AUTOMATICALLY_ADDED'].format(filepath),
                     class_name=MULTI_HASH_CLASS_NAME)

    # TODO add : stat to MANIFEST from original file ...
    def update_index(self, objectkey, filename, previous_hash=None):
        self._mf.add(objectkey, posix_path(filename), previous_hash)

    def remove_manifest(self):
        index_metadata_path = os.path.join(self._path, 'metadata', self._spec)
        try:
            os.unlink(os.path.join(index_metadata_path, 'MANIFEST.yaml'))
        except FileNotFoundError:
            pass

    def _save_index(self):
        self._mf.save()

    def get_index(self):
        return self._mf

    def _add_file(self, basepath, filepath, f_index_file):
        fullpath = os.path.join(basepath, filepath)
        metadatapath = os.path.join(self._path, 'metadata', self._spec)
        ensure_path_exists(metadatapath)
        scid = None
        check_file = f_index_file.get(posix_path(filepath))
        previous_hash = None
        if check_file is not None:
            if self._full_idx.check_and_update(filepath, check_file, self._hfs,
                                               posix_path(filepath), fullpath, self._cache):
                scid = self._hfs.put(fullpath)
                updated_check = f_index_file.get(posix_path(filepath))
                if 'previous_hash' in updated_check:
                    previous_hash = updated_check['previous_hash']
        else:
            scid = self._hfs.put(fullpath)
            self._full_idx.update_full_index(posix_path(filepath), fullpath, Status.a.name, scid)
        return scid, filepath, previous_hash

    def get(self, objectkey, path, file):
        log.info(output_messages['INFO_GETTING_FILE'] % file, class_name=MULTI_HASH_CLASS_NAME)
        dirs = os.path.dirname(file)
        fulldir = os.path.join(path, dirs)
        ensure_path_exists(fulldir)
        dstfile = os.path.join(path, file)
        return self._hfs.get(objectkey, dstfile)

    def reset(self):
        shutil.rmtree(self._path)
        os.mkdir(self._path)

    def fsck(self, entity_path):
        return self._full_idx.fsck(entity_path, self._hfs, self._cache)

    def update_index_manifest(self, hash_files):
        for key in hash_files:
            values = list(hash_files[key])
            for e in values:
                self._mf.add(key, e)
        self._save_index()

    def get_index_yaml(self):
        return self._full_idx

    def remove_deleted_files_index_manifest(self, deleted_files):
        manifest = self.get_index()
        for file in deleted_files:
            manifest.rm_file(file)
        manifest.save()

    def get_hashes_list(self):
        idx_yaml = self._full_idx.get_index()
        hashes_list = []
        for value in idx_yaml:
            hashes_list.append(idx_yaml[value]['hash'])
        return hashes_list
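A hedged sketch of how the MultihashIndex class above might be driven: the constructor takes the spec name plus index and object paths, add() walks a workspace directory against a MANIFEST.yaml, and get_index()/get_hashes_list() expose what was staged. All paths and the 'dataset-ex' spec name below are placeholder assumptions.

# Sketch only; paths and the 'dataset-ex' spec name are placeholder assumptions.
idx = MultihashIndex('dataset-ex', '/tmp/mlgit/index', '/tmp/mlgit/objects')
idx.add('/tmp/workspace/dataset-ex',                      # workspace directory to scan
        '/tmp/mlgit/index/metadata/dataset-ex/MANIFEST.yaml')  # manifest to compare against
manifest = idx.get_index()          # Manifest wrapper over index/metadata/<spec>/MANIFEST.yaml
staged_hashes = idx.get_hashes_list()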
class MultihashIndex(object):

    def __init__(self, spec, index_path, object_path, mutability=Mutability.STRICT.value, cache_path=None):
        self._spec = spec
        self._path = index_path
        self._hfs = MultihashFS(object_path)
        self._mf = self._get_index(index_path)
        self._full_idx = FullIndex(spec, index_path, mutability)
        self._cache = cache_path

    def _get_index(self, idxpath):
        metadatapath = os.path.join(idxpath, 'metadata', self._spec)
        ensure_path_exists(metadatapath)
        mfpath = os.path.join(metadatapath, 'MANIFEST.yaml')
        return Manifest(mfpath)

    def add(self, path, manifestpath, files=[]):
        self.wp = pool_factory(pb_elts=0, pb_desc='files')
        if len(files) > 0:
            single_files = filter(lambda x: os.path.isfile(os.path.join(path, x)), files)
            self.wp.progress_bar_total_inc(len(list(single_files)))
            for f in files:
                fullpath = os.path.join(path, f)
                if os.path.isdir(fullpath):
                    self._add_dir(path, manifestpath, f)
                elif os.path.isfile(fullpath):
                    self._add_single_file(path, manifestpath, f)
                else:
                    log.warn('[%s] Not found!' % fullpath, class_name=MULTI_HASH_CLASS_NAME)
        else:
            if os.path.isdir(path):
                self._add_dir(path, manifestpath)
        self.wp.progress_bar_close()

    def _adding_dir_work_future_process(self, futures, wp):
        for future in futures:
            scid, filepath, previous_hash = future.result()
            self.update_index(scid, filepath, previous_hash) if scid is not None else None
        wp.reset_futures()

    def _adding_dir_work(self, files, args):
        for k in files:
            filepath = args['all_files'][k]
            if ('.spec' in filepath) or ('README' in filepath):
                args['wp'].progress_bar_total_inc(-1)
                self.add_metadata(args['basepath'], filepath)
            else:
                args['wp'].submit(self._add_file, args['basepath'], filepath, args['f_index_file'])

        futures = self.wp.wait()
        try:
            self._adding_dir_work_future_process(futures, self.wp)
        except Exception as e:
            self._full_idx.save_manifest_index()
            self._mf.save()
            log.error('Error adding dir [%s] -- [%s]' % (args['dirpath'], e), class_name=MULTI_HASH_CLASS_NAME)
            return False
        return True

    def _add_dir(self, dirpath, manifestpath, file_path='', trust_links=True):
        self.manifestfiles = yaml_load(manifestpath)
        f_index_file = self._full_idx.get_index()
        all_files = []
        for root, dirs, files in os.walk(os.path.join(dirpath, file_path)):
            if '.' == root[0]:
                continue
            basepath = root[:len(dirpath) + 1:]
            relativepath = root[len(dirpath) + 1:]
            for file in files:
                all_files.append(os.path.join(relativepath, file))
        self.wp.progress_bar_total_inc(len(all_files))
        args = {'wp': self.wp, 'basepath': basepath, 'f_index_file': f_index_file,
                'all_files': all_files, 'dirpath': dirpath}
        result = run_function_per_group(range(len(all_files)), 10000,
                                        function=self._adding_dir_work, arguments=args)
        if not result:
            return False
        self._full_idx.save_manifest_index()
        self._mf.save()

    def _add_single_file(self, base_path, manifestpath, file_path):
        self.manifestfiles = yaml_load(manifestpath)
        f_index_file = self._full_idx.get_index()
        if ('.spec' in file_path) or ('README' in file_path):
            self.wp.progress_bar_total_inc(-1)
            self.add_metadata(base_path, file_path)
        else:
            self.wp.submit(self._add_file, base_path, file_path, f_index_file)
            futures = self.wp.wait()
            for future in futures:
                try:
                    scid, filepath, previous_hash = future.result()
                    self.update_index(scid, filepath, previous_hash) if scid is not None else None
                except Exception as e:
                    # save the manifest of files added to index so far
                    self._full_idx.save_manifest_index()
                    self._mf.save()
                    log.error('Error adding dir [%s] -- [%s]' % (base_path, e), class_name=MULTI_HASH_CLASS_NAME)
                    return
            self.wp.reset_futures()
        self._full_idx.save_manifest_index()
        self._mf.save()

    def add_metadata(self, basepath, filepath):
        log.debug('Add file [%s] to ml-git index' % filepath, class_name=MULTI_HASH_CLASS_NAME)
        fullpath = os.path.join(basepath, filepath)
        metadatapath = os.path.join(self._path, 'metadata', self._spec)
        ensure_path_exists(metadatapath)
        dstpath = os.path.join(metadatapath, filepath)
        if not os.path.exists(dstpath):
            shutil.copy2(fullpath, dstpath)
        else:
            os.unlink(dstpath)
            shutil.copy2(fullpath, dstpath)

    # TODO add : stat to MANIFEST from original file ...
    def update_index(self, objectkey, filename, previous_hash=None):
        self._mf.add(objectkey, posix_path(filename), previous_hash)

    def remove_manifest(self):
        index_metadata_path = os.path.join(self._path, 'metadata', self._spec)
        try:
            os.unlink(os.path.join(index_metadata_path, 'MANIFEST.yaml'))
        except FileNotFoundError:
            pass

    def _save_index(self):
        self._mf.save()

    def get_index(self):
        return self._mf

    def _add_file(self, basepath, filepath, f_index_file):
        fullpath = os.path.join(basepath, filepath)
        metadatapath = os.path.join(self._path, 'metadata', self._spec)
        ensure_path_exists(metadatapath)
        scid = None
        check_file = f_index_file.get(posix_path(filepath))
        previous_hash = None
        if check_file is not None:
            if self._full_idx.check_and_update(filepath, check_file, self._hfs,
                                               posix_path(filepath), fullpath, self._cache):
                scid = self._hfs.put(fullpath)
                updated_check = f_index_file.get(posix_path(filepath))
                if 'previous_hash' in updated_check:
                    previous_hash = updated_check['previous_hash']
        else:
            scid = self._hfs.put(fullpath)
            self._full_idx.update_full_index(posix_path(filepath), fullpath, Status.a.name, scid)
        return scid, filepath, previous_hash

    def get(self, objectkey, path, file):
        log.info('Getting file [%s] from local index' % file, class_name=MULTI_HASH_CLASS_NAME)
        dirs = os.path.dirname(file)
        fulldir = os.path.join(path, dirs)
        ensure_path_exists(fulldir)
        dstfile = os.path.join(path, file)
        return self._hfs.get(objectkey, dstfile)

    def reset(self):
        shutil.rmtree(self._path)
        os.mkdir(self._path)

    def fsck(self):
        return self._hfs.fsck()

    def update_index_manifest(self, hash_files):
        for key in hash_files:
            values = list(hash_files[key])
            for e in values:
                self._mf.add(key, e)
        self._save_index()

    def get_index_yalm(self):
        return self._full_idx

    def remove_deleted_files_index_manifest(self, deleted_files):
        manifest = self.get_index()
        for file in deleted_files:
            manifest.rm_file(file)
        manifest.save()
def test_put1024K(self):
    hfs = MultihashFS(self.tmp_dir, blocksize=1024 * 1024)
    hfs.put('data/think-hires.jpg')
    for files in hfs.walk():
        for file in files:
            self.assertTrue(file in chunks1024)
def reset(self, spec, reset_type, head):
    log.info(output_messages['INFO_INITIALIZING_RESET'] % (reset_type, head), class_name=REPOSITORY_CLASS_NAME)
    if (reset_type == '--soft' or reset_type == '--mixed') and head == HEAD:
        return
    try:
        repo_type = self.__repo_type
        metadata_path = get_metadata_path(self.__config, repo_type)
        index_path = get_index_path(self.__config, repo_type)
        refs_path = get_refs_path(self.__config, repo_type)
        object_path = get_objects_path(self.__config, repo_type)
        met = Metadata(spec, metadata_path, self.__config, repo_type)
        ref = Refs(refs_path, spec, repo_type)
        idx = MultihashIndex(spec, index_path, object_path)
        fidx = FullIndex(spec, index_path)
    except Exception as e:
        log.error(e, class_name=REPOSITORY_CLASS_NAME)
        return

    # get tag before reset
    tag = met.get_current_tag()
    categories_path = get_path_with_categories(str(tag))
    # current manifest file before reset
    manifest_path = os.path.join(metadata_path, categories_path, spec, MANIFEST_FILE)
    _manifest = Manifest(manifest_path).load()

    if head == HEAD_1:  # HEAD~1
        try:
            # reset the repo
            met.reset()
        except Exception:
            return

    # get tag after reset
    tag_after_reset = met.get_current_tag()
    sha = met.sha_from_tag(tag_after_reset)

    # update ml-git ref HEAD
    ref.update_head(str(tag_after_reset), sha)

    # get path to reset workspace in case of --hard
    path, file = None, None
    try:
        path, file = search_spec_file(self.__repo_type, spec, categories_path)
    except Exception as e:
        log.error(e, class_name=REPOSITORY_CLASS_NAME)
    if reset_type == '--hard' and path is None:
        return

    # get manifest from metadata after reset
    _manifest_changed = Manifest(manifest_path)
    hash_files, file_names = _manifest_changed.get_diff(_manifest)
    idx_mf = idx.get_index().load()

    if reset_type == '--soft':
        # add in index/metadata/<entity-name>/MANIFEST
        idx.update_index_manifest(idx_mf)
        idx.update_index_manifest(hash_files)
        fidx.update_index_status(file_names, Status.a.name)
    else:  # --hard or --mixed
        # remove hash from index/hashfs/store.log
        file_names.update(*idx_mf.values())
        objs = MultihashFS(index_path)
        for key_hash in hash_files:
            objs.remove_hash(key_hash)
        idx.remove_manifest()
        fidx.remove_from_index_yaml(file_names)
        fidx.remove_uncommitted()

    if reset_type == '--hard':
        # reset workspace
        remove_from_workspace(file_names, path, spec)
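The reset() flow above distinguishes --soft, --mixed and --hard against HEAD or HEAD~1. A hedged sketch of the three calls, assuming a repository object r exposing this method, the 'dataset-ex' spec name, and the HEAD_1 constant used above; the comments paraphrase the branches of the code, not documented behavior.

# Sketch only; r, 'dataset-ex' and the exact semantics noted in the comments are assumptions.
r.reset('dataset-ex', '--soft', HEAD_1)    # move metadata back one tag, keep files staged in the index
r.reset('dataset-ex', '--mixed', HEAD_1)   # also drop the affected hashes from the index
r.reset('dataset-ex', '--hard', HEAD_1)    # additionally remove the affected files from the workspace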