def test_fsck(self):
    """fsck must flag chunks that are misplaced or whose content no longer matches their multihash name."""
    hfs = MultihashFS(self.tmp_dir, blocksize=1024 * 1024)
    original_file = 'data/think-hires.jpg'
    hfs.put(original_file)
    chunk = os.path.join(
        self.tmp_dir, 'hashfs', 'aU', 'No',
        'zdj7WaUNoRAzciw2JJi69s2HjfCyzWt39BHCucCV2CsAX6vSv')

    # A freshly stored file must come back clean.
    corrupted_files = hfs.fsck()
    self.assertEqual(len(corrupted_files), 0)

    # Create a hard link placing the file on a wrong directory
    chunk_in_wrong_dir = os.path.join(
        self.tmp_dir, 'hashfs', 'aU', 'NB',
        'zdj7WaUNoRAzciw2JJi69s2HjfCyzWt39BHCucCV2CsAX6vSv')
    os.makedirs(os.path.join(self.tmp_dir, 'hashfs', 'aU', 'NB'))
    os.link(chunk, chunk_in_wrong_dir)
    corrupted_files = hfs.fsck()
    self.assertEqual(len(corrupted_files), 1)
    self.assertIn('zdj7WaUNoRAzciw2JJi69s2HjfCyzWt39BHCucCV2CsAX6vSv', corrupted_files)

    # Overwrite the original chunk so its bytes no longer hash to its name;
    # fsck should now report both the misplaced link and the damaged chunk.
    with open(chunk, 'wb') as f:
        f.write(b'blabla')
    corrupted_files = hfs.fsck()
    self.assertEqual(len(corrupted_files), 2)
    self.assertIn('zdj7WaUNoRAzciw2JJi69s2HjfCyzWt39BHCucCV2CsAX6vSv', corrupted_files)
def test_fsck_with_remove_corrupted(self):
    """fsck(remove_corrupted=True) must both report AND delete a misplaced chunk."""
    hfs = MultihashFS(self.tmp_dir, blocksize=1024 * 1024)
    original_file = 'data/think-hires.jpg'
    hfs.put(original_file)
    chunk = os.path.join(
        self.tmp_dir, 'hashfs', 'aU', 'No',
        'zdj7WaUNoRAzciw2JJi69s2HjfCyzWt39BHCucCV2CsAX6vSv')

    # A freshly stored file must come back clean.
    corrupted_files = hfs.fsck()
    self.assertEqual(len(corrupted_files), 0)

    # Hard-link the chunk into a wrong directory to simulate corruption.
    chunk_in_wrong_dir = os.path.join(
        self.tmp_dir, 'hashfs', 'aU', 'NB',
        'zdj7WaUNoRAzciw2JJi69s2HjfCyzWt39BHCucCV2CsAX6vSv')
    os.makedirs(os.path.join(self.tmp_dir, 'hashfs', 'aU', 'NB'))
    os.link(chunk, chunk_in_wrong_dir)

    corrupted_files = hfs.fsck(remove_corrupted=True)
    self.assertEqual(len(corrupted_files), 1)
    self.assertIn('zdj7WaUNoRAzciw2JJi69s2HjfCyzWt39BHCucCV2CsAX6vSv', corrupted_files)
    # The misplaced copy must have been removed from disk.
    self.assertFalse(os.path.exists(chunk_in_wrong_dir))
class MultihashIndex(object):
    """Staging index mapping workspace files to their multihash object keys.

    Walks a workspace directory, hashes each file into a content-addressable
    store (``MultihashFS``), and records the resulting object keys in a
    ``Manifest`` (MANIFEST.yaml) plus a ``FullIndex`` used for status checks.
    """

    def __init__(self, spec, index_path, object_path, mutability=Mutability.STRICT.value, cache_path=None):
        self._spec = spec
        self._path = index_path
        self._hfs = MultihashFS(object_path)
        self._mf = self._get_index(index_path)
        self._full_idx = FullIndex(spec, index_path, mutability)
        self._cache = cache_path

    def _get_index(self, idxpath):
        """Open (creating directories as needed) the MANIFEST.yaml for this spec."""
        metadatapath = os.path.join(idxpath, 'metadata', self._spec)
        ensure_path_exists(metadatapath)
        mfpath = os.path.join(metadatapath, 'MANIFEST.yaml')
        return Manifest(mfpath)

    def add(self, path, manifestpath, files=None):
        """Add ``files`` (or, when empty, the whole directory ``path``) to the index.

        Args:
            path: workspace root to add from.
            manifestpath: path of the manifest yaml to load.
            files: optional explicit list of file/dir names relative to ``path``;
                when omitted or empty the entire directory is walked.
        """
        # NOTE: the default used to be a shared mutable list (files=[]);
        # use None + local guard to avoid cross-call state.
        files = [] if files is None else files
        self.wp = pool_factory(pb_elts=0, pb_desc='files')
        if len(files) > 0:
            single_files = filter(lambda x: os.path.isfile(os.path.join(path, x)), files)
            self.wp.progress_bar_total_inc(len(list(single_files)))
            for f in files:
                fullpath = os.path.join(path, f)
                if os.path.isdir(fullpath):
                    self._add_dir(path, manifestpath, f)
                elif os.path.isfile(fullpath):
                    self._add_single_file(path, manifestpath, f)
                else:
                    log.warn('[%s] Not found!' % fullpath, class_name=MULTI_HASH_CLASS_NAME)
        else:
            if os.path.isdir(path):
                self._add_dir(path, manifestpath)
        self.wp.progress_bar_close()

    def _adding_dir_work_future_process(self, futures, wp):
        """Collect results from hashing futures and record them in the manifest."""
        for future in futures:
            scid, filepath, previous_hash = future.result()
            # scid is None for files that did not need re-hashing.
            if scid is not None:
                self.update_index(scid, filepath, previous_hash)
        wp.reset_futures()

    def _adding_dir_work(self, files, args):
        """Process one batch of file indices: submit hashing jobs, then harvest them.

        Returns False (after saving partial state) if any future raised.
        """
        for k in files:
            filepath = args['all_files'][k]
            # Spec and README files are copied as metadata, not hashed as data.
            if ('.spec' in filepath) or ('README' in filepath):
                args['wp'].progress_bar_total_inc(-1)
                self.add_metadata(args['basepath'], filepath)
            else:
                args['wp'].submit(self._add_file, args['basepath'], filepath, args['f_index_file'])
        futures = self.wp.wait()
        try:
            self._adding_dir_work_future_process(futures, self.wp)
        except Exception as e:
            # Persist what was indexed so far before giving up on this batch.
            self._full_idx.save_manifest_index()
            self._mf.save()
            log.error('Error adding dir [%s] -- [%s]' % (args['dirpath'], e),
                      class_name=MULTI_HASH_CLASS_NAME)
            return False
        return True

    def _add_dir(self, dirpath, manifestpath, file_path='', trust_links=True):
        """Walk ``dirpath/file_path`` and index every file found, in batches of 10000."""
        self.manifestfiles = yaml_load(manifestpath)
        f_index_file = self._full_idx.get_index()
        all_files = []
        for root, dirs, files in os.walk(os.path.join(dirpath, file_path)):
            if '.' == root[0]:
                continue
            # basepath is the workspace prefix (dirpath plus the path separator);
            # it is the same for every visited root.
            basepath = root[:len(dirpath)+1:]
            relativepath = root[len(dirpath)+1:]
            for file in files:
                all_files.append(os.path.join(relativepath, file))
        self.wp.progress_bar_total_inc(len(all_files))
        args = {'wp': self.wp, 'basepath': basepath, 'f_index_file': f_index_file,
                'all_files': all_files, 'dirpath': dirpath}
        result = run_function_per_group(range(len(all_files)), 10000,
                                        function=self._adding_dir_work, arguments=args)
        if not result:
            return False
        self._full_idx.save_manifest_index()
        self._mf.save()

    def _add_single_file(self, base_path, manifestpath, file_path):
        """Index one file; spec/README files are stored as metadata instead."""
        self.manifestfiles = yaml_load(manifestpath)
        f_index_file = self._full_idx.get_index()
        if ('.spec' in file_path) or ('README' in file_path):
            self.wp.progress_bar_total_inc(-1)
            self.add_metadata(base_path, file_path)
        else:
            self.wp.submit(self._add_file, base_path, file_path, f_index_file)
            futures = self.wp.wait()
            for future in futures:
                try:
                    scid, filepath, previous_hash = future.result()
                    if scid is not None:
                        self.update_index(scid, filepath, previous_hash)
                except Exception as e:
                    # save the manifest of files added to index so far
                    self._full_idx.save_manifest_index()
                    self._mf.save()
                    log.error('Error adding dir [%s] -- [%s]' % (base_path, e),
                              class_name=MULTI_HASH_CLASS_NAME)
                    return
            self.wp.reset_futures()
        self._full_idx.save_manifest_index()
        self._mf.save()

    def add_metadata(self, basepath, filepath):
        """Copy a metadata file (spec/README) into the index's metadata area."""
        log.debug('Add file [%s] to ml-git index' % filepath, class_name=MULTI_HASH_CLASS_NAME)
        fullpath = os.path.join(basepath, filepath)
        metadatapath = os.path.join(self._path, 'metadata', self._spec)
        ensure_path_exists(metadatapath)
        dstpath = os.path.join(metadatapath, filepath)
        if not os.path.exists(dstpath):
            shutil.copy2(fullpath, dstpath)
        else:
            # Replace the stale copy; unlink first so hard links are not clobbered.
            os.unlink(dstpath)
            shutil.copy2(fullpath, dstpath)

    # TODO add : stat to MANIFEST from original file ...
    def update_index(self, objectkey, filename, previous_hash=None):
        """Record (objectkey -> filename) in the manifest, normalizing to POSIX paths."""
        self._mf.add(objectkey, posix_path(filename), previous_hash)

    def remove_manifest(self):
        """Delete the MANIFEST.yaml for this spec; missing file is not an error."""
        index_metadata_path = os.path.join(self._path, 'metadata', self._spec)
        try:
            os.unlink(os.path.join(index_metadata_path, 'MANIFEST.yaml'))
        except FileNotFoundError:
            pass

    def _save_index(self):
        self._mf.save()

    def get_index(self):
        return self._mf

    def _add_file(self, basepath, filepath, f_index_file):
        """Hash one file into the object store if new/changed.

        Returns:
            (scid, filepath, previous_hash) where scid is None when the file
            was already indexed and unchanged.
        """
        fullpath = os.path.join(basepath, filepath)
        metadatapath = os.path.join(self._path, 'metadata', self._spec)
        ensure_path_exists(metadatapath)
        scid = None
        check_file = f_index_file.get(posix_path(filepath))
        previous_hash = None
        if check_file is not None:
            # Known file: re-hash only if the full index says it changed.
            if self._full_idx.check_and_update(filepath, check_file, self._hfs,
                                               posix_path(filepath), fullpath, self._cache):
                scid = self._hfs.put(fullpath)
                updated_check = f_index_file.get(posix_path(filepath))
                if 'previous_hash' in updated_check:
                    previous_hash = updated_check['previous_hash']
        else:
            # New file: hash it and register it as added in the full index.
            scid = self._hfs.put(fullpath)
            self._full_idx.update_full_index(posix_path(filepath), fullpath, Status.a.name, scid)
        return scid, filepath, previous_hash

    def get(self, objectkey, path, file):
        """Materialize ``objectkey`` from the object store as ``path/file``."""
        log.info('Getting file [%s] from local index' % file, class_name=MULTI_HASH_CLASS_NAME)
        dirs = os.path.dirname(file)
        fulldir = os.path.join(path, dirs)
        ensure_path_exists(fulldir)
        dstfile = os.path.join(path, file)
        return self._hfs.get(objectkey, dstfile)

    def reset(self):
        """Wipe and recreate the index directory."""
        shutil.rmtree(self._path)
        os.mkdir(self._path)

    def fsck(self):
        """Delegate integrity checking to the underlying object store."""
        return self._hfs.fsck()

    def update_index_manifest(self, hash_files):
        """Merge a {key: iterable-of-files} mapping into the manifest and save it."""
        for key in hash_files:
            values = list(hash_files[key])
            for e in values:
                self._mf.add(key, e)
        self._save_index()

    def get_index_yalm(self):
        # NOTE(review): name is a long-standing typo for "yaml"; kept for
        # backward compatibility with existing callers.
        return self._full_idx

    def remove_deleted_files_index_manifest(self, deleted_files):
        """Drop each deleted file from the manifest, then persist it."""
        manifest = self.get_index()
        for file in deleted_files:
            manifest.rm_file(file)
        manifest.save()