def test_get_update_links_wspace_with_duplicates(self):
    """Link one cached object under two workspace names, then prune stale files.

    Phase 1 stores HDATA_IMG_1, updates the cache and asks the repository to
    expose it in the workspace as both DATA_IMG_1 and DATA_IMG_2. Both files
    must exist, match the source content, and share a link count of 3.

    Phase 2 rebuilds the same fixtures, drops an unrelated file into the
    workspace and checks that ``_remove_unused_links_wspace`` deletes it.
    """
    expected_hash = 'zdj7WjdojNAZN53Wf29rPssZamfbC6MVerzcGwd9tNciMpsQh'

    # --- Phase 1: duplicated links in the workspace -----------------------
    wspath = os.path.join(self.tmp_dir, 'wspace')
    hfspath = os.path.join(self.tmp_dir, 'objectsfs')
    ohfs = MultihashFS(hfspath)
    key = ohfs.put(HDATA_IMG_1)
    fidx = FullIndex(self.tmp_dir, self.tmp_dir)
    cachepath = os.path.join(self.tmp_dir, 'cachefs')
    cache = Cache(cachepath, '', '')
    testbucketname = os.getenv('MLGIT_TEST_BUCKET', 'ml-git-datasets')
    c = get_sample_config_spec(testbucketname, testprofile, testregion)
    r = LocalRepository(c, hfspath)
    r._update_cache(cache, key)
    mfiles = {}
    files = {DATA_IMG_1, DATA_IMG_2}
    r._update_links_wspace(cache, fidx, files, key, wspath, mfiles, Status.u.name, 'strict')

    wspace_file = os.path.join(wspath, DATA_IMG_1)
    self.assertTrue(os.path.exists(wspace_file))
    self.assertEqual(self.md5sum(HDATA_IMG_1), self.md5sum(wspace_file))

    wspace_file = os.path.join(wspath, DATA_IMG_2)
    self.assertTrue(os.path.exists(wspace_file))
    self.assertEqual(self.md5sum(HDATA_IMG_1), self.md5sum(wspace_file))

    st = os.stat(wspace_file)
    # assertEqual (rather than assertTrue(a == b)) reports the actual link
    # count on failure. NOTE(review): 3 is presumably the cache entry plus
    # the two workspace names - confirm against _update_links_wspace.
    self.assertEqual(st.st_nlink, 3)
    self.assertEqual(mfiles, {DATA_IMG_1: expected_hash,
                              DATA_IMG_2: expected_hash})

    # --- Phase 2: unused workspace files are removed ----------------------
    wspath = os.path.join(self.tmp_dir, 'wspace')
    ensure_path_exists(wspath)
    to_be_removed = os.path.join(wspath, 'to_be_removed')
    with open(to_be_removed, 'w') as f:
        f.write('DEAD\n')

    hfspath = os.path.join(self.tmp_dir, 'objectsfs')
    ohfs = MultihashFS(hfspath)
    key = ohfs.put(HDATA_IMG_1)
    fidx = FullIndex(self.tmp_dir, self.tmp_dir)
    cachepath = os.path.join(self.tmp_dir, 'cachefs')
    cache = Cache(cachepath, '', '')
    c = yaml_load('hdata/config.yaml')
    r = LocalRepository(c, hfspath)
    r._update_cache(cache, key)
    mfiles = {}
    files = {DATA_IMG_1, DATA_IMG_2}
    r._update_links_wspace(cache, fidx, files, key, wspath, mfiles, Status.u.name, 'strict')
    r._remove_unused_links_wspace(wspath, mfiles)
    self.assertFalse(os.path.exists(to_be_removed))
def __init__(self, spec, index_path, object_path, mutability=Mutability.STRICT.value, cache_path=None):
    """Set up the index state for one entity.

    Args:
        spec: Entity spec identifier; stored and forwarded to FullIndex.
        index_path: Location of the index; stored as ``_path`` and passed
            to ``_get_index`` and FullIndex.
        object_path: Root handed to MultihashFS for the object store.
        mutability: Mutability setting forwarded to FullIndex; defaults to
            ``Mutability.STRICT.value``.
        cache_path: Optional cache location; stored as given (may be None).
    """
    self._spec = spec
    self._path = index_path
    self._hfs = MultihashFS(object_path)
    # _get_index is defined outside this view; it runs after _spec, _path
    # and _hfs are assigned, so keep this ordering in case it reads them.
    self._mf = self._get_index(index_path)
    self._full_idx = FullIndex(spec, index_path, mutability)
    self._cache = cache_path
def test_put1024K_pathexistence_level3(self):
    """A store created with levels=3 places the chunk three directories deep."""
    store = MultihashFS(self.tmp_dir, blocksize=1024 * 1024, levels=3)
    store.put('data/think-hires.jpg')

    # Expected directory shards followed by the chunk file name.
    path_parts = ('hashfs', 'aU', 'No', 'RA',
                  'zdj7WaUNoRAzciw2JJi69s2HjfCyzWt39BHCucCV2CsAX6vSv')
    expected_chunk = os.path.join(self.tmp_dir, *path_parts)

    self.assertTrue(os.path.exists(expected_chunk))
def test_remove_corrupted_files(self):
    """_remove_corrupted_files deletes the listed file from disk."""
    store = MultihashFS(self.tmp_dir, blocksize=1024 * 1024)
    victim = os.path.join(self.tmp_dir, 'corrupted_file')

    # Create an empty file to act as the "corrupted" chunk.
    with open(victim, 'a'):
        pass
    self.assertTrue(os.path.exists(victim))

    store._remove_corrupted_files([victim], True)

    self.assertFalse(os.path.exists(victim))
def test_fsck(self):
    """fsck reports chunks that are misplaced or whose content was overwritten.

    Steps: a freshly stored file yields no corrupted entries; hard-linking
    the chunk into a wrong shard directory yields one; overwriting the chunk
    content (which also affects the hard link) yields two.
    """
    chunk_name = 'zdj7WaUNoRAzciw2JJi69s2HjfCyzWt39BHCucCV2CsAX6vSv'

    hfs = MultihashFS(self.tmp_dir, blocksize=1024 * 1024)
    original_file = 'data/think-hires.jpg'
    hfs.put(original_file)
    chunk = os.path.join(self.tmp_dir, 'hashfs', 'aU', 'No', chunk_name)

    corrupted_files = hfs.fsck()
    # assertEqual/assertIn report the offending values on failure, unlike
    # assertTrue on a pre-evaluated boolean.
    self.assertEqual(len(corrupted_files), 0)

    # Create a hard link placing the file on a wrong directory
    wrong_dir = os.path.join(self.tmp_dir, 'hashfs', 'aU', 'NB')
    chunk_in_wrong_dir = os.path.join(wrong_dir, chunk_name)
    os.makedirs(wrong_dir)
    os.link(chunk, chunk_in_wrong_dir)

    corrupted_files = hfs.fsck()
    self.assertEqual(len(corrupted_files), 1)
    self.assertIn(chunk_name, corrupted_files)

    # Overwrite the chunk; the hard link shares the same content.
    with open(chunk, 'wb') as f:
        f.write(b'blabla')

    corrupted_files = hfs.fsck()
    self.assertEqual(len(corrupted_files), 2)
    self.assertIn(chunk_name, corrupted_files)
def test_get_simple(self):
    """A file stored with put() is restored byte-for-byte by get()."""
    source = 'data/think-hires.jpg'
    restored = os.path.join(self.tmp_dir, 'think-hires.jpg')

    store = MultihashFS(self.tmp_dir, blocksize=1024 * 1024)
    object_key = store.put(source)
    store.get(object_key, restored)

    source_digest = self.md5sum(source)
    restored_digest = self.md5sum(restored)
    self.assertEqual(source_digest, restored_digest)
def test_remote_fsck(self):
    """remote_fsck restores an object that was deleted from the bucket.

    The test stores HDATA_IMG_1 locally, deletes one of its keys from the
    test bucket, writes a spec and manifest describing the dataset, runs
    ``remote_fsck`` and finally checks that the deleted key can be loaded
    from the bucket again.
    """
    missing_key = 'zdj7WWsMkELZSGQGgpm5VieCWV8NxY5n5XEP73H4E7eeDMA3A'

    testbucketname = os.getenv('MLGIT_TEST_BUCKET', 'ml-git-datasets')
    hfspath = os.path.join(self.tmp_dir, 'objectsfs')
    ohfs = MultihashFS(hfspath)
    ohfs.put(HDATA_IMG_1)

    s3 = boto3.resource(
        's3',
        region_name='us-east-1',
        aws_access_key_id='fake_access_key',
        aws_secret_access_key='fake_secret_key',
    )
    s3.Object(testbucketname, missing_key).delete()
    self.assertRaises(botocore.exceptions.ClientError, lambda: self.check_delete(s3, testbucketname))

    mdpath = os.path.join(self.tmp_dir, 'metadata-test')
    dataset_spec = get_sample_spec(testbucketname)
    specpath = os.path.join(mdpath, 'vision-computing', 'images', 'dataset-ex')
    ensure_path_exists(specpath)
    # Build the spec file path once. The previous nested join repeated the
    # specpath prefix, which only worked because an absolute second
    # component makes os.path.join discard the first.
    fullspecpath = os.path.join(specpath, 'dataset-ex.spec')
    yaml_save(dataset_spec, fullspecpath)
    manifestpath = os.path.join(specpath, 'MANIFEST.yaml')
    yaml_save({'zdj7WjdojNAZN53Wf29rPssZamfbC6MVerzcGwd9tNciMpsQh': {'imghires.jpg'}}, manifestpath)

    spec = 'vision-computing__images__dataset-ex__5'
    c = yaml_load('hdata/config.yaml')
    r = LocalRepository(c, hfspath)
    ret = r.remote_fsck(mdpath, spec, fullspecpath, 2, True, True)
    self.assertTrue(ret)

    # The load() result is None per the original assertion; a missing
    # object would raise instead of returning.
    self.assertIsNone(s3.Object(testbucketname, missing_key).load())
def test_get_update_links_wspace(self):
    """_update_links_wspace links a cached object into the workspace and indexes it.

    After linking DATA_IMG_1 the workspace file must exist with the source
    content, every full-index entry must describe it (path, hash, status and
    timestamps from ``os.stat``), the link count must be 2 and ``mfiles``
    must map the file name to its hash.
    """
    expected_hash = 'zdj7WjdojNAZN53Wf29rPssZamfbC6MVerzcGwd9tNciMpsQh'

    wspath = os.path.join(self.tmp_dir, 'wspace')
    hfspath = os.path.join(self.tmp_dir, 'objectsfs')
    ohfs = MultihashFS(hfspath)
    key = ohfs.put(HDATA_IMG_1)
    fidx = FullIndex(self.tmp_dir, self.tmp_dir)
    cachepath = os.path.join(self.tmp_dir, 'cachefs')
    cache = Cache(cachepath, '', '')
    testbucketname = os.getenv('MLGIT_TEST_BUCKET', 'ml-git-datasets')
    c = get_sample_config_spec(testbucketname, testprofile, testregion)
    r = LocalRepository(c, hfspath)
    r._update_cache(cache, key)
    mfiles = {}
    files = {DATA_IMG_1}
    r._update_links_wspace(cache, fidx, files, key, wspath, mfiles, Status.u.name, 'strict')

    wspace_file = os.path.join(wspath, DATA_IMG_1)
    set_write_read(wspace_file)
    self.assertTrue(os.path.exists(wspace_file))
    self.assertEqual(self.md5sum(HDATA_IMG_1), self.md5sum(wspace_file))

    st = os.stat(wspace_file)
    fi = fidx.get_index()
    for k, v in fi.items():
        self.assertEqual(k, os.path.join('data', 'imghires.jpg'))
        self.assertEqual(v['hash'], expected_hash)
        self.assertEqual(v['status'], 'u')
        self.assertEqual(v['ctime'], st.st_ctime)
        self.assertEqual(v['mtime'], st.st_mtime)

    # assertEqual reports the actual link count on failure.
    # NOTE(review): 2 is presumably cache entry + workspace file - confirm.
    self.assertEqual(st.st_nlink, 2)
    self.assertEqual(mfiles, {DATA_IMG_1: expected_hash})
def test_put1024K_toomany_levels(self):
    """With levels=23 the chunk is still stored, under 22 shard directories."""
    store = MultihashFS(self.tmp_dir, blocksize=1024 * 1024, levels=23)
    store.put('data/think-hires.jpg')

    # Shard directories expected on disk, outermost first.
    shard_dirs = [
        'aU', 'No', 'RA', 'zc', 'iw', '2J', 'Ji', '69', 's2', 'Hj', 'fC',
        'yz', 'Wt', '39', 'BH', 'Cu', 'cC', 'V2', 'Cs', 'AX', '6v', 'Sv',
    ]
    chunk_name = 'zdj7WaUNoRAzciw2JJi69s2HjfCyzWt39BHCucCV2CsAX6vSv'
    expected_chunk = os.path.join(self.tmp_dir, 'hashfs', *shard_dirs, chunk_name)

    self.assertTrue(os.path.exists(expected_chunk))
def test_corruption(self):
    """get() returns a falsy result and writes nothing when a chunk is corrupted."""
    original_file = 'data/think-hires.jpg'
    dst_file = os.path.join(self.tmp_dir, 'think-hires.jpg')
    hfs = MultihashFS(self.tmp_dir, blocksize=1024 * 1024)
    objkey = hfs.put(original_file)

    chunk = os.path.join(self.tmp_dir, 'hashfs', 'aU', 'No',
                         'zdj7WaUNoRAzciw2JJi69s2HjfCyzWt39BHCucCV2CsAX6vSv')
    # Overwrite the stored chunk so its content no longer matches its name.
    with open(chunk, 'wb') as f:
        f.write(b'blabla')

    self.assertFalse(hfs.get(objkey, dst_file))
    # os.path.exists already returns a bool; assertFalse states the intent
    # directly instead of assertTrue(... is False).
    self.assertFalse(os.path.exists(dst_file))
def push(self, spec, retry=2, clear_on_fail=False):
    """Push an entity's objects to the store and, on success, its metadata.

    Args:
        spec: Name of the entity to push.
        retry: Retry count forwarded to ``LocalRepository.push``.
        clear_on_fail: Flag forwarded to ``LocalRepository.push``.

    Returns:
        None in every case. Failures are logged and abort the push early.
    """
    repo_type = self.__repo_type
    try:
        objects_path = get_objects_path(self.__config, repo_type)
        metadata_path = get_metadata_path(self.__config, repo_type)
        refs_path = get_refs_path(self.__config, repo_type)
    except Exception as e:
        log.error(e, class_name=REPOSITORY_CLASS_NAME)
        return

    met = Metadata(spec, metadata_path, self.__config, repo_type)
    fields = met.git_user_config()
    if None in fields.values():
        log.error(
            'Your name and email address need to be configured in git. '
            'Please see the commands below:',
            class_name=REPOSITORY_CLASS_NAME)
        log.error('git config --global user.name \'Your Name\'', class_name=REPOSITORY_CLASS_NAME)
        # The hint must be a command the user can actually run; the previous
        # text contained a garbled placeholder instead of an email address.
        log.error('git config --global user.email you@example.com', class_name=REPOSITORY_CLASS_NAME)
        return

    if met.fetch() is False:
        return

    ref = Refs(refs_path, spec, repo_type)
    # Only the tag is needed here; the second element of the pair is unused.
    tag, _ = ref.branch()
    categories_path = get_path_with_categories(tag)

    spec_path, spec_file = None, None
    try:
        spec_path, spec_file = search_spec_file(repo_type, spec, categories_path)
    except Exception as e:
        log.error(e, class_name=REPOSITORY_CLASS_NAME)

    if spec_path is None:
        return

    full_spec_path = os.path.join(spec_path, spec_file)

    repo = LocalRepository(self.__config, objects_path, repo_type)
    ret = repo.push(objects_path, full_spec_path, retry, clear_on_fail)

    # ensure first we're on master !
    met.checkout()
    if ret == 0:
        # push metadata spec to LocalRepository git repository
        try:
            met.push()
        except Exception as e:
            log.error(e, class_name=REPOSITORY_CLASS_NAME)
            return
        # Reached only when both the object push and metadata push succeeded.
        MultihashFS(objects_path).reset_log()
def test_get_update_cache(self):
    """_update_cache copies a stored object into the cache with identical content."""
    objects_dir = os.path.join(self.tmp_dir, 'objectsfs')
    cache_dir = os.path.join(self.tmp_dir, 'cachefs')

    object_store = MultihashFS(objects_dir)
    stored_key = object_store.put(HDATA_IMG_1)
    cache = Cache(cache_dir, '', '')

    bucket = os.getenv('MLGIT_TEST_BUCKET', 'ml-git-datasets')
    config = get_sample_config_spec(bucket, testprofile, testregion)
    repository = LocalRepository(config, objects_dir)
    repository._update_cache(cache, stored_key)

    self.assertTrue(os.path.exists(cache.get_keypath(stored_key)))
    self.assertEqual(self.md5sum(HDATA_IMG_1), self.md5sum(cache.get_keypath(stored_key)))
def commit_objects(self, index_path, ws_path):
    """Walk the full index and commit the entries flagged as added.

    Entries whose file is missing from ``ws_path`` are reported as deleted.
    Entries with the "added" status are fetched into the object store
    (logging to ``store.log``) and flipped to the "u" status; those that
    also carry a ``previous_hash`` are reported as added.

    Args:
        index_path: Index location handed to FullIndex.
        ws_path: Workspace root used to check that each file still exists.

    Returns:
        Tuple ``(added_files, deleted_files)`` where ``added_files`` is a
        list of ``(previous_hash, path)`` pairs and ``deleted_files`` is a
        list of paths.
    """
    added_files, deleted_files = [], []

    object_store = MultihashFS(self._objects_path)
    full_index = FullIndex(self.__spec, index_path)
    entries = full_index.get_index()

    store_log_path = os.path.join(self._logpath, 'store.log')
    with open(store_log_path, 'a') as store_log:
        for rel_path, entry in entries.items():
            # A file no longer present in the workspace counts as deleted.
            if not os.path.exists(os.path.join(ws_path, rel_path)):
                deleted_files.append(rel_path)
                continue
            if entry['status'] != Status.a.name:
                continue

            object_store.fetch_scid(entry['hash'], store_log)
            entry['status'] = Status.u.name
            if 'previous_hash' in entry:
                added_files.append((entry['previous_hash'], rel_path))

    full_index.get_manifest_index().save()
    return added_files, deleted_files
def test_fsck_with_remove_corrupted(self):
    """fsck(remove_corrupted=True) reports a misplaced chunk and deletes it."""
    chunk_name = 'zdj7WaUNoRAzciw2JJi69s2HjfCyzWt39BHCucCV2CsAX6vSv'

    hfs = MultihashFS(self.tmp_dir, blocksize=1024 * 1024)
    original_file = 'data/think-hires.jpg'
    hfs.put(original_file)
    chunk = os.path.join(self.tmp_dir, 'hashfs', 'aU', 'No', chunk_name)

    corrupted_files = hfs.fsck()
    # assertEqual/assertIn show the offending values when the check fails.
    self.assertEqual(len(corrupted_files), 0)

    # Hard-link the chunk into a shard directory where it does not belong.
    wrong_dir = os.path.join(self.tmp_dir, 'hashfs', 'aU', 'NB')
    chunk_in_wrong_dir = os.path.join(wrong_dir, chunk_name)
    os.makedirs(wrong_dir)
    os.link(chunk, chunk_in_wrong_dir)

    corrupted_files = hfs.fsck(remove_corrupted=True)
    self.assertEqual(len(corrupted_files), 1)
    self.assertIn(chunk_name, corrupted_files)
    self.assertFalse(os.path.exists(chunk_in_wrong_dir))
def test_put1024K(self):
    """Every chunk found by walk() after a 1 MiB-block put is an expected chunk."""
    hfs = MultihashFS(self.tmp_dir, blocksize=1024 * 1024)
    hfs.put('data/think-hires.jpg')
    for files in hfs.walk():
        for file in files:
            # assertIn names the unexpected chunk in the failure message.
            self.assertIn(file, chunks1024)
def reset(self, spec, reset_type, head):
    """Reset an entity's metadata, index and (for --hard) workspace.

    Args:
        spec: Name of the entity being reset.
        reset_type: One of '--soft', '--mixed' or '--hard' (the values this
            method compares against).
        head: Reference to reset to; compared against the HEAD and HEAD_1
            constants defined outside this view.

    Returns:
        None. The method returns early when there is nothing to do
        (--soft/--mixed on HEAD), when setup or the metadata reset raises,
        or when a --hard reset cannot locate the spec file.
    """
    log.info(output_messages['INFO_INITIALIZING_RESET'] % (reset_type, head), class_name=REPOSITORY_CLASS_NAME)
    # A soft or mixed reset to HEAD returns without touching anything.
    if (reset_type == '--soft' or reset_type == '--mixed') and head == HEAD:
        return
    try:
        repo_type = self.__repo_type
        metadata_path = get_metadata_path(self.__config, repo_type)
        index_path = get_index_path(self.__config, repo_type)
        refs_path = get_refs_path(self.__config, repo_type)
        object_path = get_objects_path(self.__config, repo_type)
        met = Metadata(spec, metadata_path, self.__config, repo_type)
        ref = Refs(refs_path, spec, repo_type)
        idx = MultihashIndex(spec, index_path, object_path)
        fidx = FullIndex(spec, index_path)
    except Exception as e:
        log.error(e, class_name=REPOSITORY_CLASS_NAME)
        return

    # get tag before reset
    tag = met.get_current_tag()
    categories_path = get_path_with_categories(str(tag))
    # current manifest file before reset
    manifest_path = os.path.join(metadata_path, categories_path, spec, MANIFEST_FILE)
    _manifest = Manifest(manifest_path).load()

    if head == HEAD_1:  # HEAD~1
        try:
            # reset the repo
            met.reset()
        except Exception:
            # NOTE(review): the failure is swallowed without logging here.
            return

    # get tag after reset
    tag_after_reset = met.get_current_tag()
    sha = met.sha_from_tag(tag_after_reset)

    # update ml-git ref HEAD
    ref.update_head(str(tag_after_reset), sha)

    # # get path to reset workspace in case of --hard
    path, file = None, None
    try:
        path, file = search_spec_file(self.__repo_type, spec, categories_path)
    except Exception as e:
        log.error(e, class_name=REPOSITORY_CLASS_NAME)

    # Only a hard reset needs the workspace path; other modes continue.
    if reset_type == '--hard' and path is None:
        return

    # get manifest from metadata after reset
    _manifest_changed = Manifest(manifest_path)

    # Difference between the post-reset manifest and the pre-reset one.
    hash_files, file_names = _manifest_changed.get_diff(_manifest)
    idx_mf = idx.get_index().load()

    if reset_type == '--soft':
        # add in index/metadata/<entity-name>/MANIFEST
        idx.update_index_manifest(idx_mf)
        idx.update_index_manifest(hash_files)
        fidx.update_index_status(file_names, Status.a.name)
    else:  # --hard or --mixed
        # remove hash from index/hashsh/store.log
        file_names.update(*idx_mf.values())
        objs = MultihashFS(index_path)
        for key_hash in hash_files:
            objs.remove_hash(key_hash)
        idx.remove_manifest()
        fidx.remove_from_index_yaml(file_names)
        fidx.remove_uncommitted()

    if reset_type == '--hard':  # reset workspace
        remove_from_workspace(file_names, path, spec)