def test_file_md5_crlf(self):
    """md5 should be line-ending agnostic: LF and CRLF files hash equal."""
    with open('cr', 'wb+') as fobj:
        fobj.write(b'a\nb\nc')
    with open('crlf', 'wb+') as fobj:
        fobj.write(b'a\r\nb\r\nc')
    self.assertEqual(utils.file_md5('cr')[0], utils.file_md5('crlf')[0])
def setUp(self):
    # Build a repo with FOO and BAR tracked, wipe the local cache, and
    # point a default remote at a fresh (empty) directory — so neither
    # the cache nor the remote holds the files' checksums.
    super(TestShouldWarnOnNoChecksumInLocalAndRemoteCache, self).setUp()
    cache_dir = self.mkdtemp()
    ret = main(["add", self.FOO])
    self.assertEqual(0, ret)
    ret = main(["add", self.BAR])
    self.assertEqual(0, ret)
    # purge cache
    shutil.rmtree(self.dvc.cache.local.cache_dir)
    ret = main(["remote", "add", "remote_name", "-d", cache_dir])
    self.assertEqual(0, ret)
    checksum_foo = file_md5(self.FOO)[0]
    checksum_bar = file_md5(self.BAR)[0]
    # Warning-message fragments the actual tests will look for.
    self.message_header = (
        "Some of the cache files do not exist neither locally "
        "nor on remote. Missing cache files: ")
    self.message_bar_part = "name: {}, md5: {}".format(
        self.BAR, checksum_bar)
    self.message_foo_part = "name: {}, md5: {}".format(
        self.FOO, checksum_foo)
def test_update(self):
    """State.update should return the current md5 and track file changes.

    Fix: the original computed ``mtime`` and ``inode`` twice (via
    ``os.path.getmtime`` and ``System.inode``) but never asserted on or
    otherwise used them — the dead locals are removed.
    """
    path = os.path.join(self.dvc.root_dir, self.FOO)
    md5 = file_md5(path)[0]
    state = State(self.dvc)

    entry_md5 = state.update(path)
    self.assertEqual(entry_md5, md5)

    # Sleep some time to simulate realistic behavior.
    # Some filesystems have a bad date resolution for
    # mtime(i.e. 1sec for HFS) that cause problems with
    # our 'state' system not being able to distinguish
    # files that were modified within that delta.
    time.sleep(1)

    # Replace the file's content so the state entry must refresh.
    os.unlink(path)
    with open(path, 'w+') as fd:
        fd.write('1')

    md5 = file_md5(path)[0]
    entry_md5 = state.update(path)
    self.assertEqual(entry_md5, md5)
def test(self):
    # End-to-end recursive sync against the configured cloud remote.
    # Order matters: push must run before fetch/pull so the remote
    # actually contains the cache files being fetched back.
    self._setup_cloud()
    self._prepare_repo()
    data_md5 = file_md5(self.DATA)[0]
    data_sub_md5 = file_md5(self.DATA_SUB)[0]
    self._test_recursive_push(data_md5, data_sub_md5)
    self._test_recursive_fetch(data_md5, data_sub_md5)
    self._test_recursive_pull()
def collect_dir_cache(self, dname):
    """Build the cache info for directory *dname*.

    Walks the tree, collects ``{relpath, md5}`` entries sorted by path,
    and returns ``(dir_md5, dir_info)``, dumping the dir cache when the
    directory checksum changed.
    """
    entries = []
    for root, dirs, files in os.walk(dname):
        for fname in files:
            fpath = os.path.join(root, fname)
            rel = self.unixpath(os.path.relpath(fpath, dname))
            # FIXME: md5 = state.update(path, dump=False) would also work
            # here, but it is around twice as slow (on ssd; hdd unknown)
            # for a directory of small files. A possible middle ground is
            # a file-size threshold above which we register the file in
            # our state file.
            entries.append({
                self.PARAM_RELPATH: rel,
                self.PARAM_MD5: file_md5(fpath)[0],
            })

    # NOTE: sort by path to keep the directory checksum reproducible.
    entries.sort(key=itemgetter(self.PARAM_RELPATH))

    md5 = dict_md5(entries) + self.MD5_DIR_SUFFIX
    if self.changed(md5):
        self.dump_dir_cache(md5, entries)

    return (md5, entries)
def get_file_hash(self, path_info):
    """Return an md5 HashInfo for *path_info*, with the file size attached."""
    checksum = file_md5(path_info)[0]
    hash_info = HashInfo(self.PARAM_CHECKSUM, checksum)
    # HashInfo may be falsy when it carries no value; only stat then skip.
    if hash_info:
        hash_info.size = os.path.getsize(path_info)
    return hash_info
def test(self):
    """Running a stage wires deps/outs; duplicate outputs are rejected."""
    cmd = 'python {} {} {}'.format(self.CODE, self.FOO, 'out')
    deps = [self.FOO, self.CODE]
    outs = [os.path.join(self.dvc.root_dir, 'out')]
    outs_no_cache = []
    fname = 'out.dvc'
    cwd = os.curdir

    self.dvc.add(self.FOO)

    # Shared kwargs so the duplicate-output attempt below differs only
    # in its stage filename.
    run_kwargs = {
        'cmd': cmd,
        'deps': deps,
        'outs': outs,
        'outs_no_cache': outs_no_cache,
        'cwd': cwd,
    }
    stage = self.dvc.run(fname=fname, **run_kwargs)

    self.assertTrue(filecmp.cmp(self.FOO, 'out', shallow=False))
    self.assertTrue(os.path.isfile(stage.path))
    self.assertEqual(stage.cmd, cmd)
    self.assertEqual(len(stage.deps), len(deps))
    self.assertEqual(len(stage.outs), len(outs + outs_no_cache))
    self.assertEqual(stage.outs[0].path, outs[0])
    self.assertEqual(stage.outs[0].md5, file_md5(self.FOO)[0])
    # NOTE(review): assertTrue(x, msg) only checks truthiness of x —
    # this looks like it was meant to be assertEqual; confirm before
    # changing, since stage.path may be absolute while fname is not.
    self.assertTrue(stage.path, fname)

    with self.assertRaises(OutputDuplicationError):
        stage = self.dvc.run(fname='duplicate' + fname, **run_kwargs)
def test(self): cmd = "python {} {} {}".format(self.CODE, self.FOO, "out") deps = [self.FOO, self.CODE] outs = [os.path.join(self.dvc.root_dir, "out")] outs_no_cache = [] fname = "out.dvc" self.dvc.add(self.FOO) stage = self.dvc.run( cmd=cmd, deps=deps, outs=outs, outs_no_cache=outs_no_cache, fname=fname, single_stage=True, ) self.assertTrue(filecmp.cmp(self.FOO, "out", shallow=False)) self.assertTrue(os.path.isfile(stage.path)) self.assertEqual(stage.cmd, cmd) self.assertEqual(len(stage.deps), len(deps)) self.assertEqual(len(stage.outs), len(outs + outs_no_cache)) self.assertEqual(stage.outs[0].fspath, outs[0]) self.assertEqual(stage.outs[0].hash_info.value, file_md5(self.FOO)[0]) self.assertTrue(stage.path, fname) with self.assertRaises(OutputDuplicationError): self.dvc.run( cmd=cmd, deps=deps, outs=outs, outs_no_cache=outs_no_cache, fname="duplicate" + fname, single_stage=True, )
def test(self):
    """After swapping FOO's content with BAR's, reproduce should rebuild
    the output and record BAR's checksum on it."""
    self.swap_foo_with_bar()

    reproduced = self.dvc.reproduce(self.foo_stage.path)

    self.assertTrue(filecmp.cmp(self.FOO, self.BAR, shallow=False))
    self.assertEqual(reproduced[0].outs[0].checksum, file_md5(self.BAR)[0])
def test_should_pre_push_hook_push(self, repo_dir, git, dvc_repo):
    # After `dvc install`, a plain `git push` should fire the pre-push
    # hook, which pushes the DVC cache to the default remote. Setup
    # order matters: remote/add/commit must precede the git push.
    assert main(["install"]) == 0

    temp = repo_dir.mkdtemp()
    git_remote = os.path.join(temp, "project.git")
    storage_path = os.path.join(temp, "dvc_storage")

    # The cache file the hook is expected to have pushed.
    foo_checksum = file_md5(repo_dir.FOO)[0]
    expected_cache_path = dvc_repo.cache.local.get(foo_checksum)

    ret = main(["remote", "add", "-d", "store", storage_path])
    assert ret == 0

    ret = main(["add", repo_dir.FOO])
    assert ret == 0

    stage_file = repo_dir.FOO + Stage.STAGE_FILE_SUFFIX
    dvc_repo.scm.repo.index.add([stage_file, ".gitignore"])
    dvc_repo.scm.repo.index.commit("commit message")

    dvc_repo.scm.repo.clone(git_remote)
    dvc_repo.scm.repo.create_remote("origin", git_remote)

    # Triggers the installed pre-push hook.
    dvc_repo.scm.repo.git.push("origin", "master")

    assert os.path.isfile(expected_cache_path)
def test_update(self):
    """State.update should return the current md5 and notice rewrites."""
    path = os.path.join(self.dvc.root_dir, self.FOO)
    expected = file_md5(path)[0]
    state = State(self.dvc, self.dvc.config.config)

    with state:
        self.assertEqual(state.update(path), expected)

        # Replace the file's content so the state entry must refresh.
        os.unlink(path)
        with open(path, "a") as fobj:
            fobj.write("1")

        expected = file_md5(path)[0]
        self.assertEqual(state.update(path), expected)
def compute_md5(self, path):
    """Return the md5 for *path*; directories get a suffixed dict-md5."""
    if not os.path.isdir(path):
        return file_md5(path)[0]
    # Hash the canonically-serialized listing so directory checksums
    # are stable across runs.
    dir_info = self.collect_dir(path)
    serialized = json.dumps(dir_info, sort_keys=True).encode('utf-8')
    return bytes_md5(serialized) + Output.MD5_DIR_SUFFIX
def test(self):
    """After swapping FOO's content with BAR's, reproduce should rebuild
    the output and record BAR's hash on it."""
    self.swap_foo_with_bar()

    reproduced = self.dvc.reproduce(self.foo_stage.path)

    self.assertTrue(filecmp.cmp(self.FOO, self.BAR, shallow=False))
    self.assertEqual(
        reproduced[0].outs[0].hash_info.value,
        file_md5(self.BAR, self.dvc.fs),
    )
def test_state(tmp_dir, dvc): tmp_dir.gen("foo", "foo content") path = tmp_dir / "foo" hash_info = HashInfo("md5", file_md5(path, dvc.fs)) state = State(dvc.root_dir, dvc.tmp_dir, dvc.dvcignore) state.save(path, dvc.fs, hash_info) assert state.get(path, dvc.fs)[1] == hash_info path.unlink() path.write_text("1") assert state.get(path, dvc.fs) == (None, None) hash_info = HashInfo("md5", file_md5(path, dvc.fs)) state.save(path, dvc.fs, hash_info) assert state.get(path, dvc.fs)[1] == hash_info
def _cmp_checksum(blob, fname):
    """ Verify local and remote checksums. """
    # GCS exposes md5_hash as base64 of the binary digest, so encode the
    # local binary digest the same way before comparing.
    digest = file_md5(fname)[1]
    expected = base64.b64encode(digest).decode() if digest else None
    return blob.md5_hash == expected
def _create_multipart(self, key, fname):
    """ Create multipart upload and save info to tracker file. """
    # AWS doesn't provide an easily accessible md5 for multipart
    # objects, so we stash our own md5 sum in the metadata for later.
    checksum = str(file_md5(fname)[0])
    multipart = key.bucket.initiate_multipart_upload(
        key.name, metadata={'dvc-md5': checksum})
    self._write_upload_tracker(fname, multipart.id)
    return multipart
def test_state(tmp_dir, dvc): tmp_dir.gen("foo", "foo content") path = tmp_dir / "foo" path_info = PathInfo(path) hash_info = HashInfo("md5", file_md5(path, dvc.fs)) state = State(dvc.root_dir, dvc.tmp_dir) state.save(path_info, dvc.fs, hash_info) assert state.get(path_info, dvc.fs) == hash_info path.unlink() path.write_text("1") assert state.get(path_info, dvc.fs) is None hash_info = HashInfo("md5", file_md5(path, dvc.fs)) state.save(path_info, dvc.fs, hash_info) assert state.get(path_info, dvc.fs) == hash_info
def test_hashed_stream_reader(tmp_dir):
    """HashedStreamReader should hash exactly the bytes that were read."""
    tmp_dir.gen({"foo": "foo"})
    foo = tmp_dir / "foo"

    with open(foo, "rb") as fobj:
        reader = HashedStreamReader(fobj)
        assert reader.read(3) == b"foo"

    expected = file_md5(foo, LocalFileSystem(None, {}))
    assert reader.is_text_file
    assert reader.hash_info.value == expected
def test(self):
    """Adding FOO should create a stage with one cached output."""
    expected_md5 = file_md5(self.FOO)[0]

    stage = self.dvc.add(self.FOO)

    self.assertIsInstance(stage, Stage)
    self.assertTrue(os.path.isfile(stage.path))
    self.assertEqual(len(stage.outs), 1)
    self.assertEqual(len(stage.deps), 0)
    self.assertEqual(stage.cmd, None)
    self.assertEqual(stage.outs[0].info['md5'], expected_md5)
def test_add(tmp_dir, dvc):
    """Adding foo should produce a stage with one cached output."""
    (stage,) = tmp_dir.dvc_gen({"foo": "foo"})
    expected_md5, _ = file_md5("foo")

    assert stage is not None
    assert isinstance(stage, Stage)
    assert os.path.isfile(stage.path)
    assert len(stage.outs) == 1
    assert len(stage.deps) == 0
    assert stage.cmd is None
    assert stage.outs[0].info["md5"] == expected_md5
    # The stage-file checksum is deterministic for identical content.
    assert stage.md5 == "ee343f2482f53efffc109be83cc976ac"
def test_get_state_record_for_inode(get_inode_mock, dvc_repo, repo_dir):
    """Inodes above MAX_INT get transformed yet stay retrievable."""
    state = State(dvc_repo, dvc_repo.config.config)
    inode = state.MAX_INT + 2
    assert inode != state._to_sqlite(inode)

    foo = os.path.join(dvc_repo.root_dir, repo_dir.FOO)
    checksum = file_md5(foo)[0]
    get_inode_mock.side_effect = mock_get_inode(inode)

    with state:
        state.save(PathInfo(foo), checksum)
        assert state.get_state_record_for_inode(inode) is not None
def test_transforms_inode(self, get_inode_mock):
    """Oversized inodes must be transformed to fit sqlite integers."""
    state = State(self.dvc, self.dvc.config.config)
    inode = state.MAX_INT + 2
    self.assertNotEqual(inode, state._to_sqlite(inode))

    path = os.path.join(self.dvc.root_dir, self.FOO)
    checksum = file_md5(path)[0]
    get_inode_mock.side_effect = self.mock_get_inode(path, inode)

    with state:
        state.save({"scheme": "local", "path": path}, checksum)
        self.assertIsNotNone(state.get_state_record_for_inode(inode))
def save(self):
    # Save this output into the cache: hardlink single files directly;
    # for directories, walk the tree and link each file, writing a
    # per-file md5 info file alongside in the cache directory.
    super(Output, self).save()

    if not self.use_cache:
        return

    self.project.logger.debug("Saving {} to {}".format(
        self.path, self.cache))

    # Refuse to cache something git already tracks.
    if self.project.scm.is_tracked(self.path):
        raise CmdOutputAlreadyTrackedError(self.rel_path)

    if not self.changed():
        return

    if os.path.exists(self.cache):
        # This means that we already have cache for this data.
        # We remove data and link it to existing cache to save
        # some space.
        msg = "Cache {} already exists, performing checkout for {}"
        self.project.logger.debug(msg.format(self.cache, self.path))
        self.checkout()
        return

    if os.path.isfile(self.path):
        self.hardlink(self.path, self.cache)
        return

    # Directory output: cache every contained file individually.
    for root, dirs, files in os.walk(self.path):
        for fname in files:
            path = os.path.join(root, fname)
            relpath = os.path.relpath(path, self.path)
            md5 = file_md5(path)[0]
            cache = self.project.cache.get(md5)
            cache_info = os.path.join(self.cache, relpath)
            cache_dir = os.path.dirname(cache_info)

            if os.path.exists(cache):
                # Already cached: replace the data file with a link.
                self._remove(path, None)
                self.hardlink(cache, path)
            else:
                self.hardlink(path, cache)

            if not os.path.exists(cache_dir):
                os.makedirs(cache_dir)

            # Record the file's md5 next to its relative path.
            with open(cache_info, 'w') as fd:
                yaml.safe_dump({self.PARAM_MD5: md5},
                               fd,
                               default_flow_style=False)
def test_hashed_stream_reader_as_chunks(tmp_dir):
    """Chunked reads must yield the same md5 as hashing the whole file."""
    tmp_dir.gen({"foo": b"foo \x00" * 16})
    foo = tmp_dir / "foo"

    with open(foo, "rb") as fobj:
        reader = HashedStreamReader(fobj)
        # Drain the stream 16 bytes at a time.
        chunk = reader.read(16)
        while chunk:
            chunk = reader.read(16)

    expected = file_md5(foo, LocalFileSystem(None, {}))
    assert not reader.is_text_file
    assert expected == reader.hash_info.value
def test_state(tmp_dir, dvc): tmp_dir.gen("foo", "foo content") path = tmp_dir / "foo" path_info = PathInfo(path) md5 = file_md5(path)[0] state = State(dvc, dvc.config.config) with state: state.save(path_info, md5) entry_md5 = state.get(path_info) assert entry_md5 == md5 path.unlink() path.write_text("1") entry_md5 = state.get(path_info) assert entry_md5 is None md5 = file_md5(path)[0] state.save(path_info, md5) entry_md5 = state.get(path_info) assert entry_md5 == md5
def get_file_hash(self, path_info):
    """Return file checksum for specified path.

    If path_info is a DVC out, the pre-computed checksum for the file
    will be used. If path_info is a git file, MD5 will be computed for
    the git object.
    """
    if not self.exists(path_info):
        raise FileNotFoundError

    # Prefer the checksum DVC already knows for tracked outputs.
    dvctree = self.dvctree
    if dvctree and dvctree.exists(path_info):
        try:
            return dvctree.get_file_hash(path_info)
        except OutputNotFoundError:
            # Not actually an out — fall through to hashing the object.
            pass
    return file_md5(path_info, self)[0]
def test(self):
    """Adding FOO should yield exactly one stage with one cached output."""
    expected_md5 = file_md5(self.FOO)[0]

    stages = self.dvc.add(self.FOO)
    self.assertEqual(len(stages), 1)

    stage = stages[0]
    self.assertTrue(stage is not None)
    self.assertIsInstance(stage, Stage)
    self.assertTrue(os.path.isfile(stage.path))
    self.assertEqual(len(stage.outs), 1)
    self.assertEqual(len(stage.deps), 0)
    self.assertEqual(stage.cmd, None)
    self.assertEqual(stage.outs[0].info["md5"], expected_md5)
    # The stage-file checksum is deterministic for identical content.
    self.assertEqual(stage.md5, "ee343f2482f53efffc109be83cc976ac")
def test_state(dvc_repo, repo_dir):
    """State should return a saved md5 and drop it when the file changes."""
    foo = os.path.join(dvc_repo.root_dir, repo_dir.FOO)
    foo_info = PathInfo(foo)
    checksum = file_md5(foo)[0]
    state = State(dvc_repo, dvc_repo.config.config)

    with state:
        state.save(foo_info, checksum)
        assert state.get(foo_info) == checksum

        # Replace the file's content; the stored entry must go stale.
        os.unlink(foo)
        with open(foo, "a") as fobj:
            fobj.write("1")
        assert state.get(foo_info) is None

        checksum = file_md5(foo)[0]
        state.save(foo_info, checksum)
        assert state.get(foo_info) == checksum
def test_get_state_record_for_inode(get_inode_mock, tmp_dir, dvc):
    """Inodes above MAX_INT get transformed yet stay retrievable."""
    tmp_dir.gen("foo", "foo content")
    state = State(dvc, dvc.config.config)
    inode = state.MAX_INT + 2
    assert inode != state._to_sqlite(inode)

    foo = tmp_dir / "foo"
    checksum = file_md5(foo)[0]
    get_inode_mock.side_effect = mock_get_inode(inode)

    with state:
        state.save(PathInfo(foo), checksum)
        assert state.get_state_record_for_inode(inode) is not None
def test_update(self):
    """State should return a saved md5 and drop it when the file changes."""
    path = os.path.join(self.dvc.root_dir, self.FOO)
    path_info = {"scheme": "local", "path": path}
    checksum = file_md5(path)[0]
    state = State(self.dvc, self.dvc.config.config)

    with state:
        state.save(path_info, checksum)
        self.assertEqual(state.get(path_info), checksum)

        # Replace the file's content; the stored entry must go stale.
        os.unlink(path)
        with open(path, "a") as fobj:
            fobj.write("1")
        self.assertTrue(state.get(path_info) is None)

        checksum = file_md5(path)[0]
        state.save(path_info, checksum)
        self.assertEqual(state.get(path_info), checksum)