Esempio n. 1
0
    def test_file_md5_crlf(self):
        """The md5 of a file must not depend on LF vs. CRLF line endings."""
        fixtures = (
            ('cr', b'a\nb\nc'),
            ('crlf', b'a\r\nb\r\nc'),
        )
        for name, payload in fixtures:
            with open(name, 'wb+') as handle:
                handle.write(payload)

        lf_md5 = utils.file_md5('cr')[0]
        crlf_md5 = utils.file_md5('crlf')[0]
        self.assertEqual(lf_md5, crlf_md5)
Esempio n. 2
0
    def setUp(self):
        """Add FOO and BAR, delete the local cache directory, register a
        remote pointing at a fresh temp dir, and precompute the fragments
        of the expected warning message."""
        super(TestShouldWarnOnNoChecksumInLocalAndRemoteCache, self).setUp()

        remote_dir = self.mkdtemp()
        for target in (self.FOO, self.BAR):
            self.assertEqual(0, main(["add", target]))

        # purge cache
        shutil.rmtree(self.dvc.cache.local.cache_dir)

        add_remote = ["remote", "add", "remote_name", "-d", remote_dir]
        self.assertEqual(0, main(add_remote))

        foo_md5 = file_md5(self.FOO)[0]
        bar_md5 = file_md5(self.BAR)[0]
        part_template = "name: {}, md5: {}"
        self.message_header = (
            "Some of the cache files do not exist neither locally "
            "nor on remote. Missing cache files: ")
        self.message_bar_part = part_template.format(self.BAR, bar_md5)
        self.message_foo_part = part_template.format(self.FOO, foo_md5)
Esempio n. 3
0
    def test_update(self):
        """state.update() must return the file's current md5, both the first
        time the file is seen and after the file has been replaced."""
        path = os.path.join(self.dvc.root_dir, self.FOO)
        md5 = file_md5(path)[0]

        state = State(self.dvc)

        entry_md5 = state.update(path)
        self.assertEqual(entry_md5, md5)

        # Sleep some time to simulate realistic behavior.
        # Some filesystems have a bad date resolution for
        # mtime(i.e. 1sec for HFS) that cause problems with
        # our 'state' system not being able to distinguish
        # files that were modified within that delta.
        time.sleep(1)

        # Replace the file with new content.
        os.unlink(path)
        with open(path, 'w+') as fd:
            fd.write('1')

        md5 = file_md5(path)[0]

        entry_md5 = state.update(path)
        self.assertEqual(entry_md5, md5)
Esempio n. 4
0
    def test(self):
        """Run the recursive push, fetch and pull checks in sequence."""
        self._setup_cloud()
        self._prepare_repo()

        # md5 of DATA and DATA_SUB, handed to the push and fetch checks.
        checksums = (file_md5(self.DATA)[0], file_md5(self.DATA_SUB)[0])

        self._test_recursive_push(*checksums)
        self._test_recursive_fetch(*checksums)
        self._test_recursive_pull()
Esempio n. 5
0
    def collect_dir_cache(self, dname):
        """Compute the md5 listing of every file under ``dname``.

        Returns a ``(md5, dir_info)`` tuple: ``dir_info`` is a list of
        ``{PARAM_RELPATH, PARAM_MD5}`` dicts sorted by relative path, and
        ``md5`` is the hash of that list plus ``MD5_DIR_SUFFIX``. The listing
        is dumped via ``dump_dir_cache`` when ``self.changed(md5)`` is true.
        """
        entries = []

        for root, _dirs, files in os.walk(dname):
            for fname in files:
                path = os.path.join(root, fname)

                # FIXME: we could've used md5 = state.update(path, dump=False)
                # here, but it is around twice as slow(on ssd, don't know about
                # hdd) for a directory with small files. What we could do here
                # is introduce some kind of a limit for file size, after which
                # we would actually register it in our state file.
                entries.append({
                    self.PARAM_RELPATH: self.unixpath(
                        os.path.relpath(path, dname)),
                    self.PARAM_MD5: file_md5(path)[0],
                })

        # NOTE: sorting the list by path to ensure reproducibility
        entries.sort(key=itemgetter(self.PARAM_RELPATH))

        dir_md5 = dict_md5(entries) + self.MD5_DIR_SUFFIX
        if self.changed(dir_md5):
            self.dump_dir_cache(dir_md5, entries)

        return (dir_md5, entries)
Esempio n. 6
0
    def get_file_hash(self, path_info):
        """Return a HashInfo built from the file's md5; when the HashInfo is
        truthy, its ``size`` is set to the file size on disk."""
        hash_info = HashInfo(self.PARAM_CHECKSUM, file_md5(path_info)[0])

        if not hash_info:
            return hash_info

        hash_info.size = os.path.getsize(path_info)
        return hash_info
Esempio n. 7
0
    def test(self):
        """Run a stage that executes CODE with FOO and 'out' as arguments.

        Verifies the produced file matches FOO, checks the stage's recorded
        command, deps, outs and stage file name, and finally checks that a
        second stage declaring the same output raises
        OutputDuplicationError.
        """
        cmd = 'python {} {} {}'.format(self.CODE, self.FOO, 'out')
        deps = [self.FOO, self.CODE]
        outs = [os.path.join(self.dvc.root_dir, 'out')]
        outs_no_cache = []
        fname = 'out.dvc'
        cwd = os.curdir

        self.dvc.add(self.FOO)
        stage = self.dvc.run(cmd=cmd,
                             deps=deps,
                             outs=outs,
                             outs_no_cache=outs_no_cache,
                             fname=fname,
                             cwd=cwd)

        self.assertTrue(filecmp.cmp(self.FOO, 'out', shallow=False))
        self.assertTrue(os.path.isfile(stage.path))
        self.assertEqual(stage.cmd, cmd)
        self.assertEqual(len(stage.deps), len(deps))
        self.assertEqual(len(stage.outs), len(outs + outs_no_cache))
        self.assertEqual(stage.outs[0].path, outs[0])
        self.assertEqual(stage.outs[0].md5, file_md5(self.FOO)[0])
        # assertTrue(a, b) uses b as the failure message and only checks
        # that a is truthy, so compare the stage file name explicitly.
        self.assertEqual(os.path.basename(stage.path), fname)

        # The same output must not be claimed by a second stage.
        with self.assertRaises(OutputDuplicationError):
            self.dvc.run(cmd=cmd,
                         deps=deps,
                         outs=outs,
                         outs_no_cache=outs_no_cache,
                         fname='duplicate' + fname,
                         cwd=cwd)
Esempio n. 8
0
    def test(self):
        """Run a single-stage command that executes CODE with FOO and "out".

        Verifies the produced file matches FOO, checks the stage's recorded
        command, deps, outs and stage file name, and finally checks that a
        second stage declaring the same output raises
        OutputDuplicationError.
        """
        cmd = "python {} {} {}".format(self.CODE, self.FOO, "out")
        deps = [self.FOO, self.CODE]
        outs = [os.path.join(self.dvc.root_dir, "out")]
        outs_no_cache = []
        fname = "out.dvc"

        self.dvc.add(self.FOO)
        stage = self.dvc.run(
            cmd=cmd,
            deps=deps,
            outs=outs,
            outs_no_cache=outs_no_cache,
            fname=fname,
            single_stage=True,
        )

        self.assertTrue(filecmp.cmp(self.FOO, "out", shallow=False))
        self.assertTrue(os.path.isfile(stage.path))
        self.assertEqual(stage.cmd, cmd)
        self.assertEqual(len(stage.deps), len(deps))
        self.assertEqual(len(stage.outs), len(outs + outs_no_cache))
        self.assertEqual(stage.outs[0].fspath, outs[0])
        self.assertEqual(stage.outs[0].hash_info.value, file_md5(self.FOO)[0])
        # assertTrue(a, b) uses b as the failure message and only checks
        # that a is truthy, so compare the stage file name explicitly.
        self.assertEqual(os.path.basename(stage.path), fname)

        # The same output must not be claimed by a second stage.
        with self.assertRaises(OutputDuplicationError):
            self.dvc.run(
                cmd=cmd,
                deps=deps,
                outs=outs,
                outs_no_cache=outs_no_cache,
                fname="duplicate" + fname,
                single_stage=True,
            )
Esempio n. 9
0
    def test(self):
        """After swapping FOO and BAR, reproducing the foo stage must leave
        FOO identical to BAR and record BAR's md5 on the first output."""
        self.swap_foo_with_bar()

        reproduced = self.dvc.reproduce(self.foo_stage.path)

        same_content = filecmp.cmp(self.FOO, self.BAR, shallow=False)
        self.assertTrue(same_content)

        first_out = reproduced[0].outs[0]
        self.assertEqual(first_out.checksum, file_md5(self.BAR)[0])
Esempio n. 10
0
    def test_should_pre_push_hook_push(self, repo_dir, git, dvc_repo):
        """After `install`, a `git push` must leave FOO's cache file at the
        path computed up front from its md5."""
        assert main(["install"]) == 0

        workspace = repo_dir.mkdtemp()
        git_remote = os.path.join(workspace, "project.git")
        storage_path = os.path.join(workspace, "dvc_storage")

        # Cache location derived from FOO's md5; checked after the push.
        expected_cache_path = dvc_repo.cache.local.get(
            file_md5(repo_dir.FOO)[0]
        )

        assert main(["remote", "add", "-d", "store", storage_path]) == 0
        assert main(["add", repo_dir.FOO]) == 0

        git_repo = dvc_repo.scm.repo
        tracked = [repo_dir.FOO + Stage.STAGE_FILE_SUFFIX, ".gitignore"]
        git_repo.index.add(tracked)
        git_repo.index.commit("commit message")

        git_repo.clone(git_remote)
        git_repo.create_remote("origin", git_remote)

        git_repo.git.push("origin", "master")

        assert os.path.isfile(expected_cache_path)
Esempio n. 11
0
    def test_update(self):
        """state.update() must return the file's md5 before and after the
        file is replaced with different content."""
        path = os.path.join(self.dvc.root_dir, self.FOO)
        expected = file_md5(path)[0]

        state = State(self.dvc, self.dvc.config.config)

        with state:
            self.assertEqual(state.update(path), expected)

            # Replace the file with new content.
            os.unlink(path)
            with open(path, "a") as fobj:
                fobj.write("1")

            expected = file_md5(path)[0]
            self.assertEqual(state.update(path), expected)
Esempio n. 12
0
 def compute_md5(self, path):
     """Return the md5 for ``path``.

     A regular path gets the first element of ``file_md5``. A directory
     gets the md5 of its key-sorted JSON-serialized ``collect_dir`` listing,
     with ``Output.MD5_DIR_SUFFIX`` appended.
     """
     if not os.path.isdir(path):
         return file_md5(path)[0]

     listing = self.collect_dir(path)
     payload = json.dumps(listing, sort_keys=True).encode('utf-8')
     return bytes_md5(payload) + Output.MD5_DIR_SUFFIX
Esempio n. 13
0
    def test(self):
        """After swapping FOO and BAR, reproducing the foo stage must leave
        FOO identical to BAR and record BAR's md5 as the output hash."""
        self.swap_foo_with_bar()

        reproduced = self.dvc.reproduce(self.foo_stage.path)

        self.assertTrue(filecmp.cmp(self.FOO, self.BAR, shallow=False))

        recorded = reproduced[0].outs[0].hash_info.value
        expected = file_md5(self.BAR, self.dvc.fs)
        self.assertEqual(recorded, expected)
Esempio n. 14
0
def test_state(tmp_dir, dvc):
    """A saved hash is returned by state.get() until the file is replaced;
    after re-saving, the new hash is returned."""
    tmp_dir.gen("foo", "foo content")
    path = tmp_dir / "foo"

    def current_hash():
        # md5 HashInfo of the file as it is on disk right now.
        return HashInfo("md5", file_md5(path, dvc.fs))

    def stored_hash():
        # Second element of what the state holds for the path.
        return state.get(path, dvc.fs)[1]

    hash_info = current_hash()

    state = State(dvc.root_dir, dvc.tmp_dir, dvc.dvcignore)

    state.save(path, dvc.fs, hash_info)
    assert stored_hash() == hash_info

    # Replace the file; the saved entry must no longer be returned.
    path.unlink()
    path.write_text("1")

    assert state.get(path, dvc.fs) == (None, None)

    hash_info = current_hash()
    state.save(path, dvc.fs, hash_info)

    assert stored_hash() == hash_info
Esempio n. 15
0
File: gcp.py Progetto: ybayle/dvc
    def _cmp_checksum(blob, fname):
        """
        Tell whether the blob's md5_hash equals the base64-encoded md5 of
        the local file (None when file_md5 yields no digest).
        """
        # Second element of file_md5's result; presumably the raw digest
        # bytes, since it is base64-encoded below.
        raw_md5 = file_md5(fname)[1]
        if raw_md5:
            local_b64 = base64.b64encode(raw_md5).decode()
        else:
            local_b64 = None

        return blob.md5_hash == local_b64
Esempio n. 16
0
File: aws.py Progetto: vhcg77/dvc
 def _create_multipart(self, key, fname):
     """
     Start a multipart upload for ``key`` and record its id in the
     upload tracker for ``fname``. Returns the multipart upload object.
     """
     # AWS doesn't provide an easily accessible md5 for multipart
     # objects, so we have to store our own md5 sum to use later.
     checksum = str(file_md5(fname)[0])
     upload = key.bucket.initiate_multipart_upload(
         key.name, metadata={'dvc-md5': checksum})
     self._write_upload_tracker(fname, upload.id)
     return upload
Esempio n. 17
0
def test_state(tmp_dir, dvc):
    """A saved hash is returned by state.get() until the file is replaced;
    after re-saving, the new hash is returned."""
    tmp_dir.gen("foo", "foo content")
    path = tmp_dir / "foo"
    path_info = PathInfo(path)

    def current_hash():
        # md5 HashInfo of the file as it is on disk right now.
        return HashInfo("md5", file_md5(path, dvc.fs))

    hash_info = current_hash()

    state = State(dvc.root_dir, dvc.tmp_dir)

    state.save(path_info, dvc.fs, hash_info)
    assert state.get(path_info, dvc.fs) == hash_info

    # Replace the file; the saved entry must no longer be returned.
    path.unlink()
    path.write_text("1")

    assert state.get(path_info, dvc.fs) is None

    hash_info = current_hash()
    state.save(path_info, dvc.fs, hash_info)

    assert state.get(path_info, dvc.fs) == hash_info
Esempio n. 18
0
def test_hashed_stream_reader(tmp_dir):
    """Reading a small text file through HashedStreamReader must flag it as
    text and produce the same digest as file_md5."""
    tmp_dir.gen({"foo": "foo"})

    foo = tmp_dir / "foo"
    with open(foo, "rb") as fobj:
        stream_reader = HashedStreamReader(fobj)
        first_bytes = stream_reader.read(3)
        assert first_bytes == b"foo"

    expected_digest = file_md5(foo, LocalFileSystem(None, {}))
    assert stream_reader.is_text_file
    assert expected_digest == stream_reader.hash_info.value
Esempio n. 19
0
    def test(self):
        """Adding FOO must yield a Stage with one output carrying FOO's md5,
        no deps and no command."""
        expected_md5 = file_md5(self.FOO)[0]

        stage = self.dvc.add(self.FOO)

        self.assertIsInstance(stage, Stage)
        self.assertTrue(os.path.isfile(stage.path))

        out_count = len(stage.outs)
        self.assertEqual(out_count, 1)
        dep_count = len(stage.deps)
        self.assertEqual(dep_count, 0)

        self.assertEqual(stage.cmd, None)
        recorded_md5 = stage.outs[0].info['md5']
        self.assertEqual(recorded_md5, expected_md5)
Esempio n. 20
0
def test_add(tmp_dir, dvc):
    """dvc_gen of a single file must yield exactly one Stage with one output
    carrying the file's md5, no deps, no command, and a fixed stage md5."""
    [stage] = tmp_dir.dvc_gen({"foo": "foo"})
    expected_md5, _ = file_md5("foo")

    assert stage is not None

    assert isinstance(stage, Stage)
    assert os.path.isfile(stage.path)

    out_count = len(stage.outs)
    assert out_count == 1
    dep_count = len(stage.deps)
    assert dep_count == 0

    assert stage.cmd is None
    assert stage.outs[0].info["md5"] == expected_md5
    assert stage.md5 == "ee343f2482f53efffc109be83cc976ac"
Esempio n. 21
0
def test_get_state_record_for_inode(get_inode_mock, dvc_repo, repo_dir):
    """A record saved under an inode above MAX_INT must still be found by
    get_state_record_for_inode()."""
    state = State(dvc_repo, dvc_repo.config.config)

    # Pick an inode that _to_sqlite() has to transform.
    big_inode = state.MAX_INT + 2
    assert big_inode != state._to_sqlite(big_inode)

    foo_path = os.path.join(dvc_repo.root_dir, repo_dir.FOO)
    foo_md5 = file_md5(foo_path)[0]
    get_inode_mock.side_effect = mock_get_inode(big_inode)

    with state:
        state.save(PathInfo(foo_path), foo_md5)
        record = state.get_state_record_for_inode(big_inode)
        assert record is not None
Esempio n. 22
0
    def test_transforms_inode(self, get_inode_mock):
        """A record saved under an inode above MAX_INT must still be found
        by get_state_record_for_inode()."""
        state = State(self.dvc, self.dvc.config.config)

        # Pick an inode that _to_sqlite() has to transform.
        big_inode = state.MAX_INT + 2
        self.assertNotEqual(big_inode, state._to_sqlite(big_inode))

        foo_path = os.path.join(self.dvc.root_dir, self.FOO)
        foo_md5 = file_md5(foo_path)[0]
        get_inode_mock.side_effect = self.mock_get_inode(foo_path, big_inode)

        with state:
            state.save({"scheme": "local", "path": foo_path}, foo_md5)
            record = state.get_state_record_for_inode(big_inode)
            self.assertIsNotNone(record)
Esempio n. 23
0
    def save(self):
        """Save this output into the cache.

        After the parent's save(), nothing more is done when caching is
        disabled or when the output has not changed. If the cache entry
        already exists the output is checked out from it; otherwise a single
        file is hardlinked into the cache, and a directory is walked file by
        file.

        Raises:
            CmdOutputAlreadyTrackedError: if the SCM already tracks the path.
        """
        super(Output, self).save()

        if not self.use_cache:
            return

        self.project.logger.debug("Saving {} to {}".format(
            self.path, self.cache))

        if self.project.scm.is_tracked(self.path):
            raise CmdOutputAlreadyTrackedError(self.rel_path)

        if not self.changed():
            return

        if os.path.exists(self.cache):
            # This means that we already have cache for this data.
            # We remove data and link it to existing cache to save
            # some space.
            msg = "Cache {} already exists, performing checkout for {}"
            self.project.logger.debug(msg.format(self.cache, self.path))
            self.checkout()
            return

        # Single file: link it straight into the cache.
        if os.path.isfile(self.path):
            self.hardlink(self.path, self.cache)
            return

        # Otherwise walk self.path and handle each file individually.
        for root, dirs, files in os.walk(self.path):
            for fname in files:
                path = os.path.join(root, fname)
                relpath = os.path.relpath(path, self.path)
                md5 = file_md5(path)[0]
                # Per-file cache location looked up by the file's md5.
                cache = self.project.cache.get(md5)
                # Info file under self.cache mirroring the file's relative
                # path; it receives the file's md5 below.
                cache_info = os.path.join(self.cache, relpath)
                cache_dir = os.path.dirname(cache_info)

                if os.path.exists(cache):
                    # Already cached: replace the workspace file with a
                    # link to the existing cache entry.
                    self._remove(path, None)
                    self.hardlink(cache, path)
                else:
                    self.hardlink(path, cache)

                if not os.path.exists(cache_dir):
                    os.makedirs(cache_dir)

                with open(cache_info, 'w') as fd:
                    yaml.safe_dump({self.PARAM_MD5: md5},
                                   fd,
                                   default_flow_style=False)
Esempio n. 24
0
def test_hashed_stream_reader_as_chunks(tmp_dir):
    """Reading a file containing NUL bytes in 16-byte chunks must not flag
    it as text and must produce the same digest as file_md5."""
    tmp_dir.gen({"foo": b"foo \x00" * 16})

    foo = tmp_dir / "foo"
    with open(foo, "rb") as fobj:
        stream_reader = HashedStreamReader(fobj)
        # Drain the stream; stop at the first empty read.
        while stream_reader.read(16):
            pass

    expected_digest = file_md5(foo, LocalFileSystem(None, {}))
    assert not stream_reader.is_text_file
    assert expected_digest == stream_reader.hash_info.value
Esempio n. 25
0
def test_state(tmp_dir, dvc):
    """A saved md5 is returned by state.get() until the file is replaced;
    after re-saving, the new md5 is returned."""
    tmp_dir.gen("foo", "foo content")
    path = tmp_dir / "foo"
    path_info = PathInfo(path)
    expected = file_md5(path)[0]

    state = State(dvc, dvc.config.config)

    with state:
        state.save(path_info, expected)
        assert state.get(path_info) == expected

        # Replace the file; the saved entry must no longer be returned.
        path.unlink()
        path.write_text("1")

        assert state.get(path_info) is None

        expected = file_md5(path)[0]
        state.save(path_info, expected)

        assert state.get(path_info) == expected
Esempio n. 26
0
File: tree.py Progetto: bgheneti/dvc
    def get_file_hash(self, path_info):
        """Return the checksum of the file at ``path_info``.

        When the path exists in the DVC tree, that tree's hash is returned.
        If the DVC tree raises OutputNotFoundError, or the path is not in
        it, the first element of ``file_md5(path_info, self)`` is returned.

        Raises:
            FileNotFoundError: if ``path_info`` does not exist in this tree.
        """
        if not self.exists(path_info):
            raise FileNotFoundError

        dvctree = self.dvctree
        if dvctree and dvctree.exists(path_info):
            try:
                return dvctree.get_file_hash(path_info)
            except OutputNotFoundError:
                # Not a DVC output after all; compute the md5 below.
                pass

        return file_md5(path_info, self)[0]
Esempio n. 27
0
    def test(self):
        """Adding FOO must yield exactly one Stage with one output carrying
        FOO's md5, no deps, no command, and a fixed stage md5."""
        expected_md5 = file_md5(self.FOO)[0]

        added = self.dvc.add(self.FOO)
        self.assertEqual(len(added), 1)
        stage = added[0]
        self.assertTrue(stage is not None)

        self.assertIsInstance(stage, Stage)
        self.assertTrue(os.path.isfile(stage.path))

        out_count = len(stage.outs)
        self.assertEqual(out_count, 1)
        dep_count = len(stage.deps)
        self.assertEqual(dep_count, 0)

        self.assertEqual(stage.cmd, None)
        self.assertEqual(stage.outs[0].info["md5"], expected_md5)
        self.assertEqual(stage.md5, "ee343f2482f53efffc109be83cc976ac")
Esempio n. 28
0
def test_state(dvc_repo, repo_dir):
    """A saved md5 is returned by state.get() until the file is replaced;
    after re-saving, the new md5 is returned."""
    path = os.path.join(dvc_repo.root_dir, repo_dir.FOO)
    path_info = PathInfo(path)
    expected = file_md5(path)[0]

    state = State(dvc_repo, dvc_repo.config.config)

    with state:
        state.save(path_info, expected)
        assert state.get(path_info) == expected

        # Replace the file; the saved entry must no longer be returned.
        os.unlink(path)
        with open(path, "a") as fobj:
            fobj.write("1")

        assert state.get(path_info) is None

        expected = file_md5(path)[0]
        state.save(path_info, expected)

        assert state.get(path_info) == expected
Esempio n. 29
0
def test_get_state_record_for_inode(get_inode_mock, tmp_dir, dvc):
    """A record saved under an inode above MAX_INT must still be found by
    get_state_record_for_inode()."""
    tmp_dir.gen("foo", "foo content")

    state = State(dvc, dvc.config.config)

    # Pick an inode that _to_sqlite() has to transform.
    big_inode = state.MAX_INT + 2
    assert big_inode != state._to_sqlite(big_inode)

    foo = tmp_dir / "foo"
    foo_md5 = file_md5(foo)[0]
    get_inode_mock.side_effect = mock_get_inode(big_inode)

    with state:
        state.save(PathInfo(foo), foo_md5)
        record = state.get_state_record_for_inode(big_inode)
        assert record is not None
Esempio n. 30
0
    def test_update(self):
        """A saved md5 is returned by state.get() until the file is replaced;
        after re-saving, the new md5 is returned."""
        path = os.path.join(self.dvc.root_dir, self.FOO)
        path_info = {"scheme": "local", "path": path}
        expected = file_md5(path)[0]

        state = State(self.dvc, self.dvc.config.config)

        with state:
            state.save(path_info, expected)
            self.assertEqual(state.get(path_info), expected)

            # Replace the file; the saved entry must no longer be returned.
            os.unlink(path)
            with open(path, "a") as fobj:
                fobj.write("1")

            stale = state.get(path_info)
            self.assertTrue(stale is None)

            expected = file_md5(path)[0]
            state.save(path_info, expected)

            self.assertEqual(state.get(path_info), expected)