Exemple #1
0
class TestLocalFileSystem(TestDir):
    """Checks for the read and query methods of ``LocalFileSystem``."""

    def setUp(self):
        # Run the base class set-up first, then create the filesystem
        # instance shared by every test below.
        super().setUp()
        self.fs = LocalFileSystem(None, {})

    def test_open(self):
        # File opened with the default arguments.
        with self.fs.open(self.FOO) as plain_file:
            content = plain_file.read()
            self.assertEqual(content, self.FOO_CONTENTS)

        # File opened with an explicit utf-8 encoding.
        with self.fs.open(self.UNICODE, encoding="utf-8") as unicode_file:
            content = unicode_file.read()
            self.assertEqual(content, self.UNICODE_CONTENTS)

    def test_exists(self):
        # Both fixture files are reported as existing ...
        for present in (self.FOO, self.UNICODE):
            self.assertTrue(self.fs.exists(present))

        # ... while an unknown name is not.
        missing = "not-existing-file"
        self.assertFalse(self.fs.exists(missing))

    def test_isdir(self):
        # Only the directory counts as a directory.
        self.assertTrue(self.fs.isdir(self.DATA_DIR))

        for not_a_dir in (self.FOO, "not-existing-file"):
            self.assertFalse(self.fs.isdir(not_a_dir))

    def test_isfile(self):
        # Only the regular file counts as a file.
        self.assertTrue(self.fs.isfile(self.FOO))

        for not_a_file in (self.DATA_DIR, "not-existing-file"):
            self.assertFalse(self.fs.isfile(not_a_file))
Exemple #2
0
def test_staging_file(tmp_dir, dvc):
    """Stage a single file, then move it into the local object database.

    After staging, the object must exist (and pass ``check``) only in the
    staging ODB.  After ``transfer(..., move=True)`` it must pass ``check``
    only in the local ODB, and its path must exist on the filesystem.
    """
    from dvc.objects import check
    from dvc.objects.stage import stage
    from dvc.objects.transfer import transfer

    tmp_dir.gen("foo", "foo")
    local_fs = LocalFileSystem()

    cache = dvc.odb.local
    staging, staged = stage(cache, tmp_dir / "foo", local_fs, "md5")

    # Right after staging the object lives in the staging ODB only.
    assert not cache.exists(staged.hash_info)
    assert staging.exists(staged.hash_info)

    with pytest.raises(FileNotFoundError):
        check(cache, staged)
    check(staging, staged)

    # Moving the object flips which ODB passes the check.
    transfer(staging, cache, {staged.hash_info}, move=True)
    check(cache, staged)
    with pytest.raises(FileNotFoundError):
        check(staging, staged)

    assert local_fs.exists(cache.hash_to_path_info(staged.hash_info.value))
Exemple #3
0
def test_staging_dir(tmp_dir, dvc):
    """Stage a directory, then transfer it into the local object database.

    After staging, the object must exist (and pass ``check``) only in the
    staging ODB.  After a non-shallow, hardlinking ``transfer`` it must pass
    ``check`` in both ODBs, and its path must exist on the filesystem.
    """
    from dvc.data import check
    from dvc.data.stage import stage
    from dvc.data.transfer import transfer

    tmp_dir.gen({"dir": {"foo": "foo", "bar": "bar"}})
    local_fs = LocalFileSystem()
    cache = dvc.odb.local

    dir_path = (tmp_dir / "dir").fs_path
    staging, _, staged = stage(cache, dir_path, local_fs, "md5")

    # Right after staging the object lives in the staging ODB only.
    assert not cache.exists(staged.hash_info)
    assert staging.exists(staged.hash_info)

    with pytest.raises(FileNotFoundError):
        check(cache, staged)
    check(staging, staged)

    # After the transfer both ODBs must pass the check.
    transfer(
        staging,
        cache,
        {staged.hash_info},
        shallow=False,
        hardlink=True,
    )
    check(cache, staged)
    check(staging, staged)

    assert local_fs.exists(cache.hash_to_path(staged.hash_info.value))
Exemple #4
0
def test_local_fs_exists(tmp_dir):
    """``LocalFileSystem.exists`` on ASCII, non-ASCII and missing names."""
    script = (
        "import sys\nimport shutil\n"
        "shutil.copyfile(sys.argv[1], sys.argv[2])"
    )
    nested_dir = {
        "data": "data",
        "data_sub_dir": {"data_sub": "data_sub"},
    }
    layout = {
        "foo": "foo",
        "bar": "bar",
        "тест": "проверка",
        "code.py": script,
        "data_dir": nested_dir,
    }
    tmp_dir.gen(layout)

    local_fs = LocalFileSystem()

    # Generated files are found, including the non-ASCII name.
    for generated in ("foo", "тест"):
        assert local_fs.exists(generated)

    # A name that was never generated is reported as missing.
    assert not local_fs.exists("not-existing-file")
Exemple #5
0
class State(StateBase):  # pylint: disable=too-many-instance-attributes
    """State database backed by ``diskcache.Cache`` stores.

    When ``tmp_dir`` is given, two caches are created under it:

    * ``md5s`` maps an inode to ``(mtime, size, hash value)``.
    * ``links`` maps a path relative to ``root_dir`` to ``(inode, mtime)``.

    Without ``tmp_dir`` no cache is created and both attributes are ``None``.

    Args:
        root_dir (str): directory that link paths are stored relative to.
        tmp_dir (str): directory in which the caches are created.
    """

    def __init__(self, root_dir=None, tmp_dir=None):
        from diskcache import Cache

        super().__init__()

        self.tmp_dir = tmp_dir
        self.root_dir = root_dir
        self.fs = LocalFileSystem(None, {"url": self.root_dir})

        # Always define the cache attributes, so that close() does not fail
        # with AttributeError on an instance created without a tmp_dir.
        self.links = None
        self.md5s = None

        if not tmp_dir:
            return

        config = {"eviction_policy": "least-recently-used"}
        self.links = Cache(directory=os.path.join(tmp_dir, "links"), **config)
        self.md5s = Cache(directory=os.path.join(tmp_dir, "md5s"), **config)

    def close(self):
        """Close the caches that were opened in ``__init__``, if any."""
        for cache in (self.md5s, self.links):
            if cache is not None:
                cache.close()

    def save(self, path_info, fs, hash_info):
        """Save hash for the specified path info.

        Does nothing unless ``fs`` is a ``LocalFileSystem``.

        Args:
            path_info (str or path info with ``scheme == "local"``): existing
                path to save the hash for.
            fs: filesystem the path belongs to.
            hash_info (HashInfo): hash to save.
        """

        if not isinstance(fs, LocalFileSystem):
            return

        assert isinstance(path_info, str) or path_info.scheme == "local"
        assert hash_info
        assert isinstance(hash_info, HashInfo)
        assert os.path.exists(path_info)

        mtime, size = get_mtime_and_size(path_info, self.fs)
        inode = get_inode(path_info)

        logger.debug("state save (%s, %s, %s) %s", inode, mtime, size,
                     hash_info.value)

        # Entries are keyed by inode; mtime and size are stored alongside the
        # hash so that get() can detect a stale entry.
        self.md5s[inode] = (mtime, size, hash_info.value)

    def get(self, path_info, fs):
        """Gets the hash for the specified path info. Hash will be
        retrieved from the state database if available.

        Args:
            path_info (str or path info with ``scheme == "local"``): path to
                get the hash for.
            fs: filesystem the path belongs to.

        Returns:
            HashInfo or None: hash for the specified path info or None if
            ``fs`` is not a ``LocalFileSystem``, the path doesn't exist, or
            no entry with a matching mtime and size is stored for its inode.
        """
        if not isinstance(fs, LocalFileSystem):
            return None

        assert isinstance(path_info, str) or path_info.scheme == "local"
        path = os.fspath(path_info)

        # NOTE: use os.path.exists instead of LocalFileSystem.exists
        # because it uses lexists() and will return True for broken
        # symlinks that we cannot stat() in get_mtime_and_size
        if not os.path.exists(path):
            return None

        mtime, size = get_mtime_and_size(path, self.fs)
        inode = get_inode(path)

        value = self.md5s.get(inode)

        # A missing entry, or one saved for a different mtime/size, is
        # treated as "no hash known".
        if not value or value[0] != mtime or value[1] != size:
            return None

        return HashInfo("md5", value[2], size=int(size))

    def save_link(self, path_info, fs):
        """Adds the specified path to the list of links created by dvc. This
        list is later used on `dvc checkout` to cleanup old links.

        Does nothing unless ``fs`` is a ``LocalFileSystem`` and the path
        exists.

        Args:
            path_info (str or path info with ``scheme == "local"``): path to
                add to the list of links.
            fs: filesystem the path belongs to.
        """
        if not isinstance(fs, LocalFileSystem):
            return

        assert isinstance(path_info, str) or path_info.scheme == "local"

        if not self.fs.exists(path_info):
            return

        mtime, _ = get_mtime_and_size(path_info, self.fs)
        inode = get_inode(path_info)
        # Links are keyed by their path relative to root_dir.
        relative_path = relpath(path_info, self.root_dir)

        with self.links as ref:
            ref[relative_path] = (inode, mtime)

    def get_unused_links(self, used, fs):
        """Collects the saved links that are not in use anymore.

        A saved link is reported when its path is not in ``used``, still
        exists, and still has the inode and mtime that were recorded by
        ``save_link``. Nothing is removed here; see ``remove_links``.

        Args:
            used (list): list of used links that should not be reported.
            fs: filesystem the links belong to.

        Returns:
            list: paths of the unused links, relative to ``root_dir``. Empty
            when ``fs`` is not a ``LocalFileSystem``.
        """
        if not isinstance(fs, LocalFileSystem):
            # Return the same type as the regular code path, so callers can
            # always iterate over the result.
            return []

        unused = []

        with self.links as ref:
            for relative_path in ref:
                path = os.path.join(self.root_dir, relative_path)

                if path in used or not self.fs.exists(path):
                    continue

                inode = get_inode(path)
                mtime, _ = get_mtime_and_size(path, self.fs)

                if ref[relative_path] == (inode, mtime):
                    logger.debug("Removing '%s' as unused link.", path)
                    unused.append(relative_path)

        return unused

    def remove_links(self, unused, fs):
        """Removes the given links from disk and from the list of links.

        Does nothing unless ``fs`` is a ``LocalFileSystem``.

        Args:
            unused (list): paths relative to ``root_dir``, as returned by
                ``get_unused_links``.
            fs: filesystem the links belong to.
        """
        if not isinstance(fs, LocalFileSystem):
            return

        for path in unused:
            remove(os.path.join(self.root_dir, path))

        with self.links as ref:
            for path in unused:
                del ref[path]
Exemple #6
0
class State(StateBase):  # pylint: disable=too-many-instance-attributes
    """Class for the state database.

    The database is an sqlite file named ``STATE_FILE`` inside
    ``repo.tmp_dir``. It holds a table of file hashes keyed by inode, a
    one-row table with the number of stored hashes, and a table of links
    keyed by their path relative to ``repo.root_dir``.

    Args:
        repo (dvc.repo.Repo): repo instance that this state belongs to. The
            ``state`` section of ``repo.config`` may override ``row_limit``
            and ``row_cleanup_quota``.

    Raises:
        StateVersionTooNewError: thrown when dvc version is older than the
            state database version.
    """

    # Schema version, stored in the database as ``PRAGMA user_version``.
    VERSION = 3
    STATE_FILE = "state"
    STATE_TABLE = "state"
    STATE_TABLE_LAYOUT = (
        "inode INTEGER PRIMARY KEY, "
        "mtime TEXT NOT NULL, "
        "size TEXT NOT NULL, "
        "md5 TEXT NOT NULL, "
        "timestamp TEXT NOT NULL"
    )

    STATE_INFO_TABLE = "state_info"
    STATE_INFO_TABLE_LAYOUT = "count INTEGER"
    STATE_INFO_ROW = 1

    LINK_STATE_TABLE = "link_state"
    LINK_STATE_TABLE_LAYOUT = (
        "path TEXT PRIMARY KEY, "
        "inode INTEGER NOT NULL, "
        "mtime TEXT NOT NULL"
    )

    # Defaults for the "row_limit" and "row_cleanup_quota" config options;
    # the quota is a percentage of the row limit (see dump()).
    STATE_ROW_LIMIT = 100000000
    STATE_ROW_CLEANUP_QUOTA = 50

    # Bounds used by _to_sqlite()/_from_sqlite().
    MAX_INT = 2 ** 63 - 1
    MAX_UINT = 2 ** 64 - 2

    def __init__(self, repo):

        super().__init__()

        self.repo = repo
        self.root_dir = repo.root_dir
        self.fs = LocalFileSystem(None, {"url": self.root_dir})

        state_config = repo.config.get("state", {})
        self.row_limit = state_config.get("row_limit", self.STATE_ROW_LIMIT)
        self.row_cleanup_quota = state_config.get(
            "row_cleanup_quota", self.STATE_ROW_CLEANUP_QUOTA
        )

        # Define the connection attributes before the early return below, so
        # that they exist on every instance.
        self.database = None
        self.cursor = None
        self.inserts = 0

        if not repo.tmp_dir:
            self.state_file = None
            return

        self.state_file = os.path.join(repo.tmp_dir, self.STATE_FILE)

    def _execute(self, cmd, parameters=()):
        """Trace-log ``cmd`` and execute it on the current cursor."""
        logger.trace(cmd)
        return self.cursor.execute(cmd, parameters)

    def _fetchall(self):
        """Fetch all rows of the last query and trace-log them."""
        ret = self.cursor.fetchall()
        logger.trace("fetched: %s", ret)
        return ret

    def _to_sqlite(self, num):
        """Convert a non-negative int below ``MAX_UINT`` for storing.

        Values up to ``MAX_INT`` are returned as is, larger ones are mapped
        to negative numbers. ``_from_sqlite`` reverses the conversion.
        """
        assert num >= 0
        assert num < self.MAX_UINT
        # NOTE: sqlite stores uint as signed ints, so maximum uint is 2^63-1
        # see http://jakegoulding.com/blog/2011/02/06/sqlite-64-bit-integers/
        if num > self.MAX_INT:
            ret = -(num - self.MAX_INT)
        else:
            ret = num
        assert self._from_sqlite(ret) == num
        return ret

    def _from_sqlite(self, num):
        """Convert a value produced by ``_to_sqlite`` back to the original."""
        assert abs(num) <= self.MAX_INT
        if num < 0:
            return abs(num) + self.MAX_INT
        assert num < self.MAX_UINT
        assert num >= 0
        return num

    def _prepare_db(self, empty=False):
        """Check the schema version and make sure all tables exist.

        Args:
            empty (bool): True when the state file did not exist before
                connecting, in which case the version check is skipped.

        Raises:
            StateVersionTooNewError: if the stored version is newer than
                ``VERSION``.
        """
        from dvc import __version__

        if not empty:
            cmd = "PRAGMA user_version;"
            self._execute(cmd)
            ret = self._fetchall()
            assert len(ret) == 1
            assert len(ret[0]) == 1
            assert isinstance(ret[0][0], int)
            version = ret[0][0]

            if version > self.VERSION:
                raise StateVersionTooNewError(
                    __version__, self.VERSION, version
                )
            elif version < self.VERSION:
                logger.warning(
                    "State file version '%d' is too old. "
                    "Reformatting to the current version '%d'.",
                    version,
                    self.VERSION,
                )
                # An outdated state is not migrated: the tables are dropped
                # and recreated empty below.
                cmd = "DROP TABLE IF EXISTS {};"
                self._execute(cmd.format(self.STATE_TABLE))
                self._execute(cmd.format(self.STATE_INFO_TABLE))
                self._execute(cmd.format(self.LINK_STATE_TABLE))

        # Check that the state file is indeed a database
        cmd = "CREATE TABLE IF NOT EXISTS {} ({})"
        self._execute(cmd.format(self.STATE_TABLE, self.STATE_TABLE_LAYOUT))
        self._execute(
            cmd.format(self.STATE_INFO_TABLE, self.STATE_INFO_TABLE_LAYOUT)
        )
        self._execute(
            cmd.format(self.LINK_STATE_TABLE, self.LINK_STATE_TABLE_LAYOUT)
        )

        # Seed the counter row, but only if the info table is still empty.
        cmd = (
            "INSERT OR IGNORE INTO {} (count) SELECT 0 "
            "WHERE NOT EXISTS (SELECT * FROM {})"
        )
        self._execute(cmd.format(self.STATE_INFO_TABLE, self.STATE_INFO_TABLE))

        cmd = "PRAGMA user_version = {};"
        self._execute(cmd.format(self.VERSION))

    def load(self):
        """Loads state database.

        If the state file turns out not to be a valid database, it is
        deleted and loading is retried once; a second failure is re-raised.
        """
        from sqlite3 import DatabaseError

        retries = 1
        while True:
            assert self.database is None
            assert self.cursor is None
            assert self.inserts == 0
            empty = not os.path.exists(self.state_file)
            # NOTE: we use nolock option because fcntl() lock sqlite uses
            # doesn't work on some older NFS/CIFS filesystems.
            # This opens a possibility of data corruption by concurrent writes,
            # which is prevented by repo lock.
            self.database = _connect_sqlite(self.state_file, {"nolock": 1})
            self.cursor = self.database.cursor()

            # Try loading once to check that the file is indeed a database
            # and reformat it if it is not.
            try:
                self._prepare_db(empty=empty)
                return
            except DatabaseError:
                self.cursor.close()
                self.database.close()
                self.database = None
                self.cursor = None
                self.inserts = 0
                if retries > 0:
                    os.unlink(self.state_file)
                    retries -= 1
                else:
                    raise

    def _vacuum(self):
        """Run ``VACUUM`` with the isolation level temporarily unset."""
        # NOTE: see https://bugs.python.org/issue28518
        self.database.isolation_level = None
        self._execute("VACUUM")
        self.database.isolation_level = ""

    def dump(self):
        """Saves state database.

        Updates the stored row count, deleting the oldest hash records first
        if the count exceeds ``row_limit``, then commits and closes the
        connection.
        """
        assert self.database is not None

        cmd = "SELECT count from {} WHERE rowid=?".format(
            self.STATE_INFO_TABLE
        )
        self._execute(cmd, (self.STATE_INFO_ROW,))
        ret = self._fetchall()
        assert len(ret) == 1
        assert len(ret[0]) == 1
        count = self._from_sqlite(ret[0][0]) + self.inserts

        if count > self.row_limit:
            msg = "cleaning up state, this might take a while."
            logger.warning(msg)

            # Delete the overflow plus row_cleanup_quota percent of the
            # limit, starting with the oldest timestamps.
            delete = count - self.row_limit
            delete += int(self.row_limit * (self.row_cleanup_quota / 100.0))
            cmd = (
                "DELETE FROM {} WHERE timestamp IN ("
                "SELECT timestamp FROM {} ORDER BY timestamp ASC LIMIT {});"
            )
            self._execute(
                cmd.format(self.STATE_TABLE, self.STATE_TABLE, delete)
            )

            self._vacuum()

            # Recount, so that the stored counter matches the table.
            cmd = "SELECT COUNT(*) FROM {}"

            self._execute(cmd.format(self.STATE_TABLE))
            ret = self._fetchall()
            assert len(ret) == 1
            assert len(ret[0]) == 1
            count = ret[0][0]

        cmd = "UPDATE {} SET count = ? WHERE rowid = ?".format(
            self.STATE_INFO_TABLE
        )
        self._execute(cmd, (self._to_sqlite(count), self.STATE_INFO_ROW))

        self.database.commit()
        self.cursor.close()
        self.database.close()
        self.database = None
        self.cursor = None
        self.inserts = 0

    @staticmethod
    def _file_metadata_changed(actual_mtime, mtime, actual_size, size):
        """Return True if the actual mtime or size differs from the saved."""
        return actual_mtime != mtime or actual_size != size

    def _update_state_record_timestamp_for_inode(self, actual_inode):
        """Set the timestamp of the record for the inode to the current one."""
        cmd = "UPDATE {} SET timestamp = ? WHERE inode = ?".format(
            self.STATE_TABLE
        )
        self._execute(
            cmd, (current_timestamp(), self._to_sqlite(actual_inode))
        )

    def _update_state_for_path_changed(
        self, actual_inode, actual_mtime, actual_size, checksum
    ):
        """Overwrite mtime, size, hash and timestamp of an inode's record."""
        cmd = (
            "UPDATE {} SET "
            "mtime = ?, size = ?, "
            "md5 = ?, timestamp = ? "
            "WHERE inode = ?"
        ).format(self.STATE_TABLE)
        self._execute(
            cmd,
            (
                actual_mtime,
                actual_size,
                checksum,
                current_timestamp(),
                self._to_sqlite(actual_inode),
            ),
        )

    def _insert_new_state_record(
        self, actual_inode, actual_mtime, actual_size, checksum
    ):
        """Insert a record for the inode and count it in ``self.inserts``."""
        assert checksum is not None

        cmd = (
            "INSERT INTO {}(inode, mtime, size, md5, timestamp) "
            "VALUES (?, ?, ?, ?, ?)"
        ).format(self.STATE_TABLE)
        self._execute(
            cmd,
            (
                self._to_sqlite(actual_inode),
                actual_mtime,
                actual_size,
                checksum,
                current_timestamp(),
            ),
        )
        self.inserts += 1

    def get_state_record_for_inode(self, inode):
        """Return the ``(mtime, size, md5, timestamp)`` row stored for the
        inode, or None if there is no such row.
        """
        cmd = (
            "SELECT mtime, size, md5, timestamp from {} WHERE "
            "inode=?".format(self.STATE_TABLE)
        )
        self._execute(cmd, (self._to_sqlite(inode),))
        results = self._fetchall()
        if results:
            # uniqueness constraint on inode
            assert len(results) == 1
            return results[0]
        return None

    def save(self, path_info, fs, hash_info):
        """Save hash for the specified path info.

        Does nothing unless ``fs`` is a ``LocalFileSystem``.

        Args:
            path_info (str or path info with ``scheme == "local"``): existing
                path to save the hash for.
            fs: filesystem the path belongs to.
            hash_info (HashInfo): hash to save.
        """

        if not isinstance(fs, LocalFileSystem):
            return

        assert isinstance(path_info, str) or path_info.scheme == "local"
        assert hash_info
        assert isinstance(hash_info, HashInfo)
        assert os.path.exists(path_info)

        actual_mtime, actual_size = get_mtime_and_size(path_info, self.fs)
        actual_inode = get_inode(path_info)

        # Records are keyed by inode: insert one if the inode is unknown,
        # otherwise overwrite the existing record.
        existing_record = self.get_state_record_for_inode(actual_inode)
        if not existing_record:
            self._insert_new_state_record(
                actual_inode, actual_mtime, actual_size, hash_info.value
            )
            return

        self._update_state_for_path_changed(
            actual_inode, actual_mtime, actual_size, hash_info.value
        )

    def get(self, path_info, fs):
        """Gets the hash for the specified path info. Hash will be
        retrieved from the state database if available.

        A successful lookup also refreshes the timestamp of the record.

        Args:
            path_info (str or path info with ``scheme == "local"``): path to
                get the hash for.
            fs: filesystem the path belongs to.

        Returns:
            HashInfo or None: hash for the specified path info or None if
            ``fs`` is not a ``LocalFileSystem``, the path doesn't exist, or
            no record with a matching mtime and size is stored for its inode.
        """
        if not isinstance(fs, LocalFileSystem):
            return None

        assert isinstance(path_info, str) or path_info.scheme == "local"
        path = os.fspath(path_info)

        # NOTE: use os.path.exists instead of LocalFileSystem.exists
        # because it uses lexists() and will return True for broken
        # symlinks that we cannot stat() in get_mtime_and_size
        if not os.path.exists(path):
            return None

        actual_mtime, actual_size = get_mtime_and_size(path, self.fs)
        actual_inode = get_inode(path)

        existing_record = self.get_state_record_for_inode(actual_inode)
        if not existing_record:
            return None

        mtime, size, value, _ = existing_record
        if self._file_metadata_changed(actual_mtime, mtime, actual_size, size):
            return None

        self._update_state_record_timestamp_for_inode(actual_inode)
        return HashInfo("md5", value, size=int(actual_size))

    def save_link(self, path_info, fs):
        """Adds the specified path to the list of links created by dvc. This
        list is later used on `dvc checkout` to cleanup old links.

        Does nothing unless ``fs`` is a ``LocalFileSystem`` and the path
        exists.

        Args:
            path_info (str or path info with ``scheme == "local"``): path to
                add to the list of links.
            fs: filesystem the path belongs to.
        """
        if not isinstance(fs, LocalFileSystem):
            return

        assert isinstance(path_info, str) or path_info.scheme == "local"

        if not self.fs.exists(path_info):
            return

        mtime, _ = get_mtime_and_size(path_info, self.fs)
        inode = get_inode(path_info)
        # Links are keyed by their path relative to root_dir.
        relative_path = relpath(path_info, self.root_dir)

        cmd = "REPLACE INTO {}(path, inode, mtime) " "VALUES (?, ?, ?)".format(
            self.LINK_STATE_TABLE
        )
        self._execute(cmd, (relative_path, self._to_sqlite(inode), mtime))

    def get_unused_links(self, used, fs):
        """Collects the saved links that are not in use anymore.

        A saved link is reported when its path is not in ``used``, still
        exists, and still has the inode and mtime that were recorded by
        ``save_link``. Nothing is removed here; see ``remove_links``.

        Args:
            used (list): list of used links that should not be reported.
            fs: filesystem the links belong to.

        Returns:
            list: paths of the unused links, relative to ``root_dir``. Empty
            when ``fs`` is not a ``LocalFileSystem``.
        """
        if not isinstance(fs, LocalFileSystem):
            # Return the same type as the regular code path, so callers can
            # always iterate over the result.
            return []

        unused = []

        self._execute(f"SELECT * FROM {self.LINK_STATE_TABLE}")
        for row in self.cursor:
            relative_path, inode, mtime = row
            inode = self._from_sqlite(inode)
            path = os.path.join(self.root_dir, relative_path)

            if path in used or not self.fs.exists(path):
                continue

            actual_inode = get_inode(path)
            actual_mtime, _ = get_mtime_and_size(path, self.fs)

            if (inode, mtime) == (actual_inode, actual_mtime):
                logger.debug("Removing '%s' as unused link.", path)
                unused.append(relative_path)

        return unused

    def remove_links(self, unused, fs):
        """Removes the given links from disk and from the link table.

        Does nothing unless ``fs`` is a ``LocalFileSystem``.

        Args:
            unused (list): paths relative to ``root_dir``, as returned by
                ``get_unused_links``.
            fs: filesystem the links belong to.
        """
        if not isinstance(fs, LocalFileSystem):
            return

        for path in unused:
            # The stored paths are relative to root_dir, so resolve them
            # against it rather than against the current working directory.
            remove(os.path.join(self.root_dir, path))

        # Delete in chunks to stay within the maximum number of variables
        # allowed in a single statement.
        for chunk_unused in to_chunks(
            unused, chunk_size=SQLITE_MAX_VARIABLES_NUMBER
        ):
            cmd = "DELETE FROM {} WHERE path IN ({})".format(
                self.LINK_STATE_TABLE, ",".join(["?"] * len(chunk_unused))
            )
            self._execute(cmd, tuple(chunk_unused))