Exemple #1
0
def get(url, path, out=None, rev=None):
    """Check out the output found at `path` in the repo at `url` into `out`.

    Args:
        url: passed to ``external_repo`` as the repo location.
        path: relative path used to look up the output in that repo.
        out: destination path; when falsy, the basename of `path` is used.
        rev: passed to ``external_repo`` as the revision.
    """
    if not out:
        out = os.path.basename(urlparse(path).path)

    # The temporary cache directory is created right next to the output so
    # that both live on the same filesystem, which lets reflink and/or
    # hardlink work. tempfile.TemporaryDirectory is avoided on purpose: it
    # will create a symlink to tmpfs, which defeats the purpose and won't
    # work with reflink/hardlink.
    out_dir = os.path.dirname(os.path.abspath(out))
    tmp_dir = os.path.join(out_dir, "." + str(shortuuid.uuid()))
    try:
        with external_repo(cache_dir=tmp_dir, url=url, rev=rev) as repo:
            # Use whichever link type is available to avoid duplicating
            # data.
            #
            # Symlinks are excluded: the cache is removed once we are done,
            # so the data would have to be copied over before that anyway.
            #
            # A theoretical "move" link type is not an option either, since
            # the same cache file might be used a few times in a directory.
            link_types = "reflink,hardlink,copy"
            repo.config.set(
                Config.SECTION_CACHE, Config.SECTION_CACHE_TYPE, link_types
            )
            output = repo.find_out_by_relpath(path)
            repo.fetch(output.stage.path)
            output.path_info = PathInfo(os.path.abspath(out))
            with output.repo.state:
                output.checkout()
    finally:
        remove(tmp_dir)
Exemple #2
0
    def gc(self, checksum_infos):
        """Remove every cache entry whose md5 is not in use.

        Args:
            checksum_infos: handed to ``self._collect``; the first element of
                its result is iterated for entries keyed by
                ``self.PARAM_MD5``.
        """
        collected = self._collect(checksum_infos)[0]
        # A set keeps the membership test below O(1) per cache entry instead
        # of scanning a list for every md5 returned by self.all().
        used_md5s = {info[self.PARAM_MD5] for info in collected}

        for md5 in self.all():
            if md5 not in used_md5s:
                remove(self.get(md5))
Exemple #3
0
    def _checkout(self, path, md5):
        """Link the cache entry for `md5` into the workspace at `path`.

        When the cache entry is missing or reported as changed, `path` is
        removed and nothing is linked. Directory caches are linked entry by
        entry underneath `path`.
        """
        cache = self.get(md5)

        cache_usable = (
            cache and os.path.exists(cache) and not self._changed(md5)
        )
        if not cache_usable:
            if cache:
                warning = u'\'{}({})\': cache file not found'
                Logger.warn(warning.format(os.path.relpath(cache),
                                           os.path.relpath(path)))
            remove(path)
            return

        if os.path.exists(path):
            exists_msg = u'Data \'{}\' exists. Removing before checkout'
            Logger.debug(exists_msg.format(os.path.relpath(path)))
            remove(path)

        checkout_msg = u'Checking out \'{}\' with cache \'{}\''
        Logger.debug(checkout_msg.format(os.path.relpath(path),
                                         os.path.relpath(cache)))

        if self.is_dir_cache(cache):
            for relpath, entry_cache in self.dir_cache(cache).items():
                self.link(entry_cache, os.path.join(path, relpath))
        else:
            self.link(cache, path)
Exemple #4
0
    def checkout(self, path_info, checksum_info):
        """Link cached data described by `checksum_info` to `path_info['path']`.

        Skips with a warning when no cache path can be resolved. Existing
        data at the target is removed first. For a directory cache every
        entry is linked without dumping, and the link state is dumped once
        at the end.
        """
        path = path_info['path']
        md5 = checksum_info.get(self.PARAM_MD5, None)
        cache = self.get(md5)

        if not cache:
            skip_msg = 'No cache info for \'{}\'. Skipping checkout.'
            Logger.warn(skip_msg.format(os.path.relpath(path)))
            return

        if os.path.exists(path):
            exists_msg = u'Data \'{}\' exists. Removing before checkout'
            Logger.debug(exists_msg.format(os.path.relpath(path)))
            remove(path)

        checkout_msg = u'Checking out \'{}\' with cache \'{}\''
        Logger.debug(checkout_msg.format(os.path.relpath(path), md5))

        if not self.is_dir_cache(cache):
            self.link(md5, path, dump=True)
            return

        # The directory is created explicitly so that it exists even when
        # the directory cache lists no files.
        if not os.path.exists(path):
            os.makedirs(path)

        for entry in self.load_dir_cache(cache):
            entry_md5 = entry[self.PARAM_MD5]
            target = os.path.join(path, entry[self.PARAM_RELPATH])
            self.link(entry_md5, target, dump=False)
        self.link_state.dump()
Exemple #5
0
    def remove_unused_links(self, used):
        """Delete recorded links that are not in `used` and still untouched.

        A recorded link is removed from disk only when its current inode and
        mtime match the stored values; its row is then deleted from the link
        state table.

        Args:
            used (list): list of used links that should not be removed.
        """
        stale = []

        self._execute("SELECT * FROM {}".format(self.LINK_STATE_TABLE))
        for relpath, stored_inode, stored_mtime in self.cursor:
            stored_inode = self._from_sqlite(stored_inode)
            path = os.path.join(self.root_dir, relpath)

            if path in used or not os.path.exists(path):
                continue

            current_inode = get_inode(path)
            current_mtime, _ = get_mtime_and_size(path, self.repo.dvcignore)

            untouched = (
                stored_inode == current_inode
                and stored_mtime == current_mtime
            )
            if not untouched:
                continue

            logger.debug("Removing '{}' as unused link.".format(path))
            remove(path)
            stale.append(relpath)

        # Rows are deleted in chunks bounded by SQLITE_MAX_VARIABLES_NUMBER
        # placeholders per statement.
        chunks = to_chunks(stale, chunk_size=SQLITE_MAX_VARIABLES_NUMBER)
        for chunk in chunks:
            placeholders = ",".join(["?"] * len(chunk))
            cmd = "DELETE FROM {} WHERE path IN ({})".format(
                self.LINK_STATE_TABLE, placeholders)
            self._execute(cmd, tuple(chunk))
Exemple #6
0
def _remove(path):
    """Remove `path`; on Windows the removal is wrapped with ``retry``."""
    if os.name != "nt":
        remove(path)
        return

    # git.exe may hang for a while not permitting to remove temp dir
    remove_with_retry = retry(5, errors=OSError, timeout=0.1)(remove)
    remove_with_retry(path)
Exemple #7
0
    def _save_dir(self, path_info):
        """Move or link every file of a directory into the cache.

        For each entry reported by ``self.state.update_info``: when the
        cache for its checksum is reported as changed the file is moved into
        the cache, otherwise the workspace file is removed; in both cases
        the cache file is then linked back to the workspace path. A progress
        target is updated for directories larger than ``LARGE_DIR_SIZE``.

        Returns:
            dict: ``{self.PARAM_CHECKSUM: md5}`` for the directory.
        """
        path = path_info["path"]
        md5, dir_info = self.state.update_info(path)
        dir_relpath = os.path.relpath(path)
        total = len(dir_info)
        show_progress = total > LARGE_DIR_SIZE

        logger.info("Linking directory '{}'.".format(dir_relpath))

        for index, entry in enumerate(dir_info):
            entry_relpath = entry[self.PARAM_RELPATH]
            checksum = entry[self.PARAM_CHECKSUM]
            workspace_file = os.path.join(path, entry_relpath)
            cache_file = self.get(checksum)

            if self.changed_cache(checksum):
                self._move(workspace_file, cache_file)
            else:
                remove(workspace_file)

            self.link(cache_file, workspace_file)

            if show_progress:
                progress.update_target(dir_relpath, index, total)

        self.state.update_link(path)

        if show_progress:
            progress.finish_target(dir_relpath)

        return {self.PARAM_CHECKSUM: md5}
Exemple #8
0
    def checkout(self, path_info, checksum_info, dump=True):
        """Link cached data described by `checksum_info` to `path_info['path']`.

        When the cache entry is missing or reported as changed, the target
        path is removed and nothing is linked. `dump` is forwarded to every
        ``self.link`` call.
        """
        path = path_info['path']
        md5 = checksum_info.get(self.PARAM_MD5, None)
        cache = self.get(md5)

        cache_usable = (
            cache and os.path.exists(cache) and not self.changed(md5)
        )
        if not cache_usable:
            if cache:
                warning = u'\'{}({})\': cache file not found'
                Logger.warn(warning.format(os.path.relpath(cache),
                                           os.path.relpath(path)))
            remove(path)
            return

        if os.path.exists(path):
            exists_msg = u'Data \'{}\' exists. Removing before checkout'
            Logger.debug(exists_msg.format(os.path.relpath(path)))
            remove(path)

        if not self.is_dir_cache(cache):
            self.link(cache, path, dump=dump)
            return

        dir_msg = u'Checking out directory \'{}\' with cache \'{}\''
        Logger.debug(dir_msg.format(os.path.relpath(path),
                                    os.path.relpath(cache)))

        # The directory is created explicitly so that it exists even when
        # the directory cache lists no files.
        if not os.path.exists(path):
            os.makedirs(path)

        for relpath, entry_cache in self.dir_cache(cache).items():
            self.link(entry_cache, os.path.join(path, relpath), dump=dump)
Exemple #9
0
    def remove_unused_links(self, used):
        """Removes all saved links except the ones that are used.

        A saved link is removed from disk only when its current inode and
        mtime still match the stored values; its row is then deleted from
        the link state table.

        Args:
            used (list): list of used links that should not be removed.
        """
        unused = []

        self._execute("SELECT * FROM {}".format(self.LINK_STATE_TABLE))
        for row in self.cursor:
            relpath, inode, mtime = row
            inode = self._from_sqlite(inode)
            path = os.path.join(self.root_dir, relpath)

            if path in used:
                continue

            if not os.path.exists(path):
                continue

            actual_inode = self._inode(path)
            actual_mtime, _ = self._mtime_and_size(path)

            if inode == actual_inode and mtime == actual_mtime:
                logger.debug("Removing '{}' as unused link.".format(path))
                remove(path)
                unused.append(relpath)

        # The path is bound as a query parameter instead of being formatted
        # into the SQL text, so paths containing quotes can neither break
        # the statement nor inject SQL.
        # NOTE(review): this uses self.cursor directly because self._execute
        # is only seen here taking a single SQL string - confirm that
        # bypassing it skips nothing essential.
        cmd = "DELETE FROM {} WHERE path = ?".format(self.LINK_STATE_TABLE)
        for relpath in unused:
            self.cursor.execute(cmd, (relpath,))
Exemple #10
0
    def remove_unused_links(self, used):
        """Remove saved links that are not in `used` and are still untouched.

        A saved link is removed from disk only when its current inode and
        mtime still match the stored values; its row is then deleted from
        the link state table.

        Args:
            used: collection of paths that must be kept.
        """
        unused = []

        self._execute('SELECT * FROM {}'.format(self.LINK_STATE_TABLE))
        for row in self.c:
            p, i, m = row
            i = self._from_sqlite(i)
            path = os.path.join(self.root_dir, p)

            if path in used:
                continue

            if not os.path.exists(path):
                continue

            inode = self.inode(path)
            mtime = self.mtime(path)

            if i == inode and m == mtime:
                Logger.debug('Removing \'{}\' as unused link.'.format(path))
                remove(path)
                unused.append(p)

        # The path is bound as a query parameter instead of being formatted
        # into the SQL text, so paths containing quotes can neither break
        # the statement nor inject SQL.
        # NOTE(review): this uses self.c directly because self._execute is
        # only seen here taking a single SQL string - confirm that bypassing
        # it skips nothing essential.
        cmd = 'DELETE FROM {} WHERE path = ?'.format(self.LINK_STATE_TABLE)
        for p in unused:
            self.c.execute(cmd, (p,))
Exemple #11
0
    def _unprotect_file(path):
        """Replace a linked file with a writable copy of itself.

        When `path` is a symlink or a hardlink (per ``System``), its content
        is copied to a temporary sibling file which then takes the place of
        the original. In every case the write bit (``stat.S_IWRITE``) is
        added to the mode of `path` at the end.

        Args:
            path: path of the file to unprotect.
        """
        if System.is_symlink(path) or System.is_hardlink(path):
            logger.debug("Unprotecting '{}'".format(path))
            # Hidden temporary name in the same directory as `path`.
            tmp = os.path.join(os.path.dirname(path), "." + str(uuid.uuid4()))

            # The operations order is important here - if some application
            # would access the file during the process of copyfile then it
            # would get only the part of file. So, at first, the file should be
            # copied with the temporary name, and then original file should be
            # replaced by new.
            copyfile(
                path,
                tmp,
                name="Unprotecting '{}'".format(os.path.relpath(path)),
            )
            remove(path)
            os.rename(tmp, path)

        else:
            logger.debug(
                "Skipping copying for '{}', since it is not "
                "a symlink or a hardlink.".format(path)
            )

        # Add the write permission bit on top of the current mode.
        os.chmod(path, os.stat(path).st_mode | stat.S_IWRITE)
Exemple #12
0
    def remove_unused(self, used):
        """Remove saved links that are not in `used` and are still untouched.

        A saved link is removed from disk only when its current inode and
        mtime still match the stored values; its row is then deleted from
        the state table. The cursor and the connection returned by
        ``self.load()`` are closed even when an error occurs.

        Args:
            used: collection of paths that must be kept.
        """
        unused = []

        db = self.load()
        try:
            c = db.cursor()
            try:
                c.execute('SELECT * FROM {}'.format(self.STATE_TABLE))
                for row in c:
                    p, i, m = row
                    path = os.path.join(self.root_dir, p)

                    if path in used:
                        continue

                    if not os.path.exists(path):
                        continue

                    inode = self.inode(path)
                    mtime = self.mtime(path)

                    if i == inode and m == mtime:
                        msg = 'Removing \'{}\' as unused link.'
                        Logger.debug(msg.format(path))
                        remove(path)
                        unused.append(p)

                db.commit()

                # The path is bound as a query parameter instead of being
                # formatted into the SQL text, so paths containing quotes
                # can neither break the statement nor inject SQL.
                cmd = 'DELETE FROM {} WHERE path = ?'.format(self.STATE_TABLE)
                for p in unused:
                    c.execute(cmd, (p,))
            finally:
                c.close()

            db.commit()
        finally:
            db.close()
Exemple #13
0
    def install(self, cache_dir=None, force=False):
        """Install the package into ``self.path`` via a temporary directory.

        Nothing is done when the package is already installed and `force` is
        false. If installing raises ``ExternalRepoError`` the temporary
        directory is removed and the error is re-raised. An existing
        installation is uninstalled before the new one is moved into place.

        Args:
            cache_dir: forwarded to ``self._install_to``.
            force: reinstall even when already installed.
        """
        if self.installed and not force:
            skip_msg = (
                "Skipping installing '{}'('{}') as it is already "
                "installed."
            )
            logger.info(skip_msg.format(self.name, self.url))
            return

        makedirs(self.repos_dir, exist_ok=True)

        # The package goes to a temporary directory first, until we are sure
        # that it has been installed correctly.
        #
        # tempfile.TemporaryDirectory can't be used here: it is using
        # symlinks to tmpfs, so we won't be able to use move properly.
        tmp_name = "." + str(shortuuid.uuid())
        tmp_dir = os.path.join(self.repos_dir, tmp_name)
        try:
            self._install_to(tmp_dir, cache_dir)
        except ExternalRepoError:
            if os.path.exists(tmp_dir):
                remove(tmp_dir)
            raise

        if self.installed:
            self.uninstall()

        shutil.move(tmp_dir, self.path)
Exemple #14
0
 def changed_cache_file(self, md5):
     """Return True when the cache file for `md5` is reported as changed.

     A changed cache file that still exists on disk is logged as corrupted
     and removed.
     """
     cache = self.get(md5)
     if not self.state.changed(cache, md5=md5):
         return False

     if os.path.exists(cache):
         warning = 'Corrupted cache file {}.'
         Logger.warn(warning.format(os.path.relpath(cache)))
         remove(cache)
     return True
Exemple #15
0
def _remove(repo):
    """Close the repo's SCM and remove its root directory.

    On Windows the removal is wrapped with ``retry``.
    """
    repo.scm.close()

    if os.name != "nt":
        remove(repo.root_dir)
        return

    # git.exe may hang for a while not permitting to remove temp dir
    remove_with_retry = retry(5, errors=OSError, timeout=0.1)(remove)
    remove_with_retry(repo.root_dir)
Exemple #16
0
    def uninstall(self):
        """Remove ``self.path``, or log and do nothing if not installed."""
        if self.installed:
            remove(self.path)
            return

        skip_msg = "Skipping uninstalling '{}' as it is not installed."
        logger.info(skip_msg.format(self.name))
Exemple #17
0
    def _changed(self, md5):
        """Return True when the cache file for `md5` is reported as changed.

        A changed cache file that still exists on disk is logged as
        corrupted and removed.
        """
        cache = self.get(md5)
        if not self.state.changed(cache, md5=md5):
            return False

        if os.path.exists(cache):
            warning = 'Corrupted cache file {}'
            Logger.warn(warning.format(os.path.relpath(cache)))
            remove(cache)
        return True
Exemple #18
0
def _make_repo(repo_url):
    """Yield a repo for `repo_url`.

    A falsy `repo_url`, or one without a URL scheme, is handed to ``Repo``
    directly. Otherwise an ``ExternalRepo`` is installed into a temporary
    directory, which is removed once the generator is finished.
    """
    has_scheme = bool(repo_url) and urlparse(repo_url).scheme != ""
    if not has_scheme:
        yield Repo(repo_url)
        return

    tmp_dir = tempfile.mkdtemp("dvc-repo")
    try:
        external = ExternalRepo(tmp_dir, url=repo_url)
        external.install()
        yield external.repo
    finally:
        remove(tmp_dir)
Exemple #19
0
    def gc(self, checksum_infos):
        """Remove every cache entry whose md5 is not in use.

        Args:
            checksum_infos: mapping whose ``'local'`` value is handed to
                ``self._collect``; the first element of its result is
                iterated for entries keyed by ``self.PARAM_MD5``.

        Returns:
            bool: True if at least one cache entry was removed.
        """
        collected = self._collect(checksum_infos['local'])[0]
        # A set keeps the membership test below O(1) per cache entry instead
        # of scanning a list for every md5 returned by self.all().
        used_md5s = {info[self.PARAM_MD5] for info in collected}

        removed = False
        for md5 in self.all():
            if md5 in used_md5s:
                continue
            remove(self.get(md5))
            removed = True

        return removed
Exemple #20
0
    def _safe_remove(self, file):
        """Remove `file` after the user confirms via the project prompt.

        Raises:
            DvcException: when the prompt is not confirmed.
        """
        question = ('File "{}" is going to be removed. '
                    'Are you sure you want to proceed?').format(file)

        if self.project.prompt.prompt(question, False):
            remove(file)
            return

        refusal = ('Unable to remove {} without a confirmation'
                   " from the user. Use '-f' to force.")
        raise DvcException(refusal.format(file))
Exemple #21
0
def get(url, path, out=None, rev=None):
    """Check out the output found at `path` in the repo at `url` into `out`.

    Args:
        url: passed to ``external_repo`` as the repo location.
        path: path of the output inside that repo; leading slashes are
            stripped before the lookup.
        out: destination, resolved through ``resolve_output``.
        rev: passed to ``external_repo`` as the revision.

    Raises:
        GetDVCFileError: when the resolved output is a valid stage filename.
        UrlNotDvcRepoError: when ``NotDvcRepoError`` is raised for `url`.
        OutputNotFoundError: when the output lookup fails; re-raised with
            `path`.
    """
    out = resolve_output(path, out)
    path = path.lstrip("/")

    if Stage.is_valid_filename(out):
        raise GetDVCFileError()

    # The temporary cache directory is created right next to the output so
    # that both live on the same filesystem, which lets reflink and/or
    # hardlink work. tempfile.TemporaryDirectory is avoided on purpose: it
    # will create a symlink to tmpfs, which defeats the purpose and won't
    # work with reflink/hardlink.
    out_dir = os.path.dirname(os.path.abspath(out))
    tmp_dir = os.path.join(out_dir, "." + str(shortuuid.uuid()))
    try:
        with external_repo(cache_dir=tmp_dir, url=url, rev=rev) as repo:
            # The state is replaced because, when getting a DVC dependency
            # on CIFS or NFS filesystems, the sqlite-based state will be
            # unable to obtain a lock.
            repo.state = StateNoop()

            # Use whichever link type is available to avoid duplicating
            # data.
            #
            # Symlinks are excluded: the cache is removed once we are done,
            # so the data would have to be copied over before that anyway.
            #
            # A theoretical "move" link type is not an option either, since
            # the same cache file might be used a few times in a directory.
            link_types = "reflink,hardlink,copy"
            repo.config.set(
                Config.SECTION_CACHE, Config.SECTION_CACHE_TYPE, link_types
            )

            output = repo.find_out_by_relpath(path)
            with repo.state:
                repo.cloud.pull(output.get_used_cache())
            output.path_info = PathInfo(os.path.abspath(out))
            with output.repo.state:
                output.checkout()

    except NotDvcRepoError:
        raise UrlNotDvcRepoError(url)
    except OutputNotFoundError:
        raise OutputNotFoundError(path)
    finally:
        remove(tmp_dir)
Exemple #22
0
def init(root_dir=os.curdir, no_scm=False, force=False):
    """
    Create the `Repo.DVC_DIR` directory inside `root_dir` and initialize
    its config.

    The directory should be tracked by a SCM unless `no_scm` is set.

    Args:
        root_dir: Path to repo's root directory.
        no_scm: Allow initializing a directory that is not tracked by any
            supported SCM.
        force: Remove an already existing DVC directory instead of failing.

    Returns:
        Repo instance.

    Raises:
        InitError: if no SCM is detected and `no_scm` is false, or if the
            DVC directory already exists and `force` is false.
    """
    root_dir = os.path.realpath(root_dir)
    dvc_dir = os.path.join(root_dir, Repo.DVC_DIR)
    scm = SCM(root_dir)

    if not no_scm and isinstance(scm, NoSCM):
        untracked_msg = (
            "{repo} is not tracked by any supported scm tool (e.g. git). "
            "Use '--no-scm' if you don't want to use any scm."
        )
        raise InitError(untracked_msg.format(repo=root_dir))

    if os.path.isdir(dvc_dir):
        if not force:
            exists_msg = "'{repo}' exists. Use '-f' to force."
            raise InitError(exists_msg.format(repo=relpath(dvc_dir)))

        remove(dvc_dir)

    os.mkdir(dvc_dir)

    config = Config.init(dvc_dir)
    new_repo = Repo(root_dir)

    scm.add([config.config_file])

    if scm.ignore_file:
        ignore_path = os.path.join(dvc_dir, scm.ignore_file)
        scm.add([ignore_path])
        logger.info("\nYou can now commit the changes to git.\n")

    _welcome_message()

    return new_repo
Exemple #23
0
def _unprotect_file(path):
    """Replace a linked file with a writable copy of itself.

    A symlink or hardlink is moved aside to a temporary sibling, copied back
    to `path`, and the temporary file is removed. In every case the write
    bit (``stat.S_IWRITE``) is added to the mode of `path` at the end.
    """
    is_link = System.is_symlink(path) or System.is_hardlink(path)
    if not is_link:
        logger.debug("Skipping copying for '{}', since it is not "
                     "a symlink or a hardlink.".format(path))
    else:
        logger.debug("Unprotecting '{}'".format(path))

        tmp_name = "." + str(uuid.uuid4())
        tmp = os.path.join(os.path.dirname(path), tmp_name)
        move(path, tmp)

        copyfile(tmp, path)

        remove(tmp)

    current_mode = os.stat(path).st_mode
    os.chmod(path, current_mode | stat.S_IWRITE)
Exemple #24
0
    def _do_remove_all(self):
        """Remove every recorded link that is still untouched and reset the db.

        A recorded path is removed only when it exists and its current inode
        and mtime equal the stored entry's values. ``self._db`` is replaced
        with an empty dict afterwards.
        """
        for relpath, raw_entry in self._db.items():
            path = os.path.join(self.root_dir, relpath)
            entry = LinkStateEntry.loadd(raw_entry)

            if os.path.exists(path):
                current_inode = self.inode(path)
                current_mtime = self.mtime(path)

                if (current_inode == entry.inode
                        and current_mtime == entry.mtime):
                    remove(path)

        self._db = {}
Exemple #25
0
    def _discard_working_directory_changes(self, path, dir_info, force=False):
        """Remove files under `path` that are not listed in `dir_info`.

        Files are removed directly when `force` is set or when
        ``self._already_cached`` accepts them; otherwise removal goes
        through ``self._safe_remove``.
        """
        on_disk = set()
        for root, _, names in os.walk(path):
            for name in names:
                on_disk.add(os.path.join(root, name))

        expected = set()
        for entry in dir_info:
            expected.add(os.path.join(path, entry['relpath']))

        for extra in on_disk - expected:
            if not (force or self._already_cached(extra)):
                self._safe_remove(extra)
            else:
                remove(extra)
Exemple #26
0
    def _unprotect_file(self, path):
        """Replace the file at `path` with a writable copy of itself.

        The file is moved aside to a temporary sibling, copied back to
        `path`, and the temporary file is removed. The write bit
        (``stat.S_IWRITE``) is then added to the mode of `path`.
        """
        import stat
        import uuid
        from dvc.utils import copyfile, move, remove

        self.logger.debug("Unprotecting '{}'".format(path))

        tmp_name = '.' + str(uuid.uuid4())
        tmp = os.path.join(os.path.dirname(path), tmp_name)

        move(path, tmp)
        copyfile(tmp, path)
        remove(tmp)

        current_mode = os.stat(path).st_mode
        os.chmod(path, current_mode | stat.S_IWRITE)
Exemple #27
0
    def _save_file(self, path_info):
        """Move or link a single file into the cache.

        When the cache for the file's md5 is reported as changed the file is
        moved into the cache, otherwise the workspace file is removed; the
        cache file is then linked back to the workspace path.

        Returns:
            dict: ``{self.PARAM_MD5: md5}`` for the file.
        """
        workspace_file = path_info['path']
        md5 = self.state.update(workspace_file)
        assert md5 is not None

        cache_file = self.get(md5)

        if not self.changed_cache(md5):
            remove(workspace_file)
        else:
            self._move(workspace_file, cache_file)

        self.link(cache_file, workspace_file)
        self.state.update_link(workspace_file)

        return {self.PARAM_MD5: md5}
Exemple #28
0
    def checkout(self, path_info, checksum_info):
        """Link cached data described by `checksum_info` to `path_info['path']`.

        Returns early when no cache path can be resolved, when the data is
        reported as unchanged, or when the cache is reported as changed (the
        target path is removed in that last case). Directory caches are
        linked entry by entry underneath the target path.
        """
        path = path_info['path']
        md5 = checksum_info.get(self.PARAM_MD5, None)
        cache = self.get(md5)

        if not cache:
            skip_msg = 'No cache info for \'{}\'. Skipping checkout.'
            Logger.warn(skip_msg.format(os.path.relpath(path)))
            return

        if not self.changed(path_info, checksum_info):
            same_msg = "Data '{}' didn't change."
            Logger.info(same_msg.format(os.path.relpath(path)))
            return

        if self.changed_cache(md5):
            missing_msg = (
                u'Cache \'{}\' not found. File \'{}\' won\'t be created.'
            )
            Logger.warn(missing_msg.format(md5, os.path.relpath(path)))
            remove(path)
            return

        if os.path.exists(path):
            exists_msg = u'Data \'{}\' exists. Removing before checkout.'
            Logger.warn(exists_msg.format(os.path.relpath(path)))
            remove(path)

        checkout_msg = u'Checking out \'{}\' with cache \'{}\'.'
        Logger.info(checkout_msg.format(os.path.relpath(path), md5))

        if not self.is_dir_cache(cache):
            self.link(cache, path)
            self.state.update_link(path)
            return

        # The directory is created explicitly so that it exists even when
        # the directory cache lists no files.
        if not os.path.exists(path):
            os.makedirs(path)

        for entry in self.load_dir_cache(md5):
            entry_cache = self.get(entry[self.PARAM_MD5])
            target = os.path.join(path, entry[self.PARAM_RELPATH])
            self.link(entry_cache, target)
        self.state.update_link(path)
Exemple #29
0
 def tearDown(self):
     """Leave the test directory and remove it.

     On Windows an ``OSError`` with ``winerror == 32`` is downgraded to a
     warning; any other ``OSError`` propagates.
     """
     self._popd()
     try:
         remove(self._root_dir)
     except OSError as exc:
         # This is ignored under Windows with a warning because it proved
         # really hard to trace all not properly closed files.
         #
         # Best guess so far is that gitpython is the culprit: it opens
         # files and uses __del__ to close them, which can happen late in
         # current pythons. TestGitFixture and TestDvcFixture try to close
         # that and it works on most of the tests, but not all. Repos and
         # thus git repos are created all over the dvc ;)
         ignorable = os.name == "nt" and exc.winerror == 32
         if not ignorable:
             raise
         warnings.warn("Failed to remove test dir: " + str(exc))
Exemple #30
0
    def uninstall(self):
        """Remove ``self.path``, or log and do nothing if not installed.

        If a ``repo`` attribute has already been set on the instance, its
        git handle is closed first. On Windows the removal is wrapped with
        ``retry``.
        """
        if not self.installed:
            skip_msg = "Skipping uninstalling '{}' as it is not installed."
            logger.info(skip_msg.format(self.name))
            return

        # If repo has been initialized then we need to close its git repo
        if "repo" in self.__dict__:
            self.repo.scm.git.close()

        if os.name != "nt":
            remove(self.path)
            return

        # git.exe may hang for a while not permitting to remove temp dir
        remove_with_retry = retry(5, errors=OSError, timeout=0.1)(remove)
        remove_with_retry(self.path)