def get(url, path, out=None, rev=None): out = out or os.path.basename(urlparse(path).path) # Creating a directory right beside the output to make sure that they # are on the same filesystem, so we could take the advantage of # reflink and/or hardlink. Not using tempfile.TemporaryDirectory # because it will create a symlink to tmpfs, which defeats the purpose # and won't work with reflink/hardlink. dpath = os.path.dirname(os.path.abspath(out)) tmp_dir = os.path.join(dpath, "." + str(shortuuid.uuid())) try: with external_repo(cache_dir=tmp_dir, url=url, rev=rev) as repo: # Try any links possible to avoid data duplication. # # Not using symlink, because we need to remove cache after we are # done, and to make that work we would have to copy data over # anyway before removing the cache, so we might just copy it # right away. # # Also, we can't use theoretical "move" link type here, because # the same cache file might be used a few times in a directory. repo.config.set( Config.SECTION_CACHE, Config.SECTION_CACHE_TYPE, "reflink,hardlink,copy", ) o = repo.find_out_by_relpath(path) repo.fetch(o.stage.path) o.path_info = PathInfo(os.path.abspath(out)) with o.repo.state: o.checkout() finally: remove(tmp_dir)
def gc(self, checksum_infos):
    """Remove cache files whose md5 is not referenced by `checksum_infos`."""
    # Use a set for O(1) membership tests; the original list made the
    # `in` check below O(n) per cached md5, i.e. O(n*m) overall.
    used_md5s = {
        info[self.PARAM_MD5] for info in self._collect(checksum_infos)[0]
    }

    for md5 in self.all():
        if md5 in used_md5s:
            continue
        remove(self.get(md5))
def _checkout(self, path, md5):
    """Check `path` out of the cache entry identified by `md5`."""
    cache = self.get(md5)

    # Missing or stale cache: drop the workspace file and bail out.
    if not cache or not os.path.exists(cache) or self._changed(md5):
        if cache:
            Logger.warn(u'\'{}({})\': cache file not found'.format(
                os.path.relpath(cache), os.path.relpath(path)))
        remove(path)
        return

    if os.path.exists(path):
        Logger.debug(
            u'Data \'{}\' exists. Removing before checkout'.format(
                os.path.relpath(path)))
        remove(path)

    Logger.debug(u'Checking out \'{}\' with cache \'{}\''.format(
        os.path.relpath(path), os.path.relpath(cache)))

    if self.is_dir_cache(cache):
        # Directory cache: link every entry under `path`.
        for relpath, entry_cache in self.dir_cache(cache).items():
            self.link(entry_cache, os.path.join(path, relpath))
    else:
        self.link(cache, path)
def checkout(self, path_info, checksum_info):
    """Materialize `path_info` from the cache described by `checksum_info`."""
    path = path_info['path']
    md5 = checksum_info.get(self.PARAM_MD5, None)
    cache = self.get(md5)

    if not cache:
        Logger.warn('No cache info for \'{}\'. Skipping checkout.'.format(
            os.path.relpath(path)))
        return

    if os.path.exists(path):
        Logger.debug(
            u'Data \'{}\' exists. Removing before checkout'.format(
                os.path.relpath(path)))
        remove(path)

    Logger.debug(u'Checking out \'{}\' with cache \'{}\''.format(
        os.path.relpath(path), md5))

    if not self.is_dir_cache(cache):
        self.link(md5, path, dump=True)
        return

    # Create dir separately so that dir is created
    # even if there are no files in it
    if not os.path.exists(path):
        os.makedirs(path)

    for entry in self.load_dir_cache(cache):
        entry_md5 = entry[self.PARAM_MD5]
        target = os.path.join(path, entry[self.PARAM_RELPATH])
        self.link(entry_md5, target, dump=False)

    self.link_state.dump()
def remove_unused_links(self, used):
    """Removes all saved links except the ones that are used.

    Args:
        used (list): list of used links that should not be removed.
    """
    unused = []

    self._execute("SELECT * FROM {}".format(self.LINK_STATE_TABLE))
    for relpath, inode, mtime in self.cursor:
        inode = self._from_sqlite(inode)
        path = os.path.join(self.root_dir, relpath)

        if path in used or not os.path.exists(path):
            continue

        actual_inode = get_inode(path)
        actual_mtime, _ = get_mtime_and_size(path, self.repo.dvcignore)

        # Only remove files that are still exactly the links we created.
        if (inode, mtime) == (actual_inode, actual_mtime):
            logger.debug("Removing '{}' as unused link.".format(path))
            remove(path)
            unused.append(relpath)

    # Purge the forgotten links from the DB in batches, staying under the
    # sqlite bound-variable limit.
    for chunk_unused in to_chunks(
            unused, chunk_size=SQLITE_MAX_VARIABLES_NUMBER):
        cmd = "DELETE FROM {} WHERE path IN ({})".format(
            self.LINK_STATE_TABLE, ",".join(["?"] * len(chunk_unused)))
        self._execute(cmd, tuple(chunk_unused))
def _remove(path):
    """Delete `path`, retrying on Windows where git.exe may briefly lock it."""
    if os.name != "nt":
        remove(path)
        return

    # git.exe may hang for a while not permitting to remove temp dir
    retry(5, errors=OSError, timeout=0.1)(remove)(path)
def _save_dir(self, path_info):
    """Move/link every file of a directory output into the cache and
    return its checksum info."""
    path = path_info["path"]
    md5, dir_info = self.state.update_info(path)
    dir_relpath = os.path.relpath(path)
    dir_size = len(dir_info)
    use_bar = dir_size > LARGE_DIR_SIZE

    logger.info("Linking directory '{}'.".format(dir_relpath))

    for processed, entry in enumerate(dir_info):
        entry_path = os.path.join(path, entry[self.PARAM_RELPATH])
        entry_md5 = entry[self.PARAM_CHECKSUM]
        entry_cache = self.get(entry_md5)

        if self.changed_cache(entry_md5):
            # Cache entry missing/stale: adopt the workspace file.
            self._move(entry_path, entry_cache)
        else:
            # Cache already holds this content: drop the workspace copy.
            remove(entry_path)

        self.link(entry_cache, entry_path)

        if use_bar:
            progress.update_target(dir_relpath, processed, dir_size)

    self.state.update_link(path)

    if use_bar:
        progress.finish_target(dir_relpath)

    return {self.PARAM_CHECKSUM: md5}
def checkout(self, path_info, checksum_info, dump=True):
    """Recreate `path` from its cache entry, linking files when possible."""
    path = path_info['path']
    md5 = checksum_info.get(self.PARAM_MD5, None)
    cache = self.get(md5)

    # Missing or stale cache: drop the workspace copy and bail out.
    if not cache or not os.path.exists(cache) or self.changed(md5):
        if cache:
            Logger.warn(u'\'{}({})\': cache file not found'.format(
                os.path.relpath(cache), os.path.relpath(path)))
        remove(path)
        return

    if os.path.exists(path):
        Logger.debug(
            u'Data \'{}\' exists. Removing before checkout'.format(
                os.path.relpath(path)))
        remove(path)

    if not self.is_dir_cache(cache):
        self.link(cache, path, dump=dump)
        return

    Logger.debug(
        u'Checking out directory \'{}\' with cache \'{}\''.format(
            os.path.relpath(path), os.path.relpath(cache)))

    # Create dir separately so that dir is created
    # even if there are no files in it
    if not os.path.exists(path):
        os.makedirs(path)

    for relpath, entry_cache in self.dir_cache(cache).items():
        self.link(entry_cache, os.path.join(path, relpath), dump=dump)
def remove_unused_links(self, used):
    """Removes all saved links except the ones that are used.

    Args:
        used (list): list of used links that should not be removed.
    """
    unused = []

    self._execute("SELECT * FROM {}".format(self.LINK_STATE_TABLE))
    for row in self.cursor:
        relpath, inode, mtime = row
        inode = self._from_sqlite(inode)
        path = os.path.join(self.root_dir, relpath)

        if path in used:
            continue

        if not os.path.exists(path):
            continue

        actual_inode = self._inode(path)
        actual_mtime, _ = self._mtime_and_size(path)

        # Only remove files that are still exactly the links we created.
        if inode == actual_inode and mtime == actual_mtime:
            logger.debug("Removing '{}' as unused link.".format(path))
            remove(path)
            unused.append(relpath)

    for relpath in unused:
        # Use a bound parameter instead of interpolating the path into the
        # SQL text: a path containing a double quote would break the old
        # statement (and interpolation is injection-prone).
        cmd = "DELETE FROM {} WHERE path = ?".format(self.LINK_STATE_TABLE)
        self._execute(cmd, (relpath,))
def remove_unused_links(self, used):
    """Remove saved links that are not in `used` and forget them in the DB."""
    unused = []

    self._execute('SELECT * FROM {}'.format(self.LINK_STATE_TABLE))
    for row in self.c:
        p, i, m = row
        i = self._from_sqlite(i)
        path = os.path.join(self.root_dir, p)

        if path in used:
            continue

        if not os.path.exists(path):
            continue

        inode = self.inode(path)
        mtime = self.mtime(path)

        # Only remove files that are still exactly the links we created.
        if i == inode and m == mtime:
            Logger.debug('Removing \'{}\' as unused link.'.format(path))
            remove(path)
            unused.append(p)

    for p in unused:
        # Use a bound parameter instead of interpolating the path into the
        # SQL text: a path containing a double quote would break the old
        # statement (and interpolation is injection-prone).
        cmd = 'DELETE FROM {} WHERE path = ?'.format(self.LINK_STATE_TABLE)
        self._execute(cmd, (p,))
def _unprotect_file(path):
    """Turn a sym/hardlinked cache file at `path` into a standalone,
    writable copy."""
    is_linked = System.is_symlink(path) or System.is_hardlink(path)

    if not is_linked:
        logger.debug(
            "Skipping copying for '{}', since it is not "
            "a symlink or a hardlink.".format(path)
        )
        os.chmod(path, os.stat(path).st_mode | stat.S_IWRITE)
        return

    logger.debug("Unprotecting '{}'".format(path))

    tmp = os.path.join(os.path.dirname(path), "." + str(uuid.uuid4()))

    # The operations order is important here - if some application
    # would access the file during the process of copyfile then it
    # would get only the part of file. So, at first, the file should be
    # copied with the temporary name, and then original file should be
    # replaced by new.
    copyfile(
        path,
        tmp,
        name="Unprotecting '{}'".format(os.path.relpath(path)),
    )
    remove(path)
    os.rename(tmp, path)
    os.chmod(path, os.stat(path).st_mode | stat.S_IWRITE)
def remove_unused(self, used):
    """Remove saved links not present in `used` and purge them from the DB."""
    unused = []

    db = self.load()
    c = db.cursor()
    c.execute('SELECT * FROM {}'.format(self.STATE_TABLE))
    for row in c:
        p, i, m = row
        path = os.path.join(self.root_dir, p)

        if path in used:
            continue

        if not os.path.exists(path):
            continue

        inode = self.inode(path)
        mtime = self.mtime(path)

        # Only remove files that are still exactly the links we created.
        if i == inode and m == mtime:
            Logger.debug('Removing \'{}\' as unused link.'.format(path))
            remove(path)
            unused.append(p)
    db.commit()

    for p in unused:
        # Use a bound parameter instead of interpolating the path into the
        # SQL text: a path containing a double quote would break the old
        # statement (and interpolation is injection-prone).
        c.execute(
            'DELETE FROM {} WHERE path = ?'.format(self.STATE_TABLE), (p,))

    c.close()
    db.commit()
    db.close()
def install(self, cache_dir=None, force=False):
    # Install this external package under `self.path`, optionally reusing
    # `cache_dir`. With `force=False` an already-installed package is left
    # untouched.
    if self.installed and not force:
        logger.info(
            "Skipping installing '{}'('{}') as it is already "
            "installed.".format(self.name, self.url)
        )
        return

    makedirs(self.repos_dir, exist_ok=True)

    # installing package to a temporary directory until we are sure that
    # it has been installed correctly.
    #
    # Note that we can't use tempfile.TemporaryDirectory because it may
    # live on a tmpfs (via symlink), so we won't be able to use move
    # properly.
    tmp_dir = os.path.join(self.repos_dir, "." + str(shortuuid.uuid()))
    try:
        self._install_to(tmp_dir, cache_dir)
    except ExternalRepoError:
        # Clean up the partial install before propagating the error.
        if os.path.exists(tmp_dir):
            remove(tmp_dir)
        raise

    # Replace any previous installation atomically at the end.
    if self.installed:
        self.uninstall()

    shutil.move(tmp_dir, self.path)
def changed_cache_file(self, md5):
    """Return True if the cache entry for `md5` is missing or corrupted,
    removing the corrupted file along the way."""
    cache = self.get(md5)

    if not self.state.changed(cache, md5=md5):
        return False

    if os.path.exists(cache):
        # A present-but-changed file means the cache got corrupted.
        Logger.warn(
            'Corrupted cache file {}.'.format(os.path.relpath(cache)))
        remove(cache)

    return True
def _remove(repo):
    """Close the repo's git handle and delete its working directory,
    retrying on Windows."""
    repo.scm.close()

    if os.name != "nt":
        remove(repo.root_dir)
        return

    # git.exe may hang for a while not permitting to remove temp dir
    retry(5, errors=OSError, timeout=0.1)(remove)(repo.root_dir)
def uninstall(self):
    """Delete the installed package directory, if there is one."""
    if self.installed:
        remove(self.path)
        return

    logger.info(
        "Skipping uninstalling '{}' as it is not installed.".format(
            self.name
        )
    )
def _changed(self, md5):
    """Return True if the cache entry for `md5` is missing or corrupted,
    removing the corrupted file along the way."""
    cache = self.get(md5)

    if not self.state.changed(cache, md5=md5):
        return False

    if os.path.exists(cache):
        # A present-but-changed file means the cache got corrupted.
        Logger.warn('Corrupted cache file {}'.format(
            os.path.relpath(cache)))
        remove(cache)

    return True
def _make_repo(repo_url):
    """Yield a Repo for `repo_url`: local paths (or empty URLs) are opened
    in place, remote URLs are installed into a throwaway directory that is
    cleaned up afterwards."""
    is_local = not repo_url or urlparse(repo_url).scheme == ""
    if is_local:
        yield Repo(repo_url)
        return

    tmp_dir = tempfile.mkdtemp("dvc-repo")
    try:
        ext_repo = ExternalRepo(tmp_dir, url=repo_url)
        ext_repo.install()
        yield ext_repo.repo
    finally:
        remove(tmp_dir)
def gc(self, checksum_infos):
    """Delete cache files not referenced by `checksum_infos['local']`.

    Returns:
        bool: True if at least one cache file was removed.
    """
    infos = self._collect(checksum_infos['local'])[0]
    # Use a set for O(1) membership tests; the original list made the
    # `in` check below O(n) per cached md5, i.e. O(n*m) overall.
    used_md5s = {info[self.PARAM_MD5] for info in infos}

    removed = False
    for md5 in self.all():
        if md5 in used_md5s:
            continue
        remove(self.get(md5))
        removed = True

    return removed
def _safe_remove(self, file):
    """Delete `file` only after the user confirms; raise otherwise."""
    msg = ('File "{}" is going to be removed. '
           'Are you sure you want to proceed?'.format(file))

    confirmed = self.project.prompt.prompt(msg, False)
    if not confirmed:
        # Refuse to touch the file without explicit consent.
        raise DvcException(
            'Unable to remove {} without a confirmation'
            " from the user. Use '-f' to force.".format(file))

    remove(file)
def get(url, path, out=None, rev=None):
    # Download the single tracked output `path` from the DVC repo at `url`
    # (optionally at revision `rev`) into the local path `out`.
    out = resolve_output(path, out)

    # Normalize to a repo-relative path.
    path = path.lstrip("/")

    # Refuse to overwrite a .dvc stage file with data.
    if Stage.is_valid_filename(out):
        raise GetDVCFileError()

    # Creating a directory right beside the output to make sure that they
    # are on the same filesystem, so we could take the advantage of
    # reflink and/or hardlink. Not using tempfile.TemporaryDirectory
    # because it will create a symlink to tmpfs, which defeats the purpose
    # and won't work with reflink/hardlink.
    dpath = os.path.dirname(os.path.abspath(out))
    tmp_dir = os.path.join(dpath, "." + str(shortuuid.uuid()))
    try:
        with external_repo(cache_dir=tmp_dir, url=url, rev=rev) as repo:
            # Note: we need to replace state, because in case of getting DVC
            # dependency on CIFS or NFS filesystems, sqlite-based state
            # will be unable to obtain lock
            repo.state = StateNoop()

            # Try any links possible to avoid data duplication.
            #
            # Not using symlink, because we need to remove cache after we are
            # done, and to make that work we would have to copy data over
            # anyway before removing the cache, so we might just copy it
            # right away.
            #
            # Also, we can't use theoretical "move" link type here, because
            # the same cache file might be used a few times in a directory.
            repo.config.set(
                Config.SECTION_CACHE,
                Config.SECTION_CACHE_TYPE,
                "reflink,hardlink,copy",
            )

            o = repo.find_out_by_relpath(path)
            with repo.state:
                # Pull only the cache entries this output needs.
                repo.cloud.pull(o.get_used_cache())
            # Retarget the output to the requested destination and check it
            # out there from the temporary cache.
            o.path_info = PathInfo(os.path.abspath(out))
            with o.repo.state:
                o.checkout()
    except NotDvcRepoError:
        # Re-raise with the URL so the caller sees a helpful message.
        raise UrlNotDvcRepoError(url)
    except OutputNotFoundError:
        raise OutputNotFoundError(path)
    finally:
        # Always drop the temporary cache dir, even on failure.
        remove(tmp_dir)
def init(root_dir=os.curdir, no_scm=False, force=False):
    """
    Creates an empty repo on the given directory -- basically a
    `.dvc` directory with subdirectories for configuration and cache.

    It should be tracked by a SCM or use the `--no-scm` flag.

    If the given directory is not empty, you must use the `--force`
    flag to override it.

    Args:
        root_dir: Path to repo's root directory.
        no_scm: Allow initializing without any SCM tracking.
        force: Re-initialize even if a `.dvc` directory already exists.

    Returns:
        Repo instance.

    Raises:
        InitError: If the directory is not tracked by a supported SCM
            (and `no_scm` is False), or if `.dvc` already exists and
            `force` is False.
    """
    root_dir = os.path.realpath(root_dir)
    dvc_dir = os.path.join(root_dir, Repo.DVC_DIR)
    scm = SCM(root_dir)
    if isinstance(scm, NoSCM) and not no_scm:
        raise InitError(
            "{repo} is not tracked by any supported scm tool (e.g. git). "
            "Use '--no-scm' if you don't want to use any scm.".format(
                repo=root_dir))

    if os.path.isdir(dvc_dir):
        if not force:
            raise InitError("'{repo}' exists. Use '-f' to force.".format(
                repo=relpath(dvc_dir)))
        # Forced re-init: wipe the previous .dvc directory first.
        remove(dvc_dir)

    os.mkdir(dvc_dir)

    config = Config.init(dvc_dir)
    proj = Repo(root_dir)

    # Stage the new config (and scm ignore file) so the user can commit.
    scm.add([config.config_file])
    if scm.ignore_file:
        scm.add([os.path.join(dvc_dir, scm.ignore_file)])

    logger.info("\nYou can now commit the changes to git.\n")

    _welcome_message()

    return proj
def _unprotect_file(path):
    """Replace a sym/hardlinked `path` with an independent writable copy."""
    if System.is_symlink(path) or System.is_hardlink(path):
        logger.debug("Unprotecting '{}'".format(path))

        tmp = os.path.join(os.path.dirname(path), "." + str(uuid.uuid4()))

        # Copy first, replace after: the previous move-then-copy-back order
        # left a window where a concurrent reader of `path` would see a
        # missing or partially-written file. Copying to a temporary name
        # and then renaming it over the original keeps `path` complete at
        # all times.
        copyfile(path, tmp)
        remove(path)
        os.rename(tmp, path)
    else:
        logger.debug("Skipping copying for '{}', since it is not "
                     "a symlink or a hardlink.".format(path))

    os.chmod(path, os.stat(path).st_mode | stat.S_IWRITE)
def _do_remove_all(self):
    """Delete every tracked link that still matches its recorded state,
    then clear the in-memory DB."""
    for relpath, raw_entry in self._db.items():
        path = os.path.join(self.root_dir, relpath)
        entry = LinkStateEntry.loadd(raw_entry)

        if not os.path.exists(path):
            continue

        inode = self.inode(path)
        mtime = self.mtime(path)

        # Only remove the file if it is still exactly the link we created.
        if inode == entry.inode and mtime == entry.mtime:
            remove(path)

    self._db = {}
def _discard_working_directory_changes(self, path, dir_info, force=False):
    """Delete files under `path` that are not part of the cached dir
    described by `dir_info`, prompting unless forced or already cached."""
    on_disk = {
        os.path.join(root, fname)
        for root, _, files in os.walk(path)
        for fname in files
    }
    expected = {
        os.path.join(path, entry['relpath']) for entry in dir_info
    }

    for extra in on_disk - expected:
        if force or self._already_cached(extra):
            remove(extra)
        else:
            # Unknown, possibly unsaved data: ask before deleting.
            self._safe_remove(extra)
def _unprotect_file(self, path):
    """Replace the (possibly linked) file at `path` with a writable copy."""
    import stat
    import uuid

    from dvc.utils import copyfile, remove

    self.logger.debug("Unprotecting '{}'".format(path))

    tmp = os.path.join(os.path.dirname(path), '.' + str(uuid.uuid4()))

    # Copy first, replace after: the previous move-then-copy-back order
    # left a window where a concurrent reader of `path` would see a
    # missing or partially-written file. Copying to a temporary name and
    # then renaming it over the original keeps `path` complete at all
    # times.
    copyfile(path, tmp)
    remove(path)
    os.rename(tmp, path)

    os.chmod(path, os.stat(path).st_mode | stat.S_IWRITE)
def _save_file(self, path_info):
    """Move (or discard) a workspace file into the cache and link it back,
    returning its checksum info."""
    path = path_info['path']

    md5 = self.state.update(path)
    assert md5 is not None

    cache = self.get(md5)

    if self.changed_cache(md5):
        # Cache entry missing/stale: adopt the workspace file.
        self._move(path, cache)
    else:
        # Cache already holds this content: drop the workspace copy.
        remove(path)

    self.link(cache, path)
    self.state.update_link(path)

    return {self.PARAM_MD5: md5}
def checkout(self, path_info, checksum_info):
    """Restore `path` from cache, skipping work when it is up to date."""
    path = path_info['path']
    md5 = checksum_info.get(self.PARAM_MD5, None)
    cache = self.get(md5)

    if not cache:
        Logger.warn('No cache info for \'{}\'. Skipping checkout.'.format(
            os.path.relpath(path)))
        return

    if not self.changed(path_info, checksum_info):
        Logger.info("Data '{}' didn't change.".format(
            os.path.relpath(path)))
        return

    if self.changed_cache(md5):
        Logger.warn(
            u'Cache \'{}\' not found. File \'{}\' won\'t be created.'.format(
                md5, os.path.relpath(path)))
        remove(path)
        return

    if os.path.exists(path):
        Logger.warn(u'Data \'{}\' exists. Removing before checkout.'.format(
            os.path.relpath(path)))
        remove(path)

    Logger.info(u'Checking out \'{}\' with cache \'{}\'.'.format(
        os.path.relpath(path), md5))

    if not self.is_dir_cache(cache):
        self.link(cache, path)
        self.state.update_link(path)
        return

    # Create dir separately so that dir is created
    # even if there are no files in it
    if not os.path.exists(path):
        os.makedirs(path)

    for entry in self.load_dir_cache(md5):
        entry_cache = self.get(entry[self.PARAM_MD5])
        target = os.path.join(path, entry[self.PARAM_RELPATH])
        self.link(entry_cache, target)

    self.state.update_link(path)
def tearDown(self):
    """Leave the test directory and delete it, tolerating the Windows
    sharing-violation quirk."""
    self._popd()
    try:
        remove(self._root_dir)
    except OSError as exc:
        # We ignore this under Windows with a warning because it happened
        # to be really hard to trace all not properly closed files.
        #
        # Best guess so far is that gitpython is the culprit:
        # it opens files and uses __del__ to close them, which can happen
        # late in current pythons. TestGitFixture and TestDvcFixture try
        # to close that and it works on most of the tests, but not all.
        # Repos and thus git repos are created all over the dvc ;)
        ignorable = os.name == "nt" and exc.winerror == 32
        if not ignorable:
            raise
        warnings.warn("Failed to remove test dir: " + str(exc))
def uninstall(self):
    """Remove the installed repo directory, closing its git handle first
    and retrying on Windows."""
    if not self.installed:
        logger.info(
            "Skipping uninstalling '{}' as it is not installed.".format(
                self.name))
        return

    # If repo has been initialized then we need to close its git repo
    if "repo" in self.__dict__:
        self.repo.scm.git.close()

    if os.name != "nt":
        remove(self.path)
        return

    # git.exe may hang for a while not permitting to remove temp dir
    retry(5, errors=OSError, timeout=0.1)(remove)(self.path)