def run(self):
    """CLI entry point for `dvc gc`: warn, confirm with the user, collect garbage.

    Returns:
        int: 0 on success, 1 if the user declined the confirmation prompt.
    """
    msg = 'this will remove all cache except the cache that is used in '
    if not self.args.all_branches and not self.args.all_tags:
        msg += 'the current git branch'
    elif self.args.all_branches and not self.args.all_tags:
        msg += 'all git branches'
    elif not self.args.all_branches and self.args.all_tags:
        msg += 'all git tags'
    else:
        msg += 'all git branches and all git tags'

    # NOTE: `projects` may be None or an empty list; both mean "current
    # project only", so a plain truthiness test replaces the original
    # `is not None and len(...) > 0`.
    if self.args.projects:
        msg += ' of the current and the following projects:'
        for project_path in self.args.projects:
            msg += '\n - %s' % os.path.abspath(project_path)
    else:
        msg += ' of the current project.'

    logger.warning(msg)

    msg = 'Are you sure you want to proceed?'
    if not self.args.force and not prompt.confirm(msg):
        return 1

    self.project.gc(all_branches=self.args.all_branches,
                    all_tags=self.args.all_tags,
                    cloud=self.args.cloud,
                    remote=self.args.remote,
                    force=self.args.force,
                    jobs=self.args.jobs,
                    projects=self.args.projects)
    return 0
def default_targets(self):
    """Default targets for `dvc repro` and `dvc pipeline`."""
    from dvc.stage import Stage

    logger.warning("assuming default target '{}'.".format(Stage.STAGE_FILE))
    return [Stage.STAGE_FILE]
def _collect_used_cache(self, out, branch=None, remote=None, force=False, jobs=None): if not out.use_cache or not out.info: if not out.info: logger.warning("Output '{}'({}) is missing version " "info. Cache for it will not be collected. " "Use dvc repro to get your pipeline up to " "date.".format(out, out.stage)) return [] info = out.dumpd() info["branch"] = branch ret = [info] if out.scheme != "local": return ret md5 = info[out.remote.PARAM_CHECKSUM] cache = self.cache.local.get(md5) if not out.remote.is_dir_cache(cache): return ret return self._collect_dir_cache(out, branch=branch, remote=remote, force=force, jobs=jobs)
def supported(cls, config):
    """Return True when the remote URL matches this scheme AND all of the
    optional dependencies it needs are importable; warn about missing ones."""
    url = config[Config.SECTION_REMOTE_URL]

    # URL doesn't match this remote's scheme: unsupported, and no point
    # complaining about dependencies.
    if cls.match(url) is None:
        return False

    # All required modules imported fine: fully supported.
    if all(cls.REQUIRES.values()):
        return True

    missing = [k for k, v in cls.REQUIRES.items() if v is None]
    logger.warning(
        "URL '{}' is supported but requires these missing "
        "dependencies: {}. If you have installed dvc using pip, "
        "choose one of these options to proceed: \n"
        "\n"
        " 1) Install specific missing dependencies:\n"
        " pip install {}\n"
        " 2) Install dvc package that includes those missing "
        "dependencies: \n"
        " pip install dvc[{}]\n"
        " 3) Install dvc package with all possible "
        "dependencies included: \n"
        " pip install dvc[all]\n"
        "\n"
        "If you have installed dvc from a binary package and you "
        "are still seeing this message, please report it to us "
        "using https://github.com/iterative/dvc/issues. Thank you!".
        format(url, missing, " ".join(missing), cls.scheme))
    return False
def _get_diff_outs(self, diff_dct):
    """Pair up outputs from the two diffed trees, keyed by output path.

    Outputs backed by non-local cache are skipped and reported with a
    single warning at the end.
    """
    self.tree = diff_dct[DIFF_A_TREE]
    a_outs = {str(out): out for st in self.stages() for out in st.outs}

    self.tree = diff_dct[DIFF_B_TREE]
    b_outs = {str(out): out for st in self.stages() for out in st.outs}

    results = {}
    non_local_cache = []
    for path in set(a_outs) | set(b_outs):
        a_out = a_outs.get(path)
        b_out = b_outs.get(path)
        # NOTE: both checks are evaluated unconditionally so that every
        # non-local output ends up in `non_local_cache`.
        skip_a = _check_local_cache(a_out, non_local_cache)
        skip_b = _check_local_cache(b_out, non_local_cache)

        # skip files/directories with non-local cache for now
        if skip_a or skip_b:
            continue

        results[path] = {
            DIFF_A_OUTPUT: a_out,
            DIFF_B_OUTPUT: b_out,
            DIFF_IS_NEW: path not in a_outs,
            DIFF_DELETED: path not in b_outs,
            DIFF_IS_DIR: _is_dir(path, a_outs, b_outs),
        }

    if non_local_cache:
        logger.warning(
            "Diff is not supported for non-local outputs. Ignoring: {}".format(
                non_local_cache))

    return results
def run(self):
    """CLI entry point for `dvc gc`: warn, confirm with the user, collect garbage.

    Returns:
        int: 0 on success, 1 if the user declined the confirmation prompt.
    """
    msg = "this will remove all cache except the cache that is used in "
    if not self.args.all_branches and not self.args.all_tags:
        msg += "the current git branch"
    elif self.args.all_branches and not self.args.all_tags:
        msg += "all git branches"
    elif not self.args.all_branches and self.args.all_tags:
        msg += "all git tags"
    else:
        msg += "all git branches and all git tags"

    # NOTE: `repos` may be None or an empty list; both mean "current repo
    # only", so a plain truthiness test replaces the original
    # `is not None and len(...) > 0`.
    if self.args.repos:
        msg += " of the current and the following repos:"
        for repo_path in self.args.repos:
            msg += "\n - %s" % os.path.abspath(repo_path)
    else:
        msg += " of the current repo."

    logger.warning(msg)

    msg = "Are you sure you want to proceed?"
    if not self.args.force and not prompt.confirm(msg):
        return 1

    self.repo.gc(
        all_branches=self.args.all_branches,
        all_tags=self.args.all_tags,
        cloud=self.args.cloud,
        remote=self.args.remote,
        force=self.args.force,
        jobs=self.args.jobs,
        repos=self.args.repos,
    )
    return 0
def _init_cloud(self, cloud_config, cloud_type):
    """Instantiate `cloud_type` for this repo, warning on the legacy
    `storagepath` config format."""
    if self._core.get(Config.SECTION_CORE_STORAGEPATH):
        logger.warning("using obsoleted config format. Consider updating.")

    return cloud_type(self.repo, cloud_config)
def save(self):
    """Validate this output's path and save it into the local cache.

    Raises:
        DoesNotExistError: the path is missing on disk.
        IsNotFileOrDirError: the path is neither a file nor a directory.
        OutputAlreadyTrackedError: the path is already tracked by git.
    """
    if not self.use_cache:
        super(OutputLOCAL, self).save()
        self._verify_metric()
        msg = "Output '{}' doesn't use cache. Skipping saving."
        logger.info(msg.format(self.rel_path))
        return

    if not os.path.exists(self.path):
        raise self.DoesNotExistError(self.rel_path)

    if (not os.path.isfile(self.path)
            and not os.path.isdir(self.path)):
        raise self.IsNotFileOrDirError(self.rel_path)

    if (os.path.isfile(self.path) and os.path.getsize(self.path) == 0) or \
       (os.path.isdir(self.path) and len(os.listdir(self.path)) == 0):
        msg = "file/directory '{}' is empty.".format(self.rel_path)
        logger.warning(msg)

    if not self.changed():
        msg = "Output '{}' didn't change. Skipping saving."
        logger.info(msg.format(self.rel_path))
        return

    if self.is_local:
        if self.project.scm.is_tracked(self.path):
            raise OutputAlreadyTrackedError(self.rel_path)

        # NOTE: `use_cache` is necessarily true here (early return above),
        # so the original redundant `if self.use_cache:` guard was removed.
        self.project.scm.ignore(self.path)

    self.info = self.project.cache.local.save(self.path_info)
def checkout(self, path_info, checksum_info, force=False):
    """Check `path_info` out of cache using the checksum in `checksum_info`."""
    scheme = path_info["scheme"]
    if scheme not in ["", "local"] and scheme != self.scheme:
        raise NotImplementedError

    checksum = checksum_info.get(self.PARAM_CHECKSUM)
    if not checksum:
        logger.info("No checksum info for '{}'.".format(str(path_info)))
        return

    if not self.changed(path_info, checksum_info):
        logger.info("Data '{}' didn't change.".format(str(path_info)))
        return

    if self.changed_cache(checksum):
        logger.warning(
            "Cache '{}' not found. File '{}' won't be created.".format(
                checksum, str(path_info)))
        self.safe_remove(path_info, force=force)
        return

    logger.info("Checking out '{}' with cache '{}'.".format(
        str(path_info), checksum))

    self.do_checkout(path_info, checksum, force=force)
def test_warning(self, caplog):
    """A warning record is rendered with a colored WARNING prefix."""
    with caplog.at_level(logging.INFO, logger="dvc"):
        logger.warning("message")

        want = "{yellow}WARNING{nc}: message".format(**colors)
        assert formatter.format(caplog.records[0]) == want
def checkout(self, target=None, with_deps=False, force=False, recursive=False):
    """Check out tracked data for `target` (or the whole repo) from cache."""
    from dvc.stage import StageFileDoesNotExistError, StageFileBadNameError

    known_stages = self.stages()
    try:
        selected = self.collect(
            target, with_deps=with_deps, recursive=recursive
        )
    except (StageFileDoesNotExistError, StageFileBadNameError) as exc:
        # Without an explicit target the failure is not about a user typo.
        if not target:
            raise
        raise CheckoutErrorSuggestGit(target, exc)

    with self.state:
        _cleanup_unused_links(self, known_stages)
        callback = get_progress_callback(selected)
        for stage in selected:
            if stage.locked:
                logger.warning(
                    "DVC file '{path}' is locked. Its dependencies are"
                    " not going to be checked out.".format(path=stage.relpath)
                )
            stage.checkout(force=force, progress_callback=callback)
        if callback:
            callback.finish("Checkout finished!")
def upload(self, from_infos, to_infos, names=None):
    """Upload local files to Azure blob storage, one by one (best effort:
    failures are logged and the remaining files are still attempted)."""
    names = self._verify_path_args(to_infos, from_infos, names)

    for from_info, to_info, name in zip(from_infos, to_infos, names):
        if to_info['scheme'] != self.scheme:
            raise NotImplementedError

        if from_info['scheme'] != 'local':
            raise NotImplementedError

        bucket = to_info['bucket']
        path = to_info['path']
        logger.debug("Uploading '{}' to '{}/{}'".format(
            from_info['path'], bucket, path))

        if not name:
            name = os.path.basename(from_info['path'])

        cb = Callback(name)
        try:
            self.blob_service.create_blob_from_path(
                bucket, path, from_info['path'], progress_callback=cb)
        except Exception:
            # best effort: warn and continue with the remaining files
            logger.warning("failed to upload '{}'".format(from_info['path']))
        else:
            progress.finish_target(name)
def checkout(self, target=None, with_deps=False, force=False, recursive=False):
    """Check out data files, optionally restricted to `target`'s stages."""
    if target and not recursive:
        from dvc.stage import (
            StageFileDoesNotExistError,
            StageFileBadNameError,
        )

        known = self.active_stages()
        try:
            to_checkout = self._collect(target, with_deps=with_deps)
        except (StageFileDoesNotExistError, StageFileBadNameError) as exc:
            raise DvcException(
                str(exc) + " Did you mean 'git checkout {}'?".format(target))
    else:
        known = self.active_stages(target)
        to_checkout = known

    with self.state:
        self._cleanup_unused_links(known)

        for stage in to_checkout:
            if stage.locked:
                logger.warning(
                    "DVC file '{path}' is locked. Its dependencies are"
                    " not going to be checked out.".format(
                        path=stage.relpath))

            stage.checkout(force=force)
def save(self):
    """Validate the path on disk and record its checksum info.

    Non-cached outputs save their info immediately and (unless they are
    dependencies) stop there; cached outputs additionally get git-ignored
    and re-saved only when changed.
    """
    if not os.path.exists(self.path):
        raise self.DoesNotExistError(self.rel_path)

    if not os.path.isfile(self.path) and not os.path.isdir(self.path):
        raise self.IsNotFileOrDirError(self.rel_path)

    is_empty_file = (
        os.path.isfile(self.path) and os.path.getsize(self.path) == 0
    )
    is_empty_dir = (
        os.path.isdir(self.path) and len(os.listdir(self.path)) == 0
    )
    if is_empty_file or is_empty_dir:
        logger.warning("file/directory '{}' is empty.".format(self.rel_path))

    if not self.use_cache:
        self.info = self.remote.save_info(self.path_info)
        self.verify_metric()
        if not self.IS_DEPENDENCY:
            logger.info(
                "Output '{}' doesn't use cache. Skipping saving.".format(
                    self.rel_path))
            return

    # NOTE(review): a non-cached dependency falls through to this assert
    # (same as the original control flow) — confirm that is intended.
    assert not self.IS_DEPENDENCY

    if not self.changed():
        logger.info(
            "Output '{}' didn't change. Skipping saving.".format(
                self.rel_path))
        return

    if self.is_local:
        if self.repo.scm.is_tracked(self.path):
            raise OutputAlreadyTrackedError(self.rel_path)

        if self.use_cache:
            self.repo.scm.ignore(self.path)

    self.info = self.remote.save_info(self.path_info)
def _changed_outs(self): for out in self.outs: if out.changed(): logger.warning("Output '{out}' of '{stage}' changed.".format( out=out, stage=self.relpath)) return True return False
def do_checkout(self, path_info, checksum, force=False):
    """Materialize the cache file for `checksum` at `path_info`, removing
    any existing data there first."""
    if self.exists(path_info):
        logger.warning(
            "data '{}' exists. Removing before checkout.".format(
                str(path_info)))
        self.safe_remove(path_info, force=force)

    self.copy(self.checksum_to_path_info(checksum), path_info)
def test_logging_debug_with_datetime(caplog, dt):
    """Every record formatted at DEBUG level carries the datetime stamp."""
    with caplog.at_level(logging.DEBUG, logger="dvc"):
        for emit in (logger.warning, logger.debug, logger.error):
            # message text mirrors the level name, e.g. "WARNING"
            emit(emit.__name__.upper())

        for record in caplog.records:
            assert dt in formatter.format(record)
            assert record.levelname == record.message
def _changed_outs(self): for out in self.outs: if not out.changed(): continue msg = "Output '{}' of '{}' changed.".format(out, self.relpath) logger.warning(msg) return True return False
def changed_cache_file(self, md5):
    """Return True if the cache file for `md5` is missing or corrupted.

    A corrupted cache file that still exists on disk is removed as a
    side effect.
    """
    cache = self.get(md5)
    if not self.state.changed(cache, md5=md5):
        return False

    if os.path.exists(cache):
        logger.warning(
            "Corrupted cache file {}.".format(os.path.relpath(cache)))
        remove(cache)

    return True
def _warn_if_fish(self, executable): # pragma: no cover if (executable is None or os.path.basename(os.path.realpath(executable)) != "fish"): return logger.warning( "DVC detected that you are using fish as your default " "shell. Be aware that it might cause problems by overwriting " "your current environment variables with values defined " "in '.fishrc', which might affect your command. See " "https://github.com/iterative/dvc/issues/1307. ")
def do_checkout(self, output, force=False, progress_callback=None):
    """Copy `output`'s cache file into place, removing any existing data at
    the destination first."""
    path_info = output.path_info
    checksum = output.info.get(self.PARAM_CHECKSUM)

    if self.exists(path_info):
        logger.warning(
            "data '{}' exists. Removing before checkout.".format(
                str(path_info)))
        self.safe_remove(path_info, force=force)

    self.copy(self.checksum_to_path_info(checksum), path_info)
def remove(self, tag, target=None, with_deps=False, recursive=False):
    """Remove `tag` from every output of the collected stages.

    A stage file is re-dumped only when at least one of its outputs
    actually carried the tag; outputs without the tag are warned about.
    """
    stages = self.collect(target, with_deps=with_deps, recursive=recursive)
    for stage in stages:
        changed = False
        for out in stage.outs:
            # membership test on the dict directly — no need for `.keys()`
            if tag not in out.tags:
                logger.warning("tag '{}' not found for '{}'".format(tag, out))
                continue
            del out.tags[tag]
            changed = True
        if changed:
            stage.dump()
def add(self, tag, target=None, with_deps=False, recursive=False):
    """Tag the current checksum info of every collected output with `tag`.

    Outputs lacking checksum info are skipped with a warning; stage files
    are re-dumped only when something was tagged.
    """
    for stage in self.collect(target, with_deps=with_deps,
                              recursive=recursive):
        dirty = False
        for out in stage.outs:
            if not out.info:
                logger.warning("missing checksum info for '{}'".format(out))
                continue
            out.tags[tag] = out.info.copy()
            dirty = True
        if dirty:
            stage.dump()
def _cloud(self):
    """Resolve the configured cloud, preferring `core.remote` over the
    legacy `core.cloud` setting; return None when neither is set."""
    remote = self._core.get(Config.SECTION_CORE_REMOTE, "")
    if remote != "":
        return self._init_remote(remote)

    if self._core.get(Config.SECTION_CORE_CLOUD, None):
        # backward compatibility
        logger.warning("using obsoleted config format. Consider updating.")
        return self._init_compat()

    return None
def used_cache(
    self,
    target=None,
    all_branches=False,
    active=True,
    with_deps=False,
    all_tags=False,
    remote=None,
    force=False,
    jobs=None,
    recursive=False,
):
    """Collect the cache info used by tracked outputs, grouped by scheme.

    Iterates every branch/tag requested via `scm.brancher`, selects the
    relevant stages (by `target`, or the active stages, or all stages),
    and accumulates per-scheme lists of cache-info dicts produced by
    `_collect_used_cache`.

    Returns:
        dict: scheme name ('local', 's3', 'gs', 'hdfs', 'ssh', 'azure')
        mapped to a list of cache-info dicts.
    """
    # One bucket per supported remote scheme.
    cache = {}
    cache["local"] = []
    cache["s3"] = []
    cache["gs"] = []
    cache["hdfs"] = []
    cache["ssh"] = []
    cache["azure"] = []

    for branch in self.scm.brancher(
        all_branches=all_branches, all_tags=all_tags
    ):
        # Stage selection: explicit target (recursively or with deps),
        # otherwise active stages, otherwise everything.
        if target:
            if recursive:
                stages = self.stages(target)
            else:
                stages = self.collect(target, with_deps=with_deps)
        elif active:
            stages = self.active_stages()
        else:
            stages = self.stages()

        for stage in stages:
            if active and not target and stage.locked:
                # NOTE(review): outs of locked stages are still collected
                # below — the warning speaks about their *dependencies*;
                # confirm that is the intended behavior.
                logger.warning(
                    "DVC file '{path}' is locked. Its dependencies are"
                    " not going to be pushed/pulled/fetched.".format(
                        path=stage.relpath
                    )
                )
            for out in stage.outs:
                scheme = out.path_info["scheme"]
                cache[scheme] += self._collect_used_cache(
                    out,
                    branch=branch,
                    remote=remote,
                    force=force,
                    jobs=jobs,
                )

    return cache
def _changed_outs(self): for out in self.outs: status = out.status() if status: logger.warning( "Output '{out}' of '{stage}' changed because it is " "'{status}'".format(out=out, stage=self.relpath, status=status[str(out)])) return True return False
def dump(self):
    """Saves state database."""
    assert self.database is not None

    # Read the persisted row count and add rows inserted this session.
    cmd = "SELECT count from {} WHERE rowid={}"
    self._execute(cmd.format(self.STATE_INFO_TABLE, self.STATE_INFO_ROW))
    ret = self._fetchall()
    assert len(ret) == 1
    assert len(ret[0]) == 1
    count = self._from_sqlite(ret[0][0]) + self.inserts

    if count > self.row_limit:
        msg = "cleaning up state, this might take a while."
        logger.warning(msg)

        # Delete enough of the oldest rows (by timestamp) to get back under
        # the limit, plus an extra cleanup quota so this doesn't trigger on
        # every dump.
        delete = count - self.row_limit
        delete += int(self.row_limit * (self.row_cleanup_quota / 100.0))
        cmd = (
            "DELETE FROM {} WHERE timestamp IN ("
            "SELECT timestamp FROM {} ORDER BY timestamp ASC LIMIT {});"
        )
        self._execute(
            cmd.format(self.STATE_TABLE, self.STATE_TABLE, delete)
        )

        self._vacuum()

        # Re-count after deletion rather than trusting the arithmetic above.
        cmd = "SELECT COUNT(*) FROM {}"
        self._execute(cmd.format(self.STATE_TABLE))
        ret = self._fetchall()
        assert len(ret) == 1
        assert len(ret[0]) == 1
        count = ret[0][0]

    # Persist the (possibly updated) row count back into the info table.
    cmd = "UPDATE {} SET count = {} WHERE rowid = {}"
    self._execute(
        cmd.format(
            self.STATE_INFO_TABLE,
            self._to_sqlite(count),
            self.STATE_INFO_ROW,
        )
    )

    self._update_cache_directory_state()

    # Commit and fully release the connection; counters reset for the
    # next session.
    self.database.commit()
    self.cursor.close()
    self.database.close()
    self.database = None
    self.cursor = None
    self.inserts = 0
def _log_missing_caches(self, checksum_info_dict): missing_caches = [(md5, info) for md5, info in checksum_info_dict.items() if info["status"] == STATUS_MISSING] if missing_caches: missing_desc = "".join([ "\nname: {}, md5: {}".format(info["name"], md5) for md5, info in missing_caches ]) msg = ( "Some of the cache files do not exist neither locally " "nor on remote. Missing cache files: {}".format(missing_desc)) logger.warning(msg)
def persist_to(self, module_dir, parent_repo):
    """Move this temp repo's cache into `parent_repo`, relocate the clone to
    `module_dir`, and reset internal state."""
    if not self.is_state_set:
        raise TempRepoException(self, "cannot persist")

    cache_dir = self.repo.cache.local.url
    for prefix in os.listdir(cache_dir):
        # cache layout uses two-character checksum-prefix directories;
        # anything else is unexpected but still processed
        if len(prefix) != 2:
            logger.warning("wrong dir format in cache {}: dir {}".format(
                cache_dir, prefix))
        self._move_all_cache_files(parent_repo, prefix, cache_dir)

    shutil.move(self._cloned_tmp_dir, module_dir)
    self._reset_state()
def save(self):
    """Validate the path on disk and snapshot its checksum info from the
    remote, warning when the file/directory is empty."""
    if not self.exists:
        raise self.DoesNotExistError(self.rel_path)

    if not os.path.isfile(self.path) \
            and not os.path.isdir(self.path):  # pragma: no cover
        raise self.IsNotFileOrDirError(self.rel_path)

    is_empty_file = (
        os.path.isfile(self.path) and os.path.getsize(self.path) == 0
    )
    is_empty_dir = (
        os.path.isdir(self.path) and len(os.listdir(self.path)) == 0
    )
    if is_empty_file or is_empty_dir:
        logger.warning("file/directory '{}' is empty.".format(self.rel_path))

    self.info = self.remote.save_info(self.path_info)