def collect(
    self,
    targets: List[str] = None,
    revs: List[str] = None,
    recursive: bool = False,
    onerror: Optional[Callable] = None,
    props: Optional[Dict] = None,
) -> Generator[Dict, None, None]:
    """Yield plot props and raw data, one revision at a time.

    Each yielded item has the shape:

        {rev: {plots.csv: {
            props: {x: ..., "header": ..., ...},
            data: "unstructured data (as stored for given extension)",
        }}}
    """
    from dvc.utils.collections import ensure_list

    target_list = ensure_list(targets)
    for branch_rev in self.repo.brancher(revs=revs):
        # brancher() always includes the workspace; skip it when the
        # caller asked for an explicit set of revisions.
        if revs is not None and branch_rev not in revs:
            continue
        name = branch_rev or "workspace"
        collected = self._collect_from_revision(
            revision=name,
            targets=target_list,
            recursive=recursive,
            onerror=onerror,
            props=props,
        )
        yield {name: collected}
def show(
    self,
    targets: Optional[List[str]] = None,
    revs=None,
    props=None,
    templates=None,
    recursive=False,
):
    """Collect and render plots for the given targets across revisions.

    Args:
        targets: plot paths to show; None means all discovered plots.
        revs: revisions to collect from; None means the workspace.
        props: plot property overrides passed through to rendering.
        templates: plot templates; falls back to ``self.templates``.
        recursive: recurse into directory targets.

    Raises:
        MetricDoesNotExistError: if an explicitly requested target has no
            data in any collected revision.
        NoMetricsFoundError: if nothing was collected at all.
    """
    from dvc.utils.collections import ensure_list

    data = self.collect(targets, revs, recursive)

    # If any mentioned plot doesn't have any data then that's an error.
    for target in ensure_list(targets):
        rpath = relpath(target, self.repo.root_dir)
        # Only the keys matter here; the previous version iterated
        # rev_data.items() and discarded the values.
        if not any(
            "data" in rev_data[key]
            for rev_data in data.values()
            for key in rev_data
            if rpath in key
        ):
            raise MetricDoesNotExistError([target])

    # No data at all is a special error with a special message.
    if not data:
        raise NoMetricsFoundError("plots", "--plots/--plots-no-cache")

    if templates is None:
        templates = self.templates
    return self.render(data, revs, props, templates)
def used_objs(
    self,
    targets: "TargetType" = None,
    with_deps: bool = False,
    remote: str = None,
    force: bool = False,
    recursive: bool = False,
    jobs: int = None,
) -> "ObjectContainer":
    """Map each object database to the set of objects the targets use."""
    from collections import defaultdict
    from itertools import chain

    from dvc.utils.collections import ensure_list

    used: "ObjectContainer" = defaultdict(set)

    # No explicit targets means "collect everything": a single None target.
    wanted: Sequence[Optional[str]] = (
        ensure_list(targets) if targets else (None,)
    )

    stage_pairs = chain.from_iterable(
        self.stage_collector.collect_granular(
            tgt, recursive=recursive, with_deps=with_deps
        )
        for tgt in wanted
    )
    for stage, filter_info in stage_pairs:
        per_stage = stage.get_used_objs(
            remote=remote, force=force, jobs=jobs, filter_info=filter_info
        )
        for odb, objs in per_stage.items():
            used[odb].update(objs)
    return used
def collect(
    self,
    targets: List[str] = None,
    revs: List[str] = None,
    recursive: bool = False,
) -> Dict[str, Dict]:
    """Collects all props and data for plots.

    Returns a structure like:
        {rev: {plots.csv: {
            props: {x: ..., "header": ..., ...},
            data: "...data as a string...",
        }}}
    Data parsing is postponed, since it's affected by props.
    """
    from dvc.fs.repo import RepoFileSystem
    from dvc.utils.collections import ensure_list

    targets = ensure_list(targets)
    data: Dict[str, Dict] = {}
    for rev in self.repo.brancher(revs=revs):
        # .brancher() adds unwanted workspace
        if revs is not None and rev not in revs:
            continue
        rev = rev or "workspace"

        # Fresh filesystem per iteration: brancher() switches the checked-out
        # revision, so the view of the repo changes each time around the loop.
        fs = RepoFileSystem(self.repo)
        plots = _collect_plots(self.repo, targets, rev, recursive)
        for path_info, props in plots.items():
            if rev not in data:
                data[rev] = {}
            # A directory plot target expands to every file underneath it;
            # each entry pairs the fs path with its repo-relative path.
            if fs.isdir(path_info):
                plot_files = []
                for pi in fs.walk_files(path_info):
                    plot_files.append(
                        (pi, relpath(pi, self.repo.root_dir))
                    )
            else:
                plot_files = [
                    (path_info, relpath(path_info, self.repo.root_dir))
                ]

            for path, repo_path in plot_files:
                # Record props even if the data itself turns out to be
                # unavailable below.
                data[rev].update({repo_path: {"props": props}})

                # Load data from git or dvc cache
                try:
                    with fs.open(path) as fd:
                        data[rev][repo_path]["data"] = fd.read()
                except FileNotFoundError:
                    # This might happen simply because cache is absent
                    pass

    return data
def collect(
    self,
    targets: Optional[List[str]] = None,
    revs: Optional[List[str]] = None,
    recursive: bool = False,
    onerror: Optional[Callable] = None,
    props: Optional[Dict] = None,
) -> Dict[str, Dict]:
    """Collects all props and data for plots.

    Returns a structure like:
        {rev: {plots.csv: {
            props: {x: ..., "header": ..., ...},
            data: "unstructured data (as stored for given extension)",
        }}}

    Args:
        targets: plot paths to collect; None means all discovered plots.
        revs: revisions to collect from; None means the workspace only.
        recursive: recurse into directory targets.
        onerror: callback invoked on per-revision collection failures.
        props: plot property overrides.
    """
    from dvc.utils.collections import ensure_list

    # NOTE: annotations made explicitly Optional (PEP 484) — the runtime
    # defaults are unchanged.
    targets = ensure_list(targets)
    data: Dict[str, Dict] = {}
    for rev in self.repo.brancher(revs=revs):
        # .brancher() adds unwanted workspace
        if revs is not None and rev not in revs:
            continue
        rev = rev or "workspace"
        data[rev] = self._collect_from_revision(
            revision=rev,
            targets=targets,
            recursive=recursive,
            onerror=onerror,
            props=props,
        )

    errored = errored_revisions(data)
    if errored:
        from dvc.ui import ui

        ui.error_write(
            "DVC failed to load some plots for following revisions: "
            f"'{', '.join(errored)}'."
        )

    return data
def add(  # noqa: C901
    repo,
    targets: "TargetType",
    recursive=False,
    no_commit=False,
    fname=None,
    to_remote=False,
    **kwargs,
):
    """Track the given targets with DVC, creating a .dvc stage per target.

    Returns the list of created stages. Validates mutually exclusive
    option combinations, shows progress, and warns about cache-link
    failures instead of raising for them.
    """
    from dvc.utils.collections import ensure_list

    # --recursive discovers filenames itself, so an explicit --file clashes.
    if recursive and fname:
        raise RecursiveAddingWhileUsingFilename()

    targets = ensure_list(targets)

    # "-o" without --to-remote means "transfer straight into the cache".
    to_cache = kwargs.get("out") and not to_remote

    # Option-compatibility validation: transfer modes (--to-remote / -o)
    # forbid several flags; conversely --remote/--jobs require a transfer.
    invalid_opt = None
    if to_remote or to_cache:
        message = "{option} can't be used with "
        message += "--to-remote" if to_remote else "-o"
        if len(targets) != 1:
            invalid_opt = "multiple targets"
        elif no_commit:
            invalid_opt = "--no-commit option"
        elif recursive:
            invalid_opt = "--recursive option"
        elif kwargs.get("external"):
            invalid_opt = "--external option"
    else:
        message = "{option} can't be used without --to-remote"
        if kwargs.get("remote"):
            invalid_opt = "--remote"
        elif kwargs.get("jobs"):
            invalid_opt = "--jobs"

    if invalid_opt is not None:
        raise InvalidArgumentError(message.format(option=invalid_opt))

    link_failures = []
    stages_list = []
    num_targets = len(targets)
    with Tqdm(total=num_targets, desc="Add", unit="file", leave=True) as pbar:
        if num_targets == 1:
            # clear unneeded top-level progress bar for single target
            pbar.bar_format = "Adding..."
            pbar.refresh()
        for target in targets:
            sub_targets = _find_all_targets(repo, target, recursive)
            # Re-base the bar total on discovered sub-targets (the initial
            # total counted each top-level target as one unit).
            pbar.total += len(sub_targets) - 1

            if os.path.isdir(target) and len(sub_targets) > LARGE_DIR_SIZE:
                logger.warning(
                    "You are adding a large directory '{target}' recursively,"
                    " consider tracking it as a whole instead.\n"
                    "{purple}HINT:{nc} Remove the generated DVC file and then"
                    " run `{cyan}dvc add {target}{nc}`".format(
                        purple=colorama.Fore.MAGENTA,
                        cyan=colorama.Fore.CYAN,
                        nc=colorama.Style.RESET_ALL,
                        target=target,
                    )
                )

            stages = _create_stages(
                repo,
                sub_targets,
                fname,
                pbar=pbar,
                transfer=to_remote or to_cache,
                **kwargs,
            )

            # Re-raise graph conflicts with more actionable messages.
            try:
                repo.check_modified_graph(stages)
            except OverlappingOutputPathsError as exc:
                msg = (
                    "Cannot add '{out}', because it is overlapping with other "
                    "DVC tracked output: '{parent}'.\n"
                    "To include '{out}' in '{parent}', run "
                    "'dvc commit {parent_stage}'"
                ).format(
                    out=exc.overlapping_out.path_info,
                    parent=exc.parent.path_info,
                    parent_stage=exc.parent.stage.addressing,
                )
                raise OverlappingOutputPathsError(
                    exc.parent, exc.overlapping_out, msg
                )
            except OutputDuplicationError as exc:
                raise OutputDuplicationError(
                    exc.output, list(set(exc.stages) - set(stages))
                )

            link_failures.extend(
                _process_stages(
                    repo,
                    sub_targets,
                    stages,
                    no_commit,
                    pbar,
                    to_remote,
                    to_cache,
                    **kwargs,
                )
            )
            stages_list += stages

        if num_targets == 1:  # restore bar format for stats
            pbar.bar_format = pbar.BAR_FMT_DEFAULT

    # Link failures are non-fatal: the data is cached, only the workspace
    # links are missing, so warn with a recovery command instead of raising.
    if link_failures:
        msg = (
            "Some targets could not be linked from cache to workspace.\n{}\n"
            "To re-link these targets, reconfigure cache types and then run:\n"
            "\n\tdvc checkout {}"
        ).format(
            CacheLinkError.SUPPORT_LINK,
            " ".join([str(stage.relpath) for stage in link_failures]),
        )
        logger.warning(msg)

    return stages_list
def _make_git_add_cmd(paths: Union[str, Iterable[str]]) -> str:
    """Build a ready-to-print, shell-quoted `git add` hint for *paths*."""
    quoted = (shlex.quote(path) for path in ensure_list(paths))
    # Leading tab expands to a 4-space indent for display.
    return "\tgit add {}".format(" ".join(quoted)).expandtabs(4)
def track_file(self, paths: Union[str, Iterable[str], None] = None) -> None:
    """Record *paths* as needing `git add` (or autostaging) later."""
    # set.update accepts any iterable; ensure_list normalizes str/None.
    self.files_to_track.update(ensure_list(paths))