def __init__(self, root_dir): from dvc.logger import Logger from dvc.config import Config from dvc.state import State from dvc.lock import Lock from dvc.scm import SCM from dvc.cache import Cache from dvc.data_cloud import DataCloud from dvc.updater import Updater from dvc.prompt import Prompt self.root_dir = os.path.abspath(os.path.realpath(root_dir)) self.dvc_dir = os.path.join(self.root_dir, self.DVC_DIR) self.config = Config(self.dvc_dir) self.scm = SCM(self.root_dir, project=self) self.lock = Lock(self.dvc_dir) # NOTE: storing state and link_state in the repository itself to avoid # any possible state corruption in 'shared cache dir' scenario. self.state = State(self, self.config._config) core = self.config._config[Config.SECTION_CORE] self.logger = Logger(core.get(Config.SECTION_CORE_LOGLEVEL, None)) self.cache = Cache(self) self.cloud = DataCloud(self, config=self.config._config) self.updater = Updater(self.dvc_dir) self.prompt = Prompt() self._files_to_git_add = [] self._ignore() self.updater.check()
class Project(object): DVC_DIR = '.dvc' def __init__(self, root_dir): from dvc.logger import Logger from dvc.config import Config from dvc.state import State from dvc.lock import Lock from dvc.scm import SCM from dvc.cache import Cache from dvc.data_cloud import DataCloud from dvc.updater import Updater from dvc.prompt import Prompt self.root_dir = os.path.abspath(os.path.realpath(root_dir)) self.dvc_dir = os.path.join(self.root_dir, self.DVC_DIR) self.config = Config(self.dvc_dir) self.scm = SCM(self.root_dir, project=self) self.lock = Lock(self.dvc_dir) # NOTE: storing state and link_state in the repository itself to avoid # any possible state corruption in 'shared cache dir' scenario. self.state = State(self, self.config._config) core = self.config._config[Config.SECTION_CORE] self.logger = Logger(core.get(Config.SECTION_CORE_LOGLEVEL, None)) self.cache = Cache(self) self.cloud = DataCloud(self, config=self.config._config) self.updater = Updater(self.dvc_dir) self.prompt = Prompt() self._files_to_git_add = [] self._ignore() self.updater.check() def _remind_to_git_add(self): if len(self._files_to_git_add) == 0: return msg = '\nTo track the changes with git run:\n\n' msg += '\tgit add ' + " ".join(self._files_to_git_add) self.logger.info(msg) @staticmethod def init(root_dir=os.curdir, no_scm=False, force=False): """ Initiate dvc project in directory. Args: root_dir: Path to project's root directory. Returns: Project instance. Raises: KeyError: Raises an exception. """ import shutil from dvc.scm import SCM, Base from dvc.config import Config from dvc.logger import Logger root_dir = os.path.abspath(root_dir) dvc_dir = os.path.join(root_dir, Project.DVC_DIR) scm = SCM(root_dir) if type(scm) == Base and not no_scm: msg = "{} is not tracked by any supported scm tool(e.g. git)." raise InitError(msg.format(root_dir)) if os.path.isdir(dvc_dir): if not force: msg = "'{}' exists. Use '-f' to force." raise InitError(msg.format(os.path.relpath(dvc_dir))) shutil.rmtree(dvc_dir) os.mkdir(dvc_dir) config = Config.init(dvc_dir) proj = Project(root_dir) scm.add([config.config_file]) if scm.ignore_file(): scm.add([os.path.join(dvc_dir, scm.ignore_file())]) Logger.info('You can now commit the changes to git.') return proj @staticmethod def load_all(projects_paths): """ Instantiate all projects in the given list of paths. Args: projects_paths: List of paths to projects. Returns: List of Project instances in the same order of the given paths. """ return [Project(path) for path in projects_paths] def destroy(self): import shutil for stage in self.stages(): stage.remove() shutil.rmtree(self.dvc_dir) def _ignore(self): flist = [ self.state.state_file, self.lock.lock_file, self.config.config_local_file, self.updater.updater_file ] if self.cache.local.cache_dir.startswith(self.root_dir): flist += [self.cache.local.cache_dir] self.scm.ignore_list(flist) def install(self): self.scm.install() def to_dvc_path(self, path): return os.path.relpath(path, self.root_dir) def _check_output_duplication(self, outs): from dvc.exceptions import OutputDuplicationError for stage in self.stages(): for o in stage.outs: for out in outs: if o.path == out.path and o.stage.path != out.stage.path: stages = [o.stage.relpath, out.stage.relpath] raise OutputDuplicationError(o.path, stages) def _check_circular_dependency(self, deps, outs): from dvc.exceptions import CircularDependencyError circular_dependencies = (set(file.path for file in deps) & set(file.path for file in outs)) if circular_dependencies: raise CircularDependencyError(circular_dependencies.pop()) def add(self, fname, recursive=False): fnames = [] if recursive and os.path.isdir(fname): fnames = [] for root, dirs, files in os.walk(fname): for f in files: path = os.path.join(root, f) if Stage.is_stage_file(path): continue if os.path.basename(path) == self.scm.ignore_file(): continue if self.scm.is_tracked(path): continue fnames.append(path) else: fnames = [fname] stages = [] self._files_to_git_add = [] with self.state: for f in fnames: stage = Stage.loads(project=self, outs=[f], add=True) self._check_output_duplication(stage.outs) stage.save() stage.dump() stages.append(stage) self._remind_to_git_add() return stages def remove(self, target, outs_only=False): stage = Stage.load(self, target) if outs_only: stage.remove_outs() else: stage.remove() return stage def lock_stage(self, target, unlock=False): stage = Stage.load(self, target) stage.locked = False if unlock else True stage.dump() return stage def move(self, from_path, to_path): import dvc.output as Output from_out = Output.loads_from(Stage(self, cwd=os.curdir), [from_path])[0] found = False self._files_to_git_add = [] with self.state: for stage in self.stages(): for out in stage.outs: if out.path != from_out.path: continue if not stage.is_data_source: raise MoveNotDataSourceError(stage.relpath) found = True to_out = Output.loads_from(out.stage, [to_path], out.cache, out.metric)[0] out.move(to_out) stage_base = os.path.basename(stage.path) stage_base = stage_base.rstrip(Stage.STAGE_FILE_SUFFIX) stage_dir = os.path.dirname(stage.path) from_base = os.path.basename(from_path) to_base = os.path.basename(to_path) if stage_base == from_base: os.unlink(stage.path) path = to_base + Stage.STAGE_FILE_SUFFIX stage.path = os.path.join(stage_dir, path) stage.dump() self._remind_to_git_add() if not found: msg = 'Unable to find dvcfile with output \'{}\'' raise DvcException(msg.format(from_path)) def _unprotect_file(self, path): import stat import uuid from dvc.utils import copyfile, move, remove self.logger.debug("Unprotecting '{}'".format(path)) tmp = os.path.join(os.path.dirname(path), '.' + str(uuid.uuid4())) move(path, tmp) copyfile(tmp, path) remove(tmp) os.chmod(path, os.stat(path).st_mode | stat.S_IWRITE) def _unprotect_dir(self, path): for root, dirs, files in os.walk(path): for f in files: path = os.path.join(root, f) self._unprotect_file(path) def unprotect(self, path): if not os.path.exists(path): raise DvcException("Can't unprotect non-existing " "data '{}'".format(path)) if os.path.isdir(path): self._unprotect_dir(path) else: self._unprotect_file(path) def run(self, cmd=None, deps=[], outs=[], outs_no_cache=[], metrics_no_cache=[], fname=Stage.STAGE_FILE, cwd=os.curdir, no_exec=False, overwrite=False): stage = Stage.loads(project=self, fname=fname, cmd=cmd, cwd=cwd, outs=outs, outs_no_cache=outs_no_cache, metrics_no_cache=metrics_no_cache, deps=deps, overwrite=overwrite) self._check_output_duplication(stage.outs) self._check_circular_dependency(stage.deps, stage.outs) self._files_to_git_add = [] with self.state: if not no_exec: stage.run() stage.dump() self._remind_to_git_add() return stage def imp(self, url, out): stage = Stage.loads(project=self, cmd=None, deps=[url], outs=[out]) self._check_output_duplication(stage.outs) self._files_to_git_add = [] with self.state: stage.run() stage.dump() self._remind_to_git_add() return stage def _reproduce_stage(self, stages, node, force, dry, interactive): stage = stages[node] if stage.locked: msg = 'DVC file \'{}\' is locked. Its dependencies are not ' \ 'going to be reproduced.' self.logger.warn(msg.format(stage.relpath)) stage = stage.reproduce(force=force, dry=dry, interactive=interactive) if not stage: return [] if not dry: stage.dump() return [stage] def reproduce(self, target=None, recursive=True, force=False, dry=False, interactive=False, pipeline=False, all_pipelines=False): if target is None and not all_pipelines: raise ValueError() if not interactive: config = self.config core = config._config[config.SECTION_CORE] interactive = core.get(config.SECTION_CORE_INTERACTIVE, False) targets = [] if pipeline or all_pipelines: if pipeline: stage = Stage.load(self, target) node = os.path.relpath(stage.path, self.root_dir) pipelines = [self._get_pipeline(node)] else: pipelines = self.pipelines() for G in pipelines: for node in G.nodes(): if G.in_degree(node) == 0: targets.append(os.path.join(self.root_dir, node)) else: targets.append(target) self._files_to_git_add = [] ret = [] with self.state: for target in targets: stages = self._reproduce(target, recursive=recursive, force=force, dry=dry, interactive=interactive) ret.extend(stages) self._remind_to_git_add() return ret def _reproduce(self, target, recursive=True, force=False, dry=False, interactive=False): import networkx as nx stage = Stage.load(self, target) G = self.graph()[1] stages = nx.get_node_attributes(G, 'stage') node = os.path.relpath(stage.path, self.root_dir) if recursive: ret = self._reproduce_stages(G, stages, node, force, dry, interactive) else: ret = self._reproduce_stage(stages, node, force, dry, interactive) return ret def _reproduce_stages(self, G, stages, node, force, dry, interactive): import networkx as nx result = [] for n in nx.dfs_postorder_nodes(G, node): try: result += self._reproduce_stage(stages, n, force, dry, interactive) except Exception as ex: raise ReproductionError(stages[n].relpath, ex) return result def _cleanup_unused_links(self, all_stages): used = [] for stage in all_stages: for out in stage.outs: used.append(out.path) self.state.remove_unused_links(used) def checkout(self, target=None, with_deps=False): all_stages = self.active_stages() if target: stages = self._collect(target, with_deps=with_deps) else: stages = all_stages with self.state: self._cleanup_unused_links(all_stages) for stage in stages: if stage.locked: msg = 'DVC file \'{}\' is locked. Its dependencies are ' \ 'not going to be checked out.' self.logger.warn(msg.format(stage.relpath)) stage.checkout() def _get_pipeline(self, node): pipelines = list(filter(lambda g: node in g.nodes(), self.pipelines())) assert len(pipelines) == 1 return pipelines[0] def _collect(self, target, with_deps=False): import networkx as nx stage = Stage.load(self, target) if not with_deps: return [stage] node = os.path.relpath(stage.path, self.root_dir) G = self._get_pipeline(node) stages = nx.get_node_attributes(G, 'stage') ret = [stage] for n in nx.dfs_postorder_nodes(G, node): ret.append(stages[n]) return ret def _collect_dir_cache(self, out, branch=None, remote=None, force=False, jobs=None): info = out.dumpd() ret = [info] r = out.remote md5 = info[r.PARAM_MD5] if self.cache.local.changed_cache_file(md5): try: self.cloud.pull(ret, jobs=jobs, remote=remote, show_checksums=False) except DvcException as exc: msg = "Failed to pull cache for '{}': {}" self.logger.debug(msg.format(out, exc)) if self.cache.local.changed_cache_file(md5): msg = "Missing cache for directory '{}'. " \ "Cache for files inside will be lost. " \ "Would you like to continue? Use '-f' to force. " if not (force or self.prompt.prompt(msg, False)): raise DvcException("Unable to fully collect " "used cache without cache " "for directory '{}'".format(out)) else: return ret for i in self.cache.local.load_dir_cache(md5): i['branch'] = branch i[r.PARAM_PATH] = os.path.join(info[r.PARAM_PATH], i[r.PARAM_RELPATH]) ret.append(i) return ret def _collect_used_cache(self, out, branch=None, remote=None, force=False, jobs=None): if not out.use_cache or not out.info: return [] info = out.dumpd() info['branch'] = branch ret = [info] if out.path_info['scheme'] != 'local': return ret md5 = info[out.remote.PARAM_MD5] cache = self.cache.local.get(md5) if not out.remote.is_dir_cache(cache): return ret return self._collect_dir_cache(out, branch=branch, remote=remote, force=force, jobs=jobs) def _used_cache(self, target=None, all_branches=False, active=True, with_deps=False, all_tags=False, remote=None, force=False, jobs=None): cache = {} cache['local'] = [] cache['s3'] = [] cache['gs'] = [] cache['hdfs'] = [] cache['ssh'] = [] cache['azure'] = [] for branch in self.scm.brancher(all_branches=all_branches, all_tags=all_tags): if target: stages = self._collect(target, with_deps=with_deps) elif active: stages = self.active_stages() else: stages = self.stages() for stage in stages: if active and not target and stage.locked: msg = 'DVC file \'{}\' is locked. Its dependencies are ' \ 'not going to be pushed/pulled/fetched.' self.logger.warn(msg.format(stage.relpath)) for out in stage.outs: scheme = out.path_info['scheme'] cache[scheme] += self._collect_used_cache(out, branch=branch, remote=remote, force=force, jobs=jobs) return cache @staticmethod def merge_cache_lists(clists): merged_cache = collections.defaultdict(list) for cache_list in clists: for scheme, cache in cache_list.items(): for item in cache: if item not in merged_cache[scheme]: merged_cache[scheme].append(item) return merged_cache @staticmethod def load_all_used_cache(projects, target=None, all_branches=False, active=True, with_deps=False, all_tags=False, remote=None, force=False, jobs=None): clists = [] for project in projects: with project.state: project_clist = project._used_cache(target=None, all_branches=all_branches, active=False, with_deps=with_deps, all_tags=all_tags, remote=remote, force=force, jobs=jobs) clists.append(project_clist) return clists def _do_gc(self, typ, func, clist): removed = func(clist) if not removed: self.logger.info("No unused {} cache to remove.".format(typ)) def gc(self, all_branches=False, cloud=False, remote=None, with_deps=False, all_tags=False, force=False, jobs=None, projects=None): all_projects = [self] if projects is not None and len(projects) > 0: all_projects.extend(Project.load_all(projects)) all_clists = Project.load_all_used_cache(all_projects, target=None, all_branches=all_branches, active=False, with_deps=with_deps, all_tags=all_tags, remote=remote, force=force, jobs=jobs) if len(all_clists) > 1: clist = Project.merge_cache_lists(all_clists) else: clist = all_clists[0] with self.state: self._do_gc('local', self.cache.local.gc, clist) if self.cache.s3: self._do_gc('s3', self.cache.s3.gc, clist) if self.cache.gs: self._do_gc('gs', self.cache.gs.gc, clist) if self.cache.ssh: self._do_gc('ssh', self.cache.ssh.gc, clist) if self.cache.hdfs: self._do_gc('hdfs', self.cache.hdfs.gc, clist) if self.cache.azure: self._do_gc('azure', self.cache.azure.gc, clist) if cloud: self._do_gc('remote', self.cloud._get_cloud(remote, 'gc -c').gc, clist) def push(self, target=None, jobs=1, remote=None, all_branches=False, show_checksums=False, with_deps=False, all_tags=False): with self.state: used = self._used_cache(target, all_branches=all_branches, all_tags=all_tags, with_deps=with_deps, force=True, remote=remote, jobs=jobs)['local'] self.cloud.push(used, jobs, remote=remote, show_checksums=show_checksums) def fetch(self, target=None, jobs=1, remote=None, all_branches=False, show_checksums=False, with_deps=False, all_tags=False): with self.state: used = self._used_cache(target, all_branches=all_branches, all_tags=all_tags, with_deps=with_deps, force=True, remote=remote, jobs=jobs)['local'] self.cloud.pull(used, jobs, remote=remote, show_checksums=show_checksums) def pull(self, target=None, jobs=1, remote=None, all_branches=False, show_checksums=False, with_deps=False, all_tags=False): self.fetch(target, jobs, remote=remote, all_branches=all_branches, all_tags=all_tags, show_checksums=show_checksums, with_deps=with_deps) self.checkout(target=target, with_deps=with_deps) def _local_status(self, target=None): status = {} if target: stages = [Stage.load(self, target)] else: stages = self.active_stages() for stage in stages: if stage.locked: msg = 'DVC file \'{}\' is locked. Its dependencies are not ' \ 'going to be shown in the status output.' self.logger.warn(msg.format(stage.relpath)) status.update(stage.status()) return status def _cloud_status(self, target=None, jobs=1, remote=None, show_checksums=False, all_branches=False, with_deps=False, all_tags=False): import dvc.remote.base as cloud used = self._used_cache(target, all_branches=all_branches, all_tags=all_tags, with_deps=with_deps, force=True, remote=remote, jobs=jobs)['local'] status = {} for md5, ret in self.cloud.status(used, jobs, remote=remote, show_checksums=show_checksums): if ret == cloud.STATUS_OK: continue prefix_map = { cloud.STATUS_DELETED: 'deleted', cloud.STATUS_NEW: 'new', } status[md5] = prefix_map[ret] return status def status(self, target=None, jobs=1, cloud=False, remote=None, show_checksums=False, all_branches=False, with_deps=False, all_tags=False): with self.state: if cloud: return self._cloud_status(target, jobs, remote=remote, show_checksums=show_checksums, all_branches=all_branches, with_deps=with_deps, all_tags=all_tags) return self._local_status(target) def _read_metric_json(self, fd, json_path): import json from jsonpath_rw import parse parser = parse(json_path) return [x.value for x in parser.find(json.load(fd))] def _do_read_metric_xsv(self, reader, row, col): if col is not None and row is not None: return [reader[row][col]] elif col is not None: return [r[col] for r in reader] elif row is not None: return reader[row] return None def _read_metric_hxsv(self, fd, hxsv_path, delimiter): import csv col, row = hxsv_path.split(',') row = int(row) reader = list(csv.DictReader(fd, delimiter=delimiter)) return self._do_read_metric_xsv(reader, row, col) def _read_metric_xsv(self, fd, xsv_path, delimiter): import csv col, row = xsv_path.split(',') row = int(row) col = int(col) reader = list(csv.reader(fd, delimiter=delimiter)) return self._do_read_metric_xsv(reader, row, col) def _read_metric(self, path, typ=None, xpath=None): ret = None if not os.path.exists(path): return ret try: with open(path, 'r') as fd: if typ == 'json': ret = self._read_metric_json(fd, xpath) elif typ == 'csv': ret = self._read_metric_xsv(fd, xpath, ',') elif typ == 'tsv': ret = self._read_metric_xsv(fd, xpath, '\t') elif typ == 'hcsv': ret = self._read_metric_hxsv(fd, xpath, ',') elif typ == 'htsv': ret = self._read_metric_hxsv(fd, xpath, '\t') else: ret = fd.read() except Exception as exc: self.logger.error('Unable to read metric in \'{}\''.format(path), exc) return ret def _find_output_by_path(self, path, outs=None): from dvc.exceptions import OutputDuplicationError if not outs: astages = self.active_stages() outs = [out for stage in astages for out in stage.outs] abs_path = os.path.abspath(path) matched = [out for out in outs if out.path == abs_path] stages = [out.stage.relpath for out in matched] if len(stages) > 1: raise OutputDuplicationError(path, stages) return matched[0] if matched else None def metrics_show(self, path=None, typ=None, xpath=None, all_branches=False, all_tags=False): res = {} for branch in self.scm.brancher(all_branches=all_branches, all_tags=all_tags): astages = self.active_stages() outs = [out for stage in astages for out in stage.outs] if path: out = self._find_output_by_path(path, outs=outs) stage = out.stage.path if out else None if out and all( [out.metric, not typ, isinstance(out.metric, dict)]): entries = [(path, out.metric.get(out.PARAM_METRIC_TYPE, None), out.metric.get(out.PARAM_METRIC_XPATH, None))] else: entries = [(path, typ, xpath)] else: metrics = filter(lambda o: o.metric, outs) stage = None entries = [] for o in metrics: if not typ and isinstance(o.metric, dict): t = o.metric.get(o.PARAM_METRIC_TYPE, typ) x = o.metric.get(o.PARAM_METRIC_XPATH, xpath) else: t = typ x = xpath entries.append((o.path, t, x)) for fname, t, x in entries: if stage: self.checkout(stage) rel = os.path.relpath(fname) metric = self._read_metric(fname, typ=t, xpath=x) if not metric: continue if branch not in res: res[branch] = {} res[branch][rel] = metric for branch, val in res.items(): if all_branches or all_tags: self.logger.info('{}:'.format(branch)) for fname, metric in val.items(): self.logger.info('\t{}: {}'.format(fname, metric)) if res: return res if path: msg = 'File \'{}\' does not exist'.format(path) else: msg = 'No metric files in this repository. ' \ 'Use \'dvc metrics add\' to add a metric file to track.' raise DvcException(msg) def _metrics_modify(self, path, typ=None, xpath=None, delete=False): out = self._find_output_by_path(path) if not out: msg = 'Unable to find file \'{}\' in the pipeline'.format(path) raise DvcException(msg) if out.path_info['scheme'] != 'local': msg = 'Output \'{}\' scheme \'{}\' is not supported for metrics' raise DvcException(msg.format(out.path, out.path_info['scheme'])) if out.use_cache: msg = 'Cached output \'{}\' is not supported for metrics' raise DvcException(msg.format(out.rel_path)) if typ: if not isinstance(out.metric, dict): out.metric = {} out.metric[out.PARAM_METRIC_TYPE] = typ if xpath: if not isinstance(out.metric, dict): out.metric = {} out.metric[out.PARAM_METRIC_XPATH] = xpath if delete: out.metric = None out._verify_metric() out.stage.dump() def metrics_modify(self, path=None, typ=None, xpath=None): self._metrics_modify(path, typ, xpath) def metrics_add(self, path, typ=None, xpath=None): if not typ: typ = 'raw' self._metrics_modify(path, typ, xpath) def metrics_remove(self, path): self._metrics_modify(path, delete=True) def graph(self): import networkx as nx from dvc.exceptions import OutputDuplicationError G = nx.DiGraph() G_active = nx.DiGraph() stages = self.stages() outs = [] outs_by_path = {} for stage in stages: for o in stage.outs: existing = outs_by_path.get(o.path, None) if existing is not None: stages = [o.stage.relpath, existing.stage.relpath] raise OutputDuplicationError(o.path, stages) outs.append(o) outs_by_path[o.path] = o # collect the whole DAG for stage in stages: node = os.path.relpath(stage.path, self.root_dir) G.add_node(node, stage=stage) G_active.add_node(node, stage=stage) for dep in stage.deps: for out in outs: if out.path != dep.path \ and not dep.path.startswith(out.path + out.sep): continue dep_stage = out.stage dep_node = os.path.relpath(dep_stage.path, self.root_dir) G.add_node(dep_node, stage=dep_stage) G.add_edge(node, dep_node) if not stage.locked: G_active.add_node(dep_node, stage=dep_stage) G_active.add_edge(node, dep_node) return G, G_active def pipelines(self): import networkx as nx G, G_active = self.graph() if len(G.nodes()) == 0: return [] # find pipeline ends aka "output stages" ends = [node for node, in_degree in G.in_degree() if in_degree == 0] # filter out subgraphs that didn't exist in original G pipelines = [] for c in nx.weakly_connected_components(G_active): H = G_active.subgraph(c) found = False for node in ends: if node in H: found = True break if found: pipelines.append(H) return pipelines def stages(self): stages = [] outs = [] for root, dirs, files in os.walk(self.root_dir): for fname in files: path = os.path.join(root, fname) if not Stage.is_stage_file(path): continue stage = Stage.load(self, path) for out in stage.outs: outs.append(out.path + out.sep) stages.append(stage) def filter_dirs(dname): path = os.path.join(root, dname) if path == self.dvc_dir or path == self.scm.dir: return False for out in outs: if path == os.path.normpath(out) or path.startswith(out): return False return True dirs[:] = list(filter(filter_dirs, dirs)) return stages def active_stages(self): import networkx as nx stages = [] for G in self.pipelines(): stages.extend(list(nx.get_node_attributes(G, 'stage').values())) return stages