Example #1
0
class Project(object):
    DVC_DIR = '.dvc'

    def __init__(self, root_dir):
        self.root_dir = os.path.abspath(os.path.realpath(root_dir))
        self.dvc_dir = os.path.join(self.root_dir, self.DVC_DIR)

        self.config = Config(self.dvc_dir)
        self.scm = SCM(self.root_dir)
        self.lock = Lock(self.dvc_dir)
        self.link_state = LinkState(self.root_dir, self.dvc_dir)
        self.logger = Logger(self.config._config[Config.SECTION_CORE].get(
            Config.SECTION_CORE_LOGLEVEL, None))
        self.cache = Cache(self)
        self.cloud = DataCloud(cache=self.cache, config=self.config._config)
        self.updater = Updater(self.dvc_dir)

        self._ignore()

        self.updater.check()

    @staticmethod
    def init(root_dir=os.curdir, no_scm=False):
        """
        Initiate dvc project in directory.

        Args:
            root_dir: Path to project's root directory.

        Returns:
            Project instance.

        Raises:
            KeyError: Raises an exception.
        """
        root_dir = os.path.abspath(root_dir)
        dvc_dir = os.path.join(root_dir, Project.DVC_DIR)

        scm = SCM(root_dir)
        if type(scm) == Base and not no_scm:
            msg = "{} is not tracked by any supported scm tool(e.g. git).".format(
                root_dir)
            raise InitError(msg)

        os.mkdir(dvc_dir)

        config = Config.init(dvc_dir)
        proj = Project(root_dir)

        scm.add([config.config_file])
        if scm.ignore_file():
            scm.add([os.path.join(dvc_dir, scm.ignore_file())])

        return proj

    def _ignore(self):
        l = [
            self.link_state.state_file, self.link_state._lock_file.lock_file,
            self.lock.lock_file, self.config.config_local_file,
            self.updater.updater_file
        ]

        if self.cache.local.cache_dir.startswith(self.root_dir):
            l += [self.cache.local.cache_dir]

        self.scm.ignore_list(l)

    def install(self):
        self.scm.install()

    def to_dvc_path(self, path):
        return os.path.relpath(path, self.root_dir)

    def add(self, fname):
        out = os.path.basename(os.path.normpath(fname))
        stage_fname = out + Stage.STAGE_FILE_SUFFIX
        cwd = os.path.dirname(os.path.abspath(fname))
        stage = Stage.loads(project=self,
                            cmd=None,
                            deps=[],
                            outs=[out],
                            fname=stage_fname,
                            cwd=cwd)

        stage.save()
        stage.dump()
        return stage

    def remove(self, target):
        if not Stage.is_stage_file(target):
            raise StageNotFoundError(target)

        stage = Stage.load(self, target)
        for out in stage.outs:
            out.remove()

        return stage

    def run(self,
            cmd=None,
            deps=[],
            outs=[],
            outs_no_cache=[],
            fname=Stage.STAGE_FILE,
            cwd=os.curdir,
            no_exec=False):
        stage = Stage.loads(project=self,
                            fname=fname,
                            cmd=cmd,
                            cwd=cwd,
                            outs=outs,
                            outs_no_cache=outs_no_cache,
                            deps=deps)
        if not no_exec:
            stage.run()
        stage.dump()
        return stage

    def _reproduce_stage(self, stages, node, force):
        stage = stages[node].reproduce(force=force)
        if not stage:
            return []
        stage.dump()
        return [stage]

    def reproduce(self, target, recursive=True, force=False):
        stages = nx.get_node_attributes(self.graph(), 'stage')
        node = os.path.relpath(os.path.abspath(target), self.root_dir)
        if node not in stages:
            raise StageNotFoundError(target)

        if recursive:
            return self._reproduce_stages(stages, node, force)

        return self._reproduce_stage(stages, node, force)

    def _reproduce_stages(self, stages, node, force):
        result = []
        for n in nx.dfs_postorder_nodes(self.graph(), node):
            try:
                result += self._reproduce_stage(stages, n, force)
            except Exception as ex:
                raise ReproductionError(stages[n].relpath, ex)
        return result

    def checkout(self, target=None):
        if target:
            if not Stage.is_stage_file(target):
                raise StageNotFoundError(target)
            stages = [Stage.load(self, target)]
        else:
            self.link_state.remove_all()
            stages = self.stages()

        for stage in stages:
            stage.checkout()

    def _used_cache(self, target=None):
        cache_set = set()

        if target:
            stages = [Stage.load(self, target)]
        else:
            stages = self.stages()

        for stage in stages:
            for out in stage.outs:
                if out.path_info['scheme'] != 'local':
                    continue

                if not out.use_cache or not out.cache:
                    continue

                cache_set |= set([out.cache])
                if self.cache.local.is_dir_cache(out.cache) and os.path.isfile(
                        out.cache):
                    dir_cache = self.cache.local.dir_cache(out.cache)
                    cache_set |= set(dir_cache.values())

        return list(cache_set)

    def gc(self):
        clist = self._used_cache()
        for cache in self.cache.local.all():
            if cache in clist:
                continue
            os.unlink(cache)
            self.logger.info(u'\'{}\' was removed'.format(
                self.to_dvc_path(cache)))

    def push(self, target=None, jobs=1, remote=None):
        return self.cloud.push(self._used_cache(target), jobs, remote=remote)

    def fetch(self, target=None, jobs=1, remote=None):
        return self.cloud.pull(self._used_cache(target), jobs, remote=remote)

    def pull(self, target=None, jobs=1, remote=None):
        ret = self.fetch(target, jobs, remote=remote)
        self.checkout()
        return ret

    def _local_status(self, target=None):
        status = {}

        if target:
            stages = [Stage.load(self, target)]
        else:
            stages = self.stages()

        for stage in stages:
            status.update(stage.status())

        return status

    def _cloud_status(self, target=None, jobs=1, remote=None):
        status = {}
        for target, ret in self.cloud.status(self._used_cache(target),
                                             jobs,
                                             remote=remote):
            if ret == cloud.STATUS_UNKNOWN or ret == cloud.STATUS_OK:
                continue

            prefix_map = {
                cloud.STATUS_DELETED: 'deleted',
                cloud.STATUS_MODIFIED: 'modified',
                cloud.STATUS_NEW: 'new',
            }

            path = os.path.relpath(target, self.cache.local.cache_dir)

            status[path] = prefix_map[ret]

        return status

    def status(self, target=None, jobs=1, cloud=False, remote=None):
        if cloud:
            return self._cloud_status(target, jobs, remote=remote)
        return self._local_status(target)

    def _read_metric_json(self, fd, json_path):
        parser = parse(json_path)
        return [x.value for x in parser.find(json.load(fd))]

    def _do_read_metric_tsv(self, reader, row, col):
        if col != None and row != None:
            return [reader[row][col]]
        elif col != None:
            return [r[col] for r in reader]
        elif row != None:
            return reader[row]
        return None

    def _read_metric_htsv(self, fd, htsv_path):
        col, row = htsv_path.split(',')
        row = int(row)
        reader = list(csv.DictReader(fd, delimiter='\t'))
        return self._do_read_metric_tsv(reader, row, col)

    def _read_metric_tsv(self, fd, tsv_path):
        col, row = tsv_path.split(',')
        row = int(row)
        col = int(col)
        reader = list(csv.reader(fd, delimiter='\t'))
        return self._do_read_metric_tsv(reader, row, col)

    def _read_metric(self,
                     path,
                     json_path=None,
                     tsv_path=None,
                     htsv_path=None):
        ret = None
        try:
            with open(path, 'r') as fd:
                if json_path:
                    ret = self._read_metric_json(fd, json_path)
                elif tsv_path:
                    ret = self._read_metric_tsv(fd, tsv_path)
                elif htsv_path:
                    ret = self._read_metric_htsv(fd, htsv_path)
                else:
                    ret = fd.read()
        except Exception as exc:
            self.logger.error('Unable to read metric in \'{}\''.format(path),
                              exc)

        return ret

    def metrics(self, path, json_path=None, tsv_path=None, htsv_path=None):
        res = {}
        saved = self.scm.active_branch()
        for branch in self.scm.list_branches():
            self.scm.checkout(branch)
            self.checkout()
            res[branch] = self._read_metric(path,
                                            json_path=json_path,
                                            tsv_path=tsv_path,
                                            htsv_path=htsv_path)
        self.scm.checkout(saved)
        self.checkout()
        return res

    def graph(self):
        G = nx.DiGraph()
        stages = self.stages()

        outs_map = {}
        for stage in stages:
            for o in stage.outs:
                outs_map[o.path] = stage

        for stage in stages:
            node = os.path.relpath(stage.path, self.root_dir)
            G.add_node(node, stage=stage)
            for dep in stage.deps:
                dep_stage = outs_map.get(dep.path, None)
                if not dep_stage:
                    continue
                dep_node = os.path.relpath(dep_stage.path, self.root_dir)
                G.add_node(dep_node, stage=dep_stage)
                G.add_edge(node, dep_node)

        return G

    def stages(self):
        stages = []
        for root, dirs, files in os.walk(self.root_dir):
            for fname in files:
                path = os.path.join(root, fname)
                if not Stage.is_stage_file(path):
                    continue
                stages.append(Stage.load(self, path))
        return stages

    def outs(self):
        outs = []
        for stage in self.stages():
            outs += stage.outs
        return outs
Example #2
0
class Project(object):
    DVC_DIR = '.dvc'

    def __init__(self, root_dir):
        self.root_dir = os.path.abspath(os.path.realpath(root_dir))
        self.dvc_dir = os.path.join(self.root_dir, self.DVC_DIR)

        self.scm = SCM(self.root_dir)
        self.lock = Lock(self.dvc_dir)
        self.cache = Cache(self.dvc_dir)
        self.state = State(self.root_dir, self.dvc_dir)
        self.config = Config(self.dvc_dir)
        self.logger = Logger(self.config._config[Config.SECTION_CORE].get(
            Config.SECTION_CORE_LOGLEVEL, None))
        self.cloud = DataCloud(cache=self.cache,
                               state=self.state,
                               config=self.config._config)

    @staticmethod
    def init(root_dir=os.curdir):
        """
        Initiate dvc project in directory.

        Args:
            root_dir: Path to project's root directory.

        Returns:
            Project instance.

        Raises:
            KeyError: Raises an exception.
        """
        root_dir = os.path.abspath(root_dir)
        dvc_dir = os.path.join(root_dir, Project.DVC_DIR)
        os.mkdir(dvc_dir)

        config = Config.init(dvc_dir)
        cache = Cache.init(dvc_dir)
        state = State.init(root_dir, dvc_dir)
        lock = Lock(dvc_dir)

        scm = SCM(root_dir)
        scm.ignore_list([cache.cache_dir, state.state_file, lock.lock_file])

        ignore_file = os.path.join(dvc_dir, scm.ignore_file())
        scm.add([config.config_file, ignore_file])

        return Project(root_dir)

    def to_dvc_path(self, path):
        return os.path.relpath(path, self.root_dir)

    def add(self, fname):
        out = os.path.basename(fname)
        stage_fname = out + Stage.STAGE_FILE_SUFFIX
        cwd = os.path.dirname(os.path.abspath(fname))
        stage = Stage.loads(project=self,
                            cmd=None,
                            deps=[],
                            outs=[out],
                            fname=stage_fname,
                            cwd=cwd)

        stage.save()
        stage.dump()
        return stage

    def remove(self, target):
        if not Stage.is_stage_file(target):
            raise StageNotFoundError(target)

        stage = Stage.load(self, target)
        for out in stage.outs:
            out.remove()

        return stage

    def run(self,
            cmd=None,
            deps=[],
            outs=[],
            outs_no_cache=[],
            fname=Stage.STAGE_FILE,
            cwd=os.curdir,
            no_exec=False):
        stage = Stage.loads(project=self,
                            fname=fname,
                            cmd=cmd,
                            cwd=cwd,
                            outs=outs,
                            outs_no_cache=outs_no_cache,
                            deps=deps)
        if not no_exec:
            stage.run()
        stage.dump()
        return stage

    def _reproduce_stage(self, stages, node, force):
        stage = stages[node].reproduce(force=force)
        if not stage:
            return []
        stage.dump()
        return [stage]

    def reproduce(self, target, recursive=True, force=False):
        stages = nx.get_node_attributes(self.graph(), 'stage')
        node = os.path.relpath(os.path.abspath(target), self.root_dir)
        if node not in stages:
            raise StageNotFoundError(target)

        if recursive:
            return self._reproduce_stages(stages, node, force)

        return self._reproduce_stage(stages, node, force)

    def _reproduce_stages(self, stages, node, force):
        result = []
        for n in nx.dfs_postorder_nodes(self.graph(), node):
            try:
                result += self._reproduce_stage(stages, n, force)
            except Exception as ex:
                raise ReproductionError(stages[n].relpath, ex)
        return result

    def _remove_untracked_hardlinks(self):
        untracked = self.scm.untracked_files()
        cache = dict((System.inode(c), c) for c in self.cache.all())
        for file in untracked:
            inode = System.inode(file)
            if inode not in cache.keys():
                continue

            Logger.info(u'Remove \'{}\''.format(file))
            os.remove(file)

            dir = os.path.dirname(file)
            if len(dir) != 0 and not os.listdir(dir):
                Logger.info(u'Remove empty directory \'{}\''.format(dir))
                os.removedirs(dir)

    def checkout(self):
        self._remove_untracked_hardlinks()
        for stage in self.stages():
            stage.checkout()

    def _used_cache(self, target=None):
        cache_set = set()

        if target:
            stages = [Stage.load(self, target)]
        else:
            stages = self.stages()

        for stage in stages:
            for out in stage.outs:
                if not out.use_cache or not out.cache:
                    continue
                cache_set |= set([out.cache])
                if out.is_dir_cache(out.cache) and os.path.isfile(out.cache):
                    dir_cache = out.dir_cache()
                    cache_set |= set(dir_cache.values())

        return list(cache_set)

    def gc(self):
        clist = self._used_cache()
        for cache in self.cache.all():
            if cache in clist:
                continue
            os.unlink(cache)
            self.logger.info(u'\'{}\' was removed'.format(
                self.to_dvc_path(cache)))

    def push(self, target=None, jobs=1, remote=None):
        return self.cloud.push(self._used_cache(target), jobs, remote=remote)

    def fetch(self, target=None, jobs=1, remote=None):
        return self.cloud.pull(self._used_cache(target), jobs, remote=remote)

    def pull(self, target=None, jobs=1, remote=None):
        ret = self.fetch(target, jobs, remote=remote)
        self.checkout()
        return ret

    def _local_status(self, target=None):
        status = {}

        if target:
            stages = [Stage.load(self, target)]
        else:
            stages = self.stages()

        for stage in self.stages():
            status.update(stage.status())

        return status

    def _cloud_status(self, target=None, jobs=1, remote=None):
        status = {}
        for target, ret in self.cloud.status(self._used_cache(target),
                                             jobs,
                                             remote=remote):
            if ret == cloud.STATUS_UNKNOWN or ret == cloud.STATUS_OK:
                continue

            prefix_map = {
                cloud.STATUS_DELETED: 'deleted',
                cloud.STATUS_MODIFIED: 'modified',
                cloud.STATUS_NEW: 'new',
            }

            path = os.path.relpath(target, self.cache.cache_dir)

            status[path] = prefix_map[ret]

        return status

    def status(self, target=None, jobs=1, cloud=False, remote=None):
        if cloud:
            return self._cloud_status(target, jobs, remote=remote)
        return self._local_status(target)

    def graph(self):
        G = nx.DiGraph()

        for stage in self.stages():
            node = os.path.relpath(stage.path, self.root_dir)
            G.add_node(node, stage=stage)
            for dep in stage.deps:
                dep_stage = dep.stage()
                if not dep_stage:
                    continue
                dep_node = os.path.relpath(dep_stage.path, self.root_dir)
                G.add_node(dep_node, stage=dep_stage)
                G.add_edge(node, dep_node)

        return G

    def stages(self):
        stages = []
        for root, dirs, files in os.walk(self.root_dir):
            for fname in files:
                path = os.path.join(root, fname)
                if not Stage.is_stage_file(path):
                    continue
                stages.append(Stage.load(self, path))
        return stages

    def outs(self):
        outs = []
        for stage in self.stages():
            outs += stage.outs
        return outs
Example #3
0
class Project(object):
    DVC_DIR = '.dvc'

    def __init__(self, root_dir):
        self.root_dir = os.path.abspath(os.path.realpath(root_dir))
        self.dvc_dir = os.path.join(self.root_dir, self.DVC_DIR)

        self.scm = SCM(self.root_dir)
        self.lock = Lock(self.dvc_dir)
        self.cache = Cache(self.dvc_dir)
        self.state = State(self.root_dir, self.dvc_dir)
        self.config = Config(self.dvc_dir)
        self.logger = Logger(self.config._config)
        self.cloud = DataCloud(self.cache.cache_dir, self.config._config)

    @staticmethod
    def init(root_dir=os.curdir):
        """
        Initiate dvc project in directory.

        Args:
            root_dir: Path to project's root directory.

        Returns:
            Project instance.

        Raises:
            KeyError: Raises an exception.
        """
        root_dir = os.path.abspath(root_dir)
        dvc_dir = os.path.join(root_dir, Project.DVC_DIR)
        os.mkdir(dvc_dir)

        config = Config.init(dvc_dir)
        cache = Cache.init(dvc_dir)
        state = State.init(root_dir, dvc_dir)
        lock = Lock(dvc_dir)

        scm = SCM(root_dir)
        scm.ignore_list([cache.cache_dir, state.state_file, lock.lock_file])

        ignore_file = os.path.join(dvc_dir, scm.ignore_file())
        scm.add([config.config_file, ignore_file])

        return Project(root_dir)

    def to_dvc_path(self, path):
        return os.path.relpath(path, self.root_dir)

    def add(self, fname):
        out = os.path.basename(fname)
        stage_fname = out + Stage.STAGE_FILE_SUFFIX
        cwd = os.path.dirname(os.path.abspath(fname))
        stage = Stage.loads(project=self,
                            cmd=None,
                            deps=[],
                            outs=[out],
                            fname=stage_fname,
                            cwd=cwd)

        stage.save()
        stage.dump()
        return stage

    def remove(self, fname):
        stages = []
        output = Output.loads(self, fname)
        for out in self.outs():
            if out.path == output.path:
                stage = out.stage()
                stages.append(stage)

        if len(stages) == 0:
            raise StageNotFoundError(fname)

        for stage in stages:
            stage.remove()

        return stages

    def run(self,
            cmd=None,
            deps=[],
            outs=[],
            outs_no_cache=[],
            fname=Stage.STAGE_FILE,
            cwd=os.curdir,
            no_exec=False):
        stage = Stage.loads(project=self,
                            fname=fname,
                            cmd=cmd,
                            cwd=cwd,
                            outs=outs,
                            outs_no_cache=outs_no_cache,
                            deps=deps)
        if not no_exec:
            stage.run()
        stage.dump()
        return stage

    def _reproduce_stage(self, stages, node, force):
        if not stages[node].changed():
            return []

        stages[node].reproduce(force=force)
        stages[node].dump()
        return [stages[node]]

    def reproduce(self, target, recursive=True, force=False):
        stages = nx.get_node_attributes(self.graph(), 'stage')
        node = os.path.relpath(os.path.abspath(target), self.root_dir)
        if node not in stages:
            raise StageNotFoundError(target)

        if recursive:
            return self._reproduce_stages(stages, node, force)

        return self._reproduce_stage(stages, node, force)

    def _reproduce_stages(self, stages, node, force):
        result = []
        for n in nx.dfs_postorder_nodes(self.graph(), node):
            try:
                result += self._reproduce_stage(stages, n, force)
            except Exception as ex:
                raise ReproductionError(stages[n].relpath, ex)
        return result

    def checkout(self):
        for stage in self.stages():
            stage.checkout()

    def _used_cache(self):
        clist = []
        for stage in self.stages():
            for out in stage.outs:
                if not out.use_cache:
                    continue
                if out.cache not in clist:
                    clist.append(out.cache)
        return clist

    def _remove_cache_file(self, cache):
        os.chmod(cache, stat.S_IWRITE)
        os.unlink(cache)

    def _remove_cache(self, cache):
        if os.path.isfile(cache):
            self._remove_cache_file(cache)
            return

        for root, dirs, files in os.walk(cache, topdown=False):
            for dname in dirs:
                path = os.path.join(root, dname)
                os.rmdir(path)
            for fname in files:
                path = os.path.join(root, fname)
                self._remove_cache_file(path)
        os.rmdir(cache)

    def gc(self):
        clist = self._used_cache()
        for cache in self.cache.all():
            if cache in clist:
                continue
            self._remove_cache(cache)
            self.logger.info(u'\'{}\' was removed'.format(
                self.to_dvc_path(cache)))

    def push(self, jobs=1):
        self.cloud.push(self._used_cache(), jobs)

    def pull(self, jobs=1):
        self.cloud.pull(self._used_cache(), jobs)
        self.checkout()

    def status(self, jobs=1):
        return self.cloud.status(self._used_cache(), jobs)

    def graph(self):
        G = nx.DiGraph()

        for stage in self.stages():
            node = os.path.relpath(stage.path, self.root_dir)
            G.add_node(node, stage=stage)
            for dep in stage.deps:
                dep_stage = dep.stage()
                if not dep_stage:
                    continue
                dep_node = os.path.relpath(dep_stage.path, self.root_dir)
                G.add_node(dep_node, stage=dep_stage)
                G.add_edge(node, dep_node)

        return G

    def stages(self):
        stages = []
        for root, dirs, files in os.walk(self.root_dir):
            for fname in files:
                path = os.path.join(root, fname)
                if not Stage.is_stage_file(path):
                    continue
                stages.append(Stage.load(self, path))
        return stages

    def outs(self):
        outs = []
        for stage in self.stages():
            outs += stage.outs
        return outs