Ejemplo n.º 1
0
    def storage_path_hierarchy_test(self):
        """Check which StoragePath wins: the [Global] one, else the [AWS] one.

        Two configs are parsed. The first sets StoragePath in [Global] and the
        second omits it (and spells the cloud name as ``Aws``); both must
        resolve to an ``AWS`` cloud.
        """
        global_head = ("[Global]", "LogLevel =", "DataDir = ", "CacheDir = ",
                       "StateDir = ")
        cloud_sections = ("[AWS]", "StoragePath = awssb/aws_storage_path",
                          "CredentialPath =", "[GCP]",
                          "StoragePath = googlesb/google_storage_path")

        def make_cloud(global_tail):
            # Only the tail of the [Global] section differs between scenarios.
            text = '\n'.join(global_head + global_tail + cloud_sections)
            return DataCloud(None, self._conf(StringIO(text)))

        # Scenario 1: a StoragePath in [Global] takes precedence.
        with_global = make_cloud(
            ("Cloud = aws", "StoragePath = globalsb/global_storage_path"))
        self.assertEqual(with_global.typ, 'AWS')
        self.assertEqual(with_global._cloud.storage_bucket, 'globalsb')
        self.assertEqual(with_global._cloud.storage_prefix,
                         'global_storage_path')

        # Scenario 2: without a global StoragePath the [AWS] value is used.
        without_global = make_cloud(("Cloud = Aws",))
        self.assertEqual(without_global.typ, 'AWS')
        self.assertEqual(without_global._cloud.storage_bucket, 'awssb')
        self.assertEqual(without_global._cloud.storage_prefix,
                         'aws_storage_path')
Ejemplo n.º 2
0
Archivo: project.py Proyecto: rdwrt/dvc
    def __init__(self, root_dir):
        """Create the helper objects for the project located at ``root_dir``.

        ``root_dir`` is resolved with ``os.path.realpath`` and made absolute;
        the DVC directory is ``DVC_DIR`` joined onto that resolved root.
        """
        resolved_root = os.path.abspath(os.path.realpath(root_dir))
        dvc_dir = os.path.join(resolved_root, self.DVC_DIR)
        self.root_dir, self.dvc_dir = resolved_root, dvc_dir

        self.scm = SCM(resolved_root)
        self.lock = Lock(dvc_dir)
        self.cache = Cache(dvc_dir)
        self.state = State(resolved_root, dvc_dir)
        self.config = Config(dvc_dir)

        # Logger and DataCloud are both handed the same ``_config`` object.
        raw_config = self.config._config
        self.logger = Logger(raw_config)
        self.cloud = DataCloud(self.cache, raw_config)
Ejemplo n.º 3
0
    def test(self):
        """Each CLOUD_MAP key must select its mapped class; unknown names fail.

        For every ``(name, class)`` pair in ``DataCloud.CLOUD_MAP`` a minimal
        config is built and the resulting ``_cloud`` must be an instance of
        that class. An unrecognised cloud name must raise ``ConfigError``.
        """
        for cloud_name, cloud_class in DataCloud.CLOUD_MAP.items():
            config = {Config.SECTION_CORE: {'Cloud': cloud_name},
                      cloud_name: {'StoragePath': 'a/b',
                                   'ProjectName': 'name'}}
            cloud = DataCloud(self.dvc.cache, config)
            self.assertIsInstance(cloud._cloud, cloud_class)

        # Build the config outside the context manager so that only the
        # DataCloud construction itself can satisfy assertRaises.
        config = {Config.SECTION_CORE: {'Cloud': 'not_supported_type'}}
        with self.assertRaises(ConfigError):
            DataCloud(self.dvc.cache, config)
Ejemplo n.º 4
0
    def __init__(self, root_dir):
        """Create the helper objects for the project located at ``root_dir``.

        The logger is configured from the core section's log-level entry
        (``None`` when absent); the cloud receives the cache, the state and
        the whole ``_config`` object.
        """
        self.root_dir = os.path.abspath(os.path.realpath(root_dir))
        self.dvc_dir = os.path.join(self.root_dir, self.DVC_DIR)

        self.scm = SCM(self.root_dir)
        self.lock = Lock(self.dvc_dir)
        self.cache = Cache(self.dvc_dir)
        self.state = State(self.root_dir, self.dvc_dir)
        self.config = Config(self.dvc_dir)

        settings = self.config._config
        log_level = settings[Config.SECTION_CORE].get(
            Config.SECTION_CORE_LOGLEVEL, None)
        self.logger = Logger(log_level)

        self.cloud = DataCloud(cache=self.cache,
                               state=self.state,
                               config=settings)
Ejemplo n.º 5
0
    def aws_credentials_default_tes_t_DISABLED(self):
        """With an empty [AWS] CredentialPath, expect the default credentials.

        The config below leaves ``CredentialPath`` blank; the assertions then
        expect ``_get_credentials()`` to return the ``default_*`` pair.

        NOTE(review): the method name does not start with ``test`` —
        presumably on purpose, so that the runner does not collect it.
        """

        # NOTE(review): computed but never used below.
        default_path = os.path.expanduser('~/.aws/credentials')
        c = (
            "[Global]",
            "LogLevel =",
            "DataDir = ",
            "CacheDir = ",
            "StateDir = ",
            "Cloud = aws",
            "[AWS]",
            "StoragePath = awssb/aws_storage_path",
            "CredentialPath =",
        )
        s = StringIO('\n'.join(c))

        # Patch for the builtin open(); it is only created here, not activated.
        patcher = mock.patch(
            builtin_module_name + '.open',
            side_effect=self.mocked_open_aws_default_credentials)

        # NOTE(review): start() is commented out while stop() is still called
        # further down, so the patch is never active in this test. Stopping a
        # patcher that was never started may raise RuntimeError on older
        # mock / Python versions — confirm before re-enabling this test.
        # patcher.start()
        conf = self._conf(s)
        cloud = DataCloud(None, conf)
        aws_creds = cloud._cloud._get_credentials()
        patcher.stop()

        self.assertEqual(aws_creds[0], 'default_access_id')
        self.assertEqual(aws_creds[1], 'default_sekret')
Ejemplo n.º 6
0
    def aws_credentials_specified_tes_t_DISABLED(self, isfile_function):
        """With [AWS] CredentialPath set, expect the override credentials.

        Args:
            isfile_function: Unused in the body. Presumably a mock injected by
                a patch decorator that is outside this view — TODO confirm.

        NOTE(review): the method name does not start with ``test`` —
        presumably on purpose, so that the runner does not collect it.
        """

        c = (
            "[Global]",
            "LogLevel =",
            "DataDir = ",
            "CacheDir = ",
            "StateDir = ",
            "Cloud = aws",
            "[AWS]",
            "StoragePath = awssb/aws_storage_path",
            "CredentialPath = some_credential_path",
        )
        s = StringIO('\n'.join(c))

        patcher = mock.patch(
            builtin_module_name + '.open',
            side_effect=self.mocked_open_aws_default_credentials)

        # Use the patcher as a context manager so that the builtin open() is
        # restored even when config parsing or DataCloud construction raises;
        # otherwise the patch would leak into every test that runs afterwards.
        with patcher:
            conf = self._conf(s)
            cloud = DataCloud(None, conf)
            aws_creds = cloud._cloud._get_credentials()

        self.assertEqual(aws_creds[0], 'override_access_id')
        self.assertEqual(aws_creds[1], 'override_sekret')
Ejemplo n.º 7
0
 def test_unsupported(self):
     """DataCloud must raise ConfigError for an unknown remote URL scheme."""
     import copy

     # Work on a private copy: the original code wrote the bogus URL straight
     # into TEST_CONFIG, so the change leaked into whichever test ran next.
     config = copy.deepcopy(TEST_CONFIG)
     config[TEST_SECTION][
         Config.SECTION_REMOTE_URL] = 'notsupportedscheme://a/b'

     # Only the construction itself is allowed to satisfy assertRaises.
     with self.assertRaises(ConfigError):
         DataCloud(cache=self.dvc.cache,
                   state=self.dvc.state,
                   config=config)
Ejemplo n.º 8
0
    def __init__(self, root_dir):
        """Create the helper objects for the project located at ``root_dir``.

        The config is loaded first because the cache (directory and type) and
        the logger (log level) are parameterised from it. Finally the ignore
        list is registered via ``_ignore()`` and the updater check is run.
        """
        self.root_dir = os.path.abspath(os.path.realpath(root_dir))
        self.dvc_dir = os.path.join(self.root_dir, self.DVC_DIR)

        self.config = Config(self.dvc_dir)
        self.scm = SCM(self.root_dir)
        self.lock = Lock(self.dvc_dir)

        # Cache location and type come from the cache section (None if unset).
        cache_section = self.config._config[Config.SECTION_CACHE]
        self.cache = Cache(
            self.root_dir,
            self.dvc_dir,
            cache_dir=cache_section.get(Config.SECTION_CACHE_DIR, None),
            cache_type=cache_section.get(Config.SECTION_CACHE_TYPE, None))

        self.state = State(self.dvc_dir)

        core_section = self.config._config[Config.SECTION_CORE]
        self.logger = Logger(
            core_section.get(Config.SECTION_CORE_LOGLEVEL, None))

        self.cloud = DataCloud(cache=self.cache, config=self.config._config)
        self.updater = Updater(self.dvc_dir)

        self._ignore()

        self.updater.check()
Ejemplo n.º 9
0
    def storage_path_hierarchy_test(self):
        """Check which storagepath wins: the [core] one, else the [aws] one.

        Each scenario is a tuple of (lines of the [core] section, expected
        bucket, expected prefix); the [aws] and [gcp] sections are shared.
        """
        shared_sections = ("[aws]", "storagepath = awssb/aws_storage_path",
                           "credentialpath =", "[gcp]",
                           "storagepath = googlesb/google_storage_path")
        scenarios = (
            # storagepath present in [core]: it takes precedence.
            (("[core]", "loglevel =", "cloud = aws",
              "storagepath = globalsb/global_storage_path"),
             'globalsb', 'global_storage_path'),
            # No storagepath in [core] (cloud spelled "Aws"): [aws] is used.
            (("[core]", "loglevel =", "cloud = Aws"),
             'awssb', 'aws_storage_path'),
        )

        for core_lines, bucket, prefix in scenarios:
            stream = StringIO('\n'.join(core_lines + shared_sections))
            cloud = DataCloud(None, self._conf(stream))
            self.assertIsInstance(cloud._cloud, DataCloudAWS)
            self.assertEqual(cloud._cloud.storage_bucket, bucket)
            self.assertEqual(cloud._cloud.storage_prefix, prefix)
Ejemplo n.º 10
0
class Project(object):
    """Facade over a single DVC project directory.

    Bundles the SCM, lock, cache, state, config, logger and cloud helpers
    created in ``__init__`` and exposes the high-level operations built on
    them (add, remove, run, reproduce, checkout, gc, push/fetch/pull, status).
    """
    DVC_DIR = '.dvc'

    def __init__(self, root_dir):
        """Create the helper objects for the project located at ``root_dir``.

        Args:
            root_dir: Path to the project's root directory; it is resolved
                with ``os.path.realpath`` and made absolute.
        """
        self.root_dir = os.path.abspath(os.path.realpath(root_dir))
        self.dvc_dir = os.path.join(self.root_dir, self.DVC_DIR)

        self.scm = SCM(self.root_dir)
        self.lock = Lock(self.dvc_dir)
        self.cache = Cache(self.dvc_dir)
        self.state = State(self.root_dir, self.dvc_dir)
        self.config = Config(self.dvc_dir)
        self.logger = Logger(self.config._config[Config.SECTION_CORE].get(
            Config.SECTION_CORE_LOGLEVEL, None))
        self.cloud = DataCloud(cache=self.cache,
                               state=self.state,
                               config=self.config._config)

    @staticmethod
    def init(root_dir=os.curdir):
        """
        Initiate dvc project in directory.

        Creates the DVC directory, initialises config, cache and state in it,
        registers the cache directory, state file and lock file with the SCM
        ignore list, and adds the config file and the ignore file to the SCM.

        Args:
            root_dir: Path to project's root directory.

        Returns:
            Project instance.

        Raises:
            OSError: If the DVC directory cannot be created by ``os.mkdir``
                (for example because it already exists).
        """
        root_dir = os.path.abspath(root_dir)
        dvc_dir = os.path.join(root_dir, Project.DVC_DIR)
        os.mkdir(dvc_dir)

        config = Config.init(dvc_dir)
        cache = Cache.init(dvc_dir)
        state = State.init(root_dir, dvc_dir)
        lock = Lock(dvc_dir)

        scm = SCM(root_dir)
        scm.ignore_list([cache.cache_dir, state.state_file, lock.lock_file])

        ignore_file = os.path.join(dvc_dir, scm.ignore_file())
        scm.add([config.config_file, ignore_file])

        return Project(root_dir)

    def to_dvc_path(self, path):
        """Return ``path`` expressed relative to the project root."""
        return os.path.relpath(path, self.root_dir)

    def add(self, fname):
        """Create, save and dump a stage whose single output is ``fname``.

        The stage file name is the base name of ``fname`` plus
        ``Stage.STAGE_FILE_SUFFIX``; the stage's ``cwd`` is the directory
        that contains ``fname``.

        Returns:
            The newly created stage.
        """
        out = os.path.basename(fname)
        stage_fname = out + Stage.STAGE_FILE_SUFFIX
        cwd = os.path.dirname(os.path.abspath(fname))
        stage = Stage.loads(project=self,
                            cmd=None,
                            deps=[],
                            outs=[out],
                            fname=stage_fname,
                            cwd=cwd)

        stage.save()
        stage.dump()
        return stage

    def remove(self, target):
        """Remove every output of the stage file ``target``.

        Returns:
            The loaded stage.

        Raises:
            StageNotFoundError: If ``Stage.is_stage_file(target)`` is false.
        """
        if not Stage.is_stage_file(target):
            raise StageNotFoundError(target)

        stage = Stage.load(self, target)
        for out in stage.outs:
            out.remove()

        return stage

    def run(self,
            cmd=None,
            deps=None,
            outs=None,
            outs_no_cache=None,
            fname=Stage.STAGE_FILE,
            cwd=os.curdir,
            no_exec=False):
        """Create a stage from the given description, run it and dump it.

        Args:
            cmd: Command of the stage.
            deps: Dependencies; ``None`` (the default) means no dependencies.
            outs: Outputs; ``None`` (the default) means no outputs.
            outs_no_cache: Outputs forwarded as ``outs_no_cache``; ``None``
                (the default) means none.
            fname: Stage file name.
            cwd: Working directory handed to ``Stage.loads``.
            no_exec: When true the stage is dumped without being run.

        Returns:
            The created stage.
        """
        # The list defaults are created per call: a ``[]`` default in the
        # signature would be one object shared by every invocation.
        stage = Stage.loads(project=self,
                            fname=fname,
                            cmd=cmd,
                            cwd=cwd,
                            outs=[] if outs is None else outs,
                            outs_no_cache=([] if outs_no_cache is None
                                           else outs_no_cache),
                            deps=[] if deps is None else deps)
        if not no_exec:
            stage.run()
        stage.dump()
        return stage

    def _reproduce_stage(self, stages, node, force):
        """Reproduce one stage; return ``[stage]`` if it ran, else ``[]``.

        ``Stage.reproduce`` returning a falsy value is treated as "nothing
        was reproduced"; otherwise the returned stage is dumped.
        """
        stage = stages[node].reproduce(force=force)
        if not stage:
            return []
        stage.dump()
        return [stage]

    def reproduce(self, target, recursive=True, force=False):
        """Reproduce the stage file ``target``.

        Args:
            target: Path to a stage file.
            recursive: When true, every node reachable from the target in the
                stage graph is reproduced as well.
            force: Forwarded to ``Stage.reproduce``.

        Returns:
            List of the stages that were reproduced.

        Raises:
            StageNotFoundError: If ``target`` is not a node of the graph.
        """
        stages = nx.get_node_attributes(self.graph(), 'stage')
        node = os.path.relpath(os.path.abspath(target), self.root_dir)
        if node not in stages:
            raise StageNotFoundError(target)

        if recursive:
            return self._reproduce_stages(stages, node, force)

        return self._reproduce_stage(stages, node, force)

    def _reproduce_stages(self, stages, node, force):
        """Reproduce ``node`` and everything reachable from it.

        Nodes are visited in DFS post-order; graph edges point from a stage
        to its dependencies, so dependencies come before their dependents.

        Raises:
            ReproductionError: Wraps any exception raised while reproducing
                a stage, together with that stage's ``relpath``.
        """
        result = []
        for n in nx.dfs_postorder_nodes(self.graph(), node):
            try:
                result += self._reproduce_stage(stages, n, force)
            except Exception as ex:
                raise ReproductionError(stages[n].relpath, ex)
        return result

    def _remove_untracked_hardlinks(self):
        """Delete untracked files that share an inode with a cache file.

        After a file is deleted, its parent directory is removed too if it is
        empty (``os.removedirs`` also prunes parents that become empty).
        """
        untracked = self.scm.untracked_files()
        # Only membership is needed, so a set of inodes is enough.
        cached_inodes = {System.inode(c) for c in self.cache.all()}
        for path in untracked:
            if System.inode(path) not in cached_inodes:
                continue

            Logger.info(u'Remove \'{}\''.format(path))
            os.remove(path)

            parent = os.path.dirname(path)
            # An empty dirname means the file had no directory component.
            if len(parent) != 0 and not os.listdir(parent):
                Logger.info(u'Remove empty directory \'{}\''.format(parent))
                os.removedirs(parent)

    def checkout(self):
        """Drop untracked hardlinks into the cache, then check out all stages."""
        self._remove_untracked_hardlinks()
        for stage in self.stages():
            stage.checkout()

    def _used_cache(self, target=None):
        """Collect the cache entries referenced by stage outputs.

        Args:
            target: Optional stage file; when given, only that stage is
                inspected, otherwise every stage of the project.

        Returns:
            List of unique cache entries. For an entry that
            ``out.is_dir_cache`` accepts and that exists as a file, the
            values of ``out.dir_cache()`` are included as well.
        """
        cache_set = set()

        if target:
            stages = [Stage.load(self, target)]
        else:
            stages = self.stages()

        for stage in stages:
            for out in stage.outs:
                if not out.use_cache or not out.cache:
                    continue
                cache_set.add(out.cache)
                if out.is_dir_cache(out.cache) and os.path.isfile(out.cache):
                    cache_set.update(out.dir_cache().values())

        return list(cache_set)

    def gc(self):
        """Unlink every cache file that no stage output references."""
        # A set keeps the membership test constant-time per cache file.
        used = set(self._used_cache())
        for cache in self.cache.all():
            if cache in used:
                continue
            os.unlink(cache)
            self.logger.info(u'\'{}\' was removed'.format(
                self.to_dvc_path(cache)))

    def push(self, target=None, jobs=1, remote=None):
        """Push the used cache entries through ``self.cloud.push``."""
        return self.cloud.push(self._used_cache(target), jobs, remote=remote)

    def fetch(self, target=None, jobs=1, remote=None):
        """Pull the used cache entries through ``self.cloud.pull``."""
        return self.cloud.pull(self._used_cache(target), jobs, remote=remote)

    def pull(self, target=None, jobs=1, remote=None):
        """Fetch, then check out the workspace; return the fetch result."""
        ret = self.fetch(target, jobs, remote=remote)
        self.checkout()
        return ret

    def _local_status(self, target=None):
        """Merge ``stage.status()`` of the selected stages into one dict.

        Args:
            target: Optional stage file; when given, only that stage is
                reported, otherwise every stage of the project.
        """
        status = {}

        if target:
            stages = [Stage.load(self, target)]
        else:
            stages = self.stages()

        # Iterate the selection made above. The previous code looped over
        # ``self.stages()`` here, which silently ignored ``target``.
        for stage in stages:
            status.update(stage.status())

        return status

    def _cloud_status(self, target=None, jobs=1, remote=None):
        """Report cache entries whose cloud status is deleted/modified/new.

        Returns:
            Dict mapping each entry's path, relative to the cache directory,
            to ``'deleted'``, ``'modified'`` or ``'new'``. Entries reported
            as unknown or OK are omitted.
        """
        # NOTE(review): ``cloud`` is a module-level name that is not visible
        # in this block; it provides the STATUS_* constants.
        prefix_map = {
            cloud.STATUS_DELETED: 'deleted',
            cloud.STATUS_MODIFIED: 'modified',
            cloud.STATUS_NEW: 'new',
        }

        status = {}
        # Use a distinct loop name so the ``target`` parameter is not shadowed.
        for cache_path, ret in self.cloud.status(self._used_cache(target),
                                                 jobs,
                                                 remote=remote):
            if ret == cloud.STATUS_UNKNOWN or ret == cloud.STATUS_OK:
                continue

            path = os.path.relpath(cache_path, self.cache.cache_dir)

            status[path] = prefix_map[ret]

        return status

    def status(self, target=None, jobs=1, cloud=False, remote=None):
        """Return the cloud status when ``cloud`` is true, else the local one."""
        if cloud:
            return self._cloud_status(target, jobs, remote=remote)
        return self._local_status(target)

    def graph(self):
        """Build the stage graph as a ``networkx.DiGraph``.

        Each node is a stage path relative to the project root and carries
        the stage in its ``stage`` attribute. For every dependency whose
        ``dep.stage()`` is truthy, an edge runs from the depending stage to
        that stage.
        """
        G = nx.DiGraph()

        for stage in self.stages():
            node = os.path.relpath(stage.path, self.root_dir)
            G.add_node(node, stage=stage)
            for dep in stage.deps:
                dep_stage = dep.stage()
                if not dep_stage:
                    continue
                dep_node = os.path.relpath(dep_stage.path, self.root_dir)
                G.add_node(dep_node, stage=dep_stage)
                G.add_edge(node, dep_node)

        return G

    def stages(self):
        """Load every stage file found by walking the project root."""
        stages = []
        for root, _dirs, files in os.walk(self.root_dir):
            for fname in files:
                path = os.path.join(root, fname)
                if not Stage.is_stage_file(path):
                    continue
                stages.append(Stage.load(self, path))
        return stages

    def outs(self):
        """Return the outputs of all stages as one flat list."""
        outs = []
        for stage in self.stages():
            outs += stage.outs
        return outs
Ejemplo n.º 11
0
def supported_url(url):
    """Return True if ``DataCloud.supported(url)`` is anything but ``None``."""
    # Function-level import kept from the original — presumably it avoids a
    # circular import at module load time; verify before hoisting it.
    from dvc.cloud.data_cloud import DataCloud
    # Identity comparison: ``!= None`` can be hijacked by a custom __ne__.
    return DataCloud.supported(url) is not None
Ejemplo n.º 12
0
 def _test_cloud(self, config, cl):
     """Build a DataCloud from ``config``; its ``_cloud`` must be a ``cl``."""
     project = self.dvc
     data_cloud = DataCloud(cache=project.cache,
                            state=project.state,
                            config=config)
     backend = data_cloud._cloud
     self.assertIsInstance(backend, cl)
Ejemplo n.º 13
0
class Project(object):
    """Facade over a single DVC project directory.

    Bundles the config, SCM, lock, link state, logger, cache, cloud and
    updater helpers created in ``__init__`` and exposes the high-level
    operations built on them (add, remove, run, reproduce, checkout, gc,
    push/fetch/pull, status, metrics).
    """
    DVC_DIR = '.dvc'

    def __init__(self, root_dir):
        """Create the helper objects for the project located at ``root_dir``.

        After the helpers are built, DVC's internal files are registered
        with the SCM ignore list and the updater check is run.

        Args:
            root_dir: Path to the project's root directory; it is resolved
                with ``os.path.realpath`` and made absolute.
        """
        self.root_dir = os.path.abspath(os.path.realpath(root_dir))
        self.dvc_dir = os.path.join(self.root_dir, self.DVC_DIR)

        self.config = Config(self.dvc_dir)
        self.scm = SCM(self.root_dir)
        self.lock = Lock(self.dvc_dir)
        self.link_state = LinkState(self.root_dir, self.dvc_dir)
        self.logger = Logger(self.config._config[Config.SECTION_CORE].get(
            Config.SECTION_CORE_LOGLEVEL, None))
        self.cache = Cache(self)
        self.cloud = DataCloud(cache=self.cache, config=self.config._config)
        self.updater = Updater(self.dvc_dir)

        self._ignore()

        self.updater.check()

    @staticmethod
    def init(root_dir=os.curdir, no_scm=False):
        """
        Initiate dvc project in directory.

        Args:
            root_dir: Path to project's root directory.
            no_scm: When true, initialisation proceeds even if ``SCM``
                returned a plain ``Base`` object for the directory.

        Returns:
            Project instance.

        Raises:
            InitError: If ``SCM(root_dir)`` is exactly a ``Base`` instance
                and ``no_scm`` is false.
            OSError: If the DVC directory cannot be created by ``os.mkdir``
                (for example because it already exists).
        """
        root_dir = os.path.abspath(root_dir)
        dvc_dir = os.path.join(root_dir, Project.DVC_DIR)

        scm = SCM(root_dir)
        # Exact type check on purpose: ``isinstance`` would presumably also
        # match concrete SCM classes derived from Base — verify before
        # changing it.
        if type(scm) is Base and not no_scm:
            msg = "{} is not tracked by any supported scm tool(e.g. git).".format(
                root_dir)
            raise InitError(msg)

        os.mkdir(dvc_dir)

        config = Config.init(dvc_dir)
        proj = Project(root_dir)

        scm.add([config.config_file])
        if scm.ignore_file():
            scm.add([os.path.join(dvc_dir, scm.ignore_file())])

        return proj

    def _ignore(self):
        """Register DVC's internal files with the SCM ignore list.

        The local cache directory is included only when its path starts with
        the project root.
        """
        ignored = [
            self.link_state.state_file, self.link_state._lock_file.lock_file,
            self.lock.lock_file, self.config.config_local_file,
            self.updater.updater_file
        ]

        if self.cache.local.cache_dir.startswith(self.root_dir):
            ignored += [self.cache.local.cache_dir]

        self.scm.ignore_list(ignored)

    def install(self):
        """Delegate to ``self.scm.install()``."""
        self.scm.install()

    def to_dvc_path(self, path):
        """Return ``path`` expressed relative to the project root."""
        return os.path.relpath(path, self.root_dir)

    def add(self, fname):
        """Create, save and dump a stage whose single output is ``fname``.

        ``fname`` is normalised first so that a trailing separator does not
        yield an empty base name. The stage file name is that base name plus
        ``Stage.STAGE_FILE_SUFFIX``; the stage's ``cwd`` is the directory of
        the absolute ``fname``.

        Returns:
            The newly created stage.
        """
        out = os.path.basename(os.path.normpath(fname))
        stage_fname = out + Stage.STAGE_FILE_SUFFIX
        cwd = os.path.dirname(os.path.abspath(fname))
        stage = Stage.loads(project=self,
                            cmd=None,
                            deps=[],
                            outs=[out],
                            fname=stage_fname,
                            cwd=cwd)

        stage.save()
        stage.dump()
        return stage

    def remove(self, target):
        """Remove every output of the stage file ``target``.

        Returns:
            The loaded stage.

        Raises:
            StageNotFoundError: If ``Stage.is_stage_file(target)`` is false.
        """
        if not Stage.is_stage_file(target):
            raise StageNotFoundError(target)

        stage = Stage.load(self, target)
        for out in stage.outs:
            out.remove()

        return stage

    def run(self,
            cmd=None,
            deps=None,
            outs=None,
            outs_no_cache=None,
            fname=Stage.STAGE_FILE,
            cwd=os.curdir,
            no_exec=False):
        """Create a stage from the given description, run it and dump it.

        Args:
            cmd: Command of the stage.
            deps: Dependencies; ``None`` (the default) means no dependencies.
            outs: Outputs; ``None`` (the default) means no outputs.
            outs_no_cache: Outputs forwarded as ``outs_no_cache``; ``None``
                (the default) means none.
            fname: Stage file name.
            cwd: Working directory handed to ``Stage.loads``.
            no_exec: When true the stage is dumped without being run.

        Returns:
            The created stage.
        """
        # The list defaults are created per call: a ``[]`` default in the
        # signature would be one object shared by every invocation.
        stage = Stage.loads(project=self,
                            fname=fname,
                            cmd=cmd,
                            cwd=cwd,
                            outs=[] if outs is None else outs,
                            outs_no_cache=([] if outs_no_cache is None
                                           else outs_no_cache),
                            deps=[] if deps is None else deps)
        if not no_exec:
            stage.run()
        stage.dump()
        return stage

    def _reproduce_stage(self, stages, node, force):
        """Reproduce one stage; return ``[stage]`` if it ran, else ``[]``.

        ``Stage.reproduce`` returning a falsy value is treated as "nothing
        was reproduced"; otherwise the returned stage is dumped.
        """
        stage = stages[node].reproduce(force=force)
        if not stage:
            return []
        stage.dump()
        return [stage]

    def reproduce(self, target, recursive=True, force=False):
        """Reproduce the stage file ``target``.

        Args:
            target: Path to a stage file.
            recursive: When true, every node reachable from the target in the
                stage graph is reproduced as well.
            force: Forwarded to ``Stage.reproduce``.

        Returns:
            List of the stages that were reproduced.

        Raises:
            StageNotFoundError: If ``target`` is not a node of the graph.
        """
        stages = nx.get_node_attributes(self.graph(), 'stage')
        node = os.path.relpath(os.path.abspath(target), self.root_dir)
        if node not in stages:
            raise StageNotFoundError(target)

        if recursive:
            return self._reproduce_stages(stages, node, force)

        return self._reproduce_stage(stages, node, force)

    def _reproduce_stages(self, stages, node, force):
        """Reproduce ``node`` and everything reachable from it.

        Nodes are visited in DFS post-order; graph edges point from a stage
        to its dependencies, so dependencies come before their dependents.

        Raises:
            ReproductionError: Wraps any exception raised while reproducing
                a stage, together with that stage's ``relpath``.
        """
        result = []
        for n in nx.dfs_postorder_nodes(self.graph(), node):
            try:
                result += self._reproduce_stage(stages, n, force)
            except Exception as ex:
                raise ReproductionError(stages[n].relpath, ex)
        return result

    def checkout(self, target=None):
        """Check out one stage file, or the whole project.

        Args:
            target: Optional stage file. Without it, the link state is
                cleared via ``link_state.remove_all()`` and every stage is
                checked out.

        Raises:
            StageNotFoundError: If ``target`` is given but is not a stage
                file.
        """
        if target:
            if not Stage.is_stage_file(target):
                raise StageNotFoundError(target)
            stages = [Stage.load(self, target)]
        else:
            self.link_state.remove_all()
            stages = self.stages()

        for stage in stages:
            stage.checkout()

    def _used_cache(self, target=None):
        """Collect the local cache entries referenced by stage outputs.

        Args:
            target: Optional stage file; when given, only that stage is
                inspected, otherwise every stage of the project.

        Returns:
            List of unique cache entries. Outputs whose scheme is not
            ``'local'``, that do not use the cache, or that have no cache
            entry are skipped. For an entry that the local cache reports as
            a directory cache and that exists as a file, the values of its
            ``dir_cache`` are included as well.
        """
        cache_set = set()

        if target:
            stages = [Stage.load(self, target)]
        else:
            stages = self.stages()

        for stage in stages:
            for out in stage.outs:
                if out.path_info['scheme'] != 'local':
                    continue

                if not out.use_cache or not out.cache:
                    continue

                cache_set.add(out.cache)
                if self.cache.local.is_dir_cache(out.cache) and os.path.isfile(
                        out.cache):
                    dir_cache = self.cache.local.dir_cache(out.cache)
                    cache_set.update(dir_cache.values())

        return list(cache_set)

    def gc(self):
        """Unlink every local cache file that no stage output references."""
        # A set keeps the membership test constant-time per cache file.
        used = set(self._used_cache())
        for cache in self.cache.local.all():
            if cache in used:
                continue
            os.unlink(cache)
            self.logger.info(u'\'{}\' was removed'.format(
                self.to_dvc_path(cache)))

    def push(self, target=None, jobs=1, remote=None):
        """Push the used cache entries through ``self.cloud.push``."""
        return self.cloud.push(self._used_cache(target), jobs, remote=remote)

    def fetch(self, target=None, jobs=1, remote=None):
        """Pull the used cache entries through ``self.cloud.pull``."""
        return self.cloud.pull(self._used_cache(target), jobs, remote=remote)

    def pull(self, target=None, jobs=1, remote=None):
        """Fetch, then check out the whole project; return the fetch result."""
        ret = self.fetch(target, jobs, remote=remote)
        self.checkout()
        return ret

    def _local_status(self, target=None):
        """Merge ``stage.status()`` of the selected stages into one dict.

        Args:
            target: Optional stage file; when given, only that stage is
                reported, otherwise every stage of the project.
        """
        status = {}

        if target:
            stages = [Stage.load(self, target)]
        else:
            stages = self.stages()

        for stage in stages:
            status.update(stage.status())

        return status

    def _cloud_status(self, target=None, jobs=1, remote=None):
        """Report cache entries whose cloud status is deleted/modified/new.

        Returns:
            Dict mapping each entry's path, relative to the local cache
            directory, to ``'deleted'``, ``'modified'`` or ``'new'``.
            Entries reported as unknown or OK are omitted.
        """
        # NOTE(review): ``cloud`` is a module-level name that is not visible
        # in this block; it provides the STATUS_* constants.
        prefix_map = {
            cloud.STATUS_DELETED: 'deleted',
            cloud.STATUS_MODIFIED: 'modified',
            cloud.STATUS_NEW: 'new',
        }

        status = {}
        # Use a distinct loop name so the ``target`` parameter is not shadowed.
        for cache_path, ret in self.cloud.status(self._used_cache(target),
                                                 jobs,
                                                 remote=remote):
            if ret == cloud.STATUS_UNKNOWN or ret == cloud.STATUS_OK:
                continue

            path = os.path.relpath(cache_path, self.cache.local.cache_dir)

            status[path] = prefix_map[ret]

        return status

    def status(self, target=None, jobs=1, cloud=False, remote=None):
        """Return the cloud status when ``cloud`` is true, else the local one."""
        if cloud:
            return self._cloud_status(target, jobs, remote=remote)
        return self._local_status(target)

    def _read_metric_json(self, fd, json_path):
        """Return the values matched by ``json_path`` in the JSON read from ``fd``."""
        parser = parse(json_path)
        return [x.value for x in parser.find(json.load(fd))]

    def _do_read_metric_tsv(self, reader, row, col):
        """Select a cell, a column or a row from the parsed table ``reader``.

        Returns:
            ``[cell]`` when both ``row`` and ``col`` are given, the list of
            column values when only ``col`` is given, the row itself when
            only ``row`` is given, and ``None`` when neither is given.
        """
        # Identity checks: index 0 is a valid row/column and must not be
        # confused with "not given".
        if col is not None and row is not None:
            return [reader[row][col]]
        elif col is not None:
            return [r[col] for r in reader]
        elif row is not None:
            return reader[row]
        return None

    def _read_metric_htsv(self, fd, htsv_path):
        """Read one cell from a tab-separated file that has a header line.

        Args:
            fd: Open file object.
            htsv_path: ``"<column>,<row>"`` where the column is a header
                name and the row is an integer index into the data rows.
        """
        col, row = htsv_path.split(',')
        row = int(row)
        reader = list(csv.DictReader(fd, delimiter='\t'))
        return self._do_read_metric_tsv(reader, row, col)

    def _read_metric_tsv(self, fd, tsv_path):
        """Read one cell from a tab-separated file without a header line.

        Args:
            fd: Open file object.
            tsv_path: ``"<column>,<row>"``, both integer indexes.
        """
        col, row = tsv_path.split(',')
        row = int(row)
        col = int(col)
        reader = list(csv.reader(fd, delimiter='\t'))
        return self._do_read_metric_tsv(reader, row, col)

    def _read_metric(self,
                     path,
                     json_path=None,
                     tsv_path=None,
                     htsv_path=None):
        """Read a metric from the file at ``path``.

        The first truthy selector among ``json_path``, ``tsv_path`` and
        ``htsv_path`` decides how the file is parsed; with none of them the
        whole file content is returned.

        Returns:
            The metric value, or ``None`` if reading or parsing failed (the
            failure is logged, not raised).
        """
        ret = None
        try:
            with open(path, 'r') as fd:
                if json_path:
                    ret = self._read_metric_json(fd, json_path)
                elif tsv_path:
                    ret = self._read_metric_tsv(fd, tsv_path)
                elif htsv_path:
                    ret = self._read_metric_htsv(fd, htsv_path)
                else:
                    ret = fd.read()
        except Exception as exc:
            # Deliberately best-effort: a metric that cannot be read on one
            # branch must not abort the caller.
            self.logger.error('Unable to read metric in \'{}\''.format(path),
                              exc)

        return ret

    def metrics(self, path, json_path=None, tsv_path=None, htsv_path=None):
        """Read the metric at ``path`` on every SCM branch.

        Each branch is checked out in turn, the project is checked out, and
        the metric is read.

        Returns:
            Dict mapping branch name to the value from ``_read_metric``.
        """
        res = {}
        saved = self.scm.active_branch()
        try:
            for branch in self.scm.list_branches():
                self.scm.checkout(branch)
                self.checkout()
                res[branch] = self._read_metric(path,
                                                json_path=json_path,
                                                tsv_path=tsv_path,
                                                htsv_path=htsv_path)
        finally:
            # Always return to the branch that was active on entry, even if
            # a checkout failed halfway through the loop.
            self.scm.checkout(saved)
            self.checkout()
        return res

    def graph(self):
        """Build the stage graph as a ``networkx.DiGraph``.

        Each node is a stage path relative to the project root and carries
        the stage in its ``stage`` attribute. A map from output path to
        stage is built first; for every dependency whose path appears in
        that map, an edge runs from the depending stage to the mapped stage.
        """
        G = nx.DiGraph()
        stages = self.stages()

        outs_map = {}
        for stage in stages:
            for o in stage.outs:
                outs_map[o.path] = stage

        for stage in stages:
            node = os.path.relpath(stage.path, self.root_dir)
            G.add_node(node, stage=stage)
            for dep in stage.deps:
                dep_stage = outs_map.get(dep.path, None)
                if not dep_stage:
                    continue
                dep_node = os.path.relpath(dep_stage.path, self.root_dir)
                G.add_node(dep_node, stage=dep_stage)
                G.add_edge(node, dep_node)

        return G

    def stages(self):
        """Load every stage file found by walking the project root."""
        stages = []
        for root, _dirs, files in os.walk(self.root_dir):
            for fname in files:
                path = os.path.join(root, fname)
                if not Stage.is_stage_file(path):
                    continue
                stages.append(Stage.load(self, path))
        return stages

    def outs(self):
        """Return the outputs of all stages as one flat list."""
        outs = []
        for stage in self.stages():
            outs += stage.outs
        return outs
Ejemplo n.º 14
0
class Project(object):
    """Facade over a single DVC project directory.

    Bundles the SCM, lock, cache, state, config, logger and cloud helpers
    created in ``__init__`` and exposes the high-level operations built on
    them (add, remove, run, reproduce, checkout, gc, push/pull, status).
    """
    DVC_DIR = '.dvc'

    def __init__(self, root_dir):
        """Create the helper objects for the project located at ``root_dir``.

        Args:
            root_dir: Path to the project's root directory; it is resolved
                with ``os.path.realpath`` and made absolute.
        """
        self.root_dir = os.path.abspath(os.path.realpath(root_dir))
        self.dvc_dir = os.path.join(self.root_dir, self.DVC_DIR)

        self.scm = SCM(self.root_dir)
        self.lock = Lock(self.dvc_dir)
        self.cache = Cache(self.dvc_dir)
        self.state = State(self.root_dir, self.dvc_dir)
        self.config = Config(self.dvc_dir)
        self.logger = Logger(self.config._config)
        self.cloud = DataCloud(self.cache.cache_dir, self.config._config)

    @staticmethod
    def init(root_dir=os.curdir):
        """
        Initiate dvc project in directory.

        Creates the DVC directory, initialises config, cache and state in it,
        registers the cache directory, state file and lock file with the SCM
        ignore list, and adds the config file and the ignore file to the SCM.

        Args:
            root_dir: Path to project's root directory.

        Returns:
            Project instance.

        Raises:
            OSError: If the DVC directory cannot be created by ``os.mkdir``
                (for example because it already exists).
        """
        root_dir = os.path.abspath(root_dir)
        dvc_dir = os.path.join(root_dir, Project.DVC_DIR)
        os.mkdir(dvc_dir)

        config = Config.init(dvc_dir)
        cache = Cache.init(dvc_dir)
        state = State.init(root_dir, dvc_dir)
        lock = Lock(dvc_dir)

        scm = SCM(root_dir)
        scm.ignore_list([cache.cache_dir, state.state_file, lock.lock_file])

        ignore_file = os.path.join(dvc_dir, scm.ignore_file())
        scm.add([config.config_file, ignore_file])

        return Project(root_dir)

    def to_dvc_path(self, path):
        """Return ``path`` expressed relative to the project root."""
        return os.path.relpath(path, self.root_dir)

    def add(self, fname):
        """Create, save and dump a stage whose single output is ``fname``.

        The stage file name is the base name of ``fname`` plus
        ``Stage.STAGE_FILE_SUFFIX``; the stage's ``cwd`` is the directory
        that contains ``fname``.

        Returns:
            The newly created stage.
        """
        out = os.path.basename(fname)
        stage_fname = out + Stage.STAGE_FILE_SUFFIX
        cwd = os.path.dirname(os.path.abspath(fname))
        stage = Stage.loads(project=self,
                            cmd=None,
                            deps=[],
                            outs=[out],
                            fname=stage_fname,
                            cwd=cwd)

        stage.save()
        stage.dump()
        return stage

    def remove(self, fname):
        """Remove every stage that has an output with the same path as ``fname``.

        Returns:
            List of the removed stages.

        Raises:
            StageNotFoundError: If no output of any stage matches ``fname``.
        """
        output = Output.loads(self, fname)
        stages = [out.stage() for out in self.outs()
                  if out.path == output.path]

        if not stages:
            raise StageNotFoundError(fname)

        for stage in stages:
            stage.remove()

        return stages

    def run(self,
            cmd=None,
            deps=None,
            outs=None,
            outs_no_cache=None,
            fname=Stage.STAGE_FILE,
            cwd=os.curdir,
            no_exec=False):
        """Create a stage from the given description, run it and dump it.

        Args:
            cmd: Command of the stage.
            deps: Dependencies; ``None`` (the default) means no dependencies.
            outs: Outputs; ``None`` (the default) means no outputs.
            outs_no_cache: Outputs forwarded as ``outs_no_cache``; ``None``
                (the default) means none.
            fname: Stage file name.
            cwd: Working directory handed to ``Stage.loads``.
            no_exec: When true the stage is dumped without being run.

        Returns:
            The created stage.
        """
        # The list defaults are created per call: a ``[]`` default in the
        # signature would be one object shared by every invocation.
        stage = Stage.loads(project=self,
                            fname=fname,
                            cmd=cmd,
                            cwd=cwd,
                            outs=[] if outs is None else outs,
                            outs_no_cache=([] if outs_no_cache is None
                                           else outs_no_cache),
                            deps=[] if deps is None else deps)
        if not no_exec:
            stage.run()
        stage.dump()
        return stage

    def _reproduce_stage(self, stages, node, force):
        """Reproduce one stage if it changed; return ``[stage]`` or ``[]``."""
        if not stages[node].changed():
            return []

        stages[node].reproduce(force=force)
        stages[node].dump()
        return [stages[node]]

    def reproduce(self, target, recursive=True, force=False):
        """Reproduce the stage file ``target``.

        Args:
            target: Path to a stage file.
            recursive: When true, every node reachable from the target in the
                stage graph is reproduced as well.
            force: Forwarded to ``Stage.reproduce``.

        Returns:
            List of the stages that were reproduced.

        Raises:
            StageNotFoundError: If ``target`` is not a node of the graph.
        """
        stages = nx.get_node_attributes(self.graph(), 'stage')
        node = os.path.relpath(os.path.abspath(target), self.root_dir)
        if node not in stages:
            raise StageNotFoundError(target)

        if recursive:
            return self._reproduce_stages(stages, node, force)

        return self._reproduce_stage(stages, node, force)

    def _reproduce_stages(self, stages, node, force):
        """Reproduce ``node`` and everything reachable from it.

        Nodes are visited in DFS post-order; graph edges point from a stage
        to its dependencies, so dependencies come before their dependents.

        Raises:
            ReproductionError: Wraps any exception raised while reproducing
                a stage, together with that stage's ``relpath``.
        """
        result = []
        for n in nx.dfs_postorder_nodes(self.graph(), node):
            try:
                result += self._reproduce_stage(stages, n, force)
            except Exception as ex:
                raise ReproductionError(stages[n].relpath, ex)
        return result

    def checkout(self):
        """Check out every stage of the project."""
        for stage in self.stages():
            stage.checkout()

    def _used_cache(self):
        """Return the unique cache entries of all cached outputs.

        First-seen order is preserved. Outputs with a falsy ``use_cache``
        are skipped.
        """
        clist = []
        # Companion set for O(1) duplicate detection. Assumes cache entries
        # are hashable; gc() treats the values they are compared with as
        # filesystem paths — TODO confirm.
        seen = set()
        for stage in self.stages():
            for out in stage.outs:
                if not out.use_cache:
                    continue
                cache = out.cache
                if cache not in seen:
                    seen.add(cache)
                    clist.append(cache)
        return clist

    def _remove_cache_file(self, cache):
        """Make the file ``cache`` writable, then unlink it."""
        os.chmod(cache, stat.S_IWRITE)
        os.unlink(cache)

    def _remove_cache(self, cache):
        """Delete a cache entry, which may be a file or a directory tree.

        A directory is walked bottom-up so that children are removed before
        the directories that contain them.
        """
        if os.path.isfile(cache):
            self._remove_cache_file(cache)
            return

        for root, dirs, files in os.walk(cache, topdown=False):
            for dname in dirs:
                path = os.path.join(root, dname)
                os.rmdir(path)
            for fname in files:
                path = os.path.join(root, fname)
                self._remove_cache_file(path)
        os.rmdir(cache)

    def gc(self):
        """Remove every cache entry that no stage output references."""
        # A set keeps the membership test constant-time per cache entry.
        used = set(self._used_cache())
        for cache in self.cache.all():
            if cache in used:
                continue
            self._remove_cache(cache)
            self.logger.info(u'\'{}\' was removed'.format(
                self.to_dvc_path(cache)))

    def push(self, jobs=1):
        """Push the used cache entries through ``self.cloud.push``."""
        self.cloud.push(self._used_cache(), jobs)

    def pull(self, jobs=1):
        """Pull the used cache entries, then check out every stage."""
        self.cloud.pull(self._used_cache(), jobs)
        self.checkout()

    def status(self, jobs=1):
        """Return ``self.cloud.status`` for the used cache entries."""
        return self.cloud.status(self._used_cache(), jobs)

    def graph(self):
        """Build the stage graph as a ``networkx.DiGraph``.

        Each node is a stage path relative to the project root and carries
        the stage in its ``stage`` attribute. For every dependency whose
        ``dep.stage()`` is truthy, an edge runs from the depending stage to
        that stage.
        """
        G = nx.DiGraph()

        for stage in self.stages():
            node = os.path.relpath(stage.path, self.root_dir)
            G.add_node(node, stage=stage)
            for dep in stage.deps:
                dep_stage = dep.stage()
                if not dep_stage:
                    continue
                dep_node = os.path.relpath(dep_stage.path, self.root_dir)
                G.add_node(dep_node, stage=dep_stage)
                G.add_edge(node, dep_node)

        return G

    def stages(self):
        """Load every stage file found by walking the project root."""
        stages = []
        for root, _dirs, files in os.walk(self.root_dir):
            for fname in files:
                path = os.path.join(root, fname)
                if not Stage.is_stage_file(path):
                    continue
                stages.append(Stage.load(self, path))
        return stages

    def outs(self):
        """Return the outputs of all stages as one flat list."""
        outs = []
        for stage in self.stages():
            outs += stage.outs
        return outs