Code example #1
    def __init__(self, root_dir=None):
        from dvc.state import State
        from dvc.lock import make_lock
        from dvc.scm import SCM
        from dvc.cache import Cache
        from dvc.data_cloud import DataCloud
        from dvc.repo.metrics import Metrics
        from dvc.scm.tree import WorkingTree
        from dvc.repo.tag import Tag
        from dvc.utils import makedirs

        root_dir = self.find_root(root_dir)

        self.root_dir = os.path.abspath(os.path.realpath(root_dir))
        self.dvc_dir = os.path.join(self.root_dir, self.DVC_DIR)

        self.config = Config(self.dvc_dir)

        self.scm = SCM(self.root_dir)

        self.tree = WorkingTree(self.root_dir)

        self.tmp_dir = os.path.join(self.dvc_dir, "tmp")
        makedirs(self.tmp_dir, exist_ok=True)

        hardlink_lock = self.config.config["core"].get("hardlink_lock", False)
        self.lock = make_lock(
            os.path.join(self.dvc_dir, "lock"),
            tmp_dir=os.path.join(self.dvc_dir, "tmp"),
            hardlink_lock=hardlink_lock,
            friendly=True,
        )

        # NOTE: storing state and link_state in the repository itself to avoid
        # any possible state corruption in 'shared cache dir' scenario.
        self.state = State(self, self.config.config)

        core = self.config.config[Config.SECTION_CORE]

        level = core.get(Config.SECTION_CORE_LOGLEVEL)
        if level:
            logger.setLevel(level.upper())

        self.cache = Cache(self)
        self.cloud = DataCloud(self)

        self.metrics = Metrics(self)
        self.tag = Tag(self)

        self._ignore()
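
A minimal usage sketch for constructors like the one above (a hedged example, not part of the excerpt: it assumes the current directory, or one of its ancestors, already contains a `.dvc` directory so that `find_root` succeeds):

from dvc.repo import Repo

# Hypothetical usage: open an existing DVC repository. With
# root_dir=None, find_root() walks up from os.getcwd() until it finds
# a `.dvc` directory (see find_root in code example #24 below).
repo = Repo()         # or Repo("/path/to/repo")
print(repo.root_dir)  # absolute, symlink-resolved repository root
print(repo.dvc_dir)   # <root_dir>/.dvc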
Code example #2
File: __init__.py  Project: rogervaas/dvc
    def __init__(self, root_dir=None):
        from dvc.state import State
        from dvc.lock import make_lock
        from dvc.scm import SCM
        from dvc.cache import Cache
        from dvc.data_cloud import DataCloud
        from dvc.repo.metrics import Metrics
        from dvc.repo.params import Params
        from dvc.scm.tree import WorkingTree
        from dvc.utils.fs import makedirs
        from dvc.stage.cache import StageCache

        root_dir = self.find_root(root_dir)

        self.root_dir = os.path.abspath(os.path.realpath(root_dir))
        self.dvc_dir = os.path.join(self.root_dir, self.DVC_DIR)

        self.config = Config(self.dvc_dir)

        no_scm = self.config["core"].get("no_scm", False)
        self.scm = SCM(self.root_dir, no_scm=no_scm)

        self.tree = WorkingTree(self.root_dir)

        self.tmp_dir = os.path.join(self.dvc_dir, "tmp")
        self.index_dir = os.path.join(self.tmp_dir, "index")
        makedirs(self.index_dir, exist_ok=True)

        hardlink_lock = self.config["core"].get("hardlink_lock", False)
        self.lock = make_lock(
            os.path.join(self.tmp_dir, "lock"),
            tmp_dir=self.tmp_dir,
            hardlink_lock=hardlink_lock,
            friendly=True,
        )

        # NOTE: storing state and link_state in the repository itself to avoid
        # any possible state corruption in 'shared cache dir' scenario.
        self.state = State(self)

        self.cache = Cache(self)
        self.cloud = DataCloud(self)

        self.stage_cache = StageCache(self.cache.local.cache_dir)

        self.metrics = Metrics(self)
        self.params = Params(self)

        self._ignore()
Code example #3
    def _setup_cloud(self):
        self._ensure_should_run()

        repo = self.get_url()

        config = copy.deepcopy(TEST_CONFIG)
        config[TEST_SECTION][Config.SECTION_REMOTE_URL] = repo
        config[TEST_SECTION][
            Config.SECTION_GDRIVE_CLIENT_ID] = TEST_GDRIVE_CLIENT_ID
        config[TEST_SECTION][
            Config.SECTION_GDRIVE_CLIENT_SECRET] = TEST_GDRIVE_CLIENT_SECRET
        self.dvc.config.config = config
        self.cloud = DataCloud(self.dvc)

        self.assertIsInstance(self.cloud.get_remote(), self._get_cloud_class())
Code example #4
File: test_data_cloud.py  Project: jdeguia/dvc
    def _setup_cloud(self):
        self._ensure_should_run()

        repo = self.get_url()

        config = copy.deepcopy(TEST_CONFIG)
        config["remote"][TEST_REMOTE] = {
            "url": repo,
            "credentialpath": TEST_GCP_CREDS_FILE,
        }
        self.dvc.config = config
        self.cloud = DataCloud(self.dvc)

        self.assertIsInstance(self.cloud.get_remote().tree,
                              self._get_cloud_class())
Code example #5
class TestRemoteSSHMocked(SSHMocked, TestDataCloudBase):
    @pytest.fixture(autouse=True)
    def setup_method_fixture(self, request, ssh_server):
        self.ssh_server = ssh_server
        self.method_name = request.function.__name__

    def _setup_cloud(self):
        self._ensure_should_run()

        repo = self.get_url()
        keyfile = self._get_keyfile()

        config = copy.deepcopy(TEST_CONFIG)
        config[TEST_SECTION][Config.SECTION_REMOTE_URL] = repo
        config[TEST_SECTION][Config.SECTION_REMOTE_KEY_FILE] = keyfile
        config[TEST_SECTION][Config.SECTION_REMOTE_NO_TRAVERSE] = False
        self.dvc.config.config = config
        self.cloud = DataCloud(self.dvc)

        self.assertIsInstance(self.cloud.get_remote(), self._get_cloud_class())

    def get_url(self):
        user = self.ssh_server.test_creds["username"]
        return super().get_url(user, self.ssh_server.port)

    def _get_keyfile(self):
        return self.ssh_server.test_creds["key_filename"]

    def _get_cloud_class(self):
        return RemoteSSH
Code example #6
class TestRemoteGDrive(TestDataCloudBase):
    def _should_test(self):
        return _should_test_gdrive()

    def _setup_cloud(self):
        self._ensure_should_run()

        repo = self._get_url()

        config = copy.deepcopy(TEST_CONFIG)
        config[TEST_SECTION][Config.SECTION_REMOTE_URL] = repo
        config[TEST_SECTION][
            Config.SECTION_GDRIVE_CLIENT_ID
        ] = TEST_GDRIVE_CLIENT_ID
        config[TEST_SECTION][
            Config.SECTION_GDRIVE_CLIENT_SECRET
        ] = TEST_GDRIVE_CLIENT_SECRET
        self.dvc.config.config = config
        self.cloud = DataCloud(self.dvc)

        self.assertIsInstance(self.cloud.get_remote(), self._get_cloud_class())

    def _get_url(self):
        return get_gdrive_url()

    def _get_cloud_class(self):
        return RemoteGDrive
Code example #7
File: test_config.py  Project: tanthml/dvc
    def aws_credentials_default_tes_t_DISABLED(self):
        """ in absence of [AWS] -> CredentialPath, aws creds should be read from ~/.aws/credentials """

        default_path = os.path.expanduser('~/.aws/credentials')
        c = (
            "[Global]",
            "LogLevel =",
            "DataDir = ",
            "CacheDir = ",
            "StateDir = ",
            "Cloud = aws",
            "[AWS]",
            "StoragePath = awssb/aws_storage_path",
            "CredentialPath =",
        )
        s = StringIO('\n'.join(c))

        patcher = mock.patch(
            builtin_module_name + '.open',
            side_effect=self.mocked_open_aws_default_credentials)

        patcher.start()  # the patcher must be started before stop() below
        conf = self._conf(s)
        cloud = DataCloud(conf)
        aws_creds = cloud._cloud._get_credentials()
        patcher.stop()

        self.assertEqual(aws_creds[0], 'default_access_id')
        self.assertEqual(aws_creds[1], 'default_sekret')
Code example #8
File: test_data_cloud.py  Project: jackwellsxyz/dvc
class TestSSHRemoteMocked(SSHMocked, TestDataCloudBase):
    @pytest.fixture(autouse=True)
    def setup_method_fixture(self, request, ssh_server):
        self.ssh_server = ssh_server
        self.method_name = request.function.__name__

    def _setup_cloud(self):
        self._ensure_should_run()

        repo = self.get_url()
        keyfile = self._get_keyfile()

        self._get_cloud_class().CAN_TRAVERSE = False
        config = copy.deepcopy(TEST_CONFIG)
        config["remote"][TEST_REMOTE] = {
            "url": repo,
            "keyfile": keyfile,
        }
        self.dvc.config = config
        self.cloud = DataCloud(self.dvc)

        self.assertIsInstance(self.cloud.get_remote(), self._get_cloud_class())

    def get_url(self):
        user = self.ssh_server.test_creds["username"]
        return super().get_url(user, self.ssh_server.port)

    def _get_keyfile(self):
        return self.ssh_server.test_creds["key_filename"]

    def _get_cloud_class(self):
        return SSHRemote
Code example #9
File: test_data_cloud.py  Project: jackwellsxyz/dvc
    def _setup_cloud(self):
        self._ensure_should_run()

        repo = self.get_url()
        keyfile = self._get_keyfile()

        self._get_cloud_class().CAN_TRAVERSE = False
        config = copy.deepcopy(TEST_CONFIG)
        config["remote"][TEST_REMOTE] = {
            "url": repo,
            "keyfile": keyfile,
        }
        self.dvc.config = config
        self.cloud = DataCloud(self.dvc)

        self.assertIsInstance(self.cloud.get_remote(), self._get_cloud_class())
Code example #10
File: __init__.py  Project: gbiagomba/dvc
    def __init__(self, root_dir=None, scm=None, rev=None):
        from dvc.state import State, StateNoop
        from dvc.lock import make_lock
        from dvc.scm import SCM
        from dvc.cache import Cache
        from dvc.data_cloud import DataCloud
        from dvc.repo.metrics import Metrics
        from dvc.repo.plots import Plots
        from dvc.repo.params import Params
        from dvc.scm.tree import WorkingTree
        from dvc.utils.fs import makedirs
        from dvc.stage.cache import StageCache

        if scm:
            # use GitTree instead of WorkingTree as default repo tree instance
            tree = scm.get_tree(rev)
            self.root_dir = self.find_root(root_dir, tree)
            self.scm = scm
            self.tree = tree
            self.state = StateNoop()
        else:
            root_dir = self.find_root(root_dir)
            self.root_dir = os.path.abspath(os.path.realpath(root_dir))
            self.tree = WorkingTree(self.root_dir)

        self.dvc_dir = os.path.join(self.root_dir, self.DVC_DIR)
        self.config = Config(self.dvc_dir, tree=self.tree)

        if not scm:
            no_scm = self.config["core"].get("no_scm", False)
            self.scm = SCM(self.root_dir, no_scm=no_scm)

        self.tmp_dir = os.path.join(self.dvc_dir, "tmp")
        self.index_dir = os.path.join(self.tmp_dir, "index")
        makedirs(self.index_dir, exist_ok=True)

        hardlink_lock = self.config["core"].get("hardlink_lock", False)
        self.lock = make_lock(
            os.path.join(self.tmp_dir, "lock"),
            tmp_dir=self.tmp_dir,
            hardlink_lock=hardlink_lock,
            friendly=True,
        )

        self.cache = Cache(self)
        self.cloud = DataCloud(self)

        if not scm:
            # NOTE: storing state and link_state in the repository itself to
            # avoid any possible state corruption in 'shared cache dir'
            # scenario.
            self.state = State(self.cache.local)

        self.stage_cache = StageCache(self)

        self.metrics = Metrics(self)
        self.plots = Plots(self)
        self.params = Params(self)

        self._ignore()
Code example #11
File: test_config.py  Project: tanthml/dvc
    def aws_credentials_specified_tes_t_DISABLED(self, isfile_function):
        """ in presence of [AWS] -> CredentialPath, use those credentials """

        c = (
            "[Global]",
            "LogLevel =",
            "DataDir = ",
            "CacheDir = ",
            "StateDir = ",
            "Cloud = aws",
            "[AWS]",
            "StoragePath = awssb/aws_storage_path",
            "CredentialPath = some_credential_path",
        )
        s = StringIO('\n'.join(c))

        patcher = mock.patch(
            builtin_module_name + '.open',
            side_effect=self.mocked_open_aws_default_credentials)

        patcher.start()
        conf = self._conf(s)
        cloud = DataCloud(conf)
        aws_creds = cloud._cloud._get_credentials()
        patcher.stop()

        self.assertEqual(aws_creds[0], 'override_access_id')
        self.assertEqual(aws_creds[1], 'override_sekret')
Code example #12
File: project.py  Project: yustoris/dvc
    def __init__(self, root_dir):
        self.root_dir = os.path.abspath(os.path.realpath(root_dir))
        self.dvc_dir = os.path.join(self.root_dir, self.DVC_DIR)

        self.config = Config(self.dvc_dir)
        self.scm = SCM(self.root_dir)
        self.lock = Lock(self.dvc_dir)
        self.link_state = LinkState(self.root_dir, self.dvc_dir)
        self.logger = Logger(self.config._config[Config.SECTION_CORE].get(
            Config.SECTION_CORE_LOGLEVEL, None))
        self.cache = Cache(self)
        self.cloud = DataCloud(self, config=self.config._config)
        self.updater = Updater(self.dvc_dir)

        self._ignore()

        self.updater.check()
Code example #13
    def test(self):
        for k, v in DataCloud.CLOUD_MAP.items():
            config = {
                'Global': {
                    'Cloud': k
                },
                k: {
                    'StoragePath': 'a/b',
                    'ProjectName': 'name'
                }
            }
            cloud = DataCloud(config)
            self.assertIsInstance(cloud._cloud, v)

        with self.assertRaises(ConfigError) as cx:
            config = {'Global': {'Cloud': 'not_supported_type'}}
            DataCloud(config)
Code example #14
    def __init__(self, root_dir=None):
        from dvc.config import Config
        from dvc.state import State
        from dvc.lock import Lock
        from dvc.scm import SCM
        from dvc.cache import Cache
        from dvc.data_cloud import DataCloud
        from dvc.updater import Updater
        from dvc.repo.metrics import Metrics
        from dvc.scm.tree import WorkingTree
        from dvc.repo.tag import Tag
        from dvc.repo.pkg import Pkg

        root_dir = self.find_root(root_dir)

        self.root_dir = os.path.abspath(os.path.realpath(root_dir))
        self.dvc_dir = os.path.join(self.root_dir, self.DVC_DIR)

        self.config = Config(self.dvc_dir)

        self.tree = WorkingTree(self.root_dir)

        self.scm = SCM(self.root_dir, repo=self)
        self.lock = Lock(self.dvc_dir)
        # NOTE: storing state and link_state in the repository itself to avoid
        # any possible state corruption in 'shared cache dir' scenario.
        self.state = State(self, self.config.config)

        core = self.config.config[Config.SECTION_CORE]

        level = core.get(Config.SECTION_CORE_LOGLEVEL)
        if level:
            logger.setLevel(level.upper())

        self.cache = Cache(self)
        self.cloud = DataCloud(self, config=self.config.config)
        self.updater = Updater(self.dvc_dir)

        self.metrics = Metrics(self)
        self.tag = Tag(self)
        self.pkg = Pkg(self)

        self._ignore()

        self.updater.check()
Code example #15
File: test_data_cloud.py  Project: rjsears/dvc
    def _setup_cloud(self):
        self._ensure_should_run()

        url = self.get_url()
        self.create_dir(self.dvc, url)

        config = copy.deepcopy(TEST_CONFIG)
        config["remote"][TEST_REMOTE] = {
            "url": url,
            "gdrive_service_account_email": "test",
            "gdrive_service_account_p12_file_path": "test.p12",
            "gdrive_use_service_account": True,
        }

        self.dvc.config = config
        self.cloud = DataCloud(self.dvc)
        remote = self.cloud.get_remote()
        self.assertIsInstance(remote, self._get_cloud_class())
Code example #16
File: test_data_cloud.py  Project: vernt/dvc
    def test_unsupported(self):
        remote_url = 'notsupportedscheme://a/b'
        config = TEST_CONFIG
        config[TEST_SECTION][Config.SECTION_REMOTE_URL] = remote_url

        # NOTE: making sure that error doesn't raise too early
        cloud = DataCloud(self.dvc, config=config)

        with self.assertRaises(ConfigError):
            cloud._cloud
Code example #17
    def __init__(self, root_dir):
        self.root_dir = os.path.abspath(os.path.realpath(root_dir))
        self.dvc_dir = os.path.join(self.root_dir, self.DVC_DIR)

        self.config = Config(self.dvc_dir)
        self.scm = SCM(self.root_dir)
        self.lock = Lock(self.dvc_dir)
        # NOTE: storing state and link_state in the repository itself to avoid
        # any possible state corruption in 'shared cache dir' scenario.
        self.state = State(self)
        self.link_state = LinkState(self)
        self.logger = Logger(self.config._config[Config.SECTION_CORE].get(
            Config.SECTION_CORE_LOGLEVEL, None))
        self.cache = Cache(self)
        self.cloud = DataCloud(self, config=self.config._config)
        self.updater = Updater(self.dvc_dir)

        self._ignore()

        self.updater.check()
Code example #18
File: test_data_cloud.py  Project: vyloy/dvc
    def _setup_cloud(self):
        self._ensure_should_run()

        repo = self._get_url()

        config = copy.deepcopy(TEST_CONFIG)
        config[TEST_SECTION][Config.SECTION_REMOTE_URL] = repo
        config[TEST_SECTION][
            Config.SECTION_GCP_CREDENTIALPATH] = TestDvc.GCP_CREDS_FILE
        self.cloud = DataCloud(self.dvc, config)

        self.assertIsInstance(self.cloud._cloud, self._get_cloud_class())
Code example #19
File: test_data_cloud.py  Project: jackwellsxyz/dvc
class TestGDriveRemote(GDrive, TestDataCloudBase):
    def _setup_cloud(self):
        self._ensure_should_run()

        setup_gdrive_cloud(self.get_url(), self.dvc)

        self.cloud = DataCloud(self.dvc)
        remote = self.cloud.get_remote()
        self.assertIsInstance(remote, self._get_cloud_class())

    def _get_cloud_class(self):
        return GDriveRemote
Code example #20
    def __init__(self, root_dir=None):
        from dvc.logger import logger
        from dvc.config import Config
        from dvc.state import State
        from dvc.lock import Lock
        from dvc.scm import SCM
        from dvc.cache import Cache
        from dvc.data_cloud import DataCloud
        from dvc.updater import Updater
        from dvc.prompt import Prompt

        root_dir = self._find_root(root_dir)

        self.root_dir = os.path.abspath(os.path.realpath(root_dir))
        self.dvc_dir = os.path.join(self.root_dir, self.DVC_DIR)

        self.config = Config(self.dvc_dir)

        self.scm = SCM(self.root_dir, project=self)
        self.lock = Lock(self.dvc_dir)
        # NOTE: storing state and link_state in the repository itself to avoid
        # any possible state corruption in 'shared cache dir' scenario.
        self.state = State(self, self.config._config)

        core = self.config._config[Config.SECTION_CORE]
        self.logger = logger
        self.logger.set_level(core.get(Config.SECTION_CORE_LOGLEVEL, None))

        self.cache = Cache(self)
        self.cloud = DataCloud(self, config=self.config._config)
        self.updater = Updater(self.dvc_dir)
        self.prompt = Prompt()

        self._files_to_git_add = []

        self._ignore()

        self.updater.check()
Code example #21
class TestRemoteGS(GCP, TestDataCloudBase):
    def _setup_cloud(self):
        self._ensure_should_run()

        repo = self.get_url()

        config = copy.deepcopy(TEST_CONFIG)
        config[TEST_SECTION][Config.SECTION_REMOTE_URL] = repo
        config[TEST_SECTION][
            Config.SECTION_GCP_CREDENTIALPATH] = TEST_GCP_CREDS_FILE
        self.dvc.config.config = config
        self.cloud = DataCloud(self.dvc)

        self.assertIsInstance(self.cloud.get_remote(), self._get_cloud_class())

    def _get_cloud_class(self):
        return RemoteGS
Code example #22
File: data_sync.py  Project: vjkumran/dvc
    def run(self):
        with DvcLock(self.is_locker, self.git):
            cloud = DataCloud(self.settings)
            targets = []

            if len(self.parsed_args.targets) == 0:
                raise DataSyncError('Sync target is not specified')

            for target in self.parsed_args.targets:
                if System.islink(target):
                    targets.append(target)
                elif os.path.isdir(target):
                    for root, dirs, files in os.walk(target):
                        for f in files:
                            targets.append(os.path.join(root, f))
                else:
                    raise DataSyncError(
                        'File "{}" does not exit'.format(target))

            map_progress(cloud.sync, targets, self.parsed_args.jobs)
Code example #23
    def __init__(self, argv=None, git=None, config=None):
        self._args = None
        args = None

        if argv is not None and len(argv) != 0:
            args = parse_args(argv)
            self._args = argv[2:]

        if git is None:
            git = GitWrapper()

        if config is None:
            if args is not None and args.cmd != 'init':
                config = Config()
            else:
                config = ConfigI()

        self._git = git
        self._config = config
        self._path_factory = PathFactory(git, config)
        self._parsed_args = args
        self._cloud = DataCloud(self)
Code example #24
class Repo(object):
    DVC_DIR = ".dvc"

    from dvc.repo.destroy import destroy
    from dvc.repo.install import install
    from dvc.repo.add import add
    from dvc.repo.remove import remove
    from dvc.repo.lock import lock as lock_stage
    from dvc.repo.move import move
    from dvc.repo.run import run
    from dvc.repo.imp import imp
    from dvc.repo.imp_url import imp_url
    from dvc.repo.reproduce import reproduce
    from dvc.repo.checkout import _checkout
    from dvc.repo.push import push
    from dvc.repo.fetch import _fetch
    from dvc.repo.pull import pull
    from dvc.repo.status import status
    from dvc.repo.gc import gc
    from dvc.repo.commit import commit
    from dvc.repo.diff import diff
    from dvc.repo.brancher import brancher
    from dvc.repo.get import get
    from dvc.repo.get_url import get_url
    from dvc.repo.update import update

    def __init__(self, root_dir=None):
        from dvc.state import State
        from dvc.lock import make_lock
        from dvc.scm import SCM
        from dvc.cache import Cache
        from dvc.data_cloud import DataCloud
        from dvc.repo.metrics import Metrics
        from dvc.scm.tree import WorkingTree
        from dvc.repo.tag import Tag
        from dvc.utils import makedirs

        root_dir = self.find_root(root_dir)

        self.root_dir = os.path.abspath(os.path.realpath(root_dir))
        self.dvc_dir = os.path.join(self.root_dir, self.DVC_DIR)

        self.config = Config(self.dvc_dir)

        self.scm = SCM(self.root_dir)

        self.tree = CleanTree(WorkingTree(self.root_dir))

        self.tmp_dir = os.path.join(self.dvc_dir, "tmp")
        makedirs(self.tmp_dir, exist_ok=True)

        hardlink_lock = self.config.config["core"].get("hardlink_lock", False)
        self.lock = make_lock(
            os.path.join(self.dvc_dir, "lock"),
            tmp_dir=os.path.join(self.dvc_dir, "tmp"),
            hardlink_lock=hardlink_lock,
            friendly=True,
        )

        # NOTE: storing state and link_state in the repository itself to avoid
        # any possible state corruption in 'shared cache dir' scenario.
        self.state = State(self, self.config.config)

        core = self.config.config[Config.SECTION_CORE]

        level = core.get(Config.SECTION_CORE_LOGLEVEL)
        if level:
            logger.setLevel(level.upper())

        self.cache = Cache(self)
        self.cloud = DataCloud(self)

        self.metrics = Metrics(self)
        self.tag = Tag(self)

        self._ignore()

    @property
    def tree(self):
        return self._tree

    @tree.setter
    def tree(self, tree):
        self._tree = tree
        # Our graph cache is no longer valid, as it was based on the previous
        # tree.
        self._reset()

    def __repr__(self):
        return "Repo: '{root_dir}'".format(root_dir=self.root_dir)

    @classmethod
    def find_root(cls, root=None):
        if root is None:
            root = os.getcwd()
        else:
            root = os.path.abspath(os.path.realpath(root))

        while True:
            dvc_dir = os.path.join(root, cls.DVC_DIR)
            if os.path.isdir(dvc_dir):
                return root
            if os.path.ismount(root):
                break
            root = os.path.dirname(root)
        raise NotDvcRepoError(root)

    @classmethod
    def find_dvc_dir(cls, root=None):
        root_dir = cls.find_root(root)
        return os.path.join(root_dir, cls.DVC_DIR)

    @staticmethod
    def init(root_dir=os.curdir, no_scm=False, force=False):
        from dvc.repo.init import init

        init(root_dir=root_dir, no_scm=no_scm, force=force)
        return Repo(root_dir)

    def unprotect(self, target):
        return self.cache.local.unprotect(PathInfo(target))

    def _ignore(self):
        from dvc.updater import Updater

        updater = Updater(self.dvc_dir)

        flist = ([self.config.config_local_file, updater.updater_file] +
                 [self.lock.lockfile, updater.lock.lockfile, self.tmp_dir] +
                 self.state.files)

        if path_isin(self.cache.local.cache_dir, self.root_dir):
            flist += [self.cache.local.cache_dir]

        self.scm.ignore_list(flist)

    def check_modified_graph(self, new_stages):
        """Generate graph including the new stage to check for errors"""
        self._collect_graph(self.stages + new_stages)

    def collect(self, target, with_deps=False, recursive=False, graph=None):
        import networkx as nx
        from dvc.stage import Stage

        G = graph or self.graph

        if not target:
            return list(G)

        target = os.path.abspath(target)

        if recursive and os.path.isdir(target):
            stages = nx.dfs_postorder_nodes(G)
            return [stage for stage in stages if path_isin(stage.path, target)]

        stage = Stage.load(self, target)
        if not with_deps:
            return [stage]

        pipeline = get_pipeline(get_pipelines(G), stage)
        return list(nx.dfs_postorder_nodes(pipeline, stage))

    def collect_granular(self, target, *args, **kwargs):
        if not target:
            return [(stage, None) for stage in self.stages]

        try:
            out, = self.find_outs_by_path(target, strict=False)
            filter_info = PathInfo(os.path.abspath(target))
            return [(out.stage, filter_info)]
        except OutputNotFoundError:
            stages = self.collect(target, *args, **kwargs)
            return [(stage, None) for stage in stages]

    def used_cache(
        self,
        targets=None,
        all_branches=False,
        with_deps=False,
        all_tags=False,
        all_commits=False,
        remote=None,
        force=False,
        jobs=None,
        recursive=False,
    ):
        """Get the stages related to the given target and collect
        the `info` of its outputs.

        This is useful to know what files from the cache are _in use_
        (namely, a file described as an output on a stage).

        The scope is, by default, the working directory, but you can use
        `all_branches` or `all_tags` to expand scope.

        Returns:
            A dictionary with Schemes (representing output's location) as keys,
            and a list with the outputs' `dumpd` as values.
        """
        from funcy.py2 import icat
        from dvc.cache import NamedCache

        cache = NamedCache()

        for branch in self.brancher(
                all_branches=all_branches,
                all_tags=all_tags,
                all_commits=all_commits,
        ):
            targets = targets or [None]

            pairs = icat(
                self.collect_granular(
                    target, recursive=recursive, with_deps=with_deps)
                for target in targets)

            suffix = "({})".format(branch) if branch else ""
            for stage, filter_info in pairs:
                used_cache = stage.get_used_cache(
                    remote=remote,
                    force=force,
                    jobs=jobs,
                    filter_info=filter_info,
                )
                cache.update(used_cache, suffix=suffix)

        return cache

    def _collect_graph(self, stages=None):
        """Generate a graph by using the given stages on the given directory

        The nodes of the graph are the stage's path relative to the root.

        Edges are created when the output of one stage is used as a
        dependency in other stage.

        The direction of the edges goes from the stage to its dependency:

        For example, running the following:

            $ dvc run -o A "echo A > A"
            $ dvc run -d A -o B "echo B > B"
            $ dvc run -d B -o C "echo C > C"

        Will create the following graph:

               ancestors <--
                           |
                C.dvc -> B.dvc -> A.dvc
                |          |
                |          --> descendants
                |
                ------- pipeline ------>
                           |
                           v
              (weakly connected components)

        Args:
            stages (list): used to build a graph, if None given, collect stages
                in the repository.

        Raises:
            OutputDuplicationError: two outputs with the same path
            StagePathAsOutputError: stage inside an output directory
            OverlappingOutputPathsError: output inside output directory
            CyclicGraphError: resulting graph has cycles
        """
        import networkx as nx
        from dvc.exceptions import (
            OutputDuplicationError,
            StagePathAsOutputError,
            OverlappingOutputPathsError,
        )

        G = nx.DiGraph()
        stages = stages or self.stages
        stages = [stage for stage in stages if stage]
        outs = {}

        for stage in stages:
            for out in stage.outs:
                if out.path_info in outs:
                    dup_stages = [stage, outs[out.path_info].stage]
                    raise OutputDuplicationError(str(out), dup_stages)
                outs[out.path_info] = out

        for stage in stages:
            for out in stage.outs:
                for p in out.path_info.parents:
                    if p in outs:
                        raise OverlappingOutputPathsError(outs[p], out)

        for stage in stages:
            stage_path_info = PathInfo(stage.path)
            for p in chain([stage_path_info], stage_path_info.parents):
                if p in outs:
                    raise StagePathAsOutputError(stage, str(outs[p]))

        for stage in stages:
            G.add_node(stage)

            for dep in stage.deps:
                if dep.path_info is None:
                    continue

                for out_path_info, out in outs.items():
                    if out_path_info.overlaps(dep.path_info):
                        G.add_node(out.stage)
                        G.add_edge(stage, out.stage)

        check_acyclic(G)

        return G

    @cached_property
    def graph(self):
        return self._collect_graph()

    @cached_property
    def pipelines(self):
        return get_pipelines(self.graph)

    @staticmethod
    def _filter_out_dirs(dirs, outs, root_dir):
        def filter_dirs(dname):
            path = os.path.join(root_dir, dname)
            for out in outs:
                if path == os.path.normpath(out):
                    return False
            return True

        return list(filter(filter_dirs, dirs))

    @cached_property
    def stages(self):
        """
        Walks down the root directory looking for Dvcfiles,
        skipping the directories that are related with
        any SCM (e.g. `.git`), DVC itself (`.dvc`), or directories
        tracked by DVC (e.g. `dvc add data` would skip `data/`)

        NOTE: For large repos, this could be an expensive
              operation. Consider using some memoization.
        """
        from dvc.stage import Stage

        stages = []
        outs = []

        for root, dirs, files in self.tree.walk(self.root_dir):
            for fname in files:
                path = os.path.join(root, fname)
                if not Stage.is_valid_filename(path):
                    continue
                stage = Stage.load(self, path)
                for out in stage.outs:
                    if out.scheme == "local":
                        outs.append(out.fspath + out.sep)
                stages.append(stage)

            dirs[:] = self._filter_out_dirs(dirs, outs, root)

        return stages

    def find_outs_by_path(self, path, outs=None, recursive=False, strict=True):
        if not outs:
            outs = [out for stage in self.stages for out in stage.outs]

        abs_path = os.path.abspath(path)
        path_info = PathInfo(abs_path)
        is_dir = self.tree.isdir(abs_path)
        match = path_info.__eq__ if strict else path_info.isin_or_eq

        def func(out):
            if out.scheme == "local" and match(out.path_info):
                return True

            if is_dir and recursive and out.path_info.isin(path_info):
                return True

            return False

        matched = list(filter(func, outs))
        if not matched:
            raise OutputNotFoundError(path, self)

        return matched

    def find_out_by_relpath(self, relpath):
        path = os.path.join(self.root_dir, relpath)
        out, = self.find_outs_by_path(path)
        return out

    def is_dvc_internal(self, path):
        path_parts = os.path.normpath(path).split(os.path.sep)
        return self.DVC_DIR in path_parts

    @contextmanager
    def open(self, path, remote=None, mode="r", encoding=None):
        """Opens a specified resource as a file descriptor"""
        cause = None
        try:
            out, = self.find_outs_by_path(path)
        except OutputNotFoundError as e:
            out = None
            cause = e

        if out and out.use_cache:
            try:
                with self._open_cached(out, remote, mode, encoding) as fd:
                    yield fd
                return
            except FileNotFoundError as e:
                raise FileMissingError(relpath(path, self.root_dir), cause=e)

        if self.tree.exists(path):
            with self.tree.open(path, mode, encoding) as fd:
                yield fd
            return

        raise FileMissingError(relpath(path, self.root_dir), cause=cause)

    def _open_cached(self, out, remote=None, mode="r", encoding=None):
        if out.isdir():
            raise ValueError("Can't open a dir")

        cache_file = self.cache.local.checksum_to_path_info(out.checksum)
        cache_file = fspath_py35(cache_file)

        if os.path.exists(cache_file):
            return open(cache_file, mode=mode, encoding=encoding)

        try:
            remote_obj = self.cloud.get_remote(remote)
            remote_info = remote_obj.checksum_to_path_info(out.checksum)
            return remote_obj.open(remote_info, mode=mode, encoding=encoding)
        except RemoteActionNotImplemented:
            with self.state:
                cache_info = out.get_used_cache(remote=remote)
                self.cloud.pull(cache_info, remote=remote)

            return open(cache_file, mode=mode, encoding=encoding)

    def close(self):
        self.scm.close()

    @locked
    def checkout(self, *args, **kwargs):
        return self._checkout(*args, **kwargs)

    @locked
    def fetch(self, *args, **kwargs):
        return self._fetch(*args, **kwargs)

    def _reset(self):
        self.__dict__.pop("graph", None)
        self.__dict__.pop("stages", None)
        self.__dict__.pop("pipelines", None)
        self.__dict__.pop("dvcignore", None)
Code example #25
    def __init__(
        self,
        root_dir=None,
        scm=None,
        rev=None,
        subrepos=False,
        uninitialized=False,
    ):
        from dvc.cache import Cache
        from dvc.data_cloud import DataCloud
        from dvc.lock import LockNoop, make_lock
        from dvc.repo.experiments import Experiments
        from dvc.repo.metrics import Metrics
        from dvc.repo.params import Params
        from dvc.repo.plots import Plots
        from dvc.scm import SCM
        from dvc.stage.cache import StageCache
        from dvc.state import State, StateNoop
        from dvc.tree.local import LocalTree
        from dvc.utils.fs import makedirs

        try:
            tree = scm.get_tree(rev) if rev else None
            self.root_dir = self.find_root(root_dir, tree)
            self.dvc_dir = os.path.join(self.root_dir, self.DVC_DIR)
            self.tmp_dir = os.path.join(self.dvc_dir, "tmp")
            makedirs(self.tmp_dir, exist_ok=True)
        except NotDvcRepoError:
            if not uninitialized:
                raise
            self.root_dir = SCM(root_dir or os.curdir).root_dir
            self.dvc_dir = None
            self.tmp_dir = None

        tree_kwargs = dict(use_dvcignore=True, dvcignore_root=self.root_dir)
        if scm:
            self.tree = scm.get_tree(rev, **tree_kwargs)
        else:
            self.tree = LocalTree(self, {"url": self.root_dir}, **tree_kwargs)

        self.config = Config(self.dvc_dir, tree=self.tree)
        self._scm = scm

        # used by RepoTree to determine if it should traverse subrepos
        self.subrepos = subrepos

        self.cache = Cache(self)
        self.cloud = DataCloud(self)

        if scm or not self.dvc_dir:
            self.lock = LockNoop()
            self.state = StateNoop()
        else:
            self.lock = make_lock(
                os.path.join(self.tmp_dir, "lock"),
                tmp_dir=self.tmp_dir,
                hardlink_lock=self.config["core"].get("hardlink_lock", False),
                friendly=True,
            )

            # NOTE: storing state and link_state in the repository itself to
            # avoid any possible state corruption in 'shared cache dir'
            # scenario.
            self.state = State(self)
            self.stage_cache = StageCache(self)

            try:
                self.experiments = Experiments(self)
            except NotImplementedError:
                self.experiments = None

            self._ignore()

        self.metrics = Metrics(self)
        self.plots = Plots(self)
        self.params = Params(self)
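
A hedged note on the `uninitialized` flag handled above: in a Git repository that was never `dvc init`-ed, `find_root` raises `NotDvcRepoError`, and the `except` branch falls back to the SCM root, leaving `dvc_dir` and `tmp_dir` as `None`. A hypothetical call (assumes the working directory is a Git repo without a `.dvc` directory):

from dvc.repo import Repo

repo = Repo(uninitialized=True)  # a plain Repo() would raise NotDvcRepoError
assert repo.dvc_dir is None      # no .dvc directory was found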
Code example #26
class Repo(object):
    DVC_DIR = ".dvc"

    from dvc.repo.destroy import destroy
    from dvc.repo.install import install
    from dvc.repo.add import add
    from dvc.repo.remove import remove
    from dvc.repo.lock import lock as lock_stage
    from dvc.repo.move import move
    from dvc.repo.run import run
    from dvc.repo.imp import imp
    from dvc.repo.reproduce import reproduce
    from dvc.repo.checkout import checkout
    from dvc.repo.push import push
    from dvc.repo.fetch import fetch
    from dvc.repo.pull import pull
    from dvc.repo.status import status
    from dvc.repo.gc import gc
    from dvc.repo.commit import commit
    from dvc.repo.diff import diff
    from dvc.repo.brancher import brancher

    def __init__(self, root_dir=None):
        from dvc.state import State
        from dvc.lock import Lock
        from dvc.scm import SCM
        from dvc.cache import Cache
        from dvc.data_cloud import DataCloud
        from dvc.repo.metrics import Metrics
        from dvc.scm.tree import WorkingTree
        from dvc.repo.tag import Tag
        from dvc.repo.pkg import Pkg

        root_dir = self.find_root(root_dir)

        self.root_dir = os.path.abspath(os.path.realpath(root_dir))
        self.dvc_dir = os.path.join(self.root_dir, self.DVC_DIR)

        self.config = Config(self.dvc_dir)

        self.tree = WorkingTree(self.root_dir)

        self.scm = SCM(self.root_dir, repo=self)
        self.lock = Lock(self.dvc_dir)
        # NOTE: storing state and link_state in the repository itself to avoid
        # any possible state corruption in 'shared cache dir' scenario.
        self.state = State(self, self.config.config)

        core = self.config.config[Config.SECTION_CORE]

        level = core.get(Config.SECTION_CORE_LOGLEVEL)
        if level:
            logger.setLevel(level.upper())

        self.cache = Cache(self)
        self.cloud = DataCloud(self)

        self.metrics = Metrics(self)
        self.tag = Tag(self)
        self.pkg = Pkg(self)

        self._ignore()

    def __repr__(self):
        return "Repo: '{root_dir}'".format(root_dir=self.root_dir)

    @classmethod
    def find_root(cls, root=None):
        if root is None:
            root = os.getcwd()
        else:
            root = os.path.abspath(os.path.realpath(root))

        while True:
            dvc_dir = os.path.join(root, cls.DVC_DIR)
            if os.path.isdir(dvc_dir):
                return root
            if os.path.ismount(root):
                break
            root = os.path.dirname(root)
        raise NotDvcRepoError(root)

    @classmethod
    def find_dvc_dir(cls, root=None):
        root_dir = cls.find_root(root)
        return os.path.join(root_dir, cls.DVC_DIR)

    @staticmethod
    def init(root_dir=os.curdir, no_scm=False, force=False):
        from dvc.repo.init import init

        init(root_dir=root_dir, no_scm=no_scm, force=force)
        return Repo(root_dir)

    def unprotect(self, target):
        return self.cache.local.unprotect(PathInfo(target))

    def _ignore(self):
        from dvc.updater import Updater

        updater = Updater(self.dvc_dir)

        flist = [
            self.state.state_file,
            self.lock.lock_file,
            self.config.config_local_file,
            updater.updater_file,
            updater.lock.lock_file,
        ] + self.state.temp_files

        if self.cache.local.cache_dir.startswith(self.root_dir):
            flist += [self.cache.local.cache_dir]

        self.scm.ignore_list(flist)

    def check_dag(self, stages):
        """Generate graph including the new stage to check for errors"""
        self.graph(stages=stages)

    @staticmethod
    def _check_cyclic_graph(graph):
        import networkx as nx
        from dvc.exceptions import CyclicGraphError

        cycles = list(nx.simple_cycles(graph))

        if cycles:
            raise CyclicGraphError(cycles[0])

    def _get_pipeline(self, node):
        pipelines = [i for i in self.pipelines() if i.has_node(node)]
        assert len(pipelines) == 1
        return pipelines[0]

    def collect(self, target, with_deps=False, recursive=False):
        import networkx as nx
        from dvc.stage import Stage

        if not target or (recursive and os.path.isdir(target)):
            return self.active_stages(target)

        stage = Stage.load(self, target)
        if not with_deps:
            return [stage]

        node = os.path.relpath(stage.path, self.root_dir)
        G = self._get_pipeline(node)

        ret = []
        for n in nx.dfs_postorder_nodes(G, node):
            ret.append(G.node[n]["stage"])

        return ret

    def _collect_dir_cache(self,
                           out,
                           branch=None,
                           remote=None,
                           force=False,
                           jobs=None):
        """Get a list of `info`s retaled to the given directory.

        - Pull the directory entry from the remote cache if it was changed.

        Example:

            Given the following commands:

            $ echo "foo" > directory/foo
            $ echo "bar" > directory/bar
            $ dvc add directory

            It will return something similar to the following list:

            [
                { 'path': 'directory',     'md5': '168fd6761b9c.dir', ... },
                { 'path': 'directory/foo', 'md5': 'c157a79031e1', ... },
                { 'path': 'directory/bar', 'md5': 'd3b07384d113', ... },
            ]
        """
        info = out.dumpd()
        ret = [info]
        r = out.remote
        md5 = info[r.PARAM_CHECKSUM]

        if self.cache.local.changed_cache_file(md5):
            try:
                self.cloud.pull(ret,
                                jobs=jobs,
                                remote=remote,
                                show_checksums=False)
            except DvcException as exc:
                msg = "Failed to pull cache for '{}': {}"
                logger.debug(msg.format(out, exc))

        if self.cache.local.changed_cache_file(md5):
            msg = ("Missing cache for directory '{}'. "
                   "Cache for files inside will be lost. "
                   "Would you like to continue? Use '-f' to force. ")
            if not force and not prompt.confirm(msg):
                raise DvcException(
                    "unable to fully collect used cache"
                    " without cache for directory '{}'".format(out))
            else:
                return ret

        for i in out.dir_cache:
            i["branch"] = branch
            i[r.PARAM_PATH] = os.path.join(info[r.PARAM_PATH],
                                           i[r.PARAM_RELPATH])
            ret.append(i)

        return ret

    def _collect_used_cache(self,
                            out,
                            branch=None,
                            remote=None,
                            force=False,
                            jobs=None):
        """Get a dumpd of the given `out`, with an entry including the branch.

        The `used_cache` of an output is no more than its `info`.

        In case that the given output is a directory, it will also
        include the `info` of its files.
        """
        if not out.use_cache or not out.info:
            if not out.info:
                logger.warning("Output '{}'({}) is missing version "
                               "info. Cache for it will not be collected. "
                               "Use dvc repro to get your pipeline up to "
                               "date.".format(out, out.stage))
            return []

        info = out.dumpd()
        info["branch"] = branch
        ret = [info]

        if out.scheme != "local":
            return ret

        if not out.is_dir_checksum:
            return ret

        return self._collect_dir_cache(out,
                                       branch=branch,
                                       remote=remote,
                                       force=force,
                                       jobs=jobs)

    def used_cache(
        self,
        target=None,
        all_branches=False,
        active=True,
        with_deps=False,
        all_tags=False,
        remote=None,
        force=False,
        jobs=None,
        recursive=False,
    ):
        """Get the stages related to the given target and collect
        the `info` of its outputs.

        This is useful to know what files from the cache are _in use_
        (namely, a file described as an output on a stage).

        The scope is, by default, the working directory, but you can use
        `all_branches` or `all_tags` to expand scope.

        Returns:
            A dictionary with Schemes (representing output's location) as keys,
            and a list with the outputs' `dumpd` as values.
        """

        cache = {}
        cache["local"] = []
        cache["s3"] = []
        cache["gs"] = []
        cache["hdfs"] = []
        cache["ssh"] = []
        cache["azure"] = []

        for branch in self.brancher(all_branches=all_branches,
                                    all_tags=all_tags):
            if target:
                if recursive and os.path.isdir(target):
                    stages = self.stages(target)
                else:
                    stages = self.collect(target, with_deps=with_deps)
            elif active:
                stages = self.active_stages()
            else:
                stages = self.stages()

            for stage in stages:
                if active and not target and stage.locked:
                    logger.warning(
                        "DVC-file '{path}' is locked. Its dependencies are"
                        " not going to be pushed/pulled/fetched.".format(
                            path=stage.relpath))

                for out in stage.outs:
                    scheme = out.path_info.scheme
                    cache[scheme].extend(
                        self._collect_used_cache(
                            out,
                            branch=branch,
                            remote=remote,
                            force=force,
                            jobs=jobs,
                        ))

        return cache

    def graph(self, stages=None, from_directory=None):
        """Generate a graph by using the given stages on the given directory

        The nodes of the graph are the stage's path relative to the root.

        Edges are created when the output of one stage is used as a
        dependency in other stage.

        The direction of the edges goes from the stage to its dependency:

        For example, running the following:

            $ dvc run -o A "echo A > A"
            $ dvc run -d A -o B "echo B > B"
            $ dvc run -d B -o C "echo C > C"

        Will create the following graph:

               ancestors <--
                           |
                C.dvc -> B.dvc -> A.dvc
                |          |
                |          --> descendants
                |
                ------- pipeline ------>
                           |
                           v
              (weakly connected components)

        Args:
            stages (list): used to build a graph, if None given, use the ones
                on the `from_directory`.

            from_directory (str): directory where to look at for stages, if
                None is given, use the current working directory

        Raises:
            OutputDuplicationError: two outputs with the same path
            StagePathAsOutputError: stage inside an output directory
            OverlappingOutputPathsError: output inside output directory
            CyclicGraphError: resulting graph has cycles
        """
        import networkx as nx
        from dvc.exceptions import (
            OutputDuplicationError,
            StagePathAsOutputError,
            OverlappingOutputPathsError,
        )

        G = nx.DiGraph()
        G_active = nx.DiGraph()
        stages = stages or self.stages(from_directory, check_dag=False)
        stages = [stage for stage in stages if stage]
        outs = []

        for stage in stages:
            for out in stage.outs:
                existing = []
                for o in outs:
                    if o.path_info == out.path_info:
                        existing.append(o.stage)

                    in_o_dir = out.path_info.isin(o.path_info)
                    in_out_dir = o.path_info.isin(out.path_info)
                    if in_o_dir or in_out_dir:
                        raise OverlappingOutputPathsError(o, out)

                if existing:
                    stages = [stage.relpath, existing[0].relpath]
                    raise OutputDuplicationError(str(out), stages)

                outs.append(out)

        for stage in stages:
            stage_path_info = PathInfo(stage.path)
            for out in outs:
                if stage_path_info.isin(out.path_info):
                    raise StagePathAsOutputError(stage.wdir, stage.relpath)

        for stage in stages:
            node = os.path.relpath(stage.path, self.root_dir)

            G.add_node(node, stage=stage)
            G_active.add_node(node, stage=stage)

            for dep in stage.deps:
                for out in outs:
                    if (out.path_info != dep.path_info
                            and not dep.path_info.isin(out.path_info)
                            and not out.path_info.isin(dep.path_info)):
                        continue

                    dep_stage = out.stage
                    dep_node = os.path.relpath(dep_stage.path, self.root_dir)
                    G.add_node(dep_node, stage=dep_stage)
                    G.add_edge(node, dep_node)
                    if not stage.locked:
                        G_active.add_node(dep_node, stage=dep_stage)
                        G_active.add_edge(node, dep_node)

        self._check_cyclic_graph(G)

        return G, G_active

    def pipelines(self, from_directory=None):
        import networkx as nx

        G, G_active = self.graph(from_directory=from_directory)

        return [
            G.subgraph(c).copy() for c in nx.weakly_connected_components(G)
        ]

    def stages(self, from_directory=None, check_dag=True):
        """
        Walks down the root directory looking for Dvcfiles,
        skipping the directories that are related with
        any SCM (e.g. `.git`), DVC itself (`.dvc`), or directories
        tracked by DVC (e.g. `dvc add data` would skip `data/`)

        NOTE: For large repos, this could be an expensive
              operation. Consider using some memoization.
        """
        from dvc.stage import Stage

        if not from_directory:
            from_directory = self.root_dir
        elif not os.path.isdir(from_directory):
            raise TargetNotDirectoryError(from_directory)

        stages = []
        outs = []

        ignore_file_handler = DvcIgnoreFileHandler(self.tree)
        for root, dirs, files in self.tree.walk(
                from_directory, ignore_file_handler=ignore_file_handler):
            for fname in files:
                path = os.path.join(root, fname)
                if not Stage.is_valid_filename(path):
                    continue
                stage = Stage.load(self, path)
                for out in stage.outs:
                    if out.scheme == "local":
                        outs.append(out.fspath + out.sep)
                stages.append(stage)

            def filter_dirs(dname, root=root):
                path = os.path.join(root, dname)
                if path in (self.dvc_dir, self.scm.dir):
                    return False
                for out in outs:
                    if path == os.path.normpath(out) or path.startswith(out):
                        return False
                return True

            dirs[:] = list(filter(filter_dirs, dirs))

        if check_dag:
            self.check_dag(stages)

        return stages

    def active_stages(self, from_directory=None):
        import networkx as nx

        stages = []
        for G in self.pipelines(from_directory):
            stages.extend(list(nx.get_node_attributes(G, "stage").values()))
        return stages

    def find_outs_by_path(self, path, outs=None, recursive=False):
        if not outs:
            # there is no `from_directory=path` argument because some data
            # files might be generated to an upper level, and so it is
            # needed to look at all the files (project root_dir)
            stages = self.stages()
            outs = [out for stage in stages for out in stage.outs]

        abs_path = os.path.abspath(path)
        is_dir = self.tree.isdir(abs_path)

        def func(out):
            if out.scheme == "local" and out.fspath == abs_path:
                return True

            if is_dir and recursive and out.path_info.isin(abs_path):
                return True

            return False

        matched = list(filter(func, outs))
        if not matched:
            raise OutputNotFoundError(path)

        return matched

    def is_dvc_internal(self, path):
        path_parts = os.path.normpath(path).split(os.path.sep)
        return self.DVC_DIR in path_parts

    def open(self, path, remote=None, mode="r", encoding=None):
        """Opens a specified resource as a file descriptor"""
        out, = self.find_outs_by_path(path)
        if out.isdir():
            raise ValueError("Can't open a dir")

        with self.state:
            cache_info = self._collect_used_cache(out, remote=remote)
            self.cloud.pull(cache_info, remote=remote)

        cache_file = self.cache.local.checksum_to_path_info(out.checksum)
        return _open(cache_file.fspath, mode=mode, encoding=encoding)
Code example #27
File: test_data_cloud.py  Project: jackwellsxyz/dvc
class TestDataCloudBase(TestDvc):
    def _get_cloud_class(self):
        return None

    @staticmethod
    def should_test():
        return False

    @staticmethod
    def get_url():
        raise NotImplementedError

    def _get_keyfile(self):
        return None

    def _ensure_should_run(self):
        if not self.should_test():
            raise SkipTest(
                "Test {} is disabled".format(self.__class__.__name__)
            )

    def _setup_cloud(self):
        self._ensure_should_run()

        repo = self.get_url()
        keyfile = self._get_keyfile()

        config = copy.deepcopy(TEST_CONFIG)
        config["remote"][TEST_REMOTE] = {"url": repo, "keyfile": keyfile}
        self.dvc.config = config
        self.cloud = DataCloud(self.dvc)

        self.assertIsInstance(self.cloud.get_remote(), self._get_cloud_class())

    def _test_cloud(self):
        self._setup_cloud()

        stages = self.dvc.add(self.FOO)
        self.assertEqual(len(stages), 1)
        stage = stages[0]
        self.assertIsNotNone(stage)
        out = stage.outs[0]
        cache = out.cache_path
        md5 = out.checksum
        info = out.get_used_cache()

        stages = self.dvc.add(self.DATA_DIR)
        self.assertEqual(len(stages), 1)
        stage_dir = stages[0]
        self.assertIsNotNone(stage_dir)
        out_dir = stage_dir.outs[0]
        cache_dir = out_dir.cache_path
        name_dir = str(out_dir)
        md5_dir = out_dir.checksum
        info_dir = NamedCache.make(out_dir.scheme, md5_dir, name_dir)

        with self.cloud.repo.state:
            # Check status
            status = self.cloud.status(info, show_checksums=True)
            expected = {md5: {"name": md5, "status": STATUS_NEW}}
            self.assertEqual(status, expected)

            status_dir = self.cloud.status(info_dir, show_checksums=True)
            expected = {md5_dir: {"name": md5_dir, "status": STATUS_NEW}}
            self.assertEqual(status_dir, expected)

            # Push and check status
            self.cloud.push(info)
            self.assertTrue(os.path.exists(cache))
            self.assertTrue(os.path.isfile(cache))

            self.cloud.push(info_dir)
            self.assertTrue(os.path.isfile(cache_dir))

            status = self.cloud.status(info, show_checksums=True)
            expected = {md5: {"name": md5, "status": STATUS_OK}}
            self.assertEqual(status, expected)

            status_dir = self.cloud.status(info_dir, show_checksums=True)
            expected = {md5_dir: {"name": md5_dir, "status": STATUS_OK}}
            self.assertEqual(status_dir, expected)

            # Remove and check status
            remove(self.dvc.cache.local.cache_dir)

            status = self.cloud.status(info, show_checksums=True)
            expected = {md5: {"name": md5, "status": STATUS_DELETED}}
            self.assertEqual(status, expected)

            status_dir = self.cloud.status(info_dir, show_checksums=True)
            expected = {md5_dir: {"name": md5_dir, "status": STATUS_DELETED}}
            self.assertEqual(status_dir, expected)

            # Pull and check status
            self.cloud.pull(info)
            self.assertTrue(os.path.exists(cache))
            self.assertTrue(os.path.isfile(cache))
            with open(cache, "r") as fd:
                self.assertEqual(fd.read(), self.FOO_CONTENTS)

            self.cloud.pull(info_dir)
            self.assertTrue(os.path.isfile(cache_dir))

            status = self.cloud.status(info, show_checksums=True)
            expected = {md5: {"name": md5, "status": STATUS_OK}}
            self.assertEqual(status, expected)

            status_dir = self.cloud.status(info_dir, show_checksums=True)
            expected = {md5_dir: {"name": md5_dir, "status": STATUS_OK}}
            self.assertEqual(status_dir, expected)

    def test(self):
        self._ensure_should_run()
        self._test_cloud()
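
TestDataCloudBase is designed to be subclassed once per remote type: a
subclass only has to say whether it can run, where its remote lives, and
which remote class DataCloud should resolve. As a sketch, a hypothetical
subclass for a plain local-directory remote could look like this (the
dvc.remote.local.RemoteLOCAL import matches DVC's layout of this era, but
treat it and TestDataCloudLocal as assumptions for illustration):

import tempfile

from dvc.remote.local import RemoteLOCAL


class TestDataCloudLocal(TestDataCloudBase):
    def _get_cloud_class(self):
        return RemoteLOCAL

    @staticmethod
    def should_test():
        # needs no credentials or external services, so it can always run
        return True

    @staticmethod
    def get_url():
        # any empty directory can act as a "remote" for the local backend
        return tempfile.mkdtemp(prefix="dvc-test-remote-")
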
Code example #28
File: test_data_cloud.py Project: jackwellsxyz/dvc
    def _test_cloud(self, config, cl):
        self.dvc.config = config
        cloud = DataCloud(self.dvc)
        self.assertIsInstance(cloud.get_remote(), cl)
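
For context, a plausible call site for this two-line helper: deep-copy the
shared TEST_CONFIG, point the test remote at a URL whose scheme selects the
backend, and let the helper assert that DataCloud resolves it to the
expected remote class. The RemoteS3 import path, the test_s3 name and the
s3:// URL are assumptions for illustration:

import copy

from dvc.remote.s3 import RemoteS3


def test_s3(self):
    # hypothetical test method on the same TestCase as _test_cloud above
    config = copy.deepcopy(TEST_CONFIG)
    config["remote"][TEST_REMOTE] = {"url": "s3://bucket/path"}
    self._test_cloud(config, RemoteS3)
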
Code example #29
File: test_data_cloud.py Project: vyloy/dvc
class TestDataCloudBase(TestDvc):
    def _get_cloud_class(self):
        return None

    def _should_test(self):
        return False

    def _get_url(self):
        return ""

    def _ensure_should_run(self):
        if not self._should_test():
            raise SkipTest("Test {} is disabled".format(
                self.__class__.__name__))

    def _setup_cloud(self):
        self._ensure_should_run()

        repo = self._get_url()

        config = copy.deepcopy(TEST_CONFIG)
        config[TEST_SECTION][Config.SECTION_REMOTE_URL] = repo
        self.cloud = DataCloud(self.dvc, config)

        self.assertIsInstance(self.cloud._cloud, self._get_cloud_class())

    def _test_cloud(self):
        self._setup_cloud()

        stages = self.dvc.add(self.FOO)
        self.assertEqual(len(stages), 1)
        stage = stages[0]
        self.assertIsNotNone(stage)
        cache = stage.outs[0].cache
        info = stage.outs[0].dumpd()
        md5 = info["md5"]

        stages = self.dvc.add(self.DATA_DIR)
        self.assertEqual(len(stages), 1)
        stage_dir = stages[0]
        self.assertIsNotNone(stage_dir)

        cache_dir = stage_dir.outs[0].cache
        info_dir = stage_dir.outs[0].dumpd()
        md5_dir = info_dir["md5"]

        with self.cloud.repo.state:
            # Check status
            status = self.cloud.status([info], show_checksums=True)
            expected = {md5: {"name": md5, "status": STATUS_NEW}}
            self.assertEqual(status, expected)

            status_dir = self.cloud.status([info_dir], show_checksums=True)
            expected = {md5_dir: {"name": md5_dir, "status": STATUS_NEW}}
            self.assertEqual(status_dir, expected)

            # Push and check status
            self.cloud.push([info])
            self.assertTrue(os.path.exists(cache))
            self.assertTrue(os.path.isfile(cache))

            self.cloud.push([info_dir])
            self.assertTrue(os.path.isfile(cache_dir))

            status = self.cloud.status([info], show_checksums=True)
            expected = {md5: {"name": md5, "status": STATUS_OK}}
            self.assertEqual(status, expected)

            status_dir = self.cloud.status([info_dir], show_checksums=True)
            expected = {md5_dir: {"name": md5_dir, "status": STATUS_OK}}
            self.assertEqual(status_dir, expected)

            # Remove and check status
            shutil.rmtree(self.dvc.cache.local.cache_dir)

            status = self.cloud.status([info], show_checksums=True)
            expected = {md5: {"name": md5, "status": STATUS_DELETED}}
            self.assertEqual(status, expected)

            status_dir = self.cloud.status([info_dir], show_checksums=True)
            expected = {md5_dir: {"name": md5_dir, "status": STATUS_DELETED}}
            self.assertEqual(status_dir, expected)

            # Pull and check status
            self.cloud.pull([info])
            self.assertTrue(os.path.exists(cache))
            self.assertTrue(os.path.isfile(cache))
            with open(cache, "r") as fd:
                self.assertEqual(fd.read(), self.FOO_CONTENTS)

            self.cloud.pull([info_dir])
            self.assertTrue(os.path.isfile(cache_dir))

            status = self.cloud.status([info], show_checksums=True)
            expected = {md5: {"name": md5, "status": STATUS_OK}}
            self.assertEqual(status, expected)

            status_dir = self.cloud.status([info_dir], show_checksums=True)
            expected = {md5_dir: {"name": md5_dir, "status": STATUS_OK}}
            self.assertEqual(status_dir, expected)

    def test(self):
        self._ensure_should_run()
        self._test_cloud()
Code example #30
File: test_data_cloud.py Project: vyloy/dvc
    def _test_cloud(self, config, cl):
        cloud = DataCloud(self.dvc, config=config)
        self.assertIsInstance(cloud._cloud, cl)