Example 1
File: init.py Project: vladkol/dvc
def init(root_dir=os.curdir, no_scm=False, force=False, subdir=False):
    """
    Creates an empty repo in the given directory -- basically a
    `.dvc` directory with subdirectories for configuration and cache.

    The directory should be tracked by an SCM, or the `--no-scm` flag
    must be used.

    If a `.dvc` directory already exists there, you must use the
    `--force` flag to overwrite it.

    Args:
        root_dir: Path to repo's root directory.
        no_scm: Initialize the repo without SCM tracking.
        force: Overwrite an existing `.dvc` directory.
        subdir: Initialize inside a subdirectory of a parent SCM repository.

    Returns:
        Repo instance.

    Raises:
        InvalidArgumentError: If both `no_scm` and `subdir` are used.
        InitError: If the directory is not tracked by a supported SCM, or
            if a `.dvc` directory already exists and `force` is not set.
    """

    if no_scm and subdir:
        raise InvalidArgumentError(
            "Cannot initialize repo with `--no-scm` and `--subdir`")

    root_dir = os.path.realpath(root_dir)
    dvc_dir = os.path.join(root_dir, Repo.DVC_DIR)

    try:
        scm = SCM(root_dir, search_parent_directories=subdir, no_scm=no_scm)
    except SCMError:
        raise InitError(
            "{repo} is not tracked by any supported SCM tool (e.g. Git). "
            "Use `--no-scm` if you don't want to use any SCM or "
            "`--subdir` if initializing inside a subdirectory of a parent SCM "
            "repository.".format(repo=root_dir))

    if os.path.isdir(dvc_dir):
        if not force:
            raise InitError("'{repo}' exists. Use `-f` to force.".format(
                repo=relpath(dvc_dir)))

        remove(dvc_dir)

    os.mkdir(dvc_dir)

    config = Config.init(dvc_dir)

    if no_scm:
        with config.edit() as conf:
            conf["core"]["no_scm"] = True

    dvcignore = init_dvcignore(root_dir)

    proj = Repo(root_dir)

    proj.plots.templates.init()

    scm.add(
        [config.files["repo"], dvcignore, proj.plots.templates.templates_dir])

    if scm.ignore_file:
        scm.add([os.path.join(dvc_dir, scm.ignore_file)])
        logger.info("\nYou can now commit the changes to git.\n")

    return proj
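For context, a minimal usage sketch (assuming this is dvc/repo/init.py and that Repo.init delegates to it, as in DVC; the directory name is only illustrative):

import os

from dvc.repo import Repo

# Initialize DVC in a fresh directory without Git tracking
# (equivalent to `dvc init --no-scm`).
os.makedirs("my-project", exist_ok=True)
repo = Repo.init("my-project", no_scm=True)
print(repo.root_dir)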
Example 2
    def reproduce(
        cls,
        dvc_dir: Optional[str],
        rev: str,
        queue: Optional["Queue"] = None,
        rel_cwd: Optional[str] = None,
        name: Optional[str] = None,
        log_level: Optional[int] = None,
    ) -> "ExecutorResult":
        """Run dvc repro and return the result.

        Returns tuple of (exp_hash, exp_ref, force) where exp_hash is the
            experiment hash (or None on error), exp_ref is the experiment ref,
            and force is a bool specifying whether or not this experiment
            should force overwrite any existing duplicates.
        """
        from dvc.repo import Repo
        from dvc.repo.checkout import checkout as dvc_checkout
        from dvc.repo.reproduce import reproduce as dvc_reproduce

        unchanged = []

        if queue is not None:
            queue.put((rev, os.getpid()))
        if log_level is not None:
            cls._set_log_level(log_level)

        def filter_pipeline(stages):
            unchanged.extend([
                stage for stage in stages if isinstance(stage, PipelineStage)
            ])

        exp_hash: Optional[str] = None
        exp_ref: Optional["ExpRefInfo"] = None
        repro_force: bool = False

        # Pre-initialize so the `finally` block cannot hit unbound names
        # if Repo() itself raises.
        dvc = None
        old_cwd = None

        try:
            dvc = Repo(dvc_dir)
            if dvc_dir is not None:
                old_cwd = os.getcwd()
                if rel_cwd:
                    os.chdir(os.path.join(dvc.root_dir, rel_cwd))
                else:
                    os.chdir(dvc.root_dir)
            else:
                old_cwd = None
            logger.debug("Running repro in '%s'", os.getcwd())

            args_path = os.path.join(dvc.tmp_dir,
                                     BaseExecutor.PACKED_ARGS_FILE)
            if os.path.exists(args_path):
                args, kwargs = BaseExecutor.unpack_repro_args(args_path)
                remove(args_path)
            else:
                args = []
                kwargs = {}

            repro_force = kwargs.get("force", False)
            logger.debug("force = %s", str(repro_force))

            # NOTE: for checkpoint experiments we handle persist outs slightly
            # differently than normal:
            #
            # - checkpoint out may not yet exist if this is the first time this
            #   experiment has been run, this is not an error condition for
            #   experiments
            # - at the start of a repro run, we need to remove the persist out
            #   and restore it to its last known (committed) state (which may
            #   be removed/does not yet exist) so that our executor workspace
            #   is not polluted with the (persistent) out from an unrelated
            #   experiment run
            dvc_checkout(dvc, force=True, quiet=True)

            checkpoint_func = partial(cls.checkpoint_callback, dvc.scm, name,
                                      repro_force)
            stages = dvc_reproduce(
                dvc,
                *args,
                on_unchanged=filter_pipeline,
                checkpoint_func=checkpoint_func,
                **kwargs,
            )

            exp_hash = cls.hash_exp(stages)
            try:
                cls.commit(
                    dvc.scm,
                    exp_hash,
                    exp_name=name,
                    force=repro_force,
                    checkpoint=any(stage.is_checkpoint for stage in stages),
                )
            except UnchangedExperimentError:
                pass
            ref = dvc.scm.get_ref(EXEC_BRANCH, follow=False)
            if ref:
                exp_ref = ExpRefInfo.from_ref(ref)
            if cls.WARN_UNTRACKED:
                untracked = dvc.scm.untracked_files()
                if untracked:
                    logger.warning(
                        "The following untracked files were present in the "
                        "experiment directory after reproduction but will "
                        "not be included in experiment commits:\n"
                        "\t%s",
                        ", ".join(untracked),
                    )
        finally:
            if dvc:
                dvc.scm.close()
            if old_cwd:
                os.chdir(old_cwd)

        # ideally we would return stages here like a normal repro() call, but
        # stages is not currently picklable and cannot be returned across
        # multiprocessing calls
        return ExecutorResult(exp_hash, exp_ref, repro_force)
Example 3
    def dvc(self):
        from dvc.repo import Repo

        return Repo(self.dvc_dir)
Example 4
    def reproduce(
        cls,
        dvc_dir: str,
        queue: "Queue",
        rev: str,
        cwd: Optional[str] = None,
        name: Optional[str] = None,
    ) -> Tuple[Optional[str], bool]:
        """Run dvc repro and return the result.

        Returns tuple of (exp_hash, force) where exp_hash is the experiment
            hash (or None on error) and force is a bool specifying whether or
            not this experiment should force overwrite any existing duplicates.
        """
        unchanged = []

        queue.put((rev, os.getpid()))

        def filter_pipeline(stages):
            unchanged.extend([
                stage for stage in stages if isinstance(stage, PipelineStage)
            ])

        result = None
        force = False

        # Pre-initialize so the `finally` block cannot hit unbound names
        # if Repo() itself raises.
        dvc = None
        old_cwd = None

        try:
            dvc = Repo(dvc_dir)
            old_cwd = os.getcwd()
            new_cwd = cwd if cwd else dvc.root_dir
            os.chdir(new_cwd)
            logger.debug("Running repro in '%s'", cwd)

            args_path = os.path.join(dvc.tmp_dir,
                                     BaseExecutor.PACKED_ARGS_FILE)
            if os.path.exists(args_path):
                args, kwargs = BaseExecutor.unpack_repro_args(args_path)
                remove(args_path)
            else:
                args = []
                kwargs = {}

            force = kwargs.get("force", False)

            # NOTE: for checkpoint experiments we handle persist outs slightly
            # differently than normal:
            #
            # - checkpoint out may not yet exist if this is the first time this
            #   experiment has been run, this is not an error condition for
            #   experiments
            # - at the start of a repro run, we need to remove the persist out
            #   and restore it to its last known (committed) state (which may
            #   be removed/does not yet exist) so that our executor workspace
            #   is not polluted with the (persistent) out from an unrelated
            #   experiment run
            dvc.checkout(force=True, quiet=True)

            checkpoint_func = partial(cls.checkpoint_callback, dvc.scm, name)
            stages = dvc.reproduce(
                *args,
                on_unchanged=filter_pipeline,
                checkpoint_func=checkpoint_func,
                **kwargs,
            )

            exp_hash = cls.hash_exp(stages)
            result = exp_hash
            exp_rev = cls.commit(dvc.scm, exp_hash, exp_name=name)
            if dvc.scm.get_ref(EXEC_CHECKPOINT):
                dvc.scm.set_ref(EXEC_CHECKPOINT, exp_rev)
        except UnchangedExperimentError:
            pass
        finally:
            if dvc:
                dvc.scm.close()
            if old_cwd:
                os.chdir(old_cwd)

        # ideally we would return stages here like a normal repro() call, but
        # stages is not currently picklable and cannot be returned across
        # multiprocessing calls
        return result, force
Example 5
    def _read_outputs(self, module_temp_dir):
        from dvc.repo import Repo

        pkg_repo = Repo(module_temp_dir)
        stages = pkg_repo.stages()
        return [out for s in stages for out in s.outs]
Example 6
def _get_repo():
    from dvc.repo import Repo

    return Repo()
Example 7
def gc(
    self,
    all_branches=False,
    cloud=False,
    remote=None,
    with_deps=False,
    all_tags=False,
    all_commits=False,
    all_experiments=False,
    force=False,
    jobs=None,
    repos=None,
    workspace=False,
):

    # require `workspace` to be true to come into effect.
    # assume `workspace` to be enabled if any of `all_tags`, `all_commits`,
    # `all_experiments` or `all_branches` are enabled.
    _raise_error_if_all_disabled(
        workspace=workspace,
        all_tags=all_tags,
        all_commits=all_commits,
        all_branches=all_branches,
        all_experiments=all_experiments,
    )

    from contextlib import ExitStack

    from dvc.repo import Repo

    if not repos:
        repos = []
    all_repos = [Repo(path) for path in repos]

    with ExitStack() as stack:
        for repo in all_repos:
            stack.enter_context(repo.lock)
            stack.enter_context(repo.state)

        used = NamedCache()
        for repo in all_repos + [self]:
            used.update(
                repo.used_cache(
                    all_branches=all_branches,
                    with_deps=with_deps,
                    all_tags=all_tags,
                    all_commits=all_commits,
                    all_experiments=all_experiments,
                    remote=remote,
                    force=force,
                    jobs=jobs,
                ))

    for scheme, odb in self.odb.by_scheme():
        if not odb:
            continue

        removed = odb.gc(set(used.scheme_keys(scheme)), jobs=jobs)
        if not removed:
            logger.info(f"No unused '{scheme}' cache to remove.")

    if not cloud:
        return

    remote = self.cloud.get_remote(remote, "gc -c")
    removed = remote.gc(set(used.scheme_keys(Schemes.LOCAL)), jobs=jobs)
    if not removed:
        logger.info("No unused cache to remove from remote.")
Example 8
def test_show_no_repo(tmp_dir):
    tmp_dir.gen("metrics.json", '{"foo": 0, "bar": 0.0, "baz": {}}')

    dvc = Repo(uninitialized=True)

    dvc.metrics.show(targets=["metrics.json"])
Example 9
def repo():
    return Repo(".")
Example 10
 def stages():
     return {stage.relpath for stage in Repo(os.fspath(tmp_dir)).stages}
Example 11
def external_repo(url,
                  rev=None,
                  for_write=False,
                  cache_dir=None,
                  cache_types=None,
                  **kwargs):
    from scmrepo.git import Git

    from dvc.config import NoRemoteError
    from dvc.fs.git import GitFileSystem

    logger.debug("Creating external repo %s@%s", url, rev)
    path = _cached_clone(url, rev, for_write=for_write)
    # Local HEAD points to the tip of whatever branch we first cloned from
    # (which may not be the default branch), use origin/HEAD here to get
    # the tip of the default branch
    rev = rev or "refs/remotes/origin/HEAD"

    cache_config = {
        "cache": {
            "dir": cache_dir or _get_cache_dir(url),
            "type": cache_types
        }
    }

    config = _get_remote_config(url) if os.path.isdir(url) else {}
    config.update(cache_config)

    if for_write:
        root_dir = path
        fs = None
    else:
        root_dir = os.path.realpath(path)
        scm = Git(root_dir)
        fs = GitFileSystem(scm=scm, rev=rev)

    repo_kwargs = dict(
        root_dir=root_dir,
        url=url,
        fs=fs,
        config=config,
        repo_factory=erepo_factory(url, cache_config),
        **kwargs,
    )

    if "subrepos" not in repo_kwargs:
        repo_kwargs["subrepos"] = True

    if "uninitialized" not in repo_kwargs:
        repo_kwargs["uninitialized"] = True

    repo = Repo(**repo_kwargs)

    try:
        yield repo
    except NoRemoteError as exc:
        raise NoRemoteInExternalRepoError(url) from exc
    except OutputNotFoundError as exc:
        if exc.repo is repo:
            raise NoOutputInExternalRepoError(exc.output, repo.root_dir,
                                              url) from exc
        raise
    except FileMissingError as exc:
        raise PathMissingError(exc.path, url) from exc
    finally:
        repo.close()
        if for_write:
            _remove(path)
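For context, a minimal usage sketch (assuming this generator is wrapped with contextlib.contextmanager and importable from dvc.external_repo, as in DVC; the URL is only illustrative):

from dvc.external_repo import external_repo

# Clone (or reuse a cached clone of) an external DVC repository and
# open it read-only at the tip of its default branch.
with external_repo("https://github.com/iterative/example-get-started") as repo:
    print(repo.root_dir)  # location of the cached clone

With for_write=True the repo is opened directly on the working clone (no GitFileSystem), and the clone is removed in the finally block when the context exits.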
Example 12
 def stages():
     return set(stage.relpath for stage in Repo(fspath(tmp_dir)).stages)
Example 13
"""
Define paths to all DVC dependency and output files.

"""
from pathlib import Path
from urllib.parse import urlparse

import dask
from dvc.repo import Repo
dvc_repo = Repo('.')

# Get DVC remote URL to your remote work directory for this project
remote_user_work_path = urlparse(
    dvc_repo.config.config[f'remote "ahsoka_user_workspace"']['url']).path
remote_user_work_path = Path(remote_user_work_path)
remote_project_work_path = urlparse(
    dvc_repo.config.config[f'remote "ahsoka_project_data"']['url']).path
remote_project_work_folder_name = Path(remote_project_work_path).name
remote_work_path = remote_user_work_path/remote_project_work_folder_name

# Get DVC remote URL to the project cache
remote_cache_path = urlparse(
    dvc_repo.config.config[f'remote "ahsoka_project_cache"']['url']).path
remote_cache_path = Path(remote_cache_path)

assert remote_work_path.name == remote_cache_path.name, (
    'The name of your remote DVC data directory for this project, '
    f'"{remote_work_path.name}", should be the same as the name of the '
    f'project cache directory, "{remote_cache_path.name}".')

data_dir = remote_work_path
Example 14
def gc(
    self,
    all_branches=False,
    cloud=False,
    remote=None,
    with_deps=False,
    all_tags=False,
    all_commits=False,
    all_experiments=False,
    force=False,
    jobs=None,
    repos=None,
    workspace=False,
):

    # require `workspace` to be true to come into effect.
    # assume `workspace` to be enabled if any of `all_tags`, `all_commits`,
    # `all_experiments` or `all_branches` are enabled.
    _raise_error_if_all_disabled(
        workspace=workspace,
        all_tags=all_tags,
        all_commits=all_commits,
        all_branches=all_branches,
        all_experiments=all_experiments,
    )

    from contextlib import ExitStack

    from dvc.objects.db import get_index
    from dvc.repo import Repo

    if not repos:
        repos = []
    all_repos = [Repo(path) for path in repos]

    used_obj_ids = set()
    with ExitStack() as stack:
        for repo in all_repos:
            stack.enter_context(repo.lock)

        for repo in all_repos + [self]:
            for obj_ids in repo.used_objs(
                    all_branches=all_branches,
                    with_deps=with_deps,
                    all_tags=all_tags,
                    all_commits=all_commits,
                    all_experiments=all_experiments,
                    remote=remote,
                    force=force,
                    jobs=jobs,
            ).values():
                used_obj_ids.update(obj_ids)

    for scheme, odb in self.odb.by_scheme():
        if not odb:
            continue

        removed = odb.gc(used_obj_ids, jobs=jobs)
        if not removed:
            logger.info(f"No unused '{scheme}' cache to remove.")

    if not cloud:
        return

    odb = self.cloud.get_remote_odb(remote, "gc -c")
    removed = odb.gc(used_obj_ids)
    if removed:
        get_index(odb).clear()
    else:
        logger.info("No unused cache to remove from remote.")
Example 15
def gc(
    self,
    all_branches=False,
    cloud=False,
    remote=None,
    with_deps=False,
    all_tags=False,
    all_commits=False,
    force=False,
    jobs=None,
    repos=None,
    workspace=False,
):

    # require `workspace` to be true to come into effect.
    # assume `workspace` to be enabled if any of `all_tags`, `all_commits`,
    # or `all_branches` are enabled.
    _raise_error_if_all_disabled(
        workspace=workspace,
        all_tags=all_tags,
        all_commits=all_commits,
        all_branches=all_branches,
        cloud=cloud,
    )

    from contextlib import ExitStack
    from dvc.repo import Repo

    all_repos = []

    if repos:
        all_repos = [Repo(path) for path in repos]

    with ExitStack() as stack:
        for repo in all_repos:
            stack.enter_context(repo.lock)
            stack.enter_context(repo.state)

        used = NamedCache()
        for repo in all_repos + [self]:
            used.update(
                repo.used_cache(
                    all_branches=all_branches,
                    with_deps=with_deps,
                    all_tags=all_tags,
                    all_commits=all_commits,
                    remote=remote,
                    force=force,
                    jobs=jobs,
                ))

    _do_gc("local", self.cache.local.gc, used)

    if self.cache.s3:
        _do_gc("s3", self.cache.s3.gc, used)

    if self.cache.gs:
        _do_gc("gs", self.cache.gs.gc, used)

    if self.cache.ssh:
        _do_gc("ssh", self.cache.ssh.gc, used)

    if self.cache.hdfs:
        _do_gc("hdfs", self.cache.hdfs.gc, used)

    if self.cache.azure:
        _do_gc("azure", self.cache.azure.gc, used)

    if cloud:
        _do_gc("remote", self.cloud.get_remote(remote, "gc -c").gc, used)
Example 16
    def exp_dvc(self):
        """Return clone dvc Repo instance."""
        from dvc.repo import Repo

        return Repo(self.exp_dvc_dir)
Example 17
 def collect_stages():
     return {
         stage.relpath
         for stage in Repo(os.fspath(tmp_dir)).index.stages
     }
Example 18
def external_repo(url,
                  rev=None,
                  for_write=False,
                  cache_dir=None,
                  cache_types=None,
                  **kwargs):
    from dvc.config import NoRemoteError
    from dvc.scm.git import Git

    logger.debug("Creating external repo %s@%s", url, rev)
    path = _cached_clone(url, rev, for_write=for_write)
    # Local HEAD points to the tip of whatever branch we first cloned from
    # (which may not be the default branch), use origin/HEAD here to get
    # the tip of the default branch
    rev = rev or "refs/remotes/origin/HEAD"

    cache_config = {
        "cache": {
            "dir": cache_dir or _get_cache_dir(url),
            "type": cache_types,
        }
    }

    config = _get_remote_config(url) if os.path.isdir(url) else {}
    config.update(cache_config)

    def make_repo(path, **_kwargs):
        _config = cache_config.copy()
        if os.path.isdir(url):
            rel = os.path.relpath(path, _kwargs["scm"].root_dir)
            repo_path = os.path.join(url, rel)
            _config.update(_get_remote_config(repo_path))
        return Repo(path, config=_config, **_kwargs)

    root_dir = path if for_write else os.path.realpath(path)
    repo_kwargs = dict(
        root_dir=root_dir,
        url=url,
        scm=None if for_write else Git(root_dir),
        rev=None if for_write else rev,
        subrepos=not for_write,
        uninitialized=True,
        config=config,
        repo_factory=make_repo,
        **kwargs,
    )

    if "fetch" not in repo_kwargs:
        repo_kwargs["fetch"] = True

    repo = Repo(**repo_kwargs)

    try:
        yield repo
    except NoRemoteError as exc:
        raise NoRemoteInExternalRepoError(url) from exc
    except OutputNotFoundError as exc:
        if exc.repo is repo:
            raise NoOutputInExternalRepoError(exc.output, repo.root_dir,
                                              url) from exc
        raise
    except FileMissingError as exc:
        raise PathMissingError(exc.path, url) from exc
    finally:
        repo.close()
        if for_write:
            _remove(path)
Example 19
    def __init__(self, args):
        from dvc.repo import Repo

        self.repo = Repo()
        self.config = self.repo.config
        self.args = args
Example 20
    def identifier(self) -> str:
        """Unique identifier for the index.

        We can use this to optimize and skip opening some indices
        e.g. on push/pull/fetch/gc --all-commits.

        Currently, it is unique to the platform (windows vs posix).
        """
        return dict_md5(self.dumpd())


if __name__ == "__main__":
    from funcy import log_durations

    from dvc.repo import Repo

    repo = Repo()
    index = Index(repo, repo.fs)
    print(index)
    with log_durations(print, "collecting stages"):
        # pylint: disable=pointless-statement
        print("no of stages", len(index.stages))
    with log_durations(print, "building graph"):
        index.build_graph()
    with log_durations(print, "calculating hash"):
        print(index.identifier)
    with log_durations(print, "updating"):
        index2 = index.update(index.stages)
    with log_durations(print, "calculating hash"):
        print(index2.identifier)
Example 21
def create_dag(data_path: str, out: str) -> None:
    repo = Repo()
    repo.run(cmd=f"python train_model.py {data_path} {out}",
             deps=[data_path],
             outs=[out],
             fname=out + ".dvc")