Beispiel #1
0
def test_load_stage(dvc, stage_data, lock_data):
    """Load ``stage-1`` via ``StageLoader`` and verify its loaded fields."""
    pipeline_file = Dvcfile(dvc, PIPELINE_FILE)
    loaded = StageLoader.load_stage(
        pipeline_file, "stage-1", stage_data, lock_data
    )

    # Stage-level attributes.
    assert loaded.wdir == os.path.abspath(os.curdir)
    assert loaded.name == "stage-1"
    assert loaded.cmd == "command"
    assert loaded.path == os.path.abspath(PIPELINE_FILE)

    # First dependency: path and expected checksum.
    first_dep = loaded.deps[0]
    assert first_dep.def_path == "foo"
    assert first_dep.hash_info == HashInfo("md5", "foo_checksum")

    # First output: path and expected checksum.
    first_out = loaded.outs[0]
    assert first_out.def_path == "bar"
    assert first_out.hash_info == HashInfo("md5", "bar_checksum")
Beispiel #2
0
def _create_stages(
    repo,
    targets,
    fname,
    pbar=None,
    external=False,
    glob=False,
    desc=None,
    transfer=False,
    **kwargs,
):
    """Create one single-output stage per expanded target.

    Targets are expanded with ``glob_targets``.  For each target the output
    path is resolved (honouring an ``out`` entry in ``kwargs``), a ``Stage``
    is built with ``create_stage``, its meta is restored, and any DVC file
    already present at the stage path is removed.

    Args:
        repo: repository the stages belong to; it is reset after every
            target.
        targets: paths (or glob patterns when ``glob`` is set) to process.
        fname: stage file path; falls back to the resolved path when falsy.
        pbar: optional progress bar, updated per created stage.
        external: forwarded to ``create_stage``.
        glob: forwarded to ``glob_targets``.
        desc: when truthy, stored as the description of the stage's first
            output.
        transfer: together with a missing ``out`` kwarg, makes
            ``resolve_paths`` use ``always_local``.
        **kwargs: only the ``out`` key is read here.

    Returns:
        List of the created stages.
    """
    from dvc.dvcfile import Dvcfile
    from dvc.stage import Stage, create_stage, restore_meta

    expanded_targets = glob_targets(targets, glob=glob)
    progress = Tqdm(
        expanded_targets,
        desc="Creating DVC files",
        disable=len(expanded_targets) < LARGE_DIR_SIZE,
        unit="file",
    )

    created = []
    for target in progress:
        custom_out = kwargs.get("out")
        if custom_out:
            target = resolve_output(target, custom_out)

        path, wdir, out = resolve_paths(
            repo, target, always_local=transfer and not custom_out
        )
        stage = create_stage(
            Stage,
            repo,
            fname or path,
            wdir=wdir,
            outs=[out],
            external=external,
        )
        restore_meta(stage)
        Dvcfile(repo, stage.path).remove()
        if desc:
            stage.outs[0].desc = desc

        repo._reset()  # pylint: disable=protected-access

        if stage:
            created.append(stage)
            if pbar is not None:
                pbar.update_msg(out)
        elif pbar is not None:
            # Nothing was created for this target: shrink the expected total.
            pbar.total -= 1

    return created
Beispiel #3
0
def test_load_all_singlestage(tmp_dir, dvc):
    """Run a single-stage copy and check ``foo2.dvc`` lists only that stage."""
    tmp_dir.gen("foo", "foo")
    created = dvc.run(
        cmd="cp foo foo2",
        deps=["foo"],
        metrics=["foo2"],
        always_changed=True,
        single_stage=True,
    )

    loaded = Dvcfile(dvc, "foo2.dvc").stages.values()

    assert len(loaded) == 1
    assert list(loaded) == [created]
Beispiel #4
0
    def dvcfile(self):
        """Return the ``Dvcfile`` for ``self.path``, reusing a cached one.

        The cached ``self._dvcfile`` is returned only while its path still
        equals ``self.path``; otherwise a new ``Dvcfile`` is created, cached
        and returned.

        Raises:
            DvcException: if ``self.path`` is not set.
        """
        if not self.path:
            raise DvcException("Stage does not have any path set "
                               "and is detached from dvcfile.")

        cached = self._dvcfile
        if cached and self.path == cached.path:
            return cached

        from dvc.dvcfile import Dvcfile

        self._dvcfile = Dvcfile(self.repo, self.path)
        return self._dvcfile
Beispiel #5
0
def test_load_stage_with_params(dvc, stage_data, lock_data):
    """Load a stage that declares params and check params, deps and outs."""
    lock_data["params"] = {"params.yaml": {"lorem": "ipsum"}}
    stage_data["params"] = ["lorem"]

    loaded = StageLoader.load_stage(
        Dvcfile(dvc, PIPELINE_FILE), "stage-1", stage_data, lock_data
    )
    params, deps = split_params_deps(loaded)

    # Paths of the regular dependency and of the output.
    assert deps[0].def_path == "foo"
    assert loaded.outs[0].def_path == "bar"

    # The params dependency carries the locked values as its hash info.
    assert params[0].def_path == "params.yaml"
    assert params[0].hash_info == HashInfo("params", {"lorem": "ipsum"})

    assert deps[0].hash_info == HashInfo("md5", "foo_checksum")
    assert loaded.outs[0].hash_info == HashInfo("md5", "bar_checksum")
Beispiel #6
0
def test_has_stage_with_name(tmp_dir, dvc):
    """Membership in ``stages`` matches the full stage name, not a prefix."""
    stage_name = "copy-foo-foo2"

    tmp_dir.gen("foo", "foo")
    dvc.run(
        cmd="cp foo foo2",
        deps=["foo"],
        name=stage_name,
        metrics=["foo2"],
        always_changed=True,
    )

    pipeline_file = Dvcfile(dvc, PIPELINE_FILE)
    assert stage_name in pipeline_file.stages
    assert "copy" not in pipeline_file.stages
Beispiel #7
0
def _create_stages(
    repo, targets, fname, pbar=None, external=False, glob=False, desc=None,
):
    """Create one single-output stage per target, omitting skippable ones.

    Args:
        repo: repository the stages belong to; it is reset after every
            target.
        targets: paths to process, or recursive glob patterns when ``glob``
            is set.
        fname: stage file path; falls back to the resolved path when falsy.
        pbar: optional progress bar; its total is decremented for skipped
            targets and its message updated for created ones.
        external: forwarded to ``create_stage``.
        glob: expand every target with ``iglob(..., recursive=True)``.
        desc: when truthy, stored as the description of the stage's first
            output.

    Returns:
        List of the stages that were not skipped.
    """
    from glob import iglob

    from dvc.stage import Stage, create_stage, restore_meta

    expanded_targets = targets
    if glob:
        expanded_targets = []
        for pattern in targets:
            expanded_targets.extend(iglob(pattern, recursive=True))

    progress = Tqdm(
        expanded_targets,
        desc="Creating DVC-files",
        disable=len(expanded_targets) < LARGE_DIR_SIZE,
        unit="file",
    )

    created = []
    for target in progress:
        path, wdir, out = resolve_paths(repo, target)
        stage = create_stage(
            Stage,
            repo,
            fname or path,
            wdir=wdir,
            outs=[out],
            external=external,
        )
        restore_meta(stage)

        skipped = stage.can_be_skipped
        if not skipped:
            # The stage will be written: drop any existing DVC file first.
            Dvcfile(repo, stage.path).remove()
            if desc:
                stage.outs[0].desc = desc

        repo._reset()  # pylint: disable=protected-access

        if skipped or not stage:
            if pbar is not None:
                pbar.total -= 1
            continue

        created.append(stage)
        if pbar is not None:
            pbar.update_msg(out)

    return created
Beispiel #8
0
def run(self, fname=None, no_exec=False, single_stage=False, **kwargs):
    """Create a stage from ``kwargs``, execute it and dump it to its DVC file.

    Args:
        fname: stage file path; only consulted when ``single_stage`` is set
            (otherwise ``PIPELINE_FILE`` is used).
        no_exec: when true the stage is not run; ``stage.ignore_outs()`` is
            called instead and the dump is done with ``no_lock``.
        single_stage: build a ``Stage`` instead of a named ``PipelineStage``.
        **kwargs: forwarded to ``create_stage``.  ``cmd`` is mandatory;
            ``name``, ``overwrite``, ``no_commit`` and ``run_cache`` are also
            read here.

    Returns:
        The created stage, or ``None`` when ``create_stage`` returns
        ``None``.

    Raises:
        InvalidArgumentError: if ``cmd`` is missing, if ``name`` is combined
            with ``single_stage``, or if neither is given.
        InvalidStageName: if ``name`` fails ``is_valid_name``.
        DuplicateStageName: if the DVC file already has a stage called
            ``name``.
        OutputDuplicationError: re-raised from ``check_modified_graph`` with
            the new stage removed from the reported stages.
    """
    from dvc.stage import PipelineStage, Stage, create_stage
    from dvc.dvcfile import Dvcfile, PIPELINE_FILE

    if not kwargs.get("cmd"):
        raise InvalidArgumentError("command is not specified")

    stage_cls = PipelineStage
    path = PIPELINE_FILE
    stage_name = kwargs.get("name")

    if stage_name and single_stage:
        raise InvalidArgumentError(
            "`-n|--name` is incompatible with `--single-stage`")

    if not stage_name and not single_stage:
        raise InvalidArgumentError("`-n|--name` is required")

    if single_stage:
        kwargs.pop("name", None)
        stage_cls = Stage
        path = fname or _get_file_path(kwargs)
    elif not is_valid_name(stage_name):
        raise InvalidStageName

    stage = create_stage(stage_cls, repo=self, path=path, **kwargs)
    if stage is None:
        return None

    dvcfile = Dvcfile(self, stage.path)
    if dvcfile.exists():
        if stage_name and stage_name in dvcfile.stages:
            raise DuplicateStageName(stage_name, dvcfile)
        if stage_cls != PipelineStage:
            dvcfile.remove_with_prompt(force=kwargs.get("overwrite", True))

    try:
        self.check_modified_graph([stage])
    except OutputDuplicationError as exc:
        # Report only the conflicting stages, not the one being created, and
        # chain explicitly so the original error is kept as the cause.
        raise OutputDuplicationError(
            exc.output, set(exc.stages) - {stage}
        ) from exc

    if no_exec:
        stage.ignore_outs()
    else:
        stage.run(
            no_commit=kwargs.get("no_commit", False),
            run_cache=kwargs.get("run_cache", True),
        )

    dvcfile.dump(stage, update_pipeline=True, no_lock=no_exec)
    return stage
Beispiel #9
0
def test_try_loading_dvcfile_that_is_gitignored(tmp_dir, dvc, scm, file):
    """Loading a DVC file listed in ``.gitignore`` raises FileIsGitIgnored."""
    with open(tmp_dir / ".gitignore", "a+", encoding="utf-8") as gitignore:
        gitignore.write(file)

    # create a file just to avoid other checks
    (tmp_dir / file).write_text("")
    scm._reset()

    ignored_dvcfile = Dvcfile(dvc, file)
    with pytest.raises(FileIsGitIgnored) as exc_info:
        ignored_dvcfile._load()

    expected_message = f"bad DVC file name '{file}' is git-ignored."
    assert str(exc_info.value) == expected_message
Beispiel #10
0
    def load_one(self, path: str = None, name: str = None) -> "Stage":
        """Look up one stage by ``name`` in the DVC file resolved from ``path``.

        Args:
            path: if not provided, default `dvc.yaml` is assumed.
            name: required for `dvc.yaml` files, ignored for `.dvc` files.
        """
        resolved_path = self._get_filepath(path, name)
        available = Dvcfile(self.repo, resolved_path).stages  # type: ignore
        return available[name]
Beispiel #11
0
def test_try_get_single_stage_from_pipeline_file(tmp_dir, dvc):
    """Accessing ``.stage`` on the pipeline file raises ``DvcException``."""
    from dvc.dvcfile import DvcException

    run_options = {
        "cmd": "cp foo foo2",
        "deps": ["foo"],
        "name": "copy-foo-foo2",
        "metrics": ["foo2"],
        "always_changed": True,
    }

    tmp_dir.gen("foo", "foo")
    dvc.run(**run_options)

    with pytest.raises(DvcException):
        assert Dvcfile(dvc, PIPELINE_FILE).stage
Beispiel #12
0
def add(self, tag, target=None, with_deps=False, recursive=False):
    """Record ``tag`` on every output of the collected stages.

    For each output with checksum info, a copy of that info is stored under
    ``out.tags[tag]``; outputs without info only trigger a warning.  A stage
    is dumped back to its DVC file when at least one output was tagged.
    """
    collected = self.collect(
        target, with_deps=with_deps, recursive=recursive
    )
    for stage in collected:
        tagged_any = False
        for out in stage.outs:
            if out.info:
                out.tags[tag] = copy(out.info)
                tagged_any = True
            else:
                logger.warning("missing checksum info for '{}'".format(out))

        if tagged_any:
            Dvcfile(self, stage.path).dump(stage)
Beispiel #13
0
def remove(self, tag, target=None, with_deps=False, recursive=False):
    """Delete ``tag`` from every output of the collected stages.

    Outputs that do not carry the tag only trigger a warning.  A stage is
    dumped back to its DVC file when at least one tag was deleted.
    """
    collected = self.collect(
        target, with_deps=with_deps, recursive=recursive
    )
    for stage in collected:
        removed_any = False
        for out in stage.outs:
            if tag in out.tags.keys():
                del out.tags[tag]
                removed_any = True
            else:
                logger.warning("tag '{}' not found for '{}'".format(tag, out))

        if removed_any:
            Dvcfile(self, stage.path).dump(stage)
Beispiel #14
0
def test_load_all_multistage(tmp_dir, dvc):
    """The pipeline file lists every named stage added by ``dvc.run``."""
    tmp_dir.gen("foo", "foo")
    first = dvc.run(
        cmd="cp foo foo2",
        deps=["foo"],
        name="copy-foo-foo2",
        metrics=["foo2"],
        always_changed=True,
    )

    # After the first run only that stage is present.
    loaded = Dvcfile(dvc, PIPELINE_FILE).stages.values()
    assert len(loaded) == 1
    assert list(loaded) == [first]

    tmp_dir.gen("bar", "bar")
    second = dvc.run(
        cmd="cp bar bar2",
        deps=["bar"],
        name="copy-bar-bar2",
        metrics=["bar2"],
        always_changed=True,
    )

    # A fresh Dvcfile now reports both stages.
    reloaded = set(Dvcfile(dvc, PIPELINE_FILE).stages.values())
    assert reloaded == {second, first}
Beispiel #15
0
    def test_force_with_dependencies(self):
        """``repro --force`` yields an output checksum different from run's."""
        timestamp_cmd = (
            'python -c "import time; print(time.time())" > datetime.txt'
        )
        run_stage = self.dvc.run(
            fname="datetime.dvc",
            deps=[self.FOO],
            outs=["datetime.txt"],
            cmd=timestamp_cmd,
        )
        run_out = run_stage.outs[0]

        ret = main(["repro", "--force", "datetime.dvc"])
        self.assertEqual(ret, 0)

        repro_stage = Dvcfile(self.dvc, "datetime.dvc").load()
        repro_out = repro_stage.outs[0]

        self.assertNotEqual(run_out.checksum, repro_out.checksum)
Beispiel #16
0
def test_download_error_pulling_imported_stage(tmp_dir, dvc, erepo_dir):
    """A failing download while pulling an import raises ``DownloadError``."""
    with erepo_dir.chdir():
        erepo_dir.dvc_gen("foo", "foo content", commit="create foo")
    dvc.imp(os.fspath(erepo_dir), "foo", "foo_imported")

    imported_stage = Dvcfile(dvc, "foo_imported.dvc").stage
    cached_copy = imported_stage.outs[0].cache_path

    # Drop both the workspace file and its cache entry before pulling.
    remove("foo_imported")
    remove(cached_copy)

    with patch(
        "dvc.fs.local.LocalFileSystem._download", side_effect=Exception
    ):
        with pytest.raises(DownloadError):
            dvc.pull(["foo_imported.dvc"])
Beispiel #17
0
def test_multistage_with_wdir(tmp_dir, dvc):
    """A stage run with ``wdir="dir"`` has that ``wdir`` in the loaded data."""
    from dvc.dvcfile import Dvcfile

    stage_name = "copy-foo1-foo2"

    tmp_dir.gen({"dir": {"foo": "foo", "bar": "bar"}})
    stage = dvc.run(
        cmd="cp foo foo1",
        deps=["foo"],
        name=stage_name,
        outs=["foo1"],
        wdir="dir",
    )

    raw_data, _ = Dvcfile(dvc, stage.path)._load()
    assert raw_data["stages"][stage_name]["wdir"] == "dir"
Beispiel #18
0
def test_pull_imported_stage(tmp_dir, dvc, erepo_dir):
    """Pulling an import restores both the workspace file and its cache."""
    with erepo_dir.chdir():
        erepo_dir.dvc_gen("foo", "foo content", commit="create foo")
    dvc.imp(os.fspath(erepo_dir), "foo", "foo_imported")

    imported_stage = Dvcfile(dvc, "foo_imported.dvc").stage
    cached_copy = imported_stage.outs[0].cache_path

    # Delete both copies, then let pull bring them back.
    remove("foo_imported")
    remove(cached_copy)
    dvc.pull(["foo_imported.dvc"])

    assert os.path.isfile("foo_imported")
    assert os.path.isfile(cached_copy)
Beispiel #19
0
def test_remove_stage_removes_dvcfiles_if_no_stages_left(
        tmp_dir, dvc, run_copy):
    """Removing the only stage deletes the pipeline file and its lockfile."""
    tmp_dir.gen("foo", "foo")
    run_copy("foo", "bar", name="run_copy")

    pipeline_file = Dvcfile(dvc, PIPELINE_FILE)
    lock_path = tmp_dir / PIPELINE_LOCK

    # Preconditions: pipeline file, lockfile and source file all exist.
    assert pipeline_file.exists()
    assert lock_path.exists()
    assert (tmp_dir / "foo").exists()

    only_stage = pipeline_file.stages["run_copy"]
    pipeline_file.remove_stage(only_stage)

    assert not pipeline_file.exists()
    assert not lock_path.exists()
Beispiel #20
0
def test_multistage_always_changed(tmp_dir, dvc):
    """``always_changed=True`` is truthy in the stage's loaded entry."""
    from dvc.dvcfile import Dvcfile

    stage_name = "copy-foo1-foo2"

    tmp_dir.gen({"foo": "foo", "bar": "bar"})
    stage = dvc.run(
        cmd="cp foo foo1",
        deps=["foo"],
        name=stage_name,
        outs=["foo1"],
        always_changed=True,
    )

    raw_data, _ = Dvcfile(dvc, stage.path)._load()
    assert raw_data["stages"][stage_name]["always_changed"]
Beispiel #21
0
def imp_url(self,
            url,
            out=None,
            fname=None,
            erepo=None,
            frozen=True,
            no_exec=False):
    """Create, optionally run, and dump a stage that imports ``url``.

    Args:
        url: source to import; used as the stage's only dependency.
        out: destination, resolved via ``resolve_output(url, out)``.
        fname: stage file path; falls back to the resolved path when falsy.
        erepo: forwarded to ``create_stage``.  When it is ``None`` and
            ``url`` is an existing path inside ``self.root_dir``, ``url`` is
            rewritten relative to the stage's working directory.
        frozen: value assigned to ``stage.frozen`` before dumping.
        no_exec: when true the stage is not run; ``stage.ignore_outs()`` is
            called instead.

    Returns:
        The created stage, or ``None`` when ``create_stage`` returns
        ``None``.

    Raises:
        OutputDuplicationError: re-raised from ``check_modified_graph`` with
            the new stage removed from the reported stages.
    """
    from dvc.dvcfile import Dvcfile
    from dvc.stage import Stage, create_stage

    out = resolve_output(url, out)
    path, wdir, out = resolve_paths(self, out)

    # NOTE: when user is importing something from within their own repository
    if (erepo is None and os.path.exists(url)
            and path_isin(os.path.abspath(url), self.root_dir)):
        url = relpath(url, wdir)

    stage = create_stage(
        Stage,
        self,
        fname or path,
        wdir=wdir,
        deps=[url],
        outs=[out],
        erepo=erepo,
    )

    if stage is None:
        return None

    dvcfile = Dvcfile(self, stage.path)
    dvcfile.remove()

    try:
        self.check_modified_graph([stage])
    except OutputDuplicationError as exc:
        # Report only the conflicting stages, not the one being created, and
        # chain explicitly so the original error is kept as the cause.
        raise OutputDuplicationError(
            exc.output, set(exc.stages) - {stage}
        ) from exc

    if no_exec:
        stage.ignore_outs()
    else:
        stage.run()

    stage.frozen = frozen

    dvcfile.dump(stage)

    return stage
Beispiel #22
0
def test_gc_not_collect_pipeline_tracked_files(tmp_dir, dvc, run_copy):
    """``gc`` keeps the cached file until the pipeline file is removed."""
    from dvc.dvcfile import PIPELINE_FILE, Dvcfile

    def cached_file_count():
        return _count_files(dvc.cache.local.cache_dir)

    tmp_dir.gen("foo", "foo")
    tmp_dir.gen("bar", "bar")

    run_copy("foo", "foo2", name="copy")
    assert cached_file_count() == 1
    dvc.gc(workspace=True, force=True)
    assert cached_file_count() == 1

    # remove pipeline file and lockfile and check
    Dvcfile(dvc, PIPELINE_FILE).remove(force=True)
    dvc.gc(workspace=True, force=True)
    assert cached_file_count() == 0
Beispiel #23
0
    def _collect_stages(self):
        """Walk ``self.root_dir`` and load the stages of every DVC file found.

        While walking, the ``fspath`` of every output whose scheme is
        ``"local"`` is remembered, and sub-directories whose path equals one
        of them are filtered out of ``dirs``.

        Returns:
            List of all stages loaded from the DVC files visited.
        """
        from dvc.dvcfile import Dvcfile, is_valid_filename

        stages = []
        outs = set()

        for root, dirs, files in self.tree.walk(self.root_dir):
            for file_name in filter(is_valid_filename, files):
                path = os.path.join(root, file_name)
                new_stages = list(Dvcfile(self, path).stages.values())
                stages.extend(new_stages)
                # Only the stages just loaded can add new outputs; rescanning
                # every stage collected so far made this loop quadratic.
                outs.update(
                    out.fspath
                    for stage in new_stages
                    for out in stage.outs
                    if out.scheme == "local"
                )
            # In-place edit of ``dirs``; presumably, as with ``os.walk``, this
            # stops the walk from descending into output directories.
            dirs[:] = [d for d in dirs if os.path.join(root, d) not in outs]
        return stages
Beispiel #24
0
def test_remove_stage_dvcfiles(tmp_dir, dvc, run_copy):
    """``remove_stage`` deletes the file and tolerates repeated calls."""
    tmp_dir.gen("foo", "foo")
    copy_stage = run_copy("foo", "bar", single_stage=True)

    stage_file = Dvcfile(dvc, copy_stage.path)
    assert stage_file.exists()
    stage_file.remove_stage(copy_stage)
    assert not stage_file.exists()

    # re-check to see if it fails if there's no stage entry
    stage_file.remove_stage(copy_stage)
    stage_file.remove(force=True)

    # should not fail when there's no file at all.
    stage_file.remove_stage(copy_stage)
Beispiel #25
0
def test_meta_is_preserved(tmp_dir, dvc):
    """A ``meta`` section survives a load/dump round trip via ``Dvcfile``."""
    (stage, ) = tmp_dir.dvc_gen("foo", "foo content")
    stage_path = stage.path

    # Add meta to DVC-file
    original = load_stage_file(stage_path)
    original["meta"] = {"custom_key": 42}
    dump_stage_file(stage_path, original)

    # Loading and dumping to test that it works and meta is retained
    stage_file = Dvcfile(dvc, stage_path)
    reloaded_stage = stage_file.load()
    stage_file.dump(reloaded_stage)

    round_tripped = load_stage_file(stage_path)
    assert round_tripped["meta"] == original["meta"]
Beispiel #26
0
def test_parametrization_flag_when_enabled(tmp_dir, dvc, mocker):
    """With the feature flag on, listing stages resolves the template once."""
    dvc.config["feature"]["parametrization"] = True

    resolve_mock = mocker.patch(
        "dvc.dvcfile.DataResolver.resolve", return_value=RESOLVED_DVC_YAML_DATA
    )

    pipeline_file = Dvcfile(dvc, "dvc.yaml")
    # Feed templated data to the loader instead of reading a real file.
    mocker.patch.object(
        pipeline_file, "_load", return_value=[TEMPLATED_DVC_YAML_DATA, None]
    )

    stage_names = list(pipeline_file.stages)

    resolve_mock.assert_called_once()
    assert len(stage_names) == 2
Beispiel #27
0
    def test_nested(self):
        """Reproducing a stage file located inside another stage's output
        directory raises ``StagePathAsOutputError``."""
        #       .
        #       |-- a
        #       |  |__ nested
        #       |     |__ dir
        #       |       |__ error.dvc     (stage.cwd == 'a/nested/dir')
        #       |__ b
        #          |__ b.dvc              (stage.out == 'a/nested')
        dir1 = "b"
        dir2 = "a"
        for directory in (dir1, dir2):
            os.mkdir(directory)

        nested_dir = os.path.join(dir2, "nested")
        out_dir = relpath(nested_dir, dir1)

        nested_stage = self.dvc.run(
            fname=os.path.join(dir1, "b.dvc"),
            wdir=dir1,
            outs=[out_dir],  # ../a/nested
            cmd=f"mkdir {out_dir}",
            single_stage=True,
        )

        error_dir = os.path.join(nested_dir, "dir")
        os.mkdir(error_dir)
        error_stage_path = os.path.join(error_dir, "error.dvc")

        output = os.path.join("..", "..", "something")
        dump_yaml(
            error_stage_path,
            {
                "cmd": f"echo something > {output}",
                "outs": [{
                    "path": output
                }],
            },
        )

        # NOTE: os.walk() walks in a sorted order and we need dir2 subdirs to
        # be processed before dir1 to load error.dvc first.
        error_stage = Dvcfile(self.dvc, error_stage_path).stage
        self.dvc.index = self.dvc.index.update([nested_stage, error_stage])

        # Patch `_reset` to prevent `stages` resetting.
        with patch.object(self.dvc, "_reset"), self.assertRaises(
            StagePathAsOutputError
        ):
            self.dvc.reproduce(error_stage_path)
Beispiel #28
0
def modify(repo, path, delete=False):
    """Verify the metric of the single output at ``path`` and dump its stage.

    Args:
        repo: repository used to look up the output and build the Dvcfile.
        path: path that must match exactly one output.
        delete: when true, the output's ``metric`` is set to ``None`` before
            verification.

    Raises:
        DvcException: if the output's scheme is not ``"local"``.
    """
    matching = repo.find_outs_by_path(path)
    assert len(matching) == 1
    out = matching[0]

    if out.scheme != "local":
        raise DvcException(
            "output '{}' scheme '{}' is not supported for metrics".format(
                out.path, out.path_info.scheme
            )
        )

    if delete:
        out.metric = None

    out.verify_metric()

    owner = out.stage
    Dvcfile(repo, owner.path).dump(owner)
Beispiel #29
0
def test_run_multi_stage_repeat(tmp_dir, dvc, run_copy):
    """Only the named runs appear as stages of the pipeline file."""
    from dvc.dvcfile import PIPELINE_FILE, Dvcfile
    from dvc.stage import PipelineStage

    tmp_dir.dvc_gen("foo", "foo")
    run_copy("foo", "foo1", name="copy-foo-foo1")
    run_copy("foo1", "foo2", name="copy-foo1-foo2")
    run_copy("foo2", "foo3", single_stage=True)

    pipeline_stages = list(Dvcfile(dvc, PIPELINE_FILE).stages.values())
    stage_names = {stage.name for stage in pipeline_stages}

    assert len(pipeline_stages) == 2
    assert all(isinstance(stage, PipelineStage) for stage in pipeline_stages)
    assert stage_names == {
        "copy-foo-foo1",
        "copy-foo1-foo2",
    }
Beispiel #30
0
    def test_ignored_in_checksum(self):
        """``wdir`` is absent from the dumped stage, which reloads unchanged."""
        stage = self.dvc.run(
            cmd=f"echo test > {self.FOO}",
            deps=[self.BAR],
            outs=[self.FOO],
        )

        # Neither the in-memory dump nor the file on disk has a wdir key.
        in_memory = stage.dumpd()
        self.assertNotIn(Stage.PARAM_WDIR, in_memory.keys())

        on_disk = load_stage_file(stage.relpath)
        self.assertNotIn(Stage.PARAM_WDIR, on_disk.keys())

        with self.dvc.lock, self.dvc.state:
            reloaded = Dvcfile(self.dvc, stage.relpath).load()
            self.assertFalse(reloaded.changed())