Example #1
class TestCheckoutCorruptedCacheDir(TestDvc):
    def test(self):
        # NOTE: using 'copy' so that cache and link don't have same inode
        ret = main(["config", "cache.type", "copy"])
        self.assertEqual(ret, 0)

        self.dvc = DvcRepo(".")
        stages = self.dvc.add(self.DATA_DIR)
        self.assertEqual(len(stages), 1)
        self.assertEqual(len(stages[0].outs), 1)
        out = stages[0].outs[0]

        # NOTE: modifying cache file for one of the files inside the directory
        # to check if dvc will detect that the cache is corrupted.
        # NOTE: dict views are not iterators, so wrap in iter() before next()
        _, entry_hash = next(
            iter(self.dvc.cache.local.load_dir_cache(out.hash_info).items()))
        cache = os.fspath(
            self.dvc.cache.local.tree.hash_to_path_info(entry_hash.value))

        os.chmod(cache, 0o644)
        with open(cache, "w+") as fobj:
            fobj.write("1")

        with pytest.raises(CheckoutError):
            self.dvc.checkout(force=True)

        self.assertFalse(os.path.exists(cache))
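A note on why this test works: the DVC cache is content-addressed, so a file's path under .dvc/cache encodes its own hash, and checkout can detect tampering by re-hashing the file and comparing the digest with that path. A minimal sketch of such a check (not DVC's actual code), assuming the DVC 1.x MD5-based layout where the first two hex digits form the subdirectory:

import hashlib
import os

def cache_entry_is_intact(cache_path):
    """Re-hash a cache file and compare with the hash encoded in its path."""
    parent, name = os.path.split(cache_path)
    expected_md5 = os.path.basename(parent) + name  # 'ab' + 'cdef...'
    with open(cache_path, "rb") as fobj:
        actual_md5 = hashlib.md5(fobj.read()).hexdigest()
    return actual_md5 == expected_md5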
Example #2
class TestCheckoutCorruptedCacheDir(TestDvc):
    def test(self):
        # NOTE: using 'copy' so that cache and link don't have same inode
        ret = main(["config", "cache.type", "copy"])
        self.assertEqual(ret, 0)

        self.dvc = DvcRepo(".")
        stages = self.dvc.add(self.DATA_DIR)
        self.assertEqual(len(stages), 1)
        self.assertEqual(len(stages[0].outs), 1)
        out = stages[0].outs[0]

        # NOTE: modifying cache file for one of the files inside the directory
        # to check if dvc will detect that the cache is corrupted.
        entry = self.dvc.cache.local.load_dir_cache(out.checksum)[0]
        checksum = entry[self.dvc.cache.local.PARAM_CHECKSUM]
        cache = self.dvc.cache.local.get(checksum)

        os.chmod(cache, 0o644)
        with open(cache, "w+") as fobj:
            fobj.write("1")

        with pytest.raises(CheckoutError):
            self.dvc.checkout(force=True)

        self.assertFalse(os.path.exists(cache))
Example #3
    def reproduce(dvc_dir, cwd=None, **kwargs):
        """Run dvc repro and return the result."""
        from dvc.repo import Repo
        from dvc.repo.experiments import hash_exp

        unchanged = []

        def filter_pipeline(stage):
            if isinstance(stage, PipelineStage):
                unchanged.append(stage)

        if cwd:
            old_cwd = os.getcwd()
            os.chdir(cwd)
        else:
            old_cwd = None
            cwd = os.getcwd()

        try:
            logger.debug("Running repro in '%s'", cwd)
            dvc = Repo(dvc_dir)
            dvc.checkout()
            stages = dvc.reproduce(on_unchanged=filter_pipeline, **kwargs)
        finally:
            if old_cwd is not None:
                os.chdir(old_cwd)

        # ideally we would return stages here like a normal repro() call, but
        # stages is not currently picklable and cannot be returned across
        # multiprocessing calls
        return hash_exp(stages + unchanged)
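The closing comment is the key constraint: stages cannot cross the process boundary, so only a digest of them is returned. hash_exp is imported from dvc.repo.experiments and is not shown here; a plausible stand-in (an assumption, not DVC's implementation) that reduces stages to one stable hash could look like this, where the stage attributes used are hypothetical:

import hashlib
import json

def hash_exp_sketch(stages):
    """Reduce reproduced stages to a single deterministic digest."""
    # 'addressing' and 'md5' are assumed stage attributes for illustration
    data = sorted((stage.addressing, stage.md5) for stage in stages)
    return hashlib.sha256(json.dumps(data).encode()).hexdigest()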
Example #4
    def reproduce(dvc_dir, cwd=None, **kwargs):
        """Run dvc repro and return the result."""
        from dvc.repo import Repo
        from dvc.repo.experiments import hash_exp

        unchanged = []

        def filter_pipeline(stages):
            unchanged.extend([
                stage for stage in stages if isinstance(stage, PipelineStage)
            ])

        if cwd:
            old_cwd = os.getcwd()
            os.chdir(cwd)
        else:
            old_cwd = None
            cwd = os.getcwd()

        try:
            logger.debug("Running repro in '%s'", cwd)
            dvc = Repo(dvc_dir)

            # NOTE: for checkpoint experiments we handle persist outs slightly
            # differently than normal:
            #
            # - checkpoint out may not yet exist if this is the first time this
            #   experiment has been run, this is not an error condition for
            #   experiments
            # - at the start of a repro run, we need to remove the persist out
            #   and restore it to its last known (committed) state (which may
            #   be removed/does not yet exist) so that our executor workspace
            #   is not polluted with the (persistent) out from an unrelated
            #   experiment run
            checkpoint = kwargs.pop("checkpoint", False)
            dvc.checkout(allow_missing=checkpoint,
                         force=checkpoint,
                         quiet=checkpoint)
            stages = dvc.reproduce(
                on_unchanged=filter_pipeline,
                allow_missing=checkpoint,
                **kwargs,
            )
        finally:
            if old_cwd is not None:
                os.chdir(old_cwd)

        # ideally we would return stages here like a normal repro() call, but
        # stages is not currently picklable and cannot be returned across
        # multiprocessing calls
        return hash_exp(stages + unchanged)
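For illustration, a hedged usage sketch (paths hypothetical): passing checkpoint=True both tolerates a missing persist out on the first run and force-restores the workspace to its committed state before reproducing.

exp_hash = reproduce("/path/to/repo/.dvc", cwd="/path/to/repo",
                     checkpoint=True)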
Example #5
class TestReproExternalBase(TestDvc):
    def should_test(self):
        return False

    @property
    def cache_scheme(self):
        return self.scheme

    @property
    def cache_type(self):
        return "copy"

    @property
    def scheme(self):
        return None

    @property
    def scheme_sep(self):
        return "://"

    @property
    def sep(self):
        return "/"

    def check_already_cached(self, stage):
        stage.outs[0].remove()

        patch_download = patch.object(stage.deps[0],
                                      "download",
                                      wraps=stage.deps[0].download)

        patch_checkout = patch.object(stage.outs[0],
                                      "checkout",
                                      wraps=stage.outs[0].checkout)

        patch_run = patch.object(stage, "_run", wraps=stage._run)

        with self.dvc.state:
            with patch_download as mock_download:
                with patch_checkout as mock_checkout:
                    with patch_run as mock_run:
                        stage.locked = False
                        stage.run()
                        stage.locked = True

                        mock_run.assert_not_called()
                        mock_download.assert_not_called()
                        mock_checkout.assert_called_once()

    @patch("dvc.prompt.confirm", return_value=True)
    def test(self, mock_prompt):
        if not self.should_test():
            return

        cache = (self.scheme + self.scheme_sep + self.bucket + self.sep +
                 str(uuid.uuid4()))

        ret = main(["config", "cache." + self.cache_scheme, "myrepo"])
        self.assertEqual(ret, 0)
        ret = main(["remote", "add", "myrepo", cache])
        self.assertEqual(ret, 0)
        ret = main(["remote", "modify", "myrepo", "type", self.cache_type])
        self.assertEqual(ret, 0)

        remote_name = "myremote"
        remote_key = str(uuid.uuid4())
        remote = (self.scheme + self.scheme_sep + self.bucket + self.sep +
                  remote_key)

        ret = main(["remote", "add", remote_name, remote])
        self.assertEqual(ret, 0)
        ret = main(["remote", "modify", remote_name, "type", self.cache_type])
        self.assertEqual(ret, 0)

        self.dvc = DvcRepo(".")

        foo_key = remote_key + self.sep + self.FOO
        bar_key = remote_key + self.sep + self.BAR

        foo_path = (self.scheme + self.scheme_sep + self.bucket + self.sep +
                    foo_key)
        bar_path = (self.scheme + self.scheme_sep + self.bucket + self.sep +
                    bar_key)

        # Using both plain and remote notation
        out_foo_path = "remote://" + remote_name + "/" + self.FOO
        out_bar_path = bar_path

        self.write(self.bucket, foo_key, self.FOO_CONTENTS)

        import_stage = self.dvc.imp_url(out_foo_path, "import")

        self.assertTrue(os.path.exists("import"))
        self.assertTrue(filecmp.cmp("import", self.FOO, shallow=False))
        self.assertEqual(self.dvc.status([import_stage.path]), {})
        self.check_already_cached(import_stage)

        import_remote_stage = self.dvc.imp_url(out_foo_path,
                                               out_foo_path + "_imported")
        self.assertEqual(self.dvc.status([import_remote_stage.path]), {})

        cmd_stage = self.dvc.run(
            outs=[out_bar_path],
            deps=[out_foo_path],
            cmd=self.cmd(foo_path, bar_path),
        )

        self.assertEqual(self.dvc.status([cmd_stage.path]), {})
        self.assertEqual(self.dvc.status(), {})
        self.check_already_cached(cmd_stage)

        self.write(self.bucket, foo_key, self.BAR_CONTENTS)

        self.assertNotEqual(self.dvc.status(), {})

        self.dvc.update(import_stage.path)
        self.assertTrue(os.path.exists("import"))
        self.assertTrue(filecmp.cmp("import", self.BAR, shallow=False))
        self.assertEqual(self.dvc.status([import_stage.path]), {})

        self.dvc.update(import_remote_stage.path)
        self.assertEqual(self.dvc.status([import_remote_stage.path]), {})

        stages = self.dvc.reproduce(cmd_stage.path)
        self.assertEqual(len(stages), 1)
        self.assertEqual(self.dvc.status([cmd_stage.path]), {})

        self.assertEqual(self.dvc.status(), {})
        self.dvc.gc()
        self.assertEqual(self.dvc.status(), {})

        self.dvc.remove(cmd_stage.path, outs_only=True)
        self.assertNotEqual(self.dvc.status([cmd_stage.path]), {})

        self.dvc.checkout([cmd_stage.path], force=True)
        self.assertEqual(self.dvc.status([cmd_stage.path]), {})
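TestReproExternalBase is abstract: should_test() returns False and scheme is None, so the test body is skipped until a subclass fills in the remote details. A hypothetical subclass sketch (the bucket name, cmd, and write implementations are assumptions; DVC's test suite ships one real subclass per remote type):

class TestReproExternalS3(TestReproExternalBase):
    def should_test(self):
        return False  # flip to True only when S3 credentials are available

    @property
    def scheme(self):
        return "s3"

    @property
    def bucket(self):
        return "my-test-bucket"  # assumed test bucket

    def cmd(self, i, o):
        return "aws s3 cp {} {}".format(i, o)

    def write(self, bucket, key, body):
        import boto3  # assumed dependency

        boto3.client("s3").put_object(Bucket=bucket, Key=key, Body=body)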
Example #6
    def reproduce(
        cls,
        dvc_dir: str,
        queue: "Queue",
        rev: str,
        cwd: Optional[str] = None,
        name: Optional[str] = None,
    ) -> Tuple[Optional[str], bool]:
        """Run dvc repro and return the result.

        Returns tuple of (exp_hash, force) where exp_hash is the experiment
            hash (or None on error) and force is a bool specifying whether or
            not this experiment should force overwrite any existing duplicates.
        """
        unchanged = []

        queue.put((rev, os.getpid()))

        def filter_pipeline(stages):
            unchanged.extend([
                stage for stage in stages if isinstance(stage, PipelineStage)
            ])

        result = None
        force = False
        # initialize names referenced in the finally block so that an early
        # failure (e.g. inside Repo()) does not raise NameError there
        scm = None
        old_cwd = None

        try:
            dvc = Repo(dvc_dir)
            old_cwd = os.getcwd()
            new_cwd = cwd if cwd else dvc.root_dir
            os.chdir(new_cwd)
            logger.debug("Running repro in '%s'", cwd)

            args_path = os.path.join(dvc.tmp_dir,
                                     BaseExecutor.PACKED_ARGS_FILE)
            if os.path.exists(args_path):
                args, kwargs = BaseExecutor.unpack_repro_args(args_path)
                remove(args_path)
            else:
                args = []
                kwargs = {}

            force = kwargs.get("force", False)

            # NOTE: for checkpoint experiments we handle persist outs slightly
            # differently than normal:
            #
            # - checkpoint out may not yet exist if this is the first time this
            #   experiment has been run, this is not an error condition for
            #   experiments
            # - at the start of a repro run, we need to remove the persist out
            #   and restore it to its last known (committed) state (which may
            #   be removed/does not yet exist) so that our executor workspace
            #   is not polluted with the (persistent) out from an unrelated
            #   experiment run
            dvc.checkout(force=True, quiet=True)

            # We cannot use dvc.scm to make commits inside the executor since
            # cached props are not picklable.
            scm = Git()
            checkpoint_func = partial(cls.checkpoint_callback, scm, name)
            stages = dvc.reproduce(
                *args,
                on_unchanged=filter_pipeline,
                checkpoint_func=checkpoint_func,
                **kwargs,
            )

            exp_hash = cls.hash_exp(stages)
            exp_rev = cls.commit(scm, exp_hash, exp_name=name)
            if scm.get_ref(EXEC_CHECKPOINT):
                scm.set_ref(EXEC_CHECKPOINT, exp_rev)
            # the docstring promises exp_hash in the first slot of the
            # returned tuple; propagate it
            result = exp_hash
        except UnchangedExperimentError:
            pass
        finally:
            if scm:
                scm.close()
                del scm
            if old_cwd:
                os.chdir(old_cwd)

        # ideally we would return stages here like a normal repro() call, but
        # stages is not currently picklable and cannot be returned across
        # multiprocessing calls
        return result, force
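The packed-args handshake between the parent process and this executor is not shown above. A minimal sketch under the assumption that PACKED_ARGS_FILE simply holds a pickled (args, kwargs) pair, which would explain why it can be deleted right after unpacking:

import pickle

def pack_repro_args_sketch(path, *args, **kwargs):
    with open(path, "wb") as fobj:
        pickle.dump({"args": args, "kwargs": kwargs}, fobj)

def unpack_repro_args_sketch(path):
    with open(path, "rb") as fobj:
        data = pickle.load(fobj)
    return data["args"], data["kwargs"]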
Example #7
def main():
    argv = sys.argv[1:]
    if '-h' in argv or '--help' in argv or len(argv) == 0:
        print(DESCRIPTION)
        print()
        print('dvc-cc git branch:')
        print(
            '\tShows the branches without the automatically created branches from DVC-CC.'
        )
        print('dvc-cc git sync [-d] [-l]:')
        print('\tCreate local branches for all remote branches.')
        print('\t\t-d: Also download all files from the DVC server.')
        print(
            '\t\t-l: If this is set, the script repeats every 20 seconds.'
        )
        print('\t\t\tYou can cancel it with CTRL+C.')
        print('dvc-cc git OTHER_GIT_COMMAND:')
        print(
            '\tEvery other git command is piped directly to git. Afterwards it runs '
            + bcolors.OKBLUE + 'dvc checkout' + bcolors.ENDC)
        print('\t\tto clean up the repository.')
    elif len(argv) == 1 and sys.argv[1] == 'branch':
        git_branch = check_output(['git', 'branch']).decode("utf8").split('\n')
        for line in git_branch:
            if not line.startswith(('  rcc_', '  remotes/origin/rcc_',
                                    '  cc_', '  remotes/origin/cc_')):
                print(line)
    elif sys.argv[1] == 'sync':
        repo = DVCRepo()

        # argv[0] is 'sync', so the '-d' flag can sit at argv[1] or argv[2]
        if (len(argv) >= 2 and argv[1] == '-d') or (len(argv) == 3
                                                    and argv[2] == '-d'):
            remote_name = repo.config['core']['remote']
            remote_settings = repo.config['remote'][remote_name]
            if 'ask_password' in remote_settings and remote_settings[
                    'ask_password']:
                remote_settings['password'] = getpass.getpass(
                    'Password for ' + remote_settings['url'] + ': ')
                remote_settings['ask_password'] = False

        git_name_of_branch = get_name_of_branch()

        loop = (len(argv) >= 2 and argv[1] == '-l') or (len(argv) == 3
                                                        and argv[2] == '-l')

        nothing_to_stash = check_output(
            ['git', 'stash']).decode().startswith('No local changes to save')

        subprocess.call(['git', 'fetch', '--all'])

        try:
            is_first_iteration = True
            while loop or is_first_iteration:

                if not is_first_iteration:
                    print(
                        'All remote branches were created locally. Waiting 5 seconds before the next pull. To cancel the script press CTRL+C.'
                    )
                    time.sleep(5)
                is_first_iteration = False

                _ = check_output(["git", "pull"]).decode("utf8").split("\n")

                all_branches = check_output(["git", "branch",
                                             '-a']).decode("utf8").split("\n")
                all_branches_local = [
                    i[2:] for i in all_branches if len(i.split('/')) == 1
                ]
                all_branches_remote = [
                    i.split('/')[-1] for i in all_branches
                    if len(i.split('/')) > 1
                ]

                for b in all_branches_remote:
                    if b not in all_branches_local:
                        print('git checkout ' + b)
                        _ = check_output(['git', 'checkout', b])

                        print('\t\tI CHECKOUT THE DATA')

                        if len(argv) >= 2 and argv[1] == '-d':
                            try:
                                repo.checkout()
                            except Exception:
                                print('Some files are missing.')

                            print('\t\tI PULL THE DATA')
                            try:
                                repo.pull()
                            except Exception:
                                print('Some files are missing.')
        finally:
            print('git checkout ' + git_name_of_branch)
            _ = check_output(['git', 'checkout', git_name_of_branch])
            try:
                repo.checkout()
            except Exception:
                print('Some files are missing.')
            try:
                repo.pull()
            except Exception:
                print('Some files are missing.')
            if not nothing_to_stash:
                _ = check_output(['git', 'stash', 'apply'])
    else:
        subprocess.call(['git'] + argv)
        try:
            subprocess.call(['dvc', 'checkout'])
        except Exception:
            print('Some files are missing.')
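The branch filter above hinges on one naming convention: branches created by DVC-CC carry a 'cc_' or 'rcc_' prefix, locally and under remotes/origin/. A standalone sketch of the same predicate, written against raw `git branch` output lines:

def is_dvc_cc_branch(line):
    """True for lines like '  rcc_1_run' or '  remotes/origin/cc_base'."""
    name = line.strip().lstrip('* ').split('/')[-1]
    return name.startswith(('cc_', 'rcc_'))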
Example #8
def main():
    parser = ArgumentParser(description=DESCRIPTION)
    parser.add_argument(
        '-f',
        '--regex-name-of-file',
        type=str,
        default=None,
        help='A regex of the name of the files that you want to find.')
    parser.add_argument(
        '-ef',
        '--exclude-regex-name-of-file',
        type=str,
        default=None,
        help='A regex of file names to exclude.')
    parser.add_argument(
        '-b',
        '--regex-name-of-branch',
        type=str,
        default=None,
        help='A regex of the name of the branches to be included in the search.'
    )
    parser.add_argument(
        '-eb',
        '--exclude-regex-name-of-branch',
        type=str,
        default=None,
        help='A regex of branch names to exclude.')
    parser.add_argument(
        '-pos',
        '--list-of-pos',
        help=
        'A list of dvc-cc indices that you want to include in the display. You can also use slicing, for example 12:15:2 to select 12 and 14.',
        nargs="+",
        type=str)
    parser.add_argument('-p',
                        '--path-to-output',
                        type=str,
                        default=None,
                        help='The path where you want to save the files.')
    parser.add_argument(
        '-o',
        '--original-name',
        dest='original_name',
        action='store_true',
        default=False,
        help=
        'By default, the branch name is added to the file or folder name. If this parameter is '
        'set, the original name of the file or folder is used. If the file exists multiple '
        'times and this parameter is set, indices are appended to the file or folder names.'
    )
    parser.add_argument('--debug',
                        dest='debug',
                        action='store_true',
                        default=False,
                        help='Print all files that are copied.')
    parser.add_argument(
        '-d',
        '--download-stages',
        dest='download_stages',
        action='store_true',
        default=False,
        help='Download a stage if the file is not in the local cache.')
    parser.add_argument(
        '-fd',
        '--forbid-dir',
        dest='forbid_dir',
        action='store_true',
        default=False,
        help='If this parameter is set, then it will ignore output folders.')
    parser.add_argument(
        '-ns',
        '--no-save',
        dest='no_save',
        action='store_true',
        default=False,
        help=
        'If true, it will not create a folder or link the file. This parameter is helpful together with --debug to test your regular expressions.'
    )
    parser.add_argument(
        '-nw',
        '--no-print-of-warnings',
        dest='no_warning',
        action='store_true',
        default=False,
        help=
        'If true, it will not print a warning when a file is not created or not in the local cache.'
    )
    args = parser.parse_args()

    repo = DVCRepo()
    g = Git()
    # the current branch is the line marked with '*' in `git branch` output
    starting_branch = g.branch().split('*')[1].split('\n')[0][1:]

    # Set the password only once!
    if args.download_stages:
        remote_name = repo.config['core']['remote']
        remote_settings = repo.config['remote'][remote_name]
        if 'ask_password' in remote_settings and remote_settings[
                'ask_password']:
            remote_settings['password'] = getpass.getpass(
                'Password for ' + remote_settings['url'] + ': ')
            remote_settings['ask_password'] = False

    if not args.no_save:
        path_to_output = create_output_dir(repo.root_dir, args.path_to_output)
        if path_to_output is None:
            exit(1)
    else:
        path_to_output = 'NONE'

    list_of_allowed_dvccc_ids = None

    if args.list_of_pos is not None:

        list_of_allowed_dvccc_ids = []
        for pos in args.list_of_pos:
            try:
                if pos.find(':') > -1:
                    pos = np.array(pos.split(':'), dtype=int)
                    list_of_allowed_dvccc_ids.extend(np.arange(*pos))
                else:
                    pos = int(pos)
                    if pos >= 0:
                        list_of_allowed_dvccc_ids.append(pos)
                    else:
                        raise ValueError(
                            'ERROR: The parameter ' + str(pos) +
                            ' from --list-of-pos must be positive.')
            except (ValueError, TypeError):
                raise ValueError(
                    'ERROR: The parameter ' + str(pos) +
                    ' from --list-of-pos must be an integer or a slice, e.g. 12 or 12:15:2.'
                )

        list_of_allowed_dvccc_ids = np.array(list_of_allowed_dvccc_ids)
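        # Worked example (illustrative): '--list-of-pos 3 12:15:2' appends 3
        # via the plain-integer branch and np.arange(12, 15, 2) -> [12, 14]
        # via the slice branch, so the allowed ids become [3, 12, 14].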

    try:
        file_counter = 0
        saved_files = {}
        for branch in repo.brancher(all_branches=True):
            outs = []
            branch_names = []
            if branch.lower() != 'working tree':

                # check if this is a result branch:
                is_dvccc_result_branch = branch.startswith('rcc_')

                # search for all output files in the current branch
                is_branch_of_interest1 = args.regex_name_of_branch is None or re.match(
                    args.regex_name_of_branch, branch)
                is_branch_of_interest2 = args.exclude_regex_name_of_branch is None or not re.match(
                    args.exclude_regex_name_of_branch, branch)

                is_allowed_dvccc_id = True
                if list_of_allowed_dvccc_ids is not None and is_dvccc_result_branch:
                    if not int(
                            branch.split('_')[1]) in list_of_allowed_dvccc_ids:
                        is_allowed_dvccc_id = False

                if is_branch_of_interest1 and is_branch_of_interest2 and is_dvccc_result_branch and is_allowed_dvccc_id:
                    print(branch)
                    g.checkout(branch)
                    # TODO: This would be nice, but it's too slow!
                    try:
                        repo.checkout()
                    except Exception:
                        print('Some files are missing.')

                    print('\tIt is a branch of interest!')
                    # TODO: repo.stages is very slow!
                    for stage in repo.stages:
                        for out in stage.outs:
                            valid_msg = check_out_if_its_valid(
                                out, args.regex_name_of_file,
                                args.exclude_regex_name_of_file,
                                not args.forbid_dir)
                            print('\t\t\t', out, valid_msg)
                            if valid_msg == 'not_in_local_cache' and args.download_stages:
                                g.pull()
                                try:
                                    repo.pull(stage.relpath)
                                except Exception:
                                    print('Some files are missing.')
                                time.sleep(1)
                                valid_msg = check_out_if_its_valid(
                                    out, args.regex_name_of_file,
                                    args.exclude_regex_name_of_file,
                                    not args.forbid_dir)
                                print(valid_msg)
                            if valid_msg == 'valid':
                                outs.append(out)
                                branch_names.append(branch)
                            elif valid_msg == 'not_created' and not args.no_warning:
                                print(
                                    'Warning: An output file of interest has not yet been created. '
                                    + '(file: ' + str(out) + '; branch: ' +
                                    branch + ')')
                            elif valid_msg == 'not_in_local_cache' and not args.no_warning:
                                print(
                                    'Warning: An output file of interest is not in the local cache. '
                                    + '(file: ' + out.cache_path +
                                    '; branch: ' + branch +
                                    ')\n You can use this script with -d to download the missing stage.'
                                )

                # create a link for each output file of interest in the current branch
                for out, branch_name in zip(outs, branch_names):
                    # create the output file name
                    if not args.original_name:
                        out_filename = branch_name + '_' + str(out).replace(
                            '/', '_').replace('\\\\', '_')
                    else:
                        out_filename = str(out).replace('/', '_').replace(
                            '\\\\', '_')
                    out_filepath = os.path.join(repo.root_dir, path_to_output,
                                                out_filename)

                    file_was_already_saved = False
                    renamer_index = 2
                    file_can_be_saved = False
                    tmp_out_filepath = out_filepath

                    while not file_can_be_saved and not file_was_already_saved:
                        if tmp_out_filepath not in saved_files:
                            file_can_be_saved = True
                            out_filepath = tmp_out_filepath
                            saved_files[out_filepath] = out.checksum
                        elif saved_files[tmp_out_filepath] == out.checksum:
                            file_was_already_saved = True
                        else:
                            tmp_out_filepath = out_filepath + '_' + str(
                                renamer_index)
                            renamer_index += 1
                    if file_can_be_saved:
                        if args.debug:
                            print(out.cache_path, ' -> ', out_filepath)
                        if not args.no_save:
                            if out.isfile():
                                os.link(out.cache_path, out_filepath)
                            elif out.isdir():
                                os.mkdir(out_filepath)
                                for cache in out.dir_cache:
                                    dirfile_cache_path = repo.cache.local.get(
                                        cache['md5'])
                                    dirfile_outpath = os.path.join(
                                        out_filepath, cache['relpath'])
                                    os.makedirs(
                                        os.path.dirname(dirfile_outpath),
                                        exist_ok=True)
                                    os.link(dirfile_cache_path,
                                            dirfile_outpath)

                        file_counter += 1

        print(
            str(file_counter) + ' files were linked to ' + path_to_output + '.')

    # always return to the starting branch!
    finally:
        g.checkout(starting_branch)
        try:
            repo.checkout()
        except Exception:
            print('Some files are missing.')
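The while loop over saved_files above implements a checksum-aware rename policy: reuse the path for identical content, append _2, _3, ... for different content under the same name. A standalone sketch of the same policy:

def unique_path(path, checksum, saved):
    """Return a free path for this checksum, or None if an identical file
    (same checksum) was already saved under some candidate name."""
    candidate, index = path, 2
    while candidate in saved:
        if saved[candidate] == checksum:
            return None  # identical content already linked
        candidate = '{}_{}'.format(path, index)
        index += 1
    saved[candidate] = checksum
    return candidate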