class TestCheckoutCorruptedCacheDir(TestDvc):
    """Checkout must detect a corrupted cache file backing a directory
    entry, raise ``CheckoutError`` and purge the bad cache file."""

    def test(self):
        # NOTE: using 'copy' so that cache and link don't have same inode
        self.assertEqual(main(["config", "cache.type", "copy"]), 0)

        self.dvc = DvcRepo(".")
        stages = self.dvc.add(self.DATA_DIR)
        self.assertEqual(len(stages), 1)
        self.assertEqual(len(stages[0].outs), 1)
        out = stages[0].outs[0]

        # Corrupt the cached copy of one file inside the directory so that
        # checkout has to notice that content no longer matches the hash.
        local_cache = self.dvc.cache.local
        _, entry_hash = next(local_cache.load_dir_cache(out.hash_info).items())
        corrupted = os.fspath(
            local_cache.tree.hash_to_path_info(entry_hash.value)
        )
        os.chmod(corrupted, 0o644)
        with open(corrupted, "w+") as fobj:
            fobj.write("1")

        with pytest.raises(CheckoutError):
            self.dvc.checkout(force=True)

        # The corrupted cache entry must have been removed.
        self.assertFalse(os.path.exists(corrupted))
class TestCheckoutCorruptedCacheDir(TestDvc):
    """Checkout must detect a corrupted cache file backing a directory
    entry, raise ``CheckoutError`` and purge the bad cache file."""

    def test(self):
        # NOTE: using 'copy' so that cache and link don't have same inode
        self.assertEqual(main(["config", "cache.type", "copy"]), 0)

        self.dvc = DvcRepo(".")
        stages = self.dvc.add(self.DATA_DIR)
        self.assertEqual(len(stages), 1)
        self.assertEqual(len(stages[0].outs), 1)
        out = stages[0].outs[0]

        # Overwrite the cache file for the first entry of the directory so
        # its content no longer matches the recorded checksum.
        local_cache = self.dvc.cache.local
        first_entry = local_cache.load_dir_cache(out.checksum)[0]
        entry_checksum = first_entry[local_cache.PARAM_CHECKSUM]
        corrupted = local_cache.get(entry_checksum)
        os.chmod(corrupted, 0o644)
        with open(corrupted, "w+") as fobj:
            fobj.write("1")

        with pytest.raises(CheckoutError):
            self.dvc.checkout(force=True)

        # The corrupted cache entry must have been removed.
        self.assertFalse(os.path.exists(corrupted))
def reproduce(dvc_dir, cwd=None, **kwargs):
    """Run dvc repro and return the result."""
    from dvc.repo import Repo
    from dvc.repo.experiments import hash_exp

    unchanged_stages = []

    def collect_unchanged(stage):
        # Record pipeline stages that repro reports as unchanged so they
        # still contribute to the experiment hash computed below.
        if isinstance(stage, PipelineStage):
            unchanged_stages.append(stage)

    # Remember where we were so the working directory can be restored.
    old_cwd = os.getcwd() if cwd else None
    if cwd:
        os.chdir(cwd)
    else:
        cwd = os.getcwd()

    try:
        logger.debug("Running repro in '%s'", cwd)
        dvc = Repo(dvc_dir)
        dvc.checkout()
        stages = dvc.reproduce(on_unchanged=collect_unchanged, **kwargs)
    finally:
        if old_cwd is not None:
            os.chdir(old_cwd)

    # Ideally we would return stages here like a normal repro() call, but
    # stages is not currently picklable and cannot be returned across
    # multiprocessing calls, so return the experiment hash instead.
    return hash_exp(stages + unchanged_stages)
def reproduce(dvc_dir, cwd=None, **kwargs):
    """Run dvc repro and return the result."""
    from dvc.repo import Repo
    from dvc.repo.experiments import hash_exp

    unchanged_stages = []

    def collect_unchanged(stages):
        # Keep pipeline stages that repro skipped so they still take part
        # in the experiment hash computed below.
        unchanged_stages.extend(
            stage for stage in stages if isinstance(stage, PipelineStage)
        )

    # Remember where we were so the working directory can be restored.
    old_cwd = os.getcwd() if cwd else None
    if cwd:
        os.chdir(cwd)
    else:
        cwd = os.getcwd()

    try:
        logger.debug("Running repro in '%s'", cwd)
        dvc = Repo(dvc_dir)

        # NOTE: for checkpoint experiments we handle persist outs slightly
        # differently than normal:
        #
        # - checkpoint out may not yet exist if this is the first time this
        #   experiment has been run, this is not an error condition for
        #   experiments
        # - at the start of a repro run, we need to remove the persist out
        #   and restore it to its last known (committed) state (which may
        #   be removed/does not yet exist) so that our executor workspace
        #   is not polluted with the (persistent) out from an unrelated
        #   experiment run
        checkpoint = kwargs.pop("checkpoint", False)
        dvc.checkout(
            allow_missing=checkpoint, force=checkpoint, quiet=checkpoint
        )
        stages = dvc.reproduce(
            on_unchanged=collect_unchanged,
            allow_missing=checkpoint,
            **kwargs,
        )
    finally:
        if old_cwd is not None:
            os.chdir(old_cwd)

    # Ideally we would return stages here like a normal repro() call, but
    # stages is not currently picklable and cannot be returned across
    # multiprocessing calls, so return the experiment hash instead.
    return hash_exp(stages + unchanged_stages)
class TestReproExternalBase(TestDvc):
    """Base class for repro tests against an external (remote) location.

    Subclasses provide a concrete storage backend (scheme, bucket,
    ``write``, ``cmd``) and opt in by overriding ``should_test``.
    """

    def should_test(self):
        # Base class never runs; concrete subclasses return True.
        return False

    @property
    def cache_scheme(self):
        # Suffix for the "cache.<scheme>" config key used in test().
        return self.scheme

    @property
    def cache_type(self):
        return "copy"

    @property
    def scheme(self):
        # NOTE(review): None here would break the URL concatenation in
        # test(); subclasses are expected to supply a real scheme.
        return None

    @property
    def scheme_sep(self):
        # Separator between scheme and bucket, e.g. "s3" + "://" + bucket.
        return "://"

    @property
    def sep(self):
        # Path separator inside the remote.
        return "/"

    def check_already_cached(self, stage):
        """Re-run *stage* and assert it is restored purely from cache.

        The out is removed first; the stage must then be checked out from
        cache without re-running the command or re-downloading the dep.
        """
        stage.outs[0].remove()

        patch_download = patch.object(
            stage.deps[0], "download", wraps=stage.deps[0].download
        )

        patch_checkout = patch.object(
            stage.outs[0], "checkout", wraps=stage.outs[0].checkout
        )

        patch_run = patch.object(stage, "_run", wraps=stage._run)

        with self.dvc.state:
            with patch_download as mock_download:
                with patch_checkout as mock_checkout:
                    with patch_run as mock_run:
                        # Temporarily unlock so run() actually executes.
                        stage.locked = False
                        stage.run()
                        stage.locked = True

                        mock_run.assert_not_called()
                        mock_download.assert_not_called()
                        mock_checkout.assert_called_once()

    @patch("dvc.prompt.confirm", return_value=True)
    def test(self, mock_prompt):
        """End-to-end external workflow: configure external cache/remote,
        import, run, update, repro, gc and checkout."""
        if not self.should_test():
            return

        # External cache location: <scheme>://<bucket>/<uuid>
        cache = (
            self.scheme
            + self.scheme_sep
            + self.bucket
            + self.sep
            + str(uuid.uuid4())
        )

        ret = main(["config", "cache." + self.cache_scheme, "myrepo"])
        self.assertEqual(ret, 0)
        ret = main(["remote", "add", "myrepo", cache])
        self.assertEqual(ret, 0)
        ret = main(["remote", "modify", "myrepo", "type", self.cache_type])
        self.assertEqual(ret, 0)

        # A second remote used for deps/outs of the stages below.
        remote_name = "myremote"
        remote_key = str(uuid.uuid4())
        remote = (
            self.scheme
            + self.scheme_sep
            + self.bucket
            + self.sep
            + remote_key
        )

        ret = main(["remote", "add", remote_name, remote])
        self.assertEqual(ret, 0)
        ret = main(["remote", "modify", remote_name, "type", self.cache_type])
        self.assertEqual(ret, 0)

        self.dvc = DvcRepo(".")

        foo_key = remote_key + self.sep + self.FOO
        bar_key = remote_key + self.sep + self.BAR

        foo_path = (
            self.scheme + self.scheme_sep + self.bucket + self.sep + foo_key
        )
        bar_path = (
            self.scheme + self.scheme_sep + self.bucket + self.sep + bar_key
        )

        # Using both plain and remote notation
        out_foo_path = "remote://" + remote_name + "/" + self.FOO
        out_bar_path = bar_path

        self.write(self.bucket, foo_key, self.FOO_CONTENTS)

        # Import the external file into the workspace.
        import_stage = self.dvc.imp_url(out_foo_path, "import")

        self.assertTrue(os.path.exists("import"))
        self.assertTrue(filecmp.cmp("import", self.FOO, shallow=False))
        self.assertEqual(self.dvc.status([import_stage.path]), {})
        self.check_already_cached(import_stage)

        # Import again, but with the out on the remote itself.
        import_remote_stage = self.dvc.imp_url(
            out_foo_path, out_foo_path + "_imported"
        )
        self.assertEqual(self.dvc.status([import_remote_stage.path]), {})

        # Stage with an external dep and an external out.
        cmd_stage = self.dvc.run(
            outs=[out_bar_path],
            deps=[out_foo_path],
            cmd=self.cmd(foo_path, bar_path),
        )

        self.assertEqual(self.dvc.status([cmd_stage.path]), {})
        self.assertEqual(self.dvc.status(), {})
        self.check_already_cached(cmd_stage)

        # Change the external dep; status must report the change.
        self.write(self.bucket, foo_key, self.BAR_CONTENTS)

        self.assertNotEqual(self.dvc.status(), {})

        self.dvc.update(import_stage.path)
        self.assertTrue(os.path.exists("import"))
        self.assertTrue(filecmp.cmp("import", self.BAR, shallow=False))
        self.assertEqual(self.dvc.status([import_stage.path]), {})

        self.dvc.update(import_remote_stage.path)
        self.assertEqual(self.dvc.status([import_remote_stage.path]), {})

        stages = self.dvc.reproduce(cmd_stage.path)
        self.assertEqual(len(stages), 1)
        self.assertEqual(self.dvc.status([cmd_stage.path]), {})

        # gc must not invalidate anything that is still referenced.
        self.assertEqual(self.dvc.status(), {})
        self.dvc.gc()
        self.assertEqual(self.dvc.status(), {})

        # Removing outs and checking out must restore them from cache.
        self.dvc.remove(cmd_stage.path, outs_only=True)
        self.assertNotEqual(self.dvc.status([cmd_stage.path]), {})

        self.dvc.checkout([cmd_stage.path], force=True)
        self.assertEqual(self.dvc.status([cmd_stage.path]), {})
def reproduce(
    cls,
    dvc_dir: str,
    queue: "Queue",
    rev: str,
    cwd: Optional[str] = None,
    name: Optional[str] = None,
) -> Tuple[bool, Optional[str]]:
    """Run dvc repro and return the result.

    Returns tuple of (exp_hash, force) where exp_hash is the experiment
        hash (or None on error) and force is a bool specifying whether or
        not this experiment should force overwrite any existing
        duplicates.
    """
    unchanged = []

    # Signal the parent process that this executor has started.
    queue.put((rev, os.getpid()))

    def filter_pipeline(stages):
        # Keep pipeline stages that repro skipped so they still take part
        # in the experiment hash computed below.
        unchanged.extend(
            [stage for stage in stages if isinstance(stage, PipelineStage)]
        )

    result = None
    force = False
    # BUGFIX: pre-bind names used in ``finally`` so an early failure
    # (e.g. Repo() raising) cannot trigger a NameError during cleanup.
    scm = None
    old_cwd = None
    try:
        dvc = Repo(dvc_dir)
        old_cwd = os.getcwd()
        new_cwd = cwd if cwd else dvc.root_dir
        os.chdir(new_cwd)
        logger.debug("Running repro in '%s'", cwd)

        # Repro args are packed into a file by the parent process;
        # consume (and delete) them if present.
        args_path = os.path.join(dvc.tmp_dir, BaseExecutor.PACKED_ARGS_FILE)
        if os.path.exists(args_path):
            args, kwargs = BaseExecutor.unpack_repro_args(args_path)
            remove(args_path)
        else:
            args = []
            kwargs = {}

        force = kwargs.get("force", False)

        # NOTE: for checkpoint experiments we handle persist outs slightly
        # differently than normal:
        #
        # - checkpoint out may not yet exist if this is the first time this
        #   experiment has been run, this is not an error condition for
        #   experiments
        # - at the start of a repro run, we need to remove the persist out
        #   and restore it to its last known (committed) state (which may
        #   be removed/does not yet exist) so that our executor workspace
        #   is not polluted with the (persistent) out from an unrelated
        #   experiment run
        dvc.checkout(force=True, quiet=True)

        # We cannot use dvc.scm to make commits inside the executor since
        # cached props are not picklable.
        scm = Git()
        checkpoint_func = partial(cls.checkpoint_callback, scm, name)
        stages = dvc.reproduce(
            *args,
            on_unchanged=filter_pipeline,
            checkpoint_func=checkpoint_func,
            **kwargs,
        )

        exp_hash = cls.hash_exp(stages)
        exp_rev = cls.commit(scm, exp_hash, exp_name=name)
        if scm.get_ref(EXEC_CHECKPOINT):
            scm.set_ref(EXEC_CHECKPOINT, exp_rev)
        # BUGFIX: ``result`` was previously never assigned, so the
        # function always returned (None, force) even on success,
        # contradicting the docstring above.
        result = exp_hash
    except UnchangedExperimentError:
        # An unchanged experiment is not an error; result stays None.
        pass
    finally:
        if scm:
            scm.close()
            del scm
        if old_cwd:
            os.chdir(old_cwd)

    # ideally we would return stages here like a normal repro() call, but
    # stages is not currently picklable and cannot be returned across
    # multiprocessing calls
    return result, force
def main():
    """Entry point of ``dvc-cc git``: print help, list branches, sync
    remote branches locally, or forward the command to plain git."""
    argv = sys.argv[1:]
    if '-h' in argv or '--help' in argv or len(argv) == 0:
        print(DESCRIPTION)
        print()
        print('dvc-cc git branch:')
        print(
            '\tShows the branches without the automatic created branches from DVC-CC.'
        )
        print('dvc-cc git sync [-d] [-l]:')
        print('\tCreate local branches for all remote branches.')
        print('\t\t-d: Than it will download all files from the DVC-Server.')
        print(
            '\t\t-l: If this is set, than it will repeat every 20 seconds the script.'
        )
        print('\t\t\tYou can cancel it with CTRL+C.')
        print('dvc-cc git OTHER_GIT_COMMAND:')
        print(
            '\tEvery other git command will be piped directly to git. After it was called it will run '
            + bcolors.OKBLUE + 'dvc checkout' + bcolors.ENDC)
        print('\t\tto cleanup the repository')
    # NOTE(review): sys.argv[1] here is the same element as argv[0]
    # (argv = sys.argv[1:]) — the mixed indexing works but is confusing.
    elif len(argv) == 1 and sys.argv[1] == 'branch':
        # Hide the auto-generated DVC-CC branches (cc_*/rcc_* prefixes).
        git_branch = check_output(['git', 'branch']).decode("utf8").split('\n')
        for line in git_branch:
            if not line.startswith(' rcc_') and not line.startswith(
                    ' remotes/origin/rcc_') and not line.startswith(
                        ' cc_') and not line.startswith(
                            ' remotes/origin/cc_'):
                print(line)
    elif sys.argv[1] == 'sync':
        repo = DVCRepo()
        # -d: also pull DVC-tracked data; ask the remote password once.
        if (len(argv) > 2 and argv[1] == '-d') or (len(argv) == 3
                                                   and argv[2] == '-d'):
            remote_name = repo.config['core']['remote']
            remote_settings = repo.config['remote'][remote_name]
            if 'ask_password' in remote_settings and remote_settings[
                    'ask_password']:
                remote_settings['password'] = getpass.getpass(
                    'Password for ' + remote_settings['url'] + ': ')
                remote_settings['ask_password'] = False
        git_name_of_branch = get_name_of_branch()
        # -l: keep looping instead of syncing once.
        if (len(argv) > 2 and argv[1] == '-l') or (len(argv) == 3
                                                   and argv[2] == '-l'):
            loop = True
        else:
            loop = False
        # True when there was nothing to stash; used below to decide
        # whether to re-apply the stash at the end.
        git_stash_output = check_output(
            ['git', 'stash']).decode().startswith('No local changes to save')
        subprocess.call(['git', 'fetch', '--all'])
        try:
            is_first_iteration = True
            while loop or is_first_iteration:
                if is_first_iteration == False:
                    # NOTE(review): help text above says 20 seconds, but
                    # this sleeps 5 — confirm which one is intended.
                    print(
                        'All remote branches were created locally. Wait 5 seconds for the next pull request. To cancel the script press CTRL+C.'
                    )
                    time.sleep(5)
                is_first_iteration = False
                _ = check_output(["git", "pull"]).decode("utf8").split("\n")
                all_branches = check_output(["git", "branch",
                                             '-a']).decode("utf8").split("\n")
                # Local branches have no '/', remote ones look like
                # 'remotes/origin/<name>'.
                all_branches_local = [
                    i[2:] for i in all_branches if len(i.split('/')) == 1
                ]
                all_branches_remote = [
                    i.split('/')[-1] for i in all_branches
                    if len(i.split('/')) > 1
                ]
                # Create any remote branch that does not exist locally.
                for b in all_branches_remote:
                    if b not in all_branches_local:
                        print('git checkout ' + b)
                        _ = check_output(['git', 'checkout', b])
                        # NOTE(review): '\ŧ' below looks like a mojibake'd
                        # '\t' — confirm the intended output.
                        print('\t\ŧI CHECKOUT THE DATA')
                        if len(argv) >= 2 and argv[1] == '-d':
                            try:
                                repo.checkout()
                            except:
                                print('Some files are missing.')
                            print('\t\ŧI PULL THE DATA')
                            try:
                                repo.pull()
                            except:
                                print('Some files are missing.')
        finally:
            # Always return to the branch we started from and restore its
            # data (and the stash, if we created one above).
            print('git checkout ' + git_name_of_branch)
            _ = check_output(['git', 'checkout', git_name_of_branch])
            try:
                repo.checkout()
            except:
                print('Some files are missing.')
            try:
                repo.pull()
            except:
                print('Some files are missing.')
            if git_stash_output == False:
                _ = check_output(['git', 'stash', 'apply'])
    else:
        # Any other command is forwarded to git, then the workspace is
        # cleaned up with 'dvc checkout'.
        subprocess.call(['git'] + argv)
        try:
            subprocess.call(['dvc', 'checkout'])
        except:
            print('Some files are missing.')
def main():
    """Entry point of the dvc-cc output collector: walk all result
    branches and hard-link matching DVC outputs into one output folder."""
    parser = ArgumentParser(description=DESCRIPTION)
    parser.add_argument(
        '-f',
        '--regex-name-of-file',
        type=str,
        default=None,
        help='A regex of the name of the files that you want to find.')
    parser.add_argument(
        '-ef',
        '--exclude-regex-name-of-file',
        type=str,
        default=None,
        help='A regex of the name of the file that are excluded.')
    parser.add_argument(
        '-b',
        '--regex-name-of-branch',
        type=str,
        default=None,
        help='A regex of the name of the branches to be included in the search.'
    )
    parser.add_argument(
        '-eb',
        '--exclude-regex-name-of-branch',
        type=str,
        default=None,
        help='A regex of the name of the branch that are excluded.')
    parser.add_argument(
        '-pos',
        '--list-of-pos',
        help=
        'A list of dvc-cc indizes that you want include in the display. You can also use slicing for example: 12:15:2 to use 12, 14.',
        nargs="+",
        type=str)
    parser.add_argument('-p',
                        '--path-to-output',
                        type=str,
                        default=None,
                        help='The path where you want save the files.')
    # NOTE(review): the adjacent literals 'multiple' + 'times' concatenate
    # to "multipletimes" (missing space) in this help text.
    parser.add_argument(
        '-o',
        '--original-name',
        dest='original_name',
        action='store_true',
        default=False,
        help=
        'In default, the branch name is added to the file or folder name. If this parameter is '
        'set, it will use the original name of the file or folder. If the file exists multiple'
        'times and this parameter is set, then it will use indices at the end of the file or folder names.'
    )
    parser.add_argument('--debug',
                        dest='debug',
                        action='store_true',
                        default=False,
                        help='Print all files that are copied.')
    parser.add_argument(
        '-d',
        '--download-stages',
        dest='download_stages',
        action='store_true',
        default=False,
        help='Download a stage if the file is not in the local cache.')
    parser.add_argument(
        '-fd',
        '--forbid-dir',
        dest='forbid_dir',
        action='store_true',
        default=False,
        help='If this parameter is set, then it will ignore output folders.')
    parser.add_argument(
        '-ns',
        '--no-save',
        dest='no_save',
        action='store_true',
        default=False,
        help=
        'If true, it will not create a folder or link the file. This parameter is helpfull if it is used with --debug to test your regular expressions.'
    )
    parser.add_argument(
        '-nw',
        '--no-print-of-warnings',
        dest='no_warning',
        action='store_true',
        default=False,
        help=
        'If true, it will not print warning if a file is not created or not in the local cache.'
    )
    args = parser.parse_args()

    repo = DVCRepo()
    g = Git()
    # Current branch name, parsed from the '* <name>' line of `git branch`.
    starting_branch = g.branch().split('*')[1].split('\n')[0][1:]

    # Set the password only once!
    if args.download_stages:
        remote_name = repo.config['core']['remote']
        remote_settings = repo.config['remote'][remote_name]
        if 'ask_password' in remote_settings and remote_settings[
                'ask_password']:
            remote_settings['password'] = getpass.getpass(
                'Password for ' + remote_settings['url'] + ': ')
            remote_settings['ask_password'] = False

    # Create (or skip, with --no-save) the destination directory.
    if not args.no_save:
        path_to_output = create_output_dir(repo.root_dir, args.path_to_output)
        if path_to_output is None:
            exit(1)
    else:
        path_to_output = 'NONE'

    # Parse --list-of-pos into a flat array of allowed dvc-cc indices;
    # entries may be plain ints or numpy-style slices like 12:15:2.
    list_of_allowed_dvccc_ids = None
    if args.list_of_pos is not None:
        list_of_allowed_dvccc_ids = []
        for pos in args.list_of_pos:
            try:
                if pos.find(':') > -1:
                    pos = np.array(pos.split(':'), dtype=int)
                    list_of_allowed_dvccc_ids.extend(np.arange(*pos))
                else:
                    pos = int(pos)
                    if pos >= 0:
                        list_of_allowed_dvccc_ids.append(pos)
                    else:
                        raise ValueError(
                            'ERROR: The parameters ' + str(pos) +
                            ' from --list-of-pos must be positive.')
            except:
                raise ValueError(
                    'ERROR: The parameters ' + str(pos) +
                    ' from --list-of-pos must be an integer or a slicings. i.e.1: 12 14 i.e.2: 12:15:2'
                )
        list_of_allowed_dvccc_ids = np.array(list_of_allowed_dvccc_ids)

    try:
        file_counter = 0
        # Maps destination path -> checksum of the file linked there, used
        # below to de-duplicate and to rename colliding outputs.
        saved_files = {}
        for branch in repo.brancher(all_branches=True):
            outs = []
            branch_names = []
            if branch.lower() != 'working tree':
                # check if this is a result branch:
                is_dvccc_result_branch = branch.startswith('rcc_')

                # search for all output files in the current branch
                is_branch_of_interest1 = args.regex_name_of_branch is None or re.match(
                    args.regex_name_of_branch, branch)
                is_branch_of_interest2 = args.exclude_regex_name_of_branch is None or not re.match(
                    args.exclude_regex_name_of_branch, branch)
                is_allowed_dvccc_id = True
                # Result branches are named like 'rcc_<id>_...'; filter by id.
                if list_of_allowed_dvccc_ids is not None and is_dvccc_result_branch:
                    if not int(
                            branch.split('_')[1]) in list_of_allowed_dvccc_ids:
                        is_allowed_dvccc_id = False
                if is_branch_of_interest1 and is_branch_of_interest2 and is_dvccc_result_branch and is_allowed_dvccc_id:
                    print(branch)
                    g.checkout(branch)
                    #TODO: This would be nice, but its too sloow!
                    try:
                        repo.checkout()
                    except:
                        print('Some files are missing.')
                    print('\tIt is a branch of interest!')
                    #TODO: repo.stages is very slow!
                    for stage in repo.stages:
                        for out in stage.outs:
                            valid_msg = check_out_if_its_valid(
                                out, args.regex_name_of_file,
                                args.exclude_regex_name_of_file,
                                not args.forbid_dir)
                            print('\t\t\t', out, valid_msg)
                            # With -d, try to pull a missing out once and
                            # re-check its validity.
                            if valid_msg == 'not_in_local_cache' and args.download_stages:
                                g.pull()
                                try:
                                    repo.pull(stage.relpath)
                                except:
                                    print('Some files are missing.')
                                time.sleep(1)
                                valid_msg = check_out_if_its_valid(
                                    out, args.regex_name_of_file,
                                    args.exclude_regex_name_of_file,
                                    not args.forbid_dir)
                                print(valid_msg)
                            if valid_msg == 'valid':
                                outs.append(out)
                                branch_names.append(branch)
                            elif valid_msg == 'not_created' and args.no_warning == False:
                                print(
                                    'Warning: A output file of interest has not yet been created. '
                                    + '(file: ' + str(out) + '; branch: ' +
                                    branch + ')')
                            elif valid_msg == 'not_in_local_cache' and args.no_warning == False:
                                print(
                                    'Warning: A output file of interest is not on the local cache. '
                                    + '(file: ' + out.cache_path +
                                    '; branch: ' + branch +
                                    ')\n You can use this script with -d and it will download the missing stage.'
                                )

            # create a link for each output file of interest in the current branch
            for out, branch_name in zip(outs, branch_names):
                # create the output file name
                if not args.original_name:
                    out_filename = branch_name + '_' + str(out).replace(
                        '/', '_').replace('\\\\', '_')
                else:
                    out_filename = str(out).replace('/', '_').replace(
                        '\\\\', '_')
                out_filepath = os.path.join(repo.root_dir, path_to_output,
                                            out_filename)

                # Find a free destination name: reuse it if the same
                # checksum was already linked, otherwise append _2, _3, ...
                file_was_already_saved = False
                renamer_index = 2
                file_can_be_saved = False
                tmp_out_filepath = out_filepath
                while not file_can_be_saved and not file_was_already_saved:
                    if tmp_out_filepath not in saved_files:
                        file_can_be_saved = True
                        out_filepath = tmp_out_filepath
                        saved_files[out_filepath] = out.checksum
                    elif saved_files[tmp_out_filepath] == out.checksum:
                        file_was_already_saved = True
                    else:
                        tmp_out_filepath = out_filepath + '_' + str(
                            renamer_index)
                        renamer_index += 1
                if file_can_be_saved:
                    if args.debug:
                        print(out.cache_path, ' -> ', out_filepath)
                    if args.no_save is False:
                        # Hard-link files directly; for directories, link
                        # every cached entry under a recreated tree.
                        if out.isfile():
                            os.link(out.cache_path, out_filepath)
                        elif out.isdir():
                            os.mkdir(out_filepath)
                            for cache in out.dir_cache:
                                dirfile_cache_path = repo.cache.local.get(
                                    cache['md5'])
                                dirfile_outpath = os.path.join(
                                    out_filepath, cache['relpath'])
                                os.makedirs(
                                    os.path.dirname(dirfile_outpath),
                                    exist_ok=True)
                                os.link(dirfile_cache_path, dirfile_outpath)
                    file_counter += 1
        print(
            str(file_counter) + ' files are linked to ' + path_to_output +
            '.')
    # return always to the starting branch!
    finally:
        g.checkout(starting_branch)
        try:
            repo.checkout()
        except:
            print('Some files are missing.')