def find_all_dvc_leafs(args_dvc_files):
    """Determine which DVC files should be executed.

    Files ending in ``.hyperopt`` inside ``dvc/.hyperopt`` are renamed to
    ``dvc/<name>.dvc`` while the pipelines are inspected and are renamed back
    afterwards, even when the inspection raises.

    Args:
        args_dvc_files: ``None`` to let ``get_leafs_that_need_to_reproduce``
            choose the files. Otherwise an object whose ``dvc_files``
            attribute is a string in which branches are separated by ``,``
            and the files of one branch by ``|``.

    Returns:
        A tuple ``(dvc_files, list_of_hyperopt_files, Gs)`` where ``Gs`` is
        ``DVCRepo('.').pipelines``.

    Raises:
        ValueError: If an explicitly listed file does not end with ``.dvc``.
    """
    hyperopt_dir = str(Path('dvc/.hyperopt'))

    # Step 1: expose every hyperopt file as a regular .dvc file.
    list_of_hyperopt_files = []
    if os.path.exists('dvc') and os.path.exists(hyperopt_dir):
        list_of_hyperopt_files = [
            name for name in os.listdir(hyperopt_dir)
            if name.endswith('.hyperopt')
        ]
        for name in list_of_hyperopt_files:
            hidden_path = str(Path('dvc/.hyperopt/' + name))
            visible_path = str(Path('dvc/' + name[:-9] + '.dvc'))
            os.rename(hidden_path, visible_path)
            print(hidden_path, visible_path)

    # Step 2: find all leafs to execute.
    try:
        dvcrepo = DVCRepo('.')
        Gs = dvcrepo.pipelines
        if len(Gs) == 0:
            dvc_files = []
        elif args_dvc_files is None:
            dvc_files = get_leafs_that_need_to_reproduce(dvcrepo, Gs)
        else:
            dvc_files = []
            requested = args_dvc_files.dvc_files.replace(' ', '').split(',')
            for branch_spec in requested:
                files_of_branch = []
                dvc_files.append(files_of_branch)
                for filename in branch_spec.split('|'):
                    files_of_branch.append(filename)
                    if not filename.endswith('.dvc'):
                        raise ValueError('Error: You define with -f which dvc files you want to exec. One or more files does not ends with .dvc. Please use only DVC files.')
    finally:
        # Step 3: the hyperopt files must always get their old name back.
        for name in list_of_hyperopt_files:
            visible_path = str(Path('dvc/' + name[:-9] + '.dvc'))
            if os.path.exists(visible_path):
                os.rename(visible_path, str(Path('dvc/.hyperopt/' + name)))
            else:
                print(bcolors.WARNING + 'Warning: File ' + visible_path + ' not found.' + bcolors.ENDC)

    return dvc_files, list_of_hyperopt_files, Gs
def _collect_python_parameters(tokens, parameters):
    """Add the arguments of a ``python <script>.py ...`` command to *parameters*.

    Keys are ``<script>_<option>`` for values that follow ``-x``/``--x`` and
    ``<script>_unnamed_<n>`` for positional values; ``<script>`` is the file
    name without directory, ``.py`` suffix and ``-`` characters.
    """
    script_name = None
    pending_option = None
    unnamed_count = 0
    for token in tokens[1:]:
        # Everything before the first *.py token (interpreter flags) is skipped.
        if script_name is not None:
            if token.startswith('--'):
                pending_option = token[2:]
            elif token.startswith('-'):
                pending_option = token[1:]
            elif pending_option is None:
                unnamed_count += 1
                parameters[script_name + '_unnamed_' + str(unnamed_count)] = token
            else:
                parameters[script_name + '_' + pending_option] = token
                pending_option = None
        # Checked after the value handling, so a later *.py argument is both
        # recorded as a value and becomes the prefix for what follows.
        if token.endswith('.py'):
            script_name = token[:-3].split('/')[-1].replace('-', '')


def _collect_papermill_parameters(tokens, parameters):
    """Add every ``-p NAME VALUE`` triple of a papermill command to *parameters*."""
    state = None  # None: scanning, 1: expecting NAME, 2: expecting VALUE
    pending_name = None
    for token in tokens[1:]:
        if token == '-p':
            state = 1
        elif state == 1:
            pending_name = token
            state = 2
        elif state == 2:
            state = None
            parameters[pending_name] = token


def cmd_paraneter_to_dagshub_paramfile(file=None):
    """Print the command-line parameters of all DVC stages as ``name: value`` lines.

    The commands of every stage in every pipeline of the DVC repository in
    the current directory are split on whitespace. Commands starting with
    ``python`` or ``papermill`` are parsed for parameters; all other commands
    are ignored. Stages without a command are skipped, and an option name
    left without a value at the end of one command is not carried over to
    the next command.

    Args:
        file: File-like object the lines are written to; ``None`` means
            standard output.
    """
    from dvc.repo import Repo as DVCRepo

    dvcrepo = DVCRepo('.')
    parameters = {}
    for pipe in dvcrepo.pipelines:
        for node in pipe.nodes:
            # cmd may be None or empty; there is nothing to parse then.
            tokens = node.cmd.split() if node.cmd else []
            if not tokens:
                continue
            if tokens[0] == 'python':
                _collect_python_parameters(tokens, parameters)
            elif tokens[0] == 'papermill':
                _collect_papermill_parameters(tokens, parameters)

    for name, value in parameters.items():
        print(name + ': ' + str(value), file=file)
def get_command_list_in_right_order():
    """Return the commands of the DVC stages, one per line, in execution order.

    The order is derived from the first pipeline graph of the DVC repository
    in the current directory: nodes without outgoing edges are taken first,
    removed from the graph, and the step is repeated until the graph is empty.
    Stages whose ``cmd`` is ``None`` contribute no line.

    Returns:
        A string with one command per line, each terminated by ``'\\n'``.

    Raises:
        ValueError: If the graph still has nodes but none without outgoing
            edges, i.e. the graph contains a cycle.
        KeyError: If a stage of the repository is not a node of the first
            pipeline graph.
    """
    from dvc.repo import Repo as DVCRepo

    dvcrepo = DVCRepo('.')
    # NOTE(review): nodes are removed from the graph object returned by
    # dvcrepo.pipelines; confirm that nothing reuses it afterwards.
    G = dvcrepo.pipelines[0]

    all_nodes = []
    while len(G.nodes()) > 0:
        sinks = [x for x in G.nodes() if G.out_degree(x) == 0]
        if not sinks:
            # Without this guard the loop would never terminate.
            raise ValueError('The pipeline graph contains a cycle.')
        all_nodes.extend(sinks)
        for node in sinks:
            G.remove_node(node)

    # One dictionary lookup per stage instead of a linear list search.
    position = {node: index for index, node in enumerate(all_nodes)}
    stages = sorted(dvcrepo.stages, key=lambda s: position[s])

    return ''.join(s.cmd + '\n' for s in stages if s.cmd is not None)
def main():
    """Start every job from ``.dvc_cc/cc_agency_experiments.yml`` that has no id yet.

    The current branch and all tags are pushed first. For each experiment
    whose ``id`` is ``None`` a temporary ``.dvc_cc/tmp.red.yml`` is assembled
    from the experiment's files plus ``.dvc_cc/cc_config.yml``, executed with
    ``faice exec`` and removed again. The last whitespace-separated token of
    the ``faice`` output is stored as the experiment id. When at least one job
    was started, the updated experiments file is committed and pushed.
    """
    parser = ArgumentParser(description=DESCRIPTION)
    parser.parse_args()

    # The results are not used below; the calls are kept because they were
    # always executed at this point.
    get_main_git_directory_Path()
    GITRepo('.')
    DVCRepo('.')

    subprocess.call(['git', 'push'])
    subprocess.call(['git', 'push', 'origin', '--tags'])

    experiments_file = str(Path('.dvc_cc/cc_agency_experiments.yml'))
    if not os.path.exists(experiments_file):
        print(
            'Warning you did not define a job with dvc-cc run --no-exec. So there is no job to start.'
        )
        return

    with open(experiments_file, 'r') as stream:
        try:
            experiments = yaml.safe_load(stream)
        except yaml.YAMLError as exc:
            print(exc)
            exit(1)
    # An empty YAML document is loaded as None.
    if experiments is None:
        experiments = {}

    start_an_experiment = False
    # Last batch file that was written; used in the commit message. Tracked
    # explicitly so that a job without files cannot leave it undefined.
    last_path = ''
    for k in experiments:
        if experiments[k]['id'] is not None:
            continue

        print('Start job ' + k)
        start_an_experiment = True
        paths = list(experiments[k]['files'])

        try:
            # write all to tmp.red.yml
            with open(str(Path('.dvc_cc/tmp.red.yml')), 'w') as f:
                print("batches:", file=f)
                for path in paths:
                    with open(str(Path(path)), "r") as r:
                        print(r.read(), file=f)
                    last_path = path
                with open(str(Path('.dvc_cc/cc_config.yml')), "r") as r:
                    print(r.read(), file=f)

            # execute faice
            output = subprocess.Popen(
                ('faice exec .dvc_cc/tmp.red.yml').split(),
                stdout=subprocess.PIPE)
            cc_id = output.communicate()[0].decode().split()[-1]
            print('The experiment ID is: ' + cc_id)
        finally:
            # Do not leave the temporary file behind when something fails.
            if os.path.exists('.dvc_cc/tmp.red.yml'):
                os.remove('.dvc_cc/tmp.red.yml')

        # write cc_id to cc_agency_experiments.yml
        experiments[k]['id'] = cc_id

    if start_an_experiment:
        with open(experiments_file, 'w') as outfile:
            yaml.dump(experiments, outfile, default_flow_style=False)

        # push the ids
        subprocess.call(['git', 'add', '.dvc_cc/cc_agency_experiments.yml'])
        subprocess.call([
            'git', 'commit', '-m',
            '\'Update cc_agency_experiments.yml: ' + last_path + '\''
        ])
        subprocess.call(['git', 'push'])
def main():
    """Fetch one output path from every local ``rcc_*`` branch via ``repo.get``.

    For each selected branch a sub-folder of the output directory is created
    and ``args.path_to_output`` is fetched into it at that revision. A folder
    whose fetch fails is removed again. The output directory itself is
    removed when it ends up empty.

    Raises:
        ValueError: If an entry of ``--list-of-pos`` is negative, or is
            neither an integer nor a ``start:stop[:step]`` slice.
    """
    parser = ArgumentParser(description=DESCRIPTION)
    parser.add_argument(
        'path_to_output',
        help='The path to the output/metric file or folder that you want get.',
        type=str)
    parser.add_argument(
        '-p',
        '--list-of-pos',
        help=
        'A list of dvc-cc indizes that you want include in the display. You can also use slicing for example: 12:15:2 to use 12, 14.',
        nargs="+",
        type=str)
    parser.add_argument(
        '-e',
        '--print-error',
        help=
        'If this parameter is set, it will print the error message, why a file or folder could not be found.',
        action='store_true')
    args = parser.parse_args()

    repo = DVCRepo()
    g = Git()

    # Set the password only once!
    remote_name = repo.config['core']['remote']
    remote_settings = repo.config['remote'][remote_name]
    if 'ask_password' in remote_settings and remote_settings['ask_password']:
        remote_settings['password'] = getpass.getpass('Password for ' +
                                                      remote_settings['url'] +
                                                      ': ')
        remote_settings['ask_password'] = False

    path_to_output_clean = args.path_to_output.replace('./', '_').replace(
        '/', '_').replace('\\\\', '_')
    outputdir = create_output_dir(repo.root_dir, path_to_output_clean)
    print('##################', outputdir)
    if outputdir is None:
        exit(1)

    list_of_allowed_dvccc_ids = None
    if args.list_of_pos is not None:
        list_of_allowed_dvccc_ids = []
        for pos in args.list_of_pos:
            format_error = (
                'ERROR: The parameters ' + str(pos) +
                ' from --list-of-pos must be an integer or a slicings. i.e.1: 12 14 i.e.2: 12:15:2'
            )
            if pos.find(':') > -1:
                try:
                    bounds = np.array(pos.split(':'), dtype=int)
                    list_of_allowed_dvccc_ids.extend(np.arange(*bounds))
                except Exception as err:
                    raise ValueError(format_error) from err
            else:
                try:
                    index = int(pos)
                except ValueError as err:
                    raise ValueError(format_error) from err
                # Raised outside the try block so that this message is not
                # replaced by the generic format error.
                if index < 0:
                    raise ValueError('ERROR: The parameters ' + str(index) +
                                     ' from --list-of-pos must be positive.')
                list_of_allowed_dvccc_ids.append(index)
        list_of_allowed_dvccc_ids = np.array(list_of_allowed_dvccc_ids)

    all_branches = [
        b.split('/')[-1] for b in g.branch('-a').split()
        if b.startswith('rcc_')
    ]
    all_branches = np.unique(all_branches)

    for b in all_branches:
        if list_of_allowed_dvccc_ids is None or int(
                b.split('_')[1]) in list_of_allowed_dvccc_ids:
            print('dvc get : ', '.', args.path_to_output,
                  str(Path(outputdir + '/' + b)), b)

            # A trailing '.dvc' is not part of the folder name.
            folder_name = b[:-4] if b.endswith('.dvc') else b
            path_to_output = str(Path(outputdir + '/' + folder_name))
            folder_created = False
            try:
                print(path_to_output)
                os.mkdir(path_to_output)
                folder_created = True
                repo.get('.',
                         args.path_to_output,
                         out=str(
                             Path(path_to_output + '/' +
                                  path_to_output_clean)),
                         rev=b)
            except Exception as ex:
                print('File was not found.')
                # Only remove what this run created; the folder may contain a
                # partial download, so a plain rmdir is not sufficient.
                if folder_created:
                    shutil.rmtree(path_to_output, ignore_errors=True)
                if args.print_error:
                    print(ex)
            else:
                for p in os.listdir('/tmp'):
                    if p.endswith('dvc-erepo'):
                        shutil.rmtree('/tmp/' + p)

    print()
    print('Found ' + str(len(os.listdir(str(Path(outputdir))))) +
          ' files or folders.')
    if len(os.listdir(str(Path(outputdir)))) == 0:
        print('Folder was removed.')
        os.rmdir(str(Path(outputdir)))
    print(
        'If files are missing, please use "dvc-cc git sync" to get new result branches and repeat this command.'
    )
def main():
    # Set up DVC-CC for the current git repository: collect the cluster and
    # storage settings (interactively, or from the --htw-student / --htw-staff
    # presets), configure the DVC remote "dvc_connection" and write and stage
    # .dvc_cc/cc_config.yml.
    parser = ArgumentParser(description=DESCRIPTION)
    parser.add_argument('--htw-student', help='If this parameter is set, it will not ask the user to set the values. '
                                              'All values will set by default values.',default=False, action='store_true')
    parser.add_argument('--htw-staff', help='If this parameter is set, it will not ask the user to set the values. '
                                              'All values will set by default values.',default=False, action='store_true')
    parser.add_argument('--stderr-in-same-file', help='If you do not want a own file for stdout and stderr you need to set this flag. If this flag is set, it will use the same file for both stdout and stderr.', default=False, action='store_true')
    args = parser.parse_args()

    gitrepo,gitowner,gitname = get_gitinformation()

    if not args.htw_student and not args.htw_staff:
        # Interactive mode: every setting is asked for; an empty answer
        # selects the default shown in the prompt.
        print('These settings refer to the required hardware resources in the cluster.')
        print('If you do not set an argument it will take the default values.')
        print()
        print('Please enter the number of GPUs that you want on the cluster. Hint: In the most Deep Learning '
              'scripts, you want to use 1 GPU in the docker container.')
        num_of_gpus = None
        # Repeat the question until the answer is empty or consists of digits.
        while num_of_gpus is None:
            num_of_gpus = input(bcolors.OKBLUE+'\tNumber of GPUs'+bcolors.ENDC+' (default 0): ')
            if num_of_gpus == '':
                num_of_gpus = 0
            elif num_of_gpus.isdigit():
                num_of_gpus = int(num_of_gpus)
            else:
                print(bcolors.FAIL + '\tWarning: Did not understand your answer. Please use integer values i.e. 0,1,2,3,...' + bcolors.ENDC)
                num_of_gpus = None

        print()
        print('Please enter the RAM that you want on the cluster.')
        ram = None
        # The answer is given in GB and stored multiplied by 1000.
        while ram is None:
            ram = input(bcolors.OKBLUE+'\tRAM in GB'+bcolors.ENDC+' (default 20): ')
            if ram == '':
                ram = 20000 # 20 GB
            elif ram.isdigit():
                ram = int(ram)*1000
            else:
                print(bcolors.FAIL + '\tWarning: Did not understand your answer. Please use integer values i.e. 10,100,...'+bcolors.ENDC)
                ram = None

        print()
        print('Please enter the Docker Image in which your script gets executed at the cluster.')
        print(' You can choose from the following:')
        print(' - "large", if you want to work with PyTorch 1.2 or/and TensorFlow 2.')
        print(' You can also enter a URL to your own Docker Image.')
        print(' If you need more informations take a look at the following site: https://bit.ly/2mgbiVK')
        docker_image = input(bcolors.OKBLUE+'\tDocker Image'+bcolors.ENDC+' (default: "large"): ')
        if docker_image == '' or docker_image.lower() == 'large':
            docker_image = 'docker.io/deepprojects/dvc-cc-large:10.2'
            docker_image_needs_credentials = False
        else:
            # A custom image: ask whether it needs credentials until the
            # answer starts with "y" or "n".
            docker_image_needs_credentials = None
            while docker_image_needs_credentials is None:
                docker_image_needs_credentials = input('\tDoes this docker image needs '
                                                       ''+bcolors.OKBLUE+'credentials'+bcolors.ENDC+'? [y,n]:')
                if docker_image_needs_credentials.lower().startswith('y'):
                    docker_image_needs_credentials = True
                elif docker_image_needs_credentials.lower().startswith('n'):
                    docker_image_needs_credentials = False
                else:
                    print(bcolors.FAIL+'\tWarning: Did not understand your answer. Please use y or n.'+bcolors.ENDC)
                    docker_image_needs_credentials = None
        print('You will use the Docker Image: '+ docker_image)

        print()
        batch_concurrency_limit = None
        print('The batch concurrency limit describes how many jobs you can start in parallel.')
        print('You can lower the number to 1, if you do not want the jobs from one experiment runs in parallel.')
        while batch_concurrency_limit is None:
            batch_concurrency_limit = input(bcolors.OKBLUE+'\tBatch concurrency limit'+bcolors.ENDC+' (default 12): ')
            if batch_concurrency_limit == '':
                batch_concurrency_limit = 12
            elif batch_concurrency_limit.isdigit():
                batch_concurrency_limit = int(batch_concurrency_limit)
            else:
                print(bcolors.FAIL+'\tWarning: Did not understand your answer. Please use integer values i.e. 1,4,12,...'+bcolors.ENDC)
                batch_concurrency_limit = None

        print()
        print('The name of the engine you want to use. This describes the cluster that you want to use.')
        print('At the HTW we have the engines "dt", "cc" and "cctest".')
        engine = input('\tThe '+bcolors.OKBLUE+'engine'+bcolors.ENDC+' you want to use (default: dt): ')
        # The three known short names all map to the engine "ccagency" and
        # differ only in the URL; any other answer is kept as the engine name
        # and the URL is asked for separately.
        if engine == '' or engine == 'dt':
            engine = 'ccagency'
            engine_url = 'https://agency.f4.htw-berlin.de/dt'
        elif engine == 'cc':
            engine = 'ccagency'
            engine_url = 'https://agency.f4.htw-berlin.de/cc'
        elif engine == 'cctest':
            engine = 'ccagency'
            engine_url = 'https://agency.f4.htw-berlin.de/cctest'
        else:
            print('\tThis engine is unknown. Please specify the engine-url:')
            engine_url = input('The ' + bcolors.OKBLUE + 'engine-url' + bcolors.ENDC + ' you want to use: ')
        print('You will use the engine "' +engine+'" with the url "'+engine_url+'".')

        print()
        print('All large files created by your script and defined as output files by DVC are stored on the DVC server.')
        print('At the HTW we have the storage server "dt1" and "avocado01".')
        dvc_remote_server = input('\tThe remote '+bcolors.OKBLUE+'DVC server'+bcolors.ENDC+' that you want use ('
                                                                                          'default: dt1): ')
        # Short names are expanded to host names; any other answer is used
        # unchanged.
        if dvc_remote_server == '' or dvc_remote_server.lower() == 'dt' or dvc_remote_server.lower() == 'dt1':
            dvc_remote_server = 'dt1.f4.htw-berlin.de'
        elif dvc_remote_server.lower() == 'avocado' or dvc_remote_server.lower() == 'avocado01':
            dvc_remote_server = 'avocado01.f4.htw-berlin.de'
        print('You will use the following DVC server "' + dvc_remote_server + '".')

        print()
        print('Here you can enter the folder where you want to store the DVC files on the DVC Storage Server.')
        if dvc_remote_server == 'avocado01.f4.htw-berlin.de':
            dvc_folder_default_value = '/data/ldap/Data-Version-Control-Cache/' + gitrepo + '/' + gitowner + '/' + \
                                       gitname
        else:
            dvc_folder_default_value = '~/' + gitrepo + '/' + gitowner + '/' + gitname
        dvc_remote_path = input('\tThe remote '+bcolors.OKBLUE+'DVC folder'+bcolors.ENDC+' that you want use ('
                                'default: '+dvc_folder_default_value+'): ')
        if dvc_remote_path == '':
            dvc_remote_path = dvc_folder_default_value

        print()
        print('The username with that you can access the DVC storage server "'+dvc_remote_server+'".')
        dvc_remote_user = input('\tThe '+bcolors.OKBLUE+'username'+bcolors.ENDC+' for the remote DVC folder: ')
        if dvc_remote_user == '':
            # NOTE(review): the answer to the confirmation question is stored
            # in dvc_remote_user itself, so confirming with "y" leaves the
            # user name set to that answer instead of '' and the
            # "no credentials" branch further down is never taken - confirm
            # and fix.
            dvc_remote_user = input('Do you really want to use the connection to the remote dvc folder without credentials? [n,y]')
            if not dvc_remote_user.lower().startswith('y'):
                dvc_remote_user = input('The username for the remote DVC folder: ')
        print()
    elif args.htw_student:
        # Preset for students; only the matriculation number is asked for.
        # set default values
        num_of_gpus = 1 ##
        ram = 60000
        docker_image = 'docker.io/deepprojects/dvc-cc-large:10.2'
        docker_image_needs_credentials = False
        batch_concurrency_limit = 12
        engine = 'ccagency'
        engine_url = 'https://agency.f4.htw-berlin.de/dt'
        dvc_remote_server = 'dt1.f4.htw-berlin.de'
        dvc_remote_path = '~/' + gitrepo + '/' + gitowner + '/' + gitname

        valid_matriculation_number = False
        print(bcolors.OKBLUE+'Information: The matriculation number is used to access the dt1-storage server and the curious containers '
              'agency. If you get asked for dt1_f4_htw_berlin_de_username or agency_username, please use your matriculation number. '
              'The password for agency_password and dt1_f4_htw_berlin_de_password is the password you received to access '
              'the curious containers agency.'+bcolors.ENDC)
        # Accepted form: "s0" followed by digits only.
        while valid_matriculation_number == False:
            dvc_remote_user = input('\tPlease fill in your matriculation number (i.e. s0XXXXXX): ').strip()
            if dvc_remote_user.startswith('s0') and dvc_remote_user[2:].isdigit():
                valid_matriculation_number = True
            else:
                print('This is not a valid matriculation number.')
    else:
        # Preset for staff (--htw-staff).
        # set default values
        num_of_gpus = 1 ##
        ram = 180000
        docker_image = 'docker.io/deepprojects/dvc-cc-large:10.2'
        docker_image_needs_credentials = False
        batch_concurrency_limit = 12
        engine = 'ccagency'
        engine_url = 'https://agency.f4.htw-berlin.de/cc'
        dvc_remote_server = 'avocado01.f4.htw-berlin.de'
        dvc_remote_path = '/data/ldap/Data-Version-Control-Cache/' + gitrepo + '/' + gitowner + '/' + gitname

        valid_matriculation_number = False
        # NOTE(review): the statement below is damaged in this copy of the
        # file - the text between the prompt string and '.') has been masked,
        # and any statements that stood there are lost. Restore it from
        # version control; as written it is not valid Python.
        dvc_remote_user = input('\tPlease fill in your ldap username: '******'.')

    # Start from a clean DVC configuration. If anything in this block fails,
    # fall back to running "dvc init" on the command line.
    try:
        if os.path.exists(str(Path('.dvc/config'))):
            os.remove('.dvc/config')
        if os.path.exists(str(Path('.dvc/config.local'))):
            os.remove('.dvc/config.local')
        dvcrepo = DVCRepo('.')
        #TODO: this can be removed!?
        if not os.path.exists('.dvc'):
            dvcrepo.init()
    except:
        subprocess.call(['dvc', 'init'])
        dvcrepo = DVCRepo('.')

    # Replace a leading "~" by a server-specific absolute prefix that contains
    # the user name.
    if dvc_remote_path.startswith('~'):
        if dvc_remote_server == 'dt1.f4.htw-berlin.de':
            dvc_remote_path = '/mnt/md0/' + dvc_remote_user + dvc_remote_path[1:]
        else:
            dvc_remote_path = '/home/'+ dvc_remote_user + dvc_remote_path[1:]

    # set remote dvc connection
    if dvc_remote_user == '':
        subprocess.call(
            ['dvc', 'remote', 'add', '--force', '-d', 'dvc_connection', 'ssh://' + dvc_remote_server + ':' + dvc_remote_path])
        subprocess.call(['dvc', 'remote', 'modify', 'dvc_connection', 'ask_password', 'false'])
    else:
        subprocess.call(['dvc', 'remote', 'add', '--force', '-d', 'dvc_connection',
                         'ssh://' + dvc_remote_user + '@' + dvc_remote_server + ':' + dvc_remote_path])
        subprocess.call(['dvc', 'remote', 'modify', 'dvc_connection', 'ask_password', 'true'])

    # Create the remote folder over ssh and set its permissions and default
    # ACL.
    # NOTE(review): subprocess.call returns the exit status instead of
    # raising when the remote command fails, so the warning below is only
    # printed when the call itself raises - confirm that this is intended.
    try:
        subprocess.call(['ssh', dvc_remote_user + '@' + dvc_remote_server, "mkdir -p "+dvc_remote_path+" ; chmod 774 "+dvc_remote_path+" ; setfacl -d -m u::rwX,g::rwX,o::- "+dvc_remote_path])
    except:
        print(bcolors.WARNING+'Warning: Currently acl is not installed on the server! You will maybe have problems by sharing the same remote dvc folder!'+bcolors.ENDC)

    # create the main folder of the dvc_cc software package.
    if not os.path.exists('.dvc_cc'):
        os.mkdir('.dvc_cc')

    # create the config file.
    if os.path.exists(str(Path('.dvc_cc/cc_config.yml'))):
        os.remove('.dvc_cc/cc_config.yml')

    create_cc_config_file(num_of_gpus,ram,docker_image, docker_image_needs_credentials, batch_concurrency_limit, engine, engine_url, args.stderr_in_same_file)

    subprocess.call(['git', 'add', '.dvc_cc/cc_config.yml'])
parser.add_argument( '-p', '--list-of-pos', help= 'A list of dvc-cc indizes that you want include in the display. You can also use slicing for example: 12:15:2 to use 12, 14.', nargs="+", type=str) parser.add_argument( '-e', '--print-error', help= 'If this parameter is set, it will print the error message, why a file or folder could not be found.', action='store_true') args = parser.parse_args() repo = DVCRepo() g = Git() starting_branch = g.branch().split('*')[1].split('\n')[0][1:] # Set the password only once! remote_name = repo.config['core']['remote'] remote_settings = repo.config['remote'][remote_name] if 'ask_password' in remote_settings and remote_settings['ask_password']: remote_settings['password'] = getpass.getpass('Password for ' + remote_settings['url'] + ': ') remote_settings['ask_password'] = False path_to_output_clean = args.path_to_output.replace('./', '_').replace( '/', '_').replace('\\\\', '_') outputdir = create_output_dir(repo.root_dir, path_to_output_clean)
def main():
    """Entry point of ``dvc-cc git``: a thin wrapper around ``git``.

    Sub-commands, taken from ``sys.argv[1:]``:

    * no argument, ``-h`` or ``--help``: print the usage text.
    * ``branch`` (as the only argument): print the output of ``git branch``
      without the non-current branches whose name starts with ``rcc_``,
      ``cc_``, ``remotes/origin/rcc_`` or ``remotes/origin/cc_``.
    * ``sync [-d] [-l]``: stash local changes, fetch, and check out every
      remote branch that has no local counterpart. With ``-d`` the DVC data
      of each such branch is checked out and pulled. With ``-l`` the step is
      repeated every 5 seconds until interrupted. Afterwards the starting
      branch is restored and the stash is applied. The flags may be given in
      any order.
    * anything else: run ``git`` with the given arguments, then
      ``dvc checkout``.
    """
    argv = sys.argv[1:]
    if '-h' in argv or '--help' in argv or len(argv) == 0:
        print(DESCRIPTION)
        print()
        print('dvc-cc git branch:')
        print(
            '\tShows the branches without the automatic created branches from DVC-CC.'
        )
        print('dvc-cc git sync [-d] [-l]:')
        print('\tCreate local branches for all remote branches.')
        print('\t\t-d: Than it will download all files from the DVC-Server.')
        print(
            '\t\t-l: If this is set, than it will repeat every 5 seconds the script.'
        )
        print('\t\t\tYou can cancel it with CTRL+C.')
        print('dvc-cc git OTHER_GIT_COMMAND:')
        print(
            '\tEvery other git command will be piped directly to git. After it was called it will run '
            + bcolors.OKBLUE + 'dvc checkout' + bcolors.ENDC)
        print('\t\tto cleanup the repository')
    elif len(argv) == 1 and argv[0] == 'branch':
        hidden_prefixes = ('rcc_', 'remotes/origin/rcc_', 'cc_',
                           'remotes/origin/cc_')
        git_branch = check_output(['git', 'branch']).decode("utf8").split('\n')
        for line in git_branch:
            # Only lines that start with whitespace are filtered, so the
            # current branch (marked with '*') is always shown. The width of
            # the leading whitespace is deliberately not relied on.
            is_hidden = line[:1].isspace() and line.lstrip().startswith(
                hidden_prefixes)
            if not is_hidden:
                print(line)
    elif argv[0] == 'sync':
        repo = DVCRepo()

        # The flags are evaluated once and independently of their position,
        # so that "sync -d", "sync -l", "sync -d -l" and "sync -l -d" all
        # behave as the usage text describes.
        flags = argv[1:]
        download = '-d' in flags
        loop = '-l' in flags

        if download:
            # Ask for the password only once instead of once per branch.
            remote_name = repo.config['core']['remote']
            remote_settings = repo.config['remote'][remote_name]
            if 'ask_password' in remote_settings and remote_settings[
                    'ask_password']:
                remote_settings['password'] = getpass.getpass(
                    'Password for ' + remote_settings['url'] + ': ')
                remote_settings['ask_password'] = False

        git_name_of_branch = get_name_of_branch()

        # True when "git stash" reported that there was nothing to stash.
        nothing_stashed = check_output(
            ['git', 'stash']).decode().startswith('No local changes to save')
        subprocess.call(['git', 'fetch', '--all'])
        try:
            is_first_iteration = True
            while loop or is_first_iteration:
                if not is_first_iteration:
                    print(
                        'All remote branches were created locally. Wait 5 seconds for the next pull request. To cancel the script press CTRL+C.'
                    )
                    time.sleep(5)
                is_first_iteration = False

                _ = check_output(["git", "pull"]).decode("utf8").split("\n")
                all_branches = check_output(["git", "branch",
                                             '-a']).decode("utf8").split("\n")
                all_branches_local = [
                    i[2:] for i in all_branches if len(i.split('/')) == 1
                ]
                all_branches_remote = [
                    i.split('/')[-1] for i in all_branches
                    if len(i.split('/')) > 1
                ]

                for b in all_branches_remote:
                    if b not in all_branches_local:
                        print('git checkout ' + b)
                        _ = check_output(['git', 'checkout', b])
                        print('\t\tI CHECKOUT THE DATA')
                        if download:
                            # "except Exception" instead of a bare except, so
                            # that CTRL+C still cancels the script.
                            try:
                                repo.checkout()
                            except Exception:
                                print('Some files are missing.')
                            print('\t\tI PULL THE DATA')
                            try:
                                repo.pull()
                            except Exception:
                                print('Some files are missing.')
        finally:
            # Always return to the branch the user started on.
            print('git checkout ' + git_name_of_branch)
            _ = check_output(['git', 'checkout', git_name_of_branch])
            try:
                repo.checkout()
            except Exception:
                print('Some files are missing.')
            try:
                repo.pull()
            except Exception:
                print('Some files are missing.')
            if not nothing_stashed:
                _ = check_output(['git', 'stash', 'apply'])
    else:
        subprocess.call(['git'] + argv)
        try:
            subprocess.call(['dvc', 'checkout'])
        except Exception:
            print('Some files are missing.')
def main():
    """Hard-link the DVC outputs of all matching ``rcc_*`` branches into one folder.

    Every branch reported by ``repo.brancher(all_branches=True)`` that starts
    with ``rcc_`` and passes the branch filters is checked out. Each output
    of each stage that ``check_out_if_its_valid`` reports as ``'valid'`` is
    linked from the local DVC cache into the output folder. Outputs with the
    same target name but a different checksum get a numeric suffix; outputs
    with the same name and checksum are linked only once. The starting branch
    is always restored at the end.

    Raises:
        ValueError: If an entry of ``--list-of-pos`` is negative, or is
            neither an integer nor a ``start:stop[:step]`` slice.
    """
    parser = ArgumentParser(description=DESCRIPTION)
    parser.add_argument(
        '-f',
        '--regex-name-of-file',
        type=str,
        default=None,
        help='A regex of the name of the files that you want to find.')
    parser.add_argument(
        '-ef',
        '--exclude-regex-name-of-file',
        type=str,
        default=None,
        help='A regex of the name of the file that are excluded.')
    parser.add_argument(
        '-b',
        '--regex-name-of-branch',
        type=str,
        default=None,
        help='A regex of the name of the branches to be included in the search.'
    )
    parser.add_argument(
        '-eb',
        '--exclude-regex-name-of-branch',
        type=str,
        default=None,
        help='A regex of the name of the branch that are excluded.')
    parser.add_argument(
        '-pos',
        '--list-of-pos',
        help=
        'A list of dvc-cc indizes that you want include in the display. You can also use slicing for example: 12:15:2 to use 12, 14.',
        nargs="+",
        type=str)
    parser.add_argument('-p',
                        '--path-to-output',
                        type=str,
                        default=None,
                        help='The path where you want save the files.')
    parser.add_argument(
        '-o',
        '--original-name',
        dest='original_name',
        action='store_true',
        default=False,
        help=
        'In default, the branch name is added to the file or folder name. If this parameter is '
        'set, it will use the original name of the file or folder. If the file exists multiple '
        'times and this parameter is set, then it will use indices at the end of the file or folder names.'
    )
    parser.add_argument('--debug',
                        dest='debug',
                        action='store_true',
                        default=False,
                        help='Print all files that are copied.')
    parser.add_argument(
        '-d',
        '--download-stages',
        dest='download_stages',
        action='store_true',
        default=False,
        help='Download a stage if the file is not in the local cache.')
    parser.add_argument(
        '-fd',
        '--forbid-dir',
        dest='forbid_dir',
        action='store_true',
        default=False,
        help='If this parameter is set, then it will ignore output folders.')
    parser.add_argument(
        '-ns',
        '--no-save',
        dest='no_save',
        action='store_true',
        default=False,
        help=
        'If true, it will not create a folder or link the file. This parameter is helpfull if it is used with --debug to test your regular expressions.'
    )
    parser.add_argument(
        '-nw',
        '--no-print-of-warnings',
        dest='no_warning',
        action='store_true',
        default=False,
        help=
        'If true, it will not print warning if a file is not created or not in the local cache.'
    )
    args = parser.parse_args()

    repo = DVCRepo()
    g = Git()
    # Name of the current branch: the text after '* ' in "git branch".
    starting_branch = g.branch().split('*')[1].split('\n')[0][1:]

    # Set the password only once!
    if args.download_stages:
        remote_name = repo.config['core']['remote']
        remote_settings = repo.config['remote'][remote_name]
        if 'ask_password' in remote_settings and remote_settings[
                'ask_password']:
            remote_settings['password'] = getpass.getpass(
                'Password for ' + remote_settings['url'] + ': ')
            remote_settings['ask_password'] = False

    if not args.no_save:
        path_to_output = create_output_dir(repo.root_dir, args.path_to_output)
        if path_to_output is None:
            exit(1)
    else:
        path_to_output = 'NONE'

    list_of_allowed_dvccc_ids = None
    if args.list_of_pos is not None:
        list_of_allowed_dvccc_ids = []
        for pos in args.list_of_pos:
            format_error = (
                'ERROR: The parameters ' + str(pos) +
                ' from --list-of-pos must be an integer or a slicings. i.e.1: 12 14 i.e.2: 12:15:2'
            )
            if pos.find(':') > -1:
                try:
                    bounds = np.array(pos.split(':'), dtype=int)
                    list_of_allowed_dvccc_ids.extend(np.arange(*bounds))
                except Exception as err:
                    raise ValueError(format_error) from err
            else:
                try:
                    index = int(pos)
                except ValueError as err:
                    raise ValueError(format_error) from err
                # Raised outside the try block so that this message is not
                # replaced by the generic format error.
                if index < 0:
                    raise ValueError('ERROR: The parameters ' + str(index) +
                                     ' from --list-of-pos must be positive.')
                list_of_allowed_dvccc_ids.append(index)
        list_of_allowed_dvccc_ids = np.array(list_of_allowed_dvccc_ids)

    try:
        file_counter = 0
        # Maps every target path used so far to the checksum linked there.
        saved_files = {}
        for branch in repo.brancher(all_branches=True):
            outs = []
            branch_names = []

            if branch.lower() != 'working tree':
                # check if this is a result branch:
                is_dvccc_result_branch = branch.startswith('rcc_')

                # search for all output files in the current branch
                is_branch_of_interest1 = args.regex_name_of_branch is None or re.match(
                    args.regex_name_of_branch, branch)
                is_branch_of_interest2 = args.exclude_regex_name_of_branch is None or not re.match(
                    args.exclude_regex_name_of_branch, branch)

                is_allowed_dvccc_id = True
                if list_of_allowed_dvccc_ids is not None and is_dvccc_result_branch:
                    if not int(
                            branch.split('_')[1]) in list_of_allowed_dvccc_ids:
                        is_allowed_dvccc_id = False

                if is_branch_of_interest1 and is_branch_of_interest2 and is_dvccc_result_branch and is_allowed_dvccc_id:
                    print(branch)
                    g.checkout(branch)
                    #TODO: This would be nice, but its too sloow!
                    try:
                        repo.checkout()
                    except Exception:
                        print('Some files are missing.')
                    print('\tIt is a branch of interest!')
                    #TODO: repo.stages is very slow!
                    for stage in repo.stages:
                        for out in stage.outs:
                            valid_msg = check_out_if_its_valid(
                                out, args.regex_name_of_file,
                                args.exclude_regex_name_of_file,
                                not args.forbid_dir)
                            print('\t\t\t', out, valid_msg)
                            if valid_msg == 'not_in_local_cache' and args.download_stages:
                                g.pull()
                                try:
                                    repo.pull(stage.relpath)
                                except Exception:
                                    print('Some files are missing.')
                                time.sleep(1)
                                # Re-check after the download attempt.
                                valid_msg = check_out_if_its_valid(
                                    out, args.regex_name_of_file,
                                    args.exclude_regex_name_of_file,
                                    not args.forbid_dir)
                                print(valid_msg)
                            if valid_msg == 'valid':
                                outs.append(out)
                                branch_names.append(branch)
                            elif valid_msg == 'not_created' and not args.no_warning:
                                print(
                                    'Warning: A output file of interest has not yet been created. '
                                    + '(file: ' + str(out) + '; branch: ' +
                                    branch + ')')
                            elif valid_msg == 'not_in_local_cache' and not args.no_warning:
                                print(
                                    'Warning: A output file of interest is not on the local cache. '
                                    + '(file: ' + out.cache_path +
                                    '; branch: ' + branch +
                                    ')\n You can use this script with -d and it will download the missing stage.'
                                )

            # create a link for each output file of interest in the current branch
            for out, branch_name in zip(outs, branch_names):
                # create the output file name
                if not args.original_name:
                    out_filename = branch_name + '_' + str(out).replace(
                        '/', '_').replace('\\\\', '_')
                else:
                    out_filename = str(out).replace('/', '_').replace(
                        '\\\\', '_')
                out_filepath = os.path.join(repo.root_dir, path_to_output,
                                            out_filename)

                # Find a free target name: reuse the name when the same
                # checksum is already linked there, otherwise try the
                # suffixes _2, _3, ...
                file_was_already_saved = False
                renamer_index = 2
                file_can_be_saved = False
                tmp_out_filepath = out_filepath
                while not file_can_be_saved and not file_was_already_saved:
                    if tmp_out_filepath not in saved_files:
                        file_can_be_saved = True
                        out_filepath = tmp_out_filepath
                        saved_files[out_filepath] = out.checksum
                    elif saved_files[tmp_out_filepath] == out.checksum:
                        file_was_already_saved = True
                    else:
                        tmp_out_filepath = out_filepath + '_' + str(
                            renamer_index)
                        renamer_index += 1

                if file_can_be_saved:
                    if args.debug:
                        print(out.cache_path, ' -> ', out_filepath)
                    if args.no_save is False:
                        if out.isfile():
                            os.link(out.cache_path, out_filepath)
                        elif out.isdir():
                            os.mkdir(out_filepath)
                            for cache in out.dir_cache:
                                dirfile_cache_path = repo.cache.local.get(
                                    cache['md5'])
                                dirfile_outpath = os.path.join(
                                    out_filepath, cache['relpath'])
                                os.makedirs(
                                    os.path.dirname(dirfile_outpath),
                                    exist_ok=True)
                                os.link(dirfile_cache_path, dirfile_outpath)
                    file_counter += 1

        print(
            str(file_counter) + ' files are linked to ' + path_to_output +
            '.')

    # return always to the starting branch!
    finally:
        g.checkout(starting_branch)
        try:
            repo.checkout()
        except Exception:
            print('Some files are missing.')
def main():
    """Draw one dependency graph per DVC pipeline, with the stages on a circle.

    Stages are placed on the unit circle in reversed topological order.
    Stages reported by ``dvcrepo.status`` are drawn a second time in a
    different colour. With ``--path`` each figure is saved as
    ``<path>_<n>.jpg``; without it the figures are shown.
    """
    parser = argparse.ArgumentParser(description='With this script you can visualize your Data Version Control (DVC) - Pipeline.')
    parser.add_argument('-p', '--path', type=str, default=None,
                        help='The path to save the graphs. If this is not set it will plot the dependencies.')
    parser.add_argument('--path-to-repository', type=str, default=None,
                        help='The path to the repository. If this is not set, it will use the current dir.')
    parser.add_argument('--figure-size', type=int, default=15,
                        help='The size of the matplotlib figure that gets created with this script.')
    parser.add_argument('--ignore-outputs', action='store_true', default=False,
                        help='Currently this have no meaning!')
    parser.add_argument('--ignore-dependencies', action='store_true', default=False,
                        help='Currently this have no meaning!')
    args = parser.parse_args()

    # Open the repository; the current directory is the fallback.
    repository_path = '.' if args.path_to_repository is None else args.path_to_repository
    dvcrepo = DVCRepo(repository_path)

    for pipe_index, graph in enumerate(dvcrepo.pipelines):
        figure_number = pipe_index + 1

        # One figure per pipeline.
        plt.figure(figsize=(args.figure_size, args.figure_size))
        plt.title('Dependency-Graph ' + str(figure_number))

        # Label the nodes by the repository path of their dvc file instead
        # of by the stage object itself.
        path_of_stage = {stage: stage.path_in_repo for stage in graph.nodes()}
        graph = nx.relabel_nodes(graph, path_of_stage, copy=True)

        # Stages that the status call reports.
        status_of_stages = list(dvcrepo.status(targets=graph.nodes(), with_deps=True))

        # Shorten the node names for the plot; the mapping is filled in both
        # directions.
        mapping = {}
        for old_name in graph.nodes():
            display_name = rename_stage_names(old_name)
            mapping[old_name] = display_name
            mapping[display_name] = old_name
        graph = nx.relabel_nodes(graph, mapping, copy=True)

        # Reversed topological order determines the position on the circle.
        order = list(reversed(list(nx.topological_sort(graph))))

        # Spread the nodes evenly over the unit circle.
        rad_each_segment = 2. * np.pi / len(order)
        pos = {}
        for slot, node_name in enumerate(order):
            rad = rad_each_segment * float(slot)
            pos[node_name] = np.array([np.sin(rad), np.cos(rad)])

        # Half the distance between two neighbouring nodes on the circle.
        optimal_radius = np.power(np.power(np.sin(rad_each_segment),2.)+np.power(1-np.cos(rad_each_segment),2.),0.5) / 2.

        # Positions of the nodes that are drawn.
        pos_stages = {name: pos[name] for name in pos if name in order}

        # Fix the axis limits and rescale the radius accordingly.
        _, _, dist = set_plt_lim(pos_stages, optimal_radius)
        optimal_radius /= dist

        #TODO: the size of each component can be a parameter!
        options = {
            'node_color': '#6FB98F',
            'node_size': 80000 * np.power(optimal_radius,2.0) * np.pi,  # area-like value
            'width': 20 * optimal_radius,  # width of arrow; linear value
            'arrowstyle': '-|>',
            'arrowsize': 50 * optimal_radius,  # width of head of arrow; linear value
            'font_size': 40 * optimal_radius  # linear value
        }

        # First layer: all stages.
        nx.draw_networkx_nodes(graph, pos=pos_stages, nodelist=order, **options, label='Stages executed')

        # Second layer: the reported stages in a different colour.
        changed_status_mapped = [mapping[name] for name in status_of_stages]
        print(changed_status_mapped)
        options['node_color'] = '#FB6542'
        nx.draw_networkx_nodes(graph, pos=pos_stages, nodelist=changed_status_mapped, **options,
                               label='Stages to be executed')

        # Edges and labels.
        nx.draw_networkx_edges(graph, pos_stages, **options)
        nx.draw_networkx_labels(graph, pos_stages, **options)

        #TODO change the legend and add parameters!
        plt.legend(scatterpoints=1, markerscale=0.1)

        if args.path is not None:
            plt.savefig(args.path + '_' + str(figure_number) + '.jpg')

    if args.path is None:
        plt.show()