コード例 #1
0
ファイル: main.py プロジェクト: KarakoA/dvc-cc
def find_all_dvc_leafs(args_dvc_files):
    """Determine which DVC files should be executed.

    While the DVC repository is inspected, every ``dvc/.hyperopt/*.hyperopt``
    file is temporarily moved to ``dvc/<name>.dvc``; the files are moved back
    in a ``finally`` clause, so the rename is undone even when the inspection
    raises.

    Args:
        args_dvc_files: ``None`` to let ``get_leafs_that_need_to_reproduce``
            pick the files, otherwise an object whose ``dvc_files`` attribute
            is a string of comma separated branches, each branch being a
            ``|`` separated list of ``.dvc`` file names.

    Returns:
        A tuple ``(dvc_files, list_of_hyperopt_files, Gs)`` where ``dvc_files``
        is a list of lists of file names, ``list_of_hyperopt_files`` holds the
        names of the temporarily renamed hyperopt files and ``Gs`` is
        ``dvcrepo.pipelines``.

    Raises:
        ValueError: If a file name passed via ``args_dvc_files`` does not end
            with ``.dvc``.
    """
    hyperopt_dir = str(Path('dvc/.hyperopt'))

    # Expose the hyperopt files as regular .dvc files for the inspection below.
    if os.path.exists('dvc') and os.path.exists(hyperopt_dir):
        list_of_hyperopt_files = [
            name for name in os.listdir(hyperopt_dir) if name.endswith('.hyperopt')
        ]
        for name in list_of_hyperopt_files:
            hidden_path = str(Path('dvc/.hyperopt/' + name))
            # name[:-9] strips the '.hyperopt' suffix.
            exposed_path = str(Path('dvc/' + name[:-9] + '.dvc'))
            os.rename(hidden_path, exposed_path)
            print(hidden_path, exposed_path)
    else:
        list_of_hyperopt_files = []

    try:
        dvcrepo = DVCRepo('.')
        Gs = dvcrepo.pipelines

        if len(Gs) == 0:
            dvc_files = []
        elif args_dvc_files is None:
            dvc_files = get_leafs_that_need_to_reproduce(dvcrepo, Gs)
        else:
            dvc_files = []
            requested_branches = args_dvc_files.dvc_files.replace(' ', '').split(',')
            for branch_spec in requested_branches:
                branch_files = []
                dvc_files.append(branch_files)
                for file_name in branch_spec.split('|'):
                    branch_files.append(file_name)
                    if not file_name.endswith('.dvc'):
                        raise ValueError('Error: You define with -f which dvc files you want to exec. One or more files does not ends with .dvc. Please use only DVC files.')
    finally:
        # Always move the hyperopt files back to their hidden location.
        for name in list_of_hyperopt_files:
            exposed_path = str(Path('dvc/' + name[:-9] + '.dvc'))
            if os.path.exists(exposed_path):
                os.rename(exposed_path, str(Path('dvc/.hyperopt/' + name)))
            else:
                print(bcolors.WARNING+'Warning: File ' + exposed_path + ' not found.'+bcolors.ENDC)
    return dvc_files, list_of_hyperopt_files, Gs
コード例 #2
0
ファイル: main.py プロジェクト: deep-projects/dvc-cc
def cmd_paraneter_to_dagshub_paramfile(file=None):
    """Print the command line parameters of all pipeline stages as ``key: value``.

    The command of every node of every pipeline of the DVC repository in the
    current directory is split on whitespace and interpreted as follows:

    * ``python ... script.py [--name value | -n value | value] ...``:
      every value following a ``--name``/``-name`` flag is stored under
      ``<script>_<name>``; values without a preceding flag are stored under
      ``<script>_unnamed_<k>``. ``<script>`` is the file name without ``.py``
      and with ``-`` removed.
    * ``papermill ... -p name value ...``: stored under ``name``.

    Stages without a command are skipped, and a flag left without a value at
    the end of one command is not carried over into the next command.

    Args:
        file: File-like object handed to ``print``; ``None`` prints to stdout.
    """
    from dvc.repo import Repo as DVCRepo
    dvcrepo = DVCRepo('.')

    parameters = {}
    for pipe in dvcrepo.pipelines:
        for node in pipe.nodes:
            # Stages may have no command; there is nothing to parse then.
            tokens = node.cmd.split() if node.cmd else []
            if not tokens:
                continue

            # Parsing state is per command so nothing leaks between stages.
            pending_name = None
            unnamed_count = 0

            if tokens[0] == 'python':
                script_name = None
                for token in tokens[1:]:
                    # Only tokens after the script file are script parameters.
                    if script_name is not None:
                        if token.startswith('--'):
                            pending_name = token[2:]
                        elif token.startswith('-'):
                            pending_name = token[1:]
                        elif pending_name is None:
                            unnamed_count += 1
                            parameters[script_name + '_unnamed_' + str(unnamed_count)] = token
                        else:
                            parameters[script_name + '_' + pending_name] = token
                            pending_name = None

                    if token.endswith('.py'):
                        script_name = token[:-3].split('/')[-1].replace('-', '')
            elif tokens[0] == 'papermill':
                # position: None = outside "-p", 1 = expect name, 2 = expect value
                position = None
                for token in tokens[1:]:
                    if token == '-p':
                        position = 1
                    elif position == 1:
                        pending_name = token
                        position = 2
                    elif position == 2:
                        position = None
                        parameters[pending_name] = token

    for key, value in parameters.items():
        print(key + ': ' + str(value), file=file)
コード例 #3
0
ファイル: main.py プロジェクト: deep-projects/dvc-cc
def get_command_list_in_right_order():
    """Return the commands of the DVC stages, one per line, in pipeline order.

    The first pipeline graph of the DVC repository in the current directory is
    peeled layer by layer: all nodes without outgoing edges are collected and
    removed, until the graph is empty. The repository stages are then sorted
    by their position in that collection, and the commands of all stages that
    have one are concatenated, each followed by a newline.

    Returns:
        str: The newline terminated commands, or ``''`` if no stage has one.
    """
    from dvc.repo import Repo as DVCRepo
    repo = DVCRepo('.')

    graph = repo.pipelines[0]

    peeled_nodes = []
    while len(graph.nodes()) > 0:
        # Nodes with out-degree 0 form the current outermost layer.
        layer = [candidate for candidate in graph.nodes() if graph.out_degree(candidate) == 0]
        peeled_nodes.extend(layer)
        for member in layer:
            graph.remove_node(member)

    ordered_stages = sorted(repo.stages, key=lambda stage: peeled_nodes.index(stage))

    return ''.join(stage.cmd + '\n' for stage in ordered_stages if stage.cmd is not None)
コード例 #4
0
def main():
    """Start every experiment from ``.dvc_cc/cc_agency_experiments.yml`` without an id.

    The function pushes the current branch and all tags, then, for every entry
    of the experiments file whose ``id`` is ``None``:

    1. concatenates the entry's batch files and ``.dvc_cc/cc_config.yml`` into
       ``.dvc_cc/tmp.red.yml``,
    2. runs ``faice exec`` on it and takes the last whitespace separated token
       of its stdout as the experiment id,
    3. stores that id in the entry.

    If at least one id was obtained the experiments file is rewritten.
    Finally the experiments file is added, committed and pushed with git.
    If the experiments file does not exist only a warning is printed.
    """
    parser = ArgumentParser(description=DESCRIPTION)
    parser.parse_args()

    # The results are unused; the calls are kept because they may validate
    # that we are inside a git/DVC repository (presumably - confirm).
    get_main_git_directory_Path()
    GITRepo('.')
    DVCRepo('.')

    subprocess.call(['git', 'push'])
    subprocess.call(['git', 'push', 'origin', '--tags'])

    experiments_file = str(Path('.dvc_cc/cc_agency_experiments.yml'))
    if not os.path.exists(experiments_file):
        print(
            'Warning you did not define a job with dvc-cc run --no-exec. So there is no job to start.'
        )
        return

    with open(experiments_file, 'r') as stream:
        try:
            # An empty YAML document loads as None; treat it as "no experiments".
            experiments = yaml.safe_load(stream) or {}
        except yaml.YAMLError as exc:
            print(exc)
            raise SystemExit(1)

    start_an_experiment = False
    # Last batch file that was written; used in the commit message. Stays
    # empty when no batch file was processed at all.
    last_path = ''
    for k in experiments:
        if experiments[k]['id'] is not None:
            continue

        print('Start job ' + k)
        # write all to tmp.red.yml
        with open(str(Path('.dvc_cc/tmp.red.yml')), 'w') as f:
            print("batches:", file=f)
            for last_path in experiments[k]['files']:
                with open(str(Path(last_path)), "r") as r:
                    print(r.read(), file=f)
            with open(str(Path('.dvc_cc/cc_config.yml')), "r") as r:
                print(r.read(), file=f)

        # execute faice
        process = subprocess.Popen(
            ('faice exec .dvc_cc/tmp.red.yml').split(),
            stdout=subprocess.PIPE)
        stdout_tokens = process.communicate()[0].decode().split()
        os.remove('.dvc_cc/tmp.red.yml')

        if not stdout_tokens:
            # faice printed nothing, so there is no id to record for this job.
            print('Error: faice did not report an experiment ID for job ' + k)
            continue

        cc_id = stdout_tokens[-1]
        print('The experiment ID is: ' + cc_id)

        # write cc_id to cc_agency_experiments.yml
        experiments[k]['id'] = cc_id
        start_an_experiment = True

    if start_an_experiment:
        with open(experiments_file, 'w') as outfile:
            yaml.dump(experiments, outfile, default_flow_style=False)

    # push the ids
    subprocess.call(['git', 'add', '.dvc_cc/cc_agency_experiments.yml'])
    subprocess.call([
        'git', 'commit', '-m',
        '\'Update cc_agency_experiments.yml: ' + last_path + '\''
    ])
    subprocess.call(['git', 'push'])
コード例 #5
0
def main():
    """Fetch one output file or folder from every ``rcc_*`` result branch.

    For each local branch whose name starts with ``rcc_`` (optionally limited
    to the ids given with ``--list-of-pos``) a sub folder named after the
    branch is created inside the output directory returned by
    ``create_output_dir`` and ``repo.get`` is asked to place the requested
    path of that branch revision there. Branches for which this fails are
    reported and their sub folder is removed again. If nothing could be
    fetched at all, the output directory itself is removed.

    Raises:
        ValueError: If an entry of ``--list-of-pos`` is neither a non-negative
            integer nor a ``start:stop[:step]`` slice of integers.
        SystemExit: With code 1 if no output directory could be created.
    """
    parser = ArgumentParser(description=DESCRIPTION)
    parser.add_argument(
        'path_to_output',
        help='The path to the output/metric file or folder that you want get.',
        type=str)
    parser.add_argument(
        '-p',
        '--list-of-pos',
        help=
        'A list of dvc-cc indizes that you want include in the display. You can also use slicing for example: 12:15:2 to use 12, 14.',
        nargs="+",
        type=str)
    parser.add_argument(
        '-e',
        '--print-error',
        help=
        'If this parameter is set, it will print the error message, why a file or folder could not be found.',
        action='store_true')
    args = parser.parse_args()

    repo = DVCRepo()
    g = Git()

    # Set the password only once!
    remote_name = repo.config['core']['remote']
    remote_settings = repo.config['remote'][remote_name]
    if 'ask_password' in remote_settings and remote_settings['ask_password']:
        remote_settings['password'] = getpass.getpass('Password for ' +
                                                      remote_settings['url'] +
                                                      ': ')
        remote_settings['ask_password'] = False

    path_to_output_clean = args.path_to_output.replace('./', '_').replace(
        '/', '_').replace('\\\\', '_')
    outputdir = create_output_dir(repo.root_dir, path_to_output_clean)
    print('##################', outputdir)
    if outputdir is None:
        raise SystemExit(1)

    list_of_allowed_dvccc_ids = None

    if args.list_of_pos is not None:
        list_of_allowed_dvccc_ids = []
        for pos in args.list_of_pos:
            is_slice = ':' in pos
            try:
                if is_slice:
                    bounds = np.array(pos.split(':'), dtype=int)
                    ids = list(np.arange(*bounds))
                else:
                    ids = [int(pos)]
            except Exception as err:
                raise ValueError(
                    'ERROR: The parameters ' + str(pos) +
                    ' from --list-of-pos must be an integer or a slicings. i.e.1: 12 14    i.e.2: 12:15:2'
                ) from err
            # Checked outside the try block so this message is not replaced
            # by the generic parse error above.
            if not is_slice and ids[0] < 0:
                raise ValueError(
                    'ERROR: The parameters ' + str(pos) +
                    ' from --list-of-pos must be positive.')
            list_of_allowed_dvccc_ids.extend(ids)

        list_of_allowed_dvccc_ids = np.array(list_of_allowed_dvccc_ids)

    all_branches = [
        b.split('/')[-1] for b in g.branch('-a').split()
        if b.startswith('rcc_')
    ]
    all_branches = np.unique(all_branches)

    for b in all_branches:
        # The second "_"-separated field of the branch name is its dvc-cc id.
        if list_of_allowed_dvccc_ids is not None and int(
                b.split('_')[1]) not in list_of_allowed_dvccc_ids:
            continue

        print('dvc get : ', '.', args.path_to_output,
              str(Path(outputdir + '/' + b)), b)

        # A trailing ".dvc" of the branch name is not part of the folder name.
        folder_name = b[:-4] if b.endswith('.dvc') else b
        path_to_output = str(Path(outputdir + '/' + folder_name))
        print(path_to_output)

        # Only clean up a folder that this iteration created itself.
        created_folder = False
        try:
            os.mkdir(path_to_output)
            created_folder = True
            repo.get('.',
                     args.path_to_output,
                     out=str(
                         Path(path_to_output + '/' + path_to_output_clean)),
                     rev=b)
            # Remove leftover "*dvc-erepo" entries from /tmp.
            for p in os.listdir('/tmp'):
                if p.endswith('dvc-erepo'):
                    shutil.rmtree('/tmp/' + p)
        except Exception as ex:
            print('File was not found.')
            if created_folder:
                # The folder may contain partial results, so rmdir is not enough.
                shutil.rmtree(path_to_output, ignore_errors=True)
            if args.print_error:
                print(ex)

    fetched_entries = os.listdir(str(Path(outputdir)))
    print()
    print('Found ' + str(len(fetched_entries)) + ' files or folders.')
    if len(fetched_entries) == 0:
        print('Folder was removed.')
        os.rmdir(str(Path(outputdir)))
    print(
        'If files are missing, please use "dvc-cc git sync" to get new result branches and repeat this command.'
    )
コード例 #6
0
def main():
    """Interactively (or via presets) configure DVC-CC for the current repository.

    Without ``--htw-student``/``--htw-staff`` the user is asked for the number
    of GPUs, RAM, docker image, batch concurrency limit, engine, DVC remote
    server, remote folder and remote user name. With one of the two flags
    fixed default values are used and only the user name is requested.

    Afterwards the function resets the DVC config files, registers the remote
    ``dvc_connection`` through the ``dvc`` command line tool, tries to create
    the remote folder over ssh, and writes ``.dvc_cc/cc_config.yml`` with
    ``create_cc_config_file`` before adding it to git.
    """
    parser = ArgumentParser(description=DESCRIPTION)
    parser.add_argument('--htw-student', help='If this parameter is set, it will not ask the user to set the values. '
                                             'All values will set by default values.',default=False, action='store_true')
    parser.add_argument('--htw-staff', help='If this parameter is set, it will not ask the user to set the values. '
                                             'All values will set by default values.',default=False, action='store_true')

    parser.add_argument('--stderr-in-same-file',
                        help='If you do not want a own file for stdout and stderr you need to set this flag. If this flag is set, it will use the same file for both stdout and stderr.',
                        default=False, action='store_true')

    args = parser.parse_args()

    gitrepo,gitowner,gitname = get_gitinformation()

    if not args.htw_student and not args.htw_staff:
        # Fully interactive mode: ask for every setting.
        print('These settings refer to the required hardware resources in the cluster.')
        print('If you do not set an argument it will take the default values.')

        print()
        print('Please enter the number of GPUs that you want on the cluster. Hint: In the most Deep Learning '
              'scripts, you want to use 1 GPU in the docker container.')
        # Re-prompt until the answer is empty (default) or a non-negative integer.
        num_of_gpus = None
        while num_of_gpus is None:
            num_of_gpus = input(bcolors.OKBLUE+'\tNumber of GPUs'+bcolors.ENDC+' (default 0): ')
            if num_of_gpus == '':
                num_of_gpus = 0
            elif num_of_gpus.isdigit():
                num_of_gpus = int(num_of_gpus)
            else:
                print(bcolors.FAIL + '\tWarning: Did not understand your answer. Please use integer values i.e. 0,1,2,3,...' + bcolors.ENDC)
                num_of_gpus = None

        print()
        print('Please enter the RAM that you want on the cluster.')
        # The answer is entered in GB and stored multiplied by 1000.
        ram = None
        while ram is None:
            ram = input(bcolors.OKBLUE+'\tRAM in GB'+bcolors.ENDC+' (default 20): ')
            if ram == '':
                ram = 20000 # 20 GB
            elif ram.isdigit():
                ram = int(ram)*1000
            else:
                print(bcolors.FAIL + '\tWarning: Did not understand your answer. Please use integer values i.e. 10,100,...'+bcolors.ENDC)
                ram = None

        print()
        print('Please enter the Docker Image in which your script gets executed at the cluster.')
        print('   You can choose from the following:')
        print('     - "large", if you want to work with PyTorch 1.2 or/and TensorFlow 2.')
        print('   You can also enter a URL to your own Docker Image.')
        print('   If you need more informations take a look at the following site: https://bit.ly/2mgbiVK')
        docker_image = input(bcolors.OKBLUE+'\tDocker Image'+bcolors.ENDC+' (default: "large"): ')
        if docker_image == '' or docker_image.lower() == 'large':
            docker_image = 'docker.io/deepprojects/dvc-cc-large:10.2'
            docker_image_needs_credentials = False
        else:
            # Custom image: additionally ask whether credentials are required.
            docker_image_needs_credentials = None
            while docker_image_needs_credentials is None:
                docker_image_needs_credentials = input('\tDoes this docker image needs '
                                                       ''+bcolors.OKBLUE+'credentials'+bcolors.ENDC+'? [y,n]:')
                if docker_image_needs_credentials.lower().startswith('y'):
                    docker_image_needs_credentials = True
                elif docker_image_needs_credentials.lower().startswith('n'):
                    docker_image_needs_credentials = False
                else:
                    print(bcolors.FAIL+'\tWarning: Did not understand your answer. Please use y or n.'+bcolors.ENDC)
                    docker_image_needs_credentials = None
        print('You will use the Docker Image: '+ docker_image)
        print()
        batch_concurrency_limit = None
        print('The batch concurrency limit describes how many jobs you can start in parallel.')
        print('You can lower the number to 1, if you do not want the jobs from one experiment runs in parallel.')
        while batch_concurrency_limit is None:
            batch_concurrency_limit = input(bcolors.OKBLUE+'\tBatch concurrency limit'+bcolors.ENDC+' (default 12): ')
            if batch_concurrency_limit == '':
                batch_concurrency_limit = 12
            elif batch_concurrency_limit.isdigit():
                batch_concurrency_limit = int(batch_concurrency_limit)
            else:
                print(bcolors.FAIL+'\tWarning: Did not understand your answer. Please use integer values i.e. 1,4,12,...'+bcolors.ENDC)
                batch_concurrency_limit = None

        print()
        print('The name of the engine you want to use. This describes the cluster that you want to use.')
        print('At the HTW we have the engines "dt", "cc" and "cctest".')

        engine = input('\tThe '+bcolors.OKBLUE+'engine'+bcolors.ENDC+' you want to use (default: dt): ')
        # The three known short names all map to the 'ccagency' engine with
        # a different URL; any other answer is kept as the engine name.
        if engine == '' or engine == 'dt':
            engine = 'ccagency'
            engine_url = 'https://agency.f4.htw-berlin.de/dt'
        elif engine == 'cc':
            engine = 'ccagency'
            engine_url = 'https://agency.f4.htw-berlin.de/cc'
        elif engine == 'cctest':
            engine = 'ccagency'
            engine_url = 'https://agency.f4.htw-berlin.de/cctest'
        else:
            print('\tThis engine is unknown. Please specify the engine-url:')
            engine_url = input('The ' + bcolors.OKBLUE + 'engine-url' + bcolors.ENDC + ' you want to use: ')

        print('You will use the engine "' +engine+'" with the url "'+engine_url+'".')

        print()
        print('All large files created by your script and defined as output files by DVC are stored on the DVC server.')
        print('At the HTW we have the storage server "dt1" and "avocado01".')
        dvc_remote_server = input('\tThe remote '+bcolors.OKBLUE+'DVC server'+bcolors.ENDC+' that you want use ('
                                                                                          'default: dt1): ')
        # Known short names are expanded to host names; anything else is used verbatim.
        if dvc_remote_server == '' or dvc_remote_server.lower() == 'dt' or dvc_remote_server.lower() == 'dt1':
            dvc_remote_server = 'dt1.f4.htw-berlin.de'
        elif dvc_remote_server.lower() == 'avocado' or dvc_remote_server.lower() == 'avocado01':
            dvc_remote_server = 'avocado01.f4.htw-berlin.de'
        print('You will use the following DVC server "' + dvc_remote_server + '".')


        print()
        print('Here you can enter the folder where you want to store the DVC files on the DVC Storage Server.')
        if dvc_remote_server == 'avocado01.f4.htw-berlin.de':
            dvc_folder_default_value = '/data/ldap/Data-Version-Control-Cache/' + gitrepo + '/' + gitowner + '/' + \
                                       gitname
        else:
            dvc_folder_default_value = '~/' + gitrepo + '/' + gitowner + '/' + gitname
        dvc_remote_path = input('\tThe remote '+bcolors.OKBLUE+'DVC folder'+bcolors.ENDC+' that you want use ('
                                                                                         'default: '+dvc_folder_default_value+'): ')
        if dvc_remote_path == '':
            dvc_remote_path = dvc_folder_default_value

        print()
        print('The username with that you can access the DVC storage server "'+dvc_remote_server+'".')
        dvc_remote_user = input('\tThe '+bcolors.OKBLUE+'username'+bcolors.ENDC+' for the remote DVC folder: ')
        if dvc_remote_user == '':
            # NOTE(review): the y/n answer itself is stored in dvc_remote_user,
            # so answering "y" leaves that answer as the user name - confirm
            # whether an empty user name was intended here.
            dvc_remote_user = input('Do you really want to use the connection to the remote dvc folder without credentials? [n,y]')
            if not dvc_remote_user.lower().startswith('y'):
                dvc_remote_user = input('The username for the remote DVC folder: ')
        print()
    elif args.htw_student:
        # set default values
        num_of_gpus = 1 ##
        ram = 60000
        docker_image = 'docker.io/deepprojects/dvc-cc-large:10.2'
        docker_image_needs_credentials = False
        batch_concurrency_limit = 12
        engine = 'ccagency'
        engine_url = 'https://agency.f4.htw-berlin.de/dt'
        dvc_remote_server = 'dt1.f4.htw-berlin.de'
        dvc_remote_path = '~/' + gitrepo + '/' + gitowner + '/' + gitname

        valid_matriculation_number = False

        print(bcolors.OKBLUE+'Information: The matriculation number is used to access the dt1-storage server and the curious containers '
              'agency. If you get asked for dt1_f4_htw_berlin_de_username or agency_username, please use your matriculation number. '
              'The password for agency_password and dt1_f4_htw_berlin_de_password is the password you received to access '
              'the curious containers agency.'+bcolors.ENDC)

        # Accept only answers of the form "s0" followed by digits.
        while valid_matriculation_number == False:
            dvc_remote_user = input('\tPlease fill in your matriculation number (i.e. s0XXXXXX): ').strip()
            if dvc_remote_user.startswith('s0') and dvc_remote_user[2:].isdigit():
                valid_matriculation_number = True
            else:
                print('This is not a valid matriculation number.')
    else:
        # set default values
        num_of_gpus = 1 ##
        ram = 180000
        docker_image = 'docker.io/deepprojects/dvc-cc-large:10.2'
        docker_image_needs_credentials = False
        batch_concurrency_limit = 12
        engine = 'ccagency'
        engine_url = 'https://agency.f4.htw-berlin.de/cc'
        dvc_remote_server = 'avocado01.f4.htw-berlin.de'
        dvc_remote_path = '/data/ldap/Data-Version-Control-Cache/' + gitrepo + '/' + gitowner + '/' + gitname

        valid_matriculation_number = False

        # NOTE(review): the next line looks damaged ('******' appears to be
        # the residue of a scrubbing tool that removed text in between) and
        # is not valid Python as written - restore it from the upstream
        # repository before using this function.
        dvc_remote_user = input('\tPlease fill in your ldap username: '******'.')
    try:
        # Start from a clean DVC configuration.
        if os.path.exists(str(Path('.dvc/config'))):
            os.remove('.dvc/config')
        if os.path.exists(str(Path('.dvc/config.local'))):
            os.remove('.dvc/config.local')

        dvcrepo = DVCRepo('.')

        #TODO: this can be removed!?
        if not os.path.exists('.dvc'):
            dvcrepo.init()
    except:
        # Fall back to the command line tool if the steps above failed.
        subprocess.call(['dvc', 'init'])
        dvcrepo = DVCRepo('.')

    # Expand a leading "~" to the server specific home directory of the user.
    if dvc_remote_path.startswith('~'):
        if dvc_remote_server == 'dt1.f4.htw-berlin.de':
            dvc_remote_path = '/mnt/md0/' + dvc_remote_user + dvc_remote_path[1:]
        else:
            dvc_remote_path = '/home/'+ dvc_remote_user + dvc_remote_path[1:]


    # set remote dvc connection
    if dvc_remote_user == '':
        subprocess.call(
            ['dvc', 'remote', 'add', '--force', '-d', 'dvc_connection', 'ssh://' + dvc_remote_server + ':' + dvc_remote_path])
        subprocess.call(['dvc', 'remote', 'modify', 'dvc_connection', 'ask_password', 'false'])
    else:
        subprocess.call(['dvc', 'remote', 'add', '--force', '-d', 'dvc_connection',
                         'ssh://' + dvc_remote_user + '@' + dvc_remote_server + ':' + dvc_remote_path])
        subprocess.call(['dvc', 'remote', 'modify', 'dvc_connection', 'ask_password', 'true'])

    # Create the remote folder and set its permissions/default ACL over ssh.
    # NOTE(review): subprocess.call does not raise on a non-zero exit status,
    # so the warning below is presumably only reached when ssh itself cannot
    # be started - confirm.
    try:
        subprocess.call(['ssh', dvc_remote_user + '@' + dvc_remote_server, "mkdir -p "+dvc_remote_path+" ; chmod 774 "+dvc_remote_path+" ; setfacl -d -m u::rwX,g::rwX,o::- "+dvc_remote_path])
    except:
        print(bcolors.WARNING+'Warning: Currently acl is not installed on the server! You will maybe have problems by sharing the same remote dvc folder!'+bcolors.ENDC)


    # create the main folder of the dvc_cc software package.
    if not os.path.exists('.dvc_cc'):
        os.mkdir('.dvc_cc')
    
    # create the config file (an existing one is replaced).
    if os.path.exists(str(Path('.dvc_cc/cc_config.yml'))):
        os.remove('.dvc_cc/cc_config.yml')

    create_cc_config_file(num_of_gpus,ram,docker_image, docker_image_needs_credentials, batch_concurrency_limit,
                          engine, engine_url, args.stderr_in_same_file)
    subprocess.call(['git', 'add', '.dvc_cc/cc_config.yml'])
コード例 #7
0
# Excerpt: `parser` is created outside of this fragment.
# Optional selection of result ids, given as integers and/or slices.
parser.add_argument(
    '-p',
    '--list-of-pos',
    help=
    'A list of dvc-cc indizes that you want include in the display. You can also use slicing for example: 12:15:2 to use 12, 14.',
    nargs="+",
    type=str)
# Flag to additionally print the reason why something could not be fetched.
parser.add_argument(
    '-e',
    '--print-error',
    help=
    'If this parameter is set, it will print the error message, why a file or folder could not be found.',
    action='store_true')
args = parser.parse_args()

repo = DVCRepo()
g = Git()
# Name of the branch marked with "*" in the `git branch` output.
starting_branch = g.branch().split('*')[1].split('\n')[0][1:]

# Set the password only once!
# If the default remote is configured to ask for a password, ask for it here
# and store it in the in-memory settings with further prompting disabled.
remote_name = repo.config['core']['remote']
remote_settings = repo.config['remote'][remote_name]
if 'ask_password' in remote_settings and remote_settings['ask_password']:
    remote_settings['password'] = getpass.getpass('Password for ' +
                                                  remote_settings['url'] +
                                                  ': ')
    remote_settings['ask_password'] = False

# Turn the requested path into a flat name by replacing path separators
# with "_", then create the output directory from it.
path_to_output_clean = args.path_to_output.replace('./', '_').replace(
    '/', '_').replace('\\\\', '_')
outputdir = create_output_dir(repo.root_dir, path_to_output_clean)
コード例 #8
0
ファイル: main.py プロジェクト: deep-projects/dvc-cc
def main():
    """Entry point of ``dvc-cc git``: a thin wrapper around git.

    Behaviour depends on the command line arguments (``sys.argv[1:]``):

    * no arguments, ``-h`` or ``--help``: print the help text.
    * ``branch`` (as the only argument): print the ``git branch`` output
      without the ``rcc_*``/``cc_*`` branches.
    * ``sync [-d] [-l]``: stash local changes, fetch, and check out every
      remote branch that has no local counterpart. With ``-d`` the DVC data
      of each such branch is checked out and pulled as well; with ``-l`` the
      procedure repeats every 5 seconds until interrupted. The flags may be
      given in any order. Afterwards the original branch is restored and the
      stash, if one was created, is applied.
    * anything else: pass the arguments to git and run ``dvc checkout``.
    """
    argv = sys.argv[1:]
    if '-h' in argv or '--help' in argv or len(argv) == 0:
        print(DESCRIPTION)
        print()
        print('dvc-cc git branch:')
        print(
            '\tShows the branches without the automatic created branches from DVC-CC.'
        )
        print('dvc-cc git sync [-d] [-l]:')
        print('\tCreate local branches for all remote branches.')
        print('\t\t-d: Than it will download all files from the DVC-Server.')
        print(
            '\t\t-l: If this is set, than it will repeat every 5 seconds the script.'
        )
        print('\t\t\tYou can cancel it with CTRL+C.')
        print('dvc-cc git OTHER_GIT_COMMAND:')
        print(
            '\tEvery other git command will be piped directly to git. After it was called it will run '
            + bcolors.OKBLUE + 'dvc checkout' + bcolors.ENDC)
        print('\t\tto cleanup the repository')
    elif len(argv) == 1 and argv[0] == 'branch':
        hidden_prefixes = ('  rcc_', '  remotes/origin/rcc_', '  cc_',
                           '  remotes/origin/cc_')
        git_branch = check_output(['git', 'branch']).decode("utf8").split('\n')
        for line in git_branch:
            if not line.startswith(hidden_prefixes):
                print(line)
    elif argv[0] == 'sync':
        repo = DVCRepo()

        # Parse the flags once so every use below agrees on their meaning.
        options = argv[1:]
        download = '-d' in options
        loop = '-l' in options

        if download:
            # Ask for the remote password only once, before the loop.
            remote_name = repo.config['core']['remote']
            remote_settings = repo.config['remote'][remote_name]
            if 'ask_password' in remote_settings and remote_settings[
                    'ask_password']:
                remote_settings['password'] = getpass.getpass(
                    'Password for ' + remote_settings['url'] + ': ')
                remote_settings['ask_password'] = False

        git_name_of_branch = get_name_of_branch()

        # True when git reported that there was nothing to stash.
        nothing_stashed = check_output(
            ['git', 'stash']).decode().startswith('No local changes to save')

        subprocess.call(['git', 'fetch', '--all'])

        try:
            is_first_iteration = True
            while loop or is_first_iteration:

                if not is_first_iteration:
                    print(
                        'All remote branches were created locally. Wait 5 seconds for the next pull request. To cancel the script press CTRL+C.'
                    )
                    time.sleep(5)
                is_first_iteration = False

                check_output(["git", "pull"])

                all_branches = check_output(["git", "branch",
                                             '-a']).decode("utf8").split("\n")
                # Lines without "/" are local branches; the first two
                # characters are the "* "/"  " marker of `git branch`.
                all_branches_local = [
                    i[2:] for i in all_branches if len(i.split('/')) == 1
                ]
                all_branches_remote = [
                    i.split('/')[-1] for i in all_branches
                    if len(i.split('/')) > 1
                ]

                for b in all_branches_remote:
                    if b not in all_branches_local:
                        print('git checkout ' + b)
                        check_output(['git', 'checkout', b])

                        print('\t\tI CHECKOUT THE DATA')

                        if download:
                            try:
                                repo.checkout()
                            except Exception:
                                print('Some files are missing.')

                            print('\t\tI PULL THE DATA')
                            try:
                                repo.pull()
                            except Exception:
                                print('Some files are missing.')
        finally:
            # Always return to the branch the user started on.
            print('git checkout ' + git_name_of_branch)
            check_output(['git', 'checkout', git_name_of_branch])
            try:
                repo.checkout()
            except Exception:
                print('Some files are missing.')
            try:
                repo.pull()
            except Exception:
                print('Some files are missing.')
            if not nothing_stashed:
                check_output(['git', 'stash', 'apply'])
    else:
        subprocess.call(['git'] + argv)
        try:
            subprocess.call(['dvc', 'checkout'])
        except Exception:
            print('Some files are missing.')
コード例 #9
0
ファイル: main.py プロジェクト: KarakoA/dvc-cc
def _parse_list_of_pos(list_of_pos):
    """Expand the raw ``--list-of-pos`` values into an array of allowed ids.

    Every entry is either a single non-negative integer (``"12"``) or a
    slice in ``start:stop[:step]`` notation (``"12:15:2"``), which is expanded
    with ``np.arange``.

    Args:
        list_of_pos: The list of strings given on the command line, or
            ``None`` if the parameter was not used.

    Returns:
        ``None`` if ``list_of_pos`` is ``None`` (no filtering requested),
        otherwise a numpy array with all allowed ids.

    Raises:
        ValueError: If an entry is a negative integer, or is neither an
            integer nor a slice.
    """
    if list_of_pos is None:
        return None

    allowed_ids = []
    for raw_pos in list_of_pos:
        try:
            if ':' in raw_pos:
                slice_args = np.array(raw_pos.split(':'), dtype=int)
                allowed_ids.extend(np.arange(*slice_args))
                continue
            single_pos = int(raw_pos)
        except (TypeError, ValueError, ZeroDivisionError) as err:
            # Only parsing problems end up here. The message always shows the
            # text the user typed, not an intermediate converted value.
            raise ValueError(
                'ERROR: The parameters ' + str(raw_pos) +
                ' from --list-of-pos must be an integer or a slicings. i.e.1: 12 14    i.e.2: 12:15:2'
            ) from err

        # Checked outside of the try block so that this message is not
        # replaced by the generic parsing error above.
        if single_pos < 0:
            raise ValueError('ERROR: The parameters ' + str(single_pos) +
                             ' from --list-of-pos must be positive.')
        allowed_ids.append(single_pos)

    return np.array(allowed_ids)


def main():
    """Collect output files of DVC-CC result branches in one output folder.

    The function parses the command line, walks over all branches returned by
    ``repo.brancher(all_branches=True)`` and, for every branch whose name
    starts with ``rcc_`` and passes the branch filters, checks the branch out
    and inspects the outputs of all stages.  Every output accepted by
    ``check_out_if_its_valid`` is hard-linked (``os.link``) from its cache
    path into the output directory, unless ``--no-save`` is set.

    Outputs with the same target name but a different checksum get a numeric
    suffix (``_2``, ``_3``, ...); outputs with the same name and the same
    checksum are linked only once.

    The branch that was active at start is always checked out again at the
    end, even if an error occurs.

    Raises:
        SystemExit: With exit code 1 if the output directory could not be
            created (``create_output_dir`` returned ``None``).
        ValueError: If ``--list-of-pos`` contains an invalid entry.
    """
    parser = ArgumentParser(description=DESCRIPTION)
    parser.add_argument(
        '-f',
        '--regex-name-of-file',
        type=str,
        default=None,
        help='A regex of the name of the files that you want to find.')
    parser.add_argument(
        '-ef',
        '--exclude-regex-name-of-file',
        type=str,
        default=None,
        help='A regex of the name of the file that are excluded.')
    parser.add_argument(
        '-b',
        '--regex-name-of-branch',
        type=str,
        default=None,
        help='A regex of the name of the branches to be included in the search.'
    )
    parser.add_argument(
        '-eb',
        '--exclude-regex-name-of-branch',
        type=str,
        default=None,
        help='A regex of the name of the branch that are excluded.')
    parser.add_argument(
        '-pos',
        '--list-of-pos',
        help=
        'A list of dvc-cc indizes that you want include in the display. You can also use slicing for example: 12:15:2 to use 12, 14.',
        nargs="+",
        type=str)
    parser.add_argument('-p',
                        '--path-to-output',
                        type=str,
                        default=None,
                        help='The path where you want save the files.')
    parser.add_argument(
        '-o',
        '--original-name',
        dest='original_name',
        action='store_true',
        default=False,
        help=
        'In default, the branch name is added to the file or folder name. If this parameter is '
        'set,  it will use the original name of the file or folder. If the file exists multiple '
        'times and this parameter is set, then it will use indices at the end of the file or folder names.'
    )
    parser.add_argument('--debug',
                        dest='debug',
                        action='store_true',
                        default=False,
                        help='Print all files that are copied.')
    parser.add_argument(
        '-d',
        '--download-stages',
        dest='download_stages',
        action='store_true',
        default=False,
        help='Download a stage if the file is not in the local cache.')
    parser.add_argument(
        '-fd',
        '--forbid-dir',
        dest='forbid_dir',
        action='store_true',
        default=False,
        help='If this parameter is set, then it will ignore output folders.')
    parser.add_argument(
        '-ns',
        '--no-save',
        dest='no_save',
        action='store_true',
        default=False,
        help=
        'If true, it will not create a folder or link the file. This parameter is helpfull if it is used with --debug to test your regular expressions.'
    )
    parser.add_argument(
        '-nw',
        '--no-print-of-warnings',
        dest='no_warning',
        action='store_true',
        default=False,
        help=
        'If true, it will not print warning if a file is not created or not in the local cache.'
    )
    args = parser.parse_args()

    repo = DVCRepo()
    g = Git()
    # Take the text that follows the '*' marker in the branch listing up to
    # the end of that line and drop the single leading character (assumes the
    # usual "* <name>" format of `git branch`).
    starting_branch = g.branch().split('*')[1].split('\n')[0][1:]

    # Set the password only once!
    if args.download_stages:
        remote_name = repo.config['core']['remote']
        remote_settings = repo.config['remote'][remote_name]
        if 'ask_password' in remote_settings and remote_settings[
                'ask_password']:
            remote_settings['password'] = getpass.getpass(
                'Password for ' + remote_settings['url'] + ': ')
            remote_settings['ask_password'] = False

    if not args.no_save:
        path_to_output = create_output_dir(repo.root_dir, args.path_to_output)
        if path_to_output is None:
            # `exit()` is only injected by the `site` module; raising
            # SystemExit directly has the same effect and always works.
            raise SystemExit(1)
    else:
        path_to_output = 'NONE'

    list_of_allowed_dvccc_ids = _parse_list_of_pos(args.list_of_pos)

    try:
        file_counter = 0
        # Maps every used target path to the checksum of the output stored
        # there; used to detect name collisions between branches.
        saved_files = {}
        for branch in repo.brancher(all_branches=True):
            if branch.lower() == 'working tree':
                continue

            # only result branches are of interest
            if not branch.startswith('rcc_'):
                continue
            if (args.regex_name_of_branch is not None
                    and not re.match(args.regex_name_of_branch, branch)):
                continue
            if (args.exclude_regex_name_of_branch is not None
                    and re.match(args.exclude_regex_name_of_branch, branch)):
                continue
            # The second '_'-separated part of the branch name is read as the
            # numeric DVC-CC id.
            if (list_of_allowed_dvccc_ids is not None
                    and int(branch.split('_')[1])
                    not in list_of_allowed_dvccc_ids):
                continue

            print(branch)
            g.checkout(branch)
            # TODO: This would be nice, but it is too slow!
            # Best effort: a failing DVC checkout must not stop the search,
            # but Ctrl-C / SystemExit are allowed to propagate.
            try:
                repo.checkout()
            except Exception:
                print('Some files are missing.')

            print('\tIt is a branch of interest!')

            # search for all output files in the current branch
            outs = []
            # TODO: repo.stages is very slow!
            for stage in repo.stages:
                for out in stage.outs:
                    valid_msg = check_out_if_its_valid(
                        out, args.regex_name_of_file,
                        args.exclude_regex_name_of_file, not args.forbid_dir)
                    print('\t\t\t', out, valid_msg)
                    if valid_msg == 'not_in_local_cache' and args.download_stages:
                        g.pull()
                        try:
                            repo.pull(stage.relpath)
                        except Exception:
                            print('Some files are missing.')
                        time.sleep(1)
                        # check again, now that the stage may be downloaded
                        valid_msg = check_out_if_its_valid(
                            out, args.regex_name_of_file,
                            args.exclude_regex_name_of_file,
                            not args.forbid_dir)
                        print(valid_msg)
                    if valid_msg == 'valid':
                        outs.append(out)
                    elif valid_msg == 'not_created' and not args.no_warning:
                        print(
                            'Warning: A output file of interest has not yet been created. '
                            + '(file: ' + str(out) + '; branch: ' + branch +
                            ')')
                    elif valid_msg == 'not_in_local_cache' and not args.no_warning:
                        print(
                            'Warning: A output file of interest is not on the local cache. '
                            + '(file: ' + out.cache_path + '; branch: ' +
                            branch +
                            ')\n You can use this script with -d and it will download the missing stage.'
                        )

            # create a link for each output file of interest in the current branch
            for out in outs:
                # Flatten the path of the output into one file name; both
                # '/' and a single '\' (Windows separator) are replaced.
                out_filename = str(out).replace('/', '_').replace('\\', '_')
                if not args.original_name:
                    out_filename = branch + '_' + out_filename
                out_filepath = os.path.join(repo.root_dir, path_to_output,
                                            out_filename)

                # Find a free target path: reuse nothing that holds another
                # checksum, skip the output if the same content was already
                # stored under one of the candidate names.
                candidate = out_filepath
                renamer_index = 2
                file_can_be_saved = False
                while True:
                    if candidate not in saved_files:
                        saved_files[candidate] = out.checksum
                        out_filepath = candidate
                        file_can_be_saved = True
                        break
                    if saved_files[candidate] == out.checksum:
                        break
                    candidate = out_filepath + '_' + str(renamer_index)
                    renamer_index += 1

                if not file_can_be_saved:
                    continue

                if args.debug:
                    print(out.cache_path, ' -> ', out_filepath)
                if not args.no_save:
                    if out.isfile():
                        os.link(out.cache_path, out_filepath)
                    elif out.isdir():
                        os.mkdir(out_filepath)
                        # link every file of the directory output separately
                        for cache in out.dir_cache:
                            dirfile_cache_path = repo.cache.local.get(
                                cache['md5'])
                            dirfile_outpath = os.path.join(
                                out_filepath, cache['relpath'])
                            os.makedirs(os.path.dirname(dirfile_outpath),
                                        exist_ok=True)
                            os.link(dirfile_cache_path, dirfile_outpath)

                file_counter += 1

        print(
            str(file_counter) + ' files are linked to ' + path_to_output + '.')

    # return always to the starting branch!
    finally:
        g.checkout(starting_branch)
        try:
            repo.checkout()
        except Exception:
            print('Some files are missing.')
コード例 #10
0
def main():
    """Draw every DVC pipeline of a repository as a circular dependency graph.

    For each graph in ``dvcrepo.pipelines`` a new matplotlib figure is
    created.  The stages are placed on the unit circle in reversed topological
    order, all stages are drawn in green and the stages reported by
    ``dvcrepo.status(..., with_deps=True)`` are drawn again in orange on top.

    If ``--path`` is given, each figure is written to
    ``<path>_<number>.jpg`` and closed afterwards; otherwise all figures are
    shown with ``plt.show()`` at the end.
    """
    parser = argparse.ArgumentParser(description='With this script you can visualize your Data Version Control (DVC) - Pipeline.')

    parser.add_argument('-p', '--path', type=str, default=None,
                        help='The path to save the graphs. If this is not set it will plot the dependencies.')

    parser.add_argument('--path-to-repository', type=str, default=None,
                        help='The path to the repository. If this is not set, it will use the current dir.')

    parser.add_argument('--figure-size', type=int, default=15,
                        help='The size of the matplotlib figure that gets created with this script.')

    parser.add_argument('--ignore-outputs', action='store_true', default=False,
                        help='Currently this have no meaning!')
    parser.add_argument('--ignore-dependencies', action='store_true', default=False,
                        help='Currently this have no meaning!')
    args = parser.parse_args()

    # Get the pipeline
    if args.path_to_repository is not None:
        dvcrepo = DVCRepo(args.path_to_repository)
    else:
        dvcrepo = DVCRepo('.')
    pipelines = dvcrepo.pipelines

    # for each pipeline of the graph (numbered from 1 for titles/file names)
    for pipe_number, g in enumerate(pipelines, start=1):
        # create a new figure for each pipeline
        plt.figure(figsize=(args.figure_size, args.figure_size))
        plt.title('Dependency-Graph ' + str(pipe_number))

        # use the path in repo to the dvc file instead of the data type "Stage"
        g = nx.relabel_nodes(g, {s: s.path_in_repo for s in g.nodes()}, copy=True)

        # get the status of each stage
        status_of_stages = list(dvcrepo.status(targets=g.nodes(), with_deps=True))

        # rename nodes for a better visualization of the node names in the plot.
        # The mapping is filled in both directions (old -> new and new -> old).
        mapping = {}
        for n in g.nodes():
            new_n = rename_stage_names(n)
            mapping[n] = new_n
            mapping[new_n] = n

        g = nx.relabel_nodes(g, mapping, copy=True)

        # Find optimal order for the stages
        order = list(reversed(list(nx.topological_sort(g))))

        # calc position for each node: spread them evenly over the unit circle
        rad_each_segment = 2. * np.pi / len(order)
        pos_stages = {}
        for i, n in enumerate(order):
            rad = rad_each_segment * float(i)
            pos_stages[n] = np.array([np.sin(rad), np.cos(rad)])

        # calculate the optimal radius of a stage-node for the unit circle:
        # half of the distance between two neighbouring nodes
        optimal_radius = np.power(np.power(np.sin(rad_each_segment), 2.) + np.power(1 - np.cos(rad_each_segment), 2.), 0.5) / 2.

        # set plt lim
        _, _, dist = set_plt_lim(pos_stages, optimal_radius)

        optimal_radius /= dist

        # set basic options for the drawing of the network
        # TODO: the size of each component can be a parameter!
        # The options are kept separately per draw function: every function
        # only gets the keywords it declares, so the calls do not depend on a
        # catch-all **kwds parameter of the networkx version in use.
        node_size = 80000 * np.power(optimal_radius, 2.0) * np.pi  # grows with the squared radius
        edge_options = {
            'node_size': node_size,  # passed so the edges are drawn for the same node size
            'width': 20 * optimal_radius,  # width of arrow; linear value
            'arrowstyle': '-|>',
            'arrowsize': 50 * optimal_radius,  # width of head of arrow; linear value
        }
        label_options = {
            'font_size': 40 * optimal_radius,  # linear value
        }

        # draw; all stages
        nx.draw_networkx_nodes(g, pos=pos_stages, nodelist=order,
                               node_color='#6FB98F', node_size=node_size,
                               label='Stages executed')

        # draw; all stages that are changed
        changed_status_mapped = [mapping[v] for v in status_of_stages]
        print(changed_status_mapped)
        nx.draw_networkx_nodes(g, pos=pos_stages, nodelist=changed_status_mapped,
                               node_color='#FB6542', node_size=node_size,
                               label='Stages to be executed')

        # draw edges
        nx.draw_networkx_edges(g, pos_stages, **edge_options)

        # draw labels
        nx.draw_networkx_labels(g, pos_stages, **label_options)

        # plot legend
        # TODO change the legend and add parameters!
        plt.legend(scatterpoints=1, markerscale=0.1)

        if args.path is not None:
            plt.savefig(args.path + '_' + str(pipe_number) + '.jpg')
            # The figure is not shown in this mode; release it right away so
            # that open figures do not pile up for repositories with many
            # pipelines.
            plt.close()

    if args.path is None:
        plt.show()