Example 1
def split_submit_job(files,
                     script_path,
                     job_name_pattern,
                     n_files,
                     additional_arguments='',
                     test=True):
    print(files)
    if test:
        files = files[:10]
        n_files = 2

    job_id = 0

    for file_list in tqdm(batch(files, n_files),
                          total=int(len(files) / n_files) + 1,
                          desc='Submitting jobs: '):
        job_name = job_name_pattern + str(job_id)
        pd.DataFrame({'file': file_list}).to_csv(job_name + '.csv')
        arguments = ' ' + job_name + '.csv ' + additional_arguments
        command = get_job_command(job_name, script_path, arguments)
        subprocess.run(command, shell=True)

        job_id = job_id + 1

    return len(files)
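Examples 1, 3, 5 and 6 iterate over batch(files, n_files), a chunking helper imported from dhfcorr.utils (see Example 5) but not shown in these snippets. A minimal sketch of its assumed behaviour, purely for illustration:

def batch(sequence, n=1):
    # Assumed behaviour of dhfcorr.utils.batch: yield successive slices
    # of at most n items from the input sequence.
    for start in range(0, len(sequence), n):
        yield sequence[start:start + n]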
Example 2
def submit_download_job(name, login, csv_file):
    commands = get_job_command(name,
                               definitions.ROOT_DIR + '/io/download_file.py',
                               login + ' ' + csv_file)
    subprocess.run(commands,
                   shell=True,
                   stdout=subprocess.DEVNULL,
                   stderr=subprocess.DEVNULL)
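All of these examples build their submission string with get_job_command(job_name, script_path, arguments), whose implementation is not shown here. Assuming it wraps the script in a batch-scheduler command, a hypothetical local stand-in (not the project's actual helper) could be:

def get_job_command(job_name, script_path, arguments, queue=None):
    # Hypothetical stand-in that simply runs the script with Python.
    # The real helper presumably builds a scheduler submission, with
    # job_name naming the job and queue selecting the queue (Example 4).
    return 'python ' + script_path + ' ' + arguments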
Example 3
def submit_merge_root_files(dataset_name, maximum_size, continue_pre, number_of_runs):
    print('Dataset name = ' + dataset_name)

    folder_root_files = definitions.DATA_FOLDER + '/root/' + dataset_name
    folder_to_save = definitions.DATA_FOLDER + '/root_merged/' + dataset_name

    if not os.path.isdir(definitions.DATA_FOLDER + '/root_merged/'):
        os.mkdir(definitions.DATA_FOLDER + '/root_merged/')

    if not os.path.isdir(folder_to_save):
        os.mkdir(folder_to_save)
    elif not continue_pre:  # Delete previous merge
        delete_files = subprocess.Popen('rm -f ' + folder_to_save + '/*', shell=True)
        delete_files.wait()

    print('Folder with root files: ' + folder_root_files)
    print('Target file size (GB): ' + str(maximum_size))

    files = glob.glob(folder_root_files + "/*.root")
    periods = {get_run_number(x) for x in files}

    print('The total number of files found is: ' + str(len(files)))
    print()
    print('Saving them to: ' + folder_to_save)

    if continue_pre:
        print("The merge will not delete files from previous iterations. Only new periods will be reprocessed.")
        files_already_merged = glob.glob(folder_to_save + "/*.root")

        period_already_merged = {get_run_number(x) for x in files_already_merged}
        if len(period_already_merged) == 0:
            print("No previous merged files.")
        else:
            print("Found " + str(len(period_already_merged)) + "periods already merged")
            print(period_already_merged)
        periods = periods - period_already_merged

    periods = list(periods)
    if len(periods) == 0:
        print("No periods to merge")

    job_id = 0

    for period in tqdm(list(batch(periods, number_of_runs)), desc='Submitting jobs'):
        job_name = dataset_name + '_merge_' + str(job_id)
        arguments = str(dataset_name) + ' --target_sizeGB ' + str(maximum_size) + ' -r '
        for p in period:
            arguments = arguments + str(p) + ' '
        command = get_job_command(job_name, definitions.ROOT_DIR + '/io/merge_root_files.py', arguments)
        subprocess.run(command, shell=True, stdout=subprocess.DEVNULL, stderr=subprocess.DEVNULL)
        job_id += 1

    return len(periods)
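A hypothetical call to submit_merge_root_files; the dataset name and numbers below are placeholders, not values from the original project:

n_periods = submit_merge_root_files('example_dataset',
                                    maximum_size=2,
                                    continue_pre=True,
                                    number_of_runs=5)
print('Submitted merge jobs for ' + str(n_periods) + ' periods')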
Example 4
def submit_train(dataset_name, yaml_config, prefix=None):
    d_cuts = configyaml.ConfigYaml(yaml_config)

    pt_bins = np.array(d_cuts.values['model_building']['bins_pt'])
    pt_bins = pd.cut(0.5 * (pt_bins[:-1] + pt_bins[1:]), bins=pt_bins)
    base_f = definitions.ROOT_DIR

    queue = d_cuts.values['model_building']['queue']

    for i in reversed(range(len(pt_bins))):
        arguments = str(i) + ' ' + str(dataset_name)
        if prefix is not None:
            arguments += ' --prefix ' + prefix

        command = get_job_command(dataset_name + '_t_pt_' + str(i), base_f + "/ml/train_lgb.py ", arguments,
                                  queue=queue)
        subprocess.run(command, shell=True)
Example 5
def submit_root_to_parquet(dataset_name, name_root, n_files):
    if name_root is None:
        name_root = dataset_name

    print('Configuration (root file) = ' + name_root)
    print('Dataset name (in this system) = ' + dataset_name)

    folder_root_files = definitions.DATA_FOLDER + '/root_merged/' + dataset_name
    print('Folder with root files: ' + folder_root_files)
    files = reader.find_missing_processed_files(dataset_name,
                                                'root_merged',
                                                'raw',
                                                None,
                                                full_file_path=True)

    from dhfcorr.utils import batch

    job_id = 0

    print()
    print("")

    for file_list in tqdm(batch(files, n_files),
                          total=int(len(files) / n_files) + 1,
                          desc='Submitting jobs: '):
        job_name = dataset_name + '_conv_' + str(job_id)

        script_path = definitions.ROOT_DIR + '/io/convert_to_parquet.py'
        arguments = name_root + ' ' + dataset_name + ' ' + job_name + '.csv'

        pd.DataFrame({'file': file_list}).to_csv(job_name + '.csv')

        command = get_job_command(job_name, script_path, arguments)
        subprocess.run(command,
                       shell=True,
                       stdout=subprocess.DEVNULL,
                       stderr=subprocess.DEVNULL)

        job_id = job_id + 1

    return len(files)
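Like the other submitters, submit_root_to_parquet returns the number of files it scheduled. A hypothetical call with placeholder names:

n_scheduled = submit_root_to_parquet('example_dataset', name_root=None, n_files=10)
print('Scheduled ' + str(n_scheduled) + ' merged ROOT files for conversion')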
Example 6
    clear = subprocess.Popen('rm -f bkg_*', shell=True)
    clear.wait()
    job_id = 0

    for run_list in tqdm(list(batch(runs, args.nfiles))):
        job_name = args.data_config + '_bkg_' + str(job_id)

        script = definitions.ROOT_DIR + '/io/create_bkg_sample.py'
        arguments = format_list_to_bash(
            run_list) + ' ' + args.data_config + ' --id ' + str(job_id)

        if args.yaml_file is not None:
            arguments += ' --yaml_file ' + args.yaml_file

        if args.submit_bkg:
            command = get_job_command(job_name, script, arguments)
            subprocess.run(command, shell=True)

        else:
            n_short = args.nsubprocess
            processes = list()
            sub_job_id = 0
            for short_run in batch(run_list, n_short):
                command = "python " + definitions.ROOT_DIR + '/io/create_bkg_sample.py '
                command = command + format_list_to_bash(
                    short_run) + ' ' + args.data_config + ' --id ' + str(
                        job_id) + '_' + str(sub_job_id)
                if args.yaml_file is not None:
                    command += ' --yaml_file ' + args.yaml_file
                processes.append(
                    subprocess.Popen(command, shell=True))
    print("The following periods/runs will be processed: ")
    print(runs)

    import subprocess

    job_id = 0
    print()
    print("Submitting jobs:")

    for run in tqdm(runs):
        job_name = 'm_pt_' + dataset_name + '_' + str(run)
        script_path = definitions.ROOT_DIR + '/io/merge_parquet_files.py'

        arguments_d = dataset_name + ' -s filtered -p dmeson -r ' + str(run)
        command_d = get_job_command(job_name + '_d', script_path, arguments_d)
        subprocess.run(command_d,
                       shell=True,
                       stdout=subprocess.DEVNULL,
                       stderr=subprocess.DEVNULL)

        args_ele_ev = dataset_name + ' -s raw -p electron event -r ' + str(run)
        command_ele_ev = get_job_command(job_name + '_e', script_path,
                                         args_ele_ev)
        subprocess.run(command_ele_ev,
                       shell=True,
                       stdout=subprocess.DEVNULL,
                       stderr=subprocess.DEVNULL)

        job_id = job_id + 1