Example #1
def download_train_opt(train_name,
                       run_number,
                       local_folder,
                       login,
                       n_batches=100,
                       file_name='AnalysisResults.root',
                       n_trains=0):

    files = find_files_from_train(run_number, train_name, file_name=file_name)
    files_downloaded = glob.glob(local_folder + '/*.root')
    local_folder_name = local_folder.split('/')[-1]

    print("Train name: " + train_name)
    print("Train run number: " + run_number)

    print('Number of files in this path: ' + str(len(files)))

    friendly_names = [get_friendly_file_name(x, train_name) for x in files]
    friendly_names_downloaded = [
        x.split('/')[-1].split('.root')[0] for x in files_downloaded
    ]

    if len(files) < len(friendly_names_downloaded) and n_trains == 1:
        print(
            'It is normal to have more downloaded files than found files if you are downloading multiple trains '
            'to the same folder. IF YOU ARE NOT DOWNLOADING MULTIPLE TRAINS, YOU MIGHT BE SAVING TO A FOLDER WITH '
            'OTHER FILES.')
    files_to_download = [(files[x], friendly_names[x])
                         for x in range(len(files))
                         if friendly_names[x] not in friendly_names_downloaded]

    files = [x[0] for x in files_to_download]
    friendly_names = [x[1] for x in files_to_download]

    print('I will download: ' + str(len(files)))
    n_jobs = len(files) / n_batches
    print("Number of jobs that will be submitted (approx.): " + str(n_jobs))

    from dhfcorr.utils import batch

    current_job = 0
    for grid_file, local_file in zip(batch(files, n_batches),
                                     batch(friendly_names, n_batches)):
        local_file = [local_folder + '/' + x + '.root' for x in local_file]
        name_job = local_folder_name + '_d_' + str(run_number) + '_' + str(current_job)
        pd.DataFrame({
            'grid': grid_file,
            'local': local_file
        }).to_csv(name_job + '.csv')
        submit_download_job(name_job, login,
                            os.path.join(os.getcwd(), name_job + '.csv'))
        current_job = current_job + 1

    return len(files)
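
All of these examples lean on batch from dhfcorr.utils to split a file list into fixed-size chunks, one chunk per submitted job. The helper itself is not reproduced on this page; below is a minimal sketch of such a chunking generator, assuming it simply yields successive slices of at most n items (the name mirrors the import above, the body is an assumption).

# Sketch only: a plausible shape for a chunking helper like dhfcorr.utils.batch,
# assuming it yields successive slices of at most n items.
def batch(sequence, n=1):
    for start in range(0, len(sequence), n):
        yield sequence[start:start + n]


# Example: chunks of 3 -> [0, 1, 2], [3, 4, 5], [6]
for chunk in batch(list(range(7)), 3):
    print(chunk)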
Example #2
def split_submit_job(files,
                     script_path,
                     job_name_pattern,
                     n_files,
                     additional_arguments='',
                     test=True):
    print(files)
    if test:
        files = files[:10]
        n_files = 2

    job_id = 0

    for file_list in tqdm(batch(files, n_files),
                          total=int(len(files) / n_files) + 1,
                          desc='Submitting jobs: '):
        job_name = job_name_pattern + str(job_id)
        pd.DataFrame({'file': file_list}).to_csv(job_name + '.csv')
        arguments = ' ' + job_name + '.csv ' + additional_arguments
        command = get_job_command(job_name, script_path, arguments)
        subprocess.run(command, shell=True)

        job_id = job_id + 1

    return len(files)
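
A hedged usage sketch for split_submit_job: the glob pattern, the worker script name and the job-name prefix below are placeholders, not taken from the original code. With test=True the function only keeps the first 10 files and submits them 2 per job.

# Hypothetical call; '/data/raw/*.root' and 'process_files.py' are placeholders.
import glob

files = sorted(glob.glob('/data/raw/*.root'))
n_submitted = split_submit_job(files,
                               script_path='process_files.py',
                               job_name_pattern='raw_proc_',
                               n_files=50,
                               test=True)  # test mode: first 10 files, 2 per job
print(str(n_submitted) + ' files queued')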
Example #3
def submit_merge_root_files(dataset_name, maximum_size, continue_pre, number_of_runs):
    print('Dataset name = ' + dataset_name)

    folder_root_files = definitions.DATA_FOLDER + '/root/' + dataset_name
    folder_to_save = definitions.DATA_FOLDER + '/root_merged/' + dataset_name

    if not os.path.isdir(definitions.DATA_FOLDER + '/root_merged/'):
        os.mkdir(definitions.DATA_FOLDER + '/root_merged/')

    if not os.path.isdir(folder_to_save):
        os.mkdir(folder_to_save)
    elif not continue_pre:  # Delete previous merge
        delete_files = subprocess.Popen('rm -f ' + folder_to_save + '/*', shell=True)
        delete_files.wait()

    print('Folder with root files: ' + folder_root_files)
    print('Target file size (GB): ' + str(maximum_size))

    files = glob.glob(folder_root_files + "/*.root")
    periods = {get_run_number(x) for x in files}

    print('The total number of files found is: ' + str(len(files)))
    print()
    print('Saving them to: ' + folder_to_save)

    if continue_pre:
        print("The merge will not delete files from previous iterations. Only new periods will be reprocessed.")
        files_already_merged = glob.glob(folder_to_save + "/*.root")

        period_already_merged = {get_run_number(x) for x in files_already_merged}
        if len(period_already_merged) == 0:
            print("No previous merged files.")
        else:
            print("Found " + str(len(period_already_merged)) + "periods already merged")
            print(period_already_merged)
        periods = periods - period_already_merged

    periods = list(periods)
    if len(periods) == 0:
        print("No periods to merge")

    job_id = 0

    for period in tqdm(list(batch(periods, number_of_runs)), desc='Submitting jobs'):
        job_name = dataset_name + '_merge_' + str(job_id)
        arguments = str(dataset_name) + ' --target_sizeGB ' + str(maximum_size) + ' -r '
        for p in period:
            arguments = arguments + str(p) + ' '
        command = get_job_command(job_name, definitions.ROOT_DIR + '/io/merge_root_files.py', arguments)
        subprocess.run(command, shell=True, stdout=subprocess.DEVNULL, stderr=subprocess.DEVNULL)
        job_id += 1

    return len(periods)
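
The -r flag assembled in the loop above hands a variable number of period/run identifiers to io/merge_root_files.py, which is not shown on this page. The following is a minimal argparse sketch that would accept such a command line; only the flag names come from the call above, while the dests, types and defaults are assumptions.

# Assumed interface: parses "<dataset> --target_sizeGB <size> -r <run1> <run2> ..."
import argparse

parser = argparse.ArgumentParser()
parser.add_argument('dataset_name')
parser.add_argument('--target_sizeGB', type=float, default=2.0)
parser.add_argument('-r', '--runs', nargs='+', default=[])

# Parsing the kind of command line built above (values are made up):
args = parser.parse_args(['my_dataset', '--target_sizeGB', '5', '-r', '265309', '265332'])
print(args.dataset_name, args.target_sizeGB, args.runs)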
def process_multicore(worker, list_of_data, n_threads, message=None, **kwargs):
    data_blocks = list(batch(list_of_data, math.ceil(len(list_of_data) / n_threads)))
    queue = Queue()
    processes = list()

    for data_worker in data_blocks:
        p = Process(target=lambda x: process_multiple_data_in_one_run(queue, x, worker, **kwargs),
                    args=(data_worker,))
        processes.append(p)
        p.start()

    combined_results = list(itertools.chain.from_iterable([queue.get() for _ in tqdm(range(len(data_blocks)),
                                                                                     total=len(data_blocks),
                                                                                     desc=message)]))
    for p in processes:
        p.join()
    return combined_results
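
process_multicore delegates to process_multiple_data_in_one_run, which is not shown. The underlying pattern is a fan-out over multiprocessing.Process workers that put their partial results on a shared Queue, which is drained before the processes are joined. Below is a self-contained toy version of that pattern; the squaring worker and the data are made up for illustration.

# Toy illustration of the Queue/Process fan-out used above; the worker is hypothetical.
import itertools
from multiprocessing import Process, Queue


def toy_worker(queue, data_block):
    # Each process handles one block and puts a list of results on the queue.
    queue.put([x * x for x in data_block])


if __name__ == '__main__':
    blocks = [[1, 2, 3], [4, 5, 6], [7, 8]]
    queue = Queue()
    processes = [Process(target=toy_worker, args=(queue, block)) for block in blocks]
    for p in processes:
        p.start()
    # Drain the queue before joining, so children are never blocked on a full pipe.
    results = list(itertools.chain.from_iterable(queue.get() for _ in blocks))
    for p in processes:
        p.join()
    print(results)

Note that the original passes a lambda as the Process target, which works with the default fork start method on Linux but would not pickle under the spawn start method.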
def submit_root_to_parquet(dataset_name, name_root, n_files):
    if name_root is None:
        name_root = dataset_name

    print('Configuration (root file) = ' + name_root)
    print('Dataset name (in this system) = ' + dataset_name)

    folder_root_files = definitions.DATA_FOLDER + '/root_merged/' + dataset_name
    print('Folder with root files: ' + folder_root_files)
    files = reader.find_missing_processed_files(dataset_name,
                                                'root_merged',
                                                'raw',
                                                None,
                                                full_file_path=True)

    from dhfcorr.utils import batch

    job_id = 0

    print()

    for file_list in tqdm(batch(files, n_files),
                          total=int(len(files) / n_files) + 1,
                          desc='Submitting jobs: '):
        job_name = dataset_name + '_conv_' + str(job_id)

        script_path = definitions.ROOT_DIR + '/io/convert_to_parquet.py'
        arguments = name_root + ' ' + dataset_name + ' ' + job_name + '.csv'

        pd.DataFrame({'file': file_list}).to_csv(job_name + '.csv')

        command = get_job_command(job_name, script_path, arguments)
        subprocess.run(command,
                       shell=True,
                       stdout=subprocess.DEVNULL,
                       stderr=subprocess.DEVNULL)

        job_id = job_id + 1

    return len(files)
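
Each job above receives a CSV manifest written with pandas. The consuming script (io/convert_to_parquet.py) is not reproduced here; a plausible way for the worker side to read the manifest back, assuming only the 'file' column used above (the manifest name is a placeholder following the job_name + '.csv' pattern):

# Assumed reader side for a manifest written with pd.DataFrame({'file': ...}).to_csv(...)
import pandas as pd

manifest = 'my_dataset_conv_0.csv'  # placeholder name
file_list = pd.read_csv(manifest)['file'].tolist()
for path in file_list:
    print('would process: ' + path)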
Example #6
    dr.check_for_folder(dr.get_location_step(args.data_config, 'ml'))

    if not args.skip_signal:
        prepare_signal(args.mc_config,
                       d_cuts.values['model_building']['bins_pt'], 'dmeson')

    from dhfcorr.utils import batch, format_list_to_bash

    runs = dr.get_run_numbers(args.data_config)

    print("Processing Background:")
    clear = subprocess.Popen('rm -f bkg_*', shell=True)
    clear.wait()
    job_id = 0

    for run_list in tqdm(list(batch(runs, args.nfiles))):
        job_name = args.data_config + '_bkg_' + str(job_id)

        script = definitions.ROOT_DIR + '/io/create_bkg_sample.py'
        arguments = format_list_to_bash(
            run_list) + ' ' + args.data_config + ' --id ' + str(job_id)

        if args.yaml_file is not None:
            arguments += ' --yaml_file ' + args.yaml_file

        if args.submit_bkg:
            command = get_job_command(job_name, script, arguments)
            subprocess.run(command, shell=True)

        else:
            n_short = args.nsubprocess
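
The snippet above formats the run list for the shell with format_list_to_bash, which is not reproduced on this page. A minimal sketch of what such a helper presumably does, namely a space-separated join; this is an assumption, not the repository's implementation.

# Assumption: format_list_to_bash likely space-joins the entries for a command line.
def format_list_to_bash(values):
    return ' '.join(str(v) for v in values)


print(format_list_to_bash([265309, 265332, 265334]))  # -> 265309 265332 265334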
Example #7
    parser.add_argument("--maximum_pt_filter", default=0., help='Maximum pT that pre_filter_bkg will  be used')
    parser.add_argument("--yaml_file", default=None, help='YAML file with the configurations of the analysis. If None, '
                                                          'uses the default configuration.')
    parser.add_argument("--nfiles", help='Number of files per job.', default=20)

    args = parser.parse_args()

    file_list = reader.get_file_list(args.config, args.particle, step='filtered')
    print('Skimming the files')

    total_file_size = (np.array([os.path.getsize(file) for file in file_list])).sum() / (1024 ** 3)
    print('Size before skimming: {:0.2f} GB'.format(total_file_size))

    processing_folder = definitions.PROCESSING_FOLDER + args.config
    if not os.path.isdir(processing_folder):
        os.mkdir(processing_folder)
    if not os.path.isdir(processing_folder + '/skimmed'):
        os.mkdir(processing_folder + '/skimmed/')

    file_batches = list(batch(file_list, args.nfiles))
    for job_id, files in enumerate(tqdm(file_batches, total=len(file_batches))):
        reduce_opt(files, args.config, args.yaml_file, job_id, args.particle, args.pre_filter_bkg,
                   args.maximum_pt_filter)

    files_produced = glob.glob(definitions.PROCESSING_FOLDER + args.config + '/skimmed/*' + args.particle + '.parquet')
    size_after = (np.array([os.path.getsize(file) for file in files_produced])).sum() / (1024 ** 3)

    print('Size after skimming: {:0.2f} GB (reduction {:0.2f} times)'.format(size_after, total_file_size / size_after))

    print('Processing done.')