def download_train_opt(train_name, run_number, local_folder, login, n_batches=100,
                       file_name='AnalysisResults.root', n_trains=0):
    files = find_files_from_train(run_number, train_name, file_name=file_name)
    files_downloaded = glob.glob(local_folder + '/*.root')
    local_folder_name = local_folder.split('/')[-1]

    print("Train name: " + train_name)
    print("Train run number: " + run_number)
    print('Number of files in this path: ' + str(len(files)))

    friendly_names = [get_friendly_file_name(x, train_name) for x in files]
    friendly_names_downloaded = [x.split('/')[-1].split('.root')[0] for x in files_downloaded]

    if len(files) < len(friendly_names_downloaded) and n_trains == 1:
        print('It is normal to have more downloaded files than found files in case you are downloading '
              'multiple trains to the same folder. IF YOU ARE NOT DOWNLOADING MULTIPLE TRAINS, YOU MIGHT '
              'BE SAVING TO A FOLDER WITH OTHER FILES.')

    # Download only the files that are not yet present in the local folder.
    files_to_download = [(files[x], friendly_names[x]) for x in range(len(files))
                         if friendly_names[x] not in friendly_names_downloaded]
    files = [x[0] for x in files_to_download]
    friendly_names = [x[1] for x in files_to_download]

    print('I will download: ' + str(len(files)))
    n_jobs = len(files) / n_batches
    print("Number of jobs that will be submitted (approx.): " + str(n_jobs))

    from dhfcorr.utils import batch
    current_job = 0
    for grid_file, local_file in zip(batch(files, n_batches), batch(friendly_names, n_batches)):
        local_file = [local_folder + '/' + x + '.root' for x in local_file]
        name_job = local_folder_name + '_d_' + str(run_number) + '_' + str(current_job)
        # Each job receives a CSV with the grid -> local file mapping.
        pd.DataFrame({'grid': grid_file, 'local': local_file}).to_csv(name_job + '.csv')
        submit_download_job(name_job, login, os.path.join(os.getcwd(), name_job + '.csv'))
        current_job = current_job + 1

    return len(files)
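
# A minimal usage sketch (not part of the original module): the train name,
# run number, target folder and login below are hypothetical placeholders.
def _example_download_train():
    n_files = download_train_opt(train_name='HF_TreeCreator', run_number='1234',
                                 local_folder='/data/my_train', login='alien_user',
                                 n_batches=100, n_trains=1)
    print('Scheduled ' + str(n_files) + ' files for download')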

def split_submit_job(files, script_path, job_name_pattern, n_files, additional_arguments='', test=True):
    print('Number of files to process: ' + str(len(files)))
    if test:  # In test mode, process only a few files in small batches.
        files = files[:10]
        n_files = 2

    job_id = 0
    for file_list in tqdm(batch(files, n_files), total=int(len(files) / n_files) + 1,
                          desc='Submitting jobs: '):
        job_name = job_name_pattern + str(job_id)
        pd.DataFrame({'file': file_list}).to_csv(job_name + '.csv')
        arguments = ' ' + job_name + '.csv ' + additional_arguments
        command = get_job_command(job_name, script_path, arguments)
        subprocess.run(command, shell=True)
        job_id = job_id + 1

    return len(files)
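
# Illustrative sketch only: driving split_submit_job with a hypothetical
# processing script. The paths and the job-name pattern are placeholders.
def _example_split_submit():
    root_files = glob.glob('/data/my_dataset/root/*.root')
    # test=True submits only the first 10 files in batches of 2, which is a
    # cheap way to validate the submission chain before a full run.
    split_submit_job(root_files, script_path='/path/to/process_file.py',
                     job_name_pattern='my_dataset_proc_', n_files=20, test=True)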

def submit_merge_root_files(dataset_name, maximum_size, continue_pre, number_of_runs):
    print('Dataset name = ' + dataset_name)
    folder_root_files = definitions.DATA_FOLDER + '/root/' + dataset_name
    folder_to_save = definitions.DATA_FOLDER + '/root_merged/' + dataset_name

    if not os.path.isdir(definitions.DATA_FOLDER + '/root_merged/'):
        os.mkdir(definitions.DATA_FOLDER + '/root_merged/')
    if not os.path.isdir(folder_to_save):
        os.mkdir(folder_to_save)
    elif not continue_pre:  # Delete the results of a previous merge
        delete_files = subprocess.Popen('rm -f ' + folder_to_save + '/*', shell=True)
        delete_files.wait()

    print('Folder with root files: ' + folder_root_files)
    print('Target file size (GB): ' + str(maximum_size))

    files = glob.glob(folder_root_files + "/*.root")
    periods = {get_run_number(x) for x in files}

    print('The total number of files found is: ' + str(len(files)))
    print()
    print('Saving them to: ' + folder_to_save)

    if continue_pre:
        print("The merge will not delete files from previous iterations. Only new periods will be reprocessed.")
        files_already_merged = glob.glob(folder_to_save + "/*.root")
        period_already_merged = {get_run_number(x) for x in files_already_merged}
        if len(period_already_merged) == 0:
            print("No previous merged files.")
        else:
            print("Found " + str(len(period_already_merged)) + " periods already merged")
            print(period_already_merged)
        periods = periods - period_already_merged

    periods = list(periods)
    if len(periods) == 0:
        print("No periods to merge")

    job_id = 0
    for period in tqdm(list(batch(periods, number_of_runs)), desc='Submitting jobs'):
        job_name = dataset_name + '_merge_' + str(job_id)
        arguments = str(dataset_name) + ' --target_sizeGB ' + str(maximum_size) + ' -r '
        for p in period:
            arguments = arguments + str(p) + ' '
        command = get_job_command(job_name, definitions.ROOT_DIR + '/io/merge_root_files.py', arguments)
        subprocess.run(command, shell=True, stdout=subprocess.DEVNULL, stderr=subprocess.DEVNULL)
        job_id += 1

    return len(periods)
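
# Hedged example: merge the ROOT files of a dataset into files of roughly
# 2 GB, five run periods per job, keeping the output of earlier merge
# iterations. The dataset name is a placeholder.
def _example_submit_merge():
    n_periods = submit_merge_root_files('my_dataset', maximum_size=2.0,
                                        continue_pre=True, number_of_runs=5)
    print('Submitted merge jobs covering ' + str(n_periods) + ' periods')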

def process_multicore(worker, list_of_data, n_threads, message=None, **kwargs):
    # Split the data into one block per process.
    data_blocks = list(batch(list_of_data, math.ceil(len(list_of_data) / n_threads)))

    queue = Queue()
    processes = list()
    for data_worker in data_blocks:
        # Pass the target and its arguments directly: lambdas are not
        # picklable, so they break when the 'spawn' start method is used.
        p = Process(target=process_multiple_data_in_one_run,
                    args=(queue, data_worker, worker), kwargs=kwargs)
        processes.append(p)
        p.start()

    # Drain the queue before joining to avoid deadlocks with large results.
    combined_results = list(itertools.chain.from_iterable(
        [queue.get() for _ in tqdm(range(len(data_blocks)), total=len(data_blocks), desc=message)]))

    for p in processes:
        p.join()

    return combined_results
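
# Minimal sketch of a caller. The worker signature assumed here (one data item
# plus keyword arguments, with results collected through the queue) follows
# from how process_multiple_data_in_one_run is invoked above; it is an
# assumption, not a guarantee of the original interface.
def _example_worker(item, scale=1.0):
    return item * scale


def _example_process_multicore():
    results = process_multicore(_example_worker, list(range(100)), n_threads=4,
                                message='Scaling', scale=2.0)
    print(str(len(results)) + ' results collected')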

def submit_root_to_parquet(dataset_name, name_root, n_files):
    if name_root is None:
        name_root = dataset_name

    print('Configuration (root file) = ' + name_root)
    print('Dataset name (in this system) = ' + dataset_name)

    folder_root_files = definitions.DATA_FOLDER + '/root_merged/' + dataset_name
    print('Folder with root files: ' + folder_root_files)

    # Convert only the files that were not processed in a previous run.
    files = reader.find_missing_processed_files(dataset_name, 'root_merged', 'raw', None, full_file_path=True)

    from dhfcorr.utils import batch
    job_id = 0
    print()
    for file_list in tqdm(batch(files, n_files), total=int(len(files) / n_files) + 1,
                          desc='Submitting jobs: '):
        job_name = dataset_name + '_conv_' + str(job_id)
        script_path = definitions.ROOT_DIR + '/io/convert_to_parquet.py'
        arguments = name_root + ' ' + dataset_name + ' ' + job_name + '.csv'
        pd.DataFrame({'file': file_list}).to_csv(job_name + '.csv')
        command = get_job_command(job_name, script_path, arguments)
        subprocess.run(command, shell=True, stdout=subprocess.DEVNULL, stderr=subprocess.DEVNULL)
        job_id = job_id + 1

    return len(files)
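
# Hedged usage sketch: convert the merged ROOT files of a (placeholder)
# dataset to parquet, five files per job. Passing name_root=None reuses the
# dataset name as the ROOT configuration name.
def _example_submit_root_to_parquet():
    n_found = submit_root_to_parquet('my_dataset', name_root=None, n_files=5)
    print('Submitted conversion jobs for ' + str(n_found) + ' files')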

dr.check_for_folder(dr.get_location_step(args.data_config, 'ml'))

if not args.skip_signal:
    prepare_signal(args.mc_config, d_cuts.values['model_building']['bins_pt'], 'dmeson')

from dhfcorr.utils import batch, format_list_to_bash

runs = dr.get_run_numbers(args.data_config)

print("Processing Background:")
# Remove leftover job files from previous submissions.
clear = subprocess.Popen('rm -f bkg_*', shell=True)
clear.wait()

job_id = 0
for run_list in tqdm(list(batch(runs, args.nfiles))):
    job_name = args.data_config + '_bkg_' + str(job_id)
    script = definitions.ROOT_DIR + '/io/create_bkg_sample.py'
    arguments = format_list_to_bash(run_list) + ' ' + args.data_config + ' --id ' + str(job_id)
    if args.yaml_file is not None:
        arguments += ' --yaml_file ' + args.yaml_file

    if args.submit_bkg:
        command = get_job_command(job_name, script, arguments)
        subprocess.run(command, shell=True)
    else:
        n_short = args.nsubprocess
parser.add_argument("--maximum_pt_filter", default=0., help='Maximum pT that pre_filter_bkg will be used') parser.add_argument("--yaml_file", default=None, help='YAML file with the configurations of the analysis. If None, ' 'uses the default configuration.') parser.add_argument("--nfiles", help='Number of files per job.', default=20) args = parser.parse_args() file_list = reader.get_file_list(args.config, args.particle, step='filtered') print('Skimming the files') total_file_size = (np.array([os.path.getsize(file) for file in file_list])).sum() / (1024 ** 3) print('Size before skimming: {:0.2f} GB'.format(total_file_size)) processing_folder = definitions.PROCESSING_FOLDER + args.config if not os.path.isdir(processing_folder): os.mkdir(processing_folder) if not os.path.isdir(processing_folder + '/skimmed'): os.mkdir(processing_folder + '/skimmed/') file_batches = list(batch(file_list, args.nfiles)) for files, job_id in tqdm(zip(file_batches, range(len(file_batches))), total=len(file_batches)): reduce_opt(files, args.config, args.yaml_file, job_id, args.particle, args.pre_filter_bkg, args.maximum_pt_filter) files_produced = glob.glob(definitions.PROCESSING_FOLDER + args.config + '/skimmed/*' + args.particle + '.parquet') size_after = (np.array([os.path.getsize(file) for file in files_produced])).sum() / (1024 ** 3) print('Size after skimming: {:0.2f} GB (reduction {:0.2f} times)'.format(size_after, total_file_size / size_after)) print('Processing done.')