def auto_select_extension(multiple_selection, multiple_selection_from_file, multiple_selection_filename):
    for search_string in multiple_selection:
        info = multiple_selection[search_string]
        if len(info['paths']) != 2:
            continue
        if len(info['paths'][0]) > len(info['paths'][1]):
            shorter_path = info['paths'][1]
            longer_path = info['paths'][0]
        else:
            shorter_path = info['paths'][0]
            longer_path = info['paths'][1]
        add_string = ''
        remove_string = ''
        for index, key in enumerate(difflib.ndiff(shorter_path, longer_path)):
            if key[0] == ' ':
                continue
            elif key[0] == '-':
                #print(u'Delete "{}" from position {}'.format(key[-1], index))
                remove_string += key[-1]
            elif key[0] == '+':
                #print(u'Add "{}" to position {}'.format(key[-1], index))
                add_string += key[-1]
        # Found an extension: the longer path is the shorter one plus '_ext1'.
        if remove_string == '' and add_string == '_ext1':
            if search_string in multiple_selection_from_file:
                print('[Info] ' + multiple_selection_filename + ' already has ' + search_string)
                continue
            common_name = get_common_name(info['paths'])
            print('Auto-selecting below paths with COMMON_PATH: ' + common_name)
            print('  ' + shorter_path.replace(common_name, '|COMMON_PATH|'))
            print('  ' + longer_path.replace(common_name, '|COMMON_PATH|'))
            #multiple_selection[search_string] = {'paths': info['paths'], 'selected_paths': info['paths'], 'reason': 'Auto-selected for extension file.'}
            update_selection(search_string, multiple_selection[search_string]['paths'], [0, 1],
                             'Auto-selected for extension file.', multiple_selection_from_file)
            nested_dict.save_json_file(multiple_selection_from_file, multiple_selection_filename)
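# get_common_name() and update_selection() are defined elsewhere in this
# project. Below is a minimal sketch of plausible implementations, based only
# on how they are called above (assumed behavior, not the actual code):
def get_common_name(paths):
    # Assumed: the piece shared by both paths, e.g. their longest common prefix.
    return os.path.commonprefix(paths)

def update_selection(search_string, paths, selected_indices, reason, multiple_selection_from_file):
    # Assumed: record which paths were selected and why, mirroring the
    # multiple_selection[search_string] layout noted elsewhere in this script.
    multiple_selection_from_file[search_string] = {
        'paths': paths,
        'selected_paths': [paths[index] for index in selected_indices],
        'reason': reason,
    }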
def make_dataset_files_jsons(path_datasets_filename, out_dataset_files_info_filename):
    list_dataset = dataset_files.get_list_dataset(path_datasets_filename)
    # Get files for each dataset
    # dataset_file_commands = [[dataset, commands]]
    dataset_file_commands = dataset_files.make_dataset_file_commands(list_dataset)
    dataset_file_results = dataset_files.run_list_command(dataset_file_commands)
    # dataset_files_info[dataset][filename] = {'number_events': number_events}
    dataset_files_info = dataset_files.parse_dataset_file_results(dataset_file_results)
    # Get meta for each file
    dataset_meta_commands = dataset_files.make_dataset_meta_commands(dataset_files_info)
    dataset_meta_results = dataset_files.run_list_command(dataset_meta_commands)
    dataset_files.parse_dataset_meta_results(dataset_meta_results, dataset_files_info)
    nested_dict.save_json_file(dataset_files_info, out_dataset_files_info_filename)
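# Example usage (hypothetical filenames, for illustration only):
#   make_dataset_files_jsons('jsons/path_datasets.json', 'jsons/dataset_files_info.json')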
def update_datasets_files_json(path_datasets_filename, in_dataset_files_info_filename, out_dataset_files_info_filename):
    list_dataset = dataset_files.get_list_dataset(path_datasets_filename)
    # dataset_files_info[dataset][filename] = {'number_events': number_events}
    in_dataset_files_info = nested_dict.load_json_file(in_dataset_files_info_filename)
    in_list_dataset = in_dataset_files_info.keys()
    append_list_dataset = list(set(list_dataset) - set(in_list_dataset))
    remove_list_dataset = list(set(in_list_dataset) - set(list_dataset))
    # Get files for each dataset
    # dataset_file_commands = [[dataset, commands]]
    append_dataset_file_commands = dataset_files.make_dataset_file_commands(append_list_dataset)
    append_dataset_file_results = dataset_files.run_list_command(append_dataset_file_commands)
    append_dataset_files_info = dataset_files.parse_dataset_file_results(append_dataset_file_results)
    # Get meta for each file
    append_dataset_meta_commands = dataset_files.make_dataset_meta_commands(append_dataset_files_info)
    append_dataset_meta_results = dataset_files.run_list_command(append_dataset_meta_commands)
    dataset_files.parse_dataset_meta_results(append_dataset_meta_results, append_dataset_files_info)
    remove_dataset_files_info(in_dataset_files_info, remove_list_dataset)
    out_dataset_files_info = combine_dataset_files_info(in_dataset_files_info, append_dataset_files_info)
    print('appended list_dataset: ' + str(append_list_dataset))
    print('removed list_dataset: ' + str(remove_list_dataset))
    nested_dict.save_json_file(out_dataset_files_info, out_dataset_files_info_filename)
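# remove_dataset_files_info() and combine_dataset_files_info() are used above
# but defined elsewhere. A minimal sketch, assuming dataset_files_info is a
# plain dict keyed by dataset (assumed behavior, not the actual code):
def remove_dataset_files_info(dataset_files_info, remove_list_dataset):
    # Drop datasets that are no longer in the path_datasets list.
    for dataset in remove_list_dataset:
        dataset_files_info.pop(dataset, None)

def combine_dataset_files_info(in_dataset_files_info, append_dataset_files_info):
    # Merge newly queried datasets into the existing info.
    out_dataset_files_info = dict(in_dataset_files_info)
    out_dataset_files_info.update(append_dataset_files_info)
    return out_dataset_files_info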
mc_datasets = nested_dict.load_json_file(mc_datasets_filename)
datasets.check_false_none_mc_datasets(mc_datasets)
# Make metadata
path_to_keys_mc_datasets = datasets.get_path_to_keys_mc_datasets(mc_datasets)
search_string_to_keys_mc_datasets = datasets.get_search_string_to_keys_mc_datasets(mc_tag_meta, mc_datasets)
same_parent_paths = datasets.get_same_parent_paths(mc_datasets)
multiple_mc_datasets = datasets.get_multiple_mc_datasets(mc_datasets)
mini_to_nanos_from_nanoaod = datasets.get_mini_to_nanos_from_nanoaod_mc_datasets(mc_datasets)
nano_to_mini_from_miniaod = datasets.get_nano_to_mini_from_miniaod_mc_datasets(mc_datasets)

if make_data_datasets:
    data_datasets = nested_dict.load_json_file(data_datasets_filename)
    datasets.check_false_none_data_datasets(data_datasets)
    datasets.print_multiple_data_datasets(data_datasets)
    nested_dict.save_json_file(data_datasets, selected_data_datasets_filename)

if make_mc_datasets:
    # multiple_selection[search_string] = {'paths': [paths], 'selected_paths': [path_selection], 'reason': reason}
    # Load previous multiple_selection from file
    multiple_selection_from_file = {}
    if os.path.exists(multiple_selection_filename):
        multiple_selection_from_file = nested_dict.load_json_file(multiple_selection_filename)
    # Build multiple_selection from the mc_datasets json
    multiple_selection = get_multiple_selection(multiple_mc_datasets, data_tiers)
    # Automatically select extension (_ext) datasets.
    auto_select_extension(multiple_selection, multiple_selection_from_file, multiple_selection_filename)
    # Show previous selections
    print('--------')
initialize_arguments(args)
valid, log = are_arguments_valid(args)
if not valid:
    print('[Error] ' + log)
    sys.exit()

queue = ucsb_queue.ucsb_queue()
#jobs_info_filename = 'jsons/submitted_test_mc_jobs_info.json'
#output_json = 'jsons/checked_test_mc_jobs_info.json'
#jobscript_check_filename = './copy_aods_check_entries.py'
#statuses = ['submitted']
jobs_info_filename = args['jobs_info_filename']
output_json = args['output_json']
jobscript_check_filename = args['jobscript_check_filename']
statuses = args['statuses']

# Check the jobs.
# jobs_info = [{'command_script': command_script, 'other_global_key': other_global_key,
#               'ignore_keys': ['job_id', 'job_status', ...]},
#              {'key_for_job': key_for_job}, {'key_for_job': key_for_job}, ...]
jobs_info = nested_dict.load_json_file(jobs_info_filename)
# Each job type should provide a job_script and a job_check_script.
# Running './job_check_script job_log_string' should return 'success' or 'fail' for the job_log_string.
# statuses: [status], where status = 'submitted', 'done', 'fail', 'success', 'to_submit'
queue.check_jobs(jobs_info, statuses, jobscript_check_filename, args['debug'])
#queue.check_jobs(jobs_info, ['submitted', 'done', 'fail', 'success', 'to_submit'], jobscript_check_filename)
queue.print_jobs_status(jobs_info)
nested_dict.save_json_file(jobs_info, output_json)
bad_ps_weights_mc_datasets = get_unrejected_if_possible_mc_datasets(
    filtered_mc_datasets, reject_string_ignore_case_mc_datasets, 'PSweights')
print('Using ps_weights for below, because no other datasets')
datasets.print_path_mc_datasets(bad_ps_weights_mc_datasets)

pu_filtered_mc_datasets = filter_if_possible_mc_datasets(filtered_mc_datasets, reject_bad_pu_2017_mc_datasets)
#datasets.print_path_mc_datasets(pu_filtered_mc_datasets)
#datasets.print_multiple_mc_datasets(pu_filtered_mc_datasets)
#datasets.print_incomplete_parent_mc_datasets(pu_filtered_mc_datasets)
datasets.print_missing_mc_datasets(keys_mc_datasets, pu_filtered_mc_datasets)

bad_pu_mc_datasets = get_unrejected_if_possible_mc_datasets(filtered_mc_datasets, reject_bad_pu_2017_mc_datasets)
#datasets.print_path_parent_mc_datasets(bad_pu_mc_datasets)
print('Using bad pileup for below, because no other datasets')
datasets.print_path_mc_datasets(bad_pu_mc_datasets)

nested_dict.save_json_file(pu_filtered_mc_datasets, out_filtered_mc_datasets_filename)
nested_dict.save_json_file(bad_pu_mc_datasets, out_bad_pu_mc_datasets_filename)
nested_dict.save_json_file(bad_ps_weights_mc_datasets, out_ps_weight_mc_datasets_filename)

if make_data_datasets:
    # data_datasets[stream][year][run_group][data_tier][path] = {"parent_chain": [], "children": [],
    #     "creation time": string, "size": int, "files": int, "events": int, "runs": []}
    data_datasets = nested_dict.load_json_file(data_datasets_filename)
    datasets.check_overlapping_paths_data_datasets(data_datasets)
    #print_multiple_data_datasets(data_datasets)
    # keys_data_datasets = [[stream, year, run_group, data_tier, search_string]]
    keys_data_datasets = datasets.get_keys_data_datasets(data_tag_meta, data_tiers)
    #nested_dict.remove_key_nested_dict(data_datasets, '/SingleElectron/Run2017C-31Mar2018-v1/MINIAOD')
    datasets.print_missing_data_datasets(keys_data_datasets, data_datasets)
    filtered_data_datasets = filter_data_datasets(
        data_datasets, reject_string_ignore_case_path_parent_data_datasets, 'pilot')
args = vars(parser.parse_args())
argparse_helper.initialize_arguments(args, list_args=['mc_data'])
valid, log = are_arguments_valid(args)
if not valid:
    print('[Error] ' + log)
    sys.exit()

do_mc = 'mc' in args['mc_data']
do_data = 'data' in args['mc_data']

base_folder = args['base_folder']
mc_disk_files_filename = os.path.join(
    args['out_jsons_folder'], args['out_jsons_prefix'] + 'mc_disk_files.json')
data_disk_files_filename = os.path.join(
    args['out_jsons_folder'], args['out_jsons_prefix'] + 'data_disk_files.json')

if do_mc:
    # mc_disk_files[data_tier][aod_tag][year][mc_dir][filename] = {'file_events': int}
    mc_disk_files = make_mc_disk_files(base_folder)
    nested_dict.save_json_file(mc_disk_files, mc_disk_files_filename)
if do_data:
    # data_disk_files[data_tier][aod_tag][year][data_dir][filename] = {'file_events': int}
    data_disk_files = make_data_disk_files(base_folder)
    nested_dict.save_json_file(data_disk_files, data_disk_files_filename)
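# Hypothetical invocation (script name and flag spellings assumed; the
# argument names match the args keys used above):
#   ./make_disk_files_jsons.py mc data --base_folder /net/cms/data --out_jsons_folder jsons --out_jsons_prefix test_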
        if len(key_value_split) < 2:
            print('[Warning] Cannot add below line to global_key.')
            print('  ' + line)
            continue
        key = key_value_split[0].strip()
        value = ':'.join(key_value_split[1:]).strip()
        json_value = None
        try:
            json_value = json.loads(value)
            nested_dict.convert_to_ascii(json_value)
            value = json_value
        except ValueError:
            pass
        if key in jobs_info[0]:
            if value != jobs_info[0][key]:
                print('[Warning] global_key: ' + key + ' is different. '
                      + str(jobs_info[0][key]) + ' ' + str(value))
        jobs_info[0][key] = value
        if json_value:
            print('Setting jobs_info[0][' + key + '] to ' + repr(value) + ' as a json')
        else:
            print('Setting jobs_info[0][' + key + '] to ' + str(value) + ' as a string')
    # Parse commands
    elif '#' != line[0]:
        jobs_info.append({'command': line.rstrip()})

nested_dict.save_json_file(jobs_info, args['jobs_info_filename'])
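# For illustration, a hypothetical input file this parser would accept:
# 'key: value' lines set global keys on jobs_info[0] (values that parse as
# JSON are stored as JSON, otherwise as strings), '#' lines are comments, and
# all other lines become per-job commands:
#
#   command_script: ./run_job.py
#   ignore_keys: ["job_id", "job_status"]
#   # this is a comment
#   ./run_job.py --input file_1.root
#   ./run_job.py --input file_2.root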
    [mc_dataset_common_names_filename, ['2016', '2017', '2018']],
    [mc_dataset_2016_names_filename, ['2016']],
    [mc_dataset_2017_names_filename, ['2017']],
    [mc_dataset_2018_names_filename, ['2018']],
])
#print(mc_dataset_names)
#print('dataset_names:', mc_dataset_names)

# Ex) mc_tag_meta[2016] = RunIISummer16, MiniAODv3, NanoAODv5
mc_tag_meta = datasets.parse_mc_tag_meta(mc_tag_meta_filename)
if make_data_datasets:
    # Ex) data_tag_meta[2016][B][MET][miniaod] = 17Jul2018
    data_tag_meta = datasets.parse_data_tag_meta(data_tag_meta_filename)

if make_mc_datasets:
    mc_datasets = nested_dict.load_json_file(mc_datasets_filename)
    mc_datasets_update = datasets.update_mc_datasets(
        mc_dataset_names, mc_tag_meta, data_tiers, mc_datasets)
    #datasets.print_path_mc_datasets(mc_datasets)
    #datasets.print_path_mc_datasets(mc_datasets_update)
    nested_dict.save_json_file(mc_datasets_update, out_update_mc_datasets_filename)
if make_data_datasets:
    data_datasets = nested_dict.load_json_file(data_datasets_filename)
    data_datasets_update = datasets.update_data_datasets(
        data_tag_meta, data_tiers, data_datasets)
    nested_dict.save_json_file(data_datasets_update, out_update_data_datasets_filename)
parser.add_argument('jobs_info_filename', metavar='jobs_info.json')
parser.add_argument('-o', '--output_json', metavar='submitted_jobs_info.json', nargs=1)
parser.add_argument('-n', '--node', metavar='"node_name"', nargs=1)
args = vars(parser.parse_args())
initialize_arguments(args)
valid, log = are_arguments_valid(args)
if not valid:
    print('[Error] ' + log)
    sys.exit()

#jobs_info_filename = 'jsons/test_mc_jobs_info.json'
#output_json = 'jsons/submitted_test_mc_jobs_info.json'
#node = 'cms1'
jobs_info_filename = args['jobs_info_filename']
output_json = args['output_json']
node = args['node']

# jobs_info = [{'command_script': command_script, 'other_global_key': other_global_key},
#              {'key_for_job': key_for_job}, {'key_for_job': key_for_job}, ...]
jobs_info = nested_dict.load_json_file(jobs_info_filename)
queue = ucsb_queue.ucsb_queue()
# statuses: [status], where status = 'submitted', 'done', 'fail', 'success', 'to_submit'
node, number_combined_commands, print_or_run = queue.submit_jobs_info(jobs_info, node=node)
if print_or_run == 'r':
    nested_dict.save_json_file(jobs_info, output_json)
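# Hypothetical invocation (script name assumed; the filenames and node come
# from the commented-out defaults above):
#   ./submit_jobs.py jsons/test_mc_jobs_info.json -o jsons/submitted_test_mc_jobs_info.json -n cms1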