def test_recombine_eventWise():
    """Check recombine_eventWise on an unsplit file and on an incomplete fragment set."""
    with TempTestDir("tst") as dir_name:
        # build a small eventWise on disk holding only an Event_n column
        ew_path = os.path.join(dir_name, "test.parquet")
        ew = Components.EventWise(ew_path)
        n_events = 12
        ew.append(Event_n=ak.from_iter(np.arange(n_events)))

        def count_preserved(recombined):
            # number of original events that are present in the recombined eventWise
            return sum([event in recombined.Event_n for event in ew.Event_n])

        # an eventWise that was never split should come back unchanged
        found = ParallelFormJets.recombine_eventWise(ew_path)
        assert len(found.columns) == 1
        tst.assert_allclose(found.Event_n, ew.Event_n)

        # split into three fragments, then delete the last path returned
        paths = ew.fragment("Event_n", n_fragments=3)
        fragment_dir = os.path.dirname(paths[0])
        os.remove(paths[2])
        partial = ParallelFormJets.recombine_eventWise(ew_path)
        # which events went missing is not guaranteed, only how many survive
        assert count_preserved(partial) == 8

        # the fragment directory should now hold a joined component
        assert next(name for name in os.listdir(fragment_dir)
                    if "joined.parquet" in name)

        # a second recombination must not be confused by that joined component
        partial = ParallelFormJets.recombine_eventWise(ew_path)
        assert count_preserved(partial) == 8
def fake_worker(eventWise_path, run_condition, function, jet_class, jet_name,
                cluster_parameters, batch_size):
    """Stand-in for the real worker that records the arguments it was given.

    Loads the eventWise at ``eventWise_path``, appends a fixed ``Catto``
    column and stores the call arguments as hyperparameters so that a test
    can read them back, then marks ``function`` as finished for that path.
    ``jet_name`` is accepted but not used.
    """
    recorded = {
        "Run_condition": run_condition,
        "Jet_class": jet_class.__name__,
        "Cluster_parameters": cluster_parameters,
        "Batch_size": batch_size,
    }
    ew = Components.EventWise.from_file(eventWise_path)
    ew.append(Catto=[2, 3])
    ew.append_hyperparameters(**recorded)
    ParallelFormJets.mark_finished(eventWise_path, function.__name__)
def end_sequence(eventWise_path, end_time, dijet_mass=None, jet_pt_cut=None):
    """Run the post-clustering stages: fix, score, then correct masses.

    Parameters
    ----------
    eventWise_path : str
        Path handed to every stage.
    end_time : float
        Compared against ``time.time()``; once passed, the remaining
        stages are skipped.
    dijet_mass : optional
        Forwarded to ``ParallelFormJets.run_score``.
    jet_pt_cut : optional
        Forwarded to ``ParallelFormJets.run_correct_masses``; that stage
        is skipped when this is None.
    """
    run_fix(eventWise_path, end_time)

    # score clusterings
    ParallelFormJets.run_score(eventWise_path, end_time, dijet_mass)
    out_of_time = time.time() > end_time
    if out_of_time:
        return None

    # calculate mass peaks
    if jet_pt_cut is not None:
        ParallelFormJets.run_correct_masses(eventWise_path, end_time,
                                            jet_pt_cut)
        # deadline check after the last stage; nothing follows it at present
        if time.time() > end_time:
            return None
    return None
def test_name_generator():
    """name_generator should yield unused jet names for the requested class."""
    jet_class = "Horse"

    # with no existing jets the numbering starts from 1
    fresh = ParallelFormJets.name_generator(jet_class, [])
    for expected in ("Horse1Jet", "Horse2Jet"):
        assert next(fresh) == expected

    # names already taken by this class are skipped; other classes are ignored
    existing_jets = ["Horse1Jet", "Horse3Jet", "Cat2Jet", "Cat0Jet"]
    around_existing = ParallelFormJets.name_generator(jet_class, existing_jets)
    for expected in ("Horse2Jet", "Horse4Jet"):
        assert next(around_existing) == expected
def cluster_sequence(file_name, names, classes, params, jet_pt_cut, dijet_mass,
                     pileup_file=None, mean_pileup_per_event=50,
                     z_cuts=(0.2, 0.4)):
    """Cluster one eventWise with several jet definitions, then filter and score.

    Parameters
    ----------
    file_name : str
        Path of the eventWise to load.
    names, classes, params : sequences
        Iterated in lockstep; each triple is one clustering passed to
        ``FormJets.cluster_multiapply`` as (jet name, jet class, parameters).
    jet_pt_cut :
        Forwarded to ``ParallelFormJets.batch_correct_masses``.
    dijet_mass :
        Forwarded to ``CompareClusters.append_scores``. If its string form
        contains ``'top'`` the semileptonic scoring functions are used
        instead.
    pileup_file : str, optional
        When given, pileup from this file is added to the events and the
        jet inputs are built with the "remove pileup" filter.
    mean_pileup_per_event : optional
        Forwarded to ``AddPileup.add_pileup``.
    z_cuts : sequence, optional
        Forwarded (as a fresh list) to ``ParallelFormJets.batch_masks``;
        a falsy value skips the masking and filtering stage.

    Returns
    -------
    The eventWise object after all stages have run.
    """
    if pileup_file is not None:
        ew = NonwritingEventWise.from_file(file_name)
        print("Adding pileup")
        ew = remove_excess_data(ew)
        pileup = NonwritingEventWise.from_file(pileup_file)
        ew = AddPileup.add_pileup(ew, pileup, mean_pileup_per_event)
        filter_functions = "remove pileup"
    else:
        ew = Components.EventWise.from_file(file_name)
        filter_functions = "ignore pileup"

    print("Adding jet inputs")
    FormJetInputs.create_jetInputs(ew, filter_functions=filter_functions)

    print("Adding jets")
    for name, cla, param in zip(names, classes, params):
        # cluster_multiapply reports whether it is done; repeat until it is
        finished = False
        while not finished:
            finished = FormJets.cluster_multiapply(ew, cla, param, name,
                                                   np.inf)

    n_jets = len(names) * 20
    if z_cuts:
        print("Doing z_cuts")
        # pass a copy so the callee can never alter the shared default
        ParallelFormJets.batch_masks(ew, list(z_cuts), n_jets)
        ParallelFormJets.batch_filter(ew, n_jets)

    print("Calculating scores")
    if 'top' in str(dijet_mass):
        ParallelFormJets.batch_semileptonic(ew, n_jets)
        ParallelFormJets.batch_correct_semileptonic_masses(ew, n_jets)
    else:
        CompareClusters.append_scores(ew, dijet_mass=dijet_mass,
                                      end_time=np.inf)
        ParallelFormJets.batch_correct_masses(ew, jet_pt_cut, n_jets)
    return ew
def make_minimal(ew, minimal_name, with_jet_content=True):
    """Build a reduced eventWise holding only the wanted columns of ``ew``.

    Parameters
    ----------
    ew :
        Source eventWise; its ``columns``, ``hyperparameter_columns`` and
        ``dir_name`` attributes are read.
    minimal_name : str
        Name for the new eventWise. If it contains a path separator it is
        used as the full path, otherwise it is placed in ``ew.dir_name``.
    with_jet_content : bool
        If True, the output of ``get_minimal_jet_content(ew)`` is added
        as extra columns.

    Returns
    -------
    The new ``Components.EventWise``.
    """
    hyperparameter_columns = ew.hyperparameter_columns

    # keep only the columns the project marks as wanted
    new_columns = []
    for name in ew.columns:
        if ParallelFormJets.check_wanted_column(name):
            new_columns.append(name)

    new_contents = {}
    for name in new_columns + hyperparameter_columns:
        new_contents[name] = getattr(ew, name)

    if with_jet_content:
        jet_content = get_minimal_jet_content(ew)
        new_contents.update(jet_content)
        new_columns.extend(list(jet_content.keys()))

    has_directory = os.sep in minimal_name
    path = minimal_name if has_directory else os.path.join(ew.dir_name,
                                                           minimal_name)
    return Components.EventWise(path, new_columns, new_contents,
                                hyperparameter_columns)
def test_remove_partial():
    """remove_partial should drop a jet whose column is shorter than the event count."""
    paths = []
    with TempTestDir("tst") as dir_name:
        # an empty list of paths must be accepted
        ParallelFormJets.remove_partial([])

        # build an eventWise with one complete jet and one incomplete jet
        ew_path1 = os.path.join(dir_name, "test1.parquet")
        ew = Components.EventWise(ew_path1)
        paths.append(ew_path1)
        n_events = 12
        n_unfinished = 6
        unfinished_jet = 'CatJet'
        finished_jet = 'CatJetJet'
        params = {
            'Event_n': ak.from_iter(np.arange(n_events)),
            'JetInputs_Label': ak.from_iter(np.arange(n_events)),
            unfinished_jet + '_Label': ak.from_iter(
                np.random.rand(n_events - n_unfinished)),
            finished_jet + '_Label': ak.from_iter(np.random.rand(n_events)),
        }
        ew.append(**params)

        # the wrong total length should yield an error
        with pytest.raises(AssertionError):
            ParallelFormJets.remove_partial(paths, n_events + 1)

        # the first pass removes the incomplete jet,
        # the second pass should leave everything as it was
        kept_columns = ["Event_n", finished_jet + "_Label", "JetInputs_Label"]
        for _ in range(2):
            ParallelFormJets.remove_partial(paths, n_events)
            ew = Components.EventWise.from_file(paths[0])
            for name in kept_columns:
                tst.assert_allclose(params[name], getattr(ew, name))
            assert unfinished_jet + "_Label" not in ew.columns
def end_sequence(eventWise_path, end_time, z_cuts=None, dijet_mass=None,
                 jet_pt_cut=None):
    """
    Run the stages that follow clustering: mass drop, scoring, mass correction.

    Parameters
    ----------
    eventWise_path : str
        Path to the dataset used for input and writing outputs.
    end_time : int
        Compared against ``time.time()``; once passed, the remaining
        stages are skipped.
    z_cuts : optional
        Forwarded to ``ParallelFormJets.run_modified_mass_drop``;
        that stage is skipped when this is None.
    dijet_mass : float
        Forwarded to ``ParallelFormJets.run_score``.
    jet_pt_cut : optional
        Forwarded to ``ParallelFormJets.run_correct_masses``;
        that stage is skipped when this is None.

    """
    # make pileup free jets
    if z_cuts is not None:
        # no jet names are chosen here, None is handed on in their place
        ParallelFormJets.run_modified_mass_drop(eventWise_path, end_time,
                                                z_cuts, None)
        if time.time() > end_time:
            return None

    # score clusterings
    ParallelFormJets.run_score(eventWise_path, end_time, dijet_mass)
    if time.time() > end_time:
        return None

    # calculate mass peaks
    if jet_pt_cut is not None:
        ParallelFormJets.run_correct_masses(eventWise_path, end_time,
                                            jet_pt_cut)
        # deadline check after the last stage; nothing follows it at present
        if time.time() > end_time:
            return None
    return None
def test_worker():
    """Exercise _worker and worker with cluster_multiapply replaced by a fake."""
    # cluster_multiapply is mocked out, it has its own tests
    with unittest.mock.patch('jet_tools.FormJets.cluster_multiapply',
                             new=fake_cluster_multiapply), \
            TempTestDir("tst") as temp_dir:
        jet_name = "GoodJet"
        ew = Components.EventWise(os.path.join(temp_dir, "file.parquet"))
        ew.write()
        eventWise_path = os.path.join(temp_dir, ew.file_name)
        # arguments that are the same in every call below
        shared_args = (ParallelFormJets.batch_cluster, FormJets.Spectral,
                       jet_name, 13)

        # a stale continue file would keep the worker running forever
        try:
            os.remove('continue')
        except FileNotFoundError:
            pass

        # with the 'continue' condition and no continue file, stop at once
        ParallelFormJets._worker(eventWise_path, 'continue', *shared_args)
        assert len(fake_clusters) == 0

        # the profiling wrapper should still write a profile
        ParallelFormJets.worker(eventWise_path, 'continue', *shared_args)
        assert len(fake_clusters) == 0
        assert os.path.exists(eventWise_path.replace('.parquet', '.prof'))

        # given 10 seconds, expect more than 7 but fewer than 11 calls
        ParallelFormJets._worker(eventWise_path, time.time() + 10,
                                 *shared_args)
        assert len(fake_clusters) > 7
        assert len(fake_clusters) < 11
        for call in fake_clusters:
            assert call['eventWise'] == ew
            assert call['cluster_algorithm'] == FormJets.Spectral
            assert call['batch_length'] == 13
            assert len(call['dict_jet_params']) == 0
def test_make_n_working_fragments():
    """Check make_n_working_fragments on bad inputs, finished and unfinished jets."""
    with TempTestDir("tst") as dir_name:
        # a directory that holds no eventWise objects
        # should raise a file not found error
        with pytest.raises(FileNotFoundError):
            ParallelFormJets.make_n_working_fragments(dir_name, 3, "squibble")

        # a file that isn't an eventWise should raise a NotADirectoryError
        wrong_path = os.path.join(dir_name, "wrong.ods")
        open(wrong_path, 'w').close()  # equivalent of touch
        with pytest.raises(NotADirectoryError):
            ParallelFormJets.make_n_working_fragments(wrong_path, 3, "flob")
        os.remove(wrong_path)

        # now make a real eventWise with 12 events to play with
        ew_path = os.path.join(dir_name, "test.parquet")
        ew = Components.EventWise(ew_path)
        n_events = 12
        n_unfinished = 6
        n_done = n_events - n_unfinished
        unfinished_jet = 'DogJet'
        finished_jet = 'CatJet'
        params = {
            'Event_n': ak.from_iter(np.arange(n_events)),
            'JetInputs_Energy': ak.from_iter(np.arange(n_events)),
            unfinished_jet + '_Energy': ak.from_iter(np.random.rand(n_done)),
            unfinished_jet + '_Food': ak.from_iter([
                np.random.rand(np.random.randint(5))
                for _ in range(n_done)
            ]),
            finished_jet + '_Energy': ak.from_iter([[
                ak.from_iter(np.random.rand(np.random.randint(5)))
                for _ in range(np.random.randint(5))
            ] for _ in range(n_events)]),
        }
        ew.append(**params)

        def check_fragments(fragment_paths):
            # Each event the unfinished jet is missing must appear in exactly
            # one fragment, carrying the same CatJet_Energy as the original.
            # The order of events across fragments is not guaranteed.
            remaining = list(range(n_done, n_events))
            for fragment_path in fragment_paths:
                ew_part = Components.EventWise.from_file(fragment_path)
                indices_here = ew_part.JetInputs_Energy.tolist()
                for index in indices_here:
                    remaining.remove(index)
                flat_here = ak.flatten(ew.CatJet_Energy[indices_here])
                flat_part = ak.flatten(ew_part.CatJet_Energy)
                try:
                    flat_here = ak.flatten(flat_here)
                    flat_part = ak.flatten(flat_part)
                except ValueError:  # already flat
                    pass
                tst.assert_allclose(flat_here.tolist(), flat_part.tolist())
            assert not remaining, "Didn't find all the expected indices"

        # fragments of the finished jet: nothing to do, so no change on disk
        paths = ParallelFormJets.make_n_working_fragments(
            ew_path, 3, finished_jet)
        assert isinstance(paths, bool)
        assert paths
        assert len(os.listdir(dir_name)) == 1

        # now split the unfinished jet
        paths = ParallelFormJets.make_n_working_fragments(
            ew_path, 3, unfinished_jet)
        assert len(paths) == 3
        ew.selected_event = None
        check_fragments(paths)

        # asking for the same split again should give back the same paths,
        # whether addressed by file or by fragment directory
        paths2 = ParallelFormJets.make_n_working_fragments(
            ew_path, 3, unfinished_jet)
        assert set(paths2) == set(paths)
        paths2 = ParallelFormJets.make_n_working_fragments(
            os.path.dirname(paths[0]), 3, unfinished_jet)
        assert set(paths2) == set(paths)

        # a different number of fragments should be re-split to that number
        paths3 = ParallelFormJets.make_n_working_fragments(
            ew_path, 2, unfinished_jet)
        assert len(paths3) == 2
        check_fragments(paths3)

        # remove any existing directories
        for name in os.listdir(dir_name):
            if '.' not in name:
                shutil.rmtree(os.path.join(dir_name, name))

        # try fragmenting and then joining
        ew.fragment("JetInputs_Energy", n_fragments=3)
        fragment_dir = next(
            os.path.join(dir_name, name) for name in os.listdir(dir_name)
            if "fragment" in name)
        Components.EventWise.combine(fragment_dir,
                                     ew.file_name.split('.', 1)[0])
        ParallelFormJets.make_n_working_fragments(ew_path, 4, finished_jet)

        # check it has reconstructed the original eventWise
        found = os.listdir(fragment_dir)
        assert len(found) == 1
        new_ew = Components.EventWise.from_file(
            os.path.join(fragment_dir, found[0]))
        order = np.argsort(new_ew.JetInputs_Energy)
        dog_order = order[order < n_done]
        tst.assert_allclose(ew.JetInputs_Energy,
                            new_ew.JetInputs_Energy[order])
        tst.assert_allclose(ew.DogJet_Energy,
                            new_ew.DogJet_Energy[dog_order])
        for old_idx, new_idx in enumerate(dog_order):
            tst.assert_allclose(ew.DogJet_Food[old_idx],
                                new_ew.DogJet_Food[new_idx])
        for old_idx, new_idx in enumerate(order):
            jet_pairs = zip(ew.CatJet_Energy[old_idx],
                            new_ew.CatJet_Energy[new_idx])
            for evt, new_evt in jet_pairs:
                tst.assert_allclose(evt, new_evt)
def test_generate_pool():
    """Run generate_pool with a fake worker, with and without a continue file."""
    # _worker is mocked out, it has its own test
    with unittest.mock.patch('jet_tools.ParallelFormJets._worker',
                             new=fake_worker), \
            TempTestDir("tst") as temp_dir:
        ew = Components.EventWise(os.path.join(temp_dir, "file.parquet"))
        ew.append(JetInputs_Energy=np.arange(100))
        eventWise_path = os.path.join(temp_dir, ew.file_name)

        # a stale continue file would keep the pool running forever
        try:
            os.remove('continue')
        except FileNotFoundError:
            pass

        end_time = time.time() + 100
        jet_class = FormJets.Spectral
        jet_params = {"Bark": 3}
        n_cores = psutil.cpu_count()
        # kept as a lambda: the fake worker reads function.__name__
        function = lambda x: x

        def run_pool(leave_one_free, run_until):
            # same call each time, only the core count and stop condition vary
            return ParallelFormJets.generate_pool(
                eventWise_path, function,
                function_args=[jet_class, "Bali", jet_params],
                function_kwargs=dict(batch_size=500),
                leave_one_free=leave_one_free,
                end_time=run_until)

        def list_fragments():
            # the only non-parquet entry in temp_dir is the fragment directory
            subdir = next(
                os.path.join(temp_dir, name) for name in os.listdir(temp_dir)
                if ".parquet" not in name)
            return [
                os.path.join(subdir, name) for name in os.listdir(subdir)
                if Components.EventWise.pottential_file(name)
            ]

        def check_fragment(fragment_path, run_condition):
            # the fake worker leaves these marks on every file it handled
            part = Components.EventWise.from_file(fragment_path)
            tst.assert_allclose(part.Catto, [2, 3])
            assert part.Run_condition == run_condition
            assert part.Jet_class == jet_class.__name__
            assert part.Cluster_parameters["Bark"] == 3
            assert part.Batch_size == 500

        found = run_pool(True, end_time)
        assert found, "One of the processes crashed"
        # expect a subdirectory containing n_cores - 1 eventWise
        paths = list_fragments()
        assert len(paths) == n_cores - 1
        for path in paths:
            check_fragment(path, end_time)
        # tidy up
        shutil.rmtree(os.path.split(path)[0])

        # it should work fine with a continue file too
        open('continue', 'w').close()
        found = run_pool(False, None)
        assert found, "One of the processes crashed"
        # expect a subdirectory containing n_cores eventWise
        paths = list_fragments()
        assert len(paths) == n_cores, \
            f"paths = {len(paths)}, cores = {n_cores}"
        for path in paths:
            check_fragment(path, 'continue')
def test_make_jet_name():
    """A float index should be rendered without a decimal part in the jet name."""
    jet_name = ParallelFormJets.make_jet_name("Dog", 3.)
    assert jet_name == "Dog3Jet"
# Collect the data files in `directory` that have no fragment directory yet,
# or whose fragment directory holds no "_finished_batch_masks" marker.
unfinished = []
for name in os.listdir(directory):
    if not Components.EventWise.pottential_file(name):
        continue
    data_file = os.path.join(directory, name)
    fragment_name = name.replace(Components.EventWise.FILE_EXTENTION,
                                 "_fragment")
    # look beside the data file rather than in the current working directory
    fragment_dir = os.path.join(directory, fragment_name)
    if not os.path.exists(fragment_dir):
        unfinished.append(data_file)
        continue
    masks_finished = any(
        entry.endswith("_finished_batch_masks")
        for entry in os.listdir(fragment_dir))
    if not masks_finished:
        unfinished.append(data_file)

if not unfinished:
    raise RuntimeError(f"Couldn't find unfinished data in {directory}")
print(f"Found {len(unfinished)} data files to process,")
print(f" starting with {unfinished[0]}")

# only the first unfinished file is processed in this run
eventWise_path = unfinished[0]
scan_parameters = None
fix_parameters = None
z_cut = [0.2, 0.4]
jet_class = FormJets.GeneralisedKT
ParallelFormJets.complete_sequence(eventWise_path, jet_class, end_time,
                                   scan_parameters,
                                   fix_parameters=fix_parameters,
                                   dijet_mass=40, z_cuts=z_cut)
# The shapes file supplies the list of orders and of kinematic names.
ew_shapes = Components.EventWise.from_file("../megaIgnore/IRC_shapes2.parquet")
for n in range(1, 5):
    # nested lists indexed as [order][kinematic][jet], one entry per event
    spectral_kinematics = [[[[] for _ in spectral_jets]
                            for _ in ew_shapes.kinematic_names]
                           for _ in ew_shapes.orders]
    for order in ["nlo", "lo"]:
        o_idx = list(ew_shapes.orders).index(order)
        name = file_name.format(n, order, n)
        print(name)
        dir_name = os.path.dirname(name)
        if not os.path.exists(name):
            # no joined file yet, so build one from the completed parts
            print("Joining...", end='')
            ew = ParallelFormJets.combine_completed(dir_name, False)
            print("joined.", flush=True)
        else:
            ew = Components.EventWise.from_file(name)
        print("Getting jet kinematics")
        for j_idx, jname in enumerate(spectral_jets):
            print('.', end='', flush=True)
            ew.selected_event = None
            n_events = len(ew.Event_n)
            for event_n in range(n_events):
                ew.selected_event = event_n
                kinematics = get_event(ew, jname)
                for i, vals in enumerate(kinematics):
                    per_jet = spectral_kinematics[o_idx][i]
                    per_jet[j_idx].append(vals)
    # written into the directory of the last order handled for this n
    print("Saving", flush=True)
    save_name = os.path.join(dir_name, "kinematics.parquet")
    spectral_kinematics = ak.from_iter(spectral_kinematics)
    ak.to_parquet(spectral_kinematics, save_name)
def run_fix(eventWise_path, end_time):
    """Run ``batch_fix`` on ``eventWise_path`` through a worker pool.

    ``batch_fix`` is given no extra positional arguments, and ``end_time``
    is passed to ``ParallelFormJets.generate_pool`` unchanged.
    """
    ParallelFormJets.generate_pool(eventWise_path, batch_fix,
                                   function_args=[], end_time=end_time)
import os
import shutil
import sys
import time

from jet_tools import ParallelFormJets, FormJets

# Command line: <jet class name> <seconds to run for> <order> <iteration>
jet_class_name = sys.argv[1].strip()
run_seconds = int(sys.argv[2])
end_time = time.time() + run_seconds
order = sys.argv[3].strip()
iteration = sys.argv[4].strip()

# the jet class is looked up by name on the FormJets module
jet_class = getattr(FormJets, jet_class_name)
jet_source = f"../megaIgnore/IRCchecks_noPTcut1/iridislo_Scan_{jet_class_name}.awkd"

# work on a copy of the eventWise so that the source file is left untouched
source = f"../megaIgnore/IRCchecks_noPTcut{iteration}/iridis_pp_to_jjj_{order}{iteration}.awkd"
eventWise_path = f"../megaIgnore/IRCchecks_noPTcut{iteration}/iridis{order}_Scan_{jet_class_name}.awkd"
shutil.copyfile(source, eventWise_path)

# get the jet params from the scan held in jet_source
jet_params = ParallelFormJets.scan_score(eventWise_path, jet_class, end_time,
                                         jet_source)