def generate_efp_graphs():
    """Draw every selected EFP graph to disk in PDF, PNG and EPS formats."""
    # Two families of graphs: all connected prime EFPs with degree <= 7,
    # and chromatic-number-4 connected prime EFPs with degree <= 8.
    prime_d7 = ef.EFPSet("d<=7", "p==1")
    chrom_4 = ef.EFPSet("d<=8", "p==1", "c==4")
    for efpset in [prime_d7, chrom_4]:
        for efp_ix, graph in enumerate(efpset.graphs()):
            # Unpack the spec row for this graph (n=vertices, d=degree, k=index).
            n, e, d, v, k, c, p, h = efpset.specs[efp_ix]
            # Same drawing, one file per output format.
            plot_graph(graph, n, d, k, f"graphs/pdf/efp_{n}_{d}_{k}.pdf")
            plot_graph(graph, n, d, k, f"graphs/png/efp_{n}_{d}_{k}.png")
            plot_graph(graph, n, d, k, f"graphs/eps/efp_{n}_{d}_{k}.eps")
def test_get_graph_components():
    """Component counts from get_components must agree with the EFPSet specs."""
    efpset = ef.EFPSet()
    n_components = np.array(
        [len(ef.utils.get_components(g)) for g in efpset.graphs()])
    # The empty graph (index 0) is recorded as having 1 connected component
    # by EFPSet, so the comparison starts at index 1.
    assert np.all(n_components[1:] == efpset.specs[1:, -2])
def main():
    """Compute d<=7 EFPs for a small sample of LHC Olympics events and print them."""
    frame = pd.read_hdf(
        "/data/t3home000/spark/LHCOlympics/data/events_anomalydetection.h5")
    raw = frame.values
    events = raw[:, :2100]
    labels = raw[:, -1]  # final column — presumably the signal flag (unused below)
    # Regroup each flat row into (particle, feature) triplets.
    events = events.reshape((len(events), len(events[0]) // 3, 3))

    # data controls
    num_data = 1100000
    test_frac = 0.2

    # efp parameters
    dmax = 7
    measure = 'hadr'
    beta = 0.5

    print('Calculating d <= {} EFPs for {} jets... '.format(dmax, num_data), end='')
    efpset = ef.EFPSet(('d<=', dmax), measure='hadr', beta=beta)

    # Only the first 1000 events are actually computed here; drop
    # zero-padded particles (first column == 0) before computing.
    sample = events[:1000]
    masked = [x[x[:, 0] > 0] for x in sample]
    X = efpset.batch_compute(masked)
    print(X)
    print('done')
def generate_EFP():
    """Compute EFP features for every (dtype, split, graph, kappa, beta)
    combination and cache each as a feather file under data/efp/<split>/.

    Existing output files are skipped, so an interrupted run can resume.
    """
    # Calculate HL variables from ET, eta, phi
    dtypes = ["et", "ht"]
    splits = ["test", "train", "valid"]
    for dtype in dtypes:
        for split in splits:
            print(f"Processing data type: {dtype}, {split}")
            X = pd.read_pickle(
                f"data/processed/{split}_{dtype}.pkl")["features"]
            y = pd.read_pickle(
                f"data/processed/y_{split}_{dtype}.pkl")["targets"]

            # Choose kappa, beta values
            kappas = [-1, 0, 0.5, 1, 2]
            betas = [0.5, 1, 2]

            # Grab graphs: connected prime EFPs up to d=7, plus
            # chromatic-number-4 connected prime EFPs up to d=8.
            prime_d7 = ef.EFPSet("d<=7", "p==1")
            chrom_4 = ef.EFPSet("d<=8", "p==1", "c==4")
            efpsets = [prime_d7, chrom_4]

            for efpset in efpsets:
                graphs = efpset.graphs()
                t = tqdm(graphs)
                for efp_ix, graph in enumerate(t):
                    # Spec values depend only on the graph, so unpack them
                    # once here instead of inside the kappa/beta loops.
                    n, e, d, v, k, c, p, h = efpset.specs[efp_ix]
                    for kappa in kappas:
                        for beta in betas:
                            file_name = f"data/efp/{split}/{dtype}_efp_{n}_{d}_{k}_k_{kappa}_b_{beta}.feather"
                            if not os.path.exists(file_name):
                                t.set_description(
                                    f"Processing: EFP[{n},{d},{k}](k={kappa},b={beta})"
                                )
                                efp_val = efp(
                                    data=X,
                                    graph=graph,
                                    kappa=kappa,
                                    beta=beta,
                                    normed=False,
                                )
                                efp_df = pd.DataFrame({
                                    "features": efp_val,
                                    "targets": y,
                                })
                                efp_df.to_feather(file_name)
def test_batch_compute_vs_compute(measure, beta, kappa, normed):
    """batch_compute over many events must match per-event compute."""
    # Parameter combinations the library does not support / we don't test.
    if measure == 'hadr' and kappa == 'pf':
        pytest.skip('hadr does not do pf')
    if 'efm' in measure and beta != 2:
        pytest.skip('only test efm when beta=2')

    events = ef.gen_random_events(10, 15)
    efpset = ef.EFPSet('d<=6', measure=measure, beta=beta,
                       kappa=kappa, normed=normed)

    batched = efpset.batch_compute(events)
    sequential = np.asarray([efpset.compute(ev) for ev in events])
    assert epsilon_percent(batched, sequential, 10**-14)
def test_efpset_vs_efps(measure, beta, kappa, normed, event):
    """EFPSet.compute must agree with evaluating each EFP individually."""
    # handle cases we want to skip
    if measure == 'hadr' and kappa == 'pf':
        pytest.skip('hadr does not do pf')
    if 'efm' in measure and beta != 2:
        pytest.skip('only test efm when beta=2')

    # Build the set and the equivalent list of standalone EFPs with
    # identical measure parameters.
    kwargs = dict(measure=measure, beta=beta, kappa=kappa, normed=normed)
    efpset = ef.EFPSet('d<=6', **kwargs)
    singles = [ef.EFP(g, **kwargs) for g in efpset.graphs()]

    from_set = efpset.compute(event)
    from_singles = np.asarray([one.compute(event) for one in singles])
    assert epsilon_percent(from_set, from_singles, 10**-12)
def test_batch_compute_vs_compute(measure, beta, kappa, normed):
    """Single-job batch_compute must reproduce per-event compute results."""
    # Skip parameter combinations the library rejects.
    if measure == 'hadr' and kappa == 'pf':
        pytest.skip('hadr does not do pf')
    if kappa == 'pf' and normed:
        pytest.skip('normed not supported with kappa=pf')
    if ('efm' in measure) and (beta != 2):
        pytest.skip('only beta=2 can use efm measure')

    events = ef.gen_random_events(10, 15)
    efpset = ef.EFPSet('d<=6', measure=measure, beta=beta,
                       kappa=kappa, normed=normed)

    batched = efpset.batch_compute(events, n_jobs=1)
    one_by_one = np.asarray(list(map(efpset.compute, events)))
    assert epsilon_percent(batched, one_by_one, 10**-14)
def test_efpset_vs_efps(measure, beta, kappa, normed, event):
    """A full EFPSet evaluation must equal per-graph EFP evaluations."""
    # handle cases we want to skip
    if measure == 'hadr' and kappa == 'pf':
        pytest.skip('hadr does not do pf')
    if kappa == 'pf' and normed:
        pytest.skip('normed not supported with kappa=pf')
    if ('efm' in measure) and (beta != 2):
        pytest.skip('only beta=2 can use efm measure')

    efpset = ef.EFPSet('d<=6', measure=measure, beta=beta,
                       kappa=kappa, normed=normed)
    # One standalone EFP per graph, sharing the measure configuration.
    standalone = [
        ef.EFP(g, measure=measure, beta=beta, kappa=kappa, normed=normed)
        for g in efpset.graphs()
    ]

    set_result = efpset.compute(event)
    single_results = np.asarray([e.compute(event) for e in standalone])
    assert epsilon_percent(set_result, single_results, 10**-12)
def generate_EFP():
    """Compute EFP features for every (graph, kappa, beta) combination and
    cache each as a feather file under data/efp/.

    Existing output files are skipped, so an interrupted run can resume.
    """
    hdf_file = path.join(data_path, "processed", "prep_data.h5")
    X = pd.read_hdf(hdf_file, "features").features.to_numpy()
    y = pd.read_hdf(hdf_file, "targets").targets.values

    # Choose kappa, beta values
    kappas = [-1, 0, 0.25, 0.5, 1, 2]
    betas = [0.25, 0.5, 1, 2, 3, 4]

    # Grab graphs: connected prime EFPs up to d=7, plus chromatic-number-4
    # connected prime EFPs up to d=8.
    prime_d7 = ef.EFPSet("d<=7", "p==1")
    chrom_4 = ef.EFPSet("d<=8", "p==1", "c==4")
    efpsets = [prime_d7, chrom_4]

    for efpset in efpsets:
        graphs = efpset.graphs()
        t = tqdm(graphs)
        for efp_ix, graph in enumerate(t):
            # Spec values depend only on the graph, so unpack them once
            # here instead of inside the kappa/beta loops.
            n, e, d, v, k, c, p, h = efpset.specs[efp_ix]
            for kappa in kappas:
                for beta in betas:
                    file_name = f"data/efp/efp_{n}_{d}_{k}_k_{kappa}_b_{beta}.feather"
                    if not path.isfile(file_name):
                        t.set_description(
                            f"Processing: EFP[{n},{d},{k}](k={kappa},b={beta})")
                        efp_val = efp(
                            data=X,
                            graph=graph,
                            kappa=kappa,
                            beta=beta,
                            normed=False,
                        )
                        efp_df = pd.DataFrame({
                            "features": efp_val,
                            "targets": y,
                        })
                        efp_df.to_feather(file_name)
def efp(args, jets, mask=None, real=True):
    """Batch-compute the n=4, d=4 connected prime EFPs for a batch of jets."""
    efpset = ef.EFPSet(('n==', 4), ('d==', 4), ('p==', 1),
                       measure='hadr', beta=1, normed=None, coords='ptyphim')
    formatted = ef_format(jets)

    # For generated (non-real) jets with masking enabled, zero out every
    # feature of hits that the mask marks as absent.
    if not real and args.mask:
        for jet_ix in range(jets.shape[0]):
            for hit_ix in range(args.num_hits):
                if mask[jet_ix][hit_ix]:
                    continue
                for feat_ix in range(4):
                    formatted[jet_ix][hit_ix][feat_ix] = 0

    logging.info("Batch Computing")
    return efpset.batch_compute(formatted)
# real_means.append(np.mean(np.array(real_w1s), axis=0)) # real_stds.append(np.std(np.array(real_w1s), axis=0)) gen_means.append(np.mean(np.array(gen_w1s), axis=0)) gen_stds.append(np.std(np.array(gen_w1s), axis=0)) real_means real_stds gen_means gen_stds # Get all prime EFPs with n=4, d=4 # Specify EFPs set efpset = ef.EFPSet(('n==', 4), ('d==', 4), ('p==', 1), measure='hadr', beta=1, normed=None, coords='ptyphim') N = 100000 gen_out_efp_format = np.concatenate( (np.expand_dims(gen_out[:, :, 2], 2), gen_out[:, :, :2], np.zeros((gen_out.shape[0], gen_out.shape[1], 1))), axis=2) X_efp_format = np.concatenate( (np.expand_dims(Xplot[:, :, 2], 2), Xplot[:, :, :2], np.zeros((N, 30, 1))), axis=2) gen_out_efp = efpset.batch_compute(gen_out_efp_format) X_efp = efpset.batch_compute(X_efp_format)
_jetsPL = np.load('jet_input.npy', allow_pickle=True) # _jetsPL = _jetsPL[:2000] print(len(_jetsPL)) BINS = np.linspace(-0.4, 0.4, num=utils.N_IMAGE_BINS + 1) #print(len(bins)) jetsPL_train = [] # jetsDL_train = [] jetsPL_test = [] # jetsDL_test = [] jet_images_pl_test = [] # jet_images_dl_test = [] efpset = energyflow.EFPSet(('d<=', 4), measure='hadr', beta=0.5) # masked_X = [x[x[:,0] > 0] for x in _jetsPL] # X = efpset.compute(_jetsPL[3]) #print(len(X)) # print(X.shape) efps_pl = efpset.batch_compute(_jetsPL, n_jobs=2)[:, 1:] normalization = np.max(efps_pl, axis=0) print(np.max(efps_pl, axis=0)) efps_pl = np.divide(efps_pl, normalization) print(np.max(efps_pl, axis=0)) efps_pl_train = [] efps_pl_test = [] for efp in efps_pl:
def __init__(self, input_path, store_n_jets, jet_delta_r, max_n_constituents, efp_degree):
    """
    Reads input trees, recognizes input types, initializes EFP processor and
    prepares all arrays needed to store output variables.

    Args:
        input_path: path/pattern of the input file(s) to open with uproot.
        store_n_jets: maximum number of jets stored per event.
        jet_delta_r: jet delta-R parameter stored for later use.
        max_n_constituents: max constituents stored per jet; a non-positive
            value disables constituent storage (cap defaults to 100).
        efp_degree: maximum EFP degree; a negative value disables EFPs.
    """
    self.set_input_paths_and_selections(input_path=input_path)

    # read files, trees and recognize input type
    self.files = {path: uproot.open(path) for path in self.input_file_paths}
    self.trees = {}
    self.input_types = {}
    self.read_trees()
    self.n_all_events = sum([tree.num_entries for tree in self.trees.values()])
    # One extra slot beyond the selected events — presumably reserved
    # headroom; TODO(review): confirm why arrays are sized n_selected + 1.
    self.n_events = sum(map(len, list(self.selections.values()))) + 1

    print("Found {0} file(s)".format(len(self.files)))
    print("Found {0} tree(s)".format(len(self.trees)))
    print("Found ", self.n_events - 1, " selected events, out of a total of ", self.n_all_events)

    # set internal parameters
    self.jet_delta_r = jet_delta_r
    self.max_n_constituents = max_n_constituents if max_n_constituents > 0 else 100
    self.max_n_jets = store_n_jets
    self.EFP_size = 0

    # initialize EFP set (only when a non-negative degree was requested)
    if efp_degree >= 0:
        print("\n\n=======================================================")
        print("Creating energyflow particle set with degree d <= {0}...".format(efp_degree))
        self.efpset = ef.EFPSet("d<={0}".format(efp_degree), measure='hadr', beta=1.0, normed=True, verbose=True)
        self.EFP_size = self.efpset.count()
        print("EFP set is size: {}".format(self.EFP_size))
        print("=======================================================\n\n")

    # prepare arrays for event & jet features, EFPs and jet constituents
    self.output_arrays = {
        OutputTypes.EventFeatures: np.empty((self.n_events, len(Event.get_features_names()))),
        OutputTypes.JetFeatures: np.empty((self.n_events, self.max_n_jets, len(Jet.get_feature_names()))),
        OutputTypes.JetConstituents: np.empty((self.n_events, self.max_n_jets, self.max_n_constituents, len(Jet.get_constituent_feature_names()))),
        OutputTypes.EPFs: np.empty((self.n_events, self.max_n_jets, self.EFP_size))
    }

    # dataset names used when writing each output array
    self.output_names = {
        OutputTypes.EventFeatures: "event_features",
        OutputTypes.JetFeatures: "jet_features",
        OutputTypes.JetConstituents: "jet_constituents",
        OutputTypes.EPFs: "jet_eflow_variables"
    }

    # per-column labels for each output array
    self.output_labels = {
        OutputTypes.EventFeatures: Event.get_features_names(),
        OutputTypes.JetFeatures: Jet.get_feature_names(),
        OutputTypes.JetConstituents: Jet.get_constituent_feature_names(),
        OutputTypes.EPFs: [str(i) for i in range(self.EFP_size)]
    }

    # which outputs to actually save (constituents/EFPs are optional)
    self.save_outputs = {
        OutputTypes.EventFeatures: True,
        OutputTypes.JetFeatures: True,
        OutputTypes.JetConstituents: False if max_n_constituents < 0 else True,
        OutputTypes.EPFs: False if efp_degree < 0 else True
    }
beta = 0.5 # plotting colors = ['tab:red', 'tab:orange', 'tab:olive', 'tab:green', 'tab:blue'] ################################################################################ # load data X, y = qg_jets.load(num_data) print('Loaded quark and gluon jets') # calculate EFPs print('Calculating d <= {} EFPs for {} jets... '.format(dmax, num_data), end='') efpset = ef.EFPSet(('d<=', dmax), measure='hadr', beta=beta) masked_X = [x[x[:, 0] > 0] for x in X] X = efpset.batch_compute(masked_X) print('Done') # train models with different numbers of EFPs as input rocs = [] for d in range(1, dmax + 1): # build architecture model = LinearClassifier(linclass_type='lda') # select EFPs with degree <= d X_d = X[:, efpset.sel(('d<=', d))] # do train/val/test split
def test_linear_relations(measure):
    """Check linear identities among low-degree EFPs and leafless-graph counts.

    Evaluates a catalogue of small multigraph EFPs on random events and
    asserts the linear relations stated in the EFM paper, then counts
    leafless multigraphs per degree and compares against the paper's table.
    """
    # Catalogue of small multigraphs, keyed by a descriptive name and
    # grouped by degree (number of edges).
    graphs = {
        # d=0
        'dot': [],
        # d=1
        'line': [(0,1)],
        # d=2
        'dumbbell': [(0,1), (0,1)],
        'wedge': [(0,1),(1,2)],
        'linesqd' : [(0,1),(2,3)],
        # d = 3
        'tribell' : [(0,1),(0,1),(0,1)],
        'triangle' : [(0,1),(1,2),(2,0)],
        'asymwedge' : [(0,1),(0,1),(1,2)],
        'birdfoot' : [(0,1),(0,2),(0,3)],
        'chain' : [(0,1),(1,2),(2,3)],
        'linedumbbell' : [(0,1),(2,3),(2,3)],
        'linewedge' : [(0,1),(2,3),(3,4)],
        'linecbd' : [(0,1),(2,3),(4,5)],
        # d = 4
        'quadbell' : [(0,1),(0,1),(0,1),(0,1)],
        'doublewedge' : [(0,1),(0,1),(1,2),(1,2)],
        'icecreamcone' : [(0,1),(0,1),(1,2),(2,0)],
        'asymwedge2' : [(0,1),(0,1),(0,1),(1,2)],
        'square' : [(0,1),(1,2),(2,3),(3,0)],
        'flyswatter' : [(0,1),(1,2),(2,3),(3,1)],
        'chain2mid' : [(0,1),(1,2),(1,2),(2,3)],
        'chain2end' : [(0,1),(1,2),(2,3),(2,3)],
        'asymbirdfoot' : [(0,1),(0,1),(1,2),(1,3)],
        'bigbirdfoot' : [(0,1),(0,2),(0,3),(0,4)],
        'dog' : [(0,1),(1,2),(2,3),(2,4)],
        'bigchain' : [(0,1),(1,2),(2,3),(3,4)],
        'dumbbellwedge' : [(0,1),(0,1),(2,3),(3,4)],
        'triangleline' : [(0,1),(1,2),(2,0),(3,4)],
        'dumbbellsqd' : [(0,1),(0,1),(2,3),(2,3)],
        # d = 5
        'pentagon' : [(0,1),(1,2),(2,3),(3,4),(4,0)],
        'triangledumbbell': [(0,1),(0,1),(2,3),(3,4),(4,2)]
    }

    # pick a random event with 2 particles
    event = ef.gen_random_events(1, 2, dim=4)

    # compute the value of all of the EFPs on this event
    d = {name: ef.EFP(graph, measure=measure, coords='epxpypz')(event)
         for name,graph in graphs.items()}

    eps = 10**-8

    # check that the identities in the EFM paper are valid (i.e. each
    # combination below equals 0 on a 2-particle event)
    assert epsilon_diff(2 * d['wedge'] - d['dumbbell'], 0, eps)
    assert epsilon_diff(2 * d['triangle'], 0, eps)
    assert epsilon_diff(d['tribell'] - 2 * d['asymwedge'], 0, eps)
    assert epsilon_diff(2 * d['chain'] - d['linedumbbell'] - d['triangle'], 0, eps)
    assert epsilon_diff(d['birdfoot'] + d['chain'] - d['asymwedge'], 0, eps)

    # Four Dimensions
    # pick a random event in 4 dimensions
    event = ef.gen_random_events(1, 25, dim=4)

    # compute the value of all of the EFPs on this event
    d = {name: ef.EFP(graph, measure=measure, coords='epxpypz')(event)
         for name,graph in graphs.items()}

    # check that the identity in the paper is valid (i.e. = 0)
    assert epsilon_percent(6*d['pentagon'], 5*d['triangledumbbell'], 10**-11)

    # count the number of leafless multigraphs (all or just connected) with degree d
    ds = np.arange(11)
    counts_all, counts_con = [], []

    # for each degree, get the graphs with edges<=d and check whether they are leafless
    # (note: this loop rebinds `d`, shadowing the EFP-values dict above)
    for d in ds:
        counts_all.append(np.sum([leafless(graph) for graph in ef.EFPSet(('d<=',d)).graphs()]))
        counts_con.append(np.sum([leafless(graph) for graph in ef.EFPSet(('d<=',d), ('p==',1)).graphs()]))

    # note: computed counts are cumulative, must take the difference to get individual d
    counts_all = np.asarray(counts_all[1:]) - np.asarray(counts_all[:-1])
    counts_con = np.asarray(counts_con[1:]) - np.asarray(counts_con[:-1])

    # ensure agreement with the table in the paper
    assert epsilon_diff(counts_all, [0,1,2,5,11,34,87,279,897,3129], eps)
    assert epsilon_diff(counts_con, [0,1,2,4,9,26,68,217,718,2553], eps)