def main(argv):
    #parse command line arguments
    parser = argparse.ArgumentParser(add_help=False)
    parser.add_argument("-d", "--data_dir", type=str, required=True, dest="data_dir",
                        help="name of directory where data is stored")
    parser.add_argument("-s", "--dataset", type=str, required=True, dest="data_name",
                        help="name of dataset to create (without hdf5 extension)")
    parser.add_argument("-n", "--normed", type=int, default=1, dest="use_normed",
                        help="choose whether to use normalized features or not")
    args = parser.parse_args()

    data_path = args.data_dir
    data_name = args.data_name
    use_normed = args.use_normed

    infile_prefix = data_path + data_name + "_"
    if use_normed:
        ext = '.normed'
    else:
        ext = ''

    total_true = total_edges = total_b = total_c = 0
    dataset_len = np.zeros(3)

    for i, dataset_type in enumerate(['train', 'val', 'test']):
        g_list = []
        graphs = dgl.load_graphs(infile_prefix + dataset_type + ext + ".bin")[0]
        dataset_len[i] = len(graphs)
        for graph in graphs:
            ntracks = graph.num_nodes()
            graph = prune_graph(graph)
            #only consider training dataset when writing values to paramfile
            if dataset_type == 'train':
                total_true += int(th.sum(graph.edata['bin_labels'][:, 0]))
                total_b += int(th.sum(graph.edata['mult_labels'][:, 0] == 1))
                total_c += int(th.sum(graph.edata['mult_labels'][:, 0] == 2))
                total_edges += list(graph.edata['bin_labels'][:, 0].size())[0]
            g_list.append(graph)
        random.shuffle(g_list)
        dgl.save_graphs(infile_prefix + dataset_type + ext + '.pruned.bin', g_list)

    #store important values in paramfile
    paramfile = open(infile_prefix + 'params', "w")
    paramfile.write(str(dataset_len[0]) + '\n')  #train length
    paramfile.write(str(dataset_len[1]) + '\n')  #val length
    paramfile.write(str(dataset_len[2]) + '\n')  #test length
    paramfile.write(str(total_true / total_edges) + '\n')
    paramfile.write(str(total_b / total_edges) + '\n')
    paramfile.write(str(total_c / total_edges) + '\n')
    paramfile.close()
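# The script above calls prune_graph(), which is not defined in this excerpt.
# Below is a minimal sketch of what such a helper might look like, assuming
# (hypothetically) that pruning means dropping the tracks flagged as failing
# the selection cuts recorded in the 'passed_cuts' node feature by the
# graph-creation script later in this collection; the original helper may use
# a different criterion.
import dgl
import torch as th

def prune_graph(graph):
    """Remove nodes that failed the track-level cuts (sketch, not the original)."""
    failed = th.nonzero(graph.ndata['passed_cuts'][:, 0] == 0, as_tuple=True)[0]
    # dgl.remove_nodes drops the listed nodes and their incident edges while
    # keeping the features of the remaining nodes/edges, so 'bin_labels' and
    # 'mult_labels' stay consistent with the surviving edges.
    return dgl.remove_nodes(graph, failed)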
def create_old_heterograph_files():
    path = os.path.join(os.path.dirname(__file__), "data/hetero1.bin")
    g_list0 = create_heterographs(F.int64) + create_heterographs(F.int32)
    labels_dict = {"graph_label": F.ones(54)}
    dgl.save_graphs(path, g_list0, labels_dict)
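# Reading the file written above returns both the graph list and the label
# dictionary. A minimal sketch of the round trip; the path simply mirrors the
# one used in create_old_heterograph_files().
import os
import dgl

path = os.path.join(os.path.dirname(__file__), "data/hetero1.bin")
g_list, labels_dict = dgl.load_graphs(path)
print(len(g_list), labels_dict["graph_label"].shape)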
    # assign node feature: origin_id (not node_id)
    for entity in entity_dic.keys():
        node_feature = th.tensor(entity_dic[entity])
        g.nodes[entity].data['id'] = node_feature
    # assign edge feature: time
    for edge in edge_dic.keys():
        edge_feature = th.tensor(edge_dic[edge])
        g.edges[edge].data['timestamp'] = edge_feature
    return g
    # print(g)
    # print(g.number_of_nodes('process'))
    # print(g.nodes['process'].data['id'])
    # print(g.edges[('process', 'open', 'file')].data['timestamp'])


if __name__ == '__main__':
    scenario = 'VGame'
    for graph_id in range(200, 299):
        dgl_graph = data_to_heterograph(scenario, graph_id)
        dgl_graphname = "dataset/dglGraph/" + scenario + "/" + str(graph_id) + ".bin"
        graph_labels = {"glabel": th.tensor([graph_id])}
        dgl.save_graphs(dgl_graphname, [dgl_graph], graph_labels)
        print("graph #" + str(graph_id) + " of scenario " + scenario + " has been saved!")

    # load graph from disk
    # glist, label_dict = dgl.load_graphs("dataset/dglGraph/YouTube/0.bin")
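# data_to_heterograph() itself is not shown above. A minimal sketch of the
# construction step that the feature-assignment loops assume: a heterograph is
# built from per-relation edge lists keyed by canonical edge types such as
# ('process', 'open', 'file'). The dictionaries and relations below are
# illustrative placeholders, not the original code.
import dgl
import torch as th

graph_data = {
    ('process', 'open', 'file'): (th.tensor([0, 1]), th.tensor([0, 0])),
    ('process', 'connect', 'socket'): (th.tensor([1]), th.tensor([0])),
}
g = dgl.heterograph(graph_data)
entity_dic = {'process': [101, 102], 'file': [7], 'socket': [55]}
edge_dic = {('process', 'open', 'file'): [1622000000, 1622000010],
            ('process', 'connect', 'socket'): [1622000020]}
for entity in entity_dic:
    g.nodes[entity].data['id'] = th.tensor(entity_dic[entity])
for edge in edge_dic:
    g.edges[edge].data['timestamp'] = th.tensor(edge_dic[edge])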
def main(argv): #parse command line arguments parser = argparse.ArgumentParser(add_help=False) parser.add_argument("-d", "--data_dir", type=str, required=True, dest="data_dir", help="name of directory where data is stored") parser.add_argument( "-s", "--dataset", type=str, required=True, dest="data_name", help="name of dataset to create (without hdf5 extension)") args = parser.parse_args() data_path = args.data_dir data_name = args.data_name train_infile_name = data_path + data_name + "_train.bin" val_infile_name = data_path + data_name + "_val.bin" test_infile_name = data_path + data_name + "_test.bin" train_outfile_name = data_path + data_name + "_train.normed.bin" val_outfile_name = data_path + data_name + "_val.normed.bin" test_outfile_name = data_path + data_name + "_test.normed.bin" normfile_name = data_path + data_name + "_norm" incl_errors = incl_hits = incl_corr = incl_vweight = False train_graphs = dgl.load_graphs(train_infile_name)[0] num_features_base = train_graphs[0].ndata['features_base'].size()[1] mean_features_base = np.zeros(num_features_base) std_features_base = np.zeros(num_features_base) if 'features_vweight' in train_graphs[0].ndata.keys(): incl_vweight = True num_features_vweight = train_graphs[0].ndata['features_vweight'].size( )[1] mean_features_vweight = np.zeros(num_features_vweight) std_features_vweight = np.zeros(num_features_vweight) if 'features_errors' in train_graphs[0].ndata.keys(): incl_errors = True num_features_errors = train_graphs[0].ndata['features_errors'].size( )[1] mean_features_errors = np.zeros(num_features_errors) std_features_errors = np.zeros(num_features_errors) if 'features_hits' in train_graphs[0].ndata.keys(): incl_hits = True num_features_hits = train_graphs[0].ndata['features_hits'].size()[1] mean_features_hits = np.zeros(num_features_hits) std_features_hits = np.zeros(num_features_hits) if 'features_corr' in train_graphs[0].ndata.keys(): incl_corr = True num_features_corr = train_graphs[0].ndata['features_corr'].size()[1] mean_features_corr = np.zeros(num_features_corr) std_features_corr = np.zeros(num_features_corr) print("Calculating mean of features") #calculate mean of training features - error and cov scaling is based only on scaling of base features total_tracks = 0 for graph in train_graphs: features_base = graph.ndata['features_base'].numpy() mean_features_base += np.sum(features_base, axis=0) if incl_vweight: features_vweight = graph.ndata['features_vweight'].numpy() mean_features_vweight += np.sum(features_vweight, axis=0) if incl_hits: features_hits = graph.ndata['features_hits'].numpy() mean_features_hits += np.sum(features_hits, axis=0) total_tracks += graph.ndata['features_base'].size()[0] mean_features_base = mean_features_base / total_tracks if incl_vweight: mean_features_vweight = mean_features_vweight / total_tracks if incl_hits: mean_features_hits = mean_features_hits / total_tracks print("Calculating STD of features") #calculate std of training features for graph in train_graphs: features_base = graph.ndata['features_base'].numpy() std_features_base += np.sum(np.square(features_base - mean_features_base), axis=0) if incl_vweight: features_vweight = graph.ndata['features_vweight'].numpy() std_features_vweight += np.sum(np.square(features_vweight - mean_features_vweight), axis=0) if incl_hits: features_hits = graph.ndata['features_hits'].numpy() std_features_hits += np.sum(np.square(features_hits - mean_features_hits), axis=0) std_features_base = np.sqrt(std_features_base / total_tracks) if incl_vweight: 
std_features_vweight = np.sqrt(std_features_vweight / total_tracks) if incl_hits: std_features_hits = np.sqrt(std_features_hits / total_tracks) #manually set normalization parameters for special features (features that have a fixed range are set to vary from -1 to 1) mean_features_base[1] = math.pi / 2. #track theta varies from 0 to pi std_features_base[1] = math.pi / 2. mean_features_base[2] = 0 #track phi varies from -pi to pi std_features_base[2] = math.pi mean_features_base[7] = 0 #jet phi varies from -pi to pi std_features_base[7] = math.pi if incl_errors: #take std of variance to be std of features squared std_features_errors[0] = std_features_base[0] std_features_errors[1] = std_features_base[1] std_features_errors[2] = std_features_base[2] std_features_errors[3] = std_features_base[3] std_features_errors[4] = std_features_base[4] if incl_corr: #take std of covariance to be product of std of features std_features_corr[0] = std_features_base[0] * std_features_base[1] std_features_corr[1] = std_features_base[0] * std_features_base[2] std_features_corr[2] = std_features_base[0] * std_features_base[3] std_features_corr[3] = std_features_base[0] * std_features_base[4] std_features_corr[4] = std_features_base[1] * std_features_base[2] std_features_corr[5] = std_features_base[1] * std_features_base[3] std_features_corr[6] = std_features_base[1] * std_features_base[4] std_features_corr[7] = std_features_base[2] * std_features_base[3] std_features_corr[8] = std_features_base[2] * std_features_base[4] std_features_corr[9] = std_features_base[3] * std_features_base[4] #store normalization parameters in file normfile = open(normfile_name, "w") for i in range(len(mean_features_base)): normfile.write(str(mean_features_base[i]) + '\n') normfile.write(str(std_features_base[i]) + '\n') if incl_vweight: for i in range(len(mean_features_vweight)): normfile.write(str(mean_features_vweight[i]) + '\n') normfile.write(str(std_features_vweight[i]) + '\n') if incl_errors: for i in range(len(mean_features_errors)): normfile.write(str(mean_features_errors[i]) + '\n') normfile.write(str(std_features_errors[i]) + '\n') if incl_corr: for i in range(len(mean_features_corr)): normfile.write(str(mean_features_corr[i]) + '\n') normfile.write(str(std_features_corr[i]) + '\n') if incl_hits: for i in range(len(mean_features_hits)): normfile.write(str(mean_features_hits[i]) + '\n') normfile.write(str(std_features_hits[i]) + '\n') normfile.close() #apply normalization from training data to all graph features print("Normalizing {} training graphs".format(len(train_graphs))) for graph in train_graphs: features_base = graph.ndata['features_base'].numpy() normed_features_base = np.divide(features_base - mean_features_base, std_features_base) graph.ndata['features_base'] = th.from_numpy(normed_features_base) if incl_vweight: features_vweight = graph.ndata['features_vweight'].numpy() normed_features_vweight = np.divide( features_vweight - mean_features_vweight, std_features_vweight) graph.ndata['features_vweight'] = th.from_numpy( normed_features_vweight) if incl_errors: features_errors = graph.ndata['features_errors'].numpy() normed_features_errors = np.divide( features_errors - mean_features_errors, std_features_errors) graph.ndata['features_errors'] = th.from_numpy( normed_features_errors) if incl_corr: features_corr = graph.ndata['features_corr'].numpy() normed_features_corr = np.divide( features_corr - mean_features_corr, std_features_corr) graph.ndata['features_corr'] = th.from_numpy(normed_features_corr) if incl_hits: 
features_hits = graph.ndata['features_hits'].numpy() normed_features_hits = np.divide( features_hits - mean_features_hits, std_features_hits) graph.ndata['features_hits'] = th.from_numpy(normed_features_hits) dgl.save_graphs(train_outfile_name, train_graphs) val_graphs = dgl.load_graphs(val_infile_name)[0] print("Normalizing {} validation graphs".format(len(val_graphs))) for graph in val_graphs: features_base = graph.ndata['features_base'].numpy() normed_features_base = np.divide(features_base - mean_features_base, std_features_base) graph.ndata['features_base'] = th.from_numpy(normed_features_base) if incl_vweight: features_vweight = graph.ndata['features_vweight'].numpy() normed_features_vweight = np.divide( features_vweight - mean_features_vweight, std_features_vweight) graph.ndata['features_vweight'] = th.from_numpy( normed_features_vweight) if incl_errors: features_errors = graph.ndata['features_errors'].numpy() normed_features_errors = np.divide( features_errors - mean_features_errors, std_features_errors) graph.ndata['features_errors'] = th.from_numpy( normed_features_errors) if incl_corr: features_corr = graph.ndata['features_corr'].numpy() normed_features_corr = np.divide( features_corr - mean_features_corr, std_features_corr) graph.ndata['features_corr'] = th.from_numpy(normed_features_corr) if incl_hits: features_hits = graph.ndata['features_hits'].numpy() normed_features_hits = np.divide( features_hits - mean_features_hits, std_features_hits) graph.ndata['features_hits'] = th.from_numpy(normed_features_hits) dgl.save_graphs(val_outfile_name, val_graphs) test_graphs = dgl.load_graphs(test_infile_name)[0] print("Normalizing {} testing graphs".format(len(test_graphs))) for graph in test_graphs: features_base = graph.ndata['features_base'].numpy() normed_features_base = np.divide(features_base - mean_features_base, std_features_base) graph.ndata['features_base'] = th.from_numpy(normed_features_base) if incl_vweight: features_vweight = graph.ndata['features_vweight'].numpy() normed_features_vweight = np.divide( features_vweight - mean_features_vweight, std_features_vweight) graph.ndata['features_vweight'] = th.from_numpy( normed_features_vweight) if incl_errors: features_errors = graph.ndata['features_errors'].numpy() normed_features_errors = np.divide( features_errors - mean_features_errors, std_features_errors) graph.ndata['features_errors'] = th.from_numpy( normed_features_errors) if incl_corr: features_corr = graph.ndata['features_corr'].numpy() normed_features_corr = np.divide( features_corr - mean_features_corr, std_features_corr) graph.ndata['features_corr'] = th.from_numpy(normed_features_corr) if incl_hits: features_hits = graph.ndata['features_hits'].numpy() normed_features_hits = np.divide( features_hits - mean_features_hits, std_features_hits) graph.ndata['features_hits'] = th.from_numpy(normed_features_hits) dgl.save_graphs(test_outfile_name, test_graphs)
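# The "_norm" file written above stores one mean line followed by one std line
# per feature, block by block (base features first, then vweight/errors/corr/
# hits if present). A minimal sketch of reading the base-feature block back and
# normalizing a graph's 'features_base'; the file name, graph file, and feature
# count are placeholders, not taken from the original run.
import numpy as np
import torch as th
import dgl

normfile_name = "dataset_norm"      # placeholder path
num_features_base = 8               # matches the base features used in this pipeline

with open(normfile_name) as normfile:
    vals = [float(line) for line in normfile]
mean_features_base = np.array(vals[0:2 * num_features_base:2])
std_features_base = np.array(vals[1:2 * num_features_base:2])

graph = dgl.load_graphs("dataset_test.bin")[0][0]   # placeholder file
feats = graph.ndata['features_base'].numpy()
graph.ndata['features_base'] = th.from_numpy((feats - mean_features_base) / std_features_base)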
def sample_opti_sequenz_graph(iter=None, maxIteration = 501): # initialize LEP LEP = LEProblem(bridge2Dstochastic()) tau_adapter = tauadaptor(LEP.get_tau()) mesh_adapter = meshadaptor(mesh_0=LEP.mesh, CVaR=LEP.CVaR) LEP = get_unique_dist_g(LEP, iter=None) IterationStep = 0 get_new_graph_iter = 4 iters = [] taus = [] taus.append(LEP.tau_0) gammas = [] es = [] controls = [] for k in range(maxIteration): IterationStep += 1 get_new_graph_iter +=1 iters.append(IterationStep) print("\nIter: {IterationStep}".format(IterationStep=IterationStep)) # solving SE = LEP.SE(LEP.u, LEP.phi_n[0], LEP.v_u, LEP.g) solve(lhs(SE) == rhs(SE), LEP.u_n, bcs=LEP.bcSE, solver_parameters={"linear_solver": "umfpack", "preconditioner": "default"}, form_compiler_parameters=None) #get u_k and phi_{k-1} if get_new_graph_iter >= 5: # get edges from fenics edge_from = [] edge_to = [] #element = LEP.F.element() dofmap = LEP.F.dofmap() for cell in cells(LEP.mesh): finite_element_node_index = dofmap.cell_dofs(cell.index()) edge_from += list(list(zip(*itertools.permutations(finite_element_node_index[:-1], 2)))[0]) edge_to += list(list(zip(*itertools.permutations(finite_element_node_index[:-1], 2)))[1]) # initialize graph g = dgl.graph((edge_from, edge_to)) # initialize Graph (nodes and edges) g.ndata['value'] = torch.tensor([ [0.5] for i in range(g.number_of_nodes())], dtype=torch.float32) g.ndata['position'] = torch.tensor([ [0.001, 0.001] for i in range(g.number_of_nodes()) ], dtype=torch.float32) g.ndata['u'] = torch.tensor([ [0.001, 0.001] for i in range(g.number_of_nodes()) ], dtype=torch.float32) # get graph data from fenics phi_val = LEP.phi_n.vector()[:-1] ux, uy = LEP.u_n.split(deepcopy=True) ux_val = ux.vector()[:] uy_val = uy.vector()[:] phi_coor = LEP.F.tabulate_dof_coordinates()[:-1] for i in range(len(phi_val)): g.ndata['value'][i] = phi_val[i] g.ndata['u'][i] = torch.tensor([ux_val[i], uy_val[i]], dtype=torch.float32) g.ndata['position'][i] = torch.tensor(phi_coor[i], dtype=torch.float32) dgl.save_graphs('data/' + str(iter)+ '_' +str(IterationStep)+ '_(' + str(LEP.g(0)[0]) + ',' + str(LEP.g(0)[1]) + ')_truncnorm_bridge2D_GRAPH_50_sequence.bin', g) get_new_graph_iter = 0 LEP.p_n.assign(LEP.u_n) # p_n = u_n # solve gradient equation GE = LEP.GE(LEP.phi, LEP.phi_n, LEP.v_phi, LEP.p_n, LEP.u_n, LEP.tau[0], LEP.gamma) solve(lhs(GE) == rhs(GE), LEP.phi_next, bcs=None, solver_parameters={"linear_solver": "umfpack", "preconditioner": "default"}, form_compiler_parameters=None) LEP.project_phi() # optimization adaptions tau_n = LEP.get_tau() taus.append(tau_n) e_n = LEP.get_e_n() es.append(e_n) control = LEP.get_control_change() controls.append(control / tau_n) gammas.append(LEP.gamma(0)) LEP.tau.vector()[:] = tau_adapter.nextTau(e_n)[0] LEP.new_adaptGamma() NewMeshIndicator, new_mesh = mesh_adapter.update(LEP.phi_next, LEP.u_n, controls[-1], LEP.mesh) if NewMeshIndicator: LEP.updateMesh(new_mesh) LEP.phi_last.assign(LEP.phi_n) LEP.phi_n.assign(LEP.phi_next) return
    user_table_path = 's3://xhs.alpha/reddm/' + args.user_table + '/dtm=%s' % args.dsnodash
    user_features = pq.ParquetDataset(user_table_path, filesystem=s3).read().to_pandas()
    device_table_path = 's3://xhs.alpha/reddm/' + args.device_table + '/dtm=%s' % args.dsnodash
    device_features = pq.ParquetDataset(device_table_path, filesystem=s3).read().to_pandas()
    relation_table_path = 's3://xhs.alpha/reddm/' + args.relation_table + '/dtm=%s' % args.dsnodash
    relation_df = pq.ParquetDataset(relation_table_path, filesystem=s3).read().to_pandas()
    label_table_path = 's3://xhs.alpha/reddm/' + args.label_table + '/dtm=%s' % args.dsnodash
    labels = pq.ParquetDataset(label_table_path, filesystem=s3).read().to_pandas()

    # Build graph
    graph_builder = PandasGraphBuilder()
    graph_builder.add_entities(user_features, 'user_entity_id', 'user')
    graph_builder.add_entities(device_features, 'device_entity_id', 'device')
    graph_builder.add_binary_relations(relation_df, 'user_entity_id', 'device_entity_id', 'used')
    graph_builder.add_binary_relations(relation_df, 'device_entity_id', 'user_entity_id', 'used-by')
    g = graph_builder.build()
    dgl.save_graphs('./dataset/dgl_graph', [g])

    # Assign features.
    user_features = user_features.sort_values(by='user_entity_id').values[:, 1:]
    device_features = device_features.sort_values(by='device_entity_id').values[:, 1:]
    labels = labels.values
    np.random.shuffle(labels)
    pos_label_count = np.count_nonzero(labels[:, 1] > 0)
    neg_labels = labels[labels[:, 1] == 0]
    neg_labels = neg_labels[:pos_label_count * args.sample_ratio, :]
    labels = np.vstack((labels[labels[:, 1] > 0], neg_labels))
    np.savez_compressed('./dataset/feat_and_label',
                        user_f=user_features, device_f=device_features, labels=labels)
else:
    g = dgl.load_graphs('./dataset/dgl_graph')[0][0]
    np_ds = np.load('./dataset/feat_and_label.npz')
    user_features, device_features, labels = np_ds['user_f'], np_ds['device_f'], np_ds['labels']
    torch.arange(dataset.num_papers)
]))
g.edata['etype'] = g.edata[dgl.ETYPE].byte()
del g.edata[dgl.ETYPE]
del g.ndata[dgl.NTYPE]
del g.ndata[dgl.NID]

# Process features
full_feat = np.memmap(
    args.full_output_path, mode='w+', dtype='float16',
    shape=(dataset.num_authors + dataset.num_institutions + dataset.num_papers,
           dataset.num_paper_features))
BLOCK_ROWS = 100000
for start in tqdm.trange(0, dataset.num_authors, BLOCK_ROWS):
    end = min(dataset.num_authors, start + BLOCK_ROWS)
    full_feat[author_offset + start:author_offset + end] = author_feat[start:end]
for start in tqdm.trange(0, dataset.num_institutions, BLOCK_ROWS):
    end = min(dataset.num_institutions, start + BLOCK_ROWS)
    full_feat[inst_offset + start:inst_offset + end] = inst_feat[start:end]
for start in tqdm.trange(0, dataset.num_papers, BLOCK_ROWS):
    end = min(dataset.num_papers, start + BLOCK_ROWS)
    full_feat[paper_offset + start:paper_offset + end] = paper_feat[start:end]

# Convert the graph to the given format and save. (The RGAT baseline needs a CSC graph.)
g = g.formats(args.graph_format)
dgl.save_graphs(args.graph_output_path, g)
def load_from_ogbl_with_name(name):
    choices = ['ogbl-collab', 'ogbl-ddi', 'ogbl-ppa', 'ogbl-citation']
    assert name in choices, "name must be selected from " + str(choices)
    dataset = DglLinkPropPredDataset(name)
    return dataset[0]


def load_from_ogbn_with_name(name):
    choices = ['ogbn-products', 'ogbn-proteins', 'ogbn-arxiv', 'ogbn-papers100M']
    assert name in choices, "name must be selected from " + str(choices)
    dataset, label = DglNodePropPredDataset(name)[0]
    return dataset


if __name__ == "__main__":
    """
    load datasets as net.txt format
    """
    parser = argparse.ArgumentParser()
    parser.add_argument('--name', type=str,
                        choices=['ogbl-collab', 'ogbl-ddi', 'ogbl-ppa', 'ogbl-citation',
                                 'ogbn-products', 'ogbn-proteins', 'ogbn-arxiv', 'ogbn-papers100M'],
                        default='ogbl-collab',
                        help="name of datasets by ogb")
    args = parser.parse_args()

    name = args.name
    if name.startswith("ogbl"):
        g = load_from_ogbl_with_name(name=name)
    else:
        g = load_from_ogbn_with_name(name=name)
    dgl.save_graphs(name + "-graph.bin", g)
g.edges['listened'].data['created_at'] = torch.LongTensor(events['created_at'].values)
g.edges['listened-by'].data['created_at'] = torch.LongTensor(events['created_at'].values)

n_edges = g.number_of_edges('listened')
train_indices, val_indices, test_indices = train_test_split_by_time(events, 'created_at', 'user_id')
train_g = build_train_graph(g, train_indices, 'user', 'track', 'listened', 'listened-by')
assert train_g.out_degrees(etype='listened').min() > 0
val_matrix, test_matrix = build_val_test_matrix(g, val_indices, test_indices, 'user', 'track', 'listened')

dgl.save_graphs(os.path.join(out_directory, 'train_g.bin'), train_g)

dataset = {
    'val-matrix': val_matrix,
    'test-matrix': test_matrix,
    'item-texts': {},
    'item-images': None,
    'user-type': 'user',
    'item-type': 'track',
    'user-to-item-type': 'listened',
    'item-to-user-type': 'listened-by',
    'timestamp-edge-column': 'created_at'
}

with open(os.path.join(out_directory, 'data.pkl'), 'wb') as f:
    pickle.dump(dataset, f)
def main(argv): gROOT.SetBatch(True) #parse command line arguments parser = argparse.ArgumentParser(add_help=False) parser.add_argument("-r", "--runnumber", type=str, default=0, dest="runnumber", help="unique identifier for current run") parser.add_argument("-e", "--epochs", type=int, default=20, dest="nepochs", help="number of epochs for training") parser.add_argument("-d", "--data_dir", type=str, required=True, dest="data_dir", help="name of directory where data is stored") parser.add_argument("-o", "--output_dir", type=str, required=True, dest="output_dir", help="name of directory where GNN output is stored") parser.add_argument( "-s", "--dataset", type=str, required=True, dest="infile_name", help="name of dataset to train on (without hdf5 extension)") parser.add_argument( "-n", "--normed", type=int, default=1, dest="use_normed", help="choose whether to use normalized features or not") parser.add_argument( "-m", "--multiclass", type=int, default=0, dest="multi_class", help="choose whether to perform binary of multi-class classification") args = parser.parse_args() runnumber = args.runnumber nepochs = args.nepochs infile_name = args.infile_name infile_path = args.data_dir outfile_path = args.output_dir use_normed = args.use_normed multi_class = args.multi_class #import options from option file learning_rate = options.learning_rate batch_size = options.batch_size attention_heads = options.attention_heads nodemlp_sizes = options.nodemlp_sizes gat_sizes = options.gat_sizes edgemlp_sizes = options.edgemlp_sizes reweight = options.reweight #reweight positive labels in loss to make positives and negatives equally important load_checkpoint = options.load_checkpoint use_lr_scheduler = options.use_lr_scheduler #---------------------------------------------------DATA-IMPORT------------------------------------------------- start_time = time.time() print("Importing input data.") #set relevant filenames if use_normed: ext = ".normed.pruned" else: ext = ".pruned" paramfile_name = infile_path + infile_name + "_params" train_infile_name = infile_path + infile_name + "_train" + ext + ".bin" val_infile_name = infile_path + infile_name + "_val" + ext + ".bin" test_infile_name = infile_path + infile_name + "_test" + ext + ".bin" checkpointfile_name = outfile_path + runnumber + "/" + infile_name + "_" + runnumber + "_model.pt" #calculate number of features in graphs sample_graph = dgl.load_graphs(train_infile_name, [0])[0][0] incl_errors = incl_corr = incl_hits = incl_vweight = False nnfeatures_base = sample_graph.ndata['features_base'].size()[1] in_features = nnfeatures_base if 'features_vweight' in sample_graph.ndata.keys(): nnfeatures_vweight = sample_graph.ndata['features_vweight'].size()[1] incl_vweight = True in_features += nnfeatures_vweight if 'features_errors' in sample_graph.ndata.keys(): nnfeatures_errors = sample_graph.ndata['features_errors'].size()[1] incl_errors = True in_features += nnfeatures_errors if 'features_hits' in sample_graph.ndata.keys(): nnfeatures_hits = sample_graph.ndata['features_hits'].size()[1] incl_hits = True in_features += nnfeatures_hits if 'features_corr' in sample_graph.ndata.keys(): nnfeatures_corr = sample_graph.ndata['features_corr'].size()[1] incl_corr = True in_features += nnfeatures_corr #read in values from parameter file if os.path.isfile(paramfile_name): paramfile = open(paramfile_name, "r") train_len = int(float(paramfile.readline())) val_len = int(float(paramfile.readline())) test_len = int(float(paramfile.readline())) truth_frac = float(paramfile.readline()) 
b_frac = float(paramfile.readline()) c_frac = float(paramfile.readline()) else: print("ERROR: Specified parameter file not found") return 1 p_time = time.time() - start_time print("Finished importing input data. Time elapsed: {}s.\n".format(p_time)) #reweight positive labels automatically if desired if reweight: pos_weight = th.tensor([0.5 * (1 - truth_frac) / truth_frac]) mult_weights = th.tensor( [1. / (1 - b_frac - c_frac), 1. / b_frac, 1. / c_frac]) print("Setting positive weight to {}".format(pos_weight)) else: pos_weight = th.tensor([1]) mult_weights = th.tensor([1., 1., 1.]) #calculate number of testing, training and validation batches test_batches = int(math.ceil(test_len / batch_size)) val_batches = int(math.ceil(val_len / batch_size)) train_batches = int(math.ceil(train_len / batch_size)) device = th.device('cuda' if th.cuda.is_available() else 'cpu') #automatically run on GPU if available #set up loss if not multi_class: loss = nn.BCEWithLogitsLoss(pos_weight=pos_weight, reduction='sum').to(device) outfeats = 1 cm = np.zeros((2, 2), dtype=int) activation = nn.Sigmoid() labeltype = 'bin_labels' else: loss = nn.CrossEntropyLoss(weight=mult_weights).double().to(device) outfeats = 3 cm = np.zeros((3, 3), dtype=int) activation = nn.Softmax(dim=1) labeltype = 'mult_labels' model = EdgePredModel(nodemlp_sizes, gat_sizes, edgemlp_sizes, in_features, outfeats, attention_heads).double().to(device) opt = th.optim.Adam(model.parameters(), lr=learning_rate) if use_lr_scheduler: scheduler = th.optim.lr_scheduler.OneCycleLR( opt, 0.1, epochs=nepochs, steps_per_epoch=train_batches ) #th.optim.lr_scheduler.ReduceLROnPlateau(opt,patience=5) train_loss_array = np.zeros(nepochs) val_loss_array = np.zeros(nepochs) #print model parameters print("Model built. Parameters:") for name, param in model.named_parameters(): print(name, param.size(), param.requires_grad) print("") #load existing checkpoint if load_checkpoint and os.path.exists(checkpointfile_name): checkpoint = th.load(checkpointfile_name) model.load_state_dict(checkpoint['model_state_dict']) opt.load_state_dict(checkpoint['optimizer_state_dict']) start_epoch = checkpoint['epoch'] + 1 print("Loading previous model. Starting from epoch {}.".format( start_epoch)) else: start_epoch = 1 #----------------------------------------------------TRAINING--------------------------------------------------- #main training loop t_time = time.time() - start_time print("Beginning training. Running on {}. 
Time elapsed: {}s.\n".format( device, t_time)) for epoch in range(start_epoch, nepochs + 1): print("Epoch: {}".format(epoch)) #training total_labels = 0 model.train() for ibatch in range(train_batches): #load batch from file istart = ibatch * batch_size if ibatch == (train_batches - 1) and train_len % batch_size != 0: iend = istart + (train_len % batch_size) else: iend = (ibatch + 1) * batch_size batch = dgl.batch( dgl.load_graphs(train_infile_name, list(range(istart, iend)))[0]) #construct feature matrix features = batch.ndata['features_base'] if incl_vweight: features = th.cat((features, batch.ndata['features_vweight']), dim=1) if incl_errors: features = th.cat((features, batch.ndata['features_errors']), dim=1) if incl_hits: features = th.cat((features, batch.ndata['features_hits']), dim=1) if incl_corr: features = th.cat((features, batch.ndata['features_corr']), dim=1) #process batch batch = batch.to(device) #transfer batch to relevant device features = features.to(device) pred = model(batch, features) target = batch.edata[labeltype] if multi_class: target = target[:, 0].long() pred_lt = loss(pred, target) opt.zero_grad() pred_lt.backward() opt.step() #evaluate loss batch_labels = batch.edata['bin_labels'].size()[0] total_labels += batch_labels print("Training loss: {}".format(pred_lt.item() / batch_labels)) train_loss_array[epoch - 1] += pred_lt.item() if use_lr_scheduler: scheduler.step() #normalize loss train_loss_array[epoch - 1] = train_loss_array[epoch - 1] / total_labels #save checkpoint th.save( { 'epoch': epoch, 'model_state_dict': model.state_dict(), 'optimizer_state_dict': opt.state_dict() }, checkpointfile_name) #validation total_labels = 0 model.eval() for ibatch in range(val_batches): #load batch from file istart = ibatch * batch_size if ibatch == (val_batches - 1) and val_len % batch_size != 0: iend = istart + (val_len % batch_size) else: iend = (ibatch + 1) * batch_size val_batch = dgl.batch( dgl.load_graphs(val_infile_name, list(range(istart, iend)))[0]) #construct feature matrix val_features = val_batch.ndata['features_base'] if incl_vweight: val_features = th.cat( (val_features, val_batch.ndata['features_vweight']), dim=1) if incl_errors: val_features = th.cat( (val_features, val_batch.ndata['features_errors']), dim=1) if incl_hits: val_features = th.cat( (val_features, val_batch.ndata['features_hits']), dim=1) if incl_corr: val_features = th.cat( (val_features, val_batch.ndata['features_corr']), dim=1) #process batch val_batch = val_batch.to(device) val_features = val_features.to(device) pred = model(val_batch, val_features) target = val_batch.edata[labeltype] if multi_class: target = target[:, 0].long() pred_lv = loss(pred, target) #evaluate loss batch_labels = val_batch.edata['bin_labels'].size()[0] total_labels += batch_labels print("Validation loss: {}".format(pred_lv.item() / batch_labels)) val_loss_array[epoch - 1] += pred_lv.item() #normalize loss val_loss_array[epoch - 1] = val_loss_array[epoch - 1] / total_labels #print validation results e_time = time.time() - start_time print('Time elapsed: {}s.\n'.format(e_time)) print("Training finished. 
Evaluating model.\n") #---------------------------------------------------EVALUATION-------------------------------------------------- overall_g_list = [] #testing model.eval() for ibatch in range(test_batches): #load batch from file istart = ibatch * batch_size if ibatch == (test_batches - 1) and test_len % batch_size != 0: iend = istart + (test_len % batch_size) else: iend = (ibatch + 1) * batch_size test_batch = dgl.batch( dgl.load_graphs(test_infile_name, list(range(istart, iend)))[0]) #construct feature matrix test_features = test_batch.ndata['features_base'] if incl_vweight: test_features = th.cat( (test_features, test_batch.ndata['features_vweight']), dim=1) if incl_errors: test_features = th.cat( (test_features, test_batch.ndata['features_errors']), dim=1) if incl_hits: test_features = th.cat( (test_features, test_batch.ndata['features_hits']), dim=1) if incl_corr: test_features = th.cat( (test_features, test_batch.ndata['features_corr']), dim=1) #process batch test_batch = test_batch.to(device) test_features = test_features.to(device) edge_labels = test_batch.edata[labeltype] #evaluate results pred = activation(model(test_batch, test_features).float()).cpu().detach().numpy() true = test_batch.edata[labeltype].cpu().numpy().astype(int) test_batch.edata['pred'] = activation(test_batch.edata['pred']) g_test_list = dgl.unbatch(test_batch) overall_g_list.extend(g_test_list) if not multi_class: cm += evaluate_confusion_bin(true, pred.round().astype(int)) else: cm += evaluate_confusion_mult(true, pred.round().astype(int)) #print test results print_output(multi_class, cm) #save results to file outfile_name = outfile_path + runnumber + "/" + infile_name + "_" + runnumber dgl.save_graphs(outfile_name + "_results.bin", overall_g_list) #plot loss plt.ioff() plt.plot(range(nepochs), train_loss_array, label="Training") plt.plot(range(nepochs), val_loss_array, label="Validation") plt.legend() plt.xlabel("Epoch") plt.savefig(outfile_name + "_lossplot.png")
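# The training, validation, and test loops above stream mini-batches straight
# from disk by passing an index list to dgl.load_graphs and batching the
# result. A minimal sketch of that pattern in isolation; the file name and
# sizes below are placeholders, not values from the original run.
import math
import dgl

train_infile_name = "dataset_train.normed.pruned.bin"   # placeholder
train_len = 1000                                         # placeholder
batch_size = 64

train_batches = int(math.ceil(train_len / batch_size))
for ibatch in range(train_batches):
    istart = ibatch * batch_size
    iend = min((ibatch + 1) * batch_size, train_len)
    # load only graphs istart..iend-1 from the saved .bin file, then batch them
    graphs = dgl.load_graphs(train_infile_name, list(range(istart, iend)))[0]
    batch = dgl.batch(graphs)
    # ... forward pass on `batch` ...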
    compact_g1.ndata[dgl.NTYPE] = compact_g.ndata[dgl.NTYPE][reshuffle_nodes]
    compact_g1.ndata[dgl.NID] = compact_g.ndata[dgl.NID][reshuffle_nodes]
    compact_g1.ndata['inner_node'] = compact_g.ndata['inner_node'][reshuffle_nodes]
    compact_g1.edata['orig_id'] = compact_g.edata['orig_id'][compact_g1.edata[dgl.EID]]
    compact_g1.edata[dgl.ETYPE] = compact_g.edata[dgl.ETYPE][compact_g1.edata[dgl.EID]]
    compact_g1.edata['inner_edge'] = compact_g.edata['inner_edge'][compact_g1.edata[dgl.EID]]
    compact_g1.edata[dgl.EID] = compact_g.edata[dgl.EID][compact_g1.edata[dgl.EID]]

    part_dir = output_dir + '/part' + str(part_id)
    os.makedirs(part_dir, exist_ok=True)
    dgl.save_graphs(part_dir + '/graph.dgl', [compact_g1])

part_metadata = {
    'graph_name': graph_name,
    'num_nodes': num_nodes,
    'num_edges': num_edges,
    'part_method': 'metis',
    'num_parts': num_parts,
    'halo_hops': 1,
    'node_map': node_map_val,
    'edge_map': edge_map_val,
    'ntypes': ntypes_map,
    'etypes': etypes_map
}
for part_id in range(num_parts):
    # To eliminate 0-in-degree nodes
    # bg = dgl.add_reverse_edges(g, copy_ndata=True, copy_edata=True)
    # return bg
    return g


if __name__ == "__main__":
    start_time = time()
    list_user_domian = ['U66@DOM1']
    list_authentication_type = [
        '?', 'NTLM', 'Kerberos', 'Negotiate',
        'MICROSOFT_AUTHENTICATION_PACKAGE_V1_0', 'N'
    ]
    list_logon_type = [
        '?', 'Network', 'Batch', 'NetworkCleartext', 'Unlock',
        'RemoteInteractive', 'Interactive', 'Service', 'CachedInteractive',
        'NewCredentials'
    ]
    list_authentication_orientation = [
        'TGS', 'TGT', 'LogOn', 'LogOff', 'AuthMap', 'ScreenLock', 'ScreenUnlock'
    ]
    list_failure_success = ['Fail', 'Success']

    graph = graph_construction()
    dgl.save_graphs('/data/LANL/data_including_all_malhosts/train/auth.bin', [graph])
    print("[+] graph of auth has been saved!")
    end_time = time()
    print("Time used: " + str(end_time - start_time))
def _pre_process(self, smiles_to_graph, node_featurizer, edge_featurizer, load, log_every, init_mask, n_jobs=1): """Pre-process the dataset * Convert molecules from smiles format into DGLGraphs and featurize their atoms * Set missing labels to be 0 and use a binary masking matrix to mask them Parameters ---------- smiles_to_graph : callable, SMILES -> DGLGraph Function for converting a SMILES (str) into a DGLGraph. node_featurizer : callable, rdkit.Chem.rdchem.Mol -> dict Featurization for nodes like atoms in a molecule, which can be used to update ndata for a DGLGraph. edge_featurizer : callable, rdkit.Chem.rdchem.Mol -> dict Featurization for edges like bonds in a molecule, which can be used to update edata for a DGLGraph. load : bool Whether to load the previously pre-processed dataset or pre-process from scratch. ``load`` should be False when we want to try different graph construction and featurization methods and need to preprocess from scratch. Default to True. log_every : bool Print a message every time ``log_every`` molecules are processed. It only comes into effect when :attr:`n_jobs` is greater than 1. init_mask : bool Whether to initialize a binary mask indicating the existence of labels. n_jobs : int Degree of parallelism for pre processing. Default to 1. """ if os.path.exists(self.cache_file_path) and load: # DGLGraphs have been constructed before, reload them print('Loading previously saved dgl graphs...') self.graphs, label_dict = load_graphs(self.cache_file_path) self.labels = label_dict['labels'] if init_mask: self.mask = label_dict['mask'] self.valid_ids = label_dict['valid_ids'].tolist() else: print('Processing dgl graphs from scratch...') if n_jobs > 1: self.graphs = pmap(smiles_to_graph, self.smiles, node_featurizer=node_featurizer, edge_featurizer=edge_featurizer, n_jobs=n_jobs) else: self.graphs = [] for i, s in enumerate(self.smiles): if (i + 1) % log_every == 0: print('Processing molecule {:d}/{:d}'.format( i + 1, len(self))) self.graphs.append( smiles_to_graph(s, node_featurizer=node_featurizer, edge_featurizer=edge_featurizer)) # Keep only valid molecules self.valid_ids = [] graphs = [] for i, g in enumerate(self.graphs): if g is not None: self.valid_ids.append(i) graphs.append(g) self.graphs = graphs _label_values = self.df[self.task_names].values # np.nan_to_num will also turn inf into a very large number self.labels = F.zerocopy_from_numpy( np.nan_to_num(_label_values).astype( np.float32))[self.valid_ids] valid_ids = torch.tensor(self.valid_ids) if init_mask: self.mask = F.zerocopy_from_numpy( (~np.isnan(_label_values)).astype( np.float32))[self.valid_ids] save_graphs(self.cache_file_path, self.graphs, labels={ 'labels': self.labels, 'mask': self.mask, 'valid_ids': valid_ids }) else: self.mask = None save_graphs(self.cache_file_path, self.graphs, labels={ 'labels': self.labels, 'valid_ids': valid_ids }) self.smiles = [self.smiles[i] for i in self.valid_ids]
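# The labels/mask construction in _pre_process() turns missing (NaN) task
# labels into zeros and records their positions in a binary mask. A minimal
# standalone sketch of that scheme with made-up label values; the original uses
# the DGL backend's zerocopy_from_numpy, which torch.from_numpy stands in for
# here.
import numpy as np
import torch

_label_values = np.array([[0.7, np.nan],
                          [np.nan, 1.2]])
labels = torch.from_numpy(np.nan_to_num(_label_values).astype(np.float32))
mask = torch.from_numpy((~np.isnan(_label_values)).astype(np.float32))
# labels -> [[0.7, 0.0], [0.0, 1.2]]; mask -> [[1, 0], [0, 1]]
# A per-element loss can then be reduced as (loss * mask).sum() / mask.sum()
# so that the zero-filled entries do not contribute to training.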
def save(self):
    """save the graph list and the labels"""
    graph_path = os.path.join(
        self.data_save_path,
        'dgl_graph_{}_{}.bin'.format(self.hash, self.sub))
    save_graphs(str(graph_path), self.graphs, {'labels': self.labels})
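# A save() like the one above is usually paired with has_cache() and load()
# methods on the same dataset class. A minimal sketch of what those
# counterparts could look like for this cache layout; this is an assumption,
# not code from the original class.
import os
from dgl.data.utils import load_graphs

def has_cache(self):
    graph_path = os.path.join(
        self.data_save_path,
        'dgl_graph_{}_{}.bin'.format(self.hash, self.sub))
    return os.path.exists(graph_path)

def load(self):
    graph_path = os.path.join(
        self.data_save_path,
        'dgl_graph_{}_{}.bin'.format(self.hash, self.sub))
    self.graphs, label_dict = load_graphs(graph_path)
    self.labels = label_dict['labels']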
def save(self, data_dir):
    # save graphs
    graph_path = os.path.join(data_dir, 'kgat_dgl_graph.bin')
    save_graphs(graph_path, self.train_graph)
def create_dataset(args, processor, retrievers, relation_list, evaluate, input_dir):
    if args.local_rank not in [-1, 0]:
        # Make sure only the first process in distributed training processes the dataset;
        # the others will use the cache.
        torch.distributed.barrier()

    definition_info = DefinitionInfo()
    tokenizer, _ = configure_tokenizer_model(args, logger, retrievers, is_preprocess=True)
    logger.info("tokenizer: {}".format(tokenizer))

    if args.test:
        temp_mark = "test"
    elif evaluate:
        temp_mark = "dev"
    else:
        temp_mark = "train"
    cached_features_file = os.path.join(
        input_dir,
        "cached_{}_{}_{}".format(temp_mark, args.model_type, str(args.cache_file_suffix)),
    )
    if os.path.exists(cached_features_file):
        logger.warning("cache file exists; exiting program")
        exit()

    logger.info("Creating features from dataset file at %s", input_dir)

    if not os.path.exists(cached_features_file + "_example"):
        if args.test:
            examples = processor.get_test_examples(args.data_dir, filename=args.predict_file)
        else:
            examples = processor.get_dev_examples(args.data_dir, filename=args.predict_file)
        torch.save(examples, cached_features_file + "_example")
    else:
        logger.info("Loading examples from cached files.")
        examples = torch.load(cached_features_file + "_example")

    examples_tokenized = processor.tokenization_on_examples(examples, tokenizer, is_testing=args.test)

    features = processor.convert_examples_to_features(args, examples_tokenized, tokenizer, retrievers,
                                                      not evaluate, debug=args.debug)

    features, dataset, all_kgs_graphs = processor.pad_and_index_features_all(
        features, retrievers, args, tokenizer, relation_list,
        encoder=None, definition_info=definition_info,
        is_training=not evaluate, debug=args.debug)

    if args.local_rank in [-1, 0]:
        if args.model_type == "kelm":
            all_kgs_graphs_label_dict = {"glabel": torch.tensor([i for i in range(len(all_kgs_graphs))])}
            save_graphs(cached_features_file + "_all_kgs_graphs.bin", all_kgs_graphs, all_kgs_graphs_label_dict)
        logger.info("completed data preprocessing")

        logger.info("Saving features into cached file %s", cached_features_file)
        for f in features:
            del f.kgs_conceptids2synset
        torch.save({"features": features, "dataset": dataset, "examples": examples_tokenized},
                   cached_features_file)

        logger.info("Saving knowledge graph retrievers")
        for kg, retriever in retrievers.items():
            if not os.path.exists(os.path.join(input_dir, args.kg_paths[kg])):
                os.mkdir(os.path.join(input_dir, args.kg_paths[kg]))
            torch.save(retriever, os.path.join(input_dir, args.kg_paths[kg], kg + args.cache_file_suffix))

        logger.info("data creation is done")

    if args.local_rank == 0:
        # Make sure only the first process in distributed training processes the dataset;
        # the others will use the cache.
        torch.distributed.barrier()
newg.edges()

######################################################################
# Loading and Saving Graphs
# -------------------------
#
# You can save a graph or a list of graphs via ``dgl.save_graphs`` and
# load them back with ``dgl.load_graphs``.
#

# Save graphs
print("-----------------------------------------------------------------------------------")
print("Step 5: Loading and Saving Graphs: ")
dgl.save_graphs('graph.dgl', g)
dgl.save_graphs('graphs.dgl', [g, sg1, sg2])

# Load graphs
(g,), _ = dgl.load_graphs('graph.dgl')
print("graph g: ")
print(g)  # each graph contains nodes and their features, edges and their features
(g, sg1, sg2), _ = dgl.load_graphs('graphs.dgl')
print("graph g: ")
print(g)
print("graph sg1: ")
print(sg1)
print("graph sg2: ")
print(sg2)
def main(argv): gROOT.SetBatch(True) #parse command line arguments parser = argparse.ArgumentParser(add_help=False) parser.add_argument("-n", "--ntuple", type=str, required=True, dest="ntuple", help="name of HDF5 file to be processed") parser.add_argument("-d", "--dataset", type=str, required=True, dest="dataset", help="name of dataset to be created") parser.add_argument("-i", "--input_dir", type=str, required=True, dest="infile_dir", help="name of input directory") parser.add_argument("-o", "--output_dir", type=str, required=True, dest="outfile_dir", help="name of output directory") parser.add_argument("-e", "--max_graphs", type=int, default=0, dest="max_graphs", help="maximum number of graphs to create") args = parser.parse_args() max_graphs = args.max_graphs #input data parameters connect_btoc = options.connect_btoc incl_errors = options.incl_errors incl_corr = options.incl_corr incl_hits = options.incl_hits incl_vweight = options.incl_vweight jet_pt_cut = options.jet_pt_cut jet_eta_cut = options.jet_eta_cut track_pt_cut = options.track_pt_cut track_eta_cut = options.track_eta_cut track_z0_cut = options.track_z0_cut vweight_pileup_cut = options.vweight_pileup_cut vweight_pv_cut = options.vweight_pv_cut nnfeatures_base = 8 nnfeatures_errors = 5 nnfeatures_corrs = 10 nnfeatures_hits = 10 #file names infiles = glob.glob(args.infile_dir + args.ntuple + "_*.hdf5") infiles.sort() outfiles = [] for infile in infiles: infile_name = os.path.splitext(os.path.basename(infile))[0] outfiles.append(args.outfile_dir + "/" + infile_name + ".bin") start_time = time.time() print( "--------------------------------------------------------------------") #check if track to vertex association information is contained in HDF5 file infile = h5py.File(infiles[0], "r") if 'tfeatures_w' in infile.keys(): ttv_avail = True else: ttv_avail = False print("WARNING: Track to vertex association information not found") total_jets = 0 for infile_name in infiles: infile = h5py.File(infile_name, "r") total_jets += len(infile['jinfo']['event_no']) infile.close() if max_graphs == 0 or max_graphs > total_jets: max_graphs = total_jets print("Total number of jets in dataset: {}".format(total_jets)) print("Maximum number of jets desired: {}".format(max_graphs)) passed_graphs = cut_graphs = jet_req_cuts = track_req_cuts = tracks_kept = tracks_cut = 0 #loop through input files for ifile, infile_name in enumerate(infiles): if passed_graphs >= max_graphs: break #stop reading in new files if maximum desired jet number is reached #check if outfile already exists and skip if it's newer than infile unless it's the last prevously processed file (in case more entries are being added) if ifile != len(outfiles) and os.path.exists( outfiles[ifile + 1]) and os.path.exists( outfiles[ifile]) and os.path.getmtime( outfiles[ifile]) > os.path.getmtime(infile_name): print("Current version of " + os.path.basename(outfiles[ifile]) + " already exists. 
Skipping file.") continue infile = h5py.File(infile_name, "r") file_jets = len(infile['jinfo']['event_no']) g_list = [] track_offset = 0 #tracks are stored in continuous chunk -> need to offset indices for each jet event_index = previous_event = -1 for ientry in range(file_jets): #read in event/jet information current_event = infile['jinfo']['event_no'][ientry] if current_event != previous_event: event_index += 1 previous_event = current_event if passed_graphs >= max_graphs: break #stop processing events once specified maximum jet number has been read in current_jet = infile['jinfo']['jet_no'][ientry] ntracks = infile['jinfo']['ntracks'][ientry] pv_x = infile['efeatures']['pv_x'][event_index] pv_y = infile['efeatures']['pv_y'][event_index] pv_z = infile['efeatures']['pv_z'][event_index] jet_pt = infile['jfeatures']['pt'][ientry] jet_eta = infile['jfeatures']['eta'][ientry] jet_phi = infile['jfeatures']['phi'][ientry] nedges = ntracks * (ntracks - 1) #apply jet cuts if jet_pt > jet_pt_cut and abs(jet_eta) < jet_eta_cut: #make jet flavor label definitions consistent jet_flavor = infile['jinfo']['jet_flavor'][ientry] if jet_flavor == 5: #b-jet jet_flavor = 1 elif jet_flavor == 4: #c-jet jet_flavor = 2 elif jet_flavor == 15: #tau-jets jet_flavor = 0 else: jet_flavor = 0 node_features_base = np.zeros((ntracks, nnfeatures_base)) if incl_corr: node_features_corrs = np.zeros((ntracks, nnfeatures_corrs)) if incl_errors: node_features_errors = np.zeros( (ntracks, nnfeatures_errors)) if incl_hits: node_features_hits = np.zeros((ntracks, nnfeatures_hits)) if incl_vweight: node_features_vweight = np.zeros((ntracks, 1)) jet_info = np.zeros( (ntracks, 4) ) #store jet info - jet truth label (0 = l, 1 = b, 2 = c), jet pv coordinates track_info = np.zeros( (ntracks, 4) ) #store track general info - track label (see process_ntuples), track sv coordinates track_ancestors = np.zeros( (ntracks, 4)) #store track ancestor info #edge_features = np.zeros((nedges,nefeatures)) #initialize track feature arrays hf_ancestors = np.zeros((ntracks, 1)) prev_b_ancestors = np.zeros((ntracks, 1)) track_flavors = np.zeros((ntracks, 1)) reco_use = np.zeros((ntracks, 2)) #use of track in SV0, SV1 passed_cuts = np.zeros((ntracks, 1)) bin_labels = np.zeros((nedges, 1)) mult_labels = np.zeros((nedges, 1)) #read in features for each track for j in range(ntracks): track_pt = infile['tfeatures_b']['pt'][track_offset + j] track_eta = infile['tfeatures_b']['eta'][track_offset + j] track_theta = infile['tfeatures_b']['theta'][track_offset + j] track_phi = infile['tfeatures_b']['phi'][track_offset + j] track_d0 = infile['tfeatures_b']['d0'][track_offset + j] track_z0 = infile['tfeatures_b']['z0'][track_offset + j] track_q = infile['tfeatures_b']['q'][track_offset + j] if ttv_avail: track_vweight = infile['tfeatures_w']['vweight'][ track_offset + j] track_vtype = infile['tinfo']['vertex_type'][ track_offset + j] if incl_errors: track_cov_d0d0 = math.sqrt( infile['tfeatures_e']['cov_d0d0'][track_offset + j]) track_cov_z0z0 = math.sqrt( infile['tfeatures_e']['cov_z0z0'][track_offset + j]) track_cov_phiphi = math.sqrt( infile['tfeatures_e']['cov_phiphi'][track_offset + j]) track_cov_thetatheta = math.sqrt( infile['tfeatures_e']['cov_thetatheta'][ track_offset + j]) track_cov_qoverpqoverp = math.sqrt( abs(infile['tfeatures_e']['cov_qoverpqoverp'][ track_offset + j])) if incl_corr: track_cov_d0z0 = infile['tfeatures_c']['cov_d0z0'][ track_offset + j] track_cov_d0phi = infile['tfeatures_c']['cov_d0phi'][ track_offset + j] track_cov_d0theta = 
infile['tfeatures_c'][ 'cov_d0theta'][track_offset + j] track_cov_d0qoverp = infile['tfeatures_c'][ 'cov_d0qoverp'][track_offset + j] track_cov_z0phi = infile['tfeatures_c']['cov_z0phi'][ track_offset + j] track_cov_z0theta = infile['tfeatures_c'][ 'cov_z0theta'][track_offset + j] track_cov_z0qoverp = infile['tfeatures_c'][ 'cov_z0qoverp'][track_offset + j] track_cov_phitheta = infile['tfeatures_c'][ 'cov_phitheta'][track_offset + j] track_cov_phiqoverp = infile['tfeatures_c'][ 'cov_phiqoverp'][track_offset + j] track_cov_thetaqoverp = infile['tfeatures_c'][ 'cov_thetaqoverp'][track_offset + j] if incl_hits: track_nPixHits = infile['tfeatures_h']['nPixHits'][ track_offset + j] track_nSCTHits = infile['tfeatures_h']['nSCTHits'][ track_offset + j] track_nBLHits = infile['tfeatures_h']['nBLHits'][ track_offset + j] track_nPixHoles = infile['tfeatures_h']['nPixHoles'][ track_offset + j] track_nSCTHoles = infile['tfeatures_h']['nSCTHoles'][ track_offset + j] track_nPixShared = infile['tfeatures_h']['nPixShared'][ track_offset + j] track_nSCTShared = infile['tfeatures_h']['nSCTShared'][ track_offset + j] track_nBLShared = infile['tfeatures_h']['nBLShared'][ track_offset + j] track_nPixSplit = infile['tfeatures_h']['nPixSplit'][ track_offset + j] track_nBLSplit = infile['tfeatures_h']['nBLSplit'][ track_offset + j] track_algo = infile['tinfo']['algo'][track_offset + j] reco_use[j] = [(track_algo & 1 << 2) / 4, (track_algo & 1 << 3) / 8] hf_ancestors[j] = infile['tinfo']['hf_ancestor'][ track_offset + j] hf_pdgid = infile['tinfo']['hf_pdgid'][track_offset + j] prev_b_ancestors[j] = infile['tinfo']['prev_b_ancestor'][ track_offset + j] prev_b_pdgid = infile['tinfo']['prev_b_pdgid'][track_offset + j] track_flavors[j] = infile['tinfo']['track_flavor'][ track_offset + j] sv_x = infile['tinfo']['sv_x'][track_offset + j] sv_y = infile['tinfo']['sv_y'][track_offset + j] sv_z = infile['tinfo']['sv_z'][track_offset + j] #make cuts on track level if ttv_avail: vertex_condition = ( track_vweight < vweight_pv_cut and track_vtype == 1) or (track_vweight < vweight_pileup_cut and track_vtype == 2) else: vertex_condition = True if track_pt > track_pt_cut and abs( track_eta) < track_eta_cut and abs( track_z0) < track_z0_cut and vertex_condition: passed_cuts[j] = 1 else: passed_cuts[j] = 0 #store information in feature arrays node_features_base[j] = [ track_q / track_pt, track_theta, track_phi, track_d0, track_z0, jet_pt, jet_eta, jet_phi ] if incl_vweight: node_features_vweight[j] = [track_vweight] if incl_errors: node_features_errors[j] = [ track_cov_qoverpqoverp, track_cov_thetatheta, track_cov_phiphi, track_cov_d0d0, track_cov_z0z0 ] if incl_corr: node_features_corrs[j] = [ track_cov_thetaqoverp, track_cov_phiqoverp, track_cov_d0qoverp, track_cov_z0qoverp, track_cov_phitheta, track_cov_d0theta, track_cov_z0theta, track_cov_d0phi, track_cov_z0phi, track_cov_d0z0 ] if incl_hits: node_features_hits[j] = [ track_nPixHits, track_nSCTHits, track_nBLHits, track_nPixHoles, track_nSCTHoles, track_nPixShared, track_nSCTShared, track_nBLShared, track_nPixSplit, track_nBLSplit ] track_ancestors[j] = [ hf_ancestors[j], hf_pdgid, prev_b_ancestors[j], prev_b_pdgid ] track_info[j] = [track_flavors[j], sv_x, sv_y, sv_z] jet_info[j] = [jet_flavor, pv_x, pv_y, pv_z] #calculate edge features and truth labels counter = 0 for j in range(ntracks): for k in range(j + 1, ntracks): #set edge features #delta_pt = abs(node_features_base[j][0] - node_features_base[k][0]) #edge_features[counter:counter+2] = [delta_pt] #truth labels - 
vertices have to share the same HF ancestor if hf_ancestors[k] == hf_ancestors[ j] and hf_ancestors[k] > 0 and track_flavors[ j] == 1 and track_flavors[ k] == 1: #matching direct ancestors for non secondaries (B to B) bin_labels[counter:counter + 2] = 1 mult_labels[counter:counter + 2] = 1 elif hf_ancestors[k] == hf_ancestors[ j] and hf_ancestors[k] > 0 and track_flavors[ j] == 2 and track_flavors[ k] == 2: #matching direct ancestors for non secondaries (prompt C to prompt C) bin_labels[counter:counter + 2] = 1 mult_labels[counter:counter + 2] = 2 elif hf_ancestors[k] == hf_ancestors[ j] and hf_ancestors[k] > 0 and track_flavors[ j] == 3 and track_flavors[ k] == 3: #matching direct ancestors for non secondaries (B->C to B->C for same C) bin_labels[counter:counter + 2] = 1 mult_labels[counter:counter + 2] = 1 elif prev_b_ancestors[k] == prev_b_ancestors[ j] and prev_b_ancestors[ k] > 0: #matching second ancestors (B->C to B->C for different C) bin_labels[counter:counter + 2] = connect_btoc mult_labels[counter:counter + 2] = connect_btoc elif ( prev_b_ancestors[k] == hf_ancestors[j] and hf_ancestors[j] > 0 ) or ( prev_b_ancestors[j] == hf_ancestors[k] and hf_ancestors[k] > 0 ): #matching second ancestor and direct ancestor (B to B->C) bin_labels[counter:counter + 2] = connect_btoc mult_labels[counter:counter + 2] = connect_btoc counter += 2 #create graph objects and append them to the list if np.sum(passed_cuts) > 1: g = dgl.graph((create_edge_list(ntracks))) g.ndata['features_base'] = th.from_numpy( node_features_base) if incl_vweight: g.ndata['features_vweight'] = th.from_numpy( node_features_vweight) if incl_errors: g.ndata['features_errors'] = th.from_numpy( node_features_errors) if incl_hits: g.ndata['features_hits'] = th.from_numpy( node_features_hits) if incl_corr: g.ndata['features_corr'] = th.from_numpy( node_features_corrs) g.ndata['jet_info'] = th.from_numpy(jet_info) g.ndata['track_info'] = th.from_numpy(track_info) g.ndata['track_ancestors'] = th.from_numpy(track_ancestors) g.ndata['reco_use'] = th.from_numpy(reco_use) g.ndata['passed_cuts'] = th.from_numpy(passed_cuts) g.edata['bin_labels'] = th.from_numpy(bin_labels) g.edata['mult_labels'] = th.from_numpy(mult_labels) g_list.append(g) tracks_kept += np.sum(passed_cuts == 1) tracks_cut += np.sum(passed_cuts == 0) passed_graphs += 1 else: track_req_cuts += 1 cut_graphs += 1 else: jet_req_cuts += 1 cut_graphs += 1 track_offset += ntracks #output progress sys.stdout.write( "\rJets processed: {} (Passed: {}, Cut: {}); Files processed: {}/{}" .format(cut_graphs + passed_graphs, passed_graphs, cut_graphs, ifile, len(infiles))) sys.stdout.flush() #save graphs to file dgl.save_graphs(outfiles[ifile], g_list) print( "Found enough good jets to reach desired sample size. Finishing up...") print( "--------------------------------------------------------------------") p_time = time.time() - start_time print("\nGraphs cut due to jet requirements: {}".format(jet_req_cuts)) print("Graphs cut due to track requirements: {}".format(track_req_cuts)) print("Fraction of tracks cut from passed jets: {}".format( tracks_cut / (tracks_cut + tracks_kept))) print("Finished creating graphs. Time elapsed: {}s.".format(p_time))
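# The graph-creation script above calls create_edge_list(), which is not
# defined in this excerpt. Below is a minimal sketch consistent with
# nedges = ntracks * (ntracks - 1) and with the edge labels being written in
# pairs (edge j->k immediately followed by k->j); this is an assumption about
# the original helper, not its actual code.
def create_edge_list(ntracks):
    """Return (senders, receivers) for a fully connected digraph without self-loops."""
    senders = []
    receivers = []
    for j in range(ntracks):
        for k in range(j + 1, ntracks):
            # add both directions back to back so label pairs line up
            senders.extend([j, k])
            receivers.extend([k, j])
    return senders, receivers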
def _save_graph(self):
    data_dir = os.path.join(args.preprocessed_data_dir, args.city_list[self.city_id])
    dgl.save_graphs(os.path.join(data_dir, "city_graph"), [self.graph])
def save_building_block_data(building_block_smis, building_block_molgraphs):
    with open(f"{PROCESSED_DATA_DIR}/building_block_smis.pt", "wb") as f:
        torch.save(building_block_smis, f)
    dgl.save_graphs(f"{PROCESSED_DATA_DIR}/building_block_molgraphs.pt", building_block_molgraphs)