def main(args):
    """Evaluate a saved EdgeNet checkpoint on the test slice of the dataset.

    Expects ``args.model`` to point at a ``.tar`` checkpoint; reads the data
    root from the GNN_TRAINING_DATA_ROOT environment variable.  Relies on the
    module-level ``batch_size``, ``sig_weight``, ``bkg_weight``, ``device``
    and ``hidden_dim`` settings.
    """
    directed = False
    path = osp.join(os.environ['GNN_TRAINING_DATA_ROOT'], 'muon_graph_v4_small')

    full_dataset = HitGraphDataset(path, directed=directed)
    fulllen = len(full_dataset)

    # 20% validation + 20% test; cumulative sums give the slice boundaries.
    tv_frac = 0.2
    tv_num = math.ceil(fulllen * tv_frac)
    splits = np.cumsum([fulllen - 2 * tv_num, tv_num, tv_num])

    # Reload the dataset and keep only the final (test) slice.
    test_dataset = HitGraphDataset(path, directed=directed)[splits[1]:splits[2]]
    test_loader = DataLoader(test_dataset, batch_size=batch_size, shuffle=False)
    test_samples = len(test_dataset)
    print('Testing with %s samples' % test_samples)

    num_features = full_dataset.num_features
    first = full_dataset[0]
    num_classes = first.y.max().item() + 1 if first.y.dim() == 1 else first.y.size(1)

    # Output directory is the parent of the checkpoint's directory.
    checkpoint_dir = os.path.abspath(os.path.join(os.path.dirname(args.model), '..'))
    tester = GNNTrainer(real_weight=sig_weight, fake_weight=bkg_weight,
                        device=device, output_dir=checkpoint_dir)
    tester.load_model('EdgeNet', input_dim=num_features, hidden_dim=hidden_dim)
    n_params = sum(p.numel() for p in tester.model.parameters())
    print('Model: \n%s\nParameters: %i' % (tester.model, n_params))
    tester.model.load_state_dict(torch.load(args.model)['model'])

    y, pred, events = tester.predict(test_loader)
    print(y[0], pred[0], events[0])

    # plotting:
    # make_test_plots(y,pred,0.5,osp.join(tester.output_dir,'lastmodel.pdf'))

    # Reload individual graphs to inspect the hits behind specific edges.
    test = test_dataset.get(8842)
    print(test_dataset.get(7075))
def test_accuracy(theta_learn):
    """Estimate classification accuracy of the TTN edge circuit.

    Runs the forward pass over the edges of a single validation graph and
    scores each output as a soft probability: for a fake edge (label 0) the
    credit is ``1 - out``, for a true edge it is ``out``.

    This function only tests the accuracy over a very limited set of data
    due to time constraints.
    TODO: Need to test properly.
    """
    dataset = HitGraphDataset(val_input_dir, 1)
    X, Ro, Ri, y = dataset[0]

    # Gather the hit features at both ends of every edge.
    outgoing = np.dot(Ro.T, X)
    incoming = np.dot(Ri.T, X)
    B = map2angle(np.concatenate((outgoing, incoming), axis=1))

    n_edges = len(B[:, 0])
    total = 0
    for idx in range(n_edges):
        prediction = TTN_edge_forward(B[idx], theta_learn)
        #print(str(idx) + ': Result: ' + str(prediction) + ' Expected: ' + str(y[idx]))
        if y[idx] == 0:
            total = total + 1 - prediction
        else:
            total = total + prediction

    acc = 100.0 * total / n_edges
    print('Total Accuracy: ' + str(acc) + ' %')
    print('Theta_learn: ' + str(theta_learn))
    return acc
# Adjust axes ax0.set_xlabel('$\phi$') ax1.set_xlabel('$\phi$') ax0.set_ylabel('$r$') ax1.set_ylabel('$r$') plt.tight_layout() #plt.show() plt.savefig('png/QEN_output_RvsPhi.png') ############################################################################################ input_dir = '/home/cenktuysuz/MyRepos/HepTrkX-quantum/data/hitgraphs' theta_learn = [ 5.81938258, 0.65791055, 3.50325001, 5.99779941, 2.18404964, 0.03780523, 0.12155696, 3.44766096, 5.7402678, 4.45497403, 2.91924544 ] data = HitGraphDataset(input_dir, 1) X, Ro, Ri, y = data[0] n_edges = len(y) out = np.zeros(n_edges) bo = np.dot(Ro.T, X) bi = np.dot(Ri.T, X) B = np.concatenate((bo, bi), axis=1) B = normalize(B) epoch = n_edges for i in range(epoch): out[i] = round(TTN_edge_forward(B[i], theta_learn)) # Plot the results draw_sample(X, Ri, Ro, y, out)
#client = Client(processes=False, threads_per_worker=1, n_workers=8, memory_limit='2GB')
#client
if __name__ == '__main__':
    # Training driver: iterate over hit-graph files, updating the 11 circuit
    # angles with train() and periodically measuring accuracy.
    theta_learn = np.random.rand(11) * np.pi * 2
    n_files = 16 * 10
    testEVERY = 1
    accuracy = np.zeros(round(n_files / testEVERY) + 1)
    loss_log = np.zeros(n_files)
    theta_log = np.zeros((n_files, 11))
    #accuracy[0] = test_accuracy(theta_learn)
    print('Training is starting!')
    for n_file in range(n_files):
        # NOTE(review): the dataset is re-instantiated every iteration with
        # n_files as its second argument — presumably the number of files to
        # index; verify against HitGraphDataset's constructor.
        data = HitGraphDataset(input_dir, n_files)
        X, Ro, Ri, y = data[n_file]
        # Build per-edge features from the hits at each end of every edge.
        bo = np.dot(Ro.T, X)
        bi = np.dot(Ri.T, X)
        B = np.concatenate((bo, bi), axis=1)
        B = map2angle(B)
        theta_learn, loss_log[n_file] = train(B, theta_learn, y)
        theta_log[n_file, :] = theta_learn
        # BUG FIX: the original left a dangling ''' after its opener had been
        # commented out with "# '''", which started an unterminated string
        # literal.  The periodic accuracy check below is the re-enabled code.
        if (n_file + 1) % testEVERY == 0:
            accuracy[n_file + 1] = test_accuracy(theta_learn)
            print('Accuracy: ' + str(accuracy[n_file + 1]))
def main(args):
    """Train an EdgeNet GNN on the 'single_mu' hit-graph dataset.

    Reads the data root from GNN_TRAINING_DATA_ROOT, splits off a 10%%
    validation set (no test set), and trains via GNNTrainer.  Relies on the
    module-level ``directed``, ``batch_size``, ``sig_weight``, ``bkg_weight``,
    ``device``, ``hidden_dim``, ``n_iters`` and ``n_epochs`` settings.
    """
    path = osp.join(os.environ['GNN_TRAINING_DATA_ROOT'], 'single_mu')
    print(path)
    full_dataset = HitGraphDataset(path, directed=directed)
    fulllen = len(full_dataset)
    tv_frac = 0.10
    tv_num = math.ceil(fulllen * tv_frac)
    # No test split: everything but the validation fraction is training data.
    splits = np.cumsum([fulllen - tv_num, 0, tv_num])
    print(fulllen, splits)

    train_dataset = torch.utils.data.Subset(full_dataset, np.arange(start=0, stop=splits[0]))
    valid_dataset = torch.utils.data.Subset(full_dataset, np.arange(start=splits[1], stop=splits[2]))
    train_loader = DataLoader(train_dataset, batch_size=batch_size, pin_memory=True)
    valid_loader = DataLoader(valid_dataset, batch_size=batch_size, shuffle=False)
    train_samples = len(train_dataset)
    valid_samples = len(valid_dataset)

    d = full_dataset
    num_features = d.num_features
    num_classes = d[0].y.max().item() + 1 if d[0].y.dim() == 1 else d[0].y.size(1)

    trainer = GNNTrainer(real_weight=sig_weight, fake_weight=bkg_weight,
                         output_dir='/home/lagray/hgcal_ldrd/', device=device)
    # Mirror trainer logging to stderr at DEBUG level.
    trainer.logger.setLevel(logging.DEBUG)
    strmH = logging.StreamHandler()
    formatter = logging.Formatter('%(asctime)s - %(name)s - %(levelname)s - %(message)s')
    strmH.setFormatter(formatter)
    trainer.logger.addHandler(strmH)

    #example lr scheduling definition
    def lr_scaling(optimizer):
        from torch.optim.lr_scheduler import LambdaLR
        lr_type = 'linear'
        lr_warmup_epochs = 0
        warmup_factor = 0.
        # BUG FIX: the original compared the *function* `lr_scaling` to the
        # string 'linear' (always False); the intended check is on `lr_type`,
        # which is assigned just above and was otherwise unused.
        if lr_type == 'linear':
            warmup_factor = 1.

        # LR ramp warmup schedule
        def lr_warmup(epoch, warmup_factor=warmup_factor,
                      warmup_epochs=lr_warmup_epochs):
            if epoch < warmup_epochs:
                return (1. - warmup_factor) * epoch / warmup_epochs + warmup_factor
            else:
                return 1.

        # give the LR schedule to the trainer
        return LambdaLR(optimizer, lr_warmup)

    trainer.build_model(name='EdgeNet', loss_func='binary_cross_entropy',
                        optimizer='Adam', learning_rate=0.01, lr_scaling=lr_scaling,
                        input_dim=num_features, hidden_dim=hidden_dim, n_iters=n_iters)

    print('made the hep.trkx trainer!')
    train_summary = trainer.train(train_loader, n_epochs, valid_data_loader=valid_loader)
    print(train_summary)
def main(args):
    """Train an EdgeNet GNN on the 'muon_graph_v4' hit-graph dataset.

    Splits the data 60/20/20 (train/valid/test), trains via GNNTrainer, then
    produces test plots for both the final-epoch model and the best
    checkpoint.  Relies on the module-level ``directed``, ``fulldata``,
    ``batch_size``, ``sig_weight``, ``bkg_weight``, ``device``, ``lr``,
    ``hidden_dim``, ``n_iters`` and ``n_epochs`` settings.
    """
    # path = osp.join(os.environ['GNN_TRAINING_DATA_ROOT'], 'single_mu_v0')
    path = osp.join(os.environ['GNN_TRAINING_DATA_ROOT'], 'muon_graph_v4')
    full_dataset = HitGraphDataset(path, directed=directed)
    # Cap at 1000 graphs unless the full dataset was requested.
    fulllen = 1000
    if fulldata:
        fulllen = len(full_dataset)

    # splitting datasets
    tv_frac = 0.2
    tv_num = math.ceil(int(fulllen) * tv_frac)
    splits = np.cumsum([fulllen - 2 * tv_num, tv_num, tv_num])
    print("train, validation, testing splitting : ", fulllen, splits)

    train_dataset = HitGraphDataset(path, directed=directed)[0:splits[0]]
    valid_dataset = HitGraphDataset(path, directed=directed)[splits[0]:splits[1]]
    test_dataset = HitGraphDataset(path, directed=directed)[splits[1]:splits[2]]
    train_loader = DataLoader(train_dataset, batch_size=batch_size, pin_memory=True)
    valid_loader = DataLoader(valid_dataset, batch_size=batch_size, shuffle=False)
    test_loader = DataLoader(test_dataset, batch_size=batch_size, shuffle=False)
    train_samples = len(train_dataset)
    valid_samples = len(valid_dataset)
    test_samples = len(test_dataset)
    print("Number of training samples : ", train_samples)
    print("Number of validation samples : ", valid_samples)
    print("Number of testing samples : ", test_samples)

    d = full_dataset
    num_features = d.num_features
    num_classes = d[0].y.max().item() + 1 if d[0].y.dim() == 1 else d[0].y.size(1)

    trainer = GNNTrainer(real_weight=sig_weight, fake_weight=bkg_weight,
                         output_dir=args.output_dir, device=device)
    # Mirror trainer logging to stderr at DEBUG level.
    trainer.logger.setLevel(logging.DEBUG)
    strmH = logging.StreamHandler()
    formatter = logging.Formatter(
        '%(asctime)s - %(name)s - %(levelname)s - %(message)s')
    strmH.setFormatter(formatter)
    trainer.logger.addHandler(strmH)

    #example lr scheduling definition
    def lr_scaling(optimizer):
        from torch.optim.lr_scheduler import LambdaLR
        lr_type = 'linear'
        lr_warmup_epochs = 0
        warmup_factor = 0.
        # BUG FIX: the original compared the *function* `lr_scaling` to the
        # string 'linear' (always False); the intended check is on `lr_type`,
        # which is assigned just above and was otherwise unused.
        if lr_type == 'linear':
            warmup_factor = 1.

        # LR ramp warmup schedule
        def lr_warmup(epoch, warmup_factor=warmup_factor,
                      warmup_epochs=lr_warmup_epochs):
            if epoch < warmup_epochs:
                return (1. - warmup_factor) * epoch / warmup_epochs + warmup_factor
            else:
                return 1.

        # give the LR schedule to the trainer
        return LambdaLR(optimizer, lr_warmup)

    trainer.build_model(name='EdgeNet', loss_func='binary_cross_entropy',
                        optimizer='Adam', learning_rate=lr, lr_scaling=lr_scaling,
                        input_dim=num_features, hidden_dim=hidden_dim, n_iters=n_iters)

    print('made the hep.trkx trainer!')
    train_summary = trainer.train(train_loader, n_epochs, valid_data_loader=valid_loader)
    print(train_summary)

    # plot for the last epoch
    y, pred = trainer.predict(test_loader)
    make_test_plots(y, pred, 0.5, osp.join(trainer.output_dir, 'lastmodel.pdf'))

    # plot for the best model
    output_checkpoint = glob.glob(
        os.path.join(trainer.output_dir, 'checkpoints') + '/*.tar')
    bestmodel_path = [i for i in output_checkpoint if 'best' in i][0]
    trainer.model.load_state_dict(torch.load(bestmodel_path)['model'])
    y, pred = trainer.predict(test_loader)
    make_test_plots(y, pred, 0.5, osp.join(trainer.output_dir, 'bestmodel.pdf'))
def get_dataset(config):
    """Build the hit-graph dataset located at the input directory named by *config*."""
    input_dir = get_input_dir(config)
    return HitGraphDataset(input_dir)
def main(args):
    """Evaluate a trained EdgeNet, build per-hit features from its edge
    scores, train a small pT regressor on them, and dump diagnostic plots.

    Expects ``args.model`` to point at a saved checkpoint.  Relies on the
    module-level ``batch_size``, ``sig_weight``, ``bkg_weight``, ``device``,
    ``hidden_dim``, ``nhitfeature``, ``nhits_sel``, ``lr``, ``lr_scaling``
    and ``n_epoch`` settings.
    """
    directed = False
    threshold = 0.
    path = osp.join(os.environ['GNN_TRAINING_DATA_ROOT'], 'muon_graph_v4')
    # #path = osp.join(os.environ['GNN_TRAINING_DATA_ROOT'], 'single_mu_v0')
    full_dataset = HitGraphDataset(path, directed=directed)
    full_loader = DataLoader(full_dataset, batch_size=batch_size, shuffle=False)
    fulllen = len(full_dataset)
    # With tv_frac = 1.0 the "test" slice covers the whole dataset.
    tv_frac = 1.0
    tv_num = math.ceil(fulllen*tv_frac)
    splits = np.cumsum([fulllen-2*tv_num,tv_num,tv_num])
    train_dataset = HitGraphDataset(path, directed=directed)[0:splits[0]]
    valid_dataset = HitGraphDataset(path, directed=directed)[splits[0]:splits[1]]
    test_dataset = HitGraphDataset(path, directed=directed)[splits[1]:splits[2]]
    train_loader = DataLoader(train_dataset, batch_size=batch_size, pin_memory=True)
    valid_loader = DataLoader(valid_dataset, batch_size=batch_size, shuffle=False)
    test_loader = DataLoader(test_dataset, batch_size=batch_size, shuffle=False)
    test_samples = len(test_dataset)
    print('Testing with %s samples'%test_samples)

    d = full_dataset
    num_features = d.num_features
    num_classes = d[0].y.max().item() + 1 if d[0].y.dim() == 1 else d[0].y.size(1)

    # Load the trained edge classifier from the checkpoint.
    tester = GNNTrainer(real_weight=sig_weight, fake_weight=bkg_weight, device=device,
                        output_dir = os.path.abspath(os.path.join(os.path.dirname( args.model ), '..')))
    tester.load_model('EdgeNet',input_dim=num_features,hidden_dim=hidden_dim)
    print('Model: \n%s\nParameters: %i' % (tester.model,
                                           sum(p.numel() for p in tester.model.parameters())))
    tester.model.load_state_dict(torch.load(args.model)['model'])

    y,pred = tester.predict(full_loader)
    make_test_plots(y,pred,threshold,osp.join(tester.output_dir,'bestmodel.pdf'))

    # now need to load torch orgininal dataset again to find the hits
    # associated with the edges
    scores = []        # per-event: mean edge score per hit (sorted descending)
    totalscores = []   # per-event: summed edge score per hit
    n_edges = []       # per-event: edge count per hit
    ntrue_edges=[]     # per-event: true-edge count per hit
    nhits= []          # per-event: shape of the per-hit table (proxy for hit count)
    pt_target = []     # per-event: regression target (track pT)
    hit_features = []  # per-event: flattened, padded per-hit feature matrix
    for i in range(splits[1],splits[2]):
        data = test_dataset.get(i).to(device)
        edges_score = tester.model(data).cpu().detach().numpy()
        x= data.x.cpu().detach().numpy()
        y= data.y.cpu().detach().numpy()
        pt = data.pt
        eta = data.eta
        # NOTE(review): torch tensors take no argument to .item(); .item(0)
        # works only on numpy arrays — confirm the type of data.pt.
        pt_target.append(data.pt.item(0))
        edge_index_array = np.asarray(data.edge_index.cpu().detach().numpy())
        edge_score_array = np.asarray(edges_score)
        # Pair each edge's source-hit index with its score / truth label,
        # keeping only edges above the score threshold.
        hits_score = np.asarray((edge_index_array[0],edge_score_array))[:,edges_score>threshold]
        hits_true_score = np.asarray((edge_index_array[0],y))[:,edges_score>threshold]
        ### now calculate some hit level features
        df = pd.DataFrame(hits_score.T)
        sum_edges_score = df.groupby(0, as_index=False).sum().to_numpy()
        ### average hit score and coordinates will be used for regression,
        ### sorted by average hit score
        sum_edges_score_ave = df.groupby(0, as_index=False).mean().sort_values(1,ascending=False).to_numpy()
        # Select the raw hit coordinates (columns 64..) for the scored hits
        # and append the mean score as an extra feature column.
        x_filtered = x[sum_edges_score_ave[:,0].astype(int),64:64+nhitfeature-1]
        x_filtered = np.concatenate((x_filtered,sum_edges_score_ave[:,1][:,np.newaxis]),axis=1)
        # Per-column normalization; magic scales presumably match the
        # coordinate ranges — TODO confirm units.
        x_filtered = x_filtered*np.array([1/1000,1/45,1/180,1/800,1,1])
        # Pad (or truncate) to a fixed nhits_sel x nhitfeature matrix.
        x_padded = np.zeros((nhits_sel,nhitfeature))
        if x_filtered.shape[0] < nhits_sel:
            x_padded[:x_filtered.shape[0],:x_filtered.shape[1]] = x_filtered
        else:
            x_padded = x_filtered[:nhits_sel,:]
        hit_features.append(x_padded.flatten())
        ### average number of edges, and number of average true edges.
        sum_edges = df.groupby(0, as_index=False).count().to_numpy()
        sum_true_edges = pd.DataFrame(hits_true_score.T).groupby(0, as_index=False).sum().to_numpy()
        ### save other features
        scores.append(sum_edges_score_ave)
        totalscores.append(sum_edges_score)
        n_edges.append(sum_edges)
        ntrue_edges.append(sum_true_edges)
        nhits.append(sum_edges.shape)

    # plotting:
    # Train a small dense regressor on the flattened hit features to
    # predict pT; 80/20 random train/validation split.
    target = torch.Tensor(pt_target)
    features = torch.Tensor(hit_features)
    dataset = TensorDataset(features,target)
    tv_frac = 0.8
    train_set, val_set = torch.utils.data.random_split(dataset,
        [math.ceil(len(dataset)*tv_frac),len(pt_target)-math.ceil(len(dataset)*tv_frac)])
    train_dataloader = DataLoader(train_set)
    val_dataloader = DataLoader(val_set)
    regressor = RegressionTrainer(real_weight=sig_weight, fake_weight=bkg_weight, device=device,
                                  output_dir = os.path.abspath(os.path.join(os.path.dirname( args.model ), '..')))
    regressor.build_model(learning_rate=lr, lr_scaling=lr_scaling,
                          input_dim=nhits_sel*nhitfeature, hidden_dim=[64,32,32])
    train_summary = regressor.train(train_dataloader, n_epochs=n_epoch,
                                    valid_data_loader=val_dataloader)
    targ,pred = regressor.predict(val_dataloader)
    print(train_summary)
    print(targ)
    print(pred)

    figs = []
    #factorize the plotting part
    # Loss curves.
    fig,axes = plt.subplots(figsize=(12, 7))
    plt.plot(train_summary['train_loss'],label = 'train loss')
    plt.title('train loss')
    figs.append(fig)
    fig,axes = plt.subplots(figsize=(12, 7))
    plt.plot(train_summary['valid_loss'],label = 'valid loss')
    plt.title('validation loss')
    figs.append(fig)
    # Target vs predicted pT distributions.
    fig,axes = plt.subplots(figsize=(12, 7))
    _, bins,_ = axes.hist([targ,pred], bins=100,range = (0,100),color=['r','b'],
                          label=['target','pred'],histtype='step',fill=False)
    plt.title('pt distribution')
    # plt.xlim(0,100)
    figs.append(fig)
    # Predicted-vs-target scatter.
    fig,axes = plt.subplots(figsize=(12, 7))
    plt.scatter(pred,targ)
    plt.xlim(0,100)
    plt.ylim(0,100)
    figs.append(fig)
    # Per-hit mean edge score.
    fig,axes = plt.subplots(figsize=(12, 7))
    _, bins,_ = axes.hist(np.concatenate(scores)[:,1],bins=100,color=['b'],
                          label=['edge score'],histtype='step',fill=False)
    plt.title("Edge classifier score (per hit) on test data")
    plt.ylabel("Number of edges")
    plt.xlabel("Classifier score")
    plt.legend(loc='upper left')
    plt.yscale('log')
    figs.append(fig)
    # Hit multiplicity per event.
    fig,axes = plt.subplots(figsize=(12, 7))
    _, bins,_ = axes.hist(np.concatenate(nhits),bins=100,color=['b'],
                          label=['Number of hits'],histtype='step',fill=False)
    plt.title("Number of hits")
    plt.ylabel("Number of events")
    plt.xlabel("Number of hits")
    plt.legend(loc='upper left')
    plt.yscale('log')
    figs.append(fig)
    # Summed edge score per hit.
    fig,axes = plt.subplots(figsize=(12, 7))
    _, bins,_ = axes.hist(np.concatenate(totalscores)[:,1],bins=100,color=['b'],
                          label=['Total score'],histtype='step',fill=False)
    plt.title("sum of edge classifier score per hit")
    plt.ylabel("Number of edges")
    plt.xlabel("Classifier score")
    plt.legend(loc='upper left')
    plt.yscale('log')
    figs.append(fig)
    # Edge multiplicity per hit.
    fig,axes = plt.subplots(figsize=(12, 7))
    _, bins,_ = axes.hist(np.concatenate(n_edges)[:,1],bins=100,color=['b'],
                          label=['Number of edges'],histtype='step',fill=False)
    plt.title("Number of edges per hit")
    plt.ylabel("Number of edges")
    plt.xlabel("Classifier score")
    plt.legend(loc='upper left')
    plt.yscale('log')
    figs.append(fig)
    # True-edge multiplicity for hits whose mean score exceeds 0.7.
    fig,axes = plt.subplots(figsize=(12, 7))
    _, bins,_ = axes.hist(np.concatenate(ntrue_edges)[:,1][np.concatenate(scores)[:,1]>0.7],
                          bins=100,color=['b'],label=['true edge number'],histtype='step',fill=False)
    plt.title("Number of true edges per hit")
    plt.ylabel("Number of edges")
    plt.xlabel("Classifier score")
    plt.legend(loc='upper left')
    plt.yscale('log')
    figs.append(fig)

    # Bundle all diagnostic figures into a single PDF.
    import matplotlib.backends.backend_pdf
    pdf = matplotlib.backends.backend_pdf.PdfPages('hit_score.pdf')
    for fig in figs:
        pdf.savefig(fig)
    pdf.close()