import glob
import os
import random
import time

import numpy as np
import torch
import torch.optim as optim
from torch.utils.data import DataLoader
from tqdm import tqdm
from tensorboardX import SummaryWriter

# NOTE: project-local helpers used below (gnn_model, train_epoch, evaluate_network,
# load_model, set_random_seed, EIGNet, PNANet, DGNNet, hydra) are assumed to be
# provided by this repository's other modules.


def inference(MODEL_NAME, dataset, params, net_params, model_path):
    if MODEL_NAME in ['GCN', 'GAT']:
        if net_params['self_loop']:
            print("[!] Adding graph self-loops for GCN/GAT models (central node trick).")
            dataset._add_self_loops()

    trainset, valset, testset = dataset.train, dataset.val, dataset.test
    device = net_params['device']

    model = gnn_model(MODEL_NAME, net_params)
    model = model.to(device)
    model.load_state_dict(torch.load(model_path))

    # batching exception for DiffPool, which needs fixed-size batches
    drop_last = True if MODEL_NAME == 'DiffPool' else False

    train_loader = DataLoader(trainset, batch_size=params['batch_size'], shuffle=True,
                              drop_last=drop_last, collate_fn=dataset.collate)
    val_loader = DataLoader(valset, batch_size=params['batch_size'], shuffle=False,
                            drop_last=drop_last, collate_fn=dataset.collate)
    test_loader = DataLoader(testset, batch_size=params['batch_size'], shuffle=False,
                             drop_last=drop_last, collate_fn=dataset.collate)

    _, train_mae = evaluate_network(model, device, train_loader)
    _, val_mae = evaluate_network(model, device, val_loader)
    _, test_mae = evaluate_network(model, device, test_loader)

    print("Train MAE: {:.4f}".format(train_mae))
    print("Val MAE: {:.4f}".format(val_mae))
    print("Test MAE: {:.4f}".format(test_mae))
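# A minimal usage sketch for `inference`. The dict keys mirror the ones read
# above, but the dataset loader, values, and checkpoint path are illustrative
# assumptions, not part of this file:
#
#     dataset = LoadData('ZINC')                     # hypothetical loader
#     params = {'batch_size': 128}
#     net_params = {'self_loop': True, 'device': torch.device('cuda')}
#     inference('GCN', dataset, params, net_params,
#               model_path='out/checkpoints/RUN_/epoch_99.pkl')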
def test_pipeline(MODEL_NAME, dataset, device, verbose, out_dir):
    # Load models
    print('\n>> Loading models...')
    model_ls = load_model(
        out_dir, device=device, only_best=False, verbose=verbose,
        filter=lambda df: df[(df['model'] == MODEL_NAME) & (df['dataset'] == dataset.name)])

    # Prepare dataset
    print('\n>> Preparing data...')
    if MODEL_NAME in ['GCN', 'GAT']:
        if model_ls[0]['net_params']['self_loop']:
            print("[!] Adding graph self-loops for GCN/GAT models (central node trick).")
            dataset._add_self_loops()
    testset = dataset.test
    print("Test Graphs: ", len(testset))

    # Batch test data
    test_loader = DataLoader(testset, batch_size=model_ls[0]['net_params']['batch_size'],
                             shuffle=False, drop_last=False, collate_fn=dataset.collate)

    # Test models
    print('\n>> Testing models...')
    mae_ls = []
    for i, item in enumerate(model_ls):
        model = item['model']
        net_params = item['net_params']

        # Set random seed
        set_random_seed(item['seed'], device)

        # Evaluate model
        _, test_mae = evaluate_network(model, device, test_loader, 0)
        mae_ls.append(test_mae)
        if verbose:
            print('\nModel #%s' % i)
            print('Test MAE: %s' % mae_ls[-1])

    print('\n')
    print('AVG Test MAE: %s, s.d.: %s' % (np.mean(mae_ls), np.std(mae_ls)))
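# `set_random_seed` is called above but not defined in this file. A minimal
# sketch consistent with the seeding blocks used elsewhere in these pipelines
# (an assumption about the real helper, kept commented to avoid shadowing it):
#
#     def set_random_seed(seed, device):
#         random.seed(seed)
#         np.random.seed(seed)
#         torch.manual_seed(seed)
#         if device.type == 'cuda':
#             torch.cuda.manual_seed(seed)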
# Generic pipeline: the model is selected via MODEL_NAME (sparse GNNs, plus
# dense WL-GNNs which use their own train/eval functions and batch size 1).
def train_val_pipeline(MODEL_NAME, dataset, params, net_params, dirs):
    t0 = time.time()
    per_epoch_time = []

    DATASET_NAME = dataset.name

    if MODEL_NAME in ['GCN', 'GAT']:
        if net_params['self_loop']:
            print("[!] Adding graph self-loops for GCN/GAT models (central node trick).")
            dataset._add_self_loops()

    if MODEL_NAME in ['GatedGCN']:
        if net_params['pos_enc']:
            print("[!] Adding graph positional encoding.")
            dataset._add_positional_encodings(net_params['pos_enc_dim'])
            print('Time PE:', time.time() - t0)

    trainset, valset, testset = dataset.train, dataset.val, dataset.test

    root_log_dir, root_ckpt_dir, write_file_name, write_config_file = dirs
    device = net_params['device']

    # Write the network and optimization hyper-parameters in folder config/
    with open(write_config_file + '.txt', 'w') as f:
        f.write("""Dataset: {},\nModel: {}\n\nparams={}\n\nnet_params={}\n\n\nTotal Parameters: {}\n\n"""
                .format(DATASET_NAME, MODEL_NAME, params, net_params, net_params['total_param']))

    log_dir = os.path.join(root_log_dir, "RUN_" + str(0))
    writer = SummaryWriter(log_dir=log_dir)

    # setting seeds
    random.seed(params['seed'])
    np.random.seed(params['seed'])
    torch.manual_seed(params['seed'])
    if device.type == 'cuda':
        torch.cuda.manual_seed(params['seed'])

    print("Training Graphs: ", len(trainset))
    print("Validation Graphs: ", len(valset))
    print("Test Graphs: ", len(testset))

    model = gnn_model(MODEL_NAME, net_params)
    model = model.to(device)

    optimizer = optim.Adam(model.parameters(), lr=params['init_lr'],
                           weight_decay=params['weight_decay'])
    scheduler = optim.lr_scheduler.ReduceLROnPlateau(optimizer, mode='min',
                                                     factor=params['lr_reduce_factor'],
                                                     patience=params['lr_schedule_patience'],
                                                     verbose=True)

    epoch_train_losses, epoch_val_losses = [], []
    epoch_train_MAEs, epoch_val_MAEs = [], []

    # batching exception for DiffPool
    drop_last = True if MODEL_NAME == 'DiffPool' else False

    if MODEL_NAME in ['RingGNN', '3WLGNN']:
        # import train functions specific to the dense WL-GNNs
        from train.train_molecules_graph_regression import train_epoch_dense as train_epoch, \
            evaluate_network_dense as evaluate_network
        from functools import partial  # util to pass edge_feat to collate function

        train_loader = DataLoader(trainset, shuffle=True,
                                  collate_fn=partial(dataset.collate_dense_gnn,
                                                     edge_feat=net_params['edge_feat']))
        val_loader = DataLoader(valset, shuffle=False,
                                collate_fn=partial(dataset.collate_dense_gnn,
                                                   edge_feat=net_params['edge_feat']))
        test_loader = DataLoader(testset, shuffle=False,
                                 collate_fn=partial(dataset.collate_dense_gnn,
                                                    edge_feat=net_params['edge_feat']))
    else:
        # import train functions for all other GNNs
        from train.train_molecules_graph_regression import train_epoch_sparse as train_epoch, \
            evaluate_network_sparse as evaluate_network

        train_loader = DataLoader(trainset, batch_size=params['batch_size'], shuffle=True,
                                  drop_last=drop_last, collate_fn=dataset.collate)
        val_loader = DataLoader(valset, batch_size=params['batch_size'], shuffle=False,
                                drop_last=drop_last, collate_fn=dataset.collate)
        test_loader = DataLoader(testset, batch_size=params['batch_size'], shuffle=False,
                                 drop_last=drop_last, collate_fn=dataset.collate)

    # At any point you can hit Ctrl + C to break out of training early.
    try:
        with tqdm(range(params['epochs'])) as t:
            for epoch in t:
                t.set_description('Epoch %d' % epoch)
                start = time.time()

                if MODEL_NAME in ['RingGNN', '3WLGNN']:  # different batch training function for WL-GNNs
                    epoch_train_loss, epoch_train_mae, optimizer = train_epoch(
                        model, optimizer, device, train_loader, epoch, params['batch_size'])
                else:  # common train function for all other models
                    epoch_train_loss, epoch_train_mae, optimizer = train_epoch(
                        model, optimizer, device, train_loader, epoch)

                epoch_val_loss, epoch_val_mae = evaluate_network(model, device, val_loader, epoch)
                _, epoch_test_mae = evaluate_network(model, device, test_loader, epoch)

                epoch_train_losses.append(epoch_train_loss)
                epoch_val_losses.append(epoch_val_loss)
                epoch_train_MAEs.append(epoch_train_mae)
                epoch_val_MAEs.append(epoch_val_mae)

                writer.add_scalar('train/_loss', epoch_train_loss, epoch)
                writer.add_scalar('val/_loss', epoch_val_loss, epoch)
                writer.add_scalar('train/_mae', epoch_train_mae, epoch)
                writer.add_scalar('val/_mae', epoch_val_mae, epoch)
                writer.add_scalar('test/_mae', epoch_test_mae, epoch)
                writer.add_scalar('learning_rate', optimizer.param_groups[0]['lr'], epoch)

                t.set_postfix(time=time.time() - start,
                              lr=optimizer.param_groups[0]['lr'],
                              train_loss=epoch_train_loss, val_loss=epoch_val_loss,
                              train_MAE=epoch_train_mae, val_MAE=epoch_val_mae,
                              test_MAE=epoch_test_mae)

                per_epoch_time.append(time.time() - start)

                # Saving checkpoint; only the two most recent epochs are kept on disk.
                ckpt_dir = os.path.join(root_ckpt_dir, "RUN_")
                if not os.path.exists(ckpt_dir):
                    os.makedirs(ckpt_dir)
                torch.save(model.state_dict(), '{}.pkl'.format(ckpt_dir + "/epoch_" + str(epoch)))

                files = glob.glob(ckpt_dir + '/*.pkl')
                for file in files:
                    epoch_nb = file.split('_')[-1]
                    epoch_nb = int(epoch_nb.split('.')[0])
                    if epoch_nb < epoch - 1:
                        os.remove(file)

                scheduler.step(epoch_val_loss)

                if optimizer.param_groups[0]['lr'] < params['min_lr']:
                    print("\n!! LR EQUAL TO MIN LR SET.")
                    break

                # Stop training after params['max_time'] hours
                if time.time() - t0 > params['max_time'] * 3600:
                    print('-' * 89)
                    print("Max_time for training elapsed {:.2f} hours, so stopping"
                          .format(params['max_time']))
                    break

    except KeyboardInterrupt:
        print('-' * 89)
        print('Exiting from training early because of KeyboardInterrupt')

    _, test_mae = evaluate_network(model, device, test_loader, epoch)
    _, train_mae = evaluate_network(model, device, train_loader, epoch)
    print("Test MAE: {:.4f}".format(test_mae))
    print("Train MAE: {:.4f}".format(train_mae))
    print("Convergence Time (Epochs): {:.4f}".format(epoch))
    print("TOTAL TIME TAKEN: {:.4f}s".format(time.time() - t0))
    print("AVG TIME PER EPOCH: {:.4f}s".format(np.mean(per_epoch_time)))

    writer.close()

    # Write the results in out_dir/results folder
    with open(write_file_name + '.txt', 'w') as f:
        f.write("""Dataset: {},\nModel: {}\n\nparams={}\n\nnet_params={}\n\n{}\n\nTotal Parameters: {}\n\n
    FINAL RESULTS\nTEST MAE: {:.4f}\nTRAIN MAE: {:.4f}\n\n
    Convergence Time (Epochs): {:.4f}\nTotal Time Taken: {:.4f} hrs\nAverage Time Per Epoch: {:.4f} s\n\n\n"""
                .format(DATASET_NAME, MODEL_NAME, params, net_params, model,
                        net_params['total_param'], test_mae, train_mae, epoch,
                        (time.time() - t0) / 3600, np.mean(per_epoch_time)))
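# The `dirs` argument above is unpacked as (root_log_dir, root_ckpt_dir,
# write_file_name, write_config_file). A hypothetical construction, assuming
# an out_dir layout with logs/, checkpoints/, results/, and configs/ subfolders
# (the layout and run_id naming are assumptions, not from this file):
#
#     out_dir, run_id = 'out/', 'GCN_ZINC_GPU0'
#     dirs = (out_dir + 'logs/' + run_id,                # TensorBoard logs
#             out_dir + 'checkpoints/' + run_id,         # epoch_*.pkl checkpoints
#             out_dir + 'results/result_' + run_id,      # final results .txt
#             out_dir + 'configs/config_' + run_id)      # hyper-parameter dump
#     train_val_pipeline(MODEL_NAME, dataset, params, net_params, dirs)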
# EIG pipeline: runs under the hydra job manager, with periodic checkpoints
# that allow a later execution to resume where a previous one stopped.
def train_val_pipeline(dataset, params, net_params, dirs):
    t0 = time.time()
    per_epoch_time = []

    DATASET_NAME = dataset.name
    MODEL_NAME = 'EIG'

    trainset, valset, testset = dataset.train, dataset.val, dataset.test
    root_log_dir, root_ckpt_dir, write_file_name, write_config_file = dirs
    device = net_params['device']

    # Write the network and optimization hyper-parameters in folder config/
    with open(write_config_file + '.txt', 'w') as f:
        f.write("""Dataset: {},\nModel: {}\n\nparams={}\n\nnet_params={}\n\n\nTotal Parameters: {}\n\n"""
                .format(DATASET_NAME, MODEL_NAME, params, net_params, net_params['total_param']))

    log_dir = os.path.join(root_log_dir, "RUN_" + str(0))
    writer = SummaryWriter(log_dir=log_dir)

    # setting seeds
    random.seed(params['seed'])
    np.random.seed(params['seed'])
    torch.manual_seed(params['seed'])
    if device.type == 'cuda':
        torch.cuda.manual_seed(params['seed'])

    if hydra.is_first_execution():
        print("Training Graphs: ", len(trainset))
        print("Validation Graphs: ", len(valset))
        print("Test Graphs: ", len(testset))

    model = EIGNet(net_params)
    model = model.to(device)

    optimizer = optim.Adam(model.parameters(), lr=params['init_lr'],
                           weight_decay=params['weight_decay'])
    scheduler = optim.lr_scheduler.ReduceLROnPlateau(optimizer, mode='min',
                                                     factor=params['lr_reduce_factor'],
                                                     patience=params['lr_schedule_patience'],
                                                     verbose=True)

    if hydra.is_first_execution():
        start_epoch = 0
    else:
        # Resume from the last hydra checkpoint.
        t0 -= hydra.retrieved_checkpoint.time_elapsed
        start_epoch = hydra.retrieved_checkpoint.last_epoch
        states = torch.load(hydra.retrieved_checkpoint.linked_files()[0])
        model.load_state_dict(states['model'])
        optimizer.load_state_dict(states['optimizer'])
        scheduler.load_state_dict(states['scheduler'])

    epoch_train_losses, epoch_val_losses = [], []
    epoch_train_MAEs, epoch_val_MAEs = [], []

    train_loader = DataLoader(trainset, batch_size=params['batch_size'], shuffle=True,
                              collate_fn=dataset.collate)
    val_loader = DataLoader(valset, batch_size=params['batch_size'], shuffle=False,
                            collate_fn=dataset.collate)
    test_loader = DataLoader(testset, batch_size=params['batch_size'], shuffle=False,
                             collate_fn=dataset.collate)

    last_hydra_checkpoint = t0

    # At any point you can hit Ctrl + C to break out of training early.
    try:
        with tqdm(range(start_epoch, params['epochs']),
                  mininterval=params['hydra_progress_bar_every'],
                  maxinterval=None, unit='epoch',
                  initial=start_epoch, total=params['epochs']) as t:
            for epoch in t:
                if epoch == -1:  # reset params of eig_attn
                    model.reset_params()

                t.set_description('Epoch %d' % epoch)
                start = time.time()

                epoch_train_loss, epoch_train_mae, optimizer = train_epoch(
                    model, optimizer, device, train_loader, epoch, net_params['flip'])
                epoch_val_loss, epoch_val_mae = evaluate_network(model, device, val_loader, epoch)

                epoch_train_losses.append(epoch_train_loss)
                epoch_val_losses.append(epoch_val_loss)
                epoch_train_MAEs.append(epoch_train_mae.detach().cpu().item())
                epoch_val_MAEs.append(epoch_val_mae.detach().cpu().item())

                writer.add_scalar('train/_loss', epoch_train_loss, epoch)
                writer.add_scalar('val/_loss', epoch_val_loss, epoch)
                writer.add_scalar('train/_mae', epoch_train_mae, epoch)
                writer.add_scalar('val/_mae', epoch_val_mae, epoch)
                writer.add_scalar('learning_rate', optimizer.param_groups[0]['lr'], epoch)

                _, epoch_test_mae = evaluate_network(model, device, test_loader, epoch)

                t.set_postfix(time=time.time() - start,
                              lr=optimizer.param_groups[0]['lr'],
                              train_loss=epoch_train_loss, val_loss=epoch_val_loss,
                              train_MAE=epoch_train_mae.item(),
                              val_MAE=epoch_val_mae.item(),
                              test_MAE=epoch_test_mae.item(),
                              refresh=False)

                per_epoch_time.append(time.time() - start)

                scheduler.step(epoch_val_loss)

                if optimizer.param_groups[0]['lr'] < params['min_lr']:
                    print("\n!! LR EQUAL TO MIN LR SET.")
                    break

                # Stop training after params['max_time'] hours
                if time.time() - t0 > params['max_time'] * 3600:
                    print('-' * 89)
                    print("Max_time for training elapsed {:.2f} hours, so stopping"
                          .format(params['max_time']))
                    break

                # Saving checkpoint
                if hydra.is_available() and \
                        (time.time() - last_hydra_checkpoint) > params['hydra_checkpoint_every']:
                    last_hydra_checkpoint = time.time()
                    ck_path = '/tmp/epoch_{}.pkl'.format(epoch + 1)
                    torch.save({'model': model.state_dict(),
                                'optimizer': optimizer.state_dict(),
                                'scheduler': scheduler.state_dict()}, ck_path)
                    ck = hydra.checkpoint()
                    ck.last_epoch = epoch + 1
                    ck.time_elapsed = time.time() - t0
                    # link the latest checkpoint file
                    ck.link_file(ck_path)
                    ck.save_to_server()

                if hydra.is_available() and epoch % params['hydra_eta_every'] == 0:
                    hydra.set_eta(per_epoch_time[-1] * (params['epochs'] - epoch - 1))

                # for _ in range(5):
                #     print('Sampled value is ', model.layers[1].towers[0].eigfiltbis(
                #         torch.FloatTensor([random.random() for i in range(4)]).to('cuda')))

    except KeyboardInterrupt:
        print('-' * 89)
        print('Exiting from training early because of KeyboardInterrupt')

    _, test_mae = evaluate_network(model, device, test_loader, epoch)
    _, val_mae = evaluate_network(model, device, val_loader, epoch)
    _, train_mae = evaluate_network(model, device, train_loader, epoch)
    test_mae = test_mae.item()
    val_mae = val_mae.item()
    train_mae = train_mae.item()
    print("Train MAE: {:.4f}".format(train_mae))
    print("Val MAE: {:.4f}".format(val_mae))
    print("Test MAE: {:.4f}".format(test_mae))
    print("TOTAL TIME TAKEN: {:.4f}s".format(time.time() - t0))
    print("AVG TIME PER EPOCH: {:.4f}s".format(np.mean(per_epoch_time)))

    # for i, layer in enumerate(model.layers):
    #     for j, tower in enumerate(layer.towers):
    #         print('For layer ', i, ' tower ', j, ' the weights are ', tower.bias)
    #         print('For layer ', i, ' tower ', j, ' the bias are ', tower.bias)

    writer.close()

    if hydra.is_available():
        hydra.save_output({'loss': {'train': epoch_train_losses, 'val': epoch_val_losses},
                           'MAE': {'train': epoch_train_MAEs, 'val': epoch_val_MAEs}},
                          'history')
        hydra.save_output({'test_mae': test_mae, 'train_mae': train_mae, 'val_mae': val_mae,
                           'total_time': time.time() - t0,
                           'avg_epoch_time': np.mean(per_epoch_time)},
                          'summary')

    # Write the results in out_dir/results folder
    with open(write_file_name + '.txt', 'w') as f:
        f.write("""Dataset: {},\nModel: {}\n\nparams={}\n\nnet_params={}\n\n{}\n\nTotal Parameters: {}\n\n
    FINAL RESULTS\nTEST MAE: {:.4f}\nTRAIN MAE: {:.4f}\n\n
    Total Time Taken: {:.4f} hrs\nAverage Time Per Epoch: {:.4f} s\n\n\n"""
                .format(DATASET_NAME, MODEL_NAME, params, net_params, model,
                        net_params['total_param'], test_mae, train_mae,
                        (time.time() - t0) / 3600, np.mean(per_epoch_time)))
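# The hydra resume branch above expects a checkpoint file holding exactly three
# state dicts. A self-contained round-trip sketch of that contract (dummy model;
# the path is illustrative):
def _checkpoint_roundtrip_demo(path='/tmp/ck_demo.pkl'):
    model = torch.nn.Linear(4, 1)
    optimizer = optim.Adam(model.parameters(), lr=1e-3)
    scheduler = optim.lr_scheduler.ReduceLROnPlateau(optimizer, mode='min')
    # save the same payload written inside the training loop above
    torch.save({'model': model.state_dict(),
                'optimizer': optimizer.state_dict(),
                'scheduler': scheduler.state_dict()}, path)
    # restore, as done when hydra.is_first_execution() is False
    states = torch.load(path)
    model.load_state_dict(states['model'])
    optimizer.load_state_dict(states['optimizer'])
    scheduler.load_state_dict(states['scheduler'])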
# PNA pipeline: same structure as above, without hydra checkpointing.
def train_val_pipeline(dataset, params, net_params, dirs):
    t0 = time.time()
    per_epoch_time = []

    DATASET_NAME = dataset.name
    MODEL_NAME = 'PNA'

    trainset, valset, testset = dataset.train, dataset.val, dataset.test
    root_log_dir, root_ckpt_dir, write_file_name, write_config_file = dirs
    device = net_params['device']

    # Write the network and optimization hyper-parameters in folder config/
    with open(write_config_file + '.txt', 'w') as f:
        f.write("""Dataset: {},\nModel: {}\n\nparams={}\n\nnet_params={}\n\n\nTotal Parameters: {}\n\n"""
                .format(DATASET_NAME, MODEL_NAME, params, net_params, net_params['total_param']))

    log_dir = os.path.join(root_log_dir, "RUN_" + str(0))
    writer = SummaryWriter(log_dir=log_dir)

    # setting seeds
    random.seed(params['seed'])
    np.random.seed(params['seed'])
    torch.manual_seed(params['seed'])
    if device.type == 'cuda':
        torch.cuda.manual_seed(params['seed'])

    print("Training Graphs: ", len(trainset))
    print("Validation Graphs: ", len(valset))
    print("Test Graphs: ", len(testset))

    model = PNANet(net_params)
    model = model.to(device)

    optimizer = optim.Adam(model.parameters(), lr=params['init_lr'],
                           weight_decay=params['weight_decay'])
    scheduler = optim.lr_scheduler.ReduceLROnPlateau(optimizer, mode='min',
                                                     factor=params['lr_reduce_factor'],
                                                     patience=params['lr_schedule_patience'],
                                                     verbose=True)

    epoch_train_losses, epoch_val_losses = [], []
    epoch_train_MAEs, epoch_val_MAEs = [], []

    train_loader = DataLoader(trainset, batch_size=params['batch_size'], shuffle=True,
                              collate_fn=dataset.collate)
    val_loader = DataLoader(valset, batch_size=params['batch_size'], shuffle=False,
                            collate_fn=dataset.collate)
    test_loader = DataLoader(testset, batch_size=params['batch_size'], shuffle=False,
                             collate_fn=dataset.collate)

    # At any point you can hit Ctrl + C to break out of training early.
    try:
        with tqdm(range(params['epochs']), unit='epoch') as t:
            for epoch in t:
                t.set_description('Epoch %d' % epoch)
                start = time.time()

                epoch_train_loss, epoch_train_mae, optimizer = train_epoch(
                    model, optimizer, device, train_loader, epoch)
                epoch_val_loss, epoch_val_mae = evaluate_network(model, device, val_loader, epoch)

                epoch_train_losses.append(epoch_train_loss)
                epoch_val_losses.append(epoch_val_loss)
                epoch_train_MAEs.append(epoch_train_mae.detach().cpu().item())
                epoch_val_MAEs.append(epoch_val_mae.detach().cpu().item())

                writer.add_scalar('train/_loss', epoch_train_loss, epoch)
                writer.add_scalar('val/_loss', epoch_val_loss, epoch)
                writer.add_scalar('train/_mae', epoch_train_mae, epoch)
                writer.add_scalar('val/_mae', epoch_val_mae, epoch)
                writer.add_scalar('learning_rate', optimizer.param_groups[0]['lr'], epoch)

                _, epoch_test_mae = evaluate_network(model, device, test_loader, epoch)

                t.set_postfix(time=time.time() - start,
                              lr=optimizer.param_groups[0]['lr'],
                              train_loss=epoch_train_loss, val_loss=epoch_val_loss,
                              train_MAE=epoch_train_mae.item(),
                              val_MAE=epoch_val_mae.item(),
                              test_MAE=epoch_test_mae.item(),
                              refresh=False)

                per_epoch_time.append(time.time() - start)

                scheduler.step(epoch_val_loss)

                if optimizer.param_groups[0]['lr'] < params['min_lr']:
                    print("\n!! LR EQUAL TO MIN LR SET.")
                    break

                # Stop training after params['max_time'] hours
                if time.time() - t0 > params['max_time'] * 3600:
                    print('-' * 89)
                    print("Max_time for training elapsed {:.2f} hours, so stopping"
                          .format(params['max_time']))
                    break

    except KeyboardInterrupt:
        print('-' * 89)
        print('Exiting from training early because of KeyboardInterrupt')

    _, test_mae = evaluate_network(model, device, test_loader, epoch)
    _, val_mae = evaluate_network(model, device, val_loader, epoch)
    _, train_mae = evaluate_network(model, device, train_loader, epoch)
    test_mae = test_mae.item()
    val_mae = val_mae.item()
    train_mae = train_mae.item()
    print("Train MAE: {:.4f}".format(train_mae))
    print("Val MAE: {:.4f}".format(val_mae))
    print("Test MAE: {:.4f}".format(test_mae))
    print("TOTAL TIME TAKEN: {:.4f}s".format(time.time() - t0))
    print("AVG TIME PER EPOCH: {:.4f}s".format(np.mean(per_epoch_time)))

    writer.close()

    # Write the results in out_dir/results folder
    with open(write_file_name + '.txt', 'w') as f:
        f.write("""Dataset: {},\nModel: {}\n\nparams={}\n\nnet_params={}\n\n{}\n\nTotal Parameters: {}\n\n
    FINAL RESULTS\nTEST MAE: {:.4f}\nTRAIN MAE: {:.4f}\n\n
    Total Time Taken: {:.4f} hrs\nAverage Time Per Epoch: {:.4f} s\n\n\n"""
                .format(DATASET_NAME, MODEL_NAME, params, net_params, model,
                        net_params['total_param'], test_mae, train_mae,
                        (time.time() - t0) / 3600, np.mean(per_epoch_time)))
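# Every pipeline here stops once ReduceLROnPlateau drives the learning rate
# below params['min_lr']. A self-contained sketch of that interplay (dummy
# model; factor, patience, and min_lr values are illustrative):
def _lr_plateau_demo():
    model = torch.nn.Linear(4, 1)
    optimizer = optim.Adam(model.parameters(), lr=1e-3)
    scheduler = optim.lr_scheduler.ReduceLROnPlateau(optimizer, mode='min',
                                                     factor=0.5, patience=2)
    min_lr = 1e-5
    for epoch in range(100):
        val_loss = 1.0  # a flat validation loss counts as "no improvement"
        scheduler.step(val_loss)  # halves the LR after each patience window
        if optimizer.param_groups[0]['lr'] < min_lr:
            print("!! LR EQUAL TO MIN LR SET.")  # same exit as the pipelines
            break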
# Pipeline variant that tracks the best validation checkpoint and reports the
# test MAE of that checkpoint (rather than of the last epoch).
def train_val_pipeline(MODEL_NAME, dataset, params, net_params, dirs):
    t0 = time.time()
    per_epoch_time = []

    DATASET_NAME = dataset.name

    if MODEL_NAME in ['GCN', 'GAT']:
        if net_params['self_loop']:
            print("[!] Adding graph self-loops for GCN/GAT models (central node trick).")
            dataset._add_self_loops()

    trainset, valset, testset = dataset.train, dataset.val, dataset.test

    root_log_dir, root_ckpt_dir, write_file_name, write_config_file = dirs
    device = net_params['device']

    # Write the network and optimization hyper-parameters in folder config/
    with open(write_config_file + '.txt', 'w') as f:
        f.write("""Dataset: {},\nModel: {}\n\nparams={}\n\nnet_params={}\n\n\nTotal Parameters: {}\n\n"""
                .format(DATASET_NAME, MODEL_NAME, params, net_params, net_params['total_param']))

    log_dir = os.path.join(root_log_dir, "RUN_" + str(0))
    writer = SummaryWriter(log_dir=log_dir)

    # setting seeds
    random.seed(params['seed'])
    np.random.seed(params['seed'])
    torch.manual_seed(params['seed'])
    if device.type == 'cuda':
        torch.cuda.manual_seed(params['seed'])

    print("Training Graphs: ", len(trainset))
    print("Validation Graphs: ", len(valset))
    print("Test Graphs: ", len(testset))

    model = gnn_model(MODEL_NAME, net_params)
    model = model.to(device)

    optimizer = optim.Adam(model.parameters(), lr=params['init_lr'],
                           weight_decay=params['weight_decay'])
    scheduler = optim.lr_scheduler.ReduceLROnPlateau(optimizer, mode='min',
                                                     factor=params['lr_reduce_factor'],
                                                     patience=params['lr_schedule_patience'],
                                                     verbose=True)

    epoch_train_losses, epoch_val_losses = [], []
    epoch_train_MAEs, epoch_val_MAEs = [], []

    # batching exception for DiffPool
    drop_last = True if MODEL_NAME == 'DiffPool' else False

    train_loader = DataLoader(trainset, batch_size=params['batch_size'], shuffle=True,
                              drop_last=drop_last, collate_fn=dataset.collate)
    val_loader = DataLoader(valset, batch_size=params['batch_size'], shuffle=False,
                            drop_last=drop_last, collate_fn=dataset.collate)
    test_loader = DataLoader(testset, batch_size=params['batch_size'], shuffle=False,
                             drop_last=drop_last, collate_fn=dataset.collate)

    # At any point you can hit Ctrl + C to break out of training early.
    try:
        with tqdm(range(params['epochs'])) as t:
            best_val_mae = float('inf')
            for epoch in t:
                t.set_description('Epoch %d' % epoch)
                start = time.time()

                epoch_train_loss, epoch_train_mae, optimizer = train_epoch(
                    model, optimizer, device, train_loader, epoch)
                epoch_val_loss, epoch_val_mae = evaluate_network(model, device, val_loader, epoch)

                epoch_train_losses.append(epoch_train_loss)
                epoch_val_losses.append(epoch_val_loss)
                epoch_train_MAEs.append(epoch_train_mae)
                epoch_val_MAEs.append(epoch_val_mae)

                writer.add_scalar('train/_loss', epoch_train_loss, epoch)
                writer.add_scalar('val/_loss', epoch_val_loss, epoch)
                writer.add_scalar('train/_mae', epoch_train_mae, epoch)
                writer.add_scalar('val/_mae', epoch_val_mae, epoch)
                writer.add_scalar('learning_rate', optimizer.param_groups[0]['lr'], epoch)

                _, epoch_test_mae = evaluate_network(model, device, test_loader, epoch)

                t.set_postfix(time=time.time() - start,
                              lr=optimizer.param_groups[0]['lr'],
                              train_loss=epoch_train_loss, val_loss=epoch_val_loss,
                              train_MAE=epoch_train_mae.item(),
                              val_MAE=epoch_val_mae.item(),
                              test_MAE=epoch_test_mae.item())

                per_epoch_time.append(time.time() - start)

                # Saving checkpoint; keep the best-on-validation model and the
                # two most recent epochs.
                ckpt_dir = os.path.join(root_ckpt_dir, "RUN_")
                if not os.path.exists(ckpt_dir):
                    os.makedirs(ckpt_dir)
                torch.save(model.state_dict(), '{}.pkl'.format(ckpt_dir + "/epoch_" + str(epoch)))

                if best_val_mae > epoch_val_mae:
                    best_val_mae = epoch_val_mae
                    torch.save(model.state_dict(), '{}.pkl'.format(ckpt_dir + "/best"))

                files = glob.glob(ckpt_dir + '/*.pkl')
                for file in files:
                    if file.endswith('best.pkl'):
                        continue
                    epoch_nb = file.split('_')[-1]
                    epoch_nb = int(epoch_nb.split('.')[0])
                    if epoch_nb < epoch - 1:
                        os.remove(file)

                scheduler.step(epoch_val_loss)

                if optimizer.param_groups[0]['lr'] < params['min_lr']:
                    print("\n!! LR EQUAL TO MIN LR SET.")
                    break

                # Stop training after params['max_time'] hours
                if time.time() - t0 > params['max_time'] * 3600:
                    print('-' * 89)
                    print("Max_time for training elapsed {:.2f} hours, so stopping"
                          .format(params['max_time']))
                    break

    except KeyboardInterrupt:
        print('-' * 89)
        print('Exiting from training early because of KeyboardInterrupt')

    # Evaluate the best-on-validation checkpoint.
    model.load_state_dict(torch.load('{}.pkl'.format(ckpt_dir + "/best")))
    _, val_mae = evaluate_network(model, device, val_loader, epoch)
    _, test_mae = evaluate_network(model, device, test_loader, epoch)
    _, train_mae = evaluate_network(model, device, train_loader, epoch)
    print("Test MAE: {:.4f}".format(test_mae))
    print("Val MAE: {:.4f}".format(val_mae))
    print("Train MAE: {:.4f}".format(train_mae))
    print("TOTAL TIME TAKEN: {:.4f}s".format(time.time() - t0))
    print("AVG TIME PER EPOCH: {:.4f}s".format(np.mean(per_epoch_time)))

    writer.close()

    # Write the results in out_dir/results folder
    with open(write_file_name + '.txt', 'w') as f:
        f.write("""Dataset: {},\nModel: {}\n\nparams={}\n\nnet_params={}\n\n{}\n\nTotal Parameters: {}\n\n
    FINAL RESULTS\nTEST MAE: {:.4f}\nTRAIN MAE: {:.4f}\n\n
    Total Time Taken: {:.4f} hrs\nAverage Time Per Epoch: {:.4f} s\n\n\n"""
                .format(DATASET_NAME, MODEL_NAME, params, net_params, model,
                        net_params['total_param'], test_mae, train_mae,
                        (time.time() - t0) / 3600, np.mean(per_epoch_time)))

    # send results to gmail
    try:
        from gmail import send
        subject = 'Result for Dataset: {}, Model: {}'.format(DATASET_NAME, MODEL_NAME)
        body = """Dataset: {},\nModel: {}\n\nparams={}\n\nnet_params={}\n\n{}\n\nTotal Parameters: {}\n\n
    FINAL RESULTS\nTEST MAE: {:.4f}\nTRAIN MAE: {:.4f}\n\n
    Total Time Taken: {:.4f} hrs\nAverage Time Per Epoch: {:.4f} s\n\n\n"""\
            .format(DATASET_NAME, MODEL_NAME, params, net_params, model,
                    net_params['total_param'], test_mae, train_mae,
                    (time.time() - t0) / 3600, np.mean(per_epoch_time))
        send(subject, body)
    except Exception:
        pass

    return val_mae, test_mae
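# This variant returns (val_mae, test_mae), so a caller can select over seeds
# or configurations by validation MAE. A hypothetical sweep (the seed list and
# loop are illustrative assumptions):
#
#     results = []
#     for seed in [41, 95, 12, 35]:
#         params['seed'] = seed
#         results.append(train_val_pipeline(MODEL_NAME, dataset, params,
#                                           net_params, dirs))
#     best_val, best_test = min(results)  # tuple order picks lowest val MAE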
# DGN pipeline: lightweight variant without config dump or TensorBoard logging.
def train_val_pipeline(dataset, params, net_params):
    t0 = time.time()
    per_epoch_time = []

    trainset, valset, testset = dataset.train, dataset.val, dataset.test
    device = net_params['device']

    # setting seeds
    random.seed(params['seed'])
    np.random.seed(params['seed'])
    torch.manual_seed(params['seed'])
    if device.type == 'cuda':
        torch.cuda.manual_seed(params['seed'])

    print("Training Graphs: ", len(trainset))
    print("Validation Graphs: ", len(valset))
    print("Test Graphs: ", len(testset))

    model = DGNNet(net_params)
    model = model.to(device)

    optimizer = optim.Adam(model.parameters(), lr=params['init_lr'],
                           weight_decay=params['weight_decay'])
    scheduler = optim.lr_scheduler.ReduceLROnPlateau(optimizer, mode='min',
                                                     factor=params['lr_reduce_factor'],
                                                     patience=params['lr_schedule_patience'])

    start_epoch = 0
    epoch_train_losses, epoch_val_losses = [], []
    epoch_train_MAEs, epoch_val_MAEs = [], []

    train_loader = DataLoader(trainset, batch_size=params['batch_size'], shuffle=True,
                              collate_fn=dataset.collate)
    val_loader = DataLoader(valset, batch_size=params['batch_size'], shuffle=False,
                            collate_fn=dataset.collate)
    test_loader = DataLoader(testset, batch_size=params['batch_size'], shuffle=False,
                             collate_fn=dataset.collate)

    # At any point you can hit Ctrl + C to break out of training early.
    try:
        with tqdm(range(start_epoch, params['epochs']),
                  mininterval=params['print_epoch_interval'],
                  maxinterval=None, unit='epoch',
                  initial=start_epoch, total=params['epochs']) as t:
            for epoch in t:
                t.set_description('Epoch %d' % epoch)
                start = time.time()

                epoch_train_loss, epoch_train_mae, optimizer = train_epoch(
                    model, optimizer, device, train_loader, epoch, net_params['flip'])
                epoch_val_loss, epoch_val_mae = evaluate_network(model, device, val_loader, epoch)

                epoch_train_losses.append(epoch_train_loss)
                epoch_val_losses.append(epoch_val_loss)
                epoch_train_MAEs.append(epoch_train_mae.detach().cpu().item())
                epoch_val_MAEs.append(epoch_val_mae.detach().cpu().item())

                _, epoch_test_mae = evaluate_network(model, device, test_loader, epoch)

                t.set_postfix(time=time.time() - start,
                              lr=optimizer.param_groups[0]['lr'],
                              train_loss=epoch_train_loss, val_loss=epoch_val_loss,
                              train_MAE=epoch_train_mae.item(),
                              val_MAE=epoch_val_mae.item(),
                              test_MAE=epoch_test_mae.item(),
                              refresh=False)

                per_epoch_time.append(time.time() - start)

                scheduler.step(epoch_val_loss)

                if optimizer.param_groups[0]['lr'] < params['min_lr']:
                    print("\n!! LR EQUAL TO MIN LR SET.")
                    break

                # Stop training after params['max_time'] hours
                if time.time() - t0 > params['max_time'] * 3600:
                    print('-' * 89)
                    print("Max_time for training elapsed {:.2f} hours, so stopping"
                          .format(params['max_time']))
                    break

    except KeyboardInterrupt:
        print('-' * 89)
        print('Exiting from training early because of KeyboardInterrupt')

    _, test_mae = evaluate_network(model, device, test_loader, epoch)
    _, val_mae = evaluate_network(model, device, val_loader, epoch)
    _, train_mae = evaluate_network(model, device, train_loader, epoch)
    test_mae = test_mae.item()
    val_mae = val_mae.item()
    train_mae = train_mae.item()
    print("Train MAE: {:.4f}".format(train_mae))
    print("Val MAE: {:.4f}".format(val_mae))
    print("Test MAE: {:.4f}".format(test_mae))
    print("TOTAL TIME TAKEN: {:.4f}s".format(time.time() - t0))
    print("AVG TIME PER EPOCH: {:.4f}s".format(np.mean(per_epoch_time)))
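# This DGN variant additionally reads net_params['flip'] and
# params['print_epoch_interval']. A hypothetical invocation (all values are
# illustrative assumptions, not defaults from this repository):
#
#     params = {'seed': 41, 'batch_size': 128, 'init_lr': 1e-3,
#               'weight_decay': 3e-6, 'lr_reduce_factor': 0.5,
#               'lr_schedule_patience': 10, 'min_lr': 1e-5,
#               'max_time': 12, 'epochs': 500, 'print_epoch_interval': 5}
#     net_params = {'device': torch.device('cuda'), 'flip': True}
#     train_val_pipeline(dataset, params, net_params)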