def trainNet(model, train_loader, val_loader, device, static_map, start_epoch=0, globaliter_=0):
    # Print all of the hyperparameters of the training run:
    print("===== HYPERPARAMETERS =====")
    print("batch_size=", config['dataloader']['batch_size'])
    print("epochs=", config['num_epochs'])
    print('starting from epoch %i' % start_epoch)
    print("learning_rate=", config['optimizer']['lr'])
    print("network_depth=", config['model']['depth'])
    print("=" * 30)

    # define the optimizer & learning rate
    optim = torch.optim.SGD(model.parameters(), **config['optimizer'])
    scheduler = StepLR(optim, step_size=config['lr_step_size'], gamma=config['lr_gamma'])

    if config['cont_model_path'] is not None:
        log_dir = config['cont_model_path']
    else:
        log_dir = 'runs/Unet-' + datetime.now().strftime("%Y-%m-%d-%H-%M-%S-") + \
                  '-'.join(config['dataset']['cities'])
    writer = Visualizer(log_dir)

    # dump config file
    with open(os.path.join(log_dir, 'config.json'), 'w') as fp:
        json.dump(config, fp)

    # Time for printing
    training_start_time = time.time()
    globaliter = globaliter_

    # initialize the early_stopping object
    early_stopping = EarlyStopping(log_dir, patience=config['patience'], verbose=True)

    # Loop for n_epochs
    for epoch_idx, epoch in enumerate(range(start_epoch, config['num_epochs'])):
        writer.write_lr(optim, epoch)

        # train for one epoch
        globaliter = train(model, train_loader, static_map, optim, device, writer, epoch, globaliter)

        # At the end of the epoch, do a pass on the validation set
        val_loss = validate(model, val_loader, static_map, device, writer, globaliter)

        # At the end of the epoch, do a pass on the validation set only considering the test times
        # val_loss_testtimes = validate(model, val_loader_ttimes, device, writer, globaliter, if_testtimes=True)

        # early_stopping needs the validation loss to check if it has decreased,
        # and if it has, it will make a checkpoint of the current model
        early_stopping(val_loss, model, epoch + 1, globaliter)

        if early_stopping.early_stop:
            print("Early stopping")
            break

        if config['debug'] and epoch_idx >= 0:
            break

        scheduler.step(epoch)

    print("Training finished, took {:.2f}s".format(time.time() - training_start_time))

    # remember to close the tensorboard writer
    writer.close()
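# --- Reference only: a minimal EarlyStopping sketch (an assumption for illustration,
# --- not the repository's implementation). It shows the interface the training loops
# --- above and below rely on: it is called with the validation loss (plus optional
# --- metadata), checkpoints the model into log_dir on improvement, and sets
# --- `early_stop` after `patience` epochs without improvement.
import os

import torch


class EarlyStopping:
    def __init__(self, log_dir, patience=5, verbose=False):
        self.log_dir = log_dir
        self.patience = patience
        self.verbose = verbose
        self.counter = 0
        self.best_loss = float('inf')
        self.early_stop = False

    def __call__(self, val_loss, model, *extra):
        # extra positional arguments (e.g. epoch, globaliter) are accepted but unused in this sketch
        if val_loss < self.best_loss:
            if self.verbose:
                print('Validation loss decreased ({:.6f} -> {:.6f}); saving model.'.format(self.best_loss, val_loss))
            torch.save(model.state_dict(), os.path.join(self.log_dir, 'checkpoint.pt'))
            self.best_loss = val_loss
            self.counter = 0
        else:
            self.counter += 1
            if self.counter >= self.patience:
                self.early_stop = True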
def trainNet(model, train_loader, val_loader, device, adj, nn_ixs, edge_index, config, log_dir, coords=None):
    """
    Args:
        model: model to train
        train_loader: DataLoader over the training set
        val_loader: DataLoader over the validation set
        device: torch device to run on
        adj: graph adjacency matrix
        nn_ixs: node indices
        edge_index: graph edge index tensor
        config: training configuration dict
        log_dir: directory for the config dump, logs and checkpoints
        coords: optional node coordinates

    Returns:

    """
    # define the optimizer & learning rate
    optim = torch.optim.Adam(model.parameters(), **config['optimizer'])
    # scheduler = StepLR(optim, step_size=config['lr_step_size'], gamma=config['lr_gamma'])

    writer = Visualizer(log_dir)

    # dump config file
    with open(os.path.join(log_dir, 'config.json'), 'w') as fp:
        json.dump(config, fp)

    # Time for printing
    training_start_time = time.time()
    globaliter = 0

    # initialize the early_stopping object
    early_stopping = EarlyStopping(log_dir, patience=config['patience'], verbose=True)
    # adj = adj.to(device)

    batch_size = config['dataloader']['batch_size']
    print_every_step = config['print_every_step']

    # Loop for n_epochs
    for epoch_idx, epoch in enumerate(range(config['num_epochs'])):
        writer.write_lr(optim, globaliter)

        # train for one epoch
        globaliter = train(model=model, train_loader=train_loader, optim=optim, device=device,
                           writer=writer, epoch=epoch, globaliter=globaliter, adj=adj,
                           nn_ixs=nn_ixs, edge_index=edge_index, batch_size=batch_size,
                           coords=coords, print_every_step=print_every_step)

        # At the end of the epoch, do a pass on the validation set
        # val_loss = validate(model, val_loader, device, writer, globaliter, adj, nn_ixs, edge_index)
        val_loss = validate(model=model, val_loader=val_loader, device=device, adj=adj,
                            nn_ixs=nn_ixs, edge_index=edge_index, batch_size=batch_size,
                            coords=coords, writer=writer, globaliter=globaliter)

        # early_stopping needs the validation loss to check if it has decreased,
        # and if it has, it will make a checkpoint of the current model
        early_stopping(val_loss, model)

        if early_stopping.early_stop:
            print("Early stopping")
            break

        if config['debug'] and epoch_idx >= 0:
            break

        # scheduler.step()

    print("Training finished, took {:.2f}s".format(time.time() - training_start_time))

    # remember to close the writer
    writer.close()
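# --- Reference only: a hypothetical `config` covering the keys this graph-based
# --- trainNet reads; the values are illustrative placeholders, not settings taken
# --- from the original experiments.
config_example = {
    'optimizer': {'lr': 1e-3},          # expanded into torch.optim.Adam(model.parameters(), **config['optimizer'])
    'dataloader': {'batch_size': 4},
    'num_epochs': 50,
    'patience': 5,                      # early-stopping patience
    'print_every_step': 10,             # passed through to train() for progress logging
    'debug': False,                     # if True, stop after the first epoch
}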
# define data
dataset_name = 'Cora'
path = osp.join(os.getcwd(), '..', 'data', dataset_name)
dataset = Planetoid(path, dataset_name, T.NormalizeFeatures())
data = dataset[0]
data.batch = None
data.adj = to_dense_adj(data.edge_index)

# define logging
log_dir = os.path.join(
    '..', 'runs',
    config['modelname'] + '_' + datetime.now().strftime("%Y-%m-%d-%H-%M-%S") + '_' + config['model_log_info'])
writer = Visualizer(log_dir)
with open(os.path.join(log_dir, 'config.json'), 'w') as fp:
    json.dump(config, fp)

# define device
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

# model
model = diff_pool_net2(dataset, **config['model']).to(device)
data = data.to(device)

lr = config['optimizer']['lr']
optimizer = torch.optim.Adam(model.parameters(), lr=lr)

# initialize the early_stopping object
early_stopping = EarlyStopping(log_dir, patience=config['patience'],
def networkTraining(model, train_loader, val_loader, epochs, learning_rate, device, log_path, includeHeading):
    """
    Do the whole training
    """
    print('------ STARTING TRAINING ------')
    print('Number of epochs: ', epochs)
    print('Learning rate: ', learning_rate)
    print('Batch Size: ', config['batch_size'])
    print('City: ', config['city'])
    print('-' * 31)

    # Define optimizer
    optimizer = torch.optim.SGD(model.parameters(), lr=learning_rate)

    # Initialize log file
    writer = Visualizer(log_path)

    # dump config file
    with open(os.path.join(log_path, 'config.json'), 'w') as fp:
        json.dump(config, fp)

    startTime = time.time()
    iterator = 0
    val_loss_min = np.Inf
    counter = 0

    # For each epoch
    for epoch in range(epochs):
        writer.write_lr(optimizer, iterator)

        # train for one epoch
        iterator = training(model, train_loader, optimizer, device, writer, epoch, iterator)

        # Early stopping (training failed)
        if iterator == -1:
            duration = time.time() - startTime
            print("Training finished (Error), took {:.2f}s".format(duration))
            break

        # get validation loss and save images
        valLoss = validation(model, val_loader, device, writer, iterator, log_path, includeHeading)

        # Early stopping (didn't improve for 2 epochs)
        if valLoss < val_loss_min:
            torch.save(model.state_dict(), os.path.join(log_path, 'checkpoint.pt'))
            val_loss_min = valLoss
            counter = 0
        elif counter == 1:
            duration = time.time() - startTime
            print("Training finished (early), took {:.2f}s".format(duration))
            break
        else:
            counter += 1

    # Dump statistics in tensorboard file
    duration = time.time() - startTime
    print("Training finished, took {:.2f}s".format(duration))
    writer.write_text('{:.2f}'.format(duration), 'Time')
    if device != 'cpu':
        mem = torch.cuda.max_memory_allocated(device)
        mem = mem // 1048576  # bytes -> MiB
        writer.write_text('{:d}'.format(mem), 'Memory')
    writer.close()