def training_eval():
    """Benchmark per-iteration training time (forward + loss + backward + step).

    Relies on module-level globals: ``model``, ``data``, ``target``, ``args``
    and ``gpu_usage``. Prints each run's duration plus the mean and standard
    deviation over ``args.nruns`` runs.
    """
    print("Benchmarking training time...")
    # Simplified from getattr(optim, 'Adam')(...) — a dynamic lookup of a
    # constant attribute name is just optim.Adam.
    optimizer = optim.Adam(model.parameters(), lr=0.001)
    model.train()
    all_durations = []
    out = model(data)  # warm-up pass so GPU kernels/caches are initialized
    for i in range(args.nruns):  # plain range instead of np.arange
        # start timer
        start = time()
        optimizer.zero_grad()
        out = model(data)
        loss = F.nll_loss(out, target)
        loss.backward()
        optimizer.step()
        # end timer.
        # NOTE(review): CUDA kernels launch asynchronously; without
        # torch.cuda.synchronize() these timings may under-report — confirm.
        gpu_usage()
        duration = time() - start
        # print and save duration
        print(f"Run: {i} \t Duration: {duration}")
        all_durations.append(duration)
    # print mean and std of durations.
    all_durations = np.array(all_durations)
    mean_time = np.mean(all_durations)
    std_time = np.std(all_durations)
    print(f"mean time: {mean_time} \t std time: {std_time}")
def free_gpu_cache():
    """Release cached GPU memory by emptying PyTorch's allocator cache and
    resetting the CUDA context on device 0, printing usage before and after.
    """
    print("Initial GPU Usage")
    gpu_usage()
    # Drop PyTorch's cached (but unused) allocator blocks first.
    torch.cuda.empty_cache()
    # Tear down and re-create the CUDA context on device 0 to reclaim memory
    # held by the context itself. `cuda` here is presumably numba.cuda
    # (select_device/close are its API) — TODO confirm against the imports.
    cuda.select_device(0)
    cuda.close()
    cuda.select_device(0)
    print("GPU Usage after emptying the cache")
    gpu_usage()
def simplify(sketch_np_array, imgbasename):
    """Simplify a rough sketch with the pretrained Torch7 GAN model and export
    the result as ``<imgbasename>.png`` plus an SVG (via ``png2svg``).

    Args:
        sketch_np_array: sketch image as a numpy array (PIL-compatible).
        imgbasename: base name for the PNG/SVG output files.
    """
    t0 = time.time()
    use_cuda = torch.cuda.device_count() > 0
    # Legacy Torch7 checkpoint: bundles the model and normalization stats.
    cache = load_lua('model_gan.t7')
    model = cache.model
    immean = cache.mean
    imstd = cache.std
    model.evaluate()
    data = Image.fromarray(sketch_np_array)
    w, h = data.size[0], data.size[1]
    # The network requires dimensions divisible by 8; compute the
    # right/bottom padding needed to reach the next multiple.
    pw = 8 - (w % 8) if w % 8 != 0 else 0
    ph = 8 - (h % 8) if h % 8 != 0 else 0
    # Normalize with the model's training mean/std, add a batch dimension.
    data = ((transforms.ToTensor()(data) - immean) / imstd).unsqueeze(0)
    if pw != 0 or ph != 0:
        data = torch.nn.ReplicationPad2d((0, pw, 0, ph))(data).data
    if use_cuda:
        print("CUDA device count :", torch.cuda.device_count())
        print("GPU :", torch.cuda.get_device_name(0))
        print('Initial GPU Usage')
        gpu_usage()
        # (Translated from Korean.) To run on the GPU, uncomment the
        # model.cuda() line below and comment out the CPU `pred = ...` line
        # under it. GPU inference is faster but can hit "CUDA out of memory"
        # and require repeated restarts; if unsure, keep the CPU path.
        # pred = model.cuda().forward(data.cuda()).float()
        pred = model.forward(data)
    else:
        pred = model.forward(data)
    print('GPU Usage after allocating a bunch of Tensors')
    gpu_usage()
    pngname = imgbasename + '.png'
    save_image(pred[0], pngname)
    png2svg(pngname, imgbasename)
    t1 = time.time()
    total = t1 - t0
    print(total, "sec spent")
def testing_eval():
    """Benchmark per-iteration inference time for the global ``model``.

    Relies on module-level globals: ``model``, ``data``, ``args`` and
    ``gpu_usage``. Prints each run's duration plus the mean and standard
    deviation over ``args.nruns`` runs.
    """
    print("Benchmarking test time...")
    model.eval()
    all_durations = []
    _ = model(data)  # warm-up pass so GPU kernels/caches are initialized
    with torch.no_grad():
        for i in range(args.nruns):  # plain range instead of np.arange
            # time forward pass
            start = time()
            _ = model(data)
            # NOTE(review): CUDA kernels launch asynchronously; without
            # torch.cuda.synchronize() these timings may under-report — confirm.
            gpu_usage()
            duration = time() - start
            # save duration
            print(f"Run: {i} \t Duration: {duration}")
            all_durations.append(duration)
    # print mean and std of durations.
    all_durations = np.array(all_durations)
    mean_time = np.mean(all_durations)
    std_time = np.std(all_durations)
    print(f"mean time: {mean_time} \t std time: {std_time}")
def testNet(net, test_dataset, device):
    """Evaluate classification accuracy of ``net`` over ``test_dataset``.

    Args:
        net: trained model, already placed on ``device``.
        test_dataset: iterable of (inputs, labels) batches.
        device: torch device the inputs are moved to.
    """
    print("Initial GPU Usage: ")
    gpu_usage()
    test_start_time = time()
    # BUG FIX: the original did `"..." + test_start_time`, concatenating a
    # float to a str and raising TypeError.
    print("Test started at: " + str(test_start_time))
    correct = 0
    total = 0
    with torch.no_grad():
        for inputs, labels in test_dataset:
            inputs = Variable(inputs)
            inputs = inputs.to(device)
            test_outputs = net(inputs)
            # BUG FIX: the original converted outputs and labels to numpy
            # *before* calling torch.max / labels.size(0) / .item(), all of
            # which fail on numpy arrays. Keep the computation on tensors.
            _, predicted = torch.max(test_outputs.data, 1)
            labels = labels.to(predicted.device)
            total += labels.size(0)
            correct += (predicted == labels).sum().item()
    print("Accuracy of network: %d" % (100 * correct / total))
def load_train_evaluate_save(mode):
    """Parse CLI arguments, load the OpenQA dataset, build or restore the
    model, run the train/validation loop, and save the best model according
    to ``args.valid_metric``.

    Args:
        mode: forwarded to ``add_main_args`` to select which CLI arguments
            are registered (also used as ``args.mode``: 'all', 'reader' or
            'selector').
    """
    # -------------------------------------------------------------------------
    # PARSER
    # -------------------------------------------------------------------------
    # Parse cmdline args and setup environment
    parser = argparse.ArgumentParser(
        'OpenQA Question Answering Model',
        formatter_class=argparse.ArgumentDefaultsHelpFormatter
    )
    add_main_args(parser, mode)
    config.add_model_args(parser)
    args = parser.parse_args()
    set_defaults(args)

    # -------------------------------------------------------------------------
    # INITIALIZATIONS
    # -------------------------------------------------------------------------
    # CUDA
    args.cuda = not args.no_cuda and torch.cuda.is_available()
    # NOTE(review): this assert makes every later `if args.cuda:` check always
    # true, and asserts are stripped under `python -O`; a hard CUDA
    # requirement would be clearer as an explicit raise. Kept for
    # compatibility with existing behavior.
    assert(args.cuda)
    if args.cuda:
        torch.cuda.set_device(args.gpu)  # no-op if args.gpu is negative
        torch.cuda.empty_cache()

    # Set random state
    np.random.seed(args.random_seed)
    torch.manual_seed(args.random_seed)
    if args.cuda:
        torch.cuda.manual_seed(args.random_seed)

    if args.log_file:
        if args.checkpoint:
            logfile = logging.FileHandler(args.log_file, 'a')
        else:
            logfile = logging.FileHandler(args.log_file, 'w')
        logfile.setFormatter(txtfmt)
        logger.addHandler(logfile)
    logger.info('COMMAND: {}'.format(' '.join(sys.argv)))

    # GPU cleaning
    # BUG FIX: the original looped `for obj in gc.get_objects(): del obj`,
    # which only unbinds the loop variable each iteration and frees nothing.
    # gc.collect() + empty_cache() are the effective part and are kept.
    gc.collect()
    torch.cuda.empty_cache()

    # --------------------------------------------------------------------------
    # DATASET
    # -------------------------------------------------------------------------
    logger.info('-' * 100)
    logger.info('Load data files')
    dataset = args.dataset  # == 'searchqa', 'quasart' or 'unftriviaqa'
    filename_train_docs = sys_dir+'/data/datasets/'+dataset+'/train.json'
    filename_dev_docs = sys_dir+'/data/datasets/'+dataset+'/dev.json'
    filename_test_docs = sys_dir+'/data/datasets/'+dataset+'/test.json'
    filename_train = sys_dir+'/data/datasets/'+dataset+'/train.txt'
    filename_dev = sys_dir+'/data/datasets/'+dataset+'/dev.txt'
    filename_test = sys_dir+'/data/datasets/'+dataset+'/test.txt'
    train_docs, train_questions, train_len = utils.load_data_with_doc(
        args, filename_train_docs)
    logger.info(len(train_docs))
    logger.info(len(train_questions))
    train_exs_with_doc = read_data(filename_train, train_questions, train_len)
    logger.info('Num train examples = {}'.format(str(len(train_exs_with_doc))))
    dev_docs, dev_questions, _ = utils.load_data_with_doc(
        args, filename_dev_docs)
    logger.info(len(dev_docs))
    logger.info(len(dev_questions))
    dev_exs_with_doc = read_data(filename_dev, dev_questions)
    logger.info('Num dev examples = {}'.format(str(len(dev_exs_with_doc))))
    test_docs, test_questions, _ = utils.load_data_with_doc(
        args, filename_test_docs)
    logger.info(len(test_docs))
    logger.info(len(test_questions))
    test_exs_with_doc = read_data(filename_test, test_questions)
    logger.info('Num test examples = {}'.format(str(len(test_exs_with_doc))))

    # --------------------------------------------------------------------------
    # MODEL SETUP
    # -------------------------------------------------------------------------
    logger.info('-' * 100)
    start_epoch = 0
    if args.checkpoint and os.path.isfile(args.model_file + '.checkpoint'):
        # Just resume training, no modifications.
        logger.info('Found a checkpoint...')
        checkpoint_file = args.model_file + '.checkpoint'
        model, start_epoch = DocReader.load_checkpoint(checkpoint_file)
        # NOTE(review): resetting start_epoch here discards the epoch stored
        # in the checkpoint, contradicting the "resume" comment above —
        # confirm intent before removing.
        start_epoch = 0
    else:
        # Training starts fresh. But the model state is either pretrained or
        # newly (randomly) initialized.
        if args.pretrained:
            logger.info('Using pretrained model...')
            model = DocReader.load(args.pretrained, args)
            if args.expand_dictionary:
                logger.info('Expanding dictionary for new data...')
                # Add words in training and dev examples
                words = utils.load_words(
                    args, train_exs_with_doc + dev_exs_with_doc)
                added = model.expand_dictionary(words)
                # Load pretrained embeddings for added words
                if args.embedding_file:
                    model.load_embeddings(added, args.embedding_file)
        else:
            logger.info('Training model from scratch...')
            model = init_from_scratch(args, train_docs)

    # Set up optimizer
    model.init_optimizer()
    # Use the GPU?
    if args.cuda:
        model.cuda()
    # Use multiple GPUs?
    if args.parallel:
        model.parallelize()
    # GPU usage
    if args.show_cuda_stats:
        gpu_usage()

    # --------------------------------------------------------------------------
    # DATA ITERATORS
    # -------------------------------------------------------------------------
    # Two datasets: train and dev. If we sort by length it's faster.
    logger.info('-' * 100)
    logger.info('Make data loaders')
    # best practices for memory management are available here:
    # https://pytorch.org/docs/stable/notes/cuda.html#best-practices
    train_dataset_with_doc = data.ReaderDataset_with_Doc(
        train_exs_with_doc, model, train_docs, single_answer=True)
    train_sampler_with_doc = torch.utils.data.sampler.SequentialSampler(
        train_dataset_with_doc)
    train_loader_with_doc = torch.utils.data.DataLoader(
        train_dataset_with_doc,
        batch_size=args.batch_size,
        sampler=train_sampler_with_doc,
        num_workers=args.data_workers,
        collate_fn=vector.batchify_with_docs,
        pin_memory=args.cuda,
    )
    dev_dataset_with_doc = data.ReaderDataset_with_Doc(
        dev_exs_with_doc, model, dev_docs, single_answer=False)
    dev_sampler_with_doc = torch.utils.data.sampler.SequentialSampler(
        dev_dataset_with_doc)
    dev_loader_with_doc = torch.utils.data.DataLoader(
        dev_dataset_with_doc,
        batch_size=args.test_batch_size,
        sampler=dev_sampler_with_doc,
        num_workers=args.data_workers,
        collate_fn=vector.batchify_with_docs,
        pin_memory=args.cuda,
    )
    test_dataset_with_doc = data.ReaderDataset_with_Doc(
        test_exs_with_doc, model, test_docs, single_answer=False)
    test_sampler_with_doc = torch.utils.data.sampler.SequentialSampler(
        test_dataset_with_doc)
    test_loader_with_doc = torch.utils.data.DataLoader(
        test_dataset_with_doc,
        batch_size=args.test_batch_size,
        sampler=test_sampler_with_doc,
        num_workers=args.data_workers,
        collate_fn=vector.batchify_with_docs,
        pin_memory=args.cuda,
    )

    # -------------------------------------------------------------------------
    # PRINT CONFIG
    # -------------------------------------------------------------------------
    logger.info('-' * 100)
    logger.info('CONFIG:')
    print(json.dumps(vars(args), indent=4, sort_keys=True))

    # --------------------------------------------------------------------------
    # TRAIN/VALIDATION LOOP
    # -------------------------------------------------------------------------
    logger.info('-' * 100)
    logger.info('Starting training...')
    stats = {'timer': utils.Timer(), 'epoch': 0, 'best_valid': 0}
    for epoch in range(start_epoch, args.num_epochs):
        stats['epoch'] = epoch

        # Train
        logger.info('-' * 100)
        logger.info('Mode: ' + args.mode)
        if (args.mode == 'all'):
            train(args, train_loader_with_doc, model, stats,
                  train_exs_with_doc, train_docs)
        if (args.mode == 'reader'):
            pretrain_reader(args, train_loader_with_doc, model, stats,
                            train_exs_with_doc, train_docs)
        if (args.mode == 'selector'):
            pretrain_selector(args, train_loader_with_doc, model, stats,
                              train_exs_with_doc, train_docs)

        # ---------------------------------------------------------------------
        with torch.no_grad():
            # -----------------------------------------------------------------
            result = validate_with_doc(args, dev_loader_with_doc, model,
                                       stats, dev_exs_with_doc, dev_docs,
                                       'dev')
            validate_with_doc(args, train_loader_with_doc, model, stats,
                              train_exs_with_doc, train_docs, 'train')
            if (dataset=='webquestions' or dataset=='CuratedTrec'):
                # not applicable
                result = validate_with_doc(args, test_loader_with_doc, model,
                                           stats, test_exs_with_doc,
                                           test_docs, 'test')
            else:
                # dataset == 'searchqa' by default, 'squad', 'quasart' or
                # 'unftriviaqa'
                validate_with_doc(args, test_loader_with_doc, model, stats,
                                  test_exs_with_doc, test_docs, 'test')
        # ---------------------------------------------------------------------

        # Save model with improved evaluation results
        if result[args.valid_metric] > stats['best_valid']:
            txt = 'Best valid: {} = {:.2f} (epoch {}, {} updates)'
            logger.info(txt.format(
                args.valid_metric, result[args.valid_metric],
                stats['epoch'], model.updates))
            model.save(args.model_file)
            stats['best_valid'] = result[args.valid_metric]

        # Clean the gpu before running a new iteration
        if args.cuda:
            # BUG FIX: as above, `del obj` over gc.get_objects() frees
            # nothing; rely on collect/synchronize/empty_cache instead.
            gc.collect()  # force garbage collection
            torch.cuda.synchronize(device=model.device)  # wait for the gpu
            torch.cuda.empty_cache()  # force garbage removal
            # CUDA memory
            txt_cuda(show=True, txt='after garbage collection')
def trainNet(net, batch_size, number_of_epochs, learning_rate):
    """Train ``net`` for ``number_of_epochs`` epochs, validating and saving
    the state dict after every epoch.

    Relies on module-level globals: ``get_train_loader``,
    ``createLossAndOptimizer``, ``device``, ``val_loader`` and ``gpu_usage``.

    Args:
        net: the model to train (already on ``device``).
        batch_size: training batch size passed to ``get_train_loader``.
        number_of_epochs: number of passes over the training data.
        learning_rate: optimizer learning rate.
    """
    # Print all the hyperparameters of the training iteration
    print("Hyperparameters: ")
    print("Batch size = ", batch_size)
    print("epochs = ", number_of_epochs)
    print("Learning Rate = ", learning_rate)

    # Get Training Data
    train_loader = get_train_loader(batch_size)
    number_of_batches = len(train_loader)

    # Create our loss and optimizer functions
    loss, optimizer = createLossAndOptimizer(net, learning_rate)

    # Keep track of time
    training_start_time = time.time()
    print("GPU Usage before starting the first epoch")
    gpu_usage()
    print(number_of_epochs)

    # Loop for number_of_epochs
    for epoch in range(number_of_epochs):
        train_loss = 0.0
        total_val_loss = 0
        # NOTE(review): if the loader has fewer than 10 batches this is 0 and
        # the `% (print_every + 1)` below fires on every batch — confirm.
        print_every = number_of_batches // 10
        start_time = time.time()
        total_train_loss = 0.0
        gpu_usage()
        for i, data in enumerate(train_loader, 0):
            # Get inputs and move them to the training device
            inputs, labels = data
            inputs = inputs.to(device)
            labels = labels.to(device)
            inputs, labels = Variable(inputs), Variable(labels)

            # Zero the parameter gradients, forward, backward, step
            optimizer.zero_grad()
            outputs = net(inputs)
            loss_size = loss(outputs, labels)
            del inputs, labels  # free activations early; the graph keeps refs
            loss_size.backward()
            optimizer.step()

            # Accumulate statistics
            train_loss += loss_size.data
            total_train_loss += loss_size.data

            # Print every 10th batch of an epoch
            if (i + 1) % (print_every + 1) == 0:
                # NOTE(review): the progress print here was left commented
                # out in the original (it also called start_time() — a typo):
                # print("Epoch {}, {:d}% \t Train loss: {:.2f} took: {:.2f}s".format(
                #     epoch+1, int(100*(i+1)/number_of_batches),
                #     train_loss / print_every, time.time() - start_time))
                gpu_usage()
                # Reset running loss and time
                train_loss = 0.0
                start_time = time.time()

        # At the end of the epoch, do a pass on the validation set.
        # BUG FIX: the original ran validation with autograd enabled, building
        # gradient graphs for nothing; no_grad() keeps the printed numbers
        # identical while saving memory.
        with torch.no_grad():
            for inputs, labels in val_loader:
                inputs, labels = Variable(inputs), Variable(labels)
                inputs = inputs.to(device)
                labels = labels.to(device)
                val_outputs = net(inputs)
                val_loss_size = loss(val_outputs, labels)
                total_val_loss += val_loss_size.data
        print("validation loss = {:.2f}".format(total_val_loss / len(val_loader)))
        # NOTE(review): hard-coded Google Drive path; saved every epoch.
        torch.save(net.state_dict(), '/content/drive/My Drive/IDP/test_model.pt')

    print("Training finished. Took: {:.2f}s".format(time.time() - training_start_time))
def run(self, early_stopping_patience=10, verbose=False, fout=None,
        custom_params=None):
    """Run a hyperparameter search over ``self.param_list`` with k-fold
    cross-validation, logging every configuration's results via
    ``save_logs`` (also on Ctrl-C / SystemExit).

    Args:
        early_stopping_patience: epochs without improvement before a fold's
            training stops (forwarded to ``model.train``).
        verbose: forwarded to train/test; also prints each fold's best result.
        fout: destination passed to ``save_logs``.
        custom_params: optional replacement for ``self.param_list``.
    """
    # user can customize the parameter sets
    if custom_params:
        self.param_list = custom_params
    if self.param_list:
        try:
            stime = time.time()
            all_logs = []
            n_iter = len(self.param_list)
            for i, hyperparams in enumerate(self.param_list):
                #if i>1: break
                # Per-configuration try: one bad config is logged and skipped
                # rather than aborting the whole sweep.
                try:
                    print("-"*100)
                    print(i, hyperparams)
                    print("-"*100)
                    # TODO combine user params with hyper params
                    hyperparams['epochs'] = self.user_params['epochs']
                    hyperparams['save_model'] = self.user_params['save_model']
                    hyperparams['log_interval'] = self.user_params['log_interval']
                    hyperparams['use_cuda'] = self.user_params['use_cuda']
                    hyperparams['conv_config'] = self.user_params['conv_config'][hyperparams['conv_config_num']]
                    # fc3_nodes == 0 means a two-layer dense head.
                    if hyperparams['fc3_nodes']==0:
                        hyperparams['dense_config'] = [hyperparams['fc1_nodes'],
                                                       hyperparams['fc2_nodes']]
                    else:
                        hyperparams['dense_config'] = [hyperparams['fc1_nodes'],
                                                       hyperparams['fc2_nodes'],
                                                       hyperparams['fc3_nodes']]
                    cv_performance = []
                    for k in range(self.user_params['kfold']):
                        # One model checkpoint per (config, fold) pair.
                        hyperparams['model_output'] = "{}_{}_{}".format(
                            self.user_params['model_output'], i, k)
                        ibm_dataset, train_loader, valid_loader = get_train_valid_split(
                            ibm_data=self.train_data,
                            to_tensor=self.user_params['to_tensor'],
                            valid_split=self.user_params['valid_split'],
                            batch_size=hyperparams['batch_size'],
                            valid_batch_size=self.user_params['valid_batch_size'],
                            mean_for_normalize=self.user_params['train_mean'],
                            std_for_normalize=self.user_params['train_std'],
                            k_fold_idx=k,
                            use_cuda=self.user_params['use_cuda'],
                            random_seed=self.random_seed
                        )
                        model = self.model(train_loader=train_loader,
                                           valid_loader=valid_loader,
                                           hyperparams=hyperparams,
                                           optimize_fn=self.user_params['optimize_fn'],
                                           loss_fn=self.user_params['loss_fn'],
                                           random_seed=self.random_seed)
                        gpu_usage()
                        model.train(early_stopping_patience=early_stopping_patience,
                                    verbose=verbose)
                        model.test(test_loader=self.test_loader, verbose=verbose)
                        # Training failed to produce a result: skip remaining folds.
                        if model.best_performance is None:
                            break
                        model.best_performance['model_output'] = hyperparams['model_output']
                        if verbose:
                            print("Best:", model.best_performance)
                        cv_performance.append(model.best_performance)
                        hyperparams['total_params'] = model.total_params
                        hyperparams['total_size'] = model.total_size
                        # Release GPU memory between folds.
                        torch.cuda.empty_cache()
                        gpu_usage()
                    hyperparams['cv'] = cv_performance
                    hyperparams['idx'] = i
                    all_logs.append(hyperparams)
                    print("-"*100)
                    print("{}/{} Time:{}min".format(i, n_iter, (time.time()-stime)/60))
                    print("-"*100)
                except Exception as e:
                    # Best-effort: report and continue with the next config.
                    print(e)
            save_logs(all_logs, fout)
        except (KeyboardInterrupt, SystemExit):
            # Persist whatever finished before the interruption.
            save_logs(all_logs, fout)
def trainNet(self, device, net, number_of_epochs, learning_rate,
             training_dataset, validation_dataset, path_to_tensorboard_log,
             path_to_saved_model):
    """Train ``net``, logging losses/accuracy to TensorBoard and saving the
    state dict whenever the validation loss improves.

    Args:
        device: torch device the network is moved to.
        net: the model to train.
        number_of_epochs: number of passes over ``training_dataset``.
        learning_rate: optimizer learning rate.
        training_dataset: iterable of (inputs, labels) training batches.
        validation_dataset: iterable of (inputs, labels) validation batches.
        path_to_tensorboard_log: unused here; SummaryWriter uses its default
            log dir — TODO confirm intent.
        path_to_saved_model: file path for the best model's state dict.
    """
    print("Initial GPU Usage")
    gpu_usage()
    # Get Training Data
    number_of_batches = len(training_dataset)
    # Create our loss and optimizer functions
    loss, optimizer = self.createLossAndOptimizer(net, learning_rate)
    # Keep track of time
    training_start_time = time.time()
    print("GPU Usage before starting the first epoch")
    gpu_usage()
    print(number_of_epochs)

    # initialize the tensorboard
    # in the command line, navigate to the root folder of the project and then type:
    # tensorboard --logdir=runs
    # after launching it, navigate to the following website in the browser:
    # http://localhost:6006/
    writer = SummaryWriter()

    # get some random training images
    dataiter = iter(training_dataset)
    # BUG FIX: `dataiter.next()` is the removed Python-2 idiom (DataLoader
    # iterators no longer expose .next()); use the builtin next().
    images, labels = next(dataiter)
    # create grid of images
    img_grid = torchvision.utils.make_grid(images)
    # show images
    # matplotlib_imshow(img_grid, one_channel=True)
    # write to tensorboard
    writer.add_image('idp_sr_training_images', img_grid)
    writer.add_graph(net, images)

    # Print model's state_dict
    print("Model's state_dict:")
    for param_tensor in net.state_dict():
        print(param_tensor, "\t", net.state_dict()[param_tensor].size())
    # Print optimizer's state_dict
    print("Optimizer's state_dict:")
    for var_name in optimizer.state_dict():
        print(var_name, "\t", optimizer.state_dict()[var_name])

    # transfer the network into the GPU
    net.to(device)
    # start with the largest representable "loss" so the first epoch saves
    max_val = sys.maxsize
    lowest_validation_loss = max_val

    # Loop for number_of_epochs
    number_of_minibatches = 1
    for epoch in range(number_of_epochs):
        print("inside for loop")
        train_loss = 0.0
        total_val_loss = 0
        # NOTE(review): 0 when there are fewer than 10 batches, which makes
        # the division below raise — confirm dataset size.
        print_every = number_of_batches // 10
        start_time = time.time()
        total_train_loss = 0.0
        print("GPU Usage in epoch: ", epoch)
        gpu_usage()
        net.train()
        for i, data in enumerate(training_dataset, 0):
            # Get inputs and move them to the training device
            inputs, labels = data
            inputs = inputs.to(self.device)
            labels = labels.to(self.device)
            inputs, labels = Variable(inputs), Variable(labels)

            # Zero the parameter gradients, forward pass, compute loss
            optimizer.zero_grad()
            outputs = net(inputs)
            loss_size = loss(outputs, labels)
            train_loss += loss_size.data
            total_train_loss += loss_size.data

            # Accuracy is computed on CPU copies of the batch
            outputs = outputs.data.cpu().numpy()
            labels = labels.data.cpu().numpy()
            current_accuracy = self.get_accuracy(outputs, labels)

            del inputs, labels  # free device tensors; the graph keeps refs
            loss_size.backward()
            optimizer.step()

            # Log every 10th batch of the epoch
            if (i + 1) % (print_every + 1) == 0:
                current_avg_loss = train_loss / print_every
                print("Epoch {}, {:d}% \t Train loss: {:.4f} took: {:.4f}s".format(
                    epoch + 1, int(100 * (i + 1) / number_of_batches),
                    current_avg_loss, time.time() - start_time))
                writer.add_scalar('Mini-batch Training Loss',
                                  current_avg_loss, number_of_minibatches)
                writer.add_scalar('Mini-batch accuracy:',
                                  current_accuracy, number_of_minibatches)
                number_of_minibatches += 1
                print("GPU Usage after 10th batch:")
                gpu_usage()
                # Reset running loss and time
                train_loss = 0.0
                start_time = time.time()

        # At the end of the epoch, do a pass on the validation set
        writer.add_histogram('conv1.bias', net.conv1.bias, epoch + 1)
        writer.add_histogram('conv1.weight', net.conv1.weight, epoch + 1)
        writer.add_histogram('conv1.weight.grad', net.conv1.weight.grad, epoch + 1)
        net.eval()
        with torch.no_grad():
            for inputs, labels in validation_dataset:
                inputs, labels = Variable(inputs), Variable(labels)
                inputs = inputs.to(self.device)
                labels = labels.to(self.device)
                # Forward pass
                val_outputs = net(inputs)
                val_loss_size = loss(val_outputs, labels)
                total_val_loss += val_loss_size.data
                val_outputs = val_outputs.data.cpu().numpy()
                labels = labels.data.cpu().numpy()
                val_accuracy = self.get_accuracy(val_outputs, labels)
                # NOTE(review): written per batch at the same epoch step, so
                # only the last batch's accuracy survives — confirm intent.
                writer.add_scalar('Validation accuracy: ', val_accuracy, epoch + 1)

        current_validation_loss = total_val_loss / len(validation_dataset)
        print("validation loss = {:.4f}".format(current_validation_loss))
        # Keep only the best model (lowest validation loss)
        if current_validation_loss < lowest_validation_loss:
            lowest_validation_loss = current_validation_loss
            torch.save(net.state_dict(), path_to_saved_model)
            print("Saving model..., path: ", path_to_saved_model)
        else:
            print("Validation loss increased. Keeping the previous model.")

    writer.close()
    print("Training finished. Took: {:.4f}s".format(time.time() - training_start_time))
from pkg_resources import resource_stream from PIL import Image import skimage.color from skimage.segmentation import mark_boundaries from itertools import chain from pysnic.algorithms.snic import snic, compute_grid from pysnic.ndim.operations_collections import nd_computations from pysnic.metric.snic import create_augmented_snic_distance from torch import einsum use_cuda = torch.cuda.is_available() torch.cuda.empty_cache() print("Initial GPU Usage") gpu_usage() parser = argparse.ArgumentParser(description='PyTorch Group Affinity Unsupervised Segmentation') parser.add_argument('--nChannel', metavar='N', default=60, type=int, help='number of channels') parser.add_argument('--nGroup', metavar='G', default=60, type=int, help='number of channels') parser.add_argument('--maxIter', metavar='T', default=1000, type=int, help='number of maximum iterations') parser.add_argument('--minLabels', metavar='minL', default=3, type=int, help='minimum number of labels') parser.add_argument('--lr', metavar='LR', default=0.1, type=float, help='learning rate') parser.add_argument('--nConv', metavar='M', default=2, type=int, help='number of convolutional layers') parser.add_argument('--num_superpixels', metavar='K', default=5000, type=int,
def training(self):
    """Run MixMatch training for ``self.n_epochs`` epochs over the labeled
    loader and the K unlabeled loaders, with periodic checkpointing,
    evaluation and testing.
    """
    if self.on_cuda:
        print('training on GPU')
    else:
        print('training on CPU')
    optimizer = optim.Adam(self.Net.parameters(), lr=self.lr)
    t0 = time.time()
    idx = 0
    print('begining training')
    for epoch in range(self.n_epochs):
        # Ramp the unlabeled-loss weight up by 3 per epoch, capped at 400.
        self.ULoss_factor = min(self.ULoss_factor + 3, 400)
        labeled_loss_epoch = 0  # avg cross entropy loss
        unlabeled_loss_epoch = 0  # avg L2**2 loss
        iter_unlabeled_loaders = [
            iter(loader) for loader in self.unlabeled_loaders
        ]
        for local_X, local_y in self.train_loader:
            idx += 1
            one_hot_y = self.make_one_hot(local_y)
            if self.on_cuda:
                local_X = local_X.to('cuda')
                one_hot_y = one_hot_y.to('cuda')
                local_Us = [
                    next(loader)[0].to('cuda')
                    for loader in iter_unlabeled_loaders
                ]
            else:
                local_Us = [
                    next(loader)[0] for loader in iter_unlabeled_loaders
                ]
            predictions_Us = self.prediction_unlabeled(local_Us)
            Labels, Ws = self.concatenate_shuffle(local_Us, local_X,
                                                  one_hot_y, predictions_Us)

            # MixUp on local_X and random batch from Ws
            lmbda = np.random.beta(self.alpha, self.alpha)
            local_X_W = lmbda * local_X + (1 - lmbda) * Ws[:len(local_X)]
            local_y_W = lmbda * one_hot_y + (1 - lmbda) * Labels[:len(local_X)]

            # prediction and gradient step
            prediction = self.Net(local_X_W)
            loss_X = cross_entropy(prediction, local_y_W)  # mean
            labeled_loss_epoch += float(loss_X)

            # MixUp on local_Us and remaining random batches from Ws
            loss_U = 0
            for i in range(self.K):
                # BUG FIX: the original wrapped this Beta sample in
                # np.long(), truncating a value in (0, 1) to 0 — which
                # silently dropped the unlabeled batch from the mix (and
                # np.long no longer exists in modern NumPy). Use the raw
                # sample, matching the labeled MixUp above.
                lmbda = np.random.beta(self.alpha, self.alpha)
                local_U_W = lmbda * local_Us[i] + (
                    1 - lmbda) * Ws[len(local_X):][i * self.batch_size_u:
                                                   (i + 1) * self.batch_size_u]
                local_y_W = lmbda * predictions_Us + (1 - lmbda) * Labels[
                    len(local_X):][i * self.batch_size_u:
                                   (i + 1) * self.batch_size_u]
                prediction = self.softmax(self.Net(local_U_W), dim=1)
                loss_U += self.MSELoss(prediction, local_y_W)
            loss_U /= (Ws.shape[0] - self.batch_size_l) * self.n_classes
            unlabeled_loss_epoch += float(loss_U)

            # gradient descent
            batch_loss = loss_X + self.ULoss_factor * loss_U
            if idx % 50 == 0:
                print(
                    f"batch_loss: {batch_loss} -- loss_X: {int(100*(loss_X/batch_loss).item())}% -- loss_U: {int(100*(loss_U*self.ULoss_factor/batch_loss).item())}%"
                )
                gpu_usage()
            optimizer.zero_grad()
            batch_loss.backward()
            optimizer.step()
            del loss_X, loss_U, prediction, batch_loss
            if self.on_cuda:
                torch.cuda.empty_cache()

        if self.save_path is not None and (epoch + 1) % self.checkpoint_save == 0:
            model_location = os.path.join(self.save_path,
                                          f'MixMatch_{epoch+1}.pth')
            torch.save(self.Net.state_dict(), model_location)

        labeled_loss_epoch /= len(self.train_loader)
        self.l_training_losses.append(float(labeled_loss_epoch))
        self.u_training_losses.append(
            float(self.ULoss_factor * unlabeled_loss_epoch))
        accuracy, val_loss = self.evaluate()

        # timing
        current_time = get_duration(t0, time.time())
        print(
            f'epoch {epoch+1} --- l_train_loss = {labeled_loss_epoch} -- u_train_loss = {self.ULoss_factor * unlabeled_loss_epoch} --- val_loss = {val_loss} -- val_accuracy = {accuracy}%'
            f'--- time: {current_time}')
        del predictions_Us, local_X, local_y, local_Us, Ws, Labels, local_X_W, local_y_W, local_U_W, val_loss
        if self.on_cuda:
            torch.cuda.empty_cache()
            gpu_usage()

        # testing
        if (epoch + 1) % self.checkpoint_test == 0:
            self.testing(epoch)

    # accuracy
    self.testing(epoch)
    self.save_losses(MixMatch=True)
    self.plot_results(MixMatch=True)
def main():
    """Entry point: parse the YAML config, build data loaders, model and
    optimizer (optionally resuming from a checkpoint), then run the L-CNN
    trainer. On failure, removes an (almost) empty output directory.
    """
    torch.cuda.empty_cache()
    print("Initial GPU Usage")
    gpu_usage()
    args = docopt(__doc__)
    config_file = args["<yaml-config>"] or "config/wireframe.yaml"
    C.update(C.from_yaml(filename=config_file))
    M.update(C.model)
    pprint.pprint(C, indent=4)
    resume_from = C.io.resume_from

    # WARNING: L-CNN is still not deterministic
    random.seed(0)
    np.random.seed(0)
    torch.manual_seed(0)

    device_name = "cpu"
    os.environ["CUDA_VISIBLE_DEVICES"] = args["--devices"]
    if torch.cuda.is_available():
        device_name = "cuda"
        torch.backends.cudnn.deterministic = True
        torch.cuda.manual_seed(0)
        print("Let's use", torch.cuda.device_count(), "GPU(s)!")
    else:
        print("CUDA is not available")
    device = torch.device(device_name)

    # 1. dataset
    # uncomment for debug DataLoader
    # wireframe.datasets.WireframeDataset(datadir, split="train")[0]
    # sys.exit(0)
    datadir = C.io.datadir
    kwargs = {
        "collate_fn": collate,
        "num_workers": C.io.num_workers if os.name != "nt" else 0,
        "pin_memory": True,
    }
    train_loader = torch.utils.data.DataLoader(
        WireframeDataset(datadir, split="train"),
        shuffle=True,
        batch_size=M.batch_size,
        **kwargs,
    )
    val_loader = torch.utils.data.DataLoader(
        WireframeDataset(datadir, split="valid"),
        shuffle=False,
        batch_size=M.batch_size_eval,
        **kwargs,
    )
    epoch_size = len(train_loader)
    print("epoch_size (train):", epoch_size)
    print("epoch_size (valid):", len(val_loader))

    if resume_from:
        # ROBUSTNESS FIX: map_location lets a checkpoint saved on GPU load on
        # a CPU-only host (and vice versa); without it torch.load raises when
        # the saving device is unavailable.
        checkpoint = torch.load(
            osp.join(resume_from, "checkpoint_latest.pth"),
            map_location=device,
        )

    # 2. model
    if M.backbone == "stacked_hourglass":
        model = lcnn.models.hg(
            depth=M.depth,
            head=MultitaskHead,
            num_stacks=M.num_stacks,
            num_blocks=M.num_blocks,
            num_classes=sum(sum(M.head_size, [])),
        )
    else:
        raise NotImplementedError

    model = MultitaskLearner(model)
    model = LineVectorizer(model)
    if resume_from:
        model.load_state_dict(checkpoint["model_state_dict"])
    model = model.to(device)

    # 3. optimizer
    if C.optim.name == "Adam":
        optim = torch.optim.Adam(
            model.parameters(),
            lr=C.optim.lr,
            weight_decay=C.optim.weight_decay,
            amsgrad=C.optim.amsgrad,
        )
    elif C.optim.name == "SGD":
        optim = torch.optim.SGD(
            model.parameters(),
            lr=C.optim.lr,
            weight_decay=C.optim.weight_decay,
            momentum=C.optim.momentum,
        )
    else:
        raise NotImplementedError

    if resume_from:
        optim.load_state_dict(checkpoint["optim_state_dict"])
    outdir = resume_from or get_outdir(args["--identifier"])
    print("outdir:", outdir)

    try:
        trainer = lcnn.trainer.Trainer(
            device=device,
            model=model,
            optimizer=optim,
            train_loader=train_loader,
            val_loader=val_loader,
            out=outdir,
        )
        if resume_from:
            trainer.iteration = checkpoint["iteration"]
            if trainer.iteration % epoch_size != 0:
                print(
                    "WARNING: iteration is not a multiple of epoch_size, reset it"
                )
                trainer.iteration -= trainer.iteration % epoch_size
            trainer.best_mean_loss = checkpoint["best_mean_loss"]
            del checkpoint
        trainer.train()
    except BaseException:
        # Remove a (nearly) empty output dir so failed runs don't litter the
        # results folder, then re-raise.
        if len(glob.glob(f"{outdir}/viz/*")) <= 1:
            shutil.rmtree(outdir)
        raise
def train(args, data_loader, model, global_stats, exs_with_doc,
          docs_by_question):
    '''Run through one epoch of model training with the provided data loader.

    Caches per-batch answer locations in the module-level ``HasAnswer_Map``
    (filled on the first epoch, reused afterwards). Relies on module-level
    globals: ``num_docs``, ``HasAnswer_Map``, ``has_answer``, ``txt_cuda``,
    ``gpu_usage``, ``validate_with_doc``, ``utils`` and ``logger``.
    '''
    # Initialize meters and timers
    train_loss = utils.AverageMeter()
    epoch_time = utils.Timer()
    # Run one epoch
    global HasAnswer_Map
    update_step = 0
    for idx, ex_with_doc in enumerate(data_loader):
        ex = ex_with_doc[0]
        batch_size, ex_id = ex[0].size(0), ex[-1]
        # Display GPU usage statitstics every <display_stats> iterations
        show_stats = (args.show_cuda_stats and
                      (idx % args.display_stats == args.display_stats - 1))
        if (idx not in HasAnswer_Map):
            HasAnswer_list = []
            for idx_doc in range(0, num_docs):
                HasAnswer = []
                for i in range(batch_size):
                    # Wrap around when a question has fewer than num_docs docs.
                    idx_doc_i = idx_doc % len(docs_by_question[ex_id[i]])
                    answer = exs_with_doc[ex_id[i]]['answer']
                    document = docs_by_question[
                        ex_id[i]][idx_doc_i]['document']
                    # ---------------------------------------------------------
                    # Looking for the answer in the document...
                    # ---------------------------------------------------------
                    HasAnswer.append(has_answer(args, answer, document))
                    # ---------------------------------------------------------
                HasAnswer_list.append(HasAnswer)
            HasAnswer_Map[idx] = HasAnswer_list
        else:
            HasAnswer_list = HasAnswer_Map[idx]

        # Initializing weights and sampling indices...
        # Uniform weights, so this is a random permutation of document indices.
        weights = torch.tensor([1.0 for idx_doc in range(0, num_docs)])
        idx_random = torch.multinomial(weights, int(num_docs))
        HasAnswer_list_sample = []
        ex_with_doc_sample = []
        for idx_doc in idx_random:
            HasAnswer_list_sample.append(HasAnswer_list[idx_doc])
            ex_with_doc_sample.append(ex_with_doc[idx_doc])
        l_list_doc = []
        r_list_doc = []
        for idx_doc in idx_random:
            l_list = []
            # NOTE(review): r_list is appended to r_list_doc but never
            # populated, so update_with_doc always receives empty lists here
            # — confirm this is intended.
            r_list = []
            for i in range(batch_size):
                if HasAnswer_list[idx_doc][i][0]:
                    l_list.append(HasAnswer_list[idx_doc][i][1])
                else:
                    # (-1, -1) marks "answer not present in this document".
                    l_list.append((-1, -1))
            l_list_doc.append(l_list)
            r_list_doc.append(r_list)

        # Generating predictions...
        pred_s_list_doc = []
        pred_e_list_doc = []
        tmp_top_n = 1
        # CUDA memory before forward pass
        txt_cuda(show_stats, 'before forward pass')
        for idx_doc in idx_random:
            ex = ex_with_doc[idx_doc]
            pred_s, pred_e, pred_score = model.predict(ex, top_n=tmp_top_n)
            pred_s_list = []
            pred_e_list = []
            for i in range(batch_size):
                pred_s_list.append(pred_s[i].tolist())
                pred_e_list.append(pred_e[i].tolist())
            pred_s_list_doc.append(torch.tensor(pred_s_list, dtype=torch.long))
            pred_e_list_doc.append(torch.tensor(pred_e_list, dtype=torch.long))
        # CUDA memory before backpropagation
        txt_cuda(show_stats, 'before backpropagation')

        # ---------------------------------------------------------------------
        # Updating (one epoch)...
        # ---------------------------------------------------------------------
        train_loss.update(*model.update_with_doc(
            update_step, ex_with_doc_sample, pred_s_list_doc,
            pred_e_list_doc, tmp_top_n, l_list_doc, r_list_doc,
            HasAnswer_list_sample))
        # ---------------------------------------------------------------------
        # update_step cycles 0..3 (presumably gradient accumulation inside
        # update_with_doc — TODO confirm).
        update_step = (update_step + 1) % 4
        # ---------------------------------------------------------------------

        # CUDA memory after backpropagation
        txt_cuda(show_stats, 'after backpropagation')
        if show_stats:
            gpu_usage()

        # Resetting...
        if idx % args.display_iter == 0:
            txt = 'train: Epoch = {} | iter = {}/{} | loss = {:.2f} | '
            txt += 'elapsed time = {:.2f} (s)'
            logger.info(
                txt.format(global_stats['epoch'], idx, len(data_loader),
                           train_loss.avg, global_stats['timer'].time()))
            train_loss.reset()

        # Validation...
        if show_stats:
            with torch.no_grad():
                validate_with_doc(args, data_loader, model, global_stats,
                                  exs_with_doc, docs_by_question,
                                  mode='train')

    logger.info('-' * 100)
    txt = 'train: Epoch {} done. Time for epoch = {:.2f} (s)'
    logger.info(txt.format(global_stats['epoch'], epoch_time.time()))
    logger.info('-' * 100)

    # Checkpoint
    if args.checkpoint:
        model.checkpoint(args.model_file + '.checkpoint',
                         global_stats['epoch'] + 1)