def training_eval():
    print("Benchmarking training time...")
    optimizer = optim.Adam(model.parameters(), lr=0.001)
    model.train()
    all_durations = []
    out = model(data)  # warm-up pass so GPU initialization is not included in the timed runs

    for i in np.arange(args.nruns):
        # start timer
        start = time()

        optimizer.zero_grad()
        out = model(data)
        loss = F.nll_loss(out, target)
        loss.backward()
        optimizer.step()

        # end timer (report GPU usage outside the timed region)
        duration = time() - start
        gpu_usage()

        # print and save duration
        print(f"Run: {i} \t Duration: {duration}", )
        all_durations.append(duration)

    # print mean and std of durations.
    all_durations = np.array(all_durations)
    mean_time = np.mean(all_durations)
    std_time = np.std(all_durations)
    print(f"mean time: {mean_time} \t std time: {std_time}")
Example #2
def free_gpu_cache():
    print("Initial GPU Usage")
    gpu_usage()

    torch.cuda.empty_cache()

    cuda.select_device(0)
    cuda.close()
    cuda.select_device(0)

    print("GPU Usage after emptying the cache")
    gpu_usage()
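# The snippets on this page call gpu_usage() without showing where it comes from; a
# common setup (an assumption, not shown in the originals) aliases GPUtil's
# showUtilization. free_gpu_cache() above additionally relies on numba's CUDA bindings
# for select_device()/close().
import torch
from GPUtil import showUtilization as gpu_usage
from numba import cuda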
Example #3
def simplify(sketch_np_array, imgbasename):
    t0 = time.time()
    use_cuda = torch.cuda.device_count() > 0
    cache = load_lua('model_gan.t7')
    model = cache.model
    immean = cache.mean
    imstd = cache.std
    model.evaluate()

    data = Image.fromarray(sketch_np_array)
    w, h = data.size[0], data.size[1]
    pw = 8 - (w % 8) if w % 8 != 0 else 0
    ph = 8 - (h % 8) if h % 8 != 0 else 0
    data = ((transforms.ToTensor()(data) - immean) / imstd).unsqueeze(0)
    if pw != 0 or ph != 0:
        data = torch.nn.ReplicationPad2d((0, pw, 0, ph))(data).data

    if use_cuda:
        print("CUDA device count :", torch.cuda.device_count())
        print("GPU :", torch.cuda.get_device_name(0))
        print('Initial GPU Usage')
        gpu_usage()
        '''
        If you want to use the GPU, uncomment the model.cuda() line below and comment
        out the pred line beneath it. The GPU is faster, but it keeps hitting CUDA
        out-of-memory errors and has to be restarted; if unsure, just use
        pred = model.forward(data).
        '''
        # pred = model.cuda().forward(data.cuda()).float()
        pred = model.forward(data)
    else:
        pred = model.forward(data)

    print('GPU Usage after allocating a bunch of Tensors')
    gpu_usage()

    pngname = imgbasename + '.png'
    save_image(pred[0], pngname)

    png2svg(pngname, imgbasename)
    t1 = time.time()
    total = t1 - t0
    print(total, "sec spent")
Example #4
def testing_eval():
    print("Benchmarking test time...")
    model.eval()
    all_durations = []
    _ = model(data)  # warm-up pass so GPU initialization is not included in the timed runs

    with torch.no_grad():
        for i in np.arange(args.nruns):
            # time forward pass
            start = time()
            _ = model(data)
            duration = time() - start
            gpu_usage()

            # save duration
            print(f"Run: {i} \t Duration: {duration}", )
            all_durations.append(duration)

    # print mean and std of durations.
    all_durations = np.array(all_durations)
    mean_time = np.mean(all_durations)
    std_time = np.std(all_durations)
    print(f"mean time: {mean_time} \t std time: {std_time}")
Example #5
def testNet(net, test_dataset, device):
    print("Initial GPU Usage: ")
    gpu_usage()
    number_of_batches = len(test_dataset)
    test_start_time = time()
    print("Test started at: " + test_start_time)
    correct = 0
    total = 0

    with torch.no_grad():
        for inputs, labels in test_dataset:
            # inputs, labels = Variable(inputs), Variable(labels)
            inputs = Variable(inputs)
            inputs = inputs.to(device)
            # labels = labels.to(device)

            test_outputs = net(inputs)
            _, predicted = torch.max(test_outputs.data, 1)
            predicted = predicted.cpu()
            total += labels.size(0)
            correct += (predicted == labels).sum().item()
            # test_accuracy = get_accuracy(test_outputs, labels)
            print("Accuracy of network: %d" % (100 * correct / total))
Example #6
def load_train_evaluate_save(mode):
    
    # -------------------------------------------------------------------------   
    # PARSER
    # -------------------------------------------------------------------------   
    
    # Parse cmdline args and setup environment
    parser = argparse.ArgumentParser(
        'OpenQA Question Answering Model',
        formatter_class=argparse.ArgumentDefaultsHelpFormatter
    )
    add_main_args(parser, mode)
    config.add_model_args(parser)
    args = parser.parse_args()
    set_defaults(args)
        
    
    # -------------------------------------------------------------------------   
    # INITIALIZATIONS
    # -------------------------------------------------------------------------   
    
    # CUDA
    args.cuda = not args.no_cuda and torch.cuda.is_available()
    assert(args.cuda)
    if args.cuda:
        torch.cuda.set_device(args.gpu) # no-op if args.gpu is negative
        torch.cuda.empty_cache()
    
    # Set random state
    np.random.seed(args.random_seed)
    torch.manual_seed(args.random_seed)
    if args.cuda:
        torch.cuda.manual_seed(args.random_seed)
    
    if args.log_file:
        if args.checkpoint:
            logfile = logging.FileHandler(args.log_file, 'a')
        else:
            logfile = logging.FileHandler(args.log_file, 'w')
    
        logfile.setFormatter(txtfmt)
        logger.addHandler(logfile)
    
    logger.info('COMMAND: {}'.format(' '.join(sys.argv)))
    
    # GPU cleaning
    gc.collect()
    # Note: `del obj` only drops the local loop reference; the actual freeing is done
    # by gc.collect() and torch.cuda.empty_cache().
    for obj in gc.get_objects():
        del obj
    torch.cuda.empty_cache()
    
    
    # --------------------------------------------------------------------------
    # DATASET
    # -------------------------------------------------------------------------   
    
    logger.info('-' * 100)
    logger.info('Load data files')
    
    dataset = args.dataset # == 'searchqa', 'quasart' or 'unftriviaqa'
    
    filename_train_docs = sys_dir+'/data/datasets/'+dataset+'/train.json' 
    filename_dev_docs = sys_dir+'/data/datasets/'+dataset+'/dev.json' 
    filename_test_docs = sys_dir+'/data/datasets/'+dataset+'/test.json' 
    filename_train = sys_dir+'/data/datasets/'+dataset+'/train.txt' 
    filename_dev = sys_dir+'/data/datasets/'+dataset+'/dev.txt' 
    filename_test = sys_dir+'/data/datasets/'+dataset+'/test.txt'
    
    train_docs, train_questions, train_len = utils.load_data_with_doc(
            args, filename_train_docs)
    logger.info(len(train_docs))
    logger.info(len(train_questions))
    
    train_exs_with_doc = read_data(filename_train, train_questions, train_len)
    logger.info('Num train examples = {}'.format(str(len(train_exs_with_doc))))
    
    dev_docs, dev_questions, _ = utils.load_data_with_doc(
            args, filename_dev_docs)
    logger.info(len(dev_docs))
    logger.info(len(dev_questions))
    
    dev_exs_with_doc = read_data(filename_dev, dev_questions)
    logger.info('Num dev examples = {}'.format(str(len(dev_exs_with_doc))))
    
    test_docs, test_questions, _ = utils.load_data_with_doc(
            args, filename_test_docs)
    logger.info(len(test_docs))
    logger.info(len(test_questions))
    
    test_exs_with_doc = read_data(filename_test, test_questions)
    logger.info('Num test examples = {}'.format(str(len(test_exs_with_doc))))


    # --------------------------------------------------------------------------
    # MODEL SETUP
    # -------------------------------------------------------------------------   
    
    logger.info('-' * 100)
    start_epoch = 0
    
    if args.checkpoint and os.path.isfile(args.model_file + '.checkpoint'):
        # Just resume training, no modifications.
        logger.info('Found a checkpoint...')
        checkpoint_file = args.model_file + '.checkpoint'
        model, start_epoch = DocReader.load_checkpoint(checkpoint_file)
        start_epoch = 0
    
    else:
        # Training starts fresh. But the model state is either pretrained or
        # newly (randomly) initialized.
        if args.pretrained:
            
            logger.info('Using pretrained model...')
            model = DocReader.load(args.pretrained, args)
            
            if args.expand_dictionary:
                logger.info('Expanding dictionary for new data...')
                
                # Add words in training and dev examples
                #words = utils.load_words(args, train_exs + dev_exs)
                words = utils.load_words(
                        args, train_exs_with_doc + dev_exs_with_doc)
                added = model.expand_dictionary(words)
                
                # Load pretrained embeddings for added words
                if args.embedding_file:
                    model.load_embeddings(added, args.embedding_file)

        else:
            logger.info('Training model from scratch...')
            model = init_from_scratch(args, train_docs)

        # Set up optimizer
        model.init_optimizer()

    # Use the GPU?
    if args.cuda:
        model.cuda()

    # Use multiple GPUs?
    if args.parallel:
        model.parallelize()
    
    # GPU usage
    if args.show_cuda_stats:
        gpu_usage()

    
    # --------------------------------------------------------------------------
    # DATA ITERATORS
    # -------------------------------------------------------------------------   
    
    # Two datasets: train and dev. If we sort by length it's faster.
    logger.info('-' * 100)
    logger.info('Make data loaders')
    
    # best practices for memory management are available here:
    # https://pytorch.org/docs/stable/notes/cuda.html#best-practices
    
    train_dataset_with_doc = data.ReaderDataset_with_Doc(
            train_exs_with_doc, model, train_docs, single_answer=True)
    train_sampler_with_doc = torch.utils.data.sampler.SequentialSampler(
            train_dataset_with_doc)
    train_loader_with_doc = torch.utils.data.DataLoader(
            train_dataset_with_doc,
            batch_size=args.batch_size, # batch_size of 128 samples
            sampler=train_sampler_with_doc,
            num_workers=args.data_workers, # num_workers increased to 12
            collate_fn=vector.batchify_with_docs,
            pin_memory=args.cuda, # pin_memory = True by default
            )

    dev_dataset_with_doc = data.ReaderDataset_with_Doc(
            dev_exs_with_doc, model, dev_docs, single_answer=False)
    dev_sampler_with_doc = torch.utils.data.sampler.SequentialSampler(
            dev_dataset_with_doc)
    dev_loader_with_doc = torch.utils.data.DataLoader(
            dev_dataset_with_doc,
            batch_size=args.test_batch_size,
            sampler=dev_sampler_with_doc,
            num_workers=args.data_workers,
            collate_fn=vector.batchify_with_docs,
            pin_memory=args.cuda,
            )

    test_dataset_with_doc = data.ReaderDataset_with_Doc(
            test_exs_with_doc, model, test_docs, single_answer=False)
    test_sampler_with_doc = torch.utils.data.sampler.SequentialSampler(
            test_dataset_with_doc)
    test_loader_with_doc = torch.utils.data.DataLoader(
           test_dataset_with_doc,
           batch_size=args.test_batch_size,
           sampler=test_sampler_with_doc,
           num_workers=args.data_workers,
           collate_fn=vector.batchify_with_docs,
           pin_memory=args.cuda,
           )


    # -------------------------------------------------------------------------
    # PRINT CONFIG 
    # -------------------------------------------------------------------------   
    
    logger.info('-' * 100)
    logger.info('CONFIG:')
    print(json.dumps(vars(args), indent=4, sort_keys=True))


    # --------------------------------------------------------------------------
    # TRAIN/VALIDATION LOOP
    # -------------------------------------------------------------------------   
    
    logger.info('-' * 100)
    logger.info('Starting training...')
    stats = {'timer': utils.Timer(), 'epoch': 0, 'best_valid': 0}
          
    for epoch in range(start_epoch, args.num_epochs):
        stats['epoch'] = epoch

        # Train
        logger.info('-' * 100)
        logger.info('Mode: ' + args.mode)
        
        if (args.mode == 'all'):
            train(args, 
                    train_loader_with_doc, model, stats, 
                    train_exs_with_doc, train_docs)
        if (args.mode == 'reader'):
            pretrain_reader(args, 
                    train_loader_with_doc, model, stats, 
                    train_exs_with_doc, train_docs)
        if (args.mode == 'selector'):
            pretrain_selector(args, 
                    train_loader_with_doc, model, stats, 
                    train_exs_with_doc, train_docs)
        
        # ---------------------------------------------------------------------
        with torch.no_grad():
            # -----------------------------------------------------------------
            result = validate_with_doc(args, 
                    dev_loader_with_doc, model, stats, dev_exs_with_doc, 
                    dev_docs, 'dev')
            
            validate_with_doc(args, 
                    train_loader_with_doc, model, stats, train_exs_with_doc, 
                    train_docs, 'train')
            
            if (dataset=='webquestions' or dataset=='CuratedTrec'): # not applicable
                result = validate_with_doc(args, 
                        test_loader_with_doc, model, stats, 
                        test_exs_with_doc, test_docs, 'test')
            else: # dataset == 'searchqa' by default, 'squad', 'quasart' or 'unftriviaqa'
                validate_with_doc(args, 
                        test_loader_with_doc, model, stats, 
                        test_exs_with_doc, test_docs, 'test')
        # ---------------------------------------------------------------------
        
        # Save model with improved evaluation results
        if result[args.valid_metric] > stats['best_valid']:
            
            txt = 'Best valid: {} = {:.2f} (epoch {}, {} updates)'
            logger.info(txt.format(
                    args.valid_metric, result[args.valid_metric],
                    stats['epoch'], model.updates))
            
            model.save(args.model_file)
            stats['best_valid'] = result[args.valid_metric]
        
        # Clean the gpu before running a new iteration
        if args.cuda:                 
            
            gc.collect() # force garbage collection
            for obj in gc.get_objects(): 
                if torch.is_tensor(obj): 
                    del obj
            
            torch.cuda.synchronize(device=model.device) # wait for the gpu
            torch.cuda.empty_cache() # force garbage removal
        
        # CUDA memory
        txt_cuda(show=True, txt='after garbage collection')
Example #7
def trainNet(net, batch_size, number_of_epochs, learning_rate):
    # Print all the hyperparameters of the training iteration
    print("Hyperparameters: ")
    print("Batch size = ", batch_size)
    print("epochs = ", number_of_epochs)
    print("Learning Rate = ", learning_rate)
    
    # Get Training Data
    train_loader = get_train_loader(batch_size)
    number_of_batches = len(train_loader)
    
    # Create our loss and optimizer functions
    loss, optimizer = createLossAndOptimizer(net, learning_rate)
    
    # Keep track of time
    training_start_time = time.time()
    
    print("GPU Usage before starting the first epoch")
    gpu_usage()
    print(number_of_epochs)
    # Loop for number_of_epochs
    for epoch in range(number_of_epochs):
        #print("inside for loop")
        train_loss = 0.0
        total_val_loss = 0
        accuracy = 0
        print_every = number_of_batches // 10
        start_time = time.time()
        total_train_loss = 0.0
        #print("GPU Usage in epoch: ", epoch)
        gpu_usage()
        
        for i, data in enumerate(train_loader, 0):
            # Get inputs
            #print("Get inputs")
            inputs, labels = data
            inputs = inputs.to(device)
            labels = labels.to(device)
            # Wraps them in a Variable object
            inputs, labels = Variable(inputs), Variable(labels)
            
            # Set the parameter gradients to zero
            # And make the forward pass, calculate gradient, do backprop
            optimizer.zero_grad()
            outputs = net(inputs)
            loss_size = loss(outputs, labels)
            del(inputs)
            del(labels)
            loss_size.backward()
            optimizer.step()
            #gpu_usage()
            # Print statistics
            train_loss += loss_size.data
            #print("Calculates train loss: ", train_loss)
            total_train_loss += loss_size.data
            #print("Calculates total train loss: ", total_train_loss)
            
            # Print every 10th batch of an epoch
            if (i + 1) % (print_every + 1) == 0:
                print("Epoch {}, {:d}% \t Train loss: {:.2f} took: {:.2f}s".format(
                    epoch + 1, int(100 * (i + 1) / number_of_batches),
                    train_loss / print_every,
                    time.time() - start_time))
                #print("GPU Usage:")
                gpu_usage()
                # Reset running loss and time
                train_loss = 0.0
                start_time = time.time()
                
        # At the end of the epoch, do a pass on the validation set
        
        for inputs, labels in val_loader:
            # Wrap tensors in variables
            inputs, labels = Variable(inputs), Variable(labels)
            inputs = inputs.to(device)
            labels = labels.to(device)
            # Forward pass
            val_outputs = net(inputs)
            val_loss_size = loss(val_outputs, labels)
            total_val_loss += val_loss_size.data
            
        print("validation loss = {:.2f}".format(total_val_loss / len(val_loader)))
    
    torch.save(net.state_dict(), '/content/drive/My Drive/IDP/test_model.pt')
    
    print("Training finished. Took: {:.2f}s".format(time.time() - training_start_time))
Example #8
    def run(self, early_stopping_patience=10, verbose=False, fout=None, custom_params=None):
        # user can customize the parameter sets
        if custom_params: self.param_list = custom_params

        if self.param_list:
            try:
                stime = time.time()
                all_logs = []
                n_iter = len(self.param_list)
                for i, hyperparams in enumerate(self.param_list):
                    #if i>1: break
                    try:
                        print("-"*100)
                        print(i, hyperparams)
                        print("-"*100)

                        # TODO combine user params with hyper params
                        hyperparams['epochs'] = self.user_params['epochs']
                        hyperparams['save_model'] = self.user_params['save_model']
                        hyperparams['log_interval'] = self.user_params['log_interval']
                        hyperparams['use_cuda'] = self.user_params['use_cuda']
                        
                        hyperparams['conv_config'] = self.user_params['conv_config'][hyperparams['conv_config_num']]

                        if hyperparams['fc3_nodes']==0:
                            hyperparams['dense_config'] = [hyperparams['fc1_nodes'], hyperparams['fc2_nodes']]
                        else:
                            hyperparams['dense_config'] = [hyperparams['fc1_nodes'], hyperparams['fc2_nodes'], hyperparams['fc3_nodes']]
                        
                        cv_performance = []
                        for k in range(self.user_params['kfold']):
                            hyperparams['model_output'] = "{}_{}_{}".format(self.user_params['model_output'], i, k)

                            ibm_dataset, train_loader, valid_loader = get_train_valid_split(
                                ibm_data=self.train_data, to_tensor=self.user_params['to_tensor'], valid_split=self.user_params['valid_split'],
                                batch_size=hyperparams['batch_size'], valid_batch_size=self.user_params['valid_batch_size'],
                                mean_for_normalize=self.user_params['train_mean'], std_for_normalize=self.user_params['train_std'], k_fold_idx=k,
                                use_cuda=self.user_params['use_cuda'], random_seed=self.random_seed
                            )
                            model = self.model(train_loader=train_loader, valid_loader=valid_loader, hyperparams=hyperparams,
                                optimize_fn=self.user_params['optimize_fn'], loss_fn=self.user_params['loss_fn'], random_seed=self.random_seed)

                            gpu_usage()

                            model.train(early_stopping_patience=early_stopping_patience, verbose=verbose)
                            model.test(test_loader=self.test_loader, verbose=verbose)

                            if model.best_performance is None: break
                            
                            model.best_performance['model_output'] = hyperparams['model_output']
                            if verbose: print("Best:", model.best_performance)
                            cv_performance.append(model.best_performance)
                            hyperparams['total_params'] = model.total_params
                            hyperparams['total_size'] = model.total_size

                            torch.cuda.empty_cache()
                            gpu_usage()
                        
                        hyperparams['cv'] = cv_performance
                        hyperparams['idx'] = i
                        
                        all_logs.append(hyperparams)
                        print("-"*100)
                        print("{}/{} Time:{}min".format(i, n_iter, (time.time()-stime)/60))
                        print("-"*100)
                    except Exception as e:
                        print(e)
                save_logs(all_logs, fout)
            except (KeyboardInterrupt, SystemExit):
                save_logs(all_logs, fout)
Example #9
    def trainNet(self, device, net, number_of_epochs, learning_rate,
                 training_dataset, validation_dataset, path_to_tensorboard_log, path_to_saved_model):

        print("Initial GPU Usage")
        gpu_usage()

        # Get Training Data
        number_of_batches = len(training_dataset)

        # Create our loss and optimizer functions
        loss, optimizer = self.createLossAndOptimizer(net, learning_rate)

        # Keep track of time
        training_start_time = time.time()

        print("GPU Usage before starting the first epoch")
        gpu_usage()
        print(number_of_epochs)

        # initialize the tensorboard
        # in the command line, navigate to the root folder of the project and then type:
        # tensorboard --logdir=runs
        # after launching it, navigate to the following website in the browser:
        # http://localhost:6006/
        writer = SummaryWriter()
        # get some random training images
        dataiter = iter(training_dataset)
        images, labels = next(dataiter)

        # create grid of images
        img_grid = torchvision.utils.make_grid(images)

        # show images
        # matplotlib_imshow(img_grid, one_channel=True)

        # write to tensorboard
        writer.add_image('idp_sr_training_images', img_grid)
        writer.add_graph(net, images)

        # Print model's state_dict
        print("Model's state_dict:")
        for param_tensor in net.state_dict():
            print(param_tensor, "\t", net.state_dict()[param_tensor].size())

        # Print optimizer's state_dict
        print("Optimizer's state_dict:")
        for var_name in optimizer.state_dict():
            print(var_name, "\t", optimizer.state_dict()[var_name])
        
        # transfer the network into the GPU
        net.to(device)
        
        # assign platform's maximum 
        max_val = sys.maxsize
        # assign the max value as lowest validation loss
        lowest_validation_loss  = max_val
        # Loop for number_of_epochs
        number_of_minibatches = 1
        
        for epoch in range(number_of_epochs):
            print("inside for loop")
            train_loss = 0.0
            total_val_loss = 0
            # accuracy = 0
            print_every = number_of_batches // 10
            start_time = time.time()
            total_train_loss = 0.0
            print("GPU Usage in epoch: ", epoch)
            gpu_usage()
            net.train()
            for i, data in enumerate(training_dataset, 0):
                # Get inputs
                inputs, labels = data
                #print(labels)
                inputs = inputs.to(self.device)
                labels = labels.to(self.device)
                # Wraps them in a Variable object
                inputs, labels = Variable(inputs), Variable(labels)

                # Set the parameter gradients to zero
                # And make the forward pass, calculate gradient, do backprop
                optimizer.zero_grad()
                outputs = net(inputs)
                #outputs = out_act(outputs)
                #labels = labels.view(-1,1)
                #labels = labels.float()
                loss_size = loss(outputs, labels)
                train_loss += loss_size.data
                total_train_loss += loss_size.data
                outputs = outputs.data.cpu().numpy()
                labels= labels.data.cpu().numpy()
                current_accuracy = self.get_accuracy(outputs, labels)
                #print(current_accuracy)
                del (inputs)
                del (labels)
                loss_size.backward()
                optimizer.step()
                # gpu_usage()
                
                if (i + 1) % (print_every + 1) == 0:
                    current_avg_loss = train_loss/print_every
                    print("Epoch {}, {:d}% \t Train loss: {:.4f} took: {:.4f}s".format(
                        epoch + 1, int(100 * (i + 1) / number_of_batches),
                        current_avg_loss,
                        time.time() - start_time))
                    writer.add_scalar('Mini-batch Training Loss', current_avg_loss, number_of_minibatches)
                    writer.add_scalar('Mini-batch accuracy:', current_accuracy, number_of_minibatches)
                    number_of_minibatches+=1

                    print("GPU Usage after 10th batch:")
                    gpu_usage()
                    # Reset running loss and time
                    train_loss = 0.0
                    start_time = time.time()
            
            # At the end of the epoch, do a pass on the validation set

            writer.add_histogram('conv1.bias', net.conv1.bias, epoch+1)
            writer.add_histogram('conv1.weight', net.conv1.weight, epoch+1)
            writer.add_histogram('conv1.weight.grad', net.conv1.weight.grad, epoch+1)
            net.eval()

            with torch.no_grad():
                for inputs, labels in validation_dataset:
                    # Wrap tensors in variables
                    inputs, labels = Variable(inputs), Variable(labels)
                    inputs = inputs.to(self.device)
                    labels = labels.to(self.device)
                    # Forward pass
                    val_outputs = net(inputs)
                    # val_outputs = out_act(val_outputs)
                    # labels = labels.view(-1, 1)
                    # labels = labels.float()
                    val_loss_size = loss(val_outputs, labels)
                    total_val_loss += val_loss_size.data
                    val_outputs = val_outputs.data.cpu().numpy()
                    labels = labels.data.cpu().numpy()
                    val_accuracy = self.get_accuracy(val_outputs, labels)
                    writer.add_scalar('Validation accuracy: ', val_accuracy, epoch + 1)

            current_validation_loss = total_val_loss / len(validation_dataset)
            print("validation loss = {:.4f}".format(current_validation_loss))
            if current_validation_loss < lowest_validation_loss:
                lowest_validation_loss = current_validation_loss
                torch.save(net.state_dict(), path_to_saved_model)
                print("Saving model..., path: ", path_to_saved_model)
            else:
                print("Validation loss increased. Keeping the previous model.")

        writer.close()
        print("Training finished. Took: {:.4f}s".format(time.time() - training_start_time))
Example #10
from pkg_resources import resource_stream
from PIL import Image
import skimage.color
from skimage.segmentation import mark_boundaries
from itertools import chain
from pysnic.algorithms.snic import snic, compute_grid
from pysnic.ndim.operations_collections import nd_computations
from pysnic.metric.snic import create_augmented_snic_distance

from torch import einsum

use_cuda = torch.cuda.is_available()
torch.cuda.empty_cache()
print("Initial GPU Usage")
gpu_usage()

parser = argparse.ArgumentParser(description='PyTorch Group Affinity Unsupervised Segmentation')
parser.add_argument('--nChannel', metavar='N', default=60, type=int,
                    help='number of channels')
parser.add_argument('--nGroup', metavar='G', default=60, type=int,
                    help='number of groups')
parser.add_argument('--maxIter', metavar='T', default=1000, type=int, 
                    help='number of maximum iterations')
parser.add_argument('--minLabels', metavar='minL', default=3, type=int, 
                    help='minimum number of labels')
parser.add_argument('--lr', metavar='LR', default=0.1, type=float, 
                    help='learning rate')
parser.add_argument('--nConv', metavar='M', default=2, type=int,
                    help='number of convolutional layers')
parser.add_argument('--num_superpixels', metavar='K', default=5000, type=int,
Example #11
    def training(self):
        if self.on_cuda:
            print('training on GPU')
        else:
            print('training on CPU')
        optimizer = optim.Adam(self.Net.parameters(), lr=self.lr)

        t0 = time.time()

        idx = 0
        print('beginning training')
        for epoch in range(self.n_epochs):
            self.ULoss_factor = min(self.ULoss_factor + 3, 400)
            labeled_loss_epoch = 0  # avg cross entropy loss
            unlabeled_loss_epoch = 0  # avg L2**2 loss

            iter_unlabeled_loaders = [
                iter(loader) for loader in self.unlabeled_loaders
            ]
            for local_X, local_y in self.train_loader:
                idx += 1
                one_hot_y = self.make_one_hot(local_y)
                if self.on_cuda:
                    local_X = local_X.to('cuda')
                    one_hot_y = one_hot_y.to('cuda')
                    local_Us = [
                        next(loader)[0].to('cuda')
                        for loader in iter_unlabeled_loaders
                    ]
                else:
                    local_Us = [
                        next(loader)[0] for loader in iter_unlabeled_loaders
                    ]
                predictions_Us = self.prediction_unlabeled(local_Us)
                Labels, Ws = self.concatenate_shuffle(local_Us, local_X,
                                                      one_hot_y,
                                                      predictions_Us)

                # MixUp on local_X and random batch from Ws
                lmbda = np.random.beta(self.alpha, self.alpha)
                local_X_W = lmbda * local_X + (1 - lmbda) * Ws[:len(local_X)]
                local_y_W = lmbda * one_hot_y + (1 -
                                                 lmbda) * Labels[:len(local_X)]
                # prediction and gradient step
                prediction = self.Net(local_X_W)
                loss_X = cross_entropy(prediction, local_y_W)  # mean
                labeled_loss_epoch += float(loss_X)
                # MixUp on local_Us and remaining random batches from Ws
                loss_U = 0
                for i in range(self.K):
                    lmbda = np.random.beta(self.alpha, self.alpha)
                    local_U_W = lmbda * local_Us[i] + (
                        1 - lmbda) * Ws[len(local_X):][i * self.batch_size_u:
                                                       (i + 1) *
                                                       self.batch_size_u]
                    local_y_W = lmbda * predictions_Us + (1 - lmbda) * Labels[
                        len(local_X):][i * self.batch_size_u:
                                       (i + 1) * self.batch_size_u]
                    prediction = self.softmax(self.Net(local_U_W), dim=1)
                    loss_U += self.MSELoss(prediction, local_y_W)
                loss_U /= (Ws.shape[0] - self.batch_size_l) * self.n_classes
                unlabeled_loss_epoch += float(loss_U)
                # gradient descent
                batch_loss = loss_X + self.ULoss_factor * loss_U
                if idx % 50 == 0:
                    print(
                        f"batch_loss: {batch_loss} -- loss_X: {int(100*(loss_X/batch_loss).item())}% -- loss_U: {int(100*(loss_U*self.ULoss_factor/batch_loss).item())}%"
                    )
                    gpu_usage()
                optimizer.zero_grad()
                batch_loss.backward()
                optimizer.step()
                del loss_X, loss_U, prediction, batch_loss
                if self.on_cuda:
                    torch.cuda.empty_cache()
            if self.save_path is not None and (epoch +
                                               1) % self.checkpoint_save == 0:
                model_location = os.path.join(self.save_path,
                                              f'MixMatch_{epoch+1}.pth')
                torch.save(self.Net.state_dict(), model_location)
            labeled_loss_epoch /= len(self.train_loader)
            self.l_training_losses.append(float(labeled_loss_epoch))
            self.u_training_losses.append(
                float(self.ULoss_factor * unlabeled_loss_epoch))
            accuracy, val_loss = self.evaluate()
            # timing
            current_time = get_duration(t0, time.time())
            print(
                f'epoch {epoch+1} --- l_train_loss = {labeled_loss_epoch}  -- u_train_loss = {self.ULoss_factor * unlabeled_loss_epoch} --- val_loss = {val_loss} -- val_accuracy = {accuracy}%'
                f'--- time: {current_time}')
            del predictions_Us, local_X, local_y, local_Us, Ws, Labels, local_X_W, local_y_W, local_U_W, val_loss
            if self.on_cuda:
                torch.cuda.empty_cache()
            gpu_usage()
            # testing
            if (epoch + 1) % self.checkpoint_test == 0:
                self.testing(epoch)
        # accuracy
        self.testing(epoch)
        self.save_losses(MixMatch=True)
        self.plot_results(MixMatch=True)
Example #12
def main():
    torch.cuda.empty_cache()
    print("Initial GPU Usage")
    gpu_usage()

    args = docopt(__doc__)
    config_file = args["<yaml-config>"] or "config/wireframe.yaml"
    C.update(C.from_yaml(filename=config_file))
    M.update(C.model)
    pprint.pprint(C, indent=4)
    resume_from = C.io.resume_from

    # WARNING: L-CNN is still not deterministic
    random.seed(0)
    np.random.seed(0)
    torch.manual_seed(0)

    device_name = "cpu"
    os.environ["CUDA_VISIBLE_DEVICES"] = args["--devices"]
    if torch.cuda.is_available():
        device_name = "cuda"
        torch.backends.cudnn.deterministic = True
        torch.cuda.manual_seed(0)
        print("Let's use", torch.cuda.device_count(), "GPU(s)!")
    else:
        print("CUDA is not available")
    device = torch.device(device_name)

    # 1. dataset

    # uncomment for debug DataLoader
    # wireframe.datasets.WireframeDataset(datadir, split="train")[0]
    # sys.exit(0)

    datadir = C.io.datadir
    kwargs = {
        "collate_fn": collate,
        "num_workers": C.io.num_workers if os.name != "nt" else 0,
        "pin_memory": True,
    }
    train_loader = torch.utils.data.DataLoader(
        WireframeDataset(datadir, split="train"),
        shuffle=True,
        batch_size=M.batch_size,
        **kwargs,
    )
    val_loader = torch.utils.data.DataLoader(
        WireframeDataset(datadir, split="valid"),
        shuffle=False,
        batch_size=M.batch_size_eval,
        **kwargs,
    )
    epoch_size = len(train_loader)
    print("epoch_size (train):", epoch_size)
    print("epoch_size (valid):", len(val_loader))

    if resume_from:
        checkpoint = torch.load(osp.join(resume_from, "checkpoint_latest.pth"))

    # 2. model
    if M.backbone == "stacked_hourglass":
        model = lcnn.models.hg(
            depth=M.depth,
            head=MultitaskHead,
            num_stacks=M.num_stacks,
            num_blocks=M.num_blocks,
            num_classes=sum(sum(M.head_size, [])),
        )
    else:
        raise NotImplementedError

    model = MultitaskLearner(model)
    model = LineVectorizer(model)

    if resume_from:
        model.load_state_dict(checkpoint["model_state_dict"])
    model = model.to(device)

    # 3. optimizer
    if C.optim.name == "Adam":
        optim = torch.optim.Adam(
            model.parameters(),
            lr=C.optim.lr,
            weight_decay=C.optim.weight_decay,
            amsgrad=C.optim.amsgrad,
        )
    elif C.optim.name == "SGD":
        optim = torch.optim.SGD(
            model.parameters(),
            lr=C.optim.lr,
            weight_decay=C.optim.weight_decay,
            momentum=C.optim.momentum,
        )
    else:
        raise NotImplementedError

    if resume_from:
        optim.load_state_dict(checkpoint["optim_state_dict"])
    outdir = resume_from or get_outdir(args["--identifier"])
    print("outdir:", outdir)

    try:
        trainer = lcnn.trainer.Trainer(
            device=device,
            model=model,
            optimizer=optim,
            train_loader=train_loader,
            val_loader=val_loader,
            out=outdir,
        )
        if resume_from:
            trainer.iteration = checkpoint["iteration"]
            if trainer.iteration % epoch_size != 0:
                print(
                    "WARNING: iteration is not a multiple of epoch_size, reset it"
                )
                trainer.iteration -= trainer.iteration % epoch_size
            trainer.best_mean_loss = checkpoint["best_mean_loss"]
            del checkpoint
        trainer.train()
    except BaseException:
        if len(glob.glob(f"{outdir}/viz/*")) <= 1:
            shutil.rmtree(outdir)
        raise
Example #13
def train(args, data_loader, model, global_stats, exs_with_doc,
          docs_by_question):
    '''Run through one epoch of model training with the provided data loader.'''

    # Initialize meters and timers
    train_loss = utils.AverageMeter()
    epoch_time = utils.Timer()

    # Run one epoch
    global HasAnswer_Map

    update_step = 0

    for idx, ex_with_doc in enumerate(data_loader):
        ex = ex_with_doc[0]
        batch_size, ex_id = ex[0].size(0), ex[-1]

        # Display GPU usage statistics every <display_stats> iterations
        show_stats = (args.show_cuda_stats
                      and (idx % args.display_stats == args.display_stats - 1))

        if (idx not in HasAnswer_Map):
            HasAnswer_list = []

            for idx_doc in range(0, num_docs):

                HasAnswer = []
                for i in range(batch_size):

                    idx_doc_i = idx_doc % len(docs_by_question[ex_id[i]])
                    answer = exs_with_doc[ex_id[i]]['answer']
                    document = docs_by_question[
                        ex_id[i]][idx_doc_i]['document']

                    # ---------------------------------------------------------
                    # Looking for the answer in the document...
                    # ---------------------------------------------------------
                    HasAnswer.append(has_answer(args, answer, document))
                    # ---------------------------------------------------------
                HasAnswer_list.append(HasAnswer)

            HasAnswer_Map[idx] = HasAnswer_list

        else:
            HasAnswer_list = HasAnswer_Map[idx]

        # Initializing weights and sampling indices...
        weights = torch.tensor([1.0 for idx_doc in range(0, num_docs)])
        idx_random = torch.multinomial(weights, int(num_docs))

        HasAnswer_list_sample = []
        ex_with_doc_sample = []

        for idx_doc in idx_random:
            HasAnswer_list_sample.append(HasAnswer_list[idx_doc])
            ex_with_doc_sample.append(ex_with_doc[idx_doc])

        l_list_doc = []
        r_list_doc = []
        for idx_doc in idx_random:

            l_list = []
            r_list = []
            for i in range(batch_size):
                if HasAnswer_list[idx_doc][i][0]:
                    l_list.append(HasAnswer_list[idx_doc][i][1])
                else:
                    l_list.append((-1, -1))

            l_list_doc.append(l_list)
            r_list_doc.append(r_list)

        # Generating predictions...
        pred_s_list_doc = []
        pred_e_list_doc = []
        tmp_top_n = 1

        # CUDA memory before forward pass
        txt_cuda(show_stats, 'before forward pass')

        for idx_doc in idx_random:
            ex = ex_with_doc[idx_doc]
            pred_s, pred_e, pred_score = model.predict(ex, top_n=tmp_top_n)

            pred_s_list = []
            pred_e_list = []
            for i in range(batch_size):
                pred_s_list.append(pred_s[i].tolist())
                pred_e_list.append(pred_e[i].tolist())

            pred_s_list_doc.append(torch.tensor(pred_s_list, dtype=torch.long))
            pred_e_list_doc.append(torch.tensor(pred_e_list, dtype=torch.long))

        # CUDA memory before backpropagation
        txt_cuda(show_stats, 'before backpropagation')

        # ---------------------------------------------------------------------
        # Updating (one epoch)...
        # ---------------------------------------------------------------------
        train_loss.update(*model.update_with_doc(
            update_step, ex_with_doc_sample, pred_s_list_doc, pred_e_list_doc,
            tmp_top_n, l_list_doc, r_list_doc, HasAnswer_list_sample))
        # ---------------------------------------------------------------------
        update_step = (update_step + 1) % 4
        # ---------------------------------------------------------------------

        # CUDA memory after backpropagation
        txt_cuda(show_stats, 'after backpropagation')
        if show_stats: gpu_usage()

        # Resetting...
        if idx % args.display_iter == 0:

            txt = 'train: Epoch = {} | iter = {}/{} | loss = {:.2f} | '
            txt += 'elapsed time = {:.2f} (s)'
            logger.info(
                txt.format(global_stats['epoch'], idx, len(data_loader),
                           train_loss.avg, global_stats['timer'].time()))

            train_loss.reset()

        # Validation...
        if show_stats:
            with torch.no_grad():
                validate_with_doc(args,
                                  data_loader,
                                  model,
                                  global_stats,
                                  exs_with_doc,
                                  docs_by_question,
                                  mode='train')

    logger.info('-' * 100)
    txt = 'train: Epoch {} done. Time for epoch = {:.2f} (s)'
    logger.info(txt.format(global_stats['epoch'], epoch_time.time()))
    logger.info('-' * 100)

    # Checkpoint
    if args.checkpoint:
        model.checkpoint(args.model_file + '.checkpoint',
                         global_stats['epoch'] + 1)