Example #1
def test_epoch(test_loader, model, test_meter, cur_epoch):
    """Evaluates the model on the test set."""

    # Enable eval mode
    model.eval()
    test_meter.iter_tic()

    for cur_iter, (inputs, labels) in enumerate(test_loader):
        # Transfer the data to the current GPU device
        inputs, labels = inputs.cuda(), labels.cuda(non_blocking=True)
        # Compute the predictions
        preds = model(inputs)
        # Compute the errors
        top1_err, top5_err = mu.topk_errors(preds, labels, [1, 5])
        # Combine the errors across the GPUs
        if cfg.NUM_GPUS > 1:
            top1_err, top5_err = du.scaled_all_reduce([top1_err, top5_err])
        # Copy the errors from GPU to CPU (sync point)
        top1_err, top5_err = top1_err.item(), top5_err.item()
        test_meter.iter_toc()
        # Update and log stats
        test_meter.update_stats(top1_err, top5_err,
                                inputs.size(0) * cfg.NUM_GPUS)
        test_meter.log_iter_stats(cur_epoch, cur_iter)
        test_meter.iter_tic()

    # Log epoch stats
    test_meter.log_epoch_stats(cur_epoch)
    test_meter.reset()
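
All five examples lean on mu.topk_errors(preds, labels, ks) to turn raw logits into error percentages. As a rough guide to what such a helper computes, here is a minimal self-contained sketch in plain PyTorch; the exact name, signature, and percentage scaling are assumptions inferred from how the snippets use it, not taken from the metrics module itself.

import torch

def topk_errors(preds, labels, ks):
    """Top-k error (in percent) for each k in ks, given logits and labels."""
    # Indices of the max(ks) highest-scoring classes per sample
    _, topk_inds = torch.topk(preds, max(ks), dim=1, largest=True, sorted=True)
    # (max_k, N) boolean grid: does the prediction at rank r match the label?
    topk_inds = topk_inds.t()
    correct = topk_inds.eq(labels.view(1, -1).expand_as(topk_inds))
    errors = []
    for k in ks:
        # A sample counts as correct at k if the label is in its top-k ranks
        num_correct = correct[:k, :].reshape(-1).float().sum()
        errors.append((1.0 - num_correct / preds.size(0)) * 100.0)
    return errors

Under this reading, the top1_err and top5_err values logged above are per-mini-batch error rates in percent, which is why Example #3 can turn them back into misclassification counts.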
Example #2
def train_epoch(train_loader,
                model,
                loss_fun,
                optimizer,
                train_meter,
                cur_epoch,
                writer_train=None,
                params=0,
                flops=0,
                is_master=False):
    """Performs one epoch of training."""

    # Shuffle the data
    loader.shuffle(train_loader, cur_epoch)
    # Update the learning rate
    lr = optim.get_epoch_lr(cur_epoch)
    optim.set_lr(optimizer, lr)
    # Enable training mode
    model.train()
    train_meter.iter_tic()

    for cur_iter, (inputs, labels) in enumerate(train_loader):
        # Transfer the data to the current GPU device
        inputs, labels = inputs.cuda(), labels.cuda(non_blocking=True)
        # Perform the forward pass
        preds = model(inputs)
        # Compute the loss
        loss = loss_fun(preds, labels)
        # Perform the backward pass
        optimizer.zero_grad()
        loss.backward()
        # Update the parameters
        optimizer.step()
        # Compute the errors
        top1_err, top5_err = mu.topk_errors(preds, labels, [1, 5])
        # Combine the stats across the GPUs
        if cfg.NUM_GPUS > 1:
            loss, top1_err, top5_err = du.scaled_all_reduce(
                [loss, top1_err, top5_err])
        # Copy the stats from GPU to CPU (sync point)
        loss, top1_err, top5_err = (loss.item(), top1_err.item(),
                                    top5_err.item())
        train_meter.iter_toc()
        # Update and log stats
        train_meter.update_stats(top1_err, top5_err, loss, lr,
                                 inputs.size(0) * cfg.NUM_GPUS)
        train_meter.log_iter_stats(cur_epoch, cur_iter)
        train_meter.iter_tic()

    # Log epoch stats
    train_meter.log_epoch_stats(cur_epoch,
                                writer_train,
                                params,
                                flops,
                                is_master=is_master)
    train_meter.reset()
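
When cfg.NUM_GPUS > 1, both loops average their stats with du.scaled_all_reduce before reading them on the CPU. A minimal sketch of such a helper, assuming torch.distributed is already initialized and that the scaling is 1/num_gpus as the comments in Example #3 state (the explicit num_gpus parameter is an assumption; the snippets read it from cfg):

import torch.distributed as dist

def scaled_all_reduce(tensors, num_gpus):
    """Sums each tensor across processes, then scales to a mean."""
    if num_gpus == 1:
        return tensors
    # Queue the asynchronous reductions
    reductions = [dist.all_reduce(t, async_op=True) for t in tensors]
    # Waiting on the handles guarantees synchronized results
    for reduction in reductions:
        reduction.wait()
    # Each tensor now holds the sum; rescale it to the cross-GPU mean
    for t in tensors:
        t.mul_(1.0 / num_gpus)
    return tensors

Note that calling .item() on the results afterwards is the actual device-to-host sync point the comments flag.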
Example #3
def test_epoch(test_loader, model, ssl_model, test_meter, cur_epoch):
    """Evaluates the model on the test set."""

    global plot_epoch_xvalues
    global plot_epoch_yvalues

    global plot_it_x_values
    global plot_it_y_values

    if torch.cuda.is_available():
        model.cuda()
        ssl_model.cuda()

    # Enable eval mode
    model.eval()
    ssl_model.eval()
    test_meter.iter_tic()

    misclassifications = 0.
    total_samples = 0.
    ssl_model.penultimate_active = True

    for cur_iter, (inputs, labels) in enumerate(test_loader):
        with torch.no_grad():
            # Transfer the data to the current GPU device
            inputs, labels = inputs.cuda(), labels.cuda(non_blocking=True)
            # Ensure the inputs are float32, matching the model weights
            inputs = inputs.float()
            # Get representations (autograd is already disabled here)
            inputs, _ = ssl_model(inputs)
            # Compute the predictions
            preds = model(inputs)
            # Compute the errors
            top1_err, _ = mu.topk_errors(preds, labels, [1, 4])
            # Combine the errors across the GPUs
            # if cfg.NUM_GPUS > 1:
            #     top1_err = du.scaled_all_reduce([top1_err])
            #     #as above returns a list
            #     top1_err = top1_err[0]
            # Copy the errors from GPU to CPU (sync point)
            top1_err = top1_err.item()
            # Multiply by the number of GPUs, since top1_err is scaled by 1/NUM_GPUS
            misclassifications += top1_err * inputs.size(0) * cfg.NUM_GPUS
            total_samples += inputs.size(0) * cfg.NUM_GPUS
            test_meter.iter_toc()
            # Update and log stats
            test_meter.update_stats(top1_err=top1_err,
                                    mb_size=inputs.size(0) * cfg.NUM_GPUS)
            test_meter.log_iter_stats(cur_epoch, cur_iter)
            test_meter.iter_tic()
    # Log epoch stats
    test_meter.log_epoch_stats(cur_epoch)
    test_meter.reset()

    return misclassifications / total_samples
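
Example #3 flips ssl_model.penultimate_active and then unpacks two values from the SSL model's forward pass, feeding the first into the classifier. A minimal sketch of a module with that interface; the structure is inferred purely from this usage, so treat the class and attribute names as hypothetical:

import torch.nn as nn

class SSLWrapper(nn.Module):
    """Backbone plus projection head; can expose penultimate features."""

    def __init__(self, backbone, projection_head):
        super().__init__()
        self.backbone = backbone
        self.projection_head = projection_head
        self.penultimate_active = False

    def forward(self, x):
        features = self.backbone(x)
        projection = self.projection_head(features)
        if self.penultimate_active:
            # Matches `inputs, _ = ssl_model(inputs)` in the loop above:
            # the penultimate features come first
            return features, projection
        return projection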
Example #4
def eval_epoch(test_loader,
               model,
               test_meter,
               cur_epoch,
               writer_eval=None,
               params=0,
               flops=0,
               is_master=False):
    """Evaluates the model on the test set."""

    # Enable eval mode
    model.eval()
    test_meter.iter_tic()

    for cur_iter, (inputs, labels) in enumerate(test_loader):
        # Transfer the data to the current GPU device
        inputs, labels = inputs.cuda(), labels.cuda(non_blocking=True)
        # Compute the predictions
        preds = model(inputs)
        # Compute the errors
        top1_err, top5_err = mu.topk_errors(preds, labels, [1, 5])
        # Combine the errors across the GPUs
        if cfg.NUM_GPUS > 1:
            top1_err, top5_err = du.scaled_all_reduce([top1_err, top5_err])
        # Copy the errors from GPU to CPU (sync point)
        top1_err, top5_err = top1_err.item(), top5_err.item()
        test_meter.iter_toc()
        # Update and log stats
        test_meter.update_stats(top1_err, top5_err,
                                inputs.size(0) * cfg.NUM_GPUS)
        test_meter.log_iter_stats(cur_epoch, cur_iter)
        test_meter.iter_tic()
    # Log epoch stats
    test_meter.log_epoch_stats(cur_epoch,
                               writer_eval,
                               params,
                               flops,
                               model,
                               is_master=is_master)
    stats = test_meter.get_epoch_stats(cur_epoch)
    test_meter.reset()
    if cfg.RGRAPH.SAVE_GRAPH:
        adj_dict = nu.model2adj(model)
        adj_dict = {**adj_dict, 'top1_err': stats['top1_err']}
        os.makedirs('{}/graphs/{}'.format(cfg.OUT_DIR, cfg.RGRAPH.SEED_TRAIN),
                    exist_ok=True)
        np.savez(
            '{}/graphs/{}/{}.npz'.format(cfg.OUT_DIR, cfg.RGRAPH.SEED_TRAIN,
                                         cur_epoch), **adj_dict)
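
The np.savez call at the end of Example #4 bundles the model's adjacency matrices with the epoch's top-1 error in a single .npz file per epoch. Reading one back is symmetric; the concrete path below is a made-up stand-in for '{OUT_DIR}/graphs/{SEED_TRAIN}/{epoch}.npz':

import numpy as np

data = np.load('out/graphs/1/10.npz')  # hypothetical OUT_DIR/SEED_TRAIN/epoch
top1_err = float(data['top1_err'])
# Every remaining key holds an adjacency matrix extracted via nu.model2adj
adjacencies = {k: data[k] for k in data.files if k != 'top1_err'}
print(top1_err, sorted(adjacencies))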
Example #5
def train_epoch(train_loader, model, loss_fun, optimizer, train_meter,
                cur_epoch, cfg, clf_iter_count, clf_change_lr_iter,
                clf_max_iter):
    """Performs one epoch of training."""

    global plot_epoch_xvalues
    global plot_epoch_yvalues

    global plot_it_x_values
    global plot_it_y_values

    # Shuffle the data (the distributed sampler seeds itself with the epoch)
    if cfg.NUM_GPUS > 1:
        train_loader.sampler.set_epoch(cur_epoch)

    # Update the learning rate
    # LR schedules are currently supported only for the 'SGD' optimizer
    lr = optim.get_epoch_lr(cfg, cur_epoch)
    if cfg.OPTIM.TYPE == "sgd":
        optim.set_lr(optimizer, lr)

    if torch.cuda.is_available():
        model.cuda()

    # Enable training mode
    model.train()
    # Record the iteration start time (see the Timer class in utils/timer.py)
    train_meter.iter_tic()

    len_train_loader = len(train_loader)
    for cur_iter, (inputs, labels) in enumerate(train_loader):
        # Ensure the inputs are float32, matching the model weights
        inputs = inputs.float()
        # Transfer the data to the current GPU device
        inputs, labels = inputs.cuda(), labels.cuda(non_blocking=True)
        # Perform the forward pass
        preds = model(inputs)
        # Compute the loss
        loss = loss_fun(preds, labels)
        # Perform the backward pass
        optimizer.zero_grad()
        loss.backward()
        # Update the parameters
        optimizer.step()
        # Compute the errors
        top1_err, _ = mu.topk_errors(preds, labels, [1, 5])
        # Combine the stats across the GPUs
        # if cfg.NUM_GPUS > 1:
        #     # Average the errors and losses across GPUs.
        #     # scaled_all_reduce also waits on the reductions, so the
        #     # results are synchronized across processes.
        #     loss, top1_err = du.scaled_all_reduce([loss, top1_err])
        # Copy the stats from GPU to CPU (sync point)
        loss, top1_err = loss.item(), top1_err.item()
        # Only the master process should write the logs used for plotting
        # if du.is_master_proc():
        if True:
            if cur_iter != 0 and cur_iter % 19 == 0:
                # cur_epoch starts at 0, so this is the global iteration index
                plot_it_x_values.append(cur_epoch * len_train_loader +
                                        cur_iter)
                plot_it_y_values.append(loss)
                save_plot_values(
                    [plot_it_x_values, plot_it_y_values],
                    ["plot_it_x_values.npy", "plot_it_y_values.npy"],
                    out_dir=cfg.EPISODE_DIR,
                    isDebug=False)
                # Plot the loss curve
                plot_arrays(
                    x_vals=plot_it_x_values,
                    y_vals=plot_it_y_values,
                    x_name="Iterations",
                    y_name="Loss",
                    dataset_name=cfg.DATASET.NAME,
                    out_dir=cfg.EPISODE_DIR,
                )

        # Record the elapsed time since the matching iter_tic() call
        train_meter.iter_toc()
        # Update and log stats
        train_meter.update_stats(top1_err=top1_err,
                                 loss=loss,
                                 lr=lr,
                                 mb_size=inputs.size(0) * cfg.NUM_GPUS)
        train_meter.log_iter_stats(cur_epoch, cur_iter)
        train_meter.iter_tic()
    # Log epoch stats
    train_meter.log_epoch_stats(cur_epoch)
    train_meter.reset()
    return loss, clf_iter_count
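
Both training loops delegate the schedule to optim.get_epoch_lr and apply it with optim.set_lr. A minimal sketch of such a pair under a cosine schedule; the cosine choice and the explicit base_lr/max_epoch parameters are assumptions (the snippets pull everything from cfg):

import math

def get_epoch_lr(base_lr, max_epoch, cur_epoch):
    """Cosine-annealed learning rate for a 0-indexed epoch."""
    return 0.5 * base_lr * (1.0 + math.cos(math.pi * cur_epoch / max_epoch))

def set_lr(optimizer, new_lr):
    """Applies new_lr to every parameter group of the optimizer."""
    for param_group in optimizer.param_groups:
        param_group['lr'] = new_lr

Applying the rate once per epoch, rather than per iteration, matches the structure of both train_epoch variants above.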