Example #1
def main():
    test_seen_loader = torch.utils.data.DataLoader(AttributeDataset(
        args.data_dir,
        args.dataset,
        features_path=args.gan_path,
        mode='test_seen',
        generalized=True,
        normalize=args.normalize,
        sentences=args.sentences),
                                                   batch_size=args.batch_size,
                                                   shuffle=False)

    test_unseen_loader = torch.utils.data.DataLoader(
        AttributeDataset(args.data_dir,
                         args.dataset,
                         features_path=args.gan_path,
                         mode='test_unseen',
                         generalized=True,
                         normalize=args.normalize,
                         sentences=args.sentences),
        batch_size=args.batch_size,
        shuffle=False)

    # instantiate the models
    if args.mlp:
        mlp = MLP(args.dim_input, [args.nhidden * 2], args.nhidden)
    else:
        mlp = LinearProjection(args.dim_input, args.nhidden)
    embed = LinearProjection(args.nhidden, args.dim_embed)

    if args.sentences:
        cam_key = 'sentences'
    else:
        cam_key = 'emb'

    if args.gan_path is not None:
        cam_key = 'full_' + cam_key

    cam = torch.from_numpy(test_seen_loader.dataset.data[cam_key].T)
    proxies = ProxyNet(args.n_classes, args.dim_embed, proxies=cam)

    model = Base(mlp, embed, proxies)

    criterion = ProxyLoss(temperature=args.temp)

    if args.cuda:
        mlp.cuda()
        embed.cuda()
        model.cuda()
        proxies.cuda()

    # loading
    checkpoint = torch.load(args.model_path)
    model.load_state_dict(checkpoint['state_dict'])
    txt = ("=> loaded checkpoint '{}' (epoch {})".format(
        args.model_path, checkpoint['epoch']))
    print(txt)

    compute_scores(test_seen_loader, test_unseen_loader, model, criterion)
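
LinearProjection is used here and in Example #11 but not defined in the snippet; a minimal sketch consistent with its two-argument constructor, assuming it is just a single fully connected layer:

import torch.nn as nn

class LinearProjection(nn.Module):
    # assumed definition: a single fully connected layer, matching the
    # (dim_input, dim_output) constructor calls above
    def __init__(self, dim_input, dim_output):
        super().__init__()
        self.fc = nn.Linear(dim_input, dim_output)

    def forward(self, x):
        return self.fc(x)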
Example #2
optimizer = optim.Adam(itertools.chain(model.parameters(), mlp.parameters()),
                       lr=args.lr,
                       weight_decay=args.weight_decay)

criterion = torch.nn.TripletMarginLoss(margin=args.margin, p=2)
CE = torch.nn.CrossEntropyLoss()

loss_list = RunningAvg(window_size=200)
loss_list_CE = RunningAvg(window_size=200)
acc_list = RunningAvg(window_size=200)
loss_by_iter = []

if args.cuda:
    device = torch.device(args.cuda_device)
    model.cuda(device)
    mlp.cuda(device)


def train_step(epoch, loss_save):
    for _, data_train_group in tqdm(enumerate(dataloader_train),
                                    desc='Training',
                                    total=len(dataloader_train)):
        # pass three times first, then back propagate the loss
        model.train()
        mlp.train()
        for data_train in data_train_group:
            vector3 = []
            regularization = 0
            for data_train_item in data_train:
                # and/or children: [[1,2],[3,4]]
                adj, features, labels, idx_train, idx_val, idx_test = cuda_input(
                    *data_train_item)  # assumed completion; the call was truncated here
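
RunningAvg is not defined in this snippet, and Example #3 tracks the same statistics with a plain deque; a compatible sketch built on collections.deque (the append/avg interface is an assumption, since no methods are called above):

from collections import deque

class RunningAvg:
    # assumed interface: construct with window_size, append values, read avg()
    def __init__(self, window_size):
        self.values = deque(maxlen=window_size)

    def append(self, value):
        self.values.append(value)

    def avg(self):
        return sum(self.values) / len(self.values) if self.values else 0.0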
Example #3
optimizer = optim.Adam(itertools.chain(model.parameters(), mlp.parameters()),
                       lr=args.lr,
                       weight_decay=args.weight_decay)

criterion = torch.nn.TripletMarginLoss(margin=args.margin, p=2)
CE = torch.nn.CrossEntropyLoss()

loss_list = deque(maxlen=100)
loss_list_CE = deque(maxlen=100)
acc_list = deque(maxlen=100)

loss_by_iter = []

if args.cuda:
    model.cuda()
    mlp.cuda()


def train(epoch, loss_save):
    for _, data_train_group in tqdm(enumerate(dataloader_train),
                                    desc='Training',
                                    total=len(dataloader_train)):
        # pass three times first, then back propagate the loss
        model.train()
        mlp.train()
        for data_train in data_train_group:
            vector3 = []
            regularization = 0
            for data_train_item in data_train:
                # and/or children: [[1,2],[3,4]]
                adj, features, labels = cuda_input(*data_train_item[:-2])
Example #4
# the start of this snippet was truncated; a train loader mirroring the test
# loader below is assumed (param['batch_size'] is an assumed key)
train_dataset = datasets.MNIST(root='../data/',
                               train=True,
                               download=True,
                               transform=transforms.ToTensor())
loader_train = torch.utils.data.DataLoader(train_dataset,
                                           batch_size=param['batch_size'],
                                           shuffle=True)

test_dataset = datasets.MNIST(root='../data/',
                              train=False,
                              download=True,
                              transform=transforms.ToTensor())
loader_test = torch.utils.data.DataLoader(test_dataset,
                                          batch_size=param['test_batch_size'],
                                          shuffle=True)

# Load the pretrained model
net = MLP()
net.load_state_dict(torch.load('models/mlp_pretrained.pkl'))
if torch.cuda.is_available():
    print('CUDA enabled.')
    net.cuda()
print("--- Pretrained network loaded ---")
test(net, loader_test)

# prune the weights
masks = weight_prune(net, param['pruning_perc'])
net.set_masks(masks)
print("--- {}% parameters pruned ---".format(param['pruning_perc']))
test(net, loader_test)

# Retraining
criterion = nn.CrossEntropyLoss()
optimizer = torch.optim.RMSprop(net.parameters(),
                                lr=param['learning_rate'],
                                weight_decay=param['weight_decay'])
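
weight_prune is defined elsewhere; a common magnitude-based sketch, assuming set_masks() expects one float mask per weight matrix in parameter order:

import numpy as np

def weight_prune(model, pruning_perc):
    # magnitude pruning sketch: zero out the smallest pruning_perc percent of
    # weights globally; biases (1-D parameters) are left untouched
    all_weights = np.concatenate([p.data.abs().cpu().numpy().flatten()
                                  for p in model.parameters() if p.dim() > 1])
    threshold = np.percentile(all_weights, pruning_perc)
    return [(p.data.abs() > threshold).float()
            for p in model.parameters() if p.dim() > 1]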
Example #5
word_ae.load_state_dict(word_ae_params)

classifier = MLP(word_args.nhidden)
criterion = nn.NLLLoss()
optimizer = torch.optim.Adam(list(word_ae.parameters()) +
                             list(classifier.parameters()),
                             lr=3e-4)

one = torch.FloatTensor([1])
mone = one * (-1)
lamda = torch.FloatTensor([10])

if torch.cuda.is_available():
    logger.info("Running on GPU")
    word_ae = word_ae.cuda()
    classifier = classifier.cuda()
    one = one.cuda()
    # D = D.cuda()
    # G = G.cuda()
else:
    logger.info("Running on CPU")


###############################################################################
# Training code
###############################################################################
def train_GAN(batch, train_mode=True):
    if train_mode:
        word_ae.train()
        classifier.train()
    else:
        # assumed completion (the snippet is cut here): mirror train() with eval()
        word_ae.eval()
        classifier.eval()
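
The one, mone, and lamda tensors above are the usual ingredients of a WGAN-GP critic update (the D and G lines are commented out here); for reference, a standard gradient-penalty term over interpolated samples, with D, real, and fake as assumed names:

import torch

def gradient_penalty(D, real, fake, lamda=10.0):
    # standard WGAN-GP penalty on samples interpolated between real and fake;
    # D, real and fake are assumed names (the critic is commented out above)
    alpha = torch.rand(real.size(0), 1, device=real.device).expand_as(real)
    interp = (alpha * real + (1 - alpha) * fake).requires_grad_(True)
    grads = torch.autograd.grad(outputs=D(interp).sum(),
                                inputs=interp,
                                create_graph=True)[0]
    slopes = grads.view(grads.size(0), -1).norm(2, dim=1)
    return lamda * ((slopes - 1) ** 2).mean()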
Example #6
def train_MLP(train_X, train_Y, test_X, test_Y, batch_size=20, epochs=100):
    os.environ["CUDA_DEVICE_ORDER"] = "PCI_BUS_ID"
    os.environ["CUDA_VISIBLE_DEVICES"] = "1"

    model = MLP(10, 20, 2)
    model.cuda()
    model.train()

    learn_rate = 1e-3
    grad_clip = 2.0
    dispFreq = 50
    validFreq = 200
    early_stop = 20
    weight = torch.FloatTensor([2.0, 1.0])
    loss_function = nn.NLLLoss()
    optimizer = optim.Adam(model.parameters(), lr=learn_rate)
    params = list(filter(lambda p: p.requires_grad, model.parameters()))  # list() so repeated clipping calls see the params

    dev_tensor = Variable(torch.FloatTensor(test_X).cuda())

    curr = 0
    uidx = 0
    # For Early-stopping

    best_step = 0
    for iepx in range(1, epochs + 1):
        for ibx in range(0, len(train_X), batch_size):
            if ibx + batch_size >= len(train_X):
                batch = Variable(
                    torch.FloatTensor(train_X[ibx:len(train_X)]).cuda())
                target = Variable(
                    torch.LongTensor(train_Y[ibx:len(train_X)]).cuda())
            else:
                batch = Variable(
                    torch.FloatTensor(train_X[ibx:ibx + batch_size]).cuda())
                target = Variable(
                    torch.LongTensor(train_Y[ibx:ibx + batch_size]).cuda())

            uidx += 1

            pred = model(batch)

            loss = loss_function(pred, target)
            optimizer.zero_grad()
            loss.backward()
            torch.nn.utils.clip_grad_norm_(params, grad_clip)
            optimizer.step()

            if np.mod(uidx, dispFreq) == 0:
                print('Epoch', iepx, '\tUpdate', uidx, '\tCost', loss.item())

            if np.mod(uidx, validFreq) == 0:
                # compute dev
                model.eval()
                out = model.forward(dev_tensor)
                model.train()
                # score = nn.NLLLoss(weight=weight)(out, vs_tensor).data[0]
                pred = categoryFromOutput(out)
                F1 = f1_score(test_Y, pred)

                curr_step = uidx // validFreq

                currscore = F1

                print('F1 on dev', F1)

                if currscore > curr:
                    curr = currscore
                    best_step = curr_step

                    # Save model
                    print('Saving model...', end=' ')
                    # torch.save(model.state_dict(), '%s_model_%s.pkl' % (saveto, run))
                    print('Done')

                if curr_step - best_step > early_stop:
                    print('Early stopping ...')
                    print(best_step)
                    print(curr)
                    return
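
categoryFromOutput is undefined in the snippet; a minimal sketch that converts the model's log-probabilities into class indices for f1_score:

def categoryFromOutput(output):
    # assumed definition: argmax over the class dimension, returned as a flat
    # list so it can be passed to sklearn's f1_score
    return output.data.max(1)[1].cpu().numpy().tolist()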
Example #7
adj, features, labels, idx_train, idx_val, idx_test = load_data(args.dataset)
adj = reduce_noise(adj, labels, noise_rate=args.noise_rate)
graph = adj_to_graph(adj)

# Model and optimizer
model = MLP(nfeat=features.shape[1],
            nhid=args.hidden,
            nclass=labels.max().item() + 1,
            dropout=args.dropout)

optimizer = optim.Adam(model.parameters(),
                       lr=args.lr,
                       weight_decay=args.weight_decay)

if args.cuda:
    model.cuda()
    features = features.cuda()
    adj = adj.cuda()
    labels = labels.cuda()


def train(iteration):
    t = time.time()

    x_batch, y_batch = graph_sample(idx_train, args.batch_size, graph,
                                    args.trdep, features, labels)
    model.train()
    optimizer.zero_grad()
    y_pre_batch = model(x_batch)
    loss_train = F.nll_loss(y_pre_batch, y_batch)
    acc_train = accuracy(y_pre_batch, y_batch)
    # assumed completion; the original snippet is truncated here
    loss_train.backward()
    optimizer.step()
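
The accuracy helper is not shown; a sketch matching the usual GCN training scripts:

def accuracy(output, labels):
    # assumed definition: fraction of correct argmax predictions
    preds = output.max(1)[1].type_as(labels)
    correct = preds.eq(labels).double().sum()
    return correct / len(labels)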
Example #8
def train_permutedmnist(num_tasks,
                        batch_size,
                        hidden_size,
                        lr,
                        num_epochs,
                        num_points,
                        select_method='lambda_descend',
                        use_cuda=True,
                        tau=0.5):

    # Log console output to 'pmlog.txt'
    logging.basicConfig(filename='pmlog.txt')
    logger = logging.getLogger()
    logger.setLevel(logging.INFO)

    # Data generator
    datagen = PermutedMnistGenerator(max_iter=num_tasks)

    # Model
    num_classes = 10
    layer_size = [784, hidden_size, hidden_size, num_classes]
    model = MLP(layer_size, act='relu')

    criterion = nn.CrossEntropyLoss()
    if use_cuda:
        criterion.cuda()
        model.cuda()

    # Optimiser
    opt = opt_fromp(model,
                    lr=lr,
                    prior_prec=1e-5,
                    grad_clip_norm=0.01,
                    tau=tau)

    # Train on tasks
    memorable_points = []
    testloaders = []
    acc_list = []
    for tid in range(num_tasks):

        # If not first task, need to calculate and store regularisation-term-related quantities
        if tid > 0:

            def closure(task_id):
                memorable_points_t = memorable_points[task_id][0]
                if use_cuda:
                    memorable_points_t = memorable_points_t.cuda()
                opt.zero_grad()
                logits = model.forward(memorable_points_t)
                return logits

            opt.init_task(closure, tid, eps=1e-5)

        # Data generator for this task
        itrain, itest = datagen.next_task()
        itrainloader = DataLoader(dataset=itrain,
                                  batch_size=batch_size,
                                  shuffle=True,
                                  num_workers=3)
        itestloader = DataLoader(dataset=itest,
                                 batch_size=batch_size,
                                 shuffle=False,
                                 num_workers=3)
        memorableloader = DataLoader(dataset=itrain,
                                     batch_size=batch_size,
                                     shuffle=False,
                                     num_workers=3)
        testloaders.append(itestloader)
        iloaders = [itrainloader, testloaders]

        # Train and test
        acc = train(model,
                    iloaders,
                    memorable_points,
                    criterion,
                    opt,
                    task_id=tid,
                    num_epochs=num_epochs,
                    use_cuda=use_cuda)

        # Select memorable past datapoints
        if select_method == 'random':
            i_memorable_points = random_memorable_points(
                itrain, num_points=num_points, num_classes=num_classes)
        elif select_method == 'lambda_descend':
            i_memorable_points = select_memorable_points(
                memorableloader,
                model,
                num_points=num_points,
                num_classes=num_classes,
                use_cuda=use_cuda,
                descending=True)
        elif select_method == 'lambda_ascend':
            i_memorable_points = select_memorable_points(
                memorableloader,
                model,
                num_points=num_points,
                num_classes=num_classes,
                use_cuda=use_cuda,
                descending=False)
        else:
            raise Exception('Invalid memorable points selection method.')

        memorable_points.append(i_memorable_points)

        # Update covariance (\Sigma)
        update_fisher(memorableloader, model, opt, use_cuda=use_cuda)

        print(acc)
        print('Mean accuracy after task %d: %f' %
              (tid + 1, sum(acc) / len(acc)))
        logger.info('After learn task: %d' % (tid + 1))
        logger.info(acc)
        acc_list.append(acc)

    return acc_list
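
A call might look like the following; all values are illustrative, not taken from the original script:

import torch

if __name__ == '__main__':
    # illustrative values only; the original entry point is not shown
    accuracies = train_permutedmnist(num_tasks=10,
                                     batch_size=128,
                                     hidden_size=100,
                                     lr=1e-3,
                                     num_epochs=10,
                                     num_points=40,
                                     select_method='lambda_descend',
                                     use_cuda=torch.cuda.is_available())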
Example #9
def train_model(args, use_cuda=False):
    start_time = time.time()

    # Read values from args
    num_tasks = args.num_tasks
    batch_size = args.batch_size
    hidden_size = args.hidden_size
    lr = args.lr
    num_epochs = args.num_epochs
    num_points = args.num_points
    coreset_select_method = args.select_method

    # Some parameters
    dataset_generation_test = False
    dataset_num_samples = 2000

    # Colours for plotting
    color = ['C0', 'C1', 'C2', 'C3', 'C4', 'C5', 'C6', 'C7', 'C8', 'C9']

    # Load / Generate toy data
    datagen = ToydataGenerator(max_iter=num_tasks,
                               num_samples=dataset_num_samples)

    plt.figure()
    datagen.reset()
    total_loaders = []
    criterion_cl = nn.CrossEntropyLoss()

    # Create model
    layer_size = [2, hidden_size, hidden_size, 2]
    model = MLP(layer_size, act='sigmoid')
    if use_cuda:
        model = model.cuda()

    # Optimiser
    opt = opt_fromp(model,
                    lr=lr,
                    prior_prec=1e-4,
                    grad_clip_norm=None,
                    tau=args.tau)

    memorable_points = None
    inducing_targets = None

    for tid in range(num_tasks):
        # If not first task, need to calculate and store regularisation-term-related quantities
        if tid > 0:

            def closure(task_id):
                memorable_points_t = memorable_points[task_id]
                if use_cuda:
                    memorable_points_t = memorable_points_t.cuda()
                opt.zero_grad()
                logits = model.forward(memorable_points_t)
                return logits

            opt.init_task(closure, tid, eps=1e-3)

        # Data generator for this task
        itrain, itest = datagen.next_task()
        itrainloader = DataLoader(dataset=itrain,
                                  batch_size=batch_size,
                                  shuffle=True,
                                  num_workers=8)
        itestloader = DataLoader(dataset=itest,
                                 batch_size=batch_size,
                                 shuffle=False,
                                 num_workers=8)
        inducingloader = DataLoader(dataset=itrain,
                                    batch_size=batch_size,
                                    shuffle=False,
                                    num_workers=8)
        iloaders = [itrainloader, itestloader]

        if tid == 0:
            total_loaders = [itrainloader]
        else:
            total_loaders.append(itrainloader)

        # Train and test
        cl_outputs = train(model,
                           iloaders,
                           memorable_points,
                           criterion_cl,
                           opt,
                           task_id=tid,
                           num_epochs=num_epochs,
                           use_cuda=use_cuda)

        # Select memorable past datapoints
        if coreset_select_method == 'random':
            i_memorable_points, i_inducing_targets = random_memorable_points(
                itrain, num_points=num_points, num_classes=2)
        else:
            i_memorable_points, i_inducing_targets = select_memorable_points(
                inducingloader,
                model,
                num_points=num_points,
                use_cuda=use_cuda)

        # Add memory points to set
        if tid > 0:
            memorable_points.append(i_memorable_points)
            inducing_targets.append(i_inducing_targets)
        else:
            memorable_points = [i_memorable_points]
            inducing_targets = [i_inducing_targets]

        # Update covariance (\Sigma)
        update_fisher(inducingloader, model, opt, use_cuda=use_cuda)

        # Plot visualisation (2D figure)
        cl_outputs, _ = torch.max(cl_outputs, dim=-1)
        cl_show = 2 * cl_outputs - 1

        cl_show = cl_show.detach()
        if use_cuda:
            cl_show = cl_show.cpu()
        cl_show = cl_show.numpy()
        cl_show = cl_show.reshape(datagen.test_shape)

        plt.figure()
        axs = plt.subplot(111)
        axs.title.set_text('FROMP')
        if not dataset_generation_test:
            plt.imshow(cl_show,
                       cmap='gray',
                       extent=(datagen.x_min, datagen.x_max, datagen.y_min,
                               datagen.y_max),
                       origin='lower')
        for l in range(tid + 1):
            idx = np.where(datagen.y == l)
            plt.scatter(datagen.X[idx][:, 0],
                        datagen.X[idx][:, 1],
                        c=color[l],
                        s=0.03)
            idx = np.where(datagen.y == l + datagen.offset)
            plt.scatter(datagen.X[idx][:, 0],
                        datagen.X[idx][:, 1],
                        c=color[l + datagen.offset],
                        s=0.03)
            if not dataset_generation_test:
                plt.scatter(memorable_points[l][:, 0],
                            memorable_points[l][:, 1],
                            c='m',
                            s=0.4,
                            marker='x')

        plt.show()

        # Calculate and print train accuracy and negative log likelihood
        with torch.no_grad():
            if not dataset_generation_test:
                model.eval()
                N = len(itrain)

                metric_task_id = 0
                nll_loss_avg = 0
                accuracy_avg = 0
                for metric_loader in total_loaders:
                    nll_loss = 0
                    correct = 0
                    for inputs, labels in metric_loader:
                        if use_cuda:
                            inputs, labels = inputs.cuda(), labels.cuda()

                        logits = model.forward(inputs)

                        nll_loss += nn.functional.cross_entropy(
                            torch.squeeze(logits, dim=-1), labels) * float(
                                inputs.shape[0])

                        # Calculate predicted classes
                        pred = logits.data.max(1, keepdim=True)[1]

                        # Count number of correctly predicted datapoints
                        correct += pred.eq(labels.data.view_as(pred)).sum()

                    nll_loss /= N
                    accuracy = float(correct) / float(N) * 100.

                    print(
                        'Task {}, Train accuracy: {:.2f}%, Train Loglik: {:.4f}'
                        .format(metric_task_id, accuracy, nll_loss))

                    metric_task_id += 1
                    nll_loss_avg += nll_loss
                    accuracy_avg += accuracy

                print('Avg train accuracy: {:.2f}%, Avg train Loglik: {:.4f}'.
                      format(accuracy_avg / metric_task_id,
                             nll_loss_avg / metric_task_id))

    print('Time taken: ', time.time() - start_time)
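
random_memorable_points is shared with Example #8 but not defined here; a sketch that samples uniformly at random and returns points plus targets, matching the two-value unpacking above (the stratified per-class version is omitted):

import torch

def random_memorable_points(dataset, num_points, num_classes=2):
    # sketch: uniform random coreset; returns stacked inputs and targets to
    # match the two-value unpacking above (per-class stratification omitted)
    idx = torch.randperm(len(dataset))[:num_points].tolist()
    points = torch.stack([torch.as_tensor(dataset[i][0]) for i in idx])
    targets = torch.tensor([int(dataset[i][1]) for i in idx])
    return points, targets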
Example #10
parser.add_argument('--n_shift_samples',  # hypothetical name; the original flag was truncated
                    type=int,
                    help='number of shifted samples used for loss estimation (for grad methods).')
parser.add_argument('--max_var',
                    type=float,
                    default=10.0,
                    help='maximum variance shift')
parser.add_argument('--model_path',
                    type=str,
                    default='',
                    help='path to saved model if loading.')

opt = parser.parse_args()

# instantiate model:
net = MLP(input_size=784, width=opt.netWidth)
if opt.cuda:
    net = net.cuda()
if opt.model_path != '':
    net.load_state_dict(torch.load(opt.model_path), strict=False)

# instantiate optimizer:
optimizer = get_optimizer(net=net, lr=opt.lr, opt_str=opt.optim)

# getting data loaders:
train_loader, test_loader = get_data_loaders(BS=opt.batchSize)

# train model:
if opt.model_path == '':
    net, stats = train(net, opt.epochs, opt.cuda, optimizer, train_loader,
                       test_loader)
# net, stats = train(torch.nn.Sequential(AddNoise(mean=0,std=np.sqrt(0.25)),net), opt.epochs, opt.cuda, optimizer, train_loader, test_loader)
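
The commented-out line above wraps the network in an AddNoise module; a minimal Gaussian-noise layer matching that (mean, std) constructor, as an assumed definition:

import torch
import torch.nn as nn

class AddNoise(nn.Module):
    # assumed definition: add i.i.d. Gaussian noise with the given mean and
    # standard deviation to the input
    def __init__(self, mean=0.0, std=1.0):
        super().__init__()
        self.mean = mean
        self.std = std

    def forward(self, x):
        return x + self.mean + self.std * torch.randn_like(x)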
Example #11
def main():
    if args.gan_path is None:
        both = False
    else:
        both = True

    if args.validation:
        train_loader = torch.utils.data.DataLoader(AttributeDataset(
            args.data_dir,
            args.dataset,
            features_path=args.gan_path,
            mode='train',
            both=both,
            normalize=args.normalize,
            sentences=args.sentences),
                                                   batch_size=args.batch_size,
                                                   shuffle=True)
        val_seen_loader = torch.utils.data.DataLoader(
            AttributeDataset(args.data_dir,
                             args.dataset,
                             features_path=args.gan_path,
                             mode='val_seen',
                             generalized=True,
                             normalize=args.normalize,
                             sentences=args.sentences),
            batch_size=args.batch_size,
            shuffle=False)
        val_unseen_loader = torch.utils.data.DataLoader(
            AttributeDataset(args.data_dir,
                             args.dataset,
                             features_path=args.gan_path,
                             mode='val_unseen',
                             generalized=True,
                             normalize=args.normalize,
                             sentences=args.sentences),
            batch_size=args.batch_size,
            shuffle=False)
    else:
        trainval_loader = torch.utils.data.DataLoader(
            AttributeDataset(args.data_dir,
                             args.dataset,
                             features_path=args.gan_path,
                             mode='trainval',
                             both=both,
                             normalize=args.normalize,
                             sentences=args.sentences),
            batch_size=args.batch_size,
            shuffle=True)

    test_seen_loader = torch.utils.data.DataLoader(AttributeDataset(
        args.data_dir,
        args.dataset,
        features_path=args.gan_path,
        mode='test_seen',
        generalized=True,
        normalize=args.normalize,
        sentences=args.sentences),
                                                   batch_size=args.batch_size,
                                                   shuffle=False)

    test_unseen_loader = torch.utils.data.DataLoader(
        AttributeDataset(args.data_dir,
                         args.dataset,
                         features_path=args.gan_path,
                         mode='test_unseen',
                         generalized=True,
                         normalize=args.normalize,
                         sentences=args.sentences),
        batch_size=args.batch_size,
        shuffle=False)

    # instantiate the models
    if args.mlp:
        mlp = MLP(args.dim_input, [args.nhidden * 2], args.nhidden)
    else:
        mlp = LinearProjection(args.dim_input, args.nhidden)
    embed = LinearProjection(args.nhidden, args.dim_embed)

    if args.sentences:
        cam_key = 'sentences'
    else:
        cam_key = 'emb'

    if args.validation:
        cam = torch.from_numpy(train_loader.dataset.data[cam_key].T)
    else:
        cam = torch.from_numpy(trainval_loader.dataset.data[cam_key].T)
    proxies = ProxyNet(args.n_classes, args.dim_embed, proxies=cam)

    model = Base(mlp, embed, proxies)

    criterion = ProxyLoss(temperature=args.temp)

    if args.cuda:
        mlp.cuda()
        embed.cuda()
        model.cuda()
        proxies.cuda()

    parameters_set = []

    layers = []
    for c in mlp.children():
        if isinstance(c, nn.Linear) or isinstance(c, nn.ModuleList):
            layers.extend(list(c.parameters()))

    for c in embed.children():
        if isinstance(c, nn.Linear):
            layers.extend(list(c.parameters()))

    parameters_set.append({'params': layers, 'lr': args.lr})

    optimizer = optim.SGD(parameters_set,
                          lr=args.lr,
                          momentum=0.9,
                          nesterov=True,
                          weight_decay=5e-5)

    n_parameters = sum([p.data.nelement() for p in model.parameters()])
    print('  + Number of params: {}'.format(n_parameters))

    scheduler = CosineAnnealingLR(optimizer, args.epochs)

    best_acc = 0
    print('Random results:')
    if args.validation:
        validate(val_seen_loader, val_unseen_loader, model, criterion)
    else:
        validate(test_seen_loader, test_unseen_loader, model, criterion)

    for epoch in range(args.start_epoch, args.epochs + 1):
        # update learning rate
        if args.lr_decay:
            scheduler.step()

        # train for one epoch
        if args.validation:
            train(train_loader, model, criterion, optimizer, epoch)
            validate(val_seen_loader, val_unseen_loader, model, criterion)
        else:
            train(trainval_loader, model, criterion, optimizer, epoch)
            validate(test_seen_loader, test_unseen_loader, model, criterion)

        # saving
        save_checkpoint({'epoch': epoch, 'state_dict': model.state_dict()})

    print('\nFinal evaluation on last epoch model:')
    validate(test_seen_loader, test_unseen_loader, model, criterion)
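
ProxyNet and ProxyLoss are defined elsewhere; a proxy-NCA-style sketch consistent with the constructors above, assuming fixed proxies initialised from the class embeddings and a temperature-scaled cross-entropy over proxy similarities:

import torch.nn as nn
import torch.nn.functional as F

class ProxyNet(nn.Module):
    # assumed definition: one proxy vector per class, initialised from the
    # class embeddings passed in as `proxies` and kept fixed
    def __init__(self, n_classes, dim_embed, proxies):
        super().__init__()
        self.proxies = nn.Parameter(proxies.float(), requires_grad=False)

class ProxyLoss(nn.Module):
    # assumed definition: temperature-scaled cross-entropy over cosine
    # similarities between embeddings and class proxies
    def __init__(self, temperature=1.0):
        super().__init__()
        self.temperature = temperature

    def forward(self, embeddings, labels, proxies):
        sim = F.normalize(embeddings) @ F.normalize(proxies).t()
        return F.cross_entropy(sim / self.temperature, labels)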
Example #12
def train_mlp(**kwargs):
    name, directory = set_directory(name=kwargs['name'],
                                    type_net=kwargs['type_net'],
                                    dof=kwargs['dof'])
    if kwargs['tensorboard']:
        writer = SummaryWriter(directory)
    else:
        writer = None

    train_loader, val_loader, iter_per_epoch = load_mnist(
        batch_size=kwargs['batch_size'])

    model = MLP(input_dim=784,
                num_classes=10,
                layer_dims=kwargs['ldims'],
                type_net=kwargs['type_net'],
                N=60000,
                dof=kwargs['dof'],
                beta_ema=kwargs['beta_ema'])

    num_parameters = sum([p.data.nelement() for p in model.parameters()])
    print(f'Number of model parameters: {num_parameters}')

    if torch.cuda.is_available():
        torch.cuda.set_device(kwargs['device'])

    # for training on multiple GPUs.
    # Use CUDA_VISIBLE_DEVICES=0,1 to specify which GPUs to use
    if kwargs['multi_gpu']:
        model = torch.nn.DataParallel(model).cuda()
    else:
        if torch.cuda.is_available():
            model = model.cuda()

    optimizer = construct_optimizer(optimizer=kwargs['optim'],
                                    model=model,
                                    lr=kwargs['lr'])

    if kwargs['resume'] != '':
        (kwargs['start_epoch'], best_prec1, total_steps, model,
         optimizer) = resume_from_checkpoint(resume_path=kwargs['resume'],
                                             model=model,
                                             optimizer=optimizer)
    else:
        total_steps = 0
        best_prec1 = 0.

    cudnn.benchmark = True

    if kwargs['type_net'] == 'kerneldense':
        loss_function = torch.nn.CrossEntropyLoss().cuda()
    else:
        loss_function = CrossEntropyLossWithAnnealing(
            iter_per_epoch=iter_per_epoch,
            total_steps=total_steps,
            anneal_type=kwargs['anneal_type'],
            anneal_kl=kwargs['anneal_kl'],
            epzero=kwargs['epzero'],
            epmax=kwargs['epmax'],
            anneal_maxval=kwargs['anneal_maxval'],
            writer=writer)

    # loss_function = CrossEntropyLossWithMMD(num_samples=2)

    for epoch in range(kwargs['start_epoch'], kwargs['epochs']):
        total_steps = train_single_epoch(train_loader=train_loader,
                                         model=model,
                                         criterion=loss_function,
                                         optimizer=optimizer,
                                         epoch=epoch,
                                         clip_var=kwargs['clip_var'],
                                         total_steps=total_steps,
                                         print_freq=kwargs['print_freq'],
                                         writer=writer,
                                         thres_stds=kwargs['thres_std'],
                                         shape=[-1, 784])

        prec1 = validate(val_loader=val_loader,
                         model=model,
                         criterion=loss_function,
                         epoch=epoch,
                         print_freq=kwargs['print_freq'],
                         shape=[-1, 784],
                         writer=writer)

        if kwargs['restart'] and epoch % kwargs['restart_interval'] == 0:
            print('Restarting optimizer...')
            optimizer = construct_optimizer(optimizer=kwargs['restart_optim'],
                                            model=model,
                                            lr=kwargs['restart_lr'])

        is_best = prec1 > best_prec1
        if is_best:
            best_prec1 = prec1
        if isinstance(model, torch.nn.DataParallel):
            state = {
                'epoch': epoch + 1,
                'state_dict': model.state_dict(),
                'best_prec1': max(prec1, best_prec1),
                'beta_ema': model.module.beta_ema,
                'optimizer': optimizer.state_dict(),
                'total_steps': total_steps
            }
            if model.module.beta_ema > 0:
                state['avg_params'] = model.module.avg_param
                state['steps_ema'] = model.module.steps_ema
        else:
            state = {
                'epoch': epoch + 1,
                'state_dict': model.state_dict(),
                'best_prec1': max(prec1, best_prec1),
                'beta_ema': model.beta_ema,
                'optimizer': optimizer.state_dict(),
                'total_steps': total_steps
            }
            if model.beta_ema > 0:
                state['avg_params'] = model.avg_param
                state['steps_ema'] = model.steps_ema

        if epoch in kwargs['save_at']:
            name = f'checkpoint_{epoch}.pth.tar'
        else:
            name = 'checkpoint.pth.tar'

        save_checkpoint(state=state, is_best=is_best, name=name)

    print('Best accuracy: ', best_prec1)

    if writer is not None:
        writer.close()
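
save_checkpoint is not shown; a sketch of the usual torch.save-plus-copy helper matching the (state, is_best, name) call above:

import shutil
import torch

def save_checkpoint(state, is_best=False, name='checkpoint.pth.tar'):
    # assumed definition: write the checkpoint, and keep a copy of the best
    # model seen so far
    torch.save(state, name)
    if is_best:
        shutil.copyfile(name, 'model_best.pth.tar')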