def test_2layer_net():
    params = init_toy_model()
    X, y = init_toy_data()
    Y_enc = ut.encode_labels(y)

    # Make the net
    layer_1 = layers.Linear(*params['W1'].T.shape, reg='frob', reg_param=0.05,
                            init_vals=(params['W1'].T, params['b1'].ravel()))
    act_1 = layers.Relu()
    layer_2 = layers.Linear(*params['W2'].T.shape, reg='frob', reg_param=0.05,
                            init_vals=(params['W2'].T, params['b2'].ravel()))
    net_2 = nn.Network([layer_1, act_1, layer_2], ls.CrossEntropy(), optim.SGD(lr=1e-5))

    scores = net_2.forward(X)
    correct_scores = np.asarray([[-1.07260209,  0.05083871, -0.87253915],
                                 [-2.02778743, -0.10832494, -1.52641362],
                                 [-0.74225908,  0.15259725, -0.39578548],
                                 [-0.38172726,  0.10835902, -0.17328274],
                                 [-0.64417314, -0.18886813, -0.41106892]])
    diff = np.sum(np.abs(scores - correct_scores))
    assert np.isclose(diff, 0.0, atol=1e-6)

    loss = net_2.loss(X, Y_enc)
    correct_loss = 1.071696123862817
    assert np.isclose(loss, correct_loss, atol=1e-8)
def loss_func_b(bb):
    layer_lin = layers.Linear(n, c, reg='l2', reg_param=0.05, init_vals=(W.T, bb.ravel()))
    loss_func = ls.CrossEntropy()
    net = nn.Network([layer_lin], loss_func, optimizer=None)
    return net.loss(X_dev, Y_dev_enc)
def test_CrossEntropyLoss():
    np.random.seed(1)
    W = np.random.randn(c, n) * 0.0001
    b = np.random.randn(c, 1) * 0.0001
    layer_lin = layers.Linear(n, c, init_vals=(W.T, b.ravel()))
    loss_func = ls.CrossEntropy()
    net = nn.Network([layer_lin], loss_func, optimizer=None)
    my_loss = net.loss(X_dev, Y_dev_enc)
    assert np.isclose(my_loss, -np.log(.1), atol=1e-2)
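# Sanity-check context: with near-zero weights the softmax is roughly uniform, so the
# expected cross-entropy is -log(1/c); the test above expects -log(0.1), i.e. c = 10
# classes. A minimal NumPy sketch of that computation (hypothetical helper, not the
# ls.CrossEntropy used above) could look like this:
import numpy as np

def softmax_cross_entropy(scores, y):
    """Mean cross-entropy of raw class scores (N, C) against integer labels (N,)."""
    shifted = scores - scores.max(axis=1, keepdims=True)          # numerical stability
    log_probs = shifted - np.log(np.exp(shifted).sum(axis=1, keepdims=True))
    return -log_probs[np.arange(len(y)), y].mean()

# all-zero scores give every class probability 1/C, so the loss is -log(1/C)
scores = np.zeros((5, 10))
labels = np.arange(5)
assert np.isclose(softmax_cross_entropy(scores, labels), -np.log(0.1))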
def test_CrossEntropy_Linear_Grad():
    np.random.seed(1)
    W = np.random.randn(c, n) * 0.0001
    b = np.random.randn(c, 1) * 0.0001
    layer_lin = layers.Linear(n, c, reg='l2', reg_param=0.05, init_vals=(W.T, b.ravel()))
    loss_func = ls.CrossEntropy()
    net = nn.Network([layer_lin], loss_func, optimizer=None)
    net_loss = net.loss(X_dev, Y_dev_enc)
    ngrad = net.backward()

    # Define functions to pass to helper
    def loss_func_W(ww):
        layer_lin = layers.Linear(n, c, reg='l2', reg_param=0.05, init_vals=(ww.T, b.ravel()))
        loss_func = ls.CrossEntropy()
        net = nn.Network([layer_lin], loss_func, optimizer=None)
        return net.loss(X_dev, Y_dev_enc)

    def loss_func_b(bb):
        layer_lin = layers.Linear(n, c, reg='l2', reg_param=0.05, init_vals=(W.T, bb.ravel()))
        loss_func = ls.CrossEntropy()
        net = nn.Network([layer_lin], loss_func, optimizer=None)
        return net.loss(X_dev, Y_dev_enc)

    # Actually run the test
    rel_err_weight = dutil.grad_check_sparse(loss_func_W, W, net.grads[0].T, 10, seed=42)
    rel_err_bias = dutil.grad_check_sparse(loss_func_b, b.ravel(), net.grads[1], 10, seed=42)
    assert np.allclose(rel_err_weight, np.zeros(rel_err_weight.shape), atol=1e-4)
    assert np.allclose(rel_err_bias, np.zeros(rel_err_bias.shape), atol=1e-4)
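# These gradient tests compare analytic gradients against centered finite differences at
# a few random coordinates. A minimal sketch of such a sparse check (a hypothetical
# stand-in for dutil.grad_check_sparse, assuming a similar signature) could be:
import numpy as np

def sparse_grad_check(f, x, analytic_grad, num_checks=10, h=1e-5, seed=42):
    """Relative error between analytic_grad and a centered-difference estimate of
    df/dx at `num_checks` randomly chosen coordinates of x."""
    rng = np.random.default_rng(seed)
    rel_errors = np.empty(num_checks)
    for k in range(num_checks):
        idx = tuple(rng.integers(0, dim) for dim in x.shape)
        old_val = x[idx]
        x[idx] = old_val + h
        f_plus = f(x)
        x[idx] = old_val - h
        f_minus = f(x)
        x[idx] = old_val                               # restore the perturbed entry
        numeric = (f_plus - f_minus) / (2 * h)
        analytic = analytic_grad[idx]
        rel_errors[k] = abs(numeric - analytic) / max(abs(numeric) + abs(analytic), 1e-12)
    return rel_errors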
def test_2layer_grad():
    params = init_toy_model()
    X, y = init_toy_data()
    Y_enc = ut.encode_labels(y)

    # Make the net
    layer_1 = layers.Linear(*params['W1'].T.shape, reg='frob', reg_param=0.05,
                            init_vals=(params['W1'].T, params['b1'].ravel()))
    act_1 = layers.Relu()
    layer_2 = layers.Linear(*params['W2'].T.shape, reg='frob', reg_param=0.05,
                            init_vals=(params['W2'].T, params['b2'].ravel()))
    net_2 = nn.Network([layer_1, act_1, layer_2], ls.CrossEntropy(), optim.SGD(lr=1e-5))

    loss = net_2.loss(X, Y_enc)
    net_2.backward()

    def f_change_param(param_name, U):
        if param_name == 3:
            net_2.layers[0].params['b'] = U
        if param_name == 2:
            net_2.layers[0].params['W'] = U
        if param_name == 1:
            net_2.layers[2].params['b'] = U
        if param_name == 0:
            net_2.layers[2].params['W'] = U
        return net_2.loss(X, Y_enc)

    rel_errs = np.empty(4)
    for param_name in range(4):
        f = lambda U: f_change_param(param_name, U)
        if param_name == 3:
            pass_pars = net_2.layers[0].params['b']
        if param_name == 2:
            pass_pars = net_2.layers[0].params['W']
        if param_name == 1:
            pass_pars = net_2.layers[2].params['b']
        if param_name == 0:
            pass_pars = net_2.layers[2].params['W']
        param_grad_num = dutil.grad_check(f, pass_pars, epsilon=1e-5)
        rel_errs[param_name] = ut.rel_error(param_grad_num, net_2.grads[param_name])
    assert np.allclose(rel_errs, np.zeros(4), atol=1e-7)
def train(model, args, datasets):
    """Train for one epoch.

    args holds the run parameters, e.g. batch size, n_class, etc.
    """
    # switch the model to training mode
    model.encoder.train()
    model.decoder.train()
    if args.adversarial:
        model.discr.train()

    # the DataLoader takes care of the batching
    loader = torch.utils.data.DataLoader(datasets, batch_size=args.batch_size,
                                         shuffle=True, drop_last=True)

    # losses over the whole dataset
    loss_data = tnt.meter.AverageValueMeter()
    loss_data_alt = tnt.meter.AverageValueMeter()
    loss_data_rad = tnt.meter.AverageValueMeter()
    loss_disc_val = tnt.meter.AverageValueMeter()
    accu_discr = 0.0

    # loop over the batches
    for index, (tiles, labels) in enumerate(loader):

        # loading on the gpu
        if args.cuda:
            tiles = tiles.cuda().float()
            labels = labels.cuda().long()
        else:
            tiles = tiles.float()
            labels = labels.long()

        # adding noise to the sample
        noise = np.random.normal(0, 0.01, tiles.shape)
        noise_tens = fun.torch_raster(noise)
        tiles_noise = tiles + noise_tens

        # arg max on the one-hot labels for the cross entropy
        _, labels = labels.max(dim=1)

        # ============ discriminator ===========
        if args.adversarial:
            # ============ forward ===========
            # pred_year = discr(code.detach())
            code = model.encoder(tiles_noise, args)
            pred_year = model.discr(code, args)

            # ============ loss ===========
            # arg max on the predictions for checking accuracy
            _, pred_max = pred_year.max(dim=1)
            # loss function for the discriminator
            loss_disc = loss_fun.CrossEntropy(pred_year, labels)

            # checking the accuracy
            matrix_accu = pred_max == labels
            matrix_accu_f = matrix_accu.flatten().cpu().detach().numpy()
            nb_true = np.count_nonzero(matrix_accu_f)
            accu_discr += nb_true / len(matrix_accu_f)

            # ============ backward ===========
            # optimizing the discriminator (optionally training the encoder as well)
            model.opti_D.zero_grad()
            # model.opti_AE.zero_grad()
            loss_disc.backward(retain_graph=True)

            # clip each gradient element to [-1, 1]; this helps learning
            if args.grad_clip:
                for p in model.discr.parameters():
                    p.register_hook(lambda grad: torch.clamp(grad, -1, 1))

            model.opti_D.step()
            model.opti_AE.zero_grad()
            model.opti_AE.step()

            # saving the loss
            loss_disc_val.add(loss_disc.item())

            # adversarial training of the encoder
            if args.opti_adversarial_encoder:
                code = model.encoder(tiles, args)
                pred_year = model.discr(code, args)
                loss_disc = loss_fun.CrossEntropy(pred_year, labels)
                loss_disc_adv = loss_disc
                model.opti_AE.zero_grad()
                loss_disc_adv.backward()
                model.opti_AE.step()

            # averaging the accuracy over the batches seen so far
            accufin = accu_discr / len(loader)

        # ============ auto-encoder optimization ===========
        # ============ forward auto-encoder ===========
        # compute the prediction
        pred = model.predict(tiles_noise, args)
        code = model.encoder(tiles_noise, args)

        # boolean matrices to remove the effect of no-data pixels
        bool_matr_alt = tiles[:, None, 0, :, :] != 0
        bool_matr_rad = tiles[:, None, 1, :, :] != 0

        # filtering the data
        pred_alt = pred[:, None, 0, :, :][bool_matr_alt]
        tiles_alt = tiles[:, None, 0, :, :][bool_matr_alt]
        pred_rad = pred[:, None, 1, :, :][bool_matr_rad]
        tiles_rad = tiles[:, None, 1, :, :][bool_matr_rad]

        # defiance part
        if args.defiance:
            # loading the defiance (predicted variance) matrix
            d_mat_rad = pred[:, None, 2, :, :][bool_matr_rad]
            # calculating the loss
            eps = 10**-5
            loss_alt = loss_fun.MeanSquareError(pred_alt, tiles_alt)
            # variance-weighted loss for the defiance
            mse_rad = (tiles_rad - pred_rad)**2
            loss_rad = torch.mean(mse_rad / (d_mat_rad + eps) + (1/2)*torch.log(d_mat_rad + eps))
        else:
            # sum of squares
            loss_alt = loss_fun.MeanSquareError(pred_alt, tiles_alt)
            loss_rad = loss_fun.MeanSquareError(pred_rad, tiles_rad)

        if args.auto_encod:
            # ============ forward ===========
            if args.adversarial:
                code = model.encoder(tiles_noise, args)
                pred_year = model.discr(code, args)
                loss_disc = loss_fun.CrossEntropy(pred_year, labels)

            # ============ loss ===========
            if args.adversarial and args.data_fusion:
                loss = loss_rad + loss_alt  # - args.disc_loss_weight * loss_disc
            elif args.data_fusion:
                loss = loss_rad + loss_alt
            elif args.adversarial and args.rad_input:
                loss = loss_rad - args.disc_loss_weight * loss_disc
            elif args.adversarial:
                loss = loss_alt - args.disc_loss_weight * loss_disc
            elif args.rad_input:
                loss = loss_rad
            else:
                loss = loss_alt

            loss_data.add(loss.item())

            # ============ backward ===========
            model.opti_AE.zero_grad()
            loss.backward()

            # clip each gradient element to [-1, 1]; this helps learning
            if args.grad_clip:
                for p in model.AE_params:
                    p.register_hook(lambda grad: torch.clamp(grad, -1, 1))

            model.opti_AE.step()

        # storing the loss values
        loss_data_alt.add(loss_alt.item())
        loss_data_rad.add(loss_rad.item())

    if not args.adversarial:
        accufin = 0

    # output of the various losses
    result = (loss_data.value()[0], len(loader), loss_data_alt.value()[0],
              loss_data_rad.value()[0], loss_disc_val.value()[0], accufin)

    return result
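# The "defiance" branch above is a heteroscedastic reconstruction loss: the network
# predicts a per-pixel variance map alongside the radiometry, the squared error is
# down-weighted by that variance, and a log-variance term stops the variance from
# growing unboundedly. A minimal standalone sketch (hypothetical names, independent of
# the loss_fun module used above) could be:
import torch

def defiance_loss(pred, target, variance, eps=1e-5):
    """Variance-weighted MSE: mean((target - pred)^2 / var + 0.5 * log(var))."""
    squared_error = (target - pred) ** 2
    return torch.mean(squared_error / (variance + eps) + 0.5 * torch.log(variance + eps))

# usage sketch: the variance must stay positive, e.g. via exp/softplus of a raw output
pred = torch.randn(8, 1, 32, 32)
target = torch.randn(8, 1, 32, 32)
variance = torch.rand(8, 1, 32, 32) + 0.1
print(defiance_loss(pred, target, variance))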
            transform=torchvision.transforms.Compose([
                torchvision.transforms.ToTensor(),
            ])),
        shuffle=True, batch_size=batch_size)
    return train_loader, test_loader


if __name__ == '__main__':
    torch.random.manual_seed(1234)
    np.random.seed(1234)

    epochs = 10
    lr = 0.01
    batch_size = 32

    optimizer = optimizers.SGD(learning_rate=lr)
    criterion = loss.CrossEntropy()
    # named net_layers to avoid shadowing the `layers` module
    net_layers = [
        layers.LinearLayer(784, 512),
        layers.ReLU(),
        layers.Dropout(keep_rate=0.8),
        layers.LinearLayer(512, 512),
        layers.ReLU(),
        layers.Dropout(keep_rate=0.8),
        layers.LinearLayer(512, 10),
    ]
    model = Model(net_layers, optimizer, criterion)

    train_loader, test_loader = get_dataset(batch_size)

    for epoch_id in range(epochs):
        model.train()
        total = 0
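# The Dropout(keep_rate=0.8) layers above presumably follow the inverted-dropout
# convention: at train time each unit is kept with probability keep_rate and the
# survivors are rescaled by 1/keep_rate, so inference needs no extra scaling. A
# minimal NumPy sketch (hypothetical, not the repo's layers.Dropout) might be:
import numpy as np

class InvertedDropout:
    def __init__(self, keep_rate=0.8):
        self.keep_rate = keep_rate
        self.mask = None

    def forward(self, x, training=True):
        if not training:
            return x                                   # identity at test time
        self.mask = (np.random.rand(*x.shape) < self.keep_rate) / self.keep_rate
        return x * self.mask

    def backward(self, grad_out):
        return grad_out * self.mask                    # gradient only through kept units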
                    model.Linear(25, 25, initOption='He'), model.ReLU(),
                    model.Linear(25, 25, initOption='He'), model.ReLU(),
                    model.Linear(25, 25, initOption='He'), model.ReLU(),
                    model.Linear(25, 2, initOption='Xavier'))

Model_8 = model.MLP(model.Linear(2, 25), model.ReLU(),
                    model.Linear(25, 25), model.ReLU(),
                    model.Linear(25, 25), model.ReLU(),
                    model.Linear(25, 25), model.ReLU(),
                    model.Linear(25, 2))

learning_rate = 0.01
nb_epochs = 200
l_CE = []
text = ['Tanh', 'ReLU + BN', 'ReLU + Init', 'ReLU']

for i, M in enumerate([Model_5, Model_6, Model_7, Model_8]):
    loss_fnc = loss.CrossEntropy(M)
    print('Model', i + 5)
    print('Loss function: Cross-Entropy; Activation function:', text[i])
    l_CE.append(train_model(M, train_input, train_target, loss_fnc, nb_epochs, learning_rate))

    print("---------------------- Error ---------------------")
    nb_train_errors = compute_nb_errors(M, train_input, train_target)
    nb_test_errors = compute_nb_errors(M, test_input, test_target)
    print('Test error Net {:0.2f}% {:d}/{:d}'.format(
        (100 * nb_test_errors) / test_input.size(0), nb_test_errors, test_input.size(0)))
    print('Train error Net {:0.2f}% {:d}/{:d}'.format(
        (100 * nb_train_errors) / train_input.size(0), nb_train_errors, train_input.size(0)))
    print("--------------------------------------------------\n")
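# Model_7 mixes He initialization for the ReLU hidden layers with Xavier for the output
# layer. A minimal sketch of the two scaling rules (hypothetical helper, weights assumed
# to have shape (fan_in, fan_out)) could be:
import torch

def init_weights(fan_in, fan_out, init_option='He'):
    """He: Var(W) = 2 / fan_in (ReLU layers); Xavier: Var(W) = 2 / (fan_in + fan_out)."""
    if init_option == 'He':
        std = (2.0 / fan_in) ** 0.5
    elif init_option == 'Xavier':
        std = (2.0 / (fan_in + fan_out)) ** 0.5
    else:
        std = 1.0
    return torch.randn(fan_in, fan_out) * std

w_hidden = init_weights(25, 25, 'He')       # hidden layer followed by ReLU
w_output = init_weights(25, 2, 'Xavier')    # output layer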