def generate_inverted_image_specific_layer(self, input_image, img_size, target_layer=3):
    # Generate a random image which we will optimize
    opt_img = Variable(1e-1 * torch.randn(1, 3, img_size, img_size), requires_grad=True)
    # Define optimizer for previously created image
    optimizer = SGD([opt_img], lr=1e4, momentum=0.9)
    # Get the output from the model after a forward pass until target_layer
    # with the input image (real image, NOT the randomly generated one)
    input_image_layer_output = \
        self.get_output_from_specific_layer(input_image, target_layer)

    # Alpha regularization parameters
    # Parameter alpha, which is actually the sixth norm
    alpha_reg_alpha = 6
    # The multiplier, lambda alpha
    alpha_reg_lambda = 1e-7

    # Total variation regularization parameters
    # Parameter beta, which is actually the second norm
    tv_reg_beta = 2
    # The multiplier, lambda beta
    tv_reg_lambda = 1e-8

    for i in range(201):
        optimizer.zero_grad()
        # Get the output from the model after a forward pass until target_layer
        # with the generated image (randomly generated one, NOT the real image)
        output = self.get_output_from_specific_layer(opt_img, target_layer)
        # Calculate Euclidean loss
        euc_loss = 1e-1 * self.euclidian_loss(input_image_layer_output.detach(), output)
        # Calculate alpha regularization
        reg_alpha = alpha_reg_lambda * self.alpha_norm(opt_img, alpha_reg_alpha)
        # Calculate total variation regularization
        reg_total_variation = tv_reg_lambda * self.total_variation_norm(opt_img, tv_reg_beta)
        # Sum all to optimize
        loss = euc_loss + reg_alpha + reg_total_variation
        # Step
        loss.backward()
        optimizer.step()
        # Generate image every 5 iterations
        if i % 5 == 0:
            print('Iteration:', str(i), 'Loss:', loss.item())
            x = recreate_image(opt_img)
            cv2.imwrite('../generated/Inv_Image_Layer_' + str(target_layer) +
                        '_Iteration_' + str(i) + '.jpg', x)
        # Reduce learning rate every 40 iterations
        if i % 40 == 0:
            for param_group in optimizer.param_groups:
                param_group['lr'] *= 1 / 10
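# A minimal sketch (an assumption, not the original source) of the three helpers the routine
# above relies on: `euclidian_loss`, `alpha_norm` (the alpha-norm regularizer, here used with
# alpha = 6) and `total_variation_norm` (the TV regularizer with beta = 2). Signatures are
# inferred from how they are called on the (1, 3, H, W) image tensor.
import torch


def euclidian_loss(org_matrix, target_matrix):
    # Euclidean distance between the two activations, normalized by the reference norm
    distance = torch.norm(target_matrix - org_matrix)
    return distance / torch.norm(org_matrix)


def alpha_norm(input_matrix, alpha):
    # Sum of |x|^alpha over all pixels of the generated image
    return (input_matrix.view(-1).abs() ** alpha).sum()


def total_variation_norm(input_matrix, beta):
    # Total variation: penalize differences between neighbouring pixels (beta/2 exponent)
    center = input_matrix[..., :-1, :-1]     # all but last row/column
    shifted_down = input_matrix[..., 1:, :-1]
    shifted_right = input_matrix[..., :-1, 1:]
    return (((center - shifted_down) ** 2 +
             (center - shifted_right) ** 2) ** (beta / 2)).sum()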
def test_mask_same_after_update(generate_batch):
    from torch.optim import SGD
    unary, tags, lengths = generate_batch
    h = unary.size(2)
    constraint = torch.rand(h, h) < 0.5
    crf = CRF(h, constraint=constraint)
    opt = SGD(crf.parameters(), lr=10)
    m1 = crf.constraint.numpy()
    t1 = crf.transitions_p.detach().clone().numpy()
    l = crf.neg_log_loss(unary, tags, lengths)
    l = torch.mean(l)
    l.backward()
    opt.step()
    m2 = crf.constraint.numpy()
    t2 = crf.transitions_p.detach().numpy()
    # the constraint mask must be unchanged by the update, the transitions must change
    np.testing.assert_allclose(m1, m2)
    with pytest.raises(AssertionError):
        np.testing.assert_allclose(t1, t2)
def generate(self):
    initial_learning_rate = 6
    for i in range(1, 150):
        # Process image and return variable
        self.processed_image = preprocess_image(self.created_image)
        # Define optimizer for the image
        optimizer = SGD([self.processed_image], lr=initial_learning_rate)
        # Forward
        output = self.model(self.processed_image)
        # Target specific class
        class_loss = -output[0, self.target_class]
        print('Iteration:', str(i), 'Loss', "{0:.2f}".format(class_loss.item()))
        # Zero grads
        self.model.zero_grad()
        # Backward
        class_loss.backward()
        # Update image
        optimizer.step()
        # Recreate image
        self.created_image = recreate_image(self.processed_image)
        # Save image
        cv2.imwrite('../generated/c_specific_iteration_' + str(i) + '.jpg',
                    self.created_image)
    return self.processed_image
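# A minimal sketch (assumed, not the original helpers) of the two image utilities used above:
# `preprocess_image` turns an HxWx3 uint8 array into a normalized tensor that requires
# gradients, and `recreate_image` reverses the normalization so the optimized tensor can be
# written out with cv2. The ImageNet statistics are an assumption.
import numpy as np
import torch

_MEAN = np.array([0.485, 0.456, 0.406])
_STD = np.array([0.229, 0.224, 0.225])


def preprocess_image(img):
    arr = img.astype(np.float32) / 255.0          # HxWx3 in [0, 1]
    arr = (arr - _MEAN) / _STD                    # per-channel normalization
    arr = arr.transpose(2, 0, 1)[None]            # to 1x3xHxW
    tensor = torch.from_numpy(arr.astype(np.float32))
    tensor.requires_grad_(True)                   # the image itself is what gets optimized
    return tensor


def recreate_image(tensor):
    arr = tensor.detach().numpy()[0].transpose(1, 2, 0)  # back to HxWx3
    arr = arr * _STD + _MEAN                             # undo normalization
    arr = np.clip(arr, 0, 1)
    return (arr * 255).astype(np.uint8)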
def train_siamese_distrib_margine(directory, version, model, train_loader, valid_loader,
                                  resize, batch_size, margine, exp_name='model_1',
                                  decay=None, lr=0.0001, epochs=10, momentum=0.99,
                                  logdir='logs', modeLoss=None, dizionario_array=None):
    print("momentum", momentum)
    print("lr", lr)
    print(margine)
    if modeLoss is not None:
        if modeLoss == "single":
            criterion = ContrastiveLoss(margine)
    if decay is not None:
        print("Weight_Decay", decay)
        optimizer = SGD(model.parameters(), lr, momentum=momentum, weight_decay=decay)
    else:
        optimizer = SGD(model.parameters(), lr, momentum=momentum)
    if dizionario_array is not None:
        optimizer.load_state_dict(dizionario_array["optimizer"])
    # meters
    loss_meter = AverageValueMeter()
    acc_meter = AverageValueMeter()
    # writer
    writer = SummaryWriter(join(logdir, exp_name))
    # device
    device = "cuda" if torch.cuda.is_available() else "cpu"
    model.to(device)
    criterion.to(device)
    # dictionary containing the training and validation loaders
    loader = {'train': train_loader, 'valid': valid_loader}
    if dizionario_array is not None:
        array_accuracy_train = dizionario_array["a_train"]
        array_accuracy_valid = dizionario_array["a_valid"]
        array_loss_train = dizionario_array["l_train"]
        array_loss_valid = dizionario_array["l_valid"]
        array_glb_train = dizionario_array["g_train"]
        array_glb_valid = dizionario_array["g_valid"]
        global_step = array_glb_valid[-1]
        last_loss_train = array_loss_train[-1]
        last_loss_val = array_loss_valid[-1]
        last_acc_train = array_accuracy_train[-1]
        last_acc_val = array_accuracy_valid[-1]
        epoche_fatte = dizionario_array["epoche_fatte"]
        epoche_avanza = dizionario_array["epoche_avanza"]
    else:
        array_accuracy_train = []
        array_accuracy_valid = []
        array_loss_train = []
        array_loss_valid = []
        array_glb_train = []
        array_glb_valid = []
        global_step = 0
        last_loss_train = 0
        last_loss_val = 0
        last_acc_train = 0
        last_acc_val = 0
    # initialize the global step
    tempo = Timer()
    start = timer()
    for e in range(epochs):
        print("Epoch = ", e)
        # alternate between the two modes: train and validation
        for mode in ['train', 'valid']:
            loss_meter.reset()
            acc_meter.reset()
            model.train() if mode == 'train' else model.eval()
            with torch.set_grad_enabled(mode == 'train'):  # enable gradients only during training
                for i, batch in enumerate(loader[mode]):
                    print("Num batch =", i)
                    I_i, I_j, l_ij, _, _ = [b.to(device) for b in batch]  # img1, img2, label12, label1, label2
                    # the siamese part itself is simple:
                    # run the embedding net on the two inputs
                    phi_i = model(I_i)  # img 1
                    phi_j = model(I_j)  # img 2
                    print("Output train img1", phi_i.size())
                    print("Output train img2", phi_j.size())
                    # print("Ground-truth label", l_ij)
                    euclidean_distance = F.pairwise_distance(phi_i, phi_j)
                    euclid_tmp = torch.Tensor.numpy(euclidean_distance.detach().cpu())  # distance
                    labs = l_ij.to('cpu').numpy()  # ground-truth labels
                    print(euclid_tmp)
                    etichette_predette = [euclid_tmp > margine]  # predicted labels (distance above the margin)
                    print(etichette_predette)
                    etichette_predette = np.int8(etichette_predette)
                    etichette_predette = np.reshape(etichette_predette, -1)
                    print(etichette_predette)
                    # l_ij = l_ij.type(torch.LongTensor).to(device)
                    # compute the loss
                    l = criterion(phi_i, phi_j, l_ij)
                    # update the global_step:
                    # it holds the number of samples seen during training
                    n = I_i.shape[0]  # number of elements in the batch
                    # print("number of elements in the batch ", n)
                    global_step += n
                    if mode == 'train':
                        l.backward()
                        optimizer.step()
                        optimizer.zero_grad()
                    acc = accuracy_score(np.array(labs), np.array(etichette_predette))
                    n = batch[0].shape[0]
                    loss_meter.add(l.item(), n)
                    acc_meter.add(acc, n)
                    # log per-iteration results only during training
                    if mode == 'train':
                        writer.add_scalar('loss/train', loss_meter.value(), global_step=global_step)
                        writer.add_scalar('accuracy/train', acc_meter.value(), global_step=global_step)
            # once the epoch is over (for both training and validation), log the final estimates
            if mode == 'train':
                global_step_train = global_step
                last_loss_train = loss_meter.value()
                last_acc_train = acc_meter.value()
                array_accuracy_train.append(acc_meter.value())
                array_loss_train.append(loss_meter.value())
                array_glb_train.append(global_step)
            else:
                global_step_val = global_step
                last_loss_val = loss_meter.value()
                last_acc_val = acc_meter.value()
                array_accuracy_valid.append(acc_meter.value())
                array_loss_valid.append(loss_meter.value())
                array_glb_valid.append(global_step)
            writer.add_scalar('loss/' + mode, loss_meter.value(), global_step=global_step)
            writer.add_scalar('accuracy/' + mode, acc_meter.value(), global_step=global_step)
        print("Loss TRAIN", array_loss_train)
        print("Loss VALID", array_loss_valid)
        print("Accuracy TRAIN", array_accuracy_train)
        print("Accuracy VALID", array_accuracy_valid)
        print("dim acc train", len(array_accuracy_train))
        print("dim acc valid", len(array_accuracy_valid))
        plt.figure(figsize=(12, 8))
        plt.plot(array_glb_train, array_accuracy_train)
        plt.plot(array_glb_valid, array_accuracy_valid)
        plt.xlabel('samples')
        plt.ylabel('accuracy')
        plt.grid()
        plt.legend(['Training', 'Valid'])
        plt.savefig(directory + '//plotAccuracy_' + version + '.png')
        plt.show()
        plt.figure(figsize=(12, 8))
        plt.plot(array_glb_train, array_loss_train)
        plt.plot(array_glb_valid, array_loss_valid)
        plt.xlabel('samples')
        plt.ylabel('loss')
        plt.grid()
        plt.legend(['Training', 'Valid'])
        plt.savefig(directory + '//plotLoss_' + version + '.png')
        plt.show()
        saveArray(directory, version, array_loss_train, array_loss_valid,
                  array_accuracy_train, array_accuracy_valid,
                  array_glb_train, array_glb_valid)
        saveinFileJson(start, directory, version, resize, batch_size, e, lr, momentum,
                       len(train_loader), array_accuracy_train[-1], array_accuracy_valid[-1],
                       array_loss_train[-1], array_loss_valid[-1])
        # writer.add_embedding(phi_i, batch[3], I_i, global_step=global_step, tag=exp_name + '_embedding')
        # save the model weights at the end of a training/validation cycle
        net_save(epochs, model, optimizer, last_loss_train, last_loss_val,
                 last_acc_train, last_acc_val, global_step_train, global_step_val,
                 '%s.pth' % (exp_name + "_dict"))
        torch.save(model, '%s.pth' % exp_name)
        torch.save(model, directory + "//" + version + "//" + '%s.pth' % (exp_name + "_" + str(e)))
    f = '{:.7f}'.format(tempo.stop())
    return model, f, last_loss_train, last_loss_val, last_acc_train, last_acc_val
def train(train_source_iter: ForeverDataIterator, train_target_iter: ForeverDataIterator,
          G: nn.Module, F1: ImageClassifierHead, F2: ImageClassifierHead,
          optimizer_g: SGD, optimizer_f: SGD, epoch: int, args: argparse.Namespace):
    batch_time = AverageMeter('Time', ':3.1f')
    data_time = AverageMeter('Data', ':3.1f')
    losses = AverageMeter('Loss', ':3.2f')
    trans_losses = AverageMeter('Trans Loss', ':3.2f')
    cls_accs = AverageMeter('Cls Acc', ':3.1f')
    tgt_accs = AverageMeter('Tgt Acc', ':3.1f')
    progress = ProgressMeter(
        args.iters_per_epoch,
        [batch_time, data_time, losses, trans_losses, cls_accs, tgt_accs],
        prefix="Epoch: [{}]".format(epoch))

    # switch to train mode
    G.train()
    F1.train()
    F2.train()

    end = time.time()
    for i in range(args.iters_per_epoch):
        x_s, labels_s = next(train_source_iter)
        x_t, labels_t = next(train_target_iter)
        x_s = x_s.to(device)
        x_t = x_t.to(device)
        labels_s = labels_s.to(device)
        labels_t = labels_t.to(device)
        x = torch.cat((x_s, x_t), dim=0)
        assert x.requires_grad is False

        # measure data loading time
        data_time.update(time.time() - end)

        # Step A: train all networks to minimize the loss on the source domain
        optimizer_g.zero_grad()
        optimizer_f.zero_grad()

        g = G(x)
        y_1 = F1(g)
        y_2 = F2(g)
        y1_s, y1_t = y_1.chunk(2, dim=0)
        y2_s, y2_t = y_2.chunk(2, dim=0)
        y1_t, y2_t = F.softmax(y1_t, dim=1), F.softmax(y2_t, dim=1)
        loss = F.cross_entropy(y1_s, labels_s) + F.cross_entropy(y2_s, labels_s) + \
            0.01 * (entropy(y1_t) + entropy(y2_t))
        loss.backward()
        optimizer_g.step()
        optimizer_f.step()

        # Step B: train the classifiers to maximize the discrepancy
        optimizer_g.zero_grad()
        optimizer_f.zero_grad()

        g = G(x)
        y_1 = F1(g)
        y_2 = F2(g)
        y1_s, y1_t = y_1.chunk(2, dim=0)
        y2_s, y2_t = y_2.chunk(2, dim=0)
        y1_t, y2_t = F.softmax(y1_t, dim=1), F.softmax(y2_t, dim=1)
        loss = F.cross_entropy(y1_s, labels_s) + F.cross_entropy(y2_s, labels_s) + \
            0.01 * (entropy(y1_t) + entropy(y2_t)) - classifier_discrepancy(y1_t, y2_t) * args.trade_off
        loss.backward()
        optimizer_f.step()

        # Step C: train the generator to minimize the discrepancy
        for k in range(args.num_k):
            optimizer_g.zero_grad()
            g = G(x)
            y_1 = F1(g)
            y_2 = F2(g)
            y1_s, y1_t = y_1.chunk(2, dim=0)
            y2_s, y2_t = y_2.chunk(2, dim=0)
            y1_t, y2_t = F.softmax(y1_t, dim=1), F.softmax(y2_t, dim=1)
            mcd_loss = classifier_discrepancy(y1_t, y2_t) * args.trade_off
            mcd_loss.backward()
            optimizer_g.step()

        cls_acc = accuracy(y1_s, labels_s)[0]
        tgt_acc = accuracy(y1_t, labels_t)[0]

        losses.update(loss.item(), x_s.size(0))
        cls_accs.update(cls_acc.item(), x_s.size(0))
        tgt_accs.update(tgt_acc.item(), x_t.size(0))
        trans_losses.update(mcd_loss.item(), x_s.size(0))

        # measure elapsed time
        batch_time.update(time.time() - end)
        end = time.time()

        if i % args.print_freq == 0:
            progress.display(i)
def train(opt):
    from tensorboardX import SummaryWriter
    writer = SummaryWriter(path_output)

    source1, source2, source3, target = taskSelect(opt.target)
    dataset_s1 = dataset.DA(dir=root, name=source1, img_size=(224, 224), train=True)
    dataset_s2 = dataset.DA(dir=root, name=source2, img_size=(224, 224), train=True)
    dataset_s3 = dataset.DA(dir=root, name=source3, img_size=(224, 224), train=True)
    dataset_t = dataset.DA(dir=root, name=target, img_size=(224, 224), train=True)
    dataset_tt = dataset.DA(dir=root, name=target, img_size=(224, 224), train=False, real_val=False)
    dataloader_s1 = DataLoader(dataset_s1, batch_size=opt.bs, shuffle=True, num_workers=2)
    dataloader_s2 = DataLoader(dataset_s2, batch_size=opt.bs, shuffle=True, num_workers=2)
    dataloader_s3 = DataLoader(dataset_s3, batch_size=opt.bs, shuffle=True, num_workers=2)
    dataloader_t = DataLoader(dataset_t, batch_size=opt.bs, shuffle=True, num_workers=2)
    dataloader_tt = DataLoader(dataset_tt, batch_size=opt.bs, shuffle=False, num_workers=2)

    # dataset_s1 = dataset.DA(dir=root, name=source1, img_size=(224, 224), train=True)
    # dataset_s2 = dataset.DA(dir=root, name=source2, img_size=(224, 224), train=True)
    # dataset_s3 = dataset.DA(dir=root, name=source3, img_size=(224, 224), train=True)
    # dataset_t = dataset.DA(dir=root, name=target, img_size=(224, 224), train=True)
    # if target == 'real':
    #     tmp = os.path.join(root, 'test')
    #     dataset_tt = dataset.DA_test(dir=tmp, img_size=(224, 224))
    # else:
    #     dataset_tt = dataset.DA(dir=root, name=target, img_size=(224, 224), train=False)
    # dataloader_s1 = DataLoader(dataset_s1, batch_size=opt.bs, shuffle=True, num_workers=2)
    # dataloader_s2 = DataLoader(dataset_s2, batch_size=opt.bs, shuffle=True, num_workers=2)
    # dataloader_s3 = DataLoader(dataset_s3, batch_size=opt.bs, shuffle=True, num_workers=2)
    # dataloader_t = DataLoader(dataset_t, batch_size=opt.bs, shuffle=True, num_workers=2)
    # dataloader_tt = DataLoader(dataset_tt, batch_size=opt.bs, shuffle=False, num_workers=2)

    len_data = min(len(dataset_s1), len(dataset_s2), len(dataset_s3), len(dataset_t))  # length of "shorter" domain
    len_bs = min(len(dataloader_s1), len(dataloader_s2), len(dataloader_s3), len(dataloader_t))

    # Define networks
    feature_extractor = models.feature_extractor()
    classifier_1 = models.class_classifier()
    classifier_2 = models.class_classifier()
    classifier_3 = models.class_classifier()
    classifier_1_ = models.class_classifier()
    classifier_2_ = models.class_classifier()
    classifier_3_ = models.class_classifier()

    # if torch.cuda.is_available():
    feature_extractor = feature_extractor.to(device)
    classifier_1 = classifier_1.to(device).apply(weight_init)
    classifier_2 = classifier_2.to(device).apply(weight_init)
    classifier_3 = classifier_3.to(device).apply(weight_init)
    classifier_1_ = classifier_1_.to(device).apply(weight_init)
    classifier_2_ = classifier_2_.to(device).apply(weight_init)
    classifier_3_ = classifier_3_.to(device).apply(weight_init)

    # Define losses
    mom_loss = momentumLoss()
    cl_loss = nn.CrossEntropyLoss()
    disc_loss = discrepancyLoss()

    # Optimizers
    # Change the LR
    optimizer_features = SGD(feature_extractor.parameters(), lr=0.0001, momentum=0.9, weight_decay=5e-4)
    optimizer_classifier = SGD([{'params': classifier_1.parameters()},
                                {'params': classifier_2.parameters()},
                                {'params': classifier_3.parameters()}],
                               lr=0.002, momentum=0.9, weight_decay=5e-4)
    optimizer_classifier_ = SGD([{'params': classifier_1_.parameters()},
                                 {'params': classifier_2_.parameters()},
                                 {'params': classifier_3_.parameters()}],
                                lr=0.002, momentum=0.9, weight_decay=5e-4)
    # optimizer_features = SGD(feature_extractor.parameters(), lr=0.0001)
    # optimizer_classifier = Adam([{'params': classifier_1.parameters()},
    #                              {'params': classifier_2.parameters()},
    #                              {'params': classifier_3.parameters()}], lr=0.002)
    # optimizer_classifier_ = Adam([{'params': classifier_1_.parameters()},
    #                               {'params': classifier_2_.parameters()},
    #                               {'params': classifier_3_.parameters()}], lr=0.002)

    if opt.pretrain is not None:
        state = torch.load(opt.pretrain)
        feature_extractor.load_state_dict(state['feature_extractor'])
        classifier_1.load_state_dict(state['{}_classifier'.format(source1)])
        classifier_2.load_state_dict(state['{}_classifier'.format(source2)])
        classifier_3.load_state_dict(state['{}_classifier'.format(source3)])
        classifier_1_.load_state_dict(state['{}_classifier_'.format(source1)])
        classifier_2_.load_state_dict(state['{}_classifier_'.format(source2)])
        classifier_3_.load_state_dict(state['{}_classifier_'.format(source3)])

    # Lists
    train_loss = []
    acc_on_target = []
    tot_loss, tot_clf_loss, tot_mom_loss, tot_s2_loss, tot_s3_loss = 0.0, 0.0, 0.0, 0.0, 0.0
    n_samples, iteration = 0, 0
    tot_correct = [0, 0, 0, 0, 0, 0]
    saved_time = time.time()

    feature_extractor.train()
    classifier_1.train(), classifier_2.train(), classifier_3.train()
    classifier_1_.train(), classifier_2_.train(), classifier_3_.train()

    for epoch in range(opt.ep):
        if epoch + 1 == 5:
            optimizer_classifier = SGD([{'params': classifier_1.parameters()},
                                        {'params': classifier_2.parameters()},
                                        {'params': classifier_3.parameters()}],
                                       lr=0.001, momentum=0.9, weight_decay=5e-4)
            optimizer_classifier_ = SGD([{'params': classifier_1_.parameters()},
                                         {'params': classifier_2_.parameters()},
                                         {'params': classifier_3_.parameters()}],
                                        lr=0.001, momentum=0.9, weight_decay=5e-4)
        if epoch + 1 == 10:
            optimizer_classifier = SGD([{'params': classifier_1.parameters()},
                                        {'params': classifier_2.parameters()},
                                        {'params': classifier_3.parameters()}],
                                       lr=0.0001, momentum=0.9, weight_decay=5e-4)
            optimizer_classifier_ = SGD([{'params': classifier_1_.parameters()},
                                         {'params': classifier_2_.parameters()},
                                         {'params': classifier_3_.parameters()}],
                                        lr=0.0001, momentum=0.9, weight_decay=5e-4)

        for i, (data_1, data_2, data_3, data_t) in enumerate(zip(dataloader_s1, dataloader_s2,
                                                                 dataloader_s3, dataloader_t)):
            img1, lb1 = data_1
            img2, lb2 = data_2
            img3, lb3 = data_3
            imgt, _ = data_t

            # Prepare data
            cur_batch = min(img1.shape[0], img2.shape[0], img3.shape[0], imgt.shape[0])
            # print(i, cur_batch)
            img1, lb1 = Variable(img1[0:cur_batch, :, :, :]).to(device), Variable(lb1[0:cur_batch]).to(device)
            img2, lb2 = Variable(img2[0:cur_batch, :, :, :]).to(device), Variable(lb2[0:cur_batch]).to(device)
            img3, lb3 = Variable(img3[0:cur_batch, :, :, :]).to(device), Variable(lb3[0:cur_batch]).to(device)
            imgt = Variable(imgt[0:cur_batch, :, :, :]).to(device)

            ### STEP 1 ### train G and C pairs
            # Forward
            optimizer_features.zero_grad()
            optimizer_classifier.zero_grad()
            optimizer_classifier_.zero_grad()

            # Extract features
            ft1 = feature_extractor(img1)
            ft2 = feature_extractor(img2)
            ft3 = feature_extractor(img3)
            ft_t = feature_extractor(imgt)

            # Class prediction [bs, 345]
            cl1, cl1_ = classifier_1(ft1), classifier_1_(ft1)
            cl2, cl2_ = classifier_2(ft2), classifier_2_(ft2)
            cl3, cl3_ = classifier_3(ft3), classifier_3_(ft3)

            # Compute "momentum loss"
            loss_mom = mom_loss(ft1, ft2, ft3, ft_t)

            # Cross-entropy losses
            l1, l1_ = cl_loss(cl1, lb1), cl_loss(cl1_, lb1)
            l2, l2_ = cl_loss(cl2, lb2), cl_loss(cl2_, lb2)
            l3, l3_ = cl_loss(cl3, lb3), cl_loss(cl3_, lb3)

            # total loss
            s1loss = l1 + l2 + l3 + l1_ + l2_ + l3_ + opt.alpha * loss_mom
            s1loss.backward()
            optimizer_features.step()
            optimizer_classifier.step()
            optimizer_classifier_.step()

            ### STEP 2 ### fix G, and train C pairs
            optimizer_classifier.zero_grad()
            optimizer_classifier_.zero_grad()

            # Class prediction on each src domain
            cl1, cl1_ = classifier_1(ft1.detach()), classifier_1_(ft1.detach())
            cl2, cl2_ = classifier_2(ft2.detach()), classifier_2_(ft2.detach())
            cl3, cl3_ = classifier_3(ft3.detach()), classifier_3_(ft3.detach())
            # discrepancy on tgt domain
            clt1, clt1_ = classifier_1(ft_t.detach()), classifier_1_(ft_t.detach())
            clt2, clt2_ = classifier_2(ft_t.detach()), classifier_2_(ft_t.detach())
            clt3, clt3_ = classifier_3(ft_t.detach()), classifier_3_(ft_t.detach())

            # classification loss
            l1, l1_ = cl_loss(cl1, lb1), cl_loss(cl1_, lb1)
            l2, l2_ = cl_loss(cl2, lb2), cl_loss(cl2_, lb2)
            l3, l3_ = cl_loss(cl3, lb3), cl_loss(cl3_, lb3)
            # print(clt1.shape)
            dl1 = disc_loss(clt1, clt1_)
            dl2 = disc_loss(clt2, clt2_)
            dl3 = disc_loss(clt3, clt3_)
            # print(dl1, dl2, dl3)

            # backward
            s2loss = l1 + l2 + l3 + l1_ + l2_ + l3_ - dl1 - dl2 - dl3
            s2loss.backward()
            optimizer_classifier.step()
            optimizer_classifier_.step()

            ### STEP 3 ### fix C pairs, train G
            optimizer_features.zero_grad()
            ft_t = feature_extractor(imgt)
            clt1, clt1_ = classifier_1(ft_t), classifier_1_(ft_t)
            clt2, clt2_ = classifier_2(ft_t), classifier_2_(ft_t)
            clt3, clt3_ = classifier_3(ft_t), classifier_3_(ft_t)
            dl1 = disc_loss(clt1, clt1_)
            dl2 = disc_loss(clt2, clt2_)
            dl3 = disc_loss(clt3, clt3_)
            s3loss = dl1 + dl2 + dl3
            s3loss.backward()
            optimizer_features.step()

            pred = torch.stack((cl1, cl2, cl3, cl1_, cl2_, cl3_), 0)  # [6, bs, 345]
            _, pred = torch.max(pred, dim=2)                          # [6, bs]
            gt = torch.stack((lb1, lb2, lb3, lb1, lb2, lb3), 0)       # [6, bs]
            correct = pred.eq(gt.data)
            correct = torch.mean(correct.type(torch.FloatTensor), dim=1).cpu().numpy()

            tot_loss += s1loss.item() * cur_batch
            tot_clf_loss += (s1loss.item() - opt.alpha * loss_mom.item()) * cur_batch
            tot_s2_loss += s2loss.item() * cur_batch
            tot_s3_loss += s3loss.item() * cur_batch
            tot_mom_loss += loss_mom.item() * cur_batch
            tot_correct += correct * cur_batch
            n_samples += cur_batch
            # print(cur_batch)

            if iteration % opt.log_interval == 0:
                current_time = time.time()
                print('Train Epoch: {} [{}/{} ({:.0f}%)]\tClfLoss: {:.4f}\tMMLoss: {:.4f}\t'
                      'S2Loss: {:.4f}\tS3Loss: {:.4f}\t'
                      'Accu: {:.4f}\\{:.4f}\\{:.4f}\\{:.4f}\\{:.4f}\\{:.4f}\tTime: {:.3f}'.format(
                          epoch, i * opt.bs, len_data, 100. * i / len_bs,
                          tot_clf_loss / n_samples, tot_mom_loss / n_samples,
                          tot_s2_loss / n_samples, tot_s3_loss / n_samples,
                          tot_correct[0] / n_samples, tot_correct[1] / n_samples,
                          tot_correct[2] / n_samples, tot_correct[3] / n_samples,
                          tot_correct[4] / n_samples, tot_correct[5] / n_samples,
                          current_time - saved_time))
                writer.add_scalar('Train/ClfLoss', tot_clf_loss / n_samples, iteration * opt.bs)
                writer.add_scalar('Train/MMLoss', tot_mom_loss / n_samples, iteration * opt.bs)
                writer.add_scalar('Train/s2Loss', tot_s2_loss / n_samples, iteration * opt.bs)
                writer.add_scalar('Train/s3Loss', tot_s3_loss / n_samples, iteration * opt.bs)
                writer.add_scalar('Train/Accu0', tot_correct[0] / n_samples, iteration * opt.bs)
                writer.add_scalar('Train/Accu1', tot_correct[1] / n_samples, iteration * opt.bs)
                writer.add_scalar('Train/Accu2', tot_correct[2] / n_samples, iteration * opt.bs)
                writer.add_scalar('Train/Accu0_', tot_correct[3] / n_samples, iteration * opt.bs)
                writer.add_scalar('Train/Accu1_', tot_correct[4] / n_samples, iteration * opt.bs)
                writer.add_scalar('Train/Accu2_', tot_correct[5] / n_samples, iteration * opt.bs)

                saved_weight = torch.FloatTensor([tot_correct[0], tot_correct[1], tot_correct[2],
                                                  tot_correct[3], tot_correct[4], tot_correct[5]]).to(device)
                if torch.sum(saved_weight) == 0.:
                    saved_weight = torch.FloatTensor(6).to(device).fill_(1) / 6.
                else:
                    saved_weight = saved_weight / torch.sum(saved_weight)

                saved_time = time.time()
                tot_clf_loss, tot_mom_loss, tot_correct, n_samples = 0, 0, [0, 0, 0, 0, 0, 0], 0
                tot_s2_loss, tot_s3_loss = 0, 0
                train_loss.append(tot_loss)

            # evaluation and save
            if iteration % opt.eval_interval == 0 and iteration >= 0 and target != 'real':
                print('weight = ', saved_weight.cpu().numpy())
                evalacc = eval(saved_weight, feature_extractor,
                               classifier_1_, classifier_2_, classifier_3_,
                               classifier_1, classifier_2, classifier_3, dataloader_tt)
                writer.add_scalar('Test/Accu', evalacc, iteration * opt.bs)
                acc_on_target.append(evalacc)
                print('Eval Acc = {:.2f}\n'.format(evalacc * 100))
                torch.save({
                    'epoch': epoch,
                    'feature_extractor': feature_extractor.state_dict(),
                    '{}_classifier'.format(source1): classifier_1.state_dict(),
                    '{}_classifier'.format(source2): classifier_2.state_dict(),
                    '{}_classifier'.format(source3): classifier_3.state_dict(),
                    '{}_classifier_'.format(source1): classifier_1_.state_dict(),
                    '{}_classifier_'.format(source2): classifier_2_.state_dict(),
                    '{}_classifier_'.format(source3): classifier_3_.state_dict(),
                    'features_optimizer': optimizer_features.state_dict(),
                    'classifier_optimizer': optimizer_classifier.state_dict(),
                    'loss': tot_loss,
                    'saved_weight': saved_weight
                }, os.path.join(path_output, target + '-{}-{:.2f}.pth'.format(epoch, evalacc * 100)))

            iteration += 1

    pkl.dump(train_loss, open('{}train_loss.p'.format(path_output), 'wb'))
    if target != 'real':
        pkl.dump(acc_on_target, open('{}target_accuracy.p'.format(path_output), 'wb'))
def main():
    training_size = 1  # 10000
    valid_size = 1     # 1000
    test_size = 15     # 1000
    epochs_num = 1     # 1000
    hidden_size = 60   # 5
    batch_size = 1     # 100
    data_length = 60

    train_x, train_t = mkDataSet(training_size)
    valid_x, valid_t = mkDataSet(valid_size)
    # print(valid_t)

    model = Predictor(2, hidden_size, 2)
    criterion = nn.MSELoss()
    optimizer = SGD(model.parameters(), lr=0.01)

    for epoch in range(epochs_num):
        # training
        running_loss = 0.0
        training_accuracy = 0.0
        for i in range(int(training_size / batch_size)):
            optimizer.zero_grad()            # reset gradients
            data, label = mkRandomBatch(train_x, train_t, batch_size)
            output = model(data)             # forward pass
            loss = criterion(output, label)  # compute the loss
            loss.backward()                  # compute gradients
            optimizer.step()                 # update parameters
            running_loss += loss.item()
            # training_accuracy += np.sum(np.abs((output.data - label.data).numpy()) < 0.1)
            # print(label.data)
            training_accuracy = mean_squared_error(np.ravel(output.data),
                                                   np.ravel(label.data))  # error measured as MSE
            # print('MSE Train : %.3f' % training_accuracy)

        # validation
        test_accuracy = 0.0
        for i in range(int(valid_size / batch_size)):
            offset = i * batch_size
            data, label = torch.FloatTensor(valid_x[offset:offset + batch_size]), \
                torch.FloatTensor(valid_t[offset:offset + batch_size])
            output = model(data, None)
            # test_accuracy += np.sum(np.abs((output.data - label.data).numpy()) < 10)
            test_accuracy = mean_squared_error(np.ravel(output.data), np.ravel(label.data))
            # print(output.data)
            # print(label.data)

        # training_accuracy /= training_size
        # test_accuracy /= valid_size
        print('%d loss: %.3f, training_accuracy: %.5f, valid_accuracy: %.5f' % (
            epoch + 1, running_loss, training_accuracy, test_accuracy))

    # test
    test_accuracy = 0.0
    test_x, test_t = mkTestSet(test_size)
    result = []
    process = []
    # at test time the labels are the ground truth, so compare output.data with label.data
    for i in range(int(test_size / batch_size)):
        offset = i * batch_size
        data, label = torch.FloatTensor(test_x[offset:offset + batch_size]), \
            torch.FloatTensor(test_t[offset:offset + batch_size])
        output = model(data, None)
        test_accuracy = mean_squared_error(np.ravel(output.data), np.ravel(label.data))
        process = output.data.numpy().flatten()
        result.append(process)

    print('%d loss: %.3f, training_accuracy: %.5f, test_accuracy: %.5f' % (
        epoch + 1, running_loss, training_accuracy, test_accuracy))
    # print(test_x)
    # print(type(result))
    # print(result)
    # data_np = result.numpy()
    data_np = np.asarray(result).flatten()
    # data_np[data_np % 2 == 0] = (data_np[data_np % 2 == 0] + 1) * 1280 / 2
    print(len(data_np))
    print(data_np)
    data_np = np.resize(data_np, (test_size * data_length, 2))
    # print(data_np[:, ([0] + 1) * 1280 / 2])
    # submission = pd.Series(data_np)  # name=['x', 'y'])
    # submission.to_csv("C:\\Users\\010170243\\work\\seq2seq\\dataset\\kusakaGomiToCSV\\all\\kusaka_result.csv",
    #                   header=True, index_label='id')
    np.savetxt(
        "C:\\Users\\010170243\\work\\seq2seq\\dataset\\kusakaGomiToCSV\\all\\kusaka_result.csv",  # file name
        X=data_np,         # array to save
        delimiter=",",     # delimiter
        fmt='%.15f',
        header="x,y",
    )
class BaseModel(object):
    def __init__(self, n_ent, n_rel, args, struct):
        self.model = KGEModule(n_ent, n_rel, args, struct)
        self.model.cuda()
        self.n_ent = n_ent
        self.n_rel = n_rel
        self.time_tot = 0
        self.args = args

    def train(self, train_data, tester_val, tester_tst):
        head, tail, rela = train_data
        # useful information related to cache
        n_train = len(head)

        if self.args.optim == 'adam' or self.args.optim == 'Adam':
            self.optimizer = Adam(self.model.parameters(), lr=self.args.lr)
        elif self.args.optim == 'adagrad' or self.args.optim == 'Adagrad':
            self.optimizer = Adagrad(self.model.parameters(), lr=self.args.lr)
        else:
            self.optimizer = SGD(self.model.parameters(), lr=self.args.lr)
        scheduler = ExponentialLR(self.optimizer, self.args.decay_rate)

        n_epoch = self.args.n_epoch
        n_batch = self.args.n_batch
        best_mrr = 0

        # used for counting repeated triplets for margin based loss
        for epoch in range(n_epoch):
            start = time.time()
            self.epoch = epoch
            rand_idx = torch.randperm(n_train)
            head = head[rand_idx].cuda()
            tail = tail[rand_idx].cuda()
            rela = rela[rand_idx].cuda()
            epoch_loss = 0

            for h, t, r in batch_by_size(n_batch, head, tail, rela, n_sample=n_train):
                self.model.zero_grad()
                loss = self.model.forward(h, t, r)
                loss += self.args.lamb * self.model.regul
                loss.backward()
                self.optimizer.step()
                self.prox_operator()
                epoch_loss += loss.data.cpu().numpy()

            self.time_tot += time.time() - start
            scheduler.step()

            if (epoch + 1) % self.args.epoch_per_test == 0:
                # output performance
                valid_mrr, valid_mr, valid_10 = tester_val()
                test_mrr, test_mr, test_10 = tester_tst()
                out_str = '%.4f\t%.4f\t\t%.4f\t%.4f\n' % (valid_mrr, valid_10, test_mrr, test_10)

                # output the best performance info
                if valid_mrr > best_mrr:
                    best_mrr = valid_mrr
                    best_str = out_str
                if best_mrr < self.args.thres:
                    print('\tearly stopped in Epoch:{}, best_mrr:{}'.format(epoch + 1, best_mrr),
                          self.model.struct)
                    return best_mrr, best_str
        return best_mrr, best_str

    def prox_operator(self):
        # project entity embeddings back onto the unit L2 ball
        for n, p in self.model.named_parameters():
            if 'ent' in n:
                X = p.data.clone()
                Z = torch.norm(X, p=2, dim=1, keepdim=True)
                Z[Z < 1] = 1
                X = X / Z
                p.data.copy_(X.view(self.n_ent, -1))

    def test_link(self, test_data, head_filter, tail_filter):
        heads, tails, relas = test_data
        batch_size = self.args.test_batch_size
        num_batch = len(heads) // batch_size + int(len(heads) % batch_size > 0)
        head_probs = []
        tail_probs = []
        for i in range(num_batch):
            start = i * batch_size
            end = min((i + 1) * batch_size, len(heads))
            batch_h = heads[start:end].cuda()
            batch_t = tails[start:end].cuda()
            batch_r = relas[start:end].cuda()
            h_embed = self.model.ent_embed(batch_h)
            r_embed = self.model.rel_embed(batch_r)
            t_embed = self.model.ent_embed(batch_t)
            head_scores = torch.sigmoid(self.model.test_head(r_embed, t_embed)).data
            tail_scores = torch.sigmoid(self.model.test_tail(h_embed, r_embed)).data
            head_probs.append(head_scores.data.cpu().numpy())
            tail_probs.append(tail_scores.data.cpu().numpy())
        head_probs = np.concatenate(head_probs) * head_filter
        tail_probs = np.concatenate(tail_probs) * tail_filter
        head_ranks = cal_ranks(head_probs, label=heads.data.numpy())
        tail_ranks = cal_ranks(tail_probs, label=tails.data.numpy())
        h_mrr, h_mr, h_h10 = cal_performance(head_ranks)
        t_mrr, t_mr, t_h10 = cal_performance(tail_ranks)
        mrr = (h_mrr + t_mrr) / 2
        mr = (h_mr + t_mr) / 2
        h10 = (h_h10 + t_h10) / 2
        return mrr, mr, h10
def train(train_source_iter: ForeverDataIterator, train_target_iter: ForeverDataIterator,
          model: ImageClassifier, jmmd_loss: JointMultipleKernelMaximumMeanDiscrepancy,
          optimizer: SGD, lr_sheduler: StepwiseLR, epoch: int, args: argparse.Namespace):
    batch_time = AverageMeter('Time', ':4.2f')
    data_time = AverageMeter('Data', ':3.1f')
    losses = AverageMeter('Loss', ':3.2f')
    trans_losses = AverageMeter('Trans Loss', ':5.4f')
    cls_accs = AverageMeter('Cls Acc', ':3.1f')
    tgt_accs = AverageMeter('Tgt Acc', ':3.1f')
    progress = ProgressMeter(
        args.iters_per_epoch,
        [batch_time, data_time, losses, trans_losses, cls_accs, tgt_accs],
        prefix="Epoch: [{}]".format(epoch))

    # switch to train mode
    model.train()
    jmmd_loss.train()

    end = time.time()
    for i in range(args.iters_per_epoch):
        lr_sheduler.step()

        # measure data loading time
        data_time.update(time.time() - end)

        x_s, labels_s = next(train_source_iter)
        x_t, labels_t = next(train_target_iter)

        x_s = x_s.to(device)
        x_t = x_t.to(device)
        labels_s = labels_s.to(device)
        labels_t = labels_t.to(device)

        # compute output
        x = torch.cat((x_s, x_t), dim=0)
        y, f = model(x)
        y_s, y_t = y.chunk(2, dim=0)
        f_s, f_t = f.chunk(2, dim=0)

        cls_loss = F.cross_entropy(y_s, labels_s)
        transfer_loss = jmmd_loss(
            (f_s, F.softmax(y_s, dim=1)),
            (f_t, F.softmax(y_t, dim=1))
        )
        loss = cls_loss + transfer_loss * args.trade_off

        cls_acc = accuracy(y_s, labels_s)[0]
        tgt_acc = accuracy(y_t, labels_t)[0]

        losses.update(loss.item(), x_s.size(0))
        cls_accs.update(cls_acc.item(), x_s.size(0))
        tgt_accs.update(tgt_acc.item(), x_t.size(0))
        trans_losses.update(transfer_loss.item(), x_s.size(0))

        # compute gradient and do SGD step
        optimizer.zero_grad()
        loss.backward()
        optimizer.step()

        # measure elapsed time
        batch_time.update(time.time() - end)
        end = time.time()

        if i % args.print_freq == 0:
            progress.display(i)
def main():
    parser = argparse.ArgumentParser(description='Tuning with bi-directional RNN-CNN')
    parser.add_argument('--mode', choices=['RNN', 'LSTM', 'GRU'], help='architecture of rnn', required=True)
    parser.add_argument('--cuda', action='store_true', help='using GPU')
    parser.add_argument('--num_epochs', type=int, default=100, help='Number of training epochs')
    parser.add_argument('--batch_size', type=int, default=16, help='Number of sentences in each batch')
    parser.add_argument('--hidden_size', type=int, default=128, help='Number of hidden units in RNN')
    parser.add_argument('--tag_space', type=int, default=0, help='Dimension of tag space')
    parser.add_argument('--num_layers', type=int, default=1, help='Number of layers of RNN')
    parser.add_argument('--num_filters', type=int, default=30, help='Number of filters in CNN')
    parser.add_argument('--char_dim', type=int, default=30, help='Dimension of Character embeddings')
    parser.add_argument('--learning_rate', type=float, default=0.1, help='Learning rate')
    parser.add_argument('--decay_rate', type=float, default=0.1, help='Decay rate of learning rate')
    parser.add_argument('--gamma', type=float, default=0.0, help='weight for regularization')
    parser.add_argument('--dropout', choices=['std', 'variational'], help='type of dropout', required=True)
    parser.add_argument('--p_rnn', nargs=2, type=float, required=True, help='dropout rate for RNN')
    parser.add_argument('--p_in', type=float, default=0.33, help='dropout rate for input embeddings')
    parser.add_argument('--p_out', type=float, default=0.33, help='dropout rate for output layer')
    parser.add_argument('--schedule', type=int, help='schedule for learning rate decay')
    parser.add_argument('--unk_replace', type=float, default=0., help='The rate to replace a singleton word with UNK')
    parser.add_argument('--embedding', choices=['glove', 'senna', 'sskip', 'polyglot'],
                        help='Embedding for words', required=True)
    parser.add_argument('--embedding_dict', help='path for embedding dict')
    parser.add_argument('--train')  # "data/POS-penn/wsj/split1/wsj1.train.original"
    parser.add_argument('--dev')    # "data/POS-penn/wsj/split1/wsj1.dev.original"
    parser.add_argument('--test')   # "data/POS-penn/wsj/split1/wsj1.test.original"

    args = parser.parse_args()

    logger = get_logger("NER")

    mode = args.mode
    train_path = args.train
    dev_path = args.dev
    test_path = args.test
    num_epochs = args.num_epochs
    batch_size = args.batch_size
    hidden_size = args.hidden_size
    num_filters = args.num_filters
    learning_rate = args.learning_rate
    momentum = 0.9
    decay_rate = args.decay_rate
    gamma = args.gamma
    schedule = args.schedule
    p_rnn = tuple(args.p_rnn)
    p_in = args.p_in
    p_out = args.p_out
    unk_replace = args.unk_replace
    embedding = args.embedding
    embedding_path = args.embedding_dict

    embedd_dict, embedd_dim = utils.load_embedding_dict(embedding, embedding_path)

    logger.info("Creating Alphabets")
    word_alphabet, char_alphabet, pos_alphabet, \
        chunk_alphabet, ner_alphabet = conll03_data.create_alphabets(
            "data/alphabets/ner/", train_path, data_paths=[dev_path, test_path],
            embedd_dict=embedd_dict, max_vocabulary_size=50000)

    logger.info("Word Alphabet Size: %d" % word_alphabet.size())
    logger.info("Character Alphabet Size: %d" % char_alphabet.size())
    logger.info("POS Alphabet Size: %d" % pos_alphabet.size())
    logger.info("Chunk Alphabet Size: %d" % chunk_alphabet.size())
    logger.info("NER Alphabet Size: %d" % ner_alphabet.size())

    logger.info("Reading Data")
    device = torch.device('cuda') if args.cuda else torch.device('cpu')

    data_train = conll03_data.read_data_to_tensor(train_path, word_alphabet, char_alphabet,
                                                  pos_alphabet, chunk_alphabet, ner_alphabet,
                                                  device=device)
    num_data = sum(data_train[1])
    num_labels = ner_alphabet.size()

    data_dev = conll03_data.read_data_to_tensor(dev_path, word_alphabet, char_alphabet,
                                                pos_alphabet, chunk_alphabet, ner_alphabet,
                                                device=device)
    data_test = conll03_data.read_data_to_tensor(test_path, word_alphabet, char_alphabet,
                                                 pos_alphabet, chunk_alphabet, ner_alphabet,
                                                 device=device)

    writer = CoNLL03Writer(word_alphabet, char_alphabet, pos_alphabet, chunk_alphabet, ner_alphabet)

    def construct_word_embedding_table():
        scale = np.sqrt(3.0 / embedd_dim)
        table = np.empty([word_alphabet.size(), embedd_dim], dtype=np.float32)
        table[conll03_data.UNK_ID, :] = np.random.uniform(-scale, scale, [1, embedd_dim]).astype(np.float32)
        oov = 0
        for word, index in word_alphabet.items():
            if word in embedd_dict:
                embedding = embedd_dict[word]
            elif word.lower() in embedd_dict:
                embedding = embedd_dict[word.lower()]
            else:
                embedding = np.random.uniform(-scale, scale, [1, embedd_dim]).astype(np.float32)
                oov += 1
            table[index, :] = embedding
        print('oov: %d' % oov)
        return torch.from_numpy(table)

    word_table = construct_word_embedding_table()
    logger.info("constructing network...")

    char_dim = args.char_dim
    window = 3
    num_layers = args.num_layers
    tag_space = args.tag_space
    initializer = nn.init.xavier_uniform_
    if args.dropout == 'std':
        network = BiRecurrentConv(embedd_dim, word_alphabet.size(), char_dim, char_alphabet.size(),
                                  num_filters, window, mode, hidden_size, num_layers, num_labels,
                                  tag_space=tag_space, embedd_word=word_table,
                                  p_in=p_in, p_out=p_out, p_rnn=p_rnn, initializer=initializer)
    else:
        network = BiVarRecurrentConv(embedd_dim, word_alphabet.size(), char_dim, char_alphabet.size(),
                                     num_filters, window, mode, hidden_size, num_layers, num_labels,
                                     tag_space=tag_space, embedd_word=word_table,
                                     p_in=p_in, p_out=p_out, p_rnn=p_rnn, initializer=initializer)

    network = network.to(device)

    lr = learning_rate
    # optim = Adam(network.parameters(), lr=lr, betas=(0.9, 0.9), weight_decay=gamma)
    optim = SGD(network.parameters(), lr=lr, momentum=momentum, weight_decay=gamma, nesterov=True)
    logger.info("Network: %s, num_layer=%d, hidden=%d, filter=%d, tag_space=%d"
                % (mode, num_layers, hidden_size, num_filters, tag_space))
    logger.info("training: l2: %f, (#training data: %d, batch: %d, unk replace: %.2f)"
                % (gamma, num_data, batch_size, unk_replace))
    logger.info("dropout(in, out, rnn): (%.2f, %.2f, %s)" % (p_in, p_out, p_rnn))

    num_batches = num_data // batch_size + 1
    dev_f1 = 0.0
    dev_acc = 0.0
    dev_precision = 0.0
    dev_recall = 0.0
    test_f1 = 0.0
    test_acc = 0.0
    test_precision = 0.0
    test_recall = 0.0
    best_epoch = 0
    for epoch in range(1, num_epochs + 1):
        print('Epoch %d (%s(%s), learning rate=%.4f, decay rate=%.4f (schedule=%d)): '
              % (epoch, mode, args.dropout, lr, decay_rate, schedule))
        train_err = 0.
        train_corr = 0.
        train_total = 0.

        start_time = time.time()
        num_back = 0
        network.train()
        for batch in range(1, num_batches + 1):
            word, char, _, _, labels, masks, lengths = conll03_data.get_batch_tensor(
                data_train, batch_size, unk_replace=unk_replace)

            optim.zero_grad()
            loss, corr, _ = network.loss(word, char, labels, mask=masks, length=lengths,
                                         leading_symbolic=conll03_data.NUM_SYMBOLIC_TAGS)
            loss.backward()
            optim.step()

            with torch.no_grad():
                num_tokens = masks.sum()
                train_err += loss * num_tokens
                train_corr += corr
                train_total += num_tokens

            time_ave = (time.time() - start_time) / batch
            time_left = (num_batches - batch) * time_ave

            # update log
            if batch % 100 == 0:
                sys.stdout.write("\b" * num_back)
                sys.stdout.write(" " * num_back)
                sys.stdout.write("\b" * num_back)
                log_info = 'train: %d/%d loss: %.4f, acc: %.2f%%, time left (estimated): %.2fs' % (
                    batch, num_batches, train_err / train_total, train_corr * 100 / train_total, time_left)
                sys.stdout.write(log_info)
                sys.stdout.flush()
                num_back = len(log_info)

        sys.stdout.write("\b" * num_back)
        sys.stdout.write(" " * num_back)
        sys.stdout.write("\b" * num_back)
        print('train: %d loss: %.4f, acc: %.2f%%, time: %.2fs'
              % (num_batches, train_err / train_total, train_corr * 100 / train_total,
                 time.time() - start_time))

        # evaluate performance on dev data
        with torch.no_grad():
            network.eval()
            tmp_filename = 'tmp/%s_dev%d' % (str(uid), epoch)
            writer.start(tmp_filename)

            for batch in conll03_data.iterate_batch_tensor(data_dev, batch_size):
                word, char, pos, chunk, labels, masks, lengths = batch
                _, _, preds = network.loss(word, char, labels, mask=masks, length=lengths,
                                           leading_symbolic=conll03_data.NUM_SYMBOLIC_TAGS)
                writer.write(word.cpu().numpy(), pos.cpu().numpy(), chunk.cpu().numpy(),
                             preds.cpu().numpy(), labels.cpu().numpy(), lengths.cpu().numpy())
            writer.close()
            acc, precision, recall, f1 = evaluate(tmp_filename)
            print('dev acc: %.2f%%, precision: %.2f%%, recall: %.2f%%, F1: %.2f%%'
                  % (acc, precision, recall, f1))

            if dev_f1 < f1:
                dev_f1 = f1
                dev_acc = acc
                dev_precision = precision
                dev_recall = recall
                best_epoch = epoch

                # evaluate on test data when better performance detected
                tmp_filename = 'tmp/%s_test%d' % (str(uid), epoch)
                writer.start(tmp_filename)

                for batch in conll03_data.iterate_batch_tensor(data_test, batch_size):
                    word, char, pos, chunk, labels, masks, lengths = batch
                    _, _, preds = network.loss(word, char, labels, mask=masks, length=lengths,
                                               leading_symbolic=conll03_data.NUM_SYMBOLIC_TAGS)
                    writer.write(word.cpu().numpy(), pos.cpu().numpy(), chunk.cpu().numpy(),
                                 preds.cpu().numpy(), labels.cpu().numpy(), lengths.cpu().numpy())
                writer.close()
                test_acc, test_precision, test_recall, test_f1 = evaluate(tmp_filename)

            print("best dev acc: %.2f%%, precision: %.2f%%, recall: %.2f%%, F1: %.2f%% (epoch: %d)"
                  % (dev_acc, dev_precision, dev_recall, dev_f1, best_epoch))
            print("best test acc: %.2f%%, precision: %.2f%%, recall: %.2f%%, F1: %.2f%% (epoch: %d)"
                  % (test_acc, test_precision, test_recall, test_f1, best_epoch))

        if epoch % schedule == 0:
            lr = learning_rate / (1.0 + epoch * decay_rate)
            optim = SGD(network.parameters(), lr=lr, momentum=momentum,
                        weight_decay=gamma, nesterov=True)
def train(
        cfg,
        img_size=416,
        resume=False,
        epochs=273,  # 500200 batches at bs 64, dataset length 117263
        batch_size=16,
        accumulate=1,
        multi_scale=False,
        freeze_backbone=False,
        num_workers=4,
        transfer=False  # Transfer learning (train only YOLO layers)
):
    weights = 'weights' + os.sep
    latest = weights + 'latest.pt'
    best = weights + 'best.pt'
    device = torch_utils.select_device()

    if multi_scale:
        img_size = 608  # initiate with maximum multi_scale size
    else:
        torch.backends.cudnn.benchmark = True  # unsuitable for multiscale

    # Initialize model
    model = Darknet(cfg, img_size).to(device)

    # Optimizer
    lr0 = 0.001  # initial learning rate
    optimizer = SGD(model.parameters(), lr=lr0, momentum=0.9, weight_decay=0.0005)

    cutoff = -1  # backbone reaches to cutoff layer
    start_epoch = 0
    best_loss = float('inf')
    yl = get_yolo_layers(model)  # yolo layers
    nf = int(model.module_defs[yl[0] - 1]['filters'])  # yolo layer size (i.e. 255)

    if resume:  # Load previously saved model
        if transfer:  # Transfer learning
            chkpt = torch.load(weights + 'yolov3-spp.pt', map_location=device)
            model.load_state_dict(
                {k: v for k, v in chkpt['model'].items() if v.numel() > 1 and v.shape[0] != 255},
                strict=False)
            for p in model.parameters():
                p.requires_grad = True if p.shape[0] == nf else False
        else:  # resume from latest.pt
            chkpt = torch.load(latest, map_location=device)  # load checkpoint
            model.load_state_dict(chkpt['model'])

        start_epoch = chkpt['epoch'] + 1
        if chkpt['optimizer'] is not None:
            optimizer.load_state_dict(chkpt['optimizer'])
            best_loss = chkpt['best_loss']
        del chkpt
    else:  # Initialize model with backbone (optional)
        if '-tiny.cfg' in cfg:
            cutoff = load_darknet_weights(model, weights + 'yolov3-tiny.conv.15')
        else:
            cutoff = load_darknet_weights(model, weights + 'darknet53.conv.74')

    # multiple GPUs
    if torch.cuda.device_count() > 1:
        model = torch.nn.DataParallel(model, device_ids=list(range(torch.cuda.device_count())))

    # Set scheduler
    scheduler = lr_scheduler.MultiStepLR(optimizer, milestones=[20, 40], gamma=0.1,
                                         last_epoch=start_epoch - 1)

    # Dataset
    # train_dataset = VOCDetection(root=os.path.join('~', 'data', 'VOCdevkit'), img_size=img_size, mode='train')
    train_dataset = DFSignDetection(root=os.path.join('~', 'data', 'dfsign', 'dfsign_chip_voc'),
                                    img_size=img_size, mode='train')

    # Dataloader
    dataloader = DataLoader(train_dataset, batch_size=batch_size, num_workers=num_workers,
                            shuffle=True, pin_memory=True, collate_fn=train_dataset.collate_fn)

    # Start training
    t = time.time()
    # model_info(model)
    nB = len(dataloader)
    n_burnin = nB  # burn-in batches
    for epoch in range(start_epoch, epochs):
        model.train()
        print(('\n%8s%12s' + '%10s' * 7) %
              ('Epoch', 'Batch', 'xy', 'wh', 'conf', 'cls', 'total', 'nTargets', 'time'))

        # Update scheduler
        scheduler.step()

        # Freeze backbone at epoch 0, unfreeze at epoch 1
        if freeze_backbone and epoch < 2:
            for name, p in model.named_parameters():
                if int(name.split('.')[1]) < cutoff:  # if layer < 75
                    p.requires_grad = False if epoch == 0 else True

        mloss = defaultdict(float)  # mean loss
        for i, (imgs, targets, _, paths) in enumerate(dataloader):
            imgs = imgs.to(device)
            targets = targets.to(device)
            nt = len(targets)
            if nt == 0:  # if no targets continue
                continue

            # SGD burn-in
            if epoch == 0 and i <= n_burnin:
                lr = lr0 * (i / n_burnin) ** 4
                for x in optimizer.param_groups:
                    x['lr'] = lr

            optimizer.zero_grad()

            # Run model
            pred = model(imgs)

            # Build targets
            target_list = build_targets(model, targets)

            # Compute loss
            loss, loss_dict = compute_loss(pred, target_list)
            loss.backward()

            # Accumulate gradient for x batches before optimizing
            if (i + 1) % accumulate == 0 or (i + 1) == nB:
                optimizer.step()

            # Running epoch-means of tracked metrics
            for key, val in loss_dict.items():
                mloss[key] = (mloss[key] * i + val) / (i + 1)

            s = ('%8s%12s' + '%10.3g' * 7) % (
                '%g/%g' % (epoch, epochs - 1), '%g/%g' % (i, nB - 1),
                mloss['xy'], mloss['wh'], mloss['conf'], mloss['cls'], mloss['total'],
                nt, time.time() - t)
            t = time.time()
            if i % 30 == 0:
                print(s)

            # Multi-Scale training (320 - 608 pixels) every 10 batches
            if multi_scale and (i + 1) % 10 == 0:
                train_dataset.img_size = random.choice(range(10, 20)) * 32
                print('multi_scale img_size = %g' % train_dataset.img_size)

        # Update best loss
        if mloss['total'] < best_loss:
            best_loss = mloss['total']

        # Save latest checkpoint
        checkpoint = {
            'epoch': epoch,
            'best_loss': best_loss,
            'model': model.module.state_dict() if type(model) is nn.parallel.DataParallel else model.state_dict(),
            'optimizer': optimizer.state_dict()
        }
        if epoch % 5 == 0:
            torch.save(checkpoint, 'weights/epoch_tt100k_%03d.pt' % epoch)

        # if epoch > 9 and epoch % 10 == 0:
        if False:
            with torch.no_grad():
                APs, mAP = test.test(cfg, weights=None, batch_size=32, img_size=img_size, model=model)
            pprint(APs)
            print(mAP)

        del checkpoint
class GoogLeNet(nn.Module):
    @staticmethod
    def init_weights(m):
        if isinstance(m, nn.Conv2d) or isinstance(m, nn.Linear):
            truncated_normal_(m.weight)
            # nn.init.kaiming_normal_(
            #     tensor=m.weight,
            #     mode='fan_out',
            #     nonlinearity='relu'
            # )
            if m.bias is not None:
                nn.init.zeros_(m.bias)
        elif isinstance(m, nn.BatchNorm2d):
            nn.init.constant_(m.weight, 1)
            nn.init.constant_(m.bias, 0)

    def __init__(self, num_classes: int, enable_aux=False, conv_type=None):
        super(GoogLeNet, self).__init__()
        self.criterion = None
        self.optimizer = None
        self.scheduler = None
        self.enable_aux = enable_aux

        # Input 224x224x3 (RGB color space w/ zero mean); kernel 7x7, padding 3x3, stride 2
        # floor((n_h - k_h + 2*p_h)/s_h) + 1 = floor((224 - 7 + 2*3)/2) + 1 = 112 -> output 112x112x64
        self.conv1 = nn.Conv2d(in_channels=3, out_channels=64, kernel_size=(7, 7), stride=2, padding=(3, 3))

        # Input 112x112x64; kernel 3x3, padding 1x1, stride 2
        # floor((112 - 3 + 1*2)/2) + 1 = 56 -> output 56x56x64
        self.maxPooling1 = nn.MaxPool2d(kernel_size=(3, 3), stride=2, padding=(1, 1))

        # Input 56x56x64; kernel 1x1, padding 0, stride 1
        # floor((56 - 1 + 0)/1) + 1 = 56 -> output 56x56x64
        self.conv2_1 = nn.Conv2d(in_channels=64, out_channels=64, kernel_size=(1, 1), stride=1, padding=0)

        # Input 56x56x64; kernel 3x3, padding 1x1, stride 1
        # floor((56 - 3 + 1*2)/1) + 1 = 56 -> output 56x56x192
        self.conv2_2 = nn.Conv2d(in_channels=64, out_channels=192, kernel_size=(3, 3), stride=1, padding=(1, 1))

        # Input 56x56x192; kernel 3x3, padding 1x1, stride 2
        # floor((56 - 3 + 1*2)/2) + 1 = 28 -> output 28x28x192
        self.maxPooling2 = nn.MaxPool2d(kernel_size=(3, 3), stride=2, padding=(1, 1))

        self.inception_3a = Inception(in_channels=192, ch_1x1=64, ch_3x3_reduce=96, ch_3x3=128,
                                      ch_5x5_reduce=16, ch_5x5=32, pool_proj=32, conv_type=conv_type)
        self.inception_3b = Inception(in_channels=256, ch_1x1=128, ch_3x3_reduce=128, ch_3x3=192,
                                      ch_5x5_reduce=32, ch_5x5=96, pool_proj=64, conv_type=conv_type)

        # Input 28x28x480; kernel 3x3, padding 1x1, stride 2
        # floor((28 - 3 + 1*2)/2) + 1 = 14 -> output 14x14x480
        self.maxPooling3 = nn.MaxPool2d(kernel_size=(3, 3), stride=2, padding=(1, 1))

        self.inception_4a = Inception(in_channels=480, ch_1x1=192, ch_3x3_reduce=96, ch_3x3=208,
                                      ch_5x5_reduce=16, ch_5x5=48, pool_proj=64, conv_type=conv_type)
        self.inception_4b = Inception(in_channels=512, ch_1x1=160, ch_3x3_reduce=112, ch_3x3=224,
                                      ch_5x5_reduce=24, ch_5x5=64, pool_proj=64, conv_type=conv_type)
        self.inception_4c = Inception(in_channels=512, ch_1x1=128, ch_3x3_reduce=128, ch_3x3=256,
                                      ch_5x5_reduce=24, ch_5x5=64, pool_proj=64, conv_type=conv_type)
        self.inception_4d = Inception(in_channels=512, ch_1x1=112, ch_3x3_reduce=144, ch_3x3=288,
                                      ch_5x5_reduce=32, ch_5x5=64, pool_proj=64, conv_type=conv_type)
        self.inception_4e = Inception(in_channels=528, ch_1x1=256, ch_3x3_reduce=160, ch_3x3=320,
                                      ch_5x5_reduce=32, ch_5x5=128, pool_proj=128, conv_type=conv_type)

        # Input 14x14x832; kernel 3x3, padding 1x1, stride 2
        # floor((14 - 3 + 1*2)/2) + 1 = 7 -> output 7x7x832
        self.maxPooling4 = nn.MaxPool2d(kernel_size=(3, 3), stride=2, padding=(1, 1))

        self.inception_5a = Inception(in_channels=832, ch_1x1=256, ch_3x3_reduce=160, ch_3x3=320,
                                      ch_5x5_reduce=32, ch_5x5=128, pool_proj=128, conv_type=conv_type)
        self.inception_5b = Inception(in_channels=832, ch_1x1=384, ch_3x3_reduce=192, ch_3x3=384,
                                      ch_5x5_reduce=48, ch_5x5=128, pool_proj=128, conv_type=conv_type)

        # Input 7x7x1024; kernel 7x7, padding 0, stride 1
        # floor((7 - 7 + 0)/1) + 1 = 1 -> output 1x1x1024
        # (7x7 kernel so the map reduces to 1x1x1024, matching the 1024-feature fc layer below)
        self.avgPooling1 = nn.AvgPool2d(kernel_size=(7, 7), stride=1, padding=0)

        self.dropout = nn.Dropout(p=0.4, inplace=True)
        self.fc = nn.Linear(in_features=1024, out_features=num_classes, bias=True)

        if enable_aux:
            self.aux1 = AuxInception(in_channels=512, num_classes=num_classes)
            self.aux2 = AuxInception(in_channels=528, num_classes=num_classes)
        else:
            self.aux1 = None
            self.aux2 = None

    def initialize(self, criterion=None, optimizer=None, scheduler=None, weight_init=None,
                   learning_rate=1e-2) -> None:
        if criterion is None:
            self.criterion = nn.CrossEntropyLoss()
        else:
            self.criterion = criterion

        if optimizer is None:
            self.optimizer = SGD(self.parameters(), lr=learning_rate, momentum=0.9, weight_decay=5e-4)
        else:
            self.optimizer = optimizer

        if scheduler is None:
            self.scheduler = torch.optim.lr_scheduler.StepLR(optimizer=self.optimizer, step_size=4, gamma=0.04)
        else:
            self.scheduler = scheduler

        if weight_init is None:
            for m in self.modules():
                m.apply(self.init_weights)
        else:
            for m in self.modules():
                m.apply(weight_init)

    def _forward(self, X: torch.Tensor) -> Tuple[torch.Tensor, Optional[torch.Tensor], Optional[torch.Tensor]]:
        # input: 224x224x3
        X = self.conv1(X)
        # input: 112x112x64
        X = self.maxPooling1(X)
        # input: 56x56x64
        X = self.conv2_1(X)
        X = self.conv2_2(X)
        # input: 56x56x192
        X = self.maxPooling2(X)
        # input: 28x28x192
        X = self.inception_3a(X)
        # input: 28x28x256
        X = self.inception_3b(X)
        # input: 28x28x480
        X = self.maxPooling3(X)
        # input: 14x14x480
        X = self.inception_4a(X)
        aux1 = self.aux1(X) if (self.aux1 is not None) else None
        # input: 14x14x512
        X = self.inception_4b(X)
        # input: 14x14x512
        X = self.inception_4c(X)
        # input: 14x14x512
        X = self.inception_4d(X)
        aux2 = self.aux2(X) if (self.aux2 is not None) else None
        # input: 14x14x528
        X = self.inception_4e(X)
        # input: 14x14x832
        X = self.maxPooling4(X)
        # input: 7x7x832
        X = self.inception_5a(X)
        # input: 7x7x832
        X = self.inception_5b(X)
        # input: 7x7x1024
        X = self.avgPooling1(X)
        X = torch.flatten(X, 1)
        # input: 1x1x1024
        X = self.dropout(X)
        X = self.fc(X)
        # output: 1 x 1 x num_classes
        return X, aux1, aux2

    def forward(self, X: torch.Tensor) -> Union[torch.Tensor,
                                                Tuple[torch.Tensor, Optional[torch.Tensor], Optional[torch.Tensor]]]:
        X, aux1, aux2 = self._forward(X)
        if self.training and self.enable_aux:
            return X, aux2, aux1
        else:
            return X

    def train(self, mode=True, data=None, epochs=10) -> 'GoogLeNet':
        device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")
        self.to(device)

        if (data is None) and mode:
            raise FileNotFoundError("\"data\" has to be a valid Dataloader object!")

        self.training = mode
        # set the mode on child modules only (self.modules() would include self and
        # re-enter this overridden train())
        for m in self.children():
            m.train(mode)

        if mode:
            running_loss = 0.0
            for epoch in range(0, epochs):
                for i, datum in enumerate(data, 0):
                    features, labels = datum[0].to(device), datum[1].to(device)
                    loss = self.criterion(self(features), labels)
                    self.optimizer.zero_grad()
                    loss.backward()
                    self.optimizer.step()
                    running_loss += loss.item()

                    batch_split = int(len(data.dataset) / data.batch_size / 5)
                    batch_split = 1 if batch_split < 1 else batch_split
                    if i % batch_split == batch_split - 1:
                        if self.verbose:
                            print(f"[epoch {epoch + 1}, batch {i + 1}] loss: {running_loss / batch_split}")
                        self.scheduler.step(epoch)
                        running_loss = 0.0
            if self.verbose:
                print('Finished Training')
        return self
valid_acc = 0.0
model.train()
if epoch in [30, 80]:
    lr /= 3.0
    for pg in opt.param_groups:
        pg['lr'] = lr
for X, y in dataset:
    X, y = V(X), V(y)
    outputs = model(X)
    epoch_acc += accuracy(outputs, y)
    y = y.max(1)[1].view(-1).long()
    error = loss(outputs, y)
    epoch_error += error.data[0]
    opt.zero_grad()
    error.backward()
    opt.step()
print('Train loss', epoch_error / len(dataset))
print('Train accuracy', epoch_acc / (bsz * len(dataset)))

model.eval()
for X, y in validation:
    X, y = V(X, volatile=True), V(y, volatile=True)
    outputs = model(X)
    valid_acc += accuracy(outputs, y)
    y = y.max(1)[1].view(-1).long()
    error = loss(outputs, y)
    valid_error += error.data[0]
print('Valid loss', valid_error / len(validation))
print('Valid accuracy', valid_acc / (bsz * len(validation)))
print('')

# Save model weights
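# A minimal sketch (an assumption, not the original helper) of the `accuracy` function the loop
# above relies on. The loop calls `y.max(1)[1]` afterwards, which suggests the labels arrive
# one-hot encoded, so the helper compares argmaxes and returns the number of correct
# predictions in the batch (the caller then divides by bsz * len(dataset)).
def accuracy(outputs, targets):
    predicted = outputs.max(1)[1]   # predicted class index per sample
    true = targets.max(1)[1]        # target class index from the one-hot labels
    return float((predicted == true).sum())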
class MaskTrainer(BaseTrainer):
    def __init__(self, config):
        super(MaskTrainer, self).__init__(config)
        self.init_mask()

    def init_mask(self):
        if self.config.mask_type == 'icon':
            project_path = self.config.firelab.paths.project_path
            data_dir = os.path.join(project_path, self.config.data_dir)
            icon = imread(os.path.join(data_dir, self.config.hp.icon_file_path))
            if self.config.hp.get('should_resize_icon', False):
                icon = resize(icon, self.config.hp.target_icon_size, mode='constant', anti_aliasing=True)
            icon = convert_img_to_binary(icon)
            self.mask = make_mask_ternary(icon)
        elif self.config.mask_type == 'custom':
            self.mask = np.array(self.config.mask)
        elif self.config.mask_type == 'square':
            self.mask = generate_square_mask(self.config.hp.square_size)
            self.mask = make_mask_ternary(self.mask)
        elif self.config.mask_type == 'randomly_filled_square':
            self.mask = generate_square_mask(self.config.hp.square_size)
            self.mask = randomly_fill_square(self.mask, self.config.hp.fill_prob)
            self.mask = make_mask_ternary(self.mask)
        elif self.config.mask_type == 'square_grid':
            self.mask = generate_square_grid_mask(self.config.hp.n_good_cells)
            self.mask = make_mask_ternary(self.mask)
        else:
            raise NotImplementedError('Mask type %s is not supported' % self.config.mask_type)

    def init_dataloaders(self):
        dataset = self.config.hp.get('dataset', 'FashionMNIST')
        batch_size = self.config.hp.batch_size
        project_path = self.config.firelab.paths.project_path
        data_dir = os.path.join(project_path, self.config.data_dir)

        if dataset == 'FashionMNIST':
            data_train = FashionMNIST(data_dir, train=True, transform=transforms.ToTensor())
            data_test = FashionMNIST(data_dir, train=False, transform=transforms.ToTensor())
        elif dataset == 'MNIST':
            data_train = MNIST(data_dir, train=True, transform=transforms.ToTensor())
            data_test = MNIST(data_dir, train=False, transform=transforms.ToTensor())
        elif dataset == 'CIFAR10':
            train_transform = transforms.Compose([
                transforms.Pad(padding=4),
                transforms.RandomCrop(size=(32, 32)),
                transforms.RandomHorizontalFlip(p=0.5),
                transforms.ToTensor(),
                transforms.RandomErasing(p=0.5, scale=(0.25, 0.25), ratio=(1., 1.)),  # Cut out 8x8 square
                transforms.Normalize((0.4914, 0.4822, 0.4465), (0.2023, 0.1994, 0.2010)),
            ])
            test_transform = transforms.Compose([
                transforms.ToTensor(),
                transforms.Normalize((0.4914, 0.4822, 0.4465), (0.2023, 0.1994, 0.2010)),
            ])
            data_train = CIFAR10(data_dir, train=True, transform=train_transform)
            data_test = CIFAR10(data_dir, train=False, transform=test_transform)
        else:
            raise NotImplementedError(f"Unknown dataset: {dataset}")

        data_vis_train = Subset(data_train, random.sample(range(len(data_train)),
                                                          self.config.get('n_points_for_vis', 1000)))
        data_vis_test = Subset(data_test, random.sample(range(len(data_test)),
                                                        self.config.get('n_points_for_vis', 1000)))

        self.train_dataloader = DataLoader(data_train, batch_size=batch_size, shuffle=True)
        self.val_dataloader = DataLoader(data_test, batch_size=batch_size, shuffle=False)
        self.vis_train_dataloader = DataLoader(data_vis_train, batch_size=batch_size, shuffle=False)
        self.vis_test_dataloader = DataLoader(data_vis_test, batch_size=batch_size, shuffle=False)

    def init_models(self):
        self.init_torch_model_builder()
        self.model = MaskModel(
            self.mask,
            self.torch_model_builder,
            should_center_origin=self.config.hp.should_center_origin,
            parametrization_type=self.config.hp.parametrization_type)
        self.model = self.model.to(self.device_name)
        # self.logger.info(f'Model initial orthogonality: {self.model.compute_ort_reg()}')
        # self.logger.info(f'Model params: {self.config.hp.conv_model_config.to_dict()}. '
        #                  f'Parametrization: {self.config.hp.parametrization_type}')

    def init_torch_model_builder(self):
        if self.config.hp.model_name == 'fast_resnet':
            self.torch_model_builder = lambda: FastResNet(
                n_classes=10, n_input_channels=self.config.hp.get('n_input_channels', 1)).nn
        elif self.config.hp.model_name == 'resnet18':
            self.torch_model_builder = lambda: ResNet18(
                n_classes=10, n_input_channels=self.config.hp.get('n_input_channels', 1)).nn
        elif self.config.hp.model_name == "vgg":
            self.torch_model_builder = lambda: VGG11(
                n_input_channels=self.config.hp.get('n_input_channels', 1),
                use_bn=self.config.hp.get('use_bn', True)).model
        elif self.config.hp.model_name == "simple":
            self.torch_model_builder = lambda: SimpleModel().nn
        elif self.config.hp.model_name == "conv":
            self.torch_model_builder = lambda: ConvModel(self.config.hp.conv_model_config).nn
        else:
            raise NotImplementedError(f"Model {self.config.hp.model_name} is not supported")

    def init_criterions(self):
        self.criterion = nn.CrossEntropyLoss(reduction='none')

    def init_optimizers(self):
        optim_type = self.config.hp.get('optim.type', 'adam').lower()

        if optim_type == 'adam':
            self.optim = Adam(self.model.parameters(), **self.config.hp.optim.kwargs.to_dict())
        elif optim_type == 'sgd':
            self.optim = SGD(self.model.parameters(), **self.config.hp.optim.kwargs.to_dict())
        else:
            raise NotImplementedError(f'Unknown optimizer: {optim_type}')

        if not self.config.hp.optim.has('scheduler'):
            self.scheduler = None
        elif self.config.hp.optim.get('scheduler.type') == 'triangle_lr':
            epoch_size = len(self.train_dataloader)
            self.scheduler = TriangleLR(self.optim, epoch_size, **self.config.hp.optim.scheduler.kwargs.to_dict())
        else:
            raise NotImplementedError(f"Unknown scheduler.type: {self.config.hp.optim.get('scheduler.type')}")

    def train_on_batch(self, batch):
        self.optim.zero_grad()

        x = batch[0].to(self.device_name)
        y = batch[1].to(self.device_name)

        good_losses = []
        good_accs = []
        bad_losses = []
        bad_accs = []

        good_idx = self.model.get_class_idx(1).tolist()
        bad_idx = self.model.get_class_idx(-1).tolist()
        num_good_points_to_use = min(len(good_idx), self.config.hp.num_good_cells_per_update)
        num_bad_points_to_use = min(len(bad_idx), self.config.hp.num_bad_cells_per_update)

        for i, j in random.sample(good_idx, num_good_points_to_use):
            preds = self.model.run_from_weights(self.model.compute_point(i, j), x)
            good_loss = self.criterion(preds, y).mean()
            good_losses.append(good_loss.item())
            good_loss /= num_good_points_to_use
            good_loss.backward()  # To make the graph free
            good_accs.append((preds.argmax(dim=1) == y).float().mean().item())

        for i, j in random.sample(bad_idx, num_bad_points_to_use):
            preds = self.model.run_from_weights(self.model.compute_point(i, j), x)
            bad_loss = self.criterion(preds, y).mean()
            bad_losses.append(bad_loss.item())
            bad_loss = bad_loss.clamp(0, self.config.hp.neg_loss_clip_threshold)
            bad_loss /= num_bad_points_to_use
            bad_loss *= self.config.hp.get('negative_loss_coef', 1.)
            bad_loss *= -1  # To make it grow
            bad_loss.backward()  # To make the graph free
            bad_accs.append((preds.argmax(dim=1) == y).float().mean().item())

        good_losses = np.array(good_losses)
        good_accs = np.array(good_accs)
        bad_losses = np.array(bad_losses)
        bad_accs = np.array(bad_accs)

        # Adding regularization
        if self.config.hp.parametrization_type != "up_orthogonal":
            ort = self.model.compute_ort_reg()
            norm_diff = self.model.compute_norm_reg()
            reg_loss = self.config.hp.ort_l2_coef * ort.pow(2) + self.config.hp.norm_l2_coef * norm_diff.pow(2)
            reg_loss.backward()

            self.writer.add_scalar('Reg/ort', ort.item(), self.num_iters_done)
            self.writer.add_scalar('Reg/norm_diff', norm_diff.item(), self.num_iters_done)

        clip_grad_norm_(self.model.parameters(), self.config.hp.grad_clip_threshold)
        self.optim.step()
        if self.scheduler is not None:
            self.scheduler.step()

        self.writer.add_scalar('good/train/loss', good_losses.mean().item(), self.num_iters_done)
        self.writer.add_scalar('good/train/acc', good_accs.mean().item(), self.num_iters_done)
        self.writer.add_scalar('bad/train/loss', bad_losses.mean().item(), self.num_iters_done)
        self.writer.add_scalar('bad/train/acc', bad_accs.mean().item(), self.num_iters_done)
        self.writer.add_scalar('diff/train/loss', good_losses.mean().item() - bad_losses.mean().item(),
                               self.num_iters_done)
        self.writer.add_scalar('diff/train/acc', good_accs.mean().item() - bad_accs.mean().item(),
                               self.num_iters_done)

        self.writer.add_scalar('Stats/lengths/right', self.model.right.norm(), self.num_iters_done)
        self.writer.add_scalar('Stats/lengths/up', self.model.up.norm(), self.num_iters_done)

        self.writer.add_scalar('Stats/grad_norms/origin', self.model.origin.grad.norm().item(), self.num_iters_done)
        self.writer.add_scalar('Stats/grad_norms/right_param', self.model.right_param.grad.norm().item(),
                               self.num_iters_done)
        self.writer.add_scalar('Stats/grad_norms/up_param', self.model.up_param.grad.norm().item(),
                               self.num_iters_done)
        self.writer.add_scalar('Stats/grad_norms/scaling', self.model.scaling_param.grad.norm().item(),
                               self.num_iters_done)
        self.writer.add_scalar('Stats/scaling', self.model.scaling_param.item(), self.num_iters_done)

    def before_training_hook(self):
        self.plot_mask()
        self.save_mask()
        self.plot_all_weights_histograms()
        self.write_config()

    def after_training_hook(self):
        if self.is_explicitly_stopped:
            self.delete_logs()  # So tensorboard does not lag
        else:
            self.visualize_minimum(self.vis_train_dataloader, 'train')
            self.visualize_minimum(self.vis_test_dataloader, 'test')

    def delete_logs(self):
        shutil.rmtree(self.config.firelab.paths.logs_path)
        self.writer.close()

    def compute_mask_scores(self, dataloader):
        pad = self.config.get('solution_vis.padding', 0)
        x_num_points = self.config.get('solution_vis.granularity.x', self.mask.shape[0] + 2 * pad)
        y_num_points = self.config.get('solution_vis.granularity.y', self.mask.shape[1] + 2 * pad)
        xs = np.linspace(-pad, self.mask.shape[0] + pad, x_num_points)
        ys = np.linspace(-pad, self.mask.shape[1] + pad, y_num_points)
        dummy_model = self.torch_model_builder().to(self.device_name)
        scores = [[self.compute_mask_score(x, y, dummy_model, dataloader) for y in ys] for x in xs]
        return xs, ys, scores

    def compute_mask_score(self, x, y, dummy_model, dataloader):
        w = self.model.compute_point(x, y, should_orthogonalize=True)
        return validate_weights(w, dataloader, dummy_model)

    def visualize_minimum(self, dataloader: DataLoader, subtitle: str):
        xs, ys, scores = self.compute_mask_scores(dataloader)
        self.save_minima_grid(scores, subtitle)
        fig = self.build_minimum_figure(xs,
ys, scores, subtitle) self.writer.add_figure(f'Minimum_{subtitle}', fig, self.num_iters_done) def build_minimum_figure(self, xs, ys, scores, subtitle:str): X, Y = np.meshgrid(xs, ys) scores = np.array(scores) fig = plt.figure(figsize=(20, 4)) plt.subplot(141) cntr = plt.contourf(X, Y, scores[:,:,0].T, cmap="RdBu_r", levels=np.linspace(0.3, 2.5, 30)) plt.title(f'Loss [{subtitle}]') plt.colorbar(cntr) plt.subplot(142) cntr = plt.contourf(X, Y, scores[:,:,1].T, cmap="RdBu_r", levels=np.linspace(0.5, 0.9, 30)) plt.title(f'Accuracy [{subtitle}]') plt.colorbar(cntr) plt.subplot(143) cntr = plt.contourf(X, Y, scores[:,:,0].T, cmap="RdBu_r", levels=100) plt.title(f'Loss [{subtitle}]') plt.colorbar(cntr) plt.subplot(144) cntr = plt.contourf(X, Y, scores[:,:,1].T, cmap="RdBu_r", levels=np.linspace(0, 1, 100)) plt.title(f'Accuracy [{subtitle}]') plt.colorbar(cntr) return fig def validate(self): self.model.is_good_mode = True good_val_loss, good_val_acc = validate(self.model, self.train_dataloader, self.criterion) self.model.is_good_mode = False bad_val_loss, bad_val_acc = validate(self.model, self.train_dataloader, self.criterion) self.model.is_good_mode = True self.writer.add_scalar('good/val/loss', good_val_loss, self.num_epochs_done) self.writer.add_scalar('good/val/acc', good_val_acc, self.num_epochs_done) self.writer.add_scalar('bad/val/loss', bad_val_loss, self.num_epochs_done) self.writer.add_scalar('bad/val/acc', bad_val_acc, self.num_epochs_done) self.writer.add_scalar('diff/val/loss', good_val_loss - bad_val_loss, self.num_epochs_done) self.writer.add_scalar('diff/val/acc', good_val_acc - bad_val_acc, self.num_epochs_done) self.plot_all_weights_histograms() if self.num_epochs_done > self.config.get('val_acc_stop_threshold_num_warmup_epochs', -1): if good_val_acc < self.config.get('good_val_acc_stop_threshold', 0.): self.stop(f'Good val accuracy is too low (epoch #{self.num_epochs_done}): {good_val_acc}') elif bad_val_acc > self.config.get('bad_val_acc_stop_threshold', 1.): self.stop(f'Bad val accuracy is too high (epoch #{self.num_epochs_done}): {bad_val_acc}') else: pass if self.num_epochs_done > self.config.get('diff_threshold_num_warmup_epochs', -1): if good_val_acc - bad_val_acc < self.config.get('good_and_bad_val_acc_diff_threshold', float('-inf')): self.stop(f'Difference between good and val accuracies is too small '\ f'(epoch #{self.num_epochs_done}): {good_val_acc} - {bad_val_acc} = {good_val_acc - bad_val_acc}') def plot_mask(self): fig = plt.figure(figsize=(5, 5)) mask_img = np.copy(self.mask) mask_img[mask_img == 2] = 0.5 plt.imshow(mask_img, cmap='gray') self.writer.add_figure('Mask', fig, self.num_iters_done) def save_mask(self): save_path = os.path.join(self.config.firelab.paths.custom_data_path, 'mask.npy') np.save(save_path, self.mask) def plot_params_histograms(self, w, subtag:str): dummy_model = self.torch_model_builder() params = weight_to_param(w, param_sizes(dummy_model.parameters())) tags = ['Weights_histogram_{}/{}'.format(i, subtag) for i in range(len(params))] for tag, param in zip(tags, params): self.writer.add_histogram(tag, param, self.num_iters_done) def plot_all_weights_histograms(self): # TODO: we do not need histograms currently... 
# self.plot_params_histograms(self.model.origin + self.model.right, 'origin_right') # self.plot_params_histograms(self.model.origin + self.model.up, 'origin_up') # self.plot_params_histograms(self.model.origin + self.model.up + self.model.right, 'origin_up_right') pass def write_config(self): config_yml = yaml.safe_dump(self.config.to_dict()) config_yml = config_yml.replace('\n', ' \n') # Because tensorboard uses markdown self.writer.add_text('Config', config_yml, self.num_iters_done) def save_minima_grid(self, scores, subtitle:str): save_path = os.path.join(self.config.firelab.paths.custom_data_path, f'minima_grid_{subtitle}.npy') np.save(save_path, scores)
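# Hedged sketch of the per-point gradient accumulation pattern used in train_on_batch above:
# each sub-loss is scaled by 1/N and backpropagated immediately (so its graph can be freed),
# and a single optimizer step is taken afterwards. The model, data and N are illustrative
# assumptions, not the MaskModel setup.
import torch
from torch import nn
from torch.optim import SGD

model = nn.Linear(10, 2)
opt = SGD(model.parameters(), lr=0.1)
criterion = nn.CrossEntropyLoss()
x, y = torch.randn(16, 10), torch.randint(0, 2, (16,))

n_points = 4
opt.zero_grad()
for _ in range(n_points):
    loss = criterion(model(x), y)
    (loss / n_points).backward()  # accumulate 1/N of the gradient and free this graph right away
opt.step()                        # one update using the averaged gradient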
class Policy(nn.Module): def __init__(self, env): # game params self.board_x, self.board_y = env.get_ub_board_size() self.action_size = env.n_actions self.n_inputs = env.n_inputs self.lr = args.lr self.env = env self.device = 'cuda' if torch.cuda.is_available() else 'cpu' super(Policy, self).__init__() self.conv1 = nn.Conv2d(self.n_inputs, args.num_channels, 3, stride=1, padding=1).to(self.device) self.conv2 = nn.Conv2d(args.num_channels, args.num_channels, 3, stride=1, padding=1).to(self.device) self.conv3 = nn.Conv2d(args.num_channels, args.num_channels, 3, stride=1).to(self.device) self.conv4 = nn.Conv2d(args.num_channels, args.num_channels, 3, stride=1).to(self.device) self.bn1 = nn.BatchNorm2d(args.num_channels).to(self.device) self.bn2 = nn.BatchNorm2d(args.num_channels).to(self.device) self.bn3 = nn.BatchNorm2d(args.num_channels).to(self.device) self.bn4 = nn.BatchNorm2d(args.num_channels).to(self.device) self.fc1 = nn.Linear(args.num_channels*(self.board_x - 4)*(self.board_y - 4) \ + env.agent_step_dim, 1024).to(self.device) self.fc_bn1 = nn.BatchNorm1d(1024).to(self.device) self.fc2 = nn.Linear(1024, 512).to(self.device) self.fc_bn2 = nn.BatchNorm1d(512).to(self.device) self.fc3 = nn.Linear(512, self.action_size).to(self.device) self.fc4 = nn.Linear(512, 1).to(self.device) self.entropies = 0 self.pi_losses = AverageMeter() self.v_losses = AverageMeter() self.action_probs = [[], []] self.state_values = [[], []] self.rewards = [[], []] self.next_states = [[], []] if args.optimizer == 'adas': self.optimizer = Adas(self.parameters(), lr=self.lr) elif args.optimizer == 'adam': self.optimizer = Adam(self.parameters(), lr=self.lr) else: self.optimizer = SGD(self.parameters(), lr=self.lr) def forward(self, s, agent): # s: batch_size x n_inputs x board_x x board_y s = s.view(-1, self.n_inputs, self.board_x, self.board_y) # batch_size x n_inputs x board_x x board_y s = F.relu(self.bn1( self.conv1(s))) # batch_size x num_channels x board_x x board_y s = F.relu(self.bn2( self.conv2(s))) # batch_size x num_channels x board_x x board_y s = F.relu(self.bn3(self.conv3( s))) # batch_size x num_channels x (board_x-2) x (board_y-2) s = F.relu(self.bn4(self.conv4( s))) # batch_size x num_channels x (board_x-4) x (board_y-4) s = s.view(-1, args.num_channels * (self.board_x - 4) * (self.board_y - 4)) s = torch.cat((s, agent), dim=1) s = F.dropout(F.relu(self.fc1(s)), p=args.dropout, training=self.training) # batch_size x 1024 s = F.dropout(F.relu(self.fc2(s)), p=args.dropout, training=self.training) # batch_size x 512 pi = self.fc3(s) # batch_size x action_size v = self.fc4(s) # batch_size x 1 return F.log_softmax(pi, dim=1), v # torch.tanh(v) def step(self, obs, agent): """ Returns policy and value estimates for given observations. :param obs: Array of shape [N] containing N observations. :return: Policy estimate [N, n_actions] and value estimate [N] for the given observations. 
""" obs = torch.from_numpy(obs).to(self.device) agent = torch.from_numpy(agent).to(self.device) pi, v = self.forward(obs, agent) return torch.exp(pi).detach().to('cpu').numpy(), v.detach().to( 'cpu').numpy() def store(self, player_ID, prob, state_value, reward): self.action_probs[player_ID].append(prob) self.state_values[player_ID].append(state_value) self.rewards[player_ID].append(reward) def clear(self): self.action_probs = [[], []] self.state_values = [[], []] self.rewards = [[], []] self.next_states = [[], []] self.entropies = 0 def get_data(self): return self.action_probs, self.state_values, self.rewards def optimize(self): self.optimizer.step() def reset_grad(self): self.optimizer.zero_grad() def train_examples(self, examples): """ examples: list of examples, each example is of form (board, pi, v) """ for epoch in range(args.epochs): # print('\nEPOCH ::: ' + str(epoch + 1)) self.train() batch_count = int(len(examples) / args.batch_size) t = tqdm(range(batch_count), desc='Training Net') for _ in t: sample_ids = np.random.randint(len(examples), size=args.batch_size) boards, agent_steps, pis, vs = list( zip(*[examples[i] for i in sample_ids])) boards = self.env.get_states_for_step(boards) agent_steps = self.env.get_agents_for_step(agent_steps) boards = torch.FloatTensor(boards.astype(np.float64)).to( self.device) agent_steps = torch.FloatTensor(agent_steps.astype( np.float64)).to(self.device) target_pis = torch.FloatTensor(np.array(pis)) target_vs = torch.FloatTensor(np.array(vs).astype(np.float64)) # predict if self.device == 'cuda': boards, target_pis, target_vs = boards.contiguous().cuda( ), target_pis.contiguous().cuda(), target_vs.contiguous( ).cuda() # compute output out_pi, out_v = self.forward(boards, agent_steps) l_pi = self.loss_pi(target_pis, out_pi) l_v = self.loss_v(target_vs, out_v) total_loss = l_pi + l_v # record loss self.pi_losses.update(l_pi.item(), boards.size(0)) self.v_losses.update(l_v.item(), boards.size(0)) t.set_postfix(Loss_pi=self.pi_losses, Loss_v=self.v_losses) # compute gradient and do Adas step self.reset_grad() total_loss.backward() self.optimize() self.pi_losses.plot('PolicyLoss') self.v_losses.plot('ValueLoss') def loss_pi(self, targets, outputs): return -torch.sum(targets * outputs) / targets.size()[0] def loss_v(self, targets, outputs): return torch.sum((targets - outputs.view(-1))**2) / targets.size()[0] def save_checkpoint(self, folder='Models', filename='model.pt'): filepath = os.path.join(folder, filename) if not os.path.exists(folder): print("Checkpoint Directory does not exist! Making directory {}". format(folder)) os.mkdir(folder) else: print("Checkpoint Directory exists! ") torch.save({ 'state_dict': self.state_dict(), }, filepath) def load_checkpoint(self, folder='Models', filename='model.pt'): # https://github.com/pytorch/examples/blob/master/imagenet/main.py#L98 filepath = os.path.join(folder, filename) if not os.path.exists(filepath): raise ("No model in path {}".format(filepath)) checkpoint = torch.load(filepath, map_location=self.device) self.load_state_dict(checkpoint['state_dict']) print('-- Load model succesfull!') def load_colab_model(self, _dir): self.load_state_dict(torch.load(_dir, map_location=self.device)) def save_colab_model(self, _dir): torch.save(self.state_dict(), _dir)
def train(self, epochs=10, lr=0.003, save_model=True, save_dir='./static/models', testing=True): # Setup optimizers # IT is a little bit complicated, but to match caffe implementation, it must be like this. optimizer = SGD([ { 'params': self.model.module.conv1.weight }, { 'params': self.model.module.conv1.bias, 'lr': 2 * lr, 'weight_decay': 0.0 }, { 'params': self.model.module.conv2.weight }, { 'params': self.model.module.conv2.bias, 'lr': 2 * lr, 'weight_decay': 0.0 }, { 'params': self.model.module.conv3a.weight }, { 'params': self.model.module.conv3a.bias, 'lr': 2 * lr, 'weight_decay': 0.0 }, { 'params': self.model.module.conv3b.weight }, { 'params': self.model.module.conv3b.bias, 'lr': 2 * lr, 'weight_decay': 0.0 }, { 'params': self.model.module.conv4a.weight }, { 'params': self.model.module.conv4a.bias, 'lr': 2 * lr, 'weight_decay': 0.0 }, { 'params': self.model.module.conv4b.weight }, { 'params': self.model.module.conv4b.bias, 'lr': 2 * lr, 'weight_decay': 0.0 }, { 'params': self.model.module.conv5a.weight }, { 'params': self.model.module.conv5a.bias, 'lr': 2 * lr, 'weight_decay': 0.0 }, { 'params': self.model.module.conv5b.weight }, { 'params': self.model.module.conv5b.bias, 'lr': 2 * lr, 'weight_decay': 0.0 }, { 'params': self.model.module.fc6.weight }, { 'params': self.model.module.fc6.bias, 'lr': 2 * lr, 'weight_decay': 0.0 }, { 'params': self.model.module.fc7.weight }, { 'params': self.model.module.fc7.bias, 'lr': 2 * lr, 'weight_decay': 0.0 }, { 'params': self.model.module.fc8.weight }, { 'params': self.model.module.fc8.bias, 'lr': 2 * lr, 'weight_decay': 0.0 }, ], lr=lr, momentum=0.9, weight_decay=0.005) # summary log_dir = 'log/{}'.format( datetime.datetime.now().strftime('%Y-%m-%d_%H:%M:%S')) if not os.path.exists(log_dir): os.makedirs(log_dir) summary = SummaryWriter(log_dir=log_dir, comment='Training started at {}'.format( datetime.datetime.now())) # training train_steps_per_epoch = len(self.loaders[0]) lr_sched = lr_scheduler.StepLR(optimizer, step_size=4 * train_steps_per_epoch, gamma=0.1) for i in range(epochs): # train self.model.train(True) categorical_loss = 0.0 num_iter = 0 optimizer.zero_grad() pbar = tqdm(self.loaders[0]) for data in pbar: num_iter += 1 # get the inputs inputs, labels = data # wrap them in Variable inputs = Variable(inputs.cuda()) labels = Variable(labels.cuda()) # Compute outputs outputs = self.model(inputs) # compute loss loss = nn.CrossEntropyLoss()(outputs, labels) categorical_loss = loss.detach().item() # /steps_per_update loss.backward() optimizer.step() optimizer.zero_grad() lr_sched.step() pbar.set_description( 'Epoch {}/{}, Iter {}, Loss: {:.8f}'.format( i + 1, epochs, num_iter, categorical_loss)) summary.add_scalars('Train loss', {'Loss': categorical_loss}, global_step=i * train_steps_per_epoch + num_iter) if num_iter == train_steps_per_epoch: if save_model: if not os.path.exists(save_dir): os.makedirs(save_dir) torch.save( self.model.module.state_dict(), '{}/model_{:06d}.pt'.format(save_dir, i + 1)) if testing: val_loss, top1, top5 = self.test() summary.add_scalars( 'Validation performance', { 'Validation loss': val_loss, 'Top-1 accuracy': top1, 'Top-5 accuracy': top5, }, i) print( 'Epoch {}/{}: Top-1 accuracy {:.2f} %, Top-5 accuracy: {:.2f} %' .format(i + 1, epochs, top1.item(), top5.item())) break
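# Hedged sketch: the long hand-written parameter-group list above (2x learning rate and no
# weight decay for biases, Caffe-style) could also be built programmatically from
# named_parameters(). The stand-in network below is an illustrative assumption, not the C3D model.
import torch.nn as nn
from torch.optim import SGD

net = nn.Sequential(nn.Conv2d(3, 8, 3), nn.ReLU(), nn.Conv2d(8, 8, 3))
base_lr = 0.003

weight_params = [p for n, p in net.named_parameters() if n.endswith('weight')]
bias_params = [p for n, p in net.named_parameters() if n.endswith('bias')]

optimizer = SGD(
    [{'params': weight_params},                                         # inherit lr / weight_decay defaults
     {'params': bias_params, 'lr': 2 * base_lr, 'weight_decay': 0.0}],  # bias treatment as above
    lr=base_lr, momentum=0.9, weight_decay=0.005)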
class Trainer(object): """ Trainer encapsulates all the logic necessary for training the Recurrent Attention Model. All hyperparameters are provided by the user in the config file. """ def __init__(self, data_loader): """ Construct a new Trainer instance. Args ---- - config: object containing command line arguments. - data_loader: data iterator """ # self.config = config # glimpse network params self.patch_size = 16 self.glimpse_scale = 2 self.num_patches = 3 self.loc_hidden = 128 self.glimpse_hidden = 128 # core network params self.num_glimpses = 6 self.hidden_size = 256 # reinforce params self.std = 0.17 self.M = 10 # data params self.train_loader = data_loader[0] self.valid_loader = data_loader[1] self.num_train = len(self.train_loader.sampler.indices) self.num_valid = len(self.valid_loader.sampler.indices) self.num_classes = 27 self.num_channels = 3 # training params self.epochs = 25 self.start_epoch = 0 self.saturate_epoch = 150 self.init_lr = 0.001 self.min_lr = 1e-06 self.decay_rate = (self.min_lr - self.init_lr) / (self.saturate_epoch) self.momentum = 0.5 self.lr = self.init_lr # misc params self.use_gpu = False self.best = True # self.ckpt_dir = config.ckpt_dir # self.logs_dir = config.logs_dir self.best_valid_acc = 0. self.counter = 0 # self.patience = config.patience # self.use_tensorboard = config.use_tensorboard # self.resume = config.resume # self.print_freq = config.print_freq # self.plot_freq = config.plot_freq # self.plot_dir = './plots/' + self.model_name + '/' # if not os.path.exists(self.plot_dir): # os.makedirs(self.plot_dir) # configure tensorboard logging # build RAM model self.model = RecurrentAttention( self.patch_size, self.num_patches, self.glimpse_scale, self.num_channels, self.loc_hidden, self.glimpse_hidden, self.std, self.hidden_size, self.num_classes, ) if self.use_gpu: self.model.cuda() print('[*] Number of model parameters: {:,}'.format( sum([p.data.nelement() for p in self.model.parameters()]))) # initialize optimizer and scheduler self.optimizer = SGD( self.model.parameters(), lr=self.lr, momentum=self.momentum, ) self.scheduler = ReduceLROnPlateau(self.optimizer, 'min') def reset(self): """ Initialize the hidden state of the core network and the location vector. This is called once every time a new minibatch `x` is introduced. """ h_t = torch.zeros(self.batch_size, self.hidden_size) h_t = Variable(h_t) l_t = torch.Tensor(self.batch_size, 2).uniform_(-1, 1) l_t = Variable(l_t) return h_t, l_t def train(self): """ Train the model on the training set. A checkpoint of the model is saved after each epoch and if the validation accuracy is improved upon, a separate ckpt is created for use on the test set. 
""" # load the most recent checkpoint # if self.resume: # self.load_checkpoint(best=False) print("\n[*] Train on {} samples, validate on {} samples".format( self.num_train, self.num_valid) ) for epoch in range(self.epochs): print( '\nEpoch: {}/{} - LR: {:.6f}'.format( epoch+1, self.epochs, self.lr) ) # train for 1 epoch train_loss, train_acc = self.train_one_epoch(epoch) # evaluate on validation set valid_loss, valid_acc = self.validate(epoch) # self.scheduler.step(valid_loss) # # # # decay learning rate # # if epoch < self.saturate_epoch: # # self.anneal_learning_rate(epoch) # is_best = valid_acc > self.best_valid_acc msg1 = "train loss: {:.3f} - train acc: {:.3f} " msg2 = "- val loss: {:.3f} - val acc: {:.3f}" if is_best: msg2 += " [*]" msg = msg1 + msg2 print(msg.format(train_loss, train_acc, valid_loss, valid_acc)) # # check for improvement # if not is_best: # self.counter += 1 # if self.counter > self.patience: # print("[!] No improvement in a while, stopping training.") # return # self.best_valid_acc = max(valid_acc, self.best_valid_acc) # self.save_checkpoint( # {'epoch': epoch + 1, 'state_dict': self.model.state_dict(), # 'best_valid_acc': self.best_valid_acc, # 'lr': self.lr}, is_best # ) def train_one_epoch(self, epoch): """ Train the model for 1 epoch of the training set. An epoch corresponds to one full pass through the entire training set in successive mini-batches. This is used by train() and should not be called manually. """ batch_time = AverageMeter() losses = AverageMeter() accs = AverageMeter() tic = time.time() for i, (x, y) in enumerate(self.train_loader): if self.use_gpu: x, y = x.cuda(), y.cuda() x, y = Variable(x), Variable(y) # plot = False # if (epoch % self.plot_freq == 0) and (i == 0): # plot = True # initialize location vector and hidden state self.batch_size = x.shape[0] h_t, l_t = self.reset() # save images # imgs = [] # imgs.append(x[0:9]) # extract the glimpses locs = [] log_pi = [] baselines = [] for t in range(self.num_glimpses - 1): # forward pass through model h_t, l_t, b_t, p = self.model(x, l_t, h_t) # store locs.append(l_t[0:9]) baselines.append(b_t) log_pi.append(p) # last iteration h_t, l_t, b_t, log_probas, p = self.model( x, l_t, h_t, last=True ) log_pi.append(p) baselines.append(b_t) # locs.append(l_t[0:9]) # convert list to tensors and reshape baselines = torch.stack(baselines).transpose(1, 0) log_pi = torch.stack(log_pi).transpose(1, 0) # calculate reward predicted = torch.max(log_probas, 1)[1] R = (predicted.detach() == y).float() R = R.unsqueeze(1).repeat(1, self.num_glimpses) # compute losses for differentiable modules loss_action = F.nll_loss(log_probas, y) loss_baseline = F.mse_loss(baselines, R) # compute reinforce loss adjusted_reward = R - baselines.detach() loss_reinforce = torch.mean(-log_pi*adjusted_reward) # sum up into a hybrid loss loss = loss_action + loss_baseline + loss_reinforce # compute accuracy correct = (predicted == y).float() acc = 100 * (correct.sum() / len(y)) # store losses.update(loss.item(), x.size()[0]) accs.update(acc.item(), x.size()[0]) # a = list(self.model.sensor.parameters())[0].clone() # self.optimizer.zero_grad() # loss_reinforce.backward() # self.optimizer.step() # b = list(self.model.sensor.parameters())[0].clone() # print("Same: {}".format(torch.equal(a.data, b.data))) # compute gradients and update SGD self.optimizer.zero_grad() loss.backward() self.optimizer.step() # measure elapsed time toc = time.time() batch_time.update(toc-tic) # print("{:.1f}s - loss: {:.3f} - acc: {:.3f}".format( # (toc-tic), 
loss.data[0], acc.data[0] # )) return losses.avg, accs.avg def validate(self, epoch): """ Evaluate the model on the validation set. """ losses = AverageMeter() accs = AverageMeter() for i, (x, y) in enumerate(self.valid_loader): if self.use_gpu: x, y = x.cuda(), y.cuda() x, y = Variable(x), Variable(y) # duplicate 10 times x = x.repeat(self.M, 1, 1, 1) # initialize location vector and hidden state self.batch_size = x.shape[0] h_t, l_t = self.reset() # extract the glimpses log_pi = [] baselines = [] for t in range(self.num_glimpses - 1): # forward pass through model h_t, l_t, b_t, p = self.model(x, l_t, h_t) # store baselines.append(b_t) log_pi.append(p) # last iteration h_t, l_t, b_t, log_probas, p = self.model( x, l_t, h_t, last=True ) log_pi.append(p) baselines.append(b_t) # convert list to tensors and reshape baselines = torch.stack(baselines).transpose(1, 0) log_pi = torch.stack(log_pi).transpose(1, 0) # average log_probas = log_probas.view( self.M, -1, log_probas.shape[-1] ) log_probas = torch.mean(log_probas, dim=0) baselines = baselines.contiguous().view( self.M, -1, baselines.shape[-1] ) baselines = torch.mean(baselines, dim=0) log_pi = log_pi.contiguous().view( self.M, -1, log_pi.shape[-1] ) log_pi = torch.mean(log_pi, dim=0) # calculate reward predicted = torch.max(log_probas, 1)[1] R = (predicted.detach() == y).float() R = R.unsqueeze(1).repeat(1, self.num_glimpses) # compute losses for differentiable modules loss_action = F.nll_loss(log_probas, y) loss_baseline = F.mse_loss(baselines, R) # compute reinforce loss adjusted_reward = R - baselines.detach() loss_reinforce = torch.mean(-log_pi*adjusted_reward) # sum up into a hybrid loss loss = loss_action + loss_baseline + loss_reinforce # compute accuracy correct = (predicted == y).float() acc = 100 * (correct.sum() / len(y)) # store losses.update(loss.item(), x.size()[0]) accs.update(acc.item(), x.size()[0]) return losses.avg, accs.avg def test(self, loader): """ Test the model on the held-out test data. This function should only be called at the very end once the model has finished training. """ correct = 0 self.test_loader = loader # load the best checkpoint # self.load_checkpoint(best=self.best) self.num_test = len(self.test_loader.dataset) for i, (x, y) in enumerate(self.test_loader): # if self.use_gpu: # x, y = x.cuda(), y.cuda() x, y = Variable(x), Variable(y) # duplicate 10 times x = x.repeat(self.M, 1, 1, 1) # initialize location vector and hidden state self.batch_size = x.shape[0] h_t, l_t = self.reset() # extract the glimpses for t in range(self.num_glimpses - 1): # forward pass through model h_t, l_t, b_t, p = self.model(x, l_t, h_t) # last iteration h_t, l_t, b_t, log_probas, p = self.model( x, l_t, h_t, last=True ) log_probas = log_probas.view( self.M, -1, log_probas.shape[-1] ) log_probas = torch.mean(log_probas, dim=0) pred = log_probas.data.max(1, keepdim=True)[1] correct += pred.eq(y.data.view_as(pred)).cpu().sum() perc = (100. * correct) / (self.num_test) print( '[*] Test Acc: {}/{} ({:.2f}%)'.format( correct, self.num_test, perc) ) def anneal_learning_rate(self, epoch): """ This function linearly decays the learning rate to a predefined minimum over a set amount of epochs. """ self.lr += self.decay_rate # log to tensorboard if self.use_tensorboard: log_value('learning_rate', self.lr, epoch) for param_group in self.optimizer.param_groups: param_group['lr'] = self.lr def save_checkpoint(self, state, is_best): """ Save a copy of the model so that it can be loaded at a future date. 
This function is used when the model is being evaluated on the test data. If this model has reached the best validation accuracy thus far, a seperate file with the suffix `best` is created. """ # print("[*] Saving model to {}".format(self.ckpt_dir)) filename = self.model_name + '_ckpt.pth.tar' ckpt_path = os.path.join(self.ckpt_dir, filename) torch.save(state, ckpt_path) if is_best: filename = self.model_name + '_model_best.pth.tar' shutil.copyfile( ckpt_path, os.path.join(self.ckpt_dir, filename) ) def load_checkpoint(self, best=False): """ Load the best copy of a model. This is useful for 2 cases: - Resuming training with the most recent model checkpoint. - Loading the best validation model to evaluate on the test data. Params ------ - best: if set to True, loads the best model. Use this if you want to evaluate your model on the test data. Else, set to False in which case the most recent version of the checkpoint is used. """ print("[*] Loading model from {}".format(self.ckpt_dir)) filename = self.model_name + '_ckpt.pth.tar' if best: filename = self.model_name + '_model_best.pth.tar' ckpt_path = os.path.join(self.ckpt_dir, filename) ckpt = torch.load(ckpt_path) # load variables from checkpoint self.start_epoch = ckpt['epoch'] self.best_valid_acc = ckpt['best_valid_acc'] self.lr = ckpt['lr'] self.model.load_state_dict(ckpt['state_dict']) if best: print( "[*] Loaded {} checkpoint @ epoch {} " "with best valid acc of {:.3f}".format( filename, ckpt['epoch']+1, ckpt['best_valid_acc']) ) else: print( "[*] Loaded {} checkpoint @ epoch {}".format( filename, ckpt['epoch']+1) )
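# Hedged sketch of how the ReduceLROnPlateau scheduler created in __init__ could be driven by
# the validation loss (the scheduler.step(valid_loss) call is commented out in train() above).
# The model, optimizer and losses below are illustrative assumptions.
from torch import nn
from torch.optim import SGD
from torch.optim.lr_scheduler import ReduceLROnPlateau

model = nn.Linear(4, 2)
optimizer = SGD(model.parameters(), lr=0.001, momentum=0.5)
scheduler = ReduceLROnPlateau(optimizer, 'min')  # reduce the LR when the monitored value plateaus

for epoch in range(5):
    valid_loss = 1.0 / (epoch + 1)   # stand-in for the value returned by validate(epoch)
    scheduler.step(valid_loss)       # the LR drops only after `patience` non-improving epochs
    print(epoch, optimizer.param_groups[0]['lr'])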
def train(config): loader = DataLoader(TrainEvalDataset( config.dataset(split='train', **config.dataset_parameter), config), config.batch_size, True, num_workers=num_processor) test_loader = DataLoader(TrainEvalDataset( config.dataset(split='test', **config.dataset_parameter), config), config.batch_size, False, num_workers=num_processor) net = NetModel(config.net) loss_calculator = LossCalculator(config.net.loss) # net = nn.DataParallel(net) logger.info(config.net.pre_train) logger.info(type(config.net.pre_train)) if config.net.pre_train is not None and os.path.exists( config.net.pre_train): unused, unused1 = net.load_state_dict( {(('base_net.' + k) if not k.startswith('base_net') else k): v for k, v in torch.load(config.net.pre_train).items()}, strict=False) logger.info(unused) logger.info(unused1) net = net.to(device) optimizer = SGD(net.parameters(), config.lr, 0.9, weight_decay=0.0005) # optimizer = Adam(net.parameters(), config.lr, weight_decay=0.0005) exp_lr_scheduler = lr_scheduler.ExponentialLR(optimizer, 0.30) storage_dict = SqliteDict(f'{config.output_dir}/dcl_snap.db') start_epoach = 0 if len(storage_dict) > 0: kk = list(storage_dict.keys()) # net.load_state_dict( # torch.load(BytesIO(storage_dict[38]))) net.load_state_dict(torch.load(BytesIO(storage_dict[kk[-1]]))) start_epoach = int(kk[-1]) + 1 logger.info(f'loading from epoach{start_epoach}') global_step = 0 for epoach in (range(start_epoach, config.max_it)): net.train() for batch_cnt, batch in tqdm(enumerate(loader), total=len(loader)): image, label = batch if isinstance(image, torch.Tensor): image = image.to(device) elif isinstance(image, dict): for k, v in image.items(): if isinstance(v, torch.Tensor): image[k] = image[k].to(device) elif isinstance(image, list): for v in image: v.to(device) for k, v in label.items(): if isinstance(v, torch.Tensor): label[k] = label[k].to(device) optimizer.zero_grad() net_out = net(image) loss_sum, loss_map = loss_calculator(net_out, label) loss_sum.backward() optimizer.step() global_step += 1 wtire_summary(loss_map, 'train', global_step) exp_lr_scheduler.step(epoach) logger.debug(f'saving epoach {epoach}') buffer = BytesIO() torch.save(net.state_dict(), buffer) buffer.seek(0) storage_dict[epoach] = buffer.read() storage_dict.commit() test(config, net, test_loader, epoach, loss_calculator)
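# Hedged sketch of the checkpointing pattern used above: the state_dict is serialized into an
# in-memory buffer, stored per epoch in a SqliteDict, and later restored from the most recent
# key. Requires the `sqlitedict` package; the tiny model and file name are illustrative assumptions.
from io import BytesIO
import torch
from torch import nn
from sqlitedict import SqliteDict

net = nn.Linear(3, 1)                              # stand-in for NetModel(config.net)
storage_dict = SqliteDict('dcl_snap_example.db')

# save: serialize the state_dict into bytes and key it by epoch
epoch = 0
buffer = BytesIO()
torch.save(net.state_dict(), buffer)
buffer.seek(0)
storage_dict[str(epoch)] = buffer.read()
storage_dict.commit()

# load: pick the latest epoch key and restore from the stored bytes
last_key = sorted(storage_dict.keys(), key=int)[-1]
net.load_state_dict(torch.load(BytesIO(storage_dict[last_key])))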
next_pred, _ = n_copy(input, (h.detach(), c.detach())) target = input[0, 0, 0].detach() + gamma * next_pred.detach().item() # print(gt_target, target) real_error = (target - value_prediction)**2 gt_error = (gt_target - value_prediction)**2 if sum_of_error is None: sum_of_error = real_error else: sum_of_error = sum_of_error + real_error running_error = running_error * 0.9999 + gt_error.detach().item() * 0.0001 if (i % args["truncation"] == 0): opti.zero_grad() sum_of_error.backward() opti.step() h = h.detach() c = c.detach() sum_of_error = None if (i % 50000 == 20000): error_list.append([str(rank), str(i), str(running_error)]) if (i % 100000 == 4): my_experiment.insert_values("predictions", predictions_table_keys, predictions_list) predictions_list = [] my_experiment.insert_values("error_table", error_table_keys, error_list) error_list = [] if (i % 100000 == 0):
batch_size = 256
trans_mnist = transforms.Compose([transforms.ToTensor(),
                                  transforms.Normalize((0.1307,), (0.3081,))])
train_dataset = MNIST('./data/mnist/', train=True, download=True, transform=trans_mnist)
test_dataset = MNIST('./data/mnist/', train=False, download=True, transform=trans_mnist)
train_loader = DataLoader(train_dataset, batch_size=batch_size)
test_loader = DataLoader(test_dataset, batch_size=batch_size)
model = LeNet().to(args.device)
sgd = SGD(model.parameters(), lr=1e-1)
cross_error = CrossEntropyLoss()
epoch = 100

writer = SummaryWriter('./runs/t_centerlize')
for _epoch in range(epoch):
    epoch_loss = []
    for idx, (train_x, train_label) in enumerate(train_loader):
        train_x, train_label = train_x.to(args.device), train_label.to(args.device)
        # label_np = np.zeros((train_label.shape[0], 10))
        sgd.zero_grad()
        predict_y = model(train_x.float())
        _error = cross_error(predict_y, train_label.long())
        _error.backward()
        sgd.step()
        # keep only the scalar loss so the autograd graph is freed after each batch
        epoch_loss.append(_error.item())
    avg_epoch = sum(epoch_loss) / len(epoch_loss)
    writer.add_scalar("train_loss", avg_epoch, _epoch)
    print('Round {:3d}, Average loss {:.3f}'.format(_epoch, avg_epoch))

acc_test, loss_test = test_img(model, test_dataset, args)
print("Testing accuracy: {:.2f}".format(acc_test))
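# test_img() is defined elsewhere; a minimal evaluation loop in the same spirit might look like
# this hedged sketch. The accuracy convention (percent correct) and the use of a DataLoader
# instead of a raw dataset are assumptions for illustration.
import torch

def evaluate(model, data_loader, device):
    # disable dropout/batch-norm updates and gradient tracking for evaluation
    model.eval()
    criterion = torch.nn.CrossEntropyLoss(reduction='sum')
    correct, total, loss_sum = 0, 0, 0.0
    with torch.no_grad():
        for x, y in data_loader:
            x, y = x.to(device), y.to(device)
            logits = model(x.float())
            loss_sum += criterion(logits, y.long()).item()
            correct += (logits.argmax(dim=1) == y).sum().item()
            total += y.size(0)
    return 100.0 * correct / total, loss_sum / total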
class LunaTrainingApp(): def __init__(self, sys_argv=None): if sys_argv is None: sys_argv = sys.argv[1:] parser = argparse.ArgumentParser() parser.add_argument('--batch-size', help="Batch size to use for training", default=32, type=int) parser.add_argument( '--num-workers', help="Number of worker processes for background data loading", default=8, type=int) parser.add_argument('--epochs', help="Number of epochs to train for", default=1, type=int) self.cli_args = parser.parse_args(sys_argv) self.time_str = datetime.datetime.now().strftime("%Y-%m-%d_%H:%M:%S") def main(self): log.info("Starting {}, {}".format(type(self).__name__, self.cli_args)) self.use_cuda = torch.cuda.is_available() self.device = torch.device("cuda" if self.use_cuda else "cpu") self.model = LunaModel() if self.use_cuda: if torch.cuda.device_count() > 1: self.model = nn.DataParallel(self.model) self.model = self.model.to(self.device) self.optimizer = SGD(self.model.parameters(), lr=0.01, momentum=0.9) train_dl = DataLoader( LunaDataset(test_stride=10, isTestSet_bool=False), batch_size=self.cli_args.batch_size * (torch.cuda.device_count() if self.use_cuda else 1), num_workers=self.cli_args.num_workers, pin_memory=self.use_cuda) test_dl = DataLoader( LunaDataset(test_stride=10, isTestSet_bool=True), batch_size=self.cli_args.batch_size * (torch.cuda.device_count() if self.use_cuda else 1), num_workers=self.cli_args.num_workers, pin_memory=self.use_cuda) for epoch_ndx in range(1, self.cli_args.epochs + 1): log.info("Epoch {} of {}, {}/{} batches of size {}*{}".format( epoch_ndx, self.cli_args.epochs, len(train_dl), len(test_dl), self.cli_args.batch_size, (torch.cuda.device_count() if self.use_cuda else 1))) # Trainig loop, very similar to below self.model.train() trainingMetrics_tensor = torch.zeros(3, len(train_dl.dataset), 1) batch_iter = enumerateWithEstimate(train_dl, "E{} Traning".format(epoch_ndx), start_ndx=train_dl.num_workers) for batch_ndx, batch_tup in batch_iter: self.optimizer.zero_grad() loss_var = self.computeBatchLoss(batch_ndx, batch_tup, train_dl.batch_size, trainingMetrics_tensor) loss_var.backward() self.optimizer.step() del loss_var # Testing loop, very similar to above, but simplified with torch.no_grad(): self.model.eval() testingMetrics_tensor = torch.zeros(3, len(test_dl.dataset), 1) batch_iter = enumerateWithEstimate( test_dl, "E{} Testing".format(epoch_ndx), start_ndx=test_dl.num_workers) for batch_ndx, batch_tup in batch_iter: self.computeBatchLoss(batch_ndx, batch_tup, test_dl.batch_size, testingMetrics_tensor) self.logMetrics(epoch_ndx, trainingMetrics_tensor, testingMetrics_tensor) def computeBatchLoss(self, batch_ndx, batch_tup, batch_size, metrics_tensor): input_tensor, label_tensor, _series_list, _center_list = batch_tup input_devtensor = input_tensor.to(self.device) label_devtensor = label_tensor.to(self.device) prediction_devtensor = self.model(input_devtensor) loss_devsensor = nn.MSELoss(reduction='none')(prediction_devtensor, label_devtensor) start_ndx = batch_ndx * batch_size end_ndx = start_ndx + label_tensor.size(0) metrics_tensor[METRICS_LABEL_NDX, start_ndx:end_ndx] = label_tensor metrics_tensor[METRICS_PRED_NDX, start_ndx:end_ndx] = \ prediction_devtensor.to('cpu') metrics_tensor[METRICS_LOSS_NDX, start_ndx:end_ndx] = \ loss_devsensor return loss_devsensor.mean() def logMetrics(self, epoch_ndx, trainingMetrics_tensor, testingMetrics_tensor, classificationThreshold_float=0.5): log.info("E{} {}".format(epoch_ndx, type(self).__name__)) for mode_str, metrics_tensor in [('trn', 
trainingMetrics_tensor), ('tst', testingMetrics_tensor)]: metrics_ary = metrics_tensor.detach().numpy()[:, :, 0] assert np.isfinite(metrics_ary).all() benLabel_mask = metrics_ary[METRICS_LABEL_NDX] <= \ classificationThreshold_float benPred_mask = metrics_ary[METRICS_PRED_NDX] <= \ classificationThreshold_float malLabel_mask = ~benLabel_mask malPred_mask = ~benPred_mask benLabel_count = benLabel_mask.sum() malLabel_count = malLabel_mask.sum() benCorrect_count = (benLabel_mask & benPred_mask).sum() malCorrect_count = (malLabel_mask & malPred_mask).sum() metrics_dict = {} metrics_dict['loss/all'] = metrics_ary[METRICS_LOSS_NDX].mean() metrics_dict['loss/ben'] = metrics_ary[METRICS_LOSS_NDX, benLabel_mask].mean() metrics_dict['loss/mal'] = metrics_ary[METRICS_LOSS_NDX, malLabel_mask].mean() metrics_dict['correct/all'] = (malCorrect_count + benCorrect_count) \ / metrics_ary.shape[1] * 100 metrics_dict[ 'correct/ben'] = benCorrect_count / benLabel_count * 100 metrics_dict[ 'correct/mal'] = malCorrect_count / malLabel_count * 100 log.info(("E{} {:8} {loss/all:.4f} loss, {correct/all:-5.1f}% " "correct").format(epoch_ndx, mode_str, **metrics_dict)) log.info(("E{} {:8} {loss/ben:.4f} loss, {correct/ben:-5.1f}% " "correct").format(epoch_ndx, mode_str + '_ben', **metrics_dict)) log.info(("E{} {:8} {loss/mal:.4f} loss, {correct/mal:-5.1f}% " "correct").format(epoch_ndx, mode_str + 'mal', **metrics_dict))
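# Hedged sketch of the boolean-mask bookkeeping used in logMetrics above, on dummy numpy
# arrays: thresholded labels and predictions become masks, and `&` combines them to count
# per-class correct samples. The numbers are illustrative.
import numpy as np

labels = np.array([0., 0., 1., 1., 1.])
preds = np.array([0.2, 0.7, 0.8, 0.4, 0.9])
threshold = 0.5

ben_label_mask = labels <= threshold   # "benign" ground truth
ben_pred_mask = preds <= threshold     # predicted benign
mal_label_mask = ~ben_label_mask
mal_pred_mask = ~ben_pred_mask

ben_correct = (ben_label_mask & ben_pred_mask).sum()   # true negatives
mal_correct = (mal_label_mask & mal_pred_mask).sum()   # true positives
accuracy = (ben_correct + mal_correct) / labels.shape[0] * 100
print(ben_correct, mal_correct, accuracy)               # 1 2 60.0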
def train_class(directory, version, model, train_loader, valid_loader, resize, batch_size, exp_name='experiment', lr=0.01, epochs=10, momentum=0.99, logdir='logs', dizionario=None): print("Taining classifacation") criterion = nn.CrossEntropyLoss() optimizer = SGD(model.parameters(), lr, momentum=momentum) #meters loss_meter = AverageValueMeter() acc_meter = AverageValueMeter() #writer writer = SummaryWriter(join(logdir, exp_name)) #device device = "cuda" if torch.cuda.is_available() else "cpu" model.to(device) #definiamo un dizionario contenente i loader di training e test loader = {'train': train_loader, 'valid': valid_loader} array_accuracy_train = [] array_accuracy_valid = [] array_loss_train = [] array_loss_valid = [] array_glb_train = [] array_glb_valid = [] last_loss_train = 0 last_loss_val = 0 last_acc_train = 0 last_acc_val = 0 #inizializziamo il global step global_step = 0 tempo = Timer() start = timer() start_epoca = 0 if dizionario is not None: print("Inizializza") array_accuracy_train = dizionario["a_train"] array_accuracy_valid = dizionario["a_valid"] array_loss_train = dizionario["l_train"] array_loss_valid = dizionario["l_valid"] array_glb_train = dizionario["g_train"] array_glb_valid = dizionario["g_valid"] global_step = dizionario["g_valid"][-1] start_epoca = dizionario["epoche_fatte"] + 1 # indice epoca di inizio print("global step", global_step) print("a_acc_train", array_accuracy_train) print("a_acc_valid", array_accuracy_valid) print("loss_train", array_loss_train) print("loss_valid", array_loss_valid) print("glb_train", array_glb_train) print("glb_valid", array_glb_valid) print("epoca_start_indice ", start_epoca) start = timer() print("Num epoche", epochs) for e in range(start_epoca, epochs): print("Epoca= ", e) #iteriamo tra due modalità: train e test for mode in ['train', 'valid']: loss_meter.reset() acc_meter.reset() model.train() if mode == 'train' else model.eval() with torch.set_grad_enabled( mode == 'train'): #abilitiamo i gradienti solo in training for i, batch in enumerate(loader[mode]): print(batch['label']) #x, y = [b.to(device) for b in batch] x = batch['image'].to( device) #"portiamoli sul device corretto" y = batch['label'].to(device) output = model(x) #aggiorniamo il global_step #conterrà il numero di campioni visti durante il training n = x.shape[0] #numero di elementi nel batch print("numero elementi nel batch ", n) global_step += n l = criterion(output, y) if mode == 'train': l.backward() optimizer.step() optimizer.zero_grad() print("Etichette predette", output.to('cpu').max(1)[1]) acc = accuracy_score(y.to('cpu'), output.to('cpu').max(1)[1]) loss_meter.add(l.item(), n) acc_meter.add(acc, n) #loggiamo i risultati iterazione per iterazione solo durante il training if mode == 'train': writer.add_scalar('loss/train', loss_meter.value(), global_step=global_step) writer.add_scalar('accuracy/train', acc_meter.value(), global_step=global_step) print("Accuracy Train=", acc_meter.value()) #una volta finita l'epoca (sia nel caso di training che test, loggiamo le stime finali) if mode == 'train': global_step_train = global_step last_loss_train = loss_meter.value() last_acc_train = acc_meter.value() print("Accuracy Train=", acc_meter.value()) array_accuracy_train.append(acc_meter.value()) array_loss_train.append(loss_meter.value()) array_glb_train.append(global_step) else: global_step_val = global_step last_loss_val = loss_meter.value() last_acc_val = acc_meter.value() print("Accuracy Valid=", acc_meter.value()) array_accuracy_valid.append(acc_meter.value()) 
array_loss_valid.append(loss_meter.value()) array_glb_valid.append(global_step) writer.add_scalar('loss/' + mode, loss_meter.value(), global_step=global_step) writer.add_scalar('accuracy/' + mode, acc_meter.value(), global_step=global_step) print("Loss TRAIN", array_loss_train) print("Losss VALID", array_loss_valid) print("Accuracy TRAIN", array_accuracy_train) print("Accuracy VALID", array_accuracy_valid) print("dim acc train", len(array_accuracy_train)) print("dim acc valid", len(array_accuracy_valid)) figure = plt.figure(figsize=(12, 8)) plt.plot(array_glb_train, array_accuracy_train) plt.plot(array_glb_valid, array_accuracy_valid) plt.xlabel('samples') plt.ylabel('accuracy') plt.grid() plt.legend(['Training', 'Valid']) plt.savefig(directory + '//plotAccuracy_' + version + '.png') plt.clf() plt.close(figure) figure = plt.figure(figsize=(12, 8)) plt.plot(array_glb_train, array_loss_train) plt.plot(array_glb_valid, array_loss_valid) plt.xlabel('samples') plt.ylabel('loss') plt.grid() plt.legend(['Training', 'Valid']) plt.savefig(directory + '//plotLoss_' + version + '.png') plt.clf() plt.close(figure) #conserviamo i pesi del modello alla fine di un ciclo di training e test net_save(epochs, model, optimizer, last_loss_train, last_loss_val, last_acc_train, last_acc_val, global_step_train, global_step_val, '%s.pth' % (exp_name + "_dict"), dict_stato_no=True) #conserviamo i pesi del modello alla fine di un ciclo di training e test torch.save( model, directory + "//" + version + "//" + '%s.pth' % (exp_name + "_" + str(e))) torch.save(model, '%s.pth' % (exp_name)) saveArray(directory, version, array_loss_train, array_loss_valid, array_accuracy_train, array_accuracy_valid, array_glb_train, array_glb_valid) saveinFileJson(start, directory, version, resize, batch_size, e, lr, momentum, len(train_loader), array_accuracy_train[-1], array_accuracy_valid[-1], array_loss_train[-1], array_loss_valid[-1]) f = '{:.7f}'.format(tempo.stop()) return model, f, last_loss_train, last_loss_val, last_acc_train, last_acc_val
fitness_shaping) train_writer.add_scalar('fitness', raw_fitness.mean(), i) train_writer.add_scalar('fitness/std', raw_fitness.std(), i) for p_idx, p in enumerate(population.parameters()): train_writer.add_histogram('grads/%d' % p_idx, p.grad, i) for k, p in population.mixing_logits.items(): train_writer.add_histogram( "entropy/%s" % k, t.distributions.Categorical(logits=p).entropy(), i) means = population.component_means # (480, 5) dist = ((means.unsqueeze(0) - means.unsqueeze(1))**2).sum( dim=2).sqrt() # (1, 480, 5,) - (480, 1, 5) = (480, 480, 5) train_writer.add_histogram("dist", dist, i) optim.step() sched.step() population.std *= 0.999 mean_fit = raw_fitness.mean().item() pbar.set_description("avg fit: %.3f, std: %.3f" % (mean_fit, raw_fitness.std().item())) all_params = population.parameters() t.save(all_params, 'last.t') if mean_fit > best_so_far: best_so_far = mean_fit t.save(all_params, 'best.t') util.upload_results('best.t')
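# Hedged sketch of the pairwise-distance computation logged above: unsqueezing the (N, D)
# matrix of component means along two different dims and subtracting broadcasts to an
# (N, N, D) difference tensor, whose squared sum over D gives all pairwise Euclidean
# distances. The shapes are illustrative.
import torch as t

means = t.randn(480, 5)                          # (N, D) component means
diff = means.unsqueeze(0) - means.unsqueeze(1)   # (1, N, D) - (N, 1, D) -> (N, N, D)
dist = diff.pow(2).sum(dim=2).sqrt()             # (N, N) pairwise Euclidean distances
print(dist.shape, t.allclose(t.cdist(means, means), dist, atol=1e-4))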
def train(train_source_iter: ForeverDataIterator, train_target_iter: ForeverDataIterator, model: ImageClassifier, domain_adv: ConditionalDomainAdversarialLoss, optimizer: SGD, lr_sheduler: StepwiseLR, epoch: int, args: argparse.Namespace): batch_time = AverageMeter('Time', ':3.1f') data_time = AverageMeter('Data', ':3.1f') losses = AverageMeter('Loss', ':3.2f') trans_losses = AverageMeter('Trans Loss', ':3.2f') cls_accs = AverageMeter('Cls Acc', ':3.1f') domain_accs = AverageMeter('Domain Acc', ':3.1f') progress = ProgressMeter( args.iters_per_epoch, [batch_time, data_time, losses, trans_losses, cls_accs, domain_accs], prefix="Epoch: [{}]".format(epoch)) # switch to train mode model.train() domain_adv.train() end = time.time() for i in range(args.iters_per_epoch): lr_sheduler.step() # measure data loading time data_time.update(time.time() - end) x_s, labels_s = next(train_source_iter) x_t, _ = next(train_target_iter) x_s = x_s.to(device) x_t = x_t.to(device) labels_s = labels_s.to(device) # compute output x = torch.cat((x_s, x_t), dim=0) y, f = model(x) y_s, y_t = y.chunk(2, dim=0) f_s, f_t = f.chunk(2, dim=0) cls_loss = F.cross_entropy(y_s, labels_s) transfer_loss = domain_adv(y_s, f_s, y_t, f_t) domain_acc = domain_adv.domain_discriminator_accuracy loss = cls_loss + transfer_loss * args.trade_off cls_acc = accuracy(y_s, labels_s)[0] losses.update(loss.item(), x_s.size(0)) cls_accs.update(cls_acc.item(), x_s.size(0)) domain_accs.update(domain_acc.item(), x_s.size(0)) trans_losses.update(transfer_loss.item(), x_s.size(0)) # compute gradient and do SGD step optimizer.zero_grad() loss.backward() optimizer.step() # measure elapsed time batch_time.update(time.time() - end) end = time.time() if i % args.print_freq == 0: progress.display(i)
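# Hedged sketch of the concat-then-chunk pattern above: the source and target mini-batches are
# stacked into a single forward pass and the outputs are split back in the same order. The dummy
# backbone is an illustrative assumption, not the ImageClassifier used above.
import torch
from torch import nn

backbone = nn.Linear(16, 4)
x_s = torch.randn(32, 16)            # source batch
x_t = torch.randn(32, 16)            # target batch

x = torch.cat((x_s, x_t), dim=0)     # one forward pass over both domains
y = backbone(x)
y_s, y_t = y.chunk(2, dim=0)         # first half back to source, second half to target
assert y_s.shape == (32, 4) and y_t.shape == (32, 4)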
# Choose different schedulers to test
scheduler = StepLR(optim, step_size=10, gamma=0.1)
scheduler = MultiStepLR(optim, milestones=[3, 6, 9], gamma=0.1)
scheduler = ReduceLROnPlateau(optim, threshold=0.99, mode='min', patience=2, cooldown=5)
scheduler = WarmupLR(scheduler, init_lr=0.01, num_warmup=3, warmup_strategy='cos')

# this zero-gradient update is needed to avoid a warning message, issue #8.
optim.zero_grad()
optim.step()

# The wrapper doesn't affect the old scheduler API: simply plug and play
for epoch in range(1, 20):
    # step with a pseudo loss if we're using ReduceLROnPlateau
    if isinstance(scheduler._scheduler, ReduceLROnPlateau):
        pseudo_loss = 20 - epoch
        scheduler.step(pseudo_loss)
        print('Epoch: {} LR: {:.3f} pseudo loss: {:.2f}'.format(
            epoch, optim.param_groups[0]['lr'], pseudo_loss))
    # step without any parameters
    else:
        scheduler.step()
        print(epoch, optim.param_groups[0]['lr'])
    optim.step()  # optimizer step (update the network)
def main(): # Arguments parser parser = argparse.ArgumentParser( description='Tuning with DNN Model for NER') # Model Hyperparameters parser.add_argument('--mode', choices=['RNN', 'LSTM', 'GRU'], help='architecture of rnn', default='LSTM') parser.add_argument('--encoder_mode', choices=['cnn', 'lstm'], help='Encoder type for sentence encoding', default='lstm') parser.add_argument('--char_method', choices=['cnn', 'lstm'], help='Method to create character-level embeddings', required=True) parser.add_argument( '--hidden_size', type=int, default=128, help='Number of hidden units in RNN for sentence level') parser.add_argument('--char_hidden_size', type=int, default=30, help='Output character-level embeddings size') parser.add_argument('--char_dim', type=int, default=30, help='Dimension of Character embeddings') parser.add_argument('--tag_space', type=int, default=0, help='Dimension of tag space') parser.add_argument('--num_layers', type=int, default=1, help='Number of layers of RNN') parser.add_argument('--dropout', choices=['std', 'weight_drop'], help='Dropout method', default='weight_drop') parser.add_argument('--p_em', type=float, default=0.33, help='dropout rate for input embeddings') parser.add_argument('--p_in', type=float, default=0.33, help='dropout rate for input of RNN model') parser.add_argument('--p_rnn', nargs=2, type=float, required=True, help='dropout rate for RNN') parser.add_argument('--p_out', type=float, default=0.33, help='dropout rate for output layer') parser.add_argument('--bigram', action='store_true', help='bi-gram parameter for CRF') # Data loading and storing params parser.add_argument('--embedding_dict', help='path for embedding dict') parser.add_argument('--dataset_name', type=str, default='alexa', help='Which dataset to use') parser.add_argument('--train', type=str, required=True, help='Path of train set') parser.add_argument('--dev', type=str, required=True, help='Path of dev set') parser.add_argument('--test', type=str, required=True, help='Path of test set') parser.add_argument('--results_folder', type=str, default='results', help='The folder to store results') parser.add_argument('--tmp_folder', type=str, default='tmp', help='The folder to store tmp files') parser.add_argument('--alphabets_folder', type=str, default='data/alphabets', help='The folder to store alphabets files') parser.add_argument('--result_file_name', type=str, default='hyperparameters_tuning', help='File name to store some results') parser.add_argument('--result_file_path', type=str, default='results/hyperparameters_tuning', help='File name to store some results') # Training parameters parser.add_argument('--cuda', action='store_true', help='whether using GPU') parser.add_argument('--num_epochs', type=int, default=100, help='Number of training epochs') parser.add_argument('--batch_size', type=int, default=16, help='Number of sentences in each batch') parser.add_argument('--learning_rate', type=float, default=0.001, help='Base learning rate') parser.add_argument('--decay_rate', type=float, default=0.95, help='Decay rate of learning rate') parser.add_argument('--schedule', type=int, default=3, help='schedule for learning rate decay') parser.add_argument('--gamma', type=float, default=0.0, help='weight for l2 regularization') parser.add_argument('--max_norm', type=float, default=1., help='Max norm for gradients') parser.add_argument('--gpu_id', type=int, nargs='+', required=True, help='which gpu to use for training') # Misc parser.add_argument('--embedding', choices=['glove', 'senna', 'alexa'], 
help='Embedding for words', required=True) parser.add_argument('--restore', action='store_true', help='whether restore from stored parameters') parser.add_argument('--save_checkpoint', type=str, default='', help='the path to save the model') parser.add_argument('--o_tag', type=str, default='O', help='The default tag for outside tag') parser.add_argument('--unk_replace', type=float, default=0., help='The rate to replace a singleton word with UNK') parser.add_argument('--evaluate_raw_format', action='store_true', help='The tagging format for evaluation') args = parser.parse_args() logger = get_logger("NERCRF") # rename the parameters mode = args.mode encoder_mode = args.encoder_mode train_path = args.train dev_path = args.dev test_path = args.test num_epochs = args.num_epochs batch_size = args.batch_size hidden_size = args.hidden_size char_hidden_size = args.char_hidden_size char_method = args.char_method learning_rate = args.learning_rate momentum = 0.9 decay_rate = args.decay_rate gamma = args.gamma max_norm = args.max_norm schedule = args.schedule dropout = args.dropout p_em = args.p_em p_rnn = tuple(args.p_rnn) p_in = args.p_in p_out = args.p_out unk_replace = args.unk_replace bigram = args.bigram embedding = args.embedding embedding_path = args.embedding_dict dataset_name = args.dataset_name result_file_name = args.result_file_name evaluate_raw_format = args.evaluate_raw_format o_tag = args.o_tag restore = args.restore save_checkpoint = args.save_checkpoint gpu_id = args.gpu_id results_folder = args.results_folder tmp_folder = args.tmp_folder alphabets_folder = args.alphabets_folder use_elmo = False p_em_vec = 0. result_file_path = args.result_file_path score_file = "%s/score_gpu_%s" % (tmp_folder, '-'.join(map(str, gpu_id))) if not os.path.exists(results_folder): os.makedirs(results_folder) if not os.path.exists(tmp_folder): os.makedirs(tmp_folder) if not os.path.exists(alphabets_folder): os.makedirs(alphabets_folder) embedd_dict, embedd_dim = utils.load_embedding_dict( embedding, embedding_path) logger.info("Creating Alphabets") word_alphabet, char_alphabet, ner_alphabet = conll03_data.create_alphabets( "{}/{}/".format(alphabets_folder, dataset_name), train_path, data_paths=[dev_path, test_path], embedd_dict=embedd_dict, max_vocabulary_size=50000) logger.info("Word Alphabet Size: %d" % word_alphabet.size()) logger.info("Character Alphabet Size: %d" % char_alphabet.size()) logger.info("NER Alphabet Size: %d" % ner_alphabet.size()) logger.info("Reading Data") device = torch.device('cuda') if args.cuda else torch.device('cpu') print(device) data_train = conll03_data.read_data_to_tensor(train_path, word_alphabet, char_alphabet, ner_alphabet, device=device) num_data = sum(data_train[1]) num_labels = ner_alphabet.size() data_dev = conll03_data.read_data_to_tensor(dev_path, word_alphabet, char_alphabet, ner_alphabet, device=device) data_test = conll03_data.read_data_to_tensor(test_path, word_alphabet, char_alphabet, ner_alphabet, device=device) writer = CoNLL03Writer(word_alphabet, char_alphabet, ner_alphabet) def construct_word_embedding_table(): scale = np.sqrt(3.0 / embedd_dim) table = np.empty([word_alphabet.size(), embedd_dim], dtype=np.float32) table[conll03_data.UNK_ID, :] = np.random.uniform( -scale, scale, [1, embedd_dim]).astype(np.float32) oov = 0 for word, index in word_alphabet.items(): if word in embedd_dict: embedding = embedd_dict[word] elif word.lower() in embedd_dict: embedding = embedd_dict[word.lower()] else: embedding = np.random.uniform( -scale, scale, [1, 
embedd_dim]).astype(np.float32) oov += 1 table[index, :] = embedding print('oov: %d' % oov) return torch.from_numpy(table) word_table = construct_word_embedding_table() logger.info("constructing network...") char_dim = args.char_dim window = 3 num_layers = args.num_layers tag_space = args.tag_space initializer = nn.init.xavier_uniform_ if args.dropout == 'std': network = BiRecurrentConvCRF(embedd_dim, word_alphabet.size(), char_dim, char_alphabet.size(), char_hidden_size, window, mode, encoder_mode, hidden_size, num_layers, num_labels, tag_space=tag_space, embedd_word=word_table, use_elmo=use_elmo, p_em_vec=p_em_vec, p_em=p_em, p_in=p_in, p_out=p_out, p_rnn=p_rnn, bigram=bigram, initializer=initializer) elif args.dropout == 'var': network = BiVarRecurrentConvCRF(embedd_dim, word_alphabet.size(), char_dim, char_alphabet.size(), char_hidden_size, window, mode, encoder_mode, hidden_size, num_layers, num_labels, tag_space=tag_space, embedd_word=word_table, use_elmo=use_elmo, p_em_vec=p_em_vec, p_em=p_em, p_in=p_in, p_out=p_out, p_rnn=p_rnn, bigram=bigram, initializer=initializer) else: network = BiWeightDropRecurrentConvCRF(embedd_dim, word_alphabet.size(), char_dim, char_alphabet.size(), char_hidden_size, window, mode, encoder_mode, hidden_size, num_layers, num_labels, tag_space=tag_space, embedd_word=word_table, p_em=p_em, p_in=p_in, p_out=p_out, p_rnn=p_rnn, bigram=bigram, initializer=initializer) network = network.to(device) lr = learning_rate optim = SGD(network.parameters(), lr=lr, momentum=momentum, weight_decay=gamma, nesterov=True) # optim = Adam(network.parameters(), lr=lr, weight_decay=gamma, amsgrad=True) nn.utils.clip_grad_norm_(network.parameters(), max_norm) logger.info("Network: %s, encoder_mode=%s, num_layer=%d, hidden=%d, char_hidden_size=%d, char_method=%s, tag_space=%d, crf=%s" % \ (mode, encoder_mode, num_layers, hidden_size, char_hidden_size, char_method, tag_space, 'bigram' if bigram else 'unigram')) logger.info( "training: l2: %f, (#training data: %d, batch: %d, unk replace: %.2f)" % (gamma, num_data, batch_size, unk_replace)) logger.info("dropout(in, out, rnn): (%.2f, %.2f, %s)" % (p_in, p_out, p_rnn)) num_batches = num_data // batch_size + 1 dev_f1 = 0.0 dev_acc = 0.0 dev_precision = 0.0 dev_recall = 0.0 test_f1 = 0.0 test_acc = 0.0 test_precision = 0.0 test_recall = 0.0 best_epoch = 0 best_test_f1 = 0.0 best_test_acc = 0.0 best_test_precision = 0.0 best_test_recall = 0.0 best_test_epoch = 0.0 for epoch in range(1, num_epochs + 1): print( 'Epoch %d (%s(%s), learning rate=%.4f, decay rate=%.4f (schedule=%d)): ' % (epoch, mode, args.dropout, lr, decay_rate, schedule)) train_err = 0. train_total = 0. 
start_time = time.time() num_back = 0 network.train() for batch in range(1, num_batches + 1): _, word, char, labels, masks, lengths = conll03_data.get_batch_tensor( data_train, batch_size, unk_replace=unk_replace) optim.zero_grad() loss = network.loss(_, word, char, labels, mask=masks) loss.backward() optim.step() with torch.no_grad(): num_inst = word.size(0) train_err += loss * num_inst train_total += num_inst time_ave = (time.time() - start_time) / batch time_left = (num_batches - batch) * time_ave # update log if batch % 20 == 0: sys.stdout.write("\b" * num_back) sys.stdout.write(" " * num_back) sys.stdout.write("\b" * num_back) log_info = 'train: %d/%d loss: %.4f, time left (estimated): %.2fs' % ( batch, num_batches, train_err / train_total, time_left) sys.stdout.write(log_info) sys.stdout.flush() num_back = len(log_info) sys.stdout.write("\b" * num_back) sys.stdout.write(" " * num_back) sys.stdout.write("\b" * num_back) print('train: %d loss: %.4f, time: %.2fs' % (num_batches, train_err / train_total, time.time() - start_time)) # evaluate performance on dev data with torch.no_grad(): network.eval() tmp_filename = '%s/gpu_%s_dev' % (tmp_folder, '-'.join( map(str, gpu_id))) writer.start(tmp_filename) for batch in conll03_data.iterate_batch_tensor( data_dev, batch_size): _, word, char, labels, masks, lengths = batch preds, _ = network.decode( _, word, char, target=labels, mask=masks, leading_symbolic=conll03_data.NUM_SYMBOLIC_TAGS) writer.write(word.cpu().numpy(), preds.cpu().numpy(), labels.cpu().numpy(), lengths.cpu().numpy()) writer.close() acc, precision, recall, f1 = evaluate(tmp_filename, score_file, evaluate_raw_format, o_tag) print( 'dev acc: %.2f%%, precision: %.2f%%, recall: %.2f%%, F1: %.2f%%' % (acc, precision, recall, f1)) if dev_f1 < f1: dev_f1 = f1 dev_acc = acc dev_precision = precision dev_recall = recall best_epoch = epoch # evaluate on test data when better performance detected tmp_filename = '%s/gpu_%s_test' % (tmp_folder, '-'.join( map(str, gpu_id))) writer.start(tmp_filename) for batch in conll03_data.iterate_batch_tensor( data_test, batch_size): _, word, char, labels, masks, lengths = batch preds, _ = network.decode( _, word, char, target=labels, mask=masks, leading_symbolic=conll03_data.NUM_SYMBOLIC_TAGS) writer.write(word.cpu().numpy(), preds.cpu().numpy(), labels.cpu().numpy(), lengths.cpu().numpy()) writer.close() test_acc, test_precision, test_recall, test_f1 = evaluate( tmp_filename, score_file, evaluate_raw_format, o_tag) if best_test_f1 < test_f1: best_test_acc, best_test_precision, best_test_recall, best_test_f1 = test_acc, test_precision, test_recall, test_f1 best_test_epoch = epoch print( "best dev acc: %.2f%%, precision: %.2f%%, recall: %.2f%%, F1: %.2f%% (epoch: %d)" % (dev_acc, dev_precision, dev_recall, dev_f1, best_epoch)) print( "best test acc: %.2f%%, precision: %.2f%%, recall: %.2f%%, F1: %.2f%% (epoch: %d)" % (test_acc, test_precision, test_recall, test_f1, best_epoch)) print( "overall best test acc: %.2f%%, precision: %.2f%%, recall: %.2f%%, F1: %.2f%% (epoch: %d)" % (best_test_acc, best_test_precision, best_test_recall, best_test_f1, best_test_epoch)) if epoch % schedule == 0: lr = learning_rate / (1.0 + epoch * decay_rate) optim = SGD(network.parameters(), lr=lr, momentum=momentum, weight_decay=gamma, nesterov=True) with open(result_file_path, 'a') as ofile: ofile.write( "best dev acc: %.2f%%, precision: %.2f%%, recall: %.2f%%, F1: %.2f%% (epoch: %d)\n" % (dev_acc, dev_precision, dev_recall, dev_f1, best_epoch)) ofile.write( "best test acc: 
%.2f%%, precision: %.2f%%, recall: %.2f%%, F1: %.2f%% (epoch: %d)\n" % (test_acc, test_precision, test_recall, test_f1, best_epoch)) ofile.write( "overall best test acc: %.2f%%, precision: %.2f%%, recall: %.2f%%, F1: %.2f%% (epoch: %d)\n\n" % (best_test_acc, best_test_precision, best_test_recall, best_test_f1, best_test_epoch)) print('Training finished!')
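# --- Sketch: per-iteration gradient clipping and epoch-based LR decay ---
# A minimal, hedged sketch of two patterns from the NER trainer above. In the
# snippet, nn.utils.clip_grad_norm_ appears to be called once right after the
# optimizer is built, where no gradients exist yet; the usual placement is
# between loss.backward() and optim.step(), as shown here. The decay rule
# lr = learning_rate / (1 + epoch * decay_rate) and the optimizer re-creation
# mirror the snippet. `net` and the numbers below are placeholders, not names
# from the original script. Note that rebuilding SGD also resets its momentum
# buffers, which is a side effect of this scheduling style.
import torch
import torch.nn as nn
from torch.optim import SGD

net = nn.Linear(8, 4)                      # stand-in for `network`
optim = SGD(net.parameters(), lr=0.001, momentum=0.9, nesterov=True)
learning_rate, decay_rate, schedule, max_norm = 0.001, 0.95, 3, 1.0

for epoch in range(1, 10):
    for _ in range(5):                     # stand-in for the batch loop
        x, y = torch.randn(16, 8), torch.randn(16, 4)
        optim.zero_grad()
        loss = nn.functional.mse_loss(net(x), y)
        loss.backward()
        nn.utils.clip_grad_norm_(net.parameters(), max_norm)  # clip each iteration
        optim.step()
    if epoch % schedule == 0:
        lr = learning_rate / (1.0 + epoch * decay_rate)
        optim = SGD(net.parameters(), lr=lr, momentum=0.9, nesterov=True)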
x = model.module.features[2](x) x = model.module.features[3](x) #x = x.view(x.size(0), model.module.nfscat*3, model.module.nspace, model.module.nspace) loss = -1.0* x[0, f_num, 2, 2] #https://towardsdatascience.com/pytorch-implementation-of-perceptual-losses-for-real-time-style-transfer-8d608e2e9902 #reg_loss = REGULARIZATION * ( #torch.sum(torch.abs(im_as_var[:, :, :-1] - im_as_var[ :, :, 1:])) + #torch.sum(torch.abs(im_as_var[ :, :-1, :] - im_as_var[:, 1:, :])) #) reg_loss = 0 loss = loss + reg_loss loss.backward() optimizer.step() recreated_im = copy.copy(im_as_var.data.cpu().numpy()[0]).transpose(2,1,0) #recreated_im = recreated_im[11:22,11:22,:] minned = recreated_im - np.min(recreated_im) ax1 = fig.add_subplot(num_rows, num_cols, importance + 1) ax1.imshow(minned/np.max(minned)) ax1.axis('off') ax1.set_xticklabels([]) ax1.set_yticklabels([]) ax1.set_title("{0:.2f}".format(allFilters - scores[f_num])) plt.subplots_adjust(wspace=1.0, hspace=0.1) plt.savefig("deep_dream_alexnet_l2.png") plt.close()
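# --- Sketch: the total-variation regularizer commented out above ---
# A hedged, self-contained version of the reg_loss that the snippet leaves
# disabled. It penalizes differences between neighbouring pixels, which smooths
# the image being optimized. `im_as_var` is assumed to be a (C, H, W) image
# tensor with requires_grad=True, and REG_WEIGHT is a placeholder multiplier.
import torch

REG_WEIGHT = 1e-4
im_as_var = torch.rand(3, 64, 64, requires_grad=True)

tv_loss = REG_WEIGHT * (
    torch.sum(torch.abs(im_as_var[:, :, :-1] - im_as_var[:, :, 1:])) +  # horizontal neighbours
    torch.sum(torch.abs(im_as_var[:, :-1, :] - im_as_var[:, 1:, :]))    # vertical neighbours
)
tv_loss.backward()  # gradients flow back to the image being optimized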
def train(opt): # set device to cpu/gpu if opt.use_gpu: device = torch.device("cuda", opt.gpu_id) else: device = torch.device("cpu") # Data transformations for data augmentation transform_train = transforms.Compose([ transforms.RandomCrop(32, padding=4), transforms.RandomHorizontalFlip(), transforms.ToTensor(), transforms.RandomErasing(), ]) transform_val = transforms.Compose([ transforms.ToTensor(), ]) # get CIFAR10/CIFAR100 train/val set if opt.dataset == "CIFAR10": alp_lambda = 0.5 lambda_loss = [0.005, 0.001] train_set = CIFAR10(root="./data", train=True, download=True, transform=transform_train) val_set = CIFAR10(root="./data", train=True, download=True, transform=transform_val) else: alp_lambda = 0.5 lambda_loss = [0.005, 0.001] train_set = CIFAR100(root="./data", train=True, download=True, transform=transform_train) val_set = CIFAR100(root="./data", train=True, download=True, transform=transform_val) num_classes = np.unique(train_set.targets).shape[0] # set stratified train/val split idx = list(range(len(train_set.targets))) train_idx, val_idx, _, _ = train_test_split( idx, train_set.targets, test_size=opt.val_split, random_state=42) # get train/val samplers train_sampler = SubsetRandomSampler(train_idx) val_sampler = SubsetRandomSampler(val_idx) # get train/val dataloaders train_loader = DataLoader(train_set, sampler=train_sampler, batch_size=opt.batch_size, num_workers=opt.num_workers) val_loader = DataLoader(val_set, sampler=val_sampler, batch_size=opt.batch_size, num_workers=opt.num_workers) data_loaders = {"train": train_loader, "val": val_loader} print("Dataset -- {}, Metric -- {}, Train Mode -- {}, Backbone -- {}".format(opt.dataset, opt.metric, opt.train_mode, opt.backbone)) print("Train iteration batch size: {}".format(opt.batch_size)) print("Train iterations per epoch: {}".format(len(train_loader))) # get backbone model if opt.backbone == "resnet18": model = resnet18(pretrained=False) else: model = resnet34(pretrained=False) # set metric loss function in_features = model.fc.in_features model.fc = Softmax(in_features, num_classes) model.to(device) if opt.use_gpu: model = DataParallel(model).to(device) criterion = CrossEntropyLoss() mse_criterion = MSELoss() cent_criterion = CenterLoss(num_classes, in_features, device) # set optimizer and LR scheduler if opt.optimizer == "sgd": optimizer = SGD([{"params": model.parameters()}], lr=opt.lr, weight_decay=opt.weight_decay, momentum=0.9) cent_optimizer = SGD([{"params": cent_criterion.parameters()}], lr=opt.lr, weight_decay=opt.weight_decay, momentum=0.9) else: optimizer = Adam([{"params": model.parameters()}], lr=opt.lr, weight_decay=opt.weight_decay) cent_optimizer = Adam([{"params": cent_criterion.parameters()}], lr=opt.lr, weight_decay=opt.weight_decay) if opt.scheduler == "decay": scheduler = lr_scheduler.StepLR( optimizer, step_size=opt.lr_step, gamma=opt.lr_decay) else: scheduler = lr_scheduler.ReduceLROnPlateau( optimizer, factor=0.1, patience=10) # train/val loop for epoch in range(opt.epoch): for phase in ["train", "val"]: total_examples, total_correct, total_loss = 0, 0, 0 if phase == "train": model.train() else: model.eval() start_time = time.time() for ii, data in enumerate(data_loaders[phase]): # load data batch to device images, labels = data images = images.to(device) labels = labels.to(device).long() # perform adversarial attack update to images if opt.train_mode == "at" or opt.train_mode == "alp": adv_images = pgd( model, images, labels, 8. / 255, 2. 
/ 255, 7) else: pass # at train mode if opt.train_mode == "at": # get feature embedding and logits from resnet features, predictions = model(images, labels) adv_features, adv_predictions = model(adv_images, labels) # get center loss cent_loss = cent_criterion(features, labels) cent_loss = cent_loss + \ cent_criterion(adv_features, labels) # get feature norm loss norm = features.mm(features.t()).diag() adv_norm = adv_features.mm(adv_features.t()).diag() norm_loss = (torch.sum(norm) + torch.sum(adv_norm)) / \ (features.size(0) + adv_features.size(0)) # get cross-entropy loss ce_loss = criterion(predictions, labels) ce_loss = ce_loss + criterion(adv_predictions, labels) # combine cross-entropy loss, center loss and feature norm loss using lambda weights loss = ce_loss + lambda_loss[0] * \ cent_loss + lambda_loss[1] * norm_loss optimizer.zero_grad() cent_optimizer.zero_grad() # for result accumulation predictions = adv_predictions # alp train mode elif opt.train_mode == "alp": # get feature embedding and logits from resnet features, predictions = model(images, labels) adv_features, adv_predictions = model(adv_images, labels) # get center loss cent_loss = cent_criterion(features, labels) cent_loss = cent_loss + \ cent_criterion(adv_features, labels) # get feature norm loss norm = features.mm(features.t()).diag() adv_norm = adv_features.mm(adv_features.t()).diag() norm_loss = (torch.sum(norm) + torch.sum(adv_norm)) / \ (features.size(0) + adv_features.size(0)) # get cross-entropy loss ce_loss = criterion(predictions, labels) ce_loss = ce_loss + criterion(adv_predictions, labels) # get alp loss alp_loss = mse_criterion(adv_predictions, predictions) # combine cross-entropy loss, center loss and feature norm loss using lambda weights loss = ce_loss + lambda_loss[0] * \ cent_loss + lambda_loss[1] * norm_loss # combine loss with alp loss loss = loss + alp_lambda * alp_loss optimizer.zero_grad() cent_optimizer.zero_grad() # for result accumulation predictions = adv_predictions # clean train mode else: # get feature embedding and logits from resnet features, predictions = model(images, labels) # get center loss cent_loss = cent_criterion(features, labels) # get feature norm loss norm = features.mm(features.t()).diag() norm_loss = torch.sum(norm) / features.size(0) # get cross-entropy loss ce_loss = criterion(predictions, labels) # combine cross-entropy loss, center loss and feature norm loss using lambda weights loss = ce_loss + lambda_loss[0] * \ cent_loss + lambda_loss[1] * norm_loss optimizer.zero_grad() cent_optimizer.zero_grad() # only take step if in train phase if phase == "train": loss.backward() optimizer.step() cent_optimizer.step() # accumulate train or val results predictions = torch.argmax(predictions, 1) total_examples += predictions.size(0) total_correct += predictions.eq(labels).sum().item() total_loss += loss.item() # print accumulated train/val results at end of epoch if ii == len(data_loaders[phase]) - 1: end_time = time.time() acc = total_correct / total_examples loss = total_loss / len(data_loaders[phase]) print("{}: Epoch -- {} Loss -- {:.6f} Acc -- {:.6f} Time -- {:.6f}sec".format( phase, epoch, loss, acc, end_time - start_time)) if phase == "train": loss = total_loss / len(data_loaders[phase]) scheduler.step(loss) else: print("") # save model after training for opt.epoch save_model(model, opt.dataset, opt.metric, opt.train_mode, opt.backbone)
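# --- Sketch: a PGD attack with the same argument order as pgd(...) above ---
# A hedged illustration only: the actual `pgd` helper used by the training
# script may differ (random start, input normalization, loss choice), and the
# model there takes labels and returns (features, logits); this sketch assumes
# a plain model(x) -> logits interface for brevity.
import torch
import torch.nn as nn

def pgd(model, images, labels, eps, alpha, iters):
    criterion = nn.CrossEntropyLoss()
    adv = images.clone().detach()
    for _ in range(iters):
        adv.requires_grad_(True)
        loss = criterion(model(adv), labels)
        grad = torch.autograd.grad(loss, adv)[0]
        adv = adv.detach() + alpha * grad.sign()              # gradient ascent step
        adv = images + torch.clamp(adv - images, -eps, eps)   # project to L-inf ball
        adv = torch.clamp(adv, 0.0, 1.0).detach()             # keep a valid image
    return adv

# usage, matching the call in the loop above (eps=8/255, step=2/255, 7 iterations):
# adv_images = pgd(model, images, labels, 8. / 255, 2. / 255, 7)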
class AR1(BaseStrategy): """ The AR1 strategy with Latent Replay. This implementations allows for the use of both Synaptic Intelligence and Latent Replay to protect the lower level of the model from forgetting. While the original papers show how to use those two techniques in a mutual exclusive way, this implementation allows for the use of both of them concurrently. This behaviour is controlled by passing proper constructor arguments). """ def __init__(self, criterion=None, lr: float = 0.001, momentum=0.9, l2=0.0005, train_epochs: int = 4, init_update_rate: float = 0.01, inc_update_rate=0.00005, max_r_max=1.25, max_d_max=0.5, inc_step=4.1e-05, rm_sz: int = 1500, freeze_below_layer: str = "lat_features.19.bn.beta", latent_layer_num: int = 19, ewc_lambda: float = 0, train_mb_size: int = 128, eval_mb_size: int = 128, device=None, plugins: Optional[Sequence[StrategyPlugin]] = None, evaluator: EvaluationPlugin = default_logger, eval_every=-1): """ Creates an instance of the AR1 strategy. :param criterion: The loss criterion to use. Defaults to None, in which case the cross entropy loss is used. :param lr: The learning rate (SGD optimizer). :param momentum: The momentum (SGD optimizer). :param l2: The L2 penalty used for weight decay. :param train_epochs: The number of training epochs. Defaults to 4. :param init_update_rate: The initial update rate of BatchReNorm layers. :param inc_update_rate: The incremental update rate of BatchReNorm layers. :param max_r_max: The maximum r value of BatchReNorm layers. :param max_d_max: The maximum d value of BatchReNorm layers. :param inc_step: The incremental step of r and d values of BatchReNorm layers. :param rm_sz: The size of the replay buffer. The replay buffer is shared across classes. Defaults to 1500. :param freeze_below_layer: A string describing the name of the layer to use while freezing the lower (nearest to the input) part of the model. The given layer is not frozen (exclusive). :param latent_layer_num: The number of the layer to use as the Latent Replay Layer. Usually this is the same of `freeze_below_layer`. :param ewc_lambda: The Synaptic Intelligence lambda term. Defaults to 0, which means that the Synaptic Intelligence regularization will not be applied. :param train_mb_size: The train minibatch size. Defaults to 128. :param eval_mb_size: The eval minibatch size. Defaults to 128. :param device: The device to use. Defaults to None (cpu). :param plugins: (optional) list of StrategyPlugins. :param evaluator: (optional) instance of EvaluationPlugin for logging and metric computations. :param eval_every: the frequency of the calls to `eval` inside the training loop. if -1: no evaluation during training. if 0: calls `eval` after the final epoch of each training experience. if >0: calls `eval` every `eval_every` epochs and at the end of all the epochs for a single experience. """ warnings.warn("The AR1 strategy implementation is in an alpha stage " "and is not perfectly aligned with the paper " "implementation. Please use at your own risk!") if plugins is None: plugins = [] # Model setup model = MobilenetV1(pretrained=True, latent_layer_num=latent_layer_num) replace_bn_with_brn(model, momentum=init_update_rate, r_d_max_inc_step=inc_step, max_r_max=max_r_max, max_d_max=max_d_max) fc_name, fc_layer = get_last_fc_layer(model) if ewc_lambda != 0: # Synaptic Intelligence is not applied to the last fully # connected layer (and implicitly to "freeze below" ones. 
plugins.append( SynapticIntelligencePlugin(ewc_lambda, excluded_parameters=[fc_name])) self.cwr_plugin = CWRStarPlugin(model, cwr_layer_name=fc_name, freeze_remaining_model=False) plugins.append(self.cwr_plugin) optimizer = SGD(model.parameters(), lr=lr, momentum=momentum, weight_decay=l2) if criterion is None: criterion = CrossEntropyLoss() self.ewc_lambda = ewc_lambda self.freeze_below_layer = freeze_below_layer self.rm_sz = rm_sz self.inc_update_rate = inc_update_rate self.max_r_max = max_r_max self.max_d_max = max_d_max self.lr = lr self.momentum = momentum self.l2 = l2 self.rm = None self.cur_acts: Optional[Tensor] = None self.replay_mb_size = 0 super().__init__(model, optimizer, criterion, train_mb_size=train_mb_size, train_epochs=train_epochs, eval_mb_size=eval_mb_size, device=device, plugins=plugins, evaluator=evaluator, eval_every=eval_every) def before_training_exp(self, **kwargs): self.model.eval() self.model.end_features.train() self.model.output.train() if self.training_exp_counter > 0: # In AR1 batch 0 is treated differently as the feature extractor is # left more free to learn. # This if is executed for batch > 0, in which we freeze layers # below "self.freeze_below_layer" (which usually is the latent # replay layer!) and we also change the parameters of BatchReNorm # layers to a more conservative configuration. # "freeze_up_to" will freeze layers below "freeze_below_layer" # Beware that Batch ReNorm layers are not frozen! freeze_up_to(self.model, freeze_until_layer=self.freeze_below_layer, layer_filter=AR1.filter_bn_and_brn) # Adapt the parameters of BatchReNorm layers change_brn_pars(self.model, momentum=self.inc_update_rate, r_d_max_inc_step=0, r_max=self.max_r_max, d_max=self.max_d_max) # Adapt the model and optimizer self.model = self.model.to(self.device) self.optimizer = SGD(self.model.parameters(), lr=self.lr, momentum=self.momentum, weight_decay=self.l2) # super()... will run S.I. and CWR* plugin callbacks super().before_training_exp(**kwargs) # Update cur_j of CWR* to consider latent patterns if self.training_exp_counter > 0: for class_id, count in examples_per_class(self.rm[1]).items(): self.model.cur_j[class_id] += count self.cwr_plugin.cur_class = [ cls for cls in set(self.model.cur_j.keys()) if self.model.cur_j[cls] > 0 ] self.cwr_plugin.reset_weights(self.cwr_plugin.cur_class) def make_train_dataloader(self, num_workers=0, shuffle=True, **kwargs): """ Called after the dataset instantiation. Initialize the data loader. For AR1 a "custom" dataloader is used: instead of using `self.train_mb_size` as the batch size, the data loader batch size will be computed ad `self.train_mb_size - latent_mb_size`. `latent_mb_size` is in turn computed as: ` len(train_dataset) // ((len(train_dataset) + len(replay_buffer) // self.train_mb_size) ` so that the number of iterations required to run an epoch on the current batch is equal to the number of iterations required to run an epoch on the replay buffer. :param num_workers: number of thread workers for the data loading. :param shuffle: True if the data should be shuffled, False otherwise. """ current_batch_mb_size = self.train_mb_size if self.training_exp_counter > 0: train_patterns = len(self.adapted_dataset) current_batch_mb_size = train_patterns // ( (train_patterns + self.rm_sz) // self.train_mb_size) current_batch_mb_size = max(1, current_batch_mb_size) self.replay_mb_size = max(0, self.train_mb_size - current_batch_mb_size) # AR1 only supports SIT scenarios (no task labels). 
assert len(self.adapted_dataset.keys()) == 1 curr_data = list(self.adapted_dataset.values())[0] self.current_dataloader = DataLoader(curr_data, num_workers=num_workers, batch_size=current_batch_mb_size, shuffle=shuffle) def training_epoch(self, **kwargs): for self.mb_it, (self.mb_x, self.mb_y, _) in \ enumerate(self.current_dataloader): self.before_training_iteration(**kwargs) self.optimizer.zero_grad() self.mb_x = self.mb_x.to(self.device) self.mb_y = self.mb_y.to(self.device) if self.training_exp_counter > 0: lat_mb_x = self.rm[0][self.mb_it * self.replay_mb_size:(self.mb_it + 1) * self.replay_mb_size] lat_mb_x = lat_mb_x.to(self.device) lat_mb_y = self.rm[1][self.mb_it * self.replay_mb_size:(self.mb_it + 1) * self.replay_mb_size] lat_mb_y = lat_mb_y.to(self.device) self.mb_y = torch.cat((self.mb_y, lat_mb_y), 0) else: lat_mb_x = None # Forward pass. Here we are injecting latent patterns lat_mb_x. # lat_mb_x will be None for the very first batch (batch 0), which # means that lat_acts.shape[0] == self.mb_x[0]. self.before_forward(**kwargs) self.logits, lat_acts = self.model(self.mb_x, latent_input=lat_mb_x, return_lat_acts=True) if self.epoch == 0: # On the first epoch only: store latent activations. Those # activations will be used to update the replay buffer. lat_acts = lat_acts.detach().clone().cpu() if self.mb_it == 0: self.cur_acts = lat_acts else: self.cur_acts = torch.cat((self.cur_acts, lat_acts), 0) self.after_forward(**kwargs) # Loss & Backward # We don't need to handle latent replay, as self.mb_y already # contains both current and replay labels. self.loss = self.criterion(self.logits, self.mb_y) self.before_backward(**kwargs) self.loss.backward() self.after_backward(**kwargs) # Optimization step self.before_update(**kwargs) self.optimizer.step() self.after_update(**kwargs) self.after_training_iteration(**kwargs) def after_training_exp(self, **kwargs): h = min(self.rm_sz // (self.training_exp_counter + 1), self.cur_acts.size(0)) curr_data = self.experience.dataset idxs_cur = torch.randperm(self.cur_acts.size(0))[:h] rm_add_y = torch.tensor( [curr_data.targets[idx_cur] for idx_cur in idxs_cur]) rm_add = [self.cur_acts[idxs_cur], rm_add_y] # replace patterns in random memory if self.training_exp_counter == 0: self.rm = rm_add else: idxs_2_replace = torch.randperm(self.rm[0].size(0))[:h] for j, idx in enumerate(idxs_2_replace): idx = int(idx) self.rm[0][idx] = rm_add[0][j] self.rm[1][idx] = rm_add[1][j] self.cur_acts = None # Runs S.I. and CWR* plugin callbacks super().after_training_exp(**kwargs) @staticmethod def filter_bn_and_brn(param_def: LayerAndParameter): return not isinstance(param_def.layer, (_NormBase, BatchRenorm2D))
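# --- Sketch: the latent replay buffer update done in after_training_exp ---
# A hedged, standalone version of the buffer bookkeeping above: each new
# experience contributes h = rm_sz // (exp_counter + 1) randomly sampled latent
# activations, which overwrite randomly chosen slots of the existing buffer.
# Shapes and names below are illustrative placeholders.
import torch

rm_sz = 1500
rm = None  # (activations, labels) replay buffer

def update_replay_buffer(cur_acts, cur_labels, exp_counter):
    global rm
    h = min(rm_sz // (exp_counter + 1), cur_acts.size(0))
    idxs_cur = torch.randperm(cur_acts.size(0))[:h]
    add = [cur_acts[idxs_cur], cur_labels[idxs_cur]]
    if exp_counter == 0:
        rm = add                                        # first experience fills the buffer
    else:
        idxs_replace = torch.randperm(rm[0].size(0))[:h]
        rm[0][idxs_replace] = add[0]                    # overwrite random slots in place
        rm[1][idxs_replace] = add[1]

update_replay_buffer(torch.randn(4000, 512), torch.randint(0, 10, (4000,)), exp_counter=0)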
def train_model(model_name, model, lr=LEARNING_RATE, epochs=EPOCHS, momentum=MOMENTUM, weight_decay=0, train_loader=training_set_loader, test_loader=validation_set_loader): if not os.path.exists(RESULTS_PATH + "/" + model_name): os.makedirs(RESULTS_PATH + "/" + model_name) criterion = nn.CrossEntropyLoss() optimizer = SGD(model.parameters(), lr, momentum=momentum, weight_decay=weight_decay) loaders = {'train': train_loader, 'test': test_loader} losses = {'train': [], 'test': []} accuracies = {'train': [], 'test': []} #testing variables y_testing = [] preds = [] if USE_CUDA and cuda_available: model = model.cuda() for e in range(epochs): for mode in ['train', 'test']: if mode == 'train': model.train() else: model.eval() epoch_loss = 0 epoch_acc = 0 samples = 0 try: for i, batch in enumerate(loaders[mode]): # convert tensor to variable x = Variable(batch['image'], requires_grad=(mode == 'train')) y = Variable(batch['label']) if USE_CUDA and cuda_available: x = x.cuda() y = y.cuda() output = model(x) l = criterion(output, y) # loss if mode == 'train': l.backward() optimizer.step() optimizer.zero_grad() else: y_testing.extend(y.data.tolist()) preds.extend(output.max(1)[1].tolist()) if USE_CUDA and cuda_available: acc = accuracy_score( y.data.cuda().cpu().numpy(), output.max(1)[1].cuda().cpu().numpy()) else: acc = accuracy_score(y.data, output.max(1)[1]) epoch_loss += l.data.item() * x.shape[0] # l.data[0] epoch_acc += acc * x.shape[0] samples += x.shape[0] print ("\r[%s] Epoch %d/%d. Iteration %d/%d. Loss: %0.2f. Accuracy: %0.2f" % \ (mode, e+1, epochs, i, len(loaders[mode]), epoch_loss/samples, epoch_acc/samples)) if DEBUG and i == 2: break except Exception as err: print("\n\n######### ERROR #######") print(str(err)) print("\n\n######### batch #######") print(batch['img_name']) print("\n\n") epoch_loss /= samples epoch_acc /= samples losses[mode].append(epoch_loss) accuracies[mode].append(epoch_acc) print ("\r[%s] Epoch %d/%d. Iteration %d/%d. Loss: %0.2f. Accuracy: %0.2f" % \ (mode, e+1, epochs, i, len(loaders[mode]), epoch_loss, epoch_acc)) torch.save( model.state_dict(), str(RESULTS_PATH) + "/" + str(model_name) + "/" + str(model_name) + ".pt") return model, (losses, accuracies), y_testing, preds
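# --- Sketch: disabling autograd during the test phase ---
# A hedged note on the loop above: the test-phase Variables are created with
# requires_grad=False, but the forward pass still builds a graph with respect to
# the model parameters because nothing disables autograd. Wrapping each phase in
# torch.set_grad_enabled(mode == 'train') avoids that overhead. Everything below
# is a placeholder, not the original model or loaders.
import torch
import torch.nn as nn

model = nn.Linear(10, 2)
criterion = nn.CrossEntropyLoss()
optimizer = torch.optim.SGD(model.parameters(), lr=0.01, momentum=0.99)
loaders = {'train': [(torch.randn(4, 10), torch.randint(0, 2, (4,)))],
           'test':  [(torch.randn(4, 10), torch.randint(0, 2, (4,)))]}

for mode in ['train', 'test']:
    model.train() if mode == 'train' else model.eval()
    with torch.set_grad_enabled(mode == 'train'):
        for x, y in loaders[mode]:
            output = model(x)
            loss = criterion(output, y)
            if mode == 'train':
                loss.backward()
                optimizer.step()
                optimizer.zero_grad()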
batch_loader.chars_vocab_size) neg_loss = NEG_loss(params.word_vocab_size, params.word_embed_size) if args.use_cuda: neg_loss = neg_loss.cuda() # NEG_loss is defined over two embedding matrices with shape [params.word_vocab_size, params.word_embed_size] optimizer = SGD(neg_loss.parameters(), 0.1) for iteration in range(args.num_iterations): input_idx, target_idx = batch_loader.next_embedding_seq(args.batch_size) input = Variable(t.from_numpy(input_idx).long()) target = Variable(t.from_numpy(target_idx).long()) if args.use_cuda: input, target = input.cuda(), target.cuda() out = neg_loss(input, target, args.num_sample).mean() optimizer.zero_grad() out.backward() optimizer.step() if iteration % 500 == 0: out = out.cpu().data.numpy()[0] print('iteration = {}, loss = {}'.format(iteration, out)) word_embeddings = neg_loss.input_embeddings() np.save('data/word_embeddings.npy', word_embeddings)
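# --- Sketch: the skip-gram negative-sampling (NEG) objective ---
# A hedged sketch of what the NEG_loss module above optimizes: two embedding
# tables (input and output), a positive (word, context) pair scored against
# num_sample randomly drawn negatives. This is the standard NEG formulation,
# not the exact implementation inside NEG_loss; all sizes are placeholders.
import torch
import torch.nn as nn
import torch.nn.functional as F

vocab_size, embed_size, num_sample = 1000, 64, 5
in_embed = nn.Embedding(vocab_size, embed_size)
out_embed = nn.Embedding(vocab_size, embed_size)

word = torch.randint(0, vocab_size, (32,))          # centre words
context = torch.randint(0, vocab_size, (32,))       # observed context words
negatives = torch.randint(0, vocab_size, (32, num_sample))

u = in_embed(word)                                  # [B, D]
v_pos = out_embed(context)                          # [B, D]
v_neg = out_embed(negatives)                        # [B, K, D]

pos_score = torch.sum(u * v_pos, dim=1)                    # [B]
neg_score = torch.bmm(v_neg, u.unsqueeze(2)).squeeze(2)    # [B, K]
loss = -(F.logsigmoid(pos_score) + F.logsigmoid(-neg_score).sum(dim=1)).mean()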
def train(train_source_iter: ForeverDataIterator, train_target_iter: ForeverDataIterator, classifier: ImageClassifier, mdd: MarginDisparityDiscrepancy, optimizer: SGD, lr_scheduler: LambdaLR, epoch: int, args: argparse.Namespace): batch_time = AverageMeter('Time', ':3.1f') data_time = AverageMeter('Data', ':3.1f') losses = AverageMeter('Loss', ':3.2f') trans_losses = AverageMeter('Trans Loss', ':3.2f') cls_accs = AverageMeter('Cls Acc', ':3.1f') tgt_accs = AverageMeter('Tgt Acc', ':3.1f') progress = ProgressMeter( args.iters_per_epoch, [batch_time, data_time, losses, trans_losses, cls_accs, tgt_accs], prefix="Epoch: [{}]".format(epoch)) # switch to train mode classifier.train() mdd.train() criterion = nn.CrossEntropyLoss().to(device) end = time.time() for i in range(args.iters_per_epoch): optimizer.zero_grad() x_s, labels_s = next(train_source_iter) x_t, labels_t = next(train_target_iter) x_s = x_s.to(device) x_t = x_t.to(device) labels_s = labels_s.to(device) labels_t = labels_t.to(device) # measure data loading time data_time.update(time.time() - end) # compute output x = torch.cat((x_s, x_t), dim=0) outputs, outputs_adv = classifier(x) y_s, y_t = outputs.chunk(2, dim=0) y_s_adv, y_t_adv = outputs_adv.chunk(2, dim=0) # compute cross entropy loss on source domain cls_loss = criterion(y_s, labels_s) # compute margin disparity discrepancy between domains # for adversarial classifier, minimize negative mdd is equal to maximize mdd transfer_loss = -mdd(y_s, y_s_adv, y_t, y_t_adv) loss = cls_loss + transfer_loss * args.trade_off classifier.step() cls_acc = accuracy(y_s, labels_s)[0] tgt_acc = accuracy(y_t, labels_t)[0] losses.update(loss.item(), x_s.size(0)) cls_accs.update(cls_acc.item(), x_s.size(0)) tgt_accs.update(tgt_acc.item(), x_t.size(0)) trans_losses.update(transfer_loss.item(), x_s.size(0)) # compute gradient and do SGD step loss.backward() optimizer.step() lr_scheduler.step() # measure elapsed time batch_time.update(time.time() - end) end = time.time() if i % args.print_freq == 0: progress.display(i)
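# --- Sketch: concatenate source and target, forward once, then chunk ---
# A hedged illustration of the pattern used in the MDD loop above: both domains
# are stacked into one batch so a single forward pass (with shared BatchNorm
# statistics) produces source and target outputs, which are split back with
# chunk(2, dim=0). The linear layer below is a stand-in, not the script's
# ImageClassifier.
import torch
import torch.nn as nn

classifier = nn.Linear(128, 10)
x_s, x_t = torch.randn(32, 128), torch.randn(32, 128)

x = torch.cat((x_s, x_t), dim=0)      # one batch containing both domains
outputs = classifier(x)
y_s, y_t = outputs.chunk(2, dim=0)    # split back into source / target logits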
def train(**kwargs): opt.parse(kwargs) images, tags, labels = load_data(opt.data_path) pretrain_model = load_pretrain_model(opt.pretrain_model_path) y_dim = tags.shape[1] X, Y, L = split_data(images, tags, labels) print('...loading and splitting data finish') img_model = ImgModule(opt.bit, pretrain_model) txt_model = TxtModule(y_dim, opt.bit) if opt.use_gpu: img_model = img_model.cuda() txt_model = txt_model.cuda() train_L = torch.from_numpy(L['train']) train_x = torch.from_numpy(X['train']) train_y = torch.from_numpy(Y['train']) query_L = torch.from_numpy(L['query']) query_x = torch.from_numpy(X['query']) query_y = torch.from_numpy(Y['query']) retrieval_L = torch.from_numpy(L['retrieval']) retrieval_x = torch.from_numpy(X['retrieval']) retrieval_y = torch.from_numpy(Y['retrieval']) num_train = train_x.shape[0] F_buffer = torch.randn(num_train, opt.bit) G_buffer = torch.randn(num_train, opt.bit) if opt.use_gpu: train_L = train_L.cuda() F_buffer = F_buffer.cuda() G_buffer = G_buffer.cuda() Sim = calc_neighbor(train_L, train_L) B = torch.sign(F_buffer + G_buffer) batch_size = opt.batch_size lr = opt.lr optimizer_img = SGD(img_model.parameters(), lr=lr) optimizer_txt = SGD(txt_model.parameters(), lr=lr) learning_rate = np.linspace(opt.lr, np.power(10, -6.), opt.max_epoch + 1) result = {'loss': []} ones = torch.ones(batch_size, 1) ones_ = torch.ones(num_train - batch_size, 1) unupdated_size = num_train - batch_size max_mapi2t = max_mapt2i = 0. for epoch in range(opt.max_epoch): # train image net for i in tqdm(range(num_train // batch_size)): index = np.random.permutation(num_train) ind = index[0:batch_size] unupdated_ind = np.setdiff1d(range(num_train), ind) sample_L = Variable(train_L[ind, :]) image = Variable(train_x[ind].type(torch.float)) if opt.use_gpu: image = image.cuda() sample_L = sample_L.cuda() ones = ones.cuda() ones_ = ones_.cuda() # similar matrix size: (batch_size, num_train) S = calc_neighbor(sample_L, train_L) # S: (batch_size, num_train) cur_f = img_model(image) # cur_f: (batch_size, bit) F_buffer[ind, :] = cur_f.data F = Variable(F_buffer) G = Variable(G_buffer) theta_x = 1.0 / 2 * torch.matmul(cur_f, G.t()) logloss_x = -torch.sum(S * theta_x - torch.log(1.0 + torch.exp(theta_x))) quantization_x = torch.sum(torch.pow(B[ind, :] - cur_f, 2)) balance_x = torch.sum( torch.pow(cur_f.t().mm(ones) + F[unupdated_ind].t().mm(ones_), 2)) loss_x = logloss_x + opt.gamma * quantization_x + opt.eta * balance_x loss_x /= (batch_size * num_train) optimizer_img.zero_grad() loss_x.backward() optimizer_img.step() # train txt net for i in tqdm(range(num_train // batch_size)): index = np.random.permutation(num_train) ind = index[0:batch_size] unupdated_ind = np.setdiff1d(range(num_train), ind) sample_L = Variable(train_L[ind, :]) text = train_y[ind, :].unsqueeze(1).unsqueeze(-1).type(torch.float) text = Variable(text) if opt.use_gpu: text = text.cuda() sample_L = sample_L.cuda() # similar matrix size: (batch_size, num_train) S = calc_neighbor(sample_L, train_L) # S: (batch_size, num_train) cur_g = txt_model(text) # cur_f: (batch_size, bit) G_buffer[ind, :] = cur_g.data F = Variable(F_buffer) G = Variable(G_buffer) # calculate loss # theta_y: (batch_size, num_train) theta_y = 1.0 / 2 * torch.matmul(cur_g, F.t()) logloss_y = -torch.sum(S * theta_y - torch.log(1.0 + torch.exp(theta_y))) quantization_y = torch.sum(torch.pow(B[ind, :] - cur_g, 2)) balance_y = torch.sum( torch.pow(cur_g.t().mm(ones) + G[unupdated_ind].t().mm(ones_), 2)) loss_y = logloss_y + opt.gamma * quantization_y + opt.eta * 
balance_y loss_y /= (num_train * batch_size) optimizer_txt.zero_grad() loss_y.backward() optimizer_txt.step() # update B B = torch.sign(F_buffer + G_buffer) # calculate total loss loss = calc_loss(B, F, G, Variable(Sim), opt.gamma, opt.eta) print('...epoch: %3d, loss: %3.3f, lr: %f' % (epoch + 1, loss.data, lr)) result['loss'].append(float(loss.data)) if opt.valid: mapi2t, mapt2i = valid(img_model, txt_model, query_x, retrieval_x, query_y, retrieval_y, query_L, retrieval_L) print( '...epoch: %3d, valid MAP: MAP(i->t): %3.4f, MAP(t->i): %3.4f' % (epoch + 1, mapi2t, mapt2i)) if mapt2i >= max_mapt2i and mapi2t >= max_mapi2t: max_mapi2t = mapi2t max_mapt2i = mapt2i img_model.save(img_model.module_name + '.pth') txt_model.save(txt_model.module_name + '.pth') lr = learning_rate[epoch + 1] # set learning rate for param in optimizer_img.param_groups: param['lr'] = lr for param in optimizer_txt.param_groups: param['lr'] = lr print('...training procedure finish') if opt.valid: print(' max MAP: MAP(i->t): %3.4f, MAP(t->i): %3.4f' % (max_mapi2t, max_mapt2i)) result['mapi2t'] = max_mapi2t result['mapt2i'] = max_mapt2i else: mapi2t, mapt2i = valid(img_model, txt_model, query_x, retrieval_x, query_y, retrieval_y, query_L, retrieval_L) print(' max MAP: MAP(i->t): %3.4f, MAP(t->i): %3.4f' % (mapi2t, mapt2i)) result['mapi2t'] = mapi2t result['mapt2i'] = mapt2i write_result(result)
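# --- Sketch: a numerically stable form of the pairwise log-likelihood term ---
# Both hashing losses above use torch.log(1.0 + torch.exp(theta)), which can
# overflow for large theta; F.softplus(theta) computes the same quantity
# stably, so the log-loss term can be rewritten as below. S and theta are
# placeholders with the shapes used in the snippet (batch_size x num_train).
import torch
import torch.nn.functional as F

S = (torch.rand(16, 128) > 0.5).float()   # similarity matrix
theta = torch.randn(16, 128) * 10         # 0.5 * <cur_f, G^T>, possibly large

logloss_naive = -torch.sum(S * theta - torch.log(1.0 + torch.exp(theta)))
logloss_stable = -torch.sum(S * theta - F.softplus(theta))  # same value, no overflow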
def train(model, state, path, annotations, val_path, val_annotations, resize, max_size, jitter, batch_size, iterations, val_iterations, mixed_precision, lr, warmup, milestones, gamma, is_master=True, world=1, use_dali=True, verbose=True, metrics_url=None, logdir=None): 'Train the model on the given dataset' print("This is train.py, lr = ", lr) # Prepare model nn_model = model stride = model.stride model = convert_fixedbn_model(model) if torch.cuda.is_available(): model = model.cuda() # Setup optimizer and schedule optimizer = SGD(model.parameters(), lr=lr, weight_decay=0.0001, momentum=0.9) model, optimizer = amp.initialize(model, optimizer, opt_level = 'O2' if mixed_precision else 'O0', keep_batchnorm_fp32 = True, loss_scale = 128.0, verbosity = is_master) print("This is train.py/train, optimizer param_groups, before: ") print(optimizer.state_dict()['param_groups']) if world > 1: model = DistributedDataParallel(model) model.train() if 'optimizer' in state: #print("This is state['optimizer']") #print(state['optimizer']) optimizer.load_state_dict(state['optimizer']) for g in optimizer.param_groups: g['lr'] = lr g['initial_lr'] = lr print("This is train.py/train, optimizer param_groups, after: ") print(optimizer.state_dict()['param_groups']) #print(optimizer.param_groups) def schedule(train_iter): if warmup and train_iter <= warmup: return 0.9 * train_iter / warmup + 0.1 return gamma ** len([m for m in milestones if m <= train_iter]) scheduler = LambdaLR(optimizer, schedule) # Prepare dataset if verbose: print('Preparing dataset...') data_iterator = (DaliDataIterator if use_dali else DataIterator)( path, jitter, max_size, batch_size, stride, world, annotations, training=True) if verbose: print(data_iterator) if verbose: print(' device: {} {}'.format( world, 'cpu' if not torch.cuda.is_available() else 'gpu' if world == 1 else 'gpus')) print(' batch: {}, precision: {}'.format(batch_size, 'mixed' if mixed_precision else 'full')) print('Training model for {} iterations...'.format(iterations)) # Create TensorBoard writer if logdir is not None: from tensorboardX import SummaryWriter if is_master and verbose: print('Writing TensorBoard logs to: {}'.format(logdir)) writer = SummaryWriter(logdir=logdir) profiler = Profiler(['train', 'fw', 'bw']) iteration = state.get('iteration', 0) while iteration < iterations: cls_losses, box_losses = [], [] for i, (data, target) in enumerate(data_iterator): scheduler.step(iteration) # Forward pass profiler.start('fw') optimizer.zero_grad() cls_loss, box_loss = model([data, target]) del data profiler.stop('fw') # Backward pass profiler.start('bw') with amp.scale_loss(cls_loss + box_loss, optimizer) as scaled_loss: scaled_loss.backward() optimizer.step() # Reduce all losses cls_loss, box_loss = cls_loss.mean().clone(), box_loss.mean().clone() if world > 1: torch.distributed.all_reduce(cls_loss) torch.distributed.all_reduce(box_loss) cls_loss /= world box_loss /= world if is_master: cls_losses.append(cls_loss) box_losses.append(box_loss) if is_master and not isfinite(cls_loss + box_loss): raise RuntimeError('Loss is diverging!\n{}'.format( 'Try lowering the learning rate.')) del cls_loss, box_loss profiler.stop('bw') iteration += 1 profiler.bump('train') if is_master and (profiler.totals['train'] > 60 or iteration == iterations): focal_loss = torch.stack(list(cls_losses)).mean().item() box_loss = torch.stack(list(box_losses)).mean().item() learning_rate = optimizer.param_groups[0]['lr'] if verbose: msg = '[{:{len}}/{}]'.format(iteration, iterations, 
len=len(str(iterations))) msg += ' focal loss: {:.3f}'.format(focal_loss) msg += ', box loss: {:.3f}'.format(box_loss) msg += ', {:.3f}s/{}-batch'.format(profiler.means['train'], batch_size) msg += ' (fw: {:.3f}s, bw: {:.3f}s)'.format(profiler.means['fw'], profiler.means['bw']) msg += ', {:.1f} im/s'.format(batch_size / profiler.means['train']) msg += ', lr: {:.2g}'.format(learning_rate) print(msg, flush=True) if logdir is not None: writer.add_scalar('focal_loss', focal_loss, iteration) writer.add_scalar('box_loss', box_loss, iteration) writer.add_scalar('learning_rate', learning_rate, iteration) del box_loss, focal_loss if metrics_url: post_metrics(metrics_url, { 'focal loss': mean(cls_losses), 'box loss': mean(box_losses), 'im_s': batch_size / profiler.means['train'], 'lr': learning_rate }) # Save model weights state.update({ 'iteration': iteration, 'optimizer': optimizer.state_dict(), 'scheduler': scheduler.state_dict(), }) with ignore_sigint(): nn_model.save(state) profiler.reset() del cls_losses[:], box_losses[:] if val_annotations and (iteration == iterations or iteration % val_iterations == 0): infer(model, val_path, None, resize, max_size, batch_size, annotations=val_annotations, mixed_precision=mixed_precision, is_master=is_master, world=world, use_dali=use_dali, is_validation=True, verbose=False) model.train() if iteration == iterations: break if logdir is not None: writer.close()
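# --- Sketch: the warm-up + milestone schedule as a LambdaLR multiplier ---
# A hedged, standalone version of the schedule() closure defined in the trainer
# above: a linear ramp from 0.1 to 1.0 during warm-up, then a gamma decay at
# each milestone, stepped once per iteration. Model and values are placeholders.
import torch
from torch.optim import SGD
from torch.optim.lr_scheduler import LambdaLR

warmup, milestones, gamma = 1000, [6000, 8000], 0.1
model = torch.nn.Linear(4, 2)
optimizer = SGD(model.parameters(), lr=0.01, momentum=0.9, weight_decay=0.0001)

def schedule(train_iter):
    if warmup and train_iter <= warmup:
        return 0.9 * train_iter / warmup + 0.1
    return gamma ** len([m for m in milestones if m <= train_iter])

scheduler = LambdaLR(optimizer, schedule)
for it in range(3):
    optimizer.step()
    scheduler.step()   # per-iteration stepping, as in the loop above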