def generate_inverted_image_specific_layer(self, input_image, img_size, target_layer=3):
    # Generate a random image which we will optimize
    opt_img = Variable(1e-1 * torch.randn(1, 3, img_size, img_size), requires_grad=True)
    # Define optimizer for previously created image
    optimizer = SGD([opt_img], lr=1e4, momentum=0.9)
    # Get the output from the model after a forward pass until target_layer
    # with the input image (real image, NOT the randomly generated one)
    input_image_layer_output = \
        self.get_output_from_specific_layer(input_image, target_layer)

    # Alpha regularization parameters
    # Parameter alpha, which is actually the sixth norm
    alpha_reg_alpha = 6
    # The multiplier, lambda alpha
    alpha_reg_lambda = 1e-7

    # Total variation regularization parameters
    # Parameter beta, which is actually the second norm
    tv_reg_beta = 2
    # The multiplier, lambda beta
    tv_reg_lambda = 1e-8

    for i in range(201):
        optimizer.zero_grad()
        # Get the output from the model after a forward pass until target_layer
        # with the generated image (randomly generated one, NOT the real image)
        output = self.get_output_from_specific_layer(opt_img, target_layer)
        # Calculate Euclidean loss
        euc_loss = 1e-1 * self.euclidian_loss(input_image_layer_output.detach(), output)
        # Calculate alpha regularization
        reg_alpha = alpha_reg_lambda * self.alpha_norm(opt_img, alpha_reg_alpha)
        # Calculate total variation regularization
        reg_total_variation = tv_reg_lambda * self.total_variation_norm(opt_img, tv_reg_beta)
        # Sum all to optimize
        loss = euc_loss + reg_alpha + reg_total_variation
        # Step
        loss.backward()
        optimizer.step()
        # Save the generated image every 5 iterations
        if i % 5 == 0:
            print('Iteration:', str(i), 'Loss:', loss.data.numpy()[0])
            x = recreate_image(opt_img)
            cv2.imwrite('../generated/Inv_Image_Layer_' + str(target_layer) +
                        '_Iteration_' + str(i) + '.jpg', x)
        # Reduce learning rate every 40 iterations
        if i % 40 == 0:
            for param_group in optimizer.param_groups:
                param_group['lr'] *= 1 / 10
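# The method above relies on three helpers of the same class that are not shown in
# this snippet: euclidian_loss, alpha_norm and total_variation_norm. The sketches
# below are assumptions about their shape, following the regularizers from
# Mahendran & Vedaldi, "Understanding Deep Image Representations by Inverting Them";
# the original implementations may differ in detail.
def alpha_norm(self, input_matrix, alpha):
    # Sum of |x|^alpha over every value of the image tensor.
    return input_matrix.view(-1).abs().pow(alpha).sum()

def total_variation_norm(self, input_matrix, beta):
    # Penalize differences between neighbouring pixels; input_matrix is (1, C, H, W).
    img = input_matrix[0]
    to_check = img[:, :-1, :-1]
    one_bottom = img[:, 1:, :-1]
    one_right = img[:, :-1, 1:]
    return (((to_check - one_bottom) ** 2 +
             (to_check - one_right) ** 2) ** (beta / 2)).sum()

def euclidian_loss(self, org_matrix, target_matrix):
    # Squared Euclidean distance between feature maps, normalized by the reference's norm.
    distance = target_matrix - org_matrix
    return distance.pow(2).sum() / org_matrix.pow(2).sum()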
state = state.cuda()
hx = hx.cuda()
cx = cx.cuda()
score = score.cuda()

last_state = torch.squeeze(last_state, 0)
last_hx = torch.squeeze(last_hx, 0)
last_cx = torch.squeeze(last_cx, 0)
cmd = torch.squeeze(cmd, 0)
last_score = torch.squeeze(last_score, 0)
state = torch.squeeze(state, 0)
hx = torch.squeeze(hx, 0)
cx = torch.squeeze(cx, 0)
score = torch.squeeze(score, 0)

optim.zero_grad()
model_pred.load_state_dict(model.state_dict())
last_score_pred, cmd_pred, _ = model(last_state, last_hx, last_cx)
score_pred, _, _ = model_pred(state, hx, cx)
r = score - last_score
r_pred = score_pred - last_score_pred
loss = criterion((cmd_pred, r_pred), (cmd, r), score)
loss.backward()

if (i + 1) % 100 == 0:
    print('Iter:%d | loss:%.4f' % (i + 1, loss.item()))
if (i + 1) % 1000 == 0:
def train(self, epochs): self.cm = Cluster_Model(self.encoder.module) self.cm = nn.DataParallel(self.cm) assert self.encoder.module == self.cm.module.encoder optimizer = SGD(self.cm.parameters(), lr=config.cluster_model_train_lr, momentum=0.9) # optimizer = Adam(params=self.dec.parameters()) data_iterator = tqdm( self.dataloader, leave='True', unit='batch', postfix={ 'epoch': -1, # 'acc': '%.4f' % 0.0, 'loss': '%.6f' % 0.0, 'dlb': '%.4f' % 0.0, }) km = KMeans(n_clusters=self.n_components, n_init=max(20, self.n_hidden_features), n_jobs=-1) self.cm.train() self.cm.to(config.device) features = [] actual = [] for index, batch in enumerate(data_iterator): if ((isinstance(batch, tuple) or isinstance(batch, list)) and len(batch) == 2): batch, value = batch actual.append(value) batch = batch.cuda(non_blocking=True) features.append(self.cm.module.encoder(batch).detach().cpu()) actual = torch.cat(actual).long() predicted = km.fit_predict(torch.cat(features).numpy()) predicted_previous = torch.tensor(np.copy(predicted), dtype=torch.long) cluster_centers = torch.tensor(km.cluster_centers_, dtype=torch.float, requires_grad=True) cluster_centers = cluster_centers.cuda(non_blocking=True) with torch.no_grad(): self.cm.module.state_dict()['assignment.cluster_centers'].copy_( cluster_centers) loss_function = nn.KLDivLoss(size_average=False) delta_label = None for epoch in range(epochs): features = [] data_iterator = tqdm(self.dataloader, leave='True', unit='batch', postfix={ 'epoch': epoch, 'loss': '%.8f' % 0.0, 'dlb': '%.4f' % (delta_label or 0.0) }) self.cm.train() for index, batch in enumerate(data_iterator): if ((isinstance(batch, tuple) or isinstance(batch, list)) and len(batch) == 2): batch, _ = batch batch = batch.cuda(non_blocking=True) output = self.cm(batch) target = target_distribution(output).detach() loss = loss_function(output.log(), target) / output.shape[0] data_iterator.set_postfix(epoch=epoch, loss='%.8f' % float(loss.item()), dlb='%.4f' % (delta_label or 0.0)) optimizer.zero_grad() loss.backward() optimizer.step(closure=None) features.append(self.cm.module.encoder(batch).detach().cpu()) if index % 10 == 0: # update_freq = 10 loss_value = float(loss.item()) data_iterator.set_postfix( epoch=epoch, loss='%.8f' % loss_value, dlb='%.4f' % (delta_label or 0.0), ) predicted, actual = self.predict() delta_label = float( (predicted != predicted_previous ).float().sum().item()) / predicted_previous.shape[0] if self.stopping_delta is not None and delta_label < self.stopping_delta: print( 'Early stopping as label delta "%1.5f" less tahn "%1.5f".' % (delta_label, self.stopping_delta)) break predicted_previous = predicted if (config.plot_clustering): self.plot_train(self.cm, self.n_components, epoch) self.encoder = self.cm.module.encoder print("training dec ended.")
def train_val_model(model: nn.Module, dataloaders: dict, criterion: nn.CrossEntropyLoss, optimizer: optim.SGD, num_epochs=num_epochs, is_inception=False): ''' :param model: :param dataloaders: dict ,包括了train和val两个dataloader :param criterion: :param optimizer: :param num_epochs: :param is_inception: :return: ''' print("************* train_and_valid begined!") since = time.time() val_acc_history = [] best_acc = 0.0 best_model_wts = copy.deepcopy(model.state_dict()) print('---Epoch train_and_valid begined') for epoch in range(num_epochs): print('Epoch {}/{}'.format(epoch, num_epochs - 1)) print('-' * 10) # Each epoch has a training and validation phase for phase in ['train', 'val']: if phase == 'train': model.train() #Set model to training mode else: model.eval() #Set model to evaluate mode running_loss = 0.0 running_corrects = 0 #Iterate over data for inputs, labels in dataloaders[phase]: inputs = inputs.to(device) labels = labels.to(device) # zero the parameter optimizer.zero_grad() with torch.set_grad_enabled( phase == 'train' ): # torch.set_grad_enabled(True) if (phase=='train') if is_inception and phase == 'train': outputs, aux_outputs = model(inputs) loss1 = criterion(outputs, labels) loss2 = criterion(aux_outputs, labels) loss = loss1 + 0.4 * loss2 else: outputs = model(inputs) loss = criterion(outputs, labels) _, preds = torch.max(outputs, 1) if phase == 'train': loss.backward() optimizer.step() # todo *inputs.size(0)? 因为:The losses are averaged across observations for each minibatch # 详见 class CrossEntropyLoss(_WeightedLoss)的定义说明 running_loss += loss.item() * inputs.size(0) running_corrects += torch.sum(preds == labels.data) epoch_loss = running_loss / len(dataloaders[phase].dataset) epoch_acc = running_corrects.double() / len( dataloaders[phase].dataset) print('{} Loss: {:.4f} Acc: {:4f}'.format(phase, epoch_loss, epoch_acc)) # deep copy the model if phase == 'val' and epoch_acc > best_acc: best_acc = epoch_acc best_model_wts = copy.deepcopy(model.state_dict()) if phase == 'val': val_acc_history.append(epoch_acc) print('---Epoch {}/{} finished!'.format(epoch, num_epochs - 1)) time_elapsed = time.time() - since print('Training complete in {:.0f}m {:.0f}s'.format( time_elapsed // 60, time_elapsed % 60)) print('Best val Acc: {:4f}'.format(best_acc)) # load best model weights model.load_state_dict(best_model_wts) print("************* train_and_valid finished!") return model, val_acc_history
def train_model(model: nn.Module, optimizer: optim.SGD, epochs: int, device: torch.device, train_dataloader: DataLoader, val_dataloader: DataLoader, logger: SummaryWriter, print_interval: int = 50): """ :param model: :param optimizer: :param epochs: :param device: :param train_dataloader: :param val_dataloader: TODO: make optional :param logger: TODO: make optional :param print_interval: TODO: add as argument to argparser :return: """ print("Training Model...") start = time.time() train_step, val_step = 0, 0 for epoch in tqdm(range(epochs)): model.train() for batch_idx, (data, target) in enumerate(train_dataloader): data, target = data.to(device), target.to(device) optimizer.zero_grad() # train step + backward step logits = model(data) loss = F.cross_entropy(logits, target) loss.backward() optimizer.step() # sgd noise step # _, sgd_noise = get_sgd_noise(model, device, optimizer, query_dataloader) # print('noise shape', sgd_noise.shape) # noise_norm = torch.norm(sgd_noise, dim=1) # alpha_hat = estimate_alpha(sgd_noise) # print('grad_norm', noise_norm.shape) # print('alpha_hat', alpha_hat) # # start train step logging here # logger.add_scalar('train/loss', loss.item(), train_step) # # TODO: how to log noise norm tensor? # logger.add_scalar('train/alpha', alpha_hat) train_step += 1 if not batch_idx % print_interval: print_train_step(epoch, epochs, batch_idx, len(train_dataloader), loss.item()) # end of train step logging model.eval() with torch.no_grad(): correct, samples = 0, 0 for batch_idx, (data, target) in enumerate(val_dataloader): data, target = data.to(device), target.to(device) # validation step logits = model(data) target_hat = torch.argmax(logits, dim=1) val_loss = F.cross_entropy(logits, target) correct += torch.sum(target == target_hat).item() samples += len(target) # start val step logging here logger.add_scalar('Loss/val', val_loss.item(), val_step) val_step += 1 # end of val step logging # scheduler.step(epoch) # start val epoch end logging here val_acc = correct / (samples * 1.0) logger.add_scalar('Acc/val', val_acc, epoch) print_validation_step(curr_epoch=epoch, epochs=epochs, val_acc=val_acc) # end val epoch logging here end = time.time() print('Total Training Time: %.2f min\n' % ((end - start) / 60))
shuffle=True)
dev_loader = DataLoader(dev_data.to_numpy(), batch_size=32)

num_epochs = 15
model = MatrixFactorizer(N, M)
optimizer = SGD(model.parameters(), lr=0.01, weight_decay=1e-2)

train_losses, dev_losses = [], []
for epoch in range(1, num_epochs + 1):
    epoch_train_loss = 0.0
    num_train_batches = 0
    num_dev_batches = 0
    epoch_dev_loss = 0.0

    model.train()
    for batch_idx, batch in enumerate(train_loader):
        optimizer.zero_grad()
        users = batch[:, 0].long()
        movies = batch[:, 1].long()
        ratings = (batch[:, 2] - rating_mean).float()
        ratings_guess = model(users, movies)
        batch_loss = torch.pow(ratings - ratings_guess, 2).mean()
        batch_loss.backward()
        optimizer.step()
        num_train_batches += 1
        epoch_train_loss += batch_loss.item()

    model.eval()
    for batch_idx, batch in enumerate(dev_loader):
        users = batch[:, 0].long()
        movies = batch[:, 1].long()
        ratings = (batch[:, 2] - rating_mean).float()
        with torch.no_grad():
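# MatrixFactorizer, N, M and rating_mean come from code outside this snippet. A
# minimal sketch of what such a model could look like (an embedding dot-product
# factorizer; the number of latent factors and the absence of bias terms are
# assumptions, not taken from the original code):
import torch
import torch.nn as nn

class MatrixFactorizer(nn.Module):
    def __init__(self, num_users, num_movies, num_factors=32):
        super().__init__()
        # one latent vector per user and per movie
        self.user_factors = nn.Embedding(num_users, num_factors)
        self.movie_factors = nn.Embedding(num_movies, num_factors)

    def forward(self, users, movies):
        # predicted (mean-centered) rating = dot product of the two latent vectors
        return (self.user_factors(users) * self.movie_factors(movies)).sum(dim=1)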
def main(): if not os.path.exists(args.outdir): os.mkdir(args.outdir) device = torch.device("cuda") torch.cuda.set_device(args.gpu) logfilename = os.path.join(args.outdir, args.logname) init_logfile(logfilename, "epoch\ttime\tlr\ttrain loss\ttrain acc\ttestloss\ttest acc") log(logfilename, "Hyperparameter List") log(logfilename, "Epochs: {:}".format(args.epochs)) log(logfilename, "Learning Rate: {:}".format(args.lr)) log(logfilename, "Alpha: {:}".format(args.alpha)) log(logfilename, "Keep ratio: {:}".format(args.keep_ratio)) log(logfilename, "Warmup Epochs: {:}".format(args.epochs_warmup)) test_acc_list = [] for _ in range(args.round): traindir = os.path.join(args.data_train, 'train') valdir = os.path.join(args.data_val, 'val') normalize = transforms.Normalize(mean=[0.485, 0.456, 0.406], std=[0.229, 0.224, 0.225]) train_dataset = datasets.ImageFolder( traindir, transforms.Compose([ transforms.RandomResizedCrop(224), transforms.RandomHorizontalFlip(), transforms.ToTensor(), normalize, ])) train_sampler = None train_loader = torch.utils.data.DataLoader( train_dataset, batch_size=args.batch, shuffle=(train_sampler is None), num_workers=args.workers, pin_memory=True, sampler=train_sampler) test_loader = torch.utils.data.DataLoader(datasets.ImageFolder( valdir, transforms.Compose([ transforms.Resize(256), transforms.CenterCrop(224), transforms.ToTensor(), normalize, ])), batch_size=args.batch, shuffle=False, num_workers=args.workers, pin_memory=True) base_classifier = models.__dict__[args.arch](pretrained=False).cuda() print("Loaded the base_classifier") criterion = nn.CrossEntropyLoss().to(device) optimizer = SGD(base_classifier.parameters(), lr=args.lr, momentum=args.momentum, weight_decay=args.weight_decay) # Warmup training for the rewinding. for epoch in range(args.epochs_warmup): print("Warmup Training Epochs: {:}".format(epoch)) log(logfilename, "Warmup current epochs: {}".format(epoch)) train_loss, train_top1, train_top5 = utils.train(train_loader, base_classifier, criterion, optimizer, epoch, device, print_freq=100, display=True) original_acc = model_inference(base_classifier, test_loader, device, display=True) log(logfilename, "Warmup Model Test Accuracy: {:.5}".format(original_acc)) print("Warmup Model Test Accuracy, ", original_acc) # Creating a fresh copy of network not affecting the original network. # Goal is to find the supermask. net = copy.deepcopy(base_classifier) net = net.to(device) # Generating the mask 'm' for layer in net.modules(): if isinstance(layer, nn.Linear) or isinstance(layer, nn.Conv2d): layer.weight_mask = nn.Parameter(torch.ones_like(layer.weight)) layer.weight.requires_grad = True layer.weight_mask.requires_grad = True # This is the monkey-patch overriding layer.forward to custom function. # layer.forward will pass nn.Linear with weights: 'w' and 'm' elementwised if isinstance(layer, nn.Linear): layer.forward = types.MethodType(mask_forward_linear, layer) if isinstance(layer, nn.Conv2d): layer.forward = types.MethodType(mask_forward_conv2d, layer) criterion = nn.CrossEntropyLoss().to( device) # Criterion for training the mask. optimizer = SGD(net.parameters(), lr=args.lr, momentum=args.momentum, weight_decay=0) # weight_decay = 0 for training the mask. # warm_scheduler = StepLR(optimizer, step_size=args.epochs_mask-10, gamma=0.2) sparsity, total = 0, 0 breakFlag = False net.train() # Training the mask with the training set. 
for epoch in range(100000): # if epoch % 5 == 0: print("Current epochs: ", epoch) print("Sparsity: {:}".format(sparsity)) log(logfilename, "Current epochs: {}".format(epoch)) log(logfilename, "Sparsity: {:}".format(sparsity)) for i, (inputs, targets) in enumerate(train_loader): inputs = inputs.cuda() targets = targets.cuda() reg_loss = 0 for layer in net.modules(): if isinstance(layer, nn.Conv2d) or isinstance( layer, nn.Linear): reg_loss += torch.norm(layer.weight_mask, p=1) outputs = net(inputs) loss = criterion(outputs, targets) + args.alpha * reg_loss # Computing gradient and do SGD optimizer.zero_grad() loss.backward() optimizer.step() sparsity, total = 0, 0 for layer in net.modules(): if isinstance(layer, nn.Linear) or isinstance( layer, nn.Conv2d): boolean_list = layer.weight_mask.data > 1e-3 sparsity += (boolean_list == 1).sum() total += layer.weight.numel() if i % 50 == 0: print( "Current Epochs: {}, Current i: {}, Current Sparsity: {}" .format(epoch, i, sparsity)) if sparsity <= total * args.keep_ratio: print("Current epochs breaking loop at {:}".format(epoch)) log(logfilename, "Current epochs breaking loop at {:}".format(epoch)) breakFlag = True break # if breakFlag == True: # break if breakFlag == True: break # print("W 1-norm: ", torch.norm(layer.weight_mask, p=1)) # Just checking the 1-norm of weights in each layer. # Approximates how sparse the mask is.. # This line allows to calculate the threshold to satisfy the keep_ratio. c_abs = [] for layer in net.modules(): if isinstance(layer, nn.Linear) or isinstance(layer, nn.Conv2d): c_abs.append(torch.abs(layer.weight_mask)) all_scores = torch.cat([torch.flatten(x) for x in c_abs]) num_params_to_keep = int(len(all_scores) * args.keep_ratio) threshold, _ = torch.topk(all_scores, num_params_to_keep, sorted=True) threshold = threshold[-1] print("Threshold found: ", threshold) keep_masks = [] for c in c_abs: keep_masks.append((c >= threshold).float()) print( "Number of ones.", torch.sum(torch.cat([torch.flatten(x == 1) for x in keep_masks]))) torch.save(base_classifier.state_dict(), os.path.join(args.outdir, args.save_model)) base_classifier_acc = model_inference(base_classifier, test_loader, device, display=True) log(logfilename, "Weight Update Test Accuracy: {:.5}".format(base_classifier_acc)) print("Saved the rewind model.") for masks in keep_masks: masks = masks.data torch.save(keep_masks, os.path.join(args.outdir, args.keep_mask)) print("Saved the masking function.") log(logfilename, "Finished finding the mask. (REWIND)")
def train(**kwargs):
    opt.parse(kwargs)
    alpha = [0.2, 0.5, 0.8, 1.0, 1.3, 1.5, 1.8, 2.0, 2.5]
    images, tags, labels = load_data(opt.data_path)
    pretrain_model = load_pretrain_model(opt.pretrain_model_path)
    y_dim = tags.shape[1]
    label_num = labels.shape[1]
    X, Y, L = split_data(images, tags, labels)
    print('...loading and splitting data finished')
    img_model = ImgModule(opt.bit, pretrain_model)
    txt_model = TxtModule(y_dim, opt.bit)
    hash_model = HashModule(opt.bit)
    label_model = LabModule(label_num)
    if opt.use_gpu:
        img_model = img_model.cuda()
        txt_model = txt_model.cuda()
        hash_model = hash_model.cuda()
        label_model = label_model.cuda()

    train_L = torch.from_numpy(L['train'])
    train_x = torch.from_numpy(X['train'])
    train_y = torch.from_numpy(Y['train'])
    query_L = torch.from_numpy(L['query'])
    query_x = torch.from_numpy(X['query'])
    query_y = torch.from_numpy(Y['query'])
    retrieval_L = torch.from_numpy(L['retrieval'])
    retrieval_x = torch.from_numpy(X['retrieval'])
    retrieval_y = torch.from_numpy(Y['retrieval'])

    num_train = train_x.shape[0]
    F_buffer = torch.randn(num_train, opt.bit)
    G_buffer = torch.randn(num_train, opt.bit)
    X_fea_buffer = torch.randn(num_train, opt.X_fea_nums)
    Y_fea_buffer = torch.randn(num_train, opt.Y_fea_nums)
    X_label_buffer = torch.randn(num_train, label_num)
    Y_label_buffer = torch.randn(num_train, label_num)
    Label_buffer = torch.randn(num_train, label_num)
    Label_hash_buffer = torch.randn(num_train, opt.bit)
    Label_label_buffer = torch.randn(num_train, label_num)
    if opt.use_gpu:
        train_L = train_L.cuda()
        F_buffer = F_buffer.cuda()
        G_buffer = G_buffer.cuda()
        X_fea_buffer = X_fea_buffer.cuda()
        Y_fea_buffer = Y_fea_buffer.cuda()
        Label_buffer = Label_buffer.cuda()
        X_label_buffer = X_label_buffer.cuda()
        Y_label_buffer = Y_label_buffer.cuda()
        Label_hash_buffer = Label_hash_buffer.cuda()
        Label_label_buffer = Label_label_buffer.cuda()

    Sim = calc_neighbor(train_L, train_L)
    B = torch.sign(F_buffer + G_buffer)
    B_buffer = torch.sign(F_buffer + G_buffer)
    batch_size = opt.batch_size
    lr = opt.lr
    optimizer_img = SGD(img_model.parameters(), lr=lr)
    optimizer_txt = SGD(txt_model.parameters(), lr=lr)
    optimizer_hash = SGD(hash_model.parameters(), lr=lr)
    optimizer_label = SGD(label_model.parameters(), lr=lr)
    learning_rate = np.linspace(opt.lr, np.power(10, -6.), opt.max_epoch + 1)
    result = {'loss': [], 'hash_loss': [], 'total_loss': []}
    ones = torch.ones(batch_size, 1)
    ones_ = torch.ones(num_train - batch_size, 1)
    unupdated_size = num_train - batch_size
    max_mapi2t = max_mapt2i = 0.
for epoch in range(opt.max_epoch): # train label net for i in tqdm(range(num_train // batch_size)): index = np.random.permutation(num_train) ind = index[0: batch_size] unupdated_ind = np.setdiff1d(range(num_train), ind) sample_L = Variable(train_L[ind, :]) label = Variable(train_L[ind,:].unsqueeze(1).unsqueeze(-1).type(torch.float)) if opt.use_gpu: label = label.cuda() sample_L = sample_L.cuda() # similar matrix size: (batch_size, num_train) S = calc_neighbor(sample_L, train_L) label_hash, label_label = label_model(label) # Label_hash_buffer[ind, :] = label_hash.data Label_label_buffer[ind, :] = label_label.data Label = Variable(train_L) Label_B = torch.sign(label_hash) Label_H = Variable(Label_hash_buffer) theta_l = 1.0 / 2 * torch.matmul(label_hash, Label_H.t()) logloss_l = -torch.sum(S * theta_l - torch.log(1.0 + torch.exp(theta_l))) quantization_l = torch.sum(torch.pow(Label_hash_buffer[ind, :] - Label_B, 2)) labelloss_l = torch.sum(torch.pow(Label[ind, :].float() - label_label, 2)) loss_label = logloss_l + opt.beta * quantization_l + opt.alpha * labelloss_l # + logloss_x_fea loss_label /= (batch_size * num_train) optimizer_label.zero_grad() loss_label.backward() optimizer_label.step() # train image net for i in tqdm(range(num_train // batch_size)): index = np.random.permutation(num_train) ind = index[0: batch_size] unupdated_ind = np.setdiff1d(range(num_train), ind) sample_L = Variable(train_L[ind, :]) image = Variable(train_x[ind].type(torch.float)) if opt.use_gpu: image = image.cuda() sample_L = sample_L.cuda() # similar matrix size: (batch_size, num_train) S = calc_neighbor(sample_L, train_L) # S: (batch_size, num_train) image_fea, cur_f, image_label = img_model(image) # cur_f: (batch_size, bit) X_fea_buffer[ind, :] = image_fea.data F_buffer[ind, :] = cur_f.data X_label_buffer[ind, :] = image_label.data G = Variable(G_buffer) H_l = Variable(Label_hash_buffer) B_x = torch.sign(F_buffer) theta_x = 1.0 / 2 * torch.matmul(cur_f, H_l.t()) logloss_x = -torch.sum(S * theta_x - torch.log(1.0 + torch.exp(theta_x))) quantization_xh = torch.sum(torch.pow(B_buffer[ind, :] - cur_f, 2)) quantization_xb = torch.sum(torch.pow(B_x[ind, :]- cur_f, 2)) labelloss_x = torch.sum(torch.pow(train_L[ind, :].float() - image_label,2)) loss_x = logloss_x + opt.beta * quantization_xh + opt.alpha * labelloss_x + opt.gamma * quantization_xb# + logloss_x_fea loss_x /= (batch_size * num_train) optimizer_img.zero_grad() loss_x.backward() optimizer_img.step() # train txt net for i in tqdm(range(num_train // batch_size)): index = np.random.permutation(num_train) ind = index[0: batch_size] unupdated_ind = np.setdiff1d(range(num_train), ind) sample_L = Variable(train_L[ind, :]) text = train_y[ind, :].unsqueeze(1).unsqueeze(-1).type(torch.float) text = Variable(text) if opt.use_gpu: text = text.cuda() sample_L = sample_L.cuda() # similar matrix size: (batch_size, num_train) S = calc_neighbor(sample_L, train_L) # S: (batch_size, num_train) txt_fea, cur_g, txt_label = txt_model(text) # cur_f: (batch_size, bit) Y_fea_buffer[ind, :] = txt_fea.data G_buffer[ind, :] = cur_g.data Y_label_buffer[ind, :] = txt_label.data F = Variable(F_buffer) H_l = Variable(Label_hash_buffer) B_y = torch.sign(F) # calculate loss # theta_y: (batch_size, num_train) theta_y = 1.0 / 2 * torch.matmul(cur_g, H_l.t()) logloss_y = -torch.sum(S * theta_y - torch.log(1.0 + torch.exp(theta_y))) quantization_yh = torch.sum(torch.pow(B_buffer[ind, :] - cur_g, 2)) quantization_yb = torch.sum(torch.pow(B_y[ind, :] - cur_g, 2)) labelloss_y = 
torch.sum(torch.pow(train_L[ind, :].float() - txt_label, 2)) loss_y = logloss_y + opt.beta * quantization_yh + opt.alpha * labelloss_y + opt.gamma * quantization_yb# + logloss_y_fea loss_y /= (num_train * batch_size) optimizer_txt.zero_grad() loss_y.backward() optimizer_txt.step() #train hash net for i in tqdm(range(num_train // batch_size)): index = np.random.permutation(num_train) ind = index[0: batch_size] unupdated_ind = np.setdiff1d(range(num_train), ind) sample_L = Variable(train_L[ind, :]) #W = norm(X_fea_buffer[ind, :], Y_fea_buffer[ind, :]) #fea = 1.0 / 2 * (torch.matmul(W, X_fea_buffer[ind, :]) + torch.matmul(W, Y_fea_buffer[ind, :])) fea = torch.cat([X_fea_buffer[ind, :], Y_fea_buffer[ind, :]], dim=1) fea = Variable(fea) if opt.use_gpu: fea = fea.cuda() sample_L = sample_L.cuda() S = calc_neighbor(sample_L, train_L) A = caculateAdj(sample_L, sample_L) cur_B, label_hash = hash_model(fea, A) B_buffer[ind, :] = cur_B.data #caculate loss B = Variable(torch.sign(B_buffer)) theta_hash = 1.0 / 2 * torch.matmul(cur_B, B_buffer.t()) logloss_hash = -torch.sum(S * theta_hash - torch.log(1.0 + torch.exp(theta_hash))) label_loss = torch.sum(torch.pow(train_L[ind, :].float() - label_hash, 2)) hashloss = torch.sum(torch.pow(B[ind, :] - cur_B, 2)) loss_hash = logloss_hash + opt.alpha * label_loss + opt.beta * hashloss optimizer_hash.zero_grad() loss_hash.backward() optimizer_hash.step() # train image net for i in tqdm(range(num_train // batch_size)): index = np.random.permutation(num_train) ind = index[0: batch_size] unupdated_ind = np.setdiff1d(range(num_train), ind) sample_L = Variable(train_L[ind, :]) image = Variable(train_x[ind].type(torch.float)) if opt.use_gpu: image = image.cuda() sample_L = sample_L.cuda() # similar matrix size: (batch_size, num_train) S = calc_neighbor(sample_L, train_L) # S: (batch_size, num_train) image_fea, cur_f, image_label = img_model(image) # cur_f: (batch_size, bit) X_fea_buffer[ind, :] = image_fea.data F_buffer[ind, :] = cur_f.data X_label_buffer[ind, :] = image_label.data G = Variable(G_buffer) H_l = Variable(Label_hash_buffer) B_x = torch.sign(F_buffer) theta_x = 1.0 / 2 * torch.matmul(cur_f, H_l.t()) logloss_x = -torch.sum(S * theta_x - torch.log(1.0 + torch.exp(theta_x))) quantization_xh = torch.sum(torch.pow(B_buffer[ind, :] - cur_f, 2)) quantization_xb = torch.sum(torch.pow(B_x[ind, :] - cur_f, 2)) labelloss_x = torch.sum(torch.pow(train_L[ind, :].float() - image_label, 2)) loss_x = logloss_x + opt.gamma * quantization_xh + opt.alpha * labelloss_x + opt.beta * quantization_xb # + logloss_x_fea loss_x /= (batch_size * num_train) optimizer_img.zero_grad() loss_x.backward() optimizer_img.step() # train txt net for i in tqdm(range(num_train // batch_size)): index = np.random.permutation(num_train) ind = index[0: batch_size] unupdated_ind = np.setdiff1d(range(num_train), ind) sample_L = Variable(train_L[ind, :]) text = train_y[ind, :].unsqueeze(1).unsqueeze(-1).type(torch.float) text = Variable(text) if opt.use_gpu: text = text.cuda() sample_L = sample_L.cuda() # similar matrix size: (batch_size, num_train) S = calc_neighbor(sample_L, train_L) # S: (batch_size, num_train) txt_fea, cur_g, txt_label = txt_model(text) # cur_f: (batch_size, bit) Y_fea_buffer[ind, :] = txt_fea.data G_buffer[ind, :] = cur_g.data Y_label_buffer[ind, :] = txt_label.data F = Variable(F_buffer) H_l = Variable(Label_hash_buffer) B_y = torch.sign(F) # calculate loss # theta_y: (batch_size, num_train) theta_y = 1.0 / 2 * torch.matmul(cur_g, H_l.t()) logloss_y = -torch.sum(S * theta_y - 
torch.log(1.0 + torch.exp(theta_y))) quantization_yh = torch.sum(torch.pow(B_buffer[ind, :] - cur_g, 2)) quantization_yb = torch.sum(torch.pow(B_y[ind, :] - cur_g, 2)) labelloss_y = torch.sum(torch.pow(train_L[ind, :].float() - txt_label, 2)) loss_y = logloss_y + opt.gamma * quantization_yh + opt.alpha * labelloss_y + opt.beta * quantization_yb # + logloss_y_fea loss_y /= (num_train * batch_size) optimizer_txt.zero_grad() loss_y.backward() optimizer_txt.step() # calculate total loss loss, hash_loss, total_loss = calc_loss(B, F, G, Variable(Sim), opt.alpha, opt.beta,Label_buffer, train_L, X_label_buffer,Y_label_buffer) print('...epoch: %3d, loss: %3.3f, lr: %f' % (epoch + 1, loss.data, lr)) print('...epoch: %3d, hash_loss: %3.3f, lr: %f' % (epoch + 1, hash_loss.data, lr)) print('...epoch: %3d, total_loss: %3.3f, lr: %f' % (epoch + 1, total_loss.data, lr)) result['loss'].append(float(loss.data)) result['hash_loss'].append(float(hash_loss.data)) result['total_loss'].append(float(total_loss.data)) if opt.valid: mapi2t, mapt2i = valid(img_model, txt_model, query_x, retrieval_x, query_y, retrieval_y, query_L, retrieval_L) print('...epoch: %3d, valid MAP: MAP(i->t): %3.4f, MAP(t->i): %3.4f' % (epoch + 1, mapi2t, mapt2i)) if mapt2i >= max_mapt2i and mapi2t >= max_mapi2t: max_mapi2t = mapi2t max_mapt2i = mapt2i img_model.save(img_model.module_name + '.pth') txt_model.save(txt_model.module_name + '.pth') hash_model.save(hash_model.module_name+'.pth') lr = learning_rate[epoch + 1] # set learning rate for param in optimizer_img.param_groups: param['lr'] = lr for param in optimizer_txt.param_groups: param['lr'] = lr print('...training procedure finish') if opt.valid: print(' max MAP: MAP(i->t): %3.4f, MAP(t->i): %3.4f' % (max_mapi2t, max_mapt2i)) result['mapi2t'] = max_mapi2t result['mapt2i'] = max_mapt2i else: mapi2t, mapt2i = valid(img_model, txt_model, query_x, retrieval_x, query_y, retrieval_y, query_L, retrieval_L) print(' max MAP: MAP(i->t): %3.4f, MAP(t->i): %3.4f' % (mapi2t, mapt2i)) result['mapi2t'] = mapi2t result['mapt2i'] = mapt2i write_result(result)
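# calc_neighbor builds the pairwise similarity matrix S used in every log-likelihood
# term above, but its body is not part of this snippet. A common definition for
# multi-label cross-modal hashing (given here as an assumption) marks two samples as
# similar when their multi-hot label vectors share at least one class:
def calc_neighbor(label_1, label_2):
    # label_1: (n1, num_classes), label_2: (n2, num_classes), entries in {0, 1}
    sim = (label_1.float().matmul(label_2.float().t()) > 0).float()
    return sim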
def train(args, model, tokenizer): """ Train the model """ if xm.is_master_ordinal(): tb_writer = SummaryWriterP(args.output_dir) def summary_write(*args, **kwargs): if xm.is_master_ordinal(): tb_writer.add_scalar(*args, **kwargs) args.train_batch_size = args.per_gpu_train_batch_size #* max(1, args.n_gpu) train_dataloader = build_dataloader(args, tokenizer) if args.max_steps > 0: t_total = args.max_steps args.num_train_epochs = args.max_steps // ( len(train_dataloader) // args.gradient_accumulation_steps) + 1 else: t_total = len( train_dataloader ) // args.gradient_accumulation_steps * args.num_train_epochs # Prepare optimizer and schedule (linear warmup and decay) no_decay = ['bias', 'LayerNorm.weight'] optimizer_grouped_parameters = [{ 'params': [ p for n, p in model.named_parameters() if p.requires_grad and not any(nd in n for nd in no_decay) ], 'weight_decay': args.weight_decay }, { 'params': [ p for n, p in model.named_parameters() if p.requires_grad and any(nd in n for nd in no_decay) ], 'weight_decay': 0.0 }] # Scale learning rate to num cores #args.learning_rate = args.learning_rate * xm.xrt_world_size() if args.sgd: optimizer = SGD(optimizer_grouped_parameters, lr=args.learning_rate) else: optimizer = AdamW(optimizer_grouped_parameters, lr=args.learning_rate, eps=args.adam_epsilon) warmup_steps = args.warmup_samples // (args.train_batch_size * xm.xrt_world_size()) if args.lr_decay: scheduler = WarmupLinearSchedule(optimizer, warmup_steps=warmup_steps, t_total=t_total) elif args.lr_cosine: scheduler = WarmupCosineWithHardRestartsSchedule( optimizer, warmup_steps=warmup_steps, t_total=t_total, cycles=args.num_train_epochs) else: scheduler = WarmupZeroSchedule(optimizer, warmup_steps=warmup_steps) # Train! tracker = xm.RateTracker() log_info("***** Running training *****") log_info(" Num Epochs = %d", args.num_train_epochs) log_info(" Instantaneous batch size per GPU = %d", args.per_gpu_train_batch_size) log_info( " Total train batch size (w. 
parallel, distributed & accumulation) = %d", args.train_batch_size * args.gradient_accumulation_steps * (xm.xrt_world_size() if args.local_rank != -1 else 1)) log_info(" Gradient Accumulation steps = %d", args.gradient_accumulation_steps) log_info(" Total optimization steps = %d", t_total) try: with open(os.path.join(args.model_name_or_path, 'step.txt'), 'r') as c: global_step = int(c.readline()) except OSError as e: global_step = 0 moving_loss = MovingLoss(10000 // args.logging_steps) train_iterator = trange(int(args.num_train_epochs), desc="Epoch", disable=not xm.is_master_ordinal()) try: for epoch in train_iterator: p_train_dataloader = pl.ParallelLoader(train_dataloader, [args.device]) epoch_iterator = tqdm(p_train_dataloader.per_device_loader( args.device), total=len(train_dataloader), desc="Iteration", disable=not xm.is_master_ordinal()) model.train() for step, batch in enumerate(epoch_iterator): optimizer.zero_grad() inputs, labels = mask_tokens( batch, tokenizer, args) if args.mlm else (batch, batch) outputs = model( inputs, masked_lm_labels=labels) if args.mlm else model( inputs, labels=labels) loss = outputs[ 0] # model outputs are always tuple in pytorch-transformers (see doc) if args.n_gpu > 1: loss = loss.mean( ) # mean() to average on multi-gpu parallel training if args.gradient_accumulation_steps > 1: loss = loss / args.gradient_accumulation_steps loss.backward() if (step + 1) % args.gradient_accumulation_steps == 0: torch.nn.utils.clip_grad_norm_(model.parameters(), args.max_grad_norm) xm.optimizer_step(optimizer, barrier=True) scheduler.step() global_step += 1 tracker.add(args.train_batch_size) if args.logging_steps > 0 and global_step % args.logging_steps == 0: ls = loss.item( ) # weird. if you call loss.item() only in one process, the whole thing hangs. So call on every and log in one. moving_loss.add(ls) summary_write('lr', scheduler.get_last_lr()[0], global_step) epoch_iterator.set_postfix( MovingLoss=f'{moving_loss.loss:.2f}', Perplexity= f'{torch.exp(torch.tensor(moving_loss.loss)):.2f}') if args.save_steps > 0 and global_step % args.save_steps == 0: save_state(args, model, tokenizer, global_step) if step >= 2: # TPU seems to like consistent epoch lenght if xm.is_master_ordinal(): print(met.metrics_report()) exit(0) # epoch_iterator.close() # break if args.max_steps > 0 and step > args.max_steps: epoch_iterator.close() break # evaluate once in an epoch if args.evaluate_during_training: results = evaluate(args, model, tokenizer, f"checkpoint-{global_step}") log_info(f"Eval {results}") for key, value in results.items(): summary_write("eval_{}".format(key), value, global_step) # reload dataset every args.reload_data_file epochs if args.reload_data_file and (epoch + 1) % args.reload_data_file == 0: train_dataloader = build_dataloader(args, tokenizer) # that's very slow on TPU #print_sample(model, tokenizer, args.device, args) except (KeyboardInterrupt, SystemExit): save_state(args, model, tokenizer, global_step) raise save_state(args, model, tokenizer, global_step) return global_step, moving_loss.loss
def compute(self, config, budget, **kwargs): """Runs the training session. This training session will also save all the data on its runs (e.g. config, loss, accuracy) into the logging dir Args: config (dict): Dictionary containing the configuration by the optimizer budget (int): Amount of epochs the model can use to train. Returns: dict: dictionary with fields 'loss' (float) and 'info' (dict) """ # Start with printouts print("\n\n") print( "================================================================" "=======") print("\nStarting run {} with config:.".format(self.run_count)) print(" Optimizer: {}".format(config['optimizer'])) print(" Learning rate: {}".format(config['lr'])) print(" Batch size: {}".format(config['bs'])) print(" First layer: {}".format(config['first_layer'])) print(" Second layer: {}".format(config['second_layer'])) print(" Leaky config: {}, {}, {}".format(config['leaky1'], config['leaky2'], config['leaky3'])) # Set network, dataloader, optimizer, and loss criterion train_loader = DataLoader(self.train_data, config['bs'], shuffle=True) test_loader = DataLoader(self.test_data, config['bs'], shuffle=True) network = FCNetwork(784, 10, config['first_layer'], config['second_layer'], (config['leaky1'], config['leaky2'], config['leaky3'])).to(device=self.device) if config['optimizer'] == 'sgd': optimizer = SGD(network.parameters(), config['lr'], config['momentum']) else: optimizer = Adam(network.parameters(), config['lr'], eps=config['epsilon']) loss_crit = CrossEntropyLoss() # Increment run count number self.run_count += 1 # Start actual training loop for epoch in range(int(budget)): # Do training loop network.train() for i, (img, cls) in enumerate(train_loader): img = img.to(self.device) cls = cls.to(self.device) optimizer.zero_grad() h1, h2, out = network(img) out = out.softmax(1) loss = loss_crit(out, cls) # Do backprop if i % int(1000 / (config['bs'] / 4)) == 0: print("Iteration {}, \tepoch: {}, \tLoss: {:.4f}, " "\taccuracy: {:.2f}%".format( i + 1, epoch + 1, loss.item(), self.calc_batch_accuracy(out, cls) * 100)) loss.backward() optimizer.step() train_loss, train_acc = self.evaluate_network(network, loss_crit, train_loader) validation_loss, validation_accuracy = self.evaluate_network( network, loss_crit, test_loader) # Print out results print( "================================================================" "=======") print("Validation accuracy: {:.4f}%".format(validation_accuracy * 100.)) print("Validation loss: {:.4f}".format(validation_loss)) print("Training accuracy: {:.4f}%".format(train_acc * 100)) print("Training loss: {:.4f}".format(train_loss)) return { 'loss': 1 - validation_accuracy, 'info': { 'validation accuracy': validation_accuracy, 'validation loss': validation_loss, 'training loss': train_loss, 'training accuracy': train_acc } }
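# FCNetwork is constructed above with two hidden-layer sizes and three LeakyReLU
# slopes, and its forward pass is unpacked as (h1, h2, out). A plausible sketch
# under those assumptions (the real network may differ):
import torch.nn as nn

class FCNetwork(nn.Module):
    def __init__(self, in_features, num_classes, first_layer, second_layer, leaky_slopes):
        super().__init__()
        self.fc1 = nn.Linear(in_features, first_layer)
        self.act1 = nn.LeakyReLU(leaky_slopes[0])
        self.fc2 = nn.Linear(first_layer, second_layer)
        self.act2 = nn.LeakyReLU(leaky_slopes[1])
        self.fc3 = nn.Linear(second_layer, num_classes)
        self.act3 = nn.LeakyReLU(leaky_slopes[2])

    def forward(self, x):
        x = x.view(x.size(0), -1)  # flatten 28x28 MNIST images to 784 features
        h1 = self.act1(self.fc1(x))
        h2 = self.act2(self.fc2(h1))
        out = self.act3(self.fc3(h2))
        return h1, h2, out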
def train(model, state, path, annotations, val_path, val_annotations, resize, max_size, jitter, batch_size, iterations, val_iterations, mixed_precision, lr, warmup, milestones, gamma, is_master=True, world=1, use_dali=True, verbose=True, metrics_url=None, logdir=None): 'Train the model on the given dataset' # Prepare model nn_model = model stride = model.stride model = convert_fixedbn_model(model) if torch.cuda.is_available(): model = model.cuda() # Setup optimizer and schedule optimizer = SGD(model.parameters(), lr=lr, weight_decay=0.0001, momentum=0.9) model, optimizer = amp.initialize( model, optimizer, opt_level='O2' if mixed_precision else 'O0', keep_batchnorm_fp32=True, loss_scale=128.0, verbosity=is_master) if world > 1: model = DistributedDataParallel(model) model.train() if 'optimizer' in state: optimizer.load_state_dict(state['optimizer']) def schedule(train_iter): if warmup and train_iter <= warmup: return 0.9 * train_iter / warmup + 0.1 return gamma**len([m for m in milestones if m <= train_iter]) scheduler = LambdaLR(optimizer.optimizer if mixed_precision else optimizer, schedule) # Prepare dataset if verbose: print('Preparing dataset...') data_iterator = (DaliDataIterator if use_dali else DataIterator)( path, jitter, max_size, batch_size, stride, world, annotations, training=True) if verbose: print(data_iterator) if verbose: print(' device: {} {}'.format( world, 'cpu' if not torch.cuda.is_available() else 'gpu' if world == 1 else 'gpus')) print(' batch: {}, precision: {}'.format( batch_size, 'mixed' if mixed_precision else 'full')) print('Training model for {} iterations...'.format(iterations)) # Create TensorBoard writer if logdir is not None: from tensorboardX import SummaryWriter if is_master and verbose: print('Writing TensorBoard logs to: {}'.format(logdir)) writer = SummaryWriter(log_dir=logdir) profiler = Profiler(['train', 'fw', 'bw']) iteration = state.get('iteration', 0) while iteration < iterations: cls_losses, box_losses = [], [] for i, (data, target) in enumerate(data_iterator): scheduler.step(iteration) # Forward pass profiler.start('fw') optimizer.zero_grad() cls_loss, box_loss = model([data, target]) del data profiler.stop('fw') # Backward pass profiler.start('bw') with amp.scale_loss(cls_loss + box_loss, optimizer) as scaled_loss: scaled_loss.backward() optimizer.step() # Reduce all losses cls_loss, box_loss = cls_loss.mean().clone(), box_loss.mean( ).clone() if world > 1: torch.distributed.all_reduce(cls_loss) torch.distributed.all_reduce(box_loss) cls_loss /= world box_loss /= world if is_master: cls_losses.append(cls_loss) box_losses.append(box_loss) if is_master and not isfinite(cls_loss + box_loss): raise RuntimeError('Loss is diverging!\n{}'.format( 'Try lowering the learning rate.')) del cls_loss, box_loss profiler.stop('bw') iteration += 1 profiler.bump('train') if is_master and (profiler.totals['train'] > 60 or iteration == iterations): focal_loss = torch.stack(list(cls_losses)).mean().item() box_loss = torch.stack(list(box_losses)).mean().item() learning_rate = optimizer.param_groups[0]['lr'] if verbose: msg = '[{:{len}}/{}]'.format(iteration, iterations, len=len(str(iterations))) msg += ' focal loss: {:.3f}'.format(focal_loss) msg += ', box loss: {:.3f}'.format(box_loss) msg += ', {:.3f}s/{}-batch'.format(profiler.means['train'], batch_size) msg += ' (fw: {:.3f}s, bw: {:.3f}s)'.format( profiler.means['fw'], profiler.means['bw']) msg += ', {:.1f} im/s'.format(batch_size / profiler.means['train']) msg += ', lr: {:.2g}'.format(learning_rate) 
print(msg, flush=True) if logdir is not None: writer.add_scalar('focal_loss', focal_loss, iteration) writer.add_scalar('box_loss', box_loss, iteration) writer.add_scalar('learning_rate', learning_rate, iteration) del box_loss, focal_loss if metrics_url: post_metrics( metrics_url, { 'focal loss': mean(cls_losses), 'box loss': mean(box_losses), 'im_s': batch_size / profiler.means['train'], 'lr': learning_rate }) # Save model weights state.update({ 'iteration': iteration, 'optimizer': optimizer.state_dict(), 'scheduler': scheduler.state_dict(), }) with ignore_sigint(): nn_model.save(state) profiler.reset() del cls_losses[:], box_losses[:] if val_annotations and (iteration == iterations or iteration % val_iterations == 0): infer(model, val_path, None, resize, max_size, batch_size, annotations=val_annotations, mixed_precision=mixed_precision, is_master=is_master, world=world, use_dali=use_dali, verbose=False) model.train() if iteration == iterations: break if logdir is not None: writer.close()
class Trainer: def __init__(self, frozen_src2tgt: Seq2Seq, frozen_tgt2src: Seq2Seq, src_embedding: Embedding, tgt_embedding: Embedding, encoder_rnn, decoder_rnn, attention: Attention, src_hat: DecoderHat, tgt_hat: DecoderHat, discriminator: Discriminator, src_sos_index, tgt_sos_index, src_eos_index, tgt_eos_index, src_pad_index, tgt_pad_index, device, lr_core=1e-3, lr_disc=1e-3): assert discriminator.hidden_size == (encoder_rnn.bidirectional + 1) * encoder_rnn.hidden_size self.frozen_src2tgt = frozen_src2tgt self.frozen_tgt2src = frozen_tgt2src self.src_embedding = src_embedding self.tgt_embedding = tgt_embedding self.encoder_rnn = encoder_rnn self.decoder_rnn = decoder_rnn self.attention = attention self.src_hat = src_hat self.tgt_hat = tgt_hat self.core_model = nn.ModuleList([ self.src_embedding, self.tgt_embedding, self.encoder_rnn, self.decoder_rnn, self.attention, self.src_hat, self.tgt_hat ]) self.discriminator = discriminator self.src_sos_index = src_sos_index self.tgt_sos_index = tgt_sos_index self.src_eos_index = src_eos_index self.tgt_eos_index = tgt_eos_index self.src_pad_index = src_pad_index self.tgt_pad_index = tgt_pad_index self.device = device self.core_model.to(device) self.discriminator.to(device) use_cuda = device.type == 'cuda' self.src2src = Seq2Seq(src_embedding, encoder_rnn, src_embedding, attention, decoder_rnn, src_hat, use_cuda) self.src2tgt = Seq2Seq(src_embedding, encoder_rnn, tgt_embedding, attention, decoder_rnn, tgt_hat, use_cuda) self.tgt2tgt = Seq2Seq(tgt_embedding, encoder_rnn, tgt_embedding, attention, decoder_rnn, tgt_hat, use_cuda) self.tgt2src = Seq2Seq(tgt_embedding, encoder_rnn, src_embedding, attention, decoder_rnn, src_hat, use_cuda) self.core_optimizer = SGD(self.core_model.parameters(), lr=lr_core) self.discriminator_optimizer = SGD(self.discriminator.parameters(), lr=lr_disc) def train_step(self, batch, weights=(1, 1, 1), drop_probability=0.1, permutation_constraint=3): batch = {l: t.to(self.device) for l, t in batch.items()} src2src_dec, src2src_enc = self.src2src( noise(batch['src'], self.src_pad_index, drop_probability, permutation_constraint), self.src_sos_index, batch['src']) tgt2tgt_dec, tgt2tgt_enc = self.tgt2tgt( noise(batch['tgt'], self.tgt_pad_index, drop_probability, permutation_constraint), self.tgt_sos_index, batch['tgt']) tgt2src_dec, tgt2src_enc = self.tgt2src( noise(self.frozen_src2tgt(batch['src']), self.tgt_pad_index, drop_probability, permutation_constraint), self.src_sos_index, batch['src']) src2tgt_dec, src2tgt_enc = self.src2tgt( noise(self.frozen_tgt2src(batch['tgt']), self.src_pad_index, drop_probability, permutation_constraint), self.tgt_sos_index, batch['tgt']) # autoencoding core_loss = weights[0] * (translation_loss(src2src_dec, batch['src']) + translation_loss(tgt2tgt_dec, batch['tgt'])) # translating core_loss += weights[1] * ( translation_loss(tgt2src_dec, batch['src']) + translation_loss(src2tgt_dec, batch['tgt'])) # beating discriminator core_loss += weights[2] * ( classification_loss(self.discriminator(src2src_enc), 'tgt') + classification_loss(self.discriminator(tgt2tgt_enc), 'src') + classification_loss(self.discriminator(tgt2src_enc), 'src') + classification_loss(self.discriminator(src2tgt_enc), 'tgt')) # training discriminator discriminator_loss = classification_loss(self.discriminator(src2src_enc), 'src') + \ classification_loss(self.discriminator(tgt2tgt_enc), 'tgt') + \ classification_loss(self.discriminator(tgt2src_enc), 'tgt') + \ classification_loss(self.discriminator(src2tgt_enc), 'src') # update core 
        # update core model's parameters
        self.core_optimizer.zero_grad()
        core_loss.backward(retain_graph=True)
        self.core_optimizer.step()
        # update discriminator parameters
        self.discriminator_optimizer.zero_grad()
        discriminator_loss.backward()
        self.discriminator_optimizer.step()
def test_hybrid_batch_gradients(self, qnn_type: str): """Test gradient back-prop for batch input in a qnn.""" import torch from torch.nn import MSELoss from torch.optim import SGD qnn: Optional[Union[CircuitQNN, TwoLayerQNN]] = None if qnn_type == "opflow": qnn = self._create_opflow_qnn() output_size = 1 elif qnn_type == "circuit_qnn": qnn = self._create_circuit_qnn() output_size = 2 else: raise ValueError("Unsupported QNN type") model = self._create_network(qnn, output_size=output_size) model.to(self._device) # random data set x = torch.rand((5, 4), device=self._device) y = torch.rand((5, 2), device=self._device) # define optimizer and loss optimizer = SGD(model.parameters(), lr=0.1) f_loss = MSELoss(reduction="sum") # loss and gradients without batch optimizer.zero_grad(set_to_none=True) sum_of_individual_losses = 0.0 for x_i, y_i in zip(x, y): output = model(x_i) sum_of_individual_losses += f_loss(output, y_i) cast(torch.Tensor, sum_of_individual_losses).backward() sum_of_individual_gradients = 0.0 for n, param in model.named_parameters(): # make sure gradient is not None self.assertFalse(param.grad is None) if n.endswith(".weight"): sum_of_individual_gradients += np.sum(param.grad.detach().cpu().numpy()) # loss and gradients with batch optimizer.zero_grad(set_to_none=True) output = model(x) batch_loss = f_loss(output, y) batch_loss.backward() batch_gradients = 0.0 for n, param in model.named_parameters(): # make sure gradient is not None self.assertFalse(param.grad is None) if n.endswith(".weight"): batch_gradients += np.sum(param.grad.detach().cpu().numpy()) # making sure they are equivalent self.assertAlmostEqual( cast(float, np.linalg.norm(sum_of_individual_gradients - batch_gradients)), 0.0, places=4, ) self.assertAlmostEqual( cast(torch.Tensor, sum_of_individual_losses).detach().cpu().numpy(), batch_loss.detach().cpu().numpy(), places=4, )
def main( lsun_data_dir: ('Base directory for the LSUN data'), image_output_prefix: ('Prefix for image output', 'option', 'o') = 'glo', code_dim: ('Dimensionality of latent representation space', 'option', 'd', int) = 128, epochs: ('Number of epochs to train', 'option', 'e', int) = 25, use_cuda: ('Use GPU?', 'flag', 'gpu') = False, batch_size: ('Batch size', 'option', 'b', int) = 128, lr_g: ('Learning rate for generator', 'option', None, float) = 1., lr_z: ('Learning rate for representation_space', 'option', None, float) = 10., max_num_samples: ('Cap on the number of samples from the LSUN dataset', 'option', 'n', int) = -1, init: ('Initialization strategy for latent represetation vectors', 'option', 'i', str, ['pca', 'random']) = 'pca', n_pca: ('Number of samples to take for PCA', 'option', None, int) = (64 * 64 * 3 * 2), loss: ('Loss type (Laplacian loss as in the paper, or L2 loss)', 'option', 'l', str, ['lap_l1', 'l2']) = 'lap_l1', ): def maybe_cuda(tensor): return tensor.cuda() if use_cuda else tensor train_set = IndexedDataset( LSUN(lsun_data_dir, classes=['bedroom_train'], transform=transforms.Compose([ transforms.Resize(64), transforms.CenterCrop(64), transforms.ToTensor(), transforms.Normalize((0.5, 0.5, 0.5), (0.5, 0.5, 0.5)), ]))) train_loader = torch.utils.data.DataLoader( train_set, batch_size=batch_size, shuffle=True, drop_last=True, num_workers=8, pin_memory=use_cuda, ) # we don't really have a validation set here, but for visualization let us # just take the first couple images from the dataset val_loader = torch.utils.data.DataLoader(train_set, shuffle=False, batch_size=8 * 8) if max_num_samples > 0: train_set.base.length = max_num_samples train_set.base.indices = [max_num_samples] # initialize representation space: if init == 'pca': from sklearn.decomposition import PCA # first, take a subset of train set to fit the PCA X_pca = np.vstack([ X.cpu().numpy().reshape(len(X), -1) for i, (X, _, _) in zip( tqdm(range(n_pca // train_loader.batch_size), 'collect data for PCA'), train_loader) ]) print("perform PCA...") pca = PCA(n_components=code_dim) pca.fit(X_pca) # then, initialize latent vectors to the pca projections of the complete dataset Z = np.empty((len(train_loader.dataset), code_dim)) for X, _, idx in tqdm(train_loader, 'pca projection'): Z[idx] = pca.transform(X.cpu().numpy().reshape(len(X), -1)) elif init == 'random': Z = np.random.randn(len(train_set), code_dim) Z = project_l2_ball(Z) g = maybe_cuda(Generator(code_dim)) # initial a Generator g loss_fn = LapLoss(max_levels=3) if loss == 'lap_l1' else nn.MSELoss() zi = maybe_cuda(torch.zeros((batch_size, code_dim))) zi = Variable(zi, requires_grad=True) optimizer = SGD([{ 'params': g.parameters(), 'lr': lr_g }, { 'params': zi, 'lr': lr_z }]) Xi_val, _, idx_val = next(iter(val_loader)) imsave( 'target.png', make_grid(Xi_val.cpu() / 2. 
        + 0.5, nrow=8).numpy().transpose(1, 2, 0))

    for epoch in range(epochs):
        losses = []
        progress = tqdm(total=len(train_loader), desc='epoch % 3d' % epoch)
        for i, (Xi, yi, idx) in enumerate(train_loader):
            Xi = Variable(maybe_cuda(Xi))
            zi.data = maybe_cuda(torch.FloatTensor(Z[idx.numpy()]))
            optimizer.zero_grad()
            rec = g(zi)
            loss = loss_fn(rec, Xi)
            loss.backward()
            optimizer.step()
            Z[idx.numpy()] = project_l2_ball(zi.data.cpu().numpy())
            losses.append(loss.data[0])
            progress.set_postfix({'loss': np.mean(losses[-100:])})
            progress.update()
        progress.close()

        # visualize reconstructions
        rec = g(Variable(maybe_cuda(torch.FloatTensor(Z[idx_val.numpy()]))))
        imsave('%s_rec_epoch_%03d.png' % (image_output_prefix, epoch),
               make_grid(rec.data.cpu() / 2. + 0.5, nrow=8).numpy().transpose(1, 2, 0))
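# project_l2_ball keeps each latent vector inside the unit l2 ball after every
# update, as in the GLO paper. The helper is defined elsewhere; a minimal sketch:
import numpy as np

def project_l2_ball(z):
    # Rescale rows whose norm exceeds 1; rows already inside the ball are unchanged.
    return z / np.maximum(np.sqrt((z ** 2).sum(axis=1))[:, np.newaxis], 1.0)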
class AlexNet(nn.Module): def __init__(self, num_classes, verbose=False): super(AlexNet, self).__init__() self.verbose = verbose self.num_classes = num_classes self.training = False self.convolution = None self.classifier = None self.criterion = None self.optimizer = None self.scheduler = None """ The first convolutional layer filters the 224x224x3 input image with 96 kernels of size 11x11x3 with a stride of 4 pixels (Note: the actual image size is 227x227x3) """ # Since the 2nd layer has an input of 55x55x48 # (n_h - k_h + 2*p_h)/s_h + 1 = (227 - 11 + 0)/4 + 1 = 55 self.C1 = nn.Conv2d(in_channels=3, out_channels=96, kernel_size=(11, 11), stride=4) """ The second convolutional layer takes as input the (response-normalized and pooled) output of the first layer and filters it with 256 kernels of size 5x5x48. """ # We used k = 2, n = 5, alpha = 10^-4, and beta = 0.75. # We applied this normalization after applying the ReLU non-linearity in certain layers self.RN2 = nn.LocalResponseNorm(size=5, alpha=0.0001, beta=0.75, k=2) # (n_h - k_h + 2*p_h)/s_h + 1 = (55 - 3 + 0)/2 + 1 = 27 self.P2 = nn.MaxPool2d(kernel_size=(3, 3), stride=2) # (n_h - k_h + 2*p_h)/s_h + 1 = (27 - 5 + 2*2)/1 + 1 = 27 self.C2 = nn.Conv2d(in_channels=96, out_channels=256, kernel_size=(5, 5), padding=(2, 2)) """ The third convolutional layer has 384 kernels of size 3x3x256 connected to the (normalized, pooled) outputs of the second convolutional layer. """ self.RN3 = nn.LocalResponseNorm(size=5, alpha=0.0001, beta=0.75, k=2) # (n_h - k_h + 2*p_h)/s_h + 1 = (27 - 3 + 0)/2 + 1 = 13 self.P3 = nn.MaxPool2d(kernel_size=(3, 3), stride=2) # (n_h - k_h + 2*p_h)/s_h + 1 = (13 - 3 + 2*1)/1 + 1 = 13 self.C3 = nn.Conv2d(in_channels=256, out_channels=384, kernel_size=(3, 3), padding=(1, 1)) """ The fourth convolutional layer has 384 kernels of size 3x3x192 """ # (n_h - k_h + 2*p_h)/s_h + 1 = (13 - 3 + 2*1)/1 + 1 = 13 self.C4 = nn.Conv2d(in_channels=384, out_channels=384, kernel_size=(3, 3), padding=(1, 1)) """ The fifth convolutional layer has 256 kernels of size 3x3x192 """ # (n_h - k_h + 2*p_h)/s_h + 1 = (13 - 3 + 2*1)/1 + 1 = 13 self.C5 = nn.Conv2d(in_channels=384, out_channels=256, kernel_size=(3, 3), padding=(1, 1)) # (n_h - k_h + 2*p_h)/s_h + 1 = (13 - 3 + 0)/2 + 1 = 6 self.P5 = nn.MaxPool2d(kernel_size=(3, 3), stride=2) """ The fully-connected layers have 4096 neurons each """ self.F6 = nn.Linear(in_features=(256 * 6 * 6), out_features=4096) self.F7 = nn.Linear(in_features=4096, out_features=4096) self.F8 = nn.Linear(in_features=4096, out_features=self.num_classes) self.convolution = nn.Sequential(self.C1, self.C2, nn.ReLU(inplace=True), self.RN2, self.P2, self.C3, nn.ReLU(inplace=True), self.RN3, self.P3, self.C4, self.C5, self.P5) """ The ReLU non-linearity is applied to the output of every convolutional and fully-connected layer. Dropout is used in the first two fully-connected layers, consisting of setting to zero the output of each hidden neuron with probability 0.5. """ self.classifier = nn.Sequential(nn.Dropout(p=0.5), self.F6, nn.ReLU(inplace=True), nn.Dropout(p=0.5), self.F7, nn.ReLU(inplace=True), self.F8) def initialize(self, criterion=None, optimizer=None, scheduler=None, learning_rate=0.01) -> None: if criterion is None: self.criterion = nn.CrossEntropyLoss() else: self.criterion = criterion """ We trained our models using stochastic gradient descent with a batch size of 128 examples, momentum of 0.9, and weight decay of 0.0005. 
We used an equal learning rate for all layers, which we adjusted manually throughout training. The heuristic which we followed was to divide the learning rate by 10 when the validation error rate stopped improving with the current learning rate. The learning rate was initialized at 0.01 and reduced three times prior to termination. """ if optimizer is None: self.optimizer = SGD(self.parameters(), lr=learning_rate, momentum=0.9, weight_decay=0.0005) else: self.optimizer = optimizer if scheduler is None: self.scheduler = torch.optim.lr_scheduler.ReduceLROnPlateau( optimizer=self.optimizer, mode='min', factor=0.1, patience=5, threshold=0.002) else: self.scheduler = scheduler """ We initialized the weights in each layer from a zero-mean Gaussian distribution with standard deviation 0.01. We initialized the neuron biases in the second, fourth, and fifth convolutional layers, as well as in the fully-connected hidden layers, with the constant 1. We initialized the neuron biases in the remaining layers with the constant 0. """ for name, module in self.convolution.named_children(): if type(module) == nn.Conv2d: nn.init.normal_(tensor=module.weight, mean=0, std=0.01) if name in ['0', '2']: nn.init.constant_(tensor=module.bias, val=0) else: nn.init.constant_(tensor=module.bias, val=1) for name, module in self.classifier.named_children(): if type(module) == nn.Linear: nn.init.normal_(tensor=module.weight, mean=0, std=0.01) nn.init.constant_(tensor=module.bias, val=1) def forward(self, X: torch.Tensor) -> torch.Tensor: X = self.convolution(X) X = X.view(-1, 256 * 6 * 6) return self.classifier(X) def train(self, mode=True, data=None, epochs=10) -> 'AlexNet': device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu") self.to(device) if data is None: raise FileNotFoundError( "\"data\" has to be a valid Dataloader object!") self.training = mode for module in self.convolution: module.train(mode) for module in self.classifier: module.train(mode) running_loss = 0.0 for epoch in range(0, epochs): for i, datum in enumerate(data, 0): features, labels = datum[0].to(device), datum[1].to(device) loss = self.criterion(self(features), labels) self.optimizer.zero_grad() loss.backward() self.optimizer.step() running_loss += loss.item() batch_split = int(len(data.dataset) / data.batch_size / 5) batch_split = 1 if batch_split < 1 else batch_split if i % batch_split == batch_split - 1: if self.verbose: print( f"[epoch {epoch + 1}, batch {i + 1}] loss: {running_loss / batch_split}" ) self.scheduler.step(running_loss / batch_split) running_loss = 0.0 if self.verbose: print('Finished Training') return self
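# A usage sketch for the class above, run on random tensors (the dataset, batch
# size and epoch count here are placeholders, not values from the original code):
if __name__ == '__main__':
    import torch
    from torch.utils.data import DataLoader, TensorDataset

    fake_images = torch.randn(8, 3, 227, 227)    # the class expects 227x227 inputs
    fake_labels = torch.randint(0, 1000, (8,))
    loader = DataLoader(TensorDataset(fake_images, fake_labels), batch_size=4, shuffle=True)

    net = AlexNet(num_classes=1000, verbose=True)
    net.initialize(learning_rate=0.01)           # SGD + ReduceLROnPlateau defaults from the paper
    net.train(mode=True, data=loader, epochs=1)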
def train(): args = parse_args() args.decay_lrs = cfg.TRAIN.DECAY_LRS cfg.USE_GPU_NMS = True if args.use_cuda else False assert args.batch_size == 1, 'Only support single batch' lr = cfg.TRAIN.LEARNING_RATE momentum = cfg.TRAIN.MOMENTUM weight_decay = cfg.TRAIN.WEIGHT_DECAY gamma = cfg.TRAIN.GAMMA # initial tensorboardX writer if args.use_tfboard: if args.exp_name == 'default': writer = SummaryWriter() else: writer = SummaryWriter('runs/' + args.exp_name) if args.dataset == 'voc07trainval': args.imdb_name = 'voc_2007_trainval' args.imdbval_name = 'voc_2007_test' elif args.dataset == 'voc0712trainval': args.imdb_name = 'voc_2007_trainval+voc_2012_trainval' args.imdbval_name = 'voc_2007_test' else: raise NotImplementedError if args.net == 'res50': fname = 'resnet50-caffe.pth' elif args.net == 'res101': fname = 'resnet101-caffe.pth' else: raise NotImplementedError args.pretrained_model = os.path.join('data', 'pretrained', fname) output_dir = args.output_dir if not os.path.exists(output_dir): os.makedirs(output_dir) # dataset_cachefile = os.path.join(output_dir, 'dataset.pickle') # if not os.path.exists(dataset_cachefile): # imdb, roidb = combined_roidb(args.imdb_name) # cache = [imdb, roidb] # with open(dataset_cachefile, 'wb') as f: # pickle.dump(cache, f) # print('save dataset cache') # else: # with open(dataset_cachefile, 'rb') as f: # cache = pickle.load(f) # imdb, roidb = cache[0], cache[1] # print('loaded dataset from cache') imdb, roidb = combined_roidb(args.imdb_name) train_dataset = RoiDataset(roidb) train_dataloader = DataLoader(train_dataset, args.batch_size, shuffle=True) model = FasterRCNN(backbone=args.net, pretrained=args.pretrained_model) print('model loaded') # if cfg.PRETRAINED_RPN: # rpn_model_path = 'output/rpn.pth' # model.load_state_dict(torch.load(rpn_model_path)['model']) # print('loaded rpn!') # optimizer params = [] for key, value in dict(model.named_parameters()).items(): if value.requires_grad: if 'bias' in key: params += [{'params': [value], 'lr': lr * (cfg.TRAIN.DOUBLE_BIAS + 1), \ 'weight_decay': cfg.TRAIN.BIAS_DECAY and weight_decay or 0}] else: params += [{ 'params': [value], 'lr': lr, 'weight_decay': weight_decay }] optimizer = SGD(params, momentum=momentum) if args.use_cuda: model = model.cuda() model.train() iters_per_epoch = int(len(train_dataset) / args.batch_size) # start training for epoch in range(args.start_epoch, args.max_epochs + 1): loss_temp = 0 rpn_tp, rpn_tn, rpn_fg, rpn_bg = 0, 0, 0, 0 rcnn_tp, rcnn_tn, rcnn_fg, rcnn_bg = 0, 0, 0, 0 tic = time.time() train_data_iter = iter(train_dataloader) if epoch in args.decay_lrs: lr = lr * gamma adjust_learning_rate(optimizer, lr) print('adjust learning rate to {}'.format(lr)) for step in range(iters_per_epoch): im_data, gt_boxes, im_info = next(train_data_iter) if args.use_cuda: im_data = im_data.cuda() gt_boxes = gt_boxes.cuda() im_info = im_info.cuda() im_data_variable = Variable(im_data) output = model(im_data_variable, gt_boxes, im_info) rois, _, _, \ rcnn_cls_loss, rcnn_box_loss, \ rpn_cls_loss, rpn_box_loss, _train_info = output loss = rcnn_cls_loss.mean() + rcnn_box_loss.mean() +\ rpn_cls_loss.mean() + rpn_box_loss.mean() optimizer.zero_grad() loss.backward() optimizer.step() loss_temp += loss.item() if cfg.VERBOSE: rpn_tp += _train_info['rpn_tp'] rpn_tn += _train_info['rpn_tn'] rpn_fg += _train_info['rpn_num_fg'] rpn_bg += _train_info['rpn_num_bg'] rcnn_tp += _train_info['rcnn_tp'] rcnn_tn += _train_info['rcnn_tn'] rcnn_fg += _train_info['rcnn_num_fg'] rcnn_bg += _train_info['rcnn_num_bg'] if 
(step + 1) % args.display_interval == 0: toc = time.time() loss_temp /= args.display_interval rpn_cls_loss_v = rpn_cls_loss.mean().item() rpn_box_loss_v = rpn_box_loss.mean().item() rcnn_cls_loss_v = rcnn_cls_loss.mean().item() rcnn_box_loss_v = rcnn_box_loss.mean().item() print("[epoch %2d][step %4d/%4d] loss: %.4f, lr: %.2e, time cost %.1fs" \ % (epoch, step+1, iters_per_epoch, loss_temp, lr, toc - tic)) print("\t\t\t rpn_cls_loss_v: %.4f, rpn_box_loss_v: %.4f\n\t\t\t " "rcnn_cls_loss_v: %.4f, rcnn_box_loss_v: %.4f" \ % (rpn_cls_loss_v, rpn_box_loss_v, rcnn_cls_loss_v, rcnn_box_loss_v)) if cfg.VERBOSE: print('\t\t\t RPN : [FG/BG] [%d/%d], FG: %.4f, BG: %.4f' % (rpn_fg, rpn_bg, float(rpn_tp) / rpn_fg, float(rpn_tn) / rpn_bg)) print('\t\t\t RCNN: [FG/BG] [%d/%d], FG: %.4f, BG: %.4f' % (rcnn_fg, rcnn_bg, float(rcnn_tp) / rcnn_fg, float(rcnn_tn) / rcnn_bg)) if args.use_tfboard: n_iter = (epoch - 1) * iters_per_epoch + step + 1 writer.add_scalar('losses/loss', loss_temp, n_iter) writer.add_scalar('losses/rpn_cls_loss_v', rpn_cls_loss_v, n_iter) writer.add_scalar('losses/rpn_box_loss_v', rpn_box_loss_v, n_iter) writer.add_scalar('losses/rcnn_cls_loss_v', rcnn_cls_loss_v, n_iter) writer.add_scalar('losses/rcnn_box_loss_v', rcnn_box_loss_v, n_iter) if cfg.VERBOSE: writer.add_scalar('rpn/fg_acc', float(rpn_tp) / rpn_fg, n_iter) writer.add_scalar('rpn/bg_acc', float(rpn_tn) / rpn_bg, n_iter) writer.add_scalar('rcnn/fg_acc', float(rcnn_tp) / rcnn_fg, n_iter) writer.add_scalar('rcnn/bg_acc', float(rcnn_tn) / rcnn_bg, n_iter) loss_temp = 0 rpn_tp, rpn_tn, rpn_fg, rpn_bg = 0, 0, 0, 0 rcnn_tp, rcnn_tn, rcnn_fg, rcnn_bg = 0, 0, 0, 0 tic = time.time() if epoch % args.save_interval == 0: save_name = os.path.join( output_dir, 'faster_{}_epoch_{}.pth'.format(args.net, epoch)) torch.save({ 'model': model.state_dict(), 'epoch': epoch, 'lr': lr }, save_name)
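# `adjust_learning_rate` is called above but not defined in this snippet. A minimal
# sketch of one common variant, assuming the new base lr is simply written into every
# parameter group. Note that this overwrites the bias-specific multiplier configured
# when `params` was built, so a real implementation may prefer to scale each group
# by `gamma` instead of assigning the base lr.
def adjust_learning_rate(optimizer, lr):
    for param_group in optimizer.param_groups:
        param_group['lr'] = lr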
class TD3Agent(AgentType): """ Twin Delayed Deep Deterministic (TD3) Policy Gradient. Instead of popular Ornstein-Uhlenbeck (OU) process for noise this agent uses Gaussian noise. """ name = "TD3" def __init__(self, state_size: int, action_size: int, hidden_layers: Sequence[int] = (128, 128), actor_lr: float = 1e-3, critic_lr: float = 1e-3, noise_scale: float = 0.2, noise_sigma: float = 0.1, clip: Tuple[int, int] = (-1, 1), config=None, device=None, **kwargs): config = config if config is not None else dict() self.device = device if device is not None else DEVICE # Reason sequence initiation. self.hidden_layers = config.get('hidden_layers', hidden_layers) self.actor = ActorBody(state_size, action_size, hidden_layers=hidden_layers).to(self.device) self.critic = DoubleCritic(state_size, action_size, hidden_layers=hidden_layers).to(self.device) self.target_actor = ActorBody(state_size, action_size, hidden_layers=hidden_layers).to( self.device) self.target_critic = DoubleCritic(state_size, action_size, hidden_layers=hidden_layers).to( self.device) # Noise sequence initiation self.noise = GaussianNoise(shape=(action_size, ), mu=1e-8, sigma=noise_sigma, scale=noise_scale, device=device) # Target sequence initiation hard_update(self.target_actor, self.actor) hard_update(self.target_critic, self.critic) # Optimization sequence initiation. self.actor_optimizer = SGD(self.actor.parameters(), lr=actor_lr) self.critic_optimizer = SGD(self.critic.parameters(), lr=critic_lr) self.action_min = clip[0] self.action_max = clip[1] self.action_scale = config.get('action_scale', 1) self.gamma: float = float(config.get('gamma', 0.99)) self.tau: float = float(config.get('tau', 0.02)) self.batch_size: int = int(config.get('batch_size', 64)) self.buffer_size: int = int(config.get('buffer_size', int(1e6))) self.buffer = ReplayBuffer(self.batch_size, self.buffer_size) self.warm_up: int = int(config.get('warm_up', 0)) self.update_freq: int = int(config.get('update_freq', 1)) self.update_policy_freq: int = int(config.get('update_policy_freq', 1)) self.number_updates: int = int(config.get('number_updates', 1)) # Breath, my child. self.reset_agent() self.iteration = 0 def reset_agent(self) -> None: self.actor.reset_parameters() self.critic.reset_parameters() self.target_actor.reset_parameters() self.target_critic.reset_parameters() def act(self, obs, noise: float = 0.0): with torch.no_grad(): obs = torch.tensor(obs.astype(np.float32)).to(self.device) action = self.actor(obs) action += noise * self.noise.sample() return self.action_scale * torch.clamp( action, self.action_min, self.action_max).cpu().numpy().astype( np.float32) def target_act(self, obs, noise: float = 0.0): with torch.no_grad(): obs = torch.tensor(obs).to(self.device) action = self.target_actor(obs) + noise * self.noise.sample() return torch.clamp(action, self.action_min, self.action_max).cpu().numpy().astype( np.float32) def step(self, state, action, reward, next_state, done): self.iteration += 1 self.buffer.add(state=state, action=action, reward=reward, next_state=next_state, done=done) if self.iteration < self.warm_up: return if len(self.buffer) > self.batch_size and (self.iteration % self.update_freq) == 0: for _ in range(self.number_updates): # Note: Inside this there's a delayed policy update. # Every `update_policy_freq` it will learn `number_updates` times. 
self.learn(self.buffer.sample_sars()) def learn(self, samples): """Update the agent's critic and actor from a sampled batch of transitions.""" states, actions, rewards, next_states, dones = samples rewards = rewards.to(self.device) dones = dones.type(torch.int).to(self.device) states = states.to(self.device) next_states = next_states.to(self.device) actions = actions.to(self.device) self._update_value_function(states, actions, rewards, next_states, dones) if (self.iteration % self.update_policy_freq) == 0: self._update_policy(states) soft_update(self.target_actor, self.actor, self.tau) soft_update(self.target_critic, self.critic, self.tau) def _update_value_function(self, states, actions, rewards, next_states, dones): # critic loss next_actions = self.target_actor.act(next_states) Q_target_next = torch.min( *self.target_critic.act(next_states, next_actions)) Q_target = rewards + (self.gamma * Q_target_next * (1 - dones)) Q1_expected, Q2_expected = self.critic(states, actions) critic_loss = mse_loss(Q1_expected, Q_target) + mse_loss( Q2_expected, Q_target) # Minimize the loss self.critic_optimizer.zero_grad() critic_loss.backward() # torch.nn.utils.clip_grad_norm_(self.critic.parameters(), self.gradient_clip) self.critic_optimizer.step() self.critic_loss = critic_loss.item() def _update_policy(self, states): # Compute actor loss pred_actions = self.actor(states) actor_loss = -self.critic(states, pred_actions)[0].mean() self.actor_optimizer.zero_grad() actor_loss.backward() self.actor_optimizer.step() self.actor_loss = actor_loss.item() def describe_agent(self) -> Tuple[Any, Any, Any, Any]: """ Returns network's weights in order: Actor, TargetActor, Critic, TargetCritic """ return (self.actor.state_dict(), self.target_actor.state_dict(), self.critic.state_dict(), self.target_critic.state_dict()) def log_writer(self, writer, episode): writer.add_scalar("loss/actor", self.actor_loss, episode) writer.add_scalar("loss/critic", self.critic_loss, episode) def save_state(self, path: str): agent_state = dict( actor=self.actor.state_dict(), target_actor=self.target_actor.state_dict(), critic=self.critic.state_dict(), target_critic=self.target_critic.state_dict(), ) torch.save(agent_state, path) def load_state(self, path: str): agent_state = torch.load(path) self.actor.load_state_dict(agent_state['actor']) self.critic.load_state_dict(agent_state['critic']) self.target_actor.load_state_dict(agent_state['target_actor']) self.target_critic.load_state_dict(agent_state['target_critic'])
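# A hypothetical usage sketch for the agent above. The environment name, episode
# length and noise level are illustrative only, and the classic Gym API
# (reset() -> obs, step() -> (obs, reward, done, info)) is assumed.
import gym

env = gym.make('Pendulum-v1')
agent = TD3Agent(state_size=env.observation_space.shape[0],
                 action_size=env.action_space.shape[0],
                 clip=(-1, 1))

state = env.reset()
for t in range(1000):
    action = agent.act(state, noise=0.1)                  # Gaussian exploration noise
    next_state, reward, done, info = env.step(action)
    agent.step(state, action, reward, next_state, done)   # store transition, maybe learn
    state = env.reset() if done else next_state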
allcd = (neg_dist - pos_dist < margin).cpu().numpy().flatten() hard_triplets = np.where(allcd == 1) anc_hard_embedding = anc_embedding[hard_triplets].cuda() pos_hard_embedding = pos_embedding[hard_triplets].cuda() neg_hard_embedding = neg_embedding[hard_triplets].cuda() triplet_loss = TripletLoss(margin=margin).forward( anchor=anc_hard_embedding, positive=pos_hard_embedding, negative=neg_hard_embedding).cuda() triplet_loss_sum += triplet_loss.item() num_valid_training_triplets += len(anc_hard_embedding) optimizer_model.zero_grad() triplet_loss.backward() optimizer_model.step() avg_triplet_loss = 0 if ( num_valid_training_triplets == 0) else triplet_loss_sum / num_valid_training_triplets print( 'Epoch {}:\tAverage Triplet Loss: {:.4f}\tNumber of valid training triplets in epoch: {}' .format(epoch + 1, avg_triplet_loss, num_valid_training_triplets)) torch.save( { 'epoch': epoch, 'model_state_dict': net.state_dict(),
def train(cont=False): # for tensorboard tracking logger = get_logger() logger.info("(1) Initiating Training ... ") logger.info("Training on device: {}".format(device)) writer = SummaryWriter() # init model aux_layers = None if net == "SETR-PUP": aux_layers, model = get_SETR_PUP() elif net == "SETR-MLA": aux_layers, model = get_SETR_MLA() elif net == "TransUNet-Base": model = get_TransUNet_base() elif net == "TransUNet-Large": model = get_TransUNet_large() elif net == "UNet": model = UNet(CLASS_NUM) # prepare dataset cluster_model = get_clustering_model(logger) train_dataset = CityscapeDataset(img_dir=data_dir, img_dim=IMG_DIM, mode="train", cluster_model=cluster_model) valid_dataset = CityscapeDataset(img_dir=data_dir, img_dim=IMG_DIM, mode="val", cluster_model=cluster_model) train_loader = DataLoader(train_dataset, batch_size=batch_size, shuffle=True) valid_loader = DataLoader(valid_dataset, batch_size=batch_size, shuffle=False) logger.info("(2) Dataset Initiated. ") # optimizer epochs = epoch_num if epoch_num > 0 else iteration_num // len( train_loader) + 1 optim = SGD(model.parameters(), lr=lrate, momentum=momentum, weight_decay=wdecay) # optim = Adam(model.parameters(), lr=lrate) scheduler = lr_scheduler.MultiStepLR( optim, milestones=[int(epochs * fine_tune_ratio)], gamma=0.1) cur_epoch = 0 best_loss = float('inf') epochs_since_improvement = 0 # for continue training if cont: model, optim, cur_epoch, best_loss = load_ckpt_continue_training( best_ckpt_src, model, optim, logger) logger.info("Current best loss: {0}".format(best_loss)) with warnings.catch_warnings(): warnings.simplefilter("ignore") for i in range(cur_epoch): scheduler.step() else: model = nn.DataParallel(model) model = model.to(device) logger.info("(3) Model Initiated ... ") logger.info("Training model: {}".format(net) + ". Training Started.") # loss ce_loss = CrossEntropyLoss() if use_dice_loss: dice_loss = DiceLoss(CLASS_NUM) # loop over epochs iter_count = 0 epoch_bar = tqdm.tqdm(total=epochs, desc="Epoch", position=cur_epoch, leave=True) logger.info("Total epochs: {0}. Starting from epoch {1}.".format( epochs, cur_epoch + 1)) for e in range(epochs - cur_epoch): epoch = e + cur_epoch # Training. 
model.train() trainLossMeter = LossMeter() train_batch_bar = tqdm.tqdm(total=len(train_loader), desc="TrainBatch", position=0, leave=True) for batch_num, (orig_img, mask_img) in enumerate(train_loader): orig_img, mask_img = orig_img.float().to( device), mask_img.float().to(device) if net == "TransUNet-Base" or net == "TransUNet-Large": pred = model(orig_img) elif net == "SETR-PUP" or net == "SETR-MLA": if aux_layers is not None: pred, _ = model(orig_img) else: pred = model(orig_img) elif net == "UNet": pred = model(orig_img) loss_ce = ce_loss(pred, mask_img[:].long()) if use_dice_loss: loss_dice = dice_loss(pred, mask_img, softmax=True) loss = 0.5 * (loss_ce + loss_dice) else: loss = loss_ce # Backward Propagation, Update weight and metrics optim.zero_grad() loss.backward() optim.step() # update learning rate for param_group in optim.param_groups: orig_lr = param_group['lr'] param_group['lr'] = orig_lr * (1.0 - iter_count / iteration_num)**0.9 iter_count += 1 # Update loss trainLossMeter.update(loss.item()) # print status if (batch_num + 1) % print_freq == 0: status = 'Epoch: [{0}][{1}/{2}]\t' \ 'Loss {loss.val:.4f} ({loss.avg:.4f})\t'.format(epoch+1, batch_num+1, len(train_loader), loss=trainLossMeter) logger.info(status) # log loss to tensorboard if (batch_num + 1) % tensorboard_freq == 0: writer.add_scalar( 'Train_Loss_{0}'.format(tensorboard_freq), trainLossMeter.avg, epoch * (len(train_loader) / tensorboard_freq) + (batch_num + 1) / tensorboard_freq) train_batch_bar.update(1) writer.add_scalar('Train_Loss_epoch', trainLossMeter.avg, epoch) # Validation. model.eval() validLossMeter = LossMeter() valid_batch_bar = tqdm.tqdm(total=len(valid_loader), desc="ValidBatch", position=0, leave=True) with torch.no_grad(): for batch_num, (orig_img, mask_img) in enumerate(valid_loader): orig_img, mask_img = orig_img.float().to( device), mask_img.float().to(device) if net == "TransUNet-Base" or net == "TransUNet-Large": pred = model(orig_img) elif net == "SETR-PUP" or net == "SETR-MLA": if aux_layers is not None: pred, _ = model(orig_img) else: pred = model(orig_img) elif net == "UNet": pred = model(orig_img) loss_ce = ce_loss(pred, mask_img[:].long()) if use_dice_loss: loss_dice = dice_loss(pred, mask_img, softmax=True) loss = 0.5 * (loss_ce + loss_dice) else: loss = loss_ce # Update loss validLossMeter.update(loss.item()) # print status if (batch_num + 1) % print_freq == 0: status = 'Validation: [{0}][{1}/{2}]\t' \ 'Loss {loss.val:.4f} ({loss.avg:.4f})\t'.format(epoch+1, batch_num+1, len(valid_loader), loss=validLossMeter) logger.info(status) # log loss to tensorboard if (batch_num + 1) % tensorboard_freq == 0: writer.add_scalar( 'Valid_Loss_{0}'.format(tensorboard_freq), validLossMeter.avg, epoch * (len(valid_loader) / tensorboard_freq) + (batch_num + 1) / tensorboard_freq) valid_batch_bar.update(1) valid_loss = validLossMeter.avg writer.add_scalar('Valid_Loss_epoch', valid_loss, epoch) logger.info("Validation Loss of epoch [{0}/{1}]: {2}\n".format( epoch + 1, epochs, valid_loss)) # update optim scheduler scheduler.step() # save checkpoint is_best = valid_loss < best_loss best_loss_tmp = min(valid_loss, best_loss) if not is_best: epochs_since_improvement += 1 logger.info("Epochs since last improvement: %d\n" % (epochs_since_improvement, )) if epochs_since_improvement == early_stop_tolerance: break # early stopping. 
else: epochs_since_improvement = 0 state = { 'epoch': epoch, 'loss': best_loss_tmp, 'model_state_dict': model.state_dict(), 'optimizer_state_dict': optim.state_dict(), } torch.save(state, ckpt_src) logger.info("Checkpoint updated.") best_loss = best_loss_tmp epoch_bar.update(1) writer.close()
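# The training loop above rescales whatever lr is currently stored in each parameter
# group. As a point of comparison, a hypothetical helper that computes the standard
# polynomial ("poly") schedule directly from the initial lr, so the decay factor is
# never applied to an already-decayed value:
def poly_lr(base_lr, iter_count, max_iters, power=0.9):
    # lr(t) = base_lr * (1 - t / T) ** power
    return base_lr * (1.0 - iter_count / max_iters) ** power

# e.g. poly_lr(0.01, 0, 1000) == 0.01 and poly_lr(0.01, 500, 1000) ~= 0.0054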
def main(): if not os.path.exists(args.outdir): os.mkdir(args.outdir) device = torch.device("cuda") torch.cuda.set_device(args.gpu) logfilename = os.path.join(args.outdir, args.logname) init_logfile(logfilename, "epoch\ttime\tlr\ttrain loss\ttrain acc\ttestloss\ttest acc") log(logfilename, "Hyperparameter List") log(logfilename, "Epochs: {:}".format(args.epochs)) log(logfilename, "Learning Rate: {:}".format(args.lr)) log(logfilename, "Alpha: {:}".format(args.alpha)) log(logfilename, "Keep ratio: {:}".format(args.keep_ratio)) test_acc_list = [] for _ in range(args.round): traindir = os.path.join(args.data_train, 'train') valdir = os.path.join(args.data_val, 'val') normalize = transforms.Normalize(mean=[0.485, 0.456, 0.406], std=[0.229, 0.224, 0.225]) train_dataset = datasets.ImageFolder( traindir, transforms.Compose([ transforms.RandomResizedCrop(224), transforms.RandomHorizontalFlip(), transforms.ToTensor(), normalize, ])) train_sampler = None train_loader = torch.utils.data.DataLoader( train_dataset, batch_size=args.batch, shuffle=(train_sampler is None), num_workers=args.workers, pin_memory=True, sampler=train_sampler) test_loader = torch.utils.data.DataLoader( datasets.ImageFolder(valdir, transforms.Compose([ transforms.Resize(256), transforms.CenterCrop(224), transforms.ToTensor(), normalize, ])), batch_size=args.batch, shuffle=False, num_workers=args.workers, pin_memory=True) base_classifier = models.__dict__[args.arch](pretrained=True).cuda() print("Loaded the base_classifier") original_acc = model_inference(base_classifier, test_loader, device, display=True) log(logfilename, "Original Model Test Accuracy: {:.5}".format(original_acc)) print("Original Model Test Accuracy, ", original_acc) # Creating a fresh copy of network not affecting the original network. net = copy.deepcopy(base_classifier) net = net.to(device) # Generating the mask 'm' for layer in net.modules(): if isinstance(layer, nn.Linear) or isinstance(layer, nn.Conv2d): layer.weight_mask = nn.Parameter(torch.ones_like(layer.weight)) layer.weight.requires_grad = True layer.weight_mask.requires_grad = True # This is the monkey-patch overriding layer.forward to custom function. # layer.forward will pass nn.Linear with weights: 'w' and 'm' elementwised if isinstance(layer, nn.Linear): layer.forward = types.MethodType(mask_forward_linear, layer) if isinstance(layer, nn.Conv2d): layer.forward = types.MethodType(mask_forward_conv2d, layer) criterion = nn.CrossEntropyLoss().to(device) # I added Log Softmax layer to all architecture. optimizer = SGD(net.parameters(), lr=args.lr, momentum=args.momentum, weight_decay=0) # weight_decay = 0 for training the mask. sparsity, total = 0, 0 breakFlag = False net.train() # Training the mask with the training set. 
for epoch in range(100000): # if epoch % 5 == 0: print("Current epochs: ", epoch) print("Sparsity: {:}".format(sparsity)) log(logfilename, "Current epochs: {}".format(epoch)) log(logfilename, "Sparsity: {:}".format(sparsity)) for i, (inputs, targets) in enumerate(train_loader): inputs = inputs.cuda() targets = targets.cuda() reg_loss = 0 for layer in net.modules(): if isinstance(layer, nn.Conv2d) or isinstance(layer, nn.Linear): reg_loss += torch.norm(layer.weight_mask, p=1) outputs = net(inputs) loss = criterion(outputs, targets) + args.alpha * reg_loss # Computing gradient and do SGD optimizer.zero_grad() loss.backward() optimizer.step() # if i % 50000 == 0: # print("Entered 50000 loop") # log(logfilename, "Entered 50000 loop") sparsity, total = 0, 0 for layer in net.modules(): if isinstance(layer, nn.Linear) or isinstance(layer, nn.Conv2d): boolean_list = layer.weight_mask.data > 1e-3 sparsity += (boolean_list == 1).sum() total += layer.weight.numel() if i % 50 == 0: print("Current Epochs: {}, Current i: {}, Current Sparsity: {}".format(epoch, i, sparsity)) if sparsity <= total*args.keep_ratio: print("Current epochs breaking loop at {:}".format(epoch)) log(logfilename, "Current epochs breaking loop at {:}".format(epoch)) breakFlag = True break # if breakFlag == True: # break if breakFlag == True: break # This line allows to calculate the threshold to satisfy the keep_ratio. c_abs = [] for layer in net.modules(): if isinstance(layer, nn.Linear) or isinstance(layer, nn.Conv2d): c_abs.append(torch.abs(layer.weight_mask)) all_scores = torch.cat([torch.flatten(x) for x in c_abs]) num_params_to_keep = int(len(all_scores) * args.keep_ratio) threshold, _ = torch.topk(all_scores, num_params_to_keep, sorted=True) threshold = threshold[-1] print("Threshold found: ", threshold) keep_masks = [] for c in c_abs: keep_masks.append((c >= threshold).float()) print("Number of ones.", torch.sum(torch.cat([torch.flatten(x == 1) for x in keep_masks]))) # Updating the weight with elementwise product of update c. for layer in net.modules(): if isinstance(layer, nn.Linear) or isinstance(layer, nn.Conv2d): # We update the weight by elementwise multiplication between # weight 'w' and mask 'm'. layer.weight.data = layer.weight.data * layer.weight_mask.data layer.zeros = nn.Parameter(torch.zeros_like(layer.weight)) # Dummy parameter. layer.ones = nn.Parameter(torch.ones_like(layer.weight)) # Dummy parameter. layer.weight_mask.data = torch.where(torch.abs(layer.weight_mask) <= threshold, layer.zeros, layer.ones) # Updated weight_mask becomes the mask with element # 0 and 1 again. # Temporarily disabling the backprop for both 'w' and 'm'. layer.weight.requires_grad = False layer.weight_mask.requires_grad = False if isinstance(layer, nn.Linear): layer.forward = types.MethodType(mask_forward_linear, layer) if isinstance(layer, nn.Conv2d): layer.forward = types.MethodType(mask_forward_conv2d, layer) # -------------------------------- # We need to transfer the weight we learned from "net" to "base_classifier". 
for (layer1, layer2) in zip(base_classifier.modules(), net.modules()): if isinstance(layer1, (nn.Linear, nn.Conv2d)) or isinstance(layer2, (nn.Linear, nn.Conv2d)): layer1.weight.data = layer2.weight.data if layer1.bias != None: layer1.bias.data = layer2.bias.data layer1.bias.requires_grad = True layer1.weight.requires_grad = True torch.save(base_classifier.state_dict(), os.path.join(args.outdir, args.save_model)) base_classifier_acc = model_inference(base_classifier, test_loader, device, display=True) log(logfilename, "Weight Update Test Accuracy: {:.5}".format(base_classifier_acc)) print("Saved the finetune model.") for masks in keep_masks: masks = masks.data torch.save(keep_masks, os.path.join(args.outdir, args.keep_mask)) print("Saved the masking function.") log(logfilename, "Finished finding the mask. (FINETUNE)")
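# `mask_forward_linear` and `mask_forward_conv2d` are monkey-patched onto the layers
# above but are not defined in this snippet. A minimal sketch of what such forwards
# typically look like, assuming the learned mask is applied elementwise to the weights
# before the usual operation:
import torch.nn.functional as F

def mask_forward_linear(self, x):
    # nn.Linear forward with weights gated by the mask 'm'.
    return F.linear(x, self.weight * self.weight_mask, self.bias)

def mask_forward_conv2d(self, x):
    # nn.Conv2d forward with masked weights, reusing the layer's own conv settings.
    return F.conv2d(x, self.weight * self.weight_mask, self.bias,
                    self.stride, self.padding, self.dilation, self.groups)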
def train(train_source_iter: ForeverDataIterator, train_target_iter: ForeverDataIterator, model: ImageClassifier, adaptive_feature_norm: AdaptiveFeatureNorm, optimizer: SGD, epoch: int, args: argparse.Namespace): batch_time = AverageMeter('Time', ':3.1f') data_time = AverageMeter('Data', ':3.1f') cls_losses = AverageMeter('Cls Loss', ':3.2f') norm_losses = AverageMeter('Norm Loss', ':3.2f') src_feature_norm = AverageMeter('Source Feature Norm', ':3.2f') tgt_feature_norm = AverageMeter('Target Feature Norm', ':3.2f') cls_accs = AverageMeter('Cls Acc', ':3.1f') tgt_accs = AverageMeter('Tgt Acc', ':3.1f') progress = ProgressMeter(args.iters_per_epoch, [ batch_time, data_time, cls_losses, norm_losses, src_feature_norm, tgt_feature_norm, cls_accs, tgt_accs ], prefix="Epoch: [{}]".format(epoch)) # switch to train mode model.train() end = time.time() for i in range(args.iters_per_epoch): x_s, labels_s = next(train_source_iter) x_t, labels_t = next(train_target_iter) x_s = x_s.to(device) x_t = x_t.to(device) labels_s = labels_s.to(device) labels_t = labels_t.to(device) # measure data loading time data_time.update(time.time() - end) # compute output y_s, f_s = model(x_s) y_t, f_t = model(x_t) # classification loss cls_loss = F.cross_entropy(y_s, labels_s) # norm loss norm_loss = adaptive_feature_norm(f_s) + adaptive_feature_norm(f_t) loss = cls_loss + norm_loss * args.trade_off_norm # using entropy minimization if args.trade_off_entropy: y_t = F.softmax(y_t, dim=1) entropy_loss = entropy(y_t, reduction='mean') loss += entropy_loss * args.trade_off_entropy # compute gradient and do SGD step optimizer.zero_grad() loss.backward() optimizer.step() # update statistics cls_acc = accuracy(y_s, labels_s)[0] tgt_acc = accuracy(y_t, labels_t)[0] cls_losses.update(cls_loss.item(), x_s.size(0)) norm_losses.update(norm_loss.item(), x_s.size(0)) src_feature_norm.update( f_s.norm(p=2, dim=1).mean().item(), x_s.size(0)) tgt_feature_norm.update( f_t.norm(p=2, dim=1).mean().item(), x_s.size(0)) cls_accs.update(cls_acc.item(), x_s.size(0)) tgt_accs.update(tgt_acc.item(), x_s.size(0)) # measure elapsed time batch_time.update(time.time() - end) end = time.time() if i % args.print_freq == 0: progress.display(i)
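# `AdaptiveFeatureNorm` follows the Stepwise Adaptive Feature Norm (SAFN) idea: nudge
# the L2 norm of every feature vector a fixed step `delta` beyond its current value.
# A minimal sketch of such a module (the actual class used above may differ in details):
import torch
import torch.nn as nn

class AdaptiveFeatureNormSketch(nn.Module):
    def __init__(self, delta: float = 1.0):
        super().__init__()
        self.delta = delta

    def forward(self, f: torch.Tensor) -> torch.Tensor:
        # Detached norms act as the moving target; penalize the squared gap between
        # the current norm and (previous norm + delta).
        radius = f.norm(p=2, dim=1).detach()
        return ((f.norm(p=2, dim=1) - (radius + self.delta)) ** 2).mean()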
def train(): # Load data data_set_path = path.join(path.abspath(path.dirname(__file__)), '../resources/eclipse-data-set.csv') data = np.genfromtxt(data_set_path, delimiter=';', skip_header=1, usecols=[1, 2, 3, 4, 5, 6]) np.random.shuffle(data) n_rows = data.shape[0] train_rows = int(n_rows*0.8) test_rows = int((n_rows - train_rows)/2) # x_input = Variable(Tensor(data[:, 3].reshape((-1, 1)))) # third column is linear entropy # x_input = Variable(Tensor([[1.0], [2.0], [3.0]])) # y_truth = Variable(Tensor([[2.0], [4.0], [6.0]])) epochs = 2001 criterion = MSELoss() entropy_map = { 1: 'full_not_decayed', 2: 'weighted_not_decayed', 3: 'full_linear_decayed', 4: 'full_log_decayed', 5: 'full_exp_decayed' } learing_rate_map = { 1: (0.0000005, 0.000001), 2: (0.0001, 0.005), 3: (0.0001, 0.005), 4: (0.0001, 0.005), 5: (0.0001, 0.005) } for entropy_col, entropy_type in entropy_map.items(): y_train = Variable(Tensor(data[:train_rows, 0].reshape((-1, 1)))) # first column is number of bugs x_train = Variable(Tensor(data[:train_rows, entropy_col].reshape((-1, 1)))) # third column is weighted entropy y_val = Variable(Tensor(data[train_rows+1:train_rows+test_rows, 0].reshape((-1, 1)))) # first column is number of bugs x_val = Variable(Tensor(data[train_rows+1:train_rows+test_rows, entropy_col].reshape((-1, 1)))) # third column is weighted entropy y_test = Variable(Tensor(data[train_rows+test_rows+1:, 0].reshape((-1, 1)))) # first column is number of bugs x_test = Variable(Tensor(data[train_rows+test_rows+1:, entropy_col].reshape((-1, 1)))) # third column is weighted entropy model_file_name = entropy_type + '_hcm' model_dir = path.normpath(path.join(path.abspath(path.dirname(__file__)), '../resources/models')) model_file_path = path.join(model_dir, model_file_name + '.pt') learing_rates = np.linspace(*learing_rate_map[entropy_col]) # Try to load model old_model = LinearRegressionModel(1, 1) try: old_model.load_state_dict(load(model_file_path)) except FileNotFoundError: print('File="{}" was not found. 
Create new model.'.format(model_file_path)) y_test_pred_old = old_model(x_test) y_test_loss_old = criterion(y_test_pred_old, y_test) for learing_rate in learing_rates: train_loss_list = [] val_loss_list = [] test_loss_list = [] print('Train for entropy type="{0}" and learning rate="{1}"'.format(entropy_type, learing_rate)) model = LinearRegressionModel(1, 1) optimizer = SGD(model.parameters(), lr=learing_rate) for epoch in range(epochs): # Forward pass: Compute predicted y by passing # x to the model y_predicted = model(x_train) # Compute and print loss train_loss = criterion(y_predicted, y_train) train_loss_list.append(float(train_loss.data)) # Val loss y_val_pred = model(x_val) val_loss = criterion(y_val_pred, y_val) val_loss_list.append(float(val_loss.data)) # Zero optimizer.zero_grad() train_loss.backward() optimizer.step() y_test_pred = model(x_test) test_loss = criterion(y_test_pred, y_test) test_loss_list.append(float(test_loss.data)) if epoch % 50 == 0: print('epoch {0}, loss {1}'.format(epoch, train_loss.data)) if test_loss.data < y_test_loss_old.data: print('Save model with prediction error {}.'.format(test_loss)) save(model.state_dict(), model_file_path) meta_data = { 'history_complexity_metric': entropy_type, 'learning_rate': learing_rate, 'val_loss': val_loss_list, 'train_loss': train_loss_list, 'test_loss': test_loss_list } with open(path.join(model_dir, model_file_name + '_meta.json'), 'w') as write_file: json.dump(meta_data, write_file, indent=4) y_test_loss_old = test_loss # Compute and print loss test_loss = criterion(y_test_pred, y_test) print('Test loss {0}'.format(test_loss.data)) train_plt, = plt.plot(range(epochs), train_loss_list, label='Train Loss') val_plt, = plt.plot(range(epochs), val_loss_list, label='Validation Loss') plt.xlabel('Number of Epochs') plt.ylabel('Loss') plt.legend(handles=[train_plt, val_plt]) plt.show()
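# `LinearRegressionModel(1, 1)` is instantiated above but not defined in this snippet.
# A minimal sketch of the single-layer module it presumably wraps:
import torch.nn as nn

class LinearRegressionModel(nn.Module):
    def __init__(self, in_features, out_features):
        super().__init__()
        self.linear = nn.Linear(in_features, out_features)

    def forward(self, x):
        # One affine map: y_hat = w * x + b
        return self.linear(x)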
def train(start_path, beta): # prepare hyper-parameters seed = 42 cuda_enabled = True cuda_deterministic = False batch_size = 2048 num_workers = 2 shared = False stochastic = False kkt_momentum = 0.0 create_graph = False grad_correction = False shift = 0.0 tol = 1e-5 damping = 0.1 maxiter = 50 lr = 0.1 momentum = 0.0 weight_decay = 0.0 num_steps = 10 verbose = False # prepare path ckpt_name = start_path.name.split('.')[0] root_path = Path(__file__).resolve().parent dataset_path = root_path / 'MultiMNIST' ckpt_path = root_path / 'cpmtl' / ckpt_name if not start_path.is_file(): raise RuntimeError('Pareto solutions not found.') root_path.mkdir(parents=True, exist_ok=True) dataset_path.mkdir(parents=True, exist_ok=True) ckpt_path.mkdir(parents=True, exist_ok=True) # fix random seed random.seed(seed) np.random.seed(seed) torch.manual_seed(seed) if cuda_enabled and torch.cuda.is_available(): torch.cuda.manual_seed_all(seed) # prepare device if cuda_enabled and torch.cuda.is_available(): import torch.backends.cudnn as cudnn device = torch.device('cuda') if cuda_deterministic: cudnn.benchmark = False cudnn.deterministic = True else: cudnn.benchmark = True else: device = torch.device('cpu') # prepare dataset transform = transforms.Compose( [transforms.ToTensor(), transforms.Normalize((0.1307, ), (0.3081, ))]) trainset = MultiMNIST(dataset_path, train=True, download=True, transform=transform) trainloader = torch.utils.data.DataLoader(trainset, batch_size=batch_size, shuffle=True, num_workers=num_workers) testset = MultiMNIST(dataset_path, train=False, download=True, transform=transform) testloader = torch.utils.data.DataLoader(testset, batch_size=batch_size, shuffle=False, num_workers=num_workers) # prepare network network = MultiLeNet() network.to(device) # initialize network start_ckpt = torch.load(start_path, map_location='cpu') network.load_state_dict(start_ckpt['state_dict']) # prepare losses criterion = F.cross_entropy closures = [ lambda n, l, t: criterion(l[0], t[:, 0]), lambda n, l, t: criterion(l[1], t[:, 1]) ] # prepare HVP solver hvp_solver = VisionHVPSolver(network, device, trainloader, closures, shared=shared) hvp_solver.set_grad(batch=False) hvp_solver.set_hess(batch=True) # prepare KKT solver kkt_solver = MINRESKKTSolver(network, hvp_solver, device, stochastic=stochastic, kkt_momentum=kkt_momentum, create_graph=create_graph, grad_correction=grad_correction, shift=shift, tol=tol, damping=damping, maxiter=maxiter) # prepare optimizer optimizer = SGD(network.parameters(), lr=lr, momentum=momentum, weight_decay=weight_decay) # first evaluation losses, tops = evaluate(network, testloader, device, closures, f'{ckpt_name}') # prepare utilities top_trace = TopTrace(len(closures)) top_trace.print(tops, show=False) beta = beta.to(device) # training for step in range(1, num_steps + 1): network.train(True) optimizer.zero_grad() kkt_solver.backward(beta, verbose=verbose) optimizer.step() losses, tops = evaluate(network, testloader, device, closures, f'{ckpt_name}: {step}/{num_steps}') top_trace.print(tops) ckpt = { 'state_dict': network.state_dict(), 'optimizer': optimizer.state_dict(), 'beta': beta, } record = {'losses': losses, 'tops': tops} ckpt['record'] = record torch.save(ckpt, ckpt_path / f'{step:d}.pth') hvp_solver.close()
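# The two `closures` above compute one cross-entropy per MultiMNIST task: the network
# is assumed to return two logit tensors (one per digit) and the targets to carry two
# label columns. A tiny standalone illustration with random data:
import torch
import torch.nn.functional as F

logits = [torch.randn(8, 10), torch.randn(8, 10)]   # two task heads, batch of 8
targets = torch.randint(0, 10, (8, 2))              # two label columns
task_losses = [F.cross_entropy(logits[0], targets[:, 0]),
               F.cross_entropy(logits[1], targets[:, 1])]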
def train(train_source_iter: ForeverDataIterator, train_target_iter: ForeverDataIterator, model: ImageClassifier, domain_adv_D: DomainAdversarialLoss, domain_adv_D_0: DomainAdversarialLoss, importance_weight_module, optimizer: SGD, lr_scheduler: LambdaLR, epoch: int, args: argparse.Namespace): batch_time = AverageMeter('Time', ':5.2f') data_time = AverageMeter('Data', ':5.2f') losses = AverageMeter('Loss', ':6.2f') cls_accs = AverageMeter('Cls Acc', ':3.1f') tgt_accs = AverageMeter('Tgt Acc', ':3.1f') domain_accs_D = AverageMeter('Domain Acc for D', ':3.1f') domain_accs_D_0 = AverageMeter('Domain Acc for D_0', ':3.1f') partial_classes_weights = AverageMeter('Partial Weight', ':3.2f') non_partial_classes_weights = AverageMeter('Non-Partial Weight', ':3.2f') progress = ProgressMeter(args.iters_per_epoch, [ batch_time, data_time, losses, cls_accs, tgt_accs, domain_accs_D, domain_accs_D_0, partial_classes_weights, non_partial_classes_weights ], prefix="Epoch: [{}]".format(epoch)) # switch to train mode model.train() domain_adv_D.train() domain_adv_D_0.train() end = time.time() for i in range(args.iters_per_epoch): x_s, labels_s = next(train_source_iter) x_t, labels_t = next(train_target_iter) x_s = x_s.to(device) x_t = x_t.to(device) labels_s = labels_s.to(device) labels_t = labels_t.to(device) # measure data loading time data_time.update(time.time() - end) # compute output x = torch.cat((x_s, x_t), dim=0) y, f = model(x) y_s, y_t = y.chunk(2, dim=0) f_s, f_t = f.chunk(2, dim=0) # classification loss cls_loss = F.cross_entropy(y_s, labels_s) # domain adversarial loss for D adv_loss_D = domain_adv_D(f_s.detach(), f_t.detach()) # get importance weights w_s = importance_weight_module.get_importance_weight(f_s) # domain adversarial loss for D_0 adv_loss_D_0 = domain_adv_D_0(f_s, f_t, w_s=w_s) # entropy loss y_t = F.softmax(y_t, dim=1) entropy_loss = entropy(y_t, reduction='mean') loss = cls_loss + 1.5 * args.trade_off * adv_loss_D + \ args.trade_off * adv_loss_D_0 + args.gamma * entropy_loss # compute gradient and do SGD step optimizer.zero_grad() loss.backward() optimizer.step() lr_scheduler.step() cls_acc = accuracy(y_s, labels_s)[0] tgt_acc = accuracy(y_t, labels_t)[0] losses.update(loss.item(), x_s.size(0)) cls_accs.update(cls_acc.item(), x_s.size(0)) tgt_accs.update(tgt_acc.item(), x_s.size(0)) domain_accs_D.update(domain_adv_D.domain_discriminator_accuracy, x_s.size(0)) domain_accs_D_0.update(domain_adv_D_0.domain_discriminator_accuracy, x_s.size(0)) # debug: output class weight averaged on the partial classes and non-partial classes respectively partial_class_weight, non_partial_classes_weight = \ importance_weight_module.get_partial_classes_weight(w_s, labels_s) partial_classes_weights.update(partial_class_weight.item(), x_s.size(0)) non_partial_classes_weights.update(non_partial_classes_weight.item(), x_s.size(0)) batch_time.update(time.time() - end) end = time.time() if i % args.print_freq == 0: progress.display(i)
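# `importance_weight_module.get_importance_weight` is used above but not shown. In
# partial-domain-adaptation methods of this kind the weight usually comes from the
# non-adversarial discriminator D: source samples that D separates from the target too
# easily are down-weighted. A rough, hypothetical sketch (not the library's actual code):
import torch

def get_importance_weight(domain_discriminator, f_s):
    with torch.no_grad():
        d = domain_discriminator(f_s)      # predicted probability of "source"
        w = 1.0 - d                        # confidently-source samples get low weight
        w = w / (w.mean() + 1e-6)          # normalize to mean ~1 for a stable scale
    return w.detach()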
class ELECTRATrainer: """ ELECTRATrainer make the pretrained ELECTRA model """ def __init__(self, electra: AttentionModel, vocab_size: int, train_dataloader: DataLoader, train_orig_dataloader: DataLoader, test_dataloader: DataLoader = None, lr: float = 1e-4, betas=(0.9, 0.999), weight_decay: float = 0.01, warmup_steps=10000, with_cuda: bool = True, cuda_devices=None, log_freq: int = 100, log_file=None, freeze_embed=0, class_weights=None): """ :param electra: ELECTRA model which you want to train :param vocab_size: total word vocab size :param train_dataloader: train dataset data loader :param test_dataloader: test dataset data loader [can be None] :param lr: learning rate of optimizer :param betas: Adam optimizer betas :param weight_decay: Adam optimizer weight decay param :param with_cuda: traning with cuda :param log_freq: logging frequency of the batch iteration """ self.softmax = torch.nn.Softmax() # Setup cuda device for ELECTRA training, argument -c, --cuda should be true cuda_condition = torch.cuda.is_available() and with_cuda self.device = torch.device("cuda:0" if cuda_condition else "cpu") self.hardware = "cuda" if cuda_condition else "cpu" self.freeze_embed = freeze_embed self.electra = electra self.electra = self.electra.to(self.device) self.electra = self.electra.float() #pdb.set_trace() self.loss = nn.MSELoss() #self.loss = nn.CrossEntropyLoss() self.loss.to(self.device) #pdb.set_trace() # Distributed GPU training if CUDA can detect more than 1 GPU if with_cuda and torch.cuda.device_count() > 1: print("Using %d GPUS for ELECTRA" % torch.cuda.device_count()) self.electra = nn.DataParallel(self.electra, device_ids=cuda_devices) self.hardware = "parallel" # Setting the train and test data loader self.train_data = train_dataloader self.train_orig_data = train_orig_dataloader self.test_data = test_dataloader # Setting the Adam optimizer with hyper-param #self.optim = Adam(self.model.parameters(), lr=lr, betas=betas, weight_decay=weight_decay) #self.optim_schedule = ScheduledOptim(self.optim, self.electra.hidden, n_warmup_steps=warmup_steps) if freeze_embed == 1: if self.hardware == "parallel": self.electra.module.embed_layer.weight.requires_grad = False else: self.electra.embed_layer.weight.requires_grad = False elif freeze_embed == 2: self.freeze_embed_idx = torch.arange(26726, dtype=torch.long).to( self.device) self.optim = SGD([ param for param in self.electra.parameters() if param.requires_grad == True ], lr=lr, momentum=0.9) self.log_freq = log_freq # clear log file if log_file: self.log_file = log_file with open(self.log_file, "w+") as f: f.write( "EPOCH,MODE, AVG LOSS, TOTAL CORRECT, TOTAL ELEMENTS, ACCURACY, AUC, AUPR, TOTAL POSITIVE CORRECT, TOTAL POSITIVE, ACCURACY\n" ) print("Total Parameters:", sum([p.nelement() for p in self.electra.parameters()])) @staticmethod def calc_auc(y_true, y_probas, show_plot=False): fpr, tpr, thresholds = metrics.roc_curve(y_true, y_probas, pos_label=1) auc_score = metrics.auc(fpr, tpr) if show_plot: plt.figure() plt.plot(fpr, tpr) plt.xlim([0.0, 1.0]) plt.ylim([0.0, 1.05]) plt.xlabel('False Positive Rate') plt.ylabel('True Positive Rate') plt.title('Receiver operating characteristic') plt.show() return auc_score @staticmethod def calc_aupr(y_true, y_probas, show_plot=False): precision, recall, thresholds = metrics.precision_recall_curve( y_true, y_probas, pos_label=1) aupr_score = metrics.auc(recall, precision) if show_plot: plt.figure() plt.plot(recall, precision) plt.xlim([0.0, 1.0]) plt.ylim([0.0, 1.05]) plt.xlabel('Recall') 
plt.ylabel('Precision') plt.title('Receiver operating characteristic') plt.show() return aupr_score def train(self, epoch): self.electra.train() self.iteration(epoch, self.train_data, True, "train") def train_orig_dist(self, epoch): self.electra.eval() self.iteration(epoch, self.train_orig_data, False, "train_orig") def test(self, epoch): self.electra.eval() self.iteration(epoch, self.test_data, False, "test") def iteration(self, epoch, data_loader, train, str_code): """ loop over the data_loader for training or testing if on train status, backward operation is activated and also auto save the model every peoch :param epoch: current epoch index :param data_loader: torch.utils.data.DataLoader for iteration :param train: boolean value of is train or test :return: None """ # Setting the tqdm progress bar data_iter = tqdm.tqdm(enumerate(data_loader), desc="EP_%s:%d" % (str_code, epoch), total=len(data_loader), bar_format="{l_bar}{r_bar}") cumulative_loss = 0.0 total_correct = 0 total_samples = 0 total_positive_correct = 0 total_positive = 0 all_scores = [] all_labels = [] for i, data in data_iter: #pdb.set_trace() #print(i) all_labels.append(data["electra_label"]) # 0. batch_data will be sent into the device(GPU or cpu) data = {key: value.to(self.device) for key, value in data.items()} #create attention mask #pdb.set_trace() zero_boolean = torch.eq(data["species_frequencies"], 0) mask = torch.ones(zero_boolean.shape, dtype=torch.float).to(self.device) mask = mask.masked_fill(zero_boolean, 0) # 1. forward the next_sentence_prediction and masked_lm model scores = self.electra.forward(data["electra_input"], mask) # 3. backward and optimization only in train #for mse loss = self.loss(scores, data["electra_label"].float()) #for cross entropy #loss = self.loss(scores,data["electra_label"].squeeze()) if train: #self.optim_schedule.zero_grad() #pdb.set_trace() self.optim.zero_grad() loss.backward() #self.optim_schedule.step_and_update_lr() if self.freeze_embed == 2: if self.hardware == "parallel": self.electra.module.embed_layer.weight.grad[ self.freeze_embed_idx] = 0 else: self.electra.embed_layer.weight.grad[ self.freeze_embed_idx] = 0 self.optim.step() all_scores.append(scores.detach().cpu()) #for MSE predictions = scores >= 0.5 #for Cross Entropy #predictions = scores.max(1).indices #predictions = predictions.unsqueeze(0).reshape(data["electra_label"].shape) #get accuracy for all tokens total_correct += torch.sum( predictions == data["electra_label"]).item() total_samples += data["electra_input"].shape[0] positive_inds = data["electra_label"].nonzero(as_tuple=True) total_positive_correct += torch.sum( predictions[positive_inds] == data["electra_label"] [positive_inds]).item() total_positive += data["electra_label"].nonzero().shape[0] log_loss = 0 if self.hardware == "parallel": cumulative_loss += loss.sum().item() log_loss = loss.sum().item() else: cumulative_loss += loss.item() log_loss = loss.item() if i % self.log_freq == 0: if total_positive > 0: data_iter.write( "epoch: {}, iter: {}, avg loss: {},accuracy: {}/{}={:.2f}%,pos accuracy: {}/{}={:.2f}%, loss: {}" .format(epoch, i, cumulative_loss / (i + 1), total_correct, total_samples, total_correct / total_samples * 100, total_positive_correct, total_positive, total_positive_correct / total_positive * 100, log_loss)) else: data_iter.write( "epoch: {}, iter: {}, avg loss: {},accuracy: {}/{}={:.2f}%,pos accuracy: 0/0, loss: {}" .format(epoch, i, cumulative_loss / (i + 1), total_correct, total_samples, total_correct / total_samples * 100, 
log_loss)) del data del mask del loss del scores del predictions del positive_inds #for MSE auc_score = ELECTRATrainer.calc_auc( torch.cat(all_labels).flatten().numpy(), torch.cat(all_scores).flatten().numpy()) aupr_score = ELECTRATrainer.calc_aupr( torch.cat(all_labels).flatten().numpy(), torch.cat(all_scores).flatten().numpy()) #for Cross entropy #auc_score = ELECTRATrainer.calc_auc(torch.cat(all_labels).flatten().numpy(),torch.cat(all_scores)[:,1].numpy()) #aupr_score = ELECTRATrainer.calc_aupr(torch.cat(all_labels).flatten().numpy(),torch.cat(all_scores)[:,1].numpy()) print("EP{}_{}, avg_loss={}, accuracy={:.2f}%".format( epoch, str_code, cumulative_loss / len(data_iter), total_correct / total_samples * 100)) if self.log_file: with open(self.log_file, "a") as f: f.write("{},{},{},{},{},{},{},{},{},{},{}\n".format( epoch, str_code, cumulative_loss / len(data_iter), total_correct, total_samples, total_correct / total_samples * 100, auc_score, aupr_score, total_positive_correct, total_positive, total_positive_correct / total_positive * 100)) def save(self, epoch, file_path): """ Saving the current ELECTRA model on file_path :param epoch: current epoch number :param file_path: model output directory """ output_file_path = file_path + "_epoch{}".format(epoch) if self.hardware == "parallel": #pdb.set_trace() self.electra.module.discriminator.save_pretrained( output_file_path + "_disc") torch.save(self.electra.module.embed_layer.state_dict(), output_file_path + "_embed") else: self.electra.discriminator.save_pretrained(output_file_path + "_disc") torch.save(self.electra.embed_layer.state_dict(), output_file_path + "_embed")
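# A hypothetical driver loop for the trainer above. The model, the three DataLoaders
# and the paths are assumed to exist; names are illustrative only.
trainer = ELECTRATrainer(electra_model, vocab_size=vocab_size,
                         train_dataloader=train_loader,
                         train_orig_dataloader=train_orig_loader,
                         test_dataloader=test_loader,
                         lr=1e-4, log_file="electra_log.csv")

for epoch in range(10):
    trainer.train(epoch)                         # one optimization pass
    if test_loader is not None:
        trainer.test(epoch)                      # evaluation pass, no backward
    trainer.save(epoch, "checkpoints/electra")   # writes the *_disc and *_embed files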
def train(class_num, epoch_num, config, x_train, y_train, x_val, y_val, seed=32): epoch_num = int(epoch_num) print(epoch_num, config) train_batch_size = config['train_batch_size'] init_lr = config['init_lr'] lr_decay_factor = config['lr_decay_factor'] weight_decay = config['weight_decay'] momentum = config['momentum'] nesterov = True if config['nesterov'] == 'True' else False from torchvision.models.resnet import resnet18 model = resnet18(num_classes=class_num).to(gpu_device) x_train = np.transpose(x_train, (0, 3, 1, 2)) x_val = np.transpose(x_val, (0, 3, 1, 2)) x_train_data = torch.from_numpy(x_train) y_train_data = torch.from_numpy(y_train) train_dataset = TensorDataset(x_train_data, y_train_data) x_val_data = torch.from_numpy(x_val) y_val_data = torch.from_numpy(y_val) val_dataset = TensorDataset(x_val_data, y_val_data) trainloader = DataLoader(train_dataset, batch_size=train_batch_size, num_workers=5, shuffle=True) validloader = DataLoader(val_dataset, batch_size=100, num_workers=5, shuffle=False) optimizer = SGD(params=model.parameters(), lr=init_lr, momentum=momentum, weight_decay=weight_decay, nesterov=nesterov) scheduler = MultiStepLR( optimizer, milestones=[int(epoch_num / 2), int(epoch_num * 3 / 4)], gamma=lr_decay_factor) loss_func = nn.CrossEntropyLoss() for epoch_id in range(epoch_num): model.train() # print('Current learning rate: %.5f' % optimizer.state_dict()['param_groups'][0]['lr']) epoch_avg_loss = 0 epoch_avg_acc = 0 val_avg_loss = 0 val_avg_acc = 0 num_train_samples = 0 num_val_samples = 0 for i, data in enumerate(trainloader): batch_x, batch_y = data[0].float(), data[1].long() num_train_samples += len(batch_x) logits = model(batch_x.float().to(gpu_device)) loss = loss_func(logits, batch_y.to(gpu_device)) optimizer.zero_grad() loss.backward() optimizer.step() epoch_avg_loss += loss.to('cpu').detach() * len(batch_x) prediction = np.argmax(logits.to('cpu').detach().numpy(), axis=-1) epoch_avg_acc += accuracy_score( prediction, batch_y.to('cpu').detach().numpy()) * len(batch_x) epoch_avg_loss /= num_train_samples epoch_avg_acc /= num_train_samples print('Epoch %d: Train loss %.4f, train acc %.4f' % (epoch_id, epoch_avg_loss, epoch_avg_acc)) if validloader is not None: model.eval() with torch.no_grad(): for i, data in enumerate(validloader): batch_x, batch_y = data[0].float(), data[1].long() logits = model(batch_x.float().to(gpu_device)) val_loss = loss_func(logits, batch_y.to(gpu_device)) num_val_samples += len(batch_x) val_avg_loss += val_loss.to('cpu').detach() * len(batch_x) prediction = np.argmax(logits.to('cpu').detach().numpy(), axis=-1) val_avg_acc += accuracy_score( prediction, batch_y.to('cpu').detach().numpy()) * len(batch_x) val_avg_loss /= num_val_samples val_avg_acc /= num_val_samples print('Epoch %d: Val loss %.4f, val acc %.4f' % (epoch_id, val_avg_loss, val_avg_acc)) scheduler.step() return 1 - val_avg_acc
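# Since the function above returns (1 - validation accuracy), it reads like an
# objective for a hyper-parameter search. A hypothetical call with a hand-written
# configuration (the arrays x_train/y_train/x_val/y_val are assumed to exist):
config = {
    'train_batch_size': 128,
    'init_lr': 0.1,
    'lr_decay_factor': 0.1,
    'weight_decay': 5e-4,
    'momentum': 0.9,
    'nesterov': 'True',   # note: train() parses this flag from a string
}
val_error = train(class_num=10, epoch_num=50, config=config,
                  x_train=x_train, y_train=y_train, x_val=x_val, y_val=y_val)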
def train(model, state, path, annotations, val_path, val_annotations, resize, max_size, jitter, batch_size, iterations, val_iterations, mixed_precision, lr, warmup, milestones, gamma, rank=0, world=1, no_apex=False, use_dali=True, verbose=True, metrics_url=None, logdir=None, rotate_augment=False, augment_brightness=0.0, augment_contrast=0.0, augment_hue=0.0, augment_saturation=0.0, regularization_l2=0.0001, rotated_bbox=False, absolute_angle=False): 'Train the model on the given dataset' # Prepare model nn_model = model stride = model.stride model = convert_fixedbn_model(model) if torch.cuda.is_available(): model = model.to(memory_format=torch.channels_last).cuda() # Setup optimizer and schedule optimizer = SGD(model.parameters(), lr=lr, weight_decay=regularization_l2, momentum=0.9) is_master = rank == 0 if not no_apex: loss_scale = "dynamic" if use_dali else "128.0" model, optimizer = amp.initialize( model, optimizer, opt_level='O2' if mixed_precision else 'O0', keep_batchnorm_fp32=True, loss_scale=loss_scale, verbosity=is_master) if world > 1: model = DDP(model, device_ids=[rank]) if no_apex else ADDP(model) model.train() if 'optimizer' in state: optimizer.load_state_dict(state['optimizer']) def schedule(train_iter): if warmup and train_iter <= warmup: return 0.9 * train_iter / warmup + 0.1 return gamma**len([m for m in milestones if m <= train_iter]) scheduler = LambdaLR(optimizer, schedule) if 'scheduler' in state: scheduler.load_state_dict(state['scheduler']) # Prepare dataset if verbose: print('Preparing dataset...') if rotated_bbox: if use_dali: raise NotImplementedError( "This repo does not currently support DALI for rotated bbox detections." ) data_iterator = RotatedDataIterator( path, jitter, max_size, batch_size, stride, world, annotations, training=True, rotate_augment=rotate_augment, augment_brightness=augment_brightness, augment_contrast=augment_contrast, augment_hue=augment_hue, augment_saturation=augment_saturation, absolute_angle=absolute_angle) else: data_iterator = (DaliDataIterator if use_dali else DataIterator)( path, jitter, max_size, batch_size, stride, world, annotations, training=True, rotate_augment=rotate_augment, augment_brightness=augment_brightness, augment_contrast=augment_contrast, augment_hue=augment_hue, augment_saturation=augment_saturation) if verbose: print(data_iterator) if verbose: print(' device: {} {}'.format( world, 'cpu' if not torch.cuda.is_available() else 'GPU' if world == 1 else 'GPUs')) print(' batch: {}, precision: {}'.format( batch_size, 'mixed' if mixed_precision else 'full')) print(' BBOX type:', 'rotated' if rotated_bbox else 'axis aligned') print('Training model for {} iterations...'.format(iterations)) # Create TensorBoard writer if is_master and logdir is not None: from torch.utils.tensorboard import SummaryWriter if verbose: print('Writing TensorBoard logs to: {}'.format(logdir)) writer = SummaryWriter(log_dir=logdir) scaler = GradScaler() profiler = Profiler(['train', 'fw', 'bw']) iteration = state.get('iteration', 0) while iteration < iterations: cls_losses, box_losses = [], [] for i, (data, target) in enumerate(data_iterator): if iteration >= iterations: break # Forward pass profiler.start('fw') optimizer.zero_grad() if not no_apex: cls_loss, box_loss = model([ data.contiguous(memory_format=torch.channels_last), target ]) else: with autocast(): cls_loss, box_loss = model([ data.contiguous(memory_format=torch.channels_last), target ]) del data profiler.stop('fw') # Backward pass profiler.start('bw') if not no_apex: with 
amp.scale_loss(cls_loss + box_loss, optimizer) as scaled_loss: scaled_loss.backward() optimizer.step() else: scaler.scale(cls_loss + box_loss).backward() scaler.step(optimizer) scaler.update() scheduler.step() # Reduce all losses cls_loss, box_loss = cls_loss.mean().clone(), box_loss.mean( ).clone() if world > 1: torch.distributed.all_reduce(cls_loss) torch.distributed.all_reduce(box_loss) cls_loss /= world box_loss /= world if is_master: cls_losses.append(cls_loss) box_losses.append(box_loss) if is_master and not isfinite(cls_loss + box_loss): raise RuntimeError('Loss is diverging!\n{}'.format( 'Try lowering the learning rate.')) del cls_loss, box_loss profiler.stop('bw') iteration += 1 profiler.bump('train') if is_master and (profiler.totals['train'] > 60 or iteration == iterations): focal_loss = torch.stack(list(cls_losses)).mean().item() box_loss = torch.stack(list(box_losses)).mean().item() learning_rate = optimizer.param_groups[0]['lr'] if verbose: msg = '[{:{len}}/{}]'.format(iteration, iterations, len=len(str(iterations))) msg += ' focal loss: {:.3f}'.format(focal_loss) msg += ', box loss: {:.3f}'.format(box_loss) msg += ', {:.3f}s/{}-batch'.format(profiler.means['train'], batch_size) msg += ' (fw: {:.3f}s, bw: {:.3f}s)'.format( profiler.means['fw'], profiler.means['bw']) msg += ', {:.1f} im/s'.format(batch_size / profiler.means['train']) msg += ', lr: {:.2g}'.format(learning_rate) print(msg, flush=True) if is_master and logdir is not None: writer.add_scalar('focal_loss', focal_loss, iteration) writer.add_scalar('box_loss', box_loss, iteration) writer.add_scalar('learning_rate', learning_rate, iteration) del box_loss, focal_loss if metrics_url: post_metrics( metrics_url, { 'focal loss': mean(cls_losses), 'box loss': mean(box_losses), 'im_s': batch_size / profiler.means['train'], 'lr': learning_rate }) # Save model weights state.update({ 'iteration': iteration, 'optimizer': optimizer.state_dict(), 'scheduler': scheduler.state_dict(), }) with ignore_sigint(): nn_model.save(state) profiler.reset() del cls_losses[:], box_losses[:] if val_annotations and (iteration == iterations or iteration % val_iterations == 0): stats = infer(model, val_path, None, resize, max_size, batch_size, annotations=val_annotations, mixed_precision=mixed_precision, is_master=is_master, world=world, use_dali=use_dali, no_apex=no_apex, is_validation=True, verbose=False, rotated_bbox=rotated_bbox) model.train() if is_master and logdir is not None and stats is not None: writer.add_scalar('Validation_Precision/mAP', stats[0], iteration) writer.add_scalar('Validation_Precision/[email protected]', stats[1], iteration) writer.add_scalar('Validation_Precision/[email protected]', stats[2], iteration) writer.add_scalar('Validation_Precision/mAP (small)', stats[3], iteration) writer.add_scalar('Validation_Precision/mAP (medium)', stats[4], iteration) writer.add_scalar('Validation_Precision/mAP (large)', stats[5], iteration) writer.add_scalar('Validation_Recall/mAR (max 1 Dets)', stats[6], iteration) writer.add_scalar('Validation_Recall/mAR (max 10 Dets)', stats[7], iteration) writer.add_scalar('Validation_Recall/mAR (max 100 Dets)', stats[8], iteration) writer.add_scalar('Validation_Recall/mAR (small)', stats[9], iteration) writer.add_scalar('Validation_Recall/mAR (medium)', stats[10], iteration) writer.add_scalar('Validation_Recall/mAR (large)', stats[11], iteration) if (iteration == iterations and not rotated_bbox) or (iteration > iterations and rotated_bbox): break if is_master and logdir is not None: writer.close()
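# The `schedule` closure above implements linear warmup followed by step decay at the
# milestones. A standalone illustration of the multiplier it returns (warmup and
# milestone values here are made up):
def schedule_example(train_iter, warmup=1000, milestones=(30000, 40000), gamma=0.1):
    if warmup and train_iter <= warmup:
        return 0.9 * train_iter / warmup + 0.1   # ramps linearly from 0.1 to 1.0
    return gamma ** len([m for m in milestones if m <= train_iter])

# schedule_example(0) -> 0.1, schedule_example(1000) -> 1.0,
# schedule_example(35000) -> 0.1, schedule_example(45000) -> 0.01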
class Trainer(object): """ Trainer encapsulates all the logic necessary for training the Recurrent Attention Model. All hyperparameters are provided by the user in the config file. """ def __init__(self, config, data_loader): """ Construct a new Trainer instance. Args ---- - config: object containing command line arguments. - data_loader: data iterator """ self.config = config # glimpse network params self.patch_size = config.patch_size self.glimpse_scale = config.glimpse_scale self.num_patches = config.num_patches self.loc_hidden = config.loc_hidden self.glimpse_hidden = config.glimpse_hidden # core network params self.num_glimpses = config.num_glimpses self.hidden_size = config.hidden_size # reinforce params self.std = config.std self.M = config.M # data params if config.is_train: self.train_loader = data_loader[0] self.valid_loader = data_loader[1] self.num_train = len(self.train_loader.sampler.indices) self.num_valid = len(self.valid_loader.sampler.indices) else: self.test_loader = data_loader self.num_test = len(self.test_loader.dataset) self.num_classes = 10 self.num_channels = 1 # training params self.epochs = config.epochs self.start_epoch = 0 self.momentum = config.momentum self.lr = config.init_lr # misc params self.use_gpu = config.use_gpu self.best = config.best self.ckpt_dir = config.ckpt_dir self.logs_dir = config.logs_dir self.best_valid_acc = 0. self.counter = 0 self.patience = config.patience self.use_tensorboard = config.use_tensorboard self.resume = config.resume self.print_freq = config.print_freq self.plot_freq = config.plot_freq self.model_name = 'ram_{}_{}x{}_{}'.format(config.num_glimpses, config.patch_size, config.patch_size, config.glimpse_scale) self.plot_dir = './plots/' + self.model_name + '/' if not os.path.exists(self.plot_dir): os.makedirs(self.plot_dir) # configure tensorboard logging if self.use_tensorboard: tensorboard_dir = self.logs_dir + self.model_name print('[*] Saving tensorboard logs to {}'.format(tensorboard_dir)) if not os.path.exists(tensorboard_dir): os.makedirs(tensorboard_dir) configure(tensorboard_dir) # build RAM model self.model = RecurrentAttention( self.patch_size, self.num_patches, self.glimpse_scale, self.num_channels, self.loc_hidden, self.glimpse_hidden, self.std, self.hidden_size, self.num_classes, ) if self.use_gpu: self.model.cuda() print('[*] Number of model parameters: {:,}'.format( sum([p.data.nelement() for p in self.model.parameters()]))) # initialize optimizer and scheduler self.optimizer = SGD( self.model.parameters(), lr=self.lr, momentum=self.momentum, ) self.scheduler = ReduceLROnPlateau(self.optimizer, 'min', patience=self.patience) def reset(self): """ Initialize the hidden state of the core network and the location vector. This is called once every time a new minibatch `x` is introduced. """ dtype = torch.cuda.FloatTensor if self.use_gpu else torch.FloatTensor h_t = torch.zeros(self.batch_size, self.hidden_size) h_t = Variable(h_t).type(dtype) l_t = torch.Tensor(self.batch_size, 2).uniform_(-1, 1) l_t = Variable(l_t).type(dtype) return h_t, l_t def train(self): """ Train the model on the training set. A checkpoint of the model is saved after each epoch and if the validation accuracy is improved upon, a separate ckpt is created for use on the test set. 
""" # load the most recent checkpoint if self.resume: self.load_checkpoint(best=False) print("\n[*] Train on {} samples, validate on {} samples".format( self.num_train, self.num_valid)) for epoch in range(self.start_epoch, self.epochs): print('\nEpoch: {}/{} - LR: {:.6f}'.format(epoch + 1, self.epochs, self.lr)) # train for 1 epoch train_loss, train_acc = self.train_one_epoch(epoch) # evaluate on validation set valid_loss, valid_acc = self.validate(epoch) # reduce lr if validation loss plateaus self.scheduler.step(valid_loss) is_best = valid_acc > self.best_valid_acc msg1 = "train loss: {:.3f} - train acc: {:.3f} " msg2 = "- val loss: {:.3f} - val acc: {:.3f}" if is_best: msg2 += " [*]" msg = msg1 + msg2 print(msg.format(train_loss, train_acc, valid_loss, valid_acc)) # check for improvement if not is_best: self.counter += 1 if self.counter > self.patience: print("[!] No improvement in a while, stopping training.") return self.best_valid_acc = max(valid_acc, self.best_valid_acc) self.save_checkpoint( { 'epoch': epoch + 1, 'model_state': self.model.state_dict(), 'optim_state': self.optimizer.state_dict(), 'best_valid_acc': self.best_valid_acc, }, is_best) def train_one_epoch(self, epoch): """ Train the model for 1 epoch of the training set. An epoch corresponds to one full pass through the entire training set in successive mini-batches. This is used by train() and should not be called manually. """ batch_time = AverageMeter() losses = AverageMeter() accs = AverageMeter() tic = time.time() with tqdm(total=self.num_train) as pbar: for i, (x, y) in enumerate(self.train_loader): if self.use_gpu: x, y = x.cuda(), y.cuda() x, y = Variable(x), Variable(y) plot = False if (epoch % self.plot_freq == 0) and (i == 0): plot = True # initialize location vector and hidden state self.batch_size = x.shape[0] h_t, l_t = self.reset() # save images imgs = [] imgs.append(x[0:9]) # extract the glimpses locs = [] log_pi = [] baselines = [] for t in range(self.num_glimpses - 1): # forward pass through model h_t, l_t, b_t, p = self.model(x, l_t, h_t) # store locs.append(l_t[0:9]) baselines.append(b_t) log_pi.append(p) # last iteration h_t, l_t, b_t, log_probas, p = self.model(x, l_t, h_t, last=True) log_pi.append(p) baselines.append(b_t) locs.append(l_t[0:9]) # convert list to tensors and reshape baselines = torch.stack(baselines).transpose(1, 0) log_pi = torch.stack(log_pi).transpose(1, 0) # calculate reward predicted = torch.max(log_probas, 1)[1] R = (predicted.detach() == y).float() R = R.unsqueeze(1).repeat(1, self.num_glimpses) # compute losses for differentiable modules loss_action = F.nll_loss(log_probas, y) loss_baseline = F.mse_loss(baselines, R) # compute reinforce loss adjusted_reward = R - baselines.detach() loss_reinforce = torch.mean(-log_pi * adjusted_reward) # sum up into a hybrid loss loss = loss_action + loss_baseline + loss_reinforce # compute accuracy correct = (predicted == y).float() acc = 100 * (correct.sum() / len(y)) # store losses.update(loss.data[0], x.size()[0]) accs.update(acc.data[0], x.size()[0]) # compute gradients and update SGD self.optimizer.zero_grad() loss.backward() self.optimizer.step() # measure elapsed time toc = time.time() batch_time.update(toc - tic) pbar.set_description( ("{:.1f}s - loss: {:.3f} - acc: {:.3f}".format( (toc - tic), loss.data[0], acc.data[0]))) pbar.update(self.batch_size) # dump the glimpses and locs if plot: if self.use_gpu: imgs = [g.cpu().data.numpy().squeeze() for g in imgs] locs = [l.cpu().data.numpy() for l in locs] else: imgs = 
[g.data.numpy().squeeze() for g in imgs] locs = [l.data.numpy() for l in locs] pickle.dump( imgs, open(self.plot_dir + "g_{}.p".format(epoch + 1), "wb")) pickle.dump( locs, open(self.plot_dir + "l_{}.p".format(epoch + 1), "wb")) # log to tensorboard if self.use_tensorboard: iteration = epoch * len(self.train_loader) + i log_value('train_loss', losses.avg, iteration) log_value('train_acc', accs.avg, iteration) return losses.avg, accs.avg def validate(self, epoch): """ Evaluate the model on the validation set. """ losses = AverageMeter() accs = AverageMeter() for i, (x, y) in enumerate(self.valid_loader): if self.use_gpu: x, y = x.cuda(), y.cuda() x, y = Variable(x), Variable(y) # duplicate 10 times x = x.repeat(self.M, 1, 1, 1) # initialize location vector and hidden state self.batch_size = x.shape[0] h_t, l_t = self.reset() # extract the glimpses log_pi = [] baselines = [] for t in range(self.num_glimpses - 1): # forward pass through model h_t, l_t, b_t, p = self.model(x, l_t, h_t) # store baselines.append(b_t) log_pi.append(p) # last iteration h_t, l_t, b_t, log_probas, p = self.model(x, l_t, h_t, last=True) log_pi.append(p) baselines.append(b_t) # convert list to tensors and reshape baselines = torch.stack(baselines).transpose(1, 0) log_pi = torch.stack(log_pi).transpose(1, 0) # average log_probas = log_probas.view(self.M, -1, log_probas.shape[-1]) log_probas = torch.mean(log_probas, dim=0) baselines = baselines.contiguous().view(self.M, -1, baselines.shape[-1]) baselines = torch.mean(baselines, dim=0) log_pi = log_pi.contiguous().view(self.M, -1, log_pi.shape[-1]) log_pi = torch.mean(log_pi, dim=0) # calculate reward predicted = torch.max(log_probas, 1)[1] R = (predicted.detach() == y).float() R = R.unsqueeze(1).repeat(1, self.num_glimpses) # compute losses for differentiable modules loss_action = F.nll_loss(log_probas, y) loss_baseline = F.mse_loss(baselines, R) # compute reinforce loss adjusted_reward = R - baselines.detach() loss_reinforce = torch.mean(-log_pi * adjusted_reward) # sum up into a hybrid loss loss = loss_action + loss_baseline + loss_reinforce # compute accuracy correct = (predicted == y).float() acc = 100 * (correct.sum() / len(y)) # store losses.update(loss.data[0], x.size()[0]) accs.update(acc.data[0], x.size()[0]) # log to tensorboard if self.use_tensorboard: iteration = epoch * len(self.valid_loader) + i log_value('valid_loss', losses.avg, iteration) log_value('valid_acc', accs.avg, iteration) return losses.avg, accs.avg def test(self): """ Test the model on the held-out test data. This function should only be called at the very end once the model has finished training. """ correct = 0 # load the best checkpoint self.load_checkpoint(best=self.best) for i, (x, y) in enumerate(self.test_loader): if self.use_gpu: x, y = x.cuda(), y.cuda() x, y = Variable(x, volatile=True), Variable(y) # duplicate 10 times x = x.repeat(self.M, 1, 1, 1) # initialize location vector and hidden state self.batch_size = x.shape[0] h_t, l_t = self.reset() # extract the glimpses for t in range(self.num_glimpses - 1): # forward pass through model h_t, l_t, b_t, p = self.model(x, l_t, h_t) # last iteration h_t, l_t, b_t, log_probas, p = self.model(x, l_t, h_t, last=True) log_probas = log_probas.view(self.M, -1, log_probas.shape[-1]) log_probas = torch.mean(log_probas, dim=0) pred = log_probas.data.max(1, keepdim=True)[1] correct += pred.eq(y.data.view_as(pred)).cpu().sum() perc = (100. 
* correct) / (self.num_test) print('[*] Test Acc: {}/{} ({:.2f}%)'.format(correct, self.num_test, perc)) def save_checkpoint(self, state, is_best): """ Save a copy of the model so that it can be loaded at a future date. This function is called at the end of every training epoch. If this model has reached the best validation accuracy thus far, a separate file with the suffix `best` is created. """ # print("[*] Saving model to {}".format(self.ckpt_dir)) filename = self.model_name + '_ckpt.pth.tar' ckpt_path = os.path.join(self.ckpt_dir, filename) torch.save(state, ckpt_path) if is_best: filename = self.model_name + '_model_best.pth.tar' shutil.copyfile(ckpt_path, os.path.join(self.ckpt_dir, filename)) def load_checkpoint(self, best=False): """ Load a saved copy of the model. This is useful for 2 cases: - Resuming training with the most recent model checkpoint. - Loading the best validation model to evaluate on the test data. Params ------ - best: if set to True, loads the best model. Use this if you want to evaluate your model on the test data. Else, set to False in which case the most recent version of the checkpoint is used. """ print("[*] Loading model from {}".format(self.ckpt_dir)) filename = self.model_name + '_ckpt.pth.tar' if best: filename = self.model_name + '_model_best.pth.tar' ckpt_path = os.path.join(self.ckpt_dir, filename) ckpt = torch.load(ckpt_path) # load variables from checkpoint self.start_epoch = ckpt['epoch'] self.best_valid_acc = ckpt['best_valid_acc'] self.model.load_state_dict(ckpt['model_state']) self.optimizer.load_state_dict(ckpt['optim_state']) if best: print("[*] Loaded {} checkpoint @ epoch {} " "with best valid acc of {:.3f}".format( filename, ckpt['epoch'] + 1, ckpt['best_valid_acc'])) else: print("[*] Loaded {} checkpoint @ epoch {}".format( filename, ckpt['epoch'] + 1))
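# The reward/baseline/REINFORCE wiring above appears in both train_one_epoch
# and validate. The following is a minimal, self-contained sketch of that
# hybrid loss on dummy tensors; the helper name `hybrid_ram_loss` and the toy
# shapes are illustrative assumptions, not part of the Trainer class.
import torch
import torch.nn.functional as F

def hybrid_ram_loss(log_probas, log_pi, baselines, y, num_glimpses):
    # Reward is 1 for a correct final prediction, 0 otherwise, repeated
    # once per glimpse so it can be compared with the per-step baselines.
    predicted = torch.max(log_probas, 1)[1]
    R = (predicted.detach() == y).float()
    R = R.unsqueeze(1).repeat(1, num_glimpses)
    loss_action = F.nll_loss(log_probas, y)          # classification (differentiable)
    loss_baseline = F.mse_loss(baselines, R)         # baseline regression
    adjusted_reward = R - baselines.detach()         # advantage for REINFORCE
    loss_reinforce = torch.mean(-log_pi * adjusted_reward)
    return loss_action + loss_baseline + loss_reinforce

# Toy shapes: batch of 4, 10 classes, 6 glimpses.
B, C, G = 4, 10, 6
log_probas = F.log_softmax(torch.randn(B, C), dim=1)
log_pi = torch.randn(B, G)
baselines = torch.rand(B, G)
y = torch.randint(0, C, (B,))
print(hybrid_ram_loss(log_probas, log_pi, baselines, y, G))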
def main(): parser = argparse.ArgumentParser( description='Tuning with bi-directional RNN-CNN') parser.add_argument('--mode', choices=['RNN', 'LSTM', 'GRU'], help='architecture of rnn', required=True) parser.add_argument('--num_epochs', type=int, default=100, help='Number of training epochs') parser.add_argument('--batch_size', type=int, default=16, help='Number of sentences in each batch') parser.add_argument('--hidden_size', type=int, default=128, help='Number of hidden units in RNN') parser.add_argument('--tag_space', type=int, default=0, help='Dimension of tag space') parser.add_argument('--num_layers', type=int, default=1, help='Number of layers of RNN') parser.add_argument('--num_filters', type=int, default=30, help='Number of filters in CNN') parser.add_argument('--char_dim', type=int, default=30, help='Dimension of Character embeddings') parser.add_argument('--learning_rate', type=float, default=0.1, help='Learning rate') parser.add_argument('--decay_rate', type=float, default=0.1, help='Decay rate of learning rate') parser.add_argument('--gamma', type=float, default=0.0, help='weight for regularization') parser.add_argument('--dropout', choices=['std', 'variational'], help='type of dropout', required=True) parser.add_argument('--p_rnn', nargs=2, type=float, required=True, help='dropout rate for RNN') parser.add_argument('--p_in', type=float, default=0.33, help='dropout rate for input embeddings') parser.add_argument('--p_out', type=float, default=0.33, help='dropout rate for output layer') parser.add_argument('--schedule', type=int, help='schedule for learning rate decay') parser.add_argument('--unk_replace', type=float, default=0., help='The rate to replace a singleton word with UNK') parser.add_argument('--embedding', choices=['glove', 'senna', 'sskip', 'polyglot'], help='Embedding for words', required=True) parser.add_argument('--embedding_dict', help='path for embedding dict') parser.add_argument( '--train') # "data/POS-penn/wsj/split1/wsj1.train.original" parser.add_argument( '--dev') # "data/POS-penn/wsj/split1/wsj1.dev.original" parser.add_argument( '--test') # "data/POS-penn/wsj/split1/wsj1.test.original" args = parser.parse_args() logger = get_logger("NER") mode = args.mode train_path = args.train dev_path = args.dev test_path = args.test num_epochs = args.num_epochs batch_size = args.batch_size hidden_size = args.hidden_size num_filters = args.num_filters learning_rate = args.learning_rate momentum = 0.9 decay_rate = args.decay_rate gamma = args.gamma schedule = args.schedule p_rnn = tuple(args.p_rnn) p_in = args.p_in p_out = args.p_out unk_replace = args.unk_replace embedding = args.embedding embedding_path = args.embedding_dict embedd_dict, embedd_dim = utils.load_embedding_dict( embedding, embedding_path) logger.info("Creating Alphabets") word_alphabet, char_alphabet, pos_alphabet, \ chunk_alphabet, ner_alphabet = conll03_data.create_alphabets("data/alphabets/ner/", train_path, data_paths=[dev_path, test_path], embedd_dict=embedd_dict, max_vocabulary_size=50000) logger.info("Word Alphabet Size: %d" % word_alphabet.size()) logger.info("Character Alphabet Size: %d" % char_alphabet.size()) logger.info("POS Alphabet Size: %d" % pos_alphabet.size()) logger.info("Chunk Alphabet Size: %d" % chunk_alphabet.size()) logger.info("NER Alphabet Size: %d" % ner_alphabet.size()) logger.info("Reading Data") use_gpu = torch.cuda.is_available() data_train = conll03_data.read_data_to_variable(train_path, word_alphabet, char_alphabet, pos_alphabet, chunk_alphabet, ner_alphabet, 
use_gpu=use_gpu) num_data = sum(data_train[1]) num_labels = ner_alphabet.size() data_dev = conll03_data.read_data_to_variable(dev_path, word_alphabet, char_alphabet, pos_alphabet, chunk_alphabet, ner_alphabet, use_gpu=use_gpu, volatile=True) data_test = conll03_data.read_data_to_variable(test_path, word_alphabet, char_alphabet, pos_alphabet, chunk_alphabet, ner_alphabet, use_gpu=use_gpu, volatile=True) writer = CoNLL03Writer(word_alphabet, char_alphabet, pos_alphabet, chunk_alphabet, ner_alphabet) def construct_word_embedding_table(): scale = np.sqrt(3.0 / embedd_dim) table = np.empty([word_alphabet.size(), embedd_dim], dtype=np.float32) table[conll03_data.UNK_ID, :] = np.random.uniform( -scale, scale, [1, embedd_dim]).astype(np.float32) oov = 0 for word, index in list(word_alphabet.items()): if word in embedd_dict: embedding = embedd_dict[word] elif word.lower() in embedd_dict: embedding = embedd_dict[word.lower()] else: embedding = np.random.uniform( -scale, scale, [1, embedd_dim]).astype(np.float32) oov += 1 table[index, :] = embedding print('oov: %d' % oov) return torch.from_numpy(table) word_table = construct_word_embedding_table() logger.info("constructing network...") char_dim = args.char_dim window = 3 num_layers = args.num_layers tag_space = args.tag_space initializer = nn.init.xavier_uniform if args.dropout == 'std': network = BiRecurrentConv(embedd_dim, word_alphabet.size(), char_dim, char_alphabet.size(), num_filters, window, mode, hidden_size, num_layers, num_labels, tag_space=tag_space, embedd_word=word_table, p_in=p_in, p_out=p_out, p_rnn=p_rnn, initializer=initializer) else: network = BiVarRecurrentConv(embedd_dim, word_alphabet.size(), char_dim, char_alphabet.size(), num_filters, window, mode, hidden_size, num_layers, num_labels, tag_space=tag_space, embedd_word=word_table, p_in=p_in, p_out=p_out, p_rnn=p_rnn, initializer=initializer) if use_gpu: network.cuda() lr = learning_rate # optim = Adam(network.parameters(), lr=lr, betas=(0.9, 0.9), weight_decay=gamma) optim = SGD(network.parameters(), lr=lr, momentum=momentum, weight_decay=gamma, nesterov=True) logger.info( "Network: %s, num_layer=%d, hidden=%d, filter=%d, tag_space=%d" % (mode, num_layers, hidden_size, num_filters, tag_space)) logger.info( "training: l2: %f, (#training data: %d, batch: %d, unk replace: %.2f)" % (gamma, num_data, batch_size, unk_replace)) logger.info("dropout(in, out, rnn): (%.2f, %.2f, %s)" % (p_in, p_out, p_rnn)) num_batches = num_data / batch_size + 1 dev_f1 = 0.0 dev_acc = 0.0 dev_precision = 0.0 dev_recall = 0.0 test_f1 = 0.0 test_acc = 0.0 test_precision = 0.0 test_recall = 0.0 best_epoch = 0 for epoch in range(1, num_epochs + 1): print( 'Epoch %d (%s(%s), learning rate=%.4f, decay rate=%.4f (schedule=%d)): ' % (epoch, mode, args.dropout, lr, decay_rate, schedule)) train_err = 0. train_corr = 0. train_total = 0. 
start_time = time.time() num_back = 0 network.train() for batch in range(1, num_batches + 1): word, char, _, _, labels, masks, lengths = conll03_data.get_batch_variable( data_train, batch_size, unk_replace=unk_replace) optim.zero_grad() loss, corr, _ = network.loss( word, char, labels, mask=masks, length=lengths, leading_symbolic=conll03_data.NUM_SYMBOLIC_TAGS) loss.backward() optim.step() num_tokens = masks.data.sum() train_err += loss.data[0] * num_tokens train_corr += corr.data[0] train_total += num_tokens time_ave = (time.time() - start_time) / batch time_left = (num_batches - batch) * time_ave # update log if batch % 100 == 0: sys.stdout.write("\b" * num_back) sys.stdout.write(" " * num_back) sys.stdout.write("\b" * num_back) log_info = 'train: %d/%d loss: %.4f, acc: %.2f%%, time left (estimated): %.2fs' % ( batch, num_batches, train_err / train_total, train_corr * 100 / train_total, time_left) sys.stdout.write(log_info) sys.stdout.flush() num_back = len(log_info) sys.stdout.write("\b" * num_back) sys.stdout.write(" " * num_back) sys.stdout.write("\b" * num_back) print('train: %d loss: %.4f, acc: %.2f%%, time: %.2fs' % (num_batches, train_err / train_total, train_corr * 100 / train_total, time.time() - start_time)) # evaluate performance on dev data network.eval() tmp_filename = 'tmp/%s_dev%d' % (str(uid), epoch) writer.start(tmp_filename) for batch in conll03_data.iterate_batch_variable(data_dev, batch_size): word, char, pos, chunk, labels, masks, lengths = batch _, _, preds = network.loss( word, char, labels, mask=masks, length=lengths, leading_symbolic=conll03_data.NUM_SYMBOLIC_TAGS) writer.write(word.data.cpu().numpy(), pos.data.cpu().numpy(), chunk.data.cpu().numpy(), preds.data.cpu().numpy(), labels.data.cpu().numpy(), lengths.cpu().numpy()) writer.close() acc, precision, recall, f1 = evaluate(tmp_filename) print( 'dev acc: %.2f%%, precision: %.2f%%, recall: %.2f%%, F1: %.2f%%' % (acc, precision, recall, f1)) if dev_f1 < f1: dev_f1 = f1 dev_acc = acc dev_precision = precision dev_recall = recall best_epoch = epoch # evaluate on test data when better performance detected tmp_filename = 'tmp/%s_test%d' % (str(uid), epoch) writer.start(tmp_filename) for batch in conll03_data.iterate_batch_variable( data_test, batch_size): word, char, pos, chunk, labels, masks, lengths = batch _, _, preds = network.loss( word, char, labels, mask=masks, length=lengths, leading_symbolic=conll03_data.NUM_SYMBOLIC_TAGS) writer.write(word.data.cpu().numpy(), pos.data.cpu().numpy(), chunk.data.cpu().numpy(), preds.data.cpu().numpy(), labels.data.cpu().numpy(), lengths.cpu().numpy()) writer.close() test_acc, test_precision, test_recall, test_f1 = evaluate( tmp_filename) print( "best dev acc: %.2f%%, precision: %.2f%%, recall: %.2f%%, F1: %.2f%% (epoch: %d)" % (dev_acc, dev_precision, dev_recall, dev_f1, best_epoch)) print( "best test acc: %.2f%%, precision: %.2f%%, recall: %.2f%%, F1: %.2f%% (epoch: %d)" % (test_acc, test_precision, test_recall, test_f1, best_epoch)) if epoch % schedule == 0: lr = learning_rate / (1.0 + epoch * decay_rate) optim = SGD(network.parameters(), lr=lr, momentum=momentum, weight_decay=gamma, nesterov=True)
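# The schedule above rebuilds the SGD optimizer whenever the learning rate
# decays, which also discards the accumulated momentum buffers. Below is a
# minimal sketch of the same decay rule, lr = lr_0 / (1 + epoch * decay_rate),
# applied in place via param_groups instead; `decay_lr` and the toy parameter
# are illustrative assumptions, not part of the script above.
import torch
from torch.optim import SGD

def decay_lr(optimizer, base_lr, epoch, decay_rate):
    # Keep the existing optimizer (and its momentum buffers); only lr changes.
    new_lr = base_lr / (1.0 + epoch * decay_rate)
    for param_group in optimizer.param_groups:
        param_group['lr'] = new_lr
    return new_lr

w = torch.nn.Parameter(torch.zeros(3))
opt = SGD([w], lr=0.1, momentum=0.9, weight_decay=0.0, nesterov=True)
for epoch in (2, 4, 6):  # e.g. schedule = 2
    print(epoch, decay_lr(opt, 0.1, epoch, 0.05))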
def train(args): # srun -p gpu --gres=gpu:1 python main_dsh.py sketch_folder, imsk_folder, im_folder, path_semantic, train_class, test_class = _parse_args_paths( args) logger = make_logger(join(mkdir(args.save_dir), curr_time_str() + '.log')) if DEBUG: train_class = train_class[:2] test_class = test_class[:2] args.print_every = 2 args.save_every = 8 args.steps = 20 args.batch_size = 2 args.npy_dir = NPY_FOLDER_SKETCHY # logger.info("try loading data_train") data_train = DSH_dataloader(folder_sk=sketch_folder, folder_im=im_folder, clss=train_class, folder_nps=args.npy_dir, folder_imsk=imsk_folder, normalize01=False, doaug=False, m=args.m, path_semantic=path_semantic, folder_saving=join(mkdir(args.save_dir), 'train_saving'), logger=logger) dataloader_train = DataLoader(dataset=data_train, batch_size=args.batch_size, shuffle=False) # logger.info("try loading data_test") data_test = DSH_dataloader(folder_sk=sketch_folder, clss=test_class, folder_nps=args.npy_dir, path_semantic=path_semantic, folder_imsk=imsk_folder, normalize01=False, doaug=False, m=args.m, folder_saving=join(mkdir(args.save_dir), 'test_saving'), logger=logger) model = DSH(m=args.m, config=args.config) model.cuda() optimizer = SGD(params=model.parameters(), lr=args.lr, momentum=0.9) # logger.info("optimizer inited") steps = _try_load(args, logger, model, optimizer) logger.info(str(args)) args.steps += steps dsh_loss = _DSH_loss(gamma=args.gamma) model.train() l2_regularization = _Regularization(model, args.l2_reg, p=2, logger=None) loss_sum = [] # logger.info("iterations") # iterations while True: # logger.info("update D") # 1. update D data_train.D = update_D(bi=data_train.BI, bs=data_train.BS, vec_bi=data_train.vec_bi, vec_bs=data_train.vec_bs) # logger.info("update BI/BS") # 2. update BI/BS feats_labels_sk, feats_labels_im = _extract_feats_sk_im( data=data_train, model=model, batch_size=args.batch_size) data_train.BI, data_train.BS = update_B(bi=data_train.BI, bs=data_train.BS, vec_bi=data_train.vec_bi, vec_bs=data_train.vec_bs, W=data_train.W, D=data_train.D, Fi=feats_labels_im[0], Fs=feats_labels_sk[0], lamb=args.lamb, gamma=args.gamma) # logger.info("update network parameters") # 3. update network parameters for _, (sketch, code_of_sketch, image, sketch_token, code_of_image) in enumerate(dataloader_train): sketch_feats, im_feats = model(sketch.cuda(), sketch_token.cuda(), image.cuda()) loss = dsh_loss(sketch_feats, im_feats, code_of_sketch.cuda(), code_of_image.cuda()) \ + l2_regularization() loss = loss / args.update_every loss.backward() torch.nn.utils.clip_grad_norm_(model.parameters(), 1.0) if (steps + 1) % args.update_every == 0: optimizer.step() optimizer.zero_grad() loss_sum.append(float(loss.item() * args.update_every)) if (steps + 1) % args.save_every == 0: _test_and_save(steps=steps, optimizer=optimizer, data_test=data_test, model=model, logger=logger, args=args, loss_sum=loss_sum) data_train.save_params() if (steps + 1) % args.print_every == 0: loss_sum = [np.mean(loss_sum)] logger.info('step: {}, loss: {}'.format(steps, loss_sum[0])) steps += 1 if steps >= args.steps: break dr_dec(optimizer=optimizer, args=args) if steps >= args.steps: break
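# The DSH inner loop above accumulates gradients: the loss is divided by
# args.update_every, backward() runs on every mini-batch, and the optimizer
# only steps (and zeroes gradients) once every update_every batches. A minimal
# sketch of that pattern on a dummy model follows; `model`, `update_every` and
# the random data are illustrative assumptions.
import torch
from torch import nn
from torch.optim import SGD

model = nn.Linear(8, 1)
optimizer = SGD(model.parameters(), lr=0.01, momentum=0.9)
update_every = 4

optimizer.zero_grad()
for step in range(20):
    x, y = torch.randn(16, 8), torch.randn(16, 1)
    loss = nn.functional.mse_loss(model(x), y) / update_every
    loss.backward()                                        # gradients accumulate
    torch.nn.utils.clip_grad_norm_(model.parameters(), 1.0)
    if (step + 1) % update_every == 0:
        optimizer.step()
        optimizer.zero_grad()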
                   nesterov=True)
psi_optimizer = PsiSGD(psis, lr=0.1, momentum=0.9, weight_decay=2e-4,
                       nesterov=True, num_data=50000)

for epoch in range(args.epochs):
    bayesian_net.train()
    for i, (input, target) in enumerate(train_loader):
        input = input.cuda(non_blocking=True)
        target = target.cuda(non_blocking=True)

        output = bayesian_net(input)
        loss = torch.nn.functional.cross_entropy(output, target)

        mu_optimizer.zero_grad()
        psi_optimizer.zero_grad()
        loss.backward()
        mu_optimizer.step()
        psi_optimizer.step()

        if i % 100 == 0:
            print("Epoch {}, ite {}/{}, loss {}".format(
                epoch, i, len(train_loader), loss.item()))

    eval_loss, eval_acc = Bayes_ensemble(test_loader, bayesian_net)
    print("Epoch {}, eval loss {}, eval acc {}".format(
        epoch, eval_loss, eval_acc))
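# The loop above drives two optimizers over disjoint parameter groups (the
# means and the psis of the Bayesian network) from a single backward pass.
# PsiSGD is specific to that code base; the sketch below uses plain SGD for
# both groups, and the parameter tensors are stand-ins.
import torch
from torch import nn
from torch.optim import SGD

mus = [nn.Parameter(torch.randn(4, 4))]
psis = [nn.Parameter(torch.randn(4, 4))]
mu_optimizer = SGD(mus, lr=0.1, momentum=0.9, weight_decay=2e-4, nesterov=True)
psi_optimizer = SGD(psis, lr=0.1, momentum=0.9, weight_decay=2e-4, nesterov=True)

loss = (mus[0] @ psis[0]).pow(2).mean()   # any loss touching both groups
mu_optimizer.zero_grad()
psi_optimizer.zero_grad()
loss.backward()
mu_optimizer.step()                        # each optimizer updates its own group
psi_optimizer.step()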
                             batch_loader.chars_vocab_size)

neg_loss = NEG_loss(params.word_vocab_size, params.word_embed_size)
if args.use_cuda:
    neg_loss = neg_loss.cuda()

# NEG_loss is defined over two embedding matrices of shape
# [params.word_vocab_size, params.word_embed_size]
optimizer = SGD(neg_loss.parameters(), 0.1)

for iteration in range(args.num_iterations):
    input_idx, target_idx = batch_loader.next_embedding_seq(args.batch_size)

    input = Variable(t.from_numpy(input_idx).long())
    target = Variable(t.from_numpy(target_idx).long())
    if args.use_cuda:
        input, target = input.cuda(), target.cuda()

    out = neg_loss(input, target, args.num_sample).mean()

    optimizer.zero_grad()
    out.backward()
    optimizer.step()

    if iteration % 500 == 0:
        out = out.cpu().data.numpy()[0]
        print('iteration = {}, loss = {}'.format(iteration, out))

word_embeddings = neg_loss.input_embeddings()
np.save('data/word_embeddings.npy', word_embeddings)
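# NEG_loss above is project-specific; for reference, this is a minimal sketch
# of the skip-gram negative-sampling objective such a module typically
# implements: log sigma(u_o . v_c) for observed (center, context) pairs plus
# log sigma(-u_k . v_c) for sampled negatives. All names and sizes below
# (vocab, dim, num_neg, the random indices) are illustrative assumptions.
import torch
from torch import nn
import torch.nn.functional as F

vocab, dim, num_neg = 100, 16, 5
in_embed = nn.Embedding(vocab, dim)    # "input" (center-word) embeddings
out_embed = nn.Embedding(vocab, dim)   # "output" (context-word) embeddings

center = torch.randint(0, vocab, (32,))
context = torch.randint(0, vocab, (32,))
negatives = torch.randint(0, vocab, (32, num_neg))

v_c = in_embed(center)                 # [B, dim]
u_o = out_embed(context)               # [B, dim]
u_k = out_embed(negatives)             # [B, num_neg, dim]

pos = F.logsigmoid((u_o * v_c).sum(dim=1))                                   # [B]
neg = F.logsigmoid(-torch.bmm(u_k, v_c.unsqueeze(2)).squeeze(2)).sum(dim=1)  # [B]
loss = -(pos + neg).mean()
print(loss.item())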