def __init__(self):
    """Minimal regression module: one 2-in/1-out linear layer plus an MSE criterion."""
    super(bor, self).__init__()
    # The single affine map; its bias is seeded at a fixed 0.2 instead of
    # the default PyTorch initialization.
    self.h = nn.Linear(2, 1)
    self.h.bias.data.fill_(0.2)
    self.loss_fn = nn.MSELoss()
    # Presumably read by an external training loop — TODO confirm against caller.
    self.lr = 0.9
def forward(self, p, img_size, targets=None, var=None):
    """YOLO detection-layer forward pass.

    Reshapes the raw conv output ``p`` to (bs, nA, nG, nG, nC + 5) and either
    returns the loss tuple (when ``targets`` is given), an ONNX/CoreML-friendly
    export tensor (when ``ONNX_EXPORT``), or decoded detections.
    ``var`` is accepted but unused in this block.
    """
    if ONNX_EXPORT:
        bs, nG = 1, self.nG  # batch size, grid size
    else:
        bs, nG = p.shape[0], p.shape[-1]
        # Rebuild grid offsets when the input resolution changed.
        if self.img_size != img_size:
            self.create_grids(img_size, nG)
            if p.is_cuda:
                self.grid_xy = self.grid_xy.cuda()
                self.anchor_vec = self.anchor_vec.cuda()
                self.anchor_wh = self.anchor_wh.cuda()

    # p.view(bs, 255, 13, 13) -- > (bs, 3, 13, 13, 80)
    # (bs, anchors, grid, grid, classes + xywh)
    p = p.view(bs, self.nA, self.nC + 5, nG, nG).permute(0, 1, 3, 4, 2).contiguous()  # prediction

    # xy, width and height
    xy = torch.sigmoid(p[..., 0:2])
    wh = p[..., 2:4]  # wh (yolo method)
    # wh = torch.sigmoid(p[..., 2:4])  # wh (power method)

    # Training
    if targets is not None:
        MSELoss = nn.MSELoss()
        BCEWithLogitsLoss = nn.BCEWithLogitsLoss()
        CrossEntropyLoss = nn.CrossEntropyLoss()

        # Get outputs
        p_conf = p[..., 4]  # Conf
        p_cls = p[..., 5:]  # Class

        txy, twh, mask, tcls = build_targets(targets, self.anchor_vec, self.nA, self.nC, nG)
        tcls = tcls[mask]
        if p.is_cuda:
            txy, twh, mask, tcls = txy.cuda(), twh.cuda(), mask.cuda(), tcls.cuda()

        # Compute losses
        nT = sum([len(x) for x in targets])  # number of targets
        nM = mask.sum().float()  # number of anchors (assigned to targets)
        k = 1  # nM / bs
        if nM > 0:
            lxy = k * MSELoss(xy[mask], txy[mask])
            lwh = k * MSELoss(wh[mask], twh[mask])
            lcls = (k / 4) * CrossEntropyLoss(p_cls[mask], torch.argmax(tcls, 1))
            # lcls = (k * 10) * BCEWithLogitsLoss(p_cls[mask], tcls.float())
        else:
            FT = torch.cuda.FloatTensor if p.is_cuda else torch.FloatTensor
            lxy, lwh, lcls, lconf = FT([0]), FT([0]), FT([0]), FT([0])
        # NOTE: lconf is always recomputed here, so the FT([0]) value above is
        # only a placeholder for lxy/lwh/lcls in the no-match case.
        lconf = (k * 64) * BCEWithLogitsLoss(p_conf, mask.float())

        # Sum loss components
        loss = lxy + lwh + lconf + lcls

        return loss, loss.item(), lxy.item(), lwh.item(), lconf.item(), lcls.item(), nT

    else:
        if ONNX_EXPORT:
            # Flatten grid offsets / anchors so the exported graph avoids
            # broadcasting unsupported by CoreML.
            grid_xy = self.grid_xy.repeat((1, self.nA, 1, 1, 1)).view((1, -1, 2))
            anchor_wh = self.anchor_wh.repeat((1, 1, nG, nG, 1)).view((1, -1, 2)) / nG

            # p = p.view(-1, 85)
            # xy = xy + self.grid_xy[0]  # x, y
            # wh = torch.exp(wh) * self.anchor_wh[0]  # width, height
            # p_conf = torch.sigmoid(p[:, 4:5])  # Conf
            # p_cls = F.softmax(p[:, 5:85], 1) * p_conf  # SSD-like conf
            # return torch.cat((xy / nG, wh, p_conf, p_cls), 1).t()

            p = p.view(1, -1, 85)
            xy = xy + grid_xy  # x, y
            wh = torch.exp(p[..., 2:4]) * anchor_wh  # width, height
            p_conf = torch.sigmoid(p[..., 4:5])  # Conf
            p_cls = p[..., 5:85]
            # Broadcasting only supported on first dimension in CoreML. See onnx-coreml/_operators.py
            # p_cls = F.softmax(p_cls, 2) * p_conf  # SSD-like conf
            p_cls = torch.exp(p_cls).permute((2, 1, 0))
            p_cls = p_cls / p_cls.sum(0).unsqueeze(0) * p_conf.permute((2, 1, 0))  # F.softmax() equivalent
            p_cls = p_cls.permute(2, 1, 0)
            return torch.cat((xy / nG, wh, p_conf, p_cls), 2).squeeze().t()

        # Inference path: decode boxes in place, then scale to image pixels.
        p[..., 0:2] = xy + self.grid_xy  # xy
        p[..., 2:4] = torch.exp(wh) * self.anchor_wh  # wh yolo method
        # p[..., 2:4] = ((wh * 2) ** 2) * self.anchor_wh  # wh power method
        p[..., 4] = torch.sigmoid(p[..., 4])  # p_conf
        p[..., :4] *= self.stride

        # reshape from [1, 3, 13, 13, 85] to [1, 507, 85]
        return p.view(bs, -1, 5 + self.nC)
def test(self):
    """Run the trained model over the test loader and dump predictions to CSV.

    Side effects: loads model weights, may move the model to GPU, and writes
    '../LQ_SRP_SmartMeter/predict_scale10.csv'.
    """
    # load model
    self.load_model()
    # self.load_spec_model()

    # Test
    print('Test is started.')
    # load dataset
    test_data_loader = self.data_test
    self.model.eval()
    if self.config.gpu_mode:
        self.model.cuda()
        self.MSE_loss = nn.MSELoss().cuda()
    else:
        self.MSE_loss = nn.MSELoss()

    # NOTE(review): loss_test / dtw_test / snr are initialized but never
    # updated in this block — they look vestigial.
    loss_test = 0
    dtw_test = 0
    snr = 0
    flag = 0
    # Accumulates one row per batch; each row is the last prediction of the batch.
    data = pd.DataFrame()
    for input_test, target_test, groundtruth in test_data_loader:
        flag += 1
        print('{} batch'.format(flag))
        if self.config.gpu_mode:
            x_test = Variable(input_test.cuda())
            y_test = Variable(groundtruth.cuda())
            y_log_test = Variable(target_test.cuda())
        else:
            x_test = Variable(input_test)
            y_test = Variable(groundtruth)
            y_log_test = Variable(target_test)
        # prediction
        model_out_test = self.model(x_test)
        # Keep only the last element of the last output — presumably the final
        # prediction of the sequence; verify against the model's return shape.
        data = pd.concat((data, pd.DataFrame(model_out_test[-1][-1].cpu().data.numpy()).T), axis=0)
        #if flag != 1:
        #    with open('../LQ_SRP_SmartMeter/predict_scale10.csv','a+') as file1:
        #        np.savetxt(file1, model_out_test.cpu().data.numpy(), delimiter=',')
        #        file1.write(',\n')
        #else:
        #    with open('../LQ_SRP_SmartMeter/predict_scale10.csv','w') as file1:
        #        np.savetxt(file1, model_out_test.cpu().data.numpy(), delimiter=',')
        #        file1.write(',\n')

    # Write the accumulated predictions once (mode 'w' truncates any previous file).
    # NOTE(review): original placement relative to the loop is ambiguous in this
    # chunk; final file contents are identical either way — confirm against VCS.
    with open('../LQ_SRP_SmartMeter/predict_scale10.csv','w') as file1:
        np.savetxt(file1, data, delimiter=',')
    print('Test is finished')
def get_loss_function(name: str) -> nn.Module:
    """Return a freshly constructed criterion registered under *name*.

    Args:
        name: 'MSE' for ``nn.MSELoss`` or 'L1' for ``nn.L1Loss``.

    Returns:
        The corresponding loss module.

    Raises:
        ValueError: if *name* is not a known loss identifier.  (The original
        silently returned ``None`` here, violating the annotated return type
        and deferring the failure to the first call of the criterion.)
    """
    if name == 'MSE':
        return nn.MSELoss()
    if name == 'L1':
        return nn.L1Loss()
    raise ValueError(f"Unknown loss function name: {name!r} (expected 'MSE' or 'L1')")
def __init__(self):
    """Hold the two criteria this module exposes: a BCE loss and an MSE loss."""
    super(Criterion, self).__init__()
    # Instantiated once here so callers reuse the same module instances.
    self.mse_loss = nn.MSELoss()
    self.bce_loss = nn.BCELoss()
# --- Q-learning setup: replay buffer, online/target networks, optimizer ---
rb = ReplayBufferNStepLevy(args.buffer_size, args.gamma)
q_network = QNetworkN(env)
#q_network = nn.DataParallel(q_network)
q_network = q_network.to(device)
target_network = QNetworkN(env)
#target_network = nn.DataParallel(target_network)
target_network = target_network.to(device)
# Start the target network as an exact copy of the online network.
target_network.load_state_dict(q_network.state_dict())
sampler = Sampler(env)
optimizer = optim.Adam(q_network.parameters(), lr=args.learning_rate)
loss_fn = nn.MSELoss()
print(device.__repr__())
print(q_network)
print(f"Using {torch.cuda.device_count()} GPUS")
# Log total parameter count once to TensorBoard.
n_params = sum([p.numel() for p in q_network.parameters()])
writer.add_scalar("n_params", n_params)
print("Number of parameters:", n_params)
# TRY NOT TO MODIFY: start the game
obs = env.reset()
# Special-case: this env's default step budget is too small for a 100x100 grid.
if args.gym_id == "MiniGrid-Empty-100x100-v0":
    env.max_steps = 5000
episode_reward = 0
# --- Tensor preparation for a linear-regression house-price model ---
# all_features holds train rows first, then test rows; split at n_train.
n_train = train_data.shape[0]
train_features = torch.tensor(all_features[:n_train].values, dtype=torch.float)
test_features = torch.tensor(all_features[n_train:].values, dtype=torch.float)
train_labels = torch.tensor(train_data.SalePrice.values, dtype=torch.float).view(-1, 1)
# print("tensor shape", train_features.shape, test_features.shape, train_labels.shape)


def get_net(feature_num):
    """Build a single linear layer with N(0, 0.01) initialized parameters."""
    net = nn.Linear(feature_num, 1)
    for param in net.parameters():
        nn.init.normal_(param, mean=0, std=0.01)
    return net


# Shared criterion captured by log_rmse below.
loss = nn.MSELoss()


def log_rmse(net, features, labels):
    """Return the RMSE between log-predictions and log-labels as a float."""
    with torch.no_grad():
        # Clamp predictions below 1 to 1 so taking the log stays numerically stable.
        clipped_preds = torch.max(net(features), torch.tensor(1.0))
        rmse = torch.sqrt(2 * loss(clipped_preds.log(), labels.log()).mean())
    return rmse.item()


def train(net, train_features, train_labels, test_features, test_labels,
          num_epochs, learning_rate, weight_decay, batch_size):
    # NOTE(review): this function body appears truncated in this chunk —
    # the epoch loop that consumes train_iter is outside the visible region.
    train_ls, test_ls = [], []
    dataset = torch.utils.data.TensorDataset(train_features, train_labels)
    train_iter = torch.utils.data.DataLoader(dataset, batch_size, shuffle=True)
def init_fn(self):
    """Construct datasets, networks, UV sampling weights, optimizer and losses.

    Populates self.{train_ds, CNet, LNet, smpl*, sampler, uv_weight, optimizer,
    models_dict, optimizers_dict, criterion_*, renderer} and optionally loads a
    pretrained checkpoint.
    """
    # create training dataset
    self.train_ds = create_dataset(self.options.dataset, self.options, use_IUV=True)

    # Dense-prediction resolution shrinks by a factor of 2 per warp level.
    self.dp_res = int(self.options.img_res // (2**self.options.warp_level))
    self.CNet = DPNet(warp_lv=self.options.warp_level,
                      norm_type=self.options.norm_type).to(self.device)
    self.LNet = get_LNet(self.options).to(self.device)
    # Neutral plus gendered SMPL body models.
    self.smpl = SMPL().to(self.device)
    self.female_smpl = SMPL(cfg.FEMALE_SMPL_FILE).to(self.device)
    self.male_smpl = SMPL(cfg.MALE_SMPL_FILE).to(self.device)

    uv_res = self.options.uv_res
    self.uv_type = self.options.uv_type
    self.sampler = Index_UV_Generator(UV_height=uv_res, UV_width=-1,
                                      uv_type=self.uv_type).to(self.device)

    # Per-pixel UV weights are cached on disk; compute them on first run.
    weight_file = 'data/weight_p24_h{:04d}_w{:04d}_{}.npy'.format(
        uv_res, uv_res, self.uv_type)
    if not os.path.exists(weight_file):
        cal_uv_weight(self.sampler, weight_file)
    uv_weight = torch.from_numpy(np.load(weight_file)).to(self.device).float()
    # Mask out invalid UV pixels, then normalize to unit mean.
    uv_weight = uv_weight * self.sampler.mask.to(uv_weight.device).float()
    uv_weight = uv_weight / uv_weight.mean()
    self.uv_weight = uv_weight[None, :, :, None]
    # Normalizer for total-variation terms over a (uv_res-1)^2 pixel grid.
    self.tv_factor = (uv_res - 1) * (uv_res - 1)

    # Setup an optimizer
    if self.options.stage == 'dp':
        # Stage 'dp' trains only CNet; otherwise LNet and CNet train jointly.
        self.optimizer = torch.optim.Adam(
            params=list(self.CNet.parameters()),
            lr=self.options.lr,
            betas=(self.options.adam_beta1, 0.999),
            weight_decay=self.options.wd)
        self.models_dict = {'CNet': self.CNet}
        self.optimizers_dict = {'optimizer': self.optimizer}
    else:
        self.optimizer = torch.optim.Adam(
            params=list(self.LNet.parameters()) + list(self.CNet.parameters()),
            lr=self.options.lr,
            betas=(self.options.adam_beta1, 0.999),
            weight_decay=self.options.wd)
        self.models_dict = {'CNet': self.CNet, 'LNet': self.LNet}
        self.optimizers_dict = {'optimizer': self.optimizer}

    # Create loss functions
    self.criterion_shape = nn.L1Loss().to(self.device)
    self.criterion_uv = nn.L1Loss().to(self.device)
    # 'none' reduction: per-keypoint losses are weighted/reduced by the caller.
    self.criterion_keypoints = nn.MSELoss(reduction='none').to(self.device)
    self.criterion_keypoints_3d = nn.L1Loss(reduction='none').to(self.device)
    self.criterion_regr = nn.MSELoss().to(self.device)

    # LSP indices from full list of keypoints
    self.to_lsp = list(range(14))
    self.renderer = Renderer(faces=self.smpl.faces.cpu().numpy())

    # Optionally start training from a pretrained checkpoint
    # Note that this is different from resuming training
    # For the latter use --resume
    if self.options.pretrained_checkpoint is not None:
        self.load_pretrained(checkpoint_file=self.options.pretrained_checkpoint)
def train(self):
    """Standard supervised training loop: MSE criterion, configurable optimizer,
    periodic testing and (currently unconditional) model checkpointing."""
    test_res, tmp_res, best_epoch = 0, 0, 0
    self.loadModel()
    #set train mode
    self.net.train()

    if (self.cfg.cuda):
        criterion = nn.MSELoss().cuda()
    else:
        criterion = nn.MSELoss()

    # Optimizer selection from config; SGD is the fallback.
    if self.cfg.optimizer == 'adam':
        optimizer = optim.Adam(self.net.parameters(), lr=0.0001)
    elif self.cfg.optimizer == 'adadelta':
        optimizer = optim.Adadelta(self.net.parameters(), lr=1.0, rho=0.9,
                                   eps=1e-06, weight_decay=0)
    else:
        optimizer = optim.SGD(self.net.parameters(), lr=0.0001, momentum=0.9)
        # optimizer = optim.SGD(self.net.parameters(), lr=0.0001, momentum=0.9, weight_decay=0.01, dampening=0.0)

    for epoch in range(self.cfg.train_epochs):  # loop over the dataset multiple times
        train_loss, running_loss = 0, 0
        for i, data in enumerate(self.trainloader, 0):
            # get the inputs
            inputs, labels = data
            # wrap them in Variable
            if (self.cfg.cuda):
                inputs, labels = Variable(inputs.cuda(non_blocking=True)), Variable(
                    labels.cuda(non_blocking=True))
            else:
                inputs, labels = Variable(inputs), Variable(labels)

            # zero the parameter gradients
            optimizer.zero_grad()

            # forward + backward + optimize
            if (self.cfg.cuda):
                outputs = self.net(inputs).cuda(non_blocking=True)
            else:
                outputs = self.net(inputs)
            # Remove one dimension
            # print(outputs)
            # outputs = outputs.squeeze()
            loss = criterion(outputs, labels)
            loss.backward()
            optimizer.step()

            running_loss += loss.item()
            del loss
            # print statistics
            if i % 5 == 4:  # print every 5 mini-batches
                # running_loss/(i+1) is the running mean over the epoch so far.
                print('[%d, %5d] loss: %.6f' % (epoch + 1, i + 1, running_loss / (i + 1)))

        train_loss = running_loss / len(self.trainloader)
        print('MSE of the network on the traintset: %.6f' % (train_loss))

        # Evaluate every test_rate epochs.
        if ((epoch + 1) % self.cfg.test_rate == 0):
            self.log.logLoss((epoch + 1, train_loss))
            tmp_res = self.test()
            self.log.logTest((epoch + 1, tmp_res))
            # Check test result over all splits to save best model
            # NOTE(review): the trailing `or True` makes this condition always
            # true, so the model is saved after *every* test and test_res /
            # best_epoch track the latest (not the lowest) result. Confirm
            # whether this debug override should be removed.
            if (tmp_res < test_res or test_res == 0 or True):
                self.saveModel()
                test_res = tmp_res
                best_epoch = epoch + 1

    print('Finished Training')
    print('Lowest model MSE: %.6f - in epoch: %d' % (test_res, best_epoch))
print('==> 加载checkpoint ') if not os.path.exists(cfg.ckpt): raise AssertionError['找不到路径'] checkpoint = torch.load(cfg.ckpt) net.load_state_dict(checkpoint['net']) best_test_acc = checkpoint['best_test_acc'] print('best_test_acc is %.4f%%'%best_test_acc) best_test_acc_epoch = checkpoint['best_test_acc_epoch'] print('best_test_acc_epoch is %d'%best_test_acc_epoch) start_epoch = checkpoint['best_test_acc_epoch'] + 1 else: print('------------------------------') print('==> 构建新的模型') optimizer = optim.Adam(net.parameters(), lr=cfg.lr) # 实例化梯度下降算法 MSELoss = nn.MSELoss() # 实例化loss def train(epoch): global train_acc trainloss = 0.0 total = 0 correct = 0 for i in range(0, trainData_sum, cfg.bs): if trainData_sum - i >= cfg.bs: inputs = X_train[i:(i+cfg.bs), :, :, :] target = y_train[i:(i+cfg.bs), :] mask = mask_train[i:(i+cfg.bs), :, :] else: inputs = X_train[i:trainData_sum, :, :, :] target = y_train[i:trainData_sum, :]
lengths = torch.randint(seq_len, (batch_size,)) src_len_mask = fast_transformers.masking.LengthMask(lengths, max_len=seq_len) lengths = torch.randint(tgt_seq_len, (batch_size,)) tgt_len_mask = fast_transformers.masking.LengthMask(lengths, max_len=tgt_seq_len) # query_dimensions = hidden_dim # value_dimensions = hidden_dim # attention_type = 'linear' # # d_model = value_dimensions * n_heads # model = TransformerEncoderDecoder(input_feats, output_feats, n_layers, n_heads, hidden_dim, ff_dim) model = TransformerEncoderDecoder(**params.transformer_ar_settings) optimizer = torch.optim.Adam(model.parameters(), lr=0.0001) criterion = nn.MSELoss(reduction='mean') num_epochs = 10 n_params = sum(p.numel() for p in model.parameters()) print(f'created model with n_params={n_params}') model.train() epoch_loss = 0 for i in range(num_epochs): start_time = time.time() optimizer.zero_grad() output = model(X, tgt, src_len_mask=src_len_mask, tgt_len_mask=tgt_len_mask)
def forward(self, feed_dict, *, epoch_weight=1, weight_type='eta', segSize=None):
    """Segmentation forward pass with optional sequence-based semi-supervision.

    feed_dict is either a plain supervised batch or a (supervised, sequence)
    tuple; when training (segSize is None) returns (loss, acc), otherwise
    returns the decoded prediction.  `weight_type` is a comma-separated list of
    hidden-layer weighting modes (e.g. 'eta', '-stack', '-eta').
    """
    sup_feed_dict, seq_feed_dict, sup_pred, seq_pred = None, None, None, None
    if isinstance(feed_dict, tuple):
        sup_feed_dict = feed_dict[0]
        seq_feed_dict = feed_dict[1]  # get the data for the second iterator that has the seq info in it
        if type(sup_feed_dict) is list and type(seq_feed_dict) is list:
            if torch.cuda.is_available():
                sup_feed_dict = sup_feed_dict[0]
                seq_feed_dict = seq_feed_dict[0]  # Single valued list
                sup_feed_dict['img_data'] = sup_feed_dict['img_data'].cuda()
                sup_feed_dict['seg_label'] = sup_feed_dict['seg_label'].cuda()
                seq_feed_dict['img_data'] = seq_feed_dict['img_data'].cuda()
                seq_feed_dict['seg_label'] = seq_feed_dict['seg_label'].cuda()
            else:
                # NOTE(review): `RunTimeError` is a typo — Python's builtin is
                # RuntimeError, so reaching this line raises NameError instead.
                raise RunTimeError(
                    'Cannot convert torch.Floattensor into torch.cuda.FloatTensor'
                )
    else:
        sup_feed_dict = feed_dict
        # added the following for training on 1 gpu
        # if the dataset is loaded as a list, this will
        # raise a TypeError while trying to access it as a dictionary.
        if type(sup_feed_dict) is list:
            sup_feed_dict = sup_feed_dict[0]
            # also, convert to torch.cuda.FloatTensor
            if torch.cuda.is_available():
                sup_feed_dict['img_data'] = sup_feed_dict['img_data'].cuda()
                sup_feed_dict['seg_label'] = sup_feed_dict['seg_label'].cuda()
            else:
                # NOTE(review): same RunTimeError/NameError typo as above.
                raise RunTimeError(
                    'Cannot convert torch.Floattensor into torch.cuda.FloatTensor'
                )

    # training
    #### start for hidden layers
    # activation is filled by hooks registered in register_hooks().
    activation = {}
    '''
    # def get_activation(name):
    def get_activation(name, activation):
        def hook(model,input, output):
            activation[name] = output.detach()
        return hook
    '''
    # self.decoder.cbr.register_forward_hook(get_activation('cbr'))
    # self.decoder.conv_last.register_forward_hook(get_activation('conv_last'))
    hidden_wt = weight_type.split(',')
    if len(list(set(hidden_wt))) != len(hidden_wt):
        print("duplicated weight_type!!!")
    register_hooks(self.encoder, hidden_wt, activation, False)
    register_hooks(self.decoder, hidden_wt, activation, False)
    ###### end

    if segSize is None:
        if self.deep_sup_scale is not None:  # use deep supervision technique
            (sup_pred, sup_pred_deepsup) = self.decoder(
                self.encoder(sup_feed_dict['img_data'], return_feature_maps=True))
            if seq_feed_dict != None:
                (seq_pred, seq_pred_deepsup) = self.decoder(
                    self.encoder(seq_feed_dict['img_data'], return_feature_maps=True))
        else:
            sup_pred = self.decoder(
                self.encoder(sup_feed_dict['img_data'], return_feature_maps=True))
            if seq_feed_dict != None:
                seq_pred = self.decoder(
                    self.encoder(seq_feed_dict['img_data'], return_feature_maps=True))

        loss = self.crit(sup_pred, sup_feed_dict['seg_label'])  # this would be our sup loss
        MSELoss = nn.MSELoss()
        cos = nn.CosineSimilarity(dim=0, eps=1e-6)

        def cal_weight(tensor, l):
            # Similarity of each frame to the most recent ground-truth frame,
            # via cosine similarity of channel-summed feature maps.
            # NOTE: closes over `seq_len`, which is assigned below before any call.
            weights = []
            b, c, w, h = tensor.shape
            ind = 0  # the index of the image in the sequence with gt
            for i in range(l):
                if i % seq_len == 0:
                    ind = i
                #weights.append(torch.sum(cos(tensor[i], tensor[ind]))/(w * h))
                weights.append(
                    torch.sum(
                        cos(torch.sum(tensor[i], dim=0),
                            torch.sum(tensor[ind], dim=0))) / (w * h))
            return weights

        #weight_types = ['eta', 'cbr', 'conv_last']
        #weight_type = weight_types[2]
        if "share" in self.training_type:
            seq_losses = self.crit(seq_pred, seq_feed_dict['seg_label'])
            loss += seq_losses * epoch_weight
        elif "seq" in self.training_type:
            ### all of this is for eta
            l = len(seq_feed_dict['seg_label'])
            # NOTE(review): true division — seq_len is a float here, so the
            # `i % seq_len` tests below do float modulo; confirm intended.
            seq_len = l / self.batch_size
            # loss for each individual image
            losses = [
                self.crit(seq_pred[i, :, :, :].unsqueeze(0),
                          seq_feed_dict['seg_label'][i, :, :].unsqueeze(0))
                for i in range(l)
            ]
            '''
            for i in range(l):
                cv2.imwrite(str(i) + '.jpg', seq_feed_dict['img_data'][i].detach().cpu().numpy().transpose(1,2,0)*40)
                cv2.imwrite(str(i) + '.png',seq_feed_dict['seg_label'][i].detach().cpu().numpy()*20)
                cv2.imwrite(str(i) +'cbr.png', torch.sum(activation['cbr'][i], dim=0).detach().cpu().numpy())
                cv2.imwrite(str(i) +'conv.png', torch.sum(activation['conv_last'][i], dim=0).detach().cpu().numpy()*-10)
                cv2.imwrite(str(i) +'output.png', torch.sum(seq_pred[i], dim=0).detach().cpu().numpy()*-10)
            import bpython
            bpython.embed(locals())
            exit()
            '''
            mse_losses = []
            mse_losses2 = []
            cbr_losses = []
            conv_last_losses = []
            """ used the (number of equal pixels in both seq_pred and gt label)/(total number pixel in the image) as the weight = similarity level """
            # to change the weigh to one for images with actual gt labels
            l = len(losses)
            weights = []
            tensor = seq_pred
            # lets have the eta as the default (eta is when the weights are calculated based on the network's predictions)
            #if weight_type == 'eta':
            #    tensor = seq_pred
            '''
            # when the weights are only calculated based on the cbr layers in the decoder
            if weight_type == 'cbr':
                tensor = activation['cbr']
            # when the weights are only calculated based on the conv_last layers in the decoder
            if weight_type == 'conv_last':
                tensor = activation['conv_last']
            '''
            if len(activation.keys()) == 1:
                tensor = activation[list(activation.keys())[0]]
            # need to fix
            # stack layer's weights
            if "-stack" in hidden_wt:
                tmp = 1
                for k, v in activation.items():
                    if tmp == 1:
                        tensor = v
                        if not isinstance(tensor, list):
                            tensor = [tensor]
                        #print("tensor size: {}".format(len(tensor)))
                        tmp += 1
                    else:
                        if isinstance(v, list):
                            tensor.extend(v)
                        else:
                            tensor.extend([v])
                        #print("tensor size: {}; appended {}".format(len(tensor), k))
                    '''
                    if isinstance(v, list):
                        for i in range(len(v)):
                            print(" {} shape: {}".format(i, v[i].shape))
                    else:
                        print(" shape: {}".format(v.shape))
                    '''
            # weights = cal_weight(tensor, l)
            eta_weights = cal_weight(seq_pred, l)
            '''
            for encoder hidden layers, hrnet's output is a list of tensors.
            calculate similarity weights for each of them, then "mean stack"
            '''
            hidden_weights = []
            if isinstance(tensor, list):
                tmp = []
                for i in range(len(tensor)):
                    tmp.append(cal_weight(tensor[i], l))
                zipped_weights = zip(*tmp)
                for w in zipped_weights:
                    hidden_weights.append(torch.mean(torch.stack(w)))
            else:
                hidden_weights = cal_weight(tensor, l)
            # when the weights are only calculated based on the predictions of the network and conv/cbr layers in the decoder
            '''
            -eta means combine seq_pred based weights and layer based weights
            '''
            #if weight_type == 'eta-conv' or weight_type == 'eta-cbr':
            if "-eta" in hidden_wt:
                #eta_weights = weights
                #weights = []
                '''
                if weight_type == 'eta-conv':
                    tensor = activation['conv_last']
                if weight_type == 'eta-cbr':
                    tensor = activation['cbr']
                decoder_weights = cal_weight(tensor, l)
                '''
                # Combine prediction-based and hidden-layer-based weights.
                zipped_weights = zip(eta_weights, hidden_weights)
                for w in zipped_weights:
                    weights.append(torch.mean(torch.stack(w)))
            else:
                weights = hidden_weights
            #import bpython
            #bpython.embed(locals())
            #exit()
            weighted_losses = [a * b for a, b in zip(losses, weights)]
            #instead of averaging the loss for all sup and unsup togather, I separated them
            unsup_weighted_losses = []
            sup_weighted_losses = []
            for i in range(len(weighted_losses)):
                if i % seq_len != 0:
                    unsup_weighted_losses.append(weighted_losses[i])
                else:
                    sup_weighted_losses.append(weighted_losses[i])
            # NOTE(review): if either list is empty the matching *_loss name is
            # never bound and the lines below raise NameError — confirm inputs
            # guarantee at least one of each.
            if len(unsup_weighted_losses) >= 1:
                unsup_loss = torch.mean(torch.stack(unsup_weighted_losses))
            if len(sup_weighted_losses) >= 1:
                sup_loss = torch.mean(torch.stack(sup_weighted_losses))
            ##elif len(unsup_weighted_losses) == 1:
            ##    unsup_loss = unsup_weighted_losses[0]
            '''
            ## for mse
            if len(mse_losses) > 0:
                mse_loss = torch.mean(torch.stack(mse_losses))
                loss += sup_loss + mse_loss * epoch_weight
            '''
            unsup_loss = unsup_loss * epoch_weight
            loss += sup_loss + unsup_loss

        if self.deep_sup_scale is not None:
            loss_deepsup = self.crit(sup_pred_deepsup, sup_feed_dict['seg_label'])
            loss = loss + loss_deepsup * self.deep_sup_scale

        acc = self.pixel_acc(sup_pred, sup_feed_dict['seg_label'])
        return loss, acc  #, weights, unsup_weighted_losses, unsup_loss
    # inference
    else:
        pred = self.decoder(self.encoder(sup_feed_dict['img_data'],
                                         return_feature_maps=True),
                            segSize=segSize)
        return pred
def main(): #cmd and arg parser parser = argparse.ArgumentParser() arg = parser.add_argument arg('--mode', choices=['train', 'validate', 'predict_valid', 'predict_test'], default='train') arg('--run_root', default='result/furniture_bbox') arg('--fold', type=int, default=0) arg('--model', default='inception_v4') arg('--ckpt', type=str, default='model_loss_best.pt') arg('--pretrained', type=str, default='imagenet') #resnet 1, resnext imagenet arg('--batch-size', type=int, default=8) arg('--step', type=str, default=8) arg('--workers', type=int, default=16) arg('--lr', type=float, default=3e-4) arg('--patience', type=int, default=4) arg('--clean', action='store_true') arg('--n-epochs', type=int, default=120) arg('--epoch-size', type=int) arg('--tta', type=int, default=1) arg('--use-sample', action='store_true', help='use a sample of the dataset') arg('--debug', action='store_true') arg('--limit', type=int) arg('--imgsize', type=int, default=256) arg('--finetuning', action='store_true') #cuda version T/F use_cuda = cuda.is_available() args = parser.parse_args() #run_root: model/weights root run_root = Path(args.run_root) #csv for train/test/validate [id,attribute_id,fold,data] # folds = pd.read_csv('train_val_test_furniture.csv') folds = pd.read_csv('train_val_test_render.csv') #Not used @this version... 
train_root = TR_DATA_ROOT valid_root = TT_DATA_ROOT #split train/valid fold train_fold = folds[folds['fold'] == 0] valid_fold = folds[folds['fold'] == 2] #limit the size of train/valid data #W::Do not use it because the limited size of training data may not contain whole class if args.limit: train_fold = train_fold[:args.limit] valid_fold = valid_fold[:args.limit] ##::DataLoader def make_loader(df: pd.DataFrame, root, image_transform, name='train') -> DataLoader: if name == 'train': return DataLoader( TrainDatasetBatchAug_BG_4_BBox(root, df, debug=args.debug, name=name, imgsize=args.imgsize, class_num=N_CLASSES), shuffle=True, batch_size=args.batch_size, num_workers=args.workers, collate_fn=collate_TrainDatasetBatchAug_BG_4_BBox) else: return DataLoader( TrainDatasetBatchAug_BG_4_BBox(root, df, debug=args.debug, name=name, imgsize=args.imgsize, class_num=N_CLASSES), shuffle=True, batch_size=args.batch_size, num_workers=args.workers, collate_fn=collate_TrainDatasetBatchAug_BG_4_BBox) #Not used in this version # criterion = nn.BCEWithLogitsLoss(reduction='none') criterion = nn.MSELoss(reduce='none') # criterion = nn.CrossEnropyLoss(reduction='none) # se- ception dpn can only use finetuned model from imagenet if args.finetuning: base_model_class = OLD_N_CLASSES else: base_model_class = N_CLASSES if 'se' not in args.model and 'ception' not in args.model and 'dpn' not in args.model: # model=> models.py model = getattr(models, args.model)(num_classes=base_model_class, pretrained=args.pretrained) else: model = getattr(models, args.model)(num_classes=base_model_class, pretrained='imagenet') #finetune::load model with old settings first and then change the last layer for new task! 
if args.finetuning: print('Doing finetune initial...') # load_par_gpu_model_gpu(model, Path(str(run_root) + '/' + 'model_base.initial') ) # load_model(model, Path(str(run_root) + '/' + 'model_base.initial')) # model.finetuning(N_CLASSES) # load_model_ex_inceptionv4(model, Path(str(run_root) + '/' + 'max_valid_model.pth')) model.finetune(str(run_root) + '/' + 'max_valid_model_7146.pth') # model.freeze_clsnet() model.freeze_net() # model.freeze_attnet_radius() else: model.initial() md_path = Path(str(run_root) + '/' + args.ckpt) if md_path.exists(): print('load weights from md_path') load_model(model, md_path) # model.freeze_net() ##params::Add here #params list[models.parameters()] # all_params = list(model.parameters()) all_params = filter(lambda p: p.requires_grad, model.parameters()) #apply parallel gpu if available # model = torch.nn.DataParallel(model) #gpu first if use_cuda: model = model.cuda() #print(model) if args.mode == 'train': if run_root.exists() and args.clean: shutil.rmtree(run_root) run_root.mkdir(exist_ok=True, parents=True) Path(str(run_root) + '/params.json').write_text( json.dumps(vars(args), indent=4, sort_keys=True)) train_loader = make_loader(train_fold, train_root, train_transform, name='train') valid_loader = make_loader(valid_fold, valid_root, test_transform, name='valid') print(f'{len(train_loader.dataset):,} items in train, ' f'{len(valid_loader.dataset):,} in valid') train_kwargs = dict( args=args, model=model, criterion=criterion, train_loader=train_loader, valid_loader=valid_loader, patience=args.patience, init_optimizer=lambda params, lr: Adam( params, lr, betas=(0.9, 0.999), eps=1e-08, weight_decay=2e-4), use_cuda=use_cuda, ) train(params=all_params, **train_kwargs) elif args.mode == 'validate': valid_loader = make_loader(valid_fold, valid_root, image_transform=test_transform, name='valid') # if args.finetuning: # pass # else: # load_model(model, Path(str(run_root) + '/' + args.ckpt)) # model.set_infer_mode() validation(model, 
criterion, tqdm.tqdm(valid_loader, desc='Validation'), use_cuda=use_cuda)
def __init__(self): """ LSGAN loss. """ super().__init__() self.loss_fn = nn.MSELoss()
conv3_out = self.conv3(conv2_out) conv4_out = self.conv4(conv3_out) conv5_out = self.conv5(conv4_out) # Flatten the result res = conv5_out.view(conv5_out.size(0), -1) out = self.dense(res) return out device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu") net = AlexNet().to(device) # Create an optimizer optimizer = optim.Adam(net.parameters(), lr=LEARNING_RATE) # Set the loss function loss_function = nn.MSELoss().to(device) # Seperate X and y. PS: this step is very slow # X: data # y: labels print("Start to extract data and labels...") X = torch.Tensor([i[0] for i in training_data]).to(device) X = X / 255.0 y = torch.Tensor([i[1] for i in training_data]).to(device) print("Data and labels are extracted.") train_size = int(len(X) * TRAIN_PCT) train_X = X[:train_size] train_y = y[:train_size]
def train(args_job, output_dir_job, output_dir, return_dict):
    """Train a latent-space EBM prior plus generator (alternating updates).

    Builds data loaders, the wrapped networks, two Adam optimizers (EBM /
    generator) with exponential LR decay, then runs the epoch loop with
    periodic logging, stats plotting, sample plotting and checkpointing.
    Final metrics are written into `return_dict['stats']`.
    """
    #################################################
    ## preamble
    args = parse_args()
    args = pygrid.overwrite_opt(args, args_job)
    args = to_named_dict(args)
    # set_gpu(args.device)
    set_cuda(deterministic=args.gpu_deterministic)
    set_seed(args.seed)
    makedirs_exp(output_dir)

    job_id = int(args['job_id'])
    logger = setup_logging('job{}'.format(job_id), output_dir, console=True)
    logger.info(args)

    device = torch.device('cuda:{}'.format(args.device) if torch.cuda.is_available() else 'cpu')

    #################################################
    ## data

    ds_train, ds_val = get_dataset(args)
    logger.info('len(ds_train)={}'.format(len(ds_train)))
    logger.info('len(ds_val)={}'.format(len(ds_val)))

    dataloader_train = torch.utils.data.DataLoader(ds_train, batch_size=args.batch_size, shuffle=True, num_workers=0)
    dataloader_val = torch.utils.data.DataLoader(ds_val, batch_size=args.batch_size, shuffle=True, num_workers=0)

    assert len(ds_train) >= args.n_fid_samples
    # Map [-1, 1] images to [0, 1].
    to_range_0_1 = lambda x: (x + 1.) / 2.
    # ds_fid = np.array(torch.stack([to_range_0_1(torch.tensor(ds_train[i][0])) for i in range(args.n_fid_samples)]).cpu().numpy())
    # logger.info('ds_fid.shape={}'.format(ds_fid.shape))
    # NOTE(review): ds_fid is left empty (FID precomputation commented out);
    # get_fid() below would fail on `ds_fid.shape` if its disabled call sites
    # were ever re-enabled.
    ds_fid = []

    def plot(p, x):
        # Save a clamped image grid to path p.
        return torchvision.utils.save_image(torch.clamp(x, -1., 1.), p, normalize=True, nrow=int(np.sqrt(args.batch_size)))

    #################################################
    ## model

    if args.gpu_multi:
        net = torch.nn.DataParallel(_netWrapper(args).to(device), device_ids=[0,1,2,3])
    else:
        net = _netWrapper(args).to(device)

    def eval_flag():
        net.eval()

    def train_flag():
        net.train()

    def energy(score):
        # Transform the raw EBM score according to the configured energy form.
        if args.e_energy_form == 'tanh':
            energy = F.tanh(-score.squeeze())
        elif args.e_energy_form == 'sigmoid':
            energy = F.sigmoid(score.squeeze())
        elif args.e_energy_form == 'identity':
            energy = score.squeeze()
        elif args.e_energy_form == 'softplus':
            energy = F.softplus(score.squeeze())
        return energy

    # Sum-reduced reconstruction loss; divided by batch size below.
    mse = nn.MSELoss(reduction='sum')

    #################################################
    ## optimizer

    # net_resolve exposes .netE/.netG whether or not DataParallel wraps net.
    if args.gpu_multi:
        net_resolve = net.module
    else:
        net_resolve = net

    optE = torch.optim.Adam(net_resolve.netE.parameters(), lr=args.e_lr, weight_decay=args.e_decay, betas=(args.e_beta1, args.e_beta2))
    optG = torch.optim.Adam(net_resolve.netG.parameters(), lr=args.g_lr, weight_decay=args.g_decay, betas=(args.g_beta1, args.g_beta2))
    lr_scheduleE = torch.optim.lr_scheduler.ExponentialLR(optE, args.e_gamma)
    lr_scheduleG = torch.optim.lr_scheduler.ExponentialLR(optG, args.g_gamma)

    #################################################
    ## ckpt

    epoch_ckpt = 0
    if args.load_ckpt:
        ckpt = torch.load(args.load_ckpt, map_location='cuda:{}'.format(args.device))
        net_resolve.netE.load_state_dict(ckpt['netE'])
        optE.load_state_dict(ckpt['optE'])
        net_resolve.netG.load_state_dict(ckpt['netG'])
        optG.load_state_dict(ckpt['optG'])
        # NOTE(review): resume epoch is hard-coded rather than read from the
        # checkpoint's 'epoch' field — confirm this is intentional.
        epoch_ckpt = 76

    #################################################
    ## sampling

    def sample_p_0(n=args.batch_size, sig=args.e_init_sig):
        # Gaussian initialization for latent chains, shape (n, nz, 1, 1).
        return sig * torch.randn(*[n, args.nz, 1, 1]).to(device)

    #################################################
    ## fid

    def get_fid(n):
        # Generate n samples from the prior chain and score FID against ds_fid.
        assert n <= ds_fid.shape[0]
        logger.info('computing fid with {} samples'.format(n))
        try:
            eval_flag()

            def sample_x():
                z_0 = sample_p_0().to(device)
                z_k = net(Variable(z_0), prior=True)
                x_samples = to_range_0_1(net_resolve.netG(z_k)).clamp(min=0., max=1.).detach().cpu()
                return x_samples

            x_samples = torch.cat([sample_x() for _ in range(int(n / args.batch_size))]).numpy()
            fid = compute_fid_nchw(args, ds_fid[:n], x_samples)
            return fid
        except Exception as e:
            print(e)
            logger.critical(e, exc_info=True)
            logger.info('FID failed')
        finally:
            train_flag()

    # get_fid(n=args.batch_size)

    #################################################
    ## train

    train_flag()

    fid = 0.0
    fid_best = math.inf

    def normalize(x):
        # uint8 [0, 255] -> float [-1, 1]
        return ((x.float() / 255.) - .5) * 2.

    # Fixed latent/images reused for comparable sample plots across epochs.
    z_fixed = sample_p_0()
    x_fixed = normalize(next(iter(dataloader_train))[0]).to(device)

    stats = {
        'loss_g':[],
        'loss_e':[],
        'en_neg':[],
        'en_pos':[],
        'grad_norm_g':[],
        'grad_norm_e':[],
        'z_e_grad_norm':[],
        'z_g_grad_norm':[],
        'z_e_k_grad_norm':[],
        'fid':[],
    }
    interval = []

    for epoch in range(epoch_ckpt, args.n_epochs):
        for i, (x, y) in enumerate(dataloader_train, 0):
            train_flag()

            x = normalize(x).to(device)
            batch_size = x.shape[0]

            # Initialize chains
            z_g_0 = sample_p_0(n=batch_size)
            z_e_0 = sample_p_0(n=batch_size)

            # Langevin posterior and prior
            z_g_k = net(Variable(z_g_0), x, prior=False)
            z_e_k = net(Variable(z_e_0), prior=True)

            # Learn generator
            optG.zero_grad()
            x_hat = net_resolve.netG(z_g_k.detach())
            loss_g = mse(x_hat, x) / batch_size
            loss_g.backward()
            # grad_norm_g = get_grad_norm(net.netG.parameters())
            # if args.g_is_grad_clamp:
            #     torch.nn.utils.clip_grad_norm(net.netG.parameters(), opt.g_max_norm)
            optG.step()

            # Learn prior EBM: contrastive divergence between posterior and
            # prior samples.
            optE.zero_grad()
            en_neg = energy(net_resolve.netE(z_e_k.detach())).mean()  # TODO(nijkamp): why mean() here and in Langevin sum() over energy? constant is absorbed into Adam adaptive lr
            en_pos = energy(net_resolve.netE(z_g_k.detach())).mean()
            loss_e = en_pos - en_neg
            loss_e.backward()
            # grad_norm_e = get_grad_norm(net.netE.parameters())
            # if args.e_is_grad_clamp:
            #     torch.nn.utils.clip_grad_norm_(net.netE.parameters(), args.e_max_norm)
            optE.step()

            # Printout
            if i % args.n_printout == 0:
                with torch.no_grad():
                    x_0 = net_resolve.netG(z_e_0)
                    x_k = net_resolve.netG(z_e_k)

                    en_neg_2 = energy(net_resolve.netE(z_e_k)).mean()
                    en_pos_2 = energy(net_resolve.netE(z_g_k)).mean()

                    prior_moments = '[{:8.2f}, {:8.2f}, {:8.2f}]'.format(z_e_k.mean(), z_e_k.std(), z_e_k.abs().max())
                    posterior_moments = '[{:8.2f}, {:8.2f}, {:8.2f}]'.format(z_g_k.mean(), z_g_k.std(), z_g_k.abs().max())

                    logger.info('{} {:5d}/{:5d} {:5d}/{:5d} '.format(job_id, epoch, args.n_epochs, i, len(dataloader_train)) +
                                'loss_g={:8.3f}, '.format(loss_g) +
                                'loss_e={:8.3f}, '.format(loss_e) +
                                'en_pos=[{:9.4f}, {:9.4f}, {:9.4f}], '.format(en_pos, en_pos_2, en_pos_2-en_pos) +
                                'en_neg=[{:9.4f}, {:9.4f}, {:9.4f}], '.format(en_neg, en_neg_2, en_neg_2-en_neg) +
                                '|z_g_0|={:6.2f}, '.format(z_g_0.view(batch_size, -1).norm(dim=1).mean()) +
                                '|z_g_k|={:6.2f}, '.format(z_g_k.view(batch_size, -1).norm(dim=1).mean()) +
                                '|z_e_0|={:6.2f}, '.format(z_e_0.view(batch_size, -1).norm(dim=1).mean()) +
                                '|z_e_k|={:6.2f}, '.format(z_e_k.view(batch_size, -1).norm(dim=1).mean()) +
                                'z_e_disp={:6.2f}, '.format((z_e_k-z_e_0).view(batch_size, -1).norm(dim=1).mean()) +
                                'z_g_disp={:6.2f}, '.format((z_g_k-z_g_0).view(batch_size, -1).norm(dim=1).mean()) +
                                'x_e_disp={:6.2f}, '.format((x_k-x_0).view(batch_size, -1).norm(dim=1).mean()) +
                                'prior_moments={}, '.format(prior_moments) +
                                'posterior_moments={}, '.format(posterior_moments) +
                                'fid={:8.2f}, '.format(fid) +
                                'fid_best={:8.2f}'.format(fid_best))

        # Schedule
        # NOTE(review): passing epoch= to scheduler.step() is deprecated in
        # newer torch releases — confirm the pinned torch version.
        lr_scheduleE.step(epoch=epoch)
        lr_scheduleG.step(epoch=epoch)

        # Stats
        if epoch % args.n_stats == 0:
            stats['loss_g'].append(loss_g.item())
            stats['loss_e'].append(loss_e.item())
            stats['en_neg'].append(en_neg.data.item())
            stats['en_pos'].append(en_pos.data.item())
            # grad-norm tracking is disabled; zeros keep the plot columns aligned.
            stats['grad_norm_g'].append(0)
            stats['grad_norm_e'].append(0)
            stats['z_g_grad_norm'].append(0)
            stats['z_e_grad_norm'].append(0)
            stats['z_e_k_grad_norm'].append(0)
            stats['fid'].append(fid)
            interval.append(epoch + 1)
            plot_stats(output_dir, stats, interval)

        # Metrics (currently disabled via `if False`)
        if False and epoch % args.n_metrics == 0:
            fid = get_fid(n=len(ds_fid))
            if fid < fid_best:
                fid_best = fid
            logger.info('fid={}'.format(fid))

        # Plot
        if epoch % args.n_plot == 0:
            batch_size_fixed = x_fixed.shape[0]
            z_g_0 = sample_p_0(n=batch_size_fixed)
            z_e_0 = sample_p_0(n=batch_size_fixed)
            z_g_k = net(Variable(z_g_0), x_fixed)
            z_e_k = net(Variable(z_e_0), prior=True)
            with torch.no_grad():
                plot('{}/samples/{:>06d}_{:>06d}_x_fixed.png'.format(output_dir, epoch, i), x_fixed)
                plot('{}/samples/{:>06d}_{:>06d}_x_fixed_hat.png'.format(output_dir, epoch, i), net_resolve.netG(z_g_k))
                plot('{}/samples/{:>06d}_{:>06d}_x_z_neg_0.png'.format(output_dir, epoch, i), net_resolve.netG(z_e_0))
                plot('{}/samples/{:>06d}_{:>06d}_x_z_neg_k.png'.format(output_dir, epoch, i), net_resolve.netG(z_e_k))
                plot('{}/samples/{:>06d}_{:>06d}_x_z_fixed.png'.format(output_dir, epoch, i), net_resolve.netG(z_fixed))

        # Ckpt
        if epoch > 0 and epoch % args.n_ckpt == 0:
            save_dict = {
                'epoch': epoch,
                'net': net.state_dict(),
                'optE': optE.state_dict(),
                'optG': optG.state_dict(),
            }
            torch.save(save_dict, '{}/ckpt/ckpt_{:>06d}.pth'.format(output_dir, epoch))

        # Early exit (both conditions currently disabled via `if False`)
        if False and epoch > 10 and loss_g > 500:
            logger.info('early exit condition 1: epoch > 10 and loss_g > 500')
            return_dict['stats'] = {'fid_best': fid_best, 'fid': fid, 'mse': loss_g.data.item()}
            return
        if False and epoch > 20 and fid > 100:
            logger.info('early exit condition 2: epoch > 20 and fid > 100')
            return_dict['stats'] = {'fid_best': fid_best, 'fid': fid, 'mse': loss_g.data.item()}
            return

    return_dict['stats'] = {'fid_best': fid_best, 'fid': fid, 'mse': loss_g.data.item()}
    logger.info('done')
def train(args, trial, is_train=True, study=None):
    """Train an xTS model jointly with an adversarial speaker classifier.

    Builds train/valid loaders for ``args.dataset``/``args.filelist``, trains
    ``model`` together with ``model_speaker`` (speaker loss weighted by λ),
    validates periodically through an ignite evaluator, and returns the last
    validation reconstruction loss.  When ``is_train`` is true, the final
    model and periodic best checkpoints are written under ``output/models``.

    Parameters
    ----------
    args : namespace with ``hparams``, ``dataset``, ``filelist``,
        ``batch_size``, ``model_path``, ``verbose``, ``max_epochs``.
    trial : hyper-parameter trial; ``trial.parameters`` must contain ``lr``
        and any hparams overrides.
    is_train : bool. When false, no checkpoints are saved.
    study : optional study object used to stop underperforming trials.
    """
    hparams = HPARAMS[args.hparams]
    if hparams.model_type in {"bjorn"}:
        # This model variant consumes an extra speaker-embedding tensor per batch.
        Dataset = src.dataset.xTSDatasetSpeakerIdEmbedding

        def prepare_batch(batch, device, non_blocking):
            # Move every element of the batch to `device`; the model input is
            # (x, y, embedding) and the supervised target is y.
            for i in range(len(batch)):
                batch[i] = batch[i].to(device)
            batch_x, batch_y, _, emb = batch
            return (batch_x, batch_y, emb), batch_y
    else:
        Dataset = src.dataset.xTSDatasetSpeakerId
        prepare_batch = prepare_batch_3

    # Datasets and loaders -------------------------------------------------
    train_path_loader = PATH_LOADERS[args.dataset](ROOT, args.filelist + "-train")
    valid_path_loader = PATH_LOADERS[args.dataset](ROOT, args.filelist + "-valid")

    train_dataset = Dataset(hparams, train_path_loader, transforms=TRAIN_TRANSFORMS)
    valid_dataset = Dataset(hparams, valid_path_loader, transforms=VALID_TRANSFORMS)

    kwargs = dict(batch_size=args.batch_size, collate_fn=collate_fn)
    train_loader = torch.utils.data.DataLoader(train_dataset, shuffle=True, **kwargs)
    valid_loader = torch.utils.data.DataLoader(valid_dataset, shuffle=False, **kwargs)

    num_speakers = len(train_path_loader.speaker_to_id)
    dataset_parameters = DATASET_PARAMETERS[args.dataset]
    dataset_parameters["num_speakers"] = num_speakers

    # Trial parameters override the base hyper-parameters.
    hparams = update_namespace(hparams, trial.parameters)

    # Models ---------------------------------------------------------------
    model = MODELS[hparams.model_type](dataset_parameters, hparams)
    model_speaker = MODELS_SPEAKER[hparams.model_speaker_type](
        hparams.encoder_embedding_dim, num_speakers)

    if model.speaker_info is SpeakerInfo.EMBEDDING and hparams.embedding_normalize:
        model.embedding_stats = train_dataset.embedding_stats

    if hparams.drop_frame_rate:
        # Dropped frames are replaced by the dataset-wide mel mean
        # (computed once and cached on disk).
        path_mel_mean = os.path.join("output", "mel-mean", f"{args.dataset}-{args.filelist}.npz")
        mel_mean = cache(compute_mel_mean, path_mel_mean)(train_dataset)["mel_mean"]
        mel_mean = torch.tensor(mel_mean).float().to(DEVICE)
        mel_mean = mel_mean.unsqueeze(0).unsqueeze(0)
        model.decoder.mel_mean = mel_mean

    model_name = f"{args.dataset}_{args.filelist}_{args.hparams}_revgrad"
    model_path = f"output/models/{model_name}.pth"

    # Initialize model from existing one.
    if args.model_path is not None:
        model.load_state_dict(torch.load(args.model_path, map_location=DEVICE))
    # if hasattr(hparams, "model_speaker_path"):
    #     model_speaker.load_state_dict(torch.load(hparams.model_speaker_path))

    # One optimizer updates both the main model and the speaker classifier.
    optimizer = torch.optim.Adam(
        list(model.parameters()) + list(model_speaker.parameters()),
        lr=trial.parameters["lr"],
    )  # 0.001
    # optimizer_speaker = torch.optim.Adam(model_speaker.parameters(), lr=0.001)

    mse_loss = nn.MSELoss()

    def loss_reconstruction(pred, true):
        # The model returns two predictions (e.g. pre/post-net outputs);
        # both are regressed against the same target.
        pred1, pred2 = pred
        return mse_loss(pred1, true) + mse_loss(pred2, true)

    # Weight of the adversarial speaker loss relative to reconstruction.
    λ = 0.0002

    model.to(DEVICE)
    model_speaker.to(DEVICE)

    def step(engine, batch):
        """One optimization step: reconstruction + λ · speaker loss."""
        model.train()
        model_speaker.train()
        x, y = prepare_batch(batch, device=DEVICE, non_blocking=True)
        # batch[2] is assumed to hold the speaker ids — TODO confirm against Dataset.
        i = batch[2].to(DEVICE)
        y_pred, z = model.forward_emb(x)
        i_pred = model_speaker.forward(z)
        loss_r = loss_reconstruction(y_pred, y)  # reconstruction
        loss_s = F.nll_loss(i_pred, i)  # speaker
        loss = loss_r + λ * loss_s
        optimizer.zero_grad()
        loss.backward(retain_graph=True)
        optimizer.step()
        return {
            "loss": loss.item(),
            "loss-reconstruction": loss_r.item(),
            "loss-speaker": loss_s.item(),
        }

    trainer = engine.Engine(step)
    # trainer = engine.create_supervised_trainer(
    #     model, optimizer, loss, device=device, prepare_batch=prepare_batch
    # )
    # The evaluator scores only the reconstruction loss (no speaker term).
    evaluator = engine.create_supervised_evaluator(
        model,
        metrics={"loss": ignite.metrics.Loss(loss_reconstruction)},
        device=DEVICE,
        prepare_batch=prepare_batch,
    )

    @trainer.on(engine.Events.ITERATION_COMPLETED)
    def log_training_loss(trainer):
        print(
            "Epoch {:3d} | Loss gen.: {:+8.6f} = {:8.6f} + λ * {:8.6f}".format(
                trainer.state.epoch,
                trainer.state.output["loss"],
                trainer.state.output["loss-reconstruction"],
                trainer.state.output["loss-speaker"],
            ))

    @trainer.on(engine.Events.ITERATION_COMPLETED(every=EVERY_K_ITERS))
    def log_validation_loss(trainer):
        # Validation runs every EVERY_K_ITERS training iterations.
        evaluator.run(valid_loader)
        metrics = evaluator.state.metrics
        print("Epoch {:3d} Valid loss: {:8.6f} ←".format(
            trainer.state.epoch, metrics["loss"]))

    lr_reduce = lr_scheduler.ReduceLROnPlateau(optimizer, verbose=args.verbose, **LR_REDUCE_PARAMS)

    @evaluator.on(engine.Events.COMPLETED)
    def update_lr_reduce(engine):
        # Step the plateau scheduler on each validation loss.
        loss = engine.state.metrics["loss"]
        lr_reduce.step(loss)

    @evaluator.on(engine.Events.COMPLETED)
    def terminate_study(engine):
        """Stops underperforming trials."""
        if study and study.should_trial_stop(trial=trial):
            trainer.terminate()

    def score_function(engine):
        # Higher is better for ignite handlers, hence the negated loss.
        return -engine.state.metrics["loss"]

    early_stopping_handler = ignite.handlers.EarlyStopping(
        patience=PATIENCE, score_function=score_function, trainer=trainer)
    evaluator.add_event_handler(engine.Events.COMPLETED, early_stopping_handler)

    if is_train:

        def global_step_transform(*args):
            # Checkpoints are indexed by validation round, not raw iteration.
            return trainer.state.iteration // EVERY_K_ITERS

        checkpoint_handler = ignite.handlers.ModelCheckpoint(
            "output/models/checkpoints",
            model_name,
            score_name="objective",
            score_function=score_function,
            n_saved=5,
            require_empty=False,
            create_dir=True,
            global_step_transform=global_step_transform,
        )
        evaluator.add_event_handler(engine.Events.COMPLETED, checkpoint_handler,
                                    {"model": model})

    trainer.run(train_loader, max_epochs=args.max_epochs)

    if is_train:
        torch.save(model.state_dict(), model_path)
        print("Last model @", model_path)
        model_best_path = link_best_model(model_name)
        print("Best model @", model_best_path)

    # Loss from the most recent evaluator run.
    return evaluator.state.metrics["loss"]
netE.load_state_dict(torch.load(opt.netE)) netE.apply(model_New.weights_init) print(netE) netR = model_New.MLP_RNN_ThreeLSTM(opt) if opt.netR != '': netR.load_state_dict(torch.load(opt.netR)) #netR.apply(model_New.weights_init) print(netR) # classification loss, Equation (4) of the paper cls_criterion = nn.NLLLoss() lstm_criterion = nn.CrossEntropyLoss() ## loss for image-text matching sim_criterion = nn.CosineEmbeddingLoss() reconst_criterion = nn.MSELoss() binary_criterion = nn.BCEWithLogitsLoss() ## input_wordID = torch.LongTensor( opt.batch_size, opt.max_seq_length) ## assume max of sentence length is 30 target_wordID = torch.LongTensor( opt.batch_size, opt.max_seq_length) ## assume max of sentence length is 30 input_img_index = torch.LongTensor(opt.batch_size) input_cap_len = torch.LongTensor(opt.batch_size) input_res = torch.FloatTensor(opt.batch_size, opt.resSize) input_att = torch.FloatTensor(opt.batch_size, opt.attSize) input_att_confuse = torch.FloatTensor(opt.batch_size, opt.nclass_all) input_att_binary = torch.FloatTensor(opt.batch_size, opt.attSize) input_label = torch.LongTensor(opt.batch_size) one = torch.FloatTensor([1]) mone = one * -1
def trainValidateNet(train_dataloader, valid_dataloader, neural_net, learning_rate,
                     num_epochs, neural_net_folderPath, iter_num, device):
    """
    Forward MLP training and validation.

    Parameters:
    ----------
        train_dataloader: Tensor dataloader.
            Training dataset (must yield at least one batch per epoch).
        valid_dataloader: Tensor dataloader.
            Validation dataset (must yield at least one batch per epoch).
        neural_net: MLP model.
        learning_rate: Float.
            Specify a value typically less than 1.
        num_epochs: Int.
            Total number of training epochs.
        neural_net_folderPath: String.
            The directory to save the eventual trained ANN.
        iter_num: Int.
            The number of the current iteration.
        device: CPU/GPU.

    Returns:
    ----------
        neural_net: Trained MLP (same object, trained in place).
        lossList_train: List. The mean training loss of each epoch.
        lossList_valid: List. The mean validation loss of each epoch.
    """
    # Define criterion and optimizer
    criterion = nn.MSELoss()
    optimizer = torch.optim.Adam(neural_net.parameters(), learning_rate)

    # Per-epoch mean losses for the training and validation passes.
    lossList_train, lossList_valid = [], []

    for epoch in range(num_epochs):
        # --- Training pass ---
        loss_sum_train, iteration_num_train = 0.0, 0
        for displacements, weights in train_dataloader:
            # Forward fitting.  (torch.autograd.Variable is deprecated and a
            # no-op since PyTorch 0.4, so tensors are used directly.)
            x_train_batch = displacements.to(device)
            y_train_batch = weights.to(device)
            output = neural_net(x_train_batch)
            loss_train_temp = criterion(output, y_train_batch)

            # Back propagation
            optimizer.zero_grad()
            loss_train_temp.backward()
            optimizer.step()

            # .item() detaches the scalar loss; avoids the old
            # .cpu().data.numpy() round-trip through NumPy.
            loss_sum_train += loss_train_temp.item()
            iteration_num_train += 1
        lossList_train.append(loss_sum_train / iteration_num_train)

        # --- Validation pass (no gradients needed) ---
        loss_sum_valid, iteration_num_valid = 0.0, 0
        with torch.no_grad():
            for displacements, weights in valid_dataloader:
                x_valid_batch = displacements.to(device)
                y_valid_batch = weights.to(device)
                output = neural_net(x_valid_batch)
                loss_valid_temp = criterion(output, y_valid_batch)
                loss_sum_valid += loss_valid_temp.item()
                iteration_num_valid += 1
        lossList_valid.append(loss_sum_valid / iteration_num_valid)

        print(
            "Iter: ", iter_num, "| Epoch: ", epoch,
            "| train loss: %.8f | valid loss: %.8f " %
            (loss_sum_train / iteration_num_train,
             loss_sum_valid / iteration_num_valid))

        # Save the model every 100 epochs.
        if (epoch + 1) % 100 == 0:
            ANN_savePath_temp = os.path.join(
                neural_net_folderPath,
                "ANN_" + str(int((epoch + 1) / 100)) + ".pkl")
            torch.save(neural_net.state_dict(), ANN_savePath_temp)

    # Save the final trained ANN model.
    torch.save(
        neural_net.state_dict(),
        os.path.join(neural_net_folderPath, "ANN_trained.pkl"))

    return neural_net, lossList_train, lossList_valid
def main():
    """Train the SMNet grayscale denoiser with Gaussian noise of level opt.noiseL.

    Logs loss/PSNR curves to a running visdom server, writes a text log per
    noise level, and checkpoints weights under ``weights/`` every epoch.
    Supports resuming from ``opt.checkpoint``.  Requires 4 visible GPUs
    (DataParallel over device_ids 0-3).
    """
    device = torch.device("cuda:0")
    model = SMNet(in_channels=1)
    model = torch.nn.DataParallel(model, device_ids=[0, 1, 2, 3]).to(device)
    optimier = optim.Adam(model.parameters(), lr=5e-4)
    # LR decays by 10x at epochs 10 and 40.
    scheduler = optim.lr_scheduler.MultiStepLR(optimier, milestones=[10, 40], gamma=0.1)
    criterion_sr = nn.MSELoss(reduction='mean')

    dataset_train = Dataset(train=True, data_root='train_gray_data.h5')
    loader_train = DataLoader(dataset_train, batch_size=opt.batchsize,
                              num_workers=8, shuffle=True, drop_last=True)
    # Number of batches per epoch (float; used only for plot x-coordinates).
    dataset_step = len(loader_train.dataset)/opt.batchsize
    dataset_val = Dataset(train=False)

    viz = visdom.Visdom()
    weight_path = "weights"

    if opt.checkpoint != 0:
        # Resume: restore weights and re-seed the visdom windows at the
        # resumed global step.
        model.load_state_dict(torch.load(os.path.join(weight_path, "model_gray_L%d_%d.pth" %(opt.noiseL,opt.checkpoint))))
        viz.line([0.001], [dataset_step * opt.checkpoint], win='gray_loss', opts=dict(title='gray_loss'))
        viz.line([26], [dataset_step * opt.checkpoint], win='gray_val_psnr', opts=dict(title='gray_val_psnr'))
        viz.line([26], [opt.checkpoint], win='gray_train_psnr', opts=dict(title='gray_train_psnr'))
        print('check point:%d' %(opt.checkpoint))
    if opt.checkpoint == 0:
        # Fresh run: initialize the same three visdom windows at step 0.
        viz.line([0.001], [dataset_step*opt.checkpoint], win='gray_loss', opts=dict(title='gray_loss'))
        viz.line([26], [dataset_step*opt.checkpoint], win='gray_val_psnr', opts=dict(title='gray_val_psnr'))
        viz.line([26], [opt.checkpoint], win='gray_train_psnr', opts=dict(title='gray_train_psnr'))

    global_step = dataset_step*opt.checkpoint
    model.train()
    for epoch in range(opt.checkpoint, opt.epochs):
        train_psnr_all = 0
        for step, x in enumerate(loader_train):
            x = x.to(device)
            # Synthesize the noisy input: additive Gaussian noise, sigma
            # opt.noiseL on the [0, 255] scale.
            noise = torch.FloatTensor(x.size()).normal_(mean=0, std=opt.noiseL / 255.).to(device)
            x_noise = x + noise
            x_hat = model(x_noise)
            loss_sr = criterion_sr(x, x_hat)
            loss = 1.0 * loss_sr
            optimier.zero_grad()
            loss.backward()
            optimier.step()

            # PSNR of the clamped reconstruction against the clean image.
            out_train = torch.clamp(x_hat, 0., 1.)
            psnr_train = batch_PSNR(out_train, x, 1.)
            train_psnr_all += psnr_train

            if step % 100 == 0 and step != 0:
                out_train = torch.clamp(x_hat, 0., 1.)
                psnr_train = batch_PSNR(out_train, x, 1.)
                print("[epoch %d][%d/%d] PSNR_train: %.6f \n all_loss: %.6f \n" %
                      (epoch + 1, step, dataset_step, psnr_train, loss.item()))
                viz.line([loss.item()], [global_step], win='gray_loss', update='append')
            if step % 400 == 0 and step != 0:
                # Mid-epoch validation; evaluate() presumably switches the
                # model to eval mode, hence the model.train() below — confirm.
                val_psnr = evaluate(model, dataset_val)
                print("****************************\nepoch:{} val_psnr:{}\n***************************************\n".format(
                    epoch + 1, val_psnr))
                viz.line([val_psnr], [global_step], win='gray_val_psnr', update='append')
                with open('log_gray_L%d.txt'%(opt.noiseL), 'a+') as f:
                    f.write("[epoch %d][%d/%d] PSNR_train: %.6f \n all_loss: %.6f \n" %
                            (epoch + 1, step, dataset_step, psnr_train, loss.item()))
                    f.write("****************************\nepoch:{} val_psnr:{}\n***************************************\n".format(
                        epoch + 1, val_psnr))
                model.train()
            global_step +=1

        if epoch % 1 == 0:  # i.e. every epoch
            print("****************************\nepoch:{} train_psnr:{}\n***************************************\n".format(
                epoch + 1, train_psnr_all/step))
            val_psnr = evaluate(model, dataset_val)
            print("****************************\nepoch:{} val_psnr:{}\n***************************************\n".format(
                epoch + 1, val_psnr))
            viz.line([val_psnr], [global_step], win='gray_val_psnr', update='append')
            viz.line([train_psnr_all/step], [epoch+1], win='gray_train_psnr', update='append')
            torch.save(model.state_dict(),
                       os.path.join(weight_path, "model_gray_L%d_%d.pth" %(opt.noiseL,epoch+1)))
        scheduler.step()
def demo_checkpoint(rank, world_size, use_ort_module): torch.manual_seed(rank) print(f"Running DDP checkpoint example on rank {rank}.") setup(rank, world_size) if use_ort_module: print(f" Rank {rank} uses ORTModule."); model = ToyModel().to(rank) model = ORTModule(model) else: print(f" Rank {rank} uses Pytorch's nn.Module."); model = ToyModel().to(rank) ddp_model = DDP(model, device_ids=[rank]) loss_fn = nn.MSELoss() optimizer = optim.SGD(ddp_model.parameters(), lr=0.001) CHECKPOINT_PATH = os.path.join(tempfile.gettempdir(), "model.checkpoint") if rank == 0: # All processes should see same parameters as they all start from same # random parameters and gradients are synchronized in backward passes. # Therefore, saving it in one process is sufficient. torch.save(ddp_model.state_dict(), CHECKPOINT_PATH) # Use a barrier() to make sure that process 1 loads the model after process # 0 saves it. dist.barrier() # configure map_location properly map_location = {'cuda:%d' % 0: 'cuda:%d' % rank} ddp_model.load_state_dict( torch.load(CHECKPOINT_PATH, map_location=map_location)) optimizer.zero_grad() outputs = ddp_model(torch.randn(20, 10)) labels = torch.randn(20, 5).to(rank) loss_fn = nn.MSELoss() loss = loss_fn(outputs, labels) loss.backward() optimizer.step() print(f"Rank {rank} sees loss {loss}") if rank == 0: assert torch.allclose(loss.cpu(), torch.FloatTensor([1.4909229278564453])) elif rank == 1: assert torch.allclose(loss.cpu(), torch.FloatTensor([1.0177688598632812])) elif rank == 2: assert torch.allclose(loss.cpu(), torch.FloatTensor([1.290669322013855])) elif rank == 3: assert torch.allclose(loss.cpu(), torch.FloatTensor([0.825118362903595])) else: assert False # Not necessary to use a dist.barrier() to guard the file deletion below # as the AllReduce ops in the backward pass of DDP already served as # a synchronization. if rank == 0: os.remove(CHECKPOINT_PATH) cleanup()
def do(seed):
    """Train paired state autoencoders with a shared latent-velocity predictor.

    Loads the two wedge datasets, builds per-dataset encoder/decoder/scaler
    models that share one encoded-velocity predictor, optimizes the combined
    reconstruction + latent-dynamics + nearest-neighbor loss, and saves the
    model dicts to ``./torch_model``.  ``seed`` fixes torch and numpy RNGs.
    """
    torch.random.manual_seed(seed)
    np.random.seed(seed)

    # NOTE: torch.load unpickles arbitrary objects — only safe on trusted files.
    dataset0 = torch.load("./data/wedges0.pkl")
    dataset1 = torch.load("./data/wedges1.pkl")
    data_dicts = [dataset0, dataset1]

    encoded_state_dim = 2
    identity = nn.Identity()
    softsign = nn.Softsign()
    parameters = []

    # Shared latent-dynamics model: predicts a velocity in latent space.
    encoded_velocity_predictor = NNet(encoded_state_dim, encoded_state_dim, softsign,
                                      hidden_dims=[256, 256])
    # Zero all weights except the last layer's (last two entries of the state
    # dict) so the predictor starts near the identity dynamics.
    state_dict = encoded_velocity_predictor.state_dict()
    for param_key in list(state_dict.keys())[:-2]:
        state_dict[param_key] *= 0
    encoded_velocity_predictor.load_state_dict(state_dict)
    parameters.extend(list(encoded_velocity_predictor.parameters()))

    # One encoder/decoder/scaler per dataset; the velocity predictor is shared.
    model_dicts = []
    for data_dict in data_dicts:
        _, state_dim = data_dict.get(DataKey.states).shape
        state_encoder = NNet(state_dim, encoded_state_dim, identity, hidden_dims=[256, 256])
        state_decoder = NNet(encoded_state_dim, state_dim, identity, hidden_dims=[256, 256])
        state_scaler_ = StandardScaler()
        state_scaler_.fit(data_dict.get(DataKey.states))
        state_scaler = ScalerWrapper(state_scaler_)
        model_dict = ModelDict()
        model_dict.set(ModelKey.state_encoder, state_encoder)
        model_dict.set(ModelKey.state_decoder, state_decoder)
        model_dict.set(ModelKey.encoded_velocity_predictor, encoded_velocity_predictor)
        model_dict.set(ModelKey.state_scaler, state_scaler)
        # model_dicts.append(model_container)
        model_dicts.append(model_dict)
        parameters.extend(list(state_encoder.parameters()))
        parameters.extend(list(state_decoder.parameters()))

    # Pipeline: scale -> encode states and next-states -> predict latent
    # velocity -> integrate (state + velocity) -> decode for reconstruction.
    tensor_collector = TensorCollector(TensorInserterSeq([
        TensorInserterTensorizeScaled(DataKey.states, ModelKey.state_scaler,
                                      TensorKey.states_tensor, torch.float),
        TensorInserterTensorizeScaled(DataKey.next_states, ModelKey.state_scaler,
                                      TensorKey.next_states_tensor, torch.float),
        TensorInserterForward(TensorKey.states_tensor, ModelKey.state_encoder,
                              TensorKey.encoded_states_tensor),
        TensorInserterForward(TensorKey.next_states_tensor, ModelKey.state_encoder,
                              TensorKey.encoded_next_states_tensor),
        TensorInserterForward(TensorKey.encoded_states_tensor,
                              ModelKey.encoded_velocity_predictor,
                              TensorKey.encoded_velocity_predictions_tensor),
        TensorInserterSum([TensorKey.encoded_states_tensor,
                           TensorKey.encoded_velocity_predictions_tensor],
                          TensorKey.encoded_next_state_predictions_tensor),
        TensorInserterForward(TensorKey.encoded_states_tensor, ModelKey.state_decoder,
                              TensorKey.decoded_states_tensor)
    ]), TensorListGetterOneToOne())

    mse_loss = nn.MSELoss()
    # Reconstruction (w=1), latent next-state prediction (w=1e4), and a
    # nearest-neighbor regularizer on the latent states (w=1).
    loss_calculator = LossCalculatorSum([
        LossCalculatorInputTarget(TensorKey.decoded_states_tensor,
                                  TensorKey.states_tensor, mse_loss, 1.),
        LossCalculatorInputTarget(TensorKey.encoded_next_state_predictions_tensor,
                                  TensorKey.encoded_next_states_tensor, mse_loss, 1e4),
        LossCalculatorNearestNeighborL2(TensorKey.encoded_states_tensor,
                                        TensorKey.origins_tensor, 1.)
    ])

    optimizer = RAdam(params=parameters, lr=3e-4)
    # Shadows the `optim` name locally (intentional here; no module of that
    # name is referenced below).
    optim = HeterogeneousLearner(loss_calculator, tensor_collector, optimizer,
                                 n_epochs=100)

    for episode in range(10):
        loss = optim.train_one_episode(data_dicts, model_dicts, batch_size=5000)
        print("Episode {:d}\tLoss: {:.10f}".format(episode, loss))
        # NOTE(review): reconstructed as a per-episode checkpoint because the
        # filename embeds `episode`; confirm the save was not meant to run
        # once after the loop instead.
        model_dict = dict()
        model_dict['wedges0'] = model_dicts[0]
        model_dict['wedges1'] = model_dicts[1]
        model_path = "./torch_model/wedges_model_containers_{:07d}_{:02d}.pkl".format(episode, seed)
        torch.save(model_dict, model_path)
        print("Saved models to {:s}".format(model_path))
def update_policy(self):
    """One MADDPG update over all agents: critic then actor for each agent.

    Returns (c_loss, a_loss): lists of per-agent critic / actor loss tensors,
    or (None, None) while still in the pure-exploration phase.  Every 30
    steps the target networks are soft-updated; every 300 steps weights are
    saved under ``parameter/``.
    """
    # do not train until exploration is enough
    if self.episode_done <= self.episodes_before_train:
        return None, None

    ByteTensor = th.cuda.ByteTensor if self.use_cuda else th.ByteTensor
    FloatTensor = th.cuda.FloatTensor if self.use_cuda else th.FloatTensor

    c_loss = []
    a_loss = []
    for agent in range(self.n_agents):
        # Each agent samples its own minibatch from the shared replay buffer.
        transitions = self.memory.sample(self.batch_size)
        batch = Experience(*zip(*transitions))
        # True where the transition is non-terminal (has a next state).
        # NOTE(review): ByteTensor masks are deprecated in modern PyTorch;
        # BoolTensor is the current idiom — behavior kept as-is here.
        non_final_mask = ByteTensor(
            list(map(lambda s: s is not None, batch.next_states)))
        # state_batch: batch_size x n_agents x dim_obs
        state_batch = th.stack(batch.states).type(FloatTensor)
        action_batch = th.stack(batch.actions).type(FloatTensor)
        reward_batch = th.stack(batch.rewards).type(FloatTensor)
        # : (batch_size_non_final) x n_agents x dim_obs
        non_final_next_states = th.stack([
            s for s in batch.next_states if s is not None
        ]).type(FloatTensor)

        # for current agent: centralized critic sees all agents' states/actions.
        whole_state = state_batch.view(self.batch_size, -1)
        whole_action = action_batch.view(self.batch_size, -1)

        # ----- Critic update -----
        self.critic_optimizer[agent].zero_grad()
        #print("whole_action",whole_action)
        current_Q = self.critics[agent](whole_state, whole_action)

        # Next actions come from the target actors of *all* agents.
        non_final_next_actions = [
            self.actors_target[i](non_final_next_states[:, i, :])
            for i in range(self.n_agents)
        ]
        non_final_next_actions = th.stack(non_final_next_actions)
        # (n_agents, batch, dim) -> (batch, n_agents, dim)
        non_final_next_actions = (non_final_next_actions.transpose(
            0, 1).contiguous())

        # Target Q is zero for terminal transitions, bootstrapped otherwise.
        target_Q = th.zeros(self.batch_size).type(FloatTensor)
        target_Q[non_final_mask] = self.critics_target[agent](
            non_final_next_states.view(-1, self.n_agents * self.n_states),
            non_final_next_actions.view(-1, self.n_agents * self.n_actions)).squeeze()
        # scale_reward: to scale reward in Q functions
        # NOTE(review): `scale_reward` is not defined in this method — it is
        # presumably a module-level constant; verify it is in scope.
        target_Q = (target_Q.unsqueeze(1) * self.GAMMA) + (
            reward_batch[:, agent].unsqueeze(1) * scale_reward)

        # TD error; target is detached so only the online critic is trained.
        loss_Q = nn.MSELoss()(current_Q, target_Q.detach())
        loss_Q.backward()
        self.critic_optimizer[agent].step()

        # ----- Actor update -----
        self.actor_optimizer[agent].zero_grad()
        state_i = state_batch[:, agent, :]
        action_i = self.actors[agent](state_i)
        # Substitute this agent's fresh action into the sampled joint action.
        ac = action_batch.clone()
        ac[:, agent, :] = action_i
        whole_action = ac.view(self.batch_size, -1)
        # Deterministic policy gradient: maximize Q, i.e. minimize -Q.
        actor_loss = -self.critics[agent](whole_state, whole_action)
        actor_loss = actor_loss.mean()
        actor_loss.backward()
        self.actor_optimizer[agent].step()

        c_loss.append(loss_Q)
        a_loss.append(actor_loss)

    # Soft-update target networks every 30 steps; checkpoint every 300.
    if self.steps_done % 30 == 0 and self.steps_done > 0:
        for i in range(self.n_agents):
            soft_update(self.critics_target[i], self.critics[i], self.tau)
            soft_update(self.actors_target[i], self.actors[i], self.tau)
            if self.steps_done % 300 == 0:
                th.save(self.critics[i].state_dict(),
                        "parameter/critic_v3" + str(i) + ".pth")
                th.save(self.actors[i].state_dict(),
                        "parameter/actor_v3" + str(i) + ".pth")

    return c_loss, a_loss
def main(args, adj=None, train_dates=("1871-01", "1972-12"),
         val_dates=("1973-01", "1983-12"), test_dates=("1984-01", "2020-08")):
    """Train and evaluate a gtnet graph model on the configured index data.

    Splits the data by the given date ranges, trains for ``args.epochs``
    epochs (interruptible with Ctrl+C), checkpoints the model whenever the
    validation ONI RMSE improves, then reloads the best checkpoint and
    reports validation and test statistics.

    Returns (val_rse, val_rae, val_corr, test_rse, test_rae, test_corr) for
    the best saved model.
    """
    device = torch.device(args.device)
    args.device = device
    torch.set_num_threads(3)

    Data = DataLoaderS(args, train_dates=train_dates, val_dates=val_dates,
                       test_dates=test_dates)
    args.num_nodes = Data.n_nodes
    # Checkpoint filename records the node count.
    args.save += f"_{Data.n_nodes}nodes.pt"
    print(Data, '\n')

    model = gtnet(args.gcn_true, args.adaptive_edges, args.gcn_depth,
                  args.num_nodes, device, args, predefined_A=adj,
                  dropout=args.dropout, subgraph_size=args.subgraph_size,
                  node_dim=args.node_dim,
                  dilation_exponential=args.dilation_exponential,
                  conv_channels=args.conv_channels,
                  residual_channels=args.residual_channels,
                  skip_channels=args.skip_channels,
                  end_channels=args.end_channels, seq_length=args.window,
                  in_dim=args.in_dim, out_dim=args.seq_out_len,
                  layers=args.layers, propalpha=args.propalpha,
                  tanhalpha=args.tanhalpha, layer_norm_affline=False)
    model = model.to(device)
    # print(args)
    print('The receptive field size is', model.receptive_field)
    nParams = sum([p.nelement() for p in model.parameters()])
    print('Number of model parameters is', nParams, flush=True)

    # Training criterion is configurable; evaluation always reports both
    # L2 and L1 metrics.
    if args.L1Loss:
        criterion = nn.L1Loss().to(device)
    else:
        criterion = nn.MSELoss().to(device)
    evaluateL2 = nn.MSELoss().to(device)
    evaluateL1 = nn.L1Loss().to(device)

    best_val = 10000000  # sentinel: any real RMSE beats this
    optim = Optim(model.parameters(), args.optim, args.lr, args.clip,
                  lr_decay=args.weight_decay)

    # At any point you can hit Ctrl + C to break out of training early.
    # NOTE(review): DataG is created but never used inside this function —
    # possibly kept for its loading side effects; confirm before removing.
    DataG = IndexLoader(args, test_set="GODAS", start_date="1984-01",
                        end_date="2020-08", data_dir=args.data_dir)
    try:
        print('begin training')
        for epoch in range(1, args.epochs + 1):
            epoch_start_time = time.time()
            train_loss = train(Data, Data.train[0], Data.train[1], model,
                               criterion, optim, args)
            val_loss, val_rae, val_corr, oni_stats = \
                evaluate(Data, Data.valid[0], Data.valid[1], model,
                         evaluateL2, evaluateL1, args)
            print(
                '--> Epoch {:3d} | time: {:5.2f}s | train_loss {:5.4f} | Val. loss {:5.4f}, corr {:5.4f} |'
                ' ONI corr {:5.4f}, RMSE {:5.4f}'.format(
                    epoch, (time.time() - epoch_start_time), train_loss,
                    val_loss, val_corr, oni_stats["Corrcoef"],
                    oni_stats["RMSE"]), flush=True)
            # Save the model if the validation loss is the best we've seen so far.
            # (Selection metric is the validation ONI RMSE.)
            if oni_stats["RMSE"] < best_val:
                print("Model will be saved...")
                with open(args.save, 'wb') as f:
                    torch.save(model, f)
                best_val = oni_stats["RMSE"]
            # Report test-set statistics every 5 epochs (monitoring only).
            if epoch % 5 == 0:
                test_acc, test_rae, test_corr, oni_stats = evaluate(
                    Data, Data.test[0], Data.test[1], model, evaluateL2,
                    evaluateL1, args)
                print(
                    "-------> Test stats: rse {:5.4f} | rae {:5.4f} | corr {:5.4f} |"
                    " ONI corr {:5.4f} | ONI RMSE {:5.4f}".format(
                        test_acc, test_rae, test_corr,
                        oni_stats["Corrcoef"], oni_stats["RMSE"]),
                    flush=True)
    except KeyboardInterrupt:
        print('-' * 89)
        print('Exiting from training early')

    # Load the best saved model.
    # NOTE: torch.load unpickles a full model object — only safe on files
    # this run (or a trusted run) produced.
    with open(args.save, 'rb') as f:
        model = torch.load(f).to(device)

    val_acc, val_rae, val_corr, val_oni = evaluate(Data, Data.valid[0],
                                                   Data.valid[1], model,
                                                   evaluateL2, evaluateL1,
                                                   args)
    test_acc, test_rae, test_corr, oni_stats = evaluate(
        Data, Data.test[0], Data.test[1], model, evaluateL2, evaluateL1, args)
    print(
        "+++++++++++++++++++++ BEST MODEL STATS (best w.r.t to validations RMSE): +++++++++++++++++++++++++++++++"
    )
    print("-------> Valid stats: rse {:5.4f} | rae {:5.4f} | corr {:5.4f} |"
          " ONI corr {:5.4f} | ONI RMSE {:5.4f}".format(
              val_acc, val_rae, val_corr, val_oni["Corrcoef"],
              val_oni["RMSE"]), flush=True)
    print("-------> Test stats: rse {:5.4f} | rae {:5.4f} | corr {:5.4f} |"
          " ONI corr {:5.4f} | ONI RMSE {:5.4f}".format(
              test_acc, test_rae, test_corr, oni_stats["Corrcoef"],
              oni_stats["RMSE"]), flush=True)
    print("Saved in", args.save)
    return val_acc, val_rae, val_corr, test_acc, test_rae, test_corr
def main(): parser = argparse.ArgumentParser() parser.add_argument( "--bert_model", default="bert-base-uncased", type=str, help="Bert pre-trained model selected in the list: bert-base-uncased, " "bert-large-uncased, bert-base-cased, bert-base-multilingual, bert-base-chinese.", ) parser.add_argument( "--from_pretrained", default="bert-base-uncased", type=str, help="Bert pre-trained model selected in the list: bert-base-uncased, " "bert-large-uncased, bert-base-cased, bert-base-multilingual, bert-base-chinese.", ) parser.add_argument( "--output_dir", default="", type=str, help= "The output directory where the model checkpoints will be written.", ) parser.add_argument( "--config_file", default="config/bert_config.json", type=str, help="The config file which specified the model details.", ) parser.add_argument("--learning_rate", default=2e-5, type=float, help="The initial learning rate for Adam.") parser.add_argument( "--num_train_epochs", default=20, type=int, help="Total number of training epochs to perform.", ) parser.add_argument( "--batch_size", default=10, type=int, help="Total number of training epochs to perform.", ) parser.add_argument( "--warmup_proportion", default=0.1, type=float, help= "Proportion of training to perform linear learning rate warmup for. " "E.g., 0.1 = 10%% of training.", ) parser.add_argument("--no_cuda", action="store_true", help="Whether not to use CUDA when available") parser.add_argument( "--do_lower_case", default=True, type=bool, help= "Whether to lower case the input text. 
True for uncased models, False for cased models.", ) parser.add_argument("--local_rank", type=int, default=-1, help="local_rank for distributed training on gpus") parser.add_argument("--seed", type=int, default=0, help="random seed for initialization") parser.add_argument( "--gradient_accumulation_steps", type=int, default=1, help= "Number of updates steps to accumualte before performing a backward/update pass.", ) parser.add_argument( "--fp16", action="store_true", help="Whether to use 16-bit float precision instead of 32-bit", ) parser.add_argument( "--loss_scale", type=float, default=0, help= "Loss scaling to improve fp16 numeric stability. Only used when fp16 set to True.\n" "0 (default value): dynamic loss scaling.\n" "Positive power of 2: static loss scaling value.\n", ) parser.add_argument("--num_workers", type=int, default=16, help="Number of workers in the dataloader.") parser.add_argument( "--save_name", default='', type=str, help="save name for training.", ) parser.add_argument("--use_chunk", default=0, type=float, help="whether use chunck for parallel training.") parser.add_argument("--in_memory", default=False, type=bool, help="whether use chunck for parallel training.") parser.add_argument("--optimizer", default='BertAdam', type=str, help="whether use chunck for parallel training.") parser.add_argument("--tasks", default='', type=str, help="1-2-3... 
training task separate by -") parser.add_argument( "--freeze", default=-1, type=int, help="till which layer of textual stream of vilbert need to fixed.") parser.add_argument("--vision_scratch", action="store_true", help="whether pre-trained the image or not.") parser.add_argument("--evaluation_interval", default=1, type=int, help="evaluate very n epoch.") parser.add_argument("--lr_scheduler", default='mannul', type=str, help="whether use learning rate scheduler.") parser.add_argument("--baseline", action="store_true", help="whether use single stream baseline.") parser.add_argument("--compact", action="store_true", help="whether use compact vilbert model.") parser.add_argument("--captions_path", default='', type=str, help="Captions file for coco") parser.add_argument("--cider_path", default='', type=str, help="Cider scores file for coco") parser.add_argument( "--tsv_path", default='', type=str, help= "tsv path file (We don't use this, just that acc is in same place) for coco" ) parser.add_argument("--captions_path_2", default='', type=str, help="Captions file for nocaps") parser.add_argument("--cider_path_2", default='', type=str, help="Cider scores file for nocaps") parser.add_argument( "--tsv_path_2", default='', type=str, help= "tsv path file (We don't use this, just that acc is in same place) for nocaps" ) parser.add_argument("--ratio", default=3, type=int, help="Number of epochs of coco vs nocaps") parser.add_argument("--val_captions_path", default='', type=str, help="Val captions") parser.add_argument("--val_cider_path", default='', type=str, help="Val cider") parser.add_argument("--val_captions_path_2", default='', type=str, help="Val captions") parser.add_argument("--val_cider_path_2", default='', type=str, help="Val cider") args = parser.parse_args() assert len(args.output_dir) > 0 with open('vlbert_tasks.yml', 'r') as f: task_cfg = edict(yaml.load(f)) if args.baseline: from pytorch_pretrained_bert.modeling import BertConfig from vilbert.basebert import 
BaseBertForVLTasks elif args.compact: from vilbert.vilbert_compact import BertConfig from vilbert.vilbert_compact import VILBertForVLTasks else: from vilbert.vilbert import BertConfig from vilbert.vilbert import VILBertForVLTasks if args.save_name: prefix = '-' + args.save_name else: prefix = '' timeStamp = '_' + args.config_file.split('/')[1].split('.')[0] + prefix savePath = os.path.join(args.output_dir, timeStamp) bert_weight_name = json.load( open("config/" + args.bert_model + "_weight_name.json", "r")) if args.local_rank == -1 or args.no_cuda: device = torch.device("cuda" if torch.cuda.is_available() and not args.no_cuda else "cpu") n_gpu = torch.cuda.device_count() else: torch.cuda.set_device(args.local_rank) device = torch.device("cuda", args.local_rank) n_gpu = 1 # Initializes the distributed backend which will take care of sychronizing nodes/GPUs torch.distributed.init_process_group(backend="nccl") logger.info( "device: {} n_gpu: {}, distributed training: {}, 16-bits training: {}". format(device, n_gpu, bool(args.local_rank != -1), args.fp16)) default_gpu = False if dist.is_available() and args.local_rank != -1: rank = dist.get_rank() if rank == 0: default_gpu = True else: default_gpu = True if default_gpu: if not os.path.exists(savePath): os.makedirs(savePath) config = BertConfig.from_json_file(args.config_file) if default_gpu: # save all the hidden parameters. 
with open(os.path.join(savePath, 'command.txt'), 'w') as f: print(args, file=f) # Python 3.x print('\n', file=f) print(config, file=f) tokenizer = BertTokenizer.from_pretrained(args.bert_model, do_lower_case=True) coco_dataset = CiderDataset(args.captions_path, args.tsv_path, args.cider_path, tokenizer) coco_val_dataset = CiderDataset(args.val_captions_path, args.tsv_path, args.val_cider_path, tokenizer) nocaps_dataset = CiderDataset(args.captions_path_2, args.tsv_path_2, args.cider_path_2, tokenizer) nocaps_val_dataset = CiderDataset(args.val_captions_path_2, args.tsv_path_2, args.val_cider_path_2, tokenizer) coco_train_dataloader = DataLoader(coco_dataset, batch_size=args.batch_size, shuffle=True) nocaps_train_dataloader = DataLoader(nocaps_dataset, batch_size=args.batch_size, shuffle=True) coco_val_dataloader = DataLoader(coco_val_dataset, batch_size=args.batch_size, shuffle=False) nocaps_val_dataloader = DataLoader(nocaps_val_dataset, batch_size=args.batch_size, shuffle=False) if not os.path.exists(args.output_dir): os.makedirs(args.output_dir) model = VILBertForVLTasks.from_pretrained(args.from_pretrained, config, num_labels=1, default_gpu=default_gpu) model.to(device) if args.local_rank != -1: try: from apex.parallel import DistributedDataParallel as DDP except ImportError: raise ImportError( "Please install apex from https://www.github.com/nvidia/apex to use distributed and fp16 training." 
) elif n_gpu > 1: model = torch.nn.DataParallel(model) no_decay = ["bias", "LayerNorm.bias", "LayerNorm.weight"] if args.freeze != -1: bert_weight_name_filtered = [] for name in bert_weight_name: if 'embeddings' in name: bert_weight_name_filtered.append(name) elif 'encoder' in name: layer_num = name.split('.')[2] if int(layer_num) <= args.freeze: bert_weight_name_filtered.append(name) optimizer_grouped_parameters = [] for key, value in dict(model.named_parameters()).items(): if key[12:] in bert_weight_name_filtered: value.requires_grad = False if default_gpu: print("filtered weight") print(bert_weight_name_filtered) optimizer_grouped_parameters = [] lr = args.learning_rate for key, value in dict(model.named_parameters()).items(): if value.requires_grad: if 'vil_prediction' in key: # if args.learning_rate <= 2e-5: lr = 1e-4 else: if args.vision_scratch: if key[12:] in bert_weight_name: lr = args.learning_rate else: lr = 1e-4 else: lr = args.learning_rate if any(nd in key for nd in no_decay): optimizer_grouped_parameters += [{ "params": [value], "lr": lr, "weight_decay": 0.01 }] if not any(nd in key for nd in no_decay): optimizer_grouped_parameters += [{ "params": [value], "lr": lr, "weight_decay": 0.0 }] if default_gpu: print(len(list(model.named_parameters())), len(optimizer_grouped_parameters)) num_train_optimization_steps = ((len(coco_dataset) // args.batch_size) * args.num_train_epochs) if args.optimizer == 'BertAdam': optimizer = BertAdam( optimizer_grouped_parameters, lr=args.learning_rate, warmup=args.warmup_proportion, t_total=num_train_optimization_steps, schedule='warmup_constant', ) elif args.optimizer == 'Adam': optimizer = Adam( optimizer_grouped_parameters, lr=base_lr, warmup=args.warmup_proportion, t_total=num_train_optimization_steps, schedule='warmup_constant', ) elif args.optimizer == 'Adamax': optimizer = Adamax( optimizer_grouped_parameters, lr=base_lr, warmup=args.warmup_proportion, t_total=num_train_optimization_steps, 
schedule='warmup_constant', ) if args.lr_scheduler == 'automatic': lr_scheduler = ReduceLROnPlateau(optimizer, \ mode='max', factor=0.2, patience=1, cooldown=1, threshold=0.001) elif args.lr_scheduler == 'mannul': lr_reduce_list = np.array([12, 16]) # lr_reduce_list = np.array([6, 8, 10]) def lr_lambda_fun(epoch): return pow(0.1, np.sum(lr_reduce_list <= epoch)) lr_scheduler = LambdaLR(optimizer, lr_lambda=lr_lambda_fun) criterion = nn.MSELoss() i = 0 j = 0 coco_correlation_values = [] nocaps_correlation_values = [] # initialize the data iteration. for epochId in tqdm(range(args.num_train_epochs), desc="Epoch"): model.train() for batch in coco_train_dataloader: # TODO: Make this a function i += 1 if not args.no_cuda: batch = tuple( t.cuda(device=device, non_blocking=True) for t in batch) features, spatials, image_mask, captions, _, input_mask, segment_ids, co_attention_mask, image_id, y = batch _, vil_logit, _, _, _, _, _ = \ model(captions, features, spatials, segment_ids, input_mask, image_mask, co_attention_mask) loss = torch.sqrt(criterion(vil_logit.squeeze(-1), y.to(device))) writer.add_scalar('Train_loss', loss, i) loss.backward() optimizer.step() model.zero_grad() optimizer.zero_grad() for _ in range(args.ratio): for batch in nocaps_train_dataloader: i += 1 if not args.no_cuda: batch = tuple( t.cuda(device=device, non_blocking=True) for t in batch) features, spatials, image_mask, captions, _, input_mask, segment_ids, co_attention_mask, image_id, y = batch _, vil_logit, _, _, _, _, _ = \ model(captions, features, spatials, segment_ids, input_mask, image_mask, co_attention_mask) loss = torch.sqrt( criterion(vil_logit.squeeze(-1), y.to(device))) writer.add_scalar('Train_loss', loss, i) loss.backward() optimizer.step() model.zero_grad() optimizer.zero_grad() model.eval() coco_actual_values = [] coco_predicted_values = [] for batch in coco_val_dataloader: j += 1 batch = tuple( t.cuda(device=device, non_blocking=True) for t in batch) features, spatials, 
image_mask, captions, _, input_mask, segment_ids, co_attention_mask, image_id, y = batch _, vil_logit, _, _, _, _, _ = \ model(captions, features, spatials, segment_ids, input_mask, image_mask, co_attention_mask) coco_actual_values += y.tolist() coco_predicted_values += vil_logit.squeeze(-1).tolist() loss = torch.sqrt(criterion(vil_logit.squeeze(-1), y.to(device))) writer.add_scalar('Val_loss', loss, j) correlation_here = np.corrcoef(np.array(coco_actual_values), np.array(coco_predicted_values))[0, 1] coco_correlation_values.append(correlation_here) print("coco correlation: ", correlation_here) nocaps_actual_values = [] nocaps_predicted_values = [] for batch in nocaps_val_dataloader: j += 1 batch = tuple( t.cuda(device=device, non_blocking=True) for t in batch) features, spatials, image_mask, captions, _, input_mask, segment_ids, co_attention_mask, image_id, y = batch _, vil_logit, _, _, _, _, _ = \ model(captions, features, spatials, segment_ids, input_mask, image_mask, co_attention_mask) nocaps_actual_values += y.tolist() nocaps_predicted_values += vil_logit.squeeze(-1).tolist() loss = torch.sqrt(criterion(vil_logit.squeeze(-1), y.to(device))) writer.add_scalar('Val_loss', loss, j) correlation_here = np.corrcoef(np.array(nocaps_actual_values), np.array(nocaps_predicted_values))[0, 1] nocaps_correlation_values.append(correlation_here) print("nocaps correlation: ", correlation_here) # Save a trained model model_to_save = (model.module if hasattr(model, "module") else model ) # Only save the model it-self if not os.path.exists(args.output_dir): os.makedirs(args.output_dir) output_model_file = os.path.join( args.output_dir, "pytorch_model_" + str(epochId) + ".bin") torch.save(model_to_save.state_dict(), output_model_file) lr_scheduler.step() print(args.ratio) print("coco correlations ", coco_correlation_values) print("nocaps correlations ", nocaps_correlation_values)
def loss_creator(config):
    """Build the training criterion.

    The *config* argument is part of the factory signature expected by the
    caller but is not consulted here; a plain mean-squared-error loss is
    always returned.
    """
    criterion = nn.MSELoss()
    return criterion
def run(settings):
    """Train DiMP (ResNet-50 backbone) on Princeton RGB-D tracking data.

    Populates *settings* with the training hyper-parameters, builds the
    data-augmentation/processing pipeline, samplers and loaders, creates
    the network, losses and optimizer, then runs the LTR trainer for 50
    epochs (resuming from the latest checkpoint when one exists).
    """
    settings.description = 'Default train settings for DiMP with ResNet50 as backbone.'
    settings.batch_size = 4
    settings.num_workers = 8
    settings.multi_gpu = False
    settings.print_interval = 5
    # 4-channel normalization: ImageNet RGB statistics plus a pass-through
    # fourth (depth) channel (mean 0, std 1).
    settings.normalize_mean = [0.485, 0.456, 0.406, 0]
    settings.normalize_std = [0.229, 0.224, 0.225, 1.0]
    settings.search_area_factor = 5.0
    settings.output_sigma_factor = 1/4
    settings.target_filter_sz = 4
    settings.feature_sz = 18
    settings.output_sz = settings.feature_sz * 16  # 288-pixel crops
    settings.center_jitter_factor = {'train': 3, 'test': 4.5}
    settings.scale_jitter_factor = {'train': 0.25, 'test': 0.5}
    settings.hinge_threshold = 0.05
    # settings.print_stats = ['Loss/total', 'Loss/iou', 'ClfTrain/init_loss', 'ClfTrain/test_loss']

    # # Train datasets
    # lasot_train = Lasot(settings.env.lasot_dir, split='train')
    # got10k_train = Got10k(settings.env.got10k_dir, split='vottrain')
    # trackingnet_train = TrackingNet(settings.env.trackingnet_dir, set_ids=list(range(4)))
    # coco_train = MSCOCOSeq(settings.env.coco_dir)
    #
    # # Validation datasets
    # got10k_val = Got10k(settings.env.got10k_dir, split='votval')

    # Train datasets
    #lasot_train = Lasot(split='train')
    # NOTE(review): training uses the 'validation' split of PrincetonRGBD,
    # i.e. the same split as the validation loader below — confirm this is
    # intentional and not a train/val leak.
    ptb_train = PrincetonRGBD(split='validation')
    # stc_train = StcRGBD(split='train')
    # kevinlai_train=kevinlaiRGBD(split='train')
    #trackingnet_train = TrackingNet(set_ids=list(range(11)))
    #coco_train = MSCOCOSeq()

    # Validation datasets
    #lasot_val = Lasot(split='train')#TrackingNet(set_ids=list(range(11,12)))
    ptb_val = PrincetonRGBD(split='validation')

    # Data transform: random grayscale applied jointly to the pair; train
    # adds brightness jitter, val only converts and normalizes.
    transform_joint = dltransforms.ToGrayscale(probability=0.05)
    transform_train = torchvision.transforms.Compose([dltransforms.ToTensorAndJitter(0.2),
                                                      torchvision.transforms.Normalize(mean=settings.normalize_mean, std=settings.normalize_std)])
    transform_val = torchvision.transforms.Compose([torchvision.transforms.ToTensor(),
                                                    torchvision.transforms.Normalize(mean=settings.normalize_mean, std=settings.normalize_std)])

    # The tracking pairs processing module
    output_sigma = settings.output_sigma_factor / settings.search_area_factor
    proposal_params = {'min_iou': 0.1, 'boxes_per_frame': 8, 'sigma_factor': [0.01, 0.05, 0.1, 0.2, 0.3]}
    label_params = {'feature_sz': settings.feature_sz, 'sigma_factor': output_sigma, 'kernel_sz': settings.target_filter_sz}
    data_processing_train = processing.DiMPProcessing(search_area_factor=settings.search_area_factor,
                                                      output_sz=settings.output_sz,
                                                      center_jitter_factor=settings.center_jitter_factor,
                                                      scale_jitter_factor=settings.scale_jitter_factor,
                                                      mode='sequence',
                                                      proposal_params=proposal_params,
                                                      label_function_params=label_params,
                                                      transform=transform_train,
                                                      joint_transform=transform_joint)
    data_processing_val = processing.DiMPProcessing(search_area_factor=settings.search_area_factor,
                                                    output_sz=settings.output_sz,
                                                    center_jitter_factor=settings.center_jitter_factor,
                                                    scale_jitter_factor=settings.scale_jitter_factor,
                                                    mode='sequence',
                                                    proposal_params=proposal_params,
                                                    label_function_params=label_params,
                                                    transform=transform_val,
                                                    joint_transform=transform_joint)

    # Train sampler and loader
    dataset_train = sampler.DiMPSampler([ptb_train], [1], samples_per_epoch=26000, max_gap=30,
                                        num_test_frames=3, num_train_frames=3,
                                        processing=data_processing_train)
    loader_train = LTRLoader('train', dataset_train, training=True, batch_size=settings.batch_size,
                             num_workers=settings.num_workers, shuffle=True, drop_last=True, stack_dim=1)

    # Validation samplers and loaders (validation runs every 5th epoch)
    # dataset_val = sampler.DiMPSampler([got10k_val], [1], samples_per_epoch=5000, max_gap=30,
    #                                   num_test_frames=3, num_train_frames=3,
    #                                   processing=data_processing_val)
    dataset_val = sampler.DiMPSampler([ptb_val], [1], samples_per_epoch=5000, max_gap=30,
                                      num_test_frames=3, num_train_frames=3,
                                      processing=data_processing_val)
    loader_val = LTRLoader('val', dataset_val, training=False, batch_size=settings.batch_size,
                           num_workers=settings.num_workers, shuffle=False, drop_last=True,
                           epoch_interval=5, stack_dim=1)

    # Create network and actor
    net = dimpnet_rgbd_blend0.dimpnet50(filter_size=settings.target_filter_sz, backbone_pretrained=True,
                                        optim_iter=5, clf_feat_norm=True, clf_feat_blocks=0,
                                        final_conv=True, out_feature_dim=512, optim_init_step=0.9,
                                        optim_init_reg=0.1,
                                        init_gauss_sigma=output_sigma * settings.feature_sz,
                                        num_dist_bins=100, bin_displacement=0.1, mask_init_factor=3.0,
                                        target_mask_act='sigmoid', score_act='relu')

    # Wrap the network for multi GPU training
    if settings.multi_gpu:
        net = MultiGPU(net, dim=1)

    # Objectives: MSE for IoU regression, SmoothL1 for occlusion,
    # LBHinge for the target classifier.
    objective = {'iou': nn.MSELoss(), 'occ': nn.SmoothL1Loss(),
                 'test_clf': ltr_losses.LBHinge(threshold=settings.hinge_threshold)}
    loss_weight = {'iou': 1, 'occ':1, 'test_clf': 100, 'test_init_clf': 100, 'test_iter_clf': 400}
    actor = actors.DiMPActor(net=net, objective=objective, loss_weight=loss_weight)

    # Optimizer: per-submodule learning rates; the backbone gets the
    # smallest rate (2e-5), the filter optimizer the largest (5e-4).
    optimizer = optim.Adam([{'params': actor.net.classifier.filter_initializer.parameters(), 'lr': 5e-5},
                            {'params': actor.net.classifier.filter_optimizer.parameters(), 'lr': 5e-4},
                            {'params': actor.net.classifier.feature_extractor.parameters(), 'lr': 5e-5},
                            {'params': actor.net.bb_regressor.parameters(), 'lr': 2e-4},
                            {'params': actor.net.feature_extractor.parameters(), 'lr': 2e-5}],
                           lr=0.1*2e-4)
    # Step the LR down by 5x every 15 epochs
    lr_scheduler = optim.lr_scheduler.StepLR(optimizer, step_size=15, gamma=0.2)

    trainer = LTRTrainer(actor, [loader_train, loader_val], optimizer, settings, lr_scheduler)

    #trainer.train(10, load_latest=True, fail_safe=True, path_pretrained=None)#'./checkpoints/dimp50.pth')
    trainer.train(50, load_latest=True, fail_safe=True, path_pretrained=None)
def __init__(self, config):
    """Build the GRAN mixture-of-Bernoulli generative model.

    Reads architecture hyper-parameters from ``config.model`` (and one
    test-time option from ``config.test``), then constructs the output
    heads, optional input-embedding layers, two GNN decoders and the
    loss functions.
    """
    super(GRANMixtureBernoulli, self).__init__()
    self.config = config
    self.device = config.device
    self.max_num_nodes = config.model.max_num_nodes
    self.hidden_dim = config.model.hidden_dim  # 256
    self.is_sym = config.model.is_sym
    self.block_size = config.model.block_size  # 1
    self.sample_stride = config.model.sample_stride  # 1
    self.num_GNN_prop = config.model.num_GNN_prop  # 1
    self.num_GNN_layers = config.model.num_GNN_layers  # 7
    self.agg_GNN_method = config.model.agg_GNN_method
    # Optional positive-class weight for the adjacency BCE loss;
    # defaults to 1.0 when absent from the config.
    self.edge_weight = config.model.edge_weight if hasattr(
        config.model, 'edge_weight') else 1.0
    self.dimension_reduce = config.model.dimension_reduce  # true
    self.has_attention = config.model.has_attention  # true
    self.num_canonical_order = config.model.num_canonical_order  # 1
    self.output_dim = 1
    self.num_mix_component = config.model.num_mix_component  # 20
    self.has_rand_feat = False  # use random feature instead of 1-of-K encoding
    self.att_edge_dim = 64
    self.use_mask_prob = config.test.use_mask_prob
    self.relative_training = config.model.relative_training
    self.relative_num = config.model.relative_num

    # Per-mixture-component edge logits (theta) head
    self.output_theta = nn.Sequential(
        nn.Linear(self.hidden_dim, self.hidden_dim),
        nn.ReLU(inplace=True),
        nn.Linear(self.hidden_dim, self.hidden_dim),
        nn.ReLU(inplace=True),
        nn.Linear(self.hidden_dim, self.output_dim * self.num_mix_component))
    # Mixture-weight (alpha) head
    self.output_alpha = nn.Sequential(
        nn.Linear(self.hidden_dim, self.hidden_dim),
        nn.ReLU(inplace=True),
        nn.Linear(self.hidden_dim, self.hidden_dim),
        nn.ReLU(inplace=True),
        nn.Linear(self.hidden_dim, self.num_mix_component))

    if self.dimension_reduce:
        # Project node inputs down to the embedding dimension; the two
        # "_new" variants embed 3- and 4-dimensional node features.
        self.embedding_dim = config.model.embedding_dim
        self.decoder_input = nn.Sequential(
            nn.Linear(self.max_num_nodes, self.embedding_dim))
        self.decoder_input_new = nn.Sequential(
            nn.Linear(3, self.embedding_dim))
        self.decoder_input_new2 = nn.Sequential(
            nn.Linear(4, self.embedding_dim))
    else:
        # No reduction: use the raw one-hot width as embedding size.
        self.embedding_dim = self.max_num_nodes

    # Auxiliary node-level heads: 2-way label, 2-D position, scalar feature.
    self.output_label = nn.Sequential(
        nn.Linear(self.hidden_dim, self.hidden_dim),
        nn.LeakyReLU(inplace=True), nn.Dropout(0.5),
        nn.Linear(self.hidden_dim, 2), nn.Sigmoid())
    self.output_position = nn.Sequential(
        nn.Linear(self.hidden_dim, self.hidden_dim),
        nn.LeakyReLU(inplace=True), nn.Dropout(0.5),
        nn.Linear(self.hidden_dim, 2))
    self.output_feature = nn.Sequential(
        nn.Linear(self.hidden_dim, self.hidden_dim),
        nn.LeakyReLU(inplace=True), nn.Dropout(0.5),
        nn.Linear(self.hidden_dim, 1))

    # Two identically-configured GNN decoders (used by the model's
    # forward pass; roles not visible from this block).
    self.decoder = GNN(msg_dim=self.embedding_dim,
                       node_state_dim=self.hidden_dim,
                       edge_feat_dim=2 * self.att_edge_dim,
                       num_prop=self.num_GNN_prop,
                       num_layer=self.num_GNN_layers,
                       has_attention=self.has_attention)
    self.decoder2 = GNN(msg_dim=self.embedding_dim,
                        node_state_dim=self.hidden_dim,
                        edge_feat_dim=2 * self.att_edge_dim,
                        num_prop=self.num_GNN_prop,
                        num_layer=self.num_GNN_layers,
                        has_attention=self.has_attention)

    ### Loss functions
    # Adjacency loss keeps per-element values (reduction='none') so the
    # caller can weight/mask them; positive edges weighted by edge_weight.
    pos_weight = torch.ones([1]) * self.edge_weight
    self.label_loss_func = nn.CrossEntropyLoss(reduction='mean')
    self.feature_loss_func = nn.MSELoss(reduction='mean')
    self.adj_loss_func = nn.BCEWithLogitsLoss(pos_weight=pos_weight,
                                              reduction='none')
# Build the network and inspect its learnable parameters.
net = Net()
print(net)
params = list(net.parameters())
print(len(params))
print(params[0].size())  # conv1's weight

# Feed in a Variable.
# NOTE(review): torch.autograd.Variable is deprecated — plain tensors
# carry autograd state on modern PyTorch; kept for fidelity here.
input = Variable(torch.randn(1, 1, 32, 32))
out = net(input)  # the output is also a Variable
print(out)

# loss
target = Variable(torch.arange(1, 11))  # a dummy target
criterion = nn.MSELoss()
# NOTE(review): target has shape (10,) while out presumably is (1, 10);
# MSELoss broadcasts these with a warning on recent PyTorch — confirm.
loss = criterion(out, target.float())
print(loss)

net.zero_grad()
# Backprop an arbitrary gradient through `out` (demo of non-scalar backward).
out.backward(torch.randn(1, 10))
print(out)

# Walk a few steps backwards through the autograd graph of `loss`.
print(loss.grad_fn)  # MSELoss
print(loss.grad_fn.next_functions[0][0])  # Linear
print(loss.grad_fn.next_functions[0][0].next_functions[0][0])  # ReLU

# net.zero_grad()  # clear previous gradients
# print('conv1.bias.grad before backward')
# print(net.conv1.bias.grad)  # None
def startLearning():
    """Train the AutoEncoder jointly on expression classification and
    landmark regression, validating after every epoch.

    Side effects: writes TensorBoard scalars, appends metrics to
    ``log/training_DAN.log`` / ``log/validation_DAN.log`` and saves
    checkpoints under ``save_model/``.  Requires CUDA.
    """
    # Init Tensorboard
    writer = SummaryWriter()

    # Define batch size and the number of epochs
    batch_size = 16
    max_epochs = 50

    # Make model
    model = AutoEncoder().cuda()

    # Loss types: cross-entropy for the expression head, MSE for landmarks
    criterion_expressions = nn.CrossEntropyLoss().cuda()
    criterion_landmarks = nn.MSELoss().cuda()

    # Optimizer
    optimizer = torch.optim.Adam(model.parameters(),
                                 lr=0.001,
                                 betas=(0.9, 0.999))
    # Scheduler: reduce the LR when the validation landmark loss plateaus
    scheduler = torch.optim.lr_scheduler.ReduceLROnPlateau(optimizer,
                                                           mode='min',
                                                           factor=0.2,
                                                           patience=5)
    best_acc = None

    print("load dataset")
    # Training data is split across several HDF5 packs; the validation
    # pack is loaded once up front.
    loader_filenames = np.asarray([
        "training_dataset_pack0.h5", "training_dataset_pack1.h5",
        "training_dataset_pack2.h5", "training_dataset_pack3.h5"
    ])
    validation_loader = torch.utils.data.DataLoader(dataset=Dataset(
        'validation_dataset_pack0.h5', "std_training.png",
        "mean_training.png"),
                                                    batch_size=batch_size,
                                                    shuffle=True,
                                                    num_workers=1)
    print("Done")

    # NOTE(review): model.eval()/model.train() are never called — if
    # AutoEncoder contains dropout/batch-norm, validation runs in train
    # mode.  Confirm whether that is intended.

    # Main loop (epoch)
    for epoch in range(1, max_epochs + 1):
        np.random.shuffle(loader_filenames)
        is_best = False
        print("Training...")

        # Running metrics: loss_*/acc_* are float accumulators used for
        # logging; expressions_acc / landmarks_acc mirror them for the
        # progress bar.
        loss_exp = 0.
        acc_exp = 0.
        loss_lm = 0.
        acc_lm = 0.
        expressions_acc = 0.
        landmarks_acc = 0.
        count = 0

        for loader_filename in loader_filenames:
            training_loader = torch.utils.data.DataLoader(
                dataset=Dataset(loader_filename, "std_training.png",
                                "mean_training.png"),
                batch_size=batch_size,
                shuffle=True,
                num_workers=4)

            # Progress bar for this training pack
            pbart = tqdm.tqdm(total=int(
                len(training_loader.dataset) / batch_size),
                              postfix={
                                  "loss_e": None,
                                  "acc_e": None,
                                  "loss_l": None,
                                  "acc_l": None
                              },
                              desc="Epoch: {}/{}".format(epoch, max_epochs))

            # Training loop
            for i, data in enumerate(training_loader, 0):
                count += 1
                # Zero the parameter gradients
                optimizer.zero_grad()

                # Get the inputs
                images, landmarks, expressions = data
                images = images.to(device)
                landmarks = landmarks.to(device).float()
                expressions = expressions.to(device).long()

                # Forward pass: outputs[0] = expression logits,
                # outputs[1] = predicted landmarks.
                outputs = model(images)

                # Losses
                expressions_loss = criterion_expressions(
                    outputs[0], expressions)
                landmarks_loss = criterion_landmarks(outputs[1], landmarks)
                loss_lm += landmarks_loss.item()
                loss_exp += expressions_loss.item()

                # Expression accuracy
                _, predicted_expressions = torch.max(outputs[0], 1)
                batch_correct = (
                    predicted_expressions == expressions).sum().float()
                expressions_acc += batch_correct / batch_size
                acc_exp += batch_correct.item() / batch_size

                # Landmark "accuracy": 1 - mean absolute error normalized
                # by the 128-pixel image size.
                predicted_landmarks = outputs[1]
                lm_score = 1 - (np.mean(
                    np.abs(predicted_landmarks.detach().cpu().numpy() -
                           landmarks.cpu().numpy())) / 128)
                landmarks_acc += lm_score
                acc_lm += lm_score

                # BUGFIX: the two heads share the encoder, so calling
                # backward() on each loss separately raises "Trying to
                # backward through the graph a second time".  Summing the
                # losses gives exactly the same accumulated gradients in
                # a single backward pass.
                (expressions_loss + landmarks_loss).backward()
                optimizer.step()

                # Update the progress bar with running averages
                pbart.update(1)
                pbart.set_postfix({
                    "loss_e": loss_exp / count,
                    "acc_e": expressions_acc.item() / count,
                    "loss_l": loss_lm / count,
                    "acc_l": landmarks_acc / count
                })
            pbart.close()

        # Average the epoch metrics.
        # NOTE(review): dividing by count / batch_size scales per-batch
        # averages by batch_size; kept as-is to preserve the original
        # metric scale used in the logs.
        loss_exp /= (count) / batch_size
        acc_exp /= (count) / batch_size
        loss_lm /= (count) / batch_size
        acc_lm /= (count) / batch_size

        # Append training metrics to the log file
        with open("log/training_DAN.log", "a") as f:
            f.write(
                "epoch: {} / {} loss_e: {} acc_e: {} loss_l: {} acc_l: {}\n".
                format(epoch, max_epochs, loss_exp, acc_exp, loss_lm, acc_lm))

        # Construct tensorboard graph (training curves)
        writer.add_scalar('data/Loss_expressions_training', loss_exp, epoch)
        writer.add_scalar('data/Accuracy_expressions_training', acc_exp,
                          epoch)
        writer.add_scalar('data/Loss_landmarks_training', loss_lm, epoch)
        writer.add_scalar('data/Accuracy_landmarks_training', acc_lm, epoch)

        # Reset metrics for the validation pass
        loss_exp = 0.
        acc_exp = 0.
        loss_lm = 0.
        acc_lm = 0.
        expressions_acc = 0.
        landmarks_acc = 0.
        count = 0

        print("Validation...")
        # Progress bar for validation
        pbarv = tqdm.tqdm(total=int(
            len(validation_loader.dataset) / batch_size),
                          postfix={
                              "loss_e": None,
                              "acc_e": None,
                              "loss_l": None,
                              "acc_l": None
                          },
                          desc="Epoch: {}/{}".format(epoch, max_epochs))

        # Validation loop (no gradients needed)
        with torch.no_grad():
            for i, data in enumerate(validation_loader, 0):
                count += 1

                images, landmarks, expressions = data
                images = images.to(device)
                landmarks = landmarks.to(device).float()
                expressions = expressions.to(device).long()

                outputs = model(images)

                expressions_loss = criterion_expressions(
                    outputs[0], expressions)
                landmarks_loss = criterion_landmarks(outputs[1], landmarks)
                loss_lm += landmarks_loss.item()
                loss_exp += expressions_loss.item()

                _, predicted_expressions = torch.max(outputs[0], 1)
                batch_correct = (
                    predicted_expressions == expressions).sum().float()
                expressions_acc += batch_correct / batch_size
                acc_exp += batch_correct.item() / batch_size

                predicted_landmarks = outputs[1]
                lm_score = 1 - (np.mean(
                    np.abs(predicted_landmarks.cpu().numpy() -
                           landmarks.cpu().numpy())) / 128)
                landmarks_acc += lm_score
                acc_lm += lm_score

                pbarv.update(1)
                pbarv.set_postfix({
                    "loss_e": loss_exp / count,
                    "acc_e": expressions_acc.item() / count,
                    "loss_l": loss_lm / count,
                    "acc_l": landmarks_acc / count
                })
        pbarv.close()

        # Average the validation metrics (same scaling note as above)
        loss_exp /= (count) / batch_size
        acc_exp /= (count) / batch_size
        loss_lm /= (count) / batch_size
        acc_lm /= (count) / batch_size

        # Keep the checkpoint with the highest validation expression
        # accuracy.  BUGFIX: the original compared with '<', which flagged
        # the WORST-accuracy epoch as "best".
        if best_acc is None or acc_exp > best_acc:
            best_acc = acc_exp
            is_best = True
        save_checkpoint(
            {
                'epoch': epoch + 1,
                'state_dict': model.state_dict(),
                'loss expressions': loss_exp,
                'accuracy expressions': acc_exp,
                'loss landmarks': loss_lm,
                'accuracy landmarks': acc_lm,
                'optimizer': optimizer.state_dict()
            }, is_best, "save_model/checkpoint_DAN.pth",
            "save_model/best_model_validation.pth")
        is_best = False

        # Plateau scheduler keyed on the validation landmark loss
        scheduler.step(loss_lm)

        # Append validation metrics to the log file
        with open("log/validation_DAN.log", "a") as f:
            f.write(
                "epoch: {} / {} loss_e: {} acc_e: {} loss_l: {} acc_l: {}\n".
                format(epoch, max_epochs, loss_exp, acc_exp, loss_lm, acc_lm))

        # Construct tensorboard graph (validation curves)
        writer.add_scalar('data/Loss_expressions_validation', loss_exp,
                          epoch)
        writer.add_scalar('data/Accuracy_expressions_validation', acc_exp,
                          epoch)
        writer.add_scalar('data/Loss_landmarks_validation', loss_lm, epoch)
        writer.add_scalar('data/Accuracy_landmarks_validation', acc_lm,
                          epoch)