def train_model(model, criterion_xent, criterion_htri, optimizer, trainloader,
                use_gpu, optimizer_center, criterion_center_loss,
                criterion_osm_caa, beta_ratio):
    """Train `model` for one epoch with a combined ID / triplet / center / OSM-CAA loss.

    Returns:
        (avg_loss, last_ide_loss, last_triplet_loss, last_hosm_loss): the
        epoch-averaged total loss plus the *last batch's* individual terms.
    """
    model.train()
    losses = AverageMeter()
    # fixed weight for the center-loss term (typo `cetner_` fixed)
    center_loss_weight = 0.0005
    for batch_idx, (imgs, pids, _) in enumerate(trainloader):
        if use_gpu:
            imgs, pids = imgs.cuda(), pids.cuda()
        imgs, pids = Variable(imgs), Variable(pids)
        outputs, features = model(imgs)
        ide_loss = criterion_xent(outputs, pids)
        # The triplet criterion receives the same batch three times; mining of
        # anchor/positive/negative presumably happens inside it — confirm.
        triplet_loss = criterion_htri(features, features, features, pids, pids, pids)
        center_loss = criterion_center_loss(features, pids)
        # OSM-CAA uses the center-loss class centers as class anchors.
        hosm_loss = criterion_osm_caa(features, pids,
                                      criterion_center_loss.centers.t())
        loss = (ide_loss + (1 - beta_ratio) * triplet_loss
                + center_loss * center_loss_weight + beta_ratio * hosm_loss)
        optimizer.zero_grad()
        optimizer_center.zero_grad()
        loss.backward()
        optimizer.step()
        # Undo the loss weighting on the center parameters' gradients so the
        # centers update at full magnitude.
        for param in criterion_center_loss.parameters():
            param.grad.data *= (1. / center_loss_weight)
        optimizer_center.step()
        losses.update(loss.data.item(), pids.size(0))
    return (losses.avg, ide_loss.item(), triplet_loss.item(), hosm_loss.item())
def predict_set(nets, dataloader, runtime_params):
    """Run inference over `dataloader` with the first net in `nets`.

    Returns:
        (pred_landmarks, gt_landmarks, names): predicted and ground-truth
        landmarks as (N, 28, 2) arrays plus the image paths.
    """
    run_type = runtime_params['run_type']
    progbar = Progbar(len(dataloader.dataset), stateful_metrics=['run-type'])
    batch_time = AverageMeter()
    names = []
    pred_landmarks = np.array([])
    gt_landmarks = np.array([])
    with torch.no_grad():
        for i, (landmarks, imgs, img_paths) in enumerate(dataloader):
            s_time = time.time()
            imgs = imgs.cuda()
            names.extend(img_paths)
            net = nets[0]
            # half-precision inference when requested in the runtime config
            if 'half' in runtime_params.values():
                output = net(imgs.half())
            else:
                output = net(imgs)
            output = output.cpu().numpy()
            pred_landmarks = np.concatenate((pred_landmarks, output), axis=0)
            gt_landmarks = np.concatenate(
                (gt_landmarks, landmarks.data.numpy()), axis=0)
            progbar.add(imgs.size(0), values=[('run-type', run_type)])
            batch_time.update(time.time() - s_time)
            # in debug mode stop after the second batch
            if runtime_params['debug'] and i:
                break
    pred_landmarks = pred_landmarks.reshape((-1, 28, 2))
    gt_landmarks = gt_landmarks.reshape((-1, 28, 2))
    assert gt_landmarks.shape == pred_landmarks.shape
    # BUG FIX: previously returned gt_landmarks twice, discarding predictions.
    return pred_landmarks, gt_landmarks, names
def valid_trainer(model, valid_loader, criterion):
    """Evaluate the (attribute + depth-autoencoder) model on `valid_loader`.

    Returns:
        (avg_loss, gt_labels, pred_probs) with the numpy arrays concatenated
        over all batches.
    """
    model.eval()
    loss_meter = AverageMeter()
    label_batches = []
    prob_batches = []
    with torch.no_grad():
        for step, (imgs, gt_label, gt_depth, imgname) in enumerate(tqdm(valid_loader)):
            imgs = imgs.cuda()
            gt_label = gt_label.cuda()
            gt_depth = gt_depth.cuda()
            # record the raw labels (with -1 "unknown" marks) for the caller,
            # then zero the unknowns for the loss
            label_batches.append(gt_label.cpu().numpy())
            gt_label[gt_label == -1] = 0
            logits, depth_out = model(imgs)
            batch_loss = criterion(logits, gt_label) + loss_autoencoder(depth_out, gt_depth)
            prob_batches.append(torch.sigmoid(logits).cpu().numpy())
            loss_meter.update(to_scalar(batch_loss))
    return (loss_meter.avg,
            np.concatenate(label_batches, axis=0),
            np.concatenate(prob_batches, axis=0))
def valid_trainer(model, valid_loader, criterion):
    """Evaluate `model` on `valid_loader`.

    Returns:
        (avg_loss, gt_labels, pred_probs) with arrays concatenated over batches.
    """
    model.eval()
    loss_meter = AverageMeter()
    preds_probs = []
    gt_list = []
    with torch.no_grad():
        for step, (imgs, gt_label, imgname) in enumerate(tqdm(valid_loader)):
            imgs = imgs.cuda()
            gt_label = gt_label.cuda()
            # raw labels (including -1 "unknown") are what the caller scores against
            gt_list.append(gt_label.cpu().numpy())
            gt_label[gt_label == -1] = 0
            valid_logits = model(imgs)
            valid_loss = criterion(valid_logits, gt_label)
            valid_probs = torch.sigmoid(valid_logits)
            preds_probs.append(valid_probs.cpu().numpy())
            loss_meter.update(to_scalar(valid_loss))
    valid_loss = loss_meter.avg
    # FIX: log-message typo "losss" -> "loss"
    print(f'valid loss: {valid_loss}')
    gt_label = np.concatenate(gt_list, axis=0)
    preds_probs = np.concatenate(preds_probs, axis=0)
    return valid_loss, gt_label, preds_probs
def valid_trainer(model, valid_loader, criterion):
    """Validation pass for the (image, depth) two-input model.

    Returns:
        (avg_loss, gt_labels, pred_probs) concatenated over all batches.
    """
    model.eval()
    loss_meter = AverageMeter()
    label_batches = []
    prob_batches = []
    with torch.no_grad():
        for step, (imgs, depth, gt_label, imgname) in enumerate(tqdm(valid_loader)):
            imgs = imgs.cuda()
            gt_label = gt_label.cuda()
            # raw labels (with -1 "unknown") recorded before zeroing them for the loss
            label_batches.append(gt_label.cpu().numpy())
            gt_label[gt_label == -1] = 0
            logits = model(imgs, depth)
            batch_loss = criterion(logits, gt_label)
            prob_batches.append(torch.sigmoid(logits).cpu().numpy())
            loss_meter.update(to_scalar(batch_loss))
    return (loss_meter.avg,
            np.concatenate(label_batches, axis=0),
            np.concatenate(prob_batches, axis=0))
def batch_trainer(epoch, model, train_loader, criterion, optimizer):
    """One training epoch for the five-head model with maximum-voting fusion.

    Returns:
        (avg_train_loss, gt_labels, pred_probs) as numpy arrays.
    """
    model.train()
    epoch_time = time.time()
    loss_meter = AverageMeter()
    batch_num = len(train_loader)
    label_batches, prob_batches = [], []
    # NOTE(review): LR is read from param group 1, not 0 (other trainers in
    # this file read group 0) — assumed intentional; confirm.
    lr = optimizer.param_groups[1]['lr']
    log_interval = 20
    for step, (imgs, gt_label, imgname) in enumerate(train_loader):
        batch_time = time.time()
        imgs, gt_label = imgs.cuda(), gt_label.cuda()
        feat_map, output = model(imgs)
        # sum of the criterion over every head
        train_loss = sum(criterion(head, gt_label) for head in output)
        # maximum voting: element-wise max across the five heads
        voted = output[0]
        for head in output[1:5]:
            voted = torch.max(voted, head)
        optimizer.zero_grad()
        train_loss.backward()
        clip_grad_norm_(model.parameters(), max_norm=10.0)  # make larger learning rate works
        optimizer.step()
        loss_meter.update(to_scalar(train_loss))
        label_batches.append(gt_label.cpu().numpy())
        prob_batches.append(torch.sigmoid(voted).detach().cpu().numpy())
        if (step + 1) % log_interval == 0 or (step + 1) % len(train_loader) == 0:
            print(
                f'{time_str()}, Step {step}/{batch_num} in Ep {epoch}, {time.time() - batch_time:.2f}s ',
                f'train_loss:{loss_meter.val:.4f}')
    gt_label = np.concatenate(label_batches, axis=0)
    preds_probs = np.concatenate(prob_batches, axis=0)
    print(
        f'Epoch {epoch}, LR {lr}, Train_Time {time.time() - epoch_time:.2f}s, Loss: {loss_meter.avg:.4f}'
    )
    return loss_meter.avg, gt_label, preds_probs
def batch_trainer(epoch, model, train_loader, criterion, optimizer, loss):
    """One training epoch for the four-level model.

    Args:
        loss: name of the loss scheme; only 'Multi_Level_Loss' is supported.

    Raises:
        ValueError: for an unknown `loss` scheme. (Previously an unknown
            scheme fell through to a NameError on an undefined `train_loss`.)

    Returns:
        (avg_train_loss, gt_labels, pred_probs) as numpy arrays.
    """
    model.train()
    epoch_time = time.time()
    loss_meter = AverageMeter()
    batch_num = len(train_loader)
    gt_list = []
    preds_probs = []
    lr = optimizer.param_groups[0]['lr']
    for step, (imgs, gt_label, imgname) in enumerate(train_loader):
        batch_time = time.time()
        imgs, gt_label = imgs.cuda(), gt_label.cuda()
        train_logit_1, train_logit_2, train_logit_3, train_logit_4 = model(imgs)
        if loss == 'Multi_Level_Loss':
            # deeper levels get progressively larger weights
            train_loss = (0.1 * criterion(train_logit_1, gt_label)
                          + 0.3 * criterion(train_logit_2, gt_label)
                          + 0.7 * criterion(train_logit_3, gt_label)
                          + criterion(train_logit_4, gt_label))
        else:
            raise ValueError(f'unsupported loss scheme: {loss!r}')
        train_loss.backward()
        clip_grad_norm_(model.parameters(), max_norm=10.0)  # make larger learning rate works
        optimizer.step()
        optimizer.zero_grad()
        loss_meter.update(to_scalar(train_loss))
        gt_list.append(gt_label.cpu().numpy())
        # predictions are scored from the deepest head only
        train_probs = torch.sigmoid(train_logit_4)
        preds_probs.append(train_probs.detach().cpu().numpy())
        log_interval = 20
        if (step + 1) % log_interval == 0 or (step + 1) % len(train_loader) == 0:
            print(
                f'{time_str()}, Step {step}/{batch_num} in Ep {epoch}, {time.time() - batch_time:.2f}s ',
                f'train_loss:{loss_meter.val:.4f}')
    train_loss = loss_meter.avg
    gt_label = np.concatenate(gt_list, axis=0)
    preds_probs = np.concatenate(preds_probs, axis=0)
    print(
        f'Epoch {epoch}, LR {lr}, Train_Time {time.time() - epoch_time:.2f}s, Loss: {loss_meter.avg:.4f}'
    )
    return train_loss, gt_label, preds_probs
def train_val(model, optimizer, train_loader, test_loader, epoch, margin=1.0,
              use_ohem=False, log_interval=100, test_interval=2000, is_cuda=True):
    """Train one epoch of the triplet network, periodically testing and saving.

    Logs the training loss every `log_interval` batches; every `test_interval`
    batches runs `best_test` and saves the feature-model weights.
    """
    loss = AverageMeter()
    batch_num = len(train_loader)
    # hoisted out of the loop: the loss layer does not depend on the batch
    triploss_layer = TripletMarginLoss(margin, use_ohem=use_ohem)
    for batch_idx, (data_a, data_p, data_n, target) in enumerate(train_loader):
        model.train()
        if is_cuda:
            data_a = data_a.cuda()
            data_p = data_p.cuda()
            data_n = data_n.cuda()
        data_a = Variable(data_a)
        data_p = Variable(data_p)
        data_n = Variable(data_n)
        target = Variable(target)
        optimizer.zero_grad()
        out_a = model(data_a)
        out_p = model(data_p)
        out_n = model(data_n)
        trip_loss = triploss_layer(out_a, out_p, out_n)
        trip_loss.backward()
        optimizer.step()
        # FIX: `.item()` instead of the removed `.data[0]` indexing — indexing
        # a 0-dim tensor raises on PyTorch >= 0.4.
        loss.update(trip_loss.item())
        if (batch_idx + 1) % log_interval == 0:
            logging('Train-Epoch:{:04d}\tbatch:{:06d}/{:06d}\tloss:{:.04f}'\
                .format(epoch, batch_idx + 1, batch_num, trip_loss.item()))
        if (batch_idx + 1) % test_interval == 0:
            threshold, accuracy, mean_d_a_p, mean_d_a_n = best_test(model, test_loader)
            logging(
                'Test-T-A Epoch {:04d}-{:06d} accuracy: {:.04f} threshold: {:.05} ap_mean: {:.04f} an_mean: {:.04f}'
                .format(epoch, batch_idx + 1, accuracy, threshold,
                        mean_d_a_p, mean_d_a_n))
            cutoff = len(model.module.feat_model._modules)
            model_name = 'models/epoch_{:04d}-{:06d}_feat.weights'.format(
                epoch, batch_idx + 1)
            save_weights(model.module.feat_model, model_name, cutoff)
            logging('save model: {:s}'.format(model_name))
def valid_trainer(model, valid_loader, criterion):
    """Validation pass for the four-level model; only the deepest head is scored.

    Returns:
        (avg_loss, gt_labels, pred_probs) concatenated over all batches.
    """
    model.eval()
    loss_meter = AverageMeter()
    label_batches = []
    prob_batches = []
    with torch.no_grad():
        for step, (imgs, gt_label, imgname) in enumerate(tqdm(valid_loader)):
            imgs = imgs.cuda()
            gt_label = gt_label.cuda()
            logit_1, logit_2, logit_3, logit_4 = model(imgs)
            # raw labels (with -1 "unknown") recorded before zeroing for the loss
            label_batches.append(gt_label.cpu().numpy())
            gt_label[gt_label == -1] = 0
            batch_loss = criterion(logit_4, gt_label)
            prob_batches.append(torch.sigmoid(logit_4).detach().cpu().numpy())
            loss_meter.update(to_scalar(batch_loss))
    return (loss_meter.avg,
            np.concatenate(label_batches, axis=0),
            np.concatenate(prob_batches, axis=0))
def train_epoch(current_epoch, loss_functions, model, optimizer, scheduler,
                train_data_loader, summary_writer, conf, local_rank):
    """Train the segmentation model for one epoch.

    Tracks the mask loss and a rounded mIoU, supports apex fp16 via
    `conf['fp16']`, and steps the scheduler per-epoch or per-step depending on
    `conf["optimizer"]["schedule"]["mode"]`. On rank 0, logs per-group LR and
    the average loss to `summary_writer`.
    """
    losses = AverageMeter()
    mious = AverageMeter()
    iterator = tqdm(train_data_loader)
    model.train()
    # epoch-mode schedules step once, up front
    if conf["optimizer"]["schedule"]["mode"] == "epoch":
        scheduler.step(current_epoch)
    for i, sample in enumerate(iterator):
        imgs = sample["image"].cuda()
        masks = sample["mask"].cuda().float()
        masks_orig = sample["mask_orig"].cuda().float()
        out_mask = model(imgs)
        # metric only — no gradients through the argmax/miou computation
        with torch.no_grad():
            pred = torch.softmax(out_mask, dim=1)
            argmax = torch.argmax(pred, dim=1)
            ious = miou_round(argmax, masks_orig).item()
            mious.update(ious, imgs.size(0))
        mask_loss = loss_functions["mask_loss"](out_mask, masks.contiguous())
        loss = mask_loss
        losses.update(loss.item(), imgs.size(0))
        iterator.set_description(
            "epoch: {}; lr {:.7f}; Loss ({loss.avg:.4f}); miou ({miou.avg:.4f}); "
            .format(current_epoch, scheduler.get_lr()[-1], loss=losses, miou=mious))
        optimizer.zero_grad()
        if conf['fp16']:
            # apex mixed precision: scale the loss before backward
            with amp.scale_loss(loss, optimizer) as scaled_loss:
                scaled_loss.backward()
        else:
            loss.backward()
        # clip on the master params (apex); max-norm 1
        torch.nn.utils.clip_grad_norm_(amp.master_params(optimizer), 1)
        optimizer.step()
        torch.cuda.synchronize()
        if conf["optimizer"]["schedule"]["mode"] in ("step", "poly"):
            scheduler.step(i + current_epoch * len(train_data_loader))
    if local_rank == 0:
        for idx, param_group in enumerate(optimizer.param_groups):
            lr = param_group['lr']
            summary_writer.add_scalar('group{}/lr'.format(idx), float(lr),
                                      global_step=current_epoch)
        summary_writer.add_scalar('train/loss', float(losses.avg),
                                  global_step=current_epoch)
def main():
    """Evaluate the track-1 model on the validation set and save results.

    Loads each LR .mat from `data_path`, runs self-ensembled inference,
    computes MRAE against the HR ground truth, and writes the reconstruction
    to `result_path`.
    """
    args.n_resgroups = 5
    args.n_resblocks = 3
    args.n_feats = 64
    args.n_reduction = 16
    data_path = './data/valid/lr3'
    gt_path = './data/valid/hr'
    result_path = './track1_valid_data/'
    var_name = 'data'
    if not os.path.exists(result_path):
        os.makedirs(result_path)
    model_path = './model/track1_model.pkl'
    save_point = torch.load(model_path)
    model_param = save_point['state_dict']
    model = make_model(args)
    model.load_state_dict(model_param)
    model = model.cuda()
    model.eval()
    mrae = AverageMeter()
    for mat_name in sorted(os.listdir(data_path)):
        mat_path_name = os.path.join(data_path, mat_name)
        # FIX: close the HDF5 files (previously left open -> handle leak)
        with h5py.File(mat_path_name, 'r') as f:
            input_data = np.array(f.get(var_name))
        mat_name = mat_name[:-8] + '_tr1.mat'
        mat_path_name = os.path.join(gt_path, mat_name)
        with h5py.File(mat_path_name, 'r') as f:
            target = np.array(f.get(var_name))
        target = np.transpose(target, [2, 1, 0])
        input_data = input_data / 65535  # 16-bit data -> [0, 1]
        img_res = self_ensemble(model, input_data, target)
        MRAEs = cal_mrae(target, img_res)
        # idempotent here: strips the 8-char '_tr1.mat' suffix and re-adds it
        mat_name = mat_name[:-8] + '_tr1.mat'
        mat_dir = os.path.join(result_path, mat_name)
        save_matv73(mat_dir, var_name, img_res)
        mrae.update(MRAEs)
        print(mat_name)
        print(img_res.shape)
        print(MRAEs)
        # running MRAE average over the files processed so far
        print(mrae.avg)
def batch_trainer(epoch, model, train_loader, criterion, optimizer, loss):
    """One training epoch for the attribute + depth-autoencoder model.

    Args:
        loss: name of the loss scheme; only 'BCE_LOSS' is supported.

    Raises:
        ValueError: for an unknown `loss` scheme. (Previously an unknown
            scheme fell through to a NameError on an undefined `train_loss`.)

    Returns:
        (avg_train_loss, gt_labels, pred_probs) as numpy arrays.
    """
    model.train()
    epoch_time = time.time()
    loss_meter = AverageMeter()
    batch_num = len(train_loader)
    gt_list = []
    preds_probs = []
    lr = optimizer.param_groups[0]['lr']
    for step, (imgs, gt_label, gt_depth, imgname) in enumerate(train_loader):
        batch_time = time.time()
        imgs, gt_label, gt_depth = imgs.cuda(), gt_label.cuda(), gt_depth.cuda()
        # second model output renamed from `_` — it is a real value (depth branch)
        train_logits, depth_out = model(imgs)
        if loss == 'BCE_LOSS':
            # attribute loss plus the depth auto-encoder reconstruction loss
            train_loss = criterion(train_logits, gt_label) + loss_autoencoder(
                depth_out, gt_depth)
        else:
            raise ValueError(f'unsupported loss scheme: {loss!r}')
        train_loss.backward()
        clip_grad_norm_(model.parameters(), max_norm=10.0)  # make larger learning rate works
        optimizer.step()
        optimizer.zero_grad()
        loss_meter.update(to_scalar(train_loss))
        gt_list.append(gt_label.cpu().numpy())
        train_probs = torch.sigmoid(train_logits)
        preds_probs.append(train_probs.detach().cpu().numpy())
        log_interval = 20
        if (step + 1) % log_interval == 0 or (step + 1) % len(train_loader) == 0:
            print(
                f'{time_str()}, Step {step}/{batch_num} in Ep {epoch}, {time.time() - batch_time:.2f}s ',
                f'train_loss:{loss_meter.val:.4f}')
    train_loss = loss_meter.avg
    gt_label = np.concatenate(gt_list, axis=0)
    preds_probs = np.concatenate(preds_probs, axis=0)
    print(
        f'Epoch {epoch}, LR {lr}, Train_Time {time.time() - epoch_time:.2f}s, Loss: {loss_meter.avg:.4f}'
    )
    return train_loss, gt_label, preds_probs
def train_epoch(current_epoch, loss_function, model, optimizer, scheduler,
                train_data_loader, summary_writer, conf, local_rank, debug):
    """Train the landmark model for one epoch (at most `max_iter` batches).

    Supports apex fp16, per-step or per-epoch LR schedules, and logs loss/LR
    to `summary_writer` each step plus per-group LR at epoch end on rank 0.
    """
    # stores running averages (translated from the original Chinese comment)
    progbar = Progbar(len(train_data_loader.dataset),
                      stateful_metrics=['epoch', 'config', 'lr'])
    batch_time = AverageMeter()
    end = time.time()
    losses = AverageMeter()
    max_iters = conf['optimizer']['schedule']['params']['max_iter']
    print("training epoch {}".format(current_epoch))
    model.train()
    for i, (landmarks, imgs, img_path) in enumerate(train_data_loader):
        numm = imgs.shape[0]
        optimizer.zero_grad()
        # collapse any extra leading dims into the batch dimension
        imgs = imgs.reshape((-1, imgs.size(-3), imgs.size(-2), imgs.size(-1)))
        imgs = Variable(imgs, requires_grad=True).cuda()
        landmarks = landmarks.cuda().float()
        output = model(imgs)
        loss = loss_function(output, landmarks)
        losses.update(loss.item(), imgs.size(0))
        summary_writer.add_scalar('train/loss', loss.item(),
                                  global_step=i + current_epoch * max_iters)
        summary_writer.add_scalar('train/lr', float(scheduler.get_lr()[-1]),
                                  global_step=i + current_epoch * max_iters)
        if conf['fp16']:
            # apex mixed precision: scale the loss before backward
            with amp.scale_loss(loss, optimizer) as scaled_loss:
                scaled_loss.backward()
        else:
            loss.backward()
        torch.nn.utils.clip_grad_norm_(amp.master_params(optimizer), 1)
        optimizer.step()
        torch.cuda.synchronize()
        batch_time.update(time.time() - end)
        end = time.time()
        if conf["optimizer"]["schedule"]["mode"] in ("step", "poly"):
            scheduler.step(i + current_epoch * max_iters)
        # stop at the configured iteration cap (or immediately when debugging)
        if (i == max_iters - 1) or debug:
            break
        progbar.add(numm, values=[('epoch', current_epoch),
                                  ('loss', losses.avg),
                                  ("lr", float(scheduler.get_lr()[-1]))])
    if conf["optimizer"]["schedule"]["mode"] == "epoch":
        scheduler.step(current_epoch)
    if local_rank == 0:
        for idx, param_group in enumerate(optimizer.param_groups):
            lr = param_group['lr']
            summary_writer.add_scalar('group{}/lr'.format(idx), float(lr),
                                      global_step=current_epoch)
def extract(test_loader, model):
    """Forward every batch of `test_loader` through `model` without gradients
    and return all outputs stacked into a single numpy array."""
    timer = AverageMeter(10)
    model.eval()
    chunks = []
    with torch.no_grad():
        tick = time.time()
        for batch in test_loader:
            # compute output
            out = model(batch)
            chunks.append(out.data.cpu().numpy())
            # measure elapsed time per batch
            timer.update(time.time() - tick)
            tick = time.time()
    return np.vstack(chunks)
def run_epoch(phase, epoch, data_loader, model_loss, opt, optimizer, losses_stat):
    """Run one train/eval epoch through the combined model+loss module.

    Args:
        phase: 'train' enables backprop; anything else runs in eval mode.

    Returns:
        dict mapping each stat name in `losses_stat` to its epoch average.
    """
    avg_loss_stats = {k: AverageMeter() for k in losses_stat}
    if phase == 'train':
        model_loss.train()
    else:
        # NOTE(review): `opt.gups` looks like a typo of `opt.gpus` — confirm
        # against the options object before renaming.
        if len(opt.gups) > 1:
            model_loss = model_loss.module
        model_loss.eval()
        torch.cuda.empty_cache()
    for iter_id, batch in enumerate(data_loader):
        for k in batch:
            if k != 'meta':
                batch[k] = batch[k].to(device=opt.device, non_blocking=True)
        output, loss, loss_stats = model_loss(batch)
        loss = loss.mean()
        if phase == 'train':
            optimizer.zero_grad()
            loss.backward()
            optimizer.step()
        # BUG FIX: iter_id is an int; `'...' + iter_id` raised TypeError.
        message = phase + ' | ' + ' epoch : ' + str(epoch) + ' | iter : ' + str(iter_id) + " | "
        for kw in loss_stats.items():
            message += kw[0] + ' : ' + str(kw[1]) + ' | '
        print(message)
        for k in avg_loss_stats:
            # NOTE(review): key 'iinput' looks suspicious ('input'?) — confirm
            # it matches the data loader's batch keys.
            avg_loss_stats[k].update(loss_stats[k].mean().item(),
                                     batch['iinput'].size(0))
    ret = {k: v.avg for k, v in avg_loss_stats.items()}
    return ret
def valid_trainer(model, valid_loader, criterion):
    """Validation pass for the two-output model; the second head is scored.

    Returns:
        (avg_loss, gt_labels, pred_probs) concatenated over all batches.
    """
    model.eval()
    loss_meter = AverageMeter()
    label_batches = []
    prob_batches = []
    with torch.no_grad():
        for step, (imgs, gt_label, imgname) in enumerate(tqdm(valid_loader)):
            imgs = imgs.cuda()
            gt_label = gt_label.cuda()
            # raw labels (with -1 "unknown") recorded before zeroing for the loss
            label_batches.append(gt_label.cpu().numpy())
            gt_label[gt_label == -1] = 0
            valid_logits, valid_logits_2 = model(imgs)
            batch_loss = criterion(valid_logits_2, gt_label)
            prob_batches.append(torch.sigmoid(valid_logits_2).cpu().numpy())
            loss_meter.update(to_scalar(batch_loss))
    return (loss_meter.avg,
            np.concatenate(label_batches, axis=0),
            np.concatenate(prob_batches, axis=0))
def valid_trainer(model, valid_loader, criterion):
    """Validation pass; gradients are deliberately left enabled because the
    GradCam hooks need them (the original disabled `torch.no_grad()`).

    Returns:
        (avg_loss, gt_labels, pred_probs) concatenated over all batches.
    """
    model.eval()
    # Instantiating GradCam registers hooks on layer4; the cam object itself
    # is not used in the loop below but the registration is kept as-is.
    grad_cam = GradCam(model=model, target_layer_names=["layer4"], use_cuda=True)
    loss_meter = AverageMeter()
    label_batches = []
    prob_batches = []
    for step, (imgs, gt_label, imgname) in enumerate(tqdm(valid_loader)):
        imgs = imgs.cuda()
        gt_label = gt_label.cuda()
        # raw labels (with -1 "unknown") recorded before zeroing for the loss
        label_batches.append(gt_label.cpu().numpy())
        gt_label[gt_label == -1] = 0
        valid_logits = model(imgs)
        batch_loss = criterion(valid_logits, gt_label)
        prob_batches.append(torch.sigmoid(valid_logits).detach().cpu().numpy())
        loss_meter.update(to_scalar(batch_loss))
    return (loss_meter.avg,
            np.concatenate(label_batches, axis=0),
            np.concatenate(prob_batches, axis=0))
def train(self, epoch):
    """Train the landmark model for one epoch, log metrics, then evaluate
    on the last batch."""
    self.scheduler.step()
    self.model.train()
    landmark_loss_ = AverageMeter()
    for batch_idx, sample in enumerate(self.train_loader):
        image = sample['image']
        gt_landmarks = sample['landmarks']
        image, gt_landmarks = image.to(self.device), gt_landmarks.to(self.device)
        pred_landmarks = self.model(image)
        loss = self.lossfn(pred_landmarks, gt_landmarks)
        self.optimizer.zero_grad()
        loss.backward()
        self.optimizer.step()
        # FIX: store a float, not the loss tensor — keeping the tensor in the
        # meter retained every batch's autograd graph in memory.
        landmark_loss_.update(loss.item(), image.size(0))
        if batch_idx % 20 == 0:
            print(
                "Train Epoch: {:03} [{:05}/{:05} ({:03.0f}%)]\tLoss:{:.6f} LR: {:.7f}"
                .format(epoch, batch_idx * len(sample['image']),
                        len(self.train_loader.dataset),
                        100. * batch_idx / len(self.train_loader), loss.item(),
                        self.optimizer.param_groups[0]['lr']))
    self.scalar_info['loss'] = landmark_loss_.avg
    self.scalar_info['lr'] = self.scheduler.get_lr()[0]
    if self.logger is not None:
        for tag, value in list(self.scalar_info.items()):
            self.logger.scalar_summary(tag, value, self.run_count)
        self.scalar_info = {}
        self.run_count += 1
    print("|===>Loss: {:.4f}".format(landmark_loss_.avg))
    # qualitative evaluation on the final batch of the epoch
    self.evaluate(epoch, image, gt_landmarks, pred_landmarks)
def valid_trainer(epoch, model, valid_loader, criterion):
    """Validation pass with per-head losses and maximum-voting fusion.

    Returns:
        (avg_loss, gt_labels, pred_probs) concatenated over all batches.
    """
    model.eval()
    loss_meter = AverageMeter()
    label_batches = []
    prob_batches = []
    with torch.no_grad():
        for step, (imgs, gt_label, imgname) in enumerate(tqdm(valid_loader)):
            imgs = imgs.cuda()
            gt_label = gt_label.cuda()
            # raw labels (with -1 "unknown") recorded before zeroing for the loss
            label_batches.append(gt_label.cpu().numpy())
            gt_label[gt_label == -1] = 0
            output = model(imgs)
            # sum of the criterion over all heads
            batch_loss = sum(criterion(head, gt_label) for head in output)
            # maximum voting: element-wise max across the five heads
            voted = output[0]
            for head in output[1:5]:
                voted = torch.max(voted, head)
            prob_batches.append(torch.sigmoid(voted).detach().cpu().numpy())
            loss_meter.update(to_scalar(batch_loss))
    return (loss_meter.avg,
            np.concatenate(label_batches, axis=0),
            np.concatenate(prob_batches, axis=0))
def validate(val_queue, model):
    """Evaluate `model` on `val_queue`; returns (top1_avg, top5_avg)."""
    top1 = AverageMeter()
    top5 = AverageMeter()
    model.eval()
    for data in tqdm.tqdm(val_queue):
        inputs = data[0].cuda(non_blocking=True)
        labels = data[1].cuda(non_blocking=True)
        with torch.no_grad():
            logits = model(inputs)
        acc1, acc5 = accuracy(logits, labels, topk=(1, 5))
        batch = inputs.size(0)
        top1.update(acc1.data.item(), batch)
        top5.update(acc5.data.item(), batch)
    return top1.avg, top5.avg
def train(self, epoch):
    """Train the classifier for one epoch.

    Returns:
        (avg_cls_loss, avg_accuracy) over the epoch.
    """
    cls_loss_ = AverageMeter()
    accuracy_ = AverageMeter()
    self.model.train()
    for batch_idx, (data, gt_label) in enumerate(self.train_loader):
        data, gt_label = data.to(self.device), gt_label.to(self.device)
        cls_pred = self.model(data)
        # compute the loss
        cls_loss = self.lossfn.cls_loss(gt_label, cls_pred)
        accuracy = self.compute_accuracy(cls_pred, gt_label)
        self.optimizer.zero_grad()
        cls_loss.backward()
        self.optimizer.step()
        # FIX: store floats, not tensors — storing the loss tensor in the
        # meter retained every batch's autograd graph.
        cls_loss_.update(cls_loss.item(), data.size(0))
        accuracy_.update(float(accuracy), data.size(0))
        if batch_idx % 50 == 0:
            print(
                'Train Epoch: {} [{}/{} ({:.0f}%)]\tTrain Loss: {:.6f}\tTrain Accuracy: {:.6f}'
                .format(epoch, batch_idx * len(data),
                        len(self.train_loader.dataset),
                        100. * batch_idx / len(self.train_loader),
                        cls_loss.item(), accuracy))
    self.scalar_info['cls_loss'] = cls_loss_.avg
    self.scalar_info['accuracy'] = accuracy_.avg
    self.scalar_info['lr'] = self.lr
    print("|===>Loss: {:.4f} Train Accuracy: {:.6f} ".format(
        cls_loss_.avg, accuracy_.avg))
    return cls_loss_.avg, accuracy_.avg
def main():
    """Run the track-1 model over the final test set and save reconstructions."""
    args.n_resgroups = 5
    args.n_resblocks = 3
    args.n_feats = 64
    args.n_reduction = 16
    data_path = './data/final_test/lr3'
    result_path = './track1_test_data/'
    var_name = 'data'
    if not os.path.exists(result_path):
        os.makedirs(result_path)
    model_path = './model/track1_model.pkl'
    save_point = torch.load(model_path)
    model_param = save_point['state_dict']
    model = make_model(args)
    model.load_state_dict(model_param)
    model = model.cuda()
    model.eval()
    # (removed an unused AverageMeter — no ground truth exists for the test set)
    for mat_name in sorted(os.listdir(data_path)):
        mat_path_name = os.path.join(data_path, mat_name)
        # FIX: close the HDF5 file (previously left open -> handle leak)
        with h5py.File(mat_path_name, 'r') as f:
            input_data = np.array(f.get(var_name))
        input_data = input_data / 65535  # 16-bit data -> [0, 1]
        s_time = time.time()
        img_res = self_ensemble(model, input_data)
        e_time = time.time()
        # FIX: elapsed time was printed as a negative value (s_time - e_time)
        print(e_time - s_time)
        mat_name = mat_name[:-8] + '_tr1.mat'
        mat_dir = os.path.join(result_path, mat_name)
        save_matv73(mat_dir, var_name, img_res)
        print(mat_name)
        print(img_res.shape)
def validate(val_loader, model, criterion):
    """Distributed-aware validation.

    Returns:
        (top1_avg, top5_avg, loss_avg) over the validation set.
    """
    objs = AverageMeter()
    top1 = AverageMeter()
    top5 = AverageMeter()
    model.eval()
    for step, data in enumerate(val_loader):
        x = data[0].cuda(non_blocking=True)
        target = data[1].cuda(non_blocking=True)
        with torch.no_grad():
            logits = model(x)
            loss = criterion(logits, target)
        prec1, prec5 = accuracy(logits, target, topk=(1, 5))
        if args.distributed:
            # average metrics across workers
            reduced_loss = reduce_tensor(loss.data)
            prec1 = reduce_tensor(prec1)
            prec5 = reduce_tensor(prec5)
        else:
            reduced_loss = loss.data
        batch = x.size(0)
        objs.update(reduced_loss.item(), batch)
        top1.update(prec1.item(), batch)
        top5.update(prec5.item(), batch)
        if args.local_rank == 0 and step % args.print_freq == 0:
            # duration covers the interval since the previous log line
            duration = 0 if step == 0 else time.time() - duration_start
            duration_start = time.time()
            logging.info(
                'VALIDATE Step: %03d Objs: %e R1: %f R5: %f Duration: %ds',
                step, objs.avg, top1.avg, top5.avg, duration)
    return top1.avg, top5.avg, objs.avg
def train(train_loader, model, criterion, optimizer):
    """One training epoch with optional apex AMP, gradient clipping, and
    distributed metric reduction.

    Metrics are only accumulated every `args.print_freq` steps (see the
    comment below). Returns (top1_avg, loss_avg).
    """
    objs = AverageMeter()
    top1 = AverageMeter()
    top5 = AverageMeter()
    batch_time = AverageMeter()
    data_time = AverageMeter()
    model.train()
    end = time.time()
    for step, data in enumerate(train_loader):
        # time spent waiting on the data loader
        data_time.update(time.time() - end)
        x = data[0].cuda(non_blocking=True)
        target = data[1].cuda(non_blocking=True)
        # forward
        batch_start = time.time()
        logits = model(x)
        loss = criterion(logits, target)
        # backward
        optimizer.zero_grad()
        if args.opt_level is not None:
            # apex mixed precision: scale the loss before backward
            with amp.scale_loss(loss, optimizer) as scaled_loss:
                scaled_loss.backward()
        else:
            loss.backward()
        if args.grad_clip > 0:
            torch.nn.utils.clip_grad_norm_(amp.master_params(optimizer),
                                           args.grad_clip)
        optimizer.step()
        batch_time.update(time.time() - batch_start)
        if step % args.print_freq == 0:
            # For better performance, don't accumulate these metrics every iteration,
            # since they may incur an allreduce and some host<->device syncs.
            prec1, prec5 = accuracy(logits, target, topk=(1, 5))
            if args.distributed:
                reduced_loss = reduce_tensor(loss.data)
                prec1 = reduce_tensor(prec1)
                prec5 = reduce_tensor(prec5)
            else:
                reduced_loss = loss.data
            objs.update(reduced_loss.item(), x.size(0))
            top1.update(prec1.item(), x.size(0))
            top5.update(prec5.item(), x.size(0))
            torch.cuda.synchronize()
            # duration covers the interval since the previous log line
            duration = 0 if step == 0 else time.time() - duration_start
            duration_start = time.time()
            if args.local_rank == 0:
                logging.info(
                    'TRAIN Step: %03d Objs: %e R1: %f R5: %f Duration: %ds BTime: %.3fs DTime: %.4fs',
                    step, objs.avg, top1.avg, top5.avg, duration,
                    batch_time.avg, data_time.avg)
        end = time.time()
    return top1.avg, objs.avg
def train(self, epoch):
    """Train the detection net for one epoch (classification + box regression).

    Returns:
        (avg_cls_loss, avg_box_offset_loss, avg_total_loss, avg_accuracy).
    """
    cls_loss_ = AverageMeter()
    box_offset_loss_ = AverageMeter()
    total_loss_ = AverageMeter()
    accuracy_ = AverageMeter()
    self.scheduler.step()
    self.model.train()
    for batch_idx, (data, target) in enumerate(self.train_loader):
        gt_label = target['label']
        gt_bbox = target['bbox_target']
        data, gt_label, gt_bbox = (data.to(self.device),
                                   gt_label.to(self.device),
                                   gt_bbox.to(self.device).float())
        cls_pred, box_offset_pred = self.model(data)
        # compute the loss
        cls_loss = self.lossfn.cls_loss(gt_label, cls_pred)
        box_offset_loss = self.lossfn.box_loss(gt_label, gt_bbox, box_offset_pred)
        total_loss = cls_loss + box_offset_loss
        accuracy = self.compute_accuracy(cls_pred, gt_label)
        self.optimizer.zero_grad()
        total_loss.backward()
        self.optimizer.step()
        # FIX: store floats, not tensors — storing loss tensors in the meters
        # retained every batch's autograd graph in memory.
        cls_loss_.update(cls_loss.item(), data.size(0))
        box_offset_loss_.update(box_offset_loss.item(), data.size(0))
        total_loss_.update(total_loss.item(), data.size(0))
        accuracy_.update(float(accuracy), data.size(0))
        print(
            'Train Epoch: {} [{}/{} ({:.0f}%)]\tLoss: {:.6f}\tAccuracy: {:.6f}'
            .format(epoch, batch_idx * len(data),
                    len(self.train_loader.dataset),
                    100. * batch_idx / len(self.train_loader),
                    total_loss.item(), accuracy.item()))
    self.scalar_info['cls_loss'] = cls_loss_.avg
    self.scalar_info['box_offset_loss'] = box_offset_loss_.avg
    self.scalar_info['total_loss'] = total_loss_.avg
    self.scalar_info['accuracy'] = accuracy_.avg
    self.scalar_info['lr'] = self.scheduler.get_lr()[0]
    if self.logger is not None:
        for tag, value in list(self.scalar_info.items()):
            self.logger.scalar_summary(tag, value, self.run_count)
        self.scalar_info = {}
        self.run_count += 1
    print("|===>Loss: {:.4f}".format(total_loss_.avg))
    return cls_loss_.avg, box_offset_loss_.avg, total_loss_.avg, accuracy_.avg
def train(self, epoch):
    """One epoch of joint training for the two-stage part-attention model,
    followed by a full pass over the validation set.

    model_1 produces features ``x`` and an attention ``mask``; parts are
    cropped from the mask region and fed, together with ``x``, to model_2
    for classification. Both optimizers are stepped on the same loss.

    Returns:
        (train cls-loss avg, train accuracy avg, validation accuracy avg).
    """
    cls_loss_ = AverageMeter()
    accuracy_ = AverageMeter()
    accuracy_valid_ = AverageMeter()
    # --- training pass (was: "training set as model input") ---
    self.scheduler_1.step()
    self.scheduler_2.step()
    self.model_1.train()
    self.model_2.train()
    for batch_idx, (data, gt_label) in enumerate(self.train_loader):
        data, gt_label = data.to(self.device), gt_label.to(self.device)
        x, mask = self.model_1(data)
        # test
        # print(self.model_1.alexnet_1.conv1[0].weight.data)
        # print(self.model_2.channelgroup_2.group[0].weight.data[5][5:10])
        # print(self.model_3.Classify_1.conv1[0].weight.data)
        # test
        # Part extraction is non-differentiable bookkeeping, so it runs
        # under no_grad; gradients still flow to model_1 through `x`.
        with torch.no_grad():
            parts = part_box(mask)
            img_parts, parts = get_part(data.cpu(), parts)  # (1, 64, 48, 48)
            img_parts = torch.from_numpy(img_parts).view(
                img_parts.shape[0], 1, 48, 48).to(self.device)  # view(64, 1, 48, 48)
        # dump qualitative results at a few fixed epochs for inspection
        if (epoch == 1 or epoch == 5 or epoch == 10 or epoch == 15) and batch_idx == 1:
            self.show_image_grid(data, img_parts, parts, epoch)
            self.show_mask(mask, epoch)
            print('save image and parts in result: ' + self.config.save_path)
            print('epoch: ' + str(epoch))
            print('batch_idx: ' + str(batch_idx))
        cls_pred = self.model_2(img_parts, x)
        # compute the loss
        cls_loss = self.lossfn.cls_loss(gt_label, cls_pred)
        accuracy = self.compute_accuracy(cls_pred, gt_label)
        # NOTE(review): `epoch >= 0` is always true — presumably a leftover
        # switch for freezing early epochs; confirm before removing.
        if epoch >= 0:
            self.optimizer_1.zero_grad()
            self.optimizer_2.zero_grad()
            cls_loss.backward()
            self.optimizer_1.step()
            self.optimizer_2.step()
        cls_loss_.update(cls_loss.item(), data.size(0))
        accuracy_.update(accuracy, data.size(0))
        if batch_idx % 2000 == 1:
            print('batch_idx: ', batch_idx)
            print('Cls loss: ', cls_loss.item())
    # --- validation pass (was: "validation set as model input") ---
    with torch.no_grad():
        self.model_1.eval()
        self.model_2.eval()
        for batch_idx, (data, gt_label) in enumerate(self.valid_loader):
            data, gt_label = data.to(self.device), gt_label.to(self.device)
            x, mask = self.model_1(data)
            parts = part_box(mask)
            img_parts, parts = get_part(data.cpu(), parts)  # (4, 64, 48, 48)
            img_parts = torch.from_numpy(img_parts).view(
                img_parts.shape[0], 1, 48, 48).to(self.device)
            cls_pred = self.model_2(img_parts, x)
            accuracy_valid = self.compute_accuracy(cls_pred, gt_label)
            accuracy_valid_.update(accuracy_valid, data.size(0))
    # record metrics (was: "record data")
    self.scalar_info['cls_loss'] = cls_loss_.avg
    self.scalar_info['accuracy'] = accuracy_.avg
    self.scalar_info['lr'] = self.scheduler_1.get_lr()[0]
    # if self.logger is not None:
    #     for tag, value in list(self.scalar_info.items()):
    #         self.logger.scalar_summary(tag, value, self.run_count)
    #     self.scalar_info = {}
    # self.run_count += 1
    print(
        "\r\nEpoch: {}|===>Train Loss: {:.8f} Train Accuracy: {:.6f} valid Accuracy: {:.6f}\r\n"
        .format(epoch, cls_loss_.avg, accuracy_.avg, accuracy_valid_.avg))
    return cls_loss_.avg, accuracy_.avg, accuracy_valid_.avg
def train(epoch):
    """Train the point-cloud + image GRU model for one epoch.

    Uses module-level ``model``, ``train_loader``, ``criterion``,
    ``optimizer``, ``use_cuda``, ``seq_len`` and ``args`` (clip, print_freq).

    Args:
        epoch: epoch index, used only for progress printing.

    Returns:
        losses.avg: average training loss over the epoch.
    """
    batch_time = AverageMeter()
    data_time = AverageMeter()
    losses = AverageMeter()
    top1 = AverageMeter()
    # switch to train mode
    model.train()
    end = time.time()
    for batch_idx, (data, img_data, labels) in enumerate(train_loader):
        data_time.update(time.time() - end)  # measure data loading time
        B = data.shape[0]  # Batch size
        N = data.shape[1]  # Num of points in PointCloud
        data, labels, img_data = data.float(), labels.float(), img_data.float()
        if use_cuda:
            labels, data, img_data = labels.cuda(), data.cuda(), img_data.cuda()
        img_data = img_data.unsqueeze(1)
        # NOTE(review): hidden is always created on GPU even when use_cuda is
        # False — presumably CUDA is assumed available; confirm.
        hidden = torch.zeros(
            1, B, 512).cuda()  # initialising the hidden variable for GRU
        optimizer.zero_grad()
        output = model(data, img_data, hidden, seq_len)  # (B,4)
        loss = criterion(output, labels, seq_len)
        loss.backward()
        # BUGFIX: `clip_grad_norm_` must run BEFORE optimizer.step();
        # previously it ran after the update, so the clipping (meant to tame
        # exploding RNN gradients) never affected the step just taken.
        torch.nn.utils.clip_grad_norm_(model.parameters(), args.clip)
        optimizer.step()
        hidden = hidden.detach()
        losses.update(loss.item(), B)
        pred = output[0]
        prec1 = binary_accuracy(pred[0], labels[:, 0, 8])
        top1.update(prec1, B)
        # measure elapsed time
        batch_time.update(time.time() - end)
        end = time.time()
        if batch_idx % args.print_freq == 0:
            print('Epoch: [{0}][{1}/{2}]\t'
                  'Time {batch_time.val:.3f} ({batch_time.avg:.3f})\t'
                  'Data {data_time.val:.3f} ({data_time.avg:.3f})\t'
                  'Loss {loss.val:.4f} ({loss.avg:.4f})\t'
                  'Prec@1 {top1.val:.3f} ({top1.avg:.3f})'.format(
                      epoch, batch_idx, len(train_loader),
                      batch_time=batch_time, data_time=data_time,
                      loss=losses, top1=top1))
    return losses.avg
def validate():
    """Evaluate the model on the validation loader and score detections.

    For each sequence step the network emits a score, a coarse location
    transform (loc_seq) and a box refinement transform (box_seq); the two
    2-D similarity transforms are composed as 3x3 matrices, confident
    detections (score > 0.7) are collected per sample, and compared against
    ground-truth cars via ``eval_detect_in_region``.

    Returns:
        (losses.avg, recall, AOS) — recall from true-positive flags, AOS
        from cosine-similarity of orientations (both 0 when nothing scored).
    """
    batch_time = AverageMeter()
    losses = AverageMeter()
    top1 = AverageMeter()
    TP = torch.zeros(0)  # True Positives
    CS = torch.zeros(0)  # Cosine Similarity
    # switch to evaluate mode
    model.eval()
    # if args.evaluate:
    #     model.train()
    with torch.no_grad():
        end = time.time()
        for batch_idx, (data, img_data, labels) in enumerate(valid_loader):
            B = data.shape[0]  # Batch size
            N = data.shape[1]  # Num of points in PointCloud
            data, labels, img_data = data.float(), labels.float(
            ), img_data.float()
            # labels = labels.permute(1,0,2) #(seq,B,5)
            if use_cuda:
                labels, data, img_data = labels.cuda(), data.cuda(
                ), img_data.cuda()
            img_data = img_data.unsqueeze(1)
            hidden = torch.zeros(
                1, B, 512).cuda()  # initialising the hidden variable for GRU
            # NOTE(review): zero_grad and clip_grad_norm_ are no-ops here
            # (no backward inside no_grad) — likely copied from train();
            # harmless but could be removed.
            optimizer.zero_grad()
            output = model(data, img_data, hidden, seq_len)  # (B,4)
            loss = criterion(output, labels, seq_len)
            # `clip_grad_norm` helps prevent the exploding gradient problem in RNNs / LSTMs.
            torch.nn.utils.clip_grad_norm_(model.parameters(), args.clip)
            hidden = hidden.detach()
            losses.update(loss.item(), B)
            pred = output[0]
            # NOTE(review): uses pred[1]/labels[:, 1, 8] whereas train() uses
            # index 0 — confirm which sequence slot is intended.
            prec1 = binary_accuracy(pred[1], labels[:, 1, 8])
            top1.update(prec1, B)
            # measure elapsed time
            batch_time.update(time.time() - end)
            end = time.time()
            ###################################### Final Evaluation ####################################################
            score_seq, loc_seq, box_seq = output
            # Build flattened row-major 3x3 similarity transforms:
            # [c -s tx; s c ty; 0 0 1] from (cos, sin, tx, ty) predictions.
            trans_mat_1 = torch.eye(3).view(1, -1).repeat(seq_len, B, 1).cuda()
            trans_mat_1[:, :, 0] = loc_seq[:, :, 0]  # c
            trans_mat_1[:, :, 1] = -loc_seq[:, :, 1]  # -s
            trans_mat_1[:, :, 3] = loc_seq[:, :, 1]  # s
            trans_mat_1[:, :, 4] = loc_seq[:, :, 0]  # c
            trans_mat_1[:, :, 2] = loc_seq[:, :, 2]  # tx
            trans_mat_1[:, :, 5] = loc_seq[:, :, 3]  # ty
            trans_mat_1 = trans_mat_1.view(seq_len * B, 3, 3)
            trans_mat_2 = torch.eye(3).view(1, -1).repeat(seq_len, B, 1).cuda()
            trans_mat_2[:, :, 0] = box_seq[:, :, 0]  # c
            trans_mat_2[:, :, 1] = -box_seq[:, :, 1]  # -s
            trans_mat_2[:, :, 3] = box_seq[:, :, 1]  # s
            trans_mat_2[:, :, 4] = box_seq[:, :, 0]  # c
            trans_mat_2[:, :, 2] = box_seq[:, :, 2]  # tx
            trans_mat_2[:, :, 5] = box_seq[:, :, 3]  # ty
            trans_mat_2 = trans_mat_2.view(seq_len * B, 3, 3)
            # compose coarse and refinement transforms
            resultant_trans = torch.bmm(trans_mat_1, trans_mat_2)
            resultant_trans = resultant_trans.view(seq_len, B, 9)
            # keep (cos, sin, tx, ty) of the composed transform
            final_trans_params = resultant_trans[:, :, [0, 3, 2, 5]]
            z = (loc_seq[:, :, 4] + box_seq[:, :, 4]).view(seq_len, B, -1)
            final_trans_params = torch.cat((final_trans_params, z), 2)
            loc = final_trans_params[:, :, 2:5]
            theta = torch.atan2(final_trans_params[:, :, 1],
                                final_trans_params[:, :, 0])
            size = box_seq[:, :, 5:]
            for a in range(B):
                car_list = check_for_car(labels[a])
                detections = []
                for i in range(seq_len):
                    trans_params = torch.cat(
                        (loc[i, a], theta[i, a].view(1), size[i, a]), 0)
                    # keep only confident detections
                    if ((score_seq[i, a] > 0.7)):
                        detections.append(trans_params.cpu().numpy())
                TP_region, CS_region = eval_detect_in_region(
                    car_list, detections)
                TP = torch.cat((TP, TP_region), 0)
                CS = torch.cat((CS, CS_region), 0)
            if batch_idx % args.print_freq == 0:
                print('Test: [{0}/{1}]\t'
                      'Time {batch_time.val:.3f} ({batch_time.avg:.3f})\t'
                      'Loss {loss.val:.4f} ({loss.avg:.4f})\t'
                      'Prec@1 {top1.val:.3f} ({top1.avg:.3f})\t'.format(
                          batch_idx, len(valid_loader),
                          batch_time=batch_time, loss=losses, top1=top1))
    print(' * Prec@1 {top1.avg:.3f}'.format(top1=top1))
    # guard against empty result tensors (mean of empty tensor is nan)
    if (TP.nelement() == 0):
        recall = 0
    else:
        recall = TP.mean()
    if (CS.nelement() == 0):
        AOS = 0
    else:
        AOS = CS.mean()
    print("recall: ", recall)
    print("AOS: ", AOS)
    return losses.avg, recall, AOS
def train(model, params):
    """End-to-end training driver: builds datasets/loaders, optimizer and
    scheduler, optionally resumes from a checkpoint, trains with gradient
    accumulation, evaluates every epoch, and checkpoints (including on
    Ctrl-C). ``model`` is expected to be a ModelWithLoss wrapper returning
    ``(preds, loss)``; ``params`` carries the full configuration.
    """
    # helper function to print and save logs
    def print_log(string, print_time = True):
        if print_time:
            curr_time = time.asctime(time.localtime(time.time()))
            string = "[ " + curr_time + " ] " + string
        print(string)
        log_file = os.path.join(params.work_dir, "train_log.txt")
        with open(log_file, "a+") as log:
            log.write(string + "\n")

    # helper function to save checkpoints
    # (closure: reads e, step, best_accuracy, optimizer, scheduler from train())
    def save_checkpoint(best = False):
        if isinstance(model, nn.DataParallel):
            model_state_dict = model.module.model.state_dict()
        else:
            model_state_dict = model.model.state_dict()
        ckpt_dict = {
            "epoch": e,
            "step": step,
            "model_state_dict": model_state_dict,
            "optimizer_state_dict": optimizer.state_dict(),
            "scheduler_state_dict": scheduler.state_dict(),
            "best_accuracy": best_accuracy
        }
        ckpt_dir = os.path.join(params.work_dir, "checkpoints")
        os.makedirs(ckpt_dir, exist_ok = True)
        if best:
            torch.save(ckpt_dict, os.path.join(ckpt_dir, "best.pth"))
        else:
            torch.save(ckpt_dict, os.path.join(ckpt_dir, f"epoch_{e}_step_{step}.pth"))

    # set up our project working directory
    os.makedirs(params.work_dir, exist_ok = True)
    # and save our training configuration
    with open(os.path.join(params.work_dir, "train_args.yaml"), "w+") as f:
        yaml.dump(params.params, f)
    # print out the settings for training
    print_log("Below are the training settings", print_time = False)
    for k, v in params.params.items():
        print_log(f"{k} : {v}", print_time = False)
    # tensorboard summary writer
    writer = SummaryWriter(os.path.join(params.work_dir, "events"))
    # initiating dataset and loader
    train_dir = os.path.join(params.data_root, "train")
    train_set = TrainDataset(
        imdir = train_dir,
        input_size = params.input_size,
        color_jitter = params.color_jitter,
        resize_scale = params.resize_scale,
        ratio = params.ratio,
        interpolation = params.interpolation,
        horizontal_flip = params.horizontal_flip,
        mean = params.mean,
        std = params.std,
        fname = True
    )
    train_loader = DataLoader(
        train_set,
        batch_size = params.train_bs,
        num_workers = params.num_workers,
        shuffle = True
    )
    # we will use center crop to evaluate the model's accuracy every epoch
    val_dir = os.path.join(params.data_root, "val")
    val_set = EvalDataset(
        imdir = val_dir,
        input_size = params.input_size,
        mean = params.mean,
        std = params.std,
        rescale_sizes = params.test_rescales,
        center_square = False,
        crop = "center",
        horizontal_flip = False
    )
    val_loader = DataLoader(
        val_set,
        batch_size = params.test_bs,
        shuffle = False,
        num_workers = params.num_workers
    )
    # GPU(s) or CPU usage
    if params.gpus:
        assert len(params.gpus) >= 1, "Please provide at least one gpu id for gpu training"
        if len(params.gpus) == 1:
            device = torch.device(f"cuda:{params.gpus[0]}")
            model = model.to(device)
            print_log(f"Training model on cuda: {params.gpus[0]}")
        else:
            device = torch.device(f"cuda:{params.gpus[0]}")
            # for parallelism, the model on the default gpu is still the one being updated
            # however, we replicate it to the other gpu every forward and backward pass
            # for gradient computation on the data that we allocated to those gpus
            model = model.to(torch.device(f"cuda:{params.gpus[0]}"))
            # it seems params.gpus must be like [0, 1] instead of [1, 0]
            model = nn.DataParallel(model, params.gpus)
            print_log(f"Data Parallelism is used across cuda: {params.gpus}")
    else:
        # the model stays on cpu
        print_log("Using cpu for training")
    # define optimizer
    # add in separate bn parameters
    if params.weight_decay is not None:
        # add_weight_decay separate bias and weight and bias in batchnorm from other parameters
        # because bias terms and and weight and bias in bn should not be decayed towards zero-norm
        # check here https://discuss.pytorch.org/t/weight-decay-in-the-optimizers-is-a-bad-idea-especially-with-batchnorm/16994/2
        param_groups = add_weight_decay(model, params.weight_decay)
    else:
        param_groups = model.parameters()
    # it is recommended to construct an optimizer after you have done the model.cuda(),
    # as some optimizer might create buffers of the type same as the model parameters.
    # since we will put our model to gpu, it is better that the model parameters have the type cuda
    # instead of cpu before optimizer construction.
    optimizer = torch.optim.SGD(param_groups, lr = params.lr, weight_decay = params.weight_decay, momentum = params.momentum, nesterov=params.nesterov)
    # Let's define a learning rate scheduler that helps us reduce the learning rate by 10 times if our model's performance
    # on the validation set ceases to increase for 6 epochs
    # The mode should be min, so that it stores the min previous validation loss and compares that to the new validation loss
    # that we will provide when calling scheduler.step(<new_value>).
    # You may use "max" and validation accuracy too.
    scheduler = torch.optim.lr_scheduler.ReduceLROnPlateau(optimizer, mode = "min", factor = 0.1, patience = 6)
    # resume from previous training
    if params.resume_path:
        ckpt_dict = torch.load(params.resume_path)
        model_state_dict = ckpt_dict["model_state_dict"]
        if isinstance(model, nn.DataParallel):
            model.module.model.load_state_dict(model_state_dict)
        else:
            model.model.load_state_dict(model_state_dict)
        optimizer.load_state_dict(ckpt_dict["optimizer_state_dict"])
        scheduler.load_state_dict(ckpt_dict["scheduler_state_dict"])
        start_epoch = ckpt_dict["epoch"]
        step = ckpt_dict["step"]
        if step != 0 and step % len(train_loader) == 0:
            # if we have finished the whole epoch last time before we saved the checkpoint
            # we move on to the next epoch
            start_epoch += 1
        best_accuracy = ckpt_dict["best_accuracy"]
        print_log(f"Loaded checkpoint {params.resume_path}")
        print_log(f"Resuming from epoch {start_epoch} step {step}")
    else:
        start_epoch = 0
        step = 0
        best_accuracy = 0
    batch_loss = AverageMeter()
    batch_accu = AverageMeter()
    try:
        # the try clause is for the except below where if we use ctrl-c/cmd-c to stop the program
        # it will save a checkpoint before exiting
        for e in range(start_epoch, params.num_epochs):
            model.train()
            progress_bar = tqdm(range(len(train_loader)))
            step_in_last_epoch = step % len(train_loader)
            loader = iter(train_loader)
            for i in progress_bar:
                # If we saved weights and stopped training halfway in an epoch, let's finish the remaining data in
                # that epoch before moving on.
                # As the train_loader will be shuffled, we cannot really train the model on the data
                # we left behind last time. However, it is easier for us to track training, as with these few lines
                # of code, we can align the stored epoch number correctly with the number of images trained (suppose the batch size is
                # the same, so the number of images trained per step is the same)
                if i < step_in_last_epoch:
                    progress_bar.update()
                    continue
                if i == len(train_loader) - step_in_last_epoch:
                    # if we have finished the equivalent amount of what we left behind last time
                    # stop this epoch and move on to the next
                    break
                data = next(loader)
                images = data['image']
                labels = data['label']
                # sending images and labels to gpus
                if params.gpus and len(params.gpus) == 1:
                    # if we are using one gpu
                    images = images.to(device)
                    labels = labels.to(device)
                # else: # if the model is on cpu, then nothing needs to be done
                # if multiple gpus are used, the data will be scattered to the corresponding gpus
                # inside the nn.DataParallel class directly from CPU. Nothing needs to be done here.

                # forward pass the images to get prediction and loss
                # remember now our model is an instance of the wrapper ModelWithLoss.
                # It computes the loss inside its own forward() method.
                # Do not write as model(images = images, labels = labels) with DataParallel, as
                # they will then be counted as kwargs instead of tensor inputs
                preds, loss = model(images, labels)
                # compute accuracy for the training batch
                # Note: for the last batch or batch of odd number, the dataparallel may skip the remainder
                # when dividing the batch evenly among the gpus, resulting in different dimension between
                # preds and labels. Therefore, we need to take labels[:len(preds)]
                batch_accu.update(compute_accuracy(preds, labels[:len(preds)].view((-1, 1)))[0])
                # if we use data parallelism, the loss will be a vector with elements corresponding
                # to the loss on each gpu
                # it does not hurt if we are not using data parallelism
                loss = loss.mean()
                # compute dloss/dx for every parameter x that has requires_grad = True
                # and add this dloss/dx to the parameter's gradient
                # Initially, the parameters' gradients are all zero, loss.backward() adds the newly computed gradient
                # to the existing gradient. It will accumulate unless we call optimizer.zero_grad() to clear them.
                loss /= params.grad_accu_steps  # see comments right below
                loss.backward()
                batch_loss.update(loss.item())
                step += 1
                # params.grad_accu_steps specifies for how many mini-batches we wish to accumulate gradients.
                # It is a work-around if we cannot fit a desirable size of mini-batch in GPU, we can simply accumulate
                # 2 or 3 batches' gradient before we call optimizer.step() (backpropagation).
                # However, this work-around has a difference when you have batch normalization layers,
                # as the running averages/variances of these are computed as exponential moving average. So
                # the running averages/variances statistics may deviate from using a larger batch.
                if step % params.grad_accu_steps == 0:
                    # Backpropagation to update parameters
                    optimizer.step()
                    # Set the gradients to zero, so that we can accumulate gradients from fresh
                    optimizer.zero_grad()
                if step % params.logging_interval == 0:
                    print_log(f"Epoch {e} Step {step}: Average loss is {batch_loss.avg:.4f} Training accuracy is {batch_accu.avg:.4f}")
                    for j, param_group in enumerate(optimizer.param_groups):
                        print_log(f"lr_{j} is {param_group['lr']}")
                    batch_loss.reset()
                    batch_accu.reset()
                # NOTE(review): on steps where the meters were just reset above,
                # .val is 0 here, so a zero is written to tensorboard at every
                # logging_interval step — confirm whether these writes should
                # precede the reset.
                writer.add_scalars("accuracy", {"train": batch_accu.val}, step)
                writer.add_scalars("loss", {"train": batch_loss.val}, step)
                for j, param_group in enumerate(optimizer.param_groups):
                    writer.add_scalar(f"lr/lr_{j}", param_group["lr"], step)
                # update the information of the progress bar
                progress_bar.set_description(f"Epoch {e}/{params.num_epochs} Step {step} Loss: {batch_loss.avg:.4f} Accuracy: {batch_accu.avg:.4f}")
            if (e + 1) % params.saving_interval == 0:
                save_checkpoint()
            # evaluate our model on the validation set
            # remember, our evaluate function can take care of ModelWithLoss wrapper
            if params.gpus and len(params.gpus) == 1:
                accu_meters, loss_meter = evaluate(model, val_loader, topk = (1, ), device = device)
            else:
                # dataparallel or cpu, let the data stay on cpu, see relevants comments above during training
                accu_meters, loss_meter = evaluate(model, val_loader, topk = (1, ))
            accuracy = accu_meters[0].avg
            print_log(f"Accuracy is {accuracy:.4f}, loss is {loss_meter.avg:.4f} for Epoch {e} Step {step} ")
            writer.add_scalars("accuracy", {"val": accuracy}, step)
            writer.add_scalars("loss", {"val": loss_meter.avg}, step)
            # update learning rate scheduler
            scheduler.step(loss_meter.avg)
            # scheduler.step(accuracy)
            if accuracy > best_accuracy:
                best_accuracy = accuracy
                save_checkpoint(best = True)
    except KeyboardInterrupt:
        print_log("KeyboardInterrupt: Saving a checkpoint")
        save_checkpoint()
def train_one_epoch(model, data_queue, opt, gm, epoch, args):
    """Run one training epoch on a MegEngine detector.

    Pulls mini-batches from ``data_queue``, performs a forward/backward pass
    under the gradient manager ``gm``, steps the optimizer, and (on rank 0)
    logs per-loss averages plus train/data timings every ``log_interval``
    steps.
    """
    def _run_step(image, im_info, gt_boxes):
        # forward + backward inside the gradient-manager context
        with gm:
            loss_dict = model(image=image, im_info=im_info, gt_boxes=gt_boxes)
            gm.backward(loss_dict["total_loss"])
            loss_list = list(loss_dict.values())
        opt.step().clear_grad()
        return loss_list

    cfg = model.cfg
    meter = AverageMeter(record_len=cfg.num_losses)
    time_meter = AverageMeter(record_len=2)
    log_interval = cfg.log_interval
    # steps per epoch = images per epoch / (per-rank batch * world size)
    tot_step = cfg.nr_images_epoch // (args.batch_size * dist.get_world_size())
    for step in range(tot_step):
        adjust_learning_rate(opt, epoch, step, cfg, args)
        data_start = time.time()
        mini_batch = next(data_queue)
        data_stop = time.time()
        train_start = time.time()
        loss_list = _run_step(
            image=mge.tensor(mini_batch["data"]),
            im_info=mge.tensor(mini_batch["im_info"]),
            gt_boxes=mge.tensor(mini_batch["gt_boxes"]),
        )
        train_stop = time.time()
        time_meter.update([train_stop - train_start, data_stop - data_start])
        # only rank 0 accumulates and emits logs
        if dist.get_rank() == 0:
            per_loss_fmt = ", ".join(
                "{}:%f".format(key) for key in cfg.losses_keys)
            log_fmt = ("e%d, %d/%d, lr:%f, " + per_loss_fmt
                       + ", train_time:%.3fs, data_time:%.3fs")
            meter.update([loss.numpy() for loss in loss_list])
            if step % log_interval == 0:
                logger.info(log_fmt, epoch, step, tot_step,
                            opt.param_groups[0]["lr"],
                            *meter.average(), *time_meter.average())
                meter.reset()
                time_meter.reset()
def train(self, epoch):
    """Train the landmark network (cls + bbox + landmark heads) for one epoch.

    Iterates ``self.train_loader``, optimizes ``self.model`` with
    ``self.optimizer`` and pushes running averages to ``self.logger``.

    Args:
        epoch: epoch index, used only for progress printing.

    Returns:
        Tuple of epoch averages:
        (cls_loss, box_offset_loss, landmark_loss, total_loss, accuracy).
    """
    cls_loss_ = AverageMeter()
    box_offset_loss_ = AverageMeter()
    landmark_loss_ = AverageMeter()
    total_loss_ = AverageMeter()
    accuracy_ = AverageMeter()
    self.scheduler.step()
    self.model.train()
    for batch_idx, (data, target) in enumerate(self.train_loader):
        gt_label = target['label']
        gt_bbox = target['bbox_target']
        gt_landmark = target['landmark_target']
        data, gt_label, gt_bbox, gt_landmark = data.to(self.device), gt_label.to(
            self.device), gt_bbox.to(self.device).float(), gt_landmark.to(self.device).float()
        cls_pred, box_offset_pred, landmark_offset_pred = self.model(data)
        # compute the loss
        cls_loss = self.lossfn.cls_loss(gt_label, cls_pred)
        box_offset_loss = self.lossfn.box_loss(gt_label, gt_bbox, box_offset_pred)
        landmark_loss = self.lossfn.landmark_loss(gt_label, gt_landmark,
                                                  landmark_offset_pred)
        # bbox loss is down-weighted (0.5) relative to cls and landmark losses
        total_loss = cls_loss + box_offset_loss * 0.5 + landmark_loss
        accuracy = self.compute_accuracy(cls_pred, gt_label)
        self.optimizer.zero_grad()
        total_loss.backward()
        self.optimizer.step()
        # BUGFIX: store python floats (.item()) rather than live tensors in
        # the meters; storing tensors kept every batch's autograd graph alive
        # for the whole epoch (memory leak) and made .avg a tensor. This also
        # matches the sibling trainer that already calls .item().
        cls_loss_.update(cls_loss.item(), data.size(0))
        box_offset_loss_.update(box_offset_loss.item(), data.size(0))
        landmark_loss_.update(landmark_loss.item(), data.size(0))
        total_loss_.update(total_loss.item(), data.size(0))
        accuracy_.update(accuracy.item(), data.size(0))
        print('Train Epoch: {} [{}/{} ({:.0f}%)]\tLoss: {:.6f}\tAccuracy: {:.6f}'.format(
            epoch, batch_idx * len(data), len(self.train_loader.dataset),
            100. * batch_idx / len(self.train_loader),
            total_loss.item(), accuracy.item()))
    self.scalar_info['cls_loss'] = cls_loss_.avg
    self.scalar_info['box_offset_loss'] = box_offset_loss_.avg
    self.scalar_info['landmark_loss'] = landmark_loss_.avg
    self.scalar_info['total_loss'] = total_loss_.avg
    self.scalar_info['accuracy'] = accuracy_.avg
    self.scalar_info['lr'] = self.scheduler.get_lr()[0]
    if self.logger is not None:
        for tag, value in list(self.scalar_info.items()):
            self.logger.scalar_summary(tag, value, self.run_count)
        self.scalar_info = {}
    self.run_count += 1
    print("|===>Loss: {:.4f}".format(total_loss_.avg))
    return cls_loss_.avg, box_offset_loss_.avg, landmark_loss_.avg, total_loss_.avg, accuracy_.avg