def validate(val_dataloader, model, configs):
    """Run one full validation pass and return the average (reduced) loss.

    val_dataloader: validation data loader yielding (metadatas, imgs, targets)
    model: network to evaluate
    configs: namespace with device / distributed / gpu_idx / world_size
    return: average loss over the validation set
    """
    losses = AverageMeter('Loss', ':.4e')
    criterion = Compute_Loss(device=configs.device)
    # switch to evaluate mode (note: an earlier comment here said "train mode",
    # but model.eval() is correct for validation)
    model.eval()
    with torch.no_grad():
        for batch_idx, batch_data in enumerate(tqdm(val_dataloader)):
            metadatas, imgs, targets = batch_data
            batch_size = imgs.size(0)
            # move every target tensor to the configured device
            for k in targets.keys():
                targets[k] = targets[k].to(configs.device, non_blocking=True)
            imgs = imgs.to(configs.device, non_blocking=True).float()
            outputs = model(imgs)
            total_loss, loss_stats = criterion(outputs, targets)
            # For torch.nn.DataParallel case: the loss comes back per-GPU,
            # so collapse it to a scalar before reduction
            if (not configs.distributed) and (configs.gpu_idx is None):
                total_loss = torch.mean(total_loss)
            if configs.distributed:
                # average the loss across all distributed workers
                reduced_loss = reduce_tensor(total_loss.data, configs.world_size)
            else:
                reduced_loss = total_loss.data
            losses.update(to_python_float(reduced_loss), batch_size)
    return losses.avg
def train(train_loader, net, optim, curr_epoch, writer):
    """
    Runs the training loop per epoch.
    train_loader: Data loader for train
    net: the network (returns the loss when called with gts=)
    optim: optimizer
    curr_epoch: current epoch (used to derive the global iteration counter)
    writer: tensorboard writer
    return: None
    """
    net.train()
    train_main_loss = AverageMeter()
    # global iteration counter, continued from previous epochs
    curr_iter = curr_epoch * len(train_loader)
    for i, data in enumerate(train_loader):
        inputs, gts, _img_name = data
        # weight the loss meter by the number of pixels in the batch
        batch_pixel_size = inputs.size(0) * inputs.size(2) * inputs.size(3)
        inputs, gts = inputs.cuda(), gts.cuda()
        optim.zero_grad()
        # the network computes the loss internally when given ground truths
        main_loss = net(inputs, gts=gts)
        if args.apex:
            # distributed path: sum the detached loss across ranks and
            # average by world size, for logging only
            log_main_loss = main_loss.clone().detach_()
            torch.distributed.all_reduce(log_main_loss,
                                         torch.distributed.ReduceOp.SUM)
            log_main_loss = log_main_loss / args.world_size
        else:
            # DataParallel path: loss arrives per-GPU, collapse to scalar
            main_loss = main_loss.mean()
            log_main_loss = main_loss.clone().detach_()
        train_main_loss.update(log_main_loss.item(), batch_pixel_size)
        if args.fp16:
            # mixed precision: scale the loss before backward (apex amp)
            with amp.scale_loss(main_loss, optim) as scaled_loss:
                scaled_loss.backward()
        else:
            main_loss.backward()
        optim.step()
        curr_iter += 1
        # only rank 0 logs and writes tensorboard metrics
        if args.local_rank == 0:
            msg = '[epoch {}], [iter {} / {}], [train main loss {:0.6f}], [lr {:0.6f}]'.format(
                curr_epoch, i + 1, len(train_loader), train_main_loss.avg,
                optim.param_groups[-1]['lr'])
            logging.info(msg)
            # Log tensorboard metrics for each iteration of the training phase
            writer.add_scalar('training/loss', (train_main_loss.val), curr_iter)
            writer.add_scalar('training/lr', optim.param_groups[-1]['lr'],
                              curr_iter)
        # smoke-test mode: bail out after a handful of iterations
        if i > 5 and args.test_mode:
            return
def train(epoch): net.train() # 计算平均损失,每个epoch更新为0 train_loss = AverageMeter() #每次迭代调用 _getitem_ 方法,进行transform变换。 curr_iter = (epoch - 1) * len(trainloader) for i, (inputs, labels) in enumerate(trainloader): if args.cuda: inputs, labels = inputs.cuda(), labels.cuda() N = inputs.size(0) # 清空梯度 optimizer.zero_grad() outputs = net(inputs) # 计算单个样本的loss loss = criterion(outputs, labels) / N # 反向传导,更新参数 loss.backward() optimizer.step() train_loss.update(loss.item(), N) curr_iter += 1 #writer.add_scalar('train_loss', train_loss.avg, curr_iter) #if (i + 1) % args.trainInterval == 0: print('[epoch %d], [iter %d / %d], [train loss %.5f]' % (epoch, i + 1, len(trainloader), train_loss.avg))
def train_single_epoch(model, criterion, optimizer, train_loader, epoch, is_cuda):
    """Train `model` for one epoch and return the average loss.

    model: network to train (switched to train mode here)
    criterion: loss function
    optimizer: optimizer whose step() is called per batch
    train_loader: iterable of (inputs, labels) batches
    epoch: current epoch index (kept for interface compatibility; unused)
    is_cuda: when True, move batches to the GPU
    return: average loss over the epoch (weighted by batch size)
    """
    model.train()  # switch to train mode
    avg_loss = AverageMeter()
    for i, (inputs, labels) in enumerate(train_loader, 0):
        if is_cuda:
            # BUG FIX: the original used `.cuda(async=True)` — `async` is a
            # reserved keyword since Python 3.7 (SyntaxError); the torch
            # keyword is `non_blocking`.
            labels = labels.cuda(non_blocking=True)
            inputs = inputs.cuda(non_blocking=True)
        # zero the parameter gradients
        optimizer.zero_grad()
        # forward + backward + optimize (Variable wrappers are obsolete:
        # plain tensors carry autograd state since PyTorch 0.4)
        output = model(inputs)
        loss = criterion(output, labels)
        loss.backward()
        optimizer.step()
        # BUG FIX: `loss.data[0]` raises on 0-dim tensors in modern PyTorch;
        # `.item()` is the supported scalar accessor.
        avg_loss.update(loss.item(), labels.size(0))
    return avg_loss.avg
def validate(val_loader, net, criterion, optim, curr_epoch, writer):
    """
    Runs the validation loop after each training epoch.
    val_loader: Data loader for validation
    net: the network
    criterion: loss fn
    optim: optimizer (passed through to evaluate_eval for checkpointing)
    curr_epoch: current epoch
    writer: tensorboard writer
    return: val_avg for step function if required
    """
    net.eval()
    val_loss = AverageMeter()
    iou_acc = 0
    dump_images = []
    for val_idx, data in enumerate(val_loader):
        inputs, gt_image, img_names = data
        # sanity-check batch shapes: NCHW inputs, NHW labels
        assert len(inputs.size()) == 4 and len(gt_image.size()) == 3
        assert inputs.size()[2:] == gt_image.size()[1:]
        batch_pixel_size = inputs.size(0) * inputs.size(2) * inputs.size(3)
        inputs, gt_cuda = inputs.cuda(), gt_image.cuda()
        with torch.no_grad():
            output = net(inputs)  # output = (1, 19, 713, 713)
        assert output.size()[2:] == gt_image.size()[1:]
        assert output.size()[1] == args.dataset_cls.num_classes
        # loss meter weighted by pixel count
        val_loss.update(criterion(output, gt_cuda).item(), batch_pixel_size)
        # per-pixel argmax class predictions, moved to CPU
        predictions = output.data.max(1)[1].cpu()
        # Logging
        if val_idx % 20 == 0:
            if args.local_rank == 0:
                logging.info("validating: %d / %d", val_idx + 1,
                             len(val_loader))
        # smoke-test mode: stop early
        if val_idx > 10 and args.test_mode:
            break
        # Image Dumps: keep the first 10 batches for visualization
        if val_idx < 10:
            dump_images.append([gt_image, predictions, img_names])
        # accumulate the confusion matrix for IoU computation
        iou_acc += fast_hist(predictions.numpy().flatten(),
                             gt_image.numpy().flatten(),
                             args.dataset_cls.num_classes)
        del output, val_idx, data
    if args.apex:
        # distributed evaluation: sum confusion matrices across ranks
        iou_acc_tensor = torch.cuda.FloatTensor(iou_acc)
        torch.distributed.all_reduce(iou_acc_tensor,
                                     op=torch.distributed.ReduceOp.SUM)
        iou_acc = iou_acc_tensor.cpu().numpy()
    if args.local_rank == 0:
        # rank 0 computes metrics, dumps images, and checkpoints
        evaluate_eval(args, net, optim, val_loss, iou_acc, dump_images,
                      writer, curr_epoch, args.dataset_cls)
    return val_loss.avg
def validate(val_dataloader, model, configs):
    """Run one validation pass over a voxel dataset and return the average loss.

    val_dataloader: loader yielding (metadatas, targets); metadatas carries
        'img_path', 'voxels', 'coors', 'num_points'
    model: network taking (voxels, coords, num_points)
    configs: namespace with device / distributed / gpu_idx / world_size
    return: average (reduced) loss over the validation set
    """
    losses = AverageMeter('Loss', ':.4e')
    criterion = Compute_Loss(device=configs.device)
    # switch to evaluate mode (the original comment said "train mode")
    model.eval()
    with torch.no_grad():
        for batch_idx, batch_data in enumerate(tqdm(val_dataloader)):
            metadatas, targets = batch_data
            batch_size = len(metadatas['img_path'])
            voxelinput = metadatas['voxels']
            coorinput = metadatas['coors']
            numinput = metadatas['num_points']
            for k in targets.keys():
                targets[k] = targets[k].to(configs.device, non_blocking=True)
            #imgs = imgs.to(configs.device, non_blocking=True).float()
            dtype = torch.float32
            voxelinputr = torch.tensor(voxelinput, dtype=torch.float32,
                                       device=configs.device).to(dtype)
            coorinputr = torch.tensor(coorinput, dtype=torch.int32,
                                      device=configs.device)
            numinputr = torch.tensor(numinput, dtype=torch.int32,
                                     device=configs.device)
            try:
                outputs = model(voxelinputr, coorinputr, numinputr)
            except RuntimeError as exception:
                if "out of memory" in str(exception):
                    print("WARNING: out of memory")
                    if hasattr(torch.cuda, 'empty_cache'):
                        torch.cuda.empty_cache()
                    # BUG FIX: the original fell through after an OOM and then
                    # used the never-assigned `outputs` (NameError). Skip this
                    # batch instead.
                    continue
                else:
                    raise exception
            total_loss, loss_stats = criterion(outputs, targets)
            # For torch.nn.DataParallel case: collapse per-GPU losses
            if (not configs.distributed) and (configs.gpu_idx is None):
                total_loss = torch.mean(total_loss)
            if configs.distributed:
                # average the loss across all distributed workers
                reduced_loss = reduce_tensor(total_loss.data, configs.world_size)
            else:
                reduced_loss = total_loss.data
            losses.update(to_python_float(reduced_loss), batch_size)
    return losses.avg
def train(model, train_dataset, criterion, optimizer, epoch, device, args):
    """Train for one epoch with gradient accumulation over ITER_SIZE samples.

    Each optimizer step accumulates gradients from ITER_SIZE single samples
    (loss is divided by ITER_SIZE so the accumulated gradient matches a
    batch average). Saves a checkpoint and the per-batch loss history at
    the end of the epoch.
    """
    BATCH_SIZE = args.batch_size
    ITER_SIZE = args.iter_size
    TOTAL_TRAIN_DATA = train_dataset.len
    NUM_PTS = args.num_pts
    NUM_BATCH = int(np.ceil((TOTAL_TRAIN_DATA / (BATCH_SIZE * ITER_SIZE))))
    data_idx = 0
    model = model.train()
    losses = AverageMeter()
    tot_loss = []
    fastprint("Training... ")
    for batch_idx in range(NUM_BATCH):
        loss_sum = 0
        optimizer.zero_grad()
        # accumulate gradients over ITER_SIZE single-sample forward passes
        for _iter in range(ITER_SIZE):
            data = train_dataset.getitem(data_idx)
            points, label, indptr, indices = data['data'], \
                                             data['label'], \
                                             data['indptr'], \
                                             data['indices']
            points, label, indptr, indices = torch.from_numpy(points), \
                                             torch.from_numpy(label.reshape(-1)), \
                                             torch.from_numpy(indptr), \
                                             torch.from_numpy(indices)
            points, label, indptr, indices = points.view(NUM_PTS, -1), \
                                             label.view(-1), \
                                             indptr.view(-1), \
                                             indices.view(-1)
            points, label, indptr, indices = Variable(points).float(), \
                                             Variable(label).type(torch.LongTensor), \
                                             indptr, indices
            points, label, indptr, indices = points.to(device), \
                                             label.to(device), \
                                             indptr.to(device), \
                                             indices.to(device)
            pred = model(points, indptr, indices)
            # scale by ITER_SIZE so accumulated gradients average correctly
            loss = criterion(pred, label) / ITER_SIZE
            loss.backward()
            loss_sum += loss.item()
            data_idx += 1
            losses.update(loss.item(), label.size(0))
        # one optimizer step per accumulated "virtual batch"
        optimizer.step()
        tot_loss.append(loss_sum)
        fastprint('[%d: %d/%d] train loss: %f' %
                  (epoch, batch_idx, NUM_BATCH, loss_sum))
    # checkpoint the model and dump the loss curve for this epoch
    torch.save(model.state_dict(), '%s/cls_model_%d.pth' % (args.outf, epoch))
    np.savez(os.path.join(args.outf, 'TrainLoss_epoch_{}.npz'.format(epoch)),
             loss=tot_loss)
def evaluate_one_epoch(val_loader, model, epoch, configs, logger):
    """Evaluate detection quality for one epoch.

    Runs inference with NMS post-processing, accumulates per-batch
    TP/confidence/class statistics, and computes per-class AP.
    return: (precision, recall, AP, f1, ap_class)
    """
    batch_time = AverageMeter('Time', ':6.3f')
    data_time = AverageMeter('Data', ':6.3f')
    # thresholds for post-processing and matching (hard-coded here)
    conf_thresh = 0.5
    nms_thresh = 0.5
    iou_threshold = 0.5
    progress = ProgressMeter(len(val_loader), [batch_time, data_time],
                             prefix="Evaluate - Epoch: [{}/{}]".format(
                                 epoch, configs.num_epochs))
    labels = []
    sample_metrics = []  # List of tuples (TP, confs, pred)
    # switch to evaluate mode
    model.eval()
    with torch.no_grad():
        start_time = time.time()
        for batch_idx, batch_data in enumerate(tqdm(val_loader)):
            data_time.update(time.time() - start_time)
            _, imgs, targets = batch_data
            # Extract labels (class ids are in column 1 of the target rows)
            labels += targets[:, 1].tolist()
            # Rescale target: box coords are normalized, scale to pixels
            targets[:, 2:] *= configs.img_size
            imgs = imgs.to(configs.device, non_blocking=True)
            outputs = model(imgs)
            outputs = post_processing(outputs, conf_thresh=conf_thresh,
                                      nms_thresh=nms_thresh)
            sample_metrics += get_batch_statistics_rotated_bbox(
                outputs, targets, iou_threshold=iou_threshold)
            # measure elapsed time
            # torch.cuda.synchronize()
            batch_time.update(time.time() - start_time)
            # Log message
            if logger is not None:
                if ((batch_idx + 1) % configs.print_freq) == 0:
                    logger.info(progress.get_message(batch_idx))
            start_time = time.time()
    # Concatenate sample statistics across all batches
    true_positives, pred_scores, pred_labels = [
        np.concatenate(x, 0) for x in list(zip(*sample_metrics))
    ]
    precision, recall, AP, f1, ap_class = ap_per_class(
        true_positives, pred_scores, pred_labels, labels)
    return precision, recall, AP, f1, ap_class
def train(train_dataloader, model, criterion, optimizer, epoch):
    """Train `model` for one epoch, tracking timing, loss, and top-1/5
    accuracy, then checkpoint the weights.

    Uses module-level `device`, `accuracy`, and `opt`.
    """
    batch_time, data_time = AverageMeter(), AverageMeter()
    losses, top1, top5 = AverageMeter(), AverageMeter(), AverageMeter()
    model.train()  # switch to train mode
    end = time.time()
    for i, (inputs, targets) in enumerate(train_dataloader):
        # measure data loading time
        data_time.update(time.time() - end)
        inputs, targets = inputs.to(device), targets.to(device)
        batch_size = inputs.size(0)
        # forward pass and loss
        output = model(inputs)
        loss = criterion(output, targets)
        # record loss and top-k accuracy before the update
        prec1, prec5 = accuracy(output, targets, topk=(1, 5))
        losses.update(loss.item(), batch_size)
        top1.update(prec1, batch_size)
        top5.update(prec5, batch_size)
        # standard zero-grad / backward / step sequence
        optimizer.zero_grad()
        loss.backward()
        optimizer.step()
        # measure elapsed time for the whole batch
        batch_time.update(time.time() - end)
        end = time.time()
        if i % 5 == 0:
            print(
                f"Epoch [{epoch + 1}] [{i}/{len(train_dataloader)}]\t"
                f"Time {data_time.val:.3f} ({data_time.avg:.3f})\t"
                f"Loss {loss.item():.4f}\t"
                f"Prec@1 {top1.val:.3f} ({top1.avg:.3f})\t"
                f"Prec@5 {top5.val:.3f} ({top5.avg:.3f})", end="\r")
    # persist the epoch checkpoint
    torch.save(model.state_dict(),
               f"./checkpoints/{opt.datasets}_epoch_{epoch + 1}.pth")
def training(self, epoch, prefix='Train', evaluation=False):
    """Train for one (capped) epoch; optionally score predictions too.

    epoch: current epoch index (for logging)
    prefix: tensorboard tag prefix
    evaluation: when True, also feed predictions into self.evaluator and
        report pixel accuracy / mIoU at the end of the epoch
    """
    self.model.train()
    if evaluation:
        self.evaluator.reset()
    train_losses = AverageMeter()
    # cap the number of iterations per epoch; i counts from 0
    tbar = tqdm(self.train_dataloader, desc='\r', total=self.iters_per_epoch)
    if self.writer:
        self.writer.add_scalar(f'{prefix}/learning_rate',
                               get_learning_rate(self.optimizer), epoch)
    for i, sample in enumerate(tbar):
        image, target = sample['img'], sample['target']
        image, target = image.to(self.device), target.to(self.device)
        if self.args.optimizer == 'SGD':
            # per-iteration LR schedule (only for SGD)
            self.lr_scheduler(self.optimizer, i, epoch)  # each iteration
        output = self.model(image)
        loss = self.criterion(output, target)  # multiple output loss
        self.optimizer.zero_grad()
        loss.backward()
        self.optimizer.step()
        train_losses.update(loss.item())
        tbar.set_description('Epoch {}, Train loss: {:.3f}'.format(
            epoch, train_losses.avg))
        if evaluation:
            # score the last head's output, upsampled to label resolution
            output = F.interpolate(output[-1],
                                   size=(target.size(1), target.size(2)),
                                   mode='bilinear', align_corners=True)
            pred = torch.argmax(output, dim=1)
            self.evaluator.add_batch(target.cpu().numpy(),
                                     pred.cpu().numpy())  # B,H,W
        # even though tqdm was given `total`, we must break out manually
        if i == self.iters_per_epoch - 1:
            break
    if self.writer:
        self.writer.add_scalar(f'{prefix}/loss', train_losses.val, epoch)
    if evaluation:
        Acc = self.evaluator.Pixel_Accuracy()
        mIoU = self.evaluator.Mean_Intersection_over_Union()
        print('Epoch: {}, Acc_pixel:{:.3f}, mIoU:{:.3f}'.format(
            epoch, Acc, mIoU))
        self.writer.add_scalars(f'{prefix}/IoU', {
            'mIoU': mIoU,
            # 'mDice': mDice,
        }, epoch)
        self.writer.add_scalars(f'{prefix}/Acc', {
            'acc_pixel': Acc,
            # 'acc_class': Acc_class
        }, epoch)
def train(train_loader, net, criterion, optim, curr_epoch, scheduler, max_iter):
    """
    Runs the training loop per epoch, stopping at max_iter global steps.
    train_loader: Data loader for train
    net: the network
    criterion: loss function
    optim: optimizer (stepped every batch, followed by the scheduler)
    curr_epoch: current epoch (seed for the global iteration counter)
    scheduler: LR scheduler stepped per iteration
    max_iter: global iteration budget
    return: the updated global iteration counter
    """
    net.train()
    train_total_loss = AverageMeter()
    time_meter = AverageMeter()
    curr_iter = curr_epoch * len(train_loader)
    for i, data in enumerate(train_loader):
        # respect the global iteration budget across epochs
        if curr_iter >= max_iter:
            break
        start_ts = time.time()
        inputs, gts = data
        # meter weight: number of pixels in the batch
        batch_pixel_size = inputs.size(0) * inputs.size(2) * inputs.size(3)
        inputs, gts = inputs.cuda(), gts.cuda()
        optim.zero_grad()
        outputs = net(inputs)
        total_loss = criterion(outputs, gts)
        # detached copy for logging so the meter never holds the graph
        log_total_loss = total_loss.clone().detach_()
        train_total_loss.update(log_total_loss.item(), batch_pixel_size)
        total_loss.backward()
        optim.step()
        # per-iteration LR schedule
        scheduler.step()
        time_meter.update(time.time() - start_ts)
        del total_loss
        curr_iter += 1
        # log every 50 iterations, then reset the windowed meters
        if i % 50 == 49:
            msg = '[epoch {}], [iter {} / {} : {}], [loss {:0.6f}], [lr {:0.6f}], [time {:0.4f}]'.format(
                curr_epoch, i + 1, len(train_loader), curr_iter,
                train_total_loss.avg, optim.param_groups[-1]['lr'],
                time_meter.avg / args.batch_size)
            logging.info(msg)
            train_total_loss.reset()
            time_meter.reset()
    return curr_iter
def test(model, test_dataset, criterion, epoch, device, args):
    """Evaluate `model` over the whole test set, one sample at a time.

    Prints the average loss and accuracy, and returns the accuracy
    percentage. `epoch` is accepted for interface symmetry with train().
    """
    fastprint('Evaluation ... ')
    TOTAL_TEST_DATA = test_dataset.len
    NUM_PTS = args.num_pts
    test_loss = 0.0
    correct = 0.0
    losses = AverageMeter()
    model = model.eval()
    with torch.no_grad():
        for idx in range(TOTAL_TEST_DATA):
            sample = test_dataset.getitem(idx)
            # numpy -> torch, with the shapes the model expects
            points = torch.from_numpy(sample['data']).view(NUM_PTS, -1)
            label = torch.from_numpy(sample['label'].reshape(-1)).view(-1)
            indptr = torch.from_numpy(sample['indptr']).view(-1)
            indices = torch.from_numpy(sample['indices']).view(-1)
            points = Variable(points).float().to(device)
            label = Variable(label).type(torch.LongTensor).to(device)
            indptr = indptr.to(device)
            indices = indices.to(device)
            logits = model(points, indptr, indices)
            loss = criterion(logits, label)
            # index of the max log-probability = predicted class
            predicted = logits.argmax(dim=1, keepdim=True)
            test_loss += loss.item()
            losses.update(loss.item(), label.size(0))
            correct += predicted.eq(label.view_as(predicted)).sum().item()
    test_loss /= float(TOTAL_TEST_DATA)
    acc = 100. * correct / float(TOTAL_TEST_DATA)
    fastprint('Average loss: {:.4f}, Accuracy: {}/{} ({:.0f}%)\n'.format(
        test_loss, correct, TOTAL_TEST_DATA, acc))
    return acc
def validate(val_loader, net, criterion, optim, scheduler, curr_epoch, curr_iter):
    """
    Runs the validation loop after each training epoch.
    val_loader: Data loader for validation
    net: the network
    criterion: loss fn
    optim: optimizer (passed through to evaluate_eval)
    scheduler: LR scheduler (passed through to evaluate_eval)
    curr_epoch: current epoch
    curr_iter: current global iteration
    return: (val_avg, per-class IoU) for step function if required
    """
    net.eval()
    val_loss = AverageMeter()
    iou_acc = 0
    error_acc = 0
    for val_idx, data in enumerate(val_loader):
        # FIX: the original read `inputs, gts = data = data`, a pointless
        # chained self-assignment; plain unpacking is equivalent.
        inputs, gts = data
        # sanity-check batch shapes: NCHW inputs, NHW labels
        assert len(inputs.size()) == 4 and len(gts.size()) == 3
        assert inputs.size()[2:] == gts.size()[1:]
        batch_pixel_size = inputs.size(0) * inputs.size(2) * inputs.size(3)
        inputs, gts = inputs.cuda(), gts.cuda()
        with torch.no_grad():
            output = net(inputs)
        del inputs
        assert output.size()[2:] == gts.size()[1:]
        assert output.size()[1] == args.num_classes
        val_loss.update(criterion(output, gts).item(), batch_pixel_size)
        # per-pixel argmax predictions on CPU
        predictions = output.data.max(1)[1].cpu()
        # Logging
        if val_idx % 20 == 0:
            logging.info("validating: %d / %d", val_idx + 1, len(val_loader))
        # accumulate the confusion matrix for IoU
        iou_acc += fast_hist(predictions.numpy().flatten(),
                             gts.cpu().numpy().flatten(),
                             args.num_classes)
        # free per-batch tensors early (val_idx/data are rebound next loop)
        del gts, output, val_idx, data
    per_cls_iou = evaluate_eval(args, net, optim, scheduler, val_loss,
                                iou_acc, curr_epoch, args.dataset, curr_iter)
    return val_loss.avg, per_cls_iou
def train_one_epoch(train_dataloader, model, optimizer, lr_scheduler, epoch,
                    configs, logger, tb_writer):
    """Train for one epoch with gradient accumulation over `subdivisions`.

    epoch is 1-based (global_step starts at num_iters_per_epoch*(epoch-1)+1).
    optimizer.step()/zero_grad() run only every configs.subdivisions steps,
    so gradients accumulate across that many batches.
    """
    batch_time = AverageMeter('Time', ':6.3f')
    data_time = AverageMeter('Data', ':6.3f')
    losses = AverageMeter('Loss', ':.4e')
    progress = ProgressMeter(len(train_dataloader),
                             [batch_time, data_time, losses],
                             prefix="Train - Epoch: [{}/{}]".format(
                                 epoch, configs.num_epochs))
    criterion = Compute_Loss(device=configs.device)
    num_iters_per_epoch = len(train_dataloader)
    # switch to train mode
    model.train()
    start_time = time.time()
    for batch_idx, batch_data in enumerate(tqdm(train_dataloader)):
        data_time.update(time.time() - start_time)
        metadatas, imgs, targets = batch_data
        batch_size = imgs.size(0)
        global_step = num_iters_per_epoch * (epoch - 1) + batch_idx + 1
        for k in targets.keys():
            targets[k] = targets[k].to(configs.device, non_blocking=True)
        imgs = imgs.to(configs.device, non_blocking=True).float()
        outputs = model(imgs)
        total_loss, loss_stats = criterion(outputs, targets)
        # For torch.nn.DataParallel case: collapse per-GPU losses
        if (not configs.distributed) and (configs.gpu_idx is None):
            total_loss = torch.mean(total_loss)
        # compute gradient and perform backpropagation; gradients accumulate
        # until the next subdivision boundary
        total_loss.backward()
        if global_step % configs.subdivisions == 0:
            optimizer.step()
            # zero the parameter gradients
            optimizer.zero_grad()
            # ######################### Sersy #########################################
            # Adjust learning rate
            # if configs.step_lr_in_epoch:
            #     lr_scheduler.step()
            #     if tb_writer is not None:
            #         tb_writer.add_scalar('LR', lr_scheduler.get_lr()[0], global_step)
        if configs.distributed:
            # average the loss across distributed workers for logging
            reduced_loss = reduce_tensor(total_loss.data, configs.world_size)
        else:
            reduced_loss = total_loss.data
        losses.update(to_python_float(reduced_loss), batch_size)
        # measure elapsed time
        # torch.cuda.synchronize()
        batch_time.update(time.time() - start_time)
        if tb_writer is not None:
            if (global_step % configs.tensorboard_freq) == 0:
                loss_stats['avg_loss'] = losses.avg
                tb_writer.add_scalars('Train', loss_stats, global_step)
        # Log message
        if logger is not None:
            if (global_step % configs.print_freq) == 0:
                logger.info(progress.get_message(batch_idx))
        start_time = time.time()
def train(train_loader, model, criterion, optimizer, args):
    """Train a text-detection model (text map + kernel maps) for one epoch.

    return: (avg loss, text acc, kernel acc, text IoU, kernel IoU)
    """
    model.train()
    batch_time = AverageMeter()
    data_time = AverageMeter()
    losses = AverageMeter()
    # running confusion-matrix scorers for text and kernel maps (2 classes)
    running_metric_text = runningScore(2)
    running_metric_kernel = runningScore(2)
    end = time.time()
    for batch_idx, (imgs, gt_texts, gt_kernels,
                    training_masks) in enumerate(train_loader):
        data_time.update(time.time() - end)
        imgs = Variable(imgs.cuda())
        gt_texts = Variable(gt_texts.cuda())
        gt_kernels = Variable(gt_kernels.cuda())
        training_masks = Variable(training_masks.cuda())
        outputs = model(imgs)
        # channel 0 is the full-text map; the rest are shrunk kernel maps
        texts = outputs[:, 0, :, :]
        kernels = outputs[:, 1:, :, :]
        loss = criterion(texts, gt_texts, kernels, gt_kernels, training_masks)
        losses.update(loss.item(), imgs.size(0))
        optimizer.zero_grad()
        loss.backward()
        # sparsity regularization on BN weights, when enabled
        if (args.sr_lr is not None):
            updateBN(model, args)
        optimizer.step()
        score_text = cal_text_score(texts, gt_texts, training_masks,
                                    running_metric_text)
        score_kernel = cal_kernel_score(kernels, gt_kernels, gt_texts,
                                        training_masks, running_metric_kernel)
        batch_time.update(time.time() - end)
        end = time.time()
        if batch_idx % 20 == 0:
            output_log = '({batch}/{size}) Batch: {bt:.3f}s | TOTAL: {total:.0f}min | ETA: {eta:.0f}min | Loss: {loss:.4f} | Acc_t: {acc: .4f} | IOU_t: {iou_t: .4f} | IOU_k: {iou_k: .4f}'.format(
                batch=batch_idx + 1,
                size=len(train_loader),
                bt=batch_time.avg,
                total=batch_time.avg * batch_idx / 60.0,
                eta=batch_time.avg * (len(train_loader) - batch_idx) / 60.0,
                loss=losses.avg,
                acc=score_text['Mean Acc'],
                iou_t=score_text['Mean IoU'],
                iou_k=score_kernel['Mean IoU'])
            print(output_log)
            sys.stdout.flush()
    return (losses.avg, score_text['Mean Acc'], score_kernel['Mean Acc'],
            score_text['Mean IoU'], score_kernel['Mean IoU'])
def test(self, epoch):
    """Evaluate the model on the test set and log metrics to tensorboard.

    epoch: current epoch (metrics are logged at epoch + 1)
    return: average top-1 accuracy
    """
    batch_time = AverageMeter('Time', ':6.3f')
    losses = AverageMeter('Loss', ':.4e')
    top1 = AverageMeter('Acc@1', ':6.2f')
    top5 = AverageMeter('Acc@5', ':6.2f')
    progress = ProgressMeter(len(self.test_loader),
                             [batch_time, losses, top1, top5],
                             prefix='Test: ')
    # switch to test mode
    self.model.eval()
    with torch.no_grad():
        end = time.time()
        for i, (images, target) in enumerate(self.test_loader):
            images = images.cuda()
            target = target.cuda()
            # compute output (the model returns a secondary output we ignore)
            output, _ = self.model(images)
            loss = self.criterion(output, target)
            # measure accuracy and record loss
            acc1, acc5 = accuracy(output, target, topk=(1, 5))
            losses.update(loss.item(), images.size(0))
            top1.update(acc1[0], images.size(0))
            top5.update(acc5[0], images.size(0))
            # measure elapsed time
            batch_time.update(time.time() - end)
            end = time.time()
            # only rank 0 prints progress
            if i % self.args.print_freq == 0 and self.args.local_rank == 0:
                progress.display(i)
    if self.args.local_rank == 0:
        print(' * Acc@1 {top1.avg:.3f} Acc@5 {top5.avg:.3f}'.format(
            top1=top1, top5=top5))
        self.writer.add_scalar('Test/Avg_Loss', losses.avg, epoch + 1)
        self.writer.add_scalar('Test/Avg_Top1', top1.avg, epoch + 1)
        self.writer.add_scalar('Test/Avg_Top5', top5.avg, epoch + 1)
        # extra graph-structure summaries specific to this model
        self.summary_graph_adj(self.writer, epoch + 1)
        self.summary_graph_histogram(self.writer, epoch + 1)
    return top1.avg
def train_one_epoch(train_loader, model, optimizer, epoch, configs, logger):
    """Train the ball-detection/segmentation model for one epoch.

    The model itself computes and returns the total loss; this loop only
    drives the optimizer and meters. return: average loss for the epoch.
    """
    batch_time = AverageMeter('Time', ':6.3f')
    data_time = AverageMeter('Data', ':6.3f')
    losses = AverageMeter('Loss', ':.4e')
    progress = ProgressMeter(len(train_loader),
                             [batch_time, data_time, losses],
                             prefix="Train - Epoch: [{}]".format(epoch))
    # switch to train mode
    model.train()
    start_time = time.time()
    for batch_idx, (origin_imgs, resized_imgs, org_ball_pos_xy,
                    global_ball_pos_xy, event_class,
                    target_seg) in enumerate(tqdm(train_loader)):
        data_time.update(time.time() - start_time)
        batch_size = resized_imgs.size(0)
        target_seg = target_seg.to(configs.device, non_blocking=True)
        resized_imgs = resized_imgs.to(configs.device,
                                       non_blocking=True).float()
        # Only move origin_imgs to cuda if the model has a local stage
        # for ball detection
        if not configs.no_local:
            origin_imgs = origin_imgs.to(configs.device,
                                         non_blocking=True).float()
            # compute output (the model returns its own total loss)
            pred_ball_global, pred_ball_local, pred_events, pred_seg, local_ball_pos_xy, total_loss, _ = model(
                origin_imgs, resized_imgs, org_ball_pos_xy,
                global_ball_pos_xy, event_class, target_seg)
        else:
            pred_ball_global, pred_ball_local, pred_events, pred_seg, local_ball_pos_xy, total_loss, _ = model(
                None, resized_imgs, org_ball_pos_xy, global_ball_pos_xy,
                event_class, target_seg)
        # For torch.nn.DataParallel case: collapse per-GPU losses
        if (not configs.distributed) and (configs.gpu_idx is None):
            total_loss = torch.mean(total_loss)
        # zero the parameter gradients
        optimizer.zero_grad()
        # compute gradient and perform backpropagation
        total_loss.backward()
        optimizer.step()
        losses.update(total_loss.item(), batch_size)
        # measure elapsed time
        batch_time.update(time.time() - start_time)
        # Log message
        if logger is not None:
            if ((batch_idx + 1) % configs.print_freq) == 0:
                logger.info(progress.get_message(batch_idx))
        start_time = time.time()
    return losses.avg
def train_one_epoch(train_loader, model, optimizer, epoch, configs, logger):
    """Train the (global-stage-only) ball model for one epoch.

    The model computes its own total loss; supports distributed training
    via reduce_tensor. return: average (reduced) loss for the epoch.
    """
    batch_time = AverageMeter('Time', ':6.3f')
    data_time = AverageMeter('Data', ':6.3f')
    losses = AverageMeter('Loss', ':.4e')
    progress = ProgressMeter(len(train_loader),
                             [batch_time, data_time, losses],
                             prefix="Train - Epoch: [{}/{}]".format(
                                 epoch, configs.num_epochs))
    # switch to train mode
    model.train()
    start_time = time.time()
    for batch_idx, (resized_imgs, org_ball_pos_xy, global_ball_pos_xy,
                    target_events, target_seg) in enumerate(tqdm(train_loader)):
        data_time.update(time.time() - start_time)
        batch_size = resized_imgs.size(0)
        target_seg = target_seg.to(configs.device, non_blocking=True)
        resized_imgs = resized_imgs.to(configs.device,
                                       non_blocking=True).float()
        # the model returns predictions plus its own total loss
        pred_ball_global, pred_ball_local, pred_events, pred_seg, local_ball_pos_xy, total_loss, _ = model(
            resized_imgs, org_ball_pos_xy, global_ball_pos_xy, target_events,
            target_seg)
        # For torch.nn.DataParallel case: collapse per-GPU losses
        if (not configs.distributed) and (configs.gpu_idx is None):
            total_loss = torch.mean(total_loss)
        # zero the parameter gradients
        optimizer.zero_grad()
        # compute gradient and perform backpropagation
        total_loss.backward()
        optimizer.step()
        if configs.distributed:
            # average the loss across workers for logging
            reduced_loss = reduce_tensor(total_loss.data, configs.world_size)
        else:
            reduced_loss = total_loss.data
        losses.update(to_python_float(reduced_loss), batch_size)
        # measure elapsed time (synchronize so GPU work is included)
        torch.cuda.synchronize()
        batch_time.update(time.time() - start_time)
        # Log message
        if logger is not None:
            if ((batch_idx + 1) % configs.print_freq) == 0:
                logger.info(progress.get_message(batch_idx))
        start_time = time.time()
    return losses.avg
# --- script-level training pass for one epoch ---
# NOTE(review): `net`, `trainloader`, `device`, `optimizer`, `criterion`,
# `accuracy`, the meters (`losses`, `top1`, `top5`) and `epoch` are defined
# elsewhere in this file; the evaluation phase started at the bottom
# presumably continues below this fragment.
net.train()
for i, data in enumerate(trainloader, 0):
    # get the inputs; data is a list of [inputs, labels]
    inputs, labels = data[0].to(device), data[1].to(device)
    # zero the parameter gradients
    optimizer.zero_grad()
    # forward + backward + optimize
    outputs = net(inputs)
    loss = criterion(outputs, labels)
    # measure accuracy and record loss
    prec1, prec5 = accuracy(outputs, labels, topk=(1, 5))
    losses.update(loss.item(), inputs.size(0))
    top1.update(prec1, inputs.size(0))
    top5.update(prec5, inputs.size(0))
    loss.backward()
    optimizer.step()
    # print statistics
    if i % 5 == 0:
        print(f"Epoch [{epoch + 1}] [{i}/{len(trainloader)}]\t"
              f"Loss {loss.item():.4f}\t"
              f"Prec@1 {top1.val:.3f} ({top1.avg:.3f})\t"
              f"Prec@5 {top5.val:.3f} ({top5.avg:.3f})")
# switch to evaluation mode and reset the correct-prediction counter
net.eval()
correct = 0
def train_epoch(self, epoch_num):
    """Train the room/corner model for one (iteration-capped) epoch.

    Builds the multi-channel input (image, mean normal, room mask, corner
    map(s)) per the configured mode, optimizes the model, and tracks
    loss / binary-prediction accuracy.
    """
    batch_time = AverageMeter()
    losses = AverageMeter()
    acces = AverageMeter()
    self.model.train()
    end = time.time()
    for iter_i, batch_data in enumerate(self.train_loader):
        image_inputs = batch_data['image']
        mean_normal = batch_data['mean_normal']
        room_mask = batch_data['room_mask']
        # mode selects between a single corner map and stacked
        # corners+edge maps
        if self.configs.mode == 'room_corner':
            corner_map = batch_data['corner_map']
        else:
            corner_map = torch.stack(
                [batch_data['corners_map'], batch_data['edge_map']], 1)
        label = batch_data['label']
        if self.configs.use_cuda:
            image_inputs = image_inputs.cuda()
            mean_normal = mean_normal.cuda()
            room_mask = room_mask.cuda()
            corner_map = corner_map.cuda()
            label = label.cuda()
        if self.configs.mode == 'room_corner':
            # give the single corner map an explicit channel dimension
            corner_map = corner_map.unsqueeze(1)
        # concatenate all cues along the channel axis
        inputs = torch.cat([
            image_inputs.unsqueeze(1), mean_normal,
            room_mask.unsqueeze(1), corner_map
        ], dim=1)
        logits, preds = self.model(inputs)
        loss = self.criterion(logits, label)
        losses.update(loss.data, image_inputs.size(0))
        self.optimizer.zero_grad()
        loss.backward()
        self.optimizer.step()
        # accuracy: corner_corner mode scores all outputs, otherwise
        # only the first channel
        if self.configs.mode == 'corner_corner':
            acc = binary_pred_accuracy(preds.detach().cpu().numpy(),
                                       label.cpu().numpy())
        else:
            acc = binary_pred_accuracy(preds.detach().cpu().numpy()[:, 0],
                                       label.cpu().numpy()[:, 0])
        acces.update(acc, image_inputs.size(0))
        # measure elapsed time
        batch_time.update(time.time() - end)
        end = time.time()
        print('Epoch: [{0}][{1}/{2}]\t'
              'Time {batch_time.val:.3f} ({batch_time.avg:.3f})\t'
              'Corner pred Loss {loss.val:.4f} ({loss.avg:.4f})\t'
              'Acc {acc.val:.3f} ({acc.avg:.3f})'.format(
                  epoch_num, iter_i, len(self.train_loader),
                  batch_time=batch_time, loss=losses, acc=acces))
        # cap the number of iterations per epoch
        if iter_i > self.configs.max_iter_per_epoch:
            break
def train():
    """Build/restore an AlexNet classifier and run the full training loop.

    Uses module-level `opt`, `MODEL_PATH`, `device`, `train_dataloader`,
    `accuracy`, `AverageMeter`. Saves weights to MODEL_PATH after every
    epoch.
    """
    try:
        os.makedirs(opt.checkpoints_dir)
    except OSError:
        # directory already exists — that is fine
        pass
    if torch.cuda.device_count() > 1:
        model = torch.nn.parallel.DataParallel(
            AlexNet(num_classes=opt.num_classes))
    else:
        model = AlexNet(num_classes=opt.num_classes)
    # resume from a previous checkpoint when one exists
    if os.path.exists(MODEL_PATH):
        model.load_state_dict(
            torch.load(MODEL_PATH, map_location=lambda storage, loc: storage))
    model.to(device)
    ################################################
    # Set loss function and Adam optimizer
    ################################################
    criterion = torch.nn.CrossEntropyLoss()
    optimizer = optim.Adam(model.parameters(), lr=opt.lr)
    for epoch in range(opt.epochs):
        # train for one epoch
        print(f"\nBegin Training Epoch {epoch + 1}")
        # Calculate and return the top-k accuracy of the model
        # so that we can track the learning process.
        losses = AverageMeter()
        top1 = AverageMeter()
        top5 = AverageMeter()
        for i, data in enumerate(train_dataloader):
            # get the inputs; data is a list of [inputs, labels]
            inputs, targets = data
            inputs = inputs.to(device)
            targets = targets.to(device)
            # compute output
            output = model(inputs)
            loss = criterion(output, targets)
            # measure accuracy and record loss
            # NOTE(review): topk=(1, 2) but the second meter is printed as
            # "Prec@5" — possibly deliberate for a small class count; confirm.
            prec1, prec5 = accuracy(output, targets, topk=(1, 2))
            losses.update(loss.item(), inputs.size(0))
            top1.update(prec1, inputs.size(0))
            top5.update(prec5, inputs.size(0))
            # compute gradients in a backward pass
            optimizer.zero_grad()
            loss.backward()
            # Call step of optimizer to update model params
            optimizer.step()
            print(
                f"Epoch [{epoch + 1}] [{i + 1}/{len(train_dataloader)}]\t"
                f"Loss {loss.item():.4f}\t"
                f"Prec@1 {top1.val:.3f} ({top1.avg:.3f})\t"
                f"Prec@5 {top5.val:.3f} ({top5.avg:.3f})", end="\r")
        # save model file
        torch.save(model.state_dict(), MODEL_PATH)
def validate(epoch):
    """Validate the global `net` on `valloader`, checkpoint on a new best
    mean-IoU, and optionally dump sample images to disk.

    Uses module-level `net`, `valloader`, `args`, `criterion`,
    `best_record`, `ckpt_path`, `exp_name`, `num_classes`,
    `restore_transform`, `colorize_mask`. return: average validation loss.
    """
    net.eval()
    val_loss = AverageMeter()
    inputs_all, labels_all, predictions_all = [], [], []
    for i, (inputs, labels) in enumerate(valloader):
        if args.cuda:
            inputs, labels = inputs.cuda(), labels.cuda()
        N = inputs.size(0)
        outputs = net(inputs)
        # predictions: per-pixel class labels at the input image resolution
        predictions = outputs.data.max(1)[1].squeeze_(1).squeeze_(
            0).cpu().numpy()
        loss = criterion(outputs, labels) / N
        val_loss.update(loss.item(), N)
        # randomly subsample which inputs to keep for image dumping;
        # a None placeholder keeps the three lists index-aligned
        if random.random() > args.valImgSampleRate:
            inputs_all.append(None)
        else:
            inputs_all.append(inputs.data.squeeze_(0).cpu())
        labels_all.append(labels.data.squeeze_(0).cpu().numpy())
        predictions_all.append(predictions)
    # compute accuracy and related validation metrics for this epoch
    acc, acc_cls, mean_iu, fwavacc = evaluate(predictions_all, labels_all,
                                              num_classes)
    # new best mean-IoU: update the record, checkpoint, and dump images
    if mean_iu > best_record['mean_iu']:
        best_record['val_loss'] = val_loss.avg
        best_record['epoch'] = epoch
        best_record['acc'] = acc
        best_record['acc_cls'] = acc_cls
        best_record['mean_iu'] = mean_iu
        best_record['fwavacc'] = fwavacc
        snapshot_name = 'epoch_%d_loss_%.5f_acc_%.5f_acc-cls_%.5f_mean-iu_%.5f_fwavacc_%.5f_lr_%.10f' % (
            epoch, val_loss.avg, acc, acc_cls, mean_iu, fwavacc, args.lr)
        torch.save(net.state_dict(),
                   os.path.join(ckpt_path, exp_name, snapshot_name + '.pth'))
        #torch.save(optimizer.state_dict(), os.path.join(ckpt_path, exp_name, 'opt_' + snapshot_name + '.pth'))
        if args.val_save_to_img_file:
            to_save_dir = os.path.join(ckpt_path, exp_name, str(epoch))
            check_mkdir(to_save_dir)
        #val_visual = []
        for idx, data in enumerate(zip(inputs_all, labels_all,
                                       predictions_all)):
            # skip samples that were not kept for dumping
            if data[0] is None:
                continue
            input_pil = restore_transform(data[0])
            labels_pil = colorize_mask(data[1])
            predictions_pil = colorize_mask(data[2])
            if args.val_save_to_img_file:
                input_pil.save(os.path.join(to_save_dir, '%d_input.png' % idx))
                predictions_pil.save(
                    os.path.join(to_save_dir, '%d_prediction.png' % idx))
                labels_pil.save(os.path.join(to_save_dir,
                                             '%d_label.png' % idx))
            # val_visual.extend([visualize(input_pil.convert('RGB')),
            #                    visualize(labels_pil.convert('RGB')),
            #                    visualize(predictions_pil.convert('RGB'))])
        # val_visual = torch.stack(val_visual, 0)
        # val_visual = vutils.make_grid(val_visual, nrow=3, padding=5)
        # writer.add_image(snapshot_name, val_visual)
    print(
        '--------------------------------------------------------------------')
    print(
        '[epoch %d], [val loss %.5f], [acc %.5f], [acc_cls %.5f], [mean_iu %.5f], [fwavacc %.5f]'
        % (epoch, val_loss.avg, acc, acc_cls, mean_iu, fwavacc))
    print(
        'best record: [val loss %.5f], [acc %.5f], [acc_cls %.5f], [mean_iu %.5f], [fwavacc %.5f], [epoch %d]'
        % (best_record['val_loss'], best_record['acc'],
           best_record['acc_cls'], best_record['mean_iu'],
           best_record['fwavacc'], best_record['epoch']))
    print(
        '--------------------------------------------------------------------')
    # writer.add_scalar('val_loss', val_loss.avg, epoch)
    # writer.add_scalar('acc', acc, epoch)
    # writer.add_scalar('acc_cls', acc_cls, epoch)
    # writer.add_scalar('mean_iu', mean_iu, epoch)
    # writer.add_scalar('fwavacc', fwavacc, epoch)
    # writer.add_scalar('lr', optimizer.param_groups[1]['lr'], epoch)
    return val_loss.avg
def train_epoch(self, epoch):
    """Train ``self.model`` for one epoch over ``self.train_loader``.

    Tracks time/loss/top-1/top-5 meters, optionally adds a graph-weight L2
    penalty and an auxiliary-head loss, clips gradients, maintains an
    exponential moving average of the loss in ``self.moving_loss``, and
    logs progress plus TensorBoard scalars every ``print_freq`` batches on
    local rank 0.
    """
    batch_time = AverageMeter('Time', ':6.3f')
    data_time = AverageMeter('Data', ':6.3f')
    losses = AverageMeter('Loss', ':.4e')
    top1 = AverageMeter('Acc@1', ':6.2f')
    top5 = AverageMeter('Acc@5', ':6.2f')
    progress = ProgressMeter(len(self.train_loader),
                             [batch_time, data_time, losses, top1, top5],
                             prefix="Epoch: [{}]".format(epoch))

    # switch to train mode
    self.model.train()

    end = time.time()
    for i, (images, target) in enumerate(self.train_loader):
        # measure data loading time
        data_time.update(time.time() - end)

        images = images.cuda()
        target = target.cuda()

        # compute output
        self.optimizer.zero_grad()
        logits, logits_aux = self.model(images)
        loss = self.criterion(logits, target)
        if self.args.graph_wd > 0:
            # L2 penalty applied only to the activated graph weights.
            graph_params = [
                v for k, v in self.model.named_parameters()
                if 'graph_weights' in k and v.requires_grad
            ]
            graph_l2 = 0
            for v in graph_params:
                graph_l2 += (self.model.edge_act(v)**2).sum()
            loss += 0.5 * graph_l2 * self.args.graph_wd
        if self.args.auxiliary:
            # Weighted auxiliary-classifier loss from the second model head.
            loss_aux = self.criterion(logits_aux, target)
            loss += self.args.auxiliary_weight * loss_aux
        loss.backward()
        if self.args.grad_clip > 0:
            nn.utils.clip_grad_norm_(self.model.parameters(),
                                     self.args.grad_clip)
        self.optimizer.step()

        # measure accuracy and record loss
        acc1, acc5 = accuracy(logits, target, topk=(1, 5))
        losses.update(loss.item(), images.size(0))
        top1.update(acc1[0], images.size(0))
        top5.update(acc5[0], images.size(0))
        # Exponential moving average of the loss, seeded on the very first
        # batch of the first epoch.
        self.moving_loss = loss.item() if epoch == self.args.start_epoch and i == 0 else \
            (1. - self.mu) * self.moving_loss + self.mu * loss.item()

        # measure elapsed time
        batch_time.update(time.time() - end)
        end = time.time()

        if i % self.args.print_freq == 0 and self.args.local_rank == 0:
            progress.display(i)
            niter = epoch * len(self.train_loader) + i
            self.writer.add_scalar('Train/Sec_per_batch', batch_time.avg,
                                   niter)
            self.writer.add_scalar('Train/Avg_Loss', losses.avg, niter)
            self.writer.add_scalar('Train/Avg_Top1', top1.avg, niter)
            self.writer.add_scalar('Train/Avg_Top5', top5.avg, niter)
            self.writer.add_scalar('Train/Moving_Loss', self.moving_loss,
                                   niter)
def train(_run, _log):
    """Sacred-style training entry point for the plane-segmentation UNet.

    Builds the network from ``_run.config``, optionally resumes weights,
    then runs ``cfg.num_epochs`` epochs optimizing the combined embedding
    (pull/push) / binary / depth / surface-normal / instance-parameter loss.
    After each epoch the averaged losses are appended to ``history`` and,
    when the run has an id, a checkpoint plus the pickled history are saved.
    """
    cfg = edict(_run.config)
    # Seed all RNGs for reproducibility.
    torch.manual_seed(cfg.seed)
    np.random.seed(cfg.seed)
    random.seed(cfg.seed)
    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

    if not (_run._id is None):
        checkpoint_dir = os.path.join(_run.observers[0].basedir, str(_run._id),
                                      'checkpoints')
        if not os.path.exists(checkpoint_dir):
            os.makedirs(checkpoint_dir)

    # build network
    network = UNet(cfg.model)

    if not (cfg.resume_dir == 'None'):
        # Load resumed weights on CPU first, then move with the model.
        model_dict = torch.load(cfg.resume_dir,
                                map_location=lambda storage, loc: storage)
        network.load_state_dict(model_dict)

    # load nets into gpu
    if cfg.num_gpus > 1 and torch.cuda.is_available():
        network = torch.nn.DataParallel(network)
    network.to(device)

    # set up optimizers
    optimizer = get_optimizer(network.parameters(), cfg.solver)

    # data loader
    data_loader = load_dataset('train', cfg.dataset)

    # save losses per epoch
    history = {
        'losses': [],
        'losses_pull': [],
        'losses_push': [],
        'losses_binary': [],
        'losses_depth': [],
        'ioues': [],
        'rmses': []
    }

    # train() with a flag: BatchNorm layers stay frozen when fix_bn is set.
    network.train(not cfg.model.fix_bn)

    bin_mean_shift = Bin_Mean_Shift(device=device)
    k_inv_dot_xy1 = get_coordinate_map(device)
    instance_parameter_loss = InstanceParameterLoss(k_inv_dot_xy1)

    # main loop
    for epoch in range(cfg.num_epochs):
        batch_time = AverageMeter()
        losses = AverageMeter()
        losses_pull = AverageMeter()
        losses_push = AverageMeter()
        losses_binary = AverageMeter()
        losses_depth = AverageMeter()
        losses_normal = AverageMeter()
        losses_instance = AverageMeter()
        ioues = AverageMeter()
        rmses = AverageMeter()
        instance_rmses = AverageMeter()
        mean_angles = AverageMeter()

        tic = time.time()
        for iter, sample in enumerate(data_loader):
            image = sample['image'].to(device)
            instance = sample['instance'].to(device)
            semantic = sample['semantic'].to(device)
            gt_depth = sample['depth'].to(device)
            gt_seg = sample['gt_seg'].to(device)
            gt_plane_parameters = sample['plane_parameters'].to(device)
            valid_region = sample['valid_region'].to(device)
            gt_plane_instance_parameter = sample[
                'plane_instance_parameter'].to(device)

            # forward pass
            logit, embedding, _, _, param = network(image)

            segmentations, sample_segmentations, sample_params, centers, sample_probs, sample_gt_segs = \
                bin_mean_shift(logit, embedding, param, gt_seg)

            # calculate loss: accumulated per-sample below, then averaged
            loss, loss_pull, loss_push, loss_binary, loss_depth, loss_normal, loss_parameters, loss_pw, loss_instance \
                = 0., 0., 0., 0., 0., 0., 0., 0., 0.

            batch_size = image.size(0)
            for i in range(batch_size):
                _loss, _loss_pull, _loss_push = hinge_embedding_loss(
                    embedding[i:i + 1], sample['num_planes'][i:i + 1],
                    instance[i:i + 1], device)

                _loss_binary = class_balanced_cross_entropy_loss(
                    logit[i], semantic[i])

                _loss_normal, mean_angle = surface_normal_loss(
                    param[i:i + 1], gt_plane_parameters[i:i + 1],
                    valid_region[i:i + 1])

                _loss_L1 = parameter_loss(param[i:i + 1],
                                          gt_plane_parameters[i:i + 1],
                                          valid_region[i:i + 1])
                _loss_depth, rmse, infered_depth = Q_loss(
                    param[i:i + 1], k_inv_dot_xy1, gt_depth[i:i + 1])

                # NOTE(review): when mean-shift yields no segmentation the
                # sample is skipped entirely, yet the sums below are still
                # divided by the full batch_size — confirm this is intended.
                if segmentations[i] is None:
                    continue

                _instance_loss, instance_depth, instance_abs_disntace, _ = \
                    instance_parameter_loss(segmentations[i],
                                            sample_segmentations[i],
                                            sample_params[i],
                                            valid_region[i:i+1],
                                            gt_depth[i:i+1])

                _loss += _loss_binary + _loss_depth + _loss_normal + _instance_loss + _loss_L1

                # planar segmentation iou
                prob = torch.sigmoid(logit[i])
                mask = (prob > 0.5).float().cpu().numpy()
                iou = eval_iou(mask, semantic[i].cpu().numpy())
                ioues.update(iou * 100)
                instance_rmses.update(instance_abs_disntace.item())
                rmses.update(rmse.item())
                mean_angles.update(mean_angle.item())

                loss += _loss
                loss_pull += _loss_pull
                loss_push += _loss_push
                loss_binary += _loss_binary
                loss_depth += _loss_depth
                loss_normal += _loss_normal
                loss_instance += _instance_loss

            # Average the accumulated per-sample losses over the batch.
            loss /= batch_size
            loss_pull /= batch_size
            loss_push /= batch_size
            loss_binary /= batch_size
            loss_depth /= batch_size
            loss_normal /= batch_size
            loss_instance /= batch_size

            # Backward
            optimizer.zero_grad()
            loss.backward()
            optimizer.step()

            # update loss
            losses.update(loss.item())
            losses_pull.update(loss_pull.item())
            losses_push.update(loss_push.item())
            losses_binary.update(loss_binary.item())
            losses_depth.update(loss_depth.item())
            losses_normal.update(loss_normal.item())
            losses_instance.update(loss_instance.item())

            # update time
            batch_time.update(time.time() - tic)
            tic = time.time()

            if iter % cfg.print_interval == 0:
                _log.info(
                    f"[{epoch:2d}][{iter:5d}/{len(data_loader):5d}] "
                    f"Time: {batch_time.val:.2f} ({batch_time.avg:.2f}) "
                    f"Loss: {losses.val:.4f} ({losses.avg:.4f}) "
                    f"Pull: {losses_pull.val:.4f} ({losses_pull.avg:.4f}) "
                    f"Push: {losses_push.val:.4f} ({losses_push.avg:.4f}) "
                    f"INS: {losses_instance.val:.4f} ({losses_instance.avg:.4f}) "
                    f"Binary: {losses_binary.val:.4f} ({losses_binary.avg:.4f}) "
                    f"IoU: {ioues.val:.2f} ({ioues.avg:.2f}) "
                    f"LN: {losses_normal.val:.4f} ({losses_normal.avg:.4f}) "
                    f"AN: {mean_angles.val:.4f} ({mean_angles.avg:.4f}) "
                    f"Depth: {losses_depth.val:.4f} ({losses_depth.avg:.4f}) "
                    f"INSDEPTH: {instance_rmses.val:.4f} ({instance_rmses.avg:.4f}) "
                    f"RMSE: {rmses.val:.4f} ({rmses.avg:.4f}) ")

        _log.info(f"* epoch: {epoch:2d}\t"
                  f"Loss: {losses.avg:.6f}\t"
                  f"Pull: {losses_pull.avg:.6f}\t"
                  f"Push: {losses_push.avg:.6f}\t"
                  f"Binary: {losses_binary.avg:.6f}\t"
                  f"Depth: {losses_depth.avg:.6f}\t"
                  f"IoU: {ioues.avg:.2f}\t"
                  f"RMSE: {rmses.avg:.4f}\t")

        # save history
        history['losses'].append(losses.avg)
        history['losses_pull'].append(losses_pull.avg)
        history['losses_push'].append(losses_push.avg)
        history['losses_binary'].append(losses_binary.avg)
        history['losses_depth'].append(losses_depth.avg)
        history['ioues'].append(ioues.avg)
        history['rmses'].append(rmses.avg)

        # save checkpoint
        if not (_run._id is None):
            torch.save(
                network.state_dict(),
                os.path.join(checkpoint_dir, f"network_epoch_{epoch}.pt"))
            pickle.dump(
                history,
                open(os.path.join(checkpoint_dir, 'history.pkl'), 'wb'))
def test(args, model, video_val=None):
    """Evaluate the tracking policy on the videos in ``video_val``.

    Runs the actor-critic model frame-by-frame over each video, accumulating
    rewards and computing the same discounted-return / GAE losses as
    training. No backward pass or optimizer step is performed here.

    Returns (avg reward, avg loss, avg value loss, avg policy loss).
    """
    reward_avg = AverageMeter()
    loss_avg = AverageMeter()
    value_loss_avg = AverageMeter()
    policy_loss_avg = AverageMeter()
    root_dir = '/home/piaozx/VOT16'
    data_type = 'VOT'
    # NOTE(review): the model is put in train() mode inside an evaluation
    # routine — confirm whether eval() (frozen dropout/BN) was intended.
    model.train()
    env = Env(seqs_path=root_dir,
              data_set_type=data_type,
              save_path='dataset/Result/VOT')
    for video_name in video_val:
        # env = Env(seqs_path=root_dir, data_set_type=data_type, save_path='dataset/Result/VOT')
        actions = []
        rewards = []
        values = []
        entropies = []
        logprobs = []
        # reset for new video
        observation1, observation2 = env.reset(video_name)
        img1 = ReadSingleImage(observation2)
        img1 = Variable(img1).cuda()
        hidden_prev = model.init_hidden_state(
            batch_size=1)  # variable cuda tensor
        _, _, _, _, hidden_pres = model(imgs=img1, hidden_prev=hidden_prev)

        # for loop init parameter
        hidden_prev = hidden_pres
        observation = observation2
        FLAG = 1
        i = 2
        while FLAG:
            img = ReadSingleImage(observation)
            img = Variable(img).cuda()
            action_prob, action_logprob, action_sample, value, hidden_pres = model(
                imgs=img, hidden_prev=hidden_prev)
            # Policy entropy term (encourages exploration in the loss below).
            entropy = -(action_logprob * action_prob).sum(1, keepdim=True)
            entropies.append(entropy)
            actions.append(action_sample.long())  # list, Variable cuda inner
            action_np = action_sample.data.cpu().numpy()
            sample = Variable(torch.LongTensor(action_np).cuda()).unsqueeze(0)
            hidden_prev = hidden_pres
            # Log-probability of the sampled action.
            logprob = action_logprob.gather(1, sample)
            logprobs.append(logprob)
            reward, new_observation, done = env.step(action=action_np)
            env.show_all()
            # env.show_tracking_result()
            print('test:', action_np[0], 'rewards:', reward, 'probability:',
                  action_prob.data.cpu().numpy()[0, 0],
                  action_prob.data.cpu().numpy()[0, 1])
            rewards.append(reward)  # just list
            values.append(value)  # list, Variable cuda inner
            observation = new_observation
            if done:
                FLAG = 0
        num_seqs = len(rewards)
        running_add = Variable(torch.FloatTensor([0])).cuda()
        value_loss = 0
        policy_loss = 0
        gae = torch.FloatTensor([0]).cuda()
        values.append(running_add)
        # Discounted return + generalized advantage estimation, iterated
        # backwards over the episode.
        for i in reversed(range(len(rewards))):
            running_add = args.gamma * running_add + rewards[i]
            advantage = running_add - values[i]
            value_loss = value_loss + 0.5 * advantage.pow(2)

            delta_t = rewards[i] + args.gamma * values[i + 1].data - values[i].data
            gae = gae * args.gamma * args.tau + delta_t
            policy_loss = policy_loss - logprobs[i] * Variable(
                gae) - args.entropy_coef * entropies[i]

        # value_loss = value_loss / num_seqs
        # policy_loss = policy_loss / num_seqs
        #
        # values.append(running_add)
        # for i in reversed(range(len(rewards))):
        #     running_add = args.gamma * running_add + rewards[i]
        #     advantage = running_add - values[i]
        #     value_loss = value_loss + 0.5 * advantage.pow(2)
        #     policy_loss = policy_loss - logprobs[i] * advantage - args.entropy_coef * entropies[i]
        # value_loss = value_loss / num_seqs
        policy_loss = policy_loss / num_seqs
        loss = args.value_loss_coef * value_loss + policy_loss
        print(video_name, 'frame{%d}' % (i), 'rewards:{%.6f}' % np.mean(rewards),
              'loss:{%.6f}' % loss.data[0],
              'value_loss:{%6f}' % value_loss.data[0],
              'policy_loss{%.6f}' % policy_loss.data[0])
        i += 1
        # update the loss
        loss_avg.update(loss.data.cpu().numpy())
        value_loss_avg.update(value_loss.data.cpu().numpy())
        policy_loss_avg.update(policy_loss.data.cpu().numpy())
        reward_avg.update(np.mean(rewards))
    return reward_avg.avg, loss_avg.avg, value_loss_avg.avg, policy_loss_avg.avg
def validate(args, val_loader, model, criterion, criterion2):
    """Run one evaluation pass over ``val_loader``.

    Fixes vs. the original:
      * ``image.cuda(async=True)`` -> ``non_blocking=True``: ``async`` is a
        reserved keyword since Python 3.7, so the old spelling is a
        SyntaxError.
      * deprecated ``Variable(..., volatile=True)`` replaced by a
        ``torch.no_grad()`` context (``volatile`` was removed in PyTorch 0.4).
      * ``loss.data[0]`` -> ``loss.item()``.

    ``criterion2`` is accepted for interface compatibility but unused, and
    the accuracy is still the original placeholder constant (acc = 10).

    Returns (average loss, average accuracy).
    """
    logger = logging.getLogger('val')
    log_dir = os.path.join('log', args.env)
    if not os.path.isdir(log_dir):
        logger.info('log dir does not exist, create log dir.')
        os.makedirs(log_dir)
    fh = logging.FileHandler(os.path.join(log_dir, 'val.log'), mode='a+')
    fh.setLevel(logging.INFO)
    logger.addHandler(fh)

    batch_time = AverageMeter()
    losses = AverageMeter()
    accu = AverageMeter()

    # switch to evaluate mode
    model = model.eval()

    end = time.time()
    with torch.no_grad():  # inference only: no autograd bookkeeping
        for i, (image, target) in enumerate(val_loader):
            image = image.cuda(non_blocking=True)
            target = target.cuda(non_blocking=True)

            # compute output
            output = model(image)
            loss = criterion(output, target)

            acc = 10  # placeholder kept from the original implementation
            losses.update(loss.item())
            accu.update(acc)

            # measure elapsed time
            batch_time.update(time.time() - end)
            end = time.time()

            if ((i + 1) % args.print_freq) == 0:
                logger.info('Test: [{0}/{1}]\t'
                            'Time {batch_time.val:.3f} ({batch_time.avg:.3f})\t'
                            'Loss {loss.val:.4f} ({loss.avg:.4f})\t'
                            'accuracy {accu.val:.4f} ({accu.avg:.4f})\t'.format(
                                i + 1, len(val_loader),
                                batch_time=batch_time, loss=losses,
                                accu=accu))
                sys.stdout.flush()
                print('Test: [{0}/{1}]\t'
                      'Time {batch_time.val:.3f} ({batch_time.avg:.3f})\t'
                      'Loss {loss.val:.4f} ({loss.avg:.4f})\t'
                      'accuracy {accu.val:.4f} ({accu.avg:.4f})\t'.format(
                          i + 1, len(val_loader), batch_time=batch_time,
                          loss=losses, accu=accu))
    logger.info(' * Loss: {losses.avg:.3f} accuracy:{accu.avg:.3f}'.format(
        losses=losses, accu=accu))
    print(' * Loss: {losses.avg:.3f} accuracy:{accu.avg:.3f}'.format(
        losses=losses, accu=accu))
    return losses.avg, accu.avg
def train(args, train_loader, model, optimizer, criterion, epoch):
    """Train ``model`` for one epoch over ``train_loader``.

    Fixes vs. the original:
      * ``.cuda(async=True)`` -> ``.cuda(non_blocking=True)``: ``async`` is a
        reserved keyword since Python 3.7, so the old spelling is a
        SyntaxError.
      * deprecated ``Variable`` wrappers and ``loss.data[0]`` replaced by
        plain tensors and ``loss.item()`` (PyTorch >= 0.4).
      * removed the no-op ``optimizer = optimizer`` statement.

    The accuracy is still the original placeholder constant (acc = 10).

    Returns (average loss, average accuracy).
    """
    logger = logging.getLogger('train')
    log_dir = os.path.join('log', args.env)
    if not os.path.isdir(log_dir):
        logger.info('log dir does not exist, create log dir.')
        os.makedirs(log_dir)
    fh = logging.FileHandler(os.path.join(log_dir, 'train.log'), mode='a+')
    fh.setLevel(logging.INFO)
    logger.addHandler(fh)

    batch_time = AverageMeter()
    data_time = AverageMeter()
    losses = AverageMeter()
    accu = AverageMeter()

    model = model.train()
    end = time.time()
    for i, (image, target) in enumerate(train_loader):
        # measure data loading time
        data_time.update(time.time() - end)
        image = image.cuda(non_blocking=True)
        target = target.cuda(non_blocking=True)

        # compute output
        output = model(image)
        loss = criterion(output, target)

        # update the cls
        optimizer.zero_grad()
        loss.backward()
        optimizer.step()

        acc = 10  # placeholder kept from the original implementation
        losses.update(loss.item())
        accu.update(acc)

        # measure elapsed time
        batch_time.update(time.time() - end)
        end = time.time()

        if ((i + 1) % args.print_freq) == 0:
            logger.info('Epoch: [{0}][{1}/{2}]\t'
                        'Time {batch_time.val:.3f} ({batch_time.avg:.3f})\t'
                        'Data {data_time.val:.3f} ({data_time.avg:.3f})\t'
                        'Loss {loss.val:.4f} ({loss.avg:.4f})\t'
                        'accuracy {accu.val:.4f} ({accu.avg:.4f})\t'.format(
                            epoch + 1, i + 1, len(train_loader),
                            batch_time=batch_time, data_time=data_time,
                            loss=losses, accu=accu))
            sys.stdout.flush()
            print('Epoch: [{0}][{1}/{2}]\t'
                  'Time {batch_time.val:.3f} ({batch_time.avg:.3f})\t'
                  'Data {data_time.val:.3f} ({data_time.avg:.3f})\t'
                  'Loss {loss.val:.4f} ({loss.avg:.4f})\t'
                  'accuracy {accu.val:.4f} ({accu.avg:.4f})\t'.format(
                      epoch + 1, i + 1, len(train_loader),
                      batch_time=batch_time, data_time=data_time,
                      loss=losses, accu=accu))
    logger.info(' * Loss: {losses.avg:.3f} accuracy:{accu.avg:.3f}'.format(
        losses=losses, accu=accu))
    print(' * Loss: {losses.avg:.3f} accuracy:{accu.avg:.3f}'.format(
        losses=losses, accu=accu))
    return losses.avg, accu.avg
def train(args, model, optimizer=None, video_train=None):
    """Train the actor-critic tracking policy over the videos in
    ``video_train``.

    For each video: roll the policy out frame-by-frame, then compute the
    discounted-return / GAE actor-critic losses (plus a small auxiliary
    penalty ``loss_dd`` pushing the action probability away from 0.5),
    backpropagate, clip gradients of actor and critic separately, and step
    the optimizer once per video.

    Returns (avg reward, avg loss, avg value loss, avg policy loss).
    """
    reward_avg = AverageMeter()
    loss_avg = AverageMeter()
    value_loss_avg = AverageMeter()
    policy_loss_avg = AverageMeter()
    root_dir = '/home/piaozx/VOT16'
    data_type = 'VOT'
    model.train()
    # save_path='dataset/Result/VOT'
    env = Env(seqs_path=root_dir, data_set_type=data_type, save_path='save')
    for video_name in video_train:
        actions = []
        rewards = []
        values = []
        entropies = []
        logprobs = []
        # reset for new video
        observation1, observation2 = env.reset(video_name)
        img1 = ReadSingleImage(observation2)
        img1 = Variable(img1).cuda()
        hidden_prev = model.init_hidden_state(
            batch_size=1)  # variable cuda tensor
        _, _, _, _, hidden_pres = model(imgs=img1, hidden_prev=hidden_prev)

        # for loop init parameter
        hidden_prev = hidden_pres
        observation = observation2
        FLAG = 1
        loss_dd = 0
        i = 2
        while FLAG:
            img = ReadSingleImage(observation)
            img = Variable(img).cuda()
            action_prob, action_logprob, action_sample, value, hidden_pres = model(
                imgs=img, hidden_prev=hidden_prev)
            # Policy entropy term (encourages exploration in the loss below).
            entropy = -(action_logprob * action_prob).sum(1, keepdim=True)
            entropies.append(entropy)
            actions.append(action_sample.long())  # list, Variable cuda inner
            action_np = action_sample.data.cpu().numpy()
            # print('train:', action_np)
            # import pdb; pdb.set_trace()
            # print(action_prob[0, 1])
            # Auxiliary penalty: distance of P(action=1) from 0.5, squared.
            loss_dd += torch.abs(0.5 - action_prob[0, 1]).pow(2)
            hidden_prev = hidden_pres
            sample = Variable(torch.LongTensor(action_np).cuda()).unsqueeze(0)
            # Log-probability of the sampled action.
            logprob = action_logprob.gather(1, sample)
            logprobs.append(logprob)
            reward, new_observation, done = env.step(action=action_np)
            env.show_all()
            print(
                'train:', 'frame{%d}' % (i), 'Action:{%1d}' % action_np[0],
                'rewards:{%.6f}' % reward, 'probability:{%.6f}, {%.6f}' %
                (action_prob.data.cpu().numpy()[0, 0],
                 action_prob.data.cpu().numpy()[0, 1]))
            i += 1
            rewards.append(reward)  # just list
            values.append(value)  # list, Variable cuda inner
            observation = new_observation
            if done:
                FLAG = 0
        num_seqs = len(rewards)
        running_add = Variable(torch.FloatTensor([0])).cuda()
        value_loss = 0
        policy_loss = 0
        gae = torch.FloatTensor([0]).cuda()
        values.append(running_add)
        # Discounted return + generalized advantage estimation, iterated
        # backwards over the episode.
        for i in reversed(range(len(rewards))):
            running_add = args.gamma * running_add + rewards[i]
            advantage = running_add - values[i]
            value_loss = value_loss + 0.5 * advantage.pow(2)

            delta_t = rewards[i] + args.gamma * values[i + 1].data - values[i].data
            gae = gae * args.gamma * args.tau + delta_t
            # gae = delta_t
            policy_loss = policy_loss - logprobs[i] * Variable(
                gae) - args.entropy_coef * entropies[i]
        value_loss = value_loss / num_seqs
        policy_loss = policy_loss / num_seqs

        # values.append(running_add)
        # for i in reversed(range(len(rewards))):
        #     running_add = args.gamma * running_add + rewards[i]
        #     advantage = running_add - values[i]
        #     value_loss = value_loss + 0.5 * advantage.pow(2)
        #     policy_loss = policy_loss - logprobs[i] * advantage - args.entropy_coef * entropies[i]
        #
        # value_loss = value_loss / num_seqs
        # policy_loss = policy_loss/num_seqs

        optimizer.zero_grad()
        loss = args.value_loss_coef * value_loss + policy_loss
        loss += 0.005 * loss_dd[0]
        # print model.actor.fc1.weight
        loss.backward()
        # viz_ = viz.get_viz('main')
        # # viz_.update_plot()
        # Clip actor and critic gradients separately to the same max norm.
        torch.nn.utils.clip_grad_norm(model.critic.parameters(),
                                      args.max_grad_norm)
        torch.nn.utils.clip_grad_norm(model.actor.parameters(),
                                      args.max_grad_norm)
        optimizer.step()
        print(video_name, 'rewards:{%.6f}' % np.mean(rewards),
              'loss:{%.6f}' % loss.data[0],
              'value_loss:{%6f}' % value_loss.data[0],
              'policy_loss:{%.6f}' % policy_loss.data[0])
        # update the loss
        loss_avg.update(loss.data.cpu().numpy())
        value_loss_avg.update(value_loss.data.cpu().numpy())
        policy_loss_avg.update(policy_loss.data.cpu().numpy())
        reward_avg.update(np.mean(rewards))
    return reward_avg.avg, loss_avg.avg, value_loss_avg.avg, policy_loss_avg.avg
def train(cfg, train_loader, GENERATORS, DISCRIMINATORS, LOSS_MODEL,
          OPTIMIZERS, SCHEDULERS, CRITERIONS, epoch, train_kernel_dict,
          device):
    """Train the deblurring GAN (netG / netD) for one epoch.

    Each iteration synthesizes a blurred input from the ground-truth image
    with a randomly chosen blur kernel, trains the discriminator on real vs.
    detached fake outputs, then trains the generator with the adversarial,
    VGG-feature and L1 losses (each enabled by its cfg.LOSS_*_WEIGHT).

    Bug fixed vs. the original: in the netG step the generator outputs were
    passed through ``.detach()`` before the discriminator and the VGG
    feature loss, which severed the autograd graph so those loss terms
    produced NO gradient for netG (only the L1 term trained it). Detaching
    is correct only in the netD step and is kept there.

    Returns the OrderedDict of per-loss AverageMeters for the epoch.
    """
    batch_time = AverageMeter()
    loss_dict = OrderedDict()

    # discriminators
    # loss_dict['netD_total_loss'] = AverageMeter()
    if cfg.LOSS_ADV_WEIGHT:
        for name, models in DISCRIMINATORS.items():
            if models is not None:
                loss_dict['dis_loss_{}'.format(name)] = AverageMeter()
                loss_dict['adv_loss_{}'.format(name)] = AverageMeter()
                models.train()  # switch train mode

    # generators
    # loss_dict['netG_total_loss'] = AverageMeter()
    for name, models in GENERATORS.items():
        if models is not None:
            models.train()
            loss_dict['gen_loss_{}'.format(name)] = AverageMeter()
            if cfg.LOSS_L1_WEIGHT:
                loss_dict['l1_loss_{}'.format(name)] = AverageMeter()
    if cfg.LOSS_VGG_WEIGHT:
        loss_dict['vgg_loss'] = AverageMeter()

    cri_l1 = CRITERIONS['cri_l1']
    cri_gan = CRITERIONS['cri_gan']

    end = time.time()
    for cur_iter, data in enumerate(train_loader):
        gt_img, gt_onehot_segmap, _ = data
        batch_size = gt_img.size(0)

        # Pick a random blur-kernel range for this batch.
        blur_range, kernels = random.choice(list(train_kernel_dict.items()))
        blur_img = get_blurtensor(gt_img, blur_range, kernels)
        blur_img, gt_img, gt_onehot_segmap = prepare(
            [blur_img, gt_img, gt_onehot_segmap], device)

        out0, out1, out2, out3 = GENERATORS['netG'](blur_img)

        # Mask each intermediate output/target with its segmentation channel.
        gt0 = gt_img * gt_onehot_segmap[:, 1:2, :, :]
        out0 = out0 * gt_onehot_segmap[:, 1:2, :, :]
        gt1 = gt_img * gt_onehot_segmap[:, 2:3, :, :]
        out1 = out1 * gt_onehot_segmap[:, 2:3, :, :]
        gt2 = gt_img * gt_onehot_segmap[:, 3:4, :, :]
        out2 = out2 * gt_onehot_segmap[:, 3:4, :, :]
        gt_list = [gt0, gt1, gt2, gt_img]
        out_list = [out0, out1, out2, out3]

        #======================================
        # Training netD
        #======================================
        if cfg.LOSS_ADV_WEIGHT:
            d_loss = 0
            dis_input_real = gt_list
            # Detach here: the D step must not backprop into the generator.
            dis_input_fake = [o.detach() for o in out_list]
            dis_real = DISCRIMINATORS['netD'](dis_input_real)
            dis_fake = DISCRIMINATORS['netD'](dis_input_fake)
            dis_real_loss = cri_gan(dis_real, True)
            dis_fake_loss = cri_gan(dis_fake, False)
            d_loss += (dis_real_loss + dis_fake_loss) / 2
            loss_dict['dis_loss_netD'].update(d_loss.item(), batch_size)

            OPTIMIZERS['netD'].zero_grad()
            d_loss.backward()
            OPTIMIZERS['netD'].step()

        #======================================
        # Training netG
        #======================================
        g_loss = 0
        if cfg.LOSS_ADV_WEIGHT:
            # No detach: gradients must flow through netD back into netG.
            gen_input_fake = out_list
            gen_fake = DISCRIMINATORS['netD'](gen_input_fake)
            adv_loss = cri_gan(gen_fake, True) * cfg.LOSS_ADV_WEIGHT
            loss_dict['adv_loss_netD'].update(adv_loss.item(), batch_size)
            g_loss += adv_loss

        if cfg.LOSS_VGG_WEIGHT:
            # vgg face feature loss (no detach, same reason as above)
            gen_input_real = gt_img
            gen_input_fake = out3
            fake_feat = LOSS_MODEL['vggface'](gen_input_fake)
            real_feat = LOSS_MODEL['vggface'](gen_input_real)
            vgg_loss = cri_l1(fake_feat, real_feat) * cfg.LOSS_VGG_WEIGHT
            loss_dict['vgg_loss'].update(vgg_loss.item(), batch_size)
            g_loss += vgg_loss

        if cfg.LOSS_L1_WEIGHT:
            l1_loss = 0
            for g, o in zip(gt_list, out_list):
                l1_loss += cri_l1(g, o) * cfg.LOSS_L1_WEIGHT
            loss_dict['l1_loss_netG'].update(l1_loss.item(), batch_size)
            g_loss += l1_loss

        loss_dict['gen_loss_netG'].update(g_loss.item(), batch_size)

        OPTIMIZERS['netG'].zero_grad()
        g_loss.backward()
        OPTIMIZERS['netG'].step()

        # measure elapsed time
        batch_time.update(time.time() - end)
        end = time.time()

        # plot progress
        print("=" * 30)
        log = '[Epoch: {}|{}] [Iter: {}|{}({:.3f}s)]\
        \n[GPU:{}] [KEY: {}]\
        \n[Blur:{}]'.format(
            epoch,
            cfg.TOTAL_EPOCH,
            cur_iter + 1,
            len(train_loader),
            batch_time.avg,
            cfg.GPU_IDS,
            cfg.KEYPOINT,
            blur_range,
        )
        print(log)
        for loss_name, value in loss_dict.items():
            log = '\t[{} : {:.6f}]'.format(loss_name, value.avg)
            print(log)
    return loss_dict
def train(train_loader, net, optim, curr_epoch, scaler):
    """Run one training epoch with MPI-based loss averaging.

    train_loader: data loader for training
    net: the network (called with a dict of images and ground truths)
    optim: distributed optimizer wrapper exposing ``.comm`` (an MPI
        communicator), ``.local_optimizer`` and ``.last_batch``
    curr_epoch: current epoch index
    scaler: AMP grad scaler used when ``args.amp`` is set

    Returns ``(avg train loss, mean batch time, total epoch wall time)``,
    or just ``None`` early when ``args.test_mode`` cuts the epoch short.
    """
    full_bt = time.perf_counter()
    net.train()

    train_main_loss = AverageMeter()
    start_time = None
    warmup_iter = 10
    optim.last_batch = len(train_loader) - 1
    btimes = []
    batch_time = time.perf_counter()
    for i, data in enumerate(train_loader):
        lr_warmup(optim, curr_epoch, i, len(train_loader), max_lr=0.4)
        if i <= warmup_iter:
            # Keep resetting the timing origin until warmup iterations pass,
            # so the reported batch time excludes startup overhead.
            start_time = time.time()
        # inputs = (bs,3,713,713)
        # gts = (bs,713,713)
        images, gts, _img_name, scale_float = data
        batch_pixel_size = images.size(0) * images.size(2) * images.size(3)
        images, gts, scale_float = images.cuda(), gts.cuda(), scale_float.cuda(
        )
        inputs = {'images': images, 'gts': gts}

        optim.zero_grad()
        if args.amp:
            with amp.autocast():
                main_loss = net(inputs)
            log_main_loss = main_loss.clone().detach_()
            # torch.distributed.all_reduce(log_main_loss,
            #                              torch.distributed.ReduceOp.SUM)
            # Non-blocking MPI all-reduce of the logging loss; completed
            # (Wait) after backward so communication overlaps computation.
            log_wait = optim.comm.Iallreduce(MPI.IN_PLACE, log_main_loss,
                                             MPI.SUM)
            # log_main_loss = log_main_loss / args.world_size
            # train_main_loss.update(log_main_loss.item(), batch_pixel_size)
            scaler.scale(main_loss).backward()
        else:
            main_loss = net(inputs)
            main_loss = main_loss.mean()
            log_main_loss = main_loss.clone().detach_()
            log_wait = None
            #train_main_loss.update(log_main_loss.item(), batch_pixel_size)
            main_loss.backward()

        # the scaler update is within the optim step
        optim.step()

        if i >= warmup_iter:
            curr_time = time.time()
            batches = i - warmup_iter + 1
            batchtime = (curr_time - start_time) / batches
        else:
            batchtime = 0

        if log_wait is not None:
            # Finish the async all-reduce, then convert the sum to a mean.
            log_wait.Wait()
            log_main_loss = log_main_loss / args.world_size
        train_main_loss.update(log_main_loss.item(), batch_pixel_size)
        msg = ('[epoch {}], [iter {} / {}], [train main loss {:0.6f}],'
               ' [lr {:0.6f}] [batchtime {:0.3g}]')
        msg = msg.format(curr_epoch, i + 1, len(train_loader),
                         train_main_loss.avg,
                         optim.local_optimizer.param_groups[-1]['lr'],
                         batchtime)
        logx.msg(msg)

        metrics = {
            'loss': train_main_loss.avg,
            'lr': optim.local_optimizer.param_groups[-1]['lr']
        }
        curr_iter = curr_epoch * len(train_loader) + i
        logx.metric('train', metrics, curr_iter)

        if i >= 10 and args.test_mode:
            del data, inputs, gts
            return
        btimes.append(time.perf_counter() - batch_time)
        batch_time = time.perf_counter()

    if args.benchmarking:
        # Average the final epoch loss across all MPI ranks for reporting.
        train_loss_tens = torch.tensor(train_main_loss.avg)
        optim.comm.Allreduce(MPI.IN_PLACE, train_loss_tens, MPI.SUM)
        train_loss_tens = train_loss_tens.to(torch.float)
        train_loss_tens /= float(optim.comm.size)
        train_main_loss.avg = train_loss_tens.item()

    return train_main_loss.avg, torch.mean(
        torch.tensor(btimes)), time.perf_counter() - full_bt