def train(batch_processor, train_loader, model, criterion, optimizer, epoch, args, logger):
    """Run one training epoch, delegating all per-batch work to `batch_processor`.

    `batch_processor` is expected to do the forward/backward/step itself and
    record the loss into the meter it is handed.
    """
    timer_batch = tools.AverageMeter('Time', ':6.3f')
    timer_data = tools.AverageMeter('Data', ':6.3f')
    loss_meter = tools.AverageMeter('Loss', ':.4e')
    progress = tools.ProgressMeter(
        len(train_loader),
        [timer_batch, timer_data, loss_meter],
        prefix="Epoch: [{}]".format(epoch),
        logger=logger)

    # Freeze BatchNorm statistics when requested, otherwise full train mode.
    if args.fixed_BN:
        model.eval()
    else:
        model.train()
    criterion.train()

    tick = time.time()
    for step, batch_data in enumerate(train_loader):
        # Time spent waiting on the data pipeline.
        timer_data.update(time.time() - tick)

        batch_processor(args.gpu, batch_data, model, criterion, optimizer, loss_meter)

        # Full iteration time (data + compute), then reset the clock.
        timer_batch.update(time.time() - tick)
        tick = time.time()

        if step % args.print_freq == 0:
            progress.display(step)
def infer(valid_queue, model, epoch, Latency, criterion, writer):
    """Evaluate the currently-chosen architecture on the validation queue.

    Activates the chosen op in each mixed block, runs a no-grad eval pass with
    distributed metric reduction, and reports FLOPs/latency of the sampled net.
    Returns (top1.avg, losses.avg).
    """
    batch_time = utils.AverageMeters('Time', ':6.3f')
    losses = utils.AverageMeters('Loss', ':.4e')
    top1 = utils.AverageMeters('Acc@1', ':6.2f')
    top5 = utils.AverageMeters('Acc@5', ':6.2f')
    # set chosen op active and drop the unselected candidate ops
    model.module.set_chosen_op_active()
    model.module.unused_modules_off()
    model.eval()
    progress = utils.ProgressMeter(len(valid_queue), batch_time, losses, top1,
                                   top5, prefix='Test: ')
    cur_step = epoch * len(valid_queue)
    end = time.time()
    with torch.no_grad():
        for step, (input, target) in enumerate(valid_queue):
            # input = input.cuda()
            # target = target.cuda(non_blocking=True)
            # NOTE(review): Variable(volatile=True) is deprecated and a no-op
            # inside torch.no_grad() on PyTorch >= 0.4 — confirm target version.
            input = Variable(input, volatile=True).cuda()
            # target = Variable(target, volatile=True).cuda(async=True)
            target = Variable(target, volatile=True).cuda()
            logits = model(input)
            loss = criterion(logits, target)
            acc1, acc5 = utils.accuracy(logits, target, topk=(1, 5))
            n = input.size(0)
            # Average loss/accuracy across distributed workers before logging.
            reduced_loss = reduce_tensor(loss.data, world_size=config.world_size)
            acc1 = reduce_tensor(acc1, world_size=config.world_size)
            acc5 = reduce_tensor(acc5, world_size=config.world_size)
            losses.update(to_python_float(reduced_loss), n)
            top1.update(to_python_float(acc1), n)
            top5.update(to_python_float(acc5), n)
            # measure elapsed time
            batch_time.update(time.time() - end)
            end = time.time()
            # FLOPs of the active path, probed with a dummy ImageNet-sized input.
            # NOTE(review): recomputed every step — loop-invariant, could be
            # hoisted out of the loop; verify before changing.
            shape = [1, 3, 224, 224]
            input_var = torch.zeros(shape, device=device)
            flops = model.module.get_flops(input_var)
            if config.target_hardware in [None, 'flops']:
                latency = 0
            else:
                latency = Latency.predict_latency(model)
            # NOTE(review): re-attaches unused modules inside the loop; after
            # the first iteration the net may no longer be the pruned chosen
            # path — confirm this is intended.
            model.module.unused_modules_back()
            if step % config.print_freq == 0:
                progress.print(step)
                logger.info(
                    'valid %03d\t loss: %e\t top1: %f\t top5: %f\t flops: %f\t latency: %f',
                    step, losses.avg, top1.avg, top5.avg, flops / 1e6, latency)
    writer.add_scalar('val/loss', losses.avg, cur_step)
    writer.add_scalar('val/top1', top1.avg, cur_step)
    writer.add_scalar('val/top5', top5.avg, cur_step)
    return top1.avg, losses.avg
def train(train_loader, model, criterion, optimizer, epoch, args):
    """Train `model` for one epoch with plain cross-entropy.

    Args:
        train_loader: iterable of (images, target) batches.
        model/criterion/optimizer: standard PyTorch training triple.
        epoch: 0-based epoch index (displayed 1-based).
        args: needs `gpu` (device index or None) and `print_freq`.
    """
    AverageMeter = utils.AverageMeter
    batch_time = AverageMeter('Time', ':6.3f')
    data_time = AverageMeter('Data', ':6.3f')
    losses = AverageMeter('Loss', ':f')
    top1 = AverageMeter('Acc@1', ':6.2f')
    top5 = AverageMeter('Acc@5', ':6.2f')
    progress = utils.ProgressMeter(
        len(train_loader),
        [batch_time, data_time, losses, top1, top5],
        prefix="Epoch: [{}]".format(epoch + 1))

    # switch to train mode
    model.train()

    # NOTE: an unused `pth_file_name = os.path.join(args.train_local, ...)`
    # and a commented-out mixup variant (mixup_data / mixup_criterion) were
    # removed here — neither affected behavior.
    end = time.time()
    for i, (images, target) in enumerate(train_loader):
        # measure data loading time
        data_time.update(time.time() - end)

        if args.gpu is not None:
            images = images.cuda(args.gpu, non_blocking=True)
            target = target.cuda(args.gpu, non_blocking=True)

        # compute output and cross-entropy loss
        output = model(images)
        loss = criterion(output, target)

        # measure accuracy and record loss
        acc1, acc5 = utils.accuracy(output, target, topk=(1, 5))
        losses.update(loss.item(), images.size(0))
        top1.update(acc1[0], images.size(0))
        top5.update(acc5[0], images.size(0))

        # compute gradient and do SGD step
        optimizer.zero_grad()
        loss.backward()
        optimizer.step()

        # measure elapsed time
        batch_time.update(time.time() - end)
        end = time.time()

        if i % args.print_freq == 0:
            progress.display(i)
def train(args, model, train_loader, optimizer, epoch, training_step, writer):
    """One training epoch for a trajectory-prediction model.

    training_step 1/2: teacher-forced partial training on observed
    trajectories only; otherwise: full variety loss — sample `args.best_k`
    predictions and keep the minimum per-sequence L2 error.
    Logs the epoch-average loss to TensorBoard.
    """
    losses = utils.AverageMeter("Loss", ":.6f")
    progress = utils.ProgressMeter(len(train_loader), [losses],
                                   prefix="Epoch: [{}]".format(epoch))
    model.train()
    for batch_idx, batch in enumerate(train_loader):
        batch = [tensor.cuda() for tensor in batch]
        (
            obs_traj,
            pred_traj_gt,
            obs_traj_rel,
            pred_traj_gt_rel,
            non_linear_ped,
            loss_mask,
            seq_start_end,
        ) = batch
        optimizer.zero_grad()
        loss = torch.zeros(1).to(pred_traj_gt)
        l2_loss_rel = []
        # keep only the prediction-horizon part of the mask
        loss_mask = loss_mask[:, args.obs_len:]
        if training_step == 1 or training_step == 2:
            # early curriculum stages: reconstruct the observed trajectory
            model_input = obs_traj_rel
            pred_traj_fake_rel = model(model_input, obs_traj, seq_start_end, 1,
                                       training_step)
            l2_loss_rel.append(
                l2_loss(pred_traj_fake_rel, model_input, loss_mask, mode="raw"))
        else:
            # full stage: feed obs + gt future, draw best_k samples (variety loss)
            model_input = torch.cat((obs_traj_rel, pred_traj_gt_rel), dim=0)
            for _ in range(args.best_k):
                pred_traj_fake_rel = model(model_input, obs_traj, seq_start_end, 0)
                l2_loss_rel.append(
                    l2_loss(
                        pred_traj_fake_rel,
                        model_input[-args.pred_len:],
                        loss_mask,
                        mode="raw",
                    ))
        l2_loss_sum_rel = torch.zeros(1).to(pred_traj_gt)
        l2_loss_rel = torch.stack(l2_loss_rel, dim=1)
        # per scene: sum over pedestrians, take min over the k samples,
        # normalize by (timesteps * pedestrians)
        for start, end in seq_start_end.data:
            _l2_loss_rel = torch.narrow(l2_loss_rel, 0, start, end - start)
            _l2_loss_rel = torch.sum(_l2_loss_rel, dim=0)  # [20]
            _l2_loss_rel = torch.min(_l2_loss_rel) / (
                (pred_traj_fake_rel.shape[0]) * (end - start))
            l2_loss_sum_rel += _l2_loss_rel
        loss += l2_loss_sum_rel
        # obs_traj.shape[1] is the pedestrian count in this batch
        losses.update(loss.item(), obs_traj.shape[1])
        loss.backward()
        optimizer.step()
        if batch_idx % args.print_every == 0:
            progress.display(batch_idx)
    writer.add_scalar("train_loss", losses.avg, epoch)
def train(train_loader, model, criterion, optimizer, epoch, args):
    """Train `model` for one epoch over `train_loader`.

    Args:
        train_loader: iterable of (images, target) batches.
        model/criterion/optimizer: standard PyTorch training triple.
        epoch: 0-based epoch index (displayed 1-based).
        args: needs `print_freq`.

    Returns:
        Tuple (losses.avg, top1.avg, top5.avg) for the epoch.
    """
    AverageMeter = utils.AverageMeter
    batch_time = AverageMeter('Time', ':6.3f')
    data_time = AverageMeter('Data', ':6.3f')
    losses = AverageMeter('Loss', ':f')
    top1 = AverageMeter('Acc@1', ':6.2f')
    top5 = AverageMeter('Acc@5', ':6.2f')
    progress = utils.ProgressMeter(len(train_loader),
                                   [batch_time, data_time, losses, top1, top5],
                                   prefix="Epoch: [{}]".format(epoch + 1))

    # switch to train mode
    model.train()

    end = time.time()
    for i, (images, target) in enumerate(train_loader):
        # measure data loading time
        data_time.update(time.time() - end)

        images = images.cuda()
        # BUGFIX: was `target.cuda(async=True)` — `async` is a reserved
        # keyword on Python 3.7+, so that line is a SyntaxError. The
        # replacement argument since PyTorch 0.4 is `non_blocking`.
        target = target.cuda(non_blocking=True)
        images, target = torch.autograd.Variable(
            images), torch.autograd.Variable(target)

        # forward
        output = model(images)
        loss = criterion(output, target)

        # measure accuracy and record loss
        acc1, acc5 = utils.accuracy(output.data, target.data, topk=(1, 5))
        losses.update(loss.item(), images.size(0))
        top1.update(acc1[0], images.size(0))
        top5.update(acc5[0], images.size(0))

        # compute gradient and do SGD step
        optimizer.zero_grad()
        loss.backward()
        optimizer.step()

        # measure elapsed time
        batch_time.update(time.time() - end)
        end = time.time()

        if i % args.print_freq == 0:
            progress.display(i)
    return (losses.avg, top1.avg, top5.avg)
def validate_warmup(valid_queue, model, epoch, criterion, writer):
    """Validation pass used during the warm-up phase of architecture search.

    Returns (top1.avg, top5.avg, losses.avg).
    """
    batch_time = utils.AverageMeters('Time', ':6.3f')
    losses = utils.AverageMeters('Loss', ':.4e')
    top1 = utils.AverageMeters('Acc@1', ':6.2f')
    top5 = utils.AverageMeters('Acc@5', ':6.2f')
    # NOTE(review): train() mode during validation — presumably deliberate so
    # BatchNorm uses batch statistics for the sampled sub-net; confirm.
    model.train()
    progress = utils.ProgressMeter(len(valid_queue), batch_time, losses, top1,
                                   top5, prefix='Warmup-Test: ')
    cur_step = epoch * len(valid_queue)
    end = time.time()
    with torch.no_grad():
        for step, (input, target) in enumerate(valid_queue):
            # input = input.cuda()
            # target = target.cuda(non_blocking=True)
            # NOTE(review): Variable(volatile=True) is deprecated and a no-op
            # inside torch.no_grad() on PyTorch >= 0.4.
            input = Variable(input, volatile=True).cuda()
            # target = Variable(target, volatile=True).cuda(async=True)
            target = Variable(target, volatile=True).cuda()
            logits = model(input)
            loss = criterion(logits, target)
            acc1, acc5 = utils.accuracy(logits, target, topk=(1, 5))
            n = input.size(0)
            # NOTE(review): raw tensors (not .item()) are fed to the meters
            # here, unlike sibling functions — verify AverageMeters accepts
            # tensors, or whether .item() was intended.
            losses.update(loss, n)
            top1.update(acc1, n)
            top5.update(acc5, n)
            # measure elapsed time
            batch_time.update(time.time() - end)
            end = time.time()
            if step % config.print_freq == 0:
                progress.print(step)
                logger.info('warmup-valid %03d %e %f %f', step, losses.avg,
                            top1.avg, top5.avg)
    writer.add_scalar('warmup-val/loss', losses.avg, cur_step)
    writer.add_scalar('warmup-val/top1', top1.avg, cur_step)
    writer.add_scalar('warmup-val/top5', top5.avg, cur_step)
    return top1.avg, top5.avg, losses.avg
def validate(val_loader, model, criterion, args):
    """Evaluate `model` on `val_loader`; returns (avg loss, avg top-1, avg top-5)."""
    global best_acc1
    make_meter = utils.AverageMeter
    time_meter = make_meter('Time', ':6.3f')
    loss_meter = make_meter('Loss', ':f')
    acc1_meter = make_meter('Acc@1', ':6.2f')
    acc5_meter = make_meter('Acc@5', ':6.2f')
    progress = utils.ProgressMeter(len(val_loader),
                                   [time_meter, loss_meter, acc1_meter, acc5_meter],
                                   prefix='Test: ')

    # switch to evaluate mode
    model.eval()

    with torch.no_grad():
        tick = time.time()
        for step, (images, target) in enumerate(val_loader):
            images = torch.autograd.Variable(images.cuda())
            target = torch.autograd.Variable(target.cuda())

            # compute output
            output = model(images)
            loss = criterion(output, target)

            # measure accuracy and record loss
            batch = images.size(0)
            acc1, acc5 = utils.accuracy(output.data, target.data, topk=(1, 5))
            loss_meter.update(loss.item(), batch)
            acc1_meter.update(acc1[0], batch)
            acc5_meter.update(acc5[0], batch)

            # measure elapsed time
            time_meter.update(time.time() - tick)
            tick = time.time()

            if step % args.print_freq == 0:
                progress.display(step)

    # TODO: this should also be done with the ProgressMeter
    print(' * Acc@1 {top1.avg:.3f} Acc@5 {top5.avg:.3f}'.format(top1=acc1_meter,
                                                                top5=acc5_meter))

    return loss_meter.avg, acc1_meter.avg, acc5_meter.avg
def validate(args, model, val_loader, epoch, writer):
    """Compute ADE/FDE of `model` over `val_loader`; logs to `writer`, returns avg ADE."""
    ade_meter = utils.AverageMeter("ADE", ":.6f")
    fde_meter = utils.AverageMeter("FDE", ":.6f")
    progress = utils.ProgressMeter(len(val_loader), [ade_meter, fde_meter],
                                   prefix="Test: ")

    model.eval()
    with torch.no_grad():
        for step, batch in enumerate(val_loader):
            (
                obs_traj,
                pred_traj_gt,
                obs_traj_rel,
                pred_traj_gt_rel,
                non_linear_ped,
                loss_mask,
                seq_start_end,
            ) = [tensor.cuda() for tensor in batch]
            loss_mask = loss_mask[:, args.obs_len:]

            # Predict relative displacements, keep the prediction horizon,
            # then convert back to absolute coordinates.
            pred_rel = model(obs_traj_rel, obs_traj, seq_start_end)
            pred_abs = relative_to_abs(pred_rel[-args.pred_len:], obs_traj[-1])

            raw_ade, raw_fde = cal_ade_fde(pred_traj_gt, pred_abs)
            n_peds = obs_traj.shape[1]
            ade_meter.update(raw_ade / (n_peds * args.pred_len), n_peds)
            fde_meter.update(raw_fde / n_peds, n_peds)

            if step % args.print_every == 0:
                progress.display(step)

        logging.info(" * ADE {ade.avg:.3f} FDE {fde.avg:.3f}".format(
            ade=ade_meter, fde=fde_meter))

    writer.add_scalar("val_ade", ade_meter.avg, epoch)
    return ade_meter.avg
def train(train_loader, model, optimizer, epoch, args):
    """One MoCo-style contrastive training epoch.

    Builds a list of meters and matching closure-based update functions, one
    per enabled loss term (contrastive / alignment / uniformity), then runs
    the two-view training loop. Also dumps a few de-normalized sample image
    pairs in epoch 0 for visual inspection.
    """
    batch_time = utils.AverageMeter('Time', '6.3f')
    data_time = utils.AverageMeter('Data', '6.3f')
    # save images to investigate: invert the ImageNet normalization so the
    # dumped PNGs are human-viewable
    inv_normalize = transforms.Normalize(
        mean=[-0.485 / 0.229, -0.456 / 0.224, -0.406 / 0.225],
        std=[1 / 0.229, 1 / 0.224, 1 / 0.225])
    inv_transform = transforms.Compose(
        [inv_normalize, transforms.ToPILImage()])
    os.makedirs("{}/train_images".format(args.save_folder), exist_ok=True)
    img_ctr = 0
    loss_meters = []
    loss_updates = []
    meter = utils.AverageMeter('Total Loss', '.4e')
    # Each update fn has signature (moco_losses, total_loss, batch_size).
    # The (lambda m: ...)(meter) wrapper binds the current meter by value.
    loss_updates.append(
        (lambda m: lambda _, l_total, bs: m.update(l_total, bs)
         )(meter))  # lam for closure
    loss_meters.extend([meter, utils.ProgressMeter.BR])
    if args.moco_contr_w != 0:
        meter = utils.AverageMeter('Contr-Loss', '.4e')
        acc1 = utils.AverageMeter('Contr-Acc1', '6.2f')
        acc5 = utils.AverageMeter('Contr-Acc5', '6.2f')

        def f(meter, macc1, macc5):  # closure

            def accuracy(output, target=0, topk=(1, )):
                """Computes the accuracy over the k top predictions for the specified values of k"""
                # For MoCo logits the positive is always class 0, hence
                # `correct = (pred == 0)` rather than comparing to a target.
                with torch.no_grad():
                    maxk = max(topk)
                    batch_size = output.size(0)
                    _, pred = output.topk(maxk, 1, True, True)
                    pred = pred.t()
                    correct = (pred == 0)
                    res = []
                    for k in topk:
                        correct_k = correct[:k].view(-1).float().sum()
                        res.append(correct_k.mul_(100.0 / batch_size))
                    return res

            def update(losses, _, bs):
                meter.update(losses.loss_contr, bs)
                acc1, acc5 = accuracy(losses.logits_contr, topk=(1, 5))
                macc1.update(acc1, bs)
                macc5.update(acc5, bs)

            return update

        loss_updates.append(f(meter, acc1, acc5))
        loss_meters.extend([meter, acc1, acc5, utils.ProgressMeter.BR])
    if args.moco_align_w != 0:
        meter = utils.AverageMeter('Align-Loss', '.4e')
        loss_updates.append(
            (lambda m: lambda losses, _, bs: m.update(losses.loss_align, bs)
             )(meter))  # lam for closure
        loss_meters.append(meter)
    if args.moco_unif_w != 0:
        meter = utils.AverageMeter('Unif-Loss', '.4e')
        # NOTE(review): unlike the align update above, this one omits the
        # batch-size argument to m.update — confirm whether that is intended.
        loss_updates.append(
            (lambda m: lambda losses, _, bs: m.update(losses.loss_unif)
             )(meter))  # lam for closure
        loss_meters.append(meter)
    # drop a trailing line-break marker so the progress line ends cleanly
    if len(loss_meters) and loss_meters[-1] == utils.ProgressMeter.BR:
        loss_meters = loss_meters[:-1]
    progress = utils.ProgressMeter(len(train_loader),
                                   [batch_time, data_time] + loss_meters,
                                   prefix="Epoch: [{}]".format(epoch))

    # switch to train mode
    model.train()

    end = time.time()
    # for i, (images, _) in enumerate(train_loader):
    for i, (_, images, target, _) in enumerate(train_loader):
        # measure data loading time
        data_time.update(time.time() - end)

        # images is a two-view pair [query, key]
        images[0] = images[0].cuda(args.gpu, non_blocking=True)
        images[1] = images[1].cuda(args.gpu, non_blocking=True)

        # save images to investigate (only early batches of epoch 0, and only
        # samples of class 26 — presumably a class under investigation; verify)
        if epoch == 0 and i < 10:
            for batch_index in range(images[0].size(0)):
                if int(target[batch_index].item()) == 26:
                    img_ctr = img_ctr + 1
                    inv_image1 = inv_transform(images[0][batch_index].cpu())
                    inv_image1.save(
                        "{}/train_images/".format(args.save_folder) +
                        str(img_ctr).zfill(5) + '_view_0' + '.png')
                    inv_image2 = inv_transform(images[1][batch_index].cpu())
                    inv_image2.save(
                        "{}/train_images/".format(args.save_folder) +
                        str(img_ctr).zfill(5) + '_view_1' + '.png')

        # compute losses
        moco_losses = model(im_q=images[0], im_k=images[1])
        total_loss = moco_losses.combine(contr_w=args.moco_contr_w,
                                         align_w=args.moco_align_w,
                                         unif_w=args.moco_unif_w)

        # record loss (only on the rank-0 worker, args.index == 0)
        if args.index == 0:
            bs = images[0].shape[0]
            for update_fn in loss_updates:
                update_fn(moco_losses, total_loss, bs)

        # compute gradient and do SGD step
        optimizer.zero_grad()
        total_loss.backward()
        optimizer.step()

        # measure elapsed time
        batch_time.update(time.time() - end)
        end = time.time()

        if i % args.print_freq == 0 and args.index == 0:
            progress.display(i)
def train(train_queue, valid_queue, model, criterion, LatencyLoss, optimizer,
          alpha_optimizer, lr, epoch, writer, update_schedule):
    """One epoch of ProxylessNAS-style search: alternate weight training on
    `train_queue` with architecture-parameter updates on `valid_queue`.

    Weights train on every step via sampled binary gates; after epoch 0,
    `update_schedule` maps step -> number of architecture updates to run,
    each combining cross-entropy with an expected-latency penalty.
    Returns (top1.avg, losses.avg).
    """
    arch_param_num = np.sum(
        np.prod(params.size()) for params in model.module.arch_parameters())
    binary_gates_num = len(list(model.module.binary_gates()))
    weight_param_num = len(list(model.module.weight_parameters()))
    print('#arch_params: %d\t#binary_gates: %d\t#weight_params: %d' %
          (arch_param_num, binary_gates_num, weight_param_num))
    batch_time = utils.AverageMeters('Time', ':6.3f')
    data_time = utils.AverageMeters('Data', ':6.3f')
    losses = utils.AverageMeters('Loss', ':.4e')
    top1 = utils.AverageMeters('Acc@1', ':6.2f')
    top5 = utils.AverageMeters('Acc@5', ':6.2f')
    entropy = utils.AverageMeters('Entropy', ':.4e')
    progress = utils.ProgressMeter(len(train_queue), batch_time, data_time,
                                   losses, top1, top5,
                                   prefix="Epoch: [{}]".format(epoch))
    cur_step = epoch * len(train_queue)
    writer.add_scalar('train/lr', lr, cur_step)
    model.train()
    end = time.time()
    for step, (input, target) in enumerate(train_queue):
        # measure data loading time
        data_time.update(time.time() - end)
        # track architecture-distribution entropy (normalized per parameter)
        net_entropy = model.module.entropy()
        entropy.update(net_entropy.data.item() / arch_param_num, 1)
        # sample random path
        model.module.reset_binary_gates()
        # close unused module
        model.module.unused_modules_off()
        n = input.size(0)
        input = Variable(input, requires_grad=False).cuda()
        # target = Variable(target, requires_grad=False).cuda(async=True)
        target = Variable(target, requires_grad=False).cuda()
        logits = model(input)
        if config.label_smooth > 0.0:
            loss = utils.cross_entropy_with_label_smoothing(
                logits, target, config.label_smooth)
        else:
            loss = criterion(logits, target)
        acc1, acc5 = utils.accuracy(logits, target, topk=(1, 5))
        # NOTE(review): raw tensors fed to meters (no .item()) — confirm
        # AverageMeters accepts tensors.
        losses.update(loss, n)
        top1.update(acc1, n)
        top5.update(acc5, n)
        model.zero_grad()
        loss.backward()
        # NOTE(review): clip_grad_norm is deprecated in favor of
        # clip_grad_norm_ since PyTorch 0.4 — intentional for older torch?
        nn.utils.clip_grad_norm(model.parameters(), config.grad_clip)
        optimizer.step()
        # unused module back
        model.module.unused_modules_back()
        # Training weights firstly, after few epoch, train arch parameters
        if epoch > 0:
            #### office warm up lr ####
            # T_cur = epoch * len(train_queue) + step
            # lr_max = 0.05
            # T_totol = config.warmup_eforhs * len(train_queue)
            # lr = 0.5 * lr_max * (1 + math.cos(math.pi * T_cur / T_total))
            #### office warm up lr ####
            # run as many arch updates as the schedule requests for this step
            for j in range(update_schedule.get(step, 0)):
                model.train()
                latency_loss = 0
                expected_loss = 0
                # draw one fresh validation batch per arch update
                valid_iter = iter(valid_queue)
                input_valid, target_valid = next(valid_iter)
                # alpha_optimizer.zero_grad()
                input_valid = Variable(input_valid, requires_grad=False).cuda()
                # target = Variable(target, requires_grad=False).cuda(async=True)
                target_valid = Variable(target_valid,
                                        requires_grad=False).cuda()
                model.module.reset_binary_gates()
                model.module.unused_modules_off()
                output_valid = model(input_valid).float()
                loss_ce = criterion(output_valid, target_valid)
                # expected latency of the current architecture distribution
                expected_loss = LatencyLoss.expected_latency(model)
                expected_loss_tensor = torch.cuda.FloatTensor([expected_loss])
                latency_loss = LatencyLoss(loss_ce, expected_loss_tensor,
                                           config)
                # compute gradient and do SGD step
                # zero grads of weight_param, arch_param & binary_param
                model.zero_grad()
                latency_loss.backward()
                # set architecture parameter gradients
                model.module.set_arch_param_grad()
                alpha_optimizer.step()
                model.module.rescale_updated_arch_param()
                model.module.unused_modules_back()
                log_str = 'Architecture [%d-%d]\t Loss %.4f\t %s LatencyLoss: %s' % (
                    epoch, step, latency_loss, config.target_hardware,
                    expected_loss)
                utils.write_log(arch_logger_path, log_str)
        # measure elapsed time
        batch_time.update(time.time() - end)
        end = time.time()
        if step % config.print_freq == 0 or step == len(train_queue) - 1:
            logger.info('train step:%03d %03d loss:%e top1:%05f top5:%05f',
                        step, len(train_queue), losses.avg, top1.avg, top5.avg)
            progress.print(step)
    writer.add_scalar('train/loss', losses.avg, cur_step)
    writer.add_scalar('train/top1', top1.avg, cur_step)
    writer.add_scalar('train/top5', top5.avg, cur_step)
    return top1.avg, losses.avg
def warm_up(train_queue, valid_queue, model, criterion, Latency, optimizer,
            epoch, writer):
    """One warm-up epoch: train only the weight parameters of randomly
    sampled paths with a cosine-decayed learning rate, then validate the
    chosen architecture and checkpoint the weights (arch params stripped).

    Returns (top1.avg, losses.avg) of the training pass.
    """
    batch_time = utils.AverageMeters('Time', ':6.3f')
    data_time = utils.AverageMeters('Data', ':6.3f')
    losses = utils.AverageMeters('Loss', ':.4e')
    top1 = utils.AverageMeters('Acc@1', ':6.2f')
    top5 = utils.AverageMeters('Acc@5', ':6.2f')
    progress = utils.ProgressMeter(len(train_queue), batch_time, data_time,
                                   losses, top1, top5,
                                   prefix="Epoch: [{}]".format(epoch))
    cur_step = epoch * len(train_queue)
    model.train()
    print('\n', '-' * 30, 'Warmup epoch: %d' % (epoch), '-' * 30, '\n')
    end = time.time()
    lr = 0
    for step, (input, target) in enumerate(train_queue):
        # measure data loading time
        data_time.update(time.time() - end)
        # office warm up lr: cosine decay from lr_max over all warm-up steps
        T_cur = epoch * len(train_queue) + step
        lr_max = 0.05
        T_total = config.warmup_epochs * len(train_queue)
        lr = 0.5 * lr_max * (1 + math.cos(math.pi * T_cur / T_total))
        for param_group in optimizer.param_groups:
            param_group['lr'] = lr
        writer.add_scalar('warm-up/lr', lr, cur_step + step)
        #### office warm up lr ####
        n = input.size(0)
        input = Variable(input, requires_grad=False).cuda()
        # target = Variable(target, requires_grad=False).cuda(async=True)
        target = Variable(target, requires_grad=False).cuda()
        # sample a random path and disable the unselected candidate ops
        model.module.reset_binary_gates()
        model.module.unused_modules_off()
        logits = model(input)
        # NOTE(review): `epoch > config.warmup_epochs` is never true during
        # warm-up, so label smoothing is effectively off here — confirm intent.
        if config.label_smooth > 0 and epoch > config.warmup_epochs:
            loss = utils.cross_entropy_with_label_smoothing(
                logits, target, config.label_smooth)
        else:
            loss = criterion(logits, target)
        model.zero_grad()
        loss.backward()
        optimizer.step()
        acc1, acc5 = utils.accuracy(logits, target, topk=(1, 5))
        # NOTE(review): raw tensors fed to meters (no .item()) — confirm
        # AverageMeters accepts tensors.
        losses.update(loss, n)
        top1.update(acc1, n)
        top5.update(acc5, n)
        # unused modules back
        model.module.unused_modules_back()
        # measure elapsed time
        batch_time.update(time.time() - end)
        end = time.time()
        if step % config.print_freq == 0 or step == len(train_queue) - 1:
            logger.info(
                'warmup train step:%03d %03d loss:%e top1:%05f top5:%05f',
                step, len(train_queue), losses.avg, top1.avg, top5.avg)
            progress.print(step)
    writer.add_scalar('warmup-train/loss', losses.avg, cur_step)
    writer.add_scalar('warmup-train/top1', top1.avg, cur_step)
    writer.add_scalar('warmup-train/top5', top5.avg, cur_step)
    logger.info('warmup epoch %d lr %e', epoch, lr)
    # set chosen op active
    model.module.set_chosen_op_active()
    # remove unused modules
    model.module.unused_modules_off()
    valid_top1, valid_top5, valid_loss = validate_warmup(
        valid_queue, model, epoch, criterion, writer)
    # FLOPs/latency of the chosen path, probed with a dummy ImageNet input
    shape = [1, 3, 224, 224]
    input_var = torch.zeros(shape, device=device)
    flops = model.module.get_flops(input_var)
    latency = 0
    if config.target_hardware in [None, 'flops']:
        latency = 0
    else:
        latency = Latency.predict_latency(model)
    # unused modules back
    logger.info(
        'Warmup Valid [{0}/{1}]\tloss {2:.3f}\ttop-1 acc {3:.3f}\ttop-5 acc '
        '{4:.3f}\tflops: {5:.1f}M {6:.3f}ms'.format(epoch,
                                                    config.warmup_epochs,
                                                    valid_loss, valid_top1,
                                                    valid_top5, flops / 1e6,
                                                    latency))
    model.module.unused_modules_back()
    config.warmup = epoch + 1 < config.warmup_epochs
    state_dict = model.state_dict()
    # rm architect params and binary getes
    for key in list(state_dict.keys()):
        if 'alpha' in key or 'path' in key:
            state_dict.pop(key)
    checkpoint = {
        'state_dict': state_dict,
        'warmup': config.warmup,
    }
    # NOTE(review): grouping reconstructed from flattened source — only
    # 'warmup_epoch' is conditional; 'epoch' and 'w_optimizer' are always
    # saved. Verify against the original layout.
    if config.warmup:
        checkpoint['warmup_epoch'] = epoch
    checkpoint['epoch'] = epoch
    checkpoint['w_optimizer'] = optimizer.state_dict()
    save_model(model, checkpoint, model_name='warmup.pth.tar')
    return top1.avg, losses.avg
def validate(val_loader, model, criterion, epoch, args):
    """Evaluate `model` on `val_loader`, optionally collecting raw outputs.

    Returns (top1.avg, losses.avg) or, when args.save_vector is set,
    (top1.avg, losses.avg, dict1) where dict1 holds per-batch output tensors
    and labels.
    """
    AverageMeter = utils.AverageMeter
    batch_time = AverageMeter('Time', ':6.3f')
    losses = AverageMeter('Loss', ':f')
    top1 = AverageMeter('Acc@1', ':6.2f')
    top5 = AverageMeter('Acc@5', ':6.2f')
    progress = utils.ProgressMeter(
        len(val_loader),
        [batch_time, losses, top1, top5],
        prefix='Test: ')

    # switch to evaluate mode
    model.eval()

    # collect outputs/labels in lists (was: "使用数组保存")
    dict1 = {
        'vector': [],
        'label': [],
    }
    # target path for the (currently commented-out) save logic below
    pth_file_name = os.path.join(args.train_local,
                                 'epoch_%s.pt' % (str(epoch + 1)))
    with torch.no_grad():
        end = time.time()
        for i, (images, target) in enumerate(val_loader):
            if args.gpu is not None:
                images = images.cuda(args.gpu, non_blocking=True)
                target = target.cuda(args.gpu, non_blocking=True)

            # compute output
            output = model(images)
            loss = criterion(output, target)

            # measure accuracy and record loss
            acc1, acc5 = utils.accuracy(output, target, topk=(1, 5))
            if args.save_vector:
                # append raw batch outputs (was: "存入数组")
                dict1['vector'].append(output)
                dict1['label'].append(target)
            losses.update(loss.item(), images.size(0))
            top1.update(acc1[0], images.size(0))
            top5.update(acc5[0], images.size(0))

            # measure elapsed time
            batch_time.update(time.time() - end)
            end = time.time()

            if i % args.print_freq == 0:
                progress.display(i)

        # TODO: this should also be done with the ProgressMeter
        print(' * Acc@1 {top1.avg:.3f} Acc@5 {top5.avg:.3f}'
              .format(top1=top1, top5=top5))

    # # save to file (was: "保存文件")
    # if args.save_vector:
    #     torch.save(dict1, pth_file_name)
    #     # np.save(pth_file_name,dict1)
    #     if args.train_url.startswith('s3'):
    #         mox.file.copy(pth_file_name,
    #                       args.train_url + '/' + os.path.basename(pth_file_name))
    #         os.remove(pth_file_name)
    if args.save_vector:
        return top1.avg, losses.avg, dict1
    return top1.avg, losses.avg
def run(self, data_loader, prefix, epoch, metrics_calc):
    """Drive one pass over `data_loader`, timing data/batch and logging
    whatever metrics `metrics_calc` returns.

    `metrics_calc(batch)` performs the actual forward/backward (its docstring
    below notes loss.backward happens inside it) and returns an iterable of
    (name, value) pairs, or None. Meters for those metrics are created lazily
    on the first batch that yields metrics, and the ProgressMeter is built at
    the same time.
    """
    batch_time_meter = utils.AverageMeter('Time')
    data_time_meter = utils.AverageMeter('Data')
    other_meters = []
    progress_display_made = False
    start_time = time.time()
    for i, batch in enumerate(data_loader):
        batch_number = epoch * len(data_loader) + i + 1
        data_time_meter.update(time.time() - start_time,
                               n=self.get_batch_size(batch))
        # if batch_number % constants.INTERMITTENT_OUTPUT_FREQ == 0:
        #     self.intermittent_introspection(batch, batch_number)
        # transfer from CPU -> GPU asynchronously if at all; handles a bare
        # tensor, a list of tensors, or a dict of tensors (filtered by
        # self.keys_for_gpu when set)
        if torch.cuda.is_available():
            if type(batch) != type([]) and type(batch) != type({}):
                batch = batch.cuda(non_blocking=True)
            elif type(batch) == type([]):
                for j in range(len(batch)):
                    batch[j] = batch[j].cuda(non_blocking=True)
            else:  # type(batch) == type({})
                for key in batch.keys():
                    if self.keys_for_gpu is None or key in self.keys_for_gpu:
                        batch[key] = batch[key].cuda(non_blocking=True)
        metrics = metrics_calc(batch)  # loss.backward is called in metrics_calc
        if metrics is not None:
            for j, (metric_name, metric_val) in enumerate(metrics):
                self.writer.add_scalar(
                    os.path.join(self.name, prefix + '_' + metric_name),
                    metric_val, self.global_step)
                if not progress_display_made:
                    # first metrics-bearing batch: create the meter lazily
                    other_meters.append(utils.AverageMeter(metric_name))
                    other_meters[j].update(metric_val,
                                           n=self.get_batch_size(batch))
                else:
                    other_meters[j].update(metric_val,
                                           n=self.get_batch_size(batch))
            self.global_step += 1
            if not progress_display_made:
                progress = utils.ProgressMeter(len(data_loader),
                                               other_meters + \
                                               [batch_time_meter, data_time_meter],
                                               prefix=prefix)
                progress_display_made = True
        # NOTE(review): structure reconstructed from flattened source — this
        # elif is paired with `if metrics is not None` (the only reachable
        # reading); it rebuilds `progress` on every metric-less batch until
        # metrics appear, since it never sets progress_display_made.
        elif not progress_display_made:
            progress = utils.ProgressMeter(
                len(data_loader), [batch_time_meter, data_time_meter],
                prefix=prefix)
        batch_time_meter.update(time.time() - start_time,
                                n=self.get_batch_size(batch))
        start_time = time.time()
        if i % constants.PRINT_FREQ == 0:
            progress.display(i + 1, epoch)
    # flush a final progress line unless the last iteration already printed;
    # NOTE(review): `i` / `progress` are undefined if data_loader is empty.
    if i % constants.PRINT_FREQ != 0:
        progress.display(i + 1, epoch)
def validate(val_loader, model, epoch, criterion, config, early_stopping,
             writer, start):
    """Validate `model`, supporting both DALI and regular PyTorch loaders.

    Feeds the average loss to `early_stopping` (which may terminate the whole
    process), logs to TensorBoard, and returns top1.avg.
    """
    batch_time = utils.AverageMeters('Time', ':6.3f')
    losses = utils.AverageMeters('Loss', ':.4e')
    top1 = utils.AverageMeters('Acc@1', ':6.2f')
    top5 = utils.AverageMeters('Acc@5', ':6.2f')
    # DALI iterators expose _size instead of __len__
    if 'DALIClassificationIterator' in val_loader.__class__.__name__:
        progress = utils.ProgressMeter(math.ceil(val_loader._size /
                                                 config.batch_size),
                                       batch_time, losses, top1, top5,
                                       prefix='Test: ')
    else:
        progress = utils.ProgressMeter(len(val_loader), batch_time, losses,
                                       top1, top5, prefix='Test: ')

    # switch to evaluate mode
    model.eval()

    with torch.no_grad():
        end = time.time()
        if 'DALIClassificationIterator' in val_loader.__class__.__name__:
            for i, data in enumerate(val_loader):
                images = Variable(data[0]['data'])
                # NOTE(review): .cuda() is applied twice in this chain —
                # harmless but redundant; confirm before simplifying.
                target = Variable(
                    data[0]['label'].squeeze().cuda().long().cuda(
                        non_blocking=True))

                # compute output
                output = model(images)
                loss = criterion(output, target)

                # measure accuracy and record loss
                acc1, acc5 = utils.accuracy(output, target, topk=(1, 5))
                if config.distributed:
                    reduced_loss = reduce_tensor(loss.data,
                                                 world_size=config.world_size)
                    acc1 = reduce_tensor(acc1, world_size=config.world_size)
                    acc5 = reduce_tensor(acc5, world_size=config.world_size)
                else:
                    reduced_loss = loss.data
                losses.update(to_python_float(reduced_loss), images.size(0))
                top1.update(to_python_float(acc1), images.size(0))
                top5.update(to_python_float(acc5), images.size(0))

                # measure elapsed time
                batch_time.update(time.time() - end)
                end = time.time()

                if i % config.print_freq == 0:
                    progress.print(i)
        else:
            for i, (images, target) in enumerate(val_loader):
                images = images.cuda(device, non_blocking=True)
                target = target.cuda(device, non_blocking=True)

                # compute output
                output = model(images)
                loss = criterion(output, target)

                # measure accuracy and record loss
                acc1, acc5 = utils.accuracy(output, target, topk=(1, 5))
                losses.update(loss.item(), images.size(0))
                top1.update(acc1[0], images.size(0))
                top5.update(acc5[0], images.size(0))

                # measure elapsed time
                batch_time.update(time.time() - end)
                end = time.time()

                if i % config.print_freq == 0:
                    progress.print(i)

    # TODO: this should also be done with the ProgressMeter
    print(' * Acc@1 {top1.avg:.3f} Acc@5 {top5.avg:.3f}'.format(top1=top1,
                                                                top5=top5))

    # may terminate the whole process (os._exit) when patience is exhausted
    early_stopping(losses.avg, model, ckpt_dir=config.path)
    if early_stopping.early_stop:
        print("Early stopping")
        utils.time(time.time() - start)
        os._exit(0)

    writer.add_scalar('val/loss', losses.avg, epoch)
    # NOTE(review): top1/top5 log .val (last batch) while loss logs .avg —
    # confirm whether .avg was intended for all three.
    writer.add_scalar('val/top1', top1.val, epoch)
    writer.add_scalar('val/top5', top5.val, epoch)
    return top1.avg
def train(train_loader, model, criterion, optimizer, epoch, config, writer):
    """One training epoch, supporting both DALI and regular PyTorch loaders.

    The DALI branch additionally handles distributed metric reduction and an
    optional fp16 all-reduce optimizer path.
    """
    utils.adjust_learning_rate(optimizer, epoch, config)
    batch_time = utils.AverageMeters('Time', ':6.3f')
    data_time = utils.AverageMeters('Data', ':6.3f')
    losses = utils.AverageMeters('Loss', ':.4e')
    top1 = utils.AverageMeters('Acc@1', ':6.2f')
    top5 = utils.AverageMeters('Acc@5', ':6.2f')
    # DALI iterators expose _size instead of __len__
    if 'DALIClassificationIterator' in train_loader.__class__.__name__:
        # TODO: IF need * config.world_size
        progress = utils.ProgressMeter(math.ceil(train_loader._size /
                                                 config.batch_size),
                                       batch_time, data_time, losses, top1,
                                       top5, prefix="Epoch: [{}]".format(epoch))
        cur_step = epoch * math.ceil(train_loader._size / config.batch_size)
    else:
        progress = utils.ProgressMeter(len(train_loader), batch_time,
                                       data_time, losses, top1, top5,
                                       prefix="Epoch: [{}]".format(epoch))
        cur_step = epoch * len(train_loader)
    writer.add_scalar('train/lr', config.lr, cur_step)

    model.train()

    end = time.time()
    if 'DALIClassificationIterator' in train_loader.__class__.__name__:
        for i, data in enumerate(train_loader):
            # measure data loading time
            data_time.update(time.time() - end)

            images = Variable(data[0]['data'])
            target = Variable(data[0]['label'].squeeze().cuda().long())

            # compute output
            output = model(images)
            loss = criterion(output, target)

            # measure accuracy and record loss
            acc1, acc5 = utils.accuracy(output, target, topk=(1, 5))
            if config.distributed:
                reduced_loss = reduce_tensor(loss.data,
                                             world_size=config.world_size)
                acc1 = reduce_tensor(acc1, world_size=config.world_size)
                acc5 = reduce_tensor(acc5, world_size=config.world_size)
            else:
                reduced_loss = loss.data
            losses.update(to_python_float(reduced_loss), images.size(0))
            top1.update(to_python_float(acc1), images.size(0))
            top5.update(to_python_float(acc5), images.size(0))

            # compute gradient and do SGD step
            optimizer.zero_grad()
            if config.fp16_allreduce:
                # fp16 optimizer owns the backward pass
                optimizer.backward(loss)
            else:
                loss.backward()
            optimizer.step()
            torch.cuda.synchronize()

            # measure elapsed time
            batch_time.update(time.time() - end)
            end = time.time()

            if i % config.print_freq == 0:
                progress.print(i)
                # NOTE(review): cur_step is never advanced inside the loop,
                # so all scalars of this epoch land on the same global step —
                # confirm whether `cur_step + i` was intended.
                writer.add_scalar('train/loss', loss.item(), cur_step)
                writer.add_scalar('train/top1', top1.avg, cur_step)
                writer.add_scalar('train/top5', top5.avg, cur_step)
    else:
        for i, (images, target) in enumerate(train_loader):
            # measure data loading time
            data_time.update(time.time() - end)

            images = images.cuda(device, non_blocking=True)
            target = target.cuda(device, non_blocking=True)

            # compute output
            output = model(images)
            loss = criterion(output, target)

            # measure accuracy and record loss
            acc1, acc5 = utils.accuracy(output, target, topk=(1, 5))
            losses.update(loss.item(), images.size(0))
            top1.update(acc1.item(), images.size(0))
            top5.update(acc5.item(), images.size(0))

            # compute gradient and do SGD step
            optimizer.zero_grad()
            loss.backward()
            optimizer.step()

            # measure elapsed time
            batch_time.update(time.time() - end)
            end = time.time()

            if i % config.print_freq == 0:
                progress.print(i)
                # NOTE(review): same fixed cur_step as the DALI branch above.
                writer.add_scalar('train/loss', loss.item(), cur_step)
                writer.add_scalar('train/top1', top1.avg, cur_step)
                writer.add_scalar('train/top5', top5.avg, cur_step)