def train(train_loader, model, optimizer, lr_scheduler, tb_writer): cur_lr = lr_scheduler.get_cur_lr() rank = get_rank() average_meter = AverageMeter() def is_valid_number(x): return not (math.isnan(x) or math.isinf(x) or x > 1e4) world_size = get_world_size() num_per_epoch = len(train_loader.dataset) // \ cfg.TRAIN.EPOCH // (cfg.TRAIN.BATCH_SIZE * world_size) start_epoch = cfg.TRAIN.START_EPOCH epoch = start_epoch if not os.path.exists(cfg.TRAIN.SNAPSHOT_DIR) and \ get_rank() == 0: os.makedirs(cfg.TRAIN.SNAPSHOT_DIR) logger.info("model\n{}".format(describe(model.module))) end = time.time() for idx, data in enumerate(train_loader): if epoch != idx // num_per_epoch + start_epoch: epoch = idx // num_per_epoch + start_epoch if get_rank() == 0: torch.save( { 'epoch': epoch, 'state_dict': model.module.state_dict(), 'optimizer': optimizer.state_dict() }, cfg.TRAIN.SNAPSHOT_DIR + '/checkpoint_e%d.pth' % (epoch)) if epoch == cfg.TRAIN.EPOCH: return if cfg.BACKBONE.TRAIN_EPOCH == epoch: logger.info('start training backbone.') optimizer, lr_scheduler = build_opt_lr(model.module, epoch) logger.info("model\n{}".format(describe(model.module))) lr_scheduler.step(epoch) cur_lr = lr_scheduler.get_cur_lr() logger.info('epoch: {}'.format(epoch + 1)) tb_idx = idx if idx % num_per_epoch == 0 and idx != 0: for idx, pg in enumerate(optimizer.param_groups): logger.info('epoch {} lr {}'.format(epoch + 1, pg['lr'])) if rank == 0: tb_writer.add_scalar('lr/group{}'.format(idx + 1), pg['lr'], tb_idx) data_time = average_reduce(time.time() - end) if rank == 0: tb_writer.add_scalar('time/data', data_time, tb_idx) outputs = model(data) loss = outputs['total_loss'] if is_valid_number(loss.data.item()): optimizer.zero_grad() loss.backward() reduce_gradients(model) if rank == 0 and cfg.TRAIN.LOG_GRADS: log_grads(model.module, tb_writer, tb_idx) # clip gradient clip_grad_norm_(model.parameters(), cfg.TRAIN.GRAD_CLIP) optimizer.step() batch_time = time.time() - end batch_info = {} batch_info['batch_time'] = average_reduce(batch_time) batch_info['data_time'] = average_reduce(data_time) for k, v in sorted(outputs.items()): batch_info[k] = average_reduce(v.data.item()) average_meter.update(**batch_info) if rank == 0: for k, v in batch_info.items(): tb_writer.add_scalar(k, v, tb_idx) if (idx + 1) % cfg.TRAIN.PRINT_FREQ == 0: info = "Epoch: [{}][{}/{}] lr: {:.6f}\n".format( epoch + 1, (idx + 1) % num_per_epoch, num_per_epoch, cur_lr) for cc, (k, v) in enumerate(batch_info.items()): if cc % 2 == 0: info += ("\t{:s}\t").format(getattr(average_meter, k)) else: info += ("{:s}\n").format(getattr(average_meter, k)) logger.info(info) print_speed(idx + 1 + start_epoch * num_per_epoch, average_meter.batch_time.avg, cfg.TRAIN.EPOCH * num_per_epoch) end = time.time()
def train(train_loader, model, optimizer, lr_scheduler, tb_writer): ''' :param train_loader: :param model: :param optimizer: :param lr_scheduler: :param tb_writer: :return: ''' cur_lr = lr_scheduler.get_cur_lr() #获得当前学习率 rank = get_rank() average_meter = AverageMeter() def is_valid_number(x): return not (math.isnan(x) or math.isinf(x) or x > 1e4) world_size = get_world_size() num_per_epoch = len(train_loader.dataset) // cfg.TRAIN.EPOCH // ( cfg.TRAIN.BATCH_SIZE * world_size) start_epoch = cfg.TRAIN.START_EPOCH epoch = start_epoch if not os.path.exists(cfg.TRAIN.SNAPSHOT_DIR) and get_rank() == 0: os.makedirs(cfg.TRAIN.SNAPSHOT_DIR) logger.info("model\n{}".format(describe(model.module))) #打印模型 end = time.time() for idx, data in enumerate(train_loader): if epoch != idx // num_per_epoch + start_epoch: #每个epoch的跳变沿进行一次模型存储 epoch = idx // num_per_epoch + start_epoch if get_rank() == 0: torch.save( { 'epoch': epoch, 'state_dict': model.module.state_dict(), 'optimizer': optimizer.state_dict() }, cfg.TRAIN.SNAPSHOT_DIR + '/checkpoint_e%d.pth' % (epoch)) if epoch == cfg.TRAIN.EPOCH: return # 如果达到第10个epoch后,则要开始微调backbone的后面3层,要重新设置一下哪些参数是可训练的,哪些参数是不动的,学习率的调整因子 if cfg.BACKBONE.TRAIN_EPOCH == epoch: logger.info('start training backbone.') optimizer, lr_scheduler = build_opt_lr(model.module, epoch) logger.info("model\n{}".format(describe(model.module))) lr_scheduler.step(epoch) cur_lr = lr_scheduler.get_cur_lr() logger.info('epoch: {}'.format(epoch + 1)) tb_idx = idx #tensor board的idx if idx % num_per_epoch == 0 and idx != 0: for idx, pg in enumerate( optimizer.param_groups): #将优化器中的学习率添加到tensorboard中监视 logger.info('epoch {} lr {}'.format(epoch + 1, pg['lr'])) if rank == 0: tb_writer.add_scalar('lr/group{}'.format(idx + 1), pg['lr'], tb_idx) data_time = average_reduce(time.time() - end) if rank == 0: tb_writer.add_scalar('time/data', data_time, tb_idx) # show_tensor(data, tb_idx, tb_writer) # 只看输入数据,在tensorboard中显示输入数据 data[0]['iter'] = tb_idx #添加监视用 outputs = model(data) # loss = outputs['feat_loss'] loss = outputs['total_loss'] show_tensor(data, tb_idx, tb_writer, outputs) #输入输出都看,在tensorboard中显示输入数据 if is_valid_number( loss.data.item()): #判断损失是否是合法数据,滤掉nan,+inf,>10000的这样的损失 optimizer.zero_grad() loss.backward() reduce_gradients(model) #分发梯度 if rank == 0 and cfg.TRAIN.LOG_GRADS: #对梯度信息监视 log_grads(model.module, tb_writer, tb_idx) # clip gradient clip_grad_norm_(model.parameters(), cfg.TRAIN.GRAD_CLIP) optimizer.step() batch_time = time.time() - end batch_info = {} batch_info['batch_time'] = average_reduce(batch_time) batch_info['data_time'] = average_reduce(data_time) for k, v in sorted(outputs.items()): if k is 'zf' or k is 'zf_gt' or k is 'zfs' or k is 'box_img': pass else: batch_info[k] = average_reduce(v.data.item()) average_meter.update(**batch_info) if rank == 0: for k, v in batch_info.items(): tb_writer.add_scalar(k, v, tb_idx) if (idx + 1) % cfg.TRAIN.PRINT_FREQ == 0: info = "Epoch: [{}][{}/{}] lr: {:.6f}\n".format( epoch + 1, (idx + 1) % num_per_epoch, num_per_epoch, cur_lr) for cc, (k, v) in enumerate(batch_info.items()): if cc % 2 == 0: info += ("\t{:s}\t").format(getattr(average_meter, k)) else: info += ("{:s}\n").format(getattr(average_meter, k)) logger.info(info) print_speed(idx + 1 + start_epoch * num_per_epoch, average_meter.batch_time.avg, cfg.TRAIN.EPOCH * num_per_epoch) end = time.time()