'loss_rcnn_box': loss_rcnn_box } logger.add_scalars("logs_epoch_{}/losses".format(args.session), info, epoch) if args.epochs != 0: save_name = os.path.join(output_dir, 'all_s{}.pth'.format(args.session)) save_checkpoint( { 'session': args.session, 'epoch': epoch, 'model': fasterRCNN.module.state_dict() if args.mGPUs else fasterRCNN.state_dict(), 'optimizer': optimizer.state_dict(), 'pooling_mode': cfg.POOLING_MODE, 'class_agnostic': args.class_agnostic, }, save_name) print('save model: {}'.format(save_name)) if args.use_tfboard: logger.close() for test_pid in range(1, P_NUM): p_str = P_TYPE % test_pid print('{0:#^64}'.format(p_str))
def train():
    """Train a UBR (bounding-box refinement) network.

    Reads all configuration from `parse_args()`: dataset/proposal options,
    network variant, loss type, LR schedule (fixed-step or auto-decay with
    patience), resume/checkpointing. Writes a text log and periodic
    checkpoints under `args.save_dir`.

    NOTE(review): this body was reconstructed from a whitespace-mangled
    source; nesting was inferred and should be confirmed against the
    original file.
    """
    args = parse_args()
    print('Called with args:')
    print(args)
    # Fixed seeds for reproducibility (three different constants, as in the original).
    np.random.seed(4)
    torch.manual_seed(2017)
    torch.cuda.manual_seed(1086)
    output_dir = args.save_dir
    if not os.path.exists(output_dir):
        os.makedirs(output_dir)
    # Training set with augmentation/proposal options; two validation sets:
    # in-domain (`val_dataset`) and transfer ('coco_voc_val').
    train_dataset = TDetDataset([args.dataset + '_train'], training=True, multi_scale=args.multiscale, rotation=args.rotation, pd=args.pd, warping=args.warping, prop_method=args.prop_method, prop_min_scale=args.prop_min_scale, prop_topk=args.prop_topk)
    val_dataset = TDetDataset([args.dataset + '_val'], training=False)
    tval_dataset = TDetDataset(['coco_voc_val'], training=False)
    lr = args.lr
    res_path = 'data/pretrained_model/resnet101_caffe.pth'
    vgg_path = 'data/pretrained_model/vgg16_caffe.pth'
    # Select the backbone variant; unknown names drop into the debugger.
    if args.net == 'UBR_VGG':
        UBR = UBR_VGG(vgg_path, not args.fc, not args.not_freeze, args.no_dropout)
    elif args.net == 'UBR_RES':
        UBR = UBR_RES(res_path, 1, not args.fc)
    elif args.net == 'UBR_RES_FC2':
        UBR = UBR_RES_FC2(res_path, 1)
    elif args.net == 'UBR_RES_FC3':
        UBR = UBR_RES_FC3(res_path, 1)
    else:
        print("network is not defined")
        pdb.set_trace()
    UBR.create_architecture()
    # Per-parameter optimizer settings: biases get 2x LR and no weight decay
    # (classic Caffe-style convention visible in the param groups below).
    params = []
    for key, value in dict(UBR.named_parameters()).items():
        if value.requires_grad:
            if 'bias' in key:
                params += [{
                    'params': [value],
                    'lr': lr * 2,
                    'weight_decay': 0
                }]
            else:
                params += [{
                    'params': [value],
                    'lr': lr,
                    'weight_decay': 0 if args.no_wd else 0.0005
                }]
    optimizer = torch.optim.SGD(params, momentum=0.9)
    # Auto-decay bookkeeping: `patience` counts epochs without improvement,
    # `last_optima` is the best validation loss seen so far (999 = "none yet").
    patience = 0
    last_optima = 999
    if args.resume:
        load_name = os.path.join(
            output_dir,
            '{}_{}_{}.pth'.format(args.net, args.checksession, args.checkepoch))
        print("loading checkpoint %s" % (load_name))
        checkpoint = torch.load(load_name)
        assert args.net == checkpoint['net']
        args.start_epoch = checkpoint['epoch']
        UBR.load_state_dict(checkpoint['model'])
        if not args.no_optim:
            # Older checkpoints may lack these keys, hence the membership tests.
            if 'patience' in checkpoint:
                patience = checkpoint['patience']
            if 'last_optima' in checkpoint:
                last_optima = checkpoint['last_optima']
            optimizer.load_state_dict(checkpoint['optimizer'])
            # Resume with the LR stored in the optimizer, not args.lr.
            lr = optimizer.param_groups[0]['lr']
        print("loaded checkpoint %s" % (load_name))
    log_file_name = os.path.join(
        output_dir, 'log_{}_{}.txt'.format(args.net, args.session))
    # Append to the log when resuming, otherwise start fresh.
    if args.resume:
        log_file = open(log_file_name, 'a')
    else:
        log_file = open(log_file_name, 'w')
    log_file.write(str(args))
    log_file.write('\n')
    UBR.cuda()
    # NOTE(review): no else-branch — `criterion` is unbound for any other
    # value of args.loss; presumably argparse restricts the choices. Confirm.
    if args.loss == 'smoothl1':
        criterion = UBR_SmoothL1Loss(args.iou_th)
    elif args.loss == 'iou':
        criterion = UBR_IoULoss(args.iou_th)
    if not args.use_prop:
        random_box_generator = NaturalUniformBoxGenerator(
            args.iou_th, pos_th=args.alpha, scale_min=1 - args.beta,
            scale_max=1 + args.beta)
    for epoch in range(args.start_epoch, args.max_epochs + 1):
        # setting to train mode
        UBR.train()
        loss_temp = 0
        effective_iteration = 0
        start = time.time()
        mean_boxes_per_iter = 0
        # Shuffle the dataset indices once per epoch.
        rand_perm = np.random.permutation(len(train_dataset))
        for step in range(1, len(train_dataset) + 1):
            index = rand_perm[step - 1]
            im_data, gt_boxes, box_labels, proposals, prop_scores, image_level_label, im_scale, raw_img, im_id, _ = train_dataset[
                index]
            data_height = im_data.size(1)
            data_width = im_data.size(2)
            im_data = Variable(im_data.unsqueeze(0).cuda())
            num_gt_box = gt_boxes.size(0)
            UBR.zero_grad()
            # generate random box from given gt box
            # the shape of rois is (n, 5), the first column is not used
            # so, rois[:, 1:5] is [xmin, ymin, xmax, ymax]
            # Cap total random boxes around 200 when there are many GT boxes.
            num_per_base = 50
            if num_gt_box > 4:
                num_per_base = 200 // num_gt_box
            if args.use_prop:
                # Use external proposals that overlap GT above the IoU threshold.
                proposals = sample_pos_prop(proposals, gt_boxes, args.iou_th)
                if proposals is None:
                    # log_file.write('@@@@ no box @@@@\n')
                    # print('@@@@@ no box @@@@@')
                    continue
                rois = torch.zeros((proposals.size(0), 5))
                rois[:, 1:] = proposals
            else:
                # Sample jittered boxes around each GT box.
                rois = torch.zeros((num_per_base * num_gt_box, 5))
                cnt = 0
                for i in range(num_gt_box):
                    here = random_box_generator.get_rand_boxes(
                        gt_boxes[i, :], num_per_base, data_height, data_width)
                    if here is None:
                        continue
                    rois[cnt:cnt + here.size(0), :] = here
                    cnt += here.size(0)
                if cnt == 0:
                    log_file.write('@@@@ no box @@@@\n')
                    print('@@@@@ no box @@@@@')
                    continue
                rois = rois[:cnt, :]
                # NOTE(review): the two lines below plus the unconditional
                # `continue` look like leftover debug code — they display the
                # image and then skip training for every sample on this path.
                # Confirm against the original before relying on this branch.
                plt.imshow(raw_img)
                plt.show()
                continue
            mean_boxes_per_iter += rois.size(0)
            rois = Variable(rois.cuda())
            gt_boxes = Variable(gt_boxes.cuda())
            bbox_pred, shared_feat = UBR(im_data, rois)
            #refined_boxes = inverse_transform(rois[:, 1:].data, bbox_pred.data)
            #plt.imshow(raw_img)
            #draw_box(rois[:, 1:].data / im_scale)
            #draw_box(refined_boxes / im_scale, 'yellow')
            #draw_box(gt_boxes.data / im_scale, 'black')
            #plt.show()
            loss, num_selected_rois, num_rois, refined_rois = criterion(
                rois[:, 1:5], bbox_pred, gt_boxes)
            if loss is None:
                # No rois matched any GT: make the spike visible in the running
                # average and substitute a zero loss so backward() is a no-op.
                loss_temp = 1000000
                loss = Variable(torch.zeros(1).cuda())
                print('zero mached')
            loss = loss.mean()
            loss_temp += loss.data[0]
            # backward
            optimizer.zero_grad()
            loss.backward()
            clip_gradient([UBR], 10.0)
            optimizer.step()
            effective_iteration += 1
            if step % args.disp_interval == 0:
                end = time.time()
                # Average over iterations that actually trained (skipped
                # samples are excluded via effective_iteration).
                loss_temp /= effective_iteration
                mean_boxes_per_iter /= effective_iteration
                print(
                    "[net %s][session %d][epoch %2d][iter %4d] loss: %.4f, lr: %.2e, time: %.1f, boxes: %.1f"
                    % (args.net, args.session, epoch, step, loss_temp, lr, end - start, mean_boxes_per_iter))
                log_file.write(
                    "[net %s][session %d][epoch %2d][iter %4d] loss: %.4f, lr: %.2e, time: %.1f, boxes: %.1f\n"
                    % (args.net, args.session, epoch, step, loss_temp, lr, end - start, mean_boxes_per_iter))
                loss_temp = 0
                effective_iteration = 0
                mean_boxes_per_iter = 0
                start = time.time()
            # Abort the whole run if the running loss diverged.
            if math.isnan(loss_temp):
                print('@@@@@@@@@@@@@@nan@@@@@@@@@@@@@')
                log_file.write('@@@@@@@nan@@@@@@@@\n')
                return
        # Per-epoch validation on the in-domain and transfer sets.
        val_loss = validate(UBR, None if args.use_prop else random_box_generator, criterion, val_dataset, args)
        tval_loss = validate(UBR, None if args.use_prop else random_box_generator, criterion, tval_dataset, args)
        print('[net %s][session %d][epoch %2d] validation loss: %.4f'
              % (args.net, args.session, epoch, val_loss))
        log_file.write(
            '[net %s][session %d][epoch %2d] validation loss: %.4f\n'
            % (args.net, args.session, epoch, val_loss))
        print(
            '[net %s][session %d][epoch %2d] transfer validation loss: %.4f'
            % (args.net, args.session, epoch, tval_loss))
        log_file.write(
            '[net %s][session %d][epoch %2d] transfer validation loss: %.4f\n'
            % (args.net, args.session, epoch, tval_loss))
        log_file.flush()
        if args.auto_decay:
            # Patience-based decay: count epochs with < 0.001 improvement,
            # track the best validation loss, and decay the LR once the
            # patience budget is exhausted.
            if last_optima - val_loss < 0.001:
                patience += 1
            if last_optima > val_loss:
                last_optima = val_loss
            if patience >= args.decay_patience:
                adjust_learning_rate(optimizer, args.lr_decay_gamma)
                lr *= args.lr_decay_gamma
                patience = 0
        else:
            # Fixed-step decay every lr_decay_step epochs.
            if epoch % args.lr_decay_step == 0:
                adjust_learning_rate(optimizer, args.lr_decay_gamma)
                lr *= args.lr_decay_gamma
        # Checkpoint periodically, and always once LR bottoms out.
        if epoch % args.save_interval == 0 or lr < 0.000005:
            save_name = os.path.join(
                output_dir,
                '{}_{}_{}.pth'.format(args.net, args.session, epoch))
            checkpoint = dict()
            checkpoint['net'] = args.net
            checkpoint['session'] = args.session
            checkpoint['epoch'] = epoch + 1
            checkpoint['model'] = UBR.state_dict()
            checkpoint['optimizer'] = optimizer.state_dict()
            checkpoint['patience'] = patience
            checkpoint['last_optima'] = last_optima
            save_checkpoint(checkpoint, save_name)
            print('save model: {}'.format(save_name))
        # Stop entirely once the LR is effectively zero.
        if lr < 0.000005:
            break
    log_file.close()
'loss_rpn_box': loss_rpn_box, 'loss_rcnn_cls': loss_rcnn_cls, 'loss_rcnn_box': loss_rcnn_box } for tag, value in info.items(): logger.scalar_summary(tag, value, step) loss_temp = 0 start = time.time() if epoch % 1 == 0: if args.mGPUs: save_name = os.path.join(output_dir, 'faster_rcnn_{}_{}_{}.pth'.format(args.session, epoch, step)) save_checkpoint({ 'session': args.session, 'epoch': epoch + 1, 'model': model.module.state_dict(), 'optimizer': optimizer.state_dict(), 'pooling_mode': cfg.POOLING_MODE, 'class_agnostic': args.class_agnostic, }, save_name) else: save_name = os.path.join(output_dir, 'faster_rcnn_{}_{}_{}.pth'.format(args.session, epoch, step)) save_checkpoint({ 'session': args.session, 'epoch': epoch + 1, 'model': model.state_dict(), 'optimizer': optimizer.state_dict(), 'pooling_mode': cfg.POOLING_MODE, 'class_agnostic': args.class_agnostic, }, save_name) print('save model: {}'.format(save_name))
optimizer.zero_grad() loss.backward() if args.net == "vgg16": clip_gradient(UBR, 10.) optimizer.step() if step % args.disp_interval == 0: end = time.time() if step > 0: loss_temp /= args.disp_interval print( "[session %d][epoch %2d][iter %4d] loss: %.4f, lr: %.2e, time: %f" % (args.session, epoch, step, loss_temp, lr, end - start)) loss_temp = 0 start = time.time() save_name = os.path.join( output_dir, 'ubr_{}_{}_{}.pth'.format(args.session, epoch, step)) save_checkpoint( { 'session': args.session, 'epoch': epoch + 1, 'model': UBR.state_dict(), 'optimizer': optimizer.state_dict() }, save_name) print('save model: {}'.format(save_name)) end = time.time() print(end - start)
def train_epoch(model, dataloader, optimizer, epoch, iters_per_epoch, output_dir):
    """Train a detection+relation model for one epoch and checkpoint it.

    Args:
        model: network returning (dete_loss, rela_loss) for a batch.
        dataloader: yields 7-tuples of batch tensors (image, info, gt boxes,
            box counts, subject boxes, object boxes, relation labels).
        optimizer: custom optimizer — note `step(epoch)` / `rate(epoch)` take
            the epoch, so the LR schedule lives inside the optimizer.
        epoch: current (1-based) epoch number, used for logging/naming.
        iters_per_epoch: number of batches to consume from `dataloader`.
        output_dir: directory for the saved checkpoint.

    Relies on module-level pre-allocated tensors (im_data, im_info, gt_boxes,
    num_boxes, sub_boxes, obj_boxes, rela_gt) and globals (args, logger,
    save_checkpoint, cfg) defined elsewhere in this file.
    """
    model.train()
    loss_temp = 0
    start = time.time()
    data_iter = iter(dataloader)
    for step in range(iters_per_epoch):
        data = next(data_iter)
        # Copy the batch into the shared input tensors in place, without
        # recording autograd history for the copies.
        with torch.no_grad():
            im_data.resize_(data[0].size()).copy_(data[0])
            im_info.resize_(data[1].size()).copy_(data[1])
            gt_boxes.resize_(data[2].size()).copy_(data[2])
            num_boxes.resize_(data[3].size()).copy_(data[3])
            sub_boxes.resize_(data[4].size()).copy_(data[4])
            obj_boxes.resize_(data[5].size()).copy_(data[5])
            rela_gt.resize_(data[6].size()).copy_(data[6])
        model.zero_grad()
        dete_loss, rela_loss = \
            model(im_data, im_info, gt_boxes, num_boxes, sub_boxes, obj_boxes, rela_gt)
        loss = dete_loss.mean() + rela_loss.mean()
        loss_temp += loss.item()
        optimizer.zero_grad()
        loss.backward()
        # Custom optimizer: step() receives the epoch to pick its rate.
        optimizer.step(epoch)
        if step % args.disp_interval == 0:
            end = time.time()
            if step > 0:
                loss_temp /= (args.disp_interval + 1)
            dete_loss = dete_loss.item()
            rela_loss = rela_loss.item()
            print("[epoch %2d][iter %4d/%4d] loss: %.4f, lr: %.2e, dete_loss: %.4f, rela_loss: %.4f, time: %.2f" \
                  % (epoch, step, iters_per_epoch, loss_temp, optimizer.rate(epoch), dete_loss, rela_loss, end-start))
            if args.use_tfboard:
                info = {
                    'loss': loss_temp,
                    'dete_loss': dete_loss,
                    'rela_loss': rela_loss,
                }
                logger.add_scalars("logs_s_{}/losses".format(args.session), info,
                                   (epoch - 1) * iters_per_epoch + step)
            loss_temp = 0
            start = time.time()
    # Checkpoint once per epoch, after the step loop (filename keeps the last
    # step index for compatibility with the original naming scheme).
    save_name = os.path.join(
        output_dir,
        'faster_rcnn_{}_{}_{}.pth'.format(args.session, epoch, step))
    # FIX: save the `model` this function actually trained; the original saved
    # the unrelated module-level `fasterRCNN`.
    save_checkpoint(
        {
            'session': args.session,
            'epoch': epoch + 1,
            'model': model.module.state_dict() if args.mGPUs else model.state_dict(),
            'optimizer': optimizer.state_dict(),
            'pooling_mode': cfg.POOLING_MODE,
            'class_agnostic': args.class_agnostic,
        }, save_name)
    print('save model: {}'.format(save_name))
def train_epoch(model, dataloader, optimizer, epoch, iters_per_epoch, output_dir):
    """Train a Faster R-CNN `model` for one epoch and checkpoint it.

    Args:
        model: Faster R-CNN returning rois, class probs, bbox deltas, the four
            component losses, and RoI labels for a batch.
        dataloader: yields 4-tuples (image, im_info, gt_boxes, num_boxes).
        optimizer: custom optimizer — `step(epoch)` / `rate(epoch)` take the
            epoch, so the LR schedule lives inside the optimizer.
        epoch: current (1-based) epoch number, used for logging/naming.
        iters_per_epoch: number of batches to consume from `dataloader`.
        output_dir: directory for the saved checkpoint.

    Relies on module-level pre-allocated tensors (im_data, im_info, gt_boxes,
    num_boxes) and globals (args, logger, save_checkpoint, cfg) defined
    elsewhere in this file.
    """
    model.train()
    loss_temp = 0
    start = time.time()
    data_iter = iter(dataloader)
    for step in range(iters_per_epoch):
        data = next(data_iter)
        # Copy the batch into the shared input tensors in place, without
        # recording autograd history for the copies.
        with torch.no_grad():
            im_data.resize_(data[0].size()).copy_(data[0])
            im_info.resize_(data[1].size()).copy_(data[1])
            gt_boxes.resize_(data[2].size()).copy_(data[2])
            num_boxes.resize_(data[3].size()).copy_(data[3])
        model.zero_grad()
        rois, cls_prob, bbox_pred, rpn_loss_cls, rpn_loss_box, \
            RCNN_loss_cls, RCNN_loss_bbox, rois_label\
            = model(im_data, im_info, gt_boxes, num_boxes)
        # Total loss = RPN (cls + box) + RCNN head (cls + box).
        loss = rpn_loss_cls.mean() + rpn_loss_box.mean() + RCNN_loss_cls.mean(
        ) + RCNN_loss_bbox.mean()
        loss_temp += loss.item()
        optimizer.zero_grad()
        loss.backward()
        # Custom optimizer: step() receives the epoch to pick its rate.
        optimizer.step(epoch)
        if step % args.disp_interval == 0:
            end = time.time()
            if step > 0:
                loss_temp /= (args.disp_interval + 1)
            # Multi-GPU runs return per-replica losses; reduce with mean()
            # before reading scalars.
            if args.mGPUs:
                loss_rpn_cls = rpn_loss_cls.mean().item()
                loss_rpn_box = rpn_loss_box.mean().item()
                loss_rcnn_cls = RCNN_loss_cls.mean().item()
                loss_rcnn_box = RCNN_loss_bbox.mean().item()
            else:
                loss_rpn_cls = rpn_loss_cls.item()
                loss_rpn_box = rpn_loss_box.item()
                loss_rcnn_cls = RCNN_loss_cls.item()
                loss_rcnn_box = RCNN_loss_bbox.item()
            # Foreground/background RoI counts (identical in both branches in
            # the original; hoisted out of the if/else).
            fg_cnt = torch.sum(rois_label.data.ne(0))
            bg_cnt = rois_label.data.numel() - fg_cnt
            print("[session %d][epoch %2d][iter %4d/%4d] loss: %.4f, lr: %.2e" \
                  % (args.session, epoch, step, iters_per_epoch, loss_temp, optimizer.rate(epoch)))
            print("\t\t\tfg/bg=(%d/%d), time cost: %f" % (fg_cnt, bg_cnt, end - start))
            print("\t\t\trpn_cls: %.4f, rpn_box: %.4f, rcnn_cls: %.4f, rcnn_box %.4f" \
                  % (loss_rpn_cls, loss_rpn_box, loss_rcnn_cls, loss_rcnn_box))
            if args.use_tfboard:
                info = {
                    'loss': loss_temp,
                    'loss_rpn_cls': loss_rpn_cls,
                    'loss_rpn_box': loss_rpn_box,
                    'loss_rcnn_cls': loss_rcnn_cls,
                    'loss_rcnn_box': loss_rcnn_box
                }
                logger.add_scalars("logs_s_{}/losses".format(args.session), info,
                                   (epoch - 1) * iters_per_epoch + step)
            loss_temp = 0
            start = time.time()
    # Checkpoint once per epoch, after the step loop (filename keeps the last
    # step index for compatibility with the original naming scheme).
    save_name = os.path.join(
        output_dir,
        'faster_rcnn_{}_{}_{}.pth'.format(args.session, epoch, step))
    # FIX: save the `model` this function actually trained; the original saved
    # the unrelated module-level `fasterRCNN`.
    save_checkpoint(
        {
            'session': args.session,
            'epoch': epoch + 1,
            'model': model.module.state_dict() if args.mGPUs else model.state_dict(),
            'optimizer': optimizer.state_dict(),
            'pooling_mode': cfg.POOLING_MODE,
            'class_agnostic': args.class_agnostic,
        }, save_name)
    print('save model: {}'.format(save_name))
print( '---------[session %d] [epoch %2d] [iter %4d/%4d]----------------' % (args.session, epoch, itr + 1, num_itr)) print('loss: %.5f' % (loss.data[0])) if args.cuda: prob = cls_prob[:, tracklet_label.long()].cpu() # label = tracklet_label.cpu() else: prob = cls_prob[:, tracklet_label.long()] # label = tracklet_label print('class prob:', prob.data.numpy()) print('lr: {}'.format(lr)) # print('label:', label.data.numpy()) # save the model "kuai gan huohjghghghghghghghjkjllljljljljkljljlj,wo shi yangwenfei" save_name = '{}_{}_{}_{}.pkl'.format(args.arch, args.session, epoch, itr) save_name = os.path.join(output_dir, save_name) if args.mGPUs: trained_weight = sbc_model.module.state_dict() else: trained_weight = sbc_model.state_dict() save_checkpoint( { 'session': args.session, 'epoch': epoch + 1, 'model': trained_weight, 'optimizer': optimizer.state_dict() }, save_name)
# Main epoch loop: decay the LR on schedule, run one training epoch, then
# checkpoint the model after every epoch.
for epoch in range(start_epoch, max_epochs + 1):
    # LR decays on epochs divisible by (lr_decay_step + 1) — kept exactly as
    # the surrounding code expects.
    if epoch % (lr_decay_step + 1) == 0:
        adjust_learning_rate(optimizer, lr_decay_gamma)
        lr *= lr_decay_gamma

    train_net(tdcnn_demo, dataloader, optimizer, lr, epoch, disp_interval, session)

    # With more than one GPU the model is wrapped, so the weights live on
    # `.module`; otherwise read the state dict directly.
    if len(gpus) > 1:
        trained_weights = tdcnn_demo.module.state_dict()
    else:
        trained_weights = tdcnn_demo.state_dict()
    save_name = os.path.join(
        model_dir,
        'tdcnn_{}_{}_{}.pth'.format(session, epoch, len(dataloader)))
    save_checkpoint(
        {
            'session': session,
            'epoch': epoch,
            'model': trained_weights,
            'optimizer': optimizer.state_dict(),
            'pooling_mode': cfg.POOLING_MODE
        }, save_name)