def __init__(self, dataloader, hierarchical_transformer, config, i):
    """Set up the input encoders, the (optionally data-parallel) transformer,
    its NLL criterion, and the warm-up Adam optimizer wrapper.

    Args:
        dataloader: data source exposing ``tweet_field.vocab``.
        hierarchical_transformer: the model to train.
        config: experiment configuration object.
        i: iteration/fold index recorded on the trainer.
    """
    super(Trainer, self).__init__()
    self.iter = i
    self.config = config
    self.dataloader = dataloader
    self.cpu = torch.device("cpu")
    self.multi_gpu = len(self.config.gpu_idx) > 1

    # Encoders for word identity, word position, and tweet time delay.
    vocab = self.dataloader.tweet_field.vocab
    self.word_encoder = WordEncoder.WordEncoder(config, vocab)
    self.word_pos_encoder = PositionEncoder.PositionEncoder(config, self.config.max_length)
    self.time_delay_encoder = PositionEncoder.PositionEncoder(config, self.config.size)

    # <----------- Check for GPU setting ----------->
    # On GPU the model/criterion are wrapped for multi-device scatter/gather.
    if self.config.gpu:
        self.hierarchical_transformer = DataParallelModel(hierarchical_transformer.cuda())
        self.criterion = DataParallelCriterion(nn.NLLLoss())
    else:
        self.hierarchical_transformer = hierarchical_transformer
        self.criterion = nn.NLLLoss()

    # Transformer-style base LR: d_model ** -0.5 (scaled by the scheduler).
    base_lr = np.power(self.config.d_model, -0.5)
    self.adam_optimizer = optim.Adam(
        self.hierarchical_transformer.parameters(),
        base_lr,
        betas=(self.config.beta_1, self.config.beta_2),
    )
    self.optimizer = Optimizer.Optimizer(self.config, self.adam_optimizer)
def create_single_model(args):
    # Builds a single model from the parsed CLI arguments. `args` doubles as
    # a mutable config object: `args.loss` is (re)assigned here as a side
    # effect so the caller can pick up the (possibly data-parallel) loss.
    print('Creating model with\n \ Input size: {}\n \ Output size: {}\n \ Activation {}\n \ Num layers: {}\n \ Hidden units per layer: {}\n \ Using bias: {}\n \ Using batchnorm {}\n \ With batchsize {}'.format(
        args.input_size, args.output_size, args.actv, args.num_layers, args.hidden, args.bias, args.bn, args.batch))
    # args.model is a model class/factory; every architectural knob comes
    # straight from the command line.
    model = args.model(input_size=args.input_size,
                       output_size=args.output_size,
                       actv_type=args.actv,
                       num_layers=args.num_layers,
                       hidden_size=args.hidden,
                       bias=args.bias,
                       use_bn=args.bn)
    # Expose the model's loss on args for the caller.
    args.loss = model.loss
    if args.multi_gpu:
        print('Using data parallelism with {} GPUs'.format(args.num_gpu))
        # model = nn.DataParallel(model, device_ids = args.device_ids)
        ###
        # Wrap both model and loss so outputs stay scattered across devices.
        model = DataParallelModel(model, device_ids = args.device_ids)
        args.loss = DataParallelCriterion(args.loss, device_ids = args.device_ids)
        ###
    print('Sending model to device {}'.format(args.device))
    model.to(args.device)
    return model
def init_fn(self, shared_model=None, **kwargs):
    """Initialize the (GPU-only) inference model.

    Uses ``shared_model`` when provided (e.g. a model owned by a trainer);
    otherwise builds auxiliary models, the main model, and wraps it for
    multi-GPU execution.

    Args:
        shared_model: an already-constructed (and wrapped) model to reuse.
        **kwargs: ignored; accepted for interface compatibility.

    Raises:
        NotImplementedError: when no GPU is configured (CPU inference is
            not supported).
    """
    self.gpu_inference = self.options.num_gpus > 0

    # `gpu_inference` is a bool; test it directly instead of comparing to 0.
    if not self.gpu_inference:
        raise NotImplementedError(
            "CPU inference is currently buggy. This takes some extra efforts and "
            "might be fixed in the future.")

    if shared_model is not None:
        self.model = shared_model
    else:
        self.init_auxiliary()
        self.model = self.init_model()
        self.model = DataParallelModel(self.model.cuda(), device_ids=self.gpus)
def main(opt):
    """Run lane detection on a single video using a trained checkpoint."""
    logger.info('Loading model: %s', opt.model_file)
    checkpoint = torch.load(opt.model_file)
    checkpoint_opt = checkpoint['opt']

    # Load model location
    # Rebuild the network as configured at training time and wrap it the
    # same way it was wrapped when the checkpoint was written.
    model = DataParallelModel(LaneNet(cnn_type=checkpoint_opt.cnn_type))

    # Update/Overwrite some test options like batch size, location to metadata
    # file
    vars(checkpoint_opt).update(vars(opt))

    logger.info('Building model...')
    model.load_state_dict(checkpoint['model'])
    if torch.cuda.is_available():
        model = model.cuda()

    logger.info('Start testing...')
    test_video(model,
               opt.input_file,
               opt.output_file,
               checkpoint_opt.width,
               checkpoint_opt.height,
               genline_method=opt.genline_method)
def DataParallelModelProcess(self, model, ParallelModelType=1, is_eval='train', device='cuda'):
    """Wrap ``model`` in a data-parallel container, set its train/eval mode,
    cast it to float, and move it to ``device``.

    Args:
        model: the module to wrap.
        ParallelModelType: 1 for the current wrapper, 2 for the legacy one.
        is_eval: 'train' or 'eval' mode selector.
        device: target device for the wrapped model.

    Returns:
        The wrapped, mode-set, float, device-placed model.

    Raises:
        ValueError: on an unknown ``ParallelModelType`` or ``is_eval``.
    """
    if ParallelModelType == 1:
        wrapper = DataParallelModel
    elif ParallelModelType == 2:
        wrapper = parallel_old.DataParallelModel
    else:
        raise ValueError('ParallelModelType should be 1 or 2')
    wrapped = wrapper(model)

    if is_eval not in ('eval', 'train'):
        raise ValueError('is_eval should be eval or train')
    # Calls wrapped.eval() or wrapped.train() according to the selector.
    getattr(wrapped, is_eval)()

    wrapped.float()
    wrapped.to(device)
    return wrapped
def main(opt):
    """Evaluate a trained LaneNet checkpoint on a test split and emit
    predictions in the format selected by ``opt.loader_type``."""
    logger.info('Loading model: %s', opt.model_file)
    checkpoint = torch.load(opt.model_file)
    checkpoint_opt = checkpoint['opt']

    # Load model location
    # Rebuild and wrap the network exactly as it was at training time.
    model = DataParallelModel(LaneNet(cnn_type=checkpoint_opt.cnn_type))

    # Update/Overwrite some test options like batch size, location to metadata
    # file
    vars(checkpoint_opt).update(vars(opt))
    test_loader = get_data_loader(checkpoint_opt, split='test', return_org_image=True)

    logger.info('Building model...')
    model.load_state_dict(checkpoint['model'])
    if torch.cuda.is_available():
        model = model.cuda()

    post_proc = PostProcessor()
    clusterer = LaneClustering()

    logger.info('Start testing...')
    # The three loader types are mutually exclusive, so an elif chain is
    # equivalent to the original independent ifs.
    if opt.loader_type == 'tusimpletest':
        x_lanes, _, times, _ = test(model, test_loader, post_proc, clusterer,
                                    genline_method=opt.genline_method)
        output_tuprediction(opt.meta_file, x_lanes, times, opt.output_file)
    elif opt.loader_type == 'culanetest':
        x_lanes, y_list, _, image_files = test(model, test_loader, post_proc, clusterer,
                                               genline_method=opt.genline_method)
        output_culaneprediction(opt.output_dir, x_lanes, y_list, image_files)
    elif opt.loader_type == 'dirloader':
        visualize(model, test_loader, post_proc, clusterer,
                  show_demo=opt.show_demo,
                  output_dir=opt.output_dir,
                  genline_method=opt.genline_method)
def main(opt):
    """Load a segmentation checkpoint, rebuild its model, and run the test loop."""
    logger.info('Loading model: %s', opt.model_file)
    ckpt = torch.load(opt.model_file)
    run_opt = ckpt['opt']

    # Update/Overwrite some test options like batch size, location to metadata
    # file
    vars(run_opt).update(vars(opt))
    logger.info('Updated input arguments: %s',
                json.dumps(vars(run_opt), sort_keys=True, indent=4))

    logger.info('Building model...')
    net = get_model(run_opt, num_classes=run_opt.num_classes)
    loader = get_data_loader(run_opt,
                             training=False,
                             return_org_image=True,
                             data_list=opt.test_data_list)

    logger.info('Loading model parameters...')
    # Wrap before loading: the checkpoint was saved from a wrapped model.
    net = DataParallelModel(net)
    net.load_state_dict(ckpt['model'])
    if torch.cuda.is_available():
        net.cuda()

    logger.info('Start testing...')
    test(run_opt, net, loader)
def init_fn(self, shared_model=None, **kwargs):
    # One-time trainer setup: model, optimizer, LR scheduler, criterion,
    # and loss bookkeeping. `shared_model`, when given, is reused as-is
    # (presumably already wrapped/cuda'd by its owner — TODO confirm).
    # Create auxiliary models
    self.init_auxiliary()
    if shared_model is not None:
        self.model = shared_model
    else:
        self.model = self.init_model()
        self.model = DataParallelModel(self.model.cuda(), device_ids=self.gpus)
    # self.model = torch.nn.DataParallel(self.model, device_ids=self.gpus).cuda()
    # Setup a joint optimizer for the 2 models
    # NOTE: the optimizer must be created after self.model exists, since
    # init_optimizer reads self.model.parameters().
    self.optimizer = self.init_optimizer(self.options.optim.name)
    self.lr_scheduler = self.init_lr(self.options.optim.lr_scheduler)
    # Create loss functions (wrapped so per-device outputs are reduced).
    self.criterion = self.init_loss_functions()
    self.criterion = DataParallelCriterion(self.criterion.cuda(), device_ids=self.gpus)
    # Create AverageMeters for losses
    self.losses = AverageMeter()
    # Evaluators
    # self.evaluators = [Evaluator(self.options, self.logger, self.summary_writer, shared_model=self.model)]
    # Filled in when the dataloader is created.
    self.dataset_size = None
def define_G(ngf):
    """Build the generator network; wrap it for data parallelism when the
    run is configured for it and more than one CUDA device is present."""
    generator = init_net(Generator(ngf))
    if opt.parallel and torch.cuda.device_count() > 1:
        generator = DataParallelModel(generator)
    return generator
def main(args):
    # Train a segmentation network with a Lovasz-based loss; saves the final
    # (unwrapped) weights under args.snapshot_dir.
    # initialization
    print("Input arguments:")
    for key, val in vars(args).items():
        print("{:16} {}".format(key, val))
    if not os.path.exists(args.snapshot_dir):
        os.makedirs(args.snapshot_dir)
    writer = SummaryWriter(log_dir=os.path.join(args.log_dir, args.method))
    random.seed(args.seed)
    torch.manual_seed(args.seed)
    cudnn.enabled = True
    cudnn.benchmark = True

    # conduct seg network
    seg_model = get_model(num_classes=args.num_classes)
    saved_state_dict = torch.load(args.restore_from)
    new_params = seg_model.state_dict().copy()
    # NOTE(review): `saved_state_dict` and `new_params` are currently unused —
    # the restore logic below is commented out, so args.restore_from is loaded
    # but never applied to the model. Confirm whether this is intentional.
    # if args.init:
    #     for i in saved_state_dict:
    #         i_parts = i.split('.')
    #         if not i_parts[0] == 'fc':
    #             new_params['encoder.' + '.'.join(i_parts[:])] = saved_state_dict[i]
    #     seg_model.load_state_dict(new_params)
    #     print('loading params w/o fc')
    # else:
    #     seg_model.load_state_dict(saved_state_dict)
    #     print('loading params all')
    model = DataParallelModel(seg_model)
    model.float()
    model.cuda()

    # define dataloader
    train_loader = data.DataLoader(TrainGenerator(root=args.root,
                                                  list_path=args.lst,
                                                  crop_size=args.crop_size,
                                                  max_scale=2.0),
                                   batch_size=args.batch_size,
                                   shuffle=True,
                                   num_workers=4,
                                   pin_memory=True)

    # define criterion & optimizer
    criterion = ReportLovaszLoss(ignore_index=args.ignore_label, only_present=True)
    criterion = DataParallelCriterion(criterion).cuda()
    # Only trainable parameters of the *unwrapped* model are optimized.
    optimizer = optim.SGD(
        [{
            'params': filter(lambda p: p.requires_grad, seg_model.parameters()),
            'lr': args.learning_rate
        }],
        lr=args.learning_rate,
        momentum=0.9,
        weight_decay=5e-4)

    start = time.time()
    for epoch in range(0, args.epochs):
        print('\n{} | {}'.format(epoch, args.epochs - 1))
        # training
        _ = train(model, train_loader, epoch, criterion, optimizer, writer)
        if epoch == args.epochs - 1:
            # Save the unwrapped module's weights so the checkpoint can be
            # loaded without the DataParallel wrapper.
            model_dir = os.path.join(args.snapshot_dir, args.method + '_final.pth')
            torch.save(seg_model.state_dict(), model_dir)
            print('Model saved to %s' % model_dir)
    print('Complete using', time.time() - start, 'seconds')
def __init__(self, args):
    """Build the full DeepLab training state from parsed CLI args:
    saver, tensorboard writer, dataloaders, network, optimizer, criterion,
    evaluator, LR scheduler, and (optionally) a resumed checkpoint."""
    self.args = args
    # Define Saver
    self.saver = Saver(args)
    self.saver.save_experiment_config()
    # Define Tensorboard Summary
    self.summary = TensorboardSummary(self.saver.experiment_dir)
    self.writer = self.summary.create_summary()
    # Define Dataloader
    kwargs = {'num_workers': args.workers, 'pin_memory': True}
    self.train_loader, self.val_loader, self.test_loader, self.nclass = make_data_loader(
        args, **kwargs)
    # Define network
    model = DeepLab(num_classes=self.nclass,
                    backbone=args.backbone,
                    output_stride=args.out_stride,
                    sync_bn=args.sync_bn,
                    freeze_bn=args.freeze_bn)
    # Backbone learns at args.lr; decoder/head at 10x.
    train_params = [{
        'params': model.get_1x_lr_params(),
        'lr': args.lr
    }, {
        'params': model.get_10x_lr_params(),
        'lr': args.lr * 10
    }]
    # Define Optimizer
    optimizer = torch.optim.SGD(train_params,
                                momentum=args.momentum,
                                weight_decay=args.weight_decay,
                                nesterov=args.nesterov)
    # Define Criterion
    # whether to use class balanced weights
    if args.use_balanced_weights:
        classes_weights_path = os.path.join(
            Path.db_root_dir(args.dataset),
            args.dataset + '_classes_weights.npy')
        # Reuse cached weights when present; otherwise compute and use them.
        if os.path.isfile(classes_weights_path):
            weight = np.load(classes_weights_path)
        else:
            weight = calculate_weigths_labels(args.dataset, self.train_loader,
                                              self.nclass)
        weight = torch.from_numpy(weight.astype(np.float32))
    else:
        weight = None
    self.criterion = SegmentationLosses(
        weight=weight, cuda=args.cuda).build_loss(mode=args.loss_type)
    # Wrap for data parallelism only when several GPUs are configured.
    if len(args.gpu_ids) > 1:
        self.model = DataParallelModel(model)
    else:
        self.model = model
    self.optimizer = optimizer
    # Define Evaluator
    self.evaluator = Evaluator(self.nclass)
    # Define lr scheduler
    self.scheduler = LR_Scheduler(args.lr_scheduler, args.lr, args.epochs,
                                  len(self.train_loader))
    # Using cuda
    if args.cuda:
        # self.model = torch.nn.DataParallel(self.model, device_ids=self.args.gpu_ids)
        # patch_replication_callback(self.model)
        self.model = self.model.cuda()
    # Resuming checkpoint
    self.best_pred = 0.0
    if args.resume is not None:
        if not os.path.isfile(args.resume):
            raise RuntimeError("=> no checkpoint found at '{}'".format(
                args.resume))
        checkpoint = torch.load(args.resume)
        args.start_epoch = checkpoint['epoch']
        # NOTE(review): the cuda path assumes self.model has a `.module`
        # attribute (i.e. was wrapped), but wrapping only happens when
        # len(args.gpu_ids) > 1 — single-GPU cuda resume looks broken; confirm.
        if args.cuda:
            self.model.module.load_state_dict(checkpoint['state_dict'])
        else:
            self.model.load_state_dict(checkpoint['state_dict'])
        if not args.ft:
            self.optimizer.load_state_dict(checkpoint['optimizer'])
        self.best_pred = checkpoint['best_pred']
        print("=> loaded checkpoint '{}' (epoch {})".format(
            args.resume, checkpoint['epoch']))
    # Clear start epoch if fine-tuning
    if args.ft:
        args.start_epoch = 0
class Trainer(object):
    """DeepLab semantic-segmentation trainer: owns the dataloaders, model,
    optimizer, criterion, evaluator and checkpointing for one experiment."""

    def __init__(self, args):
        """Build all training state from parsed CLI args, optionally
        resuming from a checkpoint (args.resume)."""
        self.args = args
        # Define Saver
        self.saver = Saver(args)
        self.saver.save_experiment_config()
        # Define Tensorboard Summary
        self.summary = TensorboardSummary(self.saver.experiment_dir)
        self.writer = self.summary.create_summary()
        # Define Dataloader
        kwargs = {'num_workers': args.workers, 'pin_memory': True}
        self.train_loader, self.val_loader, self.test_loader, self.nclass = make_data_loader(
            args, **kwargs)
        # Define network
        model = DeepLab(num_classes=self.nclass,
                        backbone=args.backbone,
                        output_stride=args.out_stride,
                        sync_bn=args.sync_bn,
                        freeze_bn=args.freeze_bn)
        # Backbone learns at args.lr; decoder/head at 10x.
        train_params = [{
            'params': model.get_1x_lr_params(),
            'lr': args.lr
        }, {
            'params': model.get_10x_lr_params(),
            'lr': args.lr * 10
        }]
        # Define Optimizer
        optimizer = torch.optim.SGD(train_params,
                                    momentum=args.momentum,
                                    weight_decay=args.weight_decay,
                                    nesterov=args.nesterov)
        # Define Criterion
        # whether to use class balanced weights
        if args.use_balanced_weights:
            classes_weights_path = os.path.join(
                Path.db_root_dir(args.dataset),
                args.dataset + '_classes_weights.npy')
            # Reuse cached class weights when present; otherwise compute them.
            if os.path.isfile(classes_weights_path):
                weight = np.load(classes_weights_path)
            else:
                weight = calculate_weigths_labels(args.dataset,
                                                  self.train_loader,
                                                  self.nclass)
            weight = torch.from_numpy(weight.astype(np.float32))
        else:
            weight = None
        self.criterion = SegmentationLosses(
            weight=weight, cuda=args.cuda).build_loss(mode=args.loss_type)
        # Wrap for data parallelism only when several GPUs are configured.
        if len(args.gpu_ids) > 1:
            self.model = DataParallelModel(model)
        else:
            self.model = model
        self.optimizer = optimizer
        # Define Evaluator
        self.evaluator = Evaluator(self.nclass)
        # Define lr scheduler
        self.scheduler = LR_Scheduler(args.lr_scheduler, args.lr, args.epochs,
                                      len(self.train_loader))
        # Using cuda
        if args.cuda:
            # self.model = torch.nn.DataParallel(self.model, device_ids=self.args.gpu_ids)
            # patch_replication_callback(self.model)
            self.model = self.model.cuda()
        # Resuming checkpoint
        self.best_pred = 0.0
        if args.resume is not None:
            if not os.path.isfile(args.resume):
                raise RuntimeError("=> no checkpoint found at '{}'".format(
                    args.resume))
            checkpoint = torch.load(args.resume)
            args.start_epoch = checkpoint['epoch']
            # NOTE(review): the cuda path assumes self.model was wrapped
            # (has `.module`), but wrapping only happens with >1 GPU —
            # single-GPU cuda resume looks broken; confirm.
            if args.cuda:
                self.model.module.load_state_dict(checkpoint['state_dict'])
            else:
                self.model.load_state_dict(checkpoint['state_dict'])
            if not args.ft:
                self.optimizer.load_state_dict(checkpoint['optimizer'])
            self.best_pred = checkpoint['best_pred']
            print("=> loaded checkpoint '{}' (epoch {})".format(
                args.resume, checkpoint['epoch']))
        # Clear start epoch if fine-tuning
        if args.ft:
            args.start_epoch = 0

    def training(self, epoch):
        """Run one training epoch; logs per-iteration loss and periodically
        visualizes predictions to tensorboard."""
        train_loss = 0.0
        self.model.train()
        tbar = tqdm(self.train_loader)
        num_img_tr = len(self.train_loader)
        for i, sample in enumerate(tbar):
            image, target = sample['image'], sample['label']
            if self.args.cuda:
                image, target = image.cuda(), target.cuda()
            # Skip singleton batches (batchnorm cannot handle batch size 1).
            if image.shape[0] == 1:
                continue
            self.scheduler(self.optimizer, i, epoch, self.best_pred)
            self.optimizer.zero_grad()
            output = self.model(image)
            #print(output.shape, target.shape)
            loss = self.criterion(output, target)
            loss.backward()
            self.optimizer.step()
            train_loss += loss.item()
            tbar.set_description('Train loss: %.3f' % (train_loss / (i + 1)))
            self.writer.add_scalar('train/total_loss_iter', loss.item(),
                                   i + num_img_tr * epoch)
            # Show 10 * 3 inference results each epoch
            # NOTE(review): num_img_tr // 10 is 0 when the loader has fewer
            # than 10 batches, making this a division by zero — confirm.
            if i % (num_img_tr // 10) == 0:
                global_step = i + num_img_tr * epoch
                self.summary.visualize_image(self.writer, self.args.dataset,
                                             image, target, output,
                                             global_step)
        self.writer.add_scalar('train/total_loss_epoch', train_loss, epoch)
        print('[Epoch: %d, numImages: %5d]' %
              (epoch, i * self.args.batch_size + image.data.shape[0]))
        print('Loss: %.3f' % train_loss)
        if self.args.no_val:
            # save checkpoint every epoch
            is_best = False
            # NOTE(review): assumes self.model has `.module` (wrapped model);
            # see the resume path above.
            self.saver.save_checkpoint(
                {
                    'epoch': epoch + 1,
                    'state_dict': self.model.module.state_dict(),
                    'optimizer': self.optimizer.state_dict(),
                    'best_pred': self.best_pred,
                }, is_best)

    def validation(self, epoch):
        """Evaluate on the validation split; checkpoints when mIoU improves."""
        self.model.eval()
        self.evaluator.reset()
        tbar = tqdm(self.val_loader, desc='\r')
        test_loss = 0.0
        for i, sample in enumerate(tbar):
            image, target = sample['image'], sample['label']
            if self.args.cuda:
                image, target = image.cuda(), target.cuda()
            with torch.no_grad():
                output = self.model(image)
            loss = self.criterion(output, target)
            test_loss += loss.item()
            tbar.set_description('Test loss: %.3f' % (test_loss / (i + 1)))
            pred = output.data.cpu().numpy()
            target = target.cpu().numpy()
            # Per-pixel class = argmax over the channel axis.
            pred = np.argmax(pred, axis=1)
            # Add batch sample into evaluator
            self.evaluator.add_batch(target, pred)
        # Fast test during the training
        Acc = self.evaluator.Pixel_Accuracy()
        Acc_class = self.evaluator.Pixel_Accuracy_Class()
        mIoU = self.evaluator.Mean_Intersection_over_Union()
        FWIoU = self.evaluator.Frequency_Weighted_Intersection_over_Union()
        self.writer.add_scalar('val/total_loss_epoch', test_loss, epoch)
        self.writer.add_scalar('val/mIoU', mIoU, epoch)
        self.writer.add_scalar('val/Acc', Acc, epoch)
        self.writer.add_scalar('val/Acc_class', Acc_class, epoch)
        self.writer.add_scalar('val/fwIoU', FWIoU, epoch)
        print('Validation:')
        print('[Epoch: %d, numImages: %5d]' %
              (epoch, i * self.args.batch_size + image.data.shape[0]))
        print("Acc:{}, Acc_class:{}, mIoU:{}, fwIoU: {}".format(
            Acc, Acc_class, mIoU, FWIoU))
        print('Loss: %.3f' % test_loss)
        # mIoU is the model-selection metric.
        new_pred = mIoU
        if new_pred > self.best_pred:
            is_best = True
            self.best_pred = new_pred
            self.saver.save_checkpoint(
                {
                    'epoch': epoch + 1,
                    'state_dict': self.model.module.state_dict(),
                    'optimizer': self.optimizer.state_dict(),
                    'best_pred': self.best_pred,
                }, is_best)
if not os.path.exists(snapshot_dir): os.makedirs(snapshot_dir) deeplab = get_model(num_classes=num_classes) # load pretrained ResNet101 backbone: saved_state_dict = torch.load(restore_from) new_params = deeplab.state_dict().copy() for i in saved_state_dict: i_parts = i.split('.') if not i_parts[0] == 'fc' and not i_parts[ 0] == 'last_linear' and not i_parts[0] == 'classifier': new_params['.'.join(i_parts[0:])] = saved_state_dict[i] deeplab.load_state_dict(new_params) model = DataParallelModel(deeplab) model.train() model.float() model.cuda() criterion = CriterionCrossEntropy() criterion = DataParallelCriterion(criterion) criterion.cuda() train_dataset = DatasetCityscapesAugmentation(root=data_dir, list_path=data_list, max_iters=num_steps * batch_size, crop_size=crop_size) train_loader = data.DataLoader(dataset=train_dataset, batch_size=batch_size,
def train_net(ctx, pretrained, epoch, prefix, begin_epoch, end_epoch, lr, lr_step):
    """Train a Faster R-CNN on PASCAL VOC across the devices in ``ctx`` and
    evaluate VOC07 mAP after every epoch."""
    # Fixed seeds for reproducibility.
    mx.random.seed(3)
    np.random.seed(3)
    # One image per device.
    batch_size = len(ctx)
    backbone = ResNetV1()
    feat_symbol = backbone(mx.symbol.var(name="data"))[0]
    net = FasterRCNN(config, backbone)
    params = net.collect_params()
    params_pretrained = None
    # # uncommit the following line to load pretrained model.
    # params_pretrained = mx.nd.load("pretrained/rfcn-voc-resnet50_v1--29-0.804082102562.params")
    if params_pretrained is not None:
        for k in params.keys():
            try:
                params[k]._load_init(params_pretrained[k], mx.cpu())
            except Exception as e:
                # Best-effort load: missing/mismatched keys are logged, not fatal.
                logging.exception(e)
    # Initialize parameters not covered above: zeros for bias/offset
    # parameters, normal noise otherwise; a param's own init wins if set.
    for key in params.keys():
        if params[key]._data is None:
            default_init = mx.init.Zero(
            ) if "bias" in key or "offset" in key else mx.init.Normal()
            default_init.set_verbosity(True)
            if params[key].init is not None:
                params[key].init.set_verbosity(True)
                params[key].initialize(init=params[key].init,
                                       default_init=params[key].init)
            else:
                params[key].initialize(default_init=default_init)
    net.collect_params().reset_ctx(list(set(ctx)))

    import data.transforms.bbox as bbox_t
    train_transforms = bbox_t.Compose([
        # bbox_t.RandomRotate(bound=True, min_angle=-15, max_angle=15),
        bbox_t.Resize(target_size=config.SCALES[0][0], max_size=config.SCALES[0][1]),
        bbox_t.Normalize(),
        bbox_t.AssignAnchor(config, feat_strides=(16, 16), symbol=feat_symbol)
    ])
    # NOTE(review): val_transforms is built but never used below — confirm.
    val_transforms = bbox_t.Compose([
        bbox_t.Resize(target_size=config.SCALES[0][0], max_size=config.SCALES[0][1]),
        bbox_t.Normalize(),
    ])
    train_dataset = VOCDetection(root=config.dataset.dataset_path,
                                 splits=((2007, 'trainval'), (2012, 'trainval')),
                                 transform=train_transforms)
    val_dataset = VOCDetection(root=config.dataset.dataset_path,
                               splits=((2007, 'test'), ))
    train_loader = DataLoader(train_dataset, batchsize=len(ctx))
    # train_loader = mx.gluon.data.DataLoader(dataset=train_dataset, batch_size=len(ctx), batchify_fn=lambda x: x,
    #                                         pin_memory=True, num_workers=8, last_batch="discard")

    # Metrics: RPN accuracy plus the four loss terms.
    rpn_eval_metric = RPNAccuMetric()
    loss_rpn_cls_metric = mx.metric.Loss(name="rpn_cls")
    loss_rpn_loc_metric = mx.metric.Loss(name="rpn_loc")
    loss_rcnn_cls_metric = mx.metric.Loss(name="rcnn_cls")
    loss_rcnn_loc_metric = mx.metric.Loss(name="rcnn_loc")
    eval_metrics = mx.metric.CompositeEvalMetric()
    for child_metric in [
            rpn_eval_metric, loss_rpn_cls_metric, loss_rpn_loc_metric,
            loss_rcnn_cls_metric, loss_rcnn_loc_metric
    ]:
        eval_metrics.add(child_metric)

    # Freeze parameters whose names match config.network.FIXED_PARAMS.
    params_all = net.collect_params()
    params_to_train = {}
    params_fixed_prefix = config.network.FIXED_PARAMS
    for p in params_all.keys():
        ignore = False
        for f in params_fixed_prefix:
            if f in str(p):
                ignore = True
                params_all[p].grad_req = 'null'
                logging.info("{} is ignored when training.".format(p))
        if not ignore:
            params_to_train[p] = params_all[p]

    # Step LR schedule: decay at the configured epochs, converted to
    # iteration counts; decays already passed are applied to the base LR.
    base_lr = lr
    lr_factor = config.TRAIN.lr_factor
    lr_epoch = [float(epoch) for epoch in lr_step.split(',')]
    lr_epoch_diff = [
        epoch - begin_epoch for epoch in lr_epoch if epoch > begin_epoch
    ]
    lr = base_lr * (lr_factor**(len(lr_epoch) - len(lr_epoch_diff)))
    lr_iters = [
        int(epoch * len(train_dataset) / batch_size) for epoch in lr_epoch_diff
    ]
    print('lr', lr, 'lr_epoch_diff', lr_epoch_diff, 'lr_iters', lr_iters)
    lr_scheduler = WarmupMultiFactorScheduler(lr_iters, lr_factor,
                                              config.TRAIN.warmup,
                                              config.TRAIN.warmup_lr,
                                              config.TRAIN.warmup_step)

    # NOTE(review): params_to_train is computed above but ALL parameters are
    # handed to the trainer here (frozen ones keep grad_req='null'); the
    # sibling train_net in this file passes params_to_train instead —
    # confirm which is intended.
    trainer = mx.gluon.Trainer(
        net.collect_params(),
        # fix batchnorm, fix first stage, etc...
        'sgd',
        {
            'learning_rate': config.TRAIN.lr,
            'wd': config.TRAIN.wd,
            'momentum': config.TRAIN.momentum,
            'clip_gradient': None,
            'lr_scheduler': lr_scheduler
        })
    val_metric_5 = VOC07MApMetric(iou_thresh=.5)
    net_with_criterion = RCNNWithCriterion(base_net=net)
    net_parallel = DataParallelModel(net_with_criterion, ctx_list=ctx, sync=True)
    for epoch in range(begin_epoch, config.TRAIN.end_epoch):
        # train_data.reset()
        net.hybridize(static_alloc=True, static_shape=False)
        # Warm-up forward pass so the cached graph is allocated once.
        _ = net(mx.random.randn(1, 3, 512, 512, ctx=ctx[0]),
                mx.nd.array([[512, 512, 1]], ctx=ctx[0]))
        for nbatch, data_batch in enumerate(
                tqdm.tqdm(train_loader, total=len(train_dataset) // batch_size)):
            # Scatter one per-device input list to each context.
            inputs = [[x.as_in_context(c) for x in d]
                      for c, d in zip(ctx, data_batch)]
            losses = []
            with ag.record():
                outputs = net_parallel(*inputs)
                for output in outputs:
                    loss_rpn_cls, loss_rpn_loc, loss_rcnn_cls, loss_rcnn_loc, rpn_label, rpn_cls_score = output
                    # Metrics are sampled every 4th batch to cut overhead.
                    if nbatch % 4 == 0:
                        rpn_eval_metric.update(rpn_label, rpn_cls_score)
                        loss_rpn_cls_metric.update(None, loss_rpn_cls)
                        loss_rpn_loc_metric.update(None, loss_rpn_loc)
                        loss_rcnn_cls_metric.update(None, loss_rcnn_cls)
                        loss_rcnn_loc_metric.update(None, loss_rcnn_loc)
                    losses.extend([
                        loss_rpn_cls, loss_rpn_loc, loss_rcnn_cls,
                        loss_rcnn_loc
                    ])
            ag.backward(losses)
            # NOTE(review): step(1) here vs step(len(ctx)) in the sibling
            # train_net — confirm the intended gradient normalization.
            trainer.step(1, ignore_stale_grad=True)
            if nbatch % 100 == 0:
                msg = ','.join([
                    '{}={:.3f}'.format(w, v)
                    for w, v in zip(*eval_metrics.get())
                ])
                msg += ",lr={}".format(trainer.learning_rate)
                logging.info(msg)
                rpn_eval_metric.reset()
        # Per-epoch VOC07 mAP evaluation with multi-scale augmented detection.
        val_metric_5.reset()
        net.hybridize(static_alloc=True, static_shape=False)
        for i in tqdm.tqdm(range(len(val_dataset))):
            img_path, gt_boxes = val_dataset.at_with_image_path(i)
            pred_bboxes, pred_scores, pred_clsid = im_detect_bbox_aug(
                net,
                nms_threshold=config.TEST.NMS,
                im=cv2.imread(img_path)[:, :, ::-1],  # bgr
                scales=config.SCALES,
                ctx=ctx,
                bbox_stds=config.TRAIN.BBOX_STDS,
                threshold=1e-3,
                viz=False)
            val_metric_5.update(pred_bboxes=pred_bboxes[np.newaxis],
                                pred_labels=pred_clsid[np.newaxis] - 1,
                                pred_scores=pred_scores[np.newaxis],
                                gt_bboxes=gt_boxes[np.newaxis, :, :4],
                                gt_labels=gt_boxes[np.newaxis, :, 4],
                                gt_difficults=gt_boxes[np.newaxis, :, 5])
        re = val_metric_5.get()
        logging.info(re)
        save_path = "{}-{}-{}.params".format(config.TRAIN.model_prefix, epoch,
                                             re[1])
        net.collect_params().save(save_path)
        logging.info("Saved checkpoint to {}.".format(save_path))
def train_net(ctx, begin_epoch, lr, lr_step):
    """Train a pyramid R-FCN detector on MS-COCO across the devices in ``ctx``.

    Args:
        ctx: list of mxnet contexts; effective batch size is len(ctx).
        begin_epoch: epoch to resume from (shifts the LR decay schedule).
        lr: base learning rate before already-passed decays are applied.
        lr_step: comma-separated epoch numbers at which the LR decays.
    """
    # Fixed seeds for reproducibility.
    mx.random.seed(3)
    np.random.seed(3)
    batch_size = len(ctx)
    backbone = ResNetV1(num_devices=len(set(ctx)), num_layers=50,
                        sync_bn=config.network.SYNC_BN, pretrained=True)
    feat_symbol = backbone(mx.sym.var(name="data"))
    net = PyramidRFCN(config, backbone)

    # Resume parameters. Set `resume` to a .params path to load a checkpoint;
    # "arg:"/"aux:" prefixes from legacy symbol checkpoints are stripped.
    resume = None
    if resume is not None:
        params_coco = mx.nd.load(resume)
        for k in params_coco:
            params_coco[k.replace("arg:", "").replace("aux:", "")] = params_coco.pop(k)
        params = net.collect_params()
        for k in params.keys():
            try:
                params[k]._load_init(params_coco[k], ctx=mx.cpu())
            except Exception as e:
                # Best-effort load: mismatched keys are logged, not fatal.
                logging.exception(e)

    # Initialize parameters not covered above: zeros for bias/offset
    # parameters, normal noise otherwise; a param's own init wins when it
    # supports verbosity, else it falls back to the default initializer.
    params = net.collect_params()
    for key in params.keys():
        if params[key]._data is None:
            default_init = mx.init.Zero() if "bias" in key or "offset" in key else mx.init.Normal()
            default_init.set_verbosity(True)
            if params[key].init is not None and hasattr(params[key].init, "set_verbosity"):
                params[key].init.set_verbosity(True)
                params[key].initialize(init=params[key].init, default_init=params[key].init)
            else:
                params[key].initialize(default_init=default_init)
    net.collect_params().reset_ctx(list(set(ctx)))

    import data.transforms.bbox as bbox_t
    train_transforms = bbox_t.Compose([
        # Flipping is implemented in dataset.
        # bbox_t.RandomRotate(bound=True, min_angle=-15, max_angle=15),
        bbox_t.Resize(target_size=config.SCALES[0][0], max_size=config.SCALES[0][1]),
        # bbox_t.RandomResize(scales=[(960, 2000), (800, 1600), (600, 1200)]),
        bbox_t.Normalize(),
        bbox_t.AssignPyramidAnchor(config, symbol=feat_symbol, pad_n=32)
    ])
    # NOTE: val_transforms is kept for the (currently disabled) validation.
    val_transforms = bbox_t.Compose([
        bbox_t.Resize(target_size=config.SCALES[0][0], max_size=config.SCALES[0][1]),
        bbox_t.Normalize(),
    ])
    from data.bbox.mscoco import COCODetection
    # Fix: the original had a stray trailing dot after dataset_path (SyntaxError).
    val_dataset = COCODetection(root=config.dataset.dataset_path,
                                splits=("instances_val2017",), h_flip=False)
    train_dataset = COCODetection(root=config.dataset.dataset_path,
                                  splits=("instances_train2017",),
                                  h_flip=config.TRAIN.FLIP,
                                  transform=train_transforms)
    # val_dataset = YunChongDataSet(is_train=False, h_flip=False)
    # train_loader = DataLoader(train_dataset, batchsize=len(ctx))
    train_loader = mx.gluon.data.DataLoader(dataset=train_dataset, batch_size=len(ctx),
                                            batchify_fn=batch_fn, pin_memory=False,
                                            num_workers=0, last_batch="discard",
                                            shuffle=True)

    # Metrics: RPN accuracy plus the four loss terms.
    rpn_eval_metric = RPNAccuMetric()
    loss_rpn_cls_metric = mx.metric.Loss(name="rpn_cls")
    loss_rpn_loc_metric = mx.metric.Loss(name="rpn_loc")
    loss_rcnn_cls_metric = mx.metric.Loss(name="rcnn_cls")
    loss_rcnn_loc_metric = mx.metric.Loss(name="rcnn_loc")
    eval_metrics = mx.metric.CompositeEvalMetric()
    for child_metric in [rpn_eval_metric, loss_rpn_cls_metric, loss_rpn_loc_metric,
                         loss_rcnn_cls_metric, loss_rcnn_loc_metric]:
        eval_metrics.add(child_metric)

    # Freeze parameters whose names match config.network.FIXED_PARAMS.
    params_all = net.collect_params()
    params_to_train = {}
    params_fixed_prefix = config.network.FIXED_PARAMS
    for p in params_all.keys():
        ignore = False
        if params_fixed_prefix is not None:
            for f in params_fixed_prefix:
                if f in str(p):
                    ignore = True
                    params_all[p].grad_req = 'null'
                    logging.info("{} is ignored when training.".format(p))
        if not ignore:
            params_to_train[p] = params_all[p]

    # Step LR schedule: decay epochs converted to iteration counts;
    # decays already passed (<= begin_epoch) are applied to the base LR.
    base_lr = lr
    lr_factor = config.TRAIN.lr_factor
    lr_epoch = [float(epoch) for epoch in lr_step.split(',')]
    lr_epoch_diff = [epoch - begin_epoch for epoch in lr_epoch if epoch > begin_epoch]
    lr = base_lr * (lr_factor ** (len(lr_epoch) - len(lr_epoch_diff)))
    lr_iters = [int(epoch * len(train_dataset) / batch_size) for epoch in lr_epoch_diff]
    print('lr', lr, 'lr_epoch_diff', lr_epoch_diff, 'lr_iters', lr_iters)
    lr_scheduler = WarmupMultiFactorScheduler(lr_iters, lr_factor, config.TRAIN.warmup,
                                              config.TRAIN.warmup_lr, config.TRAIN.warmup_step)

    trainer = mx.gluon.Trainer(
        params_to_train,  # fix batchnorm, fix first stage, etc...
        'sgd',
        {'learning_rate': config.TRAIN.lr,
         'wd': config.TRAIN.wd,
         'momentum': config.TRAIN.momentum,
         'clip_gradient': None,
         'lr_scheduler': lr_scheduler
         })
    val_metric_5 = VOC07MApMetric(iou_thresh=.5)
    net_with_criterion = RCNNWithCriterion(base_net=net)
    # Fix: `is 1` relied on CPython small-int identity caching; use ==.
    net_parallel = DataParallelModel(net_with_criterion, ctx_list=ctx,
                                     sync=(config.network.IM_PER_GPU == 1))
    for epoch in range(begin_epoch, config.TRAIN.end_epoch):
        eval_metrics.reset()
        net.feature_extractor.hybridize(static_alloc=True, static_shape=False)
        # Warm-up forward pass so the cached graph is allocated once.
        _ = net(mx.random.randn(1, 3, 512, 512, ctx=ctx[0]),
                mx.nd.array([[512, 512, 1]], ctx=ctx[0]))
        for nbatch, data_batch in enumerate(tqdm.tqdm(train_loader,
                                                      total=len(train_dataset) // batch_size,
                                                      unit_scale=batch_size)):
            # Scatter one per-device input list to each context.
            inputs = [[x.as_in_context(c) for x in d] for c, d in zip(ctx, data_batch)]
            losses = []
            net.collect_params().zero_grad()
            with ag.record():
                outputs = net_parallel(*inputs)
                for output in outputs:
                    loss_rpn_cls, loss_rpn_loc, loss_rcnn_cls, loss_rcnn_loc, rpn_label, rpn_cls_score = output
                    # Metrics are sampled every 4th batch to cut overhead.
                    if nbatch % 4 == 0:
                        rpn_eval_metric.update(rpn_label, rpn_cls_score)
                        loss_rpn_cls_metric.update(None, loss_rpn_cls)
                        loss_rpn_loc_metric.update(None, loss_rpn_loc)
                        loss_rcnn_cls_metric.update(None, loss_rcnn_cls)
                        loss_rcnn_loc_metric.update(None, loss_rcnn_loc)
                    losses.extend([loss_rpn_cls, loss_rpn_loc, loss_rcnn_cls, loss_rcnn_loc])
            ag.backward(losses)
            trainer.step(len(ctx), ignore_stale_grad=True)
            if nbatch % 100 == 0:
                msg = ','.join(['{}={:.3f}'.format(w, v) for w, v in zip(*eval_metrics.get())])
                msg += ",lr={}".format(trainer.learning_rate)
                logging.info(msg)
                rpn_eval_metric.reset()
            # Periodic mid-epoch checkpoint (weights + trainer state).
            if nbatch % 10000 == 0:
                save_path = "{}-{}-{}.params".format(config.TRAIN.model_prefix, epoch, nbatch)
                net.collect_params().save(save_path)
                trainer.save_states(config.TRAIN.model_prefix + "-trainer.states")
                logging.info("Saved checkpoint to {}.".format(save_path))
        # Per-epoch COCO validation is currently disabled (it used
        # im_detect_bbox_aug + VOC07MApMetric); a placeholder mAP is logged
        # so checkpoint filenames stay consistent.
        re = ("mAP", "0.0")
        logging.info(re)
        save_path = "{}-{}-{}.params".format(config.TRAIN.model_prefix, epoch, re[1])
        net.collect_params().save(save_path)
        trainer.save_states(config.TRAIN.model_prefix + "-trainer.states")
        logging.info("Saved checkpoint to {}.".format(save_path))
class Trainer(CheckpointRunner):
    """Generic training runner: builds model/optimizer/scheduler/criterion and runs the epoch loop.

    Subclasses must override ``init_model`` and ``init_loss_functions`` (and
    optionally ``init_auxiliary``).
    """

    # noinspection PyAttributeOutsideInit
    def init_fn(self, shared_model=None, **kwargs):
        """Set up model, optimizer, LR scheduler, criterion and loss meters.

        If ``shared_model`` is given it is reused as-is (assumed to already be
        wrapped/on-device); otherwise a fresh model is built and wrapped in
        DataParallelModel on ``self.gpus``.
        """
        # Create auxiliary models
        self.init_auxiliary()

        if shared_model is not None:
            self.model = shared_model
        else:
            self.model = self.init_model()
            self.model = DataParallelModel(self.model.cuda(), device_ids=self.gpus)

        # Setup a joint optimizer for the 2 models
        self.optimizer = self.init_optimizer(self.options.optim.name)
        self.lr_scheduler = self.init_lr(self.options.optim.lr_scheduler)

        # Create loss functions (criterion is parallelized to match the model)
        self.criterion = self.init_loss_functions()
        self.criterion = DataParallelCriterion(self.criterion.cuda(), device_ids=self.gpus)

        # Create AverageMeters for losses
        self.losses = AverageMeter()

        self.dataset_size = None

    def init_auxiliary(self):
        """Hook for subclasses that need auxiliary models; default is a no-op."""
        pass

    def init_model(self):
        raise NotImplementedError("Your model is not found")

    def init_loss_functions(self):
        raise NotImplementedError("Your loss is not found")

    def init_optimizer(self, optim_name):
        """Build the optimizer named by ``optim_name``.

        Returns a single optimizer for "adam"/"sgd", or a dict of two
        optimizers for "adam_gan" (discriminator + generator).
        Raises NotImplementedError for unknown names.
        """
        if optim_name == "adam":
            optimizer = torch.optim.Adam(
                params=list(self.model.parameters()),
                lr=self.options.optim.lr,
                betas=(self.options.optim.adam_beta1, 0.999),
                weight_decay=self.options.optim.wd)
        elif optim_name == "sgd":
            optimizer = torch.optim.SGD(
                params=list(self.model.parameters()),
                lr=self.options.optim.lr,
                momentum=self.options.optim.sgd_momentum,
                weight_decay=self.options.optim.wd)
        elif optim_name == "adam_gan":
            optimizer_d = torch.optim.Adam(
                params=list(self.model.module.D.parameters()),
                lr=self.options.optim.lr_d,
                betas=(self.options.optim.adam_beta1, 0.999),
                weight_decay=0)
            optimizer_g = torch.optim.Adam(
                params=list(self.model.module.G.parameters()),
                lr=self.options.optim.lr_g,
                betas=(self.options.optim.adam_beta1, 0.999),
                weight_decay=0)
            return {"optimizer_d": optimizer_d, "optimizer_g": optimizer_g}
        else:
            raise NotImplementedError("Your optimizer is not found")
        return optimizer

    def init_lr(self, lr_scheduler_name):
        """Build the LR scheduler named by ``lr_scheduler_name``; None if unknown.

        For "multistep_gan" only the discriminator optimizer is scheduled.
        """
        if lr_scheduler_name == "multistep":
            lr_scheduler = torch.optim.lr_scheduler.MultiStepLR(
                self.optimizer, self.options.optim.lr_step, self.options.optim.lr_factor)
        elif lr_scheduler_name == "exp":
            lr_scheduler = torch.optim.lr_scheduler.ExponentialLR(
                self.optimizer, gamma=self.options.optim.lr_gamma)
        elif lr_scheduler_name == "multistep_gan":
            lr_scheduler = torch.optim.lr_scheduler.MultiStepLR(
                self.optimizer["optimizer_d"], self.options.optim.lr_step,
                self.options.optim.lr_factor)
        else:
            # BUGFIX: original assigned `r_scheduler = None` (typo), which made
            # `return lr_scheduler` raise UnboundLocalError for unknown names.
            lr_scheduler = None
        return lr_scheduler

    def models_dict(self):
        return {'model': self.model}

    def optimizers_dict(self):
        return {'optimizer': self.optimizer,
                'lr_scheduler': self.lr_scheduler}

    def train_step(self, input_batch):
        """Run one forward/backward/update step; return detached outputs and loss summary."""
        # Grab data from the batch, predict with model
        out = self.model(input_batch)

        # compute loss
        loss, loss_summary = self.criterion(out, input_batch)
        self.losses.update(loss.detach().cpu().item())

        # Do backprop
        self.optimizer.zero_grad()
        loss.backward()
        self.optimizer.step()

        # Pack output arguments to be used for visualization
        return recursive_detach(out), recursive_detach(loss_summary)

    def get_dataloader(self):
        # Effective batch size is per-GPU batch size times the number of GPUs.
        data_loader = DataLoader(self.dataset,
                                 batch_size=self.options.train.batch_size * self.options.num_gpus,
                                 num_workers=self.options.num_workers,
                                 pin_memory=self.options.pin_memory,
                                 shuffle=self.options.train.shuffle)
        return data_loader

    def train(self):
        """Run the training loop for ``options.train.num_epochs`` epochs."""
        self.logger.info("Start Trainning.")

        # Create data loader at very begining
        train_data_loader = self.get_dataloader()
        self.dataset_size = len(train_data_loader)

        # Run training for num_epochs epochs
        for epoch in range(self.epoch_count, self.options.train.num_epochs):
            self.epoch_count += 1

            # Reset loss
            self.losses.reset()

            # Iterate over all batches in an epoch
            for step, batch in enumerate(train_data_loader):
                # Send input to GPU (non-tensor entries pass through unchanged)
                batch = {k: v.cuda() if isinstance(v, torch.Tensor) else v
                         for k, v in batch.items()}

                # Run training step
                out = self.train_step(batch)
                self.step_count += 1

                # Tensorboard logging every summary_steps steps
                if self.step_count % self.options.train.summary_steps == 0:
                    self.train_summaries(batch, *out)

                # Save checkpoint every checkpoint_steps steps
                if self.step_count % self.options.train.checkpoint_steps == 0:
                    self.dump_checkpoint()

            # Non-GAN models also checkpoint once per epoch
            if not self.options.model.name.endswith('gan'):
                self.dump_checkpoint()

            if self.lr_scheduler is not None:
                self.lr_scheduler.step()

    def train_summaries(self, input_batch, out_summary, loss_summary):
        # Debug info for filenames
        self.logger.debug(input_batch["filename"])

        # Save results in Tensorboard
        self.tensorboard_step(loss_summary)

        # Save results to log
        self.log_step(loss_summary)

    def log_step(self, loss_summary):
        self.logger.info(
            "Epoch %03d, Step %06d/%06d, Time elapsed %s, Loss %.5f (AvgLoss %.5f)"
            % (self.epoch_count, self.step_count,
               self.options.train.num_epochs * len(self.dataset)
               // (self.options.train.batch_size * self.options.num_gpus),
               self.time_elapsed, self.losses.val, self.losses.avg))

    def tensorboard_step(self, loss_summary):
        for k, v in loss_summary.items():
            self.summary_writer.add_scalar(k, v, self.step_count)

    def init_with_pretrained_backbone(self):
        """Load a pre-trained backbone checkpoint into the wrapped model (non-strict)."""
        checkpoint_file = os.path.abspath(self.options.train.backbone_pretrained_model)
        pretrained_dict = torch.load(checkpoint_file)
        # `.module` unwraps the DataParallel wrapper so keys match.
        self.model.module.load_state_dict(pretrained_dict, strict=False)
        self.logger.info("Init with pre-trained backbone from %s." % checkpoint_file)

    def test(self):
        """Run all evaluators with the model in eval mode, then restore train mode."""
        self.model.eval()
        for evaluator in self.evaluators:
            evaluator.evaluate()
        self.model.train()
def main():
    """Train a segmentation network (DeepLab-style) from a pre-trained checkpoint.

    Reads all configuration from the module-level `args`; writes snapshots and
    TensorBoard summaries to `args.snapshot_dir`.
    """
    print("Input arguments:")
    for key, val in vars(args).items():
        print("{:16} {}".format(key, val))

    random.seed(args.seed)
    torch.manual_seed(args.seed)

    writer = SummaryWriter(args.snapshot_dir)

    os.environ["CUDA_VISIBLE_DEVICES"]=args.gpu
    h, w = map(int, args.input_size.split(','))
    input_size = (h, w)

    cudnn.enabled = True

    deeplab = get_segmentation_model("_".join([args.network, args.method]), num_classes=args.num_classes)

    # Remap the pre-trained checkpoint's keys into this model's state dict.
    # Each architecture family stores its weights under different prefixes.
    saved_state_dict = torch.load(args.restore_from)
    new_params = deeplab.state_dict().copy()

    if 'wide' in args.network:
        saved_state_dict = saved_state_dict['state_dict']
        if 'vistas' in args.method:
            saved_state_dict = saved_state_dict['body']
            for i in saved_state_dict:
                new_params[i] = saved_state_dict[i]
        else:
            for i in saved_state_dict:
                i_parts = i.split('.')
                # Skip classifier weights; drop the leading module prefix.
                if not 'classifier' in i_parts:
                    new_params['.'.join(i_parts[1:])] = saved_state_dict[i]
    elif 'mobilenet' in args.network:
        for i in saved_state_dict:
            i_parts = i.split('.')
            # Skip the final feature block (features.18) and the classifier head.
            if not (i_parts[0]=='features' and i_parts[1]=='18') and not i_parts[0]=='classifier':
                new_params['.'.join(i_parts[0:])] = saved_state_dict[i]
    else:
        for i in saved_state_dict:
            i_parts = i.split('.')
            # Skip any fully-connected / classifier head from the backbone checkpoint.
            if not i_parts[0]=='fc' and not i_parts[0]=='last_linear' and not i_parts[0]=='classifier':
                new_params['.'.join(i_parts[0:])] = saved_state_dict[i]

    if args.start_iters > 0:
        # NOTE(review): when resuming, this loads the RAW checkpoint dict, not
        # the remapped `new_params` — presumably the resume checkpoint was saved
        # from this exact model. Verify for 'wide'/'mobilenet' variants.
        deeplab.load_state_dict(saved_state_dict)
    else:
        deeplab.load_state_dict(new_params)

    model = DataParallelModel(deeplab)
    # model = nn.DataParallel(deeplab)
    model.train()
    model.float()
    model.cuda()

    # Default criterion; replaced below when a DSN-style method is used.
    criterion = CriterionCrossEntropy()
    if "dsn" in args.method:
        if args.ohem:
            if args.ohem_single:
                print('use ohem only for the second prediction map.')
                criterion = CriterionOhemDSN_single(thres=args.ohem_thres, min_kept=args.ohem_keep, dsn_weight=float(args.dsn_weight))
            else:
                criterion = CriterionOhemDSN(thres=args.ohem_thres, min_kept=args.ohem_keep, dsn_weight=float(args.dsn_weight), use_weight=True)
        else:
            criterion = CriterionDSN(dsn_weight=float(args.dsn_weight), use_weight=True)
    criterion = DataParallelCriterion(criterion)
    criterion.cuda()
    cudnn.benchmark = True

    if not os.path.exists(args.snapshot_dir):
        os.makedirs(args.snapshot_dir)

    trainloader = data.DataLoader(get_segmentation_dataset(args.dataset, root=args.data_dir, list_path=args.data_list,
                    max_iters=args.num_steps*args.batch_size, crop_size=input_size,
                    scale=args.random_scale, mirror=args.random_mirror, network=args.network),
                    batch_size=args.batch_size, shuffle=True, num_workers=1, pin_memory=True)

    optimizer = optim.SGD([{'params': filter(lambda p: p.requires_grad, deeplab.parameters()), 'lr': args.learning_rate }],
                lr=args.learning_rate, momentum=args.momentum, weight_decay=args.weight_decay)
    optimizer.zero_grad()

    for i_iter, batch in enumerate(trainloader):
        sys.stdout.flush()
        # Shift the iteration counter when resuming from a checkpoint.
        i_iter += args.start_iters
        images, labels, _, _ = batch
        images = Variable(images.cuda())
        labels = Variable(labels.long().cuda())

        optimizer.zero_grad()
        lr = adjust_learning_rate(optimizer, i_iter)
        if args.fix_lr:
            lr = args.learning_rate
        print('learning_rate: {}'.format(lr))

        # Some methods ('gt' variants) need the labels at forward time.
        if 'gt' in args.method:
            preds = model(images, labels)
        else:
            preds = model(images)
        loss = criterion(preds, labels)
        loss.backward()
        optimizer.step()

        if i_iter % 100 == 0:
            writer.add_scalar('learning_rate', lr, i_iter)
            writer.add_scalar('loss', loss.data.cpu().numpy(), i_iter)

        print('iter = {} of {} completed, loss = {}'.format(i_iter, args.num_steps, loss.data.cpu().numpy()))

        if i_iter >= args.num_steps-1:
            # Final snapshot, then stop.
            print('save model ...')
            torch.save(deeplab.state_dict(),osp.join(args.snapshot_dir, 'CS_scenes_'+str(args.num_steps)+'.pth'))
            break

        if i_iter % args.save_pred_every == 0:
            print('taking snapshot ...')
            torch.save(deeplab.state_dict(),osp.join(args.snapshot_dir, 'CS_scenes_'+str(i_iter)+'.pth'))

    # `start` is a module-level timestamp taken before main() runs.
    end = timeit.default_timer()
    print(end-start,'seconds')
def train_net(args, ctx, pretrained, epoch, prefix, begin_epoch, end_epoch, lr, lr_step):
    """Train an FPN detector with MXNet Gluon across the contexts in `ctx`.

    Args:
        args: parsed CLI arguments (provides `cfg`).
        ctx: list of mx.Context (one per GPU); batch size scales with len(ctx).
        pretrained / epoch: prefix and epoch of the pre-trained params to load.
        prefix: checkpoint name prefix (joined under the output path).
        begin_epoch / end_epoch: training epoch range (end comes from config).
        lr / lr_step: base learning rate and comma-separated decay epochs.
    """
    mx.random.seed(3)
    np.random.seed(3)

    logger, final_output_path = create_logger(config.output_path, args.cfg,
                                              config.dataset.image_set)
    prefix = os.path.join(final_output_path, prefix)

    # load symbol (copy the symbol source next to the outputs for reproducibility)
    shutil.copy2(os.path.join(curr_path, 'symbols', config.symbol + '.py'),
                 final_output_path)
    sym_instance = eval(config.symbol + '.' + config.symbol)()
    sym = sym_instance.get_symbol(config, is_train=True)
    feat_pyramid_level = np.log2(config.network.RPN_FEAT_STRIDE).astype(int)
    feat_sym = [
        sym.get_internals()['rpn_cls_score_p' + str(x) + '_output']
        for x in feat_pyramid_level
    ]

    # setup multi-gpu: effective batch is BATCH_IMAGES per context
    batch_size = len(ctx)
    input_batch_size = config.TRAIN.BATCH_IMAGES * batch_size

    # print config
    pprint.pprint(config)
    logger.info('training config:{}\n'.format(pprint.pformat(config)))

    # load dataset and prepare imdb for training
    image_sets = [iset for iset in config.dataset.image_set.split('+')]
    roidbs = [
        load_gt_roidb(config.dataset.dataset, image_set, config.dataset.root_path,
                      config.dataset.dataset_path, flip=config.TRAIN.FLIP)
        for image_set in image_sets
    ]
    roidb = merge_roidb(roidbs)
    roidb = filter_roidb(roidb, config)

    # load training data
    train_data = PyramidAnchorIterator(
        feat_sym, roidb, config, batch_size=input_batch_size,
        shuffle=config.TRAIN.SHUFFLE, ctx=ctx,
        feat_strides=config.network.RPN_FEAT_STRIDE,
        anchor_scales=config.network.ANCHOR_SCALES,
        anchor_ratios=config.network.ANCHOR_RATIOS,
        aspect_grouping=config.TRAIN.ASPECT_GROUPING,
        allowed_border=np.inf)

    # infer max shape so memory can be allocated up-front
    max_data_shape = [('data', (config.TRAIN.BATCH_IMAGES, 3,
                                max([v[0] for v in config.SCALES]),
                                max([v[1] for v in config.SCALES])))]
    max_data_shape, max_label_shape = train_data.infer_shape(max_data_shape)
    max_data_shape.append(('gt_boxes', (config.TRAIN.BATCH_IMAGES, 100, 5)))
    # BUGFIX: was a Python 2 print statement (SyntaxError under Python 3,
    # and inconsistent with every other print call in this function).
    print('providing maximum shape', max_data_shape, max_label_shape)

    data_shape_dict = dict(train_data.provide_data_single +
                           train_data.provide_label_single)
    pprint.pprint(data_shape_dict)
    sym_instance.infer_shape(data_shape_dict)

    # load and initialize params
    if config.TRAIN.RESUME:
        print('continue training from ', begin_epoch)
        arg_params, aux_params = load_param(prefix, begin_epoch, convert=True)
    else:
        arg_params, aux_params = load_param(pretrained, epoch, convert=True)
        # sym_instance.init_weight(config, arg_params, aux_params)

    # metric
    rpn_eval_metric = metric.RPNAccMetric()
    rpn_cls_metric = metric.RPNLogLossMetric()
    rpn_bbox_metric = metric.RPNL1LossMetric()
    rpn_fg_metric = metric.RPNFGFraction(config)
    eval_metric = metric.RCNNAccMetric(config)
    eval_fg_metric = metric.RCNNFGAccuracy(config)
    cls_metric = metric.RCNNLogLossMetric(config)
    bbox_metric = metric.RCNNL1LossMetric(config)
    eval_metrics = mx.metric.CompositeEvalMetric()
    for child_metric in [rpn_eval_metric, rpn_cls_metric, rpn_bbox_metric,
                         rpn_fg_metric, eval_fg_metric, eval_metric,
                         cls_metric, bbox_metric]:
        eval_metrics.add(child_metric)

    means = np.tile(np.array(config.TRAIN.BBOX_MEANS),
                    2 if config.CLASS_AGNOSTIC else config.dataset.NUM_CLASSES)
    stds = np.tile(np.array(config.TRAIN.BBOX_STDS),
                   2 if config.CLASS_AGNOSTIC else config.dataset.NUM_CLASSES)

    # decide learning rate: apply the decays already passed, schedule the rest
    base_lr = lr
    lr_factor = config.TRAIN.lr_factor
    lr_epoch = [float(epoch) for epoch in lr_step.split(',')]
    lr_epoch_diff = [
        epoch - begin_epoch for epoch in lr_epoch if epoch > begin_epoch
    ]
    lr = base_lr * (lr_factor**(len(lr_epoch) - len(lr_epoch_diff)))
    lr_iters = [
        int(epoch * len(roidb) / batch_size) for epoch in lr_epoch_diff
    ]
    print('lr', lr, 'lr_epoch_diff', lr_epoch_diff, 'lr_iters', lr_iters)

    lr_scheduler = WarmupMultiFactorScheduler(lr_iters, lr_factor,
                                              config.TRAIN.warmup,
                                              config.TRAIN.warmup_lr,
                                              config.TRAIN.warmup_step)
    # optimizer
    optimizer_params = {
        'momentum': config.TRAIN.momentum,
        'wd': config.TRAIN.wd,
        'learning_rate': lr,
        'lr_scheduler': lr_scheduler,
        'clip_gradient': None
    }

    if not isinstance(train_data, PrefetchingIter):
        train_data = PrefetchingIter(train_data)

    net = FPNNet(sym, args_pretrained=arg_params, auxes_pretrained=aux_params)
    # create multi-threaded DataParallel Model.
    net_parallel = DataParallelModel(net, ctx_list=ctx)
    # create trainer,
    # !Important: A trainer can be only created after the function `resnet_ctx` is called.
    # Please Note that DataParallelModel will call reset_ctx to initialize parameters on gpus.
    trainer = mx.gluon.Trainer(net.collect_params(), 'sgd', optimizer_params)

    for epoch in range(begin_epoch, config.TRAIN.end_epoch):
        train_data.reset()
        net.hybridize(static_alloc=True, static_shape=False)
        progress_bar = tqdm.tqdm(total=len(roidb))
        for nbatch, data_batch in enumerate(train_data):
            # Scatter each shard (data + label, cast to float32) to its context.
            inputs = [[x.astype('f').as_in_context(c) for x in d + l]
                      for c, d, l in zip(ctx, data_batch.data, data_batch.label)]
            with ag.record():
                outputs = net_parallel(*inputs)
            # Flatten the per-device output tuples and backprop them together.
            ag.backward(sum(outputs, ()))
            trainer.step(1)
            eval_metrics.update(data_batch.label[0], outputs[0])
            if nbatch % 100 == 0:
                msg = ','.join(['{}={:.3f}'.format(w, v)
                                for w, v in zip(*eval_metrics.get())])
                msg += ",lr={}".format(trainer.learning_rate)
                logger.info(msg)
                print(msg)
                eval_metrics.reset()
            # NOTE(review): this advances by the number of contexts, not by the
            # number of images; the bar is in roidb units — verify intent.
            progress_bar.update(len(inputs))
        progress_bar.close()
        net.hybridize(static_alloc=True, static_shape=False)
        # Validation is disabled; log a placeholder mAP.
        re = ("mAP", 0.0)
        logger.info(re)
        save_path = "{}-{}-{}.params".format(prefix, epoch, re[1])
        net.collect_params().save(save_path)
        logger.info("Saved checkpoint to {}.".format(save_path))
class Predictor(CheckpointRunner):
    """Inference runner: loads (or reuses) a model and iterates a test dataloader.

    GPU-only for now: construction fails fast when no GPUs are configured.
    """

    def __init__(self, options, logger: Logger, writer, shared_model=None):
        super().__init__(options, logger, writer, training=False,
                         shared_model=shared_model)

    # noinspection PyAttributeOutsideInit
    def init_fn(self, shared_model=None, **kwargs):
        """Prepare the model for inference, reusing ``shared_model`` when given."""
        self.gpu_inference = self.options.num_gpus > 0

        # Idiom fix: `gpu_inference` is a bool, so test it directly rather
        # than comparing against 0 (same truth table as `== 0`).
        if not self.gpu_inference:
            raise NotImplementedError(
                "CPU inference is currently buggy. This takes some extra efforts and "
                "might be fixed in the future.")

        if shared_model is not None:
            self.model = shared_model
        else:
            self.init_auxiliary()
            self.model = self.init_model()
            self.model = DataParallelModel(self.model.cuda(), device_ids=self.gpus)

    def models_dict(self):
        return {'model': self.model}

    def init_auxiliary(self):
        """Hook for subclasses that need auxiliary models; default is a no-op."""
        pass

    def init_model(self):
        raise NotImplementedError("Your model is not found")

    def get_dataloader(self):
        data_loader = DataLoader(self.dataset,
                                 batch_size=self.options.test.batch_size,
                                 pin_memory=self.options.pin_memory,
                                 collate_fn=self.dataset_collate_fn,
                                 shuffle=self.options.test.shuffle)
        return data_loader

    def predict(self):
        """Run ``predict_step`` over the whole test dataloader (GPU only)."""
        self.logger.info("Running predictions...")

        predict_data_loader = self.get_dataloader()

        for step, batch in enumerate(predict_data_loader):
            self.logger.info("Predicting [%05d/%05d]" %
                             (step * self.options.test.batch_size, len(self.dataset)))

            if self.gpu_inference:
                # Send input to GPU (non-tensor entries pass through unchanged)
                batch = {k: v.cuda() if isinstance(v, torch.Tensor) else v
                         for k, v in batch.items()}
            else:
                # Unreachable in practice: init_fn already rejects num_gpus == 0.
                raise NotImplementedError(
                    "CPU inference is currently buggy. This takes some extra efforts and "
                    "might be fixed in the future.")

            self.predict_step(batch)

    def predict_step(self, input_batch):
        raise NotImplementedError("Your predict step function not found.")

    def save_inference_results(self, inputs, outputs):
        raise NotImplementedError("Your result saving function not found.")
class Trainer():
    """Trains a hierarchical transformer on tweet threads and tracks the best
    checkpoints by F-score/accuracy on test_1, test_2 and their union."""

    def __init__(self, dataloader, hierarchical_transformer, config, i):
        """
        Args:
            dataloader: project dataloader exposing `get_data(split)` and
                `tweet_field.vocab`.
            hierarchical_transformer: the model to train.
            config: experiment configuration object.
            i: split index (used in log/checkpoint names).
        """
        super(Trainer, self).__init__()
        self.iter = i
        self.config = config
        self.cpu = torch.device("cpu")
        self.multi_gpu = len(self.config.gpu_idx) > 1
        self.dataloader = dataloader
        self.word_encoder = WordEncoder.WordEncoder(config, self.dataloader.tweet_field.vocab)
        self.word_pos_encoder = PositionEncoder.PositionEncoder(config, self.config.max_length)
        self.time_delay_encoder = PositionEncoder.PositionEncoder(config, self.config.size)

        # <----------- Check for GPU setting ----------->
        if self.config.gpu:
            self.hierarchical_transformer = DataParallelModel(hierarchical_transformer.cuda())
            self.criterion = DataParallelCriterion(nn.NLLLoss())
        else:
            self.hierarchical_transformer = hierarchical_transformer
            self.criterion = nn.NLLLoss()

        # Transformer-style LR: base lr = d_model ** -0.5, scheduled by the
        # project Optimizer wrapper.
        self.adam_optimizer = optim.Adam(self.hierarchical_transformer.parameters(),
                                         np.power(self.config.d_model, - 0.5),
                                         betas = (self.config.beta_1, self.config.beta_2))
        self.optimizer = Optimizer.Optimizer(self.config, self.adam_optimizer)

    def test_performance(self, type_):
        """Run the model over split `type_` and return (predicted_labels, true_labels)
        as numpy arrays (predictions mapped through `get_labels`)."""
        predicted_y_lst = []
        y_lst = []
        self.hierarchical_transformer.eval()  # Make sure that it is on eval mode first
        with torch.no_grad():
            for X, y, word_pos, time_delay, structure, attention_mask_word, attention_mask_post in self.dataloader.get_data(type_):
                # <-------- Casting as a variable --------->
                X = Variable(X)
                y = Variable(y)
                word_pos = Variable(word_pos)
                time_delay = Variable(time_delay)
                structure = Variable(structure)
                attention_mask_word = Variable(attention_mask_word)
                attention_mask_post = Variable(attention_mask_post)

                # <-------- Encode content -------------->
                X = self.word_encoder(X)
                word_pos = self.word_pos_encoder(word_pos)
                time_delay = self.time_delay_encoder(time_delay)

                # <-------- Move to GPU -------------->
                if self.config.gpu:
                    X = X.cuda()
                    y = y.cuda()
                    word_pos = word_pos.cuda()
                    time_delay = time_delay.cuda()
                    structure = structure.cuda()
                    attention_mask_word = attention_mask_word.cuda()
                    attention_mask_post = attention_mask_post.cuda()

                # <--------- Getting the predictions --------->
                predicted_y = self.hierarchical_transformer(X, word_pos, time_delay, structure,
                                                            attention_mask_word = attention_mask_word,
                                                            attention_mask_post = attention_mask_post)

                # DataParallelModel returns one chunk per GPU; stitch them back.
                if self.multi_gpu:
                    predicted_y = torch.cat(list(predicted_y), dim = 0)

                # <------- to np array ------->
                predicted_y = predicted_y.cpu().numpy()
                y = y.cpu().numpy()
                print("test", predicted_y)

                # <------- Appending it to the master list ------->
                predicted_y_lst.extend(predicted_y)
                y_lst.extend(y)

                # <--------- Free up the GPU -------------->
                del X
                del y
                del predicted_y
                del word_pos
                del time_delay
                del structure

        # <------- Get scores ------->
        predicted_y_lst = np.array(predicted_y_lst)
        predicted_y_lst = get_labels(predicted_y_lst)
        y_lst = np.array(y_lst)
        return predicted_y_lst, y_lst

    def train(self):
        """Full training loop: per-epoch evaluation on train/test splits,
        checkpointing every `config.interval` epochs, and best-model tracking."""
        print("*" * 40 + " START OF TRAINING " + "*" * 40)
        epoch_values = {}

        # <------ Gets for test 1 ------>
        best_acc_test_1 = 0.0
        best_f_score_test_1 = 0.0
        best_acc_test_1_for_2 = 0.0
        best_f_score_test_1_for_2 = 0.0
        best_record_f_score_test_1 = {}
        best_record_acc_test_1 = {}

        # <------ Gets for test 2 ------>
        best_acc_test_2 = 0.0
        best_f_score_test_2 = 0.0
        best_acc_test_2_for_1 = 0.0
        best_f_score_test_2_for_1 = 0.0
        best_record_f_score_test_2 = {}
        best_record_acc_test_2 = {}

        # <------ Gets for full test ------>
        best_acc_test = 0.0
        best_f_score_test = 0.0
        best_record_f_score_test = {}
        best_record_acc_test = {}

        # <------ For logging purpose ------>
        dataset = self.config.data_folder.split("/")[-1]
        name = "{}_split_{}_{}".format(dataset, self.iter,
                                       datetime.now().strftime('%Y-%m-%d-%H:%M:%S'))  # Date & Time for logging purposes
        path = os.path.join(self.config.log_folder, self.config.dataset_name,
                            name + "_" + self.config.experiment_name)
        make_dir(path)
        print(path)
        save_vocab_vectors(self.dataloader, self.config, path)
        log_info(path, "*" * 40 + " EXPERIMENT " + "*" * 40)
        log_info(path, "*" * 40 + " SPLIT {} ".format(self.iter) + "*" * 40)
        log_info(path, str(vars(self.config)))
        log_info(path, "*" * 90)

        for epoch in tqdm(range(self.config.num_epoch)):
            running_loss = 0
            i = 0
            for X, y, word_pos, time_delay, structure, attention_mask_word, attention_mask_post in self.dataloader.get_data("train"):
                # <-------- Casting as a variable --------->
                X = Variable(X)
                y = Variable(y)
                word_pos = Variable(word_pos)
                time_delay = Variable(time_delay)
                structure = Variable(structure)
                attention_mask_word = Variable(attention_mask_word)
                attention_mask_post = Variable(attention_mask_post)

                # <-------- Encode content -------------->
                X = self.word_encoder(X)
                word_pos = self.word_pos_encoder(word_pos)
                time_delay = self.time_delay_encoder(time_delay)

                # <-------- Move to GPU -------------->
                if self.config.gpu:
                    X = X.cuda()
                    y = y.cuda()
                    word_pos = word_pos.cuda()
                    time_delay = time_delay.cuda()
                    structure = structure.cuda()
                    attention_mask_word = attention_mask_word.cuda()
                    attention_mask_post = attention_mask_post.cuda()

                # <------- Settings ------------->
                self.hierarchical_transformer.train()  # Set the model to be on train mode (So that the dropout applies)
                self.optimizer.zero_grad()  # zero grad it

                # <--------- Getting the predictions --------->
                predicted_y = self.hierarchical_transformer(X, word_pos, time_delay, structure,
                                                            attention_mask_word = attention_mask_word,
                                                            attention_mask_post = attention_mask_post)
                print(predicted_y)

                # <--------- Getting loss and backprop --------->
                loss = self.criterion(predicted_y, y)
                loss.backward()
                self.optimizer.step_and_update_lr()

                # <--------- Calculating the loss --------->
                running_loss += float(loss.detach().item())
                i += 1

                # <--------- Free up the GPU -------------->
                del X
                del y
                del predicted_y
                del word_pos
                del time_delay
                del structure
                torch.cuda.empty_cache()

            record = {}
            running_loss = running_loss / float(i)
            print()
            print("Epoch {}: {}".format(epoch + 1, running_loss))

            with torch.no_grad():
                pred_train, true_train = self.test_performance("train_test")
                pred_test_1, true_test_1 = self.test_performance("test_1")
                pred_test_2, true_test_2 = self.test_performance("test_2")
                pred_test = np.concatenate((pred_test_1, pred_test_2))
                true_test = np.concatenate((true_test_1, true_test_2))

                # <-------- Getting performance for all the clases -------->
                acc_train, pre_train, recall_train, f_score_train, counter_true_train, counter_pred_train = cal_scores(pred_train, true_train, type_ = "all")
                acc_test_1, pre_test_1, recall_test_1, f_score_test_1, counter_true_test_1, counter_pred_test_1 = cal_scores(pred_test_1, true_test_1, type_ = "all")
                acc_test_2, pre_test_2, recall_test_2, f_score_test_2, counter_true_test_2, counter_pred_test_2 = cal_scores(pred_test_2, true_test_2, type_ = "all")
                acc_test, pre_test, recall_test, f_score_test, counter_true_test, counter_pred_test = cal_scores(pred_test, true_test, type_ = "all")

                # <-------- Getting performance for individual claseses -------->
                acc_test_1_class_0, pre_test_1_class_0, recall_test_1_class_0, f_score_test_1_class_0, counter_true_test_1_class_0, counter_pred_test_1_class_0 = cal_scores(pred_test_1, true_test_1, type_ = 0)
                acc_test_1_class_1, pre_test_1_class_1, recall_test_1_class_1, f_score_test_1_class_1, counter_true_test_1_class_1, counter_pred_test_1_class_1 = cal_scores(pred_test_1, true_test_1, type_ = 1)
                acc_test_2_class_0, pre_test_2_class_0, recall_test_2_class_0, f_score_test_2_class_0, counter_true_test_2_class_0, counter_pred_test_2_class_0 = cal_scores(pred_test_2, true_test_2, type_ = 0)
                acc_test_2_class_1, pre_test_2_class_1, recall_test_2_class_1, f_score_test_2_class_1, counter_true_test_2_class_1, counter_pred_test_2_class_1 = cal_scores(pred_test_2, true_test_2, type_ = 1)
                acc_test_class_0, pre_test_class_0, recall_test_class_0, f_score_test_class_0, counter_true_test_class_0, counter_pred_test_class_0 = cal_scores(pred_test, true_test, type_ = 0)
                acc_test_class_1, pre_test_class_1, recall_test_class_1, f_score_test_class_1, counter_true_test_class_1, counter_pred_test_class_1 = cal_scores(pred_test, true_test, type_ = 1)

                if epoch % self.config.interval == 0:
                    check_point_epoch(epoch + 1, self.hierarchical_transformer, self.word_encoder,
                                      self.word_pos_encoder, self.time_delay_encoder, self.optimizer,
                                      running_loss,
                                      acc_train, pre_train, recall_train, f_score_train, counter_true_train, counter_pred_train,
                                      acc_test_1, pre_test_1, recall_test_1, f_score_test_1, counter_true_test_1, counter_pred_test_1,
                                      acc_test_2, pre_test_2, recall_test_2, f_score_test_2, counter_true_test_2, counter_pred_test_2,
                                      acc_test, pre_test, recall_test, f_score_test, counter_true_test, counter_pred_test,
                                      path)

                record["epoch"] = epoch + 1
                record["loss"] = running_loss
                record["acc_train"] = acc_train
                record["precision_train"] = pre_train
                record["recall_train"] = recall_train
                record["f_score_train"] = f_score_train
                record["counter_true_train"] = counter_true_train
                record["counter_pred_train"] = counter_pred_train
                record["acc_test_1"] = acc_test_1
                record["precision_test_1"] = pre_test_1
                record["recall_test_1"] = recall_test_1
                record["f_score_test_1"] = f_score_test_1
                record["counter_true_test_1"] = counter_true_test_1
                record["counter_pred_test_1"] = counter_pred_test_1
                record["acc_test_2"] = acc_test_2
                record["precision_test_2"] = pre_test_2
                record["recall_test_2"] = recall_test_2
                record["f_score_test_2"] = f_score_test_2
                record["counter_true_test_2"] = counter_true_test_2
                record["counter_pred_test_2"] = counter_pred_test_2
                record["acc_test"] = acc_test
                record["precision_test"] = pre_test
                record["recall_test"] = recall_test
                record["f_score_test"] = f_score_test
                record["counter_true_test"] = counter_true_test
                record["counter_pred_test"] = counter_pred_test

                # <--------- test 1 --------->
                record["acc_test_1_classes"] = {0 : acc_test_1_class_0, 1 : acc_test_1_class_1}
                record["precision_test_1_classes"] = {0 : pre_test_1_class_0, 1 : pre_test_1_class_1}
                record["recall_test_1_classes"] = {0 : recall_test_1_class_0, 1 : recall_test_1_class_1}
                record["f_score_test_1_classes"] = {0 : f_score_test_1_class_0, 1 : f_score_test_1_class_1}
                record["counter_pred_test_1_classes"] = {0 : counter_pred_test_1_class_0, 1 : counter_pred_test_1_class_1}

                # <--------- test 2 --------->
                record["acc_test_2_classes"] = {0 : acc_test_2_class_0, 1 : acc_test_2_class_1}
                record["precision_test_2_classes"] = {0 : pre_test_2_class_0, 1 : pre_test_2_class_1}
                record["recall_test_2_classes"] = {0 : recall_test_2_class_0, 1 : recall_test_2_class_1}
                record["f_score_test_2_classes"] = {0 : f_score_test_2_class_0, 1 : f_score_test_2_class_1}
                record["counter_pred_test_2_classes"] = {0 : counter_pred_test_2_class_0, 1 : counter_pred_test_2_class_1}

                # <--------- test --------->
                record["acc_test_classes"] = {0 : acc_test_class_0, 1 : acc_test_class_1}
                record["precision_test_classes"] = {0 : pre_test_class_0, 1 : pre_test_class_1}
                record["recall_test_classes"] = {0 : recall_test_class_0, 1 : recall_test_class_1}
                record["f_score_test_classes"] = {0 : f_score_test_class_0, 1 : f_score_test_class_1}
                record["counter_pred_test_classes"] = {0 : counter_pred_test_class_0, 1 : counter_pred_test_class_1}

                epoch_values[epoch + 1] = record
                log_info(path, record)
                log_info(path, "=" * 90)

                # <--- Best on test_1 by F-score; ties broken by test_2 F-score --->
                if f_score_test_1 >= best_f_score_test_1:
                    print("CURRENT BEST (F-SCORE) FOUND AT EPOCH : {} (EVALUATED WITH {})".format(epoch + 1, "TEST_1"))
                    if f_score_test_1 == best_f_score_test_1:
                        if f_score_test_2 >= best_f_score_test_1_for_2:
                            best_f_score_test_1_for_2 = f_score_test_2
                            best_f_score_test_1 = f_score_test_1
                            best_record_f_score_test_1 = record
                            log_best_model_info(path, "epoch : " + str(epoch + 1), best_record_f_score_test_1, "f_score", "test_1")
                            save_best_model(path, self.hierarchical_transformer, self.word_encoder, self.word_pos_encoder, self.time_delay_encoder, self.optimizer, "f_score", "test_1")
                    else:
                        best_f_score_test_1 = f_score_test_1
                        best_record_f_score_test_1 = record
                        best_f_score_test_1_for_2 = f_score_test_2
                        log_best_model_info(path, "epoch : " + str(epoch + 1), best_record_f_score_test_1, "f_score", "test_1")
                        save_best_model(path, self.hierarchical_transformer, self.word_encoder, self.word_pos_encoder, self.time_delay_encoder, self.optimizer, "f_score", "test_1")

                # <--- Best on test_1 by accuracy; ties broken by test_2 accuracy --->
                if acc_test_1 >= best_acc_test_1:
                    print("CURRENT BEST (ACCURACY) FOUND AT EPOCH : {} (EVALUATED WITH {})".format(epoch + 1, "TEST_1"))
                    if acc_test_1 == best_acc_test_1:
                        if acc_test_2 >= best_acc_test_1_for_2:
                            best_acc_test_1_for_2 = acc_test_2
                            best_acc_test_1 = acc_test_1
                            best_record_acc_test_1 = record
                            log_best_model_info(path, "epoch : " + str(epoch + 1), best_record_acc_test_1, "accuracy", "test_1")
                            save_best_model(path, self.hierarchical_transformer, self.word_encoder, self.word_pos_encoder, self.time_delay_encoder, self.optimizer, "accuracy", "test_1")
                    else:
                        best_acc_test_1 = acc_test_1
                        best_record_acc_test_1 = record
                        best_acc_test_1_for_2 = acc_test_2
                        log_best_model_info(path, "epoch : " + str(epoch + 1), best_record_acc_test_1, "accuracy", "test_1")
                        save_best_model(path, self.hierarchical_transformer, self.word_encoder, self.word_pos_encoder, self.time_delay_encoder, self.optimizer, "accuracy", "test_1")

                # <--- Best on test_2 by F-score; ties broken by test_1 F-score --->
                if f_score_test_2 >= best_f_score_test_2:
                    print("CURRENT BEST (F-SCORE) FOUND AT EPOCH : {} (EVALUATED WITH {})".format(epoch + 1, "TEST_2"))
                    if f_score_test_2 == best_f_score_test_2:
                        if f_score_test_1 >= best_f_score_test_2_for_1:
                            best_f_score_test_2_for_1 = f_score_test_1
                            best_f_score_test_2 = f_score_test_2
                            best_record_f_score_test_2 = record
                            log_best_model_info(path, "epoch : " + str(epoch + 1), best_record_f_score_test_2, "f_score", "test_2")
                            save_best_model(path, self.hierarchical_transformer, self.word_encoder, self.word_pos_encoder, self.time_delay_encoder, self.optimizer, "f_score", "test_2")
                    else:
                        best_f_score_test_2 = f_score_test_2
                        best_record_f_score_test_2 = record
                        best_f_score_test_2_for_1 = f_score_test_1
                        # BUGFIX: this branch previously logged/saved with the
                        # test_1 record and "test_1" tag (copy-paste error),
                        # clobbering the best-on-test_1 checkpoint.
                        log_best_model_info(path, "epoch : " + str(epoch + 1), best_record_f_score_test_2, "f_score", "test_2")
                        save_best_model(path, self.hierarchical_transformer, self.word_encoder, self.word_pos_encoder, self.time_delay_encoder, self.optimizer, "f_score", "test_2")

                # <--- Best on test_2 by accuracy; ties broken by test_1 accuracy --->
                if acc_test_2 >= best_acc_test_2:
                    print("CURRENT BEST (ACCURACY) FOUND AT EPOCH : {} (EVALUATED WITH {})".format(epoch + 1, "TEST_2"))
                    if acc_test_2 == best_acc_test_2:
                        if acc_test_1 >= best_acc_test_2_for_1:
                            best_acc_test_2_for_1 = acc_test_1
                            best_acc_test_2 = acc_test_2
                            best_record_acc_test_2 = record
                            log_best_model_info(path, "epoch : " + str(epoch + 1), best_record_acc_test_2, "accuracy", "test_2")
                            save_best_model(path, self.hierarchical_transformer, self.word_encoder, self.word_pos_encoder, self.time_delay_encoder, self.optimizer, "accuracy", "test_2")
                    else:
                        best_acc_test_2 = acc_test_2
                        best_record_acc_test_2 = record
                        best_acc_test_2_for_1 = acc_test_1
                        log_best_model_info(path, "epoch : " + str(epoch + 1), best_record_acc_test_2, "accuracy", "test_2")
                        save_best_model(path, self.hierarchical_transformer, self.word_encoder, self.word_pos_encoder, self.time_delay_encoder, self.optimizer, "accuracy", "test_2")

                # <--- Best on the combined test set --->
                if f_score_test >= best_f_score_test:
                    print("CURRENT BEST (F-SCORE) FOUND AT EPOCH : {} (EVALUATED WITH {})".format(epoch + 1, "TEST"))
                    best_f_score_test = f_score_test
                    best_record_f_score_test = record
                    log_best_model_info(path, "epoch : " + str(epoch + 1), best_record_f_score_test, "f_score", "test")
                    save_best_model(path, self.hierarchical_transformer, self.word_encoder, self.word_pos_encoder, self.time_delay_encoder, self.optimizer, "f_score", "test")

                if acc_test >= best_acc_test:
                    print("CURRENT BEST (ACCURACY) FOUND AT EPOCH : {} (EVALUATED WITH {})".format(epoch + 1, "TEST"))
                    best_acc_test = acc_test
                    best_record_acc_test = record
                    log_best_model_info(path, "epoch : " + str(epoch + 1), best_record_acc_test, "accuracy", "test")
                    save_best_model(path, self.hierarchical_transformer, self.word_encoder, self.word_pos_encoder, self.time_delay_encoder, self.optimizer, "accuracy", "test")

        plot_graphs(path, epoch_values)
        print("*" * 40 + " DONE WITH TRAINING " + "*" * 40)
normalize, ]) train_ds = VOCSBDClassification('/path/to/VOC', '/path/to/SBD/benchmark_RELEASE/dataset', transform=train_trans, image_set='train') train_dl = DataLoader(train_ds, batch_size=args.batch_size, shuffle=True, num_workers=4, drop_last=True) val_ds = VOCClassification('/path/to/VOC', transform=val_trans, image_set='val') val_dl = DataLoader(val_ds, batch_size=8, shuffle=True, num_workers=2, drop_last=True) # Model if args.arc == 'vgg': model = vgg19(pretrained=True) num_ftrs = model.classifier[6].in_features model.classifier[6] = nn.Linear(num_ftrs, train_ds.CLASSES) model = DataParallelModel(model.cuda()) else: raise Exception("Architecture {} not found".format(args.arc)) criterion = DataParallelCriterion(nn.BCEWithLogitsLoss().cuda()) optimizer = torch.optim.Adam(model.parameters(), lr=args.lr, weight_decay=0.0005) scheduler = torch.optim.lr_scheduler.StepLR(optimizer, 20, gamma=0.2) best_pred = 0 # Load model if args.resume: if not os.path.isfile(args.resume): raise RuntimeError("=> no checkpoint found at '{}'".format(args.resume)) checkpoint = torch.load(args.resume) args.start_epoch = checkpoint['epoch'] model.module.load_state_dict(checkpoint['state_dict'])
def main(opt):
    """Train LaneNet: build data loaders, model, losses and optimizer, then
    run the epoch loop with periodic validation, best-checkpoint saving and
    early stopping.

    Args:
        opt: parsed command-line options (seed, cnn_type, embed_dim,
            learning_rate, num_epochs, val_step, max_patience, start_from,
            output_file, ...).
    """
    # Set the random seed manually for reproducibility.
    # BUGFIX: the original seeded ONLY the CUDA RNG when a GPU was
    # available, leaving the CPU generator (DataLoader shuffling, CPU-side
    # weight init) unseeded. Seed the CPU RNG unconditionally and the CUDA
    # RNG additionally when present.
    torch.manual_seed(opt.seed)
    if torch.cuda.is_available():
        torch.cuda.manual_seed(opt.seed)

    train_loader = get_data_loader(opt, split='train', return_org_image=False)
    val_loader = get_data_loader(opt, split='val', return_org_image=False)

    # Ensure the checkpoint directory exists. Guard against a bare filename,
    # where dirname() returns '' and os.makedirs('') would raise.
    output_dir = os.path.dirname(opt.output_file)
    if output_dir and not os.path.exists(output_dir):
        os.makedirs(output_dir)

    logger.info('Building model...')
    model = LaneNet(cnn_type=opt.cnn_type, embed_dim=opt.embed_dim)
    model = DataParallelModel(model)

    # Discriminative loss for the instance-embedding branch, cross-entropy
    # for the binary segmentation branch.
    criterion_disc = DiscriminativeLoss(delta_var=0.5,
                                        delta_dist=1.5,
                                        norm=2,
                                        usegpu=True)
    criterion_ce = nn.CrossEntropyLoss()
    optimizer = optim.Adam(model.parameters(), lr=opt.learning_rate)

    if opt.start_from:
        logger.info('Restart training from %s', opt.start_from)
        checkpoint = torch.load(opt.start_from)
        model.load_state_dict(checkpoint['model'])

    if torch.cuda.is_available():
        criterion_disc.cuda()
        criterion_ce.cuda()
        model = model.cuda()

    logger.info("Start training...")
    best_loss = sys.maxsize
    best_epoch = 0
    for epoch in tqdm(range(opt.num_epochs), desc='Epoch: '):
        learning_rate = adjust_learning_rate(opt, optimizer, epoch)
        logger.info('===> Learning rate: %f: ', learning_rate)

        # train for one epoch
        train(opt, model, criterion_disc, criterion_ce, optimizer,
              train_loader)

        # validate at every val_step epoch
        if epoch % opt.val_step == 0:
            val_loss = test(opt, model, criterion_disc, criterion_ce,
                            val_loader)
            logger.info('Val loss: %s\n', val_loss)
            loss = val_loss.avg
            if loss < best_loss:
                logger.info(
                    'Found new best loss: %.7f, previous loss: %.7f',
                    loss, best_loss)
                best_loss = loss
                best_epoch = epoch
                logger.info('Saving new checkpoint to: %s', opt.output_file)
                torch.save({
                    'epoch': epoch,
                    'model': model.state_dict(),
                    'best_loss': best_loss,
                    'best_epoch': best_epoch,
                    'opt': opt
                }, opt.output_file)
            else:
                logger.info(
                    'Current loss: %.7f, best loss is %.7f @ epoch %d',
                    loss, best_loss, best_epoch)
                # Early stopping: give up after max_patience epochs without
                # improvement over the best validation loss.
                if epoch - best_epoch > opt.max_patience:
                    logger.info('Terminated by early stopping!')
                    break
def define_D(ndf):
    """Build the discriminator with `ndf` base filters, wrapping it for
    multi-GPU data parallelism when enabled and more than one GPU exists."""
    net = init_net(Discriminator(ndf))
    if opt.parallel and torch.cuda.device_count() > 1:
        net = DataParallelModel(net)
    return net
def main(args):
    """Train the human-parsing segmentation network and keep the best
    checkpoint (selected by validation mIoU) in ``args.snapshot_dir``."""
    # Echo every parsed option into the log.
    print("Input arguments:")
    for key, val in vars(args).items():
        print("{:16} {}".format(key, val))

    if not os.path.exists(args.snapshot_dir):
        os.makedirs(args.snapshot_dir)
    writer = SummaryWriter(log_dir=os.path.join(args.log_dir, args.method))

    # Seeding and cuDNN autotuning.
    random.seed(args.seed)
    torch.manual_seed(args.seed)
    cudnn.enabled = True
    cudnn.benchmark = True

    # Build the segmentation network and load the pretrained weights.
    seg_net = get_model(num_classes=args.num_classes)
    pretrained = torch.load(args.restore_from)
    if args.init:
        # Remap backbone weights under the 'encoder.' prefix, skipping the
        # classifier ('fc') layer.
        merged_state = seg_net.state_dict().copy()
        for name in pretrained:
            parts = name.split('.')
            if parts[0] != 'fc':
                merged_state['encoder.' + '.'.join(parts)] = pretrained[name]
        seg_net.load_state_dict(merged_state)
        print('loading params w/o fc')
    else:
        seg_net.load_state_dict(pretrained)
        print('loading params all')

    model = DataParallelModel(seg_net)
    model.float()
    model.cuda()

    # define dataloader
    train_loader = data.DataLoader(
        DataGenerator(root=args.root, list_path=args.lst,
                      crop_size=args.crop_size, training=True),
        batch_size=args.batch_size, shuffle=True, num_workers=4,
        pin_memory=True)
    val_loader = data.DataLoader(
        DataGenerator(root=args.val_root, list_path=args.val_lst,
                      crop_size=args.crop_size, training=False),
        batch_size=args.batch_size, shuffle=False, num_workers=4,
        pin_memory=True)

    # define criterion & optimizer
    criterion = DataParallelCriterion(
        ABRLovaszLoss(ignore_index=args.ignore_label, only_present=True,
                      cls_p=args.num_classes, cls_h=args.hbody_cls,
                      cls_f=args.fbody_cls)).cuda()
    trainable = [p for p in seg_net.parameters() if p.requires_grad]
    optimizer = optim.SGD([{'params': trainable, 'lr': args.learning_rate}],
                          lr=args.learning_rate, momentum=0.9,
                          weight_decay=5e-4)

    # key points
    best_miou = 0
    best_pixacc = 0
    t0 = time.time()
    for epoch in range(args.epochs):
        print('\n{} | {}'.format(epoch, args.epochs - 1))
        # training
        train(model, train_loader, epoch, criterion, optimizer, writer)
        # validation: every 10 epochs, and every epoch in the last 20%.
        if epoch % 10 == 0 or epoch > args.epochs * 0.8:
            val_pixacc, val_miou = validation(model, val_loader, epoch,
                                              writer)
            # save model
            if val_pixacc > best_pixacc:
                best_pixacc = val_pixacc
            if val_miou > best_miou:
                best_miou = val_miou
                model_dir = os.path.join(args.snapshot_dir,
                                         args.method + '_miou.pth')
                torch.save(seg_net.state_dict(), model_dir)
                print('Model saved to %s' % model_dir)
                os.rename(model_dir,
                          os.path.join(args.snapshot_dir,
                                       args.method + '_miou' +
                                       str(best_miou) + '.pth'))

    print('Complete using', time.time() - t0, 'seconds')
    print('Best pixAcc: {} | Best mIoU: {}'.format(best_pixacc, best_miou))
def main(args):
    """Train the graph-based human-parsing network (part / half-body /
    full-body hierarchy) and keep the best checkpoint by validation mIoU.

    Compared to the plain variant, this trainer also feeds a part-adjacency
    graph and upper/lower part groupings into the model and the loss.
    (Several commented-out dead code paths from the original — a disabled
    ``args.init`` branch and duplicate load calls — have been removed.)

    Args:
        args: parsed command-line options (seed, data/snapshot/log paths,
            class counts, crop size, batch size, learning rate, epochs, ...).
    """
    # initialization
    print("Input arguments:")
    for key, val in vars(args).items():
        print("{:16} {}".format(key, val))

    if not os.path.exists(args.snapshot_dir):
        os.makedirs(args.snapshot_dir)
    writer = SummaryWriter(log_dir=os.path.join(args.log_dir, args.method))

    random.seed(args.seed)
    torch.manual_seed(args.seed)
    cudnn.enabled = True
    cudnn.benchmark = True

    # Fixed adjacency over the 19 part nodes; symmetric 0/1 matrix,
    # excluded from autograd.
    adj_matrix = torch.tensor(
        [[0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0],
         [1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0],
         [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 0, 0, 0, 0],
         [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0],
         [0, 0, 0, 0, 0, 0, 1, 0, 1, 0, 1, 1, 0, 1, 1, 0, 0, 0, 0],
         [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 0, 0],
         [0, 0, 0, 0, 1, 0, 0, 0, 1, 0, 1, 1, 0, 1, 1, 1, 1, 0, 0],
         [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1],
         [0, 0, 0, 0, 1, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1],
         [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 0, 0],
         [0, 0, 0, 0, 1, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0],
         [0, 0, 0, 0, 1, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 0, 0],
         [1, 1, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0],
         [0, 0, 1, 0, 1, 1, 1, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0],
         [0, 0, 1, 0, 1, 1, 1, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0],
         [0, 0, 0, 0, 0, 1, 0, 1, 1, 1, 0, 1, 0, 0, 0, 0, 0, 1, 1],
         [0, 0, 0, 0, 0, 1, 0, 1, 1, 1, 0, 1, 0, 0, 0, 0, 0, 1, 1],
         [0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0],
         [0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0]],
        requires_grad=False)
    # 1-based part indices grouped into the upper / lower half-body nodes.
    upper_part_list = [1, 2, 3, 4, 5, 6, 7, 11, 13, 14, 15]
    lower_part_list = [8, 9, 10, 12, 16, 17, 18, 19]
    # Per-class loss weights; presumably class-frequency balancing —
    # TODO(review): confirm semantics against ABRLovaszLoss.
    weight = torch.FloatTensor([
        0.7602572, 0.94236198, 0.85644457, 1.04346266, 1.10627293,
        0.80980162, 0.95168713, 0.8403769, 1.05798412, 0.85746254,
        1.01274366, 1.05854692, 1.03430773, 0.84867818, 0.88027721,
        0.87580925, 0.98747462, 0.9876475, 1.00016535, 1.00108882
    ])

    # conduct seg network
    seg_model = get_model(num_classes=args.num_classes,
                          adj_matrix=adj_matrix,
                          upper_part_list=upper_part_list,
                          lower_part_list=lower_part_list)

    # Load pretrained backbone weights under the 'encoder.' prefix,
    # skipping the classifier ('fc') layer.
    saved_state_dict = torch.load(args.restore_from)
    new_params = seg_model.state_dict().copy()
    for i in saved_state_dict:
        i_parts = i.split('.')
        if not i_parts[0] == 'fc':
            new_params['encoder.' + '.'.join(i_parts[:])] = saved_state_dict[i]
    seg_model.load_state_dict(new_params)
    print('loading params w/o fc')

    model = DataParallelModel(seg_model)
    model.float()
    model.cuda()

    # define dataloader
    train_loader = data.DataLoader(DatasetGenerator(root=args.root,
                                                    list_path=args.lst,
                                                    crop_size=args.crop_size,
                                                    training=True),
                                   batch_size=args.batch_size, shuffle=True,
                                   num_workers=4, pin_memory=True)
    val_loader = data.DataLoader(DatasetGenerator(root=args.val_root,
                                                  list_path=args.val_lst,
                                                  crop_size=args.crop_size,
                                                  training=False),
                                 batch_size=args.batch_size, shuffle=False,
                                 num_workers=4, pin_memory=True)

    # define criterion & optimizer
    criterion = ABRLovaszLoss(adj_matrix=adj_matrix,
                              ignore_index=args.ignore_label,
                              only_present=True,
                              upper_part_list=upper_part_list,
                              lower_part_list=lower_part_list,
                              cls_p=args.num_classes,
                              cls_h=args.hbody_cls,
                              cls_f=args.fbody_cls,
                              weight=weight)
    criterion = DataParallelCriterion(criterion).cuda()
    optimizer = optim.SGD(
        [{
            'params': filter(lambda p: p.requires_grad,
                             seg_model.parameters()),
            'lr': args.learning_rate
        }],
        lr=args.learning_rate,
        momentum=0.9,
        weight_decay=5e-4)

    # key points
    best_val_mIoU = 0
    best_val_pixAcc = 0
    start = time.time()
    for epoch in range(0, args.epochs):
        print('\n{} | {}'.format(epoch, args.epochs - 1))
        # training
        _ = train(model, train_loader, epoch, criterion, optimizer, writer)
        # validation: every 10 epochs, and every epoch in the last 5.
        if epoch % 10 == 0 or epoch > args.epochs - 5:
            val_pixacc, val_miou = validation(model, val_loader, epoch,
                                              writer)
            # save model when validation mIoU improves; the pixAcc best is
            # tracked independently for reporting only.
            if val_pixacc > best_val_pixAcc:
                best_val_pixAcc = val_pixacc
            if val_miou > best_val_mIoU:
                best_val_mIoU = val_miou
                model_dir = os.path.join(args.snapshot_dir,
                                         args.method + '_miou.pth')
                torch.save(seg_model.state_dict(), model_dir)
                print('Model saved to %s' % model_dir)
                os.rename(
                    model_dir,
                    os.path.join(args.snapshot_dir, args.method + '_miou' +
                                 str(best_val_mIoU) + '.pth'))

    print('Complete using', time.time() - start, 'seconds')
    print('Best pixAcc: {} | Best mIoU: {}'.format(best_val_pixAcc,
                                                   best_val_mIoU))