Example #1
0
	def __init__(self, dataloader, hierarchical_transformer, config, i):
		"""Set up the trainer: encoders, device placement, loss and optimizer."""
		super(Trainer, self).__init__()

		self.iter = i
		self.config = config
		self.cpu = torch.device("cpu")
		# Data parallelism is only meaningful with more than one GPU index.
		self.multi_gpu = len(self.config.gpu_idx) > 1

		self.dataloader = dataloader
		vocab = self.dataloader.tweet_field.vocab
		self.word_encoder = WordEncoder.WordEncoder(config, vocab)
		self.word_pos_encoder = PositionEncoder.PositionEncoder(config, self.config.max_length)
		self.time_delay_encoder = PositionEncoder.PositionEncoder(config, self.config.size)

		# <----------- Check for GPU setting ----------->
		if self.config.gpu:
			self.hierarchical_transformer = DataParallelModel(hierarchical_transformer.cuda())
			self.criterion = DataParallelCriterion(nn.NLLLoss())
		else:
			self.hierarchical_transformer = hierarchical_transformer
			self.criterion = nn.NLLLoss()

		# Transformer-style base LR of d_model ** -0.5; betas come from config.
		base_lr = np.power(self.config.d_model, - 0.5)
		self.adam_optimizer = optim.Adam(
			self.hierarchical_transformer.parameters(),
			base_lr,
			betas = (self.config.beta_1, self.config.beta_2))
		self.optimizer = Optimizer.Optimizer(self.config, self.adam_optimizer)
Example #2
0
def create_single_model(args):
    """Construct a model from *args* and move it to args.device.

    When args.multi_gpu is set, both the model and its loss are wrapped for
    data parallelism; the (possibly wrapped) loss is stored back on
    args.loss for callers.
    """
    print('Creating model with\n \
           Input size: {}\n \
           Output size: {}\n \
           Activation {}\n \
           Num layers: {}\n \
           Hidden units per layer: {}\n \
           Using bias: {}\n \
           Using batchnorm {}\n \
           With batchsize {}'.format( \
           args.input_size, args.output_size, args.actv,
           args.num_layers, args.hidden, args.bias, args.bn, args.batch))
    model = args.model(
        input_size=args.input_size,
        output_size=args.output_size,
        actv_type=args.actv,
        num_layers=args.num_layers,
        hidden_size=args.hidden,
        bias=args.bias,
        use_bn=args.bn)

    args.loss = model.loss
    if args.multi_gpu:
        print('Using data parallelism with {} GPUs'.format(args.num_gpu))
        model = DataParallelModel(model, device_ids = args.device_ids)
        args.loss = DataParallelCriterion(args.loss, device_ids = args.device_ids)

    print('Sending model to device {}'.format(args.device))
    model.to(args.device)
    return model
Example #3
0
 def init_fn(self, shared_model=None, **kwargs):
     """Initialize `self.model`, reusing *shared_model* when provided.

     Raises:
         NotImplementedError: when no GPUs are configured — CPU inference
             is not supported by this code path.
     """
     self.gpu_inference = self.options.num_gpus > 0
     # `gpu_inference` is a bool; test it directly rather than `== 0`.
     if not self.gpu_inference:
         raise NotImplementedError(
             "CPU inference is currently buggy. This takes some extra efforts and "
             "might be fixed in the future.")
     if shared_model is not None:
         # Reuse a model that was built elsewhere.
         self.model = shared_model
     else:
         self.init_auxiliary()
         self.model = self.init_model()
         # NOTE(review): assumes `self.gpus` holds the device ids — confirm.
         self.model = DataParallelModel(self.model.cuda(),
                                        device_ids=self.gpus)
Example #4
0
def main(opt):
    """Load a LaneNet checkpoint and run `test_video` on the input file."""
    logger.info('Loading model: %s', opt.model_file)
    checkpoint = torch.load(opt.model_file)
    checkpoint_opt = checkpoint['opt']

    # Rebuild the network with the architecture recorded in the checkpoint,
    # then wrap it before loading the saved weights.
    net = LaneNet(cnn_type=checkpoint_opt.cnn_type)
    net = DataParallelModel(net)

    # Test-time options (batch size, metadata locations, ...) override the
    # options stored in the checkpoint.
    vars(checkpoint_opt).update(vars(opt))

    logger.info('Building model...')
    net.load_state_dict(checkpoint['model'])

    if torch.cuda.is_available():
        net = net.cuda()

    logger.info('Start testing...')
    test_video(net,
               opt.input_file,
               opt.output_file,
               checkpoint_opt.width,
               checkpoint_opt.height,
               genline_method=opt.genline_method)
 def DataParallelModelProcess(self,
                              model,
                              ParallelModelType=1,
                              is_eval='train',
                              device='cuda'):
     """Wrap *model* for data parallelism, set its mode, and move it to *device*.

     ParallelModelType 1 selects the current DataParallelModel, 2 the
     legacy `parallel_old` variant; anything else is rejected.
     """
     if ParallelModelType == 1:
         wrapped = DataParallelModel(model)
     elif ParallelModelType == 2:
         wrapped = parallel_old.DataParallelModel(model)
     else:
         raise ValueError('ParallelModelType should be 1 or 2')
     if is_eval == 'eval':
         wrapped.eval()
     elif is_eval == 'train':
         wrapped.train()
     else:
         raise ValueError('is_eval should be eval or train')
     # Cast to float32 before placing on the target device.
     wrapped.float()
     wrapped.to(device)
     return wrapped
Example #6
0
def main(opt):
    """Load a LaneNet checkpoint and evaluate it with the requested loader.

    Dispatches on opt.loader_type: 'tusimpletest' and 'culanetest' write
    prediction files; 'dirloader' visualizes results.
    """
    logger.info('Loading model: %s', opt.model_file)

    checkpoint = torch.load(opt.model_file)

    checkpoint_opt = checkpoint['opt']

    # Load model location
    model = LaneNet(cnn_type=checkpoint_opt.cnn_type)
    # NOTE(review): wrapped before load_state_dict — presumably the
    # checkpoint was saved from the wrapped model; confirm key prefixes.
    model = DataParallelModel(model)

    # Update/Overwrite some test options like batch size, location to metadata
    # file
    vars(checkpoint_opt).update(vars(opt))

    test_loader = get_data_loader(checkpoint_opt,
                                  split='test',
                                  return_org_image=True)

    logger.info('Building model...')
    model.load_state_dict(checkpoint['model'])

    if torch.cuda.is_available():
        model = model.cuda()

    postprocessor = PostProcessor()
    clustering = LaneClustering()

    logger.info('Start testing...')

    # NOTE(review): independent `if`s, not elif — presumably only one
    # loader_type is used per run; confirm with callers.
    if opt.loader_type == 'tusimpletest':
        x_lanes, _, times, _ = test(model,
                                    test_loader,
                                    postprocessor,
                                    clustering,
                                    genline_method=opt.genline_method)
        output_tuprediction(opt.meta_file, x_lanes, times, opt.output_file)
    if opt.loader_type == 'culanetest':
        x_lanes, y_list, _, image_files = test(
            model,
            test_loader,
            postprocessor,
            clustering,
            genline_method=opt.genline_method)
        output_culaneprediction(opt.output_dir, x_lanes, y_list, image_files)
    if opt.loader_type == 'dirloader':
        visualize(model,
                  test_loader,
                  postprocessor,
                  clustering,
                  show_demo=opt.show_demo,
                  output_dir=opt.output_dir,
                  genline_method=opt.genline_method)
Example #7
0
def main(opt):
    """Load a segmentation checkpoint and run `test` on the test split."""
    logger.info('Loading model: %s', opt.model_file)

    checkpoint = torch.load(opt.model_file)

    checkpoint_opt = checkpoint['opt']

    # Update/Overwrite some test options like batch size, location to metadata
    # file
    vars(checkpoint_opt).update(vars(opt))

    logger.info('Updated input arguments: %s',
                json.dumps(vars(checkpoint_opt), sort_keys=True, indent=4))

    logger.info('Building model...')
    model = get_model(checkpoint_opt, num_classes=checkpoint_opt.num_classes)

    test_loader = get_data_loader(checkpoint_opt,
                                  training=False,
                                  return_org_image=True,
                                  data_list=opt.test_data_list)

    logger.info('Loading model parameters...')
    # NOTE(review): wrapped before loading — presumably the saved state-dict
    # keys carry the wrapper's prefix; confirm against the training code.
    model = DataParallelModel(model)
    model.load_state_dict(checkpoint['model'])

    if torch.cuda.is_available():
        model.cuda()

    logger.info('Start testing...')

    test(checkpoint_opt, model, test_loader)
Example #8
0
 def init_fn(self, shared_model=None, **kwargs):
     """Build model, optimizer, LR scheduler, criterion and loss meters.

     When *shared_model* is given it is reused as-is; otherwise a fresh
     model is created and wrapped for multi-GPU execution on `self.gpus`.
     """
     # Create auxiliary models
     self.init_auxiliary()
     if shared_model is not None:
         self.model = shared_model
     else:
         self.model = self.init_model()
         self.model = DataParallelModel(self.model.cuda(),
                                        device_ids=self.gpus)
         # self.model = torch.nn.DataParallel(self.model, device_ids=self.gpus).cuda()
     # Setup a joint optimizer for the 2 models
     self.optimizer = self.init_optimizer(self.options.optim.name)
     self.lr_scheduler = self.init_lr(self.options.optim.lr_scheduler)
     # Create loss functions
     self.criterion = self.init_loss_functions()
     # Criterion is wrapped with the same device ids as the model.
     self.criterion = DataParallelCriterion(self.criterion.cuda(),
                                            device_ids=self.gpus)
     # Create AverageMeters for losses
     self.losses = AverageMeter()
     # Evaluators
     # self.evaluators = [Evaluator(self.options, self.logger, self.summary_writer, shared_model=self.model)]
     self.dataset_size = None
Example #9
0
def define_G(ngf):
    """Build the generator; wrap it for data parallelism when enabled and
    more than one CUDA device is available."""
    generator = init_net(Generator(ngf))
    if opt.parallel and torch.cuda.device_count() > 1:
        generator = DataParallelModel(generator)
    return generator
def main(args):
    """End-to-end training entry point for the segmentation network.

    Seeds RNGs, builds the model/dataloader/criterion/optimizer, trains for
    args.epochs, and saves the final (unwrapped) weights.
    """
    # initialization
    print("Input arguments:")
    for key, val in vars(args).items():
        print("{:16} {}".format(key, val))

    if not os.path.exists(args.snapshot_dir):
        os.makedirs(args.snapshot_dir)
    writer = SummaryWriter(log_dir=os.path.join(args.log_dir, args.method))

    # Seed Python/torch RNGs; cudnn.benchmark trades determinism for speed.
    random.seed(args.seed)
    torch.manual_seed(args.seed)
    cudnn.enabled = True
    cudnn.benchmark = True

    # conduct seg network
    seg_model = get_model(num_classes=args.num_classes)

    # NOTE(review): the restored weights are only consumed by the
    # commented-out branch below, so `saved_state_dict` and `new_params` are
    # currently unused — confirm whether pretrained init should be re-enabled.
    saved_state_dict = torch.load(args.restore_from)
    new_params = seg_model.state_dict().copy()

    # if args.init:
    #     for i in saved_state_dict:
    #         i_parts = i.split('.')
    #         if not i_parts[0] == 'fc':
    #             new_params['encoder.' + '.'.join(i_parts[:])] = saved_state_dict[i]
    #     seg_model.load_state_dict(new_params)
    #     print('loading params w/o fc')
    # else:
    #     seg_model.load_state_dict(saved_state_dict)
    #     print('loading params all')

    model = DataParallelModel(seg_model)
    model.float()
    model.cuda()

    # define dataloader
    train_loader = data.DataLoader(TrainGenerator(root=args.root,
                                                  list_path=args.lst,
                                                  crop_size=args.crop_size,
                                                  max_scale=2.0),
                                   batch_size=args.batch_size,
                                   shuffle=True,
                                   num_workers=4,
                                   pin_memory=True)

    # define criterion & optimizer
    criterion = ReportLovaszLoss(ignore_index=args.ignore_label,
                                 only_present=True)
    criterion = DataParallelCriterion(criterion).cuda()

    # The optimizer holds the unwrapped `seg_model`'s parameters.
    optimizer = optim.SGD(
        [{
            'params': filter(lambda p: p.requires_grad,
                             seg_model.parameters()),
            'lr': args.learning_rate
        }],
        lr=args.learning_rate,
        momentum=0.9,
        weight_decay=5e-4)

    start = time.time()

    for epoch in range(0, args.epochs):
        print('\n{} | {}'.format(epoch, args.epochs - 1))
        # training
        _ = train(model, train_loader, epoch, criterion, optimizer, writer)

        # Only the final epoch is persisted, using the unwrapped state dict.
        if epoch == args.epochs - 1:
            model_dir = os.path.join(args.snapshot_dir,
                                     args.method + '_final.pth')
            torch.save(seg_model.state_dict(), model_dir)
            print('Model saved to %s' % model_dir)

    print('Complete using', time.time() - start, 'seconds')
    def __init__(self, args):
        """Build dataloaders, DeepLab model, optimizer, criterion, evaluator
        and LR scheduler from *args*; optionally resume from a checkpoint.
        """
        self.args = args

        # Define Saver
        self.saver = Saver(args)
        self.saver.save_experiment_config()
        # Define Tensorboard Summary
        self.summary = TensorboardSummary(self.saver.experiment_dir)
        self.writer = self.summary.create_summary()

        # Define Dataloader
        kwargs = {'num_workers': args.workers, 'pin_memory': True}
        self.train_loader, self.val_loader, self.test_loader, self.nclass = make_data_loader(
            args, **kwargs)

        # Define network
        model = DeepLab(num_classes=self.nclass,
                        backbone=args.backbone,
                        output_stride=args.out_stride,
                        sync_bn=args.sync_bn,
                        freeze_bn=args.freeze_bn)

        # Backbone params train at base LR, head params at 10x.
        train_params = [{
            'params': model.get_1x_lr_params(),
            'lr': args.lr
        }, {
            'params': model.get_10x_lr_params(),
            'lr': args.lr * 10
        }]

        # Define Optimizer
        optimizer = torch.optim.SGD(train_params,
                                    momentum=args.momentum,
                                    weight_decay=args.weight_decay,
                                    nesterov=args.nesterov)

        # Define Criterion
        # whether to use class balanced weights
        if args.use_balanced_weights:
            classes_weights_path = os.path.join(
                Path.db_root_dir(args.dataset),
                args.dataset + '_classes_weights.npy')
            if os.path.isfile(classes_weights_path):
                weight = np.load(classes_weights_path)
            else:
                weight = calculate_weigths_labels(args.dataset,
                                                  self.train_loader,
                                                  self.nclass)
            weight = torch.from_numpy(weight.astype(np.float32))
        else:
            weight = None
        self.criterion = SegmentationLosses(
            weight=weight, cuda=args.cuda).build_loss(mode=args.loss_type)
        # Wrap only when more than one GPU id is configured.
        if len(args.gpu_ids) > 1:
            self.model = DataParallelModel(model)
        else:
            self.model = model
        self.optimizer = optimizer

        # Define Evaluator
        self.evaluator = Evaluator(self.nclass)
        # Define lr scheduler
        self.scheduler = LR_Scheduler(args.lr_scheduler, args.lr, args.epochs,
                                      len(self.train_loader))

        # Using cuda
        if args.cuda:
            # self.model = torch.nn.DataParallel(self.model, device_ids=self.args.gpu_ids)
            # patch_replication_callback(self.model)
            self.model = self.model.cuda()

        # Resuming checkpoint
        self.best_pred = 0.0
        if args.resume is not None:
            if not os.path.isfile(args.resume):
                raise RuntimeError("=> no checkpoint found at '{}'".format(
                    args.resume))
            checkpoint = torch.load(args.resume)
            args.start_epoch = checkpoint['epoch']
            # NOTE(review): `.module` exists only when the model was wrapped
            # above (len(gpu_ids) > 1), yet this branches on args.cuda — a
            # single-GPU CUDA resume would fail here; confirm and fix.
            if args.cuda:
                self.model.module.load_state_dict(checkpoint['state_dict'])
            else:
                self.model.load_state_dict(checkpoint['state_dict'])
            if not args.ft:
                self.optimizer.load_state_dict(checkpoint['optimizer'])
            self.best_pred = checkpoint['best_pred']
            print("=> loaded checkpoint '{}' (epoch {})".format(
                args.resume, checkpoint['epoch']))

        # Clear start epoch if fine-tuning
        if args.ft:
            args.start_epoch = 0
class Trainer(object):
    """Training/validation driver for DeepLab semantic segmentation.

    Builds the dataloaders, model, optimizer, criterion, evaluator and LR
    scheduler from *args*, optionally resumes from a checkpoint, and exposes
    `training(epoch)` / `validation(epoch)`.
    """

    def __init__(self, args):
        self.args = args

        # Define Saver
        self.saver = Saver(args)
        self.saver.save_experiment_config()
        # Define Tensorboard Summary
        self.summary = TensorboardSummary(self.saver.experiment_dir)
        self.writer = self.summary.create_summary()

        # Define Dataloader
        kwargs = {'num_workers': args.workers, 'pin_memory': True}
        self.train_loader, self.val_loader, self.test_loader, self.nclass = make_data_loader(
            args, **kwargs)

        # Define network
        model = DeepLab(num_classes=self.nclass,
                        backbone=args.backbone,
                        output_stride=args.out_stride,
                        sync_bn=args.sync_bn,
                        freeze_bn=args.freeze_bn)

        # Backbone params train at base LR, head params at 10x.
        train_params = [{
            'params': model.get_1x_lr_params(),
            'lr': args.lr
        }, {
            'params': model.get_10x_lr_params(),
            'lr': args.lr * 10
        }]

        # Define Optimizer
        optimizer = torch.optim.SGD(train_params,
                                    momentum=args.momentum,
                                    weight_decay=args.weight_decay,
                                    nesterov=args.nesterov)

        # Define Criterion
        # whether to use class balanced weights
        if args.use_balanced_weights:
            classes_weights_path = os.path.join(
                Path.db_root_dir(args.dataset),
                args.dataset + '_classes_weights.npy')
            if os.path.isfile(classes_weights_path):
                weight = np.load(classes_weights_path)
            else:
                weight = calculate_weigths_labels(args.dataset,
                                                  self.train_loader,
                                                  self.nclass)
            weight = torch.from_numpy(weight.astype(np.float32))
        else:
            weight = None
        self.criterion = SegmentationLosses(
            weight=weight, cuda=args.cuda).build_loss(mode=args.loss_type)
        # Wrap only when more than one GPU id is configured.
        if len(args.gpu_ids) > 1:
            self.model = DataParallelModel(model)
        else:
            self.model = model
        self.optimizer = optimizer

        # Define Evaluator
        self.evaluator = Evaluator(self.nclass)
        # Define lr scheduler
        self.scheduler = LR_Scheduler(args.lr_scheduler, args.lr, args.epochs,
                                      len(self.train_loader))

        # Using cuda
        if args.cuda:
            self.model = self.model.cuda()

        # Resuming checkpoint
        self.best_pred = 0.0
        if args.resume is not None:
            if not os.path.isfile(args.resume):
                raise RuntimeError("=> no checkpoint found at '{}'".format(
                    args.resume))
            checkpoint = torch.load(args.resume)
            args.start_epoch = checkpoint['epoch']
            # BUGFIX: the model is wrapped in DataParallelModel only when
            # len(args.gpu_ids) > 1, so keying the `.module` access off
            # args.cuda crashed single-GPU CUDA runs. Checkpoints are saved
            # from the unwrapped module (see save sites below), so always
            # load into the unwrapped network.
            self._bare_model().load_state_dict(checkpoint['state_dict'])
            if not args.ft:
                self.optimizer.load_state_dict(checkpoint['optimizer'])
            self.best_pred = checkpoint['best_pred']
            print("=> loaded checkpoint '{}' (epoch {})".format(
                args.resume, checkpoint['epoch']))

        # Clear start epoch if fine-tuning
        if args.ft:
            args.start_epoch = 0

    def _bare_model(self):
        """Return the underlying network, unwrapping the data-parallel
        wrapper when present."""
        return self.model.module if hasattr(self.model, 'module') else self.model

    def training(self, epoch):
        """Run one training epoch; logs losses and, with args.no_val,
        checkpoints at the end of the epoch."""
        train_loss = 0.0
        self.model.train()
        tbar = tqdm(self.train_loader)
        num_img_tr = len(self.train_loader)
        # BUGFIX: `num_img_tr // 10` is 0 for loaders with fewer than 10
        # batches, making `i % (num_img_tr // 10)` raise ZeroDivisionError.
        vis_interval = max(1, num_img_tr // 10)
        for i, sample in enumerate(tbar):
            image, target = sample['image'], sample['label']

            if self.args.cuda:
                image, target = image.cuda(), target.cuda()
            # Skip degenerate batches of size 1 — presumably to avoid
            # BatchNorm issues; confirm.
            if image.shape[0] == 1:
                continue
            self.scheduler(self.optimizer, i, epoch, self.best_pred)
            self.optimizer.zero_grad()
            output = self.model(image)
            loss = self.criterion(output, target)
            loss.backward()
            self.optimizer.step()
            train_loss += loss.item()
            tbar.set_description('Train loss: %.3f' % (train_loss / (i + 1)))
            self.writer.add_scalar('train/total_loss_iter', loss.item(),
                                   i + num_img_tr * epoch)

            # Show 10 * 3 inference results each epoch
            if i % vis_interval == 0:
                global_step = i + num_img_tr * epoch
                self.summary.visualize_image(self.writer, self.args.dataset,
                                             image, target, output,
                                             global_step)

        self.writer.add_scalar('train/total_loss_epoch', train_loss, epoch)
        print('[Epoch: %d, numImages: %5d]' %
              (epoch, i * self.args.batch_size + image.data.shape[0]))
        print('Loss: %.3f' % train_loss)

        if self.args.no_val:
            # save checkpoint every epoch
            is_best = False
            # BUGFIX: `self.model.module` only exists when the model was
            # wrapped for multi-GPU; unwrap conditionally.
            self.saver.save_checkpoint(
                {
                    'epoch': epoch + 1,
                    'state_dict': self._bare_model().state_dict(),
                    'optimizer': self.optimizer.state_dict(),
                    'best_pred': self.best_pred,
                }, is_best)

    def validation(self, epoch):
        """Run one validation epoch; logs metrics and checkpoints whenever
        mIoU improves on the best seen so far."""
        self.model.eval()
        self.evaluator.reset()
        tbar = tqdm(self.val_loader, desc='\r')
        test_loss = 0.0
        for i, sample in enumerate(tbar):
            image, target = sample['image'], sample['label']
            if self.args.cuda:
                image, target = image.cuda(), target.cuda()
            with torch.no_grad():
                output = self.model(image)
            loss = self.criterion(output, target)
            test_loss += loss.item()
            tbar.set_description('Test loss: %.3f' % (test_loss / (i + 1)))
            pred = output.data.cpu().numpy()
            target = target.cpu().numpy()
            # Class prediction = argmax over the channel axis.
            pred = np.argmax(pred, axis=1)
            # Add batch sample into evaluator
            self.evaluator.add_batch(target, pred)

        # Fast test during the training
        Acc = self.evaluator.Pixel_Accuracy()
        Acc_class = self.evaluator.Pixel_Accuracy_Class()
        mIoU = self.evaluator.Mean_Intersection_over_Union()
        FWIoU = self.evaluator.Frequency_Weighted_Intersection_over_Union()
        self.writer.add_scalar('val/total_loss_epoch', test_loss, epoch)
        self.writer.add_scalar('val/mIoU', mIoU, epoch)
        self.writer.add_scalar('val/Acc', Acc, epoch)
        self.writer.add_scalar('val/Acc_class', Acc_class, epoch)
        self.writer.add_scalar('val/fwIoU', FWIoU, epoch)
        print('Validation:')
        print('[Epoch: %d, numImages: %5d]' %
              (epoch, i * self.args.batch_size + image.data.shape[0]))
        print("Acc:{}, Acc_class:{}, mIoU:{}, fwIoU: {}".format(
            Acc, Acc_class, mIoU, FWIoU))
        print('Loss: %.3f' % test_loss)

        new_pred = mIoU
        if new_pred > self.best_pred:
            is_best = True
            self.best_pred = new_pred
            # BUGFIX: unwrap conditionally (see training()).
            self.saver.save_checkpoint(
                {
                    'epoch': epoch + 1,
                    'state_dict': self._bare_model().state_dict(),
                    'optimizer': self.optimizer.state_dict(),
                    'best_pred': self.best_pred,
                }, is_best)
    if not os.path.exists(snapshot_dir):
        os.makedirs(snapshot_dir)

    deeplab = get_model(num_classes=num_classes)

    # load pretrained ResNet101 backbone:
    saved_state_dict = torch.load(restore_from)
    new_params = deeplab.state_dict().copy()
    for i in saved_state_dict:
        i_parts = i.split('.')
        if not i_parts[0] == 'fc' and not i_parts[
                0] == 'last_linear' and not i_parts[0] == 'classifier':
            new_params['.'.join(i_parts[0:])] = saved_state_dict[i]
    deeplab.load_state_dict(new_params)

    model = DataParallelModel(deeplab)
    model.train()
    model.float()
    model.cuda()

    criterion = CriterionCrossEntropy()
    criterion = DataParallelCriterion(criterion)
    criterion.cuda()

    train_dataset = DatasetCityscapesAugmentation(root=data_dir,
                                                  list_path=data_list,
                                                  max_iters=num_steps *
                                                  batch_size,
                                                  crop_size=crop_size)
    train_loader = data.DataLoader(dataset=train_dataset,
                                   batch_size=batch_size,
Example #14
0
def train_net(ctx, pretrained, epoch, prefix, begin_epoch, end_epoch, lr,
              lr_step):
    """Train Faster R-CNN on Pascal VOC with Gluon data parallelism.

    Builds the network, initializes/loads parameters, trains epoch by epoch
    through `net_parallel`, evaluates VOC07 mAP@0.5 after each epoch, and
    saves a checkpoint per epoch.

    NOTE(review): `pretrained`, `epoch`, `prefix` and `end_epoch` are unused
    in the body (the loop ends at config.TRAIN.end_epoch) — confirm.
    """
    # Fixed seeds for repeatability.
    mx.random.seed(3)
    np.random.seed(3)

    # One sample per device: batch size equals the number of contexts.
    batch_size = len(ctx)
    backbone = ResNetV1()
    feat_symbol = backbone(mx.symbol.var(name="data"))[0]
    net = FasterRCNN(config, backbone)

    params = net.collect_params()
    params_pretrained = None  #
    # uncommit the following line to load pretrained model.
    # params_pretrained = mx.nd.load("pretrained/rfcn-voc-resnet50_v1--29-0.804082102562.params")
    if params_pretrained is not None:
        for k in params.keys():
            try:
                params[k]._load_init(params_pretrained[k], mx.cpu())
            except Exception as e:
                logging.exception(e)
    # Initialize any parameter that was not loaded above: zeros for
    # bias/offset, normal noise otherwise.
    for key in params.keys():
        if params[key]._data is None:
            default_init = mx.init.Zero(
            ) if "bias" in key or "offset" in key else mx.init.Normal()
            default_init.set_verbosity(True)
            if params[key].init is not None:
                params[key].init.set_verbosity(True)
                params[key].initialize(init=params[key].init,
                                       default_init=params[key].init)
            else:
                params[key].initialize(default_init=default_init)
    net.collect_params().reset_ctx(list(set(ctx)))
    import data.transforms.bbox as bbox_t
    train_transforms = bbox_t.Compose([
        # bbox_t.RandomRotate(bound=True, min_angle=-15, max_angle=15),
        bbox_t.Resize(target_size=config.SCALES[0][0],
                      max_size=config.SCALES[0][1]),
        bbox_t.Normalize(),
        bbox_t.AssignAnchor(config, feat_strides=(16, 16), symbol=feat_symbol)
    ])
    # NOTE(review): val_transforms is never used below — validation reads
    # raw images via at_with_image_path; confirm it can be removed.
    val_transforms = bbox_t.Compose([
        bbox_t.Resize(target_size=config.SCALES[0][0],
                      max_size=config.SCALES[0][1]),
        bbox_t.Normalize(),
    ])

    train_dataset = VOCDetection(root=config.dataset.dataset_path,
                                 splits=((2007, 'trainval'), (2012,
                                                              'trainval')),
                                 transform=train_transforms)
    val_dataset = VOCDetection(root=config.dataset.dataset_path,
                               splits=((2007, 'test'), ))

    train_loader = DataLoader(train_dataset, batchsize=len(ctx))
    # train_loader = mx.gluon.data.DataLoader(dataset=train_dataset, batch_size=len(ctx), batchify_fn=lambda x: x,
    #                                         pin_memory=True, num_workers=8, last_batch="discard")

    rpn_eval_metric = RPNAccuMetric()
    loss_rpn_cls_metric = mx.metric.Loss(name="rpn_cls")
    loss_rpn_loc_metric = mx.metric.Loss(name="rpn_loc")
    loss_rcnn_cls_metric = mx.metric.Loss(name="rcnn_cls")
    loss_rcnn_loc_metric = mx.metric.Loss(name="rcnn_loc")

    eval_metrics = mx.metric.CompositeEvalMetric()
    for child_metric in [
            rpn_eval_metric, loss_rpn_cls_metric, loss_rpn_loc_metric,
            loss_rcnn_cls_metric, loss_rcnn_loc_metric
    ]:
        eval_metrics.add(child_metric)

    # Freeze parameters whose names match config.network.FIXED_PARAMS by
    # setting grad_req='null'.
    # NOTE(review): params_to_train is collected but the Trainer below is
    # given all params; frozen ones simply receive no gradients — confirm.
    params_all = net.collect_params()
    params_to_train = {}
    params_fixed_prefix = config.network.FIXED_PARAMS
    for p in params_all.keys():
        ignore = False
        for f in params_fixed_prefix:
            if f in str(p):
                ignore = True
                params_all[p].grad_req = 'null'
                logging.info("{} is ignored when training.".format(p))
        if not ignore: params_to_train[p] = params_all[p]
    # Step-decay LR schedule with warmup, expressed in iterations.
    base_lr = lr
    lr_factor = config.TRAIN.lr_factor
    lr_epoch = [float(epoch) for epoch in lr_step.split(',')]
    lr_epoch_diff = [
        epoch - begin_epoch for epoch in lr_epoch if epoch > begin_epoch
    ]
    lr = base_lr * (lr_factor**(len(lr_epoch) - len(lr_epoch_diff)))
    lr_iters = [
        int(epoch * len(train_dataset) / batch_size) for epoch in lr_epoch_diff
    ]
    print('lr', lr, 'lr_epoch_diff', lr_epoch_diff, 'lr_iters', lr_iters)
    lr_scheduler = WarmupMultiFactorScheduler(lr_iters, lr_factor,
                                              config.TRAIN.warmup,
                                              config.TRAIN.warmup_lr,
                                              config.TRAIN.warmup_step)

    # NOTE(review): the Trainer uses config.TRAIN.lr, not the adjusted `lr`
    # computed above — confirm which is intended.
    trainer = mx.gluon.Trainer(
        net.collect_params(),  # fix batchnorm, fix first stage, etc...
        'sgd',
        {
            'learning_rate': config.TRAIN.lr,
            'wd': config.TRAIN.wd,
            'momentum': config.TRAIN.momentum,
            'clip_gradient': None,
            'lr_scheduler': lr_scheduler
        })
    val_metric_5 = VOC07MApMetric(iou_thresh=.5)

    net_with_criterion = RCNNWithCriterion(base_net=net)
    net_parallel = DataParallelModel(net_with_criterion,
                                     ctx_list=ctx,
                                     sync=True)

    for epoch in range(begin_epoch, config.TRAIN.end_epoch):
        # train_data.reset()
        net.hybridize(static_alloc=True, static_shape=False)
        # Presumably a warm-up forward pass after hybridize — confirm.
        _ = net(mx.random.randn(1, 3, 512, 512, ctx=ctx[0]),
                mx.nd.array([[512, 512, 1]], ctx=ctx[0]))
        for nbatch, data_batch in enumerate(
                tqdm.tqdm(train_loader,
                          total=len(train_dataset) // batch_size)):
            # Move each device's share of the batch to its context.
            inputs = [[x.as_in_context(c) for x in d]
                      for c, d in zip(ctx, data_batch)]
            losses = []
            with ag.record():
                outputs = net_parallel(*inputs)
                for output in outputs:
                    loss_rpn_cls, loss_rpn_loc, loss_rcnn_cls, loss_rcnn_loc, rpn_label, rpn_cls_score = output
                    # Metrics are only updated every 4th batch.
                    if nbatch % 4 == 0:
                        rpn_eval_metric.update(rpn_label, rpn_cls_score)
                        loss_rpn_cls_metric.update(None, loss_rpn_cls)
                        loss_rpn_loc_metric.update(None, loss_rpn_loc)
                        loss_rcnn_cls_metric.update(None, loss_rcnn_cls)
                        loss_rcnn_loc_metric.update(None, loss_rcnn_loc)
                    losses.extend([
                        loss_rpn_cls, loss_rpn_loc, loss_rcnn_cls,
                        loss_rcnn_loc
                    ])
            ag.backward(losses)
            trainer.step(1, ignore_stale_grad=True)
            # Log metrics and current LR every 100 batches.
            if nbatch % 100 == 0:
                msg = ','.join([
                    '{}={:.3f}'.format(w, v)
                    for w, v in zip(*eval_metrics.get())
                ])
                msg += ",lr={}".format(trainer.learning_rate)
                logging.info(msg)
                rpn_eval_metric.reset()
        # Per-epoch evaluation on the full validation set.
        val_metric_5.reset()
        net.hybridize(static_alloc=True, static_shape=False)
        for i in tqdm.tqdm(range(len(val_dataset))):
            img_path, gt_boxes = val_dataset.at_with_image_path(i)
            pred_bboxes, pred_scores, pred_clsid = im_detect_bbox_aug(
                net,
                nms_threshold=config.TEST.NMS,
                im=cv2.imread(img_path)[:, :, ::-1],  # bgr
                scales=config.SCALES,
                ctx=ctx,
                bbox_stds=config.TRAIN.BBOX_STDS,
                threshold=1e-3,
                viz=False)
            val_metric_5.update(pred_bboxes=pred_bboxes[np.newaxis],
                                pred_labels=pred_clsid[np.newaxis] - 1,
                                pred_scores=pred_scores[np.newaxis],
                                gt_bboxes=gt_boxes[np.newaxis, :, :4],
                                gt_labels=gt_boxes[np.newaxis, :, 4],
                                gt_difficults=gt_boxes[np.newaxis, :, 5])
        re = val_metric_5.get()
        logging.info(re)
        # Checkpoint named after the epoch and its validation score.
        save_path = "{}-{}-{}.params".format(config.TRAIN.model_prefix, epoch,
                                             re[1])
        net.collect_params().save(save_path)
        logging.info("Saved checkpoint to {}.".format(save_path))
Example #15
0
def train_net(ctx, begin_epoch, lr, lr_step):
    """Train a PyramidRFCN detector on COCO with Gluon data parallelism.

    Args:
        ctx: list of mx.Context, one entry per image slot per step
            (duplicate contexts mean several images per GPU).
        begin_epoch: epoch index to resume training from.
        lr: base learning rate before step-decay adjustment.
        lr_step: comma-separated string of epochs at which to decay the lr.
    """
    # Fix seeds for reproducibility.
    mx.random.seed(3)
    np.random.seed(3)

    batch_size = len(ctx)  # one image per context entry
    backbone = ResNetV1(num_devices=len(set(ctx)), num_layers=50, sync_bn=config.network.SYNC_BN, pretrained=True)
    # Symbolic forward pass is used only to infer feature-map shapes for
    # the pyramid anchor-assignment transform below.
    feat_symbol = backbone(mx.sym.var(name="data"))
    net = PyramidRFCN(config, backbone)

    # Resume parameters from a symbolic-API checkpoint (disabled: resume is None).
    resume = None
    if resume is not None:
        params_coco = mx.nd.load(resume)
        for k in params_coco:
            # Strip the "arg:"/"aux:" prefixes used by the symbolic API.
            params_coco[k.replace("arg:", "").replace("aux:", "")] = params_coco.pop(k)
        params = net.collect_params()

        for k in params.keys():
            try:
                params[k]._load_init(params_coco[k], ctx=mx.cpu())
            except Exception as e:
                # Missing/mismatched keys are logged and left to the
                # default initializer below.
                logging.exception(e)

    # Initialize every parameter that still has no data: zeros for
    # bias/offset terms, gaussian noise for everything else.
    params = net.collect_params()
    for key in params.keys():
        if params[key]._data is None:
            default_init = mx.init.Zero() if "bias" in key or "offset" in key else mx.init.Normal()
            default_init.set_verbosity(True)
            if params[key].init is not None and hasattr(params[key].init, "set_verbosity"):
                params[key].init.set_verbosity(True)
                params[key].initialize(init=params[key].init, default_init=params[key].init)
            else:
                params[key].initialize(default_init=default_init)

    net.collect_params().reset_ctx(list(set(ctx)))
    import data.transforms.bbox as bbox_t
    train_transforms = bbox_t.Compose([
        # Flipping is implemented in dataset.
        bbox_t.Resize(target_size=config.SCALES[0][0], max_size=config.SCALES[0][1]),
        bbox_t.Normalize(),
        bbox_t.AssignPyramidAnchor(config, symbol=feat_symbol, pad_n=32)
    ])
    val_transforms = bbox_t.Compose([
        bbox_t.Resize(target_size=config.SCALES[0][0], max_size=config.SCALES[0][1]),
        bbox_t.Normalize(),
    ])
    from data.bbox.mscoco import COCODetection
    # Fixed: a stray trailing '.' after dataset_path was a syntax error.
    val_dataset = COCODetection(root=config.dataset.dataset_path, splits=("instances_val2017",), h_flip=False)
    train_dataset = COCODetection(root=config.dataset.dataset_path, splits=("instances_train2017",),
                                  h_flip=config.TRAIN.FLIP,
                                  transform=train_transforms)

    train_loader = mx.gluon.data.DataLoader(dataset=train_dataset, batch_size=len(ctx), batchify_fn=batch_fn,
                                            pin_memory=False, num_workers=0, last_batch="discard", shuffle=True)

    # Training metrics: RPN accuracy plus the four loss terms.
    rpn_eval_metric = RPNAccuMetric()
    loss_rpn_cls_metric = mx.metric.Loss(name="rpn_cls")
    loss_rpn_loc_metric = mx.metric.Loss(name="rpn_loc")
    loss_rcnn_cls_metric = mx.metric.Loss(name="rcnn_cls")
    loss_rcnn_loc_metric = mx.metric.Loss(name="rcnn_loc")

    eval_metrics = mx.metric.CompositeEvalMetric()
    for child_metric in [rpn_eval_metric, loss_rpn_cls_metric, loss_rpn_loc_metric, loss_rcnn_cls_metric,
                         loss_rcnn_loc_metric]:
        eval_metrics.add(child_metric)

    # Freeze parameters whose name matches any FIXED_PARAMS prefix.
    params_all = net.collect_params()
    params_to_train = {}
    params_fixed_prefix = config.network.FIXED_PARAMS
    for p in params_all.keys():
        ignore = False
        if params_fixed_prefix is not None:
            for f in params_fixed_prefix:
                if f in str(p):
                    ignore = True
                    params_all[p].grad_req = 'null'
                    logging.info("{} is ignored when training.".format(p))
        if not ignore:
            params_to_train[p] = params_all[p]

    # Step-decay schedule with warmup; decay iterations are derived from epochs.
    base_lr = lr
    lr_factor = config.TRAIN.lr_factor
    lr_epoch = [float(epoch) for epoch in lr_step.split(',')]
    lr_epoch_diff = [epoch - begin_epoch for epoch in lr_epoch if epoch > begin_epoch]
    lr = base_lr * (lr_factor ** (len(lr_epoch) - len(lr_epoch_diff)))
    lr_iters = [int(epoch * len(train_dataset) / batch_size) for epoch in lr_epoch_diff]
    print('lr', lr, 'lr_epoch_diff', lr_epoch_diff, 'lr_iters', lr_iters)
    lr_scheduler = WarmupMultiFactorScheduler(lr_iters, lr_factor, config.TRAIN.warmup, config.TRAIN.warmup_lr,
                                              config.TRAIN.warmup_step)

    # NOTE(review): the trainer is seeded with config.TRAIN.lr, not the
    # decayed `lr` computed above — when resuming past a decay step the
    # schedule restarts from the configured base rate. Confirm intended.
    trainer = mx.gluon.Trainer(
        params_to_train,  # fix batchnorm, fix first stage, etc...
        'sgd',
        {'learning_rate': config.TRAIN.lr,
         'wd': config.TRAIN.wd,
         'momentum': config.TRAIN.momentum,
         'clip_gradient': None,
         'lr_scheduler': lr_scheduler
         })
    val_metric_5 = VOC07MApMetric(iou_thresh=.5)

    net_with_criterion = RCNNWithCriterion(base_net=net)
    # Fixed: `is 1` compared object identity on an int (and `True if x
    # else False` was redundant); use a plain equality test.
    net_parallel = DataParallelModel(net_with_criterion, ctx_list=ctx,
                                     sync=config.network.IM_PER_GPU == 1)

    for epoch in range(begin_epoch, config.TRAIN.end_epoch):
        eval_metrics.reset()
        net.feature_extractor.hybridize(static_alloc=True, static_shape=False)
        # One dummy forward pass to trigger shape inference/allocation.
        _ = net(mx.random.randn(1, 3, 512, 512, ctx=ctx[0]), mx.nd.array([[512, 512, 1]], ctx=ctx[0]))
        for nbatch, data_batch in enumerate(tqdm.tqdm(train_loader, total=len(train_dataset) // batch_size,
                                                      unit_scale=batch_size)):
            # Scatter each per-image sample to its device.
            inputs = [[x.as_in_context(c) for x in d] for c, d in zip(ctx, data_batch)]
            losses = []
            net.collect_params().zero_grad()
            with ag.record():
                outputs = net_parallel(*inputs)
            for output in outputs:
                loss_rpn_cls, loss_rpn_loc, loss_rcnn_cls, loss_rcnn_loc, rpn_label, rpn_cls_score = output
                # Metric updates are sampled (1 in 4 batches) to cut overhead.
                if nbatch % 4 == 0:
                    rpn_eval_metric.update(rpn_label, rpn_cls_score)
                    loss_rpn_cls_metric.update(None, loss_rpn_cls)
                    loss_rpn_loc_metric.update(None, loss_rpn_loc)
                    loss_rcnn_cls_metric.update(None, loss_rcnn_cls)
                    loss_rcnn_loc_metric.update(None, loss_rcnn_loc)
                losses.extend([loss_rpn_cls, loss_rpn_loc, loss_rcnn_cls, loss_rcnn_loc])
            ag.backward(losses)
            # Normalize the update by the number of images in the batch.
            trainer.step(len(ctx), ignore_stale_grad=True)
            if nbatch % 100 == 0:
                msg = ','.join(['{}={:.3f}'.format(w, v) for w, v in zip(*eval_metrics.get())])
                msg += ",lr={}".format(trainer.learning_rate)
                logging.info(msg)
                rpn_eval_metric.reset()
            if nbatch % 10000 == 0:
                # Periodic intra-epoch checkpoint (also saves trainer state).
                save_path = "{}-{}-{}.params".format(config.TRAIN.model_prefix, epoch, nbatch)
                net.collect_params().save(save_path)
                trainer.save_states(config.TRAIN.model_prefix + "-trainer.states")
                logging.info("Saved checkpoint to {}.".format(save_path))
        # Per-epoch validation is disabled; a placeholder mAP is logged so
        # the checkpoint filename format stays stable.
        re = ("mAP", "0.0")
        logging.info(re)
        save_path = "{}-{}-{}.params".format(config.TRAIN.model_prefix, epoch, re[1])
        net.collect_params().save(save_path)
        trainer.save_states(config.TRAIN.model_prefix + "-trainer.states")
        logging.info("Saved checkpoint to {}.".format(save_path))
Example #16
0
class Trainer(CheckpointRunner):
    """Generic training runner: builds the model, optimizer, scheduler and
    criterion, then drives the epoch/step loop with logging and checkpoints.

    Subclasses must implement init_model() and init_loss_functions().
    """

    # noinspection PyAttributeOutsideInit
    def init_fn(self, shared_model=None, **kwargs):
        """Set up model, optimizer, lr scheduler, criterion and loss meter.

        If `shared_model` is given it is adopted as-is (assumed already
        wrapped and placed on the right devices); otherwise a fresh model
        is created and wrapped in DataParallelModel on self.gpus.
        """
        # Create auxiliary models (no-op by default).
        self.init_auxiliary()
        if shared_model is not None:
            self.model = shared_model
        else:
            self.model = self.init_model()
            self.model = DataParallelModel(self.model.cuda(),
                                           device_ids=self.gpus)
        # Setup a joint optimizer for the 2 models.
        self.optimizer = self.init_optimizer(self.options.optim.name)
        self.lr_scheduler = self.init_lr(self.options.optim.lr_scheduler)
        # Create loss functions; the criterion is parallelized so per-GPU
        # outputs are scored on their own device.
        self.criterion = self.init_loss_functions()
        self.criterion = DataParallelCriterion(self.criterion.cuda(),
                                               device_ids=self.gpus)
        # Create AverageMeters for losses.
        self.losses = AverageMeter()
        # Lazily set once the dataloader is built in train().
        self.dataset_size = None

    def init_auxiliary(self):
        """Hook for subclasses that need extra models; default is a no-op."""
        pass

    def init_model(self):
        """Return the model to train; must be overridden."""
        raise NotImplementedError("Your model is not found")

    def init_loss_functions(self):
        """Return the criterion module; must be overridden."""
        raise NotImplementedError("Your loss is not found")

    def init_optimizer(self, optim_name):
        """Build the optimizer named `optim_name`.

        Returns a single optimizer for "adam"/"sgd", or a dict with
        separate discriminator/generator optimizers for "adam_gan".
        Raises NotImplementedError for any other name.
        """
        if optim_name == "adam":
            optimizer = torch.optim.Adam(params=list(self.model.parameters()),
                                         lr=self.options.optim.lr,
                                         betas=(self.options.optim.adam_beta1,
                                                0.999),
                                         weight_decay=self.options.optim.wd)
        elif optim_name == "sgd":
            optimizer = torch.optim.SGD(
                params=list(self.model.parameters()),
                lr=self.options.optim.lr,
                momentum=self.options.optim.sgd_momentum,
                weight_decay=self.options.optim.wd)
        elif optim_name == "adam_gan":
            # GAN training keeps D and G on independent optimizers.
            optimizer_d = torch.optim.Adam(
                params=list(self.model.module.D.parameters()),
                lr=self.options.optim.lr_d,
                betas=(self.options.optim.adam_beta1, 0.999),
                weight_decay=0)
            optimizer_g = torch.optim.Adam(
                params=list(self.model.module.G.parameters()),
                lr=self.options.optim.lr_g,
                betas=(self.options.optim.adam_beta1, 0.999),
                weight_decay=0)
            return {"optimizer_d": optimizer_d, "optimizer_g": optimizer_g}
        else:
            raise NotImplementedError("Your optimizer is not found")
        return optimizer

    def init_lr(self, lr_scheduler_name):
        """Build the lr scheduler by name; returns None for unknown names."""
        if lr_scheduler_name == "multistep":
            lr_scheduler = torch.optim.lr_scheduler.MultiStepLR(
                self.optimizer, self.options.optim.lr_step,
                self.options.optim.lr_factor)
        elif lr_scheduler_name == "exp":
            lr_scheduler = torch.optim.lr_scheduler.ExponentialLR(
                self.optimizer, gamma=self.options.optim.lr_gamma)
        elif lr_scheduler_name == "multistep_gan":
            # Only the discriminator optimizer is scheduled here.
            lr_scheduler = torch.optim.lr_scheduler.MultiStepLR(
                self.optimizer["optimizer_d"], self.options.optim.lr_step,
                self.options.optim.lr_factor)
        else:
            # Fixed: was `r_scheduler = None`, which left lr_scheduler
            # unbound and raised UnboundLocalError at the return below.
            lr_scheduler = None

        return lr_scheduler

    def models_dict(self):
        """Models to include in checkpoints."""
        return {'model': self.model}

    def optimizers_dict(self):
        """Optimizer state to include in checkpoints."""
        return {'optimizer': self.optimizer, 'lr_scheduler': self.lr_scheduler}

    def train_step(self, input_batch):
        """Run one forward/backward/update step on `input_batch`.

        Returns detached model outputs and loss summary for visualization.
        """
        # Grab data from the batch, predict with model.
        out = self.model(input_batch)
        # Compute loss and track its running average.
        loss, loss_summary = self.criterion(out, input_batch)
        self.losses.update(loss.detach().cpu().item())
        # Do backprop.
        self.optimizer.zero_grad()
        loss.backward()
        self.optimizer.step()
        # Pack output arguments to be used for visualization.
        return recursive_detach(out), recursive_detach(loss_summary)

    def get_dataloader(self):
        """Build the training DataLoader (batch scales with GPU count)."""
        data_loader = DataLoader(self.dataset,
                                 batch_size=self.options.train.batch_size *
                                 self.options.num_gpus,
                                 num_workers=self.options.num_workers,
                                 pin_memory=self.options.pin_memory,
                                 shuffle=self.options.train.shuffle)
        return data_loader

    def train(self):
        """Run the full training loop for the configured number of epochs."""
        # Fixed typo in log message ("Trainning").
        self.logger.info("Start training.")
        # Create data loader at the very beginning.
        train_data_loader = self.get_dataloader()
        self.dataset_size = len(train_data_loader)

        # Run training for num_epochs epochs.
        for epoch in range(self.epoch_count, self.options.train.num_epochs):
            self.epoch_count += 1
            # Reset the running loss meter each epoch.
            self.losses.reset()
            # Iterate over all batches in an epoch.
            for step, batch in enumerate(train_data_loader):
                # Send tensor inputs to GPU; leave other values untouched.
                batch = {
                    k: v.cuda() if isinstance(v, torch.Tensor) else v
                    for k, v in batch.items()
                }
                # Run training step.
                out = self.train_step(batch)
                self.step_count += 1
                # Tensorboard logging every summary_steps steps.
                if self.step_count % self.options.train.summary_steps == 0:
                    self.train_summaries(batch, *out)
                # Save checkpoint every checkpoint_steps steps.
                if self.step_count % self.options.train.checkpoint_steps == 0:
                    self.dump_checkpoint()
            # GAN models skip the per-epoch checkpoint.
            if not self.options.model.name.endswith('gan'):
                self.dump_checkpoint()
            if self.lr_scheduler is not None:
                self.lr_scheduler.step()

    def train_summaries(self, input_batch, out_summary, loss_summary):
        """Log one summary snapshot to Tensorboard and the text log."""
        # Debug info for filenames.
        self.logger.debug(input_batch["filename"])
        # Save results in Tensorboard.
        self.tensorboard_step(loss_summary)
        # Save results to log.
        self.log_step(loss_summary)

    def log_step(self, loss_summary):
        """Log progress and current/average loss to the text logger."""
        self.logger.info(
            "Epoch %03d, Step %06d/%06d, Time elapsed %s, Loss %.5f (AvgLoss %.5f)"
            % (self.epoch_count, self.step_count,
               self.options.train.num_epochs * len(self.dataset) //
               (self.options.train.batch_size * self.options.num_gpus),
               self.time_elapsed, self.losses.val, self.losses.avg))

    def tensorboard_step(self, loss_summary):
        """Write each scalar in `loss_summary` to Tensorboard."""
        for k, v in loss_summary.items():
            self.summary_writer.add_scalar(k, v, self.step_count)

    def init_with_pretrained_backbone(self):
        """Load backbone weights from the configured checkpoint (non-strict)."""
        checkpoint_file = os.path.abspath(
            self.options.train.backbone_pretrained_model)
        pretrained_dict = torch.load(checkpoint_file)
        self.model.module.load_state_dict(pretrained_dict, strict=False)
        self.logger.info("Init with pre-trained backbone from %s." %
                         checkpoint_file)

    def test(self):
        """Run all evaluators in eval mode, then restore train mode."""
        self.model.eval()
        for evaluator in self.evaluators:
            evaluator.evaluate()
        self.model.train()
Example #17
0
def main():
    """Build a segmentation model from a pretrained backbone and train it
    for args.num_steps iterations, snapshotting weights periodically."""
    print("Input arguments:")
    for key, val in vars(args).items():
        print("{:16} {}".format(key, val))

    # Fixed seeds for reproducibility (CUDA kernels may still be nondeterministic).
    random.seed(args.seed)
    torch.manual_seed(args.seed)

    writer = SummaryWriter(args.snapshot_dir)
    os.environ["CUDA_VISIBLE_DEVICES"]=args.gpu
    h, w = map(int, args.input_size.split(','))
    input_size = (h, w)
    cudnn.enabled = True

    deeplab = get_segmentation_model("_".join([args.network, args.method]), num_classes=args.num_classes)

    # Remap the pretrained checkpoint's keys onto this model's state dict,
    # dropping classifier/fc heads that do not transfer across tasks.
    saved_state_dict = torch.load(args.restore_from)
    new_params = deeplab.state_dict().copy()

    if 'wide' in args.network:
        saved_state_dict = saved_state_dict['state_dict']
        if 'vistas' in args.method:
            # Vistas checkpoints nest the backbone under 'body'; keys match directly.
            saved_state_dict = saved_state_dict['body']
            for i in saved_state_dict:
                new_params[i] = saved_state_dict[i]
        else:
            # Strip the leading module name and skip classifier weights.
            for i in saved_state_dict:
                i_parts = i.split('.')
                if not 'classifier' in i_parts:
                    new_params['.'.join(i_parts[1:])] = saved_state_dict[i]
    elif 'mobilenet' in args.network:
        # Skip the final feature block (features.18) and the classifier head.
        for i in saved_state_dict:
            i_parts = i.split('.')
            if not (i_parts[0]=='features' and i_parts[1]=='18') and not i_parts[0]=='classifier':
                new_params['.'.join(i_parts[0:])] = saved_state_dict[i]
    else:
        # Generic backbones: drop any fully-connected/classifier head.
        for i in saved_state_dict:
            i_parts = i.split('.')
            if not i_parts[0]=='fc' and not  i_parts[0]=='last_linear' and not  i_parts[0]=='classifier':
                new_params['.'.join(i_parts[0:])] = saved_state_dict[i]

    # When resuming mid-training the checkpoint already matches the model;
    # otherwise load the remapped backbone-only weights.
    if args.start_iters > 0:
        deeplab.load_state_dict(saved_state_dict)
    else:
        deeplab.load_state_dict(new_params)

    model = DataParallelModel(deeplab)
    # model = nn.DataParallel(deeplab)
    model.train()
    model.float()
    model.cuda()

    # Criterion selection: plain cross-entropy, or DSN variants with
    # optional OHEM (hard-example mining) when requested.
    criterion = CriterionCrossEntropy()
    if "dsn" in args.method:
        if args.ohem:
            if args.ohem_single:
                print('use ohem only for the second prediction map.')
                criterion = CriterionOhemDSN_single(thres=args.ohem_thres, min_kept=args.ohem_keep, dsn_weight=float(args.dsn_weight))
            else:
                criterion = CriterionOhemDSN(thres=args.ohem_thres, min_kept=args.ohem_keep, dsn_weight=float(args.dsn_weight), use_weight=True)
        else:
            criterion = CriterionDSN(dsn_weight=float(args.dsn_weight), use_weight=True)

    # Parallelized criterion scores each GPU's outputs on its own device.
    criterion = DataParallelCriterion(criterion)
    criterion.cuda()
    cudnn.benchmark = True

    if not os.path.exists(args.snapshot_dir):
        os.makedirs(args.snapshot_dir)

    # The dataset repeats itself so the loader yields num_steps * batch_size
    # samples total: one pass over the loader is the whole training run.
    trainloader = data.DataLoader(get_segmentation_dataset(args.dataset, root=args.data_dir, list_path=args.data_list,
                    max_iters=args.num_steps*args.batch_size, crop_size=input_size,
                    scale=args.random_scale, mirror=args.random_mirror, network=args.network),
                    batch_size=args.batch_size, shuffle=True, num_workers=1, pin_memory=True)

    optimizer = optim.SGD([{'params': filter(lambda p: p.requires_grad, deeplab.parameters()), 'lr': args.learning_rate }],
                lr=args.learning_rate, momentum=args.momentum, weight_decay=args.weight_decay)

    optimizer.zero_grad()

    for i_iter, batch in enumerate(trainloader):
        sys.stdout.flush()
        # Offset the iteration counter when resuming from a snapshot.
        i_iter += args.start_iters
        images, labels, _, _ = batch
        images = Variable(images.cuda())
        labels = Variable(labels.long().cuda())
        optimizer.zero_grad()
        # Per-iteration lr schedule, unless lr is frozen by --fix-lr.
        lr = adjust_learning_rate(optimizer, i_iter)
        if args.fix_lr:
            lr = args.learning_rate
        print('learning_rate: {}'.format(lr))

        # Some methods ('gt' variants) consume the labels in the forward pass.
        if 'gt' in args.method:
            preds = model(images, labels)
        else:
            preds = model(images)
        loss = criterion(preds, labels)
        loss.backward()
        optimizer.step()

        if i_iter % 100 == 0:
            writer.add_scalar('learning_rate', lr, i_iter)
            writer.add_scalar('loss', loss.data.cpu().numpy(), i_iter)
        print('iter = {} of {} completed, loss = {}'.format(i_iter, args.num_steps, loss.data.cpu().numpy()))

        # Final-iteration save uses a distinct filename, then stop.
        if i_iter >= args.num_steps-1:
            print('save model ...')
            torch.save(deeplab.state_dict(),osp.join(args.snapshot_dir, 'CS_scenes_'+str(args.num_steps)+'.pth'))
            break

        if i_iter % args.save_pred_every == 0:
            print('taking snapshot ...')
            torch.save(deeplab.state_dict(),osp.join(args.snapshot_dir, 'CS_scenes_'+str(i_iter)+'.pth'))

    # NOTE(review): `start` is presumably set at module import via
    # timeit.default_timer() — confirm it exists in the enclosing module.
    end = timeit.default_timer()
    print(end-start,'seconds')
Example #18
0
def train_net(args, ctx, pretrained, epoch, prefix, begin_epoch, end_epoch, lr,
              lr_step):
    """Train an FPN detector: build the symbolic network, wrap it for
    Gluon data-parallel training, and run the epoch loop.

    Args:
        args: parsed CLI arguments (must carry `cfg` for the logger).
        ctx: list of mx.Context; one batch slice is trained per context.
        pretrained: checkpoint prefix of the pretrained weights.
        epoch: epoch index of the pretrained checkpoint to load.
        prefix: checkpoint name prefix (joined with the output path).
        begin_epoch: first epoch to run (supports resuming).
        end_epoch: unused here; config.TRAIN.end_epoch bounds the loop.
        lr: base learning rate before step-decay adjustment.
        lr_step: comma-separated epochs at which to decay the lr.
    """
    # Fix seeds for reproducibility.
    mx.random.seed(3)
    np.random.seed(3)
    logger, final_output_path = create_logger(config.output_path, args.cfg,
                                              config.dataset.image_set)
    prefix = os.path.join(final_output_path, prefix)

    # Load the symbol module; a copy is archived with the run's outputs.
    shutil.copy2(os.path.join(curr_path, 'symbols', config.symbol + '.py'),
                 final_output_path)
    # NOTE(review): eval() on a config-derived name — fine for trusted
    # configs, but do not feed untrusted input through config.symbol.
    sym_instance = eval(config.symbol + '.' + config.symbol)()
    sym = sym_instance.get_symbol(config, is_train=True)

    # One RPN score output per pyramid level, used for anchor generation.
    feat_pyramid_level = np.log2(config.network.RPN_FEAT_STRIDE).astype(int)
    feat_sym = [
        sym.get_internals()['rpn_cls_score_p' + str(x) + '_output']
        for x in feat_pyramid_level
    ]

    # setup multi-gpu
    batch_size = len(ctx)
    input_batch_size = config.TRAIN.BATCH_IMAGES * batch_size

    # print config
    pprint.pprint(config)
    logger.info('training config:{}\n'.format(pprint.pformat(config)))

    # Load dataset and prepare imdb for training; '+' joins multiple sets.
    image_sets = [iset for iset in config.dataset.image_set.split('+')]
    roidbs = [
        load_gt_roidb(config.dataset.dataset,
                      image_set,
                      config.dataset.root_path,
                      config.dataset.dataset_path,
                      flip=config.TRAIN.FLIP) for image_set in image_sets
    ]
    roidb = merge_roidb(roidbs)
    roidb = filter_roidb(roidb, config)

    # load training data
    train_data = PyramidAnchorIterator(
        feat_sym,
        roidb,
        config,
        batch_size=input_batch_size,
        shuffle=config.TRAIN.SHUFFLE,
        ctx=ctx,
        feat_strides=config.network.RPN_FEAT_STRIDE,
        anchor_scales=config.network.ANCHOR_SCALES,
        anchor_ratios=config.network.ANCHOR_RATIOS,
        aspect_grouping=config.TRAIN.ASPECT_GROUPING,
        allowed_border=np.inf)

    # Infer maximum input shapes so buffers can be allocated once.
    max_data_shape = [('data', (config.TRAIN.BATCH_IMAGES, 3,
                                max([v[0] for v in config.SCALES]),
                                max([v[1] for v in config.SCALES])))]
    max_data_shape, max_label_shape = train_data.infer_shape(max_data_shape)
    max_data_shape.append(('gt_boxes', (config.TRAIN.BATCH_IMAGES, 100, 5)))
    # Fixed: was a Python 2 print statement (SyntaxError under Python 3).
    print('providing maximum shape', max_data_shape, max_label_shape)

    data_shape_dict = dict(train_data.provide_data_single +
                           train_data.provide_label_single)
    pprint.pprint(data_shape_dict)
    sym_instance.infer_shape(data_shape_dict)

    # Load params: resume from our own prefix, or start from the
    # pretrained checkpoint.
    if config.TRAIN.RESUME:
        print('continue training from ', begin_epoch)
        arg_params, aux_params = load_param(prefix, begin_epoch, convert=True)
    else:
        arg_params, aux_params = load_param(pretrained, epoch, convert=True)

    # Metrics: RPN accuracy/losses plus RCNN accuracy/losses.
    rpn_eval_metric = metric.RPNAccMetric()
    rpn_cls_metric = metric.RPNLogLossMetric()
    rpn_bbox_metric = metric.RPNL1LossMetric()
    rpn_fg_metric = metric.RPNFGFraction(config)
    eval_metric = metric.RCNNAccMetric(config)
    eval_fg_metric = metric.RCNNFGAccuracy(config)
    cls_metric = metric.RCNNLogLossMetric(config)
    bbox_metric = metric.RCNNL1LossMetric(config)
    eval_metrics = mx.metric.CompositeEvalMetric()
    for child_metric in [
            rpn_eval_metric, rpn_cls_metric, rpn_bbox_metric, rpn_fg_metric,
            eval_fg_metric, eval_metric, cls_metric, bbox_metric
    ]:
        eval_metrics.add(child_metric)

    # Bbox target normalization constants (per class unless class-agnostic).
    means = np.tile(np.array(config.TRAIN.BBOX_MEANS),
                    2 if config.CLASS_AGNOSTIC else config.dataset.NUM_CLASSES)
    stds = np.tile(np.array(config.TRAIN.BBOX_STDS),
                   2 if config.CLASS_AGNOSTIC else config.dataset.NUM_CLASSES)

    # Step-decay schedule with warmup; decay points converted to iterations.
    base_lr = lr
    lr_factor = config.TRAIN.lr_factor
    lr_epoch = [float(epoch) for epoch in lr_step.split(',')]
    lr_epoch_diff = [
        epoch - begin_epoch for epoch in lr_epoch if epoch > begin_epoch
    ]
    lr = base_lr * (lr_factor**(len(lr_epoch) - len(lr_epoch_diff)))
    lr_iters = [
        int(epoch * len(roidb) / batch_size) for epoch in lr_epoch_diff
    ]
    print('lr', lr, 'lr_epoch_diff', lr_epoch_diff, 'lr_iters', lr_iters)
    lr_scheduler = WarmupMultiFactorScheduler(lr_iters, lr_factor,
                                              config.TRAIN.warmup,
                                              config.TRAIN.warmup_lr,
                                              config.TRAIN.warmup_step)
    # optimizer
    optimizer_params = {
        'momentum': config.TRAIN.momentum,
        'wd': config.TRAIN.wd,
        'learning_rate': lr,
        'lr_scheduler': lr_scheduler,
        'clip_gradient': None
    }
    # Wrap the iterator so the next batch loads while the GPUs compute.
    if not isinstance(train_data, PrefetchingIter):
        train_data = PrefetchingIter(train_data)

    net = FPNNet(sym, args_pretrained=arg_params, auxes_pretrained=aux_params)

    # create multi-threaded DataParallel Model.
    net_parallel = DataParallelModel(net, ctx_list=ctx)

    # create trainer,
    # !Important: A trainer can be only created after the function `resnet_ctx` is called.
    # Please Note that DataParallelModel will call reset_ctx to initialize parameters on gpus.
    trainer = mx.gluon.Trainer(net.collect_params(), 'sgd', optimizer_params)

    for epoch in range(begin_epoch, config.TRAIN.end_epoch):
        train_data.reset()
        net.hybridize(static_alloc=True, static_shape=False)
        progress_bar = tqdm.tqdm(total=len(roidb))
        for nbatch, data_batch in enumerate(train_data):
            # Scatter data+labels to each context, cast to float32.
            inputs = [[
                x.astype('f').as_in_context(c) for x in d + l
            ] for c, d, l in zip(ctx, data_batch.data, data_batch.label)]
            with ag.record():
                outputs = net_parallel(*inputs)
                # Each context returns a tuple of losses; flatten and backprop.
                ag.backward(sum(outputs, ()))
            trainer.step(1)
            eval_metrics.update(data_batch.label[0], outputs[0])
            if nbatch % 100 == 0:
                msg = ','.join([
                    '{}={:.3f}'.format(w, v)
                    for w, v in zip(*eval_metrics.get())
                ])
                msg += ",lr={}".format(trainer.learning_rate)
                logger.info(msg)
                print(msg)
                eval_metrics.reset()
            progress_bar.update(len(inputs))
        progress_bar.close()
        net.hybridize(static_alloc=True, static_shape=False)
        # Validation is disabled; a placeholder mAP keeps the checkpoint
        # filename format stable.
        re = ("mAP", 0.0)
        logger.info(re)
        save_path = "{}-{}-{}.params".format(prefix, epoch, re[1])
        net.collect_params().save(save_path)
        logger.info("Saved checkpoint to {}.".format(save_path))
Example #19
0
class Predictor(CheckpointRunner):
    """Inference runner: loads (or adopts) a model and iterates a dataset,
    delegating the per-batch work to predict_step(), which subclasses
    must implement."""

    def __init__(self, options, logger: Logger, writer, shared_model=None):
        super().__init__(options,
                         logger,
                         writer,
                         training=False,
                         shared_model=shared_model)

    # noinspection PyAttributeOutsideInit
    def init_fn(self, shared_model=None, **kwargs):
        """Create (or adopt) the model; GPU inference is required.

        Raises NotImplementedError when no GPUs are configured, since
        CPU inference is not supported.
        """
        self.gpu_inference = self.options.num_gpus > 0
        # Fixed idiom: test the boolean directly instead of `== 0`.
        if not self.gpu_inference:
            raise NotImplementedError(
                "CPU inference is currently buggy. This takes some extra efforts and "
                "might be fixed in the future.")
        if shared_model is not None:
            # Adopt the caller's model as-is (assumed already wrapped/placed).
            self.model = shared_model
        else:
            self.init_auxiliary()
            self.model = self.init_model()
            self.model = DataParallelModel(self.model.cuda(),
                                           device_ids=self.gpus)

    def models_dict(self):
        """Models to include in checkpoints."""
        return {'model': self.model}

    def init_auxiliary(self):
        """Hook for subclasses that need extra models; default is a no-op."""
        pass

    def init_model(self):
        """Return the model to run inference with; must be overridden."""
        raise NotImplementedError("Your model is not found")

    def get_dataloader(self):
        """Build the prediction DataLoader from the test options."""
        data_loader = DataLoader(self.dataset,
                                 batch_size=self.options.test.batch_size,
                                 pin_memory=self.options.pin_memory,
                                 collate_fn=self.dataset_collate_fn,
                                 shuffle=self.options.test.shuffle)
        return data_loader

    def predict(self):
        """Iterate the dataset and run predict_step() on every batch."""
        self.logger.info("Running predictions...")
        predict_data_loader = self.get_dataloader()
        for step, batch in enumerate(predict_data_loader):
            self.logger.info(
                "Predicting [%05d/%05d]" %
                (step * self.options.test.batch_size, len(self.dataset)))

            if self.gpu_inference:
                # Send tensor inputs to GPU; leave other values untouched.
                batch = {
                    k: v.cuda() if isinstance(v, torch.Tensor) else v
                    for k, v in batch.items()
                }
            else:
                raise NotImplementedError(
                    "CPU inference is currently buggy. This takes some extra efforts and "
                    "might be fixed in the future.")
            self.predict_step(batch)

    def predict_step(self, input_batch):
        """Process one batch; must be overridden."""
        raise NotImplementedError("Your predict step function not found.")

    def save_inference_results(self, inputs, outputs):
        """Persist inference results; must be overridden."""
        raise NotImplementedError("Your result saving function not found.")
Example #20
0
class Trainer():
	"""Training / evaluation driver for the hierarchical transformer.

	Tracks the best accuracy and F-score checkpoints on the two test splits
	(a tie on one split is broken by the score on the other split) and on
	their union, logging and snapshotting the model whenever a tracked best
	improves.
	"""

	def __init__(self, dataloader, hierarchical_transformer, config, i):
		"""
		Args:
			dataloader: project dataloader; get_data(split) yields batches.
			hierarchical_transformer: the model to train / evaluate.
			config: experiment configuration object.
			i: index of the current data split (used for logging).
		"""
		super(Trainer, self).__init__()

		self.iter = i
		self.config = config
		self.cpu = torch.device("cpu")
		self.multi_gpu = len(self.config.gpu_idx) > 1

		self.dataloader = dataloader
		self.word_encoder = WordEncoder.WordEncoder(config, self.dataloader.tweet_field.vocab)
		self.word_pos_encoder = PositionEncoder.PositionEncoder(config, self.config.max_length)
		self.time_delay_encoder = PositionEncoder.PositionEncoder(config, self.config.size)

		# <----------- Check for GPU setting ----------->
		if self.config.gpu:
			self.hierarchical_transformer = DataParallelModel(hierarchical_transformer.cuda())
			self.criterion = DataParallelCriterion(nn.NLLLoss())
		else:
			self.hierarchical_transformer = hierarchical_transformer
			self.criterion = nn.NLLLoss()

		# Base LR is d_model ** -0.5; the wrapped Optimizer handles the schedule.
		self.adam_optimizer = optim.Adam(self.hierarchical_transformer.parameters(), np.power(self.config.d_model, - 0.5), betas = (self.config.beta_1, self.config.beta_2))
		self.optimizer = Optimizer.Optimizer(self.config, self.adam_optimizer)

	def _prepare_batch(self, X, y, word_pos, time_delay, structure, attention_mask_word, attention_mask_post):
		"""Wrap a raw batch in Variables, encode words / positions / time delays, and move everything to the GPU when configured."""
		X = Variable(X)
		y = Variable(y)
		word_pos = Variable(word_pos)
		time_delay = Variable(time_delay)
		structure = Variable(structure)
		attention_mask_word = Variable(attention_mask_word)
		attention_mask_post = Variable(attention_mask_post)

		# <-------- Encode content -------------->
		X = self.word_encoder(X)
		word_pos = self.word_pos_encoder(word_pos)
		time_delay = self.time_delay_encoder(time_delay)

		# <-------- Move to GPU -------------->
		if self.config.gpu:
			X = X.cuda()
			y = y.cuda()
			word_pos = word_pos.cuda()
			time_delay = time_delay.cuda()
			structure = structure.cuda()
			attention_mask_word = attention_mask_word.cuda()
			attention_mask_post = attention_mask_post.cuda()

		return X, y, word_pos, time_delay, structure, attention_mask_word, attention_mask_post

	def _log_and_save(self, path, epoch, record, metric_name, test_name):
		"""Log a new best record and snapshot the current model under (metric_name, test_name)."""
		log_best_model_info(path, "epoch : " + str(epoch + 1), record, metric_name, test_name)
		save_best_model(path, self.hierarchical_transformer, self.word_encoder, self.word_pos_encoder, self.time_delay_encoder, self.optimizer, metric_name, test_name)

	def _track_best(self, score, companion, best_score, best_companion, record, epoch, path, display_name, metric_name, test_name):
		"""Tie-aware best tracking for one (metric, test split) pair.

		A strictly better score always wins; an exact tie on the score is
		broken by the companion score from the other test split. Returns the
		updated (best_score, best_companion, record) triple, where the record
		is None when nothing improved.
		"""
		if score < best_score:
			return best_score, best_companion, None

		# The announcement fires on any >= score, matching the original code,
		# even if the tie-break below rejects the update.
		print("CURRENT BEST ({}) FOUND AT EPOCH : {} (EVALUATED WITH {})".format(display_name, epoch + 1, test_name.upper()))

		if score == best_score and companion < best_companion:
			return best_score, best_companion, None

		self._log_and_save(path, epoch, record, metric_name, test_name)
		return score, companion, record

	def test_performance(self, type_):
		"""Evaluate the model on the `type_` split.

		Returns:
			(predicted_labels, true_labels) as numpy arrays.
		"""
		predicted_y_lst = []
		y_lst = []

		self.hierarchical_transformer.eval() # Make sure that it is on eval mode first

		with torch.no_grad():

			for batch in self.dataloader.get_data(type_):

				X, y, word_pos, time_delay, structure, attention_mask_word, attention_mask_post = self._prepare_batch(*batch)

				# <--------- Getting the predictions --------->
				predicted_y = self.hierarchical_transformer(X, word_pos, time_delay, structure, attention_mask_word = attention_mask_word, attention_mask_post = attention_mask_post)

				# DataParallelModel returns one output chunk per GPU; stitch them back together.
				if self.multi_gpu:
					predicted_y = torch.cat(list(predicted_y), dim = 0)

				# <------- Appending to the master lists ------->
				predicted_y_lst.extend(predicted_y.cpu().numpy())
				y_lst.extend(y.cpu().numpy())

			# <------- Convert scores to hard labels ------->
			predicted_y_lst = get_labels(np.array(predicted_y_lst))
			y_lst = np.array(y_lst)

			return predicted_y_lst, y_lst

	def train(self):
		"""Run the full training loop with per-epoch evaluation, logging, periodic checkpointing and best-model tracking."""
		print("*" * 40 + " START OF TRAINING " + "*" * 40)

		epoch_values = {}

		# <------ Bests for test_1 (ties broken by the test_2 score) ------>
		best_acc_test_1 = 0.0
		best_f_score_test_1 = 0.0
		best_acc_test_1_for_2 = 0.0
		best_f_score_test_1_for_2 = 0.0
		best_record_f_score_test_1 = {}
		best_record_acc_test_1 = {}

		# <------ Bests for test_2 (ties broken by the test_1 score) ------>
		best_acc_test_2 = 0.0
		best_f_score_test_2 = 0.0
		best_acc_test_2_for_1 = 0.0
		best_f_score_test_2_for_1 = 0.0
		best_record_f_score_test_2 = {}
		best_record_acc_test_2 = {}

		# <------ Bests for the combined test set ------>
		best_acc_test = 0.0
		best_f_score_test = 0.0
		best_record_f_score_test = {}
		best_record_acc_test = {}

		# <------ For logging purpose ------>
		dataset = self.config.data_folder.split("/")[-1]
		name = "{}_split_{}_{}".format(dataset, self.iter, datetime.now().strftime('%Y-%m-%d-%H:%M:%S')) # Date & Time for logging purposes
		path = os.path.join(self.config.log_folder, self.config.dataset_name, name + "_" + self.config.experiment_name)

		make_dir(path)
		print(path)
		save_vocab_vectors(self.dataloader, self.config, path)
		log_info(path, "*" * 40 + " EXPERIMENT " + "*" * 40)
		log_info(path, "*" * 40 + " SPLIT {} ".format(self.iter) + "*" * 40)
		log_info(path, str(vars(self.config)))
		log_info(path, "*" * 90)

		for epoch in tqdm(range(self.config.num_epoch)):

			running_loss = 0.0
			num_batches = 0

			for batch in self.dataloader.get_data("train"):

				X, y, word_pos, time_delay, structure, attention_mask_word, attention_mask_post = self._prepare_batch(*batch)

				# <------- Settings ------------->
				self.hierarchical_transformer.train() # Enable dropout for training
				self.optimizer.zero_grad()

				# <--------- Getting the predictions --------->
				predicted_y = self.hierarchical_transformer(X, word_pos, time_delay, structure, attention_mask_word = attention_mask_word, attention_mask_post = attention_mask_post)

				# <--------- Loss and backprop --------->
				loss = self.criterion(predicted_y, y)
				loss.backward()
				self.optimizer.step_and_update_lr()

				running_loss += float(loss.detach().item())
				num_batches += 1

				# <--------- Free up the GPU -------------->
				del X, y, predicted_y, word_pos, time_delay, structure
				torch.cuda.empty_cache()

			record = {}
			# Guard against an empty training split (avoids ZeroDivisionError).
			running_loss = running_loss / float(num_batches) if num_batches else 0.0
			print()
			print("Epoch {}: {}".format(epoch + 1, running_loss))

			with torch.no_grad():

				pred_train, true_train = self.test_performance("train_test")
				pred_test_1, true_test_1 = self.test_performance("test_1")
				pred_test_2, true_test_2 = self.test_performance("test_2")

				pred_test = np.concatenate((pred_test_1, pred_test_2))
				true_test = np.concatenate((true_test_1, true_test_2))

				# <-------- Getting performance for all the classes -------->
				acc_train, pre_train, recall_train, f_score_train, counter_true_train, counter_pred_train = cal_scores(pred_train, true_train, type_ = "all")
				acc_test_1, pre_test_1, recall_test_1, f_score_test_1, counter_true_test_1, counter_pred_test_1 = cal_scores(pred_test_1, true_test_1, type_ = "all")
				acc_test_2, pre_test_2, recall_test_2, f_score_test_2, counter_true_test_2, counter_pred_test_2 = cal_scores(pred_test_2, true_test_2, type_ = "all")
				acc_test, pre_test, recall_test, f_score_test, counter_true_test, counter_pred_test = cal_scores(pred_test, true_test, type_ = "all")

				# <-------- Getting performance for individual classes -------->
				acc_test_1_class_0, pre_test_1_class_0, recall_test_1_class_0, f_score_test_1_class_0, counter_true_test_1_class_0, counter_pred_test_1_class_0 = cal_scores(pred_test_1, true_test_1, type_ = 0)
				acc_test_1_class_1, pre_test_1_class_1, recall_test_1_class_1, f_score_test_1_class_1, counter_true_test_1_class_1, counter_pred_test_1_class_1 = cal_scores(pred_test_1, true_test_1, type_ = 1)

				acc_test_2_class_0, pre_test_2_class_0, recall_test_2_class_0, f_score_test_2_class_0, counter_true_test_2_class_0, counter_pred_test_2_class_0 = cal_scores(pred_test_2, true_test_2, type_ = 0)
				acc_test_2_class_1, pre_test_2_class_1, recall_test_2_class_1, f_score_test_2_class_1, counter_true_test_2_class_1, counter_pred_test_2_class_1 = cal_scores(pred_test_2, true_test_2, type_ = 1)

				acc_test_class_0, pre_test_class_0, recall_test_class_0, f_score_test_class_0, counter_true_test_class_0, counter_pred_test_class_0 = cal_scores(pred_test, true_test, type_ = 0)
				acc_test_class_1, pre_test_class_1, recall_test_class_1, f_score_test_class_1, counter_true_test_class_1, counter_pred_test_class_1 = cal_scores(pred_test, true_test, type_ = 1)

				# Periodic (non-best) checkpoint every `interval` epochs.
				if epoch % self.config.interval == 0:

					check_point_epoch(epoch + 1,
										self.hierarchical_transformer,
										self.word_encoder,
										self.word_pos_encoder,
										self.time_delay_encoder,
										self.optimizer,
										running_loss,
										acc_train, pre_train, recall_train, f_score_train, counter_true_train, counter_pred_train,
										acc_test_1, pre_test_1, recall_test_1, f_score_test_1, counter_true_test_1, counter_pred_test_1,
										acc_test_2, pre_test_2, recall_test_2, f_score_test_2, counter_true_test_2, counter_pred_test_2,
										acc_test, pre_test, recall_test, f_score_test, counter_true_test, counter_pred_test,
										path)

				# <--------- Assemble the epoch record --------->
				record["epoch"] = epoch + 1
				record["loss"] = running_loss

				for split, scores in (
						("train", (acc_train, pre_train, recall_train, f_score_train, counter_true_train, counter_pred_train)),
						("test_1", (acc_test_1, pre_test_1, recall_test_1, f_score_test_1, counter_true_test_1, counter_pred_test_1)),
						("test_2", (acc_test_2, pre_test_2, recall_test_2, f_score_test_2, counter_true_test_2, counter_pred_test_2)),
						("test", (acc_test, pre_test, recall_test, f_score_test, counter_true_test, counter_pred_test))):
					acc, pre, rec, f1, c_true, c_pred = scores
					record["acc_" + split] = acc
					record["precision_" + split] = pre
					record["recall_" + split] = rec
					record["f_score_" + split] = f1
					record["counter_true_" + split] = c_true
					record["counter_pred_" + split] = c_pred

				# <--------- Per-class breakdowns --------->
				record["acc_test_1_classes"] = {0: acc_test_1_class_0, 1: acc_test_1_class_1}
				record["precision_test_1_classes"] = {0: pre_test_1_class_0, 1: pre_test_1_class_1}
				record["recall_test_1_classes"] = {0: recall_test_1_class_0, 1: recall_test_1_class_1}
				record["f_score_test_1_classes"] = {0: f_score_test_1_class_0, 1: f_score_test_1_class_1}
				record["counter_pred_test_1_classes"] = {0: counter_pred_test_1_class_0, 1: counter_pred_test_1_class_1}

				record["acc_test_2_classes"] = {0: acc_test_2_class_0, 1: acc_test_2_class_1}
				record["precision_test_2_classes"] = {0: pre_test_2_class_0, 1: pre_test_2_class_1}
				record["recall_test_2_classes"] = {0: recall_test_2_class_0, 1: recall_test_2_class_1}
				record["f_score_test_2_classes"] = {0: f_score_test_2_class_0, 1: f_score_test_2_class_1}
				record["counter_pred_test_2_classes"] = {0: counter_pred_test_2_class_0, 1: counter_pred_test_2_class_1}

				record["acc_test_classes"] = {0: acc_test_class_0, 1: acc_test_class_1}
				record["precision_test_classes"] = {0: pre_test_class_0, 1: pre_test_class_1}
				record["recall_test_classes"] = {0: recall_test_class_0, 1: recall_test_class_1}
				record["f_score_test_classes"] = {0: f_score_test_class_0, 1: f_score_test_class_1}
				record["counter_pred_test_classes"] = {0: counter_pred_test_class_0, 1: counter_pred_test_class_1}

				epoch_values[epoch + 1] = record

				log_info(path, record)
				log_info(path, "=" * 90)

				# <--------- Best-model tracking --------->
				best_f_score_test_1, best_f_score_test_1_for_2, updated = self._track_best(
					f_score_test_1, f_score_test_2, best_f_score_test_1, best_f_score_test_1_for_2,
					record, epoch, path, "F-SCORE", "f_score", "test_1")
				if updated is not None:
					best_record_f_score_test_1 = updated

				best_acc_test_1, best_acc_test_1_for_2, updated = self._track_best(
					acc_test_1, acc_test_2, best_acc_test_1, best_acc_test_1_for_2,
					record, epoch, path, "ACCURACY", "accuracy", "test_1")
				if updated is not None:
					best_record_acc_test_1 = updated

				# BUGFIX: the original strictly-better branch for the test_2
				# F-score logged/saved test_1's record under "test_1",
				# clobbering the test_1 snapshot; it is now tracked as test_2.
				best_f_score_test_2, best_f_score_test_2_for_1, updated = self._track_best(
					f_score_test_2, f_score_test_1, best_f_score_test_2, best_f_score_test_2_for_1,
					record, epoch, path, "F-SCORE", "f_score", "test_2")
				if updated is not None:
					best_record_f_score_test_2 = updated

				best_acc_test_2, best_acc_test_2_for_1, updated = self._track_best(
					acc_test_2, acc_test_1, best_acc_test_2, best_acc_test_2_for_1,
					record, epoch, path, "ACCURACY", "accuracy", "test_2")
				if updated is not None:
					best_record_acc_test_2 = updated

				# Combined test set: no tie-breaking, any >= score wins.
				if f_score_test >= best_f_score_test:
					print("CURRENT BEST (F-SCORE) FOUND AT EPOCH : {} (EVALUATED WITH {})".format(epoch + 1, "TEST"))
					best_f_score_test = f_score_test
					best_record_f_score_test = record
					self._log_and_save(path, epoch, record, "f_score", "test")

				if acc_test >= best_acc_test:
					print("CURRENT BEST (ACCURACY) FOUND AT EPOCH : {} (EVALUATED WITH {})".format(epoch + 1, "TEST"))
					best_acc_test = acc_test
					best_record_acc_test = record
					self._log_and_save(path, epoch, record, "accuracy", "test")

		plot_graphs(path, epoch_values)
		print("*" * 40 + " DONE WITH TRAINING " + "*" * 40)
                                normalize,
                                ])
# Datasets / loaders: train on VOC + SBD, validate on VOC 'val'.
train_ds = VOCSBDClassification('/path/to/VOC',
                                '/path/to/SBD/benchmark_RELEASE/dataset',
                                transform=train_trans, image_set='train')
train_dl = DataLoader(train_ds, batch_size=args.batch_size, shuffle=True, num_workers=4, drop_last=True)
val_ds = VOCClassification('/path/to/VOC', transform=val_trans, image_set='val')
val_dl = DataLoader(val_ds, batch_size=8, shuffle=True, num_workers=2, drop_last=True)


# Model
if args.arc == 'vgg':
    model = vgg19(pretrained=True)
    # Swap the last classifier layer so its output width matches the
    # dataset's class count, then wrap for multi-GPU training.
    num_ftrs = model.classifier[6].in_features
    model.classifier[6] = nn.Linear(num_ftrs, train_ds.CLASSES)
    model = DataParallelModel(model.cuda())
else:
    raise Exception("Architecture {} not found".format(args.arc))

# Sigmoid + BCE on raw logits; the criterion is wrapped so per-GPU outputs
# from DataParallelModel are reduced correctly.
criterion = DataParallelCriterion(nn.BCEWithLogitsLoss().cuda())
optimizer = torch.optim.Adam(model.parameters(), lr=args.lr, weight_decay=0.0005)
# StepLR: multiply the learning rate by 0.2 every 20 epochs.
scheduler = torch.optim.lr_scheduler.StepLR(optimizer, 20, gamma=0.2)
best_pred = 0

# Load model (optional resume from a checkpoint file)
if args.resume:
    if not os.path.isfile(args.resume):
        raise RuntimeError("=> no checkpoint found at '{}'".format(args.resume))
    checkpoint = torch.load(args.resume)
    args.start_epoch = checkpoint['epoch']
    # .module: load into the network wrapped inside DataParallelModel.
    model.module.load_state_dict(checkpoint['state_dict'])
Example #22
0
def main(opt):
    """Train LaneNet with early stopping, keeping the best-val-loss checkpoint.

    Args:
        opt: options namespace (seed, cnn_type, embed_dim, learning_rate,
             num_epochs, val_step, max_patience, output_file, start_from, ...).
    """
    # Seed both the CPU and (when present) CUDA RNGs for reproducibility.
    # The original seeded only one of the two, leaving the CPU generator
    # unseeded on GPU machines.
    torch.manual_seed(opt.seed)
    if torch.cuda.is_available():
        torch.cuda.manual_seed(opt.seed)

    train_loader = get_data_loader(opt,
                                   split='train',
                                   return_org_image=False)

    val_loader = get_data_loader(opt,
                                 split='val',
                                 return_org_image=False)

    # Create the checkpoint directory if needed. Guard against a bare file
    # name (os.path.dirname == ''), which would make os.makedirs raise;
    # exist_ok also removes the exists-check race.
    output_dir = os.path.dirname(opt.output_file)
    if output_dir:
        os.makedirs(output_dir, exist_ok=True)

    logger.info('Building model...')

    model = LaneNet(cnn_type=opt.cnn_type, embed_dim=opt.embed_dim)
    model = DataParallelModel(model)

    # NOTE(review): DiscriminativeLoss presumably clusters per-lane pixel
    # embeddings (variance/distance margins) — confirm in its definition.
    criterion_disc = DiscriminativeLoss(delta_var=0.5,
                                        delta_dist=1.5,
                                        norm=2,
                                        usegpu=True)

    criterion_ce = nn.CrossEntropyLoss()

    optimizer = optim.Adam(model.parameters(), lr=opt.learning_rate)

    if opt.start_from:
        logger.info('Restart training from %s', opt.start_from)
        checkpoint = torch.load(opt.start_from)
        model.load_state_dict(checkpoint['model'])

    if torch.cuda.is_available():
        criterion_disc.cuda()
        criterion_ce.cuda()
        model = model.cuda()

    logger.info("Start training...")
    best_loss = sys.maxsize
    best_epoch = 0

    for epoch in tqdm(range(opt.num_epochs), desc='Epoch: '):
        learning_rate = adjust_learning_rate(opt, optimizer, epoch)
        logger.info('===> Learning rate: %f: ', learning_rate)

        # train for one epoch
        train(
            opt,
            model,
            criterion_disc,
            criterion_ce,
            optimizer,
            train_loader)

        # validate at every val_step epoch
        if epoch % opt.val_step == 0:
            val_loss = test(
                opt,
                model,
                criterion_disc,
                criterion_ce,
                val_loader)
            logger.info('Val loss: %s\n', val_loss)

            loss = val_loss.avg
            if loss < best_loss:
                logger.info(
                    'Found new best loss: %.7f, previous loss: %.7f',
                    loss,
                    best_loss)
                best_loss = loss
                best_epoch = epoch

                logger.info('Saving new checkpoint to: %s', opt.output_file)
                torch.save({
                    'epoch': epoch,
                    'model': model.state_dict(),
                    'best_loss': best_loss,
                    'best_epoch': best_epoch,
                    'opt': opt
                }, opt.output_file)

            else:
                logger.info(
                    'Current loss: %.7f, best loss is %.7f @ epoch %d',
                    loss,
                    best_loss,
                    best_epoch)

        # Early stopping: no validation improvement for max_patience epochs.
        if epoch - best_epoch > opt.max_patience:
            logger.info('Terminated by early stopping!')
            break
Example #23
0
def define_D(ndf):
    """Create the discriminator network.

    Wraps it in DataParallelModel when ``opt.parallel`` is set and more than
    one CUDA device is visible; otherwise returns the bare initialised net.
    """
    use_parallel = opt.parallel and torch.cuda.device_count() > 1
    net = init_net(Discriminator(ndf))
    return DataParallelModel(net) if use_parallel else net
Example #24
0
def main(args):
    """Train the segmentation network, snapshotting the best-mIoU checkpoint.

    Args:
        args: parsed arguments (snapshot_dir, log_dir, method, seed,
              num_classes, restore_from, init, crop_size, batch_size,
              learning_rate, epochs, ...).
    """
    # initialization
    print("Input arguments:")
    for key, val in vars(args).items():
        print("{:16} {}".format(key, val))

    # exist_ok avoids the check-then-create race of the original code.
    os.makedirs(args.snapshot_dir, exist_ok=True)
    writer = SummaryWriter(log_dir=os.path.join(args.log_dir, args.method))

    random.seed(args.seed)
    torch.manual_seed(args.seed)
    cudnn.enabled = True
    cudnn.benchmark = True

    # conduct seg network
    seg_model = get_model(num_classes=args.num_classes)

    saved_state_dict = torch.load(args.restore_from)
    new_params = seg_model.state_dict().copy()

    if args.init:
        # Backbone-only checkpoint: remap every parameter except the
        # classifier head ('fc') under the 'encoder.' prefix.
        for param_name in saved_state_dict:
            parts = param_name.split('.')
            if not parts[0] == 'fc':
                new_params['encoder.' + '.'.join(parts[:])] = saved_state_dict[param_name]
        seg_model.load_state_dict(new_params)
        print('loading params w/o fc')
    else:
        seg_model.load_state_dict(saved_state_dict)
        print('loading params all')

    model = DataParallelModel(seg_model)
    model.float()
    model.cuda()

    # define dataloader
    train_loader = data.DataLoader(DataGenerator(root=args.root, list_path=args.lst,
                                                    crop_size=args.crop_size, training=True),
                                   batch_size=args.batch_size, shuffle=True, num_workers=4, pin_memory=True)
    val_loader = data.DataLoader(DataGenerator(root=args.val_root, list_path=args.val_lst,
                                                  crop_size=args.crop_size, training=False),
                                 batch_size=args.batch_size, shuffle=False, num_workers=4, pin_memory=True)

    # define criterion & optimizer
    criterion = ABRLovaszLoss(ignore_index=args.ignore_label, only_present=True, cls_p= args.num_classes, cls_h= args.hbody_cls, cls_f= args.fbody_cls)
    criterion = DataParallelCriterion(criterion).cuda()

    optimizer = optim.SGD(
        [{'params': filter(lambda p: p.requires_grad, seg_model.parameters()), 'lr': args.learning_rate}],
        lr=args.learning_rate, momentum=0.9, weight_decay=5e-4)

    # key points
    best_val_mIoU = 0
    best_val_pixAcc = 0
    model_dir = None  # path of the best checkpoint written so far, if any
    start = time.time()

    for epoch in range(0, args.epochs):
        print('\n{} | {}'.format(epoch, args.epochs - 1))
        # training
        _ = train(model, train_loader, epoch, criterion, optimizer, writer)

        # validation: every 10 epochs, plus every epoch in the last 20% of training
        if epoch % 10 == 0 or epoch > args.epochs * 0.8:
            val_pixacc, val_miou = validation(model, val_loader, epoch, writer)
            # save model
            if val_pixacc > best_val_pixAcc:
                best_val_pixAcc = val_pixacc
            if val_miou > best_val_mIoU:
                best_val_mIoU = val_miou
                model_dir = os.path.join(args.snapshot_dir, args.method + '_miou.pth')
                torch.save(seg_model.state_dict(), model_dir)
                print('Model saved to %s' % model_dir)

    # BUGFIX: only rename when a checkpoint was actually written; the
    # original raised NameError here if validation never improved on mIoU.
    if model_dir is not None:
        os.rename(model_dir, os.path.join(args.snapshot_dir, args.method + '_miou'+str(best_val_mIoU)+'.pth'))
    print('Complete using', time.time() - start, 'seconds')
    print('Best pixAcc: {} | Best mIoU: {}'.format(best_val_pixAcc, best_val_mIoU))
Example #25
0
def main(args):
    """Train the graph-based human-parsing network and keep the best snapshot.

    Pipeline:
      1. Print the run configuration, create the snapshot directory, and
         open a TensorBoard writer under ``args.log_dir/args.method``.
      2. Seed Python/torch RNGs and enable cuDNN benchmarking.
      3. Build the segmentation model over a fixed 19-node part-adjacency
         graph, load pretrained weights into the encoder (skipping any
         ``fc.*`` parameters), and wrap model and criterion for multi-GPU
         data parallelism.
      4. Train with SGD (momentum 0.9, weight decay 5e-4); validate every
         10 epochs and during the last 5 epochs, saving the checkpoint
         whenever validation mIoU improves.
      5. Rename the best checkpoint to embed the final best mIoU value.

    Args:
        args: parsed CLI namespace. Must provide at least: ``root``, ``lst``,
            ``val_root``, ``val_lst``, ``crop_size``, ``batch_size``,
            ``learning_rate``, ``epochs``, ``seed``, ``num_classes``,
            ``hbody_cls``, ``fbody_cls``, ``ignore_label``, ``restore_from``,
            ``snapshot_dir``, ``log_dir`` and ``method``.
    """
    # initialization
    print("Input arguments:")
    for key, val in vars(args).items():
        print("{:16} {}".format(key, val))

    if not os.path.exists(args.snapshot_dir):
        os.makedirs(args.snapshot_dir)
    writer = SummaryWriter(log_dir=os.path.join(args.log_dir, args.method))

    random.seed(args.seed)
    torch.manual_seed(args.seed)
    # benchmark=True lets cuDNN autotune conv algorithms for the fixed
    # crop size; note this trades exact run-to-run determinism for speed.
    cudnn.enabled = True
    cudnn.benchmark = True

    # Symmetric 19x19 adjacency of body parts used by the graph-reasoning
    # head (row/col i <-> part class i+1; background is excluded).
    adj_matrix = torch.tensor(
        [[0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0],
         [1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0],
         [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 0, 0, 0, 0],
         [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0],
         [0, 0, 0, 0, 0, 0, 1, 0, 1, 0, 1, 1, 0, 1, 1, 0, 0, 0, 0],
         [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 0, 0],
         [0, 0, 0, 0, 1, 0, 0, 0, 1, 0, 1, 1, 0, 1, 1, 1, 1, 0, 0],
         [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1],
         [0, 0, 0, 0, 1, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1],
         [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 0, 0],
         [0, 0, 0, 0, 1, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0],
         [0, 0, 0, 0, 1, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 0, 0],
         [1, 1, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0],
         [0, 0, 1, 0, 1, 1, 1, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0],
         [0, 0, 1, 0, 1, 1, 1, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0],
         [0, 0, 0, 0, 0, 1, 0, 1, 1, 1, 0, 1, 0, 0, 0, 0, 0, 1, 1],
         [0, 0, 0, 0, 0, 1, 0, 1, 1, 1, 0, 1, 0, 0, 0, 0, 0, 1, 1],
         [0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0],
         [0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0]],
        requires_grad=False)
    # Class-id partition into upper/lower body used by the hierarchy head.
    upper_part_list = [1, 2, 3, 4, 5, 6, 7, 11, 13, 14, 15]
    lower_part_list = [8, 9, 10, 12, 16, 17, 18, 19]
    # Per-class loss weights (presumably inverse-frequency balanced over
    # the training set -- TODO confirm against the dataset statistics).
    weight = torch.FloatTensor([
        0.7602572, 0.94236198, 0.85644457, 1.04346266, 1.10627293, 0.80980162,
        0.95168713, 0.8403769, 1.05798412, 0.85746254, 1.01274366, 1.05854692,
        1.03430773, 0.84867818, 0.88027721, 0.87580925, 0.98747462, 0.9876475,
        1.00016535, 1.00108882
    ])

    # conduct seg network
    seg_model = get_model(num_classes=args.num_classes,
                          adj_matrix=adj_matrix,
                          upper_part_list=upper_part_list,
                          lower_part_list=lower_part_list)

    # NOTE(review): torch.load unpickles arbitrary objects -- only point
    # args.restore_from at trusted checkpoint files.
    saved_state_dict = torch.load(args.restore_from)
    new_params = seg_model.state_dict().copy()

    # Copy pretrained weights into the encoder, dropping the classifier
    # head ('fc.*') whose shape does not match the parsing task.
    for i in saved_state_dict:
        i_parts = i.split('.')
        if not i_parts[0] == 'fc':
            new_params['encoder.' + '.'.join(i_parts[:])] = saved_state_dict[i]
    seg_model.load_state_dict(new_params)
    print('loading params w/o fc')

    model = DataParallelModel(seg_model)
    model.float()
    model.cuda()

    # define dataloader
    train_loader = data.DataLoader(DatasetGenerator(root=args.root,
                                                    list_path=args.lst,
                                                    crop_size=args.crop_size,
                                                    training=True),
                                   batch_size=args.batch_size,
                                   shuffle=True,
                                   num_workers=4,
                                   pin_memory=True)
    val_loader = data.DataLoader(DatasetGenerator(root=args.val_root,
                                                  list_path=args.val_lst,
                                                  crop_size=args.crop_size,
                                                  training=False),
                                 batch_size=args.batch_size,
                                 shuffle=False,
                                 num_workers=4,
                                 pin_memory=True)

    # define criterion & optimizer
    criterion = ABRLovaszLoss(adj_matrix=adj_matrix,
                              ignore_index=args.ignore_label,
                              only_present=True,
                              upper_part_list=upper_part_list,
                              lower_part_list=lower_part_list,
                              cls_p=args.num_classes,
                              cls_h=args.hbody_cls,
                              cls_f=args.fbody_cls,
                              weight=weight)
    criterion = DataParallelCriterion(criterion).cuda()

    # Optimize only trainable params of the *unwrapped* model; the wrapped
    # DataParallelModel shares the same parameter tensors.
    optimizer = optim.SGD(
        [{
            'params': filter(lambda p: p.requires_grad,
                             seg_model.parameters()),
            'lr': args.learning_rate
        }],
        lr=args.learning_rate,
        momentum=0.9,
        weight_decay=5e-4)

    # key points
    best_val_mIoU = 0
    best_val_pixAcc = 0
    model_dir = None  # path of best checkpoint; stays None until first save
    start = time.time()

    for epoch in range(0, args.epochs):
        print('\n{} | {}'.format(epoch, args.epochs - 1))
        # training
        _ = train(model, train_loader, epoch, criterion, optimizer, writer)
        # validation: every 10 epochs plus the final 5 epochs
        if epoch % 10 == 0 or epoch > args.epochs - 5:
            val_pixacc, val_miou = validation(model, val_loader, epoch, writer)
            # save model
            if val_pixacc > best_val_pixAcc:
                best_val_pixAcc = val_pixacc
            if val_miou > best_val_mIoU:
                best_val_mIoU = val_miou
                model_dir = os.path.join(args.snapshot_dir,
                                         args.method + '_miou.pth')
                torch.save(seg_model.state_dict(), model_dir)
                print('Model saved to %s' % model_dir)

    # Embed the best mIoU in the checkpoint name. Guard against the case
    # where no checkpoint was ever saved (epochs == 0 or mIoU never
    # improved), which previously raised on an unbound model_dir.
    if model_dir is not None:
        os.rename(
            model_dir,
            os.path.join(args.snapshot_dir,
                         args.method + '_miou' + str(best_val_mIoU) + '.pth'))
    print('Complete using', time.time() - start, 'seconds')
    print('Best pixAcc: {} | Best mIoU: {}'.format(best_val_pixAcc,
                                                   best_val_mIoU))