Example #1
0
def main():
    """Train a panoptic Mask R-CNN model with Chainer.

    Parses command-line options, builds the occlusion segmentation datasets,
    the ``MaskRCNNPanopticResNet`` model and a MomentumSGD optimizer, and runs
    a Chainer ``Trainer``.  With ``--multi-node`` the run is distributed via
    ChainerMN; otherwise ``--gpu`` is required and a single GPU is used.
    Evaluation, snapshotting, logging and plotting extensions are registered
    only on rank 0 (or in the single-node case).

    Exits with status 1 when neither ``--multi-node`` nor ``--gpu`` is given.
    """
    parser = argparse.ArgumentParser(
        formatter_class=argparse.ArgumentDefaultsHelpFormatter, )
    parser.add_argument('--model',
                        '-m',
                        choices=['resnet50', 'resnet101'],
                        default='resnet50',
                        help='Base model of Mask R-CNN.')
    parser.add_argument('--pooling-func',
                        '-p',
                        choices=['pooling', 'align', 'resize'],
                        default='align',
                        help='Pooling function.')
    parser.add_argument('--gpu', '-g', type=int, help='GPU id.')
    parser.add_argument('--multi-node',
                        action='store_true',
                        help='use multi node')
    default_max_epoch = 120
    parser.add_argument('--max-epoch',
                        type=float,
                        default=default_max_epoch,
                        help='epoch')
    parser.add_argument('--pretrained-model', help='pretrained model')
    parser.add_argument(
        '--notrain',
        choices=['pix', 'ins'],
        help='not training pixel or instance segmentation',
    )
    parser.add_argument(
        '--lr-base',
        default=0.00125,
        type=float,
        help='learning rate per batch size 1',
    )
    parser.add_argument(
        '--noaugmentation',
        action='store_true',
        help='not apply data augmentation',
    )
    parser.add_argument(
        '--pix-loss-scale',
        default=1.,
        type=float,
        help='scale of pixel loss',
    )
    parser.add_argument(
        '--dataset',
        default='occlusion',
        choices=['occlusion', 'occlusion+synthetic'],
        help='dataset',
    )
    args = parser.parse_args()

    if args.multi_node:
        # Imported lazily so single-node runs do not require ChainerMN.
        import chainermn
        comm = chainermn.create_communicator('pure_nccl')
        device = comm.intra_rank

        args.n_node = comm.inter_size
        args.n_gpu = comm.size
        chainer.cuda.get_device_from_id(device).use()
    else:
        if args.gpu is None:
            print(
                '--gpu option is required if --multi-node is not specified.',
                file=sys.stderr,
            )
            sys.exit(1)
        args.n_node = 1
        args.n_gpu = 1
        chainer.cuda.get_device_from_id(args.gpu).use()
        device = args.gpu

    args.seed = 0
    now = datetime.datetime.now()
    args.timestamp = now.isoformat()

    # Only rank 0 picks the output directory; the other ranks receive the
    # same path through the broadcast below.
    if not args.multi_node or comm.rank == 0:
        out = osp.join(here, 'logs', now.strftime('%Y%m%d_%H%M%S.%f'))
    else:
        out = None
    if args.multi_node:
        args.out = comm.bcast_obj(out)
    else:
        args.out = out
    del out

    # 0.00125 * 8 = 0.01  in original
    args.batch_size = 1 * args.n_gpu
    args.lr = args.lr_base * args.batch_size
    args.weight_decay = 0.0001

    # The learning rate is multiplied by 0.1 (see ExponentialShift below) at
    # 120k/180k and 160k/180k of the total training length, expressed here
    # as fractions of --max-epoch.
    args.step_size = [(120e3 / 180e3) * args.max_epoch,
                      (160e3 / 180e3) * args.max_epoch]

    random.seed(args.seed)
    np.random.seed(args.seed)

    # Default Config
    # args.min_size = 800
    # args.max_size = 1333
    # args.anchor_scales = (2, 4, 8, 16, 32)
    args.min_size = 600
    args.max_size = 1000
    args.anchor_scales = (4, 8, 16, 32)
    args.rpn_dim = 512

    # -------------------------------------------------------------------------
    # Dataset

    train_data = (
        instance_occlsegm.datasets.PanopticOcclusionSegmentationDataset(
            'train', augmentation=not args.noaugmentation
        )
    )
    if args.dataset == 'occlusion+synthetic':
        # The synthetic set is sized to match the real training set.
        synthetic_data = (
            instance_occlsegm.datasets
            .PanopticOcclusionSegmentationSyntheticDataset(
                do_aug=not args.noaugmentation,
                size=len(train_data),
            )
        )
        train_data = chainer.datasets.ConcatenatedDataset(
            train_data, synthetic_data)
    test_data = (
        instance_occlsegm.datasets.PanopticOcclusionSegmentationDataset(
            'test'
        )
    )
    fg_class_names = test_data.class_names
    args.class_names = fg_class_names.tolist()
    test_data_list = test_data.get_video_datasets()
    del test_data

    # -------------------------------------------------------------------------
    # Model + Optimizer.

    if args.pooling_func == 'align':
        pooling_func = cmr.functions.roi_align_2d
    elif args.pooling_func == 'pooling':
        pooling_func = chainer.functions.roi_pooling_2d
    elif args.pooling_func == 'resize':
        pooling_func = cmr.functions.crop_and_resize
    else:
        raise ValueError(
            'Unsupported pooling function: {}'.format(args.pooling_func))

    args.mask_loss = 'softmax'
    assert args.model in ['resnet50', 'resnet101']
    # Slice off the 'resnet' prefix explicitly: str.lstrip() strips a *set of
    # characters*, not a prefix, and only worked here by coincidence.
    n_layers = int(args.model[len('resnet'):])
    mask_rcnn = instance_occlsegm.models.MaskRCNNPanopticResNet(
        n_layers=n_layers,
        n_fg_class=len(fg_class_names),
        pretrained_model=args.pretrained_model,
        pooling_func=pooling_func,
        anchor_scales=args.anchor_scales,
        min_size=args.min_size,
        max_size=args.max_size,
        rpn_dim=args.rpn_dim,
    )
    mask_rcnn.nms_thresh = 0.3
    mask_rcnn.score_thresh = 0.05

    model = instance_occlsegm.models.MaskRCNNPanopticTrainChain(
        mask_rcnn,
        notrain=args.notrain,
        pix_loss_scale=args.pix_loss_scale,
    )
    if args.multi_node or args.gpu >= 0:
        model.to_gpu()

    optimizer = chainer.optimizers.MomentumSGD(lr=args.lr, momentum=0.9)
    if args.multi_node:
        optimizer = chainermn.create_multi_node_optimizer(optimizer, comm)
    optimizer.setup(model)
    optimizer.add_hook(chainer.optimizer.WeightDecay(rate=args.weight_decay))

    # Freeze the first extractor stages and every AffineChannel2D link.
    mask_rcnn.extractor.conv1.disable_update()
    mask_rcnn.extractor.bn1.disable_update()
    mask_rcnn.extractor.res2.disable_update()
    for link in mask_rcnn.links():
        if isinstance(link, cmr.links.AffineChannel2D):
            link.disable_update()

    # -------------------------------------------------------------------------
    # Iterator.

    train_data = chainer.datasets.TransformDataset(
        train_data,
        instance_occlsegm.datasets.MaskRCNNPanopticTransform(mask_rcnn),
    )
    test_data_list = [
        chainer.datasets.TransformDataset(
            td,
            instance_occlsegm.datasets.MaskRCNNPanopticTransform(
                mask_rcnn,
                train=False,
            )) for td in test_data_list
    ]
    test_concat_data = chainer.datasets.ConcatenatedDataset(*test_data_list)
    if args.multi_node:
        # Only rank 0 supplies the dataset; scatter_dataset distributes it.
        if comm.rank != 0:
            train_data = None
        train_data = chainermn.scatter_dataset(train_data, comm, shuffle=True)

    # for training
    train_iter = chainer.iterators.MultiprocessIterator(
        train_data,
        batch_size=1,
        n_processes=14,
        shared_mem=10**9,
    )
    # for evaluation
    test_iters = {
        i: chainer.iterators.SerialIterator(td,
                                            batch_size=1,
                                            repeat=False,
                                            shuffle=False)
        for i, td in enumerate(test_data_list)
    }
    # for visualization
    test_concat_iter = chainer.iterators.SerialIterator(test_concat_data,
                                                        batch_size=1,
                                                        repeat=False,
                                                        shuffle=False)

    # -------------------------------------------------------------------------

    converter = functools.partial(
        cmr.datasets.concat_examples,
        padding=0,
        # img, bboxes, labels, masks, scales, lbls_vis, lbls_occ
        indices_concat=[0, 2, 3, 4, 5, 6],
        indices_to_device=[0, 1, 5, 6],
    )
    updater = chainer.training.updater.StandardUpdater(train_iter,
                                                       optimizer,
                                                       device=device,
                                                       converter=converter)

    trainer = training.Trainer(updater, (args.max_epoch, 'epoch'),
                               out=args.out)

    trainer.extend(extensions.FailOnNonNumber())
    trainer.extend(extensions.ExponentialShift('lr', 0.1),
                   trigger=training.triggers.ManualScheduleTrigger(
                       args.step_size, 'epoch'))

    eval_interval = 1, 'epoch'
    log_interval = 10, 'iteration'
    plot_interval = 0.1, 'epoch'
    print_interval = log_interval

    # Reporting / evaluation extensions run on a single process only.
    if not args.multi_node or comm.rank == 0:
        evaluator = (
            instance_occlsegm.extensions.PanopticSegmentationVOCEvaluator(
                test_iters,
                model.mask_rcnn,
                device=device,
                use_07_metric=False,
                label_names=fg_class_names,
            )
        )
        trainer.extend(evaluator, trigger=eval_interval)
        # Keep only the model with the best validation mPQ so far.
        trainer.extend(extensions.snapshot_object(model.mask_rcnn,
                                                  'snapshot_model.npz'),
                       trigger=training.triggers.MaxValueTrigger(
                           'validation/main/mpq', eval_interval))
        args.git_hash = cmr.utils.git_hash()
        args.hostname = socket.gethostname()
        trainer.extend(fcn.extensions.ParamsReport(args.__dict__))
        trainer.extend(
            instance_occlsegm.extensions.PanopticSegmentationVisReport(
                test_concat_iter, model.mask_rcnn, label_names=fg_class_names),
            trigger=eval_interval,
        )
        trainer.extend(chainer.training.extensions.observe_lr(),
                       trigger=log_interval)
        trainer.extend(extensions.LogReport(trigger=log_interval))
        trainer.extend(
            extensions.PrintReport([
                'iteration',
                'epoch',
                'elapsed_time',
                'lr',
                'main/loss',
                'main/roi_loc_loss',
                'main/roi_cls_loss',
                'main/roi_mask_loss',
                'main/rpn_loc_loss',
                'main/rpn_cls_loss',
                'main/pix_vis_loss',
                'main/pix_occ_loss',
                'validation/main/miou',
                'validation/main/mpq',
            ], ),
            trigger=print_interval,
        )
        trainer.extend(extensions.ProgressBar(update_interval=10))

        # plot
        assert extensions.PlotReport.available()
        trainer.extend(
            extensions.PlotReport(
                [
                    'main/roi_loc_loss',
                    'main/roi_cls_loss',
                    'main/roi_mask_loss',
                    'main/rpn_loc_loss',
                    'main/rpn_cls_loss',
                    'main/ins_loss',
                    'main/pix_vis_loss',
                    # The commas below were missing, which silently merged
                    # the last three keys into one non-existent key.
                    'main/pix_occ_loss',
                    'main/pix_loss',
                    'main/loss',
                ],
                file_name='loss.png',
                trigger=plot_interval,
            ),
            trigger=plot_interval,
        )
        trainer.extend(
            extensions.PlotReport(
                [
                    'validation/main/miou/vis',
                    'validation/main/miou/occ',
                    'validation/main/miou',
                    'validation/main/map',
                    'validation/main/msq',
                    'validation/main/mdq',
                    'validation/main/mpq',
                ],
                file_name='accuracy.png',
                trigger=plot_interval,
            ),
            trigger=eval_interval,
        )

        trainer.extend(extensions.dump_graph('main/loss'))

    trainer.run()
Example #2
0
def main():
    """Train the SRCNN-based deblurring network with Chainer.

    Parses command-line options, builds the predictor wrapped in a
    ``Classifier`` (MSE loss, PSNR as the accuracy function), sets up paired
    blurred/sharp datasets under ``data/blurred_sharp``, registers the
    training extensions and runs the trainer.  With ``--resume`` the trainer
    state is loaded from the given snapshot before training starts.
    """
    parser = argparse.ArgumentParser(description='Train Deblur Network')
    parser.add_argument('--seed',
                        '-s',
                        type=int,
                        default=0,
                        help='seed for random values')
    parser.add_argument('--batchsize',
                        '-b',
                        type=int,
                        default=128,
                        help='Number of images in each mini-batch')
    parser.add_argument('--learnrate',
                        '-l',
                        type=float,
                        default=0.1,
                        help='Learning rate for SGD')
    parser.add_argument('--epoch',
                        '-e',
                        type=int,
                        default=50,
                        help='Number of sweeps over the dataset to train')
    parser.add_argument('--gpu',
                        '-g',
                        type=int,
                        default=0,
                        help='GPU ID (negative value indicates CPU)')
    parser.add_argument('--out',
                        '-o',
                        default='result',
                        help='Directory to output the result')
    parser.add_argument('--resume',
                        '-r',
                        default='',
                        help='Resume the training from snapshot')
    args = parser.parse_args()

    print('GPU: {}'.format(args.gpu))
    print('# Minibatch-size: {}'.format(args.batchsize))
    print('# epoch: {}'.format(args.epoch))
    print(args)
    print('')

    set_random_seed(args.seed)

    predictor = srcnn.create_srcnn()
    model = L.Classifier(predictor, lossfun=F.mean_squared_error, accfun=psnr)

    if args.gpu >= 0:
        # Make a specified GPU current
        chainer.backends.cuda.get_device_from_id(args.gpu).use()
        model.to_gpu()  # Copy the model to the GPU

    optimizer = chainer.optimizers.MomentumSGD(args.learnrate)
    optimizer.setup(model)
    optimizer.add_hook(chainer.optimizer.WeightDecay(5e-4))

    base_dir = 'data/blurred_sharp'
    train_data = pairwise_dataset.PairwiseDataset(
        blur_image_list=str(Path(base_dir).joinpath('train_blur_images.txt')),
        sharp_image_list=str(
            Path(base_dir).joinpath('train_sharp_images.txt')),
        root=base_dir)
    train_data = chainer.datasets.TransformDataset(train_data,
                                                   transform.Transform())

    test_data = pairwise_dataset.PairwiseDataset(
        blur_image_list=str(Path(base_dir).joinpath('test_blur_images.txt')),
        sharp_image_list=str(Path(base_dir).joinpath('test_sharp_images.txt')),
        root=base_dir)
    # Test data would not normally be transformed, but without the transform
    # the resolution would differ from the training data.
    test_data = chainer.datasets.TransformDataset(test_data,
                                                  transform.Transform())

    train_iter = chainer.iterators.SerialIterator(train_data, args.batchsize)
    test_iter = chainer.iterators.SerialIterator(test_data,
                                                 args.batchsize,
                                                 repeat=False,
                                                 shuffle=False)
    # Set up a trainer
    updater = training.StandardUpdater(train_iter, optimizer, device=args.gpu)
    trainer = training.Trainer(updater, (args.epoch, 'epoch'), out=args.out)

    trainer.extend(extensions.FailOnNonNumber())
    # Evaluate the model with the test dataset for each epoch
    eval_trigger = (1, 'epoch')
    trainer.extend(extensions.Evaluator(test_iter, model, device=args.gpu),
                   trigger=eval_trigger)

    # Multiply the learning rate by 0.1 at 50% and at 75% of the total
    # number of epochs.
    lr_drop_epoch = [int(args.epoch * 0.5), int(args.epoch * 0.75)]
    lr_drop_ratio = 0.1
    print('lr schedule: {}, timing: {}'.format(lr_drop_ratio, lr_drop_epoch))

    def lr_drop(trainer):
        trainer.updater.get_optimizer('main').lr *= lr_drop_ratio

    trainer.extend(lr_drop,
                   trigger=chainer.training.triggers.ManualScheduleTrigger(
                       lr_drop_epoch, 'epoch'))
    trainer.extend(extensions.observe_lr(), trigger=(1, 'epoch'))

    # Dump a computational graph from 'loss' variable at the first iteration
    # The "main" refers to the target link of the "main" optimizer.
    trainer.extend(extensions.dump_graph('main/loss'))

    # Save the predictor's parameters at each epoch.  snapshot_object() is
    # the extension that takes (target, filename); extensions.snapshot()
    # takes a save function as its first positional argument, so passing the
    # predictor there was a bug.
    # NOTE: these files hold the predictor only, whereas --resume below loads
    # into the trainer and therefore needs a full trainer snapshot.
    trainer.extend(extensions.snapshot_object(model.predictor,
                                              'model_{.updater.epoch}.npz'),
                   trigger=(1, 'epoch'))

    # Write a log of evaluation statistics every 100 iterations
    trainer.extend(extensions.LogReport(trigger=(100, 'iteration')))

    # Print selected entries of the log to stdout
    # Here "main" refers to the target link of the "main" optimizer again, and
    # "validation" refers to the default name of the Evaluator extension.
    # Entries other than 'epoch' are reported by the Classifier link, called by
    # either the updater or the evaluator.
    trainer.extend(
        extensions.PrintReport([
            'epoch', 'lr', 'main/loss', 'validation/main/loss',
            'main/accuracy', 'validation/main/accuracy', 'elapsed_time'
        ]),
        trigger=(100, 'iteration'))

    # Print a progress bar to stdout
    trainer.extend(extensions.ProgressBar())
    # interact with chainerui
    trainer.extend(CommandsExtension(), trigger=(100, 'iteration'))
    # save args
    save_args(args, args.out)

    if args.resume:
        # Resume from a snapshot
        chainer.serializers.load_npz(args.resume, trainer)

    # Run the training
    trainer.run()
Example #3
0
def create_trainer(
    config: Config,
    output: Path,
):
    """Build and return a fully configured Chainer ``Trainer``.

    Creates the predictor/model, dataset iterators, optimizer and updater
    from ``config``, creates the ``output`` directory (which must not exist
    yet), writes ``config.json`` into it and registers the evaluation,
    snapshot and reporting extensions.

    Args:
        config: Project configuration; validated with ``assert_config``.
        output: Output directory for the run.

    Returns:
        The configured ``training.Trainer`` (not yet run).

    Raises:
        Exception: If ``output`` already exists.
        ValueError: If the configured optimizer name is neither ``adam``
            nor ``sgd`` (case-insensitive).
    """
    assert_config(config)
    if output.exists():
        raise Exception(f"output directory {output} already exists.")

    train_config = config.train
    main_gpu = train_config.gpu[0]
    resuming = train_config.trained_model is not None

    # model
    predictor = create_predictor(config.model)
    if resuming:
        chainer.serializers.load_npz(
            train_config.trained_model["predictor_path"], predictor)
    model = Model(
        loss_config=config.loss,
        predictor=predictor,
        local_padding_size=config.dataset.local_padding_size,
    )

    model.to_gpu(main_gpu)
    cuda.get_device_from_id(main_gpu).use()

    # dataset
    dataset = create_dataset(config.dataset)
    per_gpu_batchsize = train_config.batchsize // len(train_config.gpu)

    def one_pass_iterator(data):
        # Non-repeating, shuffled iterator used by all evaluation passes.
        return MultiprocessIterator(data,
                                    per_gpu_batchsize,
                                    repeat=False,
                                    shuffle=True)

    train_iter = MultiprocessIterator(dataset["train"], train_config.batchsize)
    test_iter = one_pass_iterator(dataset["test"])
    train_test_iter = one_pass_iterator(dataset["train_test"])

    test_eval_iter = None
    if dataset["test_eval"] is not None:
        test_eval_iter = one_pass_iterator(dataset["test_eval"])

    # optimizer: the "name" entry selects the class, the remaining entries
    # are passed to its constructor.
    optimizer_kwargs = copy(train_config.optimizer)
    optimizer_name = optimizer_kwargs.pop("name").lower()
    optimizer_classes = {
        "adam": optimizers.Adam,
        "sgd": optimizers.SGD,
    }
    optimizer_class = optimizer_classes.get(optimizer_name)
    if optimizer_class is None:
        raise ValueError(optimizer_name)
    optimizer = optimizer_class(**optimizer_kwargs)
    optimizer.setup(model)

    clipping = train_config.optimizer_gradient_clipping
    if clipping is not None:
        optimizer.add_hook(optimizer_hooks.GradientClipping(clipping))

    if resuming:
        chainer.serializers.load_npz(
            train_config.trained_model["optimizer_path"], optimizer)

    # updater
    if len(train_config.gpu) > 1:
        # The first GPU is registered as "main", the others as "gpu<id>".
        devices = {}
        for index, gpu in enumerate(train_config.gpu):
            key = "main" if index == 0 else f"gpu{gpu}"
            devices[key] = gpu
        updater = ParallelUpdater(
            iterator=train_iter,
            optimizer=optimizer,
            converter=concat_optional,
            devices=devices,
        )
    else:
        updater = StandardUpdater(
            iterator=train_iter,
            optimizer=optimizer,
            converter=concat_optional,
            device=main_gpu,
        )
    if resuming:
        # Continue the iteration count from the restored optimizer state.
        updater.iteration = optimizer.t

    # trainer
    output.mkdir()
    config.save_as_json((output / "config.json").absolute())

    trigger_log = (train_config.log_iteration, "iteration")
    trigger_snapshot = (train_config.snapshot_iteration, "iteration")
    if train_config.stop_iteration is not None:
        trigger_stop = (train_config.stop_iteration, "iteration")
    else:
        trigger_stop = None

    trainer = training.Trainer(updater, stop_trigger=trigger_stop, out=output)
    tb_writer = SummaryWriter(Path(output))

    # Learning-rate shift; when both are configured the step shift wins.
    shift_ext = None
    if train_config.linear_shift is not None:
        shift_ext = extensions.LinearShift(**train_config.linear_shift)
    if train_config.step_shift is not None:
        shift_ext = extensions.StepShift(**train_config.step_shift)
    if shift_ext is not None:
        if resuming:
            shift_ext._t = optimizer.t
        trainer.extend(shift_ext)

    # With EMA enabled, generation and snapshots use an averaged copy of the
    # predictor while training keeps updating the original one.
    output_predictor = predictor
    if train_config.ema_decay is not None:
        output_predictor = deepcopy(predictor)
        trainer.extend(
            ExponentialMovingAverage(target=predictor,
                                     ema_target=output_predictor,
                                     decay=train_config.ema_decay),
            trigger=(1, "iteration"),
        )

    def make_evaluator(iterator, target):
        # All evaluators share the converter and run on the main GPU.
        return extensions.Evaluator(iterator,
                                    target,
                                    concat_optional,
                                    device=main_gpu)

    trainer.extend(make_evaluator(test_iter, model),
                   name="test",
                   trigger=trigger_log)
    trainer.extend(make_evaluator(train_test_iter, model),
                   name="train",
                   trigger=trigger_log)

    if test_eval_iter is not None:
        generator = Generator(config=config,
                              model=output_predictor,
                              max_batch_size=config.train.batchsize)
        generate_evaluator = GenerateEvaluator(
            generator=generator,
            time_length=config.dataset.time_length_evaluate,
            local_padding_time_length=(
                config.dataset.local_padding_time_length_evaluate),
        )
        trainer.extend(make_evaluator(test_eval_iter, generate_evaluator),
                       name="eval",
                       trigger=trigger_snapshot)

    trainer.extend(
        extensions.snapshot_object(output_predictor,
                                   filename="main_{.updater.iteration}.npz"),
        trigger=trigger_snapshot,
    )

    trainer.extend(extensions.FailOnNonNumber(), trigger=trigger_log)
    trainer.extend(extensions.observe_lr(), trigger=trigger_log)
    trainer.extend(extensions.LogReport(trigger=trigger_log))
    report_keys = ["iteration", "main/loss", "test/main/loss"]
    trainer.extend(extensions.PrintReport(report_keys), trigger=trigger_log)
    trainer.extend(TensorBoardReport(writer=tb_writer), trigger=trigger_log)

    trainer.extend(extensions.dump_graph(root_name="main/loss"))

    if trigger_stop is not None:
        trainer.extend(extensions.ProgressBar(trigger_stop))

    return trainer
def main():
    parser = argparse.ArgumentParser(description='Chainer example: VAE')
    parser.add_argument('--gpu',
                        default=0,
                        type=int,
                        help='GPU ID (negative value indicates CPU)')
    parser.add_argument('--output_dir',
                        '-o',
                        default='result_mvae/',
                        help='Directory to output the result')
    parser.add_argument('--epochs',
                        '-e',
                        default=100,
                        type=int,
                        help='Number of epochs')
    parser.add_argument('--dimz',
                        '-z',
                        default=8,
                        type=int,
                        help='Dimention of encoded vector')
    parser.add_argument('--batchsize',
                        '-batch',
                        type=int,
                        default=32,
                        help='Learning minibatch size')
    parser.add_argument('--beta',
                        '-b',
                        default=1,
                        help='Beta coefficient for the KL loss')
    parser.add_argument(
        '--gamma_obj',
        '-gO',
        default=1,
        help='Gamma coefficient for the OBJECT classification loss')
    parser.add_argument(
        '--gamma_rel',
        '-gR',
        default=1,
        help='Gamma coefficient for the RELATIONAL classification loss')
    parser.add_argument('--alpha',
                        '-a',
                        default=1,
                        help='Alpha coefficient for the reconstruction loss')
    parser.add_argument(
        '--freq',
        '-f',
        default=1000,
        help='Frequency at which snapshots of the model are saved.')
    parser.add_argument('--augment_counter',
                        type=int,
                        default=0,
                        help='Number ot times to augment the train data')
    parser.add_argument('--objects_n',
                        default=2,
                        type=int,
                        help='# of objects to be used')

    args = parser.parse_args()

    if not osp.isdir(osp.join(args.output_dir)):
        os.makedirs(args.output_dir)

    if not osp.isdir(osp.join(args.output_dir, 'models')):
        os.makedirs(osp.join(args.output_dir, 'models'))

    print('\n###############################################')
    print('# GPU: \t\t\t{}'.format(args.gpu))
    print('# dim z: \t\t{}'.format(args.dimz))
    print('# Minibatch-size: \t{}'.format(args.batchsize))
    print('# Epochs: \t\t{}'.format(args.epochs))
    print('# Beta: \t\t{}'.format(args.beta))
    print('# Gamma OBJ: \t\t{}'.format(args.gamma_obj))
    print('# Gamma REL: \t\t{}'.format(args.gamma_rel))
    print('# Frequency: \t\t{}'.format(args.freq))
    print('# Out Folder: \t\t{}'.format(args.output_dir))
    print('###############################################\n')

    stats = {
        'train_loss': [],
        'train_rec_loss': [],
        'train_kl': [],
        'train_label_obj_acc': [],
        'train_label_obj_loss': [],
        'train_label_rel_acc': [],
        'train_label_rel_loss': [],
        'valid_loss': [],
        'valid_rec_loss': [],
        'valid_kl': [],
        'valid_label_obj_acc': [],
        'valid_label_obj_loss': [],
        'valid_label_rel_acc': [],
        'valid_label_rel_loss': []
    }

    models_folder = os.path.join(args.output_dir, "models")

    folder_names = [
        'yordan_experiments/off-on', 'yordan_experiments/nonfacing-facing',
        'yordan_experiments/out-in'
    ]
    # folder_names = ['yordan_experiments/off-on']

    generator = data_generator.DataGenerator(folder_names=folder_names,\
                                             data_split=0.8)

    train, train_labels, train_concat, train_vectors, test, test_labels, test_concat, test_vectors,\
    unseen, unseen_labels, unseen_concat, unseen_vectors,\
    groups_obj, groups_rel = generator.generate_dataset(args=args)

    data_dimensions = train.shape
    print('\n###############################################')
    print("DATA_LOADED")
    print("# Training Images: \t\t{0}".format(train.shape))
    print("# Testing Images: \t\t{0}".format(test.shape))
    print("# Unseen Images: \t\t{0}".format(unseen.shape))
    print("# Training Rel Labels: \t\t{0}".format(train_labels.shape))
    print("# Testing Rel Labels: \t\t{0}".format(test_labels.shape))
    print("# Unseen Rel Labels: \t\t{0}".format(unseen_labels.shape))
    print("# Training Rel Vectors: \t\t{0}".format(train_vectors.shape))
    print("# Testing Rel Vectors: \t\t{0}".format(test_vectors.shape))
    print('###############################################\n')

    if len(train_concat[1]) > 0:
        print("# Relation Label Stats:")
        for group_idx, group in groups_rel.items():
            print("# Group: \t\t{0} : {1}".format(group_idx, group))
            for label_idx, label in enumerate(group + ["unlabelled"]):
                print("#{0} Train: \t\t{1}".format(
                    label,
                    len(filter(lambda x: label == x[group_idx],
                               train_labels))))
                print("#{0} Test: \t\t{1}".format(
                    label,
                    len(filter(lambda x: label == x[group_idx], test_labels))))
        print('###############################################\n')

    if len(train_concat[3]) > 0:
        print("# Object Label Stats:")
        train_object_vectors = np.array([
            train_concat[i][3][j] for i in range(len(train_concat))
            for j in range(args.objects_n)
        ])
        test_object_vectors = np.array([
            test_concat[i][3][j] for i in range(len(test_concat))
            for j in range(args.objects_n)
        ])

        train_object_vector_masks = np.array([
            train_concat[i][4][j] for i in range(len(train_concat))
            for j in range(args.objects_n)
        ])
        test_object_vector_masks = np.array([
            test_concat[i][4][j] for i in range(len(test_concat))
            for j in range(args.objects_n)
        ])
        for group_idx, group in groups_obj.items():
            print("# Group: \t\t{0} : {1}".format(group_idx, group))
            for label_idx, label in enumerate(group):
                print("#{0} Train: \t\t{1}".format(
                    label,
                    len(
                        filter(
                            lambda (x, y): label_idx == x[group_idx] and y[
                                group_idx] != 0,
                            zip(train_object_vectors,
                                train_object_vector_masks)))))
                print("#{0} Test: \t\t{1}".format(
                    label,
                    len(
                        filter(
                            lambda (x, y): label_idx == x[group_idx] and y[
                                group_idx] != 0,
                            zip(test_object_vectors,
                                test_object_vector_masks)))))
            for label_idx, label in enumerate(["unlabelled"]):
                print("#{0} Train: \t\t{1}".format(
                    label,
                    len(
                        filter(
                            lambda (x, y): label_idx == x[group_idx] and y[
                                group_idx] == 0,
                            zip(train_object_vectors,
                                train_object_vector_masks)))))
                print("#{0} Test: \t\t{1}".format(
                    label,
                    len(
                        filter(
                            lambda (x, y): label_idx == x[group_idx] and y[
                                group_idx] == 0,
                            zip(test_object_vectors,
                                test_object_vector_masks)))))
        print('###############################################\n')

    train_iter = chainer.iterators.SerialIterator(train_concat, args.batchsize)
    test_iter = chainer.iterators.SerialIterator(test_concat,
                                                 args.batchsize,
                                                 repeat=False,
                                                 shuffle=False)

    model = net.Conv_MVAE(train.shape[1],
                          latent_n=args.dimz,
                          groups_obj=groups_obj,
                          groups_rel=groups_rel,
                          alpha=args.alpha,
                          beta=args.beta,
                          gamma_obj=args.gamma_obj,
                          gamma_rel=args.gamma_rel,
                          objects_n=args.objects_n)

    if args.gpu >= 0:
        # Make a specified GPU current
        chainer.cuda.get_device_from_id(args.gpu).use()
        model.to_gpu()

    # Setup an optimizer
    optimizer = chainer.optimizers.Adam()
    # optimizer = chainer.optimizers.RMSprop()
    optimizer.setup(model)
    optimizer.add_hook(chainer.optimizer_hooks.WeightDecay(0.0005))
    # optimizer.add_hook(chainer.optimizer_hooks.GradientClipping(0.00001))

    updater = training.StandardUpdater(train_iter,
                                       optimizer,
                                       loss_func=model.lf,
                                       device=args.gpu)

    trainer = training.Trainer(updater, (args.epochs, 'epoch'),
                               out=args.output_dir)
    trainer.extend(extensions.Evaluator(test_iter,
                                        model,
                                        eval_func=model.lf,
                                        device=args.gpu),
                   name="val",
                   trigger=(1, 'epoch'))
    trainer.extend(extensions.LogReport(trigger=(1, 'epoch')))
    trainer.extend(extensions.PrintReport([
                                           'epoch', \
                                           'main/rec_l', 'val/main/rec_l', \
                                           'val/main/kl', \
                                           'main/obj_a','val/main/obj_a', \
                                           'main/rel_a','val/main/rel_a', \
                                           'main/obj_l', \
                                           'val/main/obj_l', \
                                           'main/rel_l',\
                                           'val/main/rel_l']))
    trainer.extend(extensions.PlotReport(['main/rec_l', \
                                          'val/main/rec_l'], \
                                           x_key='epoch', file_name='rec_loss.png', marker=None))
    trainer.extend(extensions.PlotReport(['main/kl', \
                                          'val/main/kl'], \
                                           x_key='epoch', file_name='kl.png', marker=None))
    trainer.extend(extensions.PlotReport(['main/obj_a', \
                                          'val/main/obj_a'], \
                                           x_key='epoch', file_name='object_acc.png', marker=None))
    trainer.extend(extensions.PlotReport(['main/obj_l', \
                                          'val/main/obj_l'], \
                                           x_key='epoch', file_name='object_loss.png', marker=None))
    trainer.extend(extensions.PlotReport(['main/rel_a', \
                                          'val/main/rel_a'], \
                                           x_key='epoch', file_name='relation_acc.png', marker=None))
    trainer.extend(extensions.PlotReport(['main/rel_l', \
                                          'val/main/rel_l'], \
                                           x_key='epoch', file_name='relation_loss.png', marker=None))
    # trainer.extend(extensions.dump_graph('main/loss'))
    trainer.extend(extensions.ProgressBar(update_interval=10))
    trainer.extend(extensions.FailOnNonNumber())
    trainer.extend(extensions.snapshot(
        filename='snapshot_epoch_{.updater.epoch}.trainer'),
                   trigger=(args.epochs, 'epoch'))
    trainer.extend(extensions.snapshot_object(
        model, filename='snapshot_epoch_{.updater.epoch}.model'),
                   trigger=(10, 'epoch'))
    trainer.extend(extensions.snapshot_object(model, 'final.model'),
                   trigger=(args.epochs, 'epoch'))
    trainer.extend(model.check_loss_coefficients(), trigger=(1, 'epoch'))
    trainer.extend(extensions.ExponentialShift('alpha',
                                               0.5,
                                               init=1e-3,
                                               target=1e-8),
                   trigger=(args.epochs / 2, 'epoch'))  # For Adam

    trainer.run()
Example #5
0
def main3():
    """Train a ``GoalScoreModel`` from the command line.

    Parses the CLI options, loads and shuffles the frame/label data, builds
    the train/validation iterators (either a split of the shuffled data or,
    with ``--real_test``, a separately loaded test trial), and runs a Chainer
    ``Trainer`` with evaluation, logging, plotting and snapshot extensions.
    Results are written to ``--out_dir``.
    """
    parser = argparse.ArgumentParser()
    parser.add_argument('--gpu_id', '-g', type=int, default=0)
    parser.add_argument('--batch_size', '-b', type=int, default=60)
    parser.add_argument('--test_split', type=float, default=0.2)
    parser.add_argument(
        '--real_test',
        dest='real_test',
        action='store_true',
        help='Whether to split the data or use a complete new trial.')
    parser.add_argument('--max_epoch', '-e', type=int, default=110)
    # The snapshot is a file path handed to ``load_npz``; declaring it as an
    # int made it impossible to pass a real path.
    parser.add_argument('--resume',
                        '-r',
                        type=str,
                        default=None,
                        help='Path of a trainer snapshot to resume from.')
    parser.add_argument(
        '--out_dir',
        '-o',
        type=str,
        default=
        '/mnt/7ac4c5b9-8c05-451f-9e6d-897daecb7442/gears/results_gsm/result_right_arm2'
    )
    # The three options below are read by the ``--real_test`` branch but were
    # never declared, so that branch always died with an AttributeError.
    parser.add_argument('--data_base_dir',
                        type=str,
                        default=None,
                        help='Directory of the test trial (for --real_test).')
    parser.add_argument('--data_file_pattern',
                        '-f',
                        type=str,
                        default=None,
                        help='File name pattern of the test trial '
                        '(for --real_test).')
    # NOTE(review): ``blackout`` is forwarded verbatim to
    # ``load_frames_labels``; a boolean flag is assumed here -- confirm
    # against that function's signature.
    parser.add_argument('--blackout',
                        action='store_true',
                        help='Forwarded to load_frames_labels '
                        '(for --real_test).')
    args = parser.parse_args()

    # Fail before the expensive data loading if the test trial is unspecified.
    if args.real_test and (args.data_base_dir is None
                           or args.data_file_pattern is None):
        parser.error(
            '--real_test requires --data_base_dir and --data_file_pattern')

    model = GoalScoreModel()

    frames, labels = load_all_data(prep_f=model.prepare)

    frames, labels = igp.unison_shuffled_copies(frames, labels)
    print('Frames shape: ', frames.shape, ' Labels shape: ', labels.shape)

    data = chainer.datasets.TupleDataset(frames, labels)  #.to_device(gpu_id)
    # Use the public length protocol instead of the private ``_length``.
    print('Dataset length: ', len(data))

    print('Frame size: ', data[0][0].shape, data[0][0].dtype)

    if args.real_test:
        # Train on everything that was loaded; validate on a separate trial.
        print('Using test trial.')
        train_iter = iterators.SerialIterator(data,
                                              args.batch_size,
                                              shuffle=True)

        # Load the test data
        test_frames, test_labels = load_frames_labels(
            ids=[11],
            filestype=''.join((args.data_base_dir, args.data_file_pattern)),
            blackout=args.blackout)
        data_test = chainer.datasets.TupleDataset(test_frames, test_labels)
        test_iter = iterators.SerialIterator(data_test,
                                             args.batch_size,
                                             repeat=False,
                                             shuffle=False)
    else:
        # The first ``test_split`` fraction of the (already shuffled) data is
        # held out for validation; the remainder is used for training.
        data_test, data_train = split_dataset(data,
                                              int(args.test_split * len(data)))
        train_iter = iterators.SerialIterator(data_train,
                                              args.batch_size,
                                              shuffle=True)
        test_iter = iterators.SerialIterator(data_test,
                                             args.batch_size,
                                             repeat=False,
                                             shuffle=False)

    if args.gpu_id >= 0:
        chainer.backends.cuda.get_device_from_id(args.gpu_id).use()
        model.to_gpu(args.gpu_id)

    # Create the optimizer for the model
    optimizer = optimizers.Adam().setup(model)
    optimizer.add_hook(chainer.optimizer.WeightDecay(rate=1e-6))

    updater = training.StandardUpdater(train_iter,
                                       optimizer,
                                       loss_func=model.calc_loss,
                                       device=args.gpu_id)

    # Full training
    print('Full model training ...')
    trainer = training.Trainer(updater, (args.max_epoch, 'epoch'),
                               out=args.out_dir)
    # Validation results are reported under the ``val/`` prefix.
    trainer.extend(extensions.Evaluator(test_iter,
                                        model,
                                        eval_func=model.calc_loss,
                                        device=args.gpu_id),
                   name='val',
                   trigger=(1, 'epoch'))
    trainer.extend(extensions.LogReport(trigger=(1, 'epoch')))
    trainer.extend(
        extensions.PrintReport([
            'epoch', 'iteration', 'main/loss', 'main/mae', 'main/gnll',
            'main/weighted', 'main/VAE', 'main/VAE_REC', 'main/VAE_KL',
            'val/main/loss', 'val/main/mae', 'val/main/weighted',
            'elapsed_time'
        ]))
    trainer.extend(
        extensions.PlotReport(
            ['main/mae', 'val/main/mae', 'main/VAE', 'val/main/VAE'],
            x_key='epoch',
            file_name='loss.png',
            marker=None))
    trainer.extend(extensions.dump_graph('main/loss'))
    trainer.extend(extensions.ProgressBar(update_interval=10))
    trainer.extend(extensions.FailOnNonNumber())
    # Save every X epochs
    trainer.extend(extensions.snapshot(
        filename='snapshot_epoch_{.updater.epoch}.trainer'),
                   trigger=(200, 'epoch'))
    trainer.extend(extensions.snapshot_object(
        model,
        '%s_model_epoch_{.updater.epoch}.model' % (model.__class__.__name__)),
                   trigger=(10, 'epoch'))

    trainer.extend(utils.display_image(model.vae_image,
                                       data_test,
                                       args.out_dir,
                                       args.gpu_id,
                                       n=3),
                   trigger=(1, 'epoch'))

    # Halve Adam's ``alpha`` every 100 epochs, starting from 1e-3 and never
    # going below 1e-8.
    trainer.extend(extensions.ExponentialShift('alpha',
                                               0.5,
                                               init=1e-3,
                                               target=1e-8),
                   trigger=(100, 'epoch'))

    # Resume from a specified snapshot
    if args.resume:
        chainer.serializers.load_npz(args.resume, trainer)

    trainer.run()
    print('Done.')
def main(hpt):
    """Train the model described by ``hpt`` and return its evaluation metrics.

    Args:
        hpt: Hyper-parameter object. The attributes read here are
            ``dataset`` (``type`` plus keyword arguments for
            ``dataset.get_dataset``), ``general`` (``test``, ``gpu``,
            ``noplot``), ``optimizer`` (``type``, ``lr``) and ``training``
            (``batch_size``, ``iteration``, ``burnin_step``, ``c``).

    Returns:
        The mapping returned by ``evaluate``; every entry is also logged.

    Raises:
        AttributeError: If ``hpt.optimizer.type`` is neither ``'adam'`` nor
            ``'adagrad'``.
    """
    logger.info('load dataset')
    train, valid, test = dataset.get_dataset(hpt.dataset.type, **hpt.dataset)
    assert valid is None
    assert test is None

    if hpt.general.test:
        # Smoke-test mode: keep only the first 100 samples and turn on
        # Chainer's debug checks.
        train, _ = chainer.datasets.split_dataset(train, 100)
        chainer.set_debug(True)

    train_iter = chainer.iterators.SerialIterator(train,
                                                  hpt.training.batch_size)

    logger.info('build model')
    loss = get_model(hpt)
    if hpt.general.gpu >= 0:
        loss.to_gpu(hpt.general.gpu)

    logger.info('setup optimizer')
    if hpt.optimizer.type == 'adam':
        optimizer = chainer.optimizers.Adam(alpha=hpt.optimizer.lr)
    elif hpt.optimizer.type == 'adagrad':
        optimizer = chainer.optimizers.AdaGrad(lr=hpt.optimizer.lr)
    else:
        # Same exception type as before, now saying what was wrong.
        raise AttributeError('unsupported optimizer type: {!r}'.format(
            hpt.optimizer.type))
    optimizer.setup(loss)

    logger.info('setup updater/trainer')
    updater = training.updaters.StandardUpdater(train_iter,
                                                optimizer,
                                                device=hpt.general.gpu,
                                                loss_func=loss)

    trainer = training.Trainer(updater, (hpt.training.iteration, 'iteration'),
                               out=po.namedir(output='str'))

    # Adam exposes its step size as ``alpha``; AdaGrad as ``lr``.
    lr_name = 'alpha' if hpt.optimizer.type == 'adam' else 'lr'
    trainer.extend(
        Burnin(lr_name, burnin_step=hpt.training.burnin_step,
               c=hpt.training.c))

    trainer.extend(extensions.FailOnNonNumber())

    # Snapshot five times over the run. Clamp to at least one iteration: a
    # run shorter than five iterations would otherwise yield an interval of
    # zero, which the interval trigger cannot handle.
    snapshot_interval = max(1, int(hpt.training.iteration / 5))
    trainer.extend(extensions.snapshot_object(
        loss, 'loss_snapshot_iter_{.updater.iteration}'),
                   trigger=(snapshot_interval, 'iteration'))
    trainer.extend(extensions.LogReport())
    trainer.extend(extensions.observe_lr())
    trainer.extend(
        extensions.PrintReport([
            'epoch', 'iteration', 'main/loss', 'main/kl_target',
            'main/kl_negative', 'lr', 'main/bound', 'elapsed_time'
        ]))
    trainer.extend(extensions.ProgressBar())

    # Save plot images to the result dir
    if (not hpt.general.noplot) and extensions.PlotReport.available():
        trainer.extend(
            extensions.PlotReport(['main/loss'],
                                  'epoch',
                                  file_name=(po.imagesdir() /
                                             'loss.png').as_posix()))
        trainer.extend(
            extensions.PlotReport(['main/kl_target', 'main/kl_negative'],
                                  'epoch',
                                  file_name=(po.imagesdir() /
                                             'kldiv.png').as_posix()))

    # Run the training
    logger.info('run training')
    trainer.run()

    logger.info('evaluate')
    metrics = evaluate(hpt, train, test, loss)
    for metric_name, metric in metrics.items():
        logger.info('{}: {:.4f}'.format(metric_name, metric))

    # The former ``if hpt.general.noplot: return metrics`` branch was
    # redundant: both paths returned the same value.
    return metrics
Example #7
0
def main():
    """Train the single-view 3D pose estimation model from the command line.

    Parses the CLI options, sets up the device (single GPU/CPU or a
    ChainerMN communicator when ``--multi-node`` is given), builds the
    training and validation datasets, the model, the Adam optimizer and the
    iterators, registers the trainer extensions and finally runs the trainer.

    In multi-node mode, the datasets are built on rank 0 and scattered to the
    other workers, and all logging/snapshot extensions are registered on
    rank 0 only.
    """
    parser = argparse.ArgumentParser(
        formatter_class=argparse.ArgumentDefaultsHelpFormatter, )
    parser.add_argument("--multi-node", action="store_true", help="multi node")
    parser.add_argument("--out", help="output directory")
    parser.add_argument("--debug", action="store_true", help="debug mode")
    parser.add_argument("--gpu", type=int, default=0, help="gpu id")
    parser.add_argument("--seed", type=int, default=0, help="random seed")
    parser.add_argument(
        "--lr",
        type=float,
        default=0.0001,
        help="learning rate",
    )
    parser.add_argument(
        "--max-epoch",
        type=int,
        default=30,
        help="max epoch",
    )
    parser.add_argument(
        "--call-evaluation-before-training",
        action="store_true",
        help="call evaluation before training",
    )

    def argparse_type_class_ids(string):
        """Convert the ``--class-ids`` value into a list of class ids.

        ``"all"`` selects every index of ``ycb_video.class_names`` except 0,
        ``"asymmetric"``/``"symmetric"`` select the corresponding predefined
        id lists, and anything else is parsed as comma-separated integers.
        """
        if string == "all":
            n_class = len(morefusion.datasets.ycb_video.class_names)
            class_ids = np.arange(n_class)[1:].tolist()
        elif string == "asymmetric":
            class_ids = (
                morefusion.datasets.ycb_video.class_ids_asymmetric.tolist())
        elif string == "symmetric":
            class_ids = (
                morefusion.datasets.ycb_video.class_ids_symmetric.tolist())
        else:
            class_ids = [int(x) for x in string.split(",")]
        return class_ids

    parser.add_argument(
        "--class-ids",
        type=argparse_type_class_ids,
        default="all",
        help="class id (e.g., 'all', 'asymmetric', 'symmetric', '1,6,9')",
    )
    parser.add_argument(
        "--pretrained-model",
        help="pretrained model",
    )
    parser.add_argument(
        "--with-occupancy",
        action="store_true",
        help="with occupancy",
    )
    parser.add_argument(
        "--note",
        help="note",
    )
    parser.add_argument(
        "--pretrained-resnet18",
        action="store_true",
        help="pretrained resnet18",
    )
    parser.add_argument(
        "--resume",
        help="resume",
    )
    # Losses written as "a->b|1" are staged: the model starts with loss "a"
    # and the ``update_loss`` extension below switches it to "b" once one
    # epoch has elapsed.
    parser.add_argument(
        "--loss",
        choices=[
            "add/add_s",
            "add/add_s+occupancy",
            "add->add_s|1",
            "add->add/add_s|1",
            "add->add/add_s|1+occupancy",
        ],
        default="add->add/add_s|1",
        help="loss",
    )
    # Given as a JSON object on the command line.
    parser.add_argument(
        "--loss-scale",
        type=json.loads,
        default={"occupancy": 1.0},
        help="loss scale",
    )
    args = parser.parse_args()

    chainer.global_config.debug = args.debug

    # -------------------------------------------------------------------------

    # device initialization
    if args.multi_node:
        import chainermn

        comm = chainermn.create_communicator("pure_nccl")
        device = comm.intra_rank
        n_gpu = comm.size
    else:
        device = args.gpu
        n_gpu = 1

    # Run metadata is recorded on the single process / rank 0 only.
    if not args.multi_node or comm.rank == 0:
        now = datetime.datetime.now(datetime.timezone.utc)
        args.timestamp = now.isoformat()
        args.hostname = socket.gethostname()
        args.githash = morefusion.utils.githash(__file__)

        termcolor.cprint("==> Started training", attrs={"bold": True})

    # Default output directory: a timestamped folder under ``logs``. Only
    # rank 0 has ``now``, so it chooses the path and broadcasts it to the
    # other workers.
    if args.out is None:
        if not args.multi_node or comm.rank == 0:
            args.out = osp.join(here, "logs", now.strftime("%Y%m%d_%H%M%S.%f"))
        else:
            args.out = None
        if args.multi_node:
            args.out = comm.bcast_obj(args.out)

    if device >= 0:
        chainer.cuda.get_device_from_id(device).use()

    # seed initialization
    random.seed(args.seed)
    np.random.seed(args.seed)
    if device >= 0:
        chainer.cuda.cupy.random.seed(args.seed)

    # dataset initialization
    # Non-zero ranks keep ``None`` here and receive their share from
    # ``scatter_dataset`` below.
    data_train = None
    data_valid = None
    if not args.multi_node or comm.rank == 0:
        termcolor.cprint("==> Dataset size", attrs={"bold": True})

        data_ycb_trainreal = morefusion.datasets.YCBVideoRGBDPoseEstimationDatasetReIndexed(  # NOQA
            "trainreal", class_ids=args.class_ids, augmentation=True)
        data_ycb_syn = morefusion.datasets.YCBVideoRGBDPoseEstimationDatasetReIndexed(  # NOQA
            "syn", class_ids=args.class_ids, augmentation=True)
        # Wrap the synthetic set in a random sampler sized to the real
        # training set.
        data_ycb_syn = morefusion.datasets.RandomSamplingDataset(
            data_ycb_syn, len(data_ycb_trainreal))
        data_my_train = morefusion.datasets.MySyntheticYCB20190916RGBDPoseEstimationDatasetReIndexed(  # NOQA
            "train", class_ids=args.class_ids, augmentation=True)
        data_train = chainer.datasets.ConcatenatedDataset(
            data_ycb_trainreal, data_ycb_syn, data_my_train)
        print(f"ycb_trainreal={len(data_ycb_trainreal)}, "
              f"ycb_syn={len(data_ycb_syn)}, my_train={len(data_my_train)}")
        del data_ycb_trainreal, data_ycb_syn, data_my_train

        data_ycb_val = morefusion.datasets.YCBVideoRGBDPoseEstimationDatasetReIndexed(  # NOQA
            "val", class_ids=args.class_ids)
        data_my_val = morefusion.datasets.MySyntheticYCB20190916RGBDPoseEstimationDatasetReIndexed(  # NOQA
            "val", class_ids=args.class_ids)
        data_valid = chainer.datasets.ConcatenatedDataset(
            data_ycb_val,
            data_my_val,
        )
        print(f"ycb_val={len(data_ycb_val)}, my_val={len(data_my_val)}")
        del data_ycb_val, data_my_val

        data_train = chainer.datasets.TransformDataset(
            data_train,
            Transform(train=True, with_occupancy=args.with_occupancy),
        )
        data_valid = chainer.datasets.TransformDataset(
            data_valid,
            Transform(train=False, with_occupancy=args.with_occupancy),
        )
        print(f"train={len(data_train)}, valid={len(data_valid)}")

    if args.multi_node:
        data_train = chainermn.scatter_dataset(data_train,
                                               comm,
                                               shuffle=True,
                                               seed=args.seed)
        data_valid = chainermn.scatter_dataset(data_valid,
                                               comm,
                                               shuffle=False,
                                               seed=args.seed)

    args.class_names = morefusion.datasets.ycb_video.class_names.tolist()

    # Initial loss handed to the model: staged losses start with their first
    # stage ("add", or "add+occupancy" for the occupancy variant).
    loss = args.loss
    if loss in ["add->add_s|1", "add->add/add_s|1"]:
        loss = "add"
    elif loss == "add->add/add_s|1+occupancy":
        loss = "add+occupancy"

    # model initialization
    model = singleview_3d.models.Model(
        n_fg_class=len(args.class_names[1:]),
        pretrained_resnet18=args.pretrained_resnet18,
        with_occupancy=args.with_occupancy,
        loss=loss,
        loss_scale=args.loss_scale,
    )
    if args.pretrained_model is not None:
        chainer.serializers.load_npz(args.pretrained_model, model)
    if device >= 0:
        model.to_gpu()

    # optimizer initialization
    optimizer = chainer.optimizers.Adam(alpha=args.lr)
    if args.multi_node:
        optimizer = chainermn.create_multi_node_optimizer(optimizer, comm)
    optimizer.setup(model)

    # With a pretrained ResNet-18, freeze its first two stages and every
    # batch-normalization link in the model.
    if args.pretrained_resnet18:
        model.resnet_extractor.init_block.disable_update()
        model.resnet_extractor.res2.disable_update()
        for link in model.links():
            if isinstance(link, chainer.links.BatchNormalization):
                link.disable_update()

    if not args.multi_node or comm.rank == 0:
        termcolor.cprint("==> Link update rules", attrs={"bold": True})
        for name, link in model.namedlinks():
            print(name, link.update_enabled)

    # iterator initialization
    # The training batch of 16 is divided by the number of GPUs to get the
    # per-iterator batch size.
    iter_train = chainer.iterators.MultithreadIterator(
        data_train,
        batch_size=16 // n_gpu,
        repeat=True,
        shuffle=True,
    )
    iter_valid = chainer.iterators.MultithreadIterator(
        data_valid,
        batch_size=48,
        repeat=False,
        shuffle=False,
    )

    updater = chainer.training.StandardUpdater(
        iterator=iter_train,
        optimizer=optimizer,
        device=device,
    )
    # The TensorBoard writer exists on rank 0 only; it is used again by the
    # rank-0-only logging extension further down.
    if not args.multi_node or comm.rank == 0:
        writer = tensorboardX.SummaryWriter(log_dir=args.out)
        writer_with_updater = morefusion.training.SummaryWriterWithUpdater(
            writer)
        writer_with_updater.setup(updater)

    # -------------------------------------------------------------------------

    trainer = chainer.training.Trainer(updater, (args.max_epoch, "epoch"),
                                       out=args.out)
    trainer.extend(E.FailOnNonNumber())

    @chainer.training.make_extension(trigger=(1, "iteration"))
    def update_loss(trainer):
        """Switch a staged loss to its second stage after the first epoch.

        Runs every iteration. For the "a->b|1" choices of ``--loss`` it
        asserts the model still uses the first-stage loss while
        ``epoch_detail < 1`` and sets ``target._loss`` to the second-stage
        loss afterwards. Non-staged losses are left untouched.
        """
        updater = trainer.updater
        optimizer = updater.get_optimizer("main")
        target = optimizer.target
        assert trainer.stop_trigger.unit == "epoch"

        if args.loss == "add->add/add_s|1":
            if updater.epoch_detail < 1:
                assert target._loss == "add"
            else:
                target._loss = "add/add_s"
        elif args.loss == "add->add_s|1":
            if updater.epoch_detail < 1:
                assert target._loss == "add"
            else:
                target._loss = "add_s"
        elif args.loss == "add->add/add_s|1+occupancy":
            if updater.epoch_detail < 1:
                assert target._loss == "add+occupancy"
            else:
                target._loss = "add/add_s+occupancy"
        else:
            assert args.loss in ["add/add_s", "add/add_s+occupancy"]
            return

    trainer.extend(update_loss)

    log_interval = 10, "iteration"
    eval_interval = 0.25, "epoch"

    # evaluate
    evaluator = morefusion.training.extensions.PoseEstimationEvaluator(
        iterator=iter_valid,
        target=model,
        device=device,
        progress_bar=True,
    )
    if args.multi_node:
        evaluator.comm = comm
    trainer.extend(
        evaluator,
        trigger=eval_interval,
        call_before_training=args.call_evaluation_before_training,
    )

    # Reporting and snapshot extensions are registered on rank 0 only.
    if not args.multi_node or comm.rank == 0:
        # print arguments
        msg = pprint.pformat(args.__dict__)
        msg = textwrap.indent(msg, prefix=" " * 2)
        termcolor.cprint("==> Arguments", attrs={"bold": True})
        print(f"\n{msg}\n")

        trainer.extend(
            morefusion.training.extensions.ArgsReport(args),
            call_before_training=True,
        )

        # snapshot
        # "latest" files are rewritten at every evaluation; the "best" files
        # only when the monitored validation value improves.
        trigger_best_add = chainer.training.triggers.MinValueTrigger(
            key="validation/main/add_or_add_s",
            trigger=eval_interval,
        )
        trigger_best_auc = chainer.training.triggers.MaxValueTrigger(
            key="validation/main/auc/add_or_add_s",
            trigger=eval_interval,
        )
        trainer.extend(
            E.snapshot(filename="snapshot_trainer_latest.npz"),
            trigger=eval_interval,
        )
        trainer.extend(
            E.snapshot_object(model, filename="snapshot_model_latest.npz"),
            trigger=eval_interval,
        )
        trainer.extend(
            E.snapshot_object(model, filename="snapshot_model_best_add.npz"),
            trigger=trigger_best_add,
        )
        trainer.extend(
            E.snapshot_object(model, filename="snapshot_model_best_auc.npz"),
            trigger=trigger_best_auc,
        )

        # log
        trainer.extend(
            morefusion.training.extensions.LogTensorboardReport(
                writer=writer,
                trigger=log_interval,
            ),
            call_before_training=True,
        )
        # PrintReport reads its entries from the TensorBoard log report
        # registered just above (looked up by name).
        trainer.extend(
            E.PrintReport(
                [
                    "epoch",
                    "iteration",
                    "elapsed_time",
                    "main/loss",
                    "main/add_or_add_s",
                    "validation/main/auc/add_or_add_s",
                ],
                log_report="LogTensorboardReport",
            ),
            trigger=log_interval,
            call_before_training=True,
        )
        trainer.extend(E.ProgressBar(update_interval=1))

    # -------------------------------------------------------------------------

    # Restore the full trainer state from a snapshot, if requested.
    if args.resume:
        chainer.serializers.load_npz(args.resume, trainer)

    trainer.run()
Example #8
0
def train(
    lr=0.001,
    device=0,
    epoch=10,
    h_size=1000,
    b_size=100,
    weight_decay=0.0005,
    margin=1.,
    saveto='output/checkpoint/',
    text_net='cnn',
    ocr_type='cloudvision',
    model_name='ocr',
    san_check=False,
    early_stopping=False,
    remove_stopwords=False,
):
    """Train an OCR-text model and return its best validation ranking score.

    Args:
        lr: Value passed to Adam as ``alpha``.
        device: GPU id, or ``None`` to skip moving the model to a GPU.
        epoch: Number of epochs (upper bound when ``early_stopping`` is on).
        h_size: Hidden size passed to ``NonVisualNet``.
        b_size: Batch size of the train and validation iterators.
        weight_decay: Weight-decay rate, or ``None`` to disable the hook.
        margin: Margin passed to ``NonVisualNet``.
        saveto: Output directory; created if missing. The call arguments are
            dumped there as JSON in a file named ``args``.
        text_net: ``'cnn'`` or ``'lstm'``.
        ocr_type: Passed to ``DatasetOCR``.
        model_name: ``'ocr'`` or ``'ocr+vis'``.
        san_check: Passed to ``DatasetOCR``; also disables the ``alpha``
            decay schedule.
        early_stopping: Stop early based on ``validation/main/r@1``.
        remove_stopwords: Stored in ``chainer.config`` and selects the
            word-vector file.

    Returns:
        The best value seen by the trigger monitoring
        ``validation/main/ranking_score``.

    Raises:
        RuntimeError: If ``text_net`` or ``model_name`` is not recognised.
    """
    chainer.config.remove_stopwords = remove_stopwords

    # Snapshot the call arguments before any other local is created. The
    # copy matters: ``locals()`` may hand back a live dict that later picks
    # up more names.
    args = dict(locals())
    if not os.path.exists(saveto):
        os.makedirs(saveto)
    # Close the file deterministically instead of leaking the handle.
    with open(os.path.join(saveto, 'args'), 'w') as args_file:
        json.dump(args, args_file)

    log_interval = (10, 'iteration')
    val_interval = (1, 'epoch')

    dataset = DatasetOCR('train', ocr_type=ocr_type, san_check=san_check)
    # Fixed-seed 90/10 split so runs are comparable. The locals are named
    # ``*_set`` so they do not shadow this function's own name.
    train_set, val_set = chainer.datasets.split_dataset_random(
        dataset, first_size=int(len(dataset) * .9), seed=1234)

    print('train: %i, val: %i' % (len(train_set), len(val_set)))
    train_itr = chainer.iterators.SerialIterator(train_set, batch_size=b_size)
    val_itr = chainer.iterators.SerialIterator(val_set,
                                               batch_size=b_size,
                                               repeat=False,
                                               shuffle=False)

    if remove_stopwords:
        wvec_f = 'data/wordvec_wo_stopwords.npy'
    else:
        wvec_f = 'data/wordvec.npy'
    word_vec = np.load(wvec_f)

    # Vocabulary size is the tokenizer's word count plus one.
    vocab_size = len(dataset.tokenizer.word_index) + 1
    if text_net == 'cnn':
        lng_net = TextCNN(vocab_size, word_vec)
    elif text_net == 'lstm':
        lng_net = TextLSTM(vocab_size, word_vec)
    else:
        raise RuntimeError('invalid text_net')

    if model_name == 'ocr':
        model = NonVisualNet(lng_net, h_size=h_size, margin=margin)
    elif model_name == 'ocr+vis':
        att_net = AttentionNetWTL(h_size=100)
        model = Net(lng_net, att_net)
    else:
        raise RuntimeError('invalid model_name')

    if device is not None:
        chainer.cuda.get_device_from_id(device).use()
        model.to_gpu()

    optimizer = chainer.optimizers.Adam(alpha=lr)
    optimizer.use_cleargrads()
    optimizer.setup(model)

    if text_net == 'lstm':
        optimizer.add_hook(chainer.optimizer.GradientClipping(5),
                           name='grad_clip')

    if weight_decay is not None:
        optimizer.add_hook(chainer.optimizer.WeightDecay(weight_decay),
                           name='weight_decay')

    updater = training.StandardUpdater(train_itr,
                                       optimizer,
                                       converter=my_converter,
                                       device=device)

    stop_trigger = (epoch, 'epoch')

    if early_stopping:
        stop_trigger = training.triggers.EarlyStoppingTrigger(
            monitor='validation/main/r@1',
            patients=2,
            mode='max',
            verbose=True,
            max_trigger=(epoch, 'epoch'))

    trainer = training.Trainer(updater, stop_trigger, saveto)
    trainer.extend(extensions.FailOnNonNumber())
    trainer.extend(extensions.Evaluator(val_itr,
                                        model,
                                        converter=my_converter,
                                        device=device),
                   trigger=val_interval)

    if not san_check:
        # Halve Adam's ``alpha`` every five epochs.
        trainer.extend(extensions.ExponentialShift('alpha', 0.5),
                       trigger=(5, 'epoch'))

    trainer.extend(extensions.LogReport(trigger=log_interval))

    trainer.extend(extensions.ProgressBar(update_interval=10))

    # Keep only the model with the best validation ranking score.
    best_val_trigger = training.triggers.MaxValueTrigger(
        'validation/main/ranking_score', trigger=val_interval)
    trainer.extend(extensions.snapshot_object(model, 'model'),
                   trigger=best_val_trigger)

    trainer.run()

    return best_val_trigger._best_value
Example #9
0
    def set_event_handler(self):
        """Attach the standard extensions to ``self.trainer``.

        After ``self.set_target()``, registers in order: validation
        evaluator, progress bar, learning-rate observer, log report,
        non-number guard, exponential ``lr`` decay, parameter statistics,
        variable statistics plot, print report, loss/accuracy plots and
        trainer snapshots. Finishes by calling
        ``self.set_additonal_event_handler()``.
        """
        self.set_target()

        register = self.trainer.extend
        eval_trigger = (self.eval_interval, 'epoch')
        log_trigger = (self.log_interval, 'epoch')

        # Evaluation on the validation loader (no evaluator for the training
        # data is implemented).
        evaluator = extensions.Evaluator(
            self.valid_loader,
            self.target,
            converter=self.converter,
            device=self.device,
        )
        register(evaluator,
                 trigger=eval_trigger,
                 call_before_training=self.call_before_training)

        register(extensions.ProgressBar())
        register(extensions.observe_lr())

        log_report = extensions.LogReport(trigger=log_trigger)
        register(log_report, call_before_training=self.call_before_training)

        register(extensions.FailOnNonNumber())

        # Learning-rate schedule: start at ten times ``self.lr`` and multiply
        # by 0.99 on every call. Other shift schedules (inverse, linear,
        # multistep, polynomial, step, warmup) are not implemented.
        lr_decay = extensions.ExponentialShift('lr',
                                               rate=0.99,
                                               init=self.lr * 10.0)
        register(lr_decay)

        register(extensions.ParameterStatistics(self.model,
                                                trigger=eval_trigger))
        register(extensions.VariableStatisticsPlot(self.model))

        report_keys = [
            'epoch', 'main/loss', 'main/accuracy', 'validation/main/loss',
            'validation/main/accuracy', 'elapsed_time'
        ]
        register(extensions.PrintReport(report_keys),
                 call_before_training=self.call_before_training)

        loss_plot = extensions.PlotReport(
            ['main/loss', 'validation/main/loss'],
            'epoch',
            file_name='loss.png')
        register(loss_plot, call_before_training=self.call_before_training)

        accuracy_plot = extensions.PlotReport(
            ['main/accuracy', 'validation/main/accuracy'],
            'epoch',
            file_name='accuracy.png')
        register(accuracy_plot,
                 call_before_training=self.call_before_training)

        register(extensions.snapshot(n_retains=self.retain_num),
                 trigger=log_trigger)

        self.set_additonal_event_handler()
Example #10
0
def main3():
    """Train ``GoalScoreModel`` on video frames in two phases.

    The function parses the command line, loads frames and labels with
    ``load_frames_labels``, shuffles them and wraps them in a
    ``TupleDataset``.  The evaluation data is either a separate trial
    (``--real_test``, trial id 11) or a split of the loaded data
    (``--test_split``).

    Training then runs in two phases that share one updater:

    1. a 3-epoch pre-training run with ``model.head_model`` frozen;
    2. the full run up to ``--max_epoch`` with evaluation, reporting and
       snapshots every 20 epochs.

    ``--resume N`` restores the trainer snapshot written at epoch ``N``
    (``<out_dir>/snapshot_epoch-N``) before the full run starts.
    """
    parser = argparse.ArgumentParser()
    parser.add_argument('--gpu_id', '-g', type=int, default=1)
    parser.add_argument('--batch_size', '-b', type=int, default=100)
    parser.add_argument('--test_split', type=float, default=0.2)
    parser.add_argument(
        '--real_test',
        dest='real_test',
        action='store_true',
        help='Whether to split the data or use a complete new trial.')
    parser.add_argument('--mdn_hidden-units', '-u', type=int, default=24)
    parser.add_argument('--mdn_gaussian-mixtures', '-m', type=int, default=24)
    parser.add_argument('--max_epoch', '-e', type=int, default=250)
    parser.add_argument(
        '--resume',
        '-r',
        type=int,
        default=None,
        help='Epoch number of the trainer snapshot in --out_dir to resume '
        'from.')
    parser.add_argument('--out_dir',
                        '-o',
                        type=str,
                        default='results/result_test')
    parser.add_argument('--data_base_dir',
                        type=str,
                        default='/media/daniel/data/hhc/')
    parser.add_argument('--data_file_pattern',
                        '-f',
                        type=str,
                        default='trial{}.avi')
    args = parser.parse_args()

    # Plain concatenation (not a path join): the base directory is expected
    # to carry its own trailing separator, as the default does.
    file_pattern = ''.join((args.data_base_dir, args.data_file_pattern))

    frames, labels = load_frames_labels(filestype=file_pattern, verbose=0)
    frames, labels = unison_shuffled_copies(frames, labels)
    print('Frames shape: ', frames.shape, ' Labels shape: ', labels.shape)

    data = chainer.datasets.TupleDataset(frames, labels)
    # Use the public length protocol instead of the private `_length` field.
    print('Dataset length: ', len(data))
    print('Frame size: ', data[0][0].shape, data[0][0].dtype)

    if args.real_test:
        # Train on everything loaded above and evaluate on a separate trial.
        print('Using test trial.')
        data_train = data
        test_frames, test_labels = load_frames_labels(ids=[11],
                                                      filestype=file_pattern)
        data_test = chainer.datasets.TupleDataset(test_frames, test_labels)
    else:
        # The first `test_split` fraction of the shuffled data is held out.
        data_test, data_train = split_dataset(
            data, int(args.test_split * len(data)))

    train_iter = iterators.SerialIterator(data_train,
                                          args.batch_size,
                                          shuffle=True)
    test_iter = iterators.SerialIterator(data_test,
                                         args.batch_size,
                                         repeat=False,
                                         shuffle=False)

    model = GoalScoreModel()

    if args.gpu_id >= 0:
        chainer.backends.cuda.get_device_from_id(args.gpu_id).use()
        model.to_gpu(args.gpu_id)

    # Create the optimizer for the model
    optimizer = optimizers.Adam().setup(model)
    optimizer.add_hook(chainer.optimizer.WeightDecay(rate=1e-6))

    updater = training.StandardUpdater(train_iter,
                                       optimizer,
                                       loss_func=model.calc_loss,
                                       device=args.gpu_id)

    # Pre-training: only the non-head part of the model is updated.
    print('Pretraining started.')
    trainer = training.Trainer(updater, (3, 'epoch'), out=args.out_dir)
    print('Disabling training of head model.')
    model.head_model.disable_update()
    trainer.extend(extensions.ProgressBar())
    trainer.extend(extensions.FailOnNonNumber())
    trainer.run()

    # Full training (re-uses the updater from the pre-training phase).
    print('Full model training ...')
    trainer = training.Trainer(updater, (args.max_epoch, 'epoch'),
                               out=args.out_dir)
    trainer.extend(extensions.Evaluator(test_iter,
                                        model,
                                        eval_func=model.calc_loss,
                                        device=args.gpu_id),
                   trigger=(1, 'epoch'))
    trainer.extend(extensions.LogReport(trigger=(1, 'epoch')))
    trainer.extend(
        extensions.PrintReport([
            'epoch', 'main/loss', 'main/nll', 'main/mae', 'main/sigma',
            'validation/main/loss', 'validation/main/mae',
            'validation/main/sigma', 'elapsed_time'
        ]))
    trainer.extend(
        extensions.PlotReport(['main/mae', 'validation/main/mae'],
                              x_key='epoch',
                              file_name='loss.png',
                              marker=None))
    trainer.extend(extensions.dump_graph('main/loss'))
    trainer.extend(extensions.ProgressBar())
    trainer.extend(extensions.FailOnNonNumber())
    trainer.extend(
        extensions.snapshot(filename='snapshot_epoch-{.updater.epoch}'),
        trigger=(20, 'epoch'))
    trainer.extend(extensions.snapshot_object(
        model, 'model_epoch_{.updater.epoch}.model'),
                   trigger=(20, 'epoch'))

    # Re-enable update for the head model
    model.head_model.enable_update()

    # Resume from a specified snapshot.  `--resume` is an epoch number, so it
    # has to be turned into the file name used by the snapshot extension
    # above; passing the bare integer to `load_npz` cannot work.
    if args.resume:
        snapshot_path = '{}/snapshot_epoch-{}'.format(args.out_dir,
                                                      args.resume)
        print('Loading from resume snapshot: ', args.resume, snapshot_path)
        chainer.serializers.load_npz(snapshot_path, trainer)

    trainer.run()
    print('Done.')
Example #11
0
def main():
    """Train ``InsertGearPolicy`` on frame/joint pairs.

    The function parses the command line, loads frames and joints with
    ``load_data`` (only the images are pre-processed, via ``model.prepare``),
    and splits them into train and test sets with scikit-learn's
    ``train_test_split`` using a fixed random state.  If ``--aug`` is
    greater than 1 the training set is passed through
    ``colour_agumentations``.

    A Chainer trainer is then set up with evaluation (reported under the
    ``val/`` prefix), log/print/plot reports, periodic trainer and model
    snapshots, and an image display extension.  ``--resume N`` restores
    ``<out_dir>/snapshot_epoch_N.trainer`` before the run starts.
    """
    parser = argparse.ArgumentParser()
    parser.add_argument('--gpu_id', '-g', type=int, default=1)
    parser.add_argument('--batch_size', '-b', type=int, default=30)
    parser.add_argument('--test_split', type=float, default=0.2)
    parser.add_argument('--real_test', dest='real_test', action='store_true',
                        help='Whether to split the data or use a complete new trial.')
    parser.add_argument('--aug', type=int, default=2,
                        help='How many times to increase the dataset with augmented images.')
    parser.add_argument('--subset', type=int, default=-1,
                        help='Should we read just `x` number of folders.')
    parser.add_argument('--skipcount', type=int, default=1,
                        help='Take every `x`-th frame from a sequence.')
    parser.add_argument('--max_epoch', '-e', type=int, default=2500)
    parser.add_argument('--resume', '-r', type=int, default=None)
    parser.add_argument('--out_dir', '-o', type=str, default='results/result_test')
    args = parser.parse_args()

    model = InsertGearPolicy()
    # Scale only the images; joints are loaded unprocessed.
    frames, joints = load_data(prep_f=model.prepare, prepare_joints=None,
                               subset=args.subset, skipcount=args.skipcount)
    print('Frames shape: ', frames.shape, ' joints shape: ', joints.shape)

    from sklearn.model_selection import train_test_split
    frames_train, frames_test, joints_train, joints_test = train_test_split(
        frames, joints, test_size=args.test_split, random_state=42)

    if args.aug > 1:
        frames_train, joints_train = colour_agumentations(
            frames_train, joints_train, n=args.aug)
        # Report the augmented training arrays, not the pre-split inputs
        # (assumes the augmentation returns array-likes exposing `.shape`).
        print('After augmentation. Frames shape: ', frames_train.shape,
              ' joints shape: ', joints_train.shape)

    data_train = chainer.datasets.TupleDataset(frames_train, joints_train)
    data_test = chainer.datasets.TupleDataset(frames_test, joints_test)
    train_iter = iterators.SerialIterator(data_train, args.batch_size, shuffle=True)
    test_iter = iterators.SerialIterator(data_test, args.batch_size,
                                         repeat=False, shuffle=False)

    if args.gpu_id >= 0:
        print('Loading model to gpu', args.gpu_id)
        chainer.backends.cuda.get_device_from_id(args.gpu_id).use()
        model.to_gpu(args.gpu_id)

    # Create the optimizer for the model
    optimizer = optimizers.Adam().setup(model)

    updater = training.StandardUpdater(train_iter, optimizer,
                                       loss_func=model.calc_loss,
                                       device=args.gpu_id)

    # Full training
    print('Full model training ...')
    trainer = training.Trainer(updater, (args.max_epoch, 'epoch'), out=args.out_dir)
    trainer.extend(extensions.Evaluator(test_iter, model,
                                        eval_func=model.calc_loss,
                                        device=args.gpu_id),
                   name='val', trigger=(1, 'epoch'))
    trainer.extend(extensions.LogReport(trigger=(1, 'epoch')))
    trainer.extend(extensions.PrintReport([
        'epoch', 'iteration', 'main/loss', 'main/mae', 'main/gnll',
        'main/weighted', 'main/VAE', 'main/VAE_REC', 'main/VAE_KL',
        'val/main/loss', 'val/main/mae', 'val/main/weighted', 'elapsed_time'
    ]))
    trainer.extend(extensions.PlotReport(
        ['main/mae', 'val/main/mae', 'main/VAE', 'val/main/VAE'],
        x_key='epoch', file_name='loss.png', marker=None))
    trainer.extend(extensions.dump_graph('main/loss'))
    trainer.extend(extensions.ProgressBar(update_interval=10))
    trainer.extend(extensions.FailOnNonNumber())
    # Save every X epochs
    trainer.extend(
        extensions.snapshot(filename='snapshot_epoch_{.updater.epoch}.trainer'),
        trigger=(200, 'epoch'))
    trainer.extend(
        extensions.snapshot_object(
            model,
            '%s_model_epoch_{.updater.epoch}.model' % (model.__class__.__name__)),
        trigger=(10, 'epoch'))

    trainer.extend(utils.display_image(model.vae_image, data_test,
                                       args.out_dir, args.gpu_id, n=3),
                   trigger=(1, 'epoch'))
    # FOR SGD   trainer.extend(extensions.ExponentialShift('lr', 0.5, init=1e-4, target=1e-8), trigger=(200, 'epoch'))
    # trainer.extend(extensions.ExponentialShift('alpha', 0.5, init=1e-3, target=1e-8), trigger=(100, 'epoch'))

    # Enable update for the encoder model
    model.encode_model.enable_update()

    # Resume from a specified snapshot.  This has to happen after the trainer
    # has been created and its extensions registered: the original code did it
    # before `trainer` existed and failed with an unbound-variable error.
    if args.resume:
        snapshot_path = '{}/snapshot_epoch_{}.trainer'.format(args.out_dir,
                                                              args.resume)
        print('Loading from resume snapshot: ', args.resume, snapshot_path)
        chainer.serializers.load_npz(snapshot_path, trainer)

    trainer.run()
    print('Done.')