Esempio n. 1
0
def train(config):
    """Train the R3D RoI network described by ``config`` with ``Module.fit``.

    The symbol is either reloaded from the checkpoint stored under
    ``config.output`` (when ``config.begin_epoch`` is positive) or built from
    scratch with ``create_r3d`` and optionally initialised from the Caffe2
    pickle named by ``config.pretrained``. Training and test clips are read
    from pickled DataFrames, wrapped in prefetching ``ClipBatchIter``
    iterators, and a checkpoint callback is registered on the same
    ``<config.output>/test`` prefix that resuming reads from.

    Args:
        config: Settings object accessed by attribute. Attributes read here:
            ``gpus`` (comma-separated device ids; empty or ``None`` selects a
            single CPU context), ``batch_per_device``, ``begin_epoch``,
            ``num_epoch``, ``output``, ``num_class``, ``model_depth``,
            ``final_spatial_kernel``, ``n_frame``, ``bn_mom``, ``cudnn_tune``,
            ``workspace``, ``scale_w``, ``scale_h``, ``spatial_scale``,
            ``pooled_size``, ``n_bbox``, ``temporal_scale``,
            ``use_large_bbox``, ``pretrained``, ``plot``, ``df_train``,
            ``df_test``, ``optimizer``, ``lr``, ``momentum``, ``wd``,
            ``lr_step`` and ``lr_factor``.

    Raises:
        ValueError: If ``config.gpus`` contains a non-integer device id.
    """
    # Skip empty tokens: ''.split(',') yields [''], which used to crash int()
    # and made the "no GPU" branch below unreachable.
    gpus = [int(tok) for tok in (config.gpus or '').split(',') if tok.strip()]
    num_gpus = len(gpus)

    logging.info("number of gpu %d", num_gpus)

    if gpus:
        kv = mx.kvstore.create('local')
        contexts = [mx.gpu(i) for i in gpus]
        logging.info("Running on GPUs: %s", gpus)
    else:
        # No device ids configured: single CPU context, no kvstore.
        kv = None
        contexts = [mx.cpu()]
        logging.info("No GPU ids configured, running on CPU")

    # Modify to make it consistent with the distributed trainer
    total_batch_size = config.batch_per_device * len(contexts)

    # One prefix for both resuming and saving, so the two can never diverge.
    checkpoint_prefix = os.path.join(config.output, 'test')

    # Create symbol, arg and aux
    if config.begin_epoch > 0:
        sym, arg_params, aux_params = mx.model.load_checkpoint(
            checkpoint_prefix, config.begin_epoch)
    else:
        # Create Network
        sym = create_r3d(
            num_class=config.num_class,
            no_bias=True,
            model_depth=config.model_depth,
            final_spatial_kernel=config.final_spatial_kernel,
            final_temporal_kernel=int(config.n_frame / 8),
            bn_mom=config.bn_mom,
            cudnn_tune=config.cudnn_tune,
            workspace=config.workspace,
            # NOTE(review): 720.0 is presumably the frame width that
            # config.spatial_scale was defined for -- confirm.
            spatial_scale=720.0 / config.scale_w * config.spatial_scale,
            pooled_size=config.pooled_size,
            n_frame=config.n_frame,
            n_bbox=config.n_bbox,
        )
        # Load pretrained params
        arg_params, aux_params = {}, {}
        if config.pretrained:
            arg_params, aux_params = load_from_caffe2_pkl(
                config.pretrained, sym)
            # Only report success when something was actually loaded.
            logging.info(
                "load pretrained okay, num of arg_p %d, num of aux_p %d",
                len(arg_params), len(aux_params))

    # Create Module
    # We can set fixed params here if needed
    m = mx.module.Module(sym,
                         context=contexts,
                         data_names=['data', 'rois'],
                         label_names=['softmax_label'])

    if config.plot:
        plot_shapes = {
            'data': (total_batch_size, 3, config.n_frame,
                     config.scale_h, config.scale_w),
            'rois': (total_batch_size,
                     config.n_frame // config.temporal_scale,
                     config.n_bbox, 5),
            'softmax_label': (total_batch_size, config.n_bbox),
        }
        v = mx.viz.plot_network(sym, title='R2Plus1D-train',
                                shape=plot_shapes)
        v.render(filename=os.path.join(config.output, 'vis'), cleanup=True)

    # NOTE: read_pickle executes pickle payloads; only point these paths at
    # trusted files.
    df_train = pd.read_pickle(config.df_train)
    df_test = pd.read_pickle(config.df_test)

    # Settings shared by the training and the test iterator.
    iter_kwargs = dict(
        batch_size=total_batch_size,
        n_frame=config.n_frame,
        n_bbox=config.n_bbox,
        scale_w=config.scale_w,
        scale_h=config.scale_h,
        batch_per_device=config.batch_per_device,
        temporal_scale=config.temporal_scale,
        use_large_bbox=config.use_large_bbox,
    )
    train_data = mx.io.PrefetchingIter(
        ClipBatchIter(df=df_train, train=True, **iter_kwargs))
    test_data = mx.io.PrefetchingIter(
        ClipBatchIter(df=df_test, train=False, **iter_kwargs))

    # Set optimizer
    optimizer_params = {
        'learning_rate': config.lr,
        'momentum': config.momentum,
        'wd': config.wd,
    }
    logging.info("lr %s, lr_step %s", config.lr, config.lr_step)

    if config.lr_step:
        optimizer_params['lr_scheduler'] = mx.lr_scheduler.FactorScheduler(
            step=config.lr_step, factor=config.lr_factor)
    metric = RCNNAccMetric()

    m.fit(
        train_data=train_data,
        eval_data=test_data,
        eval_metric=metric,
        epoch_end_callback=mx.callback.do_checkpoint(checkpoint_prefix, 1),
        batch_end_callback=mx.callback.Speedometer(total_batch_size, 20),
        kvstore=kv,
        optimizer=config.optimizer,
        optimizer_params=optimizer_params,
        initializer=mx.init.Xavier(factor_type="in", magnitude=2.34),
        arg_params=arg_params,
        aux_params=aux_params,
        allow_missing=True,
        begin_epoch=config.begin_epoch,
        num_epoch=config.num_epoch,
    )
Esempio n. 2
0
def train(config):
    """Train the multi-label R3D RoI network described by ``config``.

    The symbol is either reloaded from the checkpoint stored under
    ``config.output`` (when ``config.begin_epoch`` is positive) or built with
    ``create_r3d`` and optionally initialised from the Caffe2 pickle named by
    ``config.pretrained``. Both data iterators are built from ``config``
    itself, so the computed batch size is written back to
    ``config.total_batch_size`` before they are created. Three NumPy metrics
    (``acc``, ``all_correct_acc``, ``loss``) are reported during ``fit``.

    Args:
        config: Settings object accessed by attribute. Attributes read here:
            ``gpus`` (comma-separated device ids; empty or ``None`` selects a
            single CPU context), ``batch_per_device``, ``begin_epoch``,
            ``num_epoch``, ``output``, ``num_class``, ``model_depth``,
            ``final_spatial_kernel``, ``n_frame``, ``bn_mom``, ``cudnn_tune``,
            ``workspace``, ``spatial_scale``, ``pooled_size``, ``n_bbox``,
            ``temporal_scale``, ``scale_h``, ``scale_w``, ``pretrained``,
            ``plot``, ``optimizer``, ``lr``, ``momentum``, ``wd``,
            ``lr_step`` and ``lr_factor``. ``total_batch_size`` is set on it
            as a side effect.

    Raises:
        ValueError: If ``config.gpus`` contains a non-integer device id.
    """
    # Skip empty tokens: ''.split(',') yields [''], which used to crash int()
    # and made the "no GPU" branch below unreachable.
    gpus = [int(tok) for tok in (config.gpus or '').split(',') if tok.strip()]
    num_gpus = len(gpus)

    logging.info("number of gpu %d", num_gpus)

    if gpus:
        kv = mx.kvstore.create('local')
        contexts = [mx.gpu(i) for i in gpus]
        logging.info("Running on GPUs: %s", gpus)
    else:
        # No device ids configured: single CPU context, no kvstore.
        kv = None
        contexts = [mx.cpu()]
        logging.info("No GPU ids configured, running on CPU")

    # Modify to make it consistent with the distributed trainer
    total_batch_size = config.batch_per_device * len(contexts)
    # ClipBatchIter receives the whole config, so publish the batch size on it.
    config.total_batch_size = total_batch_size

    # One prefix for both resuming and saving, so the two can never diverge.
    checkpoint_prefix = os.path.join(config.output, 'test')

    # Create symbol, arg and aux
    if config.begin_epoch > 0:
        sym, arg_params, aux_params = mx.model.load_checkpoint(
            checkpoint_prefix, config.begin_epoch)
    else:
        # Create Network
        sym = create_r3d(
            num_class=config.num_class,
            no_bias=True,
            model_depth=config.model_depth,
            final_spatial_kernel=config.final_spatial_kernel,
            final_temporal_kernel=int(config.n_frame / 8),
            bn_mom=config.bn_mom,
            cudnn_tune=config.cudnn_tune,
            workspace=config.workspace,
            spatial_scale=config.spatial_scale,
            pooled_size=config.pooled_size,
            n_frame=config.n_frame,
            n_bbox=config.n_bbox,
        )
        # Load pretrained params
        arg_params, aux_params = {}, {}
        if config.pretrained:
            arg_params, aux_params = load_from_caffe2_pkl(
                config.pretrained, sym)
            # Only report success when something was actually loaded.
            logging.info(
                "load pretrained okay, num of arg_p %d, num of aux_p %d",
                len(arg_params), len(aux_params))

    # Create Module
    # We can set fixed params here if needed
    m = mx.module.Module(sym,
                         context=contexts,
                         data_names=['data', 'rois'],
                         label_names=['softmax_label'])

    if config.plot:
        plot_shapes = {
            'data': (total_batch_size, 3, config.n_frame,
                     config.scale_h, config.scale_w),
            'rois': (total_batch_size,
                     config.n_frame // config.temporal_scale,
                     config.n_bbox, 5),
            'softmax_label': (total_batch_size, config.n_bbox,
                              config.num_class),
        }
        v = mx.viz.plot_network(sym, title='R2Plus1D-train',
                                shape=plot_shapes)
        v.render(filename=os.path.join(config.output, 'vis'), cleanup=True)

    train_data = mx.io.PrefetchingIter(
        ClipBatchIter(config=config, train=True))
    test_data = mx.io.PrefetchingIter(
        ClipBatchIter(config=config, train=False))

    # Set optimizer
    optimizer_params = {
        'learning_rate': config.lr,
        'momentum': config.momentum,
        'wd': config.wd,
    }
    logging.info("lr %s, lr_step %s", config.lr, config.lr_step)

    if config.lr_step:
        optimizer_params['lr_scheduler'] = mx.lr_scheduler.FactorScheduler(
            step=config.lr_step, factor=config.lr_factor)

    # The metric functions keep their names: mx.metric.np reports each
    # metric under the wrapped function's __name__.
    def acc(label, pred):
        """Fraction of label entries that equal the rounded prediction."""
        label = label.reshape((-1, config.num_class))
        return (label == np.round(pred)).astype(np.float32).mean()

    def all_correct_acc(label, pred):
        """Fraction of rows whose entries all equal the rounded prediction."""
        label = label.reshape((-1, config.num_class))
        hits = label == np.round(pred)
        return hits.all(axis=-1).astype(np.float32).mean()

    def loss(label, pred):
        """Binary cross-entropy summed over all entries, divided by row count."""
        label = label.reshape((-1, config.num_class))
        eps = 1e-6  # keeps both log() arguments strictly positive
        # Assumes pred has the same (rows, num_class) layout as the reshaped
        # label, which the two accuracy metrics already require.
        cross_entropy = -(label * np.log(pred + eps)
                          + (1. - label) * np.log(1. + eps - pred))
        total = float(np.sum(cross_entropy, dtype=np.float64))
        # eps in the denominator guards against an empty prediction batch.
        return total / float(len(pred) + eps)

    eval_metric = [mx.metric.np(fn) for fn in (acc, all_correct_acc, loss)]

    m.fit(
        train_data=train_data,
        eval_data=test_data,
        eval_metric=eval_metric,
        epoch_end_callback=mx.callback.do_checkpoint(checkpoint_prefix, 1),
        batch_end_callback=mx.callback.Speedometer(total_batch_size, 20),
        kvstore=kv,
        optimizer=config.optimizer,
        optimizer_params=optimizer_params,
        initializer=mx.init.Xavier(factor_type="in", magnitude=2.34),
        arg_params=arg_params,
        aux_params=aux_params,
        allow_missing=True,
        begin_epoch=config.begin_epoch,
        num_epoch=config.num_epoch,
    )
Esempio n. 3
0
def train(args):
    """Train the R3D clip classifier described by ``args`` with ``Module.fit``.

    Builds the network with ``create_r3d``, optionally initialises it from the
    Caffe2 pickle named by ``args.pretrained``, wraps the training and
    evaluation ``ClipBatchIter`` iterators in prefetchers and fits the module
    with the built-in ``'accuracy'`` metric, registering a checkpoint callback
    on the ``<args.output>/test`` prefix.

    Args:
        args: Settings object accessed by attribute. Attributes read here:
            ``gpus`` (comma-separated device ids; empty or ``None`` selects a
            single CPU context), ``batch_per_device``, ``epoch_size``,
            ``num_class``, ``model_depth``, ``crop_size``, ``n_frame``,
            ``bn_mom``, ``cudnn_tune``, ``workspace``, ``pretrained``,
            ``plot``, ``datadir``, ``scale_w``, ``scale_h``, ``optimizer``,
            ``lr``, ``momentum``, ``wd``, ``lr_scheduler_step``,
            ``lr_scheduler_factor``, ``output``, ``begin_epoch`` and
            ``num_epoch``. ``epoch_size`` is overwritten with its value
            rounded down to a multiple of the total batch size.

    Raises:
        ValueError: If ``args.gpus`` contains a non-integer device id.
    """
    # Skip empty tokens: ''.split(',') yields [''], which used to crash int()
    # and made the "no GPU" branch below unreachable.
    gpus = [int(tok) for tok in (args.gpus or '').split(',') if tok.strip()]
    num_gpus = len(gpus)

    logging.info("number of gpu %d", num_gpus)

    if gpus:
        kv = mx.kvstore.create('local')
        contexts = [mx.gpu(i) for i in gpus]
        logging.info("Running on GPUs: %s", gpus)
    else:
        # No device ids configured: single CPU context, no kvstore.
        kv = None
        contexts = [mx.cpu()]
        logging.info("No GPU ids configured, running on CPU")

    # Modify to make it consistent with the distributed trainer
    total_batch_size = args.batch_per_device * len(contexts)

    # Round down epoch size to closest multiple of batch size across machines
    epoch_iters = int(args.epoch_size / total_batch_size)
    args.epoch_size = epoch_iters * total_batch_size
    logging.info("Using epoch size: %s", args.epoch_size)

    # Create Network
    net = create_r3d(
        num_class=args.num_class,
        no_bias=True,
        model_depth=args.model_depth,
        # A 112-pixel crop uses a 7x7 final kernel; any other size uses 14.
        final_spatial_kernel=7 if args.crop_size == 112 else 14,
        final_temporal_kernel=int(args.n_frame / 8),
        bn_mom=args.bn_mom,
        cudnn_tune=args.cudnn_tune,
        workspace=args.workspace,
    )

    # Load pretrained params
    arg_p, aux_p = {}, {}
    if args.pretrained:
        arg_p, aux_p = load_from_caffe2_pkl(args.pretrained, net)
        logging.info("load pretrained okay, num of arg_p %d, num of aux_p %d",
                     len(arg_p), len(aux_p))

    # Create Module
    m = mx.module.Module(net, context=contexts)
    if args.plot:
        v = mx.viz.plot_network(
            net,
            title='R2Plus1D-train',
            shape={'data': (total_batch_size, 3, args.n_frame,
                            args.crop_size, args.crop_size)})
        v.render(filename='models/R2Plus1D-train', cleanup=True)

    # Settings shared by the training and the evaluation iterator.
    iter_kwargs = dict(
        datadir=args.datadir,
        batch_size=total_batch_size,
        n_frame=args.n_frame,
        crop_size=args.crop_size,
        scale_w=args.scale_w,
        scale_h=args.scale_h,
    )
    train_data = mx.io.PrefetchingIter(
        ClipBatchIter(train=True, **iter_kwargs))
    eval_data = mx.io.PrefetchingIter(
        ClipBatchIter(train=False, temporal_center=True, **iter_kwargs))

    # Set optimizer
    optimizer_params = {
        'learning_rate': args.lr,
        'momentum': args.momentum,
        'wd': args.wd,
    }
    if args.lr_scheduler_step:
        optimizer_params['lr_scheduler'] = mx.lr_scheduler.FactorScheduler(
            step=args.lr_scheduler_step, factor=args.lr_scheduler_factor)

    m.fit(
        train_data=train_data,
        eval_data=eval_data,
        eval_metric='accuracy',
        epoch_end_callback=mx.callback.do_checkpoint(args.output + '/test', 1),
        batch_end_callback=mx.callback.Speedometer(total_batch_size, 20),
        kvstore=kv,
        optimizer=args.optimizer,
        optimizer_params=optimizer_params,
        initializer=mx.init.Xavier(factor_type="in", magnitude=2.34),
        arg_params=arg_p,
        aux_params=aux_p,
        allow_missing=True,
        begin_epoch=args.begin_epoch,
        num_epoch=args.num_epoch,
    )