Example 1
def objective(trial, comm):
    # Sample an architecture.
    model = L.Classifier(create_model(trial))

    # Setup optimizer.
    optimizer = chainer.optimizers.MomentumSGD()
    optimizer.setup(model)
    optimizer = chainermn.create_multi_node_optimizer(optimizer, comm)

    # Setup dataset and iterator. Only worker 0 loads the whole dataset.
    # The dataset of worker 0 is evenly split and distributed to all workers.
    if comm.rank == 0:
        train, valid = chainer.datasets.get_mnist()
        rng = np.random.RandomState(0)
        train = chainer.datasets.SubDataset(
            train, 0, N_TRAIN_EXAMPLES, order=rng.permutation(len(train))
        )
        valid = chainer.datasets.SubDataset(
            valid, 0, N_VALID_EXAMPLES, order=rng.permutation(len(valid))
        )
    else:
        train, valid = None, None

    train = chainermn.scatter_dataset(train, comm, shuffle=True)
    valid = chainermn.scatter_dataset(valid, comm)

    train_iter = chainer.iterators.SerialIterator(train, BATCHSIZE, shuffle=True)
    valid_iter = chainer.iterators.SerialIterator(valid, BATCHSIZE, repeat=False, shuffle=False)

    # Setup trainer.
    updater = chainer.training.StandardUpdater(train_iter, optimizer)
    trainer = chainer.training.Trainer(updater, (EPOCH, "epoch"))

    # Add Chainer extension for pruners.
    trainer.extend(
        optuna.integration.ChainerPruningExtension(
            trial, "validation/main/accuracy", (PRUNER_INTERVAL, "epoch")
        )
    )
    evaluator = chainer.training.extensions.Evaluator(valid_iter, model)
    trainer.extend(chainermn.create_multi_node_evaluator(evaluator, comm))
    log_report_extension = chainer.training.extensions.LogReport(log_name=None)
    trainer.extend(log_report_extension)

    if comm.rank == 0:
        trainer.extend(chainer.training.extensions.ProgressBar())

    # Run training.
    # Set show_loop_exception_msg=False to suppress messages about the TrialPruned exception:
    # ChainerPruningExtension raises TrialPruned to stop training, and the
    # trainer prints a message every time it receives one.
    trainer.run(show_loop_exception_msg=False)

    # Evaluate.
    evaluator = chainer.training.extensions.Evaluator(valid_iter, model)
    evaluator = chainermn.create_multi_node_evaluator(evaluator, comm)
    report = evaluator()

    return report["main/accuracy"]
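The objective(trial, comm) signature above is the one expected by Optuna's ChainerMN integration, but none of these snippets show the driver. Below is a minimal sketch of how such an objective would typically be optimized; the study name, storage URL, and trial count are placeholders, and the study is assumed to have been created beforehand (for example with the optuna create-study CLI).

import chainermn
import optuna

# Every MPI process loads the same pre-created study and wraps it for ChainerMN.
comm = chainermn.create_communicator('naive')
study = optuna.load_study(study_name='chainermn-mnist',
                          storage='sqlite:///optuna.db')
study = optuna.integration.ChainerMNStudy(study, comm)

# ChainerMNStudy passes (trial, comm) to the objective, matching the signature above.
study.optimize(objective, n_trials=25)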
Example 2
def get_trainer(args, comm, model, device, train_iterator, val_iterator,
                optimizer):
    updater = training.StandardUpdater(train_iterator,
                                       optimizer,
                                       device=device)
    trainer = training.Trainer(updater, (args.epoch, 'epoch'), args.out)

    val_interval = (10, 'iteration') if args.test else (1, 'epoch')
    log_interval = (10, 'iteration') if args.test else (1, 'epoch')

    # Evaluator
    evaluator = TestModeEvaluator(val_iterator, model, device=device)
    evaluator = chainermn.create_multi_node_evaluator(evaluator, comm)
    trainer.extend(evaluator, trigger=val_interval)

    if args.optimizer == 'rmsprop_warmup':
        scheduler = dlframeworks.chainer.optimizers.RMSpropWarmupScheduler(
            comm.size, args.batchsize)
        trainer.extend(scheduler)

    # Some display and output extensions are necessary only for one worker.
    # (Otherwise, there would just be repeated outputs.)
    if comm.rank == 0:
        trainer.extend(extensions.dump_graph('main/loss'))
        trainer.extend(extensions.LogReport(trigger=log_interval))
        trainer.extend(extensions.observe_lr(), trigger=log_interval)
        trainer.extend(extensions.PrintReport([
            'epoch', 'iteration', 'main/loss', 'validation/main/loss',
            'main/accuracy', 'validation/main/accuracy', 'lr'
        ]),
                       trigger=log_interval)
        trainer.extend(extensions.ProgressBar(update_interval=10))

    return trainer
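get_trainer() only assembles the Trainer; a hypothetical caller could look roughly like the sketch below. Everything in it (the args fields, the placeholder model, and the CIFAR-10 dataset) is an assumption for illustration and presumes it runs in the same module that defines get_trainer() and TestModeEvaluator.

import argparse

import chainer
import chainer.links as L
import chainermn

# Hypothetical args; the fields mirror the ones get_trainer() reads.
args = argparse.Namespace(epoch=10, out='result', test=False,
                          optimizer='momentum_sgd', batchsize=64)

comm = chainermn.create_communicator('pure_nccl')
device = comm.intra_rank
chainer.cuda.get_device_from_id(device).use()

model = L.Classifier(L.Linear(None, 10))  # placeholder model
model.to_gpu()

optimizer = chainermn.create_multi_node_optimizer(
    chainer.optimizers.MomentumSGD(lr=0.01), comm)
optimizer.setup(model)

# Rank 0 loads the dataset; scatter_dataset distributes it to all workers.
if comm.rank == 0:
    train, val = chainer.datasets.get_cifar10()
else:
    train, val = None, None
train = chainermn.scatter_dataset(train, comm, shuffle=True)
val = chainermn.scatter_dataset(val, comm)

train_iter = chainer.iterators.SerialIterator(train, args.batchsize)
val_iter = chainer.iterators.SerialIterator(val, args.batchsize,
                                            repeat=False, shuffle=False)

trainer = get_trainer(args, comm, model, device,
                      train_iter, val_iter, optimizer)
trainer.run()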
Example 3
def main(args, model, x, t, valid_rate=0.2):
    print('Start a training script using multiple nodes.')

    comm = chainermn.create_communicator(args.communicator)
    device = comm.intra_rank
    assert device >= 0, 'invalid device ID: {}'.format(device)

    if comm.mpi_comm.rank == 0:
        print('==========================================')
        print('Num process (COMM_WORLD): {}'.format(MPI.COMM_WORLD.Get_size()))
        print('Using GPUs')
        print('Using {} communicator'.format(args.communicator))
        print('Num Minibatch-size: {}'.format(args.batchsize))
        print('Num epoch: {}'.format(args.epoch))
        print('==========================================')

    if comm.rank == 0:
        threshold = int(len(t) * (1 - valid_rate))
        train = datasets.tuple_dataset.TupleDataset(x[0:threshold],
                                                    t[0:threshold])
        valid = datasets.tuple_dataset.TupleDataset(x[threshold:],
                                                    t[threshold:])
        datasize = len(train) * args.epoch
    else:
        train, valid = None, None
    train = chainermn.scatter_dataset(train, comm, shuffle=True)
    valid = chainermn.scatter_dataset(valid, comm, shuffle=True)

    train_iter = chainer.iterators.SerialIterator(train, args.batchsize)
    valid_iter = chainer.iterators.SerialIterator(valid,
                                                  args.batchsize,
                                                  repeat=False,
                                                  shuffle=False)

    if device >= 0:
        cuda.get_device_from_id(device).use()
        model.to_gpu()

    optimizer = chainermn.create_multi_node_optimizer(
        chainer.optimizers.SGD(lr=2e-4), comm)
    optimizer.setup(model)
    optimizer.add_hook(chainer.optimizer.WeightDecay(1e-2))

    updater = training.StandardUpdater(train_iter, optimizer, device=device)
    trainer = training.Trainer(updater, (args.epoch, 'epoch'), out=args.out)

    evaluator = extensions.Evaluator(valid_iter, model, device=device)
    evaluator = chainermn.create_multi_node_evaluator(evaluator, comm)

    prepare_extensions(trainer, evaluator, args, comm)

    trainer.run()

    if comm.rank == 0:
        throughput = datasize / trainer.elapsed_time
        print('Throughput: {} [images/sec.] ({} / {})'.format(
            throughput, datasize, trainer.elapsed_time))

        model_filepath = os.path.join(args.out, 'trained.model')
        chainer.serializers.save_npz(model_filepath, model)
Example 4
def objective(trial, comm):
    # Sample an architecture.
    model = L.Classifier(create_model(trial))

    # Setup optimizer.
    optimizer = chainer.optimizers.MomentumSGD()
    optimizer.setup(model)
    optimizer = chainermn.create_multi_node_optimizer(optimizer, comm)

    # Setup dataset and iterator. Only worker 0 loads the whole dataset.
    # The dataset of worker 0 is evenly split and distributed to all workers.
    if comm.rank == 0:
        train, test = chainer.datasets.get_mnist()
        rng = np.random.RandomState(0)
        train = chainer.datasets.SubDataset(train,
                                            0,
                                            N_TRAIN_EXAMPLES,
                                            order=rng.permutation(len(train)))
        test = chainer.datasets.SubDataset(test,
                                           0,
                                           N_TEST_EXAMPLES,
                                           order=rng.permutation(len(test)))
    else:
        train, test = None, None

    train = chainermn.scatter_dataset(train, comm, shuffle=True)
    test = chainermn.scatter_dataset(test, comm)

    train_iter = chainer.iterators.SerialIterator(train,
                                                  BATCHSIZE,
                                                  shuffle=True)
    test_iter = chainer.iterators.SerialIterator(test,
                                                 BATCHSIZE,
                                                 repeat=False,
                                                 shuffle=False)

    # Setup trainer.
    updater = chainer.training.StandardUpdater(train_iter, optimizer)
    trainer = chainer.training.Trainer(updater, (EPOCH, 'epoch'))

    if comm.rank == 0:
        trainer.extend(chainer.training.extensions.ProgressBar())

    # Run training.
    trainer.run()

    # Evaluate.
    evaluator = chainer.training.extensions.Evaluator(test_iter, model)
    evaluator = chainermn.create_multi_node_evaluator(evaluator, comm)
    report = evaluator()

    # The following line mitigates the memory problem in CircleCI
    # (see https://github.com/pfnet/optuna/pull/325 for more details).
    gc.collect()

    return 1.0 - report['main/accuracy']
Example 5
def objective(trial, comm):
    # Sample an architecture.
    model = L.Classifier(create_model(trial))

    # Setup optimizer.
    optimizer = chainer.optimizers.MomentumSGD()
    optimizer.setup(model)
    optimizer = chainermn.create_multi_node_optimizer(optimizer, comm)

    # Setup dataset and iterator. Only worker 0 loads the whole dataset.
    # The dataset of worker 0 is evenly split and distributed to all workers.
    if comm.rank == 0:
        train, valid = chainer.datasets.get_mnist()
        rng = np.random.RandomState(0)
        train = chainer.datasets.SubDataset(train,
                                            0,
                                            N_TRAIN_EXAMPLES,
                                            order=rng.permutation(len(train)))
        valid = chainer.datasets.SubDataset(valid,
                                            0,
                                            N_VALID_EXAMPLES,
                                            order=rng.permutation(len(valid)))
    else:
        train, valid = None, None

    train = chainermn.scatter_dataset(train, comm, shuffle=True)
    valid = chainermn.scatter_dataset(valid, comm)

    train_iter = chainer.iterators.SerialIterator(train,
                                                  BATCHSIZE,
                                                  shuffle=True)
    valid_iter = chainer.iterators.SerialIterator(valid,
                                                  BATCHSIZE,
                                                  repeat=False,
                                                  shuffle=False)

    # Setup trainer.
    updater = chainer.training.StandardUpdater(train_iter, optimizer)
    trainer = chainer.training.Trainer(updater, (EPOCH, "epoch"))

    if comm.rank == 0:
        trainer.extend(chainer.training.extensions.ProgressBar())

    # Run training.
    trainer.run()

    # Evaluate.
    evaluator = chainer.training.extensions.Evaluator(valid_iter, model)
    evaluator = chainermn.create_multi_node_evaluator(evaluator, comm)
    report = evaluator()

    return report["main/accuracy"]
Example 6
def objective(trial, comm):
    device = comm.intra_rank
    chainer.cuda.get_device_from_id(device).use()

    # Sample an architecture.
    model = L.Classifier(create_model(trial))
    model.to_gpu()

    # Setup optimizer.
    optimizer = chainer.optimizers.MomentumSGD()
    optimizer.setup(model)
    optimizer = chainermn.create_multi_node_optimizer(optimizer, comm)

    # Setup dataset and iterator. Only worker 0 loads the whole dataset.
    # The dataset of worker 0 is evenly split and distributed to all workers.
    if comm.rank == 0:
        train, test = chainer.datasets.get_mnist()
        rng = np.random.RandomState(0)
        train = chainer.datasets.SubDataset(
            train, 0, N_TRAIN_EXAMPLES, order=rng.permutation(len(train)))
        test = chainer.datasets.SubDataset(
            test, 0, N_TEST_EXAMPLES, order=rng.permutation(len(test)))
    else:
        train, test = None, None

    train = chainermn.scatter_dataset(train, comm, shuffle=True)
    test = chainermn.scatter_dataset(test, comm)

    train_iter = chainer.iterators.SerialIterator(
        train, BATCHSIZE, shuffle=True)
    test_iter = chainer.iterators.SerialIterator(
        test, BATCHSIZE, repeat=False, shuffle=False)

    # Setup trainer.
    updater = chainer.training.StandardUpdater(
        train_iter, optimizer, device=device)
    trainer = chainer.training.Trainer(updater, (EPOCH, 'epoch'))

    if comm.rank == 0:
        trainer.extend(chainer.training.extensions.ProgressBar())

    # Run training.
    trainer.run()

    # Evaluate.
    evaluator = chainer.training.extensions.Evaluator(
        test_iter, model, device=device)
    evaluator = chainermn.create_multi_node_evaluator(evaluator, comm)
    report = evaluator()

    return report['main/accuracy']
Example 7
    def test_mnist(self, display_log=True):
        # This test file is intended to be run on Travis-CI and
        # GPU is not used for now.
        epoch = 5
        batchsize = 100
        n_units = 100

        comm = chainermn.create_communicator('naive')
        model = L.Classifier(MLP(n_units, 10))
        optimizer = chainermn.create_multi_node_optimizer(
            chainer.optimizers.Adam(), comm)
        optimizer.setup(model)

        if comm.rank == 0:
            train, test = chainer.datasets.get_mnist()
        else:
            train, test = None, None

        train = chainermn.scatter_dataset(train, comm)
        test = chainermn.scatter_dataset(test, comm)

        train_iter = chainer.iterators.SerialIterator(train, batchsize)
        test_iter = chainer.iterators.SerialIterator(test,
                                                     batchsize,
                                                     repeat=False,
                                                     shuffle=False)

        updater = training.StandardUpdater(train_iter, optimizer)
        trainer = training.Trainer(updater, (epoch, 'epoch'))

        # Wrap standard Chainer evaluators by MultiNodeEvaluator.
        evaluator = extensions.Evaluator(test_iter, model)
        evaluator = chainermn.create_multi_node_evaluator(evaluator, comm)
        trainer.extend(evaluator)

        # Some display and output extensions are necessary only for one worker.
        # (Otherwise, there would just be repeated outputs.)
        if comm.rank == 0 and display_log:
            trainer.extend(extensions.LogReport(trigger=(1, 'epoch')),
                           trigger=(1, 'epoch'))
            trainer.extend(extensions.PrintReport([
                'epoch', 'main/loss', 'validation/main/loss', 'main/accuracy',
                'validation/main/accuracy', 'elapsed_time'
            ],
                                                  out=sys.stderr),
                           trigger=(1, 'epoch'))
        trainer.run()

        accuracy = evaluator()['validation/main/accuracy']
        self.assertGreaterEqual(accuracy, 0.95)
Example 8
def objective(trial, comm):
    # Sample an architecture.
    model = L.Classifier(create_model(trial))

    # Setup optimizer.
    optimizer = chainer.optimizers.MomentumSGD()
    optimizer.setup(model)
    optimizer = chainermn.create_multi_node_optimizer(optimizer, comm)

    # Setup dataset and iterator.
    train, test = chainer.datasets.get_mnist()
    rng = np.random.RandomState(0)
    train = chainer.datasets.SubDataset(train,
                                        0,
                                        N_TRAIN_EXAMPLES,
                                        order=rng.permutation(len(train)))
    test = chainer.datasets.SubDataset(test,
                                       0,
                                       N_TEST_EXAMPLES,
                                       order=rng.permutation(len(test)))

    train = chainermn.scatter_dataset(train, comm, shuffle=True)
    test = chainermn.scatter_dataset(test, comm)

    train_iter = chainer.iterators.SerialIterator(train,
                                                  BATCHSIZE,
                                                  shuffle=True)
    test_iter = chainer.iterators.SerialIterator(test,
                                                 BATCHSIZE,
                                                 repeat=False,
                                                 shuffle=False)

    # Setup trainer.
    updater = chainer.training.StandardUpdater(train_iter, optimizer)
    trainer = chainer.training.Trainer(updater, (EPOCH, 'epoch'))

    if comm.rank == 0:
        trainer.extend(chainer.training.extensions.ProgressBar())

    # Run training.
    trainer.run()

    # Evaluate.
    evaluator = chainer.training.extensions.Evaluator(test_iter, model)
    evaluator = chainermn.create_multi_node_evaluator(evaluator, comm)
    report = evaluator()

    return 1.0 - report['main/accuracy']
Example 9
    def train(self, dataset, comm=None):
        mc = self.model_config
        tc = self.training_config
        if comm is None:
            comm = chainermn.create_communicator('naive', MPI.comm)
        result = {'training_time': 0.0, 'observation': []}

        # model and optimizer
        master_nnp = MasterNNP(
            tc.elements, mc.n_input, mc.hidden_layers, mc.n_output)
        master_opt = chainer.optimizers.Adam(tc.init_lr)
        master_opt = chainermn.create_multi_node_optimizer(master_opt, comm)
        master_opt.setup(master_nnp)
        master_opt.add_hook(chainer.optimizer_hooks.Lasso(tc.l1_norm))
        master_opt.add_hook(chainer.optimizer_hooks.WeightDecay(tc.l2_norm))

        for training, test in dataset:
            tag = training.tag
            properties = training.property.properties

            # iterators
            train_iter = chainer.iterators.SerialIterator(
                training, tc.batch_size // MPI.size, repeat=True, shuffle=True)
            test_iter = chainer.iterators.SerialIterator(
                test, tc.batch_size // MPI.size, repeat=False, shuffle=False)

            # model
            hdnnp = HighDimensionalNNP(
                training.elemental_composition,
                mc.n_input, mc.hidden_layers, mc.n_output)
            hdnnp.sync_param_with(master_nnp)
            main_opt = chainer.Optimizer()
            main_opt = chainermn.create_multi_node_optimizer(main_opt, comm)
            main_opt.setup(hdnnp)

            # loss function
            _, kwargs = tc.loss_function
            loss_function = self.loss_function(hdnnp, properties, **kwargs)
            observation_keys = loss_function.observation_keys

            # triggers
            interval = (tc.interval, 'epoch')
            stop_trigger = EarlyStoppingTrigger(
                check_trigger=interval,
                monitor=f'val/main/{observation_keys[-1]}',
                patients=tc.patients, mode='min',
                verbose=self.verbose, max_trigger=(tc.epoch, 'epoch'))

            # updater and trainer
            updater = Updater(train_iter,
                              {'main': main_opt, 'master': master_opt},
                              loss_func=loss_function.eval)
            out_dir = tc.out_dir / tag
            trainer = chainer.training.Trainer(updater, stop_trigger, out_dir)

            # extensions
            trainer.extend(ext.ExponentialShift('alpha', 1 - tc.lr_decay,
                                                target=tc.final_lr,
                                                optimizer=master_opt))
            evaluator = chainermn.create_multi_node_evaluator(
                ext.Evaluator(test_iter, hdnnp, eval_func=loss_function.eval),
                comm)
            trainer.extend(evaluator, name='val')
            if tc.scatter_plot:
                trainer.extend(ScatterPlot(test, hdnnp, comm),
                               trigger=interval)
            if MPI.rank == 0:
                if tc.log_report:
                    trainer.extend(ext.LogReport(log_name='training.log'))
                if tc.print_report:
                    trainer.extend(ext.PrintReport(
                        ['epoch', 'iteration']
                        + [f'main/{key}' for key in observation_keys]
                        + [f'val/main/{key}' for key in observation_keys]))
                if tc.plot_report:
                    trainer.extend(ext.PlotReport(
                        [f'main/{key}' for key in observation_keys],
                        x_key='epoch', postprocess=set_log_scale,
                        file_name='training_set.png', marker=None))
                    trainer.extend(ext.PlotReport(
                        [f'val/main/{key}' for key in observation_keys],
                        x_key='epoch', postprocess=set_log_scale,
                        file_name='validation_set.png', marker=None))

            manager = Manager(tag, trainer, result, is_snapshot=True)
            if self.is_resume:
                manager.check_to_resume(self.resume_dir.name)
            if manager.allow_to_run:
                with manager:
                    trainer.run()

        if MPI.rank == 0:
            chainer.serializers.save_npz(
                tc.out_dir / 'master_nnp.npz', master_nnp)

        return result
Example 10
def train(args, train_data, test_data, evaluator_type):
    required_args = [
        'dataset',
        'class_names',
        'logs_dir',
        'min_size',
        'max_size',
        'anchor_scales',
    ]
    for arg_key in required_args:
        if not hasattr(args, arg_key):
            raise ValueError(
                'args must contain required key: {}'.format(arg_key)
            )

    assert evaluator_type in ['voc', 'coco'], \
        'Unsupported evaluator_type: {}'.format(evaluator_type)

    if args.multi_node:
        import chainermn

        comm = chainermn.create_communicator('hierarchical')
        device = comm.intra_rank

        args.n_node = comm.inter_size
        args.n_gpu = comm.size
        chainer.cuda.get_device_from_id(device).use()
    else:
        if args.gpu is None:
            print(
                'Option --gpu is required without --multi-node.',
                file=sys.stderr,
            )
            sys.exit(1)
        args.n_node = 1
        args.n_gpu = 1
        chainer.cuda.get_device_from_id(args.gpu).use()
        device = args.gpu

    args.seed = 0
    now = datetime.datetime.now()
    args.timestamp = now.isoformat()
    args.out = osp.join(args.logs_dir, now.strftime('%Y%m%d_%H%M%S'))

    args.batch_size = args.batch_size_per_gpu * args.n_gpu

    # lr: 0.00125 * 8 = 0.01  in original
    args.lr = 0.00125 * args.batch_size
    args.weight_decay = 0.0001

    # In the original schedule (180k iterations with batchsize 16), lr is
    # divided by 10 at 120k and again at 160k iterations; the same fractions
    # of max_epoch are used here.
    args.step_size = [
        (120e3 / 180e3) * args.max_epoch,
        (160e3 / 180e3) * args.max_epoch,
    ]

    random.seed(args.seed)
    np.random.seed(args.seed)

    if args.pooling_func == 'align':
        pooling_func = cmr.functions.roi_align_2d
    elif args.pooling_func == 'pooling':
        pooling_func = cmr.functions.roi_pooling_2d
    elif args.pooling_func == 'resize':
        pooling_func = cmr.functions.crop_and_resize
    else:
        raise ValueError(
            'Unsupported pooling_func: {}'.format(args.pooling_func)
        )

    if args.initializer == 'normal':
        mask_initialW = chainer.initializers.Normal(0.01)
    elif args.initializer == 'he_normal':
        mask_initialW = chainer.initializers.HeNormal(fan_option='fan_out')
    else:
        raise ValueError(
            'Unsupported initializer: {}'.format(args.initializer)
        )

    if args.model in ['resnet50', 'resnet101']:
        n_layers = int(args.model.lstrip('resnet'))
        mask_rcnn = cmr.models.MaskRCNNResNet(
            n_layers=n_layers,
            n_fg_class=len(args.class_names),
            pooling_func=pooling_func,
            anchor_scales=args.anchor_scales,
            roi_size=args.roi_size,
            min_size=args.min_size,
            max_size=args.max_size,
            mask_initialW=mask_initialW,
        )
    else:
        raise ValueError('Unsupported model: {}'.format(args.model))
    model = cmr.models.MaskRCNNTrainChain(mask_rcnn)
    if args.multi_node or args.gpu >= 0:
        model.to_gpu()

    optimizer = chainer.optimizers.MomentumSGD(lr=args.lr, momentum=0.9)
    if args.multi_node:
        optimizer = chainermn.create_multi_node_optimizer(optimizer, comm)
    optimizer.setup(model)
    optimizer.add_hook(chainer.optimizer.WeightDecay(rate=args.weight_decay))

    if args.model in ['resnet50', 'resnet101']:
        # ResNetExtractor.freeze_at is not enough to freeze params
        # since WeightDecay updates the param little by little.
        mask_rcnn.extractor.conv1.disable_update()
        mask_rcnn.extractor.bn1.disable_update()
        mask_rcnn.extractor.res2.disable_update()
        for link in mask_rcnn.links():
            if isinstance(link, cmr.links.AffineChannel2D):
                link.disable_update()

    train_data = chainer.datasets.TransformDataset(
        train_data,
        cmr.datasets.MaskRCNNTransform(mask_rcnn),
    )
    test_data = chainer.datasets.TransformDataset(
        test_data,
        cmr.datasets.MaskRCNNTransform(mask_rcnn, train=False),
    )
    if args.multi_node:
        if comm.rank != 0:
            train_data = None
            test_data = None
        train_data = chainermn.scatter_dataset(train_data, comm, shuffle=True)
        test_data = chainermn.scatter_dataset(test_data, comm)

    # FIXME: MultiProcessIterator sometimes hangs
    train_iter = chainer.iterators.SerialIterator(
        train_data,
        batch_size=args.batch_size_per_gpu,
    )
    test_iter = chainer.iterators.SerialIterator(
        test_data,
        batch_size=args.batch_size_per_gpu,
        repeat=False,
        shuffle=False,
    )

    converter = functools.partial(
        cmr.datasets.concat_examples,
        padding=0,
        # img, bboxes, labels, masks, scales
        indices_concat=[0, 2, 3, 4],  # img, _, labels, masks, scales
        indices_to_device=[0, 1],  # img, bbox
    )
    updater = chainer.training.updater.StandardUpdater(
        train_iter,
        optimizer,
        device=device,
        converter=converter,
    )

    trainer = training.Trainer(
        updater,
        (args.max_epoch, 'epoch'),
        out=args.out,
    )

    trainer.extend(
        extensions.ExponentialShift('lr', 0.1),
        trigger=training.triggers.ManualScheduleTrigger(
            args.step_size,
            'epoch',
        ),
    )

    eval_interval = 1, 'epoch'
    log_interval = 20, 'iteration'
    plot_interval = 0.1, 'epoch'
    print_interval = 20, 'iteration'

    if evaluator_type == 'voc':
        evaluator = cmr.extensions.InstanceSegmentationVOCEvaluator(
            test_iter,
            model.mask_rcnn,
            device=device,
            use_07_metric=True,
            label_names=args.class_names,
        )
    elif evaluator_type == 'coco':
        evaluator = cmr.extensions.InstanceSegmentationCOCOEvaluator(
            test_iter,
            model.mask_rcnn,
            device=device,
            label_names=args.class_names,
        )
    else:
        raise ValueError(
            'Unsupported evaluator_type: {}'.format(evaluator_type)
        )
    if args.multi_node:
        evaluator = chainermn.create_multi_node_evaluator(evaluator, comm)
    trainer.extend(evaluator, trigger=eval_interval)

    if not args.multi_node or comm.rank == 0:
        # Save snapshot.
        trainer.extend(
            extensions.snapshot_object(model.mask_rcnn, 'snapshot_model.npz'),
            trigger=training.triggers.MaxValueTrigger(
                'validation/main/map',
                eval_interval,
            ),
        )

        # Dump params.yaml.
        args.git_hash = cmr.utils.git_hash()
        args.hostname = socket.gethostname()
        trainer.extend(fcn.extensions.ParamsReport(args.__dict__))

        # Visualization.
        trainer.extend(
            cmr.extensions.InstanceSegmentationVisReport(
                test_iter,
                model.mask_rcnn,
                label_names=args.class_names,
            ),
            trigger=eval_interval,
        )

        # Logging.
        trainer.extend(
            chainer.training.extensions.observe_lr(),
            trigger=log_interval,
        )
        trainer.extend(extensions.LogReport(trigger=log_interval))
        trainer.extend(
            extensions.PrintReport(
                [
                    'iteration',
                    'epoch',
                    'elapsed_time',
                    'lr',
                    'main/loss',
                    'main/roi_loc_loss',
                    'main/roi_cls_loss',
                    'main/roi_mask_loss',
                    'main/rpn_loc_loss',
                    'main/rpn_cls_loss',
                    'validation/main/map',
                ],
            ),
            trigger=print_interval,
        )
        trainer.extend(extensions.ProgressBar(update_interval=10))

        # Plot.
        assert extensions.PlotReport.available()
        trainer.extend(
            extensions.PlotReport(
                [
                    'main/loss',
                    'main/roi_loc_loss',
                    'main/roi_cls_loss',
                    'main/roi_mask_loss',
                    'main/rpn_loc_loss',
                    'main/rpn_cls_loss',
                ],
                file_name='loss.png',
                trigger=plot_interval,
            ),
            trigger=plot_interval,
        )
        trainer.extend(
            extensions.PlotReport(
                ['validation/main/map'],
                file_name='accuracy.png',
                trigger=plot_interval,
            ),
            trigger=eval_interval,
        )

        trainer.extend(extensions.dump_graph('main/loss'))

    trainer.run()
Example 11
    optimizer = chainermn.create_multi_node_optimizer(chainer.optimizers.MomentumSGD(args.learning_rate), comm)
    optimizer.setup(model)
    optimizer.add_hook(chainer.optimizer.WeightDecay(5e-4))
    
    num_loaders = 2
    train_iter = chainer.iterators.MultiprocessIterator(train, args.batch_size, n_processes=num_loaders)
    test_iter = chainer.iterators.MultiprocessIterator(test, args.batch_size, repeat=False, n_processes=num_loaders)

    # Set up a trainer
    updater = training.StandardUpdater(train_iter, optimizer, device=device)
    trainer = training.Trainer(updater, (args.epochs, 'epoch'), out=args.output_data_dir)

    # Evaluate the model with the test dataset for each epoch

    evaluator = extensions.Evaluator(test_iter, model, device=device)
    evaluator = chainermn.create_multi_node_evaluator(evaluator, comm)
    trainer.extend(evaluator)

    # Reduce the learning rate by half every 25 epochs.
    trainer.extend(extensions.ExponentialShift('lr', 0.5), trigger=(25, 'epoch'))

    # Dump a computational graph from 'loss' variable at the first iteration
    # The "main" refers to the target link of the "main" optimizer.
    trainer.extend(extensions.dump_graph('main/loss'))

    # Write a log of evaluation statistics for each epoch
    trainer.extend(extensions.LogReport())
    if comm.rank == 0:
        if extensions.PlotReport.available():
            trainer.extend(
                extensions.PlotReport(['main/loss', 'validation/main/loss'],
                                      'epoch',
                                      file_name='loss.png'))
Example 12
    test_dataset = _preprocess_mnist(test_file, **preprocess_mnist_options)

    train_iter = chainer.iterators.SerialIterator(train_dataset,
                                                  args.batch_size)
    test_iter = chainer.iterators.SerialIterator(test_dataset,
                                                 args.batch_size,
                                                 repeat=False,
                                                 shuffle=False)

    updater = training.StandardUpdater(train_iter, optimizer, device=device)
    trainer = training.Trainer(updater, (args.epochs, 'epoch'),
                               out=args.output_data_dir)

    # Create a multi node evaluator from a standard Chainer evaluator.
    evaluator = extensions.Evaluator(test_iter, model, device=device)
    evaluator = chainermn.create_multi_node_evaluator(evaluator, comm)
    trainer.extend(evaluator)

    # Some display and output extensions are necessary only for one worker.
    # (Otherwise, there would just be repeated outputs.)
    if comm.rank == 0:
        if extensions.PlotReport.available():
            trainer.extend(
                extensions.PlotReport(['main/loss', 'validation/main/loss'],
                                      'epoch',
                                      file_name='loss.png'))
            trainer.extend(
                extensions.PlotReport(
                    ['main/accuracy', 'validation/main/accuracy'],
                    'epoch',
                    file_name='accuracy.png'))
Example 13
def main():
    parser = argparse.ArgumentParser(description='Chainer example: seq2seq')
    parser.add_argument('--batchsize', '-b', type=int, default=64,
                        help='Number of examples in each mini-batch')
    parser.add_argument('--bleu', action="store_true", default=False,
                        help='Report BLEU score')
    parser.add_argument('--gpu', '-g', action='store_true',
                        help='Use GPU')
    parser.add_argument('--cache', '-c', default=None,
                        help='Directory to cache pre-processed dataset')
    parser.add_argument('--resume', '-r', default='',
                        help='Resume the training from snapshot')
    parser.add_argument('--unit', '-u', type=int, default=1024,
                        help='Number of units')
    parser.add_argument('--communicator', default='hierarchical',
                        help="Type of communicator")
    parser.add_argument('--stop', '-s', type=str, default="15e",
                        help='Stop trigger (ex. "500i", "15e")')
    parser.add_argument('--input', '-i', type=str, default='wmt',
                        help='Input directory')
    parser.add_argument('--optimizer', type=str, default="adam()",
                        help="Optimizer and its argument")
    parser.add_argument('--out', '-o', default='result',
                        help='Directory to output the result')
    args = parser.parse_args()

    # Prepare ChainerMN communicator
    if args.gpu:
        comm = chainermn.create_communicator('hierarchical')
        dev = comm.intra_rank
    else:
        comm = chainermn.create_communicator('naive')
        dev = -1

    if comm.mpi_comm.rank == 0:
        print('==========================================')
        print('Num process (COMM_WORLD): {}'.format(MPI.COMM_WORLD.Get_size()))
        if args.gpu:
            print('Using GPUs')
        print('Using {} communicator'.format(args.communicator))
        print('Num unit: {}'.format(args.unit))
        print('Num Minibatch-size: {}'.format(args.batchsize))
        print('==========================================')

    # Rank 0 prepares all data
    if comm.rank == 0:
        if args.cache and not os.path.exists(args.cache):
            os.mkdir(args.cache)

        # Read source data
        bt = time.time()
        if args.cache:
            cache_file = os.path.join(args.cache, 'source.pickle')
            source_vocab, source_data = cached_call(cache_file,
                                                    read_source,
                                                    args.input, args.cache)
        else:
            source_vocab, source_data = read_source(args.input, args.cache)
        et = time.time()
        print("RD source done. {:.3f} [s]".format(et - bt))
        sys.stdout.flush()

        # Read target data
        bt = time.time()
        if args.cache:
            cache_file = os.path.join(args.cache, 'target.pickle')
            target_vocab, target_data = cached_call(cache_file,
                                                    read_target,
                                                    args.input, args.cache)
        else:
            target_vocab, target_data = read_target(args.input, args.cache)
        et = time.time()
        print("RD target done. {:.3f} [s]".format(et - bt))
        sys.stdout.flush()

        print('Original training data size: %d' % len(source_data))
        train_data = [(s, t)
                      for s, t in six.moves.zip(source_data, target_data)
                      if 0 < len(s) < 50 and 0 < len(t) < 50]
        print('Filtered training data size: %d' % len(train_data))

        en_path = os.path.join(args.input, 'dev', 'newstest2013.en')
        source_data = europal.make_dataset(en_path, source_vocab)
        fr_path = os.path.join(args.input, 'dev', 'newstest2013.fr')
        target_data = europal.make_dataset(fr_path, target_vocab)
        assert(len(source_data) == len(target_data))
        test_data = [(s, t) for s, t
                     in six.moves.zip(source_data, target_data)
                     if 0 < len(s) and 0 < len(t)]

        source_ids = {word: index
                      for index, word in enumerate(source_vocab)}
        target_ids = {word: index
                      for index, word in enumerate(target_vocab)}
    else:
        # target_data, source_data = None, None
        train_data, test_data = None, None
        target_ids, source_ids = None, None

    # Print GPU id
    for i in range(0, comm.size):
        if comm.rank == i:
            print("Rank {} GPU: {}".format(comm.rank, dev))
        sys.stdout.flush()
        comm.mpi_comm.Barrier()

    # Broadcast the word -> id dictionaries.
    source_ids = comm.mpi_comm.bcast(source_ids, root=0)
    target_ids = comm.mpi_comm.bcast(target_ids, root=0)

    target_words = {i: w for w, i in target_ids.items()}
    source_words = {i: w for w, i in source_ids.items()}

    if comm.rank == 0:
        print("target_words : {}".format(len(target_words)))
        print("source_words : {}".format(len(source_words)))

    model = Seq2seq(3, len(source_ids), len(target_ids), args.unit)

    if dev >= 0:
        chainer.cuda.get_device(dev).use()
        model.to_gpu(dev)

    # determine the stop trigger
    m = re.match(r'^(\d+)e$', args.stop)
    if m:
        trigger = (int(m.group(1)), 'epoch')
    else:
        m = re.match(r'^(\d+)i$', args.stop)
        if m:
            trigger = (int(m.group(1)), 'iteration')
        else:
            if comm.rank == 0:
                sys.stderr.write("Error: unknown stop trigger: {}".format(
                    args.stop))
            exit(-1)

    if comm.rank == 0:
        print("Trigger: {}".format(trigger))

    optimizer = chainermn.create_multi_node_optimizer(
        create_optimizer(args.optimizer), comm)
    optimizer.setup(model)

    # Scatter the datasets to all workers.
    train_data = chainermn.scatter_dataset(train_data, comm)

    test_data = chainermn.scatter_dataset(test_data, comm)

    train_iter = chainer.iterators.SerialIterator(train_data,
                                                  args.batchsize,
                                                  shuffle=False)
    updater = training.StandardUpdater(
        train_iter, optimizer, converter=convert, device=dev)
    trainer = training.Trainer(updater,
                               trigger,
                               out=args.out)

    trainer.extend(chainermn.create_multi_node_evaluator(
        BleuEvaluator(model, test_data, device=dev, comm=comm),
        comm))

    def translate_one(source, target):
        words = europal.split_sentence(source)
        print('# source : ' + ' '.join(words))
        x = model.xp.array(
            [source_ids.get(w, 1) for w in words], 'i')
        ys = model.translate([x])[0]
        words = [target_words[y] for y in ys]
        print('#  result : ' + ' '.join(words))
        print('#  expect : ' + target)

    # @chainer.training.make_extension(trigger=(200, 'iteration'))
    def translate(trainer):
        translate_one(
            'Who are we ?',
            'Qui sommes-nous?')
        translate_one(
            'And it often costs over a hundred dollars ' +
            'to obtain the required identity card .',
            'Or, il en coûte souvent plus de cent dollars ' +
            'pour obtenir la carte d\'identité requise.')

        source, target = test_data[numpy.random.choice(len(test_data))]
        source = ' '.join([source_words.get(i, '') for i in source])
        target = ' '.join([target_words.get(i, '') for i in target])
        translate_one(source, target)

    if comm.rank == 0:
        trainer.extend(extensions.LogReport(trigger=(1, 'epoch')),
                       trigger=(1, 'epoch'))

        report = extensions.PrintReport(['epoch',
                                         'iteration',
                                         'main/loss',
                                         'main/perp',
                                         'validation/main/bleu',
                                         'elapsed_time'])
        trainer.extend(report, trigger=(1, 'epoch'))

    comm.mpi_comm.Barrier()
    if comm.rank == 0:
        print('start training')
        sys.stdout.flush()

    trainer.run()
Example 14
def main():
    import chainermn
    chainer.global_config.autotune = True
    parser = argparse.ArgumentParser(description='ChainerMN example: Train MQAP using 3DCNN')
    parser.add_argument('--communicator', type=str,
                        default='hierarchical', help='Type of communicator')
    parser.add_argument('--gpu', '-g', action='store_true',
                        help='Use GPU')
    parser.add_argument('--out', '-o', default='result',
                        help='Directory to output the result')
    parser.add_argument('--resume', '-r', action='store_true',
                        help='Resume the training from snapshot')
    parser.add_argument('--weight', '-w', action='store_true',
                        help='Resume only weight')
    parser.add_argument('--config', '-c', type=int, default=0,
                        help='Number of config')
    parser.add_argument('--config_file', type=str, default='./data/config.json',
                        help='Config file path')

    args = parser.parse_args()
    if args.gpu:
        if args.communicator == 'naive':
            print("Error: 'naive' communicator does not support GPU.\n")
            exit(-1)
        comm = chainermn.create_communicator(args.communicator, allreduce_grad_dtype='float16')
        device = comm.intra_rank
    else:
        if args.communicator != 'naive':
            print('Warning: using naive communicator '
                  'because only naive supports CPU-only execution')
        comm = chainermn.create_communicator('naive')
        device = -1
    f = open(args.config_file, 'r')

    config = json.load(f)['Config'][args.config]
    args.out = os.path.join(args.out, str(args.config))
    if comm.rank == 0:
        print('==========================================')
        chainer.print_runtime_info()
        print('Num process (COMM_WORLD): {}'.format(comm.size))
        if args.gpu:
            print('Using GPUs')
        print('Using {} communicator'.format(args.communicator))
        print('Num epoch: {}'.format(config['epoch']))
        print('Batch size:  {}'.format(config['batch_size'] * comm.size))
        print('Optimizer:  {}'.format(config['optimizer']))
        print('Learning Rate:  {}'.format(config['learning_rate']))
        print('Out Directory:  {}'.format(args.out))
        print('Vertex feature:  {}'.format(config['vertex_feature']))
        if config['global_mode']:
            print('Using Global loss')
        if config['local_mode']:
            print('Using local loss')
            print('Local type : {}'.format(config['local_type']))
            print('Local label : {}'.format(config['local_label']))
        print('==========================================')
    d = Dataproc(size=comm.size, rank=comm.rank, config=config)
    if device >= 0:
        chainer.cuda.get_device(device).use()
    # sub_comm = comm.split(comm.rank // comm.intra_size, comm.rank)
    if config['local_type'] == 'Regression':
        local_loss_func = F.mean_squared_error
    else:
        local_loss_func = F.sigmoid_cross_entropy
    global_loss_func = F.mean_squared_error
    model = build_model(config=config, comm=comm)
    model = Classifier(predictor=model, local_loss_func=local_loss_func, global_loss_func=global_loss_func,
                       config=config)
    if device >= 0:
        model.to_gpu()
    train, test = d.get_dataset(key='train'), d.get_dataset(key='test')
    train_iter = I.SerialIterator(dataset=train, batch_size=config['batch_size'], repeat=True, shuffle=True)
    test_iter = I.SerialIterator(dataset=test, batch_size=config['batch_size'], repeat=False, shuffle=False)
    # train_iter = I.MultiprocessIterator(dataset=train, batch_size=args.batch, repeat=True, shuffle=True, n_processes=10)
    # test_iter = I.MultiprocessIterator(dataset=test, batch_size=args.batch, repeat=False, shuffle=True, n_processes=10)

    if config['optimizer'] == 'Adam':
        optimizer = chainer.optimizers.Adam(alpha=config['learning_rate'],
                                            weight_decay_rate=config['weight_decay_rate'], amsgrad=True)
        optimizer = chainermn.create_multi_node_optimizer(optimizer, comm, double_buffering=False)
    elif config['optimizer'] == 'MomentumSGD':
        optimizer = chainer.optimizers.MomentumSGD(lr=config['learning_rate'])
        optimizer = chainermn.create_multi_node_optimizer(optimizer, comm, double_buffering=False)
    elif config['optimizer'] == 'SMORMS3':
        optimizer = chainer.optimizers.SMORMS3(lr=config['learning_rate'])
        optimizer = chainermn.create_multi_node_optimizer(optimizer, comm, double_buffering=False)
    elif config['optimizer'] == 'Eve':
        from my_optimizer.eve import Eve, create_multi_node_optimizer
        optimizer = Eve(alpha=config['learning_rate'])
        optimizer = create_multi_node_optimizer(optimizer, comm, double_buffering=False)
    elif config['optimizer'] == 'Adabound':
        from my_optimizer.adabound import Adam as Adabound
        optimizer = Adabound(alpha=config['learning_rate'], adabound=True, amsgrad=True,
                             weight_decay_rate=config['weight_decay_rate'])
        optimizer = chainermn.create_multi_node_optimizer(optimizer, comm, double_buffering=False)
    optimizer.setup(model)
    val_interval = 1, 'epoch'
    log_interval = 1, 'epoch'
    updater = training.StandardUpdater(train_iter, optimizer, device=device, converter=d.get_converter())
    trainer = training.Trainer(updater, (config['epoch'], 'epoch'), out=args.out)
    evaluator = GraphEvaluator(iterator=test_iter, target=model.predictor, device=device, converter=d.get_converter(),
                               comm=comm, local_loss_func=local_loss_func, global_loss_func=global_loss_func,
                               name='val', config=config)
    evaluator = chainermn.create_multi_node_evaluator(evaluator, comm)
    trainer.extend(evaluator, trigger=val_interval)
    if comm.rank == 0:
        trainer.extend(extensions.dump_graph('main/loss'))
        trainer.extend(extensions.snapshot(), trigger=val_interval)
        trainer.extend(extensions.LogReport(trigger=log_interval))
        trainer.extend(extensions.PlotReport(['main/loss', 'val/main/loss'], 'epoch', file_name='loss.png'),
                       trigger=val_interval)
        report_list = ['epoch', 'main/loss', 'val/main/loss']
        if config['global_mode']:
            report_list.extend(['main/global_loss', 'val/main/global_loss', 'val/main/global_pearson'])
            trainer.extend(extensions.PlotReport(['main/global_loss', 'val/main/global_loss'], 'epoch',
                                                 file_name='global_loss.png'), trigger=val_interval)
        if config['local_mode']:
            report_list.extend(['main/local_loss', 'val/main/local_loss', 'val/main/local_mean_pearson'])
            if config['local_type'] == 'Classification':
                report_list.append('val/main/local_auc')
                trainer.extend(extensions.PlotReport(['val/main/local_auc'], 'epoch', file_name='local_auc.png'),
                               trigger=val_interval)
            else:
                report_list.append('val/main/local_pearson')
        report_list.append('elapsed_time')
        trainer.extend(extensions.PrintReport(report_list), trigger=log_interval)
        trainer.extend(extensions.ProgressBar(update_interval=10))
    if args.resume:
        snap_list = [p for p in os.listdir(args.out) if 'snapshot' in p]
        snap_num = np.array([int(re.findall(r"[+-]?[0-9]+[\.]?[0-9]*[eE]?[+-]?[0-9]*", p)[0]) for p in snap_list])
        path = snap_list[np.argmax(snap_num)]
        path = os.path.join(args.out, path)
        if args.weight:
            obj_path = 'updater/model:main/predictor/'
            chainer.serializers.load_npz(path, model.predictor, obj_path)
        else:
            chainer.serializers.load_npz(path, trainer)
    if comm.rank == 0:
        protein_name_dict = d.get_protein_name_dict()
        out_path = Path(args.out)
        if not out_path.exists():
            out_path.mkdir(parents=True, exist_ok=True)
        np.savez(os.path.join(args.out, 'protein_name'), **protein_name_dict)
        f = open(os.path.join(args.out, 'config.json'), 'w')
        json.dump(config, f, ensure_ascii=False, indent=4, sort_keys=True, separators=(',', ': '))
        f.close()
        f = open(os.path.join(args.out, 'args.json'), 'w')
        json.dump(vars(args), f)
        f.close()
    if comm.rank == 0:
        print('train start!!!')
    trainer.run()
Example 15
def main():
    model_cfgs = {
        'resnet50': {
            'class': ResNet50,
            'score_layer_name': 'fc6',
            'kwargs': {
                'arch': 'fb'
            }
        },
        'resnet101': {
            'class': ResNet101,
            'score_layer_name': 'fc6',
            'kwargs': {
                'arch': 'fb'
            }
        },
        'resnet152': {
            'class': ResNet152,
            'score_layer_name': 'fc6',
            'kwargs': {
                'arch': 'fb'
            }
        }
    }
    parser = argparse.ArgumentParser(
        description='Learning convnet from ILSVRC2012 dataset')
    parser.add_argument('train', help='Path to root of the train dataset')
    parser.add_argument('val', help='Path to root of the validation dataset')
    parser.add_argument('--model',
                        '-m',
                        choices=model_cfgs.keys(),
                        default='resnet50',
                        help='Convnet models')
    parser.add_argument('--communicator',
                        type=str,
                        default='pure_nccl',
                        help='Type of communicator')
    parser.add_argument('--loaderjob', type=int, default=4)
    parser.add_argument('--batchsize',
                        type=int,
                        default=32,
                        help='Batch size for each worker')
    parser.add_argument('--lr', type=float)
    parser.add_argument('--momentum', type=float, default=0.9)
    parser.add_argument('--weight_decay', type=float, default=0.0001)
    parser.add_argument('--out', type=str, default='result')
    parser.add_argument('--epoch', type=int, default=90)
    args = parser.parse_args()

    # https://docs.chainer.org/en/stable/chainermn/tutorial/tips_faqs.html#using-multiprocessiterator
    if hasattr(multiprocessing, 'set_start_method'):
        multiprocessing.set_start_method('forkserver')
        p = multiprocessing.Process()
        p.start()
        p.join()

    comm = chainermn.create_communicator(args.communicator)
    device = comm.intra_rank

    if args.lr is not None:
        lr = args.lr
    else:
        lr = 0.1 * (args.batchsize * comm.size) / 256
        if comm.rank == 0:
            print('lr={}: lr is selected based on the linear '
                  'scaling rule'.format(lr))

    label_names = directory_parsing_label_names(args.train)

    model_cfg = model_cfgs[args.model]
    extractor = model_cfg['class'](n_class=len(label_names),
                                   **model_cfg['kwargs'])
    extractor.pick = model_cfg['score_layer_name']
    model = Classifier(extractor)
    # Following https://arxiv.org/pdf/1706.02677.pdf,
    # the gamma of the last BN of each resblock is initialized to zero.
    for l in model.links():
        if isinstance(l, Bottleneck):
            l.conv3.bn.gamma.data[:] = 0

    train_data = DirectoryParsingLabelDataset(args.train)
    val_data = DirectoryParsingLabelDataset(args.val)
    train_data = TransformDataset(train_data, ('img', 'label'),
                                  TrainTransform(extractor.mean))
    val_data = TransformDataset(val_data, ('img', 'label'),
                                ValTransform(extractor.mean))
    print('finished loading dataset')

    if comm.rank == 0:
        train_indices = np.arange(len(train_data))
        val_indices = np.arange(len(val_data))
    else:
        train_indices = None
        val_indices = None

    train_indices = chainermn.scatter_dataset(train_indices,
                                              comm,
                                              shuffle=True)
    val_indices = chainermn.scatter_dataset(val_indices, comm, shuffle=True)
    train_data = train_data.slice[train_indices]
    val_data = val_data.slice[val_indices]
    train_iter = chainer.iterators.MultiprocessIterator(
        train_data, args.batchsize, n_processes=args.loaderjob)
    val_iter = iterators.MultiprocessIterator(val_data,
                                              args.batchsize,
                                              repeat=False,
                                              shuffle=False,
                                              n_processes=args.loaderjob)

    optimizer = chainermn.create_multi_node_optimizer(
        CorrectedMomentumSGD(lr=lr, momentum=args.momentum), comm)
    optimizer.setup(model)
    for param in model.params():
        if param.name not in ('beta', 'gamma'):
            param.update_rule.add_hook(WeightDecay(args.weight_decay))

    if device >= 0:
        chainer.cuda.get_device(device).use()
        model.to_gpu()

    updater = chainer.training.StandardUpdater(train_iter,
                                               optimizer,
                                               device=device)

    trainer = training.Trainer(updater, (args.epoch, 'epoch'), out=args.out)

    @make_shift('lr')
    def warmup_and_exponential_shift(trainer):
        epoch = trainer.updater.epoch_detail
        warmup_epoch = 5
        if epoch < warmup_epoch:
            if lr > 0.1:
                warmup_rate = 0.1 / lr
                rate = warmup_rate \
                    + (1 - warmup_rate) * epoch / warmup_epoch
            else:
                rate = 1
        elif epoch < 30:
            rate = 1
        elif epoch < 60:
            rate = 0.1
        elif epoch < 80:
            rate = 0.01
        else:
            rate = 0.001
        return rate * lr

    trainer.extend(warmup_and_exponential_shift)
    evaluator = chainermn.create_multi_node_evaluator(
        extensions.Evaluator(val_iter, model, device=device), comm)
    trainer.extend(evaluator, trigger=(1, 'epoch'))

    log_interval = 0.1, 'epoch'
    print_interval = 0.1, 'epoch'

    if comm.rank == 0:
        trainer.extend(chainer.training.extensions.observe_lr(),
                       trigger=log_interval)
        trainer.extend(extensions.snapshot_object(
            extractor, 'snapshot_model_{.updater.epoch}.npz'),
                       trigger=(args.epoch, 'epoch'))
        trainer.extend(extensions.LogReport(trigger=log_interval))
        trainer.extend(extensions.PrintReport([
            'iteration', 'epoch', 'elapsed_time', 'lr', 'main/loss',
            'validation/main/loss', 'main/accuracy', 'validation/main/accuracy'
        ]),
                       trigger=print_interval)
        trainer.extend(extensions.ProgressBar(update_interval=10))

    trainer.run()
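Like the other ChainerMN examples above, this script is meant to be started through an MPI launcher with one process per GPU; a typical invocation would be something along the lines of mpiexec -n 8 python train_imagenet.py <train_dir> <val_dir> --communicator pure_nccl, where the script name and process count are placeholders and the exact command depends on the cluster setup.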
Example 16
def main():
    # Check if GPU is available
    # (ImageNet example does not support CPU execution)
    if not chainer.cuda.available:
        raise RuntimeError('ImageNet requires GPU support.')

    archs = {
        'alex': alex.Alex,
        'googlenet': googlenet.GoogLeNet,
        'googlenetbn': googlenetbn.GoogLeNetBN,
        'nin': nin.NIN,
        'resnet50': resnet50.ResNet50,
    }

    parser = argparse.ArgumentParser(
        description='Learning convnet from ILSVRC2012 dataset')
    parser.add_argument('train', help='Path to training image-label list file')
    parser.add_argument('val', help='Path to validation image-label list file')
    parser.add_argument('--arch',
                        '-a',
                        choices=archs.keys(),
                        default='nin',
                        help='Convnet architecture')
    parser.add_argument('--batchsize',
                        '-B',
                        type=int,
                        default=32,
                        help='Learning minibatch size')
    parser.add_argument('--epoch',
                        '-E',
                        type=int,
                        default=10,
                        help='Number of epochs to train')
    parser.add_argument('--initmodel',
                        help='Initialize the model from given file')
    parser.add_argument('--loaderjob',
                        '-j',
                        type=int,
                        help='Number of parallel data loading processes')
    parser.add_argument('--mean',
                        '-m',
                        default='mean.npy',
                        help='Mean file (computed by compute_mean.py)')
    parser.add_argument('--resume',
                        '-r',
                        default='',
                        help='Initialize the trainer from given file')
    parser.add_argument('--out',
                        '-o',
                        default='result',
                        help='Output directory')
    parser.add_argument('--root',
                        '-R',
                        default='.',
                        help='Root directory path of image files')
    parser.add_argument('--val_batchsize',
                        '-b',
                        type=int,
                        default=250,
                        help='Validation minibatch size')
    parser.add_argument('--test', action='store_true')
    parser.add_argument('--communicator', default='hierarchical')
    parser.set_defaults(test=False)
    args = parser.parse_args()

    # The start method of the multiprocessing module needs to be changed when
    # InfiniBand and MultiprocessIterator are used together, because processes
    # that use InfiniBand often crash when they fork.  (c.f.,
    # https://www.open-mpi.org/faq/?category=tuning#fork-warning )
    # Also, just setting the start method does not seem to be sufficient to
    # actually launch the forkserver processes, so also start a dummy process.
    # See also our document:
    # https://chainermn.readthedocs.io/en/stable/tutorial/tips_faqs.html#using-multiprocessiterator
    # This must be done *before* ``chainermn.create_communicator``!!!
    multiprocessing.set_start_method('forkserver')
    p = multiprocessing.Process(target=lambda *x: x, args=())
    p.start()
    p.join()

    # Prepare ChainerMN communicator.
    comm = chainermn.create_communicator(args.communicator)
    device = comm.intra_rank

    if comm.rank == 0:
        print('==========================================')
        print('Num process (COMM_WORLD): {}'.format(comm.size))
        print('Using {} communicator'.format(args.communicator))
        print('Using {} arch'.format(args.arch))
        print('Num Minibatch-size: {}'.format(args.batchsize))
        print('Num epoch: {}'.format(args.epoch))
        print('==========================================')

    model = archs[args.arch]()
    if args.initmodel:
        print('Load model from', args.initmodel)
        chainer.serializers.load_npz(args.initmodel, model)

    chainer.cuda.get_device_from_id(device).use()  # Make the GPU current
    model.to_gpu()

    # Split and distribute the dataset. Only worker 0 loads the whole dataset.
    # Datasets of worker 0 are evenly split and distributed to all workers.
    mean = np.load(args.mean)
    if comm.rank == 0:
        train = PreprocessedDataset(args.train, args.root, mean, model.insize)
        val = PreprocessedDataset(args.val, args.root, mean, model.insize,
                                  False)
    else:
        train = None
        val = None
    train = chainermn.scatter_dataset(train, comm, shuffle=True)
    val = chainermn.scatter_dataset(val, comm)

    # A workaround for process crashes must be applied before creating the
    # communicator above when fork (e.g. MultiprocessIterator) is used
    # together with InfiniBand.
    train_iter = chainer.iterators.MultiprocessIterator(
        train, args.batchsize, n_processes=args.loaderjob)
    val_iter = chainer.iterators.MultiprocessIterator(
        val, args.val_batchsize, repeat=False, n_processes=args.loaderjob)

    # Create a multi node optimizer from a standard Chainer optimizer.
    optimizer = chainermn.create_multi_node_optimizer(
        chainer.optimizers.MomentumSGD(lr=0.01, momentum=0.9), comm)
    optimizer.setup(model)

    # Set up a trainer
    updater = training.StandardUpdater(train_iter, optimizer, device=device)
    trainer = training.Trainer(updater, (args.epoch, 'epoch'), args.out)

    checkpoint_interval = (10, 'iteration') if args.test else (1, 'epoch')
    val_interval = (10, 'iteration') if args.test else (1, 'epoch')
    log_interval = (10, 'iteration') if args.test else (1, 'epoch')

    checkpointer = chainermn.create_multi_node_checkpointer(
        name='imagenet-example', comm=comm)
    checkpointer.maybe_load(trainer, optimizer)
    trainer.extend(checkpointer, trigger=checkpoint_interval)
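    # The multi-node checkpointer periodically snapshots the trainer and
    # optimizer state on every worker; maybe_load() above restores the latest
    # checkpoint if one exists, so an interrupted job can resume.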

    # Create a multi node evaluator from an evaluator.
    evaluator = TestModeEvaluator(val_iter, model, device=device)
    evaluator = chainermn.create_multi_node_evaluator(evaluator, comm)
    trainer.extend(evaluator, trigger=val_interval)

    # Some display and output extensions are necessary only for one worker.
    # (Otherwise, there would just be repeated outputs.)
    if comm.rank == 0:
        trainer.extend(extensions.DumpGraph('main/loss'))
        trainer.extend(extensions.LogReport(trigger=log_interval))
        trainer.extend(extensions.observe_lr(), trigger=log_interval)
        trainer.extend(extensions.PrintReport([
            'epoch', 'iteration', 'main/loss', 'validation/main/loss',
            'main/accuracy', 'validation/main/accuracy', 'lr'
        ]),
                       trigger=log_interval)
        trainer.extend(extensions.ProgressBar(update_interval=10))

    if args.resume:
        chainer.serializers.load_npz(args.resume, trainer)

    trainer.run()
Example 17
def main():
    parser = argparse.ArgumentParser()
    parser.add_argument('--out', '-o', default=None)
    parser.add_argument('--config', default=None)
    parser.add_argument('--resume', default=None)
    args = parser.parse_args()

    # gpu communicator
    comm = chainermn.create_communicator('hierarchical')
    device = comm.intra_rank
    chainer.cuda.get_device_from_id(device).use()

    # out
    out = args.out
    if out is None:
        timestamp = datetime.datetime.now().strftime('%Y%m%d_%H%M%S')
        out = osp.join(filepath, 'out', timestamp)

    # config
    cfgpath = args.config
    if cfgpath is None:
        cfgpath = osp.join(filepath, 'cfg', 'train.yaml')
    with open(cfgpath, 'r') as f:
        config = easydict.EasyDict(yaml.load(f, Loader=yaml.SafeLoader))

    if comm.rank == 0:
        os.makedirs(out)
        shutil.copy(cfgpath, osp.join(out, 'train.yaml'))

    min_size = config.min_size
    max_size = config.max_size
    random_seed = config.random_seed
    if 'max_epoch' in config:
        max_epoch = config.max_epoch
        max_iter = None
    else:
        max_epoch = None
        max_iter = config.max_iter
    lr = config.lr
    if 'cooldown_epoch' in config:
        cooldown_epoch = config.cooldown_epoch
        cooldown_iter = None
    else:
        cooldown_epoch = None
        cooldown_iter = config.cooldown_iter
    lr_cooldown_factor = config.lr_cooldown_factor

    # set random seed
    np.random.seed(random_seed)
    cp.random.seed(random_seed)

    # model
    n_class = len(voc_label_names)
    fcis_model = fcis.models.FCISResNet101(n_class,
                                           ratios=(0.5, 1.0, 2.0),
                                           anchor_scales=(8, 16, 32),
                                           rpn_min_size=16)
    if args.resume is None:
        fcis_model.extractor.init_weight()
    else:
        chainer.serializers.load_npz(args.resume, fcis_model)
    model = fcis.models.FCISTrainChain(fcis_model,
                                       n_sample=128,
                                       bg_iou_thresh_lo=0.1)
    model.to_gpu()

    # optimizer
    optimizer = chainermn.create_multi_node_optimizer(
        chainer.optimizers.MomentumSGD(lr=lr, momentum=0.9), comm)
    optimizer.setup(model)
    optimizer.add_hook(chainer.optimizer.WeightDecay(rate=0.0005))

    # disable update
    model.fcis.extractor.res1.disable_update(True, True)
    model.fcis.extractor.res2.disable_update(True, True)
    model.fcis.extractor.res3.disable_update(False, True)
    model.fcis.extractor.res4.disable_update(False, True)
    model.fcis.extractor.res5.disable_update(False, True)

    # psroi_conv1 lr
    model.fcis.head.psroi_conv1.W.update_rule.add_hook(GradientScaling(3.0))
    model.fcis.head.psroi_conv1.b.update_rule.add_hook(GradientScaling(3.0))

    # dataset
    if comm.rank == 0:
        if config.use_sbd:
            dataset_class = SBDInstanceSegmentationDataset
        else:
            dataset_class = VOCInstanceSegmentationDataset
        train_dataset = dataset_class(split='train')
        test_dataset = dataset_class(split='val')

        train_dataset = TransformDataset(
            train_dataset, Transform(model.fcis, min_size, max_size))
        test_dataset = TransformDataset(
            test_dataset, Transform(model.fcis, min_size, max_size,
                                    flip=False))
    else:
        train_dataset = None
        test_dataset = None

    train_dataset = chainermn.scatter_dataset(train_dataset,
                                              comm,
                                              shuffle=True)
    test_dataset = chainermn.scatter_dataset(test_dataset, comm, shuffle=False)

    # iterator
    train_iter = chainer.iterators.SerialIterator(train_dataset, batch_size=1)
    test_iter = chainer.iterators.SerialIterator(test_dataset,
                                                 batch_size=1,
                                                 repeat=False,
                                                 shuffle=False)
    updater = chainer.training.updater.StandardUpdater(
        train_iter,
        optimizer,
        converter=fcis.dataset.concat_examples,
        device=device)

    # interval
    if max_epoch is not None:
        max_interval = max_epoch, 'epoch'
    else:
        max_interval = max_iter, 'iteration'

    if cooldown_epoch is not None:
        cooldown_interval = cooldown_epoch, 'epoch'
    else:
        cooldown_interval = cooldown_iter, 'iteration'

    save_interval = 1, 'epoch'
    log_interval = 100, 'iteration'
    print_interval = 20, 'iteration'
    test_interval = 8, 'epoch'

    # trainer
    trainer = chainer.training.Trainer(updater, max_interval, out=out)

    # lr scheduler
    trainer.extend(chainer.training.extensions.ExponentialShift(
        'lr', lr_cooldown_factor, init=lr),
                   trigger=chainer.training.triggers.ManualScheduleTrigger(
                       *cooldown_interval))

    # evaluator
    trainer.extend(chainermn.create_multi_node_evaluator(
        chainer.training.extensions.Evaluator(
            test_iter,
            model,
            converter=fcis.dataset.concat_examples,
            device=device), comm),
                   trigger=test_interval)

    # logging
    if comm.rank == 0:
        snapshot_filename = '{}_model_iter_{{.updater.iteration}}.npz'.format(
            model.fcis.__class__.__name__)

        trainer.extend(chainer.training.extensions.snapshot_object(
            model.fcis,
            savefun=chainer.serializers.save_npz,
            filename=snapshot_filename),
                       trigger=save_interval)
        trainer.extend(chainer.training.extensions.observe_lr(),
                       trigger=log_interval)
        trainer.extend(
            chainer.training.extensions.LogReport(log_name='log.json',
                                                  trigger=log_interval))
        trainer.extend(chainer.training.extensions.PrintReport([
            'iteration',
            'epoch',
            'elapsed_time',
            'lr',
            'main/loss',
            'main/rpn_loc_loss',
            'main/rpn_cls_loss',
            'main/fcis_loc_loss',
            'main/fcis_cls_loss',
            'main/fcis_mask_loss',
            'main/rpn_acc',
            'main/fcis_cls_acc',
            'main/fcis_fg_acc',
            'validation/main/rpn_acc',
            'validation/main/fcis_cls_acc',
            'validation/main/fcis_fg_acc',
        ]),
                       trigger=print_interval)
        trainer.extend(
            chainer.training.extensions.ProgressBar(update_interval=10))
        trainer.extend(chainer.training.extensions.dump_graph('main/loss'))

    trainer.run()

    if comm.rank == 0:
        print('log is saved in {}'.format(out))
Example 18
def main():
    # Start the multiprocessing environment
    # https://docs.chainer.org/en/stable/chainermn/tutorial/tips_faqs.html#using-multiprocessiterator
    if hasattr(multiprocessing, 'set_start_method'):
        multiprocessing.set_start_method('forkserver')
        p = multiprocessing.Process()
        p.start()
        p.join()

    # Set up workspace
    # 16 GB GPU RAM for workspace
    chainer.cuda.set_max_workspace_size(16 * 1024 * 1024 * 1024)

    # Setup the multi-node environment
    comm = chainermn.create_communicator(args.communicator)
    device = comm.intra_rank
    print(
        '==> Successfully setup communicator: "{}" rank: {} device: {} size: {}'
        .format(args.communicator, comm.rank, device, comm.size))
    set_random_seed(args, device)

    # Setup LR
    if args.lr is not None:
        lr = args.lr
    else:
        lr = 0.1 * (args.batchsize * comm.size) / 256  # linear scaling rule
        if comm.rank == 0:
            print(
                'LR = {} is selected based on the linear scaling rule'.format(
                    lr))
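        # e.g., with a per-worker batchsize of 32 on 8 workers this gives
        # lr = 0.1 * (32 * 8) / 256 = 0.1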

    # Setup dataset
    train_dir = os.path.join(args.dataset_dir, 'train')
    val_dir = os.path.join(args.dataset_dir, 'val')
    label_names = datasets.directory_parsing_label_names(train_dir)
    train_data = datasets.DirectoryParsingLabelDataset(train_dir)
    val_data = datasets.DirectoryParsingLabelDataset(val_dir)
    train_data = TransformDataset(train_data, ('img', 'label'),
                                  TrainTransform(_mean, args))
    val_data = TransformDataset(val_data, ('img', 'label'),
                                ValTransform(_mean, args))
    print('==> [{}] Successfully finished loading dataset'.format(comm.rank))

    # Initializing dataset iterators
    if comm.rank == 0:
        train_indices = np.arange(len(train_data))
        val_indices = np.arange(len(val_data))
    else:
        train_indices = None
        val_indices = None

    train_indices = chainermn.scatter_dataset(train_indices,
                                              comm,
                                              shuffle=True)
    val_indices = chainermn.scatter_dataset(val_indices, comm, shuffle=True)
    train_data = train_data.slice[train_indices]
    val_data = val_data.slice[val_indices]
    train_iter = chainer.iterators.MultiprocessIterator(
        train_data, args.batchsize, n_processes=args.loaderjob)
    val_iter = iterators.MultiprocessIterator(val_data,
                                              args.batchsize,
                                              repeat=False,
                                              shuffle=False,
                                              n_processes=args.loaderjob)

    # Create the model
    kwargs = {}
    if args.first_bn_mixed16 and args.dtype == 'float16':
        print('==> Setting the first BN layer to mixed16')
        kwargs['first_bn_mixed16'] = True

    # Initialize the model
    net = models.__dict__[args.arch](n_class=len(label_names), **kwargs)
    # Following https://arxiv.org/pdf/1706.02677.pdf,
    # the gamma of the last BN of each resblock is initialized to zero.
    for l in net.links():
        if isinstance(l, Bottleneck):
            l.conv3.bn.gamma.data[:] = 0

    # Apply ada loss transform
    recorder = AdaLossRecorder(sample_per_n_iter=100)
    # Update the model to support AdaLoss
    net = AdaLossScaled(net,
                        init_scale=args.init_scale,
                        cfg={
                            'loss_scale_method': args.loss_scale_method,
                            'scale_upper_bound': args.scale_upper_bound,
                            'accum_upper_bound': args.accum_upper_bound,
                            'update_per_n_iteration':
                            args.update_per_n_iteration,
                            'recorder': recorder,
                        },
                        transforms=[
                            AdaLossTransformLinear(),
                            AdaLossTransformBottleneck(),
                            AdaLossTransformBasicBlock(),
                            AdaLossTransformConv2DBNActiv(),
                        ],
                        verbose=args.verbose)

    if comm.rank == 0:  # print the network only on the rank-0 worker
        print(net)
    net = L.Classifier(net)
    hook = AdaLossMonitor(sample_per_n_iter=100,
                          verbose=args.verbose,
                          includes=['Grad', 'Deconvolution'])

    # Setup optimizer
    optim = chainermn.create_multi_node_optimizer(
        optimizers.CorrectedMomentumSGD(lr=lr, momentum=args.momentum), comm)
    if args.dtype == 'mixed16':
        print('==> Using FP32 update for dtype=mixed16')
        optim.use_fp32_update()  # by default use fp32 update

        # HACK: support skipping update by existing loss scaling functionality
        if args.dynamic_interval is not None:
            optim.loss_scaling(interval=args.dynamic_interval, scale=None)
        else:
            optim.loss_scaling(interval=float('inf'), scale=None)
            optim._loss_scale_max = 1.0  # to prevent actual loss scaling

    optim.setup(net)

    # setup weight decay
    for param in net.params():
        if param.name not in ('beta', 'gamma'):
            param.update_rule.add_hook(WeightDecay(args.weight_decay))

    # allocate model to multiple GPUs
    if device >= 0:
        chainer.cuda.get_device(device).use()
        net.to_gpu()

    # Create an updater that implements how to update based on one train_iter input
    updater = chainer.training.StandardUpdater(train_iter,
                                               optim,
                                               device=device)
    # Setup Trainer
    stop_trigger = (args.epoch, 'epoch')
    if args.iter is not None:
        stop_trigger = (args.iter, 'iteration')
    trainer = training.Trainer(updater, stop_trigger, out=args.out)

    @make_shift('lr')
    def warmup_and_exponential_shift(trainer):
        """ LR schedule for training ResNet especially.
        NOTE: lr should be within the context.
        """
        epoch = trainer.updater.epoch_detail
        warmup_epoch = 5  # NOTE: mentioned the original ResNet paper.
        if epoch < warmup_epoch:
            if lr > 0.1:
                warmup_rate = 0.1 / lr
                rate = warmup_rate \
                    + (1 - warmup_rate) * epoch / warmup_epoch
            else:
                rate = 1
        elif epoch < 30:
            rate = 1
        elif epoch < 60:
            rate = 0.1
        elif epoch < 80:
            rate = 0.01
        else:
            rate = 0.001
        return rate * lr

    trainer.extend(warmup_and_exponential_shift)
    evaluator = chainermn.create_multi_node_evaluator(
        extensions.Evaluator(val_iter, net, device=device), comm)
    trainer.extend(evaluator, trigger=(1, 'epoch'))

    log_interval = 0.1, 'epoch'
    print_interval = 0.1, 'epoch'

    if comm.rank == 0:
        print('==========================================')
        print('Num process (COMM_WORLD): {}'.format(comm.size))
        print('Using {} communicator'.format(args.communicator))
        print('Num Minibatch-size: {}'.format(args.batchsize))
        print('Num epoch: {}'.format(args.epoch))
        print('==========================================')

        trainer.extend(chainer.training.extensions.observe_lr(),
                       trigger=log_interval)

        # NOTE: snapshots may now be triggered by iteration as well as by epoch
        snapshot_label = 'epoch' if args.iter is None else 'iteration'
        snapshot_trigger = (args.snapshot_freq, snapshot_label)
        snapshot_filename = ('snapshot_' + snapshot_label + '_{.updater.' +
                             snapshot_label + '}.npz')
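        # With epoch-based training this expands to
        # 'snapshot_epoch_{.updater.epoch}.npz'.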
        trainer.extend(extensions.snapshot(filename=snapshot_filename),
                       trigger=snapshot_trigger)

        trainer.extend(extensions.LogReport(trigger=log_interval))
        trainer.extend(extensions.observe_value(
            'loss_scale',
            lambda trainer: trainer.updater.get_optimizer('main')._loss_scale),
                       trigger=log_interval)
        trainer.extend(extensions.PrintReport([
            'iteration', 'epoch', 'elapsed_time', 'lr', 'loss_scale',
            'main/loss', 'validation/main/loss', 'main/accuracy',
            'validation/main/accuracy'
        ]),
                       trigger=print_interval)
        trainer.extend(extensions.ProgressBar(update_interval=10))

    if args.resume:
        serializers.load_npz(args.resume, trainer)

    recorder.trainer = trainer
    hook.trainer = trainer
    with ExitStack() as stack:
        if comm.rank == 0:
            stack.enter_context(hook)
        trainer.run()

    # store recorded results
    if comm.rank == 0:  # NOTE: only export on rank 0
        recorder.export().to_csv(os.path.join(args.out, 'loss_scale.csv'))
        hook.export_history().to_csv(os.path.join(args.out, 'grad_stats.csv'))
Example 19
def main():
    parser = argparse.ArgumentParser(description='ChainerMN example: MNIST')
    parser.add_argument('--batchsize',
                        '-b',
                        type=int,
                        default=100,
                        help='Number of images in each mini-batch')
    parser.add_argument('--epoch',
                        '-e',
                        type=int,
                        default=20,
                        help='Number of sweeps over the dataset to train')
    parser.add_argument('--gpu', '-g', action='store_true', help='Use GPU')
    parser.add_argument('--out',
                        '-o',
                        default='result',
                        help='Directory to output the result')
    parser.add_argument('--resume',
                        '-r',
                        default='',
                        help='Resume the training from snapshot')
    parser.add_argument('--unit',
                        '-u',
                        type=int,
                        default=1000,
                        help='Number of units')
    parser.add_argument('--np',
                        '-n',
                        type=int,
                        required=True,
                        help='Minimum number of processes')
    parser.add_argument('--bind',
                        '-p',
                        type=str,
                        required=True,
                        help='address to bind gRPC server')
    parser.add_argument('--etcd',
                        '-c',
                        type=str,
                        default='etcd://127.0.0.1:2379/train_mnist.py',
                        help='etcd location and path')
    args = parser.parse_args()

    # Prepare ChainerMN communicator.
    n = args.np
    bind = args.bind
    scale_policy = MinMaxPolicy(n, n, block=True)
    comm = None
    if args.gpu:
        from echainer import NcclCommunicator
        comm = NcclCommunicator(policy=scale_policy, bind=bind, etcd=args.etcd)
    else:
        comm = MetaCommunicator(policy=scale_policy, bind=bind, etcd=args.etcd)

    late = not comm.initial

    if comm.rank == 0:
        print('==========================================')
        print('Num process (COMM_WORLD): {}'.format(comm.size))
        if args.gpu:
            print('Using GPU ', comm.intra_rank)
        print('Num unit: {}'.format(args.unit))
        print('Num Minibatch-size: {}'.format(args.batchsize))
        print('Num epoch: {}'.format(args.epoch))
        print('==========================================')

    device = -1
    model = L.Classifier(MLP(args.unit, 10))
    if args.gpu:
        device = comm.intra_rank
        model.to_gpu(device=device)
    print('Using GPU ', device)

    # Create a multi node optimizer from a standard Chainer optimizer.
    optimizer = chainermn.create_multi_node_optimizer(
        chainer.optimizers.Adam(), comm)
    optimizer.setup(model)
    done = False
    retry = False
    while not done:
        if args.gpu and retry:
            device = comm.intra_rank
            print('Using GPU No.', comm.intra_rank)
            model.to_gpu(device=device)

            optimizer = chainermn.create_multi_node_optimizer(
                chainer.optimizers.Adam(), comm)
            optimizer.setup(model)

        # Split and distribute the dataset. Only worker 0 loads the whole dataset.
        # Datasets of worker 0 are evenly split and distributed to all workers.
        print('get dataset')
        if comm.rank == 0:
            train, test = chainer.datasets.get_mnist()
        else:
            train, test = None, None

        print('scatter dataset')
        train = chainermn.scatter_dataset(train, comm, shuffle=True)
        test = chainermn.scatter_dataset(test, comm, shuffle=True)

        print('create iterator')
        train_iter = chainer.iterators.SerialIterator(train, args.batchsize)
        test_iter = chainer.iterators.SerialIterator(test,
                                                     args.batchsize,
                                                     repeat=False,
                                                     shuffle=False)

        updater = training.StandardUpdater(train_iter,
                                           optimizer,
                                           device=device)
        trainer = training.Trainer(updater, (args.epoch, 'epoch'),
                                   out=args.out)

        # Create a multi node evaluator from a standard Chainer evaluator.
        evaluator = extensions.Evaluator(test_iter, model, device=device)
        evaluator = chainermn.create_multi_node_evaluator(evaluator, comm)
        trainer.extend(evaluator)

        trainer.extend(ReformHandler())
        trainer.extend(comm.get_monitor())
        trainer.extend(comm.get_uninitializer(), trigger=(1, 'iteration'))

        # Some display and output extensions are necessary only for one worker.
        # (Otherwise, there would just be repeated outputs.)
        if comm.rank == 0:
            trainer.extend(extensions.dump_graph('main/loss'))
            trainer.extend(echainer.extension.Lineage(comm))
            trainer.extend(
                extensions.PrintReport([
                    'epoch', 'main/loss', 'validation/main/loss',
                    'main/accuracy', 'validation/main/accuracy', 'elapsed_time'
                ],
                                       log_report='Lineage'))
            trainer.extend(extensions.ProgressBar())

        # Register extension to save trainer's progress (iteration) in communicator
        # trainer.extend(comm.get_progress_updater())

        if args.resume:
            chainer.serializers.load_npz(args.resume, trainer)

        # The optimizer state includes the model parameters as well as the
        # optimizer's own parameters.
        comm.register_state('optimizer', optimizer)
        comm.register_state('model', model)
        # Iterators are not registered: if the number of nodes changes, the
        # current position becomes wrong anyway, so recovering iterators is
        # pointless.
        # The trainer is not registered either: it is too large and it
        # includes the iterators.
        print(updater.epoch, updater.iteration)

        if retry or late:
            (iteration, epoch) = comm.fetch_state('optimizer', optimizer)
            (iteration, epoch) = comm.fetch_state('model', model)
            train_iter.epoch = epoch
            updater.iteration = iteration

        optimizers = trainer.updater.get_all_optimizers()
        # bcast again anyway
        for name in optimizers.keys():
            optimizers[name].reset_prev_params()

        try:
            print('start trainer.run(), ', trainer.updater.iteration,
                  trainer.updater.epoch)
            trainer.run()
            done = trainer._done
        except CommException as ce:
            print(">>>>>>>>>>>", ce, updater.iteration, updater.epoch)
            comm.save_all_states(updater.iteration, updater.epoch)
            # Here comm becomes ready to accept fetch-state calls; once all
            # nodes have caught up, it returns and training continues: TODO
            comm.sync_cluster(trainer.updater.get_all_optimizers())
            retry = True
            continue
        except ClusterUpdatedException as ce:
            print(">>>>>>>>>>>", ce)
            comm.save_all_states(updater.iteration, updater.epoch)
            comm.sync_cluster(trainer.updater.get_all_optimizers())
            retry = True
            continue
        except Exception as e:
            print("Unexpected >>>>>>>>>>>", e)
            break

    # TODO: this should be called cleanly, unless it runs forever somehow...
    comm.leave()
Example 20
def get_trainer(args):
    config = yaml.load(open(args.config), Loader=yaml.SafeLoader)

    # Set workspace size
    if 'max_workspace_size' in config:
        chainer.cuda.set_max_workspace_size(config['max_workspace_size'])

    # Prepare ChainerMN communicator
    if args.gpu:
        if args.communicator == 'naive':
            print("Error: 'naive' communicator does not support GPU.\n")
            exit(-1)
        comm = chainermn.create_communicator(args.communicator)
        device = comm.intra_rank
    else:
        if args.communicator != 'naive':
            print('Warning: using naive communicator '
                  'because only naive supports CPU-only execution')
        comm = chainermn.create_communicator('naive')
        device = -1

    # Show the setup information
    if comm.rank == 0:
        print('==========================================')
        print('Num process (COMM_WORLD): {}'.format(MPI.COMM_WORLD.Get_size()))
        if args.gpu:
            print('Using GPUs - max workspace size:',
                  chainer.cuda.get_max_workspace_size())
        print('Using {} communicator'.format(args.communicator))

    # Output version info
    if comm.rank == 0:
        print('Chainer version: {}'.format(chainer.__version__))
        print('ChainerMN version: {}'.format(chainermn.__version__))
        print('cuda: {}, cudnn: {}'.format(chainer.cuda.available,
                                           chainer.cuda.cudnn_enabled))

    # Create result_dir
    if args.result_dir is not None:
        config['result_dir'] = args.result_dir
        model_fn = config['model']['module'].split('.')[-1]
        sys.path.insert(0, args.result_dir)
        config['model']['module'] = model_fn
    else:
        config['result_dir'] = create_result_dir_from_config_path(args.config)
    log_fn = save_config_get_log_fn(config['result_dir'], args.config)
    if comm.rank == 0:
        print('result_dir:', config['result_dir'])

    # Instantiate model
    model = get_model_from_config(config, comm)
    if args.gpu:
        chainer.cuda.get_device(device).use()
        model.to_gpu()
    if comm.rank == 0:
        print('model:', model.__class__.__name__)

    # Initialize optimizer
    optimizer = get_optimizer_from_config(model, config)
    optimizer = chainermn.create_multi_node_optimizer(optimizer, comm)
    if comm.rank == 0:
        print('optimizer:', optimizer.__class__.__name__)

    # Setting up datasets
    if comm.rank == 0:
        train_dataset, valid_dataset = get_dataset_from_config(config)
        print('train_dataset: {}'.format(len(train_dataset)),
              train_dataset.__class__.__name__)
        print('valid_dataset: {}'.format(len(valid_dataset)),
              valid_dataset.__class__.__name__)
    else:
        train_dataset, valid_dataset = [], []
    train_dataset = chainermn.scatter_dataset(train_dataset, comm)
    valid_dataset = chainermn.scatter_dataset(valid_dataset, comm)

    # Create iterators
    # multiprocessing.set_start_method('forkserver')
    train_iter, valid_iter = create_iterators(train_dataset, valid_dataset,
                                              config)
    if comm.rank == 0:
        print('train_iter:', train_iter.__class__.__name__)
        print('valid_iter:', valid_iter.__class__.__name__)

    # Create updater and trainer
    if 'updater_creator' in config:
        updater_creator = get_updater_creator_from_config(config)
        updater = updater_creator(train_iter, optimizer, device=device)
    else:
        updater = create_updater(train_iter, optimizer, device=device)
    if comm.rank == 0:
        print('updater:', updater.__class__.__name__)

    # Create Trainer
    trainer = training.Trainer(updater,
                               config['stop_trigger'],
                               out=config['result_dir'])
    if comm.rank == 0:
        print('Trainer stops:', config['stop_trigger'])

    # Trainer extensions
    for ext in config['trainer_extension']:
        ext, values = ext.popitem()
        if ext == 'LogReport' and comm.rank == 0:
            trigger = values['trigger']
            trainer.extend(
                extensions.LogReport(trigger=trigger, log_name=log_fn))
        elif ext == 'observe_lr' and comm.rank == 0:
            trainer.extend(extensions.observe_lr(), trigger=values['trigger'])
        elif ext == 'dump_graph' and comm.rank == 0:
            trainer.extend(extensions.dump_graph(**values))
        elif ext == 'Evaluator':
            assert 'module' in values
            mod = import_module(values['module'])
            evaluator = getattr(mod, values['name'])
            if evaluator is extensions.Evaluator:
                evaluator = evaluator(valid_iter, model, device=device)
            else:
                evaluator = evaluator(valid_iter, model.predictor)
            evaluator = chainermn.create_multi_node_evaluator(evaluator, comm)
            trainer.extend(evaluator,
                           trigger=values['trigger'],
                           name=values['prefix'])
        elif ext == 'PlotReport' and comm.rank == 0:
            trainer.extend(extensions.PlotReport(**values))
        elif ext == 'PrintReport' and comm.rank == 0:
            trigger = values.pop('trigger')
            trainer.extend(extensions.PrintReport(**values), trigger=trigger)
        elif ext == 'ProgressBar' and comm.rank == 0:
            upd_int = values['update_interval']
            trigger = values['trigger']
            trainer.extend(extensions.ProgressBar(update_interval=upd_int),
                           trigger=trigger)
        elif ext == 'snapshot' and comm.rank == 0:
            filename = values['filename']
            trigger = values['trigger']
            trainer.extend(extensions.snapshot(filename=filename),
                           trigger=trigger)

    # LR decay
    if 'lr_drop_ratio' in config['optimizer'] \
            and 'lr_drop_triggers' in config['optimizer']:
        ratio = config['optimizer']['lr_drop_ratio']
        points = config['optimizer']['lr_drop_triggers']['points']
        unit = config['optimizer']['lr_drop_triggers']['unit']
        drop_trigger = triggers.ManualScheduleTrigger(points, unit)

        def lr_drop(trainer):
            trainer.updater.get_optimizer('main').lr *= ratio

        trainer.extend(lr_drop, trigger=drop_trigger)

    if 'lr_drop_poly_power' in config['optimizer']:
        power = config['optimizer']['lr_drop_poly_power']
        stop_trigger = config['stop_trigger']
        batchsize = train_iter.batch_size
        len_dataset = len(train_dataset)
        trainer.extend(PolynomialShift('lr', power, stop_trigger, batchsize,
                                       len_dataset),
                       trigger=(1, 'iteration'))

    # Resume
    if args.resume is not None:
        # fn = '{}.bak'.format(args.resume)
        # shutil.copy(args.resume, fn)
        serializers.load_npz(args.resume, trainer)
        if comm.rank == 0:
            print('Resumed from:', args.resume)

    if comm.rank == 0:
        print('==========================================')

    return trainer
Example 21
def check_mnist(gpu, display_log=True):
    epoch = 5
    batchsize = 100
    n_units = 100

    comm = chainermn.create_communicator('naive')
    if gpu:
        device = comm.intra_rank
        chainer.cuda.get_device_from_id(device).use()
    else:
        device = -1

    model = L.Classifier(MLP(n_units, 10))
    if gpu:
        model.to_gpu()

    optimizer = chainermn.create_multi_node_optimizer(
        chainer.optimizers.Adam(), comm)
    optimizer.setup(model)

    if comm.rank == 0:
        train, test = chainer.datasets.get_mnist()
    else:
        train, test = None, None

    train = chainermn.scatter_dataset(train, comm, shuffle=True)
    test = chainermn.scatter_dataset(test, comm, shuffle=True)

    train_iter = chainer.iterators.SerialIterator(train, batchsize)
    test_iter = chainer.iterators.SerialIterator(test, batchsize,
                                                 repeat=False,
                                                 shuffle=False)

    updater = training.StandardUpdater(
        train_iter,
        optimizer,
        device=device
    )

    trainer = training.Trainer(updater, (epoch, 'epoch'))

    # Wrap standard Chainer evaluators by MultiNodeEvaluator.
    evaluator = extensions.Evaluator(test_iter, model, device=device)
    evaluator = chainermn.create_multi_node_evaluator(evaluator, comm)
    trainer.extend(evaluator)

    # Add checkpointer. This is just to check checkpointing runs
    # without errors
    path = tempfile.mkdtemp(dir='/tmp', prefix=__name__ + "-tmp-")
    checkpointer = create_multi_node_checkpointer(name=__name__, comm=comm,
                                                  path=path)
    trainer.extend(checkpointer, trigger=(1, 'epoch'))

    # Some display and output extensions are necessary only for one worker.
    # (Otherwise, there would just be repeated outputs.)
    if comm.rank == 0 and display_log:
        trainer.extend(extensions.LogReport(trigger=(1, 'epoch')),
                       trigger=(1, 'epoch'))
        trainer.extend(extensions.PrintReport(['epoch',
                                               'main/loss',
                                               'validation/main/loss',
                                               'main/accuracy',
                                               'validation/main/accuracy',
                                               'elapsed_time'],
                                              out=sys.stderr),
                       trigger=(1, 'epoch'))
    trainer.run()

    acc = evaluator()['validation/main/accuracy']
    assert acc > 0.95

    # Check that the checkpointer successfully finalized the snapshot directory
    assert [] == os.listdir(path)
    os.removedirs(path)
Example 22
def main():
    parser = argparse.ArgumentParser(
        formatter_class=argparse.ArgumentDefaultsHelpFormatter)
    parser.add_argument('--model', '-m',
                        choices=['vgg16', 'resnet50', 'resnet101'],
                        default='resnet50', help='base model')
    parser.add_argument('--pooling-func', '-p',
                        choices=['pooling', 'align', 'resize'],
                        default='align', help='pooling function')
    parser.add_argument('--gpu', '-g', type=int, help='GPU id.')
    parser.add_argument('--multi-node', '-n', action='store_true',
                        help='use multi node')
    parser.add_argument('--roi-size', '-r', type=int, default=7,
                        help='roi size')
    args = parser.parse_args()

    if args.multi_node:
        import chainermn
        comm = chainermn.create_communicator('hierarchical')
        device = comm.intra_rank

        args.n_node = comm.inter_size
        args.n_gpu = comm.size
        chainer.cuda.get_device_from_id(device).use()
    else:
        if args.gpu is None:
            print('Option --gpu is required without --multi-node.',
                  file=sys.stderr)
            quit(1)
        args.n_node = 1
        args.n_gpu = 1
        chainer.cuda.get_device_from_id(args.gpu).use()
        device = args.gpu

    args.seed = 0
    now = datetime.datetime.now()
    args.timestamp = now.isoformat()
    args.out = osp.join(here, 'logs', now.strftime('%Y%m%d_%H%M%S'))

    # 0.00125 * 8 = 0.01  in original
    args.batch_size = 1 * args.n_gpu
    args.lr = 0.00125 * args.batch_size
    args.weight_decay = 0.0001

    # (180e3 * 8) / len(coco_trainval)
    args.max_epoch = (180e3 * 8) / 118287
    # lr / 10 at 120k iteration with
    # 160k iteration * 16 batchsize in original
    args.step_size = [(120e3 / 180e3) * args.max_epoch,
                      (160e3 / 180e3) * args.max_epoch]
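    # With the values above, max_epoch is roughly 12.17 epochs and the LR is
    # dropped at roughly epochs 8.1 and 10.8.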

    random.seed(args.seed)
    np.random.seed(args.seed)

    args.dataset = 'coco'
    train_data = chainer.datasets.ConcatenatedDataset(
        mrcnn.datasets.COCOInstanceSegmentationDataset('train'),
        mrcnn.datasets.COCOInstanceSegmentationDataset('valminusminival'),
    )
    test_data = mrcnn.datasets.COCOInstanceSegmentationDataset(
        'minival', use_crowd=True, return_crowd=True, return_area=True)
    class_names = test_data.class_names

    train_data = MaskRCNNDataset(train_data)
    test_data = MaskRCNNDataset(test_data)

    if args.pooling_func == 'align':
        pooling_func = mrcnn.functions.roi_align_2d
    elif args.pooling_func == 'pooling':
        pooling_func = chainer.functions.roi_pooling_2d
    elif args.pooling_func == 'resize':
        pooling_func = mrcnn.functions.crop_and_resize
    else:
        raise ValueError

    min_size = 800
    max_size = 1333
    anchor_scales = (2, 4, 8, 16, 32)

    if args.model == 'vgg16':
        mask_rcnn = mrcnn.models.MaskRCNNVGG16(
            n_fg_class=len(class_names),
            pretrained_model='imagenet',
            pooling_func=pooling_func,
            anchor_scales=anchor_scales,
            min_size=min_size,
            max_size=max_size,
            roi_size=args.roi_size,
        )
    elif args.model in ['resnet50', 'resnet101']:
        n_layers = int(args.model.lstrip('resnet'))
        mask_rcnn = mrcnn.models.MaskRCNNResNet(
            n_layers=n_layers,
            n_fg_class=len(class_names),
            pretrained_model='imagenet',
            pooling_func=pooling_func,
            anchor_scales=anchor_scales,
            min_size=min_size,
            max_size=max_size,
            roi_size=args.roi_size,
        )
    else:
        raise ValueError
    mask_rcnn.use_preset('evaluate')
    model = mrcnn.models.MaskRCNNTrainChain(mask_rcnn)
    if args.multi_node or args.gpu >= 0:
        model.to_gpu()

    optimizer = chainer.optimizers.MomentumSGD(lr=args.lr, momentum=0.9)
    if args.multi_node:
        optimizer = chainermn.create_multi_node_optimizer(optimizer, comm)
    optimizer.setup(model)
    optimizer.add_hook(chainer.optimizer.WeightDecay(rate=args.weight_decay))

    if args.model in ['resnet50', 'resnet101']:
        model.mask_rcnn.extractor.mode = 'res3+'
        mask_rcnn.extractor.conv1.disable_update()
        mask_rcnn.extractor.bn1.disable_update()
        mask_rcnn.extractor.res2.disable_update()

    train_data = chainer.datasets.TransformDataset(
        train_data, mrcnn.datasets.MaskRCNNTransform(mask_rcnn))
    test_data = chainer.datasets.TransformDataset(
        test_data, mrcnn.datasets.MaskRCNNTransform(mask_rcnn, train=False))
    if args.multi_node:
        if comm.rank != 0:
            train_data = None
            test_data = None
        train_data = chainermn.scatter_dataset(train_data, comm, shuffle=True)
        test_data = chainermn.scatter_dataset(test_data, comm)

    # FIXME: MultiProcessIterator sometimes hangs
    train_iter = chainer.iterators.SerialIterator(
        train_data, batch_size=1)
    test_iter = chainer.iterators.SerialIterator(
        test_data, batch_size=1, repeat=False, shuffle=False)

    updater = chainer.training.updater.StandardUpdater(
        train_iter, optimizer, device=device,
        converter=mrcnn.datasets.concat_examples)

    trainer = training.Trainer(
        updater, (args.max_epoch, 'epoch'), out=args.out)

    trainer.extend(
        extensions.ExponentialShift('lr', 0.1),
        trigger=training.triggers.ManualScheduleTrigger(
            args.step_size, 'epoch'))

    eval_interval = 1, 'epoch'
    log_interval = 20, 'iteration'
    plot_interval = 0.1, 'epoch'
    print_interval = 20, 'iteration'

    evaluator = mrcnn.extensions.InstanceSegmentationCOCOEvaluator(
        test_iter, model.mask_rcnn, device=device, label_names=class_names)
    if args.multi_node:
        evaluator = chainermn.create_multi_node_evaluator(evaluator, comm)
    trainer.extend(evaluator, trigger=eval_interval)

    if not args.multi_node or comm.rank == 0:
        trainer.extend(
            extensions.snapshot_object(
                model.mask_rcnn, 'snapshot_model.npz'),
            trigger=training.triggers.MaxValueTrigger(
                'validation/main/map', eval_interval))

        args.git_hash = mrcnn.utils.git_hash()
        args.hostname = socket.gethostname()
        trainer.extend(fcn.extensions.ParamsReport(args.__dict__))
        trainer.extend(
            mrcnn.extensions.InstanceSegmentationVisReport(
                test_iter, model.mask_rcnn,
                label_names=class_names),
            trigger=eval_interval)
        trainer.extend(chainer.training.extensions.observe_lr(),
                       trigger=log_interval)
        trainer.extend(extensions.LogReport(trigger=log_interval))
        trainer.extend(extensions.PrintReport(
            ['iteration', 'epoch', 'elapsed_time', 'lr',
             'main/loss',
             'main/roi_loc_loss',
             'main/roi_cls_loss',
             'main/roi_mask_loss',
             'main/rpn_loc_loss',
             'main/rpn_cls_loss',
             'validation/main/map']), trigger=print_interval)
        trainer.extend(extensions.ProgressBar(update_interval=10))

        # plot
        assert extensions.PlotReport.available()
        trainer.extend(
            extensions.PlotReport(
                ['main/loss',
                 'main/roi_loc_loss',
                 'main/roi_cls_loss',
                 'main/roi_mask_loss',
                 'main/rpn_loc_loss',
                 'main/rpn_cls_loss'],
                file_name='loss.png', trigger=plot_interval
            ),
            trigger=plot_interval,
        )
        trainer.extend(
            extensions.PlotReport(
                ['validation/main/map'],
                file_name='accuracy.png', trigger=plot_interval
            ),
            trigger=eval_interval,
        )

        trainer.extend(extensions.dump_graph('main/loss'))

    trainer.run()
Example 23
def main():
    parser = argparse.ArgumentParser()
    parser.add_argument('--config_path',
                        type=str,
                        default='configs/base.yml',
                        help='path to config file')
    parser.add_argument('--results_dir',
                        type=str,
                        default='./result/',
                        help='directory to save the results to')
    parser.add_argument('--resume',
                        type=str,
                        default='',
                        help='path to the snapshot')
    parser.add_argument('--process_num', type=int, default=0)
    parser.add_argument('--seed', type=int, default=42)

    args = parser.parse_args()
    config = yaml_utils.Config(
        yaml.load(open(args.config_path), Loader=yaml.SafeLoader))
    pattern = "-".join([
        config.pattern, config.models['classifier']['name'],
        config.dataset['dataset_name']
    ])
    comm = chainermn.create_communicator()
    device = comm.intra_rank

    if comm.rank == 0:
        print('==========================================')
        print('Num process (COMM_WORLD): {}'.format(MPI.COMM_WORLD.Get_size()))
        print('Num Minibatch-size: {}'.format(config.batchsize))
        print('Num Epoch: {}'.format(config.epoch))
        print('==========================================')

    # Model
    classifier = load_models(config.models['classifier'])

    if args.resume:
        print("Resume training with snapshot:{}".format(args.resume))
        chainer.serializers.load_npz(args.resume, classifier)

    chainer.cuda.get_device_from_id(device).use()
    classifier.to_gpu()
    # models = {"classifier": classifier}

    # Optimizer
    opt = make_optimizer(classifier, comm, config)
    opt.add_hook(chainer.optimizer.WeightDecay(5e-4))

    # Dataset
    if comm.rank == 0:
        dataset = yaml_utils.load_dataset(config)
        first_size = int(len(dataset) * config.train_val_split_ratio)
        train, val = chainer.datasets.split_dataset_random(dataset,
                                                           first_size,
                                                           seed=args.seed)
    else:
        yaml_utils.load_module(config.dataset['dataset_func'],
                               config.dataset['dataset_name'])
        train, val = None, None

    train = chainermn.scatter_dataset(train, comm)
    val = chainermn.scatter_dataset(val, comm)

    # Iterator
    train_iterator = chainer.iterators.SerialIterator(train, config.batchsize)
    val_iterator = chainer.iterators.SerialIterator(val,
                                                    config.batchsize,
                                                    repeat=False,
                                                    shuffle=False)
    kwargs = config.updater['args'] if 'args' in config.updater else {}
    kwargs.update({
        'classifier': classifier,
        'iterator': train_iterator,
        'optimizer': opt,
        'device': device,
    })

    # Updater
    updater = yaml_utils.load_updater_class(config)
    updater = updater(**kwargs)
    out = args.results_dir + '/' + pattern

    if comm.rank == 0:
        create_result_dir(out, args.config_path, config)

    # Trainer
    trainer = training.Trainer(updater, (config.epoch, 'epoch'), out=out)

    # Evaluator
    evaluator = ClassifierEvaluator(val_iterator, classifier, device=device)
    evaluator = chainermn.create_multi_node_evaluator(evaluator, comm)
    trainer.extend(evaluator)

    # Learning Rate Schedule (fixed)
    schedule = [config.epoch * 0.3, config.epoch * 0.6, config.epoch * 0.8]
    trainer.extend(extensions.ExponentialShift('lr', 0.1),
                   trigger=ManualScheduleTrigger(schedule, 'epoch'))
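    # e.g., with config.epoch == 100 the learning rate is multiplied by 0.1 at
    # epochs 30, 60, and 80.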

    report_keys = [
        'epoch', 'main/loss', 'validation/main/loss', 'main/accuracy',
        'validation/main/accuracy', 'elapsed_time'
    ]
    if comm.rank == 0:
        # Set up logging
        trainer.extend(extensions.snapshot_object(
            classifier, 'classifier{}.npz'.format(args.process_num)),
                       trigger=MaxValueTrigger('validation/main/accuracy'))
        trainer.extend(
            extensions.LogReport(keys=report_keys,
                                 trigger=(config.display_interval, 'epoch')))
        trainer.extend(extensions.PrintReport(report_keys),
                       trigger=(config.display_interval, 'epoch'))
        trainer.extend(
            extensions.ProgressBar(
                update_interval=config.progressbar_interval))
    # Run the training
    trainer.run()
Example 24
def main():
    parser = argparse.ArgumentParser(
        description='ChainerMN example: pipelined neural network')
    parser.add_argument('--batchsize', '-b', type=int, default=100,
                        help='Number of images in each mini-batch')
    parser.add_argument('--epoch', '-e', type=int, default=20,
                        help='Number of sweeps over the dataset to train')
    parser.add_argument('--gpu', '-g', action='store_true',
                        help='Use GPU')
    parser.add_argument('--out', '-o', default='result',
                        help='Directory to output the result')
    parser.add_argument('--unit', '-u', type=int, default=1000,
                        help='Number of units')
    args = parser.parse_args()

    # Prepare ChainerMN communicator.
    if args.gpu:
        comm = chainermn.create_communicator('hierarchical')
        data_axis, model_axis = comm.rank % 2, comm.rank // 2
        data_comm = comm.split(data_axis, comm.rank)
        model_comm = comm.split(model_axis, comm.rank)
        device = comm.intra_rank
    else:
        comm = chainermn.create_communicator('naive')
        data_axis, model_axis = comm.rank % 2, comm.rank // 2
        data_comm = comm.split(data_axis, comm.rank)
        model_comm = comm.split(model_axis, comm.rank)
        device = -1
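    # comm.split groups ranks by the given color: ranks in the same consecutive
    # pair (same model_axis) form a 2-process model-parallel communicator,
    # while ranks with the same data_axis form the data-parallel communicator
    # used by the optimizer and the evaluator below.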

    if model_comm.size != 2:
        raise ValueError(
            'This example can only be executed on an even number '
            'of processes.')

    if comm.rank == 0:
        print('==========================================')
        if args.gpu:
            print('Using GPUs')
        print('Num unit: {}'.format(args.unit))
        print('Num Minibatch-size: {}'.format(args.batchsize))
        print('Num epoch: {}'.format(args.epoch))
        print('==========================================')

    if data_axis == 0:
        model = L.Classifier(MLP0(model_comm, args.unit))
    elif data_axis == 1:
        model = MLP1(model_comm, args.unit, 10)

    if device >= 0:
        chainer.cuda.get_device_from_id(device).use()
        model.to_gpu()

    optimizer = chainermn.create_multi_node_optimizer(
        chainer.optimizers.Adam(), data_comm)
    optimizer.setup(model)

    # Original dataset on worker 0 and 1.
    # Datasets of worker 0 and 1 are split and distributed to all workers.
    if model_axis == 0:
        train, test = chainer.datasets.get_mnist()
        if data_axis == 1:
            train = chainermn.datasets.create_empty_dataset(train)
            test = chainermn.datasets.create_empty_dataset(test)
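            # create_empty_dataset keeps the number of examples but replaces
            # each one with an empty tuple; this stage receives its actual
            # inputs from MLP0 through model_comm.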
    else:
        train, test = None, None
    train = chainermn.scatter_dataset(train, data_comm, shuffle=True)
    test = chainermn.scatter_dataset(test, data_comm, shuffle=True)

    train_iter = chainer.iterators.SerialIterator(
        train, args.batchsize, shuffle=False)
    test_iter = chainer.iterators.SerialIterator(
        test, args.batchsize, repeat=False, shuffle=False)

    updater = training.StandardUpdater(train_iter, optimizer, device=device)
    trainer = training.Trainer(updater, (args.epoch, 'epoch'), out=args.out)
    evaluator = extensions.Evaluator(test_iter, model, device=device)
    evaluator = chainermn.create_multi_node_evaluator(evaluator, data_comm)
    trainer.extend(evaluator)

    # Some display and output extensions are necessary only for worker 0.
    if comm.rank == 0:
        trainer.extend(extensions.DumpGraph('main/loss'))
        trainer.extend(extensions.LogReport())
        trainer.extend(extensions.PrintReport(
            ['epoch', 'main/loss', 'validation/main/loss',
             'main/accuracy', 'validation/main/accuracy', 'elapsed_time']))
        trainer.extend(extensions.ProgressBar())

    trainer.run()
Example 25
def main():
    parser = argparse.ArgumentParser(
        formatter_class=argparse.ArgumentDefaultsHelpFormatter)
    parser.add_argument('dataset',
                        choices=['real', 'synthetic'],
                        help='The dataset.')
    parser.add_argument('--model',
                        '-m',
                        choices=['vgg16', 'resnet50', 'resnet101'],
                        default='resnet50',
                        help='Base model of Mask R-CNN.')
    parser.add_argument('--pooling-func',
                        '-pf',
                        choices=['pooling', 'align', 'resize'],
                        default='align',
                        help='Pooling function.')
    parser.add_argument('--gpu', '-g', type=int, help='GPU id.')
    parser.add_argument('--multi-node',
                        '-mn',
                        action='store_true',
                        help='use multi node')
    parser.add_argument('--max-epoch',
                        type=float,
                        help='Epoch (default: 12.17)')
    args = parser.parse_args()

    if args.multi_node:
        import chainermn
        comm = chainermn.create_communicator('hierarchical')
        device = comm.intra_rank

        args.n_node = comm.inter_size
        args.n_gpu = comm.size
        chainer.cuda.get_device_from_id(device).use()
    else:
        args.n_node = 1
        args.n_gpu = 1
        chainer.cuda.get_device_from_id(args.gpu).use()
        device = args.gpu

    args.seed = 0
    now = datetime.datetime.now()
    args.timestamp = now.isoformat()
    args.out = osp.join(here, 'logs/train_mrcnn',
                        now.strftime('%Y%m%d_%H%M%S'))

    # 0.00125 * 8 = 0.01  in original
    args.batch_size = 1 * args.n_gpu
    args.lr = 0.00125 * args.batch_size
    args.weight_decay = 0.0001

    if args.max_epoch is None:
        # (180e3 * 8) / len(coco_trainval)
        args.max_epoch = (180e3 * 8) / 118287
    # lr / 10 at 120k iteration with
    # 160k iteration * 16 batchsize in original
    args.step_size = [(120e3 / 180e3) * args.max_epoch,
                      (160e3 / 180e3) * args.max_epoch]
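    # With the default max_epoch of about 12.17, the two lr drops land near
    # epochs 8.1 and 10.8, mirroring the original 120k/160k-iteration schedule.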

    random.seed(args.seed)
    np.random.seed(args.seed)

    # Default Config
    min_size = 600
    max_size = 1000
    anchor_scales = [4, 8, 16, 32]
    proposal_creator_params = dict(
        n_train_pre_nms=12000,
        n_train_post_nms=2000,
        n_test_pre_nms=6000,
        n_test_post_nms=1000,
        min_size=0,
    )

    # if args.dataset == 'voc':
    #     train_data = mrcnn.datasets.SBDInstanceSeg('train')
    #     test_data = mrcnn.datasets.VOC2012InstanceSeg('val')
    # elif args.dataset == 'coco':
    #     train_data = chainer.datasets.ConcatenatedDataset(
    #         mrcnn.datasets.CocoInstanceSeg('train'),
    #         mrcnn.datasets.CocoInstanceSeg('valminusminival'),
    #     )
    #     test_data = mrcnn.datasets.CocoInstanceSeg('minival')
    #     train_data.class_names = test_data.class_names
    #     min_size = 800
    #     max_size = 1333
    # else:
    #     raise ValueError
    # instance_class_names = train_data.class_names[1:]
    # train_data = mrcnn.datasets.MaskRcnnDataset(train_data)
    # test_data = mrcnn.datasets.MaskRcnnDataset(test_data)

    if args.dataset == 'real':
        train_data = contrib.datasets.ARC2017RealInstancesDataset(
            'train', aug='standard')
    elif args.dataset == 'synthetic':
        train_data = contrib.datasets.ARC2017SyntheticInstancesDataset(
            do_aug=True, aug_level='all')
    else:
        raise ValueError
    test_data = contrib.datasets.ARC2017RealInstancesDataset('test')
    instance_class_names = train_data.class_names[1:]
    train_data = MaskRcnnDataset(train_data)
    test_data = MaskRcnnDataset(test_data)

    if args.pooling_func == 'align':
        pooling_func = mrcnn.functions.roi_align_2d
    elif args.pooling_func == 'pooling':
        pooling_func = chainer.functions.roi_pooling_2d
    elif args.pooling_func == 'resize':
        pooling_func = mrcnn.functions.crop_and_resize
    else:
        raise ValueError

    if args.model == 'vgg16':
        mask_rcnn = mrcnn.models.MaskRCNNVGG16(
            n_fg_class=len(instance_class_names),
            pretrained_model='imagenet',
            pooling_func=pooling_func,
            anchor_scales=anchor_scales,
            proposal_creator_params=proposal_creator_params,
            min_size=min_size,
            max_size=max_size)
    elif args.model in ['resnet50', 'resnet101']:
        n_layers = int(args.model.lstrip('resnet'))
        mask_rcnn = mrcnn.models.MaskRCNNResNet(
            n_layers=n_layers,
            n_fg_class=len(instance_class_names),
            pretrained_model='imagenet',
            pooling_func=pooling_func,
            anchor_scales=anchor_scales,
            proposal_creator_params=proposal_creator_params,
            min_size=min_size,
            max_size=max_size)
    else:
        raise ValueError
    mask_rcnn.use_preset('evaluate')
    model = mrcnn.models.MaskRCNNTrainChain(
        mask_rcnn,
        proposal_target_creator=mrcnn.utils.ProposalTargetCreator(
            n_sample=512),
    )
    if args.multi_node or (args.gpu is not None and args.gpu >= 0):
        model.to_gpu()

    optimizer = chainer.optimizers.MomentumSGD(lr=args.lr, momentum=0.9)
    if args.multi_node:
        optimizer = chainermn.create_multi_node_optimizer(optimizer, comm)
    optimizer.setup(model)
    optimizer.add_hook(chainer.optimizer.WeightDecay(rate=args.weight_decay))

    if args.model in ['resnet50', 'resnet101']:
        model.mask_rcnn.extractor.mode = 'res3+'
        mask_rcnn.extractor.conv1.disable_update()
        mask_rcnn.extractor.bn1.disable_update()
        mask_rcnn.extractor.res2.disable_update()
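        # Train only from the res3 block upward; the stem (conv1, bn1) and
        # res2 keep their ImageNet-pretrained weights frozen.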

    train_data = chainer.datasets.TransformDataset(
        train_data, mrcnn.datasets.MaskRCNNTransform(mask_rcnn))
    test_data = chainer.datasets.TransformDataset(
        test_data, mrcnn.datasets.MaskRCNNTransform(mask_rcnn, train=False))
    if args.multi_node:
        if comm.rank != 0:
            train_data = None
            test_data = None
        train_data = chainermn.scatter_dataset(train_data, comm, shuffle=True)
        test_data = chainermn.scatter_dataset(test_data, comm)

    train_iter = chainer.iterators.MultiprocessIterator(train_data,
                                                        batch_size=1,
                                                        n_prefetch=4,
                                                        shared_mem=10**8)
    test_iter = chainer.iterators.MultiprocessIterator(test_data,
                                                       batch_size=1,
                                                       n_prefetch=4,
                                                       shared_mem=10**8,
                                                       repeat=False,
                                                       shuffle=False)

    updater = chainer.training.updater.StandardUpdater(
        train_iter,
        optimizer,
        device=device,
        converter=mrcnn.datasets.concat_examples)

    trainer = training.Trainer(updater, (args.max_epoch, 'epoch'),
                               out=args.out)

    trainer.extend(extensions.ExponentialShift('lr', 0.1),
                   trigger=training.triggers.ManualScheduleTrigger(
                       args.step_size, 'epoch'))
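    # ExponentialShift multiplies 'lr' by 0.1 each time its trigger fires, and
    # ManualScheduleTrigger fires once at each epoch in args.step_size, so the
    # learning rate is decayed by 10x twice during training.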

    eval_interval = 1, 'epoch'
    log_interval = 20, 'iteration'
    plot_interval = 0.1, 'epoch'
    print_interval = 20, 'iteration'

    evaluator = mrcnn.extensions.InstanceSegmentationVOCEvaluator(
        test_iter,
        model.mask_rcnn,
        device=device,
        use_07_metric=True,
        label_names=instance_class_names)
    if args.multi_node:
        evaluator = chainermn.create_multi_node_evaluator(evaluator, comm)
    trainer.extend(evaluator, trigger=eval_interval)

    if not args.multi_node or comm.rank == 0:
        trainer.extend(extensions.snapshot_object(model.mask_rcnn,
                                                  'snapshot_model.npz'),
                       trigger=training.triggers.MaxValueTrigger(
                           'validation/main/map', eval_interval))
        args.git_hash = mrcnn.utils.git_hash()
        args.hostname = socket.gethostname()
        trainer.extend(fcn.extensions.ParamsReport(args.__dict__))
        trainer.extend(mrcnn.extensions.InstanceSegmentationVisReport(
            test_iter, model.mask_rcnn, label_names=instance_class_names),
                       trigger=eval_interval)
        trainer.extend(chainer.training.extensions.observe_lr(),
                       trigger=log_interval)
        trainer.extend(extensions.LogReport(trigger=log_interval))
        trainer.extend(extensions.PrintReport([
            'iteration', 'epoch', 'elapsed_time', 'lr', 'main/loss',
            'main/roi_loc_loss', 'main/roi_cls_loss', 'main/roi_mask_loss',
            'main/rpn_loc_loss', 'main/rpn_cls_loss', 'validation/main/map'
        ]),
                       trigger=print_interval)
        trainer.extend(extensions.ProgressBar(update_interval=10))

        # plot
        assert extensions.PlotReport.available()
        trainer.extend(
            extensions.PlotReport([
                'main/loss',
                'main/roi_loc_loss',
                'main/roi_cls_loss',
                'main/roi_mask_loss',
                'main/rpn_loc_loss',
                'main/rpn_cls_loss',
            ],
                                  file_name='loss.png',
                                  trigger=plot_interval),
            trigger=plot_interval,
        )
        trainer.extend(
            extensions.PlotReport(['validation/main/map'],
                                  file_name='accuracy.png',
                                  trigger=plot_interval),
            trigger=eval_interval,
        )

        trainer.extend(extensions.dump_graph('main/loss'))

    trainer.run()
Esempio n. 26
0
def main():
    parser = argparse.ArgumentParser(description='ChainerMN example: MNIST')
    parser.add_argument('--batchsize', '-b', type=int, default=100,
                        help='Number of images in each mini-batch')
    parser.add_argument('--communicator', type=str,
                        default='hierarchical', help='Type of communicator')
    parser.add_argument('--epoch', '-e', type=int, default=20,
                        help='Number of sweeps over the dataset to train')
    parser.add_argument('--gpu', '-g', action='store_true',
                        help='Use GPU')
    parser.add_argument('--out', '-o', default='result',
                        help='Directory to output the result')
    parser.add_argument('--resume', '-r', default='',
                        help='Resume the training from snapshot')
    parser.add_argument('--unit', '-u', type=int, default=1000,
                        help='Number of units')
    parser.add_argument('--loaderjob', '-j', type=int,
                        help='Number of parallel data loading processes')
    parser.add_argument('--benchmark', action='store_true',
                        help='benchmark mode')
    parser.add_argument('--benchmark-iteration', type=int, default=500,
                        help='the number of iterations when using benchmark mode')
    parser.add_argument('--cprofile', action='store_true', help='cprofile')
    args = parser.parse_args()

    multiprocessing.set_start_method('forkserver')
    p = multiprocessing.Process(target=dummy_func, args=())
    p.start()
    p.join()
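    # The forkserver start method (plus one dummy child spawned before MPI is
    # initialized) avoids the crashes that forking can cause under
    # InfiniBand-aware MPI when MultiprocessIterator is used.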

    # Prepare ChainerMN communicator.
    if args.gpu:
        if args.communicator == 'naive':
            print("Error: 'naive' communicator does not support GPU.\n")
            exit(-1)
        comm = chainermn.create_communicator(args.communicator)
        device = comm.intra_rank
    else:
        if args.communicator != 'naive':
            print('Warning: using naive communicator '
                  'because only naive supports CPU-only execution')
        comm = chainermn.create_communicator('naive')
        device = -1

    if comm.rank == 0:
        print('==========================================')
        print('Num process (COMM_WORLD): {}'.format(comm.size))
        if args.gpu:
            print('Using GPUs')
        print('Using {} communicator'.format(args.communicator))
        print('Num unit: {}'.format(args.unit))
        print('Num Minibatch-size: {}'.format(args.batchsize))
        print('Num epoch: {}'.format(args.epoch))
        print('==========================================')

    model = L.Classifier(MLP(args.unit, 10))
    if device >= 0:
        chainer.cuda.get_device_from_id(device).use()
        model.to_gpu()

    # Create a multi node optimizer from a standard Chainer optimizer.
    optimizer = chainermn.create_multi_node_optimizer(
        chainer.optimizers.Adam(), comm)
    optimizer.setup(model)
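    # The multi node optimizer all-reduces gradients across all workers before
    # each update, so the run behaves like single-process training with an
    # effective batch size of args.batchsize * comm.size.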

    # Split and distribute the dataset. Only worker 0 loads the whole dataset.
    # Datasets of worker 0 are evenly split and distributed to all workers.
    if comm.rank == 0:
        train, test = chainer.datasets.get_mnist()
    else:
        train, test = None, None
    train = chainermn.scatter_dataset(train, comm, shuffle=True)
    test = chainermn.scatter_dataset(test, comm, shuffle=True)

    train_iter = chainer.iterators.MultiprocessIterator(
        train, args.batchsize, n_processes=args.loaderjob,
        n_prefetch=args.loaderjob)
    test_iter = chainer.iterators.MultiprocessIterator(
        test, args.batchsize, repeat=False, n_processes=args.loaderjob,
        n_prefetch=args.loaderjob)

    updater = training.StandardUpdater(train_iter, optimizer, device=device)
    if args.benchmark:
        stop_trigger = (args.benchmark_iteration, 'iteration')
    else:
        stop_trigger = (args.epoch, 'epoch')
    trainer = training.Trainer(updater, stop_trigger, out=args.out)

    # Create a multi node evaluator from a standard Chainer evaluator.
    evaluator = extensions.Evaluator(test_iter, model, device=device)
    evaluator = chainermn.create_multi_node_evaluator(evaluator, comm)
    trainer.extend(evaluator)

    # Some display and output extensions are necessary only for one worker.
    # (Otherwise, there would just be repeated outputs.)
    if comm.rank == 0:
        if args.benchmark:
            trainer.extend(extensions.LogReport(), trigger=stop_trigger)
        else:
            trainer.extend(extensions.dump_graph('main/loss'))
            trainer.extend(extensions.LogReport())
            trainer.extend(extensions.PrintReport(
                ['epoch', 'main/loss', 'validation/main/loss',
                 'main/accuracy', 'validation/main/accuracy', 'elapsed_time']))
            trainer.extend(extensions.ProgressBar())

            if args.resume:
                chainer.serializers.load_npz(args.resume, trainer)

    if args.cprofile:
        pr = cProfile.Profile()
        pr.enable()

    trainer.run()

    if args.cprofile:
        pr.disable()
        s = io.StringIO()
        sort_by = 'tottime'
        ps = pstats.Stats(pr, stream=s).sort_stats(sort_by)
        ps.print_stats()
        if comm.rank == 0:
            print(s.getvalue())
        pr.dump_stats('{0}/rank_{1}.cprofile'.format(args.out, comm.rank))
Esempio n. 27
0
def main():
    parser = argparse.ArgumentParser(description='Chainer example: seq2seq')
    parser.add_argument('--batchsize', '-b', type=int, default=64,
                        help='Number of images in each mini-batch')
    parser.add_argument('--bleu', action='store_true', default=False,
                        help='Report BLEU score')
    parser.add_argument('--gpu', '-g', action='store_true',
                        help='Use GPU')
    parser.add_argument('--cache', '-c', default=None,
                        help='Directory to cache pre-processed dataset')
    parser.add_argument('--resume', '-r', default='',
                        help='Resume the training from snapshot')
    parser.add_argument('--unit', '-u', type=int, default=1024,
                        help='Number of units')
    parser.add_argument('--communicator', default='hierarchical',
                        help='Type of communicator')
    parser.add_argument('--stop', '-s', type=str, default='15e',
                        help='Stop trigger (ex. "500i", "15e")')
    parser.add_argument('--input', '-i', type=str, default='wmt',
                        help='Input directory')
    parser.add_argument('--optimizer', type=str, default='adam()',
                        help='Optimizer and its argument')
    parser.add_argument('--out', '-o', default='result',
                        help='Directory to output the result')
    args = parser.parse_args()

    # Prepare ChainerMN communicator
    if args.gpu:
        comm = chainermn.create_communicator('hierarchical')
        dev = comm.intra_rank
    else:
        comm = chainermn.create_communicator('naive')
        dev = -1

    if comm.rank == 0:
        print('==========================================')
        print('Num process (COMM_WORLD): {}'.format(comm.size))
        if args.gpu:
            print('Using GPUs')
        print('Using {} communicator'.format(args.communicator))
        print('Num unit: {}'.format(args.unit))
        print('Num Minibatch-size: {}'.format(args.batchsize))
        print('==========================================')

    # Rank 0 prepares all data
    if comm.rank == 0:
        if args.cache and not os.path.exists(args.cache):
            os.mkdir(args.cache)

        # Read source data
        bt = time.time()
        if args.cache:
            cache_file = os.path.join(args.cache, 'source.pickle')
            source_vocab, source_data = cached_call(cache_file,
                                                    read_source,
                                                    args.input, args.cache)
        else:
            source_vocab, source_data = read_source(args.input, args.cache)
        et = time.time()
        print('RD source done. {:.3f} [s]'.format(et - bt))
        sys.stdout.flush()

        # Read target data
        bt = time.time()
        if args.cache:
            cache_file = os.path.join(args.cache, 'target.pickle')
            target_vocab, target_data = cached_call(cache_file,
                                                    read_target,
                                                    args.input, args.cache)
        else:
            target_vocab, target_data = read_target(args.input, args.cache)
        et = time.time()
        print('RD target done. {:.3f} [s]'.format(et - bt))
        sys.stdout.flush()

        print('Original training data size: %d' % len(source_data))
        train_data = [(s, t)
                      for s, t in six.moves.zip(source_data, target_data)
                      if 0 < len(s) < 50 and 0 < len(t) < 50]
        print('Filtered training data size: %d' % len(train_data))

        en_path = os.path.join(args.input, 'dev', 'newstest2013.en')
        source_data = europal.make_dataset(en_path, source_vocab)
        fr_path = os.path.join(args.input, 'dev', 'newstest2013.fr')
        target_data = europal.make_dataset(fr_path, target_vocab)
        assert len(source_data) == len(target_data)
        test_data = [(s, t) for s, t
                     in six.moves.zip(source_data, target_data)
                     if 0 < len(s) and 0 < len(t)]

        source_ids = {word: index
                      for index, word in enumerate(source_vocab)}
        target_ids = {word: index
                      for index, word in enumerate(target_vocab)}
    else:
        # target_data, source_data = None, None
        train_data, test_data = None, None
        target_ids, source_ids = None, None

    # Print GPU id
    for i in range(0, comm.size):
        if comm.rank == i:
            print('Rank {} GPU: {}'.format(comm.rank, dev))
        sys.stdout.flush()
        comm.mpi_comm.Barrier()

    # Broadcast the word -> id dictionaries from rank 0 to all workers.
    source_ids = comm.bcast_obj(source_ids, root=0)
    target_ids = comm.bcast_obj(target_ids, root=0)

    target_words = {i: w for w, i in target_ids.items()}
    source_words = {i: w for w, i in source_ids.items()}

    if comm.rank == 0:
        print('target_words : {}'.format(len(target_words)))
        print('source_words : {}'.format(len(source_words)))

    model = Seq2seq(3, len(source_ids), len(target_ids), args.unit)

    if dev >= 0:
        chainer.cuda.get_device_from_id(dev).use()
        model.to_gpu(dev)

    # determine the stop trigger
    m = re.match(r'^(\d+)e$', args.stop)
    if m:
        trigger = (int(m.group(1)), 'epoch')
    else:
        m = re.match(r'^(\d+)i$', args.stop)
        if m:
            trigger = (int(m.group(1)), 'iteration')
        else:
            if comm.rank == 0:
                sys.stderr.write('Error: unknown stop trigger: {}'.format(
                    args.stop))
            exit(-1)

    if comm.rank == 0:
        print('Trigger: {}'.format(trigger))

    optimizer = chainermn.create_multi_node_optimizer(
        create_optimizer(args.optimizer), comm)
    optimizer.setup(model)

    # Broadcast dataset
    # Sanity check of train_data
    train_data = chainermn.scatter_dataset(train_data, comm)

    test_data = chainermn.scatter_dataset(test_data, comm)

    train_iter = chainer.iterators.SerialIterator(train_data,
                                                  args.batchsize,
                                                  shuffle=False)
    updater = training.StandardUpdater(
        train_iter, optimizer, converter=convert, device=dev)
    trainer = training.Trainer(updater,
                               trigger,
                               out=args.out)

    trainer.extend(chainermn.create_multi_node_evaluator(
        BleuEvaluator(model, test_data, device=dev, comm=comm),
        comm))

    def translate_one(source, target):
        words = europal.split_sentence(source)
        print('# source : ' + ' '.join(words))
        x = model.xp.array(
            [source_ids.get(w, 1) for w in words], numpy.int32)
        ys = model.translate([x])[0]
        words = [target_words[y] for y in ys]
        print('#  result : ' + ' '.join(words))
        print('#  expect : ' + target)

    # @chainer.training.make_extension(trigger=(200, 'iteration'))
    def translate(trainer):
        translate_one(
            'Who are we ?',
            'Qui sommes-nous?')
        translate_one(
            'And it often costs over a hundred dollars ' +
            'to obtain the required identity card .',
            'Or, il en coûte souvent plus de cent dollars ' +
            'pour obtenir la carte d\'identité requise.')

        source, target = test_data[numpy.random.choice(len(test_data))]
        source = ' '.join([source_words.get(i, '') for i in source])
        target = ' '.join([target_words.get(i, '') for i in target])
        translate_one(source, target)
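    # NOTE: translate() is registered as a trainer extension only if the
    # make_extension decorator above is uncommented; as written it is unused.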

    if comm.rank == 0:
        trainer.extend(extensions.LogReport(trigger=(1, 'epoch')),
                       trigger=(1, 'epoch'))

        report = extensions.PrintReport(['epoch',
                                         'iteration',
                                         'main/loss',
                                         'main/perp',
                                         'validation/main/bleu',
                                         'elapsed_time'])
        trainer.extend(report, trigger=(1, 'epoch'))

    comm.mpi_comm.Barrier()
    if comm.rank == 0:
        print('start training')
        sys.stdout.flush()

    trainer.run()
Esempio n. 28
0
def main():
    parser = argparse.ArgumentParser(
        formatter_class=argparse.ArgumentDefaultsHelpFormatter, )
    parser.add_argument("--batchsize", type=int, default=16, help="batch size")
    parser.add_argument("--out", default="logs", help="logs")
    parser.add_argument("--resume", help="resume")
    args = parser.parse_args()

    # https://docs.chainer.org/en/stable/chainermn/tutorial/tips_faqs.html#using-multiprocessiterator
    if hasattr(multiprocessing, "set_start_method"):
        multiprocessing.set_start_method("forkserver")
        p = multiprocessing.Process()
        p.start()
        p.join()

    comm = chainermn.create_communicator("pure_nccl")
    device = comm.intra_rank

    class_names = morefusion.datasets.ycb_video.class_names
    fg_class_names = class_names[1:]
    model = MaskRCNNFPNResNet50(n_fg_class=len(fg_class_names),
                                pretrained_model="imagenet")
    model_coco = MaskRCNNFPNResNet50(pretrained_model="coco")
    _copyparams(model, model_coco)

    model.use_preset("evaluate")
    train_chain = TrainChain(model)
    chainer.cuda.get_device_from_id(device).use()
    train_chain.to_gpu()

    if comm.rank == 0:
        train = chainer.datasets.ConcatenatedDataset(
            morefusion.datasets.YCBVideoInstanceSegmentationDataset(
                split="train", sampling=15),
            morefusion.datasets.YCBVideoSyntheticInstanceSegmentationDataset(
                bg_composite=True),
            morefusion.datasets.
            MySyntheticYCB20190916InstanceSegmentationDataset(  # NOQA
                "train", bg_composite=True),
        )
        train = transform_dataset(train, model, train=True)
        val = morefusion.datasets.YCBVideoInstanceSegmentationDataset(
            split="keyframe", sampling=1)
        val = transform_dataset(val, model, train=False)
    else:
        train = None
        val = None
    train = chainermn.scatter_dataset(train, comm, shuffle=True)
    val = chainermn.scatter_dataset(val, comm, shuffle=False)

    train_iter = chainer.iterators.MultiprocessIterator(
        train,
        args.batchsize // comm.size,
        n_processes=args.batchsize // comm.size,
        shared_mem=100 * 1000 * 1000 * 4,
    )
    val_iter = chainer.iterators.MultiprocessIterator(
        val,
        args.batchsize // comm.size,
        n_processes=args.batchsize // comm.size,
        shared_mem=100 * 1000 * 1000 * 4,
        shuffle=False,
        repeat=False,
    )

    optimizer = chainermn.create_multi_node_optimizer(
        chainer.optimizers.MomentumSGD(), comm)
    optimizer.setup(train_chain)
    optimizer.add_hook(WeightDecay(0.0001))

    for link in model.links():
        if isinstance(link, L.BatchNormalization):
            link.disable_update()
    model.extractor.disable_update()
    model.rpn.disable_update()
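    # Freeze every BatchNormalization layer, the backbone extractor and the
    # RPN so that only the bbox/mask heads are fine-tuned.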

    for name, link in model.namedlinks():
        print(name, link.update_enabled)

    updater = training.updaters.StandardUpdater(train_iter,
                                                optimizer,
                                                converter=converter,
                                                device=device)
    max_epoch = (180e3 * 8) / 118287
    trainer = training.Trainer(updater, (max_epoch, "epoch"), args.out)

    @make_shift("lr")
    def lr_schedule(trainer):
        base_lr = 0.02 * args.batchsize / 16
        warm_up_duration = 500
        warm_up_rate = 1 / 3

        iteration = trainer.updater.iteration
        if iteration < warm_up_duration:
            rate = (warm_up_rate +
                    (1 - warm_up_rate) * iteration / warm_up_duration)
        else:
            rate = 1
            for step in [120e3 / 180e3 * max_epoch, 160e3 / 180e3 * max_epoch]:
                if trainer.updater.epoch_detail >= step:
                    rate *= 0.1

        return base_lr * rate

    trainer.extend(lr_schedule)
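    # lr_schedule implements linear warmup followed by step decay: the rate
    # ramps from base_lr / 3 to base_lr over the first 500 iterations, then
    # drops by 10x at ~2/3 and ~8/9 of max_epoch. base_lr follows the linear
    # scaling rule (0.02 at a total batch size of 16).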

    val_interval = 10000, "iteration"
    evaluator = InstanceSegmentationCOCOEvaluator(val_iter, model)
    evaluator = chainermn.create_multi_node_evaluator(evaluator, comm)
    trainer.extend(evaluator, trigger=val_interval)

    if comm.rank == 0:
        log_interval = 10, "iteration"
        trainer.extend(extensions.LogReport(trigger=log_interval))
        trainer.extend(extensions.observe_lr(), trigger=log_interval)
        keys = [
            "epoch",
            "iteration",
            "lr",
            "main/loss",
            "main/loss/rpn/loc",
            "main/loss/rpn/conf",
            "main/loss/bbox_head/loc",
            "main/loss/bbox_head/conf",
            "main/loss/mask_head",
            "validation/main/map/iou=0.50:0.95/area=all/max_dets=100",
        ]
        trainer.extend(extensions.PrintReport(keys), trigger=log_interval)
        trainer.extend(extensions.ProgressBar(update_interval=10))

        # trainer.extend(extensions.snapshot(), trigger=(10000, 'iteration'))
        trainer.extend(
            extensions.snapshot_object(model, "model_iter_best"),
            trigger=training.triggers.MaxValueTrigger(
                "validation/main/map/iou=0.50:0.95/area=all/max_dets=100",
                trigger=val_interval,
            ),
        )
        trainer.extend(
            extensions.snapshot_object(model,
                                       "model_iter_{.updater.iteration}"),
            trigger=(max_epoch, "epoch"),
        )

    if args.resume:
        serializers.load_npz(args.resume, trainer, strict=False)

    trainer.run()
Esempio n. 29
0
def check_mnist(gpu, display_log=True):
    epoch = 5
    batchsize = 100
    n_units = 100

    comm = chainermn.create_communicator('naive')
    if gpu:
        device = comm.intra_rank
        chainer.cuda.get_device(device).use()
    else:
        device = -1

    model = L.Classifier(MLP(n_units, 10))
    if gpu:
        model.to_gpu()

    optimizer = chainermn.create_multi_node_optimizer(
        chainer.optimizers.Adam(), comm)
    optimizer.setup(model)

    if comm.rank == 0:
        train, test = chainer.datasets.get_mnist()
    else:
        train, test = None, None

    train = chainermn.scatter_dataset(train, comm, shuffle=True)
    test = chainermn.scatter_dataset(test, comm, shuffle=True)

    train_iter = chainer.iterators.SerialIterator(train, batchsize)
    test_iter = chainer.iterators.SerialIterator(test,
                                                 batchsize,
                                                 repeat=False,
                                                 shuffle=False)

    updater = training.StandardUpdater(train_iter, optimizer, device=device)

    trainer = training.Trainer(updater, (epoch, 'epoch'))

    # Wrap standard Chainer evaluators by MultiNodeEvaluator.
    evaluator = extensions.Evaluator(test_iter, model, device=device)
    evaluator = chainermn.create_multi_node_evaluator(evaluator, comm)
    trainer.extend(evaluator)

    # Add checkpointer. This is just to check checkpointing runs
    # without errors
    path = tempfile.mkdtemp(dir='/tmp', prefix=__name__ + "-tmp-")
    checkpointer = create_multi_node_checkpointer(name=__name__,
                                                  comm=comm,
                                                  path=path)
    trainer.extend(checkpointer, trigger=(1, 'epoch'))

    # Some display and output extensions are necessary only for one worker.
    # (Otherwise, there would just be repeated outputs.)
    if comm.rank == 0 and display_log:
        trainer.extend(extensions.LogReport(trigger=(1, 'epoch')),
                       trigger=(1, 'epoch'))
        trainer.extend(extensions.PrintReport([
            'epoch', 'main/loss', 'validation/main/loss', 'main/accuracy',
            'validation/main/accuracy', 'elapsed_time'
        ],
                                              out=sys.stderr),
                       trigger=(1, 'epoch'))
    trainer.run()

    accuracy = evaluator()['validation/main/accuracy']
    assert accuracy > 0.95

    # Check checkpointer successfully finalized snapshot directory
    assert [] == os.listdir(path)
    os.removedirs(path)
Esempio n. 30
0
def main():
    # ===== Argparse ===== #
    parser = argparse.ArgumentParser()
    parser.add_argument("--communicator",
                        type=str,
                        default="hierarchical",
                        help="Type of communicator")
    parser.add_argument("--gpu", "-g", action="store_true", help="Use GPU")
    parser.add_argument("--batch_size",
                        "-b",
                        type=int,
                        default=4,
                        help="batch size")
    parser.add_argument("--iteration",
                        "-i",
                        type=int,
                        default=1000,
                        help="# of epochs")
    parser.add_argument("--units",
                        "-u",
                        type=int,
                        default=5000,
                        help="# of FC units")
    parser.add_argument("--resume",
                        "-r",
                        default="",
                        help="Resume the training from snapshot")
    parser.add_argument("--data_visual",
                        type=str,
                        default=DATA_DIR_VISUAL,
                        help="Visual data directory, which has csv files")
    parser.add_argument("--data_speech",
                        type=str,
                        default=DATA_DIR_SPEC,
                        help="Spectrogram data directory, which has npz files")
    parser.add_argument("--result_dir",
                        type=str,
                        default="result",
                        help="Save directory")
    args = parser.parse_args()

    # ===== GPU or CPU ===== #
    if args.gpu:
        xp = cuda.cupy
        comm = chainermn.create_communicator(args.communicator)
        device = comm.intra_rank
    else:
        comm = chainermn.create_communicator("naive")
        device = -1

    if comm.rank == 0:
        print('==========================================')
        print('Num process (COMM_WORLD): {}'.format(comm.size))
        if args.gpu:
            print('Using GPUs')
        print('Using {} communicator'.format(args.communicator))
        print('Num Minibatch-size: {}'.format(args.batch_size))
        print('Num iteration: {}'.format(args.iteration))
        print('==========================================')

    # ===== Load model ===== #
    if comm.rank == 0:
        print("loading model...")
    model = Audio_Visual_Net(gpu=0, num_fusion_units=args.units)
    if device >= 0:
        cuda.get_device_from_id(device).use()
        model.to_gpu()
    optimizer = chainermn.create_multi_node_optimizer(
        chainer.optimizers.Adam(), comm)
    optimizer.setup(model)

    # ===== Set data ===== #
    if comm.rank == 0:
        print("loading data...")
        spec_input = sorted(glob.glob(os.path.join(args.data_speech, "*.npz")))
        vis_input = sorted(glob.glob(os.path.join(args.data_visual, "*")))
        assert len(spec_input) == len(
            vis_input), "# of files are different between faces and audios."
        all_nums = list(range(len(spec_input)))
        all_nums.remove(5151)
        random.shuffle(all_nums)  # shuffle in place before splitting
        threshold = int(len(all_nums) * 0.995)
        all_nums_train = all_nums[:threshold]
        all_nums_test = all_nums[threshold:]
        train = [(i) for i in all_nums_train]
        test = [(i) for i in all_nums_test]
    else:
        train = None
        test = None
    train = chainermn.scatter_dataset(train, comm, shuffle=True)
    test = chainermn.scatter_dataset(test, comm, shuffle=True)
    train_iter = chainer.iterators.SerialIterator(dataset=train,
                                                  batch_size=args.batch_size,
                                                  shuffle=False,
                                                  repeat=True)
    test_iter = chainer.iterators.SerialIterator(dataset=test,
                                                 batch_size=args.batch_size,
                                                 shuffle=False,
                                                 repeat=False)

    # ===== Define trainer ===== #
    if comm.rank == 0:
        print("setting trainer...")
    updater = chainer.training.StandardUpdater(train_iter,
                                               optimizer,
                                               device=device)
    trainer = chainer.training.Trainer(updater, (args.iteration, "iteration"),
                                       out=args.result_dir)

    iter_trigger = 10
    evaluator = TestModeEvaluator(test_iter, model, device=device)
    evaluator = chainermn.create_multi_node_evaluator(evaluator, comm)
    trainer.extend(evaluator, trigger=(int(iter_trigger), "iteration"))

    if comm.rank == 0:
        trainer.extend(extensions.LogReport(trigger=(iter_trigger,
                                                     "iteration")),
                       trigger=(iter_trigger, "iteration"))
        trainer.extend(
            extensions.ProgressBar(update_interval=int(iter_trigger / 10)))
        trainer.extend(
            extensions.PlotReport(["main/loss", "validation/main/loss"],
                                  "iteration",
                                  file_name="loss.png",
                                  trigger=(iter_trigger, "iteration")))
        trainer.extend(extensions.PrintReport([
            "epoch", "iteration", "main/loss", "validation/main/loss",
            "elapsed_time"
        ]),
                       trigger=(iter_trigger, "iteration"))
        trainer.extend(extensions.snapshot(),
                       trigger=(int(iter_trigger * 10), "iteration"))

    if args.resume:
        chainer.serializers.load_npz(args.resume, trainer)

    # ===== Training ===== #
    if comm.rank == 0:
        print("start training...")
    trainer.run()

    # ===== Save model ===== #
    if comm.rank == 0:
        print("saving model...")
    model.to_cpu()
    chainer.serializers.save_npz(os.path.join(args.result_dir, "model"), model)
    chainer.serializers.save_npz(os.path.join(args.result_dir, "optimizer"),
                                 optimizer)

    if comm.rank == 0:
        print("done!!")
Esempio n. 31
0
def main():
    # Check if GPU is available
    # (ImageNet example does not support CPU execution)
    if not chainer.cuda.available:
        raise RuntimeError("ImageNet requires GPU support.")

    archs = {
        'alex': alex.Alex,
        'googlenet': googlenet.GoogLeNet,
        'googlenetbn': googlenetbn.GoogLeNetBN,
        'nin': nin.NIN,
        'resnet50': resnet50.ResNet50,
    }

    parser = argparse.ArgumentParser(
        description='Learning convnet from ILSVRC2012 dataset')
    parser.add_argument('train', help='Path to training image-label list file')
    parser.add_argument('val', help='Path to validation image-label list file')
    parser.add_argument('--arch', '-a', choices=archs.keys(), default='nin',
                        help='Convnet architecture')
    parser.add_argument('--batchsize', '-B', type=int, default=32,
                        help='Learning minibatch size')
    parser.add_argument('--epoch', '-E', type=int, default=10,
                        help='Number of epochs to train')
    parser.add_argument('--initmodel',
                        help='Initialize the model from given file')
    parser.add_argument('--loaderjob', '-j', type=int,
                        help='Number of parallel data loading processes')
    parser.add_argument('--mean', '-m', default='mean.npy',
                        help='Mean file (computed by compute_mean.py)')
    parser.add_argument('--resume', '-r', default='',
                        help='Initialize the trainer from given file')
    parser.add_argument('--out', '-o', default='result',
                        help='Output directory')
    parser.add_argument('--root', '-R', default='.',
                        help='Root directory path of image files')
    parser.add_argument('--val_batchsize', '-b', type=int, default=250,
                        help='Validation minibatch size')
    parser.add_argument('--test', action='store_true')
    parser.add_argument('--communicator', default='hierarchical')
    parser.set_defaults(test=False)
    args = parser.parse_args()

    # Prepare ChainerMN communicator.
    comm = chainermn.create_communicator(args.communicator)
    device = comm.intra_rank

    if comm.rank == 0:
        print('==========================================')
        print('Num process (COMM_WORLD): {}'.format(comm.size))
        print('Using {} communicator'.format(args.communicator))
        print('Using {} arch'.format(args.arch))
        print('Num Minibatch-size: {}'.format(args.batchsize))
        print('Num epoch: {}'.format(args.epoch))
        print('==========================================')

    model = archs[args.arch]()
    if args.initmodel:
        print('Load model from', args.initmodel)
        chainer.serializers.load_npz(args.initmodel, model)

    chainer.cuda.get_device_from_id(device).use()  # Make the GPU current
    model.to_gpu()

    # Split and distribute the dataset. Only worker 0 loads the whole dataset.
    # Datasets of worker 0 are evenly split and distributed to all workers.
    mean = np.load(args.mean)
    if comm.rank == 0:
        train = PreprocessedDataset(args.train, args.root, mean, model.insize)
        val = PreprocessedDataset(
            args.val, args.root, mean, model.insize, False)
    else:
        train = None
        val = None
    train = chainermn.scatter_dataset(train, comm, shuffle=True)
    val = chainermn.scatter_dataset(val, comm)

    # We need to change the start method of multiprocessing module if we are
    # using InfiniBand and MultiprocessIterator. This is because processes
    # often crash when calling fork if they are using Infiniband.
    # (c.f., https://www.open-mpi.org/faq/?category=tuning#fork-warning )
    multiprocessing.set_start_method('forkserver')
    train_iter = chainer.iterators.MultiprocessIterator(
        train, args.batchsize, n_processes=args.loaderjob)
    val_iter = chainer.iterators.MultiprocessIterator(
        val, args.val_batchsize, repeat=False, n_processes=args.loaderjob)

    # Create a multi node optimizer from a standard Chainer optimizer.
    optimizer = chainermn.create_multi_node_optimizer(
        chainer.optimizers.MomentumSGD(lr=0.01, momentum=0.9), comm)
    optimizer.setup(model)

    # Set up a trainer
    updater = training.StandardUpdater(train_iter, optimizer, device=device)
    trainer = training.Trainer(updater, (args.epoch, 'epoch'), args.out)

    checkpoint_interval = (10, 'iteration') if args.test else (1, 'epoch')
    val_interval = (10, 'iteration') if args.test else (1, 'epoch')
    log_interval = (10, 'iteration') if args.test else (1, 'epoch')

    checkpointer = chainermn.create_multi_node_checkpointer(
        name='imagenet-example', comm=comm)
    checkpointer.maybe_load(trainer, optimizer)
    trainer.extend(checkpointer, trigger=checkpoint_interval)
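    # maybe_load restores the trainer and optimizer from the latest
    # distributed checkpoint if one exists, so an interrupted run resumes
    # automatically.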

    # Create a multi node evaluator from an evaluator.
    evaluator = TestModeEvaluator(val_iter, model, device=device)
    evaluator = chainermn.create_multi_node_evaluator(evaluator, comm)
    trainer.extend(evaluator, trigger=val_interval)

    # Some display and output extensions are necessary only for one worker.
    # (Otherwise, there would just be repeated outputs.)
    if comm.rank == 0:
        trainer.extend(extensions.dump_graph('main/loss'))
        trainer.extend(extensions.LogReport(trigger=log_interval))
        trainer.extend(extensions.observe_lr(), trigger=log_interval)
        trainer.extend(extensions.PrintReport([
            'epoch', 'iteration', 'main/loss', 'validation/main/loss',
            'main/accuracy', 'validation/main/accuracy', 'lr'
        ]), trigger=log_interval)
        trainer.extend(extensions.ProgressBar(update_interval=10))

    if args.resume:
        chainer.serializers.load_npz(args.resume, trainer)

    trainer.run()
Esempio n. 32
0
def objective(trial, comm):
    # Sample an architecture.
    model = L.Classifier(create_model(trial))

    # Setup optimizer.
    optimizer = chainer.optimizers.MomentumSGD()
    optimizer.setup(model)
    optimizer = chainermn.create_multi_node_optimizer(optimizer, comm)

    # Setup dataset and iterator.
    train, test = chainer.datasets.get_mnist()
    rng = np.random.RandomState(0)
    train = chainer.datasets.SubDataset(train,
                                        0,
                                        N_TRAIN_EXAMPLES,
                                        order=rng.permutation(len(train)))
    test = chainer.datasets.SubDataset(test,
                                       0,
                                       N_TEST_EXAMPLES,
                                       order=rng.permutation(len(test)))

    train = chainermn.scatter_dataset(train, comm, shuffle=True)
    test = chainermn.scatter_dataset(test, comm)

    train_iter = chainer.iterators.SerialIterator(train,
                                                  BATCHSIZE,
                                                  shuffle=True)
    test_iter = chainer.iterators.SerialIterator(test,
                                                 BATCHSIZE,
                                                 repeat=False,
                                                 shuffle=False)

    # Setup trainer.
    updater = chainer.training.StandardUpdater(train_iter, optimizer)
    trainer = chainer.training.Trainer(updater, (EPOCH, 'epoch'))

    # Add Chainer extension for pruners.
    trainer.extend(
        optuna.integration.ChainerPruningExtension(trial,
                                                   'validation/main/loss',
                                                   (PRUNER_INTERVAL, 'epoch')))
    evaluator = chainer.training.extensions.Evaluator(test_iter, model)
    trainer.extend(chainermn.create_multi_node_evaluator(evaluator, comm))
    log_report_extension = chainer.training.extensions.LogReport(log_name=None)
    trainer.extend(log_report_extension)

    if comm.rank == 0:
        trainer.extend(chainer.training.extensions.ProgressBar())

    # Run training.
    # Set show_loop_exception_msg to False to suppress messages about the
    # TrialPruned exception. ChainerPruningExtension raises TrialPruned to stop
    # training, and the trainer would otherwise print a message every time it
    # receives one.
    trainer.run(show_loop_exception_msg=False)

    # Evaluate.
    evaluator = chainer.training.extensions.Evaluator(test_iter, model)
    evaluator = chainermn.create_multi_node_evaluator(evaluator, comm)
    report = evaluator()

    # The following line mitigates the memory problem in CircleCI
    # (see https://github.com/pfnet/optuna/pull/325 for more details).
    gc.collect()

    return 1.0 - report['main/accuracy']
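
The objective above takes both an Optuna trial and a ChainerMN communicator, which is the signature expected by Optuna's ChainerMN integration. Below is a minimal, hypothetical driver sketch: the study name, storage URL, and trial count are assumptions, and the study is assumed to already exist in a storage that every MPI process can reach.

# Hypothetical driver for the objective above; names and values are assumptions.
import chainermn
import optuna

comm = chainermn.create_communicator('naive')
# The study is assumed to have been created beforehand in a shared storage
# (e.g. `optuna create-study --storage sqlite:///example.db`).
study = optuna.load_study(study_name='chainermn_mnist',
                          storage='sqlite:///example.db')
# ChainerMNStudy synchronizes trials across workers and calls
# objective(trial, comm) on every process.
chainermn_study = optuna.integration.ChainerMNStudy(study, comm)
chainermn_study.optimize(objective, n_trials=25)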
Esempio n. 33
0
def main():
    parser = argparse.ArgumentParser(description='''\
ChainerMN example: MNIST with automatic checkpoints enabled''')
    parser.add_argument('--batchsize',
                        '-b',
                        type=int,
                        default=100,
                        help='Number of images in each mini-batch')
    parser.add_argument('--communicator',
                        type=str,
                        default='hierarchical',
                        help='Type of communicator')
    parser.add_argument('--epoch',
                        '-e',
                        type=int,
                        default=20,
                        help='Number of sweeps over the dataset to train')
    parser.add_argument('--gpu', '-g', action='store_true', help='Use GPU')
    parser.add_argument('--out',
                        '-o',
                        default='result',
                        help='Directory to output the result')
    parser.add_argument('--unit',
                        '-u',
                        type=int,
                        default=1000,
                        help='Number of units')
    parser.add_argument('--run-id',
                        type=str,
                        default='train-mnist-example',
                        help='ID of the task name')
    args = parser.parse_args()

    # Prepare ChainerMN communicator.

    if args.gpu:
        if args.communicator == 'naive':
            print("Error: 'naive' communicator does not support GPU.\n")
            exit(-1)
        comm = chainermn.create_communicator(args.communicator)
        device = comm.intra_rank
    else:
        if args.communicator != 'naive':
            print('Warning: using naive communicator '
                  'because only naive supports CPU-only execution')
        comm = chainermn.create_communicator('naive')
        device = -1

    if comm.rank == 0:
        print('==========================================')
        print('Num process (COMM_WORLD): {}'.format(comm.size))
        if args.gpu:
            print('Using GPUs')
        print('Using {} communicator'.format(args.communicator))
        print('Num unit: {}'.format(args.unit))
        print('Num Minibatch-size: {}'.format(args.batchsize))
        print('Num epoch: {}'.format(args.epoch))
        print('==========================================')

    model = L.Classifier(MLP(args.unit, 10))
    if device >= 0:
        chainer.cuda.get_device_from_id(device).use()
        model.to_gpu()

    # Create a multi node optimizer from a standard Chainer optimizer.
    optimizer = chainermn.create_multi_node_optimizer(
        chainer.optimizers.Adam(), comm)
    optimizer.setup(model)

    # Split and distribute the dataset. Only worker 0 loads the whole dataset.
    # Datasets of worker 0 are evenly split and distributed to all workers.
    if comm.rank == 0:
        train, test = chainer.datasets.get_mnist()
    else:
        train, test = None, None
    train = chainermn.scatter_dataset(train, comm, shuffle=True)
    test = chainermn.scatter_dataset(test, comm, shuffle=True)

    train_iter = chainer.iterators.SerialIterator(train, args.batchsize)
    test_iter = chainer.iterators.SerialIterator(test,
                                                 args.batchsize,
                                                 repeat=False,
                                                 shuffle=False)

    updater = training.StandardUpdater(train_iter, optimizer, device=device)
    trainer = training.Trainer(updater, (args.epoch, 'epoch'), out=args.out)

    # Enable checkpointer and recover from checkpoint if any checkpoint exists
    checkpointer = create_multi_node_checkpointer(name=args.run_id, comm=comm)
    checkpointer.maybe_load(trainer, optimizer)
    print("Rank", comm.rank, ": (Re)Starting from (epoch, iter) =",
          (trainer.updater.epoch, trainer.updater.iteration))
    trainer.extend(checkpointer, trigger=(1000, 'iteration'))

    # Create a multi node evaluator from a standard Chainer evaluator.
    evaluator = extensions.Evaluator(test_iter, model, device=device)
    evaluator = chainermn.create_multi_node_evaluator(evaluator, comm)
    trainer.extend(evaluator)

    # Some display and output extensions are necessary only for one worker.
    # (Otherwise, there would just be repeated outputs.)
    if comm.rank == 0:
        trainer.extend(extensions.DumpGraph('main/loss'))
        trainer.extend(extensions.LogReport())
        trainer.extend(
            extensions.PrintReport([
                'epoch', 'main/loss', 'validation/main/loss', 'main/accuracy',
                'validation/main/accuracy', 'elapsed_time'
            ]))
        trainer.extend(extensions.ProgressBar())

    trainer.run()
Esempio n. 34
0
def main():
    parser = argparse.ArgumentParser(
        description='ChainerMN example: pipelined neural network')
    parser.add_argument('--batchsize',
                        '-b',
                        type=int,
                        default=100,
                        help='Number of images in each mini-batch')
    parser.add_argument('--epoch',
                        '-e',
                        type=int,
                        default=20,
                        help='Number of sweeps over the dataset to train')
    parser.add_argument('--gpu', '-g', action='store_true', help='Use GPU')
    parser.add_argument('--out',
                        '-o',
                        default='result',
                        help='Directory to output the result')
    parser.add_argument('--unit',
                        '-u',
                        type=int,
                        default=1000,
                        help='Number of units')
    args = parser.parse_args()

    # Prepare ChainerMN communicator.
    if args.gpu:
        comm = chainermn.create_communicator('pure_nccl')
        data_axis, model_axis = comm.rank % 2, comm.rank // 2
        data_comm = comm.split(data_axis, comm.rank)
        model_comm = comm.split(model_axis, comm.rank)
        device = comm.intra_rank
    else:
        comm = chainermn.create_communicator('naive')
        data_axis, model_axis = comm.rank % 2, comm.rank // 2
        data_comm = comm.split(data_axis, comm.rank)
        model_comm = comm.split(model_axis, comm.rank)
        device = -1

    if model_comm.size != 2:
        raise ValueError('This example can only be executed on an even number '
                         'of processes.')

    if comm.rank == 0:
        print('==========================================')
        if args.gpu:
            print('Using GPUs')
        print('Num unit: {}'.format(args.unit))
        print('Num Minibatch-size: {}'.format(args.batchsize))
        print('Num epoch: {}'.format(args.epoch))
        print('==========================================')

    if data_axis == 0:
        model = L.Classifier(MLP0(model_comm, args.unit))
    elif data_axis == 1:
        model = MLP1(model_comm, args.unit, 10)

    if device >= 0:
        chainer.cuda.get_device_from_id(device).use()
        model.to_gpu()

    optimizer = chainermn.create_multi_node_optimizer(
        chainer.optimizers.Adam(), data_comm)
    optimizer.setup(model)

    # Original dataset on worker 0 and 1.
    # Datasets of worker 0 and 1 are split and distributed to all workers.
    if model_axis == 0:
        train, test = chainer.datasets.get_mnist()
        if data_axis == 1:
            train = chainermn.datasets.create_empty_dataset(train)
            test = chainermn.datasets.create_empty_dataset(test)
    else:
        train, test = None, None
    train = chainermn.scatter_dataset(train, data_comm, shuffle=True)
    test = chainermn.scatter_dataset(test, data_comm, shuffle=True)

    train_iter = chainer.iterators.SerialIterator(train,
                                                  args.batchsize,
                                                  shuffle=False)
    test_iter = chainer.iterators.SerialIterator(test,
                                                 args.batchsize,
                                                 repeat=False,
                                                 shuffle=False)

    updater = training.StandardUpdater(train_iter, optimizer, device=device)
    trainer = training.Trainer(updater, (args.epoch, 'epoch'), out=args.out)
    evaluator = extensions.Evaluator(test_iter, model, device=device)
    evaluator = chainermn.create_multi_node_evaluator(evaluator, data_comm)
    trainer.extend(evaluator)

    # Some display and output extensions are necessary only for worker 0.
    if comm.rank == 0:
        trainer.extend(extensions.DumpGraph('main/loss'))
        trainer.extend(extensions.LogReport())
        trainer.extend(
            extensions.PrintReport([
                'epoch', 'main/loss', 'validation/main/loss', 'main/accuracy',
                'validation/main/accuracy', 'elapsed_time'
            ]))
        trainer.extend(extensions.ProgressBar())

    trainer.run()
Esempio n. 35
0
def main():
    parser = argparse.ArgumentParser(description='Chainer K-FAC example: MNIST')  # NOQA
    parser.add_argument('--batch_size', type=int, default=100)
    parser.add_argument('--num_epochs', type=int, default=20)
    parser.add_argument('--snapshot_interval', type=int, default=-1)
    parser.add_argument('--no_cuda', action='store_true')
    parser.add_argument('--out', default='result')
    parser.add_argument('--resume', default='')
    parser.add_argument('--optimizer', default='kfac')
    parser.add_argument('--arch', choices=['mlp', 'cnn'], default='mlp')
    parser.add_argument('--plot', action='store_true')
    parser.add_argument('--distributed', action='store_true')
    args = parser.parse_args()

    # Prepare communicator
    if not args.distributed:
        # Single process execution
        comm = None
        rank = 0
        device = -1 if args.no_cuda else 0
    else:
        # Multiple processes execution, constructs a communicator.
        # chainerkfac uses different method to create a communicator from
        # chainermn.
        if args.optimizer == 'kfac':
            comm = chainerkfac.create_communicator('pure_nccl')
        else:
            comm = chainermn.create_communicator('pure_nccl')
        rank = comm.rank
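        # intra_rank is the rank of this process within its physical node, so
        # it can be used directly as the local GPU id.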
        device = comm.intra_rank
        if rank == 0:
            print('======== DISTRIBUTED TRAINING ========')

    # Set up a neural network to train
    # Classifier reports softmax cross entropy loss and accuracy at every
    # iteration, which will be used by the PrintReport extension below.
    if args.arch == 'mlp':
        model = L.Classifier(MLP())
        in_ndim = 1  # input dimensions
    else:
        model = L.Classifier(CNN())
        in_ndim = 3  # input dimensions
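    # (For reference: get_mnist(ndim=1) yields flat 784-element vectors suited
    # to the MLP, while get_mnist(ndim=3) yields (1, 28, 28) images for the CNN.)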

    if device >= 0:
        # Make a specified GPU current
        chainer.backends.cuda.get_device_from_id(device).use()
        model.to_gpu()  # Copy the model to the GPU

    # Setup an optimizer
    if args.optimizer == 'kfac':
        if comm is None:
            optimizer = chainerkfac.optimizers.KFAC()
        else:
            optimizer = chainerkfac.optimizers.DistributedKFAC(comm)
    else:
        optimizer = chainer.optimizers.Adam()
        if comm is not None:
            optimizer = chainermn.create_multi_node_optimizer(optimizer, comm)
    optimizer.setup(model)

    # Load the MNIST dataset
    if rank == 0:
        train, test = chainer.datasets.get_mnist(ndim=in_ndim)
    else:
        train, test = None, None
    if comm is not None:
        train = chainermn.scatter_dataset(train, comm, shuffle=True)
        test = chainermn.scatter_dataset(test, comm, shuffle=True)

    train_iter = chainer.iterators.SerialIterator(train, args.batch_size)
    test_iter = chainer.iterators.SerialIterator(test, args.batch_size,
                                                 repeat=False, shuffle=False)

    # Set up a trainer
    updater = training.updaters.StandardUpdater(
        train_iter, optimizer, device=device)
    trainer = training.Trainer(updater, (args.num_epochs, 'epoch'),
                               out=args.out)

    # Evaluate the model with the test dataset for each epoch
    evaluator = extensions.Evaluator(test_iter, model, device=device)
    if comm is not None:
        evaluator = chainermn.create_multi_node_evaluator(evaluator, comm)
    trainer.extend(evaluator)

    # Some display and output extensions are necessary only for one worker.
    # (Otherwise, there would just be repeated outputs.)
    if rank == 0:
        # Take a snapshot for each specified epoch
        snapshot_interval = args.num_epochs \
            if args.snapshot_interval == -1 else max(1, args.snapshot_interval)
        trainer.extend(extensions.snapshot(),
                       trigger=(snapshot_interval, 'epoch'))
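        # With the default --snapshot_interval of -1 this trigger fires only
        # once, at the end of the final epoch.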

        # Write a log of evaluation statistics for each epoch
        trainer.extend(extensions.LogReport())

        # Save two plot images to the result dir
        if args.plot and extensions.PlotReport.available():
            trainer.extend(
                extensions.PlotReport(['main/loss', 'validation/main/loss'],
                                      'epoch', file_name='loss.png'))
            trainer.extend(
                extensions.PlotReport(
                    ['main/accuracy', 'validation/main/accuracy'],
                    'epoch', file_name='accuracy.png'))

        # Print selected entries of the log to stdout
        # Here "main" refers to the target link of the "main" optimizer again,
        # and "validation" refers to the default name of the Evaluator
        # extension. Entries other than 'epoch' are reported by the Classifier
        # link, called by either the updater or the evaluator.
        trainer.extend(extensions.PrintReport(
            ['epoch', 'main/loss', 'validation/main/loss',
             'main/accuracy', 'validation/main/accuracy', 'elapsed_time']))

        # Print a progress bar to stdout
        trainer.extend(extensions.ProgressBar())

    if args.resume:
        # Resume from a snapshot
        chainer.serializers.load_npz(args.resume, trainer)

    # Run the training
    trainer.run()
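
The CNN referenced by the --arch cnn branch above is defined elsewhere in the script. A minimal sketch of a compatible definition, assuming MNIST loaded with ndim=3 (input shape (1, 28, 28)); the layer widths and kernel sizes are illustrative assumptions:

import chainer
import chainer.functions as F
import chainer.links as L


class CNN(chainer.Chain):
    # Small convolutional network for (1, 28, 28) MNIST images.
    def __init__(self, n_out=10):
        super(CNN, self).__init__()
        with self.init_scope():
            self.conv1 = L.Convolution2D(1, 32, ksize=5)
            self.conv2 = L.Convolution2D(32, 64, ksize=5)
            self.fc = L.Linear(None, n_out)

    def __call__(self, x):
        h = F.max_pooling_2d(F.relu(self.conv1(x)), 2)
        h = F.max_pooling_2d(F.relu(self.conv2(h)), 2)
        return self.fc(h)
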
Example no. 36
0
def main():
    parser = argparse.ArgumentParser(description='ChainerMN example: MNIST')
    parser.add_argument('--batchsize', '-b', type=int, default=100,
                        help='Number of images in each mini-batch')
    parser.add_argument('--communicator', type=str,
                        default='hierarchical', help='Type of communicator')
    parser.add_argument('--epoch', '-e', type=int, default=20,
                        help='Number of sweeps over the dataset to train')
    parser.add_argument('--gpu', '-g', action='store_true',
                        help='Use GPU')
    parser.add_argument('--out', '-o', default='result',
                        help='Directory to output the result')
    parser.add_argument('--resume', '-r', default='',
                        help='Resume the training from snapshot')
    parser.add_argument('--unit', '-u', type=int, default=1000,
                        help='Number of units')
    args = parser.parse_args()

    # Prepare ChainerMN communicator.

    if args.gpu:
        if args.communicator == 'naive':
            print("Error: 'naive' communicator does not support GPU.\n")
            exit(-1)
        comm = chainermn.create_communicator(args.communicator)
        device = comm.intra_rank
    else:
        if args.communicator != 'naive':
            print('Warning: using naive communicator '
                  'because only naive supports CPU-only execution')
        comm = chainermn.create_communicator('naive')
        device = -1

    if comm.rank == 0:
        print('==========================================')
        print('Num process (COMM_WORLD): {}'.format(comm.size))
        if args.gpu:
            print('Using GPUs')
        print('Using {} communicator'.format(args.communicator))
        print('Num unit: {}'.format(args.unit))
        print('Num Minibatch-size: {}'.format(args.batchsize))
        print('Num epoch: {}'.format(args.epoch))
        print('==========================================')

    model = L.Classifier(MLP(args.unit, 10))
    if device >= 0:
        chainer.cuda.get_device_from_id(device).use()
        model.to_gpu()

    # Create a multi node optimizer from a standard Chainer optimizer.
    optimizer = chainermn.create_multi_node_optimizer(
        chainer.optimizers.Adam(), comm)
    optimizer.setup(model)

    # Split and distribute the dataset. Only worker 0 loads the whole dataset.
    # Datasets of worker 0 are evenly split and distributed to all workers.
    if comm.rank == 0:
        train, test = chainer.datasets.get_mnist()
    else:
        train, test = None, None
    train = chainermn.scatter_dataset(train, comm, shuffle=True)
    test = chainermn.scatter_dataset(test, comm, shuffle=True)

    train_iter = chainer.iterators.SerialIterator(train, args.batchsize)
    test_iter = chainer.iterators.SerialIterator(test, args.batchsize,
                                                 repeat=False, shuffle=False)

    updater = training.StandardUpdater(train_iter, optimizer, device=device)
    trainer = training.Trainer(updater, (args.epoch, 'epoch'), out=args.out)

    # Create a multi node evaluator from a standard Chainer evaluator.
    evaluator = extensions.Evaluator(test_iter, model, device=device)
    evaluator = chainermn.create_multi_node_evaluator(evaluator, comm)
    trainer.extend(evaluator)

    # Some display and output extensions are necessary only for one worker.
    # (Otherwise, there would just be repeated outputs.)
    if comm.rank == 0:
        trainer.extend(extensions.dump_graph('main/loss'))
        trainer.extend(extensions.LogReport())
        trainer.extend(extensions.PrintReport(
            ['epoch', 'main/loss', 'validation/main/loss',
             'main/accuracy', 'validation/main/accuracy', 'elapsed_time']))
        trainer.extend(extensions.ProgressBar())

    if args.resume:
        chainer.serializers.load_npz(args.resume, trainer)

    trainer.run()
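
The MLP passed to L.Classifier(MLP(args.unit, 10)) above is defined elsewhere in the script. A minimal sketch of a compatible definition (the three-layer structure follows the standard Chainer MNIST example and is an assumption here):

import chainer
import chainer.functions as F
import chainer.links as L


class MLP(chainer.Chain):
    # Plain three-layer perceptron: n_units hidden units, n_out output classes.
    def __init__(self, n_units, n_out):
        super(MLP, self).__init__()
        with self.init_scope():
            self.l1 = L.Linear(None, n_units)
            self.l2 = L.Linear(None, n_units)
            self.l3 = L.Linear(None, n_out)

    def __call__(self, x):
        h1 = F.relu(self.l1(x))
        h2 = F.relu(self.l2(h1))
        return self.l3(h2)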