예제 #1
0
def test_deprecation_single():
    """Check that the 'single_node' communicator emits a DeprecationWarning.

    The test is skipped when the value read as the inter-node size is
    greater than one.
    """
    # Element 4 of the tuple returned by init_ranks is used as inter_size.
    inter_size = _communication_utility.init_ranks(mpi_comm)[4]
    if inter_size > 1:
        pytest.skip('This test is for single node only')

    with chainer.testing.assert_warns(DeprecationWarning):
        chainermn.create_communicator('single_node')
예제 #2
0
    def setup(self, gpu):
        """Create the communicator and pick the device for a two-process test.

        Args:
            gpu: When truthy, use the 'hierarchical' communicator and the GPU
                whose id equals the communicator rank; otherwise use the
                'naive' communicator with device ``-1``.
        """
        backend = 'hierarchical' if gpu else 'naive'
        self.communicator = chainermn.create_communicator(backend)

        if gpu:
            self.device = self.communicator.rank
            chainer.cuda.get_device_from_id(self.device).use()
        else:
            self.device = -1

        if self.communicator.size != 2:
            pytest.skip('This test is for two processes')
    def setup(self, gpu):
        """Seed NumPy and create the communicator for a multi-process test.

        Args:
            gpu: When truthy, use the 'flat' communicator and activate the
                GPU whose id is the communicator's ``intra_rank``; otherwise
                use the 'naive' communicator (``self.device`` is not set in
                that case).
        """
        numpy.random.seed(42)

        backend = 'flat' if gpu else 'naive'
        self.communicator = chainermn.create_communicator(backend)
        if gpu:
            self.device = self.communicator.intra_rank
            chainer.cuda.get_device_from_id(self.device).use()

        if self.communicator.size < 2:
            pytest.skip('This test is for multinode')
def create_communicator(gpu, param):
    """Return a communicator for the test, skipping when fewer than 2 ranks.

    Args:
        gpu: When truthy, create a 'flat' communicator and activate the GPU
            whose id is its ``intra_rank``; otherwise create a 'naive' one.
        param: Accepted for signature compatibility; not read here.

    Returns:
        The created communicator.
    """
    backend = 'flat' if gpu else 'naive'
    comm = chainermn.create_communicator(backend)
    if gpu:
        chainer.cuda.get_device_from_id(comm.intra_rank).use()

    if comm.size < 2:
        pytest.skip('This test is for multinode')

    return comm
예제 #5
0
def create_communicator(gpu):
    """Create a communicator and compute the ring neighbours of this rank.

    Args:
        gpu: When truthy, create a 'flat' communicator and activate the GPU
            whose id is its ``intra_rank``; otherwise create a 'naive' one.

    Returns:
        A ``(communicator, rank_next, rank_prev)`` tuple, where the two ranks
        are ``rank + 1`` and ``rank - 1`` taken modulo the communicator size.
    """
    backend = 'flat' if gpu else 'naive'
    comm = chainermn.create_communicator(backend)
    if gpu:
        chainer.cuda.get_device_from_id(comm.intra_rank).use()

    if comm.size < 2:
        pytest.skip('This test is for multinode only')

    # Neighbours wrap around, so every rank has both a successor and a
    # predecessor.
    successor = (comm.rank + 1) % comm.size
    predecessor = (comm.rank - 1) % comm.size
    return comm, successor, predecessor
예제 #6
0
    def setup(self, gpu):
        """Create the communicator, pick the device and list tested dtypes.

        Args:
            gpu: When truthy, use the 'hierarchical' communicator and the GPU
                whose id equals the communicator rank; otherwise use the
                'naive' communicator with device ``-1``.
        """
        backend = 'hierarchical' if gpu else 'naive'
        self.communicator = chainermn.create_communicator(backend)

        if gpu:
            self.device = self.communicator.rank
            chainer.cuda.get_device_from_id(self.device).use()
        else:
            self.device = -1

        if self.communicator.size != 2:
            pytest.skip('This test is for two processes')

        # The dtypes are iterated inside the tests instead of going through
        # chainer.testing.parameterize, because the running order of the
        # generated test cases is not unique.
        self.dtypes = [np.int32, np.int64, np.float32, np.float64]
예제 #7
0
    def setUp(self):
        """Create a 'naive' communicator and a 100-element float32 dataset.

        The test is skipped when the communicator has fewer than two ranks.
        """
        self.communicator = chainermn.create_communicator('naive')
        if self.communicator.size < 2:
            pytest.skip("This test is for multinode only")

        n_samples = 100
        self.dataset = np.arange(n_samples).astype(np.float32)
예제 #8
0
    def test_allreduce_persistent_gpu(self):
        """Run the shared ``_test`` routine on a GPU-resident ExampleModel."""
        communicator = chainermn.create_communicator('hierarchical')
        # The GPU id is the communicator's intra_rank.
        chainer.cuda.get_device_from_id(communicator.intra_rank).use()

        gpu_model = ExampleModel()
        gpu_model.to_gpu()
        self._test(communicator, gpu_model)
    def setUp(self):
        """Create a 'naive' communicator and this rank's ring neighbours.

        The test is skipped when the communicator has fewer than two ranks.
        """
        comm = chainermn.create_communicator('naive')
        self.communicator = comm

        if comm.size < 2:
            pytest.skip("This test is for multinode")

        # Send to the next rank and receive from the previous one, wrapping
        # around at both ends.
        self.rank_send = (comm.rank + 1) % comm.size
        self.rank_recv = (comm.rank - 1) % comm.size
예제 #10
0
    def setup(self, gpu):
        """Create the communicator and the non-wrapping neighbour ranks.

        Args:
            gpu: When truthy, use the 'hierarchical' communicator and
                activate the GPU whose id is its ``intra_rank``; otherwise
                use the 'naive' communicator.

        ``self.rank_prev`` is ``None`` on the first rank and
        ``self.rank_next`` is ``None`` on the last rank.
        """
        backend = 'hierarchical' if gpu else 'naive'
        self.communicator = chainermn.create_communicator(backend)
        if gpu:
            chainer.cuda.get_device_from_id(
                self.communicator.intra_rank).use()

        comm = self.communicator
        if comm.size < 2:
            pytest.skip('This test is for multinode only')

        successor = comm.rank + 1
        predecessor = comm.rank - 1

        # Unlike a ring, the chain has open ends: out-of-range neighbours
        # are recorded as None.
        self.rank_next = successor if successor < comm.size else None
        self.rank_prev = predecessor if predecessor >= 0 else None
예제 #11
0
 def setup_cpu(self):
     """Build the CPU fixture: communicator, model and mocked optimizer.

     The weights of links ``a`` and ``b`` are filled with ``rank`` and
     ``rank + 1`` respectively, and both gradients are zeroed.
     """
     self.comm = chainermn.create_communicator('naive')

     model = DynamicExampleModel()
     self.target = model
     model.a.W.data[:] = self.comm.rank
     model.b.W.data[:] = self.comm.rank + 1
     for link in (model.a, model.b):
         link.W.grad[:] = 0

     # create_update_rule is replaced by the MagicMock class itself, so each
     # call produces a fresh mock.
     optimizer = chainer.GradientMethod()
     optimizer.create_update_rule = mock.MagicMock
     self.actual_optimizer = optimizer
예제 #12
0
 def setup_gpu(self, device=None):
     """Build the GPU fixture: communicator, model and mocked optimizer.

     Args:
         device: Accepted for signature compatibility; the value is
             overwritten by the communicator's ``intra_rank``.
     """
     self.comm = chainermn.create_communicator('hierarchical')
     device = self.comm.intra_rank
     chainer.cuda.get_device_from_id(device).use()

     model = DynamicExampleModel()
     self.target = model
     model.to_gpu()
     model.a.W.data[:] = self.comm.rank
     model.b.W.data[:] = self.comm.rank + 1
     for link in (model.a, model.b):
         link.W.grad[:] = 0

     # create_update_rule is replaced by the MagicMock class itself, so each
     # call produces a fresh mock.
     optimizer = chainer.GradientMethod()
     optimizer.create_update_rule = mock.MagicMock
     self.actual_optimizer = optimizer
예제 #13
0
    def setUp(self):
        """Prepare a 6-element float32 dataset and a 'naive' communicator.

        Skips when MultiprocessIterator is requested on Python < 3, or when
        the communicator has fewer than two ranks.
        """
        uses_multiprocess = (
            self.iterator_class == chainer.iterators.MultiprocessIterator)
        if uses_multiprocess and \
                int(platform.python_version_tuple()[0]) < 3:
            pytest.skip('This test requires Python version >= 3')

        self.communicator = chainermn.create_communicator('naive')
        if self.communicator.size < 2:
            pytest.skip('This test is for multinode only')

        self.N = 6
        self.dataset = numpy.arange(self.N).astype(numpy.float32)
        self.bs = 2
    def setup(self, gpu):
        """Create the communicator, input data and per-rank Linear models.

        Args:
            gpu: When truthy, use the 'hierarchical' communicator and the GPU
                whose id is its ``intra_rank``; otherwise use the 'naive'
                communicator with device ``-1``.
        """
        self.gpu = gpu
        backend = 'hierarchical' if self.gpu else 'naive'
        self.communicator = chainermn.create_communicator(backend)
        if self.gpu:
            device = self.communicator.intra_rank
            chainer.cuda.get_device_from_id(device).use()
        else:
            device = -1

        comm = self.communicator
        if comm.size < 2:
            pytest.skip("This test is for multinode")

        # Ring topology: send to the next rank, receive from the previous.
        self.rank_send = (comm.rank + 1) % comm.size
        self.rank_recv = (comm.rank - 1) % comm.size

        # Activation and evaluation functions.
        self.f = chainer.functions.sigmoid
        self.evaluation = chainer.functions.mean_squared_error

        # Input data: a single row holding 0.0, 0.1, ..., 0.9.
        row = numpy.arange(10).reshape(1, 10).astype(numpy.float32) / 10
        self.x = chainer.Variable(row)

        # This rank's layer, followed by one layer per rank of the
        # communicator.
        self.model = chainer.links.Linear(
            10, 10, initialW=self._init_w(comm.rank))
        layers = []
        for rank in range(comm.size):
            layers.append(chainer.links.Linear(
                10, 10, initialW=self._init_w(rank)))
        self.entire_model = layers
        self.device = device

        if device >= 0:
            self.x.to_gpu()
            self.model.to_gpu()
            for layer in self.entire_model:
                layer.to_gpu()
 def setup_gpu(self, device=None):
     """Build the GPU fixture on top of the 'pure_nccl' communicator.

     Skips when the NCCL build version is below 2000.

     Args:
         device: Accepted for signature compatibility; the value is
             overwritten by the communicator's ``intra_rank``.
     """
     if nccl.get_build_version() < 2000:
         pytest.skip('This test requires NCCL version >= 2.0')

     self.comm = chainermn.create_communicator('pure_nccl')
     device = self.comm.intra_rank
     chainer.cuda.get_device_from_id(device).use()

     model = DynamicExampleModel()
     self.target = model
     model.to_gpu()
     model.a.W.data[:] = self.comm.rank
     model.b.W.data[:] = self.comm.rank + 1
     for link in (model.a, model.b):
         link.W.grad[:] = 0

     # create_update_rule is replaced by the MagicMock class itself, so each
     # call produces a fresh mock.
     optimizer = chainer.GradientMethod()
     optimizer.create_update_rule = mock.MagicMock
     self.actual_optimizer = optimizer
예제 #16
0
    def setUp(self):
        """Create random input/gradient arrays and the link under test.

        The link is a Conv2DBNActiv whose 3x3 kernel has a single 1 at the
        centre, with zero bias, ReLU activation and batch normalization
        configured with a 'naive' communicator.
        """
        in_shape = (5, self.in_channels, 5, 5)
        out_shape = (5, self.out_channels, 5, 5)
        self.x = np.random.uniform(-1, 1, in_shape).astype(np.float32)
        self.gy = np.random.uniform(-1, 1, out_shape).astype(np.float32)

        # Convolution is the identity function.
        identity_kernel = np.array(
            [[0, 0, 0], [0, 1, 0], [0, 0, 0]], dtype=np.float32)
        identity_kernel = identity_kernel.reshape((1, 1, 3, 3))

        bn_options = {'decay': 0.8, 'comm': create_communicator('naive')}
        self.l = Conv2DBNActiv(
            self.in_channels, self.out_channels, self.ksize, self.stride,
            self.pad, self.dilate,
            initialW=identity_kernel,
            initial_bias=0,
            activ=relu,
            bn_kwargs=bn_options)
예제 #17
0
    def setUp(self):
        """Prepare a 100-element dataset and a 'naive' communicator.

        Skips when MultiprocessIterator is requested on Python < 3, or when
        the communicator has fewer than two ranks. When
        ``self.paired_dataset`` is truthy the dataset is a list of
        ``(value, value)`` pairs; otherwise it is a flat float32 array.
        """
        uses_multiprocess = (
            self.iterator_class == chainer.iterators.MultiprocessIterator)
        if uses_multiprocess and \
                int(platform.python_version_tuple()[0]) < 3:
            pytest.skip('This test requires Python version >= 3')

        self.communicator = chainermn.create_communicator('naive')
        if self.communicator.size < 2:
            pytest.skip("This test is for multinode only")

        self.N = 100

        def float_range():
            # 0.0 .. N-1 as float32.
            return np.arange(self.N).astype(np.float32)

        if self.paired_dataset:
            self.dataset = list(zip(float_range(), float_range()))
        else:
            self.dataset = float_range()
def test_non_variable_send(param):
    """Check that backward runs even when ``send`` gets a raw array.

    A numpy/cupy array passed to ``send`` is converted to a chainer Variable
    without ``requires_grad``, so ``backward`` would not be called without
    special handling. This test confirms that no deadlock occurs in that
    situation.
    """
    comm = chainermn.create_communicator('naive')

    if comm.size < 2:
        pytest.skip('This test is for multinode')

    next_rank = (comm.rank + 1) % comm.size
    prev_rank = (comm.rank - 1) % comm.size

    is_first = comm.rank == 0
    is_last = comm.rank == comm.size - 1

    if not is_first and not is_last:
        # Intermediate ranks simply relay what they receive.
        relayed = chainermn.functions.recv(comm, rank=prev_rank)
        phi = chainermn.functions.send(
            relayed, comm, rank=next_rank)
        phi.backward()
        return

    if is_first:
        # The first rank sends a plain ndarray (not a Variable).
        x = numpy.ones((1, 10)).astype(param.dtype)
        phi = chainermn.functions.send(
            x, comm, rank=next_rank)
        x, = chainermn.functions.pseudo_connect(phi, x)
    else:
        # The last rank only receives.
        x = chainermn.functions.recv(comm, rank=prev_rank)

    # Both end ranks compute the same loss and back-propagate it.
    total = chainer.functions.sum(x)
    target = numpy.array(0).astype(param.dtype)
    loss = chainer.functions.mean_squared_error(total, target)
    loss.backward()
예제 #19
0
 def setUp(self):
     """Create the 'naive' communicator used by the tests."""
     comm = chainermn.create_communicator('naive')
     self.communicator = comm
예제 #20
0
    evaluator = chainer.training.extensions.Evaluator(valid_iter, model)
    evaluator = chainermn.create_multi_node_evaluator(evaluator, comm)
    report = evaluator()

    return report["main/accuracy"]


if __name__ == "__main__":
    # Script entry point: load an existing Optuna study and optimize it with
    # ChainerMN.
    # Please make sure common study and storage are shared among nodes.
    # Usage: <script> STUDY_NAME STORAGE_URL
    study_name = sys.argv[1]
    storage_url = sys.argv[2]

    # The study is loaded (not created), with a median pruner attached.
    study = optuna.load_study(study_name,
                              storage_url,
                              pruner=optuna.pruners.MedianPruner())
    comm = chainermn.create_communicator("naive")
    # Only rank 0 prints, to avoid duplicated output from every process.
    if comm.rank == 0:
        print("Study name:", study_name)
        print("Storage URL:", storage_url)
        print("Number of nodes:", comm.size)

    # Run optimization!
    chainermn_study = optuna.integration.ChainerMNStudy(study, comm)
    chainermn_study.optimize(objective, n_trials=25)

    # Rank 0 collects the pruned and completed trials for the final report.
    if comm.rank == 0:
        pruned_trials = study.get_trials(deepcopy=False,
                                         states=[TrialState.PRUNED])
        complete_trials = study.get_trials(deepcopy=False,
                                           states=[TrialState.COMPLETE])
        print("Study statistics: ")
예제 #21
0
def main():
    """Train the panoptic Mask R-CNN model, on one GPU or with ChainerMN.

    Parses the command-line options, selects the device (through a
    'pure_nccl' communicator when ``--multi-node`` is given, otherwise
    through ``--gpu``), builds the datasets, model, optimizer and iterators,
    registers the trainer extensions (only on rank 0 when running
    multi-node) and runs the trainer.

    Exits with status 1 when neither ``--multi-node`` nor ``--gpu`` is
    given.
    """
    parser = argparse.ArgumentParser(
        formatter_class=argparse.ArgumentDefaultsHelpFormatter, )
    parser.add_argument('--model',
                        '-m',
                        choices=['resnet50', 'resnet101'],
                        default='resnet50',
                        help='Base model of Mask R-CNN.')
    parser.add_argument('--pooling-func',
                        '-p',
                        choices=['pooling', 'align', 'resize'],
                        default='align',
                        help='Pooling function.')
    parser.add_argument('--gpu', '-g', type=int, help='GPU id.')
    parser.add_argument('--multi-node',
                        action='store_true',
                        help='use multi node')
    default_max_epoch = 120
    parser.add_argument('--max-epoch',
                        type=float,
                        default=default_max_epoch,
                        help='epoch')
    parser.add_argument('--pretrained-model', help='pretrained model')
    parser.add_argument(
        '--notrain',
        choices=['pix', 'ins'],
        help='not training pixel or instance segmentation',
    )
    parser.add_argument(
        '--lr-base',
        default=0.00125,
        type=float,
        help='learning rate per batch size 1',
    )
    parser.add_argument(
        '--noaugmentation',
        action='store_true',
        help='not apply data augmentation',
    )
    parser.add_argument(
        '--pix-loss-scale',
        default=1.,
        type=float,
        help='scale of pixel loss',
    )
    parser.add_argument(
        '--dataset',
        default='occlusion',
        choices=['occlusion', 'occlusion+synthetic'],
        help='dataset',
    )
    args = parser.parse_args()

    if args.multi_node:
        # chainermn is imported lazily so single-GPU runs do not need it.
        import chainermn
        comm = chainermn.create_communicator('pure_nccl')
        device = comm.intra_rank

        args.n_node = comm.inter_size
        args.n_gpu = comm.size
        chainer.cuda.get_device_from_id(device).use()
    else:
        if args.gpu is None:
            print(
                '--gpu option is required if --multi-node is not specified.',
                file=sys.stderr,
            )
            sys.exit(1)
        args.n_node = 1
        args.n_gpu = 1
        chainer.cuda.get_device_from_id(args.gpu).use()
        device = args.gpu

    args.seed = 0
    now = datetime.datetime.now()
    args.timestamp = now.isoformat()

    # Rank 0 chooses the timestamped output directory and broadcasts it so
    # that every process uses the same path.
    if not args.multi_node or comm.rank == 0:
        out = osp.join(here, 'logs', now.strftime('%Y%m%d_%H%M%S.%f'))
    else:
        out = None
    if args.multi_node:
        args.out = comm.bcast_obj(out)
    else:
        args.out = out
    del out

    # 0.00125 * 8 = 0.01  in original
    args.batch_size = 1 * args.n_gpu
    args.lr = args.lr_base * args.batch_size
    args.weight_decay = 0.0001

    # lr / 10 at 120k iteration with
    # 160k iteration * 16 batchsize in original
    args.step_size = [(120e3 / 180e3) * args.max_epoch,
                      (160e3 / 180e3) * args.max_epoch]

    random.seed(args.seed)
    np.random.seed(args.seed)

    # Default Config
    # args.min_size = 800
    # args.max_size = 1333
    # args.anchor_scales = (2, 4, 8, 16, 32)
    args.min_size = 600
    args.max_size = 1000
    args.anchor_scales = (4, 8, 16, 32)
    args.rpn_dim = 512

    # -------------------------------------------------------------------------
    # Dataset

    train_data = \
        instance_occlsegm.datasets.PanopticOcclusionSegmentationDataset(
            'train', augmentation=not args.noaugmentation
        )
    if args.dataset == 'occlusion+synthetic':
        synthetic_data = \
            instance_occlsegm.datasets.\
            PanopticOcclusionSegmentationSyntheticDataset(
                do_aug=not args.noaugmentation,
                size=len(train_data),
            )
        train_data = chainer.datasets.ConcatenatedDataset(
            train_data, synthetic_data)
    test_data = \
        instance_occlsegm.datasets.PanopticOcclusionSegmentationDataset(
            'test'
        )
    fg_class_names = test_data.class_names
    args.class_names = fg_class_names.tolist()
    test_data_list = test_data.get_video_datasets()
    del test_data

    # -------------------------------------------------------------------------
    # Model + Optimizer.

    if args.pooling_func == 'align':
        pooling_func = cmr.functions.roi_align_2d
    elif args.pooling_func == 'pooling':
        pooling_func = chainer.functions.roi_pooling_2d
    elif args.pooling_func == 'resize':
        pooling_func = cmr.functions.crop_and_resize
    else:
        raise ValueError

    args.mask_loss = 'softmax'
    assert args.model in ['resnet50', 'resnet101']
    # Drop the 'resnet' prefix by slicing: str.lstrip() strips a *set of
    # characters*, not a prefix, and only worked here by coincidence.
    n_layers = int(args.model[len('resnet'):])
    mask_rcnn = instance_occlsegm.models.MaskRCNNPanopticResNet(
        n_layers=n_layers,
        n_fg_class=len(fg_class_names),
        pretrained_model=args.pretrained_model,
        pooling_func=pooling_func,
        anchor_scales=args.anchor_scales,
        min_size=args.min_size,
        max_size=args.max_size,
        rpn_dim=args.rpn_dim,
    )
    mask_rcnn.nms_thresh = 0.3
    mask_rcnn.score_thresh = 0.05

    model = instance_occlsegm.models.MaskRCNNPanopticTrainChain(
        mask_rcnn,
        notrain=args.notrain,
        pix_loss_scale=args.pix_loss_scale,
    )
    if args.multi_node or args.gpu >= 0:
        model.to_gpu()

    optimizer = chainer.optimizers.MomentumSGD(lr=args.lr, momentum=0.9)
    if args.multi_node:
        optimizer = chainermn.create_multi_node_optimizer(optimizer, comm)
    optimizer.setup(model)
    optimizer.add_hook(chainer.optimizer.WeightDecay(rate=args.weight_decay))

    # Freeze the first extractor stages and every AffineChannel2D link.
    mask_rcnn.extractor.conv1.disable_update()
    mask_rcnn.extractor.bn1.disable_update()
    mask_rcnn.extractor.res2.disable_update()
    for link in mask_rcnn.links():
        if isinstance(link, cmr.links.AffineChannel2D):
            link.disable_update()

    # -------------------------------------------------------------------------
    # Iterator.

    train_data = chainer.datasets.TransformDataset(
        train_data,
        instance_occlsegm.datasets.MaskRCNNPanopticTransform(mask_rcnn),
    )
    test_data_list = [
        chainer.datasets.TransformDataset(
            td,
            instance_occlsegm.datasets.MaskRCNNPanopticTransform(
                mask_rcnn,
                train=False,
            )) for td in test_data_list
    ]
    test_concat_data = chainer.datasets.ConcatenatedDataset(*test_data_list)
    if args.multi_node:
        # Only rank 0 supplies the dataset; scatter_dataset distributes it.
        if comm.rank != 0:
            train_data = None
        train_data = chainermn.scatter_dataset(train_data, comm, shuffle=True)

    # for training
    train_iter = chainer.iterators.MultiprocessIterator(
        train_data,
        batch_size=1,
        n_processes=14,
        shared_mem=10**9,
    )
    # for evaluation
    test_iters = {
        i: chainer.iterators.SerialIterator(td,
                                            batch_size=1,
                                            repeat=False,
                                            shuffle=False)
        for i, td in enumerate(test_data_list)
    }
    # for visualization
    test_concat_iter = chainer.iterators.SerialIterator(test_concat_data,
                                                        batch_size=1,
                                                        repeat=False,
                                                        shuffle=False)

    # -------------------------------------------------------------------------

    converter = functools.partial(
        cmr.datasets.concat_examples,
        padding=0,
        # img, bboxes, labels, masks, scales, lbls_vis, lbls_occ
        indices_concat=[0, 2, 3, 4, 5, 6],
        indices_to_device=[0, 1, 5, 6],
    )
    updater = chainer.training.updater.StandardUpdater(train_iter,
                                                       optimizer,
                                                       device=device,
                                                       converter=converter)

    trainer = training.Trainer(updater, (args.max_epoch, 'epoch'),
                               out=args.out)

    trainer.extend(extensions.FailOnNonNumber())
    trainer.extend(extensions.ExponentialShift('lr', 0.1),
                   trigger=training.triggers.ManualScheduleTrigger(
                       args.step_size, 'epoch'))

    eval_interval = 1, 'epoch'
    log_interval = 10, 'iteration'
    plot_interval = 0.1, 'epoch'
    print_interval = log_interval

    # Evaluation, snapshots, logging and plotting run on a single process.
    if not args.multi_node or comm.rank == 0:
        evaluator = \
            instance_occlsegm.extensions.PanopticSegmentationVOCEvaluator(
                test_iters,
                model.mask_rcnn,
                device=device,
                use_07_metric=False,
                label_names=fg_class_names,
            )
        trainer.extend(evaluator, trigger=eval_interval)
        trainer.extend(extensions.snapshot_object(model.mask_rcnn,
                                                  'snapshot_model.npz'),
                       trigger=training.triggers.MaxValueTrigger(
                           'validation/main/mpq', eval_interval))
        args.git_hash = cmr.utils.git_hash()
        args.hostname = socket.gethostname()
        trainer.extend(fcn.extensions.ParamsReport(args.__dict__))
        trainer.extend(
            instance_occlsegm.extensions.PanopticSegmentationVisReport(
                test_concat_iter, model.mask_rcnn, label_names=fg_class_names),
            trigger=eval_interval,
        )
        trainer.extend(chainer.training.extensions.observe_lr(),
                       trigger=log_interval)
        trainer.extend(extensions.LogReport(trigger=log_interval))
        trainer.extend(
            extensions.PrintReport([
                'iteration',
                'epoch',
                'elapsed_time',
                'lr',
                'main/loss',
                'main/roi_loc_loss',
                'main/roi_cls_loss',
                'main/roi_mask_loss',
                'main/rpn_loc_loss',
                'main/rpn_cls_loss',
                'main/pix_vis_loss',
                'main/pix_occ_loss',
                'validation/main/miou',
                'validation/main/mpq',
            ], ),
            trigger=print_interval,
        )
        trainer.extend(extensions.ProgressBar(update_interval=10))

        # plot
        assert extensions.PlotReport.available()
        trainer.extend(
            extensions.PlotReport(
                [
                    'main/roi_loc_loss',
                    'main/roi_cls_loss',
                    'main/roi_mask_loss',
                    'main/rpn_loc_loss',
                    'main/rpn_cls_loss',
                    'main/ins_loss',
                    'main/pix_vis_loss',
                    # Each key needs its own trailing comma: adjacent string
                    # literals are concatenated into a single bogus key.
                    'main/pix_occ_loss',
                    'main/pix_loss',
                    'main/loss',
                ],
                file_name='loss.png',
                trigger=plot_interval,
            ),
            trigger=plot_interval,
        )
        trainer.extend(
            extensions.PlotReport([
                'validation/main/miou/vis',
                'validation/main/miou/occ',
                'validation/main/miou',
                'validation/main/map',
                'validation/main/msq',
                'validation/main/mdq',
                'validation/main/mpq',
            ],
                                  file_name='accuracy.png',
                                  trigger=plot_interval),
            trigger=eval_interval,
        )

        trainer.extend(extensions.dump_graph('main/loss'))

    trainer.run()
                        type=str,
                        default=env.channel_input_dirs['train'])
    parser.add_argument('--test',
                        type=str,
                        default=env.channel_input_dirs['test'])

    args = parser.parse_args()

    train_file = np.load(os.path.join(args.train, 'train.npz'))
    test_file = np.load(os.path.join(args.test, 'test.npz'))

    logger.info('Current host: {}'.format(args.host))

    communicator = 'naive' if args.num_gpus == 0 else args.communicator

    comm = chainermn.create_communicator(communicator)
    device = comm.intra_rank if args.num_gpus > 0 else -1

    print('==========================================')
    print('Using {} communicator'.format(comm))
    print('Num unit: {}'.format(args.units))
    print('Num Minibatch-size: {}'.format(args.batch_size))
    print('Num epoch: {}'.format(args.epochs))
    print('==========================================')

    model = L.Classifier(MLP(args.units, 10))
    if device >= 0:
        chainer.cuda.get_device(device).use()

    # Create a multi node optimizer from a standard Chainer optimizer.
    optimizer = chainermn.create_multi_node_optimizer(
예제 #23
0
def main():
    """Train an SSD detector on EPIC-Kitchens bounding boxes with ChainerMN.

    Parses the command-line options, creates a communicator, builds the
    model and the scattered training dataset, sets up a multi-node
    MomentumSGD optimizer and runs the trainer for 18 epochs. Reporting and
    snapshot extensions are registered on rank 0 only.
    """
    parser = argparse.ArgumentParser()
    parser.add_argument('--model',
                        choices=('ssd300', 'ssd512'),
                        default='ssd300')
    parser.add_argument('--batchsize', type=int, default=32)
    parser.add_argument('--lr', type=float, default=1e-3)
    parser.add_argument('--out', default='result')
    parser.add_argument('--resume')
    args = parser.parse_args()

    # The communicator is created with its default type; the GPU id is the
    # communicator's intra_rank.
    comm = chainermn.create_communicator()
    device = comm.intra_rank

    # argparse restricts --model to these two choices, so one branch always
    # runs.
    if args.model == 'ssd300':
        model = SSD300(n_fg_class=len(epic_kitchens_bbox_category_names),
                       pretrained_model='imagenet')
    elif args.model == 'ssd512':
        model = SSD512(n_fg_class=len(epic_kitchens_bbox_category_names),
                       pretrained_model='imagenet')

    model.use_preset('evaluate')
    train_chain = MultiboxTrainChain(model)
    chainer.cuda.get_device_from_id(device).use()
    model.to_gpu()

    train = EpicKitchensBboxDataset(year='2018', split='train')
    # Only rank 0 builds the index array; the other ranks pass None and get
    # their share from scatter_dataset below.
    if comm.rank == 0:
        indices = np.arange(len(train))
    else:
        indices = None
    train = TransformDataset(train, ('img', 'mb_loc', 'mb_label'),
                             Transform(model.coder, model.insize, model.mean))

    # Indices (not the examples themselves) are scattered, then used to
    # slice the locally constructed dataset.
    indices = chainermn.scatter_dataset(indices, comm, shuffle=True)
    train = train.slice[indices]

    # http://chainermn.readthedocs.io/en/latest/tutorial/tips_faqs.html#using-multiprocessiterator
    if hasattr(multiprocessing, 'set_start_method'):
        multiprocessing.set_start_method('forkserver')
    train_iter = chainer.iterators.MultiprocessIterator(train,
                                                        args.batchsize,
                                                        n_processes=2)

    optimizer = chainermn.create_multi_node_optimizer(
        chainer.optimizers.MomentumSGD(), comm)
    optimizer.setup(train_chain)
    # Parameters named 'b' get GradientScaling(2); all others get weight
    # decay.
    for param in train_chain.params():
        if param.name == 'b':
            param.update_rule.add_hook(GradientScaling(2))
        else:
            param.update_rule.add_hook(WeightDecay(0.0005))

    updater = training.updaters.StandardUpdater(train_iter,
                                                optimizer,
                                                device=device)
    trainer = training.Trainer(updater, (18, 'epoch'), args.out)
    # Start from --lr and multiply it by 0.1 at epochs 12 and 15.
    trainer.extend(extensions.ExponentialShift('lr', 0.1, init=args.lr),
                   trigger=triggers.ManualScheduleTrigger([12, 15], 'epoch'))

    # Logging, progress display and snapshots are registered on rank 0 only.
    if comm.rank == 0:
        log_interval = 10, 'iteration'
        trainer.extend(
            extensions.LogReport(log_name='log.json', trigger=log_interval))
        trainer.extend(extensions.observe_lr(), trigger=log_interval)
        trainer.extend(extensions.PrintReport([
            'epoch', 'iteration', 'lr', 'main/loss', 'main/loss/loc',
            'main/loss/conf'
        ]),
                       trigger=log_interval)
        trainer.extend(extensions.ProgressBar(update_interval=1))

        trainer.extend(extensions.snapshot_object(
            model, 'model_iter_{.updater.iteration}.npz'),
                       trigger=(1, 'epoch'))

    # Optionally restore the whole trainer state from a snapshot file.
    if args.resume:
        serializers.load_npz(args.resume, trainer)

    trainer.run()
예제 #24
0
def main(argv):
    """Train the ``gAtt`` model on parallel text with ChainerMN.

    Rank 0 creates the output directory and loss log and splits the loaded
    id sequences into sentences; the sentences are then scattered to every
    process. After each epoch rank 0 appends the loss of the last mini-batch
    to the log and saves a model snapshot.

    Args:
        argv: Command-line arguments in the form
            ``[prog, data_path, out_path, layer, epoch]``. When fewer than
            five items are given, the usage line is printed and the process
            exits.
    """
    if len(argv) < 5:
        print("python " + argv[0] + " data_path out_path layer epoch")
        sys.exit(0)

    data_path = argv[1]
    out_path = argv[2]
    layer = int(argv[3])
    epoch = int(argv[4])

    x_file = os.path.join(data_path, "en.txt")
    y_file = os.path.join(data_path, "ja.txt")
    vocab_path = os.path.join(data_path, "vocab.dump")

    # Dictionary mapping words to ids.
    # NOTE(review): pickle.load executes arbitrary code from the file; only
    # use vocabulary dumps from a trusted source.
    with open(vocab_path, "rb") as f:
        vocab = pickle.load(f)

    train_data1 = load_data(x_file, vocab)
    train_data2 = load_data(y_file, vocab)

    eos_id = vocab["<eos>"]
    batch_size = 256
    demb = 256
    drop_out = 0.5
    model = gAtt(layer, len(vocab) + 1, demb, drop_out)

    comm = chainermn.create_communicator("single_node")
    device = comm.intra_rank
    chainer.cuda.get_device_from_id(device).use()
    model.to_gpu(device)
    optimizer = chainermn.create_multi_node_optimizer(
        chainer.optimizers.Adam(), comm)
    optimizer.setup(model)

    # Only rank 0 opens a loss log; it stays None on every other rank. The
    # try/finally guarantees the handle is closed even if training fails.
    loss_out = None
    try:
        if comm.rank == 0:
            # Create the output directory and the loss log file.
            date = datetime.datetime.today()
            folder_name = "_".join(
                [str(date.year), str(date.month),
                 str(date.day)])
            out_path = (os.path.join(out_path, folder_name, "".join(
                ["layer", str(layer)])) + os.sep)
            os.makedirs(os.path.dirname(out_path), exist_ok=True)

            loss_out_path = os.path.join(
                out_path, "".join(["loss_",
                                   str(epoch), "_",
                                   str(layer), ".csv"]))
            loss_out = open(loss_out_path, "w")
            print(
                "epoch:",
                epoch,
                " batch:",
                batch_size,
                " drop:",
                drop_out,
                " demb:",
                demb,
                " layer:",
                layer,
                end="\n",
                file=loss_out,
            )

        xs = []
        ys = []
        # Tokens of the sentence currently being assembled. The buffer is
        # shared by both loops below, exactly as in the original code.
        s = []
        if comm.rank == 0:
            # Build the source sentences: <eos> terminates each sentence.
            for token_id in train_data1:
                if token_id != eos_id:
                    s.append(token_id)
                else:
                    xs.append(xp.asarray(s, dtype=xp.int32))
                    s = []
            # Build the target sentences the same way.
            for token_id in train_data2:
                if token_id != eos_id:
                    s.append(token_id)
                else:
                    ys.append(xp.asarray(s, dtype=xp.int32))
                    s = []
        # Distribute the data to every process.
        xs = chainermn.scatter_dataset(xs, comm)
        ys = chainermn.scatter_dataset(ys, comm)

        loss = None
        for cnt in range(epoch):
            index = np.random.permutation(len(xs))
            for pos in range(0, len(xs), batch_size):
                # Assemble the mini-batch.
                batch_ids = index[pos:pos + batch_size]
                batch_xs = [xs[idx] for idx in batch_ids]
                batch_ys = [ys[idx] for idx in batch_ids]
                model.cleargrads()
                # Create the initial hidden and cell states.
                hx = chainer.Variable(
                    xp.zeros((2 * layer, len(batch_xs), demb),
                             dtype=xp.float32))
                cx = chainer.Variable(
                    xp.zeros((2 * layer, len(batch_xs), demb),
                             dtype=xp.float32))
                # Run one training step.
                loss = model(hx, cx, batch_xs, batch_ys, len(batch_xs),
                             vocab)
                loss.backward()
                optimizer.update()
                print(cnt + 1, " : ", pos + len(batch_xs), "/", len(xs),
                      " finished")
            if comm.rank == 0:
                print(loss.array, end="\n", file=loss_out)
                out_file = out_path + "nsteplstm-" + str(layer) + "-" + str(
                    cnt) + ".model"
                # Save from the CPU, then move the model back to the device
                # selected above.
                model.to_cpu()
                serializers.save_npz(out_file, model)
                model.to_gpu(device)
    finally:
        if loss_out is not None:
            loss_out.close()
    parser = argparse.ArgumentParser()

    parser.add_argument('--num-gpus', type=int, default=env.num_gpus)
    parser.add_argument('--communicator',
                        type=str,
                        default='naive' if env.num_gpus == 0 else 'pure_nccl')
    parser.add_argument('--current_host', type=str, default=env.current_host)
    parser.add_argument('--hosts', type=str, default=env.hosts)
    parser.add_argument('--output-data-dir',
                        type=str,
                        default=env.output_data_dir)

    args = parser.parse_args()

    comm = chainermn.create_communicator(args.communicator)

    num_hosts = len(args.hosts)
    print('process %s on host %s of %s starting' %
          (comm.intra_rank, args.current_host, num_hosts))

    if comm.intra_rank == 1 and args.current_host != 'algo-1':
        os.makedirs(args.output_data_dir)
        # this sleep time must be longer than the polling interval to check if mpi is finished.
        print('process %s on host %s of %s sleeping' %
              (comm.intra_rank, args.current_host, num_hosts))

        time.sleep(20)
        open(os.path.join(args.output_data_dir, 'process_could_complete'),
             'a').close()
예제 #26
0
def main():
    """Train the MQAP 3D-CNN model with ChainerMN, driven by a JSON config.

    Parses the command line, creates the communicator, reads entry
    ``args.config`` of the ``'Config'`` list in the JSON config file, builds
    the dataset, model, optimizer, trainer and trainer extensions, optionally
    resumes from the highest-numbered snapshot found in the output directory,
    writes the run metadata on rank 0 and finally runs the trainer.

    Raises:
        SystemExit: If ``--gpu`` is combined with the ``naive`` communicator.
        ValueError: If ``config['optimizer']`` is not one of the supported
            optimizer names.
        FileNotFoundError: If ``--resume`` is given but the output directory
            contains no snapshot file.
    """
    import chainermn
    chainer.global_config.autotune = True
    parser = argparse.ArgumentParser(description='ChainerMN example: Train MQAP using 3DCNN')
    parser.add_argument('--communicator', type=str,
                        default='hierarchical', help='Type of communicator')
    parser.add_argument('--gpu', '-g', action='store_true',
                        help='Use GPU')
    parser.add_argument('--out', '-o', default='result',
                        help='Directory to output the result')
    parser.add_argument('--resume', '-r', action='store_true',
                        help='Resume the training from snapshot')
    parser.add_argument('--weight', '-w', action='store_true',
                        help='Resume only weight')
    parser.add_argument('--config', '-c', type=int, default=0,
                        help='Number of config')
    parser.add_argument('--config_file', type=str, default='./data/config.json',
                        help='Config file path')

    args = parser.parse_args()
    if args.gpu:
        if args.communicator == 'naive':
            print("Error: 'naive' communicator does not support GPU.\n")
            raise SystemExit(-1)
        comm = chainermn.create_communicator(args.communicator, allreduce_grad_dtype='float16')
        device = comm.intra_rank
    else:
        if args.communicator != 'naive':
            print('Warning: using naive communicator '
                  'because only naive supports CPU-only execution')
        comm = chainermn.create_communicator('naive')
        device = -1

    # Read the selected configuration and release the file handle right away.
    with open(args.config_file, 'r') as f:
        config = json.load(f)['Config'][args.config]
    # Each configuration index gets its own sub-directory of the output dir.
    args.out = os.path.join(args.out, str(args.config))
    if comm.rank == 0:
        print('==========================================')
        chainer.print_runtime_info()
        print('Num process (COMM_WORLD): {}'.format(comm.size))
        if args.gpu:
            print('Using GPUs')
        print('Using {} communicator'.format(args.communicator))
        print('Num epoch: {}'.format(config['epoch']))
        print('Batch size:  {}'.format(config['batch_size'] * comm.size))
        print('Optimizer:  {}'.format(config['optimizer']))
        print('Learning Rate:  {}'.format(config['learning_rate']))
        print('Out Directory:  {}'.format(args.out))
        print('Vertex feature:  {}'.format(config['vertex_feature']))
        if config['global_mode']:
            print('Using Global loss')
        if config['local_mode']:
            print('Using local loss')
            print('Local type : {}'.format(config['local_type']))
            print('Local label : {}'.format(config['local_label']))
        print('==========================================')
    d = Dataproc(size=comm.size, rank=comm.rank, config=config)
    if device >= 0:
        # get_device() is deprecated; get_device_from_id() is the
        # equivalent call for an integer device id.
        chainer.cuda.get_device_from_id(device).use()
    # sub_comm = comm.split(comm.rank // comm.intra_size, comm.rank)
    if config['local_type'] == 'Regression':
        local_loss_func = F.mean_squared_error
    else:
        local_loss_func = F.sigmoid_cross_entropy
    global_loss_func = F.mean_squared_error
    model = build_model(config=config, comm=comm)
    model = Classifier(predictor=model, local_loss_func=local_loss_func, global_loss_func=global_loss_func,
                       config=config)
    if device >= 0:
        model.to_gpu()
    train, test = d.get_dataset(key='train'), d.get_dataset(key='test')
    train_iter = I.SerialIterator(dataset=train, batch_size=config['batch_size'], repeat=True, shuffle=True)
    test_iter = I.SerialIterator(dataset=test, batch_size=config['batch_size'], repeat=False, shuffle=False)
    # train_iter = I.MultiprocessIterator(dataset=train, batch_size=args.batch, repeat=True, shuffle=True, n_processes=10)
    # test_iter = I.MultiprocessIterator(dataset=test, batch_size=args.batch, repeat=False, shuffle=True, n_processes=10)

    optimizer_name = config['optimizer']
    if optimizer_name == 'Adam':
        optimizer = chainer.optimizers.Adam(alpha=config['learning_rate'],
                                            weight_decay_rate=config['weight_decay_rate'], amsgrad=True)
        optimizer = chainermn.create_multi_node_optimizer(optimizer, comm, double_buffering=False)
    elif optimizer_name == 'MomentumSGD':
        optimizer = chainer.optimizers.MomentumSGD(lr=config['learning_rate'])
        optimizer = chainermn.create_multi_node_optimizer(optimizer, comm, double_buffering=False)
    elif optimizer_name == 'SMORMS3':
        optimizer = chainer.optimizers.SMORMS3(lr=config['learning_rate'])
        optimizer = chainermn.create_multi_node_optimizer(optimizer, comm, double_buffering=False)
    elif optimizer_name == 'Eve':
        # Eve ships its own multi-node wrapper in the project package.
        from my_optimizer.eve import Eve, create_multi_node_optimizer
        optimizer = Eve(alpha=config['learning_rate'])
        optimizer = create_multi_node_optimizer(optimizer, comm, double_buffering=False)
    elif optimizer_name == 'Adabound':
        from my_optimizer.adabound import Adam as Adabound
        optimizer = Adabound(alpha=config['learning_rate'], adabound=True, amsgrad=True,
                             weight_decay_rate=config['weight_decay_rate'])
        optimizer = chainermn.create_multi_node_optimizer(optimizer, comm, double_buffering=False)
    else:
        # Fail with a clear message instead of a NameError on `optimizer`.
        raise ValueError('Unsupported optimizer: {}'.format(optimizer_name))
    optimizer.setup(model)
    val_interval = 1, 'epoch'
    log_interval = 1, 'epoch'
    updater = training.StandardUpdater(train_iter, optimizer, device=device, converter=d.get_converter())
    trainer = training.Trainer(updater, (config['epoch'], 'epoch'), out=args.out)
    evaluator = GraphEvaluator(iterator=test_iter, target=model.predictor, device=device, converter=d.get_converter(),
                               comm=comm, local_loss_func=local_loss_func, global_loss_func=global_loss_func,
                               name='val', config=config)
    evaluator = chainermn.create_multi_node_evaluator(evaluator, comm)
    trainer.extend(evaluator, trigger=val_interval)
    # Reporting, plotting and snapshot extensions are registered on rank 0 only.
    if comm.rank == 0:
        trainer.extend(extensions.dump_graph('main/loss'))
        trainer.extend(extensions.snapshot(), trigger=val_interval)
        trainer.extend(extensions.LogReport(trigger=log_interval))
        trainer.extend(extensions.PlotReport(['main/loss', 'val/main/loss'], 'epoch', file_name='loss.png'),
                       trigger=val_interval)
        report_list = ['epoch', 'main/loss', 'val/main/loss']
        if config['global_mode']:
            report_list.extend(['main/global_loss', 'val/main/global_loss', 'val/main/global_pearson'])
            trainer.extend(extensions.PlotReport(['main/global_loss', 'val/main/global_loss'], 'epoch',
                                                 file_name='global_loss.png'), trigger=val_interval)
        if config['local_mode']:
            report_list.extend(['main/local_loss', 'val/main/local_loss', 'val/main/local_mean_pearson'])
            if config['local_type'] == 'Classification':
                report_list.append('val/main/local_auc')
                trainer.extend(extensions.PlotReport(['val/main/local_auc'], 'epoch', file_name='local_auc.png'),
                               trigger=val_interval)
            else:
                report_list.append('val/main/local_pearson')
        report_list.append('elapsed_time')
        trainer.extend(extensions.PrintReport(report_list), trigger=log_interval)
        trainer.extend(extensions.ProgressBar(update_interval=10))
    if args.resume:
        snap_list = [p for p in os.listdir(args.out) if 'snapshot' in p]
        if not snap_list:
            raise FileNotFoundError('No snapshot file found in {}'.format(args.out))
        # Resume from the snapshot whose first embedded number is the largest.
        number_pattern = re.compile(r"[+-]?[0-9]+[\.]?[0-9]*[eE]?[+-]?[0-9]*")
        snap_num = np.array([int(number_pattern.findall(p)[0]) for p in snap_list])
        path = snap_list[np.argmax(snap_num)]
        path = os.path.join(args.out, path)
        if args.weight:
            # Restore only the predictor weights stored inside the trainer snapshot.
            obj_path = 'updater/model:main/predictor/'
            chainer.serializers.load_npz(path, model.predictor, obj_path)
        else:
            chainer.serializers.load_npz(path, trainer)
    if comm.rank == 0:
        # Persist the run metadata next to the training results.
        protein_name_dict = d.get_protein_name_dict()
        out_path = Path(args.out)
        out_path.mkdir(parents=True, exist_ok=True)
        np.savez(os.path.join(args.out, 'protein_name'), **protein_name_dict)
        with open(os.path.join(args.out, 'config.json'), 'w') as f:
            json.dump(config, f, ensure_ascii=False, indent=4, sort_keys=True, separators=(',', ': '))
        with open(os.path.join(args.out, 'args.json'), 'w') as f:
            json.dump(vars(args), f)
    if comm.rank == 0:
        print('train start!!!')
    trainer.run()
예제 #27
0
 def test_allreduce_persistent_gpu(self):
     """Run the shared ``_test`` helper twice on GPU with a 'flat' communicator."""
     communicator = chainermn.create_communicator('flat')
     example_model = ExampleModel()
     # Last flag: False was labelled the CuPy run and True the ChainerX run
     # in the original comments -- confirm against ``_test``'s signature.
     for last_flag in (False, True):
         self._test(communicator, example_model, True, last_flag)
예제 #28
0
def main():
    """Train Mask R-CNN on the ARC2017 instance datasets.

    Runs either on a single GPU (``--gpu``) or across several processes with
    ChainerMN (``--multi-node``). Hyper-parameters that are not exposed on
    the command line (seed, batch size, learning rate, weight decay, step
    schedule, output directory) are derived here and stored on ``args`` so
    that ``ParamsReport`` records them.

    Exits through ``parser.error`` when neither ``--multi-node`` nor
    ``--gpu`` is given.

    Raises:
        ValueError: If the dataset, pooling function or model name is not
            one of the supported values.
    """
    parser = argparse.ArgumentParser(
        formatter_class=argparse.ArgumentDefaultsHelpFormatter)
    parser.add_argument('dataset',
                        choices=['real', 'synthetic'],
                        help='The dataset.')
    parser.add_argument('--model',
                        '-m',
                        choices=['vgg16', 'resnet50', 'resnet101'],
                        default='resnet50',
                        help='Base model of Mask R-CNN.')
    parser.add_argument('--pooling-func',
                        '-pf',
                        choices=['pooling', 'align', 'resize'],
                        default='align',
                        help='Pooling function.')
    parser.add_argument('--gpu', '-g', type=int, help='GPU id.')
    parser.add_argument('--multi-node',
                        '-mn',
                        action='store_true',
                        help='use multi node')
    parser.add_argument('--max-epoch',
                        type=float,
                        help='Epoch (default: 12.17)')
    args = parser.parse_args()

    # --gpu has no default: without --multi-node a missing value would only
    # surface much later as a TypeError on ``args.gpu >= 0``. Fail fast.
    if not args.multi_node and args.gpu is None:
        parser.error('--gpu is required unless --multi-node is given')

    if args.multi_node:
        import chainermn
        comm = chainermn.create_communicator('hierarchical')
        device = comm.intra_rank

        args.n_node = comm.inter_size
        args.n_gpu = comm.size
        chainer.cuda.get_device_from_id(device).use()
    else:
        args.n_node = 1
        args.n_gpu = 1
        chainer.cuda.get_device_from_id(args.gpu).use()
        device = args.gpu

    args.seed = 0
    now = datetime.datetime.now()
    args.timestamp = now.isoformat()
    args.out = osp.join(here, 'logs/train_mrcnn',
                        now.strftime('%Y%m%d_%H%M%S'))

    # 0.00125 * 8 = 0.01  in original
    args.batch_size = 1 * args.n_gpu
    args.lr = 0.00125 * args.batch_size
    args.weight_decay = 0.0001

    if args.max_epoch is None:
        # (180e3 * 8) / len(coco_trainval)
        args.max_epoch = (180e3 * 8) / 118287
    # lr / 10 at 120k iteration with
    # 160k iteration * 16 batchsize in original
    args.step_size = [(120e3 / 180e3) * args.max_epoch,
                      (160e3 / 180e3) * args.max_epoch]

    random.seed(args.seed)
    np.random.seed(args.seed)

    # Default Config
    min_size = 600
    max_size = 1000
    anchor_scales = [4, 8, 16, 32]
    proposal_creator_params = dict(
        n_train_pre_nms=12000,
        n_train_post_nms=2000,
        n_test_pre_nms=6000,
        n_test_post_nms=1000,
        min_size=0,
    )

    # if args.dataset == 'voc':
    #     train_data = mrcnn.datasets.SBDInstanceSeg('train')
    #     test_data = mrcnn.datasets.VOC2012InstanceSeg('val')
    # elif args.dataset == 'coco':
    #     train_data = chainer.datasets.ConcatenatedDataset(
    #         mrcnn.datasets.CocoInstanceSeg('train'),
    #         mrcnn.datasets.CocoInstanceSeg('valminusminival'),
    #     )
    #     test_data = mrcnn.datasets.CocoInstanceSeg('minival')
    #     train_data.class_names = test_data.class_names
    #     min_size = 800
    #     max_size = 1333
    # else:
    #     raise ValueError
    # instance_class_names = train_data.class_names[1:]
    # train_data = mrcnn.datasets.MaskRcnnDataset(train_data)
    # test_data = mrcnn.datasets.MaskRcnnDataset(test_data)

    if args.dataset == 'real':
        train_data = contrib.datasets.ARC2017RealInstancesDataset(
            'train', aug='standard')
    elif args.dataset == 'synthetic':
        train_data = contrib.datasets.ARC2017SyntheticInstancesDataset(
            do_aug=True, aug_level='all')
    else:
        raise ValueError
    test_data = contrib.datasets.ARC2017RealInstancesDataset('test')
    # The first class name is dropped from the instance classes.
    instance_class_names = train_data.class_names[1:]
    train_data = MaskRcnnDataset(train_data)
    test_data = MaskRcnnDataset(test_data)

    if args.pooling_func == 'align':
        pooling_func = mrcnn.functions.roi_align_2d
    elif args.pooling_func == 'pooling':
        pooling_func = chainer.functions.roi_pooling_2d
    elif args.pooling_func == 'resize':
        pooling_func = mrcnn.functions.crop_and_resize
    else:
        raise ValueError

    if args.model == 'vgg16':
        mask_rcnn = mrcnn.models.MaskRCNNVGG16(
            n_fg_class=len(instance_class_names),
            pretrained_model='imagenet',
            pooling_func=pooling_func,
            anchor_scales=anchor_scales,
            proposal_creator_params=proposal_creator_params,
            min_size=min_size,
            max_size=max_size)
    elif args.model in ['resnet50', 'resnet101']:
        # Slice off the 'resnet' prefix explicitly: str.lstrip() takes a
        # set of characters, not a prefix, and only worked here by accident.
        n_layers = int(args.model[len('resnet'):])
        mask_rcnn = mrcnn.models.MaskRCNNResNet(
            n_layers=n_layers,
            n_fg_class=len(instance_class_names),
            pretrained_model='imagenet',
            pooling_func=pooling_func,
            anchor_scales=anchor_scales,
            proposal_creator_params=proposal_creator_params,
            min_size=min_size,
            max_size=max_size)
    else:
        raise ValueError
    mask_rcnn.use_preset('evaluate')
    model = mrcnn.models.MaskRCNNTrainChain(
        mask_rcnn,
        proposal_target_creator=mrcnn.utils.ProposalTargetCreator(
            n_sample=512),
    )
    if args.multi_node or args.gpu >= 0:
        model.to_gpu()

    optimizer = chainer.optimizers.MomentumSGD(lr=args.lr, momentum=0.9)
    if args.multi_node:
        optimizer = chainermn.create_multi_node_optimizer(optimizer, comm)
    optimizer.setup(model)
    optimizer.add_hook(chainer.optimizer.WeightDecay(rate=args.weight_decay))

    if args.model in ['resnet50', 'resnet101']:
        # Exclude the first ResNet stages from parameter updates.
        model.mask_rcnn.extractor.mode = 'res3+'
        mask_rcnn.extractor.conv1.disable_update()
        mask_rcnn.extractor.bn1.disable_update()
        mask_rcnn.extractor.res2.disable_update()

    train_data = chainer.datasets.TransformDataset(
        train_data, mrcnn.datasets.MaskRCNNTransform(mask_rcnn))
    test_data = chainer.datasets.TransformDataset(
        test_data, mrcnn.datasets.MaskRCNNTransform(mask_rcnn, train=False))
    if args.multi_node:
        # Only rank 0 passes real datasets to scatter_dataset.
        if comm.rank != 0:
            train_data = None
            test_data = None
        train_data = chainermn.scatter_dataset(train_data, comm, shuffle=True)
        test_data = chainermn.scatter_dataset(test_data, comm)

    train_iter = chainer.iterators.MultiprocessIterator(train_data,
                                                        batch_size=1,
                                                        n_prefetch=4,
                                                        shared_mem=10**8)
    test_iter = chainer.iterators.MultiprocessIterator(test_data,
                                                       batch_size=1,
                                                       n_prefetch=4,
                                                       shared_mem=10**8,
                                                       repeat=False,
                                                       shuffle=False)

    updater = chainer.training.updater.StandardUpdater(
        train_iter,
        optimizer,
        device=device,
        converter=mrcnn.datasets.concat_examples)

    trainer = training.Trainer(updater, (args.max_epoch, 'epoch'),
                               out=args.out)

    # Multiply the learning rate by 0.1 at each epoch listed in step_size.
    trainer.extend(extensions.ExponentialShift('lr', 0.1),
                   trigger=training.triggers.ManualScheduleTrigger(
                       args.step_size, 'epoch'))

    eval_interval = 1, 'epoch'
    log_interval = 20, 'iteration'
    plot_interval = 0.1, 'epoch'
    print_interval = 20, 'iteration'

    evaluator = mrcnn.extensions.InstanceSegmentationVOCEvaluator(
        test_iter,
        model.mask_rcnn,
        device=device,
        use_07_metric=True,
        label_names=instance_class_names)
    if args.multi_node:
        evaluator = chainermn.create_multi_node_evaluator(evaluator, comm)
    trainer.extend(evaluator, trigger=eval_interval)

    # Snapshot, reporting and plotting extensions run on a single process.
    if not args.multi_node or comm.rank == 0:
        trainer.extend(extensions.snapshot_object(model.mask_rcnn,
                                                  'snapshot_model.npz'),
                       trigger=training.triggers.MaxValueTrigger(
                           'validation/main/map', eval_interval))
        args.git_hash = mrcnn.utils.git_hash()
        args.hostname = socket.gethostname()
        trainer.extend(fcn.extensions.ParamsReport(args.__dict__))
        trainer.extend(mrcnn.extensions.InstanceSegmentationVisReport(
            test_iter, model.mask_rcnn, label_names=instance_class_names),
                       trigger=eval_interval)
        trainer.extend(chainer.training.extensions.observe_lr(),
                       trigger=log_interval)
        trainer.extend(extensions.LogReport(trigger=log_interval))
        trainer.extend(extensions.PrintReport([
            'iteration', 'epoch', 'elapsed_time', 'lr', 'main/loss',
            'main/roi_loc_loss', 'main/roi_cls_loss', 'main/roi_mask_loss',
            'main/rpn_loc_loss', 'main/rpn_cls_loss', 'validation/main/map'
        ]),
                       trigger=print_interval)
        trainer.extend(extensions.ProgressBar(update_interval=10))

        # plot
        assert extensions.PlotReport.available()
        trainer.extend(
            extensions.PlotReport([
                'main/loss',
                'main/roi_loc_loss',
                'main/roi_cls_loss',
                'main/roi_mask_loss',
                'main/rpn_loc_loss',
                'main/rpn_cls_loss',
            ],
                                  file_name='loss.png',
                                  trigger=plot_interval),
            trigger=plot_interval,
        )
        trainer.extend(
            extensions.PlotReport(['validation/main/map'],
                                  file_name='accuracy.png',
                                  trigger=plot_interval),
            trigger=eval_interval,
        )

        trainer.extend(extensions.dump_graph('main/loss'))

    trainer.run()
예제 #29
0
def main():
    """Run a short (200-iteration) multi-node ImageNet training job.

    Parses the command line, switches multiprocessing to the ``forkserver``
    start method, creates the ChainerMN communicator, builds the selected
    architecture on this process's GPU, scatters the training dataset from
    rank 0 and trains with a multi-node MomentumSGD optimizer. Rank 0 writes
    to ``--out``; every other rank writes to a fresh temporary directory.
    """
    info = collections.OrderedDict()

    parser = argparse.ArgumentParser(
        description='Learning convnet from ILSVRC2012 dataset')
    parser.add_argument('train', help='Path to training image-label list file')
    parser.add_argument('val', help='Path to validation image-label list file')
    parser.add_argument('--root_train', default='.',
                        help='Root directory path of training image files')
    parser.add_argument('--root_val', default='.',
                        help='Root directory path of validation image files')
    parser.add_argument('--arch', '-a', choices=archs.keys(),
                        default='resnet50_akiba', help='Convnet architecture')
    parser.add_argument('--batchsize', '-B', type=int, default=32,
                        help='Learning minibatch size')
    parser.add_argument('--loaderjob', '-j', type=int,
                        help='Number of parallel data loading processes')
    parser.add_argument('--out', '-o', default='result',
                        help='Output directory')
    parser.add_argument('--communicator', default='hierarchical')
    parser.set_defaults(test=False)
    args = parser.parse_args()

    #
    # Multiprocessing start method
    #
    # The start method has to be switched (and the fork server actually
    # launched by a dummy process) *before* the communicator is created;
    # see the ChainerMN tips on using MultiprocessIterator.
    multiprocessing.set_start_method('forkserver')
    p = multiprocessing.Process()
    p.start()
    p.join()

    #
    # ChainerMN initialization
    #
    comm = chainermn.create_communicator(args.communicator)
    device = comm.intra_rank
    # get_device() is deprecated; get_device_from_id() is the equivalent
    # call for an integer device id.
    chainer.cuda.get_device_from_id(device).use()
    chainer.cuda.set_max_workspace_size(1 * 1024 * 1024 * 1024)

    #
    # Logging
    #
    if comm.rank == 0:
        result_directory = args.out
    else:
        # Non-root ranks get a throw-away output directory.
        import tempfile
        result_directory = tempfile.mkdtemp(dir='/tmp/')

    #
    # Model
    #
    model = archs[args.arch]()
    model.to_gpu()

    #
    # Dataset
    #
    # Only rank 0 builds the dataset; scatter_dataset hands out the parts.
    if comm.rank == 0:
        train = dataset.PreprocessedDataset(
            args.train, args.root_train, model.insize)
    else:
        train = None
    train = chainermn.scatter_dataset(train, comm)

    train_iter = chainer.iterators.MultiprocessIterator(
        train, args.batchsize, n_processes=args.loaderjob)

    #
    # Optimizer
    #
    global_batchsize = comm.size * args.batchsize
    # Learning rate scales linearly with the global batch size (0.1 at 256).
    lr = 0.1 * global_batchsize / 256
    if comm.rank == 0:
        print('global_batchsize:', global_batchsize)
        print('Num of GPUs:', comm.size)

    weight_decay = 0.0001
    optimizer = chainer.optimizers.MomentumSGD(lr=lr, momentum=0.9)
    optimizer = chainermn.create_multi_node_optimizer(optimizer, comm)
    optimizer.setup(model)
    optimizer.add_hook(chainer.optimizer.WeightDecay(weight_decay))
    info['training'] = {
        'local_batchsize': args.batchsize,
        'global_batchsize': global_batchsize,
        'lr': lr
    }

    #
    # Trainer
    #
    log_interval = (10, 'iteration')
    stop_trigger = (200, 'iteration')

    updater = training.StandardUpdater(train_iter, optimizer, device=device)
    trainer = training.Trainer(updater, stop_trigger, result_directory)

    log_report_ext = extensions.LogReport(trigger=log_interval)
    trainer.extend(log_report_ext)

    if comm.rank == 0:
        trainer.extend(extensions.ProgressBar(update_interval=10))

    trainer.run()
예제 #30
0
def main():
    """Train an ImageNet convnet across processes with ChainerMN.

    Parses the command line, starts the multiprocessing fork server, creates
    the communicator, builds the selected architecture on this process's
    GPU, scatters the training and validation datasets from rank 0, and runs
    a trainer with a multi-node optimizer, checkpointer and evaluator.
    Display/log extensions are registered on rank 0 only.

    Raises:
        RuntimeError: If CUDA is not available.
    """
    # Check if GPU is available
    # (ImageNet example does not support CPU execution)
    if not chainer.cuda.available:
        raise RuntimeError('ImageNet requires GPU support.')

    archs = {
        'alex': alex.Alex,
        'googlenet': googlenet.GoogLeNet,
        'googlenetbn': googlenetbn.GoogLeNetBN,
        'nin': nin.NIN,
        'resnet50': resnet50.ResNet50,
    }

    parser = argparse.ArgumentParser(
        description='Learning convnet from ILSVRC2012 dataset')
    parser.add_argument('train', help='Path to training image-label list file')
    parser.add_argument('val', help='Path to validation image-label list file')
    parser.add_argument('--arch',
                        '-a',
                        choices=archs.keys(),
                        default='nin',
                        help='Convnet architecture')
    parser.add_argument('--batchsize',
                        '-B',
                        type=int,
                        default=32,
                        help='Learning minibatch size')
    parser.add_argument('--epoch',
                        '-E',
                        type=int,
                        default=10,
                        help='Number of epochs to train')
    parser.add_argument('--initmodel',
                        help='Initialize the model from given file')
    parser.add_argument('--loaderjob',
                        '-j',
                        type=int,
                        help='Number of parallel data loading processes')
    parser.add_argument('--mean',
                        '-m',
                        default='mean.npy',
                        help='Mean file (computed by compute_mean.py)')
    parser.add_argument('--resume',
                        '-r',
                        default='',
                        help='Initialize the trainer from given file')
    parser.add_argument('--out',
                        '-o',
                        default='result',
                        help='Output directory')
    parser.add_argument('--root',
                        '-R',
                        default='.',
                        help='Root directory path of image files')
    parser.add_argument('--val_batchsize',
                        '-b',
                        type=int,
                        default=250,
                        help='Validation minibatch size')
    parser.add_argument('--test', action='store_true')
    parser.add_argument('--communicator', default='hierarchical')
    parser.set_defaults(test=False)
    args = parser.parse_args()

    # Start method of multiprocessing module need to be changed if we
    # are using InfiniBand and MultiprocessIterator. This is because
    # processes often crash when calling fork if they are using
    # Infiniband.  (c.f.,
    # https://www.open-mpi.org/faq/?category=tuning#fork-warning )
    # Also, just setting the start method does not seem to be
    # sufficient to actually launch the forkserver processes, so also
    # start a dummy process.
    # See also our document:
    # https://chainermn.readthedocs.io/en/stable/tutorial/tips_faqs.html#using-multiprocessiterator
    # This must be done *before* ``chainermn.create_communicator``!!!
    multiprocessing.set_start_method('forkserver')
    # The dummy process carries no target: under 'forkserver' the Process
    # object is pickled, and a locally defined lambda cannot be pickled.
    p = multiprocessing.Process()
    p.start()
    p.join()

    # Prepare ChainerMN communicator.
    comm = chainermn.create_communicator(args.communicator)
    device = comm.intra_rank

    if comm.rank == 0:
        print('==========================================')
        print('Num process (COMM_WORLD): {}'.format(comm.size))
        print('Using {} communicator'.format(args.communicator))
        print('Using {} arch'.format(args.arch))
        print('Num Minibatch-size: {}'.format(args.batchsize))
        print('Num epoch: {}'.format(args.epoch))
        print('==========================================')

    model = archs[args.arch]()
    if args.initmodel:
        print('Load model from', args.initmodel)
        chainer.serializers.load_npz(args.initmodel, model)

    chainer.cuda.get_device_from_id(device).use()  # Make the GPU current
    model.to_gpu()

    # Split and distribute the dataset. Only worker 0 loads the whole dataset.
    # Datasets of worker 0 are evenly split and distributed to all workers.
    mean = np.load(args.mean)
    if comm.rank == 0:
        train = PreprocessedDataset(args.train, args.root, mean, model.insize)
        val = PreprocessedDataset(args.val, args.root, mean, model.insize,
                                  False)
    else:
        train = None
        val = None
    train = chainermn.scatter_dataset(train, comm, shuffle=True)
    val = chainermn.scatter_dataset(val, comm)

    # A workaround for processes crash should be done before making
    # communicator above, when using fork (e.g. MultiProcessIterator)
    # along with Infiniband.
    train_iter = chainer.iterators.MultiprocessIterator(
        train, args.batchsize, n_processes=args.loaderjob)
    val_iter = chainer.iterators.MultiprocessIterator(
        val, args.val_batchsize, repeat=False, n_processes=args.loaderjob)

    # Create a multi node optimizer from a standard Chainer optimizer.
    optimizer = chainermn.create_multi_node_optimizer(
        chainer.optimizers.MomentumSGD(lr=0.01, momentum=0.9), comm)
    optimizer.setup(model)

    # Set up a trainer
    updater = training.StandardUpdater(train_iter, optimizer, device=device)
    trainer = training.Trainer(updater, (args.epoch, 'epoch'), args.out)

    # --test shortens every interval to 10 iterations.
    checkpoint_interval = (10, 'iteration') if args.test else (1, 'epoch')
    val_interval = (10, 'iteration') if args.test else (1, 'epoch')
    log_interval = (10, 'iteration') if args.test else (1, 'epoch')

    checkpointer = chainermn.create_multi_node_checkpointer(
        name='imagenet-example', comm=comm)
    checkpointer.maybe_load(trainer, optimizer)
    trainer.extend(checkpointer, trigger=checkpoint_interval)

    # Create a multi node evaluator from an evaluator.
    evaluator = TestModeEvaluator(val_iter, model, device=device)
    evaluator = chainermn.create_multi_node_evaluator(evaluator, comm)
    trainer.extend(evaluator, trigger=val_interval)

    # Some display and output extensions are necessary only for one worker.
    # (Otherwise, there would just be repeated outputs.)
    if comm.rank == 0:
        trainer.extend(extensions.DumpGraph('main/loss'))
        trainer.extend(extensions.LogReport(trigger=log_interval))
        trainer.extend(extensions.observe_lr(), trigger=log_interval)
        trainer.extend(extensions.PrintReport([
            'epoch', 'iteration', 'main/loss', 'validation/main/loss',
            'main/accuracy', 'validation/main/accuracy', 'lr'
        ]),
                       trigger=log_interval)
        trainer.extend(extensions.ProgressBar(update_interval=10))

    if args.resume:
        chainer.serializers.load_npz(args.resume, trainer)

    trainer.run()
예제 #31
0
def main():
    """Train Faster R-CNN FPN on COCO 2017 across processes with ChainerMN.

    ``--iteration`` and ``--step`` are expressed for a total batch size of
    16; the stop trigger, the learning-rate steps and the final model
    snapshot are all rescaled by ``16 / args.batchsize``.
    """
    parser = argparse.ArgumentParser()
    parser.add_argument(
        '--model',
        choices=('faster_rcnn_fpn_resnet50', 'faster_rcnn_fpn_resnet101'),
        default='faster_rcnn_fpn_resnet50')
    parser.add_argument('--batchsize', type=int, default=16)
    parser.add_argument('--iteration', type=int, default=90000)
    parser.add_argument('--step', type=int, nargs='*', default=[60000, 80000])
    parser.add_argument('--out', default='result')
    parser.add_argument('--resume')
    args = parser.parse_args()

    # https://docs.chainer.org/en/stable/chainermn/tutorial/tips_faqs.html#using-multiprocessiterator
    if hasattr(multiprocessing, 'set_start_method'):
        multiprocessing.set_start_method('forkserver')
        p = multiprocessing.Process()
        p.start()
        p.join()

    comm = chainermn.create_communicator()
    device = comm.intra_rank

    if args.model == 'faster_rcnn_fpn_resnet50':
        model = FasterRCNNFPNResNet50(
            n_fg_class=len(coco_bbox_label_names), pretrained_model='imagenet')
    elif args.model == 'faster_rcnn_fpn_resnet101':
        model = FasterRCNNFPNResNet101(
            n_fg_class=len(coco_bbox_label_names), pretrained_model='imagenet')

    model.use_preset('evaluate')
    train_chain = TrainChain(model)
    chainer.cuda.get_device_from_id(device).use()
    train_chain.to_gpu()

    train = TransformDataset(
        COCOBboxDataset(year='2017', split='train'),
        ('img', 'bbox', 'label'), transform)

    # Scatter indices instead of the dataset itself: rank 0 supplies the
    # full index range and each process slices the dataset with its share.
    if comm.rank == 0:
        indices = np.arange(len(train))
    else:
        indices = None
    indices = chainermn.scatter_dataset(indices, comm, shuffle=True)
    train = train.slice[indices]

    # --batchsize is the total batch size; each process takes an equal part.
    train_iter = chainer.iterators.MultithreadIterator(
        train, args.batchsize // comm.size)

    optimizer = chainermn.create_multi_node_optimizer(
        chainer.optimizers.MomentumSGD(), comm)
    optimizer.setup(train_chain)
    optimizer.add_hook(WeightDecay(0.0001))

    # Exclude the first backbone stages and all batch-normalization links
    # from parameter updates.
    model.extractor.base.conv1.disable_update()
    model.extractor.base.res2.disable_update()
    for link in model.links():
        if isinstance(link, L.BatchNormalization):
            link.disable_update()

    # Total number of updates, rescaled from the batch-size-16 reference.
    total_iteration = args.iteration * 16 / args.batchsize

    updater = training.updaters.StandardUpdater(
        train_iter, optimizer, converter=converter, device=device)
    trainer = training.Trainer(
        updater, (total_iteration, 'iteration'), args.out)

    @make_shift('lr')
    def lr_schedule(trainer):
        """Return the learning rate for the trainer's current iteration."""
        base_lr = 0.02 * args.batchsize / 16
        warm_up_duration = 500
        warm_up_rate = 1 / 3

        iteration = trainer.updater.iteration
        if iteration < warm_up_duration:
            # Linear warm-up from warm_up_rate to 1.
            rate = warm_up_rate \
                + (1 - warm_up_rate) * iteration / warm_up_duration
        else:
            # Multiply by 0.1 for every (rescaled) step already passed.
            rate = 1
            for step in args.step:
                if iteration >= step * 16 / args.batchsize:
                    rate *= 0.1

        return base_lr * rate

    trainer.extend(lr_schedule)

    if comm.rank == 0:
        log_interval = 10, 'iteration'
        trainer.extend(extensions.LogReport(trigger=log_interval))
        trainer.extend(extensions.observe_lr(), trigger=log_interval)
        trainer.extend(extensions.PrintReport(
            ['epoch', 'iteration', 'lr', 'main/loss',
             'main/loss/rpn/loc', 'main/loss/rpn/conf',
             'main/loss/head/loc', 'main/loss/head/conf']),
            trigger=log_interval)
        trainer.extend(extensions.ProgressBar(update_interval=10))

        trainer.extend(extensions.snapshot(), trigger=(10000, 'iteration'))
        # Save the final model at the same iteration the trainer stops at.
        # This used to be hard-coded to 90000 and therefore ignored a
        # non-default --iteration.
        trainer.extend(
            extensions.snapshot_object(
                model, 'model_iter_{.updater.iteration}'),
            trigger=(total_iteration, 'iteration'))

    if args.resume:
        serializers.load_npz(args.resume, trainer, strict=False)

    trainer.run()
예제 #32
0
def main():
    """Train a classifier on several ChainerMN workers from a YAML config.

    Command-line options select the config file, the results directory, an
    optional classifier snapshot to load before training, a process number
    that is embedded in the snapshot file name, and the seed used for the
    train/validation split.

    Rank 0 loads and splits the dataset, which is then scattered to all
    workers.  Reporting, progress and snapshot extensions are registered on
    rank 0 only; the evaluator and the learning-rate shift run on every
    rank.
    """
    parser = argparse.ArgumentParser()
    parser.add_argument('--config_path',
                        type=str,
                        default='configs/base.yml',
                        help='path to config file')
    parser.add_argument('--results_dir',
                        type=str,
                        default='./result/',
                        help='directory to save the results to')
    parser.add_argument('--resume',
                        type=str,
                        default='',
                        help='path to the snapshot')
    parser.add_argument('--process_num', type=int, default=0)
    parser.add_argument('--seed', type=int, default=42)

    args = parser.parse_args()
    # Read the config inside a context manager so the file handle is closed
    # deterministically instead of being leaked.
    with open(args.config_path) as config_file:
        config = yaml_utils.Config(
            yaml.load(config_file, Loader=yaml.SafeLoader))
    pattern = "-".join([
        config.pattern, config.models['classifier']['name'],
        config.dataset['dataset_name']
    ])
    comm = chainermn.create_communicator()
    # One GPU per process: the rank inside the node doubles as device id.
    device = comm.intra_rank

    if comm.rank == 0:
        print('==========================================')
        print('Num process (COMM_WORLD): {}'.format(MPI.COMM_WORLD.Get_size()))
        print('Num Minibatch-size: {}'.format(config.batchsize))
        print('Num Epoch: {}'.format(config.epoch))
        print('==========================================')

    # Model
    classifier = load_models(config.models['classifier'])

    if args.resume:
        print("Resume training with snapshot:{}".format(args.resume))
        chainer.serializers.load_npz(args.resume, classifier)

    chainer.cuda.get_device_from_id(device).use()
    classifier.to_gpu()

    # Optimizer
    opt = make_optimizer(classifier, comm, config)
    opt.add_hook(chainer.optimizer.WeightDecay(5e-4))

    # Dataset: only rank 0 materializes it; the other ranks receive their
    # share through scatter_dataset below.
    if comm.rank == 0:
        dataset = yaml_utils.load_dataset(config)
        first_size = int(len(dataset) * config.train_val_split_ratio)
        train, val = chainer.datasets.split_dataset_random(dataset,
                                                           first_size,
                                                           seed=args.seed)
    else:
        # NOTE(review): the return value is discarded; presumably this only
        # imports the dataset module so the scattered dataset can be
        # reconstructed on this rank -- confirm against yaml_utils.
        yaml_utils.load_module(config.dataset['dataset_func'],
                               config.dataset['dataset_name'])
        train, val = None, None

    train = chainermn.scatter_dataset(train, comm)
    val = chainermn.scatter_dataset(val, comm)

    # Iterator
    train_iterator = chainer.iterators.SerialIterator(train, config.batchsize)
    val_iterator = chainer.iterators.SerialIterator(val,
                                                    config.batchsize,
                                                    repeat=False,
                                                    shuffle=False)
    # Copy the configured arguments so the live objects injected below are
    # not written back into the shared ``config`` object.
    kwargs = dict(config.updater['args']) if 'args' in config.updater else {}
    kwargs.update({
        'classifier': classifier,
        'iterator': train_iterator,
        'optimizer': opt,
        'device': device,
    })

    # Updater
    updater = yaml_utils.load_updater_class(config)
    updater = updater(**kwargs)
    out = args.results_dir + '/' + pattern

    if comm.rank == 0:
        create_result_dir(out, args.config_path, config)

    # Trainer
    trainer = training.Trainer(updater, (config.epoch, 'epoch'), out=out)

    # Evaluator
    evaluator = ClassifierEvaluator(val_iterator, classifier, device=device)
    evaluator = chainermn.create_multi_node_evaluator(evaluator, comm)
    trainer.extend(evaluator)

    # Learning Rate Schedule (fixed): multiply lr by 0.1 at 30%, 60% and 80%
    # of the configured number of epochs.
    schedule = [config.epoch * 0.3, config.epoch * 0.6, config.epoch * 0.8]
    trainer.extend(extensions.ExponentialShift('lr', 0.1),
                   trigger=ManualScheduleTrigger(schedule, 'epoch'))

    report_keys = [
        'epoch', 'main/loss', 'validation/main/loss', 'main/accuracy',
        'validation/main/accuracy', 'elapsed_time'
    ]
    if comm.rank == 0:
        # Set up logging; the classifier is snapshotted whenever the
        # validation accuracy reaches a new maximum.
        trainer.extend(extensions.snapshot_object(
            classifier, 'classifier{}.npz'.format(args.process_num)),
                       trigger=MaxValueTrigger('validation/main/accuracy'))
        trainer.extend(
            extensions.LogReport(keys=report_keys,
                                 trigger=(config.display_interval, 'epoch')))
        trainer.extend(extensions.PrintReport(report_keys),
                       trigger=(config.display_interval, 'epoch'))
        trainer.extend(
            extensions.ProgressBar(
                update_interval=config.progressbar_interval))
    # Run the training
    trainer.run()
예제 #33
0
def main():
    """Train a ResNet classifier on ILSVRC2012 with ChainerMN.

    Positional arguments give the roots of the train and validation
    datasets.  When ``--lr`` is omitted, the learning rate is derived from
    the total batch size (``batchsize * comm.size``) with the linear scaling
    rule.  Dataset indices are created on rank 0 and scattered to all
    workers; reporting and snapshot extensions are registered on rank 0
    only, while the learning-rate schedule and the evaluator run on every
    rank.
    """
    model_cfgs = {
        'resnet50': {
            'class': ResNet50,
            'score_layer_name': 'fc6',
            'kwargs': {
                'arch': 'fb'
            }
        },
        'resnet101': {
            'class': ResNet101,
            'score_layer_name': 'fc6',
            'kwargs': {
                'arch': 'fb'
            }
        },
        'resnet152': {
            'class': ResNet152,
            'score_layer_name': 'fc6',
            'kwargs': {
                'arch': 'fb'
            }
        }
    }
    parser = argparse.ArgumentParser(
        description='Learning convnet from ILSVRC2012 dataset')
    parser.add_argument('train', help='Path to root of the train dataset')
    parser.add_argument('val', help='Path to root of the validation dataset')
    parser.add_argument('--model',
                        '-m',
                        choices=model_cfgs.keys(),
                        default='resnet50',
                        help='Convnet models')
    parser.add_argument('--communicator',
                        type=str,
                        default='pure_nccl',
                        help='Type of communicator')
    parser.add_argument('--loaderjob', type=int, default=4)
    parser.add_argument('--batchsize',
                        type=int,
                        default=32,
                        help='Batch size for each worker')
    parser.add_argument('--lr', type=float)
    parser.add_argument('--momentum', type=float, default=0.9)
    parser.add_argument('--weight_decay', type=float, default=0.0001)
    parser.add_argument('--out', type=str, default='result')
    parser.add_argument('--epoch', type=int, default=90)
    args = parser.parse_args()

    # https://docs.chainer.org/en/stable/chainermn/tutorial/tips_faqs.html#using-multiprocessiterator
    # A dummy process is started before the communicator is created, as
    # described on the page linked above.
    if hasattr(multiprocessing, 'set_start_method'):
        multiprocessing.set_start_method('forkserver')
        p = multiprocessing.Process()
        p.start()
        p.join()

    comm = chainermn.create_communicator(args.communicator)
    # One GPU per process: the rank inside the node doubles as device id.
    device = comm.intra_rank

    if args.lr is not None:
        lr = args.lr
    else:
        lr = 0.1 * (args.batchsize * comm.size) / 256
        if comm.rank == 0:
            print('lr={}: lr is selected based on the linear '
                  'scaling rule'.format(lr))

    label_names = directory_parsing_label_names(args.train)

    model_cfg = model_cfgs[args.model]
    extractor = model_cfg['class'](n_class=len(label_names),
                                   **model_cfg['kwargs'])
    extractor.pick = model_cfg['score_layer_name']
    model = Classifier(extractor)
    # Following https://arxiv.org/pdf/1706.02677.pdf,
    # the gamma of the last BN of each resblock is initialized by zeros.
    for link in model.links():
        if isinstance(link, Bottleneck):
            link.conv3.bn.gamma.data[:] = 0

    train_data = DirectoryParsingLabelDataset(args.train)
    val_data = DirectoryParsingLabelDataset(args.val)
    train_data = TransformDataset(train_data, ('img', 'label'),
                                  TrainTransform(extractor.mean))
    val_data = TransformDataset(val_data, ('img', 'label'),
                                ValTransform(extractor.mean))
    print('finished loading dataset')

    # Only the index arrays are scattered; every rank then slices its own
    # view out of the full datasets it constructed above.
    if comm.rank == 0:
        train_indices = np.arange(len(train_data))
        val_indices = np.arange(len(val_data))
    else:
        train_indices = None
        val_indices = None

    train_indices = chainermn.scatter_dataset(train_indices,
                                              comm,
                                              shuffle=True)
    val_indices = chainermn.scatter_dataset(val_indices, comm, shuffle=True)
    train_data = train_data.slice[train_indices]
    val_data = val_data.slice[val_indices]
    train_iter = chainer.iterators.MultiprocessIterator(
        train_data, args.batchsize, n_processes=args.loaderjob)
    val_iter = iterators.MultiprocessIterator(val_data,
                                              args.batchsize,
                                              repeat=False,
                                              shuffle=False,
                                              n_processes=args.loaderjob)

    optimizer = chainermn.create_multi_node_optimizer(
        CorrectedMomentumSGD(lr=lr, momentum=args.momentum), comm)
    optimizer.setup(model)
    # Weight decay is applied to every parameter except those named
    # 'beta' and 'gamma'.
    for param in model.params():
        if param.name not in ('beta', 'gamma'):
            param.update_rule.add_hook(WeightDecay(args.weight_decay))

    if device >= 0:
        # get_device_from_id replaces the deprecated chainer.cuda.get_device.
        chainer.cuda.get_device_from_id(device).use()
        model.to_gpu()

    updater = chainer.training.StandardUpdater(train_iter,
                                               optimizer,
                                               device=device)

    trainer = training.Trainer(updater, (args.epoch, 'epoch'), out=args.out)

    @make_shift('lr')
    def warmup_and_exponential_shift(trainer):
        """Return the learning rate for the current (fractional) epoch.

        For the first 5 epochs, and only when ``lr`` exceeds 0.1, the rate
        ramps linearly from 0.1 up to ``lr``.  Afterwards ``lr`` is scaled
        by 1, 0.1, 0.01 and 0.001 with boundaries at epochs 30, 60 and 80.
        """
        epoch = trainer.updater.epoch_detail
        warmup_epoch = 5
        if epoch < warmup_epoch:
            if lr > 0.1:
                warmup_rate = 0.1 / lr
                rate = warmup_rate \
                    + (1 - warmup_rate) * epoch / warmup_epoch
            else:
                rate = 1
        elif epoch < 30:
            rate = 1
        elif epoch < 60:
            rate = 0.1
        elif epoch < 80:
            rate = 0.01
        else:
            rate = 0.001
        return rate * lr

    trainer.extend(warmup_and_exponential_shift)
    evaluator = chainermn.create_multi_node_evaluator(
        extensions.Evaluator(val_iter, model, device=device), comm)
    trainer.extend(evaluator, trigger=(1, 'epoch'))

    log_interval = 0.1, 'epoch'
    print_interval = 0.1, 'epoch'

    if comm.rank == 0:
        trainer.extend(chainer.training.extensions.observe_lr(),
                       trigger=log_interval)
        trainer.extend(extensions.snapshot_object(
            extractor, 'snapshot_model_{.updater.epoch}.npz'),
                       trigger=(args.epoch, 'epoch'))
        trainer.extend(extensions.LogReport(trigger=log_interval))
        trainer.extend(extensions.PrintReport([
            'iteration', 'epoch', 'elapsed_time', 'lr', 'main/loss',
            'validation/main/loss', 'main/accuracy', 'validation/main/accuracy'
        ]),
                       trigger=print_interval)
        trainer.extend(extensions.ProgressBar(update_interval=10))

    trainer.run()
예제 #34
0
def main():
    """Train a KISS text localizer/recognizer pair with ChainerMN.

    Command-line arguments are parsed and then merged with the config file
    through ``parse_config``.  The train and validation datasets are built
    on rank 0 and scattered to all workers.  Each worker creates its own
    multi-node optimizers for the localizer and the recognizer.

    Snapshot, evaluation, logging, plotting and progress extensions are
    registered on rank 0 only.  The per-epoch learning-rate shift of the
    localizer optimizer is registered on every rank so that all workers
    keep stepping with the same ``alpha``.
    """
    parser = argparse.ArgumentParser(
        description="Train a KISS model",
        formatter_class=argparse.ArgumentDefaultsHelpFormatter)
    parser.add_argument("log_name", help="name of log")
    parser.add_argument("-c",
                        "--config",
                        default="config.cfg",
                        help="path to config file to use")
    parser.add_argument("-g",
                        "--gpu",
                        nargs='+',
                        default=["-1"],
                        help="gpu if to use (-1 means cpu)")
    parser.add_argument("-l",
                        "--log-dir",
                        default='tests',
                        help="path to log dir")
    parser.add_argument(
        "--snapshot-interval",
        type=int,
        default=10000,
        help="number of iterations after which a snapshot will be taken")
    parser.add_argument("--log-interval",
                        type=int,
                        default=100,
                        help="log interval")
    parser.add_argument(
        "--port",
        type=int,
        default=1337,
        help=
        "port that is used by bbox plotter to send predictions on test image")
    parser.add_argument(
        "--rl",
        dest="resume_localizer",
        help=
        "path to snapshot that is to be used to resume training of localizer")
    parser.add_argument(
        "--rr",
        dest="resume_recognizer",
        help="path to snapshot that us to be used to pre-initialize recognizer"
    )
    parser.add_argument("--num-layers",
                        type=int,
                        default=18,
                        help="Resnet Variant to use")
    parser.add_argument(
        "--no-imgaug",
        action='store_false',
        dest='use_imgaug',
        default=True,
        help=
        "disable image augmentation with `imgaug`, but use naive image augmentation instead"
    )
    parser.add_argument(
        "--rdr",
        "--rotation-dropout-ratio",
        dest="rotation_dropout_ratio",
        type=float,
        default=0,
        help="ratio for dropping rotation params in text localization network")
    parser.add_argument("--save-gradient-information",
                        action='store_true',
                        default=False,
                        help="enable tensorboard gradient plotter")
    parser.add_argument("--dump-graph",
                        action='store_true',
                        default=False,
                        help="dump computational graph to file")
    parser.add_argument("--image-mode",
                        default="RGB",
                        choices=["RGB", "L"],
                        help="mode in which images are to be loaded")
    parser.add_argument("--resume",
                        help="path to logdir from which training shall resume")

    args = parser.parse_args()
    args = parse_config(args.config, args)

    # comm = chainermn.create_communicator(communicator_name='flat')
    comm = chainermn.create_communicator()
    # The command-line --gpu value is overridden: every process uses the
    # device matching its rank inside the node.
    args.gpu = comm.intra_rank
    print(args.gpu)

    if args.resume is not None:
        log_dir = os.path.relpath(args.resume)
    else:
        log_dir = os.path.join(
            "logs", args.log_dir,
            "{}_{}".format(datetime.datetime.now().isoformat(), args.log_name))
    args.log_dir = log_dir

    # set dtype
    chainer.global_config.dtype = 'float32'

    if comm.rank == 0:
        # create log dir (exist_ok makes a separate existence check
        # unnecessary)
        os.makedirs(log_dir, exist_ok=True)

    report_keys = ["epoch", "iteration", "loss/localizer/loss"]

    if args.use_memory_manager:
        memory_manager = DatasetClient()
        memory_manager.connect()

        train_kwargs = {
            "memory_manager": memory_manager,
            "base_name": "train_file"
        }
        # recognition_kwargs = {"memory_manager": memory_manager, "base_name": "text_recognition_file"}
        validation_kwargs = {
            "memory_manager": memory_manager,
            "base_name": "val_file"
        }
    else:
        train_kwargs = {"npz_file": args.train_file}
        # recognition_kwargs = {"npz_file": args.text_recognition_file}
        validation_kwargs = {"npz_file": args.val_file}

    # Only rank 0 builds the datasets; the other ranks receive their share
    # through scatter_dataset below.
    if comm.rank == 0:
        train_dataset = TextRecognitionImageDataset(
            char_map=args.char_map,
            image_size=args.image_size,
            root=os.path.dirname(args.train_file),
            dtype=chainer.get_dtype(),
            use_imgaug=args.use_imgaug,
            transform_probability=0.4,
            keep_aspect_ratio=True,
            image_mode=args.image_mode,
            **train_kwargs,
        )

        validation_dataset = TextRecognitionImageDataset(
            char_map=args.char_map,
            image_size=args.image_size,
            root=os.path.dirname(args.val_file),
            dtype=chainer.get_dtype(),
            transform_probability=0,
            keep_aspect_ratio=True,
            image_mode=args.image_mode,
            **validation_kwargs,
        )
    else:
        train_dataset, validation_dataset = None, None

    train_dataset = scatter_dataset(train_dataset, comm)
    validation_dataset = scatter_dataset(validation_dataset, comm)

    # uncomment all commented parts of the code to train the model with extra recognizer training
    # text_recognition_dataset = TextRecognitionImageCharCropDataset(
    #     char_map=args.char_map,
    #     image_size=args.target_size,
    #     root=os.path.dirname(args.text_recognition_file),
    #     dtype=chainer.get_dtype(),
    #     transform_probability=0,
    #     image_mode=args.image_mode,
    #     gpu_id=args.gpu,
    #     reverse=False,
    #     resize_after_load=False,
    #     **recognition_kwargs,
    # )

    data_iter = chainer.iterators.MultithreadIterator(train_dataset,
                                                      args.batch_size)
    validation_iter = chainer.iterators.MultithreadIterator(validation_dataset,
                                                            args.batch_size,
                                                            repeat=False)
    # text_recognition_iter = chainer.iterators.MultithreadIterator(text_recognition_dataset, max(args.batch_size, 32))

    localizer = LSTMTextLocalizer(
        Size(*args.target_size),
        num_bboxes_to_localize=train_dataset.num_chars_per_word,
        num_layers=args.num_layers,
        dropout_ratio=args.rotation_dropout_ratio,
    )
    if args.resume_localizer is not None:
        load_pretrained_model(args.resume_localizer, localizer)

    recognizer = TransformerTextRecognizer(
        train_dataset.num_chars_per_word,
        train_dataset.num_words_per_image,
        train_dataset.num_classes,
        train_dataset.bos_token,
        num_layers=args.num_layers,
    )

    if args.resume_recognizer is not None:
        load_pretrained_model(args.resume_recognizer, recognizer)

    models = [localizer, recognizer]

    # Only rank 0 writes tensorboard summaries.
    if comm.rank == 0:
        tensorboard_handle = SummaryWriter(log_dir=args.log_dir, graph=None)
    else:
        tensorboard_handle = None

    localizer_optimizer = RAdam(alpha=args.learning_rate,
                                beta1=0.9,
                                beta2=0.98,
                                eps=1e-9)
    localizer_optimizer = chainermn.create_multi_node_optimizer(
        localizer_optimizer, comm)
    localizer_optimizer.setup(localizer)
    localizer_optimizer.add_hook(chainer.optimizer_hooks.GradientClipping(2))

    if args.save_gradient_information:
        localizer_optimizer.add_hook(
            TensorboardGradientPlotter(tensorboard_handle,
                                       args.log_interval), )

    recognizer_optimizer = RAdam(alpha=args.learning_rate)
    recognizer_optimizer = chainermn.create_multi_node_optimizer(
        recognizer_optimizer, comm)
    recognizer_optimizer.setup(recognizer)

    optimizers = [localizer_optimizer, recognizer_optimizer]

    # log train information every time we encounter a new epoch or
    # args.log_interval iterations have been done
    log_interval_trigger = (
        lambda trainer:
        (trainer.updater.is_new_epoch or trainer.updater.iteration % args.
         log_interval == 0) and trainer.updater.iteration > 0)

    updater_args = {
        "iterator": {
            'main': data_iter,
            # 'rec': text_recognition_iter,
        },
        "optimizer": {
            "opt_gen": localizer_optimizer,
            "opt_rec": recognizer_optimizer,
        },
        "tensorboard_handle": tensorboard_handle,
        "tensorboard_log_interval": log_interval_trigger,
        "recognizer_update_interval": 1,
        "device": args.gpu,
    }

    updater = TransformerTextRecognitionUpdater(models=[localizer, recognizer],
                                                **updater_args)

    trainer = chainer.training.Trainer(updater, (args.num_epoch, 'epoch'),
                                       out=args.log_dir)

    data_to_log = {
        'log_dir': args.log_dir,
        'image_size': args.image_size,
        'num_layers': args.num_layers,
        'num_chars': train_dataset.num_chars_per_word,
        'num_words': train_dataset.num_words_per_image,
        'num_classes': train_dataset.num_classes,
        'keep_aspect_ratio': train_dataset.keep_aspect_ratio,
        'localizer': get_import_info(localizer),
        'recognizer': get_import_info(recognizer),
        'bos_token': train_dataset.bos_token,
    }

    # Record every public attribute of the parsed arguments as well.
    for argument in filter(lambda x: not x.startswith('_'), dir(args)):
        data_to_log[argument] = getattr(args, argument)

    def backup_train_config(stats_cpu):
        """Add the training configuration to the first logged entry."""
        if stats_cpu['iteration'] == args.log_interval:
            stats_cpu.update(data_to_log)

    if comm.rank == 0:
        for model in models:
            trainer.extend(
                extensions.snapshot_object(
                    model,
                    model.__class__.__name__ + '_{.updater.iteration}.npz'),
                trigger=lambda trainer: trainer.updater.is_new_epoch or trainer
                .updater.iteration % args.snapshot_interval == 0,
            )

        trainer.extend(extensions.snapshot(filename='trainer_snapshot',
                                           autoload=args.resume is not None),
                       trigger=(args.snapshot_interval, 'iteration'))

        evaluation_function = TextRecognitionEvaluatorFunction(
            localizer, recognizer, args.gpu, train_dataset.blank_label,
            train_dataset.char_map)

        trainer.extend(
            TextRecognitionTensorboardEvaluator(
                validation_iter,
                localizer,
                device=args.gpu,
                eval_func=evaluation_function,
                tensorboard_handle=tensorboard_handle,
                num_iterations=200,
            ),
            trigger=(args.test_interval, 'iteration'),
        )

        # run the model on every configured test dataset; the evaluators are
        # triggered every args.snapshot_interval iterations
        test_dataset_prefix = "test_dataset_"
        test_datasets = [
            arg for arg in dir(args) if arg.startswith(test_dataset_prefix)
        ]
        for test_dataset_name in test_datasets:
            print(
                f"setting up testing for {test_dataset_name[len(test_dataset_prefix):]} dataset"
            )

            dataset_path = getattr(args, test_dataset_name)
            if args.use_memory_manager:
                test_kwargs = {
                    "memory_manager": memory_manager,
                    "base_name": test_dataset_name
                }
            else:
                test_kwargs = {"npz_file": dataset_path}

            test_dataset = TextRecognitionImageDataset(
                char_map=args.char_map,
                image_size=args.image_size,
                root=os.path.dirname(dataset_path),
                dtype=chainer.get_dtype(),
                transform_probability=0,
                keep_aspect_ratio=True,
                image_mode=args.image_mode,
                **test_kwargs,
            )
            test_iter = chainer.iterators.MultithreadIterator(test_dataset,
                                                              args.batch_size,
                                                              repeat=False)
            trainer.extend(TextRecognitionTensorboardEvaluator(
                test_iter,
                localizer,
                device=args.gpu,
                eval_func=evaluation_function,
                tensorboard_handle=tensorboard_handle,
                base_key=test_dataset_name[len(test_dataset_prefix):]),
                           trigger=(args.snapshot_interval, 'iteration'))

        models.append(updater)
        logger = Logger(
            os.path.dirname(os.path.realpath(__file__)),
            args.log_dir,
            postprocess=backup_train_config,
            trigger=log_interval_trigger,
            exclusion_filters=['*logs*', '*.pyc', '__pycache__', '.git*'],
            resume=args.resume is not None,
        )

        if args.test_image is not None:
            plot_image = train_dataset.load_image(args.test_image)
        else:
            plot_image = validation_dataset.get_example(0)['image']
        gt_bbox = None

        bbox_plotter = TextRecognitionBBoxPlotter(
            plot_image,
            os.path.join(args.log_dir, 'bboxes'),
            args.target_size,
            send_bboxes=True,
            upstream_port=args.port,
            visualization_anchors=[
                ["visual_backprop_anchors"],
            ],
            device=args.gpu,
            render_extracted_rois=True,
            num_rois_to_render=4,
            sort_rois=False,
            show_visual_backprop_overlay=True,
            visual_backprop_index=0,
            show_backprop_and_feature_vis=True,
            gt_bbox=gt_bbox,
            render_pca=False,
            log_name=args.log_name,
            char_map=train_dataset.char_map,
            blank_label=train_dataset.blank_label,
            predictors={
                "localizer": localizer,
                "recognizer": recognizer,
            },
        )
        trainer.extend(bbox_plotter, trigger=(10, 'iteration'))

        trainer.extend(logger, trigger=log_interval_trigger)
        trainer.extend(extensions.PrintReport(report_keys,
                                              log_report='Logger'),
                       trigger=log_interval_trigger)

    # learning rate shift after each epoch.  Every process owns its own
    # optimizer instance, so the shift has to be registered on all ranks;
    # registering it on rank 0 only would leave the other workers at the
    # initial alpha.
    trainer.extend(extensions.ExponentialShift(
        "alpha", 0.1, optimizer=localizer_optimizer),
                   trigger=(1, 'epoch'))

    if comm.rank == 0:
        trainer.extend(extensions.ProgressBar(update_interval=10))

        if args.dump_graph:
            trainer.extend(
                extensions.dump_graph('loss/localizer/loss',
                                      out_name='model.dot'))

        open_interactive_prompt(
            bbox_plotter=bbox_plotter,
            optimizer=optimizers,
        )

    trainer.run()
예제 #35
0
def test_deprecation():
    """Check that the listed communicator names emit DeprecationWarning."""
    for communicator_name in ('hierarchical', 'two_dimensional'):
        with chainer.testing.assert_warns(DeprecationWarning):
            chainermn.create_communicator(communicator_name)
예제 #36
0
def main():
    """Train a Faster R-CNN FPN detector on COCO with ChainerMN.

    ``--model`` selects the ResNet backbone and defaults to ``resnet50``.
    ``--batchsize`` is the total batch size; each worker processes
    ``batchsize // comm.size`` examples per iteration.  The number of
    iterations, the learning rate and its step boundaries are all scaled
    relative to a batch size of 16.  Reporting and snapshot extensions are
    registered on rank 0 only.
    """
    # Maps the --model choice to (detector class, pretrained base class).
    detectors = {
        'resnet50': (FasterRCNNFPNResNet50, ResNet50),
        'resnet101': (FasterRCNNFPNResNet101, ResNet101),
    }

    parser = argparse.ArgumentParser()
    # A default is required here: without it an omitted --model would leave
    # the model unbound and fail later with a NameError.
    parser.add_argument(
        '--model', choices=('resnet50', 'resnet101'), default='resnet50')
    parser.add_argument('--batchsize', type=int, default=16)
    parser.add_argument('--out', default='result')
    parser.add_argument('--resume')
    args = parser.parse_args()

    comm = chainermn.create_communicator()
    # One GPU per process: the rank inside the node doubles as device id.
    device = comm.intra_rank

    detector_cls, base_cls = detectors[args.model]
    model = detector_cls(
        n_fg_class=len(coco_bbox_label_names), mean='chainercv')
    # Initialize the backbone from ImageNet-pretrained weights.
    copyparams(model.extractor.base,
               base_cls(pretrained_model='imagenet', arch='he'))

    model.use_preset('evaluate')
    train_chain = TrainChain(model)
    chainer.cuda.get_device_from_id(device).use()
    train_chain.to_gpu()

    train = TransformDataset(
        ConcatenatedDataset(
            COCOBboxDataset(split='train'),
            COCOBboxDataset(split='valminusminival'),
        ), ('img', 'bbox', 'label'), transform)

    # Only the index array is scattered; every rank slices its own view out
    # of the full dataset.
    if comm.rank == 0:
        indices = np.arange(len(train))
    else:
        indices = None
    indices = chainermn.scatter_dataset(indices, comm, shuffle=True)
    train = train.slice[indices]

    train_iter = chainer.iterators.MultithreadIterator(
        train, args.batchsize // comm.size)

    optimizer = chainermn.create_multi_node_optimizer(
        chainer.optimizers.MomentumSGD(), comm)
    optimizer.setup(train_chain)
    optimizer.add_hook(WeightDecay(0.0001))

    # Freeze the first backbone stages and all batch normalization links.
    model.extractor.base.conv1.disable_update()
    model.extractor.base.res2.disable_update()
    for link in model.links():
        if isinstance(link, L.BatchNormalization):
            link.disable_update()

    updater = training.updaters.StandardUpdater(
        train_iter, optimizer, converter=converter, device=device)
    trainer = training.Trainer(
        updater, (90000 * 16 / args.batchsize, 'iteration'), args.out)

    def lr_schedule(updater):
        """Return the learning rate for the updater's current iteration.

        The rate ramps linearly from one third of the base rate to the full
        base rate over the first 500 iterations, then drops to 0.1x and
        0.01x at the batch-size scaled 60000 and 80000 iteration marks.
        """
        base_lr = 0.02 * args.batchsize / 16
        warm_up_duration = 500
        warm_up_rate = 1 / 3

        iteration = updater.iteration
        if iteration < warm_up_duration:
            rate = warm_up_rate \
                + (1 - warm_up_rate) * iteration / warm_up_duration
        elif iteration < 60000 * 16 / args.batchsize:
            rate = 1
        elif iteration < 80000 * 16 / args.batchsize:
            rate = 0.1
        else:
            rate = 0.01

        return base_lr * rate

    trainer.extend(ManualScheduler('lr', lr_schedule))

    if comm.rank == 0:
        log_interval = 10, 'iteration'
        trainer.extend(extensions.LogReport(trigger=log_interval))
        trainer.extend(extensions.observe_lr(), trigger=log_interval)
        trainer.extend(extensions.PrintReport(
            ['epoch', 'iteration', 'lr', 'main/loss',
             'main/loss/rpn/loc', 'main/loss/rpn/conf',
             'main/loss/head/loc', 'main/loss/head/conf']),
            trigger=log_interval)
        trainer.extend(extensions.ProgressBar(update_interval=10))

        trainer.extend(extensions.snapshot(), trigger=(10000, 'iteration'))
        trainer.extend(
            extensions.snapshot_object(
                model, 'model_iter_{.updater.iteration}'),
            trigger=(90000 * 16 / args.batchsize, 'iteration'))

    if args.resume:
        serializers.load_npz(args.resume, trainer, strict=False)

    trainer.run()
예제 #37
0
def main():
    """Train the GQN model data-parallel across MPI processes.

    All configuration is read from the module-level ``args``. Each process
    uses the GPU whose id equals its intra-node rank. Only rank 0 prints
    progress and writes hyperparameters, dataset statistics and model
    snapshots into ``args.snapshot_path``.

    Per training step the loss is the sum of the negative log-likelihood of
    the query image and the KL divergence accumulated over all generation
    steps, both divided by the batch size. The pixel standard deviation
    ``sigma_t`` is annealed linearly from ``pixel_sigma_i`` to
    ``pixel_sigma_f`` over ``pixel_n`` training steps.
    """
    try:
        os.mkdir(args.snapshot_path)
    except OSError:
        # Best effort: typically the directory already exists (earlier run,
        # or another process created it first). Only OSError is ignored so
        # that KeyboardInterrupt/SystemExit are no longer swallowed.
        pass

    comm = chainermn.create_communicator()
    device = comm.intra_rank
    print("device", device, "/", comm.size)
    # get_device_from_id is the non-deprecated way to select a GPU by id.
    cuda.get_device_from_id(device).use()
    xp = cupy

    dataset = gqn.data.Dataset(args.dataset_path)

    # Copy the command-line settings into the hyperparameter container.
    hyperparams = HyperParameters()
    hyperparams.generator_share_core = args.generator_share_core
    hyperparams.generator_share_prior = args.generator_share_prior
    hyperparams.generator_generation_steps = args.generation_steps
    hyperparams.inference_share_core = args.inference_share_core
    hyperparams.inference_share_posterior = args.inference_share_posterior
    hyperparams.channels_chz = args.channels_chz
    hyperparams.generator_channels_u = args.channels_u
    hyperparams.inference_channels_map_x = args.channels_map_x
    hyperparams.pixel_n = args.pixel_n
    hyperparams.pixel_sigma_i = args.initial_pixel_sigma
    hyperparams.pixel_sigma_f = args.final_pixel_sigma
    if comm.rank == 0:
        hyperparams.save(args.snapshot_path)
        hyperparams.print()

    model = Model(hyperparams, snapshot_directory=args.snapshot_path)
    model.to_gpu()

    optimizer = Optimizer(
        model.parameters,
        communicator=comm,
        mu_i=args.initial_lr,
        mu_f=args.final_lr)
    if comm.rank == 0:
        optimizer.print()

    dataset_mean, dataset_std = dataset.load_mean_and_std()

    if comm.rank == 0:
        np.save(os.path.join(args.snapshot_path, "mean.npy"), dataset_mean)
        np.save(os.path.join(args.snapshot_path, "std.npy"), dataset_std)

    # avoid division by zero
    dataset_std += 1e-12

    # Pixel variance buffers; refilled in place whenever sigma_t changes.
    sigma_t = hyperparams.pixel_sigma_i
    pixel_var = xp.full(
        (args.batch_size, 3) + hyperparams.image_size,
        sigma_t**2,
        dtype="float32")
    pixel_ln_var = xp.full(
        (args.batch_size, 3) + hyperparams.image_size,
        math.log(sigma_t**2),
        dtype="float32")

    # NOTE(review): every process seeds `random` identically, so presumably
    # the shuffles below agree across ranks and indexing by comm.rank gives
    # each rank a different subset -- confirm this is the intent.
    random.seed(0)
    subset_indices = list(range(len(dataset.subset_filenames)))

    current_training_step = 0
    for iteration in range(args.training_iterations):
        mean_kld = 0
        mean_nll = 0
        total_batch = 0
        subset_size_per_gpu = len(subset_indices) // comm.size
        start_time = time.time()

        for subset_loop in range(subset_size_per_gpu):
            random.shuffle(subset_indices)
            subset_index = subset_indices[comm.rank]
            subset = dataset.read(subset_index)
            iterator = gqn.data.Iterator(subset, batch_size=args.batch_size)

            for batch_index, data_indices in enumerate(iterator):
                # shape: (batch, views, height, width, channels)
                # range: [-1, 1]
                images, viewpoints = subset[data_indices]

                # preprocessing
                images = (images - dataset_mean) / dataset_std

                # (batch, views, height, width, channels) ->  (batch, views, channels, height, width)
                images = images.transpose((0, 1, 4, 2, 3))

                total_views = images.shape[1]

                # sample number of views
                num_views = random.choice(range(total_views))
                query_index = random.choice(range(total_views))

                if current_training_step == 0 and num_views == 0:
                    num_views = 1  # avoid OpenMPI error

                if num_views > 0:
                    r = model.compute_observation_representation(
                        images[:, :num_views], viewpoints[:, :num_views])
                else:
                    # No observed views: use an all-zero representation.
                    r = xp.zeros(
                        (args.batch_size, hyperparams.channels_r) +
                        hyperparams.chrz_size,
                        dtype="float32")
                    r = chainer.Variable(r)

                query_images = images[:, query_index]
                query_viewpoints = viewpoints[:, query_index]
                # transfer to gpu
                query_images = to_gpu(query_images)
                query_viewpoints = to_gpu(query_viewpoints)

                h0_gen, c0_gen, u_0, h0_enc, c0_enc = model.generate_initial_state(
                    args.batch_size, xp)

                loss_kld = 0

                hl_enc = h0_enc
                cl_enc = c0_enc
                hl_gen = h0_gen
                cl_gen = c0_gen
                ul_enc = u_0

                xq = model.inference_downsampler.downsample(query_images)

                # Run the inference and generation cores in lockstep,
                # accumulating the KL term between posterior and prior.
                for l in range(model.generation_steps):
                    inference_core = model.get_inference_core(l)
                    inference_posterior = model.get_inference_posterior(l)
                    generation_core = model.get_generation_core(l)
                    generation_prior = model.get_generation_prior(l)

                    h_next_enc, c_next_enc = inference_core.forward_onestep(
                        hl_gen, hl_enc, cl_enc, xq, query_viewpoints, r)

                    mean_z_q = inference_posterior.compute_mean_z(hl_enc)
                    ln_var_z_q = inference_posterior.compute_ln_var_z(hl_enc)
                    ze_l = cf.gaussian(mean_z_q, ln_var_z_q)

                    mean_z_p = generation_prior.compute_mean_z(hl_gen)
                    ln_var_z_p = generation_prior.compute_ln_var_z(hl_gen)

                    h_next_gen, c_next_gen, u_next_enc = generation_core.forward_onestep(
                        hl_gen, cl_gen, ul_enc, ze_l, query_viewpoints, r)

                    kld = gqn.nn.chainer.functions.gaussian_kl_divergence(
                        mean_z_q, ln_var_z_q, mean_z_p, ln_var_z_p)

                    loss_kld += cf.sum(kld)

                    hl_gen = h_next_gen
                    cl_gen = c_next_gen
                    ul_enc = u_next_enc
                    hl_enc = h_next_enc
                    cl_enc = c_next_enc

                mean_x = model.generation_observation.compute_mean_x(ul_enc)
                negative_log_likelihood = gqn.nn.chainer.functions.gaussian_negative_log_likelihood(
                    query_images, mean_x, pixel_var, pixel_ln_var)
                loss_nll = cf.sum(negative_log_likelihood)

                loss_nll /= args.batch_size
                loss_kld /= args.batch_size
                loss = loss_nll + loss_kld

                model.cleargrads()
                loss.backward()
                optimizer.update(current_training_step)

                if comm.rank == 0:
                    printr(
                        "Iteration {}: Subset {} / {}: Batch {} / {} - loss: nll: {:.3f} kld: {:.3f} - lr: {:.4e} - sigma_t: {:.6f}".
                        format(iteration + 1, subset_loop * comm.size + 1,
                               len(dataset), batch_index + 1,
                               len(subset) // args.batch_size,
                               float(loss_nll.data), float(loss_kld.data),
                               optimizer.learning_rate, sigma_t))

                # Anneal sigma_t linearly from si towards sf, clamped at sf.
                sf = hyperparams.pixel_sigma_f
                si = hyperparams.pixel_sigma_i
                sigma_t = max(
                    sf + (si - sf) *
                    (1.0 - current_training_step / hyperparams.pixel_n), sf)

                pixel_var[...] = sigma_t**2
                pixel_ln_var[...] = math.log(sigma_t**2)

                total_batch += 1
                # The step counter advances by the number of processes.
                current_training_step += comm.size
                mean_kld += float(loss_kld.data)
                mean_nll += float(loss_nll.data)

            if comm.rank == 0:
                model.serialize(args.snapshot_path)

        if comm.rank == 0:
            elapsed_time = time.time() - start_time
            # total_batch stays 0 when no batch was processed (for example
            # fewer subsets than processes); avoid a ZeroDivisionError in
            # the summary below.
            num_batches = max(total_batch, 1)
            print(
                "\033[2KIteration {} - loss: nll: {:.3f} kld: {:.3f} - lr: {:.4e} - sigma_t: {:.6f} - step: {} - elapsed_time: {:.3f} min".
                format(iteration + 1, mean_nll / num_batches,
                       mean_kld / num_batches, optimizer.learning_rate,
                       sigma_t, current_training_step, elapsed_time / 60))
            model.serialize(args.snapshot_path)
    
    train_data = np.load(os.path.join(args.train, 'train.npz'))['data']
    train_labels = np.load(os.path.join(args.train, 'train.npz'))['labels']

    test_data = np.load(os.path.join(args.test, 'test.npz'))['data']
    test_labels = np.load(os.path.join(args.test, 'test.npz'))['labels']

    train = chainer.datasets.TupleDataset(train_data, train_labels)
    test = chainer.datasets.TupleDataset(test_data, test_labels)

    # Set up a neural network to train.
    # Classifier reports softmax cross entropy loss and accuracy at every
    # iteration, which will be used by the PrintReport extension below.
    model = L.Classifier(net.VGG(10))

    comm = chainermn.create_communicator(args.communicator)

    # comm.inter_rank gives the rank of the node. This should only print on one node.
    if comm.inter_rank == 0:
        print('# Minibatch-size: {}'.format(args.batch_size))
        print('# epoch: {}'.format(args.epochs))
        print('# communicator: {}'.format(args.communicator))

    # Set up a neural network to train.
    # Classifier reports softmax cross entropy loss and accuracy at every
    # iteration, which will be used by the PrintReport extension below.

    # comm.intra_rank gives the rank of the process on a given node.
    device = comm.intra_rank if num_gpus > 0 else -1
    if device >= 0:
        chainer.cuda.get_device_from_id(device).use()
예제 #39
0
def check_mnist(gpu, display_log=True):
    """Train a small MLP on MNIST with ChainerMN and verify the outcome.

    The run uses the 'naive' communicator, a multi-node Adam optimizer, a
    multi-node evaluator and a multi-node checkpointer writing into a fresh
    temporary directory.

    Args:
        gpu: If true, the model is moved to the GPU whose id equals this
            process's intra-node rank; otherwise training runs on CPU.
        display_log: If true, rank 0 additionally attaches LogReport and a
            PrintReport that writes to stderr.

    Raises:
        AssertionError: If validation accuracy is not above 0.95, or if the
            checkpoint directory is not empty after training.
    """
    epoch = 5
    batchsize = 100
    n_units = 100

    comm = chainermn.create_communicator('naive')
    if gpu:
        device = comm.intra_rank
        # get_device_from_id replaces the deprecated get_device(int).
        chainer.cuda.get_device_from_id(device).use()
    else:
        device = -1

    model = L.Classifier(MLP(n_units, 10))
    if gpu:
        model.to_gpu()

    optimizer = chainermn.create_multi_node_optimizer(
        chainer.optimizers.Adam(), comm)
    optimizer.setup(model)

    # Only rank 0 loads MNIST; scatter_dataset distributes it to all ranks.
    if comm.rank == 0:
        train, test = chainer.datasets.get_mnist()
    else:
        train, test = None, None

    train = chainermn.scatter_dataset(train, comm, shuffle=True)
    test = chainermn.scatter_dataset(test, comm, shuffle=True)

    train_iter = chainer.iterators.SerialIterator(train, batchsize)
    test_iter = chainer.iterators.SerialIterator(test,
                                                 batchsize,
                                                 repeat=False,
                                                 shuffle=False)

    updater = training.StandardUpdater(train_iter, optimizer, device=device)

    trainer = training.Trainer(updater, (epoch, 'epoch'))

    # Wrap standard Chainer evaluators by MultiNodeEvaluator.
    evaluator = extensions.Evaluator(test_iter, model, device=device)
    evaluator = chainermn.create_multi_node_evaluator(evaluator, comm)
    trainer.extend(evaluator)

    # Add checkpointer. This is just to check checkpointing runs
    # without errors
    path = tempfile.mkdtemp(dir='/tmp', prefix=__name__ + "-tmp-")
    checkpointer = create_multi_node_checkpointer(name=__name__,
                                                  comm=comm,
                                                  path=path)
    trainer.extend(checkpointer, trigger=(1, 'epoch'))

    # Some display and output extensions are necessary only for one worker.
    # (Otherwise, there would just be repeated outputs.)
    if comm.rank == 0 and display_log:
        trainer.extend(extensions.LogReport(trigger=(1, 'epoch')),
                       trigger=(1, 'epoch'))
        trainer.extend(extensions.PrintReport([
            'epoch', 'main/loss', 'validation/main/loss', 'main/accuracy',
            'validation/main/accuracy', 'elapsed_time'
        ],
                                              out=sys.stderr),
                       trigger=(1, 'epoch'))
    trainer.run()

    # Re-run the evaluator once to read the final validation accuracy.
    accuracy = evaluator()['validation/main/accuracy']
    assert accuracy > 0.95

    # Check checkpointer successfully finalized snapshot directory
    assert [] == os.listdir(path)
    # Remove only the temporary directory itself; os.removedirs would also
    # try to prune empty parent directories.
    os.rmdir(path)
    parser.add_argument('--host', type=str, default=env.current_host)
    parser.add_argument('--num-gpus', type=int, default=env.num_gpus)

    parser.add_argument('--train', type=str, default=env.channel_input_dirs['train'])
    parser.add_argument('--test', type=str, default=env.channel_input_dirs['test'])

    args = parser.parse_args()

    train_file = np.load(os.path.join(args.train, 'train.npz'))
    test_file = np.load(os.path.join(args.test, 'test.npz'))

    logger.info('Current host: {}'.format(args.host))

    communicator = 'naive' if args.num_gpus == 0 else args.communicator

    comm = chainermn.create_communicator(communicator)
    device = comm.intra_rank if args.num_gpus > 0 else -1

    print('==========================================')
    print('Using {} communicator'.format(comm))
    print('Num unit: {}'.format(args.units))
    print('Num Minibatch-size: {}'.format(args.batch_size))
    print('Num epoch: {}'.format(args.epochs))
    print('==========================================')

    model = L.Classifier(MLP(args.units, 10))
    if device >= 0:
        chainer.cuda.get_device(device).use()

    # Create a multi node optimizer from a standard Chainer optimizer.
    optimizer = chainermn.create_multi_node_optimizer(
예제 #41
0
def main():
    """Run the ChainerMN pipelined (two-process) MNIST example.

    Command-line options select batch size, epoch count, GPU use, output
    directory and hidden-unit count. Rank 0 builds ``MLP0`` wrapped in a
    classifier and feeds the real dataset; rank 1 builds ``MLP1`` and
    iterates an empty dataset of the same length.

    Raises:
        ValueError: If the communicator does not have exactly 2 processes.
    """
    cli = argparse.ArgumentParser(
        description='ChainerMN example: pipelined neural network')
    cli.add_argument(
        '--batchsize', '-b', type=int, default=100,
        help='Number of images in each mini-batch')
    cli.add_argument(
        '--epoch', '-e', type=int, default=20,
        help='Number of sweeps over the dataset to train')
    cli.add_argument('--gpu', '-g', action='store_true', help='Use GPU')
    cli.add_argument(
        '--out', '-o', default='result',
        help='Directory to output the result')
    cli.add_argument(
        '--unit', '-u', type=int, default=1000,
        help='Number of units')
    args = cli.parse_args()

    # Pick the communicator and device according to the --gpu flag.
    comm = chainermn.create_communicator(
        'hierarchical' if args.gpu else 'naive')
    device = comm.intra_rank if args.gpu else -1

    if comm.size != 2:
        raise ValueError(
            'This example can only be executed on exactly 2 processes.')

    if comm.rank == 0:
        # Rank 0 prints the run summary and owns the first pipeline stage.
        separator = '=========================================='
        summary = [separator]
        if args.gpu:
            summary.append('Using GPUs')
        summary.append('Num unit: {}'.format(args.unit))
        summary.append('Num Minibatch-size: {}'.format(args.batchsize))
        summary.append('Num epoch: {}'.format(args.epoch))
        summary.append(separator)
        for line in summary:
            print(line)

        model = L.Classifier(MLP0(comm, args.unit))
    elif comm.rank == 1:
        # Rank 1 owns the second pipeline stage.
        model = MLP1(comm, args.unit, 10)

    if device >= 0:
        chainer.cuda.get_device_from_id(device).use()
        model.to_gpu()

    optimizer = chainer.optimizers.Adam()
    optimizer.setup(model)

    # Only worker 0 iterates real data; worker 1 gets empty placeholders.
    train, test = chainer.datasets.get_mnist()
    if comm.rank == 1:
        train = chainermn.datasets.create_empty_dataset(train)
        test = chainermn.datasets.create_empty_dataset(test)

    train_iter = chainer.iterators.SerialIterator(
        train, args.batchsize, shuffle=False)
    test_iter = chainer.iterators.SerialIterator(
        test, args.batchsize, repeat=False, shuffle=False)

    updater = training.StandardUpdater(train_iter, optimizer, device=device)
    trainer = training.Trainer(updater, (args.epoch, 'epoch'), out=args.out)
    trainer.extend(extensions.Evaluator(test_iter, model, device=device))

    # Display and output extensions are attached on worker 0 only.
    if comm.rank == 0:
        report_keys = [
            'epoch', 'main/loss', 'validation/main/loss', 'main/accuracy',
            'validation/main/accuracy', 'elapsed_time'
        ]
        trainer.extend(extensions.dump_graph('main/loss'))
        trainer.extend(extensions.LogReport())
        trainer.extend(extensions.PrintReport(report_keys))
        trainer.extend(extensions.ProgressBar())

    trainer.run()
예제 #42
0
def main():
    """Run the ChainerMN MNIST example with automatic checkpointing.

    Command-line options select batch size, communicator, epoch count, GPU
    use, output directory, hidden-unit count and the checkpoint run id.
    Without ``--gpu`` the 'naive' communicator is always used, whatever
    ``--communicator`` says. With ``--gpu`` the 'naive' communicator is
    rejected and the process exits with status -1.

    Training resumes from an existing checkpoint for the run id if the
    checkpointer finds one.
    """
    # Local import: sys.exit is used instead of the site-provided `exit`.
    import sys

    parser = argparse.ArgumentParser(description='''\
ChainerMN example: MNIST with automatic checkpoints enabled''')
    parser.add_argument('--batchsize',
                        '-b',
                        type=int,
                        default=100,
                        help='Number of images in each mini-batch')
    parser.add_argument('--communicator',
                        type=str,
                        default='hierarchical',
                        help='Type of communicator')
    parser.add_argument('--epoch',
                        '-e',
                        type=int,
                        default=20,
                        help='Number of sweeps over the dataset to train')
    parser.add_argument('--gpu', '-g', action='store_true', help='Use GPU')
    parser.add_argument('--out',
                        '-o',
                        default='result',
                        help='Directory to output the result')
    parser.add_argument('--unit',
                        '-u',
                        type=int,
                        default=1000,
                        help='Number of units')
    parser.add_argument('--run-id',
                        type=str,
                        default='train-mnist-example',
                        help='ID of the task name')
    args = parser.parse_args()

    # Prepare ChainerMN communicator. `communicator_name` records the
    # communicator that is actually created, which may differ from the
    # requested one on the CPU path.
    if args.gpu:
        if args.communicator == 'naive':
            print("Error: 'naive' communicator does not support GPU.\n")
            sys.exit(-1)
        communicator_name = args.communicator
        comm = chainermn.create_communicator(communicator_name)
        device = comm.intra_rank
    else:
        if args.communicator != 'naive':
            print('Warning: using naive communicator '
                  'because only naive supports CPU-only execution')
        communicator_name = 'naive'
        comm = chainermn.create_communicator(communicator_name)
        device = -1

    if comm.rank == 0:
        print('==========================================')
        print('Num process (COMM_WORLD): {}'.format(comm.size))
        if args.gpu:
            print('Using GPUs')
        # Report the communicator in use, not merely the one requested.
        print('Using {} communicator'.format(communicator_name))
        print('Num unit: {}'.format(args.unit))
        print('Num Minibatch-size: {}'.format(args.batchsize))
        print('Num epoch: {}'.format(args.epoch))
        print('==========================================')

    model = L.Classifier(MLP(args.unit, 10))
    if device >= 0:
        chainer.cuda.get_device_from_id(device).use()
        model.to_gpu()

    # Create a multi node optimizer from a standard Chainer optimizer.
    optimizer = chainermn.create_multi_node_optimizer(
        chainer.optimizers.Adam(), comm)
    optimizer.setup(model)

    # Split and distribute the dataset. Only worker 0 loads the whole dataset.
    # Datasets of worker 0 are evenly split and distributed to all workers.
    if comm.rank == 0:
        train, test = chainer.datasets.get_mnist()
    else:
        train, test = None, None
    train = chainermn.scatter_dataset(train, comm, shuffle=True)
    test = chainermn.scatter_dataset(test, comm, shuffle=True)

    train_iter = chainer.iterators.SerialIterator(train, args.batchsize)
    test_iter = chainer.iterators.SerialIterator(test,
                                                 args.batchsize,
                                                 repeat=False,
                                                 shuffle=False)

    updater = training.StandardUpdater(train_iter, optimizer, device=device)
    trainer = training.Trainer(updater, (args.epoch, 'epoch'), out=args.out)

    # Enable checkpointer and recover from checkpoint if any checkpoint exists
    checkpointer = create_multi_node_checkpointer(name=args.run_id, comm=comm)
    checkpointer.maybe_load(trainer, optimizer)
    print("Rank", comm.rank, ": (Re)Starting from (epoch, iter) =",
          (trainer.updater.epoch, trainer.updater.iteration))
    trainer.extend(checkpointer, trigger=(1000, 'iteration'))

    # Create a multi node evaluator from a standard Chainer evaluator.
    evaluator = extensions.Evaluator(test_iter, model, device=device)
    evaluator = chainermn.create_multi_node_evaluator(evaluator, comm)
    trainer.extend(evaluator)

    # Some display and output extensions are necessary only for one worker.
    # (Otherwise, there would just be repeated outputs.)
    if comm.rank == 0:
        trainer.extend(extensions.DumpGraph('main/loss'))
        trainer.extend(extensions.LogReport())
        trainer.extend(
            extensions.PrintReport([
                'epoch', 'main/loss', 'validation/main/loss', 'main/accuracy',
                'validation/main/accuracy', 'elapsed_time'
            ]))
        trainer.extend(extensions.ProgressBar())

    trainer.run()
예제 #43
0
def main():
    """Train an ILSVRC2012 classifier on GPUs with ChainerMN.

    Parses the command line, creates the requested communicator, builds the
    chosen architecture on the GPU matching the intra-node rank, scatters
    the datasets loaded on rank 0 to all workers, and runs a trainer with a
    multi-node optimizer, checkpointer and evaluator. Reporting extensions
    are attached on rank 0 only.

    Raises:
        RuntimeError: If CUDA is not available.
    """
    # This example has no CPU execution path.
    if not chainer.cuda.available:
        raise RuntimeError("ImageNet requires GPU support.")

    arch_table = {
        'alex': alex.Alex,
        'googlenet': googlenet.GoogLeNet,
        'googlenetbn': googlenetbn.GoogLeNetBN,
        'nin': nin.NIN,
        'resnet50': resnet50.ResNet50,
    }

    cli = argparse.ArgumentParser(
        description='Learning convnet from ILSVRC2012 dataset')
    cli.add_argument(
        'train', help='Path to training image-label list file')
    cli.add_argument(
        'val', help='Path to validation image-label list file')
    cli.add_argument(
        '--arch', '-a', choices=arch_table.keys(), default='nin',
        help='Convnet architecture')
    cli.add_argument(
        '--batchsize', '-B', type=int, default=32,
        help='Learning minibatch size')
    cli.add_argument(
        '--epoch', '-E', type=int, default=10,
        help='Number of epochs to train')
    cli.add_argument(
        '--initmodel',
        help='Initialize the model from given file')
    cli.add_argument(
        '--loaderjob', '-j', type=int,
        help='Number of parallel data loading processes')
    cli.add_argument(
        '--mean', '-m', default='mean.npy',
        help='Mean file (computed by compute_mean.py)')
    cli.add_argument(
        '--resume', '-r', default='',
        help='Initialize the trainer from given file')
    cli.add_argument(
        '--out', '-o', default='result',
        help='Output directory')
    cli.add_argument(
        '--root', '-R', default='.',
        help='Root directory path of image files')
    cli.add_argument(
        '--val_batchsize', '-b', type=int, default=250,
        help='Validation minibatch size')
    cli.add_argument('--test', action='store_true')
    cli.add_argument('--communicator', default='hierarchical')
    cli.set_defaults(test=False)
    args = cli.parse_args()

    # ChainerMN communicator; the GPU id is the rank within the node.
    comm = chainermn.create_communicator(args.communicator)
    device = comm.intra_rank
    is_root = comm.rank == 0

    if is_root:
        separator = '=========================================='
        for line in (
                separator,
                'Num process (COMM_WORLD): {}'.format(comm.size),
                'Using {} communicator'.format(args.communicator),
                'Using {} arch'.format(args.arch),
                'Num Minibatch-size: {}'.format(args.batchsize),
                'Num epoch: {}'.format(args.epoch),
                separator):
            print(line)

    model = arch_table[args.arch]()
    if args.initmodel:
        print('Load model from', args.initmodel)
        chainer.serializers.load_npz(args.initmodel, model)

    # Make the GPU current before moving the model onto it.
    chainer.cuda.get_device_from_id(device).use()
    model.to_gpu()

    # Worker 0 builds the full datasets; scatter_dataset then splits them
    # evenly across all workers.
    mean = np.load(args.mean)
    train = None
    val = None
    if is_root:
        train = PreprocessedDataset(args.train, args.root, mean, model.insize)
        val = PreprocessedDataset(
            args.val, args.root, mean, model.insize, False)
    train = chainermn.scatter_dataset(train, comm, shuffle=True)
    val = chainermn.scatter_dataset(val, comm)

    # The multiprocessing start method must be changed when InfiniBand is
    # combined with MultiprocessIterator, because processes often crash on
    # fork while using InfiniBand.
    # (c.f., https://www.open-mpi.org/faq/?category=tuning#fork-warning )
    multiprocessing.set_start_method('forkserver')
    train_iter = chainer.iterators.MultiprocessIterator(
        train, args.batchsize, n_processes=args.loaderjob)
    val_iter = chainer.iterators.MultiprocessIterator(
        val, args.val_batchsize, repeat=False, n_processes=args.loaderjob)

    # Multi-node optimizer wrapping a standard Chainer optimizer.
    base_optimizer = chainer.optimizers.MomentumSGD(lr=0.01, momentum=0.9)
    optimizer = chainermn.create_multi_node_optimizer(base_optimizer, comm)
    optimizer.setup(model)

    # Trainer
    updater = training.StandardUpdater(train_iter, optimizer, device=device)
    trainer = training.Trainer(updater, (args.epoch, 'epoch'), args.out)

    # In --test mode every periodic action fires each 10 iterations,
    # otherwise once per epoch.
    interval = (10, 'iteration') if args.test else (1, 'epoch')
    checkpoint_interval = interval
    val_interval = interval
    log_interval = interval

    checkpointer = chainermn.create_multi_node_checkpointer(
        name='imagenet-example', comm=comm)
    checkpointer.maybe_load(trainer, optimizer)
    trainer.extend(checkpointer, trigger=checkpoint_interval)

    # Multi-node evaluator wrapping the project's evaluator.
    evaluator = chainermn.create_multi_node_evaluator(
        TestModeEvaluator(val_iter, model, device=device), comm)
    trainer.extend(evaluator, trigger=val_interval)

    # Reporting extensions run on one worker only to avoid repeated output.
    if is_root:
        report_keys = [
            'epoch', 'iteration', 'main/loss', 'validation/main/loss',
            'main/accuracy', 'validation/main/accuracy', 'lr'
        ]
        trainer.extend(extensions.dump_graph('main/loss'))
        trainer.extend(extensions.LogReport(trigger=log_interval))
        trainer.extend(extensions.observe_lr(), trigger=log_interval)
        trainer.extend(
            extensions.PrintReport(report_keys), trigger=log_interval)
        trainer.extend(extensions.ProgressBar(update_interval=10))

    if args.resume:
        chainer.serializers.load_npz(args.resume, trainer)

    trainer.run()
예제 #44
0
def train(args, train_data, test_data, evaluator_type):
    """Build a Mask R-CNN training pipeline and run it to completion.

    Args:
        args: Namespace-like options object. It must provide every attribute
            named in ``required_args`` below; this function additionally
            reads ``multi_node``, ``gpu``, ``batch_size_per_gpu``,
            ``max_epoch``, ``pooling_func``, ``initializer``, ``model`` and
            ``roi_size``. The object is mutated in place: ``n_node``,
            ``n_gpu``, ``seed``, ``timestamp``, ``out``, ``batch_size``,
            ``lr``, ``weight_decay`` and ``step_size`` are always set, and
            ``git_hash`` / ``hostname`` are set on the reporting process.
        train_data: Training dataset; wrapped with ``MaskRCNNTransform``.
        test_data: Evaluation dataset; wrapped with ``MaskRCNNTransform``
            in non-training mode.
        evaluator_type: ``'voc'`` or ``'coco'``; selects the evaluator
            extension.

    Raises:
        ValueError: If a required attribute is missing from ``args``, or if
            ``args.pooling_func``, ``args.initializer`` or ``args.model``
            has an unsupported value.
        AssertionError: If ``evaluator_type`` is not ``'voc'`` or ``'coco'``.
        SystemExit: If ``args.multi_node`` is false and ``args.gpu`` is None.
    """
    required_args = [
        'dataset',
        'class_names',
        'logs_dir',
        'min_size',
        'max_size',
        'anchor_scales',
    ]
    for arg_key in required_args:
        if not hasattr(args, arg_key):
            raise ValueError(
                'args must contain required key: {}'.format(arg_key)
            )

    assert evaluator_type in ['voc', 'coco'], \
        'Unsupported evaluator_type: {}'.format(evaluator_type)

    # Select the device: one process per GPU under ChainerMN, otherwise the
    # single GPU given by --gpu.
    if args.multi_node:
        import chainermn

        comm = chainermn.create_communicator('hierarchical')
        device = comm.intra_rank

        args.n_node = comm.inter_size
        args.n_gpu = comm.size
        chainer.cuda.get_device_from_id(device).use()
    else:
        if args.gpu is None:
            print(
                'Option --gpu is required without --multi-node.',
                file=sys.stderr,
            )
            sys.exit(1)
        args.n_node = 1
        args.n_gpu = 1
        chainer.cuda.get_device_from_id(args.gpu).use()
        device = args.gpu

    # Derived settings are stored on args so they end up in the params
    # report registered further down.
    args.seed = 0
    now = datetime.datetime.now()
    args.timestamp = now.isoformat()
    args.out = osp.join(args.logs_dir, now.strftime('%Y%m%d_%H%M%S'))

    args.batch_size = args.batch_size_per_gpu * args.n_gpu

    # lr: 0.00125 * 8 = 0.01  in original
    args.lr = 0.00125 * args.batch_size
    args.weight_decay = 0.0001

    # lr / 10 at 120k iteration with
    # 160k iteration * 16 batchsize in original
    args.step_size = [
        (120e3 / 180e3) * args.max_epoch,
        (160e3 / 180e3) * args.max_epoch,
    ]

    random.seed(args.seed)
    np.random.seed(args.seed)

    if args.pooling_func == 'align':
        pooling_func = cmr.functions.roi_align_2d
    elif args.pooling_func == 'pooling':
        pooling_func = cmr.functions.roi_pooling_2d
    elif args.pooling_func == 'resize':
        pooling_func = cmr.functions.crop_and_resize
    else:
        raise ValueError(
            'Unsupported pooling_func: {}'.format(args.pooling_func)
        )

    if args.initializer == 'normal':
        mask_initialW = chainer.initializers.Normal(0.01)
    elif args.initializer == 'he_normal':
        mask_initialW = chainer.initializers.HeNormal(fan_option='fan_out')
    else:
        raise ValueError(
            'Unsupported initializer: {}'.format(args.initializer)
        )

    if args.model in ['resnet50', 'resnet101']:
        # Slice off the 'resnet' prefix explicitly. str.lstrip() takes a
        # *set of characters*, not a prefix, so it only worked here because
        # the numeric suffix shares no characters with 'resnet'.
        n_layers = int(args.model[len('resnet'):])
        mask_rcnn = cmr.models.MaskRCNNResNet(
            n_layers=n_layers,
            n_fg_class=len(args.class_names),
            pooling_func=pooling_func,
            anchor_scales=args.anchor_scales,
            roi_size=args.roi_size,
            min_size=args.min_size,
            max_size=args.max_size,
            mask_initialW=mask_initialW,
        )
    else:
        raise ValueError('Unsupported model: {}'.format(args.model))
    model = cmr.models.MaskRCNNTrainChain(mask_rcnn)
    if args.multi_node or args.gpu >= 0:
        model.to_gpu()

    optimizer = chainer.optimizers.MomentumSGD(lr=args.lr, momentum=0.9)
    if args.multi_node:
        optimizer = chainermn.create_multi_node_optimizer(optimizer, comm)
    optimizer.setup(model)
    optimizer.add_hook(chainer.optimizer.WeightDecay(rate=args.weight_decay))

    if args.model in ['resnet50', 'resnet101']:
        # ResNetExtractor.freeze_at is not enough to freeze params
        # since WeightDecay updates the param little by little.
        mask_rcnn.extractor.conv1.disable_update()
        mask_rcnn.extractor.bn1.disable_update()
        mask_rcnn.extractor.res2.disable_update()
        for link in mask_rcnn.links():
            if isinstance(link, cmr.links.AffineChannel2D):
                link.disable_update()

    train_data = chainer.datasets.TransformDataset(
        train_data,
        cmr.datasets.MaskRCNNTransform(mask_rcnn),
    )
    test_data = chainer.datasets.TransformDataset(
        test_data,
        cmr.datasets.MaskRCNNTransform(mask_rcnn, train=False),
    )
    if args.multi_node:
        # Only rank 0 passes real datasets into scatter_dataset; every
        # process then continues with whatever the call returns to it.
        if comm.rank != 0:
            train_data = None
            test_data = None
        train_data = chainermn.scatter_dataset(train_data, comm, shuffle=True)
        test_data = chainermn.scatter_dataset(test_data, comm)

    # FIXME: MultiProcessIterator sometimes hangs
    train_iter = chainer.iterators.SerialIterator(
        train_data,
        batch_size=args.batch_size_per_gpu,
    )
    test_iter = chainer.iterators.SerialIterator(
        test_data,
        batch_size=args.batch_size_per_gpu,
        repeat=False,
        shuffle=False,
    )

    converter = functools.partial(
        cmr.datasets.concat_examples,
        padding=0,
        # img, bboxes, labels, masks, scales
        indices_concat=[0, 2, 3, 4],  # img, _, labels, masks, scales
        indices_to_device=[0, 1],  # img, bbox
    )
    updater = chainer.training.updater.StandardUpdater(
        train_iter,
        optimizer,
        device=device,
        converter=converter,
    )

    trainer = training.Trainer(
        updater,
        (args.max_epoch, 'epoch'),
        out=args.out,
    )

    # Multiply lr by 0.1 at each epoch listed in args.step_size.
    trainer.extend(
        extensions.ExponentialShift('lr', 0.1),
        trigger=training.triggers.ManualScheduleTrigger(
            args.step_size,
            'epoch',
        ),
    )

    eval_interval = 1, 'epoch'
    log_interval = 20, 'iteration'
    plot_interval = 0.1, 'epoch'
    print_interval = 20, 'iteration'

    if evaluator_type == 'voc':
        evaluator = cmr.extensions.InstanceSegmentationVOCEvaluator(
            test_iter,
            model.mask_rcnn,
            device=device,
            use_07_metric=True,
            label_names=args.class_names,
        )
    elif evaluator_type == 'coco':
        evaluator = cmr.extensions.InstanceSegmentationCOCOEvaluator(
            test_iter,
            model.mask_rcnn,
            device=device,
            label_names=args.class_names,
        )
    else:
        raise ValueError(
            'Unsupported evaluator_type: {}'.format(evaluator_type)
        )
    if args.multi_node:
        evaluator = chainermn.create_multi_node_evaluator(evaluator, comm)
    trainer.extend(evaluator, trigger=eval_interval)

    # Snapshot, report, log and plot extensions are registered on a single
    # process only (the sole process, or rank 0 in multi-node mode).
    if not args.multi_node or comm.rank == 0:
        # Save snapshot.
        trainer.extend(
            extensions.snapshot_object(model.mask_rcnn, 'snapshot_model.npz'),
            trigger=training.triggers.MaxValueTrigger(
                'validation/main/map',
                eval_interval,
            ),
        )

        # Dump params.yaml.
        args.git_hash = cmr.utils.git_hash()
        args.hostname = socket.gethostname()
        trainer.extend(fcn.extensions.ParamsReport(args.__dict__))

        # Visualization.
        trainer.extend(
            cmr.extensions.InstanceSegmentationVisReport(
                test_iter,
                model.mask_rcnn,
                label_names=args.class_names,
            ),
            trigger=eval_interval,
        )

        # Logging.
        trainer.extend(
            chainer.training.extensions.observe_lr(),
            trigger=log_interval,
        )
        trainer.extend(extensions.LogReport(trigger=log_interval))
        trainer.extend(
            extensions.PrintReport(
                [
                    'iteration',
                    'epoch',
                    'elapsed_time',
                    'lr',
                    'main/loss',
                    'main/roi_loc_loss',
                    'main/roi_cls_loss',
                    'main/roi_mask_loss',
                    'main/rpn_loc_loss',
                    'main/rpn_cls_loss',
                    'validation/main/map',
                ],
            ),
            trigger=print_interval,
        )
        trainer.extend(extensions.ProgressBar(update_interval=10))

        # Plot.
        assert extensions.PlotReport.available()
        trainer.extend(
            extensions.PlotReport(
                [
                    'main/loss',
                    'main/roi_loc_loss',
                    'main/roi_cls_loss',
                    'main/roi_mask_loss',
                    'main/rpn_loc_loss',
                    'main/rpn_cls_loss',
                ],
                file_name='loss.png',
                trigger=plot_interval,
            ),
            trigger=plot_interval,
        )
        trainer.extend(
            extensions.PlotReport(
                ['validation/main/map'],
                file_name='accuracy.png',
                trigger=plot_interval,
            ),
            trigger=eval_interval,
        )

        trainer.extend(extensions.dump_graph('main/loss'))

    trainer.run()
예제 #45
0
 def test_allreduce_persistent_cpu(self):
     """Run the shared ``_test`` helper on a 'naive' communicator."""
     self._test(chainermn.create_communicator('naive'), ExampleModel())
예제 #46
0
def check_mnist(gpu, display_log=True):
    """Train an MLP classifier on MNIST with ChainerMN and check the result.

    Args:
        gpu: When true, the model is moved to the GPU whose id is this
            process's ``intra_rank``; otherwise device ``-1`` is used.
        display_log: When true, rank 0 also registers log/print report
            extensions writing to ``sys.stderr``.

    Asserts that the final validation accuracy exceeds 0.95 and that the
    checkpoint directory is empty once training has finished.
    """
    n_epoch = 5
    minibatch = 100
    hidden_units = 100
    every_epoch = (1, 'epoch')

    comm = chainermn.create_communicator('naive')
    device = comm.intra_rank if gpu else -1
    if gpu:
        chainer.cuda.get_device_from_id(device).use()

    model = L.Classifier(MLP(hidden_units, 10))
    if gpu:
        model.to_gpu()

    base_optimizer = chainer.optimizers.Adam()
    optimizer = chainermn.create_multi_node_optimizer(base_optimizer, comm)
    optimizer.setup(model)

    # Only rank 0 loads MNIST; the other ranks pass None to scatter_dataset.
    train, test = (
        chainer.datasets.get_mnist() if comm.rank == 0 else (None, None))

    train = chainermn.scatter_dataset(train, comm, shuffle=True)
    test = chainermn.scatter_dataset(test, comm, shuffle=True)

    train_iter = chainer.iterators.SerialIterator(train, minibatch)
    test_iter = chainer.iterators.SerialIterator(
        test, minibatch, repeat=False, shuffle=False)

    updater = training.StandardUpdater(train_iter, optimizer, device=device)
    trainer = training.Trainer(updater, (n_epoch, 'epoch'))

    # A plain Chainer evaluator, wrapped for multi-node use.
    evaluator = chainermn.create_multi_node_evaluator(
        extensions.Evaluator(test_iter, model, device=device), comm)
    trainer.extend(evaluator)

    # The checkpointer is registered only to confirm that checkpointing
    # runs without raising.
    path = tempfile.mkdtemp(dir='/tmp', prefix=__name__ + "-tmp-")
    checkpointer = create_multi_node_checkpointer(
        name=__name__, comm=comm, path=path)
    trainer.extend(checkpointer, trigger=every_epoch)

    # Reporting extensions go on one worker only, so output is not repeated.
    if comm.rank == 0 and display_log:
        trainer.extend(extensions.LogReport(trigger=every_epoch),
                       trigger=every_epoch)
        report_keys = ['epoch',
                       'main/loss',
                       'validation/main/loss',
                       'main/accuracy',
                       'validation/main/accuracy',
                       'elapsed_time']
        trainer.extend(extensions.PrintReport(report_keys, out=sys.stderr),
                       trigger=every_epoch)
    trainer.run()

    accuracy = evaluator()['validation/main/accuracy']
    assert accuracy > 0.95

    # The checkpointer should have left its snapshot directory empty.
    assert os.listdir(path) == []
    os.removedirs(path)
예제 #47
0
def main():
    """Train a seq2seq translation model with ChainerMN.

    Parses command-line options, creates the communicator, lets rank 0 load
    and filter the data, broadcasts the vocabularies, scatters the datasets,
    and runs the Chainer trainer.

    With ``--gpu`` the communicator named by ``--communicator`` is created
    (default ``'hierarchical'``); without it the ``'naive'`` communicator is
    always used, and the start-up banner reports the one actually created.
    ``--resume`` and ``--bleu`` are parsed but not read in this function.

    Exits with status -1 via ``sys.exit`` when ``--stop`` matches neither
    ``<digits>e`` nor ``<digits>i``.
    """
    parser = argparse.ArgumentParser(description='Chainer example: seq2seq')
    parser.add_argument('--batchsize', '-b', type=int, default=64,
                        help='Number of sentence pairs in each mini-batch')
    parser.add_argument('--bleu', action="store_true", default=False,
                        help='Report BLEU score')
    parser.add_argument('--gpu', '-g', action='store_true',
                        help='Use GPU')
    parser.add_argument('--cache', '-c', default=None,
                        help='Directory to cache pre-processed dataset')
    parser.add_argument('--resume', '-r', default='',
                        help='Resume the training from snapshot')
    parser.add_argument('--unit', '-u', type=int, default=1024,
                        help='Number of units')
    parser.add_argument('--communicator', default='hierarchical',
                        help="Type of communicator")
    parser.add_argument('--stop', '-s', type=str, default="15e",
                        help='Stop trigger (ex. "500i", "15e")')
    parser.add_argument('--input', '-i', type=str, default='wmt',
                        help='Input directory')
    parser.add_argument('--optimizer', type=str, default="adam()",
                        help="Optimizer and its argument")
    parser.add_argument('--out', '-o', default='result',
                        help='Directory to output the result')
    args = parser.parse_args()

    # Prepare ChainerMN communicator.
    # The name is tracked so the banner below reports the communicator that
    # was really created rather than the raw option value.
    if args.gpu:
        communicator_name = args.communicator
        comm = chainermn.create_communicator(communicator_name)
        dev = comm.intra_rank
    else:
        communicator_name = 'naive'
        comm = chainermn.create_communicator(communicator_name)
        dev = -1

    if comm.mpi_comm.rank == 0:
        print('==========================================')
        print('Num process (COMM_WORLD): {}'.format(MPI.COMM_WORLD.Get_size()))
        if args.gpu:
            print('Using GPUs')
        print('Using {} communicator'.format(communicator_name))
        print('Num unit: {}'.format(args.unit))
        print('Num Minibatch-size: {}'.format(args.batchsize))
        print('==========================================')

    # Rank 0 prepares all data
    if comm.rank == 0:
        if args.cache and not os.path.exists(args.cache):
            os.mkdir(args.cache)

        # Read source data
        bt = time.time()
        if args.cache:
            cache_file = os.path.join(args.cache, 'source.pickle')
            source_vocab, source_data = cached_call(cache_file,
                                                    read_source,
                                                    args.input, args.cache)
        else:
            source_vocab, source_data = read_source(args.input, args.cache)
        et = time.time()
        print("RD source done. {:.3f} [s]".format(et - bt))
        sys.stdout.flush()

        # Read target data
        bt = time.time()
        if args.cache:
            cache_file = os.path.join(args.cache, 'target.pickle')
            target_vocab, target_data = cached_call(cache_file,
                                                    read_target,
                                                    args.input, args.cache)
        else:
            target_vocab, target_data = read_target(args.input, args.cache)
        et = time.time()
        print("RD target done. {:.3f} [s]".format(et - bt))
        sys.stdout.flush()

        # Keep only pairs where both sentences have 1..49 tokens.
        print('Original training data size: %d' % len(source_data))
        train_data = [(s, t)
                      for s, t in six.moves.zip(source_data, target_data)
                      if 0 < len(s) < 50 and 0 < len(t) < 50]
        print('Filtered training data size: %d' % len(train_data))

        # The evaluation set only drops pairs with an empty side.
        en_path = os.path.join(args.input, 'dev', 'newstest2013.en')
        source_data = europal.make_dataset(en_path, source_vocab)
        fr_path = os.path.join(args.input, 'dev', 'newstest2013.fr')
        target_data = europal.make_dataset(fr_path, target_vocab)
        assert(len(source_data) == len(target_data))
        test_data = [(s, t) for s, t
                     in six.moves.zip(source_data, target_data)
                     if 0 < len(s) and 0 < len(t)]

        source_ids = {word: index
                      for index, word in enumerate(source_vocab)}
        target_ids = {word: index
                      for index, word in enumerate(target_vocab)}
    else:
        # target_data, source_data = None, None
        train_data, test_data = None, None
        target_ids, source_ids = None, None

    # Print GPU id: ranks take turns, with a barrier after each turn.
    for i in range(0, comm.size):
        if comm.rank == i:
            print("Rank {} GPU: {}".format(comm.rank, dev))
        sys.stdout.flush()
        comm.mpi_comm.Barrier()

    # Broadcast the word -> id dictionaries built on rank 0.
    source_ids = comm.mpi_comm.bcast(source_ids, root=0)
    target_ids = comm.mpi_comm.bcast(target_ids, root=0)

    # Inverse (id -> word) mappings.
    target_words = {i: w for w, i in target_ids.items()}
    source_words = {i: w for w, i in source_ids.items()}

    if comm.rank == 0:
        print("target_words : {}".format(len(target_words)))
        print("source_words : {}".format(len(source_words)))

    model = Seq2seq(3, len(source_ids), len(target_ids), args.unit)

    if dev >= 0:
        # get_device() is deprecated; get_device_from_id() is the form the
        # other examples use.
        chainer.cuda.get_device_from_id(dev).use()
        model.to_gpu(dev)

    # determine the stop trigger
    m = re.match(r'^(\d+)e$', args.stop)
    if m:
        trigger = (int(m.group(1)), 'epoch')
    else:
        m = re.match(r'^(\d+)i$', args.stop)
        if m:
            trigger = (int(m.group(1)), 'iteration')
        else:
            if comm.rank == 0:
                sys.stderr.write("Error: unknown stop trigger: {}\n".format(
                    args.stop))
            # sys.exit rather than the site-provided exit() helper, which
            # is not guaranteed to exist in every interpreter setup.
            sys.exit(-1)

    if comm.rank == 0:
        print("Trigger: {}".format(trigger))

    optimizer = chainermn.create_multi_node_optimizer(
        create_optimizer(args.optimizer), comm)
    optimizer.setup(model)

    # Distribute the datasets; only rank 0 holds data before these calls.
    train_data = chainermn.scatter_dataset(train_data, comm)

    test_data = chainermn.scatter_dataset(test_data, comm)

    train_iter = chainer.iterators.SerialIterator(train_data,
                                                  args.batchsize,
                                                  shuffle=False)
    updater = training.StandardUpdater(
        train_iter, optimizer, converter=convert, device=dev)
    trainer = training.Trainer(updater,
                               trigger,
                               out=args.out)

    trainer.extend(chainermn.create_multi_node_evaluator(
        BleuEvaluator(model, test_data, device=dev, comm=comm),
        comm))

    def translate_one(source, target):
        # Translate one sentence and print it next to the expected output.
        # Words missing from source_ids fall back to id 1.
        words = europal.split_sentence(source)
        print('# source : ' + ' '.join(words))
        x = model.xp.array(
            [source_ids.get(w, 1) for w in words], 'i')
        ys = model.translate([x])[0]
        words = [target_words[y] for y in ys]
        print('#  result : ' + ' '.join(words))
        print('#  expect : ' + target)

    # NOTE: translate is defined but never registered with the trainer in
    # this function (its make_extension decorator is commented out).
    # @chainer.training.make_extension(trigger=(200, 'iteration'))
    def translate(trainer):
        translate_one(
            'Who are we ?',
            'Qui sommes-nous?')
        translate_one(
            'And it often costs over a hundred dollars ' +
            'to obtain the required identity card .',
            'Or, il en coûte souvent plus de cent dollars ' +
            'pour obtenir la carte d\'identité requise.')

        source, target = test_data[numpy.random.choice(len(test_data))]
        source = ' '.join([source_words.get(i, '') for i in source])
        target = ' '.join([target_words.get(i, '') for i in target])
        translate_one(source, target)

    # Reporting extensions are registered on rank 0 only.
    if comm.rank == 0:
        trainer.extend(extensions.LogReport(trigger=(1, 'epoch')),
                       trigger=(1, 'epoch'))

        report = extensions.PrintReport(['epoch',
                                         'iteration',
                                         'main/loss',
                                         'main/perp',
                                         'validation/main/bleu',
                                         'elapsed_time'])
        trainer.extend(report, trigger=(1, 'epoch'))

    comm.mpi_comm.Barrier()
    if comm.rank == 0:
        print('start training')
        sys.stdout.flush()

    trainer.run()
예제 #48
0
    def setUp(self):
        """Create a 'naive' communicator; skip unless it has >= 2 processes."""
        comm = chainermn.create_communicator('naive')
        self.communicator = comm

        if comm.size >= 2:
            return
        pytest.skip('This test is for multinode only')
예제 #49
0
def comm() -> CommunicatorBase:
    """Return a 'naive' ChainerMN communicator, or skip the test.

    The test is skipped when the module-level ``_available`` flag is falsy.
    """
    if _available:
        return chainermn.create_communicator("naive")

    # pytest.skip raises, so nothing is returned on this path.
    pytest.skip("This test requires ChainerMN.")
예제 #50
0
파일: seq2seq.py 프로젝트: asi1024/chainer
def main():
    """Train a seq2seq translation model with ChainerMN.

    Parses command-line options, creates the communicator, lets rank 0 load
    and filter the data, broadcasts the vocabularies, scatters the datasets,
    and runs the Chainer trainer.

    With ``--gpu`` the communicator named by ``--communicator`` is created
    (default ``'hierarchical'``); without it the ``'naive'`` communicator is
    always used, and the start-up banner reports the one actually created.
    ``--resume`` and ``--bleu`` are parsed but not read in this function.

    Exits with status -1 via ``sys.exit`` when ``--stop`` matches neither
    ``<digits>e`` nor ``<digits>i``.
    """
    parser = argparse.ArgumentParser(description='Chainer example: seq2seq')
    parser.add_argument('--batchsize', '-b', type=int, default=64,
                        help='Number of sentence pairs in each mini-batch')
    parser.add_argument('--bleu', action='store_true', default=False,
                        help='Report BLEU score')
    parser.add_argument('--gpu', '-g', action='store_true',
                        help='Use GPU')
    parser.add_argument('--cache', '-c', default=None,
                        help='Directory to cache pre-processed dataset')
    parser.add_argument('--resume', '-r', default='',
                        help='Resume the training from snapshot')
    parser.add_argument('--unit', '-u', type=int, default=1024,
                        help='Number of units')
    parser.add_argument('--communicator', default='hierarchical',
                        help='Type of communicator')
    parser.add_argument('--stop', '-s', type=str, default='15e',
                        help='Stop trigger (ex. "500i", "15e")')
    parser.add_argument('--input', '-i', type=str, default='wmt',
                        help='Input directory')
    parser.add_argument('--optimizer', type=str, default='adam()',
                        help='Optimizer and its argument')
    parser.add_argument('--out', '-o', default='result',
                        help='Directory to output the result')
    args = parser.parse_args()

    # Prepare ChainerMN communicator.
    # The name is tracked so the banner below reports the communicator that
    # was really created rather than the raw option value.
    if args.gpu:
        communicator_name = args.communicator
        comm = chainermn.create_communicator(communicator_name)
        dev = comm.intra_rank
    else:
        communicator_name = 'naive'
        comm = chainermn.create_communicator(communicator_name)
        dev = -1

    if comm.rank == 0:
        print('==========================================')
        print('Num process (COMM_WORLD): {}'.format(comm.size))
        if args.gpu:
            print('Using GPUs')
        print('Using {} communicator'.format(communicator_name))
        print('Num unit: {}'.format(args.unit))
        print('Num Minibatch-size: {}'.format(args.batchsize))
        print('==========================================')

    # Rank 0 prepares all data
    if comm.rank == 0:
        if args.cache and not os.path.exists(args.cache):
            os.mkdir(args.cache)

        # Read source data
        bt = time.time()
        if args.cache:
            cache_file = os.path.join(args.cache, 'source.pickle')
            source_vocab, source_data = cached_call(cache_file,
                                                    read_source,
                                                    args.input, args.cache)
        else:
            source_vocab, source_data = read_source(args.input, args.cache)
        et = time.time()
        print('RD source done. {:.3f} [s]'.format(et - bt))
        sys.stdout.flush()

        # Read target data
        bt = time.time()
        if args.cache:
            cache_file = os.path.join(args.cache, 'target.pickle')
            target_vocab, target_data = cached_call(cache_file,
                                                    read_target,
                                                    args.input, args.cache)
        else:
            target_vocab, target_data = read_target(args.input, args.cache)
        et = time.time()
        print('RD target done. {:.3f} [s]'.format(et - bt))
        sys.stdout.flush()

        # Keep only pairs where both sentences have 1..49 tokens.
        print('Original training data size: %d' % len(source_data))
        train_data = [(s, t)
                      for s, t in six.moves.zip(source_data, target_data)
                      if 0 < len(s) < 50 and 0 < len(t) < 50]
        print('Filtered training data size: %d' % len(train_data))

        # The evaluation set only drops pairs with an empty side.
        en_path = os.path.join(args.input, 'dev', 'newstest2013.en')
        source_data = europal.make_dataset(en_path, source_vocab)
        fr_path = os.path.join(args.input, 'dev', 'newstest2013.fr')
        target_data = europal.make_dataset(fr_path, target_vocab)
        assert(len(source_data) == len(target_data))
        test_data = [(s, t) for s, t
                     in six.moves.zip(source_data, target_data)
                     if 0 < len(s) and 0 < len(t)]

        source_ids = {word: index
                      for index, word in enumerate(source_vocab)}
        target_ids = {word: index
                      for index, word in enumerate(target_vocab)}
    else:
        # target_data, source_data = None, None
        train_data, test_data = None, None
        target_ids, source_ids = None, None

    # Print GPU id: ranks take turns, with a barrier after each turn.
    for i in range(0, comm.size):
        if comm.rank == i:
            print('Rank {} GPU: {}'.format(comm.rank, dev))
        sys.stdout.flush()
        comm.mpi_comm.Barrier()

    # Broadcast the word -> id dictionaries built on rank 0.
    source_ids = comm.bcast_obj(source_ids, root=0)
    target_ids = comm.bcast_obj(target_ids, root=0)

    # Inverse (id -> word) mappings.
    target_words = {i: w for w, i in target_ids.items()}
    source_words = {i: w for w, i in source_ids.items()}

    if comm.rank == 0:
        print('target_words : {}'.format(len(target_words)))
        print('source_words : {}'.format(len(source_words)))

    model = Seq2seq(3, len(source_ids), len(target_ids), args.unit)

    if dev >= 0:
        chainer.cuda.get_device_from_id(dev).use()
        model.to_gpu(dev)

    # determine the stop trigger
    m = re.match(r'^(\d+)e$', args.stop)
    if m:
        trigger = (int(m.group(1)), 'epoch')
    else:
        m = re.match(r'^(\d+)i$', args.stop)
        if m:
            trigger = (int(m.group(1)), 'iteration')
        else:
            if comm.rank == 0:
                sys.stderr.write('Error: unknown stop trigger: {}\n'.format(
                    args.stop))
            # sys.exit rather than the site-provided exit() helper, which
            # is not guaranteed to exist in every interpreter setup.
            sys.exit(-1)

    if comm.rank == 0:
        print('Trigger: {}'.format(trigger))

    optimizer = chainermn.create_multi_node_optimizer(
        create_optimizer(args.optimizer), comm)
    optimizer.setup(model)

    # Distribute the datasets; only rank 0 holds data before these calls.
    train_data = chainermn.scatter_dataset(train_data, comm)

    test_data = chainermn.scatter_dataset(test_data, comm)

    train_iter = chainer.iterators.SerialIterator(train_data,
                                                  args.batchsize,
                                                  shuffle=False)
    updater = training.StandardUpdater(
        train_iter, optimizer, converter=convert, device=dev)
    trainer = training.Trainer(updater,
                               trigger,
                               out=args.out)

    trainer.extend(chainermn.create_multi_node_evaluator(
        BleuEvaluator(model, test_data, device=dev, comm=comm),
        comm))

    def translate_one(source, target):
        # Translate one sentence and print it next to the expected output.
        # Words missing from source_ids fall back to id 1.
        words = europal.split_sentence(source)
        print('# source : ' + ' '.join(words))
        x = model.xp.array(
            [source_ids.get(w, 1) for w in words], numpy.int32)
        ys = model.translate([x])[0]
        words = [target_words[y] for y in ys]
        print('#  result : ' + ' '.join(words))
        print('#  expect : ' + target)

    # NOTE: translate is defined but never registered with the trainer in
    # this function (its make_extension decorator is commented out).
    # @chainer.training.make_extension(trigger=(200, 'iteration'))
    def translate(trainer):
        translate_one(
            'Who are we ?',
            'Qui sommes-nous?')
        translate_one(
            'And it often costs over a hundred dollars ' +
            'to obtain the required identity card .',
            'Or, il en coûte souvent plus de cent dollars ' +
            'pour obtenir la carte d\'identité requise.')

        source, target = test_data[numpy.random.choice(len(test_data))]
        source = ' '.join([source_words.get(i, '') for i in source])
        target = ' '.join([target_words.get(i, '') for i in target])
        translate_one(source, target)

    # Reporting extensions are registered on rank 0 only.
    if comm.rank == 0:
        trainer.extend(extensions.LogReport(trigger=(1, 'epoch')),
                       trigger=(1, 'epoch'))

        report = extensions.PrintReport(['epoch',
                                         'iteration',
                                         'main/loss',
                                         'main/perp',
                                         'validation/main/bleu',
                                         'elapsed_time'])
        trainer.extend(report, trigger=(1, 'epoch'))

    comm.mpi_comm.Barrier()
    if comm.rank == 0:
        print('start training')
        sys.stdout.flush()

    trainer.run()
예제 #51
0
 def test_allreduce_persistent_cpu(self):
     """Run the shared ``_test`` helper twice on a 'naive' communicator.

     The same model instance is reused for both calls; the last positional
     flag is False for the numpy run and True for the ChainerX run.
     """
     naive_comm = chainermn.create_communicator('naive')
     example = ExampleModel()
     # CPU test with numpy first, then CPU test with ChainerX.
     for last_flag in (False, True):
         self._test(naive_comm, example, False, last_flag)
예제 #52
0
def main():
    """Train a DCGAN with ChainerMN.

    Parses the command line, creates the communicator, builds the generator
    and discriminator with multi-node Adam optimizers, scatters the dataset
    loaded on rank 0 to every worker and runs the trainer.  Reporting,
    snapshot and image-output extensions are registered on rank 0 only.

    Raises:
        SystemExit: with status -1 when ``--gpu`` is combined with the
            'naive' communicator.
    """
    parser = argparse.ArgumentParser(description='ChainerMN example: DCGAN')
    parser.add_argument('--batchsize', '-b', type=int, default=50,
                        help='Number of images in each mini-batch')
    parser.add_argument('--communicator', type=str,
                        default='hierarchical', help='Type of communicator')
    parser.add_argument('--epoch', '-e', type=int, default=1000,
                        help='Number of sweeps over the dataset to train')
    parser.add_argument('--gpu', '-g', action='store_true',
                        help='Use GPU')
    parser.add_argument('--dataset', '-i', default='',
                        help='Directory of image files.  Default is cifar-10.')
    parser.add_argument('--out', '-o', default='result',
                        help='Directory to output the result')
    parser.add_argument('--gen_model', '-r', default='',
                        help='Use pre-trained generator for training')
    parser.add_argument('--dis_model', '-d', default='',
                        help='Use pre-trained discriminator for training')
    parser.add_argument('--n_hidden', '-n', type=int, default=100,
                        help='Number of hidden units (z)')
    parser.add_argument('--seed', type=int, default=0,
                        help='Random seed of z at visualization stage')
    parser.add_argument('--snapshot_interval', type=int, default=1000,
                        help='Interval of snapshot')
    parser.add_argument('--display_interval', type=int, default=100,
                        help='Interval of displaying log to console')
    args = parser.parse_args()

    # Prepare ChainerMN communicator.  ``comm_name`` tracks the communicator
    # that is actually created, which differs from ``args.communicator``
    # when CPU-only execution falls back to 'naive'.
    if args.gpu:
        if args.communicator == 'naive':
            print("Error: 'naive' communicator does not support GPU.\n")
            # Raise SystemExit directly: the ``exit`` helper is injected by
            # the ``site`` module and is not guaranteed to exist.
            raise SystemExit(-1)
        comm_name = args.communicator
        comm = chainermn.create_communicator(comm_name)
        device = comm.intra_rank
    else:
        if args.communicator != 'naive':
            print('Warning: using naive communicator '
                  'because only naive supports CPU-only execution')
        comm_name = 'naive'
        comm = chainermn.create_communicator(comm_name)
        device = -1

    if comm.rank == 0:
        print('==========================================')
        print('Num process (COMM_WORLD): {}'.format(comm.size))
        if args.gpu:
            print('Using GPUs')
        # Report the communicator in use, not merely the requested one.
        print('Using {} communicator'.format(comm_name))
        print('Num hidden unit: {}'.format(args.n_hidden))
        print('Num Minibatch-size: {}'.format(args.batchsize))
        print('Num epoch: {}'.format(args.epoch))
        print('==========================================')

    # Set up a neural network to train
    gen = Generator(n_hidden=args.n_hidden)
    dis = Discriminator()

    if device >= 0:
        # Make a specified GPU current
        chainer.cuda.get_device_from_id(device).use()
        gen.to_gpu()  # Copy the model to the GPU
        dis.to_gpu()

    # Setup an optimizer
    def make_optimizer(model, comm, alpha=0.0002, beta1=0.5):
        """Return a multi-node Adam optimizer with weight decay for model."""
        # Create a multi node optimizer from a standard Chainer optimizer.
        optimizer = chainermn.create_multi_node_optimizer(
            chainer.optimizers.Adam(alpha=alpha, beta1=beta1), comm)
        optimizer.setup(model)
        optimizer.add_hook(chainer.optimizer.WeightDecay(0.0001), 'hook_dec')
        return optimizer

    opt_gen = make_optimizer(gen, comm)
    opt_dis = make_optimizer(dis, comm)

    # Split and distribute the dataset. Only worker 0 loads the whole dataset.
    # Datasets of worker 0 are evenly split and distributed to all workers.
    if comm.rank == 0:
        if args.dataset == '':
            # Load the CIFAR10 dataset if args.dataset is not specified
            train, _ = chainer.datasets.get_cifar10(withlabel=False,
                                                    scale=255.)
        else:
            all_files = os.listdir(args.dataset)
            # Match on the file extension; a substring test would also pick
            # up unrelated names that merely contain 'png' or 'jpg'.
            image_files = [f for f in all_files
                           if f.endswith(('.png', '.jpg'))]
            print('{} contains {} image files'
                  .format(args.dataset, len(image_files)))
            train = chainer.datasets\
                .ImageDataset(paths=image_files, root=args.dataset)
    else:
        train = None

    train = chainermn.scatter_dataset(train, comm)

    train_iter = chainer.iterators.SerialIterator(train, args.batchsize)

    # Set up a trainer
    updater = DCGANUpdater(
        models=(gen, dis),
        iterator=train_iter,
        optimizer={
            'gen': opt_gen, 'dis': opt_dis},
        device=device)
    trainer = training.Trainer(updater, (args.epoch, 'epoch'), out=args.out)

    # Some display and output extensions are necessary only for one worker.
    # (Otherwise, there would just be repeated outputs.)
    if comm.rank == 0:
        snapshot_interval = (args.snapshot_interval, 'iteration')
        display_interval = (args.display_interval, 'iteration')
        # Save only model parameters.
        # `snapshot` extension will save all the trainer module's attribute,
        # including `train_iter`.
        # However, `train_iter` depends on scattered dataset, which means that
        # `train_iter` may be different in each process.
        # Here, instead of saving whole trainer module, only the network models
        # are saved.
        trainer.extend(extensions.snapshot_object(
            gen, 'gen_iter_{.updater.iteration}.npz'),
            trigger=snapshot_interval)
        trainer.extend(extensions.snapshot_object(
            dis, 'dis_iter_{.updater.iteration}.npz'),
            trigger=snapshot_interval)
        trainer.extend(extensions.LogReport(trigger=display_interval))
        trainer.extend(extensions.PrintReport([
            'epoch', 'iteration', 'gen/loss', 'dis/loss', 'elapsed_time',
        ]), trigger=display_interval)
        trainer.extend(extensions.ProgressBar(update_interval=10))
        trainer.extend(
            out_generated_image(
                gen, dis,
                10, 10, args.seed, args.out),
            trigger=snapshot_interval)

    # Start the training using pre-trained model, saved by snapshot_object
    if args.gen_model:
        chainer.serializers.load_npz(args.gen_model, gen)
    if args.dis_model:
        chainer.serializers.load_npz(args.dis_model, dis)

    # Run the training
    trainer.run()
예제 #53
0
    def setup(self):
        """Create a naive communicator; skip unless run with two processes."""
        self.communicator = chainermn.create_communicator('naive')

        n_processes = self.communicator.size
        if not n_processes == 2:
            pytest.skip('This test is for two processes')
예제 #54
0
 def setUp(self):
     """Create a naive communicator and always show DeprecationWarning."""
     self.communicator = chainermn.create_communicator('naive')
     # 'always' makes every DeprecationWarning occurrence be reported.
     warnings.filterwarnings('always', category=DeprecationWarning)
예제 #55
0
def train(args, dataset_train, dataset_test):
    """Train a CycleGAN (two generators, two discriminators).

    Seeds ``random`` and ``np.random`` with 0, selects the device, builds the
    four networks with one Adam optimizer each, and runs a trainer for
    ``niter + niter_decay`` epochs.  Results are written below
    ``logs/train_cyclegan/<timestamp>``.

    Args:
        args: Namespace read for ``multi_node``, ``batch_size`` and ``gpu``.
            It is mutated: ``batch_size_total``, ``lr``, ``beta1`` and
            ``beta2`` are always set, and ``n_gpu``, ``inter_size`` and
            ``intra_size`` are set in multi-node mode.
        dataset_train: Training dataset.  In multi-node mode only the copy
            on rank 0 is used; it is scattered to all workers.
        dataset_test: Test dataset, handled like ``dataset_train``.
    """
    random.seed(0)
    np.random.seed(0)

    if args.multi_node:
        import chainermn
        comm = chainermn.create_communicator('hierarchical')
        device = comm.intra_rank

        args.n_gpu = comm.size
        args.inter_size = comm.inter_size
        args.intra_size = comm.intra_size
        args.batch_size_total = args.batch_size * args.n_gpu

        # ``get_device`` is deprecated; use the id-based accessor, as the
        # single-node branch below already does.
        chainer.cuda.get_device_from_id(device).use()
    else:
        args.batch_size_total = args.batch_size
        chainer.cuda.get_device_from_id(args.gpu).use()
        device = args.gpu

    # Model

    G_A = chainer_cyclegan.models.ResnetGenerator()
    G_B = chainer_cyclegan.models.ResnetGenerator()
    D_A = chainer_cyclegan.models.NLayerDiscriminator()
    D_B = chainer_cyclegan.models.NLayerDiscriminator()

    if args.multi_node or args.gpu >= 0:
        G_A.to_gpu()
        G_B.to_gpu()
        D_A.to_gpu()
        D_B.to_gpu()

    # Optimizer

    args.lr = 0.0002
    args.beta1 = 0.5
    args.beta2 = 0.999

    optimizer_G_A = chainer.optimizers.Adam(alpha=args.lr,
                                            beta1=args.beta1,
                                            beta2=args.beta2)
    optimizer_G_B = chainer.optimizers.Adam(alpha=args.lr,
                                            beta1=args.beta1,
                                            beta2=args.beta2)
    optimizer_D_A = chainer.optimizers.Adam(alpha=args.lr,
                                            beta1=args.beta1,
                                            beta2=args.beta2)
    optimizer_D_B = chainer.optimizers.Adam(alpha=args.lr,
                                            beta1=args.beta1,
                                            beta2=args.beta2)

    if args.multi_node:
        optimizer_G_A = chainermn.create_multi_node_optimizer(
            optimizer_G_A, comm)
        optimizer_G_B = chainermn.create_multi_node_optimizer(
            optimizer_G_B, comm)
        optimizer_D_A = chainermn.create_multi_node_optimizer(
            optimizer_D_A, comm)
        optimizer_D_B = chainermn.create_multi_node_optimizer(
            optimizer_D_B, comm)

    optimizer_G_A.setup(G_A)
    optimizer_G_B.setup(G_B)
    optimizer_D_A.setup(D_A)
    optimizer_D_B.setup(D_B)

    # Dataset

    if args.multi_node:
        # Only rank 0 supplies the data; the other ranks receive their share
        # from scatter_dataset.
        if comm.rank != 0:
            dataset_train = None
            dataset_test = None
        dataset_train = chainermn.scatter_dataset(dataset_train,
                                                  comm,
                                                  shuffle=True)
        dataset_test = chainermn.scatter_dataset(dataset_test, comm)

    iter_train = chainer.iterators.MultiprocessIterator(
        dataset_train,
        batch_size=args.batch_size,
        n_processes=4,
        shared_mem=10**7)
    iter_test = chainer.iterators.SerialIterator(dataset_test,
                                                 batch_size=args.batch_size,
                                                 repeat=False,
                                                 shuffle=False)

    # Updater

    epoch_count = 1
    niter = 100
    niter_decay = 100

    updater = chainer_cyclegan.updaters.CycleGANUpdater(
        iterator=iter_train,
        optimizer=dict(
            G_A=optimizer_G_A,
            G_B=optimizer_G_B,
            D_A=optimizer_D_A,
            D_B=optimizer_D_B,
        ),
        device=device,
    )

    # Trainer

    out = osp.join('logs/train_cyclegan',
                   datetime.datetime.now().strftime('%Y%m%d_%H%M%S'))
    trainer = training.Trainer(updater, (niter + niter_decay, 'epoch'),
                               out=out)

    @training.make_extension(trigger=(1, 'epoch'))
    def tune_learning_rate(trainer):
        """Set every optimizer's alpha from the linear decay schedule."""
        epoch = trainer.updater.epoch

        # The factor is 1.0 until ``epoch + 1 + epoch_count`` exceeds
        # ``niter`` and then drops by 1 / (niter_decay + 1) per epoch.
        lr_rate = 1.0 - (max(0, epoch + 1 + epoch_count - niter) /
                         float(niter_decay + 1))

        # Scale the base learning rate.  Multiplying the current alpha in
        # place would compound the factor across epochs and decay much
        # faster than the linear schedule computed above.
        alpha = args.lr * lr_rate
        for name in ('G_A', 'G_B', 'D_A', 'D_B'):
            trainer.updater.get_optimizer(name).alpha = alpha

    trainer.extend(tune_learning_rate)

    # Evaluation, snapshots and reporting run on a single process only.
    if not args.multi_node or comm.rank == 0:
        trainer.extend(
            chainer_cyclegan.extensions.CycleGANEvaluator(iter_test,
                                                          device=device))

        trainer.extend(extensions.snapshot_object(
            target=G_A, filename='G_A_{.updater.epoch:08}.npz'),
                       trigger=(1, 'epoch'))
        trainer.extend(extensions.snapshot_object(
            target=G_B, filename='G_B_{.updater.epoch:08}.npz'),
                       trigger=(1, 'epoch'))
        trainer.extend(extensions.snapshot_object(
            target=D_A, filename='D_A_{.updater.epoch:08}.npz'),
                       trigger=(1, 'epoch'))
        trainer.extend(extensions.snapshot_object(
            target=D_B, filename='D_B_{.updater.epoch:08}.npz'),
                       trigger=(1, 'epoch'))

        trainer.extend(extensions.LogReport(trigger=(20, 'iteration')))
        trainer.extend(
            extensions.PrintReport([
                'epoch',
                'iteration',
                'elapsed_time',
                'loss_gen_A',
                'loss_gen_B',
                'loss_dis_A',
                'loss_dis_B',
                'loss_cyc_A',
                'loss_cyc_B',
                'loss_idt_A',
                'loss_idt_B',
            ]))
        trainer.extend(contrib.extensions.ParamsReport(args.__dict__))
        trainer.extend(extensions.ProgressBar(update_interval=10))

        assert extensions.PlotReport.available()
        trainer.extend(
            extensions.PlotReport(y_keys=['loss_gen_A', 'loss_gen_B'],
                                  x_key='iteration',
                                  file_name='loss_gen.png',
                                  trigger=(100, 'iteration')))
        trainer.extend(
            extensions.PlotReport(y_keys=['loss_dis_A', 'loss_dis_B'],
                                  x_key='iteration',
                                  file_name='loss_dis.png',
                                  trigger=(100, 'iteration')))
        trainer.extend(
            extensions.PlotReport(y_keys=['loss_cyc_A', 'loss_cyc_B'],
                                  x_key='iteration',
                                  file_name='loss_cyc.png',
                                  trigger=(100, 'iteration')))
        trainer.extend(
            extensions.PlotReport(y_keys=['loss_idt_A', 'loss_idt_B'],
                                  x_key='iteration',
                                  file_name='loss_idt.png',
                                  trigger=(100, 'iteration')))

    trainer.run()
예제 #56
0
def main():
    """Train a two-stage pipelined MLP on MNIST over exactly two processes.

    Rank 0 holds the first stage wrapped in a classifier and iterates the
    real dataset; rank 1 holds the second stage and iterates empty datasets.

    Raises:
        ValueError: if the communicator does not span exactly two processes.
    """
    arg_parser = argparse.ArgumentParser(
        description='ChainerMN example: pipelined neural network')
    arg_parser.add_argument(
        '--batchsize', '-b', type=int, default=100,
        help='Number of images in each mini-batch')
    arg_parser.add_argument(
        '--epoch', '-e', type=int, default=20,
        help='Number of sweeps over the dataset to train')
    arg_parser.add_argument(
        '--gpu', '-g', action='store_true', help='Use GPU')
    arg_parser.add_argument(
        '--out', '-o', default='result',
        help='Directory to output the result')
    arg_parser.add_argument(
        '--unit', '-u', type=int, default=1000, help='Number of units')
    args = arg_parser.parse_args()

    # Prepare the ChainerMN communicator and pick the device id.
    use_gpu = args.gpu
    comm = chainermn.create_communicator(
        'hierarchical' if use_gpu else 'naive')
    device = comm.intra_rank if use_gpu else -1

    if comm.size != 2:
        raise ValueError(
            'This example can only be executed on exactly 2 processes.')

    rank = comm.rank
    is_first_worker = rank == 0

    if is_first_worker:
        print('==========================================')
        if use_gpu:
            print('Using GPUs')
        print('Num unit: {}'.format(args.unit))
        print('Num Minibatch-size: {}'.format(args.batchsize))
        print('Num epoch: {}'.format(args.epoch))
        print('==========================================')

    # Each rank builds its own stage of the pipeline.
    if is_first_worker:
        model = L.Classifier(MLP0(comm, args.unit))
    elif rank == 1:
        model = MLP1(comm, args.unit, 10)

    if device >= 0:
        chainer.cuda.get_device_from_id(device).use()
        model.to_gpu()

    optimizer = chainer.optimizers.Adam()
    optimizer.setup(model)

    # Only worker 0 iterates the real data; worker 1 gets empty datasets.
    train, test = chainer.datasets.get_mnist()
    if rank == 1:
        train = chainermn.datasets.create_empty_dataset(train)
        test = chainermn.datasets.create_empty_dataset(test)

    batchsize = args.batchsize
    train_iter = chainer.iterators.SerialIterator(
        train, batchsize, shuffle=False)
    test_iter = chainer.iterators.SerialIterator(
        test, batchsize, repeat=False, shuffle=False)

    updater = training.StandardUpdater(train_iter, optimizer, device=device)
    trainer = training.Trainer(updater, (args.epoch, 'epoch'), out=args.out)
    trainer.extend(extensions.Evaluator(test_iter, model, device=device))

    # Display and output extensions are needed on worker 0 only.
    if is_first_worker:
        report_keys = [
            'epoch', 'main/loss', 'validation/main/loss',
            'main/accuracy', 'validation/main/accuracy', 'elapsed_time']
        trainer.extend(extensions.dump_graph('main/loss'))
        trainer.extend(extensions.LogReport())
        trainer.extend(extensions.PrintReport(report_keys))
        trainer.extend(extensions.ProgressBar())

    trainer.run()
예제 #57
0
def main():
    """Train SSD300 or SSD512 on the VOC 2007+2012 trainval sets with ChainerMN.

    Rank 0 creates the index array that is scattered to all workers, and is
    the only rank that evaluates on VOC 2007 test, reports, and snapshots.
    """
    arg_parser = argparse.ArgumentParser()
    arg_parser.add_argument(
        '--model', choices=('ssd300', 'ssd512'), default='ssd300')
    arg_parser.add_argument('--batchsize', type=int, default=32)
    arg_parser.add_argument('--test-batchsize', type=int, default=16)
    arg_parser.add_argument('--iteration', type=int, default=120000)
    arg_parser.add_argument(
        '--step', type=int, nargs='*', default=[80000, 100000])
    arg_parser.add_argument('--out', default='result')
    arg_parser.add_argument('--resume')
    args = arg_parser.parse_args()

    comm = chainermn.create_communicator()
    device = comm.intra_rank
    is_root = comm.rank == 0

    # ``choices`` above restricts --model to exactly these two keys.
    detector_classes = {'ssd300': SSD300, 'ssd512': SSD512}
    model = detector_classes[args.model](
        n_fg_class=len(voc_bbox_label_names), pretrained_model='imagenet')

    model.use_preset('evaluate')
    train_chain = MultiboxTrainChain(model)
    chainer.cuda.get_device_from_id(device).use()
    model.to_gpu()

    voc_trainval = ConcatenatedDataset(
        VOCBboxDataset(year='2007', split='trainval'),
        VOCBboxDataset(year='2012', split='trainval'))
    train = TransformDataset(
        voc_trainval,
        ('img', 'mb_loc', 'mb_label'),
        Transform(model.coder, model.insize, model.mean))

    # Rank 0 provides the full index range; every rank keeps its share.
    indices = np.arange(len(train)) if is_root else None
    indices = chainermn.scatter_dataset(indices, comm, shuffle=True)
    train = train.slice[indices]

    # http://chainermn.readthedocs.io/en/latest/tutorial/tips_faqs.html#using-multiprocessiterator
    if hasattr(multiprocessing, 'set_start_method'):
        multiprocessing.set_start_method('forkserver')
    # The requested batch size is divided among the processes.
    local_batchsize = args.batchsize // comm.size
    train_iter = chainer.iterators.MultiprocessIterator(
        train, local_batchsize, n_processes=2)

    if is_root:
        test = VOCBboxDataset(
            year='2007', split='test',
            use_difficult=True, return_difficult=True)
        test_iter = chainer.iterators.SerialIterator(
            test, args.test_batchsize, repeat=False, shuffle=False)

    # initial lr is set to 1e-3 by ExponentialShift
    optimizer = chainermn.create_multi_node_optimizer(
        chainer.optimizers.MomentumSGD(), comm)
    optimizer.setup(train_chain)
    for param in train_chain.params():
        # Parameters named 'b' get gradient scaling, all others weight decay.
        if param.name == 'b':
            hook = GradientScaling(2)
        else:
            hook = WeightDecay(0.0005)
        param.update_rule.add_hook(hook)

    updater = training.updaters.StandardUpdater(
        train_iter, optimizer, device=device)
    trainer = training.Trainer(
        updater, (args.iteration, 'iteration'), args.out)
    lr_trigger = triggers.ManualScheduleTrigger(args.step, 'iteration')
    trainer.extend(
        extensions.ExponentialShift('lr', 0.1, init=1e-3),
        trigger=lr_trigger)

    if is_root:
        evaluator = DetectionVOCEvaluator(
            test_iter, model, use_07_metric=True,
            label_names=voc_bbox_label_names)
        trainer.extend(
            evaluator,
            trigger=triggers.ManualScheduleTrigger(
                args.step + [args.iteration], 'iteration'))

        log_interval = (10, 'iteration')
        report_keys = [
            'epoch', 'iteration', 'lr', 'main/loss', 'main/loss/loc',
            'main/loss/conf', 'validation/main/map'
        ]
        trainer.extend(extensions.LogReport(trigger=log_interval))
        trainer.extend(extensions.observe_lr(), trigger=log_interval)
        trainer.extend(
            extensions.PrintReport(report_keys), trigger=log_interval)
        trainer.extend(extensions.ProgressBar(update_interval=10))

        # A separate trigger instance is built for the snapshot extension.
        trainer.extend(
            extensions.snapshot(),
            trigger=triggers.ManualScheduleTrigger(
                args.step + [args.iteration], 'iteration'))
        trainer.extend(
            extensions.snapshot_object(
                model, 'model_iter_{.updater.iteration}'),
            trigger=(args.iteration, 'iteration'))

    if args.resume:
        serializers.load_npz(args.resume, trainer)

    trainer.run()
예제 #58
0
def main():
    """Train a conditional GAN described by a YAML config with ChainerMN.

    Reads the config file, creates the communicator and selects the GPU by
    ``comm.intra_rank``, loads the models, scatters the dataset loaded on
    rank 0, and runs the trainer for ``config.iteration`` iterations.
    Snapshot, reporting and sample-generation extensions are registered on
    rank 0 only; the learning-rate shift extensions run on every rank.
    """
    parser = argparse.ArgumentParser()
    parser.add_argument('--config_path',
                        type=str,
                        default='configs/base.yml',
                        help='path to config file')
    parser.add_argument('--data_dir', type=str, default='./data/imagenet')
    parser.add_argument('--results_dir',
                        type=str,
                        default='./results/gans',
                        help='directory to save the results to')
    parser.add_argument('--inception_model_path',
                        type=str,
                        default='./datasets/inception_model',
                        help='path to the inception model')
    parser.add_argument('--snapshot',
                        type=str,
                        default='',
                        help='path to the snapshot')
    parser.add_argument('--loaderjob',
                        type=int,
                        help='number of parallel data loading processes')
    parser.add_argument('--communicator',
                        type=str,
                        default='hierarchical',
                        help='Type of communicator')

    args = parser.parse_args()
    # Close the config file deterministically instead of leaking the handle.
    # NOTE(review): yaml.load without an explicit Loader can construct
    # arbitrary objects on older PyYAML and is rejected by newer releases;
    # switch to yaml.safe_load if the configs are plain YAML -- confirm.
    with open(args.config_path) as config_file:
        config = yaml_utils.Config(yaml.load(config_file))
    comm = chainermn.create_communicator(args.communicator)
    device = comm.intra_rank
    chainer.cuda.get_device_from_id(device).use()
    print("init")
    if comm.rank == 0:
        print('==========================================')
        print('Using {} communicator'.format(args.communicator))
        print('==========================================')
    # Model
    gen, dis = load_models(config)
    gen.to_gpu()
    dis.to_gpu()
    models = {"gen": gen, "dis": dis}
    # Optimizer
    opt_gen = make_optimizer(gen,
                             comm,
                             alpha=config.adam['alpha'],
                             beta1=config.adam['beta1'],
                             beta2=config.adam['beta2'])
    opt_dis = make_optimizer(dis,
                             comm,
                             alpha=config.adam['alpha'],
                             beta1=config.adam['beta1'],
                             beta2=config.adam['beta2'])
    opts = {"opt_gen": opt_gen, "opt_dis": opt_dis}
    # Dataset
    config['dataset']['args']['root'] = args.data_dir
    if comm.rank == 0:
        dataset = yaml_utils.load_dataset(config)
    else:
        _ = yaml_utils.load_dataset(
            config)  # Dummy, for adding path to the dataset module
        dataset = None
    dataset = chainermn.scatter_dataset(dataset, comm)
    # Iterator
    multiprocessing.set_start_method('forkserver')
    iterator = chainer.iterators.MultiprocessIterator(
        dataset, config.batchsize, n_processes=args.loaderjob)
    # Updater: config-supplied arguments are extended with the runtime ones.
    kwargs = config.updater['args'] if 'args' in config.updater else {}
    kwargs.update({
        'models': models,
        'iterator': iterator,
        'optimizer': opts,
        'device': device,
    })
    updater = yaml_utils.load_updater_class(config)
    updater = updater(**kwargs)
    out = args.results_dir
    if comm.rank == 0:
        create_result_dir(out, args.config_path, config)
    trainer = training.Trainer(updater, (config.iteration, 'iteration'),
                               out=out)
    report_keys = ["loss_dis", "loss_gen", "inception_mean", "inception_std"]
    if comm.rank == 0:
        # Set up logging
        trainer.extend(extensions.snapshot(),
                       trigger=(config.snapshot_interval, 'iteration'))
        for m in models.values():
            trainer.extend(extensions.snapshot_object(
                m, m.__class__.__name__ + '_{.updater.iteration}.npz'),
                           trigger=(config.snapshot_interval, 'iteration'))
        trainer.extend(
            extensions.LogReport(keys=report_keys,
                                 trigger=(config.display_interval,
                                          'iteration')))
        trainer.extend(extensions.PrintReport(report_keys),
                       trigger=(config.display_interval, 'iteration'))
        trainer.extend(sample_generate_conditional(gen,
                                                   out,
                                                   n_classes=gen.n_classes),
                       trigger=(config.evaluation_interval, 'iteration'),
                       priority=extension.PRIORITY_WRITER)
        trainer.extend(sample_generate_light(gen, out, rows=10, cols=10),
                       trigger=(config.evaluation_interval // 10, 'iteration'),
                       priority=extension.PRIORITY_WRITER)
        trainer.extend(calc_inception(gen,
                                      n_ims=5000,
                                      splits=1,
                                      path=args.inception_model_path),
                       trigger=(config.evaluation_interval, 'iteration'),
                       priority=extension.PRIORITY_WRITER)
        trainer.extend(
            extensions.ProgressBar(
                update_interval=config.progressbar_interval))
    # Shift alpha linearly from its configured value to 0 between
    # ``iteration_decay_start`` and ``iteration`` for both optimizers.
    ext_opt_gen = extensions.LinearShift(
        'alpha', (config.adam['alpha'], 0.),
        (config.iteration_decay_start, config.iteration), opt_gen)
    ext_opt_dis = extensions.LinearShift(
        'alpha', (config.adam['alpha'], 0.),
        (config.iteration_decay_start, config.iteration), opt_dis)
    trainer.extend(ext_opt_gen)
    trainer.extend(ext_opt_dis)
    if args.snapshot:
        print("Resume training with snapshot:{}".format(args.snapshot))
        chainer.serializers.load_npz(args.snapshot, trainer)

    # Run the training
    print("start training")
    trainer.run()
예제 #59
0
def test_deprecation():
    """Check that creating either communicator emits a DeprecationWarning."""
    for communicator_name in ('hierarchical', 'two_dimensional'):
        with chainer.testing.assert_warns(DeprecationWarning):
            chainermn.create_communicator(communicator_name)
예제 #60
0
파일: train.py 프로젝트: asi1024/chainer
def main():
    """Train VGG16 on CIFAR-10/100 with ChainerMN multi-node iterators.

    Only rank 0 keeps the real datasets; the other ranks use empty datasets
    and obtain batches through the multi-node iterators.  Reporting
    extensions are registered on rank 0 only.

    Raises:
        RuntimeError: if ``--dataset`` is neither 'cifar10' nor 'cifar100'.
    """
    parser = argparse.ArgumentParser(description='ChainerMN example: VGG16')
    parser.add_argument('--dataset', '-d', default='cifar10',
                        help='The dataset to use: cifar10 or cifar100')
    parser.add_argument('--batchsize', '-b', type=int, default=64,
                        help='Number of images in each mini-batch')
    parser.add_argument('--epoch', '-e', type=int, default=20,
                        help='Number of sweeps over the dataset to train')
    parser.add_argument('--learnrate', '-l', type=float, default=0.05,
                        help='Learning rate for SGD')
    parser.add_argument('--frequency', '-f', type=int, default=-1,
                        help='Frequency of taking a snapshot')
    parser.add_argument('--gpu', '-g', action='store_true', default=False,
                        help='use GPU')
    parser.add_argument('--out', '-o', default='result',
                        help='Directory to output the result')
    parser.add_argument('--noplot', dest='plot', action='store_false',
                        help='Disable PlotReport extension')
    args = parser.parse_args()

    # Create ChainerMN communicator.
    if args.gpu:
        comm = chainermn.create_communicator('hierarchical')
        # Use the node-local rank as the GPU id: the global rank exceeds the
        # number of GPUs per node as soon as more than one node is used.
        device = comm.intra_rank
    else:
        comm = chainermn.create_communicator('naive')
        device = -1

    if comm.rank == 0:
        print('GPU: {}'.format(args.gpu))
        print('# Minibatch-size: {}'.format(args.batchsize))
        print('# epoch: {}'.format(args.epoch))
        print('')

    # Load the CIFAR10 or CIFAR100 dataset
    if args.dataset == 'cifar10':
        class_labels = 10
        train, test = chainer.datasets.get_cifar10()
    elif args.dataset == 'cifar100':
        class_labels = 100
        train, test = chainer.datasets.get_cifar100()
    else:
        raise RuntimeError('Invalid dataset choice.')

    model = L.Classifier(VGG.VGG(comm, class_labels))

    if args.gpu:
        # Make a specified GPU current
        chainer.cuda.get_device_from_id(device).use()
        model.to_gpu()  # Copy the model to the GPU

    # Setup an optimizer
    optimizer = chainer.optimizers.MomentumSGD(args.learnrate)
    optimizer.setup(model)
    optimizer.add_hook(chainer.optimizer.WeightDecay(5e-4))

    # Ranks other than 0 replace the datasets with empty ones.
    if comm.rank != 0:
        train = chainermn.datasets.create_empty_dataset(train)
        test = chainermn.datasets.create_empty_dataset(test)

    train_iter = chainermn.iterators.create_multi_node_iterator(
        chainer.iterators.SerialIterator(train, args.batchsize), comm)
    test_iter = chainermn.iterators.create_multi_node_iterator(
        chainer.iterators.SerialIterator(test, args.batchsize,
                                         repeat=False, shuffle=False),
        comm)

    # Set up a trainer
    updater = training.StandardUpdater(
        train_iter, optimizer, device=device)
    trainer = training.Trainer(updater, (args.epoch, 'epoch'), out=args.out)

    # Evaluate the model with the test dataset for each epoch
    trainer.extend(extensions.Evaluator(test_iter, model, device=device))

    if comm.rank == 0:
        # Dump a computational graph from 'loss' variable
        # The "main" refers to the target link of the "main" optimizer.
        trainer.extend(extensions.DumpGraph('main/loss'))

        # Write a log of evaluation statistics for each epoch
        trainer.extend(extensions.LogReport())

        # Save two plot images to the result dir
        if args.plot and extensions.PlotReport.available():
            trainer.extend(
                extensions.PlotReport(['main/loss', 'validation/main/loss'],
                                      'epoch', file_name='loss.png'))
            trainer.extend(
                extensions.PlotReport(
                    ['main/accuracy', 'validation/main/accuracy'],
                    'epoch', file_name='accuracy.png'))

        trainer.extend(extensions.PrintReport(
            ['epoch', 'main/loss', 'validation/main/loss',
             'main/accuracy', 'validation/main/accuracy', 'elapsed_time']))

        trainer.extend(extensions.ProgressBar())

    # Run the training
    trainer.run()