Example #1
    def setup_mnist_trainer(self, display_log=False):
        batchsize = 100
        n_units = 100

        comm = self.communicator
        model = L.Classifier(MLP(n_units, 10))

        optimizer = chainermn.create_multi_node_optimizer(
            chainer.optimizers.Adam(), comm)
        optimizer.setup(model)

        if comm.rank == 0:
            train, test = chainer.datasets.get_mnist()
        else:
            train, test = None, None

        train = chainermn.scatter_dataset(train, comm, shuffle=True)
        test = chainermn.scatter_dataset(test, comm, shuffle=True)

        train_iter = chainer.iterators.SerialIterator(train, batchsize)
        test_iter = chainer.iterators.SerialIterator(test, batchsize,
                                                     repeat=False,
                                                     shuffle=False)

        updater = training.StandardUpdater(
            train_iter,
            optimizer
        )

        return updater, optimizer, train_iter, test_iter, model
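These snippets are excerpts from larger files, so their imports and helper classes are not shown. A minimal sketch of the imports and the MLP chain that the MNIST examples appear to assume (the original files may differ):

import chainer
import chainer.functions as F
import chainer.links as L
import chainermn
import numpy as np
from chainer import training
from chainer.training import extensions


class MLP(chainer.Chain):

    def __init__(self, n_units, n_out):
        super(MLP, self).__init__()
        with self.init_scope():
            self.l1 = L.Linear(None, n_units)  # input size inferred on first call
            self.l2 = L.Linear(None, n_units)
            self.l3 = L.Linear(None, n_out)

    def __call__(self, x):
        h = F.relu(self.l1(x))
        h = F.relu(self.l2(h))
        return self.l3(h)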
Example #2
def objective(trial, comm):
    # Sample an architecture.
    model = L.Classifier(create_model(trial))

    # Setup optimizer.
    optimizer = chainer.optimizers.MomentumSGD()
    optimizer.setup(model)
    optimizer = chainermn.create_multi_node_optimizer(optimizer, comm)

    # Setup dataset and iterator. Only worker 0 loads the whole dataset.
    # The dataset of worker 0 is evenly split and distributed to all workers.
    if comm.rank == 0:
        train, test = chainer.datasets.get_mnist()
        rng = np.random.RandomState(0)
        train = chainer.datasets.SubDataset(
            train, 0, N_TRAIN_EXAMPLES, order=rng.permutation(len(train)))
        test = chainer.datasets.SubDataset(
            test, 0, N_TEST_EXAMPLES, order=rng.permutation(len(test)))
    else:
        train, test = None, None

    train = chainermn.scatter_dataset(train, comm, shuffle=True)
    test = chainermn.scatter_dataset(test, comm)

    train_iter = chainer.iterators.SerialIterator(train, BATCHSIZE, shuffle=True)
    test_iter = chainer.iterators.SerialIterator(test, BATCHSIZE, repeat=False, shuffle=False)

    # Setup trainer.
    updater = chainer.training.StandardUpdater(train_iter, optimizer)
    trainer = chainer.training.Trainer(updater, (EPOCH, 'epoch'))

    # Add Chainer extension for pruners.
    trainer.extend(
        optuna.integration.ChainerPruningExtension(trial, 'validation/main/accuracy',
                                                   (PRUNER_INTERVAL, 'epoch')))
    evaluator = chainer.training.extensions.Evaluator(test_iter, model)
    trainer.extend(chainermn.create_multi_node_evaluator(evaluator, comm))
    log_report_extension = chainer.training.extensions.LogReport(log_name=None)
    trainer.extend(log_report_extension)

    if comm.rank == 0:
        trainer.extend(chainer.training.extensions.ProgressBar())

    # Run training.
    # Set show_loop_exception_msg=False to suppress messages about the TrialPruned exception:
    # ChainerPruningExtension raises TrialPruned to stop training, and the
    # trainer prints a message every time it receives one.
    trainer.run(show_loop_exception_msg=False)

    # Evaluate.
    evaluator = chainer.training.extensions.Evaluator(test_iter, model)
    evaluator = chainermn.create_multi_node_evaluator(evaluator, comm)
    report = evaluator()

    return report['main/accuracy']
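This objective takes (trial, comm), which is the signature expected by Optuna's ChainerMN integration. A sketch of how such an objective is typically driven, assuming a storage-backed study shared by all MPI processes (the study name and storage URL below are placeholders):

import chainermn
import optuna

comm = chainermn.create_communicator('naive')
# Every process loads the same study; the storage must be shared (e.g. an RDB).
study = optuna.load_study(study_name='example-study',
                          storage='sqlite:///example.db')
study = optuna.integration.ChainerMNStudy(study, comm)
study.optimize(objective, n_trials=25)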
Example #3
def objective(trial, comm):
    # Sample an architecture.
    model = L.Classifier(create_model(trial))

    # Setup optimizer.
    optimizer = chainer.optimizers.MomentumSGD()
    optimizer.setup(model)
    optimizer = chainermn.create_multi_node_optimizer(optimizer, comm)

    # Setup dataset and iterator. Only worker 0 loads the whole dataset.
    # The dataset of worker 0 is evenly split and distributed to all workers.
    if comm.rank == 0:
        train, valid = chainer.datasets.get_mnist()
        rng = np.random.RandomState(0)
        train = chainer.datasets.SubDataset(train,
                                            0,
                                            N_TRAIN_EXAMPLES,
                                            order=rng.permutation(len(train)))
        valid = chainer.datasets.SubDataset(valid,
                                            0,
                                            N_VALID_EXAMPLES,
                                            order=rng.permutation(len(valid)))
    else:
        train, valid = None, None

    train = chainermn.scatter_dataset(train, comm, shuffle=True)
    valid = chainermn.scatter_dataset(valid, comm)

    train_iter = chainer.iterators.SerialIterator(train,
                                                  BATCHSIZE,
                                                  shuffle=True)
    valid_iter = chainer.iterators.SerialIterator(valid,
                                                  BATCHSIZE,
                                                  repeat=False,
                                                  shuffle=False)

    # Setup trainer.
    updater = chainer.training.StandardUpdater(train_iter, optimizer)
    trainer = chainer.training.Trainer(updater, (EPOCH, "epoch"))

    if comm.rank == 0:
        trainer.extend(chainer.training.extensions.ProgressBar())

    # Run training.
    trainer.run()

    # Evaluate.
    evaluator = chainer.training.extensions.Evaluator(valid_iter, model)
    evaluator = chainermn.create_multi_node_evaluator(evaluator, comm)
    report = evaluator()

    return report["main/accuracy"]
Example #4
def objective(trial, comm):
    device = comm.intra_rank
    chainer.cuda.get_device_from_id(device).use()

    # Sample an architecture.
    model = L.Classifier(create_model(trial))
    model.to_gpu()

    # Setup optimizer.
    optimizer = chainer.optimizers.MomentumSGD()
    optimizer.setup(model)
    optimizer = chainermn.create_multi_node_optimizer(optimizer, comm)

    # Setup dataset and iterator. Only worker 0 loads the whole dataset.
    # The dataset of worker 0 is evenly split and distributed to all workers.
    if comm.rank == 0:
        train, test = chainer.datasets.get_mnist()
        rng = np.random.RandomState(0)
        train = chainer.datasets.SubDataset(
            train, 0, N_TRAIN_EXAMPLES, order=rng.permutation(len(train)))
        test = chainer.datasets.SubDataset(
            test, 0, N_TEST_EXAMPLES, order=rng.permutation(len(test)))
    else:
        train, test = None, None

    train = chainermn.scatter_dataset(train, comm, shuffle=True)
    test = chainermn.scatter_dataset(test, comm)

    train_iter = chainer.iterators.SerialIterator(
        train, BATCHSIZE, shuffle=True)
    test_iter = chainer.iterators.SerialIterator(
        test, BATCHSIZE, repeat=False, shuffle=False)

    # Setup trainer.
    updater = chainer.training.StandardUpdater(
        train_iter, optimizer, device=device)
    trainer = chainer.training.Trainer(updater, (EPOCH, 'epoch'))

    if comm.rank == 0:
        trainer.extend(chainer.training.extensions.ProgressBar())

    # Run training.
    trainer.run()

    # Evaluate.
    evaluator = chainer.training.extensions.Evaluator(
        test_iter, model, device=device)
    evaluator = chainermn.create_multi_node_evaluator(evaluator, comm)
    report = evaluator()

    return report['main/accuracy']
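create_model(trial) is not included in these excerpts. A hypothetical sketch of such a helper, sampling the depth and width of an MLP from the trial (illustration only, not the original code):

import chainer
import chainer.functions as F
import chainer.links as L


def create_model(trial):
    # Sample the number of layers and the number of units per layer.
    n_layers = trial.suggest_int('n_layers', 1, 3)
    layers = []
    for i in range(n_layers):
        n_units = trial.suggest_int('n_units_l{}'.format(i), 32, 256)
        layers.append(L.Linear(None, n_units))
        layers.append(F.relu)
    layers.append(L.Linear(None, 10))  # 10 MNIST classes
    return chainer.Sequential(*layers)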
Example #5
    def test_mnist(self, display_log=True):
        # This test is intended to run on Travis CI, so no GPU is used
        # for now.
        epoch = 5
        batchsize = 100
        n_units = 100

        comm = chainermn.create_communicator('naive')
        model = L.Classifier(MLP(n_units, 10))
        optimizer = chainermn.create_multi_node_optimizer(
            chainer.optimizers.Adam(), comm)
        optimizer.setup(model)

        if comm.rank == 0:
            train, test = chainer.datasets.get_mnist()
        else:
            train, test = None, None

        train = chainermn.scatter_dataset(train, comm)
        test = chainermn.scatter_dataset(test, comm)

        train_iter = chainer.iterators.SerialIterator(train, batchsize)
        test_iter = chainer.iterators.SerialIterator(test,
                                                     batchsize,
                                                     repeat=False,
                                                     shuffle=False)

        updater = training.StandardUpdater(train_iter, optimizer)
        trainer = training.Trainer(updater, (epoch, 'epoch'))

        # Wrap standard Chainer evaluators by MultiNodeEvaluator.
        evaluator = extensions.Evaluator(test_iter, model)
        evaluator = chainermn.create_multi_node_evaluator(evaluator, comm)
        trainer.extend(evaluator)

        # Some display and output extensions are necessary only for one worker.
        # (Otherwise, there would just be repeated outputs.)
        if comm.rank == 0 and display_log:
            trainer.extend(extensions.LogReport(trigger=(1, 'epoch')),
                           trigger=(1, 'epoch'))
            trainer.extend(extensions.PrintReport([
                'epoch', 'main/loss', 'validation/main/loss', 'main/accuracy',
                'validation/main/accuracy', 'elapsed_time'
            ],
                                                  out=sys.stderr),
                           trigger=(1, 'epoch'))
        trainer.run()

        accuracy = evaluator()['validation/main/accuracy']
        self.assertGreaterEqual(accuracy, 0.95)
Example #6
def objective(trial, comm):
    # Sample an architecture.
    model = L.Classifier(create_model(trial))

    # Setup optimizer.
    optimizer = chainer.optimizers.MomentumSGD()
    optimizer.setup(model)
    optimizer = chainermn.create_multi_node_optimizer(optimizer, comm)

    # Setup dataset and iterator.
    train, test = chainer.datasets.get_mnist()
    rng = np.random.RandomState(0)
    train = chainer.datasets.SubDataset(train,
                                        0,
                                        N_TRAIN_EXAMPLES,
                                        order=rng.permutation(len(train)))
    test = chainer.datasets.SubDataset(test,
                                       0,
                                       N_TEST_EXAMPLES,
                                       order=rng.permutation(len(test)))

    train = chainermn.scatter_dataset(train, comm, shuffle=True)
    test = chainermn.scatter_dataset(test, comm)

    train_iter = chainer.iterators.SerialIterator(train,
                                                  BATCHSIZE,
                                                  shuffle=True)
    test_iter = chainer.iterators.SerialIterator(test,
                                                 BATCHSIZE,
                                                 repeat=False,
                                                 shuffle=False)

    # Setup trainer.
    updater = chainer.training.StandardUpdater(train_iter, optimizer)
    trainer = chainer.training.Trainer(updater, (EPOCH, 'epoch'))

    if comm.rank == 0:
        trainer.extend(chainer.training.extensions.ProgressBar())

    # Run training.
    trainer.run()

    # Evaluate.
    evaluator = chainer.training.extensions.Evaluator(test_iter, model)
    evaluator = chainermn.create_multi_node_evaluator(evaluator, comm)
    report = evaluator()

    return 1.0 - report['main/accuracy']
Example #7
def _prepare_multinode_snapshot(n, result):
    n_units = 100
    batchsize = 10
    comm = create_communicator('naive')
    model = L.Classifier(MLP(n_units, 10))
    optimizer = chainermn.create_multi_node_optimizer(
        chainer.optimizers.Adam(), comm)
    optimizer.setup(model)

    if comm.rank == 0:
        train, _ = chainer.datasets.get_mnist()
    else:
        train, _ = None, None

    train = chainermn.scatter_dataset(train, comm, shuffle=True)
    train_iter = chainer.iterators.SerialIterator(train, batchsize)

    updater = StandardUpdater(train_iter, optimizer)
    trainer = Trainer(updater, out=result)

    snapshot = extensions.snapshot(target=updater, autoload=True)
    replica_sets = []
    mn_snapshot = multi_node_snapshot(comm, snapshot, replica_sets)
    mn_snapshot.initialize(trainer)
    for _ in range(n):
        updater.update()

    return updater, mn_snapshot, trainer
Example #8
    def scatter_large_data(self, comm_type):
        comm = self.communicator
        if comm.rank == 0:
            data = ["test"] * 2000000000
            data = chainermn.scatter_dataset(data, comm)
        else:
            data = []
            data = chainermn.scatter_dataset(data, comm)
Example #9
def train(x_data,
          t_data,
          batchsize=128,
          layer=1,
          in_units=1,
          hidden_units=5,
          out_units=1):

    comm = chainermn.create_communicator('naive')

    # Iterator
    x_data = chainermn.scatter_dataset(x_data, comm)
    t_data = chainermn.scatter_dataset(t_data, comm)
    train_iter = iterators.SerialIterator(x_data, batchsize)
    test_iter = iterators.SerialIterator(t_data,
                                         batchsize,
                                         repeat=False,
                                         shuffle=False)

    # setup model
    model = LSTM(in_units, hidden_units, out_units)

    # setup optimizer
    optimizer = chainermn.create_multi_node_optimizer(optimizers.Adam(), comm)
    optimizer.setup(model)

    updater = training.StandardUpdater(train_iter, optimizer, MyConverter)
    trainer = training.Trainer(updater, (20, 'epoch'), out='result')
    if comm.rank == 0:
        trainer.extend(extensions.LogReport())
        trainer.extend(extensions.dump_graph('main/loss'))
        trainer.extend(extensions.observe_lr())
        trainer.extend(extensions.Evaluator(test_iter, model, MyConverter),
                       name='val')
        trainer.extend(
            extensions.PrintReport(
                ['epoch', 'main/loss', 'val/main/loss', 'elapsed_time', 'lr']))
        trainer.extend(
            extensions.PlotReport(['main/loss', 'val/main/loss'],
                                  x_key='epoch',
                                  file_name='loss.png'))
#    trainer.extend(extensions.ProgressBar())

    trainer.run()
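MyConverter is not shown in this excerpt. A hypothetical converter with the interface Chainer expects, i.e. a callable taking a minibatch and a device and returning the arrays fed to the model (the original MyConverter presumably prepares the LSTM inputs differently):

from chainer.dataset import concat_examples


def MyConverter(batch, device=None):
    # Concatenate the samples in the minibatch and send them to `device`.
    return concat_examples(batch, device=device)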
Example #10
def get_dataset(args, comm, model):
    mean = np.load(args.mean)
    if args.loadtype == 'development':
        if comm.rank == 0:
            train = dlframeworks.chainer.datasets.read_pairs(args.train)
            val = dlframeworks.chainer.datasets.read_pairs(args.val)
        else:
            train = None
            val = None
        train = chainermn.scatter_dataset(train, comm, shuffle=True)
        val = chainermn.scatter_dataset(val, comm)
        train = dlframeworks.chainer.datasets.CroppingDataset(
            train, args.train_root, mean, model.insize, model.insize)
        val = dlframeworks.chainer.datasets.CroppingDataset(
            val, args.val_root, mean, model.insize, model.insize, False)
    else:
        raise NotImplementedError('Invalid loadtype: {}'.format(args.loadtype))
    return train, val
Example #11
def main():
    parser = argparse.ArgumentParser()
    parser.add_argument('--dataset',
                        choices=('cityscapes', 'ade20k', 'camvid'))
    parser.add_argument('--model', choices=('pspnet_resnet101', 'segnet'))
    parser.add_argument('--pretrained-model')
    parser.add_argument('--input-size', type=int, default=None)
    args = parser.parse_args()

    comm = chainermn.create_communicator()
    device = comm.intra_rank

    dataset, label_names, model = get_dataset_and_model(
        args.dataset, args.model, args.pretrained_model,
        (args.input_size, args.input_size))
    assert len(dataset) % comm.size == 0, \
        "The size of the dataset should be a multiple "\
        "of the number of GPUs"

    chainer.cuda.get_device_from_id(device).use()
    model.to_gpu()

    if comm.rank == 0:
        indices = np.arange(len(dataset))
    else:
        indices = None
    indices = chainermn.scatter_dataset(indices, comm)
    dataset = dataset.slice[indices]

    it = iterators.SerialIterator(dataset, 1, repeat=False, shuffle=False)

    in_values, out_values, rest_values = apply_to_iterator(model.predict,
                                                           it,
                                                           hook=ProgressHook(
                                                               len(dataset)))
    # Delete an iterator of images to save memory usage.
    del in_values
    pred_labels, = out_values
    gt_labels, = rest_values

    confusion = calc_semantic_segmentation_confusion(pred_labels, gt_labels)
    confusion = comm.allreduce(confusion)

    if comm.rank == 0:
        iou = calc_semantic_segmentation_iou(confusion)
        pixel_accuracy = np.diag(confusion).sum() / confusion.sum()
        class_accuracy = np.diag(confusion) / np.sum(confusion, axis=1)

        for iu, label_name in zip(iou, label_names):
            print('{:>23} : {:.4f}'.format(label_name, iu))
        print('=' * 34)
        print('{:>23} : {:.4f}'.format('mean IoU', np.nanmean(iou)))
        print('{:>23} : {:.4f}'.format('Class average accuracy',
                                       np.nanmean(class_accuracy)))
        print('{:>23} : {:.4f}'.format('Global average accuracy',
                                       pixel_accuracy))
Example #12
File: train.py  Project: maeshu/jsk_apc
def get_model_and_data(
    affordance,
    batch_size=1,
    comm=None,
    modal='rgb',
    augmentation=True,
    resolution=30,
):
    if affordance == 'suction':
        dataset_train = grasp_fusion.datasets.SuctionDataset(
            'train', augmentation=augmentation,
        )
        dataset_test = grasp_fusion.datasets.SuctionDataset('test')
    else:
        assert affordance == 'pinch'
        dataset_train = grasp_fusion.datasets.PinchDataset(
            'train', augmentation=augmentation, resolution=resolution,
        )
        dataset_test = grasp_fusion.datasets.PinchDataset(
            'test', resolution=resolution,
        )
    channel_names = dataset_train.channel_names
    out_channels = len(channel_names)

    predictor = grasp_fusion.models.FCN8sVGG16Sigmoid(
        out_channels=out_channels, modal=modal,
    )
    model = grasp_fusion.models.FCNSigmoidTrainChain(predictor)

    if comm:
        import chainermn
        if comm.rank != 0:
            dataset_train = None
        dataset_train = chainermn.scatter_dataset(
            dataset_train, comm, shuffle=True
        )

    iter_train = chainer.iterators.SerialIterator(
        chainer.datasets.TransformDataset(
            dataset_train,
            lambda x: transform(x, model=predictor, train=True),
        ),
        batch_size=batch_size,
    )
    iter_test = chainer.iterators.SerialIterator(
        chainer.datasets.TransformDataset(
            dataset_test,
            lambda x: transform(x, model=predictor, train=False),
        ),
        batch_size=1,
        repeat=False,
        shuffle=False,
    )

    return model, iter_train, iter_test, channel_names
Example #13
def scatter_train_and_val_data(train_data, val_data, comm):
    if comm.rank == 0:
        indices = np.arange(len(train_data))
    else:
        indices = None
    indices = chainermn.scatter_dataset(indices, comm, shuffle=True)
    train_data = train_data.slice[indices]

    indices = np.arange(len(val_data))
    indices = indices[comm.rank:indices.size:comm.size]
    val_data = val_data.slice[indices]

    return train_data, val_data
Example #14
def _setup_datasets(config, comm, is_master):
    if is_master:
        if config['dataset_name'] == 'msd_bound':
            train_data = MSDBoundDataset(config, config['train_list_path'])
            validation_data = MSDBoundDataset(config,
                                              config['validation_list_path'])
            test_data = MSDBoundDataset(config, config['test_list_path'])
            validation_data.random_scale = False
            test_data.random_scale = False
            validation_data.shift_intensity = 0
            test_data.shift_intensity = 0
            validation_data.random_flip = False
            test_data.random_flip = False
            validation_data.nb_copies = 1
            test_data.nb_copies = 1
            validation_data.training = False
            test_data.training = False
        else:
            raise ValueError('Unknown dataset_name: {}'.format(
                config['dataset_name']))
        print('Training dataset size: {}'.format(len(train_data)))
        print('Validation dataset size: {}'.format(len(validation_data)))
        print('Test dataset size: {}'.format(len(test_data)))
    else:
        train_data = None
        validation_data = None
        test_data = None

    # scatter dataset
    if comm is not None:
        train_data = chainermn.scatter_dataset(train_data, comm, shuffle=True)
        validation_data = chainermn.scatter_dataset(validation_data,
                                                    comm,
                                                    shuffle=True)
        test_data = chainermn.scatter_dataset(test_data, comm, shuffle=True)

    return train_data, validation_data, test_data
Example #15
    def check_scatter_dataset(self, original_dataset, shuffle=False, root=0):
        if self.communicator.rank != root:
            original_dataset = None
        my_dataset = chainermn.scatter_dataset(original_dataset,
                                               self.communicator,
                                               shuffle=shuffle,
                                               root=root)
        sub_datasets = self.communicator.gather_obj(my_dataset, root=root)

        if self.communicator.rank == root:
            # Test the sizes
            sub_sizes = [len(sub_dataset) for sub_dataset in sub_datasets]
            self.assertEqual(len(set(sub_sizes)), 1)
            sub_size = sub_sizes[0]
            self.assertLessEqual(len(original_dataset),
                                 sub_size * self.mpi_comm.size)
            self.assertGreater(len(original_dataset),
                               (sub_size - 1) * self.mpi_comm.size)

            # Test the content of scattered datasets
            joined_dataset = sum(
                (sub_dataset[:] for sub_dataset in sub_datasets), [])

            # NOTE: The values in `original_dataset` and
            # `joined_dataset` must be casted to int to compare.
            # There are 2 backgrounds on this issue.
            #
            # (1) numpy and cupy/chainerx have different behaviours on
            # 1-element array. Numpy implicitly converts a 1-element array to
            # a scalar value.
            # type(numpy.array([1])[0])
            # =>  <class 'numpy.int64'>  # Scalar
            # type(chainerx.array([1])[0])
            # => <class 'chainerx.ndarray'>  # array of one element
            #
            # (2) Two different ChainerX arrays are never identical in the
            # context of `set()`.
            # set([chainerx.array([0]), chainerx.array([0])])
            # => {array([0], shape=(1,), dtype=int64, device='native:0'),
            #     array([0], shape=(1,), dtype=int64, device='native:0')}

            joined_dataset = [int(e) for e in joined_dataset]
            original_dataset = [int(e) for e in original_dataset]
            self.assertEqual(set(joined_dataset), set(original_dataset))
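A worked example of the size guarantee these assertions encode: every worker receives the same sub_size, with len(original) <= sub_size * comm.size and len(original) > (sub_size - 1) * comm.size, i.e. sub_size = ceil(len(original) / comm.size). A small numeric sketch (hypothetical numbers):

import math

n_samples, n_workers = 10, 4
sub_size = math.ceil(n_samples / n_workers)    # 3 samples per worker
assert n_samples <= sub_size * n_workers       # 10 <= 12
assert n_samples > (sub_size - 1) * n_workers  # 10 > 8
# The two extra slots are filled by repeating elements of the original
# dataset, which is why the test compares sets rather than lists.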
Example #16
    def check_scatter_dataset(self, original_dataset):
        my_dataset = chainermn.scatter_dataset(original_dataset,
                                               self.communicator)
        sub_datasets = self.mpi_comm.gather(my_dataset)

        if self.mpi_comm.rank == 0:
            # Test the sizes
            sub_sizes = [len(sub_dataset) for sub_dataset in sub_datasets]
            self.assertEqual(len(set(sub_sizes)), 1)
            sub_size = sub_sizes[0]
            self.assertLessEqual(len(original_dataset),
                                 sub_size * self.mpi_comm.size)
            self.assertGreater(len(original_dataset),
                               (sub_size - 1) * self.mpi_comm.size)

            # Test the content of scattered datasets
            joined_dataset = sum(
                (sub_dataset[:] for sub_dataset in sub_datasets), [])
            self.assertEqual(set(joined_dataset), set(original_dataset))
Example #17
File: train.py  Project: maeshu/jsk_apc
def get_data(name, batch_size=1, comm=None, extractor='res'):
    if name == 'voc':
        dataset_train = fcn.datasets.SBDClassSeg(split='train')
        dataset_valid = fcn.datasets.VOC2011ClassSeg(split='seg11valid')
    else:
        assert name == 'occlusion'
        dataset_train = OcclusionSegmentationDataset(split='train')
        dataset_valid = OcclusionSegmentationDataset(split='test')
    class_names = dataset_train.class_names

    if comm:
        import chainermn
        if comm.rank != 0:
            dataset_train = None
        dataset_train = chainermn.scatter_dataset(
            dataset_train, comm, shuffle=True
        )

    iter_train = chainer.iterators.SerialIterator(
        chainer.datasets.TransformDataset(
            dataset_train,
            lambda x: transform(x, extractor=extractor),
        ),
        batch_size=batch_size,
    )
    iter_valid_raw = chainer.iterators.SerialIterator(
        chainer.datasets.TransformDataset(dataset_valid, transform_size),
        batch_size=1,
        repeat=False,
        shuffle=False,
    )
    iter_valid = chainer.iterators.SerialIterator(
        chainer.datasets.TransformDataset(
            dataset_valid,
            lambda x: transform(x, extractor=extractor),
        ),
        batch_size=1,
        repeat=False,
        shuffle=False,
    )

    return class_names, iter_train, iter_valid, iter_valid_raw
Example #18
    def check_scatter_dataset(self, original_dataset, shuffle=False, root=0):
        my_dataset = chainermn.scatter_dataset(
            original_dataset, self.communicator,
            shuffle=shuffle, root=root)
        sub_datasets = self.communicator.gather_obj(my_dataset, root=root)

        if self.communicator.rank == root:
            # Test the sizes
            sub_sizes = [len(sub_dataset) for sub_dataset in sub_datasets]
            self.assertEqual(len(set(sub_sizes)), 1)
            sub_size = sub_sizes[0]
            self.assertLessEqual(
                len(original_dataset), sub_size * self.mpi_comm.size)
            self.assertGreater(
                len(original_dataset), (sub_size - 1) * self.mpi_comm.size)

            # Test the content of scattered datasets
            joined_dataset = sum((sub_dataset[:]
                                  for sub_dataset in sub_datasets), [])
            self.assertEqual(set(joined_dataset), set(original_dataset))
Example #19
    def make_dataset(self, stage_int):
        if self.is_master:
            size = 4 * (2**((stage_int + 1) // 2))
            _dataset = MultiDataset(json.load(open(FLAGS.dataset_config, 'r')),
                                    '%dx%d' % (size, size), [[
                                        "resize", {
                                            "probability": 1,
                                            "width": size,
                                            "height": size,
                                            "resample_filter": "ANTIALIAS"
                                        }
                                    ]])
            self.print_log('Add (master) dataset for size {}'.format(size))
        else:
            _dataset = None
            self.print_log('Add (slave) dataset')

        if self.use_mpi:
            _dataset = chainermn.scatter_dataset(_dataset, self.comm)

        return _dataset
Example #20
    def check_scatter_dataset(self, original_dataset, shuffle=False, root=0):
        if self.communicator.rank != root:
            original_dataset = None
        my_dataset = chainermn.scatter_dataset(
            original_dataset, self.communicator,
            shuffle=shuffle, root=root)
        sub_datasets = self.communicator.gather_obj(my_dataset, root=root)

        if self.communicator.rank == root:
            # Test the sizes
            sub_sizes = [len(sub_dataset) for sub_dataset in sub_datasets]
            self.assertEqual(len(set(sub_sizes)), 1)
            sub_size = sub_sizes[0]
            self.assertLessEqual(
                len(original_dataset), sub_size * self.mpi_comm.size)
            self.assertGreater(
                len(original_dataset), (sub_size - 1) * self.mpi_comm.size)

            # Test the content of scattered datasets
            joined_dataset = sum((sub_dataset[:]
                                  for sub_dataset in sub_datasets), [])
            self.assertEqual(set(joined_dataset), set(original_dataset))
Example #21
def read_by_list(comm, files, root, cnt_classes, size, random_crop):
    if comm.rank == 0:
        print("reading file list")
        locations = read_locations(files, root, cnt_classes)
        print("cnt samples global:", len(locations))
    else:
        locations = None
    locations = chainermn.scatter_dataset(locations, comm, shuffle=True)
    #print("cnt samples local:", len(locations))
    mean = np.ones((3, 256, 256), dtype=np.float32) * 128.0
    images = []
    for loc in locations:
        #if not os.path.isfile(loc[0]):
        #print(f"{MPI.Get_processor_name()}, missing {loc[0]}")
        img = read_image(loc[0])
        images.append((img, loc[1]))
    # images = chainer.datasets.LabeledImageDataset(locations, "./")
    ds = PreprocessedDataset(base=images,
                             mean=mean,
                             crop_size=224,
                             random_crop=random_crop)
    #print(f"{MPI.Get_processor_name()}, done")
    #exit(-1)
    return ds
Example #22
def train(args, train_data, test_data, evaluator_type):
    required_args = [
        'dataset',
        'class_names',
        'logs_dir',
        'min_size',
        'max_size',
        'anchor_scales',
    ]
    for arg_key in required_args:
        if not hasattr(args, arg_key):
            raise ValueError(
                'args must contain required key: {}'.format(arg_key)
            )

    assert evaluator_type in ['voc', 'coco'], \
        'Unsupported evaluator_type: {}'.format(evaluator_type)

    if args.multi_node:
        import chainermn

        comm = chainermn.create_communicator('hierarchical')
        device = comm.intra_rank

        args.n_node = comm.inter_size
        args.n_gpu = comm.size
        chainer.cuda.get_device_from_id(device).use()
    else:
        if args.gpu is None:
            print(
                'Option --gpu is required without --multi-node.',
                file=sys.stderr,
            )
            sys.exit(1)
        args.n_node = 1
        args.n_gpu = 1
        chainer.cuda.get_device_from_id(args.gpu).use()
        device = args.gpu

    args.seed = 0
    now = datetime.datetime.now()
    args.timestamp = now.isoformat()
    args.out = osp.join(args.logs_dir, now.strftime('%Y%m%d_%H%M%S'))

    args.batch_size = args.batch_size_per_gpu * args.n_gpu

    # lr: 0.00125 * 8 = 0.01  in original
    args.lr = 0.00125 * args.batch_size
    args.weight_decay = 0.0001

    # lr / 10 at 120k iteration with
    # 160k iteration * 16 batchsize in original
    args.step_size = [
        (120e3 / 180e3) * args.max_epoch,
        (160e3 / 180e3) * args.max_epoch,
    ]

    random.seed(args.seed)
    np.random.seed(args.seed)

    if args.pooling_func == 'align':
        pooling_func = cmr.functions.roi_align_2d
    elif args.pooling_func == 'pooling':
        pooling_func = cmr.functions.roi_pooling_2d
    elif args.pooling_func == 'resize':
        pooling_func = cmr.functions.crop_and_resize
    else:
        raise ValueError(
            'Unsupported pooling_func: {}'.format(args.pooling_func)
        )

    if args.initializer == 'normal':
        mask_initialW = chainer.initializers.Normal(0.01)
    elif args.initializer == 'he_normal':
        mask_initialW = chainer.initializers.HeNormal(fan_option='fan_out')
    else:
        raise ValueError(
            'Unsupported initializer: {}'.format(args.initializer)
        )

    if args.model in ['resnet50', 'resnet101']:
        n_layers = int(args.model.lstrip('resnet'))
        mask_rcnn = cmr.models.MaskRCNNResNet(
            n_layers=n_layers,
            n_fg_class=len(args.class_names),
            pooling_func=pooling_func,
            anchor_scales=args.anchor_scales,
            roi_size=args.roi_size,
            min_size=args.min_size,
            max_size=args.max_size,
            mask_initialW=mask_initialW,
        )
    else:
        raise ValueError('Unsupported model: {}'.format(args.model))
    model = cmr.models.MaskRCNNTrainChain(mask_rcnn)
    if args.multi_node or args.gpu >= 0:
        model.to_gpu()

    optimizer = chainer.optimizers.MomentumSGD(lr=args.lr, momentum=0.9)
    if args.multi_node:
        optimizer = chainermn.create_multi_node_optimizer(optimizer, comm)
    optimizer.setup(model)
    optimizer.add_hook(chainer.optimizer.WeightDecay(rate=args.weight_decay))

    if args.model in ['resnet50', 'resnet101']:
        # ResNetExtractor.freeze_at is not enough to freeze params
        # since WeightDecay updates the param little by little.
        mask_rcnn.extractor.conv1.disable_update()
        mask_rcnn.extractor.bn1.disable_update()
        mask_rcnn.extractor.res2.disable_update()
        for link in mask_rcnn.links():
            if isinstance(link, cmr.links.AffineChannel2D):
                link.disable_update()

    train_data = chainer.datasets.TransformDataset(
        train_data,
        cmr.datasets.MaskRCNNTransform(mask_rcnn),
    )
    test_data = chainer.datasets.TransformDataset(
        test_data,
        cmr.datasets.MaskRCNNTransform(mask_rcnn, train=False),
    )
    if args.multi_node:
        if comm.rank != 0:
            train_data = None
            test_data = None
        train_data = chainermn.scatter_dataset(train_data, comm, shuffle=True)
        test_data = chainermn.scatter_dataset(test_data, comm)

    # FIXME: MultiProcessIterator sometimes hangs
    train_iter = chainer.iterators.SerialIterator(
        train_data,
        batch_size=args.batch_size_per_gpu,
    )
    test_iter = chainer.iterators.SerialIterator(
        test_data,
        batch_size=args.batch_size_per_gpu,
        repeat=False,
        shuffle=False,
    )

    converter = functools.partial(
        cmr.datasets.concat_examples,
        padding=0,
        # img, bboxes, labels, masks, scales
        indices_concat=[0, 2, 3, 4],  # img, _, labels, masks, scales
        indices_to_device=[0, 1],  # img, bbox
    )
    updater = chainer.training.updater.StandardUpdater(
        train_iter,
        optimizer,
        device=device,
        converter=converter,
    )

    trainer = training.Trainer(
        updater,
        (args.max_epoch, 'epoch'),
        out=args.out,
    )

    trainer.extend(
        extensions.ExponentialShift('lr', 0.1),
        trigger=training.triggers.ManualScheduleTrigger(
            args.step_size,
            'epoch',
        ),
    )

    eval_interval = 1, 'epoch'
    log_interval = 20, 'iteration'
    plot_interval = 0.1, 'epoch'
    print_interval = 20, 'iteration'

    if evaluator_type == 'voc':
        evaluator = cmr.extensions.InstanceSegmentationVOCEvaluator(
            test_iter,
            model.mask_rcnn,
            device=device,
            use_07_metric=True,
            label_names=args.class_names,
        )
    elif evaluator_type == 'coco':
        evaluator = cmr.extensions.InstanceSegmentationCOCOEvaluator(
            test_iter,
            model.mask_rcnn,
            device=device,
            label_names=args.class_names,
        )
    else:
        raise ValueError(
            'Unsupported evaluator_type: {}'.format(evaluator_type)
        )
    if args.multi_node:
        evaluator = chainermn.create_multi_node_evaluator(evaluator, comm)
    trainer.extend(evaluator, trigger=eval_interval)

    if not args.multi_node or comm.rank == 0:
        # Save snapshot.
        trainer.extend(
            extensions.snapshot_object(model.mask_rcnn, 'snapshot_model.npz'),
            trigger=training.triggers.MaxValueTrigger(
                'validation/main/map',
                eval_interval,
            ),
        )

        # Dump params.yaml.
        args.git_hash = cmr.utils.git_hash()
        args.hostname = socket.gethostname()
        trainer.extend(fcn.extensions.ParamsReport(args.__dict__))

        # Visualization.
        trainer.extend(
            cmr.extensions.InstanceSegmentationVisReport(
                test_iter,
                model.mask_rcnn,
                label_names=args.class_names,
            ),
            trigger=eval_interval,
        )

        # Logging.
        trainer.extend(
            chainer.training.extensions.observe_lr(),
            trigger=log_interval,
        )
        trainer.extend(extensions.LogReport(trigger=log_interval))
        trainer.extend(
            extensions.PrintReport(
                [
                    'iteration',
                    'epoch',
                    'elapsed_time',
                    'lr',
                    'main/loss',
                    'main/roi_loc_loss',
                    'main/roi_cls_loss',
                    'main/roi_mask_loss',
                    'main/rpn_loc_loss',
                    'main/rpn_cls_loss',
                    'validation/main/map',
                ],
            ),
            trigger=print_interval,
        )
        trainer.extend(extensions.ProgressBar(update_interval=10))

        # Plot.
        assert extensions.PlotReport.available()
        trainer.extend(
            extensions.PlotReport(
                [
                    'main/loss',
                    'main/roi_loc_loss',
                    'main/roi_cls_loss',
                    'main/roi_mask_loss',
                    'main/rpn_loc_loss',
                    'main/rpn_cls_loss',
                ],
                file_name='loss.png',
                trigger=plot_interval,
            ),
            trigger=plot_interval,
        )
        trainer.extend(
            extensions.PlotReport(
                ['validation/main/map'],
                file_name='accuracy.png',
                trigger=plot_interval,
            ),
            trigger=eval_interval,
        )

        trainer.extend(extensions.dump_graph('main/loss'))

    trainer.run()
Example #23
def main():
    parser = argparse.ArgumentParser(description='''\
ChainerMN example: MNIST with automatic checkpoints enabled''')
    parser.add_argument('--batchsize',
                        '-b',
                        type=int,
                        default=100,
                        help='Number of images in each mini-batch')
    parser.add_argument('--communicator',
                        type=str,
                        default='hierarchical',
                        help='Type of communicator')
    parser.add_argument('--epoch',
                        '-e',
                        type=int,
                        default=20,
                        help='Number of sweeps over the dataset to train')
    parser.add_argument('--gpu', '-g', action='store_true', help='Use GPU')
    parser.add_argument('--out',
                        '-o',
                        default='result',
                        help='Directory to output the result')
    parser.add_argument('--unit',
                        '-u',
                        type=int,
                        default=1000,
                        help='Number of units')
    parser.add_argument('--run-id',
                        type=str,
                        default='train-mnist-example',
                        help='ID of the task name')
    args = parser.parse_args()

    # Prepare ChainerMN communicator.

    if args.gpu:
        if args.communicator == 'naive':
            print("Error: 'naive' communicator does not support GPU.\n")
            exit(-1)
        comm = chainermn.create_communicator(args.communicator)
        device = comm.intra_rank
    else:
        if args.communicator != 'naive':
            print('Warning: using naive communicator '
                  'because only naive supports CPU-only execution')
        comm = chainermn.create_communicator('naive')
        device = -1

    if comm.rank == 0:
        print('==========================================')
        print('Num process (COMM_WORLD): {}'.format(comm.size))
        if args.gpu:
            print('Using GPUs')
        print('Using {} communicator'.format(args.communicator))
        print('Num unit: {}'.format(args.unit))
        print('Num Minibatch-size: {}'.format(args.batchsize))
        print('Num epoch: {}'.format(args.epoch))
        print('==========================================')

    model = L.Classifier(MLP(args.unit, 10))
    if device >= 0:
        chainer.cuda.get_device_from_id(device).use()
        model.to_gpu()

    # Create a multi node optimizer from a standard Chainer optimizer.
    optimizer = chainermn.create_multi_node_optimizer(
        chainer.optimizers.Adam(), comm)
    optimizer.setup(model)

    # Split and distribute the dataset. Only worker 0 loads the whole dataset.
    # Datasets of worker 0 are evenly split and distributed to all workers.
    if comm.rank == 0:
        train, test = chainer.datasets.get_mnist()
    else:
        train, test = None, None
    train = chainermn.scatter_dataset(train, comm, shuffle=True)
    test = chainermn.scatter_dataset(test, comm, shuffle=True)

    train_iter = chainer.iterators.SerialIterator(train, args.batchsize)
    test_iter = chainer.iterators.SerialIterator(test,
                                                 args.batchsize,
                                                 repeat=False,
                                                 shuffle=False)

    updater = training.StandardUpdater(train_iter, optimizer, device=device)
    trainer = training.Trainer(updater, (args.epoch, 'epoch'), out=args.out)

    # Enable checkpointer and recover from checkpoint if any checkpoint exists
    checkpointer = create_multi_node_checkpointer(name=args.run_id, comm=comm)
    checkpointer.maybe_load(trainer, optimizer)
    print("Rank", comm.rank, ": (Re)Starting from (epoch, iter) =",
          (trainer.updater.epoch, trainer.updater.iteration))
    trainer.extend(checkpointer, trigger=(1000, 'iteration'))

    # Create a multi node evaluator from a standard Chainer evaluator.
    evaluator = extensions.Evaluator(test_iter, model, device=device)
    evaluator = chainermn.create_multi_node_evaluator(evaluator, comm)
    trainer.extend(evaluator)

    # Some display and output extensions are necessary only for one worker.
    # (Otherwise, there would just be repeated outputs.)
    if comm.rank == 0:
        trainer.extend(extensions.DumpGraph('main/loss'))
        trainer.extend(extensions.LogReport())
        trainer.extend(
            extensions.PrintReport([
                'epoch', 'main/loss', 'validation/main/loss', 'main/accuracy',
                'validation/main/accuracy', 'elapsed_time'
            ]))
        trainer.extend(extensions.ProgressBar())

    trainer.run()
Example #24
def main():
    parser = argparse.ArgumentParser(description='Chainer example: seq2seq')
    parser.add_argument('--batchsize', '-b', type=int, default=64,
                        help='Number of images in each mini-batch')
    parser.add_argument('--bleu', action="store_true", default=False,
                        help='Report BLEU score')
    parser.add_argument('--gpu', '-g', action='store_true',
                        help='Use GPU')
    parser.add_argument('--cache', '-c', default=None,
                        help='Directory to cache pre-processed dataset')
    parser.add_argument('--resume', '-r', default='',
                        help='Resume the training from snapshot')
    parser.add_argument('--unit', '-u', type=int, default=1024,
                        help='Number of units')
    parser.add_argument('--communicator', default='hierarchical',
                        help="Type of communicator")
    parser.add_argument('--stop', '-s', type=str, default="15e",
                        help='Stop trigger (ex. "500i", "15e")')
    parser.add_argument('--input', '-i', type=str, default='wmt',
                        help='Input directory')
    parser.add_argument('--optimizer', type=str, default="adam()",
                        help="Optimizer and its argument")
    parser.add_argument('--out', '-o', default='result',
                        help='Directory to output the result')
    args = parser.parse_args()

    # Prepare ChainerMN communicator
    if args.gpu:
        comm = chainermn.create_communicator('hierarchical')
        dev = comm.intra_rank
    else:
        comm = chainermn.create_communicator('naive')
        dev = -1

    if comm.mpi_comm.rank == 0:
        print('==========================================')
        print('Num process (COMM_WORLD): {}'.format(MPI.COMM_WORLD.Get_size()))
        if args.gpu:
            print('Using GPUs')
        print('Using {} communicator'.format(args.communicator))
        print('Num unit: {}'.format(args.unit))
        print('Num Minibatch-size: {}'.format(args.batchsize))
        print('==========================================')

    # Rank 0 prepares all data
    if comm.rank == 0:
        if args.cache and not os.path.exists(args.cache):
            os.mkdir(args.cache)

        # Read source data
        bt = time.time()
        if args.cache:
            cache_file = os.path.join(args.cache, 'source.pickle')
            source_vocab, source_data = cached_call(cache_file,
                                                    read_source,
                                                    args.input, args.cache)
        else:
            source_vocab, source_data = read_source(args.input, args.cache)
        et = time.time()
        print("RD source done. {:.3f} [s]".format(et - bt))
        sys.stdout.flush()

        # Read target data
        bt = time.time()
        if args.cache:
            cache_file = os.path.join(args.cache, 'target.pickle')
            target_vocab, target_data = cached_call(cache_file,
                                                    read_target,
                                                    args.input, args.cache)
        else:
            target_vocab, target_data = read_target(args.input, args.cache)
        et = time.time()
        print("RD target done. {:.3f} [s]".format(et - bt))
        sys.stdout.flush()

        print('Original training data size: %d' % len(source_data))
        train_data = [(s, t)
                      for s, t in six.moves.zip(source_data, target_data)
                      if 0 < len(s) < 50 and 0 < len(t) < 50]
        print('Filtered training data size: %d' % len(train_data))

        en_path = os.path.join(args.input, 'dev', 'newstest2013.en')
        source_data = europal.make_dataset(en_path, source_vocab)
        fr_path = os.path.join(args.input, 'dev', 'newstest2013.fr')
        target_data = europal.make_dataset(fr_path, target_vocab)
        assert(len(source_data) == len(target_data))
        test_data = [(s, t) for s, t
                     in six.moves.zip(source_data, target_data)
                     if 0 < len(s) and 0 < len(t)]

        source_ids = {word: index
                      for index, word in enumerate(source_vocab)}
        target_ids = {word: index
                      for index, word in enumerate(target_vocab)}
    else:
        # target_data, source_data = None, None
        train_data, test_data = None, None
        target_ids, source_ids = None, None

    # Print GPU id
    for i in range(0, comm.size):
        if comm.rank == i:
            print("Rank {} GPU: {}".format(comm.rank, dev))
        sys.stdout.flush()
        comm.mpi_comm.Barrier()

    # Broadcast the word -> id dictionaries.
    source_ids = comm.mpi_comm.bcast(source_ids, root=0)
    target_ids = comm.mpi_comm.bcast(target_ids, root=0)

    target_words = {i: w for w, i in target_ids.items()}
    source_words = {i: w for w, i in source_ids.items()}

    if comm.rank == 0:
        print("target_words : {}".format(len(target_words)))
        print("source_words : {}".format(len(source_words)))

    model = Seq2seq(3, len(source_ids), len(target_ids), args.unit)

    if dev >= 0:
        chainer.cuda.get_device(dev).use()
        model.to_gpu(dev)

    # determine the stop trigger
    m = re.match(r'^(\d+)e$', args.stop)
    if m:
        trigger = (int(m.group(1)), 'epoch')
    else:
        m = re.match(r'^(\d+)i$', args.stop)
        if m:
            trigger = (int(m.group(1)), 'iteration')
        else:
            if comm.rank == 0:
                sys.stderr.write("Error: unknown stop trigger: {}".format(
                    args.stop))
            exit(-1)

    if comm.rank == 0:
        print("Trigger: {}".format(trigger))

    optimizer = chainermn.create_multi_node_optimizer(
        create_optimizer(args.optimizer), comm)
    optimizer.setup(model)

    # Split and distribute the datasets to all workers.
    train_data = chainermn.scatter_dataset(train_data, comm)

    test_data = chainermn.scatter_dataset(test_data, comm)

    train_iter = chainer.iterators.SerialIterator(train_data,
                                                  args.batchsize,
                                                  shuffle=False)
    updater = training.StandardUpdater(
        train_iter, optimizer, converter=convert, device=dev)
    trainer = training.Trainer(updater,
                               trigger,
                               out=args.out)

    trainer.extend(chainermn.create_multi_node_evaluator(
        BleuEvaluator(model, test_data, device=dev, comm=comm),
        comm))

    def translate_one(source, target):
        words = europal.split_sentence(source)
        print('# source : ' + ' '.join(words))
        x = model.xp.array(
            [source_ids.get(w, 1) for w in words], 'i')
        ys = model.translate([x])[0]
        words = [target_words[y] for y in ys]
        print('#  result : ' + ' '.join(words))
        print('#  expect : ' + target)

    # @chainer.training.make_extension(trigger=(200, 'iteration'))
    def translate(trainer):
        translate_one(
            'Who are we ?',
            'Qui sommes-nous?')
        translate_one(
            'And it often costs over a hundred dollars ' +
            'to obtain the required identity card .',
            'Or, il en coûte souvent plus de cent dollars ' +
            'pour obtenir la carte d\'identité requise.')

        source, target = test_data[numpy.random.choice(len(test_data))]
        source = ' '.join([source_words.get(i, '') for i in source])
        target = ' '.join([target_words.get(i, '') for i in target])
        translate_one(source, target)

    if comm.rank == 0:
        trainer.extend(extensions.LogReport(trigger=(1, 'epoch')),
                       trigger=(1, 'epoch'))

        report = extensions.PrintReport(['epoch',
                                         'iteration',
                                         'main/loss',
                                         'main/perp',
                                         'validation/main/bleu',
                                         'elapsed_time'])
        trainer.extend(report, trigger=(1, 'epoch'))

    comm.mpi_comm.Barrier()
    if comm.rank == 0:
        print('start training')
        sys.stdout.flush()

    trainer.run()
Example #25
def main():
    parser = argparse.ArgumentParser(description='ChainerMN example: MNIST')
    parser.add_argument('--batchsize', '-b', type=int, default=100,
                        help='Number of images in each mini-batch')
    parser.add_argument('--communicator', type=str,
                        default='hierarchical', help='Type of communicator')
    parser.add_argument('--epoch', '-e', type=int, default=20,
                        help='Number of sweeps over the dataset to train')
    parser.add_argument('--gpu', '-g', action='store_true',
                        help='Use GPU')
    parser.add_argument('--out', '-o', default='result',
                        help='Directory to output the result')
    parser.add_argument('--resume', '-r', default='',
                        help='Resume the training from snapshot')
    parser.add_argument('--unit', '-u', type=int, default=1000,
                        help='Number of units')
    args = parser.parse_args()

    # Prepare ChainerMN communicator.

    if args.gpu:
        if args.communicator == 'naive':
            print("Error: 'naive' communicator does not support GPU.\n")
            exit(-1)
        comm = chainermn.create_communicator(args.communicator)
        device = comm.intra_rank
    else:
        if args.communicator != 'naive':
            print('Warning: using naive communicator '
                  'because only naive supports CPU-only execution')
        comm = chainermn.create_communicator('naive')
        device = -1

    if comm.rank == 0:
        print('==========================================')
        print('Num process (COMM_WORLD): {}'.format(comm.size))
        if args.gpu:
            print('Using GPUs')
        print('Using {} communicator'.format(args.communicator))
        print('Num unit: {}'.format(args.unit))
        print('Num Minibatch-size: {}'.format(args.batchsize))
        print('Num epoch: {}'.format(args.epoch))
        print('==========================================')

    model = L.Classifier(MLP(args.unit, 10))
    if device >= 0:
        chainer.cuda.get_device_from_id(device).use()
        model.to_gpu()

    # Create a multi node optimizer from a standard Chainer optimizer.
    optimizer = chainermn.create_multi_node_optimizer(
        chainer.optimizers.Adam(), comm)
    optimizer.setup(model)

    # Split and distribute the dataset. Only worker 0 loads the whole dataset.
    # Datasets of worker 0 are evenly split and distributed to all workers.
    if comm.rank == 0:
        train, test = chainer.datasets.get_mnist()
    else:
        train, test = None, None
    train = chainermn.scatter_dataset(train, comm, shuffle=True)
    test = chainermn.scatter_dataset(test, comm, shuffle=True)

    train_iter = chainer.iterators.SerialIterator(train, args.batchsize)
    test_iter = chainer.iterators.SerialIterator(test, args.batchsize,
                                                 repeat=False, shuffle=False)

    updater = training.StandardUpdater(train_iter, optimizer, device=device)
    trainer = training.Trainer(updater, (args.epoch, 'epoch'), out=args.out)

    # Create a multi node evaluator from a standard Chainer evaluator.
    evaluator = extensions.Evaluator(test_iter, model, device=device)
    evaluator = chainermn.create_multi_node_evaluator(evaluator, comm)
    trainer.extend(evaluator)

    # Some display and output extensions are necessary only for one worker.
    # (Otherwise, there would just be repeated outputs.)
    if comm.rank == 0:
        trainer.extend(extensions.dump_graph('main/loss'))
        trainer.extend(extensions.LogReport())
        trainer.extend(extensions.PrintReport(
            ['epoch', 'main/loss', 'validation/main/loss',
             'main/accuracy', 'validation/main/accuracy', 'elapsed_time']))
        trainer.extend(extensions.ProgressBar())

    if args.resume:
        chainer.serializers.load_npz(args.resume, trainer)

    trainer.run()
Example #26
def scatter_large_data(communicator):
    data = []
    if communicator.rank == 0:
        data = ["test"] * 2000000000
    data = chainermn.scatter_dataset(data, communicator)
    assert len(data) > 0
Example #27
def main():
    parser = argparse.ArgumentParser()
    parser.add_argument('--model',
                        choices=('ssd300', 'ssd512'),
                        default='ssd300')
    parser.add_argument('--batchsize', type=int, default=32)
    parser.add_argument('--lr', type=float, default=1e-3)
    parser.add_argument('--out', default='result')
    parser.add_argument('--resume')
    args = parser.parse_args()

    comm = chainermn.create_communicator()
    device = comm.intra_rank

    if args.model == 'ssd300':
        model = SSD300(n_fg_class=len(epic_kitchens_bbox_category_names),
                       pretrained_model='imagenet')
    elif args.model == 'ssd512':
        model = SSD512(n_fg_class=len(epic_kitchens_bbox_category_names),
                       pretrained_model='imagenet')

    model.use_preset('evaluate')
    train_chain = MultiboxTrainChain(model)
    chainer.cuda.get_device_from_id(device).use()
    model.to_gpu()

    train = EpicKitchensBboxDataset(year='2018', split='train')
    if comm.rank == 0:
        indices = np.arange(len(train))
    else:
        indices = None
    train = TransformDataset(train, ('img', 'mb_loc', 'mb_label'),
                             Transform(model.coder, model.insize, model.mean))

    indices = chainermn.scatter_dataset(indices, comm, shuffle=True)
    train = train.slice[indices]

    # http://chainermn.readthedocs.io/en/latest/tutorial/tips_faqs.html#using-multiprocessiterator
    if hasattr(multiprocessing, 'set_start_method'):
        multiprocessing.set_start_method('forkserver')
    train_iter = chainer.iterators.MultiprocessIterator(train,
                                                        args.batchsize,
                                                        n_processes=2)

    optimizer = chainermn.create_multi_node_optimizer(
        chainer.optimizers.MomentumSGD(), comm)
    optimizer.setup(train_chain)
    for param in train_chain.params():
        if param.name == 'b':
            param.update_rule.add_hook(GradientScaling(2))
        else:
            param.update_rule.add_hook(WeightDecay(0.0005))

    updater = training.updaters.StandardUpdater(train_iter,
                                                optimizer,
                                                device=device)
    trainer = training.Trainer(updater, (18, 'epoch'), args.out)
    trainer.extend(extensions.ExponentialShift('lr', 0.1, init=args.lr),
                   trigger=triggers.ManualScheduleTrigger([12, 15], 'epoch'))

    if comm.rank == 0:
        log_interval = 10, 'iteration'
        trainer.extend(
            extensions.LogReport(log_name='log.json', trigger=log_interval))
        trainer.extend(extensions.observe_lr(), trigger=log_interval)
        trainer.extend(extensions.PrintReport([
            'epoch', 'iteration', 'lr', 'main/loss', 'main/loss/loc',
            'main/loss/conf'
        ]),
                       trigger=log_interval)
        trainer.extend(extensions.ProgressBar(update_interval=1))

        trainer.extend(extensions.snapshot_object(
            model, 'model_iter_{.updater.iteration}.npz'),
                       trigger=(1, 'epoch'))

    if args.resume:
        serializers.load_npz(args.resume, trainer)

    trainer.run()
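
Example #27 scatters an array of indices rather than the dataset itself and then takes train.slice[indices]. Only the integer indices are pickled and sent between processes, while every worker reads the actual images lazily from disk. A sketch of the pattern, under the assumption that train is a ChainerCV sliceable dataset constructible on every rank:

# Sketch of the "scatter indices, not data" pattern used in examples #27/#28.
import numpy as np
import chainermn

def scatter_by_indices(train, comm):
    # Rank 0 decides the ordering; the other ranks contribute nothing.
    indices = np.arange(len(train)) if comm.rank == 0 else None
    # Each rank receives a disjoint, shuffled chunk of the indices.
    indices = chainermn.scatter_dataset(indices, comm, shuffle=True)
    # slice[] only records the indices; no image is loaded or copied here.
    return train.slice[indices]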
예제 #28
0
def main():
    parser = argparse.ArgumentParser()
    parser.add_argument('--model',
                        choices=('ssd300', 'ssd512'),
                        default='ssd300')
    parser.add_argument('--batchsize', type=int, default=32)
    parser.add_argument('--test-batchsize', type=int, default=16)
    parser.add_argument('--iteration', type=int, default=120000)
    parser.add_argument('--step', type=int, nargs='*', default=[80000, 100000])
    parser.add_argument('--out', default='result')
    parser.add_argument('--resume')
    args = parser.parse_args()

    comm = chainermn.create_communicator()
    device = comm.intra_rank

    if args.model == 'ssd300':
        model = SSD300(n_fg_class=len(voc_bbox_label_names),
                       pretrained_model='imagenet')
    elif args.model == 'ssd512':
        model = SSD512(n_fg_class=len(voc_bbox_label_names),
                       pretrained_model='imagenet')

    model.use_preset('evaluate')
    train_chain = MultiboxTrainChain(model)
    chainer.cuda.get_device_from_id(device).use()
    model.to_gpu()

    train = TransformDataset(
        ConcatenatedDataset(VOCBboxDataset(year='2007', split='trainval'),
                            VOCBboxDataset(year='2012', split='trainval')),
        ('img', 'mb_loc', 'mb_label'),
        Transform(model.coder, model.insize, model.mean))

    if comm.rank == 0:
        indices = np.arange(len(train))
    else:
        indices = None
    indices = chainermn.scatter_dataset(indices, comm, shuffle=True)
    train = train.slice[indices]

    # http://chainermn.readthedocs.io/en/latest/tutorial/tips_faqs.html#using-multiprocessiterator
    if hasattr(multiprocessing, 'set_start_method'):
        multiprocessing.set_start_method('forkserver')
    train_iter = chainer.iterators.MultiprocessIterator(train,
                                                        args.batchsize //
                                                        comm.size,
                                                        n_processes=2)

    if comm.rank == 0:
        test = VOCBboxDataset(year='2007',
                              split='test',
                              use_difficult=True,
                              return_difficult=True)
        test_iter = chainer.iterators.SerialIterator(test,
                                                     args.test_batchsize,
                                                     repeat=False,
                                                     shuffle=False)

    # initial lr is set to 1e-3 by ExponentialShift
    optimizer = chainermn.create_multi_node_optimizer(
        chainer.optimizers.MomentumSGD(), comm)
    optimizer.setup(train_chain)
    for param in train_chain.params():
        if param.name == 'b':
            param.update_rule.add_hook(GradientScaling(2))
        else:
            param.update_rule.add_hook(WeightDecay(0.0005))

    updater = training.updaters.StandardUpdater(train_iter,
                                                optimizer,
                                                device=device)
    trainer = training.Trainer(updater, (args.iteration, 'iteration'),
                               args.out)
    trainer.extend(extensions.ExponentialShift('lr', 0.1, init=1e-3),
                   trigger=triggers.ManualScheduleTrigger(
                       args.step, 'iteration'))

    if comm.rank == 0:
        trainer.extend(DetectionVOCEvaluator(test_iter,
                                             model,
                                             use_07_metric=True,
                                             label_names=voc_bbox_label_names),
                       trigger=triggers.ManualScheduleTrigger(
                           args.step + [args.iteration], 'iteration'))

        log_interval = 10, 'iteration'
        trainer.extend(extensions.LogReport(trigger=log_interval))
        trainer.extend(extensions.observe_lr(), trigger=log_interval)
        trainer.extend(extensions.PrintReport([
            'epoch', 'iteration', 'lr', 'main/loss', 'main/loss/loc',
            'main/loss/conf', 'validation/main/map'
        ]),
                       trigger=log_interval)
        trainer.extend(extensions.ProgressBar(update_interval=10))

        trainer.extend(extensions.snapshot(),
                       trigger=triggers.ManualScheduleTrigger(
                           args.step + [args.iteration], 'iteration'))
        trainer.extend(extensions.snapshot_object(
            model, 'model_iter_{.updater.iteration}'),
                       trigger=(args.iteration, 'iteration'))

    if args.resume:
        serializers.load_npz(args.resume, trainer)

    trainer.run()
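
Note the difference in iterator batch size between the two SSD examples: #27 passes args.batchsize to every worker, so the effective global batch grows with the number of processes, while #28 passes args.batchsize // comm.size, keeping the global batch fixed at --batchsize. A small illustration of the arithmetic (the worker count is an assumption, not from the original):

# Effective batch sizes under the two conventions, assuming 8 workers.
batchsize, comm_size = 32, 8

per_worker = batchsize // comm_size    # example #28 style: 4 images per worker
global_batch = per_worker * comm_size  # stays at 32 regardless of comm_size

per_worker = batchsize                 # example #27 style: 32 images per worker
global_batch = per_worker * comm_size  # grows to 256 with 8 workers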
예제 #29
0
def main(argv):

    if len(argv) < 5:
        print("python " + argv[0] + " data_path out_path layer epoch")
        sys.exit(0)

    data_path = argv[1]
    out_path = argv[2]
    layer = int(argv[3])
    epoch = int(argv[4])

    x_file = os.path.join(data_path, "en.txt")
    y_file = os.path.join(data_path, "ja.txt")
    vocab_path = os.path.join(data_path, "vocab.dump")

    # Dictionary mapping words to ids
    with open(vocab_path, "rb") as f:
        vocab = pickle.load(f)

    train_data1 = load_data(x_file, vocab)
    train_data2 = load_data(y_file, vocab)

    eos_id = vocab["<eos>"]
    batch_size = 256
    demb = 256
    drop_out = 0.5
    model = gAtt(layer, len(vocab) + 1, demb, drop_out)

    comm = chainermn.create_communicator("single_node")
    device = comm.intra_rank
    chainer.cuda.get_device(device).use()
    model.to_gpu(device)
    optimizer = chainermn.create_multi_node_optimizer(
        chainer.optimizers.Adam(), comm)
    optimizer.setup(model)

    if comm.rank == 0:
        # Create the output directory and log file
        date = datetime.datetime.today()
        folder_name = "_".join(
            [str(date.year), str(date.month),
             str(date.day)])
        out_path = (os.path.join(out_path, folder_name, "".join(
            ["layer", str(layer)])) + os.sep)
        os.makedirs(os.path.dirname(out_path), exist_ok=True)

        loss_out_path = os.path.join(
            out_path, "".join(["loss_",
                               str(epoch), "_",
                               str(layer), ".csv"]))
        loss_out = open(loss_out_path, "w")
        print(
            "epoch:",
            epoch,
            " batch:",
            batch_size,
            " drop:",
            drop_out,
            " demb:",
            demb,
            " layer:",
            layer,
            end="\n",
            file=loss_out,
        )

    xs = []
    ys = []
    s = []
    # Build the x (source) sequences
    if comm.rank == 0:
        for pos in range(len(train_data1)):
            id = train_data1[pos]
            if id != eos_id:
                s += [id]
            else:
                xs += [xp.asarray(s, dtype=xp.int32)]
                s = []
        # Build the y (target) sequences
        for pos in range(len(train_data2)):
            id = train_data2[pos]
            if id != eos_id:
                s += [id]
            else:
                ys += [xp.asarray(s, dtype=xp.int32)]
                s = []
    # Distribute the data to all workers
    xs = chainermn.scatter_dataset(xs, comm)
    ys = chainermn.scatter_dataset(ys, comm)

    loss = None
    for cnt in range(epoch):
        index = np.random.permutation(len(xs))
        for pos in range(0, len(xs), batch_size):
            # Build a mini-batch
            batch_xs = []
            batch_ys = []
            for idx in index[pos:pos + (batch_size)]:
                batch_xs.append(xs[idx])
                batch_ys.append(ys[idx])
            model.cleargrads()
            # Create the initial hidden and cell states
            hx = chainer.Variable(
                xp.zeros((2 * layer, len(batch_xs), demb), dtype=xp.float32))
            cx = chainer.Variable(
                xp.zeros((2 * layer, len(batch_xs), demb), dtype=xp.float32))
            # Forward, backward, and update
            loss = model(hx, cx, batch_xs, batch_ys, len(batch_xs), vocab)
            loss.backward()
            optimizer.update()
            print(cnt + 1, " : ", pos + len(batch_xs), "/", len(xs),
                  " finished")
        if comm.rank == 0:
            print(loss.array, end="\n", file=loss_out)
            out_file = out_path + "nsteplstm-" + str(layer) + "-" + str(
                cnt) + ".model"
            model.to_cpu()
            serializers.save_npz(out_file, model)
            model.to_gpu(0)

예제 #30
0
def check_mnist(gpu, display_log=True):
    epoch = 5
    batchsize = 100
    n_units = 100

    comm = chainermn.create_communicator('naive')
    if gpu:
        device = comm.intra_rank
        chainer.cuda.get_device(device).use()
    else:
        device = -1

    model = L.Classifier(MLP(n_units, 10))
    if gpu:
        model.to_gpu()

    optimizer = chainermn.create_multi_node_optimizer(
        chainer.optimizers.Adam(), comm)
    optimizer.setup(model)

    if comm.rank == 0:
        train, test = chainer.datasets.get_mnist()
    else:
        train, test = None, None

    train = chainermn.scatter_dataset(train, comm, shuffle=True)
    test = chainermn.scatter_dataset(test, comm, shuffle=True)

    train_iter = chainer.iterators.SerialIterator(train, batchsize)
    test_iter = chainer.iterators.SerialIterator(test,
                                                 batchsize,
                                                 repeat=False,
                                                 shuffle=False)

    updater = training.StandardUpdater(train_iter, optimizer, device=device)

    trainer = training.Trainer(updater, (epoch, 'epoch'))

    # Wrap standard Chainer evaluators by MultiNodeEvaluator.
    evaluator = extensions.Evaluator(test_iter, model, device=device)
    evaluator = chainermn.create_multi_node_evaluator(evaluator, comm)
    trainer.extend(evaluator)

    # Add checkpointer. This is just to check checkpointing runs
    # without errors
    path = tempfile.mkdtemp(dir='/tmp', prefix=__name__ + "-tmp-")
    checkpointer = create_multi_node_checkpointer(name=__name__,
                                                  comm=comm,
                                                  path=path)
    trainer.extend(checkpointer, trigger=(1, 'epoch'))

    # Some display and output extensions are necessary only for one worker.
    # (Otherwise, there would just be repeated outputs.)
    if comm.rank == 0 and display_log:
        trainer.extend(extensions.LogReport(trigger=(1, 'epoch')),
                       trigger=(1, 'epoch'))
        trainer.extend(extensions.PrintReport([
            'epoch', 'main/loss', 'validation/main/loss', 'main/accuracy',
            'validation/main/accuracy', 'elapsed_time'
        ],
                                              out=sys.stderr),
                       trigger=(1, 'epoch'))
    trainer.run()

    acc = evaluator()['validation/main/accuracy']
    assert acc > 0.95

    # Check checkpointer successfully finalized snapshot directory
    assert [] == os.listdir(path)
    os.removedirs(path)
예제 #31
0
def main():
    parser = argparse.ArgumentParser(
        description='ChainerMN example: pipelined neural network')
    parser.add_argument('--batchsize', '-b', type=int, default=100,
                        help='Number of images in each mini-batch')
    parser.add_argument('--epoch', '-e', type=int, default=20,
                        help='Number of sweeps over the dataset to train')
    parser.add_argument('--gpu', '-g', action='store_true',
                        help='Use GPU')
    parser.add_argument('--out', '-o', default='result',
                        help='Directory to output the result')
    parser.add_argument('--unit', '-u', type=int, default=1000,
                        help='Number of units')
    args = parser.parse_args()

    # Prepare ChainerMN communicator.
    if args.gpu:
        comm = chainermn.create_communicator('hierarchical')
        data_axis, model_axis = comm.rank % 2, comm.rank // 2
        data_comm = comm.split(data_axis, comm.rank)
        model_comm = comm.split(model_axis, comm.rank)
        device = comm.intra_rank
    else:
        comm = chainermn.create_communicator('naive')
        data_axis, model_axis = comm.rank % 2, comm.rank // 2
        data_comm = comm.split(data_axis, comm.rank)
        model_comm = comm.split(model_axis, comm.rank)
        device = -1

    if model_comm.size != 2:
        raise ValueError(
            'This example can only be executed with an even number '
            'of processes.')

    if comm.rank == 0:
        print('==========================================')
        if args.gpu:
            print('Using GPUs')
        print('Num unit: {}'.format(args.unit))
        print('Num Minibatch-size: {}'.format(args.batchsize))
        print('Num epoch: {}'.format(args.epoch))
        print('==========================================')

    if data_axis == 0:
        model = L.Classifier(MLP0(model_comm, args.unit))
    elif data_axis == 1:
        model = MLP1(model_comm, args.unit, 10)

    if device >= 0:
        chainer.cuda.get_device_from_id(device).use()
        model.to_gpu()

    optimizer = chainermn.create_multi_node_optimizer(
        chainer.optimizers.Adam(), data_comm)
    optimizer.setup(model)

    # Original dataset on worker 0 and 1.
    # Datasets of worker 0 and 1 are split and distributed to all workers.
    if model_axis == 0:
        train, test = chainer.datasets.get_mnist()
        if data_axis == 1:
            train = chainermn.datasets.create_empty_dataset(train)
            test = chainermn.datasets.create_empty_dataset(test)
    else:
        train, test = None, None
    train = chainermn.scatter_dataset(train, data_comm, shuffle=True)
    test = chainermn.scatter_dataset(test, data_comm, shuffle=True)

    train_iter = chainer.iterators.SerialIterator(
        train, args.batchsize, shuffle=False)
    test_iter = chainer.iterators.SerialIterator(
        test, args.batchsize, repeat=False, shuffle=False)

    updater = training.StandardUpdater(train_iter, optimizer, device=device)
    trainer = training.Trainer(updater, (args.epoch, 'epoch'), out=args.out)
    evaluator = extensions.Evaluator(test_iter, model, device=device)
    evaluator = chainermn.create_multi_node_evaluator(evaluator, data_comm)
    trainer.extend(evaluator)

    # Some display and output extensions are necessary only for worker 0.
    if comm.rank == 0:
        trainer.extend(extensions.DumpGraph('main/loss'))
        trainer.extend(extensions.LogReport())
        trainer.extend(extensions.PrintReport(
            ['epoch', 'main/loss', 'validation/main/loss',
             'main/accuracy', 'validation/main/accuracy', 'elapsed_time']))
        trainer.extend(extensions.ProgressBar())

    trainer.run()
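
Example #31 mixes data and model parallelism by splitting COMM_WORLD twice: data_axis = rank % 2 decides which half of the model a process runs (MLP0 or MLP1), and model_axis = rank // 2 decides which two-process pipeline it belongs to. comm.split groups the ranks that share the same first argument, so model_comm pairs each MLP0 process with its MLP1 partner, while data_comm connects all replicas of the same half for the gradient all-reduce. A pure-Python sketch of the mapping for four processes:

# Bookkeeping only; shows how ranks 0..3 are grouped by the two comm.split calls.
for rank in range(4):
    data_axis, model_axis = rank % 2, rank // 2
    half = 'MLP0' if data_axis == 0 else 'MLP1'
    print('rank {}: runs {} in pipeline {}'.format(rank, half, model_axis))
# rank 0: runs MLP0 in pipeline 0
# rank 1: runs MLP1 in pipeline 0
# rank 2: runs MLP0 in pipeline 1
# rank 3: runs MLP1 in pipeline 1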
예제 #32
0
def main():
    model_cfgs = {
        'resnet50': {
            'class': ResNet50,
            'score_layer_name': 'fc6',
            'kwargs': {
                'arch': 'fb'
            }
        },
        'resnet101': {
            'class': ResNet101,
            'score_layer_name': 'fc6',
            'kwargs': {
                'arch': 'fb'
            }
        },
        'resnet152': {
            'class': ResNet152,
            'score_layer_name': 'fc6',
            'kwargs': {
                'arch': 'fb'
            }
        }
    }
    parser = argparse.ArgumentParser(
        description='Learning convnet from ILSVRC2012 dataset')
    parser.add_argument('train', help='Path to root of the train dataset')
    parser.add_argument('val', help='Path to root of the validation dataset')
    parser.add_argument('--model',
                        '-m',
                        choices=model_cfgs.keys(),
                        default='resnet50',
                        help='Convnet models')
    parser.add_argument('--communicator',
                        type=str,
                        default='pure_nccl',
                        help='Type of communicator')
    parser.add_argument('--loaderjob', type=int, default=4)
    parser.add_argument('--batchsize',
                        type=int,
                        default=32,
                        help='Batch size for each worker')
    parser.add_argument('--lr', type=float)
    parser.add_argument('--momentum', type=float, default=0.9)
    parser.add_argument('--weight_decay', type=float, default=0.0001)
    parser.add_argument('--out', type=str, default='result')
    parser.add_argument('--epoch', type=int, default=90)
    args = parser.parse_args()

    # https://docs.chainer.org/en/stable/chainermn/tutorial/tips_faqs.html#using-multiprocessiterator
    if hasattr(multiprocessing, 'set_start_method'):
        multiprocessing.set_start_method('forkserver')
        p = multiprocessing.Process()
        p.start()
        p.join()

    comm = chainermn.create_communicator(args.communicator)
    device = comm.intra_rank

    if args.lr is not None:
        lr = args.lr
    else:
        lr = 0.1 * (args.batchsize * comm.size) / 256
        if comm.rank == 0:
            print('lr={}: lr is selected based on the linear '
                  'scaling rule'.format(lr))

    label_names = directory_parsing_label_names(args.train)

    model_cfg = model_cfgs[args.model]
    extractor = model_cfg['class'](n_class=len(label_names),
                                   **model_cfg['kwargs'])
    extractor.pick = model_cfg['score_layer_name']
    model = Classifier(extractor)
    # Following https://arxiv.org/pdf/1706.02677.pdf,
    # the gamma of the last BN of each resblock is initialized by zeros.
    for l in model.links():
        if isinstance(l, Bottleneck):
            l.conv3.bn.gamma.data[:] = 0

    train_data = DirectoryParsingLabelDataset(args.train)
    val_data = DirectoryParsingLabelDataset(args.val)
    train_data = TransformDataset(train_data, ('img', 'label'),
                                  TrainTransform(extractor.mean))
    val_data = TransformDataset(val_data, ('img', 'label'),
                                ValTransform(extractor.mean))
    print('finished loading dataset')

    if comm.rank == 0:
        train_indices = np.arange(len(train_data))
        val_indices = np.arange(len(val_data))
    else:
        train_indices = None
        val_indices = None

    train_indices = chainermn.scatter_dataset(train_indices,
                                              comm,
                                              shuffle=True)
    val_indices = chainermn.scatter_dataset(val_indices, comm, shuffle=True)
    train_data = train_data.slice[train_indices]
    val_data = val_data.slice[val_indices]
    train_iter = chainer.iterators.MultiprocessIterator(
        train_data, args.batchsize, n_processes=args.loaderjob)
    val_iter = iterators.MultiprocessIterator(val_data,
                                              args.batchsize,
                                              repeat=False,
                                              shuffle=False,
                                              n_processes=args.loaderjob)

    optimizer = chainermn.create_multi_node_optimizer(
        CorrectedMomentumSGD(lr=lr, momentum=args.momentum), comm)
    optimizer.setup(model)
    for param in model.params():
        if param.name not in ('beta', 'gamma'):
            param.update_rule.add_hook(WeightDecay(args.weight_decay))

    if device >= 0:
        chainer.cuda.get_device(device).use()
        model.to_gpu()

    updater = chainer.training.StandardUpdater(train_iter,
                                               optimizer,
                                               device=device)

    trainer = training.Trainer(updater, (args.epoch, 'epoch'), out=args.out)

    @make_shift('lr')
    def warmup_and_exponential_shift(trainer):
        epoch = trainer.updater.epoch_detail
        warmup_epoch = 5
        if epoch < warmup_epoch:
            if lr > 0.1:
                warmup_rate = 0.1 / lr
                rate = warmup_rate \
                    + (1 - warmup_rate) * epoch / warmup_epoch
            else:
                rate = 1
        elif epoch < 30:
            rate = 1
        elif epoch < 60:
            rate = 0.1
        elif epoch < 80:
            rate = 0.01
        else:
            rate = 0.001
        return rate * lr

    trainer.extend(warmup_and_exponential_shift)
    evaluator = chainermn.create_multi_node_evaluator(
        extensions.Evaluator(val_iter, model, device=device), comm)
    trainer.extend(evaluator, trigger=(1, 'epoch'))

    log_interval = 0.1, 'epoch'
    print_interval = 0.1, 'epoch'

    if comm.rank == 0:
        trainer.extend(chainer.training.extensions.observe_lr(),
                       trigger=log_interval)
        trainer.extend(extensions.snapshot_object(
            extractor, 'snapshot_model_{.updater.epoch}.npz'),
                       trigger=(args.epoch, 'epoch'))
        trainer.extend(extensions.LogReport(trigger=log_interval))
        trainer.extend(extensions.PrintReport([
            'iteration', 'epoch', 'elapsed_time', 'lr', 'main/loss',
            'validation/main/loss', 'main/accuracy', 'validation/main/accuracy'
        ]),
                       trigger=print_interval)
        trainer.extend(extensions.ProgressBar(update_interval=10))

    trainer.run()
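
Example #32 picks the base learning rate by the linear scaling rule (lr = 0.1 * global_batch / 256, following the paper cited in the code) and then applies a 5-epoch gradual warm-up plus step decay at epochs 30, 60 and 80 via make_shift. The standalone function below reproduces that schedule so the resulting values can be inspected; the worker count and batch size are assumptions chosen for illustration.

# Reproduction of warmup_and_exponential_shift as a plain function
# (assumes comm.size == 32 and --batchsize 32, i.e. lr == 0.1 * 1024 / 256 == 0.4).
def scheduled_lr(epoch, lr=0.4, warmup_epoch=5):
    if epoch < warmup_epoch:
        if lr > 0.1:
            warmup_rate = 0.1 / lr
            rate = warmup_rate + (1 - warmup_rate) * epoch / warmup_epoch
        else:
            rate = 1
    elif epoch < 30:
        rate = 1
    elif epoch < 60:
        rate = 0.1
    elif epoch < 80:
        rate = 0.01
    else:
        rate = 0.001
    return rate * lr

for e in (0, 2.5, 10, 45, 70, 85):
    print('epoch {:>4}: lr = {:.4f}'.format(e, scheduled_lr(e)))
# epoch 0: 0.1000, epoch 2.5: 0.2500, epoch 10: 0.4000,
# epoch 45: 0.0400, epoch 70: 0.0040, epoch 85: 0.0004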
예제 #33
0
File: seq2seq.py  Project: asi1024/chainer
def main():
    parser = argparse.ArgumentParser(description='Chainer example: seq2seq')
    parser.add_argument('--batchsize', '-b', type=int, default=64,
                        help='Number of images in each mini-batch')
    parser.add_argument('--bleu', action='store_true', default=False,
                        help='Report BLEU score')
    parser.add_argument('--gpu', '-g', action='store_true',
                        help='Use GPU')
    parser.add_argument('--cache', '-c', default=None,
                        help='Directory to cache pre-processed dataset')
    parser.add_argument('--resume', '-r', default='',
                        help='Resume the training from snapshot')
    parser.add_argument('--unit', '-u', type=int, default=1024,
                        help='Number of units')
    parser.add_argument('--communicator', default='hierarchical',
                        help='Type of communicator')
    parser.add_argument('--stop', '-s', type=str, default='15e',
                        help='Stop trigger (ex. "500i", "15e")')
    parser.add_argument('--input', '-i', type=str, default='wmt',
                        help='Input directory')
    parser.add_argument('--optimizer', type=str, default='adam()',
                        help='Optimizer and its argument')
    parser.add_argument('--out', '-o', default='result',
                        help='Directory to output the result')
    args = parser.parse_args()

    # Prepare ChainerMN communicator
    if args.gpu:
        comm = chainermn.create_communicator('hierarchical')
        dev = comm.intra_rank
    else:
        comm = chainermn.create_communicator('naive')
        dev = -1

    if comm.rank == 0:
        print('==========================================')
        print('Num process (COMM_WORLD): {}'.format(comm.size))
        if args.gpu:
            print('Using GPUs')
        print('Using {} communicator'.format(args.communicator))
        print('Num unit: {}'.format(args.unit))
        print('Num Minibatch-size: {}'.format(args.batchsize))
        print('==========================================')

    # Rank 0 prepares all data
    if comm.rank == 0:
        if args.cache and not os.path.exists(args.cache):
            os.mkdir(args.cache)

        # Read source data
        bt = time.time()
        if args.cache:
            cache_file = os.path.join(args.cache, 'source.pickle')
            source_vocab, source_data = cached_call(cache_file,
                                                    read_source,
                                                    args.input, args.cache)
        else:
            source_vocab, source_data = read_source(args.input, args.cache)
        et = time.time()
        print('RD source done. {:.3f} [s]'.format(et - bt))
        sys.stdout.flush()

        # Read target data
        bt = time.time()
        if args.cache:
            cache_file = os.path.join(args.cache, 'target.pickle')
            target_vocab, target_data = cached_call(cache_file,
                                                    read_target,
                                                    args.input, args.cache)
        else:
            target_vocab, target_data = read_target(args.input, args.cache)
        et = time.time()
        print('RD target done. {:.3f} [s]'.format(et - bt))
        sys.stdout.flush()

        print('Original training data size: %d' % len(source_data))
        train_data = [(s, t)
                      for s, t in six.moves.zip(source_data, target_data)
                      if 0 < len(s) < 50 and 0 < len(t) < 50]
        print('Filtered training data size: %d' % len(train_data))

        en_path = os.path.join(args.input, 'dev', 'newstest2013.en')
        source_data = europal.make_dataset(en_path, source_vocab)
        fr_path = os.path.join(args.input, 'dev', 'newstest2013.fr')
        target_data = europal.make_dataset(fr_path, target_vocab)
        assert(len(source_data) == len(target_data))
        test_data = [(s, t) for s, t
                     in six.moves.zip(source_data, target_data)
                     if 0 < len(s) and 0 < len(t)]

        source_ids = {word: index
                      for index, word in enumerate(source_vocab)}
        target_ids = {word: index
                      for index, word in enumerate(target_vocab)}
    else:
        # target_data, source_data = None, None
        train_data, test_data = None, None
        target_ids, source_ids = None, None

    # Print GPU id
    for i in range(0, comm.size):
        if comm.rank == i:
            print('Rank {} GPU: {}'.format(comm.rank, dev))
        sys.stdout.flush()
        comm.mpi_comm.Barrier()

    # Broadcast the word -> id dictionaries
    source_ids = comm.bcast_obj(source_ids, root=0)
    target_ids = comm.bcast_obj(target_ids, root=0)

    target_words = {i: w for w, i in target_ids.items()}
    source_words = {i: w for w, i in source_ids.items()}

    if comm.rank == 0:
        print('target_words : {}'.format(len(target_words)))
        print('source_words : {}'.format(len(source_words)))

    model = Seq2seq(3, len(source_ids), len(target_ids), args.unit)

    if dev >= 0:
        chainer.cuda.get_device_from_id(dev).use()
        model.to_gpu(dev)

    # determine the stop trigger
    m = re.match(r'^(\d+)e$', args.stop)
    if m:
        trigger = (int(m.group(1)), 'epoch')
    else:
        m = re.match(r'^(\d+)i$', args.stop)
        if m:
            trigger = (int(m.group(1)), 'iteration')
        else:
            if comm.rank == 0:
                sys.stderr.write('Error: unknown stop trigger: {}'.format(
                    args.stop))
            exit(-1)

    if comm.rank == 0:
        print('Trigger: {}'.format(trigger))

    optimizer = chainermn.create_multi_node_optimizer(
        create_optimizer(args.optimizer), comm)
    optimizer.setup(model)

    # Split and distribute the datasets to all workers.
    train_data = chainermn.scatter_dataset(train_data, comm)

    test_data = chainermn.scatter_dataset(test_data, comm)

    train_iter = chainer.iterators.SerialIterator(train_data,
                                                  args.batchsize,
                                                  shuffle=False)
    updater = training.StandardUpdater(
        train_iter, optimizer, converter=convert, device=dev)
    trainer = training.Trainer(updater,
                               trigger,
                               out=args.out)

    trainer.extend(chainermn.create_multi_node_evaluator(
        BleuEvaluator(model, test_data, device=dev, comm=comm),
        comm))

    def translate_one(source, target):
        words = europal.split_sentence(source)
        print('# source : ' + ' '.join(words))
        x = model.xp.array(
            [source_ids.get(w, 1) for w in words], numpy.int32)
        ys = model.translate([x])[0]
        words = [target_words[y] for y in ys]
        print('#  result : ' + ' '.join(words))
        print('#  expect : ' + target)

    # @chainer.training.make_extension(trigger=(200, 'iteration'))
    def translate(trainer):
        translate_one(
            'Who are we ?',
            'Qui sommes-nous?')
        translate_one(
            'And it often costs over a hundred dollars ' +
            'to obtain the required identity card .',
            'Or, il en coûte souvent plus de cent dollars ' +
            'pour obtenir la carte d\'identité requise.')

        source, target = test_data[numpy.random.choice(len(test_data))]
        source = ' '.join([source_words.get(i, '') for i in source])
        target = ' '.join([target_words.get(i, '') for i in target])
        translate_one(source, target)

    if comm.rank == 0:
        trainer.extend(extensions.LogReport(trigger=(1, 'epoch')),
                       trigger=(1, 'epoch'))

        report = extensions.PrintReport(['epoch',
                                         'iteration',
                                         'main/loss',
                                         'main/perp',
                                         'validation/main/bleu',
                                         'elapsed_time'])
        trainer.extend(report, trigger=(1, 'epoch'))

    comm.mpi_comm.Barrier()
    if comm.rank == 0:
        print('start training')
        sys.stdout.flush()

    trainer.run()
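
Example #33 builds the vocabularies only on rank 0 and then ships the word -> id dictionaries to every worker with comm.bcast_obj, so the other ranks never have to re-read the corpus. A minimal sketch of that build-on-root, broadcast-to-all pattern with a toy vocabulary (an illustration, assumed to be launched under MPI):

# Sketch of the rank-0-builds / everyone-receives pattern used above.
import chainermn

comm = chainermn.create_communicator('naive')
if comm.rank == 0:
    source_ids = {'<eos>': 0, 'hello': 1, 'world': 2}  # toy vocabulary
else:
    source_ids = None
# Every rank receives an identical copy of the rank-0 dictionary.
source_ids = comm.bcast_obj(source_ids, root=0)
assert source_ids['<eos>'] == 0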
예제 #34
0
def train(args, dataset_train, dataset_test):
    random.seed(0)
    np.random.seed(0)

    if args.multi_node:
        import chainermn
        comm = chainermn.create_communicator('hierarchical')
        device = comm.intra_rank

        args.n_gpu = comm.size
        args.inter_size = comm.inter_size
        args.intra_size = comm.intra_size
        args.batch_size_total = args.batch_size * args.n_gpu

        chainer.cuda.get_device(device).use()
    else:
        args.batch_size_total = args.batch_size
        chainer.cuda.get_device_from_id(args.gpu).use()
        device = args.gpu

    # Model

    G_A = chainer_cyclegan.models.ResnetGenerator()
    G_B = chainer_cyclegan.models.ResnetGenerator()
    D_A = chainer_cyclegan.models.NLayerDiscriminator()
    D_B = chainer_cyclegan.models.NLayerDiscriminator()

    if args.multi_node or args.gpu >= 0:
        G_A.to_gpu()
        G_B.to_gpu()
        D_A.to_gpu()
        D_B.to_gpu()

    # Optimizer

    args.lr = 0.0002
    args.beta1 = 0.5
    args.beta2 = 0.999

    optimizer_G_A = chainer.optimizers.Adam(alpha=args.lr,
                                            beta1=args.beta1,
                                            beta2=args.beta2)
    optimizer_G_B = chainer.optimizers.Adam(alpha=args.lr,
                                            beta1=args.beta1,
                                            beta2=args.beta2)
    optimizer_D_A = chainer.optimizers.Adam(alpha=args.lr,
                                            beta1=args.beta1,
                                            beta2=args.beta2)
    optimizer_D_B = chainer.optimizers.Adam(alpha=args.lr,
                                            beta1=args.beta1,
                                            beta2=args.beta2)

    if args.multi_node:
        optimizer_G_A = chainermn.create_multi_node_optimizer(
            optimizer_G_A, comm)
        optimizer_G_B = chainermn.create_multi_node_optimizer(
            optimizer_G_B, comm)
        optimizer_D_A = chainermn.create_multi_node_optimizer(
            optimizer_D_A, comm)
        optimizer_D_B = chainermn.create_multi_node_optimizer(
            optimizer_D_B, comm)

    optimizer_G_A.setup(G_A)
    optimizer_G_B.setup(G_B)
    optimizer_D_A.setup(D_A)
    optimizer_D_B.setup(D_B)

    # Dataset

    if args.multi_node:
        if comm.rank != 0:
            dataset_train = None
            dataset_test = None
        dataset_train = chainermn.scatter_dataset(dataset_train,
                                                  comm,
                                                  shuffle=True)
        dataset_test = chainermn.scatter_dataset(dataset_test, comm)

    iter_train = chainer.iterators.MultiprocessIterator(
        dataset_train,
        batch_size=args.batch_size,
        n_processes=4,
        shared_mem=10**7)
    iter_test = chainer.iterators.SerialIterator(dataset_test,
                                                 batch_size=args.batch_size,
                                                 repeat=False,
                                                 shuffle=False)

    # Updater

    epoch_count = 1
    niter = 100
    niter_decay = 100

    updater = chainer_cyclegan.updaters.CycleGANUpdater(
        iterator=iter_train,
        optimizer=dict(
            G_A=optimizer_G_A,
            G_B=optimizer_G_B,
            D_A=optimizer_D_A,
            D_B=optimizer_D_B,
        ),
        device=device,
    )

    # Trainer

    out = osp.join('logs/train_cyclegan',
                   datetime.datetime.now().strftime('%Y%m%d_%H%M%S'))
    trainer = training.Trainer(updater, (niter + niter_decay, 'epoch'),
                               out=out)

    @training.make_extension(trigger=(1, 'epoch'))
    def tune_learning_rate(trainer):
        epoch = trainer.updater.epoch

        lr_rate = 1.0 - (max(0, epoch + 1 + epoch_count - niter) /
                         float(niter_decay + 1))

        trainer.updater.get_optimizer('G_A').alpha *= lr_rate
        trainer.updater.get_optimizer('G_B').alpha *= lr_rate
        trainer.updater.get_optimizer('D_A').alpha *= lr_rate
        trainer.updater.get_optimizer('D_B').alpha *= lr_rate

    trainer.extend(tune_learning_rate)

    if not args.multi_node or comm.rank == 0:
        trainer.extend(
            chainer_cyclegan.extensions.CycleGANEvaluator(iter_test,
                                                          device=device))

        trainer.extend(extensions.snapshot_object(
            target=G_A, filename='G_A_{.updater.epoch:08}.npz'),
                       trigger=(1, 'epoch'))
        trainer.extend(extensions.snapshot_object(
            target=G_B, filename='G_B_{.updater.epoch:08}.npz'),
                       trigger=(1, 'epoch'))
        trainer.extend(extensions.snapshot_object(
            target=D_A, filename='D_A_{.updater.epoch:08}.npz'),
                       trigger=(1, 'epoch'))
        trainer.extend(extensions.snapshot_object(
            target=D_B, filename='D_B_{.updater.epoch:08}.npz'),
                       trigger=(1, 'epoch'))

        trainer.extend(extensions.LogReport(trigger=(20, 'iteration')))
        trainer.extend(
            extensions.PrintReport([
                'epoch',
                'iteration',
                'elapsed_time',
                'loss_gen_A',
                'loss_gen_B',
                'loss_dis_A',
                'loss_dis_B',
                'loss_cyc_A',
                'loss_cyc_B',
                'loss_idt_A',
                'loss_idt_B',
            ]))
        trainer.extend(contrib.extensions.ParamsReport(args.__dict__))
        trainer.extend(extensions.ProgressBar(update_interval=10))

        assert extensions.PlotReport.available()
        trainer.extend(
            extensions.PlotReport(y_keys=['loss_gen_A', 'loss_gen_B'],
                                  x_key='iteration',
                                  file_name='loss_gen.png',
                                  trigger=(100, 'iteration')))
        trainer.extend(
            extensions.PlotReport(y_keys=['loss_dis_A', 'loss_dis_B'],
                                  x_key='iteration',
                                  file_name='loss_dis.png',
                                  trigger=(100, 'iteration')))
        trainer.extend(
            extensions.PlotReport(y_keys=['loss_cyc_A', 'loss_cyc_B'],
                                  x_key='iteration',
                                  file_name='loss_cyc.png',
                                  trigger=(100, 'iteration')))
        trainer.extend(
            extensions.PlotReport(y_keys=['loss_idt_A', 'loss_idt_B'],
                                  x_key='iteration',
                                  file_name='loss_idt.png',
                                  trigger=(100, 'iteration')))

    trainer.run()
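
The tune_learning_rate extension in example #34 implements the usual CycleGAN schedule: the factor below stays at 1.0 for the first niter epochs and then falls linearly towards zero over niter_decay epochs. Because the extension multiplies each optimizer's alpha by this factor every epoch, rather than recomputing alpha from a stored base value, the factors compound during the decay phase. A small sketch of the per-epoch factor:

# Per-epoch factor applied by tune_learning_rate in the example above
# (epoch_count == 1, niter == 100, niter_decay == 100, as set in train()).
def lr_rate(epoch, epoch_count=1, niter=100, niter_decay=100):
    return 1.0 - (max(0, epoch + 1 + epoch_count - niter) /
                  float(niter_decay + 1))

print(lr_rate(0))    # 1.0   -> alpha is left untouched early on
print(lr_rate(99))   # ~0.99 -> decay begins around epoch `niter`
print(lr_rate(150))  # ~0.49 -> and keeps shrinking alpha multiplicatively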
예제 #35
0
def main():
    # Check if GPU is available
    # (ImageNet example does not support CPU execution)
    if not chainer.cuda.available:
        raise RuntimeError("ImageNet requires GPU support.")

    archs = {
        'alex': alex.Alex,
        'googlenet': googlenet.GoogLeNet,
        'googlenetbn': googlenetbn.GoogLeNetBN,
        'nin': nin.NIN,
        'resnet50': resnet50.ResNet50,
    }

    parser = argparse.ArgumentParser(
        description='Learning convnet from ILSVRC2012 dataset')
    parser.add_argument('train', help='Path to training image-label list file')
    parser.add_argument('val', help='Path to validation image-label list file')
    parser.add_argument('--arch', '-a', choices=archs.keys(), default='nin',
                        help='Convnet architecture')
    parser.add_argument('--batchsize', '-B', type=int, default=32,
                        help='Learning minibatch size')
    parser.add_argument('--epoch', '-E', type=int, default=10,
                        help='Number of epochs to train')
    parser.add_argument('--initmodel',
                        help='Initialize the model from given file')
    parser.add_argument('--loaderjob', '-j', type=int,
                        help='Number of parallel data loading processes')
    parser.add_argument('--mean', '-m', default='mean.npy',
                        help='Mean file (computed by compute_mean.py)')
    parser.add_argument('--resume', '-r', default='',
                        help='Initialize the trainer from given file')
    parser.add_argument('--out', '-o', default='result',
                        help='Output directory')
    parser.add_argument('--root', '-R', default='.',
                        help='Root directory path of image files')
    parser.add_argument('--val_batchsize', '-b', type=int, default=250,
                        help='Validation minibatch size')
    parser.add_argument('--test', action='store_true')
    parser.add_argument('--communicator', default='hierarchical')
    parser.set_defaults(test=False)
    args = parser.parse_args()

    # Prepare ChainerMN communicator.
    comm = chainermn.create_communicator(args.communicator)
    device = comm.intra_rank

    if comm.rank == 0:
        print('==========================================')
        print('Num process (COMM_WORLD): {}'.format(comm.size))
        print('Using {} communicator'.format(args.communicator))
        print('Using {} arch'.format(args.arch))
        print('Num Minibatch-size: {}'.format(args.batchsize))
        print('Num epoch: {}'.format(args.epoch))
        print('==========================================')

    model = archs[args.arch]()
    if args.initmodel:
        print('Load model from', args.initmodel)
        chainer.serializers.load_npz(args.initmodel, model)

    chainer.cuda.get_device_from_id(device).use()  # Make the GPU current
    model.to_gpu()

    # Split and distribute the dataset. Only worker 0 loads the whole dataset.
    # Datasets of worker 0 are evenly split and distributed to all workers.
    mean = np.load(args.mean)
    if comm.rank == 0:
        train = PreprocessedDataset(args.train, args.root, mean, model.insize)
        val = PreprocessedDataset(
            args.val, args.root, mean, model.insize, False)
    else:
        train = None
        val = None
    train = chainermn.scatter_dataset(train, comm, shuffle=True)
    val = chainermn.scatter_dataset(val, comm)

    # We need to change the start method of multiprocessing module if we are
    # using InfiniBand and MultiprocessIterator. This is because processes
    # often crash when calling fork if they are using Infiniband.
    # (c.f., https://www.open-mpi.org/faq/?category=tuning#fork-warning )
    multiprocessing.set_start_method('forkserver')
    train_iter = chainer.iterators.MultiprocessIterator(
        train, args.batchsize, n_processes=args.loaderjob)
    val_iter = chainer.iterators.MultiprocessIterator(
        val, args.val_batchsize, repeat=False, n_processes=args.loaderjob)

    # Create a multi node optimizer from a standard Chainer optimizer.
    optimizer = chainermn.create_multi_node_optimizer(
        chainer.optimizers.MomentumSGD(lr=0.01, momentum=0.9), comm)
    optimizer.setup(model)

    # Set up a trainer
    updater = training.StandardUpdater(train_iter, optimizer, device=device)
    trainer = training.Trainer(updater, (args.epoch, 'epoch'), args.out)

    checkpoint_interval = (10, 'iteration') if args.test else (1, 'epoch')
    val_interval = (10, 'iteration') if args.test else (1, 'epoch')
    log_interval = (10, 'iteration') if args.test else (1, 'epoch')

    checkpointer = chainermn.create_multi_node_checkpointer(
        name='imagenet-example', comm=comm)
    checkpointer.maybe_load(trainer, optimizer)
    trainer.extend(checkpointer, trigger=checkpoint_interval)

    # Create a multi node evaluator from an evaluator.
    evaluator = TestModeEvaluator(val_iter, model, device=device)
    evaluator = chainermn.create_multi_node_evaluator(evaluator, comm)
    trainer.extend(evaluator, trigger=val_interval)

    # Some display and output extensions are necessary only for one worker.
    # (Otherwise, there would just be repeated outputs.)
    if comm.rank == 0:
        trainer.extend(extensions.dump_graph('main/loss'))
        trainer.extend(extensions.LogReport(trigger=log_interval))
        trainer.extend(extensions.observe_lr(), trigger=log_interval)
        trainer.extend(extensions.PrintReport([
            'epoch', 'iteration', 'main/loss', 'validation/main/loss',
            'main/accuracy', 'validation/main/accuracy', 'lr'
        ]), trigger=log_interval)
        trainer.extend(extensions.ProgressBar(update_interval=10))

    if args.resume:
        chainer.serializers.load_npz(args.resume, trainer)

    trainer.run()
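
The long comment in example #35 explains the forkserver requirement: MPI processes that use InfiniBand tend to crash when they fork, so MultiprocessIterator workers have to be spawned through a fork server instead. Example #32 adds one more precaution and starts a dummy process immediately, so the fork server is created while the interpreter is still free of CUDA/MPI state. A short sketch combining both guards (an assumption drawn from the two examples, not a separate recipe from the original):

# Defensive start-method setup, done before any MultiprocessIterator exists.
import multiprocessing

if hasattr(multiprocessing, 'set_start_method'):  # Python 3.4+
    multiprocessing.set_start_method('forkserver')
    # Touch the fork server now, while the process is still "clean".
    p = multiprocessing.Process()
    p.start()
    p.join()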
예제 #36
0
def main():
    parser = argparse.ArgumentParser()
    parser.add_argument(
        '--model', choices=('resnet50', 'resnet101'))
    parser.add_argument('--batchsize', type=int, default=16)
    parser.add_argument('--out', default='result')
    parser.add_argument('--resume')
    args = parser.parse_args()

    comm = chainermn.create_communicator()
    device = comm.intra_rank

    if args.model == 'resnet50':
        model = FasterRCNNFPNResNet50(
            n_fg_class=len(coco_bbox_label_names), mean='chainercv')
        copyparams(model.extractor.base,
                   ResNet50(pretrained_model='imagenet', arch='he'))
    elif args.model == 'resnet101':
        model = FasterRCNNFPNResNet101(
            n_fg_class=len(coco_bbox_label_names), mean='chainercv')
        copyparams(model.extractor.base,
                   ResNet101(pretrained_model='imagenet', arch='he'))

    model.use_preset('evaluate')
    train_chain = TrainChain(model)
    chainer.cuda.get_device_from_id(device).use()
    train_chain.to_gpu()

    train = TransformDataset(
        ConcatenatedDataset(
            COCOBboxDataset(split='train'),
            COCOBboxDataset(split='valminusminival'),
        ), ('img', 'bbox', 'label'), transform)

    if comm.rank == 0:
        indices = np.arange(len(train))
    else:
        indices = None
    indices = chainermn.scatter_dataset(indices, comm, shuffle=True)
    train = train.slice[indices]

    train_iter = chainer.iterators.MultithreadIterator(
        train, args.batchsize // comm.size)

    optimizer = chainermn.create_multi_node_optimizer(
        chainer.optimizers.MomentumSGD(), comm)
    optimizer.setup(train_chain)
    optimizer.add_hook(WeightDecay(0.0001))

    model.extractor.base.conv1.disable_update()
    model.extractor.base.res2.disable_update()
    for link in model.links():
        if isinstance(link, L.BatchNormalization):
            link.disable_update()

    updater = training.updaters.StandardUpdater(
        train_iter, optimizer, converter=converter, device=device)
    trainer = training.Trainer(
        updater, (90000 * 16 / args.batchsize, 'iteration'), args.out)

    def lr_schedule(updater):
        base_lr = 0.02 * args.batchsize / 16
        warm_up_duration = 500
        warm_up_rate = 1 / 3

        iteration = updater.iteration
        if iteration < warm_up_duration:
            rate = warm_up_rate \
                + (1 - warm_up_rate) * iteration / warm_up_duration
        elif iteration < 60000 * 16 / args.batchsize:
            rate = 1
        elif iteration < 80000 * 16 / args.batchsize:
            rate = 0.1
        else:
            rate = 0.01

        return base_lr * rate

    trainer.extend(ManualScheduler('lr', lr_schedule))

    if comm.rank == 0:
        log_interval = 10, 'iteration'
        trainer.extend(extensions.LogReport(trigger=log_interval))
        trainer.extend(extensions.observe_lr(), trigger=log_interval)
        trainer.extend(extensions.PrintReport(
            ['epoch', 'iteration', 'lr', 'main/loss',
             'main/loss/rpn/loc', 'main/loss/rpn/conf',
             'main/loss/head/loc', 'main/loss/head/conf']),
            trigger=log_interval)
        trainer.extend(extensions.ProgressBar(update_interval=10))

        trainer.extend(extensions.snapshot(), trigger=(10000, 'iteration'))
        trainer.extend(
            extensions.snapshot_object(
                model, 'model_iter_{.updater.iteration}'),
            trigger=(90000 * 16 / args.batchsize, 'iteration'))

    if args.resume:
        serializers.load_npz(args.resume, trainer, strict=False)

    trainer.run()
예제 #37
0
def main():
    parser = argparse.ArgumentParser(description='ChainerMN example: DCGAN')
    parser.add_argument('--batchsize', '-b', type=int, default=50,
                        help='Number of images in each mini-batch')
    parser.add_argument('--communicator', type=str,
                        default='hierarchical', help='Type of communicator')
    parser.add_argument('--epoch', '-e', type=int, default=1000,
                        help='Number of sweeps over the dataset to train')
    parser.add_argument('--gpu', '-g', action='store_true',
                        help='Use GPU')
    parser.add_argument('--dataset', '-i', default='',
                        help='Directory of image files.  Default is cifar-10.')
    parser.add_argument('--out', '-o', default='result',
                        help='Directory to output the result')
    parser.add_argument('--gen_model', '-r', default='',
                        help='Use pre-trained generator for training')
    parser.add_argument('--dis_model', '-d', default='',
                        help='Use pre-trained discriminator for training')
    parser.add_argument('--n_hidden', '-n', type=int, default=100,
                        help='Number of hidden units (z)')
    parser.add_argument('--seed', type=int, default=0,
                        help='Random seed of z at visualization stage')
    parser.add_argument('--snapshot_interval', type=int, default=1000,
                        help='Interval of snapshot')
    parser.add_argument('--display_interval', type=int, default=100,
                        help='Interval of displaying log to console')
    args = parser.parse_args()

    # Prepare ChainerMN communicator.

    if args.gpu:
        if args.communicator == 'naive':
            print("Error: 'naive' communicator does not support GPU.\n")
            exit(-1)
        comm = chainermn.create_communicator(args.communicator)
        device = comm.intra_rank
    else:
        if args.communicator != 'naive':
            print('Warning: using naive communicator '
                  'because only naive supports CPU-only execution')
        comm = chainermn.create_communicator('naive')
        device = -1

    if comm.rank == 0:
        print('==========================================')
        print('Num process (COMM_WORLD): {}'.format(comm.size))
        if args.gpu:
            print('Using GPUs')
        print('Using {} communicator'.format(args.communicator))
        print('Num hidden unit: {}'.format(args.n_hidden))
        print('Num Minibatch-size: {}'.format(args.batchsize))
        print('Num epoch: {}'.format(args.epoch))
        print('==========================================')

    # Set up a neural network to train
    gen = Generator(n_hidden=args.n_hidden)
    dis = Discriminator()

    if device >= 0:
        # Make a specified GPU current
        chainer.cuda.get_device_from_id(device).use()
        gen.to_gpu()  # Copy the model to the GPU
        dis.to_gpu()

    # Setup an optimizer
    def make_optimizer(model, comm, alpha=0.0002, beta1=0.5):
        # Create a multi node optimizer from a standard Chainer optimizer.
        optimizer = chainermn.create_multi_node_optimizer(
            chainer.optimizers.Adam(alpha=alpha, beta1=beta1), comm)
        optimizer.setup(model)
        optimizer.add_hook(chainer.optimizer.WeightDecay(0.0001), 'hook_dec')
        return optimizer

    opt_gen = make_optimizer(gen, comm)
    opt_dis = make_optimizer(dis, comm)

    # Split and distribute the dataset. Only worker 0 loads the whole dataset.
    # Datasets of worker 0 are evenly split and distributed to all workers.
    if comm.rank == 0:
        if args.dataset == '':
            # Load the CIFAR10 dataset if args.dataset is not specified
            train, _ = chainer.datasets.get_cifar10(withlabel=False,
                                                    scale=255.)
        else:
            all_files = os.listdir(args.dataset)
            image_files = [f for f in all_files if ('png' in f or 'jpg' in f)]
            print('{} contains {} image files'
                  .format(args.dataset, len(image_files)))
            train = chainer.datasets\
                .ImageDataset(paths=image_files, root=args.dataset)
    else:
        train = None

    train = chainermn.scatter_dataset(train, comm)

    train_iter = chainer.iterators.SerialIterator(train, args.batchsize)

    # Set up a trainer
    updater = DCGANUpdater(
        models=(gen, dis),
        iterator=train_iter,
        optimizer={
            'gen': opt_gen, 'dis': opt_dis},
        device=device)
    trainer = training.Trainer(updater, (args.epoch, 'epoch'), out=args.out)

    # Some display and output extensions are necessary only for one worker.
    # (Otherwise, there would just be repeated outputs.)
    if comm.rank == 0:
        snapshot_interval = (args.snapshot_interval, 'iteration')
        display_interval = (args.display_interval, 'iteration')
        # Save only model parameters.
        # `snapshot` extension will save all of the trainer module's
        # attributes, including `train_iter`.
        # However, `train_iter` depends on the scattered dataset, which means
        # that `train_iter` may be different in each process.
        # Here, instead of saving the whole trainer module, only the network
        # models are saved.
        trainer.extend(extensions.snapshot_object(
            gen, 'gen_iter_{.updater.iteration}.npz'),
            trigger=snapshot_interval)
        trainer.extend(extensions.snapshot_object(
            dis, 'dis_iter_{.updater.iteration}.npz'),
            trigger=snapshot_interval)
        trainer.extend(extensions.LogReport(trigger=display_interval))
        trainer.extend(extensions.PrintReport([
            'epoch', 'iteration', 'gen/loss', 'dis/loss', 'elapsed_time',
        ]), trigger=display_interval)
        trainer.extend(extensions.ProgressBar(update_interval=10))
        trainer.extend(
            out_generated_image(
                gen, dis,
                10, 10, args.seed, args.out),
            trigger=snapshot_interval)

    # Start the training using pre-trained model, saved by snapshot_object
    if args.gen_model:
        chainer.serializers.load_npz(args.gen_model, gen)
    if args.dis_model:
        chainer.serializers.load_npz(args.dis_model, dis)

    # Run the training
    trainer.run()
예제 #38
0
def main():
    parser = argparse.ArgumentParser(description='Chainer K-FAC example: MNIST')  # NOQA
    parser.add_argument('--batch_size', type=int, default=100)
    parser.add_argument('--num_epochs', type=int, default=20)
    parser.add_argument('--snapshot_interval', type=int, default=-1)
    parser.add_argument('--no_cuda', action='store_true')
    parser.add_argument('--out', default='result')
    parser.add_argument('--resume', default='')
    parser.add_argument('--optimizer', default='kfac')
    parser.add_argument('--arch', choices=['mlp', 'cnn'], default='mlp')
    parser.add_argument('--plot', action='store_true')
    parser.add_argument('--distributed', action='store_true')
    args = parser.parse_args()

    # Prepare communicator
    if not args.distributed:
        # Single process execution
        comm = None
        rank = 0
        device = -1 if args.no_cuda else 0
    else:
        # Multi-process execution: construct a communicator.
        # chainerkfac uses a different method than chainermn to create a
        # communicator.
        if args.optimizer == 'kfac':
            comm = chainerkfac.create_communicator('pure_nccl')
        else:
            comm = chainermn.create_communicator('pure_nccl')
        rank = comm.rank
        device = comm.intra_rank
        if rank == 0:
            print('======== DISTRIBUTED TRAINING ========')

    # Set up a neural network to train
    # Classifier reports softmax cross entropy loss and accuracy at every
    # iteration, which will be used by the PrintReport extension below.
    if args.arch == 'mlp':
        model = L.Classifier(MLP())
        in_ndim = 1  # input dimensions
    else:
        model = L.Classifier(CNN())
        in_ndim = 3  # input dimensions

    if device >= 0:
        # Make a specified GPU current
        chainer.backends.cuda.get_device_from_id(device).use()
        model.to_gpu()  # Copy the model to the GPU

    # Setup an optimizer
    if args.optimizer == 'kfac':
        if comm is None:
            optimizer = chainerkfac.optimizers.KFAC()
        else:
            optimizer = chainerkfac.optimizers.DistributedKFAC(comm)
    else:
        optimizer = chainer.optimizers.Adam()
        if comm is not None:
            optimizer = chainermn.create_multi_node_optimizer(optimizer, comm)
    optimizer.setup(model)

    # Load the MNIST dataset
    if rank == 0:
        train, test = chainer.datasets.get_mnist(ndim=in_ndim)
    else:
        train, test = None, None
    if comm is not None:
        train = chainermn.scatter_dataset(train, comm, shuffle=True)
        test = chainermn.scatter_dataset(test, comm, shuffle=True)

    train_iter = chainer.iterators.SerialIterator(train, args.batch_size)
    test_iter = chainer.iterators.SerialIterator(test, args.batch_size,
                                                 repeat=False, shuffle=False)

    # Set up a trainer
    updater = training.updaters.StandardUpdater(
        train_iter, optimizer, device=device)
    trainer = training.Trainer(updater, (args.num_epochs, 'epoch'),
                               out=args.out)

    # Evaluate the model with the test dataset for each epoch
    evaluator = extensions.Evaluator(test_iter, model, device=device)
    if comm is not None:
        evaluator = chainermn.create_multi_node_evaluator(evaluator, comm)
    trainer.extend(evaluator)

    # Some display and output extensions are necessary only for one worker.
    # (Otherwise, there would just be repeated outputs.)
    if rank == 0:
        # Take a snapshot for each specified epoch
        snapshot_interval = args.num_epochs \
            if args.snapshot_interval == -1 else max(1, args.snapshot_interval)
        trainer.extend(extensions.snapshot(),
                       trigger=(snapshot_interval, 'epoch'))

        # Write a log of evaluation statistics for each epoch
        trainer.extend(extensions.LogReport())

        # Save two plot images to the result dir
        if args.plot and extensions.PlotReport.available():
            trainer.extend(
                extensions.PlotReport(['main/loss', 'validation/main/loss'],
                                      'epoch', file_name='loss.png'))
            trainer.extend(
                extensions.PlotReport(
                    ['main/accuracy', 'validation/main/accuracy'],
                    'epoch', file_name='accuracy.png'))

        # Print selected entries of the log to stdout
        # Here "main" refers to the target link of the "main" optimizer again,
        # and "validation" refers to the default name of the Evaluator
        # extension. Entries other than 'epoch' are reported by the Classifier
        # link, called by either the updater or the evaluator.
        trainer.extend(extensions.PrintReport(
            ['epoch', 'main/loss', 'validation/main/loss',
             'main/accuracy', 'validation/main/accuracy', 'elapsed_time']))

        # Print a progress bar to stdout
        trainer.extend(extensions.ProgressBar())

    if args.resume:
        # Resume from a snapshot
        chainer.serializers.load_npz(args.resume, trainer)

    # Run the training
    trainer.run()
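The `MLP` and `CNN` classes referenced above are defined elsewhere in the original example. As a rough sketch under that assumption, a minimal MNIST MLP compatible with `L.Classifier(MLP())` and `get_mnist(ndim=1)` might look like the following (layer sizes are illustrative, not taken from the source):

import chainer
import chainer.functions as F
import chainer.links as L


class MLP(chainer.Chain):
    """Minimal three-layer perceptron for flattened MNIST input."""

    def __init__(self, n_units=1000, n_out=10):
        super(MLP, self).__init__()
        with self.init_scope():
            # `None` lets Chainer infer the input size on the first forward pass.
            self.l1 = L.Linear(None, n_units)
            self.l2 = L.Linear(None, n_units)
            self.l3 = L.Linear(None, n_out)

    def __call__(self, x):
        h = F.relu(self.l1(x))
        h = F.relu(self.l2(h))
        return self.l3(h)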
Example #39
def check_mnist(gpu, display_log=True):
    epoch = 5
    batchsize = 100
    n_units = 100

    comm = chainermn.create_communicator('naive')
    if gpu:
        device = comm.intra_rank
        chainer.cuda.get_device_from_id(device).use()
    else:
        device = -1

    model = L.Classifier(MLP(n_units, 10))
    if gpu:
        model.to_gpu()

    optimizer = chainermn.create_multi_node_optimizer(
        chainer.optimizers.Adam(), comm)
    optimizer.setup(model)

    if comm.rank == 0:
        train, test = chainer.datasets.get_mnist()
    else:
        train, test = None, None

    train = chainermn.scatter_dataset(train, comm, shuffle=True)
    test = chainermn.scatter_dataset(test, comm, shuffle=True)

    train_iter = chainer.iterators.SerialIterator(train, batchsize)
    test_iter = chainer.iterators.SerialIterator(test, batchsize,
                                                 repeat=False,
                                                 shuffle=False)

    updater = training.StandardUpdater(
        train_iter,
        optimizer,
        device=device
    )

    trainer = training.Trainer(updater, (epoch, 'epoch'))

    # Wrap the standard Chainer evaluator with a multi-node evaluator.
    evaluator = extensions.Evaluator(test_iter, model, device=device)
    evaluator = chainermn.create_multi_node_evaluator(evaluator, comm)
    trainer.extend(evaluator)

    # Add a checkpointer. This is just to check that checkpointing runs
    # without errors.
    path = tempfile.mkdtemp(dir='/tmp', prefix=__name__ + "-tmp-")
    checkpointer = create_multi_node_checkpointer(name=__name__, comm=comm,
                                                  path=path)
    trainer.extend(checkpointer, trigger=(1, 'epoch'))

    # Some display and output extensions are necessary only for one worker.
    # (Otherwise, there would just be repeated outputs.)
    if comm.rank == 0 and display_log:
        trainer.extend(extensions.LogReport(trigger=(1, 'epoch')),
                       trigger=(1, 'epoch'))
        trainer.extend(extensions.PrintReport(['epoch',
                                               'main/loss',
                                               'validation/main/loss',
                                               'main/accuracy',
                                               'validation/main/accuracy',
                                               'elapsed_time'],
                                              out=sys.stderr),
                       trigger=(1, 'epoch'))
    trainer.run()

    # The evaluator reports accuracy here, so name the result accordingly.
    acc = evaluator()['validation/main/accuracy']
    assert acc > 0.95

    # Check that the checkpointer successfully finalized the snapshot directory.
    assert [] == os.listdir(path)
    os.removedirs(path)
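For completeness, this check is meant to be launched with multiple MPI processes; a hypothetical driver (not part of the original test) could look like:

# Hypothetical entry point, e.g. run as: mpiexec -n 2 python this_test.py
if __name__ == '__main__':
    check_mnist(gpu=False)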