def create(self):
    return optimizers.CorrectedMomentumSGD(0.1)
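For context (not part of the original snippet), a minimal sketch of wiring such an optimizer to a model in Chainer; the classifier below is a placeholder, not taken from the example:

import chainer.links as L
from chainer import optimizers

model = L.Classifier(L.Linear(None, 10))  # placeholder model
optimizer = optimizers.CorrectedMomentumSGD(lr=0.1, momentum=0.9)
optimizer.setup(model)  # bind the optimizer state to the model's parameters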
Example 2
    if args.adabound:
        optimizer = optimizers.AdaBound()
    elif args.rmsprop:
        optimizer = optimizers.RMSprop()
    elif args.adam:
        optimizer = optimizers.Adam()
    elif args.sgd:
        optimizer = optimizers.SGD()
    elif args.adagrad:
        optimizer = optimizers.AdaGrad()
    elif args.amsgrad:
        optimizer = optimizers.AMSGrad()
    elif args.amsbound:
        optimizer = optimizers.AMSBound()
    elif args.correctedmomentsgd:
        optimizer = optimizers.CorrectedMomentumSGD()
    elif args.nesterovag:
        optimizer = optimizers.NesterovAG()
    elif args.msvag:
        optimizer = optimizers.MSVAG()
    elif args.rmspropgraves:
        optimizer = optimizers.RMSpropGraves()
    elif args.smorms3:
        optimizer = optimizers.SMORMS3()
    else:
        optimizer = optimizers.AdaDelta()

    optimizer.setup(net)

    if args.lasso:
        # Sparsify with Lasso regression (L1 regularization)
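        # (Truncated in the original. A minimal sketch of what typically
        # follows: attaching Chainer's Lasso hook, i.e. L1 regularization.
        # The rate below is illustrative, not taken from the original.)
        from chainer.optimizer_hooks import Lasso
        optimizer.add_hook(Lasso(rate=1e-4))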
Example 3
def main():
    # Start the multiprocessing environment
    # https://docs.chainer.org/en/stable/chainermn/tutorial/tips_faqs.html#using-multiprocessiterator
    if hasattr(multiprocessing, 'set_start_method'):
        multiprocessing.set_start_method('forkserver')
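        # Launch a dummy process so the forkserver starts before CUDA is
        # initialized (see the ChainerMN FAQ linked above)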
        p = multiprocessing.Process()
        p.start()
        p.join()

    # Set up workspace
    # Use up to 16 GB of GPU RAM for the cuDNN workspace
    chainer.cuda.set_max_workspace_size(16 * 1024 * 1024 * 1024)

    # Setup the multi-node environment
    comm = chainermn.create_communicator(args.communicator)
    device = comm.intra_rank
    print(
        '==> Successfully setup communicator: "{}" rank: {} device: {} size: {}'
        .format(args.communicator, comm.rank, device, comm.size))
    set_random_seed(args, device)

    # Setup LR
    if args.lr is not None:
        lr = args.lr
    else:
        # Linear scaling rule (Goyal et al., 2017, https://arxiv.org/abs/1706.02677):
        # scale the base LR of 0.1 by the global batch size relative to 256.
        lr = 0.1 * (args.batchsize * comm.size) / 256
        if comm.rank == 0:
            print(
                'LR = {} is selected based on the linear scaling rule'.format(
                    lr))

    # Setup dataset
    train_dir = os.path.join(args.dataset_dir, 'train')
    val_dir = os.path.join(args.dataset_dir, 'val')
    label_names = datasets.directory_parsing_label_names(train_dir)
    train_data = datasets.DirectoryParsingLabelDataset(train_dir)
    val_data = datasets.DirectoryParsingLabelDataset(val_dir)
    train_data = TransformDataset(train_data, ('img', 'label'),
                                  TrainTransform(_mean, args))
    val_data = TransformDataset(val_data, ('img', 'label'),
                                ValTransform(_mean, args))
    print('==> [{}] Successfully finished loading dataset'.format(comm.rank))

    # Initializing dataset iterators
    if comm.rank == 0:
        train_indices = np.arange(len(train_data))
        val_indices = np.arange(len(val_data))
    else:
        train_indices = None
        val_indices = None

    train_indices = chainermn.scatter_dataset(train_indices,
                                              comm,
                                              shuffle=True)
    val_indices = chainermn.scatter_dataset(val_indices, comm, shuffle=True)
    train_data = train_data.slice[train_indices]
    val_data = val_data.slice[val_indices]
    train_iter = chainer.iterators.MultiprocessIterator(
        train_data, args.batchsize, n_processes=args.loaderjob)
    val_iter = iterators.MultiprocessIterator(val_data,
                                              args.batchsize,
                                              repeat=False,
                                              shuffle=False,
                                              n_processes=args.loaderjob)

    # Create the model
    kwargs = {}
    if args.first_bn_mixed16 and args.dtype == 'float16':
        print('==> Setting the first BN layer to mixed16')
        kwargs['first_bn_mixed16'] = True

    # Initialize the model
    net = models.__dict__[args.arch](n_class=len(label_names), **kwargs)
    # Following https://arxiv.org/pdf/1706.02677.pdf,
    # the gamma of the last BN in each residual block is initialized to zero.
    for l in net.links():
        if isinstance(l, Bottleneck):
            l.conv3.bn.gamma.data[:] = 0

    # Apply ada loss transform
    recorder = AdaLossRecorder(sample_per_n_iter=100)
    # Update the model to support AdaLoss
    net = AdaLossScaled(net,
                        init_scale=args.init_scale,
                        cfg={
                            'loss_scale_method': args.loss_scale_method,
                            'scale_upper_bound': args.scale_upper_bound,
                            'accum_upper_bound': args.accum_upper_bound,
                            'update_per_n_iteration':
                            args.update_per_n_iteration,
                            'recorder': recorder,
                        },
                        transforms=[
                            AdaLossTransformLinear(),
                            AdaLossTransformBottleneck(),
                            AdaLossTransformBasicBlock(),
                            AdaLossTransformConv2DBNActiv(),
                        ],
                        verbose=args.verbose)

    if comm.rank == 0:  # print the network only on rank 0
        print(net)
    net = L.Classifier(net)
    hook = AdaLossMonitor(sample_per_n_iter=100,
                          verbose=args.verbose,
                          includes=['Grad', 'Deconvolution'])

    # Setup optimizer
    optim = chainermn.create_multi_node_optimizer(
        optimizers.CorrectedMomentumSGD(lr=lr, momentum=args.momentum), comm)
    if args.dtype == 'mixed16':
        print('==> Using FP32 update for dtype=mixed16')
        optim.use_fp32_update()  # keep master weights and updates in FP32

        # HACK: reuse the existing loss-scaling machinery to skip updates on overflow
        if args.dynamic_interval is not None:
            optim.loss_scaling(interval=args.dynamic_interval, scale=None)
        else:
            optim.loss_scaling(interval=float('inf'), scale=None)
            optim._loss_scale_max = 1.0  # to prevent actual loss scaling

    optim.setup(net)

    # Apply weight decay to every parameter except BatchNormalization's beta/gamma
    for param in net.params():
        if param.name not in ('beta', 'gamma'):
            param.update_rule.add_hook(WeightDecay(args.weight_decay))

    # Move the model to this process's GPU
    if device >= 0:
        chainer.cuda.get_device(device).use()
        net.to_gpu()

    # Create an updater that performs one optimization step per minibatch from train_iter
    updater = chainer.training.StandardUpdater(train_iter,
                                               optim,
                                               device=device)
    # Setup Trainer
    stop_trigger = (args.epoch, 'epoch')
    if args.iter is not None:
        stop_trigger = (args.iter, 'iteration')
    trainer = training.Trainer(updater, stop_trigger, out=args.out)

    @make_shift('lr')
    def warmup_and_exponential_shift(trainer):
        """ LR schedule for training ResNet especially.
        NOTE: lr should be within the context.
        """
        epoch = trainer.updater.epoch_detail
        warmup_epoch = 5  # NOTE: mentioned the original ResNet paper.
        if epoch < warmup_epoch:
            if lr > 0.1:
                warmup_rate = 0.1 / lr
                rate = warmup_rate \
                    + (1 - warmup_rate) * epoch / warmup_epoch
            else:
                rate = 1
        elif epoch < 30:
            rate = 1
        elif epoch < 60:
            rate = 0.1
        elif epoch < 80:
            rate = 0.01
        else:
            rate = 0.001
        return rate * lr

    trainer.extend(warmup_and_exponential_shift)
    evaluator = chainermn.create_multi_node_evaluator(
        extensions.Evaluator(val_iter, net, device=device), comm)
    trainer.extend(evaluator, trigger=(1, 'epoch'))

    log_interval = 0.1, 'epoch'
    print_interval = 0.1, 'epoch'

    if comm.rank == 0:
        print('==========================================')
        print('Num process (COMM_WORLD): {}'.format(comm.size))
        print('Using {} communicator'.format(args.communicator))
        print('Num Minibatch-size: {}'.format(args.batchsize))
        print('Num epoch: {}'.format(args.epoch))
        print('==========================================')

        trainer.extend(chainer.training.extensions.observe_lr(),
                       trigger=log_interval)

        # NOTE: snapshots may be triggered by iteration count when --iter is given
        snapshot_label = 'epoch' if args.iter is None else 'iteration'
        snapshot_trigger = (args.snapshot_freq, snapshot_label)
        snapshot_filename = ('snapshot_' + snapshot_label + '_{.updater.' +
                             snapshot_label + '}.npz')
        trainer.extend(extensions.snapshot(filename=snapshot_filename),
                       trigger=snapshot_trigger)

        trainer.extend(extensions.LogReport(trigger=log_interval))
        trainer.extend(extensions.observe_value(
            'loss_scale',
            lambda trainer: trainer.updater.get_optimizer('main')._loss_scale),
                       trigger=log_interval)
        trainer.extend(extensions.PrintReport([
            'iteration', 'epoch', 'elapsed_time', 'lr', 'loss_scale',
            'main/loss', 'validation/main/loss', 'main/accuracy',
            'validation/main/accuracy'
        ]),
                       trigger=print_interval)
        trainer.extend(extensions.ProgressBar(update_interval=10))

    if args.resume:
        serializers.load_npz(args.resume, trainer)

    recorder.trainer = trainer
    hook.trainer = trainer
    with ExitStack() as stack:
        if comm.rank == 0:
            stack.enter_context(hook)
        trainer.run()

    # store recorded results
    if comm.rank == 0:  # NOTE: export results only on rank 0
        recorder.export().to_csv(os.path.join(args.out, 'loss_scale.csv'))
        hook.export_history().to_csv(os.path.join(args.out, 'grad_stats.csv'))
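To make the warmup arithmetic in the schedule above concrete, here is a standalone sketch (not part of the original script) that evaluates the same rates for a hypothetical base LR of 0.8, i.e. a per-worker batch size of 256 on 8 workers under the linear scaling rule:

def schedule(epoch, lr=0.8, warmup_epoch=5):
    # Same warmup + step logic as warmup_and_exponential_shift above.
    if epoch < warmup_epoch and lr > 0.1:
        warmup_rate = 0.1 / lr
        rate = warmup_rate + (1 - warmup_rate) * epoch / warmup_epoch
    elif epoch < 30:
        rate = 1
    elif epoch < 60:
        rate = 0.1
    elif epoch < 80:
        rate = 0.01
    else:
        rate = 0.001
    return rate * lr

print([schedule(e) for e in (0, 2.5, 5, 30, 60, 80)])
# -> approximately [0.1, 0.45, 0.8, 0.08, 0.008, 0.0008]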
Example 4
def main():
    parser = ArgumentParser()

    parser.add_argument('train_data', help='train data')
    parser.add_argument('train_labels', help='train labels')
    parser.add_argument('--val-data', default=None, help='val data')
    parser.add_argument('--val-labels', default=None, help='val labels')
    parser.add_argument('-b',
                        '--batch-size',
                        type=int,
                        default=5,
                        help='mini-batch size (default=5)')
    parser.add_argument('--beta2',
                        type=float,
                        default=0.999,
                        help='beta2 of Adam (default=0.999)')
    parser.add_argument('-g',
                        '--gpu-id',
                        type=int,
                        default=-1,
                        help='GPU ID (default=-1, indicates CPU)')
    parser.add_argument('--ignore-labels',
                        type=int,
                        default=[],
                        nargs='+',
                        help='labels to ignore (default=[])')
    parser.add_argument('-l',
                        '--learning-rate',
                        type=float,
                        default=0.1,
                        help='learning rate (default=0.1)')
    parser.add_argument('--max-iter',
                        type=int,
                        default=160000,
                        help='train model up to max-iter (default=160000)')
    parser.add_argument(
        '--mean-interval',
        type=int,
        default=1000,
        help='calculate mean of train/loss (and validation loss) ' +
        'every mean-interval iters (default=1000)')
    parser.add_argument('--model',
                        default=None,
                        help='model file to resume training from')
    parser.add_argument('--momentum',
                        type=float,
                        default=0.9,
                        help='momentum rate (default=0.9)')
    parser.add_argument('--n-classes',
                        type=int,
                        default=5,
                        help='number of classes (default=5)')
    parser.add_argument('--noise',
                        default='no',
                        help='noise injection method. \'no\', \'patch\', ' +
                        'and \'permutation\' are available (default=\'no\')')
    parser.add_argument('--optim',
                        default='nesterov',
                        help='optimization method. \'sgd\', \'nesterov\', ' +
                        'and \'adam\' are available (default=\'nesterov\')')
    parser.add_argument(
        '-o',
        '--outdir',
        default='./',
        help='trained models and optimizer states are stored in outdir ' +
        '(default=\'./\')')
    parser.add_argument(
        '--queue-maxsize',
        type=int,
        default=10,
        help='maxsize of queues for training and validation (default=10)')
    parser.add_argument(
        '--save-interval',
        type=int,
        default=10000,
        help='save model & optimizer every save-interval iters (default=10000)'
    )
    parser.add_argument(
        '--state',
        default=None,
        help='optimizer state file to resume training with')
    parser.add_argument('-w',
                        '--weight-decay',
                        type=float,
                        default=1e-4,
                        help='weight decay factor (default=1e-4)')

    args = parser.parse_args()

    print(argv2string(sys.argv) + '\n')
    for arg in dir(args):
        if arg[:1] == '_':
            continue
        print('{} = {}'.format(arg, getattr(args, arg)))
    print()

    if not os.path.isdir(args.outdir):
        os.makedirs(args.outdir)
        print('mkdir ' + args.outdir + '\n')

    model = Model(in_ch=3, out_ch=args.n_classes)
    if args.model is not None:
        S.load_npz(args.model, model)
    loss_func = Loss(model)

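    # NOTE: the checks below use substring matching (e.g. 'nesterov' matches
    # 'nesterovag'), so shorthand option values also select an optimizer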
    if args.optim.lower() in 'sgd':
        if args.momentum > 0:
            optim = optims.CorrectedMomentumSGD(lr=args.learning_rate,
                                                momentum=args.momentum)
        else:
            optim = optims.SGD(lr=args.learning_rate)
    elif args.optim.lower() in 'nesterovag':
        optim = optims.NesterovAG(lr=args.learning_rate,
                                  momentum=args.momentum)
    elif args.optim.lower() in 'adam':
        optim = optims.Adam(alpha=args.learning_rate,
                            beta1=args.momentum,
                            beta2=args.beta2,
                            weight_decay_rate=args.weight_decay,
                            amsgrad=True)
    else:
        raise ValueError('Please specify an available optimizer name.\n' +
                         'SGD, NesterovAG, and Adam are available.')

    print('{}\n'.format(type(optim)))
    optim.setup(model)

    if args.state is not None:
        S.load_npz(args.state, optim)

    if (args.weight_decay > 0) and not isinstance(optim, optims.Adam):
        optim.add_hook(WeightDecay(args.weight_decay))

    optim.add_hook(GradientClipping(1))

    lr_decay_iter_dict = {
        int(5 * args.max_iter / 8): 0.1,
        int(7 * args.max_iter / 8): 0.1,
    }

    with open(args.train_data, 'r') as f:
        train_data_path_list = [line.strip() for line in f.readlines()]
    with open(args.train_labels, 'r') as f:
        train_labels_path_list = [line.strip() for line in f.readlines()]

    assert len(train_data_path_list) == len(train_labels_path_list)

    if (args.val_data is not None) or (args.val_labels is not None):
        if (args.val_data is not None) and (args.val_labels is not None):
            with open(args.val_data, 'r') as f:
                val_data_path_list = [line.strip() for line in f.readlines()]
            with open(args.val_labels, 'r') as f:
                val_labels_path_list = [line.strip() for line in f.readlines()]
            assert len(val_data_path_list) == len(val_labels_path_list)
        else:
            raise ValueError('val_data and val_labels must be specified together.')

    train_queue = mp.Queue(maxsize=args.queue_maxsize)
    train_generator = BatchGenerator(args.batch_size,
                                     train_data_path_list,
                                     train_labels_path_list,
                                     train_queue,
                                     train=True,
                                     noise_injection=args.noise,
                                     out_height=512,
                                     out_width=512,
                                     max_height=1216,
                                     max_width=1216,
                                     min_height=832,
                                     min_width=832)
    train_generator.start()

    if args.val_data is None:
        val_queue = None
    else:
        val_queue = mp.Queue(maxsize=args.queue_maxsize)
        try:
            val_generator = BatchGenerator(1,
                                           val_data_path_list,
                                           val_labels_path_list,
                                           val_queue,
                                           train=False,
                                           out_height=608,
                                           out_width=968)
            val_generator.start()
        except Exception:
            train_generator.terminate()
            train_queue.close()
            val_queue.close()
            raise

    try:
        train(loss_func, optim, train_queue, args.max_iter, args.mean_interval,
              args.save_interval, val_queue, lr_decay_iter_dict, args.gpu_id,
              args.ignore_labels, args.outdir)
    except BaseException:
        train_generator.terminate()
        train_queue.close()
        if val_queue is not None:
            val_generator.terminate()
            val_queue.close()
        raise

    train_generator.terminate()
    train_queue.close()
    if val_queue is not None:
        val_generator.terminate()
        val_queue.close()
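The train() function itself is not shown in this example. As an assumption-labeled sketch (not the original implementation), a dict like lr_decay_iter_dict is commonly consumed in such a loop as follows, for an optimizer that exposes `lr` (Adam exposes `alpha` instead):

def maybe_decay_lr(optim, iteration, lr_decay_iter_dict):
    # Multiply the learning rate by the configured factor when the iteration
    # count hits one of the decay points (5/8 and 7/8 of max_iter above).
    if iteration in lr_decay_iter_dict:
        optim.lr *= lr_decay_iter_dict[iteration]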