Example #1
def get_data_iterators(data_dir,
                       batch_size,
                       num_workers,
                       num_classes,
                       input_image_size=224,
                       resize_inv_factor=0.875):
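    # Builds train/val MultiprocessIterators over an ImageNet-style directory layout
    # (data_dir/train, data_dir/val). PreprocessedDataset rescales each image to
    # ceil(input_image_size / resize_inv_factor) and crops it to input_image_size.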
    assert (resize_inv_factor > 0.0)
    resize_value = int(math.ceil(float(input_image_size) / resize_inv_factor))

    train_dir_path = os.path.join(data_dir, 'train')
    train_dataset = PreprocessedDataset(root=train_dir_path,
                                        scale_size=resize_value,
                                        crop_size=input_image_size)
    assert (len(directory_parsing_label_names(train_dir_path)) == num_classes)

    val_dir_path = os.path.join(data_dir, 'val')
    val_dataset = PreprocessedDataset(root=val_dir_path,
                                      scale_size=resize_value,
                                      crop_size=input_image_size)
    assert (len(directory_parsing_label_names(val_dir_path)) == num_classes)

    train_iterator = iterators.MultiprocessIterator(dataset=train_dataset,
                                                    batch_size=batch_size,
                                                    repeat=False,
                                                    shuffle=True,
                                                    n_processes=num_workers)

    val_iterator = iterators.MultiprocessIterator(dataset=val_dataset,
                                                  batch_size=batch_size,
                                                  repeat=False,
                                                  shuffle=False,
                                                  n_processes=num_workers)

    return train_iterator, val_iterator
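A minimal call sketch for the helper above; the directory path and the concrete hyperparameter values are placeholders, assuming the usual ImageNet-style train/val layout.

train_iterator, val_iterator = get_data_iterators(
    data_dir='/path/to/ILSVRC2012',  # placeholder path
    batch_size=32,
    num_workers=4,
    num_classes=1000)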
Example #2
def get_data_iterators(data_dir,
                       batch_size,
                       num_workers,
                       num_classes):
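    # Same as Example #1, but resizing/cropping is left entirely to
    # PreprocessedDataset's defaults; only the directory layout and the class
    # count are checked here.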

    train_dir_path = os.path.join(data_dir, 'train')
    train_dataset = PreprocessedDataset(root=train_dir_path)
    assert(len(directory_parsing_label_names(train_dir_path)) == num_classes)

    val_dir_path = os.path.join(data_dir, 'val')
    val_dataset = PreprocessedDataset(root=val_dir_path)
    assert (len(directory_parsing_label_names(val_dir_path)) == num_classes)

    train_iterator = iterators.MultiprocessIterator(
        dataset=train_dataset,
        batch_size=batch_size,
        repeat=False,
        shuffle=True,
        n_processes=num_workers)

    val_iterator = iterators.MultiprocessIterator(
        dataset=val_dataset,
        batch_size=batch_size,
        repeat=False,
        shuffle=False,
        n_processes=num_workers)

    return train_iterator, val_iterator
    def test_directory_parsing_label_dataset(self):
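        # Checks the dataset length for depth-1 and depth-2 directory trees, the
        # label-dataset protocol, the parsed label names ('class_0', ...), and the
        # expected ordering of dataset.img_paths.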
        dataset = DirectoryParsingLabelDataset(
            self.tmp_dir, color=self.color)

        if self.depth == 1:
            expected_length = self.n_img_per_class * self.n_class
        elif self.depth == 2:
            expected_length = \
                self.n_img_per_class * self.n_sub_directory * self.n_class
        self.assertEqual(len(dataset), expected_length)

        assert_is_label_dataset(dataset, self.n_class, color=self.color)

        label_names = directory_parsing_label_names(self.tmp_dir)
        self.assertEqual(
            label_names, ['class_{}'.format(i) for i in range(self.n_class)])

        if self.depth == 1:
            self.assertEqual(
                dataset.img_paths,
                ['{}/class_{}/img{}.{}'.format(self.tmp_dir, i, j, self.suffix)
                 for i in range(self.n_class)
                 for j in range(self.n_img_per_class)])
        elif self.depth == 2:
            self.assertEqual(
                dataset.img_paths,
                ['{}/class_{}/nested_{}/img{}.{}'.format(
                    self.tmp_dir, i, j, k, self.suffix)
                 for i in range(self.n_class)
                 for j in range(self.n_sub_directory)
                 for k in range(self.n_img_per_class)])
Example #4
    def test_directory_parsing_classification_dataset(self):
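        # Classification-dataset counterpart of the test above: the same length,
        # label-name and img_paths checks, run against
        # DirectoryParsingClassificationDataset.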
        dataset = DirectoryParsingClassificationDataset(self.tmp_dir,
                                                        color=self.color)

        if self.depth == 1:
            expected_length = self.n_img_per_class * self.n_class
        elif self.depth == 2:
            expected_length = \
                self.n_img_per_class * self.n_sub_directory * self.n_class
        self.assertEqual(len(dataset), expected_length)

        assert_is_classification_dataset(dataset,
                                         self.n_class,
                                         color=self.color)

        label_names = directory_parsing_label_names(self.tmp_dir)
        self.assertEqual(label_names,
                         ['class_{}'.format(i) for i in range(self.n_class)])

        if self.depth == 1:
            self.assertEqual(dataset.img_paths, [
                '{}/class_{}/img{}.{}'.format(self.tmp_dir, i, j, self.suffix)
                for i in range(self.n_class)
                for j in range(self.n_img_per_class)
            ])
        elif self.depth == 2:
            self.assertEqual(dataset.img_paths, [
                '{}/class_{}/nested_{}/img{}.{}'.format(
                    self.tmp_dir, i, j, k, self.suffix)
                for i in range(self.n_class)
                for j in range(self.n_sub_directory)
                for k in range(self.n_img_per_class)
            ])
    def test_numerical_sort(self):
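        # With numerical_sort=True the class directories are named by integers, so
        # directory_parsing_label_names returns '0', '1', ..., str(n_class - 1).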
        dataset = DirectoryParsingLabelDataset(
            self.tmp_dir, numerical_sort=True)

        assert_is_label_dataset(dataset, self.n_class)

        label_names = directory_parsing_label_names(
            self.tmp_dir, numerical_sort=True)
        self.assertEqual(
            label_names, ['{}'.format(i) for i in range(self.n_class)])
Example #6
    def test_numerical_sort(self):
        dataset = DirectoryParsingClassificationDataset(self.tmp_dir,
                                                        numerical_sort=True)

        assert_is_classification_dataset(dataset, self.n_class)

        label_names = directory_parsing_label_names(self.tmp_dir,
                                                    numerical_sort=True)
        self.assertEqual(label_names,
                         ['{}'.format(i) for i in range(self.n_class)])
Example #7
def main():
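    # Evaluates a pretrained classifier (VGG16 or ResNet-50/101/152) on an
    # ImageNet-style validation directory and prints the top-1 error.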
    parser = argparse.ArgumentParser(
        description='Learning convnet from ILSVRC2012 dataset')
    parser.add_argument('val', help='Path to root of the validation dataset')
    parser.add_argument(
        '--model', choices=('vgg16', 'resnet50', 'resnet101', 'resnet152'))
    parser.add_argument('--pretrained_model', default='imagenet')
    parser.add_argument('--gpu', type=int, default=-1)
    parser.add_argument('--batchsize', type=int, default=32)
    parser.add_argument('--crop', choices=('center', '10'), default='center')
    parser.add_argument('--resnet_mode', default='he')
    args = parser.parse_args()

    dataset = DirectoryParsingLabelDataset(args.val)
    label_names = directory_parsing_label_names(args.val)
    n_class = len(label_names)
    iterator = iterators.MultiprocessIterator(
        dataset, args.batchsize, repeat=False, shuffle=False,
        n_processes=6, shared_mem=300000000)

    if args.model == 'vgg16':
        extractor = VGG16(n_class, args.pretrained_model)
    elif args.model == 'resnet50':
        extractor = ResNet50(
            n_class, args.pretrained_model, mode=args.resnet_mode)
    elif args.model == 'resnet101':
        extractor = ResNet101(
            n_class, args.pretrained_model, mode=args.resnet_mode)
    elif args.model == 'resnet152':
        extractor = ResNet152(
            n_class, args.pretrained_model, mode=args.resnet_mode)
    model = FeaturePredictor(
        extractor, crop_size=224, scale_size=256, crop=args.crop)

    if args.gpu >= 0:
        chainer.cuda.get_device(args.gpu).use()
        model.to_gpu()

    print('Model has been prepared. Evaluation starts.')
    in_values, out_values, rest_values = apply_to_iterator(
        model.predict, iterator, hook=ProgressHook(len(dataset)))
    del in_values

    pred_probs, = out_values
    gt_labels, = rest_values

    accuracy = F.accuracy(
        np.array(list(pred_probs)), np.array(list(gt_labels))).data
    print()
    print('Top 1 Error {}'.format(1. - accuracy))
Example #8
def get_val_data_iterator(data_dir, batch_size, num_workers, num_classes):
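    # Returns a non-repeating, non-shuffling MultiprocessIterator over data_dir/val
    # together with the length of the validation dataset.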

    val_dir_path = os.path.join(data_dir, 'val')
    val_dataset = DirectoryParsingLabelDataset(val_dir_path)
    val_dataset_len = len(val_dataset)
    assert (len(directory_parsing_label_names(val_dir_path)) == num_classes)

    val_iterator = iterators.MultiprocessIterator(dataset=val_dataset,
                                                  batch_size=batch_size,
                                                  repeat=False,
                                                  shuffle=False,
                                                  n_processes=num_workers,
                                                  shared_mem=300000000)

    return val_iterator, val_dataset_len
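A sketch of how the iterator returned above might be consumed; the path and the numbers are placeholders, and each batch is a plain list of examples from DirectoryParsingLabelDataset.

val_iterator, val_dataset_len = get_val_data_iterator(
    data_dir='/path/to/ILSVRC2012',  # placeholder path
    batch_size=32, num_workers=4, num_classes=1000)
batch = next(val_iterator)  # list of (img, label) examples, at most batch_size long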
Example #9
def setup(dataset, model, pretrained_model, batchsize, val, crop, resnet_arch):
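    # Resolves the extractor class, pretrained weights, crop mode and default batch
    # size from the global `models` table, wraps the extractor in a FeaturePredictor
    # and returns (dataset, eval_ callback, model, batchsize).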
    dataset_name = dataset
    if dataset_name == 'imagenet':
        dataset = DirectoryParsingLabelDataset(val)
        label_names = directory_parsing_label_names(val)

    def eval_(out_values, rest_values):
        pred_probs, = out_values
        gt_labels, = rest_values

        accuracy = F.accuracy(np.array(list(pred_probs)),
                              np.array(list(gt_labels))).data
        print()
        print('Top 1 Error {}'.format(1. - accuracy))

    cls, pretrained_models, default_batchsize = models[model][:3]
    if pretrained_model is None:
        pretrained_model = pretrained_models.get(dataset_name, dataset_name)
    if crop is None:
        crop = models[model][3]
    kwargs = {
        'n_class': len(label_names),
        'pretrained_model': pretrained_model,
    }
    if model in ['resnet50', 'resnet101', 'resnet152']:
        if resnet_arch is None:
            resnet_arch = models[model][4]
        kwargs.update({'arch': resnet_arch})
    extractor = cls(**kwargs)
    model = FeaturePredictor(extractor,
                             crop_size=224,
                             scale_size=256,
                             crop=crop)

    if batchsize is None:
        batchsize = default_batchsize

    return dataset, eval_, model, batchsize
def main():
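    # Multi-node ImageNet training with ChainerMN: the dataset is scattered across
    # workers, the learning rate follows the linear scaling rule with a 5-epoch
    # warmup, and logging/snapshots happen only on rank 0.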
    model_cfgs = {
        'resnet50': {
            'class': ResNet50,
            'score_layer_name': 'fc6',
            'kwargs': {
                'arch': 'fb'
            }
        },
        'resnet101': {
            'class': ResNet101,
            'score_layer_name': 'fc6',
            'kwargs': {
                'arch': 'fb'
            }
        },
        'resnet152': {
            'class': ResNet152,
            'score_layer_name': 'fc6',
            'kwargs': {
                'arch': 'fb'
            }
        }
    }
    parser = argparse.ArgumentParser(
        description='Learning convnet from ILSVRC2012 dataset')
    parser.add_argument('train', help='Path to root of the train dataset')
    parser.add_argument('val', help='Path to root of the validation dataset')
    parser.add_argument('--model',
                        '-m',
                        choices=model_cfgs.keys(),
                        default='resnet50',
                        help='Convnet models')
    parser.add_argument('--communicator',
                        type=str,
                        default='pure_nccl',
                        help='Type of communicator')
    parser.add_argument('--loaderjob', type=int, default=4)
    parser.add_argument('--batchsize',
                        type=int,
                        default=32,
                        help='Batch size for each worker')
    parser.add_argument('--lr', type=float)
    parser.add_argument('--momentum', type=float, default=0.9)
    parser.add_argument('--weight_decay', type=float, default=0.0001)
    parser.add_argument('--out', type=str, default='result')
    parser.add_argument('--epoch', type=int, default=90)
    args = parser.parse_args()

    # https://docs.chainer.org/en/stable/chainermn/tutorial/tips_faqs.html#using-multiprocessiterator
    if hasattr(multiprocessing, 'set_start_method'):
        multiprocessing.set_start_method('forkserver')
        p = multiprocessing.Process()
        p.start()
        p.join()

    comm = chainermn.create_communicator(args.communicator)
    device = comm.intra_rank

    if args.lr is not None:
        lr = args.lr
    else:
        lr = 0.1 * (args.batchsize * comm.size) / 256
        if comm.rank == 0:
            print('lr={}: lr is selected based on the linear '
                  'scaling rule'.format(lr))

    label_names = directory_parsing_label_names(args.train)

    model_cfg = model_cfgs[args.model]
    extractor = model_cfg['class'](n_class=len(label_names),
                                   **model_cfg['kwargs'])
    extractor.pick = model_cfg['score_layer_name']
    model = Classifier(extractor)
    # Following https://arxiv.org/pdf/1706.02677.pdf,
    # the gamma of the last BN of each resblock is initialized by zeros.
    for l in model.links():
        if isinstance(l, Bottleneck):
            l.conv3.bn.gamma.data[:] = 0

    train_data = DirectoryParsingLabelDataset(args.train)
    val_data = DirectoryParsingLabelDataset(args.val)
    train_data = TransformDataset(train_data, ('img', 'label'),
                                  TrainTransform(extractor.mean))
    val_data = TransformDataset(val_data, ('img', 'label'),
                                ValTransform(extractor.mean))
    print('finished loading dataset')

    if comm.rank == 0:
        train_indices = np.arange(len(train_data))
        val_indices = np.arange(len(val_data))
    else:
        train_indices = None
        val_indices = None

    train_indices = chainermn.scatter_dataset(train_indices,
                                              comm,
                                              shuffle=True)
    val_indices = chainermn.scatter_dataset(val_indices, comm, shuffle=True)
    train_data = train_data.slice[train_indices]
    val_data = val_data.slice[val_indices]
    train_iter = chainer.iterators.MultiprocessIterator(
        train_data, args.batchsize, n_processes=args.loaderjob)
    val_iter = iterators.MultiprocessIterator(val_data,
                                              args.batchsize,
                                              repeat=False,
                                              shuffle=False,
                                              n_processes=args.loaderjob)

    optimizer = chainermn.create_multi_node_optimizer(
        CorrectedMomentumSGD(lr=lr, momentum=args.momentum), comm)
    optimizer.setup(model)
    for param in model.params():
        if param.name not in ('beta', 'gamma'):
            param.update_rule.add_hook(WeightDecay(args.weight_decay))

    if device >= 0:
        chainer.cuda.get_device(device).use()
        model.to_gpu()

    updater = chainer.training.StandardUpdater(train_iter,
                                               optimizer,
                                               device=device)

    trainer = training.Trainer(updater, (args.epoch, 'epoch'), out=args.out)

    @make_shift('lr')
    def warmup_and_exponential_shift(trainer):
        epoch = trainer.updater.epoch_detail
        warmup_epoch = 5
        if epoch < warmup_epoch:
            if lr > 0.1:
                warmup_rate = 0.1 / lr
                rate = warmup_rate \
                    + (1 - warmup_rate) * epoch / warmup_epoch
            else:
                rate = 1
        elif epoch < 30:
            rate = 1
        elif epoch < 60:
            rate = 0.1
        elif epoch < 80:
            rate = 0.01
        else:
            rate = 0.001
        return rate * lr

    trainer.extend(warmup_and_exponential_shift)
    evaluator = chainermn.create_multi_node_evaluator(
        extensions.Evaluator(val_iter, model, device=device), comm)
    trainer.extend(evaluator, trigger=(1, 'epoch'))

    log_interval = 0.1, 'epoch'
    print_interval = 0.1, 'epoch'

    if comm.rank == 0:
        trainer.extend(chainer.training.extensions.observe_lr(),
                       trigger=log_interval)
        trainer.extend(extensions.snapshot_object(
            extractor, 'snapshot_model_{.updater.epoch}.npz'),
                       trigger=(args.epoch, 'epoch'))
        trainer.extend(extensions.LogReport(trigger=log_interval))
        trainer.extend(extensions.PrintReport([
            'iteration', 'epoch', 'elapsed_time', 'lr', 'main/loss',
            'validation/main/loss', 'main/accuracy', 'validation/main/accuracy'
        ]),
                       trigger=print_interval)
        trainer.extend(extensions.ProgressBar(update_interval=10))

    trainer.run()
def main():
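    # Variant of the distributed training script above that wraps the network in
    # AdaLossScaled for adaptive loss scaling and exports loss-scale and gradient
    # statistics to CSV on rank 0.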
    # Start the multiprocessing environment
    # https://docs.chainer.org/en/stable/chainermn/tutorial/tips_faqs.html#using-multiprocessiterator
    if hasattr(multiprocessing, 'set_start_method'):
        multiprocessing.set_start_method('forkserver')
        p = multiprocessing.Process()
        p.start()
        p.join()

    # Set up workspace
    # 16 GB GPU RAM for workspace
    chainer.cuda.set_max_workspace_size(16 * 1024 * 1024 * 1024)

    # Setup the multi-node environment
    comm = chainermn.create_communicator(args.communicator)
    device = comm.intra_rank
    print(
        '==> Successfully setup communicator: "{}" rank: {} device: {} size: {}'
        .format(args.communicator, comm.rank, device, comm.size))
    set_random_seed(args, device)

    # Setup LR
    if args.lr is not None:
        lr = args.lr
    else:
        lr = 0.1 * (args.batchsize * comm.size) / 256  # TODO: why?
        if comm.rank == 0:
            print(
                'LR = {} is selected based on the linear scaling rule'.format(
                    lr))

    # Setup dataset
    train_dir = os.path.join(args.dataset_dir, 'train')
    val_dir = os.path.join(args.dataset_dir, 'val')
    label_names = datasets.directory_parsing_label_names(train_dir)
    train_data = datasets.DirectoryParsingLabelDataset(train_dir)
    val_data = datasets.DirectoryParsingLabelDataset(val_dir)
    train_data = TransformDataset(train_data, ('img', 'label'),
                                  TrainTransform(_mean, args))
    val_data = TransformDataset(val_data, ('img', 'label'),
                                ValTransform(_mean, args))
    print('==> [{}] Successfully finished loading dataset'.format(comm.rank))

    # Initializing dataset iterators
    if comm.rank == 0:
        train_indices = np.arange(len(train_data))
        val_indices = np.arange(len(val_data))
    else:
        train_indices = None
        val_indices = None

    train_indices = chainermn.scatter_dataset(train_indices,
                                              comm,
                                              shuffle=True)
    val_indices = chainermn.scatter_dataset(val_indices, comm, shuffle=True)
    train_data = train_data.slice[train_indices]
    val_data = val_data.slice[val_indices]
    train_iter = chainer.iterators.MultiprocessIterator(
        train_data, args.batchsize, n_processes=args.loaderjob)
    val_iter = iterators.MultiprocessIterator(val_data,
                                              args.batchsize,
                                              repeat=False,
                                              shuffle=False,
                                              n_processes=args.loaderjob)

    # Create the model
    kwargs = {}
    if args.first_bn_mixed16 and args.dtype == 'float16':
        print('==> Setting the first BN layer to mixed16')
        kwargs['first_bn_mixed16'] = True

    # Initialize the model
    net = models.__dict__[args.arch](n_class=len(label_names), **kwargs)
    # Following https://arxiv.org/pdf/1706.02677.pdf,
    # the gamma of the last BN of each resblock is initialized by zeros.
    for l in net.links():
        if isinstance(l, Bottleneck):
            l.conv3.bn.gamma.data[:] = 0

    # Apply ada loss transform
    recorder = AdaLossRecorder(sample_per_n_iter=100)
    # Update the model to support AdaLoss
    net = AdaLossScaled(net,
                        init_scale=args.init_scale,
                        cfg={
                            'loss_scale_method': args.loss_scale_method,
                            'scale_upper_bound': args.scale_upper_bound,
                            'accum_upper_bound': args.accum_upper_bound,
                            'update_per_n_iteration':
                            args.update_per_n_iteration,
                            'recorder': recorder,
                        },
                        transforms=[
                            AdaLossTransformLinear(),
                            AdaLossTransformBottleneck(),
                            AdaLossTransformBasicBlock(),
                            AdaLossTransformConv2DBNActiv(),
                        ],
                        verbose=args.verbose)

    if comm.rank == 0:  # print the network only on rank 0
        print(net)
    net = L.Classifier(net)
    hook = AdaLossMonitor(sample_per_n_iter=100,
                          verbose=args.verbose,
                          includes=['Grad', 'Deconvolution'])

    # Setup optimizer
    optim = chainermn.create_multi_node_optimizer(
        optimizers.CorrectedMomentumSGD(lr=lr, momentum=args.momentum), comm)
    if args.dtype == 'mixed16':
        print('==> Using FP32 update for dtype=mixed16')
        optim.use_fp32_update()  # by default use fp32 update

        # HACK: support skipping updates via the existing loss-scaling functionality
        if args.dynamic_interval is not None:
            optim.loss_scaling(interval=args.dynamic_interval, scale=None)
        else:
            optim.loss_scaling(interval=float('inf'), scale=None)
            optim._loss_scale_max = 1.0  # to prevent actual loss scaling

    optim.setup(net)

    # setup weight decay
    for param in net.params():
        if param.name not in ('beta', 'gamma'):
            param.update_rule.add_hook(WeightDecay(args.weight_decay))

    # allocate model to multiple GPUs
    if device >= 0:
        chainer.cuda.get_device(device).use()
        net.to_gpu()

    # Create an updater that performs one parameter update per minibatch drawn from train_iter
    updater = chainer.training.StandardUpdater(train_iter,
                                               optim,
                                               device=device)
    # Setup Trainer
    stop_trigger = (args.epoch, 'epoch')
    if args.iter is not None:
        stop_trigger = (args.iter, 'iteration')
    trainer = training.Trainer(updater, stop_trigger, out=args.out)

    @make_shift('lr')
    def warmup_and_exponential_shift(trainer):
        """ LR schedule for training ResNet especially.
        NOTE: lr should be within the context.
        """
        epoch = trainer.updater.epoch_detail
        warmup_epoch = 5  # NOTE: mentioned in the original ResNet paper.
        if epoch < warmup_epoch:
            if lr > 0.1:
                warmup_rate = 0.1 / lr
                rate = warmup_rate \
                    + (1 - warmup_rate) * epoch / warmup_epoch
            else:
                rate = 1
        elif epoch < 30:
            rate = 1
        elif epoch < 60:
            rate = 0.1
        elif epoch < 80:
            rate = 0.01
        else:
            rate = 0.001
        return rate * lr

    trainer.extend(warmup_and_exponential_shift)
    evaluator = chainermn.create_multi_node_evaluator(
        extensions.Evaluator(val_iter, net, device=device), comm)
    trainer.extend(evaluator, trigger=(1, 'epoch'))

    log_interval = 0.1, 'epoch'
    print_interval = 0.1, 'epoch'

    if comm.rank == 0:
        print('==========================================')
        print('Num process (COMM_WORLD): {}'.format(comm.size))
        print('Using {} communicator'.format(args.communicator))
        print('Num Minibatch-size: {}'.format(args.batchsize))
        print('Num epoch: {}'.format(args.epoch))
        print('==========================================')

        trainer.extend(chainer.training.extensions.observe_lr(),
                       trigger=log_interval)

        # NOTE: may take snapshot every iteration now
        snapshot_label = 'epoch' if args.iter is None else 'iteration'
        snapshot_trigger = (args.snapshot_freq, snapshot_label)
        snapshot_filename = ('snapshot_' + snapshot_label + '_{.updater.' +
                             snapshot_label + '}.npz')
        trainer.extend(extensions.snapshot(filename=snapshot_filename),
                       trigger=snapshot_trigger)

        trainer.extend(extensions.LogReport(trigger=log_interval))
        trainer.extend(extensions.observe_value(
            'loss_scale',
            lambda trainer: trainer.updater.get_optimizer('main')._loss_scale),
                       trigger=log_interval)
        trainer.extend(extensions.PrintReport([
            'iteration', 'epoch', 'elapsed_time', 'lr', 'loss_scale',
            'main/loss', 'validation/main/loss', 'main/accuracy',
            'validation/main/accuracy'
        ]),
                       trigger=print_interval)
        trainer.extend(extensions.ProgressBar(update_interval=10))

    if args.resume:
        serializers.load_npz(args.resume, trainer)

    recorder.trainer = trainer
    hook.trainer = trainer
    with ExitStack() as stack:
        if comm.rank == 0:
            stack.enter_context(hook)
        trainer.run()

    # store recorded results
    if comm.rank == 0:  # NOTE: only export on rank 0
        recorder.export().to_csv(os.path.join(args.out, 'loss_scale.csv'))
        hook.export_history().to_csv(os.path.join(args.out, 'grad_stats.csv'))
Example #12
def main():
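    # Single-GPU ImageNet-style training of DetNASSmallCOCO with a linearly
    # decaying learning rate (LinearShift).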
    model_cfgs = {
        'detnas_small_coco': {
            'class': DetNASSmallCOCO,
            'score_layer_name': 'fc',
            'kwargs': {
                #'n_class': 1000
            }
        },
    }
    parser = argparse.ArgumentParser(
        description='Learning convnet from ILSVRC2012 dataset')
    parser.add_argument('train', help='Path to root of the train dataset')
    parser.add_argument('val', help='Path to root of the validation dataset')
    parser.add_argument('--trial', action='store_true')
    parser.add_argument('--gpu', type=int, default=0)
    parser.add_argument(
        '--model',
        '-m',
        choices=model_cfgs.keys(),
        default='detnas_small_coco',
        help='Convnet models')
    parser.add_argument('--loaderjob', type=int, default=4)
    parser.add_argument(
        '--batchsize', type=int, help='Batch size for each worker')
    parser.add_argument('--lr', type=float)
    parser.add_argument('--momentum', type=float)
    parser.add_argument('--weight_decay', type=float)
    parser.add_argument('--out', type=str, default='result')
    parser.add_argument('--epoch', type=int)
    args = parser.parse_args()

    # https://docs.chainer.org/en/stable/chainermn/tutorial/tips_faqs.html#using-multiprocessiterator
    if hasattr(multiprocessing, 'set_start_method'):
        multiprocessing.set_start_method('forkserver')
        p = multiprocessing.Process()
        p.start()
        p.join()

    label_names = directory_parsing_label_names(args.train)

    model_cfg = model_cfgs[args.model]
    extractor = model_cfg['class'](
        n_class=len(label_names), **model_cfg['kwargs'])
    extractor.pick = model_cfg['score_layer_name']
    model = Classifier(extractor)

    train_data = DirectoryParsingLabelDataset(args.train)
    val_data = DirectoryParsingLabelDataset(args.val)
    train_data = TransformDataset(train_data, TrainTransform(extractor.mean))
    val_data = TransformDataset(val_data, ValTransform(extractor.mean))
    print('finished loading dataset')

    train_indices = np.arange(len(train_data)//(100 if args.trial else 1))
    val_indices = np.arange(len(val_data))


    """
    train_data = train_data.slice[train_indices]
    val_data = val_data.slice[val_indices]
    """
    train_iter = chainer.iterators.MultiprocessIterator(
        train_data, args.batchsize, n_processes=args.loaderjob)
    val_iter = iterators.MultiprocessIterator(
        val_data,
        args.batchsize,
        repeat=False,
        shuffle=False,
        n_processes=args.loaderjob)

    optimizer = CorrectedMomentumSGD(lr=args.lr, momentum=args.momentum)
    optimizer.setup(model)
    for param in model.params():
        if param.name not in ('beta', 'gamma'):
            param.update_rule.add_hook(WeightDecay(args.weight_decay))

    if args.gpu != -1:
        model.to_gpu(args.gpu)

    updater = chainer.training.StandardUpdater(
        train_iter, optimizer, device=args.gpu)

    trainer = training.Trainer(updater, (args.epoch, 'epoch'), out=args.out)

    trainer.extend(LinearShift('lr', (args.lr, 0.0),
                   (0, len(train_indices) / args.batchsize)))
    evaluator = extensions.Evaluator(val_iter, model)
    trainer.extend(evaluator, trigger=(1, 'epoch'))

    log_interval = 0.1, 'epoch'
    print_interval = 0.1, 'epoch'

    trainer.extend(
        chainer.training.extensions.observe_lr(), trigger=log_interval)
    trainer.extend(
        extensions.snapshot_object(extractor,
                                   'snapshot_model_{.updater.epoch}.npz'),
        trigger=(args.epoch, 'epoch'))
    trainer.extend(extensions.LogReport(trigger=log_interval))
    trainer.extend(
        extensions.PrintReport([
            'iteration', 'epoch', 'elapsed_time', 'lr', 'main/loss',
            'validation/main/loss', 'main/accuracy',
            'validation/main/accuracy'
        ]),
        trigger=print_interval)
    trainer.extend(extensions.ProgressBar(update_interval=10))

    trainer.run()
Example #13
def main():
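    # Distributed ResNet training that can also export the extractor to ONNX
    # (--export) or run a model compiled by chainer_compiler (--compile).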
    model_cfgs = {
        'resnet50': {'class': ResNet50, 'score_layer_name': 'fc6',
                     'kwargs': {'arch': 'fb'}},
        'resnet101': {'class': ResNet101, 'score_layer_name': 'fc6',
                      'kwargs': {'arch': 'fb'}},
        'resnet152': {'class': ResNet152, 'score_layer_name': 'fc6',
                      'kwargs': {'arch': 'fb'}}
    }
    parser = argparse.ArgumentParser(
        description='Learning convnet from ILSVRC2012 dataset')
    parser.add_argument('train', help='Path to root of the train dataset')
    parser.add_argument('val', help='Path to root of the validation dataset')

    parser.add_argument('--export', type=str, default=None,
                        help='Export the model to ONNX')
    parser.add_argument('--compile', type=str, default=None,
                        help='Compile the model')
    parser.add_argument('--computation_order', type=str, default=None,
                        help='Computation order in backpropagation')

    parser.add_argument('--model',
                        '-m', choices=model_cfgs.keys(), default='resnet50',
                        help='Convnet models')
    parser.add_argument('--communicator', type=str,
                        default='pure_nccl', help='Type of communicator')
    parser.add_argument('--loaderjob', type=int, default=4)
    parser.add_argument('--batchsize', type=int, default=32,
                        help='Batch size for each worker')
    parser.add_argument('--lr', type=float)
    parser.add_argument('--momentum', type=float, default=0.9)
    parser.add_argument('--weight-decay', type=float, default=0.0001)
    parser.add_argument('--out', type=str, default='result')
    parser.add_argument('--epoch', type=int, default=90)
    parser.add_argument('--iterations', '-I', type=int, default=None,
                        help='Number of iterations to train')
    parser.add_argument('--no_use_fixed_batch_dataset',
                        dest='use_fixed_batch_dataset',
                        action='store_false',
                        help='Disable the use of FixedBatchDataset')
    parser.add_argument('--compiler-log', action='store_true',
                        help='Enables compile-time logging')
    parser.add_argument('--trace', action='store_true',
                        help='Enables runtime tracing')
    parser.add_argument('--verbose', action='store_true',
                        help='Enables runtime verbose log')
    parser.add_argument('--skip_runtime_type_check', action='store_true',
                        help='Skip runtime type check')
    parser.add_argument('--dump_memory_usage', type=int, default=0,
                        help='Dump memory usage (0-2)')
    parser.add_argument('--quiet_period', type=int, default=0,
                        help='Quiet period after runtime report')
    parser.add_argument('--overwrite_batchsize', action='store_true',
                        help='Overwrite batch size')
    args = parser.parse_args()

    # https://docs.chainer.org/en/stable/chainermn/tutorial/tips_faqs.html#using-multiprocessiterator
    if hasattr(multiprocessing, 'set_start_method'):
        multiprocessing.set_start_method('forkserver')
        p = multiprocessing.Process()
        p.start()
        p.join()

    comm = chainermn.create_communicator(args.communicator)
    device = comm.intra_rank

    if args.lr is not None:
        lr = args.lr
    else:
        lr = 0.1 * (args.batchsize * comm.size) / 256
        if comm.rank == 0:
            print('lr={}: lr is selected based on the linear '
                  'scaling rule'.format(lr))

    label_names = directory_parsing_label_names(args.train)

    model_cfg = model_cfgs[args.model]
    extractor = model_cfg['class'](
        n_class=len(label_names), **model_cfg['kwargs'])
    extractor.pick = model_cfg['score_layer_name']

    # Following https://arxiv.org/pdf/1706.02677.pdf,
    # the gamma of the last BN of each resblock is initialized by zeros.
    for l in extractor.links():
        if isinstance(l, Bottleneck):
            l.conv3.bn.gamma.data[:] = 0

    if args.export is not None:
        chainer_compiler.use_unified_memory_allocator()
        extractor.to_device(device)
        x = extractor.xp.zeros((args.batchsize, 3, 224, 224)).astype('f')
        chainer_compiler.export(extractor, [x], args.export)
        return

    if args.compile is not None:
        print('run compiled model')
        chainer_compiler.use_chainerx_shared_allocator()
        extractor.to_device(device)
        # init params
        with chainer.using_config('enable_backprop', False),\
                chainer.using_config('train', False):
            x = extractor.xp.zeros((1, 3, 224, 224)).astype('f')
            extractor(x)

        compiler_kwargs = {}
        if args.compiler_log:
            compiler_kwargs['compiler_log'] = True
        runtime_kwargs = {}
        if args.trace:
            runtime_kwargs['trace'] = True
        if args.verbose:
            runtime_kwargs['verbose'] = True
        if args.skip_runtime_type_check:
            runtime_kwargs['check_types'] = False
        if args.dump_memory_usage >= 1:
            runtime_kwargs['dump_memory_usage'] = args.dump_memory_usage
            free, total = cupy.cuda.runtime.memGetInfo()
            used = total - free
            runtime_kwargs['base_memory_usage'] = used

        onnx_filename = args.compile
        if args.overwrite_batchsize:
            new_onnx_filename = ('/tmp/overwrite_batchsize_' +
                                 os.path.basename(onnx_filename))
            new_input_types = [
                input_rewriter.Type(shape=(args.batchsize, 3, 224, 224))
            ]
            input_rewriter.rewrite_onnx_file(onnx_filename,
                                             new_onnx_filename,
                                             new_input_types)
            onnx_filename = new_onnx_filename

        extractor_cc = chainer_compiler.compile_onnx(
            extractor,
            onnx_filename,
            'onnx_chainer',
            computation_order=args.computation_order,
            compiler_kwargs=compiler_kwargs,
            runtime_kwargs=runtime_kwargs,
            quiet_period=args.quiet_period)
        model = Classifier(extractor_cc)
    else:
        print('run vanilla chainer model')
        model = Classifier(extractor)

    train_data = DirectoryParsingLabelDataset(args.train)
    val_data = DirectoryParsingLabelDataset(args.val)
    train_data = TransformDataset(
        train_data, ('img', 'label'), TrainTransform(extractor.mean))
    val_data = TransformDataset(
        val_data, ('img', 'label'), ValTransform(extractor.mean))
    print('finished loading dataset')

    if comm.rank == 0:
        train_indices = np.arange(len(train_data))
        val_indices = np.arange(len(val_data))
    else:
        train_indices = None
        val_indices = None

    train_indices = chainermn.scatter_dataset(
        train_indices, comm, shuffle=True)
    val_indices = chainermn.scatter_dataset(val_indices, comm, shuffle=True)
    train_data = train_data.slice[train_indices]
    val_data = val_data.slice[val_indices]
    if args.use_fixed_batch_dataset:
        train_data = FixedBatchDataset(train_data, args.batchsize)
        val_data = FixedBatchDataset(val_data, args.batchsize)
    train_iter = chainer.iterators.MultiprocessIterator(
        train_data, args.batchsize, n_processes=args.loaderjob)
    val_iter = iterators.MultiprocessIterator(
        val_data, args.batchsize,
        repeat=False, shuffle=False, n_processes=args.loaderjob)

    optimizer = chainermn.create_multi_node_optimizer(
        CorrectedMomentumSGD(lr=lr, momentum=args.momentum), comm)
    optimizer.setup(model)
    for param in model.params():
        if param.name not in ('beta', 'gamma'):
            param.update_rule.add_hook(WeightDecay(args.weight_decay))

    if device >= 0:
        chainer.cuda.get_device(device).use()
        model.to_gpu()

    updater = chainer.training.StandardUpdater(
        train_iter, optimizer, device=device)

    if args.iterations:
        stop_trigger = (args.iterations, 'iteration')
    else:
        stop_trigger = (args.epoch, 'epoch')
    trainer = training.Trainer(
        updater, stop_trigger, out=args.out)

    @make_shift('lr')
    def warmup_and_exponential_shift(trainer):
        epoch = trainer.updater.epoch_detail
        warmup_epoch = 5
        if epoch < warmup_epoch:
            if lr > 0.1:
                warmup_rate = 0.1 / lr
                rate = warmup_rate \
                    + (1 - warmup_rate) * epoch / warmup_epoch
            else:
                rate = 1
        elif epoch < 30:
            rate = 1
        elif epoch < 60:
            rate = 0.1
        elif epoch < 80:
            rate = 0.01
        else:
            rate = 0.001
        return rate * lr

    trainer.extend(warmup_and_exponential_shift)
    evaluator = chainermn.create_multi_node_evaluator(
        extensions.Evaluator(val_iter, model, device=device), comm)
    trainer.extend(evaluator, trigger=(1, 'epoch'))

    log_interval = 0.1, 'epoch'
    print_interval = 0.1, 'epoch'

    if comm.rank == 0:
        trainer.extend(chainer.training.extensions.observe_lr(),
                       trigger=log_interval)
        trainer.extend(
            extensions.snapshot_object(
                extractor, 'snapshot_model_{.updater.epoch}.npz'),
            trigger=(args.epoch, 'epoch'))
        trainer.extend(extensions.LogReport(trigger=log_interval))
        trainer.extend(extensions.PrintReport(
            ['iteration', 'epoch', 'elapsed_time', 'lr',
             'main/loss', 'validation/main/loss',
             'main/accuracy', 'validation/main/accuracy']
        ), trigger=print_interval)
        trainer.extend(extensions.ProgressBar(update_interval=10))

    trainer.run()
def main():
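    # Elastic distributed training built on an echainer communicator: optimizer and
    # iterator state are registered, checkpointed on failures, and re-synchronised
    # when the cluster membership changes.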
    archs = {
        'resnet50': {'class': ResNet50, 'score_layer_name': 'fc6',
                     'kwargs': {'arch': 'fb'}},
        'resnet101': {'class': ResNet101, 'score_layer_name': 'fc6',
                      'kwargs': {'arch': 'fb'}},
        'resnet152': {'class': ResNet152, 'score_layer_name': 'fc6',
                      'kwargs': {'arch': 'fb'}}
    }
    parser = argparse.ArgumentParser(
        description='Learning convnet from ILSVRC2012 dataset')
    parser.add_argument('train', help='Path to root of the train dataset')
    parser.add_argument('val', help='Path to root of the validation dataset')
    parser.add_argument('--arch',
                        '-a', choices=archs.keys(), default='resnet50',
                        help='Convnet architecture')
    parser.add_argument('--loaderjob', type=int, default=4)
    parser.add_argument('--batchsize', type=int, default=32,
                        help='Batch size for each worker')
    parser.add_argument('--lr', type=float)
    parser.add_argument('--momentum', type=float, default=0.9)
    parser.add_argument('--weight_decay', type=float, default=0.0001)
    parser.add_argument('--out', type=str, default='result')
    parser.add_argument('--epoch', type=int, default=90)

    parser.add_argument('--min', type=int, required=True,
                        help='Minimum number of processes')
    parser.add_argument('--start', type=int, required=True,
                        help='Number of processes to start')
    parser.add_argument('--bind', '-p', type=str, required=True,
                        help='address to bind gRPC server')
    parser.add_argument('--etcd', '-c', type=str, default='etcd://*****:*****')

        @make_shift('lr')
        def warmup_and_exponential_shift(trainer):
            epoch = trainer.updater.epoch_detail
            warmup_epoch = 5
            if epoch < warmup_epoch:
                if lr > 0.1:
                    warmup_rate = 0.1 / lr
                    rate = warmup_rate \
                           + (1 - warmup_rate) * epoch / warmup_epoch
                else:
                    rate = 1
            elif epoch < 30:
                rate = 1
            elif epoch < 60:
                rate = 0.1
            elif epoch < 80:
                rate = 0.01
            else:
                rate = 0.001
            return rate * lr
        trainer.extend(warmup_and_exponential_shift)

        evaluator = chainermn.create_multi_node_evaluator(
            extensions.Evaluator(val_iter, model, device=device), comm)
        trainer.extend(evaluator, trigger=(1, 'epoch'))
        trainer.extend(comm.get_uninitializer(), trigger=(1, 'iteration'))

        log_interval = 0.1, 'epoch'
        print_interval = 0.5, 'epoch'
        plot_interval = 1, 'epoch'

        if comm.intra_rank == 0:
            # TODO: lr is not properly controlled for accuracy
            trainer.extend(chainer.training.extensions.observe_lr(),
                           trigger=log_interval)
            trainer.extend(echainer.extension.Lineage(comm, trigger=log_interval))
            trainer.extend(extensions.PrintReport(
                ['iteration', 'epoch', 'elapsed_time', 'lr',
                 'main/loss', 'validation/main/loss',
                 'main/accuracy', 'validation/main/accuracy'],
                log_report='Lineage'), trigger=print_interval)
            trainer.extend(extensions.ProgressBar(update_interval=10))

            if extensions.PlotReport.available():
                trainer.extend(
                    extensions.PlotReport(
                        ['main/loss', 'validation/main/loss'],
                        file_name='loss.png', trigger=plot_interval
                    ),
                    trigger=plot_interval
                )
                trainer.extend(
                    extensions.PlotReport(
                        ['main/accuracy', 'validation/main/accuracy'],
                        file_name='accuracy.png', trigger=plot_interval
                    ),
                trigger=plot_interval
                )

        # The optimizer state includes the model parameters as well as the optimizer's own internal parameters
        comm.register_state('optimizer', optimizer)
        comm.register_state('iterator', train_iter)
        if retry or not comm.initial:
            (iteration, epoch) = comm.fetch_state('optimizer', optimizer)
            # train_iter.epoch = epoch
            comm.fetch_state('iterator', train_iter)
            updater.iteration = iteration

        optimizers = trainer.updater.get_all_optimizers()
        for name in optimizers.keys():
            optimizers[name].reset_prev_params()

        try:
            print('start trainer.run(), ', trainer.updater.iteration, trainer.updater.epoch)
            trainer.run()
            done = trainer._done
        except CommException as ce:
            print("Comm exception >>>>>>>>>>>", ce, updater.iteration, updater.epoch)
            comm.save_all_states(updater.iteration, updater.epoch)
            # Here comm will be ready to accept fetch-state calls, and once all
            # nodes have caught up it'll return and continue to run: TODO
            comm.sync_cluster(trainer.updater.get_all_optimizers())
            retry = True
            continue
        except ClusterUpdatedException as ce:
            print("Cluster updated: >>>>>>>>>>>", ce)
            comm.save_all_states(updater.iteration, updater.epoch)
            comm.sync_cluster(trainer.updater.get_all_optimizers())
            retry = True
            continue
        except Exception as e:
            print("Unexpected >>>>>>>>>>>", e)
            break

    comm.leave()