Esempio n. 1
0
def main():
    """Train a ``models.ResNet50V1`` classifier on the flowers data.

    Parses the command line, optionally initializes the model from an npz
    file, fetches and pre-processes the data through the ``data`` module,
    then runs a Chainer ``Trainer`` for a fixed number of epochs. When
    training finishes, the model is saved as ``model.npz`` in the output
    directory and the results are uploaded via ``output.upload_result``.

    Returns early (after printing a message) if the images or labels
    cannot be fetched.
    """
    parser = argparse.ArgumentParser(description="Learning from flowers data")
    parser.add_argument("--gpu",
                        "-g",
                        type=int,
                        default=-1,
                        help="GPU ID (negative value indicates CPU)")
    parser.add_argument("--init", help="Initialize the model from given file")
    # No default: ``None`` is handed to MultiprocessIterator as-is.
    parser.add_argument('--job',
                        '-j',
                        type=int,
                        help='Number of parallel data loading processes')
    parser.add_argument("--resume",
                        '-r',
                        default='',
                        help="Initialize the trainer from given file")
    args = parser.parse_args()

    # Fixed hyper-parameters of this script.
    batch = 32
    epoch = 50
    val_batch = 200

    model = models.ResNet50V1(data.ClassNumber)
    if args.init:
        # The option is registered as ``--init``; there is no ``initmodel``
        # attribute on the parsed namespace.
        print('Load model from', args.init)
        chainer.serializers.load_npz(args.init, model)
    if args.gpu >= 0:
        chainer.backends.cuda.get_device_from_id(args.gpu).use()
        model.to_gpu()

    if data.fetch_flowers() and data.fetch_labels():
        print("Flower images and labels have been fetched.")
    else:
        print("Failed to fetch flower images and labels")
        return

    data.pre_process_data(224)

    output_name = output.init_train(model.__class__)
    output_path = path.join(output.OutPath, output_name)

    train, validate = data.get_datasets()

    train_iter = chainer.iterators.MultiprocessIterator(
        train, batch, n_processes=args.job)
    # The validation iterator makes a single pass per evaluation.
    val_iter = chainer.iterators.MultiprocessIterator(
        validate, val_batch, repeat=False, n_processes=args.job)

    classifier = chainer.links.Classifier(model)
    optimizer = chainer.optimizers.Adam()
    optimizer.setup(classifier)
    # Exclude the ``base`` sub-link from optimizer updates
    # (presumably the pretrained backbone -- confirm against models.ResNet50V1).
    model.base.disable_update()

    updater = training.updaters.StandardUpdater(train_iter,
                                                optimizer,
                                                device=args.gpu)
    trainer = training.Trainer(updater, (epoch, 'epoch'), output_path)

    val_interval = 500, 'iteration'
    log_interval = 250, 'iteration'
    snapshot_interval = 5000, 'iteration'

    trainer.extend(extensions.Evaluator(val_iter, classifier, device=args.gpu),
                   trigger=val_interval)
    trainer.extend(extensions.dump_graph('main/loss'))

    # Save both the full trainer state (for --resume) and the bare model.
    trainer.extend(extensions.snapshot(), trigger=snapshot_interval)
    trainer.extend(
        extensions.snapshot_object(model, 'model_iter_{.updater.iteration}'),
        trigger=snapshot_interval)

    trainer.extend(extensions.LogReport(trigger=log_interval))
    trainer.extend(extensions.observe_lr(), trigger=log_interval)
    trainer.extend(
        extensions.PrintReport([
            'epoch', 'iteration', 'main/loss', 'validation/main/loss',
            'main/accuracy', 'validation/main/accuracy', 'lr'
        ]),
        trigger=log_interval)
    trainer.extend(extensions.ProgressBar(update_interval=10))

    if args.resume:
        # Restore after all extensions are registered so their state is
        # loaded from the snapshot as well.
        chainer.serializers.load_npz(args.resume, trainer)

    print("Start training")
    trainer.run()

    # Move the weights back to host memory before serializing them.
    model.to_cpu()
    chainer.serializers.save_npz(path.join(output_path, "model.npz"), model)
    print("Uploading files")
    output.upload_result(output_name)
    print("Finish training")
Esempio n. 2
0
def train(args):
    """Train a TTS model with the given args.

    The function reads the dimension info from ``args.valid_json``, writes
    the model configuration to ``<args.outdir>/model.json``, builds the
    model class named by ``args.model_module`` together with its optimizer,
    prepares the train/validation iterators and finally runs a Chainer
    ``Trainer`` with evaluation, snapshot, plotting and logging extensions.

    As a side effect ``args`` is mutated: ``spk_embed_dim`` and ``spc_dim``
    are set, ``batch_size`` is multiplied by ``ngpu`` in multi-GPU mode and
    ``batch_sort_key`` is forced to ``"input"`` when sortagrad is enabled.

    :param Namespace args: The program arguments
    :raises NotImplementedError: if ``args.opt`` is neither ``adam`` nor
        ``noam``
    """
    set_deterministic_pytorch(args)

    # check cuda availability
    if not torch.cuda.is_available():
        logging.warning('cuda is not available')

    # get input and output dimension info
    with open(args.valid_json, 'rb') as f:
        valid_json = json.load(f)['utts']
    utts = list(valid_json.keys())

    # reverse input and output dimension: the model input size is read from
    # the json "output" entry and the model output size from "input"
    idim = int(valid_json[utts[0]]['output'][0]['shape'][1])
    odim = int(valid_json[utts[0]]['input'][0]['shape'][1])
    logging.info('#input dims : ' + str(idim))
    logging.info('#output dims: ' + str(odim))

    # get extra input and output dimension
    if args.use_speaker_embedding:
        args.spk_embed_dim = int(valid_json[utts[0]]['input'][1]['shape'][0])
    else:
        args.spk_embed_dim = None
    if args.use_second_target:
        args.spc_dim = int(valid_json[utts[0]]['input'][1]['shape'][1])
    else:
        args.spc_dim = None

    # write model config
    # exist_ok avoids the check-then-create race of exists() + makedirs()
    os.makedirs(args.outdir, exist_ok=True)
    model_conf = args.outdir + '/model.json'
    with open(model_conf, 'wb') as f:
        logging.info('writing a model config file to ' + model_conf)
        f.write(
            json.dumps((idim, odim, vars(args)),
                       indent=4,
                       ensure_ascii=False,
                       sort_keys=True).encode('utf_8'))
    for key in sorted(vars(args).keys()):
        logging.info('ARGS: ' + key + ': ' + str(vars(args)[key]))

    # specify model architecture
    model_class = dynamic_import(args.model_module)
    model = model_class(idim, odim, args)
    assert isinstance(model, TTSInterface)
    logging.info(model)
    # keep a handle on the reporter before the model may get wrapped below
    reporter = model.reporter

    # check the use of multi-gpu
    if args.ngpu > 1:
        model = torch.nn.DataParallel(model, device_ids=list(range(args.ngpu)))
        if args.batch_size != 0:
            logging.info('batch size is automatically increased (%d -> %d)' %
                         (args.batch_size, args.batch_size * args.ngpu))
            args.batch_size *= args.ngpu

    # set torch device
    device = torch.device("cuda" if args.ngpu > 0 else "cpu")
    model = model.to(device)

    # Setup an optimizer
    if args.opt == 'adam':
        optimizer = torch.optim.Adam(model.parameters(),
                                     args.lr,
                                     eps=args.eps,
                                     weight_decay=args.weight_decay)
    elif args.opt == 'noam':
        from espnet.nets.pytorch_backend.transformer.optimizer import get_std_opt
        optimizer = get_std_opt(model, args.adim,
                                args.transformer_warmup_steps,
                                args.transformer_lr)
    else:
        raise NotImplementedError("unknown optimizer: " + args.opt)

    # FIXME: TOO DIRTY HACK
    # attach chainer-style ``target`` / ``serialize`` attributes to the
    # torch optimizer, both backed by the model's reporter
    optimizer.target = reporter
    optimizer.serialize = lambda s: reporter.serialize(s)

    # read json data
    with open(args.train_json, 'rb') as f:
        train_json = json.load(f)['utts']
    with open(args.valid_json, 'rb') as f:
        valid_json = json.load(f)['utts']

    # sortagrad is on for -1 (all epochs) or any positive epoch count
    use_sortagrad = args.sortagrad == -1 or args.sortagrad > 0
    if use_sortagrad:
        args.batch_sort_key = "input"
    # make minibatch list (variable length)
    train_batchset = make_batchset(
        train_json,
        args.batch_size,
        args.maxlen_in,
        args.maxlen_out,
        args.minibatches,
        batch_sort_key=args.batch_sort_key,
        min_batch_size=args.ngpu if args.ngpu > 1 else 1,
        shortest_first=use_sortagrad,
        count=args.batch_count,
        batch_bins=args.batch_bins,
        batch_frames_in=args.batch_frames_in,
        batch_frames_out=args.batch_frames_out,
        batch_frames_inout=args.batch_frames_inout,
        swap_io=True)
    valid_batchset = make_batchset(
        valid_json,
        args.batch_size,
        args.maxlen_in,
        args.maxlen_out,
        args.minibatches,
        batch_sort_key=args.batch_sort_key,
        min_batch_size=args.ngpu if args.ngpu > 1 else 1,
        count=args.batch_count,
        batch_bins=args.batch_bins,
        batch_frames_in=args.batch_frames_in,
        batch_frames_out=args.batch_frames_out,
        batch_frames_inout=args.batch_frames_inout,
        swap_io=True)

    load_tr = LoadInputsAndTargets(
        mode='tts',
        use_speaker_embedding=args.use_speaker_embedding,
        use_second_target=args.use_second_target,
        preprocess_conf=args.preprocess_conf,
        preprocess_args={'train': True},  # Switch the mode of preprocessing
        keep_all_data_on_mem=args.keep_all_data_on_mem,
    )

    load_cv = LoadInputsAndTargets(
        mode='tts',
        use_speaker_embedding=args.use_speaker_embedding,
        use_second_target=args.use_second_target,
        preprocess_conf=args.preprocess_conf,
        preprocess_args={'train': False},  # Switch the mode of preprocessing
        keep_all_data_on_mem=args.keep_all_data_on_mem,
    )

    # hack to make batchsize argument as 1
    # actual batchsize is included in a list
    if args.num_iter_processes > 0:
        train_iter = ToggleableShufflingMultiprocessIterator(
            TransformDataset(train_batchset, load_tr),
            batch_size=1,
            n_processes=args.num_iter_processes,
            n_prefetch=8,
            maxtasksperchild=20,
            shuffle=not use_sortagrad)
        valid_iter = ToggleableShufflingMultiprocessIterator(
            TransformDataset(valid_batchset, load_cv),
            batch_size=1,
            repeat=False,
            shuffle=False,
            n_processes=args.num_iter_processes,
            n_prefetch=8,
            maxtasksperchild=20)
    else:
        train_iter = ToggleableShufflingSerialIterator(
            TransformDataset(train_batchset, load_tr),
            batch_size=1,
            shuffle=not use_sortagrad)
        valid_iter = ToggleableShufflingSerialIterator(
            TransformDataset(valid_batchset, load_cv),
            batch_size=1,
            repeat=False,
            shuffle=False)

    # Set up a trainer
    converter = CustomConverter()
    updater = CustomUpdater(model, args.grad_clip, train_iter, optimizer,
                            converter, device, args.accum_grad)
    trainer = training.Trainer(updater, (args.epochs, 'epoch'),
                               out=args.outdir)

    # Resume from a snapshot
    if args.resume:
        logging.info('resumed from %s' % args.resume)
        torch_resume(args.resume, trainer)

    # Evaluate the model with the validation dataset
    trainer.extend(
        CustomEvaluator(model, valid_iter, reporter, converter, device))

    # set intervals
    save_interval = (args.save_interval_epochs, 'epoch')
    report_interval = (args.report_interval_iters, 'iteration')

    # Save snapshot at every save interval
    trainer.extend(torch_snapshot(), trigger=save_interval)

    # Save best models
    trainer.extend(snapshot_object(model, 'model.loss.best'),
                   trigger=training.triggers.MinValueTrigger(
                       'validation/main/loss', trigger=save_interval))

    # Save attention figure at every save interval
    if args.num_save_attention > 0:
        # take the first N validation utterances, then order them by the
        # second dimension of their first "input" shape, longest first
        data = sorted(list(valid_json.items())[:args.num_save_attention],
                      key=lambda x: int(x[1]['input'][0]['shape'][1]),
                      reverse=True)
        # DataParallel hides the real model behind ``.module``
        if hasattr(model, "module"):
            att_vis_fn = model.module.calculate_all_attentions
            plot_class = model.module.attention_plot_class
        else:
            att_vis_fn = model.calculate_all_attentions
            plot_class = model.attention_plot_class
        att_reporter = plot_class(att_vis_fn,
                                  data,
                                  args.outdir + '/att_ws',
                                  converter=converter,
                                  transform=load_cv,
                                  device=device,
                                  reverse=True)
        trainer.extend(att_reporter, trigger=save_interval)
    else:
        att_reporter = None

    # Make a plot for training and validation values
    if hasattr(model, "module"):
        base_plot_keys = model.module.base_plot_keys
    else:
        base_plot_keys = model.base_plot_keys
    plot_keys = []
    for key in base_plot_keys:
        plot_key = ['main/' + key, 'validation/main/' + key]
        trainer.extend(
            extensions.PlotReport(plot_key, 'epoch', file_name=key + '.png'))
        plot_keys += plot_key
    trainer.extend(
        extensions.PlotReport(plot_keys, 'epoch', file_name='all_loss.png'))

    # Write a log of evaluation statistics at every report interval
    trainer.extend(extensions.LogReport(trigger=report_interval))
    report_keys = ['epoch', 'iteration', 'elapsed_time'] + plot_keys
    trainer.extend(extensions.PrintReport(report_keys),
                   trigger=report_interval)
    trainer.extend(extensions.ProgressBar())

    set_early_stop(trainer, args)
    if args.tensorboard_dir is not None and args.tensorboard_dir != "":
        writer = SummaryWriter(args.tensorboard_dir)
        trainer.extend(TensorboardLogger(writer, att_reporter),
                       trigger=report_interval)

    if use_sortagrad:
        # schedule ShufflingEnabler for the training iterator after
        # ``args.sortagrad`` epochs (``-1`` means ``args.epochs``)
        trainer.extend(
            ShufflingEnabler([train_iter]),
            trigger=(args.sortagrad if args.sortagrad != -1 else args.epochs,
                     'epoch'))

    # Run the training
    trainer.run()
    check_early_stop(trainer, args.epochs)
Esempio n. 3
0
def main():
    """Train an MLP on MNIST with ChainerMN and automatic checkpoints.

    Parses the command line, creates a ChainerMN communicator (the
    ``naive`` one is forced for CPU-only runs), scatters the MNIST dataset
    loaded on rank 0 to all workers and runs a Chainer ``Trainer`` with a
    multi-node optimizer, evaluator and checkpointer. Reporting extensions
    are registered on rank 0 only.

    Exits with status ``-1`` when ``--gpu`` is combined with the ``naive``
    communicator.
    """
    # Local import: ``exit()`` is only injected by the ``site`` module, so
    # ``sys.exit`` is used instead.
    import sys

    parser = argparse.ArgumentParser(description='''\
ChainerMN example: MNIST with automatic checkpoints enabled''')
    parser.add_argument('--batchsize',
                        '-b',
                        type=int,
                        default=100,
                        help='Number of images in each mini-batch')
    parser.add_argument('--communicator',
                        type=str,
                        default='hierarchical',
                        help='Type of communicator')
    parser.add_argument('--epoch',
                        '-e',
                        type=int,
                        default=20,
                        help='Number of sweeps over the dataset to train')
    parser.add_argument('--gpu', '-g', action='store_true', help='Use GPU')
    parser.add_argument('--out',
                        '-o',
                        default='result',
                        help='Directory to output the result')
    parser.add_argument('--unit',
                        '-u',
                        type=int,
                        default=1000,
                        help='Number of units')
    parser.add_argument('--run-id',
                        type=str,
                        default='train-mnist-example',
                        help='ID of the task name')
    args = parser.parse_args()

    # Prepare ChainerMN communicator.
    # ``communicator_name`` records the communicator actually created, which
    # differs from ``args.communicator`` on the CPU-only path.
    if args.gpu:
        if args.communicator == 'naive':
            print("Error: 'naive' communicator does not support GPU.\n")
            sys.exit(-1)
        communicator_name = args.communicator
        comm = chainermn.create_communicator(communicator_name)
        device = comm.intra_rank
    else:
        if args.communicator != 'naive':
            print('Warning: using naive communicator '
                  'because only naive supports CPU-only execution')
        communicator_name = 'naive'
        comm = chainermn.create_communicator(communicator_name)
        device = -1

    if comm.mpi_comm.rank == 0:
        print('==========================================')
        print('Num process (COMM_WORLD): {}'.format(MPI.COMM_WORLD.Get_size()))
        if args.gpu:
            print('Using GPUs')
        # Report the communicator in use, not merely the requested one.
        print('Using {} communicator'.format(communicator_name))
        print('Num unit: {}'.format(args.unit))
        print('Num Minibatch-size: {}'.format(args.batchsize))
        print('Num epoch: {}'.format(args.epoch))
        print('==========================================')

    model = L.Classifier(MLP(args.unit, 10))
    if device >= 0:
        # ``get_device`` is deprecated; ``get_device_from_id`` is the
        # replacement for integer device ids.
        chainer.cuda.get_device_from_id(device).use()
        model.to_gpu()

    # Create a multi node optimizer from a standard Chainer optimizer.
    optimizer = chainermn.create_multi_node_optimizer(
        chainer.optimizers.Adam(), comm)
    optimizer.setup(model)

    # Split and distribute the dataset. Only worker 0 loads the whole dataset.
    # Datasets of worker 0 are evenly split and distributed to all workers.
    if comm.rank == 0:
        train, test = chainer.datasets.get_mnist()
    else:
        train, test = None, None
    train = chainermn.scatter_dataset(train, comm, shuffle=True)
    test = chainermn.scatter_dataset(test, comm, shuffle=True)

    train_iter = chainer.iterators.SerialIterator(train, args.batchsize)
    test_iter = chainer.iterators.SerialIterator(test,
                                                 args.batchsize,
                                                 repeat=False,
                                                 shuffle=False)

    updater = training.StandardUpdater(train_iter, optimizer, device=device)
    trainer = training.Trainer(updater, (args.epoch, 'epoch'), out=args.out)

    # Enable checkpointer and recover from checkpoint if any checkpoint exists
    checkpointer = create_multi_node_checkpointer(name=args.run_id, comm=comm)
    checkpointer.maybe_load(trainer, optimizer)
    print("Rank", comm.rank, ": (Re)Starting from (epoch, iter) =",
          (trainer.updater.epoch, trainer.updater.iteration))
    trainer.extend(checkpointer, trigger=(1000, 'iteration'))

    # Create a multi node evaluator from a standard Chainer evaluator.
    evaluator = extensions.Evaluator(test_iter, model, device=device)
    evaluator = chainermn.create_multi_node_evaluator(evaluator, comm)
    trainer.extend(evaluator)

    # Some display and output extensions are necessary only for one worker.
    # (Otherwise, there would just be repeated outputs.)
    if comm.rank == 0:
        trainer.extend(extensions.dump_graph('main/loss'))
        trainer.extend(extensions.LogReport())
        trainer.extend(
            extensions.PrintReport([
                'epoch', 'main/loss', 'validation/main/loss', 'main/accuracy',
                'validation/main/accuracy', 'elapsed_time'
            ]))
        trainer.extend(extensions.ProgressBar())

    trainer.run()
Esempio n. 4
0
    opts = {}
    encoder = InceptionResNetV2()
    serializers.load_npz(config.pretrainmodel_path, encoder)
    clmodel = Classification(encoder, identityn).to_gpu(config.gpu)
    models = clmodel
    updater_args = {"iterator": {'main': train_iter}, "device": args.gpu}

    opts["opt"] = make_sgd_optimizer(clmodel, config.lr)
    updater_args["optimizer"] = opts
    updater_args["models"] = models
    updater_args["config"] = config
    updater = Updater(**updater_args)
    report_keys = ["loss"]

    trainer = training.Trainer(updater, (config.max_iter, 'iteration'),
                               out=config.out)
    trainer.extend(
        MultistepShift('lr',
                       0.1, [config.epoch_iter * 10, config.epoch_iter * 20],
                       1e-2,
                       optimizer=opts["opt"]))
    trainer.extend(extensions.snapshot_object(
        clmodel, clmodel.__class__.__name__ + '_{.updater.iteration}.npz'),
                   trigger=(config.snapshot_interval, 'iteration'))
    trainer.extend(
        extensions.LogReport(keys=report_keys,
                             trigger=(config.display_interval, 'iteration')))
    trainer.extend(extensions.PrintReport(report_keys),
                   trigger=(config.display_interval, 'iteration'))
    trainer.extend(extensions.ProgressBar(update_interval=10))
    trainer.extend(eval_classification(train_dataset,
Esempio n. 5
0
    # default is 'softmax cross entropy loss'
    model = L.Classifier(model)

    # optimizer setup
    optimizer = optimizers.MomentumSGD()

    # Give the optimizer a reference to the model
    optimizer.setup(model)

    # Get an updater that uses the Iterator and Optimizer
    updater = training.updaters.StandardUpdater(train_iter,
                                                optimizer,
                                                device=gpu_id)

    # Setup a trainer
    trainer = training.Trainer(updater, (max_epoch, "epoch"),
                               out='mnist_result')

    # Add Extensions to the Trainer object
    from chainer.training import extensions

    trainer.extend(extensions.LogReport())
    trainer.extend(
        extensions.snapshot(filename='snapshot_epoch-{.updater.epoch}'))
    trainer.extend(
        extensions.snapshot_object(model.predictor,
                                   filename='model_epoch-{.updater.epoch}'))
    trainer.extend(extensions.Evaluator(test_iter, model, device=gpu_id))
    trainer.extend(
        extensions.PrintReport([
            'epoch', 'main/loss', 'main/accuracy', 'validation/main/loss',
            'validation/main/accuracy', 'elapsed_time'
Esempio n. 6
0
def main():
    """Train a ``VGG(10)`` classifier with MomentumSGD and the Chainer trainer.

    Command-line options select the mini-batch size, number of epochs, GPU
    id, output directory, snapshot to resume from and the SGD learning
    rate. The train/test data are loaded with ``get_data`` from fixed
    directories. During training the model is evaluated every epoch, the
    learning rate is halved every 25 epochs, snapshots are written every 10
    epochs and the model object is saved once ``args.epoch`` epochs are
    reached.
    """
    cli = argparse.ArgumentParser(description='Chainer example:cfiar-VGG')
    # (long flag, short flag, keyword options), in --help order.
    option_table = (
        ('--batchsize', '-b',
         dict(type=int, default=64,
              help='Number of images in each mini-batch')),
        ('--epoch', '-e',
         dict(type=int, default=300,
              help='Number of sweeps over the dataset to train')),
        ('--frequency', '-f',
         dict(type=int, default=-1,
              help='Frequency of taking a snapshot')),
        ('--gpu', '-g',
         dict(type=int, default=0,
              help='GPU ID (negative value indicates CPU)')),
        ('--out', '-o',
         dict(default='result',
              help='Directory to output the result')),
        ('--resume', '-r',
         dict(default='',
              help='Resume the training from snapshot')),
        ('--learnrate', '-l',
         dict(type=float, default=0.05,
              help='Learning rate for SGD')),
    )
    for long_flag, short_flag, options in option_table:
        cli.add_argument(long_flag, short_flag, **options)
    args = cli.parse_args()

    # Echo the run configuration.
    banner = (
        ('GPU: {}', args.gpu),
        ('# Minibatch-size: {}', args.batchsize),
        ('# epoch: {}', args.epoch),
    )
    for template, value in banner:
        print(template.format(value))
    print('')

    train_set = get_data(
        r"/home/notsuji/chainer_src/origin/mnist_data/Data/train")
    test_set = get_data(
        r"/home/notsuji/chainer_src/origin/mnist_data/Data/test")

    # Model wrapped in a Classifier link.
    classifier = L.Classifier(VGG(10))
    use_gpu = args.gpu >= 0
    if use_gpu:
        chainer.cuda.get_device_from_id(args.gpu).use()
        classifier.to_gpu()

    # MomentumSGD with weight decay.
    sgd = chainer.optimizers.MomentumSGD(args.learnrate)
    sgd.setup(classifier)
    sgd.add_hook(chainer.optimizer.WeightDecay(5e-4))

    train_it = chainer.iterators.SerialIterator(train_set, args.batchsize)
    # Single ordered pass over the test data per evaluation.
    test_it = chainer.iterators.SerialIterator(
        test_set, args.batchsize, repeat=False, shuffle=False)

    step = training.StandardUpdater(train_it, sgd, device=args.gpu)
    loop = training.Trainer(step, (args.epoch, 'epoch'), out=args.out)

    # Evaluation on the test set after every epoch.
    loop.extend(
        extensions.Evaluator(test_it, classifier, device=args.gpu),
        trigger=(1, 'epoch'))

    # Halve 'lr' every 25 epochs.
    loop.extend(extensions.ExponentialShift('lr', 0.5), trigger=(25, 'epoch'))

    # Dump the computational graph rooted at the main optimizer's loss.
    loop.extend(extensions.dump_graph('main/loss'))

    # Full trainer snapshot every 10 epochs.
    trainer_snapshot = extensions.snapshot(
        filename='snapshot_epoch_{.updater.epoch}')
    loop.extend(trainer_snapshot, trigger=(10, 'epoch'))

    # Model-only snapshot, triggered at ``args.epoch`` epochs.
    model_snapshot = extensions.snapshot_object(
        classifier, 'model_epoch_{.updater.epoch}')
    loop.extend(model_snapshot, trigger=(args.epoch, 'epoch'))

    # Per-epoch log of the reported statistics.
    loop.extend(extensions.LogReport())

    # Loss and accuracy plots, when PlotReport reports itself available.
    if extensions.PlotReport.available():
        loss_plot = extensions.PlotReport(
            ['main/loss', 'validation/main/loss'],
            'epoch',
            file_name='loss.png')
        loop.extend(loss_plot)
        accuracy_plot = extensions.PlotReport(
            ['main/accuracy', 'validation/main/accuracy'],
            'epoch',
            file_name='accuracy.png')
        loop.extend(accuracy_plot)

    report_columns = [
        'epoch', 'main/loss', 'validation/main/loss', 'main/accuracy',
        'validation/main/accuracy', 'elapsed_time'
    ]
    loop.extend(extensions.PrintReport(report_columns))

    # Progress bar on stdout.
    loop.extend(extensions.ProgressBar())

    if args.resume:
        # Restore trainer state from the given snapshot file.
        chainer.serializers.load_npz(args.resume, loop)

    loop.run()
Esempio n. 7
0
def train(args):
    """Train with the given args.

    Args:
        args (namespace): The program arguments.

    """
    set_deterministic_pytorch(args)
    if args.num_encs > 1:
        args = format_mulenc_args(args)

    # check cuda availability
    if not torch.cuda.is_available():
        logging.warning("cuda is not available")

    # get input and output dimension info
    with open(args.valid_json, "rb") as f:
        valid_json = json.load(f)["utts"]
    utts = list(valid_json.keys())
    idim_list = [
        int(valid_json[utts[0]]["input"][i]["shape"][-1])
        for i in range(args.num_encs)
    ]
    odim = int(valid_json[utts[0]]["output"][0]["shape"][-1])
    if hasattr(args, "decoder_mode") and args.decoder_mode == "maskctc":
        odim += 1  # for the <mask> token
    for i in range(args.num_encs):
        logging.info("stream{}: input dims : {}".format(i + 1, idim_list[i]))
    logging.info("#output dims: " + str(odim))

    # specify attention, CTC, hybrid mode
    if "transducer" in args.model_module:
        assert args.mtlalpha == 1.0
        mtl_mode = "transducer"
        logging.info("Pure transducer mode")
    elif args.mtlalpha == 1.0:
        mtl_mode = "ctc"
        logging.info("Pure CTC mode")
    elif args.mtlalpha == 0.0:
        mtl_mode = "att"
        logging.info("Pure attention mode")
    else:
        mtl_mode = "mtl"
        logging.info("Multitask learning mode")

    if (args.enc_init is not None
            or args.dec_init is not None) and args.num_encs == 1:
        model = load_trained_modules(idim_list[0], odim, args)
    else:
        model_class = dynamic_import(args.model_module)
        model = model_class(idim_list[0] if args.num_encs == 1 else idim_list,
                            odim, args)
    assert isinstance(model, ASRInterface)

    logging.info(" Total parameter of the model = " +
                 str(sum(p.numel() for p in model.parameters())))

    if args.rnnlm is not None:
        rnnlm_args = get_model_conf(args.rnnlm, args.rnnlm_conf)
        rnnlm = lm_pytorch.ClassifierWithState(
            lm_pytorch.RNNLM(len(args.char_list), rnnlm_args.layer,
                             rnnlm_args.unit))
        torch_load(args.rnnlm, rnnlm)
        model.rnnlm = rnnlm

    # write model config
    if not os.path.exists(args.outdir):
        os.makedirs(args.outdir)
    model_conf = args.outdir + "/model.json"
    with open(model_conf, "wb") as f:
        logging.info("writing a model config file to " + model_conf)
        f.write(
            json.dumps(
                (idim_list[0] if args.num_encs == 1 else idim_list, odim,
                 vars(args)),
                indent=4,
                ensure_ascii=False,
                sort_keys=True,
            ).encode("utf_8"))
    for key in sorted(vars(args).keys()):
        logging.info("ARGS: " + key + ": " + str(vars(args)[key]))

    reporter = model.reporter

    # check the use of multi-gpu
    if args.ngpu > 1:
        if args.batch_size != 0:
            logging.warning(
                "batch size is automatically increased (%d -> %d)" %
                (args.batch_size, args.batch_size * args.ngpu))
            args.batch_size *= args.ngpu
        if args.num_encs > 1:
            # TODO(ruizhili): implement data parallel for multi-encoder setup.
            raise NotImplementedError(
                "Data parallel is not supported for multi-encoder setup.")

    # set torch device
    device = torch.device("cuda" if args.ngpu > 0 else "cpu")
    if args.train_dtype in ("float16", "float32", "float64"):
        dtype = getattr(torch, args.train_dtype)
    else:
        dtype = torch.float32
    model = model.to(device=device, dtype=dtype)

    if args.freeze_mods:
        model, model_params = freeze_modules(model, args.freeze_mods)
    else:
        model_params = model.parameters()

    # Setup an optimizer
    if args.opt == "adadelta":
        optimizer = torch.optim.Adadelta(model_params,
                                         rho=0.95,
                                         eps=args.eps,
                                         weight_decay=args.weight_decay)
    elif args.opt == "adam":
        optimizer = torch.optim.Adam(model_params,
                                     weight_decay=args.weight_decay)
    elif args.opt == "noam":
        from espnet.nets.pytorch_backend.transformer.optimizer import get_std_opt

        optimizer = get_std_opt(model_params, args.adim,
                                args.transformer_warmup_steps,
                                args.transformer_lr)
    else:
        raise NotImplementedError("unknown optimizer: " + args.opt)

    # setup apex.amp
    if args.train_dtype in ("O0", "O1", "O2", "O3"):
        try:
            from apex import amp
        except ImportError as e:
            logging.error(
                f"You need to install apex for --train-dtype {args.train_dtype}. "
                "See https://github.com/NVIDIA/apex#linux")
            raise e
        if args.opt == "noam":
            model, optimizer.optimizer = amp.initialize(
                model, optimizer.optimizer, opt_level=args.train_dtype)
        else:
            model, optimizer = amp.initialize(model,
                                              optimizer,
                                              opt_level=args.train_dtype)
        use_apex = True

        from espnet.nets.pytorch_backend.ctc import CTC

        amp.register_float_function(CTC, "loss_fn")
        amp.init()
        logging.warning("register ctc as float function")
    else:
        use_apex = False

    # FIXME: TOO DIRTY HACK
    setattr(optimizer, "target", reporter)
    setattr(optimizer, "serialize", lambda s: reporter.serialize(s))

    # Setup a converter
    if args.num_encs == 1:
        converter = CustomConverter(subsampling_factor=model.subsample[0],
                                    dtype=dtype)
    else:
        converter = CustomConverterMulEnc([i[0] for i in model.subsample_list],
                                          dtype=dtype)

    # read json data
    with open(args.train_json, "rb") as f:
        train_json = json.load(f)["utts"]
    with open(args.valid_json, "rb") as f:
        valid_json = json.load(f)["utts"]

    use_sortagrad = args.sortagrad == -1 or args.sortagrad > 0
    # make minibatch list (variable length)
    train = make_batchset(
        train_json,
        args.batch_size,
        args.maxlen_in,
        args.maxlen_out,
        args.minibatches,
        min_batch_size=args.ngpu if args.ngpu > 1 else 1,
        shortest_first=use_sortagrad,
        count=args.batch_count,
        batch_bins=args.batch_bins,
        batch_frames_in=args.batch_frames_in,
        batch_frames_out=args.batch_frames_out,
        batch_frames_inout=args.batch_frames_inout,
        iaxis=0,
        oaxis=0,
    )
    valid = make_batchset(
        valid_json,
        args.batch_size,
        args.maxlen_in,
        args.maxlen_out,
        args.minibatches,
        min_batch_size=args.ngpu if args.ngpu > 1 else 1,
        count=args.batch_count,
        batch_bins=args.batch_bins,
        batch_frames_in=args.batch_frames_in,
        batch_frames_out=args.batch_frames_out,
        batch_frames_inout=args.batch_frames_inout,
        iaxis=0,
        oaxis=0,
    )

    load_tr = LoadInputsAndTargets(
        mode="asr",
        load_output=True,
        preprocess_conf=args.preprocess_conf,
        preprocess_args={"train": True},  # Switch the mode of preprocessing
    )
    load_cv = LoadInputsAndTargets(
        mode="asr",
        load_output=True,
        preprocess_conf=args.preprocess_conf,
        preprocess_args={"train": False},  # Switch the mode of preprocessing
    )
    # hack to make batchsize argument as 1
    # actual bathsize is included in a list
    # default collate function converts numpy array to pytorch tensor
    # we used an empty collate function instead which returns list
    train_iter = ChainerDataLoader(
        dataset=TransformDataset(train,
                                 lambda data: converter([load_tr(data)])),
        batch_size=1,
        num_workers=args.n_iter_processes,
        shuffle=not use_sortagrad,
        collate_fn=lambda x: x[0],
    )
    valid_iter = ChainerDataLoader(
        dataset=TransformDataset(valid,
                                 lambda data: converter([load_cv(data)])),
        batch_size=1,
        shuffle=False,
        collate_fn=lambda x: x[0],
        num_workers=args.n_iter_processes,
    )

    # Set up a trainer
    updater = CustomUpdater(
        model,
        args.grad_clip,
        {"main": train_iter},
        optimizer,
        device,
        args.ngpu,
        args.grad_noise,
        args.accum_grad,
        use_apex=use_apex,
    )
    trainer = training.Trainer(updater, (args.epochs, "epoch"),
                               out=args.outdir)

    if use_sortagrad:
        trainer.extend(
            ShufflingEnabler([train_iter]),
            trigger=(args.sortagrad if args.sortagrad != -1 else args.epochs,
                     "epoch"),
        )

    # Resume from a snapshot
    if args.resume:
        logging.info("resumed from %s" % args.resume)
        torch_resume(args.resume, trainer)

    # Evaluate the model with the test dataset for each epoch
    if args.save_interval_iters > 0:
        trainer.extend(
            CustomEvaluator(model, {"main": valid_iter}, reporter, device,
                            args.ngpu),
            trigger=(args.save_interval_iters, "iteration"),
        )
    else:
        trainer.extend(
            CustomEvaluator(model, {"main": valid_iter}, reporter, device,
                            args.ngpu))

    # Save attention weight at each epoch
    is_attn_plot = (mtl_mode in ["att", "mtl"]
                    or "transformer" in args.model_module
                    or "conformer" in args.model_module)
    if args.num_save_attention > 0 and is_attn_plot:
        data = sorted(
            list(valid_json.items())[:args.num_save_attention],
            key=lambda x: int(x[1]["input"][0]["shape"][1]),
            reverse=True,
        )
        if hasattr(model, "module"):
            att_vis_fn = model.module.calculate_all_attentions
            plot_class = model.module.attention_plot_class
        else:
            att_vis_fn = model.calculate_all_attentions
            plot_class = model.attention_plot_class
        att_reporter = plot_class(
            att_vis_fn,
            data,
            args.outdir + "/att_ws",
            converter=converter,
            transform=load_cv,
            device=device,
        )
        trainer.extend(att_reporter, trigger=(1, "epoch"))
    else:
        att_reporter = None

    # Make a plot for training and validation values
    if args.num_encs > 1:
        report_keys_loss_ctc = [
            "main/loss_ctc{}".format(i + 1) for i in range(model.num_encs)
        ] + [
            "validation/main/loss_ctc{}".format(i + 1)
            for i in range(model.num_encs)
        ]
        report_keys_cer_ctc = [
            "main/cer_ctc{}".format(i + 1) for i in range(model.num_encs)
        ] + [
            "validation/main/cer_ctc{}".format(i + 1)
            for i in range(model.num_encs)
        ]
    trainer.extend(
        extensions.PlotReport(
            [
                "main/loss",
                "validation/main/loss",
                "main/loss_ctc",
                "validation/main/loss_ctc",
                "main/loss_att",
                "validation/main/loss_att",
            ] + ([] if args.num_encs == 1 else report_keys_loss_ctc),
            "epoch",
            file_name="loss.png",
        ))
    trainer.extend(
        extensions.PlotReport(["main/acc", "validation/main/acc"],
                              "epoch",
                              file_name="acc.png"))
    trainer.extend(
        extensions.PlotReport(
            ["main/cer_ctc", "validation/main/cer_ctc"] +
            ([] if args.num_encs == 1 else report_keys_loss_ctc),
            "epoch",
            file_name="cer.png",
        ))

    # Save best models
    trainer.extend(
        snapshot_object(model, "model.loss.best"),
        trigger=training.triggers.MinValueTrigger("validation/main/loss"),
    )
    if mtl_mode not in ["ctc", "transducer"]:
        trainer.extend(
            snapshot_object(model, "model.acc.best"),
            trigger=training.triggers.MaxValueTrigger("validation/main/acc"),
        )

    # save snapshot which contains model and optimizer states
    if args.save_interval_iters > 0:
        trainer.extend(
            torch_snapshot(filename="snapshot.iter.{.updater.iteration}"),
            trigger=(args.save_interval_iters, "iteration"),
        )
    else:
        trainer.extend(torch_snapshot(), trigger=(1, "epoch"))

    # epsilon decay in the optimizer
    if args.opt == "adadelta":
        if args.criterion == "acc" and mtl_mode != "ctc":
            trainer.extend(
                restore_snapshot(model,
                                 args.outdir + "/model.acc.best",
                                 load_fn=torch_load),
                trigger=CompareValueTrigger(
                    "validation/main/acc",
                    lambda best_value, current_value: best_value >
                    current_value,
                ),
            )
            trainer.extend(
                adadelta_eps_decay(args.eps_decay),
                trigger=CompareValueTrigger(
                    "validation/main/acc",
                    lambda best_value, current_value: best_value >
                    current_value,
                ),
            )
        elif args.criterion == "loss":
            trainer.extend(
                restore_snapshot(model,
                                 args.outdir + "/model.loss.best",
                                 load_fn=torch_load),
                trigger=CompareValueTrigger(
                    "validation/main/loss",
                    lambda best_value, current_value: best_value <
                    current_value,
                ),
            )
            trainer.extend(
                adadelta_eps_decay(args.eps_decay),
                trigger=CompareValueTrigger(
                    "validation/main/loss",
                    lambda best_value, current_value: best_value <
                    current_value,
                ),
            )

    # Write a log of evaluation statistics for each epoch
    trainer.extend(
        extensions.LogReport(trigger=(args.report_interval_iters,
                                      "iteration")))
    report_keys = [
        "epoch",
        "iteration",
        "main/loss",
        "main/loss_ctc",
        "main/loss_att",
        "validation/main/loss",
        "validation/main/loss_ctc",
        "validation/main/loss_att",
        "main/acc",
        "validation/main/acc",
        "main/cer_ctc",
        "validation/main/cer_ctc",
        "elapsed_time",
    ] + ([] if args.num_encs == 1 else report_keys_cer_ctc +
         report_keys_loss_ctc)
    if args.opt == "adadelta":
        trainer.extend(
            extensions.observe_value(
                "eps",
                lambda trainer: trainer.updater.get_optimizer("main").
                param_groups[0]["eps"],
            ),
            trigger=(args.report_interval_iters, "iteration"),
        )
        report_keys.append("eps")
    if args.report_cer:
        report_keys.append("validation/main/cer")
    if args.report_wer:
        report_keys.append("validation/main/wer")
    trainer.extend(
        extensions.PrintReport(report_keys),
        trigger=(args.report_interval_iters, "iteration"),
    )

    trainer.extend(
        extensions.ProgressBar(update_interval=args.report_interval_iters))
    set_early_stop(trainer, args)

    if args.tensorboard_dir is not None and args.tensorboard_dir != "":
        trainer.extend(
            TensorboardLogger(SummaryWriter(args.tensorboard_dir),
                              att_reporter),
            trigger=(args.report_interval_iters, "iteration"),
        )
    # Run the training
    trainer.run()
    check_early_stop(trainer, args.epochs)
Esempio n. 8
0
def train(args):
    """Train with the given args.

    Builds the model named by ``args.model_module``, writes its configuration
    to ``<args.outdir>/model.json``, sets up the optimizer, the training and
    validation iterators and the trainer extensions, then runs the trainer.

    Args:
        args (namespace): The program arguments.

    Raises:
        NotImplementedError: If ``args.opt`` is not ``adadelta``, ``adam`` or
            ``noam``; or if more than one GPU is requested while
            ``args.batch_count`` is neither ``auto`` nor ``seq`` and
            ``args.batch_size`` is 0.

    """
    # display chainer version
    logging.info('chainer version = ' + chainer.__version__)

    set_deterministic_chainer(args)

    # check cuda and cudnn availability
    if not chainer.cuda.available:
        logging.warning('cuda is not available')
    if not chainer.cuda.cudnn_enabled:
        logging.warning('cudnn is not available')

    # get input and output dimension info from the first validation utterance
    with open(args.valid_json, 'rb') as f:
        valid_json = json.load(f)['utts']
    utts = list(valid_json.keys())
    idim = int(valid_json[utts[0]]['input'][0]['shape'][1])
    odim = int(valid_json[utts[0]]['output'][0]['shape'][1])
    logging.info('#input dims : ' + str(idim))
    logging.info('#output dims: ' + str(odim))

    # specify attention, CTC, hybrid mode
    if args.mtlalpha == 1.0:
        mtl_mode = 'ctc'
        logging.info('Pure CTC mode')
    elif args.mtlalpha == 0.0:
        mtl_mode = 'att'
        logging.info('Pure attention mode')
    else:
        mtl_mode = 'mtl'
        logging.info('Multitask learning mode')

    # specify model architecture
    logging.info('import model module: ' + args.model_module)
    model_class = dynamic_import(args.model_module)
    model = model_class(idim, odim, args, flag_return=False)
    assert isinstance(model, ASRInterface)

    # write model config
    # exist_ok avoids the check-then-create race when several jobs share
    # the same output directory.
    os.makedirs(args.outdir, exist_ok=True)
    model_conf = args.outdir + '/model.json'
    with open(model_conf, 'wb') as f:
        logging.info('writing a model config file to ' + model_conf)
        f.write(
            json.dumps((idim, odim, vars(args)),
                       indent=4,
                       ensure_ascii=False,
                       sort_keys=True).encode('utf_8'))
    for key in sorted(vars(args).keys()):
        logging.info('ARGS: ' + key + ': ' + str(vars(args)[key]))

    # Set gpu
    ngpu = args.ngpu
    if ngpu == 1:
        gpu_id = 0
        # Make a specified GPU current
        chainer.cuda.get_device_from_id(gpu_id).use()
        model.to_gpu()  # Copy the model to the GPU
        logging.info('single gpu calculation.')
    elif ngpu > 1:
        gpu_id = 0
        # `devices` is only consumed by the parallel updater below
        devices = {'main': gpu_id}
        for gid in six.moves.xrange(1, ngpu):
            devices['sub_%d' % gid] = gid
        logging.info('multi gpu calculation (#gpus = %d).' % ngpu)
        logging.warning('batch size is automatically increased (%d -> %d)' %
                        (args.batch_size, args.batch_size * args.ngpu))
    else:
        gpu_id = -1
        logging.info('cpu calculation')

    # Setup an optimizer
    if args.opt == 'adadelta':
        optimizer = chainer.optimizers.AdaDelta(eps=args.eps)
    elif args.opt == 'adam':
        optimizer = chainer.optimizers.Adam()
    elif args.opt == 'noam':
        # alpha starts at 0; the VaswaniRule extension registered further
        # down updates 'alpha' on every iteration.
        optimizer = chainer.optimizers.Adam(alpha=0,
                                            beta1=0.9,
                                            beta2=0.98,
                                            eps=1e-9)
    else:
        raise NotImplementedError('args.opt={}'.format(args.opt))

    optimizer.setup(model)
    optimizer.add_hook(chainer.optimizer.GradientClipping(args.grad_clip))

    # Setup Training Extensions (backend-specific converter and updaters)
    if 'transformer' in args.model_module:
        from espnet.nets.chainer_backend.transformer.training import (
            CustomConverter, CustomParallelUpdater, CustomUpdater)
    else:
        from espnet.nets.chainer_backend.rnn.training import (
            CustomConverter, CustomParallelUpdater, CustomUpdater)

    # Setup a converter
    converter = CustomConverter(subsampling_factor=model.subsample[0])

    # read json data
    # (valid_json was already parsed above and is reused as-is)
    with open(args.train_json, 'rb') as f:
        train_json = json.load(f)['utts']

    # set up training iterator and updater
    load_tr = LoadInputsAndTargets(
        mode='asr',
        load_output=True,
        preprocess_conf=args.preprocess_conf,
        preprocess_args={'train': True}  # Switch the mode of preprocessing
    )
    load_cv = LoadInputsAndTargets(
        mode='asr',
        load_output=True,
        preprocess_conf=args.preprocess_conf,
        preprocess_args={'train': False}  # Switch the mode of preprocessing
    )

    # sortagrad is active for -1 and for any positive value
    use_sortagrad = args.sortagrad == -1 or args.sortagrad > 0
    accum_grad = args.accum_grad
    if ngpu <= 1:
        # make minibatch list (variable length)
        train = make_batchset(train_json,
                              args.batch_size,
                              args.maxlen_in,
                              args.maxlen_out,
                              args.minibatches,
                              min_batch_size=args.ngpu if args.ngpu > 1 else 1,
                              shortest_first=use_sortagrad,
                              count=args.batch_count,
                              batch_bins=args.batch_bins,
                              batch_frames_in=args.batch_frames_in,
                              batch_frames_out=args.batch_frames_out,
                              batch_frames_inout=args.batch_frames_inout,
                              iaxis=0,
                              oaxis=0)
        # hack to make batchsize argument as 1
        # actual batchsize is included in a list
        if args.n_iter_processes > 0:
            train_iters = [
                ToggleableShufflingMultiprocessIterator(
                    TransformDataset(train, load_tr),
                    batch_size=1,
                    n_processes=args.n_iter_processes,
                    n_prefetch=8,
                    maxtasksperchild=20,
                    shuffle=not use_sortagrad)
            ]
        else:
            train_iters = [
                ToggleableShufflingSerialIterator(
                    TransformDataset(train, load_tr),
                    batch_size=1,
                    shuffle=not use_sortagrad)
            ]

        # set up updater
        updater = CustomUpdater(train_iters[0],
                                optimizer,
                                converter=converter,
                                device=gpu_id,
                                accum_grad=accum_grad)
    else:
        if args.batch_count not in ("auto", "seq") and args.batch_size == 0:
            raise NotImplementedError(
                "--batch-count 'bin' and 'frame' are not implemented in chainer multi gpu"
            )
        # set up minibatches: utterances are dealt round-robin to the GPUs
        train_subsets = []
        for gid in six.moves.xrange(ngpu):
            # make subset
            train_json_subset = {
                k: v
                for i, (k, v) in enumerate(train_json.items())
                if i % ngpu == gid
            }
            # make minibatch list (variable length)
            train_subsets += [
                make_batchset(train_json_subset, args.batch_size,
                              args.maxlen_in, args.maxlen_out,
                              args.minibatches)
            ]

        # each subset must have same length for MultiprocessParallelUpdater;
        # shorter subsets are padded by repeating their leading minibatches
        maxlen = max(len(train_subset) for train_subset in train_subsets)
        for train_subset in train_subsets:
            if maxlen != len(train_subset):
                for i in six.moves.xrange(maxlen - len(train_subset)):
                    train_subset += [train_subset[i]]

        # hack to make batchsize argument as 1
        # actual batchsize is included in a list
        if args.n_iter_processes > 0:
            train_iters = [
                ToggleableShufflingMultiprocessIterator(
                    TransformDataset(train_subsets[gid], load_tr),
                    batch_size=1,
                    n_processes=args.n_iter_processes,
                    n_prefetch=8,
                    maxtasksperchild=20,
                    shuffle=not use_sortagrad)
                for gid in six.moves.xrange(ngpu)
            ]
        else:
            train_iters = [
                ToggleableShufflingSerialIterator(
                    TransformDataset(train_subsets[gid], load_tr),
                    batch_size=1,
                    shuffle=not use_sortagrad)
                for gid in six.moves.xrange(ngpu)
            ]

        # set up updater
        updater = CustomParallelUpdater(train_iters,
                                        optimizer,
                                        converter=converter,
                                        devices=devices)

    # Set up a trainer
    trainer = training.Trainer(updater, (args.epochs, 'epoch'),
                               out=args.outdir)

    if use_sortagrad:
        # With sortagrad == -1 the enabler is scheduled at the final epoch
        trainer.extend(
            ShufflingEnabler(train_iters),
            trigger=(args.sortagrad if args.sortagrad != -1 else args.epochs,
                     'epoch'))
    if args.opt == 'noam':
        from espnet.nets.chainer_backend.transformer.training import VaswaniRule
        trainer.extend(VaswaniRule('alpha',
                                   d=args.adim,
                                   warmup_steps=args.transformer_warmup_steps,
                                   scale=args.transformer_lr),
                       trigger=(1, 'iteration'))
    # Resume from a snapshot
    if args.resume:
        chainer.serializers.load_npz(args.resume, trainer)

    # set up validation iterator
    valid = make_batchset(valid_json,
                          args.batch_size,
                          args.maxlen_in,
                          args.maxlen_out,
                          args.minibatches,
                          min_batch_size=args.ngpu if args.ngpu > 1 else 1,
                          count=args.batch_count,
                          batch_bins=args.batch_bins,
                          batch_frames_in=args.batch_frames_in,
                          batch_frames_out=args.batch_frames_out,
                          batch_frames_inout=args.batch_frames_inout,
                          iaxis=0,
                          oaxis=0)

    if args.n_iter_processes > 0:
        valid_iter = chainer.iterators.MultiprocessIterator(
            TransformDataset(valid, load_cv),
            batch_size=1,
            repeat=False,
            shuffle=False,
            n_processes=args.n_iter_processes,
            n_prefetch=8,
            maxtasksperchild=20)
    else:
        valid_iter = chainer.iterators.SerialIterator(
            TransformDataset(valid, load_cv),
            batch_size=1,
            repeat=False,
            shuffle=False)

    # Evaluate the model with the test dataset for each epoch
    trainer.extend(
        BaseEvaluator(valid_iter, model, converter=converter, device=gpu_id))

    # Save attention weight each epoch
    if args.num_save_attention > 0 and args.mtlalpha != 1.0:
        # take the first num_save_attention utterances, longest input first
        data = sorted(list(valid_json.items())[:args.num_save_attention],
                      key=lambda x: int(x[1]['input'][0]['shape'][1]),
                      reverse=True)
        if hasattr(model, "module"):
            att_vis_fn = model.module.calculate_all_attentions
            plot_class = model.module.attention_plot_class
        else:
            att_vis_fn = model.calculate_all_attentions
            plot_class = model.attention_plot_class
        logging.info('Using custom PlotAttentionReport')
        att_reporter = plot_class(att_vis_fn,
                                  data,
                                  args.outdir + "/att_ws",
                                  converter=converter,
                                  transform=load_cv,
                                  device=gpu_id)
        trainer.extend(att_reporter, trigger=(1, 'epoch'))
    else:
        att_reporter = None

    # Take a snapshot for each specified epoch
    trainer.extend(
        extensions.snapshot(filename='snapshot.ep.{.updater.epoch}'),
        trigger=(1, 'epoch'))

    # Make a plot for training and validation values
    trainer.extend(
        extensions.PlotReport([
            'main/loss', 'validation/main/loss', 'main/loss_ctc',
            'validation/main/loss_ctc', 'main/loss_att',
            'validation/main/loss_att'
        ],
                              'epoch',
                              file_name='loss.png'))
    trainer.extend(
        extensions.PlotReport(['main/acc', 'validation/main/acc'],
                              'epoch',
                              file_name='acc.png'))

    # Save best models
    trainer.extend(
        extensions.snapshot_object(model, 'model.loss.best'),
        trigger=training.triggers.MinValueTrigger('validation/main/loss'))
    if mtl_mode != 'ctc':
        trainer.extend(
            extensions.snapshot_object(model, 'model.acc.best'),
            trigger=training.triggers.MaxValueTrigger('validation/main/acc'))

    # epsilon decay in the optimizer
    # When the monitored validation value gets worse than the best one,
    # the best model is restored and eps is decayed.
    if args.opt == 'adadelta':
        if args.criterion == 'acc' and mtl_mode != 'ctc':
            trainer.extend(restore_snapshot(model,
                                            args.outdir + '/model.acc.best'),
                           trigger=CompareValueTrigger(
                               'validation/main/acc', lambda best_value,
                               current_value: best_value > current_value))
            trainer.extend(adadelta_eps_decay(args.eps_decay),
                           trigger=CompareValueTrigger(
                               'validation/main/acc', lambda best_value,
                               current_value: best_value > current_value))
        elif args.criterion == 'loss':
            trainer.extend(restore_snapshot(model,
                                            args.outdir + '/model.loss.best'),
                           trigger=CompareValueTrigger(
                               'validation/main/loss', lambda best_value,
                               current_value: best_value < current_value))
            trainer.extend(adadelta_eps_decay(args.eps_decay),
                           trigger=CompareValueTrigger(
                               'validation/main/loss', lambda best_value,
                               current_value: best_value < current_value))

    # Write a log of evaluation statistics for each epoch
    trainer.extend(
        extensions.LogReport(trigger=(args.report_interval_iters,
                                      'iteration')))
    report_keys = [
        'epoch', 'iteration', 'main/loss', 'main/loss_ctc', 'main/loss_att',
        'validation/main/loss', 'validation/main/loss_ctc',
        'validation/main/loss_att', 'main/acc', 'validation/main/acc',
        'elapsed_time'
    ]
    if args.opt == 'adadelta':
        trainer.extend(extensions.observe_value(
            'eps', lambda trainer: trainer.updater.get_optimizer('main').eps),
                       trigger=(args.report_interval_iters, 'iteration'))
        report_keys.append('eps')
    trainer.extend(extensions.PrintReport(report_keys),
                   trigger=(args.report_interval_iters, 'iteration'))

    trainer.extend(
        extensions.ProgressBar(update_interval=args.report_interval_iters))

    set_early_stop(trainer, args)
    if args.tensorboard_dir is not None and args.tensorboard_dir != "":
        writer = SummaryWriter(args.tensorboard_dir)
        trainer.extend(TensorboardLogger(writer, att_reporter),
                       trigger=(args.report_interval_iters, 'iteration'))

    # Run the training
    trainer.run()
    check_early_stop(trainer, args.epochs)
Esempio n. 9
0
def check_mnist(gpu, display_log=True):
    """Train an MLP classifier on MNIST through ChainerMN and check the result.

    Runs a 5-epoch training with a multi-node optimizer, evaluator and
    checkpointer, then asserts that the validation accuracy exceeds 0.95 and
    that the checkpoint directory is empty afterwards.

    Args:
        gpu: Truthy to run on the GPU selected by ``comm.intra_rank``;
            falsy to run on CPU (device ``-1``).
        display_log (bool): When true, rank 0 additionally registers
            ``LogReport`` and ``PrintReport`` (writing to ``sys.stderr``).

    Raises:
        AssertionError: If the accuracy threshold is not met or the
            checkpoint directory still contains files after training.

    """
    epoch = 5
    batchsize = 100
    n_units = 100
    warnings.filterwarnings(action='always', category=DeprecationWarning)

    comm = chainermn.create_communicator('naive')
    if gpu:
        device = comm.intra_rank
        chainer.cuda.get_device_from_id(device).use()
    else:
        device = -1

    model = L.Classifier(MLP(n_units, 10))
    if gpu:
        model.to_device(cupy.cuda.Device())

    optimizer = chainermn.create_multi_node_optimizer(
        chainer.optimizers.Adam(), comm)
    optimizer.setup(model)

    # Only rank 0 loads the dataset; scatter_dataset distributes it.
    if comm.rank == 0:
        train, test = chainer.datasets.get_mnist()
    else:
        train, test = None, None

    train = chainermn.scatter_dataset(train, comm, shuffle=True)
    test = chainermn.scatter_dataset(test, comm, shuffle=True)

    train_iter = chainer.iterators.SerialIterator(train, batchsize)
    test_iter = chainer.iterators.SerialIterator(test,
                                                 batchsize,
                                                 repeat=False,
                                                 shuffle=False)

    updater = training.StandardUpdater(train_iter, optimizer, device=device)

    trainer = training.Trainer(updater, (epoch, 'epoch'))

    # Wrap standard Chainer evaluators by MultiNodeEvaluator.
    evaluator = extensions.Evaluator(test_iter, model, device=device)
    evaluator = chainermn.create_multi_node_evaluator(evaluator, comm)
    trainer.extend(evaluator)

    # Add checkpointer. This is just to check checkpointing runs
    # without errors
    path = tempfile.mkdtemp(dir='/tmp', prefix=__name__ + '-tmp-')
    checkpointer = create_multi_node_checkpointer(name=__name__,
                                                  comm=comm,
                                                  path=path)
    trainer.extend(checkpointer, trigger=(1, 'epoch'))

    # Some display and output extensions are necessary only for one worker.
    # (Otherwise, there would just be repeated outputs.)
    if comm.rank == 0 and display_log:
        trainer.extend(extensions.LogReport(trigger=(1, 'epoch')),
                       trigger=(1, 'epoch'))
        trainer.extend(extensions.PrintReport([
            'epoch', 'main/loss', 'validation/main/loss', 'main/accuracy',
            'validation/main/accuracy', 'elapsed_time'
        ],
                                              out=sys.stderr),
                       trigger=(1, 'epoch'))
    trainer.run()

    accuracy = evaluator()['validation/main/accuracy']
    assert accuracy > 0.95

    # Check checkpointer successfully finalized snapshot directory
    assert [] == os.listdir(path)
    # rmdir removes only the directory created above; os.removedirs would
    # additionally try to prune empty parent directories such as /tmp.
    os.rmdir(path)
Esempio n. 10
0
    def train_model(self, datasets):
        """Train ``self.model`` on ``datasets`` and return it.

        Hyperparameters are read from the process command line via
        ``argparse`` (``parse_args()`` is called without arguments). The data
        is split with ``split_dataset(datasets, 80)``; the first part is used
        for training and the second for per-epoch evaluation.

        Args:
            datasets: Dataset object accepted by ``split_dataset``
                (presumably an indexable dataset; verify against the caller).

        Returns:
            The trained ``self.model``.

        """
        parser = argparse.ArgumentParser(description='Chainer CIFAR example:')
        parser.add_argument('--dataset',
                            '-d',
                            default='cifar10',
                            help='The dataset to use: cifar10 or cifar100')
        parser.add_argument('--batchsize',
                            '-b',
                            type=int,
                            default=10,
                            help='Number of images in each mini-batch')
        parser.add_argument('--learnrate',
                            '-l',
                            type=float,
                            default=0.05,
                            help='Learning rate for SGD')
        parser.add_argument('--epoch',
                            '-e',
                            type=int,
                            default=300,
                            help='Number of sweeps over the dataset to train')
        parser.add_argument('--gpu',
                            '-g',
                            type=int,
                            default=-1,
                            help='GPU ID (negative value indicates CPU)')
        parser.add_argument('--out',
                            '-o',
                            default='result',
                            help='Directory to output the result')
        parser.add_argument('--resume',
                            '-r',
                            default='',
                            help='Resume the training from snapshot')
        parser.add_argument('--early-stopping',
                            type=str,
                            help='Metric to watch for early stopping')
        args = parser.parse_args()

        print('GPU: {}'.format(args.gpu))
        print('# Minibatch-size: {}'.format(args.batchsize))
        print('# epoch: {}'.format(args.epoch))

        if args.gpu >= 0:
            chainer.backends.cuda.get_device_from_id(args.gpu).use()
            self.model.to_gpu()

        # The first positional argument of Adam is its ``alpha`` step size.
        optimizer = chainer.optimizers.Adam(args.learnrate)
        optimizer.setup(self.model)
        optimizer.add_hook(chainer.optimizer_hooks.WeightDecay(5e-4))

        train, test = split_dataset(datasets, 80)

        train_iter = chainer.iterators.SerialIterator(train, args.batchsize)
        test_iter = chainer.iterators.SerialIterator(test,
                                                     args.batchsize,
                                                     repeat=False,
                                                     shuffle=False)

        stop_trigger = (args.epoch, 'epoch')
        # Early stopping option
        if args.early_stopping:
            stop_trigger = triggers.EarlyStoppingTrigger(
                monitor=args.early_stopping,
                verbose=True,
                max_trigger=(args.epoch, 'epoch'))

        # Set up a trainer
        updater = training.updaters.StandardUpdater(
            train_iter,
            optimizer,
            device=args.gpu,
            loss_func=mean_squared_error)
        trainer = training.Trainer(updater, stop_trigger, out=args.out)

        # Evaluate the model with the test dataset for each epoch
        trainer.extend(
            extensions.Evaluator(test_iter, self.model, device=args.gpu))

        # Halve the step size every 25 epochs. Adam exposes its tunable step
        # size as the ``alpha`` hyperparameter; its ``lr`` attribute is a
        # derived, read-only value, so the shift has to target ``alpha``.
        trainer.extend(extensions.ExponentialShift('alpha', 0.5),
                       trigger=(25, 'epoch'))

        # Dump a computational graph from 'loss' variable at the first iteration
        # The "main" refers to the target link of the "main" optimizer.
        trainer.extend(extensions.dump_graph('main/loss'))

        # Take a snapshot at each epoch
        # NOTE: the 'snaphot' spelling is kept so that previously written
        # snapshot files keep matching this name pattern.
        trainer.extend(
            extensions.snapshot(filename='snaphot_epoch_{.updater.epoch}'))

        # Write a log of evaluation statistics for each epoch
        trainer.extend(extensions.LogReport())

        # Print selected entries of the log to stdout
        # Here "main" refers to the target link of the "main" optimizer again, and
        # "validation" refers to the default name of the Evaluator extension.
        trainer.extend(
            extensions.PrintReport([
                'epoch', 'main/loss', 'validation/main/loss', 'main/accuracy',
                'validation/main/accuracy', 'elapsed_time'
            ]))

        # Print a progress bar to stdout
        trainer.extend(extensions.ProgressBar())

        if args.resume:
            # Resume from a snapshot
            chainer.serializers.load_npz(args.resume, trainer)

        # Show the first training example before starting
        print(train[:1])

        # Run the training
        trainer.run()

        return self.model
Esempio n. 11
0
def train(args):
    """Train an E2E VC model.

    Reads the input/output dimensions from the validation JSON, writes the
    model configuration to ``<outdir>/model.json``, builds the model, the
    optimizer, the data loaders and the trainer extensions, and finally runs
    the training loop.

    Args:
        args (argparse.Namespace): Training options. Besides being read,
            ``args`` is modified in place: ``spk_embed_dim`` and ``spc_dim``
            are always set, ``batch_size`` is multiplied by ``ngpu`` when
            more than one GPU is used, and ``batch_sort_key`` is forced to
            ``"input"`` when sortagrad is enabled.

    Raises:
        NotImplementedError: If ``args.opt`` is not ``"adam"``, ``"noam"``
            or ``"lamb"``.

    """
    set_deterministic_pytorch(args)

    # check cuda availability
    if not torch.cuda.is_available():
        logging.warning("cuda is not available")

    # get input and output dimension info
    with open(args.valid_json, "rb") as f:
        valid_json = json.load(f)["utts"]
    utts = list(valid_json.keys())

    # In TTS, this is reversed, but not in VC. See `espnet.utils.training.batchfy`
    idim = int(valid_json[utts[0]]["input"][0]["shape"][1])
    odim = int(valid_json[utts[0]]["output"][0]["shape"][1])
    logging.info("#input dims : " + str(idim))
    logging.info("#output dims: " + str(odim))

    # get extra input and output dimension
    if args.use_speaker_embedding:
        args.spk_embed_dim = int(valid_json[utts[0]]["input"][1]["shape"][0])
    else:
        args.spk_embed_dim = None
    if args.use_second_target:
        args.spc_dim = int(valid_json[utts[0]]["input"][1]["shape"][1])
    else:
        args.spc_dim = None

    # write model config
    # exist_ok avoids the race between checking for and creating the directory
    os.makedirs(args.outdir, exist_ok=True)
    model_conf = args.outdir + "/model.json"
    with open(model_conf, "wb") as f:
        logging.info("writing a model config file to " + model_conf)
        f.write(
            json.dumps((idim, odim, vars(args)),
                       indent=4,
                       ensure_ascii=False,
                       sort_keys=True).encode("utf_8"))
    for key in sorted(vars(args).keys()):
        logging.info("ARGS: " + key + ": " + str(vars(args)[key]))

    # specify model architecture
    if args.enc_init is not None or args.dec_init is not None:
        model = load_trained_modules(idim, odim, args, TTSInterface)
    else:
        model_class = dynamic_import(args.model_module)
        model = model_class(idim, odim, args)
    assert isinstance(model, TTSInterface)
    logging.info(model)
    # keep a handle on the reporter before the model is possibly wrapped
    # in DataParallel below
    reporter = model.reporter

    # freeze modules, if specified
    if args.freeze_mods:
        for mod, param in model.named_parameters():
            if any(mod.startswith(key) for key in args.freeze_mods):
                logging.info("freezing %s" % mod)
                param.requires_grad = False

    for mod, param in model.named_parameters():
        if not param.requires_grad:
            logging.info("Frozen module %s" % mod)

    # check the use of multi-gpu
    if args.ngpu > 1:
        model = torch.nn.DataParallel(model, device_ids=list(range(args.ngpu)))
        if args.batch_size != 0:
            logging.warning(
                "batch size is automatically increased (%d -> %d)" %
                (args.batch_size, args.batch_size * args.ngpu))
            args.batch_size *= args.ngpu

    # set torch device
    device = torch.device("cuda" if args.ngpu > 0 else "cpu")
    model = model.to(device)

    # count the parameters once and reuse the totals in the report
    num_params = sum(p.numel() for p in model.parameters())
    num_trained = sum(p.numel() for p in model.parameters()
                      if p.requires_grad)
    logging.warning(
        "num. model params: {:,} (num. trained: {:,} ({:.1f}%))".format(
            num_params,
            num_trained,
            num_trained * 100.0 / num_params,
        ))

    # Setup an optimizer
    if args.opt == "adam":
        optimizer = torch.optim.Adam(model.parameters(),
                                     args.lr,
                                     eps=args.eps,
                                     weight_decay=args.weight_decay)
    elif args.opt == "noam":
        from espnet.nets.pytorch_backend.transformer.optimizer import get_std_opt

        optimizer = get_std_opt(
            model.parameters(),
            args.adim,
            args.transformer_warmup_steps,
            args.transformer_lr,
        )
    elif args.opt == "lamb":
        from pytorch_lamb import Lamb

        optimizer = Lamb(model.parameters(),
                         lr=args.lr,
                         weight_decay=0.01,
                         betas=(0.9, 0.999))
    else:
        raise NotImplementedError("unknown optimizer: " + args.opt)

    # FIXME: TOO DIRTY HACK
    # attach the reporter to the optimizer as "target" and forward
    # serialization to it
    setattr(optimizer, "target", reporter)
    setattr(optimizer, "serialize", lambda s: reporter.serialize(s))

    # read json data
    # (valid_json was already loaded above and has not been modified, so it
    # is reused instead of being read from disk a second time)
    with open(args.train_json, "rb") as f:
        train_json = json.load(f)["utts"]

    use_sortagrad = args.sortagrad == -1 or args.sortagrad > 0
    if use_sortagrad:
        args.batch_sort_key = "input"
    # make minibatch list (variable length)
    train_batchset = make_batchset(
        train_json,
        args.batch_size,
        args.maxlen_in,
        args.maxlen_out,
        args.minibatches,
        batch_sort_key=args.batch_sort_key,
        min_batch_size=args.ngpu if args.ngpu > 1 else 1,
        shortest_first=use_sortagrad,
        count=args.batch_count,
        batch_bins=args.batch_bins,
        batch_frames_in=args.batch_frames_in,
        batch_frames_out=args.batch_frames_out,
        batch_frames_inout=args.batch_frames_inout,
        swap_io=False,
        iaxis=0,
        oaxis=0,
    )
    valid_batchset = make_batchset(
        valid_json,
        args.batch_size,
        args.maxlen_in,
        args.maxlen_out,
        args.minibatches,
        batch_sort_key=args.batch_sort_key,
        min_batch_size=args.ngpu if args.ngpu > 1 else 1,
        count=args.batch_count,
        batch_bins=args.batch_bins,
        batch_frames_in=args.batch_frames_in,
        batch_frames_out=args.batch_frames_out,
        batch_frames_inout=args.batch_frames_inout,
        swap_io=False,
        iaxis=0,
        oaxis=0,
    )

    load_tr = LoadInputsAndTargets(
        mode="vc",
        use_speaker_embedding=args.use_speaker_embedding,
        use_second_target=args.use_second_target,
        preprocess_conf=args.preprocess_conf,
        preprocess_args={"train": True},  # Switch the mode of preprocessing
        keep_all_data_on_mem=args.keep_all_data_on_mem,
    )

    load_cv = LoadInputsAndTargets(
        mode="vc",
        use_speaker_embedding=args.use_speaker_embedding,
        use_second_target=args.use_second_target,
        preprocess_conf=args.preprocess_conf,
        preprocess_args={"train": False},  # Switch the mode of preprocessing
        keep_all_data_on_mem=args.keep_all_data_on_mem,
    )

    converter = CustomConverter()
    # hack to make batchsize argument as 1
    # actual batchsize is included in a list
    train_iter = {
        "main":
        ChainerDataLoader(
            dataset=TransformDataset(train_batchset,
                                     lambda data: converter([load_tr(data)])),
            batch_size=1,
            num_workers=args.num_iter_processes,
            shuffle=not use_sortagrad,
            collate_fn=lambda x: x[0],
        )
    }
    valid_iter = {
        "main":
        ChainerDataLoader(
            dataset=TransformDataset(valid_batchset,
                                     lambda data: converter([load_cv(data)])),
            batch_size=1,
            shuffle=False,
            collate_fn=lambda x: x[0],
            num_workers=args.num_iter_processes,
        )
    }

    # Set up a trainer
    updater = CustomUpdater(model, args.grad_clip, train_iter, optimizer,
                            device, args.accum_grad)
    trainer = training.Trainer(updater, (args.epochs, "epoch"),
                               out=args.outdir)

    # Resume from a snapshot
    if args.resume:
        logging.info("resumed from %s" % args.resume)
        torch_resume(args.resume, trainer)

    # set intervals
    eval_interval = (args.eval_interval_epochs, "epoch")
    save_interval = (args.save_interval_epochs, "epoch")
    report_interval = (args.report_interval_iters, "iteration")

    # Evaluate the model with the test dataset for each epoch
    trainer.extend(CustomEvaluator(model, valid_iter, reporter, device),
                   trigger=eval_interval)

    # Save snapshot for each epoch
    trainer.extend(torch_snapshot(), trigger=save_interval)

    # Save best models
    trainer.extend(
        snapshot_object(model, "model.loss.best"),
        trigger=training.triggers.MinValueTrigger("validation/main/loss",
                                                  trigger=eval_interval),
    )

    # Save attention figure for each epoch
    if args.num_save_attention > 0:
        # take the first N validation utterances, then order them by the
        # second entry of the first input's "shape", largest first
        data = sorted(
            list(valid_json.items())[:args.num_save_attention],
            key=lambda x: int(x[1]["input"][0]["shape"][1]),
            reverse=True,
        )
        # a DataParallel wrapper exposes the real model as `.module`
        if hasattr(model, "module"):
            att_vis_fn = model.module.calculate_all_attentions
            plot_class = model.module.attention_plot_class
        else:
            att_vis_fn = model.calculate_all_attentions
            plot_class = model.attention_plot_class
        att_reporter = plot_class(
            att_vis_fn,
            data,
            args.outdir + "/att_ws",
            converter=converter,
            transform=load_cv,
            device=device,
            reverse=True,
        )
        trainer.extend(att_reporter, trigger=eval_interval)
    else:
        att_reporter = None

    # Make a plot for training and validation values
    if hasattr(model, "module"):
        base_plot_keys = model.module.base_plot_keys
    else:
        base_plot_keys = model.base_plot_keys
    plot_keys = []
    for key in base_plot_keys:
        plot_key = ["main/" + key, "validation/main/" + key]
        trainer.extend(
            extensions.PlotReport(plot_key, "epoch", file_name=key + ".png"),
            trigger=eval_interval,
        )
        plot_keys += plot_key
    trainer.extend(
        extensions.PlotReport(plot_keys, "epoch", file_name="all_loss.png"),
        trigger=eval_interval,
    )

    # Write a log of evaluation statistics for each epoch
    trainer.extend(extensions.LogReport(trigger=report_interval))
    report_keys = ["epoch", "iteration", "elapsed_time"] + plot_keys
    trainer.extend(extensions.PrintReport(report_keys),
                   trigger=report_interval)
    trainer.extend(extensions.ProgressBar(), trigger=report_interval)

    set_early_stop(trainer, args)
    if args.tensorboard_dir is not None and args.tensorboard_dir != "":
        from torch.utils.tensorboard import SummaryWriter

        writer = SummaryWriter(args.tensorboard_dir)
        trainer.extend(TensorboardLogger(writer, att_reporter),
                       trigger=report_interval)

    if use_sortagrad:
        # NOTE(review): train_iter is a dict ({"main": loader}) here, so the
        # list passed below contains the dict, not the loader itself --
        # confirm that ShufflingEnabler handles this as intended.
        trainer.extend(
            ShufflingEnabler([train_iter]),
            trigger=(args.sortagrad if args.sortagrad != -1 else args.epochs,
                     "epoch"),
        )

    # Run the training
    trainer.run()
    check_early_stop(trainer, args.epochs)
Esempio n. 12
0
def train(args):
    """Run training of the Chainer E2E model.

    Configures Chainer from ``args.debugmode``, reads the input/output
    dimensions from the validation label file, builds the model and the
    optimizer, pickles the configuration to ``<outdir>/model.conf``, sets up
    the (single- or multi-GPU) updater and the trainer extensions, and runs
    the training loop.

    Args:
        args (argparse.Namespace): Training options, e.g. ``seed``,
            ``debugmode``, ``train_label``, ``valid_label``, ``train_feat``,
            ``valid_feat``, ``outdir``, ``ngpu``, ``opt``, ``mtlalpha``,
            ``atype`` and ``epochs``.

    Raises:
        NotImplementedError: If ``args.atype`` is not ``noatt``, ``dot`` or
            ``location``, or if ``args.opt`` is not ``adadelta`` or ``adam``.

    """
    # display chainer version
    logging.info('chainer version = ' + chainer.__version__)

    # seed setting (chainer seed may not need it)
    os.environ['CHAINER_SEED'] = str(args.seed)
    logging.info('chainer seed = ' + os.environ['CHAINER_SEED'])

    # debug mode setting
    # 0 would be fastest, but 1 seems to be reasonable
    # by considering reproducibility
    # remove type check
    if args.debugmode < 2:
        chainer.config.type_check = False
        logging.info('chainer type check is disabled')
    # use deterministic computation or not
    if args.debugmode < 1:
        chainer.config.cudnn_deterministic = False
        logging.info('chainer cudnn deterministic is disabled')
    else:
        chainer.config.cudnn_deterministic = True

    # check cuda and cudnn availability
    if not chainer.cuda.available:
        logging.warning('cuda is not available')
    if not chainer.cuda.cudnn_enabled:
        logging.warning('cudnn is not available')

    # get input and output dimension info
    with open(args.valid_label, 'rb') as f:
        valid_json = json.load(f)['utts']
    utts = list(valid_json.keys())
    idim = int(valid_json[utts[0]]['idim'])
    odim = int(valid_json[utts[0]]['odim'])
    logging.info('#input dims : ' + str(idim))
    logging.info('#output dims: ' + str(odim))

    # check attention type
    if args.atype not in ['noatt', 'dot', 'location']:
        raise NotImplementedError(
            'chainer supports only noatt, dot, and location attention.')

    # specify attention, CTC, hybrid mode
    if args.mtlalpha == 1.0:
        mtl_mode = 'ctc'
        logging.info('Pure CTC mode')
    elif args.mtlalpha == 0.0:
        mtl_mode = 'att'
        logging.info('Pure attention mode')
    else:
        mtl_mode = 'mtl'
        logging.info('Multitask learning mode')

    # specify model architecture
    e2e = E2E(idim, odim, args)
    model = Loss(e2e, args.mtlalpha)

    # write model config
    if not os.path.exists(args.outdir):
        os.makedirs(args.outdir)
    model_conf = args.outdir + '/model.conf'
    with open(model_conf, 'wb') as f:
        logging.info('writing a model config file to ' + model_conf)
        # TODO(watanabe) use others than pickle, possibly json, and save as a text
        pickle.dump((idim, odim, args), f)
    for key in sorted(vars(args).keys()):
        logging.info('ARGS: ' + key + ': ' + str(vars(args)[key]))

    # Set gpu
    ngpu = args.ngpu
    if ngpu == 1:
        gpu_id = 0
        # Make a specified GPU current
        chainer.cuda.get_device_from_id(gpu_id).use()
        model.to_gpu()  # Copy the model to the GPU
        logging.info('single gpu calculation.')
    elif ngpu > 1:
        gpu_id = 0
        # device map for the parallel updater: GPU 0 is 'main',
        # the remaining GPUs are 'sub_<id>'
        devices = {'main': gpu_id}
        for gid in six.moves.xrange(1, ngpu):
            devices['sub_%d' % gid] = gid
        logging.info('multi gpu calculation (#gpus = %d).' % ngpu)
        logging.info('batch size is automatically increased (%d -> %d)' %
                     (args.batch_size, args.batch_size * args.ngpu))
    else:
        gpu_id = -1
        logging.info('cpu calculation')

    # Setup an optimizer
    if args.opt == 'adadelta':
        optimizer = chainer.optimizers.AdaDelta(eps=args.eps)
    elif args.opt == 'adam':
        optimizer = chainer.optimizers.Adam()
    else:
        # fail with a clear message instead of a NameError on `optimizer`
        raise NotImplementedError('unknown optimizer: ' + str(args.opt))
    optimizer.setup(model)
    optimizer.add_hook(chainer.optimizer.GradientClipping(args.grad_clip))

    # read json data
    # (valid_json was already loaded above and has not been modified, so it
    # is reused instead of being read from disk a second time)
    with open(args.train_label, 'rb') as f:
        train_json = json.load(f)['utts']

    # prepare Kaldi reader
    train_reader = lazy_io.read_dict_scp(args.train_feat)
    valid_reader = lazy_io.read_dict_scp(args.valid_feat)

    # set up training iterator and updater
    if ngpu <= 1:
        # make minibatch list (variable length)
        train = make_batchset(train_json, args.batch_size, args.maxlen_in,
                              args.maxlen_out, args.minibatches)
        # hack to make batchsize argument as 1
        # actual batchsize is included in a list
        train_iter = chainer.iterators.SerialIterator(train, 1)

        # set up updater
        updater = ChainerSeqUpdaterKaldi(train_iter, optimizer, train_reader,
                                         gpu_id)
    else:
        # set up minibatches
        train_subsets = []
        for gid in six.moves.xrange(ngpu):
            # make subset: utterances are dealt round-robin to the GPUs
            # (`.items()` works on both Python 2 and 3, unlike `.viewitems()`)
            train_json_subset = {
                k: v
                for i, (k, v) in enumerate(train_json.items())
                if i % ngpu == gid
            }
            # make minibatch list (variable length)
            train_subsets += [
                make_batchset(train_json_subset, args.batch_size,
                              args.maxlen_in, args.maxlen_out,
                              args.minibatches)
            ]

        # each subset must have same length for MultiprocessParallelUpdater
        maxlen = max([len(train_subset) for train_subset in train_subsets])
        for train_subset in train_subsets:
            if maxlen != len(train_subset):
                # pad by repeating minibatches from the start of the subset
                for i in six.moves.xrange(maxlen - len(train_subset)):
                    train_subset += [train_subset[i]]

        # hack to make batchsize argument as 1
        # actual batchsize is included in a list
        train_iters = [
            chainer.iterators.MultiprocessIterator(train_subsets[gid],
                                                   1,
                                                   n_processes=1)
            for gid in six.moves.xrange(ngpu)
        ]

        # set up updater
        updater = ChainerMultiProcessParallelUpdaterKaldi(
            train_iters, optimizer, train_reader, devices)

    # Set up a trainer
    trainer = training.Trainer(updater, (args.epochs, 'epoch'),
                               out=args.outdir)

    # Resume from a snapshot
    if args.resume:
        chainer.serializers.load_npz(args.resume, trainer)

    # set up validation iterator
    valid = make_batchset(valid_json, args.batch_size, args.maxlen_in,
                          args.maxlen_out, args.minibatches)
    valid_iter = chainer.iterators.SerialIterator(valid,
                                                  1,
                                                  repeat=False,
                                                  shuffle=False)

    # Evaluate the model with the test dataset for each epoch
    trainer.extend(
        ChainerSeqEvaluaterKaldi(valid_iter,
                                 model,
                                 valid_reader,
                                 device=gpu_id))

    # Save attention weight each epoch
    if args.num_save_attention > 0 and args.mtlalpha != 1.0:
        # list(...) is required on Python 3, where items() is not sliceable
        data = sorted(list(valid_json.items())[:args.num_save_attention],
                      key=lambda x: int(x[1]['ilen']),
                      reverse=True)
        data = converter_kaldi(data, valid_reader)
        trainer.extend(PlotAttentionReport(model, data,
                                           args.outdir + "/att_ws"),
                       trigger=(1, 'epoch'))

    # Take a snapshot for each specified epoch
    trainer.extend(extensions.snapshot(), trigger=(1, 'epoch'))

    # Make a plot for training and validation values
    trainer.extend(
        extensions.PlotReport(
            [
                'main/loss', 'validation/main/loss', 'main/loss_ctc',
                'validation/main/loss_ctc', 'main/loss_att',
                'validation/main/loss_att'
            ],
            'epoch',
            file_name='loss.png'))
    trainer.extend(
        extensions.PlotReport(['main/acc', 'validation/main/acc'],
                              'epoch',
                              file_name='acc.png'))

    # Save best models
    trainer.extend(
        extensions.snapshot_object(model, 'model.loss.best'),
        trigger=training.triggers.MinValueTrigger('validation/main/loss'))
    # compare strings by value; `is not` on a literal tests object identity
    if mtl_mode != 'ctc':
        trainer.extend(
            extensions.snapshot_object(model, 'model.acc.best'),
            trigger=training.triggers.MaxValueTrigger('validation/main/acc'))

    # epsilon decay in the optimizer
    if args.opt == 'adadelta':
        if args.criterion == 'acc' and mtl_mode != 'ctc':
            # each extension gets its own trigger instance
            trainer.extend(
                restore_snapshot(model, args.outdir + '/model.acc.best'),
                trigger=CompareValueTrigger(
                    'validation/main/acc',
                    lambda best_value, current_value:
                    best_value > current_value))
            trainer.extend(
                adadelta_eps_decay(args.eps_decay),
                trigger=CompareValueTrigger(
                    'validation/main/acc',
                    lambda best_value, current_value:
                    best_value > current_value))
        elif args.criterion == 'loss':
            trainer.extend(
                restore_snapshot(model, args.outdir + '/model.loss.best'),
                trigger=CompareValueTrigger(
                    'validation/main/loss',
                    lambda best_value, current_value:
                    best_value < current_value))
            trainer.extend(
                adadelta_eps_decay(args.eps_decay),
                trigger=CompareValueTrigger(
                    'validation/main/loss',
                    lambda best_value, current_value:
                    best_value < current_value))

    # Write a log of evaluation statistics for each epoch
    trainer.extend(extensions.LogReport(trigger=(100, 'iteration')))
    report_keys = [
        'epoch', 'iteration', 'main/loss', 'main/loss_ctc', 'main/loss_att',
        'validation/main/loss', 'validation/main/loss_ctc',
        'validation/main/loss_att', 'main/acc', 'validation/main/acc',
        'elapsed_time'
    ]
    if args.opt == 'adadelta':
        # also report the current AdaDelta epsilon
        trainer.extend(
            extensions.observe_value(
                'eps',
                lambda trainer: trainer.updater.get_optimizer('main').eps),
            trigger=(100, 'iteration'))
        report_keys.append('eps')
    trainer.extend(extensions.PrintReport(report_keys),
                   trigger=(100, 'iteration'))

    trainer.extend(extensions.ProgressBar())

    # Run the training
    trainer.run()
    train_iter = iterators.SerialIterator(train, batch_size)
    z_iter = iterators.RandomNoiseIterator(UniformNoiseGenerator(-1, 1, nz),
                                           batch_size)

    optimizer_generator = optimizers.Adam(alpha=1e-4, beta1=0.5)
    optimizer_discriminator = optimizers.Adam(alpha=1e-4, beta1=0.5)
    optimizer_denoiser = optimizers.Adam(alpha=1e-4, beta1=0.5)

    optimizer_generator.setup(Generator())
    optimizer_discriminator.setup(Discriminator())
    optimizer_denoiser.setup(Denoiser())

    updater = updater.GenerativeAdversarialUpdater(
        iterator=train_iter,
        noise_iterator=z_iter,
        optimizer_generator=optimizer_generator,
        optimizer_discriminator=optimizer_discriminator,
        optimizer_denoiser=optimizer_denoiser,
        lambda_denoise=lambda_denoise,
        lambda_adv=lambda_adv,
        device=gpu)

    trainer = training.Trainer(updater, stop_trigger=(epochs, 'epoch'))
    trainer.extend(extensions.LogReport(trigger=(1, 'iteration')))
    trainer.extend(
        extensions.PrintReport(
            ['epoch', 'iteration', 'gen/loss', 'dis/loss', 'denoiser/loss']))
    trainer.extend(extensions.ProgressBar())
    trainer.extend(extensions.GeneratorSample(), trigger=(1, 'epoch'))
    trainer.run()
Esempio n. 14
0
    optimizer.use_cleargrads()
    optimizer.setup(model)
    optimizer.add_hook(chainer.optimizer.WeightDecay(1e-4))

    if 'Imdb' in args.dataset:
        train_iter = chainer.iterators.SerialIterator(train, args.batchsize)
        test_iter = chainer.iterators.SerialIterator(test,
                                                     1,
                                                     repeat=False,
                                                     shuffle=False)
    elif 'nli' in args.dataset:
        train_iter = train
        test_iter = test

    updater = training.StandardUpdater(train_iter, optimizer, device=args.gpu)
    trainer = training.Trainer(updater, (args.epoch, 'epoch'), out=args.out)

    trainer.extend(TestModeEvaluator(test_iter, model, device=args.gpu))
    #trainer.extend(extensions.ExponentialShift('lr', 0.5), trigger=(25, 'epoch'))
    trainer.extend(extensions.snapshot(), trigger=(args.epoch, 'epoch'))
    trainer.extend(extensions.LogReport())
    trainer.extend(
        extensions.LogReport(postprocess=store_model, trigger=(1, 'epoch')))
    trainer.extend(
        extensions.PrintReport([
            'epoch', 'main/loss', 'validation/main/loss', 'main/accuracy',
            'validation/main/accuracy'
        ]))
    trainer.extend(extensions.ProgressBar(update_interval=8))

    if args.resume:
Esempio n. 15
0
# Training-script fragment: builds a Chainer trainer that runs for
# `epoch_size * max_epoch` iterations and logs, snapshots and evaluates once
# every `epoch_size` iterations. `model`, `train_iter`, `test_iter`, `gpu_id`,
# `mode`, `depth`, `args` and `epoch_size` are defined outside this fragment.
max_epoch = 15

# Wrap the network in a Classifier link; the wrapped network stays reachable
# as `model.predictor` (used for the model-only snapshot below).
model = L.Classifier(model)

optimizer = optimizers.MomentumSGD(lr=0.01, momentum=0.9)

optimizer.setup(model)

updater = training.updaters.StandardUpdater(train_iter,
                                            optimizer,
                                            device=gpu_id)

# The output directory name encodes the mode, a depth figure derived from
# `depth`, and the validation setting.
result_dir = '../results/hardtopic_{}_100_depth{}_valid{}'.format(
    mode,
    sum(depth) * 2 + 1, args.valid)
# The stop condition is expressed in iterations rather than epochs.
trainer = training.Trainer(updater, (epoch_size * max_epoch, 'iteration'),
                           out=result_dir)

from chainer.training import extensions

# Every extension below fires once per `epoch_size` iterations.
trainer.extend(extensions.LogReport(trigger=(epoch_size, 'iteration')))
# Snapshot of the whole trainer.
trainer.extend(
    extensions.snapshot(filename='snapshot_iteration-{.updater.iteration}'),
    trigger=(epoch_size, 'iteration'))
# Snapshot of the bare predictor only.
trainer.extend(extensions.snapshot_object(
    model.predictor, filename='model_iteration-{.updater.iteration}'),
               trigger=(epoch_size, 'iteration'))
trainer.extend(extensions.Evaluator(test_iter, model, device=gpu_id),
               trigger=(epoch_size, 'iteration'))
trainer.extend(extensions.observe_lr(), trigger=(epoch_size, 'iteration'))
trainer.extend(extensions.PrintReport([
    'iteration', 'lr', 'main/accuracy', 'validation/main/accuracy',
Esempio n. 16
0
def main():
    """Train a convnet on an ILSVRC2012 image-label list.

    Parses the command line, builds the selected architecture (optionally
    initialised from a file and moved to a GPU), creates the training and
    validation iterators, registers the trainer extensions and runs the
    training loop, optionally resuming from a trainer snapshot.
    """
    arch_table = {
        'alex': alex.Alex,
        'alex_fp16': alex.AlexFp16,
        'googlenet': googlenet.GoogLeNet,
        'googlenetbn': googlenetbn.GoogLeNetBN,
        'googlenetbn_fp16': googlenetbn.GoogLeNetBNFp16,
        'nin': nin.NIN,
        'resnet50': resnet50.ResNet50,
        'resnext50': resnet50.ResNeXt50,
    }

    cli = argparse.ArgumentParser(
        description='Learning convnet from ILSVRC2012 dataset')
    # (name-or-flags, keyword options), listed in registration order.
    argument_table = (
        (('train',),
         {'help': 'Path to training image-label list file'}),
        (('val',),
         {'help': 'Path to validation image-label list file'}),
        (('--arch', '-a'),
         {'choices': arch_table.keys(),
          'default': 'nin',
          'help': 'Convnet architecture'}),
        (('--batchsize', '-B'),
         {'type': int,
          'default': 32,
          'help': 'Learning minibatch size'}),
        (('--epoch', '-E'),
         {'type': int,
          'default': 10,
          'help': 'Number of epochs to train'}),
        (('--gpu', '-g'),
         {'type': int,
          'default': -1,
          'help': 'GPU ID (negative value indicates CPU'}),
        (('--initmodel',),
         {'help': 'Initialize the model from given file'}),
        (('--loaderjob', '-j'),
         {'type': int,
          'help': 'Number of parallel data loading processes'}),
        (('--mean', '-m'),
         {'default': 'mean.npy',
          'help': 'Mean file (computed by compute_mean.py)'}),
        (('--resume', '-r'),
         {'default': '',
          'help': 'Initialize the trainer from given file'}),
        (('--out', '-o'),
         {'default': 'result',
          'help': 'Output directory'}),
        (('--root', '-R'),
         {'default': '.',
          'help': 'Root directory path of image files'}),
        (('--val_batchsize', '-b'),
         {'type': int,
          'default': 250,
          'help': 'Validation minibatch size'}),
        (('--test',),
         {'action': 'store_true'}),
    )
    for flags, options in argument_table:
        cli.add_argument(*flags, **options)
    cli.set_defaults(test=False)
    args = cli.parse_args()

    # Build the network, optionally load weights, optionally move to the GPU.
    model = arch_table[args.arch]()
    if args.initmodel:
        print('Load model from', args.initmodel)
        chainer.serializers.load_npz(args.initmodel, model)
    if args.gpu >= 0:
        # Make the requested GPU the current device before copying the model.
        chainer.cuda.get_device_from_id(args.gpu).use()
        model.to_gpu()

    # Datasets share the mean file and the model's input size; the trailing
    # False is passed only for the validation dataset.
    mean = np.load(args.mean)
    train_set = PreprocessedDataset(args.train, args.root, mean,
                                    model.insize)
    val_set = PreprocessedDataset(args.val, args.root, mean, model.insize,
                                  False)
    # Both iterators load images in worker processes that run in parallel
    # with training/validation.
    train_iter = chainer.iterators.MultiprocessIterator(
        train_set, args.batchsize, n_processes=args.loaderjob)
    val_iter = chainer.iterators.MultiprocessIterator(
        val_set,
        args.val_batchsize,
        repeat=False,
        n_processes=args.loaderjob)

    # Optimizer
    optimizer = chainer.optimizers.MomentumSGD(lr=0.01, momentum=0.9)
    optimizer.setup(model)

    # Trainer
    updater = training.updaters.StandardUpdater(train_iter,
                                                optimizer,
                                                device=args.gpu)
    trainer = training.Trainer(updater, (args.epoch, 'epoch'), args.out)

    # With --test every extension fires on each iteration.
    if args.test:
        val_every, log_every = 1, 1
    else:
        val_every, log_every = 100000, 1000
    val_interval = (val_every, 'iteration')
    log_interval = (log_every, 'iteration')

    evaluator = extensions.Evaluator(val_iter, model, device=args.gpu)
    trainer.extend(evaluator, trigger=val_interval)
    trainer.extend(extensions.dump_graph('main/loss'))
    trainer.extend(extensions.snapshot(), trigger=val_interval)
    model_snapshot = extensions.snapshot_object(
        model, 'model_iter_{.updater.iteration}')
    trainer.extend(model_snapshot, trigger=val_interval)
    # The interval goes directly to LogReport: it decides when the log is
    # emitted, not when observations are read.
    trainer.extend(extensions.LogReport(trigger=log_interval))
    trainer.extend(extensions.observe_lr(), trigger=log_interval)
    report_columns = [
        'epoch', 'iteration', 'main/loss', 'validation/main/loss',
        'main/accuracy', 'validation/main/accuracy', 'lr'
    ]
    trainer.extend(extensions.PrintReport(report_columns),
                   trigger=log_interval)
    trainer.extend(extensions.ProgressBar(update_interval=10))

    # Restore the trainer state from a snapshot when requested.
    if args.resume:
        chainer.serializers.load_npz(args.resume, trainer)

    trainer.run()
Esempio n. 17
0
def main():
    """Train a binary classifier on paired-SMILES data and save the model.

    Steps, in order: parse the command-line arguments, parse
    ``args.datafile`` with ``CSVFileParserForPair`` (columns ``smiles_1`` and
    ``smiles_2``), split the dataset randomly into training and validation
    parts, build the predictor and wrap it in a ``Classifier`` trained with
    sigmoid cross entropy, run the chainer ``Trainer`` with accuracy /
    ROC-AUC / PRC-AUC / F1 evaluators on both splits, and finally pickle the
    classifier to ``args.out/args.model_filename``.

    Raises:
        ValueError: If no target label was given on the command line.
    """
    # Parse the arguments.
    args = parse_arguments()

    if not args.label:
        raise ValueError('No target label was specified.')
    labels = args.label
    class_num = len(labels) if isinstance(labels, list) else 1

    # Dataset preparation. Labels are converted to an int32 array.
    def postprocess_label(label_list):
        return np.asarray(label_list, dtype=np.int32)

    # Apply a preprocessor to the dataset.
    print('Preprocessing dataset...')
    preprocessor = preprocess_method_dict[args.method]()
    parser = CSVFileParserForPair(preprocessor,
                                  postprocess_label=postprocess_label,
                                  labels=labels,
                                  smiles_cols=['smiles_1', 'smiles_2'])
    dataset = parser.parse(args.datafile)['dataset']

    # Split the dataset into training and validation.
    train_data_size = int(len(dataset) * args.train_data_ratio)
    train, val = split_dataset_random(dataset, train_data_size, args.seed)

    # Hidden layer sizes arrive as a comma-separated string; an empty or
    # missing value means "no hidden layers".
    if args.net_hidden_dims:
        net_hidden_dims = tuple(
            int(dim) for dim in args.net_hidden_dims.split(','))
    else:
        net_hidden_dims = ()

    # Set up the predictor.
    predictor = set_up_predictor(method=args.method,
                                 fp_hidden_dim=args.fp_hidden_dim,
                                 fp_out_dim=args.fp_out_dim,
                                 conv_layers=args.conv_layers,
                                 concat_hidden=args.concat_hidden,
                                 fp_dropout_rate=args.fp_dropout_rate,
                                 net_hidden_dims=net_hidden_dims,
                                 class_num=class_num,
                                 sim_method=args.sim_method)

    # Set up the iterators.
    train_iter = SerialIterator(train, args.batchsize)
    val_iter = SerialIterator(val, args.batchsize, repeat=False, shuffle=False)

    metrics_fun = {'accuracy': F.binary_accuracy}
    classifier = Classifier(predictor,
                            lossfun=F.sigmoid_cross_entropy,
                            metrics_fun=metrics_fun,
                            device=args.gpu)

    # Set up the optimizer.
    optimizer = optimizers.Adam(alpha=args.learning_rate,
                                weight_decay_rate=args.weight_decay_rate)
    optimizer.setup(classifier)

    # Set up the updater.
    updater = training.StandardUpdater(train_iter,
                                       optimizer,
                                       device=args.gpu,
                                       converter=concat_mols)

    # Set up the trainer.
    print('Training...')
    trainer = training.Trainer(updater, (args.epoch, 'epoch'), out=args.out)

    # Validation loss; registered under the default name 'validation'.
    trainer.extend(
        E.Evaluator(val_iter,
                    classifier,
                    device=args.gpu,
                    converter=concat_mols))

    # Non-repeating, unshuffled pass over the training set for the metric
    # evaluators below.
    train_eval_iter = SerialIterator(train,
                                     args.batchsize,
                                     repeat=False,
                                     shuffle=False)

    # Each metric is evaluated on the training split ('train_<suffix>') and
    # on the validation split ('val_<suffix>'). The extension name
    # 'validation' is already used by `Evaluator`, hence the 'val_' prefix.
    # Only the training-split evaluators pass raise_value_error=False.
    # PrecisionEvaluator / RecallEvaluator (suffixes 'p' / 'r') are
    # intentionally not registered.
    metric_evaluators = (
        (AccuracyEvaluator, 'acc'),
        (ROCAUCEvaluator, 'roc'),
        (PRCAUCEvaluator, 'prc'),
        (F1Evaluator, 'f'),
    )
    for evaluator_cls, suffix in metric_evaluators:
        trainer.extend(
            evaluator_cls(train_eval_iter,
                          classifier,
                          eval_func=predictor,
                          device=args.gpu,
                          converter=concat_mols,
                          name='train_' + suffix,
                          pos_labels=1,
                          ignore_labels=-1,
                          raise_value_error=False))
        trainer.extend(
            evaluator_cls(val_iter,
                          classifier,
                          eval_func=predictor,
                          device=args.gpu,
                          converter=concat_mols,
                          name='val_' + suffix,
                          pos_labels=1,
                          ignore_labels=-1))

    # Shift Adam's alpha by args.exp_shift_rate at epochs 10, 20, 30, 40, 50.
    trainer.extend(E.ExponentialShift('alpha', args.exp_shift_rate),
                   trigger=triggers.ManualScheduleTrigger([10, 20, 30, 40, 50],
                                                          'epoch'))
    # Observation of the learning rate.
    trainer.extend(E.observe_lr(), trigger=(1, 'iteration'))

    entries = [
        'epoch',
        'main/loss',
        'train_acc/main/accuracy',
        'train_roc/main/roc_auc',
        'train_prc/main/prc_auc',
        'train_f/main/f1',
        'validation/main/loss',
        'val_acc/main/accuracy',
        'val_roc/main/roc_auc',
        'val_prc/main/prc_auc',
        'val_f/main/f1',
        'lr',
        'elapsed_time'
    ]
    trainer.extend(E.PrintReport(entries=entries))
    # A single snapshot is taken when the final epoch completes.
    trainer.extend(E.snapshot(), trigger=(args.epoch, 'epoch'))
    trainer.extend(E.LogReport())
    trainer.extend(E.ProgressBar())

    if args.resume:
        # The snapshot name is interpreted relative to the output directory.
        resume_path = os.path.join(args.out, args.resume)
        logging.info(
            'Resume training according to snapshot in {}'.format(resume_path))
        chainer.serializers.load_npz(resume_path, trainer)

    trainer.run()

    # Save the trained classifier.
    model_path = os.path.join(args.out, args.model_filename)
    print('Saving the trained models to {}...'.format(model_path))
    classifier.save_pickle(model_path, protocol=args.protocol)
Esempio n. 18
0
    discriminator=discriminator,
    device=config.train.gpu,
    iterator=train_iter,
    optimizer=opts,
    converter=converter,
)

# Trainer setup: every trigger below is expressed in iterations.
trigger_log = (config.train.log_iteration, 'iteration')
trigger_snapshot = (config.train.snapshot_iteration, 'iteration')
if config.train.stop_iteration is None:
    # No iteration limit configured: the trainer receives stop_trigger=None.
    trigger_stop = None
else:
    trigger_stop = (config.train.stop_iteration, 'iteration')

trainer = training.Trainer(
    updater,
    stop_trigger=trigger_stop,
    out=arguments.output,
)
tb_writer = SummaryWriter(Path(arguments.output))

# Register one evaluator per split, both driven by the updater's forward
# pass and fired on the logging trigger. `ext` ends up bound to the 'train'
# evaluator, as it would with two explicit assignments.
for _eval_name, _eval_iter in (('test', test_iter),
                               ('train', train_eval_iter)):
    ext = extensions.Evaluator(
        _eval_iter,
        models,
        converter,
        device=config.train.gpu,
        eval_func=updater.forward,
    )
    trainer.extend(ext, name=_eval_name, trigger=trigger_log)
Esempio n. 19
0
# Batches are assembled with concat_examples, padding with 0.
converter = partial(convert.concat_examples, padding=0)

updater = Updater(
    iterator=train_iter,
    optimizer=opts,
    converter=converter,
    device=config.train.gpu,
    predictor=predictor,
    discriminator=discriminator,
    loss_config=config.loss,
)

# Trainer setup: triggers are expressed in iterations.
trigger_log = (config.train.log_iteration, 'iteration')
trigger_snapshot = (config.train.snapshot_iteration, 'iteration')

trainer = training.Trainer(updater, out=arguments.output)

# Register one evaluator per split, both driven by the updater's forward
# pass and fired on the logging trigger. `ext` ends up bound to the 'train'
# evaluator, as it would with two explicit assignments.
for _eval_name, _eval_iter in (('test', test_iter),
                               ('train', train_eval_iter)):
    ext = extensions.Evaluator(
        _eval_iter,
        models,
        converter,
        device=config.train.gpu,
        eval_func=updater.forward,
    )
    trainer.extend(ext, name=_eval_name, trigger=trigger_log)

# Dump the computational graph rooted at the 'predictor/loss' variable.
trainer.extend(extensions.dump_graph('predictor/loss'))
Esempio n. 20
0
# Debug output: show the sample stored at index 499.
print(trainDataset[499])
trainIter = iterators.SerialIterator(trainDataset, batchsize)

model = VGG16.VGG16()

# A negative gpuId keeps the model on the CPU.
if gpuId >= 0:
    model.to_gpu(gpuId)

# NOTE(review): the optimizer is set up on the bare VGG16 network *before*
# it is wrapped in L.Classifier, so the optimizer's target is the unwrapped
# network rather than the classifier -- confirm this is intended.
optimizer = chainer.optimizers.MomentumSGD()
optimizer.setup(model)
# Exclude the `base` sub-link from parameter updates.
model.base.disable_update()
model = L.Classifier(model)

updater = training.StandardUpdater(trainIter, optimizer, device=gpuId)
trainer = training.Trainer(
    updater,
    (maxEpoch, 'epoch'),
    out='/home/yusuke/dataset/style-color-images/result',
)

# Logging, snapshots, console report and a plot of 'l1/W/data/std',
# registered in this order.
for _extension in (
        extensions.LogReport(),
        extensions.snapshot(filename='snapshot_epoch-{.updater.epoch}'),
        extensions.PrintReport([
            'epoch', 'main/loss', 'main/accuracy', 'val/main/loss',
            'val/main/accuracy', 'l1/W/data/std', 'elapsed_time'
        ]),
        extensions.PlotReport(['l1/W/data/std'],
                              x_key='epoch',
                              file_name='std.png'),
):
    trainer.extend(_extension)
trainer.extend(
    extensions.PlotReport(['main/loss', 'val/main/loss'],
                          x_key='epoch',
Esempio n. 21
0
# Register the neural network: MyChain wrapped in a softmax cross-entropy
# classifier, optimized with Adam.
model = L.Classifier(MyChain(), lossfun=F.softmax_cross_entropy)
optimizer = chainer.optimizers.Adam()
optimizer.setup(model)

# Define the iterators.
batchsize = 4
# Training iterator.
train_iter = chainer.iterators.SerialIterator(train, batchsize)
# Evaluation iterator: a single ordered pass over the test set.
test_iter = chainer.iterators.SerialIterator(
    test, batchsize, repeat=False, shuffle=False)

# Register the updater.
updater = training.StandardUpdater(train_iter, optimizer)

# Register the trainer.
epoch = 1000
trainer = training.Trainer(updater, (epoch, 'epoch'))

# Show and record training progress: log, evaluation on the test iterator,
# and a console report, registered in this order.
for _extension in (
        extensions.LogReport(),
        extensions.Evaluator(test_iter, model),
        extensions.PrintReport([
            'epoch',
            'main/loss',
            'validation/main/loss',
            'main/accuracy',
            'validation/main/accuracy',
            'elapsed_time',
        ]),
):
    trainer.extend(_extension)

# Start training.
trainer.run()
def _load_kmnist_split(images_path, labels_path):
    """Load one KMNIST split from its image and label ``.npz`` archives.

    Images are read from the ``arr_0`` entry, flattened to rows of 784
    values and scaled to float32 in ``[0, 1]`` by dividing by 255. Labels are
    read from the ``arr_0`` entry of the label archive as Python ints.

    Args:
        images_path: Path to the ``.npz`` archive holding the images.
        labels_path: Path to the ``.npz`` archive holding the labels.

    Returns:
        A ``TupleDataset`` of (image vector, label) pairs.
    """
    # Context managers close the archives as soon as the arrays are read.
    with np.load(images_path) as archive:
        # -1 lets numpy infer the sample count instead of hard-coding it.
        images = archive['arr_0'].reshape((-1, 784)).astype(np.float32) / 255.
    with np.load(labels_path) as archive:
        labels = [int(n) for n in archive['arr_0']]
    return TupleDataset(images, labels)


def main():
    """Train an MLP on KMNIST with data-parallel computation on two GPUs.

    Parses the command-line options, loads the train/test splits from
    ``.npz`` files, trains with ``ParallelUpdater`` on ``--gpu0`` and
    ``--gpu1``, and optionally resumes the trainer from ``--resume``.
    """
    # This script is almost identical to train_mnist.py. The only difference is
    # that this script uses data-parallel computation on two GPUs.
    # See train_mnist.py for more details.
    parser = argparse.ArgumentParser(description='Chainer example: MNIST')
    parser.add_argument('--batchsize',
                        '-b',
                        type=int,
                        default=400,
                        help='Number of images in each mini-batch')
    parser.add_argument('--epoch',
                        '-e',
                        type=int,
                        default=20,
                        help='Number of sweeps over the dataset to train')
    parser.add_argument('--gpu0',
                        '-g',
                        type=int,
                        default=0,
                        help='First GPU ID')
    parser.add_argument('--gpu1',
                        '-G',
                        type=int,
                        default=1,
                        help='Second GPU ID')
    parser.add_argument('--out',
                        '-o',
                        default='result_parallel',
                        help='Directory to output the result')
    parser.add_argument('--resume',
                        '-r',
                        default='',
                        help='Resume the training from snapshot')
    parser.add_argument('--unit',
                        '-u',
                        type=int,
                        default=1000,
                        help='Number of units')
    parser.add_argument('--train_imgs',
                        default='data/kmnist-train-imgs.npz',
                        help='Path to kmnist training images')
    parser.add_argument('--train_label',
                        default='data/kmnist-train-labels.npz',
                        help='Path to kmnist training labels')
    parser.add_argument('--test_imgs',
                        default='data/kmnist-test-imgs.npz',
                        help='Path to kmnist test images')
    parser.add_argument('--test_label',
                        default='data/kmnist-test-labels.npz',
                        help='Path to kmnist test labels')

    args = parser.parse_args()

    print('GPU: {}, {}'.format(args.gpu0, args.gpu1))
    print('# unit: {}'.format(args.unit))
    print('# Minibatch-size: {}'.format(args.batchsize))
    print('# epoch: {}'.format(args.epoch))
    print('')

    chainer.backends.cuda.get_device_from_id(args.gpu0).use()

    model = L.Classifier(train_kmnist.MLP(args.unit, 10))
    optimizer = chainer.optimizers.Adam()
    optimizer.setup(model)

    # Load and prepare the KMNIST dataset.
    train = _load_kmnist_split(args.train_imgs, args.train_label)
    test = _load_kmnist_split(args.test_imgs, args.test_label)

    train_iter = chainer.iterators.SerialIterator(train, args.batchsize)
    test_iter = chainer.iterators.SerialIterator(test,
                                                 args.batchsize,
                                                 repeat=False,
                                                 shuffle=False)

    # ParallelUpdater implements the data-parallel gradient computation on
    # multiple GPUs. It accepts "devices" argument that specifies which GPU to
    # use.
    updater = training.updaters.ParallelUpdater(
        train_iter,
        optimizer,
        # The device named 'main' acts as the primary device; names other
        # than 'main' are arbitrary.
        devices={
            'main': args.gpu0,
            'second': args.gpu1
        },
    )
    trainer = training.Trainer(updater, (args.epoch, 'epoch'), out=args.out)

    trainer.extend(extensions.Evaluator(test_iter, model, device=args.gpu0))
    trainer.extend(extensions.dump_graph('main/loss'))
    # A single snapshot is taken when the final epoch completes.
    trainer.extend(extensions.snapshot(), trigger=(args.epoch, 'epoch'))
    trainer.extend(extensions.LogReport())
    trainer.extend(
        extensions.PrintReport([
            'epoch', 'main/loss', 'validation/main/loss', 'main/accuracy',
            'validation/main/accuracy', 'elapsed_time'
        ]))
    trainer.extend(extensions.ProgressBar())

    if args.resume:
        chainer.serializers.load_npz(args.resume, trainer)

    trainer.run()
Esempio n. 23
0
def main():
    """Train a DenseNet classifier on a CIFAR dataset variant.

    Parses the command-line options, loads the dataset selected by
    ``--dataset``, builds the model and a Nesterov optimizer with weight
    decay, registers evaluation / snapshot / reporting extensions plus a
    stepwise learning-rate schedule, optionally resumes from ``--resume``,
    and runs the trainer.
    """
    # define options
    parser = argparse.ArgumentParser(
        description='Training script of DenseNet on CIFAR-10 dataset')
    parser.add_argument('--epoch',
                        '-e',
                        type=int,
                        default=300,
                        help='Number of epochs to train')
    parser.add_argument('--gpu',
                        '-g',
                        type=int,
                        default=-1,
                        help='GPU ID (negative value indicates CPU)')
    parser.add_argument('--initmodel',
                        help='Initialize the model from given file')
    parser.add_argument('--resume',
                        '-r',
                        default='',
                        help='Initialize the trainer from given file')
    parser.add_argument('--out',
                        '-o',
                        default='result',
                        help='Output directory')
    # The same batch size drives both the training and the test iterator.
    parser.add_argument('--batchsize',
                        '-b',
                        type=int,
                        default=64,
                        help='Minibatch size for training and validation')
    parser.add_argument('--numlayers',
                        '-L',
                        type=int,
                        default=40,
                        help='Number of layers')
    parser.add_argument('--growth',
                        '-G',
                        type=int,
                        default=12,
                        help='Growth rate parameter')
    parser.add_argument('--dropout',
                        '-D',
                        type=float,
                        default=0.2,
                        help='Dropout ratio')
    parser.add_argument('--dataset',
                        type=str,
                        default='C10',
                        choices=('C10', 'C10+', 'C100', 'C100+'),
                        help='Dataset used for training (Default is C10)')
    args = parser.parse_args()

    # load dataset; argparse `choices` guarantees the key is present
    loaders = {
        'C10': dataset.get_C10,
        'C10+': dataset.get_C10_plus,
        'C100': dataset.get_C100,
        'C100+': dataset.get_C100_plus,
    }
    train, test = loaders[args.dataset]()

    train_iter = chainer.iterators.MultiprocessIterator(train, args.batchsize)
    test_iter = chainer.iterators.MultiprocessIterator(test,
                                                       args.batchsize,
                                                       repeat=False,
                                                       shuffle=False)

    # setup model
    # The C100 variants need 100 output classes instead of a fixed 10.
    # NOTE(review): assumes the last positional argument of DenseNet is the
    # number of output classes -- confirm against its definition.
    n_class = 100 if args.dataset.startswith('C100') else 10
    model = L.Classifier(
        DenseNet(args.numlayers, args.growth, 16, args.dropout, n_class))

    if args.initmodel:
        print('Load model from', args.initmodel)
        chainer.serializers.load_npz(args.initmodel, model)
    if args.gpu >= 0:
        chainer.cuda.get_device(args.gpu).use()
        model.to_gpu()

    # setup optimizer
    optimizer = chainer.optimizers.NesterovAG(lr=0.1, momentum=0.9)
    optimizer.setup(model)
    optimizer.add_hook(chainer.optimizer.WeightDecay(1e-4))

    # setup trainer
    updater = training.StandardUpdater(train_iter, optimizer, device=args.gpu)
    trainer = training.Trainer(updater, (args.epoch, 'epoch'), out=args.out)

    trainer.extend(Evaluator(test_iter, model, device=args.gpu))
    trainer.extend(extensions.dump_graph('main/loss'))
    trainer.extend(extensions.snapshot(), trigger=(10, 'epoch'))
    trainer.extend(
        extensions.snapshot_object(model, 'model_{.updater.epoch}.npz'))
    trainer.extend(extensions.LogReport())
    trainer.extend(
        extensions.PrintReport([
            'epoch', 'main/loss', 'validation/main/loss', 'main/accuracy',
            'validation/main/accuracy'
        ]))
    trainer.extend(extensions.ProgressBar())

    # Schedule lr = 0.01 at 50% and lr = 0.001 at 75% of the total number of
    # training iterations (the initial lr is 0.1).
    iter_per_epoch = math.ceil(len(train) / args.batchsize)
    n_iter1 = int(args.epoch * 0.5 * iter_per_epoch)
    n_iter2 = int(args.epoch * 0.75 * iter_per_epoch)
    shifts = [(n_iter1, 0.01), (n_iter2, 0.001)]
    trainer.extend(StepShift('lr', shifts, optimizer))

    if args.resume:
        chainer.serializers.load_npz(args.resume, trainer)

    # start training
    trainer.run()
Esempio n. 24
0
def train():
    """Train a DAGMM model on the samples labelled 1 in the arrhythmia CSVs.

    Parses the command-line options, loads the explanatory and criterion
    variables from ``./dataset_arrhythmia``, keeps only the rows whose label
    equals 1, splits them 90% / 10% into training and validation data, and
    runs a chainer ``Trainer`` with periodic snapshots, loss plots and
    console reports. When ``--resume`` is given, the model and trainer
    snapshots of that epoch are loaded from the output directory first.
    """
    parser = argparse.ArgumentParser(description='DAGMM')
    parser.add_argument('--gpu', '-g', type=int, default=-1,
                        help='GPU ID (negative value indicates CPU)')
    parser.add_argument('--batchsize', '-b', type=int, default=128,
                        help='Number of images in each mini-batch')
    parser.add_argument('--epoch', '-e', type=int, default=10000,
                        help='Number of sweeps over the dataset to train')
    parser.add_argument('--cn_h_unit', type=int, default=10,
                        help='Number of Compression Network hidden units')
    parser.add_argument('--cn_z_unit', type=int, default=2,
                        help='Number of Compression Network z units')
    parser.add_argument('--en_h_unit', type=int, default=10,
                        help='Number of Estimation Network hidden units')
    parser.add_argument('--en_o_unit', type=int, default=2,
                        help='Number of Estimation Network output units')
    parser.add_argument('--out', '-o', default='result',
                        help='Directory to output the result')
    parser.add_argument('--frequency', '-f', type=int, default=20,
                        help='Frequency of taking a snapshot')
    parser.add_argument(
        '--resume', '-r', type=int,
        help='Resume the training from snapshot that is designated epoch number'
    )
    args = parser.parse_args()

    # Echo the effective configuration.
    print('GPU: {}'.format(args.gpu))
    print('# Minibatch-size: {}'.format(args.batchsize))
    print('# epoch: {}'.format(args.epoch))
    print('# Compression Network: Dim - {0} - {1} - {0} - Dim'.format(
        args.cn_h_unit, args.cn_z_unit))
    print('# Estimation Network: {} - {} - {}'.format(
        args.cn_z_unit + 2, args.en_h_unit, args.en_o_unit))
    print('# Output-directory: {}'.format(args.out))
    print('# Frequency-snapshot: {}'.format(args.frequency))
    if args.resume:
        print('# Resume-epochNumber: {}'.format(args.resume))
    print('')

    # Load the dataset.
    features = np.loadtxt('./dataset_arrhythmia/ExplanatoryVariables.csv',
                          delimiter=',')
    targets = np.loadtxt('./dataset_arrhythmia/CriterionVariables.csv',
                         delimiter=',')

    # Keep only the normal (label == 1) samples.
    healthy = features[targets == 1]

    # Split the normal samples into training (first 90%) and validation
    # (remainder) parts, converting both to float32.
    split_at = math.floor(len(healthy) * 0.9)
    train_data = healthy[:split_at].astype(np.float32)
    valid_data = healthy[split_at:].astype(np.float32)

    train_iter = chainer.iterators.SerialIterator(
        train_data, batch_size=args.batchsize, repeat=True, shuffle=True)
    # The whole validation set is evaluated as one batch.
    valid_iter = chainer.iterators.SerialIterator(
        valid_data, batch_size=len(valid_data), repeat=False, shuffle=False)

    # The input dimension is taken from the first training sample.
    model = DAGMM(args.cn_h_unit, args.cn_z_unit, len(train_data[0]),
                  args.en_h_unit, args.en_o_unit)

    if args.gpu >= 0:
        # Make the specified GPU current and copy the model onto it.
        chainer.backends.cuda.get_device_from_id(args.gpu).use()
        model.to_gpu()

    optimizer = optimizers.Adam(alpha=0.0001)
    optimizer.setup(model)

    if args.resume:
        # Restore the model parameters saved at the requested epoch.
        serializers.load_npz(
            args.out + '/model_snapshot_epoch_' + str(args.resume), model)

    updater = training.StandardUpdater(
        train_iter,
        optimizer,
        device=args.gpu,
        loss_func=model.lossFunc(gpu=args.gpu))
    trainer = training.Trainer(updater, (args.epoch, 'epoch'), out=args.out)

    snapshot_trigger = (args.frequency, 'epoch')

    trainer.extend(
        extensions.Evaluator(valid_iter,
                             model,
                             device=args.gpu,
                             eval_func=model.lossFunc(gpu=args.gpu)))
    trainer.extend(extensions.dump_graph('main/loss'))
    # Trainer, model and optimizer snapshots share the same trigger.
    trainer.extend(
        extensions.snapshot(filename='snapshot_epoch-{.updater.epoch}'),
        trigger=snapshot_trigger)
    for target, pattern in (
            (model, 'model_snapshot_epoch_{.updater.epoch}'),
            (optimizer, 'optimizer_snapshot_epoch_{.updater.epoch}'),
    ):
        trainer.extend(extensions.snapshot_object(target, filename=pattern),
                       trigger=snapshot_trigger)
    # loss1.png plots training and validation loss; loss2.png training only.
    for keys, plot_name in (
            (['main/loss', 'validation/main/loss'], 'loss1.png'),
            (['main/loss'], 'loss2.png'),
    ):
        trainer.extend(
            extensions.PlotReport(keys, x_key='epoch', file_name=plot_name))
    trainer.extend(extensions.LogReport(log_name="log", trigger=(1, 'epoch')))
    trainer.extend(
        extensions.PrintReport(
            ['epoch', 'main/loss', 'validation/main/loss', 'elapsed_time']))
    trainer.extend(extensions.ProgressBar())

    if args.resume:
        # Restore the full trainer state saved at the requested epoch.
        serializers.load_npz(args.out + '/snapshot_epoch-' + str(args.resume),
                             trainer)

    trainer.run()
Esempio n. 25
0
def main():
    """Train the seq2seq model from the command line and save the result.

    Parses the command-line arguments, loads the three vocabularies and the
    parallel training sentences, trains the ``Seq2seq`` model with Adam and,
    once training finishes, writes the model parameters (npz) and a text
    file listing the model's constructor arguments under
    ``cfg.PATH_TO_MODELS``.

    Side effects: rebinds the module globals ``target`` (id -> target word),
    ``target_ids``, ``source_word_ids`` and ``source_char_ids``.
    """
    global target, target_ids, source_word_ids, source_char_ids
    todaydetail = dt.today()
    todaydetailf = todaydetail.strftime("%Y%m%d-%H%M%S")
    print('start at ' + todaydetailf)
    parser = argparse.ArgumentParser(description='Chainer example: seq2seq')
    parser.add_argument('SOURCE', help='source sentence list')
    parser.add_argument('TARGET', help='target sentence list')
    parser.add_argument('SOURCE_WORD_VOCAB',
                        help='source word vocabulary file')
    parser.add_argument('SOURCE_CHAR_VOCAB',
                        help='source char vocabulary file')
    parser.add_argument('TARGET_VOCAB', help='target vocabulary file')
    parser.add_argument('--validation-source',
                        help='source sentence list for validation')
    parser.add_argument('--validation-target',
                        help='target sentence list for validation')
    parser.add_argument('--batchsize',
                        '-b',
                        type=int,
                        default=64,
                        help='number of sentence pairs in each mini-batch')
    parser.add_argument('--epoch',
                        '-e',
                        type=int,
                        default=20,
                        help='number of sweeps over the dataset to train')
    parser.add_argument('--gpu',
                        '-g',
                        type=int,
                        default=-1,
                        help='GPU ID (negative value indicates CPU)')
    parser.add_argument('--resume',
                        '-r',
                        default='',
                        help='resume the training from snapshot')
    parser.add_argument('--unit',
                        '-u',
                        type=int,
                        default=1024,
                        help='number of units')
    parser.add_argument('--layer',
                        '-l',
                        type=int,
                        default=3,
                        help='number of layers')
    parser.add_argument('--min-source-sentence',
                        type=int,
                        default=1,
                        help='minimium length of source sentence')
    parser.add_argument('--max-source-sentence',
                        type=int,
                        default=50,
                        help='maximum length of source sentence')
    parser.add_argument('--min-target-sentence',
                        type=int,
                        default=1,
                        help='minimium length of target sentence')
    parser.add_argument('--max-target-sentence',
                        type=int,
                        default=50,
                        help='maximum length of target sentence')
    parser.add_argument('--out',
                        '-o',
                        default='result',
                        help='directory to output the result')
    parser.add_argument('--trigger',
                        '-t',
                        type=int,
                        default=4000,
                        help='define trigger')
    args = parser.parse_args()

    # Vocabularies map token -> id; `target` is the inverse (id -> token)
    # mapping for the target side.
    source_word_ids = load_vocabulary(args.SOURCE_WORD_VOCAB)
    target_ids = load_vocabulary(args.TARGET_VOCAB)
    target = {i: w for w, i in target_ids.items()}
    source_char_ids = load_vocabulary(args.SOURCE_CHAR_VOCAB)
    train_source = load_data(args.SOURCE)
    train_target = load_data(args.TARGET)
    assert len(train_source) == len(train_target)
    # Keep only pairs whose lengths are within the configured limits.  The
    # target side is checked against the *target* limits (it previously
    # reused the source limits, silently ignoring --min/max-target-sentence).
    train_data = [
        (s, t) for s, t in six.moves.zip(train_source, train_target)
        if args.min_source_sentence <= len(s) <= args.max_source_sentence
        and args.min_target_sentence <= len(t) <= args.max_target_sentence
    ]
    #train_source_unknown = calculate_unknown_ratio(
    #    [s for s, _ in train_data])
    #train_target_unknown = calculate_unknown_ratio(
    #    [t for _, t in train_data])

    print('Source word vocabulary size: %d' % len(source_word_ids))
    print('Source char vocabulary size: %d' % len(source_char_ids))
    print('Target vocabulary size: %d' % len(target_ids))
    print('Train data size: %d' % len(train_data))
    #print('Train source unknown ratio: %.2f%%' % (train_source_unknown * 100))
    #print('Train target unknown ratio: %.2f%%' % (train_target_unknown * 100))

    model = Seq2seq(args.layer, len(source_word_ids), len(target_ids),
                    len(source_char_ids), args.unit)
    if args.gpu >= 0:
        chainer.cuda.get_device(args.gpu).use()
        model.to_gpu(args.gpu)

    optimizer = chainer.optimizers.Adam()
    optimizer.setup(model)

    train_iter = chainer.iterators.SerialIterator(train_data, args.batchsize)
    updater = training.StandardUpdater(train_iter,
                                       optimizer,
                                       converter=convert,
                                       device=args.gpu)
    # Honour --out; its default ('result') matches Trainer's own default, so
    # runs that do not pass the option behave as before.
    trainer = training.Trainer(updater, (args.epoch, 'epoch'), out=args.out)
    trainer.extend(extensions.LogReport(trigger=(args.trigger, 'iteration'),
                                        log_name='Log-' + todaydetailf +
                                        '.txt'),
                   trigger=(args.trigger, 'iteration'))
    trainer.extend(extensions.PrintReport([
        'epoch', 'iteration', 'main/loss', 'validation/main/loss', 'main/perp',
        'validation/main/perp', 'validation/main/bleu', 'elapsed_time'
    ]),
                   trigger=(args.trigger, 'iteration'))

    if args.validation_source and args.validation_target:
        test_source = load_data(args.validation_source)
        test_target = load_data(args.validation_target)
        assert len(test_source) == len(test_target)
        # Drop pairs in which either side is empty.
        test_data = [(s, t)
                     for s, t in six.moves.zip(test_source, test_target)
                     if 0 < len(s) and 0 < len(t)]
        #test_source_unknown = calculate_unknown_ratio(
        #    [s for s, _ in test_data])
        #test_target_unknown = calculate_unknown_ratio(
        #    [t for _, t in test_data])

        print('Validation data: %d' % len(test_data))
        #print('Validation source unknown ratio: %.2f%%' %
        #      (test_source_unknown * 100))
        #print('Validation target unknown ratio: %.2f%%' %
        #      (test_target_unknown * 100))

        @chainer.training.make_extension(trigger=(args.trigger, 'iteration'))
        def translate(trainer):
            """Print the translation of one random validation pair.

            NOTE: this extension is currently not registered (see the
            commented-out ``trainer.extend`` below).
            """
            # Local names are kept distinct from the global `target`
            # (id -> word) mapping, which is needed to decode the result.
            src_tokens, ref_tokens = test_data[np.random.choice(
                len(test_data))]
            result = model.translate([model.xp.array(src_tokens)])[0]

            source_sentence = ' '.join(src_tokens)
            target_sentence = ' '.join(ref_tokens)
            # NOTE(review): assumes model.translate yields target ids that
            # are keys of `target` -- confirm against Seq2seq.translate.
            result_sentence = ' '.join([target[y] for y in result])
            print('# source : ' + source_sentence)
            print('#  result : ' + result_sentence)
            print('#  expect : ' + target_sentence)

        #trainer.extend(translate, trigger=(args.trigger, 'iteration'))
        trainer.extend(CalculateBleu(
            model,
            test_data, ['validation/main/bleu', 'validation/main/loss'],
            device=args.gpu),
                       trigger=(args.trigger, 'iteration'))

    print('start training')
    trainer.run()
    print('=>finished!')

    model_name = todaydetailf + '-Parallel-BiGRU.model'
    serializers.save_npz(cfg.PATH_TO_MODELS + model_name, model)
    print('=>save the model: ' + model_name)

    # Store the constructor arguments next to the model, one per line, in
    # the order they were passed to Seq2seq above.
    config_name = todaydetailf + '-Hybrid-BiGRU-config.txt'
    model_params = [
        str(args.layer),
        str(len(source_word_ids)),
        str(len(target_ids)),
        str(len(source_char_ids)),
        str(args.unit)
    ]
    assert len(model_params) == model.get_n_params()
    # The context manager closes the file even if the write fails.
    with open(cfg.PATH_TO_MODELS + config_name, 'w') as f:
        f.write("\n".join(model_params))
    print('=>save the config: ' + config_name)

    enddetail = dt.today()
    enddetailf = enddetail.strftime("%Y%m%d-%H:%M:%S")
    print('end at ' + enddetailf)
Esempio n. 26
0
def train():
    """Train the ConvLSTM model with k-fold cross-validation.

    Parses the command-line arguments, splits the UCSD Ped1 training set
    into ``--validation`` folds and, for every fold, builds a fresh model
    and RMSprop optimizer (optionally initialised from ``--model`` /
    ``--opt``), trains for ``--epoch`` epochs and saves the resulting model
    and optimizer to ``./results/model<k>`` and ``./results/opt<k>``.
    """
    parser = argparse.ArgumentParser()
    parser.add_argument('--gpu', '-g', type=int, default=-1)
    parser.add_argument('--model', '-m', type=str, default=None)
    parser.add_argument('--opt', type=str, default=None)
    parser.add_argument('--validation', '-v', type=int, default=5)
    parser.add_argument('--epoch', '-e', type=int, default=20)
    parser.add_argument('--lr', '-l', type=float, default=0.001)
    parser.add_argument('--inf', type=int, default=3)
    parser.add_argument('--outf', type=int, default=3)
    parser.add_argument('--batch', '-b', type=int, default=1)
    args = parser.parse_args()

    # Named `train_data` so the local does not shadow this function's name.
    train_data = dataset.UCSDped1Dataset(0, 200, args.inf, args.outf,
                                         "./ucsd_ped1_train.npy")

    # cross validation
    dataset_ = datasets.get_cross_validation_datasets(train_data,
                                                      args.validation,
                                                      order=None)

    # Folds are numbered from 1 in the log, plot and output file names.
    for v in range(1, args.validation + 1):
        fold_train, fold_test = dataset_[v - 1][0], dataset_[v - 1][1]

        model = convlstm.Model(n_input=2, size=[128, 64, 64])

        if args.model is not None:
            print("loading model from " + args.model)
            serializers.load_npz(args.model, model)

        if args.gpu >= 0:
            # Use the requested device; the updater and evaluator below are
            # given the same `args.gpu`, so the model must live there too
            # (this was previously hard-coded to device 0).
            cuda.get_device_from_id(args.gpu).use()
            model.to_gpu()

        optimizer = optimizers.RMSprop(lr=args.lr)
        optimizer.setup(model)

        if args.opt is not None:
            print("loading opt from " + args.opt)
            # Restore into the optimizer created above (the original passed
            # an undefined name here and raised NameError).
            serializers.load_npz(args.opt, optimizer)

        train_iter = chainer.iterators.SerialIterator(
            fold_train, batch_size=args.batch, shuffle=False)
        test_iter = chainer.iterators.SerialIterator(
            fold_test, batch_size=args.batch, repeat=False, shuffle=False)

        updater = training.StandardUpdater(train_iter, optimizer,
                                           device=args.gpu)
        trainer = training.Trainer(updater, (args.epoch, 'epoch'),
                                   out='results')

        trainer.extend(extensions.Evaluator(test_iter, model,
                                            device=args.gpu))

        # One per-epoch log per fold, plus a second LogReport that
        # aggregates every 10 iterations.
        trainer.extend(extensions.LogReport(
            trigger=(1, 'epoch'), log_name='log_' + str(v) + '_epoch'))
        trainer.extend(extensions.LogReport(trigger=(10, 'iteration')))

        trainer.extend(extensions.PrintReport(
            ['epoch', 'main/loss', 'validation/main/loss', 'elapsed_time']))
        trainer.extend(extensions.PlotReport(
            ['main/loss', 'validation/main/loss'],
            x_key='epoch', file_name='loss_' + str(v) + '_epoch.png'))
        trainer.extend(extensions.ProgressBar(update_interval=1))

        trainer.run()

        modelname = "./results/model" + str(v)
        print("saving model to " + modelname)
        serializers.save_npz(modelname, model)

        optname = "./results/opt" + str(v)
        print("saving opt to " + optname)
        serializers.save_npz(optname, optimizer)
Esempio n. 27
0
    # NOTE(review): this is a fragment -- the enclosing function header and
    # the definitions of `args`, `model`, `size` and `multibox_encoder` are
    # not part of this excerpt, and no `trainer.run()` call is visible.

    # Select the requested GPU and move the model onto it; a negative id
    # skips this and leaves the model where it is.
    if args.gpu >= 0:
        chainer.cuda.get_device(args.gpu).use()
        model.to_gpu()

    # Every entry of `args.train` is split on '-' before being handed to
    # VOCDataset (presumably "<year>-<split>" pairs -- confirm against
    # VOCDataset).
    train = VOCDataset(args.root, [t.split('-') for t in args.train], size,
                       multibox_encoder)

    # Load batches with `args.loaderjob` worker processes.
    train_iter = chainer.iterators.MultiprocessIterator(
        train, args.batchsize, n_processes=args.loaderjob)

    # Momentum SGD with lr 0.001 and a weight-decay hook of 0.0005.
    optimizer = chainer.optimizers.MomentumSGD(lr=0.001)
    optimizer.setup(model)
    optimizer.add_hook(chainer.optimizer.WeightDecay(0.0005))

    # The stop trigger is 120000 iterations; output goes to `args.out`.
    updater = training.StandardUpdater(train_iter, optimizer, device=args.gpu)
    trainer = training.Trainer(updater, (120000, 'iteration'), args.out)

    # (period, unit) tuples used as extension triggers below.
    snapshot_interval = 1000, 'iteration'
    log_interval = 10, 'iteration'

    trainer.extend(extensions.dump_graph('main/loss'))
    # Snapshot the trainer and, separately, the bare model at the same
    # interval.
    trainer.extend(extensions.snapshot(), trigger=snapshot_interval)
    trainer.extend(extensions.snapshot_object(
        model, 'model_iter_{.updater.iteration}'),
                   trigger=snapshot_interval)
    # Log, record the learning rate and print the report at the same
    # interval.
    trainer.extend(extensions.LogReport(trigger=log_interval))
    trainer.extend(extensions.observe_lr(), trigger=log_interval)
    trainer.extend(extensions.PrintReport(
        ['epoch', 'iteration', 'main/loss', 'main/loc', 'main/conf', 'lr']),
                   trigger=log_interval)
    trainer.extend(extensions.ProgressBar(update_interval=10))
Esempio n. 28
0
def main():
    """Train a regressor on QM9 and pickle the trained model.

    The preprocessed dataset is cached under ``input/<method>_<labels>``;
    it is loaded from there when present and otherwise built with the
    preprocessor registered for ``args.method`` and then cached.  Labels
    may optionally be standardized before the data is split randomly into
    training and validation parts.
    """
    args = parse_arguments()
    method = args.method

    # Target labels, cache directory and number of output classes.
    if args.label == 'all':
        labels = None
        cache_dir = os.path.join('input', '{}_all'.format(method))
        class_num = len(D.get_qm9_label_names())
    else:
        labels = args.label
        cache_dir = os.path.join('input', '{}_{}'.format(method, labels))
        if isinstance(labels, list):
            class_num = len(labels)
        else:
            class_num = 1

    # The cache file name records how many samples were requested; a
    # negative count means the whole dataset.
    num_data = args.num_data
    dataset_filename = ('data_{}.npz'.format(num_data)
                        if num_data >= 0 else 'data.npz')
    dataset_cache_path = os.path.join(cache_dir, dataset_filename)

    # Try the cache first.
    if os.path.exists(dataset_cache_path):
        print('Loading cached dataset from {}.'.format(dataset_cache_path))
        dataset = NumpyTupleDataset.load(dataset_cache_path)
    else:
        dataset = None

    # No usable cache: preprocess from scratch and store the result.
    if dataset is None:
        print('Preprocessing dataset...')
        preprocessor = preprocess_method_dict[method]()

        qm9_kwargs = {'labels': labels}
        if num_data >= 0:
            # Restrict parsing to the first `num_data` samples.
            qm9_kwargs['target_index'] = numpy.arange(num_data)
        dataset = D.get_qm9(preprocessor, **qm9_kwargs)

        if not os.path.exists(cache_dir):
            os.makedirs(cache_dir)
        NumpyTupleDataset.save(dataset_cache_path, dataset)

    # Optional label scaling: the last array of the dataset is replaced by
    # its standardized version.
    if args.scale == 'standardize':
        print('Applying standard scaling to the labels.')
        scaler = StandardScaler()
        arrays = dataset.get_datasets()
        scaled_t = scaler.fit_transform(arrays[-1])
        dataset = NumpyTupleDataset(*(arrays[:-1] + (scaled_t, )))
    else:
        print('No standard scaling was selected.')
        scaler = None

    # Random train / validation split.
    n_train = int(len(dataset) * args.train_data_ratio)
    train, valid = split_dataset_random(dataset, n_train, args.seed)

    predictor = set_up_predictor(method, args.unit_num, args.conv_layers,
                                 class_num, scaler)

    train_iter = iterators.SerialIterator(train, args.batchsize)
    valid_iter = iterators.SerialIterator(
        valid, args.batchsize, repeat=False, shuffle=False)

    # Regressor reporting MAE and RMSE; both metrics receive the scaler.
    device = args.gpu
    metrics_fun = {}
    metrics_fun['mae'] = MeanAbsError(scaler=scaler)
    metrics_fun['rmse'] = RootMeanSqrError(scaler=scaler)
    regressor = Regressor(predictor,
                          lossfun=F.mean_squared_error,
                          metrics_fun=metrics_fun,
                          device=device)

    optimizer = optimizers.Adam()
    optimizer.setup(regressor)

    updater = training.StandardUpdater(
        train_iter, optimizer, device=device, converter=concat_mols)

    # Trainer and its extensions, registered in a fixed order.
    trainer = training.Trainer(updater, (args.epoch, 'epoch'), out=args.out)
    evaluator = E.Evaluator(
        valid_iter, regressor, device=device, converter=concat_mols)
    trainer.extend(evaluator)
    trainer.extend(E.snapshot(), trigger=(args.epoch, 'epoch'))
    trainer.extend(E.LogReport())
    report_keys = [
        'epoch', 'main/loss', 'main/mae', 'main/rmse',
        'validation/main/loss', 'validation/main/mae',
        'validation/main/rmse', 'elapsed_time'
    ]
    trainer.extend(E.PrintReport(report_keys))
    trainer.extend(E.ProgressBar())
    trainer.run()

    # Persist the trained regressor.
    model_path = os.path.join(args.out, args.model_filename)
    print('Saving the trained model to {}...'.format(model_path))
    regressor.save_pickle(model_path, protocol=args.protocol)
Esempio n. 29
0
def main():
    """Train Mask R-CNN on one of the ARC2017 datasets.

    Parses the command-line arguments, derives the remaining
    hyper-parameters (stored as extra attributes on ``args`` so that they
    are written out by ``ParamsReport``), builds the datasets, model,
    optimizer and trainer, and runs the training loop.  With
    ``--multi-node`` the run is distributed through ChainerMN and the
    evaluation / reporting extensions are registered on rank 0 only.
    """
    parser = argparse.ArgumentParser(
        formatter_class=argparse.ArgumentDefaultsHelpFormatter, )
    parser.add_argument(
        'dataset',
        choices=['visible+occlusion', 'synthetic', 'occlusion'],
        help='The dataset.',
    )
    parser.add_argument('--model',
                        '-m',
                        choices=['vgg16', 'resnet50', 'resnet101'],
                        default='resnet50',
                        help='Base model of Mask R-CNN.')
    parser.add_argument('--pooling-func',
                        '-pf',
                        choices=['pooling', 'align', 'resize'],
                        default='align',
                        help='Pooling function.')
    parser.add_argument('--gpu', '-g', type=int, help='GPU id.')
    parser.add_argument('--multi-node',
                        '-mn',
                        action='store_true',
                        help='use multi node')
    parser.add_argument('--mask-loss',
                        default='softmax',
                        choices=contrib.models.MaskRCNN.mask_losses,
                        help='mask loss mode')
    default_max_epoch = (180e3 * 8) / 118287 * 3  # x3
    parser.add_argument('--max-epoch',
                        type=float,
                        default=default_max_epoch,
                        help='epoch')
    args = parser.parse_args()

    # --gpu has no default.  In single-node mode a missing value used to
    # reach `args.gpu >= 0` below as None (a TypeError on Python 3), so
    # reject it up front with a clear message.
    if not args.multi_node and args.gpu is None:
        parser.error('--gpu is required unless --multi-node is given')

    if args.multi_node:
        import chainermn
        comm = chainermn.create_communicator('hierarchical')
        device = comm.intra_rank

        args.n_node = comm.inter_size
        args.n_gpu = comm.size
        chainer.cuda.get_device_from_id(device).use()
    else:
        args.n_node = 1
        args.n_gpu = 1
        chainer.cuda.get_device_from_id(args.gpu).use()
        device = args.gpu

    args.seed = 0
    now = datetime.datetime.now()
    args.timestamp = now.isoformat()
    args.out = osp.join(
        here,
        'logs/train_mrcnn_lbl',
        now.strftime('%Y%m%d_%H%M%S'),
    )

    # 0.00125 * 8 = 0.01  in original
    args.batch_size = 1 * args.n_gpu
    args.lr = 0.00125 * args.batch_size
    args.weight_decay = 0.0001

    # lr / 10 at 120k iteration with
    # 160k iteration * 16 batchsize in original
    args.step_size = [(120e3 / 180e3) * args.max_epoch,
                      (160e3 / 180e3) * args.max_epoch]

    random.seed(args.seed)
    np.random.seed(args.seed)

    # Default Config
    args.min_size = 800
    args.max_size = 1333
    args.anchor_scales = (2, 4, 8, 16, 32)

    # Training data, selected by the positional `dataset` argument.
    if args.dataset == 'visible+occlusion':
        train_data1 = contrib.datasets.ARC2017RealInstancesDataset(
            'train', aug='standard')
        train_data1 = MaskRcnnDataset(train_data1, zero_to_unlabeled=True)
        train_data2 = contrib.datasets.ARC2017RealInstancesDataset(
            'test', aug='standard')
        train_data2 = MaskRcnnDataset(train_data2, zero_to_unlabeled=True)
        train_data3 = contrib.datasets.ARC2017OcclusionDataset('train',
                                                               do_aug=True)
        train_data3 = MaskRcnnDataset(train_data3)
        train_data = chainer.datasets.ConcatenatedDataset(
            train_data1,
            train_data2,
            train_data3,
        )
    elif args.dataset == 'synthetic':
        train_data = contrib.datasets.ARC2017SyntheticInstancesDataset(
            do_aug=True, aug_level='all')
        train_data = MaskRcnnDataset(train_data)
    elif args.dataset == 'occlusion':
        train_data = contrib.datasets.ARC2017OcclusionDataset('train',
                                                              do_aug=True)
        train_data = MaskRcnnDataset(train_data)
    else:
        raise ValueError

    # Test data: the occlusion test set, one dataset per video.  The first
    # class name is dropped from the instance class names.
    test_data = contrib.datasets.ARC2017OcclusionDataset('test')
    instance_class_names = test_data.class_names[1:]
    test_data_list = test_data.get_video_datasets()
    del test_data
    test_data_list = [MaskRcnnDataset(td) for td in test_data_list]

    if args.pooling_func == 'align':
        pooling_func = cmr.functions.roi_align_2d
    elif args.pooling_func == 'pooling':
        pooling_func = chainer.functions.roi_pooling_2d
    elif args.pooling_func == 'resize':
        pooling_func = cmr.functions.crop_and_resize
    else:
        raise ValueError

    if args.model in ['resnet50', 'resnet101']:
        # Slice off the 'resnet' prefix.  `str.lstrip('resnet')` strips a
        # *set of characters*, not a prefix, and only worked by accident.
        n_layers = int(args.model[len('resnet'):])
        mask_rcnn = contrib.models.MaskRCNNResNet(
            n_layers=n_layers,
            n_fg_class=len(instance_class_names),
            pooling_func=pooling_func,
            anchor_scales=args.anchor_scales,
            min_size=args.min_size,
            max_size=args.max_size,
            mask_loss=args.mask_loss,
        )
    else:
        # 'vgg16' is accepted by the parser but has no model here.
        raise ValueError
    model = contrib.models.MaskRCNNTrainChain(mask_rcnn)
    if args.multi_node or args.gpu >= 0:
        model.to_gpu()

    optimizer = chainer.optimizers.MomentumSGD(lr=args.lr, momentum=0.9)
    if args.multi_node:
        optimizer = chainermn.create_multi_node_optimizer(optimizer, comm)
    optimizer.setup(model)
    optimizer.add_hook(chainer.optimizer.WeightDecay(rate=args.weight_decay))

    # Exclude every AffineChannel2D link from parameter updates.
    for link in mask_rcnn.links():
        if isinstance(link, cmr.links.AffineChannel2D):
            link.disable_update()

    train_data = chainer.datasets.TransformDataset(
        train_data, cmr.datasets.MaskRCNNTransform(mask_rcnn))
    test_data_list = [
        chainer.datasets.TransformDataset(
            td, cmr.datasets.MaskRCNNTransform(mask_rcnn, train=False))
        for td in test_data_list
    ]
    test_concat_data = chainer.datasets.ConcatenatedDataset(*test_data_list)
    if args.multi_node:
        # XXX: test_data is only used on device0
        if comm.rank != 0:
            train_data = None
            # test_data = None
        train_data = chainermn.scatter_dataset(train_data, comm, shuffle=True)
        # test_data = chainermn.scatter_dataset(test_data, comm)

    train_iter = chainer.iterators.SerialIterator(train_data, batch_size=1)
    # One evaluation iterator per test video, keyed by its index.
    test_iters = {
        i: chainer.iterators.SerialIterator(td,
                                            batch_size=1,
                                            repeat=False,
                                            shuffle=False)
        for i, td in enumerate(test_data_list)
    }
    test_concat_iter = chainer.iterators.SerialIterator(test_concat_data,
                                                        batch_size=1,
                                                        repeat=False,
                                                        shuffle=False)

    converter = functools.partial(
        cmr.datasets.concat_examples,
        padding=0,
        # img, bboxes, labels, masks, scales
        indices_concat=[0, 2, 3, 4],  # img, _, labels, masks, scales
        indices_to_device=[0, 1],  # img, bbox
    )
    updater = chainer.training.updater.StandardUpdater(train_iter,
                                                       optimizer,
                                                       device=device,
                                                       converter=converter)

    trainer = training.Trainer(updater, (args.max_epoch, 'epoch'),
                               out=args.out)

    # Multiply the learning rate by 0.1 at each epoch in `args.step_size`.
    trainer.extend(extensions.ExponentialShift('lr', 0.1),
                   trigger=training.triggers.ManualScheduleTrigger(
                       args.step_size, 'epoch'))

    eval_interval = 1, 'epoch'
    log_interval = 20, 'iteration'
    plot_interval = 0.1, 'epoch'
    print_interval = 20, 'iteration'

    # Evaluation, snapshots and reporting run on a single process only.
    if not args.multi_node or comm.rank == 0:
        evaluator = contrib.extensions.InstanceSegmentationVOCEvaluator(
            test_iters,
            model.mask_rcnn,
            device=device,
            use_07_metric=False,
            label_names=instance_class_names)
        trainer.extend(evaluator, trigger=eval_interval)
        # Snapshot the model whenever 'validation/main/mpq' reaches a new
        # maximum.
        trainer.extend(extensions.snapshot_object(model.mask_rcnn,
                                                  'snapshot_model.npz'),
                       trigger=training.triggers.MaxValueTrigger(
                           'validation/main/mpq', eval_interval))
        args.git_hash = cmr.utils.git_hash()
        args.hostname = socket.gethostname()
        trainer.extend(fcn.extensions.ParamsReport(args.__dict__))
        trainer.extend(contrib.extensions.InstanceSegmentationVisReport(
            test_concat_iter,
            model.mask_rcnn,
            label_names=instance_class_names),
                       trigger=eval_interval)
        trainer.extend(chainer.training.extensions.observe_lr(),
                       trigger=log_interval)
        trainer.extend(extensions.LogReport(trigger=log_interval))
        trainer.extend(
            extensions.PrintReport([
                'iteration', 'epoch', 'elapsed_time', 'lr', 'main/loss',
                'main/roi_loc_loss', 'main/roi_cls_loss', 'main/roi_mask_loss',
                'main/rpn_loc_loss', 'main/rpn_cls_loss', 'validation/main/mpq'
            ]),
            trigger=print_interval,
        )
        trainer.extend(extensions.ProgressBar(update_interval=10))

        # plot
        assert extensions.PlotReport.available()
        trainer.extend(
            extensions.PlotReport(
                [
                    'main/loss', 'main/roi_loc_loss', 'main/roi_cls_loss',
                    'main/roi_mask_loss', 'main/rpn_loc_loss',
                    'main/rpn_cls_loss'
                ],
                file_name='loss.png',
                trigger=plot_interval,
            ),
            trigger=plot_interval,
        )
        trainer.extend(
            extensions.PlotReport([
                'validation/main/map', 'validation/main/msq',
                'validation/main/mdq', 'validation/main/mpq'
            ],
                                  file_name='accuracy.png',
                                  trigger=plot_interval),
            trigger=eval_interval,
        )

        trainer.extend(extensions.dump_graph('main/loss'))

    trainer.run()
Esempio n. 30
0
def main():
    """Train a VGG16 network on CIFAR-10/100 and time its inference.

    The training mode is chosen from the command line, checked in this
    order:

    * ``--pretrain N`` with a non-zero integer ``N``: train a plain
      ``nets.VGG16`` (no variational dropout) with MomentumSGD, halving
      ``lr`` every 25 epochs and saving the model every 10 epochs.
    * ``--resume FILE``: build ``nets.VGG16VD``, switch it to variational
      dropout, load ``FILE`` into it, print its test accuracy once and
      continue training with Adam.
    * otherwise: train ``nets.VGG16VD`` with variational dropout using
      Adam, gradient clipping and a linearly decayed ``alpha``.

    After training, unless pretraining, one-sample-per-batch inference over
    the test set is timed for the dense model on GPU (when a GPU is
    selected), the dense model on CPU and the model after
    ``to_cpu_sparse()`` on CPU.

    Raises:
        RuntimeError: if ``--dataset`` is neither ``cifar10`` nor
            ``cifar100``.
    """
    parser = argparse.ArgumentParser(description='Chainer CIFAR example:')
    parser.add_argument('--dataset',
                        '-d',
                        default='cifar10',
                        help='The dataset to use: cifar10 or cifar100')
    parser.add_argument('--batchsize',
                        '-b',
                        type=int,
                        default=128,
                        help='Number of images in each mini-batch')
    parser.add_argument('--epoch',
                        '-e',
                        type=int,
                        default=100,
                        help='Number of sweeps over the dataset to train')
    parser.add_argument('--gpu',
                        '-g',
                        type=int,
                        default=0,
                        help='GPU ID (negative value indicates CPU)')
    parser.add_argument('--out',
                        '-o',
                        default='result',
                        help='Directory to output the result')
    # type=int matters: without it "--pretrain 0" is parsed as the string
    # '0', which is truthy and would enable pretraining by mistake.
    parser.add_argument('--pretrain',
                        type=int,
                        default=0,
                        help='Pretrain (w/o VD) or not (w/ VD).' +
                        ' default is not (0).')
    parser.add_argument('--resume',
                        '-r',
                        default='',
                        help='Resume the training from snapshot')
    # NOTE(review): --resume-opt is accepted but never read in this
    # function; confirm whether optimizer state was meant to be restored.
    parser.add_argument('--resume-opt',
                        '-ro',
                        default='',
                        help='Resume optimizer the training from snapshot')
    args = parser.parse_args()

    print('GPU: {}'.format(args.gpu))
    print('# Minibatch-size: {}'.format(args.batchsize))
    print('# epoch: {}'.format(args.epoch))
    print('')

    # Pick the dataset and the matching number of output classes.
    if args.dataset == 'cifar10':
        print('Using CIFAR10 dataset.')
        class_labels = 10
        train, test = get_cifar10()
    elif args.dataset == 'cifar100':
        print('Using CIFAR100 dataset.')
        class_labels = 100
        train, test = get_cifar100()
    else:
        raise RuntimeError('Invalid dataset choice.')
    print('# train:', len(train))
    print('# test :', len(test))

    # Build the network for the selected mode.
    if args.pretrain:
        model = nets.VGG16(class_labels)

        def calc_loss(x, t):
            """Return softmax cross entropy; also report loss and accuracy."""
            model.y = model(x)
            model.loss = F.softmax_cross_entropy(model.y, t)
            reporter.report({'loss': model.loss}, model)
            model.accuracy = F.accuracy(model.y, t)
            reporter.report({'accuracy': model.accuracy}, model)
            return model.loss

        # The updater below calls model.calc_loss, so attach one to the
        # plain VGG16 as well.
        model.calc_loss = calc_loss
        model.use_raw_dropout = True
    else:
        # Resuming uses warm_up=1., a fresh run uses warm_up=0.0001.
        warm_up = 1. if args.resume else 0.0001
        model = nets.VGG16VD(class_labels, warm_up=warm_up)
        model(train[0][0][None, ])  # for setting in_channels automatically
        model.to_variational_dropout()
        if args.resume:
            chainer.serializers.load_npz(args.resume, model)

    if args.gpu >= 0:
        chainer.cuda.get_device(args.gpu).use()  # Make a specified GPU current
        model.to_gpu()  # Copy the model to the GPU

    # Build the optimizer for the selected mode.
    if args.pretrain:
        # Original Torch code (http://torch.ch/blog/2015/07/30/cifar.html)
        # uses lr=1. However, it doesn't work well as people say in the post.
        # This follows a version of Chainer example using lr=0.1.
        optimizer = chainer.optimizers.MomentumSGD(0.1)
        optimizer.setup(model)
        optimizer.add_hook(chainer.optimizer.WeightDecay(5e-4))
    elif args.resume:
        optimizer = chainer.optimizers.Adam(1e-5)
        optimizer.setup(model)
    else:
        optimizer = chainer.optimizers.Adam(1e-4)
        optimizer.setup(model)
        optimizer.add_hook(chainer.optimizer.GradientClipping(10.))

    train_iter = chainer.iterators.SerialIterator(train, args.batchsize)
    test_iter = chainer.iterators.SerialIterator(test,
                                                 args.batchsize,
                                                 repeat=False,
                                                 shuffle=False)

    if args.resume:
        # Report the accuracy of the loaded weights before further training.
        classifier = L.Classifier(model.copy())
        accuracy = extensions.Evaluator(test_iter, classifier,
                                        device=args.gpu)()['main/accuracy']
        print('test accuracy VD:', accuracy)

    # Set up a trainer
    updater = training.StandardUpdater(train_iter,
                                       optimizer,
                                       device=args.gpu,
                                       loss_func=model.calc_loss)
    trainer = training.Trainer(updater, (args.epoch, 'epoch'), out=args.out)

    # Evaluate the model with the test dataset for each epoch
    trainer.extend(
        extensions.Evaluator(test_iter, L.Classifier(model), device=args.gpu))

    if args.pretrain:
        trainer.extend(extensions.ExponentialShift('lr', 0.5),
                       trigger=(25, 'epoch'))
    elif not args.resume:
        # Move 'alpha' linearly from 1e-4 to 0 between iteration 0 and
        # epoch * len(train) // batchsize.
        trainer.extend(
            extensions.LinearShift(
                'alpha', (1e-4, 0.),
                (0, args.epoch * len(train) // args.batchsize)))

    # Save the model every 10 epochs while pretraining
    if args.pretrain:
        trainer.extend(extensions.snapshot_object(
            model, 'model_snapshot_{.updater.epoch}'),
                       trigger=(10, 'epoch'))

    # Write a log of evaluation statistics about twice per epoch, at most
    # every 1000 iterations. Clamp to 1 so that a batch size larger than
    # half the dataset cannot yield a zero-period trigger.
    per = max(1, min(len(train) // args.batchsize // 2, 1000))
    trainer.extend(extensions.LogReport(trigger=(per, 'iteration')))

    # Print selected entries of the log to stdout
    # Here "main" refers to the target link of the "main" optimizer again, and
    # "validation" refers to the default name of the Evaluator extension.
    trainer.extend(
        extensions.PrintReport([
            'epoch', 'iteration', 'main/loss', 'validation/main/loss',
            'main/accuracy', 'validation/main/accuracy', 'main/class',
            'main/kl', 'main/mean_p', 'main/sparsity', 'main/W/Wnz',
            'main/kl_coef', 'elapsed_time'
        ]))

    # Print a progress bar to stdout
    trainer.extend(extensions.ProgressBar())

    # Run the training
    trainer.run()

    print('Measure inference speeds for 1 sample inference...')
    test_iter = chainer.iterators.SerialIterator(test,
                                                 1,
                                                 repeat=False,
                                                 shuffle=False)

    def report_inference_time(label, link, device):
        """Evaluate ``link`` on the one-sample iterator and print the time."""
        classifier = L.Classifier(link)
        start = time.time()
        extensions.Evaluator(test_iter, classifier, device=device)()
        print(label, time.time() - start, 's/{} imgs'.format(len(test)))

    if not args.pretrain:
        if args.gpu >= 0:
            report_inference_time('dense Gpu:', model.copy(), args.gpu)

        model.to_cpu()
        report_inference_time('dense Cpu:', model.copy(), -1)

        model.to_cpu_sparse()
        # NOTE(review): the name is cleared before the deep copy, presumably
        # so the copy can be registered under Classifier - confirm.
        model.name = None
        report_inference_time('sparse Cpu:', copy.deepcopy(model), -1)