Example #1
import os

import europal  # project-local helper module


def read_source(in_dir, cache=None):
    en_path = os.path.join(in_dir, 'giga-fren.release2.fixed.en')
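    # Index 0 is reserved for '<eos>' and index 1 for '<unk>';
    # europal.count_words (a project-local helper) supplies the corpus vocabulary.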
    source_vocab = ['<eos>', '<unk>'] + europal.count_words(en_path)
    source_data = europal.make_dataset(en_path, source_vocab)

    return source_vocab, source_data
Example #2
import os

import europal  # project-local helper module


def read_target(in_dir, cache=None):
    fr_path = os.path.join(in_dir, 'giga-fren.release2.fixed.fr')
    target_vocab = ['<eos>', '<unk>'] + europal.count_words(fr_path)
    target_data = europal.make_dataset(fr_path, target_vocab)

    return target_vocab, target_data
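Both readers follow the same pattern, so a minimal usage sketch (assuming in_dir points at a directory containing the giga-fren files, e.g. the 'wmt' default used by main() below) is:

    source_vocab, source_data = read_source('wmt')
    target_vocab, target_data = read_target('wmt')
    print('source vocab: %d, sentences: %d' % (len(source_vocab), len(source_data)))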
Example #3
def main():
    parser = argparse.ArgumentParser(description='Chainer example: seq2seq')
    parser.add_argument('--batchsize', '-b', type=int, default=64,
                        help='Number of sentence pairs in each mini-batch')
    parser.add_argument('--bleu', action='store_true', default=False,
                        help='Report BLEU score')
    parser.add_argument('--gpu', '-g', action='store_true',
                        help='Use GPU')
    parser.add_argument('--cache', '-c', default=None,
                        help='Directory to cache pre-processed dataset')
    parser.add_argument('--resume', '-r', default='',
                        help='Resume the training from snapshot')
    parser.add_argument('--unit', '-u', type=int, default=1024,
                        help='Number of units')
    parser.add_argument('--communicator', default='hierarchical',
                        help='Type of communicator')
    parser.add_argument('--stop', '-s', type=str, default='15e',
                        help='Stop trigger (ex. "500i", "15e")')
    parser.add_argument('--input', '-i', type=str, default='wmt',
                        help='Input directory')
    parser.add_argument('--optimizer', type=str, default='adam()',
                        help='Optimizer and its argument')
    parser.add_argument('--out', '-o', default='result',
                        help='Directory to output the result')
    args = parser.parse_args()

    # Prepare ChainerMN communicator
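    # 'hierarchical' combines NCCL within each node with MPI across nodes;
    # comm.intra_rank (this process's rank within its node) doubles as the
    # local GPU ID, while dev = -1 selects the CPU path.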
    if args.gpu:
        comm = chainermn.create_communicator(args.communicator)
        dev = comm.intra_rank
    else:
        comm = chainermn.create_communicator('naive')
        dev = -1

    if comm.rank == 0:
        print('==========================================')
        print('Num process (COMM_WORLD): {}'.format(comm.size))
        if args.gpu:
            print('Using GPUs')
        print('Using {} communicator'.format(args.communicator))
        print('Num unit: {}'.format(args.unit))
        print('Num Minibatch-size: {}'.format(args.batchsize))
        print('==========================================')

    # Rank 0 prepares all data
    if comm.rank == 0:
        if args.cache and not os.path.exists(args.cache):
            os.mkdir(args.cache)

        # Read source data
        bt = time.time()
        if args.cache:
            cache_file = os.path.join(args.cache, 'source.pickle')
            source_vocab, source_data = cached_call(cache_file,
                                                    read_source,
                                                    args.input, args.cache)
        else:
            source_vocab, source_data = read_source(args.input, args.cache)
        et = time.time()
        print("RD source done. {:.3f} [s]".format(et - bt))
        sys.stdout.flush()

        # Read target data
        bt = time.time()
        if args.cache:
            cache_file = os.path.join(args.cache, 'target.pickle')
            target_vocab, target_data = cached_call(cache_file,
                                                    read_target,
                                                    args.input, args.cache)
        else:
            target_vocab, target_data = read_target(args.input, args.cache)
        et = time.time()
        print("RD target done. {:.3f} [s]".format(et - bt))
        sys.stdout.flush()

        print('Original training data size: %d' % len(source_data))
        train_data = [(s, t)
                      for s, t in six.moves.zip(source_data, target_data)
                      if 0 < len(s) < 50 and 0 < len(t) < 50]
        print('Filtered training data size: %d' % len(train_data))

        en_path = os.path.join(args.input, 'dev', 'newstest2013.en')
        source_data = europal.make_dataset(en_path, source_vocab)
        fr_path = os.path.join(args.input, 'dev', 'newstest2013.fr')
        target_data = europal.make_dataset(fr_path, target_vocab)
        assert len(source_data) == len(target_data)
        test_data = [(s, t) for s, t
                     in six.moves.zip(source_data, target_data)
                     if 0 < len(s) and 0 < len(t)]

        source_ids = {word: index
                      for index, word in enumerate(source_vocab)}
        target_ids = {word: index
                      for index, word in enumerate(target_vocab)}
    else:
        # target_data, source_data = None, None
        train_data, test_data = None, None
        target_ids, source_ids = None, None

    # Print GPU id
    for i in range(comm.size):
        if comm.rank == i:
            print('Rank {} GPU: {}'.format(comm.rank, dev))
        sys.stdout.flush()
        comm.mpi_comm.Barrier()

    # Broadcast the word -> ID dictionaries to every worker
    source_ids = comm.mpi_comm.bcast(source_ids, root=0)
    target_ids = comm.mpi_comm.bcast(target_ids, root=0)
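    # Note: only these compact word -> ID dictionaries are broadcast; the much
    # larger datasets are scattered with chainermn.scatter_dataset further down.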

    target_words = {i: w for w, i in target_ids.items()}
    source_words = {i: w for w, i in source_ids.items()}

    if comm.rank == 0:
        print("target_words : {}".format(len(target_words)))
        print("source_words : {}".format(len(source_words)))

    model = Seq2seq(3, len(source_ids), len(target_ids), args.unit)

    if dev >= 0:
        chainer.cuda.get_device_from_id(dev).use()
        model.to_gpu(dev)

    # determine the stop trigger
    m = re.match(r'^(\d+)e$', args.stop)
    if m:
        trigger = (int(m.group(1)), 'epoch')
    else:
        m = re.match(r'^(\d+)i$', args.stop)
        if m:
            trigger = (int(m.group(1)), 'iteration')
        else:
            if comm.rank == 0:
                sys.stderr.write("Error: unknown stop trigger: {}".format(
                    args.stop))
            exit(-1)

    if comm.rank == 0:
        print("Trigger: {}".format(trigger))

    optimizer = chainermn.create_multi_node_optimizer(
        create_optimizer(args.optimizer), comm)
    optimizer.setup(model)

    # Scatter the datasets; rank 0's data is split equally across all workers
    train_data = chainermn.scatter_dataset(train_data, comm)

    test_data = chainermn.scatter_dataset(test_data, comm)

    train_iter = chainer.iterators.SerialIterator(train_data,
                                                  args.batchsize,
                                                  shuffle=False)
    updater = training.StandardUpdater(
        train_iter, optimizer, converter=convert, device=dev)
    trainer = training.Trainer(updater,
                               trigger,
                               out=args.out)

    trainer.extend(chainermn.create_multi_node_evaluator(
        BleuEvaluator(model, test_data, device=dev, comm=comm),
        comm))

    def translate_one(source, target):
        words = europal.split_sentence(source)
        print('# source : ' + ' '.join(words))
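        # Look up word IDs; unknown words fall back to index 1, i.e. '<unk>'.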
        x = model.xp.array(
            [source_ids.get(w, 1) for w in words], 'i')
        ys = model.translate([x])[0]
        words = [target_words[y] for y in ys]
        print('#  result : ' + ' '.join(words))
        print('#  expect : ' + target)

    # @chainer.training.make_extension(trigger=(200, 'iteration'))
    def translate(trainer):
        translate_one(
            'Who are we ?',
            'Qui sommes-nous?')
        translate_one(
            'And it often costs over a hundred dollars ' +
            'to obtain the required identity card .',
            'Or, il en coûte souvent plus de cent dollars ' +
            'pour obtenir la carte d\'identité requise.')

        source, target = test_data[numpy.random.choice(len(test_data))]
        source = ' '.join([source_words.get(i, '') for i in source])
        target = ' '.join([target_words.get(i, '') for i in target])
        translate_one(source, target)

    if comm.rank == 0:
        trainer.extend(extensions.LogReport(trigger=(1, 'epoch')),
                       trigger=(1, 'epoch'))

        report = extensions.PrintReport(['epoch',
                                         'iteration',
                                         'main/loss',
                                         'main/perp',
                                         'validation/main/bleu',
                                         'elapsed_time'])
        trainer.extend(report, trigger=(1, 'epoch'))

    comm.mpi_comm.Barrier()
    if comm.rank == 0:
        print('start training')
        sys.stdout.flush()

    trainer.run()
Example #4
def main():
    parser = argparse.ArgumentParser(
        description='Chainer example: convolutional seq2seq')
    parser.add_argument('--batchsize',
                        '-b',
                        type=int,
                        default=32,
                        help='Number of sentence pairs in each mini-batch')
    parser.add_argument('--epoch',
                        '-e',
                        type=int,
                        default=100,
                        help='Number of sweeps over the dataset to train')
    parser.add_argument('--gpu',
                        '-g',
                        type=int,
                        default=-1,
                        help='GPU ID (negative value indicates CPU)')
    parser.add_argument('--unit',
                        '-u',
                        type=int,
                        default=512,
                        help='Number of units')
    parser.add_argument('--layer',
                        '-l',
                        type=int,
                        default=15,
                        help='Number of layers')
    parser.add_argument('--input',
                        '-i',
                        type=str,
                        default='./',
                        help='Input directory')
    parser.add_argument('--source',
                        '-s',
                        type=str,
                        default='europarl-v7.fr-en.en',
                        help='Filename of train data for source language')
    parser.add_argument('--target',
                        '-t',
                        type=str,
                        default='europarl-v7.fr-en.fr',
                        help='Filename of train data for target language')
    parser.add_argument('--source-valid',
                        '-svalid',
                        type=str,
                        default='dev/newstest2013.en',
                        help='Filename of validation data for source language')
    parser.add_argument('--target-valid',
                        '-tvalid',
                        type=str,
                        default='dev/newstest2013.fr',
                        help='Filename of validation data for target language')
    parser.add_argument('--out',
                        '-o',
                        default='result',
                        help='Directory to output the result')
    parser.add_argument('--source-vocab',
                        type=int,
                        default=40000,
                        help='Vocabulary size of source language')
    parser.add_argument('--target-vocab',
                        type=int,
                        default=40000,
                        help='Vocabulary size of target language')
    args = parser.parse_args()
    print(json.dumps(args.__dict__, indent=4))

    # Load the training data and build the vocabularies
    en_path = os.path.join(args.input, args.source)
    source_vocab = ['<eos>', '<unk>'] + \
        europal.count_words(en_path, args.source_vocab)
    source_data = europal.make_dataset(en_path, source_vocab)
    fr_path = os.path.join(args.input, args.target)
    target_vocab = ['<eos>', '<unk>'] + \
        europal.count_words(fr_path, args.target_vocab)
    target_data = europal.make_dataset(fr_path, target_vocab)
    assert len(source_data) == len(target_data)
    print('Original training data size: %d' % len(source_data))
    train_data = [(s, t) for s, t in six.moves.zip(source_data, target_data)
                  if 0 < len(s) < 50 and 0 < len(t) < 50]
    print('Filtered training data size: %d' % len(train_data))

    en_path = os.path.join(args.input, args.source_valid)
    source_data = europal.make_dataset(en_path, source_vocab)
    fr_path = os.path.join(args.input, args.target_valid)
    target_data = europal.make_dataset(fr_path, target_vocab)
    assert len(source_data) == len(target_data)
    test_data = [(s, t) for s, t in six.moves.zip(source_data, target_data)
                 if 0 < len(s) and 0 < len(t)]

    source_ids = {word: index for index, word in enumerate(source_vocab)}
    target_ids = {word: index for index, word in enumerate(target_vocab)}

    target_words = {i: w for w, i in target_ids.items()}
    source_words = {i: w for w, i in source_ids.items()}

    # Define Model
    model = net.Seq2seq(args.layer, len(source_ids), len(target_ids),
                        args.unit)
    if args.gpu >= 0:
        chainer.cuda.get_device(args.gpu).use()
        model.to_gpu(args.gpu)

    # Setup Optimizer
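    # The settings below (lr=0.25, momentum=0.99, gradient clipping at 0.1)
    # follow the training recipe of the convolutional seq2seq paper
    # (Gehring et al., 2017).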
    optimizer = chainer.optimizers.NesterovAG(lr=0.25, momentum=0.99)
    optimizer.setup(model)
    optimizer.add_hook(chainer.optimizer.GradientClipping(0.1))

    # Setup Trainer
    train_iter = chainer.iterators.SerialIterator(train_data, args.batchsize)
    test_iter = chainer.iterators.SerialIterator(test_data,
                                                 args.batchsize,
                                                 repeat=False,
                                                 shuffle=False)
    iter_per_epoch = len(train_data) // args.batchsize
    print('Number of iter/epoch =', iter_per_epoch)

    updater = training.StandardUpdater(train_iter,
                                       optimizer,
                                       converter=seq2seq_pad_concat_convert,
                                       device=args.gpu)

    trainer = training.Trainer(updater, (args.epoch, 'epoch'), out=args.out)

    # If you want to change a logging interval, change this number
    log_trigger = (min(1000, iter_per_epoch // 2), 'iteration')

    def floor_step(trigger):
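        # Round the interval down to a multiple of the logging interval so
        # that evaluation triggers line up with LogReport rows.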
        floored = trigger[0] - trigger[0] % log_trigger[0]
        if floored <= 0:
            floored = trigger[0]
        return (floored, trigger[1])

    # Validation every half epoch
    eval_trigger = floor_step((iter_per_epoch // 2, 'iteration'))
    fail_trigger = FailMinValueTrigger('val/main/perp', eval_trigger)
    record_trigger = training.triggers.MinValueTrigger('val/main/perp',
                                                       eval_trigger)

    evaluator = extensions.Evaluator(test_iter,
                                     model,
                                     converter=seq2seq_pad_concat_convert,
                                     device=args.gpu)
    evaluator.default_name = 'val'
    trainer.extend(evaluator, trigger=eval_trigger)
    # Only if validation perplexity fails to be improved,
    # lr is decayed (until 1e-4).
    trainer.extend(extensions.ExponentialShift('lr', 0.1, target=1e-4),
                   trigger=fail_trigger)
    trainer.extend(extensions.observe_lr(), trigger=eval_trigger)
    # Only if a model gets best validation score,
    # save the model
    trainer.extend(extensions.snapshot_object(
        model, 'model_iter_{.updater.iteration}.npz'),
                   trigger=record_trigger)

    def translate_one(source, target):
        words = europal.split_sentence(source)
        print('# source : ' + ' '.join(words))
        x = model.xp.array([source_ids.get(w, 1) for w in words], 'i')
        ys = model.translate([x])[0]
        words = [target_words[y] for y in ys]
        print('#  result : ' + ' '.join(words))
        print('#  expect : ' + target)

    @chainer.training.make_extension(trigger=(200, 'iteration'))
    def translate(trainer):
        translate_one('Who are we ?', 'Qui sommes-nous?')
        translate_one(
            'And it often costs over a hundred dollars ' +
            'to obtain the required identity card .',
            'Or, il en coûte souvent plus de cent dollars ' +
            'pour obtenir la carte d\'identité requise.')

        source, target = test_data[numpy.random.choice(len(test_data))]
        source = ' '.join([source_words[i] for i in source])
        target = ' '.join([target_words[i] for i in target])
        translate_one(source, target)

    # Generation test
    trainer.extend(translate, trigger=(min(200, iter_per_epoch), 'iteration'))
    # Calculate BLEU every half epoch
    trainer.extend(CalculateBleu(model,
                                 test_data,
                                 'val/main/bleu',
                                 device=args.gpu,
                                 batch=args.batchsize // 4),
                   trigger=floor_step((iter_per_epoch // 2, 'iteration')))

    # Log
    trainer.extend(extensions.LogReport(trigger=log_trigger),
                   trigger=log_trigger)
    trainer.extend(extensions.PrintReport([
        'epoch', 'iteration', 'main/loss', 'val/main/loss', 'main/perp',
        'val/main/perp', 'main/acc', 'val/main/acc', 'val/main/bleu', 'lr',
        'elapsed_time'
    ]),
                   trigger=log_trigger)

    print('start training')
    trainer.run()
Example #5
def main():
    parser = argparse.ArgumentParser(description='Chainer example: seq2seq')
    parser.add_argument('--batchsize', '-b', type=int, default=64,
                        help='Number of sentence pairs in each mini-batch')
    parser.add_argument('--bleu', action='store_true', default=False,
                        help='Report BLEU score')
    parser.add_argument('--gpu', '-g', action='store_true',
                        help='Use GPU')
    parser.add_argument('--cache', '-c', default=None,
                        help='Directory to cache pre-processed dataset')
    parser.add_argument('--resume', '-r', default='',
                        help='Resume the training from snapshot')
    parser.add_argument('--unit', '-u', type=int, default=1024,
                        help='Number of units')
    parser.add_argument('--communicator', default='hierarchical',
                        help='Type of communicator')
    parser.add_argument('--stop', '-s', type=str, default='15e',
                        help='Stop trigger (ex. "500i", "15e")')
    parser.add_argument('--input', '-i', type=str, default='wmt',
                        help='Input directory')
    parser.add_argument('--optimizer', type=str, default='adam()',
                        help='Optimizer and its argument')
    parser.add_argument('--out', '-o', default='result',
                        help='Directory to output the result')
    args = parser.parse_args()

    # Prepare ChainerMN communicator
    if args.gpu:
        comm = chainermn.create_communicator(args.communicator)
        dev = comm.intra_rank
    else:
        comm = chainermn.create_communicator('naive')
        dev = -1

    if comm.rank == 0:
        print('==========================================')
        print('Num process (COMM_WORLD): {}'.format(comm.size))
        if args.gpu:
            print('Using GPUs')
        print('Using {} communicator'.format(args.communicator))
        print('Num unit: {}'.format(args.unit))
        print('Num Minibatch-size: {}'.format(args.batchsize))
        print('==========================================')

    # Rank 0 prepares all data
    if comm.rank == 0:
        if args.cache and not os.path.exists(args.cache):
            os.mkdir(args.cache)

        # Read source data
        bt = time.time()
        if args.cache:
            cache_file = os.path.join(args.cache, 'source.pickle')
            source_vocab, source_data = cached_call(cache_file,
                                                    read_source,
                                                    args.input, args.cache)
        else:
            source_vocab, source_data = read_source(args.input, args.cache)
        et = time.time()
        print('RD source done. {:.3f} [s]'.format(et - bt))
        sys.stdout.flush()

        # Read target data
        bt = time.time()
        if args.cache:
            cache_file = os.path.join(args.cache, 'target.pickle')
            target_vocab, target_data = cached_call(cache_file,
                                                    read_target,
                                                    args.input, args.cache)
        else:
            target_vocab, target_data = read_target(args.input, args.cache)
        et = time.time()
        print('RD target done. {:.3f} [s]'.format(et - bt))
        sys.stdout.flush()

        print('Original training data size: %d' % len(source_data))
        train_data = [(s, t)
                      for s, t in six.moves.zip(source_data, target_data)
                      if 0 < len(s) < 50 and 0 < len(t) < 50]
        print('Filtered training data size: %d' % len(train_data))

        en_path = os.path.join(args.input, 'dev', 'newstest2013.en')
        source_data = europal.make_dataset(en_path, source_vocab)
        fr_path = os.path.join(args.input, 'dev', 'newstest2013.fr')
        target_data = europal.make_dataset(fr_path, target_vocab)
        assert len(source_data) == len(target_data)
        test_data = [(s, t) for s, t
                     in six.moves.zip(source_data, target_data)
                     if 0 < len(s) and 0 < len(t)]

        source_ids = {word: index
                      for index, word in enumerate(source_vocab)}
        target_ids = {word: index
                      for index, word in enumerate(target_vocab)}
    else:
        # target_data, source_data = None, None
        train_data, test_data = None, None
        target_ids, source_ids = None, None

    # Print GPU id
    for i in range(comm.size):
        if comm.rank == i:
            print('Rank {} GPU: {}'.format(comm.rank, dev))
        sys.stdout.flush()
        comm.mpi_comm.Barrier()

    # Broadcast the word -> ID dictionaries to every worker
    source_ids = comm.bcast_obj(source_ids, root=0)
    target_ids = comm.bcast_obj(target_ids, root=0)
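    # bcast_obj is the ChainerMN communicator's own object-broadcast helper;
    # the older variant of this example (Example #3) goes through
    # comm.mpi_comm.bcast instead.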

    target_words = {i: w for w, i in target_ids.items()}
    source_words = {i: w for w, i in source_ids.items()}

    if comm.rank == 0:
        print('target_words : {}'.format(len(target_words)))
        print('source_words : {}'.format(len(source_words)))

    model = Seq2seq(3, len(source_ids), len(target_ids), args.unit)

    if dev >= 0:
        chainer.cuda.get_device_from_id(dev).use()
        model.to_gpu(dev)

    # determine the stop trigger
    m = re.match(r'^(\d+)e$', args.stop)
    if m:
        trigger = (int(m.group(1)), 'epoch')
    else:
        m = re.match(r'^(\d+)i$', args.stop)
        if m:
            trigger = (int(m.group(1)), 'iteration')
        else:
            if comm.rank == 0:
                sys.stderr.write('Error: unknown stop trigger: {}\n'.format(
                    args.stop))
            exit(-1)

    if comm.rank == 0:
        print('Trigger: {}'.format(trigger))

    optimizer = chainermn.create_multi_node_optimizer(
        create_optimizer(args.optimizer), comm)
    optimizer.setup(model)

    # Scatter the datasets; rank 0's data is split equally across all workers
    train_data = chainermn.scatter_dataset(train_data, comm)

    test_data = chainermn.scatter_dataset(test_data, comm)

    train_iter = chainer.iterators.SerialIterator(train_data,
                                                  args.batchsize,
                                                  shuffle=False)
    updater = training.StandardUpdater(
        train_iter, optimizer, converter=convert, device=dev)
    trainer = training.Trainer(updater,
                               trigger,
                               out=args.out)

    trainer.extend(chainermn.create_multi_node_evaluator(
        BleuEvaluator(model, test_data, device=dev, comm=comm),
        comm))

    def translate_one(source, target):
        words = europal.split_sentence(source)
        print('# source : ' + ' '.join(words))
        x = model.xp.array(
            [source_ids.get(w, 1) for w in words], numpy.int32)
        ys = model.translate([x])[0]
        words = [target_words[y] for y in ys]
        print('#  result : ' + ' '.join(words))
        print('#  expect : ' + target)

    # @chainer.training.make_extension(trigger=(200, 'iteration'))
    def translate(trainer):
        translate_one(
            'Who are we ?',
            'Qui sommes-nous?')
        translate_one(
            'And it often costs over a hundred dollars ' +
            'to obtain the required identity card .',
            'Or, il en coûte souvent plus de cent dollars ' +
            'pour obtenir la carte d\'identité requise.')

        source, target = test_data[numpy.random.choice(len(test_data))]
        source = ' '.join([source_words.get(i, '') for i in source])
        target = ' '.join([target_words.get(i, '') for i in target])
        translate_one(source, target)

    if comm.rank == 0:
        trainer.extend(extensions.LogReport(trigger=(1, 'epoch')),
                       trigger=(1, 'epoch'))

        report = extensions.PrintReport(['epoch',
                                         'iteration',
                                         'main/loss',
                                         'main/perp',
                                         'validation/main/bleu',
                                         'elapsed_time'])
        trainer.extend(report, trigger=(1, 'epoch'))

    comm.mpi_comm.Barrier()
    if comm.rank == 0:
        print('start training')
        sys.stdout.flush()

    trainer.run()