import argparse
import os
import re
import sys
import time

import numpy
import six

import chainer
from chainer import training
from chainer.training import extensions

import chainermn

import europal
# Seq2seq, BleuEvaluator and convert are defined elsewhere in this example.


def read_source(in_dir, cache=None):
    en_path = os.path.join(in_dir, 'giga-fren.release2.fixed.en')
    source_vocab = ['<eos>', '<unk>'] + europal.count_words(en_path)
    source_data = europal.make_dataset(en_path, source_vocab)
    return source_vocab, source_data
def read_target(in_dir, cache=None):
    fr_path = os.path.join(in_dir, 'giga-fren.release2.fixed.fr')
    target_vocab = ['<eos>', '<unk>'] + europal.count_words(fr_path)
    target_data = europal.make_dataset(fr_path, target_vocab)
    return target_vocab, target_data
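
# `cached_call`, used by main() below, is defined elsewhere in this example.
# A minimal sketch, assuming it simply pickles the result of fn(*args) to
# cache_file and reuses the pickled result on later runs:
import pickle


def cached_call(cache_file, fn, *args):
    if os.path.exists(cache_file):
        # Reuse the pre-processed dataset from a previous run.
        with open(cache_file, 'rb') as f:
            return pickle.load(f)
    result = fn(*args)
    with open(cache_file, 'wb') as f:
        pickle.dump(result, f)
    return result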
def main():
    parser = argparse.ArgumentParser(description='Chainer example: seq2seq')
    parser.add_argument('--batchsize', '-b', type=int, default=64,
                        help='Number of sentence pairs in each mini-batch')
    parser.add_argument('--bleu', action='store_true', default=False,
                        help='Report BLEU score')
    parser.add_argument('--gpu', '-g', action='store_true',
                        help='Use GPU')
    parser.add_argument('--cache', '-c', default=None,
                        help='Directory to cache pre-processed dataset')
    parser.add_argument('--resume', '-r', default='',
                        help='Resume the training from snapshot')
    parser.add_argument('--unit', '-u', type=int, default=1024,
                        help='Number of units')
    parser.add_argument('--communicator', default='hierarchical',
                        help='Type of communicator')
    parser.add_argument('--stop', '-s', type=str, default='15e',
                        help='Stop trigger (ex. "500i", "15e")')
    parser.add_argument('--input', '-i', type=str, default='wmt',
                        help='Input directory')
    parser.add_argument('--optimizer', type=str, default='adam()',
                        help='Optimizer and its argument')
    parser.add_argument('--out', '-o', default='result',
                        help='Directory to output the result')
    args = parser.parse_args()

    # Prepare ChainerMN communicator
    if args.gpu:
        comm = chainermn.create_communicator(args.communicator)
        dev = comm.intra_rank
    else:
        comm = chainermn.create_communicator('naive')
        dev = -1

    if comm.rank == 0:
        print('==========================================')
        print('Num process (COMM_WORLD): {}'.format(comm.size))
        if args.gpu:
            print('Using GPUs')
        print('Using {} communicator'.format(args.communicator))
        print('Num unit: {}'.format(args.unit))
        print('Num Minibatch-size: {}'.format(args.batchsize))
        print('==========================================')

    # Rank 0 prepares all data
    if comm.rank == 0:
        if args.cache and not os.path.exists(args.cache):
            os.mkdir(args.cache)

        # Read source data
        bt = time.time()
        if args.cache:
            cache_file = os.path.join(args.cache, 'source.pickle')
            source_vocab, source_data = cached_call(cache_file,
                                                    read_source,
                                                    args.input, args.cache)
        else:
            source_vocab, source_data = read_source(args.input, args.cache)
        et = time.time()
        print('RD source done. {:.3f} [s]'.format(et - bt))
        sys.stdout.flush()

        # Read target data
        bt = time.time()
        if args.cache:
            cache_file = os.path.join(args.cache, 'target.pickle')
            target_vocab, target_data = cached_call(cache_file,
                                                    read_target,
                                                    args.input, args.cache)
        else:
            target_vocab, target_data = read_target(args.input, args.cache)
        et = time.time()
        print('RD target done. {:.3f} [s]'.format(et - bt))
        sys.stdout.flush()

        print('Original training data size: %d' % len(source_data))
        train_data = [(s, t)
                      for s, t in six.moves.zip(source_data, target_data)
                      if 0 < len(s) < 50 and 0 < len(t) < 50]
        print('Filtered training data size: %d' % len(train_data))

        en_path = os.path.join(args.input, 'dev', 'newstest2013.en')
        source_data = europal.make_dataset(en_path, source_vocab)
        fr_path = os.path.join(args.input, 'dev', 'newstest2013.fr')
        target_data = europal.make_dataset(fr_path, target_vocab)
        assert len(source_data) == len(target_data)
        test_data = [(s, t)
                     for s, t in six.moves.zip(source_data, target_data)
                     if 0 < len(s) and 0 < len(t)]

        source_ids = {word: index for index, word in enumerate(source_vocab)}
        target_ids = {word: index for index, word in enumerate(target_vocab)}
    else:
        # target_data, source_data = None, None
        train_data, test_data = None, None
        target_ids, source_ids = None, None

    # Print GPU id
    for i in range(0, comm.size):
        if comm.rank == i:
            print('Rank {} GPU: {}'.format(comm.rank, dev))
        sys.stdout.flush()
        comm.mpi_comm.Barrier()

    # Broadcast the word -> id dictionaries from rank 0
    source_ids = comm.bcast_obj(source_ids, root=0)
    target_ids = comm.bcast_obj(target_ids, root=0)

    target_words = {i: w for w, i in target_ids.items()}
    source_words = {i: w for w, i in source_ids.items()}

    if comm.rank == 0:
        print('target_words : {}'.format(len(target_words)))
        print('source_words : {}'.format(len(source_words)))

    model = Seq2seq(3, len(source_ids), len(target_ids), args.unit)

    if dev >= 0:
        chainer.cuda.get_device_from_id(dev).use()
        model.to_gpu(dev)

    # Determine the stop trigger
    m = re.match(r'^(\d+)e$', args.stop)
    if m:
        trigger = (int(m.group(1)), 'epoch')
    else:
        m = re.match(r'^(\d+)i$', args.stop)
        if m:
            trigger = (int(m.group(1)), 'iteration')
        else:
            if comm.rank == 0:
                sys.stderr.write('Error: unknown stop trigger: {}\n'.format(
                    args.stop))
            exit(-1)

    if comm.rank == 0:
        print('Trigger: {}'.format(trigger))

    optimizer = chainermn.create_multi_node_optimizer(
        create_optimizer(args.optimizer), comm)
    optimizer.setup(model)

    # Scatter the dataset from rank 0 to all workers
    train_data = chainermn.scatter_dataset(train_data, comm)
    test_data = chainermn.scatter_dataset(test_data, comm)

    train_iter = chainer.iterators.SerialIterator(train_data,
                                                  args.batchsize,
                                                  shuffle=False)
    updater = training.StandardUpdater(
        train_iter, optimizer, converter=convert, device=dev)
    trainer = training.Trainer(updater, trigger, out=args.out)

    trainer.extend(chainermn.create_multi_node_evaluator(
        BleuEvaluator(model, test_data, device=dev, comm=comm), comm))

    def translate_one(source, target):
        words = europal.split_sentence(source)
        print('# source : ' + ' '.join(words))
        x = model.xp.array(
            [source_ids.get(w, 1) for w in words], numpy.int32)
        ys = model.translate([x])[0]
        words = [target_words[y] for y in ys]
        print('# result : ' + ' '.join(words))
        print('# expect : ' + target)

    # @chainer.training.make_extension(trigger=(200, 'iteration'))
    def translate(trainer):
        translate_one(
            'Who are we ?',
            'Qui sommes-nous?')
        translate_one(
            'And it often costs over a hundred dollars ' +
            'to obtain the required identity card .',
            'Or, il en coûte souvent plus de cent dollars ' +
            'pour obtenir la carte d\'identité requise.')

        source, target = test_data[numpy.random.choice(len(test_data))]
        source = ' '.join([source_words.get(i, '') for i in source])
        target = ' '.join([target_words.get(i, '') for i in target])
        translate_one(source, target)

    if comm.rank == 0:
        trainer.extend(extensions.LogReport(trigger=(1, 'epoch')),
                       trigger=(1, 'epoch'))
        report = extensions.PrintReport(['epoch',
                                         'iteration',
                                         'main/loss',
                                         'main/perp',
                                         'validation/main/bleu',
                                         'elapsed_time'])
        trainer.extend(report, trigger=(1, 'epoch'))

    comm.mpi_comm.Barrier()
    if comm.rank == 0:
        print('start training')
        sys.stdout.flush()

    trainer.run()
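
# `create_optimizer`, referenced above, is also defined elsewhere in this
# example.  A minimal sketch, assuming the '--optimizer' spec is a string
# such as 'adam()' or 'sgd(lr=0.1)' whose keyword arguments are floats:
def create_optimizer(spec):
    name, _, argstr = spec.partition('(')
    kwargs = {}
    for item in argstr.rstrip(')').split(','):
        if item:
            key, _, value = item.partition('=')
            kwargs[key.strip()] = float(value)
    factories = {
        'adam': chainer.optimizers.Adam,
        'sgd': chainer.optimizers.SGD,
    }
    return factories[name](**kwargs)


# Typical launch, assuming this script is saved as a stand-alone file
# (e.g. seq2seq_mn.py, a hypothetical name) and run under MPI with one
# process per GPU:
#
#   mpiexec -n 4 python seq2seq_mn.py --gpu --input wmt
#
if __name__ == '__main__':
    main()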
# The convolutional seq2seq example below reuses the imports above and
# additionally needs json and the model definition module `net`.
# FailMinValueTrigger and CalculateBleu are defined elsewhere in this
# example.
import json

import net


def main():
    parser = argparse.ArgumentParser(
        description='Chainer example: convolutional seq2seq')
    parser.add_argument('--batchsize', '-b', type=int, default=32,
                        help='Number of sentence pairs in each mini-batch')
    parser.add_argument('--epoch', '-e', type=int, default=100,
                        help='Number of sweeps over the dataset to train')
    parser.add_argument('--gpu', '-g', type=int, default=-1,
                        help='GPU ID (negative value indicates CPU)')
    parser.add_argument('--unit', '-u', type=int, default=512,
                        help='Number of units')
    parser.add_argument('--layer', '-l', type=int, default=15,
                        help='Number of layers')
    parser.add_argument('--input', '-i', type=str, default='./',
                        help='Input directory')
    parser.add_argument('--source', '-s', type=str,
                        default='europarl-v7.fr-en.en',
                        help='Filename of train data for source language')
    parser.add_argument('--target', '-t', type=str,
                        default='europarl-v7.fr-en.fr',
                        help='Filename of train data for target language')
    parser.add_argument('--source-valid', '-svalid', type=str,
                        default='dev/newstest2013.en',
                        help='Filename of validation data for source language')
    parser.add_argument('--target-valid', '-tvalid', type=str,
                        default='dev/newstest2013.fr',
                        help='Filename of validation data for target language')
    parser.add_argument('--out', '-o', default='result',
                        help='Directory to output the result')
    parser.add_argument('--source-vocab', type=int, default=40000,
                        help='Vocabulary size of source language')
    parser.add_argument('--target-vocab', type=int, default=40000,
                        help='Vocabulary size of target language')
    args = parser.parse_args()
    print(json.dumps(args.__dict__, indent=4))

    # Load training data
    en_path = os.path.join(args.input, args.source)
    source_vocab = ['<eos>', '<unk>'] + \
        europal.count_words(en_path, args.source_vocab)
    source_data = europal.make_dataset(en_path, source_vocab)
    fr_path = os.path.join(args.input, args.target)
    target_vocab = ['<eos>', '<unk>'] + \
        europal.count_words(fr_path, args.target_vocab)
    target_data = europal.make_dataset(fr_path, target_vocab)
    assert len(source_data) == len(target_data)
    print('Original training data size: %d' % len(source_data))
    train_data = [(s, t)
                  for s, t in six.moves.zip(source_data, target_data)
                  if 0 < len(s) < 50 and 0 < len(t) < 50]
    print('Filtered training data size: %d' % len(train_data))

    en_path = os.path.join(args.input, args.source_valid)
    source_data = europal.make_dataset(en_path, source_vocab)
    fr_path = os.path.join(args.input, args.target_valid)
    target_data = europal.make_dataset(fr_path, target_vocab)
    assert len(source_data) == len(target_data)
    test_data = [(s, t)
                 for s, t in six.moves.zip(source_data, target_data)
                 if 0 < len(s) and 0 < len(t)]

    source_ids = {word: index for index, word in enumerate(source_vocab)}
    target_ids = {word: index for index, word in enumerate(target_vocab)}

    target_words = {i: w for w, i in target_ids.items()}
    source_words = {i: w for w, i in source_ids.items()}

    # Define model
    model = net.Seq2seq(args.layer, len(source_ids), len(target_ids),
                        args.unit)
    if args.gpu >= 0:
        chainer.cuda.get_device(args.gpu).use()
        model.to_gpu(args.gpu)

    # Setup optimizer
    optimizer = chainer.optimizers.NesterovAG(lr=0.25, momentum=0.99)
    optimizer.setup(model)
    optimizer.add_hook(chainer.optimizer.GradientClipping(0.1))

    # Setup trainer
    train_iter = chainer.iterators.SerialIterator(train_data, args.batchsize)
    test_iter = chainer.iterators.SerialIterator(test_data, args.batchsize,
                                                 repeat=False, shuffle=False)
    iter_per_epoch = len(train_data) // args.batchsize
    print('Number of iter/epoch =', iter_per_epoch)

    updater = training.StandardUpdater(
        train_iter, optimizer,
        converter=seq2seq_pad_concat_convert, device=args.gpu)
    trainer = training.Trainer(updater, (args.epoch, 'epoch'), out=args.out)

    # If you want to change the logging interval, change this number
    log_trigger = (min(1000, iter_per_epoch // 2), 'iteration')

    def floor_step(trigger):
        floored = trigger[0] - trigger[0] % log_trigger[0]
        if floored <= 0:
            floored = trigger[0]
        return (floored, trigger[1])

    # Validation every half epoch
    eval_trigger = floor_step((iter_per_epoch // 2, 'iteration'))
    fail_trigger = FailMinValueTrigger('val/main/perp', eval_trigger)
    record_trigger = training.triggers.MinValueTrigger('val/main/perp',
                                                       eval_trigger)

    evaluator = extensions.Evaluator(
        test_iter, model,
        converter=seq2seq_pad_concat_convert, device=args.gpu)
    evaluator.default_name = 'val'
    trainer.extend(evaluator, trigger=eval_trigger)

    # Only if validation perplexity fails to improve,
    # the learning rate is decayed (down to 1e-4).
    trainer.extend(extensions.ExponentialShift('lr', 0.1, target=1e-4),
                   trigger=fail_trigger)
    trainer.extend(extensions.observe_lr(), trigger=eval_trigger)

    # Only if a model gets the best validation score,
    # save the model.
    trainer.extend(extensions.snapshot_object(
        model, 'model_iter_{.updater.iteration}.npz'),
        trigger=record_trigger)

    def translate_one(source, target):
        words = europal.split_sentence(source)
        print('# source : ' + ' '.join(words))
        x = model.xp.array([source_ids.get(w, 1) for w in words], 'i')
        ys = model.translate([x])[0]
        words = [target_words[y] for y in ys]
        print('# result : ' + ' '.join(words))
        print('# expect : ' + target)

    @chainer.training.make_extension(trigger=(200, 'iteration'))
    def translate(trainer):
        translate_one('Who are we ?', 'Qui sommes-nous?')
        translate_one(
            'And it often costs over a hundred dollars ' +
            'to obtain the required identity card .',
            'Or, il en coûte souvent plus de cent dollars ' +
            'pour obtenir la carte d\'identité requise.')

        source, target = test_data[numpy.random.choice(len(test_data))]
        source = ' '.join([source_words[i] for i in source])
        target = ' '.join([target_words[i] for i in target])
        translate_one(source, target)

    # Generation test
    trainer.extend(translate,
                   trigger=(min(200, iter_per_epoch), 'iteration'))

    # Calculate BLEU every half epoch
    trainer.extend(CalculateBleu(model, test_data, 'val/main/bleu',
                                 device=args.gpu,
                                 batch=args.batchsize // 4),
                   trigger=floor_step((iter_per_epoch // 2, 'iteration')))

    # Log
    trainer.extend(extensions.LogReport(trigger=log_trigger),
                   trigger=log_trigger)
    trainer.extend(extensions.PrintReport([
        'epoch', 'iteration',
        'main/loss', 'val/main/loss',
        'main/perp', 'val/main/perp',
        'main/acc', 'val/main/acc',
        'val/main/bleu',
        'lr',
        'elapsed_time'
    ]), trigger=log_trigger)

    print('start training')
    trainer.run()
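
# `seq2seq_pad_concat_convert`, used as the converter above, is defined
# elsewhere in this example.  A minimal sketch of a pad-and-concat
# converter, assuming -1 is the padding/ignore label expected by the loss;
# the real converter may additionally insert <eos> and build separate
# decoder input/output blocks:
def seq2seq_pad_concat_convert(batch, device):
    # batch is a list of (source_ids, target_ids) pairs of int32 sequences.
    xs, ys = zip(*batch)

    def pad_concat(seqs):
        max_len = max(len(s) for s in seqs)
        block = numpy.full((len(seqs), max_len), -1, dtype=numpy.int32)
        for i, s in enumerate(seqs):
            block[i, :len(s)] = s
        return chainer.dataset.to_device(device, block)

    return pad_concat(xs), pad_concat(ys)


# Entry point, assuming this script is likewise saved as its own file:
if __name__ == '__main__':
    main()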