def dump_random_source_target_translation(model, source_buckets, target_buckets, vocab_inv_source, vocab_inv_target, num_translate=3, beam_width=8):
    """Sample `num_translate` (source, target) pairs from every bucket,
    translate them with the model and dump source/translation/target for
    visual inspection.

    Args:
        model: the seq2seq model used for translation.
        source_buckets / target_buckets: parallel lists of bucketed data.
        vocab_inv_source / vocab_inv_target: id -> token lookup tables.
        num_translate: number of sampled pairs per bucket.
        beam_width: 1 selects greedy decoding; otherwise beam search is
            run one sequence at a time.
    """
    # NOTE: the original body bound `xp = model.xp` but never used it; the
    # unused local has been removed.
    for source_bucket, target_bucket in zip(source_buckets, target_buckets):
        source_batch, target_batch = sample_batch_from_bucket(
            source_bucket, target_bucket, num_translate)

        if beam_width == 1:
            # greedy decoding works on the whole minibatch at once;
            # the generation limit is twice the target length.
            translation_batch = translate_greedy(model, source_batch,
                target_batch.shape[1] * 2, len(vocab_inv_target), beam_width)
            for source, translation, target in zip(source_batch, translation_batch, target_batch):
                dump_translation(vocab_inv_source, vocab_inv_target,
                    source, translation, target)
        else:
            # beam search decodes one source sequence at a time.
            for source, target in zip(source_batch, target_batch):
                translation_batch = translate_beam_search(model, source,
                    target.size * 2, len(vocab_inv_target), beam_width)
                dump_translation(vocab_inv_source, vocab_inv_target,
                    source, translation_batch, target)
def compute_random_mean_wer(model, source_buckets, target_buckets, target_vocab_size, sample_size=100, argmax=True):
    """Return a list with one mean word-error-rate (in percent) per bucket,
    each computed on a randomly sampled minibatch of `sample_size` pairs.

    Args:
        model: the seq2seq model to evaluate.
        source_buckets / target_buckets: parallel lists of bucketed data.
        target_vocab_size: size of the target vocabulary.
        sample_size: minibatch size sampled from each bucket.
        argmax: forwarded to the batch WER helper (decoding mode).
    """
    # NOTE: the original body bound `xp = model.xp` but never used it; the
    # unused local has been removed.
    result = []
    for source_bucket, target_bucket in zip(source_buckets, target_buckets):
        # sample minibatch
        source_batch, target_batch = sample_batch_from_bucket(
            source_bucket, target_bucket, sample_size)
        # compute WER; helper returns a fraction, convert to percent
        mean_wer = _compute_batch_wer_mean(model, source_batch, target_batch,
            target_vocab_size, argmax=argmax)
        result.append(mean_wer * 100)
    return result
def show_random_source_target_translation(model, source_buckets, target_buckets, vocab_inv_source, vocab_inv_target, num_translate=100, argmax=True):
    """Sample `num_translate` pairs from every bucket, translate the whole
    minibatch and print the results.

    Args:
        model: the seq2seq model used for translation.
        source_buckets / target_buckets: parallel lists of bucketed data.
        vocab_inv_source / vocab_inv_target: id -> token lookup tables.
        num_translate: number of sampled pairs per bucket.
        argmax: forwarded to the batch translation helper (decoding mode).
    """
    # NOTE: the original body bound `xp = model.xp` but never used it; the
    # unused local has been removed.
    for source_bucket, target_bucket in zip(source_buckets, target_buckets):
        # sample minibatch
        source_batch, target_batch = sample_batch_from_bucket(
            source_bucket, target_bucket, num_translate)
        # generation limit is twice the padded target length
        translation_batch = _translate_batch(model, source_batch,
            target_batch.shape[1] * 2, vocab_inv_source, vocab_inv_target,
            argmax=argmax)
        show_translate_results(vocab_inv_source, vocab_inv_target,
            source_batch, translation_batch, target_batch)
def compute_random_error_rate_buckets(model, source_buckets, target_buckets, target_vocab_size, sample_size=100, beam_width=8, normalization_alpha=0):
    """Return a list with one mean error rate (in percent) per bucket,
    computed on a randomly sampled minibatch from each bucket.

    Args:
        model: the seq2seq model to evaluate.
        source_buckets / target_buckets: parallel lists of bucketed data.
        target_vocab_size: size of the target vocabulary.
        sample_size: minibatch size sampled from each bucket.
        beam_width: 1 selects batched greedy evaluation; otherwise beam
            search is run per sequence (with a progress line on stdout).
        normalization_alpha: length-normalization coefficient forwarded to
            the beam-search scorer.
    """
    # NOTE: the original body bound `xp = model.xp` but never used it; the
    # unused local has been removed.
    result = []
    for bucket_index, (source_bucket, target_bucket) in enumerate(
            zip(source_buckets, target_buckets)):
        source_batch, target_batch = sample_batch_from_bucket(
            source_bucket, target_bucket, sample_size)

        if beam_width == 1:
            # greedy: the helper evaluates the whole minibatch at once
            mean_wer = compute_error_rate_source_batch(model, source_batch,
                target_batch, target_vocab_size)
        else:
            # beam search: evaluate one sequence at a time.
            # Iterate over the actual batch length (a bucket may hold
            # fewer than `sample_size` pairs), matching the divisor below.
            num_sequences = len(source_batch)
            sum_wer = 0
            for index in range(num_sequences):
                sys.stdout.write(
                    "\rcomputing WER ... bucket {}/{} (sequence {}/{})".format(
                        bucket_index + 1, len(source_buckets), index + 1,
                        num_sequences))
                sys.stdout.flush()
                source = source_batch[index]
                target = target_batch[index]
                wer = compute_error_rate_source_sequence(model, source, target,
                    target_vocab_size, beam_width, normalization_alpha)
                sum_wer += wer
            mean_wer = sum_wer / num_sequences
            sys.stdout.write("\r" + stdout.CLEAR)
            sys.stdout.flush()

        result.append(mean_wer * 100)
    return result
def main(args):
    """Train the RNN language model end-to-end.

    Pipeline: read and split the text corpus, bucket sequences by length,
    build/restore the model, then iterate epochs of minibatch training with
    periodic checkpointing, accuracy/perplexity logging, and learning-rate
    decay driven by dev-set perplexity.
    """
    # load textfile
    train_dataset, dev_dataset, test_dataset, vocab, vocab_inv = read_data(
        args.text_filename, train_split_ratio=args.train_split,
        dev_split_ratio=args.dev_split, seed=args.seed)
    save_vocab(args.model_dir, vocab, vocab_inv)
    vocab_size = len(vocab)
    print_bold("data # hash")
    print("train {} {}".format(len(train_dataset), hash(str(train_dataset))))
    print("dev {} {}".format(len(dev_dataset), hash(str(dev_dataset))))
    print("test {} {}".format(len(test_dataset), hash(str(test_dataset))))
    print("vocab {}".format(vocab_size))

    # split into buckets
    train_buckets = make_buckets(train_dataset)
    print_bold("buckets #data (train)")
    if args.buckets_limit is not None:
        # keep only the first buckets_limit+1 buckets
        train_buckets = train_buckets[:args.buckets_limit + 1]
    for size, data in zip(bucket_sizes, train_buckets):
        print("{} {}".format(size, len(data)))

    print_bold("buckets #data (dev)")
    dev_buckets = make_buckets(dev_dataset)
    if args.buckets_limit is not None:
        dev_buckets = dev_buckets[:args.buckets_limit + 1]
    for size, data in zip(bucket_sizes, dev_buckets):
        print("{} {}".format(size, len(data)))

    print_bold("buckets #data (test)")
    test_buckets = make_buckets(test_dataset)
    for size, data in zip(bucket_sizes, test_buckets):
        print("{} {}".format(size, len(data)))

    # to maintain equilibrium:
    # larger buckets are sampled proportionally more often per iteration so
    # every bucket contributes roughly equally relative to its size.
    min_num_data = 0
    for data in train_buckets:
        if min_num_data == 0 or len(data) < min_num_data:
            min_num_data = len(data)
    repeats = []
    for data in train_buckets:
        repeat = len(data) // min_num_data
        # ensure at least one pass per bucket
        repeat = repeat + 1 if repeat == 0 else repeat
        repeats.append(repeat)

    num_updates_per_iteration = 0
    for repeat, data in zip(repeats, train_buckets):
        num_updates_per_iteration += repeat * args.batchsize
    # iterations per epoch so that ~one full pass over the training data is made
    num_iteration = len(train_dataset) // num_updates_per_iteration + 1

    # init: resume from a checkpoint if one exists, otherwise build fresh
    model = load_model(args.model_dir)
    if model is None:
        model = RNNModel(vocab_size, args.ndim_embedding, args.num_layers,
            ndim_h=args.ndim_h, kernel_size=args.kernel_size,
            pooling=args.pooling, zoneout=args.zoneout, dropout=args.dropout,
            wgain=args.wgain, densely_connected=args.densely_connected,
            ignore_label=ID_PAD)
    if args.gpu_device >= 0:
        chainer.cuda.get_device(args.gpu_device).use()
        model.to_gpu()

    # setup an optimizer (Eve or Adam), with gradient clipping + weight decay
    if args.eve:
        optimizer = Eve(alpha=args.learning_rate, beta1=0.9)
    else:
        optimizer = optimizers.Adam(alpha=args.learning_rate, beta1=0.9)
    optimizer.setup(model)
    optimizer.add_hook(chainer.optimizer.GradientClipping(args.grad_clip))
    optimizer.add_hook(chainer.optimizer.WeightDecay(args.weight_decay))
    min_learning_rate = 1e-7  # floor below which alpha is no longer halved
    prev_ppl = None  # dev perplexity of the previous epoch (for LR decay)
    total_time = 0

    def mean(l):
        # arithmetic mean of a non-empty list
        return sum(l) / len(l)

    # training
    for epoch in xrange(1, args.epoch + 1):
        print("Epoch", epoch)
        start_time = time.time()
        for itr in xrange(1, num_iteration + 1):
            sys.stdout.write("\r{} / {}".format(itr, num_iteration))
            sys.stdout.flush()

            for repeat, dataset in zip(repeats, train_buckets):
                for r in xrange(repeat):
                    batch = sample_batch_from_bucket(dataset, args.batchsize)
                    # shift by one token: source predicts target
                    source, target = make_source_target_pair(batch)
                    if model.xp is cuda.cupy:
                        source = cuda.to_gpu(source)
                        target = cuda.to_gpu(target)
                    model.reset_state()
                    Y = model(source)
                    # PAD positions are excluded from the loss
                    loss = softmax_cross_entropy(Y, target, ignore_label=ID_PAD)
                    optimizer.update(lossfun=lambda: loss)

            # periodic checkpoint + evaluation (always runs on the last iteration)
            if itr % args.interval == 0 or itr == num_iteration:
                save_model(args.model_dir, model)
                # show log
                sys.stdout.write("\r" + stdout.CLEAR)
                sys.stdout.flush()
                print_bold(" accuracy (sampled train)")
                acc_train = compute_random_accuracy(model, train_buckets, args.batchsize)
                print(" ", mean(acc_train), acc_train)
                print_bold(" accuracy (dev)")
                acc_dev = compute_accuracy(model, dev_buckets, args.batchsize)
                print(" ", mean(acc_dev), acc_dev)
                print_bold(" ppl (sampled train)")
                ppl_train = compute_random_perplexity(model, train_buckets, args.batchsize)
                print(" ", mean(ppl_train), ppl_train)
                print_bold(" ppl (dev)")
                ppl_dev = compute_perplexity(model, dev_buckets, args.batchsize)
                ppl_dev_mean = mean(ppl_dev)
                print(" ", ppl_dev_mean, ppl_dev)

        # NOTE(review): the collapsed source does not show indentation —
        # timing and LR decay are placed at epoch level, consistent with
        # `start_time` being reset per epoch; confirm against the original.
        elapsed_time = (time.time() - start_time) / 60.
        total_time += elapsed_time
        print(" done in {} min, lr = {}, total {} min".format(
            int(elapsed_time), optimizer.alpha, int(total_time)))

        # decay learning rate: halve alpha whenever dev perplexity stops improving
        if prev_ppl is not None and ppl_dev_mean >= prev_ppl and optimizer.alpha > min_learning_rate:
            optimizer.alpha *= 0.5
        prev_ppl = ppl_dev_mean
def compute_random_perplexity(model, buckets, batchsize=100):
    """Per-bucket perplexity, each value computed on one randomly
    sampled minibatch of `batchsize` sequences."""
    return [
        compute_perplexity_batch(model, sample_batch_from_bucket(dataset, batchsize))
        for dataset in buckets
    ]
def compute_random_accuracy(model, buckets, batchsize=100):
    """Per-bucket accuracy, each value computed on one randomly
    sampled minibatch of `batchsize` sequences."""
    return [
        compute_accuracy_batch(model, sample_batch_from_bucket(dataset, batchsize))
        for dataset in buckets
    ]
def main(args):
    """Train the seq2seq translation model end-to-end.

    Pipeline: read the parallel corpus, split into train/dev/test, bucket
    pairs by length, build/restore the model, then iterate epochs of
    minibatch training with periodic checkpointing, sample translations,
    WER logging, and learning-rate decay driven by dev-set WER.
    """
    # load textfile
    source_dataset, target_dataset, vocab, vocab_inv = read_data(
        args.source_filename, args.target_filename,
        train_split_ratio=args.train_split, dev_split_ratio=args.dev_split,
        seed=args.seed)
    save_vocab(args.model_dir, vocab, vocab_inv)
    source_dataset_train, source_dataset_dev, source_dataset_test = source_dataset
    target_dataset_train, target_dataset_dev, target_dataset_test = target_dataset
    print_bold("data #")
    print("train {}".format(len(source_dataset_train)))
    print("dev {}".format(len(source_dataset_dev)))
    print("test {}".format(len(source_dataset_test)))
    # separate vocabularies for source and target languages
    vocab_source, vocab_target = vocab
    vocab_inv_source, vocab_inv_target = vocab_inv
    print("vocab {} (source)".format(len(vocab_source)))
    print("vocab {} (target)".format(len(vocab_target)))

    # split into buckets (source and target bucket lists stay parallel)
    source_buckets_train, target_buckets_train = make_buckets(source_dataset_train, target_dataset_train)
    if args.buckets_limit is not None:
        # keep only the first buckets_limit+1 buckets
        source_buckets_train = source_buckets_train[:args.buckets_limit+1]
        target_buckets_train = target_buckets_train[:args.buckets_limit+1]
    print_bold("buckets #data (train)")
    for size, data in zip(bucket_sizes, source_buckets_train):
        print("{} {}".format(size, len(data)))

    print_bold("buckets #data (dev)")
    source_buckets_dev, target_buckets_dev = make_buckets(source_dataset_dev, target_dataset_dev)
    if args.buckets_limit is not None:
        source_buckets_dev = source_buckets_dev[:args.buckets_limit+1]
        target_buckets_dev = target_buckets_dev[:args.buckets_limit+1]
    for size, data in zip(bucket_sizes, source_buckets_dev):
        print("{} {}".format(size, len(data)))

    print_bold("buckets #data (test)")
    source_buckets_test, target_buckets_test = make_buckets(source_dataset_test, target_dataset_test)
    if args.buckets_limit is not None:
        source_buckets_test = source_buckets_test[:args.buckets_limit+1]
        target_buckets_test = target_buckets_test[:args.buckets_limit+1]
    for size, data in zip(bucket_sizes, source_buckets_test):
        print("{} {}".format(size, len(data)))

    # to maintain equilibrium:
    # larger buckets are sampled proportionally more often per iteration so
    # every bucket contributes roughly equally relative to its size.
    min_num_data = 0
    for data in source_buckets_train:
        if min_num_data == 0 or len(data) < min_num_data:
            min_num_data = len(data)
    repeats = []
    for data in source_buckets_train:
        repeats.append(len(data) // min_num_data + 1)

    num_updates_per_iteration = 0
    for repeat, data in zip(repeats, source_buckets_train):
        num_updates_per_iteration += repeat * args.batchsize
    # iterations per epoch so that ~one full pass over the training data is made
    num_iteration = len(source_dataset_train) // num_updates_per_iteration + 1

    # init: resume from a checkpoint if one exists, otherwise build fresh
    model = load_model(args.model_dir)
    if model is None:
        model = seq2seq(len(vocab_source), len(vocab_target),
            args.ndim_embedding, args.num_layers, ndim_h=args.ndim_h,
            pooling=args.pooling, dropout=args.dropout, zoneout=args.zoneout,
            wgain=args.wgain, densely_connected=args.densely_connected,
            attention=args.attention)
    if args.gpu_device >= 0:
        cuda.get_device(args.gpu_device).use()
        model.to_gpu()

    # setup an optimizer (Eve or Adam), with gradient clipping + weight decay
    if args.eve:
        optimizer = Eve(alpha=args.learning_rate, beta1=0.9)
    else:
        optimizer = optimizers.Adam(alpha=args.learning_rate, beta1=0.9)
    optimizer.setup(model)
    optimizer.add_hook(chainer.optimizer.GradientClipping(args.grad_clip))
    optimizer.add_hook(chainer.optimizer.WeightDecay(args.weight_decay))
    min_learning_rate = 1e-7  # floor below which alpha is no longer halved
    prev_wer = None  # dev WER of the previous epoch (for LR decay)
    total_time = 0

    def mean(l):
        # arithmetic mean of a non-empty list
        return sum(l) / len(l)

    # training
    for epoch in xrange(1, args.epoch + 1):
        print("Epoch", epoch)
        start_time = time.time()
        for itr in xrange(1, num_iteration + 1):
            for repeat, source_bucket, target_bucket in zip(repeats, source_buckets_train, target_buckets_train):
                for r in xrange(repeat):
                    # sample minibatch
                    source_batch, target_batch = sample_batch_from_bucket(
                        source_bucket, target_bucket, args.batchsize)
                    # mask out PAD positions in the encoder input
                    skip_mask = source_batch != ID_PAD
                    # shift target by one token: input predicts output
                    target_batch_input, target_batch_output = make_source_target_pair(target_batch)

                    # to gpu
                    if model.xp is cuda.cupy:
                        skip_mask = cuda.to_gpu(skip_mask)
                        source_batch = cuda.to_gpu(source_batch)
                        target_batch_input = cuda.to_gpu(target_batch_input)
                        target_batch_output = cuda.to_gpu(target_batch_output)

                    # compute loss
                    model.reset_state()
                    if args.attention:
                        # attention decoder also needs the encoder's
                        # per-step outputs and the skip mask
                        last_hidden_states, last_layer_outputs = model.encode(source_batch, skip_mask)
                        Y = model.decode(target_batch_input, last_hidden_states, last_layer_outputs, skip_mask)
                    else:
                        last_hidden_states = model.encode(source_batch, skip_mask)
                        Y = model.decode(target_batch_input, last_hidden_states)
                    # PAD positions are excluded from the loss
                    loss = softmax_cross_entropy(Y, target_batch_output, ignore_label=ID_PAD)
                    optimizer.update(lossfun=lambda: loss)

            sys.stdout.write("\r{} / {}".format(itr, num_iteration))
            sys.stdout.flush()

            # periodic checkpoint + evaluation (always runs on the last iteration)
            if itr % args.interval == 0 or itr == num_iteration:
                save_model(args.model_dir, model)
                # show log
                sys.stdout.write("\r" + stdout.CLEAR)
                sys.stdout.flush()
                print_bold("translate (train)")
                show_random_source_target_translation(model,
                    source_buckets_train, target_buckets_train,
                    vocab_inv_source, vocab_inv_target, num_translate=5,
                    argmax=True)
                print_bold("translate (dev)")
                show_random_source_target_translation(model,
                    source_buckets_dev, target_buckets_dev,
                    vocab_inv_source, vocab_inv_target, num_translate=5,
                    argmax=True)
                print_bold("WER (sampled train)")
                wer_train = compute_random_mean_wer(model,
                    source_buckets_train, target_buckets_train,
                    len(vocab_inv_target), sample_size=args.batchsize,
                    argmax=True)
                print(mean(wer_train), wer_train)
                print_bold("WER (dev)")
                wer_dev = compute_mean_wer(model, source_buckets_dev,
                    target_buckets_dev, len(vocab_inv_target),
                    batchsize=args.batchsize, argmax=True)
                mean_wer_dev = mean(wer_dev)
                print(mean_wer_dev, wer_dev)

        # NOTE(review): the collapsed source does not show indentation —
        # timing and LR decay are placed at epoch level, consistent with
        # `start_time` being reset per epoch; confirm against the original.
        elapsed_time = (time.time() - start_time) / 60.
        total_time += elapsed_time
        print("done in {} min, lr = {}, total {} min".format(
            int(elapsed_time), optimizer.alpha, int(total_time)))

        # decay learning rate: halve alpha whenever dev WER stops improving
        if prev_wer is not None and mean_wer_dev >= prev_wer and optimizer.alpha > min_learning_rate:
            optimizer.alpha *= 0.5
        prev_wer = mean_wer_dev