def dump_dataset(source_dataset, vocab, source_bucket):
    """Print split sizes, source/target vocab sizes and per-bucket counts.

    Args:
        source_dataset: (train, dev, test) triple of source-side datasets.
        vocab: (source_vocab, target_vocab) pair.
        source_bucket: (train, dev, test) triple of bucketed datasets;
            dev/test entries may be falsy (None/empty) and are then skipped.
    """
    train_data, dev_data, test_data = source_dataset
    src_vocab, tgt_vocab = vocab
    train_buckets, dev_buckets, test_buckets = source_bucket

    def show_bucket_counts(buckets):
        # One "<size> <count>" line per bucket, paired with the global bucket_sizes.
        for size, bucket in zip(bucket_sizes, buckets):
            print("{} {}".format(size, len(bucket)))

    printb("data #")
    print("train {}".format(len(train_data)))
    if len(dev_data) > 0:
        print("dev {}".format(len(dev_data)))
    if len(test_data) > 0:
        print("test {}".format(len(test_data)))
    print("vocab {} (source)".format(len(src_vocab)))
    print("vocab {} (target)".format(len(tgt_vocab)))

    printb("buckets #data (train)")
    show_bucket_counts(train_buckets)
    if dev_buckets:
        printb("buckets #data (dev)")
        show_bucket_counts(dev_buckets)
    if test_buckets:
        printb("buckets #data (test)")
        show_bucket_counts(test_buckets)
def dump_dataset(dataset_train, dataset_dev, train_buckets, dev_buckets, vocab_size):
    """Print split sizes (with content hashes), vocab size and per-bucket counts.

    Args:
        dataset_train: training dataset (sequence).
        dataset_dev: development dataset (sequence; may be empty).
        train_buckets: bucketed training data.
        dev_buckets: bucketed dev data, or None when no dev set was loaded.
        vocab_size: size of the vocabulary.
    """
    printb("data # hash")
    print("train {} {}".format(len(dataset_train), hash(str(dataset_train))))
    if len(dataset_dev) > 0:
        print("dev {} {}".format(len(dataset_dev), hash(str(dataset_dev))))
    print("vocab {}".format(vocab_size))
    printb("buckets #data (train)")
    for size, data in zip(bucket_sizes, train_buckets):
        print("{} {}".format(size, len(data)))
    # BUG FIX: the caller passes dev_buckets=None when the dev set is empty;
    # the original `len(dev_buckets) > 0` raised TypeError on None. Truthiness
    # covers both None and an empty list.
    if dev_buckets:
        printb("buckets #data (dev)")
        for size, data in zip(bucket_sizes, dev_buckets):
            print("{} {}".format(size, len(data)))
def main(args):
    """Evaluate the word error rate (WER) of a saved seq2seq model.

    Loads the vocab and model from args.model_dir, reads the train/dev/test
    parallel corpora, buckets them by length, and reports per-bucket data
    counts and the WER of each non-empty split.
    """
    vocab, vocab_inv = load_vocab(args.model_dir)
    vocab_source, vocab_target = vocab
    vocab_inv_source, vocab_inv_target = vocab_inv

    # Source sequences are reversed, matching the training-time convention.
    source_dataset, target_dataset = read_data(
        vocab_source, vocab_target, args.source_train, args.target_train,
        args.source_dev, args.target_dev, args.source_test, args.target_test,
        reverse_source=True)
    source_dataset_train, source_dataset_dev, source_dataset_test = source_dataset
    target_dataset_train, target_dataset_dev, target_dataset_test = target_dataset

    printb("data #")
    if len(source_dataset_train) > 0:
        print("train {}".format(len(source_dataset_train)))
    if len(source_dataset_dev) > 0:
        print("dev {}".format(len(source_dataset_dev)))
    if len(source_dataset_test) > 0:
        print("test {}".format(len(source_dataset_test)))
    print("vocab {} (source)".format(len(vocab_source)))
    print("vocab {} (target)".format(len(vocab_target)))

    def prepare_buckets(source_data, target_data, tag):
        # Bucket one split and report per-bucket counts.
        # Returns (None, None) when the split is empty (originally the
        # triplicated inline code for train/dev/test).
        if len(source_data) == 0:
            return None, None
        printb("buckets #data ({})".format(tag))
        src_buckets, tgt_buckets = make_buckets(source_data, target_data)
        if args.buckets_slice is not None:
            src_buckets = src_buckets[:args.buckets_slice + 1]
            tgt_buckets = tgt_buckets[:args.buckets_slice + 1]
        for size, data in zip(bucket_sizes, src_buckets):
            print("{} {}".format(size, len(data)))
        return src_buckets, tgt_buckets

    source_buckets_train, target_buckets_train = prepare_buckets(
        source_dataset_train, target_dataset_train, "train")
    source_buckets_dev, target_buckets_dev = prepare_buckets(
        source_dataset_dev, target_dataset_dev, "dev")
    source_buckets_test, target_buckets_test = prepare_buckets(
        source_dataset_test, target_dataset_test, "test")

    model = load_model(args.model_dir)
    assert model is not None
    if args.gpu_device >= 0:
        cuda.get_device(args.gpu_device).use()
        model.to_gpu()

    def mean(l):
        return sum(l) / len(l)

    def report_wer(src_buckets, tgt_buckets, tag):
        # Compute and print the per-bucket WER list and its mean for one split.
        printb("WER ({})".format(tag))
        wer = compute_error_rate_buckets(
            model, src_buckets, tgt_buckets, len(vocab_target),
            args.beam_width, args.alpha)
        print(mean(wer), wer)

    # Evaluation must run with train=False (disables dropout/zoneout etc.).
    with chainer.using_config("train", False):
        if source_buckets_train is not None:
            report_wer(source_buckets_train, target_buckets_train, "train")
        if source_buckets_dev is not None:
            report_wer(source_buckets_dev, target_buckets_dev, "dev")
        if source_buckets_test is not None:
            report_wer(source_buckets_test, target_buckets_test, "test")
def main():
    """Train the RNN language model.

    Reads `args` from module scope (not a parameter). Buckets the training
    data by length, samples buckets proportionally to their size each
    iteration, and saves the model and reports dev perplexity once per epoch.
    """
    # load textfile
    dataset_train, dataset_dev, _, vocab, vocab_inv = read_data(args.train_filename, args.dev_filename)
    vocab_size = len(vocab)
    save_vocab(args.model_dir, vocab, vocab_inv)
    # split into buckets (dev_buckets stays None when no dev data was given)
    train_buckets = make_buckets(dataset_train)
    if args.buckets_slice is not None:
        # Keep only the first buckets_slice+1 (shortest-sequence) buckets.
        train_buckets = train_buckets[:args.buckets_slice + 1]
    dev_buckets = None
    if len(dataset_dev) > 0:
        dev_buckets = make_buckets(dataset_dev)
        if args.buckets_slice is not None:
            dev_buckets = dev_buckets[:args.buckets_slice + 1]
    # print
    dump_dataset(dataset_train, dataset_dev, train_buckets, dev_buckets, vocab_size)
    # to maintain equilibrium
    # One pass over each bucket per epoch: iterations per bucket = ceil(len/batchsize),
    # and buckets are sampled with probability proportional to that count.
    required_interations = []
    for data in train_buckets:
        itr = math.ceil(len(data) / args.batchsize)
        required_interations.append(itr)
    total_iterations = sum(required_interations)
    buckets_distribution = np.asarray(required_interations, dtype=float) / total_iterations
    # init (resume from a saved model if one exists in model_dir)
    model = load_model(args.model_dir)
    if model is None:
        model = RNNModel(vocab_size, args.ndim_embedding, args.num_layers, ndim_h=args.ndim_h, kernel_size=args.kernel_size, pooling=args.pooling, zoneout=args.zoneout, dropout=args.dropout, weightnorm=args.weightnorm, wgain=args.wgain, densely_connected=args.densely_connected, ignore_label=ID_PAD)
    if args.gpu_device >= 0:
        chainer.cuda.get_device(args.gpu_device).use()
        model.to_gpu()
    # setup an optimizer
    optimizer = get_optimizer(args.optimizer, args.learning_rate, args.momentum)
    optimizer.setup(model)
    optimizer.add_hook(chainer.optimizer.GradientClipping(args.grad_clip))
    optimizer.add_hook(chainer.optimizer.WeightDecay(args.weight_decay))
    final_learning_rate = 1e-4  # lower bound for the per-epoch lr decay below
    total_time = 0

    def mean(l):
        return sum(l) / len(l)

    # training
    for epoch in range(1, args.epoch + 1):
        print("Epoch", epoch)
        start_time = time.time()
        with chainer.using_config("train", True):
            for itr in range(total_iterations):
                # Pick a bucket at random, weighted by its iteration share.
                bucket_idx = int(np.random.choice(np.arange(len(train_buckets)), size=1, p=buckets_distribution))
                dataset = train_buckets[bucket_idx]
                # In-place shuffle, then take the first batchsize rows as the minibatch.
                np.random.shuffle(dataset)
                data_batch = dataset[:args.batchsize]
                source_batch, target_batch = make_source_target_pair(data_batch)
                if args.gpu_device >= 0:
                    source_batch = cuda.to_gpu(source_batch)
                    target_batch = cuda.to_gpu(target_batch)
                # update params
                model.reset_state()
                y_batch = model(source_batch)
                loss = F.softmax_cross_entropy(y_batch, target_batch, ignore_label=ID_PAD)
                # lossfun returns the already-computed loss; update() handles
                # cleargrads/backward/step around it.
                optimizer.update(lossfun=lambda: loss)
                # show log
                printr("iteration {}/{}".format(itr + 1, total_iterations))
        # Checkpoint once per epoch.
        save_model(args.model_dir, model)
        # clear console
        printr("")
        # compute perplexity
        with chainer.using_config("train", False):
            if dev_buckets is not None:
                printb("	ppl (dev)")
                ppl_dev = compute_perplexity(model, dev_buckets, args.batchsize)
                print("	", mean(ppl_dev), ppl_dev)
        # show log
        elapsed_time = (time.time() - start_time) / 60.
        total_time += elapsed_time
        print("	done in {} min, lr = {}, total {} min".format(int(elapsed_time), get_current_learning_rate(optimizer), int(total_time)))
        # decay learning rate (never below final_learning_rate)
        decay_learning_rate(optimizer, args.lr_decay_factor, final_learning_rate)
def main():
    """Report perplexity of a saved model on the train/dev/test splits.

    Reads `args` from module scope (not a parameter). Loads the vocab and
    model from args.model_dir, buckets each split, and prints per-bucket
    counts and per-split perplexity.
    """
    # load textfile
    vocab, vocab_inv = load_vocab(args.model_dir)
    dataset_train, dataset_dev, dataset_test, _, _ = read_data(
        args.train_filename, args.dev_filename, args.test_filename, vocab=vocab)
    vocab_size = len(vocab)

    printb("data # hash")
    print("train {} {}".format(len(dataset_train), hash(str(dataset_train))))
    if len(dataset_dev) > 0:
        print("dev {} {}".format(len(dataset_dev), hash(str(dataset_dev))))
    if len(dataset_test) > 0:
        print("test {} {}".format(len(dataset_test), hash(str(dataset_test))))
    print("vocab {}".format(vocab_size))

    def prepare_buckets(dataset, tag):
        # Bucket one split and report per-bucket counts; None when empty.
        # (Replaces the triplicated inline code for train/dev/test.)
        if len(dataset) == 0:
            return None
        printb("buckets #data ({})".format(tag))
        buckets = make_buckets(dataset)
        if args.buckets_slice is not None:
            buckets = buckets[:args.buckets_slice + 1]
        for size, data in zip(bucket_sizes, buckets):
            print("{} {}".format(size, len(data)))
        return buckets

    buckets_train = prepare_buckets(dataset_train, "train")
    buckets_dev = prepare_buckets(dataset_dev, "dev")
    buckets_test = prepare_buckets(dataset_test, "test")

    # init
    model = load_model(args.model_dir)
    assert model is not None
    if args.gpu_device >= 0:
        chainer.cuda.get_device(args.gpu_device).use()
        model.to_gpu()

    # show log
    def mean(l):
        return sum(l) / len(l)

    # Wipe any progress line left on the console before printing results.
    sys.stdout.write("\r" + stdout.CLEAR)
    sys.stdout.flush()
    with chainer.using_config("train", False):
        if buckets_train is not None:
            printb("ppl (train)")
            ppl_train = compute_perplexity(model, buckets_train, args.batchsize)
            print(mean(ppl_train), ppl_train)
        if buckets_dev is not None:
            printb("ppl (dev)")
            ppl_dev = compute_perplexity(model, buckets_dev, args.batchsize)
            print(mean(ppl_dev), ppl_dev)
        if buckets_test is not None:
            printb("ppl (test)")
            ppl_test = compute_perplexity(model, buckets_test, args.batchsize)
            # BUG FIX: the original printed ppl_dev here (copy-paste error),
            # which mislabeled the test result and raised NameError whenever
            # a test set existed without a dev set.
            print(mean(ppl_test), ppl_test)
def main(args):
    """Train the seq2seq translation model.

    Reads the parallel corpora, buckets sentence pairs by length, samples a
    bucket each iteration proportionally to its size, and once per epoch
    shuffles buckets, checkpoints the model, and logs sample translations
    and dev WER.
    """
    source_dataset, target_dataset, vocab, vocab_inv = read_data_and_vocab(args.source_train, args.target_train, args.source_dev, args.target_dev, args.source_test, args.target_test, reverse_source=True)
    save_vocab(args.model_dir, vocab, vocab_inv)
    source_dataset_train, source_dataset_dev, source_dataset_test = source_dataset
    target_dataset_train, target_dataset_dev, target_dataset_test = target_dataset
    vocab_source, vocab_target = vocab
    vocab_inv_source, vocab_inv_target = vocab_inv
    # split into buckets
    source_buckets_train, target_buckets_train = make_buckets(source_dataset_train, target_dataset_train)
    if args.buckets_slice is not None:
        # Keep only the first buckets_slice+1 (shortest-sequence) buckets.
        source_buckets_train = source_buckets_train[:args.buckets_slice + 1]
        target_buckets_train = target_buckets_train[:args.buckets_slice + 1]
    # development dataset (buckets stay None when the split is empty)
    source_buckets_dev = None
    if len(source_dataset_dev) > 0:
        source_buckets_dev, target_buckets_dev = make_buckets(source_dataset_dev, target_dataset_dev)
        if args.buckets_slice is not None:
            source_buckets_dev = source_buckets_dev[:args.buckets_slice + 1]
            target_buckets_dev = target_buckets_dev[:args.buckets_slice + 1]
    # test dataset
    source_buckets_test = None
    if len(source_dataset_test) > 0:
        source_buckets_test, target_buckets_test = make_buckets(source_dataset_test, target_dataset_test)
        if args.buckets_slice is not None:
            source_buckets_test = source_buckets_test[:args.buckets_slice + 1]
            target_buckets_test = target_buckets_test[:args.buckets_slice + 1]
    # show log
    dump_dataset(source_dataset, vocab, (source_buckets_train, source_buckets_dev, source_buckets_test))
    # to maintain equilibrium
    # Iterations per bucket ~ bucket size; buckets are then sampled with
    # probability proportional to their iteration count.
    required_interations = []
    for data in source_buckets_train:
        itr = len(data) // args.batchsize + 1
        required_interations.append(itr)
    total_iterations = sum(required_interations)
    buckets_distribution = np.asarray(required_interations, dtype=float) / total_iterations
    # init (resume from a saved model if one exists in model_dir)
    model = load_model(args.model_dir)
    if model is None:
        model = seq2seq(len(vocab_source), len(vocab_target), args.ndim_embedding, args.ndim_h, args.num_layers, pooling=args.pooling, dropout=args.dropout, zoneout=args.zoneout, weightnorm=args.weightnorm, wgain=args.wgain, densely_connected=args.densely_connected, attention=args.attention)
    if args.gpu_device >= 0:
        cuda.get_device(args.gpu_device).use()
        model.to_gpu()
    # setup an optimizer
    optimizer = get_optimizer(args.optimizer, args.learning_rate, args.momentum)
    optimizer.setup(model)
    optimizer.add_hook(chainer.optimizer.GradientClipping(args.grad_clip))
    optimizer.add_hook(chainer.optimizer.WeightDecay(args.weight_decay))
    final_learning_rate = 1e-5  # lower bound for the per-epoch lr decay below
    total_time = 0
    # Pre-allocate one permutation per training bucket, reshuffled each epoch.
    indices_train = []
    for bucket_idx, bucket in enumerate(source_buckets_train):
        indices = np.arange(len(bucket))
        np.random.shuffle(indices)
        indices_train.append(indices)

    def mean(l):
        return sum(l) / len(l)

    # training
    for epoch in range(1, args.epoch + 1):
        print("Epoch", epoch)
        start_time = time.time()
        with chainer.using_config("train", True):
            for itr in range(total_iterations):
                # Pick a bucket at random, weighted by its iteration share.
                bucket_idx = int(np.random.choice(np.arange(len(source_buckets_train)), size=1, p=buckets_distribution))
                source_bucket = source_buckets_train[bucket_idx]
                target_bucket = target_buckets_train[bucket_idx]
                # sample minibatch (head of the bucket; the bucket is rolled below)
                source_batch = source_bucket[:args.batchsize]
                target_batch = target_bucket[:args.batchsize]
                skip_mask = source_batch != ID_PAD
                target_batch_input, target_batch_output = make_source_target_pair(target_batch)
                # to gpu
                if args.gpu_device >= 0:
                    skip_mask = cuda.to_gpu(skip_mask)
                    source_batch = cuda.to_gpu(source_batch)
                    target_batch_input = cuda.to_gpu(target_batch_input)
                    target_batch_output = cuda.to_gpu(target_batch_output)
                # compute loss
                model.reset_state()
                if args.attention:
                    # Attention decoding also needs the encoder's per-step outputs.
                    last_hidden_states, last_layer_outputs = model.encode(source_batch, skip_mask)
                    y_batch = model.decode(target_batch_input, last_hidden_states, last_layer_outputs, skip_mask)
                else:
                    last_hidden_states = model.encode(source_batch, skip_mask)
                    y_batch = model.decode(target_batch_input, last_hidden_states)
                loss = softmax_cross_entropy(y_batch, target_batch_output, ignore_label=ID_PAD)
                # update parameters (lossfun returns the precomputed loss)
                optimizer.update(lossfun=lambda: loss)
                # show log
                printr("iteration {}/{}".format(itr + 1, total_iterations))
                # Rotate the bucket so the next sample from it is a fresh batch.
                source_buckets_train[bucket_idx] = np.roll(source_bucket, -args.batchsize, axis=0)	# shift
                target_buckets_train[bucket_idx] = np.roll(target_bucket, -args.batchsize, axis=0)	# shift
        # shuffle (same permutation applied to source and target keeps pairs aligned)
        for bucket_idx in range(len(source_buckets_train)):
            indices = indices_train[bucket_idx]
            np.random.shuffle(indices)
            source_buckets_train[bucket_idx] = source_buckets_train[bucket_idx][indices]
            target_buckets_train[bucket_idx] = target_buckets_train[bucket_idx][indices]
        # serialize
        save_model(args.model_dir, model)
        # clear console
        printr("")
        # show log
        # NOTE(review): nesting below reconstructed from collapsed source —
        # sample translations are dumped every `interval` epochs; dev WER is
        # assumed to be reported every epoch. Confirm against the original.
        with chainer.using_config("train", False):
            if epoch % args.interval == 0:
                printb("translate (train)")
                dump_random_source_target_translation(model, source_buckets_train, target_buckets_train, vocab_inv_source, vocab_inv_target, num_translate=5, beam_width=1)
                if source_buckets_dev is not None:
                    printb("translate (dev)")
                    dump_random_source_target_translation(model, source_buckets_dev, target_buckets_dev, vocab_inv_source, vocab_inv_target, num_translate=5, beam_width=1)
            if source_buckets_dev is not None:
                printb("WER (dev)")
                wer_dev = compute_error_rate_buckets(model, source_buckets_dev, target_buckets_dev, len(vocab_inv_target), beam_width=1)
                print(mean(wer_dev), wer_dev)
        elapsed_time = (time.time() - start_time) / 60.
        total_time += elapsed_time
        print("done in {} min, lr = {:.4f}, total {} min".format(int(elapsed_time), get_current_learning_rate(optimizer), int(total_time)))
        # decay learning rate (never below final_learning_rate)
        decay_learning_rate(optimizer, args.lr_decay_factor, final_learning_rate)
def main(args):
    """Translate the train/dev/test source corpora with a saved model.

    Loads the vocab and model from args.model_dir, reads only the source
    side of each split (target files passed as None), buckets by length,
    and dumps beam-search translations for every non-empty split.
    """
    vocab, vocab_inv = load_vocab(args.model_dir)
    vocab_source, vocab_target = vocab
    vocab_inv_source, vocab_inv_target = vocab_inv

    # Source sequences are reversed, matching the training-time convention.
    source_dataset, target_dataset = read_data(
        vocab_source, vocab_target, args.source_train, None, args.source_dev,
        None, args.source_test, None, reverse_source=True)
    source_dataset_train, source_dataset_dev, source_dataset_test = source_dataset
    target_dataset_train, target_dataset_dev, target_dataset_test = target_dataset

    printb("data #")
    if len(source_dataset_train) > 0:
        print("train {}".format(len(source_dataset_train)))
    if len(source_dataset_dev) > 0:
        print("dev {}".format(len(source_dataset_dev)))
    if len(source_dataset_test) > 0:
        print("test {}".format(len(source_dataset_test)))

    def prepare_buckets(dataset, tag):
        # Bucket one split and report per-bucket counts; None when empty.
        # (Replaces the triplicated inline code for train/dev/test.)
        if len(dataset) == 0:
            return None
        printb("buckets #data ({})".format(tag))
        buckets = make_buckets(dataset)
        if args.buckets_slice is not None:
            buckets = buckets[:args.buckets_slice + 1]
        for size, data in zip(bucket_sizes, buckets):
            print("{} {}".format(size, len(data)))
        return buckets

    source_buckets_train = prepare_buckets(source_dataset_train, "train")
    source_buckets_dev = prepare_buckets(source_dataset_dev, "dev")
    source_buckets_test = prepare_buckets(source_dataset_test, "test")

    # init
    model = load_model(args.model_dir)
    assert model is not None
    if args.gpu_device >= 0:
        cuda.get_device(args.gpu_device).use()
        model.to_gpu()

    # Dump translations for each split that was loaded, in train/dev/test order.
    for buckets in (source_buckets_train, source_buckets_dev, source_buckets_test):
        if buckets is not None:
            dump_source_translation(
                model, buckets, vocab_inv_source, vocab_inv_target,
                beam_width=args.beam_width, normalization_alpha=args.alpha)