Example #1
0
def dump_random_source_target_translation(model,
                                          source_buckets,
                                          target_buckets,
                                          vocab_inv_source,
                                          vocab_inv_target,
                                          num_translate=3,
                                          beam_width=8):
    """Sample a few sequence pairs from each bucket and dump translations.

    Decodes the whole sampled minibatch greedily when beam_width == 1;
    otherwise runs beam search on one sequence at a time.
    """
    xp = model.xp
    for src_bucket, tgt_bucket in zip(source_buckets, target_buckets):
        src_batch, tgt_batch = sample_batch_from_bucket(
            src_bucket, tgt_bucket, num_translate)

        if beam_width == 1:
            # greedy: translate the entire minibatch in one call,
            # allowing hypotheses up to twice the reference length
            decoded_batch = translate_greedy(model, src_batch,
                                             tgt_batch.shape[1] * 2,
                                             len(vocab_inv_target),
                                             beam_width)
            for src, hypothesis, reference in zip(src_batch, decoded_batch,
                                                  tgt_batch):
                dump_translation(vocab_inv_source, vocab_inv_target, src,
                                 hypothesis, reference)
        else:
            # beam search: sequences are decoded individually
            for src, reference in zip(src_batch, tgt_batch):
                hypothesis = translate_beam_search(model, src,
                                                   reference.size * 2,
                                                   len(vocab_inv_target),
                                                   beam_width)
                dump_translation(vocab_inv_source, vocab_inv_target, src,
                                 hypothesis, reference)
Example #2
0
def compute_random_mean_wer(model, source_buckets, target_buckets, target_vocab_size, sample_size=100, argmax=True):
	"""Return a list of mean word error rates (in percent), one per bucket.

	For every (source, target) bucket pair, draws a random minibatch of
	sample_size sequence pairs and averages the WER over it.

	FIX: dropped the unused local ``xp = model.xp`` -- nothing in the
	body referenced it.
	"""
	result = []
	for source_bucket, target_bucket in zip(source_buckets, target_buckets):
		# sample minibatch
		source_batch, target_batch = sample_batch_from_bucket(source_bucket, target_bucket, sample_size)

		# compute WER, scaled to a percentage
		mean_wer = _compute_batch_wer_mean(model, source_batch, target_batch, target_vocab_size, argmax=argmax)

		result.append(mean_wer * 100)

	return result
Example #3
0
def show_random_source_target_translation(model,
                                          source_buckets,
                                          target_buckets,
                                          vocab_inv_source,
                                          vocab_inv_target,
                                          num_translate=100,
                                          argmax=True):
    """Translate a random sample from every bucket pair and print results."""
    xp = model.xp
    for src_bucket, tgt_bucket in zip(source_buckets, target_buckets):
        # draw a random minibatch from this bucket pair
        source_batch, target_batch = sample_batch_from_bucket(
            src_bucket, tgt_bucket, num_translate)
        # cap generated length at twice the reference length
        max_length = target_batch.shape[1] * 2
        translation_batch = _translate_batch(model,
                                             source_batch,
                                             max_length,
                                             vocab_inv_source,
                                             vocab_inv_target,
                                             argmax=argmax)
        show_translate_results(vocab_inv_source, vocab_inv_target,
                               source_batch, translation_batch, target_batch)
Example #4
0
def compute_random_error_rate_buckets(model,
                                      source_buckets,
                                      target_buckets,
                                      target_vocab_size,
                                      sample_size=100,
                                      beam_width=8,
                                      normalization_alpha=0):
    """Return per-bucket mean error rates (in percent) on a random sample.

    With beam_width == 1 the whole sampled batch is scored at once
    (greedy decoding); otherwise each sequence is decoded individually
    with beam search while a progress line is written to stdout.

    FIX: the beam-search branch looped over ``range(sample_size)`` but
    divided the summed WER by ``len(source_batch)``.  If the sampler
    returns fewer than sample_size sequences this indexed past the end
    of the batch, and the sum/divisor counts disagreed.  Both now use
    the actual batch length.  Also removed the unused ``xp`` local.
    """
    result = []
    for bucket_index, (source_bucket, target_bucket) in enumerate(
            zip(source_buckets, target_buckets)):
        source_batch, target_batch = sample_batch_from_bucket(
            source_bucket, target_bucket, sample_size)

        if beam_width == 1:  # greedy
            mean_wer = compute_error_rate_source_batch(model, source_batch,
                                                       target_batch,
                                                       target_vocab_size)

        else:  # beam search
            num_sequences = len(source_batch)
            sum_wer = 0
            for index in range(num_sequences):
                sys.stdout.write(
                    "\rcomputing WER ... bucket {}/{} (sequence {}/{})".format(
                        bucket_index + 1, len(source_buckets), index + 1,
                        num_sequences))
                sys.stdout.flush()
                source = source_batch[index]
                target = target_batch[index]
                wer = compute_error_rate_source_sequence(
                    model, source, target, target_vocab_size, beam_width,
                    normalization_alpha)
                sum_wer += wer
            mean_wer = sum_wer / num_sequences

            sys.stdout.write("\r" + stdout.CLEAR)
            sys.stdout.flush()

        result.append(mean_wer * 100)

    return result
Example #5
0
def main(args):
    """Train an RNN language model over length-bucketed sequences.

    Loads the corpus, splits it into train/dev/test buckets, then runs
    epochs of minibatch updates (oversampling small buckets so each
    bucket contributes roughly evenly), logs accuracy and perplexity,
    saves the model periodically, and halves the learning rate whenever
    dev perplexity stops improving.

    NOTE(review): uses ``xrange`` -- this snippet targets Python 2.
    """
    # load textfile
    train_dataset, dev_dataset, test_dataset, vocab, vocab_inv = read_data(
        args.text_filename,
        train_split_ratio=args.train_split,
        dev_split_ratio=args.dev_split,
        seed=args.seed)
    save_vocab(args.model_dir, vocab, vocab_inv)
    vocab_size = len(vocab)
    print_bold("data	#	hash")
    print("train	{}	{}".format(len(train_dataset), hash(str(train_dataset))))
    print("dev	{}	{}".format(len(dev_dataset), hash(str(dev_dataset))))
    print("test	{}	{}".format(len(test_dataset), hash(str(test_dataset))))
    print("vocab	{}".format(vocab_size))

    # split into buckets
    train_buckets = make_buckets(train_dataset)

    print_bold("buckets	#data	(train)")
    if args.buckets_limit is not None:
        # keep only the first buckets_limit+1 buckets
        train_buckets = train_buckets[:args.buckets_limit + 1]
    for size, data in zip(bucket_sizes, train_buckets):
        print("{}	{}".format(size, len(data)))

    print_bold("buckets	#data	(dev)")
    dev_buckets = make_buckets(dev_dataset)
    if args.buckets_limit is not None:
        dev_buckets = dev_buckets[:args.buckets_limit + 1]
    for size, data in zip(bucket_sizes, dev_buckets):
        print("{}	{}".format(size, len(data)))

    print_bold("buckets	#data	(test)")
    test_buckets = make_buckets(test_dataset)
    for size, data in zip(bucket_sizes, test_buckets):
        print("{}	{}".format(size, len(data)))

    # to maintain equilibrium
    # (small buckets are visited `repeat` times per iteration so every
    # bucket contributes a comparable number of updates)
    min_num_data = 0
    for data in train_buckets:
        if min_num_data == 0 or len(data) < min_num_data:
            min_num_data = len(data)
    repeats = []
    for data in train_buckets:
        repeat = len(data) // min_num_data
        # never drop a bucket entirely: visit it at least once
        repeat = repeat + 1 if repeat == 0 else repeat
        repeats.append(repeat)

    # one "iteration" performs repeat * batchsize updates per bucket
    num_updates_per_iteration = 0
    for repeat, data in zip(repeats, train_buckets):
        num_updates_per_iteration += repeat * args.batchsize
    num_iteration = len(train_dataset) // num_updates_per_iteration + 1

    # init: resume from a saved model if one exists, else build fresh
    model = load_model(args.model_dir)
    if model is None:
        model = RNNModel(vocab_size,
                         args.ndim_embedding,
                         args.num_layers,
                         ndim_h=args.ndim_h,
                         kernel_size=args.kernel_size,
                         pooling=args.pooling,
                         zoneout=args.zoneout,
                         dropout=args.dropout,
                         wgain=args.wgain,
                         densely_connected=args.densely_connected,
                         ignore_label=ID_PAD)
    if args.gpu_device >= 0:
        chainer.cuda.get_device(args.gpu_device).use()
        model.to_gpu()

    # setup an optimizer (Eve or Adam) with gradient clipping + weight decay
    if args.eve:
        optimizer = Eve(alpha=args.learning_rate, beta1=0.9)
    else:
        optimizer = optimizers.Adam(alpha=args.learning_rate, beta1=0.9)
    optimizer.setup(model)
    optimizer.add_hook(chainer.optimizer.GradientClipping(args.grad_clip))
    optimizer.add_hook(chainer.optimizer.WeightDecay(args.weight_decay))
    min_learning_rate = 1e-7
    prev_ppl = None
    total_time = 0

    def mean(l):
        # arithmetic mean of a non-empty list
        return sum(l) / len(l)

    # training
    for epoch in xrange(1, args.epoch + 1):
        print("Epoch", epoch)
        start_time = time.time()
        for itr in xrange(1, num_iteration + 1):
            sys.stdout.write("\r{} / {}".format(itr, num_iteration))
            sys.stdout.flush()

            for repeat, dataset in zip(repeats, train_buckets):
                for r in xrange(repeat):
                    batch = sample_batch_from_bucket(dataset, args.batchsize)
                    source, target = make_source_target_pair(batch)
                    if model.xp is cuda.cupy:
                        source = cuda.to_gpu(source)
                        target = cuda.to_gpu(target)
                    model.reset_state()
                    Y = model(source)
                    loss = softmax_cross_entropy(Y,
                                                 target,
                                                 ignore_label=ID_PAD)
                    # loss is already computed; the lambda just hands it
                    # to optimizer.update
                    optimizer.update(lossfun=lambda: loss)

            # checkpoint periodically and at the end of the epoch
            if itr % args.interval == 0 or itr == num_iteration:
                save_model(args.model_dir, model)

        # show log
        sys.stdout.write("\r" + stdout.CLEAR)
        sys.stdout.flush()
        print_bold("	accuracy (sampled train)")
        acc_train = compute_random_accuracy(model, train_buckets,
                                            args.batchsize)
        print("	", mean(acc_train), acc_train)
        print_bold("	accuracy (dev)")
        acc_dev = compute_accuracy(model, dev_buckets, args.batchsize)
        print("	", mean(acc_dev), acc_dev)
        print_bold("	ppl (sampled train)")
        ppl_train = compute_random_perplexity(model, train_buckets,
                                              args.batchsize)
        print("	", mean(ppl_train), ppl_train)
        print_bold("	ppl (dev)")
        ppl_dev = compute_perplexity(model, dev_buckets, args.batchsize)
        ppl_dev_mean = mean(ppl_dev)
        print("	", ppl_dev_mean, ppl_dev)
        elapsed_time = (time.time() - start_time) / 60.
        total_time += elapsed_time
        print("	done in {} min, lr = {}, total {} min".format(
            int(elapsed_time), optimizer.alpha, int(total_time)))

        # decay learning rate: halve it when dev ppl fails to improve,
        # but never below min_learning_rate
        if prev_ppl is not None and ppl_dev_mean >= prev_ppl and optimizer.alpha > min_learning_rate:
            optimizer.alpha *= 0.5
        prev_ppl = ppl_dev_mean
Example #6
0
def compute_random_perplexity(model, buckets, batchsize=100):
    """Perplexity of one randomly sampled minibatch from each bucket."""
    return [
        compute_perplexity_batch(
            model, sample_batch_from_bucket(dataset, batchsize))
        for dataset in buckets
    ]
Example #7
0
def compute_random_accuracy(model, buckets, batchsize=100):
    """Accuracy on one randomly sampled minibatch from each bucket."""
    return [
        compute_accuracy_batch(
            model, sample_batch_from_bucket(dataset, batchsize))
        for dataset in buckets
    ]
Example #8
0
def main(args):
	"""Train a seq2seq translation model over length-bucketed pairs.

	Loads parallel source/target data, splits it into train/dev/test
	buckets, then runs epochs of encoder/decoder updates (oversampling
	small buckets), shows sample translations, and halves the learning
	rate whenever dev WER stops improving.

	NOTE(review): uses ``xrange`` -- this snippet targets Python 2.
	"""
	# load textfile
	source_dataset, target_dataset, vocab, vocab_inv = read_data(args.source_filename, args.target_filename, train_split_ratio=args.train_split, dev_split_ratio=args.dev_split, seed=args.seed)
	save_vocab(args.model_dir, vocab, vocab_inv)

	source_dataset_train, source_dataset_dev, source_dataset_test = source_dataset
	target_dataset_train, target_dataset_dev, target_dataset_test = target_dataset
	print_bold("data	#")
	print("train	{}".format(len(source_dataset_train)))
	print("dev	{}".format(len(source_dataset_dev)))
	print("test	{}".format(len(source_dataset_test)))

	vocab_source, vocab_target = vocab
	vocab_inv_source, vocab_inv_target = vocab_inv
	print("vocab	{}	(source)".format(len(vocab_source)))
	print("vocab	{}	(target)".format(len(vocab_target)))

	# split into buckets; optionally keep only the first buckets_limit+1
	source_buckets_train, target_buckets_train = make_buckets(source_dataset_train, target_dataset_train)
	if args.buckets_limit is not None:
		source_buckets_train = source_buckets_train[:args.buckets_limit+1]
		target_buckets_train = target_buckets_train[:args.buckets_limit+1]

	print_bold("buckets 	#data	(train)")
	for size, data in zip(bucket_sizes, source_buckets_train):
		print("{} 	{}".format(size, len(data)))

	print_bold("buckets 	#data	(dev)")
	source_buckets_dev, target_buckets_dev = make_buckets(source_dataset_dev, target_dataset_dev)
	if args.buckets_limit is not None:
		source_buckets_dev = source_buckets_dev[:args.buckets_limit+1]
		target_buckets_dev = target_buckets_dev[:args.buckets_limit+1]
	for size, data in zip(bucket_sizes, source_buckets_dev):
		print("{} 	{}".format(size, len(data)))

	print_bold("buckets		#data	(test)")
	source_buckets_test, target_buckets_test = make_buckets(source_dataset_test, target_dataset_test)
	if args.buckets_limit is not None:
		source_buckets_test = source_buckets_test[:args.buckets_limit+1]
		target_buckets_test = target_buckets_test[:args.buckets_limit+1]
	for size, data in zip(bucket_sizes, source_buckets_test):
		print("{} 	{}".format(size, len(data)))

	# to maintain equilibrium
	# (small buckets are visited more often so every bucket contributes
	# a comparable number of updates per iteration)
	min_num_data = 0
	for data in source_buckets_train:
		if min_num_data == 0 or len(data) < min_num_data:
			min_num_data = len(data)
	repeats = []
	for data in source_buckets_train:
		# +1 ensures every bucket is visited at least once
		repeats.append(len(data) // min_num_data + 1)

	# one "iteration" performs repeat * batchsize updates per bucket
	num_updates_per_iteration = 0
	for repeat, data in zip(repeats, source_buckets_train):
		num_updates_per_iteration += repeat * args.batchsize
	num_iteration = len(source_dataset_train) // num_updates_per_iteration + 1

	# init: resume from a saved model if one exists, else build fresh
	model = load_model(args.model_dir)
	if model is None:
		model = seq2seq(len(vocab_source), len(vocab_target), args.ndim_embedding, args.num_layers, ndim_h=args.ndim_h, pooling=args.pooling, dropout=args.dropout, zoneout=args.zoneout, wgain=args.wgain, densely_connected=args.densely_connected, attention=args.attention)
	if args.gpu_device >= 0:
		cuda.get_device(args.gpu_device).use()
		model.to_gpu()

	# setup an optimizer (Eve or Adam) with gradient clipping + weight decay
	if args.eve:
		optimizer = Eve(alpha=args.learning_rate, beta1=0.9)
	else:
		optimizer = optimizers.Adam(alpha=args.learning_rate, beta1=0.9)
	optimizer.setup(model)
	optimizer.add_hook(chainer.optimizer.GradientClipping(args.grad_clip))
	optimizer.add_hook(chainer.optimizer.WeightDecay(args.weight_decay))
	min_learning_rate = 1e-7
	prev_wer = None
	total_time = 0

	def mean(l):
		# arithmetic mean of a non-empty list
		return sum(l) / len(l)

	# training
	for epoch in xrange(1, args.epoch + 1):
		print("Epoch", epoch)
		start_time = time.time()
		for itr in xrange(1, num_iteration + 1):
			for repeat, source_bucket, target_bucket in zip(repeats, source_buckets_train, target_buckets_train):
				for r in xrange(repeat):
					# sample minibatch
					source_batch, target_batch = sample_batch_from_bucket(source_bucket, target_bucket, args.batchsize)
					# mask out padding positions in the source
					skip_mask = source_batch != ID_PAD
					target_batch_input, target_batch_output = make_source_target_pair(target_batch)

					# to gpu
					if model.xp is cuda.cupy:
						skip_mask = cuda.to_gpu(skip_mask)
						source_batch = cuda.to_gpu(source_batch)
						target_batch_input = cuda.to_gpu(target_batch_input)
						target_batch_output = cuda.to_gpu(target_batch_output)

					# compute loss (attention models also feed encoder
					# outputs and the skip mask to the decoder)
					model.reset_state()
					if args.attention:
						last_hidden_states, last_layer_outputs = model.encode(source_batch, skip_mask)
						Y = model.decode(target_batch_input, last_hidden_states, last_layer_outputs, skip_mask)
					else:
						last_hidden_states = model.encode(source_batch, skip_mask)
						Y = model.decode(target_batch_input, last_hidden_states)
					loss = softmax_cross_entropy(Y, target_batch_output, ignore_label=ID_PAD)
					# loss is already computed; the lambda just hands it
					# to optimizer.update
					optimizer.update(lossfun=lambda: loss)

				sys.stdout.write("\r{} / {}".format(itr, num_iteration))
				sys.stdout.flush()

			# checkpoint periodically and at the end of the epoch
			if itr % args.interval == 0 or itr == num_iteration:
				save_model(args.model_dir, model)

		# show log
		sys.stdout.write("\r" + stdout.CLEAR)
		sys.stdout.flush()
		print_bold("translate (train)")
		show_random_source_target_translation(model, source_buckets_train, target_buckets_train, vocab_inv_source, vocab_inv_target, num_translate=5, argmax=True)
		print_bold("translate (dev)")
		show_random_source_target_translation(model, source_buckets_dev, target_buckets_dev, vocab_inv_source, vocab_inv_target, num_translate=5, argmax=True)
		print_bold("WER (sampled train)")
		wer_train = compute_random_mean_wer(model, source_buckets_train, target_buckets_train, len(vocab_inv_target), sample_size=args.batchsize, argmax=True)
		print(mean(wer_train), wer_train)
		print_bold("WER (dev)")
		wer_dev = compute_mean_wer(model, source_buckets_dev, target_buckets_dev, len(vocab_inv_target), batchsize=args.batchsize, argmax=True)
		mean_wer_dev = mean(wer_dev)
		print(mean_wer_dev, wer_dev)
		elapsed_time = (time.time() - start_time) / 60.
		total_time += elapsed_time
		print("done in {} min, lr = {}, total {} min".format(int(elapsed_time), optimizer.alpha, int(total_time)))

		# decay learning rate: halve it when dev WER fails to improve,
		# but never below min_learning_rate
		if prev_wer is not None and mean_wer_dev >= prev_wer and optimizer.alpha > min_learning_rate:
			optimizer.alpha *= 0.5
		prev_wer = mean_wer_dev