예제 #1
0
def dump_dataset(source_dataset, vocab, source_bucket):
    """Print a summary of the corpus: split sizes, vocabulary sizes, and
    the number of examples in each length bucket for every split.
    """
    train_data, dev_data, test_data = source_dataset
    src_vocab, tgt_vocab = vocab
    train_buckets, dev_buckets, test_buckets = source_bucket

    # split sizes (dev/test may be empty and are then omitted)
    printb("data	#")
    print("train	{}".format(len(train_data)))
    if len(dev_data) > 0:
        print("dev	{}".format(len(dev_data)))
    if len(test_data) > 0:
        print("test	{}".format(len(test_data)))

    # vocabulary sizes
    print("vocab	{}	(source)".format(len(src_vocab)))
    print("vocab	{}	(target)".format(len(tgt_vocab)))

    # per-bucket example counts; bucket_sizes is a module-level global
    printb("buckets 	#data	(train)")
    for bucket_size, bucket in zip(bucket_sizes, train_buckets):
        print("{} 	{}".format(bucket_size, len(bucket)))

    if dev_buckets:
        printb("buckets 	#data	(dev)")
        for bucket_size, bucket in zip(bucket_sizes, dev_buckets):
            print("{} 	{}".format(bucket_size, len(bucket)))

    if test_buckets:
        printb("buckets		#data	(test)")
        for bucket_size, bucket in zip(bucket_sizes, test_buckets):
            print("{} 	{}".format(bucket_size, len(bucket)))
예제 #2
0
def dump_dataset(dataset_train, dataset_dev, train_buckets, dev_buckets, vocab_size):
	"""Print a corpus summary: split sizes with content hashes, vocab size,
	and per-bucket example counts.

	Fix: callers pass dev_buckets=None when there is no dev set, and
	len(None) raises TypeError -- guard against None explicitly before
	taking its length.
	"""
	# hash(str(...)) fingerprints the data for quick reproducibility checks
	printb("data	#	hash")
	print("train	{}	{}".format(len(dataset_train), hash(str(dataset_train))))
	if len(dataset_dev) > 0:
		print("dev	{}	{}".format(len(dataset_dev), hash(str(dataset_dev))))
	print("vocab	{}".format(vocab_size))

	# per-bucket example counts; bucket_sizes is a module-level global
	printb("buckets	#data	(train)")
	for size, data in zip(bucket_sizes, train_buckets):
		print("{}	{}".format(size, len(data)))

	# dev_buckets is None when no dev data was provided
	if dev_buckets is not None and len(dev_buckets) > 0:
		printb("buckets	#data	(dev)")
		for size, data in zip(bucket_sizes, dev_buckets):
			print("{}	{}".format(size, len(data)))
예제 #3
0
def main(args):
    """Evaluate a trained seq2seq model: load vocab and parallel data,
    bucket each split by length, then report per-bucket word error rate
    (WER) for every non-empty split using beam search.
    """
    vocab, vocab_inv = load_vocab(args.model_dir)
    vocab_source, vocab_target = vocab
    vocab_inv_source, vocab_inv_target = vocab_inv

    # read_data returns (train, dev, test) triples for source and target.
    # NOTE(review): reverse_source=True presumably reverses source token
    # order (common seq2seq input trick) -- confirm in read_data.
    source_dataset, target_dataset = read_data(vocab_source,
                                               vocab_target,
                                               args.source_train,
                                               args.target_train,
                                               args.source_dev,
                                               args.target_dev,
                                               args.source_test,
                                               args.target_test,
                                               reverse_source=True)

    source_dataset_train, source_dataset_dev, source_dataset_test = source_dataset
    target_dataset_train, target_dataset_dev, target_dataset_test = target_dataset
    # dataset size summary (tab-separated columns)
    printb("data	#")
    if len(source_dataset_train) > 0:
        print("train	{}".format(len(source_dataset_train)))
    if len(source_dataset_dev) > 0:
        print("dev	{}".format(len(source_dataset_dev)))
    if len(source_dataset_test) > 0:
        print("test	{}".format(len(source_dataset_test)))

    print("vocab	{}	(source)".format(len(vocab_source)))
    print("vocab	{}	(target)".format(len(vocab_target)))

    # split into buckets
    # (each buckets variable stays None for an empty split; checked again
    # before evaluation below)
    source_buckets_train = None
    if len(source_dataset_train) > 0:
        printb("buckets 	#data	(train)")
        source_buckets_train, target_buckets_train = make_buckets(
            source_dataset_train, target_dataset_train)
        # optionally keep only the first (buckets_slice + 1) buckets
        if args.buckets_slice is not None:
            source_buckets_train = source_buckets_train[:args.buckets_slice +
                                                        1]
            target_buckets_train = target_buckets_train[:args.buckets_slice +
                                                        1]
        for size, data in zip(bucket_sizes, source_buckets_train):
            print("{} 	{}".format(size, len(data)))

    source_buckets_dev = None
    if len(source_dataset_dev) > 0:
        printb("buckets 	#data	(dev)")
        source_buckets_dev, target_buckets_dev = make_buckets(
            source_dataset_dev, target_dataset_dev)
        if args.buckets_slice is not None:
            source_buckets_dev = source_buckets_dev[:args.buckets_slice + 1]
            target_buckets_dev = target_buckets_dev[:args.buckets_slice + 1]
        for size, data in zip(bucket_sizes, source_buckets_dev):
            print("{} 	{}".format(size, len(data)))

    source_buckets_test = None
    if len(source_dataset_test) > 0:
        printb("buckets		#data	(test)")
        source_buckets_test, target_buckets_test = make_buckets(
            source_dataset_test, target_dataset_test)
        if args.buckets_slice is not None:
            source_buckets_test = source_buckets_test[:args.buckets_slice + 1]
            target_buckets_test = target_buckets_test[:args.buckets_slice + 1]
        for size, data in zip(bucket_sizes, source_buckets_test):
            print("{} 	{}".format(size, len(data)))

    # evaluation requires an already-trained model on disk
    model = load_model(args.model_dir)
    assert model is not None
    if args.gpu_device >= 0:
        cuda.get_device(args.gpu_device).use()
        model.to_gpu()

    def mean(l):
        """Arithmetic mean of a non-empty list."""
        return sum(l) / len(l)

    # inference mode while computing error rates
    # (chainer's "train" config flag disables train-time behavior)
    with chainer.using_config("train", False):
        if source_buckets_train is not None:
            printb("WER (train)")
            wer_train = compute_error_rate_buckets(model, source_buckets_train,
                                                   target_buckets_train,
                                                   len(vocab_target),
                                                   args.beam_width, args.alpha)
            print(mean(wer_train), wer_train)

        if source_buckets_dev is not None:
            printb("WER (dev)")
            wer_dev = compute_error_rate_buckets(model, source_buckets_dev,
                                                 target_buckets_dev,
                                                 len(vocab_target),
                                                 args.beam_width, args.alpha)
            print(mean(wer_dev), wer_dev)

        if source_buckets_test is not None:
            printb("WER (test)")
            wer_test = compute_error_rate_buckets(model, source_buckets_test,
                                                  target_buckets_test,
                                                  len(vocab_target),
                                                  args.beam_width, args.alpha)
            print(mean(wer_test), wer_test)
예제 #4
0
def main():
	"""Train an RNN language model: bucket the corpus by length, then run
	epochs of bucket-sampled minibatch training, saving the model and
	reporting dev perplexity after every epoch.

	Relies on module-level globals: args, ID_PAD, and the helpers below.
	"""
	# load textfile (builds the vocabulary from the training data)
	dataset_train, dataset_dev, _, vocab, vocab_inv = read_data(args.train_filename, args.dev_filename)
	vocab_size = len(vocab)

	# persist vocab so evaluation scripts can reload it
	save_vocab(args.model_dir, vocab, vocab_inv)

	# split into buckets
	train_buckets = make_buckets(dataset_train)

	# optionally keep only the first (buckets_slice + 1) buckets
	if args.buckets_slice is not None:
		train_buckets = train_buckets[:args.buckets_slice + 1]

	dev_buckets = None  # stays None when there is no dev set
	if len(dataset_dev) > 0:
		dev_buckets = make_buckets(dataset_dev)
		if args.buckets_slice is not None:
			dev_buckets = dev_buckets[:args.buckets_slice + 1]

	# print
	dump_dataset(dataset_train, dataset_dev, train_buckets, dev_buckets, vocab_size)

	# to maintain equilibrium
	# (sample buckets in proportion to the number of minibatches each
	# contains, so every example is seen roughly once per epoch)
	required_interations = []
	for data in train_buckets:
		itr = math.ceil(len(data) / args.batchsize)
		required_interations.append(itr)
	total_iterations = sum(required_interations)
	buckets_distribution = np.asarray(required_interations, dtype=float) / total_iterations

	# init (resume from a saved model if one exists, else build fresh)
	model = load_model(args.model_dir)
	if model is None:
		model = RNNModel(vocab_size, args.ndim_embedding, args.num_layers, ndim_h=args.ndim_h, kernel_size=args.kernel_size, pooling=args.pooling, zoneout=args.zoneout, dropout=args.dropout, weightnorm=args.weightnorm, wgain=args.wgain, densely_connected=args.densely_connected, ignore_label=ID_PAD)

	if args.gpu_device >= 0:
		chainer.cuda.get_device(args.gpu_device).use()
		model.to_gpu()

	# setup an optimizer
	optimizer = get_optimizer(args.optimizer, args.learning_rate, args.momentum)
	optimizer.setup(model)
	optimizer.add_hook(chainer.optimizer.GradientClipping(args.grad_clip))
	optimizer.add_hook(chainer.optimizer.WeightDecay(args.weight_decay))
	final_learning_rate = 1e-4  # lower bound for learning-rate decay
	total_time = 0

	def mean(l):
		"""Arithmetic mean of a non-empty list."""
		return sum(l) / len(l)

	# training
	for epoch in range(1, args.epoch + 1):
		print("Epoch", epoch)
		start_time = time.time()

		with chainer.using_config("train", True):
			for itr in range(total_iterations):
				# pick a bucket with probability proportional to its share of
				# minibatches, then draw a random minibatch from its front
				bucket_idx = int(np.random.choice(np.arange(len(train_buckets)), size=1, p=buckets_distribution))
				dataset = train_buckets[bucket_idx]
				np.random.shuffle(dataset)  # in-place shuffle of the whole bucket
				data_batch = dataset[:args.batchsize]

				source_batch, target_batch = make_source_target_pair(data_batch)

				if args.gpu_device >= 0:
					source_batch = cuda.to_gpu(source_batch)
					target_batch = cuda.to_gpu(target_batch)

				# update params
				model.reset_state()  # clear recurrent state between minibatches
				y_batch = model(source_batch)
				loss = F.softmax_cross_entropy(y_batch, target_batch, ignore_label=ID_PAD)
				# loss is precomputed; the lambda just hands it to the optimizer
				optimizer.update(lossfun=lambda: loss)

				# show log
				printr("iteration {}/{}".format(itr + 1, total_iterations))

		save_model(args.model_dir, model)

		# clear console
		printr("")

		# compute perplexity
		with chainer.using_config("train", False):
			if dev_buckets is not None:
				printb("	ppl (dev)")
				ppl_dev = compute_perplexity(model, dev_buckets, args.batchsize)
				print("	", mean(ppl_dev), ppl_dev)

		# show log
		elapsed_time = (time.time() - start_time) / 60.
		total_time += elapsed_time
		print("	done in {} min, lr = {}, total {} min".format(int(elapsed_time), get_current_learning_rate(optimizer), int(total_time)))

		# decay learning rate
		decay_learning_rate(optimizer, args.lr_decay_factor, final_learning_rate)
예제 #5
0
def main():
	"""Evaluate a trained language model: load the vocabulary and the
	train/dev/test text, bucket each split by length, then report
	per-bucket perplexity for every non-empty split.

	Relies on module-level globals: args, bucket_sizes, printb, stdout.
	"""
	# load textfile (vocab must already exist in the model directory)
	vocab, vocab_inv = load_vocab(args.model_dir)
	dataset_train, dataset_dev, dataset_test, _, _ = read_data(args.train_filename, args.dev_filename, args.test_filename, vocab=vocab)
	vocab_size = len(vocab)
	# dataset summary; hash(str(...)) fingerprints the data for
	# quick reproducibility checks
	printb("data	#	hash")
	print("train	{}	{}".format(len(dataset_train), hash(str(dataset_train))))
	if len(dataset_dev) > 0:
		print("dev	{}	{}".format(len(dataset_dev), hash(str(dataset_dev))))
	if len(dataset_test) > 0:
		print("test	{}	{}".format(len(dataset_test), hash(str(dataset_test))))
	print("vocab	{}".format(vocab_size))

	# split into buckets (each variable stays None for an empty split;
	# checked again before evaluation below)
	buckets_train = None
	if len(dataset_train) > 0:
		printb("buckets	#data	(train)")
		buckets_train = make_buckets(dataset_train)
		if args.buckets_slice is not None:
			buckets_train = buckets_train[:args.buckets_slice + 1]
		for size, data in zip(bucket_sizes, buckets_train):
			print("{}	{}".format(size, len(data)))

	buckets_dev = None
	if len(dataset_dev) > 0:
		printb("buckets	#data	(dev)")
		buckets_dev = make_buckets(dataset_dev)
		if args.buckets_slice is not None:
			buckets_dev = buckets_dev[:args.buckets_slice + 1]
		for size, data in zip(bucket_sizes, buckets_dev):
			print("{}	{}".format(size, len(data)))

	buckets_test = None
	if len(dataset_test) > 0:
		printb("buckets	#data	(test)")
		buckets_test = make_buckets(dataset_test)
		if args.buckets_slice is not None:
			buckets_test = buckets_test[:args.buckets_slice + 1]
		for size, data in zip(bucket_sizes, buckets_test):
			print("{}	{}".format(size, len(data)))

	# init (evaluation requires an already-trained model on disk)
	model = load_model(args.model_dir)
	assert model is not None
	if args.gpu_device >= 0:
		chainer.cuda.get_device(args.gpu_device).use()
		model.to_gpu()

	# show log
	def mean(l):
		"""Arithmetic mean of a non-empty list."""
		return sum(l) / len(l)

	sys.stdout.write("\r" + stdout.CLEAR)
	sys.stdout.flush()

	# inference mode while computing perplexities
	with chainer.using_config("train", False):
		if buckets_train is not None:
			printb("ppl (train)")
			ppl_train = compute_perplexity(model, buckets_train, args.batchsize)
			print(mean(ppl_train), ppl_train)

		if buckets_dev is not None:
			printb("ppl (dev)")
			ppl_dev = compute_perplexity(model, buckets_dev, args.batchsize)
			print(mean(ppl_dev), ppl_dev)

		if buckets_test is not None:
			printb("ppl (test)")
			ppl_test = compute_perplexity(model, buckets_test, args.batchsize)
			# BUG FIX: originally printed ppl_dev here (copy-paste error),
			# which reported the wrong split and raised NameError whenever
			# there was a test set but no dev set.
			print(mean(ppl_test), ppl_test)
예제 #6
0
def main(args):
    """Train a seq2seq translation model: read parallel data, bucket it by
    length, then run epochs of bucket-sampled minibatch training with
    periodic translation dumps and dev WER reporting.
    """
    # NOTE(review): reverse_source=True presumably reverses source token
    # order (common seq2seq input trick) -- confirm in read_data_and_vocab.
    source_dataset, target_dataset, vocab, vocab_inv = read_data_and_vocab(
        args.source_train,
        args.target_train,
        args.source_dev,
        args.target_dev,
        args.source_test,
        args.target_test,
        reverse_source=True)

    # persist vocab so evaluation scripts can reload it
    save_vocab(args.model_dir, vocab, vocab_inv)

    source_dataset_train, source_dataset_dev, source_dataset_test = source_dataset
    target_dataset_train, target_dataset_dev, target_dataset_test = target_dataset

    vocab_source, vocab_target = vocab
    vocab_inv_source, vocab_inv_target = vocab_inv

    # split into buckets
    source_buckets_train, target_buckets_train = make_buckets(
        source_dataset_train, target_dataset_train)
    # optionally keep only the first (buckets_slice + 1) buckets
    if args.buckets_slice is not None:
        source_buckets_train = source_buckets_train[:args.buckets_slice + 1]
        target_buckets_train = target_buckets_train[:args.buckets_slice + 1]

    # development dataset (buckets stay None when the split is empty)
    source_buckets_dev = None
    if len(source_dataset_dev) > 0:
        source_buckets_dev, target_buckets_dev = make_buckets(
            source_dataset_dev, target_dataset_dev)
        if args.buckets_slice is not None:
            source_buckets_dev = source_buckets_dev[:args.buckets_slice + 1]
            target_buckets_dev = target_buckets_dev[:args.buckets_slice + 1]

    # test dataset
    source_buckets_test = None
    if len(source_dataset_test) > 0:
        source_buckets_test, target_buckets_test = make_buckets(
            source_dataset_test, target_dataset_test)
        if args.buckets_slice is not None:
            source_buckets_test = source_buckets_test[:args.buckets_slice + 1]
            target_buckets_test = target_buckets_test[:args.buckets_slice + 1]

    # show log
    dump_dataset(
        source_dataset, vocab,
        (source_buckets_train, source_buckets_dev, source_buckets_test))

    # to maintain equilibrium
    # (sample buckets in proportion to the number of minibatches each holds,
    # so every example is seen roughly once per epoch)
    required_interations = []
    for data in source_buckets_train:
        # NOTE(review): len(data) // batchsize + 1 yields one extra
        # iteration when len(data) is an exact multiple of batchsize;
        # sibling code uses math.ceil -- confirm which is intended.
        itr = len(data) // args.batchsize + 1
        required_interations.append(itr)
    total_iterations = sum(required_interations)
    buckets_distribution = np.asarray(required_interations,
                                      dtype=float) / total_iterations

    # init (resume from a saved model if one exists, else build fresh)
    model = load_model(args.model_dir)
    if model is None:
        model = seq2seq(len(vocab_source),
                        len(vocab_target),
                        args.ndim_embedding,
                        args.ndim_h,
                        args.num_layers,
                        pooling=args.pooling,
                        dropout=args.dropout,
                        zoneout=args.zoneout,
                        weightnorm=args.weightnorm,
                        wgain=args.wgain,
                        densely_connected=args.densely_connected,
                        attention=args.attention)

    if args.gpu_device >= 0:
        cuda.get_device(args.gpu_device).use()
        model.to_gpu()

    # setup an optimizer
    optimizer = get_optimizer(args.optimizer, args.learning_rate,
                              args.momentum)
    optimizer.setup(model)
    optimizer.add_hook(chainer.optimizer.GradientClipping(args.grad_clip))
    optimizer.add_hook(chainer.optimizer.WeightDecay(args.weight_decay))
    final_learning_rate = 1e-5  # lower bound for learning-rate decay
    total_time = 0

    # per-bucket index permutations, reshuffled and applied after each epoch
    indices_train = []
    for bucket_idx, bucket in enumerate(source_buckets_train):
        indices = np.arange(len(bucket))
        np.random.shuffle(indices)
        indices_train.append(indices)

    def mean(l):
        """Arithmetic mean of a non-empty list."""
        return sum(l) / len(l)

    # training
    for epoch in range(1, args.epoch + 1):
        print("Epoch", epoch)
        start_time = time.time()

        with chainer.using_config("train", True):

            for itr in range(total_iterations):
                # pick a bucket with probability proportional to its share
                # of minibatches
                bucket_idx = int(
                    np.random.choice(np.arange(len(source_buckets_train)),
                                     size=1,
                                     p=buckets_distribution))
                source_bucket = source_buckets_train[bucket_idx]
                target_bucket = target_buckets_train[bucket_idx]

                # sample minibatch from the front of the bucket; np.roll
                # below advances the bucket for the next draw
                source_batch = source_bucket[:args.batchsize]
                target_batch = target_bucket[:args.batchsize]
                skip_mask = source_batch != ID_PAD  # True where not padding
                # NOTE(review): presumably the shifted-by-one teacher-forcing
                # pair (decoder input / expected output) -- confirm in
                # make_source_target_pair.
                target_batch_input, target_batch_output = make_source_target_pair(
                    target_batch)

                # to gpu
                if args.gpu_device >= 0:
                    skip_mask = cuda.to_gpu(skip_mask)
                    source_batch = cuda.to_gpu(source_batch)
                    target_batch_input = cuda.to_gpu(target_batch_input)
                    target_batch_output = cuda.to_gpu(target_batch_output)

                # compute loss (attention models also pass the encoder's
                # per-step outputs to the decoder)
                model.reset_state()
                if args.attention:
                    last_hidden_states, last_layer_outputs = model.encode(
                        source_batch, skip_mask)
                    y_batch = model.decode(target_batch_input,
                                           last_hidden_states,
                                           last_layer_outputs, skip_mask)
                else:
                    last_hidden_states = model.encode(source_batch, skip_mask)
                    y_batch = model.decode(target_batch_input,
                                           last_hidden_states)
                loss = softmax_cross_entropy(y_batch,
                                             target_batch_output,
                                             ignore_label=ID_PAD)

                # update parameters
                # (loss is precomputed; the lambda just hands it over)
                optimizer.update(lossfun=lambda: loss)

                # show log
                printr("iteration {}/{}".format(itr + 1, total_iterations))

                source_buckets_train[bucket_idx] = np.roll(source_bucket,
                                                           -args.batchsize,
                                                           axis=0)  # shift
                target_buckets_train[bucket_idx] = np.roll(target_bucket,
                                                           -args.batchsize,
                                                           axis=0)  # shift

            # shuffle
            # (same permutation applied to source and target keeps the
            # parallel sentences aligned)
            for bucket_idx in range(len(source_buckets_train)):
                indices = indices_train[bucket_idx]
                np.random.shuffle(indices)
                source_buckets_train[bucket_idx] = source_buckets_train[
                    bucket_idx][indices]
                target_buckets_train[bucket_idx] = target_buckets_train[
                    bucket_idx][indices]

        # serialize
        save_model(args.model_dir, model)

        # clear console
        printr("")

        # show log (sample translations + dev WER every args.interval epochs)
        with chainer.using_config("train", False):
            if epoch % args.interval == 0:
                printb("translate (train)")
                dump_random_source_target_translation(model,
                                                      source_buckets_train,
                                                      target_buckets_train,
                                                      vocab_inv_source,
                                                      vocab_inv_target,
                                                      num_translate=5,
                                                      beam_width=1)

                if source_buckets_dev is not None:
                    printb("translate (dev)")
                    dump_random_source_target_translation(model,
                                                          source_buckets_dev,
                                                          target_buckets_dev,
                                                          vocab_inv_source,
                                                          vocab_inv_target,
                                                          num_translate=5,
                                                          beam_width=1)

                if source_buckets_dev is not None:
                    printb("WER (dev)")
                    wer_dev = compute_error_rate_buckets(model,
                                                         source_buckets_dev,
                                                         target_buckets_dev,
                                                         len(vocab_inv_target),
                                                         beam_width=1)
                    print(mean(wer_dev), wer_dev)

        elapsed_time = (time.time() - start_time) / 60.
        total_time += elapsed_time
        print("done in {} min, lr = {:.4f}, total {} min".format(
            int(elapsed_time), get_current_learning_rate(optimizer),
            int(total_time)))

        # decay learning rate
        decay_learning_rate(optimizer, args.lr_decay_factor,
                            final_learning_rate)
예제 #7
0
def main(args):
    """Load a trained seq2seq model and dump beam-search translations of
    the source-side train/dev/test sentences (no target references used).
    """
    vocab, vocab_inv = load_vocab(args.model_dir)
    vocab_source, vocab_target = vocab
    vocab_inv_source, vocab_inv_target = vocab_inv

    # target files are passed as None: only source sentences are read.
    # NOTE(review): reverse_source=True presumably reverses source token
    # order (common seq2seq input trick) -- confirm in read_data.
    source_dataset, target_dataset = read_data(vocab_source,
                                               vocab_target,
                                               args.source_train,
                                               None,
                                               args.source_dev,
                                               None,
                                               args.source_test,
                                               None,
                                               reverse_source=True)

    source_dataset_train, source_dataset_dev, source_dataset_test = source_dataset
    target_dataset_train, target_dataset_dev, target_dataset_test = target_dataset
    # dataset size summary (tab-separated)
    printb("data	#")
    if len(source_dataset_train) > 0:
        print("train	{}".format(len(source_dataset_train)))
    if len(source_dataset_dev) > 0:
        print("dev	{}".format(len(source_dataset_dev)))
    if len(source_dataset_test) > 0:
        print("test	{}".format(len(source_dataset_test)))

    # split into buckets (each buckets variable stays None for an empty
    # split; checked again before translation below)
    source_buckets_train = None
    if len(source_dataset_train) > 0:
        printb("buckets 	#data	(train)")
        source_buckets_train = make_buckets(source_dataset_train)
        # optionally keep only the first (buckets_slice + 1) buckets
        if args.buckets_slice is not None:
            source_buckets_train = source_buckets_train[:args.buckets_slice +
                                                        1]
        for size, data in zip(bucket_sizes, source_buckets_train):
            print("{} 	{}".format(size, len(data)))

    source_buckets_dev = None
    if len(source_dataset_dev) > 0:
        printb("buckets 	#data	(dev)")
        source_buckets_dev = make_buckets(source_dataset_dev)
        if args.buckets_slice is not None:
            source_buckets_dev = source_buckets_dev[:args.buckets_slice + 1]
        for size, data in zip(bucket_sizes, source_buckets_dev):
            print("{} 	{}".format(size, len(data)))

    source_buckets_test = None
    if len(source_dataset_test) > 0:
        printb("buckets		#data	(test)")
        source_buckets_test = make_buckets(source_dataset_test)
        if args.buckets_slice is not None:
            source_buckets_test = source_buckets_test[:args.buckets_slice + 1]
        for size, data in zip(bucket_sizes, source_buckets_test):
            print("{} 	{}".format(size, len(data)))

    # init (translation requires an already-trained model on disk)
    model = load_model(args.model_dir)
    assert model is not None
    if args.gpu_device >= 0:
        cuda.get_device(args.gpu_device).use()
        model.to_gpu()

    # dump translations for each available split; alpha is the beam-search
    # length-normalization coefficient
    if source_buckets_train is not None:
        dump_source_translation(model,
                                source_buckets_train,
                                vocab_inv_source,
                                vocab_inv_target,
                                beam_width=args.beam_width,
                                normalization_alpha=args.alpha)

    if source_buckets_dev is not None:
        dump_source_translation(model,
                                source_buckets_dev,
                                vocab_inv_source,
                                vocab_inv_target,
                                beam_width=args.beam_width,
                                normalization_alpha=args.alpha)

    if source_buckets_test is not None:
        dump_source_translation(model,
                                source_buckets_test,
                                vocab_inv_source,
                                vocab_inv_target,
                                beam_width=args.beam_width,
                                normalization_alpha=args.alpha)