def init_model(self, fn_model_path, ln_model_path, fn_vocab_path, ln_vocab_path):
    self.fn_tokenizer = load_vocab(fn_vocab_path)
    self.ln_tokenizer = load_vocab(ln_vocab_path)
    self.fn_model = get_model(len(self.fn_tokenizer.word_index) + 1)
    self.ln_model = get_model(len(self.ln_tokenizer.word_index) + 1)
    load_model(self.fn_model, fn_model_path)
    load_model(self.ln_model, ln_model_path)
    self.fn_model.eval()
    self.ln_model.eval()
def main(args):
    # load vocab
    vocab, vocab_inv = load_vocab(args.model_dir)
    vocab_source, vocab_target = vocab
    vocab_inv_source, vocab_inv_target = vocab_inv

    # load textfile
    source_dataset = read_data(args.source_filename, vocab_source)
    print_bold("data #")
    print("source {}".format(len(source_dataset)))

    # split into buckets
    source_buckets = make_buckets(source_dataset)
    if args.buckets_limit is not None:
        source_buckets = source_buckets[:args.buckets_limit + 1]
    print_bold("buckets #data (train)")
    for size, data in zip(bucket_sizes, source_buckets):
        print("{} {}".format(size, len(data)))
    print_bold("buckets #data (dev)")

    # init
    model = load_model(args.model_dir)
    assert model is not None

    show_source_translation(model, source_buckets, vocab_inv_source, vocab_inv_target)
def main(args):
    model = load_model(args.model_dir)
    assert model is not None

    vocab, vocab_inv = load_vocab(args.model_dir)
    assert vocab is not None
    assert vocab_inv is not None

    vocab_size = model.vocab_size

    # np.random.seed(0)  # debug

    for n in xrange(args.num_generate):
        word_ids = np.arange(0, vocab_size, dtype=np.int32)
        token = ID_BOS
        x = np.asarray([[token]]).astype(np.int32)
        model.reset_state()
        # sample one token at a time until EOS or the length limit is reached
        while token != ID_EOS and x.shape[1] < args.max_sentence_length:
            u = model.forward_one_step(x, test=True)
            p = F.softmax(u).data[-1]
            token = np.random.choice(word_ids, size=1, p=p)
            x = np.append(x, np.asarray([token]).astype(np.int32), axis=1)
        # map the sampled ids back to words and print the sentence
        sentence = []
        for token in x[0]:
            word = vocab_inv[token]
            sentence.append(word)
        print(" ".join(sentence))
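# The generation loop above draws the next token id from the model's softmax
# output with np.random.choice. The self-contained snippet below illustrates
# just that sampling step with a toy distribution; the probabilities are made
# up for illustration and are not produced by the original model.
import numpy as np

vocab_size = 5
word_ids = np.arange(0, vocab_size, dtype=np.int32)
p = np.array([0.1, 0.4, 0.2, 0.2, 0.1])        # toy softmax output, sums to 1
token = np.random.choice(word_ids, size=1, p=p)
print(token)                                   # e.g. [1]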
def main():
    vocab_x = load_vocab("vocab_x")
    vocab_y = load_vocab("vocab_y")
    x, y = corpus.read_conll_data(vocab_x, vocab_y, "test.txt", False)
    # x, y = corpus.read_conll_data(vocab_x, vocab_y, "train.txt", False)

    # load the trained tagger weights given on the command line
    tagger = BiLSTMTagger(vocab_x.size(), vocab_y.size())
    serializers.load_npz(sys.argv[1], tagger)

    for k, (word_ids, tags) in enumerate(zip(x, y)):
        vars_x = [Variable(np.array([wid], dtype=np.int32)) for wid in word_ids]
        vars_y = [Variable(np.array([tid], dtype=np.int32)) for tid in tags]
        ys = tagger(vars_x)
        for i in range(len(ys)):
            if i >= len(word_ids) or i >= len(tags):
                sys.stderr.write('i: {} {} {}\n'.format(i, len(word_ids), len(tags)))
                continue
            word = vocab_x.itos[word_ids[i]]
            if tags[i] >= vocab_y.size():
                sys.stderr.write('{}\n'.format(tags[i]))
                continue
            tag = vocab_y.itos[tags[i]]
            scores = ys[i].data[0]
            max_id = np.argmax(scores)
            if max_id >= vocab_y.size():
                continue
            pred_tag = vocab_y.itos[max_id]
            # one line per token: word, gold tag, predicted tag
            print('{} POS {} {}'.format(word, tag, pred_tag))
        # blank line between sentences
        print(flush=True)
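# The loop above emits one "word POS gold predicted" line per token and a
# blank line between sentences. Illustrative output only; the words and tags
# below are invented, not taken from the original data:
#
#   The POS DT DT
#   dog POS NN NN
#   barks POS VBZ VB
#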
def main(args):
    vocab, vocab_inv = load_vocab(args.model_dir)
    vocab_source, vocab_target = vocab
    vocab_inv_source, vocab_inv_target = vocab_inv

    source_dataset, target_dataset = read_data(vocab_source, vocab_target,
                                               args.source_train, args.target_train,
                                               args.source_dev, args.target_dev,
                                               args.source_test, args.target_test,
                                               reverse_source=True)
    source_dataset_train, source_dataset_dev, source_dataset_test = source_dataset
    target_dataset_train, target_dataset_dev, target_dataset_test = target_dataset

    printb("data #")
    if len(source_dataset_train) > 0:
        print("train {}".format(len(source_dataset_train)))
    if len(source_dataset_dev) > 0:
        print("dev {}".format(len(source_dataset_dev)))
    if len(source_dataset_test) > 0:
        print("test {}".format(len(source_dataset_test)))
    print("vocab {} (source)".format(len(vocab_source)))
    print("vocab {} (target)".format(len(vocab_target)))

    # split into buckets
    source_buckets_train = None
    if len(source_dataset_train) > 0:
        printb("buckets #data (train)")
        source_buckets_train, target_buckets_train = make_buckets(source_dataset_train, target_dataset_train)
        if args.buckets_slice is not None:
            source_buckets_train = source_buckets_train[:args.buckets_slice + 1]
            target_buckets_train = target_buckets_train[:args.buckets_slice + 1]
        for size, data in zip(bucket_sizes, source_buckets_train):
            print("{} {}".format(size, len(data)))

    source_buckets_dev = None
    if len(source_dataset_dev) > 0:
        printb("buckets #data (dev)")
        source_buckets_dev, target_buckets_dev = make_buckets(source_dataset_dev, target_dataset_dev)
        if args.buckets_slice is not None:
            source_buckets_dev = source_buckets_dev[:args.buckets_slice + 1]
            target_buckets_dev = target_buckets_dev[:args.buckets_slice + 1]
        for size, data in zip(bucket_sizes, source_buckets_dev):
            print("{} {}".format(size, len(data)))

    source_buckets_test = None
    if len(source_dataset_test) > 0:
        printb("buckets #data (test)")
        source_buckets_test, target_buckets_test = make_buckets(source_dataset_test, target_dataset_test)
        if args.buckets_slice is not None:
            source_buckets_test = source_buckets_test[:args.buckets_slice + 1]
            target_buckets_test = target_buckets_test[:args.buckets_slice + 1]
        for size, data in zip(bucket_sizes, source_buckets_test):
            print("{} {}".format(size, len(data)))

    model = load_model(args.model_dir)
    assert model is not None

    if args.gpu_device >= 0:
        cuda.get_device(args.gpu_device).use()
        model.to_gpu()

    def mean(l):
        return sum(l) / len(l)

    with chainer.using_config("train", False):
        if source_buckets_train is not None:
            printb("WER (train)")
            wer_train = compute_error_rate_buckets(model, source_buckets_train, target_buckets_train,
                                                   len(vocab_target), args.beam_width, args.alpha)
            print(mean(wer_train), wer_train)

        if source_buckets_dev is not None:
            printb("WER (dev)")
            wer_dev = compute_error_rate_buckets(model, source_buckets_dev, target_buckets_dev,
                                                 len(vocab_target), args.beam_width, args.alpha)
            print(mean(wer_dev), wer_dev)

        if source_buckets_test is not None:
            printb("WER (test)")
            wer_test = compute_error_rate_buckets(model, source_buckets_test, target_buckets_test,
                                                  len(vocab_target), args.beam_width, args.alpha)
            print(mean(wer_test), wer_test)
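# The evaluation entry point above reads a number of attributes from `args`
# (model_dir, source_train/target_train, source_dev/target_dev,
# source_test/target_test, buckets_slice, gpu_device, beam_width, alpha).
# A minimal command-line front end could look like the sketch below; the flag
# names mirror those attribute accesses, but every default value is an
# assumption and not part of the original code.
if __name__ == "__main__":
    import argparse
    parser = argparse.ArgumentParser()
    parser.add_argument("--model-dir", type=str, default="model")
    parser.add_argument("--source-train", type=str, default=None)
    parser.add_argument("--target-train", type=str, default=None)
    parser.add_argument("--source-dev", type=str, default=None)
    parser.add_argument("--target-dev", type=str, default=None)
    parser.add_argument("--source-test", type=str, default=None)
    parser.add_argument("--target-test", type=str, default=None)
    parser.add_argument("--buckets-slice", type=int, default=None)
    parser.add_argument("--gpu-device", type=int, default=-1)
    parser.add_argument("--beam-width", type=int, default=8)
    parser.add_argument("--alpha", type=float, default=0.0)
    main(parser.parse_args())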
def main():
    # load textfile
    vocab, vocab_inv = load_vocab(args.model_dir)
    dataset_train, dataset_dev, dataset_test, _, _ = read_data(args.train_filename, args.dev_filename,
                                                               args.test_filename, vocab=vocab)
    vocab_size = len(vocab)

    printb("data # hash")
    print("train {} {}".format(len(dataset_train), hash(str(dataset_train))))
    if len(dataset_dev) > 0:
        print("dev {} {}".format(len(dataset_dev), hash(str(dataset_dev))))
    if len(dataset_test) > 0:
        print("test {} {}".format(len(dataset_test), hash(str(dataset_test))))
    print("vocab {}".format(vocab_size))

    # split into buckets
    buckets_train = None
    if len(dataset_train) > 0:
        printb("buckets #data (train)")
        buckets_train = make_buckets(dataset_train)
        if args.buckets_slice is not None:
            buckets_train = buckets_train[:args.buckets_slice + 1]
        for size, data in zip(bucket_sizes, buckets_train):
            print("{} {}".format(size, len(data)))

    buckets_dev = None
    if len(dataset_dev) > 0:
        printb("buckets #data (dev)")
        buckets_dev = make_buckets(dataset_dev)
        if args.buckets_slice is not None:
            buckets_dev = buckets_dev[:args.buckets_slice + 1]
        for size, data in zip(bucket_sizes, buckets_dev):
            print("{} {}".format(size, len(data)))

    buckets_test = None
    if len(dataset_test) > 0:
        printb("buckets #data (test)")
        buckets_test = make_buckets(dataset_test)
        if args.buckets_slice is not None:
            buckets_test = buckets_test[:args.buckets_slice + 1]
        for size, data in zip(bucket_sizes, buckets_test):
            print("{} {}".format(size, len(data)))

    # init
    model = load_model(args.model_dir)
    assert model is not None
    if args.gpu_device >= 0:
        chainer.cuda.get_device(args.gpu_device).use()
        model.to_gpu()

    # show log
    def mean(l):
        return sum(l) / len(l)

    sys.stdout.write("\r" + stdout.CLEAR)
    sys.stdout.flush()

    with chainer.using_config("train", False):
        if buckets_train is not None:
            printb("ppl (train)")
            ppl_train = compute_perplexity(model, buckets_train, args.batchsize)
            print(mean(ppl_train), ppl_train)

        if buckets_dev is not None:
            printb("ppl (dev)")
            ppl_dev = compute_perplexity(model, buckets_dev, args.batchsize)
            print(mean(ppl_dev), ppl_dev)

        if buckets_test is not None:
            printb("ppl (test)")
            ppl_test = compute_perplexity(model, buckets_test, args.batchsize)
            print(mean(ppl_test), ppl_test)
def main(args):
    vocab, vocab_inv = load_vocab(args.model_dir)
    vocab_source, vocab_target = vocab
    vocab_inv_source, vocab_inv_target = vocab_inv

    source_dataset, target_dataset = read_data(vocab_source, vocab_target,
                                               args.source_train, None,
                                               args.source_dev, None,
                                               args.source_test, None,
                                               reverse_source=True)
    source_dataset_train, source_dataset_dev, source_dataset_test = source_dataset
    target_dataset_train, target_dataset_dev, target_dataset_test = target_dataset

    printb("data #")
    if len(source_dataset_train) > 0:
        print("train {}".format(len(source_dataset_train)))
    if len(source_dataset_dev) > 0:
        print("dev {}".format(len(source_dataset_dev)))
    if len(source_dataset_test) > 0:
        print("test {}".format(len(source_dataset_test)))

    # split into buckets
    source_buckets_train = None
    if len(source_dataset_train) > 0:
        printb("buckets #data (train)")
        source_buckets_train = make_buckets(source_dataset_train)
        if args.buckets_slice is not None:
            source_buckets_train = source_buckets_train[:args.buckets_slice + 1]
        for size, data in zip(bucket_sizes, source_buckets_train):
            print("{} {}".format(size, len(data)))

    source_buckets_dev = None
    if len(source_dataset_dev) > 0:
        printb("buckets #data (dev)")
        source_buckets_dev = make_buckets(source_dataset_dev)
        if args.buckets_slice is not None:
            source_buckets_dev = source_buckets_dev[:args.buckets_slice + 1]
        for size, data in zip(bucket_sizes, source_buckets_dev):
            print("{} {}".format(size, len(data)))

    source_buckets_test = None
    if len(source_dataset_test) > 0:
        printb("buckets #data (test)")
        source_buckets_test = make_buckets(source_dataset_test)
        if args.buckets_slice is not None:
            source_buckets_test = source_buckets_test[:args.buckets_slice + 1]
        for size, data in zip(bucket_sizes, source_buckets_test):
            print("{} {}".format(size, len(data)))

    # init
    model = load_model(args.model_dir)
    assert model is not None
    if args.gpu_device >= 0:
        cuda.get_device(args.gpu_device).use()
        model.to_gpu()

    if source_buckets_train is not None:
        dump_source_translation(model, source_buckets_train, vocab_inv_source, vocab_inv_target,
                                beam_width=args.beam_width, normalization_alpha=args.alpha)

    if source_buckets_dev is not None:
        dump_source_translation(model, source_buckets_dev, vocab_inv_source, vocab_inv_target,
                                beam_width=args.beam_width, normalization_alpha=args.alpha)

    if source_buckets_test is not None:
        dump_source_translation(model, source_buckets_test, vocab_inv_source, vocab_inv_target,
                                beam_width=args.beam_width, normalization_alpha=args.alpha)
def init_model(self, model_path, vocab_path):
    self.model = get_model()
    load_model(self.model, model_path)
    self.model.eval()
    self.tokenizer = load_vocab(vocab_path)