# Imports required by the excerpts below; project-level objects such as
# Vocab, nmt, scorenet, OPTS and trains_stop_stdout_monitor are assumed to be
# defined elsewhere in the script.
import os
import sys
import time
from contextlib import suppress

import torch

tgt_vocab = Vocab(tgt_vocab_path)
src_lines = open(test_src_corpus).readlines()
trg_lines = open(test_tgt_corpus).readlines()
# Grid over the latent space: in_grid cells inside the main range plus
# out_grid margin cells on each side, hence in_grid + 2 * out_grid + 1 points.
in_grid, out_grid = 16, 4
grid_size = in_grid + 2 * out_grid + 1
all_dict = {}
latent_dim = 2
for idx, (src_line, trg_line) in enumerate(zip(src_lines, trg_lines)):
    ylen = len(trg_line.strip().split())
    # Keep only short targets (fewer than 8 tokens); an earlier variant
    # instead kept lengths in [8, 12].
    if 8 <= ylen:
    # if not (8 <= ylen and ylen <= 12):
        continue
    src_tokens = src_vocab.encode(
        "<s> {} </s>".format(src_line.strip()).split())
    trg_tokens = tgt_vocab.encode(
        "<s> {} </s>".format(trg_line.strip()).split())
    x = torch.tensor([src_tokens])
    y = torch.tensor([trg_tokens])
    if torch.cuda.is_available():
        x = x.cuda()
        y = y.cuda()
    # Non-padding masks (token id 0 is padding).
    x_mask = nmt.to_float(torch.ne(x, 0))
    y_mask = nmt.to_float(torch.ne(y, 0))
    if torch.cuda.is_available():
        x_mask = x_mask.cuda()
        y_mask = y_mask.cuda()
    y_length = y_mask.size(1)
    # Encode the source, then compute the posterior states from (x, y).
    x_states = nmt.embed_layer(x)
    x_states = nmt.x_encoder(x_states, x_mask)
    with torch.no_grad() if OPTS.modeltype == "fakegrad" else suppress():
        y_states = nmt.embed_layer(y)
        q_states = nmt.q_encoder_xy(y_states, y_mask, x_states, x_mask)
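# A minimal sketch (not part of the original script) of how the grid constants
# above could map onto latent coordinates: assuming the in_grid cells cover an
# inner [-1, 1] range with out_grid margin cells on each side, which is why
# grid_size = in_grid + 2 * out_grid + 1 points are needed per dimension.
import numpy as np

step = 2.0 / in_grid  # assumed width of one inner grid cell
axis = np.linspace(-1.0 - out_grid * step, 1.0 + out_grid * step, grid_size)
# With latent_dim == 2, every candidate latent is a (z1, z2) pair:
grid_points = [(z1, z2) for z1 in axis for z2 in axis]
assert len(grid_points) == grid_size ** 2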
# Read data
lines = open(test_src_corpus).readlines()
latent_candidate_num = OPTS.Tcandidate_num if OPTS.Tlatent_search else None
decode_times = []
if OPTS.profile:
    # Repeat the corpus to get a longer run for profiling.
    lines = lines * 10
if OPTS.test_fix_length > 0:
    # Benchmark on a single sentence of a fixed length, repeated 300 times.
    lines = [l for l in lines if len(l.split()) == OPTS.test_fix_length]
    if not lines:
        raise SystemError(
            "no test sentence has length {}".format(OPTS.test_fix_length))
    lines = [lines[0]] * 300
trains_stop_stdout_monitor()
with open(OPTS.result_path, "w") as outf:
    for i, line in enumerate(lines):
        # Make a batch
        tokens = src_vocab.encode(
            "<s> {} </s>".format(line.strip()).split())
        x = torch.tensor([tokens])
        if torch.cuda.is_available():
            x = x.cuda()
        start_time = time.time()
        # with torch.no_grad() if not OPTS.scorenet else nullcontext():
        # Predict latent and target words from prior
        if OPTS.scorenet:
            targets = scorenet.translate(
                x, n_iter=OPTS.Trefine_steps, step_size=1.0)
        else:
            targets = nmt.translate(x, refine_steps=OPTS.Trefine_steps)
        if targets is None:
            # Fall back to a trivial output when decoding fails
            # (token id 2 is presumably a special symbol such as </s>).
            target_tokens = [2, 2, 2]
        else:
            target_tokens = targets[0].cpu()[0].numpy().tolist()
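# Sketch (an assumption, not shown in the excerpt): `start_time` and
# `decode_times` imply that each iteration appends its elapsed time, e.g.
# `decode_times.append(time.time() - start_time)` right after translate()
# returns. With that in place, a profiling summary after the loop would be:
if decode_times:
    print("average decoding time: {:.1f}ms".format(
        1000.0 * sum(decode_times) / len(decode_times)))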
print("Cannot find model in {}".format(model_path)) sys.exit() nmt.load(model_path) if torch.cuda.is_available(): nmt.cuda() nmt.train(False) src_vocab = Vocab(src_vocab_path) tgt_vocab = Vocab(tgt_vocab_path) # Testing for langauge model lines = open(test_tgt_corpus).readlines() first_line = lines[0] first_line = "Gut@@ ach : Noch ach Sicherheit ach Fußgän@@ ger ." # first_line = "ach ach ." print(first_line) first_line_tokens = tgt_vocab.encode("<s> {} </s>".format( first_line.strip()).split()) input = torch.tensor([first_line_tokens]) if torch.cuda.is_available(): input = input.cuda() # z = vae.compute_codes(input) z = nmt.compute_prior_states(input) # z = torch.zeros((1, 6, OPTS.latentdim)) mask = torch.ones((1, z.shape[1])) if torch.cuda.is_available(): mask = mask.cuda() z = z.cuda() init_z = z.clone() for _ in range(10): z, tokens = nmt.refine(z, mask, n_steps=1,
out_path = os.path.join(  # assumed assignment; the excerpt starts mid-statement
    DATA_ROOT, os.path.basename(OPTS.model_path).split(".")[0])
autoencoder.train(False)
if torch.cuda.is_available():
    autoencoder.cuda()
with open(out_path, "w") as outf:
    print("code path", out_path)
    # Encode the samples in batches of 512 and dump their discrete codes.
    for i in range(0, len(samples), 512):
        sub_samples = samples[i:i + 512]
        src_lines = [x[0] for x in sub_samples]
        cfg_lines = [x[1] for x in sub_samples]
        processed_samples = []
        for src, cfg in sub_samples:
            src = src.strip()
            cfg = cfg.strip()
            src_ids = src_vocab.encode(src.split())
            enc_tree, dec_tree = treegen.build_trees(cfg)
            processed_samples.append((src_ids, enc_tree, dec_tree))
        src_batch, enc_batch, dec_batch = dataset.batch(
            processed_samples)
        out = autoencoder(src_batch.cuda(), enc_batch, dec_batch,
                          return_code=True)
        codes = out["codes"]
        for j in range(len(src_lines)):
            src = src_lines[j]
            cfg = cfg_lines[j]
            code = codes[j].int()
            # Write the code as space-separated integers rather than the
            # tensor's repr.
            outf.write("{}\n".format(
                " ".join(str(c) for c in code.view(-1).tolist())))
            outf.flush()
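# Sketch (assumption): since each sample's code is written as space-separated
# integers on its own line, a downstream consumer could load the dump with a
# hypothetical helper like this:
def load_codes(path):
    with open(path) as f:
        return [[int(tok) for tok in line.split()] for line in f]

# e.g. codes = load_codes(out_path)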