Example #1
0
tgt_vocab = Vocab(tgt_vocab_path)

# Load the parallel test corpora. The `with` block closes both handles as
# soon as the lines are in memory instead of leaving them open.
with open(test_src_corpus) as src_file, open(test_tgt_corpus) as trg_file:
    src_lines = src_file.readlines()
    trg_lines = trg_file.readlines()

in_grid, out_grid = 16, 4
grid_size = in_grid + 2 * out_grid + 1
all_dict = {}
latent_dim = 2

for idx, (src_line, trg_line) in enumerate(zip(src_lines, trg_lines)):
    # Only sentence pairs whose target has fewer than 8 whitespace-separated
    # tokens are processed; everything else is skipped.
    ylen = len(trg_line.strip().split())
    if 8 <= ylen:
        #if not (8 <= ylen and ylen <= 12):
        continue

    # Wrap each sentence in <s> ... </s> and map the tokens to vocabulary ids.
    src_tokens = src_vocab.encode("<s> {} </s>".format(
        src_line.strip()).split())
    trg_tokens = tgt_vocab.encode("<s> {} </s>".format(
        trg_line.strip()).split())

    # Build single-sentence batches (leading dimension of size 1).
    x = torch.tensor([src_tokens])
    y = torch.tensor([trg_tokens])
    if torch.cuda.is_available():
        x = x.cuda()
        y = y.cuda()

    # Masks are 1.0 wherever the token id is non-zero. They are placed on the
    # same device as their inputs; the previous unconditional `.cuda()` call
    # raised on CPU-only hosts even though x and y were moved conditionally.
    x_mask = nmt.to_float(torch.ne(x, 0)).to(x.device)
    y_mask = nmt.to_float(torch.ne(y, 0)).to(y.device)
    y_length = y_mask.size(1)

    x_states = nmt.embed_layer(x)
    x_states = nmt.x_encoder(x_states, x_mask)
    # Gradients are disabled only for the "fakegrad" model type; otherwise
    # `suppress()` with no arguments acts as a do-nothing context manager.
    with torch.no_grad() if OPTS.modeltype == "fakegrad" else suppress():
        y_states = nmt.embed_layer(y)
        q_states = nmt.q_encoder_xy(y_states, y_mask, x_states, x_mask)
Example #2
0
 # Read data (the handle is closed as soon as the lines are loaded)
 with open(test_src_corpus) as src_file:
     lines = src_file.readlines()
 latent_candidate_num = OPTS.Tcandidate_num if OPTS.Tlatent_search else None
 decode_times = []
 if OPTS.profile:
     # Profiling mode: repeat the corpus ten times.
     lines = lines * 10
 if OPTS.test_fix_length > 0:
     # Fixed-length mode: keep only sentences with exactly the requested
     # number of whitespace tokens, then decode the first match 300 times.
     lines = [l for l in lines if len(l.split()) == OPTS.test_fix_length]
     if not lines:
         raise SystemError(
             "no test sentence with exactly {} tokens".format(
                 OPTS.test_fix_length))
     lines = [lines[0]] * 300
 trains_stop_stdout_monitor()
 with open(OPTS.result_path, "w") as outf:
     for i, line in enumerate(lines):
         # Make a batch
         tokens = src_vocab.encode("<s> {} </s>".format(
             line.strip()).split())
         x = torch.tensor([tokens])
         if torch.cuda.is_available():
             x = x.cuda()
         start_time = time.time()
         # with torch.no_grad() if not OPTS.scorenet else nullcontext():
         # Predict latent and target words from prior
         if OPTS.scorenet:
             targets = scorenet.translate(x,
                                          n_iter=OPTS.Trefine_steps,
                                          step_size=1.0)
         else:
             targets = nmt.translate(x, refine_steps=OPTS.Trefine_steps)
         # Test for a missing result BEFORE indexing into it: previously
         # `targets[0]` was evaluated first, so a None result raised a
         # TypeError and the fallback below could never run.
         if targets is None:
             # Placeholder output; NOTE(review): presumably three copies of a
             # special token id -- confirm against the vocabulary.
             target_tokens = [2, 2, 2]
         else:
             target_tokens = targets[0].cpu()[0].numpy().tolist()
Example #3
0
        print("Cannot find model in {}".format(model_path))
        sys.exit()
    # Restore the trained weights and put the model into inference mode.
    nmt.load(model_path)
    if torch.cuda.is_available():
        nmt.cuda()
    nmt.train(False)
    src_vocab = Vocab(src_vocab_path)
    tgt_vocab = Vocab(tgt_vocab_path)

    # Testing for language model
    # Read the target corpus, closing the file once the lines are in memory.
    with open(test_tgt_corpus) as tgt_file:
        lines = tgt_file.readlines()
    first_line = lines[0]
    # NOTE: the hard-coded sentence below overrides the line read from the
    # corpus, so the file contents do not influence the rest of this test.
    first_line = "Gut@@ ach : Noch ach Sicherheit ach Fußgän@@ ger ."
    # first_line = "ach ach ."
    print(first_line)
    first_line_tokens = tgt_vocab.encode("<s> {} </s>".format(
        first_line.strip()).split())
    # `input` shadows the builtin; the name is kept because code outside this
    # span may still refer to it.
    input = torch.tensor([first_line_tokens])
    if torch.cuda.is_available():
        input = input.cuda()
    # z = vae.compute_codes(input)
    z = nmt.compute_prior_states(input)
    # z = torch.zeros((1, 6, OPTS.latentdim))
    # All-ones mask with one entry per position along z's second dimension.
    mask = torch.ones((1, z.shape[1]))
    if torch.cuda.is_available():
        mask = mask.cuda()
        z = z.cuda()
    # Keep a copy of the starting latent before it is reassigned below.
    init_z = z.clone()
    for _ in range(10):
        z, tokens = nmt.refine(z,
                               mask,
                               n_steps=1,
Example #4
0
     DATA_ROOT,
     os.path.basename(OPTS.model_path).split(".")[0])
 # Switch the autoencoder to inference mode and use the GPU when present.
 autoencoder.train(False)
 if torch.cuda.is_available():
     autoencoder.cuda()
 with open(out_path, "w") as outf:
     print("code path", out_path)
     # Process the samples in chunks of 512 (source, cfg) pairs.
     for i in range(0, len(samples), 512):
         sub_samples = samples[i:i + 512]
         processed_samples = []
         for src, cfg in sub_samples:
             src = src.strip()
             cfg = cfg.strip()
             src_ids = src_vocab.encode(src.split())
             enc_tree, dec_tree = treegen.build_trees(cfg)
             processed_samples.append((src_ids, enc_tree, dec_tree))
         src_batch, enc_batch, dec_batch = dataset.batch(
             processed_samples)
         # Move the source batch only when CUDA is available, mirroring the
         # model placement above; the previous unconditional `.cuda()` call
         # failed on CPU-only hosts.
         if torch.cuda.is_available():
             src_batch = src_batch.cuda()
         out = autoencoder(src_batch,
                           enc_batch,
                           dec_batch,
                           return_code=True)
         codes = out["codes"]
         # One output line per sample in the chunk, in input order.
         for j in range(len(sub_samples)):
             code = codes[j].int()
             outf.write("{}\n".format(code))
         # Flush after each chunk so partial results reach the file.
         outf.flush()