def main():
    """Write the source and target vocabularies to the data directory.

    The vocabularies are obtained from ``load_dataset_joey(config)`` and each
    one is written with ``to_file`` to ``<data_dir>/vocab.<lang>``, where
    ``<lang>`` is ``config["src"]`` for the source vocabulary and
    ``config["tgt"]`` for the target vocabulary.
    """
    config = setup_config()
    vocab_src, vocab_tgt = load_dataset_joey(config)

    # Build both output paths up front, then write source followed by target.
    data_dir = config["data_dir"]
    out_paths = [
        "{}/vocab.{}".format(data_dir, config[lang_key])
        for lang_key in ("src", "tgt")
    ]
    for vocab, path in zip((vocab_src, vocab_tgt), out_paths):
        vocab.to_file(path)
def main():
    """Train a model on the parallel data, optionally with an extra 'back' dataset.

    Steps:
      1. Build the config, vocabularies and datasets.
      2. Wrap the training set in a shuffling ``DataLoader`` and a
         ``BucketingParallelDataLoader``.
      3. If ``config["back_prefix"]`` is set, wrap ``opt_data['back']`` the
         same way and turn it into an endless iterator with ``cycle``;
         otherwise ``None`` is passed to ``train``.
      4. Create the model, move it to ``config["device"]`` and call ``train``.
    """
    config = setup_config()
    vocab_src, vocab_tgt = load_vocabularies(config)
    train_data, dev_data, opt_data = load_data(
        config, vocab_src=vocab_src, vocab_tgt=vocab_tgt)

    dl = DataLoader(train_data, batch_size=config["batch_size_train"],
                    shuffle=True, num_workers=4)
    bucketing_dl = BucketingParallelDataLoader(dl)

    cycle_iterate_dl_back = None
    # Identity check: None is a singleton, so `is not None` is the correct test.
    if config["back_prefix"] is not None:
        dl_back = DataLoader(dataset=opt_data['back'],
                             batch_size=config["batch_size_train"],
                             shuffle=True, num_workers=2)
        bucketing_dl_back = BucketingParallelDataLoader(dl_back)
        # cycle() lets training keep drawing batches from this loader.
        cycle_iterate_dl_back = cycle(bucketing_dl_back)

    model, train_fn, validate_fn = create_model(vocab_src, vocab_tgt, config)
    model.to(torch.device(config["device"]))

    train(model, train_fn, validate_fn, bucketing_dl, dev_data,
          vocab_src, vocab_tgt, config,
          cycle_iterate_dl_back=cycle_iterate_dl_back)
def main():
    """Restore a model from a fixed checkpoint and run the two sampling routines.

    The training prefix is overridden to ``'sample'``, the first batch of that
    dataset is fetched, and the model's weights are loaded from the
    ``'state_dict'`` entry of the checkpoint. ``sample_from_latent`` is then
    called on the model, followed by ``sample_from_posterior`` on the source
    sentences of the fetched batch.
    """
    config = setup_config()
    config["train_prefix"] = 'sample'
    vocab_src, vocab_tgt = load_vocabularies(config)
    sample_data, _, _ = load_data(config, vocab_src=vocab_src, vocab_tgt=vocab_tgt)

    # Only the first batch is needed; the target side of it is not used here.
    loader = DataLoader(sample_data, batch_size=config["batch_size_eval"],
                        shuffle=False, num_workers=4)
    sentences_x, sentences_y = next(BucketingParallelDataLoader(loader))

    model, _, validate_fn = create_model(vocab_src, vocab_tgt, config)
    device = torch.device(config["device"])
    model.to(device)

    # Alternative checkpoint (en-de):
    #   "output/aevnmt_z_loss_en-de_run_0/checkpoints/aevnmt_z_loss_en-de_run_0"
    checkpoint_path = "output/aevnmt_z_loss_de-en_run_0/checkpoints/aevnmt_z_loss_de-en_run_0"
    checkpoint = torch.load(checkpoint_path)
    model.load_state_dict(checkpoint['state_dict'])

    sample_from_latent(model, vocab_src, vocab_tgt, config)
    sample_from_posterior(model, sentences_x, vocab_src, vocab_tgt, config)
def main():
    """Restore a source->target / target->source model pair and sample from both.

    The training prefix is overridden to ``'sample'`` and the first batch of
    that dataset is fetched. Both models are created with ``create_models``,
    moved to ``config["device"]`` and loaded from the ``'state_dict_xy'`` and
    ``'state_dict_yx'`` entries of a fixed checkpoint. For each direction a
    header line is printed, then ``sample_from_latent`` and
    ``sample_from_posterior`` are run; the reverse direction swaps the two
    vocabularies and uses the target-side sentences of the batch.
    """
    config = setup_config()
    config["train_prefix"] = 'sample'
    vocab_src, vocab_tgt = load_vocabularies(config)
    sample_data, _, _ = load_data(config, vocab_src=vocab_src, vocab_tgt=vocab_tgt)

    loader = DataLoader(sample_data, batch_size=config["batch_size_eval"],
                        shuffle=False, num_workers=4)
    sentences_x, sentences_y = next(BucketingParallelDataLoader(loader))

    # Only the two models are needed; the training/validation callables are ignored.
    model_xy, model_yx, _, _, _ = create_models(vocab_src, vocab_tgt, config)
    device = torch.device(config["device"])
    model_xy.to(device)
    model_yx.to(device)

    checkpoint_path = "output/coaevnmt_curc_diff_greedy_lr2_en-de_run_1/checkpoints/coaevnmt_curc_diff_greedy_lr2_en-de_run_1"
    checkpoint = torch.load(checkpoint_path)
    model_xy.load_state_dict(checkpoint['state_dict_xy'])
    model_yx.load_state_dict(checkpoint['state_dict_yx'])

    # Forward direction: src -> tgt.
    print("validation: {}-{}".format(config["src"], config["tgt"]))
    sample_from_latent(model_xy, vocab_src, vocab_tgt, config)
    sample_from_posterior(model_xy, sentences_x, vocab_src, vocab_tgt, config)
    print("")

    # Reverse direction: tgt -> src, with the vocabularies swapped.
    print("validation: {}-{}".format(config["tgt"], config["src"]))
    sample_from_latent(model_yx, vocab_tgt, vocab_src, config)
    sample_from_posterior(model_yx, sentences_y, vocab_tgt, vocab_src, config)
def main():
    """Build, initialise and train a model through the ``Trainer`` class.

    Data and vocabularies come from ``load_dataset_joey``. The model is moved
    to ``config["device"]`` and passed to ``init_model`` together with the
    source and target indices of the ``config["pad"]`` token, then handed to a
    ``Trainer`` whose ``train_model`` method is called.
    """
    config = setup_config()
    train_data, dev_data, vocab_src, vocab_tgt = load_dataset_joey(config)

    model, train_fn, validate_fn = create_model(vocab_src, vocab_tgt, config)
    device = torch.device(config["device"])
    model.to(device)

    # The padding token is looked up in each vocabulary separately.
    pad_token = config["pad"]
    src_pad_idx = vocab_src.stoi[pad_token]
    tgt_pad_idx = vocab_tgt.stoi[pad_token]
    init_model(model, src_pad_idx, tgt_pad_idx, config)

    trainer = Trainer(model, train_fn, validate_fn, vocab_src, vocab_tgt,
                      train_data, dev_data, config)
    trainer.train_model()
def main():
    """Run ``printKL`` on the dev set for a model restored from a fixed checkpoint.

    The dev prefix is overridden to ``"dev"``. Depending on
    ``config["model_type"]``:

    * ``"coaevnmt"``: both directional models are created, restored from the
      ``'state_dict_xy'`` / ``'state_dict_yx'`` checkpoint entries, and
      ``printKL`` is called once per direction (vocabularies swapped for "yx").
    * ``"aevnmt"``: a single model is restored from ``'state_dict'`` and
      ``printKL`` is called with ``direction="None"``.

    Any other model type does nothing.
    """
    config = setup_config()
    config["dev_prefix"] = "dev"
    # Other evaluation sets that have been used here:
    #   "test_2016_flickr.lc.norm.tok"
    #   "test_2017_flickr.lc.norm.tok"
    vocab_src, vocab_tgt = load_vocabularies(config)
    _, dev_data, _ = load_data(config, vocab_src=vocab_src, vocab_tgt=vocab_tgt)

    checkpoint_path = "output/aevnmt_z_loss_en-de_run_1/checkpoints/aevnmt_z_loss_en-de_run_1"
    model_type = config["model_type"]

    if model_type == "coaevnmt":
        model_xy, model_yx, _, _, validate_fn = create_models(
            vocab_src, vocab_tgt, config)
        device = torch.device(config["device"])
        model_xy.to(device)
        model_yx.to(device)

        checkpoint = torch.load(checkpoint_path)
        model_xy.load_state_dict(checkpoint['state_dict_xy'])
        model_yx.load_state_dict(checkpoint['state_dict_yx'])

        printKL(model_xy, dev_data, vocab_src, vocab_tgt, config, direction="xy")
        printKL(model_yx, dev_data, vocab_tgt, vocab_src, config, direction="yx")
    elif model_type == "aevnmt":
        model, _, _ = create_model(vocab_src, vocab_tgt, config)
        model.to(torch.device(config["device"]))

        checkpoint = torch.load(checkpoint_path)
        model.load_state_dict(checkpoint['state_dict'])

        # Note: the direction is the literal string "None", not the None object.
        printKL(model, dev_data, vocab_src, vocab_tgt, config, direction="None")
def main():
    """Evaluate both directions of a model pair restored from a fixed checkpoint.

    The dev prefix is overridden to ``"test_2017_flickr.lc.norm.tok"``. Both
    models are created with ``create_models``, moved to ``config["device"]``
    and loaded from the ``'state_dict_xy'`` / ``'state_dict_yx'`` checkpoint
    entries. ``evaluate`` is then called for each direction, each preceded by
    a header line; the "yx" call receives the vocabularies in swapped order.
    """
    config = setup_config()
    # Other evaluation sets that have been used here:
    #   "dev"
    #   "test_2016_flickr.lc.norm.tok"
    config["dev_prefix"] = "test_2017_flickr.lc.norm.tok"
    vocab_src, vocab_tgt = load_vocabularies(config)
    _, dev_data, _ = load_data(config, vocab_src=vocab_src, vocab_tgt=vocab_tgt)

    model_xy, model_yx, _, _, validate_fn = create_models(
        vocab_src, vocab_tgt, config)
    device = torch.device(config["device"])
    model_xy.to(device)
    model_yx.to(device)

    # Checkpoints live at "output/<run>/checkpoints/<run>". Runs previously
    # evaluated with this script:
    #   coaevnmt_greedy_lm_off_run_5
    #   coaevnmt_lr3_curriculum_en-de_run_4
    #   coaevnmt_lr3_no_curriculum_no_warmup_en-de_run_4
    #   coaevnmt_lr3_beam_dec_3_en-de_run_3
    #   conmt_anc_en-de_run_3
    #   conmt_greedy_2en-de_run_3
    #   conmt_greedy_no_warmup_en-de_run_3
    #   conmt_beam_dec_3_2en-de_run_1
    #   conmt_beam_dec_5_2en-de_run_3
    #   conmt_beam_dec_10_2en-de_run_3
    #   conmt_beam_dec_10_en-de_run_3
    #   conmt_curc_diff_greedy_conv_yx_en-de_run_7
    #   conmt_final_full_en-de_run_3
    #   conmt_final_half_en-de_run_3
    #   conmt_final_fourth_en-de_run_3
    #   coaevnmt_final_full_en-de_run_3
    #   coaevnmt_final_half_en-de_run_3
    checkpoint_path = "output/coaevnmt_final_fourth_en-de_run_3/checkpoints/coaevnmt_final_fourth_en-de_run_3"
    checkpoint = torch.load(checkpoint_path)
    model_xy.load_state_dict(checkpoint['state_dict_xy'])
    model_yx.load_state_dict(checkpoint['state_dict_yx'])

    # Forward direction: src -> tgt.
    print("validation: {}-{}".format(config["src"], config["tgt"]))
    evaluate(model_xy, dev_data, vocab_src, vocab_tgt, config, direction="xy")

    # Reverse direction: tgt -> src, with the vocabularies swapped.
    print("validation: {}-{}".format(config["tgt"], config["src"]))
    evaluate(model_yx, dev_data, vocab_tgt, vocab_src, config, direction="yx")
def main():
    """Build source and target vocabularies from the ``all_data`` files and save them.

    Each vocabulary is built with ``Vocabulary().from_data`` from the single
    file ``<data_dir>/all_data.<lang>`` using ``min_freq=0`` and
    ``max_size=sys.maxsize``. Statistics for both are printed, then each one
    is saved to ``<data_dir>/vocab_new.<lang>``.
    """
    config = setup_config()
    data_dir = config["data_dir"]
    src_lang = config["src"]
    tgt_lang = config["tgt"]

    # from_data takes a list of files; each side has exactly one.
    src_inputs = [data_dir + "/" + "all_data." + src_lang]
    tgt_inputs = [data_dir + "/" + "all_data." + tgt_lang]

    vocab_src = Vocabulary().from_data(src_inputs, min_freq=0, max_size=sys.maxsize)
    vocab_tgt = Vocabulary().from_data(tgt_inputs, min_freq=0, max_size=sys.maxsize)

    vocab_src.print_statistics()
    vocab_tgt.print_statistics()

    vocab_src.save(data_dir + "/" + "vocab_new." + src_lang)
    vocab_tgt.save(data_dir + "/" + "vocab_new." + tgt_lang)
def main():
    """Train a model on parallel data together with monolingual source and target data.

    Steps:
      1. Build the config, vocabularies and datasets.
      2. Wrap the parallel training set in a shuffling ``DataLoader`` and a
         ``BucketingParallelDataLoader``.
      3. Wrap ``opt_data['mono_src']`` and ``opt_data['mono_tgt']`` in
         shuffling ``DataLoader``s and ``BucketingTextDataLoader``s, and turn
         each into an endless iterator with ``cycle``.
      4. Create the model, print whether its embeddings are the same objects
         as those of ``model.model_xy``, move it to ``config["device"]`` and
         call ``train``.
    """
    config = setup_config()
    vocab_src, vocab_tgt = load_vocabularies(config)
    train_data, dev_data, opt_data = load_data(
        config, vocab_src=vocab_src, vocab_tgt=vocab_tgt)

    # Parallel (bilingual) training data.
    dl_xy = DataLoader(train_data, batch_size=config["batch_size_train"],
                       shuffle=True, num_workers=2)
    bucketing_dl_xy = BucketingParallelDataLoader(dl_xy)

    # Monolingual source-side data, cycled so it never runs out.
    dl_x = DataLoader(dataset=opt_data['mono_src'],
                      batch_size=config["batch_size_train"],
                      shuffle=True, num_workers=2)
    bucketing_dl_x = BucketingTextDataLoader(dl_x)
    cycle_iterate_dl_x = cycle(bucketing_dl_x)

    # Monolingual target-side data, cycled so it never runs out.
    dl_y = DataLoader(dataset=opt_data['mono_tgt'],
                      batch_size=config["batch_size_train"],
                      shuffle=True, num_workers=2)
    bucketing_dl_y = BucketingTextDataLoader(dl_y)
    cycle_iterate_dl_y = cycle(bucketing_dl_y)

    model, bi_train_fn, mono_train_fn, validate_fn = create_model(
        vocab_src, vocab_tgt, config)

    # Diagnostic output: are the embedding modules shared with model_xy?
    print(model.emb_src is model.model_xy.emb_src)
    print(model.emb_tgt is model.model_xy.emb_tgt)

    # A stray bare identifier here used to raise NameError, so the two
    # statements below were never reached; it has been removed.
    model.to(torch.device(config["device"]))

    train(model, bi_train_fn, mono_train_fn, validate_fn, bucketing_dl_xy,
          dev_data, cycle_iterate_dl_x, cycle_iterate_dl_y,
          vocab_src, vocab_tgt, config)
def main():
    """Beam-search decode the "comparable" set with a restored model and print BLEU.

    The dev prefix is overridden to ``"comparable"`` and the model is loaded
    from the ``'state_dict'`` entry of a checkpoint under ``config["out_dir"]``.
    Each batch from a plain (non-bucketing) ``DataLoader`` is sorted by
    descending whitespace-token count before encoding; the decoded hypotheses
    are put back into the original order so they line up with the references.
    All hypotheses are saved with ``save_hypotheses``, both lists go through
    ``clean_sentences``, and the ``sacrebleu.raw_corpus_bleu`` score is printed.
    """
    config = setup_config()
    config["dev_prefix"] = "comparable"
    vocab_src, vocab_tgt = load_vocabularies(config)
    _, dev_data, _ = load_data(config, vocab_src=vocab_src, vocab_tgt=vocab_tgt)

    model, _, validate_fn = create_model(vocab_src, vocab_tgt, config)
    model.to(torch.device(config["device"]))

    checkpoint_path = "{}/cond_nmt_de-en_run_7/checkpoints/cond_nmt_de-en_run_7".format(
        config["out_dir"])
    checkpoint = torch.load(checkpoint_path)
    model.load_state_dict(checkpoint['state_dict'])
    model.eval()

    # Any non-"cpu" setting maps to the first CUDA device.
    device = torch.device("cpu" if config["device"] == "cpu" else "cuda:0")

    model_hypotheses = []
    references = []
    with torch.no_grad():
        val_dl = DataLoader(dev_data, batch_size=config["batch_size_eval"],
                            shuffle=False, num_workers=4)
        for sentences_x, sentences_y in tqdm(val_dl):
            # Sort the source side longest-first (by whitespace token count).
            sentences_x = np.array(sentences_x)
            lengths = np.array([len(sentence.split()) for sentence in sentences_x])
            order = np.argsort(-lengths)
            sentences_x = sentences_x[order]

            x_in, _, x_mask, x_len = create_batch(sentences_x, vocab_src, device)
            x_mask = x_mask.unsqueeze(1)

            # The two model types differ only in how the encoder output and
            # the initial decoder state are produced.
            if config["model_type"] == "aevnmt":
                qz = model.inference(x_in, x_mask, x_len)
                z = qz.mean
                enc_output, enc_hidden = model.encode(x_in, x_len, z)
                dec_hidden = model.init_decoder(enc_output, enc_hidden, z)
            else:
                enc_output, enc_hidden = model.encode(x_in, x_len)
                dec_hidden = model.decoder.initialize(enc_output, enc_hidden)

            raw_hypothesis = beam_search(
                model.decoder, model.emb_tgt, model.generate_tm,
                enc_output, dec_hidden, x_mask, vocab_tgt.size(),
                vocab_tgt[SOS_TOKEN], vocab_tgt[EOS_TOKEN], vocab_tgt[PAD_TOKEN],
                config)
            hypothesis = batch_to_sentences(raw_hypothesis, vocab_tgt)

            # Undo the length sort so hypotheses align with the unsorted references.
            restore_order = np.argsort(order)
            model_hypotheses += hypothesis[restore_order].tolist()
            # NOTE(review): sentences_y comes straight from the DataLoader and is
            # assumed to provide .tolist() - confirm against the dataset's collate.
            references += sentences_y.tolist()

    save_hypotheses(model_hypotheses, 0, config, None)
    model_hypotheses, references = clean_sentences(model_hypotheses, references, config)
    bleu = sacrebleu.raw_corpus_bleu(model_hypotheses, [references]).score
    print(bleu)
def main():
    """Beam-search decode the "comparable" set with a restored model and save the output.

    The dev prefix is overridden to ``"comparable"`` and the model is loaded
    from the ``'state_dict'`` entry of a checkpoint under ``config["out_dir"]``.
    Batches come from a ``BucketingParallelDataLoader``; each is encoded and
    decoded with ``beam_search``, and the resulting sentences are accumulated
    and written with ``save_hypotheses``. References are collected but not
    used afterwards.
    """
    config = setup_config()
    config["dev_prefix"] = "comparable"
    vocab_src, vocab_tgt = load_vocabularies(config)
    _, dev_data, _ = load_data(config, vocab_src=vocab_src, vocab_tgt=vocab_tgt)

    model, _, validate_fn = create_model(vocab_src, vocab_tgt, config)
    model.to(torch.device(config["device"]))

    checkpoint_path = "{}/cond_nmt_new_de-en_run_2/checkpoints/cond_nmt_new_de-en_run_2".format(
        config["out_dir"])
    checkpoint = torch.load(checkpoint_path)
    model.load_state_dict(checkpoint['state_dict'])
    model.eval()

    # Any non-"cpu" setting maps to the first CUDA device.
    device = torch.device("cpu" if config["device"] == "cpu" else "cuda:0")

    model_hypotheses = []
    references = []
    with torch.no_grad():
        loader = DataLoader(dev_data, batch_size=config["batch_size_eval"],
                            shuffle=False, num_workers=4)
        val_dl = BucketingParallelDataLoader(loader)
        for sentences_x, sentences_y in tqdm(val_dl):
            x_in, _, x_mask, x_len = create_batch(sentences_x, vocab_src, device)
            x_mask = x_mask.unsqueeze(1)

            # The branches differ in how the decoder is initialised and in
            # which generator (generate_tm vs generate) is given to beam search.
            if config["model_type"] == "aevnmt":
                qz = model.inference(x_in, x_mask)
                z = qz.mean
                enc_output, enc_hidden = model.encode(x_in, z)
                dec_hidden = model.init_decoder(enc_output, enc_hidden, z)
                raw_hypothesis = beam_search(
                    model.decoder, model.emb_tgt, model.generate_tm,
                    enc_output, dec_hidden, x_mask, vocab_tgt.size(),
                    vocab_tgt[SOS_TOKEN], vocab_tgt[EOS_TOKEN],
                    vocab_tgt[PAD_TOKEN], config)
            else:
                enc_output, enc_hidden = model.encode(x_in)
                dec_hidden = model.decoder.initialize(enc_output, enc_hidden)
                raw_hypothesis = beam_search(
                    model.decoder, model.emb_tgt, model.generate,
                    enc_output, dec_hidden, x_mask, vocab_tgt.size(),
                    vocab_tgt[SOS_TOKEN], vocab_tgt[EOS_TOKEN],
                    vocab_tgt[PAD_TOKEN], config)

            hypothesis = batch_to_sentences(raw_hypothesis, vocab_tgt)
            model_hypotheses += hypothesis.tolist()
            references += sentences_y.tolist()

    save_hypotheses(model_hypotheses, 0, config, None)