def main(fairseq_args):
    # Create preprocessing for training
    builder = DatasetBuilder(
        base_path="/home/scarrion/datasets/nn/translation",
        datasets=[
            # {"name": "multi30k_test", "languages": ["de-en"], "sizes": [("original", None)]},
            {"name": "europarl_lc", "languages": ["de-en"], "sizes": [("50k", 50000)]},
        ],
        subword_models=["unigram"],
        vocab_sizes=[400],
        merge_vocabs=False,
        force_overwrite=False,
        use_cmd=False,
        eval_mode="same",
        conda_env_name="mltests",
        letter_case="lower",
    ).build(make_plots=False)

    # Create preprocessing for training and testing
    tr_datasets = builder.get_ds()
    ts_datasets = builder.get_ds(ignore_variants=True)

    # Train & Score a model for each dataset
    scores = []
    run_prefix = "transformer256emb"
    for ds in tr_datasets:
        try:
            wandb_params = dict(project="fairseq", entity="salvacarrion")
            model = FairseqTranslator(conda_fairseq_env_name="fairseq", model_ds=ds, wandb_params=wandb_params,
                                      force_overwrite=True, run_prefix=run_prefix)
            model.fit(max_epochs=100, max_tokens=4096, batch_size=None, seed=1234, patience=10, num_workers=12,
                      devices="auto", fairseq_args=fairseq_args)
            m_scores = model.predict(ts_datasets, metrics={"bleu"}, beams=[5])
            scores.append(m_scores)
        except Exception as e:
            print(e)

    # Make report and print it
    output_path = f".outputs/fairseq/{str(datetime.datetime.now())}"
    df_report, df_summary = generate_report(scores=scores, output_path=output_path,
                                            plot_metric="beam5__sacrebleu_bleu_score")  # match beams=[5] above
    print("Summary:")
    print(df_summary.to_string(index=False))
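# Hypothetical usage sketch for main(fairseq_args): the wrapper above forwards
# fairseq_args to fairseq-train, so a plausible call passes raw fairseq CLI flags.
# The exact flags/values used for the "transformer256emb" runs are not recorded
# here, and whether the wrapper expects one token per element (as below) or one
# "--flag value" string per element is also an assumption.
if __name__ == "__main__":
    example_fairseq_args = [
        "--arch", "transformer",
        "--encoder-embed-dim", "256",
        "--decoder-embed-dim", "256",
        "--optimizer", "adam",
        "--lr", "0.0005",
        "--lr-scheduler", "inverse_sqrt",
        "--warmup-updates", "4000",
        "--criterion", "label_smoothed_cross_entropy",
        "--label-smoothing", "0.1",
        "--dropout", "0.1",
    ]
    main(example_fairseq_args)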
def main():
    # Create preprocessing for training
    builder = DatasetBuilder(
        base_path="/home/scarrion/datasets/nn/translation",
        datasets=[
            {"name": "multi30k", "languages": ["de-en"], "sizes": [("original", None)]},
            # {"name": "europarl", "languages": ["de-en"], "sizes": [("100k", 100000)]},
        ],
        subword_models=["word"],
        vocab_sizes=[250, 500, 1000, 2000, 4000, 8000],
        merge_vocabs=False,
        force_overwrite=False,
        use_cmd=True,
        eval_mode="same",
        letter_case="lower",
    ).build(make_plots=False, safe=True)

    # Create preprocessing for training and testing
    tr_datasets = builder.get_ds()
    ts_datasets = builder.get_ds(ignore_variants=True)

    # Save embeddings
    rows = []
    origin_emb_size = 304
    name = "ae_linear"
    for ds in tr_datasets:
        print(f"Encoding data for: {str(ds)}")

        # Encode data
        src_scores, trg_scores = encode_data(f".outputs/tmp/{origin_emb_size}/{str(ds)}", enc_dim=256, name=name)

        # Keep info
        src_scores["emb_name"] = "src"
        src_scores["dataset_name"] = ds.dataset_name
        src_scores["subword_model"] = ds.subword_model
        src_scores["vocab_size"] = ds.vocab_size
        trg_scores["emb_name"] = "trg"
        trg_scores["dataset_name"] = ds.dataset_name
        trg_scores["subword_model"] = ds.subword_model
        trg_scores["vocab_size"] = ds.vocab_size
        rows.append(src_scores)
        rows.append(trg_scores)

        # # Encode data 2
        # trg_scores = encode_data2(f".outputs/tmp/{origin_emb_size}/{str(ds)}", enc_dim=256, name=name)
        #
        # # Keep info
        # trg_scores["emb_name"] = "trg"
        # trg_scores["dataset_name"] = ds.dataset_name
        # trg_scores["subword_model"] = ds.subword_model
        # trg_scores["vocab_size"] = ds.vocab_size
        # rows.append(trg_scores)

    # Print results
    df = pd.DataFrame(rows)
    df.to_csv(f".outputs/tmp/{origin_emb_size}/{name}_{origin_emb_size}.csv", index=False)
    print(df)
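# Hypothetical sketch of the "ae_linear" compressor that encode_data() is assumed
# to use above: a single linear autoencoder that maps the original embeddings
# (e.g. 304-dim) down to enc_dim and back, trained with an MSE reconstruction
# loss. Function name and hyper-parameters are illustrative, not the project's
# actual implementation.
import numpy as np
import torch
import torch.nn as nn

def compress_embeddings_ae_linear(emb: np.ndarray, enc_dim: int = 256,
                                  epochs: int = 200, lr: float = 1e-3) -> np.ndarray:
    x = torch.tensor(emb, dtype=torch.float32)
    encoder = nn.Linear(x.shape[1], enc_dim)
    decoder = nn.Linear(enc_dim, x.shape[1])
    opt = torch.optim.Adam(list(encoder.parameters()) + list(decoder.parameters()), lr=lr)
    loss_fn = nn.MSELoss()
    for _ in range(epochs):
        opt.zero_grad()
        loss = loss_fn(decoder(encoder(x)), x)  # reconstruction error
        loss.backward()
        opt.step()
    with torch.no_grad():
        return encoder(x).numpy()  # compressed (enc_dim) embeddings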
def main():
    # Create preprocessing for training
    builder = DatasetBuilder(
        base_path="/home/scarrion/datasets/nn/translation",
        datasets=[
            {"name": "multi30k", "languages": ["de-en"], "sizes": [("original", None)]},
            # {"name": "europarl", "languages": ["de-en"], "sizes": [("original", None)]},
        ],
        subword_models=["word"],
        vocab_sizes=[8000],
        merge_vocabs=False,
        force_overwrite=True,
        use_cmd=True,
        eval_mode="same",
        letter_case="lower",
    ).build(make_plots=False)

    # Create preprocessing for training and testing
    tr_datasets = builder.get_ds()
    ts_datasets = builder.get_ds(ignore_variants=True)

    # Train & Score a model for each dataset
    scores = []
    errors = []
    run_prefix = "model_mt8kemb"
    for ds in tr_datasets:
        # try:
        # Instantiate vocabs and model
        src_vocab = Vocabulary(max_tokens=120).build_from_ds(ds=ds, lang=ds.src_lang)
        trg_vocab = Vocabulary(max_tokens=120).build_from_ds(ds=ds, lang=ds.trg_lang)
        model = Transformer(src_vocab_size=len(src_vocab), trg_vocab_size=len(trg_vocab),
                            padding_idx=src_vocab.pad_id, encoder_embed_dim=256, decoder_embed_dim=256)

        # Train model
        wandb_params = dict(project="autonmt-tests", entity="salvacarrion")
        model = AutonmtTranslator(model=model, src_vocab=src_vocab, trg_vocab=trg_vocab, model_ds=ds,
                                  wandb_params=wandb_params, force_overwrite=True, run_prefix=run_prefix)
        model.fit(max_epochs=100, batch_size=128, seed=1234, num_workers=16, patience=10)
        m_scores = model.predict(ts_datasets, metrics={"bleu"}, beams=[1], max_gen_length=120,
                                 load_best_checkpoint=True)
        scores.append(m_scores)
        # except Exception as e:
        #     print(str(e))
        #     errors += [str(e)]

    # Make report and print it
    output_path = f".outputs/autonmt/{str(datetime.datetime.now())}/{run_prefix}"
    df_report, df_summary = generate_report(scores=scores, output_path=output_path,
                                            plot_metric="beam1__sacrebleu_bleu_score")
    print("Summary:")
    print(df_summary.to_string(index=False))
    print(f"Errors: {len(errors)}")
    print(errors)
def main():
    # Create preprocessing for training
    builder = DatasetBuilder(
        base_path="/home/scarrion/datasets/nn/translation",
        datasets=[
            # {"name": "multi30k", "languages": ["de-en"], "sizes": [("original", None)]},
            {"name": "europarl", "languages": ["de-en"], "sizes": [("original_lc", None)]},
        ],
        subword_models=["word"],
        vocab_sizes=[250, 500, 1000, 2000, 4000, 8000],
        merge_vocabs=False,
        force_overwrite=False,
        use_cmd=False,
        eval_mode="same",
        letter_case="lower",
    ).build(make_plots=False, safe=True)

    # Create preprocessing for training and testing
    tr_datasets = builder.get_ds()
    ts_datasets = builder.get_ds(ignore_variants=True)

    # Train & Score a model for each dataset
    scores = []
    errors = []
    max_tokens = 100
    device = "cuda" if torch.cuda.is_available() else "cpu"

    # Export raw embeddings
    run_prefix = "model"
    for ds in tr_datasets:
        # Save embeddings
        model, src_vocab, trg_vocab = load_model(ds, run_prefix)
        save_embeddings_models(model, f".outputs/tmp/256/{str(ds)}")

    pairs = [(250, 500), (500, 1000), (1000, 2000), (2000, 4000), (4000, 8000)]
    compressors = ["random"]
    rows = []
    for origin_emb_size in [256]:
        for sw_small, sw_big in pairs:
            # Get datasets
            ds_small = get_dataset(tr_datasets, sw_small)
            ds_big = get_dataset(tr_datasets, sw_big)
            assert ds_small.dataset_name == ds_big.dataset_name
            assert ds_small.subword_model == ds_big.subword_model

            for comp in compressors:
                # Compress vector
                src_emb, trg_emb = None, None
                if comp not in {None, "none"}:
                    _origin_emb_size = 300 if comp == "glove" else origin_emb_size
                    src_emb, trg_emb = load_compressed_embeddings(f".outputs/tmp/{_origin_emb_size}/{str(ds_big)}",
                                                                  comp, subword_size=sw_big, src_emb=256, trg_emb=256)

                run_prefix = "transformer256emb"  # Don't change it
                model, big_src_vocab, big_trg_vocab = create_big_model(ds_small, ds_big)
                # _model, _big_src_vocab, _big_trg_vocab = expand_model(ds_small, ds_big, comp, run_prefix, src_emb, trg_emb)
                # clone_embeddings(from_model=_model, to_model=model)
                # del _model

                # # Load glove embeddings
                # emb_dir = f".outputs/tmp/glove_256/{sw_small}-{sw_big}/{comp}"
                # emb_path = f"{emb_dir}/raw_glove.npy"
                # if os.path.exists(emb_path):
                #     glove_emb = np.load(emb_path)
                # else:
                #     glove_emb = get_glove_embeddings(big_trg_vocab)
                #
                #     # Save embeddings
                #     utils.make_dir(emb_dir)
                #     np.save(emb_path, glove_emb)
                #
                # # Add trg embedding
                # add_trg_embedding(model, glove_emb)

                # Add embeddings
                model = model.to(device)

                # Load small model and vocabs
                # run_prefix = f"transformer256emb"  # Don't change it
                # model, src_vocab, trg_vocab = load_model(ds_small, run_prefix)
                #
                # run_prefix += f"_zr_oes{origin_emb_size}_c{comp}_{sw_small}-{sw_big}"
                #
                # # Expand model and vocabs
                # model, src_vocab, trg_vocab = expand_model(model, src_vocab, trg_vocab, ds_big, src_emb, trg_emb, comp)
                # model = model.to(device)

                # Test model
                ds_small.subword_model = "none"
                wandb_params = None  # dict(project="autonmt-tests", entity="salvacarrion")
                model = AutonmtTranslator(model=model, model_ds=ds_small, src_vocab=big_src_vocab,
                                          trg_vocab=big_trg_vocab, wandb_params=wandb_params,
                                          run_prefix=run_prefix, force_overwrite=True)
                model.fit(max_epochs=1, learning_rate=0.0001, optimizer="sgd", batch_size=128, seed=1234,
                          num_workers=0, patience=10)
                m_scores = model.predict(eval_datasets=ts_datasets, metrics={"bleu"}, beams=[1],
                                         max_gen_length=max_tokens)
                ds_small.subword_model = "word"

                # Keep results
                bleu = m_scores[0]['beams']['beam1']['sacrebleu_bleu_score']
                row = {"dataset_name": ds_small.dataset_name, "subword_model": ds_small.subword_model,
                       "from_to": f"{sw_small}➔{sw_big}", "origin_emb_size": origin_emb_size,
                       "compressor": comp, "bleu": bleu}
                rows.append(row)
                print(row)

    # Create pandas dataframe
    df = pd.DataFrame(rows)
    df.to_csv("europarl.csv", index=False)
    print(df)
import json

import fasttext
import fasttext.util

# Create preprocessing for training
builder_big = DatasetBuilder(
    base_path="/home/scarrion/datasets/nn/translation",
    datasets=[
        # {"name": "multi30k", "languages": ["de-en"], "sizes": [("original", None)]},
        {"name": "europarl", "languages": ["de-en"], "sizes": [("original_lc", None)]},
        # {"name": "europarl", "languages": ["de-en"], "sizes": [("100k", 100000)]},
    ],
    subword_models=["word"],
    vocab_sizes=[16000],
    merge_vocabs=False,
    force_overwrite=False,
    use_cmd=False,
    eval_mode="same",
    letter_case="lower",
).build(make_plots=False, safe=True)

big_datasets = builder_big.get_ds()
ds_ref = big_datasets[0]
base_path = "."

# Load vocabs
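# Hypothetical sketch of how the fastText-based embeddings referenced elsewhere
# (e.g. ".outputs/tmp/256/fasttext256_europarl_lc_16k") could be produced with the
# fasttext/fasttext.util imports above: download the pre-trained vectors, reduce
# them to the target dimension, and look up one vector per vocabulary token.
# Function name, language code, and paths are illustrative assumptions.
import numpy as np

def build_fasttext_matrix(vocab_tokens, lang="de", dim=256):
    fasttext.util.download_model(lang, if_exists="ignore")  # fetches e.g. cc.de.300.bin
    ft = fasttext.load_model(f"cc.{lang}.300.bin")
    fasttext.util.reduce_model(ft, dim)                      # reduce 300 -> dim
    return np.stack([ft.get_word_vector(tok) for tok in vocab_tokens])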
def main():
    file = "trg"
    path = "/home/scarrion/Documents/Programming/Python/mltests/translation/autonmt/.outputs/tmp/256/multi30k_de-en_original_word_8000/"  # (unused here)

    # Create preprocessing for training
    builder = DatasetBuilder(
        base_path="/home/scarrion/datasets/nn/translation",
        datasets=[
            # {"name": "multi30k", "languages": ["de-en"], "sizes": [("original", None)]},
            # {"name": "europarl", "languages": ["de-en"], "sizes": [("100k", 100000)]},
            {"name": "europarl", "languages": ["de-en"], "sizes": [("original_lc", None)]},
        ],
        subword_models=["word"],
        vocab_sizes=[16000],
        merge_vocabs=False,
        force_overwrite=False,
        use_cmd=True,
        eval_mode="same",
        letter_case="lower",
    ).build(make_plots=False, safe=True)

    # Create preprocessing for training and testing
    tr_datasets = builder.get_ds()
    ts_datasets = builder.get_ds(ignore_variants=True)

    for train_tsne in [True, False]:
        for origin_emb_size in [256]:
            for ds in tr_datasets:
                base_path = f".outputs/tmp/{origin_emb_size}/{str(ds)}"
                base_path = "/home/scarrion/Documents/Programming/Python/mltests/translation/autonmt/.outputs/tmp/256/fasttext256_europarl_lc_16k"  # Hard-coded override

                if train_tsne:
                    # Project the embeddings to 2D with t-SNE and cache the result
                    x = np.load(os.path.join(base_path, f"{file}.npy"))
                    x_embedded = TSNE(n_components=2, learning_rate='auto', init='random').fit_transform(x)
                    np.save(os.path.join(base_path, f"{file}_tsne.npy"), x_embedded)
                    print(f"File saved! ({str(ds)})")
                else:
                    # Load the cached 2D projection and plot it
                    x = np.load(os.path.join(base_path, f"{file}_tsne.npy"))
                    labels = utils.read_file_lines(ds.get_vocab_file("en") + ".vocab")
                    labels = [l.split('\t')[0] for l in labels]

                    data = pd.DataFrame(data=x, columns=["f1", "f2"])
                    data["label"] = labels

                    scale = 2.0
                    plt.figure(figsize=(12, 12))
                    sns.set(font_scale=scale)
                    g = sns.scatterplot(x="f1", y="f2", palette=sns.color_palette("hls", 10), data=data,
                                        legend="full", alpha=0.3)
                    # g.set(title=str(ds).replace('_', ' ') + f"\n(source emb. {origin_emb_size})")

                    # Annotate only the words of interest (main_words is assumed to be defined at module level)
                    for i, row in data.iterrows():
                        word = row["label"].replace('▁', '')
                        if word in main_words:
                            g.annotate(word, (row["f1"], row["f2"]), fontsize=24)
                    plt.tight_layout()

                    # Print plot
                    savepath = os.path.join(base_path, "plots")
                    utils.make_dir(savepath)
                    for ext in ["png", "pdf"]:
                        path = os.path.join(savepath, f"tsne_{file}__{str(ds)}_4.{ext}")
                        plt.savefig(path, dpi=300)
                        print(f"Plot saved! ({path})")
                    plt.show()
                    plt.close()
# Load the GloVe vectors into a {word: vector} dictionary
# (assumes path_to_glove_file and embeddings_index = {} are defined above)
with open(path_to_glove_file) as f:
    for line in f:
        word, coefs = line.split(maxsplit=1)
        coefs = np.fromstring(coefs, "f", sep=" ")
        embeddings_index[word] = coefs

# Create preprocessing for training
builder = DatasetBuilder(
    base_path="/home/scarrion/datasets/nn/translation",
    datasets=[
        {"name": "multi30k", "languages": ["de-en"], "sizes": [("original", None)]},
        {"name": "europarl", "languages": ["de-en"], "sizes": [("100k", 100000)]},
    ],
    subword_models=["word"],
    vocab_sizes=[250, 500, 1000, 2000, 4000, 8000],
    merge_vocabs=False,
    force_overwrite=False,
    use_cmd=True,
    eval_mode="same",
    conda_env_name="mltests",
    letter_case="lower",
).build(make_plots=False, safe=True)

# Create preprocessing for training and testing
tr_datasets = builder.get_ds()
ts_datasets = builder.get_ds(ignore_variants=True)

# Train & Score a model for each dataset
scores = []
errors = []
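# Hypothetical sketch of how embeddings_index could be turned into an embedding
# matrix aligned with a vocabulary (cf. get_glove_embeddings() used in the
# expansion script): rows for words missing from GloVe keep a random
# initialisation. The function name and scale are illustrative assumptions.
import numpy as np

def build_glove_matrix(vocab_tokens, embeddings_index, dim=300, seed=1234):
    rng = np.random.default_rng(seed)
    matrix = rng.normal(scale=0.1, size=(len(vocab_tokens), dim)).astype("float32")
    for i, tok in enumerate(vocab_tokens):
        vec = embeddings_index.get(tok)
        if vec is not None and len(vec) == dim:  # keep random init for OOV words
            matrix[i] = vec
    return matrix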
def main():
    # Create preprocessing for training (reference dataset with the big vocab)
    builder_big = DatasetBuilder(
        base_path="/home/scarrion/datasets/nn/translation",
        datasets=[
            # {"name": "multi30k", "languages": ["de-en"], "sizes": [("original", None)]},
            {"name": "europarl", "languages": ["de-en"], "sizes": [("original_lc", None)]},
            # {"name": "europarl", "languages": ["de-en"], "sizes": [("100k", 100000)]},
        ],
        subword_models=["word"],
        vocab_sizes=[16000],
        merge_vocabs=False,
        force_overwrite=False,
        use_cmd=False,
        eval_mode="same",
        letter_case="lower",
    ).build(make_plots=False, safe=True)
    big_datasets = builder_big.get_ds()
    ds_ref = big_datasets[0]

    # Create preprocessing for training
    builder = DatasetBuilder(
        base_path="/home/scarrion/datasets/nn/translation",
        datasets=[
            # {"name": "multi30k", "languages": ["de-en"], "sizes": [("original", None)]},
            # {"name": "europarl", "languages": ["de-en"], "sizes": [("original_lc", None)]},
            {"name": "europarl", "languages": ["de-en"], "sizes": [("100k", 100000)]},
        ],
        subword_models=["word"],
        vocab_sizes=[250, 500, 1000, 2000, 4000, 8000],
        merge_vocabs=False,
        force_overwrite=False,
        use_cmd=False,
        eval_mode="same",
        letter_case="lower",
    ).build(make_plots=False, safe=True)

    # Create preprocessing for training and testing
    tr_datasets = builder.get_ds()
    ts_datasets = builder.get_ds(ignore_variants=True)

    # Train & Score a model for each dataset
    scores = []
    errors = []
    max_tokens = 100
    device = "cuda" if torch.cuda.is_available() else "cpu"

    # Export raw embeddings
    # run_prefix = "model"
    # for ds in tr_datasets:
    #     # Save embeddings
    #     model, src_vocab, trg_vocab = load_model(ds, run_prefix)
    #     save_embeddings_models(model, f".outputs/tmp/256/{str(ds).replace('_test', '')}")

    # pairs = [(250, 500), (500, 1000), (1000, 2000), (2000, 4000), (4000, 8000)]
    pairs = [(8000, 16000), (4000, 8000), (2000, 4000)]  # (500, 1000), (1000, 2000), (2000, 4000), (4000, 8000), (8000, 8000)
    compressors = ["none"]
    rows = []
    batch_size = 64
    for origin_emb_size in [256]:
        for sw_small, sw_big in pairs:
            # Get embeddings/vocabs
            # src_emb, trg_emb = load_compressed_embeddings(f".outputs/tmp/256/{str(ds_ref)}", compressor="none", subword_size=sw_big, src_emb=256, trg_emb=256)
            # src_vocab, trg_vocab = get_ref_vocabs(ds_ref, limit=16000)
            src_emb, trg_emb = load_compressed_embeddings(".outputs/tmp/256/fasttext256_europarl_lc_16k",
                                                          compressor="none", subword_size=sw_big,
                                                          src_emb=256, trg_emb=256)
            src_vocab, trg_vocab = get_ref_vocabs(ds_ref, limit=sw_small)
            # src_vocab2, trg_vocab2 = get_ref_vocabs2(f".outputs/tmp/256/fasttext256_europarl_lc_16k", limit=16000)

            # Load small model and vocabs
            ds_small = get_dataset(tr_datasets, sw_small)
            run_prefix = "model_fasttext256_16k__europarl100k"  # f"model_eu8kemb"  # model_mt8kemb
            model = model_with_embeddings(src_emb, trg_emb)

            # Load checkpoint (to evaluate)
            # ds_small.subword_model = "none"
            # checkpoint_path = ds_small.get_model_checkpoints_path(toolkit="autonmt", run_name=ds_small.get_run_name(run_prefix), fname="checkpoint_best.pt")
            # model_state_dict = torch.load(checkpoint_path)['state_dict']
            # model.load_state_dict(model_state_dict)
            model = model.to(device)

            # Test model
            ds_small.subword_model = "none"
            wandb_params = None  # dict(project="autonmt-tests", entity="salvacarrion")
            model = AutonmtTranslator(model=model, model_ds=ds_small, src_vocab=src_vocab, trg_vocab=trg_vocab,
                                      wandb_params=wandb_params, run_prefix=run_prefix, force_overwrite=True)
            model.fit(max_epochs=30, learning_rate=0.001, optimizer="adam",
                      batch_size=batch_size, seed=1234, num_workers=0, patience=10)
            m_scores = model.predict(eval_datasets=ts_datasets, metrics={"bleu"}, beams=[1],
                                     max_gen_length=max_tokens, batch_size=batch_size)
            ds_small.subword_model = "word"

            # Keep results
            bleu = m_scores[0]['beams']['beam1']['sacrebleu_bleu_score']
            row = {"dataset_name": ds_small.dataset_name, "subword_model": ds_small.subword_model,
                   "from_to": f"{sw_small}", "origin_emb_size": origin_emb_size,
                   "compressor": "none", "bleu": bleu}
            rows.append(row)
            print(row)

    # Create pandas dataframe
    df = pd.DataFrame(rows)
    df.to_csv("europarl2M.csv", index=False)
    print(df)
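# Hypothetical sketch of the embedding-initialisation step behind
# model_with_embeddings(): build the usual translation model and replace its
# source/target embedding tables with the pre-trained (e.g. fastText) matrices.
# Only the nn.Embedding construction is shown; how the project's Transformer
# exposes its embedding layers is an assumption not confirmed by this code.
import numpy as np
import torch
import torch.nn as nn

def init_embedding_from_matrix(matrix: np.ndarray, pad_idx: int = 0, freeze: bool = False) -> nn.Embedding:
    weights = torch.tensor(matrix, dtype=torch.float32)
    # from_pretrained keeps the given rows; freeze=False lets them be fine-tuned
    return nn.Embedding.from_pretrained(weights, freeze=freeze, padding_idx=pad_idx)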