Example no. 1
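This example preprocesses a 50k-sentence lower-cased Europarl subset (unigram subwords, vocabulary size 400) and trains a fairseq model for each dataset variant, scoring it with sacreBLEU at beam size 5. The snippet omits its imports; a plausible header is sketched below, where the autonmt module paths are assumptions inferred from the identifiers used (adjust them to your installation):

import datetime

from autonmt.preprocessing import DatasetBuilder         # assumed path
from autonmt.toolkits.fairseq import FairseqTranslator   # assumed path
from autonmt.bundle.report import generate_report        # assumed path
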
def main(fairseq_args):
    # Create preprocessing for training
    builder = DatasetBuilder(
        base_path="/home/scarrion/datasets/nn/translation",
        datasets=[
            # {"name": "multi30k_test", "languages": ["de-en"], "sizes": [("original", None)]},
            {
                "name": "europarl_lc",
                "languages": ["de-en"],
                "sizes": [("50k", 50000)]
            },
        ],
        subword_models=["unigram"],
        vocab_sizes=[400],
        merge_vocabs=False,
        force_overwrite=False,
        use_cmd=False,
        eval_mode="same",
        conda_env_name="mltests",
        letter_case="lower",
    ).build(make_plots=False)

    # Create preprocessing for training and testing
    tr_datasets = builder.get_ds()
    ts_datasets = builder.get_ds(ignore_variants=True)

    # Train & Score a model for each dataset
    scores = []
    run_prefix = "transformer256emb"
    for ds in tr_datasets:
        try:
            wandb_params = dict(project="fairseq", entity="salvacarrion")
            model = FairseqTranslator(conda_fairseq_env_name="fairseq",
                                      model_ds=ds,
                                      wandb_params=wandb_params,
                                      force_overwrite=True,
                                      run_prefix=run_prefix)
            model.fit(max_epochs=100,
                      max_tokens=4096,
                      batch_size=None,
                      seed=1234,
                      patience=10,
                      num_workers=12,
                      devices="auto",
                      fairseq_args=fairseq_args)
            m_scores = model.predict(ts_datasets, metrics={"bleu"}, beams=[5])
            scores.append(m_scores)
        except Exception as e:
            print(e)

    # Make report and print it
    output_path = f".outputs/fairseq/{str(datetime.datetime.now())}"
    df_report, df_summary = generate_report(
        scores=scores,
        output_path=output_path,
        plot_metric="beam1__sacrebleu_bleu_score")
    print("Summary:")
    print(df_summary.to_string(index=False))
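
Example no. 2

This snippet loads embeddings previously exported under .outputs/tmp/304/, re-encodes them to 256 dimensions with a linear autoencoder helper (encode_data, defined elsewhere in the project), and writes the per-dataset scores to a CSV. A plausible import header, with the autonmt path being an assumption:

import pandas as pd

from autonmt.preprocessing import DatasetBuilder  # assumed path
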
def main():
    # Create preprocessing for training
    builder = DatasetBuilder(
        base_path="/home/scarrion/datasets/nn/translation",
        datasets=[
            {"name": "multi30k", "languages": ["de-en"], "sizes": [("original", None)]},
            # {"name": "europarl", "languages": ["de-en"], "sizes": [("100k", 100000)]},
        ],
        subword_models=["word"],
        vocab_sizes=[250, 500, 1000, 2000, 4000, 8000],
        merge_vocabs=False,
        force_overwrite=False,
        use_cmd=True,
        eval_mode="same",
        letter_case="lower",
    ).build(make_plots=False, safe=True)

    # Create preprocessing for training and testing
    tr_datasets = builder.get_ds()
    ts_datasets = builder.get_ds(ignore_variants=True)

    # Save embeddings
    rows = []
    origin_emb_size = 304
    name = "ae_linear"
    for ds in tr_datasets:
        print(f"Encoding data for: {str(ds)}")
        # Encode data
        src_scores, trg_scores = encode_data(f".outputs/tmp/{origin_emb_size}/{str(ds)}", enc_dim=256, name=name)

        # Keep info
        src_scores["emb_name"] = "src"
        src_scores["dataset_name"] = ds.dataset_name
        src_scores["subword_model"] = ds.subword_model
        src_scores["vocab_size"] = ds.vocab_size
        trg_scores["emb_name"] = "trg"
        trg_scores["dataset_name"] = ds.dataset_name
        trg_scores["subword_model"] = ds.subword_model
        trg_scores["vocab_size"] = ds.vocab_size
        rows.append(src_scores)
        rows.append(trg_scores)

        # # Encode data 2
        # trg_scores = encode_data2(f".outputs/tmp/{origin_emb_size}/{str(ds)}", enc_dim=256, name=name)
        #
        # # Keep info
        # trg_scores["emb_name"] = "trg"
        # trg_scores["dataset_name"] = ds.dataset_name
        # trg_scores["subword_model"] = ds.subword_model
        # trg_scores["vocab_size"] = ds.vocab_size
        # rows.append(trg_scores)

    # Print results
    df = pd.DataFrame(rows)
    df.to_csv(f".outputs/tmp/{origin_emb_size}/{name}_{origin_emb_size}.csv", index=False)
    print(df)
Example no. 3
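This example trains autonmt's own Transformer (256-dimensional embeddings, word-level vocabulary of 8000) on Multi30k and scores it with BLEU at beam size 1. A plausible import header, where every autonmt module path is an assumption:

import datetime

from autonmt.preprocessing import DatasetBuilder   # assumed path
from autonmt.toolkits import AutonmtTranslator     # assumed path
from autonmt.modules.models import Transformer     # assumed path
from autonmt.vocabularies import Vocabulary        # assumed path
from autonmt.bundle.report import generate_report  # assumed path
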
def main():
    # Create preprocessing for training
    builder = DatasetBuilder(
        base_path="/home/scarrion/datasets/nn/translation",
        datasets=[
            {
                "name": "multi30k",
                "languages": ["de-en"],
                "sizes": [("original", None)]
            },
            # {"name": "europarl", "languages": ["de-en"], "sizes": [("original", None)]},
        ],
        subword_models=["word"],
        vocab_sizes=[8000],
        merge_vocabs=False,
        force_overwrite=True,
        use_cmd=True,
        eval_mode="same",
        letter_case="lower",
    ).build(make_plots=False)

    # Create preprocessing for training and testing
    tr_datasets = builder.get_ds()
    ts_datasets = builder.get_ds(ignore_variants=True)

    # Train & Score a model for each dataset
    scores = []
    errors = []
    run_prefix = "model_mt8kemb"
    for ds in tr_datasets:
        # try:

        # Instantiate vocabs and model
        src_vocab = Vocabulary(max_tokens=120).build_from_ds(ds=ds,
                                                             lang=ds.src_lang)
        trg_vocab = Vocabulary(max_tokens=120).build_from_ds(ds=ds,
                                                             lang=ds.trg_lang)
        model = Transformer(src_vocab_size=len(src_vocab),
                            trg_vocab_size=len(trg_vocab),
                            padding_idx=src_vocab.pad_id,
                            encoder_embed_dim=256,
                            decoder_embed_dim=256)

        # Train model
        wandb_params = dict(project="autonmt-tests", entity="salvacarrion")
        model = AutonmtTranslator(model=model,
                                  src_vocab=src_vocab,
                                  trg_vocab=trg_vocab,
                                  model_ds=ds,
                                  wandb_params=wandb_params,
                                  force_overwrite=True,
                                  run_prefix=run_prefix)
        model.fit(max_epochs=100,
                  batch_size=128,
                  seed=1234,
                  num_workers=16,
                  patience=10)
        m_scores = model.predict(ts_datasets,
                                 metrics={"bleu"},
                                 beams=[1],
                                 max_gen_length=120,
                                 load_best_checkpoint=True)
        scores.append(m_scores)

        # except Exception as e:
        #     print(str(e))
        #     errors += [str(e)]

    # Make report and print it
    output_path = f".outputs/autonmt/{str(datetime.datetime.now())}/{run_prefix}"
    df_report, df_summary = generate_report(
        scores=scores,
        output_path=output_path,
        plot_metric="beam1__sacrebleu_bleu_score")
    print("Summary:")
    print(df_summary.to_string(index=False))

    print(f"Errors: {len(errors)}")
    print(errors)
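
Example no. 4

This snippet runs a vocabulary-expansion experiment on Europarl: it first exports the raw embeddings of already-trained models and then, for each (small, big) vocabulary pair, builds a model over the big vocabulary, briefly fine-tunes it and scores BLEU. The helpers load_model, save_embeddings_models, get_dataset, load_compressed_embeddings and create_big_model are project-specific and defined elsewhere. A plausible import header (autonmt paths are assumptions):

import pandas as pd
import torch

from autonmt.preprocessing import DatasetBuilder  # assumed path
from autonmt.toolkits import AutonmtTranslator    # assumed path
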
def main():
    # Create preprocessing for training
    builder = DatasetBuilder(
        base_path="/home/scarrion/datasets/nn/translation",
        datasets=[
            # {"name": "multi30k", "languages": ["de-en"], "sizes": [("original", None)]},
            {"name": "europarl", "languages": ["de-en"], "sizes": [("original_lc", None)]},
        ],
        subword_models=["word"],
        vocab_sizes=[250, 500, 1000, 2000, 4000, 8000],
        merge_vocabs=False,
        force_overwrite=False,
        use_cmd=False,
        eval_mode="same",
        letter_case="lower",
    ).build(make_plots=False, safe=True)

    # Create preprocessing for training and testing
    tr_datasets = builder.get_ds()
    ts_datasets = builder.get_ds(ignore_variants=True)

    # Train & Score a model for each dataset
    scores = []
    errors = []
    max_tokens = 100
    device = "cuda" if torch.cuda.is_available() else "cpu"

    # Export raw embeddings
    run_prefix = "model"
    for ds in tr_datasets:
        # Save embeddings
        model, src_vocab, trg_vocab = load_model(ds, run_prefix)
        save_embeddings_models(model, f".outputs/tmp/256/{str(ds)}")

    pairs = [(250, 500), (500, 1000), (1000, 2000), (2000, 4000), (4000, 8000)]
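    # Each (sw_small, sw_big) pair builds a model over the larger vocabulary and then
    # briefly fine-tunes and evaluates it on the smaller-vocabulary dataset; the
    # compressed embeddings loaded below are only consumed by the commented-out
    # expand_model/clone_embeddings path.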
    compressors = ["random"]
    rows = []
    for origin_emb_size in [256]:
        for sw_small, sw_big in pairs:
            # Get datasets
            ds_small = get_dataset(tr_datasets, sw_small)
            ds_big = get_dataset(tr_datasets, sw_big)
            assert ds_small.dataset_name == ds_big.dataset_name
            assert ds_small.subword_model == ds_big.subword_model

            for comp in compressors:
                # Compress vector
                src_emb, trg_emb = None, None
                if comp not in {None, "none"}:
                    _origin_emb_size = 300 if comp == "glove" else origin_emb_size
                    src_emb, trg_emb = load_compressed_embeddings(f".outputs/tmp/{_origin_emb_size}/{str(ds_big)}", comp, subword_size=sw_big, src_emb=256, trg_emb=256)
                run_prefix = f"transformer256emb"  # Don't change it

                model, big_src_vocab, big_trg_vocab = create_big_model(ds_small, ds_big)
                # _model, _big_src_vocab, _big_trg_vocab = expand_model(ds_small, ds_big, comp, run_prefix, src_emb, trg_emb)
                # clone_embeddings(from_model=_model, to_model=model)
                # del _model

                # # Load glove embeddings
                # emb_dir = f".outputs/tmp/glove_256/{sw_small}-{sw_big}/{comp}"
                # emb_path = f"{emb_dir}/raw_glove.npy"
                # if os.path.exists(emb_path):
                #     glove_emb = np.load(emb_path)
                # else:
                #     glove_emb = get_glove_embeddings(big_trg_vocab)
                #
                #     # Save embeddings
                #     utils.make_dir(emb_dir)
                #     np.save(emb_path, glove_emb)
                #
                # # Add trg embedding
                # add_trg_embedding(model, glove_emb)

                # Add embeddings
                model = model.to(device)

                # Load small model and vocabs
                # run_prefix = f"transformer256emb"  # Don't change it
                # model, src_vocab, trg_vocab = load_model(ds_small, run_prefix)
                # # run_prefix += f"_zr_oes{origin_emb_size}_c{comp}_{sw_small}-{sw_big}"
                #
                # # Expand model and vocabs
                # model, src_vocab, trg_vocab = expand_model(model, src_vocab, trg_vocab, ds_big, src_emb, trg_emb, comp)
                # model = model.to(device)

                # Test model
                ds_small.subword_model = "none"
                wandb_params = None  #dict(project="autonmt-tests", entity="salvacarrion")
                model = AutonmtTranslator(model=model, model_ds=ds_small,  src_vocab=big_src_vocab, trg_vocab=big_trg_vocab, wandb_params=wandb_params, run_prefix=run_prefix, force_overwrite=True)
                model.fit(max_epochs=1, learning_rate=0.0001, optimizer="sgd", batch_size=128, seed=1234, num_workers=0, patience=10)
                m_scores = model.predict(eval_datasets=ts_datasets, metrics={"bleu"}, beams=[1], max_gen_length=max_tokens)
                ds_small.subword_model = "word"

                # Keep results
                bleu = m_scores[0]['beams']['beam1']['sacrebleu_bleu_score']
                row = {"dataset_name": ds_small.dataset_name, "subword_model": ds_small.subword_model, "from_to": f"{sw_small}➔{sw_big}", "origin_emb_size": origin_emb_size, "compressor": comp, "bleu": bleu}
                rows.append(row)
                print(row)

    # Create pandas dataframe
    df = pd.DataFrame(rows)
    df.to_csv("europarl.csv", index=False)
    print(df)
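
Example no. 5

This module-level fragment builds the reference Europarl dataset (16k word vocabulary); the fasttext imports suggest it prepares pretrained fastText vectors for that vocabulary.
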
import json

import fasttext
import fasttext.util

# Create preprocessing for training
builder_big = DatasetBuilder(
    base_path="/home/scarrion/datasets/nn/translation",
    datasets=[
        # {"name": "multi30k", "languages": ["de-en"], "sizes": [("original", None)]},
        {
            "name": "europarl",
            "languages": ["de-en"],
            "sizes": [("original_lc", None)]
        },
        # {"name": "europarl", "languages": ["de-en"], "sizes": [("100k", 100000)]},
    ],
    subword_models=["word"],
    vocab_sizes=[16000],
    merge_vocabs=False,
    force_overwrite=False,
    use_cmd=False,
    eval_mode="same",
    letter_case="lower",
).build(make_plots=False, safe=True)
big_datasets = builder_big.get_ds()
ds_ref = big_datasets[0]

base_path = "."

# Load vocabs
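
# A hypothetical continuation, using the official fasttext Python bindings to fetch
# and shrink pretrained vectors (none of this appears in the original fragment):
fasttext.util.download_model('de', if_exists='ignore')  # downloads cc.de.300.bin
ft = fasttext.load_model('cc.de.300.bin')
fasttext.util.reduce_model(ft, 256)                      # reduce 300 -> 256 dimensions
vec = ft.get_word_vector('haus')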
Example no. 6
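This example projects exported embedding matrices to 2-D with t-SNE and plots them with seaborn, annotating a hand-picked set of words (main_words, defined elsewhere); note that base_path is later overridden with a hard-coded fastText embedding directory inside the loop. A plausible import header, where the autonmt paths and the main_words set are assumptions:

import os

import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.manifold import TSNE

from autonmt.preprocessing import DatasetBuilder  # assumed path
from autonmt.bundle import utils                  # assumed path

main_words = {"house", "water", "europe"}  # hypothetical set of words to annotate
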
def main():
    file = "trg"
    path = "/home/scarrion/Documents/Programming/Python/mltests/translation/autonmt/.outputs/tmp/256/multi30k_de-en_original_word_8000/"

    # Create preprocessing for training
    builder = DatasetBuilder(
        base_path="/home/scarrion/datasets/nn/translation",
        datasets=[
            # {"name": "multi30k", "languages": ["de-en"], "sizes": [("original", None)]},
            # {"name": "europarl", "languages": ["de-en"], "sizes": [("100k", 100000)]},
            {
                "name": "europarl",
                "languages": ["de-en"],
                "sizes": [("original_lc", None)]
            },
        ],
        subword_models=["word"],
        vocab_sizes=[16000],
        merge_vocabs=False,
        force_overwrite=False,
        use_cmd=True,
        eval_mode="same",
        letter_case="lower",
    ).build(make_plots=False, safe=True)

    # Create preprocessing for training and testing
    tr_datasets = builder.get_ds()
    ts_datasets = builder.get_ds(ignore_variants=True)

    file = "trg"
    for train_tsne in [True, False]:
        for origin_emb_size in [256]:
            for ds in tr_datasets:
                base_path = f".outputs/tmp/{origin_emb_size}/{str(ds)}"
                base_path = f"/home/scarrion/Documents/Programming/Python/mltests/translation/autonmt/.outputs/tmp/256/fasttext256_europarl_lc_16k"

                if train_tsne:
                    x = np.load(os.path.join(base_path, f"{file}.npy"))
                    x_embedded = TSNE(n_components=2,
                                      learning_rate='auto',
                                      init='random').fit_transform(x)
                    np.save(os.path.join(base_path, f"{file}_tsne.npy"),
                            x_embedded)
                    print(f"File saved! ({str(ds)})")
                else:
                    x = np.load(os.path.join(base_path, f"{file}_tsne.npy"))
                    labels = utils.read_file_lines(
                        ds.get_vocab_file("en") + ".vocab")
                    labels = [l.split('\t')[0] for l in labels]
                    data = pd.DataFrame(data=x, columns=["f1", "f2"])
                    data["label"] = labels

                    scale = 2.0
                    plt.figure(figsize=(12, 12))
                    sns.set(font_scale=scale)

                    g = sns.scatterplot(x="f1",
                                        y="f2",
                                        palette=sns.color_palette("hls", 10),
                                        data=data,
                                        legend="full",
                                        alpha=0.3)
                    # g.set(title=str(ds).replace('_', ' ') + f"\n(source emb. {origin_emb_size})")

                    for i, row in data.iterrows():
                        word = row["label"].replace('▁', '')
                        if word in main_words:
                            g.annotate(word, (row["f1"], row["f2"]),
                                       fontsize=24)
                    plt.tight_layout()

                    # Print plot
                    savepath = os.path.join(base_path, "plots")
                    utils.make_dir(savepath)
                    for ext in ["png", "pdf"]:
                        path = os.path.join(savepath,
                                            f"tsne_{file}__{str(ds)}_4.{ext}")
                        plt.savefig(path, dpi=300)
                        print(f"Plot saved! ({path})")

                    plt.show()
                    plt.close()
Example no. 7
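This fragment parses GloVe vectors from a plain-text file into a dictionary and then builds the Multi30k/Europarl datasets; it stops right after initializing the scores and errors lists. It assumes path_to_glove_file and embeddings_index are already defined, for example (hypothetical values):

import numpy as np

from autonmt.preprocessing import DatasetBuilder  # assumed path

path_to_glove_file = "glove.6B.300d.txt"  # hypothetical location of a GloVe text file
embeddings_index = {}                     # maps word -> vector
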
with open(path_to_glove_file) as f:
    for line in f:
        word, coefs = line.split(maxsplit=1)
        coefs = np.fromstring(coefs, "f", sep=" ")
        embeddings_index[word] = coefs


# Create preprocessing for training
builder = DatasetBuilder(
    base_path="/home/scarrion/datasets/nn/translation",
    datasets=[
        {"name": "multi30k", "languages": ["de-en"], "sizes": [("original", None)]},
        {"name": "europarl", "languages": ["de-en"], "sizes": [("100k", 100000)]},
    ],
    subword_models=["word"],
    vocab_sizes=[250, 500, 1000, 2000, 4000, 8000],
    merge_vocabs=False,
    force_overwrite=False,
    use_cmd=True,
    eval_mode="same",
    conda_env_name="mltests",
    letter_case="lower",
).build(make_plots=False, safe=True)

# Create preprocessing for training and testing
tr_datasets = builder.get_ds()
ts_datasets = builder.get_ds(ignore_variants=True)

# Train & Score a model for each dataset
scores = []
errors = []
Example no. 8
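This example evaluates models whose embedding layers are initialized from pretrained fastText-style vectors (loaded via load_compressed_embeddings) across several vocabulary sizes on Europarl-100k. The helpers load_compressed_embeddings, get_ref_vocabs, get_dataset and model_with_embeddings are project-specific and defined elsewhere. A plausible import header (autonmt paths are assumptions):

import pandas as pd
import torch

from autonmt.preprocessing import DatasetBuilder  # assumed path
from autonmt.toolkits import AutonmtTranslator    # assumed path
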
def main():
    # Create preprocessing for training
    builder_big = DatasetBuilder(
        base_path="/home/scarrion/datasets/nn/translation",
        datasets=[
            # {"name": "multi30k", "languages": ["de-en"], "sizes": [("original", None)]},
            {
                "name": "europarl",
                "languages": ["de-en"],
                "sizes": [("original_lc", None)]
            },
            # {"name": "europarl", "languages": ["de-en"], "sizes": [("100k", 100000)]},
        ],
        subword_models=["word"],
        vocab_sizes=[16000],
        merge_vocabs=False,
        force_overwrite=False,
        use_cmd=False,
        eval_mode="same",
        letter_case="lower",
    ).build(make_plots=False, safe=True)
    big_datasets = builder_big.get_ds()
    ds_ref = big_datasets[0]

    # Create preprocessing for training
    builder = DatasetBuilder(
        base_path="/home/scarrion/datasets/nn/translation",
        datasets=[
            # {"name": "multi30k", "languages": ["de-en"], "sizes": [("original", None)]},
            # {"name": "europarl", "languages": ["de-en"], "sizes": [("original_lc", None)]},
            {
                "name": "europarl",
                "languages": ["de-en"],
                "sizes": [("100k", 100000)]
            },
        ],
        subword_models=["word"],
        vocab_sizes=[250, 500, 1000, 2000, 4000, 8000],
        merge_vocabs=False,
        force_overwrite=False,
        use_cmd=False,
        eval_mode="same",
        letter_case="lower",
    ).build(make_plots=False, safe=True)

    # Create preprocessing for training and testing
    tr_datasets = builder.get_ds()
    ts_datasets = builder.get_ds(ignore_variants=True)

    # Train & Score a model for each dataset
    scores = []
    errors = []
    max_tokens = 100
    device = "cuda" if torch.cuda.is_available() else "cpu"

    # Export raw embeddings
    # run_prefix = "model"
    # for ds in tr_datasets:
    #     # Save embeddings
    #     model, src_vocab, trg_vocab = load_model(ds, run_prefix)
    #     save_embeddings_models(model, f".outputs/tmp/256/{str(ds).replace('_test', '')}")

    # pairs = [(250, 500), (500, 1000), (1000, 2000), (2000, 4000), (4000, 8000)]  #  (500, 1000), (1000, 2000), (2000, 4000), (4000, 8000), (8000, 8000)
    pairs = [
        (8000, 16000), (4000, 8000), (2000, 4000)
    ]  #  (500, 1000), (1000, 2000), (2000, 4000), (4000, 8000), (8000, 8000)
    compressors = ["none"]
    rows = []
    batch_size = 64
    for origin_emb_size in [256]:
        for sw_small, sw_big in pairs:
            # Get embeddings/vocabs
            # src_emb, trg_emb = load_compressed_embeddings(f".outputs/tmp/256/{str(ds_ref)}", compressor="none", subword_size=sw_big, src_emb=256, trg_emb=256)
            # src_vocab, trg_vocab = get_ref_vocabs(ds_ref, limit=16000)

            src_emb, trg_emb = load_compressed_embeddings(
                f".outputs/tmp/256/fasttext256_europarl_lc_16k",
                compressor="none",
                subword_size=sw_big,
                src_emb=256,
                trg_emb=256)
            src_vocab, trg_vocab = get_ref_vocabs(ds_ref, limit=sw_small)
            # src_vocab2, trg_vocab2 = get_ref_vocabs2(f".outputs/tmp/256/fasttext256_europarl_lc_16k", limit=16000)

            # Load small model and vocabs
            ds_small = get_dataset(tr_datasets, sw_small)
            run_prefix = "model_fasttext256_16k__europarl100k"  #f"model_eu8kemb"  # model_mt8kemb
            model = model_with_embeddings(src_emb, trg_emb)

            # Load checkpoint (to evaluate)
            # ds_small.subword_model = "none"
            # checkpoint_path = ds_small.get_model_checkpoints_path(toolkit="autonmt", run_name=ds_small.get_run_name(run_prefix), fname="checkpoint_best.pt")
            # model_state_dict = torch.load(checkpoint_path)['state_dict']
            # model.load_state_dict(model_state_dict)

            model = model.to(device)

            # Test model
            ds_small.subword_model = "none"
            wandb_params = None  #dict(project="autonmt-tests", entity="salvacarrion")
            model = AutonmtTranslator(model=model,
                                      model_ds=ds_small,
                                      src_vocab=src_vocab,
                                      trg_vocab=trg_vocab,
                                      wandb_params=wandb_params,
                                      run_prefix=run_prefix,
                                      force_overwrite=True)
            model.fit(max_epochs=30,
                      learning_rate=0.001,
                      optimizer="adam",
                      batch_size=batch_size,
                      seed=1234,
                      num_workers=0,
                      patience=10)
            m_scores = model.predict(eval_datasets=ts_datasets,
                                     metrics={"bleu"},
                                     beams=[1],
                                     max_gen_length=max_tokens,
                                     batch_size=batch_size)
            ds_small.subword_model = "word"

            # Keep results
            bleu = m_scores[0]['beams']['beam1']['sacrebleu_bleu_score']
            row = {
                "dataset_name": ds_small.dataset_name,
                "subword_model": ds_small.subword_model,
                "from_to": f"{sw_small}",
                "origin_emb_size": origin_emb_size,
                "compressor": "none",
                "bleu": bleu
            }
            rows.append(row)
            print(row)

    # Create pandas dataframe
    df = pd.DataFrame(rows)
    df.to_csv("europarl2M.csv", index=False)
    print(df)