Example #1
def job(apply):
    for language_code in language_codes:
        # prepare raw data without multiprocessing; otherwise concurrent
        # file writes run into race conditions
        print(language_code)
        prepare_target_vector_paths(language_code)
        prepare_polyglot_freq_paths(language_code)
        prepare_ud_paths(language_code)
        for model_type in model_types:
            apply(evaluate_pbos, (
                language_code,
                model_type,
            ))
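Example #1 only defines job(apply); the wiring of the apply callback is not shown. A minimal driver sketch in the spirit of Examples #7 and #8 follows; the concrete language_codes and model_types values and the entry-point placement are assumptions for illustration only.

import multiprocessing as mp

if __name__ == "__main__":
    # assumed values; the snippet above reads these as module-level globals
    language_codes = ("en", "de")
    model_types = ("bos", "pbos")

    with mp.Pool() as pool:
        handles = []
        # pass an `apply` callback that schedules evaluate_pbos on the pool
        job(lambda fn, args: handles.append(pool.apply_async(fn, args)))
        for h in handles:
            h.get()  # re-raise any exception from the workers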
Example #2
def exp(ref_vec_name):
    result_path = Path("results") / "ws" / f"{ref_vec_name}_sasaki"
    ref_vec_path = prepare_target_vector_paths(ref_vec_name).w2v_emb_path
    codecs_path = prepare_codecs_path(ref_vec_path, result_path)

    log_file = open(result_path / "log.txt", "w+")
    logging.basicConfig(level=logging.DEBUG, stream=log_file)

    logger.info("Training...")
    model_info = train(
        ref_vec_path,
        result_path,
        codecs_path=codecs_path,
        H=40_000,
        F=500_000,
        epoch=300,
    )

    logger.info("Inferencing...")
    combined_query_path = prepare_ws_combined_query_path()
    result_emb_path = inference(model_info, combined_query_path)

    logger.info("Evaluating...")
    evaluate(
        dotdict(eval_result_path=result_path / "result.txt",
                pred_path=result_emb_path))
Example #3
def exp(ref_vec_name):
    result_path = Path("results") / "ws_multi" / f"{ref_vec_name}_sasaki"
    ref_vec_path = prepare_target_vector_paths(f"wiki2vec-{ref_vec_name}").w2v_emb_path
    codecs_path = prepare_codecs_path(ref_vec_path, result_path)

    log_file = open(result_path / "log.txt", "w+")
    logging.basicConfig(level=logging.DEBUG, stream=log_file)

    logger.info("Training...")
    train(
        ref_vec_path,
        result_path,
        codecs_path=codecs_path,
        H=40_000,
        F=500_000,
        epoch=300,
    )

    model_info = get_info_from_result_path(result_path / "sep_kvq")

    logger.info("Inferencing...")
    combined_query_path = prepare_ws_combined_query_path(ref_vec_name)
    result_emb_path = inference(model_info, combined_query_path)

    logger.info("Evaluating...")
    evaluate(dotdict(
        model_type="sasaki",
        eval_result_path=result_path / "result.txt",
        pred_path=result_emb_path,
        target_vector_name=ref_vec_name,
        results_dir=result_path,
    ))
Example #4
def exp(language):
    result_path = Path("results") / "pos" / language / "sasaki"

    emb_path = prepare_target_vector_paths(language).w2v_emb_path
    freq_path = prepare_polyglot_freq_paths(language).raw_count_path
    codecs_path = prepare_codecs_path(emb_path, result_path)
    ud_data_path, ud_vocab_path = prepare_ud_paths(language)

    model_info = train(
        emb_path,
        result_path,
        freq_path=freq_path,
        codecs_path=codecs_path,
        epoch=300,
        H=40_000,
        F=500_000
    )

    result_emb_path = inference(model_info, ud_vocab_path)

    with open(result_path / "ud.out", "w") as fout, open(result_path / "ud.log", "w") as ferr:
        cmd = f"""
            python pos_eval.py \
            --dataset {ud_data_path} \
            --embeddings {result_emb_path} \
            --C {70} \
        """.split()
        sp.call(cmd, stdout=fout, stderr=ferr)
Example #5
def main(targets):
    for target in targets:
        target_vector_path = target if target == "EditSim" else prepare_target_vector_paths(
            target).txt_emb_path

        for dataset in get_ws_dataset_names():
            data_path = prepare_ws_dataset_paths(dataset).txt_path
            for oov_handling in ("drop", "zero"):
                result = eval_ws(target_vector_path,
                                 data_path,
                                 lower=True,
                                 oov_handling=oov_handling)
                print(target, result)
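The snippet defines main(targets) but not its entry point. A hedged sketch of the command-line wiring is shown below; the script name and argument handling are assumptions.

import sys

if __name__ == "__main__":
    # e.g. python ws_target_stats.py EditSim google polyglot  (script name is hypothetical)
    main(sys.argv[1:])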
Example #6
def exp(model_type, target_vector_name):
    target_vector_paths = prepare_target_vector_paths(
        f"wiki2vec-{target_vector_name}")
    args = dotdict()

    # misc
    args.results_dir = f"results/ws_multi/{target_vector_name}_{model_type}"
    args.model_type = model_type
    args.log_level = "INFO"
    args.target_vector_name = target_vector_name

    # subword
    if model_type == "bos":
        args.word_boundary = True
    elif model_type in ('pbos', 'pbosn'):
        args.word_boundary = False
    args.subword_min_count = None
    args.subword_uniq_factor = None
    if model_type == 'bos':
        args.subword_min_len = 3
        args.subword_max_len = 6
    elif model_type in ('pbos', 'pbosn'):
        args.subword_min_len = 1
        args.subword_max_len = None

    # subword vocab
    args.subword_vocab_max_size = None
    args.subword_vocab_word_freq = target_vector_paths.word_freq_path
    args.subword_vocab = f"{args.results_dir}/subword_vocab.jsonl"

    # subword prob
    args.subword_prob_take_root = False
    if model_type == 'bos':
        args.subword_prob = None
    elif model_type in ('pbos', 'pbosn'):
        args.subword_prob_min_prob = 0
        args.subword_prob_word_freq = prepare_polyglot_freq_paths(
            target_vector_name).word_freq_path
        args.subword_prob = f"{args.results_dir}/subword_prob.jsonl"

    # training
    args.target_vectors = target_vector_paths.pkl_emb_path
    args.model_path = f"{args.results_dir}/model.pkl"
    args.epochs = 50
    args.lr = 1
    args.lr_decay = True
    args.random_seed = 42
    args.subword_prob_eps = 0.01
    args.subword_weight_threshold = None
    if args.model_type == 'pbosn':
        args.normalize_semb = True
    else:
        args.normalize_semb = False

    # prediction & evaluation
    args.eval_result_path = f"{args.results_dir}/result.txt"
    args.pred_path = f"{args.results_dir}/vectors.txt"
    os.makedirs(args.results_dir, exist_ok=True)

    # redirect log output
    log_file = open(f"{args.results_dir}/info.log", "w+")
    logging.basicConfig(level=logging.INFO, stream=log_file)
    dump_args(args)

    with contextlib.redirect_stdout(log_file), contextlib.redirect_stderr(
            log_file):
        train(args)

        combined_query_path = prepare_ws_combined_query_path(
            args.target_vector_name)

        predict(
            model=args.model_path,
            queries=combined_query_path,
            save=args.pred_path,
            word_boundary=args.word_boundary,
        )

        evaluate(args)
Example #7
        # (tail of the exp() shown in Example #6: predict, then evaluate)
        predict(
            model=args.model_path,
            queries=combined_query_path,
            save=args.pred_path,
            word_boundary=args.word_boundary,
        )

        evaluate(args)


if __name__ == '__main__':
    model_types = ("bos", "pbos")
    target_vector_names = (
        "en",
        "de",
        "it",
        "ru",
    )

    for target_vector_name in target_vector_names:  # avoid race condition
        prepare_target_vector_paths(f"wiki2vec-{target_vector_name}")
        prepare_polyglot_freq_paths(target_vector_name)

    with mp.Pool() as pool:
        results = [
            pool.apply_async(exp, (model_type, target_vector_name))
            for target_vector_name in target_vector_names
            for model_type in model_types
        ]

        for r in results:
            r.get()
Example #8
        # (tail of an exp() experiment function: train, then predict and evaluate)
        train(args)

        # prediction
        time_used = predict(
            model=args.model_path,
            queries=args.query_path,
            save=args.pred_path,
            word_boundary=args.word_boundary,
        )
        print(f"time used: {time_used:.3f}")

        # evaluate
        evaluate(args)


if __name__ == '__main__':
    model_types = ("pbos", "bos")
    target_vector_names = ("google", "polyglot")

    for target_vector_name in target_vector_names:  # avoid race condition
        prepare_target_vector_paths(target_vector_name)

    with mp.Pool() as pool:
        results = [
            pool.apply_async(exp, (model_type, target_vector_name))
            for model_type in model_types
            for target_vector_name in target_vector_names
        ]

        for r in results:
            r.get()
Example #9
"""
A simple script to evaluate the raw PolyGlot vectors on POS tagging.

One can redirect the standard streams of this script to separate the
evaluation results from the training log:

python pos_exp_polyglot.py 2>train.log 1>eval.log
"""

import subprocess as sp

from datasets import prepare_target_vector_paths, polyglot_languages, prepare_ud_paths

for language_code in polyglot_languages:
    ud_vocab_embedding_path = prepare_target_vector_paths(language_code).pkl_emb_path
    ud_data_path, ud_vocab_path = prepare_ud_paths(language_code)

    cmd = f"""
        python pos_eval.py \
        --dataset {ud_data_path} \
        --embeddings {ud_vocab_embedding_path} \
    """.split()
    output = sp.check_output(cmd)
    print(f"{language_code}: {output.decode('utf-8')}")
Example #10
"""
Used to generate word-similarity benchmark statistics for the target vectors
"""
from datasets import prepare_target_vector_paths, prepare_ws_dataset_paths, get_ws_dataset_names
from ws_eval import eval_ws

for lang in (
        "de",
        "en",
        "it",
        "ru",
):
    target_vector_path = prepare_target_vector_paths(
        f"wiki2vec-{lang}").txt_emb_path
    for dataset in get_ws_dataset_names(lang):
        data_path = prepare_ws_dataset_paths(dataset).txt_path
        for oov_handling in ("drop", "zero"):
            print(
                eval_ws(target_vector_path,
                        data_path,
                        lower=True,
                        oov_handling=oov_handling))
Example #11
def evaluate_pbos(language_code, model_type):
    logger.info(
        f"[evaluate_pbos({language_code}, model_type={model_type})] start...")

    # Input files
    polyglot_embeddings_path = prepare_target_vector_paths(language_code)
    polyglot_frequency_path = prepare_polyglot_freq_paths(language_code)

    # Output/result files
    result_path = os.path.join("results", "pos", language_code, model_type)
    os.makedirs(result_path, exist_ok=True)
    subword_vocab_path = os.path.join(result_path, "subword_vocab.jsonl")
    subword_prob_path = os.path.join(result_path, "subword_prob.jsonl")
    subword_embedding_model_path = os.path.join(result_path, "model.pbos")
    training_log_path = subword_embedding_model_path + ".log"
    logger.info(f"[evaluate_pbos({language_code}, model_type={model_type})]"
                f" result_path=`{result_path}`")

    # train subword embedding model using target embeddings and word freq
    if not os.path.exists(subword_embedding_model_path):
        # build subword vocab from target words
        logger.info(
            f"[evaluate_pbos({language_code}, model_type={model_type})]"
            f" building subword vocab...")
        cmd = f"""
            python subwords.py build_vocab \
                --word_freq {polyglot_embeddings_path.word_freq_path} \
                --output {subword_vocab_path} \
        """
        if model_type == 'bos':
            cmd += f" --subword_min_len 3"
            cmd += f" --subword_max_len 6"
        sp.call(cmd.split())

        if model_type in ('pbos', 'pbosn'):
            # build subword prob from word freqs
            logger.info(
                f"[evaluate_pbos({language_code}, model_type={model_type})]"
                f" building subword prob...")
            cmd = f"""
                python subwords.py build_prob \
                    --word_freq {polyglot_frequency_path.word_freq_path} \
                    --output {subword_prob_path} \
            """
            sp.call(cmd.split())
        else:
            logger.info(
                f"[evaluate_pbos({language_code}, model_type={model_type})]"
                f" skipped building subword prob.")

        # invoke training of subword model
        logger.info(
            f"[evaluate_pbos({language_code}, model_type={model_type})]"
            f" training subword model...")
        cmd = f"""
            python pbos_train.py \
              --target_vectors {polyglot_embeddings_path.pkl_emb_path} \
              --model_path {subword_embedding_model_path} \
              --subword_vocab {subword_vocab_path} \
        """
        if model_type == "pbos":
            cmd += f" --subword_prob {subword_prob_path}"
        elif model_type == 'pbosn':
            cmd += f" --subword_prob {subword_prob_path}"
            cmd += f" --normalize_semb"
        cmd = cmd.split()
        with open(training_log_path, "w+") as log:
            sp.call(cmd, stdout=log, stderr=log)
        # with tee_open(training_log_path) as log_tee:
        #     sp.call(cmd, stdout=log_tee.stdin, stderr=log_tee.stdin)
    else:
        logger.info(
            f"[evaluate_pbos({language_code}, model_type={model_type})]"
            f" skipped training subword model.")

    ud_data_path, ud_vocab_path = prepare_ud_paths(language_code)
    ud_vocab_embedding_path = os.path.join(result_path,
                                           "ud_vocab_embedding.txt")

    # predict embeddings for ud vocabs
    if not os.path.exists(ud_vocab_embedding_path):
        logger.info(
            f"[evaluate_pbos({language_code}, model_type={model_type})]"
            f" predicting word embeddings...")
        cmd = f"""
            python pbos_pred.py \
            --queries {ud_vocab_path} \
            --save {ud_vocab_embedding_path} \
            --model {subword_embedding_model_path} \
        """
        # --pre_trained {polyglot_embeddings_path.pkl_emb_path} \
        sp.call(cmd.split())
    else:
        logger.info(
            f"[evaluate_pbos({language_code}, model_type={model_type})]"
            f" skipped predicting word embeddings.")

    logger.info(f"[evaluate_pbos({language_code}, model_type={model_type})]"
                f" evaluating on POS tagging...")
    pos_eval(ud_data_path, ud_vocab_embedding_path, result_path)

    logger.info(f"[evaluate_pbos({language_code}, model_type={model_type})]"
                f" done.")
Example #12
from datasets import prepare_target_vector_paths, polyglot_languages, prepare_ud_paths
from load import load_embedding

for language in polyglot_languages:
    polyglot_path = prepare_target_vector_paths(language).pkl_emb_path
    polyglot_vocab, _ = load_embedding(polyglot_path)
    polyglot_vocab = set(polyglot_vocab)

    _, ud_vocab_path = prepare_ud_paths(language)
    with open(ud_vocab_path) as f:
        ud_vocab = [w.strip() for w in f]

    # fraction of UD vocabulary words that are OOV w.r.t. the PolyGlot vocabulary
    oov = sum(w not in polyglot_vocab for w in ud_vocab) / len(ud_vocab)
    print(language, oov)
Example #13
        # (tail of a preceding build_subword_counter(...) call, shown for context)
        word_boundary=args.word_boundary,
        uniq_factor=args.subword_uniq_factor,
    )
subword_prob = build_subword_prob(
    subword_counter,
    normalize_prob=normalize_prob,
    min_prob=args.subword_prob_min_prob,
    take_root=args.subword_prob_take_root,
)
logger.info(f"subword prob size: {len(subword_prob)}")

logger.info(f"building subword vocab from `{args.vocab_word_freq}`...")
if args.vocab_word_freq is None:
    subword_vocab = set(subword_prob)
else:
    word_freq_path = prepare_target_vector_paths(
        args.vocab_word_freq).word_freq_path
    with open(word_freq_path) as fin:
        word_count_iter = (json.loads(line) for line in file_tqdm(fin))
        subword_counter = build_subword_counter(
            word_count_iter,
            max_size=args.subword_vocab_max_size,
            min_count=args.subword_min_count,
            min_len=args.subword_min_len,
            max_len=args.subword_max_len,
            word_boundary=args.word_boundary,
            uniq_factor=args.subword_uniq_factor,
        )
    subword_vocab = set(subword_counter)
subword_vocab -= set('<>')
logger.info(f"subword vocab size: {len(subword_vocab)}")