def job(apply):
    for language_code in language_codes:
        # Prepare raw data without multiprocessing, otherwise concurrent
        # file writes can hit race conditions.
        print(language_code)
        prepare_target_vector_paths(language_code)
        prepare_polyglot_freq_paths(language_code)
        prepare_ud_paths(language_code)

        for model_type in model_types:
            apply(evaluate_pbos, (
                language_code,
                model_type,
            ))
def exp(ref_vec_name):
    result_path = Path("results") / "ws" / f"{ref_vec_name}_sasaki"
    ref_vec_path = prepare_target_vector_paths(ref_vec_name).w2v_emb_path
    codecs_path = prepare_codecs_path(ref_vec_path, result_path)

    log_file = open(result_path / "log.txt", "w+")
    logging.basicConfig(level=logging.DEBUG, stream=log_file)

    logger.info("Training...")
    model_info = train(
        ref_vec_path,
        result_path,
        codecs_path=codecs_path,
        H=40_000,
        F=500_000,
        epoch=300,
    )

    logger.info("Inferencing...")
    combined_query_path = prepare_ws_combined_query_path()
    result_emb_path = inference(model_info, combined_query_path)

    logger.info("Evaluating...")
    evaluate(dotdict(
        eval_result_path=result_path / "result.txt",
        pred_path=result_emb_path,
    ))
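# The experiment scripts in this section pass options around as a `dotdict`.
# The repo's own helper is not shown here; the snippet below is only a minimal
# sketch of the assumed behaviour (a dict whose keys can also be read and
# written as attributes), not the actual implementation.
class dotdict(dict):
    __getattr__ = dict.__getitem__   # args.pred_path    -> args["pred_path"]
    __setattr__ = dict.__setitem__   # args.pred_path = ... -> args["pred_path"] = ...
    __delattr__ = dict.__delitem__

# usage sketch (path value is made up for illustration)
args = dotdict(pred_path="results/ws/en_sasaki/vectors.txt")
assert args.pred_path == args["pred_path"]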
def exp(ref_vec_name):
    result_path = Path("results") / "ws_multi" / f"{ref_vec_name}_sasaki"
    ref_vec_path = prepare_target_vector_paths(f"wiki2vec-{ref_vec_name}").w2v_emb_path
    codecs_path = prepare_codecs_path(ref_vec_path, result_path)

    log_file = open(result_path / "log.txt", "w+")
    logging.basicConfig(level=logging.DEBUG, stream=log_file)

    logger.info("Training...")
    train(
        ref_vec_path,
        result_path,
        codecs_path=codecs_path,
        H=40_000,
        F=500_000,
        epoch=300,
    )
    model_info = get_info_from_result_path(result_path / "sep_kvq")

    logger.info("Inferencing...")
    combined_query_path = prepare_ws_combined_query_path(ref_vec_name)
    result_emb_path = inference(model_info, combined_query_path)

    logger.info("Evaluating...")
    evaluate(dotdict(
        model_type="sasaki",
        eval_result_path=result_path / "result.txt",
        pred_path=result_emb_path,
        target_vector_name=ref_vec_name,
        results_dir=result_path,
    ))
def exp(language):
    result_path = Path("results") / "pos" / language / "sasaki"
    emb_path = prepare_target_vector_paths(language).w2v_emb_path
    freq_path = prepare_polyglot_freq_paths(language).raw_count_path
    codecs_path = prepare_codecs_path(emb_path, result_path)
    ud_data_path, ud_vocab_path = prepare_ud_paths(language)

    model_info = train(
        emb_path,
        result_path,
        freq_path=freq_path,
        codecs_path=codecs_path,
        epoch=300,
        H=40_000,
        F=500_000,
    )
    result_emb_path = inference(model_info, ud_vocab_path)

    with open(result_path / "ud.out", "w") as fout, open(result_path / "ud.log", "w") as ferr:
        cmd = f"""
            python pos_eval.py \
                --dataset {ud_data_path} \
                --embeddings {result_emb_path} \
                --C {70} \
        """.split()
        sp.call(cmd, stdout=fout, stderr=ferr)
def main(targets):
    for target in targets:
        target_vector_path = (
            target if target == "EditSim"
            else prepare_target_vector_paths(target).txt_emb_path
        )
        for dataset in get_ws_dataset_names():
            data_path = prepare_ws_dataset_paths(dataset).txt_path
            for oov_handling in ("drop", "zero"):
                result = eval_ws(target_vector_path, data_path, lower=True, oov_handling=oov_handling)
                print(target, result)
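# The "drop" / "zero" switch above controls how out-of-vocabulary words are
# treated on the word-similarity benchmarks. The sketch below only illustrates
# the assumed semantics (drop = skip pairs containing a missing word, zero =
# keep such pairs and score them with a zero vector, i.e. similarity 0); it is
# not the repo's eval_ws implementation and the details may differ.
import numpy as np
from scipy.stats import spearmanr

def ws_eval_sketch(emb, pairs, oov_handling="drop"):
    """emb: dict word -> vector; pairs: iterable of (w1, w2, gold_score)."""
    gold, pred = [], []
    for w1, w2, score in pairs:
        v1, v2 = emb.get(w1), emb.get(w2)
        if v1 is None or v2 is None:
            if oov_handling == "drop":
                continue  # skip pairs containing an OOV word
            sim = 0.0     # "zero": a missing word behaves like the zero vector
        else:
            sim = float(np.dot(v1, v2) / (np.linalg.norm(v1) * np.linalg.norm(v2)))
        gold.append(score)
        pred.append(sim)
    return spearmanr(gold, pred).correlation  # Spearman rank correlation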
def exp(model_type, target_vector_name):
    target_vector_paths = prepare_target_vector_paths(f"wiki2vec-{target_vector_name}")

    args = dotdict()

    # misc
    args.results_dir = f"results/ws_multi/{target_vector_name}_{model_type}"
    args.model_type = model_type
    args.log_level = "INFO"
    args.target_vector_name = target_vector_name

    # subword
    if model_type == "bos":
        args.word_boundary = True
    elif model_type in ('pbos', 'pbosn'):
        args.word_boundary = False
    args.subword_min_count = None
    args.subword_uniq_factor = None
    if model_type == 'bos':
        args.subword_min_len = 3
        args.subword_max_len = 6
    elif model_type in ('pbos', 'pbosn'):
        args.subword_min_len = 1
        args.subword_max_len = None

    # subword vocab
    args.subword_vocab_max_size = None
    args.subword_vocab_word_freq = target_vector_paths.word_freq_path
    args.subword_vocab = f"{args.results_dir}/subword_vocab.jsonl"

    # subword prob
    args.subword_prob_take_root = False
    if model_type == 'bos':
        args.subword_prob = None
    elif model_type in ('pbos', 'pbosn'):
        args.subword_prob_min_prob = 0
        args.subword_prob_word_freq = prepare_polyglot_freq_paths(target_vector_name).word_freq_path
        args.subword_prob = f"{args.results_dir}/subword_prob.jsonl"

    # training
    args.target_vectors = target_vector_paths.pkl_emb_path
    args.model_path = f"{args.results_dir}/model.pkl"
    args.epochs = 50
    args.lr = 1
    args.lr_decay = True
    args.random_seed = 42
    args.subword_prob_eps = 0.01
    args.subword_weight_threshold = None
    if args.model_type == 'pbosn':
        args.normalize_semb = True
    else:
        args.normalize_semb = False

    # prediction & evaluation
    args.eval_result_path = f"{args.results_dir}/result.txt"
    args.pred_path = f"{args.results_dir}/vectors.txt"

    os.makedirs(args.results_dir, exist_ok=True)

    # redirect log output
    log_file = open(f"{args.results_dir}/info.log", "w+")
    logging.basicConfig(level=logging.INFO, stream=log_file)
    dump_args(args)

    with contextlib.redirect_stdout(log_file), contextlib.redirect_stderr(log_file):
        train(args)

        combined_query_path = prepare_ws_combined_query_path(args.target_vector_name)
        predict(
            model=args.model_path,
            queries=combined_query_path,
            save=args.pred_path,
            word_boundary=args.word_boundary,
        )

        evaluate(args)
if __name__ == '__main__':
    model_types = ("bos", "pbos")
    target_vector_names = ("en", "de", "it", "ru")

    for target_vector_name in target_vector_names:
        # avoid race condition
        prepare_target_vector_paths(f"wiki2vec-{target_vector_name}")
        prepare_polyglot_freq_paths(target_vector_name)

    with mp.Pool() as pool:
        results = [
            pool.apply_async(exp, (model_type, target_vector_name))
            for target_vector_name in target_vector_names
            for model_type in model_types
        ]
        for r in results:
            r.get()
    train(args)

    # prediction
    time_used = predict(
        model=args.model_path,
        queries=args.query_path,
        save=args.pred_path,
        word_boundary=args.word_boundary,
    )
    print(f"time used: {time_used:.3f}")

    # evaluate
    evaluate(args)


if __name__ == '__main__':
    model_types = ("pbos", "bos")
    target_vector_names = ("google", "polyglot")

    for target_vector_name in target_vector_names:
        # avoid race condition
        prepare_target_vector_paths(target_vector_name)

    with mp.Pool() as pool:
        results = [
            pool.apply_async(exp, (model_type, target_vector_name))
            for model_type in model_types
            for target_vector_name in target_vector_names
        ]
        for r in results:
            r.get()
""" A simple script used to evaluate the raw PolyGlot vector for POS One can redirect the starnard output of this file to get rid of the training log python pos_exp_polyglot.py 2>train.log 1>eval.log """ import subprocess as sp from datasets import prepare_target_vector_paths, polyglot_languages, prepare_ud_paths for language_code in polyglot_languages: ud_vocab_embedding_path = prepare_target_vector_paths(language_code).pkl_emb_path ud_data_path, ud_vocab_path = prepare_ud_paths(language_code) cmd = f""" python pos_eval.py \ --dataset {ud_data_path} \ --embeddings {ud_vocab_embedding_path} \ """.split() output = sp.check_output(cmd) print(f"{language_code}: {output.decode('utf-8')}")
""" Used to generate target vector statistics """ from datasets import prepare_target_vector_paths, prepare_ws_dataset_paths, get_ws_dataset_names from ws_eval import eval_ws for lang in ( "de", "en", "it", "ru", ): target_vector_path = prepare_target_vector_paths( f"wiki2vec-{lang}").txt_emb_path for dataset in get_ws_dataset_names(lang): data_path = prepare_ws_dataset_paths(dataset).txt_path for oov_handling in ("drop", "zero"): print( eval_ws(target_vector_path, data_path, lower=True, oov_handling=oov_handling))
def evaluate_pbos(language_code, model_type):
    logger.info(f"[evaluate_pbos({language_code}, model_type={model_type})] start...")

    # Input files
    polyglot_embeddings_path = prepare_target_vector_paths(language_code)
    polyglot_frequency_path = prepare_polyglot_freq_paths(language_code)

    # Output/result files
    result_path = os.path.join("results", "pos", language_code, model_type)
    os.makedirs(result_path, exist_ok=True)
    subword_vocab_path = os.path.join(result_path, "subword_vocab.jsonl")
    subword_prob_path = os.path.join(result_path, "subword_prob.jsonl")
    subword_embedding_model_path = os.path.join(result_path, "model.pbos")
    training_log_path = subword_embedding_model_path + ".log"
    logger.info(f"[evaluate_pbos({language_code}, model_type={model_type})]"
                f" result_path=`{result_path}`")

    # train subword embedding model using target embeddings and word freq
    if not os.path.exists(subword_embedding_model_path):
        # build subword vocab from target words
        logger.info(f"[evaluate_pbos({language_code}, model_type={model_type})]"
                    f" building subword vocab...")
        cmd = f"""
            python subwords.py build_vocab \
                --word_freq {polyglot_embeddings_path.word_freq_path} \
                --output {subword_vocab_path} \
        """
        if model_type == 'bos':
            cmd += f" --subword_min_len 3"
            cmd += f" --subword_max_len 6"
        sp.call(cmd.split())

        if model_type in ('pbos', 'pbosn'):
            # build subword prob from word freqs
            logger.info(f"[evaluate_pbos({language_code}, model_type={model_type})]"
                        f" building subword prob...")
            cmd = f"""
                python subwords.py build_prob \
                    --word_freq {polyglot_frequency_path.word_freq_path} \
                    --output {subword_prob_path} \
            """
            sp.call(cmd.split())
        else:
            logger.info(f"[evaluate_pbos({language_code}, model_type={model_type})]"
                        f" skipped building subword prob.")

        # invoke training of subword model
        logger.info(f"[evaluate_pbos({language_code}, model_type={model_type})]"
                    f" training subword model...")
        cmd = f"""
            python pbos_train.py \
                --target_vectors {polyglot_embeddings_path.pkl_emb_path} \
                --model_path {subword_embedding_model_path} \
                --subword_vocab {subword_vocab_path} \
        """
        if model_type == "pbos":
            cmd += f" --subword_prob {subword_prob_path}"
        elif model_type == 'pbosn':
            cmd += f" --subword_prob {subword_prob_path}"
            cmd += f" --normalize_semb"
        cmd = cmd.split()
        with open(training_log_path, "w+") as log:
            sp.call(cmd, stdout=log, stderr=log)
        # with tee_open(training_log_path) as log_tee:
        #     sp.call(cmd, stdout=log_tee.stdin, stderr=log_tee.stdin)
    else:
        logger.info(f"[evaluate_pbos({language_code}, model_type={model_type})]"
                    f" skipped training subword model.")

    ud_data_path, ud_vocab_path = prepare_ud_paths(language_code)
    ud_vocab_embedding_path = os.path.join(result_path, "ud_vocab_embedding.txt")

    # predict embeddings for ud vocabs
    if not os.path.exists(ud_vocab_embedding_path):
        logger.info(f"[evaluate_pbos({language_code}, model_type={model_type})]"
                    f" predicting word embeddings...")
        cmd = f"""
            python pbos_pred.py \
                --queries {ud_vocab_path} \
                --save {ud_vocab_embedding_path} \
                --model {subword_embedding_model_path} \
        """
        # --pre_trained {polyglot_embeddings_path.pkl_emb_path} \
        sp.call(cmd.split())
    else:
        logger.info(f"[evaluate_pbos({language_code}, model_type={model_type})]"
                    f" skipped predicting word embeddings.")

    logger.info(f"[evaluate_pbos({language_code}, model_type={model_type})]"
                f" evaluating on POS tagging...")
    pos_eval(ud_data_path, ud_vocab_embedding_path, result_path)

    logger.info(f"[evaluate_pbos({language_code}, model_type={model_type})]"
                f" done.")
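# For reference, the files evaluate_pbos() writes under
# results/pos/<language_code>/<model_type>/ (paths taken from the code above):
#
#   subword_vocab.jsonl       subword vocabulary built from the target words
#   subword_prob.jsonl        subword probabilities (pbos / pbosn only)
#   model.pbos                trained subword embedding model
#   model.pbos.log            training log of pbos_train.py
#   ud_vocab_embedding.txt    predicted embeddings for the UD vocabulary
#
# plus whatever POS-tagging evaluation output pos_eval() writes into the same
# directory.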
from datasets import prepare_target_vector_paths, polyglot_languages, prepare_ud_paths
from load import load_embedding

for language in polyglot_languages:
    polyglot_path = prepare_target_vector_paths(language).pkl_emb_path
    polyglot_vocab, _ = load_embedding(polyglot_path)
    polyglot_vocab = set(polyglot_vocab)

    _, ud_vocab_path = prepare_ud_paths(language)
    with open(ud_vocab_path) as f:
        ud_vocab = [w.strip() for w in f]

    # OOV rate: fraction of UD vocabulary words missing from the PolyGlot vocabulary
    oov = sum(w not in polyglot_vocab for w in ud_vocab) / len(ud_vocab)
    print(language, oov)
        word_boundary=args.word_boundary,
        uniq_factor=args.subword_uniq_factor,
    )
    subword_prob = build_subword_prob(
        subword_counter,
        normalize_prob=normalize_prob,
        min_prob=args.subword_prob_min_prob,
        take_root=args.subword_prob_take_root,
    )
    logger.info(f"subword prob size: {len(subword_prob)}")

    logger.info(f"building subword vocab from `{args.vocab_word_freq}`...")
    if args.vocab_word_freq is None:
        subword_vocab = set(subword_prob)
    else:
        word_freq_path = prepare_target_vector_paths(args.vocab_word_freq).word_freq_path
        with open(word_freq_path) as fin:
            word_count_iter = (json.loads(line) for line in file_tqdm(fin))
            subword_counter = build_subword_counter(
                word_count_iter,
                max_size=args.subword_vocab_max_size,
                min_count=args.subword_min_count,
                min_len=args.subword_min_len,
                max_len=args.subword_max_len,
                word_boundary=args.word_boundary,
                uniq_factor=args.subword_uniq_factor,
            )
        subword_vocab = set(subword_counter)
    subword_vocab -= set('<>')
    logger.info(f"subword vocab size: {len(subword_vocab)}")
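# A minimal sketch (under stated assumptions, not the repo's build_subword_prob)
# of how subword counts might be turned into the probabilities used above:
# normalize counts to relative frequencies, optionally take the |s|-th root so
# longer subwords are not penalized merely for their length (`take_root`), and
# drop entries below `min_prob`. The exact formula in subwords.py may differ.
def build_subword_prob_sketch(subword_counter, min_prob=0.0, take_root=False):
    total = sum(subword_counter.values()) or 1
    prob = {}
    for sub, count in subword_counter.items():
        p = count / total
        if take_root and len(sub) > 0:
            p = p ** (1.0 / len(sub))   # length-normalized probability
        if p >= min_prob:
            prob[sub] = p
    return prob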