"max_pvalue" ] bcach = binocache() wholecsvfilename = "WHOLE"+language + "_" + str(min_sizes[language]) + "_results.csv" outfn = language + "_" + str(min_sizes[language]) + "_results.csv" #wholeout = open(wholecsvfilename, "w") with open(op.join(evaluation_output_directory, outfn), "w") as fout: fout.write("\t".join(cols_for_output) + "\n") for mdir in m_dirs: experiment_dirs = [f.path for f in scandir(mdir) if f.is_dir()] # experiment_dirs.sort(key=lambda x:x.split("_")[-1]) for exp_dir in experiment_dirs: param_dict = get_params_from_dirname(exp_dir) pattern = "k" + str(param_dict["k"]) + "_" + "th" + str(param_dict["th"]) + "_type%d.json" induced_candidates_ = load_candidates(exp_dir, pattern=pattern) if len(induced_candidates_) == 0: logger.error(str(param_dict) + " has an empyt directory: " + str(exp_dir)) break induced_candidates_ = [can for can in induced_candidates_ if len(can["entities"]) >= min_sizes[language]] if len(induced_candidates_) < 2: logger.error("\n\nToo few candidates left for " + str(param_dict)) continue all_entities = collect_entities(induced_candidates_) # Generating randoms randomized_candidates_ = [] if num_random > 0: randomized_candidates_ = create_random_candidates(induced_candidates_, num_random + margin)
# Script fragment: mine hard-negative query/passage pairs for training
# (MS MARCO, judging by load_corpus(corpus='marco')). Helpers come from the
# project-local `utils` module; their exact return schemas are unverified here.
import utils

# Command-line contract: <recall_cands_file> <ce_score_file> <outfile>.
recall_cands_file = sys.argv[1]
ce_score_file = sys.argv[2]
outfile = sys.argv[3]

# Dedicated RNG instance so sampling is reproducible without touching the global seed.
random_seed = 111
rng = random.Random(random_seed)
# Target ratio of negatives per positive.
neg_cnt = 4
# Cross-encoder score cut-offs; presumably used to filter negatives/positives
# further down in the (truncated) loop body — TODO confirm.
ce_threshold_neg = 0.1
ce_threshold_pos = 0.9

q_text, p_text, p_title = utils.load_corpus(corpus='marco', q_type='train')
# pos_qp: qid -> positive pids; pos_qp_add: presumably extra positives to exclude from negatives — TODO confirm.
pos_qp, pos_qp_add = utils.load_pos_examples(p_text)
# Recalled candidate pids per query (col=4 selects the pid column, presumably).
cand_qp_all, train_qids = utils.load_candidates(recall_cands_file, col=4)
ce_score = utils.load_ce_score(ce_score_file, train_qids)

# neg examples
neg_qp = {}
for qid, pids in cand_qp_all.items():
    # Only queries with known positives can yield labeled negatives.
    if qid not in pos_qp:
        continue
    select_pid = []
    pos_cnt = len(pos_qp[qid])
    # Scan the top-50 recalled candidates; NOTE(review): IndexError if fewer
    # than 50 pids are recalled for a query — confirm upstream guarantees.
    for index in range(50):
        _pid = pids[index]
        # Stop once we have neg_cnt negatives per positive.
        if len(select_pid) == neg_cnt * pos_cnt:
            break
        # Skip known positives, already-selected pids, and auxiliary positives.
        if _pid in pos_qp[qid] or _pid in select_pid or _pid in pos_qp_add.get(qid, []):
            # (body of this `if` continues beyond this chunk)
msg = ( "path of text file containing embeddings of the queries and candidates" ) parser.add_argument("path_embeddings", help=msg) parser.add_argument("path_output", help="path of output (pickle file)") parser.add_argument("-s", "--seed", type=int, default=91500) args = parser.parse_args() # Check dataset name dataset_name_exp = utils.expand_subtask_name(args.subtask) # Load candidates print("Loading candidates...") path_candidates = "{}/vocabulary/{}.vocabulary.txt".format( args.dir_datasets, dataset_name_exp) candidates = utils.load_candidates(path_candidates, normalize=True) print("Nb candidates: {}".format(len(candidates))) # Load queries print("Loading queries...") path_q_train = "{}/training/data/{}.training.data.txt".format( args.dir_datasets, dataset_name_exp) path_q_dev = "{}/trial/data/{}.trial.data.txt".format( args.dir_datasets, dataset_name_exp) path_q_test = "{}/test/data/{}.test.data.txt".format( args.dir_datasets, dataset_name_exp) q_train, _ = utils.load_queries(path_q_train, normalize=True) q_dev, _ = utils.load_queries(path_q_dev, normalize=True) q_test, _ = utils.load_queries(path_q_test, normalize=True) print("Nb training queries: {}".format(len(q_train))) print("Nb dev queries: {}".format(len(q_dev)))