Beispiel #1
0
                   "max_pvalue"
                   ]

# Fragment (truncated scraped example): per-experiment candidate evaluation.
# Loads induced candidate sets from each experiment directory, filters them by
# a per-language minimum entity count, and prepares randomized baselines.
# Depends on names defined outside this excerpt: language, min_sizes,
# evaluation_output_directory, m_dirs, cols_for_output, num_random, margin,
# op (os.path), scandir, logger, and the helpers binocache,
# get_params_from_dirname, load_candidates, collect_entities,
# create_random_candidates — TODO confirm against the full file.
bcach = binocache()
# NOTE(review): wholecsvfilename is computed but its writer is commented out below
wholecsvfilename = "WHOLE"+language + "_" + str(min_sizes[language]) + "_results.csv"
outfn = language + "_" + str(min_sizes[language]) + "_results.csv"
#wholeout = open(wholecsvfilename, "w")
with open(op.join(evaluation_output_directory, outfn), "w") as fout:
    # header row: tab-separated column names
    fout.write("\t".join(cols_for_output) + "\n")
    for mdir in m_dirs:
        # each immediate subdirectory of mdir is one experiment run
        experiment_dirs = [f.path for f in scandir(mdir) if f.is_dir()]
        # experiment_dirs.sort(key=lambda x:x.split("_")[-1])
        for exp_dir in experiment_dirs:
            param_dict = get_params_from_dirname(exp_dir)
            # candidate files follow the naming scheme k<k>_th<th>_type<N>.json
            pattern = "k" + str(param_dict["k"]) + "_" + "th" + str(param_dict["th"]) + "_type%d.json"
            induced_candidates_ = load_candidates(exp_dir, pattern=pattern)
            if len(induced_candidates_) == 0:
                # NOTE(review): "empyt" is a typo ("empty") in this runtime log
                # string — left unchanged here since a doc-only edit must not
                # modify runtime strings.
                logger.error(str(param_dict) + " has an empyt directory: " + str(exp_dir))
                break
            
            # keep only candidates with enough entities for this language
            induced_candidates_ = [can for can in induced_candidates_
                                   if len(can["entities"])  >= min_sizes[language]]
            if len(induced_candidates_) < 2:
                # fewer than 2 candidates: nothing to compare, skip experiment
                logger.error("\n\nToo few candidates left for  " + str(param_dict))
                continue
            all_entities = collect_entities(induced_candidates_)
            # Generating randoms
            randomized_candidates_ = []
            if num_random > 0:
                # `margin` extra randoms beyond num_random — presumably spares
                # for ones rejected later; verify against downstream use
                randomized_candidates_ = create_random_candidates(induced_candidates_,
                                                                  num_random + margin)
Beispiel #2
0
# Fragment (truncated scraped example): mines hard-negative passages for
# dense-retrieval training from recall-stage candidates plus cross-encoder
# scores (corpus='marco' suggests MS MARCO). Requires sys and random to be
# imported above this excerpt; the trailing `if` has no body here because the
# excerpt is cut off.
import utils

recall_cands_file = sys.argv[1]  # candidate list produced by the recall stage
ce_score_file = sys.argv[2]      # cross-encoder scores for (qid, pid) pairs
outfile = sys.argv[3]            # destination for the mined examples

# fixed seed so negative sampling is reproducible
random_seed = 111
rng = random.Random(random_seed)

neg_cnt = 4              # negatives to collect per positive passage
ce_threshold_neg = 0.1   # presumably max CE score for a valid negative — TODO confirm
ce_threshold_pos = 0.9   # presumably min CE score for a valid positive — TODO confirm

q_text, p_text, p_title = utils.load_corpus(corpus='marco', q_type='train')
pos_qp, pos_qp_add = utils.load_pos_examples(p_text)
cand_qp_all, train_qids = utils.load_candidates(recall_cands_file, col=4)
ce_score = utils.load_ce_score(ce_score_file, train_qids)

# neg examples
neg_qp = {}
for qid, pids in cand_qp_all.items():
    # only mine negatives for queries that have known positives
    if qid not in pos_qp:
        continue
    select_pid = []
    pos_cnt = len(pos_qp[qid])
    # scan the top-50 recalled passages, stopping once neg_cnt per positive
    # have been collected.
    # NOTE(review): assumes every qid has >= 50 candidates — pids[index] would
    # raise IndexError otherwise; confirm against load_candidates.
    for index in range(50):
        _pid = pids[index]
        if len(select_pid) == neg_cnt * pos_cnt:
            break
        # skip known positives (primary and additional) and already-chosen pids
        if _pid in pos_qp[qid] or _pid in select_pid or _pid in pos_qp_add.get(
                qid, []):
    # Fragment: interior of a CLI entry point (the enclosing function/header is
    # outside this excerpt; `parser`, `utils`, and earlier arguments such as
    # --subtask and dir_datasets are defined above). Finishes argument parsing,
    # then loads the candidate vocabulary and the train/dev/test query files
    # for the expanded subtask dataset.
    msg = (
        "path of text file containing embeddings of the queries and candidates"
    )
    parser.add_argument("path_embeddings", help=msg)
    parser.add_argument("path_output", help="path of output (pickle file)")
    parser.add_argument("-s", "--seed", type=int, default=91500)
    args = parser.parse_args()

    # Check dataset name
    # expand the short subtask flag into the full dataset name used in paths
    dataset_name_exp = utils.expand_subtask_name(args.subtask)

    # Load candidates
    print("Loading candidates...")
    # vocabulary file layout: <dir_datasets>/vocabulary/<dataset>.vocabulary.txt
    path_candidates = "{}/vocabulary/{}.vocabulary.txt".format(
        args.dir_datasets, dataset_name_exp)
    candidates = utils.load_candidates(path_candidates, normalize=True)
    print("Nb candidates: {}".format(len(candidates)))

    # Load queries
    print("Loading queries...")
    path_q_train = "{}/training/data/{}.training.data.txt".format(
        args.dir_datasets, dataset_name_exp)
    path_q_dev = "{}/trial/data/{}.trial.data.txt".format(
        args.dir_datasets, dataset_name_exp)
    path_q_test = "{}/test/data/{}.test.data.txt".format(
        args.dir_datasets, dataset_name_exp)
    # second return value of load_queries is discarded — presumably gold
    # labels/answers; verify against utils.load_queries
    q_train, _ = utils.load_queries(path_q_train, normalize=True)
    q_dev, _ = utils.load_queries(path_q_dev, normalize=True)
    q_test, _ = utils.load_queries(path_q_test, normalize=True)
    print("Nb training queries: {}".format(len(q_train)))
    print("Nb dev queries: {}".format(len(q_dev)))