Example #1
def save_to_csv():
    gold = get_claim_perspective_id_dict()

    def routine(claims, out_path):
        payloads = predict_by_elastic_search(claims, 50)
        head = ['sentence1', 'sentence2', 'gold_label', 'cid', 'pid']
        rows = []
        for cid, data_list in payloads:
            gold_pids = gold[cid]
            all_pid_set = set(flatten(gold_pids))
            for p_entry in data_list:
                c_text = p_entry['claim_text']
                p_text = p_entry['perspective_text']
                y = 1 if p_entry['pid'] in all_pid_set else 0
                row = [c_text, p_text, y, cid, p_entry['pid']]
                rows.append(row)
        # Use a context manager so the output file is flushed and closed;
        # newline="" avoids blank lines between rows on some platforms.
        with open(out_path, "w", encoding="utf-8", newline="") as f:
            writer = csv.writer(f, dialect='excel-tab')
            writer.writerows([head] + rows)

    claims, val = train_split()
    routine(claims, get_file_path('train'))
    d_ids: List[int] = list(load_dev_claim_ids())
    claims = get_claims_from_ids(d_ids)
    routine(claims, get_file_path('dev'))
    d_ids: List[int] = list(load_test_claim_ids())
    claims = get_claims_from_ids(d_ids)
    routine(claims, get_file_path('test'))
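
Note: in the example above, `gold[cid]` holds clusters of equivalent perspective ids (see Example #27, which iterates them as clusters) and `flatten` collapses them into one collection before the membership test that sets `gold_label`. A minimal sketch of such a helper; the project's own version may differ:

def flatten(list_of_lists):
    # Collapse one level of nesting: [[1, 2], [3]] -> [1, 2, 3]
    return [item for inner in list_of_lists for item in inner]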
Example #2
def run_reweight():
    d_ids: List[int] = list(load_dev_claim_ids())
    claims = get_claims_from_ids(d_ids)
    top_k = 7
    param = {'k1': 0.5}
    pred = predict_by_reweighter(get_bm25_module(), claims, top_k, param)
    print(evaluate(pred))
Example #3
def run_bm25_rm():
    d_ids: List[int] = list(load_dev_claim_ids())
    claims = get_claims_from_ids(d_ids)
    rm_info = load_from_pickle("perspective_dev_claim_rm")
    top_k = 7
    pred = predict_by_bm25_rm(get_bm25_module(), rm_info, claims, top_k)
    print(evaluate(pred))
Example #4
def pc_new_init_prob():
    d_ids: List[int] = list(load_dev_claim_ids())
    claims = get_claims_from_ids(d_ids)
    claim_d = claims_to_dict(claims)
    bias_plus_word: Counter = load_from_pickle("bias_plus_words")
    tokenizer = PCTokenizer()

    # base_p is the largest bias score; each claim-token occurrence is weighted by it below.
    base_p = max(bias_plus_word.values())

    init_p_score_d = {}
    for cid in d_ids:
        c_text = claim_d[cid]
        tokens = tokenizer.tokenize_stem(c_text)

        score_for_cid = Counter()
        for t, cnt in Counter(tokens).items():
            prob = cnt * base_p
            score_for_cid[t] = prob

        for t, score in bias_plus_word.items():
            score_for_cid[t] += score

        score_for_cid = normalize_counter_to_sum1(score_for_cid)
        init_p_score_d[cid] = score_for_cid

    save_to_pickle(init_p_score_d, "pc_dev_new_init_prob")
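
Note: `normalize_counter_to_sum1` turns the per-claim scores into a distribution that sums to one. A minimal sketch, assuming it simply divides by the total (the real helper may treat an empty counter differently):

from collections import Counter

def normalize_counter_to_sum1(counter: Counter) -> Counter:
    # Scale every value so the counter sums to 1; leave a zero-sum counter unchanged.
    total = sum(counter.values())
    if total == 0:
        return counter
    return Counter({k: v / total for k, v in counter.items()})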
Example #5
def claim_language_model_property():
    dev_claim_ids = load_dev_claim_ids()
    claims = get_claims_from_ids(dev_claim_ids)
    all_ranked_list = ClaimRankedList()
    all_voca = set()
    candidate_k = 50
    for claim in claims:
        claim_text, perspectives = get_perspective(claim, candidate_k)
        print(claim_text)
        unigrams = get_relevant_unigrams(perspectives)
        ranked_list = all_ranked_list.get(str(claim['cId']))
        doc_ids = [t[0] for t in ranked_list]
        print("Loading documents")
        preload_tf(doc_ids)
        docs = lmap(load_and_format_doc, doc_ids)

        foreach(lambda doc: all_voca.update(doc['tokens_set']), docs)

        # check hypothesis
        # check_hypothesis(all_voca, cdf_cont, cdf_ncont, clueweb_cdf, clueweb_ctf, clueweb_df, clueweb_tf, ctf_cont,
        #                  ctf_ncont, df_cont, df_ncont, tf_cont, tf_ncont, unigrams)

        print("counting terms stat")

        lm_classifier = build_lm(docs, unigrams)

        for p_entry in perspectives:
            _text, _pid, _score = p_entry
            tokens = nltk.word_tokenize(_text)
            score = sum(lmap(lm_classifier.per_token_odd, tokens))
            print(_text, score)
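
Note: `lmap` and `foreach`, which appear in this and later examples, look like thin functional wrappers over the built-ins. A minimal sketch of the assumed behavior:

def lmap(fn, iterable):
    # map() materialized into a list
    return list(map(fn, iterable))

def foreach(fn, iterable):
    # Apply fn to every element purely for its side effects
    for item in iterable:
        fn(item)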
Example #6
def save_dev():
    save_name = "pc_dev_a_passages"
    q_res_path = FilePath(
        "/mnt/nfs/work3/youngwookim/data/perspective/dev_claim/q_res_100")
    d_ids = list(load_dev_claim_ids())
    claims: List[Dict] = get_claims_from_ids(d_ids)
    a_relevant_candidate(save_name, q_res_path, claims)
Example #7
def write_claim_perspective_pair_as_query():
    split = "dev"
    assert split in ["train", "dev", "test"]

    d_ids = list({
        "train": load_train_claim_ids(),
        "dev": load_dev_claim_ids(),
        "test": load_test_claim_ids()
    }[split])
    claims = get_claims_from_ids(d_ids)
    print(len(claims), " claims")
    is_train = split == "train"
    all_data_points = get_candidates(claims, is_train)
    k = 0

    def get_query_entry_from_data_point(x: PerspectiveCandidate) -> DocQuery:
        tokens = clean_tokenize_str_to_tokens(x.claim_text + " " + x.p_text)
        qid = "{}_{}".format(x.cid, x.pid)
        return format_query_bm25(qid, tokens, k)

    queries = lmap(get_query_entry_from_data_point, all_data_points)

    out_dir = query_dir_format.format(split)
    exist_or_mkdir(out_dir)
    n_query_per_file = 50

    write_queries_to_files(n_query_per_file, out_dir, queries)
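
Note: judging by its arguments, `write_queries_to_files` splits the query list into chunks of `n_query_per_file` and writes one file per chunk. A hypothetical sketch of that behavior; the file naming and serialization format are assumptions:

import json
import os

def write_queries_to_files(n_query_per_file, out_dir, queries):
    # Write queries in fixed-size chunks: 0.json, 1.json, ...
    for file_idx, start in enumerate(range(0, len(queries), n_query_per_file)):
        chunk = queries[start:start + n_query_per_file]
        with open(os.path.join(out_dir, "{}.json".format(file_idx)), "w") as f:
            json.dump(chunk, f)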
Example #8
def main():
    docs: Dict[str, List[List[str]]] = load_from_pickle("dev_claim_docs")
    _, clue12_13_df = load_clueweb12_B13_termstat()
    d_ids: List[int] = list(load_dev_claim_ids())
    claims = get_claims_from_ids(d_ids)
    r = select_paragraph(docs, clue12_13_df, claims, "topk")
    save_to_pickle(r, "dev_claim_paras")
Example #9
def run_eval_with_dict(pickle_name):
    d_ids: List[int] = list(load_dev_claim_ids())
    claims = get_claims_from_ids(d_ids)
    print("targets", len(claims))
    top_k = 8
    pc_score_d = load_from_pickle(pickle_name)
    pred = predict_from_dict(pc_score_d, claims, top_k)
    print(evaluate(pred))
Example #10
def run_eval_with_two_dict():
    d_ids: List[int] = list(load_dev_claim_ids())
    claims = get_claims_from_ids(d_ids)
    print("targets", len(claims))
    top_k = 7
    pc_score_d = load_from_pickle("pc_bert_baseline_score_d")
    pc_score_d2 = load_from_pickle("pc_random_walk_based_score_d")
    pred = predict_from_two_dict(pc_score_d, pc_score_d2, claims, top_k)
    print(evaluate(pred))
Example #11
def save_random_walk_pred():
    d_ids: List[int] = list(load_dev_claim_ids())
    claims = get_claims_from_ids(d_ids)
    top_k = 50
    q_tf_replace = dict(load_from_pickle("random_walk_score_100"))
    bm25 = get_bm25_module()
    pred = pc_predict_from_vector_query(bm25, q_tf_replace, claims, top_k)
    score_d = prediction_to_dict(pred)
    save_to_pickle(score_d, "pc_random_walk_based_score_d")
Example #12
def run_lm():
    d_ids: List[int] = list(load_dev_claim_ids())
    claims = get_claims_from_ids(d_ids)
    top_k = 5
    # NOTE: the first load is immediately overridden; only "random_walk_score_100" is used.
    # q_tf_replace = dict(load_from_pickle("pc_dev_par_tf"))
    q_tf_replace = dict(load_from_pickle("random_walk_score_100"))
    bm25 = get_bm25_module()
    ctf = load_collection_tf()
    pred = predict_by_lm(q_tf_replace, ctf, bm25, claims, top_k)
    print(evaluate(pred))
Example #13
def run_random_walk_score_with_weight():
    d_ids: List[int] = list(load_dev_claim_ids())
    claims = get_claims_from_ids(d_ids)
    top_k = 7
    q_tf_replace = dict(load_from_pickle("random_walk_score_100"))
    q_tf_replace = dict_key_map(lambda x: int(x), q_tf_replace)
    bm25 = get_bm25_module()
    pred = pc_predict_vector_query_and_reweight(bm25, q_tf_replace, claims,
                                                top_k, {'k1': 0.5})
    print(evaluate(pred))
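
Note: `dict_key_map` (also used in Examples #17 and #18) applies a function to every key of a dict; here it casts the pickled string keys back to int. A minimal sketch of the assumed behavior:

def dict_key_map(fn, d):
    # Rebuild the dict with fn applied to each key, keeping the values.
    return {fn(k): v for k, v in d.items()}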
Example #14
def generate_classification_payload():
    claims, val = train_split()
    top_k = 50
    pred = predict_by_elastic_search(claims, top_k)
    save_to_pickle(pred, "perspective_cls_train_X")
    d_ids: List[int] = list(load_dev_claim_ids())
    claims = get_claims_from_ids(d_ids)
    top_k = 50
    pred = predict_by_elastic_search(claims, top_k)
    save_to_pickle(pred, "perspective_cls_dev_X")
Example #15
def run_write_claims_as_plain_query():
    for claim_ids, out_name in [
        (load_train_claim_ids(), "train_claim_query_raw.txt"),
        (load_dev_claim_ids(), "dev_claim_query_raw.txt")
    ]:
        claims = get_claims_from_ids(claim_ids)
        q_str_list = get_claims_as_plain_query(claims)
        # Write one query string per line; the context manager closes the file.
        with open(pjoin(output_path, out_name), "w") as f:
            for s in q_str_list:
                f.write(s + "\n")
Example #16
def main():
    d_ids: List[int] = list(load_dev_claim_ids())
    claims = get_claims_from_ids(d_ids)
    top_k = 50
    q_tf_replace = dict(load_from_pickle("random_walk_score_100"))
    bm25 = get_bm25_module()
    pred2 = pc_predict_from_vector_query(bm25, q_tf_replace, claims, top_k)
    pc_score_d = load_from_pickle("pc_bert_baseline_score_d")
    pred1 = predict_from_dict(pc_score_d, claims, top_k)

    compare_two_runs(pred1, pred2)
Example #17
def run_random_walk_score():
    d_ids: List[int] = list(load_dev_claim_ids())
    claims = get_claims_from_ids(d_ids)
    top_k = 7
    q_tf_replace = dict(load_from_pickle("random_walk_score_100"))
    q_tf_replace = dict_key_map(lambda x: int(x), q_tf_replace)
    #q_tf_replace = dict(load_from_pickle("pc_dev_par_tf"))
    #q_tf_replace = dict(load_from_pickle("bias_random_walk_dev_plus_all"))
    bm25 = get_bm25_module()
    pred = pc_predict_from_vector_query(bm25, q_tf_replace, claims, top_k)
    print(evaluate(pred))
Example #18
def run_random_walk_score():
    d_ids: List[int] = list(load_dev_claim_ids())
    claims = get_claims_from_ids(d_ids)
    top_k = 20
    bm25 = get_bm25_module()
    #df, N = get_idf()
    #bm25.df = df
    #bm25.N = N
    q_tf_replace_0 = dict(load_from_pickle("random_walk_score_100"))
    q_tf_replace = dict(load_from_pickle("dev_claim_random_walk_debug2"))
    q_tf_replace = dict_key_map(lambda x: int(x), q_tf_replace)
    pc_predict_to_inspect(bm25, q_tf_replace, q_tf_replace_0, claims, top_k)
Example #19
def show_random_walk_score():
    d_ids: List[int] = list(load_dev_claim_ids())
    claims = get_claims_from_ids(d_ids)
    claim_d = claims_to_dict(claims)

    top_k = 7
    q_tf_replace = dict(load_from_pickle("bias_random_walk_dev_plus_all"))

    for claim_id, qtf in q_tf_replace.items():
        print(claim_d[claim_id])
        print(qtf.most_common(100))
    print("")
Example #20
def run_baseline():
    d_ids: List[int] = list(load_dev_claim_ids())
    claims = get_claims_from_ids(d_ids)
    print("targets", len(claims))
    top_k = 5
    score_pred_file: FileName = FileName("pc_para_D_pred_dev_11")
    cpid_resolute_file: FileName = FileName("resolute_dict_dev_11")
    # score_pred_file: FileName = FileName("pc_para_D_pred_dev")
    # cpid_resolute_file: FileName = FileName("resolute_dict_dev")
    pred = predict_by_para_scorer(score_pred_file, cpid_resolute_file, claims,
                                  top_k)
    print(evaluate(pred))
Example #21
def run_lm2():
    d_ids: List[int] = list(load_dev_claim_ids())
    claims = get_claims_from_ids(d_ids)
    top_k = 5
    # NOTE: PCTokenizer is instantiated but unused; claims are tokenized with nltk below.
    tokenizer = PCTokenizer()
    tf_d = {
        c['cId']: Counter(nltk.tokenize.word_tokenize(c['text']))
        for c in claims
    }
    bm25 = get_bm25_module()
    ctf = get_perspective_tf()
    pred = predict_by_lm(tf_d, ctf, bm25, claims, top_k)
    print(evaluate(pred))
Example #22
def write_claim_queries_k0():
    def write(claim_ids, split_name):
        claims = get_claims_from_ids(claim_ids)
        queries = get_claims_query(claims, True)
        out_path = os.path.join(
            output_path,
            "perspective_{}_claim_query_k0.json".format(split_name))
        save_queries_to_file(queries, out_path)

    claim_ids, split_name = (load_train_claim_ids(), "train")

    write(claim_ids, split_name)
    claim_ids, split_name = (load_dev_claim_ids(), "dev")
    write(claim_ids, split_name)
Example #23
def load_cppnc_score_and_baseline_and_group(save_name):
    out_dir = os.path.join(output_path, "cppnc")
    exist_or_mkdir(out_dir)
    info_file_path = os.path.join(out_dir, save_name + ".info")
    pred_file_path = os.path.join(out_dir, save_name + ".score")
    d_ids = list(load_dev_claim_ids())
    claims: List[Dict] = get_claims_from_ids(d_ids)
    claim_d = {c['cId']: c['text'] for c in claims}
    cid_grouped = load_and_group_predictions(info_file_path, pred_file_path)
    # The save_name parameter is reused here to point at the fixed baseline run.
    save_name = "baseline_cppnc"
    baseline_info_file_path = os.path.join(out_dir, save_name + ".info")
    baseline_pred_file_path = os.path.join(out_dir, save_name + ".score")
    baseline_cid_grouped = load_and_group_predictions(baseline_info_file_path,
                                                      baseline_pred_file_path)
    return baseline_cid_grouped, cid_grouped, claim_d
Example #24
def main():
    args = parser.parse_args(sys.argv[1:])
    prediction_path = args.prediction_path
    data_id_info: Dict = load_from_pickle("pc_dev_passage_payload_info")
    save_name = args.save_name

    d_ids = list(load_dev_claim_ids())

    dev_claims: List[Dict] = get_claims_from_ids(d_ids)
    candidate_perspectives: Dict[int, List[Dict]] = dict(
        get_eval_candidates_from_pickle("dev"))
    config = json.load(open(args.config_path, "r"))
    print(config)
    make_cppnc_problem(prediction_path, data_id_info, dev_claims,
                       candidate_perspectives, config, save_name, encode_inner)
Example #25
def work():
    claim_ids, split_name = (load_train_claim_ids(), "train")
    print("Num claims in train : ", len(list(claim_ids)))

    # NOTE: exit() stops here, so the job submission below never runs for the dev split.
    exit()

    def submit_jobs_inner(claim_ids, split_name):
        claims = get_claims_from_ids(claim_ids)
        queries = get_claims_query(claims)
        out_root = "/mnt/nfs/work3/youngwookim/data/perspective/{}_claim_rm3".format(
            split_name)
        exist_or_mkdir(out_root)
        submit_rm_jobs(queries, out_root)

    claim_ids, split_name = (load_dev_claim_ids(), "dev")
    submit_jobs_inner(claim_ids, split_name)
Example #26
def start_generate_jobs_for_dev(generator: InstanceGenerator, name_prefix):
    # claim ids split to train/val
    print("Loading data ....")
    d_ids: List[int] = list(load_dev_claim_ids())
    claims = get_claims_from_ids(d_ids)

    cids = {str(t['cId']) for t in claims}
    qk_candidate: List[QKUnit] = load_qk_candidate_dev()
    qk_candidate_val = list(
        [qk for qk in qk_candidate if qk[0].query_id in cids])

    print("Generate instances : dev")

    def worker_factory(out_dir):
        return QCKWorker(qk_candidate_val, generator, out_dir)

    runner = JobRunnerS(job_man_dir, 138, name_prefix + "_dev", worker_factory)
    runner.start()
Example #27
def perspective_lm_correlation():
    d_ids = list(load_dev_claim_ids())
    claims = get_claims_from_ids(d_ids)
    top_k = 20
    gold = get_claim_perspective_id_dict()
    predictions = predict_with_lm(claims, top_k)

    avg_pos_list = []
    avg_neg_list = []
    for c_Id, prediction_list in predictions:
        gold_pids = gold[c_Id]
        claim_text = prediction_list[0]['claim_text']

        pos_list = []
        neg_list = []
        print("Claim {}: ".format(c_Id), claim_text)
        for prediction in prediction_list:
            pid = prediction['pid']
            valid = False
            for cluster in gold_pids:
                if pid in cluster:
                    valid = True
                    break
            print("{0} {1:.2f} {2}".format(valid, prediction['lm_score'],
                                           prediction['perspective_text']))
            if not valid:
                neg_list.append(prediction['lm_score'])
            else:
                pos_list.append(prediction['lm_score'])

        if pos_list and neg_list:
            pos_score = average(pos_list)
            neg_score = average(neg_list)
            avg_pos_list.append(pos_score)
            avg_neg_list.append(neg_score)

    # ttest_ind returns (t-statistic, p-value); "diff" here is the t-statistic, not a mean difference.
    diff, p = ttest_ind(avg_pos_list, avg_neg_list)
    print("pos", average(avg_pos_list), "neg", average(avg_neg_list))
    print("pos", avg_pos_list)
    print("neg", avg_neg_list)
    print(diff, p)
Example #28
def start_generate_jobs_for_dev(
        generator_functor: Callable[[Dict[int, List[Tuple[List[str], float]]]],
                                    CPPNCGeneratorInterface], writer,
        name_prefix):
    # claim ids split to train/val
    d_ids: List[int] = list(load_dev_claim_ids())
    claims = get_claims_from_ids(d_ids)
    data = load_from_pickle("pc_dev_a_passages")
    entries, all_passages = data
    cid_to_passages: Dict[int, List[Tuple[List[str], float]]] = {
        claim['cId']: p
        for claim, p in entries
    }
    generator = generator_functor(cid_to_passages)

    print("Generate instances : dev")

    def worker_factory(out_dir):
        return CPPNCWorker(claims, generator, writer, out_dir)

    runner = JobRunner(job_man_dir, 138, name_prefix + "_dev", worker_factory)
    runner.start()
Example #29
def work():
    split = "train"
    assert split in ["train", "dev", "test"]

    tokenizer = PCTokenizer()
    d_ids = list({
        "train": load_train_claim_ids(),
        "dev": load_dev_claim_ids(),
        "test": load_test_claim_ids()
    }[split])
    claims = get_claims_from_ids(d_ids)
    claim_d = claims_to_dict(claims)

    print(len(claims), " claims")
    do_balance = False
    all_data_points: List[PerspectiveCandidate] = get_candidates(
        claims, do_balance)

    grouped: Dict[str, List] = group_by(all_data_points, lambda x: x.cid)

    def get_frequency_per_class(datapoints: List[PerspectiveCandidate]):
        pos_text = []
        neg_text = []
        for dp in datapoints:
            tokens = tokenizer.tokenize_stem(dp.p_text)
            tf = Counter(tokens)
            dl = sum(tf.values())
            tf_rel = {k: v / dl for k, v in tf.items()}

            if dp.label == "1":
                pos_text.append(tf_rel)
            elif dp.label == "0":
                neg_text.append(tf_rel)
            else:
                assert False

        def accumulate(tf_list: List[Dict]):
            out_c = Counter()
            n = len(tf_list)
            for tf in tf_list:
                for k, v in tf.items():
                    out_c[k] += v / n

            return out_c

        pos_avg_tf = accumulate(pos_text)
        neg_avg_tf = accumulate(neg_text)
        return pos_avg_tf, neg_avg_tf

    class_freq: Dict[str,
                     Tuple[Counter,
                           Counter]] = dict_value_map(get_frequency_per_class,
                                                      grouped)

    save_to_pickle(class_freq, "per_claim_class_word_tf_{}".format(split))

    def normalize(s_list: List[float]) -> List[float]:
        m = sum(s_list)
        return list([s / m for s in s_list])

    pos_prob_dict = {}
    neg_prob_dict = {}

    for cid, info in class_freq.items():
        pos, neg = info
        all_words = set(pos.keys())
        all_words.update(neg.keys())

        info = []
        for word in all_words:
            score = pos[word] - neg[word]
            info.append((word, score))

        pos_scores = list([(w, s) for w, s in info if s > 0])
        neg_scores = list([(w, s) for w, s in info if s < 0])

        def normalize_right(pair_list):
            right_scores = normalize(right(pair_list))
            return list(zip(left(pair_list), right_scores))

        pos_prob_dict[cid] = normalize_right(pos_scores)
        neg_prob_dict[cid] = normalize_right(neg_scores)

    save_to_pickle(pos_prob_dict, "pc_pos_word_prob_{}".format(split))
    save_to_pickle(neg_prob_dict, "pc_neg_word_prob_{}".format(split))
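
Note: `normalize_right` above relies on `left` and `right` helpers for lists of (word, score) pairs. A minimal sketch of the assumed behavior:

def left(pairs):
    # First element of each pair: [(w, s), ...] -> [w, ...]
    return [p[0] for p in pairs]

def right(pairs):
    # Second element of each pair: [(w, s), ...] -> [s, ...]
    return [p[1] for p in pairs]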
Example #30
def save_dev_candidate():
    d_ids: List[int] = list(load_dev_claim_ids())
    claims = get_claims_from_ids(d_ids)
    candidates: List[Tuple[Dict, List[Dict]]] = get_all_candidate(claims)
    save_to_pickle(candidates, "pc_dev_candidate")