Example #1
def run():
    in_dir_name = FileName("ukp_paragraph_tfrecord_dev_abortion")
    out_dir_name = FileName("ukp_paragraph_tfrecord_dev_abortion_2way")
    run_dir(in_dir_name, out_dir_name)

    in_dir_name = FileName("ukp_paragraph_tfrecord_train_abortion")
    out_dir_name = FileName("ukp_paragraph_tfrecord_train_abortion_2way")
    run_dir(in_dir_name, out_dir_name)
Example #2
def count_tf():
    continuation_tokens = get_continuation_token_ids()

    dir_path = pjoin(output_path, FileName("nli_tfrecord_cls_300"))

    tf_train = build_word_tf(continuation_tokens, pjoin(dir_path, FileName("train")))
    tf_dev = build_word_tf(continuation_tokens, pjoin(dir_path, FileName("dev_mis")))

    save_to_pickle(tf_dev, "nli_tf_dev_mis")
    save_to_pickle(tf_train, "nli_tf_train")
Example #3
def run_para_scorer():
    claims, val = train_split()
    top_k = 6

    target = filter_avail(val)
    print("targets", len(target))
    score_pred_file: FileName = FileName("pc_para_D_pred")
    cpid_resolute_file: FileName = FileName("resolute_dict_580_606")
    pred = predict_by_para_scorer(score_pred_file, cpid_resolute_file, target,
                                  top_k)
    print(evaluate(pred))
Example #4
def count_tf():
    continuation_tokens = get_continuation_token_ids()

    dir_path = pjoin(output_path, FileName("eHealth"))

    tf_train = build_word_tf(continuation_tokens,
                             pjoin(dir_path, FileName("tfrecord_train")))
    tf_dev = build_word_tf(continuation_tokens,
                           pjoin(dir_path, FileName("tfrecord_test")))

    save_to_pickle(tf_train, "clef1_tf_train")
    save_to_pickle(tf_dev, "clef1_tf_test")
Example #5
def run_baseline():
    d_ids: List[int] = list(load_dev_claim_ids())
    claims = get_claims_from_ids(d_ids)
    print("targets", len(claims))
    top_k = 5
    score_pred_file: FileName = FileName("pc_para_D_pred_dev_11")
    cpid_resolute_file: FileName = FileName("resolute_dict_dev_11")
    # score_pred_file: FileName = FileName("pc_para_D_pred_dev")
    # cpid_resolute_file: FileName = FileName("resolute_dict_dev")
    pred = predict_by_para_scorer(score_pred_file, cpid_resolute_file, claims,
                                  top_k)
    print(evaluate(pred))
Example #6
def filter_avail(claims):
    cpid_resolute: Dict[str, CPID] = load_cpid_resolute(
        FileName("resolute_dict_580_606"))
    cid_list: List[int] = lmap(lambda x: int(x.split("_")[0]),
                               cpid_resolute.values())
    cid_set: Set[int] = set(cid_list)
    return lfilter(lambda x: x['cId'] in cid_set, claims)
Example #7
def compare_before_after():
    tokenizer = get_tokenizer()

    ids = tokenizer.convert_tokens_to_ids(tokenizer.tokenize("heavy metal"))
    dir_name = pjoin(pjoin(output_path, FileName("model")),
                     FileName("alt_emb_heavy_metal_D"))
    before = pjoin(dir_name, FileName("model.ckpt-0"))
    after = pjoin(dir_name, FileName("model.ckpt-10000"))

    v1_d = load_checkpoint_vars(before)
    v2_d = load_checkpoint_vars(after)

    # Report checkpoint variables whose values changed between before and after
    for key in v1_d:
        if key in v2_d:
            s = np.sum(v1_d[key] - v2_d[key])
            if np.abs(s) > 0.01:
                print(key, s)

    # Original embedding table, and the alternative embedding table
    # at step 0 (before) and step 10000 (after)
    ori_emb = v2_d['bert/embeddings/word_embeddings']
    alt_emb_before = v1_d['bert/embeddings/word_embeddings_alt']
    alt_emb_after = v2_d['bert/embeddings/word_embeddings_alt']

    def show_diff_from_ori(token_id):
        diff = np.sum(np.abs(ori_emb[token_id] - alt_emb_after[token_id]))
        print(token_id, diff)

    def show_diff_from_step0(token_id):
        diff = np.sum(
            np.abs(alt_emb_before[token_id] - alt_emb_after[token_id]))
        print(token_id, diff)

    print("Diff against original embedding")
    print("Target words")
    for token_id in ids:
        show_diff_from_ori(token_id)

    print("Random words")
    for token_id in [321, 598, 5854]:
        show_diff_from_ori(token_id)

    print("Diff against step0 random init embedding")
    print("Target words")
    for token_id in range(0, 30000):
        diff = np.sum(
            np.abs(alt_emb_before[token_id] - alt_emb_after[token_id]))
        if diff > 0.001:
            print(token_id, diff)
Example #8
def print_features():
    job_dir = "ukp_paragraph_feature_2"
    job_id = 0
    file_path = os.path.join(sydney_working_dir, job_dir, str(job_id))
    features: List[ParagraphFeature] = pickle.load(
        open(file_path, "rb"))

    out_path = pjoin(output_path, FileName("ukp_paragraph_feature_2.html"))
    print_paragraph_feature(features, out_path)
Example #9
def run_dir(in_dir_name: FileName, out_dir_name: FileName):
    in_dir = pjoin(sydney_working_dir, in_dir_name)
    out_dir = pjoin(sydney_working_dir, out_dir_name)
    exist_or_mkdir(out_dir)

    for file_path in get_dir_files(in_dir):
        name = FileName(os.path.basename(file_path))
        out_path = pjoin(out_dir, name)
        convert_to_2way(file_path, out_path)
Example #10
def eval_from_prediction(prediction_path):
    cpid_resolute_file: FileName = FileName("resolute_dict_dev_11")
    top_k = 5
    cpid_resolute: Dict[str, CPID] = load_cpid_resolute(cpid_resolute_file)

    print("cpid_resolute has {}".format(len(cpid_resolute)))
    strategy = "avg"
    score_d: Dict[CPID, float] = get_cpid_score(prediction_path, cpid_resolute,
                                                strategy)
    return eval_from_score_d(score_d, top_k)
Example #11
    def __init__(self, word_list: List[str], out_path):
        self.out_dir = out_path
        tokenizer = get_tokenizer()
        self.seq_set: List[List[int]] = []
        self.input_dir = pjoin(sydney_working_dir,
                               FileName("alt_emb_heavy_metal"))

        for word in word_list:
            subwords = tokenizer.tokenize(word)
            ids = tokenizer.convert_tokens_to_ids(subwords)
            print(subwords, ids)
            self.seq_set.append(ids)
Example #12
def print_features():
    job_dir = "perspective_paragraph_feature"
    job_id = 0
    file_path = os.path.join(sydney_working_dir, job_dir, str(job_id))

    features: List[ParagraphClaimPersFeature] = pickle.load(
        open(file_path, "rb"))
    features: List[ParagraphFeature] = lmap(to_paragraph_feature, features)

    out_path = pjoin(output_path,
                     FileName("perspective_paragraph_feature.html"))
    print_paragraph_feature(features, out_path)
Example #13
def show_bert_nli_diff():
    model_dir = pjoin(pjoin(output_path, FileName("model")), FileName("runs"))
    nli = os.path.join(model_dir, FileName("nli"),
                       FileName("model.ckpt-75000"))
    bert = os.path.join(model_dir, FileName("uncased_L-12_H-768_A-12"),
                        FileName("bert_model.ckpt"))

    show_embedding_difference(bert, nli)
Example #14
def get_nli_and_bert_embeddings():
    model_dir = pjoin(pjoin(output_path, FileName("model")), FileName("runs"))
    nli = os.path.join(model_dir, FileName("nli"),
                       FileName("model.ckpt-75000"))
    bert = os.path.join(model_dir, FileName("uncased_L-12_H-768_A-12"),
                        FileName("bert_model.ckpt"))
    nli_emb = get_embedding_table(nli)
    bert_emb = get_embedding_table(bert)
    return bert_emb, nli_emb
Example #15
def work():
    q_config_id = Q_CONFIG_ID_BM25_UKP
    ranked_list_save_root = get_ranked_list_save_dir(q_config_id)
    doc_ids = set()
    ticker = TimeEstimator(num_query_file)
    for i in range(num_query_file):
        file_name = FileName("{}_{}.txt".format(index_name_list[0], str(i)))
        ranked_list_path = pjoin(ranked_list_save_root, file_name)
        rl: Dict[str, List[SimpleRankedListEntry]] = load_galago_ranked_list(
            ranked_list_path)

        # Collect doc ids from the top 100 entries of each query's ranked list
        for key, value in rl.items():
            for entry in value[:100]:
                doc_ids.add(entry.doc_id)
        ticker.tick()

    f = open("{}_uniq_100".format(q_config_id), "w")
    for doc_id in doc_ids:
        f.write("{}\n".format(doc_id))
    f.close()
Example #16
def get_query_file(query_collection_id, i) -> FilePath:
    return pjoin(get_query_dir(query_collection_id),
                 FileName("{}.json".format(i)))
Example #17
from arg.ukp.eval import eval
from base_type import FileName

if __name__ == "__main__":
    pred_file = FileName("ukp_para_pred")
    resolute_file = FileName("ukp_resolute_dict")
    eval(pred_file, resolute_file)
Example #18
from arg.ukp.eval import eval
from base_type import FileName

if __name__ == "__main__":
    pred_file = FileName("ukp_pred_para_E_2way")
    resolute_file = FileName("ukp_resolute_dict")
    eval(pred_file, resolute_file, n_way=2)
Example #19
def get_query_dir(query_collection_id) -> FilePath:
    out_dir = pjoin(output_path,
                    FileName("ukp_query_{}".format(query_collection_id)))
    return out_dir
Example #20
            html = html_pos if is_true else html_neg
            claim, perspective, paragraph = split_3segments(tokens)
            highlight_terms = set(claim + perspective)
            if is_true:
                html.write_paragraph("{} of {}".format(idx_true, cnt_true))
                idx_true += 1
            else:
                html.write_paragraph("{} of {}".format(idx_false, cnt_false))
                idx_false += 1

            html.write_paragraph("claim : " + pretty_tokens(claim))
            html.write_paragraph("perspective : " + pretty_tokens(perspective))

            def make_cell(subword: Subword):
                if subword in highlight_terms:
                    return Cell(subword, highlight_score=100)
                else:
                    return Cell(subword)

            cells = lmap(make_cell, paragraph)
            html.multirow_print(cells)

        if item_cnt > 100:
            break


if __name__ == "__main__":
    pred_file = FileName("pc_para_D_pred_dev")
    pred_path: FilePath = pjoin(output_path, pred_file)
    print_file(pred_path)
Example #21
        pc_tokens: List[str] = nltk.word_tokenize(
            f.claim_pers.claim_text) + nltk.word_tokenize(f.claim_pers.p_text)
        pc_tokens_set = set([t.lower() for t in pc_tokens])
        print(pc_tokens_set)

        def get_cell(token) -> Cell:
            if token.lower() in pc_tokens_set:
                score = 100
            else:
                score = 0
            return Cell(token, score)

        html_visualizer.write_paragraph("Label : {}".format(
            f.claim_pers.label))
        for score_paragraph in f.feature:
            paragraph = score_paragraph.paragraph
            cells = [get_cell(t) for t in paragraph.tokens]
            html_visualizer.write_paragraph("---")
            html_visualizer.multirow_print(cells, width=20)


if __name__ == "__main__":
    input_job_name: FileName = FileName("perspective_paragraph_feature_dev")
    input_dir = pjoin(output_path, input_job_name)
    job_id = 0
    features: List[ParagraphClaimPersFeature] = pickle.load(
        open(pjoin(input_dir, FileName(str(job_id))), "rb"))
    html = HtmlVisualizer("pers_dev_para_features.html")
    show(html, features)
Example #22
def get_ranked_list_save_dir(q_config_id):
    return pjoin(subproject_hub, FileName("{}_q_res".format(q_config_id)))