Example #1
def a_relevant():
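    # For the first 10 training claims, score their top-ranked documents by
    # the log-odds of a smoothed per-claim gold LM against the background LM,
    # and print a table of (rank, avg score, max score) per document.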
    d_ids = list(load_train_claim_ids())
    claims: List[Dict] = get_claims_from_ids(d_ids)
    claim_lms = build_gold_lms(claims)
    claim_lms_d = {lm.cid: lm for lm in claim_lms}
    bg_lm = average_counters(lmap(lambda x: x.LM, claim_lms))
    log_bg_lm = get_lm_log(bg_lm)

    claims = claims[:10]
    top_n = 100
    q_res_path = FilePath(
        "/mnt/nfs/work3/youngwookim/data/perspective/train_claim/q_res_100")
    ranked_list: Dict[
        str, List[SimpleRankedListEntry]] = load_galago_ranked_list(q_res_path)
    preload_docs(ranked_list, claims, top_n)

    stopwords = load_stopwords_for_query()
    alpha = 0.7

    tokenizer = PCTokenizer()
    for c in claims:
        q_res: List[SimpleRankedListEntry] = ranked_list[str(c['cId'])]
        claim_lm = claim_lms_d[c['cId']]
        log_topic_lm = get_lm_log(smooth(claim_lm.LM, bg_lm, alpha))
        log_odd: Counter = subtract(log_topic_lm, log_bg_lm)

        def get_passage_score(p):
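            # Average per-token log-odds over the passage; stopwords count as
            # 0 and tokens are stemmed before lookup.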
            def get_score(t):
                if t in stopwords:
                    return 0
                return log_odd[tokenizer.stemmer.stem(t)]

            return sum([get_score(t) for t in p]) / len(p) if len(p) > 0 else 0

        docs = []
        for i in range(top_n):
            try:
                doc = load_doc(q_res[i].doc_id)
                docs.append(doc)
            except KeyError:
                docs.append(None)

        print(c['text'])
        rows = []
        for rank, doc in enumerate(docs):
            if doc is None:
                rows.append((rank, "-", "-"))
                continue

            scores = get_doc_score(doc, get_passage_score)
            avg_score = average(scores)
            max_score = max(scores)
            rows.append((rank, avg_score, max_score))

        print_table(rows)
Example #2
def main():
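    # Correlate the claim-LM log-odds score of each retrieved document with
    # its subjectivity predictions: subjective-sentence rate, number of
    # subjective sentences, and total number of sentences.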
    split = "train"
    subjectivity_path = sys.argv[1]
    q_res_path = sys.argv[2]
    ranked_list: Dict[str, List[SimpleRankedListEntry]] = load_galago_ranked_list(q_res_path)

    # load LM
    claim_lms: List[ClaimLM] = build_gold_lms_for_sub_split(split)
    bg_lm = average_counters(lmap(lambda x: x.LM, claim_lms))
    log_bg_lm = get_lm_log(bg_lm)
    alpha = 0.1
    stopwords = load_stopwords_for_query()
    # load subjectivity predictions.
    subj_d: Dict[str, Tuple[int, int]] = load_subjectivity(subjectivity_path)
    doc_ids = subj_d.keys()
    preload_man.preload(TokenizedCluewebDoc, doc_ids)
    tokenizer = PCTokenizer()

    lm_scores = []
    rates = []
    num_subj_list = []
    num_sent_list = []
    for claim_lm in claim_lms:
        qid = str(claim_lm.cid)
        log_topic_lm = get_lm_log(smooth(claim_lm.LM, bg_lm, alpha))
        log_odd: Counter = subtract(log_topic_lm, log_bg_lm)

        def get_passage_score(p):
            def get_score(t):
                if t in stopwords:
                    return 0
                return log_odd[tokenizer.stemmer.stem(t)]

            return sum([get_score(t) for t in p]) / len(p) if len(p) > 0 else 0

        for entry in ranked_list[qid]:
            if entry.doc_id in subj_d:
                tokens = load_doc(entry.doc_id)
                assert isinstance(tokens[0], str)
                lm_score = get_passage_score(tokens)
                num_subj, num_sent = subj_d[entry.doc_id]
                rate = num_subj / num_sent
                lm_scores.append(lm_score)
                rates.append(rate)
                num_subj_list.append(num_subj)
                num_sent_list.append(num_sent)

    print("Correlation of LM scores with:")
    print("rates: ", pearsonr(lm_scores, rates))
    print("num subj: ", pearsonr(lm_scores, num_subj_list))
    print("num sent: ", pearsonr(lm_scores, num_sent_list))
Example #3
def show_docs(claims, ranked_list, top_n):
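    # Print the text of the top_n ranked documents for each claim; documents
    # that cannot be loaded are silently skipped.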
    # for each claim
    for c in claims:
        q_res: List[SimpleRankedListEntry] = ranked_list[str(c['cId'])]
        print(c['cId'], c['text'])
        for i in range(top_n):
            try:
                doc = load_doc(q_res[i].doc_id)
                print()
                print("Doc rank {}".format(i))
                print(" ".join(doc))
            except KeyError:
                pass
        print("--------")
def iterate_docs(doc_ids: List[str]) -> Iterable[KnowledgeDocument]:
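    # Load each document by id, skipping ids that fail to load, and filter
    # out documents identified as duplicates.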
    docs = []
    for doc_id in doc_ids:
        try:
            tokens = load_doc(doc_id)
            kd = KnowledgeDocument(doc_id, tokens)
            docs.append(kd)
        except KeyError:
            pass

    if len(docs) < len(doc_ids):
        print("Retrieved {} of {} docs".format(len(docs), len(doc_ids)))
    duplicate_doc_ids = get_duplicate(docs)
    unique_docs = [d for d in docs if d.doc_id not in duplicate_doc_ids]
    return unique_docs
Example #5
def iterate_docs(q_res: List[SimpleRankedListEntry], top_n: int) -> Iterable[KnowledgeDocument]:
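    # Same as iterate_docs over doc ids, but takes the top_n entries of a
    # ranked list; missing documents are skipped and duplicates removed.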
    docs = []
    for i in range(top_n):
        try:
            tokens = load_doc(q_res[i].doc_id)
            kd = KnowledgeDocument(q_res[i].doc_id, tokens)
            docs.append(kd)
        except KeyError:
            pass

    if len(docs) < top_n:
        print("Retrieved {} of {} docs".format(len(docs), top_n))

    duplicate_doc_ids = get_duplicate(docs)
    unique_docs = [d for d in docs if d.doc_id not in duplicate_doc_ids]
    return unique_docs
Example #6
def report_missing(claims, ranked_list, top_n):
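    # For each claim, list the ranks (within top_n) whose documents cannot be
    # loaded, and accumulate the total count of missing documents.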
    # for each claim
    n_missing = 0
    for c in claims:
        q_res: List[SimpleRankedListEntry] = ranked_list[str(c['cId'])]
        print(c['cId'], c['text'])
        missing = []
        for i in range(top_n):
            try:
                doc = load_doc(q_res[i].doc_id)
            except KeyError:
                missing.append(i)
        print(missing)
        n_missing += len(missing)

    print("")
Example #7
def iterate_passages(q_res, top_n, get_passage_score):
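    # Split each unique retrieved document into non-overlapping 300-token
    # windows and return (passage, score) pairs scored by get_passage_score.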
    passages = []
    docs = []
    for i in range(top_n):
        try:
            doc = load_doc(q_res[i].doc_id)
            docs.append(doc)
        except KeyError:
            pass
    for doc in remove_duplicate(docs):
        idx = 0
        window_size = 300
        while idx < len(doc):
            p = doc[idx:idx + window_size]
            score = get_passage_score(p)
            passages.append((p, score))
            idx += window_size
    return passages
Example #8
def binary_feature_demo(datapoint_list):
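    # For each data point, write the retrieved documents to an HTML report:
    # highlight claim/perspective terms in the document and show the Galago
    # score next to an IDF-overlap score and a per-term BM25 re-estimate.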
    ci = PassageRankedListInterface(make_passage_query, Q_CONFIG_ID_BM25)
    not_found_set = set()
    _, clue12_13_df = load_clueweb12_B13_termstat()
    cdf = 50 * 1000 * 1000
    html = HtmlVisualizer("pc_binary_feature.html")

    def idf_scorer(doc, claim_text, perspective_text):
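        # Sum the IDF of claim/perspective tokens that occur in the document;
        # max_score is the IDF mass of all claim/perspective tokens.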
        cp_tokens = nltk.word_tokenize(claim_text) + nltk.word_tokenize(
            perspective_text)
        cp_tokens = lmap(lambda x: x.lower(), cp_tokens)
        cp_tokens = set(cp_tokens)
        mentioned_terms = lfilter(lambda x: x in doc, cp_tokens)
        mentioned_terms = re_tokenize(mentioned_terms)

        def idf(term):
            if term not in clue12_13_df:
                if term in string.printable:
                    return 0
                not_found_set.add(term)

            return math.log((cdf + 0.5) / (clue12_13_df[term] + 0.5))

        score = sum(lmap(idf, mentioned_terms))
        max_score = sum(lmap(idf, cp_tokens))
        # print(claim_text, perspective_text)
        # print(mentioned_terms)
        # print(score, max_score)
        return score, max_score, mentioned_terms

    def bm25_estimator(doc: Counter, claim_text: str, perspective_text: str):
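        # Per-term BM25 contributions (k1 = 0, fixed avdl of 1200) for
        # claim/perspective terms that appear in the document, using
        # ClueWeb12-B13 document frequencies.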
        cp_tokens = nltk.word_tokenize(claim_text) + nltk.word_tokenize(
            perspective_text)
        cp_tokens = lmap(lambda x: x.lower(), cp_tokens)
        k1 = 0

        def BM25_3(f, qf, df, N, dl, avdl) -> float:
            K = compute_K(dl, avdl)
            first = math.log((N - df + 0.5) / (df + 0.5))
            second = ((k1 + 1) * f) / (K + f)
            return first * second

        dl = sum(doc.values())
        info = []
        for q_term in set(cp_tokens):
            if q_term in doc:
                score = BM25_3(doc[q_term], 0, clue12_13_df[q_term], cdf, dl,
                               1200)
                info.append((q_term, doc[q_term], clue12_13_df[q_term], score))
        return info

    for dp_idx, x in enumerate(datapoint_list):
        ranked_list: List[GalagoRankEntry] = ci.query_passage(
            x.cid, x.pid, x.claim_text, x.p_text)
        html.write_paragraph(x.claim_text)
        html.write_paragraph(x.p_text)
        html.write_paragraph("{}".format(x.label))

        local_print_cnt = 0
        lines = []
        for ranked_entry in ranked_list:
            try:
                doc_id = ranked_entry.doc_id
                galago_score = ranked_entry.score

                tokens = load_doc(doc_id)
                doc_tf = Counter(tokens)
                score, max_score, mentioned_terms = idf_scorer(
                    doc_tf, x.claim_text, x.p_text)
                matched = score > max_score * 0.75

                def get_cell(token):
                    if token in mentioned_terms:
                        return Cell(token, highlight_score=50)
                    else:
                        return Cell(token)

                line = [doc_id, galago_score, matched, score, max_score]
                lines.append(line)
                html.write_paragraph("{0} / {1:.2f}".format(
                    doc_id, galago_score))
                html.write_paragraph("{}/{}".format(score, max_score))
                bm25_info = bm25_estimator(doc_tf, x.claim_text, x.p_text)
                bm25_score = sum(lmap(lambda e: e[3], bm25_info))
                html.write_paragraph(
                    "bm25 re-estimate : {}".format(bm25_score))
                html.write_paragraph("{}".format(bm25_info))
                html.multirow_print(lmap(get_cell, tokens))
                local_print_cnt += 1
                if local_print_cnt > 10:
                    break
            except KeyError:
                pass

        matched_idx = idx_where(lambda line: line[2], lines)
        if not matched_idx:
            html.write_paragraph("No match")
        else:
            last_matched = matched_idx[-1]
            lines = lines[:last_matched + 1]
            rows = lmap(lambda line: lmap(Cell, line), lines)
            html.write_table(rows)

        if dp_idx > 10:
            break
Example #9
def join_docs_and_lm():
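    # Build an HTML report that shows, for each claim, its gold perspective
    # clusters followed by the top retrieved documents, with tokens colored
    # by their scores under the claim language model.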
    gold = get_claim_perspective_id_dict()

    d_ids = list(load_train_claim_ids())
    claims: List[Dict] = get_claims_from_ids(d_ids)
    claims = claims[:10]
    top_n = 10
    q_res_path = FilePath(
        "/mnt/nfs/work3/youngwookim/data/perspective/train_claim/q_res_100")
    ranked_list: Dict[
        str, List[SimpleRankedListEntry]] = load_galago_ranked_list(q_res_path)
    preload_docs(ranked_list, claims, top_n)
    claim_lms = build_gold_lms(claims)
    claim_lms_d = {lm.cid: lm for lm in claim_lms}
    bg_lm = average_counters(lmap(lambda x: x.LM, claim_lms))
    log_bg_lm = get_lm_log(bg_lm)

    stopwords = load_stopwords_for_query()
    stopwords.update([".", ",", "!", "?"])

    alpha = 0.1

    html_visualizer = HtmlVisualizer("doc_lm_joined.html")

    def get_cell_from_token2(token, probs):
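        # Cell whose highlight strength scales with the token's probability
        # under the topic LM (capped at 100); stopwords get no highlight.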
        if token.lower() in stopwords:
            probs = 0
        probs = probs * 1e5
        s = min(100, probs)
        c = Cell(token, s)
        return c

    tokenizer = PCTokenizer()
    for c in claims:
        q_res: List[SimpleRankedListEntry] = ranked_list[str(c['cId'])]
        html_visualizer.write_headline("{} : {}".format(c['cId'], c['text']))

        clusters: List[List[int]] = gold[c['cId']]

        for cluster in clusters:
            html_visualizer.write_paragraph("---")
            p_text_list: List[str] = lmap(perspective_getter, cluster)
            for text in p_text_list:
                html_visualizer.write_paragraph(text)
            html_visualizer.write_paragraph("---")
        claim_lm = claim_lms_d[c['cId']]
        topic_lm_prob = smooth(claim_lm.LM, bg_lm, alpha)
        log_topic_lm = get_lm_log(smooth(claim_lm.LM, bg_lm, alpha))
        log_odd: Counter = subtract(log_topic_lm, log_bg_lm)

        s = "\t".join(left(log_odd.most_common(30)))
        html_visualizer.write_paragraph("Log odd top: " + s)
        not_found = set()

        def get_log_odd(x):
            x = tokenizer.stemmer.stem(x)
            if x not in log_odd:
                not_found.add(x)
            return log_odd[x]

        def get_probs(x):
            x = tokenizer.stemmer.stem(x)
            if x not in topic_lm_prob:
                not_found.add(x)
            return topic_lm_prob[x]

        for i in range(top_n):
            try:
                doc = load_doc(q_res[i].doc_id)
                cells = lmap(lambda x: get_cell_from_token(x, get_log_odd(x)),
                             doc)
                html_visualizer.write_headline("Doc rank {}".format(i))
                html_visualizer.multirow_print(cells, width=20)
            except KeyError:
                pass
        html_visualizer.write_paragraph("Not found: {}".format(not_found))