Example #1
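# Job worker: for each query in its assigned job, stem the candidate documents' titles,
# build per-sentence term-frequency counters for their bodies, and pickle the resulting
# {doc_id: (title_tokens, body_tf_list)} map to out_dir/<qid>.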
class TokenizeForBM25Worker:
    def __init__(self, split, query_group, candidate_docs_d, out_dir):
        self.query_group = query_group
        self.tokenizer = PCTokenizer()
        self.candidate_docs_d = candidate_docs_d
        self.out_dir = out_dir
        self.ms_reader = MSMarcoDataReader(split)

    def work(self, job_id):
        qid_list = self.query_group[job_id]
        ticker = TimeEstimator(len(qid_list))
        missing_rel_cnt = 0
        missing_nrel_cnt = 0

        def empty_doc_fn(query_id, doc_id):
            rel_docs = self.ms_reader.qrel[query_id]
            nonlocal missing_rel_cnt
            nonlocal missing_nrel_cnt
            if doc_id in rel_docs:
                missing_rel_cnt += 1
            else:
                missing_nrel_cnt += 1

        def get_tf(text):
            tokens = self.tokenizer.tokenize_stem(text)
            return Counter(tokens)

        for qid in qid_list:
            if qid not in self.candidate_docs_d:
                continue

            docs: List[MSMarcoDoc] = load_per_query_docs(qid, empty_doc_fn)
            ticker.tick()

            target_docs = self.candidate_docs_d[qid]
            tokens_d = {}
            for d in docs:
                if d.doc_id in target_docs:
                    title_tokens = self.tokenizer.tokenize_stem(d.title)
                    body_sents = sent_tokenize(d.body)
                    body_tf_list = lmap(get_tf, body_sents)
                    tokens_d[d.doc_id] = (title_tokens, body_tf_list)

            if len(tokens_d) < len(target_docs):
                log_variables(job_id, qid)
                print("{} of {} not found".format(len(tokens_d),
                                                  len(target_docs)))

            save_path = os.path.join(self.out_dir, str(qid))
            with open(save_path, "wb") as f:
                pickle.dump(tokens_d, f)
Example #2
def main(config):
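    # For each claim, print the stemmed base query terms (stopwords removed) and the
    # stemmed candidate expansion terms that are not already in the base query.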
    word_list_path = config['word_list_path']
    claims = get_all_claims()
    claim_d = claims_to_dict(claims)
    stopwords = load_stopwords_for_query()

    with open(word_list_path, "r") as f:
        word_list_d: Dict = json.load(f)

    tokenizer = PCTokenizer()

    for query_id in word_list_d:
        claim = claim_d[int(query_id)]
        word_list = word_list_d[query_id]
        base_query_terms = tokenizer.tokenize_stem(claim)
        base_query_terms = [t for t in base_query_terms if t not in stopwords]

        new_term_set = set()
        for new_term in word_list:
            t = tokenizer.stemmer.stem(new_term)
            if t not in base_query_terms:
                new_term_set.add(t)

        print()
        print("Claim {}: {}".format(query_id, claim))
        print("base query terms: ", base_query_terms)
        print("new terms: ", new_term_set)
Example #3
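# Build an initial per-claim term distribution for dev claims: each claim token is
# weighted by its count times the maximum bias score, the global bias_plus_words scores
# are added, and the result is normalized to sum to 1 before saving.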
def pc_new_init_prob():
    d_ids: List[int] = list(load_dev_claim_ids())
    claims = get_claims_from_ids(d_ids)
    claim_d = claims_to_dict(claims)
    bias_plus_word: Counter = load_from_pickle("bias_plus_words")
    tokenizer = PCTokenizer()

    base_p = max(bias_plus_word.values())

    init_p_score_d = {}
    for cid in d_ids:
        c_text = claim_d[cid]
        tokens = tokenizer.tokenize_stem(c_text)

        score_for_cid = Counter()
        for t, cnt in Counter(tokens).items():
            prob = cnt * base_p
            score_for_cid[t] = prob

        for t, score in bias_plus_word.items():
            score_for_cid[t] += score

        score_for_cid = normalize_counter_to_sum1(score_for_cid)
        init_p_score_d[cid] = score_for_cid

    save_to_pickle(init_p_score_d, "pc_dev_new_init_prob")
Example #4
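# Job worker over Robust04: each job stems its slice of documents and pickles a
# {doc_id: term-frequency Counter} map to out_dir/<job_id>.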
class Worker:
    def __init__(self, out_dir):
        robust_path = "/mnt/nfs/work3/youngwookim/data/robust04"
        tprint("Loading doc ids")
        self.doc_ids = all_doc_ids_of_interest()
        tprint("Loading robust docs")
        self.docs: Dict[str, str] = trec.load_robust(robust_path)
        tprint("Start processing")

        n_docs = len(self.doc_ids)
        # ceiling division so the n_jobs jobs together cover all documents
        docs_per_job = int((n_docs + n_jobs - 1) / n_jobs)
        self.docs_per_job = docs_per_job
        self.tokenizer = PCTokenizer()
        self.out_dir = out_dir

    def work(self, job_id):
        doc_id_to_count = dict()
        st = job_id * self.docs_per_job
        ed = st + self.docs_per_job
        todo = self.doc_ids[st:ed]
        ticker = TimeEstimator(len(todo))
        for doc_id in todo:
            try:
                text = self.docs[doc_id]
                tokens = self.tokenizer.tokenize_stem(text)
                counter = Counter(tokens)
                doc_id_to_count[doc_id] = counter
                ticker.tick()
            except KeyError as e:
                print(e)
                print("key error")

        save_path = os.path.join(self.out_dir, str(job_id))
        with open(save_path, "wb") as f:
            pickle.dump(doc_id_to_count, f)
Example #5
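# Collect the vocabulary of stemmed tokens over all perspective texts.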
def get_valid_terms():
    perspective = get_perspective_dict()
    tokenizer = PCTokenizer()
    voca = set()
    for text in perspective.values():
        voca.update(tokenizer.tokenize_stem(text))
    return voca
Example #6
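# BM25 wrapper that stems query and document text with PCTokenizer and delegates the
# actual scoring to BM25Bare.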
class BM25:
    def __init__(self, df, num_doc, avdl, k1=0.01, k2=100, b=0.6):
        self.core = BM25Bare(df, num_doc, avdl, k1, k2, b)
        self.tokenizer = PCTokenizer()

    def score(self, query, text) -> NamedNumber:
        q_terms = self.tokenizer.tokenize_stem(query)
        t_terms = self.tokenizer.tokenize_stem(text)
        q_tf = Counter(q_terms)
        t_tf = Counter(t_terms)
        return self.core.score_inner(q_tf, t_tf)

    def term_idf_factor(self, term):
        return self.core.term_idf_factor(term)

    def score_inner(self, q_tf, t_tf) -> NamedNumber:
        return self.core.score_inner(q_tf, t_tf)
Example #7
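# For each claim, compute the log-odds of its smoothed language model against the
# background LM, score candidate passages by the average log-odds of their tokens,
# and pickle the per-claim entries together with all scored passages.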
def a_relevant(save_name, q_res_path, claims):
    top_n = 10

    ranked_list: Dict[
        str, List[SimpleRankedListEntry]] = load_galago_ranked_list(q_res_path)
    preload_docs(ranked_list, claims, top_n)
    claim_lms = build_gold_lms(claims)
    claim_lms_d = {lm.cid: lm for lm in claim_lms}
    bg_lm = average_counters(lmap(lambda x: x.LM, claim_lms))
    log_bg_lm = get_lm_log(bg_lm)

    stopwords = load_stopwords_for_query()
    alpha = 0.5

    tokenizer = PCTokenizer()
    all_passages = []
    entries = []
    num_pos_sum = 0
    num_pos_exists = 0

    for c in claims:
        q_res: List[SimpleRankedListEntry] = ranked_list[str(c['cId'])]
        claim_lm = claim_lms_d[c['cId']]
        log_topic_lm = get_lm_log(smooth(claim_lm.LM, bg_lm, alpha))
        log_odd: Counter = subtract(log_topic_lm, log_bg_lm)

        claim_text = c['text']
        claim_tokens = tokenizer.tokenize_stem(claim_text)

        scores = []
        for t in claim_tokens:
            if t in log_odd:
                scores.append(log_odd[t])

        def get_passage_score(p):
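            # Mean log-odds over the passage's stemmed, non-stopword tokens.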
            def get_score(t):
                if t in stopwords:
                    return 0
                return log_odd[tokenizer.stemmer.stem(t)]

            return sum([get_score(t) for t in p]) / len(p) if len(p) > 0 else 0

        passages = iterate_passages(q_res, top_n, get_passage_score)
        num_pos = len(lfilter(lambda x: x[1] > 0, passages))
        num_pos_sum += num_pos
        if num_pos > 0:
            num_pos_exists += 1

        all_passages.extend(passages)
        entries.append((c, passages))

    print("{} claims. {} docs on {} claims".format(len(claims), num_pos_sum,
                                                   num_pos_exists))

    data = entries, all_passages

    save_to_pickle(data, save_name)
Example #8
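# Document frequency: count, for each stemmed term, how many passages contain it.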
def count_df(passages: Iterable[Passage]) -> Counter:
    tokenizer = PCTokenizer()
    df = Counter()
    for p in passages:
        tokens = tokenizer.tokenize_stem(p.text)

        for term in set(tokens):
            df[term] += 1

    return df
Example #9
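# Build a relevance model per evaluation problem from the term frequencies of its
# stemmed source text (text1).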
def build_lm(split) -> Iterable[RelevanceModel]:
    tokenizer = PCTokenizer()
    problems, candidate_pool_d = prepare_eval_data(split)
    payload: List[Passage] = get_eval_payload_from_dp(problems)
    for query, problem in zip(payload, problems):
        p = problem
        source_text = p.text1.text
        tokens = tokenizer.tokenize_stem(source_text)
        counter = tokens_to_freq(tokens)
        yield RelevanceModel(query.id.id, query.text, counter)
Example #10
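# Scores token sequences against per-query language models: average log-odds of the
# smoothed query LM versus the background LM, with stopword filtering and a per-query
# cache of already-computed token scores.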
class LMScorer:
    def __init__(self, query_lms: Dict[str, Counter], alpha=0.5):
        self.query_lms = query_lms
        bg_lm = average_counters(list(query_lms.values()))
        self.bg_lm = bg_lm
        self.log_bg_lm: Counter = get_lm_log(bg_lm)
        self.alpha = alpha
        self.log_odd_d: Dict[str, Counter] = {
            k: Counter()
            for k in query_lms.keys()
        }
        self.stopwords = load_stopwords_for_query()
        self.tokenizer = PCTokenizer()

    def score(self, query_id, raw_tokens) -> float:
        stemmed_tokens = self.filter_and_stem(raw_tokens)
        return self._get_score_from_stemmed_tokens(query_id, stemmed_tokens)

    def filter_and_stem(self, tokens):
        stemmed_tokens = []
        for t in tokens:
            if t in self.stopwords:
                continue
            try:
                stemmed_t = self.tokenizer.stemmer.stem(t)
                stemmed_tokens.append(stemmed_t)
            except UnicodeDecodeError:
                pass
        return stemmed_tokens

    def score_text(self, query_id, text):
        tokens = self.tokenizer.tokenize_stem(text)
        tokens = [t for t in tokens if t not in self.stopwords]
        return self._get_score_from_stemmed_tokens(query_id, tokens)

    def _get_score_from_stemmed_tokens(self, query_id, tokens) -> float:
        log_odd_d: Counter = self.log_odd_d[query_id]
        lm = self.query_lms[query_id]

        def get_score(token: str) -> float:
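            # Log-odds of the smoothed query LM vs. the background LM, memoized per token.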
            if token in log_odd_d:
                return log_odd_d[token]

            if token in lm or token in self.bg_lm:
                prob_pos = lm[token] * (
                    1 - self.alpha) + self.bg_lm[token] * self.alpha
                pos_log = math.log(prob_pos)
            else:
                pos_log = 0
            score = pos_log - self.log_bg_lm[token]
            log_odd_d[token] = score
            return score

        return average(lmap(get_score, tokens))
Example #11
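# Same claim-LM log-odds passage scoring as above, but on the train split (alpha=0.3);
# only the positively scored passages are kept per claim before pickling.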
def a_relevant():
    d_ids = list(load_train_claim_ids())
    claims: List[Dict] = get_claims_from_ids(d_ids)
    top_n = 10
    q_res_path = FilePath(
        "/mnt/nfs/work3/youngwookim/data/perspective/train_claim/q_res_100")
    ranked_list: Dict[
        str, List[SimpleRankedListEntry]] = load_galago_ranked_list(q_res_path)
    preload_docs(ranked_list, claims, top_n)
    claim_lms = build_gold_lms(claims)
    claim_lms_d = {lm.cid: lm for lm in claim_lms}
    bg_lm = average_counters(lmap(lambda x: x.LM, claim_lms))
    log_bg_lm = get_lm_log(bg_lm)

    stopwords = load_stopwords_for_query()
    alpha = 0.3

    tokenizer = PCTokenizer()
    all_passages = []
    entries = []
    for c in claims:
        q_res: List[SimpleRankedListEntry] = ranked_list[str(c['cId'])]
        claim_lm = claim_lms_d[c['cId']]
        log_topic_lm = get_lm_log(smooth(claim_lm.LM, bg_lm, alpha))
        log_odd: Counter = subtract(log_topic_lm, log_bg_lm)

        claim_text = c['text']
        claim_tokens = tokenizer.tokenize_stem(claim_text)

        scores = []
        for t in claim_tokens:
            if t in log_odd:
                scores.append(log_odd[t])
        base = average(scores)

        def get_passage_score(p):
            def get_score(t):
                if t in stopwords:
                    return 0
                return log_odd[tokenizer.stemmer.stem(t)]

            return sum([get_score(t) for t in p]) / len(p) if len(p) > 0 else 0

        passages = iterate_passages(q_res, top_n, get_passage_score)

        all_passages.extend(passages)
        a_rel_passages = lfilter(lambda x: x[1] > 0, passages)

        entries.append((c, a_rel_passages))

    data = entries, all_passages

    save_to_pickle(data, "pc_train_a_passages")
Example #12
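# For the first 100 train queries of the first query group, locate each relevant MS MARCO
# passage inside the query's relevant document by LCS over stemmed tokens, then print the
# matching sentence index and the corresponding BERT-token offset.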
def main():
    split = "train"
    resource = ProcessedResource10docMulti(split)

    query_group: List[List[QueryID]] = load_query_group(split)
    msmarco_passage_qrel_path = at_data_dir("msmarco", "qrels.train.tsv")
    passage_qrels: QRelsDict = load_qrels_structured(msmarco_passage_qrel_path)

    qids = query_group[0]
    qids = qids[:100]
    pickle_name = "msmarco_passage_doc_analyze_passage_dict_evidence_loc"
    try:
        passage_dict = load_from_pickle(pickle_name)
    except FileNotFoundError:
        print("Reading passages...")
        passage_dict = get_passages(qids, passage_qrels)
        save_to_pickle(passage_dict, pickle_name)
    def get_rel_doc_id(qid):
        if qid not in resource.get_doc_for_query_d():
            raise KeyError
        for doc_id in resource.get_doc_for_query_d()[qid]:
            label = resource.get_label(qid, doc_id)
            if label:
                return doc_id
        raise KeyError

    def translate_token_idx_to_sent_idx(stemmed_body_tokens_list, loc_in_body):
        acc = 0
        for idx, tokens in enumerate(stemmed_body_tokens_list):
            acc += len(tokens)
            if loc_in_body < acc:
                return idx
        return -1

    pc_tokenize = PCTokenizer()
    bert_tokenizer = get_tokenizer()

    for qid in qids:
        try:
            doc_id = get_rel_doc_id(qid)
            stemmed_tokens_d = resource.get_stemmed_tokens_d(qid)
            stemmed_title_tokens, stemmed_body_tokens_list = stemmed_tokens_d[doc_id]
            rel_passages = [passage_id for passage_id, score
                            in passage_qrels[qid].items() if score]
            success = False
            found_idx = -1
            passage_tokens = []
            for rel_passage_id in rel_passages:
                passage_text = passage_dict[rel_passage_id].strip()
                passage_tokens = pc_tokenize.tokenize_stem(passage_text)
                stemmed_body_tokens_flat = lflatten(stemmed_body_tokens_list)
                n, log = lcs(passage_tokens, stemmed_body_tokens_flat, True)
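                # Accept the match if the passage has more than 4 tokens and more than
                # 70% of them appear in the LCS with the document body.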
                if len(passage_tokens) > 4 and n > len(passage_tokens) * 0.7 and n > 0:
                    success = True
                    _, loc_in_body = log[0]

                    sent_idx = translate_token_idx_to_sent_idx(stemmed_body_tokens_list, loc_in_body)
                    prev = stemmed_body_tokens_flat[:loc_in_body]

                    loc_by_bert_tokenize = len(bert_tokenizer.tokenize(" ".join(prev)))
                    print(sent_idx, loc_in_body, loc_by_bert_tokenize, len(stemmed_body_tokens_list))
                    found_idx = sent_idx
            if not success:
                print("Not found. doc_lines={} passage_len={}".format(len(stemmed_body_tokens_list), len(passage_tokens)))

        except KeyError:
            pass
Example #13
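# Same claim-LM log-odds scoring as above, rendered as an HTML report: per-claim top,
# bottom, and random passage scores, plus per-token log-odds cells for scored passages.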
def doc_lm_scoring():
    gold = get_claim_perspective_id_dict()

    d_ids = list(load_train_claim_ids())
    claims: List[Dict] = get_claims_from_ids(d_ids)
    top_n = 10
    q_res_path = FilePath(
        "/mnt/nfs/work3/youngwookim/data/perspective/train_claim/q_res_100")
    ranked_list: Dict[
        str, List[SimpleRankedListEntry]] = load_galago_ranked_list(q_res_path)
    preload_docs(ranked_list, claims, top_n)
    claim_lms = build_gold_lms(claims)
    claim_lms_d = {lm.cid: lm for lm in claim_lms}
    bg_lm = average_counters(lmap(lambda x: x.LM, claim_lms))
    log_bg_lm = get_lm_log(bg_lm)

    stopwords = load_stopwords_for_query()
    alpha = 0.5

    html_visualizer = HtmlVisualizer("doc_lm_doc_level.html")

    tokenizer = PCTokenizer()
    random_passages = []
    num_pos_sum = 0
    num_pos_exists = 0
    for c in claims:
        q_res: List[SimpleRankedListEntry] = ranked_list[str(c['cId'])]
        html_visualizer.write_headline("{} : {}".format(c['cId'], c['text']))
        # for cluster in clusters:
        #     html_visualizer.write_paragraph("---")
        #     p_text_list: List[str] = lmap(perspective_getter, cluster)
        #     for text in p_text_list:
        #         html_visualizer.write_paragraph(text)
        #     html_visualizer.write_paragraph("---")
        claim_lm = claim_lms_d[c['cId']]
        topic_lm_prob = smooth(claim_lm.LM, bg_lm, alpha)
        log_topic_lm = get_lm_log(smooth(claim_lm.LM, bg_lm, alpha))
        log_odd: Counter = subtract(log_topic_lm, log_bg_lm)

        claim_text = c['text']
        claim_tokens = tokenizer.tokenize_stem(claim_text)

        scores = []
        for t in claim_tokens:
            if t in log_odd:
                scores.append(log_odd[t])
        threshold = average(scores)

        s = "\t".join(left(log_odd.most_common(30)))
        html_visualizer.write_paragraph("Log odd top: " + s)
        not_found = set()

        def get_log_odd(x):
            x = tokenizer.stemmer.stem(x)
            if x not in log_odd:
                not_found.add(x)
            return log_odd[x]

        def get_probs(x):
            x = tokenizer.stemmer.stem(x)
            if x not in topic_lm_prob:
                not_found.add(x)
            return topic_lm_prob[x]

        def get_passage_score(p):
            return sum([log_odd[tokenizer.stemmer.stem(t)]
                        for t in p]) / len(p) if len(p) > 0 else 0

        passages = iterate_passages(q_res, top_n, get_passage_score)

        passages.sort(key=lambda x: x[1], reverse=True)
        html_visualizer.write_paragraph("Threshold {}".format(threshold))

        top5_scores = right(passages[:5])
        bot5_scores = right(passages[-5:])

        if len(random_passages) > 5:
            random_sel_passages = random.choices(random_passages, k=5)
        else:
            random_sel_passages = []
        random5_scores = lmap(get_passage_score, random_sel_passages)

        def score_line(scores):
            return " ".join(lmap(two_digit_float, scores))

        html_visualizer.write_paragraph("top 5: " + score_line(top5_scores))
        html_visualizer.write_paragraph("bot 5: " + score_line(bot5_scores))
        html_visualizer.write_paragraph("random 5: " +
                                        score_line(random5_scores))

        num_pos = len(lfilter(lambda x: x[1] > 0, passages))
        num_pos_sum += num_pos
        if num_pos > 0:
            num_pos_exists += 1

        def print_doc(doc, html_visualizer, score):
            cells = lmap(lambda x: get_cell_from_token(x, get_log_odd(x)), doc)
            html_visualizer.write_headline("score={}".format(score))
            html_visualizer.multirow_print(cells, width=20)

        random_passages.extend(left(passages))
        if threshold < 0:
            continue
        for doc, score in passages:
            if score < 0:
                break
            print_doc(doc, html_visualizer, score)

        html_visualizer.write_headline("Bottom 5")
        for doc, score in passages[-5:]:
            print_doc(doc, html_visualizer, score)

    print("{} claims. {} docs on {} claims".format(len(claims), num_pos_sum,
                                                   num_pos_exists))