Example 1
def main(config):
    word_list_path = config['word_list_path']
    claims = get_all_claims()
    claim_d = claims_to_dict(claims)
    stopwords = load_stopwords_for_query()

    with open(word_list_path, "r") as f:
        word_list_d: Dict = json.load(f)

    tokenizer = PCTokenizer()

    for query_id in word_list_d:
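        # Look up the claim text for this query and build its stemmed, stopword-free base query terms.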
        claim = claim_d[int(query_id)]
        word_list = word_list_d[query_id]
        base_query_terms = tokenizer.tokenize_stem(claim)
        base_query_terms = [t for t in base_query_terms if t not in stopwords]

        new_term_set = set()
        for new_term in word_list:
            t = tokenizer.stemmer.stem(new_term)
            if t not in base_query_terms:
                new_term_set.add(t)

        print()
        print("Claim {}: {}".format(query_id, claim))
        print("base query terms: ", base_query_terms)
        print("new terms: ", new_term_set)
Example 2
def a_relevant(save_name, q_res_path, claims):
    top_n = 10

    ranked_list: Dict[
        str, List[SimpleRankedListEntry]] = load_galago_ranked_list(q_res_path)
    preload_docs(ranked_list, claims, top_n)
    claim_lms = build_gold_lms(claims)
    claim_lms_d = {lm.cid: lm for lm in claim_lms}
    bg_lm = average_counters(lmap(lambda x: x.LM, claim_lms))
    log_bg_lm = get_lm_log(bg_lm)

    stopwords = load_stopwords_for_query()
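    # alpha: interpolation weight for smoothing each claim LM with the background LM.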
    alpha = 0.5

    tokenizer = PCTokenizer()
    all_passages = []
    entries = []
    num_pos_sum = 0
    num_pos_exists = 0

    for c in claims:
        q_res: List[SimpleRankedListEntry] = ranked_list[str(c['cId'])]
        claim_lm = claim_lms_d[c['cId']]
        log_topic_lm = get_lm_log(smooth(claim_lm.LM, bg_lm, alpha))
        log_odd: Counter = subtract(log_topic_lm, log_bg_lm)

        claim_text = c['text']
        claim_tokens = tokenizer.tokenize_stem(claim_text)

        scores = []
        for t in claim_tokens:
            if t in log_odd:
                scores.append(log_odd[t])

        def get_passage_score(p):
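            # Score a passage by the mean log-odds of its stemmed tokens; stopwords contribute 0.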
            def get_score(t):
                if t in stopwords:
                    return 0
                return log_odd[tokenizer.stemmer.stem(t)]

            return sum([get_score(t) for t in p]) / len(p) if len(p) > 0 else 0

        passages = iterate_passages(q_res, top_n, get_passage_score)
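        # Count passages whose average log-odds score is positive.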
        num_pos = len(lfilter(lambda x: x[1] > 0, passages))
        num_pos_sum += num_pos
        if num_pos > 0:
            num_pos_exists += 1

        all_passages.extend(passages)
        entries.append((c, passages))

    print("{} claims. {} docs on {} claims".format(len(claims), num_pos_sum,
                                                   num_pos_exists))

    data = entries, all_passages

    save_to_pickle(data, save_name)
Example 3
def a_relevant():
    d_ids = list(load_train_claim_ids())
    claims: List[Dict] = get_claims_from_ids(d_ids)
    claim_lms = build_gold_lms(claims)
    claim_lms_d = {lm.cid: lm for lm in claim_lms}
    bg_lm = average_counters(lmap(lambda x: x.LM, claim_lms))
    log_bg_lm = get_lm_log(bg_lm)

    claims = claims[:10]
    top_n = 100
    q_res_path = FilePath(
        "/mnt/nfs/work3/youngwookim/data/perspective/train_claim/q_res_100")
    ranked_list: Dict[
        str, List[SimpleRankedListEntry]] = load_galago_ranked_list(q_res_path)
    preload_docs(ranked_list, claims, top_n)

    stopwords = load_stopwords_for_query()
    alpha = 0.7

    tokenizer = PCTokenizer()
    for c in claims:
        q_res: List[SimpleRankedListEntry] = ranked_list[str(c['cId'])]
        claim_lm = claim_lms_d[c['cId']]
        log_topic_lm = get_lm_log(smooth(claim_lm.LM, bg_lm, alpha))
        log_odd: Counter = subtract(log_topic_lm, log_bg_lm)

        def get_passage_score(p):
            def get_score(t):
                if t in stopwords:
                    return 0
                return log_odd[tokenizer.stemmer.stem(t)]

            return sum([get_score(t) for t in p]) / len(p) if len(p) > 0 else 0

        docs = []
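        # Load the top-n ranked documents, recording None for any that cannot be found.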
        for i in range(top_n):
            try:
                doc = load_doc(q_res[i].doc_id)
                docs.append(doc)
            except KeyError:
                docs.append(None)

        print(c['text'])
        rows = []
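        # One row per ranked document: (rank, average passage score, max passage score).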
        for rank, doc in enumerate(docs):
            if doc is None:
                rows.append((rank, "-", "-"))
                continue

            scores = get_doc_score(doc, get_passage_score)
            avg_score = average(scores)
            max_score = max(scores)
            rows.append((rank, avg_score, max_score))

        print_table(rows)
Example 4
def a_relevant():
    d_ids = list(load_train_claim_ids())
    claims: List[Dict] = get_claims_from_ids(d_ids)
    top_n = 10
    q_res_path = FilePath(
        "/mnt/nfs/work3/youngwookim/data/perspective/train_claim/q_res_100")
    ranked_list: Dict[
        str, List[SimpleRankedListEntry]] = load_galago_ranked_list(q_res_path)
    preload_docs(ranked_list, claims, top_n)
    claim_lms = build_gold_lms(claims)
    claim_lms_d = {lm.cid: lm for lm in claim_lms}
    bg_lm = average_counters(lmap(lambda x: x.LM, claim_lms))
    log_bg_lm = get_lm_log(bg_lm)

    stopwords = load_stopwords_for_query()
    alpha = 0.3

    tokenizer = PCTokenizer()
    all_passages = []
    entries = []
    for c in claims:
        q_res: List[SimpleRankedListEntry] = ranked_list[str(c['cId'])]
        claim_lm = claim_lms_d[c['cId']]
        log_topic_lm = get_lm_log(smooth(claim_lm.LM, bg_lm, alpha))
        log_odd: Counter = subtract(log_topic_lm, log_bg_lm)

        claim_text = c['text']
        claim_tokens = tokenizer.tokenize_stem(claim_text)

        scores = []
        for t in claim_tokens:
            if t in log_odd:
                scores.append(log_odd[t])
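        # Baseline: average log-odds of the claim's own stemmed tokens.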
        base = average(scores)

        def get_passage_score(p):
            def get_score(t):
                if t in stopwords:
                    return 0
                return log_odd[tokenizer.stemmer.stem(t)]

            return sum([get_score(t) for t in p]) / len(p) if len(p) > 0 else 0

        passages = iterate_passages(q_res, top_n, get_passage_score)

        all_passages.extend(passages)
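        # Keep only positively scored passages for this claim.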
        a_rel_passages = lfilter(lambda x: x[1] > 0, passages)

        entries.append((c, a_rel_passages))

    data = entries, all_passages

    save_to_pickle(data, "pc_train_a_passages")
Example 5
    def __init__(self, query_lms: Dict[str, Counter], alpha=0.5):
        self.query_lms = query_lms
        bg_lm = average_counters(list(query_lms.values()))
        self.bg_lm = bg_lm
        self.log_bg_lm: Counter = get_lm_log(bg_lm)
        self.alpha = alpha
        self.log_odd_d: Dict[str, Counter] = {
            k: Counter()
            for k in query_lms.keys()
        }
        self.stopwords = load_stopwords_for_query()
        self.tokenizer = PCTokenizer()
Example 6
def main():
    split = "train"
    subjectivity_path = sys.argv[1]
    q_res_path = sys.argv[2]
    ranked_list: Dict[str, List[SimpleRankedListEntry]] = load_galago_ranked_list(q_res_path)

    # load LM
    claim_lms: List[ClaimLM] = build_gold_lms_for_sub_split(split)
    bg_lm = average_counters(lmap(lambda x: x.LM, claim_lms))
    log_bg_lm = get_lm_log(bg_lm)
    alpha = 0.1
    stopwords = load_stopwords_for_query()
    # load subjectivity predictions.
    subj_d: Dict[str, Tuple[int, int]] = load_subjectivity(subjectivity_path)
    doc_ids = subj_d.keys()
    preload_man.preload(TokenizedCluewebDoc, doc_ids)
    tokenizer = PCTokenizer()

    lm_scores = []
    rates = []
    num_subj_list = []
    num_sent_list = []
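    # For each claim LM, score every ranked document that has a subjectivity prediction and collect paired statistics.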
    for claim_lm in claim_lms:
        qid = str(claim_lm.cid)
        log_topic_lm = get_lm_log(smooth(claim_lm.LM, bg_lm, alpha))
        log_odd: Counter = subtract(log_topic_lm, log_bg_lm)

        def get_passage_score(p):
            def get_score(t):
                if t in stopwords:
                    return 0
                return log_odd[tokenizer.stemmer.stem(t)]

            return sum([get_score(t) for t in p]) / len(p) if len(p) > 0 else 0

        for entry in ranked_list[qid]:
            if entry.doc_id in subj_d:
                tokens = load_doc(entry.doc_id)
                assert type(tokens[0]) == str
                lm_score = get_passage_score(tokens)
                num_subj, num_sent = subj_d[entry.doc_id]
                rate = num_subj / num_sent
                lm_scores.append(lm_score)
                rates.append(rate)
                num_subj_list.append(num_subj)
                num_sent_list.append(num_sent)

    print("lm scores correlation with:")
    print("rates: ", pearsonr(lm_scores, rates))
    print("num subj: ", pearsonr(lm_scores, num_subj_list))
    print("num sent: ", pearsonr(lm_scores, num_sent_list))
Example 7
def get_answer_maker(config):
    stopwords = load_stopwords_for_query()

    def make_answer1(problem: PackedInstance, score: np.array) -> List[str]:
        # Among the document tokens, select the unique words with the highest scores.
        token_score = Counter()
        n_appear = Counter()
        max_len = len(problem.input_ids)
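        # Accumulate the score of each unique document-side token; query tokens (segment 0) are skipped.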
        for idx in range(max_len):
            if problem.input_mask[idx] == 0:
                break

            # skip query tokens
            if problem.segment_ids[idx] == 0:
                continue

            token_idx = problem.idx_mapping[idx]
            if token_idx == -1:
                assert problem.word_tokens[token_idx] == "[CLS]"
                print("skip cls token")
                continue
            token = problem.word_tokens[token_idx]
            token_score[token] += score[idx]
            n_appear[token] += 1

        out_tokens = []
        max_score = None
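        # Emit tokens in descending score order, stopping once a score drops to or below cut_factor times the top score or max_terms is exceeded.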
        for token, t_score in token_score.most_common():
            if len(out_tokens) > config.max_terms:
                break
            if config.drop_stopwords and token in stopwords:
                continue

            if max_score is None:
                max_score = t_score
                score_cut = max_score * config.cut_factor

            if len(out_tokens) == 0:
                include = True
            else:
                if t_score > score_cut:
                    include = True
                else:
                    include = False

            if include:
                out_tokens.append(token)
            else:
                break
        return out_tokens

    return make_answer1
Example 8
def get_answer_maker_token_level(config):
    stopwords = load_stopwords_for_query()

    def make_answer(problem: str, score: np.array) -> List[str]:
        tokens = problem.split()
        sep_idx = tokens.index("[SEP]")
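        # Tokens after the first [SEP] are document tokens; everything before it is the query.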
        # among tokens from documents
        # select unique words that has highest score
        token_score = Counter()
        n_appear = Counter()
        max_len = len(tokens)
        print(tokens)
        print(max_len)
        print(len(score))
        for idx in range(sep_idx + 1, max_len):
            # stop at the padding region
            if tokens[idx] == "[PAD]":
                break
            token = tokens[idx]
            token_score[token] += score[idx]
            n_appear[token] += 1

        out_tokens = []
        max_score = None
        for token, t_score in token_score.most_common():
            if len(out_tokens) > config.max_terms:
                break
            if config.drop_stopwords and token in stopwords:
                continue

            if max_score is None:
                max_score = t_score
                score_cut = max_score * config.cut_factor

            if len(out_tokens) == 0:
                include = True
            else:
                if t_score > score_cut:
                    include = True
                else:
                    include = False

            if include:
                out_tokens.append(token)
            else:
                break
        return out_tokens

    return make_answer
Example 9
def main():
    split = "dev"
    stopword = load_stopwords_for_query()
    # split = "train"
    ex_info_dir = "/mnt/nfs/work3/youngwookim/job_man/pc_rm_terms_{}".format(
        split)
    query_path = os.path.join(
        output_path, "perspective_{}_claim_query_k0_fixed.json".format(split))
    queries = load_queries(query_path)
    ex_w_scale = 100
    out_path = os.path.join(output_path, "perspective_query",
                            "pc_{}_claim_query_rm_ex.json".format(split))
    new_queries = get_extended(ex_info_dir, ex_w_scale, queries, stopword)
    save_queries_to_file(new_queries, out_path)
Example 10
def get_generator(max_seq_length, bg_lm, alpha):
    log_bg_lm = get_lm_log(bg_lm)
    top_n = 100
    stopwords = load_stopwords_for_query()
    fail_logger = Counter()
    bert_tokenizer = get_tokenizer()

    def generate(claim_lm: ClaimLM, ranked_list: List[SimpleRankedListEntry]):
        claim_text = claim_lm.claim
        claim_tokens = bert_tokenizer.tokenize(claim_text)
        claim_token_len = len(claim_tokens)

        log_topic_lm = get_lm_log(smooth(claim_lm.LM, bg_lm, alpha))
        log_odd: Counter = subtract(log_topic_lm, log_bg_lm)
        doc_ids = lmap(lambda x: x.doc_id, ranked_list[:top_n])
        print("loading docs")
        preload_man.preload(BertTokenizedCluewebDoc, doc_ids)

        window_size = max_seq_length - claim_token_len - 3
        step_size = max_seq_length - 112
        enum_paragraph = enum_paragraph_functor(step_size, window_size)

        def get_record(tokens):
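            # Pair the claim tokens with this paragraph's tokens and their per-token target scores and masks.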
            scores, masks = get_target_labels(tokens, log_odd, stopwords,
                                              fail_logger)
            return Record(claim_tokens, tokens, scores, masks)

        tokens_list: List[List[str]] = []
        not_found = 0
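        # Flatten each BERT-tokenized document into a single token list, counting documents that are missing.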
        for doc_id in doc_ids:
            try:
                tokens: List[str] = list(
                    flatten(load(BertTokenizedCluewebDoc, doc_id)))
                tokens_list.append(tokens)
            except KeyError:
                not_found += 1

        print("{} of {} not found".format(not_found, len(tokens_list)))
        paragraph_list: Iterable[List[str]] = enum_paragraph(tokens_list)
        records: List[Record] = lmap(get_record, paragraph_list)

        return records

    return generate
Example 11
def main():
    file_path = sys.argv[1]
    name = os.path.basename(file_path)
    viewer = EstimatorPredictionViewer(file_path)
    html = HtmlVisualizer("toke_score_gold.html")
    stopwords = load_stopwords_for_query()

    skip = 10
    for entry_idx, entry in enumerate(viewer):
        if entry_idx % skip != 0:
            continue
        tokens = entry.get_tokens("input_ids")
        input_ids = entry.get_vector("input_ids")
        label_ids = entry.get_vector("label_ids")
        label_ids = np.reshape(label_ids, [-1, 2])
        log_label_ids = np.log(label_ids + 1e-10)
        seg1, seg2 = split_p_h_with_input_ids(tokens, input_ids)

        pad_idx = tokens.index("[PAD]")
        assert pad_idx > 0

        logits = entry.get_vector("logits")
        cells = []
        cells2 = []
        for idx in range(pad_idx):
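            # Highlight each token by its gold score: blue for positive, red for negative; stopwords get no highlight and first-segment tokens are shown in green.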
            probs = label_ids[idx]
            token = tokens[idx]

            score = probs[0]
            color = "B" if score > 0 else "R"
            highlight_score = min(abs(score) * 10000, 100)
            if token in stopwords:
                highlight_score = 0
            if token in seg1:
                highlight_score = 50
                color = "G"

            c = Cell(token,
                     highlight_score=highlight_score,
                     target_color=color)
            cells.append(c)
        html.multirow_print_from_cells_list([cells, cells2])

        if entry_idx > 10000:
            break
Example 12
def main(config):
    split = config['split']
    word_prob_path = config['word_prob_path']
    save_path = config['save_path']
    threshold = config['threshold']
    per_query_infos: Dict[str,
                          Dict[WordAsID,
                               np.array]] = load_pickle_from(word_prob_path)
    claims = load_claims_for_sub_split(split)
    claim_d = claims_to_dict(claims)
    stopwords = load_stopwords_for_query()

    def is_stopword(tokens):
        if len(tokens) == 1 and tokens[0] in stopwords:
            return True
        else:
            return False

    tokenizer = get_tokenizer()

    all_d = {}
    for query_id, d in per_query_infos.items():
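        # Rank candidate words by log(pos) - log(neg) and keep up to 100 words whose difference exceeds the threshold.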
        entry = []
        for key in d.keys():
            tokens: List[str] = decode_word_as_id(tokenizer, key)
            if is_stopword(tokens):
                continue

            plain_word: str = pretty_tokens(tokens, True)
            pos, neg = d[key]
            pos_log = math.log(pos + 1e-10)
            neg_log = math.log(neg + 1e-10)
            diff = pos_log - neg_log
            entry.append((plain_word, diff, pos_log, neg_log))

        entry.sort(key=get_second, reverse=True)
        word_list = []
        for word, diff, pos, neg in entry[:100]:
            if diff > threshold:
                word = word.strip()
                word_list.append(word)
        all_d[query_id] = word_list
    with open(save_path, "w") as f:
        json.dump(all_d, f)
Example 13
def get_answer_maker_term_level(config):
    stopwords = load_stopwords_for_query()
    stopwords.add("[SEP]")
    stopwords.add("[CLS]")
    tokenizer = get_tokenizer()

    def make_answer1(problem: PackedInstance, score: Dict) -> List[str]:
        # among tokens from documents
        # select unique words that has highest score
        out_tokens = []
        max_score = None
        for term_id_str, term_score in Counter(score).most_common():
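            # Each key is a string of wordpiece ids; decode it back to a term before stopword filtering and the score cut.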
            if len(out_tokens) > config.max_terms:
                break

            input_ids = recover_int_list_str(term_id_str)
            tokens = tokenizer.convert_ids_to_tokens(input_ids)
            term = get_term(tokens)
            if config.drop_stopwords and term in stopwords:
                continue

            if max_score is None:
                max_score = term_score
                score_cut = max_score * config.cut_factor

            if len(out_tokens) == 0:
                include = True
            else:
                if term_score > score_cut:
                    include = True
                else:
                    include = False

            if include:
                out_tokens.append(term)
            else:
                break
        return out_tokens

    return make_answer1
Example 14
def main(config):
    split = config['split']
    word_prob_path = config['word_prob_path']
    per_query_infos: Dict[str,
                          Dict[WordAsID,
                               np.array]] = load_pickle_from(word_prob_path)
    claims = load_claims_for_sub_split(split)
    claim_d = claims_to_dict(claims)
    stopwords = load_stopwords_for_query()

    def is_stopword(tokens):
        if len(tokens) == 1 and tokens[0] in stopwords:
            return True
        else:
            return False

    tokenizer = get_tokenizer()

    for query_id, d in per_query_infos.items():
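        # Sort candidate words by log(pos) - log(neg) and print the top 100 with their scores.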
        entry = []
        for key in d.keys():
            tokens: List[str] = decode_word_as_id(tokenizer, key)
            if is_stopword(tokens):
                continue

            plain_word: str = pretty_tokens(tokens, True)
            pos, neg = d[key]
            pos_log = math.log(pos + 1e-10)
            neg_log = math.log(neg + 1e-10)
            diff = pos_log - neg_log
            entry.append((plain_word, diff, pos_log, neg_log))

        print(query_id, claim_d[int(query_id)])
        entry.sort(key=get_second, reverse=True)
        for word, diff, pos, neg in entry[:100]:
            word = word.strip()
            print("{0}\t{1:.2f}\t{2:.2f}\t{3:.2f}".format(
                word, diff, pos, neg))
Example 15
    def __init__(self, d: Dict[WordAsID, np.array], skip_stopwords=True, stem=True):
        self.tokenizer = get_tokenizer()

        self.stopwords_as_ids: Set[WordAsID] = set()
        new_d = {}
        if skip_stopwords:
            stopwords = load_stopwords_for_query()
            for key in d.keys():
                tokens = decode_word_as_id(self.tokenizer, key)
                if len(tokens) == 1 and tokens[0] in stopwords:
                    self.stopwords_as_ids.add(key)
                else:
                    new_d[key] = d[key]
            d = new_d

        if stem:
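            # Merge entries that share a stem by averaging their (pos, neg) scores.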
            d_raw = defaultdict(list)
            stemmer = Stemmer()

            for key in d.keys():
                tokens = decode_word_as_id(self.tokenizer, key)
                plain_word = pretty_tokens(tokens, True)
                stemmed = stemmer.stem(plain_word)
                d_raw[stemmed].append(d[key])

            new_d: Dict[str, TokenScore] = {}
            for key, items in d_raw.items():
                score: TokenScore = [average([t[0] for t in items]), average([t[1] for t in items])]
                new_d[key] = score
            d = new_d
            self.stem = True
            self.stemmer = stemmer
            self.log_odd = self.log_odd_w_stem

        self.d = d
        self.smoothing = 0.1
Example 16
    def __init__(self, drop_stopwords=True):
        self.drop_stopwords = drop_stopwords
        self.stopword = load_stopwords_for_query()
Example 17
        q_res: List[SimpleRankedListEntry] = ranked_list[str(c['cId'])]
        print(c['cId'], c['text'])
        missing = []
        for i in range(top_n):
            try:
                doc = load_doc(q_res[i].doc_id)
            except KeyError:
                missing.append(i)
        print(missing)
        n_missing += len(missing)

    print("")


stopwords = load_stopwords_for_query()


def get_cell_from_token(token, log_odd):
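    # Map a token to a colored cell: blue shading for positive log-odds, red for negative, plain for stopwords and zeros.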
    if token in stopwords:
        log_odd = 0

    if log_odd > 0:
        s = min(150, log_odd * 50)
        c = Cell(token, s, target_color="B")
    elif log_odd < 0:
        s = min(150, -log_odd * 50)
        c = Cell(token, s, target_color="R")
    else:
        c = Cell(token)
    return c
Example 18
def doc_lm_scoring():
    gold = get_claim_perspective_id_dict()

    d_ids = list(load_train_claim_ids())
    claims: List[Dict] = get_claims_from_ids(d_ids)
    top_n = 10
    q_res_path = FilePath(
        "/mnt/nfs/work3/youngwookim/data/perspective/train_claim/q_res_100")
    ranked_list: Dict[
        str, List[SimpleRankedListEntry]] = load_galago_ranked_list(q_res_path)
    preload_docs(ranked_list, claims, top_n)
    claim_lms = build_gold_lms(claims)
    claim_lms_d = {lm.cid: lm for lm in claim_lms}
    bg_lm = average_counters(lmap(lambda x: x.LM, claim_lms))
    log_bg_lm = get_lm_log(bg_lm)

    stopwords = load_stopwords_for_query()
    alpha = 0.5

    html_visualizer = HtmlVisualizer("doc_lm_doc_level.html")

    tokenizer = PCTokenizer()
    random_passages = []
    num_pos_sum = 0
    num_pos_exists = 0
    for c in claims:
        q_res: List[SimpleRankedListEntry] = ranked_list[str(c['cId'])]
        html_visualizer.write_headline("{} : {}".format(c['cId'], c['text']))
        # for cluster in clusters:
        #     html_visualizer.write_paragraph("---")
        #     p_text_list: List[str] = lmap(perspective_getter, cluster)
        #     for text in p_text_list:
        #         html_visualizer.write_paragraph(text)
        #     html_visualizer.write_paragraph("---")
        claim_lm = claim_lms_d[c['cId']]
        topic_lm_prob = smooth(claim_lm.LM, bg_lm, alpha)
        log_topic_lm = get_lm_log(smooth(claim_lm.LM, bg_lm, alpha))
        log_odd: Counter = subtract(log_topic_lm, log_bg_lm)

        claim_text = c['text']
        claim_tokens = tokenizer.tokenize_stem(claim_text)

        scores = []
        for t in claim_tokens:
            if t in log_odd:
                scores.append(log_odd[t])
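        # Use the mean log-odds of the claim's own stemmed tokens as the display threshold.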
        threshold = average(scores)

        s = "\t".join(left(log_odd.most_common(30)))
        html_visualizer.write_paragraph("Log odd top: " + s)
        not_found = set()
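        # not_found collects stemmed tokens missing from the LMs; the helpers below record them while looking up scores.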

        def get_log_odd(x):
            x = tokenizer.stemmer.stem(x)
            if x not in log_odd:
                not_found.add(x)
            return log_odd[x]

        def get_probs(x):
            x = tokenizer.stemmer.stem(x)
            if x not in topic_lm_prob:
                not_found.add(x)
            return topic_lm_prob[x]

        def get_passage_score(p):
            return sum([log_odd[tokenizer.stemmer.stem(t)]
                        for t in p]) / len(p) if len(p) > 0 else 0

        passages = iterate_passages(q_res, top_n, get_passage_score)

        passages.sort(key=lambda x: x[1], reverse=True)
        html_visualizer.write_paragraph("Threshold {}".format(threshold))

        top5_scores = right(passages[:5])
        bot5_scores = right(passages[-5:])

        if len(random_passages) > 5:
            random_sel_passages = random.choices(random_passages, k=5)
        else:
            random_sel_passages = []
        random5_scores = lmap(get_passage_score, random_sel_passages)

        def score_line(scores):
            return " ".join(lmap(two_digit_float, scores))

        html_visualizer.write_paragraph("top 5: " + score_line(top5_scores))
        html_visualizer.write_paragraph("bot 5: " + score_line(bot5_scores))
        html_visualizer.write_paragraph("random 5: " +
                                        score_line(random5_scores))

        num_pos = len(lfilter(lambda x: x[1] > 0, passages))
        num_pos_sum += num_pos
        if num_pos > 0:
            num_pos_exists += 1

        def print_doc(doc, html_visualizer, score):
            cells = lmap(lambda x: get_cell_from_token(x, get_log_odd(x)), doc)
            html_visualizer.write_headline("score={}".format(score))
            html_visualizer.multirow_print(cells, width=20)

        random_passages.extend(left(passages))
        if threshold < 0:
            continue
        for doc, score in passages:
            if score < 0:
                break
            print_doc(doc, html_visualizer, score)

        html_visualizer.write_headline("Bottom 5")
        for doc, score in passages[-5:]:
            print_doc(doc, html_visualizer, score)

    print("{} claims. {} docs on {} claims".format(len(claims), num_pos_sum,
                                                   num_pos_exists))
Example 19
def filter_stopwords(tokens: Iterable[str]) -> List[str]:
    global stopwords
    if stopwords is None:
        stopwords = load_stopwords_for_query()
    return [t for t in tokens if t not in stopwords]