Esempio n. 1
0
    def generate_instances(self, claim: Dict,
                           data_id_manager: DataIDManager) -> List[Instance]:
        """Build labelled (claim text, passage) instances for one claim.

        The passages stored for the claim are split with ``score_over_zero``:
        the ones that pass get label 1, the rest get label 0. Ten passages
        drawn through ``self.random_sample(cid)`` are added with label 0.

        Args:
            claim: Claim record; only ``'cId'`` and ``'text'`` are read.
            data_id_manager: ``assign`` is called once per instance with
                ``{'cid': cid}`` to obtain the instance's data id.

        Returns:
            Instances for good passages, then not-good passages, then the
            random passages, in that order.
        """
        cid = claim['cId']
        # Separate name so the ``claim`` dict parameter is not rebound to a str.
        claim_text = claim['text']

        passages = self.cid_to_passages[cid]
        good_passages: List[List[str]] = left(
            lfilter(score_over_zero, passages))
        not_good_passages: List[List[str]] = left(
            lfilter_not(score_over_zero, passages))
        random_passages = [self.random_sample(cid) for _ in range(10)]

        def make_instance(passage, label):
            info = {'cid': cid}
            return Instance(claim_text, passage, label,
                            data_id_manager.assign(info))

        # Data ids are assigned in this order: good, not good, random.
        l1 = lmap(lambda p: make_instance(p, 1), good_passages)
        l2 = lmap(lambda p: make_instance(p, 0), not_good_passages)
        l3 = lmap(lambda p: make_instance(p, 0), random_passages)
        print("g: ng : rand = {} : {} : {}".format(len(l1), len(l2), len(l3)))
        return l1 + l2 + l3
Esempio n. 2
0
    def generate_instances(self, claim: Dict,
                           data_id_manager) -> List[PairedInstance]:
        """Pair gold candidate texts of a claim with passage pairs.

        Two pools of passage pairs are built: the output of
        ``generate_pairwise_combinations(not_good, good, True)`` (emitted with
        strict_good=1, strict_bad=0) and each not-good passage paired with a
        passage from ``self.random_sample(cid)`` (strict_good=0,
        strict_bad=1). Each gold candidate text then receives consecutive
        pairs from a single endless cycle over both pools.

        Args:
            claim: Claim record; only ``'cId'`` is read.
            data_id_manager: Unused by this generator.

        Returns:
            The generated instances; an empty list when the claim has no gold
            candidate texts (previously this raised ZeroDivisionError).
        """
        cid = claim['cId']
        perspective_clusters: List[List[int]] = self.gold[cid]

        passages = self.cid_to_passages[cid]
        gold_candidate_texts: List[str] = flatten_map(perspective_getter,
                                                      perspective_clusters)

        good_passages: List[List[str]] = left(
            lfilter(score_over_zero, passages))
        not_good_passages: List[List[str]] = left(
            lfilter_not(score_over_zero, passages))

        # Good vs not-good pairs (about 100 items according to the original note).
        pair_list_g_ng: List[Tuple[
            List[str], List[str]]] = generate_pairwise_combinations(
                not_good_passages, good_passages, True)
        # Not-good vs random pairs (about 100 items according to the original note).
        pair_list_ng_rand: List[Tuple[List[str], List[str]]] = [
            (inst, self.random_sample(cid)) for inst in not_good_passages
        ]

        # Nothing to pair with; this also avoids dividing by zero below.
        if not gold_candidate_texts:
            return []

        # Budget for candidates x pairs: 5 * (two lists of ~100 pairs) = 1000.
        max_insts = 100 * 2 * 5

        def infinite_passage_iterator():
            # Endless cycle over both pools, yielding
            # (pair, strict_good, strict_bad).
            while True:
                for pair in pair_list_g_ng:
                    yield pair, 1, 0
                for pair in pair_list_ng_rand:
                    yield pair, 0, 1

        itr = infinite_passage_iterator()
        all_passage_pair_len = len(pair_list_g_ng) + len(pair_list_ng_rand)
        # Never request more pairs per candidate than exist; with zero pairs
        # the iterator is never advanced, so it cannot spin forever.
        n_passage_per_inst = min(all_passage_pair_len,
                                 max_insts // len(gold_candidate_texts) + 1)

        all_insts = []
        for candidate in gold_candidate_texts:
            for _ in range(n_passage_per_inst):
                # The iterator is shared, so each candidate continues where
                # the previous one stopped.
                passage_pair, strict_good, strict_bad = next(itr)
                passage_good, passage_worse = passage_pair
                all_insts.append(
                    PairedInstance(passage_good, passage_worse, candidate,
                                   strict_good, strict_bad))
        return all_insts
Esempio n. 3
0
def get_aawd_binary_train_dev():
    """Return ``(train_x, train_y, dev_x, dev_y)`` for the binary AAWD data.

    The tuple is built on the first call from ``load_aawd_splits_as_binary()``
    and stored in the module-level ``aawd_train_dev_preload``; later calls
    return the stored tuple. The test split is loaded but not used.
    """
    global aawd_train_dev_preload
    if aawd_train_dev_preload is None:
        train_split, dev_split, _ = load_aawd_splits_as_binary()
        aawd_train_dev_preload = (
            left(train_split),
            right(train_split),
            left(dev_split),
            right(dev_split),
        )
    return aawd_train_dev_preload
Esempio n. 4
0
def eval_map(split, score_d: Dict[CPIDPair, float], debug=False):
    """Evaluate ``score_d`` with ``evaluate_map`` on the claims it covers.

    Args:
        split: Passed to ``get_eval_candidates_from_pickle`` to load the
            pre-computed candidate perspectives.
        score_d: Scores keyed by CPIDPair.
        debug: Forwarded to ``evaluate_map``.

    Returns:
        Whatever ``evaluate_map`` returns for the top-50 predictions.
    """
    candidates: List[Tuple[int, List[Dict]]] = get_eval_candidates_from_pickle(
        split)
    # Only claims that appear in score_d are evaluated.
    scored_cids: Set[int] = set(left(score_d.keys()))

    def has_score(entry: Tuple[int, List[Dict]]) -> bool:
        return entry[0] in scored_cids

    sub_candidates: List[Tuple[int, List[Dict]]] = lfilter(
        has_score, candidates)
    print("{} claims are evaluated".format(len(sub_candidates)))
    print(left(sub_candidates))
    return evaluate_map(predict_from_dict(score_d, sub_candidates, 50), debug)
Esempio n. 5
0
def get_ap_list_from_score_d(score_d, split):
    """Return the average-precision list and claim ids for scored claims.

    Candidates for ``split`` are restricted to claims that appear in
    ``score_d``, predictions are made with ``predict_from_dict`` (limit 50),
    and ``get_average_precision_list`` is applied to them.

    Returns:
        ``(ap_list, cids)`` where ``cids`` is ``left(predictions)``.
    """
    candidates: List[Tuple[int, List[Dict]]] = get_eval_candidates_from_pickle(
        split)
    # Only claims that appear in score_d are evaluated.
    scored_cids: Set[int] = set(left(score_d.keys()))

    def has_score(entry: Tuple[int, List[Dict]]) -> bool:
        return entry[0] in scored_cids

    sub_candidates: List[Tuple[int, List[Dict]]] = lfilter(
        has_score, candidates)
    print("{} claims are evaluated".format(len(sub_candidates)))
    predictions = predict_from_dict(score_d, sub_candidates, 50)
    claim_ids = left(predictions)
    return get_average_precision_list(predictions, False), claim_ids
Esempio n. 6
0
def main(config):
    """Filter the pickled QK candidates down to documents scoring above ``k``.

    Reads ``config['score_type']`` and ``config['k']``. A document is kept
    for a query when at least one scored entry for that query and document
    has ``get_score_from_logit(score_type, logits) > k``. The filtered
    candidates are pickled as ``robust_on_clueweb_qk_candidate_filtered``.
    """
    qk_candidate: List[QKUnit] = load_from_pickle(
        "robust_on_clueweb_qk_candidate")
    qk_out_entries: List[QKOutEntry] = load_qk_score(config)

    score_type = config['score_type']
    k = config['k']

    # One (initially empty) set of accepted doc ids per candidate query.
    good_doc_list_d = {}
    for query in left(qk_candidate):
        good_doc_list_d[query.query_id] = set()

    for entry in qk_out_entries:
        if get_score_from_logit(score_type, entry.logits) > k:
            good_doc_list_d[entry.query.query_id].add(entry.kdp.doc_id)

    stat_count = Counter()

    def filter_map(qk_unit: QKUnit):
        query, kdp_list = qk_unit
        accepted_doc_ids = good_doc_list_d[query.query_id]
        new_kdp_list = lfilter(lambda kdp: kdp.doc_id in accepted_doc_ids,
                               kdp_list)
        print("{} -> {}".format(len(kdp_list), len(new_kdp_list)))
        if not new_kdp_list:
            # Count queries left without any knowledge document part.
            stat_count["no kdp"] += 1
        return query, new_kdp_list

    new_qk_candidate = lmap(filter_map, qk_candidate)
    print(stat_count)
    save_to_pickle(new_qk_candidate, "robust_on_clueweb_qk_candidate_filtered")
Esempio n. 7
0
def get_stance_check_candidate(text: str, bm25_module: BM25):
    """Pair each usable sentence of ``text`` with its five top-scoring tokens.

    Sentences that are blank, or that start (after stripping) with a
    bracketed marker such as ``[12]`` or ``[ii]``, are skipped. Tokens are
    scored by looking up their stem in the table returned by
    ``get_term_importance``.

    Returns:
        A list of ``(sentence, top_terms)`` tuples in sentence order.
    """
    sents = sent_tokenize_newline(text)
    term_importance = get_term_importance(bm25_module, sents)
    heading_pattern = re.compile(r'^\[(\d{1,3}|i{1,5})\]')

    def importance_of(token):
        stem = bm25_module.tokenizer.stemmer.stem(token)
        return term_importance[stem]

    candidates = []
    for sent in sents:
        stripped = sent.strip()
        if not stripped:
            continue
        if heading_pattern.match(stripped) is not None:
            continue

        # Duplicate tokens are scored once.
        unique_tokens = set(nltk.tokenize.word_tokenize(sent))
        scored: List[Tuple[str, float]] = lmap_pairing(importance_of,
                                                       unique_tokens)
        scored.sort(key=lambda pair: pair[1], reverse=True)
        candidates.append((sent, left(scored[:5])))
    return candidates
Esempio n. 8
0
    def generate_instances(self, claim: Dict,
                           data_id_manager) -> List[Payload]:
        """Create one payload per (candidate perspective, passage) pair.

        When ``self.filter_good`` is set, only passages accepted by
        ``score_over_zero`` are used; otherwise every passage is used.

        Args:
            claim: Claim record; only ``'cId'`` and ``'text'`` are read.
            data_id_manager: ``assign`` is called once per payload with the
                claim id, perspective id and passage index.

        Returns:
            Payloads ordered by perspective, then by passage index.
        """
        cid = claim['cId']
        claim_text = claim['text']
        perspectives = self.candidate_perspective[cid]
        passages = self.cid_to_passages[cid]

        if self.filter_good:
            filter_condition = score_over_zero
        else:

            def filter_condition(dummy):
                return True

        good_passages: List[List[str]] = left(
            lfilter(filter_condition, passages))
        gold_clusters = self.gold[cid]

        output = []
        for pid in perspectives:
            is_correct = any(pid in cluster for cluster in gold_clusters)
            if not good_passages:
                # Matches the original: no passages means no text lookup.
                continue
            # The perspective text does not depend on the passage, so it is
            # fetched once per perspective instead of once per passage.
            perspective = perspective_getter(pid)
            for passage_idx, passage in enumerate(good_passages):
                info = {'cid': cid, 'pid': pid, 'passage_idx': passage_idx}
                output.append(
                    Payload(passage, claim_text, perspective,
                            data_id_manager.assign(info), is_correct))

        return output
Esempio n. 9
0
def load_prediction(
        data: EstimatorPredictionViewer) -> List[Tuple[str, List[float]]]:
    """Group the class-1 probabilities of all prediction entries by key.

    Each entry's key comes from ``input_tokens_to_key`` applied to its
    ``input_ids`` tokens; its score is index 1 of the softmax over its
    ``logits`` vector.

    Returns:
        ``(key, scores)`` tuples, one per key in the order given by
        ``unique_from_sorted``.
    """
    print("prediction has {} entry".format(data.data_len))

    def parse_entry(entry) -> Tuple[str, float]:
        input_tokens: Segment = entry.get_tokens('input_ids')
        probs = softmax(entry.get_vector("logits"))
        return input_tokens_to_key(input_tokens), probs[1]

    parsed_data: List[Tuple[str, float]] = lmap(parse_entry, data)

    keys: List[str] = unique_from_sorted(left(parsed_data))
    grouped: Dict[str, List[Tuple[str, float]]] = group_by(
        parsed_data, lambda pair: pair[0])

    def fetch_scores(key):
        scores = []
        for entry_key, score in grouped[key]:
            # Sanity check that group_by really grouped on the key.
            assert key == entry_key
            scores.append(score)
        return key, scores

    return lmap(fetch_scores, keys)
Esempio n. 10
0
def extract_predictions(score_d, split):
    """Turn scores into binary decisions for every scored claim.

    Candidates for ``split`` are restricted to claims that appear in
    ``score_d``. For each candidate perspective of such a claim the decision
    is 1 when its score is above 0.5 and 0 otherwise; a perspective missing
    from ``score_d`` is treated as score 0.

    The original computed and reported the filtered claims but then built
    decisions for the unfiltered list; the filtered list is now used, as in
    the other evaluation helpers.
    NOTE(review): confirm no caller relied on getting all-zero decisions for
    claims that have no score at all.

    Returns:
        A list of ``(cid, [(cid, pid, decision), ...])`` tuples.
    """
    candidates: List[Tuple[int, List[Dict]]] = get_eval_candidates_from_pickle(
        split)
    # Only claims that appear in score_d are evaluated.
    valid_cids: Set[int] = set(left(score_d.keys()))
    sub_candidates: List[Tuple[int, List[Dict]]] = lfilter(
        lambda x: x[0] in valid_cids, candidates)
    print("{} claims are evaluated".format(len(sub_candidates)))

    def make_decisions(e: Tuple[int, List[Dict]]):
        cid, p_list = e
        decisions = []
        for p in p_list:
            pid = int(p['pid'])
            score = score_d.get(CPIDPair((cid, pid)), 0)
            decisions.append((cid, pid, 1 if score > 0.5 else 0))
        return cid, decisions

    return lmap(make_decisions, sub_candidates)
Esempio n. 11
0
def load_prediction(pred_path) -> List[Tuple[str, List[np.ndarray]]]:
    """Load predictions from ``pred_path`` and group probabilities by key.

    Each entry's key comes from ``input_tokens_to_key`` applied to its
    ``input_ids`` tokens; its value is the softmax over its ``logits`` vector.

    Returns:
        ``(key, probability_vectors)`` tuples, one per key in the order given
        by ``unique_from_sorted``.
    """
    data = EstimatorPredictionViewer(pred_path)

    def parse_entry(entry) -> Tuple[str, np.ndarray]:
        input_tokens: Segment = entry.get_tokens('input_ids')
        probs = softmax(entry.get_vector("logits"))
        return input_tokens_to_key(input_tokens), probs

    parsed_data: List[Tuple[str, np.ndarray]] = lmap(parse_entry, data)

    keys: List[str] = unique_from_sorted(left(parsed_data))
    grouped: Dict[str, List[Tuple[str, np.ndarray]]] = group_by(
        parsed_data, lambda pair: pair[0])

    def fetch_scores(key):
        prob_vectors = []
        for entry_key, probs in grouped[key]:
            # Sanity check that group_by really grouped on the key.
            assert key == entry_key
            prob_vectors.append(probs)
        return key, prob_vectors

    return lmap(fetch_scores, keys)
Esempio n. 12
0
def qk_candidate_gen(q_res_path: str, doc_score_path, split,
                     config) -> List[Tuple[QCKQuery, List[KDP]]]:
    """Build (query, knowledge document parts) pairs from top-scored docs.

    For every query of ``split`` the scored entries of its job are sorted by
    their second element (descending), the first ``config['top_n']`` doc ids
    are preloaded, and the documents are cut into parts with
    ``config['window_size']`` and ``config['step_size']``.

    Args:
        q_res_path: Ranked-list file; only its query ids are used here, to map
            each query id to a job index (position in sorted order).
        doc_score_path: Location passed to ``load_doc_scores``.
        split: Selects the queries and the number of jobs.
        config: Must provide ``top_n``, ``window_size`` and ``step_size``.

    Returns:
        One ``(query, doc_part_list)`` tuple per query, in query order.
    """
    queries: List[QCKQuery] = get_qck_queries(split)
    num_jobs = d_n_claims_per_split2[split]
    score_d = load_doc_scores(doc_score_path, num_jobs)

    tprint("loading ranked list")
    ranked_list: Dict[
        str, List[SimpleRankedListEntry]] = load_galago_ranked_list(q_res_path)
    query_ids = sorted(ranked_list.keys())
    print("num queries", len(query_ids))
    q_id_to_job_id = {q_id: job_id for job_id, q_id in enumerate(query_ids)}
    print("Pre loading docs")
    top_n = config['top_n']
    out_qk: List[Tuple[QCKQuery, List[KnowledgeDocumentPart]]] = []

    ticker = TimeEstimator(len(queries))
    for q in queries:
        job_id: int = q_id_to_job_id[q.query_id]
        entries: List = score_d[job_id]
        # In-place sort, highest score first (as in the original).
        entries.sort(key=get_second, reverse=True)
        doc_ids = left(entries)[:top_n]
        preload_man.preload(TokenizedCluewebDoc, doc_ids)
        docs = iterate_docs(doc_ids)
        # NOTE(review): the meaning of the trailing 20 is not visible here.
        doc_part_list: List[KDP] = iterate_document_parts(
            docs, config['window_size'], config['step_size'], 20)

        out_qk.append((q, doc_part_list))
        ticker.tick()
    return out_qk
Esempio n. 13
0
def select_vertices_edges(counter) -> Tuple[Edges, List[Any]]:
    """Select frequent vertices and build normalized outgoing edge weights.

    A vertex is kept when its count from ``get_vertices_info`` exceeds 100
    and ``is_not_funct`` accepts it. Only pairs whose two ends are both kept
    contribute to the edges, and the weights leaving each vertex are divided
    by their total so they sum to 1.

    Args:
        counter: Mapping from ``(a, b)`` pairs to counts.

    Returns:
        ``(Edges(edges), valid_vertices)`` where ``valid_vertices`` is ordered
        by descending vertex count.
    """
    def is_not_funct(word):
        if len(word) > 2:
            return True

        # Substring test: words of up to two characters made of these
        # punctuation marks are rejected.
        return word not in ",.)(:'\"`-?''``,%"

    vertice_counter = get_vertices_info(counter)
    common_vertices = [(k, cnt) for k, cnt in vertice_counter.items()
                       if cnt > 100]
    common_vertices.sort(key=lambda x: x[1], reverse=True)
    valid_vertices: List[Any] = lfilter(is_not_funct, left(common_vertices))
    # A set makes the per-pair membership test constant-time; the list is
    # still what gets returned.
    valid_vertex_set = set(valid_vertices)

    unnormalized_edges: Dict[Any, Dict] = {}
    for (a, b), cnt in counter.items():
        if a not in valid_vertex_set or b not in valid_vertex_set:
            continue
        if a not in unnormalized_edges:
            unnormalized_edges[a] = Counter()
        unnormalized_edges[a][b] += cnt

    edges = {}
    for vertex_a, raw_edges in unnormalized_edges.items():
        total = sum(raw_edges.values())
        local_edges = Counter()
        for vertex_b, cnt in raw_edges.items():
            local_edges[vertex_b] = cnt / total
        edges[vertex_a] = local_edges
    return Edges(edges), valid_vertices
Esempio n. 14
0
def count_n_gram_grom_docs(docs, n, config, exclude_fn):
    """Count n-grams over all segments of all documents.

    Once more than one million distinct n-grams have been collected (checked
    after each document), the 10000 most frequent ones are selected and from
    then on only those are counted, so the table stops growing.

    NOTE(review): the original skipped n-grams that WERE in the selected set
    after pruning. That stops counting exactly the frequent n-grams and keeps
    adding new ones, which contradicts the pruning intent, so the condition
    is inverted here. Confirm against the expected output. Entries collected
    before pruning are still left in the returned counter.

    Args:
        docs: Sized iterable of documents; each document is an iterable of
            segments.
        n: N-gram length passed to ``ngrams``.
        config: Container; when ``MERGE_SUBWORD`` is in it, every segment is
            first passed through ``merge_subword``.
        exclude_fn: Predicate; n-grams for which it is true are not counted.

    Returns:
        Counter mapping n-gram to count.
    """
    count = Counter()
    tick = TimeEstimator(len(docs))

    top_k = 10000
    max_distinct = 1000 * 1000

    selected_ngram = set()
    after_pruning = False
    for doc_idx, doc in enumerate(docs):
        if doc_idx % 10000 == 0:
            print(doc_idx)
        tick.tick()
        for segment in doc:
            if MERGE_SUBWORD in config:
                segment = merge_subword(segment)
            assert isinstance(segment, list)
            for ngram_item in ngrams(segment, n):
                if after_pruning and ngram_item not in selected_ngram:
                    continue
                if exclude_fn(ngram_item):
                    continue
                count[ngram_item] += 1

        if not after_pruning and len(count) > max_distinct:
            print("Performing pruning")
            selected_ngram = set(left(count.most_common(top_k)))
            after_pruning = True

    return count
Esempio n. 15
0
    def generate_instances(self, claim: Dict,
                           data_id_manager) -> List[Payload]:
        """Create one payload per candidate perspective of the claim.

        Every payload carries ``left(passages)`` for the claim, the claim
        text, the perspective text, a data id assigned from
        ``{'cid', 'pid'}``, and whether the perspective id occurs in any gold
        cluster of the claim.
        """
        cid = claim['cId']
        claim_text = claim['text']
        perspectives = self.candidate_perspective[cid]
        passages = self.cid_to_passages[cid]

        def build_payload(pid):
            info = {'cid': cid, 'pid': pid}
            is_correct = any(pid in cluster for cluster in self.gold[cid])
            perspective = perspective_getter(pid)
            # A fresh passage list per payload, as in the original.
            passage_list = left(passages)
            return Payload(passage_list, claim_text, perspective,
                           data_id_manager.assign(info), is_correct)

        return [build_payload(pid) for pid in perspectives]
Esempio n. 16
0
def passage_to_lm(tokenizer, claim, passages: List[Tuple[List[str], float]], alpha):
    """Combine claim and passage term frequencies with ``smooth_ex``.

    Args:
        tokenizer: Must provide ``tokenize_stem``; applied to the claim text.
        claim: Claim record; only ``'text'`` is read.
        passages: ``(tokens, score)`` pairs; only the tokens are used.
        alpha: Forwarded to ``smooth_ex``.

    Returns:
        ``smooth_ex(claim_tf, passage_tf, alpha)``.
    """
    claim_tokens = tokenizer.tokenize_stem(claim['text'])
    passage_tf = tokens_to_freq(flatten(left(passages)))
    claim_tf = tokens_to_freq(claim_tokens)
    return smooth_ex(claim_tf, passage_tf, alpha)
Esempio n. 17
0
            def predict(doc):
                """Return the sum of ``term_odd`` over the tokens of ``doc``."""
                tokens = tokenizer(doc)

                # NOTE(review): this result is never used. The call is kept
                # only because get_tf10 may have side effects that are not
                # visible here; remove it once that is ruled out.
                top10 = left(list(self.get_tf10(tokens)))

                return sum(lmap(term_odd, tokens))
Esempio n. 18
0
    def __init__(self, max_sequence, vocab_filename, voca_size):
        """Load the topic scores, set up the encoders and the pair samplers.

        Args:
            max_sequence: Passed to ``EncoderUnit``.
            vocab_filename: Vocabulary file name relative to ``data_path``;
                the file must exist.
            voca_size: Stored as ``self.voca_size``.
        """
        self.train_data = None
        self.dev_data = None
        self.test_data = None

        voca_path = os.path.join(data_path, vocab_filename)
        assert os.path.exists(voca_path)
        print(voca_path)

        self.mscore = read_mscore_valid()
        self.mscore_dict = dict(self.mscore)
        self.train_topics, self.dev_topics = self.held_out(left(self.mscore))

        self.lower_case = True
        self.sep_char = "#"
        self.encoder = FullTokenizerWarpper(voca_path)
        self.voca_size = voca_size
        self.dev_explain = None
        self.encoder_unit = EncoderUnit(max_sequence, voca_path)
        self.client = TextReaderClient()

        class UniformSampler:
            """Draws two distinct topics uniformly; not used below."""

            def __init__(self, topics):
                self.sample_space = topics

            def sample(self):
                return random.sample(self.sample_space, 2)

        class BiasSampler:
            """Draws two topics from two different score buckets."""

            def __init__(self, topics, score_dict):
                self.sample_space = []
                self.sample_group = dict()

                for topic in topics:
                    # Bucket index on a logarithmic scale with base 1.1.
                    bucket = int(math.log(score_dict[topic] + 1, 1.1))
                    self.sample_group.setdefault(bucket, []).append(topic)

                self.sample_space = list(self.sample_group.keys())

            def sample(self):
                # Pick two distinct buckets, then one topic from each.
                g1, g2 = random.sample(self.sample_space, 2)
                first_group = self.sample_group[g1]
                t1 = first_group[random.randrange(len(first_group))]
                second_group = self.sample_group[g2]
                t2 = second_group[random.randrange(len(second_group))]
                return t1, t2

        self.train_sampler = BiasSampler(self.train_topics, self.mscore_dict)
        self.dev_sampler = BiasSampler(self.dev_topics, self.mscore_dict)
Esempio n. 19
0
def select_paragraph(
    docs: Dict[str, List[List[str]]],
    clue12_13_df,
    claim_list: List[Dict],
    strategy="topk",
) -> List[Tuple[str, List[List[str]]]]:
    """Pick the best-scoring paragraph of every document, per claim.

    For each claim, the paragraphs from ``enum_paragraph`` of each of its
    documents are scored with ``paragraph_scorer`` against the claim's query
    terms, and only the top paragraph per document is kept.

    Args:
        docs: Maps claim id (as a string) to that claim's documents.
        clue12_13_df: Document-frequency table indexed by term.
            NOTE(review): it is indexed even for terms failing the ``in``
            test, so it presumably returns a default for missing keys.
        claim_list: Claim records providing ``'cId'`` and ``'text'``.
        strategy: Currently ignored; kept so existing callers keep working.

    Returns:
        One ``(claim_id, selected_paragraphs)`` tuple per claim.
    """
    claim_id_to_text: Dict[int, str] = {
        c['cId']: c['text'] for c in claim_list
    }

    # Collection size used as the numerator of the idf formula.
    cdf = 50 * 1000 * 1000

    def idf(term: str):
        # Unknown terms that occur inside string.printable (a substring
        # test) get weight 0.
        if term not in clue12_13_df and term in string.printable:
            return 0
        return math.log((cdf + 0.5) / (clue12_13_df[term] + 0.5))

    r: List[Tuple[str, List[List[str]]]] = []
    ticker = TimeEstimator(len(docs))
    # The loop variable no longer shadows the ``docs`` parameter.
    for claim_id, claim_docs in docs.items():
        claim_text = claim_id_to_text[int(claim_id)]
        q_terms = set(re_tokenize(nltk.tokenize.word_tokenize(claim_text)))

        def scorer(para: List[str]) -> float:
            return paragraph_scorer(idf, q_terms, para)

        def get_best_per_doc(doc: List[str]) -> List[Tuple[List[str], float]]:
            paragraph_list: Iterable[List[str]] = enum_paragraph([doc])
            paragraph_scored_list: List[Tuple[List[str], float]] = \
                lmap_pairing(scorer, paragraph_list)
            paragraph_scored_list.sort(key=lambda x: x[1], reverse=True)
            # At most one paragraph; empty when the document yields none.
            return paragraph_scored_list[:1]

        selected: List[Tuple[List[str], float]] = list(
            flatten(lmap(get_best_per_doc, claim_docs)))

        r.append((claim_id, left(selected)))
        ticker.tick()

    return r
Esempio n. 20
0
def sample_kdps(qk_list: List[QKUnit]) -> List[QKUnit]:
    """Keep at most four randomly chosen KDPs for every query.

    Each list is copied before shuffling, so the caller's lists are no longer
    reordered in place.

    Args:
        qk_list: ``(query, kdp_list)`` pairs.

    Returns:
        New ``(query, sampled_kdp_list)`` pairs in the input order.
    """
    n = 4

    def sample(l: List[KDP]):
        shuffled = list(l)
        random.shuffle(shuffled)
        return shuffled[:n]

    return [(query, sample(kdp_list)) for query, kdp_list in qk_list]
Esempio n. 21
0
def get_scores(r: List[Tuple[int, int]]) -> Dict:
    """Compute accuracy, precision and recall from binary label pairs.

    Precision is divided by the sum of the first elements and recall by the
    sum of the second elements. Any metric whose denominator is zero is
    reported as 0 instead of raising ZeroDivisionError.

    Args:
        r: Pairs of 0/1 labels.

    Returns:
        Dict with the keys ``'accuracy'``, ``'precision'`` and ``'recall'``.
    """
    tp = sum(1 for a, b in r if a == b == 1)
    tn = sum(1 for a, b in r if a == b == 0)
    first_positives = sum(a for a, _ in r)
    second_positives = sum(b for _, b in r)

    accuracy = (tp + tn) / len(r) if r else 0
    precision = tp / first_positives if first_positives != 0 else 0
    recall = tp / second_positives if second_positives != 0 else 0

    return {'accuracy': accuracy, 'precision': precision, 'recall': recall}
Esempio n. 22
0
    def show(r: RelevanceModel):
        """Print the text, term frequencies and log-odds terms of ``r``.

        Log odds are the smoothed topic LM log values minus the background
        LM log values (``bg_lm`` and ``alpha`` come from the enclosing scope).
        """
        print('----')
        print(r.text)
        topic_log = get_lm_log(smooth(r.lm, bg_lm, alpha))
        background_log = get_lm_log(bg_lm)
        log_odd: Counter = subtract(topic_log, background_log)

        for term, freq in r.lm.most_common(50):
            print(term, freq)

        frequent_terms = "\t".join(left(r.lm.most_common(10)))
        print("LM freq: ", frequent_terms)
        print(frequent_terms)

        print("Log odd top", "\t".join(left(log_odd.most_common(30))))
        print("Log odd bottom", "\t".join(left(least_common(log_odd, 10))))
Esempio n. 23
0
    def show(claim_lm: ClaimLM):
        """Print the claim, term frequencies and log-odds terms of its LM.

        Log odds are the smoothed claim LM log values minus the background
        LM log values (``bg_lm`` and ``alpha`` come from the enclosing scope).
        """
        print('----')
        print(claim_lm.claim)
        topic_log = get_lm_log(smooth(claim_lm.LM, bg_lm, alpha))
        background_log = get_lm_log(bg_lm)
        log_odd: Counter = subtract(topic_log, background_log)

        for term, freq in claim_lm.LM.most_common(50):
            print(term, freq)

        frequent_terms = "\t".join(left(claim_lm.LM.most_common(10)))
        print("LM freq: ", frequent_terms)
        print(frequent_terms)

        print("Log odd top", "\t".join(left(log_odd.most_common(30))))
        print("Log odd bottom", "\t".join(left(least_common(log_odd, 10))))
Esempio n. 24
0
def guardian_generate(query):
    """Print frequent verbs, nouns and entities of the top articles.

    Sentences (each title plus the tokenized text) are collected from the
    200 top-ranked articles for ``query`` and shuffled. The first 10% are
    analysed, and the 100 most common verbs, nouns and entities are printed
    with ``list_print``.
    """
    article_d = {entry[0]: entry for entry in load_all_articles(query)}

    score_list = load_ranking(query)

    top_k = 200
    sents = []
    for article_id in get_top_ids(score_list, top_k):
        print(article_id)
        # Each article record has four fields; only title and text are used.
        _, title, _, text = article_d[article_id]
        sents.extend([title] + nltk.sent_tokenize(text))

    verbs_all = Counter()
    nouns_all = Counter()
    entities_all = Counter()
    print("POS tagging...")
    shuffle(sents)
    # Only a random tenth of the sentences is analysed.
    size_small = int(len(sents)*0.1)
    ticker = TimeEstimator(size_small)
    for sent in sents[:size_small]:
        verbs, nouns = get_verb_nouns(sent)
        nouns_all.update(nouns)
        verbs_all.update(verbs)
        entities_all.update(get_entities(sent))
        ticker.tick()

    report = (
        ("Verbs", verbs_all),
        ("Nouns", nouns_all),
        ("Entities", entities_all),
    )
    top_lists = [(title, left(counts.most_common(100)))
                 for title, counts in report]
    for title, top_terms in top_lists:
        print(title)
        list_print(top_terms, 10)
Esempio n. 25
0
    def high_idf_q_terms(self, q_tf, n_limit=10):
        """Return the ``n_limit`` query terms with the highest query weight.

        Each term is weighted by ``BM25_3_q_weight`` from its query frequency,
        its document frequency in ``self.df`` and a fixed collection size.

        Args:
            q_tf: Mapping from query term to query frequency.
            n_limit: Number of terms to keep.

        Returns:
            Set of the selected terms.
        """
        total_doc = 11503029 + 100

        weights = Counter()
        for term, qf in q_tf.items():
            weights[term] = BM25_3_q_weight(qf, self.df[term], total_doc)

        top_weighted = weights.most_common(n_limit)
        return set(left(top_weighted))
Esempio n. 26
0
def load_passage_dict(todo, passage_qrels):
    """Fetch and pickle the passages judged with a truthy score.

    Args:
        todo: Pairs whose first elements are query ids.
        passage_qrels: Maps query id to a ``{passage_id: score}`` mapping.

    Returns:
        The result of ``get_passage_dict``, also saved under the pickle name
        ``msmarco_passage_doc_analyze_passage_dict``.
    """
    passage_ids_to_find = [
        passage_id
        for qid in left(todo)
        for passage_id, score in passage_qrels[qid].items()
        if score
    ]
    passage_dict = get_passage_dict(passage_ids_to_find)
    save_to_pickle(passage_dict, "msmarco_passage_doc_analyze_passage_dict")
    return passage_dict
Esempio n. 27
0
def build_match_tree():
    """Build a ``MatchTree`` from the pickled selected words and pickle it.

    Reads ``nli_dev_selected_words``, adds the first element of every entry
    to the tree as a sequence, and saves the tree as ``match_tree_nli_dev``.
    """
    selected_words = load_from_pickle("nli_dev_selected_words")

    tree = MatchTree()
    for seq in left(selected_words):
        tree.add_seq(seq)

    save_to_pickle(tree, "match_tree_nli_dev")
Esempio n. 28
0
    def high_idf_q_terms(self, q_tf, n_limit=10):
        """Return the ``n_limit`` query terms with the highest query weight.

        Each term is weighted by ``BM25_3_q_weight`` from its query frequency,
        the length of its posting list and ``self.total_doc_n``.

        Args:
            q_tf: Mapping from query term to query frequency.
            n_limit: Number of terms to keep.

        Returns:
            Set of the selected terms.
        """
        total_doc = self.total_doc_n

        weights = Counter()
        for term, qf in q_tf.items():
            # Document frequency is the posting-list length.
            doc_freq = len(self.get_posting(term))
            weights[term] = BM25_3_q_weight(qf, doc_freq, total_doc)

        top_weighted = weights.most_common(n_limit)
        return set(left(top_weighted))
Esempio n. 29
0
    def generate_instances(
            self, claim: Dict,
            data_id_manager: DataIDManager) -> List[PairedInstance]:
        """Build paired passage instances for one claim.

        Three groups of passage pairs are produced, with these
        ``(strict_good, strict_bad)`` flags:

        * output of ``generate_pairwise_combinations(not_good, good, True)``
          with flags (1, 0);
        * each not-good passage with a random passage, flags (0, 1);
        * each good passage with a random passage, flags (1, 1).

        Args:
            claim: Claim record; only ``'cId'`` and ``'text'`` are read.
            data_id_manager: ``assign`` is called once per instance with
                ``{'cid': cid}``.

        Returns:
            The instances of the three groups, concatenated in the order
            listed above.
        """
        cid = claim['cId']
        # Separate name so the ``claim`` dict parameter is not rebound to a str.
        claim_text = claim['text']

        passages = self.cid_to_passages[cid]
        good_passages: List[List[str]] = left(
            lfilter(score_over_zero, passages))
        not_good_passages: List[List[str]] = left(
            lfilter_not(score_over_zero, passages))

        pair_list_g_ng: List[Tuple[
            List[str], List[str]]] = generate_pairwise_combinations(
                not_good_passages, good_passages, True)
        # One pair per good passage. Random passages are drawn for the good
        # passages first, then for the not-good ones.
        pair_list_g_rand: List[Tuple[List[str], List[str]]] = [
            (inst, self.random_sample(cid)) for inst in good_passages
        ]
        # One pair per not-good passage.
        pair_list_ng_rand: List[Tuple[List[str], List[str]]] = [
            (inst, self.random_sample(cid)) for inst in not_good_passages
        ]

        def make_instance(passage_pair, strict_good, strict_bad):
            passage_good, passage_worse = passage_pair
            info = {'cid': cid}
            return PairedInstance(claim_text, passage_good, passage_worse,
                                  strict_good, strict_bad,
                                  data_id_manager.assign(info))

        l1 = lmap(lambda pair: make_instance(pair, 1, 0), pair_list_g_ng)
        l2 = lmap(lambda pair: make_instance(pair, 0, 1), pair_list_ng_rand)
        l3 = lmap(lambda pair: make_instance(pair, 1, 1), pair_list_g_rand)
        print("g-ng : ng-rank : g-rand = {} : {} : {}".format(
            len(l1), len(l2), len(l3)))
        return l1 + l2 + l3
Esempio n. 30
0
def load_n_1_gram_set(topic, n):
    """Return the most frequent (n-1)-grams stored for ``topic``.

    For ``n == 1`` there is no shorter n-gram, so the set is empty.
    Otherwise the counts loaded by ``load_n_gram_from_pickle(topic, n - 1)``
    are sorted by frequency and the first ``10000 * 100 ** (n - 1)`` keys are
    returned.

    Args:
        topic: Passed to ``load_n_gram_from_pickle``.
        n: N-gram length of the caller; the (n-1)-gram counts are loaded.

    Returns:
        Set of the selected (n-1)-grams.
    """
    if n == 1:
        return set()

    count = load_n_gram_from_pickle(topic, n - 1)
    entries = sorted(count.items(), key=lambda x: x[1], reverse=True)
    # The limit grows by a factor of 100 per extra n-gram position.
    top_k = 10000 * 100 ** max(n - 1, 0)
    if entries:
        # Debug output of the most frequent entry; skipped when the count is
        # empty, which used to raise IndexError.
        print(entries[0])
    return {ngram for ngram, _ in entries[:top_k]}