Example #1
    def create_instances_from_documents(self, documents):
        documents = [doc for doc in documents if doc]
        max_num_tokens = self.max_seq_length - 3
        target_seq_length = max_num_tokens

        docs_as_chunks, target_inst_num = self.pool_chunks_from_docs(
            documents, target_seq_length)

        instances = []
        for _ in range(target_inst_num):
            chunk_1 = pick1(pick1(docs_as_chunks))

            m = self.rng.randint(1, len(chunk_1))
            tokens_a = flatten(chunk_1[:m])
            b_length = target_seq_length - len(tokens_a)
            if self.rng.random() < 0.5:
                chunk_2 = pick1(pick1(docs_as_chunks))
                tokens_b = flatten(chunk_2)[:b_length]
            else:
                tokens_b = flatten(chunk_1[m:])[:b_length]
            truncate_seq_pair(tokens_a, tokens_b, target_seq_length, self.rng)

            tokens, segment_ids = format_tokens_pair_n_segid(
                tokens_a, tokens_b)
            instance = SegmentInstance(tokens=tokens, segment_ids=segment_ids)
            instances.append(instance)

        return instances
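The examples in this listing rely on a handful of small helpers (flatten, pick1, lmap, left, foreach) whose definitions are not included. Below is a minimal sketch of what they are assumed to do, inferred only from how the examples call them; these are hypothetical reimplementations, not the project's actual code.

import random
from itertools import chain
from typing import Callable, Iterable, List, Sequence, Tuple, TypeVar

A = TypeVar("A")
B = TypeVar("B")


def flatten(list_of_lists: Iterable[Iterable[A]]) -> List[A]:
    # Assumed to return a flat list, since several callers take len() of the result or slice it.
    return list(chain.from_iterable(list_of_lists))


def pick1(items: Sequence[A]) -> A:
    # Assumed to draw one element uniformly at random.
    return random.choice(items)


def lmap(fn: Callable[[A], B], items: Iterable[A]) -> List[B]:
    # Shorthand for list(map(fn, items)).
    return list(map(fn, items))


def left(pairs: Iterable[Tuple[A, B]]) -> List[A]:
    # First elements of an iterable of pairs.
    return [a for a, _ in pairs]


def foreach(fn: Callable[[A], None], items: Iterable[A]) -> None:
    # Apply fn to each item for its side effects.
    for item in items:
        fn(item)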
Example #2
    def different_claim() -> Iterator[Tuple[int, int]]:
        for cid1, cid2 in combinations(ids, 2):
            clusters1 = id_dict[cid1]
            clusters2 = id_dict[cid2]
            for p1 in flatten(clusters1):
                for p2 in flatten(clusters2):
                    yield p1, p2
Example #3
    def tree2seq(node):
        if not node.children():
            return [node.name]

        left = [tree2seq(c) for c in node.children_left]
        right = [tree2seq(c) for c in node.children_right]
        seq = flatten(left) + [node.name] + flatten(right)
        return seq
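tree2seq linearizes a tree in an in-order fashion: left subtrees, then the node's own name, then right subtrees. Below is a small usage sketch with a hypothetical Node class that provides the interface tree2seq expects (a children() method plus children_left and children_right lists); it assumes tree2seq is reachable as a plain function.

class Node:
    # Hypothetical stand-in for the node type used by tree2seq.
    def __init__(self, name, children_left=(), children_right=()):
        self.name = name
        self.children_left = list(children_left)
        self.children_right = list(children_right)

    def children(self):
        return self.children_left + self.children_right


# tree2seq(Node("root", children_left=[Node("a"), Node("b")], children_right=[Node("c")]))
# -> ['a', 'b', 'root', 'c']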
Example #4
def syntactic_parsing_method(article, comments):
    all_texts = article + comments
    all_tokens = list([tokenize(t, set()) for t in all_texts])
    tr = TextRank(all_tokens)
    r = tr.run(flatten(all_tokens))
    r = generate(all_texts, r)
    print(r)
Example #5
    def select_paragraph_from_datapoint(x: TPDataPoint) -> ParagraphFeature:
        try:
            ranked_docs: List[SimpleRankedListEntry] = ci.fetch_from_q_res_id(dp_id_to_q_res_id_fn(x.id))
            ranked_docs = ranked_docs[:100]
        except KeyError:
            ranked_docs = []

        paragraph_scorer_local: Callable[[Paragraph], ScoreParagraph] = paragraph_scorer_factory(x)
        #  prefetch tokens and bert tokens
        doc_ids = lmap(lambda x: x.doc_id, ranked_docs)
        preload_man.preload(TokenizedCluewebDoc, doc_ids)
        preload_man.preload(BertTokenizedCluewebDoc, doc_ids)

        def get_best_paragraph_from_doc(doc: SimpleRankedListEntry) -> List[ScoreParagraph]:
            paragraph_list = paragraph_iterator(doc)
            score_paragraph = lmap(paragraph_scorer_local, paragraph_list)
            score_paragraph.sort(key=lambda p: p.score, reverse=True)
            return score_paragraph[:1]

        def get_all_paragraph_from_doc(doc: SimpleRankedListEntry) -> List[ScoreParagraph]:
            paragraph_list = paragraph_iterator(doc)
            score_paragraph = lmap(paragraph_scorer_local, paragraph_list)
            return score_paragraph

        if option.para_per_doc == ONE_PARA_PER_DOC:
            get_paragraphs = get_best_paragraph_from_doc
        else:
            get_paragraphs = get_all_paragraph_from_doc

        candidate_paragraph: List[ScoreParagraph] = list(flatten(lmap(get_paragraphs, ranked_docs)))
        candidate_paragraph.sort(key=lambda x: x.score, reverse=True)
        candidate_paragraph = remove_duplicate(candidate_paragraph)

        return ParagraphFeature(datapoint=x,
                                feature=candidate_paragraph[:n_passages])
Example #6
def main():
    run_config = json.load(open(sys.argv[1], "r"))

    l1: Dict[str, List[TrecRankedListEntry]] = load_ranked_list_grouped(run_config['first_list'])
    l2: Dict[str, List[TrecRankedListEntry]] = load_ranked_list_grouped(run_config['second_list'])
    run_name = run_config['run_name']
    strategy = run_config['strategy']
    save_path = run_config['save_path']
    k1 = run_config['k1']
    k2 = run_config['k2']
    new_entries: Dict[str, List[TrecRankedListEntry]] = l1

    qid_list = l1.keys()
    for key in l2:
        if key not in qid_list:
            print("WARNING qid {} is not in the first list".format(key))

    for qid in qid_list:
        if qid not in l2:
            new_entries[qid] = l1[qid]
        else:
            entries1 = l1[qid]
            entries2 = l2[qid]
            if strategy == "reciprocal":
                fused_scores = reciprocal_fusion(entries1, entries2, k1, k2)
            elif strategy == "weighted_sum":
                fused_scores = weighted_sum_fusion(entries1, entries2, k1, k2)
            else:
                assert False
            new_entries[qid] = scores_to_ranked_list_entries(fused_scores, run_name, qid)

    flat_entries: Iterable[TrecRankedListEntry] = flatten(new_entries.values())
    write_trec_ranked_list_entry(flat_entries, save_path)
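reciprocal_fusion and weighted_sum_fusion are not shown. The sketch below is a standard reciprocal-rank-fusion scorer consistent with how reciprocal_fusion is called here, treating k1 and k2 as per-list RRF constants and assuming each entry carries doc_id and rank fields; it is a hypothetical implementation, not necessarily the project's.

from collections import Counter
from typing import Dict, List


def reciprocal_fusion_sketch(entries1: List, entries2: List, k1: float, k2: float) -> Dict[str, float]:
    # Reciprocal rank fusion: score(d) = sum over lists of 1 / (k + rank(d)).
    fused = Counter()
    for e in entries1:
        fused[e.doc_id] += 1.0 / (k1 + e.rank)
    for e in entries2:
        fused[e.doc_id] += 1.0 / (k2 + e.rank)
    return dict(fused)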
Example #7
def main():
    input_path = sys.argv[1]
    save_path = sys.argv[2]
    l1: Dict[str, List[TrecRankedListEntry]] = load_ranked_list_grouped(input_path)

    new_entries: Dict[str, List[TrecRankedListEntry]] = {}
    run_name = "Reverse"

    for qid, ranked_list in l1.items():
        raw_ranked_list = []
        for e in ranked_list:
            score = 1 - e.score
            raw_e = (e.query_id, e.doc_id, score)
            raw_ranked_list.append(raw_e)

        raw_ranked_list.sort(key=lambda x: x[2], reverse=True)

        new_ranked_list = []
        for rank, e in enumerate(raw_ranked_list):
            query_id, doc_id, score = e
            e_new = TrecRankedListEntry(query_id, doc_id, rank, score, run_name)
            new_ranked_list.append(e_new)

        new_entries[qid] = new_ranked_list
    flat_entries: Iterable[TrecRankedListEntry] = flatten(new_entries.values())
    write_trec_ranked_list_entry(flat_entries, save_path)
Example #8
    def summarize(self):
        topic = data_generator.argmining.ukp_header.all_topics[0]
        data_loader = ukp.DataLoader(topic)
        stopwords = load_stopwords()

        def tokenize(x):
            return tokenizer.tokenize(x, stopwords)

        def sent_score(token_sent, bow_score):
            score = 0
            factor = 1
            for t in token_sent:
                score += bow_score[t] * factor
                factor *= 0.5
            return score

        def is_argument(entry):
            return entry['annotation'] == "Argument_for" or entry[
                'annotation'] == "Argument_against"

        for topic in data_generator.argmining.ukp_header.all_topics:
            entries = data_loader.all_data[topic]
            raw_sents = list(
                [e['sentence'] for e in entries if e['set'] == 'train'])
            token_sents = list(map(tokenize, raw_sents))
            tprint("Runing TextRank")
            text_rank = TextRank(token_sents)
            tr_score = Counter(text_rank.run(flatten(token_sents)))
            tprint("claim_gen.generate")

            raw_sents.sort(key=lambda x: sent_score(tokenize(x), tr_score),
                           reverse=True)
            for i in range(10):
                print(raw_sents[i])
Example #9
def convert_alt_emb(source_path, output_path, seq_set: List[List[int]]):
    all_tokens: Set[int] = set(flatten(seq_set))
    min_overlap = min([len(set(tokens)) for tokens in seq_set])

    def feature_transformer(feature):
        new_features = collections.OrderedDict()
        success = False
        for key in feature:
            v = take(feature[key])
            if key == "input_ids":
                alt_emb_mask = [0] * len(v)
                s = set(v)
                if len(s.intersection(all_tokens)) >= min_overlap:
                    for word in seq_set:
                        pre_match = 0
                        for i in range(len(v)):
                            if v[i] == word[pre_match]:
                                pre_match += 1
                            else:
                                pre_match = 0
                            if pre_match == len(word):
                                pre_match = 0
                                for j in range(i - len(word) + 1, i + 1):
                                    alt_emb_mask[j] = 1
                                    success = True
                new_features["alt_emb_mask"] = create_int_feature(alt_emb_mask)
            new_features[key] = create_int_feature(v)

        if success:
            return new_features
        else:
            return None

    return tfrecord_convertor_with_none(source_path, output_path,
                                        feature_transformer)
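The nested loop above marks, in alt_emb_mask, every position of input_ids that falls inside an occurrence of one of the token sequences in seq_set. The same step is shown below as a standalone helper using a plain sliding-window comparison, which may be easier to follow than the incremental pre_match counter; this is a sketch, not the code used above.

from typing import List


def mark_subsequences(input_ids: List[int], word: List[int]) -> List[int]:
    # Returns a 0/1 mask with 1 at every position covered by an occurrence of `word`.
    mask = [0] * len(input_ids)
    n = len(word)
    for start in range(len(input_ids) - n + 1):
        if input_ids[start:start + n] == word:
            mask[start:start + n] = [1] * n
    return mask


# mark_subsequences([5, 7, 8, 9, 7, 8], [7, 8]) -> [0, 1, 1, 0, 1, 1]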
Example #10
def debug_failture(predictions):
    gold = get_claim_perspective_id_dict()
    ap_list = []
    for c_Id, prediction_list in predictions:
        gold_pids = gold[c_Id]
        gold_pids_set: Set[int] = set(flatten(gold_pids))
        claim_text = prediction_list[0]['claim_text']
        print("Claim {}: ".format(c_Id), claim_text)
        correctness_list = lmap(lambda p: p['pid'] in gold_pids_set,
                                prediction_list)
        ap = get_ap(prediction_list, gold_pids, False)

        if not any(correctness_list):  # all wrong
            continue

        if ap > 0.9:
            continue

        def print_line(prediction):
            pid = prediction['pid']
            correct = pid in gold_pids_set
            if correct:
                correct_str = "Y"
            else:
                correct_str = "N"

            score = prediction['score']
            print(correct_str, score, score.name,
                  prediction['perspective_text'])

        foreach(print_line, prediction_list)
        ap_list.append(ap)

    map = average(ap_list)
    return {'map': map}
Example #11
    def pool_tokens(self, sent_list, target_seq_length, skip=False):
        results = []
        current_chunk = []
        current_length = 0
        i = 0
        if skip:
            i = i + self.rng.randint(0, 3)

        while i < len(sent_list):
            segment = sent_list[i]
            current_chunk.append(segment)
            current_length += len(segment)
            if i == len(sent_list) - 1 or current_length >= target_seq_length:
                tokens_a = flatten(current_chunk)
                tokens_a = tokens_a[:target_seq_length]
                results.append(tokens_a)
                current_chunk = []
                current_length = 0
                if skip:
                    i = i + self.rng.randint(0, 3)
            i += 1

        self.all_doc_cnt += 1
        if len(results) == 1:
            if len(results[0]) < target_seq_length * 0.5:
                self.short_doc_cnt += 1

        return results
Example #12
def passage_to_lm(tokenizer, claim, passages: List[Tuple[List[str], float]], alpha):
    claim_text = claim['text']
    claim_tokens = tokenizer.tokenize_stem(claim_text)

    tf = tokens_to_freq(flatten(left(passages)))
    c_tf = tokens_to_freq(claim_tokens)
    r_tf = smooth_ex(c_tf, tf, alpha)
    return r_tf
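smooth_ex is not shown; from the call it presumably blends the claim term distribution with the passage term distribution. A sketch of simple linear (Jelinek-Mercer style) interpolation under that assumption, with the exact role of alpha being a guess:

from collections import Counter
from typing import Dict


def smooth_ex_sketch(c_tf: Dict[str, float], tf: Dict[str, float], alpha: float) -> Counter:
    # Hypothetical: weight the claim model by alpha and the passage model by (1 - alpha).
    out = Counter()
    for term in set(c_tf) | set(tf):
        out[term] = alpha * c_tf.get(term, 0.0) + (1 - alpha) * tf.get(term, 0.0)
    return out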
Example #13
def get_claim_perspective_label_dict() -> Dict[CPIDPair, int]:
    gold = get_claim_perspective_id_dict()
    d = defaultdict(int)
    for cid, pid_list_list in gold.items():
        for pid in flatten(pid_list_list):
            cpid_pair = CPIDPair((cid, pid))
            d[cpid_pair] = 1
    return d
Example #14
def get_trec_relevance_judgement() -> Iterable[TrecRelevanceJudgementEntry]:
    gold: Dict[int, List[List[int]]] = get_claim_perspective_id_dict()
    for cid, clusters in gold.items():
        query_id = str(cid)
        pids = set(flatten(clusters))
        for pid in pids:
            e = TrecRelevanceJudgementEntry(query_id, str(pid), 1)
            yield e
Example #15
def get_term_importance(bm25_module, sents):
    tokens = flatten([bm25_module.tokenizer.tokenize_stem(s) for s in sents])

    q_tf = Counter(tokens)
    term_importance = Counter()
    for term, tf in q_tf.items():
        term_importance[term] += bm25_module.term_idf_factor(term) * tf
    return term_importance
Example #16
def select_paragraph(
    docs: Dict[str, List[List[str]]],
    clue12_13_df,
    claim_list: List[Dict],
    strategy="topk",
) -> List[Tuple[str, List[List[str]]]]:

    claim_id_to_text: Dict[int,
                           str] = {c['cId']: c['text']
                                   for c in claim_list}

    cdf = 50 * 1000 * 1000
    top_k = 100
    not_found_set = set()

    def idf(term: str):
        if term not in clue12_13_df:
            if term in string.printable:
                return 0
            not_found_set.add(term)

        return math.log((cdf + 0.5) / (clue12_13_df[term] + 0.5))

    r: List[Tuple[str, List[List[str]]]] = []
    ticker = TimeEstimator(len(docs))
    for claim_id, docs in docs.items():
        claim_text = claim_id_to_text[int(claim_id)]
        q_terms = set(re_tokenize(nltk.tokenize.word_tokenize(claim_text)))

        def scorer(para: List[str]) -> float:
            return paragraph_scorer(idf, q_terms, para)

        max_score = sum(lmap(idf, q_terms))

        def get_best_per_doc(doc: List[str]) -> List[Tuple[List[str], float]]:
            paragraph_list: Iterable[List[str]] = enum_paragraph([doc])
            paragraph_scored_list: List[Tuple[List[str],
                                              float]] = lmap_pairing(
                                                  scorer, paragraph_list)
            paragraph_scored_list.sort(key=lambda x: x[1], reverse=True)
            return paragraph_scored_list[:1]

        selected: List[Tuple[List[str], float]] = list(
            flatten(lmap(get_best_per_doc, docs)))

        # if strategy == "topk":
        #     selected: List[Tuple[List[str], float]] = paragraph_scored_list[:top_k]
        # elif strategy == "cutoff":
        #     cut_off = max_score * 0.6
        #     selected: List[Tuple[List[str], float]] = lfilter(lambda x: x[1] > cut_off, paragraph_scored_list)
        # else:
        #     assert False

        e = claim_id, left(selected)
        r.append(e)
        ticker.tick()

    return r
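paragraph_scorer is not shown, but max_score = sum(lmap(idf, q_terms)) suggests it sums the IDF of the query terms that occur in a paragraph, so that max_score is reached when every query term appears at least once. A sketch under that assumption:

from typing import Callable, List, Set


def paragraph_scorer_sketch(idf: Callable[[str], float],
                            q_terms: Set[str],
                            paragraph: List[str]) -> float:
    # Total IDF of the distinct query terms present in the paragraph.
    return sum(idf(t) for t in q_terms.intersection(paragraph))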
Example #17
def build_voca(data):
    short_desc_list = lmap(lambda x: x['short_desc'], data)
    all_text = tokenize(short_desc_list)
    voca = set(flatten(all_text))
    n_output_voca = len(voca)
    word2idx = {}
    for idx, word in enumerate(list(voca)):
        word2idx[word] = idx
    return n_output_voca, word2idx
Example #18
def init_token_voca():
    topic = "atheism"
    setting = SimpleTokner(topic)

    stance_text = stance_detection.get_train_text()
    token_list = list([l.split() for l in stance_text])
    print(token_list[:20])
    encoder = TokenTextEncoder(None, vocab_list=flatten(token_list))
    encoder.store_to_file(setting.vocab_filename)
Example #19
def work():
    data = load_all(r"C:\work\Data\controversy_tweets\census")
    all_texts = flatten(data.values())
    print("all text:", len(all_texts))
    uniq_texts = set(all_texts)
    print("unique text:", len(uniq_texts))
    uniq_texts = near_duplicate_deletion(uniq_texts)
    print("unique text:", len(uniq_texts))
    for t in uniq_texts:
        print(t.strip())
Example #20
def main():
    first_list_path = sys.argv[1]
    l: Dict[str, List[TrecRankedListEntry]] = load_ranked_list_grouped(
        first_list_path)

    new_entries: Dict[str, List[TrecRankedListEntry]] = l

    flat_entries: Iterable[TrecRankedListEntry] = flatten(new_entries.values())
    doc_ids = list(set([e.doc_id for e in flat_entries]))
    urls_d = get_urls(doc_ids)
    save_to_pickle(urls_d, "urls_d")
Example #21
def generate_instances(claim_passages_list: Iterable[ClaimPassages],
                       data_id_manager: DataIDManager) -> Iterable[QKInstance]:
    def convert(pair: ClaimPassages) -> Iterable[QKInstance]:
        claim, passages = pair
        cid = claim['cId']
        query_text = claim['text']
        for passage_idx, (passage, dummy_score) in enumerate(passages):
            info = {'cid': cid, 'passage_idx': passage_idx}
            yield QKInstance(query_text, passage, data_id_manager.assign(info))

    return flatten(map(convert, claim_passages_list))
Example #22
def preload_docs(ranked_list, claims, top_n):
    def get_doc_ids(claim: Dict):
        # Find the q_res
        q_res: List[SimpleRankedListEntry] = ranked_list[str(claim['cId'])]
        return list([q_res[i].doc_id for i in range(top_n)])

    all_doc_ids: Set[str] = set(flatten(lmap(get_doc_ids, claims)))
    print(f"total of {len(all_doc_ids)} docs")
    print("Accessing DB")
    #  Get the doc from DB
    preload_man.preload(TokenizedCluewebDoc, all_doc_ids)
Example #23
def main(prefix1, prefix2):
    topic = "abortion"
    tfrecord_path = "./data/ukp_tfrecord/dev_" + topic
    tfrecord = list(load_tfrecord(tfrecord_path))

    get_correctness_arr_fn = partial(get_correctness_arr, tfrecord)

    prediction_list_1 = list(get_existing_predictions(prefix1, topic))
    prediction_list_2 = list(get_existing_predictions(prefix2, topic))

    num_runs = min(len(prediction_list_1), len(prediction_list_2))
    prediction_list_1 = prediction_list_1[:num_runs]
    prediction_list_2 = prediction_list_2[:num_runs]

    c1 = flatten(lmap(get_correctness_arr_fn, prediction_list_1))
    c2 = flatten(lmap(get_correctness_arr_fn, prediction_list_2))

    print(len(c1))
    print(len(c2))

    _, p_value = stats.ttest_rel(c1, c2)
    print(p_value)
Example #24
    def generate(
        self,
        kc_candidate: Iterable[QKUnit],
        data_id_manager: DataIDManager,
    ) -> Iterable[QKInstance]:
        def convert(pair: Tuple[QCKQuery, List[KDP]]) -> Iterable[QKInstance]:
            query, passages = pair
            for passage in passages:
                info = {'query': query, 'kdp': passage}
                yield QKInstance(query.text, passage.tokens,
                                 data_id_manager.assign(info),
                                 self._is_correct(query, passage))

        return flatten(lmap(convert, kc_candidate))
Example #25
def build_df():
    claims, val = train_split()
    gold = get_claim_perspective_id_dict()

    tokenizer = PCTokenizer()
    df = Counter()

    dl_list = []
    for claim in claims:
        cid = claim["cId"]
        gold_pids = flatten(gold[cid])
        p_text_list: List[str] = lmap(perspective_getter, gold_pids)
        tokens_list = lmap(tokenizer.tokenize_stem, p_text_list)
        dl_list.extend(lmap(len, tokens_list))

        for t in set(flatten(tokens_list)):
            df[t] += 1

    print(dl_list)
    print("Avdl", average(dl_list))
    print(len(claims))
    print(df.most_common(30))
    save_to_pickle(df, "pc_df")
Example #26
def save_way_back_fetch():
    all_url = flatten(parse_all_urls())
    wayback_dict = {}
    for url in all_url:
        print(url)
        prefix = "http://archive.org/wayback/available?url="
        ret = requests.get(prefix + url)
        if ret.status_code != 200:
            print(ret.status_code)
            break
        else:
            wayback_dict[url] = ret.content

    pickle.dump(wayback_dict, open(way_back_save_path, "wb"))
Example #27
def pool_tokens(rng,
                sent_list: List[List[Token]],
                target_seq_length,
                skip=False) -> List[List[Token]]:
    results: List[List[Token]] = []
    current_chunk = []
    current_length = 0
    i = 0
    if skip:
        i = i + rng.randint(0, 3)

    def is_new_doc(segment):
        return 'isbn' in segment

    num_real_doc = 1
    while i < len(sent_list):
        segment: List[Token] = sent_list[i]
        if is_new_doc(segment):
            num_real_doc += 1
            tokens_a: List[Token] = list(flatten(current_chunk))
            tokens_a = tokens_a[:target_seq_length]
            results.append(tokens_a)
            current_chunk: List[List[Token]] = []
            current_length = 0

        current_chunk.append(segment)
        current_length += len(segment)
        if i == len(sent_list) - 1 or current_length >= target_seq_length:
            tokens_a = list(flatten(current_chunk))
            tokens_a = tokens_a[:target_seq_length]
            results.append(tokens_a)
            current_chunk = []
            current_length = 0
            if skip:
                i = i + rng.randint(0, 3)
        i += 1
    return results
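A small usage sketch of pool_tokens above, assuming tokens are plain strings and passing a seeded random.Random; the expected output follows from the flushing logic (a chunk is emitted once it reaches target_seq_length or the input ends, and any sentence containing "isbn" is treated as a document boundary that flushes the current chunk first).

import random

sents = [["a", "b"], ["c", "d", "e"], ["f", "g"]]
chunks = pool_tokens(random.Random(0), sents, target_seq_length=4)
# chunks == [['a', 'b', 'c', 'd'], ['f', 'g']]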
Example #28
    def create_instances(self, topic, raw_docs, labeled_data):
        # Format: [CLS] [Abortion] [LABEL_FAVOR] ...(ukp text)...[SEP] [ABORTION] [LABEL_UNK] ..(clue text).. [SEP]
        topic_tokens = self.tokenizer.tokenize(topic.replace("_", " "))
        # TODO iterate docs, pool chunk
        # randomly draw and sometimes insert labeled one
        # encode and add to instances
        max_num_tokens = self.max_seq_length - 3 - 2 - 2 * len(topic_tokens)
        target_seq_length = max_num_tokens
        docs_as_chunks, target_inst_num = self.pool_chunks_from_docs(
            raw_docs, target_seq_length)

        instances = []
        for _ in range(target_inst_num):
            chunk_1 = pick1(pick1(docs_as_chunks))

            m = self.rng.randint(1, len(chunk_1))
            tokens_a = flatten(chunk_1[:m])
            b_length = target_seq_length - len(tokens_a)
            if self.rng.random() < self.ratio_labeled and labeled_data:
                label, tokens_b = pick1(labeled_data)
            else:
                if self.rng.random() < 0.5:
                    chunk_2 = pick1(pick1(docs_as_chunks))
                    tokens_b = flatten(chunk_2)[:b_length]
                else:
                    tokens_b = flatten(chunk_1[m:])[:b_length]
                label = -1
            truncate_seq_pair(tokens_a, tokens_b, target_seq_length, self.rng)

            swap = self.rng.random() < 0.5

            tokens, segment_ids = encode_label_and_token_pair(
                topic_tokens, label, tokens_b, tokens_a, swap)
            instance = SegmentInstance(tokens=tokens, segment_ids=segment_ids)
            instances.append(instance)

        return instances
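encode_label_and_token_pair is not shown. One plausible reading follows from the format comment at the top of create_instances and from the token budget max_seq_length - 3 - 2 - 2 * len(topic_tokens) (three special tokens, two label tokens, the topic repeated twice). The label vocabulary and the way swap is applied are guesses, so treat this purely as a sketch.

from typing import List, Tuple


def encode_label_and_token_pair_sketch(topic_tokens: List[str], label: int,
                                       first_tokens: List[str],
                                       second_tokens: List[str],
                                       swap: bool) -> Tuple[List[str], List[int]]:
    # Hypothetical mapping from the integer label to a label token; -1 means unlabeled.
    label_token = {0: "[LABEL_FAVOR]", 1: "[LABEL_AGAINST]"}.get(label, "[LABEL_UNK]")
    a = (label_token, first_tokens)
    b = ("[LABEL_UNK]", second_tokens)
    if swap:
        a, b = b, a
    seg_a = ["[CLS]"] + topic_tokens + [a[0]] + a[1] + ["[SEP]"]
    seg_b = topic_tokens + [b[0]] + b[1] + ["[SEP]"]
    tokens = seg_a + seg_b
    segment_ids = [0] * len(seg_a) + [1] * len(seg_b)
    return tokens, segment_ids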
Example #29
def eval_classification(classifier, split):
    payloads = load_payload(split)
    gold = get_claim_perspective_id_dict()

    r = []
    for cid, data_list in payloads:
        gold_pids = gold[cid]
        all_pid_set = set(flatten(gold_pids))
        for p_entry in data_list:
            c_text = p_entry['claim_text']
            p_text = p_entry['perspective_text']
            z = classifier(c_text, p_text)
            y = 1 if p_entry['pid'] in all_pid_set else 0
            r.append((z, y))
    return get_scores(r)
Example #30
def tune_kernel_a():
    split = "train"
    payloads = load_payload(split)
    gold = get_claim_perspective_id_dict()

    r = []
    for cid, data_list in payloads:
        gold_pids = gold[cid]
        all_pid_set = set(flatten(gold_pids))
        for p_entry in data_list:
            c_text = p_entry['claim_text']
            p_text = p_entry['perspective_text']
            y = 1 if p_entry['pid'] in all_pid_set else 0
            r.append((c_text, p_text, y))
    tune_kernel_save(r)