def create_instances_from_documents(self, documents):
    documents = [doc for doc in documents if doc]
    # Reserve 3 positions for the special tokens: [CLS], [SEP], [SEP].
    max_num_tokens = self.max_seq_length - 3
    target_seq_length = max_num_tokens
    docs_as_chunks, target_inst_num = self.pool_chunks_from_docs(
        documents, target_seq_length)
    instances = []
    for _ in range(target_inst_num):
        chunk_1 = pick1(pick1(docs_as_chunks))
        m = self.rng.randint(1, len(chunk_1))
        tokens_a = flatten(chunk_1[:m])
        b_length = target_seq_length - len(tokens_a)
        # With probability 0.5, draw segment B from a random chunk;
        # otherwise continue with the rest of the same chunk.
        if self.rng.random() < 0.5:
            chunk_2 = pick1(pick1(docs_as_chunks))
            tokens_b = flatten(chunk_2)[:b_length]
        else:
            tokens_b = flatten(chunk_1[m:])[:b_length]
        truncate_seq_pair(tokens_a, tokens_b, target_seq_length, self.rng)
        tokens, segment_ids = format_tokens_pair_n_segid(tokens_a, tokens_b)
        instance = SegmentInstance(tokens=tokens, segment_ids=segment_ids)
        instances.append(instance)
    return instances
def different_claim() -> Iterator[Tuple[int, int]]:
    for cid1, cid2 in combinations(ids, 2):
        clusters1 = id_dict[cid1]
        clusters2 = id_dict[cid2]
        for p1 in flatten(clusters1):
            for p2 in flatten(clusters2):
                yield p1, p2
def tree2seq(node):
    # In-order traversal: left subtrees, node name, right subtrees.
    if not node.children():
        return [node.name]
    left = [tree2seq(c) for c in node.children_left]
    right = [tree2seq(c) for c in node.children_right]
    return flatten(left) + [node.name] + flatten(right)
def syntactic_parsing_method(article, comments):
    all_texts = article + comments
    all_tokens = [tokenize(t, set()) for t in all_texts]
    tr = TextRank(all_tokens)
    r = tr.run(flatten(all_tokens))
    r = generate(all_texts, r)
    print(r)
def select_paragraph_from_datapoint(x: TPDataPoint) -> ParagraphFeature:
    try:
        ranked_docs: List[SimpleRankedListEntry] = ci.fetch_from_q_res_id(
            dp_id_to_q_res_id_fn(x.id))
        ranked_docs = ranked_docs[:100]
    except KeyError:
        ranked_docs = []

    paragraph_scorer_local: Callable[[Paragraph], ScoreParagraph] = paragraph_scorer_factory(x)

    # Prefetch tokens and BERT tokens for all candidate documents.
    doc_ids = lmap(lambda d: d.doc_id, ranked_docs)
    preload_man.preload(TokenizedCluewebDoc, doc_ids)
    preload_man.preload(BertTokenizedCluewebDoc, doc_ids)

    def get_best_paragraph_from_doc(doc: SimpleRankedListEntry) -> List[ScoreParagraph]:
        paragraph_list = paragraph_iterator(doc)
        score_paragraph = lmap(paragraph_scorer_local, paragraph_list)
        score_paragraph.sort(key=lambda p: p.score, reverse=True)
        return score_paragraph[:1]

    def get_all_paragraph_from_doc(doc: SimpleRankedListEntry) -> List[ScoreParagraph]:
        paragraph_list = paragraph_iterator(doc)
        return lmap(paragraph_scorer_local, paragraph_list)

    if option.para_per_doc == ONE_PARA_PER_DOC:
        get_paragraphs = get_best_paragraph_from_doc
    else:
        get_paragraphs = get_all_paragraph_from_doc

    candidate_paragraph: List[ScoreParagraph] = list(flatten(lmap(get_paragraphs, ranked_docs)))
    candidate_paragraph.sort(key=lambda p: p.score, reverse=True)
    candidate_paragraph = remove_duplicate(candidate_paragraph)
    return ParagraphFeature(datapoint=x, feature=candidate_paragraph[:n_passages])
def main():
    run_config = json.load(open(sys.argv[1], "r"))
    l1: Dict[str, List[TrecRankedListEntry]] = load_ranked_list_grouped(run_config['first_list'])
    l2: Dict[str, List[TrecRankedListEntry]] = load_ranked_list_grouped(run_config['second_list'])
    run_name = run_config['run_name']
    strategy = run_config['strategy']
    save_path = run_config['save_path']
    k1 = run_config['k1']
    k2 = run_config['k2']

    new_entries: Dict[str, List[TrecRankedListEntry]] = l1
    qid_list = l1.keys()
    for key in l2:
        if key not in qid_list:
            print("WARNING qid {} is not in the first list".format(key))

    for qid in qid_list:
        if qid not in l2:
            new_entries[qid] = l1[qid]
        else:
            entries1 = l1[qid]
            entries2 = l2[qid]
            if strategy == "reciprocal":
                fused_scores = reciprocal_fusion(entries1, entries2, k1, k2)
            elif strategy == "weighted_sum":
                fused_scores = weighted_sum_fusion(entries1, entries2, k1, k2)
            else:
                raise ValueError("Unknown fusion strategy: {}".format(strategy))
            new_entries[qid] = scores_to_ranked_list_entries(fused_scores, run_name, qid)

    flat_entries: Iterable[TrecRankedListEntry] = flatten(new_entries.values())
    write_trec_ranked_list_entry(flat_entries, save_path)
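# A minimal sketch of what `reciprocal_fusion` above plausibly computes,
# assuming standard reciprocal rank fusion (RRF): each document scores
# 1 / (k + rank) in each list it appears in, and the per-list scores are
# summed. This helper and its signature are illustrative, not the repo's
# actual implementation.
from collections import Counter
from typing import List as _List


def rrf_sketch(ranked_ids_1: _List[str], ranked_ids_2: _List[str],
               k1: float, k2: float) -> Counter:
    scores = Counter()
    # ranks are 1-based; documents ranked well in both lists rise to the top
    for rank, doc_id in enumerate(ranked_ids_1):
        scores[doc_id] += 1.0 / (k1 + rank + 1)
    for rank, doc_id in enumerate(ranked_ids_2):
        scores[doc_id] += 1.0 / (k2 + rank + 1)
    return scores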
def main():
    input_path = sys.argv[1]
    save_path = sys.argv[2]
    l1: Dict[str, List[TrecRankedListEntry]] = load_ranked_list_grouped(input_path)
    new_entries: Dict[str, List[TrecRankedListEntry]] = {}
    run_name = "Reverse"
    for qid, ranked_list in l1.items():
        # Flip the ordering by re-scoring each entry as (1 - score).
        raw_ranked_list = []
        for e in ranked_list:
            score = 1 - e.score
            raw_ranked_list.append((e.query_id, e.doc_id, score))
        raw_ranked_list.sort(key=lambda x: x[2], reverse=True)

        new_ranked_list = []
        for rank, (query_id, doc_id, score) in enumerate(raw_ranked_list):
            e_new = TrecRankedListEntry(query_id, doc_id, rank, score, run_name)
            new_ranked_list.append(e_new)
        new_entries[qid] = new_ranked_list

    flat_entries: Iterable[TrecRankedListEntry] = flatten(new_entries.values())
    write_trec_ranked_list_entry(flat_entries, save_path)
def summarize(self):
    topic = data_generator.argmining.ukp_header.all_topics[0]
    data_loader = ukp.DataLoader(topic)
    stopwords = load_stopwords()

    def tokenize(x):
        return tokenizer.tokenize(x, stopwords)

    def sent_score(token_sent, bow_score):
        # Position-discounted sum of token scores: the weight halves
        # with each successive token.
        score = 0
        factor = 1
        for t in token_sent:
            score += bow_score[t] * factor
            factor *= 0.5
        return score

    def is_argument(entry):
        return entry['annotation'] in ("Argument_for", "Argument_against")

    for topic in data_generator.argmining.ukp_header.all_topics:
        entries = data_loader.all_data[topic]
        raw_sents = [e['sentence'] for e in entries if e['set'] == 'train']
        token_sents = list(map(tokenize, raw_sents))
        tprint("Running TextRank")
        text_rank = TextRank(token_sents)
        tr_score = Counter(text_rank.run(flatten(token_sents)))
        tprint("claim_gen.generate")
        raw_sents.sort(key=lambda x: sent_score(tokenize(x), tr_score), reverse=True)
        for i in range(10):
            print(raw_sents[i])
def convert_alt_emb(source_path, output_path, seq_set: List[List[int]]):
    all_tokens: Set[int] = set(flatten(seq_set))
    min_overlap = min(len(set(tokens)) for tokens in seq_set)

    def feature_transformer(feature):
        new_features = collections.OrderedDict()
        success = False
        for key in feature:
            v = take(feature[key])
            if key == "input_ids":
                alt_emb_mask = [0] * len(v)
                s = set(v)
                # Cheap pre-check: only scan if enough target tokens appear.
                if len(s.intersection(all_tokens)) >= min_overlap:
                    for word in seq_set:
                        pre_match = 0
                        for i in range(len(v)):
                            if v[i] == word[pre_match]:
                                pre_match += 1
                            else:
                                # On mismatch, restart; the current token may
                                # itself begin a new occurrence.
                                pre_match = 1 if v[i] == word[0] else 0
                            if pre_match == len(word):
                                pre_match = 0
                                # Mark every position of the matched sequence.
                                for j in range(i - len(word) + 1, i + 1):
                                    alt_emb_mask[j] = 1
                                success = True
                new_features["alt_emb_mask"] = create_int_feature(alt_emb_mask)
            new_features[key] = create_int_feature(v)
        if success:
            return new_features
        return None

    return tfrecord_convertor_with_none(source_path, output_path, feature_transformer)
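# A self-contained sketch of the span-marking idea used in
# feature_transformer above: given token ids `v` and a target sequence
# `word`, set mask[j] = 1 over every contiguous occurrence of `word`.
# This version checks every start position, trading speed for clarity;
# the names here are illustrative, not part of the repo.
from typing import List as _TList


def mark_occurrences(v: _TList[int], word: _TList[int]) -> _TList[int]:
    mask = [0] * len(v)
    n = len(word)
    for start in range(len(v) - n + 1):
        if v[start:start + n] == word:
            for j in range(start, start + n):
                mask[j] = 1
    return mask

# Example: mark_occurrences([5, 1, 2, 3, 9, 1, 2], [1, 2])
# returns [0, 1, 1, 0, 0, 1, 1].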
def debug_failture(predictions):
    gold = get_claim_perspective_id_dict()
    ap_list = []
    for c_Id, prediction_list in predictions:
        gold_pids = gold[c_Id]
        gold_pids_set: Set[int] = set(flatten(gold_pids))
        claim_text = prediction_list[0]['claim_text']
        print("Claim {}: ".format(c_Id), claim_text)
        correctness_list = lmap(lambda p: p['pid'] in gold_pids_set, prediction_list)
        ap = get_ap(prediction_list, gold_pids, False)
        if not any(correctness_list):  # all wrong
            continue
        if ap > 0.9:
            continue

        def print_line(prediction):
            pid = prediction['pid']
            correct_str = "Y" if pid in gold_pids_set else "N"
            score = prediction['score']
            print(correct_str, score, score.name, prediction['perspective_text'])

        foreach(print_line, prediction_list)
        ap_list.append(ap)

    map_score = average(ap_list)  # renamed to avoid shadowing the builtin map
    return {'map': map_score}
def pool_tokens(self, sent_list, target_seq_length, skip=False):
    results = []
    current_chunk = []
    current_length = 0
    i = 0
    if skip:
        i = i + self.rng.randint(0, 3)
    while i < len(sent_list):
        segment = sent_list[i]
        current_chunk.append(segment)
        current_length += len(segment)
        # Emit a chunk at the end of input or once enough tokens accumulate.
        if i == len(sent_list) - 1 or current_length >= target_seq_length:
            tokens_a = flatten(current_chunk)
            tokens_a = tokens_a[:target_seq_length]
            results.append(tokens_a)
            current_chunk = []
            current_length = 0
            if skip:
                i = i + self.rng.randint(0, 3)
        i += 1

    self.all_doc_cnt += 1
    if len(results) == 1:
        if len(results[0]) < target_seq_length * 0.5:
            self.short_doc_cnt += 1
    return results
def passage_to_lm(tokenizer, claim, passages: List[Tuple[List[str], float]], alpha):
    claim_text = claim['text']
    claim_tokens = tokenizer.tokenize_stem(claim_text)
    tf = tokens_to_freq(flatten(left(passages)))
    c_tf = tokens_to_freq(claim_tokens)
    r_tf = smooth_ex(c_tf, tf, alpha)
    return r_tf
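# A minimal sketch of the interpolation `smooth_ex` above plausibly
# performs, assuming Jelinek-Mercer-style mixing of the claim language
# model with the passage language model. Illustrative only; the repo's
# smooth_ex may differ in direction or normalization.
from typing import Dict as _Dict


def smooth_sketch(main_tf: _Dict[str, float], bg_tf: _Dict[str, float],
                  alpha: float) -> _Dict[str, float]:
    # P(t) = alpha * P_main(t) + (1 - alpha) * P_bg(t)
    vocab = set(main_tf) | set(bg_tf)
    return {t: alpha * main_tf.get(t, 0.0) + (1 - alpha) * bg_tf.get(t, 0.0)
            for t in vocab}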
def get_claim_perspective_label_dict() -> Dict[CPIDPair, int]:
    gold = get_claim_perspective_id_dict()
    d = defaultdict(int)
    for cid, pid_list_list in gold.items():
        for pid in flatten(pid_list_list):
            cpid_pair = CPIDPair((cid, pid))
            d[cpid_pair] = 1
    return d
def get_trec_relevance_judgement() -> Iterable[TrecRelevanceJudgementEntry]:
    gold: Dict[int, List[List[int]]] = get_claim_perspective_id_dict()
    for cid, clusters in gold.items():
        query_id = str(cid)
        pids = set(flatten(clusters))
        for pid in pids:
            yield TrecRelevanceJudgementEntry(query_id, str(pid), 1)
def get_term_importance(bm25_module, sents):
    tokens = flatten([bm25_module.tokenizer.tokenize_stem(s) for s in sents])
    q_tf = Counter(tokens)
    term_importance = Counter()
    for term, tf in q_tf.items():
        term_importance[term] += bm25_module.term_idf_factor(term) * tf
    return term_importance
def select_paragraph(
        docs: Dict[str, List[List[str]]],
        clue12_13_df,
        claim_list: List[Dict],
        strategy="topk",
) -> List[Tuple[str, List[List[str]]]]:
    claim_id_to_text: Dict[int, str] = {c['cId']: c['text'] for c in claim_list}
    cdf = 50 * 1000 * 1000
    top_k = 100
    not_found_set = set()

    def idf(term: str):
        if term not in clue12_13_df:
            if term in string.printable:
                return 0
            not_found_set.add(term)
        return math.log((cdf + 0.5) / (clue12_13_df[term] + 0.5))

    r: List[Tuple[str, List[List[str]]]] = []
    ticker = TimeEstimator(len(docs))
    for claim_id, doc_list in docs.items():  # renamed to avoid shadowing `docs`
        claim_text = claim_id_to_text[int(claim_id)]
        q_terms = set(re_tokenize(nltk.tokenize.word_tokenize(claim_text)))

        def scorer(para: List[str]) -> float:
            return paragraph_scorer(idf, q_terms, para)

        max_score = sum(lmap(idf, q_terms))

        def get_best_per_doc(doc: List[str]) -> List[Tuple[List[str], float]]:
            paragraph_list: Iterable[List[str]] = enum_paragraph([doc])
            paragraph_scored_list: List[Tuple[List[str], float]] = lmap_pairing(
                scorer, paragraph_list)
            paragraph_scored_list.sort(key=lambda x: x[1], reverse=True)
            return paragraph_scored_list[:1]

        selected: List[Tuple[List[str], float]] = list(
            flatten(lmap(get_best_per_doc, doc_list)))
        # if strategy == "topk":
        #     selected = paragraph_scored_list[:top_k]
        # elif strategy == "cutoff":
        #     cut_off = max_score * 0.6
        #     selected = lfilter(lambda x: x[1] > cut_off, paragraph_scored_list)
        # else:
        #     assert False
        r.append((claim_id, left(selected)))
        ticker.tick()
    return r
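# A minimal sketch of the idf-overlap scoring that `paragraph_scorer`
# above plausibly applies: sum the idf of every query term that occurs
# in the paragraph, so paragraphs covering rare query terms score higher.
# Illustrative only; the repo's paragraph_scorer may weight terms differently.
from typing import Callable as _Callable, List as _PList, Set as _PSet


def paragraph_score_sketch(idf: _Callable[[str], float],
                           q_terms: _PSet[str],
                           para: _PList[str]) -> float:
    para_terms = set(para)
    return sum(idf(t) for t in q_terms if t in para_terms)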
def build_voca(data):
    short_desc_list = lmap(lambda x: x['short_desc'], data)
    all_text = tokenize(short_desc_list)
    voca = set(flatten(all_text))
    n_output_voca = len(voca)
    word2idx = {word: idx for idx, word in enumerate(voca)}
    return n_output_voca, word2idx
def init_token_voca():
    topic = "atheism"
    setting = SimpleTokner(topic)
    stance_text = stance_detection.get_train_text()
    token_list = [l.split() for l in stance_text]
    print(token_list[:20])
    encoder = TokenTextEncoder(None, vocab_list=flatten(token_list))
    encoder.store_to_file(setting.vocab_filename)
def work():
    # Raw string so the backslashes are not read as escape sequences.
    data = load_all(r"C:\work\Data\controversy_tweets\census")
    all_texts = flatten(data.values())
    print("all text:", len(all_texts))
    uniq_texts = set(all_texts)
    print("unique text:", len(uniq_texts))
    uniq_texts = near_duplicate_deletion(uniq_texts)
    print("unique text:", len(uniq_texts))
    for t in uniq_texts:
        print(t.strip())
def main():
    first_list_path = sys.argv[1]
    l: Dict[str, List[TrecRankedListEntry]] = load_ranked_list_grouped(first_list_path)
    new_entries: Dict[str, List[TrecRankedListEntry]] = l
    flat_entries: Iterable[TrecRankedListEntry] = flatten(new_entries.values())
    doc_ids = list(set(e.doc_id for e in flat_entries))
    urls_d = get_urls(doc_ids)
    save_to_pickle(urls_d, "urls_d")
def generate_instances(claim_passages_list: Iterable[ClaimPassages],
                       data_id_manager: DataIDManager) -> Iterable[QKInstance]:
    def convert(pair: ClaimPassages) -> Iterable[QKInstance]:
        claim, passages = pair
        cid = claim['cId']
        query_text = claim['text']
        for passage_idx, (passage, dummy_score) in enumerate(passages):
            info = {'cid': cid, 'passage_idx': passage_idx}
            yield QKInstance(query_text, passage, data_id_manager.assign(info))

    return flatten(map(convert, claim_passages_list))
def preload_docs(ranked_list, claims, top_n):
    def get_doc_ids(claim: Dict):
        # Find the q_res for this claim and take its top_n doc ids.
        q_res: List[SimpleRankedListEntry] = ranked_list[str(claim['cId'])]
        return [q_res[i].doc_id for i in range(top_n)]

    all_doc_ids: Set[str] = set(flatten(lmap(get_doc_ids, claims)))
    print(f"total of {len(all_doc_ids)} docs")
    print("Accessing DB")
    # Get the docs from the DB into the cache.
    preload_man.preload(TokenizedCluewebDoc, all_doc_ids)
def main(prefix1, prefix2):
    topic = "abortion"
    tfrecord_path = "./data/ukp_tfrecord/dev_" + topic
    tfrecord = list(load_tfrecord(tfrecord_path))
    get_correctness_arr_fn = partial(get_correctness_arr, tfrecord)
    prediction_list_1 = list(get_existing_predictions(prefix1, topic))
    prediction_list_2 = list(get_existing_predictions(prefix2, topic))
    # Compare only the runs that exist for both prefixes.
    num_runs = min(len(prediction_list_1), len(prediction_list_2))
    prediction_list_1 = prediction_list_1[:num_runs]
    prediction_list_2 = prediction_list_2[:num_runs]
    # Materialize as lists so len() and the paired t-test can consume them.
    c1 = list(flatten(lmap(get_correctness_arr_fn, prediction_list_1)))
    c2 = list(flatten(lmap(get_correctness_arr_fn, prediction_list_2)))
    print(len(c1))
    print(len(c2))
    _, p_value = stats.ttest_rel(c1, c2)
    print(p_value)
def generate(
        self,
        kc_candidate: Iterable[QKUnit],
        data_id_manager: DataIDManager,
) -> Iterable[QKInstance]:
    def convert(pair: Tuple[QCKQuery, List[KDP]]) -> Iterable[QKInstance]:
        query, passages = pair
        for passage in passages:
            info = {'query': query, 'kdp': passage}
            yield QKInstance(query.text, passage.tokens,
                             data_id_manager.assign(info),
                             self._is_correct(query, passage))

    return flatten(lmap(convert, kc_candidate))
def build_df():
    claims, val = train_split()
    gold = get_claim_perspective_id_dict()
    tokenizer = PCTokenizer()
    df = Counter()
    dl_list = []
    for claim in claims:
        cid = claim["cId"]
        gold_pids = flatten(gold[cid])
        p_text_list: List[str] = lmap(perspective_getter, gold_pids)
        tokens_list = lmap(tokenizer.tokenize_stem, p_text_list)
        dl_list.extend(lmap(len, tokens_list))
        # Document frequency: count each term once per claim's perspectives.
        for t in set(flatten(tokens_list)):
            df[t] += 1
    print(dl_list)
    print("Avdl", average(dl_list))
    print(len(claims))
    print(df.most_common(30))
    save_to_pickle(df, "pc_df")
def save_way_back_fetch():
    all_url = flatten(parse_all_urls())
    wayback_dict = {}
    prefix = "http://archive.org/wayback/available?url="
    for url in all_url:
        print(url)
        ret = requests.get(prefix + url)
        if ret.status_code != 200:
            print(ret.status_code)
            break
        else:
            wayback_dict[url] = ret.content
    pickle.dump(wayback_dict, open(way_back_save_path, "wb"))
def pool_tokens(rng,
                sent_list: List[List[Token]],
                target_seq_length,
                skip=False) -> List[List[Token]]:
    results: List[List[Token]] = []
    current_chunk = []
    current_length = 0
    i = 0
    if skip:
        i = i + rng.randint(0, 3)

    def is_new_doc(segment):
        return 'isbn' in segment

    num_real_doc = 1
    while i < len(sent_list):
        segment: List[Token] = sent_list[i]
        if is_new_doc(segment):
            # Flush the current chunk at a document boundary.
            num_real_doc += 1
            tokens_a: List[Token] = list(flatten(current_chunk))
            tokens_a = tokens_a[:target_seq_length]
            results.append(tokens_a)
            current_chunk = []
            current_length = 0

        current_chunk.append(segment)
        current_length += len(segment)
        if i == len(sent_list) - 1 or current_length >= target_seq_length:
            tokens_a = list(flatten(current_chunk))
            tokens_a = tokens_a[:target_seq_length]
            results.append(tokens_a)
            current_chunk = []
            current_length = 0
            if skip:
                i = i + rng.randint(0, 3)
        i += 1
    return results
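# A hypothetical usage sketch for pool_tokens above; the sentence data is
# made up, and Token is assumed to be a plain string here.
import random

_rng = random.Random(0)
_sents = [["the", "cat", "sat"], ["on", "the", "mat"],
          ["isbn", "123"], ["a", "new", "doc"]]
# Prints two pooled chunks: the sentence containing 'isbn' starts a new
# document, so the first chunk is flushed at that boundary.
for _chunk in pool_tokens(_rng, _sents, target_seq_length=8):
    print(_chunk)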
def create_instances(self, topic, raw_docs, labeled_data):
    # Format: [CLS] [Abortion] [LABEL_FAVOR] ...(ukp text)... [SEP]
    #         [ABORTION] [LABEL_UNK] ..(clue text).. [SEP]
    topic_tokens = self.tokenizer.tokenize(topic.replace("_", " "))
    # TODO iterate docs, pool chunk
    # randomly draw and sometimes insert labeled one
    # encode and add to instances

    # Budget: 3 special tokens, 2 label tokens, and the topic tokens
    # repeated in both segments.
    max_num_tokens = self.max_seq_length - 3 - 2 - 2 * len(topic_tokens)
    target_seq_length = max_num_tokens
    docs_as_chunks, target_inst_num = self.pool_chunks_from_docs(
        raw_docs, target_seq_length)
    instances = []
    for _ in range(target_inst_num):
        chunk_1 = pick1(pick1(docs_as_chunks))
        m = self.rng.randint(1, len(chunk_1))
        tokens_a = flatten(chunk_1[:m])
        b_length = target_seq_length - len(tokens_a)
        if self.rng.random() < self.ratio_labeled and labeled_data:
            label, tokens_b = pick1(labeled_data)
        else:
            if self.rng.random() < 0.5:
                chunk_2 = pick1(pick1(docs_as_chunks))
                tokens_b = flatten(chunk_2)[:b_length]
            else:
                tokens_b = flatten(chunk_1[m:])[:b_length]
            label = -1
        truncate_seq_pair(tokens_a, tokens_b, target_seq_length, self.rng)
        swap = self.rng.random() < 0.5
        tokens, segment_ids = encode_label_and_token_pair(
            topic_tokens, label, tokens_b, tokens_a, swap)
        instance = SegmentInstance(tokens=tokens, segment_ids=segment_ids)
        instances.append(instance)
    return instances
def eval_classification(classifier, split):
    payloads = load_payload(split)
    gold = get_claim_perspective_id_dict()
    r = []
    for cid, data_list in payloads:
        gold_pids = gold[cid]
        all_pid_set = set(flatten(gold_pids))
        for p_entry in data_list:
            c_text = p_entry['claim_text']
            p_text = p_entry['perspective_text']
            z = classifier(c_text, p_text)
            y = 1 if p_entry['pid'] in all_pid_set else 0
            r.append((z, y))
    return get_scores(r)
def tune_kernel_a():
    split = "train"
    payloads = load_payload(split)
    gold = get_claim_perspective_id_dict()
    r = []
    for cid, data_list in payloads:
        gold_pids = gold[cid]
        all_pid_set = set(flatten(gold_pids))
        for p_entry in data_list:
            c_text = p_entry['claim_text']
            p_text = p_entry['perspective_text']
            y = 1 if p_entry['pid'] in all_pid_set else 0
            r.append((c_text, p_text, y))
    tune_kernel_save(r)