def generate_instances(self, job_id, data_id_man):
    # Map the job to its query and tokenize the query text.
    q_id = self.job_id_to_q_id[job_id]
    query_text = self.query_d[int(q_id)]
    query_tokens = self.tokenizer.tokenize(query_text)

    # Take the top-1000 documents from the ranked list for this query.
    ranked_list = self.ranked_list[q_id][:1000]
    doc_ids = [e.doc_id for e in ranked_list]

    tprint("Loading documents start")
    docs_d: Dict[str, List[List[str]]] = load_multiple(BertTokenizedCluewebDoc, doc_ids, True)
    tprint("Loading documents done")

    # Reserve room for the query tokens and three special tokens.
    avail_seq_length = self.max_seq_length - len(query_tokens) - 3
    label_dummy = 0
    not_found = 0
    for doc_id in doc_ids:
        try:
            doc: List[List[str]] = docs_d[doc_id]
            passages: Iterable[List[str]] = enum_passages(doc, avail_seq_length)
            # Keep at most 10 passages per document.
            for passage_idx, p in enumerate(passages):
                if passage_idx > 9:
                    break
                data_id = data_id_man.assign({
                    'query_id': q_id,
                    'doc_id': doc_id,
                    'passage_idx': passage_idx
                })
                yield Instance(query_tokens, p, label_dummy, data_id)
        except KeyError:
            not_found += 1
    print("{} of {} docs not found".format(not_found, len(doc_ids)))
def get_feature_binary_model(claim_id,
                             perspective_id,
                             claim_text,
                             perspective_text,
                             ci: DynRankedListInterface,
                             is_mention_fn: Callable[[Counter[str], str, str], bool],
                             ) -> Tuple[Counter, int]:
    def is_mention(doc: Counter) -> bool:
        return is_mention_fn(doc, claim_text, perspective_text)

    print(claim_id, perspective_id)
    # Retrieve the ranked documents for this claim/perspective pair and keep the top 100.
    ranked_docs: List[SimpleRankedListEntry] = ci.query(claim_id, perspective_id, claim_text, perspective_text)
    ranked_docs = ranked_docs[:100]
    print("{} docs in ranked list".format(len(ranked_docs)))

    doc_id_list: List[str] = lmap(get_doc_id, ranked_docs)
    tf_d = load_multiple(CluewebDocTF, doc_id_list, True)

    # Record the indices of documents whose term-frequency data could not be loaded.
    not_found = []
    for idx, doc_id in enumerate(doc_id_list):
        if doc_id not in tf_d:
            not_found.append(idx)

    # Keep only the documents that mention the claim/perspective.
    ranked_docs_tf = tf_d.values()
    mentioned_docs: List[Counter] = lfilter(is_mention, ranked_docs_tf)
    print("Found doc", len(tf_d), "mentioned doc", len(mentioned_docs))

    # Normalize each document's term frequencies by its length, then average over documents.
    docs_rel_freq: List[Counter] = lmap(div_by_doc_len, mentioned_docs)
    num_doc: int = len(docs_rel_freq)
    p_w_m: Counter = average_tf_over_docs(docs_rel_freq, num_doc)
    return p_w_m, num_doc
def generate(pos_doc_ids, all_doc_list, max_seq_length) -> List[Instance]:
    # Reserve two positions for special tokens.
    seq_length = max_seq_length - 2

    # Make the list of negative documents: everything not in the positive set.
    neg_docs_ids = [d for d in all_doc_list if d not in pos_doc_ids]

    # Load the positive documents and remove duplicates by content hash.
    pos_docs: List[List[List[str]]] = load_multiple(BertTokenizedCluewebDoc, pos_doc_ids, True)
    hashes = lmap(doc_hash, pos_docs)
    duplicate_indice = get_duplicate_list(hashes)
    pos_docs = [doc for i, doc in enumerate(pos_docs) if i not in duplicate_indice]

    neg_docs: List[List[List[str]]] = load_multiple_divided(BertTokenizedCluewebDoc, neg_docs_ids, True)
    data_id_man = DataIDManager()

    def enum_instances(doc_list: List[List[List[str]]], label: int) -> Iterator[Instance]:
        # Split each document into passages of at most seq_length tokens.
        for d in doc_list:
            for passage in enum_passages(d, seq_length):
                yield Instance(passage, data_id_man.assign([]), label)

    pos_insts = list(enum_instances(pos_docs, 1))
    neg_insts = list(enum_instances(neg_docs, 0))
    all_insts = pos_insts + neg_insts
    print("{} instances".format(len(all_insts)))
    random.shuffle(all_insts)
    return all_insts
def remove_duplicate(doc_id_list: List[str]) -> List[str]:
    # Drop doc ids whose tokenized content duplicates an earlier document.
    docs_d: Dict[str, List[str]] = load_multiple(TokenizedCluewebDoc, doc_id_list, True)
    hashes = lmap(doc_hash, [
        docs_d[doc_id] if doc_id in docs_d else None
        for doc_id in doc_id_list
    ])
    duplicate_indice = get_duplicate_list(hashes)
    non_duplicate = [
        doc_id_list[i]
        for i in range(len(doc_id_list))
        if i not in duplicate_indice
    ]
    return non_duplicate
def main():
    ranked_list_path = sys.argv[1]
    save_path = sys.argv[2]
    rl: List[TrecRankedListEntry] = load_ranked_list(ranked_list_path)
    doc_ids = [e.doc_id for e in rl]

    # Load the first 10000 documents in windows of 100 ids at a time.
    docs_d: Dict[str, List[str]] = {}
    idx = 0
    target_len = 10000
    step = 100
    while idx < target_len:
        print(idx)
        doc_ids_window = doc_ids[idx:idx + step]
        docs_d.update(load_multiple(RawCluewebDoc, doc_ids_window, True))
        idx += step

    print("{} docs_loaded".format(len(docs_d)))
    json.dump(docs_d, open(save_path, "w"))
def main():
    doc_ids = list(set(load_doc_ids()))
    print("num docs", len(doc_ids))
    save_dir = os.path.join(output_path, "pc_docs_html")
    exist_or_mkdir(save_dir)

    # Fetch the raw HTML in batches of 1000 ids and write each document to its own file.
    k = 0
    step = 1000
    while k < len(doc_ids):
        print(k, k + step)
        cur_doc_ids = doc_ids[k:k + step]
        docs = load_multiple(RawCluewebDoc, cur_doc_ids, True)
        for doc_id in cur_doc_ids:
            try:
                doc_html = docs[doc_id]
                save_path = os.path.join(save_dir, doc_id + ".html")
                with open(save_path, "w") as f:
                    f.write(doc_html)
            except KeyError:
                # Document was not loaded; skip it.
                pass
        k += step
def load_tf_multiple(doc_ids):
    return load_multiple(CluewebDocTF, doc_ids)