def select_doc_per_query_top50(split):
    ms_reader = MSMarcoDataReader(split)
    save_path = os.path.join(root_dir, "train_docs_top50_{}.tsv".format(split))
    out_f = open(save_path, "w")

    def pop(query_id, cur_doc_ids: List[Tuple[str, int]]):
        pos_docs = ms_reader.qrel[query_id]
        neg_docs = []
        for doc_id, rank in cur_doc_ids:
            if doc_id not in pos_docs and rank < 50:
                neg_docs.append(doc_id)
        doc_needed = pos_docs + neg_docs
        row = [query_id] + doc_needed
        out_f.write("\t".join(row) + "\n")

    total_line = 36701116
    ticker = TimeEstimator(total_line, "reading", 1000)
    with open_top100(split) as top100f:
        last_topic_id = None
        cur_doc_ids = []
        for line_no, line in enumerate(top100f):
            [topic_id, _, doc_id, rank, _, _] = line.split()
            if last_topic_id is None:
                last_topic_id = topic_id
            elif last_topic_id != topic_id:
                pop(last_topic_id, cur_doc_ids)
                last_topic_id = topic_id
                cur_doc_ids = []
            ticker.tick()
            cur_doc_ids.append((doc_id, int(rank)))
        if last_topic_id is not None:
            # Flush the final query group, which the loop body never reaches.
            pop(last_topic_id, cur_doc_ids)
    out_f.close()

def select_doc_per_query(split):
    ms_reader = MSMarcoDataReader(split)
    save_path = os.path.join(root_dir, "train_docs_10times_{}.tsv".format(split))
    out_f = open(save_path, "w")

    def pop(query_id, cur_doc_ids: Set):
        pos_docs = ms_reader.qrel[query_id]
        neg_docs = [doc_id for doc_id in cur_doc_ids if doc_id not in pos_docs]
        if pos_docs:
            # Sample up to 10 negatives per positive; clamp so random.sample cannot fail
            # when fewer negatives are available.
            num_neg_docs = min(10 * len(pos_docs), len(neg_docs))
            sel_docs = random.sample(neg_docs, num_neg_docs)
            doc_needed = pos_docs + sel_docs
            row = [query_id] + doc_needed
            out_f.write("\t".join(row) + "\n")

    total_line = 36701116
    ticker = TimeEstimator(total_line, "reading", 1000)
    with open_top100(split) as top100f:
        last_topic_id = None
        cur_doc_ids = set()
        for line_no, line in enumerate(top100f):
            [topic_id, _, doc_id, rank, _, _] = line.split()
            if last_topic_id is None:
                last_topic_id = topic_id
            elif last_topic_id != topic_id:
                pop(last_topic_id, cur_doc_ids)
                last_topic_id = topic_id
                cur_doc_ids = set()
            ticker.tick()
            cur_doc_ids.add(doc_id)
        if last_topic_id is not None:
            # Flush the final query group, which the loop body never reaches.
            pop(last_topic_id, cur_doc_ids)
    out_f.close()

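# Hypothetical driver for the two selection routines above; the split name is an
# illustrative assumption (use whatever split names MSMarcoDataReader accepts).
def _example_select_docs():
    split = "train"
    select_doc_per_query_top50(split)
    select_doc_per_query(split)
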
def group_average_per_query(outputs) -> Dict[str, Dict[WordAsID, np.array]]:
    tokenizer = get_tokenizer()

    def collect_by_word_fn(entry: QKTokenLevelOutEntry):
        return collect_by_words(tokenizer, entry)

    print("Grouping entries")
    grouped: Dict[str, List[QKTokenLevelOutEntry]] = group_by(outputs, lambda x: x.query.query_id)

    def average_scores(out_entries: List[QKTokenLevelOutEntry]) -> Dict[WordAsID, np.array]:
        items: List[Iterable[Tuple[WordAsID, TokenScore]]] = lmap(collect_by_word_fn, out_entries)
        d: Dict[WordAsID, List] = defaultdict(list)
        for item in items:
            for word, probs in item:
                d[word].append(probs)

        def average_per_dim(probs_list) -> np.array:
            return np.mean(np.array(probs_list), axis=0)

        out_d: Dict[WordAsID, np.array] = dict_value_map(average_per_dim, d)
        return out_d

    print("Collecting token level scores")
    per_query_infos: Dict[str, Dict[WordAsID, np.array]] = {}
    ticker = TimeEstimator(len(grouped))
    for key, value in grouped.items():
        per_query_infos[key] = average_scores(value)
        ticker.tick()
    return per_query_infos

def collect_pc_rel_score(prediction_file, info: Dict):
    data = EstimatorPredictionViewer(prediction_file)
    print("Num data ", data.data_len)
    group_by_key = {}
    num_append = 0
    last_claim = None
    ticker = TimeEstimator(data.data_len)
    for entry in data:
        ticker.tick()
        logits = entry.get_vector("logits")
        data_id = entry.get_vector("data_id")[0]
        try:
            cur_info = info[data_id]
            if 'cid' in cur_info:
                cid = cur_info['cid']
                last_claim = cid, logits
            elif 'pid' in cur_info:
                pid = cur_info['pid']
                cid, c_logits = last_claim
                key = cid, pid
                if key not in group_by_key:
                    group_by_key[key] = []
                group_by_key[key].append((c_logits, logits))
                num_append += 1
            else:
                assert False
        except KeyError as e:
            print(e)
    print(num_append)
    return group_by_key

def work(job_id, pm):
    rng = random.Random(0)
    max_num_tokens = 256
    masked_lm_prob = 0.15
    short_seq_prob = 0.1
    problem_per_job = 100 * 1000
    in_path = in_path_format.format(job_id)
    out_path = os.path.join(working_path, "problems", "{}".format(job_id))
    query_out_path = os.path.join(working_path, "query", "{}".format(job_id))
    in_data = pickle.load(open(in_path, "rb"))
    out_data = []
    queries = []
    ticker = TimeEstimator(len(in_data))
    for idx, inst in enumerate(in_data):
        mask_inst = pm.generate_mask(inst, max_num_tokens, masked_lm_prob, short_seq_prob, rng)
        query = pm.generate_query(mask_inst)
        qid = job_id * problem_per_job + idx
        queries.append((qid, query))
        out_data.append(mask_inst)
        ticker.tick()
    write_query_json(queries, query_out_path)
    pickle.dump(out_data, open(out_path, 'wb'))

def sentence_payload_gen(q_res_path: str, top_n, data_id_man: DataIDManager):
    print("loading ranked list")
    ranked_list: Dict[str, List[SimpleRankedListEntry]] = load_galago_ranked_list(q_res_path)
    qid_list = list(ranked_list.keys())
    # NOTE: only the first 10 queries are processed.
    qid_list = qid_list[:10]
    ranked_list = {k: ranked_list[k] for k in qid_list}
    print("Pre loading docs")
    preload_docs(ranked_list, top_n)
    entries: List[Tuple[str, bool, int]] = []

    def enum_sentence(tokens) -> Iterator[str]:
        text = " ".join(tokens)
        sents = sent_tokenize(text)
        yield from sents

    ticker = TimeEstimator(len(ranked_list))
    for qid in ranked_list:
        q_res: List[SimpleRankedListEntry] = ranked_list[qid]
        docs = iterate_docs(q_res, top_n)
        for doc in docs:
            for sent_idx, sent in enumerate(enum_sentence(doc.tokens)):
                info = {
                    'doc_id': doc.doc_id,
                    'sent_idx': sent_idx,
                    'sentence': sent
                }
                data_id = data_id_man.assign(info)
                e = sent, True, data_id
                entries.append(e)
        ticker.tick()
    return entries

def work(self, job_id):
    data_to_save = {}
    group_id, doc_ids = self.todo[job_id]
    cur_targets = set(doc_ids)
    dir_helper = get_sydney_clueweb09_corpus_helper()
    print(group_id, len(cur_targets))
    ticker = TimeEstimator(len(cur_targets))
    group_done = len(cur_targets) == 0
    for file_path in dir_helper.iter_gz_files_for_group(group_id):
        if group_done:
            break
        for doc_id, content in iter_docs(file_path):
            if doc_id in cur_targets:
                data_to_save[doc_id] = content
                cur_targets.remove(doc_id)
                ticker.tick()
                if len(cur_targets) == 0:
                    group_done = True
                    break
    if cur_targets:
        print(len(cur_targets), "not found")
    pickle.dump(data_to_save, open(os.path.join(self.out_dir, str(job_id)), "wb"))

def count_n_gram_grom_docs(docs, n, config, exclude_fn):
    count = Counter()
    tick = TimeEstimator(len(docs))
    top_k = 10000
    after_pruning = False
    selected_ngram = set()
    for doc_idx, doc in enumerate(docs):
        if doc_idx % 10000 == 0:
            print(doc_idx)
        tick.tick()
        for segment in doc:
            if MERGE_SUBWORD in config:
                segment = merge_subword(segment)
            assert type(segment) == list
            for ngram_item in ngrams(segment, n):
                if after_pruning and ngram_item in selected_ngram:
                    continue
                elif exclude_fn(ngram_item):
                    pass
                else:
                    count[ngram_item] += 1
        if len(count) > 1000 * 1000 and not after_pruning:
            print("Performing pruning")
            tf_cnt = list(count.items())
            tf_cnt.sort(key=lambda x: x[1], reverse=True)
            selected_ngram = set(left(tf_cnt[:top_k]))
            after_pruning = True
    return count

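# Illustrative exclude_fn for count_n_gram_grom_docs: any predicate over an n-gram
# tuple, where a truthy return means "do not count this n-gram". The stopword-only
# rule below is just an example, not the filter used in the experiments.
def _example_exclude_fn(ngram_item):
    stopwords = {"the", "a", "an", "of", "and", "to"}
    return all(token in stopwords for token in ngram_item)
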
def get_idf_keyword_score(problems: List[QueryDoc], get_idf) -> Iterable[Counter]:
    stemmer = CacheStemmer()
    ticker = TimeEstimator(len(problems))
    for p in problems:
        tokens = p.doc
        tf = Counter()
        reverse_map = {}  # Stemmed -> raw
        tokens = [t for t in tokens if t not in [".", ",", "!"]]
        for raw_t in tokens:
            stem_t = stemmer.stem(raw_t)
            reverse_map[stem_t] = raw_t
            tf[stem_t] += 1
        score_d = Counter()
        for term, cnt in tf.items():
            score = math.log(1 + cnt) * get_idf(term)
            assert type(score) == float
            score_d[term] = score
        score_d_surface_form: Counter = Counter(dict_key_map(lambda x: reverse_map[x], score_d))
        ticker.tick()
        yield score_d_surface_form

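# A minimal sketch (an assumption, not part of the original module) of how a get_idf
# callable for get_idf_keyword_score could be built from a document-frequency Counter
# `df` and a collection document count `cdf`, using the same smoothed idf form that
# select_paragraph uses below. Assumes `math` and `Counter` are imported as elsewhere
# in this file.
def make_get_idf(df, cdf):
    def get_idf(term: str) -> float:
        return math.log((cdf + 0.5) / (df[term] + 0.5))
    return get_idf
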
def segment_per_doc_index(task_id):
    token_reader = get_token_reader()
    stemmer = CacheStemmer()
    stopword = load_stopwords()
    p = os.path.join(cpath.data_path, "adhoc", "robust_seg_info.pickle")
    seg_info = pickle.load(open(p, "rb"))

    def get_doc_posting_list(doc_id):
        doc_posting = defaultdict(list)
        # Retrieve and stem the document once; intervals only slice into it.
        tokens = token_reader.retrieve(doc_id)
        st_tokens = list([stemmer.stem(t) for t in tokens])
        for interval in seg_info[doc_id]:
            (loc, loc_ed), (_, _) = interval
            ct = Counter(st_tokens[loc:loc_ed])
            for term, cnt in ct.items():
                if term in stopword:
                    continue
                doc_posting[term].append((loc, cnt))
        return doc_posting

    doc_id_list = get_doc_task(task_id)
    ticker = TimeEstimator(len(doc_id_list))
    doc_posting_d = {}
    for doc_id in doc_id_list:
        doc_posting_d[doc_id] = get_doc_posting_list(doc_id)
        ticker.tick()
    save_path = os.path.join(cpath.data_path, "adhoc", "per_doc_posting_{}.pickle".format(task_id))
    pickle.dump(doc_posting_d, open(save_path, "wb"))

def run_B(self, job_id):
    if self.pr.doc_posting is None:
        self.pr.doc_posting = per_doc_posting_server.load_dict()
    output_A = self.load_output_A(job_id)
    candi_docs = self.load_candidate_docs(job_id)
    feature_str_list = []
    seg_candi_list = []
    ticker = TimeEstimator(self.inst_per_job)
    for i in range(self.inst_per_job):
        problem, qid = output_A[i]
        qid_str = str(qid)
        if qid_str in candi_docs:
            doc_candi = candi_docs[qid_str]
            seg_candi, features = self.process_B(problem, doc_candi)
            fstr = "\n".join([libsvm_str(qid, 0, f) for f in features])
            feature_str_list.append(fstr)
            seg_candi_list.append(seg_candi)
        else:
            feature_str_list.append([])
            seg_candi_list.append([])
        ticker.tick()
        if i % 100 == 3:
            self.code_tick.print()
    self.save("seg_candi_list", job_id, seg_candi_list)
    self.save_ltr(job_id, feature_str_list)

def create_training_instances(input_files, tokenizer, max_seq_length, dupe_factor,
                              short_seq_prob, masked_lm_prob, max_predictions_per_seq, rng):
    """Create `TrainingInstance`s from raw text."""
    all_documents = [[]]

    # Input file format:
    # (1) One sentence per line. These should ideally be actual sentences, not
    # entire paragraphs or arbitrary spans of text. (Because we use the
    # sentence boundaries for the "next sentence prediction" task).
    # (2) Blank lines between documents. Document boundaries are needed so
    # that the "next sentence prediction" task doesn't span between documents.
    for input_file in input_files:
        obj = pickle.load(open(input_file, "rb"))
        # obj = List[Document]
        # Document = List[(Tokens, Probs)]
        all_documents.extend(obj)

    # Remove empty documents
    all_documents = [x for x in all_documents if x]
    rng.shuffle(all_documents)

    vocab_words = list(tokenizer.vocab.values())
    instances = []
    ticker = TimeEstimator(dupe_factor * len(all_documents))
    for _ in range(dupe_factor):
        for document_index in range(len(all_documents)):
            instances.extend(
                create_instances_from_document(
                    all_documents, document_index, max_seq_length, short_seq_prob,
                    masked_lm_prob, max_predictions_per_seq, vocab_words, rng))
            ticker.tick()

    rng.shuffle(instances)
    return instances

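# Hypothetical invocation of create_training_instances; the file list, vocab path,
# and hyperparameter values below are illustrative assumptions, not settings taken
# from this repository.
def _example_create_training_instances():
    rng = random.Random(12345)
    tokenizer = tokenization.FullTokenizer(vocab_file="bert_voca.txt", do_lower_case=True)
    return create_training_instances(
        input_files=["documents_0.pickle"],
        tokenizer=tokenizer,
        max_seq_length=512,
        dupe_factor=1,
        short_seq_prob=0.1,
        masked_lm_prob=0.15,
        max_predictions_per_seq=20,
        rng=rng)
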
def combine_pc_rel_with_cpid(prediction_file, info: Dict) \
        -> Dict[DataID, Tuple[CPIDPair, Logits, Logits]]:
    data = EstimatorPredictionViewer(prediction_file)
    print("Num data ", data.data_len)
    out_d: Dict[DataID, Tuple[CPIDPair, Logits, Logits]] = {}
    num_append = 0
    last_claim = None
    prev_data_id = None
    ticker = TimeEstimator(data.data_len)
    for entry in data:
        ticker.tick()
        logits = entry.get_vector("logits")
        data_id = entry.get_vector("data_id")[0]
        try:
            cur_info = info[data_id]
            if 'cid' in cur_info:
                cid = cur_info['cid']
                last_claim = cid, logits
                prev_data_id = data_id
            elif 'pid' in cur_info:
                pid = cur_info['pid']
                cid, c_logits = last_claim
                cpid = CPIDPair((cid, pid))
                out_d[data_id] = (cpid, c_logits, logits)
                out_d[prev_data_id] = (cpid, c_logits, logits)
                num_append += 1
            else:
                assert False
        except KeyError as e:
            print(e)
    return out_d

def main():
    dir_path = sys.argv[1]
    tokenizer = get_tokenizer()
    averager = Averager()
    sbc = SubwordConvertor()
    df = Counter()
    collection_size = 0
    ticker = TimeEstimator(485393)
    for file_path in get_dir_files(dir_path):
        for idx, record in enumerate(tf.compat.v1.python_io.tf_record_iterator(file_path)):
            example = tf.train.Example()
            example.ParseFromString(record)
            feature = example.features.feature
            input_ids = feature["input_ids"].int64_list.value
            tokens = tokenizer.convert_ids_to_tokens(input_ids)
            sep_idx1 = tokens.index("[SEP]")
            sep_idx2 = tokens.index("[SEP]", sep_idx1 + 1)
            doc_tokens = tokens[sep_idx1:sep_idx2]
            words = lmap(tuple, sbc.get_word_as_subtoken_tuple(doc_tokens))
            dl = len(words)
            collection_size += dl
            averager.append(dl)
            for word in set(words):
                df[word] += 1
            ticker.tick()
    print("collection length", collection_size)
    print("average dl", averager.get_average())
    save_to_pickle(df, "subword_df_robust_train")

def rank_with_query_lm(query_lms: Dict[str, Counter],
                       candidate_dict: Dict[str, List[QCKCandidateI]],
                       num_query=100,
                       alpha=0.5) -> Dict[str, List[TrecRankedListEntry]]:
    run_name = "run_name"
    scorer = LMScorer(query_lms, alpha)
    out_d = {}
    print("Start scoring")
    keys = list(candidate_dict.keys())
    keys = keys[:num_query]
    ticker = TimeEstimator(len(keys))
    for query_id in keys:
        candidates = candidate_dict[query_id]

        def get_score(c: QCKCandidateI) -> float:
            text = c.text
            assert text
            score = scorer.score_text(query_id, text)
            return score

        candidates.sort(key=get_score, reverse=True)
        l: List[TrecRankedListEntry] = []
        for rank, c in enumerate(candidates):
            l.append(TrecRankedListEntry(query_id, c.id, rank, get_score(c), run_name))
        out_d[query_id] = l
        ticker.tick()
    return out_d

def qk_candidate_gen(q_res_path: str, doc_score_path, split, config) -> List[Tuple[QCKQuery, List[KDP]]]:
    queries: List[QCKQuery] = get_qck_queries(split)
    num_jobs = d_n_claims_per_split2[split]
    score_d = load_doc_scores(doc_score_path, num_jobs)

    tprint("loading ranked list")
    ranked_list: Dict[str, List[SimpleRankedListEntry]] = load_galago_ranked_list(q_res_path)
    query_ids = list(ranked_list.keys())
    query_ids.sort()
    print("num queries", len(query_ids))
    q_id_to_job_id = {q_id: job_id for job_id, q_id in enumerate(query_ids)}

    print("Pre loading docs")
    top_n = config['top_n']
    out_qk: List[Tuple[QCKQuery, List[KnowledgeDocumentPart]]] = []
    all_doc_parts = 0
    ticker = TimeEstimator(len(queries))
    for q in queries:
        job_id: int = q_id_to_job_id[q.query_id]
        entries: List = score_d[job_id]
        entries.sort(key=get_second, reverse=True)
        doc_ids = left(entries)
        doc_ids = doc_ids[:top_n]
        preload_man.preload(TokenizedCluewebDoc, doc_ids)
        docs = iterate_docs(doc_ids)
        doc_part_list: List[KDP] = iterate_document_parts(docs, config['window_size'], config['step_size'], 20)
        all_doc_parts += len(doc_part_list)
        out_qk.append((q, doc_part_list))
        ticker.tick()
    return out_qk

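# Illustrative config for qk_candidate_gen: 'top_n', 'window_size', and 'step_size'
# are the keys read above, but the values here are assumptions for the sake of
# example, not the settings used in the experiments.
example_qk_config = {
    'top_n': 10,         # how many top-scored documents to keep per query
    'window_size': 300,  # tokens per document part
    'step_size': 300,    # stride between consecutive parts
}
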
def subtoken_split(task_id):
    # robust_tokens = load_robust_token()
    token_reader = get_token_reader()
    doc_id_list = get_doc_task(task_id)
    num_doc = len(doc_id_list)
    vocab_file = os.path.join(cpath.data_path, "bert_voca.txt")
    tokenizer = tokenization.FullTokenizer(vocab_file=vocab_file, do_lower_case=True)
    window_size = 256 - 3
    skip = int(window_size / 2)
    ticker = TimeEstimator(num_doc)
    doc_seg_info = {}
    for key in doc_id_list:
        tokens = token_reader.retrieve(key)
        fn = tokenizer.wordpiece_tokenizer.tokenize
        sub_tokens = list([fn(t) for t in tokens])

        def move(loc, loc_sub, skip):
            # Advance roughly `skip` subword positions, but only stop at a word boundary.
            loc_idx = loc
            num_passed_sw = 0
            loc_sub_idx = loc_sub
            for i in range(skip):
                num_passed_sw += 1
                loc_sub_idx += 1
                if num_passed_sw == len(sub_tokens[loc_idx]):
                    loc_idx += 1
                    num_passed_sw = 0
                    if loc_idx >= len(sub_tokens):
                        break
            # only move in token level
            if num_passed_sw > 0:
                loc_sub_idx -= num_passed_sw
            return loc_idx, loc_sub_idx

        loc = 0
        loc_sub = 0
        interval_list = []
        while loc < len(tokens):
            loc_ed, loc_sub_ed = move(loc, loc_sub, skip)
            e = (loc, loc_ed), (loc_sub, loc_sub_ed)
            interval_list.append(e)
            loc = loc_ed
            loc_sub = loc_sub_ed
        doc_seg_info[key] = interval_list
        ticker.tick()
    p = os.path.join(cpath.data_path, "adhoc", "robust_seg_info_{}.pickle".format(task_id))
    pickle.dump(doc_seg_info, open(p, "wb"))

def select_paragraph(
        docs: Dict[str, List[List[str]]],
        clue12_13_df,
        claim_list: List[Dict],
        strategy="topk",
) -> List[Tuple[str, List[List[str]]]]:
    claim_id_to_text: Dict[int, str] = {c['cId']: c['text'] for c in claim_list}
    cdf = 50 * 1000 * 1000
    top_k = 100
    not_found_set = set()

    def idf(term: str):
        if term not in clue12_13_df:
            if term in string.printable:
                return 0
            not_found_set.add(term)
        return math.log((cdf + 0.5) / (clue12_13_df[term] + 0.5))

    r: List[Tuple[str, List[List[str]]]] = []
    ticker = TimeEstimator(len(docs))
    for claim_id, claim_docs in docs.items():
        claim_text = claim_id_to_text[int(claim_id)]
        q_terms = set(re_tokenize(nltk.tokenize.word_tokenize(claim_text)))

        def scorer(para: List[str]) -> float:
            return paragraph_scorer(idf, q_terms, para)

        max_score = sum(lmap(idf, q_terms))

        def get_best_per_doc(doc: List[str]) -> List[Tuple[List[str], float]]:
            paragraph_list: Iterable[List[str]] = enum_paragraph([doc])
            paragraph_scored_list: List[Tuple[List[str], float]] = lmap_pairing(scorer, paragraph_list)
            paragraph_scored_list.sort(key=lambda x: x[1], reverse=True)
            return paragraph_scored_list[:1]

        selected: List[Tuple[List[str], float]] = list(flatten(lmap(get_best_per_doc, claim_docs)))
        # if strategy == "topk":
        #     selected: List[Tuple[List[str], float]] = paragraph_scored_list[:top_k]
        # elif strategy == "cutoff":
        #     cut_off = max_score * 0.6
        #     selected: List[Tuple[List[str], float]] = lfilter(lambda x: x[1] > cut_off, paragraph_scored_list)
        # else:
        #     assert False
        e = claim_id, left(selected)
        r.append(e)
        ticker.tick()
    return r

def nli_data_indexing(self, data):
    data_info = {}
    ticker = TimeEstimator(len(data), "nli indexing", 100)
    for data_idx, e in enumerate(data):
        input_ids, input_mask, segment_ids, y = e
        tokens = self.tokenizer.convert_ids_to_tokens(input_ids)
        data_info[data_idx] = self.index(tokens)
        ticker.tick()
    return data_info

def build_co_occur_from_pc_feature(data: Dict[str, List[List[str]]]) \
        -> List[Tuple[str, Counter]]:
    window_size = 10
    stemmer = CacheStemmer()
    r = []
    ticker = TimeEstimator(len(data))
    for cid, tokens_list in data.items():
        ticker.tick()
        counter = build_co_occurrence(tokens_list, window_size, stemmer)
        r.append((cid, counter))
    return r

def do(data_id):
    working_dir = os.environ["TF_WORKING_DIR"]
    tokenizer = get_tokenizer()
    name1 = os.path.join(working_dir, "bert_loss", "{}.pickle".format(data_id))
    name2 = os.path.join(working_dir, "bfn_loss", "{}.pickle".format(data_id))
    tf_logging.debug("Loading " + name1)
    output1 = PredictionOutput(name1)
    tf_logging.debug("Loading " + name2)
    output2 = PredictionOutput(name2)
    assert len(output1.input_ids) == len(output2.input_ids)

    out_path = os.path.join(working_dir, "loss_pred_train_data/{}".format(data_id))
    record_writer = RecordWriterWrap(out_path)
    n_inst = len(output1.input_ids)
    sep_id = tokenizer.vocab["[SEP]"]
    tf_logging.debug("Iterating")
    ticker = TimeEstimator(n_inst, "", 1000)
    for i in range(n_inst):
        if i % 1000 == 0:
            assert_input_equal(output1.input_ids[i], output2.input_ids[i])
        try:
            features = get_segment_and_mask(output1.input_ids[i], sep_id)
        except Exception:
            try:
                sep_indice = get_sep_considering_masking(
                    output1.input_ids[i], sep_id,
                    output1.masked_lm_ids[i], output1.masked_lm_positions[i])
                features = get_segment_and_mask_inner(output1.input_ids[i], sep_indice)
            except Exception:
                tokens = tokenizer.convert_ids_to_tokens(output1.input_ids[i])
                print(tokenization.pretty_tokens(tokens))
                print(output1.masked_lm_ids[i])
                print(output1.masked_lm_positions[i])
                raise

        features["next_sentence_labels"] = create_int_feature([0])
        features["masked_lm_positions"] = create_int_feature(output1.masked_lm_positions[i])
        features["masked_lm_ids"] = create_int_feature(output1.masked_lm_ids[i])
        features["masked_lm_weights"] = create_float_feature(output1.masked_lm_weights[i])
        features["loss_base"] = create_float_feature(output1.masked_lm_example_loss[i])
        features["loss_target"] = create_float_feature(output2.masked_lm_example_loss[i])
        record_writer.write_feature(features)
        ticker.tick()
    record_writer.close()

def merge_counter(file_prefix, st, ed):
    count = Counter()
    ticker = TimeEstimator(ed - st)
    for i in range(st, ed):
        path = file_prefix + str(i)
        d = pickle.load(open(path, "rb"))
        for key in d:
            count[key] += d[key]
        ticker.tick()
    out_path = file_prefix + "_merged"
    pickle.dump(count, open(out_path, "wb"))

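# Hypothetical usage of merge_counter: combine per-job Counter pickles named
# "<prefix>0" ... "<prefix>9" into "<prefix>_merged". The path prefix and job range
# are assumptions for illustration.
def _example_merge_counter():
    merge_counter("/path/to/ngram_count_", 0, 10)
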
def work(self, job_id):
    qid_list = self.query_group[job_id]
    ticker = TimeEstimator(len(qid_list))
    missing_rel_cnt = 0
    missing_nrel_cnt = 0

    def empty_doc_fn(query_id, doc_id):
        rel_docs = self.ms_reader.qrel[query_id]
        nonlocal missing_rel_cnt
        nonlocal missing_nrel_cnt
        if doc_id in rel_docs:
            missing_rel_cnt += 1
        else:
            missing_nrel_cnt += 1

    for qid in qid_list:
        if qid not in self.candidate_docs_d:
            continue
        docs: List[MSMarcoDoc] = load_per_query_docs(qid, empty_doc_fn)
        ticker.tick()
        target_docs = self.candidate_docs_d[qid]

        text_d = {}
        bert_tokens_d = {}
        stemmed_tokens_d = {}
        for d in docs:
            if d.doc_id in target_docs:
                title = d.title
                title = crop_to_space(title, self.max_title_length)
                body_sents = sent_tokenize(d.body)
                new_body_sents = self.resplit_body_sents(body_sents)
                text_d[d.doc_id] = title, new_body_sents
                for tokenize_fn, save_dict in [
                        (self.bert_tokenizer.tokenize, bert_tokens_d),
                        (self.stem_tokenizer.tokenize_stem, stemmed_tokens_d)]:
                    title_tokens = tokenize_fn(title)
                    body_tokens_list = lmap(tokenize_fn, new_body_sents)
                    save_dict[d.doc_id] = (title_tokens, body_tokens_list)

        todo = [
            (text_d, self.text_dir_name),
            (bert_tokens_d, self.bert_tokens_dir_name),
            (stemmed_tokens_d, self.stemmed_tokens_dir_name),
        ]
        for tokens_d, dir_name in todo:
            save_path = os.path.join(self.out_dir, dir_name, str(qid))
            pickle.dump(tokens_d, open(save_path, "wb"))

def tokenize_docs(self, doc_id_list):
    tokenizer = get_tokenizer()
    token_d = {}
    ticker = TimeEstimator(len(doc_id_list))
    for doc_id in doc_id_list:
        text = self.data[doc_id]
        text = re.sub(r"<\s*[^>]*>", " ", text)
        # tokenize text
        tokens = tokenizer.tokenize(text)
        token_d[doc_id] = tokens
        ticker.tick()
    return token_d

def build_co_occur_from_pc_feature(
        data: Dict[str, List[ScoreParagraph]]) -> List[Tuple[str, Counter]]:
    window_size = 10
    stemmer = CacheStemmer()
    r = []
    ticker = TimeEstimator(len(data))
    for cid, para_list in data.items():
        ticker.tick()
        tokens_list: List[List[str]] = [e.paragraph.tokens for e in para_list]
        counter = build_co_occurrence(tokens_list, window_size, stemmer)
        r.append((cid, counter))
    return r

def main():
    num_inst = 1000 * 1000 * 100
    path_format = "/mnt/nfs/work3/youngwookim/data/tlm/enwiki_seg_galago/train.{}.trectext"
    text_path = list([path_format.format(i) for i in range(10)])
    ts = TextSampler(text_path)
    sp = StreamPickler("wiki_segments3_", 1000 * 100)
    ticker = TimeEstimator(num_inst)
    for i in range(num_inst):
        inst = ts.sample()
        sp.add(inst)
        ticker.tick()
    sp.flush()

def retrieve_urls(disk_id, doc_ids):
    f = open(get_url_dict_path(disk_id), "r")
    loc = 0
    d = {}
    ticker = TimeEstimator(len(doc_ids))
    for doc_id in doc_ids:
        line, found_loc = find_line_start_with(f, doc_id, loc)
        loc = found_loc
        doc_id_s, url = line.split()
        assert doc_id_s[-1] == ","
        assert doc_id == doc_id_s[:-1]
        d[doc_id] = url
        ticker.tick()
    return d

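# Hypothetical usage of retrieve_urls. The scan position `loc` is carried forward from
# the previous match, so this sketch sorts doc_ids on the assumption that the url
# dictionary file is itself sorted by doc id (not verified here); the disk id and
# ClueWeb-style doc ids are illustrative.
def _example_retrieve_urls():
    doc_ids = sorted(["clueweb09-en0000-00-00001", "clueweb09-en0000-00-00002"])
    return retrieve_urls("disk1", doc_ids)
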
def save_doc_len():
    collection = trec.load_robust(trec.robust_path)
    print("writing...")
    ticker = TimeEstimator(len(collection))
    doc_len = dict()
    for doc_id in collection:
        content = collection[doc_id]
        tokens = nltk.tokenize.wordpunct_tokenize(content)
        doc_len[doc_id] = len(tokens)
        ticker.tick()
    save_path = os.path.join(cpath.data_path, "adhoc", "doc_len.pickle")
    pickle.dump(doc_len, open(save_path, "wb"))

def run_A(self, job_id):
    output = []
    queries = []
    ticker = TimeEstimator(self.inst_per_job)
    for i in range(self.inst_per_job):
        p, q = self.process_A()
        qid = job_id * self.inst_per_job + i
        output.append((p, qid))
        queries.append((qid, q))
        ticker.tick()
    self.save_query(job_id, queries)
    self.save_output_A(job_id, output)

def get_qk_candidate(config, q_res_path, qck_queries: List[QCKQuery]) -> List[QKUnit]:
    top_n = config['top_n']
    worker = QKWorker(q_res_path, config, top_n)
    all_candidate: List[QKUnit] = []
    ticker = TimeEstimator(len(qck_queries))
    for q in qck_queries:
        ticker.tick()
        try:
            doc_part_list: List[KDP] = worker.work(q)
            e: QKUnit = q, doc_part_list
            all_candidate.append(e)
        except KeyError as e:
            print(e)
    return all_candidate

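# Hypothetical call to get_qk_candidate: only 'top_n' is read directly above, while
# any other keys would be consumed by QKWorker, so the config, split name, and path
# below are illustrative assumptions.
def _example_get_qk_candidate():
    config = {'top_n': 10}
    queries = get_qck_queries("dev")
    q_res_path = "/path/to/galago_ranked_list"
    return get_qk_candidate(config, q_res_path, queries)
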