import pickle


def pickle_payload(slave_id):
    # load_robust and load_marco_query are project-local loaders.
    query_path = "/mnt/nfs/work3/youngwookim/code/adhoc/robus/queries.train.tsv"
    docs = load_robust("/mnt/nfs/work3/youngwookim/data/robust04")
    query = load_marco_query(query_path)

    print("[{}] Load sampled".format(slave_id))
    sample_filename = "id_pair_{}.pickle".format(slave_id)
    sampled = pickle.load(open(sample_filename, "rb"))

    # Resolve sampled id pairs into (query, doc, query, doc) text tuples.
    inst = []
    for q_id_1, doc_id_1, q_id_2, doc_id_2 in sampled:
        q1 = query[q_id_1]
        d1 = docs[doc_id_1]
        q2 = query[q_id_2]
        d2 = docs[doc_id_2]
        inst.append((q1, d1, q2, d2))
    print(len(inst))

    # Dump in blocks of 1000; any trailing partial block is dropped.
    step = 1000
    n_block = int(len(inst) / step)
    for i in range(n_block):
        st = i * step
        ed = (i + 1) * step
        name = str(slave_id) + "_" + str(i)
        pickle.dump(inst[st:ed], open("../output/plain_pair_{}.pickle".format(name), "wb"))
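
# Hedged companion sketch (not in the original): iterate back over the
# chunked pickles that pickle_payload() writes. The "plain_pair_{slave}_{i}"
# naming mirrors the writer above; iter_plain_pairs is a hypothetical helper.
import os


def iter_plain_pairs(slave_id):
    i = 0
    while True:
        path = "../output/plain_pair_{}_{}.pickle".format(slave_id, i)
        if not os.path.exists(path):
            return
        for q1, d1, q2, d2 in pickle.load(open(path, "rb")):
            yield q1, d1, q2, d2
        i += 1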

import random


def split_docs():
    print("loading...")
    collection = trec.load_robust(trec.robust_path)
    window_size = 200 * 3

    # Unused alternative: a random shift would make the windows overlap.
    def sample_shift():
        return random.randrange(0, window_size * 2)

    fout = open("rob04.split.txt", "w")

    def write(new_id, text_span):
        fout.write("<DOC>\n")
        fout.write("<DOCNO>{}</DOCNO>\n".format(new_id))
        fout.write("<TEXT>\n")
        fout.write(text_span)
        fout.write("</TEXT>\n")
        fout.write("</DOC>\n")

    print("writing...")
    for doc_id in collection:
        content = collection[doc_id]
        loc_ptr = 0
        while loc_ptr < len(content):
            text_span = content[loc_ptr:loc_ptr + window_size]
            new_id = doc_id + "_{}".format(loc_ptr)
            write(new_id, text_span)
            loc_ptr += window_size  # sample_shift()
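
# Minimal self-contained sketch of the fixed-window split used by
# split_docs(): non-overlapping character spans, with the new doc id
# suffixed by the starting offset. Toy input, no Robust04 data needed.
def demo_window_split():
    window_size = 10
    content = "abcdefghijklmnopqrstuvwxyz" * 2
    loc_ptr = 0
    while loc_ptr < len(content):
        print("DOC0_{}".format(loc_ptr), content[loc_ptr:loc_ptr + window_size])
        loc_ptr += window_size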

def main():
    robust_path = "/mnt/nfs/work3/youngwookim/data/robust04"
    data = trec.load_robust(robust_path)

    def iter_docs():
        for doc_id, text in data.items():
            yield text

    summarize_doc_length(iter_docs())

import os


def sanity_check():
    ranked_list = load_2k_rank()
    collection = trec.load_robust(trec.robust_path)

    def process(doc):
        return doc.lower().split()

    # Dump the top-ranked doc of each query to disk.
    for q_id, listings in ranked_list.items():
        for doc_id, rank, score in listings[:1]:
            docs_path = os.path.join(cpath.data_path, "robust", "docs", doc_id)
            content = process(collection[doc_id])
            # process() returns a token list; join it before writing.
            open(docs_path, "w").write(" ".join(content))

from nltk.tokenize import sent_tokenize


def add_rob_sents():
    print("loading...")
    collection = trec.load_robust(trec.robust_path)
    print("writing...")
    for doc_id in collection:
        content = collection[doc_id]
        sents = sent_tokenize(content)
        # Cap pathological sentences at 500 characters.
        for i, s in enumerate(sents):
            if len(s) > 500:
                sents[i] = s[:500]
        add_sents(doc_id, sents)

def __init__(self, out_dir):
    robust_path = "/mnt/nfs/work3/youngwookim/data/robust04"
    tprint("Loading doc ids")
    self.doc_ids = all_doc_ids_of_interest()
    tprint("Loading robust docs")
    self.docs: Dict[str, str] = trec.load_robust(robust_path)
    tprint("Start processing")
    n_docs = len(self.doc_ids)
    # Ceil division so every doc id is assigned to a job. The original
    # hard-coded a divisor of 5; n_jobs is assumed to be the intended divisor.
    docs_per_job = (n_docs + n_jobs - 1) // n_jobs
    self.docs_per_job = docs_per_job
    self.tokenizer = PCTokenizer()
    self.out_dir = out_dir
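
# Sketch of the slicing this partitioning implies: with ceil division, job j
# handles doc_ids[st:ed] and the last job gets the (possibly shorter)
# remainder. job_slice is a hypothetical helper, not part of the original.
def job_slice(job_id, n_docs, n_jobs):
    docs_per_job = (n_docs + n_jobs - 1) // n_jobs
    st = job_id * docs_per_job
    ed = min(st + docs_per_job, n_docs)
    return st, ed

# e.g. job_slice(4, 103, 5) -> (84, 103): the last job takes the remainder.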

def save_doc_len():
    collection = trec.load_robust(trec.robust_path)
    print("writing...")
    ticker = TimeEstimator(len(collection))
    doc_len = dict()
    for doc_id in collection:
        content = collection[doc_id]
        tokens = nltk.tokenize.wordpunct_tokenize(content)
        doc_len[doc_id] = len(tokens)
        ticker.tick()

    save_path = os.path.join(cpath.data_path, "adhoc", "doc_len.pickle")
    pickle.dump(doc_len, open(save_path, "wb"))

def build_krovetz_index():
    stemmer = Stemmer()
    stopwords = load_stopwords()

    # Memoize stems; Krovetz stemming is comparatively slow.
    stem_dict = dict()

    def stem(token):
        if token in stem_dict:
            return stem_dict[token]
        else:
            r = stemmer.stem(token)
            stem_dict[token] = r
            return r

    collection = trec.load_robust(trec.robust_path)
    print("writing...")
    inv_index = dict()
    ticker = TimeEstimator(len(collection))
    for doc_id in collection:
        content = collection[doc_id]
        tokens = nltk.tokenize.wordpunct_tokenize(content)
        # Collect token positions per stemmed term, skipping stopwords.
        terms = dict()
        for idx, t in enumerate(tokens):
            if t in stopwords:
                continue
            t_s = stem(t)
            if t_s not in terms:
                terms[t_s] = list()
            terms[t_s].append(idx)

        # Append one (doc_id, positions) posting per term in this doc.
        for t_s in terms:
            if t_s not in inv_index:
                inv_index[t_s] = list()
            posting = (doc_id, terms[t_s])
            inv_index[t_s].append(posting)
        ticker.tick()

    save_path = os.path.join(cpath.data_path, "adhoc", "robust_inv_index.pickle")
    pickle.dump(inv_index, open(save_path, "wb"))
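
# Hedged usage sketch for the index built above: postings are
# (doc_id, [positions]) pairs keyed by stemmed term, so a conjunctive match
# is a set intersection over posting doc ids. docs_containing_all is a
# hypothetical helper; terms must already be Krovetz-stemmed.
def docs_containing_all(inv_index, terms):
    doc_sets = [set(doc_id for doc_id, _ in inv_index.get(t, []))
                for t in terms]
    return set.intersection(*doc_sets) if doc_sets else set()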

def main():
    top_k = 1000
    galago_rank = load_bm25_best()
    doc_id_set = set()
    # Entries look like (doc_id, rank, ...); sort by rank ascending
    # and keep the top_k doc ids per query.
    for query_id, ranked_list in galago_rank.items():
        ranked_list.sort(key=lambda x: x[1])
        doc_id_set.update([x[0] for x in ranked_list[:top_k]])

    doc_id_list = list(doc_id_set)
    robust_path = "/mnt/nfs/work3/youngwookim/data/robust04"
    data = load_robust(robust_path)
    save_d = {}
    for doc_id in doc_id_list:
        try:
            save_d[doc_id] = data[doc_id]
        except KeyError:
            print(doc_id, 'not found')

    save_to_pickle(save_d, "robust04_docs_predict")

def __init__(self):
    robust_path = "/mnt/nfs/work3/youngwookim/data/robust04"
    self.data = trec.load_robust(robust_path)