def write(claim_ids, split_name):
    claims = get_claims_from_ids(claim_ids)
    queries = get_claims_query(claims, True)
    out_path = os.path.join(
        output_path, "perspective_{}_claim_query_k0.json".format(split_name))
    save_queries_to_file(queries, out_path)

def main():
    pc_clusters: Iterable[PerspectiveCluster] = enum_perspective_clusters()
    tokenizer = TokenizerForGalago()

    def get_terms(text: str) -> Counter:
        terms = tokenizer.tokenize(text)
        return Counter(terms)

    # Query = [claim :: avg(perspective)]
    claim_text_d: Dict[int, str] = get_all_claim_d()
    perspective_text_d: Dict[int, str] = get_perspective_dict()

    def cluster_to_query(cluster: PerspectiveCluster) -> DocQuery:
        claim_text = claim_text_d[cluster.claim_id]
        perspective_text_list = [perspective_text_d[pid] for pid in cluster.perspective_ids]
        query_id = get_pc_cluster_query_id(cluster)

        claim_tf: Counter = get_terms(claim_text)
        pers_tf: Counter = average_counters(lmap(get_terms, perspective_text_list))
        tf = sum_counters([claim_tf, pers_tf])
        query: DocQuery = counter_to_galago_query(query_id, tf)
        return query

    query_list: List[DocQuery] = lmap(cluster_to_query, pc_clusters)
    print(len(query_list))
    out_path = os.path.join(output_path, "perspective_query", "pc_query_for_evidence.json")
    save_queries_to_file(query_list, out_path)

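# A minimal illustration of the weighting in cluster_to_query above (hypothetical
# values; assumes average_counters takes the element-wise mean of the counters and
# sum_counters their element-wise sum, which matches how they are combined here):
#
#   claim_tf = Counter({"tax": 1, "cut": 1})
#   pers_tf  = average_counters([Counter({"tax": 2}), Counter({"growth": 2})])
#              # -> {"tax": 1.0, "growth": 1.0}
#   tf       = sum_counters([claim_tf, pers_tf])
#              # -> {"tax": 2.0, "cut": 1, "growth": 1.0}
#
# i.e. each query term is weighted by its claim TF plus its mean TF across the
# cluster's perspectives, then passed to counter_to_galago_query.
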
def write_queries_to_files(n_query_per_file, out_dir, queries: List[DocQuery]):
    i = 0
    while i * n_query_per_file < len(queries):
        st = i * n_query_per_file
        ed = (i + 1) * n_query_per_file
        out_path = os.path.join(out_dir, "{}.json".format(i))
        save_queries_to_file(queries[st:ed], out_path)
        i += 1

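# Example usage (a sketch with hypothetical arguments): split a query list into
# files of 100 queries each, named 0.json, 1.json, ... under the given directory.
#
#   write_queries_to_files(100, os.path.join(output_path, "galago_queries"), queries)
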
def write_simple_claim_queries():
    for split in splits:
        claim_ids = load_claim_ids_for_split(split)
        claims = get_claims_from_ids(claim_ids)
        queries = get_simple_claim_query(claims, True)
        out_path = os.path.join(output_path, "perspective_query",
                                "simple_query_{}.json".format(split))
        save_queries_to_file(queries, out_path)

def work(years, query_type, save_path):
    queries = load_queries(years)

    def convert_query(q):
        return trec_query_to_galago_query(q, query_type)

    queries = lmap(convert_query, queries)
    save_queries_to_file(queries, save_path)

def xml_query_to_json(xml_path, json_path):
    queries: List[Query] = load_xml_query(xml_path)

    def transform(q: Query) -> Dict:
        tokens = word_tokenize(q.text)
        tokens = clean_query(tokens)
        return format_query_bm25(q.qid, tokens)

    queries_dict_list: List[Dict] = lmap(transform, queries)
    save_queries_to_file(queries_dict_list, json_path)

def write_queries(split: str, queries: List[DocQuery]):
    root_dir_path = os.path.join(job_man_dir, "counter_arg_queries")
    exist_or_mkdir(root_dir_path)
    dir_path = os.path.join(root_dir_path, split)
    exist_or_mkdir(dir_path)

    query_per_file = 50
    file_idx = 0
    while file_idx * query_per_file < len(queries):
        save_path = os.path.join(dir_path, str(file_idx) + ".json")
        st = file_idx * query_per_file
        ed = st + query_per_file
        queries_to_save = queries[st:ed]
        save_queries_to_file(queries_to_save, save_path)
        file_idx += 1

def write_claim_as_query():
    d_ids = list(load_train_claim_ids())
    claims = get_claims_from_ids(d_ids)
    queries = []
    for c in claims:
        cid = c["cId"]
        claim_text = c["text"]
        tokens = claim_text.split()
        query_text = clean_query(tokens)
        print(query_text)
        q_entry = get_query_entry_bm25_anseri(cid, query_text)
        queries.append(q_entry)

    out_path = os.path.join(output_path, "perspective_dev_claim_query.json")
    save_queries_to_file(queries, out_path)

def main():
    split = "dev"
    stopword = load_stopwords_for_query()
    # split = "train"
    ex_info_dir = "/mnt/nfs/work3/youngwookim/job_man/pc_rm_terms_{}".format(split)
    query_path = os.path.join(
        output_path, "perspective_{}_claim_query_k0_fixed.json".format(split))
    queries = load_queries(query_path)
    ex_w_scale = 100
    out_path = os.path.join(output_path, "perspective_query",
                            "pc_{}_claim_query_rm_ex.json".format(split))

    new_queries = get_extended(ex_info_dir, ex_w_scale, queries, stopword)
    save_queries_to_file(new_queries, out_path)

def main():
    print("Start")
    spr = StreamPickleReader("robust_candi_query_")
    query_per_task = 1000 * 10
    out_idx = 0
    while spr.has_next():
        queries = []
        for i in range(query_per_task):
            if not spr.has_next():
                break
            q_id, query = spr.get_item()
            query = clean_query(query)
            queries.append(get_query_entry(q_id, query))

        out_path = os.path.join(cpath.output_path, "query",
                                "g_query_{}.json".format(out_idx))
        save_queries_to_file(queries, out_path)
        out_idx += 1

def send_queries_inner(index_path, num_result, queries, timeout) -> List[str]:
    query_path = get_new_query_json_path()
    # Save the queries to a temporary json file.
    save_queries_to_file(queries, query_path)

    # Issue the galago batch-search command, writing results to a temp file.
    cmd = [
        "galago",
        "threaded-batch-search",
        "--requested=" + str(num_result),
        "--index=" + index_path,
        query_path,
    ]
    os.environ['PYTHONUNBUFFERED'] = "1"
    temp_outpath = query_path + ".output"
    out_file = open(temp_outpath, "w")
    proc = subprocess.Popen(
        cmd,
        stdout=out_file,
        stderr=PIPE,
        universal_newlines=True,
    )

    # Wait for completion while reading galago's progress messages from stderr.
    # If no new progress is reported for `timeout` seconds, give up and return
    # whatever has been written so far.
    prev_num_remain = 999999
    last_update_time = time.time()
    try:
        while proc.poll() is None:
            line = proc.stderr.readline()
            if line.startswith("INFO: Still running..."):
                st = len("INFO: Still running...")
                tokens = line[st:].split()
                num_remain = int(tokens[0])
                if num_remain != prev_num_remain:
                    print(line, end='')
                    prev_num_remain = num_remain
                    last_update_time = time.time()
            if time.time() - last_update_time > timeout:
                break
    except subprocess.TimeoutExpired:
        proc.kill()

    out_file.close()
    with open(temp_outpath, "r") as f:
        file_content = f.read()
    lines: List[str] = file_content.splitlines()
    return lines
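
# Sketch of consuming the lines returned by send_queries_inner. It assumes the
# batch-search output follows the TREC run format Galago writes
# ("<qid> Q0 <doc_id> <rank> <score> <run_tag>") and that Dict/List/Tuple are
# imported from typing as in the surrounding module; parse_galago_ranked_list
# is a hypothetical helper, not part of the original code.
def parse_galago_ranked_list(lines: List[str]) -> Dict[str, List[Tuple[str, int, float]]]:
    ranked: Dict[str, List[Tuple[str, int, float]]] = {}
    for line in lines:
        tokens = line.split()
        if len(tokens) < 6:
            continue  # skip blanks or stray log lines
        qid, _, doc_id, rank, score = tokens[:5]
        ranked.setdefault(qid, []).append((doc_id, int(rank), float(score)))
    return ranked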