def work(st, ed):
    """Execute BM25 doc queries for data points in [st, ed) and store the
    ranked lists in the QueryResult DB (skipping already-stored entries)."""
    st, ed = int(st), int(ed)
    q_config_id = Q_CONFIG_ID_BM25_10000
    interface = DynRankedListInterface(make_doc_query, q_config_id)
    all_data_points = load_train_data_point()
    print("Running {}~{} of {}".format(st, ed, len(all_data_points)))
    num_request = 10000
    # Only issue queries for data points not already present in the DB.
    pending = lfilter(partial(db_not_contains, q_config_id), all_data_points[st:ed])
    queries: List[DocQuery] = lmap(datapoint_to_doc_query, pending)
    print("Executing {} queries".format(len(queries)))
    results: Dict[str, List[SimpleRankedListEntry]] = \
        send_doc_queries(interface.disk_path, num_request, queries, 600)
    qid_list = lmap(dp_to_qid, pending)
    print("{} of {} succeed".format(len(results), len(queries)))

    def store_result(query_id: str):
        # Guard clauses: skip queries that failed or are already saved.
        if query_id not in results:
            return
        res_id: str = "{}_{}".format(query_id, q_config_id)
        if has_key(QueryResult, res_id):
            return
        save(QueryResult, res_id, results[query_id])

    foreach(store_result, qid_list)
    flush()
def read_doc_list(st, ed):
    """Return the set of doc ids appearing in the cached ranked lists for
    the data points in [st, ed)."""
    st, ed = int(st), int(ed)
    q_config_id = Q_CONFIG_ID_BM25_10000
    all_data_points = load_train_data_point()
    print("Running {}~{} of {}".format(st, ed, len(all_data_points)))
    qid_list = lmap(dp_to_qid, all_data_points[st:ed])
    doc_ids = set()
    ticker = TimeEstimator(len(qid_list))

    def collect(query_id: str):
        res_id: str = "{}_{}".format(query_id, q_config_id)
        ticker.tick()
        # Queries with no cached result are silently skipped.
        if has_key(QueryResult, res_id):
            entries: List[SimpleRankedListEntry] = load(QueryResult, res_id)
            for doc_id, _rank, _score in entries:
                doc_ids.add(doc_id)

    print("parsing_doc_list")
    foreach(collect, qid_list)
    return doc_ids
def work():
    """Build train features with CollectionInterface and save them as a pickle.

    `opt` selects between 'weighted' and 'binary' feature construction.
    """
    opt = "binary"
    collection = CollectionInterface()
    all_data_points = load_train_data_point()
    if opt == "weighted":
        features = parallel_run(all_data_points, build_weighted_feature, 1000)
        save_to_pickle(features, "pc_train_features")
    elif opt == "binary":
        feature_fn = partial(build_binary_feature, collection)
        features = parallel_run(all_data_points, feature_fn, 1000)
        save_to_pickle(features, "pc_train_features_binary")
    else:
        assert False
    print("{} build from {}".format(len(features), len(all_data_points)))
def work():
    """Build train features with DynRankedListInterface and save them as a pickle.

    `opt` selects between 'weighted' (parallel) and 'binary' (serial) builds.
    """
    opt = "binary"
    interface = DynRankedListInterface(make_doc_query, Q_CONFIG_ID_BM25_10000)
    doc_getter = DocGetter()
    print("load_train_data_point")
    all_data_points = load_train_data_point()
    if opt == "weighted":
        features = parallel_run(all_data_points, build_weighted_feature, 1000)
        save_to_pickle(features, "pc_train_features")
    elif opt == "binary":
        # Binary features are built serially in this variant.
        feature_fn = partial(build_binary_feature, interface)
        features = feature_fn(all_data_points)
        save_to_pickle(features, "pc_train_features_binary")
    else:
        assert False
    print("{} build from {}".format(len(features), len(all_data_points)))
def main():
    """Query the top-100 ranked docs for every train data point and write the
    unique doc ids to <output_path>/q_res_9_100, one id per line."""
    ci = DynRankedListInterface(make_doc_query, Q_CONFIG_ID_BM25_10000)
    all_data_points = load_train_data_point()
    # Fix: message previously read "data_poing len".
    print("data_point len", len(all_data_points))

    def data_point_to_doc_id_list(x: PerspectiveCandidate) -> List[str]:
        ranked_docs: List[SimpleRankedListEntry] = ci.query(
            x.cid, x.pid, x.claim_text, x.p_text)
        # Keep only the top 100 entries per data point.
        ranked_docs = ranked_docs[:100]
        doc_id_list: List[str] = lmap(get_doc_id, ranked_docs)
        return doc_id_list

    doc_ids_list = lmap(data_point_to_doc_id_list, all_data_points)
    doc_ids = list(set(flatten(doc_ids_list)))
    print(len(doc_ids))
    save_path = os.path.join(output_path, "q_res_9_100")
    # Fix: use a context manager so the file is closed even if a write fails
    # (original used bare open()/close()).
    with open(save_path, "w") as f:
        for doc_id in doc_ids:
            f.write("{}\n".format(doc_id))
def test_rm_classifier():
    """Train a logistic-regression classifier on RM3 expansion terms and print
    the strongest positive/negative term contributions plus train/val accuracy."""
    datapoint_list = load_train_data_point()
    disk_name = index_name_list[0]
    dir_path = "/mnt/nfs/work3/youngwookim/data/perspective/train_claim_perspective/rm3"

    def get_rm(data_point):
        # Load the (term, prob) list for one (claim, perspective) pair.
        label, cid, pid, claim_text, p_text = data_point
        file_name = "{}_{}_{}.txt".format(disk_name, cid, pid)

        def parse_line(line):
            term, prob = line.split("\t")
            # prob = float(prob) * 1000
            return term, prob

        # Fix: close the file handle (original leaked one handle per data point).
        with open(os.path.join(dir_path, file_name)) as f:
            lines = f.readlines()
        return lmap(parse_line, lines), int(label)

    # Data points whose RM3 file is missing are silently skipped.
    valid_datapoint_list = lmap_w_exception(get_rm, datapoint_list, FileNotFoundError)
    print("Total of {} data point".format(len(valid_datapoint_list)))
    voca = set(left(flatten(left(valid_datapoint_list))))
    voca2idx = dict(zip(list(voca), range(len(voca))))
    idx2voca = {v: k for k, v in voca2idx.items()}
    # 70/30 train/validation split (no shuffling).
    split = int(len(valid_datapoint_list) * 0.7)
    train_data = valid_datapoint_list[:split]
    val_data = valid_datapoint_list[split:]
    featurize = partial(featurize_fn, voca, voca2idx)
    x, y = zip(*lmap(featurize, train_data))
    val_x, val_y = zip(*lmap(featurize, val_data))
    model = LogisticRegression()
    model.fit(x, y)
    x_a = np.array(x)
    print(x_a.shape)
    # Contribution of each term = (total feature mass over train set) * coef.
    avg_x = np.sum(x_a, axis=0)
    contrib = np.multiply(avg_x, model.coef_)[0]
    print(contrib.shape)
    ranked_idx = np.argsort(contrib)
    print(ranked_idx.shape)
    # 30 most negative contributions.
    for i in range(30):
        idx = ranked_idx[i]
        print(idx2voca[idx], contrib[idx])
    # 30 most positive contributions.
    for i in range(30):
        j = len(voca) - 1 - i
        idx = ranked_idx[j]
        print(idx2voca[idx], contrib[idx])

    def acc(y, pred_y):
        return np.average(np.equal(y, pred_y))

    pred_y = model.predict(x)
    print("train acc", acc(y, pred_y))
    print("val acc", acc(val_y, model.predict(val_x)))