Esempio n. 1
0
def work(st, ed):
    """Run BM25 doc queries for the data-point slice [st, ed) and store results.

    Skips data points whose results are already in the DB, sends the remaining
    queries, and persists each returned ranked list under a per-query key.
    """
    start_idx = int(st)
    end_idx = int(ed)
    q_config_id = Q_CONFIG_ID_BM25_10000
    ci = DynRankedListInterface(make_doc_query, q_config_id)
    all_data_points = load_train_data_point()

    print("Running {}~{} of {}".format(start_idx, end_idx, len(all_data_points)))
    num_request = 10000
    selected = all_data_points[start_idx:end_idx]
    # Only query data points that do not already have results in the DB.
    not_done = lfilter(partial(db_not_contains, q_config_id), selected)
    queries: List[DocQuery] = lmap(datapoint_to_doc_query, not_done)
    print("Executing {} queries".format(len(queries)))
    ranked_list_dict: Dict[str, List[SimpleRankedListEntry]] = \
        send_doc_queries(ci.disk_path, num_request, queries, 600)
    qid_list = lmap(dp_to_qid, not_done)

    print("{} of {} succeed".format(len(ranked_list_dict), len(queries)))

    def add_to_db(query_id: str):
        # Persist a result only if we received one and it is not saved yet.
        if query_id not in ranked_list_dict:
            return
        ranked_list = ranked_list_dict[query_id]
        q_res_id: str = "{}_{}".format(query_id, q_config_id)
        if not has_key(QueryResult, q_res_id):
            save(QueryResult, q_res_id, ranked_list)

    foreach(add_to_db, qid_list)
    flush()
def read_doc_list(st, ed):
    """Collect the set of doc ids appearing in stored query results for [st, ed)."""
    begin = int(st)
    end = int(ed)
    q_config_id = Q_CONFIG_ID_BM25_10000
    all_data_points = load_train_data_point()

    print("Running {}~{} of {}".format(begin, end, len(all_data_points)))

    qid_list = lmap(dp_to_qid, all_data_points[begin:end])

    doc_list = set()
    ticker = TimeEstimator(len(qid_list))

    def get_doc_list(query_id: str):
        # Accumulate doc ids from the stored result for this query, if any.
        q_res_id: str = "{}_{}".format(query_id, q_config_id)
        ticker.tick()
        if not has_key(QueryResult, q_res_id):
            return
        entries: List[SimpleRankedListEntry] = load(QueryResult, q_res_id)
        for doc_id, _rank, _score in entries:
            doc_list.add(doc_id)

    print("parsing_doc_list")
    foreach(get_doc_list, qid_list)

    return doc_list
Esempio n. 3
0
def work():
    """Build train features for all data points and save them as a pickle.

    `opt` selects the featurization scheme:
      - "weighted": parallel build via build_weighted_feature
      - "binary":   parallel build via build_binary_feature (needs CollectionInterface)
    """
    opt = "binary"
    ci = CollectionInterface()

    all_data_points = load_train_data_point()

    if opt == "weighted":
        features = parallel_run(all_data_points, build_weighted_feature, 1000)
        save_to_pickle(features, "pc_train_features")
    elif opt == "binary":
        build_binary_feature_fn = partial(build_binary_feature, ci)
        features = parallel_run(all_data_points, build_binary_feature_fn, 1000)
        save_to_pickle(features, "pc_train_features_binary")
    else:
        # `assert False` is stripped under `python -O`, which would fall through
        # with `features` unbound; raise explicitly instead.
        raise ValueError("unknown opt: {}".format(opt))

    print("{} build from {}".format(len(features), len(all_data_points)))
Esempio n. 4
0
def work():
    """Build binary train features serially and save them as a pickle.

    NOTE(review): the original had the parallel_run call for the "binary" path
    commented out and ran build_binary_feature in-process instead — presumably a
    deliberate workaround; the serial behavior is kept.
    """
    opt = "binary"
    ci = DynRankedListInterface(make_doc_query, Q_CONFIG_ID_BM25_10000)
    doc_getter = DocGetter()
    print("load_train_data_point")
    all_data_points = load_train_data_point()
    print("")
    if opt == "weighted":
        features = parallel_run(all_data_points, build_weighted_feature, 1000)
        save_to_pickle(features, "pc_train_features")
    elif opt == "binary":
        # Runs serially over the whole data set (see docstring).
        build_binary_feature_fn = partial(build_binary_feature, ci)
        features = build_binary_feature_fn(all_data_points)
        save_to_pickle(features, "pc_train_features_binary")
    else:
        # `assert False` is stripped under `python -O`, which would fall through
        # with `features` unbound; raise explicitly instead.
        raise ValueError("unknown opt: {}".format(opt))
    print("{} build from {}".format(len(features), len(all_data_points)))
Esempio n. 5
0
def main():
    """Query the top-100 ranked docs for every data point and write the unique
    doc ids, one per line, to `output_path/q_res_9_100`."""
    ci = DynRankedListInterface(make_doc_query, Q_CONFIG_ID_BM25_10000)

    all_data_points = load_train_data_point()

    # Fixed typo in the log message ("data_poing" -> "data_point").
    print("data_point len", len(all_data_points))

    def data_point_to_doc_id_list(x: PerspectiveCandidate) -> List[str]:
        # Query the ranked list for this (claim, perspective) pair.
        ranked_docs: List[SimpleRankedListEntry] = ci.query(
            x.cid, x.pid, x.claim_text, x.p_text)
        # Keep only the top-100 ranked documents per data point.
        ranked_docs = ranked_docs[:100]
        doc_id_list: List[str] = lmap(get_doc_id, ranked_docs)
        return doc_id_list

    doc_ids_list = lmap(data_point_to_doc_id_list, all_data_points)
    doc_ids = list(set(flatten(doc_ids_list)))
    print(len(doc_ids))

    save_path = os.path.join(output_path, "q_res_9_100")

    # Context manager guarantees the file is closed even if a write fails
    # (the original opened/closed manually and would leak on error).
    with open(save_path, "w") as f:
        for doc_id in doc_ids:
            f.write("{}\n".format(doc_id))
Esempio n. 6
0
def test_rm_classifier():
    """Train a logistic-regression classifier on RM3 term-probability features.

    Loads per-(claim, perspective) RM3 term lists from disk, builds a
    bag-of-terms feature space, fits LogisticRegression, then prints the
    strongest negative/positive contributing terms and train/val accuracy.
    """
    datapoint_list = load_train_data_point()

    disk_name = index_name_list[0]
    dir_path = "/mnt/nfs/work3/youngwookim/data/perspective/train_claim_perspective/rm3"

    def get_rm(data_point):
        # Returns ([(term, scaled_prob), ...], label) for one data point.
        # Raises FileNotFoundError when the RM3 file is missing (filtered below).
        label, cid, pid, claim_text, p_text = data_point
        file_name = "{}_{}_{}.txt".format(disk_name, cid, pid)

        def parse_line(line):
            term, prob = line.split("\t")
            # Scale probabilities up so coefficients land in a workable range.
            prob = float(prob) * 1000
            return term, prob

        # `with` closes the handle; the original leaked it on every call.
        with open(os.path.join(dir_path, file_name)) as f:
            parsed = lmap(parse_line, f)
        return parsed, int(label)

    # Skip data points whose RM3 file is missing on disk.
    valid_datapoint_list = lmap_w_exception(get_rm, datapoint_list,
                                            FileNotFoundError)
    print("Total of {} data point".format(len(valid_datapoint_list)))

    voca = set(left(flatten(left(valid_datapoint_list))))
    voca2idx = dict(zip(list(voca), range(len(voca))))
    idx2voca = {v: k for k, v in voca2idx.items()}

    # 70/30 train/validation split.
    split = int(len(valid_datapoint_list) * 0.7)
    train_data = valid_datapoint_list[:split]
    val_data = valid_datapoint_list[split:]

    # Removed dead code: pos_data/neg_data compared string literals ("1"/"0")
    # against the int labels produced by get_rm, so they were always empty,
    # and were never used afterwards.
    featurize = partial(featurize_fn, voca, voca2idx)
    x, y = zip(*lmap(featurize, train_data))
    val_x, val_y = zip(*lmap(featurize, val_data))

    model = LogisticRegression()
    model.fit(x, y)

    x_a = np.array(x)
    print(x_a.shape)
    avg_x = np.sum(x_a, axis=0)

    # Per-feature contribution = summed feature value * learned coefficient.
    contrib = np.multiply(avg_x, model.coef_)[0]
    print(contrib.shape)
    ranked_idx = np.argsort(contrib)
    print(ranked_idx.shape)

    # 30 smallest (most negative) contributions...
    for i in range(30):
        idx = ranked_idx[i]
        print(idx2voca[idx], contrib[idx])

    # ...and the 30 largest (most positive).
    for i in range(30):
        j = len(voca) - 1 - i
        idx = ranked_idx[j]
        print(idx2voca[idx], contrib[idx])

    def acc(y, pred_y):
        return np.average(np.equal(y, pred_y))

    pred_y = model.predict(x)
    print("train acc", acc(y, pred_y))
    print("val acc", acc(val_y, model.predict(val_x)))