コード例 #1
0
def main(dir_path):
    """Balance positive/negative instances across sharded record files.

    Reads shards named 0..664 under *dir_path* (missing shards are skipped),
    groups instances by their label, downsamples both classes to the size of
    the smaller one, and writes the result to <dir_path>/all_balanced.
    """
    output_path = os.path.join(dir_path, "all_balanced")
    pos_insts = []
    neg_insts = []
    # Indexed by label value: 0 -> negatives, 1 -> positives.
    all_insts = [neg_insts, pos_insts]

    for i in range(665):
        p = os.path.join(dir_path, str(i))
        if os.path.exists(p):
            for record in load_record(p):
                new_features = collections.OrderedDict()
                for key in record:
                    new_features[key] = create_int_feature(take(record[key]))

                label = take(record['label_ids'])[0]
                all_insts[label].append(new_features)

    random.shuffle(pos_insts)
    random.shuffle(neg_insts)

    # Downsample both classes to the smaller class size.
    num_sel = min(len(pos_insts), len(neg_insts))
    print("{} insts per label".format(num_sel))

    insts_to_write = pos_insts[:num_sel] + neg_insts[:num_sel]
    writer = RecordWriterWrap(output_path)
    foreach(writer.write_feature, insts_to_write)
    # Fix: close the writer so buffered records are flushed (all other
    # RecordWriterWrap users in this file call close()).
    writer.close()
コード例 #2
0
def build_and_show():
    """Build gold claim language models and print LM / log-odds summaries
    for the first ten claims."""
    claim_lms = build_gold_claim_lm_train()
    alpha = 0.1
    # Background model: average of every claim's unigram LM.
    bg_lm = average_counters(lmap(lambda x: x.LM, claim_lms))

    def show(claim_lm: ClaimLM):
        print('----')
        print(claim_lm.claim)
        smoothed_lm = smooth(claim_lm.LM, bg_lm, alpha)
        log_topic_lm = get_lm_log(smoothed_lm)
        log_bg_lm = get_lm_log(bg_lm)
        log_odd: Counter = subtract(log_topic_lm, log_bg_lm)

        # Dump the 50 most frequent terms with their raw counts.
        for term, freq in claim_lm.LM.most_common(50):
            print(term, freq)

        s = "\t".join(left(claim_lm.LM.most_common(10)))
        print("LM freq: ", s)
        print(s)  # NOTE(review): printed twice in the original as well

        s = "\t".join(left(log_odd.most_common(30)))
        print("Log odd top", s)

        s = "\t".join(left(least_common(log_odd, 10)))
        print("Log odd bottom", s)

    for claim_lm in claim_lms[:10]:
        show(claim_lm)
コード例 #3
0
def debug_failture(predictions):
    """Print claims where ranking partially failed and return the mean AP.

    For each (claim_id, prediction_list) pair: skips claims where every
    prediction is wrong or where AP > 0.9, otherwise prints each prediction's
    correctness and score.  Returns {'map': mean AP over the printed claims}.

    NOTE(review): the name typo ("failture") is kept — callers may reference it.
    """
    gold = get_claim_perspective_id_dict()
    ap_list = []
    for c_Id, prediction_list in predictions:
        gold_pids = gold[c_Id]
        gold_pids_set: Set[int] = set(flatten(gold_pids))
        claim_text = prediction_list[0]['claim_text']
        print("Claim {}: ".format(c_Id), claim_text)
        correctness_list = lmap(lambda p: p['pid'] in gold_pids_set,
                                prediction_list)
        ap = get_ap(prediction_list, gold_pids, False)

        if not any(correctness_list):  # all wrong
            continue

        if ap > 0.9:
            continue

        def print_line(prediction):
            pid = prediction['pid']
            if pid in gold_pids_set:
                correct_str = "Y"
            else:
                correct_str = "N"

            score = prediction['score']
            # NOTE(review): score.name implies score is a structured object,
            # not a plain float — confirm against the producer of these entries.
            print(correct_str, score, score.name,
                  prediction['perspective_text'])

        foreach(print_line, prediction_list)
        ap_list.append(ap)

    # Fix: renamed local `map`, which shadowed the builtin.
    mean_ap = average(ap_list)
    return {'map': mean_ap}
コード例 #4
0
def write_records(records: List[Payload], max_seq_length, output_path):
    """Encode each Payload as BERT-style input features and write a record file."""
    tokenizer = get_tokenizer()

    def subword_tokenize(tokens: List[str]) -> List[str]:
        # Re-tokenize pre-split tokens into wordpieces.
        pieces: List[str] = []
        for token in tokens:
            pieces.extend(tokenizer.tokenize(token))
        return pieces

    def encode(inst: Payload) -> OrderedDict:
        seg1: List[str] = tokenizer.tokenize(inst.candidate_text)
        # Reserve room for [CLS] and the two [SEP] tokens.
        seg2_budget = max_seq_length - 3 - len(seg1)
        seg2 = subword_tokenize(inst.passage)[:seg2_budget]
        tokens = ["[CLS]"] + seg1 + ["[SEP]"] + seg2 + ["[SEP]"]
        segment_ids = [0] * (len(seg1) + 2) + [1] * (len(seg2) + 1)
        features = get_basic_input_feature(tokenizer, max_seq_length,
                                           tokens[:max_seq_length],
                                           segment_ids[:max_seq_length])
        features['label_ids'] = create_int_feature([inst.is_correct])
        features['data_id'] = create_int_feature([inst.data_id])
        return features

    writer = RecordWriterWrap(output_path)
    for feature_dict in lmap(encode, records):
        writer.write_feature(feature_dict)
    writer.close()
コード例 #5
0
def claim_language_model_property():
    """For each dev claim, build a unigram LM from retrieved documents and
    score every candidate perspective by summed per-token log odds."""
    dev_claim_ids = load_dev_claim_ids()
    claims = get_claims_from_ids(dev_claim_ids)
    all_ranked_list = ClaimRankedList()
    all_voca = set()  # union of token sets over all documents seen so far
    candidate_k = 50  # number of candidate perspectives requested per claim
    for claim in claims:
        claim_text, perspectives = get_perspective(claim, candidate_k)
        print(claim_text)
        unigrams = get_relevant_unigrams(perspectives)
        ranked_list = all_ranked_list.get(str(claim['cId']))
        doc_ids = [t[0] for t in ranked_list]  # first tuple element is the doc id
        print("Loading documents")
        preload_tf(doc_ids)  # warm the term-frequency cache before loading docs
        docs = lmap(load_and_format_doc, doc_ids)

        foreach(lambda doc: all_voca.update(doc['tokens_set']), docs)

        # check hypothesis
        # check_hypothesis(all_voca, cdf_cont, cdf_ncont, clueweb_cdf, clueweb_ctf, clueweb_df, clueweb_tf, ctf_cont,
        #                  ctf_ncont, df_cont, df_ncont, tf_cont, tf_ncont, unigrams)

        print("counting terms stat")

        lm_classifier = build_lm(docs, unigrams)

        for p_entry in perspectives:
            _text, _pid, _score = p_entry
            tokens = nltk.word_tokenize(_text)
            # Perspective score: sum of per-token log odds under the claim LM.
            score = sum(lmap(lm_classifier.per_token_odd, tokens))
            print(_text, score)
コード例 #6
0
def read_doc_list(st, ed):
    """Collect the set of doc ids appearing in cached query results for the
    train data points in the slice [st, ed)."""
    st = int(st)
    ed = int(ed)
    q_config_id = Q_CONFIG_ID_BM25_10000
    all_data_points = load_train_data_point()

    print("Running {}~{} of {}".format(st, ed, len(all_data_points)))

    qid_list = lmap(dp_to_qid, all_data_points[st:ed])

    doc_list = set()

    ticker = TimeEstimator(len(qid_list))

    print("parsing_doc_list")
    for query_id in qid_list:
        q_res_id: str = "{}_{}".format(query_id, q_config_id)
        ticker.tick()
        if has_key(QueryResult, q_res_id):
            ranked: List[SimpleRankedListEntry] = load(QueryResult, q_res_id)
            for entry in ranked:
                doc_id, _rank, _score = entry
                doc_list.add(doc_id)

    return doc_list
コード例 #7
0
ファイル: run_queries.py プロジェクト: clover3/Chair
def work(st, ed):
    """Run BM25 doc queries for train data points in [st, ed) and cache the
    ranked lists in the QueryResult store, skipping already-stored results."""
    st = int(st)
    ed = int(ed)
    q_config_id = Q_CONFIG_ID_BM25_10000
    ci = DynRankedListInterface(make_doc_query, q_config_id)
    all_data_points = load_train_data_point()

    print("Running {}~{} of {}".format(st, ed, len(all_data_points)))
    num_request = 10000  # results requested per query
    todo = all_data_points[st:ed]
    # Only issue queries whose results are not already in the DB.
    not_done = lfilter(partial(db_not_contains, q_config_id), todo)
    queries: List[DocQuery] = lmap(datapoint_to_doc_query, not_done)
    print("Executing {} queries".format(len(queries)))
    # NOTE(review): 600 looks like a timeout in seconds — confirm against
    # send_doc_queries' signature.
    ranked_list_dict: Dict[str, List[SimpleRankedListEntry]] = \
        send_doc_queries(ci.disk_path, num_request, queries, 600)
    qid_list = lmap(dp_to_qid, not_done)

    print("{} of {} succeed".format(len(ranked_list_dict), len(queries)))

    def add_to_db(query_id: str):
        # Persist this query's ranked list unless it appeared in the meantime.
        if query_id in ranked_list_dict:
            r = ranked_list_dict[query_id]
            q_res_id: str = "{}_{}".format(query_id, q_config_id)
            if not has_key(QueryResult, q_res_id):
                save(QueryResult, q_res_id, r)

    foreach(add_to_db, qid_list)
    flush()
コード例 #8
0
def write_records(records: List[Record], max_seq_length, output_path):
    """Encode claim/doc token pairs with per-token float labels and write them
    to a record file."""
    tokenizer = get_tokenizer()

    def encode(record: Record) -> OrderedDict:
        claim_len = len(record.claim_tokens) + 2  # [CLS] claim [SEP]
        doc_len = len(record.doc_tokens) + 1      # doc [SEP]
        tokens = (["[CLS]"] + record.claim_tokens + ["[SEP]"]
                  + record.doc_tokens + ["[SEP]"])
        segment_ids = [0] * claim_len + [1] * doc_len
        tokens = tokens[:max_seq_length]
        segment_ids = segment_ids[:max_seq_length]
        features = get_basic_input_feature(tokenizer, max_seq_length, tokens,
                                           segment_ids)

        # Per-token regression targets aligned to the doc segment; claim
        # positions get 0 and both arrays are zero-padded to max_seq_length.
        labels = [0.] * claim_len + record.scores
        labels += [0.] * (max_seq_length - len(labels))
        label_mask = [0] * claim_len + record.valid_mask
        label_mask += [0] * (max_seq_length - len(label_mask))
        features['label_ids'] = create_float_feature(labels)
        features['label_masks'] = create_int_feature(label_mask)
        return features

    writer = RecordWriterWrap(output_path)
    for feature_dict in lmap(encode, records):
        writer.write_feature(feature_dict)
    writer.close()
コード例 #9
0
def get_relevant_unigrams(perspectives):
    """Return the set of lowercased tokens appearing in any perspective text.

    Each perspective is a (text, pid, score) triple; only the text is used.
    """
    unigrams = set()
    for _text, _pid, _score in perspectives:
        unigrams.update(lower_all(nltk.word_tokenize(_text)))
    return unigrams
コード例 #10
0
def main():
    """Inspect a ranked list file (sys.argv[1]) against gold evidence.

    For each claim/perspective query, prints precision-at-num_rel plus the
    gold and false-positive entries, skipping queries with no gold or with
    (near) perfect precision.
    """
    claim_text_d: Dict[int, str] = get_all_claim_d()
    claim_text_d: Dict[str, str] = dict_key_map(str, claim_text_d)
    evi_dict: Dict[str, str] = dict_key_map(str, load_evidence_dict())
    evi_gold_dict: Dict[str, List[int]] = evidence_gold_dict_str_qid()
    print("V2")

    def print_entry(entry):
        # One line per ranked entry: rank, doc id, and its evidence text.
        evidence_text = evi_dict[entry.doc_id]
        print("[{}] {}: {}".format(entry.rank, entry.doc_id, evidence_text))

    ranked_list_dict = load_ranked_list_grouped(sys.argv[1])
    for query, ranked_list in ranked_list_dict.items():
        print()

        # Query ids have the form "<claim_id>_<perspective_id>".
        claim_id, perspective_id = query.split("_")
        gold_ids: List[str] = lmap(str, evi_gold_dict[query])
        if not gold_ids:
            print("query {} has no gold".format(query))
            continue
        assert gold_ids
        claim_text = claim_text_d[claim_id]
        perspective_text = perspective_getter(int(perspective_id))

        # Split entries into gold hits and high-ranked (top-3) false positives.
        pos_entries = []
        neg_entries = []
        for entry in ranked_list:
            label = entry.doc_id in gold_ids
            if label:
                pos_entries.append(entry)
            elif entry.rank < 3:
                neg_entries.append(entry)

        if not pos_entries:
            print("gold not in ranked list")
            continue

        num_rel = len(pos_entries)

        # Precision over the top num_rel ranked entries (R-precision style).
        correctness = []
        for entry in ranked_list[:num_rel]:
            label = entry.doc_id in gold_ids
            correctness.append(int(label))

        precision = average(correctness)
        if precision > 0.99:
            print("Good")
            continue
        print("precision at {}: {}".format(num_rel, precision))

        print("Claim: ", claim_text)
        print("perspective_text: ", perspective_text)
        print(" < GOLD >")
        foreach(print_entry, pos_entries)
        print(" < False Positive >")
        foreach(print_entry, neg_entries)
コード例 #11
0
ファイル: gen_to_csv.py プロジェクト: clover3/Chair
def main():
    """Write all true claim pairs to alamri_pilot/true_pairs_all.csv."""
    exist_or_mkdir(os.path.join(output_path, "alamri_tfrecord"))

    # NOTE(review): data_id_manager appears unused here; kept in case the
    # constructor has side effects — confirm and remove if not.
    data_id_manager = DataIDManager()
    entries = []
    for claim1, claim2 in enum_true_instance():
        entries.append((claim1.text, claim2.text))

    save_path = at_output_dir("alamri_pilot", "true_pairs_all.csv")
    # Fix: close the output file deterministically (original leaked the handle).
    with open(save_path, "w", newline='', encoding="utf-8") as out_f:
        csv_writer = csv.writer(out_f)
        foreach(csv_writer.writerow, entries)
コード例 #12
0
    def work(self, job_id):
        """Convert one pickled feature shard into a record file.

        Reads List[ParagraphClaimPersFeature] from self.input_dir/<job_id>,
        reformats each into paragraph features, and writes the encoded
        examples to self.out_dir/<job_id>.
        """
        features: List[ParagraphClaimPersFeature] = pickle.load(
            open(os.path.join(self.input_dir, str(job_id)), "rb"))

        writer = RecordWriterWrap(os.path.join(self.out_dir, str(job_id)))
        for f in features:
            f2: ParagraphFeature = to_paragraph_feature(f)
            encoded_list: List[OrderedDict] = format_paragraph_features(
                self.tokenizer, self.max_seq_length, f2)
            foreach(writer.write_feature, encoded_list)
        writer.close()
コード例 #13
0
ファイル: pilot_annotation.py プロジェクト: clover3/Chair
def main():
    """Print and save pilot claim pairs to alamri_pilot/pilot_pairs.csv."""
    save_path = at_output_dir("alamri_pilot", "pilot_pairs.csv")

    entries = []
    for claim1, claim2 in enum_true_instance(3):
        print("--")
        print("{}".format(claim1.text))
        print("{}".format(claim2.text))
        entries.append((claim1.text, claim2.text))

    # Fix: close the output file deterministically (original leaked the handle).
    with open(save_path, "w", newline='', encoding="utf-8") as out_f:
        csv_writer = csv.writer(out_f)
        foreach(csv_writer.writerow, entries)
コード例 #14
0
def main():
    """Print, per claim, a table comparing baseline scores against the
    distribution of per-document scores, bucketed into histogram bins."""
    baseline_cid_grouped, cid_grouped, claim_d = load_cppnc_related_data()
    gold = get_claim_perspective_id_dict()

    # Histogram bucket labels, ordered low to high.
    bin_keys = ["< 0.05", "< 0.50", "< 0.95", "< 1"]

    def bin_fn(item: float):
        # Map a score in [0, 1] to its bucket label.
        if item > 0.95:
            return "< 1"
        elif item > 0.5:
            return "< 0.95"
        elif item > 0.05:
            return "< 0.50"
        else:
            return "< 0.05"

    for cid, pid_entries in cid_grouped.items():
        baseline_pid_entries = baseline_cid_grouped[cid]

        # Baseline score per perspective id; each baseline entry is expected
        # to hold exactly one score.
        baseline_score_d = {}
        for cpid, a_thing_array in baseline_pid_entries:
            _, pid = cpid
            assert len(a_thing_array) == 1
            score = a_thing_array[0]['score']
            baseline_score_d[pid] = score

        gold_pids = gold[cid]

        def get_score_per_pid_entry(p_entries: Tuple[CPIDPair, List[Dict]]):
            # Sort key: mean score over an entry's score list.
            cpid, entries = p_entries
            return average(lmap(lambda e: e['score'], entries))

        pid_entries.sort(key=get_score_per_pid_entry, reverse=True)

        s = "{} : {}".format(cid, claim_d[cid])
        print(s)
        head_row = [""] + bin_keys
        rows = [head_row]
        for cpid, things in pid_entries:
            histogram = BinHistogram(bin_fn)
            _, pid = cpid
            # A perspective is gold if its pid appears in any gold cluster.
            label = any([pid in pids for pids in gold_pids])
            label_str = bool_to_yn(label)
            base_score = baseline_score_d[pid]
            base_score_str = "{0:.2f}".format(base_score)
            scores: List[float] = lmap(lambda x: (x['score']), things)
            foreach(histogram.add, scores)
            # NOTE(review): row has label + base score + bin counts, which is
            # one column wider than head_row ("" + bin_keys) — confirm intended.
            row = [label_str, base_score_str] + [
                str(histogram.counter[bin_key]) for bin_key in bin_keys
            ]
            rows.append(row)
        print_table(rows)
コード例 #15
0
def do_join_and_write(doc_list: Iterable, save_name):
    """Write doc ids from *doc_list* that are not yet in the DB to
    output_path/doc_list/<save_name>, one id per line.

    NOTE(review): len(doc_list) requires a sized iterable (not a generator) —
    confirm callers always pass one.
    """
    doc_id_in_db: Set = get_docs_in_db(save_name)
    print("doc_list", len(doc_list))
    print("Num doc in db", len(doc_id_in_db))
    doc_list_to_fetch = set(doc_list) - doc_id_in_db
    print("doc_list_to_fetch", len(doc_list_to_fetch))
    exist_or_mkdir(os.path.join(output_path, "doc_list"))

    save_path = os.path.join(output_path, "doc_list", save_name)
    # Fix: use a context manager so the file is closed even if a write fails.
    with open(save_path, "w") as f:
        for doc_id in doc_list_to_fetch:
            f.write("{}\n".format(doc_id))
コード例 #16
0
def encode_label_and_token_pair(topic_tokens, label, tokens_labeled, tokens_unlabeled, swap):
    """Build a paired token sequence with segment ids.

    Layout: [CLS] topic label_1 sent1 [SEP] (segment 0) followed by
    topic label_2 sent2 [SEP] (segment 1).  When *swap* is set, the
    unlabeled sentence goes first and the known label token moves to the
    second segment; the other side gets the unknown-label token.
    Returns (tokens, segment_ids).
    """
    known_label = get_label_token(label)
    if swap:
        sent1, label_1 = tokens_unlabeled, get_unk_label_token()
        sent2, label_2 = tokens_labeled, known_label
    else:
        sent1, label_1 = tokens_labeled, known_label
        sent2, label_2 = tokens_unlabeled, get_unk_label_token()

    first_segment = ["[CLS]"] + list(topic_tokens) + [label_1] + list(sent1) + ["[SEP]"]
    second_segment = list(topic_tokens) + [label_2] + list(sent2) + ["[SEP]"]

    tokens = first_segment + second_segment
    segment_ids = [0] * len(first_segment) + [1] * len(second_segment)
    return tokens, segment_ids
コード例 #17
0
def build_co_occurrence(list_tokens: List[List[str]], window_size,
                        stemmer: CacheStemmer) -> Counter:
    """Count within-window term co-occurrences over stemmed, stopword-filtered
    token lists and return the accumulated Counter."""
    stemmed: List[List[str]] = lmap(stemmer.stem_list, list_tokens)

    stopword = load_stopwords()

    filtered: List[List[str]] = [
        [t for t in tokens if t not in stopword]
        for tokens in stemmed
    ]

    counter = Counter()
    for token_list in filtered:
        count_co_ocurrence(window_size, counter, token_list)

    return counter
コード例 #18
0
def test_generative_model():
    """Train a generative LM classifier and print validation accuracy."""
    train, val = load_feature_and_split()
    print("Training lm")
    classifier = learn_lm(train)
    stopwords = load_stopwords()

    # Strip stopwords/punctuation in place from each training point's feature
    # counter.  NOTE(review): this happens after learn_lm, so the classifier
    # was trained on the unfiltered features — confirm intended.
    for data_point in train:
        remove_stopword_and_punct(stopwords, data_point['feature'])

    def is_correct(data_point: Dict):
        x = data_point['feature']
        y = int(data_point['label'])
        return classifier.predict(x) == int(y)

    correctness = lmap(is_correct, val)

    print("val acc: ", average(correctness))
コード例 #19
0
    def work(self, job_id):
        """Convert one pickled feature shard into retrieval-format records.

        Reads List[ParagraphClaimPersFeature] from self.input_dir/<job_id>,
        writes encoded examples to self.out_dir/<job_id>, and dumps the merged
        data-id info dict to self.info_out_dir/<job_id>.
        """
        features: List[ParagraphClaimPersFeature] = pickle.load(
            open(os.path.join(self.input_dir, str(job_id)), "rb"))

        info_d_all = {}
        # Offset data ids per job so ids are globally unique across shards.
        data_id_base = job_id * 100000
        data_id_gen = DataIDGen(data_id_base)
        writer = RecordWriterWrap(os.path.join(self.out_dir, str(job_id)))
        for f in features:
            # to_retrieval_format returns (info_dict, encoded_feature_list).
            pair = to_retrieval_format(self.tokenizer, self.max_seq_length,
                                       data_id_gen, f)
            info_d: Dict = pair[0]
            f2: List[OrderedDict] = pair[1]

            info_d_all.update(info_d)
            foreach(writer.write_feature, f2)
        writer.close()

        pickle.dump(info_d_all,
                    open(os.path.join(self.info_out_dir, str(job_id)), "wb"))
コード例 #20
0
def main():
    """Write one CSV of claim pairs per review, plus a summary CSV listing
    (review_no, pmid, pair_count) per review."""
    save_dir = at_output_dir("alamri_annotation1", "grouped_pairs")
    exist_or_mkdir(save_dir)

    summary = []
    # TODO: grouped_claim_pairs is still a stub — enumerate(NotImplemented)
    # raises TypeError until a real value is supplied.
    grouped_claim_pairs: List[Tuple[Review, List[Claim,
                                                 Claim]]] = NotImplemented
    for review_idx, (review, claim_pairs) in enumerate(grouped_claim_pairs):
        entries = [(claim1.text, claim2.text) for claim1, claim2 in claim_pairs]

        review_no = review_idx + 1
        save_path = os.path.join(save_dir, "{}.csv".format(review_no))
        # Fix: close each per-review CSV deterministically (original leaked
        # the file handles).
        with open(save_path, "w", newline='', encoding="utf-8") as out_f:
            foreach(csv.writer(out_f).writerow, entries)
        summary.append((str(review_no), review.pmid, str(len(claim_pairs))))

    # Fix: corrected "sumamry.csv" filename typo.
    save_path = os.path.join(save_dir, 'summary.csv')
    with open(save_path, "w", newline='', encoding="utf-8") as out_f:
        foreach(csv.writer(out_f).writerow, summary)
コード例 #21
0
def lm_contribution():
    """Train an LM classifier and print the terms contributing most to its
    decisions, accumulated over the training data."""
    train, val = load_feature_and_split()
    print("Training lm")
    stopwords = load_stopwords()

    # Strip stopwords/punctuation in place from each point's term counter.
    def fileter_fn(data_point):
        remove_stopword_and_punct(stopwords, data_point[0][0])

    foreach(fileter_fn, train)
    classifier = learn_lm(train)

    acc_contrib = Counter()
    for data_point in train:
        # NOTE(review): data point shape assumed ((tf_counter, num), label).
        (tf, num), y = data_point

        contrib = classifier.counter_contribution(tf)
        # print("{} {} {}".format(y, classifier.predict(tf), classifier.counter_odd(tf)))
        # print("--------------")
        # Accumulate each point's top-100 per-term contributions.
        for t, score in contrib.most_common(100):
            acc_contrib[t] += score

    # Print top terms with their class-conditional probabilities.
    for t, score in acc_contrib.most_common(100):
        print(t, score, classifier.P_w_C_dict[t], classifier.P_w_NC_dict[t])
コード例 #22
0
def write_records(records: List[PairedInstance], max_seq_length, output_path):
    """Encode paired (good passage, worse passage) instances into combined
    features and write them to *output_path*."""
    tokenizer = get_tokenizer()

    def tokenize_from_tokens(tokens: List[str]) -> List[str]:
        # Re-tokenize pre-split tokens into wordpieces.
        output = []
        for t in tokens:
            ts = tokenizer.tokenize(t)
            output.extend(ts)
        return output

    def encode(inst: PairedInstance) -> OrderedDict:
        tokens1: List[str] = tokenizer.tokenize(inst.candidate_text)
        # Reserve room for [CLS] and the two [SEP] tokens.
        max_seg2_len = max_seq_length - 3 - len(tokens1)

        def concat_tokens(raw_tokens: List[str]):
            # Build "[CLS] candidate [SEP] passage [SEP]" with segment ids,
            # truncated to max_seq_length.
            tokens2 = tokenize_from_tokens(raw_tokens)[:max_seg2_len]
            tokens = ["[CLS]"] + tokens1 + ["[SEP]"] + tokens2 + ["[SEP]"]

            segment_ids = [0] * (len(tokens1) + 2) \
                          + [1] * (len(tokens2) + 1)
            tokens = tokens[:max_seq_length]
            segment_ids = segment_ids[:max_seq_length]
            return tokens, segment_ids

        # Encode both passages against the same candidate text.
        out_tokens1, seg1 = concat_tokens(inst.passage_good)
        out_tokens2, seg2 = concat_tokens(inst.passage_worse)
        features = combine_features(out_tokens1, seg1, out_tokens2, seg2,
                                    tokenizer, max_seq_length)
        features['strict_good'] = create_int_feature([inst.strict_good])
        features['strict_bad'] = create_int_feature([inst.strict_bad])
        return features

    writer = RecordWriterWrap(output_path)
    features: List[OrderedDict] = lmap(encode, records)
    foreach(writer.write_feature, features)
    writer.close()
コード例 #23
0
def main():
    """Render per-token CPPNC scores for the first ten claims as HTML,
    colorizing tokens by score sign/magnitude."""
    print("Loading scores...")
    cid_grouped: Dict[str, Dict[str, List[Dict]]] = load_cppnc_score_wrap()
    baseline_cid_grouped = load_baseline("train_baseline")
    gold = get_claim_perspective_id_dict()
    tokenizer = get_tokenizer()
    claim_d = load_train_claim_d()

    print("Start analyzing")
    html = HtmlVisualizer("cppnc_value_per_token_score.html")
    claim_cnt = 0
    for cid, pid_entries_d in cid_grouped.items():
        pid_entries_d: Dict[str, List[Dict]] = pid_entries_d
        pid_entries: List[Tuple[str, List[Dict]]] = list(pid_entries_d.items())
        # Baseline data is keyed by int cid, score data by str cid.
        baseline_pid_entries = baseline_cid_grouped[int(cid)]
        baseline_score_d = fetch_score_per_pid(baseline_pid_entries)
        gold_pids = gold[int(cid)]

        # ret[0]: doc_idx -> (doc_id, passage_idx); ret[1]: per-doc score lists.
        ret = collect_score_per_doc(baseline_score_d, get_score_from_entry, gold_pids,
                                                                  pid_entries)
        passage_tokens_d = collect_passage_tokens(pid_entries)
        doc_info_d: Dict[int, Tuple[str, int]] = ret[0]
        doc_value_arr: List[List[float]] = ret[1]

        # Group (doc_id, passage_idx, avg_score) triples by doc id.
        kdp_result_grouped = defaultdict(list)
        for doc_idx, doc_values in enumerate(doc_value_arr):
            doc_id, passage_idx = doc_info_d[doc_idx]
            avg_score = average(doc_values)
            kdp_result = doc_id, passage_idx, avg_score
            kdp_result_grouped[doc_id].append(kdp_result)

        s = "{} : {}".format(cid, claim_d[int(cid)])
        html.write_headline(s)
        claim_cnt += 1
        if claim_cnt > 10:
            break

        # NOTE(review): r[2] indexes into each doc's score list, not the
        # kdp_result triples — looks like it may have been meant to read
        # kdp averages; confirm.
        scores: List[float] = list([r[2] for r in doc_value_arr])

        foreach(html.write_paragraph, lmap(str, scores))

        for doc_id, kdp_result_list in kdp_result_grouped.items():
            html.write_headline(doc_id)
            tokens, per_token_score = combine_collect_score(tokenizer, doc_id, passage_tokens_d, kdp_result_list)
            str_tokens = tokenizer.convert_ids_to_tokens(tokens)
            row = cells_from_tokens(str_tokens)
            # First row: highlight each token by its first score.
            for idx in range(len(str_tokens)):
                score = per_token_score[idx][0]
                # Scale |score| into a 0-100 highlight intensity (0.01 -> 100).
                norm_score = min(abs(score) * 10000, 100)
                color = "B" if score > 0 else "R"
                row[idx].highlight_score = norm_score
                row[idx].target_color = color

            rows = [row]
            nth = 0
            any_score_found = True
            # Emit one extra row per additional score each token has, until
            # no token has an nth score left.
            while any_score_found:
                any_score_found = False
                score_list = []
                for idx in range(len(str_tokens)):
                    if nth < len(per_token_score[idx]):
                        score = per_token_score[idx][nth]
                        any_score_found = True
                    else:
                        score = "-"
                    score_list.append(score)

                def get_cell(score):
                    # "-" marks a token with no nth score; otherwise colorize.
                    if score == "-":
                        return Cell("-")
                    else:
                        # 0.01 -> 100
                        norm_score = min(abs(score) * 10000, 100)
                        color = "B" if score > 0 else "R"
                        return Cell("", highlight_score=norm_score, target_color=color)

                nth += 1
                if any_score_found:
                    row = lmap(get_cell, score_list)
                    rows.append(row)
            html.multirow_print_from_cells_list(rows)
コード例 #24
0
def test_logistic_regression():
    """Train a logistic regression on term-frequency features, report
    train/val accuracy, and print the most contributing features."""
    train_and_val = load_feature_and_split()
    train: List[Dict] = train_and_val[0]
    val: List[Dict] = train_and_val[1]
    valid_datapoint_list: List[Dict] = train + val
    stopwords = load_stopwords()

    # Strip stopwords/punctuation in place from each point's feature counter.
    def fileter_fn(data_point: Dict):
        remove_stopword_and_punct(stopwords, data_point['feature'])

    foreach(fileter_fn, train)
    foreach(fileter_fn, val)

    # Build the vocabulary from the 10k most frequent terms overall.
    tf_list = lmap(lambda dp: dp['feature'], valid_datapoint_list)
    tf_acc = Counter()
    for tf in tf_list:
        tf_acc.update(tf)

    voca: List[str] = left(tf_acc.most_common(10000))
    #voca = set(flatten(lmap(get_voca_from_datapoint, valid_datapoint_list)))
    voca2idx: Dict[str, int] = dict(zip(list(voca), range(len(voca))))
    idx2voca: Dict[int, str] = {v: k for k, v in voca2idx.items()}
    print("Num voca:", len(voca))
    # +1 slot at the end for the num_mention feature.
    feature_size = len(voca) + 1

    def featurize(datapoint: Dict):
        # Dense vector: term weights at vocab indices, num_mention last.
        tf = datapoint['feature']
        y = int(datapoint['label'])
        v = np.zeros([feature_size])
        for t, prob in tf.items():
            if t in voca2idx:
                v[voca2idx[t]] = prob
        v[-1] = datapoint['num_mention']
        return v, int(y)

    x, y = zip(*lmap(featurize, train))
    val_x, val_y = zip(*lmap(featurize, val))

    model = LogisticRegression()
    model.fit(x, y)

    x_a = np.array(x)
    print(x_a.shape)
    avg_x = np.sum(x_a, axis=0)

    def acc(y, pred_y):
        return np.average(np.equal(y, pred_y))

    pred_y = model.predict(x)
    print("train acc", acc(y, pred_y))
    print("val acc", acc(val_y, model.predict(val_x)))
    # Contribution of each feature = summed feature value * learned weight.
    t = np.multiply(avg_x, model.coef_)
    contrib = t[0]
    ranked_idx = np.argsort(contrib)

    def print_feature_at(idx):
        if idx == feature_size - 1:
            print("[NUM_MENTION]", contrib[idx])
        else:
            print(idx2voca[idx], contrib[idx])

    # NOTE(review): np.argsort is ascending, so the first block prints the
    # most NEGATIVE contributions — the POS/NEG headings look swapped; also
    # the second block indexes with len(voca) - 1 - i rather than
    # feature_size - 1 - i, skipping the last feature. Confirm.
    print("Top k features (POS)")
    for i in range(30):
        idx = ranked_idx[i]
        print_feature_at(idx)

    print("Top k features (NEG)")
    for i in range(30):
        j = len(voca) - 1 - i
        idx = ranked_idx[j]
        print_feature_at(idx)

    print("In training data")
    print("pred\tgold\tterms")
    for i in range(100):
        terms = left(train[i]['feature'].most_common(50))
        terms = list(terms[25:])
        print(pred_y[i], y[i], terms)  #
コード例 #25
0
ファイル: uni_lm.py プロジェクト: clover3/Chair
        yield RelevanceModel(query.id.id, query.text, counter)


if __name__ == "__main__":
    # Build unigram relevance models for the training split and print
    # LM / log-odds summaries for the first ten.
    split = "training"
    # NOTE(review): annotated as List[Tuple[str, Counter]], but x.lm / r.text
    # access below implies these are RelevanceModel objects — confirm.
    lms: List[Tuple[str, Counter]] = list(build_lm(split))
    alpha = 0.1  # smoothing weight toward the background LM
    # Background model: average of every query's unigram LM.
    bg_lm = average_counters(lmap(lambda x: x.lm, lms))

    def show(r: RelevanceModel):
        # Print the query text, its top raw-frequency terms, and the terms
        # with highest / lowest log odds versus the background model.
        print('----')
        print(r.text)
        log_topic_lm = get_lm_log(smooth(r.lm, bg_lm, alpha))
        log_bg_lm = get_lm_log(bg_lm)
        log_odd: Counter = subtract(log_topic_lm, log_bg_lm)

        for k, v in r.lm.most_common(50):
            print(k, v)

        s = "\t".join(left(r.lm.most_common(10)))
        print("LM freq: ", s)
        print(s)

        s = "\t".join(left(log_odd.most_common(30)))
        print("Log odd top", s)

        s = "\t".join(left(least_common(log_odd, 10)))
        print("Log odd bottom", s)

    foreach(show, lms[:10])