Example #1
def main():
    q_res_path = FilePath(
        "/mnt/nfs/work3/youngwookim/data/qck/evidence/q_res_10.txt")
    qck_queries = get_qck_queries_all()
    candidate = get_qk_candidate(config1(), q_res_path, qck_queries)
    print("Num candidate : {}", len(candidate))
    save_to_pickle(candidate, "pc_evidence_qk")
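Every example on this page stores its result with save_to_pickle(obj, name) and reads cached objects back with load_from_pickle / load_cache, none of which are shown here. As a rough, hypothetical sketch only, assuming these helpers simply wrap the standard pickle module around a project cache directory (the real signatures and storage location may differ):

import os
import pickle

CACHE_DIR = os.environ.get("CACHE_DIR", "./cache")  # hypothetical location


def save_to_pickle(obj, name):
    # Serialize obj to CACHE_DIR/<name>.pickle, creating the directory if needed.
    os.makedirs(CACHE_DIR, exist_ok=True)
    with open(os.path.join(CACHE_DIR, name + ".pickle"), "wb") as f:
        pickle.dump(obj, f)


def load_from_pickle(name):
    # Load a previously saved object; raises FileNotFoundError if it was never saved.
    with open(os.path.join(CACHE_DIR, name + ".pickle"), "rb") as f:
        return pickle.load(f)


def load_cache(name):
    # Cache-friendly variant: return None instead of raising when the entry is missing.
    try:
        return load_from_pickle(name)
    except FileNotFoundError:
        return None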
Example #2
def nli_attribution_predict(hparam, nli_setting, data_loader,
                            explain_tag, method_name, data_id, sub_range, model_path):
    enc_payload, plain_payload = data_loader.load_plain_text(data_id)
    if sub_range is not None:
        raise Exception("Sub_range is not supported")


    from attribution.gradient import explain_by_gradient
    from attribution.deepexplain.tensorflow import DeepExplain

    sess = init_session()

    with DeepExplain(session=sess, graph=sess.graph) as de:
        task = transformer_nli_pooled_embedding_in(hparam, nli_setting.vocab_size, False)
        softmax_out = tf.nn.softmax(task.logits, axis=-1)
        sess.run(tf.global_variables_initializer())
        load_model(sess, model_path)
        emb_outputs = task.encoded_embedding_out, task.attention_mask_out
        emb_input = task.encoded_embedding_in, task.attention_mask_in

        def feed_end_input(batch):
            x0, x1, x2 = batch
            return {task.x_list[0]:x0,
                    task.x_list[1]:x1,
                    task.x_list[2]:x2,
                    }

        explains = explain_by_gradient(enc_payload, method_name, explain_tag, sess, de,
                                       feed_end_input, emb_outputs, emb_input, softmax_out)

        pred_list = predict_translate(explains, data_loader, enc_payload, plain_payload)
        save_to_pickle(pred_list, "pred_{}_{}".format(method_name, data_id))
Example #3
def main():
    dir_path = sys.argv[1]
    tokenizer = get_tokenizer()
    averager = Averager()
    sbc = SubwordConvertor()
    df = Counter()
    collection_size = 0
    ticker = TimeEstimator(485393)
    for file_path in get_dir_files(dir_path):
        for idx, record in enumerate(
                tf.compat.v1.python_io.tf_record_iterator(file_path)):
            example = tf.train.Example()
            example.ParseFromString(record)
            feature = example.features.feature
            input_ids = feature["input_ids"].int64_list.value
            tokens = tokenizer.convert_ids_to_tokens(input_ids)
            sep_idx1 = tokens.index("[SEP]")
            sep_idx2 = tokens.index("[SEP]", sep_idx1 + 1)
            doc_tokens = tokens[sep_idx1:sep_idx2]
            words = lmap(tuple, sbc.get_word_as_subtoken_tuple(doc_tokens))
            dl = len(words)
            collection_size += dl
            averager.append(dl)
            for word in set(words):
                df[word] += 1
            ticker.tick()

    print("collection length", collection_size)
    print("average dl", averager.get_average())
    save_to_pickle(df, "subword_df_robust_train")
Example #4
def main(config):
    qk_candidate: List[QKUnit] = load_from_pickle(
        "robust_on_clueweb_qk_candidate")
    qk_out_entries: List[QKOutEntry] = load_qk_score(config)

    score_type = config['score_type']
    k = config['k']
    queries = left(qk_candidate)
    good_doc_list_d = {q.query_id: set() for q in queries}

    for entry in qk_out_entries:
        score = get_score_from_logit(score_type, entry.logits)
        if score > k:
            good_doc_list_d[entry.query.query_id].add(entry.kdp.doc_id)

    stat_count = Counter()

    def filter_map(qk_unit: QKUnit):
        query, kdp_list = qk_unit
        good_doc_list = good_doc_list_d[query.query_id]

        def is_good(kdp):
            return kdp.doc_id in good_doc_list

        new_kdp_list = lfilter(is_good, kdp_list)
        print("{} -> {}".format(len(kdp_list), len(new_kdp_list)))
        if not new_kdp_list:
            stat_count["no kdp"] += 1
        return query, new_kdp_list

    new_qk_candidate = lmap(filter_map, qk_candidate)
    print(stat_count)
    save_to_pickle(new_qk_candidate, "robust_on_clueweb_qk_candidate_filtered")
Example #5
def work():
    acc_counter = Counter()
    for i in range(0, 122):
        save_name = "acc_count_{}".format(i)
        counter = load_from_pickle(save_name)
        acc_counter.update(counter)
    save_to_pickle(acc_counter, "acc_count")
Example #6
def save_concat_dev():
    #    prediction_path = pjoin(output_path, "pc_long_seq11")
    prediction_path = pjoin(output_path, "pc_long_focus_1")
    scores: Dict[CPID, List[float]] = collect_pipeline2_score(
        prediction_path, "pc_rel_dev_info_all")
    reduced_score: Dict[CPID, float] = dict_value_map(sum, scores)
    save_to_pickle(reduced_score, "pc_concat_dev_score")
Example #7
def pc_new_init_prob():
    d_ids: List[int] = list(load_dev_claim_ids())
    claims = get_claims_from_ids(d_ids)
    claim_d = claims_to_dict(claims)
    bias_plus_word: Counter = load_from_pickle("bias_plus_words")
    tokenizer = PCTokenizer()

    base_p = max(bias_plus_word.values())

    init_p_score_d = {}
    for cid in d_ids:
        c_text = claim_d[cid]
        tokens = tokenizer.tokenize_stem(c_text)

        score_for_cid = Counter()
        for t, cnt in Counter(tokens).items():
            prob = cnt * base_p
            score_for_cid[t] = prob

        for t, score in bias_plus_word.items():
            score_for_cid[t] += score

        score_for_cid = normalize_counter_to_sum1(score_for_cid)
        init_p_score_d[cid] = score_for_cid

    save_to_pickle(init_p_score_d, "pc_dev_new_init_prob")
Example #8
def a_relevant_candidate(save_name, q_res_path, claims):
    top_n = 10
    ranked_list: Dict[
        str, List[SimpleRankedListEntry]] = load_galago_ranked_list(q_res_path)
    preload_docs(ranked_list, claims, top_n)
    all_passages = []
    entries = []

    all_docs = 0
    for c in claims:
        q_res: List[SimpleRankedListEntry] = ranked_list[str(c['cId'])]
        claim_text = c['text']

        def get_passage_score(dummy):
            return 0

        passages: List[Tuple[List[str], float]] = iterate_passages(
            q_res, top_n, get_passage_score)
        all_docs += len(passages)
        all_passages.extend(passages)
        entries.append((c, passages))

    print("{} claims. {} docs ".format(len(claims), all_docs))
    data = entries, all_passages
    save_to_pickle(data, save_name)
Example #9
def run(args):
    hp = hyperparams.HPGenEx()
    save_name = "{}_labels".format(args.data_name)
    data = load_as_simple_format(args.data_name)
    labels = label_predict(hp, data, args.model_path)

    save_to_pickle(labels, save_name)
Example #10
def tf_record_gen(ranked_list: Dict[str, List[SimpleRankedListEntry]],
                  queries: Dict,
                  text_reader: Callable[[str], str],
                  output_path,
                  max_seq_length: int,
                  data_info_save_name,
                  ):
    writer = RecordWriterWrap(output_path)
    tokenizer = get_tokenizer()
    dummy_label = 0

    data_id_idx = 0
    data_id_info = {}
    for query_id_str in ranked_list:
        query_rep = queries[query_id_str]
        query_str = query_rep['query']

        for ranked_entry in ranked_list[query_id_str]:
            data_id = data_id_idx
            data_id_idx += 1
            data_id_info[data_id] = (query_id_str, ranked_entry.doc_id)
            text = text_reader(ranked_entry.doc_id)
            tokens, segment_ids = encode_query_and_text(tokenizer, query_str, text, max_seq_length)
            features = get_basic_input_feature(tokenizer,
                                               max_seq_length,
                                               tokens,
                                               segment_ids)
            features['label_ids'] = create_int_feature([dummy_label])
            features['data_id'] = create_int_feature([data_id])
            writer.write_feature(features)

    save_to_pickle(data_id_info, data_info_save_name)
    writer.close()
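tf_record_gen above builds each record by pairing a query with a document through encode_query_and_text, which is not shown. A minimal sketch of the usual BERT-style pairing it presumably performs (hypothetical, assuming a WordPiece tokenizer with a tokenize() method; the real truncation policy may differ):

def encode_query_and_text(tokenizer, query_str, text, max_seq_length):
    # Hypothetical sketch: [CLS] query [SEP] text [SEP] with segment ids 0/1.
    q_tokens = tokenizer.tokenize(query_str)
    t_tokens = tokenizer.tokenize(text)
    # Reserve room for [CLS] and the two [SEP] tokens, truncating the document side.
    max_text_len = max(0, max_seq_length - len(q_tokens) - 3)
    t_tokens = t_tokens[:max_text_len]
    tokens = ["[CLS]"] + q_tokens + ["[SEP]"] + t_tokens + ["[SEP]"]
    segment_ids = [0] * (len(q_tokens) + 2) + [1] * (len(t_tokens) + 1)
    return tokens, segment_ids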
Example #11
def write_topic_sentence_as_query():
    query_collection_id = Q_CONFIG_ID_BM25_UKP

    dp_id_to_q_res_id = {}

    def dp_to_query(dp: UkpDataPoint) -> DocQuery:
        topic_tokens = clean_tokenize_str_to_tokens(dp.topic)
        sent_tokens = clean_tokenize_str_to_tokens(dp.sentence)
        qid = str(dp.id)
        dp_id_to_q_res_id[str(dp.id)] = "{}_{}".format(qid, query_collection_id)
        return format_query(topic_tokens, sent_tokens, qid, 3)

    train_data, val_data = load_all_data()

    def all_data_iterator() -> Iterator[UkpDataPoint]:
        for data_list in chain(train_data.values(), val_data.values()):
            for dp in data_list:
                yield dp

    all_queries: List[DocQuery] = lmap(dp_to_query, all_data_iterator())

    out_dir = get_query_dir(query_collection_id)
    exist_or_mkdir(out_dir)

    n_query_per_file = 50
    save_to_pickle(dp_id_to_q_res_id, "ukp_10_dp_id_to_q_res_id")
Example #12
def majority(build_lm_from_tokens_list, save_name):
    tf0 = Counter()
    tf1 = Counter()
    tf2 = Counter()
    for doc, preds in enum_docs_and_stance():
        assert len(preds) == len(doc)
        cnt_stance1 = 0
        cnt_stance2 = 0
        for sent, pred in zip(doc, preds):
            probs = softmax(pred)
            if probs[1] > 0.5:
                cnt_stance1 += 1
            elif probs[2] > 0.5:
                cnt_stance2 += 1

        if cnt_stance1 > cnt_stance2:
            stance = 1
        elif cnt_stance2 > cnt_stance1:
            stance = 2
        else:
            stance = 0

        if stance > 0:
            tf = build_lm_from_tokens_list(doc)
            [tf0, tf1, tf2][stance].update(tf)

    result = tf0, tf1, tf2
    save_to_pickle(result, save_name)
    display(tf1, tf2, "favor", "against")
Example #13
def main():
    qrel_path = "/home/youngwookim/Downloads/rob04-desc/qrels.rob04.txt"
    judgement = load_qrels_structured(qrel_path)

    def is_correct(query: QCKQuery, candidate: QCKCandidate):
        qid = query.query_id
        doc_id = candidate.id
        if qid not in judgement:
            return 0
        d = judgement[qid]
        label = 1 if doc_id in d and d[doc_id] > 0 else 0
        return label

    qk_candidate: List[QKUnit] = load_from_pickle(
        "robust_on_clueweb_qk_candidate_filtered")

    candidate_dict = load_cache("candidate_for_robust_qck_7")
    if candidate_dict is None:
        candidate_dict: \
            Dict[str, List[QCKCandidateI]] = get_candidate_all_passage_w_samping()
        save_to_pickle(candidate_dict, "candidate_for_robust_qck_7")

    generator = QCKInstanceGenerator(candidate_dict, is_correct)
    num_jobs = 250

    def worker_factory(out_dir):
        worker = QCKWorker(qk_candidate, generator, out_dir)
        return worker

    ##
    job_name = "robust_qck_10"
    runner = JobRunner(job_man_dir, num_jobs, job_name, worker_factory)
    runner.start()
Example #14
def predict_for_view(hparam, nli_setting, data_loader, data_id, model_path,
                     run_name, modeling_option, tags):
    print("predict_nli_ex")
    print("Modeling option: ", modeling_option)
    enc_payload, plain_payload = data_loader.load_plain_text(data_id)
    batches = get_batches_ex(enc_payload, hparam.batch_size, 3)

    task = transformer_nli_pooled(hparam, nli_setting.vocab_size)

    explain_predictor = ExplainPredictor(len(tags),
                                         task.model.get_sequence_output(),
                                         modeling_option)
    sess = init_session()
    sess.run(tf.global_variables_initializer())
    load_model(sess, model_path)

    out_entries = []
    for batch in batches:
        x0, x1, x2 = batch
        logits, ex_logits = sess.run(
            [task.logits, explain_predictor.get_score()],
            feed_dict={
                task.x_list[0]: x0,
                task.x_list[1]: x1,
                task.x_list[2]: x2,
            })

        for i in range(len(x0)):
            e = x0[i], logits[i], tuple_list_select(ex_logits, i)
            out_entries.append(e)

    save_to_pickle(out_entries, "save_view_{}_{}".format(run_name, data_id))
Example #15
def main():
    docs: Dict[str, List[List[str]]] = load_from_pickle("dev_claim_docs")
    _, clue12_13_df = load_clueweb12_B13_termstat()
    d_ids: List[int] = list(load_dev_claim_ids())
    claims = get_claims_from_ids(d_ids)
    r = select_paragraph(docs, clue12_13_df, claims, "topk")
    save_to_pickle(r, "dev_claim_paras")
Example #16
def do_predict(
    bert_hp,
    train_config,
    data,
    lms_config,
    modeling_option,
    init_fn,
):
    num_gpu = train_config.num_gpu
    train_batches, dev_batches = data

    lms_model = LMSModel(modeling_option, bert_hp, lms_config, num_gpu)
    sess = init_session()
    sess.run(tf.global_variables_initializer())
    init_fn(sess)

    step_size = 100
    for i in range(100):
        st = i * step_size
        ed = st + step_size
        # make sure the explain train_op does not increase the global step
        tprint(st, ed)
        output_d = predict_fn(sess, train_batches[st:ed], lms_model.logits,
                              lms_model.loss_tensor, lms_model.ex_score_tensor,
                              lms_model.per_layer_logit_tensor,
                              lms_model.batch2feed_dict)

        save_path = at_output_dir("lms_scores", str(i))
        save_to_pickle(output_d, save_path)
Example #17
def main(config):
    new_qks = filter_with_ranked_list_path(config['qk_name'],
                                           config['ranked_list_path'],
                                           config['threshold'],
                                           config['top_k'])

    save_to_pickle(new_qks, config['save_name'])
Example #18
def main():
    split = "train"
    qk_candidate = get_qk_candidate(split)
    query_lms: Dict[str, Counter] = get_query_lms(split)
    print(len(qk_candidate), len(query_lms))
    filtered_qk_candidate = filter_qk_rel(qk_candidate, query_lms, 50)
    save_to_pickle(filtered_qk_candidate, "pc_qk2_filtered_rel_{}".format(split))
Example #19
def save_for_train():
    info = load_from_pickle("pc_rel_info_all")
    prediction_path = pjoin(output_path, "pc_rel")
    rel_info: Dict[DataID, Tuple[CPIDPair, Logits,
                                 Logits]] = combine_pc_rel_with_cpid(
                                     prediction_path, info)
    save_to_pickle(rel_info, "pc_rel_with_cpid")
Example #20
def main_hp09():
    split = "train"
    qk_candidate = get_qk_candidate(split)
    query_lms: Dict[str, Counter] = get_query_lms(split)
    print(len(qk_candidate), len(query_lms))
    alpha = 0.9
    filtered_qk_candidate = filter_qk(qk_candidate, query_lms, alpha)
    save_to_pickle(filtered_qk_candidate, "pc_qk2_09_filtered_{}".format(split))
Example #21
def gen_overlap(config):
    split = config['split']
    q_res_path = config['q_res_path']
    save_name = config['save_name']
    doc_score_path = config['doc_score_path']
    candidate: List[QKUnit] = qk_candidate_gen(q_res_path, doc_score_path,
                                               split, config2())
    save_to_pickle(candidate, save_name)
Example #22
def main():
    raw_payload: List[ClaimPassages] = load_dev_payload()
    save_path = os.path.join(output_path, "pc_dev_passage_payload")
    encode = get_encode_fn(512)
    data_id_manage = DataIDManager()
    insts = list(generate_instances(raw_payload, data_id_manage))
    write_records_w_encode_fn(save_path, encode, insts, len(insts))
    save_to_pickle(data_id_manage.id_to_info, "pc_dev_passage_payload_info")
Example #23
def save_to_cache():
    for split in splits:
        job_name = "argu_qck_datagen_{}".format(split)
        candidate_dict, correct_d = load_base_resource(
            EvalCondition.EntirePortalCounters, split)
        obj = candidate_dict, correct_d

        save_to_pickle(obj, job_name + "_base_resource")
Example #24
def main():
    for split in splits:
        q_res_path = os.path.join(output_path, "perspective_experiments",
                                  "clueweb_qres", "{}.txt".format(split))
        qck_queries = get_qck_queries(split)
        candidate = get_qk_candidate(config1(), q_res_path, qck_queries)
        print("Num candidate : {}", len(candidate))
        save_to_pickle(candidate, "pc_qk2_{}".format(split))
Example #25
def main():
    split = "train"
    all_qk = load_all_qk()
    qids = list(get_qids_for_split(split_name2, split))
    qks_for_split = list([qk for qk in all_qk if qk[0].query_id in qids])
    query_lms: Dict[str, Counter] = get_claim_lms()
    print(len(qks_for_split), len(query_lms))
    filtered_qk_candidate = filter_qk_rel(qks_for_split, query_lms, 50)
    save_to_pickle(filtered_qk_candidate, "pc_qk3_filtered_rel_{}".format(split))
Example #26
def select_word_from_dev():
    tokenizer = get_tokenizer()

    tf_dev = load_from_pickle("nli_tf_dev_mis")
    selected_words = select_common(tf_dev, tokenizer)

    print(list(tf_dev.most_common(100))[-1])

    save_to_pickle(selected_words, "nli_dev_selected_words")
Example #27
def work(self, job_id):
    jsonl_path = self.jsonl_path_format.format(job_id)
    f = open(jsonl_path, "r")
    line_itr = f
    buffered_saver = datastore.tool.PayloadSaver()
    payload_saver = process_jsonl(line_itr, self.tokenize_fn,
                                  buffered_saver)
    save_name = os.path.basename(jsonl_path)
    save_to_pickle(payload_saver, save_name)
Example #28
def get_cpid_score_from_cache_or_raw(pred_path, cpid_resolute, strategy):
    cache_name = os.path.basename(pred_path) + "_" + strategy

    r = load_cache(cache_name)
    if r is None:
        r = get_cpid_score(pred_path, cpid_resolute, strategy)

    save_to_pickle(r, cache_name)
    return r
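get_cpid_score_from_cache_or_raw above follows a load-or-compute caching pattern, though as written it re-pickles the result even when it was just loaded from the cache. A generic variant of the same pattern that only writes on a cache miss could look like the sketch below (reusing the assumed load_cache / save_to_pickle helpers; this is not code from the project):

def cached(cache_name, compute_fn):
    # Return the cached object if present; otherwise compute it once and cache it.
    result = load_cache(cache_name)
    if result is None:
        result = compute_fn()
        save_to_pickle(result, cache_name)
    return result


# e.g. r = cached(cache_name, lambda: get_cpid_score(pred_path, cpid_resolute, strategy))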
Example #29
def main():
    save_name = sys.argv[1]
    out_dir = os.path.join(output_path, "cppnc")
    exist_or_mkdir(out_dir)
    info_file_path = os.path.join(out_dir, sys.argv[2])
    pred_file_path = os.path.join(out_dir, save_name + ".score")
    score_d = summarize_score(info_file_path, pred_file_path)
    save_to_pickle(score_d, "score_d")
    print("Saved as 'score_d'")
Example #30
def a_relevant(save_name, q_res_path, claims):
    top_n = 10

    ranked_list: Dict[
        str, List[SimpleRankedListEntry]] = load_galago_ranked_list(q_res_path)
    preload_docs(ranked_list, claims, top_n)
    claim_lms = build_gold_lms(claims)
    claim_lms_d = {lm.cid: lm for lm in claim_lms}
    bg_lm = average_counters(lmap(lambda x: x.LM, claim_lms))
    log_bg_lm = get_lm_log(bg_lm)

    stopwords = load_stopwords_for_query()
    alpha = 0.5

    tokenizer = PCTokenizer()
    all_passages = []
    entries = []
    num_pos_sum = 0
    num_pos_exists = 0

    for c in claims:
        q_res: List[SimpleRankedListEntry] = ranked_list[str(c['cId'])]
        claim_lm = claim_lms_d[c['cId']]
        log_topic_lm = get_lm_log(smooth(claim_lm.LM, bg_lm, alpha))
        log_odd: Counter = subtract(log_topic_lm, log_bg_lm)

        claim_text = c['text']
        claim_tokens = tokenizer.tokenize_stem(claim_text)

        scores = []
        for t in claim_tokens:
            if t in log_odd:
                scores.append(log_odd[t])

        def get_passage_score(p):
            def get_score(t):
                if t in stopwords:
                    return 0
                return log_odd[tokenizer.stemmer.stem(t)]

            return sum([get_score(t) for t in p]) / len(p) if len(p) > 0 else 0

        passages = iterate_passages(q_res, top_n, get_passage_score)
        num_pos = len(lfilter(lambda x: x[1] > 0, passages))
        num_pos_sum += num_pos
        if num_pos > 0:
            num_pos_exists += 1

        all_passages.extend(passages)
        entries.append((c, passages))

    print("{} claims. {} docs on {} claims".format(len(claims), num_pos_sum,
                                                   num_pos_exists))

    data = entries, all_passages

    save_to_pickle(data, save_name)