Example #1
0
def load_data(batch_size):
    """Load the pickled MSMARCO train/dev splits and batch each of them."""
    def _load_split(split_name):
        pickle_path = os.path.join(data_path, "msmarco", split_name + ".pickle")
        return get_batches_ex(load_pickle_from(pickle_path), batch_size, 4)

    return _load_split("train"), _load_split("dev")
Example #2
0
def main():
    """Inspect info/prediction/segment-resource entries for one (qid, doc_id) pair."""
    info_dir = os.path.join(job_man_dir, "best_seg_prediction_gen_train_info")
    job_id = 0
    info_file_path = os.path.join(info_dir, str(job_id) + ".info")
    print(info_file_path)
    # Fix: close the info file deterministically (was json.load(open(...))).
    with open(info_file_path, "r") as f:
        info = json.load(f)
    prediction_dir = "output/mmd_ss/mmd_Z_50000"
    prediction_file = os.path.join(prediction_dir, str(job_id) + ".score")
    pred_data: List[Dict] = join_prediction_with_info(prediction_file, info)

    # Fix: the target pair was defined in target_qdid but never used — the
    # loops below hard-coded the same literals. Use one definition so the
    # target can be changed in a single place.
    target_qid, target_doc_id = "1000633", "D144400"

    saved_entries = []
    for key, entry in info.items():
        if entry['qid'] == target_qid and entry['doc_id'] == target_doc_id:
            saved_entries.append(entry)
            print(entry)

    print('--')
    for entry in pred_data:
        if entry['qid'] == target_qid and entry['doc_id'] == target_doc_id:
            print(entry)

    sr_path = os.path.join(job_man_dir, "seg_resource_train", target_qid)
    sr_per_query: SRPerQuery = load_pickle_from(sr_path)

    for sr_per_query_doc in sr_per_query.sr_per_query_doc:
        if sr_per_query_doc.doc_id == target_doc_id:
            print("doc {} has {} segs".format(sr_per_query_doc.doc_id,
                                              len(sr_per_query_doc.segs)))
Example #3
0
def main():
    """Probe a word2vec model: sum term vectors and rank related words two ways."""
    load_path = os.path.join(output_path, "word2vec_clueweb12_13B")

    # Fix: load_path is already a complete path; wrapping it in another
    # os.path.join() call was a no-op.
    model: gensim.models.Word2Vec = load_pickle_from(load_path)
    print(model.trainables.syn1neg.shape)
    terms = ['proposition', 'issue', 'reason']

    # Sum the input-layer vectors of the probe terms.
    v_sum = np.sum([model[t] for t in terms], axis=0)
    print(v_sum)
    j = np.argmax(v_sum)
    print(list([model[t][j] for t in terms]))
    # Rank by cosine similarity against the summed vector, then re-rank the
    # candidates by their value at the dominant dimension j.
    candi = model.wv.similar_by_vector(v_sum, topn=300)
    j_rank = np.argsort([model[word][j] for word, _ in candi])[::-1]
    for j_idx in j_rank[:20]:
        print(candi[j_idx])

    # Rank the vocabulary by output-layer (context) score against v_sum.
    scores = np.dot(v_sum, model.trainables.syn1neg.T)
    print(scores.shape)
    rank_by_context = np.argsort(scores)[::-1]
    for j_idx in rank_by_context[:20]:
        print(model.wv.index2word[j_idx])
Example #4
0
def main():
    """Entry point: demo scores for the pickled info selected by the first CLI arg."""
    max_seq_length = 512
    data_idx = int(sys.argv[1])
    pickle_dir = os.path.join(job_man_dir, "robust_w_data_id_desc_info_pickle")
    info = load_pickle_from(os.path.join(pickle_dir, str(data_idx)))
    demo_score(info, max_seq_length)
Example #5
0
def main():
    """Print per-layer average and maximum sigmoid probabilities over masked tokens."""
    num_layers = 12
    dva = DictValueAverage()

    all_val = defaultdict(list)
    for i in range(1):
        save_path = at_output_dir("lms_scores", str(i) + ".pickle")
        output_d = load_pickle_from(save_path)
        input_mask = output_d['input_mask']  # [num_inst, seq_length]
        for layer_no in range(num_layers):
            probs = sigmoid(
                output_d['logits'][layer_no])  # [num_inst, seq_length, 2]
            num_inst, seq_length, maybe_2 = np.shape(probs)

            # Accumulate the class-1 probability for every non-masked position.
            for data_idx in range(num_inst):
                for seq_idx in range(seq_length):
                    if input_mask[data_idx, seq_idx]:
                        key = layer_no
                        v = probs[data_idx, seq_idx, 1]
                        dva.add(key, v)
                        all_val[key].append(v)

    for k, v in dva.all_average().items():
        print(k, v)

    for k, l in all_val.items():
        # Fix: the computed value is max(l) but was misleadingly named min_val.
        max_val = max(l)
        print(k, max_val)
Example #6
0
def main(config):
    """Score each query's candidates with a word-probability Scorer and write a TREC run.

    config keys: split, top_k (50 or 1000), word_prob_path, run_name, save_path.
    """
    split = config['split']
    top_k = config['top_k']
    word_prob_path = config['word_prob_path']
    run_name = config['run_name']
    save_path = config['save_path']
    if top_k == 50:
        candidate_d: Dict[str, List[QCKCandidate]] = get_eval_candidates_as_qck(split)
    elif top_k == 1000:
        candidate_d: Dict[str, List[QCKCandidate]] = get_eval_candidates_1k_as_qck(split)
    else:
        # Fix: `assert False` is stripped under -O; raise a real exception
        # for invalid config values.
        raise ValueError("unsupported top_k: {}".format(top_k))

    per_query_infos: Dict[str, Dict[WordAsID, np.array]] = load_pickle_from(word_prob_path)

    all_ranked_list_entries = []

    for query_id, d in per_query_infos.items():
        scorer = Scorer(d, True)
        candidates: List[QCKCandidate] = candidate_d[query_id]

        # Score every candidate text, then sort by score descending.
        entries = []
        for c in candidates:
            e = c.id, scorer.score(c.text)
            entries.append(e)
        entries.sort(key=get_second, reverse=True)

        ranked_list_entries = scores_to_ranked_list_entries(entries, run_name, query_id)
        all_ranked_list_entries.extend(ranked_list_entries)

    write_trec_ranked_list_entry(all_ranked_list_entries, save_path)
Example #7
0
def get_candiset(i):
    """Return the pickled CandiSet for index i, or None when the file is absent."""
    pickle_path = os.path.join(cpath.data_path, "stream_pickled",
                               "CandiSet_{}_0".format(i))
    if os.path.exists(pickle_path):
        return load_pickle_from(pickle_path)
    return None
Example #8
0
def load_info_from_compressed(pickle_path):
    """Load an info pickle and decompress each entry's segment ids."""
    tprint("loading info pickle")
    data = load_pickle_from(pickle_path)
    tprint("decompressing...")
    return {data_id: decompress_seg_ids_entry(value_d)
            for data_id, value_d in data.items()}
Example #9
0
def get_ap_from_file_path(input_path):
    """Compute average precision from a pickled, batched prediction file."""
    prediction_d = flatten_batches(load_pickle_from(input_path))
    label_ids = prediction_d["label_ids"]
    scores = lmap(logit_to_score_softmax, prediction_d["logits"])

    assert len(scores) == len(label_ids)
    return get_ap(label_ids, scores)
Example #10
0
 def __init__(
     self,
     out_path,
     input_file,
     train_fn: GraphEmbeddingTrainer,
 ):
     """Load the pickled corpus and keep a sorted list of its integer keys."""
     self.out_dir = out_path
     self.corpus_d: Dict[int, List[List[str]]] = load_pickle_from(input_file)
     self.key_list = sorted(self.corpus_d.keys())
     self.train_fn: GraphEmbeddingTrainer = train_fn
Example #11
0
 def work(self, job_id):
     """Write the selected passage features for one job id, merging rel-ex scores."""
     tfrecord_path = os.path.join(self.input_dir, str(job_id))
     features = load_record(tfrecord_path)
     save_path = os.path.join(self.out_dir, str(job_id))
     rel_score_path = os.path.join(self.rel_ex_score_dir, str(job_id))
     rel_score = load_pickle_from(rel_score_path)
     writer = RecordWriterWrap(save_path)
     try:
         for f in collect_passages(features, self.relevance_scores,
                                   self.cpid_to_label, self.num_max_para,
                                   self.window_size, rel_score):
             writer.write_feature(f)
     finally:
         # Fix: close the writer even when collect_passages raises, so the
         # output file handle is not leaked.
         writer.close()
Example #12
0
def load_corpus():
    """Concatenate token lists from the first ~52 pickle files under a fixed dir."""
    dir_path = FilePath("/mnt/nfs/work3/youngwookim/data/bert_tf/clueweb12_13B_word_tokens/")

    corpus = []
    # NOTE: the original counter was incremented only after the break test,
    # so files at indices 0..51 are consumed before stopping; preserved here.
    for file_no, file_path in enumerate(get_dir_files(dir_path)):
        corpus.extend(load_pickle_from(file_path))
        if file_no > 50:
            break
    return corpus
Example #13
0
def generate_selected_training_data_for_many_runs(
        target_data_idx, info_dir, max_seq_length, score_and_save_dir: List,
        generate_selected_training_data_fn):
    """Load the info pickle once, then generate data for each (score, save) dir pair."""
    key = left(robust_query_intervals)[target_data_idx]
    info_path = os.path.join(info_dir, str(key))
    tprint("loading info: " + info_path)
    info = load_pickle_from(info_path)
    for score_dir, save_dir in score_and_save_dir:
        exist_or_mkdir(save_dir)
        tprint(save_dir)
        generate_selected_training_data_fn(
            info, key, max_seq_length, save_dir, score_dir)
Example #14
0
def main():
    """CLI entry: generate exact-match selected training data for one data index."""
    target_data_idx = int(sys.argv[1])
    max_seq_length = int(sys.argv[2])
    max_seg = int(sys.argv[3])
    info_path = os.path.join(job_man_dir,
                             "robust_w_data_id_desc_info_pickle",
                             str(target_data_idx))
    info = load_pickle_from(info_path)
    save_dir_path = at_output_dir(
        "robust_seg_sel", "exact_match{}_{}".format(max_seq_length, max_seg))
    exist_or_mkdir(save_dir_path)
    get_score_fn = get_score_fn_functor()
    generate_selected_training_data(
        info, max_seq_length, save_dir_path, get_score_fn, max_seg)
Example #15
0
def generate_selected_training_data_loop(split_no, score_dir, info_dir,
                                         max_seq_length, save_dir,
                                         generate_selected_training_data_fn):
    """Generate selected training data for every training interval of a split."""
    train_items, held_out = get_robust_splits(split_no)
    print(train_items)
    exist_or_mkdir(save_dir)
    for key in train_items:
        info_path = os.path.join(info_dir, str(key))
        tprint("loading info: " + info_path)
        info = load_pickle_from(info_path)
        generate_selected_training_data_fn(
            info, key, max_seq_length, save_dir, score_dir)
Example #16
0
def parse_prediction_and_eval(prediction_path, payload_type, data_id, k=100):
    """Rank predictions, dump a text ranked list next to them, and evaluate."""
    payload_info = get_payload_info(payload_type, data_id)
    tf_prediction_data = load_pickle_from(prediction_path)
    all_ranked_list = generate_ranked_list(tf_prediction_data, payload_info, k)

    st = int(data_id)
    write_ranked_list(range(st, st + 50), all_ranked_list,
                      prediction_path + ".txt")
    # Keep only the doc ids from each ranked entry.
    pred_list = [[x[0] for x in ranked_list] for ranked_list in all_ranked_list]
    # NOTE: `eval` here is a project-level evaluation helper, not the builtin.
    return eval(pred_list, data_id)
Example #17
0
def collect_ngram_count(dir_path, ed, ngram_range, st):
    """Collect per-n document-frequency counts from PCNGramFeature pickles.

    Returns a dict mapping n -> Counter(ngram -> number of features that
    contain that ngram at least once).

    Fix: the original also accumulated an `all_counter` total-count table
    that was never read or returned; that dead work is removed.
    """
    df_counter = {n: Counter() for n in ngram_range}
    for i in range(st, ed):
        file_path = os.path.join(dir_path, str(i))
        features: List[PCNGramFeature] = load_pickle_from(file_path)

        for f in features:
            for n in ngram_range:
                counter: Counter = f.n_grams[n]
                # Document frequency: +1 per feature, regardless of count.
                for key in counter:
                    df_counter[n][key] += 1
    return df_counter
Example #18
0
def save_to_trec_format(prediction_path, payload_type, data_id, num_candidate,
                        run_name, save_path):
    """Convert pickled predictions into a TREC ranked-list file."""
    payload_info = get_payload_info(payload_type, data_id)
    tf_prediction_data = load_pickle_from(prediction_path)
    all_ranked_list = generate_ranked_list(tf_prediction_data, payload_info,
                                           num_candidate)
    # Query ids are 50 consecutive integers starting at data_id.
    st = int(data_id)
    query_ids = [str(i) for i in range(st, st + 50)]
    all_entries: List[Tuple[str, List[TrecRankedListEntry]]] = []
    for query_id, ranked_list in zip(query_ids, all_ranked_list):
        rl = []
        for doc_id, rank, score in ranked_list:
            rl.append(TrecRankedListEntry(query_id, doc_id, rank, score,
                                          run_name))
        all_entries.append((query_id, rl))

    write_ranked_list_from_s(dict(all_entries), save_path)
Example #19
0
def main(info_path, input_type, label_dict_path, save_path):
    """Build TREC relevance judgements from an info dump and a pickled label dict.

    Pairs missing from the label dict are judged not-relevant (False).
    """
    f_handler = get_format_handler(input_type)
    info: Dict[str, Dict] = load_combine_info_jsons(
        info_path, f_handler.get_mapping(), f_handler.drop_kdp())
    label_dict: Dict[Tuple[str, str], bool] = load_pickle_from(label_dict_path)

    judgements = []
    for entry in info.values():
        key = f_handler.get_pair_id(entry)
        query_id, candidate_id = key
        correctness = label_dict.get(key, False)
        judgements.append(
            TrecRelevanceJudgementEntry(query_id, candidate_id,
                                        int(correctness)))

    write_trec_relevance_judgement(judgements, save_path)
 def work(self, job_id):
     """Run a biased random walk for one (cid, pair_counter) and pickle the result.

     Each input pickle holds 10 items; job_id selects file `job_id // 10`,
     item `job_id % 10`. A cid missing from prob_score_d is skipped
     (best-effort behavior preserved).
     """
     file_no = int(job_id / 10)
     idx = job_id % 10
     pc_co_occurrence = load_pickle_from(
         os.path.join(self.input_dir, str(file_no)))
     cid, pair_counter = pc_co_occurrence[idx]
     edges, valid_vertices = select_vertices_edges(pair_counter)
     try:
         init_p_dict = Counter(dict(self.prob_score_d[cid]))
         result = run_biased_random_walk(edges, valid_vertices,
                                         self.max_repeat, self.p_reset,
                                         init_p_dict)
         result = Counter(result)
         output = cid, result
         save_path = os.path.join(self.out_dir, str(job_id))
         # Fix: close the output file deterministically instead of leaking
         # the handle from pickle.dump(output, open(...)).
         with open(save_path, "wb") as out_f:
             pickle.dump(output, out_f)
     except KeyError as e:
         print(e)
Example #21
0
def main():
    """Save genex score text files for each method/config/data-index combination."""
    data_name = "wiki"
    # Fix: the packed data depends only on data_name but was reloaded inside
    # the innermost loop (once per method x config x index); load it once.
    data: List[PackedInstance] = load_packed(data_name)
    for method in ["deletion", "LIME"]:
        for config in [DropStop, Config2, ConfigShort]:
            data_method_str = "{}_{}".format(data_name, method)
            save_dir = os.path.join(output_path, "genex", data_method_str)
            for i in range(100):
                try:
                    idx_str = "{0:02d}".format(i)
                    score_name = "{}_{}_{}".format(data_name, idx_str, method)
                    save_name = "{}_{}.txt".format(score_name, config.name)
                    save_path = os.path.join(save_dir, save_name)
                    score_path = os.path.join(data_path, "cache",
                                              data_method_str,
                                              score_name + ".pickle")
                    scores: List[np.array] = load_pickle_from(score_path)
                    save_score_to_file(data, config, save_path, scores)
                except Exception:
                    # Identify which dataset failed, then propagate.
                    print(data_name)
                    raise
Example #22
0
def main(config):
    """Select per-query keywords whose pos/neg log-prob gap exceeds a threshold.

    Writes a JSON dict query_id -> list of selected words to config['save_path'].
    """
    split = config['split']
    word_prob_path = config['word_prob_path']
    save_path = config['save_path']
    threshold = config['threshold']
    per_query_infos: Dict[str,
                          Dict[WordAsID,
                               np.array]] = load_pickle_from(word_prob_path)
    # NOTE(review): claim_d is built but never read below; the load is kept
    # in case it has validation side effects — confirm before removing.
    claims = load_claims_for_sub_split(split)
    claim_d = claims_to_dict(claims)
    stopwords = load_stopwords_for_query()

    def is_stopword(tokens):
        # Only single-token stopwords are filtered.
        if len(tokens) == 1 and tokens[0] in stopwords:
            return True
        else:
            return False

    tokenizer = get_tokenizer()

    all_d = {}
    for query_id, d in per_query_infos.items():
        entry = []
        for key in d.keys():
            tokens: List[str] = decode_word_as_id(tokenizer, key)
            if is_stopword(tokens):
                continue

            plain_word: str = pretty_tokens(tokens, True)
            pos, neg = d[key]
            # 1e-10 guards log(0) for zero probabilities.
            pos_log = math.log(pos + 1e-10)
            neg_log = math.log(neg + 1e-10)
            diff = pos_log - neg_log
            entry.append((plain_word, diff, pos_log, neg_log))

        entry.sort(key=get_second, reverse=True)
        word_list = []
        for word, diff, pos, neg in entry[:100]:
            if diff > threshold:
                word = word.strip()
                word_list.append(word)
        all_d[query_id] = word_list
    # Fix: close the output file deterministically (was json.dump(..., open(...))).
    with open(save_path, "w") as out_f:
        json.dump(all_d, out_f)
Example #23
0
def main():
    """Print AUC and AP for the pickled prediction file given on the CLI."""
    tf_prediction_data = flatten_batches(load_pickle_from(sys.argv[1]))
    logits = tf_prediction_data["logits"]
    label_ids = tf_prediction_data["label_ids"]

    scores = lmap(logit_to_score_softmax, logits)

    assert len(scores) == len(label_ids)
    print("{} data points".format(len(scores)))
    rows = [[metric_name, metric_fn(label_ids, scores)]
            for metric_fn, metric_name in [(get_auc, "auc"), (get_ap, "ap")]]
    print_table(rows)
Example #24
0
def build_ngram_feature(dir_path, st, ed):
    """Vectorize PCNGramFeature pickles against the selected ngram set and save."""
    selected_ngram_set = load_from_pickle("selected_ngram_feature")
    ngram_range = [1, 2, 3]
    all_data_point = []
    for i in range(st, ed):
        features: List[PCNGramFeature] = load_pickle_from(
            os.path.join(dir_path, str(i)))

        for f in features:
            # Concatenate, per n, the counts of every selected ngram.
            vector_builder = []
            for n in ngram_range:
                counter: Counter = f.n_grams[n]
                vector_builder.extend(counter[k] for k in selected_ngram_set[n])
            all_data_point.append(PCVectorFeature(f.claim_pers, vector_builder))

    save_to_pickle(all_data_point,
                   os.path.basename(dir_path) + "_ngram_features")
Example #25
0
def main(config):
    """Print, per query, the top words ranked by pos/neg log-probability gap."""
    split = config['split']
    word_prob_path = config['word_prob_path']
    per_query_infos: Dict[str,
                          Dict[WordAsID,
                               np.array]] = load_pickle_from(word_prob_path)
    claims = load_claims_for_sub_split(split)
    claim_d = claims_to_dict(claims)
    stopwords = load_stopwords_for_query()

    def is_stopword(tokens):
        # Only single-token stopwords are filtered out.
        return len(tokens) == 1 and tokens[0] in stopwords

    tokenizer = get_tokenizer()

    for query_id, d in per_query_infos.items():
        entry = []
        for key in d.keys():
            tokens: List[str] = decode_word_as_id(tokenizer, key)
            if is_stopword(tokens):
                continue

            plain_word: str = pretty_tokens(tokens, True)
            pos, neg = d[key]
            # 1e-10 guards log(0) for zero probabilities.
            pos_log = math.log(pos + 1e-10)
            neg_log = math.log(neg + 1e-10)
            entry.append((plain_word, pos_log - neg_log, pos_log, neg_log))

        print(query_id, claim_d[int(query_id)])
        entry.sort(key=get_second, reverse=True)
        for word, diff, pos, neg in entry[:100]:
            print("{0}\t{1:.2f}\t{2:.2f}\t{3:.2f}".format(
                word.strip(), diff, pos, neg))
Example #26
0
def save_to_trec_format():
    """Merge robust train predictions over four query blocks into one TREC file."""
    prediction_path_format = "output/robust/A_train_{}.score"
    payload_type = "first_clean"
    num_candidate = 100
    run_name = "train_eval"
    save_path = "output/ranked_list/robust_train_pred.txt"
    all_entries = []
    # Each block starts at a query id and spans 50 consecutive queries.
    for data_id in [301, 351, 401, 601]:
        payload_info = get_payload_info(payload_type, str(data_id))
        tf_prediction_data = load_pickle_from(
            prediction_path_format.format(data_id))
        all_ranked_list = generate_ranked_list(tf_prediction_data,
                                               payload_info, num_candidate)
        query_ids = [str(i) for i in range(data_id, data_id + 50)]
        for query_id, ranked_list in zip(query_ids, all_ranked_list):
            rl = [TrecRankedListEntry(query_id, doc_id, rank, score, run_name)
                  for doc_id, rank, score in ranked_list]
            all_entries.append((query_id, rl))

    write_ranked_list_from_s(dict(all_entries), save_path)
Example #27
0
def load_entries(cid):
    """Load the grouped cppnc entries pickled for the given claim id."""
    return load_pickle_from(
        os.path.join(output_path, "cppnc", "cid_grouped", cid))
Example #28
0
def predict_nli(model_path):
    """Run prediction on the first 10 pickled msmarco dev batches."""
    hp = HPGenEx()
    run_name = "msmarco"
    dev_pickle_path = os.path.join(data_path, "msmarco", "dev.pickle")
    dev_batches = load_pickle_from(dev_pickle_path)[:10]
    predict(hp, run_name, dev_batches, model_path, load_model)
Example #29
0
def main():
    """Render per-token ablation scores for Robust desc predictions as HTML.

    Aggregates segment-level prediction scores into per-token score diffs
    via token_score_by_ablation, then writes a tooltip-annotated HTML page,
    keeping positive (max score >= 0.6) and negative documents balanced and
    stopping after 500 documents.
    """
    prediction_file_path = at_output_dir("robust", "rob_dense2_pred.score")
    info_file_path = at_job_man_dir1("robust_predict_desc_128_step16_2_info")
    queries: Dict[str, str] = load_robust_04_query("desc")
    tokenizer = get_tokenizer()
    # Pre-compute each query's tokenized length; consumed by
    # token_score_by_ablation to interpret the segment windows.
    query_token_len_d = {}
    for qid, q_text in queries.items():
        query_token_len_d[qid] = len(tokenizer.tokenize(q_text))
    step_size = 16
    window_size = 128
    out_entries: List[AnalyzedDoc] = token_score_by_ablation(
        info_file_path, prediction_file_path, query_token_len_d, step_size,
        window_size)

    qrel_path = "/home/youngwookim/Downloads/rob04-desc/qrels.rob04.txt"
    judgement_d = load_qrels_structured(qrel_path)

    html = HtmlVisualizer("robust_desc_128_step16_2.html", use_tooltip=True)

    tprint("loading tokens pickles")
    tokens_d: Dict[str, List[str]] = load_pickle_from(
        os.path.join(sydney_working_dir, "RobustPredictTokens3", "1"))
    tprint("Now printing")
    n_printed = 0

    def transform(x):
        # Monotone cubic remap of [0, 1] (0 -> 0, 1 -> 0.75) used to scale
        # highlight intensity so high scores stand out.
        return 3 * (math.pow(x - 0.5, 3) + math.pow(0.5, 3))

    n_pos = 0
    n_neg = 0
    for e in out_entries:
        # Highest segment score observed anywhere in this document.
        max_score: float = max(
            lmap(SegmentScorePair.get_max_score,
                 flatten(e.token_info.values())))
        if max_score < 0.6:
            # Low-scoring docs are kept only while negatives do not
            # outnumber positives, to keep the page balanced.
            if n_neg > n_pos:
                continue
            else:
                n_neg += 1
                pass
        else:
            n_pos += 1

        n_printed += 1
        if n_printed > 500:
            break

        doc_tokens: List[str] = tokens_d[e.doc_id]
        # Scored positions run from 0 to the largest token index seen.
        score_len = max(e.token_info.keys()) + 1
        judgement: Dict[str, int] = judgement_d[e.query_id]
        label = judgement[e.doc_id]

        # Sanity check: the scored span should cover the document plus at
        # most one extra window; otherwise skip this document.
        if not len(doc_tokens) <= score_len < len(doc_tokens) + window_size:
            print("doc length : ", len(doc_tokens))
            print("score len:", score_len)
            print("doc length +step_size: ", len(doc_tokens) + step_size)
            continue

        row = []
        q_text = queries[e.query_id]
        html.write_paragraph("qid: " + e.query_id)
        html.write_paragraph("q_text: " + q_text)
        html.write_paragraph("Pred: {0:.2f}".format(max_score))
        html.write_paragraph("Label: {0:.2f}".format(label))

        for idx in range(score_len):
            # Positions past the document end render as a placeholder token.
            token = doc_tokens[idx] if idx < len(doc_tokens) else '[-]'
            token_info: List[SegmentScorePair] = e.token_info[idx]
            full_scores: List[float] = lmap(SegmentScorePair.get_score_diff,
                                            token_info)

            full_score_str = " ".join(lmap(two_digit_float, full_scores))
            # 1 ~ -1
            score = average(full_scores)
            # Blue for score-increasing tokens, red for score-decreasing.
            if score > 0:
                color = "B"
            else:
                color = "R"
            normalized_score = transform(abs(score)) * 200
            c = get_tooltip_cell(token, full_score_str)
            c.highlight_score = normalized_score
            c.target_color = color
            row.append(c)

        html.multirow_print(row, 16)
Example #30
0
def main():
    """Render precomputed per-token scores for Robust desc predictions as HTML.

    Like the ablation variant, but consumes DocTokenScore entries from
    collect_token_scores, prints at most 10 documents whose max segment
    score is >= 0.6, and raises IndexError on a token/score length mismatch
    instead of skipping.
    """
    prediction_file_path = at_output_dir("robust", "rob_dense_pred.score")
    info_file_path = at_job_man_dir1("robust_predict_desc_128_step16_info")
    queries: Dict[str, str] = load_robust_04_query("desc")
    tokenizer = get_tokenizer()
    # Pre-compute each query's tokenized length; consumed by
    # collect_token_scores to interpret the segment windows.
    query_token_len_d = {}
    for qid, q_text in queries.items():
        query_token_len_d[qid] = len(tokenizer.tokenize(q_text))
    step_size = 16
    window_size = 128
    out_entries: List[DocTokenScore] = collect_token_scores(
        info_file_path, prediction_file_path, query_token_len_d, step_size,
        window_size)

    qrel_path = "/home/youngwookim/Downloads/rob04-desc/qrels.rob04.txt"
    judgement_d = load_qrels_structured(qrel_path)

    html = HtmlVisualizer("robust_desc_128_step16.html", use_tooltip=True)

    tprint("loading tokens pickles")
    tokens_d: Dict[str, List[str]] = load_pickle_from(
        os.path.join(sydney_working_dir, "RobustPredictTokens3", "1"))
    tprint("Now printing")
    n_printed = 0

    def transform(x):
        # Monotone cubic remap of [0, 1] (0 -> 0, 1 -> 0.75) used to scale
        # highlight intensity so high scores stand out.
        return 3 * (math.pow(x - 0.5, 3) + math.pow(0.5, 3))

    for e in out_entries:
        max_score = e.max_segment_score()
        # Only confidently-scored documents are rendered, 10 at most.
        if max_score < 0.6:
            continue
        n_printed += 1
        if n_printed > 10:
            break
        doc_tokens: List[str] = tokens_d[e.doc_id]
        score_len = len(e.scores)
        judgement: Dict[str, int] = judgement_d[e.query_id]
        label = judgement[e.doc_id]

        # Sanity check: the scored span must cover the document plus at
        # most one extra window; a violation is treated as a hard error.
        if not len(doc_tokens) <= score_len < len(doc_tokens) + window_size:
            print("doc length : ", len(doc_tokens))
            print("score len:", score_len)
            print("doc length +step_size: ", len(doc_tokens) + step_size)
            raise IndexError

        row = []
        q_text = queries[e.query_id]
        html.write_paragraph("qid: " + e.query_id)
        html.write_paragraph("q_text: " + q_text)
        html.write_paragraph("Pred: {0:.2f}".format(max_score))
        html.write_paragraph("Label: {0:.2f}".format(label))

        for idx in range(score_len):
            # Positions past the document end render as a placeholder token.
            token = doc_tokens[idx] if idx < len(doc_tokens) else '[-]'

            full_scores = e.full_scores[idx]
            full_score_str = " ".join(lmap(two_digit_float, full_scores))
            score = e.scores[idx]
            normalized_score = transform(score) * 200
            c = get_tooltip_cell(token, full_score_str)
            c.highlight_score = normalized_score
            row.append(c)

        html.multirow_print(row, 16)