Example #1
def main():
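    # Build subword-level document-frequency statistics from BERT tf.record files:
    # for each example, take the segment between the first and second [SEP],
    # group subword pieces into words, and accumulate document length and
    # per-word document frequency.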
    dir_path = sys.argv[1]
    tokenizer = get_tokenizer()
    averager = Averager()
    sbc = SubwordConvertor()
    df = Counter()
    collection_size = 0
    ticker = TimeEstimator(485393)
    for file_path in get_dir_files(dir_path):
        for idx, record in enumerate(
                tf.compat.v1.python_io.tf_record_iterator(file_path)):
            example = tf.train.Example()
            example.ParseFromString(record)
            feature = example.features.feature
            input_ids = feature["input_ids"].int64_list.value
            tokens = tokenizer.convert_ids_to_tokens(input_ids)
            sep_idx1 = tokens.index("[SEP]")
            sep_idx2 = tokens.index("[SEP]", sep_idx1 + 1)
            doc_tokens = tokens[sep_idx1:sep_idx2]
            words = lmap(tuple, sbc.get_word_as_subtoken_tuple(doc_tokens))
            dl = len(words)
            collection_size += dl
            averager.append(dl)
            for word in set(words):
                df[word] += 1
            ticker.tick()

    print("collection length", collection_size)
    print("average dl", averager.get_average())
    save_to_pickle(df, "subword_df_robust_train")
Example #2
def main():
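    # Estimate the average number of words in the document segment (tokens
    # between the first and second [SEP]), sampling every third record and
    # skipping records whose last input_mask position is 0 (padded sequences).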
    dir_path = sys.argv[1]
    tokenizer = get_tokenizer()
    averager = Averager()

    for file_path in get_dir_files(dir_path):
        for idx, record in enumerate(
                tf.compat.v1.python_io.tf_record_iterator(file_path)):
            if idx % 3:
                continue
            example = tf.train.Example()
            example.ParseFromString(record)
            feature = example.features.feature
            input_mask = feature["input_mask"].int64_list.value
            if input_mask[-1]:
                input_ids = feature["input_ids"].int64_list.value
                tokens = tokenizer.convert_ids_to_tokens(input_ids)
                sep_idx1 = tokens.index("[SEP]")
                sep_idx2 = tokens.index("[SEP]", sep_idx1 + 1)
                doc_tokens = tokens[sep_idx1:sep_idx2]
                continue_cnt = 0
                for t in doc_tokens:
                    if t[:2] == "##":
                        continue_cnt += 1
                n_words = len(doc_tokens) - continue_cnt
                averager.append(n_words)

    print("average", averager.get_average())
Example #3
def work(dir_path: FilePath):
    q_config_id = Q_CONFIG_ID_DEV_ALL
    print(dir_path)
    for file_path in get_dir_files(dir_path):
        print(file_path)
        insert_ranked_list_from_path(file_path, q_config_id)
Example #4
def enum_dir_records(dir_path):
    file_path_list = get_dir_files(dir_path)

    while True:
        for file_path in file_path_list:
            for item in load_record(file_path):
                yield item
Example #5
def run(in_dir_path, out_dir_path, keyword):
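    # Filter each record file in in_dir_path, keeping (per condition_fn) only
    # instances whose input_ids contain the id of the given keyword token;
    # do_filtering presumably writes survivors to out_dir_path under the same name.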
    exist_or_mkdir(out_dir_path)
    tokenizer = get_tokenizer()
    ids = tokenizer.convert_tokens_to_ids([keyword])
    assert len(ids) == 1
    id_keyword = ids[0]

    def condition_fn(features):
        return id_keyword in take(features['input_ids'])

    inst_cnt = 0

    def debug_call_back(features):
        nonlocal inst_cnt
        if inst_cnt < 4:
            input_tokens = tokenizer.convert_ids_to_tokens(
                take(features['input_ids']))
            print(pretty_tokens(input_tokens))
        inst_cnt += 1

    for file_path in get_dir_files(in_dir_path):
        inst_cnt = 0
        name = os.path.basename(file_path)
        out_path = os.path.join(out_dir_path, name)
        do_filtering(file_path, out_path, condition_fn)
Example #6
def run_dir(in_dir_name: FileName, out_dir_name: FileName):
    in_dir = pjoin(sydney_working_dir, in_dir_name)
    out_dir = pjoin(sydney_working_dir, out_dir_name)
    exist_or_mkdir(out_dir)

    for file_path in get_dir_files(in_dir):
        name = FileName(os.path.basename(file_path))
        out_path = pjoin(out_dir, name)
        convert_to_2way(file_path, out_path)
Example #7
def main():
    dir_path = os.path.join(job_man_dir, "qcknc3_dev_info")
    out_dir_path = os.path.join(job_man_dir, "qcknc3_dev_info_light")
    exist_or_mkdir(out_dir_path)
    for file_path in get_dir_files(FilePath(dir_path)):
        print(file_path)
        if file_path.endswith(".info"):
            out_file_path = os.path.join(out_dir_path,
                                         os.path.basename(file_path))
            drop_tokens(file_path, out_file_path)
Example #8
def load_tokens_for_topic(token_path, topic):
    d = {}
    for path in get_dir_files(token_path):
        if topic.replace(" ", "_") in path:
            data = pickle.load(open(path, "rb"))
            if len(data) < 10000:
                print("{} has {} data".format(path, len(data)))
            d.update(data)
    print("Loaded {} docs for {}".format(len(d), topic))
    return d
Example #9
def main():
    dir_path = os.path.join(data_path, "pc_evi_qck_predict_dev_info")
    out_dir_path = os.path.join(data_path, "pc_evi_qck_predict_dev_info_fixed")
    exist_or_mkdir(out_dir_path)
    for file_path in get_dir_files(FilePath(dir_path)):
        # print(file_path)
        # file_path = "/mnt/nfs/work3/youngwookim/job_man/pc_evi_qck_predict_dev_info/0.info"
        # out_file_path = "/mnt/nfs/work3/youngwookim/job_man/temp_0.info"
        out_file_path = os.path.join(out_dir_path, os.path.basename(file_path))
        modify_and_save(file_path, out_file_path)
Example #10
def load_ranked_list(relevance_list_path):
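    # Load one Galago ranked list per file; each file is assumed to hold a
    # single query, so the ranked list of an arbitrary query is stored under
    # the file name.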
    all_ranked_list = {}
    for file_path in get_dir_files(relevance_list_path):
        file_name = os.path.basename(file_path)
        ranked_list_d = load_galago_ranked_list(file_path)

        queries = ranked_list_d.keys()
        any_query = list(queries)[0]
        ranked_list = ranked_list_d[any_query]
        all_ranked_list[file_name] = ranked_list
    return all_ranked_list
Example #11
def load_combine_info_jsons(dir_path, convert_map, drop_kdp=True) -> Dict:
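    # Merge the .info JSON files under dir_path into one dict; if dir_path is
    # a single file, just load and parse that file.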
    if os.path.isdir(dir_path):
        d = {}
        for file_path in get_dir_files(dir_path):
            if file_path.endswith(".info"):
                j = json.load(open(file_path, "r", encoding="utf-8"))
                parse_info(j, convert_map, drop_kdp)
                d.update(j)
    else:
        d = json.load(open(dir_path, "r"))
        parse_info(d, convert_map, drop_kdp)
    return d
Example #12
def load():
    root = os.path.join(scope_dir, "by_time")
    l_all = []
    for dir_path in get_dir_dir(root):
        print(dir_path)
        for file_path in get_dir_files(dir_path):
            l = load_article_only_short_url(file_path)
            l_all.extend(l)

    print("Total of {} articles ".format(len(l_all)))
    out_path = os.path.join(root, "list.pickle")
    pickle.dump(l_all, open(out_path, "wb"))
Example #13
def load_corpus():
    dir_path = FilePath("/mnt/nfs/work3/youngwookim/data/bert_tf/clueweb12_13B_word_tokens/")

    corpus = []

    cnt = 0
    for file_path in get_dir_files(dir_path):
        tokens_list = load_pickle_from(file_path)
        corpus.extend(tokens_list)
        if cnt > 50:
            break
        cnt += 1
    return corpus
Example #14
def load_combine_info_jsons(dir_path, convert_map):
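    # Collect KDP tokens keyed by (doc_id, passage_idx) across all .info files,
    # printing any key whose tokens differ between files.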
    token_d = {}
    for file_path in get_dir_files(dir_path):
        if file_path.endswith(".info"):
            j = json.load(open(file_path, "r", encoding="utf-8"))
            parse_info(j, convert_map, False)

            for data_id, info in j.items():
                kdp: KDP = info["kdp"]
                key = kdp.doc_id, kdp.passage_idx
                if key in token_d:
                    if str(token_d[key]) != str(kdp.tokens):
                        print(key)
                token_d[key] = kdp.tokens
Example #15
def load_all_docs() -> List[MPQARawDoc]:
    docs = []
    doc_dir_path = os.path.join(root_dir, "docs")
    for parent_dir in get_dir_dir(doc_dir_path):
        parent_name = os.path.basename(parent_dir)
        for doc_leaf_path in get_dir_files(parent_dir):
            file_name = os.path.basename(doc_leaf_path)
            doc_id = parent_name + "/" + file_name
            try:
                content = open(doc_leaf_path, "r", encoding="utf-8").read()
                docs.append(MPQARawDoc(doc_id, content))
            except UnicodeDecodeError:
                print(doc_leaf_path)
                raise
    return docs
Example #16
def summarize_runner(summarizer, out_root):
    dir_root = "/mnt/nfs/scratch1/youngwookim/data/clueweb12_10000_pred_ex"
    for file_path in get_dir_files(dir_root):
        try:
            if "abortion" not in file_path:
                continue
            print(file_path)
            file_name = os.path.basename(file_path)
            obj = pickle.load(open(file_path, "rb"))
            r = summarizer(obj)
            out_path = os.path.join(out_root, file_name)
            pickle.dump(r, open(out_path, "wb"))
        except Exception as e:
            print(e)
Example #17
def enum_docs_and_stance():
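    # Yield (document tokens, stance predictions) pairs for the topic, mapping
    # each per-file prediction index back to a doc_id via the ranked list.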
    topic = "abortion"
    summary_path = "/mnt/nfs/work3/youngwookim/data/stance/clueweb12_10000_pred_ex_summary_w_logit"
    relevance_list_path = "/home/youngwookim/work/ukp/relevant_docs/clueweb12"
    all_tokens = ukp_load_tokens_for_topic(topic)
    all_ranked_list = load_ranked_list(relevance_list_path)
    for file_path in get_dir_files(summary_path):
        if topic not in file_path:
            continue
        file_name = os.path.basename(file_path)
        predictions = pickle.load(open(file_path, "rb"))
        for doc_idx, preds in predictions:
            doc_id, rank, score = all_ranked_list[file_name][doc_idx]
            doc = all_tokens[doc_id]
            yield doc, preds
Example #18
def loss_view(dir_path):
    tokenizer = get_tokenizer()
    html_writer = HtmlVisualizer("ukp_lm_grad_high.html", dark_mode=False)

    for file_path in get_dir_files(dir_path):
        items = pickle.load(open(file_path, "rb"))

        for e in items:
            input_ids, masked_input_ids, masked_lm_example_loss = e
            tokens = mask_resolve_1(
                tokenizer.convert_ids_to_tokens(input_ids),
                tokenizer.convert_ids_to_tokens(masked_input_ids))
            highlight = lmap(is_mask, tokens)

            cells = cells_from_tokens(tokens, highlight)
            html_writer.multirow_print(cells)
Example #19
def load_all_annotations() -> List[Tuple[str, List[MPQAAnnLine]]]:
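    # Walk man_anns/<parent>/<doc>/ and yield (doc_id, accumulated annotation
    # lines) after reading each recognized MPQA annotation file.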
    doc_dir_path = os.path.join(root_dir, "man_anns")
    for parent_dir in get_dir_dir(doc_dir_path):
        parent_name = os.path.basename(parent_dir)
        for doc_leaf_path in get_dir_dir(parent_dir):
            doc_leaf_name = os.path.basename(doc_leaf_path)
            doc_id = parent_name + "/" + doc_leaf_name
            ann_set_list = []
            for ann_file_path in get_dir_files(doc_leaf_path):
                ann_file_type = os.path.basename(ann_file_path)
                assert ann_file_type in [
                    "gateman.mpqa.lre.2.0", "gatesentences.mpqa.2.0",
                    "answer.mpqa.2.0"
                ]
                lines = read_mqpa_anns(ann_file_path)
                ann_set_list.extend(lines)
                yield doc_id, ann_set_list
Example #20
def sample_median():
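    # Sample scores from 10 randomly chosen prediction files, sort them, and
    # print the median.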
    # we don't want one of the (bad/good) splits to have shorter text than the other.
    files = get_dir_files(get_prediction_dir(working_dir))
    random.shuffle(files)

    all_scores = []
    for file_path in files[:10]:
        data = pickle.load(open(file_path, "rb"))
        data = flatten_batches(data)
        t = scorer(data["prob1"], data["prob2"])
        all_scores.extend(t)

    all_scores.sort()
    l = len(all_scores)
    print(l)
    mid = int(l / 2)
    print(all_scores[mid])
Example #21
def load_multiple_ranked_list(dir_path, get_key_from_name):
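    # Load Galago ranked lists from every file in dir_path, group them by the
    # key derived from each file name, and merge each group into one ranked list.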
    files = get_dir_files(dir_path)

    data = []
    for file_path in files:
        name = os.path.basename(file_path)
        ranked_list_d = load_galago_ranked_list(file_path)
        for query, ranked_list in ranked_list_d.items():
            data.append((name, ranked_list))

    new_d = {}
    key_fn = lambda x: get_key_from_name(x[0])
    for key, sub_data in group_by(data, key_fn).items():
        ranked_list = right(sub_data)
        new_d[key] = merge_ranked_list_list(ranked_list)

    return new_d
Example #22
    def estimator_prediction_loader(p, fetch_field_list=None):
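        # Load estimator prediction batches from a single pickle file or from
        # every file in a directory, then flatten them into per-key vectors.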
        if os.path.isdir(p):
            data = []
            for file_path in get_dir_files(p):
                data.extend(pickle.load(open(file_path, "rb")))
        else:
            data = pickle.load(open(p, "rb"))

        if fetch_field_list is None:
            keys = list(data[0].keys())
            vectors = flatten_batches(data)
        else:
            keys = list([k for k in data[0].keys() if k in fetch_field_list])
            vectors = flatten_batches_inner(data, fetch_field_list)

        any_key = keys[0]
        data_len = len(vectors[any_key])

        return vectors, keys, data_len
Example #23
    def __init__(self, select_by_preds):
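        # Collect the doc_ids whose predictions satisfy select_by_preds, mapping
        # prediction indices to doc_ids through the per-file ranked list, and
        # print the reject ratio for each file.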
        summary_path = "/mnt/nfs/work3/youngwookim/data/stance/clueweb12_10000_pred_summary"
        relevance_list_path = "/home/youngwookim/work/ukp/relevant_docs/clueweb12"
        all_ranked_list = load_ranked_list(relevance_list_path)

        self.selected = set()
        for file_path in get_dir_files(summary_path):
            file_name = os.path.basename(file_path)
            predictions = pickle.load(open(file_path, "rb"))
            n_reject = 0
            for doc_idx, preds in predictions:
                doc_id, rank, score = all_ranked_list[file_name][doc_idx]
                assert rank == doc_idx + 1
                if select_by_preds(preds):
                    self.selected.add(doc_id)
                else:
                    n_reject += 1
            print("{} Reject {}".format(file_name,
                                        n_reject / len(predictions)))
Example #24
def count_terms_for_dir(dir_path):
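    # Sum term counts over every file in dir_path; counter keys are
    # space-separated token-id signatures, decoded back to text for printing.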
    def sig_to_terms(sig: str):
        token_ids = [int(t) for t in sig.split(" ")]
        terms = tokenizer.convert_ids_to_tokens(token_ids)
        return "".join(terms)

    counter = Counter()
    file_list = get_dir_files(dir_path)
    ticker = TimeEstimator(len(file_list))
    for file_path in file_list:
        counter.update(count_terms(file_path))
        ticker.tick()

    tokenizer = get_tokenizer()

    for sig, cnt in counter.items():
        term = sig_to_terms(sig)
        print(term, cnt)

    return
Example #25
def sample_median():
    # we don't want one of the (bad/good) splits to have shorter text than the other.
    all_scores = []
    scorer = get_lm_scorer()

    files = get_dir_files(tf_record_dir)
    random.shuffle(files)

    for file_path in files[:10]:
        tfrecord_itr = load_record(file_path)
        ticker = TimeEstimator(1000)
        for idx, inst in enumerate(tfrecord_itr):
            all_scores.append(scorer(inst))
            if idx > 1000:
                break
            ticker.tick()
    all_scores.sort()
    l = len(all_scores)
    print(l)
    mid = int(l / 2)
    print(all_scores[mid])
Example #26
def show(dir_path):
    topic = "abortion"
    tokenizer = get_tokenizer()
    for file_path in get_dir_files(dir_path):
        if topic not in file_path:
            continue
        file_name = os.path.basename(file_path)
        predictions = pickle.load(open(file_path, "rb"))
        for doc in predictions:
            show_doc = False
            for e in doc:
                sout, input_ids = e
                if sout[2] > 0.5:
                    show_doc = True

            if show_doc:
                for e in doc:
                    sout, input_ids = e
                    tokens = tokenizer.convert_ids_to_tokens(input_ids)
                    pred = np.argmax(sout)
                    print(pred, pretty_tokens(tokens, True))
                print("------------")
Example #27
def collect_unique_passage(dir_path):
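    # Gather KDP passages from the .info JSON file(s) under dir_path,
    # de-duplicated by (doc_id, passage_idx).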
    key_set = set()
    unique_passages = []

    def update(j):
        for doc_id, info in j.items():
            kdp = KDP(*info['kdp'])
            key = kdp.doc_id, kdp.passage_idx
            if key not in key_set:
                unique_passages.append(kdp)
                key_set.add(key)

    if os.path.isdir(dir_path):
        d = {}
        for file_path in get_dir_files(dir_path):
            if file_path.endswith(".info"):
                j = json.load(open(file_path, "r", encoding="utf-8"))
                update(j)
    else:
        d = json.load(open(dir_path, "r"))
        update(d)
    return unique_passages
Example #28
def get_dir_all_itr(dir_path):
    for file_path in get_dir_files(dir_path):
        one_itr = load_record_v2(file_path)
        for item in one_itr:
            yield item
Example #29
def load_all_comments(dir_path):
    for comment_path in get_dir_files(dir_path):
        yield parse_comment.parse_comments(comment_path)
Example #30
    def iter_gz_files_for_group(self, group_name):
        dir_to_iter = os.path.join(self.root_dir,
                                   group_name_to_subdir_path(group_name))
        file_list = get_dir_files(FilePath(dir_to_iter))
        file_list.sort()
        return file_list