Example #1
def augment(short_records,
            long_records,
            target_len,
            save_dir,
            start_record_idx=0):
    exist_or_mkdir(save_dir)
    record_idx = start_record_idx
    print("record_idx", record_idx)

    def get_next_writer():
        return RecordWriterWrap(os.path.join(save_dir, str(record_idx)))

    writer = get_next_writer()
    cnt = 0
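    # Interleave one short and one long record per pass until target_len
    # instances are written, sharding the output every 100,000 records.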
    while cnt < target_len:
        first_inst = next(short_records)
        second_inst = next(long_records)

        first_inst = feature_to_ordered_dict(first_inst)
        first_inst["next_sentence_labels"] = create_int_feature([1])
        second_inst = feature_to_ordered_dict(second_inst)
        second_inst["next_sentence_labels"] = create_int_feature([1])

        writer.write_feature(first_inst)
        writer.write_feature(second_inst)
        cnt += 2
        # Roll over to a new shard file every 100,000 records.
        if writer.total_written >= 100000:
            record_idx += 1
            print("Wrote {} instances so far".format(cnt))
            writer.close()
            writer = get_next_writer()

    # Close the final (possibly partial) shard so its records are flushed.
    writer.close()
Example #2
def simple():
    tf_logging.setLevel(logging.INFO)
    out_path = os.path.join(working_path, "dict_reader3")
    exist_or_mkdir(out_path)
    worker = DGenWorker(out_path)
    worker.gen.f_hide_word = False
    worker.work(1)
Example #3
def split_train_to_tdev():
    sequence_length = 300
    data_loader = get_modified_nli_data_loader(sequence_length)
    file = data_loader.train_file

    dir_path = os.path.join(output_path,
                            "nli_tfrecord_t_{}".format(sequence_length))
    exist_or_mkdir(dir_path)

    itr = data_loader.example_generator(file)
    all_inst = []
    for e in itr:
        f = entry_to_feature_dict(e)
        all_inst.append(f)

    random.shuffle(all_inst)

    tdev_size = 9000
    train_t = all_inst[:-tdev_size]
    dev_t = all_inst[-tdev_size:]
    assert len(train_t) + len(dev_t) == len(all_inst)

    def save(name, data):
        output_file = os.path.join(dir_path, name)
        writer = write_features_to_file(data, output_file)
        print("%s: Wrote %d total instances" % (name, writer.total_written))

    save("train_t", train_t)
    save("dev_t", dev_t)
Example #4
def add_jobs(sh_format_path, model_sub_path, save_dir, job_group_name, job_list):
    save_path_list = []
    exist_or_mkdir(save_dir)
    job_id_list = []
    job_info_list: List[Dict] = []
    for i in job_list:
        save_path = os.path.join(save_dir, str(i))
        run_name = "{}-{}".format(job_group_name, i)
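        # Placeholder substitutions, presumably applied by run_job to the
        # shell-script template at sh_format_path.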
        d = {
            "$model_subpath": model_sub_path,
            "$run_name": run_name,
            "$i": str(i),
            "$save_path": save_path
        }
        job_id = run_job(sh_format_path, d)
        job_id_list.append(job_id)
        save_path_list.append(save_path)
        job_info = {
            'job_id': job_id,
            'save_path': save_path,
            'data_no': i,
        }
        job_info_list.append(job_info)

    return job_info_list
Example #5
def work():
    ranked_list_save_root = get_ranked_list_save_dir(Q_CONFIG_ID_BM25_UKP)
    exist_or_mkdir(ranked_list_save_root)
    query_files = get_all_query_file_names(Q_CONFIG_ID_BM25_UKP)
    query_to_all_clueweb_disk.send(query_files, index_name_list[:1],
                                   "ukp_{}".format(Q_CONFIG_ID_BM25_UKP),
                                   ranked_list_save_root)
Example #6
def make_cppnc_dummy_problem(claims: List[Dict], candidate_perspectives,
                             save_name: str, encode_inner_fn) -> None:

    empty_passage = {'passage': []}

    def get_payload() -> Iterable[Tuple[int, int, List[Dict]]]:
        for cid, candidates in candidate_perspectives.items():
            for candi in candidates:
                yield cid, candi['pid'], [empty_passage]

    tokenizer = get_tokenizer()
    data_id_man = DataIDManager()

    payloads: Iterable[PayloadAsTokens] = put_texts(get_payload(), claims,
                                                    tokenizer, data_id_man)
    max_seq_length = 512

    def encode_fn(r: PayloadAsTokens):
        return encode_inner_fn(max_seq_length, tokenizer, r)

    out_dir = os.path.join(output_path, "cppnc")
    exist_or_mkdir(out_dir)
    save_path = os.path.join(out_dir, save_name + ".tfrecord")
    write_records_w_encode_fn(save_path, encode_fn, payloads)
    info_save_path = os.path.join(out_dir, save_name + ".info")
    print("Payload size : ", len(data_id_man.id_to_info))

    with open(info_save_path, "w") as f:
        json.dump(data_id_man.id_to_info, f)
    print("tfrecord saved at :", save_path)
    print("info saved at :", info_save_path)
Example #7
    def __init__(self):
        print("AAA")
        logger.debug("Executer init")
        # save/log current jobs, so that it can restart.
        self.task_info_dir = os.path.join(root_dir, "task_info")
        self.root_info_dir = os.path.join(root_dir, "root_info")
        exist_or_mkdir(self.task_info_dir)
        exist_or_mkdir(self.root_info_dir)

        # load task info for all active / queued task
        self.active_task_list = TaskList(
            os.path.join(self.root_info_dir, "active_task.json"),
            self.task_info_dir)
        self.queued_task_list = TaskList(
            os.path.join(self.root_info_dir, "queued_task.json"),
            self.task_info_dir)
        self.info_dict = JsonTiedDict(
            os.path.join(self.root_info_dir, "info.json"))

        tpu_info_path = os.path.join(self.root_info_dir, "tpu_info.json")
        self.tpu_resource = ResourceList(tpu_info_path, ["v2-tf2", "v2-tf2-2"])
        self.current_task_handles = {}  # task_id -> process object
        # A task_id in current_task_handles does NOT imply the task is active;
        # handles are never deleted.
        self.task_cache = {}  # task_id -> TaskObj
        self._init_info()
Example #8
def gen_tfrecord_w_tdev():
    max_sequence = 300
    dir_path = os.path.join(data_path, "ukp_tdev_{}".format(max_sequence))
    exist_or_mkdir(dir_path)
    for topic in all_topics:
        data_loader = modify_data_loader(
            BertDataLoader(topic, True, max_sequence, "bert_voca.txt",
                           "only_topic_word"))
        todo = [("dev", data_loader.get_dev_data())]

        train_data = list(data_loader.get_train_data())

        random.shuffle(train_data)
        validation_size = int(len(train_data) * 0.1)

        # Hold out the last validation_size entries; the two slices are disjoint.
        train_train_data = train_data[:-validation_size]
        train_dev_data = train_data[-validation_size:]
        todo.append(("ttrain", train_train_data))
        todo.append(("tdev", train_dev_data))

        for name, data in todo[::-1]:
            features = lmap(entry_to_feature_dict, data)
            out_name = "{}_{}".format(name, topic)
            out_path = os.path.join(dir_path, out_name)
            write_features_to_file(features, out_path)
Example #9
def gen_tf_record():
    sequence_length = 300
    data_loader = get_biobert_nli_data_loader(sequence_length)
    todo = [("train", [data_loader.train_file]),
            ("dev", [data_loader.dev_file])]
    batch_size = 32
    dir_path = os.path.join(output_path,
                            "biobert_mnli_{}".format(sequence_length))
    exist_or_mkdir(dir_path)

    for name, files in todo[::-1]:
        output_file = os.path.join(dir_path, name)
        writer = RecordWriterWrap(output_file)
        for file in files:
            for e in data_loader.example_generator(file):
                f = entry_to_feature_dict(e)
                f["is_real_example"] = create_int_feature([1])
                writer.write_feature(f)

        if name == "dev":
            while writer.total_written % batch_size != 0:
                f["is_real_example"] = create_int_feature([0])
                writer.write_feature(f)

        writer.close()

        print("Wrote %d total instances" % writer.total_written)
Example #10
def write_topic_sentence_as_query():
    query_collection_id = Q_CONFIG_ID_BM25_UKP

    dp_id_to_q_res_id = {}

    def dp_to_query(dp: UkpDataPoint) -> DocQuery:
        topic_tokens = clean_tokenize_str_to_tokens(dp.topic)
        sent_tokens = clean_tokenize_str_to_tokens(dp.sentence)
        qid = str(dp.id)
        dp_id_to_q_res_id[str(dp.id)] = "{}_{}".format(qid, query_collection_id)
        return format_query(topic_tokens, sent_tokens, qid, 3)

    train_data, val_data = load_all_data()

    def all_data_iterator() -> Iterator[UkpDataPoint]:
        for data_list in chain(train_data.values(), val_data.values()):
            for dp in data_list:
                yield dp

    all_queries: List[DocQuery] = lmap(dp_to_query, all_data_iterator())

    out_dir = get_query_dir(query_collection_id)
    exist_or_mkdir(out_dir)

    n_query_per_file = 50
    # Presumably the queries are written out in batches here, following the
    # same pattern as write_claim_perspective_pair_as_query (Example #12).
    write_queries_to_files(n_query_per_file, out_dir, all_queries)
    save_to_pickle(dp_id_to_q_res_id, "ukp_10_dp_id_to_q_res_id")
Example #11
def gen_pairwise():
    max_sequence = 300
    dir_path = os.path.join(data_path, "ukp_pairwise_{}".format(max_sequence))
    exist_or_mkdir(dir_path)

    for topic in all_topics:
        data_loader = modify_data_loader(
            BertDataLoader(topic, True, max_sequence, "bert_voca.txt",
                           "only_topic_word"))
        todo = [("train", data_loader.get_train_data()),
                ("dev", data_loader.get_dev_data())]
        for name, data in todo[::-1]:
            out_name = "{}_{}".format(name, topic)
            out_path = os.path.join(dir_path, out_name)

            grouped = [[], [], []]
            for e in data:
                input_ids, input_mask, segment_ids, label = e
                grouped[label].append(e)

            combs = []
            combs.extend(generate_pairwise_combinations(
                grouped[0], grouped[1]))
            combs.extend(generate_pairwise_combinations(
                grouped[1], grouped[2]))
            combs.extend(generate_pairwise_combinations(
                grouped[2], grouped[0]))
            features = lmap(pairwise_entry_to_feature_dict, combs)
            write_features_to_file(features, out_path)
Example #12
def write_claim_perspective_pair_as_query():
    split = "dev"
    assert split in ["train", "dev", "test"]

    d_ids = list({
        "train": load_train_claim_ids(),
        "dev": load_dev_claim_ids(),
        "test": load_test_claim_ids()
    }[split])
    claims = get_claims_from_ids(d_ids)
    print(len(claims), " claims")
    is_train = split == "train"
    all_data_points = get_candidates(claims, is_train)
    k = 0

    def get_query_entry_from_data_point(x: PerspectiveCandidate) -> DocQuery:
        tokens = clean_tokenize_str_to_tokens(x.claim_text + " " + x.p_text)
        qid = "{}_{}".format(x.cid, x.pid)
        return format_query_bm25(qid, tokens, k)

    queries = lmap(get_query_entry_from_data_point, all_data_points)

    out_dir = query_dir_format.format(split)
    exist_or_mkdir(out_dir)
    n_query_per_file = 50

    write_queries_to_files(n_query_per_file, out_dir, queries)
Example #13
def run(in_dir_path, out_dir_path, keyword):
    exist_or_mkdir(out_dir_path)
    tokenizer = get_tokenizer()
    ids = tokenizer.convert_tokens_to_ids([keyword])
    assert len(ids) == 1
    id_keyword = ids[0]

    def condition_fn(features):
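        # True iff the keyword's token id appears among the record's input_ids.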
        return id_keyword in take(features['input_ids'])

    inst_cnt = 0

    def debug_call_back(features):
        nonlocal inst_cnt
        if inst_cnt < 4:
            input_tokens = tokenizer.convert_ids_to_tokens(
                take(features['input_ids']))
            print(pretty_tokens(input_tokens))
        inst_cnt += 1

    for file_path in get_dir_files(in_dir_path):
        inst_cnt = 0
        name = os.path.basename(file_path)
        out_path = os.path.join(out_dir_path, name)
        do_filtering(file_path, out_path, condition_fn)
Example #14
def run(args):
    data_name = args.data_name
    method_name = args.method_name
    score_name = "{}_{}".format(data_name, method_name)
    config = DropStop
    save_name = "{}_{}.txt".format(score_name, config.name)
    save_dir = os.path.join(output_path, "genex", "runs")
    exist_or_mkdir(os.path.join(output_path, "genex"))
    exist_or_mkdir(save_dir)
    save_path = os.path.join(save_dir, save_name)
    data: List[PackedInstance] = load_packed(data_name)

    if method_name == "random":
        config = RandomConfig
        # Draw an independent random score vector per instance
        # (list multiplication would alias a single array across all instances).
        scores: List[np.ndarray] = [np.random.random([512]) for _ in data]
    else:
        scores: List[np.ndarray] = load_from_pickle(score_name)

    if "term_" in method_name:
        save_score_to_file_term_level(data, config, save_path, scores)
    else:
        save_score_to_file(data, config, save_path, scores)
Example #15
def submit_jobs_inner(claim_ids, split_name):
    claims = get_claims_from_ids(claim_ids)
    queries = get_claims_query(claims)
    out_root = "/mnt/nfs/work3/youngwookim/data/perspective/{}_claim_rm3".format(
        split_name)
    exist_or_mkdir(out_root)
    submit_rm_jobs(queries, out_root)
Example #16
def make_cppnc_problem(passage_score_path: FilePath, data_id_to_info: Dict,
                       claims: List[Dict], candidate_perspectives, config,
                       save_name: str, encode_inner_fn) -> None:
    output: List[Tuple[int, List[Dict]]] = collect_good_passages(
        data_id_to_info, passage_score_path, config)
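    # Pair each selected passage set with its candidate perspectives.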
    joined_payloads: List = list(
        join_perspective(output, candidate_perspectives))
    tokenizer = get_tokenizer()
    data_id_man = DataIDManager()

    payloads: Iterable[PayloadAsTokens] = put_texts(joined_payloads, claims,
                                                    tokenizer, data_id_man)
    max_seq_length = 512

    def encode_fn(r: PayloadAsTokens):
        return encode_inner_fn(max_seq_length, tokenizer, r)

    out_dir = os.path.join(output_path, "cppnc")
    exist_or_mkdir(out_dir)
    save_path = os.path.join(out_dir, save_name + ".tfrecord")
    write_records_w_encode_fn(save_path, encode_fn, payloads)
    info_save_path = os.path.join(out_dir, save_name + ".info")
    print("Payload size : ", len(data_id_man.id_to_info))

    with open(info_save_path, "w") as f:
        json.dump(data_id_man.id_to_info, f)
    print("tfrecord saved at :", save_path)
    print("info saved at :", info_save_path)
Example #17
def generate_qc3():
    is_correct_fn = get_is_correct_fn()
    save_dir = os.path.join(output_path, "pc_evidence_qc3")
    exist_or_mkdir(save_dir)
    for split in splits:
        candidate_dict: Dict[str, List[QCKCandidateI]] = get_candidate(split)
        do_generate_jobs(candidate_dict, is_correct_fn, save_dir, split)
Example #18
    def __init__(self, max_seq_length,
                 out_dir):
        self.query_group: List[List[QueryID]] = load_query_group("train")
        self.seg_resource_loader = SegmentResourceLoader(job_man_dir, "train")
        self.max_seq_length = max_seq_length
        self.out_dir = out_dir
        self.info_dir = self.out_dir + "_info"
        exist_or_mkdir(self.info_dir)
Example #19
def init_worker():
    out_path1 = os.path.join(working_path, "entry_prediction_tf")
    out_path2 = os.path.join(working_path, "entry_prediction_n")
    exist_or_mkdir(out_path1)
    exist_or_mkdir(out_path2)

    worker = Worker(out_path1, out_path2)
    return worker
Example #20
def main():
    save_dir = os.path.join(output_path, "pc_qc")
    exist_or_mkdir(save_dir)
    for split in splits:
        queries = get_qck_queries(split)
        eval_candidate = get_eval_candidates_as_qck(split)
        save_path = os.path.join(save_dir, split)
        make_pc_qc(queries, eval_candidate, is_correct_factory(), save_path)
Example #21
def main():
    exist_or_mkdir(os.path.join(output_path, "aawd_tfrecord"))
    train, dev, test = load_aawd_splits()
    todo = [(train, "train"), (dev, "dev"), (test, "test")]
    encode_fn = get_encode_fn(256)
    for data, split in todo:
        save_path = at_output_dir("aawd_tfrecord", split)
        write_records_w_encode_fn(save_path, encode_fn, data)
Example #22
def main():
    job_name = "MMD_train_single_seg"
    out_dir = os.path.join(job_man_dir, job_name)
    exist_or_mkdir(out_dir)
    worker = SingleSegTrainGen(512, out_dir)
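    # Starting at job 178 presumably resumes a previously interrupted run.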
    for job_id in range(178, train_query_group_len):
        print("job_id:", job_id)
        worker.work(job_id)
Example #23
def main():
    save_name = sys.argv[1]
    out_dir = os.path.join(output_path, "cppnc")
    exist_or_mkdir(out_dir)
    info_file_path = os.path.join(out_dir, sys.argv[2])
    pred_file_path = os.path.join(out_dir, save_name + ".score")
    score_d = summarize_score(info_file_path, pred_file_path)
    save_to_pickle(score_d, "score_d")
    print("Saved as 'score_d'")
Example #24
def do_nli():
    model_1_path = "gs://clovertpu/training/model/nli_bert_freeze_D/model.ckpt-73615"
    model_2_path = 'gs://clover_eu4/model/alt_emb_L/model.ckpt-20000'
    save_dir = os.path.join(output_path, "nli_from_alt_emb_L")
    exist_or_mkdir(save_dir)
    save_path = os.path.join(save_dir, "model.ckpt-0")
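    # Per the helper's name, merge the two checkpoints' variables into a single
    # checkpoint locally, then push the result back to GCS.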
    download_and_combine(model_1_path, model_2_path, save_path)
    upload_gs_dir = "gs://clover_eu4/model/nli_from_alt_emb_L"
    upload_to_gs(save_path, upload_gs_dir)
Example #25
def save_per_cid():
    print("Loading scores...")
    cid_grouped: Dict[str, Dict[str, List[Dict]]] = load_cppnc_score_wrap()
    save_root = os.path.join(output_path, "cppnc", "cid_grouped")
    exist_or_mkdir(save_root)

    for cid, entries in cid_grouped.items():
        save_path = os.path.join(save_root, cid)
        with open(save_path, "wb") as f:
            pickle.dump(entries, f)
Example #26
def ukp():
    model_1_path = "gs://clovertpu/training/model/ukp_runs/ukp_8_bert_/model.ckpt-1939"
    model_2_path = 'gs://clover_eu4/model/alt_emb_O_ukp/model.ckpt-20000'
    save_dir = os.path.join(output_path, "ukp_from_alt_emb_O")
    exist_or_mkdir(save_dir)
    save_path = os.path.join(save_dir, "model.ckpt-0")
    download_and_combine(model_1_path, model_2_path, save_path)
    upload_gs_dir = "gs://clovertpu/training/model/ukp_from_alt_emb_O"
    upload_to_gs(save_path, upload_gs_dir)
Example #27
def ehealth_K():
    model_1_path = "gs://clovertpu/training/model/ehealth_bert_freeze/model.ckpt-10000"
    model_2_path = 'gs://clover_eu4/model/alt_emb_K/model.ckpt-20000'
    save_dir = os.path.join(output_path, "ehealth_K")
    exist_or_mkdir(save_dir)
    save_path = os.path.join(save_dir, "model.ckpt-0")
    download_and_combine(model_1_path, model_2_path, save_path)
    upload_gs_dir = "gs://clover_eu4/model/ehealth_combined_K"
    upload_to_gs(save_path, upload_gs_dir)
Example #28
def main():
    save_name = sys.argv[1]
    out_dir = os.path.join(output_path, "cppnc")
    exist_or_mkdir(out_dir)
    info_file_path = os.path.join(out_dir, save_name + ".info")
    pred_file_path = os.path.join(out_dir, save_name + ".score")
    score_d = summarize_score(info_file_path, pred_file_path)
    map_score = eval_map("dev", score_d, False)
    print(map_score)
Example #29
def run_dir(in_dir_name: FileName, out_dir_name: FileName):
    in_dir = pjoin(sydney_working_dir, in_dir_name)
    out_dir = pjoin(sydney_working_dir, out_dir_name)
    exist_or_mkdir(out_dir)
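    # convert_to_2way presumably collapses each file's 3-way labels to 2-way,
    # writing a same-named file into out_dir.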

    for file_path in get_dir_files(in_dir):
        name = FileName(os.path.basename(file_path))
        out_path = pjoin(out_dir, name)
        convert_to_2way(file_path, out_path)
Example #30
def generate_qc_bert4():
    is_correct_fn = get_is_correct_fn()
    save_dir = os.path.join(output_path, "pc_evidence_qc4")
    exist_or_mkdir(save_dir)
    for split in splits:
        candidate_dict: Dict[
            str,
            List[QCKCandidateI]] = get_ex_candidate_for_training(split, False)
        do_generate_jobs(candidate_dict, is_correct_fn, save_dir, split)