Example 1
def make_cppnc_problem(passage_score_path: FilePath, data_id_to_info: Dict,
                       claims: List[Dict], candidate_perspectives, config,
                       save_name: str, encode_inner_fn) -> None:
    output: List[Tuple[int, List[Dict]]] = collect_good_passages(
        data_id_to_info, passage_score_path, config)
    joined_payloads: List = list(
        join_perspective(output, candidate_perspectives))
    tokenizer = get_tokenizer()
    data_id_man = DataIDManager()

    payloads: Iterable[PayloadAsTokens] = put_texts(joined_payloads, claims,
                                                    tokenizer, data_id_man)
    max_seq_length = 512

    def encode_fn(r: PayloadAsTokens):
        return encode_inner_fn(max_seq_length, tokenizer, r)

    out_dir = os.path.join(output_path, "cppnc")
    exist_or_mkdir(out_dir)
    save_path = os.path.join(out_dir, save_name + ".tfrecord")
    write_records_w_encode_fn(save_path, encode_fn, payloads)
    info_save_path = os.path.join(out_dir, save_name + ".info")
    print("Payload size : ", len(data_id_man.id_to_info))

    json.dump(data_id_man.id_to_info, open(info_save_path, "w"))
    print("tfrecord saved at :", save_path)
    print("info saved at :", info_save_path)
Example 2
def write_qck_as_tfrecord(save_path, payloads: Iterable[QCKCompactEntry]):
    data_id_man = DataIDManager(0, 1000 * 1000)

    tokenizer = get_tokenizer()
    cache_tokenizer = CachedTokenizer(tokenizer)
    max_seq_length = 512

    def encode_fn(e: QCKCompactEntry) -> OrderedDict:
        query, candidate, qk_out_entry = e
        candidate: QCKCandidate = candidate
        info = {
            'query': query,
            'candidate': candidate,
            'kdp': qk_out_entry.kdp
        }

        p = PayloadAsTokens(passage=qk_out_entry.passage_tokens,
                            text1=cache_tokenizer.tokenize(query.text),
                            text2=cache_tokenizer.tokenize(candidate.text),
                            data_id=data_id_man.assign(info),
                            is_correct=0
                            )
        return encode_two_inputs(max_seq_length, tokenizer, p)

    write_records_w_encode_fn(save_path, encode_fn, payloads)
    return data_id_man
Example 3
    def generate_selected_training_data_ablation_only_pos(info, key, max_seq_length, save_dir, score_dir):
        data_id_manager = DataIDManager(0, 1000000)
        out_path = os.path.join(save_dir, str(key))
        pred_path = os.path.join(score_dir, str(key))
        tprint("data gen")
        itr = enum_best_segments(pred_path, info)
        insts = []
        for selected_entry in itr:
            selected = decompress_seg_ids_entry(selected_entry)
            assert len(selected['input_ids']) == len(selected['seg_ids'])

            selected['input_ids'] = pad0(selected['input_ids'], max_seq_length)
            selected['seg_ids'] = pad0(selected['seg_ids'], max_seq_length)
            # data_id = data_id_manager.assign(selected_segment.to_info_d())
            data_id = 0
            ci = InstAsInputIds(
                selected['input_ids'],
                selected['seg_ids'],
                selected['label'],
                data_id)
            insts.append(ci)

        def encode_fn(inst: InstAsInputIds) -> collections.OrderedDict:
            return encode_inst_as_input_ids(max_seq_length, inst)

        tprint("writing")
        write_records_w_encode_fn(out_path, encode_fn, insts, len(insts))
        save_info(save_dir, data_id_manager, str(key) + ".info")
Example 4
def write_to_file(output_path, g2: Iterable[Tuple[int, Tuple[Vectors, Label]]],
                  max_entries):
    def encode(e: Tuple[int, Tuple[Vectors, Label]]) -> OrderedDict:
        data_id, (vector, label) = e
        features = OrderedDict()
        features['label_ids'] = create_int_feature([label])
        features['data_id'] = create_int_feature([data_id])

        # Stack per-entry vectors into shape [n_entries, seq_len, hidden_unit].
        vector = np.stack(vector, axis=0)
        vector = vector[:max_entries]
        vector_len, seq_len, hidden_unit = np.shape(vector)
        valid_mask = np.ones([vector_len, seq_len, 1], np.int64)
        if len(vector) < max_entries:
            pad_len = max_entries - len(vector)
            vector = np.concatenate(
                [vector, np.zeros([pad_len, seq_len, hidden_unit])], axis=0)
            valid_mask = np.concatenate(
                [valid_mask,
                 np.zeros([pad_len, seq_len, 1], np.int64)], axis=0)

        v = np.reshape(vector, [-1])  # [max_entries * seq_len * hidden_unit]
        valid_mask = np.reshape(valid_mask, [-1])  # [max_entries * seq_len]
        features['vectors'] = create_float_feature(v)
        features['valid_mask'] = create_int_feature(valid_mask)
        return features

    write_records_w_encode_fn(output_path, encode, g2)
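
Example 4 builds its feature dict by hand from create_int_feature and create_float_feature. For reference, here is a minimal sketch of what such helpers usually look like, assuming the common tf.train.Feature wrappers; the actual definitions in this codebase may differ.

import tensorflow as tf


def create_int_feature(values):
    # Wrap an iterable of ints as an int64 feature (assumed implementation).
    return tf.train.Feature(int64_list=tf.train.Int64List(value=list(values)))


def create_float_feature(values):
    # Wrap an iterable of floats as a float feature (assumed implementation).
    return tf.train.Feature(float_list=tf.train.FloatList(value=list(values)))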
Example 5
def write_records(records: List[PayloadAsTokens], max_seq_length, output_path):
    tokenizer = get_tokenizer()

    def encode(inst: PayloadAsTokens) -> OrderedDict:
        return encode_two_inputs(max_seq_length, tokenizer, inst)

    write_records_w_encode_fn(output_path, encode, records)
Example 6
def make_training_data(config):
    pos_doc_list_path = config['doc_list_path']
    q_res_path = config['q_res_path']
    save_path = config['save_path']
    balance_test = config['balance_test']

    max_seq_length = 512

    pos_doc_ids = set(
        [l.strip() for l in open(pos_doc_list_path, "r").readlines()])
    doc_ids_unique = get_doc_ids_from_ranked_list_path(q_res_path)

    insts = generate(list(pos_doc_ids), list(doc_ids_unique), max_seq_length)

    train_size = int(0.9 * len(insts))
    train_insts = insts[:train_size]
    val_insts = insts[train_size:]

    val_pos_insts = [i for i in val_insts if i.label == 1]
    val_neg_insts = [i for i in val_insts if not i.label]
    print("num pos inst in val", len(val_pos_insts))
    if balance_test:
        val_neg_insts = val_neg_insts[:len(val_pos_insts)]
    val_insts = val_pos_insts + val_neg_insts

    tokenizer = get_tokenizer()

    def encode_fn(inst: Instance) -> OrderedDict:
        return encode_w_data_id(tokenizer, max_seq_length, inst)

    write_records_w_encode_fn(save_path + "train", encode_fn, train_insts)
    write_records_w_encode_fn(save_path + "val", encode_fn, val_insts)
Example 7
    def make_tfrecord(source_name, target_name):
        source_data = data_d[source_name]
        target_data = data_d[target_name]
        combined_data = combine_source_and_target(source_data, target_data, 1)
        save_path = at_output_dir(
            dir_name, "{}_to_{}_train".format(source_name, target_name))
        write_records_w_encode_fn(save_path, encode_fn, combined_data)
Example 8
def make_cppnc_dummy_problem(claims: List[Dict], candidate_perspectives,
                             save_name: str, encode_inner_fn) -> None:

    empty_passage = {'passage': []}

    def get_payload() -> Iterable[Tuple[int, int, List[Dict]]]:
        for cid, candidates in candidate_perspectives.items():
            for candi in candidates:
                yield cid, candi['pid'], [empty_passage]

    tokenizer = get_tokenizer()
    data_id_man = DataIDManager()

    payloads: Iterable[PayloadAsTokens] = put_texts(get_payload(), claims,
                                                    tokenizer, data_id_man)
    max_seq_length = 512

    def encode_fn(r: PayloadAsTokens):
        return encode_inner_fn(max_seq_length, tokenizer, r)

    out_dir = os.path.join(output_path, "cppnc")
    exist_or_mkdir(out_dir)
    save_path = os.path.join(out_dir, save_name + ".tfrecord")
    write_records_w_encode_fn(save_path, encode_fn, payloads)
    info_save_path = os.path.join(out_dir, save_name + ".info")
    print("Payload size : ", len(data_id_man.id_to_info))

    json.dump(data_id_man.id_to_info, open(info_save_path, "w"))
    print("tfrecord saved at :", save_path)
    print("info saved at :", info_save_path)
Example 9
def do_generate_jobs(candidate_dict, is_correct_fn, save_dir, split):
    queries = get_qck_queries(split)
    generator = QCInstanceGenerator(candidate_dict, is_correct_fn)
    data_id_manager = DataIDManager()
    insts = generator.generate(queries, data_id_manager)
    save_path = os.path.join(save_dir, split)
    write_records_w_encode_fn(save_path, generator.encode_fn, insts)
    json.dump(data_id_manager.id_to_info, open(save_path + ".info", "w"))
Example 10
def write_records(records: List[Payload], max_seq_length, output_path):
    tokenizer = get_tokenizer()

    def encode(inst: Payload) -> OrderedDict:
        inst_2 = convert_sub_token(tokenizer, inst)
        return encode_inner(max_seq_length, tokenizer, inst_2)

    write_records_w_encode_fn(output_path, encode, records)
Example 11
def main():
    raw_payload: List[ClaimPassages] = load_dev_payload()
    save_path = os.path.join(output_path, "pc_dev_passage_payload")
    encode = get_encode_fn(512)
    data_id_manage = DataIDManager()
    insts = list(generate_instances(raw_payload, data_id_manage))
    write_records_w_encode_fn(save_path, encode, insts, len(insts))
    save_to_pickle(data_id_manage.id_to_info, "pc_dev_passage_payload_info")
Example 12
def main():
    exist_or_mkdir(os.path.join(output_path, "aawd_tfrecord"))
    train, dev, test = load_aawd_splits()
    todo = [(train, "train"), (dev, "dev"), (test, "test")]
    encode_fn = get_encode_fn(256)
    for data, split in todo:
        save_path = at_output_dir("aawd_tfrecord", split)
        write_records_w_encode_fn(save_path, encode_fn, data)
Example 13
    def write(self, insts: List[ClassificationInstanceWDataID], out_path: str):
        def encode_fn(
                inst: ClassificationInstanceWDataID
        ) -> collections.OrderedDict:
            return encode_classification_instance_w_data_id(
                self.tokenizer, self.max_seq_length, inst)

        write_records_w_encode_fn(out_path, encode_fn, insts, len(insts))
Example 14
def write_with_classification_instance_with_id(
        tokenizer, max_seq_length,
        insts: Iterable[ClassificationInstanceWDataID], out_path: str):
    def encode_fn(
            inst: ClassificationInstanceWDataID) -> collections.OrderedDict:
        return encode_classification_instance_w_data_id(
            tokenizer, max_seq_length, inst)

    write_records_w_encode_fn(out_path, encode_fn, insts)
Example 15
def make_pc_qc(queries: Iterable[QCKQuery],
               eval_candidate: Dict[str, List[QCKCandidate]], is_correct_fn,
               save_path: str):
    generator = QCInstanceGenerator(eval_candidate, is_correct_fn)
    data_id_manager = DataIDManager(0, 10000 * 10000)
    insts = generator.generate(queries, data_id_manager)
    insts = list(insts)
    write_records_w_encode_fn(save_path, generator.encode_fn, insts)
    json.dump(data_id_manager.id_to_info, open(save_path + ".info", "w"))
Example 16
def generate_and_write(file_name, generate_fn, tokenizer):
    data_id_man = DataIDManager()
    inst_list = generate_fn(data_id_man)
    max_seq_length = 300
    save_path = at_output_dir("alamri_tfrecord", file_name)
    encode_fn = get_encode_fn(max_seq_length, tokenizer)
    write_records_w_encode_fn(save_path, encode_fn, inst_list)
    info_save_path = at_output_dir("alamri_tfrecord", file_name + ".info")
    json.dump(data_id_man.id_to_info, open(info_save_path, "w"))
Example 17
def binary_gen():
    exist_or_mkdir(os.path.join(output_path, "argu_ana_tfrecord"))
    train_x, train_y, dev_x, dev_y = get_argu_pointwise_data()
    train = zip(train_x, train_y)
    dev = zip(dev_x, dev_y)
    todo = [(train, "train"), (dev, "dev")]
    encode_fn = get_encode_fn(512)
    for data, split in todo:
        save_path = at_output_dir("argu_ana_tfrecord", split)
        write_records_w_encode_fn(save_path, encode_fn, data)
Example 18
def write_qc_records(output_path, qc_records):
    data_id_man = DataIDManager()
    instances = collect_info_transform(qc_records, data_id_man)
    tokenizer = get_tokenizer()
    max_seq_length = 512

    def encode_fn(inst: QCInstance):
        return encode(tokenizer, max_seq_length, inst)

    write_records_w_encode_fn(output_path, encode_fn, instances)
    json.dump(data_id_man.id_to_info, open(output_path + ".info", "w"))
Example 19
def make_and_write(split):
    docs = load_for_split(split)
    data: List[Tuple[str, bool]] = lflatten(lmap(get_inst_from_doc, docs))
    max_seq_length = 512
    random.shuffle(data)
    dir_path = os.path.join(output_path, "mpqa")
    tokenizer = get_tokenizer()

    def encode_fn(t: Tuple[str, bool]) -> OrderedDict:
        return encode(tokenizer, max_seq_length, t)
    exist_or_mkdir(dir_path)
    save_path = os.path.join(dir_path, split)
    write_records_w_encode_fn(save_path, encode_fn, data)
Example 20
def main():
    data_id_man = DataIDManager()
    q_res_path = sys.argv[1]
    save_path = sys.argv[2]
    max_seq_length = 512
    tokenizer = get_tokenizer()
    insts = sentence_payload_gen(q_res_path, 100, data_id_man)

    def encode_fn(t: Tuple[str, bool, int]) -> OrderedDict:
        return encode_w_data_id(tokenizer, max_seq_length, t)

    write_records_w_encode_fn(save_path, encode_fn, insts)
    json_save_path = save_path + ".info"
    json.dump(data_id_man.id_to_info, open(json_save_path, "w"))
Example 21
    def make_tfrecord(self, job_id: int):
        save_path = os.path.join(self.request_dir, str(job_id))
        kdp_list = pickle.load(open(save_path, "rb"))
        data_id_manager = DataIDManager(0, 1000 * 1000)
        print("{} kdp".format(len(kdp_list)))
        insts = self.qck_generator.generate(kdp_list, data_id_manager)
        record_save_path = os.path.join(self.tf_record_dir, str(job_id))
        write_records_w_encode_fn(record_save_path,
                                  self.qck_generator.encode_fn, insts)
        # Save for backup
        info_save_path = os.path.join(self.tf_record_dir,
                                      "{}.info".format(job_id))
        pickle.dump(data_id_manager.id_to_info, open(info_save_path, "wb"))
        # launch estimator
        add_estimator_job(job_id)
Example 22
    def work(self, job_id):
        cid = self.cids[job_id]
        entries: List[SimpleRankedListEntry] = self.ranked_list[str(cid)]
        max_items = 1000 * 1000
        base = job_id * max_items
        end = base + max_items
        data_id_manager = DataIDManager(base, end)
        insts = self.get_instances(cid, data_id_manager, entries)
        save_path = os.path.join(self.out_dir, str(job_id))
        writer = self.writer
        write_records_w_encode_fn(save_path, writer.encode, insts)
        info_dir = self.out_dir + "_info"
        exist_or_mkdir(info_dir)
        info_path = os.path.join(info_dir, str(job_id) + ".info")
        json.dump(data_id_manager.id_to_info, open(info_path, "w"))
Example 23
    def write(self, insts: Iterable[PairedInstance], out_path, length=0):
        def encode_fn(inst: PairedInstance) -> OrderedDict:
            return combine_features(inst.tokens1, inst.seg_ids1, inst.tokens2,
                                    inst.seg_ids2, self.tokenizer,
                                    self.max_seq_length)

        return write_records_w_encode_fn(out_path, encode_fn, insts, length)
Example 24
def main():
    data_id_manager = DataIDManager()
    data = []
    for text in enum_f5_data():
        info = {
            'text': text,
        }
        data_id = data_id_manager.assign(info)
        label = 0
        data.append(TextInstance(text, label, data_id))

    encode_fn = get_encode_fn_w_data_id(512, False)
    save_path = at_output_dir("clue_counter_arg", "clue_f5.tfrecord")
    write_records_w_encode_fn(save_path, encode_fn, data)

    info_save_path = at_output_dir("clue_counter_arg", "clue_f5.tfrecord.info")
    json.dump(data_id_manager.id_to_info, open(info_save_path, "w"))
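
Example 24 shows the DataIDManager contract most directly: assign(info) hands out a fresh data_id and records the info dict in id_to_info, which is later dumped next to the tfrecord. A minimal sketch consistent with the calls seen in these examples follows; the real class may add range checks or other bookkeeping, so this is only an illustration.

class DataIDManagerSketch:
    def __init__(self, id_start=0, id_end=None):
        # id range as seen in the examples, e.g. DataIDManager(0, 1000 * 1000)
        self.id_to_info = {}
        self.next_id = id_start
        self.id_end = id_end

    def assign(self, info):
        # Hand out the next id and remember the associated info dict.
        data_id = self.next_id
        if self.id_end is not None:
            assert data_id < self.id_end, "data_id range exhausted"
        self.id_to_info[data_id] = info
        self.next_id += 1
        return data_id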
Example 25
    def work(self, job_id):
        max_data_per_job = 1000 * 1000
        base = job_id * max_data_per_job
        data_id_manager = DataIDManager(base, base + max_data_per_job)
        todo = self.qk_candidate[job_id:job_id + 1]
        tprint("Generating instances")
        insts: List = self.generator.generate(todo, data_id_manager)
        tprint("{} instances".format(len(insts)))
        save_path = os.path.join(self.out_dir, str(job_id))
        tprint("Writing")
        write_records_w_encode_fn(save_path, self.generator.encode_fn, insts)
        tprint("writing done")

        info_dir = self.out_dir + "_info"
        exist_or_mkdir(info_dir)
        info_path = os.path.join(info_dir, str(job_id) + ".info")
        json.dump(data_id_manager.id_to_info, open(info_path, "w"))
Example 26
def generate_selected_training_data_w_json(info, max_seq_length, save_dir,
                                           get_score_fn, max_seg):
    data_id_manager = DataIDManager(0, 1000000)
    tprint("data gen")

    def get_query_id_group(query_id):
        for st, ed in robust_query_intervals:
            if st <= int(query_id) <= ed:
                return st

        assert False

    tokenizer = get_tokenizer()
    for data_id, e in info.items():
        input_ids = tokenizer.convert_tokens_to_ids(e['tokens'])
        e['input_ids'] = input_ids

    maybe_num_insts = int(len(info) / 4)
    ticker = TimeEstimator(maybe_num_insts)
    itr = enum_best_segments(get_score_fn, info, max_seg)
    insts = collections.defaultdict(list)
    for selected_entry in itr:
        ticker.tick()
        selected = selected_entry
        query_id = selected['query_id']
        q_group = get_query_id_group(query_id)
        assert len(selected['tokens']) == len(selected['seg_ids'])
        input_ids = tokenizer.convert_tokens_to_ids(selected['tokens'])
        selected['input_ids'] = pad0(input_ids, max_seq_length)
        selected['seg_ids'] = pad0(selected['seg_ids'], max_seq_length)
        # data_id = data_id_manager.assign(selected_segment.to_info_d())
        data_id = 0
        ci = InstAsInputIds(selected['input_ids'], selected['seg_ids'],
                            selected['label'], data_id)
        insts[q_group].append(ci)

    def encode_fn(inst: InstAsInputIds) -> collections.OrderedDict:
        return encode_inst_as_input_ids(max_seq_length, inst)

    tprint("writing")
    for q_group, insts_per_group in insts.items():
        out_path = os.path.join(save_dir, str(q_group))
        write_records_w_encode_fn(out_path, encode_fn, insts_per_group,
                                  len(insts_per_group))
        save_info(save_dir, data_id_manager, str(q_group) + ".info")
Example 27
    def write(self, insts: List[PairedInstance], out_path: str):
        def encode_fn(inst: PairedInstance) -> OrderedDict:
            return combine_features(inst.tokens1, inst.seg_ids1, inst.tokens2, inst.seg_ids2,
                                    self.tokenizer, self.max_seq_length)
        try:
            length = len(insts)
        except TypeError:
            length = 0

        return write_records_w_encode_fn(out_path, encode_fn, insts, length)
Example 28
def main():
    dir_path = os.path.join(output_path, "perspective_paraphrase")
    seq_length = 100
    tokenizer = get_tokenizer()
    tokens_d = {}

    def get_tokens(pid):
        if pid not in tokens_d:
            text = perspective_getter(pid)
            tokens_d[pid] = tokenizer.tokenize(text)

        return tokens_d[pid]

    def encode_fn(inst):
        return encode(tokenizer, get_tokens, seq_length, inst)

    exist_or_mkdir(dir_path)
    for split in splits:
        insts = generate_pair_insts(split)
        save_path = os.path.join(dir_path, split)
        write_records_w_encode_fn(save_path, encode_fn, insts)
Example 29
    def work(self, job_id):
        data_id_man = DataIDManager()
        insts = self.generate_instances(job_id, data_id_man)
        save_path = os.path.join(self.out_dir, str(job_id))

        def encode_fn(inst: Instance):
            tokens1 = inst.tokens1
            max_seg2_len = self.max_seq_length - 3 - len(tokens1)

            tokens2 = inst.tokens2[:max_seg2_len]
            tokens = ["[CLS]"] + tokens1 + ["[SEP]"] + tokens2 + ["[SEP]"]

            segment_ids = [0] * (len(tokens1) + 2) \
                          + [1] * (len(tokens2) + 1)
            tokens = tokens[:self.max_seq_length]
            segment_ids = segment_ids[:self.max_seq_length]
            features = get_basic_input_feature(self.tokenizer, self.max_seq_length, tokens, segment_ids)
            features['label_ids'] = create_int_feature([inst.label])
            features['data_id'] = create_int_feature([inst.data_id])
            return features

        write_records_w_encode_fn(save_path, encode_fn, insts)
        info_save_path = os.path.join(self.info_out_dir, str(job_id))
        json.dump(data_id_man.id_to_info, open(info_save_path, "w"))
Example 30
    def write(self, insts: List[SegDoc], out_path: str):
        def encode_fn(inst: ClassificationInstance) -> collections.OrderedDict:
            return NotImplemented

        write_records_w_encode_fn(out_path, encode_fn, insts, len(insts))
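
All of the examples above share one calling convention: write_records_w_encode_fn(output_path, encode_fn, instances), with an optional instance count used in some calls, where encode_fn maps one instance to an OrderedDict of tf.train.Feature objects. The following is a hedged sketch of that contract assuming a standard TFRecord writer; the real helper may differ, for example in progress logging or error handling.

import collections
from typing import Callable, Iterable, TypeVar

import tensorflow as tf

T = TypeVar("T")


def write_records_w_encode_fn_sketch(output_path: str,
                                     encode_fn: Callable[[T], collections.OrderedDict],
                                     instances: Iterable[T],
                                     length: int = 0) -> None:
    # length is assumed to be used only for progress display in the real helper.
    with tf.io.TFRecordWriter(output_path) as writer:
        for inst in instances:
            features = encode_fn(inst)  # OrderedDict[str, tf.train.Feature]
            example = tf.train.Example(features=tf.train.Features(feature=features))
            writer.write(example.SerializeToString())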