def make_cppnc_problem(passage_score_path: FilePath,
                       data_id_to_info: Dict,
                       claims: List[Dict],
                       candidate_perspectives,
                       config,
                       save_name: str,
                       encode_inner_fn) -> None:
    output: List[Tuple[int, List[Dict]]] = collect_good_passages(data_id_to_info, passage_score_path, config)
    joined_payloads: List = list(join_perspective(output, candidate_perspectives))
    tokenizer = get_tokenizer()
    data_id_man = DataIDManager()
    payloads: Iterable[PayloadAsTokens] = put_texts(joined_payloads, claims, tokenizer, data_id_man)
    max_seq_length = 512

    def encode_fn(r: PayloadAsTokens):
        return encode_inner_fn(max_seq_length, tokenizer, r)

    out_dir = os.path.join(output_path, "cppnc")
    exist_or_mkdir(out_dir)
    save_path = os.path.join(out_dir, save_name + ".tfrecord")
    write_records_w_encode_fn(save_path, encode_fn, payloads)

    info_save_path = os.path.join(out_dir, save_name + ".info")
    print("Payload size : ", len(data_id_man.id_to_info))
    json.dump(data_id_man.id_to_info, open(info_save_path, "w"))
    print("tfrecord saved at :", save_path)
    print("info saved at :", info_save_path)
def write_qck_as_tfrecord(save_path, payloads: Iterable[QCKCompactEntry]):
    data_id_man = DataIDManager(0, 1000 * 1000)
    tokenizer = get_tokenizer()
    cache_tokenizer = CachedTokenizer(tokenizer)
    max_seq_length = 512

    def encode_fn(e: QCKCompactEntry) -> OrderedDict:
        query, candidate, qk_out_entry = e
        candidate: QCKCandidate = candidate
        info = {
            'query': query,
            'candidate': candidate,
            'kdp': qk_out_entry.kdp
        }
        p = PayloadAsTokens(passage=qk_out_entry.passage_tokens,
                            text1=cache_tokenizer.tokenize(query.text),
                            text2=cache_tokenizer.tokenize(candidate.text),
                            data_id=data_id_man.assign(info),
                            is_correct=0)
        return encode_two_inputs(max_seq_length, tokenizer, p)

    write_records_w_encode_fn(save_path, encode_fn, payloads)
    return data_id_man
def generate_selected_training_data_ablation_only_pos(info, key, max_seq_length, save_dir, score_dir):
    data_id_manager = DataIDManager(0, 1000000)
    out_path = os.path.join(save_dir, str(key))
    pred_path = os.path.join(score_dir, str(key))
    tprint("data gen")
    itr = enum_best_segments(pred_path, info)
    insts = []
    for selected_entry in itr:
        selected = decompress_seg_ids_entry(selected_entry)
        assert len(selected['input_ids']) == len(selected['seg_ids'])
        selected['input_ids'] = pad0(selected['input_ids'], max_seq_length)
        selected['seg_ids'] = pad0(selected['seg_ids'], max_seq_length)
        # data_id = data_id_manager.assign(selected_segment.to_info_d())
        data_id = 0
        ci = InstAsInputIds(
            selected['input_ids'],
            selected['seg_ids'],
            selected['label'],
            data_id)
        insts.append(ci)

    def encode_fn(inst: InstAsInputIds) -> collections.OrderedDict:
        return encode_inst_as_input_ids(max_seq_length, inst)

    tprint("writing")
    write_records_w_encode_fn(out_path, encode_fn, insts, len(insts))
    save_info(save_dir, data_id_manager, str(key) + ".info")
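# `pad0` is a project helper that is not shown in this listing. Based on how it
# is used above (padding input_ids / seg_ids out to max_seq_length), it
# presumably right-pads a list with zeros; a minimal sketch under that
# assumption, not the repository's actual definition:
def pad0(seq, max_len):
    # Assumed behavior: zero-pad `seq` on the right so that len(result) == max_len.
    assert len(seq) <= max_len
    return seq + [0] * (max_len - len(seq))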
def write_to_file(output_path, g2: Iterable[Tuple[int, Tuple[Vectors, Label]]], max_entries):
    def encode(e: Tuple[int, Tuple[Vectors, Label]]) -> OrderedDict:
        data_id, (vector, label) = e
        features = OrderedDict()
        features['label_ids'] = create_int_feature([label])
        features['data_id'] = create_int_feature([data_id])
        vector = np.stack(vector, axis=0)  # [n_entries, seq_len, hidden_unit]
        vector = vector[:max_entries]
        vector_len, seq_len, hidden_unit = np.shape(vector)
        # np.int was removed from recent NumPy releases; plain int keeps the original dtype.
        valid_mask = np.ones([vector_len, seq_len, 1], int)
        if len(vector) < max_entries:
            pad_len = max_entries - len(vector)
            vector = np.concatenate([vector, np.zeros([pad_len, seq_len, hidden_unit])], axis=0)
            valid_mask = np.concatenate([valid_mask, np.zeros([pad_len, seq_len, 1], int)], axis=0)
        v = np.reshape(vector, [-1])  # [max_entries * seq_len * hidden_unit]
        valid_mask = np.reshape(valid_mask, [-1])  # [max_entries * seq_len]
        features['vectors'] = create_float_feature(v)
        features['valid_mask'] = create_int_feature(valid_mask)
        return features

    write_records_w_encode_fn(output_path, encode, g2)
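# `create_int_feature` and `create_float_feature` are not defined in this
# listing. They are presumably the usual BERT-style tf.train.Feature wrappers;
# a sketch under that assumption:
import tensorflow as tf

def create_int_feature(values):
    # Wrap an iterable of ints as an int64_list Feature.
    return tf.train.Feature(int64_list=tf.train.Int64List(value=list(values)))

def create_float_feature(values):
    # Wrap an iterable of floats as a float_list Feature.
    return tf.train.Feature(float_list=tf.train.FloatList(value=list(values)))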
def write_records(records: List[PayloadAsTokens], max_seq_length, output_path):
    tokenizer = get_tokenizer()

    def encode(inst: PayloadAsTokens) -> OrderedDict:
        return encode_two_inputs(max_seq_length, tokenizer, inst)

    write_records_w_encode_fn(output_path, encode, records)
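# Every snippet in this listing funnels into `write_records_w_encode_fn`, whose
# definition is not included. Judging from the call sites (output path,
# per-instance encode function, iterable of instances, optional length), it
# presumably looks roughly like the following sketch; in the real helper the
# `length` argument may only drive progress reporting.
import tensorflow as tf

def write_records_w_encode_fn(output_path, encode_fn, instances, length=0):
    # Encode each instance into an OrderedDict of tf.train.Feature objects and
    # serialize it as a tf.train.Example into a single TFRecord file.
    with tf.io.TFRecordWriter(output_path) as writer:
        for inst in instances:
            features = encode_fn(inst)
            example = tf.train.Example(features=tf.train.Features(feature=features))
            writer.write(example.SerializeToString())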
def make_training_data(config):
    pos_doc_list_path = config['doc_list_path']
    q_res_path = config['q_res_path']
    save_path = config['save_path']
    balance_test = config['balance_test']
    max_seq_length = 512
    pos_doc_ids = set([l.strip() for l in open(pos_doc_list_path, "r").readlines()])
    doc_ids_unique = get_doc_ids_from_ranked_list_path(q_res_path)
    insts = generate(list(pos_doc_ids), list(doc_ids_unique), max_seq_length)

    train_size = int(0.9 * len(insts))
    train_insts = insts[:train_size]
    val_insts = insts[train_size:]
    val_pos_insts = list([i for i in val_insts if i.label == 1])
    val_neg_insts = list([i for i in val_insts if not i.label])
    print("num pos inst in val", len(val_pos_insts))
    if balance_test:
        val_neg_insts = val_neg_insts[:len(val_pos_insts)]
        val_insts = val_pos_insts + val_neg_insts

    tokenizer = get_tokenizer()

    def encode_fn(inst: Instance) -> OrderedDict:
        return encode_w_data_id(tokenizer, max_seq_length, inst)

    write_records_w_encode_fn(save_path + "train", encode_fn, train_insts)
    write_records_w_encode_fn(save_path + "val", encode_fn, val_insts)
def make_tfrecord(source_name, target_name):
    source_data = data_d[source_name]
    target_data = data_d[target_name]
    combined_data = combine_source_and_target(source_data, target_data, 1)
    save_path = at_output_dir(dir_name, "{}_to_{}_train".format(source_name, target_name))
    write_records_w_encode_fn(save_path, encode_fn, combined_data)
def make_cppnc_dummy_problem(claims: List[Dict],
                             candidate_perspectives,
                             save_name: str,
                             encode_inner_fn) -> None:
    empty_passage = {'passage': []}

    def get_payload() -> Iterable[Tuple[int, int, List[Dict]]]:
        for cid, candidates in candidate_perspectives.items():
            for candi in candidates:
                yield cid, candi['pid'], [empty_passage]

    tokenizer = get_tokenizer()
    data_id_man = DataIDManager()
    payloads: Iterable[PayloadAsTokens] = put_texts(get_payload(), claims, tokenizer, data_id_man)
    max_seq_length = 512

    def encode_fn(r: PayloadAsTokens):
        return encode_inner_fn(max_seq_length, tokenizer, r)

    out_dir = os.path.join(output_path, "cppnc")
    exist_or_mkdir(out_dir)
    save_path = os.path.join(out_dir, save_name + ".tfrecord")
    write_records_w_encode_fn(save_path, encode_fn, payloads)

    info_save_path = os.path.join(out_dir, save_name + ".info")
    print("Payload size : ", len(data_id_man.id_to_info))
    json.dump(data_id_man.id_to_info, open(info_save_path, "w"))
    print("tfrecord saved at :", save_path)
    print("info saved at :", info_save_path)
def do_generate_jobs(candidate_dict, is_correct_fn, save_dir, split):
    queries = get_qck_queries(split)
    generator = QCInstanceGenerator(candidate_dict, is_correct_fn)
    data_id_manager = DataIDManager()
    insts = generator.generate(queries, data_id_manager)
    save_path = os.path.join(save_dir, split)
    write_records_w_encode_fn(save_path, generator.encode_fn, insts)
    json.dump(data_id_manager.id_to_info, open(save_path + ".info", "w"))
def write_records(records: List[Payload], max_seq_length, output_path):
    tokenizer = get_tokenizer()

    def encode(inst: Payload) -> OrderedDict:
        inst_2 = convert_sub_token(tokenizer, inst)
        return encode_inner(max_seq_length, tokenizer, inst_2)

    write_records_w_encode_fn(output_path, encode, records)
def main():
    raw_payload: List[ClaimPassages] = load_dev_payload()
    save_path = os.path.join(output_path, "pc_dev_passage_payload")
    encode = get_encode_fn(512)
    data_id_manage = DataIDManager()
    insts = list(generate_instances(raw_payload, data_id_manage))
    write_records_w_encode_fn(save_path, encode, insts, len(insts))
    save_to_pickle(data_id_manage.id_to_info, "pc_dev_passage_payload_info")
def main():
    exist_or_mkdir(os.path.join(output_path, "aawd_tfrecord"))
    train, dev, test = load_aawd_splits()
    todo = [(train, "train"), (dev, "dev"), (test, "test")]
    encode_fn = get_encode_fn(256)
    for data, split in todo:
        save_path = at_output_dir("aawd_tfrecord", split)
        write_records_w_encode_fn(save_path, encode_fn, data)
def write(self, insts: List[ClassificationInstanceWDataID], out_path: str):
    def encode_fn(inst: ClassificationInstanceWDataID) -> collections.OrderedDict:
        return encode_classification_instance_w_data_id(
            self.tokenizer, self.max_seq_length, inst)

    write_records_w_encode_fn(out_path, encode_fn, insts, len(insts))
def write_with_classification_instance_with_id(
        tokenizer,
        max_seq_length,
        insts: Iterable[ClassificationInstanceWDataID],
        out_path: str):
    def encode_fn(inst: ClassificationInstanceWDataID) -> collections.OrderedDict:
        return encode_classification_instance_w_data_id(tokenizer, max_seq_length, inst)

    write_records_w_encode_fn(out_path, encode_fn, insts)
def make_pc_qc(queries: Iterable[QCKQuery],
               eval_candidate: Dict[str, List[QCKCandidate]],
               is_correct_fn,
               save_path: str):
    generator = QCInstanceGenerator(eval_candidate, is_correct_fn)
    data_id_manager = DataIDManager(0, 10000 * 10000)
    insts = generator.generate(queries, data_id_manager)
    insts = list(insts)
    write_records_w_encode_fn(save_path, generator.encode_fn, insts)
    json.dump(data_id_manager.id_to_info, open(save_path + ".info", "w"))
def generate_and_write(file_name, generate_fn, tokenizer):
    data_id_man = DataIDManager()
    inst_list = generate_fn(data_id_man)
    max_seq_length = 300
    save_path = at_output_dir("alamri_tfrecord", file_name)
    encode_fn = get_encode_fn(max_seq_length, tokenizer)
    write_records_w_encode_fn(save_path, encode_fn, inst_list)
    info_save_path = at_output_dir("alamri_tfrecord", file_name + ".info")
    json.dump(data_id_man.id_to_info, open(info_save_path, "w"))
def binary_gen():
    exist_or_mkdir(os.path.join(output_path, "argu_ana_tfrecord"))
    train_x, train_y, dev_x, dev_y = get_argu_pointwise_data()
    train = zip(train_x, train_y)
    dev = zip(dev_x, dev_y)
    todo = [(train, "train"), (dev, "dev")]
    encode_fn = get_encode_fn(512)
    for data, split in todo:
        save_path = at_output_dir("argu_ana_tfrecord", split)
        write_records_w_encode_fn(save_path, encode_fn, data)
def write_qc_records(output_path, qc_records):
    data_id_man = DataIDManager()
    instances = collect_info_transform(qc_records, data_id_man)
    tokenizer = get_tokenizer()
    max_seq_length = 512

    def encode_fn(inst: QCInstance):
        return encode(tokenizer, max_seq_length, inst)

    write_records_w_encode_fn(output_path, encode_fn, instances)
    json.dump(data_id_man.id_to_info, open(output_path + ".info", "w"))
def make_and_write(split):
    docs = load_for_split(split)
    data: List[Tuple[str, bool]] = lflatten(lmap(get_inst_from_doc, docs))
    max_seq_length = 512
    random.shuffle(data)
    dir_path = os.path.join(output_path, "mpqa")
    tokenizer = get_tokenizer()

    def encode_fn(t: Tuple[str, bool]) -> OrderedDict:
        return encode(tokenizer, max_seq_length, t)

    exist_or_mkdir(dir_path)
    save_path = os.path.join(dir_path, split)
    write_records_w_encode_fn(save_path, encode_fn, data)
def main():
    data_id_man = DataIDManager()
    q_res_path = sys.argv[1]
    save_path = sys.argv[2]
    max_seq_length = 512
    tokenizer = get_tokenizer()
    insts = sentence_payload_gen(q_res_path, 100, data_id_man)

    def encode_fn(t: Tuple[str, bool, int]) -> OrderedDict:
        return encode_w_data_id(tokenizer, max_seq_length, t)

    write_records_w_encode_fn(save_path, encode_fn, insts)
    json_save_path = save_path + ".info"
    json.dump(data_id_man.id_to_info, open(json_save_path, "w"))
def make_tfrecord(self, job_id: int):
    save_path = os.path.join(self.request_dir, str(job_id))
    kdp_list = pickle.load(open(save_path, "rb"))
    data_id_manager = DataIDManager(0, 1000 * 1000)
    print("{} kdp".format(len(kdp_list)))
    insts = self.qck_generator.generate(kdp_list, data_id_manager)
    record_save_path = os.path.join(self.tf_record_dir, str(job_id))
    write_records_w_encode_fn(record_save_path, self.qck_generator.encode_fn, insts)
    # Save the data_id info for backup
    info_save_path = os.path.join(self.tf_record_dir, "{}.info".format(job_id))
    pickle.dump(data_id_manager.id_to_info, open(info_save_path, "wb"))
    # Launch the estimator job
    add_estimator_job(job_id)
def work(self, job_id):
    cid = self.cids[job_id]
    entries: List[SimpleRankedListEntry] = self.ranked_list[str(cid)]
    max_items = 1000 * 1000
    base = job_id * max_items
    end = base + max_items
    data_id_manager = DataIDManager(base, end)
    insts = self.get_instances(cid, data_id_manager, entries)
    save_path = os.path.join(self.out_dir, str(job_id))
    writer = self.writer
    write_records_w_encode_fn(save_path, writer.encode, insts)
    info_dir = self.out_dir + "_info"
    exist_or_mkdir(info_dir)
    info_path = os.path.join(info_dir, str(job_id) + ".info")
    json.dump(data_id_manager.id_to_info, open(info_path, "w"))
def write(self, insts: Iterable[PairedInstance], out_path, length=0):
    def encode_fn(inst: PairedInstance) -> OrderedDict:
        return combine_features(inst.tokens1, inst.seg_ids1,
                                inst.tokens2, inst.seg_ids2,
                                self.tokenizer, self.max_seq_length)

    return write_records_w_encode_fn(out_path, encode_fn, insts, length)
def main():
    data_id_manager = DataIDManager()
    data = []
    for text in enum_f5_data():
        info = {
            'text': text,
        }
        data_id = data_id_manager.assign(info)
        label = 0
        data.append(TextInstance(text, label, data_id))

    encode_fn = get_encode_fn_w_data_id(512, False)
    save_path = at_output_dir("clue_counter_arg", "clue_f5.tfrecord")
    write_records_w_encode_fn(save_path, encode_fn, data)
    info_save_path = at_output_dir("clue_counter_arg", "clue_f5.tfrecord.info")
    json.dump(data_id_manager.id_to_info, open(info_save_path, "w"))
def work(self, job_id):
    max_data_per_job = 1000 * 1000
    base = job_id * max_data_per_job
    data_id_manager = DataIDManager(base, base + max_data_per_job)
    todo = self.qk_candidate[job_id:job_id + 1]
    tprint("Generating instances")
    insts: List = self.generator.generate(todo, data_id_manager)
    tprint("{} instances".format(len(insts)))
    save_path = os.path.join(self.out_dir, str(job_id))
    tprint("Writing")
    write_records_w_encode_fn(save_path, self.generator.encode_fn, insts)
    tprint("writing done")
    info_dir = self.out_dir + "_info"
    exist_or_mkdir(info_dir)
    info_path = os.path.join(info_dir, str(job_id) + ".info")
    json.dump(data_id_manager.id_to_info, open(info_path, "w"))
def generate_selected_training_data_w_json(info, max_seq_length, save_dir, get_score_fn, max_seg):
    data_id_manager = DataIDManager(0, 1000000)
    tprint("data gen")

    def get_query_id_group(query_id):
        for st, ed in robust_query_intervals:
            if st <= int(query_id) <= ed:
                return st
        assert False

    tokenizer = get_tokenizer()
    for data_id, e in info.items():
        input_ids = tokenizer.convert_tokens_to_ids(e['tokens'])
        e['input_ids'] = input_ids

    maybe_num_insts = int(len(info) / 4)
    ticker = TimeEstimator(maybe_num_insts)
    itr = enum_best_segments(get_score_fn, info, max_seg)
    insts = collections.defaultdict(list)
    for selected_entry in itr:
        ticker.tick()
        selected = selected_entry
        query_id = selected['query_id']
        q_group = get_query_id_group(query_id)
        assert len(selected['tokens']) == len(selected['seg_ids'])
        input_ids = tokenizer.convert_tokens_to_ids(selected['tokens'])
        selected['input_ids'] = pad0(input_ids, max_seq_length)
        selected['seg_ids'] = pad0(selected['seg_ids'], max_seq_length)
        # data_id = data_id_manager.assign(selected_segment.to_info_d())
        data_id = 0
        ci = InstAsInputIds(selected['input_ids'],
                            selected['seg_ids'],
                            selected['label'],
                            data_id)
        insts[q_group].append(ci)

    def encode_fn(inst: InstAsInputIds) -> collections.OrderedDict:
        return encode_inst_as_input_ids(max_seq_length, inst)

    tprint("writing")
    for q_group, insts_per_group in insts.items():
        out_path = os.path.join(save_dir, str(q_group))
        write_records_w_encode_fn(out_path, encode_fn, insts_per_group, len(insts_per_group))
        save_info(save_dir, data_id_manager, str(q_group) + ".info")
def write(self, insts: List[PairedInstance], out_path: str):
    def encode_fn(inst: PairedInstance) -> OrderedDict:
        return combine_features(inst.tokens1, inst.seg_ids1,
                                inst.tokens2, inst.seg_ids2,
                                self.tokenizer, self.max_seq_length)

    try:
        length = len(insts)
    except TypeError:
        length = 0
    return write_records_w_encode_fn(out_path, encode_fn, insts, length)
def main():
    dir_path = os.path.join(output_path, "perspective_paraphrase")
    seq_length = 100
    tokenizer = get_tokenizer()
    tokens_d = {}

    def get_tokens(pid):
        if pid not in tokens_d:
            text = perspective_getter(pid)
            tokens_d[pid] = tokenizer.tokenize(text)
        return tokens_d[pid]

    def encode_fn(inst):
        return encode(tokenizer, get_tokens, seq_length, inst)

    exist_or_mkdir(dir_path)
    for split in splits:
        insts = generate_pair_insts(split)
        save_path = os.path.join(dir_path, split)
        write_records_w_encode_fn(save_path, encode_fn, insts)
def work(self, job_id):
    data_id_man = DataIDManager()
    insts = self.generate_instances(job_id, data_id_man)
    save_path = os.path.join(self.out_dir, str(job_id))

    def encode_fn(inst: Instance):
        tokens1 = inst.tokens1
        max_seg2_len = self.max_seq_length - 3 - len(tokens1)
        tokens2 = inst.tokens2[:max_seg2_len]
        tokens = ["[CLS]"] + tokens1 + ["[SEP]"] + tokens2 + ["[SEP]"]
        segment_ids = [0] * (len(tokens1) + 2) + [1] * (len(tokens2) + 1)
        tokens = tokens[:self.max_seq_length]
        segment_ids = segment_ids[:self.max_seq_length]
        features = get_basic_input_feature(self.tokenizer,
                                           self.max_seq_length,
                                           tokens,
                                           segment_ids)
        features['label_ids'] = create_int_feature([inst.label])
        features['data_id'] = create_int_feature([inst.data_id])
        return features

    write_records_w_encode_fn(save_path, encode_fn, insts)
    info_save_path = os.path.join(self.info_out_dir, str(job_id))
    json.dump(data_id_man.id_to_info, open(info_save_path, "w"))
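# `get_basic_input_feature` is another helper that is not shown here. It
# presumably builds the standard BERT input features (input_ids / input_mask /
# segment_ids) padded to max_seq_length; a sketch under that assumption,
# reusing the assumed create_int_feature above:
def get_basic_input_feature(tokenizer, max_seq_length, tokens, segment_ids):
    input_ids = tokenizer.convert_tokens_to_ids(tokens)
    input_mask = [1] * len(input_ids)
    segment_ids = list(segment_ids)
    # Zero-pad all three sequences to the fixed length.
    while len(input_ids) < max_seq_length:
        input_ids.append(0)
        input_mask.append(0)
        segment_ids.append(0)
    features = collections.OrderedDict()
    features['input_ids'] = create_int_feature(input_ids)
    features['input_mask'] = create_int_feature(input_mask)
    features['segment_ids'] = create_int_feature(segment_ids)
    return features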
def write(self, insts: List[SegDoc], out_path: str):
    def encode_fn(inst: ClassificationInstance) -> collections.OrderedDict:
        # Stub: encoding for SegDoc instances is not implemented in this source.
        return NotImplemented

    write_records_w_encode_fn(out_path, encode_fn, insts, len(insts))