import collections
import os

# Repo-internal helpers (DataIDManager, pad0, tprint, enum_best_segments,
# decompress_seg_ids_entry, InstAsInputIds, encode_inst_as_input_ids,
# write_records_w_encode_fn, save_info, ...) are assumed to be imported
# elsewhere in this module.


def generate_selected_training_data_ablation_only_pos(info, key, max_seq_length, save_dir, score_dir):
    data_id_manager = DataIDManager(0, 1000000)
    out_path = os.path.join(save_dir, str(key))
    pred_path = os.path.join(score_dir, str(key))
    tprint("data gen")
    itr = enum_best_segments(pred_path, info)
    insts = []
    for selected_entry in itr:
        selected = decompress_seg_ids_entry(selected_entry)
        assert len(selected['input_ids']) == len(selected['seg_ids'])
        # Zero-pad both sequences to the fixed model input length.
        selected['input_ids'] = pad0(selected['input_ids'], max_seq_length)
        selected['seg_ids'] = pad0(selected['seg_ids'], max_seq_length)
        # data_id = data_id_manager.assign(selected_segment.to_info_d())
        data_id = 0
        ci = InstAsInputIds(
            selected['input_ids'],
            selected['seg_ids'],
            selected['label'],
            data_id)
        insts.append(ci)

    def encode_fn(inst: InstAsInputIds) -> collections.OrderedDict:
        return encode_inst_as_input_ids(max_seq_length, inst)

    tprint("writing")
    write_records_w_encode_fn(out_path, encode_fn, insts, len(insts))
    save_info(save_dir, data_id_manager, str(key) + ".info")
def get_dict_input_features(tokenizer, max_def_length, max_d_loc, max_word_len,
                            segment_ids, dict_def, word_loc_list, dict_word):
    # Encode the dictionary definition, truncated to max_def_length.
    d_input_ids = tokenizer.convert_tokens_to_ids(dict_def)
    d_input_ids = d_input_ids[:max_def_length]
    d_input_mask = [1] * len(d_input_ids)

    # The definition inherits the segment id of the first location where the
    # target word appears in the main sequence; with no locations, leave it
    # empty and let padding fill it with zeros.
    if word_loc_list:
        target_segment = segment_ids[word_loc_list[0]]
        d_segment_ids = [target_segment] * len(d_input_ids)
    else:
        d_segment_ids = []

    if dict_word is not None:
        selected_word = tokenizer.convert_tokens_to_ids(dict_word.subword_rep)
    else:
        selected_word = []

    # Zero-pad every feature to its fixed length.
    d_input_ids = pad0(d_input_ids, max_def_length)
    d_input_mask = pad0(d_input_mask, max_def_length)
    d_location_ids = pad0(word_loc_list[:max_d_loc], max_d_loc)
    d_segment_ids = pad0(d_segment_ids, max_def_length)
    selected_word = pad0(selected_word, max_word_len)

    features = collections.OrderedDict()
    features["d_input_ids"] = btd.create_int_feature(d_input_ids)
    features["d_input_mask"] = btd.create_int_feature(d_input_mask)
    features["d_segment_ids"] = btd.create_int_feature(d_segment_ids)
    features["d_location_ids"] = btd.create_int_feature(d_location_ids)
    features["selected_word"] = btd.create_int_feature(selected_word)
    return features
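# Hedged sketch (not from this repo): `btd.create_int_feature` above is
# assumed to mirror BERT's data-creation helper, which wraps a list of ints
# in a tf.train.Feature. Shown only to clarify what get_dict_input_features
# produces; the actual btd module may differ.
#
# import tensorflow as tf
#
# def create_int_feature(values):
#     # Wrap a list of ints as an int64 feature, as in BERT's create_int_feature.
#     return tf.train.Feature(int64_list=tf.train.Int64List(value=list(values)))
#
# The resulting OrderedDict can then be serialized into a TFRecord example:
#
# example = tf.train.Example(features=tf.train.Features(feature=features))
# writer.write(example.SerializeToString())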
def encode_dict_as_feature(self, dictionary):
    # Pre-encode each dictionary definition as (input_ids, input_mask),
    # truncated and zero-padded to max_def_length.
    new_dict = {}
    for word, dict_def in dictionary.items():
        d_input_ids = self.tokenizer.convert_tokens_to_ids(dict_def)
        d_input_ids = d_input_ids[:self.max_def_length]
        d_input_mask = [1] * len(d_input_ids)
        d_input_ids = pad0(d_input_ids, self.max_def_length)
        d_input_mask = pad0(d_input_mask, self.max_def_length)
        new_dict[word] = d_input_ids, d_input_mask
    return new_dict
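# Hedged sketch: pad0 is assumed to zero-pad a sequence to a fixed length,
# and callers are expected to truncate first. A minimal implementation
# consistent with its uses in this file:
#
# def pad0(seq, target_len):
#     # Append zeros until the sequence reaches target_len.
#     return list(seq) + [0] * (target_len - len(seq))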
def generate_selected_training_data_w_json(info, max_seq_length, save_dir, get_score_fn, max_seg):
    data_id_manager = DataIDManager(0, 1000000)
    tprint("data gen")

    def get_query_id_group(query_id):
        # Map a query id to the interval it belongs to, keyed by the
        # interval's start value.
        for st, ed in robust_query_intervals:
            if st <= int(query_id) <= ed:
                return st
        assert False

    tokenizer = get_tokenizer()
    for data_id, e in info.items():
        input_ids = tokenizer.convert_tokens_to_ids(e['tokens'])
        e['input_ids'] = input_ids

    # Rough estimate of the number of instances, used only for progress display.
    maybe_num_insts = int(len(info) / 4)
    ticker = TimeEstimator(maybe_num_insts)
    itr = enum_best_segments(get_score_fn, info, max_seg)
    insts = collections.defaultdict(list)
    for selected in itr:
        ticker.tick()
        query_id = selected['query_id']
        q_group = get_query_id_group(query_id)
        assert len(selected['tokens']) == len(selected['seg_ids'])
        input_ids = tokenizer.convert_tokens_to_ids(selected['tokens'])
        selected['input_ids'] = pad0(input_ids, max_seq_length)
        selected['seg_ids'] = pad0(selected['seg_ids'], max_seq_length)
        # data_id = data_id_manager.assign(selected_segment.to_info_d())
        data_id = 0
        ci = InstAsInputIds(selected['input_ids'], selected['seg_ids'], selected['label'], data_id)
        insts[q_group].append(ci)

    def encode_fn(inst: InstAsInputIds) -> collections.OrderedDict:
        return encode_inst_as_input_ids(max_seq_length, inst)

    tprint("writing")
    # Write one record file (and one .info file) per query-id group.
    for q_group, insts_per_group in insts.items():
        out_path = os.path.join(save_dir, str(q_group))
        write_records_w_encode_fn(out_path, encode_fn, insts_per_group, len(insts_per_group))
        save_info(save_dir, data_id_manager, str(q_group) + ".info")
def encode_dict_as_feature(self, dictionary):
    # Variant that supports multiple definition entries per word; each entry
    # is encoded as (input_ids, input_mask, segment_ids).
    new_dict = {}
    for word, entries in dictionary.items():
        enc_entries = []
        for dict_def in entries:
            d_input_ids = self.tokenizer.convert_tokens_to_ids(dict_def)
            d_input_ids = d_input_ids[:self.max_def_length]
            d_input_mask = [1] * len(d_input_ids)
            d_segment_ids = [0] * len(d_input_ids)
            d_input_ids = pad0(d_input_ids, self.max_def_length)
            d_input_mask = pad0(d_input_mask, self.max_def_length)
            d_segment_ids = pad0(d_segment_ids, self.max_def_length)
            e = d_input_ids, d_input_mask, d_segment_ids
            enc_entries.append(e)
        new_dict[word] = enc_entries
    return new_dict
def index(self, tokens):
    # Collect the unique words in `tokens` that are in the dictionary and
    # not stopwords, recording where each occurs.
    words = get_word_tokens(tokens)
    unique_words = filter_unique_words(words)
    valid_words = []
    for word in unique_words:
        if word.word in self.stopword:
            continue
        if not self.dict_contains(word.word):
            continue
        word.location = get_locations(tokens, word)
        word.location = word.location[:self.get_max_d_loc()]
        word.enc_location = pad0(word.location, self.get_max_d_loc())
        valid_words.append(word)
    return valid_words
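# Hedged usage sketch for index(): `tokens` is a subword token list; each
# returned word object carries its surface form, its (truncated) token
# locations, and a zero-padded enc_location. All names below are hypothetical.
#
# indexer = DictionaryIndexer(...)   # assumed owner class of index()
# tokens = ["the", "un", "##fold", "##ing", "story"]
# for word in indexer.index(tokens):
#     print(word.word, word.location, word.enc_location)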