def main():
    # Compute subword-level document frequency and length statistics over the
    # document segment of each record, then save the df Counter to pickle.
    dir_path = sys.argv[1]
    tokenizer = get_tokenizer()
    averager = Averager()
    sbc = SubwordConvertor()
    df = Counter()
    collection_size = 0
    ticker = TimeEstimator(485393)
    for file_path in get_dir_files(dir_path):
        for idx, record in enumerate(
                tf.compat.v1.python_io.tf_record_iterator(file_path)):
            example = tf.train.Example()
            example.ParseFromString(record)
            feature = example.features.feature
            input_ids = feature["input_ids"].int64_list.value
            tokens = tokenizer.convert_ids_to_tokens(input_ids)
            sep_idx1 = tokens.index("[SEP]")
            sep_idx2 = tokens.index("[SEP]", sep_idx1 + 1)
            doc_tokens = tokens[sep_idx1:sep_idx2]
            words = lmap(tuple, sbc.get_word_as_subtoken_tuple(doc_tokens))
            dl = len(words)
            collection_size += dl
            averager.append(dl)
            for word in set(words):
                df[word] += 1
            ticker.tick()
    print("collection length", collection_size)
    print("average dl", averager.get_average())
    save_to_pickle(df, "subword_df_robust_train")

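# Hedged sketch (assumption, not part of the original code): the per-subword
# document frequencies saved above are the usual input to a BM25-style idf.
# num_docs (the total number of documents seen) is an assumed extra input here;
# the loop above does not return it explicitly.
import math


def bm25_idf(df_count, num_docs):
    # classic Robertson/Sparck-Jones idf; some variants add 1 inside the log
    # to keep the value non-negative for very common terms
    return math.log((num_docs - df_count + 0.5) / (df_count + 0.5))
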
def get_candidate_all_passage_w_samping_predict(
        max_seq_length=256) -> Dict[str, List[QCKCandidateWToken]]:
    qrel_path = os.path.join(data_path, "robust", "qrels.rob04.txt")
    galago_rank = load_bm25_best()
    tokens_d = load_robust_tokens_for_predict(4)
    queries = load_robust04_title_query()
    tokenizer = get_tokenizer()
    out_d: Dict[str, List[QCKCandidateWToken]] = {}
    for query_id in queries:
        query = queries[query_id]
        query_tokens = tokenizer.tokenize(query)
        ranked_list = galago_rank[query_id]
        ranked_list = ranked_list[:100]
        doc_ids = list([e.doc_id for e in ranked_list])
        candidate = []
        for doc_id in doc_ids:
            tokens = tokens_d[doc_id]
            for idx, passage in enumerate(enum_passage(tokens, max_seq_length)):
                # always keep the first passage of a document,
                # sample the remaining passages at a 10% rate
                if idx == 0:
                    include = True
                else:
                    include = random.random() < 0.1
                if include:
                    c = QCKCandidateWToken(doc_id, "", passage)
                    candidate.append(c)
        out_d[query_id] = candidate
    return out_d

def main():
    tokens = ["hi", "hello"]
    seg_ids = [0, 0]
    inst = ClassificationInstance(tokens, seg_ids, 0)
    inst_list = [inst]
    out_path = "/tmp/temp.youngwoo"
    max_seq_length = 512
    tokenizer = get_tokenizer()
    input_ids = tokenizer.convert_tokens_to_ids(tokens)

    def encode_fn(inst: ClassificationInstance) -> OrderedDict:
        return encode_classification_instance(tokenizer, max_seq_length, inst)

    features_list: Iterable[OrderedDict] = map(encode_fn, inst_list)
    writer = tf.python_io.TFRecordWriter(out_path)
    for e in features_list:
        f = tf.train.Features(feature=e)
        tf_example = tf.train.Example(features=f)
        writer.write(tf_example.SerializeToString())
    writer.close()

def main():
    info_path = os.path.join(job_man_dir, "MMD_pred_info", "1.info")
    info = load_combine_info_jsons(info_path)
    tokenizer = get_tokenizer()
    cnt = 0
    fn = os.path.join(job_man_dir, "MMD_pred", "1")
    for record in tf.compat.v1.python_io.tf_record_iterator(fn):
        example = tf.train.Example()
        example.ParseFromString(record)
        feature = example.features.feature
        keys = feature.keys()
        print("---- record -----")
        v = feature["input_ids"].int64_list.value
        data_id = feature["data_id"].int64_list.value[0]
        info_entry = info[str(data_id)]
        passage_idx = info_entry['passage_idx']
        tokens = tokenizer.convert_ids_to_tokens(v)
        text = " ".join(tokens)
        sep_idx = text.find("[SEP]")
        print(passage_idx)
        print(text[:sep_idx])
        print(text[sep_idx:])
        cnt += 1
        if cnt >= 10:
            break

def write_records(records: List[Record], max_seq_length, output_path):
    tokenizer = get_tokenizer()

    def encode(record: Record) -> OrderedDict:
        tokens = ["[CLS]"] + record.claim_tokens + ["[SEP]"] \
                 + record.doc_tokens + ["[SEP]"]
        segment_ids = [0] * (len(record.claim_tokens) + 2) \
                      + [1] * (len(record.doc_tokens) + 1)
        tokens = tokens[:max_seq_length]
        segment_ids = segment_ids[:max_seq_length]
        features = get_basic_input_feature(tokenizer, max_seq_length,
                                           tokens, segment_ids)
        # per-token labels: zeros over the claim segment, then the doc scores,
        # padded with zeros to max_seq_length; label_masks marks valid positions
        labels = [0.] * (len(record.claim_tokens) + 2) + record.scores
        labels += (max_seq_length - len(labels)) * [0.]
        label_mask = [0] * (len(record.claim_tokens) + 2) + record.valid_mask
        label_mask += (max_seq_length - len(label_mask)) * [0]
        features['label_ids'] = create_float_feature(labels)
        features['label_masks'] = create_int_feature(label_mask)
        return features

    writer = RecordWriterWrap(output_path)
    features: List[OrderedDict] = lmap(encode, records)
    foreach(writer.write_feature, features)
    writer.close()

def make_training_data(config):
    pos_doc_list_path = config['doc_list_path']
    q_res_path = config['q_res_path']
    save_path = config['save_path']
    balance_test = config['balance_test']
    max_seq_length = 512
    pos_doc_ids = set(
        [l.strip() for l in open(pos_doc_list_path, "r").readlines()])
    doc_ids_unique = get_doc_ids_from_ranked_list_path(q_res_path)
    insts = generate(list(pos_doc_ids), list(doc_ids_unique), max_seq_length)
    train_size = int(0.9 * len(insts))
    train_insts = insts[:train_size]
    val_insts = insts[train_size:]
    val_pos_insts = list([i for i in val_insts if i.label == 1])
    val_neg_insts = list([i for i in val_insts if not i.label])
    print("num pos inst in val", len(val_pos_insts))
    if balance_test:
        val_neg_insts = val_neg_insts[:len(val_pos_insts)]
    val_insts = val_pos_insts + val_neg_insts
    tokenizer = get_tokenizer()

    def encode_fn(inst: Instance) -> OrderedDict:
        return encode_w_data_id(tokenizer, max_seq_length, inst)

    write_records_w_encode_fn(save_path + "train", encode_fn, train_insts)
    write_records_w_encode_fn(save_path + "val", encode_fn, val_insts)

def main():
    dir_path = sys.argv[1]
    tokenizer = get_tokenizer()
    averager = Averager()
    for file_path in get_dir_files(dir_path):
        for idx, record in enumerate(
                tf.compat.v1.python_io.tf_record_iterator(file_path)):
            if idx % 3:
                continue
            example = tf.train.Example()
            example.ParseFromString(record)
            feature = example.features.feature
            input_mask = feature["input_mask"].int64_list.value
            if input_mask[-1]:
                input_ids = feature["input_ids"].int64_list.value
                tokens = tokenizer.convert_ids_to_tokens(input_ids)
                sep_idx1 = tokens.index("[SEP]")
                sep_idx2 = tokens.index("[SEP]", sep_idx1 + 1)
                doc_tokens = tokens[sep_idx1:sep_idx2]
                continue_cnt = 0
                for t in doc_tokens:
                    if t[:2] == "##":
                        continue_cnt += 1
                n_words = len(doc_tokens) - continue_cnt
                averager.append(n_words)
    print("average", averager.get_average())

def modify_data_loader(data_loader):
    tokenizer = get_tokenizer()
    CLS_ID = tokenizer.convert_tokens_to_ids(["[CLS]"])[0]
    SEP_ID = tokenizer.convert_tokens_to_ids(["[SEP]"])[0]
    data_loader.CLS_ID = CLS_ID
    data_loader.SEP_ID = SEP_ID
    return data_loader

def file_show(fn):
    cnt = 0
    tokenizer = get_tokenizer()
    for record in tf.compat.v1.python_io.tf_record_iterator(fn):
        example = tf.train.Example()
        example.ParseFromString(record)
        feature = example.features.feature
        keys = feature.keys()
        print("---- record -----")
        for key in keys:
            if key == "masked_lm_weights":
                v = feature[key].float_list.value
            else:
                v = feature[key].int64_list.value
            print(key)
            print(v)
            if key in ["input_ids", "input_ids1", "input_ids2"]:
                tokens = tokenizer.convert_ids_to_tokens(v)
                print(key)
                print(" ".join(tokens))
        cnt += 1
        if cnt >= 5:
            break

def __init__(self, topic, ranked_list_path, token_file_path):
    ranked_list_d = load_galago_ranked_list(ranked_list_path)
    self.ranked_list = ranked_list_d["unk-0"]
    self.tokenizer = get_tokenizer()
    self.topic = topic
    self.tokens = pickle.load(open(token_file_path, "rb"))
    self.doc_idx = 0

def write_records(records: List[Payload], max_seq_length, output_path):
    tokenizer = get_tokenizer()

    def encode(inst: Payload) -> OrderedDict:
        inst_2 = convert_sub_token(tokenizer, inst)
        return encode_inner(max_seq_length, tokenizer, inst_2)

    write_records_w_encode_fn(output_path, encode, records)

def get_recover_subtokens():
    tokenizer = get_tokenizer()

    def recover_subtokens(input_ids) -> List[str]:
        tokens1, tokens2 = split_p_h_with_input_ids(input_ids, input_ids)
        return tokenizer.convert_ids_to_tokens(tokens2)

    return recover_subtokens

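# Hedged usage sketch (illustrative only, not part of the original code):
# show_second_segment is a hypothetical helper that applies the closure returned
# above to the "input_ids" feature of each record, assuming the same TFRecord
# layout that the record viewers elsewhere in this file read.
def show_second_segment(fn):
    recover_subtokens = get_recover_subtokens()
    for record in tf.compat.v1.python_io.tf_record_iterator(fn):
        example = tf.train.Example()
        example.ParseFromString(record)
        input_ids = example.features.feature["input_ids"].int64_list.value
        # print the subtokens recovered from the second segment
        print(" ".join(recover_subtokens(input_ids)))
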
def __init__(self):
    self.continuation = set()
    tokenizer = get_tokenizer()
    self.inv_vocab = tokenizer.inv_vocab
    assert tokenizer is not None
    for token_id, subword in tokenizer.inv_vocab.items():
        if subword[:2] == "##":
            self.continuation.add(token_id)

def __init__(self, bm25_module, max_seq_length, include_title=False):
    self.max_seq_length = max_seq_length
    self.bm25_module = bm25_module
    pc_tokenize = PCTokenizer()
    self.tokenize_stem = pc_tokenize.tokenize_stem
    self.include_title = include_title
    bert_tokenizer = get_tokenizer()
    self.bert_tokenize = bert_tokenizer.tokenize

def get_continuation_token_ids() -> Set[int]:
    tokenizer = get_tokenizer()
    s = set()
    for token, token_id in tokenizer.vocab.items():
        if token[:2] == "##":
            s.add(token_id)
    return s

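# Hedged usage sketch (illustrative only, not part of the original code): the
# id set returned by get_continuation_token_ids() gives a cheap whole-word count
# for a BERT-tokenized id sequence, mirroring the "##" counting loops above.
def count_whole_words(input_ids):
    continuation_ids = get_continuation_token_ids()
    # every id that is not a "##" continuation starts a new word
    return sum(1 for token_id in input_ids if token_id not in continuation_ids)
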
def __init__(self,
             candidates_dict: Dict[str, List[QCKCandidate]],
             is_correct_fn: Callable[[QCKQuery, QCKCandidate], bool],
             ):
    self.max_seq_length = 512
    self.tokenizer = get_tokenizer()
    self.candidates_dict: Dict[str, List[QCKCandidate]] = candidates_dict
    self._is_correct = is_correct_fn

def __init__(self, encoder, max_seq_length, top_k=100, query_type="title"):
    self.data = self.load_tokens_from_pickles()
    self.max_seq_length = max_seq_length
    self.queries = load_robust_04_query(query_type)
    self.galago_rank = load_bm25_best()
    self.top_k = top_k
    self.encoder = encoder
    self.tokenizer = get_tokenizer()

def __init__(self, resource: ProcessedResourceI, max_seq_length, max_seg_per_doc):
    self.resource = resource
    self.tokenizer = get_tokenizer()
    self.title_token_max = 64
    self.query_token_max = 64
    self.max_seq_length = max_seq_length
    self.max_seg_per_doc = max_seg_per_doc

def __init__(self, file_path, fetch_data_list=None):
    self.vectors, self.keys, self.data_len = self.estimator_prediction_loader(
        file_path, fetch_data_list)
    self.tokenizer = get_tokenizer()
    self.method_list = list([
        func for func in dir(EstimatorPredictionViewer)
        if callable(getattr(EstimatorPredictionViewer, func))
    ])

def __init__(
        self,
        cid_to_passages: Dict[int, List[Tuple[List[str], float]]],
        candidate_perspective: Dict[int, List[int]],
):
    self.gold = get_claim_perspective_id_dict()
    self.candidate_perspective = candidate_perspective
    self.cid_to_passages = cid_to_passages
    self.tokenizer = get_tokenizer()

def translate_word_tf_to_subword_tf(word_tf):
    tokenizer = get_tokenizer()
    out = Counter()
    for word in word_tf:
        sub_words = tokenizer.tokenize(word)
        for sw in sub_words:
            out[sw] += word_tf[word]
    return out

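# Hedged usage sketch (illustrative only, not part of the original code): a
# word-level term-frequency Counter is spread over BERT subwords, so a word that
# splits into several pieces contributes its full count to each piece.
#
#   word_tf = Counter({"retrieval": 3, "bm25": 1})
#   subword_tf = translate_word_tf_to_subword_tf(word_tf)
#   # e.g. if "retrieval" tokenizes to a single piece, subword_tf["retrieval"] == 3;
#   # if "bm25" splits into ["bm", "##25"], both pieces get count 1
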
def __init__(self, split):
    query_group: List[List[QueryID]] = load_query_group(split)
    qrel: SimpleQrel = load_msmarco_simple_qrels(split)
    self.split = split
    self.queires = dict(load_queries(split))
    self.query_group = query_group
    self.tokenizer = get_tokenizer()
    self.qrel = qrel

def get_statistic_for_join(join_result: Iterable[Tuple[MSMarcoDoc, JoinedPassage]]):
    # Print histograms of token counts for whole documents, for the text
    # preceding each joined passage, and for the passages themselves.
    print("get_statistic_for_join()")
    tokenizer = get_tokenizer()

    def size_in_tokens(text):
        return len(tokenizer.tokenize(text))

    intervals = list(range(0, 500, 50)) + list(range(500, 5000, 500))
    last = "5000 <"
    keys = intervals + [last]

    def bin_fn(n):
        for ceil in intervals:
            if n < ceil:
                return ceil
        return last

    bin_doc = BinHistogram(bin_fn)
    bin_loc = BinHistogram(bin_fn)
    bin_passage = BinHistogram(bin_fn)
    match_fail = 0
    for doc, passage in join_result:
        if passage.loc >= 0:
            prev = doc.body[:passage.loc]
            passage_text = passage.text
            bin_doc.add(size_in_tokens(doc.body))
            bin_loc.add(size_in_tokens(prev))
            bin_passage.add(size_in_tokens(passage_text))
        else:
            # passage not found in doc
            match_fail += 1
    print('match fail', match_fail)
    print("doc length")
    bins = [bin_doc, bin_passage, bin_loc]
    head = ['', 'bin_doc', 'bin_passage', 'bin_loc']
    rows = [head]
    for key in keys:
        row = [key]
        for bin in bins:
            row.append(bin.counter[key])
        rows.append(row)
    print_table(rows)

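# Hedged note (illustrative only, not part of the original code): bin_fn maps a
# token count to the smallest interval boundary above it, with an overflow bin
# for counts of 5000 or more, e.g.
#   bin_fn(30)   -> 50
#   bin_fn(700)  -> 1000
#   bin_fn(6000) -> "5000 <"
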
def select_word_from_dev():
    tokenizer = get_tokenizer()
    tf_dev = load_from_pickle("nli_tf_dev_mis")
    selected_words = select_common(tf_dev, tokenizer)
    print(list(tf_dev.most_common(100))[-1])
    save_to_pickle(selected_words, "nli_dev_selected_words")

def __init__(self,
             candidates_dict: Dict[str, List[QCKCandidateI]],
             is_correct_fn,
             kdp_as_sub_token=False):
    self.max_seq_length = 512
    self.tokenizer = get_tokenizer()
    self.candidates_dict: Dict[str, List[QCKCandidateI]] = candidates_dict
    self._is_correct = is_correct_fn
    self.kdp_as_sub_token = kdp_as_sub_token

def baseline_bert_gen_unbal_resplit(outpath, split):
    tokenizer = get_tokenizer()
    data: List[PerspectiveCandidate] = load_data_point_50_train_val(split)
    max_seq_length = 512
    writer = RecordWriterWrap(outpath)
    for entry in data:
        writer.write_feature(enc_to_feature(tokenizer, max_seq_length, entry))
    writer.close()

def __init__(self, split, q_config_id, out_dir):
    self.out_dir = out_dir
    self.ci = StaticRankedListInterface(q_config_id)
    print("load__data_point")
    self.all_data_points = load_data_point(split)
    print("Load term stat")
    _, clue12_13_df = load_clueweb12_B13_termstat()
    self.clue12_13_df = clue12_13_df
    self.tokenizer = get_tokenizer()

def __init__(self, encoder, max_seq_length, query_type="title"):
    self.data = self.load_tokens()
    qrel_path = "/home/youngwookim/Downloads/rob04-desc/qrels.rob04.txt"
    self.judgement = load_qrels_structured(qrel_path)
    self.max_seq_length = max_seq_length
    self.queries = load_robust_04_query(query_type)
    self.encoder = encoder
    self.tokenizer = get_tokenizer()
    self.galago_rank = load_bm25_best()

def __init__(self, encoder, max_seq_length):
    self.data = load_robust_tokens_for_train()
    assert len(self.data) == 174787
    qrel_path = "/home/youngwookim/Downloads/rob04-desc/qrels.rob04.txt"
    self.judgement = load_qrels_structured(qrel_path)
    self.max_seq_length = max_seq_length
    self.queries = load_robust04_title_query()
    self.encoder = encoder
    self.tokenizer = get_tokenizer()

def __init__(self, prcessed_resource: ProcessedResourceI,
             get_tokens_d_bert,
             get_tokens_d_bm25,
             parallel_encoder,
             max_seq_length):
    self.prcessed_resource = prcessed_resource
    self.get_tokens_d_bert = get_tokens_d_bert
    self.get_tokens_d_bm25 = get_tokens_d_bm25
    self.encoder = parallel_encoder
    self.tokenizer = get_tokenizer()
    self.max_seq_length = max_seq_length