def format_paragraph_features(tokenizer: FullTokenizer,
                              max_seq_length: int,
                              para_feature: ParagraphFeature) -> List[OrderedDict]:
    text1 = para_feature.datapoint.text1
    tokens1 = tokenizer.tokenize(text1)
    text2 = para_feature.datapoint.text2
    tokens2 = tokenizer.tokenize(text2)
    label: int = int(para_feature.datapoint.label)

    def encode(score_paragraph: ScoreParagraph) -> OrderedDict:
        para_tokens: List[Subword] = score_paragraph.paragraph.subword_tokens
        # Concatenate the three spans as: text1 [SEP] text2 [SEP] paragraph [SEP]
        tokens = tokens1 + ["[SEP]"] + tokens2 + ["[SEP]"] + para_tokens + ["[SEP]"]
        # One segment id per span (0/1/2), each including its trailing [SEP]
        segment_ids = [0] * (len(tokens1) + 1) \
                      + [1] * (len(tokens2) + 1) \
                      + [2] * (len(para_tokens) + 1)
        # Truncate to max_seq_length before building the input features
        tokens = tokens[:max_seq_length]
        segment_ids = segment_ids[:max_seq_length]
        features = get_basic_input_feature(tokenizer, max_seq_length, tokens, segment_ids)
        features['label_ids'] = create_int_feature([label])
        return features

    features: List[OrderedDict] = lmap(encode, para_feature.feature)
    return features
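# Hedged usage sketch (not part of the original module): assuming the OrderedDicts
# returned above map feature names to tf.train.Feature values (as create_int_feature
# suggests), they could be written to a TFRecord file with standard TensorFlow APIs.
# `para_features`, `output_path`, and `write_paragraph_features` are placeholder names
# introduced only for this illustration.
import tensorflow as tf

def write_paragraph_features(tokenizer, max_seq_length, para_features, output_path):
    with tf.io.TFRecordWriter(output_path) as writer:
        for pf in para_features:
            for od in format_paragraph_features(tokenizer, max_seq_length, pf):
                example = tf.train.Example(features=tf.train.Features(feature=dict(od)))
                writer.write(example.SerializeToString())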
def get_cpids_and_token_keys(tokenizer: FullTokenizer,
                             claim_entry: ParagraphClaimPersFeature) -> Tuple[str, CPID]:
    claim_text = claim_entry.claim_pers.claim_text
    claim_tokens = tokenizer.tokenize(claim_text)
    p_text = claim_entry.claim_pers.p_text
    p_tokens = tokenizer.tokenize(p_text)
    # Token-level key identifying the (claim, perspective) text pair
    key = " ".join(claim_tokens) + "_" + " ".join(p_tokens)
    # CPID combines the claim id and the perspective id
    cpid: CPID = CPID("{}_{}".format(claim_entry.claim_pers.cid, claim_entry.claim_pers.pid))
    return key, cpid
def get_cpids_and_token_keys(tokenizer: FullTokenizer,
                             para_feature: ParagraphFeature) -> Tuple[str, DPID]:
    text1 = para_feature.datapoint.text1
    tokens1 = tokenizer.tokenize(text1)
    text2 = para_feature.datapoint.text2
    tokens2 = tokenizer.tokenize(text2)
    # Key drops the first token of text1; the paired identifier is the datapoint's DPID
    key = " ".join(tokens1[1:]) + "_" + " ".join(tokens2)
    dpid: DPID = para_feature.datapoint.id
    return key, dpid
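# Hedged usage sketch (illustrative only): the (key, id) pairs produced above appear
# intended for joining tokenized text back to its identifier. Using the ParagraphFeature
# variant defined directly above, and assuming `para_features` is an iterable of
# ParagraphFeature, a lookup table might be built like this; `build_key_to_dpid` and
# `key_to_dpid` are placeholder names.
def build_key_to_dpid(tokenizer, para_features):
    key_to_dpid = {}
    for pf in para_features:
        key, dpid = get_cpids_and_token_keys(tokenizer, pf)
        key_to_dpid[key] = dpid
    return key_to_dpid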
def to_retrieval_format(tokenizer: FullTokenizer,
                        max_seq_length: int,
                        data_id_gen: DataIDGen,
                        f: ParagraphClaimPersFeature,
                        ) -> Tuple[Dict, List[OrderedDict]]:
    # Despite the name, info_list is a dict mapping data_id -> info
    info_list = {}

    def get_feature(tokens1, tokens2, info):
        # Assign a fresh data_id and remember which claim/perspective it refers to
        data_id = data_id_gen.new_id()
        info_list[data_id] = info
        tokens = tokens1 + tokens2
        segment_ids = [0] * len(tokens1) + [1] * len(tokens2)
        tokens = tokens[:max_seq_length]
        segment_ids = segment_ids[:max_seq_length]
        features = get_basic_input_feature(tokenizer, max_seq_length, tokens, segment_ids)
        features['label_ids'] = create_int_feature([0])
        features['data_id'] = create_int_feature([data_id])
        return features

    ordered_dict_list = []
    for scored_paragraph in f.feature:
        tokens2 = scored_paragraph.paragraph.subword_tokens
        claim_tokens = tokenizer.tokenize(f.claim_pers.claim_text)
        p_tokens = tokenizer.tokenize(f.claim_pers.p_text)
        # One instance pairing the paragraph with the claim ...
        data_info_c = {
            'cid': f.claim_pers.cid,
        }
        out_f = get_feature(claim_tokens, tokens2, data_info_c)
        ordered_dict_list.append(out_f)
        # ... and one pairing it with the perspective
        data_info_p = {
            'pid': f.claim_pers.pid
        }
        out_f = get_feature(p_tokens, tokens2, data_info_p)
        ordered_dict_list.append(out_f)
    return info_list, ordered_dict_list
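# Hedged usage sketch: to_retrieval_format returns the per-data_id info mapping together
# with the encoded instances, so a caller would typically merge the mappings across many
# ParagraphClaimPersFeature objects and keep them alongside the written records.
# `encode_for_retrieval`, `features_list`, `all_info`, and `all_records` are placeholder
# names; DataIDGen is assumed to hand out unique integer ids.
def encode_for_retrieval(tokenizer, max_seq_length, data_id_gen, features_list):
    all_info = {}
    all_records = []
    for f in features_list:
        info, records = to_retrieval_format(tokenizer, max_seq_length, data_id_gen, f)
        all_info.update(info)
        all_records.extend(records)
    return all_info, all_records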
def get_biobert_tokenizer():
    return FullTokenizer(get_biobert_voca_path())
def __init__(self, out_path):
    # out_path is ignored; tokenized output goes to a fixed directory
    self.out_dir = "/mnt/nfs/work3/youngwookim/data/clueweb12-B13_tokens"
    voca_path = os.path.join(data_path, "bert_voca.txt")
    self.tokenizer = FullTokenizer(voca_path, True)
    self.file_list = load_undone_file_list()
import os
import sys

import datastore.tool
from cpath import data_path
from data_generator.tokenizer_wo_tf import FullTokenizer
from galagos.doc_processor import process_jsonl


def all_pipeline(jsonl_path, tokenize_fn):
    # Read the jsonl file line by line, tokenize each document, and persist the results.
    # A context manager ensures the file handle is closed when processing finishes.
    with open(jsonl_path, "r") as line_itr:
        buffered_saver = datastore.tool.BufferedSaver()
        process_jsonl(line_itr, tokenize_fn, buffered_saver)
        buffered_saver.flush()


if __name__ == "__main__":
    jsonl_path = sys.argv[1]
    # An optional second argument overrides the default BERT vocabulary path
    if len(sys.argv) == 3:
        voca_path = sys.argv[2]
    else:
        voca_path = os.path.join(data_path, "bert_voca.txt")
    tokenize_fn = FullTokenizer(voca_path, True).tokenize
    all_pipeline(jsonl_path, tokenize_fn)
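# Hedged usage sketch (illustrative only): the same pipeline can be driven
# programmatically instead of via sys.argv, e.g. from a job runner. The jsonl path
# below is a placeholder, not a path from the original source.
def run_pipeline_example():
    voca_path = os.path.join(data_path, "bert_voca.txt")
    tokenize_fn = FullTokenizer(voca_path, True).tokenize
    all_pipeline("/path/to/docs.jsonl", tokenize_fn)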
def __init__(self, out_path_not_used):
    voca_path = os.path.join(data_path, "bert_voca.txt")
    self.tokenize_fn = FullTokenizer(voca_path, True).tokenize
    # Per-job jsonl path; the job id is substituted into the {} placeholder
    self.jsonl_path_format = "/mnt/nfs/work3/youngwookim/data/perspective/train_claim_perspective/doc_jsonl/{}.jsonl"
def __init__(self, jsonl_path, out_dir):
    voca_path = os.path.join(data_path, "bert_voca.txt")
    self.tokenize_fn = FullTokenizer(voca_path, True).tokenize
    self.jsonl_path = jsonl_path
    self.out_dir = out_dir
    exist_or_mkdir(out_dir)
def __init__(self, jsonl_path, out_path_not_used):
    voca_path = os.path.join(data_path, "bert_voca.txt")
    self.tokenize_fn = FullTokenizer(voca_path, True).tokenize
    self.jsonl_path = jsonl_path
def get_tokenizer():
    voca_path = os.path.join(data_path, "bert_voca.txt")
    return FullTokenizer(voca_path)
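# Hedged usage sketch (illustrative only): get_tokenizer() loads the shared BERT
# vocabulary and returns a WordPiece tokenizer whose .tokenize() maps raw text to
# subword token strings, which is what the feature builders above expect.
# `_tokenize_demo` is a placeholder name and the sentence is arbitrary.
def _tokenize_demo():
    tokenizer = get_tokenizer()
    # Returns a list of subword token strings; the exact split depends on the vocabulary.
    return tokenizer.tokenize("Paragraphs are tokenized into subwords.")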