def initializer(self):
    global bpe
    bpe = get_encoder(
        os.path.join(self.roberta_dir, 'gpt2_bpe', 'encoder.json'),
        os.path.join(self.roberta_dir, 'gpt2_bpe', 'vocab.bpe'),
    )
    global vocab
    vocab = Dictionary.load(os.path.join(self.roberta_dir, 'roberta.base', 'dict.txt'))
def initializer(self):
    global bpe
    bpe = get_encoder(
        os.path.join(self.roberta_dir, 'encoder.json'),
        os.path.join(self.roberta_dir, 'vocab.bpe'),
    )
    global vocab
    vocab = Dictionary.load(os.path.join(self.roberta_dir, 'dict.txt'))
    global entities
    if self.entity_vocab is not None:
        entities = load_entities(self.entity_vocab)
def create_ent_augmented_target(source_file, target_file, out_text_file, out_bpe_file,
                                tokenizer_dir, special_token=50009, max_len=1024):
    n_s = count_lines_in_text_file(source_file)
    n_t = count_lines_in_text_file(target_file)
    assert n_s == n_t, \
        "Number of lines not consistent: {}, {}".format(n_s, n_t)
    nlp = spacy.load("en_core_web_lg")
    encoder_args = SimpleNamespace(
        encoder_json=os.path.join(tokenizer_dir, "encoder.json"),
        vocab_bpe=os.path.join(tokenizer_dir, "vocab.bpe"),
        keep_empty=True)
    bpe = get_encoder(encoder_args.encoder_json, encoder_args.vocab_bpe)
    with open(source_file, 'r') as s_f, \
            open(target_file, 'r') as t_f, \
            open(out_bpe_file, 'w') as out_bpe_f, \
            open(out_text_file, 'w') as out_text_f:
        for _ in tqdm(range(n_s)):
            sline = s_f.readline().strip()
            tline = t_f.readline().strip()
            doc = nlp(tline)
            entities_per_example = []
            for e in doc.ents:
                if e[0].ent_type_ in TRACKING_ENTITY_LIST:
                    # if e.text in source:
                    match_result = entity_match(e.text, sline, 2)
                    if match_result:
                        entities_per_example.append(match_result[0])
            target_bpe = bpe.encode(tline)
            if entities_per_example:
                entity_bpe = bpe.encode(", ".join(entities_per_example))
                augmented_target_bpe = entity_bpe + [special_token] + target_bpe
            else:
                augmented_target_bpe = [special_token] + target_bpe
            out_text_f.write("{}".format(entities_per_example) + '\n')
            out_bpe_f.write(
                ' '.join(map(str, augmented_target_bpe[:max_len - 1])) + '\n')
def create_ent_labels(source_file, target_file, out_file, tokenizer_dir, first_only=False):
    n_s = count_lines_in_text_file(source_file)
    n_t = count_lines_in_text_file(target_file)
    assert n_s == n_t, \
        "Number of lines not consistent: {}, {}".format(n_s, n_t)
    nlp = spacy.load("en_core_web_lg")
    entities_found = []
    encoder_args = SimpleNamespace(
        encoder_json=os.path.join(tokenizer_dir, "encoder.json"),
        vocab_bpe=os.path.join(tokenizer_dir, "vocab.bpe"),
        keep_empty=True)
    bpe = get_encoder(encoder_args.encoder_json, encoder_args.vocab_bpe)
    with open(source_file, 'r') as s_f, \
            open(target_file, 'r') as t_f, \
            open(out_file, 'w') as out_f:
        for _ in tqdm(range(n_s)):
            sline = s_f.readline().strip()
            tline = t_f.readline().strip()
            tokens = bpe.encode(sline)
            labels = [0] * len(tokens)
            doc = nlp(tline)
            entities_per_example = []
            for e in doc.ents:
                if e[0].ent_type_ in TRACKING_ENTITY_LIST:
                    entity_new = {'text': e.text, 'type': e[0].ent_type_}
                    # if e.text in source:
                    match_result = entity_match(e.text, sline, 2)
                    entity_new['match_result'] = match_result
                    labels = update_bio_labels(labels, sline, match_result, tokens, bpe,
                                               first_only=first_only)
                    entities_per_example.append(entity_new)
            out_f.write(" ".join([str(i) for i in labels]) + '\n')
            entities_found.append(entities_per_example)
    return entities_found
def sanity_check(entities, source_bpe_file, label_file, eval_file, tokenizer_dir):
    n_s = count_lines_in_text_file(source_bpe_file)
    n_l = count_lines_in_text_file(label_file)
    assert n_s == n_l == len(entities), \
        "Number of lines not consistent: {}, {}, {}".format(n_s, n_l, len(entities))
    encoder_args = SimpleNamespace(
        encoder_json=os.path.join(tokenizer_dir, "encoder.json"),
        vocab_bpe=os.path.join(tokenizer_dir, "vocab.bpe"),
        keep_empty=True)
    bpe = get_encoder(encoder_args.encoder_json, encoder_args.vocab_bpe)
    with open(source_bpe_file, 'r') as s_f, \
            open(label_file, 'r') as l_f, \
            open(eval_file, 'w') as o_f:
        for i in tqdm(range(n_l)):
            sline = s_f.readline().strip()
            tokens = [int(t) for t in sline.split()]
            lline = l_f.readline().strip()
            labels = [int(t) for t in lline.split()]
            assert len(tokens) == len(labels), \
                "Number of source tokens must equal that of labels!"
            entities_per_example = entities[i]
            ent_text = ""
            for e in entities_per_example:
                ent_text += e['text']
                ent_text += str(e['match_result'])
                ent_text += ", "
            spans = extract_ent_from_labels(tokens, labels)
            ent_text += "FROM LABELS==>"
            for span in spans:
                ent_text += bpe.decode(span).strip()
                ent_text += ', '
            ent_text += '\n'
            o_f.write(ent_text)
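# Hedged usage sketch (not part of the original snippets): one plausible way to chain
# create_ent_labels() and sanity_check() from above. All file paths below are placeholders,
# and train.bpe.source is assumed to already contain the BPE-encoded source lines.
if __name__ == "__main__":
    tokenizer_dir = "gpt2_bpe"  # assumed to hold encoder.json and vocab.bpe
    entities = create_ent_labels(
        source_file="train.source",
        target_file="train.target",
        out_file="train.ent_labels",
        tokenizer_dir=tokenizer_dir,
    )
    sanity_check(
        entities,
        source_bpe_file="train.bpe.source",
        label_file="train.ent_labels",
        eval_file="train.ent_check.txt",
        tokenizer_dir=tokenizer_dir,
    )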
def initializer(self):
    global bpe
    bpe = get_encoder(self.args.encoder_json, self.args.vocab_bpe)
def main(json_folder_dir, debug_mode, dataset_name, percentage,
         encoder_json_file_path, vocab_bpe_path):
    if dataset_name == "All":
        dataset_name_list = [
            'biorxiv_medrxiv', 'comm_use_subset', 'noncomm_use_subset',
            'pmc_custom_license'
        ]
    else:
        dataset_name_list = [dataset_name]

    # Extract raw text from the json files.
    txt_dir = "raw_txt_data"
    json_text(json_folder_dir, dataset_name_list, percentage, txt_dir)

    # Apply BPE to the extracted text. The encoder paths are passed in as arguments
    # (previously hardcoded defaults):
    # encoder_json_file_path = 'gpt2_bpe/encoder.json'
    # vocab_bpe_path = 'gpt2_bpe/vocab.bpe'
    bpe = get_encoder(encoder_json_file_path, vocab_bpe_path)

    def encode(line):
        ids = bpe.encode(line)
        return list(map(str, ids))

    def decode(tokens):
        return bpe.decode(tokens)

    bpe_dir = 'bpe_data'
    os.makedirs(bpe_dir, exist_ok=True)
    input_file_train = open(os.path.join(txt_dir, "train.txt"), "r", encoding="utf-8")
    input_file_val = open(os.path.join(txt_dir, "val.txt"), "r", encoding="utf-8")
    output_file_train = open(os.path.join(bpe_dir, "train.bpe"), "w", encoding="utf-8")
    output_file_val = open(os.path.join(bpe_dir, "val.bpe"), "w", encoding="utf-8")
    input_file_list = [input_file_train, input_file_val]
    output_file_list = [output_file_train, output_file_val]
    for handler_idx, output_handler in enumerate(output_file_list):
        output_text_files = [output_handler]
        enc_lines = []
        for idx, line in enumerate(input_file_list[handler_idx]):
            line = line.strip()
            tokens = encode(line)
            if len(tokens) == 0:
                continue
            enc_lines.append(" ".join(tokens))
        encoded_lines = [["PASS", enc_lines]]
        for filt, lines_to_write in encoded_lines:
            if filt == "PASS":
                # Write every encoded line to each output handler for this split.
                for enc_line in lines_to_write:
                    for output_h in output_text_files:
                        print(enc_line, file=output_h)
    for f in input_file_list + output_file_list:
        f.close()
    print("BPE has been generated.")
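# Hedged usage sketch (not in the original snippet): a plausible invocation of main()
# above. The json folder, dataset name and percentage value are placeholders; the
# gpt2_bpe/ paths mirror the commented-out defaults inside main().
main(
    json_folder_dir="cord19_json",
    debug_mode=False,
    dataset_name="biorxiv_medrxiv",
    percentage=1.0,
    encoder_json_file_path="gpt2_bpe/encoder.json",
    vocab_bpe_path="gpt2_bpe/vocab.bpe",
)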
def load(self, device: str):
    """
    Load user-selected task-specific model

    Args:
        device (str): device information

    Returns:
        object: User-selected task-specific model

    """
    if "sent" in self.config.n_model:
        import nltk

        try:
            nltk.data.find("tokenizers/punkt")
        except LookupError:
            nltk.download("punkt")

        from nltk.tokenize import sent_tokenize

        return PororoSentTokenizer(sent_tokenize, self.config)

    if self.config.n_model == "mecab_ko":
        try:
            import mecab
        except ModuleNotFoundError as error:
            raise error.__class__(
                "Please install python-mecab-ko with: `pip install python-mecab-ko`")
        model = mecab.MeCab()
        return PororoMecabKoTokenizer(model, self.config)

    if self.config.n_model == "char":
        return PororoCharTokenizer(self.config)

    if self.config.n_model == "jamo":
        return PororoJamoTokenizer(self.config)

    if self.config.n_model == "word":
        return PororoWordTokenizer(self.config)

    if self.config.n_model == "roberta":
        from fairseq.data.encoders.gpt2_bpe import get_encoder

        encoder = download_or_load("misc/encoder.json", self.config.lang)
        vocab = download_or_load("misc/vocab.bpe", self.config.lang)
        model = get_encoder(encoder, vocab)

        with open(encoder, "r") as f_vocab:
            vocab = json.load(f_vocab)
            inv_dict = {v: k for k, v in vocab.items()}

        return PororoRoBERTaTokenizer(model, vocab, inv_dict, self.config)

    if self.config.n_model == "moses":
        try:
            from sacremoses import MosesDetokenizer, MosesTokenizer
        except ModuleNotFoundError as error:
            raise error.__class__(
                "Please install sacremoses with: `pip install sacremoses`")
        model = MosesTokenizer(lang="en")
        detok = MosesDetokenizer(lang="en")
        return PororoMosesTokenizer(model, detok, self.config)

    if self.config.n_model == "jieba":
        try:
            import jieba
        except ModuleNotFoundError as error:
            raise error.__class__(
                "Please install jieba with: `pip install jieba`")
        model = jieba.cut
        return PororoJiebaTokenizer(model, self.config)

    if self.config.n_model == "mecab":
        try:
            import fugashi
        except ModuleNotFoundError as error:
            raise error.__class__(
                "Please install fugashi with: `pip install fugashi`")
        try:
            import ipadic
        except ModuleNotFoundError as error:
            raise error.__class__(
                "Please install ipadic with: `pip install ipadic`")

        dic_dir = ipadic.DICDIR
        mecabrc = os.path.join(dic_dir, "mecabrc")
        mecab_option = "-d {} -r {} ".format(
            dic_dir,
            mecabrc,
        )
        model = fugashi.GenericTagger(mecab_option)
        return PororoMecabTokenizer(model, self.config)
    else:
        from pororo.tasks.utils.tokenizer import CustomTokenizer

        path = download_or_load(
            f"tokenizers/{self.config.n_model}.zip",
            self.config.lang,
        )

        ext = "json" if "unigram" not in self.config.n_model else "txt"
        merges_filename = (f"{path}/merges.txt"
                           if "unigram" not in self.config.n_model else None)

        model = CustomTokenizer.from_file(
            vocab_filename=f"{path}/vocab.{ext}",
            merges_filename=merges_filename,
            normalize=True if "jpe" not in self.config.n_model else False,
        )
        if "jpe" in self.config.n_model:
            return PororoJamoPairTokenizer(model, self.config)
        if "mecab.bpe" in self.config.n_model:
            return PororoMecabSPTokenizer(model, self.config)
        return PororoSPTokenizer(model, self.config)
def __init__(self, encoder_json_path: str, vocab_bpe_path: str):
    self.processor = get_encoder(encoder_json_path, vocab_bpe_path)
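# Hedged sketch (not part of the original snippet): fairseq's gpt2_bpe.get_encoder
# returns an encoder object exposing encode() and decode(), as the snippets above use it.
# The file paths below are placeholders for a downloaded GPT-2 encoder.json / vocab.bpe pair.
from fairseq.data.encoders.gpt2_bpe import get_encoder

bpe = get_encoder("encoder.json", "vocab.bpe")
ids = bpe.encode("Hello world")   # list of integer GPT-2 BPE token ids
text = bpe.decode(ids)            # round-trips back to "Hello world" for plain text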
from fairseq.data.encoders.gpt2_bpe import get_encoder

encoder = get_encoder("/path/to/roberta.base/encoder.json",
                      "/path/to/roberta.base/vocab.bpe")

train_data = []
with open("dev.csv", "r", encoding="utf-8") as f:
    lines = f.readlines()
    tmp_sentence = []
    for idx, line in enumerate(lines):
        if idx == 0:
            continue
        if line.split(",")[0].startswith("Sentence: "):
            if len(tmp_sentence) > 0:
                train_data.append(tmp_sentence)
            tmp_sentence = []
        line = line.strip().split(",")
        if len(line) == 4:
            word = line[1]
            label = line[3]
        else:
            word = ","
            label = line[-1]
        # In NER, capitalized words are more likely to be predicted as entities, so
        # encoder.encode(word.lower()) may generalize better on real-world text,
        # although benchmark accuracy may be lower.
        ids = encoder.encode(word)
        for sub_idx, _id in enumerate(ids):
            if label.startswith("B") and sub_idx != 0:
                label = "I" + label[1:]
            tmp_sentence.append((str(_id), label))
    if len(tmp_sentence) > 0:
        train_data.append(tmp_sentence)

with open("dev.text.txt.bpe", "w", encoding="utf-8") as ft, \
        open("dev.label.txt", "w", encoding="utf-8") as fl:
    # One sentence per line: BPE ids to the .bpe file, aligned labels to the label file.
    for sentence in train_data:
        ft.write(" ".join(tok for tok, _ in sentence) + "\n")
        fl.write(" ".join(lab for _, lab in sentence) + "\n")
def preprecess_QA_generation_newsqa_squad(
        input_dir,
        output_dir,
        encoder_json="/home/ec2-user/fairseq/encoder.json",
        vocab_bpe="/home/ec2-user/fairseq/vocab.bpe",
        only_squad=False):
    # Use 50009 as the special dictionary token that separates questions and answers,
    # since this token is not encountered in BPE outputs.

    def _process_data(d, data_source, bpe, source_f, source_bpe_f, target_f,
                      target_bpe_f):
        if data_source == 'newsqa':
            source = d['text'].strip()
            for q in d['questions']:
                if 'consensus' in q and 'q' in q and 's' in q['consensus']:
                    question = q['q'].strip()
                    answer_s = q['consensus']['s']
                    answer_e = q['consensus']['e']
                    answer = source[answer_s:answer_e].strip()
                    truncated_source_bpe, truncated_source, question_answer_bpe = \
                        _format_question_answers_bpe(bpe, source, question, answer,
                                                     special_token_id)
                    if truncated_source is None or answer_e >= len(truncated_source):
                        # Skip the question, as the answer span was truncated in the source.
                        continue
                    source_f.write(
                        truncated_source.encode('unicode-escape').decode().replace(
                            '\\\\', '\\') + '\n')
                    source_bpe_f.write(' '.join(map(str, truncated_source_bpe)) + '\n')
                    target_f.write(bpe.decode(question_answer_bpe) + '\n')
                    target_bpe_f.write(' '.join(map(str, question_answer_bpe)) + '\n')
        elif data_source == 'squad':
            for paragraph in d['paragraphs']:
                context = paragraph['context']
                for qa in paragraph['qas']:
                    question = qa['question'].strip()
                    ans_set = set()
                    for ans in qa['answers']:
                        if ans['text'] not in ans_set:
                            ans_set.add(ans['text'])
                            truncated_source_bpe, truncated_source, question_answer_bpe = \
                                _format_question_answers_bpe(bpe, context, question,
                                                             ans['text'],
                                                             special_token_id)
                            if truncated_source is None:
                                # Skip the question.
                                continue
                            source_f.write(
                                truncated_source.encode('unicode-escape').decode().replace(
                                    '\\\\', '\\') + '\n')
                            source_bpe_f.write(
                                ' '.join(map(str, truncated_source_bpe)) + '\n')
                            target_f.write(bpe.decode(question_answer_bpe) + '\n')
                            target_bpe_f.write(
                                ' '.join(map(str, question_answer_bpe)) + '\n')
        else:
            raise Exception("data_source must be squad or newsqa!")

    special_token_id = 50009
    from fairseq.data.encoders.gpt2_bpe import get_encoder
    bpe = get_encoder(encoder_json, vocab_bpe)

    if not only_squad:
        input_json = os.path.join(input_dir, 'combined-newsqa-data-v1.json')
        with open(input_json, 'r') as f:
            newsqa = json.load(f)

    with open(os.path.join(output_dir, 'train.source'), 'w') as train_source_f, \
            open(os.path.join(output_dir, 'train.target'), 'w') as train_target_f, \
            open(os.path.join(output_dir, 'train.bpe.source'), 'w') as train_source_bpe_f, \
            open(os.path.join(output_dir, 'train.bpe.target'), 'w') as train_target_bpe_f, \
            open(os.path.join(output_dir, 'val.source'), 'w') as val_source_f, \
            open(os.path.join(output_dir, 'val.target'), 'w') as val_target_f, \
            open(os.path.join(output_dir, 'val.bpe.source'), 'w') as val_source_bpe_f, \
            open(os.path.join(output_dir, 'val.bpe.target'), 'w') as val_target_bpe_f, \
            open(os.path.join(output_dir, 'test.source'), 'w') as test_source_f, \
            open(os.path.join(output_dir, 'test.target'), 'w') as test_target_f, \
            open(os.path.join(output_dir, 'test.bpe.source'), 'w') as test_source_bpe_f, \
            open(os.path.join(output_dir, 'test.bpe.target'), 'w') as test_target_bpe_f:

        if not only_squad:
            for data in tqdm(newsqa['data']):
                if data['type'] == 'train':
                    _process_data(data, 'newsqa', bpe, train_source_f,
                                  train_source_bpe_f, train_target_f,
                                  train_target_bpe_f)
                elif data['type'] == 'dev':
                    _process_data(data, 'newsqa', bpe, val_source_f, val_source_bpe_f,
                                  val_target_f, val_target_bpe_f)
                elif data['type'] == 'test':
                    _process_data(data, 'newsqa', bpe, test_source_f,
                                  test_source_bpe_f, test_target_f, test_target_bpe_f)
                else:
                    print("data type error!")
                    print(data)
                    break
            print("Done with NewsQA!")

        print("Doing Squad now!")
        data_types = ["validation", "train"]
        for dtype in data_types:
            if dtype == "validation":
                input_file = "dev-v1.1.json"
            elif dtype == "train":
                input_file = "train-v1.1.json"
            else:
                print("ERROR! data split should be validation or train!")
            with open(os.path.join(input_dir, input_file), 'r') as f_in:
                data_dict = json.load(f_in)
            if dtype == "train":
                for data in tqdm(data_dict['data']):
                    _process_data(data, 'squad', bpe, train_source_f,
                                  train_source_bpe_f, train_target_f,
                                  train_target_bpe_f)
            elif dtype == "validation":
                for data in data_dict['data']:
                    _process_data(data, 'squad', bpe, val_source_f, val_source_bpe_f,
                                  val_target_f, val_target_bpe_f)
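# Hedged usage sketch (not in the original snippet): a plausible invocation of the
# preprocessing routine above. The directories are placeholders; the encoder.json /
# vocab.bpe paths override the hardcoded defaults in the signature. input_dir is
# expected to contain combined-newsqa-data-v1.json, train-v1.1.json and dev-v1.1.json.
preprecess_QA_generation_newsqa_squad(
    input_dir="data/raw",
    output_dir="data/processed",
    encoder_json="gpt2_bpe/encoder.json",
    vocab_bpe="gpt2_bpe/vocab.bpe",
    only_squad=False,
)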
def __init__(self, dictionary, entity_dictionary, task=None):
    self.bpe = get_encoder(self.encoder_json, self.vocab_bpe)
    self.dictionary = dictionary
    self.entity_dictionary = entity_dictionary
    self.task = task