def toMiddleFormat(path):
    """Build a masked-language-model MiddleFormat dataset from a text file.

    Each line of ``path`` is cleaned, tokenized into phrases (via Phraseg) plus
    numbers / latin words / single chars / punctuation, then ~15% of the tokens
    are replaced with ``[MASK]`` to form the input side; the untouched token
    sequence is the target side.

    :param path: path to a plain-text corpus file, one sentence per line.
    :return: a populated ``MiddleFormat`` dataset.
    """
    from phraseg import Phraseg
    punctuations = r"[.﹑︰〈〉─《﹖﹣﹂﹁﹔!?。。"#$%&'()*+,﹐-/:;<=>@[\]^_`{|}~⦅⦆「」、、〃》「」『』【】〔〕〖〗〘〙〚〛〜〝〞〟〰〾〿–—‘’‛“”„‟…‧﹏..!\"#$%&()*+,\-.\:;<=>?@\[\]\\\/^_`{|}~]+"
    MASKTOKEN = "[MASK]"
    dataset = MiddleFormat(DATASETINFO, [MASKTOKEN])
    phraseg = Phraseg(path)

    for line in tqdm(nlp2.read_files_yield_lines(path)):
        line = nlp2.clean_all(line).strip()
        if len(nlp2.split_sentence_to_array(line)) > 1:
            phrases = list((phraseg.extract(sent=line, merge_overlap=False)).keys())
            reg = "[0-9]+|[a-zA-Z]+\'*[a-z]*|[\w]" + "|" + punctuations
            # BUG FIX 1: phrases are raw corpus text — escape them so regex
            # metacharacters ("+", "(", "?", ...) cannot corrupt the pattern.
            # BUG FIX 2: only prepend the phrase alternation when it is
            # non-empty; "|".join([]) would yield a leading "|" (an empty
            # alternative) that matches the empty string at every position.
            if phrases:
                reg = "|".join(re.escape(phrase) for phrase in phrases) + "|" + reg
            input_sent = re.findall(reg, line, re.UNICODE)
            # Copy instead of re-running the identical findall: the target is
            # the unmasked token sequence.
            target_sent = list(input_sent)
            for ind, word in enumerate(input_sent):
                # Mask each token independently with probability 0.15
                # (findall tokens are always non-empty, so no length guard).
                if random.random() <= 0.15:
                    input_sent[ind] = MASKTOKEN
            if len(input_sent) > 2 and len(target_sent) > 2 and len(
                    "".join(input_sent).strip()) > 2 and len(
                    "".join(target_sent).strip()) > 2:
                dataset.add_data(nlp2.join_words_to_sentence(input_sent),
                                 nlp2.join_words_to_sentence(target_sent))
    return dataset
def get_freqK_unk_token(tokenizer, file_paths: list, freqK: int):
    """Collect tokens that the tokenizer cannot represent, filtered by frequency.

    Scans every line of every file, splits it into tokens, and counts each
    token whose tokenization contains the UNK symbol. Tokens seen at least
    ``freqK`` times are returned, in first-seen order.

    :param tokenizer: a tokenizer exposing ``tokenize`` and ``_unk_token``.
    :param file_paths: list of text-file paths to scan.
    :param freqK: minimum occurrence count for a token to be kept.
    :return: list of unknown tokens occurring >= ``freqK`` times.
    """
    counts = OrderedDict()
    unk = tokenizer._unk_token
    for file_path in file_paths:
        for sentence in tqdm(nlp2.read_files_yield_lines(file_path)):
            for token in nlp2.split_sentence_to_array(sentence):
                if unk not in tokenizer.tokenize(token):
                    continue
                try:
                    counts[token] += 1
                except KeyError:
                    counts[token] = 1
    return [token for token, freq in counts.items() if freq >= freqK]
def get_topP_unk_token(tokenizer, file_paths: list, topP: float):
    """Return the top ``topP`` percent most frequent tokens the tokenizer maps to UNK.

    Scans every line of every file, counts each token whose tokenization
    contains the UNK symbol, then returns the most frequent ``topP``% of
    those tokens.

    :param tokenizer: a tokenizer exposing ``tokenize`` and ``_unk_token``.
    :param file_paths: list of text-file paths to scan.
    :param topP: percentage (0-100) of distinct unknown tokens to return.
    :return: list of the most frequent unknown tokens.
    """
    unk_count_dict = OrderedDict()
    for path in file_paths:
        for input_sent in tqdm(nlp2.read_files_yield_lines(path)):
            for tok in nlp2.split_sentence_to_array(input_sent):
                if tokenizer._unk_token in tokenizer.tokenize(tok):
                    unk_count_dict[tok] = unk_count_dict.get(tok, 0) + 1
    top_range = int(len(unk_count_dict) * (topP / 100))
    # BUG FIX: the counts were accumulated but never used — the old code
    # sliced the dict in insertion order, so "top-P" did not return the most
    # frequent tokens. Rank by count, descending, before slicing.
    ranked = sorted(unk_count_dict, key=unk_count_dict.get, reverse=True)
    return ranked[:top_range]