def preprocess_and_write(params: Tuple[bytes, bytes, PrepConfig, str], bpe_data: Optional[BpeData] = None):
    """Preprocess one pickled token file and write its prepped representation.

    :param params: tuple of (source file path, destination file path,
        prep config, folder for per-part non-BPE vocab metadata).
        Paths are bytes — NOT confirmed, but `NOT_FINISHED_EXTENSION.encode()`
        below implies byte paths.
    :param bpe_data: BPE merges/cache to use; if None, the globally shared
        instance (set up by the worker-pool initializer) is used.
    """
    src_file_path, dest_file_path, prep_config, part_nonbpe_vocab_folder = params
    dest_dirname = os.path.dirname(dest_file_path)
    # exist_ok=True makes a prior os.path.exists() check redundant (and racy).
    os.makedirs(dest_dirname, exist_ok=True)
    if not REWRITE_PREPROCESSED_FILE and os.path.exists(dest_file_path):
        logger.warning(f"File {dest_file_path} already exists! Doing nothing.")
        return
    # Write to a ".not_finished"-suffixed name first, so an interrupted run
    # never leaves a half-written file that looks complete; the final
    # os.rename is atomic on the same filesystem.
    not_finished_dest_file_path = dest_file_path + NOT_FINISHED_EXTENSION.encode()
    with gzip.GzipFile(src_file_path, 'rb') as i, open(not_finished_dest_file_path, 'w') as o:
        token_list = pickle.load(i)
        bpe_data = get_global_bpe_data_if_available() if bpe_data is None else bpe_data
        # 'ect' placeholder marks the end of the token stream.
        # (renamed from `repr`, which shadowed the builtin)
        prepped, metadata = to_repr(prep_config, token_list + [SpecialToken(placeholders['ect'])], bpe_data)
        o.write(to_literal_str(to_token_str(prepped)) + '\n')
    if part_nonbpe_vocab_folder:
        # Timestamp in the filename keeps metadata files from different
        # parts/runs from colliding.
        save_metadata(metadata, os.path.join(part_nonbpe_vocab_folder, f'{os.path.basename(dest_file_path)}_-_{time.time()}'))
    os.rename(not_finished_dest_file_path, dest_file_path)
def _dump_vocab_dict(lst: List[Tuple[str, int]], file: str, to_literal=True) -> None:
    """Write (word, frequency) pairs to *file*, one `word<DELIM>freq` per line.

    :param lst: (word, frequency) pairs to dump, written in the given order.
    :param file: destination path (overwritten if it exists).
    :param to_literal: when True, words are escaped via to_literal_str first.
    """
    with open(file, 'w') as out:
        for word, freq in lst:
            entry = to_literal_str(word) if to_literal else word
            out.write(f'{entry}{VOCAB_DICT_DELIM}{freq}\n')
def data_to_langmodels_format(file: str, output_path: str) -> None:
    """Convert a prepped corpus file to the langmodels on-disk format.

    Tokens in the input are space-separated; a trailing '@@' marks a subword
    that continues into the next token. The conversion strips the '@@'
    continuation marker and appends an explicit '</t>' end-of-token marker to
    tokens that were NOT continued, then escapes each token with
    to_literal_str. The result is written to *output_path* under the input
    file's basename.

    :param file: path of the prepped input file to convert.
    :param output_path: directory to write the converted file into
        (created if missing).
    """
    SEPARATOR = '@@'
    with open(file, 'r') as f:
        text = f.read()
    prepped_tokens = []
    for token in text.split(' '):
        if token.endswith(SEPARATOR):
            token = token[:-len(SEPARATOR)]
        else:
            token += '</t>'
        prepped_tokens.append(to_literal_str(token))
    prepped_text = ' '.join(prepped_tokens)
    # exist_ok=True instead of exists()+makedirs(): the check-then-create
    # pair raced when several workers converted files concurrently.
    os.makedirs(output_path, exist_ok=True)
    with open(os.path.join(output_path, os.path.basename(file)), 'w') as f:
        f.write(prepped_text)
def create_vocab_for_lm(prep_corpus: PreprocessedCorpus) -> Vocab:
    """Build a language-model Vocab from a preprocessed corpus.

    The special tokens '`unk' and '`pad' are placed at indices 0 and 1;
    corpus words follow in the iteration order of load_vocab(), each escaped
    with to_literal_str.

    :param prep_corpus: corpus whose saved vocabulary supplies the words.
    :return: Vocab over the special tokens plus the escaped corpus words.
    """
    # List comprehension instead of list(map(lambda ...)) — same result, clearer.
    itos = ['`unk', '`pad'] + [to_literal_str(word) for word in prep_corpus.load_vocab().keys()]
    return Vocab(itos)
def dump_bpe_cache(dct: Dict[str, List[str]], file: str) -> None:
    """Persist a BPE split cache to *file*, one `word<DELIM>subwords` per line.

    :param dct: mapping from word to its list of BPE subwords.
    :param file: destination path (overwritten if it exists).
    """
    with open(file, 'w') as out:
        for word, subwords in dct.items():
            key = to_literal_str(str(word))
            value = to_literal_str(" ".join(subwords))
            out.write(f'{key}{KEY_VALUE_DELIM}{value}\n')