Esempio n. 1
0
def preprocess_and_write(params: Tuple[bytes, bytes, PrepConfig, str],
                         bpe_data: Optional[BpeData] = None):
    """Preprocess one pickled token file and write its textual representation.

    Reads a gzipped pickle of tokens from ``src_file_path``, converts it with
    :func:`to_repr`, and writes the result to ``dest_file_path``. The output is
    first written under a "not finished" name and atomically renamed on
    success, so readers never see a half-written file.

    :param params: tuple of (src_file_path, dest_file_path, prep_config,
        part_nonbpe_vocab_folder); the paths are bytes.
    :param bpe_data: BPE data to use; when ``None``, the process-global BPE
        data is looked up (useful in worker processes).
    """
    src_file_path, dest_file_path, prep_config, part_nonbpe_vocab_folder = params

    # exist_ok=True already tolerates a pre-existing directory, so the former
    # `if not os.path.exists(...)` guard was redundant (and racy).
    os.makedirs(os.path.dirname(dest_file_path), exist_ok=True)

    if not REWRITE_PREPROCESSED_FILE and os.path.exists(dest_file_path):
        logger.warning(f"File {dest_file_path} already exists! Doing nothing.")
        return

    # dest_file_path is bytes, so the extension must be encoded before concat.
    not_finished_dest_file_path = dest_file_path + NOT_FINISHED_EXTENSION.encode(
    )
    with gzip.GzipFile(src_file_path,
                       'rb') as i, open(not_finished_dest_file_path, 'w') as o:
        token_list = pickle.load(i)
        if bpe_data is None:
            bpe_data = get_global_bpe_data_if_available()
        # `repr_` instead of `repr` to avoid shadowing the builtin.
        repr_, metadata = to_repr(
            prep_config, token_list + [SpecialToken(placeholders['ect'])],
            bpe_data)
        o.write(to_literal_str(to_token_str(repr_)) + '\n')

    if part_nonbpe_vocab_folder:
        # Timestamp in the filename keeps metadata dumps from different runs
        # of the same part distinct.
        save_metadata(
            metadata,
            os.path.join(
                part_nonbpe_vocab_folder,
                f'{os.path.basename(dest_file_path)}_-_{time.time()}'))

    # Atomic publish: only a fully written file gets the final name.
    os.rename(not_finished_dest_file_path, dest_file_path)
Esempio n. 2
0
def _dump_vocab_dict(lst: List[Tuple[str, int]],
                     file: str,
                     to_literal=True) -> None:
    """Write (word, frequency) pairs to *file*, one delimiter-separated pair
    per line, optionally converting each word with ``to_literal_str``."""
    transform = to_literal_str if to_literal else (lambda w: w)
    with open(file, 'w') as f:
        f.writelines(f'{transform(word)}{VOCAB_DICT_DELIM}{freq}\n'
                     for word, freq in lst)
Esempio n. 3
0
def data_to_langmodels_format(file: str, output_path: str) -> None:
    """Convert a space-separated, '@@'-continuation token file to the
    langmodels format and write it under *output_path*.

    Tokens ending in '@@' have the separator stripped (they continue into the
    next token); all other tokens get an explicit '</t>' end-of-token marker.
    Each token is then passed through ``to_literal_str``.

    :param file: path of the input file; the output keeps its basename.
    :param output_path: directory for the converted file (created if missing).
    """
    SEPARATOR = '@@'
    with open(file, 'r') as f:
        text = f.read()
    prepped_tokens = []
    for token in text.split(' '):
        if token.endswith(SEPARATOR):
            # Continuation token: drop the trailing '@@'.
            token = token[:-len(SEPARATOR)]
        else:
            # Full token: mark its end explicitly.
            token += '</t>'
        prepped_tokens.append(to_literal_str(token))
    prepped_text = ' '.join(prepped_tokens)
    # exist_ok avoids the check-then-create race the old
    # `if not os.path.exists(...)` guard had.
    os.makedirs(output_path, exist_ok=True)
    with open(os.path.join(output_path, os.path.basename(file)), 'w') as f:
        f.write(prepped_text)
Esempio n. 4
0
def create_vocab_for_lm(prep_corpus: PreprocessedCorpus) -> Vocab:
    """Build a language-model vocabulary from a preprocessed corpus.

    The special tokens '`unk' and '`pad' are prepended, followed by every
    corpus vocabulary key converted with ``to_literal_str``.
    """
    # A comprehension instead of map(lambda x: to_literal_str(x), ...) —
    # the lambda wrapper was redundant.
    return Vocab(['`unk', '`pad'] +
                 [to_literal_str(word) for word in prep_corpus.load_vocab().keys()])
Esempio n. 5
0
def dump_bpe_cache(dct: Dict[str, List[str]], file: str) -> None:
    """Serialize a BPE split cache to *file*: one line per word, with the
    word and its space-joined subwords separated by ``KEY_VALUE_DELIM``,
    both sides converted with ``to_literal_str``."""
    with open(file, 'w') as f:
        for word, subwords in dct.items():
            key = to_literal_str(str(word))
            value = to_literal_str(" ".join(subwords))
            f.write(f'{key}{KEY_VALUE_DELIM}{value}\n')