Example #1
def split_text(
    in_file: str,
    out_file: str,
    vocabulary: List[str] = None,
    language='eng',
    remove_square_brackets=True,
    do_lower_case=True,
    min_length=20,
):
    """
    Breaks down the in_file into sentences. Each sentence will be on a separate line.
    Also replaces numbers with a simple spoken equivalent based on NUMBERS_TO_<lang> map and removes punctuation

    Args:
        in_file: path to original transcript
        out_file: path to the output file
        vocabulary: ASR model vocabulary
        language: text language
        remove_square_brackets: Set to True if square brackets [] should be removed from text.
            Text in square brackets often contains inaudible fragments like notes or translations
        do_lower_case: flag that determines whether to apply lower case to the in_file text
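        min_length: Min number of chars of the text segment; shorter sentences are appended to the previous one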
    """

    print(f'Splitting text in {in_file} into sentences.')
    with open(in_file, "r") as f:
        transcript = f.read()

    # remove some symbols for better split into sentences
    transcript = (transcript.replace("\n", " ").replace("\t", " ")
                  .replace("…", "...").replace("»", "").replace("«", "")
                  .replace("\\", "").replace("”", "").replace("„", ""))
    # remove extra space
    transcript = re.sub(r' +', ' ', transcript)

    if remove_square_brackets:
        transcript = re.sub(r'(\[.*?\])', ' ', transcript)

    # Read and split transcript by utterance (roughly, sentences)
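    # The negative lookbehinds keep periods inside dotted abbreviations (e.g. "e.g.", "Mr.")
    # and single initials ("A.") from triggering a split; the split itself happens at a space
    # preceded by '.', '?' or '!'.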
    split_pattern = "(?<!\w\.\w.)(?<![A-Z][a-z]\.)(?<![A-Z]\.)(?<=\.|\?|\!)\s"

    if language == 'ru':
        lower_case_ru_letters_unicode = '\u0430-\u04FF'
        upper_case_ru_letters_unicode = '\u0410-\u042F'
        # remove space in the middle of the lower case abbreviation to avoid splitting into separate sentences
        matches = re.findall(r'[a-z\u0430-\u04FF]\.\s[a-z\u0430-\u04FF]\.',
                             transcript)
        for match in matches:
            transcript = transcript.replace(match, match.replace('. ', '.'))

        split_pattern = ("(?<!\w\.\w.)(?<![A-Z" +
                         upper_case_ru_letters_unicode + "][a-z" +
                         lower_case_ru_letters_unicode + "]\.)(?<![" +
                         upper_case_ru_letters_unicode + "]\.)(?<=\.|\?|\!)\s")
    elif language not in ['ru', 'eng']:
        print(
            f'Consider using {language} unicode letters for better sentence split.'
        )

    sentences = re.split(split_pattern, transcript)
    sentences_comb = []

    # adds a short sentence to the previous one
    for i in range(len(sentences)):
        if len(sentences[i]) < min_length and len(sentences_comb) > 0:
            sentences_comb[-1] += ' ' + sentences[i].strip()
        else:
            sentences_comb.append(sentences[i].strip())

    sentences = "\n".join([s.strip() for s in sentences_comb if s])

    # save split text with original punctuation and case
    out_dir, out_file_name = os.path.split(out_file)
    with open(os.path.join(out_dir, out_file_name[:-4] + '_with_punct.txt'),
              "w") as f:
        f.write(sentences)

    # substitute common abbreviations before applying lower case
    if language == 'ru':
        for k, v in RU_ABBREVIATIONS.items():
            sentences = sentences.replace(k, v)

    if do_lower_case:
        sentences = sentences.lower()

    if language == 'eng':
        for k, v in NUMBERS_TO_ENG.items():
            sentences = sentences.replace(k, v)
        # remove non-ASCII characters
        sentences = ''.join(i for i in sentences if ord(i) < 128)
    elif language == 'ru':
        if vocabulary and '-' not in vocabulary:
            sentences = sentences.replace('-', ' ')
        for k, v in NUMBERS_TO_RU.items():
            sentences = sentences.replace(k, v)
        # replace Latin characters with Russian
        for k, v in LATIN_TO_RU.items():
            sentences = sentences.replace(k, v)

    # make sure to leave punctuation present in vocabulary
    all_punct_marks = string.punctuation + "–—’“”"
    if vocabulary:
        for v in vocabulary:
            all_punct_marks = all_punct_marks.replace(v, '')
    sentences = re.sub("[" + all_punct_marks + "]", "", sentences).strip()

    with open(out_file, "w") as f:
        f.write(sentences)
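
A minimal usage sketch for this variant, assuming split_text and the module-level maps it references (NUMBERS_TO_ENG, NUMBERS_TO_RU, RU_ABBREVIATIONS, LATIN_TO_RU) are defined as above; the file names and the toy transcript below are hypothetical.

# Hypothetical smoke test for the split_text variant above.
raw = "Dr. Smith arrived at 10 a.m. It was raining! [inaudible] Everyone waited."

with open("transcript.txt", "w") as f:
    f.write(raw)

split_text(
    in_file="transcript.txt",
    out_file="transcript_split.txt",  # a transcript_split_with_punct.txt is written as well
    language="eng",
    remove_square_brackets=True,      # drops the "[inaudible]" note
    min_length=20,                    # short sentences are merged into the previous one
)

with open("transcript_split.txt") as f:
    print(f.read())  # lower-cased, punctuation-free sentences, one per line
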
Example #2
def split_text(
    in_file: str,
    out_file: str,
    vocabulary: List[str] = None,
    language='eng',
    remove_square_brackets=True,
    do_lower_case=True,
    min_length=20,
    max_length=100,
    additional_split_symbols=None,
):
    """
    Breaks down the in_file into sentences. Each sentence will be on a separate line.
    Also replaces numbers with their spoken equivalent using num2words and removes punctuation

    Args:
        in_file: path to original transcript
        out_file: path to the output file
        vocabulary: ASR model vocabulary
        language: text language
        remove_square_brackets: Set to True if square brackets [] should be removed from text.
            Text in square brackets often contains inaudible fragments like notes or translations
        do_lower_case: flag that determines whether to apply lower case to the in_file text
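        min_length: Min number of chars of the text segment for alignment. Short segments will be combined to be
            at least min_length
        max_length: Max number of chars of the text segment for alignment
        additional_split_symbols: Additional symbols to use for sentence split if eos sentence split resulted in
            segments longer than --max_length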
    """

    print(f'Splitting text in {in_file} into sentences.')
    with open(in_file, "r") as f:
        transcript = f.read()

    # remove some symbols for better split into sentences
    transcript = (transcript.replace("\n", " ").replace("\t", " ")
                  .replace("…", "...").replace("»", "").replace("«", "")
                  .replace("\\", "").replace("”", "").replace("„", "")
                  .replace("´", "").replace("--", " -- ").replace("-", " - "))
    # remove extra space
    transcript = re.sub(r' +', ' ', transcript)
    transcript = re.sub(r'(\.+)', '. ', transcript)

    if remove_square_brackets:
        transcript = re.sub(r'(\[.*?\])', ' ', transcript)
        # remove text in curly brackets
        transcript = re.sub(r'(\{.*?\})', ' ', transcript)

    # Read and split transcript by utterance (roughly, sentences)
    split_pattern = "(?<!\w\.\w.)(?<![A-Z][a-z]\.)(?<![A-Z]\.)(?<=\.|\?|\!)\s"

    if language == 'ru':
        lower_case_ru_letters_unicode = '\u0430-\u04FF'
        upper_case_ru_letters_unicode = '\u0410-\u042F'
        # remove space in the middle of the lower case abbreviation to avoid splitting into separate sentences
        matches = re.findall(r'[a-z\u0430-\u04FF]\.\s[a-z\u0430-\u04FF]\.',
                             transcript)
        for match in matches:
            transcript = transcript.replace(match, match.replace('. ', '.'))

        split_pattern = ("(?<!\w\.\w.)(?<![A-Z" +
                         upper_case_ru_letters_unicode + "][a-z" +
                         lower_case_ru_letters_unicode + "]\.)(?<![" +
                         upper_case_ru_letters_unicode + "]\.)(?<=\.|\?|\!)\s")
    elif language not in ['ru', 'eng']:
        print(
            f'Consider using {language} unicode letters for better sentence split.'
        )

    sentences = re.split(split_pattern, transcript)

    def additional_split(sentences, split_on_symbols, max_length):
        if not split_on_symbols:
            return sentences

        split_on_symbols = split_on_symbols.split('|')
        for i, sym in enumerate(split_on_symbols):
            if sym == '-':
                split_on_symbols[i] = ' - '

        def _split(sentences, symbol, max_length):
            result = []
            for s in sentences:
                if len(s) <= max_length:
                    result.append(s)
                else:
                    result.extend(s.split(symbol))
            return result

        another_sent_split = []
        for sent in sentences:
            split_sent = [sent]
            for sym in split_on_symbols:
                split_sent = _split(split_sent, sym, max_length)
            another_sent_split.extend(split_sent)

        sentences = [s.strip() for s in another_sent_split if s.strip()]
        return sentences

    sentences = additional_split(sentences, additional_split_symbols,
                                 max_length)

    if min_length > 0:
        sentences_comb = []
        sentences_comb.append(sentences[0])
        # combine short sentences with the previous one
        for i in range(1, len(sentences)):
            if len(sentences_comb[-1]) < min_length or len(
                    sentences[i]) < min_length:
                sentences_comb[-1] += ' ' + sentences[i].strip()
            else:
                sentences_comb.append(sentences[i].strip())
        sentences = "\n".join([s.strip() for s in sentences_comb if s.strip()])
    else:
        sentences = "\n".join([s.strip() for s in sentences if s.strip()])

    # save split text with original punctuation and case
    out_dir, out_file_name = os.path.split(out_file)
    with open(os.path.join(out_dir, out_file_name[:-4] + '_with_punct.txt'),
              "w") as f:
        f.write(sentences)

    # substitute common abbreviations before applying lower case
    if language == 'ru':
        for k, v in RU_ABBREVIATIONS.items():
            sentences = sentences.replace(k, v)

    if do_lower_case:
        sentences = sentences.lower()

    if language == 'eng':
        # remove non-ASCII characters
        sentences = ''.join(i for i in sentences if ord(i) < 128)
    elif language == 'ru':
        if vocabulary and '-' not in vocabulary:
            sentences = sentences.replace('-', ' ')
        # replace Latin characters with Russian
        for k, v in LATIN_TO_RU.items():
            sentences = sentences.replace(k, v)

    # replace numbers
    try:
        p = re.compile(r"\d+")
        new_text = ''
        match_end = 0
        for i, m in enumerate(p.finditer(sentences)):
            match = m.group()
            match_start = m.start()
            match_len = len(match)

            if i == 0:
                new_text = sentences[:match_start]
            else:
                new_text += sentences[match_end:match_start]
            match_end = match_start + match_len
            new_text += sentences[match_start:match_end].replace(
                match, num2words(match, lang=language))
        new_text += sentences[match_end:]
        sentences = new_text
    except NotImplementedError:
        print(
            f'{language} might be missing in "num2words" package. Add required language to the choices for the '
            '--language argument.')
        raise

    # make sure to leave punctuation present in vocabulary
    all_punct_marks = string.punctuation + "–—’“”"
    if vocabulary:
        for v in vocabulary:
            all_punct_marks = all_punct_marks.replace(v, '')
    sentences = re.sub("[" + all_punct_marks + "]", "", sentences).strip()

    # remove extra space
    sentences = re.sub(r' +', ' ', sentences)
    with open(out_file, "w") as f:
        f.write(sentences)
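
A usage sketch for this variant, exercising the new max_length and additional_split_symbols parameters; the paths, sample text, and parameter values are hypothetical, and num2words plus the module-level maps are assumed importable as above.

# Hypothetical example: segments longer than max_length characters are re-split
# on the extra delimiters passed as a '|'-separated string.
with open("lecture.txt", "w") as f:
    f.write("The committee met in March; the agenda was long: budgets, hiring, and new labs. Then lunch.")

split_text(
    in_file="lecture.txt",
    out_file="lecture_split.txt",
    language="eng",
    min_length=20,
    max_length=40,                   # segments above this length are split further
    additional_split_symbols=";|:",  # split long segments on ';' and ':'
)
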
Example #3
def split_text(
    in_file: str,
    out_file: str,
    vocabulary: List[str] = None,
    language='eng',
    remove_brackets=True,
    do_lower_case=True,
    min_length=0,
    max_length=100,
    additional_split_symbols=None,
    use_nemo_normalization=False,
):
    """
    Breaks down the in_file roughly into sentences. Each sentence will be on a separate line.
    Written form of the numbers will be converted to their spoken equivalent, OOV punctuation will be removed.

    Args:
        in_file: path to original transcript
        out_file: path to the output file
        vocabulary: ASR model vocabulary
        language: text language
        remove_brackets: Set to True if square [] and curly {} brackets should be removed from text.
            Text in square/curly brackets often contains inaudible fragments like notes or translations
        do_lower_case: flag that determines whether to apply lower case to the in_file text
        min_length: Min number of chars of the text segment for alignment. Short segments will be combined to be
            at least min_length (not recommended for multi speaker data).
        max_length: Max number of chars of the text segment for alignment
        additional_split_symbols: Additional symbols to use for sentence split if eos sentence split resulted in
            segments longer than --max_length
        use_nemo_normalization: Set to True to use NeMo normalization tool to convert numbers from written to spoken
            format. Normalization using num2words will be applied afterwards to make sure there are no numbers present
            in the text, otherwise they will be replaced with a space and that could deteriorate segmentation results.
    """

    print(f'Splitting text in {in_file} into sentences.')
    with open(in_file, "r") as f:
        transcript = f.read()

    # remove some symbols for better split into sentences
    transcript = (transcript.replace("\n", " ").replace("\t", " ")
                  .replace("…", "...").replace("\\", " ")
                  .replace("--", " -- ").replace(". . .", "...")
                  .replace("‘", "’"))
    # remove extra space
    transcript = re.sub(r' +', ' ', transcript)
    transcript = re.sub(r'(\.+)', '. ', transcript)

    if remove_brackets:
        transcript = re.sub(r'(\[.*?\])', ' ', transcript)
        # remove text in curly brackets
        transcript = re.sub(r'(\{.*?\})', ' ', transcript)

    lower_case_unicode = ''
    upper_case_unicode = ''
    if language == 'ru':
        lower_case_unicode = '\u0430-\u04FF'
        upper_case_unicode = '\u0410-\u042F'
    elif language not in ['ru', 'eng']:
        print(
            f'Consider using {language} unicode letters for better sentence split.'
        )

    # remove space in the middle of the lower case abbreviation to avoid splitting into separate sentences
    matches = re.findall(
        r'[a-z' + lower_case_unicode + r']\.\s[a-z' + lower_case_unicode + r']\.',
        transcript)
    for match in matches:
        transcript = transcript.replace(match, match.replace('. ', '.'))

    # find phrases in quotes
    with_quotes = re.finditer(r'“[A-Za-z ?]+.*?”', transcript)
    sentences = []
    last_idx = 0
    for m in with_quotes:
        match = m.group()
        match_idx = m.start()
        if last_idx < match_idx:
            sentences.append(transcript[last_idx:match_idx])
        sentences.append(match)
        last_idx = m.end()
    sentences.append(transcript[last_idx:])
    sentences = [s.strip() for s in sentences if s.strip()]

    # Read and split transcript by utterance (roughly, sentences)
    split_pattern = f"(?<!\w\.\w.)(?<![A-Z{upper_case_unicode}][a-z{lower_case_unicode}]\.)(?<![A-Z{upper_case_unicode}]\.)(?<=\.|\?|\!|\.”|\?”\!”)\s"

    new_sentences = []
    for sent in sentences:
        new_sentences.extend(regex.split(split_pattern, sent))
    sentences = [s.strip() for s in new_sentences if s.strip()]

    def additional_split(sentences, split_on_symbols, max_length):
        if not split_on_symbols:
            return sentences

        split_on_symbols = split_on_symbols.split('|')

        def _split(sentences, delimiter, max_length):
            result = []
            for s in sentences:
                if len(s) <= max_length:
                    result.append(s)
                else:
                    split_sent = s.split(delimiter)
                    result.extend([s + delimiter for s in split_sent[:-1]] +
                                  [split_sent[-1]])
            return result

        another_sent_split = []
        for sent in sentences:
            split_sent = [sent]
            for delimiter in split_on_symbols:
                split_sent = _split(split_sent, delimiter + ' ', max_length)
            another_sent_split.extend(split_sent)

        sentences = [s.strip() for s in another_sent_split if s.strip()]
        return sentences

    sentences = additional_split(sentences, additional_split_symbols,
                                 max_length)

    # check to make sure there will be no utterances for segmentation with only OOV symbols
    vocab_no_space_with_digits = set(vocabulary + [str(i) for i in range(10)])
    if ' ' in vocab_no_space_with_digits:
        vocab_no_space_with_digits.remove(' ')
    sentences = [
        s for s in sentences
        if len(vocab_no_space_with_digits.intersection(set(s))) > 0
    ]

    if min_length > 0:
        sentences_comb = []
        sentences_comb.append(sentences[0])
        # combine short sentences with the previous one
        for i in range(1, len(sentences)):
            if len(sentences_comb[-1]) < min_length or len(
                    sentences[i]) < min_length:
                sentences_comb[-1] += ' ' + sentences[i].strip()
            else:
                sentences_comb.append(sentences[i].strip())
        sentences = sentences_comb

    sentences = [s.strip() for s in sentences if s.strip()]

    # save split text with original punctuation and case
    out_dir, out_file_name = os.path.split(out_file)
    with open(os.path.join(out_dir, out_file_name[:-4] + '_with_punct.txt'),
              "w") as f:
        f.write("\n".join(sentences))

    # substitute common abbreviations before applying lower case
    if language == 'ru':
        for k, v in RU_ABBREVIATIONS.items():
            sentences = [s.replace(k, v) for s in sentences]

    if language == 'ru':
        # replace Latin characters with Russian
        for k, v in LATIN_TO_RU.items():
            sentences = [s.replace(k, v) for s in sentences]

    if language == 'eng' and use_nemo_normalization:
        if not NEMO_NORMALIZATION_AVAILABLE:
            raise ValueError('NeMo normalization tool is not installed.')

        print('Using NeMo normalization tool...')
        normalizer = Normalizer(input_case='cased')
        sentences_norm = normalizer.normalize_list(sentences, verbose=False)
        if len(sentences_norm) != len(sentences):
            raise ValueError(
                'Normalization failed, number of sentences does not match.')
        sentences = sentences_norm

    sentences = '\n'.join(sentences)

    # replace numbers with num2words
    try:
        p = re.compile(r"\d+")
        new_text = ''
        match_end = 0
        for i, m in enumerate(p.finditer(sentences)):
            match = m.group()
            match_start = m.start()
            if i == 0:
                new_text = sentences[:match_start]
            else:
                new_text += sentences[match_end:match_start]
            match_end = m.end()
            new_text += sentences[match_start:match_end].replace(
                match, num2words(match, lang=language))
        new_text += sentences[match_end:]
        sentences = new_text
    except NotImplementedError:
        print(
            f'{language} might be missing in "num2words" package. Add required language to the choices for the '
            '--language argument.')
        raise

    sentences = (sentences.replace("’", "'").replace("»", '"').replace("«", '"')
                 .replace("\\", "").replace("”", '"').replace("„", '"')
                 .replace("´", "'").replace("-- --", "--").replace("--", " -- ")
                 .replace("’", "'").replace('“', '"').replace('“', '"')
                 .replace("‘", "'").replace('—', '-').replace("- -", "--")
                 .replace('`', "'").replace(' !', '!').replace(' ?', '?')
                 .replace(' ,', ',').replace(' .', '.').replace(' ;', ';')
                 .replace(' :', ':').replace('!!', '!').replace('--', '-')
                 .replace('“', '"').replace(', , ', ', ').replace('=', ''))

    allowed_punct = [',', '.', '?', '!', ':', ';', '-', '"', '(', ')']
    # clean up normalized text and keep only allowed_punct and ASR vocabulary (lower and upper case)
    symbols_to_remove = ''.join(
        set(sentences).difference(
            set(vocabulary + [s.upper()
                              for s in vocabulary] + ['\n'] + allowed_punct)))
    sentences_norm = sentences.translate(''.maketrans(
        symbols_to_remove,
        len(symbols_to_remove) * ' '))

    with open(
            os.path.join(out_dir,
                         out_file_name[:-4] + '_with_punct_normalized.txt'),
            "w") as f:
        f.write(sentences_norm)

    if do_lower_case:
        sentences = sentences.lower()

    # remove all OOV symbols
    symbols_to_remove = ''.join(
        set(sentences).difference(set(vocabulary + ['\n'])))
    sentences = sentences.translate(''.maketrans(symbols_to_remove,
                                                 len(symbols_to_remove) * ' '))

    # remove extra space
    sentences = re.sub(r' +', ' ', sentences)
    with open(out_file, "w") as f:
        f.write(sentences)
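
A usage sketch for this variant with a character-level ASR vocabulary, which is now effectively required because it is used to filter utterances that contain only OOV symbols; the vocabulary, paths, and quoted sample text are hypothetical, and regex, num2words, and the optional NeMo Normalizer are assumed available as in the code above.

# Hypothetical example with a character-level vocabulary; the quoted phrase is kept
# together by the “...” handling above.
vocab = [" ", "'"] + list("abcdefghijklmnopqrstuvwxyz")

with open("talk.txt", "w") as f:
    f.write('She said, “We will start soon.” Then the talk began; everyone listened carefully.')

split_text(
    in_file="talk.txt",
    out_file="talk_split.txt",
    vocabulary=vocab,
    language="eng",
    max_length=60,
    additional_split_symbols=";",  # re-split segments longer than max_length on ';'
    use_nemo_normalization=False,  # requires the NeMo text normalization package when True
)
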
Example #4
def split_text(
    in_file: str,
    out_file: str,
    vocabulary: List[str],
    language="en",
    remove_brackets=True,
    do_lower_case=True,
    max_length=100,
    additional_split_symbols=None,
    use_nemo_normalization=False,
):
    """
    Breaks down the in_file roughly into sentences. Each sentence will be on a separate line.
    Written form of the numbers will be converted to their spoken equivalent, OOV punctuation will be removed.

    Args:
        in_file: path to original transcript
        out_file: path to the output file
        vocabulary: ASR model vocabulary
        language: text language
        remove_brackets: Set to True if square [] and curly {} brackets should be removed from text.
            Text in square/curly brackets often contains inaudible fragments like notes or translations
        do_lower_case: flag that determines whether to apply lower case to the in_file text
        max_length: Max number of words of the text segment for alignment
        additional_split_symbols: Additional symbols to use for sentence split if eos sentence split resulted in
            segments longer than --max_length
        use_nemo_normalization: Set to True to use NeMo normalization tool to convert numbers from written to spoken
            format. Normalization using num2words will be applied afterwards to make sure there are no numbers present
            in the text, otherwise they will be replaced with a space and that could deteriorate segmentation results.
    """
    print(f"Splitting text in {in_file} into sentences.")
    with open(in_file, "r") as f:
        transcript = f.read()

    # remove some symbols for better split into sentences
    transcript = (transcript.replace("\n", " ").replace("\t", " ")
                  .replace("…", "...").replace("\\", " ")
                  .replace("--", " -- ").replace(". . .", "..."))
    # remove extra space
    transcript = re.sub(r" +", " ", transcript)
    transcript = re.sub(r"(\.+)", ". ", transcript)

    if remove_brackets:
        transcript = re.sub(r'(\[.*?\])', ' ', transcript)
        # remove text in curly brackets
        transcript = re.sub(r'(\{.*?\})', ' ', transcript)

    lower_case_unicode = ''
    upper_case_unicode = ''
    if language == "ru":
        lower_case_unicode = '\u0430-\u04FF'
        upper_case_unicode = '\u0410-\u042F'
    elif language not in ["ru", "en"]:
        print(
            f"Consider using {language} unicode letters for better sentence split."
        )

    # remove space in the middle of the lower case abbreviation to avoid splitting into separate sentences
    matches = re.findall(
        r'[a-z' + lower_case_unicode + r']\.\s[a-z' + lower_case_unicode + r']\.',
        transcript)
    for match in matches:
        transcript = transcript.replace(match, match.replace('. ', '.'))

    # find phrases in quotes
    with_quotes = re.finditer(r'“[A-Za-z ?]+.*?”', transcript)
    sentences = []
    last_idx = 0
    for m in with_quotes:
        match = m.group()
        match_idx = m.start()
        if last_idx < match_idx:
            sentences.append(transcript[last_idx:match_idx])
        sentences.append(match)
        last_idx = m.end()
    sentences.append(transcript[last_idx:])
    sentences = [s.strip() for s in sentences if s.strip()]

    # Read and split transcript by utterance (roughly, sentences)
    split_pattern = f"(?<!\w\.\w.)(?<![A-Z{upper_case_unicode}][a-z{lower_case_unicode}]\.)(?<![A-Z{upper_case_unicode}]\.)(?<=\.|\?|\!|\.”|\?”\!”)\s"

    new_sentences = []
    for sent in sentences:
        new_sentences.extend(regex.split(split_pattern, sent))
    sentences = [s.strip() for s in new_sentences if s.strip()]

    def additional_split(sentences, split_on_symbols):
        if not split_on_symbols:
            return sentences

        split_on_symbols = split_on_symbols.split("|")

        def _split(sentences, delimiter):
            result = []
            for sent in sentences:
                split_sent = sent.split(delimiter)
                # keep the delimiter
                split_sent = [(s + delimiter).strip()
                              for s in split_sent[:-1]] + [split_sent[-1]]

                if "," in delimiter:
                    # split based on comma usually results in too short utterance, combine sentences
                    # that result in a single word split. It's usually not recommended to do that for other delimiters.
                    comb = []
                    for s in split_sent:
                        MIN_LEN = 2
                        # if the previous sentence is too short, combine it with the current sentence
                        if len(comb) > 0 and (len(comb[-1].split()) <= MIN_LEN
                                              or len(s.split()) <= MIN_LEN):
                            comb[-1] = comb[-1] + " " + s
                        else:
                            comb.append(s)
                    result.extend(comb)
                else:
                    result.extend(split_sent)
            return result

        another_sent_split = []
        for sent in sentences:
            split_sent = [sent]
            for delimiter in split_on_symbols:
                split_sent = _split(split_sent, delimiter + " ")
            another_sent_split.extend(split_sent)

        sentences = [s.strip() for s in another_sent_split if s.strip()]
        return sentences

    sentences = additional_split(sentences, additional_split_symbols)

    vocabulary_symbols = []
    for x in vocabulary:
        if x != "<unk>":
            # for BPE models
            vocabulary_symbols.extend(
                [x for x in x.replace("##", "").replace("▁", "")])
    vocabulary_symbols = list(set(vocabulary_symbols))
    vocabulary_symbols += [x.upper() for x in vocabulary_symbols]

    # check to make sure there will be no utterances for segmentation with only OOV symbols
    vocab_no_space_with_digits = set(vocabulary_symbols +
                                     [str(i) for i in range(10)])
    if " " in vocab_no_space_with_digits:
        vocab_no_space_with_digits.remove(" ")

    sentences = [
        s.strip() for s in sentences
        if len(vocab_no_space_with_digits.intersection(set(s.lower()))) > 0
        and s.strip()
    ]

    # when no punctuation marks present in the input text, split based on max_length
    if len(sentences) == 1:
        sent = sentences[0].split()
        sentences = []
        for i in range(0, len(sent), max_length):
            sentences.append(" ".join(sent[i:i + max_length]))
    sentences = [s.strip() for s in sentences if s.strip()]

    # save split text with original punctuation and case
    out_dir, out_file_name = os.path.split(out_file)
    with open(os.path.join(out_dir, out_file_name[:-4] + "_with_punct.txt"),
              "w") as f:
        f.write(re.sub(r' +', ' ', "\n".join(sentences)))

    # substitute common abbreviations before applying lower case
    if language == "ru":
        for k, v in RU_ABBREVIATIONS.items():
            sentences = [s.replace(k, v) for s in sentences]
        # replace Latin characters with Russian
        for k, v in LATIN_TO_RU.items():
            sentences = [s.replace(k, v) for s in sentences]

    if language == "en" and use_nemo_normalization:
        if not NEMO_NORMALIZATION_AVAILABLE:
            raise ValueError("NeMo normalization tool is not installed.")

        print("Using NeMo normalization tool...")
        normalizer = Normalizer(input_case="cased",
                                cache_dir=os.path.join(
                                    os.path.dirname(out_file), "en_grammars"))
        sentences_norm = normalizer.normalize_list(sentences,
                                                   verbose=False,
                                                   punct_post_process=True)
        if len(sentences_norm) != len(sentences):
            raise ValueError(
                "Normalization failed, number of sentences does not match.")
        else:
            sentences = sentences_norm

    sentences = '\n'.join(sentences)

    # replace numbers with num2words
    try:
        p = re.compile(r"\d+")
        new_text = ""
        match_end = 0
        for i, m in enumerate(p.finditer(sentences)):
            match = m.group()
            match_start = m.start()
            if i == 0:
                new_text = sentences[:match_start]
            else:
                new_text += sentences[match_end:match_start]
            match_end = m.end()
            new_text += sentences[match_start:match_end].replace(
                match, num2words(match, lang=language))
        new_text += sentences[match_end:]
        sentences = new_text
    except NotImplementedError:
        print(
            f"{language} might be missing in 'num2words' package. Add required language to the choices for the"
            f"--language argument.")
        raise

    sentences = re.sub(r' +', ' ', sentences)

    with open(
            os.path.join(out_dir,
                         out_file_name[:-4] + "_with_punct_normalized.txt"),
            "w") as f:
        f.write(sentences)

    if do_lower_case:
        sentences = sentences.lower()

    symbols_to_remove = ''.join(
        set(sentences).difference(set(vocabulary_symbols + ["\n", " "])))
    sentences = sentences.translate(''.maketrans(symbols_to_remove,
                                                 len(symbols_to_remove) * " "))

    # remove extra space
    sentences = re.sub(r' +', ' ', sentences)
    with open(out_file, "w") as f:
        f.write(sentences)
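
A usage sketch for the final variant with a small, made-up BPE-style vocabulary ("▁" marks word starts, "<unk>" is skipped when collecting characters); the paths and text are hypothetical, and regex, num2words, and the optional NeMo Normalizer are assumed importable as above.

# Hypothetical example with a subword vocabulary; individual characters are
# extracted from the tokens before OOV filtering and final cleanup.
vocab = ["<unk>", "▁the", "▁report", "s", "▁were", "▁read", "▁to", "day",
         "▁all", "▁of", "▁them", "▁in", "▁fact", "▁twice", " "]

with open("news.txt", "w") as f:
    f.write("The reports were read today. All 3 of them, in fact -- twice.")

split_text(
    "news.txt",
    "news_split.txt",
    vocabulary=vocab,
    language="en",                # two-letter code, as expected by num2words
    max_length=100,               # word-count cap used only when no punctuation is found
    additional_split_symbols="",  # no extra delimiters beyond end-of-sentence marks
    use_nemo_normalization=False, # requires the NeMo text normalization package when True
)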