Example #1
#!/usr/bin/env python

# Copyright 2018 Nagoya University (Tomoki Hayashi)
#  Apache 2.0  (http://www.apache.org/licenses/LICENSE-2.0)

import argparse
import codecs

from cleaners import english_cleaners


if __name__ == "__main__":
    parser = argparse.ArgumentParser()
    parser.add_argument('text', type=str, help='text to be cleaned')
    args = parser.parse_args()
    with codecs.open(args.text, 'r', 'utf-8') as fid:
        for line in fid.readlines():
            id, _, content = line.split("|")
            clean_content = english_cleaners(content.rstrip())
            print("%s %s" % (id, clean_content))
def transform_text(char_seq,
                   auto_pronounce=True,
                   phone_seq=None,
                   force_char_spc=True,
                   symbol_processing="blended_pref",
                   random_state=None):
    """
    chars format example: "i am learning english."
    phone_seq format example: "@ay @ae@m @l@er@n@ih@ng @ih@ng@g@l@ih@sh"

    symbol_processing options: "chars_only", "phones_only", "blended_pref"

    phone_seq formatting can be obtained from the text module via the
    pronounce_chars function ('from text import pronounce_chars'), which
    uses cmudict for pronunciation.

    See the usage sketch after this function.
    """
    if random_state is None:
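        # lcl_random_state is assumed to be a module-level default random
        # state (e.g. a numpy RandomState) defined elsewhere in this module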
        random_state = lcl_random_state

    if phone_seq is None and auto_pronounce is False and symbol_processing != "chars_only":
        raise ValueError(
            "phone_seq argument must be provided for iterator with self.symbol_processing != 'chars_only', currently '{}'"
            .format(self.symbol_processing))
    clean_char_seq = cleaners.english_cleaners(char_seq)
    char_seq_chunk = clean_char_seq.split(" ")
    dirty_seq_chunk = char_seq.split(" ")

    if auto_pronounce is True:
        if phone_seq is not None:
            raise ValueError(
                "auto_pronounce set to True, but phone_seq was provided! Pass phone_seq=None for auto_pronounce=True"
            )
        # take out specials then put them back...
        specials = "!?.,;:"
        puncts = "!?."
        tsc = []
        for n, csc in enumerate(char_seq_chunk):
            broke = False
            for s in specials:
                if s in csc:
                    new = csc.replace(s, "")
                    tsc.append(new)
                    broke = True
                    break
            if not broke:
                tsc.append(csc)

        if symbol_processing == "blended_pref":
            chunky_phone_seq_chunk = [
                pronounce_chars(w, raw_line=dirty_seq_chunk[ii], cmu_only=True)
                for ii, w in enumerate(tsc)
            ]
            phone_seq_chunk = [
                cpsc[0] if cpsc is not None else None
                for cpsc in chunky_phone_seq_chunk
            ]
        else:
            phone_seq_chunk = [pronounce_chars(w) for w in tsc]
        for n, psc in enumerate(phone_seq_chunk):
            for s in specials:
                if char_seq_chunk[n][-1] == s and phone_seq_chunk[n] is not None:
                    phone_seq_chunk[n] += char_seq_chunk[n][-1]
                    #if char_seq_chunk[n][-1] in puncts and n != (len(phone_seq_chunk) - 1):
                    #    # add eos
                    #    char_seq_chunk[n] += "~"
                    #    phone_seq_chunk[n] += "~"
                    break
    else:
        raise ValueError("Non auto_pronounce setting not yet configured")

    if len(char_seq_chunk) != len(phone_seq_chunk):
        raise ValueError(
            "Char and phone chunking resulted in different lengths {} and {}!\n{}\n{}"
            .format(len(char_seq_chunk), len(phone_seq_chunk), char_seq_chunk,
                    phone_seq_chunk))

    if symbol_processing != "phones_only":
        spc = text_to_sequence(" ", [clean_names[0]])[0]
    else:
        spc = text_to_sequence(" ", [clean_names[1]])[0]

    int_char_chunks = []
    int_phone_chunks = []
    for n in range(len(char_seq_chunk)):
        int_char_chunks.append(
            text_to_sequence(char_seq_chunk[n], [clean_names[0]])[:-1])
        if phone_seq_chunk[n] is None:
            int_phone_chunks.append([])
        else:
            int_phone_chunks.append(
                text_to_sequence(phone_seq_chunk[n], [clean_names[1]])[:-2])

    # check inverses
    # w = [sequence_to_text(int_char_chunks[i], [self.clean_names[0]]) for i in range(len(int_char_chunks))]
    # p = [sequence_to_text(int_phone_chunks[i], [self.clean_names[1]]) for i in range(len(int_phone_chunks))]

    # TODO: Unify the two functions?
    char_phone_mask = [0] * len(int_char_chunks) + [1] * len(int_phone_chunks)
    random_state.shuffle(char_phone_mask)
    char_phone_mask = char_phone_mask[:len(int_char_chunks)]
    # setting char_phone_mask to 0 will use chars, 1 will use phones
    # these if statements override the default for blended... (above)
    if symbol_processing == "blended_pref":
        char_phone_mask = [
            0 if len(int_phone_chunks[i]) == 0 else 1
            for i in range(len(int_char_chunks))
        ]
    elif symbol_processing == "phones_only":
        # set the mask to use only phones
        # all files should have phones because of earlier preproc...
        char_phone_mask = [1 for i in range(len(char_phone_mask))]
    elif symbol_processing == "chars_only":
        # only use chars
        char_phone_mask = [0 for i in range(len(char_phone_mask))]

    # if the phones entry is None, the word was OOV or not recognized
    char_phone_int_seq = [
        int_char_chunks[i] if
        (len(int_phone_chunks[i]) == 0
         or char_phone_mask[i] == 0) else int_phone_chunks[i]
        for i in range(len(int_char_chunks))
    ]
    # check the inverse is ok
    # char_phone_txt = [sequence_to_text(char_phone_int_seq[i], [self.clean_names[char_phone_mask[i]]]) for i in range(len(char_phone_int_seq))]
    # combine into 1 sequence
    cphi = char_phone_int_seq[0]
    cpm = [char_phone_mask[0]] * len(char_phone_int_seq[0])
    if force_char_spc or symbol_processing != "phones_only":
        spc = text_to_sequence(" ", [clean_names[0]])[0]
    else:
        spc = text_to_sequence(" ", [clean_names[1]])[0]
    for i in range(len(char_phone_int_seq[1:])):
        # add space
        cphi += [spc]
        # always treat space as char unless in phones only mode
        if force_char_spc or symbol_processing != "phones_only":
            cpm += [0]
        else:
            cpm += [1]
        cphi += char_phone_int_seq[i + 1]
        cpm += [char_phone_mask[i + 1]] * len(char_phone_int_seq[i + 1])
    # trailing space
    #cphi = cphi + [spc]
    # trailing eos
    cphi = cphi + [1]
    # add trailing symbol
    if symbol_processing != "phones_only":
        cpm += [0]
    else:
        cpm += [1]
    # check inverse
    #cpt = "".join([sequence_to_text([cphi[i]], [self.clean_names[cpm[i]]]) for i in range(len(cphi))])
    #if None in phone_seq_chunk:
    #print("NUN")
    #print(cpt)
    #from IPython import embed; embed(); raise ValueError()
    return cphi, cpm
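
A minimal usage sketch for transform_text (hypothetical call, assuming the
module-level helpers it relies on are available; the exact integer ids depend
on the symbol tables):

    cphi, cpm = transform_text("i am learning english.",
                               auto_pronounce=True,
                               symbol_processing="blended_pref")
    # cphi is a flat list of integer symbol ids (each word encoded as either
    # chars or phones); cpm is the parallel mask: 0 -> the id indexes the
    # character symbol table, 1 -> the id indexes the phone symbol table
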
Example #3
def pronounce_chars(line,
                    raw_line=None,
                    cmu_only=False,
                    int_timing_punct=True):
    # line: character text after english_cleaners processing
    # raw_line: character text before cleaning
    # cleaners strip things...
    puncts = ["!", ",", ":", "?", "."]
    #puncts_timing = ["4","1","1","4", "4"]
    puncts_timing = [" ", " ", " ", " ", " "]
    end_punct = [(ni, pi) for ni, pi in enumerate(puncts) if pi in line]
    if len(end_punct) > 0:
        # preserve the end punctuation...
        if end_punct[-1][1] == line[-1]:
            end_punct = end_punct[-1]
        else:
            end_punct = (0, " ")
    else:
        end_punct = (0, " ")
    line = english_cleaners(line)
    if cmu_only:
        r0 = cmu_g2p(line, raw_line)
        return r0  # everything returned here is CMU (ARPAbet) phones

    # cmu_g2p and hybrid_g2p are assumed to be module-level grapheme-to-phoneme
    # helpers (per the transform_text docstring, pronunciation comes from cmudict)
    r = hybrid_g2p(line)

    if any([p in line for p in puncts]):
        new = []
        psym = r.strip().split(" ")
        lsym = line.strip().split(" ")
        for lss, pss in zip(lsym, psym):
            prev = []
            # punctuation attached to this word (assume only 1)
            which_specials = [p for p in puncts if p in lss]
            for ssi in pss.strip().split("@")[1:]:
                if len(which_specials) > 0:
                    # strip punctuation characters from the phone symbol
                    prev.append(
                        re.sub("[" + re.escape("".join(puncts)) + "]", "", ssi))
                else:
                    prev.append(ssi)
            if len(which_specials) > 0:
                prev.append(which_specials[0])
            new.append(prev)

        merged = ""
        for ii, chunk in enumerate(new):
            if any([p in chunk for p in puncts]):
                mstr = ""
                for ci in chunk:
                    if any([p in ci for p in puncts]):
                        which_specials = [(n, p) for n, p in enumerate(puncts)
                                          if p in ci]
                    else:
                        mstr += "@"
                        mstr += ci
                merged += mstr
                if ii < (len(new) - 1):
                    if not int_timing_punct:
                        merged += which_specials[0][1]
                    else:
                        merged += puncts_timing[which_specials[0][0]]
            else:
                merged += "@"
                merged += "@".join(chunk)
                if ii < (len(new) - 1):
                    merged += " "
        if merged[-1] == " ":
            merged = merged[:-1]
        if not int_timing_punct:
            merged += end_punct[1]
        else:
            merged += puncts_timing[end_punct[0]]
        merged += "~"
        return merged
    else:
        return r
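
A usage sketch for pronounce_chars (hypothetical call; the exact phones depend
on the cmudict/g2p backends assumed above):

    # output uses the "@"-prefixed phone format from the transform_text
    # docstring, e.g. "learning" -> something like "@l@er@n@ih@ng"
    phones = pronounce_chars("learning")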