#!/usr/bin/env python
# Copyright 2018 Nagoya University (Tomoki Hayashi)
# Apache 2.0 (http://www.apache.org/licenses/LICENSE-2.0)
"""Clean each line of a pipe-delimited metadata file.

Reads an LJSpeech-style file whose lines look like
``utt_id|raw_text|normalized_text`` and prints ``utt_id`` followed by the
cleaned normalized text, one utterance per line.
"""
import argparse
import codecs

from cleaners import english_cleaners

if __name__ == "__main__":
    parser = argparse.ArgumentParser()
    parser.add_argument('text', type=str, help='text to be cleaned')
    args = parser.parse_args()
    with codecs.open(args.text, 'r', 'utf-8') as fid:
        # Iterate the file lazily instead of readlines() (no need to load
        # the whole file); `utt_id` avoids shadowing the builtin `id`.
        for line in fid:
            utt_id, _, content = line.split("|")
            clean_content = english_cleaners(content.rstrip())
            print("%s %s" % (utt_id, clean_content))
def transform_text(char_seq, auto_pronounce=True, phone_seq=None, force_char_spc=True, symbol_processing="blended_pref", random_state=None):
    """Convert a text string to integer symbol ids plus a char/phone mask.

    chars format example: "i am learning english."
    phone_seq format example: "@ay @ae@m @l@er@n@ih@ng @ih@ng@g@l@ih@sh"
    (phone_seq formatting can be gotten from text, using the pronounce_chars
    function with 'from text import pronounce_chars'; uses cmudict to do
    pronunciation)

    Parameters
    ----------
    char_seq : str
        Raw character text.
    auto_pronounce : bool
        If True (the only implemented mode), pronunciations are derived
        automatically via ``pronounce_chars``; ``phone_seq`` must be None.
    phone_seq : str or None
        Pre-computed phone text; only meaningful with auto_pronounce=False,
        which is not yet supported.
    force_char_spc : bool
        Always encode the inter-word space with the character symbol table,
        even in "phones_only" mode.
    symbol_processing : str
        One of "chars_only", "phones_only", "blended_pref".
    random_state : numpy RandomState-like or None
        Source of randomness for mixed char/phone selection; defaults to the
        module-level ``lcl_random_state``.

    Returns
    -------
    (cphi, cpm) : (list[int], list[int])
        Flat integer symbol sequence and a parallel mask
        (0 = char symbol, 1 = phone symbol).

    Raises
    ------
    ValueError
        On invalid argument combinations or a char/phone chunk length
        mismatch.
    """
    if random_state is None:
        random_state = lcl_random_state

    if phone_seq is None and auto_pronounce is False and symbol_processing != "chars_only":
        # BUG FIX: previously formatted ``self.symbol_processing``, which is a
        # NameError inside a plain function.
        raise ValueError(
            "phone_seq argument must be provided for iterator with symbol_processing != 'chars_only', currently '{}'"
            .format(symbol_processing))

    clean_char_seq = cleaners.english_cleaners(char_seq)
    char_seq_chunk = clean_char_seq.split(" ")
    dirty_seq_chunk = char_seq.split(" ")

    if auto_pronounce is True:
        if phone_seq is not None:
            raise ValueError(
                "auto_pronounce set to True, but phone_seq was provided! Pass phone_seq=None for auto_pronounce=True"
            )
        # take out specials then put them back...
        specials = "!?.,;:"
        puncts = "!?."
        tsc = []
        for csc in char_seq_chunk:
            broke = False
            for s in specials:
                if s in csc:
                    # strip the first special found; pronounce the bare word
                    tsc.append(csc.replace(s, ""))
                    broke = True
                    break
            if not broke:
                tsc.append(csc)

        if symbol_processing == "blended_pref":
            chunky_phone_seq_chunk = [
                pronounce_chars(w, raw_line=dirty_seq_chunk[ii], cmu_only=True)
                for ii, w in enumerate(tsc)
            ]
            phone_seq_chunk = [
                cpsc[0] if cpsc is not None else None
                for cpsc in chunky_phone_seq_chunk
            ]
        else:
            phone_seq_chunk = [pronounce_chars(w) for w in tsc]

        # re-attach the trailing special to the phone version of each word
        for n in range(len(phone_seq_chunk)):
            for s in specials:
                # guard empty chunks (e.g. char_seq == "") before [-1] access
                if char_seq_chunk[n] and char_seq_chunk[n][-1] == s and phone_seq_chunk[n] is not None:
                    phone_seq_chunk[n] += char_seq_chunk[n][-1]
                    break
    else:
        raise ValueError("Non auto_pronounce setting not yet configured")

    if len(char_seq_chunk) != len(phone_seq_chunk):
        raise ValueError(
            "Char and phone chunking resulted in different lengths {} and {}!\n{}\n{}"
            .format(len(char_seq_chunk), len(phone_seq_chunk),
                    char_seq_chunk, phone_seq_chunk))

    # integer-encode every word in both symbol tables; empty list marks a
    # word without a pronunciation (OOV)
    int_char_chunks = []
    int_phone_chunks = []
    for n in range(len(char_seq_chunk)):
        int_char_chunks.append(
            text_to_sequence(char_seq_chunk[n], [clean_names[0]])[:-1])
        if phone_seq_chunk[n] is None:
            int_phone_chunks.append([])
        else:
            int_phone_chunks.append(
                text_to_sequence(phone_seq_chunk[n], [clean_names[1]])[:-2])

    # default: randomly choose chars (0) or phones (1) per word...
    char_phone_mask = [0] * len(int_char_chunks) + [1] * len(int_phone_chunks)
    random_state.shuffle(char_phone_mask)
    char_phone_mask = char_phone_mask[:len(int_char_chunks)]
    # ...then override the random default for the fixed modes
    if symbol_processing == "blended_pref":
        # prefer phones whenever a pronunciation exists
        char_phone_mask = [
            0 if len(int_phone_chunks[i]) == 0 else 1
            for i in range(len(int_char_chunks))
        ]
    elif symbol_processing == "phones_only":
        # all files should have phones because of earlier preproc...
        char_phone_mask = [1] * len(char_phone_mask)
    elif symbol_processing == "chars_only":
        char_phone_mask = [0] * len(char_phone_mask)

    # if the phones entry is empty, the word was OOV or not recognized ->
    # fall back to chars regardless of the mask
    char_phone_int_seq = [
        int_char_chunks[i]
        if (len(int_phone_chunks[i]) == 0 or char_phone_mask[i] == 0)
        else int_phone_chunks[i]
        for i in range(len(int_char_chunks))
    ]

    # combine into 1 flat sequence with space symbols between words.
    # BUG FIX: both conditions below previously read ``self.symbol_processing``
    # (NameError whenever force_char_spc was False).
    if force_char_spc or symbol_processing != "phones_only":
        spc = text_to_sequence(" ", [clean_names[0]])[0]
    else:
        spc = text_to_sequence(" ", [clean_names[1]])[0]

    cphi = char_phone_int_seq[0]
    cpm = [char_phone_mask[0]] * len(char_phone_int_seq[0])
    for i in range(1, len(char_phone_int_seq)):
        cphi += [spc]
        # always treat space as char unless in phones only mode
        if force_char_spc or symbol_processing != "phones_only":
            cpm += [0]
        else:
            cpm += [1]
        cphi += char_phone_int_seq[i]
        cpm += [char_phone_mask[i]] * len(char_phone_int_seq[i])

    # trailing eos symbol
    cphi = cphi + [1]
    if symbol_processing != "phones_only":
        cpm += [0]
    else:
        cpm += [1]
    return cphi, cpm
def pronounce_chars(line, raw_line=None, cmu_only=False, int_timing_punct=True):
    """Convert a text line into an '@'-delimited phone string.

    Parameters
    ----------
    line : str
        Character text (after english_cleaners processing; it is cleaned
        again here, which is assumed idempotent).
    raw_line : str or None
        The character text before cleaning, passed through to ``cmu_g2p``.
    cmu_only : bool
        If True, return the raw ``cmu_g2p`` result (all symbols are CMU).
    int_timing_punct : bool
        If True, punctuation is replaced by timing symbols (currently plain
        spaces) instead of the literal punctuation characters.

    Returns
    -------
    The ``cmu_g2p`` result when ``cmu_only`` is True; otherwise a phone
    string such as "@hh@ah@l@ow ...~" with punctuation handled as above.
    """
    # cleaners strip things...
    puncts = ["!", ",", ":", "?", "."]
    #puncts_timing = ["4", "1", "1", "4", "4"]
    puncts_timing = [" ", " ", " ", " ", " "]

    # remember a sentence-final punctuation mark (index into puncts, char)
    end_punct = [(ni, pi) for ni, pi in enumerate(puncts) if pi in line]
    if len(end_punct) > 0 and end_punct[-1][1] == line[-1]:
        # preserve the end punctuation...
        end_punct = end_punct[-1]
    else:
        end_punct = (0, " ")

    line = english_cleaners(line)
    if cmu_only:
        # everything returned here is CMU phones
        return cmu_g2p(line, raw_line)

    r = hybrid_g2p(line)
    if not any(p in line for p in puncts):
        return r

    # per word: strip punctuation from the phone symbols, but keep the
    # punctuation character as a separate trailing element
    new = []
    psym = r.strip().split(" ")
    lsym = line.strip().split(" ")
    # BUG FIX: build a real alternation of escaped puncts;
    # re.escape("|".join(puncts)) escaped the '|' separators too, producing a
    # pattern for one literal string that effectively never matched.
    punct_pattern = "|".join(re.escape(p) for p in puncts)
    for lss, pss in zip(lsym, psym):
        # BUG FIX: computed once per word (it only depends on lss) and never
        # leaks from the previous word when pss has no phone symbols
        which_specials = [p for p in puncts if p in lss]
        prev = []
        for ssi in pss.strip().split("@")[1:]:
            if which_specials:
                prev.append(re.sub(punct_pattern, "", ssi))
                # ASSUME ONLY 1?
            else:
                prev.append(ssi)
        if len(which_specials) > 0:
            prev.append(which_specials[0])
        new.append(prev)

    # merge words back into one string, turning bare punctuation elements
    # into timing symbols (or the literal punctuation)
    merged = ""
    for ii, chunk in enumerate(new):
        if any(p in chunk for p in puncts):
            # chunk ends with a bare punctuation element; emit the phone
            # symbols, then the punctuation/timing mark
            mstr = ""
            punct_hit = (0, " ")
            for ci in chunk:
                hits = [(n, p) for n, p in enumerate(puncts) if p in ci]
                if hits:
                    punct_hit = hits[0]
                else:
                    mstr += "@"
                    mstr += ci
            merged += mstr
            if ii < (len(new) - 1):
                if not int_timing_punct:
                    merged += punct_hit[1]
                else:
                    merged += puncts_timing[punct_hit[0]]
        else:
            merged += "@"
            merged += "@".join(chunk)
            if ii < (len(new) - 1):
                merged += " "

    if merged and merged[-1] == " ":
        merged = merged[:-1]
    if not int_timing_punct:
        merged += end_punct[1]
    else:
        merged += puncts_timing[end_punct[0]]
    merged += "~"
    return merged