Example No. 1
class PhoneticFeature(Feature):
    # grapheme-to-phoneme converter, shared across instances
    g2p = G2p()

    @classmethod
    def compute_feature(cls, HL: Headline) -> np.ndarray:
        # the replaced word and its replacement
        words = [HL.sentence[HL.word_index], HL.edit]
        # transcribe each token to ARPABET
        phones = [" ".join(cls.g2p(w.lower())) for w in words]
        # Levenshtein distance between the two pronunciations
        levenshtein_dist = StringMatcher.distance(*phones)
        # scale by the longer transcription so the feature lies in [0, 1]
        scale_factor = max(len(w) for w in phones)
        scaled_dist = levenshtein_dist / scale_factor
        return np.array([scaled_dist])
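The feature above is easiest to sanity-check in isolation. A minimal sketch, assuming only `g2p_en` and the `python-Levenshtein` package (used here as a stand-in for `StringMatcher`) and a hypothetical replaced/replacement word pair:

from g2p_en import G2p
from Levenshtein import distance  # stand-in for StringMatcher.distance

g2p = G2p()
# hypothetical word pair: original headline word vs. its edit
phones = [" ".join(g2p(w.lower())) for w in ("cat", "bat")]
dist = distance(*phones)                     # edit distance between the two pronunciations
scaled = dist / max(len(p) for p in phones)  # normalize by the longer transcription
print(phones, scaled)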
Example No. 2
def g2p(x):
    from g2p_en import G2p

    global _g2p
    if _g2p is None:
        _g2p = G2p()
    return _g2p(x)
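Usage sketch for the lazy wrapper above; it assumes the module defines a `_g2p = None` global before the function is first called:

from g2p_en import G2p

_g2p = None  # module-level cache; the model is built on first use only

def g2p(x):
    global _g2p
    if _g2p is None:
        _g2p = G2p()
    return _g2p(x)

print(g2p("hello"))  # e.g. ['HH', 'AH0', 'L', 'OW1']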
Example No. 3
def load_phonemes(word, pho_to_words):
    from g2p_en import G2p
    g2p = G2p()

    for p in g2p(word):
        p = ''.join([c for c in p if not c.isdigit()])
        add_word_to_phoneme(p, word, pho_to_words)
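`pho_to_words` and `add_word_to_phoneme` are not shown in this snippet; a plausible reading is an inverted index from stress-stripped phonemes to the words that contain them. A hypothetical sketch of that helper and of calling `load_phonemes`:

def add_word_to_phoneme(phoneme, word, pho_to_words):
    # hypothetical helper: record that `word` contains `phoneme`
    pho_to_words.setdefault(phoneme, set()).add(word)

pho_to_words = {}
load_phonemes("water", pho_to_words)
# e.g. {'W': {'water'}, 'AO': {'water'}, 'T': {'water'}, 'ER': {'water'}}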
Example No. 4
def main():
    parser = argparse.ArgumentParser()
    parser.add_argument(
        "--compact",
        action="store_true",
        help="if set, compacts phones",
    )
    args = parser.parse_args()

    compact = args.compact

    wrd_to_phn = {}
    g2p = G2p()
    for line in sys.stdin:
        words = line.strip().split()
        phones = []
        for w in words:
            if w not in wrd_to_phn:
                wrd_to_phn[w] = g2p(w)
                if compact:
                    wrd_to_phn[w] = [
                        p[:-1] if p[-1].isnumeric() else p
                        for p in wrd_to_phn[w]
                    ]
            phones.extend(wrd_to_phn[w])
        try:
            print(" ".join(phones))
        except:
            print(wrd_to_phn, words, phones, file=sys.stderr)
            raise
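For reference, `--compact` only strips the trailing stress digit from each phone; a small illustration with a hypothetical g2p output list:

phones = ['HH', 'AH0', 'L', 'OW1', ' ', 'W', 'ER1', 'L', 'D']  # example output shape
compact = [p[:-1] if p[-1].isnumeric() else p for p in phones]
print(compact)  # ['HH', 'AH', 'L', 'OW', ' ', 'W', 'ER', 'L', 'D']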
Example No. 5
def make_arpabet(text):
    # g2p functions
    g2p = G2p()

    # Define punctuation (kept out of the curly-brace phone groups) and build a
    # replacement dictionary that removes the space g2p inserts before each mark
    punc = list("!?,.;:␤#-_'\"()[]\n,.")
    punc_dict = {" " + mark: mark for mark in punc}

    # Text processing
    text = " ".join(g2p(english_cleaners(text))).split("  ")
    outlist = []
    for item in text:
        item = item.strip()
        if item not in punc:
            item = "{" + item + "}"
        outlist.append(item)
    text = " ".join(outlist)
    for key, replacement in punc_dict.items():
        text = text.replace(key, replacement)
    return text
Example No. 6
def get_phonemes(text):
    def strip_stress(p):
        # drop the trailing stress digit, e.g. 'ah0' -> 'ah'
        return ''.join(c for c in p if not c.isdigit())

    def to_dict():
        phn_dic = {}
        for tag in phonemes:
            if tag.split(' ')[0] == 'q':
                continue
            phn_dic[strip_stress(tag.split(' ')[0])] = strip_stress(tag.split(' ')[-1])
        return phn_dic

    def to_list():
        phn_list = [strip_stress(strTag.split(' ')[-1]) for strTag in phonemes]
        return sorted(set(phn_list))  # sorted so label indices are deterministic

    g2p = G2p()
    phonemes = ['h#']  # h# : start token
    # transcribe the input text
    phonemes.extend(strPhoneme.lower() for strPhoneme in g2p(text))
    phonemes.append('h#')  # h# : end token
    labels = []
    for label in phonemes:
        if label in ['q', ' ', "'"]:
            continue
        label = strip_stress(label)
        labels.append(to_list().index(to_dict()[label]) + 1)
    return numpy.array(labels)
Example No. 7
def process_english(text):
    text = text.rstrip(punctuation)
    lexicon = read_lexicon('mytext/lexicon/librispeech-lexicon.txt')

    g2p = G2p()
    phones = []
    words = re.split(r"([,;.\-\?\!\s+])", text)
    for w in words:
        if w.lower() in lexicon:
            phones += lexicon[w.lower()]
        else:
            phones += list(filter(lambda p: p != " ", g2p(w)))
    phones = "{" + "}{".join(phones) + "}"
    phones = re.sub(r"\{[^\w\s]?\}", "{sp}", phones)
    phones = phones.replace("}{", " ")
   
    sequence = np.array(
        phone_to_sequence(
            phones, ["english_cleaners"]
        )
    )
    
    #print("Raw Text Sequence: {}".format(text))
    #print("Phoneme Sequence: {}".format(phones))
    #print(sequence)

    return phones, sequence
Example No. 8
    def phone_synthesize_solution(self, text):
        """
        Synthesize speech from text by concatenating phonemes
        selected from the database
        """
        # Convert all words to lower case
        words = [word.lower() for word in text.split()]
        # cmudict.dict() is expensive to build, so do it (and the g2p fallback) once
        cmu = cmudict.dict()
        g2p = G2p()
        phones = []
        for word in words:
            try:
                # Use cmudict to get the phonemic representation
                phones.extend(cmu[word][0])
            except (KeyError, IndexError):
                # If the word is not in the dictionary, fall back to g2p
                phones.extend(g2p(word))
        print(phones)

        # Initialize an empty audio segment
        result = AudioSegment.empty()
        # Concatenate phonemes selected from PHONEMES_DIR
        for phone in phones:
            # Strip the stress digit
            phone = phone[0:-1] if phone[-1].isdigit() else phone
            # Look up phoneme wav file using phone_map
            sound_label = phone_map[phone]
            sound_path = PHONEMES_DIR + str(sound_label) + ".wav"
            audio = AudioSegment.from_wav(sound_path)
            result += audio
        # Write the synthesized .wav file to DST_DIR
        result.export(DST_DIR + "gen.wav", format="wav")
        play(result)
Example No. 9
def preprocess_english(
    texts: List[str],
    preprocess_config,
) -> List[np.ndarray]:
    # load the lexicon and build the g2p model once, not per input text
    lexicon = read_lexicon(preprocess_config["path"]["lexicon_path"])
    g2p = G2p()

    sequences = []
    for text in texts:
        text = text.rstrip(punctuation)
        phones = []
        words = re.split(r"([,;.\-\?\!\s+])", text)
        words = filter(lambda x: x != " ", words)

        for w in words:
            if w.lower() in lexicon:
                phones += lexicon[w.lower()]
            else:
                phones += g2p(w)
        phones = "{" + "}{".join(phones) + "}"
        phones = re.sub(r"\{[^\w\s]?\}", "{sp}", phones)
        phones = phones.replace("}{", " ")

        print("Raw Text Sequence: {}".format(text))
        print("Phoneme Sequence: {}".format(phones))
        sequence = np.array(
            text_to_sequence(
                phones,
                preprocess_config["preprocessing"]["text"]["text_cleaners"]))

        sequences.append(sequence)
    return sequences
Example No. 10
def main():
    parser = argparse.ArgumentParser()
    parser.add_argument("root_dirs", nargs="*")
    parser.add_argument("--insert-silence", "-s", action="store_true")
    args = parser.parse_args()
    sil = "<s>"

    wrd_to_phn = {}
    g2p = G2p()
    for line in sys.stdin:
        words = line.strip().split()
        phones = []
        if args.insert_silence:
            phones.append(sil)
        for w in words:
            if w not in wrd_to_phn:
                wrd_to_phn[w] = g2p(w)
            phones.extend(wrd_to_phn[w])
            if args.insert_silence:
                phones.append(sil)
        try:
            print(" ".join(phones))
        except:
            print(wrd_to_phn, w, phones, file=sys.stderr)
            raise
Example No. 11
def run(args):
    punc = '!?,;' # punctuation that we want to keep when no_punc is False
    g2p = G2p()
    full_set = set()
    with open(args.src,'r') as f:
        f_lines = f.readlines()
    with open(args.out,'w') as f_out:
        f_out.write(SEP + 'phn_seq\n')
        for line in tqdm(f_lines):
            idx = line.split('|')[0]
            line = line.split('|')[-1].replace('--','')
            line_origin = line
            line = re.sub('[:\"\-()]', '', line)  # strip : " - ( )
            if not args.no_punc:
                for token in punc:
                    line = line.replace(token, '.')
            else:
                line = re.sub('[!?,.;]', '', line) # ! ? , . ;
            try:
                phn = [phoneme_reduce_mapping[remove_num(phn.lower())] for phn in g2p(line)]
            except KeyError:
                # report the offending line, then abort
                print(line_origin)
                print(line)
                raise
            full_set = full_set | set(phn)
            f_out.write(idx + SEP + ' '.join(phn) + '\n')
    print("Total {} phonemes used.".format(len(full_set)))
Example No. 12
def build_phonetic_form_and_save_lyrics(
        artist_lyrics_map, author_word_to_phonetic_form_dictionary,
        author_word_list):
    artist_songs_phonetic_form_map = {}
    g2p = G2p()  # build the g2p model once, not once per song
    for lyrics_filename, lyrics_text in artist_lyrics_map.items():
        phonetic_form_of_lyrics = []
        lyrics_lines = lyrics_text.split("\n")
        for line in lyrics_lines:
            is_verse_line = "<verse_start>" in line or "<verse_end>" in line
            if is_verse_line:
                phonetic_form_of_lyrics.append(line)
                if "<verse_start>" in line:
                    author_word_list.append("<verse_start>")
                else:
                    author_word_list.append("<verse_end>")
                continue
            phonetic_form_of_line = build_phonetic_form_of_line(
                g2p, line, author_word_to_phonetic_form_dictionary,
                author_word_list)
            phonetic_form_of_lyrics.append(phonetic_form_of_line)
        artist_songs_phonetic_form_map[
            lyrics_filename] = make_text_lyrics_of_list_of_lines(
                phonetic_form_of_lyrics)

    return artist_songs_phonetic_form_map
Example No. 13
def run(args):
    punc = '!?,;'  # punctuation that we want to keep
    spkr_dict = dict()
    g2p = G2p()
    full_set = set()
    with open(args.src, 'r') as f:
        f_lines = f.readlines()
    with open(args.out, 'w') as f_out:
        f_out.write(SEP + 'phn_seq\n')
        for line in tqdm(f_lines):
            idx = line.split('|')[0]
            spkr = line.split('|')[1]

            if spkr not in spkr_dict:
                spkr_dict[spkr] = len(spkr_dict)

            line = line.split('|')[-1]
            line = re.sub('[:\"\-()]', '', line)  # strip : " - ( )
            if not args.no_punc:
                for token in punc:
                    line = line.replace(token, '.')
            else:
                line = re.sub('[!?,.;]', '', line)  # ! ? , . ;
            phn = [
                phoneme_reduce_mapping[remove_num(phn.lower())]
                for phn in g2p(line)
            ]
            full_set = full_set | set(phn)
            f_out.write(idx + SEP + ' '.join(phn) + '\n')
    print("Total {} phonemes used, {} speakers used.".format(
        len(full_set), len(spkr_dict)))

    os.makedirs(dirname(args.out_spkr_dict), exist_ok=True)
    json.dump(spkr_dict, open(args.out_spkr_dict, 'w'), indent=2)
Example No. 14
def list_words(*phonemes):
    from g2p_en import G2p

    with open('phoneme_samples.csv') as f:
        reader = csv.DictReader(f, delimiter=',')

        rows = [row for row in reader]

    if len(phonemes) == 0:
        for row in rows:
            print(row['word'])
        return

    g2p = G2p()  # build once; it is reused for every phoneme and row

    for phoneme in phonemes:
        if len(phonemes) > 1:
            print(phoneme)

        for row in rows:
            pho = row['phoneme']
            word = row['word']

            phos = g2p(word)

            doc = f"{word} [{' '.join(phos)}]"

            if pho == phoneme:
                if len(phonemes) == 1:
                    print(doc)
                else:
                    print(f'\t{doc}')
Example No. 15
def main():
    args = get_parser().parse_args()
    logger.debug(f"Args: {args}")
    
    ref_uid_to_tra = load_tra(args.ref_tra)
    hyp_uid_to_tra = load_tra(args.hyp_tra)
    assert not bool(set(hyp_uid_to_tra.keys()) - set(ref_uid_to_tra.keys()))

    lm = kenlm.Model(args.kenlm_path)
    skipwords = set(args.skipwords.split(","))
    def compute_lm_score(s):
        s = " ".join(w for w in s.split() if w not in skipwords)
        s = s.upper() if args.uppercase else s
        return lm.score(s)

    g2p, g2p_dict = None, None
    if args.phonemize:
        if args.phonemize_lexicon:
            g2p_dict = load_lex(args.phonemize_lexicon)
        else:
            g2p = G2p()

    wer = compute_wer(ref_uid_to_tra, hyp_uid_to_tra, g2p, g2p_dict)
    lm_ppl = compute_lm_ppl(hyp_uid_to_tra, compute_lm_score)
    
    gt_wer = -math.inf
    if args.gt_tra:
        gt_uid_to_tra = load_tra(args.gt_tra)
        gt_wer = compute_wer(gt_uid_to_tra, hyp_uid_to_tra, None, None)

    score = math.log(lm_ppl) * max(wer, args.min_vt_uer)
    logging.info(f"{args.hyp_tra}: score={score:.4f}; wer={wer*100:.2f}%; lm_ppl={lm_ppl:.4f}; gt_wer={gt_wer*100:.2f}%")
Example No. 16
    def __init__(self, hp, split='train'):
        self.hp = hp
        self.split = split
        self.data_files = self._get_data_files(hp.dataset, hp.data_dir,
                                                hp.data_file)
        self.mel_matrix = librosa.filters.mel(sr=22050, n_fft=1024, n_mels=80)
        self.g2p_en = G2p()
Example No. 17
class Text2machineSeq:
    from g2p_en import G2p
    from text import symbols
    from string import punctuation
    from text import text_to_sequence
    g2p = G2p()

    def __init__(self, lexicon_path, text_cleaners, useG2p=True):
        self.useG2p = useG2p
        self.lexicon = self.read_lexicon(lexicon_path)
        self.text_cleaners = text_cleaners

    @staticmethod
    def getSymbols():
        print(f"nText2machineSeq.symbols: {len(Text2machineSeq.symbols)}")
        return Text2machineSeq.symbols

    @staticmethod
    def read_lexicon(lex_path):
        lexicon = {}
        with open(getPath(lex_path)) as f:
            for line in f:
                temp = re.split(r"\s+", line.strip("\n"))
                word = temp[0]
                phones = temp[1:]
                if word.lower() not in lexicon:
                    lexicon[word.lower()] = phones
        return lexicon

    def text2seq(self, text, verbose=False):
        useG2p = self.useG2p
        lexicon = self.lexicon
        g2p = self.g2p
        phones = []
        text = text.rstrip(Text2machineSeq.punctuation)
        if useG2p:
            words = re.split(r"([,;.\-\?\!\s+])", text)
            for w in words:
                if w.lower() in lexicon:
                    phones += lexicon[w.lower()]
                else:
                    phones += list(filter(lambda p: p != " ", g2p(w)))
            phones = "{" + "}{".join(phones) + "}"
            phones = re.sub(r"\{[^\w\s]?\}", "{sp}", phones)
            phones = phones.replace("}{", " ")
        else:
            phones = text
        sequence = np.array(
            Text2machineSeq.text_to_sequence(phones, self.text_cleaners))
        if verbose:
            print(f"Raw Text Sequence [{int(useG2p)}]: {text}")
            print(f"Phoneme Sequence  [{int(useG2p)}]: {phones}")
            print(f"Machine Language  [{int(useG2p)}]: {sequence}")
            if useG2p:
                phoneList = replaces(phones, '{', '', '}', '').split()
                assert len(phoneList) == len(sequence)
                p2s = [f'{p}-{s}' for p, s in zip(phoneList, sequence)]
                print(f"phone2machine     [{int(useG2p)}]: {' '.join(p2s)}")
        return sequence
Example No. 18
    def processtxtph(self, intxt):
        g2p = G2p()
        ptext = self._clean_text(intxt, [self.cleaner_names])
        phs = _g2p2synth(g2p(ptext))

        arpatxt = " ".join(phs)
        ids = self._arpabet_to_sequence(arpatxt)

        return ids, arpatxt
Example No. 19
    def __init__(self, nltk_data_directory: Path):
        # workaround for https://github.com/Kyubyong/g2p/issues/12
        nltk_data_directory.mkdir(exist_ok=True, parents=True)
        nltk.download("averaged_perceptron_tagger", download_dir=nltk_data_directory)
        nltk.download("cmudict", download_dir=nltk_data_directory)
        nltk.download("punkt", download_dir=nltk_data_directory)
        nltk.data.path.append(nltk_data_directory.resolve())
        from g2p_en import G2p
        self._g2p = G2p()
Example No. 20
def grapheme_to_phoneme(text):
    """Converts prapheme to phoneme with punctuation"""
    g2p = G2p()
    phones = []
    words = filter(None, re.split(r"([,:;.\(\)\'\-\?\!\s+])", text))
    for w in words:
        if w in punctuation:
            phones += [w]
        else:
            phones += list(filter(lambda p: p != " ", g2p(w)))
    return phones
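A quick, hedged usage example for the function above, assuming `punctuation` is `string.punctuation` as in the other snippets (the exact phone sequence depends on the g2p_en version):

print(grapheme_to_phoneme("Hello, world!"))
# e.g. ['HH', 'AH0', 'L', 'OW1', ',', 'W', 'ER1', 'L', 'D', '!']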
Example No. 21
def get_ARPABET_phonetic_transcription(word_list):
    """
    :param word_list (list): List of words to encode with ARPABET phonetic transcription
    :return arpabet_word_list (list): List of lists of encoded phonemes
    """
    g2p = G2p()
    arpabet_word_list = []
    for word in word_list:
        transcription = g2p(word)
        arpabet_word_list.append(transcription)

    return arpabet_word_list
Example No. 22
class TextProcessor:

    phonemes = [
        'AA0', 'AA1', 'AA2', 'AE0', 'AE1', 'AE2', 'AH0', 'AH1', 'AH2', 'AO0',
        'AO1', 'AO2', 'AW0', 'AW1', 'AW2', 'AY0', 'AY1', 'AY2',
        'B', 'CH', 'D', 'DH',
        'EH0', 'EH1', 'EH2', 'ER0', 'ER1', 'ER2', 'EY0', 'EY1', 'EY2',
        'F', 'G', 'HH',
        'IH0', 'IH1', 'IH2', 'IY0', 'IY1', 'IY2',
        'JH', 'K', 'L', 'M', 'N', 'NG',
        'OW0', 'OW1', 'OW2', 'OY0', 'OY1', 'OY2',
        'P', 'R', 'S', 'SH', 'T', 'TH',
        'UH0', 'UH1', 'UH2', 'UW', 'UW0', 'UW1', 'UW2',
        'V', 'W', 'Y', 'Z', 'ZH'
    ]
    g2p = G2p()

    def __init__(self, hparams):
        self.units = self.graphemes = hparams.graphemes
        # self.phonemes = hparams.phonemes
        self.phonemize = hparams.use_phonemes
        if self.phonemize:
            self.units = self.phonemes
        self.specials = hparams.specials
        self.punctuations = hparams.punctuations
        self.units = self.specials + self.units + self.punctuations
        self.txt2idx = {txt: idx for idx, txt in enumerate(self.units)}
        self.idx2txt = {idx: txt for idx, txt in enumerate(self.units)}

    def normalize(self, text):
        text = text.lower()
        text = re.sub("[ ]+", " ", text)
        # keep_re = "[^" + str(self.graphemes+self.punctuations) +"]"
        # text = re.sub(keep_re, " ", text)  # remove
        text = [ch if ch in self.graphemes + self.punctuations else ' ' for ch in text]
        if self.phonemize:
            text = self.g2p(''.join(text))
        return text

    def __call__(self, texts, max_n=None):
        if not isinstance(texts, (str, list)):
            raise TypeError("Inputs must be str or list(str)")
        if isinstance(texts, str):
            texts = [texts]
        normalized_texts = [self.normalize(line) for line in texts]  # text normalization
        tlens = [len(l) for l in normalized_texts]
        max_n = max_n or max(tlens)
        texts = np.zeros((len(normalized_texts), max_n), np.int64)  # np.long is not available in all NumPy versions
        for i, text in enumerate(normalized_texts):
            texts[i, :len(text)] = [self.txt2idx.get(ch, 1) for ch in text]
        return texts, tlens
Example No. 23
def process_sents(sents, args):
    g2p = G2p()
    out_sents = []
    res_wrds = load_reserve_word(args.reserve_word)
    for sent in sents:
        col1 = ""
        if args.reserve_first_column:
            col1, sent = sent.split(None, 1)
        sent = process_sent(sent, g2p, res_wrds, args)
        if args.reserve_first_column and col1 != "":
            sent = f"{col1} {sent}"
        out_sents.append(sent)
    return out_sents
Example No. 24
def preprocess(text):
    g2p = G2p()
    phone = g2p(text)
    phone = list(filter(lambda p: p != ' ', phone))
    phone = '{'+ '}{'.join(phone) + '}' # '{A}{B}{$}{C}', $ represents silent phones
    phone = re.sub(r'\{[^\w\s]?\}', ' ', phone)
    phone = phone.replace('}{', ' ')
    
    print('|' + phone + '|')    
    sequence = np.array(text_to_sequence(phone, hp.text_cleaners))
    sequence = np.stack([sequence])

    return torch.from_numpy(sequence).long().to(device)
Example No. 25
    def __init__(self):
        self.g2p = G2p()

        extra_syms = [' ']
        self.ph2id = self.g2p.p2idx.copy()
        self.id2ph = self.g2p.idx2p.copy()
        for sym in extra_syms:
            self.ph2id[sym] = len(self.id2ph)
            self.id2ph[len(self.id2ph)] = sym

        self.start_token = '<s>'
        self.end_token = '</s>'
        self.pad_token = '<pad>'
        self.unk_token = '<unk>'
Example No. 26
def Say(text):
	#converts text to phonemes
	if(text == 'QUIT'):
		exit()
	g2p = G2p()
	out = g2p(text)

	#identify sounds from phoneme name

	output = AudioSegment.silent(duration=100)

	for i, pho in enumerate(out):
		if (pho == 'HH'):
			pho = 'H'
		elif (pho == 'NX'):
			pho = 'NG'
		elif (pho == 'TH'):
			pho = 'DH'

		# strip the trailing stress digit (e.g. AH0 -> AH)
		if not pho[-1].isalpha():
			pho = pho[:-1]

		if out[i].isspace() or out[i] in ('', "'", '-', '.', ',', '!', '?'):
			audio = AudioSegment.silent(duration=300)
			audio = audio.fade_in(duration=300).fade_out(duration=300)
			output = output.append(audio, crossfade=10)
		else:
			phonemes[pho]= phonemes[pho].fade_in(duration=25)
			phonemes[pho] = phonemes[pho].fade_out(duration=25)
			#phonemes[pho] = normalize(phonemes[pho])
			#print(len(phonemes[pho]))
			try:
				output = output.append(phonemes[pho], crossfade=95)
				#print("Crossfade completed")
			except:
				try:
					output = output.append(phonemes[pho], crossfade=25)
					print(pho)
					print(str(len(output)) + " | " + str(len(phonemes[pho])))
				except:
					output = output.append(phonemes[pho], crossfade=0)
					print("last resort: " + pho)
			output = output.append(AudioSegment.silent(duration=10), crossfade=0)

	output += AudioSegment.silent(duration=300)
	output = normalize(output)
	output = output.set_frame_rate(44100)  # set_frame_rate returns a new segment

	play(output)
	print(text)
Example No. 27
def preprocess(text):
    text = text.rstrip(punctuation)

    g2p = G2p()
    phone = g2p(text)
    phone = list(filter(lambda p: p != ' ', phone))
    phone = '{' + '}{'.join(phone) + '}'
    phone = re.sub(r'\{[^\w\s]?\}', '{sp}', phone)
    phone = phone.replace('}{', ' ')

    print('|' + phone + '|')
    sequence = np.array(text_to_sequence(phone, hp.text_cleaners))
    sequence = np.stack([sequence])

    return torch.from_numpy(sequence).long().to(device)
Example No. 28
    def __init__(self):
        # TODO Move this into a config File, give option of different models
        self.trans_type = "phn"
        dict_path = "/home/ntrusse2/espnet/downloads/en/fastspeech/data/lang_1phn/phn_train_no_dev_units.txt"
        model_path = "/home/ntrusse2/espnet/downloads/en/fastspeech/exp/phn_train_no_dev_pytorch_train_tacotron2.v3_fastspeech.v4.single/results/model.last1.avg.best"
        vocoder_path = "/home/ntrusse2/espnet/downloads/en/parallel_wavegan/ljspeech.parallel_wavegan.v2/checkpoint-400000steps.pkl"
        vocoder_conf = "/home/ntrusse2/espnet/downloads/en/parallel_wavegan/ljspeech.parallel_wavegan.v2/config.yml"

        # Copied right out of the examples on ESPNETs DEMO
        self.device = torch.device("cuda")
        print("Loading Torch Model...")
        self.idim, odim, train_args = get_model_conf(model_path)
        model_class = dynamic_import(train_args.model_module)
        model = model_class(self.idim, odim, train_args)
        torch_load(model_path, model)
        self.model = model.eval().to(self.device)
        self.inference_args = Namespace(
            **{
                "threshold": 0.5,
                "minlenratio": 0.0,
                "maxlenratio": 10.0,
                "use_attention_constraint": True,
                "backward_window": 1,
                "forward_window": 3,
            })

        print("Loading Vocoder...")
        with open(vocoder_conf) as f:
            self.config = yaml.load(f, Loader=yaml.Loader)
        vocoder_class = self.config.get("generator_type",
                                        "ParallelWaveGANGenerator")
        vocoder = getattr(parallel_wavegan.models,
                          vocoder_class)(**self.config["generator_params"])
        vocoder.load_state_dict(
            torch.load(vocoder_path, map_location="cpu")["model"]["generator"])
        vocoder.remove_weight_norm()
        self.vocoder = vocoder.eval().to(self.device)

        print("Loading Text Frontend...")
        with open(dict_path) as f:
            lines = f.readlines()
        lines = [line.replace("\n", "").split(" ") for line in lines]
        self.char_to_id = {c: int(i) for c, i in lines}
        self.g2p = G2p()

        self.pad_fn = torch.nn.ReplicationPad1d(
            self.config["generator_params"].get("aux_context_window", 0))
        self.use_noise_input = vocoder_class == "ParallelWaveGANGenerator"
Example No. 29
    def phonemes_encoding(sources,
                          add_sos_eos_pad_tokens=True,
                          idx_to_phonemes=None,
                          phonemes_to_idx=None,
                          **kwargs):
        '''
    Encodes given sources into numerical vectors

    Params:
      * sources : list of str
      * sos_tok (optional) : str
      * eos_tok (optional) : str
      * pad_tok (optional) : str
      * idx_to_phonemes (optional) : list of str
      * phonemes_to_idx (optional) : dict

    Returns:
      sources_encoded, idx_to_phonemes, phonemes_to_idx : list of list of int, list of str, dict
    '''
        if add_sos_eos_pad_tokens:
            sos_tok = kwargs.get('sos_tok', '<sos>')
            eos_tok = kwargs.get('eos_tok', '<eos>')
            pad_tok = kwargs.get('pad_tok', '<pad>')

        g2p = G2p()
        converted_sources = [g2p(s.lower()) for s in tqdm(sources)]

        if idx_to_phonemes is None or phonemes_to_idx is None:
            phonemes = list(
                sorted(set([p for s in converted_sources for p in s])))
            idx_to_phonemes = [
                sos_tok, eos_tok, pad_tok
            ] + phonemes if add_sos_eos_pad_tokens else phonemes
            phonemes_to_idx = {p: i for i, p in enumerate(idx_to_phonemes)}

        sources_encoded = [[phonemes_to_idx[p] for p in s]
                           for s in converted_sources]

        if add_sos_eos_pad_tokens:
            sources_encoded = Data.add_sos_eos_tokens(sources_encoded,
                                                      phonemes_to_idx,
                                                      sos_tok=sos_tok,
                                                      eos_tok=eos_tok)

        return sources_encoded, idx_to_phonemes, phonemes_to_idx
Example No. 30
class TextProcessor:

    g2p = G2p()

    def __init__(self, hparams):
        self.units = self.graphemes = hparams.graphemes
        self.phonemes = hparams.phonemes
        self.phonemize = hparams.use_phonemes
        if self.phonemize:
            self.units = self.phonemes
        self.specials = hparams.specials
        self.punctuations = hparams.punctuations
        self.units = self.specials + self.units + self.punctuations
        self.txt2idx = {txt: idx for idx, txt in enumerate(self.units)}
        self.idx2txt = {idx: txt for idx, txt in enumerate(self.units)}

    def normalize(self, text):
        text = text.lower()
        text = re.sub("[ ]+", " ", text)
        # keep_re = "[^" + str(self.graphemes+self.punctuations) +"]"
        # text = re.sub(keep_re, " ", text)  # remove
        text = [
            ch if ch in self.graphemes + self.punctuations else ' '
            for ch in text
        ]
        if self.phonemize:
            text = self.g2p(''.join(text))
        return text

    def __call__(self, texts, max_n=None):
        if not isinstance(texts, (str, list)):
            raise TypeError("Inputs must be str or list(str)")
        if isinstance(texts, str):
            texts = [texts]
        normalized_texts = [self.normalize(line)
                            for line in texts]  # text normalization
        tlens = [len(l) for l in normalized_texts]
        max_n = max_n or max(tlens)
        texts = np.zeros((len(normalized_texts), max_n), np.int64)  # np.long is not available in all NumPy versions
        for i, text in enumerate(normalized_texts):
            texts[i, :len(text)] = [self.txt2idx.get(ch, 1) for ch in text]
        return texts, tlens