def inference(self, sentence, output_path):
    """Synthesize speech for a raw input sentence.

    Parameters
    ----------
    sentence : str
        Raw input sentence (Korean, given the kor_preprocess pipeline).
    output_path : str
        Unused by this implementation; kept for interface compatibility.
        # NOTE(review): synthesize() presumably decides the output location —
        # confirm whether output_path should be forwarded to it.
    """
    # Convert the sentence into a phoneme-id tensor for the model.
    text = kor_preprocess(sentence)
    # Removed an unused local `g2p = G2p()` — it was constructed and never used.
    # The checkpoint step in the prefix is hard-coded to 200000.
    synthesize(self.model, self.vocoder, text, sentence,
               prefix='step_{}'.format(200000))
def convert_g2pk_scripts_pandas(scripts):
    """Phonemize the 'x' column of a DataFrame with g2pk.

    Parameters
    ----------
    scripts : pandas.DataFrame
        Must contain columns 'x' (raw text) and 'y' (label).

    Returns
    -------
    list[list]
        One [phonemized_text, label] pair per input row, in row order.
    """
    g2p = G2p()  # construct once; G2p instantiation is expensive
    g2pk_scripts = []
    # total= lets tqdm show a real progress bar for the iterrows generator.
    for _, row in tqdm(scripts.iterrows(), total=len(scripts)):
        # Default g2p conversion only. Earlier experiments with
        # descriptive=True / group_vowels=True / to_syl=False were removed
        # as dead commented-out code.
        g2pk_scripts.append([g2p(row['x']), row['y']])
    return g2pk_scripts
def convert_g2pk_scripts(scripts):
    """Phonemize a sequence of (text, label) records with g2pk.

    Parameters
    ----------
    scripts : iterable
        Each item is indexable with item[0] == raw text, item[1] == label.
        (Indexing is kept instead of tuple unpacking in case items carry
        extra fields.)

    Returns
    -------
    list[list]
        One [phonemized_text, label] pair per input item, in order.
    """
    g2p = G2p()  # construct once; G2p instantiation is expensive
    # Default g2p conversion only. Earlier experiments with
    # descriptive=True / group_vowels=True / to_syl=False were removed
    # as dead commented-out code. The unused enumerate index is gone too.
    return [[g2p(item[0]), item[1]] for item in scripts]
def kor_preprocess(text):
    """Convert a Korean sentence into a batched phoneme-id LongTensor.

    Pipeline: strip trailing punctuation -> g2p phonemization ->
    hangul-to-jamo decomposition -> brace-wrap each phone ->
    map lone punctuation phones to the {sp} (pause) token ->
    space-join -> text_to_sequence -> tensor on `device`.

    The print() calls are debugging output left in by the author.
    """
    # Remove trailing punctuation so it is not phonemized.
    text = text.rstrip(punctuation)
    g2p=G2p()
    phone = g2p(text)
    print('after g2p: ',phone)
    # Decompose hangul syllables into individual jamo characters.
    phone = h2j(phone)
    print('after h2j: ',phone)
    # Drop literal spaces; word boundaries are reintroduced below.
    phone = list(filter(lambda p: p != ' ', phone))
    # Wrap every phone in braces: "{p1}{p2}...".
    phone = '{' + '}{'.join(phone) + '}'
    print('phone: ',phone)
    # A braced token that is empty or a single non-word, non-space char
    # (i.e. punctuation) becomes the {sp} pause token.
    phone = re.sub(r'\{[^\w\s]?\}', '{sp}', phone)
    print('after re.sub: ',phone)
    # Turn "}{" separators into spaces so tokens are space-delimited.
    phone = phone.replace('}{', ' ')
    print('|' + phone + '|')
    # text_to_sequence maps the phone string to ids using hp.text_cleaners;
    # np.stack adds a leading batch dimension of size 1.
    sequence = np.array(text_to_sequence(phone,hp.text_cleaners))
    sequence = np.stack([sequence])
    return torch.from_numpy(sequence).long().to(device)
from layers import TacotronSTFT

# Bidirectional lookup tables between symbols and integer ids.
symbol_to_id = {s: i for i, s in enumerate(symbols)}
id_to_symbol = {i: s for i, s in enumerate(symbols)}

csv_file = '/hd0/speech-aligner/metadata/metadata.csv'
root_dir = '/hd0/dataset/VCTK/VCTK-Corpus/wav48'
data_dir = '/hd0/speech-aligner/preprocessed/VCTK20_engspks'

# Output directory layout for the preprocessed corpus.
os.makedirs(data_dir, exist_ok=True)
os.makedirs(os.path.join(data_dir, 'char_seq'), exist_ok=True)
os.makedirs(os.path.join(data_dir, 'phone_seq'), exist_ok=True)
os.makedirs(os.path.join(data_dir, 'melspectrogram'), exist_ok=True)

g2p = G2p()
metadata = {}
with codecs.open(csv_file, 'r', 'utf-8') as fid:
    # Each metadata row is "<utt_id>|<text>|<speaker>".
    # Iterate the file lazily instead of materializing it with readlines().
    for line in fid:
        # Renamed from `id` to avoid shadowing the builtin.
        utt_id, text, spk = line.split("|")
        utt_id = os.path.splitext(utt_id)[0]  # drop any file extension
        clean_char = custom_english_cleaners(text.rstrip())
        clean_phone = []
        for s in g2p(clean_char.lower()):
            # Known phone symbols are stored with an '@' prefix in the
            # symbol table; anything else (punctuation, space) is kept raw.
            if '@' + s in symbol_to_id:
                clean_phone.append('@' + s)
            else:
                clean_phone.append(s)
        metadata[utt_id] = {'char': clean_char, 'phone': clean_phone}
import torch from collections import namedtuple import os import scipy.io.wavfile as wavfile import numpy as np from tqdm import tqdm import librosa import torch import shutil import threading from time import sleep from scipy import signal from g2pk import G2p g2pk = G2p() import sys sys.path.append(os.path.join(os.path.dirname(__file__), '../')) from settings import configs from utils import text2encoding, encoding2text FTPair = namedtuple('FileTextPair', ['file_path', 'text']) PHONEME_DICT = dict() class AudioTextDataset(Dataset): def __init__(self, meta_file_path, configs): # , transform=None): self.file_text_pair_list = load_file_text_pair_list(meta_file_path)