def load_model(self, model_path, model_name, model_config, use_cuda):
    model_config = os.path.join(model_path, model_config)
    self.model_file = os.path.join(model_path, model_name)
    print(" > Loading model ...")
    print(" | > model config: ", model_config)
    print(" | > model file: ", self.model_file)
    config = load_config(model_config)
    self.config = config
    self.use_cuda = use_cuda
    self.use_phonemes = config.use_phonemes
    self.ap = AudioProcessor(**config.audio)
    if self.use_phonemes:
        self.input_size = len(phonemes)
        self.input_adapter = lambda sen: phoneme_to_sequence(
            sen, [self.config.text_cleaner], self.config.phoneme_language)
    else:
        self.input_size = len(symbols)
        self.input_adapter = lambda sen: text_to_sequence(
            sen, [self.config.text_cleaner])
    self.model = Tacotron(self.input_size, config.embedding_size,
                          self.ap.num_freq, self.ap.num_mels, config.r)
    # load model state
    if use_cuda:
        cp = torch.load(self.model_file)
    else:
        cp = torch.load(self.model_file,
                        map_location=lambda storage, loc: storage)
    # load the model
    self.model.load_state_dict(cp['model'])
    if use_cuda:
        self.model.cuda()
    self.model.eval()
def load_model(self, model_path, model_config, wavernn_path, use_cuda):
    self.model_file = model_path
    print(" > Loading model ...")
    print(" | > model config: ", model_config)
    print(" | > model file: ", self.model_file)
    config = load_config(model_config)
    self.config = config
    self.use_cuda = use_cuda
    self.use_phonemes = config.use_phonemes
    self.ap = AudioProcessor(**config.audio)
    if self.use_phonemes:
        self.input_size = len(phonemes)
        self.input_adapter = lambda sen: phoneme_to_sequence(
            sen, [self.config.text_cleaner], self.config.phoneme_language)
    else:
        self.input_size = len(symbols)
        self.input_adapter = lambda sen: text_to_sequence(
            sen, [self.config.text_cleaner])
    self.model = Tacotron(self.input_size, config.embedding_size,
                          self.ap.num_freq, self.ap.num_mels, config.r,
                          attn_windowing=True)
    self.model.decoder.max_decoder_steps = 8000
    # load model state
    if use_cuda:
        cp = torch.load(self.model_file)
    else:
        cp = torch.load(self.model_file,
                        map_location=lambda storage, loc: storage)
    # load the model
    self.model.load_state_dict(cp['model'])
    if use_cuda:
        self.model.cuda()
    self.model.eval()
    self.vocoder = WaveRNNVocoder.Vocoder()
    self.vocoder.loadWeights(wavernn_path)
    # band-pass FIR taps (65 Hz - 7600 Hz at 16 kHz) for filtering vocoder output
    self.firwin = signal.firwin(1025, [65, 7600], pass_zero=False, fs=16000)
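# A minimal sketch (not part of the original class) of how the FIR taps built
# above could be applied to a synthesized waveform; `apply_firwin_filter` is a
# hypothetical helper name introduced only for illustration.
from scipy import signal


def apply_firwin_filter(wav, taps):
    # zero-phase filtering with the band-pass taps designed via signal.firwin
    return signal.filtfilt(taps, [1.0], wav)

# usage sketch, assuming `wav` is a 16 kHz float waveform array:
# wav = apply_firwin_filter(wav, self.firwin)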
def load_tts(self, model_path, model_file, model_config, use_cuda):
    tts_config = os.path.join(model_path, model_config)
    self.model_file = os.path.join(model_path, model_file)
    print(" > Loading TTS model ...")
    print(" | > model config: ", tts_config)
    print(" | > model file: ", model_file)
    self.tts_config = load_config(tts_config)
    self.use_phonemes = self.tts_config.use_phonemes
    self.ap = AudioProcessor(**self.tts_config.audio)
    if self.use_phonemes:
        self.input_size = len(phonemes)
        self.input_adapter = lambda sen: phoneme_to_sequence(
            sen, [self.tts_config.text_cleaner],
            self.tts_config.phoneme_language,
            self.tts_config.enable_eos_bos_chars)
    else:
        self.input_size = len(symbols)
        self.input_adapter = lambda sen: text_to_sequence(
            sen, [self.tts_config.text_cleaner])
    self.tts_model = setup_model(self.input_size, self.tts_config)
    # load model state
    if use_cuda:
        cp = torch.load(self.model_file)
    else:
        cp = torch.load(self.model_file,
                        map_location=lambda storage, loc: storage)
    # load the model
    self.tts_model.load_state_dict(cp['model'])
    if use_cuda:
        self.tts_model.cuda()
    self.tts_model.eval()
    self.tts_model.decoder.max_decoder_steps = 3000
def load_wavernn(self, lib_path, model_path, model_file, model_config, use_cuda):
    sys.path.append(lib_path)  # set this if TTS is not installed globally
    from WaveRNN.models.wavernn import Model
    wavernn_config = os.path.join(model_path, model_config)
    model_file = os.path.join(model_path, model_file)
    print(" > Loading WaveRNN model ...")
    print(" | > model config: ", wavernn_config)
    print(" | > model file: ", model_file)
    self.wavernn_config = load_config(wavernn_config)
    self.wavernn = Model(
        rnn_dims=512,
        fc_dims=512,
        mode=self.wavernn_config.mode,
        pad=2,
        upsample_factors=self.wavernn_config.upsample_factors,  # set this depending on dataset
        feat_dims=80,
        compute_dims=128,
        res_out_dims=128,
        res_blocks=10,
        hop_length=self.ap.hop_length,
        sample_rate=self.ap.sample_rate,
    )
    # load weights on CPU first so this also works when no GPU is available;
    # the model is moved to CUDA below only when use_cuda is set
    check = torch.load(model_file, map_location=lambda storage, loc: storage)
    self.wavernn.load_state_dict(check['model'])
    if use_cuda:
        self.wavernn.cuda()
    self.wavernn.eval()
def load_wavernn(self, lib_path, model_path, model_file, model_config, use_cuda):
    # TODO: set a function in wavernn code base for model setup and call it here.
    sys.path.append(lib_path)  # set this if TTS is not installed globally
    from WaveRNN.models.wavernn import Model
    wavernn_config = os.path.join(model_path, model_config)
    model_file = os.path.join(model_path, model_file)
    print(" > Loading WaveRNN model ...")
    print(" | > model config: ", wavernn_config)
    print(" | > model file: ", model_file)
    self.wavernn_config = load_config(wavernn_config)
    self.wavernn = Model(
        rnn_dims=512,
        fc_dims=512,
        mode=self.wavernn_config.mode,
        mulaw=self.wavernn_config.mulaw,
        pad=self.wavernn_config.pad,
        use_aux_net=self.wavernn_config.use_aux_net,
        use_upsample_net=self.wavernn_config.use_upsample_net,
        upsample_factors=self.wavernn_config.upsample_factors,
        feat_dims=80,
        compute_dims=128,
        res_out_dims=128,
        res_blocks=10,
        hop_length=self.ap.hop_length,
        sample_rate=self.ap.sample_rate,
    )
    # load weights on CPU first so this also works when no GPU is available;
    # the model is moved to CUDA below only when use_cuda is set
    check = torch.load(model_file, map_location=lambda storage, loc: storage)
    self.wavernn.load_state_dict(check['model'])
    if use_cuda:
        self.wavernn.cuda()
    self.wavernn.eval()
def load_tts(self, model_path, model_file, model_config, use_cuda):
    tts_config = os.path.join(model_path, model_config)
    self.model_file = os.path.join(model_path, model_file)
    print(" > Loading TTS model ...")
    print(" | > model config: ", tts_config)
    print(" | > model file: ", model_file)
    self.tts_config = load_config(tts_config)
    self.use_phonemes = self.tts_config.use_phonemes
    self.ap = AudioProcessor(**self.tts_config.audio)
    if self.use_phonemes:
        self.input_size = len(phonemes)
    else:
        self.input_size = len(symbols)
    # load speakers
    if self.config.tts_speakers is not None:
        self.tts_speakers = load_speaker_mapping(
            os.path.join(model_path, self.config.tts_speakers))
        num_speakers = len(self.tts_speakers)
    else:
        num_speakers = 0
    self.tts_model = setup_model(self.input_size,
                                 num_speakers=num_speakers,
                                 c=self.tts_config)
    # load model state
    cp = torch.load(self.model_file)
    # load the model
    self.tts_model.load_state_dict(cp['model'])
    if use_cuda:
        self.tts_model.cuda()
    self.tts_model.eval()
    self.tts_model.decoder.max_decoder_steps = 3000
    if 'r' in cp and self.tts_config.model in ["Tacotron", "TacotronGST"]:
        self.tts_model.decoder.set_r(cp['r'])
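# A minimal usage sketch for the loader above; the paths below are
# hypothetical placeholders, and `synthesizer` is assumed to be an instance
# of the server Synthesizer class these methods belong to.
import torch

synthesizer.load_tts(
    model_path="models/tts",             # folder holding checkpoint + config
    model_file="best_model.pth.tar",     # checkpoint file inside model_path
    model_config="config.json",          # config file inside model_path
    use_cuda=torch.cuda.is_available(),  # fall back to CPU when no GPU
)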
def main():
    """
    Call train.py as a new process and pass command arguments
    """
    parser = argparse.ArgumentParser()
    parser.add_argument('--restore_path',
                        type=str,
                        help='Folder path to checkpoints',
                        default='')
    parser.add_argument(
        '--config_path',
        type=str,
        help='path to config file for training',
    )
    parser.add_argument('--data_path',
                        type=str,
                        help='dataset path.',
                        default='')
    args = parser.parse_args()

    CONFIG = load_config(args.config_path)
    OUT_PATH = create_experiment_folder(CONFIG.output_path, CONFIG.run_name,
                                        True)
    stdout_path = os.path.join(OUT_PATH, "process_stdout/")
    num_gpus = torch.cuda.device_count()
    group_id = time.strftime("%Y_%m_%d-%H%M%S")

    # set arguments for train.py
    command = ['train.py']
    command.append('--restore_path={}'.format(args.restore_path))
    command.append('--config_path={}'.format(args.config_path))
    command.append('--group_id=group_{}'.format(group_id))
    command.append('--data_path={}'.format(args.data_path))
    command.append('--output_path={}'.format(OUT_PATH))
    command.append('')  # placeholder, overwritten with --rank per process below

    if not os.path.isdir(stdout_path):
        os.makedirs(stdout_path)
        os.chmod(stdout_path, 0o775)

    # run one train.py process per GPU
    processes = []
    for i in range(num_gpus):
        my_env = os.environ.copy()
        my_env["PYTHON_EGG_CACHE"] = "/tmp/tmp{}".format(i)
        command[6] = '--rank={}'.format(i)
        stdout = None if i == 0 else open(
            os.path.join(stdout_path, "process_{}.log".format(i)), "w")
        p = subprocess.Popen(['python3'] + command, stdout=stdout, env=my_env)
        processes.append(p)
        print(command)

    for p in processes:
        p.wait()
def main(args):
    """
    Call train.py (or find_lr.py) as a new process and pass command arguments
    """
    CONFIG = load_config(args.config_path)
    _ = os.path.dirname(os.path.realpath(__file__))
    if args.output_path == "":
        OUT_PATH = os.path.join(_, CONFIG.output_path)
    else:
        OUT_PATH = args.output_path
    OUT_PATH = create_experiment_folder(OUT_PATH, CONFIG.model_name)
    stdout_path = os.path.join(OUT_PATH, "process_stdout/")
    num_gpus = torch.cuda.device_count()
    group_id = time.strftime("%Y_%m_%d-%H%M%S")

    if args.lr_find:
        # set arguments for find_lr.py
        command = ['find_lr.py']
        command.append('--restore_path={}'.format(args.restore_path))
        command.append('--config_path={}'.format(args.config_path))
        command.append('--group_id=group_{}'.format(group_id))
        command.append('--data_path={}'.format(args.data_path))
        command.append('--output_path={}'.format(OUT_PATH))
        command.append('--init_lr={}'.format(args.init_lr))
        command.append('--end_lr={}'.format(args.end_lr))
        command.append('')  # placeholder, overwritten with --rank below
    else:
        # set arguments for train.py
        command = ['train.py']
        command.append('--restore_path={}'.format(args.restore_path))
        command.append('--config_path={}'.format(args.config_path))
        command.append('--group_id=group_{}'.format(group_id))
        command.append('--data_path={}'.format(args.data_path))
        command.append('--output_path={}'.format(OUT_PATH))
        command.append('')  # placeholder, overwritten with --rank below

    if not os.path.isdir(stdout_path):
        os.makedirs(stdout_path)
        os.chmod(stdout_path, 0o775)

    # run one process per GPU
    processes = []
    for i in range(num_gpus):
        my_env = os.environ.copy()
        my_env["PYTHON_EGG_CACHE"] = "/tmp/tmp{}".format(i)
        # the rank placeholder is the last element in both command variants;
        # a fixed index would point at --init_lr in the find_lr branch
        command[-1] = '--rank={}'.format(i)
        stdout = None if i == 0 else open(
            os.path.join(stdout_path, "process_{}.log".format(i)), "w")
        p = subprocess.Popen(['python3'] + command, stdout=stdout, env=my_env)
        processes.append(p)
        print(command)

    for p in processes:
        p.wait()
def load_model(self, model_path, model_name, model_config, use_cuda):
    # build the config's path
    model_config = os.path.join(model_path, model_config)
    # build the model's path
    model_file = os.path.join(model_path, model_name)
    print(" > Loading model ...")
    print(" | > Model config path: ", model_config)
    print(" | > Model file path: ", model_file)
    config = load_config(model_config)
    self.use_cuda = use_cuda
    self.use_phonemes = config.use_phonemes
    self.ap = AudioProcessor(**config.audio)
    if self.use_phonemes:
        self.input_size = len(phonemes)
        self.input_adapter = lambda sen: phoneme_to_sequence(
            sen, [config.text_cleaner], config.phoneme_language)
    else:
        self.input_size = len(symbols)
        self.input_adapter = lambda sen: text_to_sequence(
            sen, [config.text_cleaner])
    self.model = Tacotron(num_chars=config['num_chars'],
                          embedding_dim=config['embedding_size'],
                          linear_dim=self.ap.num_freq,
                          mel_dim=self.ap.num_mels,
                          r=config['r'])
    # load model state
    if use_cuda:
        cp = torch.load(model_file)
    else:
        cp = torch.load(model_file,
                        map_location=lambda storage, loc: storage)
    # load the model
    self.model.load_state_dict(cp['model'])
    # if CUDA is enabled and available, move the model to the GPU
    if use_cuda:
        self.model.cuda()
    # switch to inference mode (disables dropout and batch-norm updates)
    self.model.eval()
def load_model(self, model_path, model_name, model_config, use_cuda):
    model_config = os.path.join(model_path, model_config)
    self.model_file = os.path.join(model_path, model_name)
    print(" > Loading model ...")
    print(" | > model config: ", model_config)
    print(" | > model file: ", self.model_file)
    config = load_config(model_config)
    self.config = config
    self.use_cuda = use_cuda
    self.ap = AudioProcessor(**config.audio)
    # NOTE: 61 is a hard-coded input (symbol) size; the other loaders derive
    # this from len(symbols) or len(phonemes)
    self.model = Tacotron(61, config.embedding_size, self.ap.num_freq,
                          self.ap.num_mels, config.r)
    # load model state
    if use_cuda:
        cp = torch.load(self.model_file)
    else:
        cp = torch.load(self.model_file,
                        map_location=lambda storage, loc: storage)
    # load the model
    self.model.load_state_dict(cp['model'])
    if use_cuda:
        self.model.cuda()
    self.model.eval()
def test_in_out(self):
    self._create_random_model()
    config = load_config(os.path.join(get_tests_input_path(),
                                      'server_config.json'))
    synthesizer = Synthesizer(config)
    synthesizer.tts("Better this test works!!")
import os
import unittest

from tests import get_tests_path, get_tests_input_path, get_tests_output_path
from utils.audio import AudioProcessor
from utils.generic_utils import load_config

TESTS_PATH = get_tests_path()
OUT_PATH = os.path.join(get_tests_output_path(), "audio_tests")
WAV_FILE = os.path.join(get_tests_input_path(), "example_1.wav")

os.makedirs(OUT_PATH, exist_ok=True)
conf = load_config(os.path.join(TESTS_PATH, 'test_config.json'))


class TestAudio(unittest.TestCase):
    def __init__(self, *args, **kwargs):
        super(TestAudio, self).__init__(*args, **kwargs)
        self.ap = AudioProcessor(**conf.audio)

    def test_audio_synthesis(self):
        """ 1. load wav
            2. set normalization parameters
            3. extract mel-spec
            4. invert to wav and save the output
        """
        print(" > Sanity check for the process wav -> mel -> wav")

        def _test(max_norm, signal_norm, symmetric_norm, clip_norm):
            self.ap.max_norm = max_norm
            self.ap.signal_norm = signal_norm
    '--librispeech',
    type=str,
    required=False,
    default=False,
    help="Librispeech format, if true load with librispeech format")
args = parser.parse_args()

os.makedirs(args.out_dir, exist_ok=True)
if args.train_data_csv:
    os.makedirs(os.path.join(args.out_dir, 'train'), exist_ok=True)
if args.test_data_csv:
    os.makedirs(os.path.join(args.out_dir, 'test'), exist_ok=True)

cpu_num = cpu_count()  # num threads = num cpu cores

config = load_config(args.config)
ap = AudioProcessor(config.audio)

sample_rate = config.audio[config.audio['backend']]['sample_rate']
audio_len = config.audio['audio_len']
form = config.dataset['format']

output_dir_train = os.path.join(args.out_dir, 'train')
output_dir_test = os.path.join(args.out_dir, 'test')
dataset_root_dir = args.dataset_root_dir

train_data_csv = None
test_data_csv = None

noise_files = open(args.noise_csv).readlines()

if args.train_data_csv:
    help='Path to input text file.',
)
parser.add_argument(
    '--output',
    type=str,
    help='Path to save final wav file.',
)

args = parser.parse_args()

try:
    path = os.path.realpath(os.path.dirname(__file__))
except NameError as e:
    path = './'

C = load_config(os.path.join(path, 'pretrained_models/TTS/config.json'))
C.forward_attn_mask = False
C.windowing = True
# load the audio processor
ap = AudioProcessor(**C.audio)
num_speakers = 0

# load the model
num_chars = len(phonemes) if C.use_phonemes else len(symbols)
model = setup_model(num_chars, num_speakers, C)
cp = torch.load(os.path.join(path, 'pretrained_models/TTS/best_model.pth.tar'),
                map_location='cpu')
model.load_state_dict(cp['model'], strict=False)
model.r = cp['r']
model.decoder.r = cp['r']
def run_test_all_seeds(args, cuda=True, debug=False, return_potential=False):
    runs_list = os.listdir(args.experiment_dir)
    runs_list.sort()
    num_runs = len(runs_list)
    votes = []
    wav_files = []
    targets = []
    # define loss function
    criterion = nn.BCELoss(reduction='sum')
    for run in runs_list:
        blockPrint()
        run_dir = os.path.join(args.experiment_dir, run)
        if os.path.isfile(run_dir):
            continue
        model_name = os.listdir(run_dir)[0]
        checkpoint_path = os.path.join(run_dir, model_name,
                                       'best_checkpoint.pt')
        config_path = os.path.join(run_dir, model_name, 'config.json')
        c = load_config(config_path)
        ap = AudioProcessor(**c.audio)
        c.dataset['test_csv'] = args.test_csv
        c.dataset['test_data_root_path'] = args.test_root_dir
        c.test_config['batch_size'] = args.batch_size
        c.test_config['num_workers'] = args.num_workers
        max_seq_len = c.dataset['max_seq_len']
        c.train_config['seed'] = 0
        testdataloader = test_dataloader(c, ap, max_seq_len=max_seq_len)
        # load model
        model = return_model(c)
        enablePrint()
        if checkpoint_path is not None:
            print("Loading checkpoint: %s" % checkpoint_path)
            try:
                checkpoint = torch.load(checkpoint_path, map_location='cpu')
                model.load_state_dict(checkpoint['model'])
                print("Model loaded successfully!")
            except Exception as e:
                raise ValueError(
                    "You need to pass a valid checkpoint; check your "
                    "config.json, loading this checkpoint raised: " + str(e))
        blockPrint()
        # move the model to GPU
        if cuda:
            model = model.cuda()
        model.train(False)
        vote, targets, wav_path = test(criterion,
                                       ap,
                                       model,
                                       c,
                                       testdataloader,
                                       cuda=cuda,
                                       confusion_matrix=True,
                                       debug=debug,
                                       simples_vote=args.simples_vote)
        # print(vote)
        wav_files.append(wav_path)
        votes.append(vote)
        if len(wav_files):
            if wav_files[-1] != wav_files[0]:
                raise ValueError(
                    "Different files or ordering for the test across seeds "
                    "or folds")
    # mean vote; rounding is necessary when using a composite vote
    preds = np.mean(np.array(votes), axis=0)
    # print(preds)
    if not return_potential:
        preds = preds.round()
    file_names = wav_files[0]

    if debug and not return_potential:
        enablePrint()
        targets = np.array(targets)
        preds = np.array(preds)
        names = np.array(file_names)
        idxs = np.nonzero(targets == c.dataset['control_class'])
        control_target = targets[idxs]
        control_preds = preds[idxs]
        names_control = names[idxs]
        idxs = np.nonzero(targets == c.dataset['patient_class'])
        patient_target = targets[idxs]
        patient_preds = preds[idxs]
        names_patient = names[idxs]
        if debug:
            print('+' * 40)
            print("Control Files Classified incorrectly:")
            incorrect_ids = np.nonzero(
                control_preds != c.dataset['control_class'])
            inc_names = names_control[incorrect_ids]
            print("Num. Files:", len(inc_names))
            print(inc_names)
            print('+' * 40)
            print('-' * 40)
            print("Patient Files Classified incorrectly:")
            incorrect_ids = np.nonzero(
                patient_preds != c.dataset['patient_class'])
            inc_names = names_patient[incorrect_ids]
            print("Num. Files:", len(inc_names))
            print(inc_names)
            print('-' * 40)
        acc_control = (control_preds == control_target).mean()
        acc_patient = (patient_preds == patient_target).mean()
        acc_balanced = (acc_control + acc_patient) / 2
        f1 = f1_score(targets.tolist(), preds.tolist())
        uar = recall_score(targets.tolist(), preds.tolist(), average='macro')
        print("======== Confusion Matrix ==========")
        y_target = pd.Series(targets, name='Target')
        y_pred = pd.Series(preds, name='Predicted')
        df_confusion = pd.crosstab(y_target,
                                   y_pred,
                                   rownames=['Target'],
                                   colnames=['Predicted'],
                                   margins=True)
        print(df_confusion)
        print("Test\n ", "Accuracy Control: ", acc_control,
              "Accuracy Patient: ", acc_patient,
              "Accuracy Balanced: ", acc_balanced)
        print("F1:", f1, "UAR:", uar)

    if return_potential:
        return preds, file_names
    else:
        df = pd.DataFrame({
            'filename': file_names,
            'prediction': preds.astype(int)
        })
        df['prediction'] = df['prediction'].replace(
            int(c.dataset['control_class']), 'negative',
            regex=True).replace(int(c.dataset['patient_class']),
                                'positive',
                                regex=True)
        if args.output_csv:
            out_csv_path = args.output_csv
        else:
            out_csv_path = os.path.join(
                args.experiment_dir,
                os.path.basename(c.dataset['test_csv']))
        df.to_csv(out_csv_path, index=False)
parser.add_argument(
    "--output_path", type=str, help="path for training outputs.", default=""
)
# DISTRIBUTED
parser.add_argument(
    "--rank",
    type=int,
    default=0,
    help="DISTRIBUTED: process rank for distributed training.",
)
parser.add_argument(
    "--group_id", type=str, default="", help="DISTRIBUTED: process group id."
)
args = parser.parse_args()

CONFIG = load_config(args.config_path)

if args.data_path != "":
    CONFIG.data_path = args.data_path
DATA_PATH = CONFIG.data_path

# DISTRIBUTED
if num_gpus > 1:
    init_distributed(
        args.rank,
        num_gpus,
        args.group_id,
        CONFIG.distributed["backend"],
        CONFIG.distributed["url"],
    )
parser.add_argument('--reset_lr', action="store_true", help='reset lr.')
# DISTRIBUTED
parser.add_argument(
    '--rank',
    type=int,
    default=0,
    help='DISTRIBUTED: process rank for distributed training.')
parser.add_argument('--group_id',
                    type=str,
                    default="",
                    help='DISTRIBUTED: process group id.')
args = parser.parse_args()

# setup output paths and read configs
c = load_config(args.config_path)
_ = os.path.dirname(os.path.realpath(__file__))

if args.data_path != '':
    c.data_path = args.data_path

if args.output_path == '':
    OUT_PATH = os.path.join(_, c.output_path)
else:
    OUT_PATH = args.output_path

if args.group_id == '' and args.output_folder == '':
    OUT_PATH = create_experiment_folder(OUT_PATH, c.run_name, args.debug)
else:
    OUT_PATH = os.path.join(OUT_PATH, args.output_folder)

AUDIO_PATH = os.path.join(OUT_PATH, 'test_audios')
MODEL_PATH_TMP = ROOT_PATH + '/checkpoint_{}.pth.tar'
if args.step is None:
    MODEL_PATH = ROOT_PATH + '/best_model.pth.tar'
else:
    MODEL_PATH = MODEL_PATH_TMP.format(args.step)

print(MODEL_PATH)
CONFIG_PATH = ROOT_PATH + '/config.json'
OUT_FOLDER = ROOT_PATH + '/test/'
duration_folder = os.path.join(OUT_FOLDER, 'durations')
plot_folder = os.path.join(OUT_FOLDER, 'plot')
os.makedirs(duration_folder, exist_ok=True)
os.makedirs(plot_folder, exist_ok=True)

c = load_config(CONFIG_PATH)
ap = AudioProcessor(**c.audio)

use_cuda = True
data_loader = setup_loader(c, is_val=False)

num_chars = len(phonemes) if c.use_phonemes else len(symbols)
model = Tacotron(
    num_chars=num_chars,
    embedding_dim=c.embedding_size,
    linear_dim=ap.num_freq,
    mel_dim=ap.num_mels,
    r=c.r,
    memory_size=c.memory_size)
checkpoint = torch.load(MODEL_PATH)
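# The excerpt ends right after loading the checkpoint. A likely continuation,
# sketched here after the pattern the other loaders in this codebase use
# (assumed, not part of the original excerpt):
model.load_state_dict(checkpoint['model'])
if use_cuda:
    model = model.cuda()
model.eval()  # inference mode for duration extraction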
def __init__(self, *args, **kwargs):
    super(TestTTSDatasetCached, self).__init__(*args, **kwargs)
    self.max_loader_iter = 4
    self.c = load_config(os.path.join(c.data_path_cache, 'config.json'))
    self.ap = AudioProcessor(**self.c.audio)
parser.add_argument("--val_split", type=int, default=0, help="Number of instances for validation.") parser.add_argument("--meta_file", type=str, help="Meta data file to be used for the dataset.") parser.add_argument("--process_audio", type=bool, default=False, help="Preprocess audio files.") args = parser.parse_args() DATA_PATH = args.data_path CACHE_PATH = args.cache_path CONFIG = load_config(args.config) # load the right preprocessor preprocessor = importlib.import_module('datasets.preprocess') preprocessor = getattr(preprocessor, args.dataset.lower()) items = preprocessor(args.data_path, args.meta_file) print(" > Input path: ", DATA_PATH) print(" > Cache path: ", CACHE_PATH) ap = AudioProcessor(**CONFIG.audio) def extract_mel(item): """ Compute spectrograms, length information """ text = item[0] file_path = item[1]
    type=str,
    default='',
    help="data path to overwrite config.json.")
parser.add_argument("--out_path",
                    type=str,
                    default='',
                    help="destination to write files.")
parser.add_argument("--ignore_errors",
                    type=bool,
                    default=False,
                    help="ignore bad files.")
args = parser.parse_args()

config_path = args.config_path
CONFIG = load_config(config_path)

if args.data_path != '':
    CONFIG.data_path = args.data_path

if type(CONFIG.mode) is int:
    CONFIG.audio['bits'] = CONFIG.mode
ap = AudioProcessor(**CONFIG.audio)

SEG_PATH = CONFIG.data_path
# OUT_PATH = os.path.join(args.out_path, CONFIG.run_name, "data/")
OUT_PATH = args.out_path
QUANT_PATH = os.path.join(OUT_PATH, "quant/")
MEL_PATH = os.path.join(OUT_PATH, "mel/")
os.makedirs(OUT_PATH, exist_ok=True)
os.makedirs(QUANT_PATH, exist_ok=True)
import os
import glob
import tqdm
import torch
import random
import librosa
import argparse
import numpy as np
import pandas as pd
from multiprocessing import Pool, cpu_count

from utils.audio_processor import WrapperAudioProcessor as AudioProcessor
from utils.generic_utils import mix_wavfiles
from utils.generic_utils import load_config

config = load_config('config.json')
ap = AudioProcessor(config.audio)

data_path = '../test-my-data-prepo/train/'
files = os.listdir(data_path)
for file_name in files:
    if '.pt' in file_name:
        spec = ap.inv_spectrogram(
            torch.load(os.path.join(data_path,
                                    file_name)).cpu().detach().numpy())
        ap.save_wav(spec, os.path.join(data_path, file_name + '.wav'))
def _create_random_model(self):
    config = load_config(os.path.join(get_tests_output_path(),
                                      'dummy_model_config.json'))
    num_chars = len(phonemes) if config.use_phonemes else len(symbols)
    model = setup_model(num_chars, 0, config)
    output_path = os.path.join(get_tests_output_path())
    save_checkpoint(model, None, None, None, output_path, 10, 10)
#!flask/bin/python
import argparse

import torch
from flask import Flask, Response, request, render_template, send_file

from synthesizer import Synthesizer
from utils.generic_utils import load_config

parser = argparse.ArgumentParser()
parser.add_argument('-c',
                    '--config_path',
                    type=str,
                    default='server/config.json',
                    help='path to the server config file')
args = parser.parse_args()

config = load_config(args.config_path)
app = Flask(__name__)
synthesizer = Synthesizer()
synthesizer.load_model(config.model_path, config.model_name,
                       config.model_config, config.use_cuda)


@app.route('/')
def index():
    return render_template('index.html')


@app.route('/api/tts', methods=['GET'])
def tts():
    text = request.args.get('text')
    synthesizer.ap.frame_shift_ms = int(request.args.get('shift'))
    synthesizer.ap.griffin_lim_iters = int(request.args.get('iter'))
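# A hedged client-side sketch for exercising the /api/tts route above. The
# port (5002) and the assumption that the route returns the synthesized wav
# are illustrative; the original excerpt truncates before the response is
# built.
import requests

resp = requests.get(
    "http://localhost:5002/api/tts",
    params={"text": "Hello world.", "shift": 12, "iter": 60},
)
resp.raise_for_status()
with open("tts_out.wav", "wb") as f:
    f.write(resp.content)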
import os

import torch
import unittest
import numpy as np

from torch import optim
from torch import nn
from utils.generic_utils import load_config
from layers.losses import L1LossMasked
from models.tacotron import Tacotron

torch.manual_seed(1)
use_cuda = torch.cuda.is_available()
device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")

file_path = os.path.dirname(os.path.realpath(__file__))
c = load_config(os.path.join(file_path, 'test_config.json'))


class TacotronTrainTest(unittest.TestCase):
    def test_train_step(self):
        input = torch.randint(0, 24, (8, 128)).long().to(device)
        input_lengths = torch.randint(100, 129, (8, )).long().to(device)
        input_lengths[-1] = 128
        mel_spec = torch.rand(8, 30, c.audio['num_mels']).to(device)
        linear_spec = torch.rand(8, 30, c.audio['num_freq']).to(device)
        mel_lengths = torch.randint(20, 30, (8, )).long().to(device)
        stop_targets = torch.zeros(8, 30, 1).float().to(device)

        for idx in mel_lengths:
            stop_targets[:, int(idx.item()):, 0] = 1.0
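# Side note (a sketch, not from the original test): the loop above applies
# each length to the whole batch at once; a per-sample, vectorized way to
# build stop targets from a tensor of lengths would look like this:
import torch

lengths = torch.tensor([3, 5])                  # per-sample mel lengths
max_len = 6
frame_idx = torch.arange(max_len).unsqueeze(0)  # shape (1, max_len)
stop_targets = (frame_idx >= lengths.unsqueeze(1)).float().unsqueeze(-1)
# stop_targets[i, t, 0] == 1.0 for every frame t at or past sample i's length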
type=str, help="JSON file for multi-speaker model.", default="") parser.add_argument( '--text_gst_prediction', type=bool, default=True, help='Predict style from the text itself for more dynamic speech.') args = parser.parse_args() if args.vocoder_path != "": from WaveRNN.models.wavernn import Model as VocoderModel # load the config C = load_config(args.config_path) C.forward_attn_mask = True # load the audio processor ap = AudioProcessor(**C.audio) # load speakers if args.speakers_json != '': speakers = json.load(open(args.speakers_json, 'r')) num_speakers = len(speakers) else: num_speakers = 0 # load the model num_chars = len(phonemes) if C.use_phonemes else len(symbols) model = setup_model(num_chars, num_speakers, C)
def main(args):
    # setup output paths and read configs
    c = load_config(args.config_path)
    _ = os.path.dirname(os.path.realpath(__file__))
    OUT_PATH = os.path.join(_, c.output_path)
    OUT_PATH = create_experiment_folder(OUT_PATH)
    CHECKPOINT_PATH = os.path.join(OUT_PATH, 'checkpoints')
    shutil.copyfile(args.config_path, os.path.join(OUT_PATH, 'config.json'))

    # save config to tmp place to be loaded by subsequent modules.
    file_name = str(os.getpid())
    tmp_path = os.path.join("/tmp/", file_name + '_tts')
    pickle.dump(c, open(tmp_path, "wb"))

    # setup tensorboard
    LOG_DIR = OUT_PATH
    tb = SummaryWriter(LOG_DIR)

    # Ctrl+C handler to remove empty experiment folder
    def signal_handler(signal, frame):
        print(" !! Pressed Ctrl+C !!")
        remove_experiment_folder(OUT_PATH)
        sys.exit(1)

    signal.signal(signal.SIGINT, signal_handler)

    # Setup the dataset
    dataset = LJSpeechDataset(os.path.join(c.data_path, 'metadata.csv'),
                              os.path.join(c.data_path, 'wavs'),
                              c.r,
                              c.sample_rate,
                              c.text_cleaner,
                              c.num_mels,
                              c.min_level_db,
                              c.frame_shift_ms,
                              c.frame_length_ms,
                              c.preemphasis,
                              c.ref_level_db,
                              c.num_freq,
                              c.power)
    dataloader = DataLoader(dataset,
                            batch_size=c.batch_size,
                            shuffle=True,
                            collate_fn=dataset.collate_fn,
                            drop_last=True,
                            num_workers=c.num_loader_workers)

    # setup the model
    model = Tacotron(c.embedding_size,
                     c.hidden_size,
                     c.num_mels,
                     c.num_freq,
                     c.r)

    # plot model on tensorboard
    dummy_input = dataset.get_dummy_data()

    ## TODO: onnx does not support RNN fully yet
    # model_proto_path = os.path.join(OUT_PATH, "model.proto")
    # onnx.export(model, dummy_input, model_proto_path, verbose=True)
    # tb.add_graph_onnx(model_proto_path)

    if use_cuda:
        model = nn.DataParallel(model.cuda())

    optimizer = optim.Adam(model.parameters(), lr=c.lr)

    if args.restore_step:
        checkpoint = torch.load(
            os.path.join(args.restore_path,
                         'checkpoint_%d.pth.tar' % args.restore_step))
        model.load_state_dict(checkpoint['model'])
        optimizer.load_state_dict(checkpoint['optimizer'])
        print("\n > Model restored from step %d\n" % args.restore_step)
        start_epoch = checkpoint['step'] // len(dataloader)
        best_loss = checkpoint['linear_loss']
    else:
        start_epoch = 0
        print("\n > Starting a new training")

    num_params = count_parameters(model)
    print(" | > Model has {} parameters".format(num_params))

    model = model.train()

    if not os.path.exists(CHECKPOINT_PATH):
        os.mkdir(CHECKPOINT_PATH)

    if use_cuda:
        criterion = nn.L1Loss().cuda()
    else:
        criterion = nn.L1Loss()

    n_priority_freq = int(3000 / (c.sample_rate * 0.5) * c.num_freq)

    #lr_scheduler = ReduceLROnPlateau(optimizer, factor=c.lr_decay,
    #                                 patience=c.lr_patience, verbose=True)
    epoch_time = 0
    best_loss = float('inf')
    for epoch in range(0, c.epochs):

        print("\n | > Epoch {}/{}".format(epoch, c.epochs))
        progbar = Progbar(len(dataset) / c.batch_size)

        for num_iter, data in enumerate(dataloader):
            start_time = time.time()

            text_input = data[0]
            text_lengths = data[1]
            linear_input = data[2]
            mel_input = data[3]

            current_step = num_iter + args.restore_step + \
                epoch * len(dataloader) + 1

            # setup lr
            current_lr = lr_decay(c.lr, current_step)
            for params_group in optimizer.param_groups:
                params_group['lr'] = current_lr

            optimizer.zero_grad()

            # Add a single frame of zeros to Mel Specs for better end detection
            #try:
            #    mel_input = np.concatenate((np.zeros(
            #        [c.batch_size, 1, c.num_mels], dtype=np.float32),
            #        mel_input[:, 1:, :]), axis=1)
            #except:
            #    raise TypeError("not same dimension")

            # convert inputs to variables; the linear target must not be
            # volatile, since the linear loss backpropagates through it
            text_input_var = Variable(text_input)
            mel_spec_var = Variable(mel_input)
            linear_spec_var = Variable(linear_input)

            # sort sequences by length.
            # TODO: might be unnecessary
            sorted_lengths, indices = torch.sort(text_lengths.view(-1),
                                                 dim=0,
                                                 descending=True)
            sorted_lengths = sorted_lengths.long().numpy()

            text_input_var = text_input_var[indices]
            mel_spec_var = mel_spec_var[indices]
            linear_spec_var = linear_spec_var[indices]

            if use_cuda:
                text_input_var = text_input_var.cuda()
                mel_spec_var = mel_spec_var.cuda()
                linear_spec_var = linear_spec_var.cuda()

            # build the lengths tensor on CPU and move it only when CUDA
            # is in use, so the CPU path does not crash
            input_lengths_var = Variable(torch.LongTensor(sorted_lengths))
            if use_cuda:
                input_lengths_var = input_lengths_var.cuda()

            mel_output, linear_output, alignments = \
                model.forward(text_input_var, mel_spec_var,
                              input_lengths=input_lengths_var)

            mel_loss = criterion(mel_output, mel_spec_var)
            #linear_loss = torch.abs(linear_output - linear_spec_var)
            #linear_loss = 0.5 * \
            #torch.mean(linear_loss) + 0.5 * \
            #torch.mean(linear_loss[:, :n_priority_freq, :])
            linear_loss = 0.5 * criterion(linear_output, linear_spec_var) \
                + 0.5 * criterion(linear_output[:, :, :n_priority_freq],
                                  linear_spec_var[:, :, :n_priority_freq])
            loss = mel_loss + linear_loss
            # loss = loss.cuda()

            loss.backward()
            grad_norm = nn.utils.clip_grad_norm(model.parameters(), 1.)  ## TODO: maybe no need
            optimizer.step()

            step_time = time.time() - start_time
            epoch_time += step_time

            progbar.update(num_iter + 1,
                           values=[('total_loss', loss.data[0]),
                                   ('linear_loss', linear_loss.data[0]),
                                   ('mel_loss', mel_loss.data[0]),
                                   ('grad_norm', grad_norm)])

            # Plot Learning Stats
            tb.add_scalar('Loss/TotalLoss', loss.data[0], current_step)
            tb.add_scalar('Loss/LinearLoss', linear_loss.data[0],
                          current_step)
            tb.add_scalar('Loss/MelLoss', mel_loss.data[0], current_step)
            tb.add_scalar('Params/LearningRate',
                          optimizer.param_groups[0]['lr'], current_step)
            tb.add_scalar('Params/GradNorm', grad_norm, current_step)
            tb.add_scalar('Time/StepTime', step_time, current_step)

            align_img = alignments[0].data.cpu().numpy()
            align_img = plot_alignment(align_img)
            tb.add_image('Attn/Alignment', align_img, current_step)

            if current_step % c.save_step == 0:
                if c.checkpoint:
                    # save model
                    save_checkpoint(model, optimizer, linear_loss.data[0],
                                    OUT_PATH, current_step, epoch)

                # Diagnostic visualizations
                const_spec = linear_output[0].data.cpu().numpy()
                gt_spec = linear_spec_var[0].data.cpu().numpy()
                const_spec = plot_spectrogram(const_spec, dataset.ap)
                gt_spec = plot_spectrogram(gt_spec, dataset.ap)
                tb.add_image('Spec/Reconstruction', const_spec, current_step)
                tb.add_image('Spec/GroundTruth', gt_spec, current_step)

                align_img = alignments[0].data.cpu().numpy()
                align_img = plot_alignment(align_img)
                tb.add_image('Attn/Alignment', align_img, current_step)

                # Sample audio
                audio_signal = linear_output[0].data.cpu().numpy()
                dataset.ap.griffin_lim_iters = 60
                audio_signal = dataset.ap.inv_spectrogram(audio_signal.T)
                try:
                    tb.add_audio('SampleAudio',
                                 audio_signal,
                                 current_step,
                                 sample_rate=c.sample_rate)
                except:
                    print("\n > Error at audio signal on TB!!")
                    print(audio_signal.max())
                    print(audio_signal.min())

        # average loss after the epoch
        avg_epoch_loss = np.mean(
            progbar.sum_values['linear_loss'][0] /
            max(1, progbar.sum_values['linear_loss'][1]))
        best_loss = save_best_model(model, optimizer, avg_epoch_loss,
                                    best_loss, OUT_PATH, current_step, epoch)

        #lr_scheduler.step(loss.data[0])
        tb.add_scalar('Time/EpochTime', epoch_time, epoch)
        epoch_time = 0
def main(args):
    # setup output paths and read configs
    c = load_config(args.config_path)
    _ = os.path.dirname(os.path.realpath(__file__))
    OUT_PATH = os.path.join(_, c.output_path)
    OUT_PATH = create_experiment_folder(OUT_PATH)
    CHECKPOINT_PATH = os.path.join(OUT_PATH, 'checkpoints')
    shutil.copyfile(args.config_path, os.path.join(OUT_PATH, 'config.json'))

    # Ctrl+C handler to remove empty experiment folder
    def signal_handler(signal, frame):
        print(" !! Pressed Ctrl+C !!")
        remove_experiment_folder(OUT_PATH)
        sys.exit(0)

    signal.signal(signal.SIGINT, signal_handler)

    dataset = LJSpeechDataset(os.path.join(c.data_path, 'metadata.csv'),
                              os.path.join(c.data_path, 'wavs'),
                              c.r,
                              c.sample_rate,
                              c.text_cleaner)

    model = Tacotron(c.embedding_size,
                     c.hidden_size,
                     c.num_mels,
                     c.num_freq,
                     c.r)
    if use_cuda:
        model = nn.DataParallel(model.cuda())

    optimizer = optim.Adam(model.parameters(), lr=c.lr)

    try:
        checkpoint = torch.load(
            os.path.join(CHECKPOINT_PATH,
                         'checkpoint_%d.pth.tar' % args.restore_step))
        model.load_state_dict(checkpoint['model'])
        optimizer.load_state_dict(checkpoint['optimizer'])
        print("\n > Model restored from step %d\n" % args.restore_step)
    except:
        print("\n > Starting a new training\n")

    model = model.train()

    if not os.path.exists(CHECKPOINT_PATH):
        os.mkdir(CHECKPOINT_PATH)

    if use_cuda:
        criterion = nn.L1Loss().cuda()
    else:
        criterion = nn.L1Loss()

    n_priority_freq = int(3000 / (c.sample_rate * 0.5) * c.num_freq)

    for epoch in range(c.epochs):
        dataloader = DataLoader(dataset,
                                batch_size=c.batch_size,
                                shuffle=True,
                                collate_fn=dataset.collate_fn,
                                drop_last=True,
                                num_workers=32)
        progbar = Progbar(len(dataset) / c.batch_size)

        for i, data in enumerate(dataloader):
            text_input = data[0]
            magnitude_input = data[1]
            mel_input = data[2]

            current_step = i + args.restore_step + \
                epoch * len(dataloader) + 1

            optimizer.zero_grad()

            try:
                mel_input = np.concatenate(
                    (np.zeros([c.batch_size, 1, c.num_mels],
                              dtype=np.float32),
                     mel_input[:, 1:, :]), axis=1)
            except:
                raise TypeError("not same dimension")

            if use_cuda:
                text_input_var = Variable(torch.from_numpy(text_input).type(
                    torch.cuda.LongTensor), requires_grad=False).cuda()
                mel_input_var = Variable(torch.from_numpy(mel_input).type(
                    torch.cuda.FloatTensor), requires_grad=False).cuda()
                mel_spec_var = Variable(torch.from_numpy(mel_input).type(
                    torch.cuda.FloatTensor), requires_grad=False).cuda()
                linear_spec_var = Variable(
                    torch.from_numpy(magnitude_input).type(
                        torch.cuda.FloatTensor), requires_grad=False).cuda()
            else:
                text_input_var = Variable(torch.from_numpy(text_input).type(
                    torch.LongTensor), requires_grad=False)
                mel_input_var = Variable(torch.from_numpy(mel_input).type(
                    torch.FloatTensor), requires_grad=False)
                mel_spec_var = Variable(torch.from_numpy(mel_input).type(
                    torch.FloatTensor), requires_grad=False)
                linear_spec_var = Variable(
                    torch.from_numpy(magnitude_input).type(
                        torch.FloatTensor), requires_grad=False)

            mel_output, linear_output, alignments = \
                model.forward(text_input_var, mel_input_var)

            mel_loss = criterion(mel_output, mel_spec_var)
            linear_loss = torch.abs(linear_output - linear_spec_var)
            linear_loss = 0.5 * \
                torch.mean(linear_loss) + 0.5 * \
                torch.mean(linear_loss[:, :n_priority_freq, :])
            loss = mel_loss + linear_loss
            # only move the loss when CUDA is in use; an unconditional
            # .cuda() would crash on CPU-only machines
            if use_cuda:
                loss = loss.cuda()

            start_time = time.time()
            loss.backward()
            nn.utils.clip_grad_norm(model.parameters(), 1.)
            optimizer.step()

            time_per_step = time.time() - start_time
            progbar.update(i,
                           values=[('total_loss', loss.data[0]),
                                   ('linear_loss', linear_loss.data[0]),
                                   ('mel_loss', mel_loss.data[0])])

            if current_step % c.save_step == 0:
                checkpoint_path = 'checkpoint_{}.pth.tar'.format(current_step)
                checkpoint_path = os.path.join(OUT_PATH, checkpoint_path)
                save_checkpoint(
                    {
                        'model': model.state_dict(),
                        'optimizer': optimizer.state_dict(),
                        'step': current_step,
                        'total_loss': loss.data[0],
                        'linear_loss': linear_loss.data[0],
                        'mel_loss': mel_loss.data[0],
                        'date': datetime.date.today().strftime("%B %d, %Y")
                    }, checkpoint_path)
                print(" > Checkpoint is saved : {}".format(checkpoint_path))

            if current_step in c.decay_step:
                optimizer = adjust_learning_rate(optimizer, current_step)