def train(self):
    try:
        with torch.autograd.profiler.emit_nvtx(enabled=self.pyprof_enabled):
            for i in range(self.step + 1, self.final_steps + 1):
                self.step = i
                tprint("------------- TRAIN step : {} -------------".format(i))

                if self.nvprof_iter_start and i == self.nvprof_iter_start:
                    profiler.start()

                with Nvtx("step #{}".format(self.step)):
                    loss, meta = self.do_step()

                if self.nvprof_iter_end and i == self.nvprof_iter_end:
                    profiler.stop()

                if self.lr_scheduler:
                    for param_group in self.optimizer.param_groups:
                        tprint("lr: {:06f}".format(param_group['lr']))
                    self.lr_scheduler.step(self.step)

                if self.step % self.log_steps == 0:
                    self.log(loss, meta)

                if self.ckpt_path and self.save_steps and i % self.save_steps == 0:
                    self.save()

            tprint("Training has been done.")
    except StopIteration:  # done by n_epochs
        tprint("Training has been done. (by n_epochs)")
    except KeyboardInterrupt:
        tprint("Training has been canceled.")
def __init__(self, ckpt_file, device='cuda', use_fp16=False, use_denoiser=False):
    self.ckpt_file = ckpt_file
    self.device = device
    self.use_fp16 = use_fp16
    self.use_denoiser = use_denoiser

    # model
    sys.path.append('waveglow')
    self.model = torch.load(self.ckpt_file, map_location=self.device)['model']
    self.model = self.model.remove_weightnorm(self.model)
    self.model.eval()
    self.model = to_device_async(self.model, self.device)

    if self.use_fp16:
        self.model = self.model.half()

    if self.use_denoiser:
        self.denoiser = Denoiser(self.model, device=device)
        self.denoiser = to_device_async(self.denoiser, self.device)
        tprint('Using WaveGlow denoiser.')
def preprocess_mel(hparam="base.yaml", **kwargs): """The script for preprocessing mel-spectrograms from the dataset. By default, this script assumes to load parameters in the default config file, fastspeech/hparams/base.yaml. Besides the flags, you can also set parameters in the config file via the command-line. For examples, --dataset_path=DATASET_PATH Path to dataset directory. --mels_path=MELS_PATH Path to output preprocessed mels directory. Refer to fastspeech/hparams/base.yaml to see more parameters. Args: hparam (str, optional): Path to default config file. Defaults to "base.yaml". """ hp.set_hparam(hparam, kwargs) tprint("Hparams:\n{}".format(pp.pformat(hp))) pathlib.Path(hp.mels_path).mkdir(parents=True, exist_ok=True) dataset = LJSpeechDataset(hp.dataset_path, mels_path=None) for data in tqdm(dataset): name = data["name"] mel = data["mel"] save_path = os.path.join(hp.mels_path, name + ".mel.npy") if os.path.exists(save_path): continue # print(name, mel) np.save(save_path, mel)
def verify(hparam="trt.yaml", text=SAMPLE_TEXT, **kwargs): hp.set_hparam(hparam, kwargs) tprint("Hparams:\n{}".format(pp.pformat(hp))) tprint("Device count: {}".format(torch.cuda.device_count())) outs_trt, acts_trt = infer_trt(text) outs, acts = infer_pytorch(text) both, pytorch, trt = join_dict(acts, acts_trt) # print diff print("## Diff ##\n\n") for name, (act, act_trt) in both.items(): act = act.float() act_trt = act_trt.float() diff = act.reshape(-1) - act_trt.reshape(-1) is_identical = diff.eq(0).all() errors = diff[diff.ne(0)] max_error = torch.max(torch.abs(errors)) if len(errors) > 0 else 0 print( "# {} #\n\n[PyTorch]\n{}\n\n[TRT]: \n{}\n\n[Diff]: \n{}\n\n[Errors]: \n{}\n- identical? {}\n- {} errors out of {}\n- max: {}\n\n" .format( name, act, act_trt, diff, errors, is_identical, len(errors), len(diff), max_error, ))
def __exit__(self, *exc_info):
    if self.device == 'cuda' and self.cuda_sync:
        torch.cuda.synchronize()
    self.end_time = time.time()
    self.time_elapsed = self.end_time - self.start_time
    tprint(("[{}] Time elapsed: {" + self.format + "}").format(
        self.name, self.time_elapsed))
def end(self):
    if not hasattr(self, "start_time"):
        return
    if self.device == 'cuda' and self.cuda_sync:
        torch.cuda.synchronize()
    self.end_time = time.time()
    self.time_elapsed = self.end_time - self.start_time
    tprint(("[{}] Time elapsed: {" + self.format + "}").format(
        self.name, self.time_elapsed))
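# A usage sketch of the TimeElapsed timer whose __exit__()/end() methods appear above,
# mirroring how it is wrapped around inference elsewhere in these scripts (see generate()
# and infer()). It assumes the class also defines __enter__()/start() recording
# self.start_time; `model` and `batch` below are placeholders.
with TimeElapsed(name="Forward pass", device='cuda', cuda_sync=True, format=":.6f"):
    # cuda_sync=True makes __exit__() call torch.cuda.synchronize() before reading the
    # clock, so pending asynchronous GPU kernels are included in the reported time.
    output = model(batch)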
def save(self):
    state_dict = {
        'step': self.step,
        'model': self.model.state_dict(),
        'optim': self.optimizer.state_dict(),
    }
    torch.save(state_dict,
               self.ckpt_path + '/checkpoint_{:06d}.pt'.format(self.step))

    tprint('[Save] Model "{}". Step={}.'.format(self.model_name, self.step))
def build_engine(self):
    # load engines and create contexts
    self.engine_list = []
    self.context_list = []

    for i, (trt_max_input_seq_len, trt_max_output_seq_len,
            trt_file_path) in enumerate(self.max_seq_lens_and_file_path_list):
        if trt_file_path and os.path.isfile(trt_file_path) and not self.trt_force_build:
            with open(trt_file_path, 'rb') as f:
                engine_str = f.read()
            with trt.Runtime(TRT_LOGGER) as runtime:
                engine = runtime.deserialize_cuda_engine(engine_str)
            tprint('TRT Engine Loaded from {} successfully.'.format(trt_file_path))
        else:
            self.trt_max_input_seq_len = trt_max_input_seq_len
            self.trt_max_output_seq_len = trt_max_output_seq_len
            self.trt_file_path = trt_file_path

            tprint('Building a TRT Engine..')
            engine = self.do_build_engine()
            tprint('TRT Engine Built.')

            with open(self.trt_file_path, 'wb') as f:
                f.write(engine.serialize())
            tprint('TRT Engine Saved in {}.'.format(self.trt_file_path))

        self.engine_list.append(engine)
def load(self, load_optim=True):
    files_exist = glob.glob(os.path.join(self.ckpt_path, '*'))
    if files_exist:
        # load the latest created file.
        latest_file = max(files_exist, key=os.path.getctime)
        state_dict = torch.load(latest_file)
        self.step = state_dict['step']
        self.model.load_state_dict(state_dict['model'])
        if load_optim:
            self.optimizer.load_state_dict(state_dict['optim'])
        tprint('[Load] Checkpoint \'{}\'. Step={}'.format(latest_file, self.step))
    else:
        tprint('No checkpoints in {}. Load skipped.'.format(self.ckpt_path))
def load(self, ckpt_file):
    # load the latest checkpoint file if not defined.
    if not ckpt_file:
        files_exist = glob.glob(os.path.join(self.ckpt_path, '*'))
        if files_exist:
            ckpt_file = max(files_exist, key=os.path.getctime)

    if ckpt_file:
        state_dict = torch.load(ckpt_file, map_location=self.device)
        self.step = state_dict['step']
        self.model.load_state_dict(state_dict['model'])

        tprint('[Load] Checkpoint \'{}\'. Step={}'.format(ckpt_file, self.step))
    else:
        tprint('No checkpoints in {}. Load skipped.'.format(self.ckpt_path))
        raise Exception("No checkpoints found.")
def __init__(self, model_name, model, data_loader=None, ckpt_path=None,
             ckpt_file=None, log_path=None, device='cuda', use_fp16=False,
             seed=None):
    self.data_loader = data_loader
    self.model_name = model_name
    self.model = model
    self.ckpt_path = ckpt_path
    self.log_path = log_path
    self.device = device
    self.seed = seed
    self.step = 0
    self.ckpt_file = ckpt_file
    self.use_fp16 = use_fp16

    # model
    self.model.eval()
    to_device_async(self.model, self.device)
    num_param = sum(param.numel() for param in model.parameters())
    tprint('The number of {} parameters: {}'.format(self.model_name, num_param))

    # precision
    if self.use_fp16:
        self.model = self.model.half()

    # data parallel
    self.model = nn.DataParallel(self.model)

    # set seed
    if seed is None:
        seed = np.random.randint(2**16)
    np.random.seed(seed)
    torch.manual_seed(seed)

    self.data_loader_iter = iter(self.data_loader)

    # logging
    if log_path:
        # tensorboard log path: {log_path}/YYYYMMDD-HHMMSS
        log_path = os.path.join(log_path, time.strftime('%Y%m%d-%H%M%S'))
        self.tbwriter = SummaryWriter(log_dir=log_path, flush_secs=10)

    # checkpoint path
    if self.ckpt_path:
        self.ckpt_path = os.path.join(self.ckpt_path, self.model_name)
        pathlib.Path(self.ckpt_path).mkdir(parents=True, exist_ok=True)

    # load checkpoint
    self.load(ckpt_file)
def __init__(self, ckpt_file, device='cuda', use_fp16=False, use_denoiser=False):
    self.ckpt_file = ckpt_file
    self.device = device
    self.use_fp16 = use_fp16
    self.use_denoiser = use_denoiser

    # model
    from waveglow.arg_parser import parse_waveglow_args
    parser = argparse.ArgumentParser()
    model_parser = parse_waveglow_args(parser)
    args, _ = model_parser.parse_known_args()
    model_config = dict(
        n_mel_channels=args.n_mel_channels,
        n_flows=args.flows,
        n_group=args.groups,
        n_early_every=args.early_every,
        n_early_size=args.early_size,
        WN_config=dict(
            n_layers=args.wn_layers,
            kernel_size=args.wn_kernel_size,
            n_channels=args.wn_channels
        )
    )
    self.model = WaveGlow(**model_config)

    state_dict = torch.load(self.ckpt_file, map_location=self.device)['state_dict']
    state_dict = unwrap_distributed(state_dict)
    self.model.load_state_dict(state_dict)

    self.model = to_device_async(self.model, self.device)
    self.model = self.model.remove_weightnorm(self.model)
    self.model.eval()

    if self.use_fp16:
        self.model = self.model.half()

    if self.use_denoiser:
        self.denoiser = Denoiser(self.model, device=device)
        self.denoiser = to_device_async(self.denoiser, self.device)
        tprint('Using WaveGlow denoiser.')
def set_engine_and_context(self, length):
    for i, (trt_max_input_seq_len, trt_max_output_seq_len,
            trt_file_path) in enumerate(self.max_seq_lens_and_file_path_list):
        if length <= trt_max_input_seq_len:
            self.engine = self.engine_list[i]
            self.context = self.context_list[i]
            self.trt_max_input_seq_len = trt_max_input_seq_len
            self.trt_max_output_seq_len = trt_max_output_seq_len
            self.trt_file_path = trt_file_path
            break
    else:
        # no engine covers this length; fall back to the last (largest) engine.
        self.engine = self.engine_list[-1]
        self.context = self.context_list[-1]
        self.trt_max_input_seq_len = trt_max_input_seq_len
        self.trt_max_output_seq_len = trt_max_output_seq_len
        self.trt_file_path = trt_file_path

    tprint('TRT Engine {} is selected.'.format(self.trt_file_path))
def __init__(self, ckpt_file, engine_file, use_fp16=False, use_denoiser=False,
             stride=256, n_groups=8):
    self.ckpt_file = ckpt_file
    self.engine_file = engine_file
    self.use_fp16 = use_fp16
    self.use_denoiser = use_denoiser
    self.stride = stride
    self.n_groups = n_groups

    if self.use_denoiser:
        sys.path.append('waveglow')

        waveglow = torch.load(self.ckpt_file)['model']
        waveglow = waveglow.remove_weightnorm(waveglow)
        waveglow.eval()
        self.denoiser = Denoiser(waveglow)
        self.denoiser = to_gpu_async(self.denoiser)

        tprint('Using WaveGlow denoiser.')

        # after initialization, we don't need the WaveGlow PyTorch checkpoint
        # anymore - delete it.
        del waveglow
        torch.cuda.empty_cache()

    # load engine
    with open(self.engine_file, "rb") as f, trt.Runtime(TRT_LOGGER) as runtime:
        self.engine = runtime.deserialize_cuda_engine(f.read())

    if self.engine:
        tprint('TRT Engine Loaded from {} successfully.'.format(self.engine_file))
        return
    else:
        tprint('Loading TRT Engine from {} failed.'.format(self.engine_file))
def build_engine(self):
    if self.trt_file_path and os.path.isfile(self.trt_file_path) and not self.trt_force_build:
        with open(self.trt_file_path, 'rb') as f:
            engine_str = f.read()
        with trt.Runtime(TRT_LOGGER) as runtime:
            self.engine = runtime.deserialize_cuda_engine(engine_str)

        if self.engine:
            tprint('TRT Engine Loaded from {} successfully.'.format(self.trt_file_path))
            return
        else:
            tprint('Loading TRT Engine from {} failed.'.format(self.trt_file_path))

    tprint('Building a TRT Engine..')
    self.engine = self.do_build_engine()
    tprint('TRT Engine Built.')

    if self.trt_file_path:
        with open(self.trt_file_path, 'wb') as f:
            f.write(self.engine.serialize())
        tprint('TRT Engine Saved in {}.'.format(self.trt_file_path))
def generate(hparam='infer.yaml',
             text='test_sentences.txt',
             results_path='results',
             device=DEFAULT_DEVICE,
             **kwargs):
    """The script for generating waveforms from texts with a vocoder.

    By default, this script loads parameters from the default config file,
    fastspeech/hparams/infer.yaml.

    Besides the flags, you can also set parameters in the config file via the command-line. For example,
    --checkpoint_path=CHECKPOINT_PATH
        Path to the checkpoint directory. The latest checkpoint will be loaded.
    --waveglow_path=WAVEGLOW_PATH
        Path to the WaveGlow checkpoint file.
    --waveglow_engine_path=WAVEGLOW_ENGINE_PATH
        Path to the WaveGlow engine file. It can only be used with --use_trt=True.
    --batch_size=BATCH_SIZE
        Batch size to use. Defaults to 1.

    Refer to fastspeech/hparams/infer.yaml to see more parameters.

    Args:
        hparam (str, optional): Path to the default config file. Defaults to "infer.yaml".
        text (str, optional): A sample text or a text file path to generate waveforms for. Defaults to 'test_sentences.txt'.
        results_path (str, optional): Path to the output waveforms directory. Defaults to 'results'.
        device (str, optional): Device to use. Defaults to "cuda" if available, else "cpu".
    """
    hp.set_hparam(hparam, kwargs)

    if os.path.isfile(text):
        f = open(text, 'r', encoding="utf-8")
        texts = f.read().splitlines()
    else:  # single string
        texts = [text]

    dataset = TextDataset(texts)
    data_loader = PadDataLoader(dataset,
                                batch_size=hp.batch_size,
                                num_workers=hp.n_workers,
                                shuffle=False,
                                drop_last=False)

    # text to mel
    model = Fastspeech(
        max_seq_len=hp.max_seq_len,
        d_model=hp.d_model,
        phoneme_side_n_layer=hp.phoneme_side_n_layer,
        phoneme_side_head=hp.phoneme_side_head,
        phoneme_side_conv1d_filter_size=hp.phoneme_side_conv1d_filter_size,
        phoneme_side_output_size=hp.phoneme_side_output_size,
        mel_side_n_layer=hp.mel_side_n_layer,
        mel_side_head=hp.mel_side_head,
        mel_side_conv1d_filter_size=hp.mel_side_conv1d_filter_size,
        mel_side_output_size=hp.mel_side_output_size,
        duration_predictor_filter_size=hp.duration_predictor_filter_size,
        duration_predictor_kernel_size=hp.duration_predictor_kernel_size,
        fft_conv1d_kernel=hp.fft_conv1d_kernel,
        fft_conv1d_padding=hp.fft_conv1d_padding,
        dropout=hp.dropout,
        n_mels=hp.num_mels,
        fused_layernorm=hp.fused_layernorm)

    fs_inferencer = get_inferencer(model, data_loader, device)

    # set up WaveGlow
    if hp.use_trt:
        from fastspeech.trt.waveglow_trt_inferencer import WaveGlowTRTInferencer
        wb_inferencer = WaveGlowTRTInferencer(ckpt_file=hp.waveglow_path,
                                              engine_file=hp.waveglow_engine_path,
                                              use_fp16=hp.use_fp16)
    else:
        wb_inferencer = WaveGlowInferencer(ckpt_file=hp.waveglow_path,
                                           device=device,
                                           use_fp16=hp.use_fp16)

    tprint("Generating {} sentences..".format(len(dataset)))

    with fs_inferencer, wb_inferencer:
        try:
            for i in range(len(data_loader)):
                tprint("------------- BATCH # {} -------------".format(i))

                with TimeElapsed(name="Inference Time: E2E", format=":.6f"):
                    ## Text-to-Mel ##
                    with TimeElapsed(name="Inference Time: FastSpeech",
                                     device=device,
                                     cuda_sync=True,
                                     format=":.6f"), torch.no_grad():
                        outputs = fs_inferencer.infer()

                    texts = outputs["text"]
                    mels = outputs["mel"]  # (b, n_mels, t)
                    mel_masks = outputs['mel_mask']  # (b, t)
                    # assert(mels.is_cuda)

                    # remove paddings
                    mel_lens = mel_masks.sum(axis=1)
                    max_len = mel_lens.max()
                    mels = mels[..., :max_len]
                    mel_masks = mel_masks[..., :max_len]

                    ## Vocoder ##
                    with TimeElapsed(name="Inference Time: WaveGlow",
                                     device=device,
                                     cuda_sync=True,
                                     format=":.6f"), torch.no_grad():
                        wavs = wb_inferencer.infer(mels)
                        wavs = to_cpu_numpy(wavs)

                ## Write wavs ##
                pathlib.Path(results_path).mkdir(parents=True, exist_ok=True)
                for i, (text, wav) in enumerate(zip(texts, wavs)):
                    tprint("TEXT #{}: \"{}\"".format(i, text))

                    # remove paddings in case of batch size > 1
                    wav_len = mel_lens[i] * hp.hop_len
                    wav = wav[:wav_len]

                    path = os.path.join(results_path, text + ".wav")
                    librosa.output.write_wav(path, wav, hp.sr)
        except StopIteration:
            tprint("Generation has been done.")
        except KeyboardInterrupt:
            tprint("Generation has been canceled.")
def train(hparam="train.yaml", device=DEFAULT_DEVICE, **kwargs): """ The FastSpeech model training script. By default, this script assumes to load parameters in the default config file, fastspeech/hparams/train.yaml. Besides the flags, you can also set parameters in the config file via the command-line. For examples, --dataset_path=DATASET_PATH Path to dataset directory. --tacotron2_path=TACOTRON2_PATH Path to tacotron2 checkpoint file. --mels_path=MELS_PATH Path to preprocessed mels directory. --aligns_path=ALIGNS_PATH Path to preprocessed alignments directory. --log_path=LOG_PATH Path to log directory. --checkpoint_path=CHECKPOINT_PATH Path to checkpoint directory. The latest checkpoint will be loaded. --batch_size=BATCH_SIZE Batch size to use. Defaults to 16. Refer to fastspeech/hparams/train.yaml to see more parameters. Args: hparam (str, optional): Path to default config file. Defaults to "train.yaml". device (str, optional): Device to use. Defaults to "cuda" if avaiable, or "cpu". """ hp.set_hparam(hparam, kwargs) tprint("Hparams:\n{}".format(pp.pformat(hp))) tprint("Device count: {}".format(torch.cuda.device_count())) # model model = Fastspeech( max_seq_len=hp.max_seq_len, d_model=hp.d_model, phoneme_side_n_layer=hp.phoneme_side_n_layer, phoneme_side_head=hp.phoneme_side_head, phoneme_side_conv1d_filter_size=hp.phoneme_side_conv1d_filter_size, phoneme_side_output_size=hp.phoneme_side_output_size, mel_side_n_layer=hp.mel_side_n_layer, mel_side_head=hp.mel_side_head, mel_side_conv1d_filter_size=hp.mel_side_conv1d_filter_size, mel_side_output_size=hp.mel_side_output_size, duration_predictor_filter_size=hp.duration_predictor_filter_size, duration_predictor_kernel_size=hp.duration_predictor_kernel_size, fft_conv1d_kernel=hp.fft_conv1d_kernel, fft_conv1d_padding=hp.fft_conv1d_padding, dropout=hp.dropout, n_mels=hp.num_mels, fused_layernorm=hp.fused_layernorm) # dataset dataset = LJSpeechDataset( root_path=hp.dataset_path, meta_file=hp.meta_file, mels_path=hp.mels_path, aligns_path=hp.aligns_path, sr=hp.sr, n_fft=hp.n_fft, win_len=hp.win_len, hop_len=hp.hop_len, n_mels=hp.num_mels, mel_fmin=hp.mel_fmin, mel_fmax=hp.mel_fmax, ) tprint("Dataset size: {}".format(len(dataset))) # data loader data_loader = PadDataLoader( dataset, batch_size=hp.batch_size, num_workers=hp.n_workers, drop_last=True, ) # optimizer def get_optimizer(model): optimizer = torch.optim.Adam(model.parameters(), lr=hp.learning_rate, betas=(0.9, 0.98), eps=1e-9) return optimizer def get_warmup_lr_scheduler(optimizer): d_model = hp.d_model warmup_steps = hp.warmup_steps lr = lambda step: d_model**-0.5 * min( (step + 1)**-0.5, (step + 1) * warmup_steps**-1.5) / hp.learning_rate scheduler = LambdaLR(optimizer, lr_lambda=[lr]) return scheduler # trainer trainer = FastspeechTrainer( data_loader, 'fastspeech', model, optimizer_fn=get_optimizer, final_steps=hp.final_steps, log_steps=hp.log_step, ckpt_path=hp.checkpoint_path, save_steps=hp.save_step, log_path=hp.log_path, lr_scheduler_fn=get_warmup_lr_scheduler, pre_aligns=True if hp.aligns_path else False, device=device, use_amp=hp.use_amp, nvprof_iter_start=hp.nvprof_iter_start, nvprof_iter_end=hp.nvprof_iter_end, pyprof_enabled=hp.pyprof_enabled, ) trainer.train()
def forward(self,
            seq,
            pos,
            duration_target=None,
            alpha=1.0,
            seq_output_len=None,
            use_fp16=False,
            acts=None):
    # Phoneme Embedding
    output = self.word_emb(seq)

    if acts is not None:
        acts["act.emb"] = output

    if use_fp16:
        output = output.half()

    # Phoneme Side FFT Blocks
    output, output_mask = self.phoneme_side(output, pos, acts=acts)

    if acts is not None:
        acts["act.phoneme_side.seq"] = output

    # Length Regulator
    output, pos, duration = self.length_regulator(output,
                                                  output_mask,
                                                  target=duration_target,
                                                  alpha=alpha)

    if seq_output_len:
        output = F.pad(output, pad=(0, 0, 0, seq_output_len - output.size(1)))
        pos = F.pad(pos, pad=(0, seq_output_len - pos.size(1)))

    # length of output mel shouldn't exceed max_seq_len
    output = output[:, :self.max_seq_len]
    pos = pos[:, :self.max_seq_len]

    if acts is not None:
        acts["act.length_regulator.seq"] = output
        acts["act.length_regulator.dur"] = torch.round(duration)

    if self.training or output.bool().any():
        # Mel Side FFT Blocks
        output, output_mask = self.mel_side(output, pos, acts=acts)

        if acts is not None:
            acts["act.mel_side.seq"] = output

        # Linear Layer
        output = self.mel_linear(output)

        if acts is not None:
            acts["out.seq_mask"] = output_mask
            acts["out.seq"] = output
    else:
        # seq length could be zero, in case the duration predictor outputs all zeros.
        # In this case, skip feed-forwarding.
        tprint("Duration Predictor outputs all zeros. Output will be zero length.")
        output_shape = (output.size(0), 0, output_mask.size(2))
        output = torch.zeros(size=output_shape)
        output_mask = torch.ones(size=output_shape)

    if torch.cuda.device_count() > 1:
        # In a multi-gpu setting, all output mels from devices must have the same length;
        # otherwise, an error occurs in the process of gathering outputs.
        if not seq_output_len:
            seq_output_len = self.max_seq_len
        padding = (0, 0, 0, seq_output_len - output.size(1))
        output = F.pad(output, padding)
        output = output[:, :seq_output_len, :]
        output_mask = F.pad(output_mask, padding)
        output_mask = output_mask[:, :seq_output_len, :]

    return output, output_mask, duration
def console_log(self, tag, output):
    # console logging
    msg = ""
    for key, value in sorted(output.items()):
        msg += ',\t{}: {}'.format(key, value)
    tprint(msg)
def perf_inference(hparam="infer.yaml", with_vocoder=False, n_iters=None, device=DEFAULT_DEVICE, **kwargs): """The script for estimating inference performance. By default, this script assumes to load parameters in the default config file, fastspeech/hparams/infer.yaml. Besides the flags, you can also set parameters in the config file via the command-line. For examples, --dataset_path=DATASET_PATH Path to dataset directory. --checkpoint_path=CHECKPOINT_PATH Path to checkpoint directory. The latest checkpoint will be loaded. --batch_size=BATCH_SIZE Batch size to use. Defaults to 1. Refer to fastspeech/hparams/infer.yaml to see more parameters. Args: hparam (str, optional): Path to default config file. Defaults to "infer.yaml". with_vocoder (bool, optional): Whether or not to estimate with a vocoder. Defaults to False. n_iters (int, optional): Number of batches to estimate. Defaults to None (an epoch). device (str, optional): Device to use. Defaults to "cuda" if avaiable, or "cpu". """ hp.set_hparam(hparam, kwargs) tprint("Hparams:\n{}".format(pp.pformat(hp))) tprint("Device count: {}".format(torch.cuda.device_count())) model = Fastspeech( max_seq_len=hp.max_seq_len, d_model=hp.d_model, phoneme_side_n_layer=hp.phoneme_side_n_layer, phoneme_side_head=hp.phoneme_side_head, phoneme_side_conv1d_filter_size=hp.phoneme_side_conv1d_filter_size, phoneme_side_output_size=hp.phoneme_side_output_size, mel_side_n_layer=hp.mel_side_n_layer, mel_side_head=hp.mel_side_head, mel_side_conv1d_filter_size=hp.mel_side_conv1d_filter_size, mel_side_output_size=hp.mel_side_output_size, duration_predictor_filter_size=hp.duration_predictor_filter_size, duration_predictor_kernel_size=hp.duration_predictor_kernel_size, fft_conv1d_kernel=hp.fft_conv1d_kernel, fft_conv1d_padding=hp.fft_conv1d_padding, dropout=hp.dropout, n_mels=hp.num_mels, fused_layernorm=hp.fused_layernorm) dataset_size = hp.batch_size * (n_iters if n_iters else 1) tprint("Dataset size: {}".format(dataset_size)) dataset = TextDataset([INPUT_TEXT] * (dataset_size + (WARMUP_ITERS * hp.batch_size))) data_loader = PadDataLoader( dataset, batch_size=hp.batch_size, num_workers=hp.n_workers, shuffle=False if hp.use_trt and hp.trt_multi_engine else True, drop_last=True, ) fs_inferencer = get_inferencer(model, data_loader, device) if with_vocoder: if hp.use_trt: from fastspeech.trt.waveglow_trt_inferencer import WaveGlowTRTInferencer wb_inferencer = WaveGlowTRTInferencer( ckpt_file=hp.waveglow_path, engine_file=hp.waveglow_engine_path, use_fp16=hp.use_fp16) else: wb_inferencer = WaveGlowInferencer(ckpt_file=hp.waveglow_path, device=device, use_fp16=hp.use_fp16) with fs_inferencer, wb_inferencer if with_vocoder else ExitStack(): tprint("Perf started. 
Batch size={}.".format(hp.batch_size)) latencies = [] throughputs = [] for i in tqdm(range(len(data_loader))): start = time.time() outputs = fs_inferencer.infer() mels = outputs['mel'] mel_masks = outputs['mel_mask'] assert (mels.is_cuda) if with_vocoder: # remove padding max_len = mel_masks.sum(axis=1).max() mels = mels[..., :max_len] mel_masks = mel_masks[..., :max_len] with torch.no_grad(): wavs = wb_inferencer.infer(mels) wavs = to_cpu_numpy(wavs) else: # include time for DtoH copy to_cpu_numpy(mels) to_cpu_numpy(mel_masks) end = time.time() if i > WARMUP_ITERS - 1: time_elapsed = end - start generated_samples = len(mel_masks.nonzero()) * hp.hop_len throughput = generated_samples / time_elapsed latencies.append(time_elapsed) throughputs.append(throughput) latencies.sort() avg_latency = np.mean(latencies) std_latency = np.std(latencies) latency_90 = max(latencies[:int(len(latencies) * 0.90)]) if n_iters > 1 else 0 latency_95 = max(latencies[:int(len(latencies) * 0.95)]) if n_iters > 1 else 0 latency_99 = max(latencies[:int(len(latencies) * 0.99)]) if n_iters > 1 else 0 throughput = np.mean(throughputs) rtf = throughput / (hp.sr * hp.batch_size) tprint( "Batch size\tPrecision\tAvg Latency(s)\tStd Latency(s)\tLatency 90%(s)\tLatency 95%(s)\tLatency 99%(s)\tThroughput(samples/s)\tAvg RTF\n\ {}\t{}\t{:.4f}\t{:.4f}\t{:.4f}\t{:.4f}\t{:.4f}\t{}\t{:.2f}".format( hp.batch_size, "FP16" if hp.use_fp16 else "FP32", avg_latency, std_latency, latency_90, latency_95, latency_99, int(throughput), rtf))
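# A hypothetical usage sketch (not part of the original file): estimate FastSpeech-only
# performance over 100 batches at batch size 1; checkpoint_path is a placeholder.
perf_inference(hparam="infer.yaml",
               with_vocoder=False,
               n_iters=100,
               batch_size=1,
               checkpoint_path="/checkpoints")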
def console_log(self, tag, loss, meta):
    # console logging
    msg = 'loss: {:.6f}'.format(loss)
    for key, value in meta.items():
        msg += ',\t{}: {:.4f}'.format(key, value)
    tprint(msg)
def __init__(self,
             data_loader,
             model_name,
             model,
             optimizer_fn,
             final_steps,
             lr_scheduler_fn=None,
             step=0,
             ckpt_path=None,
             log_path=None,
             n_epochs=None,
             save_steps=None,
             log_steps=10,
             device='cuda',
             use_amp=False,
             nvprof_iter_start=None,
             nvprof_iter_end=None,
             pyprof_enabled=False,
             detect_anomaly=False,
             seed=None):
    self.data_loader = data_loader
    self.model_name = model_name
    self.model = model
    self.n_epochs = n_epochs
    self.save_steps = save_steps
    self.log_steps = log_steps
    self.ckpt_path = ckpt_path
    self.log_path = log_path
    self.final_steps = final_steps
    self.step = step
    self.device = device
    self.use_amp = use_amp
    self.nvprof_iter_start = nvprof_iter_start
    self.nvprof_iter_end = nvprof_iter_end
    self.pyprof_enabled = pyprof_enabled
    self.detect_anomaly = detect_anomaly

    # model
    self.model.train()
    to_device_async(self.model, self.device)
    num_param = sum(param.numel() for param in model.parameters())
    tprint('The number of {} parameters: {}'.format(self.model_name, num_param))

    # optimizer
    self.optimizer = optimizer_fn(model)

    # lr scheduler
    if lr_scheduler_fn:
        self.lr_scheduler = lr_scheduler_fn(self.optimizer)
    else:
        self.lr_scheduler = None

    # automatic mixed precision
    if self.use_amp:
        from apex import amp
        self.model, self.optimizer = amp.initialize(self.model,
                                                    self.optimizer,
                                                    opt_level='O1')

    # profile
    if nvprof_iter_start and nvprof_iter_end is not None and pyprof_enabled:
        from apex import pyprof
        pyprof.nvtx.init()

    # data parallel
    self.model = nn.DataParallel(self.model)

    # set seed
    if seed is None:
        seed = np.random.randint(2**16)
    np.random.seed(seed)
    torch.manual_seed(seed)

    # data loader
    self.data_loader_iter = self.repeat(self.data_loader, n_epochs)

    # logging
    if log_path:
        # tensorboard log path: {log_path}/YYYYMMDD-HHMMSS
        log_path = os.path.join(log_path, time.strftime('%Y%m%d-%H%M%S'))
        self.tbwriter = SummaryWriter(log_dir=log_path, flush_secs=10)

    # checkpoint path
    if self.ckpt_path:
        self.ckpt_path = os.path.join(self.ckpt_path, self.model_name)
        pathlib.Path(self.ckpt_path).mkdir(parents=True, exist_ok=True)

    # load checkpoint
    self.load()
def infer(hparam="infer.yaml", device=DEFAULT_DEVICE, n_iters=1, **kwargs): """ The FastSpeech model inference script. By default, this script assumes to load parameters in the default config file, fastspeech/hparams/infer.yaml. Besides the flags, you can also set parameters in the config file via the command-line. For examples, --dataset_path=DATASET_PATH Path to dataset directory. --checkpoint_path=CHECKPOINT_PATH Path to checkpoint directory. The latest checkpoint will be loaded. --batch_size=BATCH_SIZE Batch size to use. Defaults to 1. Refer to fastspeech/hparams/infer.yaml to see more parameters. Args: hparam (str, optional): Path to default config file. Defaults to "infer.yaml". device (str, optional): Device to use. Defaults to "cuda" if avaiable, or "cpu". n_iters (int, optional): Number of batches to infer. Defaults to 1. """ hp.set_hparam(hparam, kwargs) tprint("Hparams:\n{}".format(pp.pformat(hp))) tprint("Device count: {}".format(torch.cuda.device_count())) # model model = Fastspeech( max_seq_len=hp.max_seq_len, d_model=hp.d_model, phoneme_side_n_layer=hp.phoneme_side_n_layer, phoneme_side_head=hp.phoneme_side_head, phoneme_side_conv1d_filter_size=hp.phoneme_side_conv1d_filter_size, phoneme_side_output_size=hp.phoneme_side_output_size, mel_side_n_layer=hp.mel_side_n_layer, mel_side_head=hp.mel_side_head, mel_side_conv1d_filter_size=hp.mel_side_conv1d_filter_size, mel_side_output_size=hp.mel_side_output_size, duration_predictor_filter_size=hp.duration_predictor_filter_size, duration_predictor_kernel_size=hp.duration_predictor_kernel_size, fft_conv1d_kernel=hp.fft_conv1d_kernel, fft_conv1d_padding=hp.fft_conv1d_padding, dropout=hp.dropout, n_mels=hp.num_mels, fused_layernorm=hp.fused_layernorm ) dataset = LJSpeechDataset(root_path=hp.dataset_path, meta_file=hp.meta_file, sr=hp.sr, n_fft=hp.n_fft, win_len=hp.win_len, hop_len=hp.hop_len, n_mels=hp.num_mels, mel_fmin=hp.mel_fmin, mel_fmax=hp.mel_fmax, exclude_mels=True, sort_by_length=True if hp.use_trt and hp.trt_multi_engine else False ) tprint("Dataset size: {}".format(len(dataset))) data_loader = PadDataLoader(dataset, batch_size=hp.batch_size, num_workers=hp.n_workers, shuffle=False if hp.use_trt and hp.trt_multi_engine else True, drop_last=True, ) inferencer = get_inferencer(model, data_loader, device) try: n_iters = min(len(data_loader), n_iters) if n_iters else len(data_loader) tprint("Num of iters: {}".format(n_iters)) with inferencer: for i in range(n_iters): tprint("------------- INFERENCE : batch #{} -------------".format(i)) with TimeElapsed(name="Inference Time", cuda_sync=True): out_batch = inferencer.infer() # tprint("Output:\n{}".format(pp.pformat(out_batch))) tprint("Inference has been done.") except KeyboardInterrupt: tprint("Inference has been canceled.")