def world_synthesis(wav_name, feat_param, f0, ap, spectral, spectral_type):
    """Synthesise a waveform from WORLD features and write it as 16-bit PCM.

    Args:
        wav_name (str): filename of the synthesised wav
        feat_param (dict): acoustic feature parameter dictionary; the keys
            'fs', 'fftl' and 'shiftms' are read, plus 'mcep_alpha' when
            spectral_type is 'mcc'
        f0 (np array): pitch features
        ap: aperiodicity features
        spectral: spectral features
        spectral_type (str): spectral feature type, 'mcc' or 'sp'

    Raises:
        ValueError: if spectral_type is neither 'mcc' nor 'sp'
    """
    # Reject an unknown feature type up front so the caller gets a precise
    # error naming the values that are actually accepted.
    if spectral_type not in ('mcc', 'sp'):
        message = ("Unsupported spectral_type %r: currently support "
                   "'mcc' or 'sp' only." % (spectral_type,))
        logging.error(message)
        raise ValueError(message)

    synthesizer = Synthesizer(fs=feat_param['fs'],
                              fftl=feat_param['fftl'],
                              shiftms=feat_param['shiftms'])

    if spectral_type == 'mcc':
        wav = synthesizer.synthesis(f0, spectral, ap,
                                    alpha=feat_param['mcep_alpha'])
    else:
        wav = synthesizer.synthesis_spc(f0, spectral, ap)

    # Saturate to the int16 range so the cast below cannot wrap around.
    wav = np.clip(wav, -32768, 32767)
    wavfile.write(wav_name, feat_param['fs'], wav.astype(np.int16))
    logging.info("wrote %s.", wav_name)
def melcepstrum_noise_shaping(wav_list, args):
    """Apply noise shaping using STFT-based mcep.

    Every wav file in wav_list is passed through synthesizer.synthesis_diff
    with the scaled average mel-cepstrum read from args.stats, then through
    low_cut_filter (cutoff=70), and written to args.writedir under its
    basename as 16-bit PCM.

    Args:
        wav_list (list): filenames of the wav files to process
        args: arguments providing fs, shiftms, fftl, stats, mag, inv,
            mcep_alpha and writedir

    The process exits with status 1 when a file's sampling frequency differs
    from args.fs.
    """
    # define synthesizer
    synthesizer = Synthesizer(fs=args.fs, shiftms=args.shiftms, fftl=args.fftl)

    # load average mcep once: it does not depend on the wav file
    base_coef = read_hdf5(args.stats, "/mcep/mean") * args.mag
    base_coef[0] = 0.0
    if args.inv:
        base_coef[1:] = -1.0 * base_coef[1:]

    for i, wav_name in enumerate(wav_list):
        logging.info("now processing %s (%d/%d)",
                     wav_name, i + 1, len(wav_list))

        # load wavfile
        fs, x = wavfile.read(wav_name)
        if x.dtype != np.int16:
            logging.warning("wav file format is not 16 bit PCM.")
        x = np.float64(x)

        # check sampling frequency
        if fs != args.fs:
            logging.error("sampling frequency is not matched.")
            sys.exit(1)

        # get frame number
        num_frames = int(1000 * len(x) / fs / args.shiftms) + 1

        # repeat the same coefficient vector for every frame
        mlsa_coef = np.float64(np.tile(base_coef, [num_frames, 1]))

        # synthesis and write
        x_ns = synthesizer.synthesis_diff(x, mlsa_coef, alpha=args.mcep_alpha)
        x_ns = low_cut_filter(x_ns, args.fs, cutoff=70)
        # saturate so the int16 cast cannot wrap around on loud samples
        x_ns = np.clip(x_ns, -32768, 32767)
        write_name = os.path.join(args.writedir, os.path.basename(wav_name))
        wavfile.write(write_name, args.fs, np.int16(x_ns))
def __init__(self,
             feature_queue: Queue,
             converted_queue: Queue,
             mcep_gmm_config: configs.McepGMMConfig,
             f0_stats_config: configs.F0StatsConfig,
             gv_config: configs.GVConfig,
             synthesizer_config: configs.SynthesizerConfig):
    """Create the conversion components and keep the queues and configs.

    Args:
        feature_queue: queue stored as ``_feature_queue``
        converted_queue: queue stored as ``_converted_queue``
        mcep_gmm_config: supplies n_mix, covtype and param for the GMM
        f0_stats_config: stored as ``_f0_stats_config``
        gv_config: stored as ``_mcep_gv_config``
        synthesizer_config: supplies fs, fftl and shiftms for the synthesizer
    """
    # GMM convertor, loaded from the parameters carried by its config.
    gmm = GMMConvertor(
        n_mix=mcep_gmm_config.n_mix,
        covtype=mcep_gmm_config.covtype,
        gmmmode=None,
    )
    gmm.open_from_param(mcep_gmm_config.param)
    self._mcep_gmm = gmm
    self._mcep_gmm_config = mcep_gmm_config

    # Input / output queues.
    self._feature_queue: Queue = feature_queue
    self._converted_queue: Queue = converted_queue

    # F0 statistics helper and its config.
    self._f0_stats = F0statistics()
    self._f0_stats_config = f0_stats_config

    # GV helper and its config.
    self._mcep_gv = GV()
    self._mcep_gv_config = gv_config

    # Waveform synthesizer and its config.
    synth_kwargs = {
        "fs": synthesizer_config.fs,
        "fftl": synthesizer_config.fftl,
        "shiftms": synthesizer_config.shiftms,
    }
    self._synthesizer = Synthesizer(**synth_kwargs)
    self._synthesizer_config = synthesizer_config
def noise_shaping(wav_list, args):
    """Apply noise shaping.

    Args:
        wav_list (list): feature ids; each id is substituted for the literal
            text "feat_id" in args.outdir (input wav path) and in
            args.writedir (output wav path)
        args: arguments providing fs, shiftms, fftl, outdir, writedir, stats,
            feature_type, mcep_dim_start, mcep_dim_end, mag, inv and
            mcep_alpha

    The process exits with status 1 when a file's sampling frequency differs
    from args.fs.
    """
    import os  # needed only to create the output directory below

    # define feature extractor
    feature_extractor = FeatureExtractor(
        analyzer="world",
        fs=args.fs,
        shiftms=args.shiftms,
        fftl=args.fftl)

    # define synthesizer
    synthesizer = Synthesizer(
        fs=args.fs,
        shiftms=args.shiftms,
        fftl=args.fftl)

    # load average mcep once: it does not depend on the wav file
    base_coef = read_hdf5(args.stats, "/%s/mean" % args.feature_type)
    base_coef = base_coef[args.mcep_dim_start:args.mcep_dim_end] * args.mag
    base_coef[0] = 0.0
    if args.inv:
        base_coef[1:] = -1.0 * base_coef[1:]

    for i, feat_id in enumerate(wav_list):
        logging.info("now processing %s (%d/%d)",
                     feat_id, i + 1, len(wav_list))

        # load wavfile, remembering its sample type for the write below
        wav_filename = args.outdir.replace("feat_id", feat_id)
        fs, x = wavfile.read(wav_filename)
        wav_type = x.dtype
        x = np.array(x, dtype=np.float64)

        # check sampling frequency
        if fs != args.fs:
            logging.error("sampling frequency is not matched.")
            sys.exit(1)

        # extract features (only to get the number of frames)
        f0, _, _ = feature_extractor.analyze(x)
        num_frames = f0.shape[0]

        # repeat the same coefficient vector for every frame
        mlsa_coef = np.tile(base_coef, [num_frames, 1])

        # synthesis and write
        x_ns = synthesizer.synthesis_diff(x, mlsa_coef, alpha=args.mcep_alpha)
        x_ns = low_cut_filter(x_ns, args.fs, cutoff=70)
        write_name = args.writedir.replace("feat_id", feat_id)

        # check directory existence: create it so the write cannot fail on a
        # missing folder (exist_ok tolerates parallel workers racing here)
        write_dir = os.path.dirname(write_name)
        if write_dir:
            os.makedirs(write_dir, exist_ok=True)

        wav = np.clip(x_ns, -32768, 32767)
        if wav_type == np.int16:
            wavfile.write(write_name, args.fs, np.int16(wav))
        else:
            wavfile.write(write_name, args.fs, wav)
class ConverterWorker:
    """Features -> converted features -> corrected features -> converted speech."""

    def __init__(self,
                 feature_queue: Queue,
                 converted_queue: Queue,
                 mcep_gmm_config: configs.McepGMMConfig,
                 f0_stats_config: configs.F0StatsConfig,
                 gv_config: configs.GVConfig,
                 synthesizer_config: configs.SynthesizerConfig):
        """Create the conversion components and keep the queues and configs.

        Args:
            feature_queue: queue that ``start`` reads feature tuples from
            converted_queue: queue that ``start`` puts converted audio on
            mcep_gmm_config: supplies n_mix, covtype, param and cvtype
            f0_stats_config: supplies source_stats and target_stats
            gv_config: supplies target_stats, cvgv_stats and morph_coeff
            synthesizer_config: supplies fs, fftl, shiftms and mcep_alpha
        """
        self._mcep_gmm = GMMConvertor(n_mix=mcep_gmm_config.n_mix,
                                      covtype=mcep_gmm_config.covtype,
                                      gmmmode=None)
        self._mcep_gmm.open_from_param(mcep_gmm_config.param)
        self._mcep_gmm_config = mcep_gmm_config

        self._feature_queue: Queue = feature_queue
        self._converted_queue: Queue = converted_queue

        self._f0_stats = F0statistics()
        self._f0_stats_config = f0_stats_config

        self._mcep_gv = GV()
        self._mcep_gv_config = gv_config

        self._synthesizer = Synthesizer(fs=synthesizer_config.fs,
                                        fftl=synthesizer_config.fftl,
                                        shiftms=synthesizer_config.shiftms)
        self._synthesizer_config = synthesizer_config

    def convert_from_feature(self, f0, spc, ap, mcep) -> numpy.ndarray:
        """Convert one set of features and synthesise the result as int16.

        Args:
            f0: source F0 sequence
            spc: accepted for interface compatibility; not used here
            ap: aperiodicity passed unchanged to the synthesizer
            mcep: source mel-cepstrum, indexed as a 2-D array

        Returns:
            The synthesised waveform clipped to the int16 range and cast to
            int16.
        """
        cv_f0 = self._f0_stats.convert(f0,
                                       self._f0_stats_config.source_stats,
                                       self._f0_stats_config.target_stats)

        # Column 0 is excluded from the GMM conversion (presumably the power
        # term, going by the "wopow" name -- confirm).
        cv_mcep_wopow = self._mcep_gmm.convert(
            static_delta(mcep[:, 1:]),
            cvtype=self._mcep_gmm_config.cvtype)
        # Re-attach the untouched source column 0 in front of the converted
        # coefficients.
        cv_mcep = numpy.c_[mcep[:, 0], cv_mcep_wopow]

        cv_mcep_wGV = self._mcep_gv.postfilter(
            cv_mcep,
            self._mcep_gv_config.target_stats,
            cvgvstats=self._mcep_gv_config.cvgv_stats,
            alpha=self._mcep_gv_config.morph_coeff,
            startdim=1)

        output_wav = self._synthesizer.synthesis(
            cv_f0,
            cv_mcep_wGV,
            ap,
            rmcep=mcep,
            alpha=self._synthesizer_config.mcep_alpha)

        # numpy.int16 is the public dtype; numpy.core is a private namespace.
        return output_wav.clip(-32768, 32767).astype(numpy.int16)

    def start(self):
        """Convert items from the feature queue forever; never returns."""
        while True:
            feature = self._feature_queue.get()  # synchronous processing
            f0, spc, ap, mcep = feature
            output_wav = self.convert_from_feature(f0, spc, ap, mcep)
            self._converted_queue.put(output_wav)
def world_speech_synthesis(queue, wav_list, args):
    """WORLD speech synthesis from stored acoustic features.

    For each wav name the matching feature file is read, a waveform is
    synthesised from f0, mel-cepstrum and aperiodicity, and the result is
    written as 16-bit PCM. Existing output files are skipped unless
    args.overwrite is set.

    Parameters
    ----------
    queue : multiprocessing.Queue()
        receives the string 'Finish' once every file has been handled
    wav_list : list
        list of the wav files
    args :
        feature extract arguments; reads fs, fftl, shiftms, feature_dir,
        feature_format, overwrite, f0_dim_idx, ap_dim_idx, mcep_dim_start,
        mcep_dim_end and mcep_alpha

    The process exits with status 1 when a feature file has no "/world"
    dataset.
    NOTE(review): on that exit 'Finish' is never put on the queue -- confirm
    the consumer does not wait for it indefinitely.
    """
    # define synthesizer
    synthesizer = Synthesizer(fs=args.fs,
                              fftl=args.fftl,
                              shiftms=args.shiftms)

    # synthesis
    for i, wav_name in enumerate(wav_list):
        # derive the output wav name and the feature file name
        if args.feature_dir is None:
            restored_name = wav_name.replace(
                "wav", args.feature_format + "_restored")
            restored_name = restored_name.replace(
                ".%s" % args.feature_format + "_restored", ".wav")
            feat_name = wav_name.replace("wav", args.feature_format)
        else:
            restored_name = rootdir_replace(
                wav_name, newdir=args.feature_dir + "restored")
            feat_name = rootdir_replace(wav_name,
                                        extname=args.feature_format,
                                        newdir=args.feature_dir)

        if os.path.exists(restored_name):
            if args.overwrite:
                logging.info("overwrite %s (%d/%d)",
                             restored_name, i + 1, len(wav_list))
            else:
                logging.info("skip %s (%d/%d)",
                             restored_name, i + 1, len(wav_list))
                continue
        else:
            logging.info("now processing %s (%d/%d)",
                         restored_name, i + 1, len(wav_list))

        # load acoustic features
        if check_hdf5(feat_name, "/world"):
            h = read_hdf5(feat_name, "/world")
        else:
            logging.error("%s is not existed.", feat_name)
            sys.exit(1)

        # f0: use the stored one, otherwise rebuild it from the "/world"
        # columns and zero the frames whose column-0 flag is 0
        if check_hdf5(feat_name, "/f0"):
            f0 = read_hdf5(feat_name, "/f0")
        else:
            uv = h[:, 0].copy(order='C')
            f0 = h[:, args.f0_dim_idx].copy(order='C')  # cont_f0_lpf
            fz_idx = np.where(uv == 0.0)
            f0[fz_idx] = 0.0

        # aperiodicity: use the stored one, otherwise decode the coded one
        if check_hdf5(feat_name, "/ap"):
            ap = read_hdf5(feat_name, "/ap")
        else:
            codeap = h[:, args.ap_dim_idx:].copy(order='C')
            ap = pyworld.decode_aperiodicity(codeap, args.fs, args.fftl)

        mcep = h[:, args.mcep_dim_start:args.mcep_dim_end].copy(order='C')

        # waveform synthesis; clip so the int16 cast cannot wrap around
        wav = synthesizer.synthesis(f0, mcep, ap, alpha=args.mcep_alpha)
        wav = np.clip(wav, -32768, 32767)
        wavfile.write(restored_name, args.fs, wav.astype(np.int16))

    queue.put('Finish')
def main():
    """Parse the command line and noise-shape wav files in parallel.

    The input files come from --waveforms (a directory searched for "*.wav"
    or a list file), are split into --n_jobs groups, and each group is
    processed in its own process. Results are written to --writedir under
    the basename of each input file.
    """
    parser = argparse.ArgumentParser(
        description="making feature file argsurations.")

    parser.add_argument("--waveforms", default=None,
                        help="directory or list of filename of input wavfile")
    parser.add_argument("--stats", default=None,
                        help="filename of hdf5 format")
    parser.add_argument("--writedir", default=None,
                        help="directory to save preprocessed wav file")
    parser.add_argument("--fs", default=FS, type=int,
                        help="Sampling frequency")
    parser.add_argument("--shiftms", default=SHIFTMS, type=int,
                        help="Frame shift in msec")
    parser.add_argument("--fftl", default=FFTL, type=int,
                        help="FFT length")
    parser.add_argument("--mcep_dim_start", default=MCEP_DIM_START, type=int,
                        help="Start index of mel cepstrum")
    parser.add_argument("--mcep_dim_end", default=MCEP_DIM_END, type=int,
                        help="End index of mel cepstrum")
    parser.add_argument("--mcep_alpha", default=MCEP_ALPHA, type=float,
                        help="Alpha of mel cepstrum")
    parser.add_argument("--mag", default=MAG, type=float,
                        help="magnification of noise shaping")
    parser.add_argument("--verbose", default=1, type=int,
                        help="log message level")
    parser.add_argument('--n_jobs', default=1, type=int,
                        help="number of parallel jobs")
    parser.add_argument('--inv', default=False, type=strtobool,
                        help="if True, inverse filtering will be performed")
    args = parser.parse_args()

    # read list
    if os.path.isdir(args.waveforms):
        file_list = sorted(find_files(args.waveforms, "*.wav"))
    else:
        file_list = read_txt(args.waveforms)

    # define feature extractor
    feature_extractor = FeatureExtractor(analyzer="world",
                                         fs=args.fs,
                                         shiftms=args.shiftms,
                                         fftl=args.fftl)

    # define synthesizer
    synthesizer = Synthesizer(fs=args.fs,
                              shiftms=args.shiftms,
                              fftl=args.fftl)

    # check directory existence
    if not os.path.exists(args.writedir):
        os.makedirs(args.writedir)

    def noise_shaping(wav_list):
        """Noise-shape every file in wav_list and write it to args.writedir."""
        # load average mcep once: it does not depend on the wav file
        base_coef = read_hdf5(args.stats, "/mean")
        base_coef = base_coef[args.mcep_dim_start:args.mcep_dim_end] * args.mag
        base_coef[0] = 0.0
        if args.inv:
            base_coef[1:] = -1.0 * base_coef[1:]

        for wav_name in wav_list:
            # load wavfile, remembering its sample type for the write below
            fs, x = wavfile.read(wav_name)
            wav_type = x.dtype
            x = np.array(x, dtype=np.float64)

            # check sampling frequency
            if fs != args.fs:
                print("ERROR: sampling frequency is not matched.")
                sys.exit(1)

            # extract features (only to get the number of frames)
            f0, _, _ = feature_extractor.analyze(x)
            num_frames = f0.shape[0]

            # repeat the same coefficient vector for every frame
            mlsa_coef = np.tile(base_coef, [num_frames, 1])

            # synthesis and write
            x_ns = synthesizer.synthesis_diff(
                x, mlsa_coef, alpha=args.mcep_alpha)
            x_ns = low_cut_filter(x_ns, args.fs, cutoff=70)

            # The output name must be computed for every sample type; it used
            # to be set only for int16 input, so other inputs either failed
            # or reused the previous file's name.
            write_name = os.path.join(args.writedir,
                                      os.path.basename(wav_name))
            if wav_type == np.int16:
                # saturate so the int16 cast cannot wrap around
                x_ns = np.clip(x_ns, -32768, 32767)
                wavfile.write(write_name, args.fs, np.int16(x_ns))
            else:
                wavfile.write(write_name, args.fs, x_ns)

    # divide list
    file_lists = np.array_split(file_list, args.n_jobs)
    file_lists = [f_list.tolist() for f_list in file_lists]

    # multi processing
    # NOTE(review): the target is a nested function, which presumably needs
    # the "fork" start method (it cannot be pickled for "spawn") -- confirm
    # on the platforms this script is run on.
    processes = []
    for f in file_lists:
        p = mp.Process(target=noise_shaping, args=(f, ))
        p.start()
        processes.append(p)

    # wait for all process
    for p in processes:
        p.join()