def get_coefs(wav_file_path):
    sample_rate, x = wavfile.read(wav_file_path)
    # al.play(x.astype(float) / x.max(), fs=sample_rate)

    # Frame the signal and apply a Blackman window
    frames = librosa.util.frame(
        x, frame_length=frame_length, hop_length=hop_length).astype(np.float64).T
    frames *= pysptk.blackman(frame_length)

    # F0 estimation
    f0 = pysptk.swipe(
        x.astype(np.float64), fs=sample_rate, hopsize=hop_length,
        min=50, max=500)

    # Mel-cepstrum per frame (apply along the time axis)
    # order = 40
    # alpha = 0.41
    mc = np.apply_along_axis(
        pysptk.mcep, 1, frames, order, alpha)
    return sample_rate, f0, mc  # sample rate, F0 contour, mel-cepstrum coefficients
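# Usage sketch (the file name is an assumption; frame_length, hop_length,
# order and alpha are module-level constants in the original script):
sample_rate, f0, mc = get_coefs("sample.wav")
print(sample_rate, f0.shape, mc.shape)  # mc: (n_frames, order + 1)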
def get_synt_wav(wav_file_path):
    # Synthesis from mel-cepstrum
    sample_rate, x = wavfile.read(wav_file_path)
    # assert sample_rate == 16000
    # al.play(x.astype(float) / x.max(), fs=sample_rate)
    # Audio(x, rate=sample_rate)

    # All pysptk functions assume the input array is C-contiguous
    # with np.float64 element type
    frames = librosa.util.frame(
        x, frame_length=frame_length, hop_length=hop_length).astype(np.float64).T

    # Windowing
    frames *= pysptk.blackman(frame_length)
    # assert frames.shape[1] == frame_length

    # F0 estimation
    f0 = pysptk.swipe(
        x.astype(np.float64), fs=sample_rate, hopsize=hop_length,
        min=50, max=500)
    # `excite` is a helper module from the original project
    generator = excite.ExcitePulse(sample_rate, hop_length, False)
    source_excitation = generator.gen(f0)

    # Apply mcep along the `time` axis (=1)
    mc = np.apply_along_axis(
        pysptk.mcep, 1, frames, order, alpha)

    # Convert mel-cepstrum to MLSADF coefficients
    b = np.apply_along_axis(pysptk.mc2b, 1, mc, alpha)

    synthesizer = pysptk.synthesis.Synthesizer(
        pysptk.synthesis.MLSADF(order=order, alpha=alpha), hop_length)
    x_synthesized = synthesizer.synthesis(source_excitation, b)
    # Audio(x_synthesized, rate=sample_rate)
    # al.play(x_synthesized.astype(float) / x_synthesized.max(), fs=sample_rate)
    return x_synthesized
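# Usage sketch (file names are assumptions; the commented-out assert above
# suggests 16 kHz input):
x_hat = get_synt_wav("sample.wav")
wavfile.write("sample_resynth.wav", 16000, x_hat.astype(np.int16))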
def get_random_peseudo_mcep(order=24, alpha=0.42):
    # Generate T random pseudo-frames of length N, window them, and extract
    # mel-cepstra (pysptk.mcep applies along the last axis for 2-D input)
    T, N = 100, 513
    frames = np.random.rand(T, N) * pysptk.blackman(N)
    mc = pysptk.mcep(frames, order=order, alpha=alpha)
    return mc
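# Usage sketch: with the defaults above, the result is one 25-dimensional
# (order + 1) mel-cepstrum vector per pseudo-frame.
mc = get_random_peseudo_mcep()
print(mc.shape)  # (100, 25)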
MIN_F0 = 60
MAX_F0 = 240
ORDER = 20

IN_WAVE_FILE = "in.wav"    # input speech
OUT_WAVE_FILE = "out.wav"  # analysis/re-synthesized speech

# Load the speech signal
fs, x = wavfile.read(IN_WAVE_FILE)
x = x.astype(np.float64)

# Framing and windowing (FRAME_LENGTH and HOP_LENGTH are module-level
# constants in the original script)
frames = librosa.util.frame(x, frame_length=FRAME_LENGTH,
                            hop_length=HOP_LENGTH).astype(np.float64).T
frames *= pysptk.blackman(FRAME_LENGTH)  # windowing (Blackman window)

# Pitch extraction
pitch = pysptk.swipe(x, fs=fs, hopsize=HOP_LENGTH,
                     min=MIN_F0, max=MAX_F0, otype="pitch")

# Generate the excitation (glottal source) signal
source_excitation = pysptk.excite(pitch, HOP_LENGTH)

# Extract linear predictive coding (LPC) coefficients via linear prediction analysis
lpc = pysptk.lpc(frames, ORDER)
lpc[:, 0] = np.log(lpc[:, 0])  # replace gain with log gain for the synthesis filter
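# The snippet stops after LPC extraction. A minimal sketch of the remaining
# re-synthesis step, following pysptk's all-pole synthesis API (this
# completion is an assumption, not part of the original):
from pysptk.synthesis import AllPoleDF, Synthesizer

synthesizer = Synthesizer(AllPoleDF(order=ORDER), HOP_LENGTH)
y = synthesizer.synthesis(source_excitation, lpc)
wavfile.write(OUT_WAVE_FILE, fs, y.astype(np.int16))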
def __dummy_windowed_frames(source, frame_len=512, hopsize=80):
    np.random.seed(98765)
    n_frames = int(len(source) / hopsize) + 1
    windowed = np.random.randn(n_frames, frame_len) * pysptk.blackman(frame_len)
    return 0.5 * 32768.0 * windowed
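# Usage sketch: only the length of `source` matters (it sets the frame count),
# so a zero array works as a stand-in.
frames = __dummy_windowed_frames(np.zeros(8000))
print(frames.shape)  # (101, 512)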
import numpy as np
import pysptk
from scipy.io import wavfile
import librosa
import librosa.util

frame_length = 2048
hop_length = 512
order = 20
path = '../ETTS_newdata/data/wav/lmy00001.wav'

# LPC
sr, x = wavfile.read(path)
x = x.astype(np.float64)
librosa.util.valid_audio(x)
x = np.pad(x, int(frame_length // 2), mode='reflect')
frames = librosa.util.frame(x, frame_length=frame_length,
                            hop_length=hop_length).astype(np.float64).T
frames *= pysptk.blackman(frame_length)
lpc = pysptk.lpc(frames, order)
lpc[:, 0] = np.log(lpc[:, 0])

# MFCC
y, sr = librosa.load(path)
y = y.astype(np.float64)
mfcc = librosa.feature.mfcc(y=y, sr=sr, n_fft=frame_length,
                            hop_length=hop_length)
print(mfcc.shape)  # the original printed an undefined name `d`
import numpy as np
import pysptk
import pyrenn
import librosa
from scipy.io import wavfile

order = 25
alpha = 0.42
gamma = -0.35
frameLength = 1024  # not defined in this fragment; taken from the companion snippets below
hop_length = 256

# Loading pyrenn model
net = pyrenn.loadNN('pyrennweights_2.csv')

# Input (`sourcefile` is defined elsewhere in the original script)
sr, sx = wavfile.read(sourcefile)
l = len(sx)

# Framing
sourceframes = librosa.util.frame(sx, frame_length=frameLength,
                                  hop_length=hop_length).astype(np.float64).T

# Windowing
sourceframes *= pysptk.blackman(frameLength)

# Extract MCEPs
sourcemcepvectors = np.apply_along_axis(pysptk.mcep, 1, sourceframes, order, alpha)

# Provide the source MCEPs as input to the trained neural network,
# which predicts the target MCEPs
mgc = pyrenn.NNOut(sourcemcepvectors.transpose(), net).transpose()
mgc = mgc.copy(order="C")

# Log spectrum; use the same alpha as the analysis, and gamma = 0.0
# because the network outputs plain mel-cepstra
logspec = np.apply_along_axis(pysptk.mgc2sp, 1, mgc, alpha, 0.0, frameLength)

# Convert to the FFT domain
spec = np.exp(logspec).T

# Convert to the time domain (keyword arguments for current librosa)
output_speechover = librosa.istft(spec, hop_length=hop_length,
                                  win_length=frameLength,
                                  window=pysptk.blackman(frameLength))
# Output
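# Output sketch (the file name is an assumption): peak-normalize and write
# the converted speech as 16-bit PCM.
out = output_speechover / np.max(np.abs(output_speechover))
wavfile.write('converted.wav', sr, (out * 32767).astype(np.int16))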
# Parameters
frameLength = 1024
overlap = 0.25
hop_length = int(frameLength * overlap)  # 256; librosa.util.frame expects an integer hop
order = 25
alpha = 0.42
gamma = -0.35

# Feature extraction
sr, sx = wavfile.read(sourcefile)
sourceframes = librosa.util.frame(
    sx, frame_length=frameLength,  # framing the source audio
    hop_length=hop_length).astype(np.float64).T
sourceframes *= pysptk.blackman(frameLength)  # windowing
sourcemcepvectors = np.apply_along_axis(
    pysptk.mcep, 1, sourceframes, order, alpha)  # extract MCEPs of the source frames

sr, tx = wavfile.read(targetfile)
targetframes = librosa.util.frame(
    tx, frame_length=frameLength,  # framing the target audio
    hop_length=hop_length).astype(np.float64).T
targetframes *= pysptk.blackman(frameLength)  # windowing
targetmcepvectors = np.apply_along_axis(
    pysptk.mcep, 1, targetframes, order, alpha)  # extract MCEPs of the target frames

# Normalizing for feeding into the RNN: both sequences must share a common length
norm = min(len(sourcemcepvectors), len(targetmcepvectors))
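# A minimal sketch of the step that typically follows (mirroring the fuller
# training-loop example below): truncate both MCEP sequences to the common
# length and transpose to the (features, time) layout pyrenn expects.
transsource = np.transpose(sourcemcepvectors[:norm])
transtarget = np.transpose(targetmcepvectors[:norm])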
import numpy as np
import pysptk
import pyrenn
import librosa
from scipy.io import wavfile

frameLength = 1024
overlap = 0.25
hop_length = 256
subFrameLength = frameLength * overlap
net = pyrenn.CreateNN([26, 30, 30, 26])
order = 25
alpha = 0.41
gamma = -0.35
count = 0

# `source` and `target` are lists of wav paths defined elsewhere
for sourcefile, targetfile in zip(source, target):
    print(sourcefile, targetfile)
    sr, sx = wavfile.read(sourcefile)
    sourceframes = librosa.util.frame(sx, frame_length=frameLength,  # framing the source audio
                                      hop_length=hop_length).astype(np.float64).T
    sourceframes *= pysptk.blackman(frameLength)  # windowing
    sourcemcepvectors = np.apply_along_axis(pysptk.mcep, 1, sourceframes,
                                            order, alpha)  # extract MCEPs of the source frames

    sr, tx = wavfile.read(targetfile)
    targetframes = librosa.util.frame(tx, frame_length=frameLength,  # framing the target audio
                                      hop_length=hop_length).astype(np.float64).T
    targetframes *= pysptk.blackman(frameLength)  # windowing
    targetmcepvectors = np.apply_along_axis(pysptk.mcep, 1, targetframes,
                                            order, alpha)  # extract MCEPs of the target frames

    reslen = min(len(sourcemcepvectors), len(targetmcepvectors))
    transsourcemcepvectorsmod = np.transpose(sourcemcepvectors[0:reslen])
    transtargetmcepvectorsmod = np.transpose(targetmcepvectors[0:reslen])
    print(len(sourcemcepvectors), len(targetmcepvectors))

    # Check whether there are any NaNs in the MCEP vectors
    # (np.isnan(sourcemcepvectors).any() would do the same in one call)
    for i in range(len(sourcemcepvectors)):
        for j in range(len(sourcemcepvectors[i])):
            if np.isnan(sourcemcepvectors[i][j]):
                print("NaN at frame {}, coefficient {}".format(i, j))
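# Training sketch (parameter values are assumptions): with the (26, time)
# matrices prepared above, pyrenn's Levenberg-Marquardt trainer fits the net.
net = pyrenn.train_LM(transsourcemcepvectorsmod, transtargetmcepvectorsmod,
                      net, verbose=True, k_max=100, E_stop=1e-5)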
def load_mfcc_mceps(path_to_data, config_mfcc_mceps):
    '''Extract normalized MFCCs and MCEPs from all wav files under a path.

    input: path to the data directory, mfcc/mcep settings dictionary
    return:
        dictionary: key: speaker code + _ + audio name
                    value: tuple (mfcc normalized, mceps normalized)
        target_scaler: mcep mean and std of the target speaker,
                       used to scale mcep results back
    '''
    _data_x = {}
    path_audios = os.listdir(path_to_data)
    total_mceps = np.empty((0, config_mfcc_mceps['order_mcep'] + 1), float)
    # used to store mean and std for denormalizing results
    target_scaler = {}
    for p in path_audios:
        if p.split(".")[-1] != "wav":
            continue
        x, _ = librosa.load(path_to_data + '/' + p,
                            sr=config_mfcc_mceps["sampling_frequency"])

        # Pad so the MFCC and MCEP sequences get the same number of frames
        mfcc_l = math.ceil(x.shape[0] / config_mfcc_mceps["hop_length"])
        mcep_l = math.ceil((x.shape[0] - config_mfcc_mceps["n_fft"])
                           / config_mfcc_mceps["hop_length"])
        final_shape = x.shape[0] + config_mfcc_mceps["hop_length"] * (mfcc_l - mcep_l)
        x.resize((final_shape, ))

        frames = librosa.util.frame(
            x, frame_length=config_mfcc_mceps["n_fft"],
            hop_length=config_mfcc_mceps["hop_length"]).astype(np.float64).T

        # Windowing
        frames *= pysptk.blackman(config_mfcc_mceps["n_fft"], normalize=1)
        mceps = pysptk.mcep(frames, config_mfcc_mceps['order_mcep'])  # default alpha
        total_mceps = np.vstack((total_mceps, mceps))

        mfccs = librosa.feature.mfcc(
            y=x, sr=config_mfcc_mceps["sampling_frequency"],
            n_mfcc=config_mfcc_mceps["order_mfcc"],
            n_fft=config_mfcc_mceps["n_fft"],
            hop_length=config_mfcc_mceps["hop_length"])
        mfccs = normalize_mfcc(mfccs.T).T  # transpose twice to normalize on the right axis

        id_ = "_" + p
        _data_x[id_] = (mfccs.T, mceps)  # both now have shape (#frames, #mfcc/#mcep)

    target_scaler["mean"] = list(np.mean(total_mceps, 0))
    target_scaler["std"] = list(np.std(total_mceps, 0))

    # Apply normalization
    for k, v in _data_x.items():
        mcep = v[1]
        mcep = (mcep - target_scaler["mean"]) / target_scaler["std"]
        _data_x[k] = (v[0], mcep)
    return _data_x, target_scaler
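# Usage sketch: the keys below are the ones the function reads; the values
# are assumptions for illustration, as is the data path.
config = {
    "sampling_frequency": 16000,
    "n_fft": 1024,
    "hop_length": 256,
    "order_mcep": 24,
    "order_mfcc": 24,
}
data, target_scaler = load_mfcc_mceps("data/target_speaker", config)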
def extract_features(audiodata, speaker, sr=16000, feat='mcep'):
    """Feature extraction. For each type of feature, see the corresponding case below.
    Currently supported: MCEP, MFCC. Will be updated when needed.

    :param audiodata: 162 files of time-series data
    :param sr: sampling rate
    :param feat: type of currently supported feature
    :return: list of feature matrices and the feature name
    """
    print("=======================")
    logging.info("Extracting {} from {}'s data ...".format(feat.upper(), speaker))
    if feat.lower() == 'mfcc':
        # Extract MFCCs from audio time-series data (from librosa.load)
        mfccs = []
        for audio in tqdm(audiodata):
            mfccs.append(
                lbr.util.normalize(lbr.feature.mfcc(y=audio, sr=sr,
                                                    n_fft=frame_length,
                                                    hop_length=hop_length),
                                   norm=1, axis=0))
        # return np.stack(mfccs)
        return mfccs, feat
    elif feat.lower() in ('mcep', 'mcc'):
        # MCEP is extracted via pysptk. See the link below for more details:
        # https://github.com/eYSIP-2017/eYSIP-2017_Speech_Spoofing_and_Verification/wiki/Feature-Extraction-for-Speech-Spoofing
        #
        # Example of using pysptk to extract MCEPs (copied from the above link):
        #     frameLength = 1024
        #     overlap = 0.25
        #     hop_length = frameLength * overlap
        #     order = 25
        #     alpha = 0.42
        #     gamma = -0.35
        #     sourceframes = librosa.util.frame(speech, frame_length=frameLength,
        #                                       hop_length=hop_length).astype(np.float64).T
        #     sourceframes *= pysptk.blackman(frameLength)
        #     sourcemcepvectors = np.apply_along_axis(pysptk.mcep, 1, sourceframes,
        #                                             order, alpha)

        # Load cached features if they exist
        temp_filename = os.path.join(feature_path, "{}_{}.pkl".format(speaker, feat))
        if os.path.isfile(temp_filename):
            logging.info("Found {}. Load data from {}_{}".format(
                temp_filename, speaker, feat))
            with open(temp_filename, "rb") as f:
                return pickle.load(f), feat
        else:
            mceps = []
            logging.info("Calculating ...")
            for audio in tqdm(audiodata):
                frame = lbr.util.frame(audio, frame_length=frame_length,
                                       hop_length=hop_length).astype(np.float64).T
                frame *= pysptk.blackman(frame_length)
                mceps.append(
                    np.apply_along_axis(pysptk.mcep, 1, frame,
                                        order=order, alpha=alpha).T)
            # Save to .pkl for a later load
            with open(temp_filename, "wb") as f:
                pickle.dump(mceps, f, protocol=3)
            return mceps, feat
    else:
        logging.critical('{} feature is not supported yet, exiting ...'.format(feat))
        exit()
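# Usage sketch (assumptions: `wav_paths` is a hypothetical list of wav files,
# and frame_length/hop_length/order/alpha/feature_path are module-level
# settings in the original project):
audiodata = [lbr.load(p, sr=16000)[0] for p in wav_paths]
mceps, feat = extract_features(audiodata, "speakerA", sr=16000, feat='mcep')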
import numpy as np
import pysptk as SPTK
from scipy.io import wavfile
from cle.cle.utils import segment_axis

fs, x = wavfile.read('test.wav')
assert fs == 16000
x = 1. * x  # convert to float64

frame_length = 1024
hopsize = 80
noverlap = frame_length - hopsize

# segment_axis returns (n_frames, frame_length); transpose so the window
# broadcasts across frames, then transpose back
frames = segment_axis(x, frame_length, noverlap).astype('float64').T
frames = frames * SPTK.blackman(frame_length).reshape((1024, 1))  # `xw` in the original was a typo
#frames = frames.copy(order='C')
frames = frames.T

order = 20
alpha = 0.41
stage = 4
gamma = -1.0 / stage

# Mel-generalized cepstrum analysis per frame
mgc = np.apply_along_axis(SPTK.mgcep, 1, frames, order, alpha, gamma)

# The real part of mgc2sp is the log magnitude spectrum (fftlen/2 + 1 bins);
# mirror it to recover the full FFT-length spectrum
mgc_sp = np.apply_along_axis(SPTK.mgc2sp, 1, mgc, alpha, gamma, frame_length).real
mgc_sp_test = np.hstack([mgc_sp, mgc_sp[:, ::-1][:, 1:-1]])
mgc_sp_test = mgc_sp_test.copy(order='C')
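# Follow-up sketch: exponentiating the mirrored log spectrum gives the
# linear magnitude spectrum for each frame.
sp = np.exp(mgc_sp_test)  # shape: (n_frames, frame_length)
print(sp.shape)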
def framing_windowing(np_data):
    frames = librosa.util.frame(np_data, frame_length=FRAME_LENGTH,
                                hop_length=HOP_LENGTH).astype(np.float64).T
    frames *= ps.blackman(FRAME_LENGTH)
    return frames
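# Usage sketch (assumes the FRAME_LENGTH/HOP_LENGTH module constants and the
# `ps` pysptk import alias from the original):
windowed = framing_windowing(np.random.randn(16000))
print(windowed.shape)  # (n_frames, FRAME_LENGTH)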