def combine_with_mfcc(xdir, neighbor=2, nfft=256, normal_flag=0):
    """
    build a DNN input matrix: the context-window log-spectrum frames of the wav
    file (exactly what make_window_buffer returns) with the MFCC features
    appended column-wise

    the MFCC matrix is zero-padded at the end when it has fewer rows (frames)
    than the windowed spectrum, so both parts line up row by row

    NOTE: the wav file is read twice (once inside make_window_buffer, once here
    for the MFCC computation)

    :param xdir: path of the wav file
    :param neighbor: number of neighbouring frames taken on each side
    :param nfft: FFT length forwarded to the STFT (the MFCC call always uses 256)
    :param normal_flag: 0 for the plain log spectrum, anything else for the
                        normalize_mean'ed log spectrum
    :return: 2-D array, one row per STFT frame: [window buffer | mfcc features]
    :raises ValueError: if the MFCC matrix has MORE rows than the spectrum
                        (the previous implementation looped forever here)
    """
    window_feats = make_window_buffer(xdir, neighbor=neighbor, nfft=nfft, normal_flag=normal_flag)

    _, x = wavfile.read(xdir)
    # per the original comment, combine_mfcc_d_dd yields mfcc & delta & delta-delta
    mfcc_data = combine_mfcc_d_dd(
        mfcc(x, freq, winlen=0.016, winstep=0.008, nfft=256, winfunc=np.bartlett))

    missing = window_feats.shape[0] - mfcc_data.shape[0]
    if missing < 0:
        # padding mfcc_data can only make the mismatch worse, so fail loudly
        raise ValueError(
            "mfcc features have %d rows but the spectrum has only %d; cannot align them"
            % (mfcc_data.shape[0], window_feats.shape[0]))
    if missing > 0:
        # pad all missing rows at once instead of retrying one row at a time
        padding = np.zeros([missing, mfcc_data.shape[1]])
        mfcc_data = np.concatenate((mfcc_data, padding), axis=0)

    return np.concatenate((window_feats, mfcc_data), axis=1)
def get_Zyy(ydir, nfft=256, normal_flag=0):
    """
    read the wav file at ydir and return its transposed log-magnitude STFT,
    i.e. log(abss(Z).T + 1e-7), so that every row is one STFT frame

    usually used in train and test, not in practical application

    :param ydir: path of the wav file
    :param nfft: FFT length forwarded to stft
    :param normal_flag: 0 returns the log spectrum as is, any other value
                        returns it passed through normalize_mean
    :return: the (optionally normalized) log spectrum
    """
    _, samples = wavfile.read(ydir)
    _, _, spectrum = stft(samples, freq, nfft=nfft)

    # the 1e-7 offset keeps log() finite for bins whose magnitude is zero
    log_spectrum = log(abss(spectrum).T + 1e-7)

    if normal_flag == 0:
        return log_spectrum
    return normalize_mean(log_spectrum)
def combine_with_mfcc_Zyy(ydir, nfft=256, normal_flag=0):
    """
    build a target matrix: the transposed log spectrum of the wav file (same
    computation as get_Zyy) with the MFCC features appended column-wise

    the MFCC matrix is zero-padded at the end when it has fewer rows (frames)
    than the spectrum, so both parts line up row by row

    :param ydir: path of the wav file
    :param nfft: FFT length forwarded to the STFT (the MFCC call always uses 256)
    :param normal_flag: 0 for the plain log spectrum, anything else for the
                        normalize_mean'ed log spectrum
    :return: 2-D array, one row per STFT frame: [log spectrum | mfcc features]
    :raises ValueError: if the MFCC matrix has MORE rows than the spectrum
                        (the previous implementation looped forever here)
    """
    _, y = wavfile.read(ydir)
    _, _, Zyy = stft(y, freq, nfft=nfft)

    # the 1e-7 offset keeps log() finite for bins whose magnitude is zero
    y_data = log(abss(Zyy).T + 1e-7)
    if normal_flag != 0:
        y_data = normalize_mean(y_data)

    # per the comment in combine_with_mfcc, this yields mfcc & delta & delta-delta
    mfcc_data = combine_mfcc_d_dd(
        mfcc(y, freq, winlen=0.016, winstep=0.008, nfft=256, winfunc=np.bartlett))

    missing = y_data.shape[0] - mfcc_data.shape[0]
    if missing < 0:
        # padding mfcc_data can only make the mismatch worse, so fail loudly
        raise ValueError(
            "mfcc features have %d rows but the spectrum has only %d; cannot align them"
            % (mfcc_data.shape[0], y_data.shape[0]))
    if missing > 0:
        # pad all missing rows at once instead of retrying one row at a time
        padding = np.zeros([missing, mfcc_data.shape[1]])
        mfcc_data = np.concatenate((mfcc_data, padding), axis=0)

    return np.concatenate((y_data, mfcc_data), axis=1)
def make_window_buffer(xdir, neighbor=2, nfft=256, normal_flag=0):
    """
    get frame group for DNN input, and this is the key

    expand every row (frame) of the log spectrum to the concatenation of
    itself and its neighbors

    Example:
        given array like this:
            [[1, 1, 1],
             [2, 2, 2],
             [3, 3, 3],
             [4, 4, 4],
             [5, 5, 5],
             [6, 6, 6],
             [7, 7, 7]]
        and we combine 1 neighbor, then we have:
            [[1, 1, 1, 1, 1, 1, 2, 2, 2],
             [1, 1, 1, 2, 2, 2, 3, 3, 3],
             [2, 2, 2, 3, 3, 3, 4, 4, 4],
             [3, 3, 3, 4, 4, 4, 5, 5, 5],
             [4, 4, 4, 5, 5, 5, 6, 6, 6],
             [5, 5, 5, 6, 6, 6, 7, 7, 7],
             [6, 6, 6, 7, 7, 7, 7, 7, 7]]
            (neighbor) ↑, ↑, ↑ (neighbor)
        the middle column group is the original array; at the start and the
        end the first / last frame is repeated to fill the missing neighbors

    the neighbor indices are clamped to the valid frame range, so a signal
    with fewer frames than `neighbor` is handled too (the former row-by-row
    padding loops raised IndexError in that case)

    :param xdir: path of the wav file
    :param neighbor: number of neighbouring frames taken on each side
    :param nfft: FFT length forwarded to stft
    :param normal_flag: 0 for the plain log spectrum, anything else for the
                        normalize_mean'ed log spectrum
    :return: float32 array of shape (frames, (2 * neighbor + 1) * bins)
    """
    _, x = wavfile.read(xdir)
    _, _, Zxx = stft(x, freq, nfft=nfft)

    # the 1e-7 offset keeps log() finite for bins whose magnitude is zero
    frames = log(abss(Zxx).T + 1e-7)
    if normal_flag != 0:
        frames = normalize_mean(frames)
    m, n = frames.shape

    # src_rows[r, i] is the frame that fills column group i of output row r:
    # frame r + (i - neighbor), clamped to [0, m - 1] (edge frames repeat)
    offsets = np.arange(-neighbor, neighbor + 1)
    src_rows = np.clip(np.arange(m)[:, None] + offsets[None, :], 0, m - 1)

    frames32 = np.asarray(frames, dtype='float32')
    # gather gives (m, 2 * neighbor + 1, n); flatten the neighbor groups per row
    return frames32[src_rows].reshape(m, (2 * neighbor + 1) * n)
def test_GRU(model, input_path, output_path):
    """
    run the GRU model on the wav file at input_path and write the
    reconstructed signal to output_path as a 16-bit wav

    the model output (after unpack_GRU) is multiplied with the log-magnitude
    spectrum of the input, exponentiated, combined with the phase of the input
    spectrum and transformed back with istft

    :param model: object with a predict() method; it is fed the pack_GRU
                  features reshaped to [1, -1, 22]
    :param input_path: path of the wav file to process
    :param output_path: path of the wav file to write
    :return: None
    """
    _, s = wavfile.read(input_path)
    _, _, Zxx = stft(s, freq)
    # the 1e-7 offset keeps log() finite for bins whose magnitude is zero
    Zxx1 = log(abss(Zxx).T + 1e-7)
    print(Zxx1.shape)

    yt = pack_GRU(input_path)
    y = model.predict(np.reshape(yt, [1, -1, 22]))
    y = unpack_GRU(y)
    print(y.shape)

    # magnitude from the model, phase taken unchanged from the input
    ypreComplex = exp(y.T * Zxx1.T) * exp(complex(0, 1) * ang(Zxx))
    _, xrec = istft(ypreComplex, freq)

    # saturate before the cast: astype(int16) alone wraps out-of-range samples
    int16_range = np.iinfo(np.int16)
    dataWrite = np.clip(xrec, int16_range.min, int16_range.max).astype(np.int16)
    wavfile.write(output_path, freq, dataWrite)
def test_model_mfcc(model, input_path, output_path, neighbor, nffts, normal_flag=0):
    """
    run a model trained on combine_with_mfcc features on the wav file at
    input_path and write the reconstructed signal to output_path as a 16-bit wav

    the last 39 columns of the prediction (the mfcc part) are dropped, the
    rest is treated as a log spectrum, exponentiated, combined with the phase
    of the input spectrum and transformed back with istft

    NOTE(review): the phase spectrum is computed with stft's default nfft while
    the model input uses nffts - confirm the shapes agree when nffts != 256

    :param model: object with a predict() method
    :param input_path: path of the wav file to process
    :param output_path: path of the wav file to write
    :param neighbor: neighbor count forwarded to combine_with_mfcc
    :param nffts: FFT length forwarded to combine_with_mfcc
    :param normal_flag: 0 if the model predicts the plain log spectrum, anything
                        else if the prediction must go through unnormalize first
    :return: None
    """
    _, s = wavfile.read(input_path)
    _, _, Zxx = stft(s, freq)

    y_input = combine_with_mfcc(input_path, neighbor=neighbor, nfft=nffts, normal_flag=normal_flag)
    y = model.predict(y_input)
    y = np.delete(y, np.s_[-39:], axis=1).T  # delete mfcc data

    if normal_flag != 0:
        # the reference log spectrum is only needed to undo the normalization
        Zxx1 = log(abss(Zxx).T + 1e-7)
        y = unnormalize(y, Zxx1)

    # magnitude from the model, phase taken unchanged from the input
    ypreComplex = exp(y) * exp(complex(0, 1) * ang(Zxx))
    _, xrec = istft(ypreComplex, freq)

    # saturate before the cast: astype(int16) alone wraps out-of-range samples
    int16_range = np.iinfo(np.int16)
    dataWrite = np.clip(xrec, int16_range.min, int16_range.max).astype(np.int16)
    wavfile.write(output_path, freq, dataWrite)
def test_model(model, input_path, output_path, neighbor, nffts, normal_flag=0):
    """
    run a model trained on make_window_buffer features on the wav file at
    input_path and write the reconstructed signal to output_path as a 16-bit wav

    the prediction is treated as a log spectrum, exponentiated, combined with
    the phase of the input spectrum and transformed back with istft

    NOTE(review): the phase spectrum is computed with stft's default nfft while
    the model input uses nffts - confirm the shapes agree when nffts != 256

    :param model: object with a predict() method
    :param input_path: path of the wav file to process
    :param output_path: path of the wav file to write
    :param neighbor: neighbor count forwarded to make_window_buffer
    :param nffts: FFT length forwarded to make_window_buffer
    :param normal_flag: 0 if the model predicts the plain log spectrum, anything
                        else if the prediction must go through unnormalize first
    :return: None
    """
    _, s = wavfile.read(input_path)
    _, _, Zxx = stft(s, freq)

    y_input = make_window_buffer(input_path, neighbor=neighbor, nfft=nffts, normal_flag=normal_flag)
    y = model.predict(y_input).T

    if normal_flag != 0:
        # the reference log spectrum is only needed to undo the normalization
        Zxx1 = log(abss(Zxx).T + 1e-7)
        y = unnormalize(y, Zxx1)

    # magnitude from the model, phase taken unchanged from the input
    ypreComplex = exp(y) * exp(complex(0, 1) * ang(Zxx))
    _, xrec = istft(ypreComplex, freq)

    # saturate before the cast: astype(int16) alone wraps out-of-range samples
    int16_range = np.iinfo(np.int16)
    dataWrite = np.clip(xrec, int16_range.min, int16_range.max).astype(np.int16)
    wavfile.write(output_path, freq, dataWrite)
def pack_GRU(xdir):
    """
    compute the GRU input features of the wav file at xdir

    the log-magnitude STFT (NOT transposed here, unlike get_Zyy) is passed
    through fr2bark, bark_rescale and bark_dct, in that order
    (presumably a bark-band projection, a rescaling and a DCT - confirm
    against those helpers)

    :param xdir: path of the wav file
    :return: whatever bark_dct returns for the processed spectrum
    """
    _, samples = wavfile.read(xdir)
    _, _, spectrum = stft(samples, freq)

    # the 1e-7 offset keeps log() finite for bins whose magnitude is zero
    log_spectrum = log(abss(spectrum) + 1e-7)

    bark_bands = fr2bark(log_spectrum)
    rescaled = bark_rescale(bark_bands)
    return bark_dct(rescaled)