def get_dataset(clean, noisy, ratio=0.2, maxlen=1200, n_fft=512):
    fft_size = n_fft // 2 + 1
    clean, noisy = list_dataset(clean, noisy)
    assert clean, "No data with common filenames"
    assert noisy, "No data with common filenames"
    X = np.zeros([len(clean), maxlen + 16, fft_size], np.float32)
    Y = np.zeros([len(clean), maxlen, fft_size], np.float32)
    sel = np.random.random(len(clean)) > ratio
    for ix, (cl, ns) in enumerate(zip(clean, noisy)):
        print("Loading file", ix)
        cl, ns = open_sound(cl), open_sound(ns)
        assert cl[0] == ns[0]
        cl, ns = cl[1], ns[1]
        if len(ns.shape) > 1:
            ns = ns[:, 0]
        spec = -np.log(np.abs(stft(cl, n_fft=n_fft))**2 + 2e-12).T[:maxlen]
        spec = np.pad(spec, ((16, maxlen - spec.shape[0]), (0, 0)),
                      'constant', constant_values=-np.log(2e-12))
        X[ix, :, :] = spec
        spec = -np.log(np.abs(stft(ns, n_fft=n_fft))**2 + 2e-12).T[:maxlen]
        spec = np.pad(spec, ((0, maxlen - spec.shape[0]), (0, 0)),
                      'constant', constant_values=-np.log(2e-12))
        Y[ix, :, :] = spec
    return [X[sel], Y[sel]], [X[~sel], Y[~sel]]
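# A minimal usage sketch for get_dataset (not from the original source).
# The directory arguments are hypothetical; list_dataset and open_sound are
# assumed to resolve matching clean/noisy file pairs as the function expects.
# With ratio=0.2 the first split holds roughly 80% of the files.
train, valid = get_dataset("data/clean", "data/noisy", ratio=0.2)
train_X, train_Y = train  # X: (n, maxlen + 16, fft_size), Y: (n, maxlen, fft_size)
valid_X, valid_Y = valid
print(train_X.shape, valid_X.shape)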
def _load(self, key):
    flist = glob.glob(self.index_dict[key])
    if not len(flist):
        raise RuntimeError(
            "Could not find files matching template '{}'".format(
                self.index_dict[key]))
    if len(flist) == 1:
        return stft(flist[0], **self.stft_kwargs)
    else:
        return np.array(
            [stft(f, **self.stft_kwargs) for f in sorted(flist)])
def __getitem__(self, index, mus_win=512, mus_hop=256, eeg_win=32, eeg_hop=2):
    """Generates one sample of data."""
    # Select a random sample
    index_rand = round(np.random.uniform(0, len(self.eeg) - 1))
    X = chunker(self.eeg[index_rand], self.music[index_rand],
                self.sample_len, self.eeg_sr, self.sr)
    X_m = stft(X[1], self.sr, mus_win, mus_hop)
    X_e = stft_eeg(X[0], self.sr, eeg_win, eeg_hop)
    if self.use_noise:
        X_m = add_rand_noise(abs(X_m))
        X_e = add_rand_noise(abs(X_e))
    X_m = to_log(abs(X_m) + 1e-6)
    X_e = z_norm(X_e)
    X_e = to_log(abs(X_e) + 1e-6)
    X_e = z_norm(X_e)
    X_e = torch.tensor(X_e).float()
    X_m = torch.tensor(X_m).float()
    X_m = (X_m - X_m.mean(dim=0, keepdim=True)) / (
        X_m.std(dim=0, keepdim=True) + 1e-6)
    for i in np.arange(X_e.size(0)):
        X_e[i] = (X_e[i] - X_e[i].mean(dim=0, keepdim=True)) / (
            X_e[i].std(dim=0, keepdim=True) + 1e-6)
    return X_e, X_m
def calc_mfcc(wav, hop, win_length, filterbank):
    """
    Calculate Mel Frequency Cepstrum Coefficients (MFCC).

    Parameters:
        wav : ndarray, real-valued
            Time series of measurement values.
        hop : float
            Hop (overlap) size as a fraction of the window length.
        win_length : int
            Window size.
        filterbank : ndarray
            Mel filter bank.

    Returns:
        mel_spec : ndarray (n_channels, n_frames)
            Mel-scale spectrogram.
        mfcc : ndarray (n_channels, n_frames)
            Mel Frequency Cepstrum Coefficients (MFCC).
    """
    pre_wav = utils.pre_emphasis(wav, p=0.97)
    spec = utils.stft(pre_wav, hop=hop, win_length=win_length)
    mel_spec = np.dot(filterbank, np.abs(spec[:-1]))
    # apply a DCT-II to each frame of the mel spectrogram
    mfcc = np.zeros_like(mel_spec)
    for i in range(mel_spec.shape[1]):
        mfcc[:, i] = dct(mel_spec[:, i], type=2, norm="ortho", axis=-1)
    return mel_spec, mfcc
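# A minimal usage sketch for calc_mfcc (file name is a placeholder; assumes
# melFilterBank and utils.stft behave as in the plotting script at the end
# of this collection).
import librosa
import numpy as np

wav, sr = librosa.load("data/aiueo.wav", mono=True)
filterbank, _ = melFilterBank(sr, 1024, 20)  # 20 mel channels, 1024-sample window
mel_spec, mfcc = calc_mfcc(wav, hop=0.5, win_length=1024, filterbank=filterbank)
print(mel_spec.shape, mfcc.shape)  # both (n_channels, n_frames)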
def get_feats(audio):
    """
    Get acoustic input features, starting with the STFT.
    Needs to be extended to include MFCCs; will ask how many coefficients to use.
    """
    stft = utils.stft(audio, window=config.window, hopsize=config.hopsize,
                      nfft=config.nfft, fs=config.fs)
    assert abs(stft).max() <= 1.0
    return stft
def run(args):
    num_bins, config_dict = parse_yaml(args.config)
    dataloader_conf = config_dict["dataloader"]
    spectrogram_conf = config_dict["spectrogram_reader"]
    # Load cmvn
    dict_mvn = dataloader_conf["mvn_dict"]
    if dict_mvn:
        if not os.path.exists(dict_mvn):
            raise FileNotFoundError("Could not find mvn file")
        with open(dict_mvn, "rb") as f:
            dict_mvn = pickle.load(f)
    # default: True
    apply_log = dataloader_conf.get("apply_log", True)
    dcnet = PITNet(num_bins, **config_dict["model"])
    frame_length = spectrogram_conf["frame_length"]
    frame_shift = spectrogram_conf["frame_shift"]
    window = spectrogram_conf["window"]
    separator = Separator(dcnet, args.state_dict, cuda=args.cuda)
    utt_dict = parse_scps(args.wave_scp)
    num_utts = 0
    for key, utt in utt_dict.items():
        try:
            samps, stft_mat = stft(utt,
                                   frame_length=frame_length,
                                   frame_shift=frame_shift,
                                   window=window,
                                   center=True,
                                   return_samps=True)
        except FileNotFoundError:
            print("Skip utterance {}... not found".format(key))
            continue
        print("Processing utterance {}".format(key))
        num_utts += 1
        norm = np.linalg.norm(samps, np.inf)
        spk_mask, spk_spectrogram = separator.seperate(stft_mat,
                                                       cmvn=dict_mvn,
                                                       apply_log=apply_log)
        for index, stft_mat in enumerate(spk_spectrogram):
            istft(os.path.join(args.dump_dir,
                               '{}.spk{}.wav'.format(key, index + 1)),
                  stft_mat,
                  frame_length=frame_length,
                  frame_shift=frame_shift,
                  window=window,
                  center=True,
                  norm=norm,
                  fs=8000,
                  nsamps=samps.size)
            if args.dump_mask:
                sio.savemat(
                    os.path.join(args.dump_dir,
                                 '{}.spk{}.mat'.format(key, index + 1)),
                    {"mask": spk_mask[index]})
    print("Processed {} utterances!".format(num_utts))
def spectral_flux(wavedata, window_size, sample_rate):
    magnitude_spectrum = stft(wavedata, window_size)
    timebins, freqbins = np.shape(magnitude_spectrum)
    timestamps = np.arange(0, timebins - 1) * (timebins / float(sample_rate))
    # frame-to-frame difference along the time axis
    sf = np.sqrt(np.sum(np.diff(np.abs(magnitude_spectrum), axis=0)**2,
                        axis=1)) / freqbins
    return sf[1:], np.asarray(timestamps)
def main():
    wav_files = [
        x for x in os.listdir(config.wav_dir_mus)
        if x.endswith('.stem.mp4') and not x.startswith(".")
    ]
    count = 0
    for lf in wav_files:
        audio, fs = stempeg.read_stems(os.path.join(config.wav_dir_mus, lf),
                                       stem_id=[0, 1, 2, 3, 4])
        mixture = audio[0]
        drums = audio[1]
        bass = audio[2]
        acc = audio[3]
        vocals = audio[4]
        backing = np.clip(drums + bass + acc, 0.0, 1.0)
        if len(backing.shape) == 2:
            backing = (backing[:, 0] + backing[:, 1]) / 2
        back_stft = abs(utils.stft(backing))
        hdf5_file = h5py.File(config.backing_dir + 'mus_' + lf[:-9] + '.hdf5',
                              mode='w')
        hdf5_file.create_dataset("back_stft", back_stft.shape, np.float32)
        hdf5_file["back_stft"][:, :] = back_stft
        hdf5_file.close()
        count += 1
        utils.progress(count, len(wav_files))
def main():
    wav_files = [x for x in os.listdir(config.wav_dir) if x.endswith('.wav')]
    count = 0
    for lf in wav_files:
        audio, fs = sf.read(os.path.join(config.wav_dir, lf))
        vocals = np.array(audio[:, 1])
        mixture = np.clip(audio[:, 0] + audio[:, 1], 0.0, 1.0)
        backing = np.array(audio[:, 0])
        voc_stft = abs(utils.stft(vocals))
        mix_stft = abs(utils.stft(mixture))
        back_stft = abs(utils.stft(backing))
        assert voc_stft.shape == mix_stft.shape
        out_feats = utils.input_to_feats(os.path.join(config.wav_dir, lf))
        out_feats = np.concatenate((out_feats, np.zeros((1, out_feats.shape[1]))))
        assert out_feats.shape[0] == voc_stft.shape[0]
        np.save(config.dir_npy + lf[:-4] + '_voc_stft', voc_stft)
        np.save(config.dir_npy + lf[:-4] + '_mix_stft', mix_stft)
        np.save(config.dir_npy + lf[:-4] + '_back_stft', back_stft)
        np.save(config.dir_npy + lf[:-4] + '_synth_feats', out_feats)
        count += 1
        utils.progress(count, len(wav_files))
def test_stft():
    nperseg = 4
    noverlap = nperseg // 2
    x = normalize(np.arange(nperseg * 4, dtype=np.float32))
    a = librosa.core.stft(x,
                          window='hamming',
                          n_fft=nperseg,
                          hop_length=nperseg - noverlap,
                          center=False)
    b = stft(x, nperseg=nperseg, noverlap=noverlap)
    assert np.allclose(a, b)
def read_wav_file(self, file_name):
    audio, fs = librosa.core.load(file_name, sr=config.fs)
    audio = np.float64(audio)
    if len(audio.shape) == 2:
        vocals = np.array((audio[:, 1] + audio[:, 0]) / 2)
    else:
        vocals = np.array(audio)
    voc_stft = np.clip(
        abs(utils.stft(vocals, hopsize=config.hopsize, nfft=config.nfft,
                       fs=config.fs, window=config.window)), 0.0, 1.0)
    return voc_stft
def spectual_centroid(wavedata, window_size, sample_rate):
    magnitude_spectrum = stft(wavedata, window_size)
    timebins, freqbins = np.shape(magnitude_spectrum)
    timestamps = np.arange(0, timebins - 1) * (timebins / float(sample_rate))
    sc = []
    for t in range(timebins - 1):
        power_spectrum = np.abs(magnitude_spectrum[t])**2
        # power-weighted mean of the frequency bin indices
        sc_t = np.sum(power_spectrum * np.arange(1, freqbins + 1)) / np.sum(power_spectrum)
        sc.append(sc_t)
    sc = np.asarray(sc)
    sc = np.nan_to_num(sc)
    return sc, np.asarray(timestamps)
def spectral_rolloff(wavedata, window_size, sample_rate, k=0.85):
    magnitude_spectrum = stft(wavedata, window_size)
    power_spectrum = np.abs(magnitude_spectrum)**2
    timebins, freqbins = np.shape(magnitude_spectrum)
    timestamps = np.arange(0, timebins - 1) * (timebins / float(sample_rate))
    sr = []
    spectral_sum = np.sum(power_spectrum, axis=1)
    for t in range(timebins - 1):
        # first frequency bin where the cumulative power reaches k of the total
        sr_t = np.where(
            np.cumsum(power_spectrum[t, :]) >= k * spectral_sum[t])[0][0]
        sr.append(sr_t)
    sr = np.asarray(sr).astype(float)
    sr = (sr / freqbins) * (sample_rate / 2.0)
    return sr, np.asarray(timestamps)
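# A quick sketch (not from the original source) exercising the three
# frame-level features above on a synthetic two-tone signal; it assumes
# stft(wavedata, window_size) returns a (timebins, freqbins) magnitude
# spectrogram, as the functions above expect.
import numpy as np

sample_rate = 22050
t = np.linspace(0, 1.0, sample_rate, endpoint=False)
wave = np.sin(2 * np.pi * 440 * t) + 0.5 * np.sin(2 * np.pi * 880 * t)
flux, ts = spectral_flux(wave, 1024, sample_rate)
centroid, _ = spectual_centroid(wave, 1024, sample_rate)
rolloff, _ = spectral_rolloff(wave, 1024, sample_rate, k=0.85)
print(flux.mean(), centroid.mean(), rolloff.mean())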
def preprocess(audio_path, sample_rate=16000, window_size=0.02,
               window_stride=0.01, window='hamming'):
    audio = load_audio(audio_path, sample_rate)
    nfft = int(sample_rate * window_size)
    win_length = nfft
    hop_length = int(sample_rate * window_stride)
    d = stft(audio,
             n_fft=nfft,
             hop_length=hop_length,
             win_length=win_length,
             window=window)
    spect, phase = magphase(d)
    pcen_result = pcen2(e=spect, sr=sample_rate, hop_length=hop_length)
    # standardize the PCEN output to zero mean, unit variance
    mean_pcen = pcen_result.mean()
    std_pcen = pcen_result.std()
    pcen_result = (pcen_result - mean_pcen) / std_pcen
    return pcen_result
def read_wav_file(self, file_name):
    audio, fs = librosa.core.load(file_name, sr=config.fs)
    audio = np.float64(audio)
    if len(audio.shape) == 2:
        vocals = np.array((audio[:, 1] + audio[:, 0]) / 2)
    else:
        vocals = np.array(audio)
    voc_stft = abs(utils.stft(vocals))
    feats = utils.stft_to_feats(vocals, fs)
    voc_stft = np.clip(voc_stft, 0.0, 1.0)
    return voc_stft, feats
def load(self, path):
    fnames = list_sounds(path)
    fnames = random.sample(fnames, self.n_records)
    max_len = max([len(open_sound(x)[1]) for x in fnames])
    max_len = 1 + (max_len - 512) // 128
    self.X = np.ones([self.n_records, max_len, 257], np.float32)
    self.Y = np.ones([self.n_records, max_len, 257], np.float32)
    self.X *= np.log(2e-12)
    self.Y *= np.log(2e-12)
    for ix, fname in enumerate(fnames):
        sr, rec = open_sound(fname)
        assert sr == 16000
        rec = np.log(2e-12 + np.abs(
            stft(rec.astype(np.float32) / (2**15),
                 n_fft=512, hop_length=128).T[:max_len])**2)
        self.X[ix, :len(rec)] = self.mask(rec)
        self.Y[ix, :len(rec)] = rec
    return ([self.X[:self.train], self.Y[:self.train]],
            [self.X[self.train:self.train + self.valid],
             self.Y[self.train:self.train + self.valid]],
            [self.X[-self.test:], self.Y[-self.test:]])
def main():
    wav_files = [x for x in os.listdir(config.wav_dir)
                 if x.endswith('.wav') and not x.startswith('.')]
    count = 0
    for lf in wav_files:
        audio, fs = sf.read(os.path.join(config.wav_dir, lf))
        vocals = np.array(audio[:, 1])
        mixture = (audio[:, 0] + audio[:, 1]) * 0.7
        backing = np.array(audio[:, 0])
        voc_stft = abs(utils.stft(vocals))
        mix_stft = abs(utils.stft(mixture))
        back_stft = abs(utils.stft(backing))
        assert voc_stft.shape == mix_stft.shape
        out_feats = utils.stft_to_feats(vocals, fs)
        if out_feats.shape[0] != voc_stft.shape[0]:
            if out_feats.shape[0] < voc_stft.shape[0]:
                # pad the features with zero rows until the lengths match
                while out_feats.shape[0] < voc_stft.shape[0]:
                    out_feats = np.concatenate(
                        (out_feats, np.zeros((1, out_feats.shape[1]))))
            elif out_feats.shape[0] > voc_stft.shape[0]:
                print("Features longer than STFT for", lf)
        assert out_feats.shape[0] == voc_stft.shape[0]
        hdf5_file = h5py.File(config.voice_dir + 'ikala_' + lf[:-4] + '.hdf5',
                              mode='w')
        hdf5_file.create_dataset("voc_stft", voc_stft.shape, np.float32)
        hdf5_file.create_dataset("feats", out_feats.shape, np.float32)
        hdf5_file["voc_stft"][:, :] = voc_stft
        hdf5_file["feats"][:, :] = out_feats
        hdf5_file.close()
        hdf5_file = h5py.File(config.backing_dir + 'ikala_' + lf[:-4] + '.hdf5',
                              mode='w')
        hdf5_file.create_dataset("back_stft", back_stft.shape, np.float32)
        hdf5_file.create_dataset("mix_stft", mix_stft.shape, np.float32)
        hdf5_file["back_stft"][:, :] = back_stft
        hdf5_file["mix_stft"][:, :] = mix_stft
        hdf5_file.close()
        count += 1
        utils.progress(count, len(wav_files))
def mse(true_path, pred_path):
    true_data, _ = librosa.load(true_path)
    true_stft = utils.stft(true_data)
    pred_data, _ = librosa.load(pred_path)
    pred_stft = utils.stft(pred_data)
    # compare magnitudes so the result is real even for complex STFTs
    return np.mean(np.square(np.abs(true_stft - pred_stft)))
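# Hypothetical usage of mse: compare a reference recording against a
# resynthesized one. File names are placeholders, and the two clips are
# assumed to have the same length so the STFT matrices align.
error = mse("reference.wav", "resynth.wav")
print("spectral MSE: {:.6f}".format(error))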
def main():
    singers = next(os.walk(config.wav_dir_nus))[1]
    for singer in singers:
        sing_dir = config.wav_dir_nus + singer + '/sing/'
        read_dir = config.wav_dir_nus + singer + '/read/'
        sing_wav_files = [x for x in os.listdir(sing_dir)
                          if x.endswith('.wav') and not x.startswith('.')]
        count = 0
        print("Processing singer %s" % singer)
        for lf in sing_wav_files:
            audio, fs = librosa.core.load(os.path.join(sing_dir, lf),
                                          sr=config.fs)
            audio = np.float64(audio)
            if len(audio.shape) == 2:
                vocals = np.array((audio[:, 1] + audio[:, 0]) / 2)
            else:
                vocals = np.array(audio)
            voc_stft = abs(utils.stft(vocals))
            out_feats = utils.stft_to_feats(vocals, fs)
            strings_p = process_lab_file(
                os.path.join(sing_dir, lf[:-4] + '.txt'), len(voc_stft))
            voc_stft, out_feats, strings_p = utils.match_time(
                [voc_stft, out_feats, strings_p])
            hdf5_file = h5py.File(config.voice_dir + 'nus_' + singer +
                                  '_sing_' + lf[:-4] + '.hdf5', mode='a')
            if "phonemes" not in hdf5_file:
                hdf5_file.create_dataset("phonemes", [voc_stft.shape[0]], int)
            hdf5_file["phonemes"][:] = strings_p[:, 0]
            hdf5_file.create_dataset("voc_stft", voc_stft.shape, np.float32)
            hdf5_file.create_dataset("feats", out_feats.shape, np.float32)
            hdf5_file["voc_stft"][:, :] = voc_stft
            hdf5_file["feats"][:, :] = out_feats
            hdf5_file.close()
            count += 1
            utils.progress(count, len(sing_wav_files))
        read_wav_files = [x for x in os.listdir(read_dir)
                          if x.endswith('.wav') and not x.startswith('.')]
        print("Processing reader %s" % singer)
        count = 0
        for lf in read_wav_files:
            audio, fs = librosa.core.load(os.path.join(read_dir, lf),
                                          sr=config.fs)
            audio = np.float64(audio)
            if len(audio.shape) == 2:
                vocals = np.array((audio[:, 1] + audio[:, 0]) / 2)
            else:
                vocals = np.array(audio)
            voc_stft = abs(utils.stft(vocals))
            out_feats = utils.stft_to_feats(vocals, fs)
            strings_p = process_lab_file(
                os.path.join(read_dir, lf[:-4] + '.txt'), len(voc_stft))
            voc_stft, out_feats, strings_p = utils.match_time(
                [voc_stft, out_feats, strings_p])
            hdf5_file = h5py.File(config.voice_dir + 'nus_' + singer +
                                  '_read_' + lf[:-4] + '.hdf5', mode='a')
            if "phonemes" not in hdf5_file:
                hdf5_file.create_dataset("phonemes", [voc_stft.shape[0]], int)
            hdf5_file["phonemes"][:] = strings_p[:, 0]
            hdf5_file.create_dataset("voc_stft", voc_stft.shape, np.float32)
            hdf5_file.create_dataset("feats", out_feats.shape, np.float32)
            hdf5_file["voc_stft"][:, :] = voc_stft
            hdf5_file["feats"][:, :] = out_feats
            hdf5_file.close()
            count += 1
            utils.progress(count, len(read_wav_files))
def test_stft():
    data = DATA[0]
    spec, f, t = stft(data, 44100, 1024, 512)
    print(np.sqrt(spec[:, 50] * 44100 / 1024))
parser.add_option("-f", type="float", dest="end_time", default=16) (options, args) = parser.parse_args() if len(args) == 0: filename = "Queen_mono.wav" else: filename = args[0] print options window = 1024 step = window / 4 (nyq, signal) = utils.slurp_wav(filename, int(options.start_time * 44100), int(44100 * options.end_time)) print "computing spectrogram" spectrogram = utils.stft(signal) print "computing power" power = utils.estimate_spectral_power(spectrogram) print "whitening spectrum" whitened = spectrogram / np.sqrt(power) whitened = utils.normalize_total_power(whitened, utils.total_power(spectrogram)) print "unwhitening spectrum" unwhitened = whitened * np.sqrt(power) unwhitened = utils.normalize_total_power(unwhitened, utils.total_power(spectrogram)) print "resynthesizing from whitened-unwhitened spectrogram" resynth = utils.resynthesize(unwhitened)
def run(args):
    num_bins, config_dict = parse_yaml(args.config)
    # Load cmvn
    dict_mvn = config_dict["dataloader"]["mvn_dict"]
    if dict_mvn:
        if not os.path.exists(dict_mvn):
            raise FileNotFoundError("Could not find mvn file")
        with open(dict_mvn, "rb") as f:
            dict_mvn = pickle.load(f)
    dcnet = DCNet(num_bins, **config_dict["dcnet"])
    frame_length = config_dict["spectrogram_reader"]["frame_length"]
    frame_shift = config_dict["spectrogram_reader"]["frame_shift"]
    window = config_dict["spectrogram_reader"]["window"]
    cluster = DeepCluster(dcnet,
                          args.dcnet_state,
                          args.num_spks,
                          pca=args.dump_pca,
                          cuda=args.cuda)
    utt_dict = parse_scps(args.wave_scp)
    num_utts = 0
    for key, utt in utt_dict.items():
        try:
            samps, stft_mat = stft(utt,
                                   frame_length=frame_length,
                                   frame_shift=frame_shift,
                                   window=window,
                                   center=True,
                                   return_samps=True)
        except FileNotFoundError:
            print("Skip utterance {}... not found".format(key))
            continue
        print("Processing utterance {}".format(key))
        num_utts += 1
        norm = np.linalg.norm(samps, np.inf)
        pca_mat, spk_mask, spk_spectrogram = cluster.seperate(stft_mat,
                                                              cmvn=dict_mvn)
        for index, stft_mat in enumerate(spk_spectrogram):
            istft(os.path.join(args.dump_dir,
                               '{}.spk{}.wav'.format(key, index + 1)),
                  stft_mat,
                  frame_length=frame_length,
                  frame_shift=frame_shift,
                  window=window,
                  center=True,
                  norm=norm,
                  fs=8000,
                  nsamps=samps.size)
            if args.dump_mask:
                sio.savemat(
                    os.path.join(args.dump_dir,
                                 '{}.spk{}.mat'.format(key, index + 1)),
                    {"mask": spk_mask[index]})
        if args.dump_pca:
            sio.savemat(os.path.join(args.dump_dir, '{}.mat'.format(key)),
                        {"pca_matrix": pca_mat})
    print("Processed {} utterances!".format(num_utts))
def _load(self, key):
    return stft(self.wave_dict[key], **self.stft_kwargs)
def train(epoch, model, optimizer, scaler, scheduler, log_train, args):
    global global_step
    global start_time
    epoch_loss = 0.0
    running_loss = [0., 0., 0.]
    log_interval = args.log_interval
    synth_interval = args.synth_interval
    timestemp = time.time()
    model.train()
    criterion_frame = nn.MSELoss()
    for batch_idx, (x, c) in enumerate(train_loader):
        global_step += 1
        optimizer.zero_grad()
        with autocast():
            x, c = x.to(device), c.to(device)
            log_p, log_det = model(x, c)
            loss = -(log_p + log_det)
        scaler.scale(loss).backward()
        with autocast():
            z = torch.randn_like(x)
            y_gen = model.reverse(z, c)
            stft_est = stft(y_gen[:, 0], scale='linear')
            stft_gt = stft(x[:, 0], scale='linear')
            loss_frame = 0.005 * criterion_frame(stft_est, stft_gt)
        scaler.scale(loss_frame).backward()
        if torch.isnan(loss) or torch.isnan(loss_frame):
            continue
        scaler.step(optimizer)
        scaler.update()
        scheduler.step()
        running_loss[0] += loss.item()
        running_loss[1] += log_p.item()
        running_loss[2] += log_det.item()
        epoch_loss += loss.item()
        if (batch_idx + 1) % log_interval == 0:
            epoch_step = batch_idx + 1
            running_loss[0] /= log_interval
            running_loss[1] /= log_interval
            running_loss[2] /= log_interval
            avg_rn_loss = np.array(running_loss)
            avg_time = (time.time() - timestemp) / log_interval
            print('Global Step : {}, [{}, {}] [NLL, Log p(z), Log Det] : {}, '
                  'STFT_loss: {}, avg time: {:0.4f}'.format(
                      global_step, epoch, epoch_step, avg_rn_loss,
                      loss_frame.item(), avg_time))
            state = {}
            state['Global Step'] = global_step
            state['Epoch'] = epoch
            state['Epoch Step'] = epoch_step
            state['NLL, Log p(z), Log Det'] = running_loss
            state['avg time'] = avg_time
            state['total time'] = time.time() - start_time
            log_train.write('%s\n' % json.dumps(state))
            log_train.flush()
            timestemp = time.time()
            running_loss = [0., 0., 0.]
        if (batch_idx + 1) % synth_interval == 0:
            with torch.no_grad():
                synthesize(model, args.num_sample, args.sr)
            model.train()
    del x, c, log_p, log_det, loss
    del running_loss
    gc.collect()
    print('{} Epoch Training Loss : {:.4f}'.format(
        epoch, epoch_loss / len(train_loader)))
    return epoch_loss / len(train_loader)
parser.add_option("-s", type="float", dest="start_time", default=1) parser.add_option("-f", type="float", dest="end_time", default=16) (options, args) = parser.parse_args() if len(args)==0: filename = "Queen_mono.wav" else: filename = args[0] print options window = 1024 step = window / 4 (nyq, signal) = utils.slurp_wav(filename, int(options.start_time * 44100), int(44100 * options.end_time)) print "computing spectrogram" spectrogram = utils.stft(signal) print "computing power" power = utils.estimate_spectral_power(spectrogram) print "whitening spectrum" whitened = spectrogram / np.sqrt(power) whitened = utils.normalize_total_power(whitened, utils.total_power(spectrogram)) print "unwhitening spectrum" unwhitened = whitened * np.sqrt(power) unwhitened = utils.normalize_total_power(unwhitened, utils.total_power(spectrogram)) print "resynthesizing from whitened-unwhitened spectrogram" resynth = utils.resynthesize(unwhitened) wavfile.write("resynth.wav", int(2 * nyq), resynth)
def main():
    singers = next(os.walk(config.wav_dir_nus))[1]
    for singer in singers:
        sing_dir = config.wav_dir_nus + singer + '/sing/'
        read_dir = config.wav_dir_nus + singer + '/read/'
        sing_wav_files = [x for x in os.listdir(sing_dir)
                          if x.endswith('.wav') and not x.startswith('.')]
        count = 0
        print("Processing singer %s" % singer)
        for lf in sing_wav_files:
            audio, fs = sf.read(os.path.join(sing_dir, lf))
            if fs != config.fs:
                # resample in place with ffmpeg, then re-read
                command = ("ffmpeg -y -i " + os.path.join(sing_dir, lf) +
                           " -ar " + str(config.fs) + " " +
                           os.path.join(sing_dir, lf))
                os.system(command)
                audio, fs = sf.read(os.path.join(sing_dir, lf))
            if len(audio.shape) == 2:
                vocals = np.array((audio[:, 1] + audio[:, 0]) / 2)
            else:
                vocals = np.array(audio)
            # keep magnitude and phase of the vocal STFT
            voc_stft_full = utils.stft(vocals)
            voc_stft = abs(voc_stft_full)
            voc_stft_phase = np.angle(voc_stft_full)
            with open(os.path.join(sing_dir, lf[:-4] + '.txt')) as lab_f:
                phos = lab_f.readlines()
            phonemes = []
            for pho in phos:
                st, end, phonote = pho.split()
                # label times are in seconds; convert to STFT frame indices
                st = int(np.round(float(st) / 0.005804576860324892))
                en = int(np.round(float(end) / 0.005804576860324892))
                if phonote == 'pau' or phonote == 'br':
                    phonote = 'sil'
                phonemes.append([st, en, phonote])
            phonemes[-1][1] = len(voc_stft)
            strings_p = np.zeros(phonemes[-1][1])
            for pho in phonemes:
                value = config.phonemas.index(pho[2])
                strings_p[pho[0]:pho[1] + 1] = value
            assert len(strings_p) == len(voc_stft)
            hdf5_file = h5py.File(config.voice_dir + 'nus_' + singer +
                                  '_sing_' + lf[:-4] + '.hdf5', mode='a')
            if "phonemes" not in hdf5_file:
                hdf5_file.create_dataset("phonemes", [voc_stft.shape[0]], int)
            hdf5_file["phonemes"][:] = strings_p
            hdf5_file.create_dataset("voc_stft_phase", voc_stft_phase.shape,
                                     np.float32)
            hdf5_file["voc_stft_phase"][:, :] = voc_stft_phase
            hdf5_file.close()
            count += 1
            utils.progress(count, len(sing_wav_files))
        read_wav_files = [x for x in os.listdir(read_dir)
                          if x.endswith('.wav') and not x.startswith('.')]
        print("Processing reader %s" % singer)
        count = 0
        if not singer == 'KENN':
            for lf in read_wav_files:
                audio, fs = sf.read(os.path.join(read_dir, lf))
                if fs != config.fs:
                    command = ("ffmpeg -y -i " + os.path.join(read_dir, lf) +
                               " -ar " + str(config.fs) + " " +
                               os.path.join(read_dir, lf))
                    os.system(command)
                    audio, fs = sf.read(os.path.join(read_dir, lf))
                if len(audio.shape) == 2:
                    vocals = np.array((audio[:, 1] + audio[:, 0]) / 2)
                else:
                    vocals = np.array(audio)
                voc_stft = abs(utils.stft(vocals))
                with open(os.path.join(read_dir, lf[:-4] + '.txt')) as lab_f:
                    phos = lab_f.readlines()
                phonemes = []
                for pho in phos:
                    st, end, phonote = pho.split()
                    st = int(np.round(float(st) / 0.005804576860324892))
                    en = int(np.round(float(end) / 0.005804576860324892))
                    if phonote == 'pau' or phonote == 'br':
                        phonote = 'sil'
                    phonemes.append([st, en, phonote])
                phonemes[-1][1] = len(voc_stft)
                strings_p = np.zeros(phonemes[-1][1])
                for pho in phonemes:
                    value = config.phonemas.index(pho[2])
                    strings_p[pho[0]:pho[1] + 1] = value
                assert len(strings_p) == len(voc_stft)
                hdf5_file = h5py.File(config.voice_dir + 'nus_' + singer +
                                      '_read_' + lf[:-4] + '.hdf5', mode='a')
                if "phonemes" not in hdf5_file:
                    hdf5_file.create_dataset("phonemes", [voc_stft.shape[0]],
                                             int)
                hdf5_file["phonemes"][:] = strings_p
                hdf5_file.close()
                count += 1
                utils.progress(count, len(read_wav_files))
def main(args):
    """Example: args.fname = "aiueo.wav"."""
    # get current working directory
    path = os.path.dirname(os.path.abspath(__file__))
    # load audio file
    fname = os.path.join(path, "data", args.fname)
    wav, sr = librosa.load(fname, mono=True)
    # plot signal
    plt.figure()
    ax = plt.subplot(111)
    librosa.display.waveplot(wav, sr=sr, color="g", ax=ax)
    ax.set(title="Original signal", xlabel="Time [s]", ylabel="Magnitude")
    save_fname = os.path.join(path, "result", "signal.png")
    plt.savefig(save_fname, transparent=True)
    plt.show()
    # parameters
    hop = 0.5
    win_length = 1024
    hop_length = int(win_length * hop)
    # make mel filter bank
    n_channels = 20  # number of mel filter bank channels
    df = sr / win_length  # frequency resolution (Hz width per frequency index 1)
    filterbank, _ = melFilterBank(sr, win_length, n_channels)
    # plot mel filter bank
    for c in range(n_channels):
        plt.plot(np.arange(0, win_length / 2) * df, filterbank[c])
    plt.title("Mel filter bank")
    plt.xlabel("Frequency [Hz]")
    save_fname = os.path.join(path, "result", "MelFilterBank.png")
    plt.savefig(save_fname, transparent=True)
    plt.show()
    # spectrogram (ex1)
    fig, ax = plt.subplots(nrows=1, ncols=1)
    amp = utils.stft(wav, hop=hop, win_length=win_length)
    db = librosa.amplitude_to_db(np.abs(amp))
    img = librosa.display.specshow(db, sr=sr, hop_length=hop_length,
                                   x_axis="time", y_axis="linear",
                                   ax=ax, cmap="rainbow")
    ax.set(title="Spectrogram", xlabel=None, ylabel="Frequency [Hz]")
    fig.colorbar(img, aspect=10, pad=0.01, ax=ax, format="%+2.f dB")
    save_fname = os.path.join(path, "result", "spectrogram.png")
    plt.savefig(save_fname, transparent=True)
    plt.show()
    fig, ax = plt.subplots(nrows=4, ncols=1, sharex=True, figsize=(10, 6))
    plt.subplots_adjust(hspace=0.6)
    # calculate mel spectrogram and MFCC
    mel_spec, mfcc = calc_mfcc(wav, hop, win_length, filterbank)
    # mel spectrogram
    wav_time = wav.shape[0] // sr
    f_nyq = sr // 2
    extent = [0, wav_time, 0, f_nyq]
    img = ax[0].imshow(librosa.amplitude_to_db(mel_spec), aspect="auto",
                       extent=extent, cmap="rainbow")
    ax[0].set(title="Mel spectrogram", xlabel=None,
              ylabel="Mel frequency [mel]", ylim=[0, 8000],
              yticks=range(0, 10000, 2000))
    fig.colorbar(img, aspect=10, pad=0.01, ax=ax[0], format="%+2.f dB")
    # MFCC
    n_mfcc = 12
    extent = [0, wav_time, 0, n_mfcc]
    img = ax[1].imshow(np.flipud(mfcc[:n_mfcc]), aspect="auto",
                       extent=extent, cmap="rainbow")
    ax[1].set(title="MFCC sequence", xlabel=None, ylabel="MFCC",
              yticks=range(0, 13, 4))
    fig.colorbar(img, aspect=10, pad=0.01, ax=ax[1], format="%+2.f dB")
    # delta MFCC
    d_mfcc = delta_mfcc(mfcc, k=2)
    img = ax[2].imshow(np.flipud(d_mfcc[:n_mfcc]), aspect="auto",
                       extent=extent, cmap="rainbow")
    ax[2].set(title="ΔMFCC sequence", xlabel=None, ylabel="ΔMFCC",
              yticks=range(0, 13, 4))
    fig.colorbar(img, aspect=10, pad=0.01, ax=ax[2], format="%+2.f dB")
    # delta-delta MFCC
    dd_mfcc = delta_mfcc(d_mfcc, k=2)
    img = ax[3].imshow(np.flipud(dd_mfcc[:n_mfcc]), aspect="auto",
                       extent=extent, cmap="rainbow")
    ax[3].set(title="ΔΔMFCC sequence", xlabel="Time [s]", ylabel="ΔΔMFCC",
              yticks=range(0, 13, 4))
    fig.colorbar(img, aspect=10, pad=0.01, ax=ax[3], format="%+2.f dB")
    save_fname = os.path.join(path, "result", "mfcc_result.png")
    plt.savefig(save_fname, transparent=True)
    plt.show()