def main(argv):
    """Separate vocals from FLAGS.original_wav with a trained Unet.

    Writes the cropped input mixture, the predicted vocal track, spectrogram
    images, and (when FLAGS.gt is set) BSS evaluation metrics into
    FLAGS.output_dir.
    """
    os.makedirs(FLAGS.output_dir, exist_ok=True)

    ''' Initialize model '''
    unet = Unet()
    restore(net=unet, ckpt_path=FLAGS.ckpt_path)

    ''' Load data '''
    mix_wav, _ = load(FLAGS.original_wav, sr=SAMPLE_RATE)
    mix_wav_mag, mix_wav_phase = magphase(
        stft(mix_wav, n_fft=WINDOW_SIZE, hop_length=HOP_LENGTH))
    # Crop a fixed [START:END] frame window (constants from config).
    mix_wav_mag = mix_wav_mag[:, START:END]
    mix_wav_phase = mix_wav_phase[:, START:END]

    '''Load gt '''
    if FLAGS.gt:
        gt_wav, _ = load(FLAGS.original_gt, sr=SAMPLE_RATE)
        gt_wav_mag, gt_wav_phase = magphase(
            stft(gt_wav, n_fft=WINDOW_SIZE, hop_length=HOP_LENGTH))
        gt_wav_mag = gt_wav_mag[:, START:END]
        gt_wav_phase = gt_wav_phase[:, START:END]

    '''Save input spectrogram image and gt'''
    # os.path.join fixes the original bare concatenation, which produced a
    # wrong path whenever FLAGS.output_dir had no trailing separator.
    mix_path = os.path.join(FLAGS.output_dir, 'original_mix.wav')
    write_wav(mix_path,
              istft(mix_wav_mag * mix_wav_phase,
                    win_length=WINDOW_SIZE, hop_length=HOP_LENGTH),
              SAMPLE_RATE, norm=True)
    spectogram_librosa(mix_path, 0)
    if FLAGS.gt:
        gt_path = os.path.join(FLAGS.output_dir, 'gt.wav')
        write_wav(gt_path,
                  istft(gt_wav_mag * gt_wav_phase,
                        win_length=WINDOW_SIZE, hop_length=HOP_LENGTH),
                  SAMPLE_RATE, norm=True)
        spectogram_librosa(gt_path, 0)

    ''' run data '''
    # Drop the DC bin so the 513-bin magnitude becomes the 512x128 input the
    # network expects (assumes WINDOW_SIZE = 1024 -- TODO confirm).
    inputs = mix_wav_mag[1:].reshape(1, 512, 128, 1)
    mask = unet(inputs).numpy().reshape(512, 128)
    predict = inputs.reshape(512, 128) * mask

    ''' evaluation metrics '''
    if FLAGS.gt:
        expand_pre = np.expand_dims(predict.flatten(), axis=0)
        expand_gt = np.expand_dims(gt_wav_mag[1:].flatten(), axis=0)
        expand_input = np.expand_dims(inputs.flatten(), axis=0)
        (SDR, SIR, SAR, _) = mir_eval.separation.bss_eval_sources(
            expand_gt, expand_pre)
        (SDR2, _, _, _) = mir_eval.separation.bss_eval_sources(
            expand_gt, expand_input)
        NSDR = SDR - SDR2  # SDR(Se, Sr) - SDR(Sm, Sr)
        # Build the report once so the console and the file cannot diverge.
        metrics = ('*****SDR = ' + str(SDR) + ', SIR = ' + str(SIR) +
                   ', SAR = ' + str(SAR) + ', NSDR = ' + str(NSDR) + '*****')
        print(metrics)
        # 'with' guarantees the handle is closed even if the write raises.
        with open(os.path.join(FLAGS.output_dir, 'metrics.txt'), 'a') as fout:
            fout.write(metrics)

    ''' Convert model output to target magnitude '''
    # Re-insert a zero DC row so the magnitude matches the phase spectrogram.
    target_pred_mag = np.vstack((np.zeros((128)), predict))

    ''' Write vocal prediction audio files '''
    pred_path = os.path.join(FLAGS.output_dir, 'pred_vocal.wav')
    write_wav(pred_path,
              istft(target_pred_mag * mix_wav_phase,
                    win_length=WINDOW_SIZE, hop_length=HOP_LENGTH),
              SAMPLE_RATE, norm=True)
    spectogram_librosa(pred_path, 1)
def __call__(self, audio):
    """Turn a waveform tensor into a stacked (magnitude, phase-angle) tensor.

    The audio tensor is flattened to 1-D, transformed with an STFT using the
    instance's hop/window/FFT sizes, and returned as a 2 x freq x time tensor
    whose channels are the magnitude and the phase angle in radians.
    """
    samples = audio.numpy().reshape(-1, )
    spectrum = stft(samples, hop_length=self.hop, win_length=self.ws,
                    n_fft=self.n_fft)
    magnitude, unit_phase = magphase(spectrum)
    # magphase yields unit-modulus complex phases; keep only the angle.
    phase_angle = np.angle(unit_phase)
    return torch.stack((torch.Tensor(magnitude), torch.Tensor(phase_angle)),
                       dim=0)
def test():
    """Smoke-test a trained Unet checkpoint on a local mp3.

    Loads the model, separates vocals from a fixed 128-frame window of the
    input, pushes spectrogram images to visdom, and writes the predicted
    vocal and reconstructed mixture audio next to the source file.
    """
    vis = Visualizer(env='svs')
    model = getattr(models, 'Unet')().eval()
    # model.cuda()
    model.load_state_dict(
        t.load('G:/Unet_svs/check/epoch_219__0724_16_57_35.pth'))

    mix_wav, _ = load("C:/Users/lenovo/Music/c.mp3", sr=8192)
    mix_wav_mag, mix_wav_phase = magphase(
        stft(mix_wav, n_fft=1024, hop_length=768))

    # Crop a fixed 128-frame window starting at frame 700.
    START = 700
    END = START + 128
    mix_wav_mag = mix_wav_mag[:, START:END]
    mix_wav_phase = mix_wav_phase[:, START:END]
    print(mix_wav_mag.shape)

    # Drop the DC bin, then add batch and channel dims: (1, 1, 512, 128).
    net_in = t.from_numpy(mix_wav_mag[1:])
    net_in.unsqueeze_(0)
    net_in.unsqueeze_(0)
    vis.img('a', net_in)
    print(net_in.shape)

    with t.no_grad():
        net_in = Variable(net_in)
        soft_mask = model(net_in)
        masked = net_in.data * soft_mask.data
        print(masked.shape)
        pred_mag = masked.view(512, 128).cpu().numpy()
        # Re-insert a zero DC row so shapes match the phase spectrogram.
        pred_mag = np.vstack((np.zeros((128)), pred_mag))
        vis.img('b', t.from_numpy(pred_mag))
        print(pred_mag.shape)
        write_wav(
            f'C:/Users/lenovo/Music/pred_vocal.wav',
            istft(
                pred_mag * mix_wav_phase
                # (mix_wav_mag * target_pred_mag) * mix_wav_phase
                , win_length=1024, hop_length=768),
            8192, norm=True)
        write_wav(f'C:/Users/lenovo/Music/pred_mix.wav',
                  istft(mix_wav_mag * mix_wav_phase,
                        win_length=1024, hop_length=768),
                  8192, norm=True)
def gl_rec(S, n_iter=40):
    """Griffin-Lim reconstruction of a waveform from a log10 spectrogram.

    Args:
        S: log10-compressed magnitude spectrogram (freq_bins x frames).
        n_iter: number of Griffin-Lim refinement iterations (default 40,
            matching the original hard-coded loop).

    Returns:
        The reconstructed time-domain signal as a numpy array.
    """
    sr, nfft, wlen, hop = 22050, 1022, 1022, 256  # sr unused; kept for reference
    S = 10 ** (S)  # undo the log10 compression

    # Initialize with uniformly random unit-modulus complex phases.
    # BUGFIX: the original used `3.1415 * (randn - 0.5)` -- a *real*
    # multiplier that scales the magnitude instead of randomizing the phase;
    # Griffin-Lim requires complex exp(i*theta) factors here.
    angles = np.exp(2j * np.pi * np.random.rand(S.shape[0], S.shape[1]))
    y = core.istft(S * angles, hop_length=hop, win_length=wlen)

    for _ in range(n_iter):
        spec = core.stft(y, n_fft=nfft, hop_length=hop, win_length=wlen)
        # The istft/stft round trip may drop trailing frames; trim S to match.
        S = S[:, :spec.shape[1]]
        _, angles = core.magphase(spec)
        y = core.istft(S * angles, hop_length=hop, win_length=wlen)
    return y
def LoadAudioTrainingDataFromFile(csv_file_name, validation_size, nmfcc=None,
                                  nfft=None, output_type=None):
    """Load spoken-digit audio listed in a CSV and build normalized arrays.

    Args:
        csv_file_name: CSV whose first column holds file names under
            'recordings/'; the first character of each name is the digit label.
        validation_size: number of (shuffled) rows split off as validation.
        nmfcc: number of MFCC coefficients to compute.
        nfft: FFT size for the spectrum features.
        output_type: 'mfcc' or 'spectrum' -- selects which features become
            the training input.

    Returns:
        (training_input, training_output, validation_input, validation_output)

    Raises:
        ValueError: if output_type is not 'mfcc' or 'spectrum' (the original
            code died with a NameError in this case).
    """
    mfcc_input = []
    mag_input = []
    digit = []
    output = []

    # Read in CSV file with file names.
    # ????? Should probably require output as a separate column instead of
    # reading from file name.
    with open(csv_file_name, 'r') as f:
        reader = csv.reader(f)
        file_list = list(reader)

    # Load audio; labels come from the first character of the file name.
    for files in file_list:
        relative_path = 'recordings/' + files[0]
        file_name = os.path.join(os.path.dirname(__file__), relative_path)
        y, sr = load(file_name, sr=None)
        # NOTE(review): getsizeof is the object's byte size, not the sample
        # count -- confirm this hop_length choice is intentional.
        filesize = sys.getsizeof(y)
        if output_type == 'spectrum':
            spectrum = stft(y, nfft, hop_length=int(filesize / 2))
            mag, phase = magphase(spectrum)
            mag_input.append(mag)
        mfcc = feature.mfcc(y, sr, n_mfcc=nmfcc, hop_length=int(filesize / 2))
        mfcc = mfcc[1:nmfcc]  # drop the 0th (energy) coefficient
        mfcc_input.append(mfcc)
        digit.append(files[0][0])

    # One-hot vectors via a lookup table instead of a ten-branch if chain.
    one_hot = {
        '0': digits.zero, '1': digits.one, '2': digits.two, '3': digits.three,
        '4': digits.four, '5': digits.five, '6': digits.six, '7': digits.seven,
        '8': digits.eight, '9': digits.nine,
    }
    for num in digit:
        if num in one_hot:
            output.append(one_hot[num])

    # Normalize the selected feature set.
    if output_type == 'mfcc':
        training_input = numpy.asarray(mfcc_input, dtype=numpy.float64)
        training_input = numpy.squeeze(training_input)
        min_max_scaler = preprocessing.MinMaxScaler(feature_range=(-1, 1))
        training_input = min_max_scaler.fit_transform(training_input)
    elif output_type == 'spectrum':
        training_input = numpy.asarray(mag_input, dtype=numpy.float64)
        training_input = numpy.squeeze(training_input)
        min_max_scaler = preprocessing.MinMaxScaler(feature_range=(0, 1))
        training_input = min_max_scaler.fit_transform(training_input)
    else:
        raise ValueError("output_type must be 'mfcc' or 'spectrum'")

    training_output = numpy.asarray(output, dtype=numpy.float64)
    min_max_scaler = preprocessing.MinMaxScaler(feature_range=(0, 6))
    training_output = min_max_scaler.fit_transform(training_output)

    # Shuffle inputs and outputs with the same permutation before splitting.
    randomize = numpy.arange(len(training_input))
    numpy.random.shuffle(randomize)
    training_input = training_input[randomize]
    training_output = training_output[randomize]

    # Pull the validation set off the front of the shuffled data.
    validation_input = training_input[0:validation_size, :]
    validation_output = training_output[0:validation_size, :]
    return training_input, training_output, validation_input, validation_output
def load_as_mag(file):
    """Load an audio file and return its STFT magnitude as float32."""
    samples, _ = load(file, sr=None)
    magnitude, _ = magphase(
        stft(samples, n_fft=WINDOW_SIZE, hop_length=HOP_LENGTH))
    return magnitude.astype(np.float32)
def predict(self, loader):
    """
    Predict for an input.

    Runs the model over every batch in `loader` with gradients disabled and
    collects complex-valued predictions alongside ground truth and metadata.

    Args
    ----
    loader : PyTorch DataLoader.

    Returns
    -------
    Tuple of per-batch lists:
    (predictions, ground-truth complex spectrograms, class labels `c`,
    values `t`, masks, track indices).
    """
    self.model.eval()
    # Per-batch accumulators returned to the caller.
    all_preds = []
    all_ys = []
    all_cs = []
    all_ts = []
    all_ms = []
    all_idx = []
    # n_frames lives on the underlying dataset; unwrap Subset/ConcatDataset.
    if isinstance(loader.dataset, torch.utils.data.Subset):
        n_frames = loader.dataset.dataset.n_frames
    elif isinstance(loader.dataset, torch.utils.data.ConcatDataset):
        n_frames = loader.dataset.datasets[0].n_frames
    else:
        n_frames = loader.dataset.n_frames
    with torch.no_grad():
        for batch_samples in tqdm(loader):
            # prepare training sample
            X = batch_samples['X']
            if X.dim() == 4:
                full_track = False
                # batch_size x in_channels x 1025 x 129
            else:
                # A 5-D X means each track was split into `ns` chunks;
                # fold the split dimension into the batch for the forward pass.
                bs = X.size(0)
                ns = X.size(1)
                full_track = True
                # batch_size * splits x in_channels x 1025 x 129
                X = X.view(bs * ns, self.in_channels, self.n_fft, n_frames)
            # batch_size x in_channels x 1025 x 129 x 2
            X_complex = batch_samples['X_complex']
            if X_complex.dim() != 5:
                # batch_size * splits x in_channels x 1025 x 129 x 2
                X_complex = X_complex.view(
                    bs * ns, self.out_channels, self.n_fft, n_frames, 2)
            # batch_size x nclasses x in_channels x 1025 x time samples x 2
            y = batch_samples['y_complex']
            # batch_size x nclasses
            cs = batch_samples['c']
            # batch_size x 1
            ts = batch_samples['t']
            track_idx = batch_samples['track_idx']
            if self.USE_CUDA:
                X = X.cuda()
                X_complex = X_complex.cuda()
                y = y.cuda()
            # Forward in micro-batches of at most 4 to bound memory use.
            if X.size(0) > 4:
                X_list = torch.split(X, 4, dim=0)
            else:
                X_list = [X]
            masks_list = []
            pred_list = []
            for X in X_list:
                # detach hidden state
                self.model.detach_hidden(X.size(0))
                # forward pass
                preds, mask = self.model(X)
                masks_list += [mask]
                pred_list += [preds]
            mask = torch.cat(masks_list, dim=0)
            preds = torch.cat(pred_list, dim=0)
            if full_track:
                # Undo the chunking: restore the split dim, then concatenate
                # the chunks back along the time axis.
                # batch size x nclasses x in_channels x 1025 x time samples
                if self.regression:
                    preds = preds.view(
                        bs, ns, self.n_classes, self.out_channels,
                        self.n_fft, n_frames)
                    preds = torch.unbind(preds, dim=1)
                    preds = torch.cat(preds, dim=4)
                else:
                    mask = mask.view(
                        bs, ns, self.n_classes, self.out_channels,
                        self.n_fft, n_frames)
                    mask = torch.unbind(mask, dim=1)
                    mask = torch.cat(mask, dim=4)
                # batch_size x in_channels x 1025 x time samples x 2
                X_complex = X_complex.view(
                    bs, ns, self.out_channels, self.n_fft, n_frames, 2)
                X_complex = torch.unbind(X_complex, dim=1)
                X_complex = torch.cat(X_complex, dim=3)
            # convert to complex
            # batch size x nclasses x in_channels x 1025 x time samples x 2
            # NOTE(review): reconstructed from flattened source -- placement
            # at loop level (not inside `if full_track`) is implied by the
            # shape arithmetic of the regression branch below; confirm.
            X_complex = X_complex.unsqueeze(1).repeat(
                1, self.n_classes, 1, 1, 1, 1)
            X_complex = self._to_complex(X_complex)
            if self.regression:
                # Regression output is a magnitude; re-attach the mixture phase.
                _, X_phase = magphase(X_complex)
                preds = preds.cpu().numpy() * X_phase
            else:
                # Mask output multiplies the complex mixture directly.
                preds = mask.cpu().numpy() * X_complex
            # batch size x nclasses x in_channels x 1025 x time samples
            ys = self._to_complex(y)
            all_preds += [preds]
            all_ys += [ys]
            all_cs += [cs]
            all_ts += [ts]
            all_ms += [mask.cpu().numpy()]
            all_idx += [track_idx]
    return all_preds, all_ys, all_cs, all_ts, all_ms, all_idx
import numpy as np
from librosa.core import istft, load, stft, magphase
from librosa.output import write_wav
from config import *
import keras as keras

if __name__ == '__main__':
    # load test audio and convert to mag/phase
    mix_wav, _ = load("../wav_files/mixture.wav", sr=SAMPLE_RATE)
    mix_wav_mag, mix_wav_phase = magphase(
        stft(mix_wav, n_fft=WINDOW_SIZE, hop_length=HOP_LENGTH))
    vocal_wav, _ = load("../wav_files/vocals.wav", sr=SAMPLE_RATE)
    vocal_wav_mag, vocal_wav_phase = magphase(
        stft(vocal_wav, n_fft=WINDOW_SIZE, hop_length=HOP_LENGTH))

    # Crop the same fixed 128-frame window from mixture and vocals.
    START = 0
    END = START + 128
    mix_wav_mag = mix_wav_mag[:, START:END]
    mix_wav_phase = mix_wav_phase[:, START:END]
    vocal_wav_mag = vocal_wav_mag[:, START:END]
    vocal_wav_phase = vocal_wav_phase[:, START:END]

    # load saved model
    model = keras.models.load_model('../models/vocal_20_test_model.h5')
    #model = keras.models.load_model('../models/vocal_20.h5')

    # predict and write into file
    # Drop the DC bin so 513 freq bins become the 512x128 network input.
    # NOTE(review): this chunk appears truncated here -- X is built but the
    # prediction/write code presumably continues past this view.
    X = mix_wav_mag[1:].reshape(1, 512, 128, 1)
def mag_phase_angle(x):
    """Split a complex spectrogram into stacked magnitude and phase-angle arrays."""
    magnitude, unit_phase = magphase(x)
    # magphase yields unit-modulus complex values; keep only the angle.
    return np.stack([magnitude, np.angle(unit_phase)])
def separate_instruments(file_path):
    """Harmonic/percussive source separation via median filtering.

    Computes an STFT of the input file (scipy.signal.stft), median-filters
    the magnitude along time (harmonic) and frequency (percussive), builds
    soft masks, shows intermediate spectrograms with matplotlib, and writes
    the two separated signals as '<file>_H_med.wav' and '<file>_P_med.wav'.

    NOTE(review): reconstructed from whitespace-mangled source; indentation
    is inferred. Mixes scipy.signal STFT output with librosa display/output
    helpers -- confirm scaling conventions match.
    """
    plt.rcParams['figure.figsize'] = (14, 5)
    x, sr = librosa.load(file_path, sr=None)
    winlen = 1024
    # scipy.signal.stft returns (frequencies, times, spectrogram);
    # only the spectrogram X is used below.
    h, i, X = stft(x=x, fs=sr, window='hann', nperseg=winlen,
                   noverlap=int(winlen / 2), nfft=winlen, detrend=False,
                   return_onesided=True, padded=True, axis=-1)
    # information about wav
    print(len(x))
    # short-time fourier transform
    # X = librosa.stft(x)
    # log-amplitude
    Xmag = librosa.amplitude_to_db(X)
    # show harm-perc spectrogram
    librosa.display.specshow(Xmag, sr=sr, x_axis='time', y_axis='log')
    plt.colorbar()
    plt.show()
    #############
    # HPSS parameters (mirrors librosa.decompose.hpss internals).
    S = X
    kernel_size = 31
    power = 2.0
    mask = False  # unused; kept from the hpss parameter set
    margin = 1.0
    # Separate magnitude from phase; phase is re-applied after masking.
    if np.iscomplexobj(S):
        S, phase = core.magphase(S)
    else:
        phase = 1
    # kernel_size/margin may be scalars or (harmonic, percussive) pairs.
    if np.isscalar(kernel_size):
        win_harm = kernel_size
        win_perc = kernel_size
    else:
        win_harm = kernel_size[0]
        win_perc = kernel_size[1]
    if np.isscalar(margin):
        margin_harm = margin
        margin_perc = margin
    else:
        margin_harm = margin[0]
        margin_perc = margin[1]
    split_zeros = (margin_harm == 1 and margin_perc == 1)
    # Compute median filters. Pre-allocation here preserves memory layout.
    # Harmonic: median along time; percussive: median along frequency.
    harm = np.empty_like(S)
    harm[:] = median_filter(S, size=(1, win_harm), mode='reflect')
    perc = np.empty_like(S)
    perc[:] = median_filter(S, size=(win_perc, 1), mode='reflect')
    Hmag = librosa.amplitude_to_db(harm)
    # librosa.display.specshow(harm, sr=sr, x_axis='time', y_axis='log')
    # plt.colorbar()
    # plt.show()
    Pmag = librosa.amplitude_to_db(perc)
    # librosa.display.specshow(perc, sr=sr, x_axis='time', y_axis='log')
    # plt.colorbar()
    # plt.show()
    # Soft masks: each component against the other, scaled by its margin.
    mask_harm_soft = util.softmask(harm, perc * margin_harm, power=power,
                                   split_zeros=split_zeros)
    mask_perc_soft = util.softmask(perc, harm * margin_perc, power=power,
                                   split_zeros=split_zeros)
    soft_mask_X_harm = (S * mask_harm_soft) * phase
    Xmag_harm_soft = librosa.amplitude_to_db(soft_mask_X_harm)
    soft_mask_X_perc = (S * mask_perc_soft) * phase
    Xmag_perc_soft = librosa.amplitude_to_db(soft_mask_X_perc)
    # mask_harm_hard = harm > perc * margin_harm
    # mask_perc_hard = perc > harm * margin_perc
    # hard_mask_X_harm = (S * mask_harm_hard) * phase
    # Xmag_harm_hard = librosa.amplitude_to_db(hard_mask_X_harm)
    # hard_mask_X_perc = (S * mask_perc_hard) * phase
    # Xmag_perc_hard = librosa.amplitude_to_db(hard_mask_X_perc)
    librosa.display.specshow(Xmag_harm_soft, sr=sr, x_axis='time',
                             y_axis='log')
    plt.colorbar()
    plt.show()
    librosa.display.specshow(Xmag_perc_soft, sr=sr, x_axis='time',
                             y_axis='log')
    plt.colorbar()
    plt.show()
    # x_h, sr_h = librosa.load('my_audio_mod/01_AF_NM_h.wav', duration=6, sr=None)
    # x_p, sr_p = librosa.load('my_audio_mod/01_AF_NM_p.wav', duration=6, sr=None)
    # librosa.display.waveplot( x_h, sr=sr_h)
    # plt.show()
    # librosa.display.waveplot( x_p, sr=sr_p)
    # plt.show()
    # Final masked complex spectrograms (same expressions as above).
    H = (S * mask_harm_soft) * phase
    P = (S * mask_perc_soft) * phase
    Hmag = librosa.amplitude_to_db(H)
    Pmag = librosa.amplitude_to_db(P)
    librosa.display.specshow(Hmag, sr=sr, x_axis='time', y_axis='log')
    plt.colorbar()
    plt.show()
    librosa.display.specshow(Pmag, sr=sr, x_axis='time', y_axis='log')
    plt.colorbar()
    plt.show()
    # h = librosa.istft(H)
    # p = librosa.istft(P)
    # scipy.signal.istft returns (times, signal); keep only the signal.
    _, h = istft(H, fs=sr, window='hann', nperseg=winlen,
                 noverlap=int(winlen / 2), nfft=winlen, input_onesided=True)
    _, p = istft(P, fs=sr, window='hann', nperseg=winlen,
                 noverlap=int(winlen / 2), nfft=winlen, input_onesided=True)
    # saving
    librosa.output.write_wav(
        os.path.splitext(file_path)[0] + '_H_med.wav', h, sr)
    librosa.output.write_wav(
        os.path.splitext(file_path)[0] + '_P_med.wav', p, sr)
def LoadAudio(path_audio):
    """Load an audio file at SR and return its STFT magnitude spectrogram."""
    samples, _ = load(path_audio, sr=SR)
    magnitude, _ = magphase(stft(samples, n_fft=FFTSIZE, hop_length=H))
    return magnitude