Example #1
def main(argv):
    os.makedirs(FLAGS.output_dir, exist_ok=True)
    ''' Initialize model '''
    unet = Unet()
    restore(net=unet, ckpt_path=FLAGS.ckpt_path)

    ''' Load data '''
    mix_wav, _ = load(FLAGS.original_wav, sr=SAMPLE_RATE)
    mix_wav_mag, mix_wav_phase = magphase(stft(mix_wav, n_fft=WINDOW_SIZE, hop_length=HOP_LENGTH))
    mix_wav_mag = mix_wav_mag[:, START:END]
    mix_wav_phase = mix_wav_phase[:, START:END]

    '''Load gt '''
    if FLAGS.gt:
        gt_wav, _ = load(FLAGS.original_gt, sr=SAMPLE_RATE)
        gt_wav_mag, gt_wav_phase = magphase(stft(gt_wav, n_fft=WINDOW_SIZE, hop_length=HOP_LENGTH))
        gt_wav_mag = gt_wav_mag[:, START:END]
        gt_wav_phase = gt_wav_phase[:, START:END]

    '''Save input spectrogram image and gt'''
    write_wav(FLAGS.output_dir + 'original_mix.wav',
              istft(mix_wav_mag * mix_wav_phase, win_length=WINDOW_SIZE, hop_length=HOP_LENGTH),
              SAMPLE_RATE, norm=True)
    spectogram_librosa(FLAGS.output_dir + 'original_mix.wav', 0)
    if FLAGS.gt:
        write_wav(FLAGS.output_dir + 'gt.wav',
                  istft(gt_wav_mag * gt_wav_phase, win_length=WINDOW_SIZE, hop_length=HOP_LENGTH),
                  SAMPLE_RATE, norm=True)
        spectogram_librosa(FLAGS.output_dir + 'gt.wav', 0)

    ''' run data '''
    inputs = mix_wav_mag[1:].reshape(1, 512, 128, 1)  # drop the DC bin: 513 -> 512 freq rows
    mask = unet(inputs).numpy().reshape(512, 128)
    predict = inputs.reshape(512, 128) * mask  # apply the predicted soft mask

    ''' evaluation metrics '''
    if FLAGS.gt:
        expand_pre = np.expand_dims(predict.flatten(), axis=0)
        expand_gt = np.expand_dims(gt_wav_mag[1:].flatten(), axis=0)
        expand_input = np.expand_dims(inputs.flatten(), axis=0)
        (SDR, SIR, SAR, _) = mir_eval.separation.bss_eval_sources(expand_gt, expand_pre)
        (SDR2, _, _, _) = mir_eval.separation.bss_eval_sources(expand_gt, expand_input)
        NSDR = SDR - SDR2  # SDR(Se, Sr) - SDR(Sm, Sr)

        metrics = ('*****SDR = ' + str(SDR) + ', SIR = ' + str(SIR) +
                   ', SAR = ' + str(SAR) + ', NSDR = ' + str(NSDR) + '*****')
        print(metrics)
        with open(FLAGS.output_dir + 'metrics.txt', 'a') as fout:
            fout.write(metrics)

    ''' Convert model output to target magnitude '''
    target_pred_mag = np.vstack((np.zeros(128), predict))  # re-insert the dropped DC row

    ''' Write vocal prediction audio files '''
    write_wav(FLAGS.output_dir + 'pred_vocal.wav',
              istft(target_pred_mag * mix_wav_phase, win_length=WINDOW_SIZE, hop_length=HOP_LENGTH),
              SAMPLE_RATE, norm=True)

    spectogram_librosa(FLAGS.output_dir + 'pred_vocal.wav', 1)
Example #2
    def __call__(self, audio):
        spec = stft(audio.numpy().reshape(-1),
                    hop_length=self.hop,
                    win_length=self.ws,
                    n_fft=self.n_fft)
        mag, ph = magphase(spec)
        mag = torch.Tensor(mag)
        ph = np.angle(ph)  # magphase returns unit-complex phase; take its angle
        ph = torch.Tensor(ph)
        out = torch.stack((mag, ph), dim=0)  # (2, freq, frames): magnitude and phase
        return out
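    # Usage sketch (assumption: the owning class, not shown in this excerpt,
    # sets self.hop, self.ws and self.n_fft in its __init__):
    #   tfm = SpecTransform(...)       # hypothetical constructor
    #   out = tfm(torch.randn(16384))  # -> tensor of shape (2, freq, frames)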
Example #3
def test():
    vis = Visualizer(env='svs')
    model = getattr(models, 'Unet')().eval()
    #    model.cuda()
    model.load_state_dict(
        t.load('G:/Unet_svs/check/epoch_219__0724_16_57_35.pth'))
    mix_wav, _ = load("C:/Users/lenovo/Music/c.mp3", sr=8192)
    mix_wav_mag, mix_wav_phase = magphase(
        stft(mix_wav, n_fft=1024, hop_length=768))
    START = 700
    END = START + 128

    mix_wav_mag = mix_wav_mag[:, START:END]
    mix_wav_phase = mix_wav_phase[:, START:END]

    print(mix_wav_mag.shape)

    gg = mix_wav_mag[1:]  # drop the DC bin: 513 -> 512 freq rows
    gg = t.from_numpy(gg)
    gg.unsqueeze_(0)
    gg.unsqueeze_(0)  # add batch and channel dims -> (1, 1, 512, 128)
    vis.img('a', gg)
    print(gg.shape)
    with t.no_grad():
        score = model(gg)
    predict = gg * score  # apply the predicted soft mask
    print(predict.shape)
    target_pred_mag = predict.view(512, 128).cpu().numpy()
    target_pred_mag = np.vstack((np.zeros(128), target_pred_mag))  # re-insert the DC row
    vis.img('b', t.from_numpy(target_pred_mag))
    print(target_pred_mag.shape)
    write_wav('C:/Users/lenovo/Music/pred_vocal.wav',
              istft(target_pred_mag * mix_wav_phase,
                    # alternative: (mix_wav_mag * target_pred_mag) * mix_wav_phase
                    win_length=1024,
                    hop_length=768),
              8192,
              norm=True)
    write_wav('C:/Users/lenovo/Music/pred_mix.wav',
              istft(mix_wav_mag * mix_wav_phase,
                    win_length=1024,
                    hop_length=768),
              8192,
              norm=True)
Example #4
def gl_rec(S):
    # Griffin-Lim style reconstruction from a log10-magnitude spectrogram
    sr, nfft, wlen, hop = 22050, 1022, 1022, 256
    S = 10**S  # undo log10 compression
    # initialize with random unit-magnitude complex phases
    angles = np.exp(2j * np.pi * np.random.rand(S.shape[0], S.shape[1]))
    y = core.istft(S * angles, hop_length=hop, win_length=wlen)
    for i in range(40):
        angles = core.stft(y, n_fft=nfft, hop_length=hop, win_length=wlen)
        S = S[:, :angles.shape[1]]
        _, angles = core.magphase(angles)  # keep only the phase estimate
        y = core.istft(S * angles, hop_length=hop, win_length=wlen)
    return y
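# Usage sketch (assumption: the input is a log10-magnitude spectrogram computed
# with the same STFT parameters that gl_rec hard-codes):
#   y, _ = core.load('audio.wav', sr=22050)
#   S = np.log10(np.abs(core.stft(y, n_fft=1022, hop_length=256, win_length=1022)) + 1e-8)
#   y_rec = gl_rec(S)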
Example #5
def LoadAudioTrainingDataFromFile(csv_file_name,
                                  validation_size,
                                  nmfcc=None,
                                  nfft=None,
                                  output_type=None):

    # initialize arrays
    mfcc_input = []
    mag_input = []
    digit = []
    output = []

    # read in CSV file with file names
    # ????? Should probably require output as a separate column instead of reading from file name
    with open(csv_file_name, 'r') as f:
        reader = csv.reader(f)
        file_list = list(reader)

    # load in audio and file name data
    for files in file_list:
        relative_path = 'recordings/' + files[0]
        file_name = os.path.join(os.path.dirname(__file__), relative_path)
        y, sr = load(file_name, sr=None)
        filesize = sys.getsizeof(y)

        if output_type == 'spectrum':
            # hop longer than the signal itself -> effectively one STFT frame per file
            spectrum = stft(y, nfft, hop_length=int(filesize / 2))
            mag, phase = magphase(spectrum)
            mag_input.append(mag)

        mfcc = feature.mfcc(y, sr, n_mfcc=nmfcc, hop_length=int(filesize / 2))
        mfcc = mfcc[1:nmfcc]  # drop the 0th (energy) coefficient
        mfcc_input.append(mfcc)

        digit.append(files[0][0])

    # build array of one-hot vectors for output based on 1st character in file name
    one_hot = {'0': digits.zero, '1': digits.one, '2': digits.two,
               '3': digits.three, '4': digits.four, '5': digits.five,
               '6': digits.six, '7': digits.seven, '8': digits.eight,
               '9': digits.nine}
    for num in digit:
        output.append(one_hot[num])

    # scale features into a fixed range
    if output_type == 'mfcc':
        training_input = numpy.asarray(mfcc_input, dtype=numpy.float64)
        training_input = numpy.squeeze(training_input)
        min_max_scaler = preprocessing.MinMaxScaler(feature_range=(-1, 1))
        training_input = min_max_scaler.fit_transform(training_input)

    elif output_type == 'spectrum':
        training_input = numpy.asarray(mag_input, dtype=numpy.float64)
        training_input = numpy.squeeze(training_input)
        min_max_scaler = preprocessing.MinMaxScaler(feature_range=(0, 1))
        training_input = min_max_scaler.fit_transform(training_input)

    else:
        raise ValueError("output_type must be 'mfcc' or 'spectrum'")

    training_output = numpy.asarray(output, dtype=numpy.float64)
    min_max_scaler = preprocessing.MinMaxScaler(feature_range=(0, 6))
    training_output = min_max_scaler.fit_transform(training_output)

    # randomize before dividing test/train sets:
    randomize = numpy.arange(len(training_input))
    numpy.random.shuffle(randomize)
    training_input = training_input[randomize]
    training_output = training_output[randomize]

    #pull out validation set
    validation_input = training_input[0:validation_size, :]
    validation_output = training_output[0:validation_size, :]

    return training_input, training_output, validation_input, validation_output
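# Usage sketch (hypothetical CSV whose first column lists file names under
# 'recordings/', with the spoken digit as the first character, e.g. "0_jackson_0.wav"):
#   tr_in, tr_out, val_in, val_out = LoadAudioTrainingDataFromFile(
#       'file_list.csv', validation_size=50, nmfcc=13, output_type='mfcc')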
Example #6
def load_as_mag(file):
    wav, _ = load(file, sr=None)
    spectrogram = stft(wav, n_fft=WINDOW_SIZE, hop_length=HOP_LENGTH)
    mag, _ = magphase(spectrogram)
    return mag.astype(np.float32)
Example #7
    def predict(self, loader):
        """
        Predict for an input.

        Args
        ----
            loader : PyTorch DataLoader.

        """
        self.model.eval()
        all_preds = []
        all_ys = []
        all_cs = []
        all_ts = []
        all_ms = []
        all_idx = []

        if isinstance(loader.dataset, torch.utils.data.Subset):
            n_frames = loader.dataset.dataset.n_frames
        elif isinstance(loader.dataset, torch.utils.data.ConcatDataset):
            n_frames = loader.dataset.datasets[0].n_frames
        else:
            n_frames = loader.dataset.n_frames

        with torch.no_grad():
            for batch_samples in tqdm(loader):

                # prepare training sample
                X = batch_samples['X']
                if X.dim() == 4:
                    full_track = False
                    # batch_size x in_channels x 1025 x 129
                else:
                    bs = X.size(0)
                    ns = X.size(1)
                    full_track = True
                    # batch_size * splits x in_channels x 1025 x 129
                    X = X.view(bs * ns, self.in_channels, self.n_fft, n_frames)

                # batch_size x in_channels x 1025 x 129 x 2
                X_complex = batch_samples['X_complex']
                if X_complex.dim() != 5:
                    # batch_size * splits x in_channels x 1025 x 129 x 2
                    X_complex = X_complex.view(
                        bs * ns, self.out_channels, self.n_fft, n_frames, 2)

                # batch_size x nclasses x in_channels x 1025 x time samples x 2
                y = batch_samples['y_complex']
                # batch_size x nclasses
                cs = batch_samples['c']
                # batch_size x 1
                ts = batch_samples['t']
                track_idx = batch_samples['track_idx']

                if self.USE_CUDA:
                    X = X.cuda()
                    X_complex = X_complex.cuda()
                    y = y.cuda()

                if X.size(0) > 4:
                    X_list = torch.split(X, 4, dim=0)
                else:
                    X_list = [X]

                masks_list = []
                pred_list = []
                for X in X_list:
                    # detach hidden state
                    self.model.detach_hidden(X.size(0))
                    # forward pass
                    preds, mask = self.model(X)
                    masks_list += [mask]
                    pred_list += [preds]
                mask = torch.cat(masks_list, dim=0)
                preds = torch.cat(pred_list, dim=0)

                if full_track:
                    # batch size x nclasses x in_channels x 1025 x time samples
                    if self.regression:
                        preds = preds.view(
                            bs, ns, self.n_classes, self.out_channels,
                            self.n_fft, n_frames)
                        preds = torch.unbind(preds, dim=1)
                        preds = torch.cat(preds, dim=4)
                    else:
                        mask = mask.view(
                            bs, ns, self.n_classes, self.out_channels,
                            self.n_fft, n_frames)
                        mask = torch.unbind(mask, dim=1)
                        mask = torch.cat(mask, dim=4)
                    # batch_size x in_channels x 1025 x time samples x 2
                    X_complex = X_complex.view(
                        bs, ns, self.out_channels, self.n_fft, n_frames, 2)
                    X_complex = torch.unbind(X_complex, dim=1)
                    X_complex = torch.cat(X_complex, dim=3)

                # convert to complex
                # batch size x nclasses x in_channels x 1025 x time samples x 2
                X_complex = X_complex.unsqueeze(1).repeat(
                    1, self.n_classes, 1, 1, 1, 1)
                X_complex = self._to_complex(X_complex)
                if self.regression:
                    _, X_phase = magphase(X_complex)
                    preds = preds.cpu().numpy() * X_phase
                else:
                    preds = mask.cpu().numpy() * X_complex
                # batch size x nclasses x in_channels x 1025 x time samples
                ys = self._to_complex(y)

                all_preds += [preds]
                all_ys += [ys]
                all_cs += [cs]
                all_ts += [ts]
                all_ms += [mask.cpu().numpy()]
                all_idx += [track_idx]

        return all_preds, all_ys, all_cs, all_ts, all_ms, all_idx
Example #8
import numpy as np
from librosa.core import istft, load, stft, magphase
from librosa.output import write_wav
from config import *
import keras

if __name__ == '__main__':
    # load test audio and convert to mag/phase
    mix_wav, _ = load("../wav_files/mixture.wav", sr=SAMPLE_RATE)
    mix_wav_mag, mix_wav_phase = magphase(
        stft(mix_wav, n_fft=WINDOW_SIZE, hop_length=HOP_LENGTH))

    vocal_wav, _ = load("../wav_files/vocals.wav", sr=SAMPLE_RATE)
    vocal_wav_mag, vocal_wav_phase = magphase(
        stft(vocal_wav, n_fft=WINDOW_SIZE, hop_length=HOP_LENGTH))

    START = 0
    END = START + 128

    mix_wav_mag = mix_wav_mag[:, START:END]
    mix_wav_phase = mix_wav_phase[:, START:END]

    vocal_wav_mag = vocal_wav_mag[:, START:END]
    vocal_wav_phase = vocal_wav_phase[:, START:END]

    # load saved model
    model = keras.models.load_model('../models/vocal_20_test_model.h5')
    #model = keras.models.load_model('../models/vocal_20.h5')

    # predict and write into file
    X = mix_wav_mag[1:].reshape(1, 512, 128, 1)
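    # The source snippet ends here; a plausible completion, mirroring the
    # mask-and-resynthesize step from Example #1 (output path is an assumption):
    mask = model.predict(X, batch_size=1).reshape(512, 128)
    pred_mag = np.vstack((np.zeros(128), X.reshape(512, 128) * mask))  # re-insert DC row
    write_wav('../wav_files/pred_vocal.wav',
              istft(pred_mag * mix_wav_phase, win_length=WINDOW_SIZE, hop_length=HOP_LENGTH),
              SAMPLE_RATE, norm=True)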
Example #9
def mag_phase_angle(x):
    mag, ph = magphase(x)
    ph = np.angle(ph)  # unit-complex phase -> angle in radians
    out = np.stack([mag, ph])  # (2, freq, frames)
    return out
Example #10
def separate_instruments(file_path):
    plt.rcParams['figure.figsize'] = (14, 5)

    x, sr = librosa.load(file_path, sr=None)
    winlen = 1024
    # scipy.signal.stft returns (freqs, times, Zxx)
    f, t, X = stft(x=x,
                   fs=sr,
                   window='hann',
                   nperseg=winlen,
                   noverlap=int(winlen / 2),
                   nfft=winlen,
                   detrend=False,
                   return_onesided=True,
                   padded=True,
                   axis=-1)
    # information about wav
    print(len(x))
    # short-time fourier transform
    # X = librosa.stft(x)
    # log-amplitude (take the magnitude first; amplitude_to_db expects nonnegative reals)
    Xmag = librosa.amplitude_to_db(np.abs(X))

    # show mixture spectrogram
    librosa.display.specshow(Xmag, sr=sr, x_axis='time', y_axis='log')
    plt.colorbar()
    plt.show()
    #############
    S = X
    kernel_size = 31
    power = 2.0
    mask = False
    margin = 1.0

    if np.iscomplexobj(S):
        S, phase = core.magphase(S)
    else:
        phase = 1

    if np.isscalar(kernel_size):
        win_harm = kernel_size
        win_perc = kernel_size
    else:
        win_harm = kernel_size[0]
        win_perc = kernel_size[1]

    if np.isscalar(margin):
        margin_harm = margin
        margin_perc = margin
    else:
        margin_harm = margin[0]
        margin_perc = margin[1]

    split_zeros = (margin_harm == 1 and margin_perc == 1)
    # Compute median filters. Pre-allocation here preserves memory layout.
    harm = np.empty_like(S)
    harm[:] = median_filter(S, size=(1, win_harm), mode='reflect')

    perc = np.empty_like(S)
    perc[:] = median_filter(S, size=(win_perc, 1), mode='reflect')

    Hmag = librosa.amplitude_to_db(harm)
    # librosa.display.specshow(harm, sr=sr, x_axis='time', y_axis='log')
    # plt.colorbar()
    # plt.show()

    Pmag = librosa.amplitude_to_db(perc)
    # librosa.display.specshow(perc, sr=sr, x_axis='time', y_axis='log')
    # plt.colorbar()
    # plt.show()

    mask_harm_soft = util.softmask(harm,
                                   perc * margin_harm,
                                   power=power,
                                   split_zeros=split_zeros)
    mask_perc_soft = util.softmask(perc,
                                   harm * margin_perc,
                                   power=power,
                                   split_zeros=split_zeros)
    soft_mask_X_harm = (S * mask_harm_soft) * phase
    Xmag_harm_soft = librosa.amplitude_to_db(np.abs(soft_mask_X_harm))
    soft_mask_X_perc = (S * mask_perc_soft) * phase
    Xmag_perc_soft = librosa.amplitude_to_db(np.abs(soft_mask_X_perc))

    # mask_harm_hard = harm > perc * margin_harm
    # mask_perc_hard = perc > harm * margin_perc
    # hard_mask_X_harm = (S * mask_harm_hard) * phase
    # Xmag_harm_hard = librosa.amplitude_to_db(hard_mask_X_harm)
    # hard_mask_X_perc = (S * mask_perc_hard) * phase
    # Xmag_perc_hard = librosa.amplitude_to_db(hard_mask_X_perc)

    librosa.display.specshow(Xmag_harm_soft,
                             sr=sr,
                             x_axis='time',
                             y_axis='log')
    plt.colorbar()
    plt.show()

    librosa.display.specshow(Xmag_perc_soft,
                             sr=sr,
                             x_axis='time',
                             y_axis='log')
    plt.colorbar()
    plt.show()

    # x_h, sr_h = librosa.load('my_audio_mod/01_AF_NM_h.wav', duration=6, sr=None)
    # x_p, sr_p = librosa.load('my_audio_mod/01_AF_NM_p.wav', duration=6, sr=None)
    # librosa.display.waveplot( x_h, sr=sr_h)
    # plt.show()
    # librosa.display.waveplot( x_p, sr=sr_p)
    # plt.show()
    H = (S * mask_harm_soft) * phase
    P = (S * mask_perc_soft) * phase

    Hmag = librosa.amplitude_to_db(np.abs(H))
    Pmag = librosa.amplitude_to_db(np.abs(P))

    librosa.display.specshow(Hmag, sr=sr, x_axis='time', y_axis='log')
    plt.colorbar()
    plt.show()

    librosa.display.specshow(Pmag, sr=sr, x_axis='time', y_axis='log')
    plt.colorbar()
    plt.show()

    # h = librosa.istft(H)
    # p = librosa.istft(P)
    _, h = istft(H,
                 fs=sr,
                 window='hann',
                 nperseg=winlen,
                 noverlap=int(winlen / 2),
                 nfft=winlen,
                 input_onesided=True)
    _, p = istft(P,
                 fs=sr,
                 window='hann',
                 nperseg=winlen,
                 noverlap=int(winlen / 2),
                 nfft=winlen,
                 input_onesided=True)

    # saving
    librosa.output.write_wav(
        os.path.splitext(file_path)[0] + '_H_med.wav', h, sr)
    librosa.output.write_wav(
        os.path.splitext(file_path)[0] + '_P_med.wav', p, sr)
Example #11
def LoadAudio(path_audio):
    y, sr = load(path_audio, sr=SR)
    S_mag, _ = magphase(stft(y, n_fft=FFTSIZE, hop_length=H))  # keep magnitude, discard phase

    return S_mag