Code example #1
File: post_filter.py Project: PICTEC/DASH
def get_dataset(clean, noisy, ratio=0.2, maxlen=1200, n_fft=512):
    fft_size = n_fft // 2 + 1
    clean, noisy = list_dataset(clean, noisy)
    assert clean, "No data with common filenames"
    assert noisy, "No data with common filenames"
    X = np.zeros([len(clean), maxlen + 16, fft_size], np.float32)
    Y = np.zeros([len(clean), maxlen, fft_size], np.float32)
    sel = np.random.random(len(clean)) > ratio
    for ix, (cl, ns) in enumerate(zip(clean, noisy)):
        print("Loading file", ix)
        cl, ns = open_sound(cl), open_sound(ns)
        assert cl[0] == ns[0]
        cl, ns = cl[1], ns[1]
        if len(ns.shape) > 1:
            ns = ns[:, 0]
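        # Negative log-power spectrogram of the clean signal; 2e-12 floors the power to avoid log(0)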
        spec = -np.log(np.abs(stft(cl, n_fft=n_fft))**2 + 2e-12).T[:maxlen]
        spec = np.pad(spec, ((16, maxlen - spec.shape[0]), (0, 0)),
                      'constant',
                      constant_values=-np.log(2e-12))
        X[ix, :, :] = spec
        spec = -np.log(np.abs(stft(ns, n_fft=n_fft))**2 + 2e-12).T[:maxlen]
        spec = np.pad(spec, ((0, maxlen - spec.shape[0]), (0, 0)),
                      'constant',
                      constant_values=-np.log(2e-12))
        Y[ix, :, :] = spec
    return [X[sel], Y[sel]], [X[~sel], Y[~sel]]
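
A minimal usage sketch (assuming list_dataset pairs same-named files from two directory paths; "clean_dir" and "noisy_dir" are hypothetical):

train, val = get_dataset("clean_dir", "noisy_dir", ratio=0.2)
(train_X, train_Y), (val_X, val_Y) = train, val
print(train_X.shape)  # (n_train, maxlen + 16, n_fft // 2 + 1)
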
Code example #2
 def _load(self, key):
     flist = glob.glob(self.index_dict[key])
     if not len(flist):
         raise RuntimeError(
             "Could not find files matching template '{}'".format(
                 self.index_dict[key]))
     if len(flist) == 1:
         return stft(flist[0], **self.stft_kwargs)
     else:
         return np.array(
             [stft(f, **self.stft_kwargs) for f in sorted(flist)])
Code example #3
 def __getitem__(self,
                 index,
                 mus_win=512,
                 mus_hop=256,
                 eeg_win=32,
                 eeg_hop=2):
     """Generates one sample of data"""
     # Select sample
     index_rand = round(np.random.uniform(0, len(self.eeg) - 1))
     X = chunker(self.eeg[index_rand], self.music[index_rand],
                 self.sample_len, self.eeg_sr, self.sr)
     X_m = stft(X[1], self.sr, mus_win, mus_hop)
     X_e = stft_eeg(X[0], self.sr, eeg_win, eeg_hop)
     if self.use_noise:
         X_m = add_rand_noise(abs(X_m))
         X_e = add_rand_noise(abs(X_e))
     X_m = to_log(abs(X_m) + 1e-6)
     X_e = z_norm(X_e)
     X_e = to_log(abs(X_e) + 1e-6)
     X_e = z_norm(X_e)
     X_e = torch.tensor(X_e).float()
     X_m = torch.tensor(X_m).float()
     X_m = (X_m - X_m.mean(dim=0, keepdim=True)) / (
         X_m.std(dim=0, keepdim=True) + 1e-6)
     for i in np.arange(X_e.size(0)):
         X_e[i] = (X_e[i] - X_e[i].mean(dim=0, keepdim=True)) / (
             X_e[i].std(dim=0, keepdim=True) + 1e-6)
     return X_e, X_m
Code example #4
File: ex5_mfcc.py Project: shin04/B4Lecture-2021
def calc_mfcc(wav, hop, win_length, filterbank):
    """
    Calculate Mel Frequency Cepstrum Coefficient (MFCC).

    Parameters:
        wav : ndarray, real-valued
            Time series of measurement values.
        hop : float
            Hop size as a fraction of the window length.
        win_length : int
            Window size.
        filterbank : ndarray
            mel filter bank

    Returns:
        mel_spec : ndarray (n_channels, n_frames)
            Mel scale spectrogram.
        mfcc : ndarray (n_channels, n_frames)
            Mel Frequency Cepstrum Coefficient (MFCC).
    """
    pre_wav = utils.pre_emphasis(wav, p=0.97)
    spec = utils.stft(pre_wav, hop=hop, win_length=win_length)
    # hop_length = int(win_length * hop)
    # spec = spec[:, :hop_length]
    mel_spec = np.dot(filterbank, np.abs(spec[:-1]))

    mfcc = np.zeros_like(mel_spec)
    for i in range(mel_spec.shape[1]):
        mfcc[:, i] = dct(mel_spec[:, i], type=2, norm="ortho", axis=-1)

    return mel_spec, mfcc
Code example #5
File: sig_process.py Project: Kitsunetic/sep_content
def get_feats(audio):
    """
    Get acoustic input features, starting with the STFT. Needs to be extended to include MFCCs; will ask how many coefficients to use.
    """

    # stft = librosa.core.stft(audio, n_fft = config.nfft, hop_length = config.hopsize, window = config.window).T

    stft = utils.stft(audio,
                      window=config.window,
                      hopsize=config.hopsize,
                      nfft=config.nfft,
                      fs=config.fs)

    assert abs(stft).max() <= 1.0

    # voc_stft_mag = 2 * abs(voc_stft)/np.sum(config.window)

    # voc_stft_phase = np.angle(voc_stft)

    # cqt = librosa.core.cqt(audio, sr = config.fs, hop_length = config.hopsize, n_bins = config.cqt_bins, fmin = config.fmin, bins_per_octave = config.bins_per_octave).T

    # hcqt = get_hcqt(audio)

    # hcqt = np.swapaxes(hcqt, 0,1)

    return stft
Code example #6
def run(args):
    num_bins, config_dict = parse_yaml(args.config)
    dataloader_conf = config_dict["dataloader"]
    spectrogram_conf = config_dict["spectrogram_reader"]
    # Load cmvn
    dict_mvn = dataloader_conf["mvn_dict"]
    if dict_mvn:
        if not os.path.exists(dict_mvn):
            raise FileNotFoundError("Could not find mvn files")
        with open(dict_mvn, "rb") as f:
            dict_mvn = pickle.load(f)
    # default: True
    apply_log = dataloader_conf.get("apply_log", True)

    dcnet = PITNet(num_bins, **config_dict["model"])

    frame_length = spectrogram_conf["frame_length"]
    frame_shift = spectrogram_conf["frame_shift"]
    window = spectrogram_conf["window"]

    separator = Separator(dcnet, args.state_dict, cuda=args.cuda)

    utt_dict = parse_scps(args.wave_scp)
    num_utts = 0
    for key, utt in utt_dict.items():
        try:
            samps, stft_mat = stft(utt,
                                   frame_length=frame_length,
                                   frame_shift=frame_shift,
                                   window=window,
                                   center=True,
                                   return_samps=True)
        except FileNotFoundError:
            print("Skip utterance {}... not found".format(key))
            continue
        print("Processing utterance {}".format(key))
        num_utts += 1
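        # Peak (infinity-norm) amplitude of the input; passed to istft below to restore the output scale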
        norm = np.linalg.norm(samps, np.inf)
        spk_mask, spk_spectrogram = separator.seperate(stft_mat,
                                                       cmvn=dict_mvn,
                                                       apply_log=apply_log)

        for index, stft_mat in enumerate(spk_spectrogram):
            istft(os.path.join(args.dump_dir,
                               '{}.spk{}.wav'.format(key, index + 1)),
                  stft_mat,
                  frame_length=frame_length,
                  frame_shift=frame_shift,
                  window=window,
                  center=True,
                  norm=norm,
                  fs=8000,
                  nsamps=samps.size)
            if args.dump_mask:
                sio.savemat(
                    os.path.join(args.dump_dir,
                                 '{}.spk{}.mat'.format(key, index + 1)),
                    {"mask": spk_mask[index]})
    print("Processed {} utterances!".format(num_utts))
Code example #7
File: features.py Project: nwang57/genreClassifier
def spectral_flux(wavedata, window_size, sample_rate):
    magnitude_spectrum = stft(wavedata, window_size)
    timebins, freqbins = np.shape(magnitude_spectrum)

    timestamps = (np.arange(0, timebins - 1) * (timebins / float(sample_rate)))

    sf = np.sqrt(np.sum(np.diff(np.abs(magnitude_spectrum))**2,
                        axis=1)) / freqbins

    return sf[1:], np.asarray(timestamps)
Code example #8
def main():

    # maximus=np.zeros(66)
    # minimus=np.ones(66)*1000
    wav_files = [
        x for x in os.listdir(config.wav_dir_mus)
        if x.endswith('.stem.mp4') and not x.startswith(".")
    ]

    count = 0

    for lf in wav_files:

        # lf = "Actions - One Minute Smile.stem.mp4"
        # print(lf)
        audio, fs = stempeg.read_stems(os.path.join(config.wav_dir_mus, lf),
                                       stem_id=[0, 1, 2, 3, 4])

        mixture = audio[0]

        drums = audio[1]

        bass = audio[2]

        acc = audio[3]

        vocals = audio[4]

        # out_feats = utils.stft_to_feats((vocals[:,0]+vocals[:,1])/2,fs)

        # utils.feats_to_audio(out_feats,lf,fs=fs)

        # import pdb;pdb.set_trace()

        backing = np.clip(drums + bass + acc, 0.0, 1.0)

        if len(backing.shape) == 2:
            backing = (backing[:, 0] + backing[:, 1]) / 2

        # import pdb;pdb.set_trace()

        back_stft = abs(utils.stft(backing))

        hdf5_file = h5py.File(config.backing_dir + 'mus_' + lf[:-9] + '.hdf5',
                              mode='w')

        hdf5_file.create_dataset("back_stft", back_stft.shape, np.float32)

        hdf5_file["back_stft"][:, :] = back_stft

        hdf5_file.close()

        count += 1

        utils.progress(count, len(wav_files))
Code example #9
def main():

    # maximus=np.zeros(66)
    # minimus=np.ones(66)*1000

    wav_files = [x for x in os.listdir(config.wav_dir) if x.endswith('.wav')]
    count = 0

    for lf in wav_files:
        # print(lf)
        audio, fs = sf.read(os.path.join(config.wav_dir, lf))

        vocals = np.array(audio[:, 1])

        mixture = np.clip(audio[:, 0] + audio[:, 1], 0.0, 1.0)

        backing = np.array(audio[:, 0])

        voc_stft = abs(utils.stft(vocals))
        mix_stft = abs(utils.stft(mixture))
        back_stft = abs(utils.stft(backing))

        assert voc_stft.shape == mix_stft.shape

        out_feats = utils.input_to_feats(os.path.join(config.wav_dir, lf))

        out_feats = np.concatenate(
            ((out_feats, np.zeros((1, out_feats.shape[1])))))

        assert out_feats.shape[0] == voc_stft.shape[0]

        np.save(config.dir_npy + lf[:-4] + '_voc_stft', voc_stft)
        np.save(config.dir_npy + lf[:-4] + '_mix_stft', mix_stft)
        np.save(config.dir_npy + lf[:-4] + '_back_stft', back_stft)
        np.save(config.dir_npy + lf[:-4] + '_synth_feats', out_feats)

        count += 1
        utils.progress(count, len(wav_files))
    import pdb
    pdb.set_trace()
Code example #10
def test_stft():
    nperseg = 4
    noverlap = nperseg // 2

    x = normalize(np.arange(nperseg * 4, dtype=np.float32))

    a = librosa.core.stft(x,
                          window='hamming',
                          n_fft=nperseg,
                          hop_length=nperseg - noverlap,
                          center=False)
    b = stft(x, nperseg=nperseg, noverlap=noverlap)
    assert np.allclose(a, b)
Code example #11
    def read_wav_file(self, file_name):

        audio, fs = librosa.core.load(file_name, sr=config.fs)

        audio = np.float64(audio)

        if len(audio.shape) == 2:
            vocals = np.array((audio[:, 1] + audio[:, 0]) / 2)
        else:
            vocals = np.array(audio)

        voc_stft = np.clip(
            abs(utils.stft(vocals,
                           hopsize=config.hopsize,
                           nfft=config.nfft,
                           fs=config.fs,
                           window=config.window)), 0.0, 1.0)

        return voc_stft
Code example #12
File: features.py Project: nwang57/genreClassifier
def spectual_centroid(wavedata, window_size, sample_rate):
    magnitude_spectrum = stft(wavedata, window_size)

    timebins, freqbins = np.shape(magnitude_spectrum)
    timestamps = np.arange(0, timebins - 1) * (timebins / float(sample_rate))

    sc = []
    for t in range(timebins - 1):
        power_spectrum = np.abs(magnitude_spectrum[t])**2
        sc_t = np.sum(power_spectrum *
                      np.arange(1, freqbins + 1)) / np.sum(power_spectrum)
        sc.append(sc_t)

    sc = np.asarray(sc)
    sc = np.nan_to_num(sc)

    return sc, np.asarray(timestamps)
Code example #13
File: features.py Project: nwang57/genreClassifier
def spectral_rolloff(wavedata, window_size, sample_rate, k=0.85):
    magnitude_spectrum = stft(wavedata, window_size)
    power_spectrum = np.abs(magnitude_spectrum)**2
    timebins, freqbins = np.shape(magnitude_spectrum)

    timestamps = (np.arange(0, timebins - 1) * (timebins / float(sample_rate)))

    sr = []

    spectral_sum = np.sum(power_spectrum, axis=1)
    for t in range(timebins - 1):
        sr_t = np.where(
            np.cumsum(power_spectrum[t, :]) >= k * spectral_sum[t])[0][0]
        sr.append(sr_t)

    sr = np.asarray(sr).astype(float)
    sr = (sr / freqbins) * (sample_rate / 2.0)
    return sr, np.asarray(timestamps)
Code example #14
File: data_loader.py Project: sovaai/sova-asr
def preprocess(audio_path, sample_rate=16000, window_size=0.02,
               window_stride=0.01, window='hamming'):
    audio = load_audio(audio_path, sample_rate)
    nfft = int(sample_rate * window_size)
    win_length = nfft
    hop_length = int(sample_rate * window_stride)

    d = stft(audio, n_fft=nfft, hop_length=hop_length,
             win_length=win_length, window=window)

    spect, phase = magphase(d)
    pcen_result = pcen2(e=spect, sr=sample_rate, hop_length=hop_length)
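    # Standardize the PCEN features to zero mean and unit variance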
    mean_pcen = pcen_result.mean()
    std_pcen = pcen_result.std()

    pcen_result = np.add(pcen_result, -mean_pcen)
    pcen_result = pcen_result / std_pcen

    return pcen_result
Code example #15
    def read_wav_file(self, file_name):

        audio, fs = librosa.core.load(file_name, sr=config.fs)

        audio = np.float64(audio)

        if len(audio.shape) == 2:
            vocals = np.array((audio[:, 1] + audio[:, 0]) / 2)
        else:
            vocals = np.array(audio)

        voc_stft = abs(utils.stft(vocals))

        feats = utils.stft_to_feats(vocals, fs)

        voc_stft = np.clip(voc_stft, 0.0, 1.0)

        return voc_stft, feats
Code example #16
File: dae.py Project: PICTEC/DASH
 def load(self, path):
     fnames = list_sounds(path)
     fnames = random.sample(fnames, self.n_records)
     max_len = max([len(open_sound(x)[1]) for x in fnames])
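     # Longest recording, converted from samples to an STFT frame count (n_fft=512, hop_length=128)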
     max_len = 1 + (max_len - 512) // 128
     self.X = np.ones([self.n_records, max_len, 257], np.float32)
     self.Y = np.ones([self.n_records, max_len, 257], np.float32)
     self.X *= np.log(2e-12)
     self.Y *= np.log(2e-12)
     for ix, fname in enumerate(fnames):
         sr, rec = open_sound(fname)
         assert sr == 16000
         rec = np.log(2e-12 + np.abs(
             stft(rec.astype(np.float32) / (2**15),
                  n_fft=512,
                  hop_length=128).T[:max_len])**2)
         self.X[ix, :len(rec)] = self.mask(rec)
         self.Y[ix, :len(rec)] = rec
     return ([self.X[:self.train], self.Y[:self.train]], [
         self.X[self.train:self.train + self.valid],
         self.Y[self.train:self.train + self.valid]
     ], [self.X[-self.test:], self.Y[-self.test:]])
Code example #17
def main():

    # maximus=np.zeros(66)
    # minimus=np.ones(66)*1000
    wav_files=[x for x in os.listdir(config.wav_dir) if x.endswith('.wav') and not x.startswith('.')]
    count=0


    for lf in wav_files:
        # print(lf)
        audio,fs = sf.read(os.path.join(config.wav_dir,lf))

        vocals = np.array(audio[:,1])

        mixture = (audio[:,0]+audio[:,1])*0.7

        backing = np.array(audio[:,0])

        voc_stft = abs(utils.stft(vocals))
        mix_stft = abs(utils.stft(mixture))
        back_stft = abs(utils.stft(backing))

        assert voc_stft.shape==mix_stft.shape

        out_feats = utils.stft_to_feats(vocals,fs)

        if not out_feats.shape[0] == voc_stft.shape[0]:
            if out_feats.shape[0] < voc_stft.shape[0]:
                while out_feats.shape[0] < voc_stft.shape[0]:
                    out_feats = np.concatenate(
                        (out_feats, np.zeros((1, out_feats.shape[1]))))
            elif out_feats.shape[0] > voc_stft.shape[0]:
                print("out_feats has more frames than voc_stft")

        assert out_feats.shape[0]==voc_stft.shape[0]

        hdf5_file = h5py.File(config.voice_dir+'ikala_'+lf[:-4]+'.hdf5', mode='w')

        hdf5_file.create_dataset("voc_stft", voc_stft.shape, np.float32)

        hdf5_file.create_dataset("feats", out_feats.shape, np.float32)

        hdf5_file["voc_stft"][:,:] = voc_stft

        hdf5_file["feats"][:,:] = out_feats

        hdf5_file.close()

        hdf5_file = h5py.File(config.backing_dir+'ikala_'+lf[:-4]+'.hdf5', mode='w')

        hdf5_file.create_dataset("back_stft", back_stft.shape, np.float32)

        hdf5_file.create_dataset("mix_stft", mix_stft.shape, np.float32)

        hdf5_file["back_stft"][:,:] = back_stft

        hdf5_file["mix_stft"][:,:] = mix_stft

        hdf5_file.close()

        count+=1

        utils.progress(count,len(wav_files))
Code example #18
def mse(true_path, pred_path):
    true_data, _ = librosa.load(true_path)
    true_stft = utils.stft(true_data)
    pred_data, _ = librosa.load(pred_path)
    pred_stft = utils.stft(pred_data)
    return np.mean(np.square(np.subtract(true_stft, pred_stft)))
Code example #19
def main():

    singers = next(os.walk(config.wav_dir_nus))[1]

    for singer in singers:
        sing_dir = config.wav_dir_nus + singer + '/sing/'
        read_dir = config.wav_dir_nus + singer + '/read/'
        sing_wav_files = [
            x for x in os.listdir(sing_dir)
            if x.endswith('.wav') and not x.startswith('.')
        ]

        count = 0

        print("Processing singer %s" % singer)
        for lf in sing_wav_files:

            audio, fs = librosa.core.load(os.path.join(sing_dir, lf),
                                          sr=config.fs)

            audio = np.float64(audio)

            if len(audio.shape) == 2:

                vocals = np.array((audio[:, 1] + audio[:, 0]) / 2)

            else:
                vocals = np.array(audio)

            voc_stft = abs(utils.stft(vocals))

            out_feats = utils.stft_to_feats(vocals, fs)

            strings_p = process_lab_file(
                os.path.join(sing_dir, lf[:-4] + '.txt'), len(voc_stft))

            voc_stft, out_feats, strings_p = utils.match_time(
                [voc_stft, out_feats, strings_p])

            hdf5_file = h5py.File(config.voice_dir + 'nus_' + singer +
                                  '_sing_' + lf[:-4] + '.hdf5',
                                  mode='a')

            if not "phonemes" in hdf5_file:

                hdf5_file.create_dataset("phonemes", [voc_stft.shape[0]], int)

            hdf5_file["phonemes"][:, ] = strings_p[:, 0]

            hdf5_file.create_dataset("voc_stft", voc_stft.shape, np.float32)

            hdf5_file.create_dataset("feats", out_feats.shape, np.float32)

            hdf5_file["voc_stft"][:, :] = voc_stft

            hdf5_file["feats"][:, :] = out_feats

            hdf5_file.close()

            count += 1

            utils.progress(count, len(sing_wav_files))

        read_wav_files = [
            x for x in os.listdir(read_dir)
            if x.endswith('.wav') and not x.startswith('.')
        ]
        print("Processing reader %s" % singer)
        count = 0

        for lf in read_wav_files:
            audio, fs = librosa.core.load(os.path.join(read_dir, lf),
                                          sr=config.fs)

            audio = np.float64(audio)

            if len(audio.shape) == 2:

                vocals = np.array((audio[:, 1] + audio[:, 0]) / 2)

            else:
                vocals = np.array(audio)

            voc_stft = abs(utils.stft(vocals))

            out_feats = utils.stft_to_feats(vocals, fs)

            strings_p = process_lab_file(
                os.path.join(read_dir, lf[:-4] + '.txt'), len(voc_stft))

            voc_stft, out_feats, strings_p = utils.match_time(
                [voc_stft, out_feats, strings_p])

            hdf5_file = h5py.File(config.voice_dir + 'nus_' + singer +
                                  '_read_' + lf[:-4] + '.hdf5',
                                  mode='a')

            if not "phonemes" in hdf5_file:

                hdf5_file.create_dataset("phonemes", [voc_stft.shape[0]], int)

            hdf5_file["phonemes"][:, ] = strings_p[:, 0]

            hdf5_file.create_dataset("voc_stft", voc_stft.shape, np.float32)

            hdf5_file.create_dataset("feats", out_feats.shape, np.float32)

            hdf5_file["voc_stft"][:, :] = voc_stft

            hdf5_file["feats"][:, :] = out_feats

            hdf5_file.close()

            count += 1

            utils.progress(count, len(read_wav_files))
Code example #20
def test_stft():
    data = DATA[0]
    spec, f, t = stft(data, 44100, 1024, 512)
    print(np.sqrt(spec[:, 50] * 44100 / 1024))
Code example #21
    parser.add_option("-f", type="float", dest="end_time", default=16)
    (options, args) = parser.parse_args()

    if len(args) == 0:
        filename = "Queen_mono.wav"
    else:
        filename = args[0]
    print(options)

    window = 1024
    step = window // 4

    (nyq, signal) = utils.slurp_wav(filename, int(options.start_time * 44100),
                                    int(44100 * options.end_time))
    print("computing spectrogram")
    spectrogram = utils.stft(signal)
    print("computing power")
    power = utils.estimate_spectral_power(spectrogram)

    print("whitening spectrum")
    whitened = spectrogram / np.sqrt(power)
    whitened = utils.normalize_total_power(whitened,
                                           utils.total_power(spectrogram))

    print("unwhitening spectrum")
    unwhitened = whitened * np.sqrt(power)
    unwhitened = utils.normalize_total_power(unwhitened,
                                             utils.total_power(spectrogram))

    print("resynthesizing from whitened-unwhitened spectrogram")
    resynth = utils.resynthesize(unwhitened)
Code example #22
File: separate.py Project: jhuiac/deep-clustering
def run(args):
    num_bins, config_dict = parse_yaml(args.config)
    # Load cmvn
    dict_mvn = config_dict["dataloader"]["mvn_dict"]
    if dict_mvn:
        if not os.path.exists(dict_mvn):
            raise FileNotFoundError("Could not find mvn files")
        with open(dict_mvn, "rb") as f:
            dict_mvn = pickle.load(f)

    dcnet = DCNet(num_bins, **config_dict["dcnet"])

    frame_length = config_dict["spectrogram_reader"]["frame_length"]
    frame_shift = config_dict["spectrogram_reader"]["frame_shift"]
    window = config_dict["spectrogram_reader"]["window"]

    cluster = DeepCluster(
        dcnet,
        args.dcnet_state,
        args.num_spks,
        pca=args.dump_pca,
        cuda=args.cuda)

    utt_dict = parse_scps(args.wave_scp)
    num_utts = 0
    for key, utt in utt_dict.items():
        try:
            samps, stft_mat = stft(
                utt,
                frame_length=frame_length,
                frame_shift=frame_shift,
                window=window,
                center=True,
                return_samps=True)
        except FileNotFoundError:
            print("Skip utterance {}... not found".format(key))
            continue
        print("Processing utterance {}".format(key))
        num_utts += 1
        norm = np.linalg.norm(samps, np.inf)
        pca_mat, spk_mask, spk_spectrogram = cluster.seperate(
            stft_mat, cmvn=dict_mvn)

        for index, stft_mat in enumerate(spk_spectrogram):
            istft(
                os.path.join(args.dump_dir, '{}.spk{}.wav'.format(
                    key, index + 1)),
                stft_mat,
                frame_length=frame_length,
                frame_shift=frame_shift,
                window=window,
                center=True,
                norm=norm,
                fs=8000,
                nsamps=samps.size)
            if args.dump_mask:
                sio.savemat(
                    os.path.join(args.dump_dir, '{}.spk{}.mat'.format(
                        key, index + 1)), {"mask": spk_mask[index]})
        if args.dump_pca:
            sio.savemat(
                os.path.join(args.dump_dir, '{}.mat'.format(key)),
                {"pca_matrix": pca_mat})
    print("Processed {} utterances!".format(num_utts))
Code example #23
File: dataset.py Project: jhuiac/deep-clustering
 def _load(self, key):
     return stft(self.wave_dict[key], **self.stft_kwargs)
Code example #24
def train(epoch, model, optimizer, scaler, scheduler, log_train, args):
    global global_step
    global start_time

    epoch_loss = 0.0
    running_loss = [0., 0., 0.]
    log_interval = args.log_interval
    synth_interval = args.synth_interval

    timestamp = time.time()
    model.train()
    criterion_frame = nn.MSELoss()

    for batch_idx, (x, c) in enumerate(train_loader):
        global_step += 1
        optimizer.zero_grad()
        with autocast():
            x, c = x.to(device), c.to(device)
            log_p, log_det = model(x, c)
            loss = -(log_p + log_det)

        scaler.scale(loss).backward()

        with autocast():
            z = torch.randn_like(x)
            y_gen = model.reverse(z, c)

        # Frame-level spectral loss between the generated waveform and the ground truth
        stft_est = stft(y_gen[:, 0], scale='linear')
        stft_gt = stft(x[:, 0], scale='linear')
        loss_frame = 0.005 * criterion_frame(stft_est, stft_gt)
        scaler.scale(loss_frame).backward()

        if torch.isnan(loss) or torch.isnan(loss_frame):
            continue

        scaler.step(optimizer)
        scaler.update()
        scheduler.step()

        running_loss[0] += loss.item()
        running_loss[1] += log_p.item()
        running_loss[2] += log_det.item()

        epoch_loss += loss.item()

        if (batch_idx + 1) % log_interval == 0:
            epoch_step = batch_idx + 1
            running_loss[0] /= log_interval
            running_loss[1] /= log_interval
            running_loss[2] /= log_interval
            avg_rn_loss = np.array(running_loss)
            avg_time = (time.time() - timestamp) / log_interval

            print(
                'Global Step : {}, [{}, {}] [NLL, Log p(z), Log Det] : {}, STFT_loss: {}, avg time: {:0.4f}'
                .format(global_step, epoch, epoch_step, avg_rn_loss,
                        loss_frame.item(), avg_time))

            state = {}
            state['Global Step'] = global_step
            state['Epoch'] = epoch
            state['Epoch Step'] = epoch_step
            state['NLL, Log p(z), Log Det'] = running_loss
            state['avg time'] = avg_time
            state['total time'] = time.time() - start_time
            log_train.write('%s\n' % json.dumps(state))
            log_train.flush()

            timestamp = time.time()
            running_loss = [0., 0., 0.]

        if (batch_idx + 1) % synth_interval == 0:
            with torch.no_grad():
                synthesize(model, args.num_sample, args.sr)
            model.train()

        del x, c, log_p, log_det, loss

    del running_loss
    gc.collect()

    print('{} Epoch Training Loss : {:.4f}'.format(
        epoch, epoch_loss / (len(train_loader))))

    return epoch_loss / len(train_loader)
Code example #25
    parser.add_option("-s", type="float", dest="start_time", default=1)
    parser.add_option("-f", type="float", dest="end_time", default=16)
    (options, args) = parser.parse_args()

    if len(args) == 0:
        filename = "Queen_mono.wav"
    else:
        filename = args[0]
    print(options)

    window = 1024
    step = window // 4

    (nyq, signal) = utils.slurp_wav(filename, int(options.start_time * 44100),
                                    int(44100 * options.end_time))
    print("computing spectrogram")
    spectrogram = utils.stft(signal)
    print("computing power")
    power = utils.estimate_spectral_power(spectrogram)

    print("whitening spectrum")
    whitened = spectrogram / np.sqrt(power)
    whitened = utils.normalize_total_power(whitened, utils.total_power(spectrogram))

    print("unwhitening spectrum")
    unwhitened = whitened * np.sqrt(power)
    unwhitened = utils.normalize_total_power(unwhitened, utils.total_power(spectrogram))

    print("resynthesizing from whitened-unwhitened spectrogram")
    resynth = utils.resynthesize(unwhitened)
    wavfile.write("resynth.wav", int(2 * nyq), resynth)
Code example #26
def main():

    # maximus=np.zeros(66)
    # minimus=np.ones(66)*1000
    singers = next(os.walk(config.wav_dir_nus))[1]
    # singers = [x for x in singers if x not in["VKOW","SAMF","MPUR","JLEE","KENN"]]
    # import pdb;pdb.set_trace()

    # phonemas = set([])
    

    for singer in singers:
        sing_dir = config.wav_dir_nus+singer+'/sing/'
        read_dir = config.wav_dir_nus+singer+'/read/'
        sing_wav_files=[x for x in os.listdir(sing_dir) if x.endswith('.wav') and not x.startswith('.')]

        count = 0

        print ("Processing singer %s" % singer)
        for lf in sing_wav_files:
        # print(lf)
            # if not os.path.exists(config.voice_dir+'nus_'+singer+'_sing_'+lf[:-4]+'.hdf5'):

            audio,fs = sf.read(os.path.join(sing_dir,lf))
            if fs !=config.fs:
                command = "ffmpeg -y -i "+os.path.join(sing_dir,lf)+" -ar "+str(config.fs)+" "+os.path.join(sing_dir,lf)
                os.system(command)
            audio,fs = sf.read(os.path.join(sing_dir,lf))

            if len(audio.shape) == 2:

                vocals = np.array((audio[:,1]+audio[:,0])/2)

            else: 
                vocals = np.array(audio)

            voc_stft_complex = utils.stft(vocals)
            voc_stft = abs(voc_stft_complex)
            # np.angle recovers the phase, written out below as "voc_stft_phase"
            voc_stft_phase = np.angle(voc_stft_complex)

            lab_f = open(os.path.join(sing_dir,lf[:-4]+'.txt'))
            # note_f=open(in_dir+lf[:-4]+'.notes')
            phos = lab_f.readlines()
            lab_f.close()

            phonemes=[]

            for pho in phos:
                st,end,phonote=pho.split()
                # import pdb;pdb.set_trace()
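                # Convert label start/end times in seconds to STFT frame indices (hop of roughly 5.8 ms)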
                st = int(np.round(float(st)/0.005804576860324892))
                en = int(np.round(float(end)/0.005804576860324892))
                if phonote=='pau' or phonote=='br':
                    phonote='sil'
                phonemes.append([st,en,phonote])
                # phonemas.add(phonote)

            # div_fac = float(end)/len(voc_stft)

            # for i in range(len(phonemes)):
            #     phonemes[i][0] = int(float(phonemes[i][0])/div_fac)
            #     phonemes[i][1] = int(float(phonemes[i][1])/div_fac)

            # import pdb;pdb.set_trace()

            phonemes[-1][1] = len(voc_stft)

            strings_p = np.zeros(phonemes[-1][1])

            for i in range(len(phonemes)):
                pho=phonemes[i]
                value = config.phonemas.index(pho[2])
                strings_p[pho[0]:pho[1]+1] = value

            # import pdb;pdb.set_trace()

            if not len(strings_p) == len(voc_stft):
                import pdb;pdb.set_trace()


            # out_feats = utils.stft_to_feats(vocals,fs)


            # if not out_feats.shape[0]==voc_stft.shape[0] :
            #     if out_feats.shape[0]<voc_stft.shape[0]:
            #         while out_feats.shape[0]<voc_stft.shape[0]:
            #             out_feats = np.concatenate(((out_feats,np.zeros((1,out_feats.shape[1])))))
            #     elif out_feats.shape[0]<voc_stft.shape[0]:
            #         print("You are an idiot")

            # assert out_feats.shape[0]==voc_stft.shape[0]

            hdf5_file = h5py.File(config.voice_dir+'nus_'+singer+'_sing_'+lf[:-4]+'.hdf5', mode='a')
            # import pdb;pdb.set_trace()

            if not  "phonemes" in hdf5_file:

                hdf5_file.create_dataset("phonemes", [voc_stft.shape[0]], int)

            hdf5_file["phonemes"][:,] = strings_p

            # hdf5_file.create_dataset("voc_stft", voc_stft.shape, np.float32)

            # hdf5_file.create_dataset("feats", out_feats.shape, np.float32)

            # hdf5_file["voc_stft"][:,:] = voc_stft

            # hdf5_file["feats"][:,:] = out_feats


            hdf5_file.create_dataset("voc_stft_phase", voc_stft_phase.shape, np.float32)

            # hdf5_file["phonemes"][:,] = strings_p

            # hdf5_file.create_dataset("voc_stft", voc_stft.shape, np.float32)

            # hdf5_file.create_dataset("feats", out_feats.shape, np.float32)

            # hdf5_file["voc_stft"][:,:] = voc_stft
            hdf5_file["voc_stft_phase"][:,:] = voc_stft_phase

            hdf5_file.close()

            count+=1

            utils.progress(count,len(sing_wav_files))


        read_wav_files=[x for x in os.listdir(read_dir) if x.endswith('.wav') and not x.startswith('.')]
        print ("Processing reader %s" % singer)
        count = 0
        if singer != 'KENN':
            for lf in read_wav_files:

                # if not os.path.exists(config.voice_dir+'nus_'+singer+'_read_'+lf[:-4]+'.hdf5'):
            # print(lf)
                audio,fs = sf.read(os.path.join(read_dir,lf))
                if fs !=config.fs:
                    command = "ffmpeg -y -i "+os.path.join(read_dir,lf)+" -ar "+str(config.fs)+" "+os.path.join(read_dir,lf)
                    os.system(command)
                audio,fs = sf.read(os.path.join(read_dir,lf))

                if len(audio.shape) == 2:

                    vocals = np.array((audio[:,1]+audio[:,0])/2)

                else: 
                    vocals = np.array(audio)

                voc_stft = abs(utils.stft(vocals))

                lab_f = open(os.path.join(read_dir,lf[:-4]+'.txt'))
            # note_f=open(in_dir+lf[:-4]+'.notes')
                phos = lab_f.readlines()
                lab_f.close()

                phonemes=[]

                for pho in phos:
                    st,end,phonote=pho.split()
                    # import pdb;pdb.set_trace()
                    st = int(np.round(float(st)/0.005804576860324892))
                    en = int(np.round(float(end)/0.005804576860324892))
                    if phonote=='pau' or phonote=='br':
                        phonote='sil'
                    phonemes.append([st,en,phonote])
                    # phonemas.add(phonote)

                phonemes[-1][1] = len(voc_stft)

                # div_fac = float(end)/len(voc_stft)

                # for i in range(len(phonemes)):
                #     phonemes[i][0] = int(float(phonemes[i][0])/div_fac)
                #     phonemes[i][1] = int(float(phonemes[i][1])/div_fac)


                strings_p = np.zeros(phonemes[-1][1])
                for i in range(len(phonemes)):
                    pho=phonemes[i]
                    # if singer == 'KENN':
                        # import pdb;pdb.set_trace()
                    value = config.phonemas.index(pho[2])
                    strings_p[pho[0]:pho[1]+1] = value

                if not len(strings_p) == len(voc_stft):
                    import pdb;pdb.set_trace()

                    # out_feats = utils.stft_to_feats(vocals,fs)

                    

                    # if not out_feats.shape[0]==voc_stft.shape[0] :
                    #     if out_feats.shape[0]<voc_stft.shape[0]:
                    #         while out_feats.shape[0]<voc_stft.shape[0]:
                    #             out_feats = np.concatenate(((out_feats,np.zeros((1,out_feats.shape[1])))))
                    #     elif out_feats.shape[0]<voc_stft.shape[0]:
                    #         print("You are an idiot")

                    # assert out_feats.shape[0]==voc_stft.shape[0] 

                hdf5_file = h5py.File(config.voice_dir+'nus_'+singer+'_read_'+lf[:-4]+'.hdf5', mode='a')

                if not  "phonemes" in hdf5_file:

                    hdf5_file.create_dataset("phonemes", [voc_stft.shape[0]], int)

                hdf5_file["phonemes"][:,] = strings_p

                # hdf5_file.create_dataset("voc_stft", voc_stft.shape, np.float32)

                # hdf5_file.create_dataset("feats", out_feats.shape, np.float32)

                # hdf5_file["voc_stft"][:,:] = voc_stft

                # hdf5_file["feats"][:,:] = out_feats

                hdf5_file.close()

                count+=1

                utils.progress(count,len(read_wav_files))
    import pdb;pdb.set_trace()        
Code example #27
File: ex5_mfcc.py Project: shin04/B4Lecture-2021
def main(args):
    """
    fname = "aiueo.wav"
    """

    # get current working directory
    path = os.path.dirname(os.path.abspath(__file__))

    # load audio file
    fname = os.path.join(path, "data", args.fname)
    wav, sr = librosa.load(fname, mono=True)

    # plot signal
    plt.figure()
    ax = plt.subplot(111)
    librosa.display.waveplot(wav, sr=sr, color="g", ax=ax)
    ax.set(title="Original signal", xlabel="Time [s]", ylabel="Magnitude")
    save_fname = os.path.join(path, "result", "signal.png")
    plt.savefig(save_fname, transparent=True)
    plt.show()

    # parameter
    hop = 0.5
    win_length = 1024
    hop_length = int(win_length * hop)

    # make mel filter bank
    n_channels = 20  # the number of mel filter bank channels
    df = sr / win_length  # frequency resolution (Hz per frequency bin)
    filterbank, _ = melFilterBank(sr, win_length, n_channels)

    # plot mel filter bank
    for c in range(n_channels):
        plt.plot(np.arange(0, win_length / 2) * df, filterbank[c])

    plt.title("Mel filter bank")
    plt.xlabel("Frequency [Hz]")
    save_fname = os.path.join(path, "result", "MelFilterBank.png")
    plt.savefig(save_fname, transparent=True)
    plt.show()

    # spectrogram (ex1)
    fig, ax = plt.subplots(nrows=1, ncols=1)
    amp = utils.stft(wav, hop=hop, win_length=win_length)
    db = librosa.amplitude_to_db(np.abs(amp))
    img = librosa.display.specshow(
        db,
        sr=sr,
        hop_length=hop_length,
        x_axis="time",
        y_axis="linear",
        ax=ax,
        cmap="rainbow",
    )
    ax.set(title="Spectrogram", xlabel=None, ylabel="Frequency [Hz]")
    fig.colorbar(img, aspect=10, pad=0.01, ax=ax, format="%+2.f dB")
    save_fname = os.path.join(path, "result", "spectrogram.png")
    plt.savefig(save_fname, transparent=True)
    plt.show()

    fig, ax = plt.subplots(nrows=4, ncols=1, sharex=True, figsize=(10, 6))
    plt.subplots_adjust(hspace=0.6)

    # calculate mel spectrogram and mfcc
    mel_spec, mfcc = calc_mfcc(wav, hop, win_length, filterbank)

    # mel spectrogram
    wav_time = wav.shape[0] // sr
    f_nyq = sr // 2
    extent = [0, wav_time, 0, f_nyq]

    img = ax[0].imshow(
        librosa.amplitude_to_db(mel_spec),
        aspect="auto",
        extent=extent,
        cmap="rainbow",
    )
    ax[0].set(
        title="Mel spectrogram",
        xlabel=None,
        ylabel="Mel frequency [mel]",
        ylim=[0, 8000],
        yticks=range(0, 10000, 2000),
    )
    fig.colorbar(img, aspect=10, pad=0.01, ax=ax[0], format="%+2.f dB")

    # mfcc
    n_mfcc = 12
    extent = [0, wav_time, 0, n_mfcc]
    img = ax[1].imshow(np.flipud(mfcc[:n_mfcc]),
                       aspect="auto",
                       extent=extent,
                       cmap="rainbow")
    ax[1].set(
        title="MFCC sequence",
        xlabel=None,
        ylabel="MFCC",
        yticks=range(0, 13, 4),
    )
    fig.colorbar(img, aspect=10, pad=0.01, ax=ax[1], format="%+2.f dB")

    # d-mfcc
    d_mfcc = delta_mfcc(mfcc, k=2)

    img = ax[2].imshow(np.flipud(d_mfcc[:n_mfcc]),
                       aspect="auto",
                       extent=extent,
                       cmap="rainbow")
    ax[2].set(
        title="ΔMFCC sequence",
        xlabel=None,
        ylabel="ΔMFCC",
        yticks=range(0, 13, 4),
    )
    fig.colorbar(img, aspect=10, pad=0.01, ax=ax[2], format="%+2.f dB")

    # dd-mfcc
    dd_mfcc = delta_mfcc(d_mfcc, k=2)
    img = ax[3].imshow(np.flipud(dd_mfcc[:n_mfcc]),
                       aspect="auto",
                       extent=extent,
                       cmap="rainbow")
    ax[3].set(
        title="ΔΔMFCC sequence",
        xlabel="Time [s]",
        ylabel="ΔΔMFCC",
        yticks=range(0, 13, 4),
    )
    fig.colorbar(img, aspect=10, pad=0.01, ax=ax[3], format="%+2.f dB")

    save_fname = os.path.join(path, "result", "mfcc_result.png")
    plt.savefig(save_fname, transparent=True)
    plt.show()