def get_tempogram_features(feats):
    """Summarise the tempogram stored under ``feats['tempogram']``.

    A median-filtered copy (filter size ``(9, 1)``) is subtracted from the
    tempogram, negative residuals are clipped to zero and the result is
    normalised.

    Returns:
        A 3-tuple of
        * the median of the filtered tempogram along axis 1,
        * the median along axis 1 of the normalised ``tempogram_ratio``
          of the filtered tempogram,
        * ``tempo`` of the filtered tempogram aggregated with ``np.median``.
    """
    raw = feats['tempogram']

    # Remove the local median and keep only the non-negative part.
    background = scipy.ndimage.median_filter(raw, size=(9, 1))
    tgf = util.normalize(np.maximum(0.0, raw - background))

    ratio = util.normalize(tempogram_ratio(tgf))

    tgf_median = np.median(tgf, axis=1)
    ratio_median = np.median(ratio, axis=1)
    return tgf_median, ratio_median, tempo(tgf, aggregate=np.median)
def tonnetz(y=None, sr=22050, chroma=None):
    """Project a chromagram onto six tonnetz dimensions.

    Parameters
    ----------
    y : audio samples; only used to compute a chromagram via
        ``chroma_cqt`` when ``chroma`` is not given
    sr : sampling rate forwarded to ``chroma_cqt``
    chroma : precomputed chromagram; its first axis indexes chroma bins

    Returns
    -------
    The product of the 6-row transformation matrix with the
    L1-column-normalised chromagram.

    Raises
    ------
    ParameterError
        If neither ``y`` nor ``chroma`` is supplied.
    """
    if chroma is None:
        if y is None:
            raise ParameterError(
                'Either the audio samples or the chromagram must be '
                'passed as an argument.')
        chroma = chroma_cqt(y=y, sr=sr)

    # Build the transformation matrix: one row per tonnetz dimension.
    n_bins = chroma.shape[0]
    pitch_axis = np.linspace(0, 12, num=n_bins, endpoint=False)
    interval_scale = np.asarray(
        [7. / 6, 7. / 6, 3. / 2, 3. / 2, 2. / 3, 2. / 3])
    angles = interval_scale[:, np.newaxis] * pitch_axis

    # Shifting even rows by 0.5 turns their cos() into sin().
    angles[::2] -= 0.5

    radii = np.array([
        1, 1,       # Fifths
        1, 1,       # Minor
        0.5, 0.5,   # Major
    ])
    transform = radii[:, np.newaxis] * np.cos(np.pi * angles)

    # Apply the transform to the column-normalised chroma.
    return transform.dot(util.normalize(chroma, norm=1, axis=0))
def inference(a, with_postnet=False):
    """Run the generator over every file in ``a.input_wavs_dir``.

    The generator weights are read from ``a.checkpoint_file``.  Each input
    is scaled by ``MAX_WAV_VALUE``, normalised to 0.95, pushed through the
    generator and written to ``a.output_dir`` as
    ``<input stem>_generated.wav`` (int16, ``hp.audio.sampling_rate``).
    The path of every written file is printed.

    Args:
        a: object exposing ``checkpoint_file``, ``input_wavs_dir`` and
            ``output_dir``.
        with_postnet: forwarded as the second argument of the generator.
            Note that only the generator's FIRST output is written,
            whatever this flag is.
    """
    generator = Generator(hp.model.in_channels).to(device)
    checkpoint = load_checkpoint(a.checkpoint_file, device)
    generator.load_state_dict(checkpoint['generator'])

    wav_names = os.listdir(a.input_wavs_dir)
    os.makedirs(a.output_dir, exist_ok=True)

    generator.eval()
    with torch.no_grad():
        for wav_name in wav_names:
            samples, _ = load_wav(os.path.join(a.input_wavs_dir, wav_name))
            samples = samples / MAX_WAV_VALUE
            samples = normalize(samples) * 0.95

            # Shape the waveform as (1, 1, n_samples) for the generator.
            batch = torch.FloatTensor(samples)
            batch = batch.reshape((1, 1, batch.shape[0],)).to(device)

            before_y_g_hat, _ = generator(batch, with_postnet)

            pcm = before_y_g_hat.reshape((before_y_g_hat.shape[2],))
            pcm = pcm * MAX_WAV_VALUE
            pcm = pcm.cpu().numpy().astype('int16')

            stem = os.path.splitext(wav_name)[0]
            output_file = os.path.join(a.output_dir, stem + '_generated.wav')
            write(output_file, hp.audio.sampling_rate, pcm)
            print(output_file)
def extract(audio_fn):
    """Load an audio file and return its log-scaled spectrogram.

    The signal is loaded at the module-level ``sampling_rate``, normalised,
    and then truncated or zero-padded at the end to exactly ``duration``
    samples before the spectrogram is computed.  When the module-level
    ``mel_scale`` flag is set, the spectrogram is first projected onto
    ``n_freq_bins`` mel bands.

    Args:
        audio_fn: path of the audio file to read.

    Returns:
        ``log(1e-8 + Sxx)`` with a trailing singleton axis appended, or
        ``None`` when loading/normalising the file raised (the exception
        is logged).
    """
    # Read and resample the audio; a failure here is logged, not raised.
    try:
        data, _ = librosa.core.load(audio_fn, sr=sampling_rate)
        data = normalize(data)
    except Exception as e:
        logging.exception(e)
        return None

    # Ensure the signal is exactly `duration` samples long.
    n_missing = duration - len(data)
    if n_missing < 0:
        data = data[:duration]
    elif n_missing > 0:
        # pad_width must be (before, after): a one-element tuple would pad
        # BOTH ends by n_missing and overshoot the target length.
        data = np.pad(data, (0, n_missing),
                      mode='constant', constant_values=0)

    # Spectrogram
    f, t, Sxx = sp.signal.spectrogram(data,
                                      fs=sampling_rate,
                                      window=window,
                                      nperseg=frame_length,
                                      noverlap=overlap_length,
                                      nfft=nfft)

    if mel_scale:
        # Spectrogram -> mel filter bank energies
        f_to_mel = filters.mel(sr=sampling_rate, n_fft=nfft,
                               n_mels=n_freq_bins)
        Sxx = f_to_mel.dot(Sxx)

    # The small offset keeps log() finite for empty bins.
    Sxx = np.expand_dims(np.log(1e-8 + Sxx), axis=-1)
    return Sxx
def spectral_bandwidth(y=None, sr=22050, S=None, n_fft=2048, hop_length=512,
                       freq=None, centroid=None, norm=True, p=2):
    """Compute the p-th order spectral bandwidth of each frame.

    For every column the result is
    ``(sum_k S[k] * |freq[k] - centroid| ** p) ** (1 / p)``.

    Parameters
    ----------
    y, S, n_fft, hop_length : forwarded to ``_spectrogram`` to obtain the
        magnitude spectrogram
    sr : sampling rate, used when bin frequencies must be derived
    freq : bin centre frequencies; derived with ``fft_frequencies`` if None
    centroid : precomputed centroid; derived with ``spectral_centroid``
        if None
    norm : when truthy, each column of ``S`` is L1-normalised first
    p : exponent of the deviation

    Raises
    ------
    ParameterError
        If the spectrogram is complex-valued or contains negative entries.
    """
    S, n_fft = _spectrogram(y=y, S=S, n_fft=n_fft, hop_length=hop_length)

    if not np.isrealobj(S):
        raise ParameterError('Spectral bandwidth is only defined '
                             'with real-valued input')
    if np.any(S < 0):
        raise ParameterError('Spectral bandwidth is only defined '
                             'with non-negative energies')

    if centroid is None:
        centroid = spectral_centroid(y=y, sr=sr, S=S, n_fft=n_fft,
                                     hop_length=hop_length, freq=freq)

    # Centre frequency of every bin.
    if freq is None:
        freq = fft_frequencies(sr=sr, n_fft=n_fft)

    center = centroid[0]
    if freq.ndim == 1:
        # A single frequency axis shared by every frame.
        offsets = np.subtract.outer(freq, center)
    else:
        offsets = freq - center
    deviation = np.abs(offsets)

    # Column-normalise the energies.
    if norm:
        S = util.normalize(S, norm=1, axis=0)

    moment = np.sum(S * deviation**p, axis=0, keepdims=True)
    return moment**(1./p)
def LPC(self):
    """Compute 14 linear-prediction coefficients for the current frame.

    The method transforms ``self.windowed_x``, updates the phase and
    spectrum state through ``self.Phase`` / ``self.Spectrum``, inverts
    ``self.magnitude_spectrum`` with ``self.ISTFT`` and stores the
    max-normalised, truncated result in ``self.correlation`` (which is
    then marked read-only).  A fixed 14-step recursion over that sequence
    produces the coefficients.

    Returns:
        ``lpc[1:]`` -- the 14 coefficients following the leading 1.
    """
    fft = self.fft(self.windowed_x)
    self.Phase(fft)
    self.Spectrum()

    invert = self.ISTFT(self.magnitude_spectrum)
    invert = np.array(invert).T
    self.correlation = util.normalize(invert, norm=np.inf)

    # Keep only the first len(windowed_x) entries along axis 0.  The index
    # has to be a tuple: a list of slices is not a valid multidimensional
    # index in current NumPy.
    subslice = [slice(None)] * np.array(self.correlation).ndim
    subslice[0] = slice(self.windowed_x.shape[0])
    self.correlation = np.array(self.correlation)[tuple(subslice)]

    if not np.iscomplexobj(self.correlation):
        self.correlation = self.correlation.real

    # Guard the stored sequence against accidental modification below.
    self.correlation.flags.writeable = False

    E = np.copy(self.correlation[0])
    p = 14
    lpc = np.zeros(p + 1)
    lpc[0] = 1
    temp = np.zeros(p)

    for i in range(1, p + 1):
        # NOTE(review): lpc[0] is 1 and the sum below starts at j = 0, so
        # correlation[i] contributes twice to k -- confirm this is intended.
        k = float(self.correlation[i])
        for j in range(i):
            k += self.correlation[i - j] * lpc[j]
        k /= E

        lpc[i] = -k

        # Update the lower-order coefficients through a scratch buffer so
        # every update reads the values from before this step.
        for j in range(1, i):
            temp[j] = lpc[j] - k * lpc[i - j]
        for j in range(1, i):
            lpc[j] = temp[j]

        E *= (1 - pow(k, 2))

    return lpc[1:]
def fbank(path, fft_span, hop_span, n_mels, fmin, fmax, affichage=False):
    """
    Compute log mel filter-bank vectors for an audio file.

    :param path: location of the audio file
    :param fft_span: length of the Fourier-transform window, in seconds
    :param hop_span: step between two successive frames, in seconds
    :param n_mels: number of mel frequency bands
    :param fmin: lowest frequency of the decomposition
    :param fmax: highest frequency of the decomposition
    :param affichage: True to display the spectrogram
    :return: matrix of the filter-bank decomposition over time
             (one row = one frame of duration hop_span, of size n_mels)
    """
    # The file is read with librosa, which also provides the sample rate.
    signal, s_rate = librosa.load(path)

    # Window and hop are given in seconds; convert them to sample counts.
    window_samples = int(np.floor(fft_span * s_rate))
    hop_samples = int(np.floor(hop_span * s_rate))

    X = feature.melspectrogram(util.normalize(signal), s_rate, S=None,
                               n_fft=window_samples,
                               hop_length=hop_samples,
                               n_mels=n_mels, fmin=fmin, fmax=fmax)
    X = np.log(X)

    if affichage:
        afficherSpec(X, s_rate, hop_span)

    # Put time on the first axis.
    return np.transpose(X)
def audio_to_array(audio):
    """Turn an audio file into a 5-component PCA projection of its STFT.

    The file at ``audio`` is read with ``sf.read`` and immediately written
    back to the SAME path with ``sf.write`` before being read again with
    ``read``; the input file is therefore rewritten in place.

    Args:
        audio: path of the audio file.

    Returns:
        The output of ``PCA(n_components=5).fit_transform`` applied to the
        magnitude STFT of the max-normalised samples.
    """
    # Round-trip through soundfile so the file on disk is re-encoded.
    samples, fs = sf.read(audio)
    sf.write(audio, samples, fs)

    # Second element of the reader's result holds the sample data.
    wav = read(audio)
    audio_arr = np.array(wav[1], dtype=float)

    # Normalise by the maximum absolute value along axis 0.
    audio_arr = normalize(audio_arr, np.inf, 0)

    # Magnitude of the short-time Fourier transform.
    audio_arr = np.abs(stft(audio_arr))

    # Reduce the number of dimensions.
    reducer = PCA(n_components=5)
    return reducer.fit_transform(audio_arr)
def LPC(self):
    """Return 14 linear-prediction coefficients for the current frame.

    Side effects: calls ``self.Phase`` and ``self.Spectrum`` and stores the
    max-normalised, truncated inverse transform of
    ``self.magnitude_spectrum`` in ``self.correlation``, which is left
    read-only.

    Returns:
        The coefficient vector without its leading 1 (length 14).
    """
    fft = self.fft(self.windowed_x)
    self.Phase(fft)
    self.Spectrum()

    invert = np.array(self.ISTFT(self.magnitude_spectrum)).T
    self.correlation = util.normalize(invert, norm=np.inf)

    # Truncate axis 0 to the frame length.  NumPy requires a tuple here;
    # indexing with a list of slices is no longer accepted.
    correlation = np.array(self.correlation)
    subslice = [slice(None)] * correlation.ndim
    subslice[0] = slice(self.windowed_x.shape[0])
    self.correlation = correlation[tuple(subslice)]

    if not np.iscomplexobj(self.correlation):
        self.correlation = self.correlation.real

    # The recursion below must not modify the stored sequence.
    self.correlation.flags.writeable = False

    p = 14
    E = np.copy(self.correlation[0])
    lpc = np.zeros(p + 1)
    lpc[0] = 1
    temp = np.zeros(p)

    for i in range(1, p + 1):
        # NOTE(review): the j = 0 term adds correlation[i] * lpc[0] on top
        # of the initial correlation[i]; confirm the double count is wanted.
        k = float(self.correlation[i])
        for j in range(i):
            k += self.correlation[i - j] * lpc[j]
        k /= E

        lpc[i] = -k

        # Two passes so each update uses the coefficients of the previous
        # step rather than partially updated ones.
        for j in range(1, i):
            temp[j] = lpc[j] - k * lpc[i - j]
        for j in range(1, i):
            lpc[j] = temp[j]

        E *= (1 - pow(k, 2))

    return lpc[1:]
def chroma_cqt(y=None, sr=22050, C=None, hop_length=512, fmin=None,
               norm=np.inf, threshold=0.0, tuning=None, n_chroma=12,
               n_octaves=7, window=None, bins_per_octave=None,
               cqt_mode='full'):
    """Compute a chromagram from a constant-Q transform.

    Parameters
    ----------
    y, sr : audio samples and sampling rate, used only when ``C`` is None
    C : precomputed constant-Q magnitudes; computed from ``y`` if None
    hop_length, fmin, tuning : forwarded to the constant-Q transform
    norm : column normalisation passed to ``util.normalize``; None disables
    threshold : chroma values below it are set to zero; None disables
    n_chroma : number of chroma bins produced
    n_octaves : number of octaves analysed when the transform is computed
    window : forwarded to ``filters.cq_to_chroma``
    bins_per_octave : constant-Q bins per octave; defaults to ``n_chroma``
    cqt_mode : ``'full'`` selects ``cqt``, ``'hybrid'`` selects
        ``hybrid_cqt``

    Returns
    -------
    The (optionally thresholded and normalised) chromagram.
    """
    cqt_func = {'full': cqt, 'hybrid': hybrid_cqt}

    if bins_per_octave is None:
        bins_per_octave = n_chroma

    # Build the CQT only when the caller did not supply one.
    if C is None:
        transform = cqt_func[cqt_mode]
        C = np.abs(transform(y, sr=sr,
                             hop_length=hop_length,
                             fmin=fmin,
                             n_bins=n_octaves * bins_per_octave,
                             bins_per_octave=bins_per_octave,
                             tuning=tuning))

    # Fold the constant-Q bins onto chroma bins.
    cq_to_chr = filters.cq_to_chroma(C.shape[0],
                                     bins_per_octave=bins_per_octave,
                                     n_chroma=n_chroma,
                                     fmin=fmin,
                                     window=window)
    chroma = cq_to_chr.dot(C)

    if threshold is not None:
        chroma[chroma < threshold] = 0.0

    if norm is None:
        return chroma
    return util.normalize(chroma, norm=norm, axis=0)
def spectral_centroid(y=None, sr=22050, S=None, n_fft=2048, hop_length=512,
                      freq=None):
    """Compute the spectral centroid of each frame.

    Every column of the spectrogram is L1-normalised and used as weights
    for the bin frequencies; the weighted sum is the centroid.

    Parameters
    ----------
    y, S, n_fft, hop_length : forwarded to ``_spectrogram``
    sr : sampling rate, used when bin frequencies must be derived
    freq : bin centre frequencies; derived with ``fft_frequencies`` if None

    Returns
    -------
    Array with a single row holding one centroid per frame.

    Raises
    ------
    ParameterError
        If the spectrogram is complex-valued or contains negative entries.
    """
    S, n_fft = _spectrogram(y=y, S=S, n_fft=n_fft, hop_length=hop_length)

    if not np.isrealobj(S):
        raise ParameterError('Spectral centroid is only defined '
                             'with real-valued input')
    if np.any(S < 0):
        raise ParameterError('Spectral centroid is only defined '
                             'with non-negative energies')

    # Centre frequency of every bin.
    if freq is None:
        freq = fft_frequencies(sr=sr, n_fft=n_fft)

    # A flat frequency vector becomes a column so it broadcasts over frames.
    if freq.ndim == 1:
        freq = freq.reshape((-1, 1))

    weights = util.normalize(S, norm=1, axis=0)
    return np.sum(freq * weights, axis=0, keepdims=True)
def window_sumsquare(window, n_frames, hop_length=200, win_length=800,
                     n_fft=800, dtype=np.float32, norm=None):
    """Compute the sum-square envelope of a window at a given hop length.

    Parameters
    ----------
    window : window specification understood by ``get_window``
    n_frames : number of analysis frames
    hop_length : number of samples between successive frames
    win_length : length of the window; falls back to ``n_fft`` when None
    n_fft : length of each analysis frame
    dtype : data type of the output
    norm : normalisation passed to ``librosa_util.normalize``

    Returns
    -------
    Array of length ``n_fft + hop_length * (n_frames - 1)`` holding the
    overlap-added squared window.
    """
    if win_length is None:
        win_length = n_fft

    total = n_fft + hop_length * (n_frames - 1)
    envelope = np.zeros(total, dtype=dtype)

    # Squared, normalised window, centre-padded to the frame length.
    base_window = get_window(window, win_length, fftbins=True)
    squared = librosa_util.normalize(base_window, norm=norm)**2
    squared = librosa_util.pad_center(squared, n_fft)

    # Overlap-add one copy of the squared window per frame.
    for frame in range(n_frames):
        start = frame * hop_length
        stop = min(total, start + n_fft)
        usable = max(0, min(n_fft, total - start))
        envelope[start:stop] += squared[:usable]

    return envelope
def ChordSignature7thbass(chordid, feature, sevenths=True, inv=True):
    """Build a chord label, optionally with a seventh and a bass note.

    ``feature[:, :12]`` is read as bass chroma and ``feature[:, 12:24]`` as
    chroma; both are normalised along axis 1 and averaged over axis 0.

    Args:
        chordid: chord index; ``chordid % 12`` selects the root and
            ``chordid // 12`` the quality.  ``const.N_CHORDS - 1`` yields
            the label ``"N"``.
        feature: 2-D array with at least 24 columns.
        sevenths: when True, a mean (major) seventh energy above 0.5
            upgrades the quality to a seventh chord.
        inv: when True, a mean bass third/fifth above 0.6 that also exceeds
            the mean bass root appends a bass annotation.

    Returns:
        ``"N"`` or a string ``"<root>:<quality>"`` with an optional
        ``"/<bass>"`` suffix.
    """
    if chordid == const.N_CHORDS - 1:
        return "N"

    chroma = normalize(feature[:, 12:24], axis=1)
    bass_chroma = normalize(feature[:, :12], axis=1)

    family = chordid // 12
    root_note = chordid % 12
    # The third is lowered by the family index.
    third_note = (root_note + 4 - family) % 12
    fifth_note = (root_note + 7) % 12
    seventh_note = (root_note + 10) % 12
    majseventh_note = (root_note + 11) % 12

    mean_root = np.mean(bass_chroma[:, root_note])
    mean_3rd = np.mean(bass_chroma[:, third_note])
    mean_5th = np.mean(bass_chroma[:, fifth_note])
    mean_7th = np.mean(chroma[:, seventh_note])
    mean_maj7th = np.mean(chroma[:, majseventh_note])

    root = PitchChr[root_note]
    quality = OutputQualityList[family]

    # Decide on a seventh.
    if sevenths and ((mean_7th > 0.5) or (mean_maj7th > 0.5)):
        if mean_7th >= mean_maj7th:
            quality = "min7" if quality == "min" else "7"
        else:
            quality = "maj7" if quality == "maj" else "minmaj7"

    # Decide on the bass note.
    bass = ""
    if inv:
        third_dominates = mean_3rd > 0.6 and mean_3rd > mean_root
        fifth_dominates = mean_5th > 0.6 and mean_5th > mean_root
        if third_dominates or fifth_dominates:
            if mean_3rd > mean_5th:
                bass = "b3" if quality in ("min", "min7") else "3"
            else:
                bass = "5"

    sign = "%s:%s" % (root, quality)
    if bass != "":
        sign += ("/" + bass)
    return sign
def window_sumsquare(window, n_frames, hop_length=200, win_length=800,
                     n_fft=800, dtype=np.float32, norm=None):
    """
    # from librosa 0.6
    Compute the sum-square envelope of a window function at a given hop
    length.

    This is used to estimate modulation effects induced by windowing
    observations in short-time fourier transforms.

    Parameters
    ----------
    window : string, tuple, number, callable, or list-like
        Window specification, as in `get_window`
    n_frames : int > 0
        The number of analysis frames
    hop_length : int > 0
        The number of samples to advance between frames
    win_length : [optional]
        The length of the window function. By default, this matches `n_fft`.
    n_fft : int > 0
        The length of each analysis frame.
    dtype : np.dtype
        The data type of the output

    Returns
    -------
    wss : np.ndarray, shape=`(n_fft + hop_length * (n_frames - 1))`
        The sum-squared envelope of the window function
    """
    if win_length is None:
        win_length = n_fft

    # Total length of the envelope.
    n = n_fft + hop_length * (n_frames - 1)
    x = np.zeros(n, dtype=dtype)

    # Window function, normalised and squared.
    win_sq = librosa_util.normalize(
        get_window(window, win_length, fftbins=True), norm=norm)**2
    # Centre-pad with zeros so the result has length n_fft.
    win_sq = librosa_util.pad_center(win_sq, n_fft)

    # Shift the squared window by hop_length for every frame and add the
    # shifted copies together -- hence "sum-square".
    frame_starts = (index * hop_length for index in range(n_frames))
    for sample in frame_starts:
        frame_end = min(n, sample + n_fft)
        n_taken = max(0, min(n_fft, n - sample))
        x[sample:frame_end] += win_sq[:n_taken]
    return x
def window_sumsquare(
    window,
    n_frames,
    hop_length,
    win_length,
    n_fft,
    dtype=np.float32,
    norm=None,
):
    """
    # from librosa 0.6
    Compute the sum-square envelope of a window function at a given hop
    length.

    This is used to estimate modulation effects induced by windowing
    observations in short-time fourier transforms.

    Parameters
    ----------
    window : string, tuple, number, callable, or list-like
        Window specification, as in `get_window`
    n_frames : int > 0
        The number of analysis frames
    hop_length : int > 0
        The number of samples to advance between frames
    win_length : [optional]
        The length of the window function. By default, this matches `n_fft`.
    n_fft : int > 0
        The length of each analysis frame.
    dtype : np.dtype
        The data type of the output

    Returns
    -------
    wss : np.ndarray, shape=`(n_fft + hop_length * (n_frames - 1))`
        The sum-squared envelope of the window function
    """
    if win_length is None:
        win_length = n_fft

    envelope_len = n_fft + hop_length * (n_frames - 1)
    wss = np.zeros(envelope_len, dtype=dtype)

    # Squared window, centre-padded to the analysis frame length.
    kernel = get_window(window, win_length, fftbins=True)
    kernel = librosa_util.normalize(kernel, norm=norm) ** 2
    kernel = librosa_util.pad_center(kernel, n_fft)

    # Accumulate one shifted copy of the squared window per frame.
    frame = 0
    while frame < n_frames:
        begin = frame * hop_length
        end = min(envelope_len, begin + n_fft)
        wss[begin:end] += kernel[: max(0, min(n_fft, envelope_len - begin))]
        frame += 1
    return wss
def load_wav_to_torch(self, full_path):
    """
    Load an audio file as a float torch tensor.

    The file is read at its own sampling rate (``sr=None``), normalised and
    scaled by 0.95.  When ``self.augs`` is set it is applied to the tensor.

    Returns the tensor together with the sampling rate reported by the
    loader.
    """
    samples, sampling_rate = load(full_path, sr=None)
    scaled = 0.95 * normalize(samples)
    tensor = torch.from_numpy(scaled).float()

    if self.augs is None:
        return tensor, sampling_rate
    return self.augs(tensor), sampling_rate
def __getitem__(self, index):
    """Return one training example for ``self.audio_files[index]``.

    The waveform is loaded only when the cache counter is zero; otherwise
    ``self.cached_wav`` is reused (whatever ``index`` is) and the counter
    is decremented.

    Outside fine-tuning the mel spectrogram is computed from the (possibly
    cropped or padded) audio.  In fine-tuning mode it is loaded from
    ``<self.base_mels_path>/<file stem>.npy`` and cropped/padded together
    with the audio.

    Returns:
        ``(mel, audio, filename, mel_loss)`` where ``mel_loss`` is the mel
        spectrogram computed with ``self.fmax_loss``.

    Raises:
        ValueError: when a freshly loaded file's sampling rate differs from
            ``self.sampling_rate``.
    """
    filename = self.audio_files[index]

    if self._cache_ref_count != 0:
        # Reuse the waveform kept from an earlier call.
        waveform = self.cached_wav
        self._cache_ref_count -= 1
    else:
        waveform, sampling_rate = load_wav(filename)
        waveform = waveform / MAX_WAV_VALUE
        if not self.fine_tuning:
            waveform = normalize(waveform) * 0.95
        self.cached_wav = waveform
        if sampling_rate != self.sampling_rate:
            raise ValueError("{} SR doesn't match target {} SR".format(
                sampling_rate, self.sampling_rate))
        self._cache_ref_count = self.n_cache_reuse

    # Add a leading channel axis: (1, n_samples).
    audio = torch.FloatTensor(waveform).unsqueeze(0)
    pad = torch.nn.functional.pad

    if self.fine_tuning:
        mel_path = os.path.join(
            self.base_mels_path, os.path.splitext(filename)[0] + '.npy')
        mel = torch.from_numpy(np.load(mel_path))

        if len(mel.shape) < 3:
            mel = mel.unsqueeze(0)

        if self.split:
            frames_per_seg = math.ceil(self.segment_size / self.hop_size)

            if audio.size(1) < self.segment_size:
                mel = pad(mel, (0, frames_per_seg - mel.size(2)), 'constant')
                audio = pad(audio, (0, self.segment_size - audio.size(1)),
                            'constant')
            else:
                # Crop mel and audio at matching positions.
                mel_start = random.randint(
                    0, mel.size(2) - frames_per_seg - 1)
                mel_stop = mel_start + frames_per_seg
                mel = mel[:, :, mel_start:mel_stop]
                audio = audio[:, mel_start * self.hop_size:
                              mel_stop * self.hop_size]
    else:
        if self.split:
            if audio.size(1) < self.segment_size:
                audio = pad(audio, (0, self.segment_size - audio.size(1)),
                            'constant')
            else:
                max_audio_start = audio.size(1) - self.segment_size
                audio_start = random.randint(0, max_audio_start)
                audio = audio[:, audio_start:audio_start + self.segment_size]

        mel = mel_spectrogram(audio, self.n_fft, self.num_mels,
                              self.sampling_rate, self.hop_size,
                              self.win_size, self.fmin, self.fmax,
                              center=False)

    mel_loss = mel_spectrogram(audio, self.n_fft, self.num_mels,
                               self.sampling_rate, self.hop_size,
                               self.win_size, self.fmin, self.fmax_loss,
                               center=False)

    return (mel.squeeze(), audio.squeeze(0), filename, mel_loss.squeeze())
def autocorrelation(self):
    """Compute and store the normalised correlation of ``self.windowed_x``.

    ``self.windowed_x`` is transformed with ``fft`` at length ``self.N``
    (twice the first-axis length plus two), passed straight back through
    ``ifft``, normalised by its maximum absolute value and truncated along
    axis 0 to the original number of rows.

    Side effects: sets ``self.N`` and ``self.correlation``.

    Returns:
        ``self.correlation``.
    """
    self.N = (self.windowed_x[:, 0].shape[0] + 1) * 2
    corr = ifft(fft(self.windowed_x, n=self.N))
    self.correlation = util.normalize(corr, norm=np.inf)

    # Truncate axis 0 to the input length.  The index must be a tuple:
    # NumPy no longer accepts a list of slices as a multidimensional index.
    subslice = [slice(None)] * np.array(self.correlation).ndim
    subslice[0] = slice(self.windowed_x.shape[0])
    self.correlation = np.array(self.correlation)[tuple(subslice)]

    if not np.iscomplexobj(self.correlation):
        self.correlation = self.correlation.real

    return self.correlation
def gen_win_sq(denoiser):
    """Return the squared STFT window of ``denoiser``, padded to frame size.

    Reads ``window``, ``win_length`` and ``filter_length`` from
    ``denoiser.stft``; the squared window is centre-padded to
    ``filter_length``.
    """
    stft = denoiser.stft
    window = stft.window
    win_length = stft.win_length
    n_fft = stft.filter_length

    # Squared window at the desired length (normalisation disabled).
    base = get_window(window, win_length, fftbins=True)
    squared = librosa_util.normalize(base, norm=None)**2
    return librosa_util.pad_center(squared, n_fft)
def autocorrelation(self):
    """Store and return the normalised correlation of ``self.windowed_x``.

    The frame matrix is transformed with ``fft`` (length ``self.N``) and
    immediately inverted with ``ifft``; the result is divided by its
    maximum absolute value and cut back to the original number of rows.

    Side effects: sets ``self.N`` and ``self.correlation``.
    """
    n_rows = self.windowed_x.shape[0]
    self.N = (self.windowed_x[:, 0].shape[0] + 1) * 2

    corr = ifft(fft(self.windowed_x, n=self.N))
    self.correlation = util.normalize(corr, norm=np.inf)

    # Keep the first n_rows entries of axis 0 and everything on the other
    # axes.  A tuple index is required; a list of slices is rejected by
    # current NumPy.
    correlation = np.array(self.correlation)
    subslice = [slice(None)] * correlation.ndim
    subslice[0] = slice(n_rows)
    self.correlation = correlation[tuple(subslice)]

    if not np.iscomplexobj(self.correlation):
        self.correlation = self.correlation.real

    return self.correlation
def smooth(self, feat, win_len_smooth=4):
    '''
    Smooth a feature matrix along its second axis and re-normalise it.

    A Hann window of ``win_len_smooth + 2`` points, scaled to unit sum, is
    convolved with ``feat`` (zero-filled boundary, same output size); every
    column of the result is then L2-normalised.

    This code is similar to the one used on librosa for smoothing cens:
    https://librosa.github.io/librosa/generated/librosa.feature.chroma_cens.html
    '''
    kernel = filters.get_window('hann', win_len_smooth + 2, fftbins=False)
    kernel /= np.sum(kernel)

    # convolve2d needs a 2-D kernel; a single row smooths along axis 1.
    kernel = np.atleast_2d(kernel)
    smoothed = scipy.signal.convolve2d(feat, kernel,
                                       mode='same', boundary='fill')

    return util.normalize(smoothed, norm=2, axis=0)
def load_wav_to_torch(self, full_path):
    """
    Load an audio file as a float torch tensor.

    The file is read at ``self.sampling_rate``, normalised and scaled by
    0.95.  With ``self.augment`` set, the signal is additionally multiplied
    by a random amplitude drawn uniformly from [0.3, 1.0).

    Returns the tensor and the sampling rate reported by the loader.
    """
    samples, sampling_rate = load(full_path, sr=self.sampling_rate)
    samples = 0.95 * normalize(samples)

    if self.augment:
        gain = np.random.uniform(low=0.3, high=1.0)
        samples = samples * gain

    tensor = torch.from_numpy(samples).float()
    return tensor, sampling_rate
def read_wav(
    fname: str, sr: int, norm: float = 0, pre_emphasis: bool = False
) -> np.ndarray:
    """Read a wave file into a normalized array.

    Args:
        fname: path of the audio file.
        sr: sampling rate the file is loaded at.
        norm: norm passed to ``librosa_util.normalize``; ``0`` (the
            default) skips normalisation.
        pre_emphasis: when True, each sample has its predecessor
            subtracted in place.

    Returns:
        The trimmed (and optionally pre-emphasised / normalised) signal.
    """
    (S, _) = librosa.load(fname, sr=sr)
    (S, _) = effects.trim(S)

    if pre_emphasis:
        S[1:] -= S[:-1]

    # Compare by value: ``is not 0`` tests object identity, which is a
    # SyntaxWarning on modern Python and is True for an equal float 0.0.
    if norm != 0:
        S = librosa_util.normalize(S, norm=norm)
    return S
def add_noise(data, noise_ratio=.05):
    """
    Add white (standard normal) noise to a signal and normalise the result.

    Args:
        data: array of audio file(s)
        noise_ratio: factor applied to the noise before it is added

    Returns:
        normalized audio with white noise applied
    """
    noise = np.random.normal(loc=0.0, scale=1.0, size=data.shape)
    noisy_data = data + noise_ratio * noise
    return normalize(noisy_data)
def __getitem__(self, index):
    """Return ``(mel, audio, filename, mel_loss)`` for one audio file.

    The waveform is loaded only when the cache counter is zero; otherwise
    ``self.cached_wav`` is reused (whatever ``index`` is) and the counter
    is decremented.  With ``self.split`` set the audio is randomly cropped
    to ``self.segment_size`` samples, or zero-padded when shorter.

    ``mel`` and ``mel_loss`` are mel spectrograms of the same audio that
    differ only in their upper frequency (``self.fmax`` versus
    ``self.fmax_loss``).

    Raises:
        ValueError: when a freshly loaded file's sampling rate differs from
            ``self.sampling_rate``.
    """
    filename = self.audio_files[index]

    if self._cache_ref_count != 0:
        # Reuse the waveform kept from an earlier call.
        waveform = self.cached_wav
        self._cache_ref_count -= 1
    else:
        waveform, sampling_rate = load_wav(filename)
        waveform = waveform / MAX_WAV_VALUE
        waveform = normalize(waveform) * 0.95
        self.cached_wav = waveform
        if sampling_rate != self.sampling_rate:
            raise ValueError("{} SR doesn't match target {} SR".format(
                sampling_rate, self.sampling_rate))
        self._cache_ref_count = self.n_cache_reuse

    # Add a leading channel axis: (1, n_samples).
    audio = torch.FloatTensor(waveform).unsqueeze(0)

    if self.split:
        n_samples = audio.size(1)
        if n_samples < self.segment_size:
            audio = torch.nn.functional.pad(
                audio, (0, self.segment_size - audio.size(1)), "constant")
        else:
            max_audio_start = n_samples - self.segment_size
            audio_start = random.randint(0, max_audio_start)
            audio = audio[:, audio_start:audio_start + self.segment_size]

    def _mel(upper_freq):
        # Both spectrograms share every setting except the top frequency.
        return mel_spectrogram(
            audio,
            self.n_fft,
            self.num_mels,
            self.sampling_rate,
            self.hop_size,
            self.win_size,
            self.fmin,
            upper_freq,
            center=False,
        )

    mel = _mel(self.fmax)
    mel_loss = _mel(self.fmax_loss)

    return (mel.squeeze(), audio.squeeze(0), filename, mel_loss.squeeze())
def getLocalFeatures(X, T=True):
    """Extract L2-normalised 12-coefficient MFCC frames from nested signals.

    Args:
        X: iterable of groups, each an iterable of signals.
        T: when True, every MFCC matrix is transposed before normalising.

    Returns:
        Array built by concatenating the rows of every normalised MFCC
        matrix.
    """
    import librosa
    from librosa.util import normalize

    def MFCC(sig, sample_rate=22050):
        return librosa.feature.mfcc(y=np.array(sig), sr=sample_rate, n_mfcc=12)

    # Explicit lists instead of map(): on Python 3 map() is a lazy iterator
    # and np.asarray() would wrap the iterator object, not its values.
    X = np.asarray([[MFCC(sig) for sig in s] for s in X])
    X_flattened = [v for lis in X for v in lis]

    if T:
        X_flattened = np.asarray([np.transpose(v) for v in X_flattened])

    X_normalized = normalize(X_flattened, norm=2)
    X_flattened = np.asarray([x for clip in X_normalized for x in clip])
    return X_flattened
def load_wav_to_torch(self, full_path, offset):
    """
    Load one segment of an audio file as a float torch tensor.

    The segment starts at ``offset`` and lasts
    ``self.segment_length / self.sampling_rate`` (the duration handed to
    the loader).  It is normalised, scaled by 0.95 and, with
    ``self.augment`` set, multiplied by a random amplitude drawn from
    ``self.random_state`` uniformly in [0.3, 1.0).
    """
    load_duration = self.segment_length / self.sampling_rate
    samples, _ = load(
        full_path,
        sr=self.sampling_rate,
        offset=offset,
        duration=load_duration,
    )
    samples = 0.95 * normalize(samples)

    if self.augment:
        gain = self.random_state.uniform(low=0.3, high=1.0)
        samples = samples * gain

    return torch.from_numpy(samples).float()
def load_wav_to_torch(self, full_path):
    """
    Load an audio file as a float torch tensor.

    The file is read at ``self.sampling_rate``, normalised and scaled by
    0.95.  If normalisation fails, the offending path is printed and the
    process exits with status -1.  With ``self.augment`` set, the signal is
    multiplied by a random amplitude drawn uniformly from [0.3, 1.0).

    Returns the tensor and the sampling rate reported by the loader.
    """
    data, sampling_rate = load(full_path, sr=self.sampling_rate)

    try:
        data = 0.95 * normalize(data)
    except Exception:
        # Catch Exception rather than everything, so Ctrl-C and
        # interpreter shutdown are not reported as a bad file.
        print(full_path, flush=True)
        sys.exit(-1)

    if self.augment:
        amplitude = np.random.uniform(low=0.3, high=1.0)
        data = data * amplitude

    return torch.from_numpy(data).float(), sampling_rate
def featureExtraction(self, X, transpose=True):
    """Compute L2-normalised, flattened 20-coefficient MFCC features.

    Args:
        X: iterable of songs, each an iterable of signals.
        transpose: when True, every MFCC matrix is transposed and flattened
            into one vector per clip before normalising.

    Returns:
        The output of ``normalize(..., norm=2)`` on the per-clip features.
    """
    def MFCC(signal, sr=22050):
        return librosa.feature.mfcc(y=np.array(signal), sr=sr, n_mfcc=20)

    # List comprehensions and single-argument print() calls keep this
    # working on Python 3 (lazy map(), print function) while printing the
    # same text as the former Python 2 print statements.
    X = np.array([[MFCC(clip) for clip in song] for song in X])
    print("--> After MFCC X.shape %s" % (X.shape,))

    X_train_flattened = [val for sublist in X for val in sublist]
    print("--> X_train_flattened.shape %s"
          % (np.array(X_train_flattened).shape,))

    if transpose:
        X_train_flattened = np.array(
            [np.transpose(clip).flatten() for clip in X_train_flattened])
        print("--> After transpose X_train_flattened.shape %s"
              % (X_train_flattened.shape,))

    X_train_flattened_norm = normalize(X_train_flattened, norm=2)
    return X_train_flattened_norm
def mix(x0, x1, snr):
    """Mix two signals at a given level difference.

    Args:
        x0 (numpy.ndarray): signal (n_samples,), weighted to 0 dB
        x1 (numpy.ndarray): signal (n_samples,), weighted to ``-snr`` dB
        snr (float): mixing coefficient applied on x1 (dB)

    Returns:
        numpy.ndarray: normalized mixed signal (n_samples,)
    """
    signal = _norm_n_weight(x0, 0)      # reference level
    noise = _norm_n_weight(x1, -snr)    # attenuated by `snr` dB
    return normalize(signal + noise)
def gd_eval(codes, grain_is, lr=0.05, maxiter=20, verbose=0, **kwargs):
    """Run ``choose_molecule_pitch_opt`` for every (source, target) pair.

    For each pair the target is the L2-normalised ``feature`` of
    ``autocorr_t`` applied to the target's sample and ``t``; the optimiser
    is started from ``codes.acorr_coef`` of the source with tracing on.
    Progress is printed as ``"<k>/<total>: <src>, <tgt>"``.

    Args:
        codes: object exposing ``sample``, ``t`` and ``acorr_coef``.
        grain_is: sequence of ``(src_i, tgt_i)`` pairs.
        lr, maxiter, verbose, **kwargs: forwarded to the optimiser.

    Returns:
        List with one optimiser result per pair, in input order.
    """
    n_pairs = len(grain_is)
    code_dot = []

    for position, (src_i, tgt_i) in enumerate(grain_is, start=1):
        print("{}/{}: {}, {}".format(position, n_pairs, src_i, tgt_i))

        target_corr = autocorr_t(codes.sample(tgt_i), codes.t(tgt_i))
        target = normalize(target_corr.feature, norm=2)

        trajectory = choose_molecule_pitch_opt(
            target,
            codes.acorr_coef(src_i),
            trace=True,
            lr=lr,
            maxiter=maxiter,
            verbose=verbose,
            **kwargs
        )
        code_dot.append(trajectory)

    return code_dot
def butter_bandpass_filter(data, lowcut, highcut, fs, order=5, normalize=False):
    """
    Apply a Butterworth band-pass filter to an audio array.

    Args:
        data: 1D array audio file
        lowcut: low frequency cutoff point
        highcut: high frequency cutoff point
        fs: sample rate
        order: filter order passed to ``butter_bandpass``
        normalize: if True, the filtered signal is normalized with the
            module-level ``normalize`` function

    Returns:
        1D array audio file with butter bandpass filter applied
    """
    b, a = butter_bandpass(lowcut, highcut, fs, order=order)
    y = lfilter(b, a, data)

    if normalize:
        # The boolean parameter shadows the module-level normalize()
        # function inside this scope, so calling `normalize(y)` would call
        # a bool.  Fetch the function from the module globals instead.
        y = globals()["normalize"](y)
    return y
def _norm_n_weight(x, dB):
    """Normalize a signal and scale it so its RMS equals a dB ratio.

    Args:
        x (numpy.ndarray): signal (n_samples,)
        dB (float): target dB (*ratio)

    Returns:
        numpy.ndarray: processed signal (n_samples,)
    """
    normed = normalize(x)

    # Root-mean-square of the normalized signal.
    rms = np.linalg.norm(normed) / np.sqrt(len(normed))

    # Gain that brings the RMS to the linear value of `dB`.
    ratio = (10**(dB / 20)) / rms
    return normed * ratio
def window_sumsquare(window, n_frames, hop_length=120, win_length=800,
                     n_fft=800, dtype=float, norm=None):
    """Compute the sum-square envelope of a window at a given hop length.

    The squared (optionally normalised) window is centre-padded to
    ``n_fft`` and overlap-added ``n_frames`` times, advancing by
    ``hop_length`` samples each time.

    Returns an array of length ``n_fft + hop_length * (n_frames - 1)``.
    """
    if win_length is None:
        win_length = n_fft

    length = n_fft + hop_length * (n_frames - 1)
    out = np.zeros(length, dtype=dtype)

    sq_window = get_window(window, win_length, fftbins=True)
    sq_window = librosa_util.normalize(sq_window, norm=norm)**2
    sq_window = librosa_util.pad_center(sq_window, n_fft)

    for frame_index in range(n_frames):
        offset = frame_index * hop_length
        # Clip both slices so the last frames never run past the output.
        upper = min(length, offset + n_fft)
        count = max(0, min(n_fft, length - offset))
        out[offset:upper] += sq_window[:count]

    return out
def fbank(path, fft_span, hop_span, n_mels, fmin, fmax, affichage=False):
    """
    Return the log mel filter-bank representation of an audio file.

    :param path: location of the audio file
    :param fft_span: length of the Fourier-transform window, in seconds
    :param hop_span: step between two successive frames, in seconds
    :param n_mels: number of mel frequency bands
    :param fmin: lowest frequency of the decomposition
    :param fmax: highest frequency of the decomposition
    :param affichage: True to display the spectrogram
    :return: matrix of the filter-bank decomposition over time
             (one row = one frame of duration hop_span, of size n_mels)
    """
    wav, s_rate = librosa.load(path)
    normalized_wav = util.normalize(wav)

    # Durations in seconds are turned into whole numbers of samples.
    mel_kwargs = dict(
        n_fft=int(np.floor(fft_span * s_rate)),
        hop_length=int(np.floor(hop_span * s_rate)),
        n_mels=n_mels,
        fmin=fmin,
        fmax=fmax,
    )
    X = np.log(feature.melspectrogram(normalized_wav, s_rate, S=None,
                                      **mel_kwargs))

    if affichage:
        afficherSpec(X, s_rate, hop_span)

    # One row per frame.
    return np.transpose(X)
def loadSamples():
    """Load ``samples.in`` and split it into training and held-out features.

    The pickle maps each key to a list of samples, each a list of clips.
    For every key one randomly chosen sample is held out ("words"); the
    others become training samples labelled with the key's position.  Clips
    are cut to 66058 values, converted with ``runMFCC``, L2-normalised
    together and flattened.

    Returns:
        ``(X, y, words)`` as arrays: flattened training features, their
        integer labels, and flattened held-out features.
    """
    # NOTE: pickle executes arbitrary code on load; only use trusted files.
    # Binary mode is required for pickle, and the context manager closes
    # the file, which the previous bare open() never did.
    with open('samples.in', 'rb') as handle:
        collections = pickle.load(handle)

    X, y = [], []
    words = []

    # Materialise the keys once: dict views cannot be indexed on Python 3.
    for idx, key in enumerate(list(collections.keys())):
        samples = collections[key]
        to_clustered = random.randint(0, len(samples) - 1)
        for i, sample in enumerate(samples):
            clipped = [clip[:66058] for clip in sample]
            if i == to_clustered:
                words.append(clipped)
            else:
                X.append(clipped)
                y.append(idx)

    X = [[runMFCC(window) for window in sample] for sample in X]
    words = [[runMFCC(window) for window in sample] for sample in words]

    # Normalise both sets together, then split at the real number of
    # training samples instead of a hard-coded 30.
    n_train = len(X)
    normedData = np.asarray(normalize(X + words, norm=2))
    X, words = normedData[:n_train], normedData[n_train:]

    return (np.asarray([flatten(sample) for sample in X]),
            np.asarray(y),
            np.asarray([flatten(sample) for sample in words]))
def featureExtraction(X, transpose=True):
    """Compute L2-normalised 12-coefficient MFCC frames for nested signals.

    Args:
        X: iterable of songs, each an iterable of signals.
        transpose: when True, every MFCC matrix is transposed before
            normalising.

    Returns:
        Array built by concatenating the rows of every normalised MFCC
        matrix.
    """
    def MFCC(signal, sr=22050):
        return librosa.feature.mfcc(y=np.array(signal), sr=sr, n_mfcc=12)

    # List comprehensions and single-argument print() calls keep this
    # working on Python 3 (lazy map(), print function) while printing the
    # same text as the former Python 2 print statements.
    X = np.array([[MFCC(clip) for clip in song] for song in X])
    print("After MFCC X.shape %s" % (X.shape,))

    X_train_flattened = [val for sublist in X for val in sublist]
    print("X_train_flattened.shape %s"
          % (np.array(X_train_flattened).shape,))

    if transpose:
        X_train_flattened = np.array(
            [np.transpose(clip) for clip in X_train_flattened])
        print("After transpose X_train_flattened.shape %s"
              % (X_train_flattened.shape,))

    X_train_flattened_norm = normalize(X_train_flattened, norm=2)

    # Concatenate the rows of all clips into one array.
    X_train_flattened_norm_final = np.array(
        [mfcc for clip in X_train_flattened_norm for mfcc in clip])
    return X_train_flattened_norm_final
def featureExtraction(self, X, transpose=True):
    """Compute L2-normalised MFCC frames of repeated HPSS decompositions.

    Each signal is split with ``librosa.effects.hpss``; the two parts are
    split again, and four of the resulting parts once more, for seven
    splits in total.  The 20-coefficient MFCCs of all fourteen parts are
    stacked vertically into one matrix per signal.

    Args:
        X: iterable of songs, each an iterable of signals.
        transpose: when True, every stacked matrix is transposed before
            normalising.

    Returns:
        Array built by concatenating the rows of every normalised matrix.
    """
    def MFCC(signal, sr=22050):
        y = np.array(signal, dtype=np.float64)

        # Breadth-first decomposition: parts[0] is the signal and every
        # split appends its (harmonic, percussive) pair, so parts[1:] is
        # h1, p1, h2, p2, ..., h7, p7 with
        #   (h2, p2) = hpss(h1), (h3, p3) = hpss(p1),
        #   (h4, p4) = hpss(h2), (h5, p5) = hpss(p2),
        #   (h6, p6) = hpss(h3), (h7, p7) = hpss(p3).
        parts = [y]
        for source in range(7):
            harmonic, percussive = librosa.effects.hpss(parts[source])
            parts.extend([harmonic, percussive])

        # The MFCC of the undecomposed signal was computed but never used,
        # so it is no longer calculated.
        return np.vstack([
            librosa.feature.mfcc(y=part, sr=sr, n_mfcc=20)
            for part in parts[1:]
        ])

    # List comprehensions and single-argument print() calls keep this
    # working on Python 3 (lazy map(), print function) while printing the
    # same text as the former Python 2 print statements.
    X = np.array([[MFCC(clip) for clip in song] for song in X])
    print("--> After MFCC X.shape %s" % (X.shape,))

    X_train_flattened = [val for sublist in X for val in sublist]
    print("--> X_train_flattened.shape %s"
          % (np.array(X_train_flattened).shape,))

    if transpose:
        X_train_flattened = np.array(
            [np.transpose(clip) for clip in X_train_flattened])
        print("--> After transpose X_train_flattened.shape %s"
              % (X_train_flattened.shape,))

    X_train_flattened_norm = normalize(X_train_flattened, norm=2)

    # Concatenate the rows of all clips into one array.
    X_train_flattened_norm_final = np.array(
        [mfcc for clip in X_train_flattened_norm for mfcc in clip])
    return X_train_flattened_norm_final
print "Perform beat_track and cqt" tempo, beats = librosa.beat.beat_track(y=y, sr=sr) cqt = librosa.cqt(y=y, sr=sr) print "saving cqt and beats... " np.save("./tempArray/cqt.npy", cqt) np.save("./tempArray/beats.npy", beats) else: print "Loading cqt_med and frameConversion... " cqt = np.load('./tempArray/cqt.npy') beats = np.load('./tempArray/beats.npy') sr = 44100 print "Perform sync ..." cqt_med, frameConversion = librosaF.sync(cqt, beats, aggregate=np.median) cqt_med = cqt_med.T cqt_med = normalize(cqt_med, norm=2) print "Perform loadInterval2Frame ..." interval = librosaF.loadInterval2Frame("../data/anno/698/parsed/textfile1_uppercase.txt", sr, frameConversion) print "Creating sigmas matrix ..." sigmas = np.random.rand(cqt_med.shape[0], cqt_med.shape[0]) + 1e-7 #add a base in case of 0 sigma sigmas = ((sigmas + sigmas.T)/2) gm = RM.feature2GaussianMatrix(cqt_med, sigmas) #(nSample, nFeature) L = scipy.sparse.csgraph.laplacian(gm, normed=True) m_true = RM.label2RecurrenceMatrix("../data/2.jams", gm.shape[0], interval) L_true = scipy.sparse.csgraph.laplacian(m_true, normed=True) np.save("./tempArray/L_true.npy", L_true) print "cqt_med [min, max]: %s" % str((cqt_med.min(), cqt_med.max()))