def audio_to_data(signal):
    if config.silence_thr_db is not None:
        # Strip leading/trailing silence below the configured threshold.
        signal, _ = trim(signal, top_db=config.silence_thr_db,
                         frame_length=config.fft_bins,
                         hop_length=config.fft_hop_len)
    # Magnitude spectrogram.
    spec = abs(stft(signal, n_fft=config.fft_bins,
                    hop_length=config.fft_hop_len,
                    win_length=config.fft_window_len))
    # mfccs = mfcc(signal, config.sample_rate, n_mfcc=config.mfcc_bins)
    # chroma = chroma_stft(signal, config.sample_rate, n_fft=config.fft_bins, hop_length=config.fft_hop_len, win_length=config.fft_window_len)
    # show(specshow(spec, sr=config.sample_rate, hop_length=config.fft_hop_len))
    # show(specshow(mfccs, sr=config.sample_rate, hop_length=config.fft_hop_len))
    # show(plot(chroma))
    vector = deepcopy(spec)
    # Note: the builtin max()/min() raise on 2-D arrays; use the array methods.
    print('\tmax min initially:', vector.max(), vector.min())
    vector = amplitude_to_db(vector)
    print('\tmax min in db:', vector.max(), vector.min())
    # vector = concatenate([vector, chroma], 0)
    vector = vector.T  # rows are now time frames, columns frequency bins
    print('\tfinal vector shape:', vector.shape)
    return vector
def _adjust_time_resolution(self, batch, local_condition, max_time_steps):
    """Adjust time resolution between audio and local condition."""
    if local_condition:
        new_batch = []
        for b in batch:
            x, c, g, l = b
            self._assert_ready_for_upsample(x, c)
            if max_time_steps is not None:
                # Clamp the crop length to a multiple of the hop size.
                max_steps = _ensure_divisible(max_time_steps, self._hparams.hop_size, True)
                if len(x) > max_time_steps:
                    max_time_frames = max_steps // self._hparams.hop_size
                    start = np.random.randint(0, len(c) - max_time_frames)
                    time_start = start * self._hparams.hop_size
                    x = x[time_start:time_start + max_time_frames * self._hparams.hop_size]
                    c = c[start:start + max_time_frames, :]
                    self._assert_ready_for_upsample(x, c)
            new_batch.append((x, c, g, l))
        return new_batch
    else:
        new_batch = []
        for b in batch:
            x, c, g, l = b
            x, _ = trim(x)  # librosa's trim returns (trimmed_signal, index)
            if max_time_steps is not None and len(x) > max_time_steps:
                # Crop a random window from the trimmed audio itself.
                start = np.random.randint(0, len(x) - max_time_steps)
                x = x[start:start + max_time_steps]
            new_batch.append((x, c, g, l))
        return new_batch
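# `_ensure_divisible` is referenced above but not shown. A minimal sketch of what it
# presumably does (an assumption, not the source's definition): round `length` to a
# multiple of `divisible_by`, downward when `lower` is True.
def _ensure_divisible(length, divisible_by=256, lower=True):
    if length % divisible_by == 0:
        return length
    if lower:
        return length - length % divisible_by
    return length + (divisible_by - length % divisible_by)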
def extract_features(audio, rate):
    audio = reduce_noise_power(audio, rate)
    audio, indexes = trim(audio)
    mfcc_feature = mfcc(y=audio, sr=rate, n_mfcc=13, n_fft=int(0.025 * rate),
                        n_mels=40, fmin=20, hop_length=int(0.03 * rate))
    mfcc_feature = preprocessing.scale(mfcc_feature, axis=1)
    mfcc_feature = stats.zscore(mfcc_feature)
    pitches, magnitudes = pitch(y=audio, sr=rate, fmin=50, fmax=400,
                                n_fft=int(0.025 * rate),
                                hop_length=int(0.03 * rate))
    # delta_f = delta(mfcc_feature)
    # d_delta_f = delta(mfcc_feature, order=2)
    combined = np.hstack((np.transpose(mfcc_feature), np.transpose(pitches)))
    return combined
def trim(examples: Sequence[EmplacableExample], top_db: int = 40):
    return [
        ex.emplaced_audio_data(
            torch.from_numpy(
                effects.trim(ex.audio_data.cpu().numpy(), top_db=top_db)[0]))
        for ex in examples
    ]
def Prepare(data):
    # Collapse stereo (n, 2) input to its first channel.
    T = data.T
    if np.shape(T)[0] == 2:
        data = T[0]
    data = trim(data, top_db=30)[0]
    return data
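# Usage sketch for Prepare(): soundfile.read returns (frames, channels) arrays,
# which matches the (n, 2) stereo layout Prepare() handles. The file name is
# hypothetical.
import soundfile as sf

data, sr = sf.read("clip.wav")
mono_trimmed = Prepare(data)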
def load_wav(cls, file_path: str, is_trim: bool) -> np.ndarray:
    """Load waveform."""
    wav = load(file_path, sr=cls.sample_rate)[0]
    if is_trim:
        wav = trim(wav, top_db=cls.top_db)[0]
    wav = np.clip(wav, -1.0 + 1e-6, 1.0 - 1e-6)
    return wav
def load_wav(cls, file_path: str, is_trim: bool) -> np.ndarray:
    """Load waveform."""
    wav, _ = load(file_path, sr=cls.sample_rate, mono=True)
    # Trimming
    if is_trim:
        wav, _ = trim(wav)
    return wav
def load_wav(cls, file_path: str, is_trim: bool) -> np.ndarray:
    """Load waveform."""
    wav, _ = load(file_path, sr=cls.sample_rate)
    wav = wav / (np.abs(wav).max() + 1e-6)
    if is_trim:
        wav, _ = trim(wav, top_db=cls.top_db)
    return wav
def load_wav(cls, file_path: str, is_trim: bool) -> np.ndarray:
    """Load waveform."""
    wav = load(file_path, sr=cls.sample_rate)[0]
    wav = wav / (np.abs(wav).max() + 1e-6)  # peak-normalize
    if is_trim:
        wav = trim(wav, top_db=cls.top_db)[0]
    wav = filtfilt(*cls.butter_highpass(), wav)  # zero-phase high-pass filter
    wav = wav * 0.96  # leave a little headroom
    return wav
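# `butter_highpass` is used above (and in file2spectrogram below) but not shown.
# A plausible sketch using scipy.signal.butter; the 30 Hz cutoff, order, and
# free-function form are assumptions, not the source's definition.
from scipy.signal import butter

def butter_highpass(cutoff=30, sample_rate=16000, order=5):
    normal_cutoff = cutoff / (0.5 * sample_rate)  # normalize by the Nyquist rate
    return butter(order, normal_cutoff, btype="high")  # (b, a) for filtfilt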
def load_wav_to_torch(full_path: str, sr: Optional[int] = 24000) -> Tuple[torch.Tensor, int]:
    """Load audio file from `full_path`, optionally resampling to `sr`.

    Args:
        full_path (str): path to audio file.
        sr (int, optional): sample rate to resample to.

    Returns:
        (torch.Tensor, sampling_rate)
    """
    data, sampling_rate = load(full_path, sr=sr)
    return torch.from_numpy(trim(data)[0]), sampling_rate
def read_wav(
    fname: str, sr: int, norm: float = 0, pre_emphasis: bool = False
) -> np.ndarray:
    """Read a wave file into a normalized array."""
    (S, _) = librosa.load(fname, sr=sr)
    (S, _) = effects.trim(S)
    if pre_emphasis:
        # In-place first difference (pre-emphasis with coefficient 1.0).
        S[1:] -= S[:-1]
    if norm != 0:  # `is not 0` compares identity, not value
        S = librosa_util.normalize(S, norm=norm)
    return S
def guess(self):
    wav, _ = load(self.WAVE_OUTPUT_FILENAME, sr=self.sr)
    wav, _ = trim(wav, top_db=self.top_db)
    write_wav(self.WAVE_OUTPUT_FILENAME, wav, self.sr)
    print(">> save as", self.WAVE_OUTPUT_FILENAME)
    # DTW recognition
    x = self.getMfcc(wav, self.sr)
    res = self.recognition(x)
    print(res)
    self.audio_num = self.audio_num + 1
    self.WAVE_OUTPUT_FILENAME = "./saved/" + str(self.audio_num) + ".wav"
def __init__(self, file, dim=dim):
    self.directory = file
    self.sound = sa.WaveObject.from_wave_file(self.directory)
    # self.sound = QtCore.QObject.QtMultimedia.QSound(file)
    self._raw, self.rate = load(file)
    trimmed, ind = trim(self._raw, top_db=50)
    # Keep a 30 ms tail after the trim endpoint and force an even sample count.
    if (ind[1] + int(0.03 * self.rate) - ind[0]) % 2:
        self.data = self._raw[ind[0]:ind[1] + int(0.03 * self.rate) - 1]
    else:
        self.data = self._raw[ind[0]:ind[1] + int(0.03 * self.rate)]
    self.seg = floor(self.data.shape[0] / dim)
    self.length = self._raw.shape[0]
def file2spectrogram(cls, file_path):
    """Load audio file and create spectrogram."""
    wav = load(file_path, sr=cls.sample_rate)[0]
    wav = trim(wav, top_db=cls.top_db)[0]
    wav = filtfilt(*cls.butter_highpass(), wav)
    wav = wav * 0.96
    d_mag = cls.short_time_fourier_transform(wav)
    d_mel = np.dot(d_mag.T, cls.mel_basis)
    # Convert to dB (floored at min_level), shift by the reference level, then
    # normalize into [0, 1].
    db_val = 20 * np.log10(np.maximum(cls.min_level, d_mel))
    db_scaled = db_val - cls.ref_db
    db_normalized = (db_scaled + cls.max_db) / cls.max_db
    return np.clip(db_normalized, 0, 1).astype(np.float32)
def trim_signal_length(signal, sample_rate, length=SAMPLE_DURATION):
    # Trim leading and trailing silence
    signal, i = trim(y=signal)
    # Replace zero values with a small value to avoid divide-by-zero errors later on
    signal[signal == 0] = 0.0001
    # Pad or clip the audio to the target duration
    target = int(sample_rate * length)
    signal_length = len(signal)
    # If it's longer, clip the length; otherwise wrap-pad up to the target
    if signal_length > target:
        return signal[0:target]
    else:
        return np.pad(signal, (0, target - signal_length), 'wrap')
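# Usage sketch for trim_signal_length(); the file name is hypothetical and
# SAMPLE_DURATION is assumed to be defined (e.g. 3.5 seconds). Short clips are
# wrap-padded up to the target length, longer ones are clipped.
import librosa

y, sr = librosa.load("clip.wav", sr=16000)
y = trim_signal_length(y, sr)
assert len(y) == int(sr * SAMPLE_DURATION)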
def load_sample(path, upper_bound, lower_bound):
    samples, rate = lc.load(path=path, mono=True, duration=upper_bound)
    samples, _ = le.trim(samples[int(rate * lower_bound):int(rate * upper_bound)],
                         top_db=20)
    return samples, rate
def trim_data(data, threshold):
    trimmed = []
    for record in data:
        # trim function from librosa returns (trimmed_signal, index)
        trimmed.append(trim(record, top_db=threshold)[0])
    return trimmed
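# Usage sketch for trim_data(): trim each recording in a list at a 25 dB
# threshold. File names are hypothetical.
import librosa

records = [librosa.load(p, sr=None)[0] for p in ("a.wav", "b.wav")]
trimmed = trim_data(records, threshold=25)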
# Add the 7-frame buffer back to the audio samples.
samples = int(sr * 7 / fps)  # np.int is removed in NumPy >= 1.20
aud = np.pad(np.float32(aud), (samples, samples), 'constant', constant_values=(0, 0))
if opt == 1 or opt == 2:
    # image is of shape timepts x landmark pts x coordinates, e.g. (28, 68, 2)
    image1 = np.load(vfile)
else:
    # image is of shape timepts x landmark pts x coordinates, e.g. (28, 68, 2)
    video = spio.loadmat(vfile)
    image1 = video['joint_mean']

# PREPROCESSING AUDIO BELOW:
# Remove silence from the audio.
aud, index = effects.trim(np.float32(aud), top_db=freq)
# Compute video frame numbers from the trim indices.
stframe = np.floor(fps * index[0] / sr)
endframe = np.ceil(fps * index[1] / sr)
# Trim the video based on the audio.
image1 = image1[int(stframe):int(endframe), :, :]
image = []
image_aug = []
for tp in range(np.shape(image1)[0]):
    # tp x landmarks x coordinates
    # measurements based on nose center point 33
def audio_to_data(signal, song_id):
    meta = [song_id]
    if config.silence_thr_db:
        signal, _ = trim(signal, top_db=config.silence_thr_db,
                         frame_length=config.fft_bins,
                         hop_length=config.fft_hop_len)
    spec = abs(stft(signal, n_fft=config.fft_bins,
                    hop_length=config.fft_hop_len,
                    win_length=config.fft_window_len))
    # mfccs = mfcc(signal, config.sample_rate, n_mfcc=config.mfcc_bins)
    # chroma = chroma_stft(signal, config.sample_rate, n_fft=config.fft_bins, hop_length=config.fft_hop_len, win_length=config.fft_window_len)  # rows-frequencies cols-times
    # show(specshow(spec, sr=config.sample_rate, hop_length=config.fft_hop_len))
    # show(specshow(mfccs, sr=config.sample_rate, hop_length=config.fft_hop_len))
    # show(plot(chroma))
    spec_mod = deepcopy(spec)
    print('\tmax min initially:', spec_mod.max(), spec_mod.min())
    # Keep only the configured frequency bins (band-pass).
    spec_mod = stack([
        spec_mod[config.frequencies_of_bins.index(i), :]
        for i in config.frequencies_to_pick
    ], 0)
    print('\tmax min after bandpass:', spec_mod.max(), spec_mod.min())
    spec_mod = amplitude_to_db(spec_mod)
    print('\tmax min in db:', spec_mod.max(), spec_mod.min())
    # spec_mod = clip(spec_mod, config.amp_min_thr_db, config.amp_max_thr_db)
    # print('db clipped.')
    if config.zscore_scale:
        mean = spec_mod.mean()
        std = spec_mod.std()
        spec_mod -= mean
        spec_mod /= std
        print('\tmax min after std:', spec_mod.max(), spec_mod.min())
        scale = max(abs(spec_mod.max()), abs(spec_mod.min()))
        spec_mod /= scale
        meta.extend([mean, std, scale])
    elif config.minmax_scale:
        spec_min = spec_mod.min()
        spec_max = spec_mod.max()
        spec_mod -= spec_min
        spec_mod /= spec_max - spec_min
        print('\tmax min after min/max:', spec_mod.max(), spec_mod.min())
        meta.extend([spec_min, spec_max])
    elif config.log_scale:
        spec_mod = log(spec_mod + 1e-10)
        print('\tmax min after log:', spec_mod.max(), spec_mod.min())
    vector = spec_mod
    # vector = concatenate([vector, chroma], 0)
    vector = vector.T  # now first index time, second index frequency
    print('\tfinal vector shape:', vector.shape)
    return vector, meta
def index():
    if request.method == 'POST':
        curr_time = str(int(time.time()))
        data = request.get_json()
        lowerbound = data['lowerbound']
        upperbound = data['upperbound']
        fft_on = data['fft_on']
        audiofile = data['uploaded_audiofile']
        prefix = "data:audio/wav;base64,"  # TODO: handle other audio file formats
        audiofile = audiofile.replace(prefix, "")
        original_filename = "upload_" + curr_time + ".wav"
        try:
            audiofile = base64.b64decode(audiofile)
            with open(original_filename, "wb") as f:
                f.write(audiofile)
        except Exception as e:
            return str(e)
        # Could the samples be decoded directly from the base64 payload without
        # saving the file first?
        samples, rate = lc.load(original_filename)
        samples, _ = le.trim(samples[int(rate * lowerbound):int(rate * upperbound)],
                             top_db=20)
        harmonic_samples, percussive_samples = split_hp(samples)
        response = dict()
        response['original_file'] = original_filename
        response['harmonic_file'] = write_wav(path="h_" + curr_time + ".wav",
                                              y=harmonic_samples, sr=rate)
        response['percussive_file'] = write_wav(path="p_" + curr_time + ".wav",
                                                y=percussive_samples, sr=rate)
        # TODO: return full working-directory paths for these files
        response['original_samples'] = [str(s) for s in samples]
        response['harmonic_samples'] = [str(s) for s in harmonic_samples]
        response['percussive_samples'] = [str(s) for s in percussive_samples]
        # generate_waveplot returns a base64-encoded image of the matplotlib plot
        waveplot_encoding = "data:image/png;base64," + str(
            generate_waveplot(samples, rate))
        response['waveplot'] = waveplot_encoding
        spectrogram_encoding = "data:image/png;base64," + str(
            generate_spectrogram(samples, rate, opt=0))
        response['spectrogram'] = spectrogram_encoding
        h_spectrogram_encoding = "data:image/png;base64," + str(
            generate_spectrogram(harmonic_samples, rate, opt=1))
        response['h_spectrogram'] = h_spectrogram_encoding
        p_spectrogram_encoding = "data:image/png;base64," + str(
            generate_spectrogram(percussive_samples, rate, opt=2))
        response['p_spectrogram'] = p_spectrogram_encoding
        if fft_on:
            fourier_transform = get_fft(samples, rate)
            fftplot_encoding = "data:image/png;base64," + str(
                generate_fft_plot(fourier_transform['x'], fourier_transform['y']))
            response['fftplot'] = fftplot_encoding
        return jsonify(response)  # dict of lists
    return render_template("index.html")