def __test(hz, resolution, bins_per_octave, tuning):
    """Assert that ``librosa.pitch_tuning`` recovers a known tuning offset.

    Parameters
    ----------
    hz : frequencies handed to ``librosa.pitch_tuning``.
    resolution : histogram resolution passed through to the estimator; also
        the largest absolute error that is tolerated.
    bins_per_octave : passed through to the estimator.
    tuning : the expected tuning offset.

    Raises
    ------
    AssertionError
        If the estimate is further than ``resolution`` from ``tuning``.
    """
    estimate = librosa.pitch_tuning(
        hz,
        resolution=resolution,
        bins_per_octave=bins_per_octave,
    )
    error = np.abs(tuning - estimate)
    assert error <= resolution
def lpc_emotion_upload():
    """Classify ``pickles/catalyst.wav`` with the pickled LPC emotion model.

    A one-row feature frame is built from the recording (the LPC
    coefficients as ``LIB_LPC<n>`` followed by the estimated tuning offset
    as ``pitch``) and passed to the classifier stored in
    ``pickles/lpc_model.sav``.

    Returns
    -------
    tuple
        ``(prediction, fig)``: the output of ``predict`` for the single row,
        and a plotly donut chart of the ``predict_proba`` class
        probabilities.
    """
    wav_path = 'pickles/catalyst.wav'
    # Decode the recording once. It used to be decoded three times, and the
    # 44.1 kHz copy was never read.
    y, sr = librosa.load(wav_path)

    entry = {}
    # `order` (and `y`/`sr` below) are keyword-only in librosa >= 0.10;
    # the keyword form also works on older releases.
    for no, coefficient in enumerate(librosa.lpc(y, order=5)):
        entry['LIB_LPC{0}'.format(no)] = coefficient

    pitches, magnitudes = librosa.core.piptrack(y=y, sr=sr)
    # Select out pitches with high energy
    pitches = pitches[magnitudes > np.median(magnitudes)]
    entry['pitch'] = librosa.pitch_tuning(pitches)

    wav_df = pd.DataFrame([entry])

    # NOTE(review): joblib.load unpickles arbitrary objects -- only load
    # model files that come from a trusted location.
    lpc_clf = joblib.load('pickles/lpc_model.sav')

    # One column ("values") holding the probability of every class.
    bar_t = pd.DataFrame(lpc_clf.predict_proba(wav_df),
                         columns=lpc_clf.classes_).T
    bar_t.columns = ['values']
    fig = go.Figure(data=[
        go.Pie(labels=lpc_clf.classes_, values=bar_t['values'], hole=.3),
    ])
    return lpc_clf.predict(wav_df), fig
def get_wav_df(self):
    """Build the LPC feature table for every ``.wav`` file in ``self.wav_dir``.

    Each file contributes one row with the columns ``Session`` (the file
    name), ``LIB_LPC<n>`` (LPC coefficients) and ``pitch`` (estimated tuning
    offset of the high-energy pitches). Files without a ``.wav`` suffix are
    ignored.

    Returns
    -------
    pandas.DataFrame
        One row per file, in ``os.listdir`` order; empty when the directory
        contains no ``.wav`` file.
    """
    wav_files = []
    for wav in os.listdir(self.wav_dir):
        if not wav.endswith('.wav'):
            continue

        # Decode once per file. The file used to be read three times
        # (scipy + librosa twice) and the scipy result was never used.
        y, sr = librosa.load(os.path.join(self.wav_dir, wav))

        entry = {'Session': wav}
        # `order` (and `y`/`sr` below) are keyword-only in librosa >= 0.10.
        for no, coefficient in enumerate(librosa.lpc(y, order=5)):
            entry['LIB_LPC{0}'.format(no)] = coefficient

        pitches, magnitudes = librosa.core.piptrack(y=y, sr=sr)
        # Select out pitches with high energy
        pitches = pitches[magnitudes > np.median(magnitudes)]
        entry['pitch'] = librosa.pitch_tuning(pitches)

        wav_files.append(entry)

    return pd.DataFrame(wav_files)
def mfcc_emotion_upload():
    """Classify ``pickles/catalyst.wav`` with the pickled MFCC emotion model.

    The one-row feature frame contains, in this order: ``Mean_RMS``,
    ``STD_RMS``, six statistics per MFCC coefficient (mean/std of the MFCC,
    of its second-order delta and of its first-order delta) and ``pitch``
    (the estimated tuning offset). RMS and pitch are computed on the signal
    at librosa's default sample rate, the MFCCs on a 44.1 kHz decode.

    Returns
    -------
    tuple
        ``(prediction, fig)``: the output of ``predict`` for the single row,
        and a plotly donut chart of the ``predict_proba`` class
        probabilities.
    """
    SAMPLE_RATE = 44100
    wav_path = 'pickles/catalyst.wav'

    # Two decodes are needed on purpose: the features use different rates.
    b, _ = librosa.load(wav_path, sr=SAMPLE_RATE)
    y, sr = librosa.load(wav_path)

    entry = {}
    rms = librosa.feature.rms(y=y)  # computed once, used for both statistics
    entry['Mean_RMS'] = np.mean(rms)
    entry['STD_RMS'] = np.std(rms)

    # `y` is keyword-only in librosa >= 0.10; the keyword form also works on
    # older releases.
    mfcc_feature = librosa.feature.mfcc(y=b, sr=SAMPLE_RATE, n_mfcc=20)
    delta_mfcc = librosa.feature.delta(mfcc_feature)
    d_delta_mfcc = librosa.feature.delta(mfcc_feature, order=2)

    mean_mfcc = np.mean(mfcc_feature, axis=1)
    # NOTE(review): STD_MFCC is filled with np.mean, unlike the delta
    # variants which use np.std -- this looks like a copy-paste slip. It is
    # kept deliberately: the pickled model loaded below consumes this exact
    # column, so it should only change together with a retrained model.
    std_mfcc = np.mean(mfcc_feature, axis=1)
    mean_ddmfcc = np.mean(d_delta_mfcc, axis=1)
    std_ddmfcc = np.std(d_delta_mfcc, axis=1)
    mean_dmfcc = np.mean(delta_mfcc, axis=1)
    std_dmfcc = np.std(delta_mfcc, axis=1)

    # Key insertion order defines the column order of the frame.
    for no in range(len(mean_dmfcc)):
        entry['Mean_MFCC{0}'.format(no)] = mean_mfcc[no]
        entry['STD_MFCC{0}'.format(no)] = std_mfcc[no]
        entry['Mean_DDMFCC{0}'.format(no)] = mean_ddmfcc[no]
        entry['STD_DDMFCC{0}'.format(no)] = std_ddmfcc[no]
        entry['Mean_Delta_MFCC{0}'.format(no)] = mean_dmfcc[no]
        entry['STD_Delta_MFCC{0}'.format(no)] = std_dmfcc[no]

    pitches, magnitudes = librosa.core.piptrack(y=y, sr=sr)
    # Select out pitches with high energy
    pitches = pitches[magnitudes > np.median(magnitudes)]
    entry['pitch'] = librosa.pitch_tuning(pitches)

    wav_df = pd.DataFrame([entry])

    # NOTE(review): joblib.load unpickles arbitrary objects -- only load
    # model files that come from a trusted location.
    mfcc_clf = joblib.load('pickles/mfcc_model.sav')

    # One column ("values") holding the probability of every class.
    bar_t = pd.DataFrame(mfcc_clf.predict_proba(wav_df),
                         columns=mfcc_clf.classes_).T
    bar_t.columns = ['values']
    fig = go.Figure(data=[
        go.Pie(labels=mfcc_clf.classes_, values=bar_t['values'], hole=.3),
    ])
    return mfcc_clf.predict(wav_df), fig
def get_wav_df(self):
    """Build the MFCC feature table for every ``.wav`` file in ``self.wav_dir``.

    Each file contributes one row with, in this order: ``Session`` (the file
    name), ``Mean_RMS``, ``STD_RMS``, six statistics per MFCC coefficient
    (mean/std of the MFCC, of its second-order delta and of its first-order
    delta) and ``pitch`` (the estimated tuning offset). RMS and pitch are
    computed at librosa's default sample rate, the MFCCs on a 44.1 kHz
    decode. Files without a ``.wav`` suffix are ignored.

    Returns
    -------
    pandas.DataFrame
        One row per file, in ``os.listdir`` order; empty when the directory
        contains no ``.wav`` file.
    """
    SAMPLE_RATE = 44100
    wav_files = []
    for wav in os.listdir(self.wav_dir):
        if not wav.endswith('.wav'):
            continue
        wav_path = os.path.join(self.wav_dir, wav)

        # Two decodes on purpose (different rates); the third, identical
        # default-rate decode of the original has been dropped.
        b, _ = librosa.load(wav_path, sr=SAMPLE_RATE)
        y, sr = librosa.load(wav_path)

        entry = {'Session': wav}
        rms = librosa.feature.rms(y=y)  # computed once for both statistics
        entry['Mean_RMS'] = np.mean(rms)
        entry['STD_RMS'] = np.std(rms)

        # `y` is keyword-only in librosa >= 0.10; the keyword form also
        # works on older releases.
        mfcc_feature = librosa.feature.mfcc(y=b, sr=SAMPLE_RATE, n_mfcc=20)
        delta_mfcc = librosa.feature.delta(mfcc_feature)
        d_delta_mfcc = librosa.feature.delta(mfcc_feature, order=2)

        mean_mfcc = np.mean(mfcc_feature, axis=1)
        # NOTE(review): STD_MFCC is filled with np.mean, unlike the delta
        # variants which use np.std -- this looks like a copy-paste slip.
        # Kept as-is because changing it alters the STD_MFCC* columns that
        # consumers of this frame (e.g. already-trained models) presumably
        # rely on; fix it together with them.
        std_mfcc = np.mean(mfcc_feature, axis=1)
        mean_ddmfcc = np.mean(d_delta_mfcc, axis=1)
        std_ddmfcc = np.std(d_delta_mfcc, axis=1)
        mean_dmfcc = np.mean(delta_mfcc, axis=1)
        std_dmfcc = np.std(delta_mfcc, axis=1)

        # Key insertion order defines the column order of the frame.
        for no in range(len(mean_dmfcc)):
            entry['Mean_MFCC{0}'.format(no)] = mean_mfcc[no]
            entry['STD_MFCC{0}'.format(no)] = std_mfcc[no]
            entry['Mean_DDMFCC{0}'.format(no)] = mean_ddmfcc[no]
            entry['STD_DDMFCC{0}'.format(no)] = std_ddmfcc[no]
            entry['Mean_Delta_MFCC{0}'.format(no)] = mean_dmfcc[no]
            entry['STD_Delta_MFCC{0}'.format(no)] = std_dmfcc[no]

        pitches, magnitudes = librosa.core.piptrack(y=y, sr=sr)
        # Select out pitches with high energy
        pitches = pitches[magnitudes > np.median(magnitudes)]
        entry['pitch'] = librosa.pitch_tuning(pitches)

        wav_files.append(entry)

    return pd.DataFrame(wav_files)
def estimate_a4(pitches, sr):
    """Estimate the frequency of A4 from a matrix of tracked pitches.

    Every row of ``pitches`` is scanned for runs of consecutive non-zero
    entries. A run is kept when it spans at least ``min_pitch_frame``
    columns and its first frequency is below ``max_freq`` (both are
    module-level settings). The kept frequencies are passed to
    ``librosa.pitch_tuning`` and the resulting offset is applied to 440 Hz.

    Parameters
    ----------
    pitches : 2-D array indexed ``[row, column]``; ``0`` means "no pitch".
    sr : sample rate. Not needed for the estimate any more; kept so that
        existing callers keep working.

    Returns
    -------
    float
        ``440 * 2 ** (offset / 12)`` where ``offset`` is the estimated
        tuning deviation.
    """
    pitches_sel = []

    def keep_if_sustained(run):
        # A finished run only counts when it is long enough and starts
        # below the frequency ceiling.
        if run and len(run) >= min_pitch_frame and run[0] < max_freq:
            pitches_sel.extend(run)

    # Pick out pitches that last longer than `min_pitch_frame`
    for row in range(pitches.shape[0]):
        run = []
        for col in range(pitches.shape[1]):
            value = pitches[row, col]
            if value != 0:
                run.append(value)
            else:
                keep_if_sustained(run)
                run = []
        # A run that reaches the last column has no terminating zero; the
        # original code silently dropped it.
        keep_if_sustained(run)

    offset_to_a4 = librosa.pitch_tuning(pitches_sel)
    return 440 * (2**(offset_to_a4 / 12))
def run(self):
    """Consume audio frames and report tuning changes until terminated.

    Frames taken from ``self.audio_frames`` are accumulated in
    ``self.frames``. Once ``self.counter`` has reached ``BUFFER_SIZE`` the
    buffer is analysed: the tuning offset of the high-energy pitches is
    mapped to an integer via ``int(50 + 100 * offset)`` and, if it differs
    from ``self.last_pitch`` by more than ``PITCH_CHANGE_THRESHOLD``, it is
    stored and forwarded to ``self.manager.new_tuning``. The buffer then
    starts over with the next frame.
    """
    logging.info("Starting Pitch detector")
    # NOTE: `self.terminated` is only re-checked after a frame has arrived,
    # because `get()` below blocks until one is available.
    while not self.terminated:
        new_frame = self.audio_frames.get()  # Get new frame (blocking)

        if self.counter == 0:
            # First frame of a new buffer replaces the previous buffer.
            self.frames = new_frame
            self.counter += 1
            continue

        self.frames = np.append(self.frames, new_frame)
        if self.counter < BUFFER_SIZE:
            self.counter += 1
            continue

        # Buffer is full: analyse it. `y`/`sr` are keyword-only in
        # librosa >= 0.10; the keyword form also works on older releases.
        pitches, magnitudes = librosa.piptrack(y=self.frames, sr=SAMPLE_RATE)
        # Select out pitches with high energy
        pitches = pitches[magnitudes > np.median(magnitudes)]
        new_tuning = int(50 + 100 * librosa.pitch_tuning(pitches))
        if np.abs(self.last_pitch - new_tuning) > PITCH_CHANGE_THRESHOLD:
            self.last_pitch = new_tuning
            self.manager.new_tuning(new_tuning)
        self.counter = 0
def features(X, sample_rate):
    """Extract a flat vector of acoustic features from an audio signal.

    Parameters
    ----------
    X : audio samples, as accepted by ``librosa.stft``.
    sample_rate : sample rate of ``X``.

    Returns
    -------
    numpy.ndarray
        1-D vector: 15 scalar statistics (flatness, zero-crossing rate,
        magnitude mean/max, centroid mean/std/max, magnitude std, pitch
        mean/max/std, tuning offset, RMS mean/max/std) followed by the
        per-coefficient MFCC mean, std and max (50 coefficients each) and
        the time-averaged chroma, mel-spectrogram and spectral-contrast
        rows.
    """
    stft = np.abs(librosa.stft(X))

    # fmin and fmax correspond to the lowest and highest fundamental
    # frequency of human speech.
    # (`y` is keyword-only in librosa >= 0.10.)
    pitches, magnitudes = librosa.piptrack(y=X, sr=sample_rate, S=stft,
                                           fmin=70, fmax=400)
    # Per frame, take the pitch of the bin with the largest magnitude.
    # The original looked up column 1 for every frame (``magnitudes[:, 1]``)
    # instead of the current one, which also crashed on one-frame input.
    strongest_bin = magnitudes.argmax(axis=0)
    pitch = pitches[strongest_bin, np.arange(pitches.shape[1])]

    pitch_tuning_offset = librosa.pitch_tuning(pitches)
    pitchmean = np.mean(pitch)
    pitchstd = np.std(pitch)
    pitchmax = np.max(pitch)

    # Spectral centroid, normalised to sum to one
    cent = librosa.feature.spectral_centroid(y=X, sr=sample_rate)
    cent = cent / np.sum(cent)
    meancent = np.mean(cent)
    stdcent = np.std(cent)
    maxcent = np.max(cent)

    # Spectral flatness
    flatness = np.mean(librosa.feature.spectral_flatness(y=X))

    # MFCC features with 50 coefficients (computed once, reused three times)
    mfcc_frames = librosa.feature.mfcc(y=X, sr=sample_rate, n_mfcc=50).T
    mfccs = np.mean(mfcc_frames, axis=0)
    mfccsstd = np.std(mfcc_frames, axis=0)
    mfccmax = np.max(mfcc_frames, axis=0)

    # Chromagram
    chroma = np.mean(librosa.feature.chroma_stft(S=stft, sr=sample_rate).T, axis=0)

    # Mel spectrogram
    mel = np.mean(librosa.feature.melspectrogram(y=X, sr=sample_rate).T, axis=0)

    # Spectral (octave-band) contrast
    contrast = np.mean(librosa.feature.spectral_contrast(S=stft, sr=sample_rate).T, axis=0)

    # Zero-crossing rate
    zerocr = np.mean(librosa.feature.zero_crossing_rate(X))

    S, _ = librosa.magphase(stft)
    meanMagnitude = np.mean(S)
    stdMagnitude = np.std(S)
    maxMagnitude = np.max(S)

    # Root-mean-square energy. `librosa.feature.rmse` was renamed to `rms`
    # and the old name no longer exists in current librosa releases.
    rmse = librosa.feature.rms(S=S)[0]
    meanrms = np.mean(rmse)
    stdrms = np.std(rmse)
    maxrms = np.max(rmse)

    ext_features = np.array([
        flatness, zerocr, meanMagnitude, maxMagnitude, meancent, stdcent,
        maxcent, stdMagnitude, pitchmean, pitchmax, pitchstd,
        pitch_tuning_offset, meanrms, maxrms, stdrms
    ])

    ext_features = np.concatenate(
        (ext_features, mfccs, mfccsstd, mfccmax, chroma, mel, contrast))

    return ext_features
# Scratch script: inspect the tuning estimate of one short excerpt of a
# recording and prepare per-onset (offset, duration) pairs.
import sys  # for sys.maxsize in set_printoptions below

filename, rhythm_code, pitch_code = (
    'F:/项目/花城音乐项目/样式数据/9.08MP3/旋律/xx3.wav',
    '[2000;250,250,250,250,1000;2000;500,500,1000]',
    '[6,5,6,3,5,6,3,2,1,6-]',
)

# Excerpt under test. Other excerpts that were tried, with the value noted
# next to them in the original:
#   offset=0.75 (was loaded first and immediately overwritten)
#   offset=1.3  -> 0.160
#   offset=2.6  -> -0.48
#   offset=2.8  -> -0.169
#   offset=3, offset=3.3, offset=3.55
y, sr = librosa.load(filename, offset=1.1, duration=0.2)  # -0.029

pitches, magnitudes = librosa.core.piptrack(y=y, sr=sr)

# Print arrays in full. `threshold=np.nan` is rejected by current NumPy
# ("threshold must be non-NAN"); sys.maxsize is the documented replacement.
np.set_printoptions(threshold=sys.maxsize)
print(pitches[np.nonzero(pitches)])

# Tuning estimated from the high-energy pitches only ...
pitches = pitches[magnitudes > np.median(magnitudes)]
p = librosa.pitch_tuning(pitches)
print(p)

# ... and from the raw signal, for comparison.
tun = librosa.estimate_tuning(y=y, sr=sr)
print(tun)

onset_frames_time = [
    0.7662585, 1.27709751, 2.80961451, 3.0185941, 3.29723356, 3.57587302,
    3.80807256, 4.80653061, 7.2678458, 7.70902494
]
# Gap to the following onset; the last onset gets a fixed 0.2.
onset_frames_time_diff = list(np.diff(onset_frames_time))
onset_frames_time_diff.append(0.2)

for i, o in enumerate(onset_frames_time):
    offset = round(o, 2)
    duration = round(onset_frames_time_diff[i], 2)
def probabilities(y, note_min, note_max, sr, frame_length, window_length,
                  hop_length, pitch_acc, voiced_acc, onset_acc, spread):
    """
    Estimate prior (observed) probabilities from audio signal

    Parameters
    ----------
    y : 1-D numpy array
        Array containing audio samples
    note_min : string, 'A#4' format
        Lowest note supported by this estimator
    note_max : string, 'A#4' format
        Highest note supported by this estimator
    sr : int
        Sample rate.
    frame_length : int
    window_length : int
    hop_length : int
        Parameters for FFT estimation
    pitch_acc : float, between 0 and 1
        Probability (estimated) that the pitch estimator is correct.
    voiced_acc : float, between 0 and 1
        Estimated accuracy of the "voiced" parameter.
    onset_acc : float, between 0 and 1
        Estimated accuracy of the onset detector.
    spread : float, between 0 and 1
        Probability that the singer/musician had a one-semitone deviation
        due to vibrato or glissando.

    Returns
    -------
    P : 2D numpy array of shape ``(2 * n_notes + 1, n_frames)``.
        P[j,t] is the prior probability of being in state j at time t.
        Row 0 is silence; for note ``k`` (counted from ``note_min``) row
        ``2k + 1`` is its onset state and row ``2k + 2`` its sustain state.
    """
    fmin = librosa.note_to_hz(note_min)
    fmax = librosa.note_to_hz(note_max)
    midi_min = librosa.note_to_midi(note_min)
    midi_max = librosa.note_to_midi(note_max)
    n_notes = midi_max - midi_min + 1

    # F0 and voicing. Everything after `y` is passed by keyword: these
    # arguments are keyword-only in librosa >= 0.10.
    f0, voiced_flag, _ = librosa.pyin(
        y, fmin=fmin * 0.9, fmax=fmax * 1.1, sr=sr,
        frame_length=frame_length, win_length=window_length,
        hop_length=hop_length)

    # The tuning offset is a fraction of a semitone, so it has to be removed
    # on the MIDI scale (the original subtracted it from the frequency in
    # Hz). The result stays floating point on purpose: frames without an f0
    # estimate (NaN by default in pyin) then simply match no note instead of
    # going through an undefined NaN -> int cast.
    tuning = librosa.pitch_tuning(f0)
    midi_est = np.round(librosa.hz_to_midi(f0) - tuning)

    onsets = librosa.onset.onset_detect(
        y=y, sr=sr, hop_length=hop_length, backtrack=True)

    n_frames = len(f0)
    P = np.ones((n_notes * 2 + 1, n_frames))

    # Silence state: likely when the frame is unvoiced, unlikely otherwise.
    voiced = np.asarray(voiced_flag, dtype=bool)
    P[0, :] = np.where(voiced, 1 - voiced_acc, voiced_acc)

    # Onset states (odd rows): the same value for every note in a frame.
    is_onset = np.isin(np.arange(n_frames), onsets)
    P[1::2, :] = np.where(is_onset, onset_acc, 1 - onset_acc)

    # Sustain states (even rows from 2): exact match, one semitone off, or
    # anything else.
    note_numbers = np.arange(midi_min, midi_max + 1)
    distance = np.abs(note_numbers[:, np.newaxis] - midi_est[np.newaxis, :])
    P[2::2, :] = np.where(
        distance == 0, pitch_acc,
        np.where(distance == 1, pitch_acc * spread, 1 - pitch_acc))

    return P
def _ask_choice(question, choices):
    """Prompt with `question` until the lower-cased reply is in `choices`; return that reply."""
    while True:
        print(Style.RESET_ALL + Fore.CYAN + Style.BRIGHT + question +
              Style.RESET_ALL, end='')
        reply = input().lower()
        if reply in choices:
            return reply


def _a4_from_notes(notes, tunes):
    """Return one A4 estimate per row of `notes` that is found in the pitch lines `tunes`."""
    # Normalize note names (like Bb to A#)
    note_names = librosa.hz_to_note(
        librosa.note_to_hz([row[0] for row in notes]))

    # Filter pitches: group the pitch lines by the note of their mean
    # frequency, keeping only the notes the user asked for.
    tunes_match = {note_name: [] for note_name in note_names}
    for time_seq, freq_seq in tunes:
        note_name = librosa.hz_to_note(np.mean(freq_seq))
        if note_name in tunes_match:
            tunes_match[note_name].append((time_seq, freq_seq))

    # Calculate A4 frequencies from the notes
    a4s = []
    for row, note_name in zip(notes, note_names):
        start, end = row[1], row[2]
        n_frame = 0
        freq_sum = 0
        for time_seq, freq_seq in tunes_match[note_name]:
            for time, freq in zip(time_seq, freq_seq):
                if start <= time <= end:
                    n_frame += 1
                    freq_sum += freq
        if n_frame == 0:
            print(Fore.RED + 'Warning: ' + Style.RESET_ALL +
                  'note `%s` not found, skipping...' % row[0])
            continue
        freq_avg = freq_sum / n_frame
        offset_to_a4 = librosa.pitch_tuning(freq_avg)
        a4s.append(440 * (2**(offset_to_a4 / 12)))
    return a4s


def main():
    """Command-line entry point of the concert pitch analyzer.

    With ``--silent`` the file is handed straight to ``auto_process``.
    Otherwise the user is walked through an interactive session: a
    spectrogram is shown, then A4 is estimated either from a time range
    (mode 1) or from notes and their start/end times entered by the user
    (mode 2). Both modes can be repeated until the user answers ``n``.
    """
    # Enable colored output
    colorama.init()

    parser = argparse.ArgumentParser(
        description=
        'This tool can analyze audio files and estimate the frequecy of A4.')
    parser.add_argument("filename")
    parser.add_argument('-s', '--silent', action="store_true", dest='silent',
                        help='process the given audio file silently')
    parser.add_argument('-o', '--offset', dest='offset', type=float,
                        default=0,
                        help='the offset of the audio to process, default 0')
    parser.add_argument(
        '-d', '--duration', dest='duration', type=float,
        help=
        'the duration of the audio to process. It will process to the end if the argument is not used'
    )
    args = parser.parse_args()

    if args.silent:
        auto_process(args.filename, args.offset, args.duration)
        return

    print(Fore.YELLOW + Back.RED + Style.BRIGHT +
          'Welcome to RainEggplant\'s concert pitch analyzer!\n' +
          Style.RESET_ALL)
    print(Fore.YELLOW + Style.BRIGHT +
          'Please follow the instructions to get the result:' +
          Style.RESET_ALL)
    print(
        Fore.YELLOW + Style.BRIGHT + '[1] ' + Style.RESET_ALL +
        'We are going to generate a spectrogram with pitch lines.\n' +
        'After the window pops up, please come back to watch the instructions.'
    )
    print(Fore.CYAN + Style.BRIGHT + "Press Enter to continue... " +
          Style.RESET_ALL, end='')
    input()
    print('This may take serveral seconds, please wait.\n')
    tunes = show_spectrogram(args.filename, args.offset, args.duration)

    print(
        Fore.YELLOW + Style.BRIGHT + '[2] ' + Style.RESET_ALL +
        'Now you have seen the spectrogram.\n' +
        '- The green lines are peak frequency of that location.\n' +
        '- The white vertical lines divide the spectrogram into serveral fragments, according to pitch and volume changes.\n'
        +
        ' They are labeled with index. If the labels are overlapped, you can zoom in to seperate them.\n'
        '- You can also use the tools in the tool bar to zoom, drag, save, etc.\n'
        +
        '- The time, frequency and note (relative to A4=440Hz) which you are pointing at will be shown in the status bar.\n'
    )
    print(
        Fore.YELLOW + Style.BRIGHT + '[3] ' + Style.RESET_ALL +
        'After you inspect the spectrogram, you need to decide whether the data is suitable for analyzing or not.\n'
        +
        'If not suitable, re-run the program with different offset, duration or filename.'
    )
    # The reply is lower-cased by the helper, so "N" aborts as well (the
    # original accepted "N" at the prompt but then carried on).
    if _ask_choice('Process current data? (y/n) ', ('y', 'n')) == 'n':
        return

    print(
        '\n' + Fore.YELLOW + Style.BRIGHT + '[4] ' + Style.RESET_ALL +
        'Now you need to select the range of the audio file for analyzing. There are two modes:\n'
        +
        '\t1. Give start and end time, and let the program analyze automatically (similar to `silent mode`).\n'
        + '\t2. [Pro] Give notes and their sustaining time.\n' +
        '\t This mode will give you a more accurate and specific result.'
    )
    mode = _ask_choice('Select mode: (1/2) ', ('1', '2'))

    if mode == '1':
        print(
            '\n' + Fore.YELLOW + Style.BRIGHT + '[5] ' + Style.RESET_ALL +
            'Now enter the start and end time according to the spectrogram:'
        )
        # TODO: Add validation.
        while True:
            start_time = float(input('start time: '))
            end_time = float(input('end time: '))
            # Colour whatever auto_process prints.
            print(Fore.YELLOW + Style.BRIGHT, end='')
            auto_process(args.filename, args.offset + start_time,
                         end_time - start_time)
            print()

            # Re-estimate using another range
            if _ask_choice('Re-estimate using another range? (y/n) ',
                           ('y', 'n')) == 'n':
                break
        return

    print(
        '\n' + Fore.YELLOW + Style.BRIGHT + '[5] ' + Style.RESET_ALL +
        'Now add the note, its start time and end time according to the pitch lines.\n'
        + ' Format: NOTENAME STARTTIME ENDTIME (e.g. "A4 1.2 2")\n' +
        'Enter `q` to stop adding.')
    while True:
        notes = []
        # TODO: Add validation.
        while True:
            input_msg = input('+ ')
            if input_msg.lower() == 'q':
                break
            notes.append(input_msg.split())

        for row in notes:
            row[1] = float(row[1])
            row[2] = float(row[2])

        a4s = _a4_from_notes(notes, tunes) if notes else []

        if a4s:
            print(
                Fore.YELLOW + Style.BRIGHT +
                'The estimated frequencies of A4 from each note are:\n\t',
                end='')
            print(['{:.1f}'.format(i) for i in a4s])
            print('Average estimated frequency: {:.1f} Hz, '.format(
                np.mean(a4s)) +
                  'median frequency: {:.1f} Hz, '.format(np.median(a4s)) +
                  'standard deviation: {:.1f} Hz.\n'.format(np.std(a4s)) +
                  Style.RESET_ALL)
        else:
            # Nothing to average: avoid printing NaN statistics.
            print(Fore.RED + 'Warning: ' + Style.RESET_ALL +
                  'no usable notes, nothing to estimate.\n')

        # Re-estimate
        if _ask_choice('Re-estimate using different notes? (y/n) ',
                       ('y', 'n')) == 'n':
            break
        print()
def features(X, sample_rate: float) -> np.ndarray:
    """Extract a flat vector of acoustic features from an audio signal.

    Parameters
    ----------
    X : audio samples, as accepted by ``librosa.stft``.
    sample_rate : sample rate of ``X``.

    Returns
    -------
    numpy.ndarray
        1-D vector: 15 scalar statistics (flatness, zero-crossing rate,
        magnitude mean/max, centroid mean/std/max, magnitude std, pitch
        mean/max/std, tuning offset, RMS mean/max/std) followed by the
        per-coefficient MFCC mean, std and max (50 coefficients each) and
        the time-averaged chroma, mel-spectrogram and spectral-contrast
        rows.
    """
    stft = np.abs(librosa.stft(X))

    # fmin and fmax correspond to the minimum and the maximum basic
    # frequency of human speech.
    # (`y` is keyword-only in librosa >= 0.10.)
    pitches, magnitudes = librosa.piptrack(y=X, sr=sample_rate, S=stft,
                                           fmin=70, fmax=400)
    # Per frame, take the pitch of the bin with the largest magnitude.
    # The original looked up column 1 for every frame (``magnitudes[:, 1]``)
    # instead of the current one, which also crashed on one-frame input.
    strongest_bin = magnitudes.argmax(axis=0)
    pitch = pitches[strongest_bin, np.arange(pitches.shape[1])]

    pitch_tuning_offset = librosa.pitch_tuning(pitches)
    pitchmean = np.mean(pitch)
    pitchstd = np.std(pitch)
    pitchmax = np.max(pitch)

    # Spectral centroids, normalised to sum to one
    cent = librosa.feature.spectral_centroid(y=X, sr=sample_rate)
    cent = cent / np.sum(cent)
    meancent = np.mean(cent)
    stdcent = np.std(cent)
    maxcent = np.max(cent)

    # Spectral flatness
    flatness = np.mean(librosa.feature.spectral_flatness(y=X))

    # The MFCC feature with 50 coefficients (computed once, reused three
    # times)
    mfcc_frames = librosa.feature.mfcc(y=X, sr=sample_rate, n_mfcc=50).T
    mfccs = np.mean(mfcc_frames, axis=0)
    mfccsstd = np.std(mfcc_frames, axis=0)
    mfccmax = np.max(mfcc_frames, axis=0)

    # Chromagram
    chroma = np.mean(librosa.feature.chroma_stft(S=stft, sr=sample_rate).T, axis=0)

    # Mel frequency
    mel = np.mean(librosa.feature.melspectrogram(y=X, sr=sample_rate).T, axis=0)

    # Spectral (octave-band) contrast
    contrast = np.mean(librosa.feature.spectral_contrast(S=stft, sr=sample_rate).T, axis=0)

    # Zero-crossing rate
    zerocr = np.mean(librosa.feature.zero_crossing_rate(X))

    S, _ = librosa.magphase(stft)
    meanMagnitude = np.mean(S)
    stdMagnitude = np.std(S)
    maxMagnitude = np.max(S)

    # RMS energy. `librosa.feature.rmse` was renamed to `rms` and the old
    # name no longer exists in current librosa releases.
    rmse = librosa.feature.rms(S=S)[0]
    meanrms = np.mean(rmse)
    stdrms = np.std(rmse)
    maxrms = np.max(rmse)

    ext_features = np.array([
        flatness, zerocr, meanMagnitude, maxMagnitude, meancent, stdcent,
        maxcent, stdMagnitude, pitchmean, pitchmax, pitchstd,
        pitch_tuning_offset, meanrms, maxrms, stdrms,
    ])

    ext_features = np.concatenate(
        (ext_features, mfccs, mfccsstd, mfccmax, chroma, mel, contrast))

    return ext_features
def rasta_emotion_upload():
    """Classify ``pickles/catalyst.wav`` with the pickled RASTA-PLP emotion model.

    The one-row feature frame contains, in this order: ``Mean_RMS``,
    ``STD_RMS``, six statistics for each of the 13 RASTA-PLP coefficients
    (mean/std of the coefficient, of its second-order delta and of its
    first-order delta) and ``pitch`` (the estimated tuning offset).

    Returns
    -------
    tuple
        ``(prediction, fig)``: the output of ``predict`` for the single row,
        and a plotly donut chart of the ``predict_proba`` class
        probabilities.
    """
    wav_path = 'pickles/catalyst.wav'

    # Decode once. The original also made an unused 44.1 kHz decode and
    # later re-read the file from '/pickles/catalyst.wav' -- an absolute
    # path that does not match the one used everywhere else.
    y, sr = librosa.load(wav_path)

    entry = {}
    rms = librosa.feature.rms(y=y)  # computed once for both statistics
    entry['Mean_RMS'] = np.mean(rms)
    entry['STD_RMS'] = np.std(rms)

    # Raw PCM frames for the PLP front end; the context manager closes the
    # file handle, which used to leak.
    with wave.open(wav_path) as spf:
        signal = spf.readframes(-1)
    # np.fromstring(..., 'Int16') relied on a deprecated binary mode and a
    # removed dtype alias. frombuffer returns a read-only view, hence the
    # copy to keep the array writable as before.
    input_sig = np.frombuffer(signal, dtype=np.int16).copy()

    # NOTE(review): `fs` is the rate librosa resampled to, while `input_sig`
    # holds the file's native frames -- confirm this is intended.
    matrix = plp(input_sig, nwin=0.025, fs=sr, plp_order=13, shift=0.01,
                 get_spec=False, get_mspec=False, prefac=0.97, rasta=True)
    rasta_f_df = pd.DataFrame(matrix[0])

    mean_rastaplp = np.asarray((np.mean(rasta_f_df, axis=0)).tolist())
    std_rastaplp = np.asarray((np.std(rasta_f_df, axis=0)).tolist())
    delta_rastaplp = librosa.feature.delta(rasta_f_df)
    d_delta_rastaplp = librosa.feature.delta(rasta_f_df, order=2)
    mean_ddrastaplp = np.mean(d_delta_rastaplp, axis=0)
    std_ddrastaplp = np.std(d_delta_rastaplp, axis=0)
    mean_drastaplp = np.mean(delta_rastaplp, axis=0)
    std_drastaplp = np.std(delta_rastaplp, axis=0)

    # Key insertion order defines the column order of the frame.
    for no in range(0, 13):
        entry['Mean_RASTAPLP{0}'.format(no)] = mean_rastaplp[no]
        entry['STD_RASTAPLP{0}'.format(no)] = std_rastaplp[no]
        entry['Mean_DDRastaPLP{0}'.format(no)] = mean_ddrastaplp[no]
        entry['STD_DDRastaPLP{0}'.format(no)] = std_ddrastaplp[no]
        entry['Mean_Delta_RastaPLP{0}'.format(no)] = mean_drastaplp[no]
        entry['STD_Delta_RastaPLP{0}'.format(no)] = std_drastaplp[no]

    # `y`/`sr` are keyword-only in librosa >= 0.10.
    pitches, magnitudes = librosa.core.piptrack(y=y, sr=sr)
    # Select out pitches with high energy
    pitches = pitches[magnitudes > np.median(magnitudes)]
    entry['pitch'] = librosa.pitch_tuning(pitches)

    wav_df = pd.DataFrame([entry])

    # NOTE(review): joblib.load unpickles arbitrary objects -- only load
    # model files that come from a trusted location.
    rasta_clf = joblib.load('pickles/rastaplp_model.sav')

    # One column ("values") holding the probability of every class.
    bar_t = pd.DataFrame(rasta_clf.predict_proba(wav_df),
                         columns=rasta_clf.classes_).T
    bar_t.columns = ['values']
    fig = go.Figure(data=[
        go.Pie(labels=rasta_clf.classes_, values=bar_t['values'], hole=.3),
    ])
    return rasta_clf.predict(wav_df), fig
def get_wav_df(self):
    """Build the RASTA-PLP feature table for every ``.wav`` file in ``self.wav_dir``.

    Each file contributes one row with, in this order: ``Session`` (the file
    name), ``Mean_RMS``, ``STD_RMS``, six statistics for each of the 13
    RASTA-PLP coefficients (mean/std of the coefficient, of its second-order
    delta and of its first-order delta) and ``pitch`` (the estimated tuning
    offset). Files without a ``.wav`` suffix are ignored.

    Returns
    -------
    pandas.DataFrame
        One row per file, in ``os.listdir`` order; empty when the directory
        contains no ``.wav`` file.
    """
    wav_files = []
    for wav in os.listdir(self.wav_dir):
        if not wav.endswith('.wav'):
            continue
        wav_path = os.path.join(self.wav_dir, wav)

        # Decode once per file. The unused 44.1 kHz decode and the second,
        # identical default-rate decode of the original have been dropped.
        y, sr = librosa.load(wav_path)

        entry = {'Session': wav}
        rms = librosa.feature.rms(y=y)  # computed once for both statistics
        entry['Mean_RMS'] = np.mean(rms)
        entry['STD_RMS'] = np.std(rms)

        # Raw PCM frames for the PLP front end; the context manager closes
        # the handle, which used to leak once per file.
        with wave.open(wav_path, 'r') as spf:
            signal = spf.readframes(-1)
        # np.fromstring(..., 'Int16') relied on a deprecated binary mode
        # and a removed dtype alias. frombuffer returns a read-only view,
        # hence the copy to keep the array writable as before.
        input_sig = np.frombuffer(signal, dtype=np.int16).copy()

        # NOTE(review): `fs` is the rate librosa resampled to, while
        # `input_sig` holds the file's native frames -- confirm this is
        # intended.
        matrix = plp(input_sig, nwin=0.025, fs=sr, plp_order=13, shift=0.01,
                     get_spec=False, get_mspec=False, prefac=0.97,
                     rasta=True)
        rasta_f_df = pd.DataFrame(matrix[0])

        mean_rastaplp = np.asarray((np.mean(rasta_f_df, axis=0)).tolist())
        std_rastaplp = np.asarray((np.std(rasta_f_df, axis=0)).tolist())
        delta_rastaplp = librosa.feature.delta(rasta_f_df)
        d_delta_rastaplp = librosa.feature.delta(rasta_f_df, order=2)
        mean_ddrastaplp = np.mean(d_delta_rastaplp, axis=0)
        std_ddrastaplp = np.std(d_delta_rastaplp, axis=0)
        mean_drastaplp = np.mean(delta_rastaplp, axis=0)
        std_drastaplp = np.std(delta_rastaplp, axis=0)

        # Key insertion order defines the column order of the frame.
        for no in range(0, 13):
            entry['Mean_RASTAPLP{0}'.format(no)] = mean_rastaplp[no]
            entry['STD_RASTAPLP{0}'.format(no)] = std_rastaplp[no]
            entry['Mean_DDRastaPLP{0}'.format(no)] = mean_ddrastaplp[no]
            entry['STD_DDRastaPLP{0}'.format(no)] = std_ddrastaplp[no]
            entry['Mean_Delta_RastaPLP{0}'.format(no)] = mean_drastaplp[no]
            entry['STD_Delta_RastaPLP{0}'.format(no)] = std_drastaplp[no]

        # `y`/`sr` are keyword-only in librosa >= 0.10.
        pitches, magnitudes = librosa.core.piptrack(y=y, sr=sr)
        # Select out pitches with high energy
        pitches = pitches[magnitudes > np.median(magnitudes)]
        entry['pitch'] = librosa.pitch_tuning(pitches)

        wav_files.append(entry)

    return pd.DataFrame(wav_files)
def features(X, sample_rate):
    """Extract a flat vector of acoustic features from an audio signal.

    Parameters
    ----------
    X : audio samples, as accepted by ``librosa.stft``.
    sample_rate : sample rate of ``X``.

    Returns
    -------
    numpy.ndarray
        1-D vector: 15 scalar statistics (flatness, zero-crossing rate,
        magnitude mean/max, centroid mean/std/max, magnitude std, pitch
        mean/max/std, tuning offset, RMS mean/max/std) followed by the
        per-coefficient MFCC mean, std and max (50 coefficients each) and
        the time-averaged chroma, mel-spectrogram and spectral-contrast
        rows.
    """
    stft = np.abs(librosa.stft(X))

    # Pitch candidates restricted to 70-400 Hz.
    # (`y` is keyword-only in librosa >= 0.10.)
    pitches, magnitudes = librosa.piptrack(y=X, sr=sample_rate, S=stft,
                                           fmin=70, fmax=400)
    # Per frame, take the pitch of the bin with the largest magnitude.
    # The original looked up column 1 for every frame (``magnitudes[:, 1]``)
    # instead of the current one, which also crashed on one-frame input.
    strongest_bin = magnitudes.argmax(axis=0)
    pitch = pitches[strongest_bin, np.arange(pitches.shape[1])]

    pitch_tuning_offset = librosa.pitch_tuning(pitches)
    pitchmean = np.mean(pitch)
    pitchstd = np.std(pitch)
    pitchmax = np.max(pitch)

    # Spectrum center (centroid), normalised to sum to one
    cent = librosa.feature.spectral_centroid(y=X, sr=sample_rate)
    cent = cent / np.sum(cent)
    meancent = np.mean(cent)
    stdcent = np.std(cent)
    maxcent = np.max(cent)

    # Spectral flatness
    flatness = np.mean(librosa.feature.spectral_flatness(y=X))

    # MFCC (computed once, reused three times)
    mfcc_frames = librosa.feature.mfcc(y=X, sr=sample_rate, n_mfcc=50).T
    mfccs = np.mean(mfcc_frames, axis=0)
    mfccsstd = np.std(mfcc_frames, axis=0)
    mfccmax = np.max(mfcc_frames, axis=0)

    # Chroma
    chroma = np.mean(librosa.feature.chroma_stft(S=stft, sr=sample_rate).T, axis=0)

    # Mel frequency
    mel = np.mean(librosa.feature.melspectrogram(y=X, sr=sample_rate).T, axis=0)

    # Spectral (octave-band) contrast
    contrast = np.mean(librosa.feature.spectral_contrast(S=stft, sr=sample_rate).T, axis=0)

    # Zero crossing rate
    zerocr = np.mean(librosa.feature.zero_crossing_rate(X))

    S, _ = librosa.magphase(stft)
    meanMagnitude = np.mean(S)
    stdMagnitude = np.std(S)
    maxMagnitude = np.max(S)

    # Root mean square energy. `librosa.feature.rmse` was renamed to `rms`
    # and the old name no longer exists in current librosa releases.
    rmse = librosa.feature.rms(S=S)[0]
    meanrms = np.mean(rmse)
    stdrms = np.std(rmse)
    maxrms = np.max(rmse)

    ext_features = np.array([
        flatness, zerocr, meanMagnitude, maxMagnitude, meancent, stdcent,
        maxcent, stdMagnitude, pitchmean, pitchmax, pitchstd,
        pitch_tuning_offset, meanrms, maxrms, stdrms
    ])

    ext_features = np.concatenate(
        (ext_features, mfccs, mfccsstd, mfccmax, chroma, mel, contrast))

    return ext_features