def segment_shimmer(com_speech_sort, voiced_yes, voiced_no, shimmer_frames, audio_file):
    """Compute shimmer for each speech segment of an audio file.

    Args:
        com_speech_sort: Sequence of segments; each segment is a sequence of
            pitch frame numbers whose first and last entries delimit it.
        voiced_yes: Collection of the segments that are considered voiced.
        voiced_no: Unused; kept so existing callers keep working.
        shimmer_frames: Indexable container with one slot per segment; it is
            filled in place.
        audio_file: Audio source handed to ``parselmouth.Sound``.

    Returns:
        ``shimmer_frames``, where slot ``idx`` holds the shimmer of segment
        ``idx`` or NaN when the segment is not voiced, has fewer than two
        frames, or could not be analysed.
    """
    snd = parselmouth.Sound(audio_file)
    pitch = snd.to_pitch(time_step=.001)
    # First channel only, extracted once instead of once per segment.
    waveform = snd.as_array()[0]

    for idx, vs in enumerate(com_speech_sort):
        # Default is set before the try so the slot is always assigned.
        shimmer = np.nan
        try:
            if vs in voiced_yes and len(vs) > 1:
                start_time = pitch.get_time_from_frame_number(vs[0])
                end_time = pitch.get_time_from_frame_number(vs[-1])
                snd_start = int(snd.get_frame_number_from_time(start_time))
                snd_end = int(snd.get_frame_number_from_time(end_time))
                # Keep the source sampling rate; otherwise the excerpt would
                # be interpreted at the constructor's default rate.
                samples = parselmouth.Sound(
                    waveform[snd_start:snd_end],
                    sampling_frequency=snd.sampling_frequency)
                shimmer = audio_shimmer(samples)
        except Exception:
            # Best effort: a segment that cannot be analysed yields NaN.
            # Unlike a bare ``except``, this lets KeyboardInterrupt and
            # SystemExit propagate.
            shimmer = np.nan
        shimmer_frames[idx] = shimmer
    return shimmer_frames
def test_from_numpy_array_stereo(sampling_frequency):
    """Check that a 2-row NumPy array becomes a two-channel Sound.

    The two channels carry different signals (sine and cosine) so that the
    reversed-channel assertion below actually detects a channel swap.
    """
    phase = 2 * np.pi * np.arange(sampling_frequency) / sampling_frequency
    sine_values = np.sin(phase)
    cosine_values = np.cos(phase)
    stereo = np.vstack((sine_values, cosine_values))

    sound = parselmouth.Sound(stereo, sampling_frequency=sampling_frequency)
    assert np.all(sound.values == [sine_values, cosine_values])
    assert sound.n_samples == len(sine_values)
    assert sound.n_channels == 2
    assert sound.sampling_frequency == sampling_frequency
    assert sound.duration == 1

    # Non-contiguous view: channels reversed, every third sample from index 1.
    sound = parselmouth.Sound(stereo[::-1, 1::3],
                              sampling_frequency=sampling_frequency)
    assert np.all(sound.values == [cosine_values[1::3], sine_values[1::3]])

    # A (samples, channels) layout has more "channels" than "samples".
    with pytest.warns(
            RuntimeWarning,
            match=r'Number of channels \([0-9]+\) is greater than number of samples \([0-9]+\)'
    ):
        parselmouth.Sound(stereo.T, sampling_frequency=sampling_frequency)
def plot_contours(sound, language, countdown_label):
    """Plot the pitch contour of a reference sound next to the user's.

    For "Mandarin" or "Vietnamese" the matching helper is first called with
    the ``"user.mp3"`` path and ``countdown_label``. Both ``sound`` and
    ``"user.mp3"`` are then loaded, converted to smoothed pitch objects, and
    scattered side by side in a two-panel figure.

    Args:
        sound: Reference audio source passed to ``parselmouth.Sound``.
        language: Language name selecting the helper to call.
        countdown_label: Forwarded unchanged to the language helper.
    """
    user_recording = "user.mp3"
    if language == "Mandarin":
        mdl.mand_deepL(user_recording, countdown_label)
    elif language == "Vietnamese":
        vdl.viet_deepL(user_recording, countdown_label)

    ref_sound = parselmouth.Sound(sound)
    user_sound = parselmouth.Sound(user_recording)
    ref_pitch = ref_sound.to_pitch().kill_octave_jumps().smooth()
    user_pitch = user_sound.to_pitch().kill_octave_jumps().smooth()

    ref_frequencies = get_frequencies(ref_pitch)
    user_frequencies = get_frequencies(user_pitch)
    panels = (
        ('reference', get_indexes(ref_frequencies), ref_frequencies),
        ('user', get_indexes(user_frequencies), user_frequencies),
    )

    plt.figure()
    for position, (panel_title, xs, ys) in enumerate(panels, start=1):
        plt.subplot(1, 2, position)
        plt.title(panel_title)
        plt.scatter(xs, ys)
        plt.xlim([0, 60])
        plt.ylim([0, 250])
    plt.show()
def main(original_audio, new_audio):
    """Load both recordings as Praat sounds and plot their pitch.

    Args:
        original_audio: Source of the target (reference) audio.
        new_audio: Source of the newly recorded audio.
    """
    reference_sound = praat.Sound(original_audio)
    attempt_sound = praat.Sound(new_audio)
    # plot_pitch takes the recorded sound first, then the target.
    plot_pitch(attempt_sound, reference_sound)
def test_from_scalar(sampling_frequency):
    """A bare int or float must be rejected by the Sound constructor."""
    expected_message = "Cannot create Sound from a single 0-dimensional number"
    for scalar in (42, 3.14159):
        with pytest.raises(ValueError, match=expected_message):
            parselmouth.Sound(scalar, sampling_frequency=sampling_frequency)
def offset(template: Clip, video: Clip) -> Tuple[float, float]:
    """Locate ``template`` inside ``video`` by audio cross-correlation.

    Both clips are loaded from their ``path``, mixed down to mono and
    cross-correlated; the position of the correlation maximum is converted
    to a time.

    Returns:
        ``(offset, score)``: the time of the correlation peak (may be
        negative) and the peak correlation value.
    """
    template_audio = pm.Sound(template.path).convert_to_mono()
    video_audio = pm.Sound(video.path).convert_to_mono()
    correlation = template_audio.cross_correlate(video_audio,
                                                 pm.AmplitudeScaling.SUM)

    peak_score = correlation.values.max()
    # NOTE(review): argmax() is a 0-based index; confirm whether
    # frame_number_to_time expects a 1-based frame number (one-sample shift).
    peak_frame = correlation.values.argmax()
    return correlation.frame_number_to_time(peak_frame), peak_score
def offset(self, clip: 'Clip') -> 'Tuple[float, float]':
    """Find position of this Clip in another Clip (may be negative).

    Both clips are loaded from their ``path``, converted to mono and
    cross-correlated; the correlation maximum gives the offset.

    Args:
        clip: The clip in which to look for this one.

    Returns:
        Two values: offset in seconds and cross-correlation score.
    """
    # The return annotation is a string (like 'Clip') so it is a real type
    # expression for checkers but is never evaluated at definition time.
    s1 = pm.Sound(self.path).convert_to_mono()
    s2 = pm.Sound(clip.path).convert_to_mono()
    cc = s1.cross_correlate(s2, pm.AmplitudeScaling.SUM)
    score = cc.values.max()
    # NOTE(review): argmax() is a 0-based index; confirm whether
    # frame_number_to_time expects a 1-based frame number (one-sample shift).
    frame = cc.values.argmax()
    offset = cc.frame_number_to_time(frame)
    return offset, score
def test_from_numpy_array_mono(sampling_frequency):
    """Check that a 1-D NumPy array becomes a single-channel Sound."""
    sample_positions = np.arange(sampling_frequency)
    sine_values = np.sin(2 * np.pi * sample_positions / sampling_frequency)

    sound = parselmouth.Sound(sine_values,
                              sampling_frequency=sampling_frequency)
    expected = sine_values[np.newaxis, :]
    assert np.all(sound.values == expected)
    assert sound.n_samples == len(sine_values)
    assert sound.n_channels == 1
    assert sound.sampling_frequency == sampling_frequency
    assert sound.duration == 1

    # Strided (non-contiguous) input must be accepted as well.
    strided = sine_values[1::3]
    sound = parselmouth.Sound(strided, sampling_frequency=sampling_frequency)
    assert np.all(sound.values == sine_values[np.newaxis, 1::3])
def measurePitch(voiceID, f0min, f0max, unit):
    """Measure jitter, shimmer and harmonics-to-noise values of a recording.

    Args:
        voiceID: Audio source passed to ``parselmouth.Sound``.
        f0min: Lower pitch bound used to build the periodic point process.
        f0max: Upper pitch bound used to build the periodic point process.
        unit: Unused; kept so existing callers keep working.

    Returns:
        A 14-tuple in this order: local jitter, local absolute jitter, rap
        jitter, ppq5 jitter, local shimmer, local dB shimmer, apq3 shimmer,
        apq5 shimmer, apq11 shimmer, then the mean harmonicity obtained with
        the second "To Harmonicity (cc)" argument set to 500, 1500, 2500,
        3500 and 3800.
    """
    sound = parselmouth.Sound(voiceID)  # read the sound
    # The previous version also built a Praat Pitch object here but never
    # used it; that analysis was dropped.
    pointProcess = call(sound, "To PointProcess (periodic, cc)", f0min, f0max)

    jitter_commands = (
        "Get jitter (local)",
        "Get jitter (local, absolute)",
        "Get jitter (rap)",
        "Get jitter (ppq5)",
    )
    jitters = [
        call(pointProcess, command, 0, 0, 0.0001, 0.02, 1.3)
        for command in jitter_commands
    ]

    shimmer_commands = (
        "Get shimmer (local)",
        "Get shimmer (local_dB)",
        "Get shimmer (apq3)",
        "Get shimmer (apq5)",
        "Get shimmer (apq11)",
    )
    shimmers = [
        call([sound, pointProcess], command, 0, 0, 0.0001, 0.02, 1.3, 1.6)
        for command in shimmer_commands
    ]

    hnrs = []
    for harmonicity_setting in (500, 1500, 2500, 3500, 3800):
        harmonicity = call(sound, "To Harmonicity (cc)", 0.01,
                           harmonicity_setting, 0.1, 1.0)
        hnrs.append(call(harmonicity, "Get mean", 0, 0))

    return tuple(jitters + shimmers + hnrs)
def main():
    """Extract features from every WAV file in the audio folder.

    For each ``*.wav`` file the base, pitch and spectrogram helpers are run,
    the values from the matching ``<name>_st.csv`` are appended, and one row
    is appended to the results CSV.
    """
    path = '/home/rosageorge97/MajorProject/Audio/'
    results_csv = '/home/rosageorge97/MajorProject/Results/audio_features.csv'

    audio_files = glob.glob(os.path.join(path, '*.wav'))
    print(audio_files)

    for wav_path in audio_files:
        snd = parselmouth.Sound(wav_path)
        power, intensity = get_base_features(snd)
        duration, mean_pitch, min_pitch, max_pitch = pitch_values(snd)
        # The result is not used below, but the call is kept as in the
        # original in case the helper has side effects.
        get_spectrogram(snd)

        end_name = wav_path.rsplit('/', 1)[-1]
        audio_analysis = convert_csv(path + end_name + "_st.csv")

        feature_vector = [
            end_name, power, intensity, duration, mean_pitch, min_pitch,
            max_pitch
        ]
        feature_vector.extend(audio_analysis)

        # Append one row per audio file.
        with open(results_csv, 'a', newline='') as out_file:
            csv.writer(out_file).writerow(feature_vector)
def generate_f0_pulses(sound, interpolate=True):
    """Synthesise a pulse train that follows the F0 contour of ``sound``.

    A pitch tier is extracted from ``sound``, emptied, and refilled with one
    point per pitch frame. With ``interpolate`` the zero-valued frames are
    replaced by a PCHIP interpolation of log10(F0), after both end frames
    are pinned to the median of the non-zero values. The tier is then turned
    into a point process and a phonation sound.

    Args:
        sound: Object exposing ``samplerate_Hz`` and accepted by
            ``parselmouth.Sound``.
        interpolate: Whether to fill zero-valued frames by interpolation.

    Returns:
        ``(new_sound, f0_contours, time_in_second)``: the pulse train wrapped
        in ``Sound``, the F0 values used, and their frame times.
    """
    praat_call = parselmouth.praat.call
    sample_rate = sound.samplerate_Hz

    praat_sound = parselmouth.Sound(sound, sample_rate)
    manipulation = praat_call(praat_sound, "To Manipulation", 0.01, 75, 600)
    pitch_tier = praat_call(manipulation, "Extract pitch tier")

    pitch = praat_sound.to_pitch(time_step=0.01)
    f0_contours = pitch.selected_array['frequency']
    time_in_second = pitch.xs()

    # Start from an empty tier; points are re-added below.
    praat_call(pitch_tier, "Remove points between", 0, praat_sound.duration)

    if interpolate:
        has_f0 = f0_contours != 0
        anchor_frequency = np.median(f0_contours[has_f0])
        # Pin both ends so the interpolator covers the whole time axis.
        f0_contours[0] = anchor_frequency
        has_f0[0] = True
        f0_contours[-1] = anchor_frequency
        has_f0[-1] = True
        interpolator = scipy.interpolate.PchipInterpolator(
            time_in_second[has_f0], np.log10(f0_contours[has_f0]))
        f0_contours = 10**interpolator(time_in_second)

    for frame_index, frame_time in enumerate(time_in_second):
        praat_call(pitch_tier, "Add point", frame_time,
                   f0_contours[frame_index])

    point_process = praat_call(pitch_tier, "To PointProcess")
    pulse_train = praat_call(point_process, "To Sound (phonation)",
                             sample_rate, 1.0, 0.05, 0.7, 0.03, 3.0, 4.0)
    pulse_train = np.squeeze(pulse_train)
    return Sound(pulse_train, sample_rate), f0_contours, time_in_second
def measurePitch(voiceID, f0min, f0max, unit):
    """Measure pitch statistics, jitter, shimmer and the Praat voice report.

    Args:
        voiceID: Audio source passed to ``parselmouth.Sound``.
        f0min: Lower pitch bound for the pitch and point-process analyses.
        f0max: Upper pitch bound for the pitch and point-process analyses.
        unit: Unit passed to the pitch "Get mean" / "Get standard deviation"
            queries.

    Returns:
        A 14-tuple in this order: mean F0, F0 standard deviation, the five
        jitter measures (local, local absolute, rap, ppq5, ddp), the six
        shimmer measures (local, local_dB, apq3, apq5, apq11, dda) and the
        voice report.
    """
    sound = parselmouth.Sound(voiceID)

    pitch = call(sound, "To Pitch", 0.0, f0min, f0max)
    meanF0 = call(pitch, "Get mean", 0, 0, unit)
    stdevF0 = call(pitch, "Get standard deviation", 0, 0, unit)

    pointProcess = call(sound, "To PointProcess (periodic, cc)", f0min, f0max)

    jitter_values = [
        call(pointProcess, command, 0, 0, 0.0001, 0.02, 1.3)
        for command in (
            "Get jitter (local)",
            "Get jitter (local, absolute)",
            "Get jitter (rap)",
            "Get jitter (ppq5)",
            "Get jitter (ddp)",
        )
    ]

    shimmer_values = [
        call([sound, pointProcess], command, 0, 0, 0.0001, 0.02, 1.3, 1.6)
        for command in (
            "Get shimmer (local)",
            "Get shimmer (local_dB)",
            "Get shimmer (apq3)",
            "Get shimmer (apq5)",
            "Get shimmer (apq11)",
            "Get shimmer (dda)",
        )
    ]

    voice_report = call([sound, pitch, pointProcess], "Voice report", 0.0,
                        0.0, f0min, f0max, 1.3, 1.6, 0.03, 0.45)

    return (meanF0, stdevF0, *jitter_values, *shimmer_values, voice_report)
def test_call_parameters(sound):
    """Exercise argument conversion and validation of ``praat.call``."""
    praat_call = parselmouth.praat.call

    # Numeric arguments: ints and floats are both accepted.
    assert praat_call(sound, "Add", 0.1) is None
    assert praat_call(sound, "Add", -1) is None
    assert praat_call(sound, "Override sampling frequency", 44100) is None
    with pytest.raises(parselmouth.PraatError,
                       match=r"Argument \".*\" must be greater than 0"):
        assert praat_call(sound, "Override sampling frequency", -10.0) is None

    # Whole-number arguments.
    assert praat_call(sound, "Get time from sample number",
                      1) == sound.get_time_from_index(1)
    assert tuple(map(int, parselmouth.PRAAT_VERSION.split("."))) < (6, 0, 47)
    # Replace with commented assert underneath once Praat version gets updated
    # with pytest.raises(parselmouth.PraatError, match=r"Argument \".*\" should be a whole number"):
    #     assert parselmouth.praat.call(sound, "Get time from sample number", 0.5) != sound.get_time_from_index(1)
    assert praat_call(sound, "Set value at sample number", 1, 0.0) is None
    with pytest.raises(
            parselmouth.PraatError,
            match=r"Argument \".*\" should be a positive whole number"):
        assert praat_call(sound, "Set value at sample number", 0, -1,
                          0.0) is None

    # Boolean arguments given as bool, int or "yes"/"no" text.
    assert praat_call(sound, "To Spectrum", True) == praat_call(
        sound, "To Spectrum", 1)
    assert praat_call(sound, "To Spectrum", False) == praat_call(
        sound, "To Spectrum", "no")

    # String arguments, and a call without any selected object.
    assert praat_call(sound, "To TextGrid", "points intervals",
                      "points").class_name == "TextGrid"
    assert praat_call("Create Sound from formula", "someSound", 1, 0, 1,
                      44100, "1/2").name == "someSound"

    # Numeric-vector arguments.
    many_channels = parselmouth.Sound(np.zeros((10, 1600)), 16000)
    assert praat_call(many_channels, "Extract channels",
                      np.array([2, 3, 5, 7])).n_channels == 4
    assert praat_call(many_channels, "Extract channels",
                      [2, 3, 5, 7]).n_channels == 4
    with pytest.raises(
            parselmouth.PraatError,
            match=r"Argument \".*\" should be a numeric vector, not a number"):
        assert praat_call(many_channels, "Extract channels", 4) == 1
    with pytest.raises(
            parselmouth.PraatError,
            match=r"Argument \".*\" should be a numeric vector, not a numeric matrix"
    ):
        assert praat_call(many_channels, "Extract channels",
                          np.array([[2, 3, 5, 7]])) == 4
def test_run_file_relative_paths(sound_path, resources):
    """A path given relative to the script's folder must still resolve."""
    script_path = resources["script.praat"]
    script_dir = os.path.dirname(script_path)

    # The check is only meaningful when the test is not run from that folder.
    assert os.getcwd() != os.path.abspath(script_dir)

    relative_sound_path = os.path.relpath(sound_path, script_dir)
    created_objects = parselmouth.praat.run_file(script_path,
                                                 relative_sound_path)
    assert created_objects[0] == parselmouth.Sound(sound_path)
def draw_pitch_and_intensisty(filename, title):
    """Show pitch and intensity of ``<data_dir>/<filename>.wav`` over spectrograms.

    The top panel draws the pitch contour on a spectrogram of a
    pre-emphasized copy of the sound; the bottom panel draws the intensity
    on a spectrogram of the unmodified sound.

    Args:
        filename: File name without the ``.wav`` extension.
        title: Title of the top panel.
    """
    wav_path = os.path.join(data_dir, filename + '.wav')
    snd = parselmouth.Sound(wav_path)
    time_range = [snd.xmin, snd.xmax]

    plt.figure()

    # Top panel: pitch contour.
    plt.subplot(2, 1, 1)
    plt.title(title)
    pitch = snd.to_pitch()
    # pre_emphasize() works in place, so operate on a copy.
    emphasized = snd.copy()
    emphasized.pre_emphasize()
    draw_spectrogram(
        emphasized.to_spectrogram(window_length=0.03, maximum_frequency=8000))
    plt.twinx()
    draw_pitch(pitch)
    plt.xlim(time_range)

    # Bottom panel: intensity.
    plt.subplot(2, 1, 2)
    intensity = snd.to_intensity()
    draw_spectrogram(snd.to_spectrogram())
    plt.twinx()
    draw_intensity(intensity)
    plt.xlim(time_range)

    plt.show()  # or plt.savefig("spectrogram.pdf")
def main_get_feature(directory):
    """Collect per-vowel scalar features from the WAV files in ``directory``.

    The vowel key is the last character of the part of the file name before
    the first underscore. Files without an underscore are grouped under
    ``'iau'`` and that group is discarded. Each remaining group is min-max
    normalised column by column.

    Args:
        directory: Folder whose ``.wav`` files are analysed.

    Returns:
        The ``defaultdict`` mapping each vowel key to its normalised feature
        matrix (one row per file).
    """
    all_audio_features = []
    vowelsDict = defaultdict(list)

    for f in os.listdir(directory):
        vowel = f.split("_")[0][-1]
        if not f.endswith('.wav'):
            continue

        wav_path = directory + '/' + f
        data_praat = parselmouth.Sound(wav_path)
        # wavfile.read keeps both channels when the file is stereo
        # (data[:, 0] is left, data[:, 1] is right).
        fs_scipy, data_scipy = wavfile.read(wav_path)
        data_librosa = librosa.load(wav_path, sr=fs_scipy)
        Features, Feature_type = get_features(data_librosa[0],
                                              data_librosa[1], data_scipy,
                                              data_praat)

        # 0 all features, 1 scalar features, 2 vector features,
        # 3 matrix features
        feature_number = 1
        if feature_number == 0:
            all_audio_features.append(Features)
        elif feature_number == 1:
            scalar_features = get_features_of_type(Features, Feature_type,
                                                   scalar)
            all_audio_features.append(scalar_features)
        elif feature_number == 2:
            vector_features = get_features_of_type(Features, Feature_type,
                                                   vector)
            all_audio_features.append(vector_features)
        elif feature_number == 3:
            matrix_features = get_features_of_type(Features, Feature_type,
                                                   matrix)
            all_audio_features.append(matrix_features)

        # Skip files whose feature vector contains a NaN.
        if np.isnan(np.sum(scalar_features)):
            continue

        group = vowel if '_' in f else 'iau'
        vowelsDict[group].append(np.array(scalar_features))

    vowelsDict.pop('iau', None)

    # Normalize
    for key in list(vowelsDict):
        rows = vowelsDict[key]
        tmp = np.reshape(rows, (len(rows), len(rows[0])))
        column_min = np.min(tmp, axis=0)
        column_max = np.max(tmp, axis=0)
        vowelsDict[key] = (tmp - column_min) / (column_max - column_min)

    return vowelsDict
def segment(self):
    """Annotate every transcript sentence with delivery metrics.

    Each sentence dict in ``self.transcript`` (keys ``'line'``, ``'start'``,
    ``'end'``) receives:

    * ``'speaking_rate'``: syllables per second,
    * ``'filler_count'`` / ``'filler_rate'``: filler words and phrases, and
      their count per second,
    * ``'pitch_variety'``: 95th minus 5th percentile of the non-zero pitch
      values in the sentence's audio,
    * ``'comment'``: the ``comment`` attribute of ``Comment(sentence)``.

    Returns:
        ``self``, so calls can be chained.
    """
    audio = pm.Sound(self.audio_path)
    for sentence in self.transcript:
        # speaking rate (syllables/second)
        l_line = sentence['line'].lower()
        line = l_line.split()
        # sum() also copes with an empty line, where reduce() without an
        # initial value raises TypeError.
        syllable_count = sum(map(count_syllable, line))
        time_delta = sentence['end'] - sentence['start']
        if time_delta == 0:
            # avoid dividing by zero for zero-length sentences
            time_delta = 0.01
        sentence['speaking_rate'] = syllable_count / time_delta

        # filler rate (filler words / duration)
        filler_count = sum(1 for word in line if word in filler_dict)
        filler_count += sum(l_line.count(phrase) for phrase in filler_phrase)
        sentence['filler_rate'] = filler_count / time_delta
        sentence['filler_count'] = filler_count

        # pitch variety (95th percentile of pitch minus 5th percentile)
        tmp_segment = audio.extract_part(from_time=sentence['start'],
                                         to_time=sentence['end'])
        tmp_pitch = tmp_segment.to_pitch().selected_array['frequency']
        # Zero-valued frames are excluded from the percentiles.
        tmp_pitch[tmp_pitch == 0] = np.nan
        tmp_upper_bound = np.nanpercentile(a=tmp_pitch, q=95)
        tmp_lower_bound = np.nanpercentile(a=tmp_pitch, q=5)
        sentence['pitch_variety'] = tmp_upper_bound - tmp_lower_bound

        # make comments
        sentence['comment'] = Comment(sentence).comment
    return self
def extract_prosodic_features(audio_source, slope_cutoff=0.500, end_cutoff=0.2):
    """Extract utterance-final F0 features from an audio source.

    Args:
        audio_source: Audio source passed to ``parselmouth.Sound``.
        slope_cutoff: Length in seconds, counted back from the last voiced
            frame, of the window used for the F0 slope regression.
        end_cutoff: Length in seconds, counted back from the last voiced
            frame, of the window whose F0 values are returned as "end"
            values.

    Returns:
        ``(pitch, fo_slope_vals, fo_slope, fo_end_vals, o_fos)``: the pitch
        object, the ``[time, f0]`` pairs inside the slope window, the slope
        of a linear regression through them, the F0 values inside the end
        window, and the F0 values before the end window.

    Raises:
        IndexError: If the recording has no voiced frame.
    """
    s = parselmouth.Sound(audio_source)
    p = s.to_pitch()

    voiced_frames = {}
    # Frame numbers follow Praat's 1-based convention, so iterate 1..n
    # inclusive; range(n) would skip the last frame.
    for frame_number in range(1, p.get_number_of_frames() + 1):
        value = p.get_value_in_frame(frame_number)
        if not np.isnan(value):
            voiced_frames[p.get_time_from_frame_number(frame_number)] = value

    sorted_times = sorted(voiced_frames)
    last_voiced_time = sorted_times[-1]
    fo_slope_cutoff = last_voiced_time - slope_cutoff
    fo_end_cutoff = last_voiced_time - end_cutoff

    fo_slope_vals = []
    fo_end_vals = []
    o_fos = []
    for frame_time in sorted_times:
        c_fo = voiced_frames[frame_time]
        if frame_time >= fo_end_cutoff:
            fo_end_vals.append(c_fo)
        else:
            o_fos.append(c_fo)
        if frame_time >= fo_slope_cutoff:
            fo_slope_vals.append([frame_time, c_fo])

    fo_slope_array = np.array(fo_slope_vals)
    # Only the slope of the regression is reported.
    fo_slope = stats.linregress(fo_slope_array[:, 0], fo_slope_array[:, 1])[0]
    return p, fo_slope_vals, fo_slope, fo_end_vals, o_fos
def measureFormants(sound, f0min, f0max):
    """Measure the first two formants at the glottal pulses of a recording.

    Args:
        sound: Audio source passed to ``parselmouth.Sound``.
        f0min: Lower pitch bound used to build the periodic point process.
        f0max: Upper pitch bound used to build the periodic point process.

    Returns:
        ``(f1_mean, f2_mean, f1_list, f2_list)``: the mean F1 and F2 over
        the pulses with a defined value, and the per-pulse values themselves
        (undefined values removed, each list filtered independently).

    Raises:
        statistics.StatisticsError: If no pulse has a defined F1 or F2.
    """
    import math

    sound = parselmouth.Sound(sound)  # read the sound
    # The previous version also computed the total duration and a Pitch
    # object here; neither was used, so both were dropped.
    pointProcess = call(sound, "To PointProcess (periodic, cc)", f0min, f0max)
    formants = call(sound, "To Formant (burg)", 0.0025, 5, 5000, 0.025, 50)
    numPoints = call(pointProcess, "Get number of points")

    f1_list = []
    f2_list = []
    # Measure formants only at glottal pulses (point indices start at 1).
    for point in range(1, numPoints + 1):
        t = call(pointProcess, "Get time from index", point)
        f1_list.append(
            call(formants, "Get value at time", 1, t, 'Hertz', 'Linear'))
        f2_list.append(
            call(formants, "Get value at time", 2, t, 'Hertz', 'Linear'))

    # Drop undefined (NaN) measurements.
    f1_list = [f1 for f1 in f1_list if not math.isnan(f1)]
    f2_list = [f2 for f2 in f2_list if not math.isnan(f2)]

    # calculate mean formants across pulses
    f1_mean = statistics.mean(f1_list)
    f2_mean = statistics.mean(f2_list)
    return f1_mean, f2_mean, f1_list, f2_list
def train(fp, db):
    """Sample pitch/formant features from the matching files into ``db``.

    For every matched file, random frame positions out of 400 are drawn and
    recorded in the module-level ``datapoints`` mapping. Wherever pitch, F1
    and F2 are all defined, an ``(f1, f2, pitch)`` tuple is kept. The tuples
    and their ``get_mean`` result are stored in ``db`` under the file path.

    Args:
        fp: Glob pattern selecting the training files.
        db: Mapping that receives ``(values, mean)`` per file path.
    """
    for filepath in glob.iglob(fp):
        for file in glob.glob(filepath):
            sound = parselmouth.Sound(file)
            pitch = sound.to_pitch()
            formant = sound.to_formant_burg()
            duration = pitch.get_total_duration()

            frames = 400
            values = []
            sampled_frames = datapoints.setdefault(file, set())

            for _ in range(1, int(frames * TRAIN_COEFFICIENT)):
                picked = random.randint(1, frames)
                sampled_frames.add(picked)
                time = (picked / float(frames)) * duration

                p = pitch.get_value_at_time(time)
                f1 = formant.get_value_at_time(1, time)
                f2 = formant.get_value_at_time(2, time)
                if math.isnan(p) or math.isnan(f1) or math.isnan(f2):
                    continue
                values.append((f1, f2, p))

            db[filepath] = (values, get_mean(values))
def get_features(path):
    """Build a feature list from Praat's voice report plus MFCCs.

    The voice report text is split into lines and the header lines are
    dropped. All numbers are then pulled out with a regular expression,
    three mantissa/exponent pairs are merged, and the unneeded entries are
    removed.

    Args:
        path: Audio file passed to ``parselmouth.Sound`` and ``get_MFCC``.

    Returns:
        The selected voice-report numbers followed by the result of
        ``get_MFCC(path)``.
    """
    sound = parselmouth.Sound(path)
    pitch = sound.to_pitch()
    pulses = parselmouth.praat.call([sound, pitch], "To PointProcess (cc)")
    report_text = parselmouth.praat.call([sound, pitch, pulses],
                                         "Voice report", 0.0, 0.0, 75, 600,
                                         1.3, 1.6, 0.03, 0.45)

    header_rows = {0, 1, 7, 12, 16, 22, 29, 33}  # headers
    data_lines = [
        text for row, text in enumerate(report_text.split('\n'))
        if row not in header_rows
    ]

    number_pattern = r"[-+]?\d*\.\d+|\d+"
    numbers = [
        float(token) for text in data_lines
        for token in re.findall(number_pattern, text)
    ]

    # Fold each exponent entry into the mantissa that precedes it.
    # NOTE(review): presumably these are values printed in scientific
    # notation with the exponent parsed as a positive number; confirm
    # against an actual report.
    for mantissa_at, exponent_at in ((7, 8), (9, 10), (19, 20)):
        numbers[mantissa_at] = numbers[mantissa_at] * 10 ** (
            (-1) * numbers[exponent_at])

    unneeded = {8, 10, 20, 12, 13, 16, 17, 22, 27, 29, 31}  # unnecessary numbers
    final_vr = [
        value for position, value in enumerate(numbers)
        if position not in unneeded
    ]

    mfcc = get_MFCC(path)
    return final_vr + mfcc
def get_all_features(self, f0min, f0max, unit):
    """Run every vocal analysis and return the collected parameters.

    The analysis is best-effort: if any step fails, the error is logged and
    whatever attributes were set up to that point are returned.

    Args:
        f0min: Lower pitch bound for the point-process and pitch analyses.
        f0max: Upper pitch bound for the point-process and pitch analyses.
        unit: Unit forwarded to ``get_pitch_parameters``.

    Returns:
        ``self.__dict__``, which is also stored as
        ``self.all_vocal_parameters`` (so the dict contains a reference to
        itself under that key).
    """
    import logging

    try:
        sound = parselmouth.Sound(self.voiceID)  # read the sound
        pointProcess = call(sound, "To PointProcess (periodic, cc)", f0min,
                            f0max)
        Pitch = call(sound, "To Pitch", 0.0, f0min, f0max)
        # Vocal report
        # NOTE(review): the report uses fixed 75/600 bounds rather than
        # f0min/f0max — confirm this is intended.
        self.vocal_report = parselmouth.praat.call(
            [sound, Pitch, pointProcess], "Voice report", 0, 0, 75, 600, 1.3,
            1.6, 0.03, 0.45)
        # Pitch
        self.get_pitch_parameters(Pitch=Pitch, unit=unit)
        # Harmonicity
        self.get_harmonicity_parameters(sound)
        # Jitter
        self.get_jitter_parameters(sound=sound, pointProcess=pointProcess)
        # Shimmer
        self.get_shimmer_parameters(sound=sound, pointProcess=pointProcess)
        # Pulse
        self.get_pulse_parameters()
        # Voicing
        self.get_voicing_parameters()
    except Exception:
        # Stay best-effort, but leave a trace instead of hiding the failure.
        logging.getLogger(__name__).exception(
            "Vocal feature extraction failed for %r",
            getattr(self, "voiceID", None))
    self.all_vocal_parameters = self.__dict__
    return self.all_vocal_parameters
def test_run_with_parameters(sound_path):
    """Arguments given to ``praat.run`` must fill the script's form fields."""
    # Praat forms and commands need one statement per line, so the script is
    # written as a real multi-line literal and dedented.
    script = textwrap.dedent("""
        form Test
            positive minPitch 100.0
            real timeStep 0.0
            boolean subtractMean "yes"
        endform

        Read from file: "{}"
        To Intensity: minPitch, timeStep, subtractMean
        selectObject: 1
        selectObject: "Intensity the_north_wind_and_the_sun"
        """.format(sound_path))

    min_pitch = 75
    time_step = 0.05
    subtract_mean = False

    # The object selected at the end of the script is the Intensity; it must
    # match the one computed through the Python API with the same values.
    assert parselmouth.praat.run(
        script, min_pitch, time_step,
        subtract_mean)[0] == parselmouth.Sound(sound_path).to_intensity(
            min_pitch, time_step, subtract_mean)

    # Running a script that has a form without arguments is an error.
    with pytest.raises(parselmouth.PraatError,
                       match="Found 0 arguments but expected more."):
        parselmouth.praat.run(script)
def draw_pitch(filename, output_fn=None):
    """Plot the pitch contour of an audio file and report its voiced share.

    Prints the file name and the proportion of frames with a pitch above
    zero, then draws the contour (zero-valued frames hidden).

    Args:
        filename: Audio file to analyse; the text before the first ``-`` is
            used as the plot title.
        output_fn: If given, the figure is saved to this path.
    """
    plt = init_set_plt()
    snd = parselmouth.Sound(filename)
    pitch = snd.to_pitch()
    pitch_values = pitch.selected_array['frequency']

    voiced_count = np.count_nonzero(pitch_values > 0)
    proportion = voiced_count / len(pitch_values)
    separator = "=" * 80
    print(separator)
    print(f"Filename: {filename}")
    print(f"Voiced segment proportion: {proportion}")
    print(separator + "\n")

    # NaN values are not drawn, which hides the zero-valued frames.
    pitch_values[pitch_values == 0] = np.nan
    frame_times = pitch.xs()
    # A wider white marker under a smaller coloured one.
    plt.plot(frame_times, pitch_values, 'o', markersize=5, color='w')
    plt.plot(pitch.xs(), pitch_values, 'o', markersize=2)
    plt.grid(False)
    plt.xlim([snd.xmin, snd.xmax])
    plt.ylim(50, 450)
    plt.xlabel("Time (s)", fontsize=24)
    plt.ylabel("Pitch (Hz)", fontsize=24)
    plt.title(filename.split("-")[0], fontsize=20)
    plt.tight_layout()

    if output_fn is not None:
        plt.savefig(output_fn)
    plt.close()
def get_f0_standard_deviation(pathSound, start_time, end_time, voice_max_frequency, voice_min_frequency):
    """
    Get the standard deviation of F0 around its mean

    Only frames with a non-zero pitch value take part, both in the sum of
    squared deviations and in the divisor. The previous version divided by
    the total number of frames, which underestimates the deviation whenever
    zero-valued frames are present.

    :params pathSound: path to the sound to analyse
    :params start_time: in seconds
    :params end_time : in seconds
    :params voice_max_frequency : maximum frequency of a human being (adult man or adult female)
    :params voice_min_frequency : minimum frequency of a human being (adult man or adult female)
    :returns: standard deviation of the sound's F0, or 0.0 when the part has
        no frame with a non-zero pitch value
    """
    sound = parselmouth.Sound(pathSound)
    sound = sound.extract_part(from_time=start_time, to_time=end_time)
    pitch_values = sound.to_pitch().selected_array['frequency']

    # NOTE(review): assumes get_f0_mean averages over the same non-zero
    # frames — confirm against its implementation.
    mean = get_f0_mean(pathSound, start_time, end_time, voice_max_frequency,
                       voice_min_frequency)

    voiced_values = [value for value in pitch_values if value != 0]
    if not voiced_values:
        return 0.0

    squared_deviations = sum(
        math.pow(value - mean, 2) for value in voiced_values)
    return math.sqrt(squared_deviations / len(voiced_values))
def predict():
    """Score the audio sent in the request's ``file`` form field.

    The form value is written to ``audio.wav``, read back with both SciPy
    and Parselmouth, and converted to a 15-value feature row (NaN replaced
    by 0) for ``MODEL``.

    Returns:
        A ``(json_body, 200)`` tuple; the JSON holds the prediction and
        three of the input features, all as strings.
    """
    # The audio arrives in the 'file' form field.
    binary_file_data = request.form['file']
    binary_file_path = "audio.wav"
    with open(binary_file_path, 'w') as f:
        f.write(binary_file_data)

    # Read the file that was just written, once per library.
    sound_file = scipy.io.wavfile.read(binary_file_path)
    sound = parselmouth.Sound(binary_file_path)

    # Calculate features
    fundamental_frequency_features = calculate_fundamental_frequency_features(
        sound_file)
    other_features = engineer_features(sound)

    # Concatenate features in the order the model expects.
    model_input = np.concatenate(
        [fundamental_frequency_features, other_features])
    model_input[np.isnan(model_input)] = 0
    model_input = model_input.reshape(1, 15)

    # Only one sound file is processed, so only one prediction comes back.
    prediction = MODEL.predict(model_input)[0][0]

    feature_row = model_input[0]
    payload = {
        'prediction': str(prediction),
        'averageFundamentalFrequency': str(feature_row[0]),
        'jitter': str(feature_row[3]),
        'shimmer': str(feature_row[8])
    }
    return json.dumps(payload), 200
def _mean_voiced_pitch(pitch, boundaries):
    """Average the non-zero pitch values inside each [start, end) span."""
    averages = np.zeros((len(boundaries) - 1, ), dtype=np.float64)
    spans = zip(boundaries[:-1], boundaries[1:])
    for idx, (start, end) in enumerate(spans):
        span = pitch[start:end]
        voiced = span[span != 0.0]
        if len(voiced) > 0:
            averages[idx] = np.mean(voiced)
    return averages


def calculate_pitch(wav, durs):
    """Compute frame-, character- and third-of-character-level pitch.

    Args:
        wav: Audio source passed to ``parselmouth.Sound``.
        durs: Integer array with the number of frames per character.

    Returns:
        ``(pitch_mel, pitch_char, pitch_trichar)``: the per-frame pitch
        passed through ``maybe_pad`` to ``durs.sum()`` frames, the mean
        non-zero pitch per character, and the mean non-zero pitch for each
        of the three chunks ``dur_chunk_sizes(d, 3)`` yields per character.
        Spans without a non-zero pitch value get 0.0.
    """
    mel_len = durs.sum()
    durs_cum = np.cumsum(np.pad(durs, (1, 0)))

    snd = parselmouth.Sound(wav)
    # Time step chosen so that the frame count lands within one of mel_len.
    pitch = snd.to_pitch(
        time_step=snd.duration / (mel_len + 3)).selected_array['frequency']
    assert np.abs(mel_len - pitch.shape[0]) <= 1.0

    # Average pitch over characters. Spans are enumerated directly rather
    # than zipped with range(mel_len), which silently dropped trailing
    # characters when there were more characters than frames.
    pitch_char = _mean_voiced_pitch(pitch, durs_cum)

    # Average to three values per character
    durs_tri = np.concatenate([dur_chunk_sizes(d, 3) for d in durs])
    durs_tri_cum = np.cumsum(np.pad(durs_tri, (1, 0)))
    pitch_trichar = _mean_voiced_pitch(pitch, durs_tri_cum)

    pitch_mel = maybe_pad(pitch, mel_len)
    pitch_char = maybe_pad(pitch_char, len(durs))
    pitch_trichar = maybe_pad(pitch_trichar, len(durs_tri))
    return pitch_mel, pitch_char, pitch_trichar
def get_prosodic_features(file_loc):
    """Compute energy, intensity, pitch and voicing features of a recording.

    Args:
        file_loc: Path of the audio file; read with both Parselmouth and
            librosa.

    Returns:
        A list of eight values: standard deviation of the RMS energy,
        maximum and minimum intensity, maximum and minimum pitch in Hertz,
        the number of voiced frames, the voiced-to-total frame ratio and the
        voiced-to-unvoiced frame ratio. The last ratio is ``inf`` when every
        frame is voiced; both ratios are NaN when there are no frames.
    """
    unit = "Hertz"
    sound = parselmouth.Sound(file_loc)
    y, _ = librosa.load(file_loc)

    energy = librosa.feature.rms(y=y)
    SD_energy = np.std(energy)

    pitch = call(sound, "To Pitch", 0.0, 75, 300)
    intensity = call(sound, "To Intensity", 75, 0)
    maxIntensity = call(intensity, "Get maximum", 0, 0, 'Parabolic')
    minIntensity = call(intensity, "Get minimum", 0, 0, 'Parabolic')
    maxPitch = call(pitch, "Get maximum", 0, 0, unit, 'Parabolic')
    minPitch = call(pitch, "Get minimum", 0, 0, unit, 'Parabolic')

    voiced_frames = pitch.count_voiced_frames()
    total_frames = pitch.get_number_of_frames()
    unvoiced_frames = total_frames - voiced_frames

    # Guard both divisions: a fully voiced clip has no unvoiced frames.
    if total_frames:
        voiced_to_total_ratio = voiced_frames / total_frames
    else:
        voiced_to_total_ratio = float("nan")
    if unvoiced_frames:
        voiced_to_unvoiced_ratio = voiced_frames / unvoiced_frames
    elif voiced_frames:
        voiced_to_unvoiced_ratio = float("inf")
    else:
        voiced_to_unvoiced_ratio = float("nan")

    return [
        SD_energy, maxIntensity, minIntensity, maxPitch, minPitch,
        voiced_frames, voiced_to_total_ratio, voiced_to_unvoiced_ratio
    ]
def audio_to_textgrid(audio_no_annot_path, textgrid_path):
    """Generates .TextGrid files from audio using Praat.

    Every entry of ``audio_no_annot_path`` is loaded, noise-reduced,
    segmented with "To TextGrid (silences)" and saved as
    ``<name>.TextGrid`` in ``textgrid_path``.

    :param audio_no_annot_path: path with audio not annotated.
    :param textgrid_path: path with textgrid generated.
    :returns: None.
    """
    create_data_path(textgrid_path)
    audio_files = os.listdir(audio_no_annot_path)
    print("Procesing " + str(len(audio_files)) +
          " audio files with Praat, can take a while...")

    # Number of files that make up one 10% progress step; 0 for very few
    # files, which is handled explicitly below.
    data_mod = round(len(audio_files) / 10)

    for index_mod, audio in enumerate(audio_files, start=1):
        sound = parselmouth.Sound(os.path.join(audio_no_annot_path, audio))
        # Take each audio file and convert to textGrid with praat
        noise_reduction = call(sound, "Remove noise", 0.0, 1.0, 0.025, 80,
                               10000, 40, 'Spectral subtraction')
        # 0.8, 0.2: minimum silent and sounding interval durations;
        # '', 'sounding': interval labels.
        manipulation = call(noise_reduction, "To TextGrid (silences)", 100,
                            0.0, -65.0, 0.8, 0.2, '', 'sounding')
        # splitext copes with any extension length, unlike audio[:-4].
        text_audio = os.path.splitext(audio)[0] + '.TextGrid'
        call(manipulation, "Save as text file",
             os.path.join(textgrid_path, text_audio))

        # Show progression at every 10% step.
        if data_mod == 0:
            print('', end="\r")
        elif index_mod % data_mod == 0:
            print(str(int(index_mod / data_mod * 10)) + "% ", end="\r")
def test(fp):
    """Evaluate the classifier on the frames not used for training.

    For every matched file, the frames 1..399 (out of 400) that are not
    recorded in the module-level ``datapoints`` are sampled. Their
    ``(f1, f2, pitch)`` tuples are averaged with ``get_mean`` and the result
    of ``predict(mean, data)`` is compared with the file path.

    Args:
        fp: Glob pattern selecting the files to evaluate.

    Returns:
        ``(correct, total)``: number of correct predictions and number of
        files evaluated.
    """
    correct = 0
    total = 0
    for filepath in glob.iglob(fp):
        for file in glob.glob(filepath):
            sound = parselmouth.Sound(file)
            pitch = sound.to_pitch()
            formant = sound.to_formant_burg()
            duration = pitch.get_total_duration()

            frames = 400
            values = []
            for frame_no in range(1, frames):
                if frame_no in datapoints[file]:
                    # Frame was used for training; skip it.
                    continue
                time = (frame_no / float(frames)) * duration
                p = pitch.get_value_at_time(time)
                f1 = formant.get_value_at_time(1, time)
                f2 = formant.get_value_at_time(2, time)
                if math.isnan(p) or math.isnan(f1) or math.isnan(f2):
                    continue
                values.append((f1, f2, p))

            mean = get_mean(values)
            total += 1
            if predict(mean, data) == filepath:
                correct += 1
    return (correct, total)