def determineSilences(file):
    trackA = AudioSegment.from_file(file["trackA"])
    speakingA = silence.detect_nonsilent(trackA, min_silence_len=500, silence_thresh=-30)
    trackB = AudioSegment.from_file(file["trackB"])
    speakingB = silence.detect_nonsilent(trackB, min_silence_len=500, silence_thresh=-30)
    aClass = [[seg[0], seg[1], hasConflicts(seg, speakingB)] for seg in speakingA]
    bClass = [[seg[0], seg[1], hasConflicts(seg, speakingA)] for seg in speakingB]
    return aClass, bClass
def compareSpeechPeriods(self, source_in, source_out):
    segment_in = AudioSegment(data=source_in, sample_width=2, frame_rate=8000, channels=1)
    segment_out = AudioSegment(data=source_out, sample_width=2, frame_rate=8000, channels=1)
    speech_period_in = silence.detect_nonsilent(segment_in, min_silence_len=1000, silence_thresh=-32)
    speech_period_out = silence.detect_nonsilent(segment_out, min_silence_len=1000, silence_thresh=-32)
    # print('period in: ', speech_period_in)
    # print('period out: ', speech_period_out)
    if len(speech_period_out) == 0 and len(speech_period_in) == 0:
        return 0
    elif len(speech_period_out) == 0 and len(speech_period_in) > 0:
        voiceEdgePoints = [speech_period_in[0][0], speech_period_in[-1][1]]
        return (1, voiceEdgePoints)
    elif len(speech_period_in) == 0 and len(speech_period_out) > 0:
        voiceEdgePoints = [speech_period_out[0][0], speech_period_out[-1][1]]
        return (2, voiceEdgePoints)
    elif speech_period_in[0][0] <= speech_period_out[0][0]:
        voiceEdgePoint = max(speech_period_in[-1][1], speech_period_out[-1][1])
        voiceEdgePoints = [speech_period_in[0][0], voiceEdgePoint]
        return (3, voiceEdgePoints)
    else:  # speech_period_in[0][0] > speech_period_out[0][0]
        voiceEdgePoint = max(speech_period_in[-1][1], speech_period_out[-1][1])
        voiceEdgePoints = [speech_period_out[0][0], voiceEdgePoint]
        return (4, voiceEdgePoints)
def trim_silence(wav):
    start_end = detect_nonsilent(wav, 250, -40, 1)
    start_end = [se for se in start_end if se[1] - se[0] > 50]
    while len(start_end) == 0 or wav.dBFS > 0:  # if nonsilence can't be detected, boost and retry
        wav = wav + 5
        start_end = detect_nonsilent(wav, 250, -40, 1)
        start_end = [se for se in start_end if se[1] - se[0] > 50]
    start = min(start_end)[0]
    end = max(start_end)[1]
    wav = wav[start - 50:end + 50]
    wav = np.array(wav.get_array_of_samples(), dtype='float32') / 2**15
    return wav
def goAmadeus(file, targetFolder, silenceModifier):
    audio_segment = AudioSegment.from_wav(file)
    normalized_sound = match_target_amplitude(audio_segment, -20.0)
    nonsilent_data = detect_nonsilent(normalized_sound, min_silence_len=50,
                                      silence_thresh=-45, seek_step=1)
    # print("start,Stop")
    # for chunks in nonsilent_data:
    #     print([chunk / 1000 for chunk in chunks])
    activeHolder = 0
    where = 1
    howManyLoops = 0
    for i in range(len(nonsilent_data)):
        if where - 1 < 0:
            activeHolder += 0
            where += 0
        if where >= 1:
            current_silence = nonsilent_data[howManyLoops][0] - nonsilent_data[howManyLoops - 1][1]
            if current_silence >= 2000:
                activeHolder += 0
            elif current_silence <= 50 and howManyLoops < 3:
                activeHolder += 5
            elif current_silence > 50:
                activeHolder += current_silence
            where += 1
        if where == 0:
            where += 1
        howManyLoops += 1
    allSilence = activeHolder
    # print("All silence: " + str(allSilence) + " ms")
    bestSingleSilence = allSilence / where
    add = bestSingleSilence * silenceModifier / 100
    bestSingleSilence = bestSingleSilence + add
    # print("Best Silence: " + str(round(bestSingleSilence)) + " ms")
    # print("Started Chunking..")
    cutOnBestSilence(round(bestSingleSilence), file, targetFolder)
    best_nonsilent_data = detect_nonsilent(normalized_sound, min_silence_len=round(bestSingleSilence),
                                           silence_thresh=-45, seek_step=1)
    return best_nonsilent_data


# goAmadeus("test.wav", 1)
def run(self):
    try:
        if os.path.exists(self.wav_name) is not True:
            AudioSegment.from_file(self.video_name).export(self.wav_name, format='mp3', bitrate="64k")
        sound = AudioSegment.from_mp3(self.wav_name)
        chunks = detect_nonsilent(sound, min_silence_len=1000, silence_thresh=-45)
        # now recombine the chunks so that the parts are at least 60 sec long
        target_length = 60 * 1000
        output_chunks = [chunks[0]]
        for chunk in chunks[1:]:
            if output_chunks[-1][1] - output_chunks[-1][0] < target_length:
                output_chunks[-1][1] = chunk[1]
            else:
                # if the last output chunk is longer than the target length,
                # we can start a new one
                output_chunks.append(chunk)
        config = {
            'duration': sound.duration_seconds,
            'total': len(output_chunks),
            'chunks': output_chunks
        }
        self.config_sig.emit(config)
    except Exception as e:
        print(e)
        self.config_sig.emit({})
def get_first_nonsilent(self, sound, silence_threshold=-28):
    non_silences = silence.detect_nonsilent(
        sound,
        min_silence_len=self.fade_length,
        silence_thresh=silence_threshold,
        seek_step=1)
    return non_silences[0]
def cut_speech(dirs):
    """
    This function is used to cut speech samples into non-silent pieces
    for further usage in MFCC extraction
    :return: None
    """
    logging.basicConfig(filename="logfile.log", level=logging.DEBUG)
    for d in dirs:
        for speaker in os.listdir(d):
            wav_names = glob('{}/{}/*.wav'.format(d, speaker))
            for name in wav_names:
                print("processing file {}...".format(name))
                sound = AudioSegment.from_file(name, format="wav")
                speech = silence.detect_nonsilent(sound, silence_thresh=-50)
                i = 1
                for frag in speech:
                    part = sound[frag[0]:frag[1]]
                    part.export('{}/{}/part_{}.wav'.format(d, speaker, i), format="wav")
                    i += 1
    logging.info("Finished cutting audio samples")
def splitAudioBySilence(audio_path, skip_idx=0, out_ext="wav",
                        silence_thresh=-40, silence_chunk_len=100, keep_silence=100):
    audio = read_audio(audio_path)
    not_silence_ranges = silence.detect_nonsilent(
        audio, min_silence_len=silence_chunk_len, silence_thresh=silence_thresh)
    edges = concatenate_edges(not_silence_ranges)
    intervals = get_rid_of_short_intervals(edges)
    for idx, (start_idx, end_idx) in enumerate(intervals[skip_idx:]):
        start_idx = max(0, start_idx - keep_silence)
        end_idx += keep_silence
        segment = audio[start_idx:end_idx]
        segment.export("./chunks/chunk{0}.mp3".format(idx), out_ext)
        segment = segment.set_channels(1)  # set_channels() returns a new segment; keep the result
        segment.export("./chunks/chunk{0}.wav".format(idx), format="wav")
        os.remove("./chunks/chunk{0}.mp3".format(idx))
    # emptyFolder("./chunks/")


# splitAudioBySilence("audio.mp3")
def split_audio(self, audio_segment, min_silence_len=500, silence_thresh=-30,
                keep_silence=100, seek_step=1):
    not_silence_ranges = detect_nonsilent(audio_segment, min_silence_len, silence_thresh, seek_step)

    def pairwise(iterable):
        "s -> (s0,s1), (s1,s2), (s2, s3), ..."
        a, b = itertools.tee(iterable)
        next(b, None)
        return zip(a, b)

    chunks = []
    audio_starts = []
    audio_ends = []
    # Split on the silent gaps between consecutive non-silent ranges
    for (start1, end1), (start2, end2) in pairwise(not_silence_ranges):
        chunks.append(audio_segment[end1:start2])
        audio_starts.append(end1)
        audio_ends.append(start2)
    return chunks, audio_starts, audio_ends
def get_segments(args, audio_file, segments_file):
    if os.path.exists(segments_file):
        with open(segments_file) as json_file:
            json_data = json.load(json_file)
        return json_data["sound_ranges"][0]["sounds"]
    else:
        long_nonsilence = detect_nonsilent(audio_file, min_silence_len=args.long_silence,
                                           silence_thresh=args.silence_thresh)
        silence = detect_silence(audio_file, min_silence_len=args.short_silence,
                                 silence_thresh=args.silence_thresh)
        gaps_silence = list(
            map(
                lambda x: [x[0] + args.short_silence / 2, x[1] - args.short_silence / 2],
                detect_silence(audio_file, min_silence_len=2 * args.short_silence,
                               silence_thresh=args.silence_thresh + 20)))
        nonsilence1 = split_long_nonsilence(long_nonsilence, silence, args.min * 1000, args.max * 1000)
        segments = split_long_nonsilence(nonsilence1, gaps_silence, args.min * 1000, args.max * 1000)
        return segments
def get_non_silent_ranges(filepath, audio_length, silence_length, silence_thresh):
    """
    Given a filepath to a .wav file and a target audio length, return all the
    non-silent ranges from the audio sample
    :param filepath: filepath to the .wav audio file
    :param audio_length: length in seconds of audio to process
    :param silence_length: minimum length of a silence to be used for a split
    :param silence_thresh: (in dBFS) anything quieter than this will be considered silence
    :return: 2D array: array of shape [num_ranges, 2]
    """
    # Load data into AudioSegment object and extract audio_length seconds
    audio_file = AudioSegment.from_wav(filepath)
    audio_file = audio_file[:audio_length * MS]
    # Use given parameters to return non_silent ranges
    ranges = []
    for i in range(10):
        ranges = detect_nonsilent(audio_file, min_silence_len=int(silence_length),
                                  silence_thresh=silence_thresh)
        if len(ranges) > 0:
            break
        else:
            silence_length /= 2
    return ranges
def cut_by_silence(precut_audio_path, output_folder, filebasename):
    if not os.path.exists(output_folder):
        os.mkdir(output_folder)
    # Use the pydub AudioSegment & silence modules to detect silence, cut and save;
    # return the last chunk's timestamp in milliseconds.
    cut_num = 0
    audio_segment = AudioSegment.from_wav(precut_audio_path)
    silence_thresh_tries = range(-40, -4)  # try thresholds from -40 dBFS up to -5 dBFS inclusive
    for silence_thresh in silence_thresh_tries:
        chunks = detect_nonsilent(audio_segment, min_silence_len=500, silence_thresh=silence_thresh)
        logging.debug("try {}".format(silence_thresh))
        if len(chunks) >= 2:
            for chunk in chunks:
                out_audio_file = os.path.join(
                    output_folder,
                    filebasename + "_" + str(TimestampMillisec64()) + "_" + str(cut_num) + ".wav")
                audio_segment[chunk[0]:chunk[1]].export(out_audio_file, format='wav')
                cut_num = cut_num + 1
            break
        if silence_thresh == -5 and len(chunks) < 2:
            out_audio_file = os.path.join(
                output_folder,
                filebasename + "_" + str(TimestampMillisec64()) + "_" + str(cut_num) + ".wav")
            audio_segment[chunks[0][0]:chunks[0][1]].export(out_audio_file, format='wav')
    return 60, cut_num
def preprocess_wav_files(my_dir):
    vowelsDict = defaultdict(list)
    for filename in os.listdir(my_dir):
        vowel = filename.split("_")[0][-1]
        # File read
        signal = AudioSegment.from_wav(my_dir + '/' + filename)
        # Remove silence - beginning and end
        non_sil_times = detect_nonsilent(signal, min_silence_len=50, silence_thresh=signal.dBFS * 1.5)
        if len(non_sil_times):
            signal = signal[non_sil_times[0][0]:non_sil_times[0][1]]
        # Downsampling to 16KHz
        signal = signal.set_frame_rate(RATE)
        # Wav segmentation
        segmented_signal = wav_segmentation(signal)
        segmented_signal = [chunk.get_array_of_samples() for chunk in segmented_signal]
        if '_' in filename:
            vowelsDict[vowel].extend(segmented_signal)
            # librosa_data[vowel].append(librosa_features(my_dir + '/' + filename))
        else:
            vowelsDict['iau'].extend(segmented_signal)
    return vowelsDict
def splitSound(self):
    self.__soundChunks = []
    gaps = silence.detect_nonsilent(self.__audio, silence_thresh=self.__threshold)
    for start, final in gaps:
        self.__soundChunks.append(self.__audio[start:final])
    return self
def trim_on_silence(audio_path, skip_idx=0, out_ext="wav", silence_thresh=-40,
                    min_silence_len=400, silence_chunk_len=100, keep_silence=200):
    audio = read_audio(audio_path)
    not_silence_ranges = silence.detect_nonsilent(
        audio, min_silence_len=silence_chunk_len, silence_thresh=silence_thresh)
    if not not_silence_ranges:
        print(audio_path)
        return []
    start_idx = not_silence_ranges[0][0]
    end_idx = not_silence_ranges[-1][1]
    start_idx = max(0, start_idx - keep_silence)
    end_idx = min(len(audio), end_idx + keep_silence)
    trimmed = audio[start_idx:end_idx]
    trimmed.export(audio_path, out_ext)
    return []
def split_on_silence_with_pydub(audio_path, skip_idx=0, out_ext="wav",
                                silence_thresh=-40, silence_chunk_len=100, keep_silence=100):
    filename = os.path.basename(audio_path).split('.', 1)[0]
    audio = read_audio(audio_path)
    not_silence_ranges = silence.detect_nonsilent(
        audio, min_silence_len=silence_chunk_len, silence_thresh=silence_thresh)
    edges = concatenate_edges(not_silence_ranges)
    intervals = get_rid_of_short_intervals(edges)
    # Save audio files
    audio_paths = []
    for idx, (start_idx, end_idx) in enumerate(intervals[skip_idx:]):
        start_idx = max(0, start_idx - keep_silence)
        end_idx += keep_silence
        target_audio_path = "{}/pre_audio/{}.{:04d}.{}".format(
            os.path.dirname(audio_path), filename, idx, out_ext)
        segment = audio[start_idx:end_idx]
        segment.export(target_audio_path, out_ext)
        audio_paths.append(target_audio_path)
    return audio_paths, intervals
def extractor(self):
    seg = AudioSegment.from_file(self.song)
    # reduce loudness of sounds over 120Hz (focus on bass drum, etc)
    seg = seg.low_pass_filter(120.0)
    # we'll call a beat: anything above average loudness
    beat_loudness = seg.dBFS
    # the fastest tempo we'll allow is 240 bpm (60000ms / 240beats)
    minimum_silence = int(60000 / 240.0)
    nonsilent_times = detect_nonsilent(seg, minimum_silence, beat_loudness)
    spaces_between_beats = []
    last_t = nonsilent_times[0][0]
    for peak_start, _ in nonsilent_times[1:]:
        spaces_between_beats.append(peak_start - last_t)
        last_t = peak_start
    # We'll base our guess on the median space between beats
    spaces_between_beats = sorted(spaces_between_beats)
    space = spaces_between_beats[int(len(spaces_between_beats) / 2)]
    bpm = 60000 / space
    return bpm
def _split_on_silence_ranges(self, min_silence_len=1000, silence_thresh=-16,
                             keep_silence=100, seek_step=1):
    # from the itertools documentation
    def pairwise(iterable):
        "s -> (s0,s1), (s1,s2), (s2, s3), ..."
        a, b = itertools.tee(iterable)
        next(b, None)
        return zip(a, b)

    if isinstance(keep_silence, bool):
        keep_silence = len(self.audio) if keep_silence else 0
    output_ranges = [
        [start - keep_silence, end + keep_silence]
        for (start, end) in detect_nonsilent(self.audio, min_silence_len, silence_thresh, seek_step)
    ]
    for range_i, range_ii in pairwise(output_ranges):
        last_end = range_i[1]
        next_start = range_ii[0]
        if next_start < last_end:
            range_i[1] = (last_end + next_start) // 2
            range_ii[0] = range_i[1]
    return [(max(start, 0), min(end, len(self.audio)))
            for start, end in output_ranges]
def split_videos(filename, output):
    # Split file on silences
    video = AudioSegment.from_file(filename, "mp4")
    nonsilent_ranges = silence.detect_nonsilent(video, silence_thresh=-40, min_silence_len=400)
    print("Done splitting audio, start splitting clips")
    print(filename)
    clip = VideoFileClip(filename)
    for i, nonsilent in enumerate(nonsilent_ranges):
        start = nonsilent[0] / 1000
        end = nonsilent[1] / 1000
        length = end - start
        long_vid = False
        while length > 10:
            long_vid = True
            temp = start + 5
            subpart = clip.subclip(start, temp)
            subpart.write_videofile("{}/000s{}.mp4".format(output, i))
            i += 1
            start += 5
            length -= 5
        if long_vid or length < 1:
            continue
        subpart = clip.subclip(start, end)
        subpart.write_videofile("{}/000{}.mp4".format(output, i))
def splitAudioFile(filename_mp3, in_dir, min_silence_len=400, silence_thresh=-65):
    sound = AudioSegment.from_mp3(filename_mp3)
    nonsilence_range = detect_nonsilent(sound, min_silence_len, silence_thresh)
    chunks_range = []
    for i, chunk in enumerate(nonsilence_range):
        if i == 0:
            print(chunk, len(nonsilence_range))
            start = chunk[0]
            end = (chunk[1] + nonsilence_range[i + 1][0]) / 2
            sound[:end].export(".\\tmp\\%d\\%d.wav" % (in_dir, i), format="wav", bitrate="16k")
        elif i == len(nonsilence_range) - 1:
            start = (nonsilence_range[i - 1][1] + chunk[0]) / 2
            end = chunk[1] + 1000.0
            sound[start:].export(".\\tmp\\%d\\%d.wav" % (in_dir, i), format="wav", bitrate="16k")
        else:
            start = (nonsilence_range[i - 1][1] + chunk[0]) / 2
            end = (chunk[1] + nonsilence_range[i + 1][0]) / 2
            sound[start:end].export(".\\tmp\\%d\\%d.wav" % (in_dir, i), format="wav", bitrate="16k")
        start = round(start / 1000, 1)
        end = round(end / 1000, 1)
        chunks_range.append((start, end))
    return chunks_range
def split_on_silence(audio_segment, min_silence_len=1000, silence_thresh=-16,
                     keep_silence=100, seek_step=1):
    """
    audio_segment - original pydub.AudioSegment() object

    min_silence_len - (in ms) minimum length of a silence to be used for a split.
        default: 1000ms

    silence_thresh - (in dBFS) anything quieter than this will be considered silence.
        default: -16dBFS

    keep_silence - (in ms) amount of silence to leave at the beginning and end of the chunks.
        Keeps the sound from sounding like it is abruptly cut off.
        (default: 100ms)
    """
    not_silence_ranges = detect_nonsilent(audio_segment, min_silence_len, silence_thresh, seek_step)
    chunks = []
    starttime = []
    endtime = []
    for start_i, end_i in not_silence_ranges:
        start_i = max(0, start_i - keep_silence)
        end_i += keep_silence
        chunks.append(audio_segment[start_i:end_i])
        starttime.append(start_i)
        endtime.append(end_i)
    return chunks, starttime, endtime
def get_segmented_samples(my_dir, dictsList, iter_num=5):
    all_pitchs = []
    for dictL in dictsList:
        all_speakers = []
        for speakerNum in dictL.keys():
            segments = []
            for file in dictL[speakerNum]:
                signal = AudioSegment.from_wav(my_dir + '\\' + file)
                # Remove silence - beginning and end
                non_sil_times = detect_nonsilent(signal, min_silence_len=50,
                                                 silence_thresh=signal.dBFS * 1.5)
                if len(non_sil_times):
                    signal = signal[non_sil_times[0][0]:non_sil_times[0][1]]
                # Downsampling to 16KHz
                signal = signal.set_frame_rate(RATE)
                # Wav segmentation
                segmented_signal = wav_segmentation(signal)
                segmented_signal = [chunk.get_array_of_samples() for chunk in segmented_signal]
                segments.append(segmented_signal)
            iter_segments_per_speaker = []
            for n in range(iter_num):
                tmp = []
                for segList in segments:
                    rand_idx = np.random.randint(len(segList))
                    tmp.append(segList[rand_idx])
                iter_segments_per_speaker.append(tmp)
            all_speakers.append(iter_segments_per_speaker)
        all_pitchs.append(all_speakers)
    return all_pitchs
def shorter_filler(json_result, audio_file, min_silence_len, start_time, non_silence_start):
    # Retry with a shorter minimum silence length
    min_silence_length = int(min_silence_len / 1.2)
    intervals = detect_nonsilent(audio_file,
                                 min_silence_len=min_silence_length,
                                 silence_thresh=-32.64)
    for interval in intervals:
        interval_audio = audio_file[interval[0]:interval[1]]
        if interval[1] - interval[0] >= 460:
            # Interval is still longer than the padded window (460 ms): recurse with a shorter silence length
            non_silence_start = shorter_filler(json_result, interval_audio, min_silence_length,
                                               interval[0] + start_time, non_silence_start)
        else:
            # Interval is shorter than the padded window: run the filler-word prediction
            if interval[1] - interval[0] > 10:
                if predict_filler(interval_audio) == 0:
                    # Detected a filler word
                    json_result.append({'start': non_silence_start,
                                        'end': start_time + interval[0],
                                        'tag': '1000'})  # tag 1000 means non-silence
                    non_silence_start = start_time + interval[0]
                    # Tag the filler word
                    json_result.append({'start': start_time + interval[0],
                                        'end': start_time + interval[1],
                                        'tag': '1111'})  # tag 1111 means filler word
    return non_silence_start
def volume_trial(file):
    # The threshold of volume (decibels)
    threshold = -45
    # The threshold of total time over the volume threshold (seconds)
    length_seconds = 5
    # Get audio from file and data about audio
    audio = AudioSegment.from_file(file)
    data = mediainfo(file)
    # The number of samples in the audio per second of audio
    samples_per_second = int(len(audio) / float(data["duration"]))
    # The time threshold in samples
    length_samples = length_seconds * samples_per_second
    # Find all sets of nonsilent samples
    nonsilences = silence.detect_nonsilent(audio, 1, silence_thresh=threshold)
    # Calculate the total amount of nonsilence
    total = 0
    for i in nonsilences:
        total += i[1] - i[0]
    print(total / samples_per_second)
    if total >= length_samples:
        return True
    else:
        return False
def calc_bpm(audio: AudioSegment) -> int:
    tmp_audio = effects.low_pass_filter(audio, 120)  # keep only content below 120 Hz (bass/kick)
    beat_volume = tmp_audio.dBFS
    min_silence = int(60000 / 240.0)  # Allow up to 240 bpm
    nonsilent_ranges = detect_nonsilent(tmp_audio, min_silence, beat_volume)
    spaces_between_beats = []
    last_t = nonsilent_ranges[0][0]
    for peak_start, _ in nonsilent_ranges[1:]:
        spaces_between_beats.append(peak_start - last_t)
        last_t = peak_start
    spaces_between_beats = sorted(spaces_between_beats)
    temp = int(len(spaces_between_beats) / 2)
    print(temp)
    if temp == 0:
        # This just means that this segment had no low-frequency content louder
        # than its average level; might as well discard it since we aren't going
        # to get a good bpm measurement from quieter sections of the song
        return 0
    space = spaces_between_beats[temp]
    bpm = 60000 / space
    return bpm
def trim_silent(self):
    """ Trims extraneous silence at the ends of the audio """
    a = AudioSegment.empty()
    for seg in silence.detect_nonsilent(self.audio):
        a = a.append(self.audio[seg[0]:seg[1]], crossfade=0)
    self.audio = a
    return self
def chunk(self):
    print('In Audio_Object.Chunk')
    ''' Chunk the audio in smaller files and save to chunk_path '''
    # open the audio file using pydub
    sound = AudioSegment.from_wav(self.audio_path + self.audio_filename + self.audio_extn)
    silence_to_keep = 100
    # split audio where silence is 700 milliseconds or more and get chunks
    chunks = split_on_silence(sound,
                              # experiment with this value for your target audio file
                              min_silence_len=700,
                              # adjust this per requirement
                              silence_thresh=sound.dBFS - 14,
                              # keep 100 ms of silence at the boundaries, adjustable as well
                              keep_silence=silence_to_keep,
                              )
    # create a directory to store the audio chunks
    if not os.path.isdir(self.chunk_path):
        os.mkdir(self.chunk_path)
    times = []
    # Detected non-silent chunks, which in our case would be spoken words.
    nonsilent_data = detect_nonsilent(sound, min_silence_len=700,
                                      silence_thresh=sound.dBFS - 14, seek_step=1)
    # convert ms to seconds
    print("start,Stop")
    for chunks_times in nonsilent_data:
        times.append([ct / 1000 for ct in chunks_times])
    self.ts = pd.DataFrame(times, columns=["start_time", "stop_time"])
    # print(self.ts.head())
    # process each chunk
    dur_sec = []
    text = []
    for i, audio_chunk in enumerate(chunks, start=1):
        # export audio chunk and save it in the chunk_path directory
        chunk_filename = os.path.join(self.chunk_path, f"chunk{i}.wav")
        audio_chunk.export(chunk_filename, format="wav")
        dur_sec.append(audio_chunk.duration_seconds - ((silence_to_keep * 2) / 1000))
        text.append(Audio_Object.extract_text(chunk_filename))
    # return the text for all chunks detected
    # return(self.whole_text)
    self.ts['duration'] = dur_sec
    self.ts['text'] = text
    self.ts['label'] = self.audio_filename
    self.save_ts_df()
def detect_nonsilences(self, sound):
    snd = AudioSegment.from_wav(sound)
    dBFS = snd.dBFS
    non_silent = detect_nonsilent(snd, min_silence_len=1000, silence_thresh=dBFS - 16)
    # Convert to seconds
    non_silence = [((start / 1000), (stop / 1000)) for start, stop in non_silent]
    return non_silence
def cutspeech(self, song1):
    not_silence_ranges = detect_nonsilent(song1, min_silence_len=100, silence_thresh=-32)
    # Guard against an all-silent input before indexing into the ranges
    if len(not_silence_ranges) == 0:
        return song1
    starti = not_silence_ranges[0][0]
    endi = not_silence_ranges[-1][1]
    return song1[starti:endi]
def detect_nonsilence_audiotime(filename, format):
    sound = AudioSegment.from_file(filename, format=format)
    dbfs = sound.dBFS
    print("Searching for non-silent segments; depending on the video size this may take a while, please be patient")
    timestamp_list = detect_nonsilent(sound, min_silence_len=700, silence_thresh=dbfs - 16, seek_step=1)
    print("Done: found " + str(len(timestamp_list)) + " speech segments")
    return timestamp_list
def split_on_silence_with_pydub(audio_path, deepspeech, skip_idx=0, out_ext="wav",
                                silence_thresh=-40, min_silence_len=400,
                                silence_chunk_len=100, keep_silence=100,
                                min_segment_length=0):
    # silence_chunk_len 100 -> 200
    filename = os.path.basename(audio_path).split('.', 1)[0]
    in_ext = audio_path.rsplit(".")[1]
    audio = read_audio(audio_path)
    audio = audio.set_channels(1)
    audio_sample_width = audio.sample_width
    min_chunk_len = int(float(1000) * min_segment_length)
    not_silence_ranges = silence.detect_nonsilent(
        audio, min_silence_len=silence_chunk_len, silence_thresh=silence_thresh)
    edges = [not_silence_ranges[0]]
    for idx in range(1, len(not_silence_ranges) - 1):
        cur_start = not_silence_ranges[idx][0]
        prev_end = edges[-1][1]
        prev_start = edges[-1][0]
        prev_len = prev_end - prev_start
        # if silence is too short or nonsilent is too short,
        # merge current nonsilence with prev one
        if cur_start - prev_end < min_silence_len or (min_chunk_len != 0 and prev_len < min_chunk_len):
            edges[-1][1] = not_silence_ranges[idx][1]
        else:
            edges.append(not_silence_ranges[idx])
    print("Finished finding Edges")
    audio_paths = []
    for idx, (start_idx, end_idx) in enumerate(edges[skip_idx:]):
        start_idx = max(0, start_idx - keep_silence)
        end_idx += keep_silence
        target_audio_path = "{}/{}.{:04d}.{}".format(
            os.path.dirname(audio_path), filename, idx, out_ext)
        segment = audio[start_idx:end_idx]
        if deepspeech:
            # Set this to deepspeech compatible (16 kHz, 16-bit, mono)
            temp = segment.set_frame_rate(16000)
            temp = temp.set_sample_width(2)
            temp = temp.set_channels(1)
            temp.export(target_audio_path, out_ext)
        else:
            segment.export(target_audio_path, out_ext)
        audio_paths.append(target_audio_path)
    return audio_paths
def bpm(seg):
    l_seg = seg.low_pass_filter(120.0)
    beat_loudness = l_seg.dBFS
    minimum_silence = int(60000 / 240.0)
    nonsilent_times = detect_nonsilent(l_seg, minimum_silence, beat_loudness)
    spaces_between_beats = []
    last_t = nonsilent_times[0][0]
    for peak_start, _ in nonsilent_times[1:]:
        spaces_between_beats.append(peak_start - last_t)
        last_t = peak_start
    spaces_between_beats = sorted(spaces_between_beats)
    # use integer division so the median index is valid under Python 3
    space = spaces_between_beats[len(spaces_between_beats) // 2]
    bpm = 60000 / space
    return bpm
def process(self, dl_entry):
    """Make new audio file in the media directory.

    Take the audio file pointed to by dl_entry, normalize,
    remove silence, convert to output_format.
    """
    input_format = dl_entry.file_extension.lstrip('.')
    try:
        loader = load_functions[input_format]
    except KeyError:
        loader = lambda file: AudioSegment.from_file(file=file, format=input_format)
    # This sometimes raised a pydub.exceptions.CouldntDecodeError
    segment = loader(dl_entry.file_path)
    segment = segment.normalize()  # First normalize
    # Try to remove silence
    loud_pos = detect_nonsilent(segment, min_silence_len=minimum_silence_length,
                                silence_thresh=silence_threshold)
    fade_in_length = rapid_fade_length
    fade_out_length = rapid_fade_length
    if len(loud_pos) == 1:
        loud_p = loud_pos[0]
        if loud_p[0] > silence_fade_length:
            fade_in_length = silence_fade_length
        if loud_p[1] < len(segment) - silence_fade_length:
            fade_out_length = silence_fade_length
        if loud_p[0] > 0 or loud_p[1] < len(segment):
            segment = segment[loud_p[0]:loud_p[1]]
    segment = segment.fade_in(fade_in_length).fade_out(fade_out_length)
    # Now write
    tof = tempfile.NamedTemporaryFile(delete=False, suffix=output_suffix, prefix=u'anki_audio_')
    temp_out_file_name = tof.name
    tof.close()
    segment.export(temp_out_file_name, output_format)
    os.unlink(dl_entry.file_path)  # Get rid of unprocessed version
    return temp_out_file_name, output_suffix