def _inner(ai: AudioTensor) -> AudioTensor:
    "Split signal at points of silence greater than 2*pad_ms"
    # remove_type, pad_ms, and threshold are closed over from the enclosing
    # scope; _merge_splits is assumed to be defined elsewhere in the module.
    if remove_type is None:
        return ai
    padding = int(pad_ms / 1000 * ai.sr)
    if padding > ai.nsamples:
        return ai
    splits = split(ai.numpy(), top_db=threshold, hop_length=padding)
    if remove_type == "split":
        sig = [
            ai[:, max(a - padding, 0):min(b + padding, ai.nsamples)]
            for (a, b) in _merge_splits(splits, padding)
        ]
    elif remove_type == "trim":
        sig = [ai[:, max(splits[0, 0] - padding, 0):splits[-1, -1] + padding]]
    elif remove_type == "all":
        sig = [
            torch.cat([
                ai[:, max(a - padding, 0):min(b + padding, ai.nsamples)]
                for (a, b) in _merge_splits(splits, padding)
            ], dim=1)
        ]
    else:
        raise ValueError(
            f"Valid options for silence removal are None, 'split', 'trim', 'all' not '{remove_type}'."
        )
    ai.data = torch.cat(sig, dim=-1)
    return ai
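# _inner above is a closure; a plausible enclosing factory (the outer name and
# defaults here are assumptions, not from the source) would capture
# remove_type, threshold, and pad_ms like so:
#
# def RemoveSilence(remove_type="trim", threshold=20, pad_ms=20):
#     def _inner(ai: AudioTensor) -> AudioTensor:
#         ...  # body as above
#     return _inner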
def tfm_remove_silence(signal, rate, remove_type, threshold=20, pad_ms=200):
    '''Split signal at points of silence greater than 2*pad_ms'''
    actual = signal.clone().squeeze()
    padding = int(pad_ms / 1000 * rate)
    if padding > len(actual):
        return [actual]
    splits = split(actual.numpy(), top_db=threshold, hop_length=padding)
    if remove_type == "split":
        return [
            actual[max(a - padding, 0):min(b + padding, len(actual))]
            for (a, b) in splits
        ]
    elif remove_type == "trim":
        return [
            actual[max(splits[0, 0] - padding, 0):splits[-1, -1] + padding].unsqueeze(0)
        ]
    elif remove_type == "all":
        return [
            torch.cat([
                actual[max(a - padding, 0):min(b + padding, len(actual))]
                for (a, b) in splits
            ])
        ]
    else:
        raise ValueError(
            f"Valid options for silence removal are None, 'split', 'trim', 'all' not '{remove_type}'."
        )
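# Minimal usage sketch for tfm_remove_silence, assuming torchaudio for I/O and
# librosa.effects.split imported as `split`; "speech.wav" is a hypothetical file.
import torch
import torchaudio
from librosa.effects import split

signal, rate = torchaudio.load("speech.wav")
segments = tfm_remove_silence(signal, rate, remove_type="split")  # list of voiced chunks
trimmed, = tfm_remove_silence(signal, rate, remove_type="trim")   # single end-trimmed tensor
merged, = tfm_remove_silence(signal, rate, remove_type="all")     # all silence cut out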
def main_():
    ap = argparse.ArgumentParser()
    ap.add_argument("--hop_size", type=int, default=110)
    ap.add_argument("--fft_size", type=int, default=2048)
    ap.add_argument("wav_list", type=str)
    ap.add_argument("ali_rspecifier", type=str)
    args = ap.parse_args()

    wavlist_fp = open(args.wav_list, mode='rt') if args.wav_list != "-" else sys.stdin
    with kaldi_io.open_or_fd(args.ali_rspecifier, mode="wb") as ali_writer:
        for line in wavlist_fp:
            s = line.strip().replace("\t", " ").split(" ")
            assert len(s) == 2
            utt, wav_file = s
            rate, wav = read(wav_file)
            wav = wav.astype("float")
            voiced = np.zeros_like(wav, dtype=np.int32)
            ans = split(wav, frame_length=args.fft_size, hop_length=args.hop_size)
            for a in ans:
                sidx, eidx = a
                voiced[sidx:eidx] = 1
            kaldi_io.write_vec_int(ali_writer, voiced, key=utt)
    wavlist_fp.close()
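# Example invocation of the alignment writer above (the script name is
# hypothetical); it reads "utt wav_path" pairs and writes per-sample
# voiced/unvoiced flags as Kaldi integer vectors:
#
#   python compute_vad_ali.py --fft_size 2048 --hop_size 110 wav.list ark:voiced.ark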
def tfm_chop_silence(signal, rate, threshold=20, pad_ms=200):
    '''Split signal at points of silence greater than 2*pad_ms'''
    actual = signal.clone().squeeze()
    padding = int(pad_ms / 1000 * rate)
    if padding > len(actual):
        return [actual]
    splits = split(actual.numpy(), top_db=threshold, hop_length=padding)
    # pad each voiced interval by `padding` samples, clamped to the signal bounds
    return [
        actual[max(a - padding, 0):min(b + padding, len(actual))]
        for (a, b) in splits
    ]
def _inner(ai: AudioItem) -> AudioItem:
    '''Split signal at points of silence greater than 2*pad_ms'''
    if remove_type is None:
        return ai
    padding = int(pad_ms / 1000 * ai.sr)
    if padding > ai.nsamples:
        return ai
    actual = ai.sig.clone()
    splits = split(actual.numpy(), top_db=threshold, hop_length=padding)
    if remove_type == "split":
        sig = [
            actual[:, max(a - padding, 0):min(b + padding, ai.nsamples)]
            for (a, b) in _merge_splits(splits, padding)
        ]
    elif remove_type == "trim":
        sig = [actual[:, max(splits[0, 0] - padding, 0):splits[-1, -1] + padding]]
    elif remove_type == "all":
        sig = [
            torch.cat([
                actual[:, max(a - padding, 0):min(b + padding, ai.nsamples)]
                for (a, b) in _merge_splits(splits, padding)
            ], dim=1)
        ]
    else:
        raise ValueError(
            f"Valid options for silence removal are None, 'split', 'trim', 'all' not '{remove_type}'."
        )
    return AudioItem((*sig, ai.sr, ai.path))
def trim_pauses_and_split_into_chunks(audio, top_db=30, chunk_duration=4, sampling_rate=22050):
    """
    Trims all the pauses in the file and cuts it into chunks of a given
    chunk_duration. For example, for a 50-second file it will first remove
    silence (say 40 seconds remain) and then return 10 arrays, each
    representing a 4-second chunk. Chunks are cut contiguously, so stacking
    them reproduces the trimmed file.

    :param audio: numpy array representing an audio signal
    :param top_db: the threshold (in decibels) below reference to consider as silence
    :param chunk_duration: length of each chunk, in seconds
    :param sampling_rate: number of discrete samples in every second, not to be
        confused with the chunk size we cut the audio into
    :return: list of m np.ndarray [shape=(chunk_duration*sampling_rate,)],
        m = ceil(len(trimmed_audio) / (chunk_duration * sampling_rate))
    """
    # trim pauses
    intervals = effects.split(audio, top_db=top_db)
    trimmed_audio = np.empty(0)
    for i in range(len(intervals)):
        trimmed_audio = np.concatenate(
            (trimmed_audio, audio[intervals[i][0]:intervals[i][1]]))
    # split the trimmed signal into little chunks
    chunks = []
    step = chunk_duration * sampling_rate
    for i in range(0, len(trimmed_audio), step):
        chunks.append(trimmed_audio[i:i + step])
    return chunks
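# Usage sketch for trim_pauses_and_split_into_chunks ("speech.wav" is a
# hypothetical file; `effects` is librosa.effects):
import numpy as np
import librosa
from librosa import effects

audio, sr = librosa.load("speech.wav", sr=22050)
chunks = trim_pauses_and_split_into_chunks(audio, top_db=30, chunk_duration=4, sampling_rate=sr)
print(len(chunks), "chunks, each up to", 4 * sr, "samples long")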
def encodes(self, ai: AudioTensor) -> AudioTensor:
    if self.remove_type is None:
        return ai
    padding = int(self.pad_ms / 1000 * ai.sr)
    if padding > ai.nsamples:
        return ai
    splits = split(ai.numpy(), top_db=self.threshold, hop_length=padding)
    if self.remove_type == RemoveType.Split:
        sig = [
            ai[:, max(a - padding, 0):min(b + padding, ai.nsamples)]
            for (a, b) in _merge_splits(splits, padding)
        ]
    elif self.remove_type == RemoveType.Trim:
        sig = [ai[:, max(splits[0, 0] - padding, 0):splits[-1, -1] + padding]]
    elif self.remove_type == RemoveType.All:
        sig = [
            torch.cat([
                ai[:, max(a - padding, 0):min(b + padding, ai.nsamples)]
                for (a, b) in _merge_splits(splits, padding)
            ], dim=1)
        ]
    else:
        raise ValueError(
            f"Valid options for silence removal are None, RemoveType.Split, "
            f"RemoveType.Trim, RemoveType.All, but not '{self.remove_type}'."
        )
    ai.data = torch.cat(sig, dim=-1)
    return ai
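# encodes above reads like a fastai-style Transform method; a minimal host
# class sketch (constructor defaults here are assumptions) would be:
#
# class RemoveSilence(Transform):
#     "Remove silence from an AudioTensor according to remove_type."
#     def __init__(self, remove_type=RemoveType.Trim, threshold=20, pad_ms=20):
#         self.remove_type, self.threshold, self.pad_ms = remove_type, threshold, pad_ms
#     # encodes as defined above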
def clear_from_silence(wave):
    # top_decibells, win_len, and hop_len are module-level settings
    sounded_ints = effects.split(wave, top_db=top_decibells,
                                 frame_length=win_len, hop_length=hop_len)
    sounded_wave = [wave[inter[0]:inter[1]] for inter in sounded_ints]
    return np.concatenate(sounded_wave)
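# clear_from_silence relies on the module-level settings named above; one
# plausible configuration (these exact values are assumptions, not from the
# source):
import numpy as np
from librosa import effects

top_decibells = 30  # silence threshold in dB below the reference peak
win_len = 2048      # analysis frame length in samples
hop_len = 512       # hop between analysis frames in samples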
def main_():
    ap = argparse.ArgumentParser()
    ap.add_argument("--hop_size", type=int, default=110)
    ap.add_argument("--fft_size", type=int, default=2048)
    ap.add_argument("wav_list", type=str)
    ap.add_argument("wav_outdir", type=str)
    args = ap.parse_args()

    wavlist_fp = open(args.wav_list, mode='rt') if args.wav_list != "-" else sys.stdin
    for line in wavlist_fp:
        s = line.strip().replace("\t", " ").split(" ")
        assert len(s) == 2
        utt, wav_file = s
        rate, wav = read(wav_file)
        wav = wav.astype("float")
        ans = split(wav, frame_length=args.fft_size, hop_length=args.hop_size)
        voiced_wav = [wav[a[0]:a[1]] for a in ans]
        voiced_wav = np.concatenate(voiced_wav).astype(np.int16)
        write(join(args.wav_outdir, "{}.wav".format(utt)), rate, voiced_wav)
    wavlist_fp.close()
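# Example invocation of the wav writer above (the script name is hypothetical);
# it reads "utt wav_path" pairs and writes silence-stripped 16-bit wavs:
#
#   python remove_silence_wavs.py --fft_size 2048 --hop_size 110 wav.list out_wavs/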
def split_sound_files(sound_waves, sound_types):
    # debug helper: writes each detected non-silent interval out as its own wav
    test_sound = np.float64(sound_waves[0][0][1])
    splits = split(test_sound, top_db=10, frame_length=10)
    for idx, interval in enumerate(splits):
        sci_wav.write(
            "/export/home/amatskev/testlibrosa/second_test{}.wav".format(idx),
            16000,
            np.uint8(test_sound[interval[0]:interval[1]]))
def truncate_silence(self, signal):
    nonsilent_indices = split(y=signal,
                              top_db=self.top_db,
                              frame_length=self.frame_length,
                              hop_length=self.frame_skip)
    # Only keep nonsilent intervals of signal
    signal_intervals = []
    for index in nonsilent_indices:
        signal_interval = signal[index[0]:index[1]]
        signal_intervals.append(signal_interval)
    # Return flattened array
    return np.concatenate(signal_intervals, axis=0)
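# truncate_silence above is written as a method; a minimal host class sketch
# (the class name and constructor defaults are assumptions, the attribute
# names follow the method body) would be:
#
# class SilenceTruncator:
#     def __init__(self, top_db=30, frame_length=2048, frame_skip=512):
#         self.top_db, self.frame_length, self.frame_skip = top_db, frame_length, frame_skip
#     # truncate_silence as defined above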
def tfm_trim_silence(signal, rate, threshold=20, pad_ms=200):
    '''Remove silence from start and end of audio'''
    actual = signal.clone().squeeze()
    padding = int(pad_ms / 1000 * rate)
    splits = split(actual.numpy(), top_db=threshold)
    # clamp the start so the padded slice cannot wrap around via a negative index
    return actual[max(splits[0, 0] - padding, 0):splits[-1, -1] + padding].unsqueeze(0)
def preprocess_track(track_id):
    iad_path = os.path.join(source_path, "%s_SOURCEID.lab" % track_id)
    # the directory looks like "J_Q_T_reduced"
    spec_path = os.path.join(
        homedir, "MedleyDB/processed/%d_%d_%d_reduced_new/" % (J, Q, T))
    input_path = os.path.join(spec_path, "input")
    label_path = os.path.join(spec_path, "labels")
    if not os.path.exists(input_path):
        os.makedirs(input_path)
    if not os.path.exists(label_path):
        os.makedirs(label_path)
    # load the stems listed in the track metadata
    meta_file = ("/home/laura/medleydb/medleydb/data/Metadata/" + track_id +
                 "_METADATA.yaml")
    with open(meta_file) as f:
        data = yaml.load(f, Loader=yaml.SafeLoader)
    stems_files = [
        stem["filename"] for stem in data["stems"].values()
        if stem["instrument"] in MEDLEYDB_INSTRUMENTS
    ]
    if stems_files == []:
        return track_id
    # mix the selected stems down to a single mono signal
    y = np.mean(
        np.stack([
            librosa.load(
                os.path.join(medleydir, track_id, track_id + "_STEMS", audio_file),
                sr=sr,
                res_type="kaiser_fast",
            )[0] for audio_file in stems_files
        ]),
        axis=0,
    )
    # get the non-silent intervals
    intervals = split(y, top_db=10, frame_length=samples_per_snippet,
                      hop_length=samples_per_snippet)
    for start_i, end_i in intervals:
        for i in tqdm(range(start_i, end_i, samples_per_snippet), unit="clip"):
            sound_bite = y[i:i + samples_per_snippet]
            # keep all sound bites except the last one (which is not full length)
            if len(sound_bite) == samples_per_snippet:
                sound_bite = preprocess(sound_bite)
                S_dict = get_scattering_coefficients(sound_bite, order1_indices,
                                                     order2_indices,
                                                     scattering.forward)
                label = preprocess_label(iad_path, i, i + samples_per_snippet)
                np.save(
                    os.path.join(input_path, "%s_%d.npy" % (track_id, int(i / sr))),
                    S_dict,
                )
                np.save(
                    os.path.join(label_path, "%s_%d.npy" % (track_id, int(i / sr))),
                    label,
                )
    return track_id
def trim(signal, top_db=20):
    from librosa.effects import split
    # keep everything from the first to the last non-silent sample
    intervals = split(signal, top_db=top_db)
    return signal[intervals[0][0]:intervals[-1][-1]]
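# Quick runnable check of trim on a synthetic signal (silence, a 440 Hz tone,
# silence); the parameter values here are arbitrary:
import numpy as np

sr = 16000
tone = 0.5 * np.sin(2 * np.pi * 440 * np.arange(sr) / sr)
padded = np.concatenate([np.zeros(sr), tone, np.zeros(sr)])
trimmed = trim(padded, top_db=20)
assert len(trimmed) < len(padded)  # leading/trailing silence removed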
def extract_log_mel_feats(set_type, path_to_csv, path_to_files, out_path, sr,
                          fft_size, hop, n_mels):
    """
    Extract features from given files and store them in binary format.

    :param set_type: 'train' or 'test'
    :param path_to_csv: path to loaded csv
    :param path_to_files: path to loaded data
    :param out_path: path to store extracted features
    :param sr: input files sample rate
    :param fft_size: size of fft window
    :param hop: hop size
    :param n_mels: number of mel bands
    :return: label-to-id mapping for 'train', None for 'test'
    """
    set_type = set_type.lower()
    if set_type not in ['train', 'test']:
        raise Exception('Such set type not supported: {}'.format(set_type))

    feats = []
    if set_type == 'train':
        meta = pd.read_csv(path_to_csv, sep='\t', header=None)
        meta.columns = ['file', 'unk1', 'unk2', 'duration', 'type']
        file_names = list(meta['file'])
        n_files = len(file_names)
        labels = list(meta['type'])
        uniq_labels = np.sort(np.unique(labels))
        label_to_id = {label: i for i, label in enumerate(uniq_labels)}
        print('Total files:', n_files)
        for i, (file_name, label) in tqdm(enumerate(zip(file_names, labels))):
            wav_data, sr = load_wav(os.path.join(path_to_files, file_name), sr=sr)
            for part in split(wav_data, top_db=30):
                start, end = part
                # skip ultra short parts
                if (end - start) < fft_size:
                    continue
                wav_part = wav_data[start:end]
                mel_spec = melspectrogram(wav_part, n_fft=fft_size,
                                          hop_length=hop, n_mels=n_mels,
                                          fmax=sr // 2)
                log_mel_spec = power_to_db(mel_spec, ref=np.max)
                feats.append({
                    'fname': file_name,
                    'feature': log_mel_spec,
                    'label_id': label_to_id[label]
                })
        with open(out_path, 'wb') as f:
            pickle.dump(feats, f)
        return label_to_id
    else:
        for i, file_name in tqdm(enumerate(os.listdir(path_to_files))):
            wav_data, sr = load_wav(os.path.join(path_to_files, file_name), sr=sr)
            if len(wav_data) == 0:
                wav_data = np.zeros(sr)
            mel_spec = melspectrogram(wav_data, n_fft=fft_size, n_mels=n_mels,
                                      fmax=sr // 2)
            log_mel_spec = power_to_db(mel_spec, ref=np.max)
            feats.append({
                'fname': file_name,
                'feature': log_mel_spec,
            })
        with open(out_path, 'wb') as f:
            pickle.dump(feats, f)
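# Sketch of a feature-extraction call for the train split; every path and
# parameter value below is a hypothetical placeholder:
label_map = extract_log_mel_feats(
    set_type='train',
    path_to_csv='meta/train.tsv',
    path_to_files='audio/train',
    out_path='feats/train.pkl',
    sr=16000,
    fft_size=1024,
    hop=512,
    n_mels=64,
)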