def readWavFunc(par):
    """Read a wav file, rescale it to float and resample it to the strategy rate.

    :param par: Parameter dict. Keys read here:
        ``par['parent']['wavFile']`` (path handed to ``wavread``),
        ``par['parent']['fs']`` (target sampling rate),
        ``par['iChannel']`` (1-based channel picked from a multi-channel file),
        ``par['tStartEnd']`` (empty, or ``[tStart, tEnd]`` in seconds).
    :type par: dict
    :return: ``(signalIn, name)`` where ``signalIn`` always has shape
        ``(1, nSamples)`` and ``name`` is the wav path as given.
    :rtype: tuple[np.ndarray, str]
    :raises TypeError: for 8 bit PCM data or any other sample format that is
        not int16, int32, float32 or float64.
    """
    name = par['parent']['wavFile']
    stratFs = par['parent']['fs']

    srcFs, signalIn = wavread(name)

    # Rescale from integer words to float for audio processing. Float data is
    # not rescaled: maxBit = 0 makes the divisor below equal to 1.
    dtypeName = signalIn.dtype.name
    if dtypeName == 'uint8':
        raise TypeError('8 bit PCM wav format not supported')
    elif dtypeName in ('int16', 'int32'):
        bits = signalIn.dtype.itemsize * 8
        maxBit = 2.**(bits - 1)
    elif dtypeName in ('float32', 'float64'):
        maxBit = 0
    else:
        # Previously this fell through and failed on an unbound maxBit.
        raise TypeError('unsupported wav sample format: %s' % dtypeName)

    signalIn = signalIn / (maxBit + 1)

    # Keep a single channel and always continue with a (1, nSamples) row so
    # that the sample axis is axis 1 for both mono and multi-channel files.
    if signalIn.ndim > 1:
        signalIn = signalIn[:, par['iChannel'] - 1]
    signalIn = signalIn[np.newaxis, :]

    if len(par['tStartEnd']) > 0:
        # Seconds -> 0-based half-open sample range. The MATLAB-style
        # "+[1, 0]" offset is not needed with Python slicing, and slice
        # bounds have to be integers.
        tStartEnd = np.asarray(par['tStartEnd'], dtype=float)
        iStartEnd = np.round(tStartEnd * srcFs).astype(int)
        signalIn = signalIn[:, iStartEnd[0]:iStartEnd[1]]

    if srcFs != stratFs:
        # This implementation is not numerically identical to matlab
        signalIn = resample(signalIn, stratFs, srcFs, axis=1)

    return signalIn, name
from nnresample import resample from skvideo.io import vread from skvideo.utils import rgb2gray root_path = Path("../TCDTIMITprocessing/downloadTCDTIMIT") for srcwavpath in root_path.glob('**/*.wav'): tgtwavpath = (srcwavpath.parent / (srcwavpath.stem + "_16khz" + srcwavpath.suffix)) tgtmfccpath = (srcwavpath.parent / (srcwavpath.stem + ".pkl")) # if not tgtmfccpath.exists(): if not tgtwavpath.exists() and '16khz' not in srcwavpath.stem: _, srcsig = wav.read(srcwavpath) max_nb_bit = float(2**(16 - 1)) srcsig = srcsig / (max_nb_bit + 1.0) resampled = resample(srcsig, 16000, 48000) rs = (resampled * (max_nb_bit + 1.0)).astype(np.int16) wav.write(tgtwavpath, 16000, rs) print(tgtwavpath) def create_pkl(mp4path): tgtpklpath = (mp4path.parent / (mp4path.stem + ".pkl")) if not tgtpklpath.exists(): try: images = rgb2gray(vread(mp4path)).astype(np.uint8).squeeze() face_detector = FaceDetector() faces = np.stack([ face_detector.crop_mouth(image, bounding_box_shape=(220, 150)) for image in images ], 0)
def read_waveform(file,
                  resample=True,
                  resampling_rate=22050,
                  to_mono=True,
                  mono_convertion_mode: Literal["left_only", "right_only",
                                                "downmix"] = "downmix"):
    """Read a WAV file and optionally convert it to mono and resample it.

    The returned ``info`` describes the returned ``data``: channel count,
    size, duration and bit rate are updated after mono conversion and
    resampling.

    :param file: Filepath of filename.
    :type file: str
    :param resample: Resample to resampling rate.
    :type resample: bool
    :param resampling_rate: Wanted sample rate.
    :type resampling_rate: int
    :param to_mono: Convert a two-channel signal to mono.
    :type to_mono: bool
    :param mono_convertion_mode: 1. read left channel only,
        2. read right channel only, or
        3. average left and right channels
    :return: Audio data and related info.
    :rtype: tuple[np.ndarray, dict]
    :raises ValueError: if a stereo file is converted to mono with an unknown
        ``mono_convertion_mode``.
    """
    info = dict.fromkeys([
        "filename", "sampling_rate", "channels", "size", "duration",
        "bit_depth", "bit_rate"
    ])

    # The bit depth is taken from the stdlib wave reader (bytes per sample).
    with wave.open(file) as audio:
        info["bit_depth"] = audio.getsampwidth() * 8

    with sf.SoundFile(file) as audio:
        data = audio.read()
        fileformat = audio.format
        info["filename"] = audio.name
        info["sampling_rate"] = audio.samplerate
        info["channels"] = audio.channels
        info["size"] = audio.frames

    assert fileformat == "WAV", "File must be a WAV-file."

    if info["channels"] == 2 and to_mono:
        info["channels"] = 1
        if mono_convertion_mode == "left_only":
            data = data[:, 0]
        elif mono_convertion_mode == "right_only":
            data = data[:, 1]
        elif mono_convertion_mode == "downmix":
            data = np.mean(data, axis=1)
        else:
            raise ValueError(
                f"Conversion mode \"{mono_convertion_mode}\" not specified.")

    if resample and info["sampling_rate"] != resampling_rate:
        data: np.ndarray = nnresample.resample(
            s=data, up=resampling_rate, down=info["sampling_rate"])  # noqa
        info["sampling_rate"] = resampling_rate
        info["size"] = len(data)

    # Derived fields are computed once, after every conversion, so that the
    # bit rate also reflects a mono down-mix when no resampling took place
    # (previously it kept the stereo value in that case).
    info["duration"] = info["size"] / info["sampling_rate"]
    info["bit_rate"] = (info["sampling_rate"] * info["bit_depth"] *
                        info["channels"])

    return data, info
# Click waveform used by sonify(); it is resampled from 88200 Hz to the
# requested sampling rate before being mixed in.
_SONIFY_CLICK_88200 = np.array([
    0.0000, 0.0000, -0.0001, 0.0002, -0.0001, 0.0001, -0.0000, -0.0001,
    0.0001, 0.0938, 0.1861, 0.2755, 0.3606, 0.4400, 0.5125, 0.5769,
    0.6324, 0.6778, 0.7127, 0.7362, 0.7484, 0.7487, 0.7373, 0.7143,
    0.6800, 0.6352, 0.5803, 0.5164, 0.4442, 0.3654, 0.2803, 0.1915,
    0.0989, 0.0054, -0.0885, -0.1810, -0.2704, -0.3560, -0.4356, -0.5086,
    -0.5734, -0.6295, -0.6755, -0.7108, -0.7354, -0.7479, -0.7491, -0.7382,
    -0.7159, -0.6823, -0.6380, -0.5837, -0.5202, -0.4485, -0.3700, -0.2853,
    -0.1965, -0.1043, -0.0107, 0.0832, 0.1758, 0.2655, 0.3511, 0.4314,
    0.5045, 0.5702, 0.6264, 0.6733, 0.7091, 0.7343, 0.7475, 0.7493,
    0.7392, 0.7174, 0.6845, 0.6408, 0.5871, 0.5240, 0.4529, 0.3744,
    0.2905, 0.2014, 0.1098, 0.0159, -0.0779, -0.1704, -0.2606, -0.3464,
    -0.4269, -0.5007, -0.5665, -0.6235, -0.6709, -0.7073, -0.7332, -0.7469,
    -0.7497, -0.7399, -0.7191, -0.6867, -0.6434, -0.5905, -0.5278, -0.4571,
    -0.3792, -0.2952, -0.2068, -0.1148, -0.0215, 0.0726, 0.1653, 0.2556,
    0.3416, 0.4225, 0.4966, 0.5631, 0.6206, 0.6683, 0.7058, 0.7318,
    0.7467, 0.7497, 0.7408, 0.7206, 0.6888, 0.6463, 0.5937, 0.5316,
    0.4613, 0.3838, 0.3002, 0.2119, 0.1202, 0.0268, -0.0673, -0.1600,
    -0.2506, -0.3368, -0.4182, -0.4926, -0.5595, -0.6176, -0.6658, -0.7039,
    -0.7307, -0.7461, -0.7499, -0.7416, -0.7220, -0.6909, -0.6488, -0.5971,
    -0.5352, -0.4656, -0.3883, -0.3051, -0.2171, -0.1254, -0.0322, 0.0620,
    0.1548, 0.2454, 0.3322, 0.4135, 0.4887, 0.5558, 0.6147, 0.6633,
    0.7021, 0.7294, 0.7456, 0.7499, 0.7425, 0.7234, 0.6929, 0.6517,
    0.6000, 0.5392, 0.4696, 0.3930, 0.3099, 0.2220, 0.1308, 0.0374,
    -0.0566, -0.1497, -0.2403, -0.3275, -0.4090, -0.4846, -0.5523, -0.6114,
    -0.6610, -0.7001, -0.7283, -0.7449, -0.7499, -0.7432, -0.7248, -0.6949,
    -0.6544, -0.6032, -0.5429, -0.4739, -0.3974, -0.3149, -0.2271, -0.1362,
    -0.0426, 0.0513, 0.1443, 0.2355, 0.3224, 0.4048, 0.4804, 0.5486,
    0.6084, 0.6583, 0.6982, 0.7269, 0.7444, 0.7500, 0.7439, 0.7261,
    0.6970, 0.6569, 0.6064, 0.5466, 0.4778, 0.4022, 0.3194, 0.2324,
    0.1413, 0.0479, -0.0457, -0.1393, -0.2302, -0.3177, -0.4002, -0.4762,
    -0.5451, -0.6051, -0.6559, -0.6962, -0.7256, -0.7437, -0.7500, -0.7445,
    -0.7276, -0.6988, -0.6595, -0.6095, -0.5501, -0.4822, -0.4064, -0.3244,
    -0.2374, -0.1464, -0.0536, 0.0407, 0.1338, 0.2252, 0.3128, 0.3957,
    0.4722, 0.5413, 0.6021, 0.6532, 0.6942, 0.7242, 0.7430, 0.7499,
    0.7452, 0.7287, 0.7009, 0.6619, 0.6128, 0.5537, 0.4862, 0.4109,
    0.3292, 0.2424, 0.1517, 0.0587, -0.0353, -0.1286, -0.2201, -0.3079,
    -0.3911, -0.4680, -0.5377, -0.5988, -0.6506, -0.6921, -0.7229, -0.7421,
    -0.7499, -0.7457, -0.7300, -0.7027, -0.6645, -0.6157, -0.5574, -0.4902,
    -0.4155, -0.3340, -0.2475, -0.1570, -0.0640, 0.0298, 0.1235, 0.2148,
    0.3031, 0.3865, 0.4639, 0.5338, 0.5957, 0.6477, 0.6902, 0.7213,
    0.7414, 0.7498, 0.7462, 0.7313, 0.7045, 0.6670, 0.6187, 0.5609,
    0.4943, 0.4198, 0.3389, 0.2525, 0.1622, 0.0693, -0.0245, -0.1181,
    -0.2098, -0.2982, -0.3820, -0.4597, -0.5301, -0.5923, -0.6452, -0.6879,
    -0.7199, -0.7405, -0.7496, -0.7469, -0.7323, -0.7065, -0.6693, -0.6218,
    -0.5644, -0.4984, -0.4241, -0.3438, -0.2574, -0.1675, -0.0747, 0.0194,
    0.1126, 0.2049, 0.2932, 0.3773, 0.4554, 0.5263, 0.5891, 0.6424,
    0.6859, 0.7184, 0.7397, 0.7495, 0.7472, 0.7336, 0.7081, 0.6717,
    0.6249, 0.5677, 0.5025, 0.4284, 0.3485, 0.2624, 0.1727, 0.0800,
    -0.0139, -0.1076, -0.1995, -0.2884, -0.3727, -0.4511, -0.5226, -0.5857,
    -0.6397, -0.6836, -0.7168, -0.7389, -0.7490, -0.7478, -0.7346, -0.7099,
    -0.6742, -0.6276, -0.5716, -0.5061, -0.4331, -0.3530, -0.2676, -0.1778,
    -0.0853, 0.0086, 0.1022, 0.1945, 0.2834, 0.3681, 0.4469, 0.5187,
    0.5824, 0.6369, 0.6814, 0.7152, 0.7379, 0.7487, 0.7483, 0.7355,
    0.7117, 0.6764, 0.6306, 0.5749, 0.5101, 0.4155, 0.3219, 0.2317,
    0.1463, 0.0680, -0.0022, -0.0631, -0.1134, -0.1532, -0.1817, -0.1991,
    -0.2060, -0.2026, -0.1902, -0.1699, -0.1427, -0.1106, -0.0748, -0.0375,
    0.0000, -0.0001, 0.0000, 0.0001, -0.0001, 0.0002, -0.0002, 0.0001,
    0
])


def sonify(curve,
           audio,
           sampling_rate,
           feature_rate,
           min_confidence=0.01,
           only_ticks=False,
           confidence=True,
           half_tempo=False,
           half_tempo_start=1):
    """Mix the peaks of the given curve as clicks with the given audio.

    A click is placed at every local maximum of ``curve`` (a sample that is
    larger than its predecessor and not smaller than its successor). Clicks
    that would run past the end of ``audio`` are skipped. If there are no
    peaks at all, the tick channel is silent.

    :param curve: Curve to sonify (should be normalised to 0-1 range)
    :type curve: np.ndarray
    :param audio: Audio to mix the sonified curve with.
    :type audio: np.ndarray
    :param sampling_rate: Sampling rate of the audio.
    :type sampling_rate: int or float
    :param feature_rate: Feature rate of the curve.
    :type feature_rate: int or float
    :param min_confidence: Minimum confidence required for the peaks
        (relative to the highest peak).
    :type min_confidence: float
    :param only_ticks: Sonification will only contain the ticks and not the
        audio.
    :type only_ticks: bool
    :param confidence: Tick volume is determined by the value of the curve
        (curve should be normalised to 0-1 range)
    :type confidence: bool
    :param half_tempo: Remove every other peak for half tempo.
    :type half_tempo: bool
    :param half_tempo_start: Start removing peaks for half tempo from this
        peak onwards. All peaks before this peak are removed.
    :type half_tempo_start: int
    :return: Two-column array: audio (or ticks if ``only_ticks``) in the
        first column, ticks in the second.
    :rtype: np.ndarray
    """
    # pos[i] is True when the curve rises into sample i; a peak is a rising
    # sample that is not followed by another rise.
    pos = np.append(curve, curve[-1]) > np.insert(curve, 0, curve[0])
    neg = ~pos
    peaks = np.where(pos[:pos.shape[0] - 1] * neg[1:])[0]

    if half_tempo:
        peaks = peaks[half_tempo_start::2]

    out = np.zeros_like(audio)

    # Without any peak there is nothing to normalise or mix; np.max on an
    # empty array would raise.
    if peaks.size > 0:
        values = curve[peaks].flatten()
        values = values / np.max(values)

        # Remove small peaks
        keep = values >= min_confidence
        peaks = peaks[keep]
        values = values[keep]

        # Quadratic fade-out from 1 down to 1/len(click). linspace fixes the
        # number of points, whereas arange with a fractional step may come
        # out one element short or long.
        n_click = len(_SONIFY_CLICK_88200)
        fade = np.linspace(1, 1 / n_click, n_click)**2
        click: np.ndarray = nnresample.resample(
            _SONIFY_CLICK_88200 * fade, up=sampling_rate, down=88200)  # noqa

        for peak, gain in zip(peaks, values):
            start = int(np.floor(peak / feature_rate * sampling_rate))
            stop = start + len(click)
            if stop <= len(out):
                tick = click * gain if confidence else click
                out[start:stop] = out[start:stop] + tick

    first_channel = out if only_ticks else audio
    return np.concatenate((first_channel[..., None], out[..., None]), axis=1)
def audio_to_novelty_curve(audio,
                           sampling_rate,
                           threshold=-74,
                           window_length=None,
                           stepsize=None,
                           log_compression=True,
                           compression_constant=1000,
                           resample_feature_rate=200):
    """Compute a novelty curve from audio via a band-wise STFT analysis.

    The normalised spectrogram is clamped at ``threshold``, split into five
    frequency bands, optionally log-compressed, differentiated along time
    (keeping only increases), normalised per band and averaged over the
    bands. The curve is then optionally resampled and a local average is
    subtracted.

    :param audio: Audio data.
    :type audio: np.ndarray
    :param sampling_rate: Sampling rate of the audio (Hz)
    :type sampling_rate: float or int
    :param threshold: Lower clamp for the normalised spectrogram (dB)
    :type threshold: int
    :param window_length: Length of the STFT window. Defaults to
        ``1024 * sampling_rate / 22050``.
    :type window_length: float
    :param stepsize: Stepsize for the STFT. Defaults to
        ``512 * sampling_rate / 22050``.
    :type stepsize: float
    :param log_compression: Enable/disable log compression.
    :type log_compression: bool
    :param compression_constant: Constant for log compression
    :type compression_constant: int
    :param resample_feature_rate: Feature rate of the resulting novelty curve
        (resampled, independent of stepsize)
    :type resample_feature_rate: int
    :return: Novelty curve and its feature rate.
    :rtype: tuple[np.ndarray, float or int]
    """
    if window_length is None:
        window_length: float = 1024 * sampling_rate / 22050
    if stepsize is None:
        stepsize: float = 512 * sampling_rate / 22050
    stft_window = np.hanning(round2(window_length))

    # Compute spectrogram
    spec_data, feature_rate, _, _ = audio_to_spectrogram_via_STFT(
        audio, sampling_rate, stft_window=stft_window, stepsize=stepsize)

    # Convert the dB threshold to a linear factor, normalise the spectrogram
    # to a maximum of 1 and clamp everything below the threshold (vectorised
    # instead of a per-element Python loop).
    threshold = 10**(threshold / 20)
    spec_data = np.maximum(spec_data / np.amax(spec_data), threshold)

    # bandwise processing
    bands = np.array([[0, 500], [500, 1250], [1250, 3125], [3125, 7812.5],
                      [7812.5, math.floor(sampling_rate / 2)]])
    band_novelty_curves = np.zeros((len(bands), spec_data.shape[1]))

    for band in range(0, len(bands)):
        # Band edges in Hz -> spectrogram row indices.
        bins = np.around(bands[band] / (sampling_rate / window_length))
        bins[0] = max(0, bins[0])
        bins[1] = min(len(spec_data) - 1, bins[1])

        # Band novelty curve
        band_data = spec_data[int(bins[0]):int(bins[1]), :]
        if log_compression and compression_constant > 0:
            band_data = np.log(1 + band_data * compression_constant) / (
                np.log(1 + compression_constant))

        # Smoothed differentiator
        diff_len = 0.3
        diff_len = max(math.ceil(diff_len * sampling_rate / stepsize), 5)
        diff_len = 2 * round2(diff_len / 2) + 1
        diff_half = math.floor(diff_len / 2)
        diff_filter = np.hanning(diff_len) * np.concatenate(
            (-1 * np.ones(diff_half), np.array([0]), np.ones(diff_half)))
        diff_filter = diff_filter[..., None].T
        # Pad both ends by repeating the first / last column.
        rm1 = np.array([band_data[:, 0] for _ in range(diff_half)]).T
        rm2 = np.array([band_data[:, -1] for _ in range(diff_half)]).T
        hhh = np.concatenate((rm1, band_data, rm2), axis=1)
        band_diff = filter2(diff_filter, hhh)
        band_diff[band_diff < 0] = 0
        band_diff = band_diff[:, diff_half - 1:-1 - diff_half]

        # Normalize band
        norm_len = 5
        norm_len = max(math.ceil(norm_len * sampling_rate / stepsize), 3)
        norm_half = math.floor(norm_len / 2)
        norm_filter = np.hanning(norm_len)[..., None]
        norm_curve = filter2(norm_filter / np.sum(norm_filter),
                             np.sum(band_data, axis=0)[..., None]).T

        # Boundary correction
        norm_filter_sum = (
            (np.sum(norm_filter) - np.cumsum(norm_filter, axis=0)) /
            np.sum(norm_filter)).T
        norm_curve[:, 0:norm_half] = norm_curve[:, 0:norm_half] / np.fliplr(
            norm_filter_sum[:, 0:norm_half])
        norm_curve[:, -norm_half:] = (norm_curve[:, -norm_half:] /
                                      norm_filter_sum[:, 0:norm_half])
        band_diff /= norm_curve

        # Compute novelty curve of band
        band_novelty_curves[band, :] = np.sum(band_diff, axis=0)

    novelty_curve = np.mean(band_novelty_curves, axis=0)

    # Resample curve
    # NOTE(review): the curve only becomes a column vector inside this
    # branch; when no resampling happens it reaches filter2 as a 1-D array -
    # confirm that filter2 accepts that.
    if resample_feature_rate > 0 and resample_feature_rate != feature_rate:
        novelty_curve: np.ndarray = nnresample.resample(
            novelty_curve, resample_feature_rate, int(feature_rate))  # noqa
        novelty_curve = novelty_curve[..., None]
        feature_rate = resample_feature_rate

    # Average subtraction
    smooth_len: int = max(int(math.ceil(1.5 * sampling_rate / stepsize)), 3)
    smooth_filter: np.ndarray = np.hanning(smooth_len)[..., None]
    local_average: np.ndarray = filter2(smooth_filter / np.sum(smooth_filter),
                                        novelty_curve)
    novelty_curve = novelty_curve - local_average
    novelty_curve[novelty_curve < 0] = 0

    return novelty_curve, feature_rate
def resample(self, fs):
    """Resample the stored data to ``fs`` and record ``fs`` as the new rate.

    ``self.data`` is replaced by the result of ``nnresample.resample`` (up
    factor ``fs``, down factor the current ``self.fs``); ``self.fs`` is only
    updated once the resampling has succeeded.
    """
    current_rate = self.fs
    self.data = nnresample.resample(self.data, up=fs, down=current_rate)
    self.fs = fs
def resample(wav, old_sr, new_sr):
    """Resample ``wav`` from ``old_sr`` to ``new_sr`` with ``nnresample``.

    Thin wrapper that puts the source rate before the target rate; the
    underlying call takes them as ``up`` (new) and ``down`` (old).
    """
    resampled = nnresample.resample(wav, up=new_sr, down=old_sr)
    return resampled