Beispiel #1
0
def readWavFunc(par):
    """Read a wav file, pick one channel, optionally crop and resample it.

    Keys read from ``par``:

    * ``par['parent']['wavFile']`` -- file name handed to ``wavread``.
    * ``par['parent']['fs']`` -- target sampling rate; the signal is
      resampled when the file's own rate differs from it.
    * ``par['iChannel']`` -- 1-based channel index, only read when the file
      has more than one channel.
    * ``par['tStartEnd']`` -- empty, or a ``[tStart, tEnd]`` pair that is
      multiplied by the file's sampling rate to obtain the crop indices.

    Returns ``(signalIn, name)``.  As in the original code, a file that is
    read as a 1-D array is returned with a leading singleton axis, while a
    channel picked from a multi-channel file stays 1-D.

    Raises ``TypeError`` for 8 bit PCM data and for any other sample dtype
    that is not int16, int32, float32 or float64.
    """
    name = par['parent']['wavFile']
    stratFs = par['parent']['fs']

    srcFs, signalIn = wavread(name)

    # Rescale from integer words to float for audio processing.
    if signalIn.dtype == 'uint8':
        raise TypeError('8 bit PCM wav format not supported')
    elif signalIn.dtype == 'int16':
        bits = 16
        maxBit = 2.**(bits-1)
    elif signalIn.dtype == 'int32':
        bits = 32
        maxBit = 2.**(bits-1)
    elif signalIn.dtype == 'float32' or signalIn.dtype == 'float64':
        # Float data is not rescaled: the divisor below becomes 1.
        maxBit = 0
    else:
        # Previously this fell through and failed with a NameError on maxBit.
        raise TypeError('unsupported wav sample format: %s' % signalIn.dtype)

    signalIn = signalIn/(maxBit+1)

    if len(signalIn.shape) > 1:
        signalIn = signalIn[:, par['iChannel']-1]
    else:
        signalIn = signalIn[np.newaxis, :]

    # In both layouts above the samples run along the LAST axis, so cropping
    # and resampling address that axis instead of a fixed axis number.
    if len(par['tStartEnd']) > 0:
        # NOTE(review): the [1, 0] offset looks like a leftover from 1-based
        # indexing and skips one sample at the start -- confirm before
        # changing it.
        iStartEnd = np.round(
            np.asarray(par['tStartEnd'], dtype=float)*srcFs + np.array([1, 0])
        ).astype(int)   # slice bounds must be integers
        signalIn = signalIn[..., iStartEnd[0]:iStartEnd[1]]

    if srcFs != stratFs:    # This implementation is not numerically identical to matlab
        signalIn = resample(signalIn, stratFs, srcFs, axis=signalIn.ndim - 1)

    return signalIn, name
    
        
    
Beispiel #2
0
from nnresample import resample
from skvideo.io import vread
from skvideo.utils import rgb2gray

# Directory that is searched recursively for wav files.
root_path = Path("../TCDTIMITprocessing/downloadTCDTIMIT")

# Write a 16 kHz copy ("<stem>_16khz<suffix>") next to every wav file that
# does not have one yet; files that are themselves such copies are skipped.
for srcwavpath in root_path.glob('**/*.wav'):
    tgtwavpath = (srcwavpath.parent /
                  (srcwavpath.stem + "_16khz" + srcwavpath.suffix))
    tgtmfccpath = (srcwavpath.parent / (srcwavpath.stem + ".pkl"))
    # if not tgtmfccpath.exists():
    if not tgtwavpath.exists() and '16khz' not in srcwavpath.stem:
        # Use the rate stored in the file instead of assuming 48 kHz.
        srcfs, srcsig = wav.read(srcwavpath)
        max_nb_bit = float(2**(16 - 1))
        srcsig = srcsig / (max_nb_bit + 1.0)
        resampled = resample(srcsig, 16000, srcfs)
        scaled = resampled * (max_nb_bit + 1.0)
        # Filter overshoot can exceed the int16 range; clip so the cast
        # saturates instead of wrapping around.
        int16_limits = np.iinfo(np.int16)
        rs = np.clip(scaled, int16_limits.min,
                     int16_limits.max).astype(np.int16)
        wav.write(tgtwavpath, 16000, rs)
        print(tgtwavpath)


def create_pkl(mp4path):
    tgtpklpath = (mp4path.parent / (mp4path.stem + ".pkl"))
    if not tgtpklpath.exists():
        try:
            images = rgb2gray(vread(mp4path)).astype(np.uint8).squeeze()
            face_detector = FaceDetector()
            faces = np.stack([
                face_detector.crop_mouth(image, bounding_box_shape=(220, 150))
                for image in images
            ], 0)
Beispiel #3
0
def read_waveform(file,
                  resample=True,
                  resampling_rate=22050,
                  to_mono=True,
                  mono_convertion_mode: Literal["left_only", "right_only",
                                                "downmix"] = "downmix"):
    """Read a WAV file and convert it to the wanted format.

    :param file: Filepath of filename.
    :type file: str
    :param resample: Resample to ``resampling_rate`` when the file's rate
        differs from it.
    :type resample: bool
    :param resampling_rate: Wanted sample rate.
    :type resampling_rate: int
    :param to_mono: Convert a two-channel signal to mono. Files with any
        other channel count are left untouched.
    :type to_mono: bool
    :param mono_convertion_mode: ``"left_only"`` keeps channel 0,
        ``"right_only"`` keeps channel 1, ``"downmix"`` averages the channels.
    :return: Audio data and an info dict with the keys ``filename``,
        ``sampling_rate``, ``channels``, ``size``, ``duration``,
        ``bit_depth`` and ``bit_rate``.
    :rtype: tuple[np.ndarray, dict]
    :raises AssertionError: If the file is not reported as ``"WAV"``.
    :raises ValueError: If ``mono_convertion_mode`` is not a known mode.
    """

    info = dict.fromkeys([
        "filename", "sampling_rate", "channels", "size", "duration",
        "bit_depth", "bit_rate"
    ])

    # The sample width is read with the stdlib wave module.
    with wave.open(file) as audio:
        info["bit_depth"] = audio.getsampwidth() * 8

    with sf.SoundFile(file) as audio:
        data = audio.read()
        fileformat = audio.format

        info["filename"] = audio.name
        info["sampling_rate"] = audio.samplerate
        info["channels"] = audio.channels
        info["size"] = audio.frames
        info["duration"] = info["size"] / info["sampling_rate"]
        info["bit_rate"] = info["sampling_rate"] * info["bit_depth"] * info[
            "channels"]

    # Raised explicitly (same exception type and message as the former
    # ``assert``) so the check is not stripped when Python runs with -O.
    if fileformat != "WAV":
        raise AssertionError("File must be a WAV-file.")

    if info["channels"] == 2 and to_mono:
        info["channels"] = 1

        if mono_convertion_mode == "left_only":
            data = data[:, 0]
        elif mono_convertion_mode == "right_only":
            data = data[:, 1]
        elif mono_convertion_mode == "downmix":
            data = np.mean(data, axis=1)
        else:
            raise ValueError(
                f"Conversion mode \"{mono_convertion_mode}\" not specified.")

    if resample and info["sampling_rate"] != resampling_rate:
        data = nnresample.resample(
            s=data, up=resampling_rate, down=info["sampling_rate"])  # noqa
        # Keep the derived fields consistent with the resampled signal.
        info["sampling_rate"] = resampling_rate
        info["size"] = len(data)
        info["duration"] = info["size"] / info["sampling_rate"]
        info["bit_rate"] = info["sampling_rate"] * info["bit_depth"] * info[
            "channels"]

    return data, info
Beispiel #4
0
def sonify(curve,
           audio,
           sampling_rate,
           feature_rate,
           min_confidence=0.01,
           only_ticks=False,
           confidence=True,
           half_tempo=False,
           half_tempo_start=1):
    """Mix the peaks of the given curve as clicks with the given audio.

    :param curve: Curve to sonify (should be normalised to 0-1 range)
    :type curve: np.ndarray
    :param audio: audio to mix the sonified curve with
    :type audio: np.ndarray
    :param sampling_rate: sampling rate of the audio
    :type sampling_rate: int or float
    :param feature_rate: feature rate of the curve
    :type feature_rate: int or float
    :param min_confidence: Minimum (max-normalised) value a peak needs in
        order to be kept.
    :type min_confidence: float
    :param only_ticks: sonification will only contain the ticks and not the audio.
    :type only_ticks: bool
    :param confidence: Tick volume is determined by the value of the curve (curve should be normalised to 0-1 range)
    :type confidence: bool
    :param half_tempo: Keep only every other peak for half tempo.
    :type half_tempo: bool
    :param half_tempo_start: Index of the first peak that is kept when
        ``half_tempo`` is set. All peaks before this peak are removed.
    :type half_tempo_start: int
    :return: Two-column array: ``(audio, clicks)``, or ``(clicks, clicks)``
        when ``only_ticks`` is set.
    :rtype: np.ndarray
    """

    # A peak is a sample that rose from its predecessor and is not followed
    # by a further rise.
    pos = np.append(curve, curve[-1]) > np.insert(curve, 0, curve[0])
    neg = ~pos

    peaks = np.where(pos[:pos.shape[0] - 1] * neg[1:])[0]

    if half_tempo:
        peaks = peaks[half_tempo_start::2]

    values = curve[peaks].flatten()
    # Normalise to the largest peak. Skipped when there is no peak at all
    # (np.max would raise) or when the largest peak is zero (division would
    # yield NaN).
    if values.size > 0:
        largest = np.max(values)
        if largest != 0:
            values = values / largest

    # Remove small peaks
    keep = values >= min_confidence
    peaks = peaks[keep]
    values = values[keep]

    # Click waveform; it is resampled from 88200 Hz to sampling_rate below.
    click = np.array([
        0.0000, 0.0000, -0.0001, 0.0002, -0.0001, 0.0001, -0.0000, -0.0001,
        0.0001, 0.0938, 0.1861, 0.2755, 0.3606, 0.4400, 0.5125, 0.5769, 0.6324,
        0.6778, 0.7127, 0.7362, 0.7484, 0.7487, 0.7373, 0.7143, 0.6800, 0.6352,
        0.5803, 0.5164, 0.4442, 0.3654, 0.2803, 0.1915, 0.0989, 0.0054,
        -0.0885, -0.1810, -0.2704, -0.3560, -0.4356, -0.5086, -0.5734, -0.6295,
        -0.6755, -0.7108, -0.7354, -0.7479, -0.7491, -0.7382, -0.7159, -0.6823,
        -0.6380, -0.5837, -0.5202, -0.4485, -0.3700, -0.2853, -0.1965, -0.1043,
        -0.0107, 0.0832, 0.1758, 0.2655, 0.3511, 0.4314, 0.5045, 0.5702,
        0.6264, 0.6733, 0.7091, 0.7343, 0.7475, 0.7493, 0.7392, 0.7174, 0.6845,
        0.6408, 0.5871, 0.5240, 0.4529, 0.3744, 0.2905, 0.2014, 0.1098, 0.0159,
        -0.0779, -0.1704, -0.2606, -0.3464, -0.4269, -0.5007, -0.5665, -0.6235,
        -0.6709, -0.7073, -0.7332, -0.7469, -0.7497, -0.7399, -0.7191, -0.6867,
        -0.6434, -0.5905, -0.5278, -0.4571, -0.3792, -0.2952, -0.2068, -0.1148,
        -0.0215, 0.0726, 0.1653, 0.2556, 0.3416, 0.4225, 0.4966, 0.5631,
        0.6206, 0.6683, 0.7058, 0.7318, 0.7467, 0.7497, 0.7408, 0.7206, 0.6888,
        0.6463, 0.5937, 0.5316, 0.4613, 0.3838, 0.3002, 0.2119, 0.1202, 0.0268,
        -0.0673, -0.1600, -0.2506, -0.3368, -0.4182, -0.4926, -0.5595, -0.6176,
        -0.6658, -0.7039, -0.7307, -0.7461, -0.7499, -0.7416, -0.7220, -0.6909,
        -0.6488, -0.5971, -0.5352, -0.4656, -0.3883, -0.3051, -0.2171, -0.1254,
        -0.0322, 0.0620, 0.1548, 0.2454, 0.3322, 0.4135, 0.4887, 0.5558,
        0.6147, 0.6633, 0.7021, 0.7294, 0.7456, 0.7499, 0.7425, 0.7234, 0.6929,
        0.6517, 0.6000, 0.5392, 0.4696, 0.3930, 0.3099, 0.2220, 0.1308, 0.0374,
        -0.0566, -0.1497, -0.2403, -0.3275, -0.4090, -0.4846, -0.5523, -0.6114,
        -0.6610, -0.7001, -0.7283, -0.7449, -0.7499, -0.7432, -0.7248, -0.6949,
        -0.6544, -0.6032, -0.5429, -0.4739, -0.3974, -0.3149, -0.2271, -0.1362,
        -0.0426, 0.0513, 0.1443, 0.2355, 0.3224, 0.4048, 0.4804, 0.5486,
        0.6084, 0.6583, 0.6982, 0.7269, 0.7444, 0.7500, 0.7439, 0.7261, 0.6970,
        0.6569, 0.6064, 0.5466, 0.4778, 0.4022, 0.3194, 0.2324, 0.1413, 0.0479,
        -0.0457, -0.1393, -0.2302, -0.3177, -0.4002, -0.4762, -0.5451, -0.6051,
        -0.6559, -0.6962, -0.7256, -0.7437, -0.7500, -0.7445, -0.7276, -0.6988,
        -0.6595, -0.6095, -0.5501, -0.4822, -0.4064, -0.3244, -0.2374, -0.1464,
        -0.0536, 0.0407, 0.1338, 0.2252, 0.3128, 0.3957, 0.4722, 0.5413,
        0.6021, 0.6532, 0.6942, 0.7242, 0.7430, 0.7499, 0.7452, 0.7287, 0.7009,
        0.6619, 0.6128, 0.5537, 0.4862, 0.4109, 0.3292, 0.2424, 0.1517, 0.0587,
        -0.0353, -0.1286, -0.2201, -0.3079, -0.3911, -0.4680, -0.5377, -0.5988,
        -0.6506, -0.6921, -0.7229, -0.7421, -0.7499, -0.7457, -0.7300, -0.7027,
        -0.6645, -0.6157, -0.5574, -0.4902, -0.4155, -0.3340, -0.2475, -0.1570,
        -0.0640, 0.0298, 0.1235, 0.2148, 0.3031, 0.3865, 0.4639, 0.5338,
        0.5957, 0.6477, 0.6902, 0.7213, 0.7414, 0.7498, 0.7462, 0.7313, 0.7045,
        0.6670, 0.6187, 0.5609, 0.4943, 0.4198, 0.3389, 0.2525, 0.1622, 0.0693,
        -0.0245, -0.1181, -0.2098, -0.2982, -0.3820, -0.4597, -0.5301, -0.5923,
        -0.6452, -0.6879, -0.7199, -0.7405, -0.7496, -0.7469, -0.7323, -0.7065,
        -0.6693, -0.6218, -0.5644, -0.4984, -0.4241, -0.3438, -0.2574, -0.1675,
        -0.0747, 0.0194, 0.1126, 0.2049, 0.2932, 0.3773, 0.4554, 0.5263,
        0.5891, 0.6424, 0.6859, 0.7184, 0.7397, 0.7495, 0.7472, 0.7336, 0.7081,
        0.6717, 0.6249, 0.5677, 0.5025, 0.4284, 0.3485, 0.2624, 0.1727, 0.0800,
        -0.0139, -0.1076, -0.1995, -0.2884, -0.3727, -0.4511, -0.5226, -0.5857,
        -0.6397, -0.6836, -0.7168, -0.7389, -0.7490, -0.7478, -0.7346, -0.7099,
        -0.6742, -0.6276, -0.5716, -0.5061, -0.4331, -0.3530, -0.2676, -0.1778,
        -0.0853, 0.0086, 0.1022, 0.1945, 0.2834, 0.3681, 0.4469, 0.5187,
        0.5824, 0.6369, 0.6814, 0.7152, 0.7379, 0.7487, 0.7483, 0.7355, 0.7117,
        0.6764, 0.6306, 0.5749, 0.5101, 0.4155, 0.3219, 0.2317, 0.1463, 0.0680,
        -0.0022, -0.0631, -0.1134, -0.1532, -0.1817, -0.1991, -0.2060, -0.2026,
        -0.1902, -0.1699, -0.1427, -0.1106, -0.0748, -0.0375, 0.0000, -0.0001,
        0.0000, 0.0001, -0.0001, 0.0002, -0.0002, 0.0001, 0
    ])
    # Quadratic fade-out from 1 down to 1/len(click). Built from an integer
    # range so it always has exactly len(click) entries; a float-step arange
    # can come out one element short or long depending on rounding.
    fade = np.arange(len(click), 0, -1) / len(click)
    click = click * fade**2
    click = nnresample.resample(click, up=sampling_rate,
                                down=88200)  # noqa

    out = np.zeros_like(audio)

    for peak, value in zip(peaks, values):
        start = int(np.floor(peak / feature_rate * sampling_rate))
        stop = start + len(click)

        # Clicks that would run past the end of the audio are dropped.
        if stop <= len(out):
            if confidence:
                out[start:stop] = out[start:stop] + click * value
            else:
                out[start:stop] = out[start:stop] + click

    if only_ticks:
        return np.concatenate((out[..., None], out[..., None]), axis=1)
    else:
        return np.concatenate((audio[..., None], out[..., None]), axis=1)
Beispiel #5
0
def audio_to_novelty_curve(audio,
                           sampling_rate,
                           threshold=-74,
                           window_length=None,
                           stepsize=None,
                           log_compression=True,
                           compression_constant=1000,
                           resample_feature_rate=200):
    """Compute a band-wise novelty curve from the audio's STFT spectrogram.

    The spectrogram is normalised and floored at ``threshold``, split into
    five frequency bands, and each band is optionally log-compressed, passed
    through a smoothed differentiator, half-wave rectified and divided by a
    local normalisation curve. The band curves are averaged, optionally
    resampled, and a local average is subtracted.

    :param audio: Audio data.
    :type audio: np.ndarray
    :param sampling_rate: Sampling rate of the audio (Hz)
    :type sampling_rate: float or int
    :param threshold: Threshold for the normalization (dB)
    :type threshold: int
    :param window_length: Length of the STFT window. Defaults to
        ``1024 * sampling_rate / 22050``.
    :type window_length: float
    :param stepsize: Stepsize for the STFT. Defaults to
        ``512 * sampling_rate / 22050``.
    :type stepsize: float
    :param log_compression: Enable/disable log compression.
    :type log_compression: bool
    :param compression_constant: Constant for log compression
    :type compression_constant: int
    :param resample_feature_rate: Feature rate of the resulting novelty curve (resampled, independent of stepsize)
    :type resample_feature_rate: int
    :return: Novelty curve and its feature rate.
    :rtype: tuple[np.ndarray, float or int]
    """

    if window_length is None:
        window_length = 1024 * sampling_rate / 22050

    if stepsize is None:
        stepsize = 512 * sampling_rate / 22050

    stft_window = np.hanning(round2(window_length))

    # Compute spectrogram
    spec_data, feature_rate, _, _ = audio_to_spectrogram_via_STFT(
        audio, sampling_rate, stft_window=stft_window, stepsize=stepsize)

    # Normalize and floor at the threshold (converted from dB to linear).
    threshold = 10**(threshold / 20)
    spec_data = spec_data / np.amax(spec_data)
    spec_data = np.maximum(spec_data, threshold)

    # bandwise processing
    bands = np.array([[0, 500], [500, 1250], [1250, 3125], [3125, 7812.5],
                      [7812.5, math.floor(sampling_rate / 2)]])

    band_novelty_curves = np.zeros((len(bands), spec_data.shape[1]))

    # The filters below do not depend on the band, so they are built once.

    # Smoothed differentiator: an odd-length Hann window, negated on its
    # first half and zeroed at the centre.
    diff_len = 0.3
    diff_len = max(math.ceil(diff_len * sampling_rate / stepsize), 5)
    diff_len = 2 * round2(diff_len / 2) + 1
    half_diff = math.floor(diff_len / 2)

    diff_filter = np.hanning(diff_len) * np.concatenate(
        (-1 * np.ones(half_diff), np.array([0]), np.ones(half_diff)))
    diff_filter = diff_filter[..., None].T

    # Normalisation filter and its boundary-correction weights.
    norm_len = 5
    norm_len = max(math.ceil(norm_len * sampling_rate / stepsize), 3)
    half_norm = math.floor(norm_len / 2)
    norm_filter = np.hanning(norm_len)[..., None]
    norm_filter_sum = (
        (np.sum(norm_filter) - np.cumsum(norm_filter, axis=0)) /
        np.sum(norm_filter)).T

    for band, band_edges in enumerate(bands):

        # Band edges (Hz) -> spectrogram row indices, clamped to the data.
        bins = np.around(band_edges / (sampling_rate / window_length))

        bins[0] = max(0, bins[0])
        bins[1] = min(len(spec_data) - 1, bins[1])

        # Band novelty curve
        band_data = spec_data[int(bins[0]):int(bins[1]), :]

        if log_compression and compression_constant > 0:
            band_data = np.log(1 + band_data * compression_constant) / (
                np.log(1 + compression_constant))

        # Pad both ends by repeating the first / last column so the
        # differentiator has data at the boundaries.
        rm1 = np.repeat(band_data[:, :1], half_diff, axis=1)
        rm2 = np.repeat(band_data[:, -1:], half_diff, axis=1)
        hhh = np.concatenate((rm1, band_data, rm2), axis=1)

        band_diff = filter2(diff_filter, hhh)
        band_diff[band_diff < 0] = 0
        band_diff = band_diff[:, half_diff - 1:-1 - half_diff]

        # Normalize band
        norm_curve = filter2(norm_filter / np.sum(norm_filter),
                             np.sum(band_data, axis=0)[..., None]).T

        # Boundary correction
        norm_curve[:, 0:half_norm] = norm_curve[:, 0:half_norm] / np.fliplr(
            norm_filter_sum[:, 0:half_norm])
        norm_curve[:, -half_norm:] = (norm_curve[:, -half_norm:] /
                                      norm_filter_sum[:, 0:half_norm])

        band_diff /= norm_curve

        # Compute novelty curve of band
        band_novelty_curves[band, :] = np.sum(band_diff, axis=0)

    novelty_curve = np.mean(band_novelty_curves, axis=0)

    # Resample curve
    # NOTE(review): only this branch adds a trailing axis, so the curve is
    # 1-D when no resampling takes place -- confirm callers expect that.
    if resample_feature_rate > 0 and resample_feature_rate != feature_rate:
        novelty_curve = nnresample.resample(
            novelty_curve, resample_feature_rate, int(feature_rate))  # noqa
        novelty_curve = novelty_curve[..., None]
        feature_rate = resample_feature_rate

    # Average subtraction
    smooth_len = max(int(math.ceil(1.5 * sampling_rate / stepsize)), 3)
    smooth_filter = np.hanning(smooth_len)[..., None]

    local_average = filter2(smooth_filter / np.sum(smooth_filter),
                            novelty_curve)

    novelty_curve = novelty_curve - local_average
    novelty_curve[novelty_curve < 0] = 0

    return novelty_curve, feature_rate
Beispiel #6
0
 def resample(self, fs):
     """Resample ``self.data`` from ``self.fs`` to ``fs`` and store the new rate."""
     converted = nnresample.resample(self.data, fs, self.fs)
     self.data, self.fs = converted, fs
Beispiel #7
0
def resample(wav, old_sr, new_sr):
    """Return ``wav`` resampled from ``old_sr`` to ``new_sr`` via nnresample."""
    resampled = nnresample.resample(s=wav, up=new_sr, down=old_sr)
    return resampled