def get_ffts(audio_file):
    """Computes the FFT of each frame of a WAVE file.

    Splits the WAVE file into frames of equal temporal length and performs
    an FFT on each. Trailing samples that do not fill a whole frame are
    ignored.

    Args:
        audio_file: A WAVE file.

    Returns:
        A 2-D complex128 numpy array with one row per frame; row i holds the
        FFT of the i-th frame of the WAVE file.
    """
    # Read the file. NOTE(review): `sample` is presumably the sample rate and
    # `data` a 1-D sample array -- confirm against utils.read_wave_from_file.
    (sample, data) = utils.read_wave_from_file(audio_file)

    # Frame length in samples and number of whole frames. Both are forced to
    # int: true division (or a fractional COMP_FRAME_SIZE) would otherwise
    # yield floats, which numpy rejects as array shapes and slice bounds.
    frame_len = int(sample * COMP_FRAME_SIZE)
    total_frames = int((data.size / sample) / COMP_FRAME_SIZE)

    # Allocate space for the FFT decompositions of each frame of sound data
    fft_out = numpy.empty((total_frames, frame_len), dtype=numpy.complex128)

    # Every row of fft_out is overwritten below, so the uninitialised
    # contents of the freshly allocated array never leak to the caller.
    for frame_index in range(total_frames):
        start = frame_index * frame_len
        fft_out[frame_index] = numpy.fft.fft(data[start:start + frame_len])

    return fft_out
def get_mfcc(path):
    """Finds the MFCCs of the frames of a WAVE file.

    The audio is cut into frames of COMP_FRAME_SIZE * sample samples whose
    start positions advance by FRAME_OVERLAP_FACTOR frame lengths per step.
    Each frame is multiplied by a Hamming window, transformed with a
    256-point real FFT, projected onto a mel triangular filterbank,
    log-scaled and passed through a type-2 orthonormal DCT. Only the leading
    SIGNIFICANT_MFCC fraction of the resulting coefficients is kept.

    Args:
        path: The path to a WAVE file.

    Returns:
        A numpy array with one row of retained MFCCs per frame of the WAVE
        file (an empty array if no complete frame is available).
    """
    # Read the file, and determine its length in frames.
    # NOTE(review): `sample` is presumably the sample rate and `data` a 1-D
    # sample array -- confirm against utils.read_wave_from_file.
    (sample, data) = utils.read_wave_from_file(path)
    total_frames = (data.size / sample) / COMP_FRAME_SIZE

    # Frame length in samples. It must be an int: it sizes the window and is
    # used for slice bounds, and numpy rejects floats for both.
    step = int(COMP_FRAME_SIZE * sample)
    window = hamming(step)

    mfcc_out = []

    # The filterbank depends only on the sample rate and the FFT length,
    # both constant for the whole file, so it is built once on first use.
    filterbank = None

    # frame_index is measured in frame lengths and may be fractional when
    # frames overlap.
    frame_index = 0
    while frame_index + (1 - FRAME_OVERLAP_FACTOR) < total_frames:
        # Obtain the frame_index-th frame from the data
        start = int(frame_index * step)
        frame = data[start:start + step]
        if len(frame) < step:
            # A partial trailing frame cannot be multiplied by the
            # full-length window; every later frame would be short as well.
            break

        # Generate the FFT of the frame windowed by the hamming window.
        # n=256 fixes the transform length regardless of the frame length.
        frame_fft = numpy.fft.rfft(frame * window, n=256)
        # Replace exact zeros so the log10 below never sees a zero bin
        frame_fft[frame_fft == 0] = 0.000003

        # Compute the mel triangular filterbank on first use
        if filterbank is None:
            filterbank = triangular_filters(sample, len(frame_fft)).T
            filterbank[filterbank == 0] = 0.00003

        # The magnitude spectrum of the frame (not squared)
        power_spectrum = numpy.abs(frame_fft)
        # Filtered by the mel filterbank
        mel_power_spectrum = numpy.log10(numpy.dot(power_spectrum, filterbank))
        # With the discrete cosine transform to find the cepstrum
        cepstrum = dct(mel_power_spectrum, type=2, norm="ortho", axis=-1)

        mfcc_out.append(cepstrum[: int(len(cepstrum) * SIGNIFICANT_MFCC)])
        frame_index = frame_index + FRAME_OVERLAP_FACTOR

    return numpy.array(mfcc_out)