import numpy as np
import torch
from speechpy.feature import lmfe

# Dataset-style __getitem__ (excerpted from its class). audio_frames,
# mouth_frames, tokens2index, mix_audio and fractional_index are module-level
# helpers defined elsewhere in the original code.
def __getitem__(self, index):
    phn_seq, clip = self.phnm_seqs[index]

    # For multi-channel (mixed) samples, pick a second clip: either from the
    # same speaker or from anywhere in the dataset.
    if self.channels > 1:
        if self.audio == 'same':
            speaker = clip[:2]
            speaker_indices = self.speaker_to_indices[speaker]
            other_index = speaker_indices[np.random.randint(
                0, len(speaker_indices))]
        else:
            other_index = np.random.randint(0, len(self.phnm_seqs))
        other_phn_seq, other_clip = self.phnm_seqs[other_index]

    if self.audio:
        audio_1 = audio_frames[clip]
        if self.channels == 1:
            mfcc_feat = lmfe(audio_1, 16000, frame_length=0.025,
                             num_filters=80)
        else:
            audio_2 = audio_frames[other_clip]
            mfcc_feat = lmfe(mix_audio(audio_1, audio_2), 16000,
                             frame_length=0.025, num_filters=80)
        # Crop the features to the phoneme span and roughly normalize.
        mfcc_feat = torch.from_numpy(
            (fractional_index(mfcc_feat, phn_seq[0][0], phn_seq[-1][1]) - 10)
            / 2).type(torch.FloatTensor)
    else:
        mfcc_feat = None

    if self.lips:
        lips_1, beg_v, end_v = fractional_index(
            mouth_frames[clip], phn_seq[0][0], phn_seq[-1][1], indices=True)
        # Random horizontal flip for augmentation.
        if np.random.random_sample() < 0.5:
            lips_1 = np.flip(lips_1, axis=2).copy()
        lips_1 = (torch.from_numpy(lips_1).unsqueeze(0).type(
            torch.FloatTensor) - 128) / 64
        if self.channels > 1:
            if beg_v >= len(mouth_frames[other_clip]):
                # Second clip too short for this span: use one blank frame.
                lips_2 = np.zeros([1, lips_1.shape[2], lips_1.shape[3]])
            else:
                lips_2 = mouth_frames[other_clip][beg_v:end_v]
            if np.random.random_sample() < 0.5:
                lips_2 = np.flip(lips_2, axis=2).copy()
            lips_2 = (torch.from_numpy(lips_2).unsqueeze(0).type(
                torch.FloatTensor) - 128) / 64
        else:
            lips_2 = None
    else:
        lips_1, lips_2 = None, None

    # Target phoneme indices wrapped in <sos>/<eos> tokens.
    text_indexes = torch.LongTensor(
        [tokens2index['<sos>']]
        + [tokens2index[p[2]] for p in phn_seq]
        + [tokens2index['<eos>']])

    if self.channels > 1:
        clip = clip + other_clip
    clip = clip + (phn_seq,)  # append the phoneme sequence to the clip tuple

    return clip, mfcc_feat, lips_1, lips_2, text_indexes
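# mix_audio is used above but not defined in this excerpt. A minimal sketch of
# one plausible implementation (an assumption, not the original authors'
# code): trim both waveforms to the shorter length and average them so the
# mixture stays in the input range.
def mix_audio(audio_1, audio_2):
    n = min(len(audio_1), len(audio_2))
    a = np.asarray(audio_1[:n], dtype=np.float32)
    b = np.asarray(audio_2[:n], dtype=np.float32)
    return (a + b) / 2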
import scipy.io.wavfile as wav
from speechpy import feature, processing


def compute_fbank(file, debug=True):
    # data_config is an external configuration module supplying preemphasis,
    # window_size, hop_size, num_mels, apply_cmvn (and the LFR settings below).
    sr, signal = wav.read(file)
    if debug:
        print('signal shape: ', signal.shape)

    # Pre-emphasis.
    signal_preemphasized = processing.preemphasis(
        signal, cof=data_config.preemphasis)

    # Stacking frames.
    frames = processing.stack_frames(signal_preemphasized,
                                     sampling_frequency=sr,
                                     frame_length=data_config.window_size,
                                     frame_stride=data_config.hop_size,
                                     zero_padding=True)

    # Power spectrum: num_frames x fft_length (computed for debugging only).
    power_spectrum = processing.power_spectrum(frames, fft_points=512)
    if debug:
        print('power spectrum shape=', power_spectrum.shape)

    ############# Extract fbank features #############
    # Log Mel filterbank energies: num_frames x num_filters.
    log_fbank = feature.lmfe(signal_preemphasized,
                             sampling_frequency=sr,
                             frame_length=data_config.window_size,
                             frame_stride=data_config.hop_size,
                             num_filters=data_config.num_mels,
                             fft_length=512,
                             low_frequency=0,
                             high_frequency=None)

    if data_config.apply_cmvn:
        # Cepstral mean and variance normalization.
        log_fbank_cmvn = processing.cmvn(log_fbank,
                                         variance_normalization=True)
        if debug:
            print('fbank (mean + variance normalized) feature shape=',
                  log_fbank_cmvn.shape)
        log_fbank = log_fbank_cmvn  # num_frames x num_filters

    # Extracting derivative features: num_frames x num_filters x 3.
    log_fbank = feature.extract_derivative_feature(log_fbank)

    # Frame slicing and downsampling (disabled):
    # concat_mat = concat_frame(log_fbank)
    # log_fbank = subsampling(concat_mat)
    # log_fbank = build_LFR_features(log_fbank, data_config.LFR_m,
    #                                data_config.LFR_n)

    if debug:
        print('output feature shape=', log_fbank.shape)
    return log_fbank
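# Example call, assuming a 16 kHz mono file 'sample.wav' on disk (hypothetical
# file name) and an importable data_config module as described above:
if __name__ == '__main__':
    fbank = compute_fbank('sample.wav')
    # After derivative stacking: num_frames x data_config.num_mels x 3.
    print('fbank cube shape =', fbank.shape)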
import numpy as np
import scipy.io.wavfile as wav
import speechpy
from speechpy.feature import lmfe


def Compute_filterbank(audio_file, frame_length=0.025, frame_stride=0.01):
    fs, audio_data = wav.read(audio_file)
    # Log Mel filterbank energies: num_frames x num_filters.
    filterbank_energy = lmfe(audio_data, fs, frame_length, frame_stride,
                             num_filters=40, fft_length=512,
                             low_frequency=0, high_frequency=None)
    # filterbank_energy_inv = np.transpose(filterbank_energy)
    # cmvn lives in speechpy.processing, not speechpy.feature.processing.
    normalized = speechpy.processing.cmvn(filterbank_energy,
                                          variance_normalization=True)
    return np.transpose(normalized)  # num_filters x num_frames
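# Usage sketch ('utterance.wav' is a hypothetical input file):
#   feats = Compute_filterbank('utterance.wav')
#   print(feats.shape)  # (40, num_frames): filters-first after the transpose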
# signal and fs come from an earlier wav.read call in the same script.

############# Extract MFCC features #############
mfcc = feature.mfcc(signal,
                    sampling_frequency=fs,
                    frame_length=0.020,
                    frame_stride=0.01,
                    num_filters=40,
                    fft_length=512,
                    low_frequency=0,
                    high_frequency=None)

# Cepstral mean and variance normalization.
mfcc_cmvn = processing.cmvn(mfcc, variance_normalization=True)
print('mfcc (mean + variance normalized) feature shape=', mfcc_cmvn.shape)

# Extracting derivative features.
mfcc_feature_cube = feature.extract_derivative_feature(mfcc)
print('mfcc feature cube shape=', mfcc_feature_cube.shape)

############# Extract logenergy features #############
logenergy = feature.lmfe(signal,
                         sampling_frequency=fs,
                         frame_length=0.020,
                         frame_stride=0.01,
                         num_filters=40,
                         fft_length=512,
                         low_frequency=0,
                         high_frequency=None)
logenergy_feature_cube = feature.extract_derivative_feature(logenergy)
print('logenergy features=', logenergy.shape)
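# The "derivative feature" cubes above stack each coefficient with its first
# and second temporal derivatives. A rough numpy illustration of the idea
# (simple_deltas is a hypothetical helper; speechpy uses its own regression
# formula, so treat this as an approximation only):
import numpy as np

def simple_deltas(feat):
    """feat: num_frames x num_coeffs -> num_frames x num_coeffs x 3."""
    delta = np.gradient(feat, axis=0)    # first temporal derivative
    delta2 = np.gradient(delta, axis=0)  # second temporal derivative
    return np.stack([feat, delta, delta2], axis=-1)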
# Imports used by this fragment:
import time
import numpy as np
from pydub import AudioSegment
from scipy.io import wavfile
from speechpy.feature import lmfe, extract_derivative_feature

# Fragment of a corpus-preprocessing loop: count, start_time, files, subdir,
# filename and filename2 come from the enclosing directory walk.
if len(subdir) < 38:
    if count != 0:
        elapsed_time = time.time() - start_time
    curr_id = subdir[29:38]  # speaker id embedded at a fixed path offset
    # print(curr_id)
    count = count + 1
    start_time = time.time()
    for file in files:
        # Downmix to mono before feature extraction.
        sound = AudioSegment.from_wav(subdir + "/" + file)
        sound = sound.set_channels(1)
        sound.export("modified.wav", format="wav")
        sample_rate, samples = wavfile.read("modified.wav")
        # Log Mel filterbank energies plus derivative features.
        features = lmfe(samples, sample_rate, 0.025, 0.01, 40)
        features = extract_derivative_feature(features)
        timevar = 100
        if features.shape[0] >= timevar:
            no_cuts = int(features.shape[0] / timevar)
            for i in range(no_cuts):
                # Fixed-length cut of 100 frames.
                cut = features[i * timevar:(i + 1) * timevar, :, :]
                # print("cut: ", cut.shape)
                with open(filename2, "a") as myfile:
                    myfile.write(curr_id + "\n")
                with open(filename, "a") as myfile:
                    for data_slice in cut:
                        np.savetxt(myfile, data_slice, delimiter=',')