Ejemplo n.º 1
0
    def __getitem__(self, index):
        """Build one sample for the entry stored at ``index``.

        The entry ``self.phnm_seqs[index]`` is unpacked into a phoneme
        sequence and a clip key. Depending on ``self.channels``,
        ``self.audio`` and ``self.lips`` the sample contains audio features,
        lip frames for this clip, and lip frames for a second, randomly
        drawn clip.

        NOTE(review): ``audio_frames``, ``mouth_frames``, ``tokens2index``,
        ``lmfe``, ``mix_audio`` and ``fractional_index`` are not defined in
        this method; presumably they are module-level names -- confirm.

        Args:
            index: Position of the entry in ``self.phnm_seqs``.

        Returns:
            A 5-tuple ``(clip, mfcc_feat, lips_1, lips_2, text_indexes)``.
            ``mfcc_feat`` is ``None`` when ``self.audio`` is falsy,
            ``lips_1`` and ``lips_2`` are ``None`` when ``self.lips`` is
            falsy, and ``lips_2`` is also ``None`` when ``self.channels``
            is not greater than 1.
        """
        phn_seq, clip = self.phnm_seqs[index]
        if self.channels > 1:
            # A second entry is drawn to accompany this one.
            if self.audio == 'same':
                # Restrict the draw to the indices registered under the key
                # formed by the first two items of ``clip``.
                # presumably those two items identify the speaker -- confirm
                speaker = clip[:2]
                speaker_indices = self.speaker_to_indices[speaker]
                other_index = speaker_indices[np.random.randint(
                    0, len(speaker_indices))]
            else:
                # Otherwise draw uniformly from every entry (this may pick
                # ``index`` itself).
                other_index = np.random.randint(0, len(self.phnm_seqs))
            # ``other_phn_seq`` is unpacked but not used below.
            other_phn_seq, other_clip = self.phnm_seqs[other_index]

        # NOTE(review): ``self.audio`` is compared with 'same' above and used
        # as a plain truth value here -- confirm which values it may hold.
        if self.audio:
            audio_1 = audio_frames[clip]
            if self.channels == 1:
                mfcc_feat = lmfe(audio_1,
                                 16000,
                                 frame_length=0.025,
                                 num_filters=80)
            else:
                # Features are computed on the mix of both clips' audio.
                audio_2 = audio_frames[other_clip]
                mfcc_feat = lmfe(mix_audio(audio_1, audio_2),
                                 16000,
                                 frame_length=0.025,
                                 num_filters=80)
            # Select the span between the first phoneme's phn_seq[0][0] and
            # the last phoneme's phn_seq[-1][1], then rescale as (x - 10) / 2.
            # presumably those two fields are start and end times -- confirm
            mfcc_feat = torch.from_numpy(
                (fractional_index(mfcc_feat, phn_seq[0][0], phn_seq[-1][1]) -
                 10) / 2).type(torch.FloatTensor)
        else:
            mfcc_feat = None

        if self.lips:
            # With indices=True the helper also returns the begin/end frame
            # indices, which are reused to slice the second clip below.
            lips_1, beg_v, end_v = fractional_index(mouth_frames[clip],
                                                    phn_seq[0][0],
                                                    phn_seq[-1][1],
                                                    indices=True)
            # Flip along axis 2 with probability 0.5.
            # presumably a horizontal mirror used as augmentation -- confirm
            if np.random.random_sample() < 0.5:
                lips_1 = np.flip(lips_1, axis=2).copy()
            # Add a leading dimension and rescale as (x - 128) / 64.
            lips_1 = (torch.from_numpy(lips_1).unsqueeze(0).type(
                torch.FloatTensor) - 128) / 64
            if self.channels > 1:
                if beg_v >= len(mouth_frames[other_clip]):
                    # The second clip has no frame at ``beg_v``: substitute a
                    # single all-zero frame with lips_1's last two dimensions.
                    lips_2 = np.zeros([1, lips_1.shape[2], lips_1.shape[3]])
                else:
                    lips_2 = mouth_frames[other_clip][beg_v:end_v]
                # The flip decision for the second clip is drawn separately.
                if np.random.random_sample() < 0.5:
                    lips_2 = np.flip(lips_2, axis=2).copy()
                lips_2 = (torch.from_numpy(lips_2).unsqueeze(0).type(
                    torch.FloatTensor) - 128) / 64
            else:
                lips_2 = None
        else:
            lips_1, lips_2 = None, None
        # Target sequence: <sos>, one token index per phoneme (field 2 of each
        # phn_seq item), then <eos>.
        text_indexes = torch.LongTensor([tokens2index['<sos>']] +
                                        [tokens2index[p[2]] for p in phn_seq] +
                                        [tokens2index['<eos>']])
        if self.channels > 1:
            clip = clip + other_clip
        # NOTE(review): the parentheses do not create a tuple, so this is
        # ``clip + phn_seq`` and requires both operands to support ``+``
        # together -- confirm the intended types.
        clip = clip + (phn_seq)
        return clip, mfcc_feat, lips_1, lips_2, text_indexes
Ejemplo n.º 2
0
def compute_fbank(file, debug=True):
    """Compute log mel-filterbank features plus derivatives for a WAV file.

    The signal is read with ``wav.read``, pre-emphasised, converted to log
    mel-filterbank energies with ``feature.lmfe``, optionally mean/variance
    normalised, and finally expanded with
    ``feature.extract_derivative_feature``.

    Settings are read from ``data_config``: ``preemphasis``,
    ``window_size``, ``hop_size``, ``num_mels`` and ``apply_cmvn``.

    Args:
        file: Path (or file object) handed to ``wav.read``.
        debug: When true, print the shapes of the intermediate results.
            The framed power spectrum is only computed in this mode because
            it is used for nothing but its shape print.

    Returns:
        The output of ``feature.extract_derivative_feature`` applied to the
        (optionally normalised) log filterbank energies.
        presumably num_frames x num_filters x 3 -- confirm against speechpy
    """
    sr, signal = wav.read(file)
    if debug:
        print('signal shape: ', signal.shape)

    # Pre-emphasizing.
    signal_preemphasized = processing.preemphasis(signal,
                                                  cof=data_config.preemphasis)

    if debug:
        # Diagnostic only: the stacked frames and their power spectrum are
        # not part of the returned features, so skip this work otherwise.
        frames = processing.stack_frames(signal_preemphasized,
                                         sampling_frequency=sr,
                                         frame_length=data_config.window_size,
                                         frame_stride=data_config.hop_size,
                                         zero_padding=True)
        power_spectrum = processing.power_spectrum(
            frames, fft_points=512)  # num_frames x fft_length
        print('power spectrum shape=', power_spectrum.shape)

    ############# Extract fbanks features #############
    log_fbank = feature.lmfe(signal_preemphasized,
                             sampling_frequency=sr,
                             frame_length=data_config.window_size,
                             frame_stride=data_config.hop_size,
                             num_filters=data_config.num_mels,
                             fft_length=512,
                             low_frequency=0,
                             high_frequency=None)  # num_frames x num_filters

    if data_config.apply_cmvn:
        # Cepstral mean variance normalization.
        log_fbank = processing.cmvn(log_fbank, variance_normalization=True)
        if debug:
            print('fbank(mean + variance normalized) feature shape=',
                  log_fbank.shape)

    # Extracting derivative features
    log_fbank = feature.extract_derivative_feature(log_fbank)

    # NOTE: frame concatenation / subsampling (LFR features built from
    # data_config.LFR_m and data_config.LFR_n) is currently disabled, so the
    # shape printed below is that of the derivative feature cube.
    if debug:
        print('concat & subsample shape=', log_fbank.shape)

    return log_fbank
def Compute_filterbank(audio_file, frame_length=0.025, frame_stride=0.01):
    """Return normalised log filterbank energies of a WAV file, transposed.

    Args:
        audio_file: Path (or file object) handed to ``wav.read``.
        frame_length: Frame length forwarded to ``lmfe``.
        frame_stride: Frame stride forwarded to ``lmfe``.

    Returns:
        The mean/variance-normalised ``lmfe`` output after ``np.transpose``.
        presumably (num_filters, num_frames) -- confirm against lmfe
    """
    sample_rate, samples = wav.read(audio_file)

    # Fixed analysis settings: 40 filters on a 512-point FFT.
    log_energies = lmfe(samples,
                        sample_rate,
                        frame_length,
                        frame_stride,
                        num_filters=40,
                        fft_length=512,
                        low_frequency=0,
                        high_frequency=None)

    # Normalise first, then swap the axes of the result.
    return np.transpose(
        speechpy.feature.processing.cmvn(log_energies,
                                         variance_normalization=True))
Ejemplo n.º 4
0
# Framing and filterbank settings shared by the MFCC and log-energy
# extraction below.
_framing_options = dict(sampling_frequency=fs,
                        frame_length=0.020,
                        frame_stride=0.01,
                        num_filters=40,
                        fft_length=512,
                        low_frequency=0,
                        high_frequency=None)

############# Extract MFCC features #############
mfcc = feature.mfcc(signal, **_framing_options)

# Mean/variance-normalized copy of the MFCCs (only its shape is reported).
mfcc_cmvn = processing.cmvn(mfcc, variance_normalization=True)
print('mfcc(mean + variance normalized) feature shape=', mfcc_cmvn.shape)

# The derivative cube is built from the un-normalized MFCCs.
mfcc_feature_cube = feature.extract_derivative_feature(mfcc)
print('mfcc feature cube shape=', mfcc_feature_cube.shape)

############# Extract logenergy features #############
logenergy = feature.lmfe(signal, **_framing_options)
logenergy_feature_cube = feature.extract_derivative_feature(logenergy)
# Note: the shape printed is that of the plain log energies, not the cube.
print('logenergy features=', logenergy.shape)
Ejemplo n.º 5
0
        if subdir.__len__() < 38:
            if count != 0:
                elapsed_time = time.time() - start_time
            curr_id = subdir[29:38]
            # print(curr_id)
            count = count + 1
            start_time = time.time()

    for file in files:

        sound = AudioSegment.from_wav(subdir + "/" + file)
        sound = sound.set_channels(1)
        sound.export("modified.wav", format="wav")
        sample_rate, samples = wavfile.read("modified.wav")

        features = lmfe(samples, sample_rate, 0.025, 0.01, 40)
        features = extract_derivative_feature(features)

        timevar = 100
        if features.shape[0] >= timevar:
            no_cuts = int(features.shape[0] / timevar)
            for i in range(no_cuts):
                cut = features[i * timevar:(i * timevar) + timevar:, :, :]
                # print("cut: ", cut.shape)
                with open(filename2, "a") as myfile:
                    myfile.write(curr_id + "\n")
                with open(filename, "a") as myfile:
                    for data_slice in cut:
                        np.savetxt(myfile,
                                   data_slice,
                                   delimiter=',',