Example #1
0
    def lr_preprocess(self, x):
        """Build standardised mel mean/std features for the LR model.

        Side effect: on the first call (``self.raw_max_length`` unset) the
        module-level globals LR_HOP_DURATION, HOP_DURATION and
        AUDIO_SAMPLE_RATE are retuned from the longest raw clip.

        Args:
            x: list of 1-D raw audio sample sequences.

        Returns:
            2-D array with one row per clip — per-mel-bin mean concatenated
            with per-mel-bin std — transformed by a freshly fit
            StandardScaler.
        """
        # Module-level settings retuned once in the block below.
        global LR_HOP_DURATION
        global HOP_DURATION
        global AUDIO_SAMPLE_RATE
        if self.raw_max_length is None:
            self.raw_max_length = get_max_length(x)
            # Longer inputs scale the hop durations / sample rate up; the
            # max() keeps each at or above its BASE_* floor.
            # NOTE(review): 84000 is presumably a reference clip length in
            # samples — confirm against the feature-extraction config.
            scale = self.raw_max_length/84000
            LR_HOP_DURATION = max(BASE_LR_HOP_DURATION,BASE_LR_HOP_DURATION*scale)
            HOP_DURATION = max(BASE_HOP_DURATION,BASE_HOP_DURATION*scale/2)
            AUDIO_SAMPLE_RATE = max(BASE_AUDIO_SAMPLE_RATE,int(BASE_AUDIO_SAMPLE_RATE*scale/3))
            print('LR_HOP_DURATION---%s'%LR_HOP_DURATION)
            print('HOP_DURATION---%s'%HOP_DURATION)
            print('AUDIO_SAMPLE_RATE---%s'%AUDIO_SAMPLE_RATE)

        # Crop every clip to the configured maximum duration (in samples).
        x = [sample[0:MAX_AUDIO_DURATION * AUDIO_SAMPLE_RATE] for sample in x]
        x_mel = extract_melspectrogram_parallel(
            x, n_mels=30, use_power_db=True,lr=True)
        # x_contrast = extract_bandwidth_parallel(x)

        x_feas = []
        for i in range(len(x_mel)):
            # Collapse the time axis: mean and std per mel bin.
            mel = np.mean(x_mel[i], axis=0).reshape(-1)
            mel_std = np.std(x_mel[i], axis=0).reshape(-1)
            # contrast = np.mean(x_contrast[i], axis=0).reshape(-1)
            # contrast_std = np.std(x_contrast[i], axis=0).reshape(-1)
            # contrast, contrast_std
            x_feas.append(np.concatenate([mel, mel_std], axis=-1))
        x_feas = np.asarray(x_feas)

        # NOTE(review): scaler is re-fit on every call, so statistics are
        # re-estimated per dataset — confirm this is intended for train/test.
        scaler = StandardScaler()
        X = scaler.fit_transform(x_feas[:, :])
        return X
Example #2
0
    def nn_preprocess(self, x, n_mfcc=96, max_duration=5, is_mfcc=True):
        """Crop raw clips and convert them to padded MFCC or mel features.

        On the first call, fixes ``self.raw_max_length`` (floored at
        MAX_AUDIO_DURATION and capped at ``max_duration`` seconds of samples)
        and flips the 30s/CRNN flags for long clips on small datasets.

        Args:
            x: list of 1-D raw audio sample sequences.
            n_mfcc: number of MFCC coefficients when ``is_mfcc`` is true.
            max_duration: cap, in seconds, on the retained raw length.
            is_mfcc: choose MFCC features; otherwise log-power mel.

        Returns:
            Feature array padded to ``self.fea_max_length`` frames.
        """
        if self.raw_max_length is None:
            longest = get_max_length(x)
            # Long clips enable the 30s pipeline; small label sets with few
            # classes additionally prefer the CRNN model first.
            if longest > MIDDLE_DURATION * AUDIO_SAMPLE_RATE:
                self.need_30s = True
                if len(self._train_y) < 1000 and self._num_classes < 30:
                    self.crnn_first = True
            cap = max_duration * AUDIO_SAMPLE_RATE
            floor = MAX_AUDIO_DURATION * AUDIO_SAMPLE_RATE
            self.raw_max_length = max(floor, min(cap, longest))
        clipped = [sample[0:self.raw_max_length] for sample in x]

        # extract mfcc (or mel) features
        if is_mfcc:
            feats = extract_mfcc_parallel(clipped, n_mfcc=n_mfcc)
        else:
            feats = extract_melspectrogram_parallel(clipped,
                                                    n_mels=128,
                                                    use_power_db=True)
        if self.fea_max_length is None:
            self.fea_max_length = min(MAX_FRAME_NUM, get_max_length(feats))
        return pad_seq(feats, pad_len=self.fea_max_length)
Example #3
0
    def preprocess_data(self, x):
        """Turn raw audio into padded log-mel spectrograms with a channel axis.

        Args:
            x: list of 1-D raw audio sample sequences.

        Returns:
            4-D array (clips, frames, 128 mel bins, 1 channel) padded to
            ``self.max_length`` frames.
        """
        if IS_CUT_AUDIO:
            limit = MAX_AUDIO_DURATION * AUDIO_SAMPLE_RATE
            x = [clip[0:limit] for clip in x]

        mel = extract_melspectrogram_parallel(
            x, n_mels=128, use_power_db=True)
        if self.max_length is None:
            self.max_length = min(MAX_FRAME_NUM, get_max_length(mel))
        mel = pad_seq(mel, pad_len=self.max_length)
        # trailing channel dimension for convolutional models
        return mel[:, :, :, np.newaxis]
Example #4
0
 def preprocess_data(self, x):
     """Build stacked MFCC + log-mel features, padded and channel-expanded.

     Args:
         x: list of 1-D raw audio sample sequences.

     Returns:
         4-D array (clips, frames, 64 MFCC + 64 mel bins, 1 channel)
         padded to ``self.max_length`` frames.
     """
     if IS_CUT_AUDIO:
         limit = MAX_AUDIO_DURATION * AUDIO_SAMPLE_RATE
         x = [clip[0:limit] for clip in x]
     # extract both feature families from the same cropped clips
     mfcc_feats = extract_mfcc_parallel(x, n_mfcc=64)
     mel_feats = extract_melspectrogram_parallel(x, n_mels=64, use_power_db=True)
     if self.max_length is None:
         # frame budget is taken from the MFCC stream, capped at MAX_FRAME_NUM
         self.max_length = min(MAX_FRAME_NUM, get_max_length(mfcc_feats))
     stacked = np.concatenate(
         [pad_seq(mfcc_feats, self.max_length),
          pad_seq(mel_feats, self.max_length)],
         axis=-1)
     # trailing channel dimension for convolutional models
     return stacked[:, :, :, np.newaxis]
    def preprocess_data(self, x):
        """Summarise each clip as standardised per-mel-bin statistics.

        Each clip is cropped, converted to a 40-bin log-power mel
        spectrogram, then collapsed over time into a mean and a std per
        mel bin (80 values per clip).

        Args:
            x: list of 1-D raw audio sample sequences.

        Returns:
            2-D array, one standardised row per clip.
        """
        # cut down to the configured maximum duration (in samples)
        x = [sample[0:MAX_AUDIO_DURATION * AUDIO_SAMPLE_RATE] for sample in x]
        x_mel = extract_melspectrogram_parallel(x,
                                                n_mels=40,
                                                use_power_db=True)

        x_feas = []
        # iterate spectrograms directly instead of indexing by position
        for mel_spec in x_mel:
            mel = np.mean(mel_spec, axis=0).reshape(-1)
            mel_std = np.std(mel_spec, axis=0).reshape(-1)
            x_feas.append(np.concatenate([mel, mel_std], axis=-1))
        x_feas = np.asarray(x_feas)

        # NOTE(review): scaler is re-fit on every call, so statistics are
        # re-estimated per dataset — confirm this is intended for train/test.
        scaler = StandardScaler()
        return scaler.fit_transform(x_feas)
Example #6
0
    def preprocess_data(self, x):
        """Aggregate a bank of spectral descriptors into one row per clip.

        Nine descriptor families (MFCC, mel, chroma, contrast, flatness,
        centroid, bandwidth, rolloff, zero-crossing rate) are each averaged
        over time, flattened, and concatenated, then standardised.

        Args:
            x: list of 1-D raw audio sample sequences.

        Returns:
            2-D array, one standardised feature row per clip.
        """
        if IS_CUT_AUDIO:
            limit = MAX_AUDIO_DURATION * AUDIO_SAMPLE_RATE
            x = [clip[0:limit] for clip in x]
        # per-clip time series for each descriptor family, in the
        # concatenation order used below
        banks = [
            extract_mfcc_parallel(x, n_mfcc=20),
            extract_melspectrogram_parallel(x, n_mels=20, use_power_db=True),
            extract_chroma_stft_parallel(x, n_chroma=12),
            extract_spectral_contrast_parallel(x, n_bands=6),
            extract_spectral_flatness_parallel(x),
            extract_spectral_centroid_parallel(x),
            extract_bandwidth_parallel(x),
            extract_spectral_rolloff_parallel(x),
            extract_zero_crossing_rate_parallel(x),
        ]

        rows = []
        for per_clip in zip(*banks):
            # collapse time with the mean, then flatten each descriptor
            summaries = [np.mean(series, axis=0).reshape(-1)
                         for series in per_clip]
            rows.append(np.concatenate(summaries, axis=-1))
        table = np.asarray(rows)

        scaler = StandardScaler()
        return scaler.fit_transform(table[:, :])
    def lr_preprocess(self, x):
        """Build standardised mel mean/std features for the LR model.

        Args:
            x: list of 1-D raw audio sample sequences.

        Returns:
            2-D array with one row per clip — per-mel-bin mean concatenated
            with per-mel-bin std — transformed by a freshly fit
            StandardScaler.
        """
        # crop clips to the configured maximum duration (in samples)
        x = [sample[0:MAX_AUDIO_DURATION * AUDIO_SAMPLE_RATE] for sample in x]
        x_mel = extract_melspectrogram_parallel(x, n_mels=30, use_power_db=True)

        x_feas = []
        # iterate spectrograms directly instead of indexing by position
        for mel_spec in x_mel:
            mel = np.mean(mel_spec, axis=0).reshape(-1)
            mel_std = np.std(mel_spec, axis=0).reshape(-1)
            x_feas.append(np.concatenate([mel, mel_std], axis=-1))
        x_feas = np.asarray(x_feas)

        # NOTE(review): scaler is re-fit on every call, so statistics are
        # re-estimated per dataset — confirm this is intended for train/test.
        scaler = StandardScaler()
        return scaler.fit_transform(x_feas)
Example #8
0
    def nn_preprocess(self, x, n_mfcc=96, max_duration=5, is_mfcc=True):
        """Crop raw clips and convert them to padded MFCC or mel features.

        Side effect: on the first call (``self.raw_max_length`` unset) the
        module-level globals LR_HOP_DURATION, HOP_DURATION and
        AUDIO_SAMPLE_RATE are retuned from the longest raw clip, and the
        30s/CRNN pipeline flags may be set.

        Args:
            x: list of 1-D raw audio sample sequences.
            n_mfcc: number of MFCC coefficients when ``is_mfcc`` is true.
            max_duration: cap, in seconds, on the retained raw length.
            is_mfcc: choose MFCC features; otherwise log-power mel.

        Returns:
            Feature array padded to ``self.fea_max_length`` frames.
        """
        # Module-level settings retuned once in the block below.
        global LR_HOP_DURATION
        global HOP_DURATION
        global AUDIO_SAMPLE_RATE
        if self.raw_max_length is None:
            self.raw_max_length = get_max_length(x)
            # Longer inputs scale the hop durations / sample rate up; the
            # max() keeps each at or above its BASE_* floor.
            # NOTE(review): 84000 is presumably a reference clip length in
            # samples — confirm against the feature-extraction config.
            scale = self.raw_max_length/84000
            LR_HOP_DURATION = max(BASE_LR_HOP_DURATION,BASE_LR_HOP_DURATION*scale)
            HOP_DURATION = max(BASE_HOP_DURATION,BASE_HOP_DURATION*scale/2)
            AUDIO_SAMPLE_RATE = max(BASE_AUDIO_SAMPLE_RATE,int(BASE_AUDIO_SAMPLE_RATE*scale/3))
            print('LR_HOP_DURATION---%s'%LR_HOP_DURATION)
            print('HOP_DURATION---%s'%HOP_DURATION)
            print('AUDIO_SAMPLE_RATE---%s'%AUDIO_SAMPLE_RATE)

            # Uses the freshly updated AUDIO_SAMPLE_RATE above.
            if self.raw_max_length > (MIDDLE_DURATION * AUDIO_SAMPLE_RATE):
                self.need_30s = True
                if len(self._train_y) < 1000 and self._num_classes < 30:
                    self.crnn_first = True
            # Cap at max_duration seconds, then floor at MAX_AUDIO_DURATION.
            self.raw_max_length = min(
                max_duration * AUDIO_SAMPLE_RATE,
                self.raw_max_length)
            self.raw_max_length = max(
                MAX_AUDIO_DURATION *
                AUDIO_SAMPLE_RATE,
                self.raw_max_length)
        # Crop every clip to the clamped raw length.
        x = [sample[0:self.raw_max_length] for sample in x]

        if is_mfcc:
            # extract mfcc
            x = extract_mfcc_parallel(x, n_mfcc=n_mfcc)
        else:
            x = extract_melspectrogram_parallel(
                x, n_mels=128, use_power_db=True)
        if self.fea_max_length is None:
            self.fea_max_length = get_max_length(x)
            self.fea_max_length = min(MAX_FRAME_NUM, self.fea_max_length)
        x = pad_seq(x, pad_len=self.fea_max_length)

        return x