def make_features(df_x):
    """Build a feature dict for one block (pd.Series) of the signal.

    :param df_x: pd.Series with the raw acoustic signal for one segment
    :return: dict mapping feature name -> scalar value
    """
    feat = {}

    # Spectral density (bands picked by hand) - similar in spirit to
    # voice analysis front-ends used with neural networks.
    welch = signal.welch(df_x)[1]
    for num in [2, 3, 28, 30]:
        feat[f"welch_{num}"] = welch[num]

    # Rolling-median features - conceptually like pooling, but a
    # "MedianPooling" instead of max/average.
    mean_abs = (df_x - df_x.mean()).abs()
    feat["mean_abs_med"] = mean_abs.median()
    roll_std = df_x.rolling(375).std().dropna()
    feat["std_roll_med_375"] = roll_std.median()
    half = len(roll_std) // 2
    feat["std_roll_half1"] = roll_std.iloc[:half].median()
    feat["std_roll_half2"] = roll_std.iloc[-half:].median()

    # Deep rolling-quantile features - a kind of "QuantilePooling".
    feat["q05_roll_std_25"] = df_x.rolling(25).std().dropna().quantile(0.05)
    feat["q05_roll_std_375"] = df_x.rolling(375).std().dropna().quantile(0.05)
    feat["q05_roll_std_1500"] = df_x.rolling(1500).std().dropna().quantile(0.05)
    feat["q05_roll_std_1000"] = df_x.rolling(1000).std().dropna().quantile(0.05)
    feat["q01_roll_mean_1500"] = df_x.rolling(1500).mean().dropna().quantile(0.01)
    feat["q99_roll_mean_1500"] = df_x.rolling(1500).mean().dropna().quantile(0.99)
    feat["ave10"] = stats.trim_mean(df_x, 0.1)

    # Pre Main
    feat["num_peaks_10"] = feature_calculators.number_peaks(df_x, 10)
    feat["percentile_roll_std_5"] = np.percentile(
        df_x.rolling(10000).std().dropna().values, 5)
    feat["afc_50"] = feature_calculators.autocorrelation(df_x, 50)

    # Full Welch spectrum of the clipped signal (separate local name so
    # the band-limited `welch` above is not silently clobbered).
    welch_clipped = signal.welch(df_x.clip(-11, 20))[1]
    for num in range(33):
        feat[f"welch_clipped_{num}"] = welch_clipped[num]
    return feat
def npeaks_100(arr):
    """Number of peaks of support 100 in *arr* (tsfresh ``number_peaks``)."""
    peak_support = 100
    return feature_calculators.number_peaks(arr, n=peak_support)
def create_features(seg_id, seg, X, st, end):
    """
    Create features including FFT features, statistical features and time
    series features, written in place into row *seg_id* of *X*.

    :param seg_id: the ID for a sample
    :param seg: a signal segment (DataFrame with an 'acoustic_data' column)
    :param X: train set features before creating these features
    :param st: the start index of the signal segment
    :param end: the end index of the signal segment
    :return: train set features after creating these features
    """
    try:
        # test set won't create these features because its seg_id is a string
        X.loc[seg_id, 'seg_id'] = np.int32(seg_id)
        X.loc[seg_id, 'seg_start'] = np.int32(st)
        X.loc[seg_id, 'seg_end'] = np.int32(end)
    except ValueError:
        pass

    # De-meaned raw signal.
    xc = pd.Series(seg['acoustic_data'].values)
    xcdm = xc - np.mean(xc)

    # Low-pass at 18 kHz before the FFT.
    b, a = des_bw_filter_lp(cutoff=18000)
    xcz = sg.lfilter(b, a, xcdm)

    zc = np.fft.fft(xcz)
    zc = zc[:MAX_FREQ]

    # FFT transform values
    realFFT = np.real(zc)
    imagFFT = np.imag(zc)

    freq_bands = [x for x in range(0, MAX_FREQ, FREQ_BAND)]
    magFFT = np.sqrt(realFFT ** 2 + imagFFT ** 2)
    # arctan(imag/real) produces NaN/inf where real == 0; patched below.
    phzFFT = np.arctan(imagFFT / realFFT)
    phzFFT[phzFFT == -np.inf] = -np.pi / 2.0
    phzFFT[phzFFT == np.inf] = np.pi / 2.0
    phzFFT = np.nan_to_num(phzFFT)

    # Per-band quantiles / moments of FFT magnitude and phase.
    for freq in freq_bands:
        X.loc[seg_id, 'FFT_Mag_01q%d' % freq] = np.quantile(magFFT[freq: freq + FREQ_BAND], 0.01)
        X.loc[seg_id, 'FFT_Mag_10q%d' % freq] = np.quantile(magFFT[freq: freq + FREQ_BAND], 0.1)
        X.loc[seg_id, 'FFT_Mag_90q%d' % freq] = np.quantile(magFFT[freq: freq + FREQ_BAND], 0.9)
        X.loc[seg_id, 'FFT_Mag_99q%d' % freq] = np.quantile(magFFT[freq: freq + FREQ_BAND], 0.99)
        X.loc[seg_id, 'FFT_Mag_mean%d' % freq] = np.mean(magFFT[freq: freq + FREQ_BAND])
        X.loc[seg_id, 'FFT_Mag_std%d' % freq] = np.std(magFFT[freq: freq + FREQ_BAND])
        X.loc[seg_id, 'FFT_Mag_max%d' % freq] = np.max(magFFT[freq: freq + FREQ_BAND])
        X.loc[seg_id, 'FFT_Phz_mean%d' % freq] = np.mean(phzFFT[freq: freq + FREQ_BAND])
        X.loc[seg_id, 'FFT_Phz_std%d' % freq] = np.std(phzFFT[freq: freq + FREQ_BAND])

    # Global statistics of the real/imaginary FFT parts.
    X.loc[seg_id, 'FFT_Rmean'] = realFFT.mean()
    X.loc[seg_id, 'FFT_Rstd'] = realFFT.std()
    X.loc[seg_id, 'FFT_Rmax'] = realFFT.max()
    X.loc[seg_id, 'FFT_Rmin'] = realFFT.min()
    X.loc[seg_id, 'FFT_Imean'] = imagFFT.mean()
    X.loc[seg_id, 'FFT_Istd'] = imagFFT.std()
    X.loc[seg_id, 'FFT_Imax'] = imagFFT.max()
    X.loc[seg_id, 'FFT_Imin'] = imagFFT.min()

    X.loc[seg_id, 'FFT_Rmean_first_6000'] = realFFT[:6000].mean()
    X.loc[seg_id, 'FFT_Rstd__first_6000'] = realFFT[:6000].std()
    X.loc[seg_id, 'FFT_Rmax_first_6000'] = realFFT[:6000].max()
    X.loc[seg_id, 'FFT_Rmin_first_6000'] = realFFT[:6000].min()
    X.loc[seg_id, 'FFT_Rmean_first_18000'] = realFFT[:18000].mean()
    X.loc[seg_id, 'FFT_Rstd_first_18000'] = realFFT[:18000].std()
    X.loc[seg_id, 'FFT_Rmax_first_18000'] = realFFT[:18000].max()
    X.loc[seg_id, 'FFT_Rmin_first_18000'] = realFFT[:18000].min()

    del xcz
    del zc

    # Split the de-meaned signal into 2.5 kHz-wide Butterworth bands
    # (low-pass, seven band-passes, high-pass).
    b, a = des_bw_filter_lp(cutoff=2500)
    xc0 = sg.lfilter(b, a, xcdm)
    b, a = des_bw_filter_bp(low=2500, high=5000)
    xc1 = sg.lfilter(b, a, xcdm)
    b, a = des_bw_filter_bp(low=5000, high=7500)
    xc2 = sg.lfilter(b, a, xcdm)
    b, a = des_bw_filter_bp(low=7500, high=10000)
    xc3 = sg.lfilter(b, a, xcdm)
    b, a = des_bw_filter_bp(low=10000, high=12500)
    xc4 = sg.lfilter(b, a, xcdm)
    b, a = des_bw_filter_bp(low=12500, high=15000)
    xc5 = sg.lfilter(b, a, xcdm)
    b, a = des_bw_filter_bp(low=15000, high=17500)
    xc6 = sg.lfilter(b, a, xcdm)
    b, a = des_bw_filter_bp(low=17500, high=20000)
    xc7 = sg.lfilter(b, a, xcdm)
    b, a = des_bw_filter_hp(cutoff=20000)
    xc8 = sg.lfilter(b, a, xcdm)

    # Raw signal plus each filtered band; the same feature set is
    # computed for every entry, suffixed with the band index i.
    sigs = [xc, pd.Series(xc0), pd.Series(xc1), pd.Series(xc2), pd.Series(xc3),
            pd.Series(xc4), pd.Series(xc5), pd.Series(xc6), pd.Series(xc7), pd.Series(xc8)]

    for i, sig in enumerate(sigs):
        X.loc[seg_id, 'mean_%d' % i] = sig.mean()
        X.loc[seg_id, 'std_%d' % i] = sig.std()
        X.loc[seg_id, 'max_%d' % i] = sig.max()
        X.loc[seg_id, 'min_%d' % i] = sig.min()
        X.loc[seg_id, 'mean_change_abs_%d' % i] = np.mean(np.diff(sig))
        X.loc[seg_id, 'mean_change_rate_%d' % i] = calc_mean_change_rate(sig)
        X.loc[seg_id, 'abs_max_%d' % i] = np.abs(sig).max()

        # Statistics on the first/last 50k and 10k samples.
        X.loc[seg_id, 'std_first_50000_%d' % i] = sig[:50000].std()
        X.loc[seg_id, 'std_last_50000_%d' % i] = sig[-50000:].std()
        X.loc[seg_id, 'std_first_10000_%d' % i] = sig[:10000].std()
        X.loc[seg_id, 'std_last_10000_%d' % i] = sig[-10000:].std()
        X.loc[seg_id, 'avg_first_50000_%d' % i] = sig[:50000].mean()
        X.loc[seg_id, 'avg_last_50000_%d' % i] = sig[-50000:].mean()
        X.loc[seg_id, 'avg_first_10000_%d' % i] = sig[:10000].mean()
        X.loc[seg_id, 'avg_last_10000_%d' % i] = sig[-10000:].mean()
        X.loc[seg_id, 'min_first_50000_%d' % i] = sig[:50000].min()
        X.loc[seg_id, 'min_last_50000_%d' % i] = sig[-50000:].min()
        X.loc[seg_id, 'min_first_10000_%d' % i] = sig[:10000].min()
        X.loc[seg_id, 'min_last_10000_%d' % i] = sig[-10000:].min()
        X.loc[seg_id, 'max_first_50000_%d' % i] = sig[:50000].max()
        X.loc[seg_id, 'max_last_50000_%d' % i] = sig[-50000:].max()
        X.loc[seg_id, 'max_first_10000_%d' % i] = sig[:10000].max()
        X.loc[seg_id, 'max_last_10000_%d' % i] = sig[-10000:].max()

        X.loc[seg_id, 'max_to_min_%d' % i] = sig.max() / np.abs(sig.min())
        X.loc[seg_id, 'max_to_min_diff_%d' % i] = sig.max() - np.abs(sig.min())
        X.loc[seg_id, 'count_big_%d' % i] = len(sig[np.abs(sig) > 500])
        X.loc[seg_id, 'mean_change_rate_first_50000_%d' % i] = calc_mean_change_rate(sig[:50000])
        X.loc[seg_id, 'mean_change_rate_last_50000_%d' % i] = calc_mean_change_rate(sig[-50000:])
        X.loc[seg_id, 'mean_change_rate_first_10000_%d' % i] = calc_mean_change_rate(sig[:10000])
        X.loc[seg_id, 'mean_change_rate_last_10000_%d' % i] = calc_mean_change_rate(sig[-10000:])

        # Quantiles of the signal and of its absolute value.
        X.loc[seg_id, 'q95_%d' % i] = np.quantile(sig, 0.95)
        X.loc[seg_id, 'q99_%d' % i] = np.quantile(sig, 0.99)
        X.loc[seg_id, 'q05_%d' % i] = np.quantile(sig, 0.05)
        X.loc[seg_id, 'q01_%d' % i] = np.quantile(sig, 0.01)
        X.loc[seg_id, 'abs_q95_%d' % i] = np.quantile(np.abs(sig), 0.95)
        X.loc[seg_id, 'abs_q99_%d' % i] = np.quantile(np.abs(sig), 0.99)
        X.loc[seg_id, 'abs_q05_%d' % i] = np.quantile(np.abs(sig), 0.05)
        X.loc[seg_id, 'abs_q01_%d' % i] = np.quantile(np.abs(sig), 0.01)

        X.loc[seg_id, 'trend_%d' % i] = add_trend_feature(sig)
        X.loc[seg_id, 'abs_trend_%d' % i] = add_trend_feature(sig, abs_values=True)
        X.loc[seg_id, 'abs_mean_%d' % i] = np.abs(sig).mean()
        X.loc[seg_id, 'abs_std_%d' % i] = np.abs(sig).std()
        X.loc[seg_id, 'mad_%d' % i] = sig.mad()
        X.loc[seg_id, 'kurt_%d' % i] = sig.kurtosis()
        X.loc[seg_id, 'skew_%d' % i] = sig.skew()
        X.loc[seg_id, 'med_%d' % i] = sig.median()

        X.loc[seg_id, 'Hilbert_mean_%d' % i] = np.abs(hilbert(sig)).mean()
        # NOTE(review): computed from xc, not sig, and the key has no %d
        # suffix - loop-invariant and overwritten on every iteration.
        # Presumably intended to use sig; confirm before changing.
        X.loc[seg_id, 'Hann_window_mean'] = (convolve(xc, hann(150), mode='same') / sum(hann(150))).mean()
        # STA/LTA ratios at several short/long window pairs.
        X.loc[seg_id, 'classic_sta_lta1_mean_%d' % i] = classic_sta_lta(sig, 500, 10000).mean()
        X.loc[seg_id, 'classic_sta_lta2_mean_%d' % i] = classic_sta_lta(sig, 5000, 100000).mean()
        X.loc[seg_id, 'classic_sta_lta3_mean_%d' % i] = classic_sta_lta(sig, 3333, 6666).mean()
        X.loc[seg_id, 'classic_sta_lta4_mean_%d' % i] = classic_sta_lta(sig, 10000, 25000).mean()

        X.loc[seg_id, 'Moving_average_700_mean_%d' % i] = sig.rolling(window=700).mean().mean(skipna=True)
        X.loc[seg_id, 'Moving_average_1500_mean_%d' % i] = sig.rolling(window=1500).mean().mean(skipna=True)
        X.loc[seg_id, 'Moving_average_3000_mean_%d' % i] = sig.rolling(window=3000).mean().mean(skipna=True)
        X.loc[seg_id, 'Moving_average_6000_mean_%d' % i] = sig.rolling(window=6000).mean().mean(skipna=True)
        ewma = pd.Series.ewm
        X.loc[seg_id, 'exp_Moving_average_300_mean_%d' % i] = ewma(sig, span=300).mean().mean(skipna=True)
        X.loc[seg_id, 'exp_Moving_average_3000_mean_%d' % i] = ewma(sig, span=3000).mean().mean(skipna=True)
        X.loc[seg_id, 'exp_Moving_average_30000_mean_%d' % i] = ewma(sig, span=30000).mean().mean(skipna=True)

        # Bollinger-band style bounds: moving average +/- 3 rolling stds.
        no_of_std = 3
        X.loc[seg_id, 'MA_700MA_std_mean_%d' % i] = sig.rolling(window=700).std().mean()
        X.loc[seg_id, 'MA_700MA_BB_high_mean_%d' % i] = (
                X.loc[seg_id, 'Moving_average_700_mean_%d' % i] + no_of_std * X.loc[
            seg_id, 'MA_700MA_std_mean_%d' % i]).mean()
        X.loc[seg_id, 'MA_700MA_BB_low_mean_%d' % i] = (
                X.loc[seg_id, 'Moving_average_700_mean_%d' % i] - no_of_std * X.loc[
            seg_id, 'MA_700MA_std_mean_%d' % i]).mean()
        X.loc[seg_id, 'MA_400MA_std_mean_%d' % i] = sig.rolling(window=400).std().mean()
        # NOTE(review): both MA_400MA_BB bounds are built from the
        # 700-window moving average; looks like a copy-paste of the 700
        # block (the sibling create_features below uses the 400 mean) -
        # confirm before fixing, as it changes feature values.
        X.loc[seg_id, 'MA_400MA_BB_high_mean_%d' % i] = (
                X.loc[seg_id, 'Moving_average_700_mean_%d' % i] + no_of_std * X.loc[
            seg_id, 'MA_400MA_std_mean_%d' % i]).mean()
        X.loc[seg_id, 'MA_400MA_BB_low_mean_%d' % i] = (
                X.loc[seg_id, 'Moving_average_700_mean_%d' % i] - no_of_std * X.loc[
            seg_id, 'MA_400MA_std_mean_%d' % i]).mean()
        X.loc[seg_id, 'MA_1000MA_std_mean_%d' % i] = sig.rolling(window=1000).std().mean()

        X.loc[seg_id, 'iqr_%d' % i] = np.subtract(*np.percentile(sig, [75, 25]))
        X.loc[seg_id, 'q999_%d' % i] = np.quantile(sig, 0.999)
        X.loc[seg_id, 'q001_%d' % i] = np.quantile(sig, 0.001)
        X.loc[seg_id, 'ave10_%d' % i] = stats.trim_mean(sig, 0.1)

        # tsfresh time-series features.
        X.loc[seg_id, 'num_peaks_10_%d' % i] = feature_calculators.number_peaks(sig, 10)
        X.loc[seg_id, 'cid_ce_1_%d' % i] = feature_calculators.cid_ce(sig, 1)  # time series complexity
        X.loc[seg_id, 'count_1000_0_%d' % i] = feature_calculators.range_count(sig, -1000, 0)
        X.loc[seg_id, 'binned_entropy_5_%d' % i] = feature_calculators.binned_entropy(sig, 5)
        X.loc[seg_id, 'binned_entropy_15_%d' % i] = feature_calculators.binned_entropy(sig, 15)

    # sliding window is a kind of filter, so this code is out of the cycle of band pass
    for windows in [10, 100, 1000]:
        x_roll_std = xc.rolling(windows).std().dropna()
        x_roll_mean = xc.rolling(windows).mean().dropna()

        X.loc[seg_id, 'ave_roll_std_' + str(windows)] = x_roll_std.mean()
        X.loc[seg_id, 'std_roll_std_' + str(windows)] = x_roll_std.std()
        X.loc[seg_id, 'max_roll_std_' + str(windows)] = x_roll_std.max()
        X.loc[seg_id, 'min_roll_std_' + str(windows)] = x_roll_std.min()
        X.loc[seg_id, 'q01_roll_std_' + str(windows)] = np.quantile(x_roll_std, 0.01)
        X.loc[seg_id, 'q05_roll_std_' + str(windows)] = np.quantile(x_roll_std, 0.05)
        X.loc[seg_id, 'q95_roll_std_' + str(windows)] = np.quantile(x_roll_std, 0.95)
        X.loc[seg_id, 'q99_roll_std_' + str(windows)] = np.quantile(x_roll_std, 0.99)
        X.loc[seg_id, 'av_change_abs_roll_std_' + str(windows)] = np.mean(np.diff(x_roll_std))
        X.loc[seg_id, 'av_change_rate_roll_std_' + str(windows)] = calc_mean_change_rate(x_roll_std)
        X.loc[seg_id, 'abs_max_roll_std_' + str(windows)] = np.abs(x_roll_std).max()
        X.loc[seg_id, 'ave_roll_mean_' + str(windows)] = x_roll_mean.mean()
        X.loc[seg_id, 'std_roll_mean_' + str(windows)] = x_roll_mean.std()
        X.loc[seg_id, 'max_roll_mean_' + str(windows)] = x_roll_mean.max()
        X.loc[seg_id, 'min_roll_mean_' + str(windows)] = x_roll_mean.min()
        X.loc[seg_id, 'q01_roll_mean_' + str(windows)] = np.quantile(x_roll_mean, 0.01)
        X.loc[seg_id, 'q05_roll_mean_' + str(windows)] = np.quantile(x_roll_mean, 0.05)
        X.loc[seg_id, 'q95_roll_mean_' + str(windows)] = np.quantile(x_roll_mean, 0.95)
        X.loc[seg_id, 'q99_roll_mean_' + str(windows)] = np.quantile(x_roll_mean, 0.99)
        X.loc[seg_id, 'av_change_abs_roll_mean_' + str(windows)] = np.mean(np.diff(x_roll_mean))
        X.loc[seg_id, 'av_change_rate_roll_mean_' + str(windows)] = calc_mean_change_rate(x_roll_mean)
        X.loc[seg_id, 'abs_max_roll_mean_' + str(windows)] = np.abs(x_roll_mean).max()

    return X
def create_features(seg, ):
    """
    Build FFT, statistical and time-series features for one segment.

    :param seg: pd.Series with the raw acoustic signal for one segment
    :return: dict mapping feature name -> scalar value
    """
    data_row = {}

    # Low-pass the segment before the FFT.
    xcz = des_filter(seg, high=CUTOFF)

    zc = np.fft.fft(xcz)
    zc = zc[:MAX_FREQ]

    # FFT transform values
    realFFT = np.real(zc)
    imagFFT = np.imag(zc)

    freq_bands = list(range(0, MAX_FREQ, FREQ_STEP))
    magFFT = np.abs(zc)
    phzFFT = np.angle(zc)
    phzFFT[phzFFT == -np.inf] = -np.pi / 2.0
    phzFFT[phzFFT == np.inf] = np.pi / 2.0
    phzFFT = np.nan_to_num(phzFFT)

    # Per-band quantiles / moments of FFT magnitude and phase.
    for freq in freq_bands:
        data_row['FFT_Mag_01q%d' % freq] = np.quantile(magFFT[freq: freq + FREQ_STEP], 0.01)
        data_row['FFT_Mag_10q%d' % freq] = np.quantile(magFFT[freq: freq + FREQ_STEP], 0.1)
        data_row['FFT_Mag_90q%d' % freq] = np.quantile(magFFT[freq: freq + FREQ_STEP], 0.9)
        data_row['FFT_Mag_99q%d' % freq] = np.quantile(magFFT[freq: freq + FREQ_STEP], 0.99)
        data_row['FFT_Mag_mean%d' % freq] = np.mean(magFFT[freq: freq + FREQ_STEP])
        data_row['FFT_Mag_std%d' % freq] = np.std(magFFT[freq: freq + FREQ_STEP])
        data_row['FFT_Mag_max%d' % freq] = np.max(magFFT[freq: freq + FREQ_STEP])
        data_row['FFT_Phz_mean%d' % freq] = np.mean(phzFFT[freq: freq + FREQ_STEP])
        data_row['FFT_Phz_std%d' % freq] = np.std(phzFFT[freq: freq + FREQ_STEP])

    # Global statistics of the real/imaginary FFT parts.
    data_row['FFT_Rmean'] = realFFT.mean()
    data_row['FFT_Rstd'] = realFFT.std()
    data_row['FFT_Rmax'] = realFFT.max()
    data_row['FFT_Rmin'] = realFFT.min()
    data_row['FFT_Imean'] = imagFFT.mean()
    data_row['FFT_Istd'] = imagFFT.std()
    data_row['FFT_Imax'] = imagFFT.max()
    data_row['FFT_Imin'] = imagFFT.min()

    data_row['FFT_Rmean_first_6000'] = realFFT[:6000].mean()
    data_row['FFT_Rstd__first_6000'] = realFFT[:6000].std()
    data_row['FFT_Rmax_first_6000'] = realFFT[:6000].max()
    data_row['FFT_Rmin_first_6000'] = realFFT[:6000].min()
    data_row['FFT_Rmean_first_18000'] = realFFT[:18000].mean()
    data_row['FFT_Rstd_first_18000'] = realFFT[:18000].std()
    data_row['FFT_Rmax_first_18000'] = realFFT[:18000].max()
    data_row['FFT_Rmin_first_18000'] = realFFT[:18000].min()

    del xcz
    del zc
    gc.collect()

    # Raw segment plus one band-filtered copy per FREQ_STEP band
    # (low-pass for the first band, high-pass for the last).
    sigs = [seg]
    for freq in range(0, MAX_FREQ + FREQ_STEP, FREQ_STEP):
        if freq == 0:
            xc_ = des_filter(seg, high=FREQ_STEP)
        elif freq == MAX_FREQ:
            xc_ = des_filter(seg, low=freq)
        else:
            xc_ = des_filter(seg, low=freq, high=freq + FREQ_STEP)
        sigs.append(pd.Series(xc_))

    for i, sig in enumerate(sigs):
        data_row['mean_%d' % i] = sig.mean()
        data_row['std_%d' % i] = sig.std()
        data_row['max_%d' % i] = sig.max()
        data_row['min_%d' % i] = sig.min()
        data_row['mean_change_abs_%d' % i] = np.mean(np.diff(sig))
        # NOTE(review): this is the mean of the *indices* of nonzero
        # relative changes (a quirk inherited from the original kernels),
        # not a mean rate - kept as-is for feature compatibility.
        data_row['mean_change_rate_%d' % i] = np.mean(np.nonzero((np.diff(sig) / sig[:-1]))[0])
        data_row['abs_max_%d' % i] = np.abs(sig).max()
        data_row['abs_min_%d' % i] = np.abs(sig).min()

        # Statistics on the first/last 50k and 10k samples.
        data_row['std_first_50000_%d' % i] = sig[:50000].std()
        data_row['std_last_50000_%d' % i] = sig[-50000:].std()
        data_row['std_first_10000_%d' % i] = sig[:10000].std()
        data_row['std_last_10000_%d' % i] = sig[-10000:].std()
        data_row['avg_first_50000_%d' % i] = sig[:50000].mean()
        data_row['avg_last_50000_%d' % i] = sig[-50000:].mean()
        data_row['avg_first_10000_%d' % i] = sig[:10000].mean()
        data_row['avg_last_10000_%d' % i] = sig[-10000:].mean()
        data_row['min_first_50000_%d' % i] = sig[:50000].min()
        data_row['min_last_50000_%d' % i] = sig[-50000:].min()
        data_row['min_first_10000_%d' % i] = sig[:10000].min()
        data_row['min_last_10000_%d' % i] = sig[-10000:].min()
        data_row['max_first_50000_%d' % i] = sig[:50000].max()
        data_row['max_last_50000_%d' % i] = sig[-50000:].max()
        data_row['max_first_10000_%d' % i] = sig[:10000].max()
        data_row['max_last_10000_%d' % i] = sig[-10000:].max()

        data_row['max_to_min_%d' % i] = sig.max() / np.abs(sig.min())
        data_row['max_to_min_diff_%d' % i] = sig.max() - np.abs(sig.min())
        data_row['count_big_%d' % i] = len(sig[np.abs(sig) > 500])
        data_row['sum_%d' % i] = sig.sum()
        data_row['mean_change_rate_first_50000_%d' % i] = np.mean(
            np.nonzero((np.diff(sig[:50000]) / sig[:50000][:-1]))[0])
        data_row['mean_change_rate_last_50000_%d' % i] = np.mean(
            np.nonzero((np.diff(sig[-50000:]) / sig[-50000:][:-1]))[0])
        data_row['mean_change_rate_first_10000_%d' % i] = np.mean(
            np.nonzero((np.diff(sig[:10000]) / sig[:10000][:-1]))[0])
        data_row['mean_change_rate_last_10000_%d' % i] = np.mean(
            np.nonzero((np.diff(sig[-10000:]) / sig[-10000:][:-1]))[0])

        # Quantiles of the signal and of its absolute value.
        data_row['q95_%d' % i] = np.quantile(sig, 0.95)
        data_row['q99_%d' % i] = np.quantile(sig, 0.99)
        data_row['q05_%d' % i] = np.quantile(sig, 0.05)
        data_row['q01_%d' % i] = np.quantile(sig, 0.01)
        data_row['abs_q95_%d' % i] = np.quantile(np.abs(sig), 0.95)
        data_row['abs_q99_%d' % i] = np.quantile(np.abs(sig), 0.99)
        data_row['abs_q05_%d' % i] = np.quantile(np.abs(sig), 0.05)
        data_row['abs_q01_%d' % i] = np.quantile(np.abs(sig), 0.01)

        data_row['trend_%d' % i] = add_trend_feature(sig)
        data_row['abs_trend_%d' % i] = add_trend_feature(sig, abs_values=True)
        data_row['abs_mean_%d' % i] = np.abs(sig).mean()
        data_row['abs_std_%d' % i] = np.abs(sig).std()
        data_row['mad_%d' % i] = sig.mad()
        data_row['kurt_%d' % i] = sig.kurtosis()
        data_row['skew_%d' % i] = sig.skew()
        data_row['med_%d' % i] = sig.median()

        data_row['Hilbert_mean_%d' % i] = np.abs(hilbert(sig)).mean()
        # NOTE(review): computed from seg, not sig, with a suffix-less key -
        # loop-invariant and overwritten every iteration; confirm intent.
        data_row['Hann_window_mean'] = (convolve(seg, hann(150), mode='same') / sum(hann(150))).mean()
        # STA/LTA ratios at several short/long window pairs.
        data_row['classic_sta_lta1_mean_%d' % i] = classic_sta_lta(sig, 500, 10000).mean()
        data_row['classic_sta_lta2_mean_%d' % i] = classic_sta_lta(sig, 5000, 100000).mean()
        data_row['classic_sta_lta3_mean_%d' % i] = classic_sta_lta(sig, 3333, 6666).mean()
        data_row['classic_sta_lta4_mean_%d' % i] = classic_sta_lta(sig, 10000, 25000).mean()

        data_row['Moving_average_400_mean_%d' % i] = sig.rolling(window=400).mean().mean(skipna=True)
        data_row['Moving_average_700_mean_%d' % i] = sig.rolling(window=700).mean().mean(skipna=True)
        data_row['Moving_average_1500_mean_%d' % i] = sig.rolling(window=1500).mean().mean(skipna=True)
        data_row['Moving_average_3000_mean_%d' % i] = sig.rolling(window=3000).mean().mean(skipna=True)
        data_row['Moving_average_6000_mean_%d' % i] = sig.rolling(window=6000).mean().mean(skipna=True)
        ewma = pd.Series.ewm
        data_row['exp_Moving_average_300_mean_%d' % i] = ewma(sig, span=300).mean().mean(skipna=True)
        data_row['exp_Moving_average_3000_mean_%d' % i] = ewma(sig, span=3000).mean().mean(skipna=True)
        # NOTE(review): key says 30000 but span=6000 - possible typo;
        # kept as-is for feature compatibility.
        data_row['exp_Moving_average_30000_mean_%d' % i] = ewma(sig, span=6000).mean().mean(skipna=True)

        # Bollinger-band style bounds: moving average +/- 2 rolling stds.
        no_of_std = 2
        data_row['MA_700MA_std_mean_%d' % i] = sig.rolling(window=700).std().mean(skipna=True)
        data_row['MA_700MA_BB_high_mean_%d' % i] = (
                data_row['Moving_average_700_mean_%d' % i] + no_of_std * data_row['MA_700MA_std_mean_%d' % i]).mean()
        data_row['MA_700MA_BB_low_mean_%d' % i] = (
                data_row['Moving_average_700_mean_%d' % i] - no_of_std * data_row['MA_700MA_std_mean_%d' % i]).mean()
        data_row['MA_400MA_std_mean_%d' % i] = sig.rolling(window=400).std().mean(skipna=True)
        data_row['MA_400MA_BB_high_mean_%d' % i] = (
                data_row['Moving_average_400_mean_%d' % i] + no_of_std * data_row['MA_400MA_std_mean_%d' % i]).mean()
        data_row['MA_400MA_BB_low_mean_%d' % i] = (
                data_row['Moving_average_400_mean_%d' % i] - no_of_std * data_row['MA_400MA_std_mean_%d' % i]).mean()

        data_row['iqr0_%d' % i] = np.subtract(*np.percentile(sig, [75, 25]))
        data_row['q999_%d' % i] = np.quantile(sig, 0.999)
        data_row['q001_%d' % i] = np.quantile(sig, 0.001)
        data_row['ave10_%d' % i] = stats.trim_mean(sig, 0.1)

        # tsfresh time-series features.
        data_row['peak10_num_%d' % i] = feature_calculators.number_peaks(sig, 10)
        data_row['num_cross_0_%d' % i] = feature_calculators.number_crossing_m(sig, 0)
        data_row['autocorrelation_%d' % i] = feature_calculators.autocorrelation(sig, 5)
        # data_row['spkt_welch_density_%d' % i] = list(feature_calculators.spkt_welch_density(x, [{'coeff': 50}]))[0][1]
        data_row['ratio_value_number_%d' % i] = feature_calculators.ratio_value_number_to_time_series_length(sig)

    # Rolling-window features on the raw segment (outside the band loop).
    for windows in [50, 200, 1000]:
        x_roll_std = seg.rolling(windows).std().dropna().values
        x_roll_mean = seg.rolling(windows).mean().dropna().values

        data_row['ave_roll_std_' + str(windows)] = x_roll_std.mean()
        data_row['std_roll_std_' + str(windows)] = x_roll_std.std()
        data_row['max_roll_std_' + str(windows)] = x_roll_std.max()
        data_row['min_roll_std_' + str(windows)] = x_roll_std.min()
        data_row['q01_roll_std_' + str(windows)] = np.quantile(x_roll_std, 0.01)
        data_row['q05_roll_std_' + str(windows)] = np.quantile(x_roll_std, 0.05)
        data_row['q95_roll_std_' + str(windows)] = np.quantile(x_roll_std, 0.95)
        data_row['q99_roll_std_' + str(windows)] = np.quantile(x_roll_std, 0.99)
        data_row['av_change_abs_roll_std_' + str(windows)] = np.mean(np.diff(x_roll_std))
        data_row['av_change_rate_roll_std_' + str(windows)] = np.mean(
            np.nonzero((np.diff(x_roll_std) / x_roll_std[:-1]))[0])
        data_row['abs_max_roll_std_' + str(windows)] = np.abs(x_roll_std).max()
        data_row['ave_roll_mean_' + str(windows)] = x_roll_mean.mean()
        data_row['std_roll_mean_' + str(windows)] = x_roll_mean.std()
        data_row['max_roll_mean_' + str(windows)] = x_roll_mean.max()
        data_row['min_roll_mean_' + str(windows)] = x_roll_mean.min()
        data_row['q01_roll_mean_' + str(windows)] = np.quantile(x_roll_mean, 0.01)
        data_row['q05_roll_mean_' + str(windows)] = np.quantile(x_roll_mean, 0.05)
        data_row['q95_roll_mean_' + str(windows)] = np.quantile(x_roll_mean, 0.95)
        data_row['q99_roll_mean_' + str(windows)] = np.quantile(x_roll_mean, 0.99)
        data_row['av_change_abs_roll_mean_' + str(windows)] = np.mean(np.diff(x_roll_mean))
        data_row['av_change_rate_roll_mean_' + str(windows)] = np.mean(
            np.nonzero((np.diff(x_roll_mean) / x_roll_mean[:-1]))[0])
        data_row['abs_max_roll_mean_' + str(windows)] = np.abs(x_roll_mean).max()
        data_row['num_peak10_rolling_' + str(windows)] = feature_calculators.number_peaks(x_roll_mean, 10)
        data_row['num_cross0_rolling_' + str(windows)] = feature_calculators.number_crossing_m(x_roll_mean, 0)
        data_row['autocorrelation_rolling_' + str(windows)] = feature_calculators.autocorrelation(x_roll_mean, 5)
        # data_row['spkt_welch_density_rolling_' + str(windows)] = list(feature_calculators.spkt_welch_density(x_roll_mean, [{'coeff': 50}]))[0][1]
        data_row['ratio_value_number_rolling_' + str(windows)] = feature_calculators.ratio_value_number_to_time_series_length(x_roll_mean)
        data_row['classic_sta_lta_rolling_' + str(windows)] = classic_sta_lta(x_roll_mean, 500, 10000).mean()

    return data_row
def create_features2(seg, ):
    """
    Extended variant of ``create_features``: same FFT head (plus band
    min/max of magnitude and phase) and a larger per-signal feature set,
    computed over the raw segment, band-filtered copies, and rolling /
    exponentially-weighted mean and std series.

    :param seg: pd.Series with the raw acoustic signal for one segment
    :return: dict mapping feature name -> scalar value
    """
    data_row = {}

    # Low-pass the segment before the FFT.
    xcz = des_filter(seg, high=CUTOFF)

    zc = np.fft.fft(xcz)
    zc = zc[:MAX_FREQ]

    # FFT transform values
    realFFT = np.real(zc)
    imagFFT = np.imag(zc)

    freq_bands = list(range(0, MAX_FREQ, FREQ_STEP))
    magFFT = np.abs(zc)
    phzFFT = np.angle(zc)
    phzFFT[phzFFT == -np.inf] = -np.pi / 2.0
    phzFFT[phzFFT == np.inf] = np.pi / 2.0
    phzFFT = np.nan_to_num(phzFFT)

    # Per-band quantiles / moments of FFT magnitude and phase.
    for freq in freq_bands:
        data_row['FFT_Mag_01q%d' % freq] = np.quantile(magFFT[freq: freq + FREQ_STEP], 0.01)
        data_row['FFT_Mag_10q%d' % freq] = np.quantile(magFFT[freq: freq + FREQ_STEP], 0.1)
        data_row['FFT_Mag_90q%d' % freq] = np.quantile(magFFT[freq: freq + FREQ_STEP], 0.9)
        data_row['FFT_Mag_99q%d' % freq] = np.quantile(magFFT[freq: freq + FREQ_STEP], 0.99)
        data_row['FFT_Mag_mean%d' % freq] = np.mean(magFFT[freq: freq + FREQ_STEP])
        data_row['FFT_Mag_std%d' % freq] = np.std(magFFT[freq: freq + FREQ_STEP])
        data_row['FFT_Mag_max%d' % freq] = np.max(magFFT[freq: freq + FREQ_STEP])
        data_row['FFT_Mag_min%d' % freq] = np.min(magFFT[freq: freq + FREQ_STEP])
        data_row['FFT_Phz_mean%d' % freq] = np.mean(phzFFT[freq: freq + FREQ_STEP])
        data_row['FFT_Phz_std%d' % freq] = np.std(phzFFT[freq: freq + FREQ_STEP])
        data_row['FFT_Phz_max%d' % freq] = np.max(phzFFT[freq: freq + FREQ_STEP])
        data_row['FFT_Phz_min%d' % freq] = np.min(phzFFT[freq: freq + FREQ_STEP])

    # Global statistics of the real/imaginary FFT parts.
    data_row['FFT_Rmean'] = realFFT.mean()
    data_row['FFT_Rstd'] = realFFT.std()
    data_row['FFT_Rmax'] = realFFT.max()
    data_row['FFT_Rmin'] = realFFT.min()
    data_row['FFT_Imean'] = imagFFT.mean()
    data_row['FFT_Istd'] = imagFFT.std()
    data_row['FFT_Imax'] = imagFFT.max()
    data_row['FFT_Imin'] = imagFFT.min()

    data_row['FFT_Rmean_first_6000'] = realFFT[:6000].mean()
    data_row['FFT_Rstd__first_6000'] = realFFT[:6000].std()
    data_row['FFT_Rmax_first_6000'] = realFFT[:6000].max()
    data_row['FFT_Rmin_first_6000'] = realFFT[:6000].min()
    data_row['FFT_Rmean_first_18000'] = realFFT[:18000].mean()
    data_row['FFT_Rstd_first_18000'] = realFFT[:18000].std()
    data_row['FFT_Rmax_first_18000'] = realFFT[:18000].max()
    data_row['FFT_Rmin_first_18000'] = realFFT[:18000].min()

    del xcz
    del zc
    # gc.collect()

    # Raw segment, band-filtered copies, rolling mean/std series and
    # exponentially-weighted mean/std series all get the same feature set.
    sigs = [seg]
    for freq in range(0, MAX_FREQ + FREQ_STEP, FREQ_STEP):
        if freq == 0:
            xc_ = des_filter(seg, high=FREQ_STEP)
        elif freq == MAX_FREQ:
            xc_ = des_filter(seg, low=freq)
        else:
            xc_ = des_filter(seg, low=freq, high=freq + FREQ_STEP)
        sigs.append(pd.Series(xc_))
    for window in [50, 200, 1000]:
        roll_mean = seg.rolling(window).mean().dropna()
        roll_std = seg.rolling(window).std().dropna()
        sigs.append(pd.Series(roll_mean))
        sigs.append(pd.Series(roll_std))
    for span in [30, 300, 3000]:
        exp_mean = seg.ewm(span).mean().dropna()
        exp_std = seg.ewm(span).std().dropna()
        sigs.append(pd.Series(exp_mean))
        sigs.append(pd.Series(exp_std))

    for i, sig in enumerate(sigs):
        data_row['mean_%d' % i] = sig.mean()
        data_row['std_%d' % i] = sig.std()
        data_row['max_%d' % i] = sig.max()
        data_row['min_%d' % i] = sig.min()
        data_row['mean_change_abs_%d' % i] = np.mean(np.diff(sig))
        # NOTE(review): mean of the *indices* of nonzero relative changes
        # (quirk inherited from the original kernels); kept as-is.
        data_row['mean_change_rate_%d' % i] = np.mean(np.nonzero((np.diff(sig) / sig[:-1]))[0])
        data_row['abs_max_%d' % i] = np.abs(sig).max()
        data_row['abs_min_%d' % i] = np.abs(sig).min()

        # Statistics on the first/last 50k and 10k samples.
        data_row['std_first_50000_%d' % i] = sig[:50000].std()
        data_row['std_last_50000_%d' % i] = sig[-50000:].std()
        data_row['std_first_10000_%d' % i] = sig[:10000].std()
        data_row['std_last_10000_%d' % i] = sig[-10000:].std()
        data_row['avg_first_50000_%d' % i] = sig[:50000].mean()
        data_row['avg_last_50000_%d' % i] = sig[-50000:].mean()
        data_row['avg_first_10000_%d' % i] = sig[:10000].mean()
        data_row['avg_last_10000_%d' % i] = sig[-10000:].mean()
        data_row['min_first_50000_%d' % i] = sig[:50000].min()
        data_row['min_last_50000_%d' % i] = sig[-50000:].min()
        data_row['min_first_10000_%d' % i] = sig[:10000].min()
        data_row['min_last_10000_%d' % i] = sig[-10000:].min()
        data_row['max_first_50000_%d' % i] = sig[:50000].max()
        data_row['max_last_50000_%d' % i] = sig[-50000:].max()
        data_row['max_first_10000_%d' % i] = sig[:10000].max()
        data_row['max_last_10000_%d' % i] = sig[-10000:].max()

        data_row['max_to_min_%d' % i] = sig.max() / np.abs(sig.min())
        data_row['max_to_min_diff_%d' % i] = sig.max() - np.abs(sig.min())
        data_row['count_big_%d' % i] = len(sig[np.abs(sig) > 500])
        data_row['sum_%d' % i] = sig.sum()
        data_row['mean_change_rate_first_50000_%d' % i] = np.mean(
            np.nonzero((np.diff(sig[:50000]) / sig[:50000][:-1]))[0])
        data_row['mean_change_rate_last_50000_%d' % i] = np.mean(
            np.nonzero((np.diff(sig[-50000:]) / sig[-50000:][:-1]))[0])
        data_row['mean_change_rate_first_10000_%d' % i] = np.mean(
            np.nonzero((np.diff(sig[:10000]) / sig[:10000][:-1]))[0])
        data_row['mean_change_rate_last_10000_%d' % i] = np.mean(
            np.nonzero((np.diff(sig[-10000:]) / sig[-10000:][:-1]))[0])

        # Percentile ladder of the signal and of its absolute value.
        for p in [1, 5, 10, 25, 50, 75, 90, 95, 99]:
            data_row['percentile_p{}_{}'.format(p, i)] = np.percentile(sig, p)
            data_row['abd_percentile_p{}_{}'.format(p, i)] = np.percentile(np.abs(sig), p)

        data_row['trend_%d' % i] = add_trend_feature(sig)
        data_row['abs_trend_%d' % i] = add_trend_feature(sig, abs_values=True)
        data_row['abs_mean_%d' % i] = np.abs(sig).mean()
        data_row['abs_std_%d' % i] = np.abs(sig).std()
        data_row['mad_%d' % i] = sig.mad()
        data_row['kurt_%d' % i] = sig.kurtosis()
        data_row['skew_%d' % i] = sig.skew()
        data_row['med_%d' % i] = sig.median()

        # data_row['Hilbert_mean_%d' % i] = np.abs(hilbert(sig)).mean()
        data_row['Hann_window50_%d' % i] = (convolve(sig, hann(50), mode='same') / sum(hann(50))).mean()
        data_row['Hann_window500_%d' % i] = (convolve(sig, hann(500), mode='same') / sum(hann(500))).mean()
        # STA/LTA ratios at several short/long window pairs.
        data_row['classic_sta_lta0_mean_%d' % i] = classic_sta_lta(sig, 50, 1000).mean()
        data_row['classic_sta_lta1_mean_%d' % i] = classic_sta_lta(sig, 500, 10000).mean()
        data_row['classic_sta_lta2_mean_%d' % i] = classic_sta_lta(sig, 5000, 100000).mean()
        data_row['classic_sta_lta3_mean_%d' % i] = classic_sta_lta(sig, 3333, 6666).mean()
        data_row['classic_sta_lta4_mean_%d' % i] = classic_sta_lta(sig, 10000, 25000).mean()

        # Bollinger-band style bounds: rolling mean +/- 2 rolling stds.
        no_of_std = 2
        for w in [10, 100, 500]:
            signal_mean = sig.rolling(window=w).mean()
            signal_std = sig.rolling(window=w).std()
            data_row['high_bound_mean_win{}_{}'.format(w, i)] = (signal_mean + no_of_std * signal_std).mean()
            data_row['low_bound_mean_win{}_{}'.format(w, i)] = (signal_mean - no_of_std * signal_std).mean()

        # Counts of samples falling into fixed amplitude ranges.
        data_row['range_inf_4000_%d' % i] = feature_calculators.range_count(sig, -np.inf, -4000)
        data_row['range_4000_inf_%d' % i] = feature_calculators.range_count(sig, 4000, np.inf)
        for l, h in [[-4000, -2000], [-2000, 0], [0, 2000], [2000, 4000]]:
            data_row['range_{}_{}_{}'.format(np.abs(l), np.abs(h), i)] = feature_calculators.range_count(sig, l, h)

        data_row['iqr0_%d' % i] = np.subtract(*np.percentile(sig, [75, 25]))
        data_row['iqr1_%d' % i] = np.subtract(*np.percentile(sig, [95, 5]))
        data_row['ave10_%d' % i] = stats.trim_mean(sig, 0.1)

        # tsfresh time-series features.
        data_row['num_cross_0_%d' % i] = feature_calculators.number_crossing_m(sig, 0)
        data_row['ratio_value_number_%d' % i] = feature_calculators.ratio_value_number_to_time_series_length(sig)
        # data_row['var_larger_than_std_dev_%d' % i] = feature_calculators.variance_larger_than_standard_deviation(sig)
        # NOTE(review): same tsfresh call as ratio_value_number above -
        # duplicate feature under a second name; confirm intent.
        data_row['ratio_unique_values_%d' % i] = feature_calculators.ratio_value_number_to_time_series_length(sig)
        data_row['abs_energy_%d' % i] = feature_calculators.abs_energy(sig)
        data_row['abs_sum_of_changes_%d' % i] = feature_calculators.absolute_sum_of_changes(sig)
        data_row['count_above_mean_%d' % i] = feature_calculators.count_above_mean(sig)
        data_row['count_below_mean_%d' % i] = feature_calculators.count_below_mean(sig)
        data_row['mean_abs_change_%d' % i] = feature_calculators.mean_abs_change(sig)
        data_row['mean_change_%d' % i] = feature_calculators.mean_change(sig)
        data_row['first_loc_min_%d' % i] = feature_calculators.first_location_of_minimum(sig)
        data_row['first_loc_max_%d' % i] = feature_calculators.first_location_of_maximum(sig)
        data_row['last_loc_min_%d' % i] = feature_calculators.last_location_of_minimum(sig)
        data_row['last_loc_max_%d' % i] = feature_calculators.last_location_of_maximum(sig)
        data_row['long_strk_above_mean_%d' % i] = feature_calculators.longest_strike_above_mean(sig)
        data_row['long_strk_below_mean_%d' % i] = feature_calculators.longest_strike_below_mean(sig)
        # data_row['cid_ce_0_%d' % i] = feature_calculators.cid_ce(sig, 0)
        # data_row['cid_ce_1_%d' % i] = feature_calculators.cid_ce(sig, 1)

        # Parameterised tsfresh feature families.
        for j in [10, 50, ]:
            data_row['peak_num_p{}_{}'.format(j, i)] = feature_calculators.number_peaks(sig, j)
        for j in [1, 10, 50, 100]:
            data_row['spkt_welch_density_coeff{}_{}'.format(j, i)] = \
                list(feature_calculators.spkt_welch_density(sig, [{'coeff': j}]))[0][1]
        for j in [5, 10, 100]:
            data_row['c3_c{}_{}'.format(j, i)] = feature_calculators.c3(sig, j)
        for j in [5, 10, 50, 100, 1000]:
            data_row['autocorrelation_auto{}_{}'.format(j, i)] = feature_calculators.autocorrelation(sig, j)
        for j in [10, 100, 1000]:
            data_row['time_rev_asym_stat_t{}_{}'.format(j, i)] = feature_calculators.time_reversal_asymmetry_statistic(
                sig, j)
        # scipy.stats k-statistics and central moments.
        for j in range(1, 5):
            data_row['kstat_k{}_{}'.format(j, i)] = stats.kstat(sig, j)
            data_row['moment_m{}_{}'.format(j, i)] = stats.moment(sig, j)
        for j in range(1, 3):
            data_row['kstatvar_k{}_{}'.format(j, i)] = stats.kstatvar(sig, j)
        for j in [5, 10, 50, 100]:
            data_row['binned_entropy_b{}_{}'.format(j, i)] = feature_calculators.binned_entropy(sig, j)

    return data_row
range_data = data[:600] mean_abs_change = feature_calculators.mean_abs_change(data) # 前後のポイント間での差分の平均値 # np.mean(np.abs(np.diff(x))) と等しい first_location_of_maximum = feature_calculators.first_location_of_maximum(data) # 最大値が観測される位置 fft_aggregated = feature_calculators.fft_aggregated(data, [{ 'aggtype': 'skew' }]) # フーリエ変換 number_peaks = feature_calculators.number_peaks(data[:1000], 50) # ピークの数 index_mass_quantile = feature_calculators.index_mass_quantile( data[:1000], [{ 'q': 0.5 }, { 'q': 0.1 }]) # パーセンタイル処理 linear_trend = feature_calculators.linear_trend(range_data, [{ 'attr': "slope" }, { 'attr': 'intercept'