def make_features(df_x):
    """Create the features for one block of data.

    The data is split into blocks and features are created for each of them;
    this function builds the feature dictionary for a single block.

    :param df_x: one block of the signal. It is used as a pandas Series
        (``rolling``, ``clip``, ``abs`` and ``median`` are called on it).
    :return: dict mapping feature name to the computed value, in insertion order
    """
    feat = {}

    # Spectral density (bins picked by hand) - something similar is used
    # when analysing voice with neural networks.
    welch = signal.welch(df_x)[1]
    for num in (2, 3, 28, 30):
        feat[f"welch_{num}"] = welch[num]

    # Features on rolling medians - conceptually close to pooling, only it is
    # median pooling rather than max/average pooling.
    feat["mean_abs_med"] = (df_x - df_x.mean()).abs().median()

    # Computed once and reused below for the 5% quantile feature.
    roll_std_375 = df_x.rolling(375).std().dropna()
    feat["std_roll_med_375"] = roll_std_375.median()

    n_rolled = len(roll_std_375)
    half = n_rolled // 2
    feat["std_roll_half1"] = roll_std_375.iloc[:half].median()
    # Slice from an explicit start index: ``iloc[-half:]`` would select the
    # whole series (not an empty tail) when ``half`` is 0.
    feat["std_roll_half2"] = roll_std_375.iloc[n_rolled - half:].median()

    # Features on deep rolling quantiles - also a kind of quantile pooling.
    feat["q05_roll_std_25"] = df_x.rolling(25).std().dropna().quantile(0.05)
    feat["q05_roll_std_375"] = roll_std_375.quantile(0.05)
    feat["q05_roll_std_1500"] = df_x.rolling(1500).std().dropna().quantile(0.05)
    feat["q05_roll_std_1000"] = df_x.rolling(1000).std().dropna().quantile(0.05)

    # Both tail quantiles come from the same rolling mean, so build it once.
    roll_mean_1500 = df_x.rolling(1500).mean().dropna()
    feat["q01_roll_mean_1500"] = roll_mean_1500.quantile(0.01)
    feat["q99_roll_mean_1500"] = roll_mean_1500.quantile(0.99)

    feat["ave10"] = stats.trim_mean(df_x, 0.1)

    # Pre Main
    feat["num_peaks_10"] = feature_calculators.number_peaks(df_x, 10)
    feat["percentile_roll_std_5"] = np.percentile(
        df_x.rolling(10000).std().dropna().values, 5)
    feat["afc_50"] = feature_calculators.autocorrelation(df_x, 50)

    # Same spectral estimate as above, but on the signal clipped to [-11, 20].
    welch_clipped = signal.welch(df_x.clip(-11, 20))[1]
    for num in range(33):
        feat[f"welch_clipped_{num}"] = welch_clipped[num]

    return feat
Beispiel #2
0
def npeaks_100(arr):
    """Return ``feature_calculators.number_peaks(arr, n=100)``.

    Thin wrapper that pins the ``n`` argument to 100.

    NOTE(review): ``feature_calculators`` is presumably
    ``tsfresh.feature_extraction.feature_calculators`` - confirm against the
    file's imports.
    """
    return feature_calculators.number_peaks(arr, n=100)
def create_features(seg_id, seg, X, st, end):
    """
    create features including fft features, statistical features and time series features

    Every feature is written to ``X.loc[seg_id, <name>]``. The statements are
    kept in a fixed order because that order decides the order in which new
    columns appear in ``X``.

    :param seg_id: the ID for a sample
    :param seg: a signal segment; its ``'acoustic_data'`` column is the signal
    :param X: train set features before creating these features
    :param st: the start index of the signal segment
    :param end: the end index of the signal segment
    :return: train set features after creating these features
    """
    try:
        # test set won't create these features because its seg_id is string
        X.loc[seg_id, 'seg_id'] = np.int32(seg_id)
        X.loc[seg_id, 'seg_start'] = np.int32(st)
        X.loc[seg_id, 'seg_end'] = np.int32(end)
    except ValueError:
        pass

    xc = pd.Series(seg['acoustic_data'].values)
    xcdm = xc - np.mean(xc)

    b, a = des_bw_filter_lp(cutoff=18000)
    xcz = sg.lfilter(b, a, xcdm)

    zc = np.fft.fft(xcz)[:MAX_FREQ]

    # FFT transform values
    realFFT = np.real(zc)
    imagFFT = np.imag(zc)
    magFFT = np.sqrt(realFFT ** 2 + imagFFT ** 2)
    # arctan maps a +/-inf ratio to +/-pi/2 by itself, so the only value that
    # needs patching is the NaN produced by 0/0; nan_to_num turns it into 0.
    with np.errstate(divide='ignore', invalid='ignore'):
        phzFFT = np.nan_to_num(np.arctan(imagFFT / realFFT))

    for freq in range(0, MAX_FREQ, FREQ_BAND):
        mag_band = magFFT[freq: freq + FREQ_BAND]
        phz_band = phzFFT[freq: freq + FREQ_BAND]

        X.loc[seg_id, 'FFT_Mag_01q%d' % freq] = np.quantile(mag_band, 0.01)
        X.loc[seg_id, 'FFT_Mag_10q%d' % freq] = np.quantile(mag_band, 0.1)
        X.loc[seg_id, 'FFT_Mag_90q%d' % freq] = np.quantile(mag_band, 0.9)
        X.loc[seg_id, 'FFT_Mag_99q%d' % freq] = np.quantile(mag_band, 0.99)
        X.loc[seg_id, 'FFT_Mag_mean%d' % freq] = np.mean(mag_band)
        X.loc[seg_id, 'FFT_Mag_std%d' % freq] = np.std(mag_band)
        X.loc[seg_id, 'FFT_Mag_max%d' % freq] = np.max(mag_band)

        X.loc[seg_id, 'FFT_Phz_mean%d' % freq] = np.mean(phz_band)
        X.loc[seg_id, 'FFT_Phz_std%d' % freq] = np.std(phz_band)

    for part_name, part in (('R', realFFT), ('I', imagFFT)):
        X.loc[seg_id, 'FFT_%smean' % part_name] = part.mean()
        X.loc[seg_id, 'FFT_%sstd' % part_name] = part.std()
        X.loc[seg_id, 'FFT_%smax' % part_name] = part.max()
        X.loc[seg_id, 'FFT_%smin' % part_name] = part.min()

    real_6000 = realFFT[:6000]
    X.loc[seg_id, 'FFT_Rmean_first_6000'] = real_6000.mean()
    # The double underscore is part of the existing column name; keep it.
    X.loc[seg_id, 'FFT_Rstd__first_6000'] = real_6000.std()
    X.loc[seg_id, 'FFT_Rmax_first_6000'] = real_6000.max()
    X.loc[seg_id, 'FFT_Rmin_first_6000'] = real_6000.min()
    real_18000 = realFFT[:18000]
    X.loc[seg_id, 'FFT_Rmean_first_18000'] = real_18000.mean()
    X.loc[seg_id, 'FFT_Rstd_first_18000'] = real_18000.std()
    X.loc[seg_id, 'FFT_Rmax_first_18000'] = real_18000.max()
    X.loc[seg_id, 'FFT_Rmin_first_18000'] = real_18000.min()

    del xcz
    del zc

    # Signal 0 is the raw segment; 1 is the low-pass below 2500, 2..8 are the
    # 2500-wide band-passes up to 20000 and 9 is the high-pass above 20000.
    sigs = [xc]
    b, a = des_bw_filter_lp(cutoff=2500)
    sigs.append(pd.Series(sg.lfilter(b, a, xcdm)))
    for low in range(2500, 20000, 2500):
        b, a = des_bw_filter_bp(low=low, high=low + 2500)
        sigs.append(pd.Series(sg.lfilter(b, a, xcdm)))
    b, a = des_bw_filter_hp(cutoff=20000)
    sigs.append(pd.Series(sg.lfilter(b, a, xcdm)))

    # 'Hann_window_mean' is always computed from the raw signal, so it does not
    # depend on the loop variable: convolve once instead of once per signal.
    hann_window = hann(150)
    hann_window_mean = (convolve(xc, hann_window, mode='same') / sum(hann_window)).mean()

    tail_stats = (('std', 'std'), ('avg', 'mean'), ('min', 'min'), ('max', 'max'))
    quantiles = (('q95', 0.95), ('q99', 0.99), ('q05', 0.05), ('q01', 0.01))
    sta_lta_windows = ((500, 10000), (5000, 100000), (3333, 6666), (10000, 25000))

    for i, sig in enumerate(sigs):
        abs_sig = np.abs(sig)
        tails = (
            ('first_50000', sig[:50000]),
            ('last_50000', sig[-50000:]),
            ('first_10000', sig[:10000]),
            ('last_10000', sig[-10000:]),
        )

        X.loc[seg_id, 'mean_%d' % i] = sig.mean()
        X.loc[seg_id, 'std_%d' % i] = sig.std()
        X.loc[seg_id, 'max_%d' % i] = sig.max()
        X.loc[seg_id, 'min_%d' % i] = sig.min()

        X.loc[seg_id, 'mean_change_abs_%d' % i] = np.mean(np.diff(sig))
        X.loc[seg_id, 'mean_change_rate_%d' % i] = calc_mean_change_rate(sig)
        X.loc[seg_id, 'abs_max_%d' % i] = abs_sig.max()

        # std / avg / min / max over the first and last 50000 and 10000 samples.
        for stat_name, method in tail_stats:
            for tail_name, tail in tails:
                X.loc[seg_id, '%s_%s_%d' % (stat_name, tail_name, i)] = getattr(tail, method)()

        X.loc[seg_id, 'max_to_min_%d' % i] = sig.max() / np.abs(sig.min())
        X.loc[seg_id, 'max_to_min_diff_%d' % i] = sig.max() - np.abs(sig.min())
        X.loc[seg_id, 'count_big_%d' % i] = len(sig[abs_sig > 500])

        for tail_name, tail in tails:
            X.loc[seg_id, 'mean_change_rate_%s_%d' % (tail_name, i)] = calc_mean_change_rate(tail)

        for q_name, q in quantiles:
            X.loc[seg_id, '%s_%d' % (q_name, i)] = np.quantile(sig, q)
        for q_name, q in quantiles:
            X.loc[seg_id, 'abs_%s_%d' % (q_name, i)] = np.quantile(abs_sig, q)

        X.loc[seg_id, 'trend_%d' % i] = add_trend_feature(sig)
        X.loc[seg_id, 'abs_trend_%d' % i] = add_trend_feature(sig, abs_values=True)
        X.loc[seg_id, 'abs_mean_%d' % i] = abs_sig.mean()
        X.loc[seg_id, 'abs_std_%d' % i] = abs_sig.std()

        # Mean absolute deviation around the mean. Written out explicitly
        # because Series.mad() was removed in pandas 2.0; same formula.
        X.loc[seg_id, 'mad_%d' % i] = (sig - sig.mean()).abs().mean()
        X.loc[seg_id, 'kurt_%d' % i] = sig.kurtosis()
        X.loc[seg_id, 'skew_%d' % i] = sig.skew()
        X.loc[seg_id, 'med_%d' % i] = sig.median()

        X.loc[seg_id, 'Hilbert_mean_%d' % i] = np.abs(hilbert(sig)).mean()
        # Assigned here (not before the loop) so the column keeps its position.
        X.loc[seg_id, 'Hann_window_mean'] = hann_window_mean

        for n, (sta, lta) in enumerate(sta_lta_windows, start=1):
            X.loc[seg_id, 'classic_sta_lta%d_mean_%d' % (n, i)] = classic_sta_lta(sig, sta, lta).mean()

        moving_avg = {}
        for window in (700, 1500, 3000, 6000):
            moving_avg[window] = sig.rolling(window=window).mean().mean(skipna=True)
            X.loc[seg_id, 'Moving_average_%d_mean_%d' % (window, i)] = moving_avg[window]

        for span in (300, 3000, 30000):
            X.loc[seg_id, 'exp_Moving_average_%d_mean_%d' % (span, i)] = (
                sig.ewm(span=span).mean().mean(skipna=True))

        # Bollinger-style bands: moving average +/- no_of_std * rolling std.
        no_of_std = 3
        std_700 = sig.rolling(window=700).std().mean()
        X.loc[seg_id, 'MA_700MA_std_mean_%d' % i] = std_700
        X.loc[seg_id, 'MA_700MA_BB_high_mean_%d' % i] = moving_avg[700] + no_of_std * std_700
        X.loc[seg_id, 'MA_700MA_BB_low_mean_%d' % i] = moving_avg[700] - no_of_std * std_700

        std_400 = sig.rolling(window=400).std().mean()
        X.loc[seg_id, 'MA_400MA_std_mean_%d' % i] = std_400
        # The 400-sample bands must be centred on the 400-sample moving
        # average; the previous code reused the 700-sample one by mistake.
        # The average is kept local so no new column is introduced.
        ma_400 = sig.rolling(window=400).mean().mean(skipna=True)
        X.loc[seg_id, 'MA_400MA_BB_high_mean_%d' % i] = ma_400 + no_of_std * std_400
        X.loc[seg_id, 'MA_400MA_BB_low_mean_%d' % i] = ma_400 - no_of_std * std_400
        X.loc[seg_id, 'MA_1000MA_std_mean_%d' % i] = sig.rolling(window=1000).std().mean()

        X.loc[seg_id, 'iqr_%d' % i] = np.subtract(*np.percentile(sig, [75, 25]))
        X.loc[seg_id, 'q999_%d' % i] = np.quantile(sig, 0.999)
        X.loc[seg_id, 'q001_%d' % i] = np.quantile(sig, 0.001)
        X.loc[seg_id, 'ave10_%d' % i] = stats.trim_mean(sig, 0.1)

        X.loc[seg_id, 'num_peaks_10_%d' % i] = feature_calculators.number_peaks(sig, 10)
        X.loc[seg_id, 'cid_ce_1_%d' % i] = feature_calculators.cid_ce(sig, 1)  # time series complexity
        X.loc[seg_id, 'count_1000_0_%d' % i] = feature_calculators.range_count(sig, -1000, 0)
        X.loc[seg_id, 'binned_entropy_5_%d' % i] = feature_calculators.binned_entropy(sig, 5)
        X.loc[seg_id, 'binned_entropy_15_%d' % i] = feature_calculators.binned_entropy(sig, 15)

    # sliding window is a kind of filter, so this code is out of the cycle of band pass
    for windows in [10, 100, 1000]:
        rolled_by_kind = (
            ('std', xc.rolling(windows).std().dropna()),
            ('mean', xc.rolling(windows).mean().dropna()),
        )
        for kind, rolled in rolled_by_kind:
            suffix = 'roll_%s_%d' % (kind, windows)
            X.loc[seg_id, 'ave_' + suffix] = rolled.mean()
            X.loc[seg_id, 'std_' + suffix] = rolled.std()
            X.loc[seg_id, 'max_' + suffix] = rolled.max()
            X.loc[seg_id, 'min_' + suffix] = rolled.min()
            X.loc[seg_id, 'q01_' + suffix] = np.quantile(rolled, 0.01)
            X.loc[seg_id, 'q05_' + suffix] = np.quantile(rolled, 0.05)
            X.loc[seg_id, 'q95_' + suffix] = np.quantile(rolled, 0.95)
            X.loc[seg_id, 'q99_' + suffix] = np.quantile(rolled, 0.99)
            X.loc[seg_id, 'av_change_abs_' + suffix] = np.mean(np.diff(rolled))
            X.loc[seg_id, 'av_change_rate_' + suffix] = calc_mean_change_rate(rolled)
            X.loc[seg_id, 'abs_max_' + suffix] = np.abs(rolled).max()

    return X
Beispiel #4
0
def create_features(seg):
    """Build a flat dictionary of FFT, statistical and rolling features for one segment.

    Keys are inserted in a fixed order, so the order of the statements below
    defines the order of the resulting features.

    :param seg: the signal segment; used as a pandas Series (``rolling`` is
        called on it) and passed to ``des_filter``
    :return: dict mapping feature name to its value
    """

    def _mean_change_rate(values):
        """Return the mean of the finite, non-zero relative changes between consecutive samples."""
        values = np.asarray(values, dtype=float)
        with np.errstate(divide='ignore', invalid='ignore'):
            change = np.diff(values) / values[:-1]
        change = change[np.isfinite(change) & (change != 0)]
        return change.mean() if change.size else np.nan

    data_row = {}

    xcz = des_filter(seg, high=CUTOFF)

    zc = np.fft.fft(xcz)[:MAX_FREQ]

    # FFT transform values
    realFFT = np.real(zc)
    imagFFT = np.imag(zc)
    magFFT = np.abs(zc)
    # np.angle never returns +/-inf, so only NaN needs to be replaced.
    phzFFT = np.nan_to_num(np.angle(zc))

    for freq in range(0, MAX_FREQ, FREQ_STEP):
        mag_band = magFFT[freq: freq + FREQ_STEP]
        phz_band = phzFFT[freq: freq + FREQ_STEP]

        data_row['FFT_Mag_01q%d' % freq] = np.quantile(mag_band, 0.01)
        data_row['FFT_Mag_10q%d' % freq] = np.quantile(mag_band, 0.1)
        data_row['FFT_Mag_90q%d' % freq] = np.quantile(mag_band, 0.9)
        data_row['FFT_Mag_99q%d' % freq] = np.quantile(mag_band, 0.99)
        data_row['FFT_Mag_mean%d' % freq] = np.mean(mag_band)
        data_row['FFT_Mag_std%d' % freq] = np.std(mag_band)
        data_row['FFT_Mag_max%d' % freq] = np.max(mag_band)

        data_row['FFT_Phz_mean%d' % freq] = np.mean(phz_band)
        data_row['FFT_Phz_std%d' % freq] = np.std(phz_band)

    for part_name, part in (('R', realFFT), ('I', imagFFT)):
        data_row['FFT_%smean' % part_name] = part.mean()
        data_row['FFT_%sstd' % part_name] = part.std()
        data_row['FFT_%smax' % part_name] = part.max()
        data_row['FFT_%smin' % part_name] = part.min()

    real_6000 = realFFT[:6000]
    data_row['FFT_Rmean_first_6000'] = real_6000.mean()
    # The double underscore is part of the existing feature name; keep it.
    data_row['FFT_Rstd__first_6000'] = real_6000.std()
    data_row['FFT_Rmax_first_6000'] = real_6000.max()
    data_row['FFT_Rmin_first_6000'] = real_6000.min()
    real_18000 = realFFT[:18000]
    data_row['FFT_Rmean_first_18000'] = real_18000.mean()
    data_row['FFT_Rstd_first_18000'] = real_18000.std()
    data_row['FFT_Rmax_first_18000'] = real_18000.max()
    data_row['FFT_Rmin_first_18000'] = real_18000.min()

    del xcz
    del zc
    gc.collect()

    # Signal 0 is the raw segment, followed by one filtered copy per band:
    # only ``high`` for the first band, only ``low`` for the band starting at
    # MAX_FREQ, and both bounds for the bands in between.
    sigs = [seg]
    for freq in range(0, MAX_FREQ + FREQ_STEP, FREQ_STEP):
        if freq == 0:
            band = des_filter(seg, high=FREQ_STEP)
        elif freq == MAX_FREQ:
            band = des_filter(seg, low=freq)
        else:
            band = des_filter(seg, low=freq, high=freq + FREQ_STEP)
        sigs.append(pd.Series(band))

    # 'Hann_window_mean' is always computed from the raw segment, so it does
    # not depend on the loop variable: convolve once instead of once per signal.
    hann_window = hann(150)
    hann_window_mean = (convolve(seg, hann_window, mode='same') / sum(hann_window)).mean()

    tail_stats = (('std', 'std'), ('avg', 'mean'), ('min', 'min'), ('max', 'max'))
    quantiles = (('q95', 0.95), ('q99', 0.99), ('q05', 0.05), ('q01', 0.01))
    sta_lta_windows = ((500, 10000), (5000, 100000), (3333, 6666), (10000, 25000))
    # (label used in the key, span actually passed to ewm).
    # NOTE(review): the '30000' feature is computed with span=6000; name and
    # span disagree. Left as-is because it is unclear which one is intended.
    ewm_spans = ((300, 300), (3000, 3000), (30000, 6000))

    for i, sig in enumerate(sigs):
        abs_sig = np.abs(sig)
        tails = (
            ('first_50000', sig[:50000]),
            ('last_50000', sig[-50000:]),
            ('first_10000', sig[:10000]),
            ('last_10000', sig[-10000:]),
        )

        data_row['mean_%d' % i] = sig.mean()
        data_row['std_%d' % i] = sig.std()
        data_row['max_%d' % i] = sig.max()
        data_row['min_%d' % i] = sig.min()

        data_row['mean_change_abs_%d' % i] = np.mean(np.diff(sig))
        # Previously this was the mean *index* of the non-zero changes
        # (np.mean(np.nonzero(...)[0])), not a change rate.
        data_row['mean_change_rate_%d' % i] = _mean_change_rate(sig)
        data_row['abs_max_%d' % i] = abs_sig.max()
        data_row['abs_min_%d' % i] = abs_sig.min()

        # std / avg / min / max over the first and last 50000 and 10000 samples.
        for stat_name, method in tail_stats:
            for tail_name, tail in tails:
                data_row['%s_%s_%d' % (stat_name, tail_name, i)] = getattr(tail, method)()

        data_row['max_to_min_%d' % i] = sig.max() / np.abs(sig.min())
        data_row['max_to_min_diff_%d' % i] = sig.max() - np.abs(sig.min())
        data_row['count_big_%d' % i] = len(sig[abs_sig > 500])
        data_row['sum_%d' % i] = sig.sum()

        for tail_name, tail in tails:
            data_row['mean_change_rate_%s_%d' % (tail_name, i)] = _mean_change_rate(tail)

        for q_name, q in quantiles:
            data_row['%s_%d' % (q_name, i)] = np.quantile(sig, q)
        for q_name, q in quantiles:
            data_row['abs_%s_%d' % (q_name, i)] = np.quantile(abs_sig, q)

        data_row['trend_%d' % i] = add_trend_feature(sig)
        data_row['abs_trend_%d' % i] = add_trend_feature(sig, abs_values=True)
        data_row['abs_mean_%d' % i] = abs_sig.mean()
        data_row['abs_std_%d' % i] = abs_sig.std()

        # Mean absolute deviation around the mean. Written out explicitly
        # because Series.mad() was removed in pandas 2.0; same formula.
        data_row['mad_%d' % i] = (sig - sig.mean()).abs().mean()
        data_row['kurt_%d' % i] = sig.kurtosis()
        data_row['skew_%d' % i] = sig.skew()
        data_row['med_%d' % i] = sig.median()

        data_row['Hilbert_mean_%d' % i] = np.abs(hilbert(sig)).mean()
        # Assigned here (not before the loop) so the key keeps its position.
        data_row['Hann_window_mean'] = hann_window_mean

        for n, (sta, lta) in enumerate(sta_lta_windows, start=1):
            data_row['classic_sta_lta%d_mean_%d' % (n, i)] = classic_sta_lta(sig, sta, lta).mean()

        moving_avg = {}
        for window in (400, 700, 1500, 3000, 6000):
            moving_avg[window] = sig.rolling(window=window).mean().mean(skipna=True)
            data_row['Moving_average_%d_mean_%d' % (window, i)] = moving_avg[window]

        for label, span in ewm_spans:
            data_row['exp_Moving_average_%d_mean_%d' % (label, i)] = (
                sig.ewm(span=span).mean().mean(skipna=True))

        # Bollinger-style bands: moving average +/- no_of_std * rolling std.
        no_of_std = 2
        for window in (700, 400):
            rolling_std = sig.rolling(window=window).std().mean(skipna=True)
            data_row['MA_%dMA_std_mean_%d' % (window, i)] = rolling_std
            data_row['MA_%dMA_BB_high_mean_%d' % (window, i)] = moving_avg[window] + no_of_std * rolling_std
            data_row['MA_%dMA_BB_low_mean_%d' % (window, i)] = moving_avg[window] - no_of_std * rolling_std

        data_row['iqr0_%d' % i] = np.subtract(*np.percentile(sig, [75, 25]))
        data_row['q999_%d' % i] = np.quantile(sig, 0.999)
        data_row['q001_%d' % i] = np.quantile(sig, 0.001)
        data_row['ave10_%d' % i] = stats.trim_mean(sig, 0.1)
        data_row['peak10_num_%d' % i] = feature_calculators.number_peaks(sig, 10)
        data_row['num_cross_0_%d' % i] = feature_calculators.number_crossing_m(sig, 0)
        data_row['autocorrelation_%d' % i] = feature_calculators.autocorrelation(sig, 5)
        data_row['ratio_value_number_%d' % i] = feature_calculators.ratio_value_number_to_time_series_length(sig)

    for windows in [50, 200, 1000]:
        # ``.values`` turns both into plain numpy arrays, so ``std()`` below is
        # numpy's (population) standard deviation.
        x_roll_std = seg.rolling(windows).std().dropna().values
        x_roll_mean = seg.rolling(windows).mean().dropna().values

        for kind, rolled in (('std', x_roll_std), ('mean', x_roll_mean)):
            suffix = 'roll_%s_%d' % (kind, windows)
            data_row['ave_' + suffix] = rolled.mean()
            data_row['std_' + suffix] = rolled.std()
            data_row['max_' + suffix] = rolled.max()
            data_row['min_' + suffix] = rolled.min()
            data_row['q01_' + suffix] = np.quantile(rolled, 0.01)
            data_row['q05_' + suffix] = np.quantile(rolled, 0.05)
            data_row['q95_' + suffix] = np.quantile(rolled, 0.95)
            data_row['q99_' + suffix] = np.quantile(rolled, 0.99)
            data_row['av_change_abs_' + suffix] = np.mean(np.diff(rolled))
            data_row['av_change_rate_' + suffix] = _mean_change_rate(rolled)
            data_row['abs_max_' + suffix] = np.abs(rolled).max()

        data_row['num_peak10_rolling_' + str(windows)] = feature_calculators.number_peaks(x_roll_mean, 10)
        data_row['num_cross0_rolling_' + str(windows)] = feature_calculators.number_crossing_m(x_roll_mean, 0)
        data_row['autocorrelation_rolling_' + str(windows)] = feature_calculators.autocorrelation(x_roll_mean, 5)
        data_row['ratio_value_number_rolling_' + str(windows)] = feature_calculators.ratio_value_number_to_time_series_length(x_roll_mean)
        data_row['classic_sta_lta_rolling_' + str(windows)] = classic_sta_lta(x_roll_mean, 500, 10000).mean()

    return data_row
Beispiel #5
0
def create_features2(seg, ):
    data_row = {}

    xcz = des_filter(seg, high=CUTOFF)

    zc = np.fft.fft(xcz)
    zc = zc[:MAX_FREQ]

    # FFT transform values
    realFFT = np.real(zc)
    imagFFT = np.imag(zc)

    freq_bands = list(range(0, MAX_FREQ, FREQ_STEP))
    magFFT = np.abs(zc)
    phzFFT = np.angle(zc)
    phzFFT[phzFFT == -np.inf] = -np.pi / 2.0
    phzFFT[phzFFT == np.inf] = np.pi / 2.0
    phzFFT = np.nan_to_num(phzFFT)

    for freq in freq_bands:
        data_row['FFT_Mag_01q%d' % freq] = np.quantile(magFFT[freq: freq + FREQ_STEP], 0.01)
        data_row['FFT_Mag_10q%d' % freq] = np.quantile(magFFT[freq: freq + FREQ_STEP], 0.1)
        data_row['FFT_Mag_90q%d' % freq] = np.quantile(magFFT[freq: freq + FREQ_STEP], 0.9)
        data_row['FFT_Mag_99q%d' % freq] = np.quantile(magFFT[freq: freq + FREQ_STEP], 0.99)

        data_row['FFT_Mag_mean%d' % freq] = np.mean(magFFT[freq: freq + FREQ_STEP])
        data_row['FFT_Mag_std%d' % freq] = np.std(magFFT[freq: freq + FREQ_STEP])
        data_row['FFT_Mag_max%d' % freq] = np.max(magFFT[freq: freq + FREQ_STEP])
        data_row['FFT_Mag_min%d' % freq] = np.min(magFFT[freq: freq + FREQ_STEP])

        data_row['FFT_Phz_mean%d' % freq] = np.mean(phzFFT[freq: freq + FREQ_STEP])
        data_row['FFT_Phz_std%d' % freq] = np.std(phzFFT[freq: freq + FREQ_STEP])
        data_row['FFT_Phz_max%d' % freq] = np.max(phzFFT[freq: freq + FREQ_STEP])
        data_row['FFT_Phz_min%d' % freq] = np.min(phzFFT[freq: freq + FREQ_STEP])

    data_row['FFT_Rmean'] = realFFT.mean()
    data_row['FFT_Rstd'] = realFFT.std()
    data_row['FFT_Rmax'] = realFFT.max()
    data_row['FFT_Rmin'] = realFFT.min()
    data_row['FFT_Imean'] = imagFFT.mean()
    data_row['FFT_Istd'] = imagFFT.std()
    data_row['FFT_Imax'] = imagFFT.max()
    data_row['FFT_Imin'] = imagFFT.min()

    data_row['FFT_Rmean_first_6000'] = realFFT[:6000].mean()
    data_row['FFT_Rstd__first_6000'] = realFFT[:6000].std()
    data_row['FFT_Rmax_first_6000'] = realFFT[:6000].max()
    data_row['FFT_Rmin_first_6000'] = realFFT[:6000].min()
    data_row['FFT_Rmean_first_18000'] = realFFT[:18000].mean()
    data_row['FFT_Rstd_first_18000'] = realFFT[:18000].std()
    data_row['FFT_Rmax_first_18000'] = realFFT[:18000].max()
    data_row['FFT_Rmin_first_18000'] = realFFT[:18000].min()

    del xcz
    del zc
    # gc.collect()

    sigs = [seg]
    for freq in range(0, MAX_FREQ + FREQ_STEP, FREQ_STEP):
        if freq == 0:
            xc_ = des_filter(seg, high=FREQ_STEP)
        elif freq == MAX_FREQ:
            xc_ = des_filter(seg, low=freq)
        else:
            xc_ = des_filter(seg, low=freq, high=freq + FREQ_STEP)
        sigs.append(pd.Series(xc_))

    for window in [50, 200, 1000]:
        roll_mean = seg.rolling(window).mean().dropna()
        roll_std = seg.rolling(window).std().dropna()
        sigs.append(pd.Series(roll_mean))
        sigs.append(pd.Series(roll_std))

    for span in [30, 300, 3000]:
        exp_mean = seg.ewm(span).mean().dropna()
        exp_std = seg.ewm(span).std().dropna()
        sigs.append(pd.Series(exp_mean))
        sigs.append(pd.Series(exp_std))

    for i, sig in enumerate(sigs):

        data_row['mean_%d' % i] = sig.mean()
        data_row['std_%d' % i] = sig.std()
        data_row['max_%d' % i] = sig.max()
        data_row['min_%d' % i] = sig.min()

        data_row['mean_change_abs_%d' % i] = np.mean(np.diff(sig))
        data_row['mean_change_rate_%d' % i] = np.mean(np.nonzero((np.diff(sig) / sig[:-1]))[0])
        data_row['abs_max_%d' % i] = np.abs(sig).max()
        data_row['abs_min_%d' % i] = np.abs(sig).min()

        data_row['std_first_50000_%d' % i] = sig[:50000].std()
        data_row['std_last_50000_%d' % i] = sig[-50000:].std()
        data_row['std_first_10000_%d' % i] = sig[:10000].std()
        data_row['std_last_10000_%d' % i] = sig[-10000:].std()

        data_row['avg_first_50000_%d' % i] = sig[:50000].mean()
        data_row['avg_last_50000_%d' % i] = sig[-50000:].mean()
        data_row['avg_first_10000_%d' % i] = sig[:10000].mean()
        data_row['avg_last_10000_%d' % i] = sig[-10000:].mean()

        data_row['min_first_50000_%d' % i] = sig[:50000].min()
        data_row['min_last_50000_%d' % i] = sig[-50000:].min()
        data_row['min_first_10000_%d' % i] = sig[:10000].min()
        data_row['min_last_10000_%d' % i] = sig[-10000:].min()

        data_row['max_first_50000_%d' % i] = sig[:50000].max()
        data_row['max_last_50000_%d' % i] = sig[-50000:].max()
        data_row['max_first_10000_%d' % i] = sig[:10000].max()
        data_row['max_last_10000_%d' % i] = sig[-10000:].max()

        data_row['max_to_min_%d' % i] = sig.max() / np.abs(sig.min())
        data_row['max_to_min_diff_%d' % i] = sig.max() - np.abs(sig.min())
        data_row['count_big_%d' % i] = len(sig[np.abs(sig) > 500])
        data_row['sum_%d' % i] = sig.sum()

        data_row['mean_change_rate_first_50000_%d' % i] = np.mean(
            np.nonzero((np.diff(sig[:50000]) / sig[:50000][:-1]))[0])
        data_row['mean_change_rate_last_50000_%d' % i] = np.mean(
            np.nonzero((np.diff(sig[-50000:]) / sig[-50000:][:-1]))[0])
        data_row['mean_change_rate_first_10000_%d' % i] = np.mean(
            np.nonzero((np.diff(sig[:10000]) / sig[:10000][:-1]))[0])
        data_row['mean_change_rate_last_10000_%d' % i] = np.mean(
            np.nonzero((np.diff(sig[-10000:]) / sig[-10000:][:-1]))[0])

        for p in [1, 5, 10, 25, 50, 75, 90, 95, 99]:
            data_row['percentile_p{}_{}'.format(p, i)] = np.percentile(sig, p)
            data_row['abd_percentile_p{}_{}'.format(p, i)] = np.percentile(np.abs(sig), p)

        data_row['trend_%d' % i] = add_trend_feature(sig)
        data_row['abs_trend_%d' % i] = add_trend_feature(sig, abs_values=True)
        data_row['abs_mean_%d' % i] = np.abs(sig).mean()
        data_row['abs_std_%d' % i] = np.abs(sig).std()

        data_row['mad_%d' % i] = sig.mad()
        data_row['kurt_%d' % i] = sig.kurtosis()
        data_row['skew_%d' % i] = sig.skew()
        data_row['med_%d' % i] = sig.median()

        # data_row['Hilbert_mean_%d' % i] = np.abs(hilbert(sig)).mean()
        data_row['Hann_window50_%d' % i] = (convolve(sig, hann(50), mode='same') / sum(hann(50))).mean()
        data_row['Hann_window500_%d' % i] = (convolve(sig, hann(500), mode='same') / sum(hann(500))).mean()

        data_row['classic_sta_lta0_mean_%d' % i] = classic_sta_lta(sig, 50, 1000).mean()
        data_row['classic_sta_lta1_mean_%d' % i] = classic_sta_lta(sig, 500, 10000).mean()
        data_row['classic_sta_lta2_mean_%d' % i] = classic_sta_lta(sig, 5000, 100000).mean()
        data_row['classic_sta_lta3_mean_%d' % i] = classic_sta_lta(sig, 3333, 6666).mean()
        data_row['classic_sta_lta4_mean_%d' % i] = classic_sta_lta(sig, 10000, 25000).mean()

        no_of_std = 2
        for w in [10, 100, 500]:
            signal_mean = sig.rolling(window=w).mean()
            signal_std = sig.rolling(window=w).std()
            data_row['high_bound_mean_win{}_{}'.format(w, i)] = (signal_mean + no_of_std * signal_std).mean()
            data_row['low_bound_mean_win{}_{}'.format(w, i)] = (signal_mean - no_of_std * signal_std).mean()

        data_row['range_inf_4000_%d' % i] = feature_calculators.range_count(sig, -np.inf, -4000)
        data_row['range_4000_inf_%d' % i] = feature_calculators.range_count(sig, 4000, np.inf)
        for l, h in [[-4000, -2000], [-2000, 0], [0, 2000], [2000, 4000]]:
            data_row['range_{}_{}_{}'.format(np.abs(l), np.abs(h), i)] = feature_calculators.range_count(sig, l, h)

        data_row['iqr0_%d' % i] = np.subtract(*np.percentile(sig, [75, 25]))
        data_row['iqr1_%d' % i] = np.subtract(*np.percentile(sig, [95, 5]))
        data_row['ave10_%d' % i] = stats.trim_mean(sig, 0.1)
        data_row['num_cross_0_%d' % i] = feature_calculators.number_crossing_m(sig, 0)
        data_row['ratio_value_number_%d' % i] = feature_calculators.ratio_value_number_to_time_series_length(sig)
        # data_row['var_larger_than_std_dev_%d' % i] = feature_calculators.variance_larger_than_standard_deviation(sig)
        data_row['ratio_unique_values_%d' % i] = feature_calculators.ratio_value_number_to_time_series_length(sig)
        data_row['abs_energy_%d' % i] = feature_calculators.abs_energy(sig)
        data_row['abs_sum_of_changes_%d' % i] = feature_calculators.absolute_sum_of_changes(sig)
        data_row['count_above_mean_%d' % i] = feature_calculators.count_above_mean(sig)
        data_row['count_below_mean_%d' % i] = feature_calculators.count_below_mean(sig)
        data_row['mean_abs_change_%d' % i] = feature_calculators.mean_abs_change(sig)
        data_row['mean_change_%d' % i] = feature_calculators.mean_change(sig)
        data_row['first_loc_min_%d' % i] = feature_calculators.first_location_of_minimum(sig)
        data_row['first_loc_max_%d' % i] = feature_calculators.first_location_of_maximum(sig)
        data_row['last_loc_min_%d' % i] = feature_calculators.last_location_of_minimum(sig)
        data_row['last_loc_max_%d' % i] = feature_calculators.last_location_of_maximum(sig)
        data_row['long_strk_above_mean_%d' % i] = feature_calculators.longest_strike_above_mean(sig)
        data_row['long_strk_below_mean_%d' % i] = feature_calculators.longest_strike_below_mean(sig)
        # data_row['cid_ce_0_%d' % i] = feature_calculators.cid_ce(sig, 0)
        # data_row['cid_ce_1_%d' % i] = feature_calculators.cid_ce(sig, 1)

        for j in [10, 50, ]:
            data_row['peak_num_p{}_{}'.format(j, i)] = feature_calculators.number_peaks(sig, j)
        for j in [1, 10, 50, 100]:
            data_row['spkt_welch_density_coeff{}_{}'.format(j, i)] = \
            list(feature_calculators.spkt_welch_density(sig, [{'coeff': j}]))[0][1]
        for j in [5, 10, 100]:
            data_row['c3_c{}_{}'.format(j, i)] = feature_calculators.c3(sig, j)
        for j in [5, 10, 50, 100, 1000]:
            data_row['autocorrelation_auto{}_{}'.format(j, i)] = feature_calculators.autocorrelation(sig, j)
        for j in [10, 100, 1000]:
            data_row['time_rev_asym_stat_t{}_{}'.format(j, i)] = feature_calculators.time_reversal_asymmetry_statistic(
                sig, j)
        for j in range(1, 5):
            data_row['kstat_k{}_{}'.format(j, i)] = stats.kstat(sig, j)
            data_row['moment_m{}_{}'.format(j, i)] = stats.moment(sig, j)
        for j in range(1, 3):
            data_row['kstatvar_k{}_{}'.format(j, i)] = stats.kstatvar(sig, j)
        for j in [5, 10, 50, 100]:
            data_row['binned_entropy_b{}_{}'.format(j, i)] = feature_calculators.binned_entropy(sig, j)

    return data_row
# Example #6
# 0
# First 600 elements of data, used by later feature calculations.
range_data = data[:600]

# Mean of the absolute differences between consecutive points;
# equivalent to np.mean(np.abs(np.diff(x))).
mean_abs_change = feature_calculators.mean_abs_change(data)

# Position at which the maximum value is first observed.
first_location_of_maximum = feature_calculators.first_location_of_maximum(
    data)

# Fourier transform: only the 'skew' aggregate is requested.
fft_aggregated = feature_calculators.fft_aggregated(
    data, [{'aggtype': 'skew'}])

# Number of peaks within the first 1000 elements (second argument: 50).
number_peaks = feature_calculators.number_peaks(data[:1000], n=50)

# Percentile processing over the first 1000 elements for q = 0.5 and q = 0.1.
index_mass_quantile = feature_calculators.index_mass_quantile(
    data[:1000], [{'q': 0.5}, {'q': 0.1}])
linear_trend = feature_calculators.linear_trend(range_data,
                                                [{
                                                    'attr': "slope"
                                                }, {
                                                    'attr': 'intercept'