Example #1
import numpy as np
import pandas as pd
# Assumed alias: tsfresh's feature calculators provide the abs_energy,
# sample_entropy, and number_cwt_peaks functions used below.
import tsfresh.feature_extraction.feature_calculators as tsf

# er, read, wfdata, util_funcs, and constants are repo-local modules
# (dataset samplers, transformers, and shared configuration).


def get_data(
    mode,
    max_samples,
    n_process,
    complex_feature_channels,
    max_bckg_samps_per_file,
    use_simple_hand_engineered_features,
    random_under_sample_data_gen,
    num_seconds,
    ref="01_tcp_ar",
    num_files=None,
    freq_bins=[0, 3.5, 7.5, 14, 20, 25, 40],  # FFT band edges in Hz
    include_simple_coherence=True,
):
    eds = getDataSampleGenerator()
    train_label_files_segs = eds.get_train_split()
    test_label_files_segs = eds.get_test_split()
    valid_label_files_segs = eds.get_valid_split()

    # Increased n_process to cope with I/O-heavy processing.
    train_edss = er.EdfDatasetSegmentedSampler(
        segment_file_tuples=train_label_files_segs,
        mode=mode,
        random_under_sample=random_under_sample_data_gen,
        num_samples=max_samples,
        max_bckg_samps_per_file=max_bckg_samps_per_file,
        n_process=int(n_process * 2),
        gap=num_seconds * pd.Timedelta(seconds=1))[:]
    valid_edss = er.EdfDatasetSegmentedSampler(
        segment_file_tuples=valid_label_files_segs,
        mode=mode,
        random_under_sample=random_under_sample_data_gen,
        num_samples=max_samples,
        max_bckg_samps_per_file=max_bckg_samps_per_file,
        n_process=int(n_process * 2),
        gap=num_seconds * pd.Timedelta(seconds=1))[:]
    test_edss = er.EdfDatasetSegmentedSampler(
        segment_file_tuples=test_label_files_segs,
        mode=mode,
        random_under_sample=random_under_sample_data_gen,
        num_samples=max_samples,
        max_bckg_samps_per_file=max_bckg_samps_per_file,
        n_process=int(n_process * 2),
        gap=num_seconds * pd.Timedelta(seconds=1))[:]

    def simple_edss(edss):
        '''
        Use only a subset of channels so we don't compute all 21*20 coherence pairs.
        '''
        all_channels = util_funcs.get_common_channel_names()
        subset_channels = [
            all_channels.index(channel) for channel in complex_feature_channels
        ]
        return [(datum[0][:, subset_channels], datum[1]) for datum in edss]

    if include_simple_coherence:
        trainCoherData = np.stack([
            datum.values for datum in [
                datum[0] for datum in wfdata.CoherenceTransformer(
                    simple_edss(train_edss),
                    columns_to_use=constants.SYMMETRIC_COLUMN_SUBSET,
                    n_process=n_process,
                    is_pandas=False)[:]
            ]
        ])
        validCoherData = np.stack([
            datum.values for datum in [
                datum[0] for datum in wfdata.CoherenceTransformer(
                    simple_edss(valid_edss),
                    columns_to_use=constants.SYMMETRIC_COLUMN_SUBSET,
                    n_process=n_process,
                    is_pandas=False)[:]
            ]
        ])
        testCoherData = np.stack([
            datum.values for datum in [
                datum[0] for datum in wfdata.CoherenceTransformer(
                    simple_edss(test_edss),
                    columns_to_use=constants.SYMMETRIC_COLUMN_SUBSET,
                    n_process=n_process,
                    is_pandas=False)[:]
            ]
        ])
    if use_simple_hand_engineered_features:
        trainSHED = wfdata.SimpleHandEngineeredDataset(
            simple_edss(train_edss),
            n_process=n_process,
            is_pandas_data=False,
            features=[
                tsf.abs_energy, tsf.sample_entropy,
                lambda x: tsf.number_cwt_peaks(x,
                                               int(constants.COMMON_FREQ / 25))
            ],
            f_names=["abs_energy", "entropy", "num_peaks"],
            vectorize="full")[:]
        validSHED = wfdata.SimpleHandEngineeredDataset(
            simple_edss(valid_edss),
            n_process=n_process,
            is_pandas_data=False,
            features=[
                tsf.abs_energy, tsf.sample_entropy,
                lambda x: tsf.number_cwt_peaks(x,
                                               int(constants.COMMON_FREQ / 25))
            ],
            f_names=["abs_energy", "entropy", "num_peaks"],
            vectorize="full")[:]
        testSHED = wfdata.SimpleHandEngineeredDataset(
            simple_edss(test_edss),
            n_process=n_process,
            is_pandas_data=False,
            features=[
                tsf.abs_energy, tsf.sample_entropy,
                lambda x: tsf.number_cwt_peaks(x,
                                               int(constants.COMMON_FREQ / 25))
            ],
            f_names=["abs_energy", "entropy", "num_peaks"],
            vectorize="full")[:]

    train_edss = read.Flattener(read.EdfFFTDatasetTransformer(
        train_edss, freq_bins=freq_bins, is_pandas_data=False),
                                n_process=n_process)[:]
    valid_edss = read.Flattener(read.EdfFFTDatasetTransformer(
        valid_edss, freq_bins=freq_bins, is_pandas_data=False),
                                n_process=n_process)[:]
    test_edss = read.Flattener(read.EdfFFTDatasetTransformer(
        test_edss, freq_bins=freq_bins, is_pandas_data=False),
                               n_process=n_process)[:]

    def split_tuples(data):
        return np.stack([datum[0] for datum in data
                         ]), np.stack([datum[1] for datum in data])

    train_edss, train_labels = split_tuples(train_edss)
    valid_edss, valid_labels = split_tuples(valid_edss)
    test_edss, test_labels = split_tuples(test_edss)

    if include_simple_coherence:
        train_edss = np.hstack([train_edss, trainCoherData])
        valid_edss = np.hstack([valid_edss, validCoherData])
        test_edss = np.hstack([test_edss, testCoherData])

    if use_simple_hand_engineered_features:
        train_edss = np.hstack([train_edss, np.stack(trainSHED)])
        valid_edss = np.hstack([valid_edss, np.stack(validSHED)])
        test_edss = np.hstack([test_edss, np.stack(testSHED)])

    print("Data Shape:", train_edss.shape)

    # Some features return NaNs (possibly a log transform that does not play well with zeros).
    return (np.nan_to_num(train_edss), train_labels), \
        (np.nan_to_num(valid_edss), valid_labels), \
        (np.nan_to_num(test_edss), test_labels)
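A hypothetical call sketch (argument names come from the signature above; the values are illustrative, not from the source repository):

(train_x, train_y), (valid_x, valid_y), (test_x, test_y) = get_data(
    mode="detect",                          # assumed task mode
    max_samples=None,
    n_process=4,
    complex_feature_channels=["C3", "C4"],  # assumed channel names
    max_bckg_samps_per_file=20,
    use_simple_hand_engineered_features=True,
    random_under_sample_data_gen=True,
    num_seconds=4,
)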
Example #2
# Snippet from inside a class method; self.n holds the maximum wavelet width.
def function(x):
    return number_cwt_peaks(x, n=self.n)
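For context, a minimal sketch of the kind of class this snippet plausibly sits in (the wrapper class is an assumption; only the inner function appears in the source):

from tsfresh.feature_extraction.feature_calculators import number_cwt_peaks

class CwtPeakCounter:            # hypothetical wrapper
    def __init__(self, n):
        self.n = n               # maximum ricker-wavelet width

    def __call__(self, x):
        def function(x):
            return number_cwt_peaks(x, n=self.n)
        return function(x)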
Example #3
import numpy as np
import pandas as pd
from numpy import hstack, mean, median, sqrt, std
from scipy.stats import kurtosis, skew
from tsfresh.feature_extraction import feature_calculators as fc

# fft_wo_offset is a repo-local helper (assumed: FFT magnitudes with the
# DC offset removed).


def get_feature(df, FFTSAMPLE):

    header_list = ['proximity', 'ambient', 'leanForward', 'energy']
    df_new = df[header_list]

    # --------------------------------
    # Generate feature names
    # --------------------------------
    feature_label = [
        "mean", "std", "max", "min", "median", "skewness", "RMS", "kurtosis",
        "quart1", "quart3", "irq", "fft1", "fft2", "fft3", "fft4", "fft5",
        "fft6", "fft7", "fft8", "fft9", "fft10", "count_above_mean",
        "count_below_mean", "first_location_of_maximum",
        "first_location_of_minimum", "longest_strike_above_mean",
        "longest_strike_below_mean", "number_cwt_peaks"
    ]

    # Build names in the same order the values are stacked into H_T below:
    # stat-major for the summary statistics and time-series features,
    # channel-major for the ten FFT bins.
    header = []
    for feat in feature_label[:11]:
        header.extend(k + "_" + feat for k in header_list)
    for k in header_list:
        header.extend(k + "_" + feat for feat in feature_label[11:21])
    for feat in feature_label[21:]:
        header.extend(k + "_" + feat for k in header_list)

    header.extend([
        "SK_prox_fft", "K_prox_fft", "SK_amb_fft", "K_amb_fft", "SK_lean_fft",
        "K_lean_fft", "SK_engy_fft", "K_engy_fft", "prox_amb", "prox_lean",
        "prox_engy", "amb_lean", "amb_engy", "lean_engy"
    ])

    prox = df_new['proximity'].to_numpy()
    amb = df_new['ambient'].to_numpy()
    lean = df_new['leanForward'].to_numpy()
    engy = df_new['energy'].to_numpy()

    R_T = df_new.to_numpy().astype(float)

    M_T = mean(R_T, axis=0)
    V_T = std(R_T, axis=0)
    MAX = R_T.max(axis=0)
    MIN = R_T.min(axis=0)
    MED = median(R_T, axis=0)
    SK_T = skew(R_T, axis=0)
    RMS_T = sqrt(mean(R_T**2, axis=0))
    K_T = kurtosis(R_T, axis=0)
    Q1 = np.percentile(R_T, 25, axis=0)
    Q3 = np.percentile(R_T, 75, axis=0)
    QI = Q3 - Q1

    prox_fft = fft_wo_offset(prox[:FFTSAMPLE])
    amb_fft = fft_wo_offset(amb[:FFTSAMPLE])
    lean_fft = fft_wo_offset(lean[:FFTSAMPLE])
    engy_fft = fft_wo_offset(engy[:FFTSAMPLE])

    # time series features
    count_above_mean = []
    for k in header_list:
        count_above_mean.append(fc.count_above_mean(df_new[k]))
    count_above_mean = np.array(count_above_mean)

    count_below_mean = []
    for k in header_list:
        count_below_mean.append(fc.count_below_mean(df_new[k]))
    count_below_mean = np.array(count_below_mean)

    first_location_of_maximum = []
    for k in header_list:
        first_location_of_maximum.append(
            fc.first_location_of_maximum(df_new[k]))
    first_location_of_maximum = np.array(first_location_of_maximum)

    first_location_of_minimum = []
    for k in header_list:
        first_location_of_minimum.append(
            fc.first_location_of_minimum(df_new[k]))
    first_location_of_minimum = np.array(first_location_of_minimum)

    longest_strike_above_mean = []
    for k in header_list:
        longest_strike_above_mean.append(
            fc.longest_strike_above_mean(df_new[k]))
    longest_strike_above_mean = np.array(longest_strike_above_mean)

    longest_strike_below_mean = []
    for k in header_list:
        longest_strike_below_mean.append(
            fc.longest_strike_below_mean(df_new[k]))
    longest_strike_below_mean = np.array(longest_strike_below_mean)

    # Peaks remaining after smoothing with ricker wavelets of widths 1..10.
    number_cwt_peaks = []
    for k in header_list:
        number_cwt_peaks.append(fc.number_cwt_peaks(df_new[k], 10))
    number_cwt_peaks = np.array(number_cwt_peaks)

    SK_prox_fft = skew(prox_fft)
    K_prox_fft = kurtosis(prox_fft)
    SK_amb_fft = skew(amb_fft)
    K_amb_fft = kurtosis(amb_fft)
    SK_lean_fft = skew(lean_fft)
    K_lean_fft = kurtosis(lean_fft)
    SK_engy_fft = skew(engy_fft)
    K_engy_fft = kurtosis(engy_fft)

    COV_M = np.cov(R_T.T)
    COV = np.array([
        COV_M[0, 1], COV_M[0, 2], COV_M[0, 3], COV_M[1, 2], COV_M[1, 3],
        COV_M[2, 3]
    ])

    H_T = hstack(
        (M_T, V_T, MAX, MIN, MED, SK_T, RMS_T, K_T, Q1, Q3, QI, prox_fft,
         amb_fft, lean_fft, engy_fft, count_above_mean, count_below_mean,
         first_location_of_maximum, first_location_of_minimum,
         longest_strike_above_mean, longest_strike_below_mean,
         number_cwt_peaks, SK_prox_fft, K_prox_fft, SK_amb_fft, K_amb_fft,
         SK_lean_fft, K_lean_fft, SK_engy_fft, K_engy_fft, COV))

    feat_df = pd.DataFrame(data=H_T[np.newaxis, :], columns=header)

    return feat_df
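A hypothetical usage sketch, assuming the repo-local fft_wo_offset helper is importable and window_df is one fixed-length sensor window:

window_df = pd.DataFrame({
    "proximity": np.random.rand(128),
    "ambient": np.random.rand(128),
    "leanForward": np.random.rand(128),
    "energy": np.random.rand(128),
})
feat_row = get_feature(window_df, FFTSAMPLE=64)  # FFTSAMPLE: samples fed to the FFT
print(feat_row.shape)  # (1, 126) if fft_wo_offset yields 10 bins per channel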
Example #4
from tsfresh.feature_extraction import feature_calculators


def ncwtpeaks_100(arr):
    return feature_calculators.number_cwt_peaks(arr, n=100)
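Finally, a minimal standalone sketch of the underlying tsfresh call these examples wrap (synthetic data; assumes tsfresh, scipy, and numpy are installed):

import numpy as np
from tsfresh.feature_extraction import feature_calculators

rng = np.random.default_rng(0)
t = np.linspace(0, 4 * np.pi, 400)
signal = np.sin(t) + 0.1 * rng.standard_normal(t.size)

# n is the maximum ricker-wavelet width used to smooth the series
# before peaks are located with scipy's find_peaks_cwt.
print(feature_calculators.number_cwt_peaks(signal, n=5))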