def get_data(
        mode,
        max_samples,
        n_process,
        complex_feature_channels,
        max_bckg_samps_per_file,
        use_simple_hand_engineered_features,
        random_under_sample_data_gen,
        num_seconds,
        ref="01_tcp_ar",
        num_files=None,
        freq_bins=None,
        include_simple_coherence=True,
):
    """Build (features, labels) arrays for the train/valid/test EDF splits.

    Each split is sampled with EdfDatasetSegmentedSampler, turned into
    flattened FFT-bin features, and optionally augmented with pairwise
    coherence features and simple hand-engineered (tsfresh-style) features,
    all horizontally stacked into one feature matrix per split.

    Parameters
    ----------
    mode : sampling mode forwarded to EdfDatasetSegmentedSampler.
    max_samples : int
        Maximum number of samples drawn per split.
    n_process : int
        Base worker count; the segment samplers use 2*n_process because
        their work is io-bound (reading EDF files).
    complex_feature_channels : list of channel names used for the coherence
        and hand-engineered features (keeps pair count manageable).
    max_bckg_samps_per_file : int
        Cap on background samples taken from any single file.
    use_simple_hand_engineered_features : bool
        Append abs_energy / sample_entropy / cwt-peak-count features.
    random_under_sample_data_gen : bool
        Forwarded to the sampler to rebalance classes.
    num_seconds : int
        Segment gap length, in seconds.
    ref : str
        Reference montage name (currently unused here; kept for interface
        compatibility with callers).
    num_files : int or None
        Currently unused here; kept for interface compatibility.
    freq_bins : list of float or None
        FFT bin edges; defaults to [0, 3.5, 7.5, 14, 20, 25, 40].
        (None-sentinel avoids a shared mutable default argument.)
    include_simple_coherence : bool
        Append coherence features over SYMMETRIC_COLUMN_SUBSET pairs.

    Returns
    -------
    ((train_x, train_y), (valid_x, valid_y), (test_x, test_y)) with NaNs in
    the feature matrices replaced by zeros.
    """
    if freq_bins is None:
        freq_bins = [0, 3.5, 7.5, 14, 20, 25, 40]

    eds = getDataSampleGenerator()
    train_label_files_segs = eds.get_train_split()
    test_label_files_segs = eds.get_test_split()
    valid_label_files_segs = eds.get_valid_split()

    def _sample_split(segs):
        # Draw raw segments for one split. n_process is doubled because the
        # sampler is dominated by io (EDF reads), not CPU.
        return er.EdfDatasetSegmentedSampler(
            segment_file_tuples=segs,
            mode=mode,
            random_under_sample=random_under_sample_data_gen,
            num_samples=max_samples,
            max_bckg_samps_per_file=max_bckg_samps_per_file,
            n_process=int(n_process * 2),
            gap=num_seconds * pd.Timedelta(seconds=1))[:]

    train_edss = _sample_split(train_label_files_segs)
    valid_edss = _sample_split(valid_label_files_segs)
    test_edss = _sample_split(test_label_files_segs)

    def simple_edss(edss):
        '''
        Use only a few columns so that we don't make 21*20 coherence pairs
        '''
        all_channels = util_funcs.get_common_channel_names()
        subset_channels = [
            all_channels.index(channel)
            for channel in complex_feature_channels
        ]
        return [(datum[0][:, subset_channels], datum[1]) for datum in edss]

    def _coherence_features(edss):
        # Pairwise coherence over the reduced channel subset; keep only the
        # feature vector (datum[0]) from each (features, label) tuple.
        transformed = wfdata.CoherenceTransformer(
            simple_edss(edss),
            columns_to_use=constants.SYMMETRIC_COLUMN_SUBSET,
            n_process=n_process,
            is_pandas=False)[:]
        return np.stack([datum[0].values for datum in transformed])

    if include_simple_coherence:
        trainCoherData = _coherence_features(train_edss)
        validCoherData = _coherence_features(valid_edss)
        testCoherData = _coherence_features(test_edss)

    def _hand_engineered_features(edss):
        # Simple per-channel scalar features on the reduced channel subset.
        return wfdata.SimpleHandEngineeredDataset(
            simple_edss(edss),
            n_process=n_process,
            is_pandas_data=False,
            features=[
                tsf.abs_energy,
                tsf.sample_entropy,
                # peak width scales with the sampling rate
                lambda x: tsf.number_cwt_peaks(
                    x, int(constants.COMMON_FREQ / 25))
            ],
            f_names=["abs_energy", "entropy", "num_peaks"],
            vectorize="full")[:]

    if use_simple_hand_engineered_features:
        trainSHED = _hand_engineered_features(train_edss)
        validSHED = _hand_engineered_features(valid_edss)
        testSHED = _hand_engineered_features(test_edss)

    def _fft_flatten(edss):
        # Binned FFT magnitudes, flattened to one feature vector per sample.
        return read.Flattener(
            read.EdfFFTDatasetTransformer(
                edss, freq_bins=freq_bins, is_pandas_data=False),
            n_process=n_process)[:]

    train_edss = _fft_flatten(train_edss)
    valid_edss = _fft_flatten(valid_edss)
    test_edss = _fft_flatten(test_edss)

    def split_tuples(data):
        # (features, label) pairs -> (feature matrix, label vector)
        return np.stack([datum[0] for datum in data]), \
            np.stack([datum[1] for datum in data])

    train_edss, train_labels = split_tuples(train_edss)
    valid_edss, valid_labels = split_tuples(valid_edss)
    test_edss, test_labels = split_tuples(test_edss)

    if include_simple_coherence:
        train_edss = np.hstack([train_edss, trainCoherData])
        valid_edss = np.hstack([valid_edss, validCoherData])
        test_edss = np.hstack([test_edss, testCoherData])

    if use_simple_hand_engineered_features:
        train_edss = np.hstack([train_edss, np.stack(trainSHED)])
        valid_edss = np.hstack([valid_edss, np.stack(validSHED)])
        test_edss = np.hstack([test_edss, np.stack(testSHED)])

    print("Data Shape:", train_edss.shape)
    # some of the features are returning nans (assuming there is a log that
    # may not play well?)
    return (np.nan_to_num(train_edss), train_labels), \
        (np.nan_to_num(valid_edss), valid_labels), \
        (np.nan_to_num(test_edss), test_labels)
def function(x):
    # NOTE(review): this looks like a closure lifted out of a method body --
    # both `self` and `number_cwt_peaks` must be bound by an enclosing scope
    # that is not visible here (presumably self.n is the max peak width for
    # the CWT peak count -- TODO confirm). As a module-level def this would
    # raise NameError on `self`.
    return number_cwt_peaks(x, n=self.n)
def get_feature(df, FFTSAMPLE):
    """Build a one-row feature DataFrame from a window of sensor readings.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain the columns 'proximity', 'ambient', 'leanForward' and
        'energy'; each row is one time sample.
    FFTSAMPLE : int
        Number of leading samples per channel fed to ``fft_wo_offset``.
        NOTE(review): the header assumes the FFT helper returns exactly 10
        bins per channel ("fft1".."fft10") -- confirm against fft_wo_offset.

    Returns
    -------
    pandas.DataFrame
        A single row with 126 named columns: 28 features per channel
        (channel-major), 8 FFT shape statistics, and 6 channel covariances.

    Fixes vs. the previous version:
    - the header was built channel-major while the values were stacked
      feature-major, so nearly every column name labeled the wrong value;
      values are now assembled channel-major to match the header.
    - deprecated ``DataFrame.as_matrix()`` (removed in pandas 1.0) replaced
      with ``to_numpy()``.
    - leftover debug prints removed.
    """
    header_list = ['proximity', 'ambient', 'leanForward', 'energy']
    df_new = df[header_list]

    # --------------------------------
    # Generate feature names (channel-major: all features for channel 0,
    # then channel 1, ... -- the value layout below matches this order)
    # --------------------------------
    feature_label = [
        "mean", "std", "max", "min", "median", "skewness", "RMS",
        "kurtosis", "quart1", "quart3", "irq", "fft1", "fft2", "fft3",
        "fft4", "fft5", "fft6", "fft7", "fft8", "fft9", "fft10",
        "count_above_mean", "count_below_mean",
        "first_location_of_maximum", "first_location_of_minimum",
        "longest_strike_above_mean", "longest_strike_below_mean",
        "number_cwt_peaks"
    ]
    header = [k + "_" + feat for k in header_list for feat in feature_label]
    header.extend([
        "SK_prox_fft", "K_prox_fft", "SK_amb_fft", "K_amb_fft",
        "SK_lean_fft", "K_lean_fft", "SK_engy_fft", "K_engy_fft",
        "prox_amb", "prox_lean", "prox_engy", "amb_lean", "amb_engy",
        "lean_engy"
    ])

    R_T = df_new.to_numpy().astype(float)

    # Moment/order statistics, one value per channel (columns of R_T).
    M_T = np.mean(R_T, axis=0)
    V_T = np.std(R_T, axis=0)
    MAX = R_T.max(axis=0)
    MIN = R_T.min(axis=0)
    MED = np.median(R_T, axis=0)
    SK_T = skew(R_T, axis=0)
    RMS_T = np.sqrt(np.mean(R_T**2, axis=0))
    K_T = kurtosis(R_T, axis=0)
    Q1 = np.percentile(R_T, 25, axis=0)
    Q3 = np.percentile(R_T, 75, axis=0)
    QI = Q3 - Q1

    # Per-channel FFT magnitudes (offset removed) over the first FFTSAMPLE
    # samples; order follows header_list.
    channel_ffts = [
        fft_wo_offset(df_new[k].to_numpy()[:FFTSAMPLE]) for k in header_list
    ]

    # tsfresh-style time-series features; order matches feature_label tail.
    ts_feature_funcs = [
        fc.count_above_mean,
        fc.count_below_mean,
        fc.first_location_of_maximum,
        fc.first_location_of_minimum,
        fc.longest_strike_above_mean,
        fc.longest_strike_below_mean,
        lambda x: fc.number_cwt_peaks(x, 10),
    ]
    ts_feats = np.array([[f(df_new[k]) for f in ts_feature_funcs]
                         for k in header_list])  # shape (4, 7)

    # Assemble per-channel blocks channel-major so values line up with the
    # channel-major header above.
    moment_block = np.vstack(
        [M_T, V_T, MAX, MIN, MED, SK_T, RMS_T, K_T, Q1, Q3, QI]).T  # (4, 11)
    fft_block = np.array(channel_ffts)  # (4, n_fft_bins); header expects 10
    per_channel = np.hstack([moment_block, fft_block, ts_feats]).ravel()

    # Shape statistics of each channel's FFT: skew then kurtosis, in
    # header_list order (matches SK_prox_fft, K_prox_fft, SK_amb_fft, ...).
    fft_shape_stats = np.array(
        [stat(f) for f in channel_ffts for stat in (skew, kurtosis)])

    # Upper triangle of the channel covariance matrix, row-major:
    # (0,1),(0,2),(0,3),(1,2),(1,3),(2,3) -> prox_amb .. lean_engy.
    COV_M = np.cov(R_T.T)
    COV = np.array([COV_M[i, j]
                    for i in range(len(header_list))
                    for j in range(i + 1, len(header_list))])

    H_T = np.hstack([per_channel, fft_shape_stats, COV])
    feat_df = pd.DataFrame(data=H_T[np.newaxis, :], columns=header)
    return feat_df
def ncwtpeaks_100(arr):
    """Count continuous-wavelet-transform peaks in *arr* with max width 100.

    Thin wrapper fixing the ``n`` parameter of
    ``feature_calculators.number_cwt_peaks`` so it can be used where a
    single-argument feature function is required.
    """
    peak_count = feature_calculators.number_cwt_peaks(arr, n=100)
    return peak_count