def test_make_forecasting_frame_feature_extraction(self):
    """Smoke test: the output of make_forecasting_frame can be fed
    straight into extract_relevant_features with minimal settings."""
    hourly_index = pd.date_range('1/1/2011', periods=4, freq='H')
    series = pd.Series(data=range(4), index=hourly_index)
    df, y = dataframe_functions.make_forecasting_frame(
        x=series, kind="test", max_timeshift=1, rolling_direction=1)
    extract_relevant_features(
        df, y,
        column_id="id",
        column_sort="time",
        column_value="value",
        default_fc_parameters=MinimalFCParameters())
def prepare_tsfresh(self, positive_dir, negative_dir):
    """Load per-target light-curve CSVs and run tsfresh relevance filtering.

    Walks ``positive_dir`` (label 1) and ``negative_dir`` (label 0); each
    TIC subdirectory contributes a short- and a long-cadence time series.
    Returns ``(extracted_features_short, extracted_features_long)``.

    Fixes over the original:
    * ``DataFrame.append`` return values were discarded, so the accumulated
      frames stayed empty (and ``append`` was removed in pandas 2.0) — rows
      are now collected in lists and concatenated once.
    * short-cadence curves from the *negative* directory were tagged ``1``;
      they are now tagged ``0`` like their long-cadence counterparts.
    * the tag series were Series of ``[id, label]`` lists, which tsfresh
      cannot use as ``y`` — they are now label Series indexed by TIC id.
    * the extraction results were computed and thrown away; they are now
      returned.
    """
    columns = ['id', 'time', 'flux', 'flux_err', 'background_flux',
               'quality', 'centroids_x', 'centroids_y',
               'motion_x', 'motion_y']
    short_frames, long_frames = [], []
    short_tags, long_tags = {}, {}

    def _collect(base_dir, label):
        # One pass per class directory; skip targets without a short curve,
        # mirroring the original's guard.
        for tic_dir in os.listdir(base_dir):
            short_path = os.path.join(base_dir, tic_dir,
                                      "time_series_short.csv")
            if not os.path.exists(short_path):
                continue
            lc_short = pd.read_csv(short_path)
            lc_short['id'] = tic_dir
            short_frames.append(lc_short)
            short_tags[tic_dir] = label
            long_path = os.path.join(base_dir, tic_dir,
                                     "time_series_long.csv")
            lc_long = pd.read_csv(long_path)
            lc_long['id'] = tic_dir
            long_frames.append(lc_long)
            long_tags[tic_dir] = label

    _collect(positive_dir, 1)
    _collect(negative_dir, 0)

    tsfresh_short_df = (pd.concat(short_frames, ignore_index=True)
                        if short_frames else pd.DataFrame(columns=columns))
    tsfresh_long_df = (pd.concat(long_frames, ignore_index=True)
                       if long_frames else pd.DataFrame(columns=columns))
    tsfresh_tags_short = pd.Series(short_tags)
    tsfresh_tags_long = pd.Series(long_tags)

    # TODO tsfresh needs a dataframe with all the "time series" data
    # (centroids, motion, flux, bck_flux...) with an id column specifying the
    # target id and a "y" as a df containing the target ids and the
    # classification tag. We need to check how to make this compatible with
    # transit times tagging instead of entire curve classification. Maybe
    # https://tsfresh.readthedocs.io/en/latest/text/forecasting.html helps.
    extracted_features_short = tsfresh.extract_relevant_features(
        tsfresh_short_df, tsfresh_tags_short,
        column_id='id', column_sort='time')
    extracted_features_long = tsfresh.extract_relevant_features(
        tsfresh_long_df, tsfresh_tags_long,
        column_id='id', column_sort='time')
    return extracted_features_short, extracted_features_long
def features_tsfresh_select(df):
    """Roll the sales frame into windows and return tsfresh-selected features.

    Returns ``(feature_df, [])`` where ``feature_df`` carries the
    ``item_id``/``date`` keys alongside the filtered feature columns.

    Fixes over the original:
    * ``'demand'`` was excluded by the initial column subset and then read
      via ``df['demand']``, which raised ``KeyError`` — it is kept now.
    * the final ``pd.concat`` stacked rows (default ``axis=0``); the intent
      is to place features next to their keys, so ``axis=1`` is used.
    """
    # Keep 'demand' so the target can be split out after rolling.
    df = df[['snap_CA', 'snap_TX', 'snap_WI', 'sell_price', 'item_id',
             'date', 'store_id', 'id', 'demand']]
    df = roll_time_series(df, column_id="item_id", column_sort="date")

    y = df['demand']
    X = df.drop(columns=['demand'])
    X = X.fillna(value={'sell_price': X['sell_price'].mean(skipna=True)})
    X = X[['snap_CA', 'snap_TX', 'snap_WI', 'sell_price', 'item_id', 'date']]

    X_filtered = extract_relevant_features(X, y, column_id='item_id',
                                           column_sort='date')
    # lightgbm rejects '"' in feature names; strip quotes (and commas) from
    # tsfresh's generated column names.
    X_filtered = X_filtered.rename(columns={
        name: name.replace('"', '').replace(',', '')
        for name in X_filtered.columns})

    feature_df = pd.concat([X[['item_id', 'date']], X_filtered], axis=1)
    return feature_df, []
def test_functional_equality(self):
    """
    `extract_relevant_features` should be equivalent to running first
    `extract_features` with impute and `select_features` afterwards: the
    same relevant features are selected and their values are identical.
    """
    df, y = self.create_test_data_sample_with_target()
    kwargs = dict(column_id='id', column_value='val',
                  column_kind='kind', column_sort='sort')

    relevant_features = extract_relevant_features(df, y, **kwargs)
    extracted_features = extract_features(df, impute_function=impute,
                                          **kwargs)
    selected_features = select_features(extracted_features, y)

    self.assertEqual(
        set(relevant_features.columns),
        set(selected_features.columns),
        "Should select the same columns:\n\t{}\n\nvs.\n\n\t{}".format(
            relevant_features.columns, selected_features.columns))

    cols = relevant_features.columns
    idx = relevant_features.index
    self.assertTrue(
        relevant_features.equals(selected_features.loc[idx][cols]),
        "Should calculate the same feature values")
def test_functional_equality(self):
    """
    `extract_relevant_features` should be equivalent to running first
    `extract_features` with impute and `select_features` afterwards —
    identical column set and identical feature values.
    """
    df, y = self.create_test_data_sample_with_target()

    relevant_features = extract_relevant_features(
        df, y,
        column_id='id', column_value='val',
        column_kind='kind', column_sort='sort')
    extracted_features = extract_features(
        df,
        column_id='id', column_value='val',
        column_kind='kind', column_sort='sort',
        impute_function=impute)
    selected_features = select_features(extracted_features, y)

    column_msg = "Should select the same columns:\n\t{}\n\nvs.\n\n\t{}".format(
        relevant_features.columns, selected_features.columns)
    self.assertEqual(set(relevant_features.columns),
                     set(selected_features.columns),
                     column_msg)

    aligned = selected_features.loc[relevant_features.index][
        relevant_features.columns]
    self.assertTrue(relevant_features.equals(aligned),
                    "Should calculate the same feature values")
def extract_sub_window(df_x, y, window, start_index, lag, fc_parameters="min", n_jobs=-1):
    """Extract tsfresh-relevant features for one rolling sub-window.

    The feature columns are suffixed with ``_<window_start>_<window_end>``.
    Returns ``(features, y)`` where ``y`` is restricted to the window ids
    actually present in the rolled frame.
    """
    from tsfresh import extract_relevant_features
    from tsfresh.feature_extraction.settings import MinimalFCParameters

    # "min" is a convenience alias for the minimal settings object.
    if fc_parameters == "min":
        fc_parameters = MinimalFCParameters()

    window_start, window_end = window
    rolled = get_rolling_timeseries(df_x, start_index, lag,
                                    window_start, window_end)

    workers = multiprocessing.cpu_count() if n_jobs == -1 else n_jobs

    y = y[y.index.isin(rolled.window_id)]
    feats = extract_relevant_features(rolled, y,
                                      column_id="window_id",
                                      column_sort="timestamp",
                                      column_value=None,
                                      default_fc_parameters=fc_parameters,
                                      n_jobs=workers)
    feats = feats.add_suffix(f"_{window_start}_{window_end}")
    return (feats, y)
def test_functional_equality(self):
    """
    `extract_relevant_features` should be equivalent to running first
    `extract_features` (with imputation configured via the settings
    object) and `select_features` afterwards: same columns, same values.
    """
    df, y = self.create_test_data_sample_with_target()

    relevant_features = extract_relevant_features(
        df, y,
        column_id='id', column_value='val',
        column_kind='kind', column_sort='sort')

    settings = FeatureExtractionSettings()
    settings.IMPUTE = impute
    extracted_features = extract_features(
        df,
        feature_extraction_settings=settings,
        column_id='id', column_value='val',
        column_kind='kind', column_sort='sort')
    selected_features = select_features(extracted_features, y)

    column_msg = "Should select the same columns:\n\t{}\n\nvs.\n\n\t{}".format(
        relevant_features.columns, selected_features.columns)
    self.assertEqual(set(relevant_features.columns),
                     set(selected_features.columns),
                     column_msg)

    values_match = (relevant_features.values ==
                    selected_features.values).all().all()
    self.assertTrue(values_match, "Should calculate the same feature values")
def filter_features(app_dir, df_series, labels):
    """Select tsfresh-relevant classification features and write them to
    ``<app_dir>/filtered_feature.xlsx``.

    Fix over the original: ``ExcelWriter.save()`` was removed in pandas 2.0
    and the writer was never closed on error — the writer is now used as a
    context manager, which writes and closes the file on exit.
    """
    filtered_feature = tsfresh.extract_relevant_features(
        df_series, labels,
        column_id='id', column_sort='time',
        ml_task='classification')
    out_path = os.path.join(app_dir, 'filtered_feature.xlsx')
    with pandas.ExcelWriter(out_path) as excel_writer:
        pandas.DataFrame(data=filtered_feature).to_excel(excel_writer)
def get_best_features(X, y):
    """Run tsfresh relevance filtering on X against target y and return the
    selected feature matrix."""
    target_frame = pd.DataFrame({'y': y})
    selected = extract_relevant_features(X, target_frame['y'],
                                         column_id='id',
                                         column_sort='time')
    return selected
def extract_features_from_dataframe(dataframe: pandas.DataFrame, wordid_userid_mapping):
    """Extract tsfresh-relevant features, keyed by word id and ordered by
    the time column, using 4 parallel jobs."""
    relevant = tsfresh.extract_relevant_features(
        dataframe,
        wordid_userid_mapping,
        column_id=Utils.WORD_ID,
        column_sort=Utils.TIME,
        n_jobs=4,
    )
    return relevant
def find_feature(timeseries, y):
    """Return the tsfresh features of `timeseries` that are relevant for y."""
    from tsfresh import extract_relevant_features
    return extract_relevant_features(timeseries, y,
                                     column_id='id', column_sort='time')
def short_extract_features(df, y):
    """Relevance-filtered tsfresh extraction keyed on 'label', sorted by
    't', run on 8 jobs with warnings suppressed."""
    relevant = extract_relevant_features(df, y,
                                         column_id="label",
                                         column_sort='t',
                                         show_warnings=False,
                                         n_jobs=8)
    return relevant
def test_extracted_features_contain_X_features(self):
    """Pre-supplied columns in self.X must pass through extraction
    unchanged (values and index)."""
    result = extract_relevant_features(self.df, self.y, self.X,
                                       column_id='id')
    for name in ("f1", "f2"):
        self.assertIn(name, result.columns)
        pdt.assert_series_equal(self.X[name], result[name])
        pdt.assert_index_equal(self.X[name].index, result[name].index)
def extract_features_from_TS(Data, y):
    """Return a pair: (all extracted+imputed features, the subset tsfresh
    deems relevant for target y)."""
    extracted_features = basic_features_extract(Data)
    impute(extracted_features)
    filtered = extract_relevant_features(Data, y,
                                         column_id="id",
                                         column_sort="time")
    return extracted_features, filtered
def test_extracted_features_contain_X_features(self):
    """Extraction with a pre-supplied feature frame keeps f1/f2 intact."""
    features = extract_relevant_features(self.df, self.y, self.X,
                                         column_id='id')
    self.assertIn("f1", features.columns)
    self.assertIn("f2", features.columns)
    pdt.assert_series_equal(self.X["f1"], features["f1"])
    pdt.assert_series_equal(self.X["f2"], features["f2"])
    pdt.assert_index_equal(self.X["f1"].index, features["f1"].index)
    pdt.assert_index_equal(self.X["f2"].index, features["f2"].index)
def tsfresh2_relevant(df, y_true, name):
    """Melt a wide (series-per-row) frame into tsfresh long format, run
    relevance filtering against y_true, impute, and return the features."""
    wide = (df.transpose()
              .reset_index(drop=True)
              .reset_index()
              .rename({'index': 'time'}, axis=1))
    long_format = pd.melt(wide, id_vars=['time'], value_name=name)
    # NOTE(review): column_id='cycle' assumes the melted variable column is
    # named 'cycle' (i.e. df's index is named 'cycle') — confirm upstream.
    extracted_features = extract_relevant_features(long_format, y_true,
                                                   column_id="cycle",
                                                   column_sort="time")
    impute(extracted_features)
    return extracted_features
def tsfresh_extraction(X, y, config):
    """Comprehensive tsfresh extraction plus relevance filtering at FDR 1%,
    parallelised per the SVM-config section of `config`."""
    workers = config['SVM-config']['n_jobs']
    return extract_relevant_features(
        X, y,
        n_jobs=workers,
        fdr_level=0.01,
        show_warnings=False,
        column_id='id',
        column_sort='time',
        default_fc_parameters=ComprehensiveFCParameters())
def tsfresh_extract_features(timeSeries, idCol, timeCol, y=None):
    """Extract tsfresh-relevant features and re-select them against target y.

    :param timeSeries: long-format frame with id and sort columns
    :param idCol: name of the id column
    :param timeCol: name of the sort/time column
    :param y: target series aligned with the ids (required)
    :raises ValueError: if y is not supplied

    Fix over the original: ``y`` was referenced but never defined (a
    guaranteed NameError), and ``extract_relevant_features`` was called
    without its required ``y`` argument. ``y`` is now an explicit trailing
    parameter (defaulted to keep the signature backward-compatible).
    """
    from tsfresh import extract_relevant_features
    from tsfresh import select_features
    from tsfresh.utilities.dataframe_functions import impute

    if y is None:
        raise ValueError("y (target series) is required for relevance filtering")

    extracted_features = extract_relevant_features(
        timeSeries, y, column_id=idCol, column_sort=timeCol)
    impute(extracted_features)
    features_filtered = select_features(extracted_features, y)
    return features_filtered
def fit_transform(self, X: pd.DataFrame, y: pd.Series):
    """Fit the extractor on X/y and return the relevant features with a
    fresh integer index. Stores the selected column names on self.columns.

    NOTE: mutates the caller's ``y`` — its index is overwritten with the
    unique ids found in X so it aligns with tsfresh's grouping.
    """
    y.index = X[self.column_id].unique()
    extracted = extract_relevant_features(
        X,
        y,
        column_id=self.column_id,
        column_sort=self.column_sort,
        default_fc_parameters=self.default_fc_parameters,
        n_jobs=self.n_jobs,
    )
    self.columns = list(extracted.columns)
    return extracted.reset_index(drop=True)
def main(args):
    """Load an .npz of multivariate time series, reshape it into tsfresh's
    long format, run relevance filtering, and save the feature matrix.

    Improvements over the original:
    * the per-series length was hard-coded as 100 in the id/time loop; it
      is now taken from ``X_np.shape[1]``, so any series length works.
    * the id/time columns are built with ``np.repeat``/``np.tile`` instead
      of a Python loop.
    * the ``yy`` array the original built was never used and is removed.
    """
    data = np.load(args.data)
    X_np = data['mts']      # assumed (samples, time_steps, n_features) — TODO confirm
    Y_np = data['labels']
    info_file = args.data.split('.npz')[0] + '-info.txt'
    poe = get_POE_field(info_file)

    n_samples, time_steps, n_feats = X_np.shape
    flat = np.reshape(X_np, (-1, n_feats))

    # Long format: [id, time, feat1..featN]; id advances every `time_steps`
    # rows, time restarts at 0 within each series.
    stacked = np.zeros((flat.shape[0], n_feats + 2))
    stacked[:, 0] = np.repeat(np.arange(n_samples), time_steps)
    stacked[:, 1] = np.tile(np.arange(time_steps), n_samples)
    stacked[:, 2:] = flat

    cols = ["id", "time"] + [f'feat{i+1}' for i in range(n_feats)]
    timeseries = pd.DataFrame(stacked, columns=cols)
    y = pd.Series(Y_np[:, 0])

    features = extract_relevant_features(timeseries, y,
                                         column_id='id', column_sort='time')

    save_path = f'/home/filipkr/Documents/xjob/motion-analysis/classification/tsc/feats-{poe}.npy'
    print(f'features: {features}')
    np.save(save_path, np.array(features.values))
    print('done')
def get_features(y, relevant_features, data):
    """Build an IMU dataframe from `data` and extract tsfresh features.

    When `relevant_features` is truthy the features are relevance-filtered
    against y; otherwise all comprehensive features are extracted and
    imputed in place.
    """
    rows = dict_as_list(data)
    df = pd.DataFrame(rows, columns=['id', 'time', 'accx', 'accy', 'accz',
                                     'gyrox', 'gyroy', 'gyroz'])
    settings = ComprehensiveFCParameters()
    if relevant_features:
        return extract_relevant_features(df, y,
                                         column_id='id',
                                         column_sort='time',
                                         default_fc_parameters=settings)
    return extract_features(df,
                            column_id='id',
                            column_sort='time',
                            default_fc_parameters=settings,
                            impute_function=impute)
def test_relevant_feature_extraction(self):
    """Regression-mode driftbif data (string ids) should yield more than
    ten relevant features."""
    df, y = load_driftbif(100, 10, classification=False)
    # tsfresh requires matching id dtypes between the frame and the target.
    df['id'] = df['id'].astype('str')
    y.index = y.index.astype('str')
    features = extract_relevant_features(df, y,
                                         column_id="id",
                                         column_sort="time",
                                         column_kind="dimension",
                                         column_value="value")
    self.assertGreater(len(features.columns), 10)
def extract_sub_window(df_x, y, window, start_index, lag, fc_parameters=None, n_jobs=-1):
    """Extract tsfresh-relevant features for one rolling sub-window.

    :param fc_parameters: tsfresh settings object; defaults to
        ``MinimalFCParameters()``. The original used
        ``fc_parameters=MinimalFCParameters()`` as the default — a mutable
        default argument, so one settings instance was shared (and could be
        mutated) across every call. ``None`` now means "minimal settings",
        which is behaviorally backward-compatible.
    :returns: feature frame with columns suffixed ``_<start>_<end>``
    """
    from tsfresh import extract_relevant_features
    from tsfresh.feature_extraction.settings import MinimalFCParameters

    if fc_parameters is None:
        fc_parameters = MinimalFCParameters()

    window_start, window_end = window
    sub_df_x = get_rolling_timeseries(df_x, start_index, lag,
                                      window_end - window_start)
    if n_jobs == -1:
        n_jobs = multiprocessing.cpu_count()

    print('Remove non target values...')
    # Drop targets that precede the first complete window.
    y = y.iloc[start_index + lag:]
    # y = y[y.index.isin(sub_df_x.window_id)]

    print('Extracting features...')
    features = extract_relevant_features(sub_df_x, y,
                                         column_id="window_id",
                                         column_sort="timestamp",
                                         column_value=None,
                                         default_fc_parameters=fc_parameters,
                                         n_jobs=n_jobs)
    features = features.add_suffix(f"_{window_start}_{window_end}")
    return features
def tsfresh_calculator(timeseries, column_id, column_sort, cleanup, y=None):
    """Run tsfresh extraction; with ``cleanup == "Yes"`` also relevance-filter.

    :param y: target series — required when cleanup == "Yes"
    :raises ValueError: if cleanup is requested without a target

    Fix over the original: ``extract_relevant_features`` requires a target
    ``y`` and was called without one, which raises TypeError. ``y`` is now
    an optional trailing parameter, keeping the signature
    backward-compatible for the non-cleanup path.
    """
    from tsfresh import extract_features
    from tsfresh import extract_relevant_features

    if cleanup == "Yes":
        if y is None:
            raise ValueError('cleanup == "Yes" requires a target series y')
        return extract_relevant_features(timeseries, y,
                                         column_id=column_id,
                                         column_sort=column_sort)
    return extract_features(timeseries,
                            column_id=column_id,
                            column_sort=column_sort)
def extract_game_tot_feature(game_info_df, game_ratio_info_df):
    """Relevance-filter game-ratio time series against the two-class game
    result and write the selected features to game_ratio_info_model.csv.

    Fix over the original: ``drop(..., inplace=True)`` on a slice of
    ``game_ratio_info_df`` triggers pandas' SettingWithCopy warning and may
    silently fail to drop; the result is reassigned instead.
    """
    game_ratio_info_data = game_ratio_info_df[get_conf_item(
        'data', 'game_ratio_info_clean', is_eval=True)]
    game_ratio_info_data = game_ratio_info_data.drop(
        ['odds_grail', 'guest_ratio'], axis=1)

    # Binary target indexed by game_id: True where the result class is 1.
    y_frame = game_info_df[['game_id', 'game_rst_two_cls']]
    y = pd.Series(y_frame['game_rst_two_cls'].map(lambda x: x == 1).values,
                  index=y_frame.game_id)

    settings = ComprehensiveFCParameters()
    game_ratio_info_model = extract_relevant_features(
        game_ratio_info_data, y,
        fdr_level=0.1,
        default_fc_parameters=settings,
        column_id='game_id',
        column_sort='position_tm')
    game_ratio_info_model.to_csv('game_ratio_info_model.csv', index=True)
def features():
    """Extract a fixed set of tsfresh features from trainRecord.csv, select
    the relevant ones against trainLabel.csv, and write trainFeature.csv.

    Fix over the original: ``error_bad_lines=False`` was removed in pandas
    2.0; ``on_bad_lines="warn"`` is the equivalent (skip malformed lines,
    emit a warning).
    """
    print("Starting [features]...")
    df = pd.read_csv("trainRecord.csv", on_bad_lines="warn")
    truth = pd.read_csv("trainLabel.csv", on_bad_lines="warn")

    # Per-kind feature calculators: only these statistics are computed.
    kind_to_fc_parameters = {
        "Speed": {
            "maximum": None,
            "mean_abs_change": None,
            "count_above_mean": None,
            "longest_strike_above_mean": None
        },
        "totalAcceleration": {
            "maximum": None,
            "mean_abs_change": None,
            "count_above_mean": None,
            "longest_strike_above_mean": None
        },
        "totalGyro": {
            "maximum": None,
            "mean_abs_change": None,
            "count_above_mean": None,
            "longest_strike_above_mean": None
        },
        "Bearing": {
            "mean_abs_change": None,
            "count_above_mean": None,
            "longest_strike_above_mean": None
        }
    }

    # Target: label per bookingID, indexed so it aligns with column_id.
    tripLabel = pd.Series(data=truth["label"].values,
                          index=truth["bookingID"].values)

    features_filtered_direct = extract_relevant_features(
        df, tripLabel,
        column_id='bookingID',
        column_sort='second',
        kind_to_fc_parameters=kind_to_fc_parameters)

    print(features_filtered_direct.head())
    features_filtered_direct.to_csv("trainFeature.csv", index=False)
    print("Finish [features]...")
def get_features(X, y=None, kind_to_fc_parameters=None):
    """Flatten a (samples, time, dim) array into tsfresh long format and
    extract features.

    With y: relevance-filter and return ``(values, fc_parameter_map)``.
    Without y: plain extraction (optionally restricted to
    kind_to_fc_parameters) returning only the values.
    """
    samples, time_steps, data_dim = X.shape
    frame = pd.DataFrame(X.reshape([-1, data_dim]))

    ids = []
    for sample_idx in range(samples):
        ids.extend([sample_idx] * time_steps)
    frame['id'] = ids
    frame['time'] = list(range(time_steps)) * samples

    if y is not None:
        selected = extract_relevant_features(frame, y,
                                             column_id='id',
                                             column_sort='time',
                                             n_jobs=0)
        return selected.values, from_columns(selected)

    if kind_to_fc_parameters is not None:
        extracted = extract_features(frame,
                                     column_id='id',
                                     column_sort='time',
                                     n_jobs=0,
                                     default_fc_parameters=kind_to_fc_parameters)
    else:
        extracted = extract_features(frame,
                                     column_id='id',
                                     column_sort='time',
                                     n_jobs=0)
    return extracted.values
def genFeatures(features, classes):
    '''
    Generate and persist tsfresh features for the whole dataset (slow, so
    results are cached in tsfeatures.pkl).

    :param features: Type of |Series(Dataframe(2D))|
    :param classes: Type of |Series(integers)|
    :return: Type of |Dataframe(2D)| — rows are time series, columns are
        the selected features
    '''
    # Reshape into tsfresh's long format.
    ts_dataframe = fNIR.tstag(features)
    # Extract everything, then keep only features relevant to `classes`.
    extracted_features = extract_relevant_features(ts_dataframe, classes,
                                                   column_id="id",
                                                   column_sort="time")
    # Cache so the expensive extraction needn't be rerun.
    extracted_features.to_pickle('tsfeatures.pkl')
    # Persist the selected-feature spec for prediction-time extraction.
    featureNames = feature_extraction.settings.from_columns(extracted_features)
    with open("features_extracted.json", "w") as jsonFile:
        dump(featureNames, jsonFile)
    return extracted_features
# Batch script: for each ARCMA subject, slice the recordings into fixed
# windows, run tsfresh relevance filtering, and pickle both the selected
# features and the labels per person.
Debug.DEBUG = 0
arcma = ARCMA_Model()
processing = Processing_DB_Files()
project = Project()
s = save()
#window = 26 # Fixed window
window = 50  # Best window (per earlier experiments — TODO confirm source)
persons = [1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15]
for p in persons:
    data = arcma.load_training_data_by_people(p)
    print("Slicing Window....")
    data_tsfresh, y = arcma.slice_by_window_tsfresh(data, window)
    # shift labels to 1-based ids — presumably to match the windowed
    # frame's id column; TODO confirm against slice_by_window_tsfresh
    y.index += 1
    # drop the label column from the feature inputs
    del data_tsfresh["activity"]
    classes_counts = y.value_counts()
    # tsfresh's relevance tests need at least two classes in y
    if len(classes_counts) > 1:
        relevant_features = extract_relevant_features(data_tsfresh, y,
                                                      column_id='id',
                                                      column_sort='time')
        s.save_var(
            relevant_features,
            "arcma_relevant_features_best_window{}relevant_features_{}.pkl".
            format(slash, p))
        s.save_var(
            y, "arcma_relevant_features_best_window{}y_{}.pkl".format(slash, p))
"ch1": full_data_matrix[:, 1], "ch2": full_data_matrix[:, 2], "ch3": full_data_matrix[:, 3] }) #df2 = pd.DataFrame({"id":[ID]}) #df3 = pd.DataFrame(full_data_matrix) # #A=df1.join(df2) #B=A.join(df3) from tsfresh import extract_relevant_features features_filtered_direct = extract_relevant_features(df, y, column_id='id', column_sort='time', n_jobs=4) """ PCA stuff """ #################PCA AND VARIANCE EXPLAINED #pca = PCA(svd_solver='auto')#PCA with all components #pca.fit(full_normalized_array) #pca_cumsum = np.cumsum(pca.explained_variance_ratio_)*100 # #plt.figure() #plt.plot(pca_cumsum) #plt.grid() #plt.ylabel('% Variance Explained')
def extractFeaturesTimeSeries(timeSeries, idCol, timeCol, y=None):
    """Return the tsfresh-relevant features of a long-format time series.

    :param timeSeries: frame with id and sort columns
    :param idCol: name of the id column
    :param timeCol: name of the sort/time column
    :param y: target series aligned with the ids (required)
    :raises ValueError: if y is not supplied

    Fix over the original: ``extract_relevant_features`` requires a target
    ``y`` and was called without one (TypeError). ``y`` is now an explicit
    trailing parameter, defaulted to keep the signature compatible.
    """
    from tsfresh import extract_relevant_features
    if y is None:
        raise ValueError("a target series y is required for relevance filtering")
    return extract_relevant_features(timeSeries, y,
                                     column_id=idCol, column_sort=timeCol)
def run():
    """Extract a hand-picked set of tsfresh features from the filtered
    scheme, then also compute the relevance-filtered subset.

    Returns (X, X_filtered, y).

    Fixes over the original (which used Python-2 print statements):
    * ``print X.info()`` — ``DataFrame.info()`` prints directly and returns
      None, so the original printed an extra "None" line; ``.info()`` is
      now called directly.
    * ``print`` is used as a function, valid on Python 2 and 3.
    """
    logger = logging.getLogger()
    logger.info("Processing feature extraction")
    timeseries, y = load_csv_data_from_filtered_scheme.run()
    logger.info("successfully load the correct dataframe for tsfresh")

    # reference to the github code ./tsfresh/feature_extraction/settings.py
    extraction_setting = {
        "mean": None,
        "standard_deviation": None,
        "mean_change": None,
        "mean_abs_change": None,
        "abs_energy": None,
        "autocorrelation": [{"lag": 1}, {"lag": 2}, {"lag": 3}, {"lag": 4}],
        "agg_autocorrelation": [{'f_agg': 'mean'}, {'f_agg': 'std'}],
        "ar_coefficient": [{"coeff": 0, "k": 10}, {"coeff": 1, "k": 10},
                           {"coeff": 2, "k": 10}, {"coeff": 3, "k": 10},
                           {"coeff": 4, "k": 10}],
        "partial_autocorrelation": [{"lag": 1}, {"lag": 2}, {"lag": 3},
                                    {"lag": 4}, {"lag": 5}],
        "fft_coefficient": [{"coeff": 0, "attr": "real"},
                            {"coeff": 1, "attr": "real"},
                            {"coeff": 2, "attr": "real"},
                            {"coeff": 3, "attr": "real"},
                            {"coeff": 4, "attr": "real"},
                            {"coeff": 0, "attr": "angle"},
                            {"coeff": 1, "attr": "angle"},
                            {"coeff": 2, "attr": "angle"},
                            {"coeff": 3, "attr": "angle"},
                            {"coeff": 4, "attr": "angle"}],
    }
    #extraction_setting = ComprehensiveFCParameters() # all features
    #extraction_setting = EfficientFCParameters() # without the 'high_comp_cost' features

    X = extract_features(timeseries,
                         column_id='id',
                         column_sort='time',
                         default_fc_parameters=extraction_setting,
                         impute_function=impute)
    logger.warning('Features Info:')
    X.info()

    X_filtered = extract_relevant_features(timeseries, y,
                                           column_id='id',
                                           column_sort='time',
                                           default_fc_parameters=extraction_setting)
    logger.warning('Filtered features Info:')
    X_filtered.info()
    print(X_filtered.shape)
    return X, X_filtered, y