def test_warning_on_non_uniform_time_steps(self):
    """Rolling a frame with unevenly spaced time stamps must record one warning.

    The test expects exactly one recorded warning and checks its message text.
    """
    expected_message = ("Your time stamps are not uniformly sampled, which makes rolling "
                        "nonsensical in some domains.")

    with warnings.catch_warnings(record=True) as caught:
        # The time stamps of the first series jump from 2 to 4.
        gappy = pd.DataFrame({
            "a": [1, 2, 3, 4],
            "b": [5, 6, 7, 8],
            "time": [1, 2, 4, 5],
        })
        steady = pd.DataFrame({
            "a": [10, 11],
            "b": [12, 13],
            "time": range(20, 22),
        })
        gappy["id"] = 1
        steady["id"] = 2
        df_full = pd.concat([gappy, steady], ignore_index=True)

        dataframe_functions.roll_time_series(
            df_full,
            column_id="id",
            column_sort="time",
            column_kind=None,
            rolling_direction=1,
        )

        self.assertEqual(len(caught), 1)
        self.assertEqual(str(caught[0].message), expected_message)
def test_rolling_with_larger_shift(self):
    """Roll with a step of two, forwards and backwards, and compare ids and values."""
    series_one = pd.DataFrame({"a": [1, 2, 3, 4], "b": [5, 6, 7, 8], "time": range(4)})
    series_two = pd.DataFrame({"a": [10, 11], "b": [12, 13], "time": range(20, 22)})
    series_one["id"] = 1
    series_two["id"] = 2
    df_full = pd.concat([series_one, series_two], ignore_index=True)
    # df_full is
    #     a   b  time  id
    # 0   1   5     0   1
    # 1   2   6     1   1
    # 2   3   7     2   1
    # 3   4   8     3   1
    # 4  10  12    20   2
    # 5  11  13    21   2

    # Each scenario: (rolling_direction, expected ids, expected "a", expected "b").
    scenarios = (
        (
            2,
            [(1, 1), (1, 1), (1, 3), (1, 3), (1, 3), (1, 3), (2, 21), (2, 21)],
            [1.0, 2.0, 1.0, 2.0, 3.0, 4.0, 10.0, 11.0],
            [5.0, 6.0, 5.0, 6.0, 7.0, 8.0, 12.0, 13.0],
        ),
        (
            -2,
            [(1, 0), (1, 0), (1, 0), (1, 0), (1, 2), (1, 2), (2, 20), (2, 20)],
            [1.0, 2.0, 3.0, 4.0, 3.0, 4.0, 10.0, 11.0],
            [5.0, 6.0, 7.0, 8.0, 7.0, 8.0, 12.0, 13.0],
        ),
    )

    for direction, expected_ids, expected_a, expected_b in scenarios:
        rolled = dataframe_functions.roll_time_series(
            df_full,
            column_id="id",
            column_sort="time",
            column_kind=None,
            rolling_direction=direction,
            n_jobs=0,
        )
        self.assertListEqual(list(rolled["id"]), expected_ids)
        self.assertListEqual(list(rolled["a"].values), expected_a)
        self.assertListEqual(list(rolled["b"].values), expected_b)
def test_negative_rolling(self):
    """Roll backwards with several ``maximum_number_of_timeshifts`` settings.

    The expected output for a limited number of shifts is the tail of the
    unlimited expectation, so every scenario only names the slice start.
    """
    series_one = pd.DataFrame({"a": [1, 2, 3, 4], "b": [5, 6, 7, 8], "time": range(4)})
    series_two = pd.DataFrame({"a": [10, 11], "b": [12, 13], "time": range(20, 22)})
    series_one["id"] = 1
    series_two["id"] = 2
    df_full = pd.concat([series_one, series_two], ignore_index=True)

    label_repeats = (
        ("id=1, shift=-3", 1),
        ("id=1, shift=-2", 2),
        ("id=1, shift=-1", 3),
        ("id=2, shift=-1", 1),
        ("id=1, shift=0", 4),
        ("id=2, shift=0", 2),
    )
    all_ids = [label for label, repeats in label_repeats for _ in range(repeats)]
    all_a = [4, 3, 4, 2, 3, 4, 11, 1, 2, 3, 4, 10, 11]
    all_b = [8, 7, 8, 6, 7, 8, 13, 5, 6, 7, 8, 12, 13]

    # Each scenario: (extra keyword arguments, index at which the expectation starts).
    scenarios = (
        ({}, 0),
        ({"maximum_number_of_timeshifts": None}, 0),
        ({"maximum_number_of_timeshifts": 1}, 3),
        ({"maximum_number_of_timeshifts": 2}, 1),
        ({"maximum_number_of_timeshifts": 4}, 0),
    )

    for extra_kwargs, start in scenarios:
        rolled = dataframe_functions.roll_time_series(
            df_full,
            column_id="id",
            column_sort="time",
            column_kind=None,
            rolling_direction=-1,
            **extra_kwargs
        )
        self.assertListEqual(list(rolled["id"].values), all_ids[start:])
        self.assertListEqual(list(rolled["a"].values), all_a[start:])
        self.assertListEqual(list(rolled["b"].values), all_b[start:])
def test_positive_rolling(self):
    """Roll forwards with the default, a wide and a narrow ``max_timeshift``."""
    series_one = pd.DataFrame({"a": [1, 2, 3, 4], "b": [5, 6, 7, 8], "time": range(4)})
    series_two = pd.DataFrame({"a": [10, 11], "b": [12, 13], "time": range(20, 22)})
    df_full = pd.concat(
        [series_one.assign(id=1), series_two.assign(id=2)], ignore_index=True
    )
    # df_full is
    #     a   b  time  id
    # 0   1   5     0   1
    # 1   2   6     1   1
    # 2   3   7     2   1
    # 3   4   8     3   1
    # 4  10  12    20   2
    # 5  11  13    21   2

    def roll(**extra_kwargs):
        return dataframe_functions.roll_time_series(
            df_full,
            column_id="id",
            column_sort="time",
            column_kind=None,
            rolling_direction=1,
            **extra_kwargs
        )

    def check(rolled, expected_ids, expected_a, expected_b):
        self.assertListEqual(list(rolled["id"]), expected_ids)
        self.assertListEqual(list(rolled["a"].values), expected_a)
        self.assertListEqual(list(rolled["b"].values), expected_b)

    # No limit and a limit of four shifts are expected to give the same windows.
    unlimited_ids = [0, 1, 1, 2, 2, 2, 3, 3, 3, 3, 20, 21, 21]
    unlimited_a = [1.0, 1.0, 2.0, 1.0, 2.0, 3.0, 1.0, 2.0, 3.0, 4.0, 10.0, 10.0, 11.0]
    unlimited_b = [5.0, 5.0, 6.0, 5.0, 6.0, 7.0, 5.0, 6.0, 7.0, 8.0, 12.0, 12.0, 13.0]
    check(roll(), unlimited_ids, unlimited_a, unlimited_b)
    check(roll(max_timeshift=4), unlimited_ids, unlimited_a, unlimited_b)

    # With at most two shifts the window ending at time 3 loses its oldest row.
    limited_ids = [0, 1, 1, 2, 2, 2, 3, 3, 3, 20, 21, 21]
    limited_a = [1.0, 1.0, 2.0, 1.0, 2.0, 3.0, 2.0, 3.0, 4.0, 10.0, 10.0, 11.0]
    limited_b = [5.0, 5.0, 6.0, 5.0, 6.0, 7.0, 6.0, 7.0, 8.0, 12.0, 12.0, 13.0]
    check(roll(max_timeshift=2), limited_ids, limited_a, limited_b)
def create_features_by_tsfresh(path, dataset, years, features):
    """Roll the data set, extract tsfresh features and select those relevant for AQI.

    :param path: base path; used for the CSV export and passed to ``get_raw_AQI_data``.
    :param dataset: frame holding the ``features`` columns plus ``time`` and ``id``.
    :param years: passed through to ``get_raw_AQI_data``.
    :param features: list of column names of ``dataset`` to extract features from.
    :return: the selected features, re-indexed with a plain 0..n-1 range.
    """
    wanted_columns = features + ['time', 'id']
    data = dataset.copy()[wanted_columns]

    data_rolled = roll_time_series(
        data,
        column_id="id",
        column_sort="time",
        max_timeshift=7 * 24,
        n_jobs=8,
    )
    extracted = extract_features(
        data_rolled, column_id="id", column_sort="time", n_jobs=8
    )
    impute(extracted)  # return value unused; relies on impute changing the frame in place
    print(extracted.shape)

    # NOTE(review): the target ends with a slash and names no file - confirm
    # that this is the intended export location.
    extracted.to_csv(path + '/modified_data_after_feature_extraction/')

    raw_aqi = get_raw_AQI_data(path, years)
    AQI_data = pd.Series(data=raw_aqi['AQI'].values, index=extracted.index, name='AQI')
    print(AQI_data.shape)

    selected_features = select_features(extracted, AQI_data)
    print(selected_features.shape)

    selected_features.index = range(selected_features.shape[0])
    return selected_features
def test_dict_rolling(self):
    """Roll a dict of per-kind frames backwards and check the shift-labelled ids."""
    df_dict = {
        "a": pd.DataFrame({"_value": [1, 2, 3, 4, 10, 11], "id": [1, 1, 1, 1, 2, 2]}),
        "b": pd.DataFrame({"_value": [5, 6, 7, 8, 12, 13], "id": [1, 1, 1, 1, 2, 2]}),
    }

    rolled = dataframe_functions.roll_time_series(
        df_dict,
        column_id="id",
        column_sort=None,
        column_kind=None,
        rolling_direction=-1,
    )

    label_repeats = (
        ("id=1, shift=-3", 1),
        ("id=1, shift=-2", 2),
        ("id=1, shift=-1", 3),
        ("id=2, shift=-1", 1),
        ("id=1, shift=0", 4),
        ("id=2, shift=0", 2),
    )
    expected_ids = [label for label, repeats in label_repeats for _ in range(repeats)]
    expected_values = {
        "a": [4, 3, 4, 2, 3, 4, 11, 1, 2, 3, 4, 10, 11],
        "b": [8, 7, 8, 6, 7, 8, 13, 5, 6, 7, 8, 12, 13],
    }

    # Ids first, then values, each for kind "a" before kind "b".
    for kind in ("a", "b"):
        self.assertListEqual(list(rolled[kind]["id"].values), expected_ids)
    for kind in ("a", "b"):
        self.assertListEqual(list(rolled[kind]["_value"].values), expected_values[kind])
def tsfresh_run(forecast, season, insample=True, forecast_out=None):
    """Build tsfresh features from rolling windows over ``forecast``.

    :param forecast: frame to roll. With ``insample=True`` it must provide a
        ``Target`` column and, after ``reset_index()``, a ``Date`` column;
        otherwise ``reset_index()`` must yield an ``index`` column.
    :param season: window length. Windows are rolled with
        ``max_timeshift=season - 1`` and only windows holding at least
        ``season`` rows are kept. Also used as ``n_jobs`` for the extraction.
    :param insample: if True, align the features with the target, merge the
        target in and drop columns reported by ``constant_feature_detect``.
    :param forecast_out: frame whose index labels the result when
        ``insample`` is False. Its index name is set to ``"Date"`` in place.
    :return: the imputed feature frame.
    :raises ValueError: if ``insample`` is False and ``forecast_out`` is None.
    """
    # Fail before the expensive rolling/extraction instead of crashing on
    # ``None.index`` afterwards.
    if not insample and forecast_out is None:
        raise ValueError("forecast_out is required when insample is False")

    df_roll_prep = forecast.reset_index()
    if insample:
        df_roll_prep = df_roll_prep.drop(["Target", "Date"], axis=1)
        target = forecast["Target"]
    else:
        df_roll_prep = df_roll_prep.drop(["index"], axis=1)
    # A single constant id: the whole frame is treated as one time series.
    df_roll_prep["id"] = 1

    df_roll = roll_time_series(df_roll_prep,
                               column_id="id",
                               column_sort=None,
                               column_kind=None,
                               rolling_direction=1,
                               max_timeshift=season - 1)

    # Keep only the windows that contain at least ``season`` rows.
    counts = df_roll['id'].value_counts()
    df_roll_cut = df_roll[df_roll['id'].isin(counts[counts >= season].index)]

    # TS feature extraction
    concat_df = extract_features(df_roll_cut.ffill(),
                                 column_id="id",
                                 column_sort="sort",
                                 n_jobs=season,
                                 show_warnings=False,
                                 disable_progressbar=True)

    if insample:
        concat_df = concat_df.dropna(axis=1, how="all")
        # Target rows selected by the ids of the surviving windows; computed
        # once and used both for the index and for the merge.
        aligned_target = (
            target[df_roll_cut['id'].value_counts().index].sort_index().to_frame()
        )
        concat_df.index = aligned_target.index
        concat_df = pd.merge(aligned_target,
                             concat_df,
                             left_index=True,
                             right_index=True,
                             how="left")
        concat_df_list = constant_feature_detect(data=concat_df, threshold=0.95)
        concat_df = concat_df.drop(concat_df_list, axis=1)
    else:
        forecast_out.index.name = "Date"
        concat_df.index = forecast_out.index

    concat_df = impute(concat_df)
    return concat_df
def test_positive_rolling(self):
    """Roll forwards and check the shift-labelled ids and the rolled values."""
    series_one = pd.DataFrame({"a": [1, 2, 3, 4], "b": [5, 6, 7, 8], "time": range(4)})
    series_two = pd.DataFrame({"a": [10, 11], "b": [12, 13], "time": range(20, 22)})
    df_full = pd.concat(
        [series_one.assign(id=1), series_two.assign(id=2)], ignore_index=True
    )

    rolled = dataframe_functions.roll_time_series(
        df_full,
        column_id="id",
        column_sort="time",
        column_kind=None,
        rolling_direction=1,
    )

    label_repeats = (
        ("id=1, shift=3", 1),
        ("id=1, shift=2", 2),
        ("id=1, shift=1", 3),
        ("id=2, shift=1", 1),
        ("id=1, shift=0", 4),
        ("id=2, shift=0", 2),
    )
    expected_ids = [label for label, repeats in label_repeats for _ in range(repeats)]
    expected_a = [1, 1, 2, 1, 2, 3, 10, 1, 2, 3, 4, 10, 11]
    expected_b = [5, 5, 6, 5, 6, 7, 12, 5, 6, 7, 8, 12, 13]

    self.assertListEqual(list(rolled["id"]), expected_ids)
    self.assertListEqual(list(rolled["a"].values), expected_a)
    self.assertListEqual(list(rolled["b"].values), expected_b)
def test_stacked_rolling(self):
    """Roll a stacked (long-format) frame backwards using a kind column."""
    series_one = pd.DataFrame({"a": [1, 2, 3, 4], "b": [5, 6, 7, 8], "time": range(4)})
    series_two = pd.DataFrame({"a": [10, 11], "b": [12, 13], "time": range(20, 22)})
    series_one["id"] = 1
    series_two["id"] = 2
    df_full = pd.concat([series_one, series_two], ignore_index=True)

    # Stack the two value columns on top of each other and tag them by kind.
    stacked_parts = [
        df_full[["time", "id", "a"]].rename(columns={"a": "_value"}),
        df_full[["time", "id", "b"]].rename(columns={"b": "_value"}),
    ]
    df_stacked = pd.concat(stacked_parts, ignore_index=True)
    df_stacked["kind"] = ["a"] * 6 + ["b"] * 6

    rolled = dataframe_functions.roll_time_series(
        df_stacked,
        column_id="id",
        column_sort="time",
        column_kind="kind",
        rolling_direction=-1,
    )

    label_repeats = (
        ("id=1, shift=-3", 2),
        ("id=1, shift=-2", 4),
        ("id=1, shift=-1", 6),
        ("id=2, shift=-1", 2),
        ("id=1, shift=0", 8),
        ("id=2, shift=0", 4),
    )
    expected_ids = [label for label, repeats in label_repeats for _ in range(repeats)]
    expected_values = [4, 8, 3, 7, 4, 8, 2, 6, 3, 7, 4, 8, 11, 13,
                       1, 5, 2, 6, 3, 7, 4, 8, 10, 12, 11, 13]

    self.assertListEqual(list(rolled["id"].values), expected_ids)
    self.assertListEqual(list(rolled["kind"].values), ["a", "b"] * 13)
    self.assertListEqual(list(rolled["_value"].values), expected_values)
def test_order_rolling(self):
    """Roll two series with interleaved time stamps using a fixed window of two.

    Only the resulting ``id`` column is checked: every expected window id
    occurs twice and the ids of series 1 come before those of series 2.
    """
    series_one = pd.DataFrame({"x": [1, 2, 3, 4], "time": [1, 15, 132, 145]})
    series_two = pd.DataFrame({"x": [5, 6, 7], "time": [16, 133, 146]})
    series_one["initial_id"] = 1
    series_two["initial_id"] = 2
    df_full = pd.concat([series_one, series_two], ignore_index=True)

    window_size = 2
    shift = window_size - 1  # min and max shift are equal
    df_rolled = dataframe_functions.roll_time_series(
        df_full,
        column_id="initial_id",
        column_sort="time",
        min_timeshift=shift,
        max_timeshift=shift,
    )

    window_ids = [(1, 15), (1, 132), (1, 145), (2, 133), (2, 146)]
    expected_ids = [window_id for window_id in window_ids for _ in range(2)]
    self.assertListEqual(list(df_rolled["id"]), expected_ids)
def test_warning_on_non_uniform_time_steps(self):
    """Expect exactly one warning when the time stamps are not evenly spaced."""
    roll_kwargs = {
        "column_id": "id",
        "column_sort": "time",
        "column_kind": None,
        "rolling_direction": 1,
    }
    expected_text = ("Your time stamps are not uniformly sampled, which makes rolling "
                     "nonsensical in some domains.")

    with warnings.catch_warnings(record=True) as recorded:
        # Time stamp 3 is missing from the first series.
        uneven = pd.DataFrame({"a": [1, 2, 3, 4], "b": [5, 6, 7, 8], "time": [1, 2, 4, 5]})
        even = pd.DataFrame({"a": [10, 11], "b": [12, 13], "time": range(20, 22)})
        uneven["id"] = 1
        even["id"] = 2
        df_full = pd.concat([uneven, even], ignore_index=True)

        dataframe_functions.roll_time_series(df_full, **roll_kwargs)

        self.assertEqual(len(recorded), 1)
        first_warning = recorded[0]
        self.assertEqual(str(first_warning.message), expected_text)
def gen_rolling_feature(self, window_size, settings="comprehensive", full_settings=None,
                        n_jobs=1):
    '''
    Generate aggregation feature for each sample.
    This method will be implemented by tsfresh.

    TODO: relationship with scale should be figured out.

    :param window_size: int, generate feature according to the rolling result.
           It is passed to tsfresh as both the minimum and the maximum
           time shift (``window_size - 1``).
    :param settings: str or dict. If a string is set, then it must be one of "comprehensive"
           "minimal" and "efficient". If a dict is set, then it should follow the instruction
           for default_fc_parameters in tsfresh. The value is defaulted to "comprehensive".
    :param full_settings: dict. It should follow the instruction for kind_to_fc_parameters in
           tsfresh. The value is defaulted to None. When given, it is used instead of
           ``settings`` for the extraction.
    :param n_jobs: int. The number of processes to use for parallelization.

    :return: the tsdataset instance.
    :raises AssertionError: if an aggregation feature was already generated, or if
           ``settings`` is a string that names no known preset.
    '''
    assert not self._has_generate_agg_feature,\
        "Only one of gen_global_feature and gen_rolling_feature should be called."
    if isinstance(settings, str):
        # Implicit string concatenation instead of a backslash continuation inside
        # the literal: the continuation pulled the next line's indentation into the
        # message.
        assert settings in ["comprehensive", "minimal", "efficient"], \
            "settings str should be one of \"comprehensive\", \"minimal\", \"efficient\"" \
            f", but found {settings}."
        default_fc_parameters = DEFAULT_PARAMS[settings]
    else:
        default_fc_parameters = settings

    df_rolled = roll_time_series(self.df,
                                 column_id=self.id_col,
                                 column_sort=self.dt_col,
                                 max_timeshift=window_size - 1,
                                 min_timeshift=window_size - 1,
                                 n_jobs=n_jobs)
    if not full_settings:
        self.roll_feature_df = extract_features(
            df_rolled,
            column_id=self.id_col,
            column_sort=self.dt_col,
            default_fc_parameters=default_fc_parameters,
            n_jobs=n_jobs)
    else:
        self.roll_feature_df = extract_features(
            df_rolled,
            column_id=self.id_col,
            column_sort=self.dt_col,
            kind_to_fc_parameters=full_settings,
            n_jobs=n_jobs)
    impute_tsfresh(self.roll_feature_df)

    # Register the generated columns so later steps treat them as features.
    self.feature_col += list(self.roll_feature_df.columns)
    self.roll_additional_feature = list(self.roll_feature_df.columns)
    self._has_generate_agg_feature = True
    return self
def test_stacked_rolling(self):
    """Roll a stacked (long-format) frame backwards and check the tuple ids."""
    series_one = pd.DataFrame({"a": [1, 2, 3, 4], "b": [5, 6, 7, 8], "time": range(4)})
    series_two = pd.DataFrame({"a": [10, 11], "b": [12, 13], "time": range(20, 22)})
    series_one["id"] = 1
    series_two["id"] = 2
    df_full = pd.concat([series_one, series_two], ignore_index=True)

    stacked_parts = [
        df_full[["time", "id", "a"]].rename(columns={"a": "_value"}),
        df_full[["time", "id", "b"]].rename(columns={"b": "_value"}),
    ]
    df_stacked = pd.concat(stacked_parts, ignore_index=True)
    df_stacked["kind"] = ["a"] * 6 + ["b"] * 6
    # df_stacked is
    #     time  id  _value kind
    # 0      0   1       1    a
    # 1      1   1       2    a
    # 2      2   1       3    a
    # 3      3   1       4    a
    # 4     20   2      10    a
    # 5     21   2      11    a
    # 6      0   1       5    b
    # 7      1   1       6    b
    # 8      2   1       7    b
    # 9      3   1       8    b
    # 10    20   2      12    b
    # 11    21   2      13    b

    rolled = dataframe_functions.roll_time_series(
        df_stacked,
        column_id="id",
        column_sort="time",
        column_kind="kind",
        rolling_direction=-1,
        n_jobs=0,
    )

    # (window id, number of rows): two kinds times the window length.
    window_sizes = (
        ((1, 0), 2 * 4),
        ((1, 1), 2 * 3),
        ((1, 2), 2 * 2),
        ((1, 3), 2 * 1),
        ((2, 20), 2 * 2),
        ((2, 21), 2 * 1),
    )
    expected_ids = [window for window, size in window_sizes for _ in range(size)]
    expected_values = [
        1., 5., 2., 6., 3., 7., 4., 8.,
        2., 6., 3., 7., 4., 8.,
        3., 7., 4., 8.,
        4., 8.,
        10., 12., 11., 13.,
        11., 13.,
    ]

    self.assertListEqual(list(rolled["id"].values), expected_ids)
    self.assertListEqual(list(rolled["kind"].values), ["a", "b"] * 13)
    self.assertListEqual(list(rolled["_value"].values), expected_values)
def create_new_features(df_x, s_y, x_train_cols=None):
    """
    Create new Features from Input-Dataframe by using TSFRESH

    Note: an ``id`` column is added to ``df_x`` in place.

    :param df_x: Dataframe containing Time-Series (needs a ``Date`` column)
    :param s_y: Series of Target-Var
    :param x_train_cols: columns already selected for the train set. If empty
        or omitted, feature selection is run against ``s_y``; otherwise
        exactly these columns are returned.
    :return: Dataframe containing created Features
    """
    # ``None`` replaces the former shared mutable ``[]`` default.
    if x_train_cols is None:
        x_train_cols = []

    # add id column (same id for every row, because only one time series is
    # considered in this dataset)
    df_x["id"] = 1

    # create roll time series for generating time series features
    df_x_rolled = roll_time_series(
        df_x,
        column_id="id",
        column_sort="Date",
        column_kind=None,
        rolling_direction=1,
        max_timeshift=TSFRESH_TIME_WINDOWS - 1,
    )

    x = df_x.set_index("Date")

    # for each variable in input df new features are generated
    for current_feature in FEATURES:
        # noinspection PyTypeChecker
        generated_features = extract_features(
            df_x_rolled,
            column_id="id",
            n_jobs=3,
            column_kind=None,
            column_value=current_feature,
            impute_function=impute,
            default_fc_parameters=settings,
        )
        x = pd.concat([x, generated_features], axis=1)
        print(f"\nNew shape of Feature-Matrix: {x.shape}")

    print(f"\nAmount of Features before selection: {len(x.columns)}")

    # check if features of train set are already selected
    # (len() rather than truthiness so array-like column collections work too)
    if len(x_train_cols) == 0:
        # select relevant features for train set
        selected_features = feature_selection.select_features(x, s_y)
        print(f"\nAmount of Features after selection: "
              f"{len(selected_features.columns)}")
    else:
        # no selection is needed, features are already selected for train set
        selected_features = x[x_train_cols]

    return selected_features
def segment(dir_path):
    """Create segments of time series.

    Reads every CSV file found under ``dir_path``, concatenates them, keeps
    every tenth row, assigns one constant id to all rows and rolls the
    result. The combined and the rolled frames are printed; nothing is
    returned.

    :param dir_path: directory searched for ``.csv`` files.
    """
    # Use a context manager so the parameter file handle is closed.
    with open("params.yaml") as params_file:
        # Only referenced by the currently disabled per-segment target extraction.
        target = yaml.safe_load(params_file)["clean"]["target"]

    filepaths = find_files(dir_path, file_extension=".csv")

    # Read but not used further in this function.
    output_columns = np.array(
        pd.read_csv(DATA_PATH / OUTPUT_FEATURES_PATH, index_col=0)).reshape(-1)

    dfs = [pd.read_csv(filepath, index_col=0) for filepath in filepaths]

    combined_df = pd.concat(dfs, ignore_index=True)
    # Keep every tenth row; copy so that adding a column below operates on an
    # independent frame rather than on a slice of the concatenated one.
    combined_df = combined_df[::10].copy()
    print(combined_df)

    n_rows = len(combined_df)
    segment_size = 100
    n_segments = int(n_rows / segment_size)

    # One id per fixed-size segment, built in a single vectorized call instead
    # of repeated concatenation. Currently unused: the per-segment id
    # assignment is disabled and every row gets the same id below.
    ids = np.arange(1, n_segments + 1, 1)
    idlist = np.repeat(ids, segment_size).astype(np.int32)

    combined_df["id"] = np.ones(n_rows)

    print(combined_df)

    df_rolled = roll_time_series(combined_df, column_id="id", column_sort=None)

    print(df_rolled)
def test_dict_rolling_maxshift_1(self):
    """Roll a dict of per-kind frames backwards with at most one time shift."""
    df_dict = {
        "a": pd.DataFrame({"_value": [1, 2, 3, 4, 10, 11], "id": [1, 1, 1, 1, 2, 2]}),
        "b": pd.DataFrame({"_value": [5, 6, 7, 8, 12, 13], "id": [1, 1, 1, 1, 2, 2]}),
    }

    rolled = dataframe_functions.roll_time_series(
        df_dict,
        column_id="id",
        column_sort=None,
        column_kind=None,
        rolling_direction=-1,
        max_timeshift=1,
        n_jobs=0,
    )

    # Windows hold at most two rows; the last window of each id holds one.
    expected_ids = [(1, 0), (1, 0), (1, 1), (1, 1), (1, 2), (1, 2), (1, 3),
                    (2, 0), (2, 0), (2, 1)]
    expected_values = {
        "a": [1.0, 2.0, 2.0, 3.0, 3.0, 4.0, 4.0, 10.0, 11.0, 11.0],
        "b": [5.0, 6.0, 6.0, 7.0, 7.0, 8.0, 8.0, 12.0, 13.0, 13.0],
    }

    # Ids first, then values, each for kind "a" before kind "b".
    for kind in ("a", "b"):
        self.assertListEqual(list(rolled[kind]["id"].values), expected_ids)
    for kind in ("a", "b"):
        self.assertListEqual(list(rolled[kind]["_value"].values), expected_values[kind])
def test_dict_rolling(self):
    """Roll a dict of per-kind frames backwards and check the integer window ids."""
    df_dict = {
        "a": pd.DataFrame({"_value": [1, 2, 3, 4, 10, 11], "id": [1, 1, 1, 1, 2, 2]}),
        "b": pd.DataFrame({"_value": [5, 6, 7, 8, 12, 13], "id": [1, 1, 1, 1, 2, 2]}),
    }

    rolled = dataframe_functions.roll_time_series(
        df_dict,
        column_id="id",
        column_sort=None,
        column_kind=None,
        rolling_direction=-1,
    )

    # (window id, number of rows): ids 0-3 come from the first series,
    # ids 4-5 from the second.
    window_sizes = ((0, 4), (1, 3), (2, 2), (3, 1), (4, 2), (5, 1))
    expected_ids = [window for window, size in window_sizes for _ in range(size)]
    expected_values = {
        "a": [1.0, 2.0, 3.0, 4.0, 2.0, 3.0, 4.0, 3.0, 4.0, 4.0, 10.0, 11.0, 11.0],
        "b": [5.0, 6.0, 7.0, 8.0, 6.0, 7.0, 8.0, 7.0, 8.0, 8.0, 12.0, 13.0, 13.0],
    }

    # Ids first, then values, each for kind "a" before kind "b".
    for kind in ("a", "b"):
        self.assertListEqual(list(rolled[kind]["id"].values), expected_ids)
    for kind in ("a", "b"):
        self.assertListEqual(list(rolled[kind]["_value"].values), expected_values[kind])
def test_features_on_btc():
    """Rolling six rows must give six windows and six feature rows."""
    frame = pd.DataFrame({
        "id": [1, 1, 1, 1, 2, 2],
        "time": [1, 2, 3, 4, 8, 9],
        "x": [1, 2, 3, 4, 10, 11],
        "y": [5, 6, 7, 8, 12, 13],
    })
    expected_windows = 6

    rolled = roll_time_series(frame, column_id="id", column_sort="time")
    window_count = rolled['id'].nunique()
    assert window_count == expected_windows

    feature_table = extract_features(rolled, column_id="id", column_sort="time")
    row_count = feature_table.shape[0]
    assert row_count == expected_windows
def add_tsfresh_participant(data, tsfresh_features, columns, k):
    """Append rolled tsfresh features to every participant's frame.

    :param data: indexable collection of frames, addressed as ``data[0]`` ..
        ``data[len(data) - 1]``; each entry is replaced by the frame with
        the extracted features appended as extra columns.
    :param tsfresh_features: 'minimal', 'efficient' or 'comprehensive';
        any other value falls back to the minimal feature set.
    :param columns: names of the value columns to extract features from.
    :param k: passed to ``roll_time_series`` as ``max_timeshift``.
    :return: ``data`` with its entries replaced.
    """
    # Pick the feature-calculation settings; unknown names use the minimal set.
    known_presets = (
        ('minimal', MinimalFCParameters),
        ('efficient', EfficientFCParameters),
        ('comprehensive', ComprehensiveFCParameters),
    )
    for preset_name, preset_factory in known_presets:
        if tsfresh_features == preset_name:
            settings = preset_factory()
            break
    else:
        settings = MinimalFCParameters()

    for participant in range(len(data)):
        frame = data[participant]

        # Helper columns required for rolling: one constant id and an
        # explicit copy of the index to sort by.
        frame['id'] = 0
        frame['index'] = frame.index

        # Rolling creates new ids for the windows.
        rolled_series = roll_time_series(frame, column_id='id', column_sort='index',
                                         max_timeshift=k)

        # One feature frame per value column; the index is reset so that the
        # frames can be put side by side with the participant's frame.
        per_column_features = [
            extract_features(rolled_series, default_fc_parameters=settings, column_id='id',
                             column_sort='index', column_value=column).reset_index(drop=True)
            for column in columns
        ]
        extracted = pd.concat(per_column_features, axis=1)

        # The helper columns are not wanted in the result.
        del frame['id']
        del frame['index']

        data[participant] = pd.concat([frame, extracted], axis=1)

    return data
def transform(self, X, y=None):
    """Roll ``X`` with the options stored on this instance.

    ``y`` is accepted but not used. All options are forwarded to
    ``roll_time_series`` positionally, in the order listed below.
    """
    return roll_time_series(
        X,
        self.column_id,
        self.column_sort,
        self.column_kind,
        self.rolling_direction,
        self.max_timeshift,
        self.min_timeshift,
        self.chunksize,
        self.n_jobs,
        self.show_warnings,
        self.disable_progressbar,
        self.distributor,
    )
def get_resample_features(data, window, settings=None):
    """
    Roll the time series and extract tsfresh features per rolling window.

    Note: an ``id`` column (constant 1) is added to ``data`` in place.

    Parameters
    ----------
    data : DataFrame
        Time series to roll; must contain a ``Timestamp`` column, which is
        used for sorting.
    window : int
        Passed to ``roll_time_series`` as ``max_timeshift``.
    settings : object, optional
        Feature-calculator settings passed to tsfresh as
        ``default_fc_parameters``, e.g. ``ComprehensiveFCParameters()`` or
        ``MinimalFCParameters()``. Defaults to a fresh
        ``MinimalFCParameters()`` per call.

    Returns
    -------
    DataFrame
        Extracted features with the index converted by ``pd.to_datetime``.
    """
    # Create the default per call instead of sharing a single instance that
    # would be built once at function definition time.
    if settings is None:
        settings = MinimalFCParameters()

    data["id"] = 1
    df_roll_time = roll_time_series(
        data,
        column_id="id",
        column_sort='Timestamp',
        column_kind=None,
        rolling_direction=1,
        max_timeshift=window)

    X_features = extract_features(
        df_roll_time,
        column_id="id",
        column_sort='Timestamp',
        default_fc_parameters=settings,
    )

    X_features.index = pd.to_datetime(X_features.index)
    return X_features
def test_stacked_rolling(self):
    """Roll a stacked (long-format) frame backwards and check the integer ids."""
    series_one = pd.DataFrame({"a": [1, 2, 3, 4], "b": [5, 6, 7, 8], "time": range(4)})
    series_two = pd.DataFrame({"a": [10, 11], "b": [12, 13], "time": range(20, 22)})
    series_one["id"] = 1
    series_two["id"] = 2
    df_full = pd.concat([series_one, series_two], ignore_index=True)

    stacked_parts = [
        df_full[["time", "id", "a"]].rename(columns={"a": "_value"}),
        df_full[["time", "id", "b"]].rename(columns={"b": "_value"}),
    ]
    df_stacked = pd.concat(stacked_parts, ignore_index=True)
    df_stacked["kind"] = ["a"] * 6 + ["b"] * 6
    # df_stacked is
    #     time  id  _value kind
    # 0      0   1       1    a
    # 1      1   1       2    a
    # 2      2   1       3    a
    # 3      3   1       4    a
    # 4     20   2      10    a
    # 5     21   2      11    a
    # 6      0   1       5    b
    # 7      1   1       6    b
    # 8      2   1       7    b
    # 9      3   1       8    b
    # 10    20   2      12    b
    # 11    21   2      13    b

    rolled = dataframe_functions.roll_time_series(
        df_stacked,
        column_id="id",
        column_sort="time",
        column_kind="kind",
        rolling_direction=-1,
    )

    # (window id, number of rows): two kinds times the window length.
    window_sizes = ((0, 8), (1, 6), (2, 4), (3, 2), (20, 4), (21, 2))
    expected_ids = [window for window, size in window_sizes for _ in range(size)]
    expected_values = [
        1., 5., 2., 6., 3., 7., 4., 8.,
        2., 6., 3., 7., 4., 8.,
        3., 7., 4., 8.,
        4., 8.,
        10., 12., 11., 13.,
        11., 13.,
    ]

    self.assertListEqual(list(rolled["id"].values), expected_ids)
    print(rolled["_value"].values)
    self.assertListEqual(list(rolled["kind"].values), ["a", "b"] * 13)
    self.assertListEqual(list(rolled["_value"].values), expected_values)
def _extract_features(self, data_frame):
    """Roll ``data_frame`` and extract minimal plus index-based tsfresh features.

    The window is limited by ``self.memory`` (used as ``max_timeshift``).
    NaN and infinite entries of the combined feature frame are set to 0.0.

    :param data_frame: frame holding the id column ``self.column_id`` and
        the sort column ``self.time_stamp``.
    :return: the two feature sets side by side in one frame.
    """
    rolled = roll_time_series(
        data_frame,
        column_id=self.column_id,
        column_sort=self.time_stamp,
        max_timeshift=self.memory,
    )

    extracted_minimal = tsfresh.extract_features(
        rolled,
        column_id=self.column_id,
        column_sort=self.time_stamp,
        default_fc_parameters=tsfresh.feature_extraction.MinimalFCParameters(),
    )
    extracted_index_based = tsfresh.extract_features(
        rolled,
        column_id=self.column_id,
        column_sort=self.time_stamp,
        default_fc_parameters=tsfresh.feature_extraction.settings.IndexBasedFCParameters(),
    )

    combined = pd.concat([extracted_minimal, extracted_index_based], axis=1)

    # Drop the references to the partial results and collect right away.
    del extracted_minimal
    del extracted_index_based
    gc.collect()

    # Replace NaN first, then +/-inf.
    combined[np.isnan(combined)] = 0.0
    combined[np.isinf(combined)] = 0.0
    return combined
def gen_rolling_feature(self, window_size, settings="comprehensive", full_settings=None,
                        n_jobs=1):
    '''
    Generate aggregation features for every sample from rolled windows.
    The rolling and the extraction are delegated to tsfresh. Make sure that
    the specified column name does not contain '__'.

    TODO: relationship with scale should be figured out.

    :param window_size: int, length of the rolling window. Must not exceed the
           length of the shortest time series.
    :param settings: str or dict. A string has to be one of "comprehensive",
           "minimal" and "efficient". A dict has to follow the format of
           default_fc_parameters in tsfresh. Defaults to "comprehensive".
    :param full_settings: dict following the format of kind_to_fc_parameters
           in tsfresh. Defaults to None. When given, it is used instead of
           ``settings`` for the extraction.
    :param n_jobs: int. The number of processes to use for parallelization.

    :return: the tsdataset instance.
    '''
    from tsfresh.utilities.dataframe_functions import roll_time_series
    from tsfresh.utilities.dataframe_functions import impute as impute_tsfresh
    from tsfresh import extract_features
    from tsfresh.feature_extraction import ComprehensiveFCParameters, \
        MinimalFCParameters, EfficientFCParameters

    presets = {
        "comprehensive": ComprehensiveFCParameters(),
        "minimal": MinimalFCParameters(),
        "efficient": EfficientFCParameters()
    }

    assert not self._has_generate_agg_feature,\
        "Only one of gen_global_feature and gen_rolling_feature should be called."

    # Resolve a preset name into its parameter object; a dict is used as given.
    default_fc_parameters = settings
    if isinstance(settings, str):
        assert settings in ['comprehensive', 'minimal', 'efficient'], \
            "settings str should be one of 'comprehensive', 'minimal', 'efficient'"\
            f", but found {settings}."
        default_fc_parameters = presets[settings]

    shortest_series = self.df.groupby(self.id_col).size().min()
    assert window_size < shortest_series + 1, "gen_rolling_feature "\
        "should have a window_size smaller than shortest time series length."

    # Equal minimum and maximum shift.
    shift = window_size - 1
    df_rolled = roll_time_series(self.df,
                                 column_id=self.id_col,
                                 column_sort=self.dt_col,
                                 max_timeshift=shift,
                                 min_timeshift=shift,
                                 n_jobs=n_jobs)

    # Per-kind settings take precedence over the default parameters.
    if full_settings:
        parameter_kwargs = {"kind_to_fc_parameters": full_settings}
    else:
        parameter_kwargs = {"default_fc_parameters": default_fc_parameters}
    self.roll_feature_df = extract_features(df_rolled,
                                            column_id=self.id_col,
                                            column_sort=self.dt_col,
                                            n_jobs=n_jobs,
                                            **parameter_kwargs)
    impute_tsfresh(self.roll_feature_df)

    generated_columns = list(self.roll_feature_df.columns)
    self.feature_col += generated_columns
    self.roll_additional_feature = list(self.roll_feature_df.columns)
    self._has_generate_agg_feature = True
    return self
# Build one feature matrix per (room, measurement) pair and collect them in
# feature_matrix_list.
feature_matrix_list = []
# room_params maps a room to its number of measurements; measurements are
# numbered starting at 1.
for room in room_params:
    for measurement in range(1, room_params[room] + 1):
        print(f'Room: {room}, Measurement: {measurement}')
        subset_data = loader.return_experiment_measurement(
            room_location=room, measurement_no=measurement, sensor_node=1)
        # entry_id becomes the index and is restored as a regular column as
        # well, because it is also used as the sort column below.
        subset_data.set_index('entry_id', inplace=True)
        subset_data['entry_id'] = subset_data.index
        target_vector = subset_data['binary_target']
        subset_data.sort_values(['node_id', 'entry_id'], inplace=True)
        # NOTE(review): FEATURES presumably contains 'node_id' and 'entry_id',
        # since both are used as id/sort columns below - confirm.
        ts_for_rolling = subset_data[FEATURES]
        ts_for_rolling = roll_time_series(ts_for_rolling,
                                          column_id='node_id',
                                          column_sort='entry_id',
                                          column_kind=None,
                                          rolling_direction=1,
                                          max_timeshift=WINDOW_SIZE)
        temp_df = extract_features(ts_for_rolling,
                                   n_jobs=NCORES,
                                   column_sort='entry_id',
                                   column_id='node_id',
                                   column_kind=None,
                                   show_warnings=False)
        impute_df = impute(temp_df)
        # Keep only the features selected against the binary target.
        temp_sel_df = select_features(X=impute_df, y=target_vector, n_jobs=NCORES)
        # Attach the selected features to the raw columns (index-based join).
        join_df = subset_data[FEATURES + JOIN_FEATURES]
        join_df = join_df.join(temp_sel_df)
        feature_matrix_list.append(join_df)
# Fragment of a per-site loading script. folder, filename, site_ID, m,
# data_list, years and path are defined outside this fragment, and the
# enclosing loop header (if any) is not visible here.

# Parse index strings of the form 'YYYY-MM-DD HH:MM:SS'.
# NOTE(review): recent pandas releases no longer provide pd.datetime and
# deprecate the date_parser argument - confirm the pandas version in use.
dateparse = lambda x: pd.datetime.strptime(x, '%Y-%m-%d %H:%M:%S')
data = pd.read_csv(folder + filename, index_col=0, date_parser=dateparse)
# Tag every row with the site it belongs to.
data['id'] = [site_ID for _ in range(data.shape[0])]
data.rename(columns={'AQI_': 'AQI'}, inplace=True)
# Expose the index as a regular column so it can serve as the sort column.
data['time'] = data.index
data = data[['AQI', 'time', 'id']]
# data = data.iloc[0:40, :]
print(data.shape)
# data = drop_missing_weeks(data, years)
data_list[m] = data
# Stack the frames collected in data_list row-wise.
data = pd.concat(data_list, axis=0)
print(data.shape)
# Roll with a maximum shift of 7 * 24 steps.
data_rolled = roll_time_series(data, column_id="id", column_sort="time", max_timeshift=7*24, n_jobs=8)
features = extract_features(data_rolled, column_id="id", column_sort="time", n_jobs=8)
impute(features)  # return value unused; relies on impute changing the frame in place
print(features.shape)
# Temporarily attach the time column for drop_missing_weeks, then drop it again.
features['time'] = data['time'].values
features = drop_missing_weeks(features, years, typical_index=False)
features.drop(['time'], axis=1, inplace=True)
AQI = get_raw_AQI_data(path, years)
# Align the raw AQI values with the feature index.
AQI_data = pd.Series(data=AQI['AQI'].values, index=features.index, name='AQI')
print(AQI.shape)
selected_features = select_features(features, AQI_data)
print(selected_features.shape)
selected_features.to_csv('./data/modified_data_after_feature_extraction/AQI_features.csv', index=False)
def test_negative_rolling(self):
    """Check roll_time_series with rolling_direction=-1 for several max_timeshift values.

    The same two-id frame is rolled without ``max_timeshift``, with
    ``max_timeshift=None`` and with ``max_timeshift`` of 1, 2 and 4; for each
    call the resulting "id", "a" and "b" columns are compared against the
    expected lists.
    """
    first_class = pd.DataFrame({
        "a": [1, 2, 3, 4],
        "b": [5, 6, 7, 8],
        "time": range(4),
    }).assign(id=1)
    second_class = pd.DataFrame({
        "a": [10, 11],
        "b": [12, 13],
        "time": range(20, 22),
    }).assign(id=2)

    df_full = pd.concat([first_class, second_class], ignore_index=True)

    # df_full is
    #         a   b  time  id
    #     0   1   5     0   1
    #     1   2   6     1   1
    #     2   3   7     2   1
    #     3   4   8     3   1
    #     4  10  12    20   2
    #     5  11  13    21   2

    def assert_rolled(expected_ids, expected_a, expected_b, **roll_kwargs):
        # Roll backwards with the given extra arguments and compare all
        # three columns against the expectation.
        rolled = dataframe_functions.roll_time_series(
            df_full, column_id="id", column_sort="time", column_kind=None,
            rolling_direction=-1, **roll_kwargs)
        self.assertListEqual(list(rolled["id"].values), expected_ids)
        self.assertListEqual(list(rolled["a"].values), expected_a)
        self.assertListEqual(list(rolled["b"].values), expected_b)

    # Expected result when the window length is not restricted.
    full_ids = [0, 0, 0, 0, 1, 1, 1, 2, 2, 3, 20, 20, 21]
    full_a = [
        1.0, 2.0, 3.0, 4.0, 2.0, 3.0, 4.0, 3.0, 4.0, 4.0, 10.0, 11.0, 11.0
    ]
    full_b = [
        5.0, 6.0, 7.0, 8.0, 6.0, 7.0, 8.0, 7.0, 8.0, 8.0, 12.0, 13.0, 13.0
    ]

    # Default max_timeshift and an explicit None expect the same output.
    assert_rolled(full_ids, full_a, full_b)
    assert_rolled(full_ids, full_a, full_b, max_timeshift=None)

    assert_rolled(
        [0, 0, 1, 1, 2, 2, 3, 20, 20, 21],
        [1.0, 2.0, 2.0, 3.0, 3.0, 4.0, 4.0, 10.0, 11.0, 11.0],
        [5.0, 6.0, 6.0, 7.0, 7.0, 8.0, 8.0, 12.0, 13.0, 13.0],
        max_timeshift=1)

    assert_rolled(
        [0, 0, 0, 1, 1, 1, 2, 2, 3, 20, 20, 21],
        [1.0, 2.0, 3.0, 2.0, 3.0, 4.0, 3.0, 4.0, 4.0, 10.0, 11.0, 11.0],
        [5.0, 6.0, 7.0, 6.0, 7.0, 8.0, 7.0, 8.0, 8.0, 12.0, 13.0, 13.0],
        max_timeshift=2)

    # With max_timeshift=4 the expectation equals the unrestricted one.
    assert_rolled(full_ids, full_a, full_b, max_timeshift=4)
def get_rolling_windows(self):
    """Roll ``self.df`` with ``roll_time_series`` and store the result.

    ``self.id`` is passed as the id column and ``self.column_sort`` as the
    sort column. The rolled frame is kept in ``self.df_rolled``; nothing is
    returned.
    """
    source_frame = self.df
    rolled_frame = roll_time_series(
        source_frame,
        column_id=self.id,
        column_sort=self.column_sort,
    )
    self.df_rolled = rolled_frame
def preprocess(self,
               feature_window: int,
               aggregations: dict,
               strict_feature_window: bool = True,
               include_target: bool = True,
               drop_na_target: bool = True) -> tuple:
    """Roll ``self.df`` into windows and aggregate every window into one row.

    A shifted target (``self.forecast_horizon`` steps ahead, per id) is
    computed first, the frame is rolled with ``roll_time_series``, each
    window is labelled with the last date it contains (``ref_date``) and the
    shifted target is merged onto it. The rolled rows are then grouped by
    (id, ``ref_date``) and aggregated.

    Args:
        feature_window: Number of time steps per window; must not exceed
            ``self.max_feature_window``.
        aggregations: Mapping handed to ``groupby(...).agg``. It is
            deep-copied, so the caller's dict is never modified.
        strict_feature_window: If true, keep only windows that contain
            exactly ``feature_window`` rows.
        include_target: If true, aggregate the target column with ``'last'``
            (replacing any entry for it in ``aggregations``) and expose it
            under its original name.
        drop_na_target: If true, drop rolled rows without a shifted target.

    Returns:
        ``(df_aggregated, df_rolled)``: the aggregated frame and the rolled
        frame it was built from.

    Raises:
        AssertionError: If ``feature_window`` exceeds
            ``self.max_feature_window``.

    Side effects:
        Sets ``self.n_strict_rolling_win`` and ``self.n_rolling_win``.
    """
    assert feature_window <= self.max_feature_window, \
        "Try smaller integer feature window!"

    df = self.df.copy()
    id_col = self.id_col
    datetime_col = self.datetime_col
    target_col = self.target_col
    forecast_horizon = self.forecast_horizon

    df[datetime_col] = pd.to_datetime(df[datetime_col],
                                      format=self.dt_format).dt.date
    df['timeID'] = super(RollWin, RollWin)._map_timeid(df, datetime_col)
    # Keep the original id in 'kind'; after rolling, the id column is renamed
    # to 'winID' and 'kind' is renamed back to the id column.
    df['kind'] = df[id_col]

    # Process target variable outside rolling window implementation.
    # An explicit copy is used so that adding 'target_shift' does not write
    # into a slice of ``df``.
    df_target = df[[id_col, datetime_col, target_col]].copy()
    df_target['target_shift'] = df_target.groupby(
        id_col)[target_col].shift(-forecast_horizon)
    df_target = df_target.rename(columns={datetime_col: 'ref_date'})
    # Keyword form: pandas 2.0 no longer accepts ``axis`` positionally.
    df_target = df_target.drop(columns=target_col)

    # Apply rolling and do some processing
    df_rolled = roll_time_series(df,
                                 column_id=id_col,
                                 column_sort='timeID',
                                 column_kind='kind',
                                 rolling_direction=1,
                                 max_timeshift=feature_window - 1)
    df_rolled = df_rolled.rename(columns={id_col: 'winID', 'kind': id_col})

    first_cols = [id_col, 'winID', 'timeID', datetime_col]
    remaining_cols = sorted(set(df_rolled.columns) - set(first_cols))
    df_rolled = (df_rolled[first_cols + remaining_cols]
                 .sort_values(by=[id_col, 'winID', 'timeID'])
                 .reset_index(drop=True))

    # The reference date of a window is the last date it contains; it is the
    # key used to attach the shifted target.
    df_rolled['ref_date'] = df_rolled.groupby(
        [id_col, 'winID'])[datetime_col].transform('last')
    df_rolled = pd.merge(df_rolled,
                         df_target,
                         how='left',
                         on=[id_col, 'ref_date'])

    first_cols = [
        id_col, 'ref_date', 'winID', datetime_col, 'timeID', 'target_shift'
    ]
    remaining_cols = sorted(set(df_rolled.columns) - set(first_cols))
    df_rolled = df_rolled[first_cols + remaining_cols]

    # True for rows that belong to a window with exactly feature_window rows.
    is_full_window = df_rolled.groupby(
        [id_col, 'winID'])['timeID'].transform(len) == feature_window

    self.n_strict_rolling_win = df_rolled[is_full_window].dropna(
        subset=['target_shift'])['timeID'].nunique()

    if strict_feature_window:
        df_rolled = df_rolled[is_full_window]

    if drop_na_target:
        # Reassign instead of dropping in place: df_rolled may be a filtered
        # slice at this point.
        df_rolled = df_rolled.dropna(subset=['target_shift'])

    self.n_rolling_win = df_rolled['timeID'].nunique()

    # TODO: set default aggregations as mean and last
    aggregations_local = deepcopy(aggregations)
    if include_target:
        aggregations_local[target_col] = 'last'

    df_aggregated = df_rolled.groupby([id_col,
                                       'ref_date']).agg(aggregations_local)
    df_aggregated.reset_index(inplace=True)

    # Rename columns: flatten (column, aggregation) pairs into
    # 'column_aggregation'. Only real tuples are joined, so plain
    # two-character string labels such as 'id' are left untouched.
    df_aggregated.columns = [
        col[0] + '_' + col[1]
        if isinstance(col, tuple) and len(col) == 2 else col
        for col in df_aggregated.columns
    ]
    # Group keys come out of the flattening as e.g. 'id_'; strip that
    # trailing underscore.
    df_aggregated.columns = [
        col[:-1] if isinstance(col, str) and col.endswith('_') else col
        for col in df_aggregated.columns
    ]

    df_aggregated['month'] = df_aggregated['ref_date'].map(
        lambda x: x.month)

    # TODO: get dummies for all categoricals
    # Use the configured id column rather than a hard-coded ``.id`` attribute.
    df_aggregated = pd.concat(
        [df_aggregated,
         pd.get_dummies(list(df_aggregated[id_col]))],
        axis=1)

    if include_target:
        df_aggregated.rename(columns={target_col + '_last': target_col},
                             inplace=True)

    cols = list(df_aggregated.columns)
    # Lead with the key columns; the target column is referenced by its
    # configured name and skipped when it is not part of the output.
    first_cols = [
        col for col in (id_col, 'ref_date', target_col, 'month')
        if col in cols
    ]
    remaining_cols = list(set(cols) - set(first_cols))
    cols = first_cols + sorted(remaining_cols)
    df_aggregated = df_aggregated[cols]

    return df_aggregated, df_rolled