Ejemplo n.º 1
0
    def test_make_forecasting_frame_feature_extraction(self):
        """Smoke test: a forecasting frame built from an hourly Series feeds into feature extraction."""
        hourly_index = pd.date_range('1/1/2011', periods=4, freq='H')
        series = pd.Series(data=range(4), index=hourly_index)
        frame, target = dataframe_functions.make_forecasting_frame(
            x=series,
            kind="test",
            max_timeshift=1,
            rolling_direction=1,
        )

        # Nothing is asserted on the result; the test passes when the call does not raise.
        extract_relevant_features(
            frame,
            target,
            column_id="id",
            column_sort="time",
            column_value="value",
            default_fc_parameters=MinimalFCParameters(),
        )
Ejemplo n.º 2
0
    def test_make_forecasting_frame_list(self):
        """A plain ``range`` input produces the expected frame and target series."""
        frame, target = dataframe_functions.make_forecasting_frame(
            x=range(4),
            kind="test",
            max_timeshift=1,
            rolling_direction=1,
        )

        wanted_frame = pd.DataFrame({
            "id": [1, 2, 3],
            "kind": ["test"] * 3,
            "value": [0., 1., 2.],
            "time": [0., 1., 2.],
        })
        wanted_target = pd.Series(data=[1, 2, 3], index=[1, 2, 3], name="value")

        # Columns are sorted on both sides so that column order does not matter.
        assert_frame_equal(frame.sort_index(axis=1), wanted_frame.sort_index(axis=1))
        assert_series_equal(target, wanted_target)
Ejemplo n.º 3
0
    def test_make_forecasting_frame_list(self):
        """Check the frame and target built from ``range(4)`` with a time shift of one."""
        call_kwargs = dict(x=range(4), kind="test", max_timeshift=1, rolling_direction=1)
        result_df, result_y = dataframe_functions.make_forecasting_frame(**call_kwargs)

        reference_columns = {
            "id": [1, 2, 3],
            "kind": ["test"] * 3,
            "value": [0., 1., 2.],
            "time": [0., 1., 2.],
        }
        reference_df = pd.DataFrame(reference_columns)
        reference_y = pd.Series(data=[1, 2, 3], index=[1, 2, 3], name="value")

        # Compare with alphabetically ordered columns on both sides.
        assert_frame_equal(result_df.sort_index(axis=1), reference_df.sort_index(axis=1))
        assert_series_equal(result_y, reference_y)
Ejemplo n.º 4
0
    def compute_tsfresh_features(self):
        """Extract `tsfresh` features for the configured column and filter them.

        The column ``self.ts_col`` of ``self.df`` is rolled into a forecasting
        frame, features are extracted from it, columns with a single distinct
        value are removed, and the remainder is passed through
        ``select_features`` as a regression task. Timings and feature counts
        are reported through ``tqdm.write``.

        Returns:
            The feature frame returned by ``select_features``.
        """
        series = self.df[self.ts_col]
        rolled_frame, target = make_forecasting_frame(
            series,
            kind="kind",
            max_timeshift=self.max_timeshift,
            rolling_direction=1,
        )

        # --- extraction ---
        t0 = time.time()
        raw_features = extract_features(
            rolled_frame,
            column_id="id",
            column_sort="time",
            column_value="value",
            impute_function=impute,
            n_jobs=8,
            show_warnings=False,
        )
        t1 = time.time()
        tqdm.write("Extraction time: {}".format(t1 - t0))

        # --- drop columns that hold exactly one distinct value ---
        varying = raw_features.nunique() != 1
        varying_features = raw_features.loc[:, varying]

        # --- relevance filtering ---
        t2 = time.time()
        selected = select_features(varying_features, target, ml_task='regression')
        t3 = time.time()

        tqdm.write("Filtering time: {}".format(t3 - t2))
        tqdm.write("Raw features: {}".format(raw_features.shape[1]))
        tqdm.write("Non-constant features: {}".format(varying_features.shape[1]))
        tqdm.write("Final filtered features: {}".format(selected.shape[1]))

        return selected
Ejemplo n.º 5
0
 def transform(self, X, y=None):
     """Convert the ``price`` column of ``X`` into a forecasting frame and target.

     The incoming ``y`` argument is not used; the returned target comes from
     ``make_forecasting_frame``. The maximum time shift is obtained from
     ``self.determine_timeshift_count(X)``.

     Returns:
         tuple: the two values returned by ``make_forecasting_frame``.
     """
     timeshift = self.determine_timeshift_count(X)
     frame, target = make_forecasting_frame(
         X["price"],
         kind="price",
         max_timeshift=timeshift,
         rolling_direction=1,
     )
     return frame, target
Ejemplo n.º 6
0
    def test_make_forecasting_frame_pdSeries(self):
        """An hourly-indexed Series keeps its timestamps in ``id``, ``time`` and the target index."""
        source = pd.Series(data=range(4),
                           index=pd.date_range('1/1/2011', periods=4, freq='H'))
        df, y = dataframe_functions.make_forecasting_frame(
            x=source, kind="test", max_timeshift=1, rolling_direction=1)

        # Timestamps of the predicted points and of the observations used for them.
        target_stamps = [
            "2011-01-01 01:00:00", "2011-01-01 02:00:00", "2011-01-01 03:00:00"
        ]
        window_stamps = [
            "2011-01-01 00:00:00", "2011-01-01 01:00:00", "2011-01-01 02:00:00"
        ]

        expected_y = pd.Series(data=[1, 2, 3],
                               index=pd.DatetimeIndex(target_stamps),
                               name="value")
        expected_df = pd.DataFrame({
            "id": pd.DatetimeIndex(target_stamps),
            "kind": ["test"] * 3,
            "value": [0., 1., 2.],
            "time": pd.DatetimeIndex(window_stamps),
        })

        # Column order is normalised on both sides before comparing.
        assert_frame_equal(df.sort_index(axis=1),
                           expected_df.sort_index(axis=1))
        assert_series_equal(y, expected_y)
def main(input_path='/home/velaraptor/Downloads/Raw Data 10yrs (2018).xlsx',
         output_path='features_time_series.csv'):
    """Extract tsfresh features per ``Name`` group and write them to a CSV file.

    The spreadsheet is read with its second row as header, missing values are
    replaced by 0, and the rows are grouped by the ``Name`` column. For every
    group with more than one row, the ``FantPt`` column (indexed by ``Year``)
    is rolled into a forecasting frame, features are extracted, and the group
    name is attached in a ``Name`` column. All per-group feature frames are
    concatenated and written without the index.

    Args:
        input_path: Excel file to read. Defaults to the originally hard-coded path.
        output_path: CSV file to write. Defaults to the originally hard-coded name.

    Raises:
        ValueError: if no group has more than one row, so there is nothing to write.
    """
    files = pd.read_excel(input_path, header=1).fillna(0)

    # Keep the group name next to its frame so it can be attached directly later,
    # instead of recovering it through index alignment with the 'kind' column.
    named_frames = []
    for name, group in tqdm.tqdm(files.groupby('Name')):
        if len(group) <= 1:
            continue
        group.index = group.Year
        df_shift, _ = make_forecasting_frame(group["FantPt"],
                                             kind=name,
                                             max_timeshift=10,
                                             rolling_direction=1)
        named_frames.append((name, df_shift))

    features_df = []
    for name, sample in tqdm.tqdm(named_frames):
        X = extract_features(sample,
                             column_id="id",
                             column_sort="time",
                             column_value="value",
                             impute_function=impute,
                             show_warnings=False,
                             disable_progressbar=True,
                             default_fc_parameters=EfficientFCParameters())
        X = X.reset_index()
        # Every row of this frame belongs to the same group.
        X['Name'] = name
        features_df.append(X)

    if not features_df:
        raise ValueError(
            "No group in the 'Name' column has more than one row; nothing to extract")

    features_time_series = pd.concat(features_df)
    features_time_series.to_csv(output_path, index=False)
Ejemplo n.º 8
0
def fit_rolling_auto_sklearn(y_train,
                             max_timeshift=10,
                             rolling_direction=1,
                             params=None,
                             my_dict_of_features=None):
    """Fit an auto-sklearn regressor on tsfresh features plus lagged values.

    The design matrix combines the tsfresh features extracted from rolling
    windows of ``y_train`` with the series shifted by 1, 2, 3 and 12 steps.

    Args:
        y_train: one-dimensional training series.
        max_timeshift: maximum window length passed to ``make_forecasting_frame``.
        rolling_direction: rolling direction passed to ``make_forecasting_frame``.
        params: unused; kept so existing callers keep working.
        my_dict_of_features: tsfresh feature settings passed as
            ``default_fc_parameters``.

    Returns:
        tuple: ``(automl, last_exog, predict_in_sample)``: the fitted model,
        the single feature row built from the last feature and lag rows, and
        the model's predictions on the training design matrix.

    Note:
        ``tmp_folder`` and ``output_folder`` are read from the enclosing module.
    """
    lags = (1, 2, 3, 12)

    # One extra trailing slot so that the last row holds the lags for the
    # first point after the end of y_train.
    padded = np.concatenate([y_train, [0]])
    exog_lag = np.hstack(
        [shift(padded, shift=lag, cval=0.0).reshape(-1, 1) for lag in lags])

    df_shift, _ = make_forecasting_frame(y_train,
                                         kind="price",
                                         max_timeshift=max_timeshift,
                                         rolling_direction=rolling_direction)
    features = extract_features(df_shift,
                                column_id="id",
                                column_sort="time",
                                column_value="value",
                                impute_function=impute,
                                show_warnings=False,
                                default_fc_parameters=my_dict_of_features,
                                disable_progressbar=True)
    X_train = np.array(features.dropna(axis=1))

    ts = y_train[2:]
    exog = np.hstack((X_train[:-1], exog_lag[2:-1]))
    last_exog = np.concatenate([X_train[-1], exog_lag[-1]]).reshape(1, -1)

    # One entry per design-matrix column; the previous fixed count of 8 was
    # only right when the feature settings produced exactly four features.
    feature_types = ['numerical'] * exog.shape[1]

    automl = autosklearn.regression.AutoSklearnRegressor(
        time_left_for_this_task=1200,
        per_run_time_limit=120,
        ml_memory_limit=2048,
        tmp_folder=tmp_folder,
        output_folder=output_folder,
        initial_configurations_via_metalearning=0,
    )
    automl.fit(exog,
               ts,
               dataset_name='airlines',
               feat_type=feature_types,
               metric=autosklearn.metrics.mean_squared_error)

    predict_in_sample = automl.predict(exog)

    print(automl.show_models())
    print('\nStatistics: \n', automl.sprint_statistics())

    return automl, last_exog, predict_in_sample
Ejemplo n.º 9
0
 def test_make_forecasting_frame_range(self):
     """A NumPy range input yields tuple ids of the form ``("id", n)``."""
     frame, target = dataframe_functions.make_forecasting_frame(
         x=np.arange(4),
         kind="test",
         max_timeshift=1,
         rolling_direction=1,
     )

     tuple_ids = [("id", n) for n in np.arange(1, 4)]
     wanted_frame = pd.DataFrame({
         "id": tuple_ids,
         "kind": ["test"] * 3,
         "value": np.arange(3),
         "time": [0, 1, 2],
     })
     wanted_target = pd.Series(data=[1, 2, 3],
                               index=[("id", 1), ("id", 2), ("id", 3)],
                               name="value")

     # The produced frame's row index is discarded before comparing.
     normalised = frame.sort_index(axis=1).reset_index(drop=True)
     assert_frame_equal(normalised, wanted_frame.sort_index(axis=1))
     assert_series_equal(target, wanted_target)
Ejemplo n.º 10
0
def predict_rolling(model,
                    last_exog,
                    y_train,
                    forecast_horizont,
                    max_timeshift=10,
                    rolling_direction=1,
                    my_dict_of_features=None):
    """Forecast the next ``forecast_horizont`` values one step at a time.

    The first value is predicted from ``last_exog``. Every later step appends
    the previous prediction to the series, rebuilds the tsfresh features and
    the lag values (1, 2, 3 and 12 steps back) for the newest point, and
    predicts from that single row.

    Args:
        model: fitted estimator exposing ``predict``.
        last_exog: feature row (shape ``(1, n_features)``) for the first step.
        y_train: one-dimensional series observed so far; it is not modified.
        forecast_horizont: number of steps to forecast.
        max_timeshift: maximum window length passed to ``make_forecasting_frame``.
        rolling_direction: rolling direction passed to ``make_forecasting_frame``.
        my_dict_of_features: tsfresh feature settings passed as
            ``default_fc_parameters``.

    Returns:
        numpy.ndarray: array of length ``forecast_horizont`` with the forecasts
        (empty when ``forecast_horizont`` is below 1).
    """
    lags = (1, 2, 3, 12)
    predictions = np.empty(forecast_horizont)
    if forecast_horizont < 1:
        return predictions

    # ravel()[0] accepts both scalar and length-one array predictions.
    predictions[0] = np.ravel(model.predict(last_exog))[0]

    for step in range(1, forecast_horizont):
        y_train = np.append(y_train, predictions[step - 1])

        # Only the lags of the newest point are needed, i.e. the last element
        # of each shifted copy of the zero-padded series.
        padded = np.concatenate([y_train, [0]])
        latest_lags = np.array(
            [shift(padded, shift=lag, cval=0.0)[-1] for lag in lags])

        df_shift, _ = make_forecasting_frame(
            y_train,
            kind="price",
            max_timeshift=max_timeshift,
            rolling_direction=rolling_direction)

        features = extract_features(df_shift,
                                    default_fc_parameters=my_dict_of_features,
                                    column_id="id",
                                    column_sort="time",
                                    disable_progressbar=True,
                                    column_value="value",
                                    impute_function=impute,
                                    show_warnings=False)
        features = features.dropna(axis=1)

        exog = np.concatenate([np.array(features)[-1],
                               latest_lags]).reshape(1, -1)

        # Store every step inside the loop; storing after the loop left the
        # intermediate slots uninitialised and fed garbage into later steps.
        predictions[step] = np.ravel(model.predict(exog))[0]

    return predictions
Ejemplo n.º 11
0
 def test_make_forecasting_frame_range(self):
     """Check the frame built from ``np.arange(4)``; the target series is not inspected."""
     frame, _ = dataframe_functions.make_forecasting_frame(
         x=np.arange(4), kind="test", max_timeshift=1, rolling_direction=1)

     wanted = pd.DataFrame({
         "id": [1, 2, 3],
         "kind": ["test"] * 3,
         "value": [0., 1., 2.],
         "time": [0., 1., 2.],
     })

     # Sort columns on both sides so the comparison ignores column order.
     actual_sorted = frame.sort_index(axis=1)
     wanted_sorted = wanted.sort_index(axis=1)
     assert_frame_equal(actual_sorted, wanted_sorted)
Ejemplo n.º 12
0
    def test_make_forecasting_frame_pdSeries(self):
        """Verify frame and target for a Series with an hourly ``DatetimeIndex``."""
        t_index = pd.date_range('1/1/2011', periods=4, freq='H')
        hourly_series = pd.Series(data=range(4), index=t_index)
        df, y = dataframe_functions.make_forecasting_frame(
            x=hourly_series,
            kind="test",
            max_timeshift=1,
            rolling_direction=1,
        )

        # Hours 01-03 are the predicted points; hours 00-02 are the observations.
        predicted_at = pd.DatetimeIndex(
            ["2011-01-01 01:00:00", "2011-01-01 02:00:00", "2011-01-01 03:00:00"])
        observed_at = pd.DatetimeIndex(
            ["2011-01-01 00:00:00", "2011-01-01 01:00:00", "2011-01-01 02:00:00"])

        expected_y = pd.Series(data=[1, 2, 3], index=predicted_at, name="value")
        expected_df = pd.DataFrame({
            "id": predicted_at,
            "kind": ["test"]*3,
            "value": [0., 1., 2.],
            "time": observed_at,
        })

        assert_frame_equal(df.sort_index(axis=1), expected_df.sort_index(axis=1))
        assert_series_equal(y, expected_y)
Ejemplo n.º 13
0
 def test_make_forecasting_frame_range(self):
     """Compare the frame built from ``np.arange(4)`` with the expected rows."""
     result, _unused_target = dataframe_functions.make_forecasting_frame(
         x=np.arange(4),
         kind="test",
         max_timeshift=1,
         rolling_direction=1,
     )
     reference = pd.DataFrame({
         "id": [1, 2, 3],
         "kind": ["test"]*3,
         "value": [0., 1., 2.],
         "time": [0., 1., 2.],
     })
     # Column order is normalised before the comparison.
     assert_frame_equal(result.sort_index(axis=1), reference.sort_index(axis=1))
Ejemplo n.º 14
0
    def predict(
        self,
        forecast_length: int,
        future_regressor=None,
        just_point_forecast: bool = False,
    ):
        """Generates forecast data immediately following dates of index supplied to .fit()

        tsfresh features are extracted from rolling windows of every training
        column, a regressor is fitted on them, and the forecast is produced one
        step at a time, appending each prediction to the data before the next
        step.

        Args:
            forecast_length (int): Number of periods of data to forecast ahead
            future_regressor: accepted for interface compatibility; this method does not read it
            just_point_forecast (bool): If True, return a pandas.DataFrame of just point forecasts

        Returns:
            Either a PredictionObject of forecasts and metadata, or
            if just_point_forecast == True, a dataframe of point forecasts

        Raises:
            ImportError: if tsfresh is not available.
        """
        if not _has_tsfresh:
            raise ImportError("Package tsfresh is required")
        predictStartTime = datetime.datetime.now()

        from tsfresh.utilities.dataframe_functions import make_forecasting_frame
        from tsfresh.utilities.dataframe_functions import impute as tsfresh_impute

        max_timeshift = self.max_timeshift
        regression_model = self.regression_model
        feature_selection = self.feature_selection

        def build_features(frame):
            """Return (features, targets) for all columns of ``frame``, side by side.

            Feature columns are prefixed with the position of their source
            column so that names stay unique across columns.
            """
            feature_parts = []
            target_parts = []
            for position, column in enumerate(frame.columns):
                df_shift, current_y = make_forecasting_frame(
                    frame[column],
                    kind="time_series",
                    max_timeshift=max_timeshift,
                    rolling_direction=1,
                )
                current_X = extract_features(
                    df_shift,
                    column_id="id",
                    column_sort="time",
                    column_value="value",
                    impute_function=tsfresh_impute,
                    show_warnings=False,
                    default_fc_parameters=EfficientFCParameters(),
                    n_jobs=1,
                )
                current_X["feature_last_value"] = current_y.shift(1)
                prefix = str(position) + '_'
                current_X = current_X.rename(
                    columns=lambda name, prefix=prefix: prefix + name)
                feature_parts.append(current_X)
                target_parts.append(current_y)
            if not feature_parts:
                return pd.DataFrame(), pd.DataFrame()
            return (pd.concat(feature_parts, axis=1),
                    pd.concat(target_parts, axis=1))

        sktraindata = self.df_train.copy()
        X, y = build_features(sktraindata)

        # drop constant features
        X = X.loc[:, X.apply(pd.Series.nunique) != 1]
        X = X.replace([np.inf, -np.inf], np.nan)
        X = X.fillna(0)
        y = y.ffill().bfill()

        # Feature selection keeps X a labeled DataFrame (via get_support) so
        # that the same columns can be picked from the forecast-time features.
        if feature_selection == 'Variance':
            from sklearn.feature_selection import VarianceThreshold

            sel = VarianceThreshold(threshold=(0.15))
            sel.fit(X)
            X = X.loc[:, sel.get_support()]
        elif feature_selection == 'Percentile':
            from sklearn.feature_selection import SelectPercentile, chi2

            sel = SelectPercentile(chi2, percentile=20)
            sel.fit(X, y[y.columns[0]])
            X = X.loc[:, sel.get_support()]
        elif feature_selection == 'DecisionTree':
            from sklearn.tree import DecisionTreeRegressor
            from sklearn.feature_selection import SelectFromModel

            clf = DecisionTreeRegressor()
            clf = clf.fit(X, y)
            model = SelectFromModel(clf, prefit=True)
            X = X.loc[:, model.get_support()]
        elif feature_selection == 'Lasso':
            from sklearn.linear_model import MultiTaskLasso
            from sklearn.feature_selection import SelectFromModel

            clf = MultiTaskLasso(max_iter=2000)
            clf = clf.fit(X, y)
            model = SelectFromModel(clf, prefit=True)
            X = X.loc[:, model.get_support()]

        # Drop first line
        X = X.iloc[1:, ]
        y = y.iloc[1:]

        y = y.ffill().bfill()

        index = self.create_forecast_index(forecast_length=forecast_length)

        if regression_model == 'ElasticNet':
            from sklearn.linear_model import MultiTaskElasticNet

            regr = MultiTaskElasticNet(alpha=1.0,
                                       random_state=self.random_seed)
        elif regression_model == 'DecisionTree':
            from sklearn.tree import DecisionTreeRegressor

            regr = DecisionTreeRegressor(random_state=self.random_seed)
        elif regression_model == 'MLP':
            from sklearn.neural_network import MLPRegressor

            # relu/tanh lbfgs/adam layer_sizes (100) (10)
            regr = MLPRegressor(
                hidden_layer_sizes=(10, 25, 10),
                verbose=self.verbose_bool,
                max_iter=200,
                activation='tanh',
                solver='lbfgs',
                random_state=self.random_seed,
            )
        elif regression_model == 'KNN':
            from sklearn.multioutput import MultiOutputRegressor
            from sklearn.neighbors import KNeighborsRegressor

            # KNeighborsRegressor takes no random_state argument.
            regr = MultiOutputRegressor(KNeighborsRegressor())
        elif regression_model == 'Adaboost':
            from sklearn.multioutput import MultiOutputRegressor
            from sklearn.ensemble import AdaBoostRegressor

            regr = MultiOutputRegressor(AdaBoostRegressor(
                n_estimators=200))  # , random_state=self.random_seed))
        else:
            # Any other value falls back to a random forest.
            from sklearn.ensemble import RandomForestRegressor

            regr = RandomForestRegressor(random_state=self.random_seed,
                                         n_estimators=1000,
                                         verbose=self.verbose)

        regr.fit(X, y)

        combined_index = self.df_train.index.append(index)
        # Integer column labels so that prediction frames line up on concat.
        sktraindata.columns = list(range(len(sktraindata.columns)))

        step_forecasts = []
        for _ in range(forecast_length):
            # Only the most recent rows are needed for the newest feature row.
            x_dat, _ = build_features(sktraindata.tail(max_timeshift))
            x_dat = x_dat[X.columns]
            rfPred = pd.DataFrame(regr.predict(x_dat.tail(1).values))

            step_forecasts.append(rfPred)
            sktraindata = pd.concat([sktraindata, rfPred],
                                    axis=0,
                                    ignore_index=True)
            sktraindata.index = combined_index[:len(sktraindata.index)]

        if step_forecasts:
            forecast = pd.concat(step_forecasts, axis=0, ignore_index=True)
        else:
            forecast = pd.DataFrame()
        forecast.columns = self.column_names
        forecast.index = index

        if just_point_forecast:
            return forecast

        upper_forecast, lower_forecast = Point_to_Probability(
            self.df_train,
            forecast,
            prediction_interval=self.prediction_interval)

        predict_runtime = datetime.datetime.now() - predictStartTime
        prediction = PredictionObject(
            model_name=self.name,
            forecast_length=forecast_length,
            forecast_index=forecast.index,
            forecast_columns=forecast.columns,
            lower_forecast=lower_forecast,
            forecast=forecast,
            upper_forecast=upper_forecast,
            prediction_interval=self.prediction_interval,
            predict_runtime=predict_runtime,
            fit_runtime=self.fit_runtime,
            model_parameters=self.get_params(),
        )
        return prediction
Ejemplo n.º 15
0
def extract_tsfresh_features(timeseries, window_size, threshold):
    """Extract, clean and select tsfresh features from rolling windows of a series.

    Steps: build rolling windows, extract the ``EfficientFCParameters``
    feature set, skip the first ``window_size - 1`` rows, drop columns that
    are entirely NaN or whose NaN fraction exceeds ``threshold``, drop
    constant columns, replace infinities and remaining NaNs by the column
    mean, and run ``select_features`` against the target.

    Args:
        timeseries: input series passed to ``make_forecasting_frame``.
        window_size: maximum window length; also sets how many leading rows
            are skipped.
        threshold: maximum tolerated fraction of NaN values per feature column.

    Returns:
        tuple: ``(features, False)`` when at least one feature survives
        selection, otherwise ``(None, True)``.
    """
    # tsfresh make_forecasting_frame rolling window
    df_shift, y = make_forecasting_frame(timeseries,
                                         kind="x",
                                         max_timeshift=window_size,
                                         rolling_direction=1)

    # calculate all features
    all_features = extract_features(df_shift,
                                    column_id="id",
                                    column_sort="time",
                                    column_value="value",
                                    default_fc_parameters=EfficientFCParameters(),
                                    impute_function=None,
                                    disable_progressbar=True,
                                    show_warnings=False,
                                    n_jobs=8)

    # Skip the first window_size - 1 rows (presumably the ones built from
    # windows shorter than window_size).
    all_features = all_features.iloc[window_size - 1:]
    y = y.iloc[window_size - 1:]

    # drop columns which are all NaN
    all_features = all_features.dropna(axis=1, how='all')

    # Drop columns whose NaN fraction exceeds the threshold. A boolean mask
    # avoids positional lookups on the label-indexed fraction Series.
    nan_fraction = (all_features.shape[0] -
                    all_features.count()) / all_features.shape[0]
    all_features = all_features.loc[:, ~(nan_fraction > threshold)]

    # drop constant features
    all_features = all_features.loc[:,
                                    all_features.apply(pd.Series.nunique) != 1]

    # replace() returns a new frame; the result has to be kept, otherwise the
    # infinities survive and distort the column means used below.
    all_features = all_features.replace([np.inf, -np.inf], np.nan)
    # A column that held only infinities is now all NaN and has no mean.
    all_features = all_features.dropna(axis=1, how='all')
    if all_features.isnull().values.any():
        all_features = all_features.fillna(all_features.mean())

    # filter out not important features
    all_features = select_features(all_features, y)

    kind_to_fc_parameters = tsfresh.feature_extraction.settings.from_columns(
        all_features)
    if not kind_to_fc_parameters:
        return None, True

    # Re-extract on a few rows with the derived settings and keep only the
    # columns that extraction actually reproduces.
    temp = extract_features(df_shift.iloc[:3, :],
                            column_id="id",
                            column_sort="time",
                            column_value="value",
                            kind_to_fc_parameters=kind_to_fc_parameters,
                            impute_function=None,
                            disable_progressbar=True,
                            show_warnings=False,
                            n_jobs=8)

    # Index.intersection is the explicit set operation; `&` on Index objects
    # is no longer a set intersection in current pandas.
    all_features = all_features[temp.columns.intersection(
        all_features.columns)]

    return all_features, False
def split_into_train_test_out_tsfresh(data, in_num):
    """Build train/test tsfresh feature sets from a series rotated by one step.

    ``data`` is rotated left by one position with ``np.roll``, turned into
    rolling windows of at most ``in_num`` steps, and the
    ``EfficientFCParameters`` features are extracted. Only a fixed list of
    40 feature columns is kept.

    Returns:
        tuple: ``(x_train, y_train, x_test)``. ``x_test`` is the last feature
        row; ``x_train`` and ``y_train`` are all feature rows and targets
        before the last one.
    """
    rotated = np.roll(data, -1)

    # Rolling windows and their targets.
    windows, targets = make_forecasting_frame(
        rotated,
        kind="price",
        max_timeshift=in_num,
        rolling_direction=1,
    )

    features = extract_features(
        windows,
        column_id="id",
        column_sort="time",
        column_value="value",
        impute_function=impute,
        show_warnings=False,
        disable_progressbar=False,
        n_jobs=5,
        chunksize=1,
        default_fc_parameters=EfficientFCParameters(),
    )

    # The subset of tsfresh output columns that is kept.
    selected_columns = [
        # autocorrelation / entropy
        'value__agg_autocorrelation__f_agg_"mean"',
        'value__agg_autocorrelation__f_agg_"median"',
        'value__agg_autocorrelation__f_agg_"var"',
        'value__autocorrelation__lag_0',
        'value__autocorrelation__lag_1',
        'value__autocorrelation__lag_2',
        'value__binned_entropy__max_bins_10',
        # Fourier features
        'value__fft_aggregated__aggtype_"variance"',
        'value__fft_coefficient__coeff_0__attr_"abs"',
        'value__fft_coefficient__coeff_0__attr_"real"',
        'value__fft_coefficient__coeff_1__attr_"abs"',
        'value__fft_coefficient__coeff_1__attr_"angle"',
        'value__fft_coefficient__coeff_1__attr_"imag"',
        'value__fft_coefficient__coeff_1__attr_"real"',
        'value__first_location_of_maximum',
        # large standard deviation flags
        'value__large_standard_deviation__r_0.05',
        'value__large_standard_deviation__r_0.1',
        'value__large_standard_deviation__r_0.15000000000000002',
        'value__large_standard_deviation__r_0.2',
        'value__large_standard_deviation__r_0.25',
        # trend, strikes and summary statistics
        'value__linear_trend__attr_"intercept"',
        'value__linear_trend__attr_"pvalue"',
        'value__linear_trend__attr_"rvalue"',
        'value__linear_trend__attr_"slope"',
        'value__longest_strike_above_mean',
        'value__longest_strike_below_mean',
        'value__max_langevin_fixed_point__m_3__r_30',
        'value__maximum',
        'value__mean',
        'value__mean_abs_change',
        'value__mean_change',
        'value__median',
        'value__minimum',
        'value__number_cwt_peaks__n_5',
        'value__partial_autocorrelation__lag_0',
        'value__partial_autocorrelation__lag_1',
        'value__partial_autocorrelation__lag_2',
        'value__standard_deviation',
        'value__sum_values',
        'value__variance',
    ]
    kept = features[selected_columns]

    # Last row is held out as the test sample; everything before it is training.
    return kept[:-1], targets[:-1], kept[-1:]
Ejemplo n.º 17
0
def get_tsfresh_features(df=None, max_timeshift=10, n_jobs=10,
                         cols_to_calc=None):
    """Extract rolling-window tsfresh features for a set of columns.

    Missing values are forward-filled, the first two index levels of ``df``
    are dropped, and each selected column is rolled into a forecasting frame
    from which features are extracted. The per-column feature frames are
    stacked with the source column name as the outer index level, and every
    feature column name gets the suffix ``_p<max_timeshift>``.

    Args:
        df: input frame. Despite the ``None`` default it is required; it must
            have at least two index levels.
        max_timeshift: maximum window length. Values above 10 switch to the
            larger feature set (skewness, kurtosis, quantiles, trend slope,
            FFT aggregates); otherwise mean/max/min and change-based features
            are used.
        n_jobs: number of processes passed to ``extract_features``.
        cols_to_calc: columns to process. Defaults to the built-in list of
            rougher / cleaner columns.

    Returns:
        pandas.DataFrame: stacked features for all processed columns.
    """
    from tsfresh.utilities.dataframe_functions import make_forecasting_frame
    from tsfresh.utilities.dataframe_functions import impute
    from tsfresh.feature_extraction import extract_features
    import pandas as pd

    if max_timeshift > 10:
        fc_parameters = {
            'skewness': None,
            'kurtosis': None,
            'quantile': [{'q': 0.05}, {'q': 0.95}],
            'linear_trend': [{'attr': 'slope'}],
            'mean_abs_change': None,
            'mean_second_derivative_central': None,
            'fft_aggregated': [
                {'aggtype': "centroid"},
                {'aggtype': "variance"},
                {'aggtype': "skew"},
                {'aggtype': "kurtosis"},
            ],
        }
    else:
        fc_parameters = {
            'mean': None,
            'maximum': None,
            'minimum': None,
            'mean_abs_change': None,
            'mean_second_derivative_central': None,
        }

    if cols_to_calc is None:
        cols_to_calc = [
            "rougher.input.feed_fe", "rougher.input.feed_zn",
            "rougher.input.feed_sol", "rougher.input.feed_pb",
            "rougher.input.feed_rate", 'rougher.input.floatbank11_xanthate',
            'rougher.input.floatbank10_copper_sulfate',
            'rougher.state.floatbank10_b_air',
            "secondary_cleaner.state.floatbank5_a_air",
            "primary_cleaner.input.copper_sulfate",
            "primary_cleaner.state.floatbank8_a_air",
            "primary_cleaner.input.depressant",
            "primary_cleaner.input.feed_size",
            "primary_cleaner.input.xanthate",
        ]

    # DataFrame.ffill() replaces the deprecated fillna(method='ffill').
    df = df.ffill()
    df_tsfresh = df.reset_index(level=[0, 1], drop=True)

    dfs = {}
    for c in cols_to_calc:
        print(f'Working on {c}...')
        df_shift, _ = make_forecasting_frame(df_tsfresh[c],
                                             kind="price",
                                             max_timeshift=max_timeshift,
                                             rolling_direction=1)
        dfs[c] = extract_features(df_shift,
                                  column_id="id",
                                  column_sort="time",
                                  column_value="value",
                                  impute_function=impute,
                                  show_warnings=False,
                                  default_fc_parameters=fc_parameters,
                                  n_jobs=n_jobs)

    df_tsfresh_feats = pd.concat(dfs, keys=list(dfs.keys()))
    df_tsfresh_feats.columns = [
        f'{i}_p{max_timeshift}' for i in df_tsfresh_feats.columns
    ]
    return df_tsfresh_feats
Ejemplo n.º 18
0
import pandas as pd
import tsfresh
from tsfresh.utilities.dataframe_functions import make_forecasting_frame

# df = pd.read_csv('../data/input/international-airline-passengers.csv', index_col=0)
# df.index = range(len(df))

# Minimal demonstration of make_forecasting_frame on a four-point series.
df = pd.DataFrame.from_dict({'y': [1, 2, 3, 4]})
print(df)

# Roll the 'y' column with a maximum window of one step.
x, y = make_forecasting_frame(x=df['y'],
                              kind='ts',
                              max_timeshift=1,
                              rolling_direction=1)

# First the rolled frame, then the matching target series.
print(x)

print(y)
Ejemplo n.º 19
0
#        [df] + [df.shift(i).rename(columns=lambda c: "{}_lag_{}".format(c, i))
#                for i in range(1, max_lags)], axis=1).dropna()


# rolled = roll_time_series(df,
#                          column_id='id',
#                          column_sort='timestamp',
#                          column_kind='kind',
#                          rolling_direction=1,
#                          max_timeshift=2)
# Scale the 'value' column with MinMaxScaler. The scaler needs a 2-D input,
# hence the reshape to a single column; [:, 0] flattens the result again.
# NOTE(review): `df` and MinMaxScaler come from outside this fragment;
# presumably sklearn.preprocessing.MinMaxScaler - confirm.
scaled_value = MinMaxScaler().fit_transform(
    df.value.values.reshape(-1, 1))[:, 0]
# Re-wrap the scaled array as a Series with the original index and name.
scaled_value = pd.Series(scaled_value, index=df.index, name=df.value.name)

# Build the rolling-window frame (at most 10 steps per window) and its target.
df_shift, y = make_forecasting_frame(scaled_value,
                                     kind="kind",
                                     max_timeshift=10,
                                     rolling_direction=1)

# Disabled on purpose: the `if False` guard means this timed feature
# extraction never runs.
if False:
    extract_start = time.time()
    X = extract_features(df_shift,
                         column_id="id",
                         column_sort="time",
                         column_value="value",
                         impute_function=impute,
                         n_jobs=8,
                         show_warnings=False)
    extract_end = time.time()
    print("Extraction time: {}".format(extract_end - extract_start))
    # Number of feature columns produced by the extraction.
    raw_feat_num = X.shape[1]
    print("Extracted {} features.".format(raw_feat_num))