Example 1
    def transform(self, X: dt.Frame):
        """
        Uses fitted models (1 per time group) to predict the target
        If self.is_train exists, it means we are doing in-sample predictions;
        otherwise ARIMA is used to predict the future
        :param X: Datatable Frame containing the features
        :return: ARIMA predictions
        """
        logger = self._get_experiment_logger()

        # 0. Preliminary steps
        tgc_wo_time = list(np.setdiff1d(self.tgc, self.time_column))
        X = X[:, self.tgc].to_pandas()

        # Fill NaNs or None
        X = X.replace([None, np.nan], 0)

        X.rename(columns={self.time_column: "ds"}, inplace=True)
        X['ds'] = pd.to_datetime(
            X['ds'], format=self.datetime_formats[self.time_column])

        # 1. Predict with average model
        if self.avg_model is not None:
            X_time = X[['ds']].groupby('ds').first().reset_index()
            if hasattr(self, 'is_train'):
                yhat = self.avg_model.predict_in_sample()
            else:
                yhat = self.avg_model.predict(n_periods=self.pred_gap +
                                              X_time.shape[0])
                # Keep predictions in the same order as the dates
                yhat = yhat[self.pred_gap:]

            X_time.sort_values('ds', inplace=True)
            X_time['yhat'] = yhat
            X_time.sort_index(inplace=True)
            # Merge back the average prediction to all similar timestamps
            indices = X.index
            X = pd.merge(left=X,
                         right=X_time[['ds', 'yhat']],
                         on='ds',
                         how='left')
            X.index = indices
        else:
            X['yhat'] = np.nan

        y_avg_model = X['yhat'].values
        y_predictions = pd.DataFrame(y_avg_model, columns=['average_pred'])

        # 2. Predict for each individual group
        # Go through the groups
        for i_tgc, grp_col in enumerate(tgc_wo_time):

            # Group rows by the current time group column
            X_groups = X[['ds', grp_col]].groupby(grp_col)

            nb_groups = len(X_groups)
            dfs = []
            for _i_g, (key, X_grp) in enumerate(X_groups):
                # Just log where we are in the transform process
                if (_i_g + 1) % max(1, nb_groups // 20) == 0:
                    loggerinfo(
                        logger, "Auto ARIMA : %d%% of groups transformed" %
                        (100 * (_i_g + 1) // nb_groups))

                grp_hash = self.get_hash(grp_col, key)
                try:
                    model = self.models[grp_hash]
                except KeyError:
                    model = None

                # Find unique datetime
                X_time = X_grp[['ds']].groupby('ds').first().reset_index()
                X_time['ds'] = pd.to_datetime(
                    X_time['ds'],
                    format=self.datetime_formats[self.time_column])
                X_time = X_time.sort_values('ds')

                if model is not None:
                    # Get predictions from ARIMA model, make sure we include prediction gaps
                    if hasattr(self, 'is_train'):
                        # It can happen that in_sample predictions are smaller than the training set used
                        pred = model.predict_in_sample()
                        tmp = np.zeros(X_time.shape[0])
                        tmp[:len(pred)] = pred
                        X_time['yhat'] = tmp
                    else:
                        # In ARIMA, you provide the number of periods to predict,
                        # so predict pred_gap + horizon periods and drop the gap
                        yhat = model.predict(n_periods=self.pred_gap +
                                             X_time.shape[0])
                        X_time['yhat'] = yhat[self.pred_gap:]

                    # Now merge back the predictions into X_grp
                    indices = X_grp.index
                    X_grp = pd.merge(left=X_grp,
                                     right=X_time[['ds', 'yhat']],
                                     on='ds',
                                     how='left')
                    X_grp.index = indices
                else:
                    X_grp = X_grp.copy()
                    X_grp['yhat'] = np.nan

                dfs.append(X_grp['yhat'])

            y_predictions[f'{grp_col}_pred'] = pd.concat(dfs, axis=0)

        # Now we have to invert the scaling on all these predictions
        for grp_col in tgc_wo_time:
            # Add the time group to the predictions; it will be used to invert the scaling
            y_predictions[grp_col] = X[grp_col].copy()
            # Fill NaNs with the average model prediction
            y_predictions[f'{grp_col}_pred'] = y_predictions[
                f'{grp_col}_pred'].fillna(y_predictions['average_pred'])

        # Go through groups and recover the scaled target for known groups
        if len(tgc_wo_time) > 0:
            X_groups = y_predictions.groupby(tgc_wo_time)
        else:
            X_groups = [([None], y_predictions)]

        for _f in [f'{grp_col}_pred'
                   for grp_col in tgc_wo_time] + ['average_pred']:
            inverted_ys = []
            for key, X_grp in X_groups:
                grp_hash = self.get_hash(key)
                # Invert the target scaling for the current group
                if grp_hash in self.scalers.keys():
                    inverted_y = self.scalers[grp_hash].inverse_transform(
                        X_grp[[_f]])
                else:
                    inverted_y = self.general_scaler.inverse_transform(
                        X_grp[[_f]])

                # Put back in a DataFrame to keep track of original index
                inverted_df = pd.DataFrame(inverted_y, columns=[_f])
                inverted_df.index = X_grp.index
                inverted_ys.append(inverted_df)
            y_predictions[_f] = pd.concat(tuple(inverted_ys),
                                          axis=0).sort_index()[_f]

        y_predictions.drop(tgc_wo_time, axis=1, inplace=True)

        self._output_feature_names = [
            f'{self.display_name}{orig_feat_prefix}{self.time_column}{extra_prefix}{_f}'
            for _f in y_predictions
        ]
        self._feature_desc = self._output_feature_names
        return y_predictions
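
The gap handling above is easier to see in isolation. Below is a minimal, self-contained sketch of the two prediction branches, assuming pmdarima's auto_arima is what sits behind self.avg_model and self.models (the predict_in_sample / n_periods calls suggest this, but the fitting code is not shown here); the series is synthetic.

import numpy as np
import pmdarima as pm

# Synthetic training series, purely for illustration
y_train = np.sin(np.arange(80) / 6.0) + np.random.normal(0, 0.1, 80)
model = pm.auto_arima(y_train, seasonal=False, suppress_warnings=True,
                      error_action="ignore")

# "is_train" branch: predictions over the training period itself
in_sample = model.predict_in_sample()

# Future branch: ARIMA only takes a number of periods, so ask for
# pred_gap + horizon steps and discard the first pred_gap values
pred_gap, horizon = 3, 12
yhat = np.asarray(model.predict(n_periods=pred_gap + horizon))[pred_gap:]

This mirrors why the transformer requests self.pred_gap + X_time.shape[0] periods and then slices yhat[self.pred_gap:].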
Example 2
    def transform(self, X: dt.Frame, **kwargs):
        """
        Uses fitted models (1 per time group) to predict the target
        :param X: Datatable Frame containing the features
        :return: FB Prophet predictions
        """
        # Get the logger if it exists
        logger = None
        if self.context and self.context.experiment_id:
            logger = make_experiment_logger(
                experiment_id=self.context.experiment_id,
                tmp_dir=self.context.tmp_dir,
                experiment_tmp_dir=self.context.experiment_tmp_dir
            )

        tmp_folder = self._create_tmp_folder(logger)

        n_jobs = self._get_n_jobs(logger, **kwargs)

        # Reduce X to TGC
        tgc_wo_time = list(np.setdiff1d(self.tgc, self.time_column))
        X = X[:, self.tgc].to_pandas()

        # Fill NaNs or None
        X = X.replace([None, np.nan], 0)

        # Change date feature name to match Prophet requirements
        X.rename(columns={self.time_column: "ds"}, inplace=True)

        # Predict y using unique dates
        X_time = X[['ds']].groupby('ds').first().reset_index()
        with suppress_stdout_stderr():
            y_avg = self.model.predict(X_time)[['ds', 'yhat']]

        # Prophet converts the date column to datetime, so the merge back has to be done on that converted column
        X_time.sort_values('ds', inplace=True)
        X_time['yhat'] = y_avg['yhat']
        X_time.sort_index(inplace=True)

        # Merge back into original frame on 'ds'
        # pd.merge wipes the index ... so keep it to provide it again
        indices = X.index
        X = pd.merge(
            left=X,
            right=X_time[['ds', 'yhat']],
            on='ds',
            how='left'
        )
        X.index = indices

        # Go through groups and recover the scaled target for known groups
        if len(tgc_wo_time) > 0:
            X_groups = X.groupby(tgc_wo_time)
        else:
            X_groups = [([None], X)]

        inverted_ys = []
        for key, X_grp in X_groups:
            grp_hash = self.get_hash(key)

            # Invert the target scaling for the current group
            if grp_hash in self.scalers.keys():
                inverted_y = self.scalers[grp_hash].inverse_transform(X_grp[['yhat']])
            else:
                inverted_y = self.general_scaler.inverse_transform(X_grp[['yhat']])

            # Put back in a DataFrame to keep track of original index
            inverted_df = pd.DataFrame(inverted_y, columns=['yhat'])
            inverted_df.index = X_grp.index
            inverted_ys.append(inverted_df)

        XX_general = pd.concat(tuple(inverted_ys), axis=0).sort_index()

        if self.top_groups:
            # Go through the groups and predict only the top ones
            XX_paths = []
            model_paths = []

            def processor(out, res):
                out.append(res)
            num_tasks = len(self.top_groups)
            pool_to_use = small_job_pool
            pool = pool_to_use(logger=None, processor=processor, num_tasks=num_tasks, max_workers=n_jobs)

            nb_groups = len(X_groups)
            for _i_g, (key, X_grp) in enumerate(X_groups):

                # Just log where we are in the prediction process
                if (_i_g + 1) % max(1, nb_groups // 20) == 0:
                    loggerinfo(logger, "FB Prophet : %d%% of groups predicted" % (100 * (_i_g + 1) // nb_groups))

                # Build the dict key used to look up this group's model and priors
                grp_hash = self.get_hash(key)
                X_path = os.path.join(tmp_folder, "fbprophet_Xt" + str(uuid.uuid4()))

                if grp_hash not in self.top_groups:
                    XX = pd.DataFrame(np.full((X_grp.shape[0], 1), np.nan), columns=['yhat'])  # group not among the top groups
                    XX.index = X_grp.index
                    save_obj(XX, X_path)
                    XX_paths.append(X_path)
                    continue

                if self.grp_models[grp_hash] is None:
                    XX = pd.DataFrame(np.full((X_grp.shape[0], 1), np.nan), columns=['yhat'])  # no model fitted for this group
                    XX.index = X_grp.index
                    save_obj(XX, X_path)
                    XX_paths.append(X_path)
                    continue

                model = self.grp_models[grp_hash]
                model_path = os.path.join(tmp_folder, "fbprophet_modelt" + str(uuid.uuid4()))
                save_obj(model, model_path)
                save_obj(X_grp, X_path)
                model_paths.append(model_path)

                args = (model_path, X_path, self.priors[grp_hash], tmp_folder)
                kwargs = {}
                pool.submit_tryget(None, MyParallelProphetTransformer_transform_async, args=args, kwargs=kwargs,
                                   out=XX_paths)

            pool.finish()
            XX_top_groups = pd.concat((load_obj(XX_path) for XX_path in XX_paths), axis=0).sort_index()
            for p in XX_paths + model_paths:
                remove(p)

        self._clean_tmp_folder(logger, tmp_folder)

        features_df = pd.DataFrame()
        features_df[self.display_name + '_GrpAvg'] = XX_general['yhat']

        if self.top_groups:
            features_df[self.display_name + f'_Top{self.top_n}Grp'] = XX_top_groups['yhat']

        self._output_feature_names = list(features_df.columns)
        self._feature_desc = list(features_df.columns)

        return features_df
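
For completeness, here is a minimal sketch of the predict-on-unique-dates / merge-back pattern both transformers rely on, assuming the prophet package ("fbprophet" in older releases) and a toy frame; the per-group scalers and the worker pool are left out.

import numpy as np
import pandas as pd
from prophet import Prophet  # from fbprophet import Prophet in older releases

# Toy history with the two columns Prophet expects
df = pd.DataFrame({'ds': pd.date_range('2020-01-01', periods=100, freq='D'),
                   'y': np.random.rand(100)})
model = Prophet().fit(df)

# Score frame: duplicated dates in arbitrary row order, as in the recipe's X
X = pd.concat([df, df]).sample(frac=1.0, random_state=0).reset_index(drop=True)

# Predict only on the unique dates, which is much cheaper than per row
X_time = X[['ds']].groupby('ds').first().reset_index()
yhat = model.predict(X_time)[['ds', 'yhat']]

# pd.merge wipes the index, so keep it and restore it afterwards
indices = X.index
X = pd.merge(left=X, right=yhat, on='ds', how='left')
X.index = indices

The same keep-and-restore index trick is what the indices = X.index / X.index = indices lines in both examples are for.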