def transform(self, X: dt.Frame):
    """
    Use the fitted ARIMA models (one per time group) to predict the target.

    If ``self.is_train`` exists, in-sample predictions are produced. Otherwise
    the models forecast ``self.pred_gap`` + number-of-unique-dates periods and
    the first ``self.pred_gap`` forecasts are discarded.

    Predictions are produced by the average model (``average_pred``) and by each
    per-group model (``<group column>_pred``); missing group predictions fall
    back to the average prediction. All predictions are finally passed through
    the inverse transform of the matching scaler.

    :param X: Datatable Frame containing the features
    :return: pandas DataFrame of ARIMA predictions (``average_pred`` first,
        then one ``<group column>_pred`` column per time group column)
    """
    logger = self._get_experiment_logger()
    # Presence of the attribute (not its value) selects in-sample mode
    in_sample = hasattr(self, 'is_train')

    # 0. Preliminary steps
    tgc_wo_time = list(np.setdiff1d(self.tgc, self.time_column))
    X = X[:, self.tgc].to_pandas()
    # Fill NaNs or None
    X = X.replace([None, np.nan], 0)
    X.rename(columns={self.time_column: "ds"}, inplace=True)
    X['ds'] = pd.to_datetime(
        X['ds'], format=self.datetime_formats[self.time_column])

    # 1. Predict with average model
    if self.avg_model is not None:
        X_time = X[['ds']].groupby('ds').first().reset_index()
        if in_sample:
            yhat = self.avg_model.predict_in_sample()
        else:
            yhat = self.avg_model.predict(
                n_periods=self.pred_gap + X_time.shape[0])
            # Skip the prediction gap, as done for the per-group models below
            yhat = yhat[self.pred_gap:]

        # Assign predictions in chronological order, then restore row order.
        # np.asarray drops any index the model output may carry so that the
        # assignment is positional rather than index-aligned.
        X_time.sort_values('ds', inplace=True)
        X_time['yhat'] = np.asarray(yhat)
        X_time.sort_index(inplace=True)

        # Merge back the average prediction to all similar timestamps
        # pd.merge resets the index, so keep it and put it back afterwards
        indices = X.index
        X = pd.merge(left=X, right=X_time[['ds', 'yhat']], on='ds', how='left')
        X.index = indices
    else:
        X['yhat'] = np.nan

    y_avg_model = X['yhat'].values
    y_predictions = pd.DataFrame(y_avg_model, columns=['average_pred'])

    # 2. Predict for individual group
    for grp_col in tgc_wo_time:
        X_groups = X[['ds', grp_col]].groupby(grp_col)
        nb_groups = len(X_groups)
        log_every = max(1, nb_groups // 20)
        dfs = []
        for _i_g, (key, X_grp) in enumerate(X_groups):
            # Just say where we are in the transform process
            if (_i_g + 1) % log_every == 0:
                loggerinfo(
                    logger,
                    "Auto ARIMA : %d%% of groups transformed" % (100 * (_i_g + 1) // nb_groups))

            grp_hash = self.get_hash(grp_col, key)
            try:
                model = self.models[grp_hash]
            except KeyError:
                model = None

            if model is None:
                # No model for this group: leave NaN, filled with the average later
                X_grp = X_grp.copy()
                X_grp['yhat'] = np.nan
                dfs.append(X_grp['yhat'])
                continue

            # Unique dates of the group in chronological order
            # ('ds' is already datetime, it was converted on the full frame)
            X_time = X_grp[['ds']].groupby('ds').first().reset_index()
            X_time = X_time.sort_values('ds')

            if in_sample:
                # It can happen that in_sample predictions are smaller than
                # the training set used: remaining dates keep a 0 prediction
                pred = np.asarray(model.predict_in_sample())
                tmp = np.zeros(X_time.shape[0])
                tmp[:len(pred)] = pred
                X_time['yhat'] = tmp
            else:
                # ARIMA forecasts a number of periods ahead, so the prediction
                # gap has to be forecast too and then dropped
                yhat = model.predict(
                    n_periods=self.pred_gap + X_time.shape[0])
                X_time['yhat'] = np.asarray(yhat[self.pred_gap:])

            # Now merge back the predictions into X_grp, keeping its index
            indices = X_grp.index
            X_grp = pd.merge(
                left=X_grp, right=X_time[['ds', 'yhat']], on='ds', how='left')
            X_grp.index = indices
            dfs.append(X_grp['yhat'])

        # Index alignment puts every group's predictions back on its rows
        y_predictions[f'{grp_col}_pred'] = pd.concat(dfs, axis=0)

    # Now we have to invert scale all this
    for grp_col in tgc_wo_time:
        # Add time group to the predictions, will be used to invert scaling
        y_predictions[grp_col] = X[grp_col].copy()
        # Fill NaN with the average model prediction
        y_predictions[f'{grp_col}_pred'] = y_predictions[
            f'{grp_col}_pred'].fillna(y_predictions['average_pred'])

    # Go through groups and recover the scaled target for known groups
    if len(tgc_wo_time) > 0:
        X_groups = y_predictions.groupby(tgc_wo_time)
    else:
        X_groups = [([None], y_predictions)]

    pred_columns = [f'{grp_col}_pred' for grp_col in tgc_wo_time] + ['average_pred']
    for _f in pred_columns:
        inverted_ys = []
        for key, X_grp in X_groups:
            grp_hash = self.get_hash(key)
            # Use the group's own scaler when there is one, else the general one
            if grp_hash in self.scalers:
                scaler = self.scalers[grp_hash]
            else:
                scaler = self.general_scaler
            inverted_y = scaler.inverse_transform(X_grp[[_f]])

            # Put back in a DataFrame to keep track of original index
            inverted_df = pd.DataFrame(inverted_y, columns=[_f])
            inverted_df.index = X_grp.index
            inverted_ys.append(inverted_df)
        y_predictions[_f] = pd.concat(tuple(inverted_ys), axis=0).sort_index()[_f]

    # Group columns were only needed to select the scalers
    y_predictions.drop(tgc_wo_time, axis=1, inplace=True)

    self._output_feature_names = [
        f'{self.display_name}{orig_feat_prefix}{self.time_column}{extra_prefix}{_f}'
        for _f in y_predictions
    ]
    self._feature_desc = self._output_feature_names
    return y_predictions
def transform(self, X: dt.Frame, **kwargs):
    """
    Use the fitted Prophet models to predict the target.

    Two features are produced:

    * ``<display_name>_GrpAvg``: prediction of the general model
      (``self.model``), inverse-scaled with the scaler of each time group
      (or the general scaler when the group has none).
    * ``<display_name>_Top<top_n>Grp`` (only when ``self.top_groups`` is
      set): prediction of the per-group models, computed through a job pool;
      groups outside ``self.top_groups`` or without a model get NaN.

    :param X: Datatable Frame containing the features
    :param kwargs: forwarded to ``self._get_n_jobs``
    :return: pandas DataFrame of FB Prophet predictions
    """
    # Get the logger if it exists
    logger = None
    if self.context and self.context.experiment_id:
        logger = make_experiment_logger(
            experiment_id=self.context.experiment_id,
            tmp_dir=self.context.tmp_dir,
            experiment_tmp_dir=self.context.experiment_tmp_dir
        )

    tmp_folder = self._create_tmp_folder(logger)
    XX_top_groups = None
    try:
        n_jobs = self._get_n_jobs(logger, **kwargs)

        # Reduce X to TGC
        tgc_wo_time = list(np.setdiff1d(self.tgc, self.time_column))
        X = X[:, self.tgc].to_pandas()
        # Fill NaNs or None
        X = X.replace([None, np.nan], 0)
        # Change date feature name to match Prophet requirements
        X.rename(columns={self.time_column: "ds"}, inplace=True)

        # Predict y using unique dates
        X_time = X[['ds']].groupby('ds').first().reset_index()
        with suppress_stdout_stderr():
            y_avg = self.model.predict(X_time)[['ds', 'yhat']]

        # Prophet transforms the date column to datetime so we need to
        # transform that to merge back
        X_time.sort_values('ds', inplace=True)
        X_time['yhat'] = y_avg['yhat']
        X_time.sort_index(inplace=True)

        # Merge back into original frame on 'ds'
        # pd.merge wipes the index ... so keep it to provide it again
        indices = X.index
        X = pd.merge(
            left=X,
            right=X_time[['ds', 'yhat']],
            on='ds',
            how='left'
        )
        X.index = indices

        # Go through groups and recover the scaled target for known groups
        if len(tgc_wo_time) > 0:
            X_groups = X.groupby(tgc_wo_time)
        else:
            X_groups = [([None], X)]

        inverted_ys = []
        for key, X_grp in X_groups:
            grp_hash = self.get_hash(key)
            # Use the group's own scaler when there is one, else the general one
            if grp_hash in self.scalers:
                scaler = self.scalers[grp_hash]
            else:
                scaler = self.general_scaler
            inverted_y = scaler.inverse_transform(X_grp[['yhat']])

            # Put back in a DataFrame to keep track of original index
            inverted_df = pd.DataFrame(inverted_y, columns=['yhat'])
            inverted_df.index = X_grp.index
            inverted_ys.append(inverted_df)

        XX_general = pd.concat(tuple(inverted_ys), axis=0).sort_index()

        if self.top_groups:
            # Go through the groups and predict only the top ones
            XX_paths = []
            model_paths = []

            def processor(out, res):
                out.append(res)

            pool = small_job_pool(
                logger=None, processor=processor,
                num_tasks=len(self.top_groups), max_workers=n_jobs)

            nb_groups = len(X_groups)
            log_every = max(1, nb_groups // 20)
            for _i_g, (key, X_grp) in enumerate(X_groups):
                # Just log where we are in the prediction process
                if (_i_g + 1) % log_every == 0:
                    loggerinfo(logger, "FB Prophet : %d%% of groups predicted" % (100 * (_i_g + 1) // nb_groups))

                grp_hash = self.get_hash(key)
                X_path = os.path.join(tmp_folder, "fbprophet_Xt" + str(uuid.uuid4()))

                # Groups that are not top groups, or have no fitted model,
                # get a NaN placeholder instead of a prediction
                if grp_hash not in self.top_groups or self.grp_models[grp_hash] is None:
                    XX = pd.DataFrame(np.full((X_grp.shape[0], 1), np.nan), columns=['yhat'])
                    XX.index = X_grp.index
                    save_obj(XX, X_path)
                    XX_paths.append(X_path)
                    continue

                model = self.grp_models[grp_hash]
                model_path = os.path.join(tmp_folder, "fbprophet_modelt" + str(uuid.uuid4()))
                save_obj(model, model_path)
                save_obj(X_grp, X_path)
                model_paths.append(model_path)

                args = (model_path, X_path, self.priors[grp_hash], tmp_folder)
                # Separate name so the method's own **kwargs is not shadowed
                task_kwargs = {}
                # The processor appends each task result to XX_paths
                pool.submit_tryget(None, MyParallelProphetTransformer_transform_async,
                                   args=args, kwargs=task_kwargs, out=XX_paths)

            pool.finish()
            XX_top_groups = pd.concat((load_obj(XX_path) for XX_path in XX_paths), axis=0).sort_index()
            for p in XX_paths + model_paths:
                remove(p)
    finally:
        # Always release the temporary folder, even when prediction fails
        self._clean_tmp_folder(logger, tmp_folder)

    features_df = pd.DataFrame()
    features_df[self.display_name + '_GrpAvg'] = XX_general['yhat']
    if self.top_groups:
        features_df[self.display_name + f'_Top{self.top_n}Grp'] = XX_top_groups['yhat']

    self._output_feature_names = list(features_df.columns)
    self._feature_desc = list(features_df.columns)

    return features_df