def transform(self, X: dt.Frame):
    """Transform features once grouped by Time Group Columns (TGC)"""
    # With col_type set to "all", X can contain text features,
    # so restrict to int, float and bool types.
    # This is easily done in datatable.
    X = X[:, [int, float, bool]]

    # If the filtering leaves no features, just return zero-valued features
    if X.ncols == 0:
        return np.zeros(X.nrows)

    # Move to pandas to use the apply method
    X = X.to_pandas()
    group_cols = [_f for _f in self.tgc if _f != self.time_column]

    # Check if we really have any group columns available
    if len(group_cols) == 0:
        # Apply MACD directly to the available features but drop the time column
        features = [_f for _f in X.columns if _f != self.time_column]
        return self.normalized_macd(X[features])

    # Get the data columns, i.e. neither the group columns nor the time column
    col = np.setdiff1d(X.columns, self.tgc)
    if len(col) > 0:
        # Group by the TGC and apply normalized MACD to the data.
        # pandas.apply is not time efficient, so this should move to datatable.
        res = X.groupby(group_cols)[col].apply(self.normalized_macd)
        res.index = X.index
        return res
    else:
        return np.zeros(X.nrows)
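# NOTE: `normalized_macd` is defined elsewhere in the recipe and is not shown in
# this section. As a rough, hypothetical sketch of the idea (the 12/26 spans and
# the normalization by the slow EMA are illustrative assumptions, not the
# recipe's actual settings):
def normalized_macd_sketch(df: pd.DataFrame, fast: int = 12, slow: int = 26) -> pd.DataFrame:
    """Hypothetical stand-in for the recipe's normalized_macd."""
    ema_fast = df.ewm(span=fast, adjust=False).mean()
    ema_slow = df.ewm(span=slow, adjust=False).mean()
    # Normalizing the MACD line by the slow EMA makes the indicator scale-free,
    # so values stay comparable across time groups
    return (ema_fast - ema_slow) / ema_slow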
def transform(self, X: dt.Frame, **kwargs):
    """
    Uses fitted models (1 per time group) to predict the target
    :param X: Datatable Frame containing the features
    :return: FB Prophet predictions
    """
    # Get the logger if it exists
    logger = None
    if self.context and self.context.experiment_id:
        logger = make_experiment_logger(
            experiment_id=self.context.experiment_id,
            tmp_dir=self.context.tmp_dir,
            experiment_tmp_dir=self.context.experiment_tmp_dir
        )

    tmp_folder = self._create_tmp_folder(logger)

    n_jobs = self._get_n_jobs(logger, **kwargs)

    # Reduce X to TGC
    tgc_wo_time = list(np.setdiff1d(self.tgc, self.time_column))
    X = X[:, self.tgc].to_pandas()

    # Fill NaNs or None
    X = X.replace([None, np.nan], 0)

    # Change date feature name to match Prophet requirements
    X.rename(columns={self.time_column: "ds"}, inplace=True)

    # Predict y using unique dates
    X_time = X[['ds']].groupby('ds').first().reset_index()
    with suppress_stdout_stderr():
        y_avg = self.model.predict(X_time)[['ds', 'yhat']]

    # Prophet converts the date column to datetime, so assign predictions back
    # onto X_time (which keeps the original 'ds' dtype) before merging back
    X_time.sort_values('ds', inplace=True)
    X_time['yhat'] = y_avg['yhat']
    X_time.sort_index(inplace=True)

    # Merge back into original frame on 'ds'
    # pd.merge wipes the index, so keep it to restore it afterwards
    indices = X.index
    X = pd.merge(
        left=X,
        right=X_time[['ds', 'yhat']],
        on='ds',
        how='left'
    )
    X.index = indices

    # Go through groups and recover the scaled target for known groups
    if len(tgc_wo_time) > 0:
        X_groups = X.groupby(tgc_wo_time)
    else:
        X_groups = [([None], X)]

    inverted_ys = []
    for key, X_grp in X_groups:
        grp_hash = self.get_hash(key)

        # Scale target for current group
        if grp_hash in self.scalers:
            inverted_y = self.scalers[grp_hash].inverse_transform(X_grp[['yhat']])
        else:
            inverted_y = self.general_scaler.inverse_transform(X_grp[['yhat']])

        # Put back in a DataFrame to keep track of original index
        inverted_df = pd.DataFrame(inverted_y, columns=['yhat'])
        inverted_df.index = X_grp.index
        inverted_ys.append(inverted_df)

    XX_general = pd.concat(tuple(inverted_ys), axis=0).sort_index()

    if self.top_groups:
        # Go through the groups and predict only the top ones
        XX_paths = []
        model_paths = []

        def processor(out, res):
            out.append(res)

        num_tasks = len(self.top_groups)
        pool_to_use = small_job_pool
        pool = pool_to_use(logger=None, processor=processor,
                           num_tasks=num_tasks, max_workers=n_jobs)

        nb_groups = len(X_groups)
        for _i_g, (key, X_grp) in enumerate(X_groups):

            # Just log where we are in the prediction process
            if (_i_g + 1) % max(1, nb_groups // 20) == 0:
                loggerinfo(logger, "FB Prophet : %d%% of groups predicted" % (100 * (_i_g + 1) // nb_groups))

            # Create dict key to store the min max scaler
            grp_hash = self.get_hash(key)
            X_path = os.path.join(tmp_folder, "fbprophet_Xt" + str(uuid.uuid4()))

            if grp_hash not in self.top_groups:
                XX = pd.DataFrame(np.full((X_grp.shape[0], 1), np.nan), columns=['yhat'])  # unseen groups
                XX.index = X_grp.index
                save_obj(XX, X_path)
                XX_paths.append(X_path)
                continue

            if self.grp_models[grp_hash] is None:
                XX = pd.DataFrame(np.full((X_grp.shape[0], 1), np.nan), columns=['yhat'])  # unseen groups
                XX.index = X_grp.index
                save_obj(XX, X_path)
                XX_paths.append(X_path)
                continue

            model = self.grp_models[grp_hash]
            model_path = os.path.join(tmp_folder, "fbprophet_modelt" + str(uuid.uuid4()))
            save_obj(model, model_path)
            save_obj(X_grp, X_path)
            model_paths.append(model_path)

            args = (model_path, X_path, self.priors[grp_hash], tmp_folder)
            kwargs = {}
            pool.submit_tryget(None, MyParallelProphetTransformer_transform_async,
                               args=args, kwargs=kwargs, out=XX_paths)
        pool.finish()

        XX_top_groups = pd.concat((load_obj(XX_path) for XX_path in XX_paths), axis=0).sort_index()
        for p in XX_paths + model_paths:
            remove(p)

    self._clean_tmp_folder(logger, tmp_folder)

    features_df = pd.DataFrame()
    features_df[self.display_name + '_GrpAvg'] = XX_general['yhat']

    if self.top_groups:
        features_df[self.display_name + f'_Top{self.top_n}Grp'] = XX_top_groups['yhat']

    self._output_feature_names = list(features_df.columns)
    self._feature_desc = list(features_df.columns)

    return features_df
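# NOTE: the index bookkeeping around pd.merge in transform above is what keeps
# the returned predictions aligned with the input rows: pd.merge always builds a
# fresh RangeIndex on its result. A minimal standalone illustration of the
# pattern (toy data, names are hypothetical):
def _merge_keeps_alignment_demo():
    df = pd.DataFrame({'ds': ['2020-01-02', '2020-01-01']}, index=[10, 11])
    preds = pd.DataFrame({'ds': ['2020-01-01', '2020-01-02'], 'yhat': [1.0, 2.0]})
    indices = df.index                             # pd.merge discards the left index...
    df = pd.merge(df, preds, on='ds', how='left')
    df.index = indices                             # ...so restore it to keep row alignment
    return df                                      # index is [10, 11] again, yhat row-aligned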
def fit(self, X: dt.Frame, y: np.array = None, **kwargs):
    """
    Fits FB Prophet models (1 per time group) using historical target values contained in y
    Model fitting is distributed over a pool of processes and uses file storage to share the data with workers
    :param X: Datatable frame containing the features
    :param y: numpy array containing the historical values of the target
    :return: self
    """
    # Get the logger if it exists
    logger = None
    if self.context and self.context.experiment_id:
        logger = make_experiment_logger(
            experiment_id=self.context.experiment_id,
            tmp_dir=self.context.tmp_dir,
            experiment_tmp_dir=self.context.experiment_tmp_dir
        )

    try:
        # Set the value of prophet_top_n in the recipe_dict variable of the config.toml file
        # eg1: recipe_dict="{'prophet_top_n': 200}"
        # eg2: recipe_dict="{'prophet_top_n':10}"
        self.top_n = config.recipe_dict['prophet_top_n']
    except KeyError:
        self.top_n = 50

    loggerinfo(logger, f"Prophet will use {self.top_n} groups as well as average target data.")

    tmp_folder = self._create_tmp_folder(logger)

    n_jobs = self._get_n_jobs(logger, **kwargs)

    # Reduce X to TGC
    tgc_wo_time = list(np.setdiff1d(self.tgc, self.time_column))
    X = X[:, self.tgc].to_pandas()

    # Fill NaNs or None
    X = X.replace([None, np.nan], 0)

    # Add the target. The label encoder is only used for classification, which we don't support...
    if self.labels is not None:
        y = LabelEncoder().fit(self.labels).transform(y)
    X['y'] = np.array(y)

    self.nan_value = X['y'].mean()

    # Change date feature name to match Prophet requirements
    X.rename(columns={self.time_column: "ds"}, inplace=True)

    # Create a general scaler now that will be used for unknown groups at prediction time
    # Can we do smarter than that?
    self.general_scaler = MinMaxScaler().fit(X[['y', 'ds']].groupby('ds').median().values)

    # Go through groups and standard scale them
    if len(tgc_wo_time) > 0:
        X_groups = X.groupby(tgc_wo_time)
    else:
        X_groups = [([None], X)]

    self.scalers = {}
    scaled_ys = []

    print(f'{datetime.now()} Start of group scaling')

    for key, X_grp in X_groups:
        # Create dict key to store the min max scaler
        grp_hash = self.get_hash(key)

        # Scale target for current group
        self.scalers[grp_hash] = MinMaxScaler()
        y_skl = self.scalers[grp_hash].fit_transform(X_grp[['y']].values)

        # Put back in a DataFrame to keep track of original index
        y_skl_df = pd.DataFrame(y_skl, columns=['y'])
        y_skl_df.index = X_grp.index
        scaled_ys.append(y_skl_df)

    print(f'{datetime.now()} End of group scaling')

    # Set the scaled target back in the original frame but keep the original values
    X['y_orig'] = X['y']
    X['y'] = pd.concat(tuple(scaled_ys), axis=0)

    # Now average the groups
    X_avg = X[['ds', 'y']].groupby('ds').mean().reset_index()

    # Send that to Prophet
    params = {
        "country_holidays": self.country_holidays,
        "monthly_seasonality": self.monthly_seasonality
    }
    mod = importlib.import_module('fbprophet')
    Prophet = getattr(mod, "Prophet")
    self.model = Prophet(yearly_seasonality=True, weekly_seasonality=True, daily_seasonality=True)

    if params["country_holidays"] is not None:
        self.model.add_country_holidays(country_name=params["country_holidays"])
    if params["monthly_seasonality"]:
        self.model.add_seasonality(name='monthly', period=30.5, fourier_order=5)

    with suppress_stdout_stderr():
        self.model.fit(X_avg[['ds', 'y']])

    print(f'{datetime.now()} General Model Fitted')

    self.top_groups = None
    if len(tgc_wo_time) > 0:
        if self.top_n > 0:
            top_n_grp = X.groupby(tgc_wo_time).size().sort_values().reset_index()[tgc_wo_time].iloc[-self.top_n:].values
            self.top_groups = [
                '_'.join(map(str, key))
                for key in top_n_grp
            ]

    if self.top_groups:
        self.grp_models = {}
        self.priors = {}

        # Prepare for multiprocessing
        num_tasks = len(self.top_groups)

        def processor(out, res):
            out[res[0]] = res[1]

        pool_to_use = small_job_pool
        loggerinfo(logger, f"Prophet will use {n_jobs} workers for fitting.")
        loggerinfo(logger, "Prophet parameters holidays {} / monthly {}".format(self.country_holidays, self.monthly_seasonality))
        pool = pool_to_use(
            logger=None, processor=processor,
            num_tasks=num_tasks, max_workers=n_jobs
        )

        # Fit 1 FB Prophet model per time group
        nb_groups = len(X_groups)

        # Put y back to its unscaled value for top groups
        X['y'] = X['y_orig']

        for _i_g, (key, X_grp) in enumerate(X_groups):
            # Just log where we are in the fitting process
            if (_i_g + 1) % max(1, nb_groups // 20) == 0:
                loggerinfo(logger, "FB Prophet : %d%% of groups fitted" % (100 * (_i_g + 1) // nb_groups))

            X_path = os.path.join(tmp_folder, "fbprophet_X" + str(uuid.uuid4()))
            X_grp = X_grp.reset_index(drop=True)
            save_obj(X_grp, X_path)

            grp_hash = self.get_hash(key)
            if grp_hash not in self.top_groups:
                continue

            self.priors[grp_hash] = X_grp['y'].mean()

            params = {
                "country_holidays": self.country_holidays,
                "monthly_seasonality": self.monthly_seasonality
            }

            args = (X_path, grp_hash, tmp_folder, params)
            kwargs = {}
            pool.submit_tryget(None, MyParallelProphetTransformer_fit_async,
                               args=args, kwargs=kwargs, out=self.grp_models)
        pool.finish()

        # Workers return file paths, so load the fitted models back into memory
        for k, v in self.grp_models.items():
            self.grp_models[k] = load_obj(v) if v is not None else None
            if v is not None:
                remove(v)

    self._clean_tmp_folder(logger, tmp_folder)

    return self
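# NOTE: `MyParallelProphetTransformer_fit_async` is a module-level worker (it
# must be picklable by the process pool) whose body is not shown in this
# section. A plausible, hypothetical sketch of its contract, assuming the
# recipe's pickle-based save_obj/load_obj helpers and that `processor` consumes
# a (group hash, model path) pair:
def MyParallelProphetTransformer_fit_async_sketch(X_path, grp_hash, tmp_folder, params):
    """Hypothetical sketch: fit one Prophet model for one group and save it to disk."""
    mod = importlib.import_module('fbprophet')
    Prophet = getattr(mod, "Prophet")
    model = Prophet(yearly_seasonality=True, weekly_seasonality=True, daily_seasonality=True)
    if params["country_holidays"] is not None:
        model.add_country_holidays(country_name=params["country_holidays"])
    if params["monthly_seasonality"]:
        model.add_seasonality(name='monthly', period=30.5, fourier_order=5)
    X = load_obj(X_path)  # assumed pickle-based helper from the recipe
    with suppress_stdout_stderr():
        model.fit(X[['ds', 'y']])
    model_path = os.path.join(tmp_folder, "fbprophet_model" + str(uuid.uuid4()))
    save_obj(model, model_path)
    remove(X_path)  # free the shared input file once the model is saved
    return grp_hash, model_path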