def transform(self, X: dt.Frame): logger = None if self.context and self.context.experiment_id: logger = make_experiment_logger( experiment_id=self.context.experiment_id, tmp_dir=self.context.tmp_dir, experiment_tmp_dir=self.context.experiment_tmp_dir) try: X = dt.Frame(X) original_zip_column_name = X.names[0] X.names = ['zip_key'] X = X[:, str('zip_key')] zip_list = dt.unique(X[~dt.isna(dt.f.zip_key), 0]).to_list()[0] zip_features = [self.get_zipcode_features(x) for x in zip_list] X_g = dt.Frame({"zip_key": zip_list}) X_g.cbind(dt.Frame(zip_features)) X_g.key = 'zip_key' X_result = X[:, :, dt.join(X_g)] self._output_feature_names = [ "{}.{}".format(original_zip_column_name, f) for f in list(X_result[:, 1:].names) ] self._feature_desc = [ "Property '{}' of US zipcode found in '{}'".format( f, original_zip_column_name) for f in list(X_result[:, 1:].names) ] return X_result[:, 1:] except Exception as ex: loggerwarning( logger, "USZipcodeDatabaseTransformer got exception {}".format( type(ex).__name__)) return np.zeros(X.shape[0])
def get_experiment_logger(self): logger = None if self.context and self.context.experiment_id: logger = make_experiment_logger( experiment_id=self.context.experiment_id, tmp_dir=self.context.tmp_dir, experiment_tmp_dir=self.context.experiment_tmp_dir) return logger
def logger(self): from h2oaicore import application_context from h2oaicore.systemutils import exp_dir # Don't assign to self, not picklable return make_experiment_logger( experiment_id=application_context.context.experiment_id, tmp_dir=None, experiment_tmp_dir=exp_dir())
def _get_experiment_logger(self): # Get the logger if it exists logger = None if self.context and self.context.experiment_id: logger = make_experiment_logger( experiment_id=self.context.experiment_id, tmp_dir=self.context.tmp_dir, experiment_tmp_dir=self.context.experiment_tmp_dir) return logger
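# --- Illustrative sketch (not part of the recipes above) of how these logger helpers are typically consumed. It assumes the usual imports from h2oaicore.systemutils (make_experiment_logger, loggerinfo, loggerwarning) and relies on the logging helpers tolerating logger=None, as the recipes themselves do when no experiment context exists; _example_logging_usage is a hypothetical name.
def _example_logging_usage(self):
    logger = self._get_experiment_logger()  # None outside an experiment context
    loggerinfo(logger, "Fit started")  # informational message; no-op style when logger is None
    loggerwarning(logger, "Something worth flagging")  # warning-level message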
def transform(self, X: dt.Frame): """ Uses fitted models (1 per time group) to predict the target If self.is_train exists, it means we are doing in-sample predictions if it does not then we Arima is used to predict the future :param X: Datatable Frame containing the features :return: ARIMA predictions """ X = X.to_pandas() XX = X[self.tgc].copy() tgc_wo_time = list(np.setdiff1d(self.tgc, self.time_column)) if len(tgc_wo_time) > 0: XX_grp = XX.groupby(tgc_wo_time) else: XX_grp = [([None], XX)] # Get the logger if it exists logger = None if self.context and self.context.experiment_id: logger = make_experiment_logger( experiment_id=self.context.experiment_id, tmp_dir=self.context.tmp_dir, experiment_tmp_dir=self.context.experiment_tmp_dir) nb_groups = len(XX_grp) preds = [] for _i_g, (key, X) in enumerate(XX_grp): # Just say where we are in the fitting process if (_i_g + 1) % max(1, nb_groups // 20) == 0: loggerinfo( logger, "Auto ARIMA : %d%% of groups transformed" % (100 * (_i_g + 1) // nb_groups)) key = key if isinstance(key, list) else [key] grp_hash = '_'.join(map(str, key)) # print("auto arima - transforming data of shape: %s for group: %s" % (str(X.shape), grp_hash)) order = np.argsort(X[self.time_column]) if grp_hash in self.models: model = self.models[grp_hash] if model is not None: yhat = model.predict_in_sample() \ if hasattr(self, 'is_train') else model.predict(n_periods=X.shape[0]) yhat = yhat[order] XX = pd.DataFrame(yhat, columns=['yhat']) else: XX = pd.DataFrame(np.full((X.shape[0], 1), self.nan_value), columns=['yhat']) # invalid model else: XX = pd.DataFrame(np.full((X.shape[0], 1), self.nan_value), columns=['yhat']) # unseen groups XX.index = X.index preds.append(XX) XX = pd.concat(tuple(preds), axis=0).sort_index() return XX
def fit(self, X, y, sample_weight=None, eval_set=None, sample_weight_eval_set=None, **kwargs): logger = None if self.context and self.context.experiment_id: logger = make_experiment_logger(experiment_id=self.context.experiment_id, tmp_dir=self.context.tmp_dir, experiment_tmp_dir=self.context.experiment_tmp_dir) maybe_download_language_model(logger, save_directory=self.__class__._model_path, model_link=self.__class__._model_link, config_link=self.__class__._config_link, vocab_link=self.__class__._vocab_link) super().fit(X, y, sample_weight, eval_set, sample_weight_eval_set, **kwargs)
def fit(self, X: dt.Frame, y: np.array = None): """ Fits ARIMA models (1 per time group) using historical target values contained in y :param X: Datatable frame containing the features :param y: numpy array containing the historical values of the target :return: self """ # Import the ARIMA python module pm = importlib.import_module('pmdarima') # Init models self.models = {} # Convert to pandas X = X.to_pandas() XX = X[self.tgc].copy() XX['y'] = np.array(y) self.nan_value = np.mean(y) self.ntrain = X.shape[0] # Group the input by TGC (Time group column) excluding the time column itself tgc_wo_time = list(np.setdiff1d(self.tgc, self.time_column)) if len(tgc_wo_time) > 0: XX_grp = XX.groupby(tgc_wo_time) else: XX_grp = [([None], XX)] # Get the logger if it exists logger = None if self.context and self.context.experiment_id: logger = make_experiment_logger( experiment_id=self.context.experiment_id, tmp_dir=self.context.tmp_dir, experiment_tmp_dir=self.context.experiment_tmp_dir) # Build 1 ARIMA model per time group columns nb_groups = len(XX_grp) for _i_g, (key, X) in enumerate(XX_grp): # Just say where we are in the fitting process if (_i_g + 1) % max(1, nb_groups // 20) == 0: loggerinfo( logger, "Auto ARIMA : %d%% of groups fitted" % (100 * (_i_g + 1) // nb_groups)) key = key if isinstance(key, list) else [key] grp_hash = '_'.join(map(str, key)) # print("auto arima - fitting on data of shape: %s for group: %s" % (str(X.shape), grp_hash)) order = np.argsort(X[self.time_column]) try: model = pm.auto_arima(X['y'].values[order], error_action='ignore') except: model = None self.models[grp_hash] = model return self
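# --- Standalone sketch of the pmdarima calls relied on above, on a toy series (not recipe data): pm.auto_arima searches over ARIMA orders, predict_in_sample() returns fitted values aligned with the training rows (the is_train branch of transform), and predict(n_periods=...) forecasts beyond the training window.
import numpy as np
import pmdarima as pm

y_toy = np.random.RandomState(0).normal(size=100).cumsum()
toy_model = pm.auto_arima(y_toy, error_action='ignore')
fitted = toy_model.predict_in_sample()      # in-sample predictions over the training period
forecast = toy_model.predict(n_periods=12)  # out-of-sample forecast past the training end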
def transform(self, X: dt.Frame): logger = None if self.context and self.context.experiment_id: logger = make_experiment_logger( experiment_id=self.context.experiment_id, tmp_dir=self.context.tmp_dir, experiment_tmp_dir=self.context.experiment_tmp_dir) X = dt.Frame(X) original_zip_column_name = X.names[0] X = X[:, dt.str64(dt.f[0])] X.names = ['zip_key'] try: zip_list = dt.unique(X[~dt.isna(dt.f.zip_key), 0]).to_list()[0] + ['79936'] zip_features = [self.get_zipcode_features(x) for x in zip_list] X_g = dt.Frame({"zip_key": zip_list}) X_g.cbind(dt.Frame(zip_features)) X_g.key = 'zip_key' X_result = X[:, :, dt.join(X_g)] self._output_feature_names = [ "{}:{}.{}".format(self.transformer_name, original_zip_column_name, self.replaceBannedCharacters(f)) for f in list(X_result[:, 1:].names) ] self._feature_desc = [ "Property '{}' of zipcode column ['{}'] from US zipcode database (recipe '{}')" .format(f, original_zip_column_name, self.transformer_name) for f in list(X_result[:, 1:].names) ] return X_result[:, 1:] except ValueError as ve: loggerinfo( logger, "Column '{}' is not a zipcode: {}".format( original_zip_column_name, str(ve))) return self.get_zipcode_null_result(X, original_zip_column_name) except TypeError as te: loggerwarning( logger, "Column '{}' triggered TypeError: {}".format( original_zip_column_name, str(te))) raise te
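# --- Minimal datatable keyed-join sketch mirroring the zip-feature lookup above (toy columns; the real attributes come from get_zipcode_features): assigning Frame.key declares a unique join key, and X[:, :, dt.join(lookup)] performs a left join against it while preserving the row order of X.
import datatable as dt

X_toy = dt.Frame(zip_key=['10001', '79936', '10001'])
lookup = dt.Frame(zip_key=['10001', '79936'], city=['New York', 'El Paso'])
lookup.key = 'zip_key'                  # join key must be unique in the lookup frame
joined = X_toy[:, :, dt.join(lookup)]   # adds the 'city' column matched on zip_key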
def fit_transform(self, X: dt.Frame, y: np.array = None, **kwargs): X_original = X # Get the logger if it exists logger = None if self.context and self.context.experiment_id: logger = make_experiment_logger( experiment_id=self.context.experiment_id, tmp_dir=self.context.tmp_dir, experiment_tmp_dir=self.context.experiment_tmp_dir, username=self.context.username, ) self._output_feature_names = [ "pre:" + x for x in list(X_original.names) ] self._feature_desc = [ "Pre-transformed feature " + x for x in list(X_original.names) ] return X_original
def mutate_params(self, accuracy, time_tolerance, interpretability, **kwargs): logger = None if self.context and self.context.experiment_id: logger = make_experiment_logger( experiment_id=self.context.experiment_id, tmp_dir=self.context.tmp_dir, experiment_tmp_dir=self.context.experiment_tmp_dir) # The default version does no mutation # Otherwise, change self.params for this model holiday_choice = [None, "US", "UK", "DE", "FRA"] if accuracy >= 8: weekly_choice = [False, 'auto', 5, 7, 10, 15] yearly_choice = [False, 'auto', 5, 10, 15, 20, 30] monthly_choice = [False, 3, 5, 7, 10] quarterly_choice = [False, 3, 5, 7, 10] elif accuracy >= 5: weekly_choice = [False, 'auto', 10, 20] yearly_choice = [False, 'auto', 10, 20] monthly_choice = [False, 5] quarterly_choice = [False, 5] else: # No alternative seasonality, and no seasonality override for weekly and yearly weekly_choice = [False, 'auto'] yearly_choice = [False, 'auto'] monthly_choice = [False] quarterly_choice = [False] self.params["country_holidays"] = np.random.choice(holiday_choice) self.params["seasonality_mode"] = np.random.choice( ["additive", "multiplicative"]) self.params["weekly_seasonality"] = np.random.choice(weekly_choice) self.params["monthly_seasonality"] = np.random.choice(monthly_choice) self.params["quarterly_seasonality"] = np.random.choice( quarterly_choice) self.params["yearly_seasonality"] = np.random.choice(yearly_choice) self.params["growth"] = np.random.choice(["linear", "logistic"])
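# --- Sketch of how the hyper-parameters sampled above typically map onto a Prophet model (assumed wiring; the recipe applies them in its own fit code elsewhere). Custom monthly/quarterly seasonalities go through add_seasonality, and country_holidays through add_country_holidays.
from fbprophet import Prophet  # newer releases ship as: from prophet import Prophet

m = Prophet(growth="linear", seasonality_mode="additive",
            weekly_seasonality='auto', yearly_seasonality=10)
m.add_country_holidays(country_name="US")
m.add_seasonality(name="monthly", period=30.5, fourier_order=5)
m.add_seasonality(name="quarterly", period=91.25, fourier_order=5)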
def fit(self, X, y, sample_weight=None, eval_set=None, sample_weight_eval_set=None, **kwargs): # Get column names orig_cols = list(X.names) from h2oaicore.tensorflow_dynamic import got_cpu_tf, got_gpu_tf import tensorflow as tf import shap import scipy import pandas as pd self.setup_keras_session() import h2oaicore.keras as keras import matplotlib.pyplot as plt if not hasattr(self, 'save_model_path'): model_id = str(uuid.uuid4())[:8] self.save_model_path = os.path.join(user_dir(), "custom_xnn_model.hdf5") np.random.seed(self.random_state) my_init = keras.initializers.RandomUniform(seed=self.random_state) # Get the logger if it exists logger = None if self.context and self.context.experiment_id: logger = make_experiment_logger( experiment_id=self.context.experiment_id, tmp_dir=self.context.tmp_dir, experiment_tmp_dir=self.context.experiment_tmp_dir) # Set up temp folder tmp_folder = self._create_tmp_folder(logger) # define base model def xnn_initialize(features, ridge_functions=3, arch=[20, 12], learning_rate=0.01, bg_samples=100, beta1=0.9, beta2=0.999, dec=0.0, ams=True, bseed=None, is_categorical=False): # # Prepare model architecture # # Input to the network, our observation containing all the features input = keras.layers.Input(shape=(features, ), name='main_input') # Record current column names loggerinfo(logger, "XNN LOG") loggerdata(logger, "Feature list:") loggerdata(logger, str(orig_cols)) # Input to ridge function number i is the dot product of our original input vector times coefficients ridge_input = keras.layers.Dense(ridge_functions, name="projection_layer", activation='linear')(input) ridge_networks = [] # Each subnetwork uses only 1 neuron from the projection layer as input so we need to split it ridge_inputs = SplitLayer(ridge_functions)(ridge_input) for i, ridge_input in enumerate(ridge_inputs): # Generate subnetwork i mlp = _mlp(ridge_input, i, arch) ridge_networks.append(mlp) added = keras.layers.Concatenate( name='concatenate_1')(ridge_networks) # Add the correct output layer for the problem if is_categorical: out = keras.layers.Dense(1, activation='sigmoid', input_shape=(ridge_functions, ), name='main_output')(added) else: out = keras.layers.Dense(1, activation='linear', input_shape=(ridge_functions, ), name='main_output')(added) model = keras.models.Model(inputs=input, outputs=out) optimizer = keras.optimizers.Adam(lr=learning_rate, beta_1=beta1, beta_2=beta2, decay=dec, amsgrad=ams) # Use the correct loss for the problem if is_categorical: model.compile(loss={'main_output': 'binary_crossentropy'}, optimizer=optimizer) else: model.compile(loss={'main_output': 'mean_squared_error'}, optimizer=optimizer) return model def _mlp(input, idx, arch=[20, 12], activation='relu'): # Set up a subnetwork # Hidden layers mlp = keras.layers.Dense(arch[0], activation=activation, name='mlp_{}_dense_0'.format(idx), kernel_initializer=my_init)(input) for i, layer in enumerate(arch[1:]): mlp = keras.layers.Dense(layer, activation=activation, name='mlp_{}_dense_{}'.format( idx, i + 1), kernel_initializer=my_init)(mlp) # Output of the MLP mlp = keras.layers.Dense( 1, activation='linear', name='mlp_{}_dense_last'.format(idx), kernel_regularizer=keras.regularizers.l1(1e-3), kernel_initializer=my_init)(mlp) return mlp def get_shap(X, model): # Calculate the Shap values np.random.seed(24) bg_samples = min(X.shape[0], 1000) if isinstance(X, pd.DataFrame): background = X.iloc[np.random.choice(X.shape[0], bg_samples, replace=False)] else: background = X[np.random.choice(X.shape[0], bg_samples, 
replace=False)] # Explain predictions of the model on the subset explainer = shap.DeepExplainer(model, background) shap_values = explainer.shap_values(X) # Return the mean absolute value of each shap value for each dataset xnn_shap = np.abs(shap_values[0]).mean(axis=0) return xnn_shap # Initialize the xnn's features = X.shape[1] orig_cols = list(X.names) if self.num_classes >= 2: lb = LabelEncoder() lb.fit(self.labels) y = lb.transform(y) self.is_cat = True xnn1 = xnn_initialize(features=features, ridge_functions=features, arch=self.params["arch"], learning_rate=self.params["lr"], beta1=self.params["beta_1"], beta2=self.params["beta_2"], dec=self.params["decay"], ams=self.params["amsgrad"], is_categorical=self.is_cat) xnn = xnn_initialize(features=features, ridge_functions=features, arch=self.params["arch"], learning_rate=self.params["lr"], beta1=self.params["beta_1"], beta2=self.params["beta_2"], dec=self.params["decay"], ams=self.params["amsgrad"], is_categorical=self.is_cat) else: self.is_cat = False xnn1 = xnn_initialize(features=features, ridge_functions=features, arch=self.params["arch"], learning_rate=self.params["lr"], beta1=self.params["beta_1"], beta2=self.params["beta_2"], dec=self.params["decay"], ams=self.params["amsgrad"], is_categorical=self.is_cat) xnn = xnn_initialize(features=features, ridge_functions=features, arch=self.params["arch"], learning_rate=self.params["lr"], beta1=self.params["beta_1"], beta2=self.params["beta_2"], dec=self.params["decay"], ams=self.params["amsgrad"], is_categorical=self.is_cat) # Replace missing values with a value smaller than all observed values self.min = dict() for col in X.names: XX = X[:, col] self.min[col] = XX.min1() if self.min[col] is None or np.isnan(self.min[col]): self.min[col] = -1e10 else: self.min[col] -= 1 XX.replace(None, self.min[col]) X[:, col] = XX assert X[dt.isna(dt.f[col]), col].nrows == 0 X = X.to_numpy() inputs = {'main_input': X} validation_set = 0 verbose = 0 # Train the neural network once with early stopping and a validation set history = keras.callbacks.History() es = keras.callbacks.EarlyStopping(monitor='val_loss', mode='min') history = xnn1.fit(inputs, y, epochs=self.params["n_estimators"], batch_size=self.params["batch_size"], validation_split=0.3, verbose=verbose, callbacks=[history, es]) # Train again on the full data number_of_epochs_it_ran = len(history.history['loss']) xnn.fit(inputs, y, epochs=number_of_epochs_it_ran, batch_size=self.params["batch_size"], validation_split=0.0, verbose=verbose) # Get the mean absolute Shapley values importances = np.array(get_shap(X, xnn)) int_output = {} int_weights = {} int_bias = {} int_input = {} original_activations = {} x_labels = list(map(lambda x: 'x' + str(x), range(features))) intermediate_output = [] # Record and plot the projection weights weight_list = [] for layer in xnn.layers: layer_name = layer.get_config()['name'] if layer_name != "main_input": print(layer_name) weights = layer.get_weights() # Record the biases try: bias = layer.get_weights()[1] int_bias[layer_name] = bias except: print("No Bias") # Record outputs for the test set intermediate_layer_model = keras.models.Model( inputs=xnn.input, outputs=xnn.get_layer(layer_name).output) # Record the outputs from the training set if self.is_cat and (layer_name == 'main_output'): original_activations[layer_name] = scipy.special.logit( intermediate_layer_model.predict(X)) original_activations[ layer_name + "_p"] = intermediate_layer_model.predict(X) else: original_activations[ layer_name] = 
intermediate_layer_model.predict(X) # Record other weights, inputs, and outputs int_weights[layer_name] = weights int_input[layer_name] = layer.input int_output[layer_name] = layer.output # Plot the projection layers if "projection_layer" in layer.get_config()['name']: # print(layer.get_config()['name']) # Record the weights for each projection layer weights = [np.transpose(layer.get_weights()[0])] weight_list2 = [] for i, weight in enumerate(weights[0]): weight_list.append(weight) weight_list2.append( list(np.reshape(weight, (1, features))[0])) # Plot weights plt.bar(orig_cols, abs(np.reshape(weight, (1, features))[0]), 1, color="blue") plt.ylabel("Coefficient value") plt.title("Projection Layer Weights {}".format(i), fontdict={'fontsize': 10}) plt.xticks(rotation=90) plt.show() plt.savefig(os.path.join( tmp_folder, 'projection_layer_' + str(i) + '.png'), bbox_inches="tight") plt.clf() if "main_output" in layer.get_config()['name']: weights_main = layer.get_weights() print(weights_main) pd.DataFrame(weight_list2).to_csv(os.path.join(tmp_folder, "projection_data.csv"), index=False) intermediate_output = [] for feature_num in range(features): intermediate_layer_model = keras.models.Model( inputs=xnn.input, outputs=xnn.get_layer('mlp_' + str(feature_num) + '_dense_last').output) intermediate_output.append(intermediate_layer_model.predict(X)) # Record and plot the ridge functions ridge_x = [] ridge_y = [] for weight_number in range(len(weight_list)): ridge_x.append( list( sum(X[:, ii] * weight_list[weight_number][ii] for ii in range(features)))) ridge_y.append(list(intermediate_output[weight_number])) plt.plot( sum(X[:, ii] * weight_list[weight_number][ii] for ii in range(features)), intermediate_output[weight_number], 'o') plt.xlabel("Input") plt.ylabel("Subnetwork " + str(weight_number)) plt.title("Ridge Function {}".format(weight_number), fontdict={'fontsize': 10}) plt.show() plt.savefig( os.path.join(tmp_folder, 'ridge_' + str(weight_number) + '.png')) plt.clf() # Output the ridge function importance weights2 = np.array([item[0] for item in list(weights)[0]]) output_activations = np.abs( np.array([ item * weights2 for item in list(original_activations["concatenate_1"]) ])).mean(axis=0) loggerinfo(logger, str(output_activations)) pd.DataFrame(output_activations).to_csv(os.path.join( tmp_folder, "ridge_weights.csv"), index=False) plt.bar(x_labels, output_activations, 1, color="blue") plt.xlabel("Ridge function number") plt.ylabel("Feature importance") plt.title("Ridge function importance", fontdict={'fontsize': 10}) plt.show() plt.savefig(os.path.join(tmp_folder, 'Ridge_function_importance.png')) pd.DataFrame(ridge_y).applymap(lambda x: x[0]).to_csv(os.path.join( tmp_folder, "ridge_y.csv"), index=False) pd.DataFrame(ridge_x).to_csv(os.path.join(tmp_folder, "ridge_x.csv"), index=False) pd.DataFrame(orig_cols).to_csv(os.path.join(tmp_folder, "input_columns.csv"), index=False) self.set_model_properties(model=xnn, features=orig_cols, importances=importances.tolist(), iterations=self.params['n_estimators'])
def transform(self, X: dt.Frame): """ Uses fitted models (1 per time group) to predict the target If self.is_train exists, it means we are doing in-sample predictions if it does not then we Arima is used to predict the future :param X: Datatable Frame containing the features :return: ARIMA predictions """ # Get the logger if it exists logger = None tmp_folder = str(uuid.uuid4()) + "_arima_folder/" if self.context and self.context.experiment_id: logger = make_experiment_logger( experiment_id=self.context.experiment_id, tmp_dir=self.context.tmp_dir, experiment_tmp_dir=self.context.experiment_tmp_dir ) tmp_folder = self.context.experiment_tmp_dir + "/" + str(uuid.uuid4()) + "_arima_folder/" # Create a temp folder to store files used during multi processing experiment # This temp folder will be removed at the end of the process loggerinfo(logger, "Arima temp folder {}".format(tmp_folder)) try: os.mkdir(tmp_folder) except PermissionError: # This not occur so log a warning loggerwarning(logger, "Arima was denied temp folder creation rights") tmp_folder = temporary_files_path + "/" + str(uuid.uuid4()) + "_arima_folder/" os.mkdir(tmp_folder) except FileExistsError: # We should never be here since temp dir name is expected to be unique loggerwarning(logger, "Arima temp folder already exists") tmp_folder = self.context.experiment_tmp_dir + "/" + str(uuid.uuid4()) + "_arima_folder/" os.mkdir(tmp_folder) except: # Revert to temporary file path loggerwarning(logger, "Arima defaulted to create folder inside tmp directory.") tmp_folder = temporary_files_path + "/" + str(uuid.uuid4()) + "_arima_folder/" os.mkdir(tmp_folder) X = X.to_pandas() XX = X[self.tgc].copy() tgc_wo_time = list(np.setdiff1d(self.tgc, self.time_column)) if len(tgc_wo_time) > 0: XX_grp = XX.groupby(tgc_wo_time) else: XX_grp = [([None], XX)] assert len(XX_grp) > 0 num_tasks = len(XX_grp) def processor(out, res): out.append(res) pool_to_use = small_job_pool pool = pool_to_use(logger=None, processor=processor, num_tasks=num_tasks) XX_paths = [] model_paths = [] nb_groups = len(XX_grp) for _i_g, (key, X) in enumerate(XX_grp): # Just print where we are in the process of fitting models if (_i_g + 1) % max(1, nb_groups // 20) == 0: loggerinfo(logger, "Auto ARIMA : %d%% of groups transformed" % (100 * (_i_g + 1) // nb_groups)) # Create time group key to store and retrieve fitted models key = key if isinstance(key, list) else [key] grp_hash = '_'.join(map(str, key)) # Create file path to store data and pass it to the fitting pool X_path = os.path.join(tmp_folder, "autoarima_Xt" + str(uuid.uuid4())) # Commented for performance, uncomment for debug # print("ARIMA - transforming data of shape: %s for group: %s" % (str(X.shape), grp_hash)) if grp_hash in self.models: model = self.models[grp_hash] model_path = os.path.join(tmp_folder, "autoarima_modelt" + str(uuid.uuid4())) save_obj(model, model_path) save_obj(X, X_path) model_paths.append(model_path) args = (model_path, X_path, self.nan_value, hasattr(self, 'is_train'), self.time_column, tmp_folder) kwargs = {} pool.submit_tryget(None, MyParallelAutoArimaTransformer_transform_async, args=args, kwargs=kwargs, out=XX_paths) else: # Don't go through pools XX = pd.DataFrame(np.full((X.shape[0], 1), self.nan_value), columns=['yhat']) # unseen groups # Sync indices XX.index = X.index save_obj(XX, X_path) XX_paths.append(X_path) pool.finish() XX = pd.concat((load_obj(XX_path) for XX_path in XX_paths), axis=0).sort_index() for p in XX_paths + model_paths: remove(p) try: shutil.rmtree(tmp_folder) 
loggerinfo(logger, "Arima cleaned up temporary file folder.") except: loggerwarning(logger, "Arima could not delete the temporary file folder.") return XX
def transform(self, X: dt.Frame, **kwargs): """ Uses fitted models (1 per time group) to predict the target :param X: Datatable Frame containing the features :return: FB Prophet predictions """ # Get the logger if it exists logger = None if self.context and self.context.experiment_id: logger = make_experiment_logger( experiment_id=self.context.experiment_id, tmp_dir=self.context.tmp_dir, experiment_tmp_dir=self.context.experiment_tmp_dir) tmp_folder = self._create_tmp_folder(logger) n_jobs = self._get_n_jobs(logger, **kwargs) XX = X[:, self.tgc].to_pandas() XX = XX.replace([None, np.nan], 0) XX.rename(columns={self.time_column: "ds"}, inplace=True) tgc_wo_time = list(np.setdiff1d(self.tgc, self.time_column)) if len(tgc_wo_time) > 0: XX_grp = XX.groupby(tgc_wo_time) else: XX_grp = [([None], XX)] assert len(XX_grp) > 0 num_tasks = len(XX_grp) def processor(out, res): out.append(res) pool_to_use = small_job_pool loggerinfo(logger, "Prophet will use {} workers for transform".format(n_jobs)) pool = pool_to_use(logger=None, processor=processor, num_tasks=num_tasks, max_workers=n_jobs) XX_paths = [] model_paths = [] nb_groups = len(XX_grp) print("Nb Groups = ", nb_groups) for _i_g, (key, X) in enumerate(XX_grp): # Log where we are in the transformation of the dataset if (_i_g + 1) % max(1, nb_groups // 20) == 0: loggerinfo( logger, "FB Prophet : %d%% of groups transformed" % (100 * (_i_g + 1) // nb_groups)) key = key if isinstance(key, list) else [key] grp_hash = '_'.join(map(str, key)) X_path = os.path.join(tmp_folder, "fbprophet_Xt" + str(uuid.uuid4())) # Commented for performance, uncomment for debug # print("prophet - transforming data of shape: %s for group: %s" % (str(X.shape), grp_hash)) if grp_hash in self.models: model = self.models[grp_hash] model_path = os.path.join( tmp_folder, "fbprophet_modelt" + str(uuid.uuid4())) save_obj(model, model_path) save_obj(X, X_path) model_paths.append(model_path) args = (model_path, X_path, self.priors[grp_hash], tmp_folder) kwargs = {} pool.submit_tryget( None, MyParallelProphetTransformer_transform_async, args=args, kwargs=kwargs, out=XX_paths) else: XX = pd.DataFrame(np.full((X.shape[0], 1), self.nan_value), columns=['yhat']) # unseen groups XX.index = X.index save_obj(XX, X_path) XX_paths.append(X_path) pool.finish() XX = pd.concat((load_obj(XX_path) for XX_path in XX_paths), axis=0).sort_index() for p in XX_paths + model_paths: remove(p) self._clean_tmp_folder(logger, tmp_folder) return XX
def fit(self, X: dt.Frame, y: np.array = None, **kwargs): """ Fits FB Prophet models (1 per time group) using historical target values contained in y Model fitting is distributed over a pool of processes and uses file storage to share the data with workers :param X: Datatable frame containing the features :param y: numpy array containing the historical values of the target :return: self """ # Get the logger if it exists logger = None if self.context and self.context.experiment_id: logger = make_experiment_logger( experiment_id=self.context.experiment_id, tmp_dir=self.context.tmp_dir, experiment_tmp_dir=self.context.experiment_tmp_dir) tmp_folder = self._create_tmp_folder(logger) n_jobs = self._get_n_jobs(logger, **kwargs) # Convert to pandas XX = X[:, self.tgc].to_pandas() XX = XX.replace([None, np.nan], 0) XX.rename(columns={self.time_column: "ds"}, inplace=True) # Make sure labels are numeric if self.labels is not None: y = LabelEncoder().fit(self.labels).transform(y) XX['y'] = np.array(y) # Set target prior self.nan_value = np.mean(y) # Group the input by TGC (Time group column) excluding the time column itself tgc_wo_time = list(np.setdiff1d(self.tgc, self.time_column)) if len(tgc_wo_time) > 0: XX_grp = XX.groupby(tgc_wo_time) else: XX_grp = [([None], XX)] self.models = {} self.priors = {} # Prepare for multiprocessing num_tasks = len(XX_grp) def processor(out, res): out[res[0]] = res[1] pool_to_use = small_job_pool loggerinfo(logger, "Prophet will use {} workers for fitting".format(n_jobs)) loggerinfo( logger, "Prophet parameters holidays {} / monthly {}".format( self.country_holidays, self.monthly_seasonality)) pool = pool_to_use(logger=None, processor=processor, num_tasks=num_tasks, max_workers=n_jobs) # Fit 1 FB Prophet model per time group columns nb_groups = len(XX_grp) for _i_g, (key, X) in enumerate(XX_grp): # Just log where we are in the fitting process if (_i_g + 1) % max(1, nb_groups // 20) == 0: loggerinfo( logger, "FB Prophet : %d%% of groups fitted" % (100 * (_i_g + 1) // nb_groups)) X_path = os.path.join(tmp_folder, "fbprophet_X" + str(uuid.uuid4())) X = X.reset_index(drop=True) save_obj(X, X_path) key = key if isinstance(key, list) else [key] grp_hash = '_'.join(map(str, key)) self.priors[grp_hash] = X['y'].mean() params = { "country_holidays": self.country_holidays, "monthly_seasonality": self.monthly_seasonality } args = (X_path, grp_hash, tmp_folder, params) kwargs = {} pool.submit_tryget(None, MyParallelProphetTransformer_fit_async, args=args, kwargs=kwargs, out=self.models) pool.finish() for k, v in self.models.items(): self.models[k] = load_obj(v) if v is not None else None remove(v) self._clean_tmp_folder(logger, tmp_folder) return self
def fit(self, X, y, sample_weight=None, eval_set=None, sample_weight_eval_set=None, **kwargs): # Specify these parameters for the dataset. # # Also set feature engineering effort to 0 # under the features section of expert settings. ######################## # Specify the protected column. # The protected column must be numeric. self.protected_name = "black" # Specify the level of the protected group in the protected column self.protected_label = 1 # Specify the target level considered to be a positive outcome # Must be encoded as 0/1 self.positive_target = 0 # Set minimum mean protected ratio needed to avoid a penalty # (mean protected ratio = mean predictions for the protected group/mean predictions for all other groups) # # Try tuning this to values at or a little above # the mean of the positive target for the protected group # divided by the mean of the positive target for the unprotected group. # If it's set too large, the accuracy will be poor, so there # is a limit to the debiasing that can be obtained. self.mean_protected_prediction_ratio_minimum = 0.92 ######################## orig_cols = list(X.names) import pandas as pd import numpy as np from sklearn.preprocessing import OneHotEncoder from collections import Counter import xgboost as xgb # Get the logger if it exists logger = None if self.context and self.context.experiment_id: logger = make_experiment_logger( experiment_id=self.context.experiment_id, tmp_dir=self.context.tmp_dir, experiment_tmp_dir=self.context.experiment_tmp_dir) # Current mu value mu = self.params["mu"] def fair_metric(predt: np.ndarray, dtrain: xgb.DMatrix): ''' FairXGB Error Metric''' # predt is the prediction array # Find the right protected group vector if len(predt) == len(protected_train): protected_feature = np.array(protected_train.copy()) elif len(predt) == len(protected_full): protected_feature = np.array(protected_full.copy()) elif len(predt) == len(protected_valid): protected_feature = np.array(protected_valid.copy()) else: protected_feature = 0 y = dtrain.get_label() answer = -y * np.log( sigmoid(predt)) - (1 - y) * np.log(1 - sigmoid(predt)) answer += mu * ( protected_feature * np.log(sigmoid(predt)) + (1 - protected_feature) * np.log(1 - sigmoid(predt))) return 'Fair_Metric', float(np.sum(answer) / len(answer)) def sigmoid(x): z = 1.0 / (1.0 + np.exp(-x)) return z def gradient(predt: np.ndarray, dtrain: xgb.DMatrix): '''Fair Xgboost Gradient''' # predt is the prediction array # Find the right protected group vector if len(predt) == len(protected_train): protected_feature = np.array(protected_train.copy()) elif len(predt) == len(protected_full): protected_feature = np.array(protected_full.copy()) elif len(predt) == len(protected_valid): protected_feature = np.array(protected_valid.copy()) else: protected_feature = 0 y = dtrain.get_label() answer = sigmoid(predt) - y answer += mu * (protected_feature - sigmoid(predt)) return answer def hessian(predt: np.ndarray, dtrain: xgb.DMatrix): '''Fair Xgboost Hessian''' # predt is the prediction array answer = (1 - mu) * sigmoid(predt) * (1 - sigmoid(predt)) return answer def fair(predt: np.ndarray, dtrain: xgb.DMatrix): ''' Fair xgb objective function ''' grad = gradient(predt, dtrain) hess = hessian(predt, dtrain) return grad, hess # Set up model if self.num_classes >= 2: lb = LabelEncoder() lb.fit(self.labels) y = lb.transform(y) params = {} params['eta'] = self.params["eta"] params['max_depth'] = self.params['max_depth'] params['min_child_weight'] = self.params['min_child_weight'] params['reg_lambda'] 
= self.params['reg_lambda'] params['reg_alpha'] = self.params['reg_alpha'] params['colsample_bytree'] = self.params['colsample_bytree'] params['subsample'] = self.params['subsample'] params['silent'] = 1 params['seed'] = self.params['random_state'] else: # fairxgb doesn't work for regression loggerinfo(logger, "PASS, no fairxgboost model") pass # Switch to pandas X = X.to_pandas() X.columns = orig_cols # Find the protected group column if it is present self.protected = "none" for col in X.columns: if col.find(self.protected_name) > -1: self.protected = col X_datatypes = [str(item) for item in list(X.dtypes)] # List the categorical and numerical features self.X_categorical = [ orig_cols[col_count] for col_count in range(len(orig_cols)) if (X_datatypes[col_count] == 'category') or ( X_datatypes[col_count] == 'object') ] self.X_numeric = [ item for item in orig_cols if item not in self.X_categorical ] self.encoded_categories = [] # Find the levels and mode for each categorical feature # for use in the test set self.train_levels = {} self.train_mode = {} for item in self.X_categorical: self.train_levels[item] = list(set(X[item])) self.train_mode[item] = Counter(X[item]).most_common(1)[0][0] # One hot encode the categorical features # And replace missing values with a Missing category if len(self.X_categorical) > 0: loggerinfo(logger, "Categorical encode") for colname in self.X_categorical: X[colname] = list(X[colname].fillna("Missing")) self.enc = OneHotEncoder(handle_unknown='ignore') if self.protected in self.X_categorical: self.X_categorical.remove(self.protected) if len(self.X_categorical) > 0: self.enc.fit(X[self.X_categorical]) self.encoded_categories = list( self.enc.get_feature_names( input_features=self.X_categorical)) X_enc = self.enc.transform(X[self.X_categorical]).toarray() X = pd.concat([ X[self.X_numeric], pd.DataFrame(X_enc, columns=self.encoded_categories) ], axis=1) # Replace missing values with a missing value code if len(self.X_numeric) > 0: for colname in self.X_numeric: X[colname] = list(X[colname].fillna(-999)) # Make sure the target that represents a positive outcome is 1 if self.positive_target == 0: y = 1 - y X_full = X.copy() y_full = y.copy() # Set up a validation step to find the optimal number of trees X_valid = X.iloc[int(0.7 * len(X_full)):, :] y_valid = y[int(0.7 * len(X_full)):] X = X.iloc[0:int(0.7 * len(X_full)), :] y = y[0:int(0.7 * len(X_full))] if self.protected != "none": # Set the protected group to 0 and all others 1 protected_full = [ int(item) for item in ~(np.array(X_full[self.protected]) == self.protected_label) ] protected_train = [ int(item) for item in ~(np.array(X[self.protected]) == self.protected_label) ] protected_valid = [ int(item) for item in ~(np.array(X_valid[self.protected]) == self.protected_label) ] else: mu = 0 protected_full = [] protected_train = [] protected_valid = [] # Remove the protected value from the model if self.protected != "none": X = X.drop(self.protected, axis=1) X_full = X_full.drop(self.protected, axis=1) X_valid = X_valid.drop(self.protected, axis=1) d_train = xgb.DMatrix(X, label=y, missing=np.nan) d_valid = xgb.DMatrix(X_valid, label=y_valid, missing=np.nan) # Initial run to find the optimal number of trees num_iterations = 10000 watchlist = [(d_train, 'train'), (d_valid, 'valid')] clf = xgb.train(params, d_train, num_iterations, watchlist, feval=fair_metric, verbose_eval=10, obj=fair, early_stopping_rounds=10) # Second xgboost run with the full dataset and optimal number of trees attribute_dict = clf.attributes() new_iterations = 
int(attribute_dict['best_iteration']) d_train = xgb.DMatrix(X_full, label=y_full, missing=np.nan) watchlist = [(d_train, 'train')] clf = xgb.train(params, d_train, new_iterations, watchlist, feval=fair_metric, verbose_eval=10, obj=fair) # Calculate feature importances importances_dict = clf.get_score(importance_type='gain') # Make sure the protected group has high feature importance # so that it doesn't get dropped by driverless if self.protected != "none": if len(importances_dict) > 0: importances_dict[self.protected] = max( importances_dict.values()) else: importances_dict[self.protected] = 1 for col in list(X.columns): importances_dict[col] = 1 # Make sure any dropped columns are listed with 0 importance for col in list(X.columns): if col not in importances_dict: importances_dict[col] = 0 self.mean_target = np.array(sum(y) / len(y)) loggerinfo(logger, "End fair check") loggerinfo(logger, str(mu)) loggerdata(logger, str(importances_dict)) self.is_train = True # Set model properties self.set_model_properties(model=clf, features=list(importances_dict.keys()), importances=list(importances_dict.values()), iterations=num_iterations)
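# --- Self-contained sketch (toy data, not the recipe's) of the XGBoost custom-objective hooks used above: obj must return a (gradient, hessian) pair per row, and feval a (name, value) pair. The fair objective above is exactly this plain binary log-loss plus the mu-weighted protected-group term.
import numpy as np
import xgboost as xgb

def _sigmoid(x):
    return 1.0 / (1.0 + np.exp(-x))

def logloss_obj(predt, dtrain):
    y = dtrain.get_label()
    p = _sigmoid(predt)
    return p - y, p * (1.0 - p)  # gradient and hessian of binary log-loss

def logloss_metric(predt, dtrain):
    y = dtrain.get_label()
    p = np.clip(_sigmoid(predt), 1e-7, 1.0 - 1e-7)
    return 'logloss', float(np.mean(-y * np.log(p) - (1.0 - y) * np.log(1.0 - p)))

rng = np.random.RandomState(0)
X_toy = rng.normal(size=(200, 5))
y_toy = (X_toy[:, 0] > 0).astype(float)
d_toy = xgb.DMatrix(X_toy, label=y_toy)
booster = xgb.train({'max_depth': 3}, d_toy, num_boost_round=20,
                    obj=logloss_obj, feval=logloss_metric,
                    evals=[(d_toy, 'train')], verbose_eval=False)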
def transform(self, X: dt.Frame, **kwargs): """ Uses fitted models (1 per time group) to predict the target If self.is_train exists, it means we are doing in-sample predictions if it does not then we Arima is used to predict the future :param X: Datatable Frame containing the features :return: ARIMA predictions """ # Get the logger if it exists logger = None if self.context and self.context.experiment_id: logger = make_experiment_logger( experiment_id=self.context.experiment_id, tmp_dir=self.context.tmp_dir, experiment_tmp_dir=self.context.experiment_tmp_dir) tmp_folder = self._create_tmp_folder(logger) n_jobs = self._get_n_jobs(logger, **kwargs) X = X.to_pandas() XX = X[self.tgc].copy() tgc_wo_time = list(np.setdiff1d(self.tgc, self.time_column)) if len(tgc_wo_time) > 0: XX_grp = XX.groupby(tgc_wo_time) else: XX_grp = [([None], XX)] assert len(XX_grp) > 0 num_tasks = len(XX_grp) def processor(out, res): out.append(res) pool_to_use = small_job_pool loggerinfo(logger, "Arima will use {} workers for transform".format(n_jobs)) pool = pool_to_use(logger=None, processor=processor, num_tasks=num_tasks, max_workers=n_jobs) XX_paths = [] model_paths = [] nb_groups = len(XX_grp) for _i_g, (key, X) in enumerate(XX_grp): # Just print where we are in the process of fitting models if (_i_g + 1) % max(1, nb_groups // 20) == 0: loggerinfo( logger, "Auto ARIMA : %d%% of groups transformed" % (100 * (_i_g + 1) // nb_groups)) # Create time group key to store and retrieve fitted models key = key if isinstance(key, list) else [key] grp_hash = '_'.join(map(str, key)) # Create file path to store data and pass it to the fitting pool X_path = os.path.join(tmp_folder, "autoarima_Xt" + str(uuid.uuid4())) # Commented for performance, uncomment for debug # print("ARIMA - transforming data of shape: %s for group: %s" % (str(X.shape), grp_hash)) if grp_hash in self.models: model = self.models[grp_hash] model_path = os.path.join( tmp_folder, "autoarima_modelt" + str(uuid.uuid4())) save_obj(model, model_path) save_obj(X, X_path) model_paths.append(model_path) args = (model_path, X_path, self.nan_value, hasattr(self, 'is_train'), self.time_column, self.pred_gap, tmp_folder) kwargs = {} pool.submit_tryget( None, MyParallelAutoArimaTransformer_transform_async, args=args, kwargs=kwargs, out=XX_paths) else: # Don't go through pools XX = pd.DataFrame(np.full((X.shape[0], 1), self.nan_value), columns=['yhat']) # unseen groups # Sync indices XX.index = X.index save_obj(XX, X_path) XX_paths.append(X_path) pool.finish() XX = pd.concat((load_obj(XX_path) for XX_path in XX_paths), axis=0).sort_index() for p in XX_paths + model_paths: remove(p) self._clean_tmp_folder(logger, tmp_folder) return XX
def transform(self, X: dt.Frame, **kwargs): """ Uses fitted models (1 per time group) to predict the target :param X: Datatable Frame containing the features :return: FB Prophet predictions """ # Get the logger if it exists logger = None if self.context and self.context.experiment_id: logger = make_experiment_logger( experiment_id=self.context.experiment_id, tmp_dir=self.context.tmp_dir, experiment_tmp_dir=self.context.experiment_tmp_dir ) tmp_folder = self._create_tmp_folder(logger) n_jobs = self._get_n_jobs(logger, **kwargs) # Reduce X to TGC tgc_wo_time = list(np.setdiff1d(self.tgc, self.time_column)) X = X[:, self.tgc].to_pandas() # Fill NaNs or None X = X.replace([None, np.nan], 0) # Change date feature name to match Prophet requirements X.rename(columns={self.time_column: "ds"}, inplace=True) # Predict y using unique dates X_time = X[['ds']].groupby('ds').first().reset_index() with suppress_stdout_stderr(): y_avg = self.model.predict(X_time)[['ds', 'yhat']] # Prophet transforms the date column to datetime so we need to transfrom that to merge back X_time.sort_values('ds', inplace=True) X_time['yhat'] = y_avg['yhat'] X_time.sort_index(inplace=True) # Merge back into original frame on 'ds' # pd.merge wipes the index ... so keep it to provide it again indices = X.index X = pd.merge( left=X, right=X_time[['ds', 'yhat']], on='ds', how='left' ) X.index = indices # Go through groups and recover the scaled target for knowed groups if len(tgc_wo_time) > 0: X_groups = X.groupby(tgc_wo_time) else: X_groups = [([None], X)] inverted_ys = [] for key, X_grp in X_groups: grp_hash = self.get_hash(key) # Scale target for current group if grp_hash in self.scalers.keys(): inverted_y = self.scalers[grp_hash].inverse_transform(X_grp[['yhat']]) else: inverted_y = self.general_scaler.inverse_transform(X_grp[['yhat']]) # Put back in a DataFrame to keep track of original index inverted_df = pd.DataFrame(inverted_y, columns=['yhat']) inverted_df.index = X_grp.index inverted_ys.append(inverted_df) XX_general = pd.concat(tuple(inverted_ys), axis=0).sort_index() if self.top_groups: # Go though the groups and predict only top XX_paths = [] model_paths = [] def processor(out, res): out.append(res) num_tasks = len(self.top_groups) pool_to_use = small_job_pool pool = pool_to_use(logger=None, processor=processor, num_tasks=num_tasks, max_workers=n_jobs) nb_groups = len(X_groups) for _i_g, (key, X_grp) in enumerate(X_groups): # Just log where we are in the fitting process if (_i_g + 1) % max(1, nb_groups // 20) == 0: loggerinfo(logger, "FB Prophet : %d%% of groups predicted" % (100 * (_i_g + 1) // nb_groups)) # Create dict key to store the min max scaler grp_hash = self.get_hash(key) X_path = os.path.join(tmp_folder, "fbprophet_Xt" + str(uuid.uuid4())) if grp_hash not in self.top_groups: XX = pd.DataFrame(np.full((X_grp.shape[0], 1), np.nan), columns=['yhat']) # unseen groups XX.index = X_grp.index save_obj(XX, X_path) XX_paths.append(X_path) continue if self.grp_models[grp_hash] is None: XX = pd.DataFrame(np.full((X_grp.shape[0], 1), np.nan), columns=['yhat']) # unseen groups XX.index = X_grp.index save_obj(XX, X_path) XX_paths.append(X_path) continue model = self.grp_models[grp_hash] model_path = os.path.join(tmp_folder, "fbprophet_modelt" + str(uuid.uuid4())) save_obj(model, model_path) save_obj(X_grp, X_path) model_paths.append(model_path) args = (model_path, X_path, self.priors[grp_hash], tmp_folder) kwargs = {} pool.submit_tryget(None, MyParallelProphetTransformer_transform_async, args=args, kwargs=kwargs, 
out=XX_paths) pool.finish() XX_top_groups = pd.concat((load_obj(XX_path) for XX_path in XX_paths), axis=0).sort_index() for p in XX_paths + model_paths: remove(p) self._clean_tmp_folder(logger, tmp_folder) features_df = pd.DataFrame() features_df[self.display_name + '_GrpAvg'] = XX_general['yhat'] if self.top_groups: features_df[self.display_name + f'_Top{self.top_n}Grp'] = XX_top_groups['yhat'] self._output_feature_names = list(features_df.columns) self._feature_desc = list(features_df.columns) return features_df
def fit(self, X: dt.Frame, y: np.array = None): """ Fits ARIMA models (1 per time group) using historical target values contained in y Model fitting is distributed over a pool of processes and uses file storage to share the data with workers :param X: Datatable frame containing the features :param y: numpy array containing the historical values of the target :return: self """ # Get the logger if it exists logger = None tmp_folder = str(uuid.uuid4()) + "_arima_folder/" if self.context and self.context.experiment_id: logger = make_experiment_logger( experiment_id=self.context.experiment_id, tmp_dir=self.context.tmp_dir, experiment_tmp_dir=self.context.experiment_tmp_dir ) tmp_folder = self.context.experiment_tmp_dir + "/" + str(uuid.uuid4()) + "_arima_folder/" # Create a temp folder to store files used during the multiprocessing experiment # This temp folder will be removed at the end of the process loggerinfo(logger, "Arima temp folder {}".format(tmp_folder)) try: os.mkdir(tmp_folder) except PermissionError: # This should not occur, so log a warning loggerwarning(logger, "Arima was denied temp folder creation rights") tmp_folder = temporary_files_path + "/" + str(uuid.uuid4()) + "_arima_folder/" os.mkdir(tmp_folder) except FileExistsError: # We should never be here since temp dir name is expected to be unique loggerwarning(logger, "Arima temp folder already exists") tmp_folder = self.context.experiment_tmp_dir + "/" + str(uuid.uuid4()) + "_arima_folder/" os.mkdir(tmp_folder) except: # Revert to temporary file path tmp_folder = temporary_files_path + "/" + str(uuid.uuid4()) + "_arima_folder/" os.mkdir(tmp_folder) # Import the ARIMA python module pm = importlib.import_module('pmdarima') # Init models self.models = {} # Convert to pandas X = X.to_pandas() XX = X[self.tgc].copy() XX['y'] = np.array(y) self.nan_value = np.mean(y) self.ntrain = X.shape[0] # Group the input by TGC (Time group column) excluding the time column itself tgc_wo_time = list(np.setdiff1d(self.tgc, self.time_column)) if len(tgc_wo_time) > 0: XX_grp = XX.groupby(tgc_wo_time) else: XX_grp = [([None], XX)] # Prepare for multiprocessing num_tasks = len(XX_grp) def processor(out, res): out[res[0]] = res[1] pool_to_use = small_job_pool if hasattr(self, "params_base"): max_workers = self.params_base['n_jobs'] else: loggerinfo(logger, "Custom Recipe does not have a params_base attribute") # Beware not to use the disable_gpus keyword here. 
looks like cython does not like it # max_workers = get_max_workers(True) # Just set default to 2 max_workers = 2 loggerinfo(logger, "Arima will use {} workers for parallel processing".format(max_workers)) pool = pool_to_use( logger=None, processor=processor, num_tasks=num_tasks, max_workers=max_workers ) # Build 1 ARIMA model per time group columns nb_groups = len(XX_grp) for _i_g, (key, X) in enumerate(XX_grp): # Just say where we are in the fitting process if (_i_g + 1) % max(1, nb_groups // 20) == 0: loggerinfo(logger, "Auto ARIMA : %d%% of groups fitted" % (100 * (_i_g + 1) // nb_groups)) X_path = os.path.join(tmp_folder, "autoarima_X" + str(uuid.uuid4())) X = X.reset_index(drop=True) save_obj(X, X_path) key = key if isinstance(key, list) else [key] grp_hash = '_'.join(map(str, key)) args = (X_path, grp_hash, self.time_column, tmp_folder) kwargs = {} pool.submit_tryget(None, MyParallelAutoArimaTransformer_fit_async, args=args, kwargs=kwargs, out=self.models) pool.finish() for k, v in self.models.items(): self.models[k] = load_obj(v) if v is not None else None remove(v) try: shutil.rmtree(tmp_folder) loggerinfo(logger, "Arima cleaned up temporary file folder.") except: loggerwarning(logger, "Arima could not delete the temporary file folder.") return self
def predict(self, X, **kwargs): model_config, _, _, _ = self.get_model_properties() models = model_config['models'] cap = model_config['cap'] priors = model_config['priors'] prior = model_config['prior'] if self.tgc is None or not all([x in X.names for x in self.tgc]): return np.ones(X.shape[0]) * self.nan_value logger = None if self.context and self.context.experiment_id: logger = make_experiment_logger( experiment_id=self.context.experiment_id, tmp_dir=self.context.tmp_dir, experiment_tmp_dir=self.context.experiment_tmp_dir) loggerinfo(logger, "Start Predicting with Prophet") # Reduce to TimeGroupColumns if isinstance(X, dt.Frame): # Convert to pandas XX = X[:, self.tgc].to_pandas() else: XX = X[:, self.tgc].copy() XX = XX.replace([None, np.nan], 0) XX.rename(columns={self.time_column: "ds"}, inplace=True) if self.params["growth"] == "logistic": XX["cap"] = cap # Compute groups # Group the input by TGC (Time group column) excluding the time column itself tgc_wo_time = list(np.setdiff1d(self.tgc, self.time_column)) if len(tgc_wo_time) > 0: XX_grp = XX.groupby(tgc_wo_time) else: XX_grp = [([None], XX)] # Go through groups and predict nb_groups = len(XX_grp) preds = [] for _i_g, (key, X) in enumerate(XX_grp): # Just log where we are in the prediction process if (_i_g + 1) % max(1, nb_groups // 20) == 0: loggerinfo( logger, "FB Prophet Model : %d%% of groups transformed" % (100 * (_i_g + 1) // nb_groups)) key = key if isinstance(key, list) else [key] grp_hash = '_'.join(map(str, key)) # Facebook Prophet returns the predictions ordered by time # So we should keep track of the time order for each group so that # predictions are ordered the same as the input frame # Keep track of the order order = np.argsort(pd.to_datetime(X["ds"])) if grp_hash in models.keys(): model = models[grp_hash] if model is not None: # Run prophet yhat = model.predict(X) XX = yhat else: if grp_hash in priors.keys(): XX = pd.DataFrame(np.full((X.shape[0], 1), priors[grp_hash]), columns=['yhat']) else: # This should not happen loggerinfo(logger, "Group in models but not in priors") XX = pd.DataFrame(np.full((X.shape[0], 1), prior), columns=['yhat']) else: # print("No Group") XX = pd.DataFrame(np.full((X.shape[0], 1), prior), columns=['yhat']) # unseen groups # Reorder the index like prophet re-ordered the predictions XX.index = X.index[order] # print("Transformed Output for Group") # print(XX.sort_index().head(20), flush=True) preds.append(XX[['yhat']]) XX = pd.concat(tuple(preds), axis=0).sort_index() return XX['yhat'].values
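# --- Standalone Prophet fit/predict sketch matching the 'ds'/'yhat' conventions used above (toy frame, not recipe data): the recipes rename their time column to 'ds' before fitting and read predictions back from the 'yhat' column.
import pandas as pd
from fbprophet import Prophet  # newer releases ship as: from prophet import Prophet

df = pd.DataFrame({'ds': pd.date_range('2020-01-01', periods=60, freq='D'),
                   'y': range(60)})
m = Prophet()
m.fit(df)
future = pd.DataFrame({'ds': pd.date_range('2020-03-01', periods=7, freq='D')})
print(m.predict(future)[['ds', 'yhat']])  # forecasts keyed by date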
def fit(self, X, y, sample_weight=None, eval_set=None, sample_weight_eval_set=None, **kwargs): orig_cols = list(X.names) import pandas as pd import numpy as np from sklearn.preprocessing import OneHotEncoder from collections import Counter import pygam from pygam import LinearGAM, LogisticGAM import matplotlib.pyplot as plt # Get the logger if it exists logger = None if self.context and self.context.experiment_id: logger = make_experiment_logger( experiment_id=self.context.experiment_id, tmp_dir=self.context.tmp_dir, experiment_tmp_dir=self.context.experiment_tmp_dir) # Set up temp folder tmp_folder = self._create_tmp_folder(logger) # Set up model if self.num_classes >= 2: lb = LabelEncoder() lb.fit(self.labels) y = lb.transform(y) clf = LogisticGAM(terms="auto", lam=self.params["lam"], max_iter=self.params["max_iter"]) self.is_classifier = True else: clf = LinearGAM(terms="auto", lam=self.params["lam"], max_iter=self.params["max_iter"]) self.is_classifier = False X = self.basic_impute(X) # Find the datatypes X = X.to_pandas() X.columns = orig_cols X_datatypes = [str(item) for item in list(X.dtypes)] # Change all float32 values to float64 for ii in range(len(X_datatypes)): if X_datatypes[ii] == 'float32': X = X.astype({orig_cols[ii]: np.float64}) X_datatypes = [str(item) for item in list(X.dtypes)] # List the categorical and numerical features self.X_categorical = [ orig_cols[col_count] for col_count in range(len(orig_cols)) if (X_datatypes[col_count] == 'category') or ( X_datatypes[col_count] == 'object') ] self.X_numeric = [ item for item in orig_cols if item not in self.X_categorical ] # Find the levels and mode for each categorical feature # for use in the test set self.train_levels = {} self.train_mode = {} for item in self.X_categorical: self.train_levels[item] = list(set(X[item])) self.train_mode[item] = Counter(X[item]).most_common(1)[0][0] # One hot encode the categorical features # And replace missing values with a Missing category if len(self.X_categorical) > 0: X.loc[:, self.X_categorical] = X[self.X_categorical].fillna( "Missing").copy() self.enc = OneHotEncoder(handle_unknown='ignore') self.enc.fit(X[self.X_categorical]) self.encoded_categories = list( self.enc.get_feature_names(input_features=self.X_categorical)) X_enc = self.enc.transform(X[self.X_categorical]).toarray() X = pd.concat([ X[self.X_numeric], pd.DataFrame(X_enc, columns=self.encoded_categories) ], axis=1) # Replace missing values with a missing value code self.median_train = {} if len(self.X_numeric) > 0: for colname in self.X_numeric: self.median_train[colname] = X[colname].quantile(0.5) X.loc[:, colname] = X[colname].fillna( self.median_train[colname]).copy() try: clf.fit(X, y) except np.linalg.LinAlgError as e: raise IgnoreError("np.linalg.LinAlgError") from e except pygam.utils.OptimizationError as e: raise IgnoreError("pygam.utils.OptimizationError") from e except ValueError as e: if 'On entry to DLASCL parameter number' in str(e): raise IgnoreError('On entry to DLASCL parameter number') from e raise p_values = np.array(clf.statistics_['p_values']) # Plot the partial dependence plots for each feature for ii in range(X.shape[1]): XX = clf.generate_X_grid(term=ii) plt.figure() plt.plot(XX[:, ii], clf.partial_dependence(term=ii, X=XX)) plt.plot(XX[:, ii], clf.partial_dependence(term=ii, X=XX, width=.95)[1], c='r', ls='--') plt.title("Partial Dependence " + str(ii), fontdict={'fontsize': 10}) plt.show() plt.savefig(os.path.join( tmp_folder, 'Feature_partial_dependence_' + str(ii) + 
'.png'), bbox_inches="tight") if max(p_values[0:(len(p_values) - 1)]) > 0: importances = -np.log(p_values[0:(len(p_values) - 1)] + 10**(-16)) importances = list(importances / max(importances)) else: importances = [1] * (len(p_values) - 1) self.mean_target = np.array(sum(y) / len(y)) self.set_model_properties(model=clf, features=list(X.columns), importances=importances, iterations=self.params['n_estimators'])
def fit(self, X, y, sample_weight=None, eval_set=None, sample_weight_eval_set=None, **kwargs): orig_cols = list(X.names) import pandas as pd import numpy as np from sklearn.preprocessing import OneHotEncoder from collections import Counter from sklearn.tree import DecisionTreeClassifier, DecisionTreeRegressor from sklearn.linear_model import LogisticRegression, LinearRegression from sklearn import tree import matplotlib.pyplot as plt # Get the logger if it exists logger = None if self.context and self.context.experiment_id: logger = make_experiment_logger( experiment_id=self.context.experiment_id, tmp_dir=self.context.tmp_dir, experiment_tmp_dir=self.context.experiment_tmp_dir) # Set up temp folder tmp_folder = self._create_tmp_folder(logger) # Set up model if self.num_classes >= 2: lb = LabelEncoder() lb.fit(self.labels) y = lb.transform(y) clf = DecisionTreeClassifier(random_state=42, max_depth=self.params["tree_depth"]) self.is_classifier = True else: clf = DecisionTreeRegressor(random_state=42, max_depth=self.params["tree_depth"]) self.is_classifier = False # Find the datatypes X = X.to_pandas() X.columns = orig_cols X_datatypes = [str(item) for item in list(X.dtypes)] # Change all float32 values to float64 for ii in range(len(X_datatypes)): if X_datatypes[ii] == 'float32': X = X.astype({orig_cols[ii]: np.float64}) X_datatypes = [str(item) for item in list(X.dtypes)] # List the categorical and numerical features self.X_categorical = [ orig_cols[col_count] for col_count in range(len(orig_cols)) if (X_datatypes[col_count] == 'category') or ( X_datatypes[col_count] == 'object') ] self.X_numeric = [ item for item in orig_cols if item not in self.X_categorical ] # Find the levels and mode for each categorical feature # for use in the test set self.train_levels = {} self.train_mode = {} for item in self.X_categorical: self.train_levels[item] = list(set(X[item])) self.train_mode[item] = Counter(X[item]).most_common(1)[0][0] # One hot encode the categorical features # And replace missing values with a Missing category if len(self.X_categorical) > 0: X.loc[:, self.X_categorical] = X[self.X_categorical].fillna( "Missing").copy() self.enc = OneHotEncoder(handle_unknown='ignore') self.enc.fit(X[self.X_categorical]) self.encoded_categories = list( self.enc.get_feature_names(input_features=self.X_categorical)) X_enc = self.enc.transform(X[self.X_categorical]).toarray() X = pd.concat([ X[self.X_numeric], pd.DataFrame(X_enc, columns=self.encoded_categories) ], axis=1) # Replace missing values with a missing value code if len(self.X_numeric) > 0: X.loc[:, self.X_numeric] = X[self.X_numeric].fillna(-999).copy() # Fit the decision tree clf.fit(X, y) if self.is_classifier: yy = clf.predict_proba(X) p = np.round_(yy[:, 1], 5) else: yy = clf.predict(X) p = np.round_(yy, 5) self.leaf_categories = list(set(p)) # Fit linear or logistic models to each leaf node model_array = {} equation_log = [] for cat in self.leaf_categories: if self.is_classifier: if (np.mean(y[p == cat]) < 1) and (np.mean(y[p == cat]) > 0): lm = LogisticRegression(random_state=42) lm.fit(X[p == cat], y[p == cat]) model_array[cat] = lm equation_log.append([[ int(round((1 - cat) * sum(p == cat))), int(round(cat * sum(p == cat))) ], sum(p == cat), lm.intercept_[0]] + list(lm.coef_[0])) else: loggerinfo(logger, "No leaf fit") model_array[cat] = "dt" else: try: lm = LinearRegression() lm.fit(X[p == cat], y[p == cat]) model_array[cat] = lm equation_log.append( [cat, sum(p == cat), lm.intercept_] + list(lm.coef_)) except: 
loggerinfo(logger, "No leaf fit") model_array[cat] = "dt" # Save the leaf models pd.DataFrame(equation_log, columns=['leaf value', 'number of samples', 'intercept'] + list(X.columns)).to_csv( os.path.join(tmp_folder, 'Leaf_model_coef.csv')) # Plot the decision tree fig, axes = plt.subplots(nrows=1, ncols=1, figsize=(8, 8), dpi=1600) tree.plot_tree(clf, feature_names=list(X.columns)) fig.savefig(os.path.join(tmp_folder, 'Decision_tree_plot.png')) importances = clf.feature_importances_ loggerinfo(logger, str(importances)) self.mean_target = np.array(sum(y) / len(y)) model = [clf, model_array] # Set model properties self.set_model_properties(model=model, features=list(X.columns), importances=importances, iterations=self.params['n_estimators'])
def fit(self, X, y, sample_weight=None, eval_set=None, sample_weight_eval_set=None, **kwargs):
    logger = None
    if self._make_logger:
        # Example use of logger, with required import of:
        # from h2oaicore.systemutils import make_experiment_logger, loggerinfo
        # Can use loggerwarning, loggererror, etc. for different levels
        if self.context and self.context.experiment_id:
            logger = make_experiment_logger(
                experiment_id=self.context.experiment_id,
                tmp_dir=self.context.tmp_dir,
                experiment_tmp_dir=self.context.experiment_tmp_dir)

    if self._show_logger_test:
        loggerinfo(logger, "TestLOGGER: Fit CatBoost")

    if self._show_task_test:
        # Example task sync operations
        if hasattr(self, 'test_count'):
            self.test_count += 1
        else:
            self.test_count = 0

        # The below generates a message in the GUI notifications panel
        if self.test_count == 0 and self.context and self.context.experiment_id:
            warning = "TestWarning: First CatBoost fit for this model instance"
            loggerwarning(logger, warning)
            task = kwargs.get('task')
            if task:
                task.sync(key=self.context.experiment_id,
                          progress=dict(type='warning', data=warning))
                task.flush()

        # The below generates a message in the GUI top-middle panel above the progress wheel
        if self.test_count == 0 and self.context and self.context.experiment_id:
            message = "Tuning CatBoost"
            loggerinfo(logger, message)
            task = kwargs.get('task')
            if task:
                task.sync(key=self.context.experiment_id,
                          progress=dict(type='update', message=message))
                task.flush()

    from catboost import CatBoostClassifier, CatBoostRegressor, EFstrType

    # Label encode the target and set up the type of problem
    lb = LabelEncoder()
    if self.num_classes >= 2:
        lb.fit(self.labels)
        y = lb.transform(y)
        if eval_set is not None:
            valid_X = eval_set[0][0]
            valid_y = eval_set[0][1]
            valid_y = lb.transform(valid_y)
            eval_set = [(valid_X, valid_y)]
        self.params.update({'objective': 'Logloss'})
    if self.num_classes > 2:
        self.params.update({'objective': 'MultiClass'})

    if isinstance(X, dt.Frame):
        orig_cols = list(X.names)
        numeric_cols = list(X[:, [bool, int, float]].names)
    else:
        orig_cols = list(X.columns)
        numeric_cols = list(X.select_dtypes([np.number]).columns)

    # Unlike LightGBM, which needs label-encoded categoricals, CatBoost can take raw strings etc.
    self.params['cat_features'] = [
        i for i, x in enumerate(orig_cols)
        if 'CatOrig:' in x or 'Cat:' in x or x not in numeric_cols
    ]

    if not self.get_uses_gpus(self.params):
        # Monotonicity constraints are not available on GPU for CatBoost
        # get names of columns in same order
        X_names = list(dt.Frame(X).names)
        X_numeric = self.get_X_ordered_numerics(X)
        X_numeric_names = list(X_numeric.names)
        _, _, constraints, _ = self.set_monotone_constraints(X=X_numeric, y=y)
        # if non-numerics, then fix those to have 0 constraint
        self.params['monotone_constraints'] = [0] * len(X_names)
        colnumi = 0
        for coli, coln in enumerate(X_names):
            if coln in X_numeric_names:
                self.params['monotone_constraints'][coli] = constraints[colnumi]
                colnumi += 1

    if isinstance(X, dt.Frame) and len(self.params['cat_features']) == 0:
        # dt -> catboost internally using buffer leaks, so convert here
        # assume predict is after pipeline collection or in subprocess so needs no protection
        X = X.to_numpy()  # don't assign back to X so don't damage during predict
        X = np.ascontiguousarray(
            X, dtype=np.float32 if config.data_precision == "float32" else np.float64)
        if eval_set is not None:
            valid_X = eval_set[0][0].to_numpy()  # don't assign back to X so don't damage during predict
            valid_X = np.ascontiguousarray(
                valid_X,
                dtype=np.float32 if config.data_precision == "float32" else np.float64)
            valid_y = eval_set[0][1]
            eval_set = [(valid_X, valid_y)]

    if eval_set is not None:
        valid_X_shape = eval_set[0][0].shape
    else:
        valid_X_shape = None

    X, eval_set = self.process_cats(X, eval_set, orig_cols)

    # Modify self.params_base['gpu_id'] based upon actually-available GPUs given training and valid shapes
    self.acquire_gpus_function(train_shape=X.shape, valid_shape=valid_X_shape)

    params = copy.deepcopy(self.params)  # keep separate, since these can then be pulled from LightGBM params
    params = self.transcribe_params(params=params, **kwargs)

    if logger is not None:
        loggerdata(
            logger,
            "CatBoost parameters: params_base : %s params: %s catboost_params: %s" %
            (str(self.params_base), str(self.params), str(params)))

    if self.num_classes == 1:
        self.model = CatBoostRegressor(**params)
    else:
        self.model = CatBoostClassifier(**params)
    # Hit sometimes: Exception: catboost/libs/data_new/quantization.cpp:779: All features are either constant or ignored.
if self.num_classes == 1: # assume not mae, which would use median # baseline = [np.mean(y)] * len(y) baseline = None else: baseline = None kwargs_fit = dict(baseline=baseline, eval_set=eval_set) pickle_path = None if config.debug_daimodel_level >= 2: self.uuid = str(uuid.uuid4())[:6] pickle_path = os.path.join(exp_dir(), "catboost%s.tmp.pickle" % self.uuid) save_obj((self.model, X, y, sample_weight, kwargs_fit), pickle_path) # FIT (with migration safety before hyperopt/Optuna function added) try: if hasattr(self, 'dask_or_hyper_or_normal_fit'): self.dask_or_hyper_or_normal_fit(X, y, sample_weight=sample_weight, kwargs=kwargs, **kwargs_fit) else: self.model.fit(X, y, sample_weight=sample_weight, **kwargs_fit) except Exception as e: if "All features are either constant or ignored" in str(e): raise IgnoreEntirelyError(str(e)) raise if config.debug_daimodel_level <= 2: remove(pickle_path) # https://catboost.ai/docs/concepts/python-reference_catboostclassifier.html # need to move to wrapper if self.model.get_best_iteration() is not None: iterations = self.model.get_best_iteration() + 1 else: iterations = self.params['n_estimators'] # must always set best_iterations self.model_path = None importances = copy.deepcopy(self.model.feature_importances_) if not self._save_by_pickle: self.uuid = str(uuid.uuid4())[:6] model_file = "catboost_%s.bin" % str(self.uuid) self.model_path = os.path.join(self.context.experiment_tmp_dir, model_file) self.model.save_model(self.model_path) with open(self.model_path, mode='rb') as f: model = f.read() else: model = self.model self.set_model_properties( model= model, # overwrites self.model object with bytes if not using pickle features=orig_cols, importances=importances, iterations=iterations)
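# Standalone sketch of the constraint-alignment loop above: constraints are
# computed only for numeric columns, then scattered back into a list that
# follows the full column order, with 0 (no constraint) for non-numerics.
# `X_names`, `X_numeric_names`, and `constraints` are hypothetical inputs here.
X_names = ['num_a', 'cat_b', 'num_c']
X_numeric_names = ['num_a', 'num_c']
constraints = [1, -1]  # one entry per numeric column, in X_numeric_names order

monotone_constraints = [0] * len(X_names)
colnumi = 0
for coli, name in enumerate(X_names):
    if name in X_numeric_names:
        monotone_constraints[coli] = constraints[colnumi]
        colnumi += 1
print(monotone_constraints)  # [1, 0, -1]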
def predict(self, X, **kwargs):
    orig_cols = list(X.names)
    import pandas as pd
    import xgboost as xgb
    import numpy as np

    def sigmoid(x):
        z = 1.0 / (1.0 + np.exp(-x))
        return z

    # Get the logger if it exists
    logger = None
    if self.context and self.context.experiment_id:
        logger = make_experiment_logger(
            experiment_id=self.context.experiment_id,
            tmp_dir=self.context.tmp_dir,
            experiment_tmp_dir=self.context.experiment_tmp_dir)

    X = dt.Frame(X)
    X = X.to_pandas()

    if self.protected in list(X.columns):
        # Set the protected group to 0 and all others to 1
        loggerdebug(logger, "Protected test found")
        protected_test = np.array([
            int(item) for item in
            ~(np.array(X[self.protected]) == self.protected_label)
        ])
    else:
        loggerdebug(logger, "Protected test not found")
        protected_test = np.array([])

    if self.protected in list(X.columns):
        X = X.drop(self.protected, axis=1)

    # Replace missing values with a Missing category
    # Replace categories that weren't in the training set with the mode
    if len(self.X_categorical) > 0:
        for colname in self.X_categorical:
            if colname in list(X.columns):
                X[colname] = list(X[colname].fillna("Missing"))
        for label in self.X_categorical:
            if label in list(X.columns):
                # Replace any category not seen in the training set
                train_categories = self.train_levels[label]
                X_label = np.array(X[label])
                mmode = self.train_mode[label]
                X_label[~np.isin(X_label, train_categories)] = mmode
                X[label] = X_label

    # Replace missing values with a missing value code
    if len(self.X_numeric) > 0:
        for colname in self.X_numeric:
            if colname in list(X.columns):
                X[colname] = list(X[colname].fillna(-999))

    # Get model
    model, _, _, _ = self.get_model_properties()

    # Remove the protected group
    if self.protected in self.X_categorical:
        self.X_categorical.remove(self.protected)

    # One hot encode categorical features
    if len(self.X_categorical) > 0:
        X_enc = self.enc.transform(X[self.X_categorical]).toarray()
        X = pd.concat([
            X[self.X_numeric],
            pd.DataFrame(X_enc, columns=self.encoded_categories)
        ], axis=1)

    d_test = xgb.DMatrix(X, missing=np.nan)

    # If the positive target was 0, change the final result to 1-p
    if self.positive_target == 0:
        preds = 1.0 - sigmoid(model.predict(d_test))
    else:
        preds = sigmoid(model.predict(d_test))

    mean_preds = np.mean(preds)

    # Set a penalty value to which some probabilities will be changed
    # if the fairness threshold isn't reached
    epsilon = 0.0001
    if mean_preds > 0.5:
        penalty = epsilon
    else:
        penalty = 1.0 - epsilon

    # Only apply penalties in the training stage
    if self.is_train:
        # If the protected value was removed, use the maximum penalty
        # by changing all probabilities to the penalty value
        # (the recipe needs to be able to use the protected values)
        if self.protected == "none":
            preds[0:len(preds)] = penalty
            loggerdata(logger, str(preds))
            loggerdata(logger, "Removal_penalty")
        else:
            # The mean ratio calculation for target=0 and target=1
            if self.positive_target == 0:
                if np.mean(preds[protected_test == 1]) < 1.0:
                    DI = (1.0 - np.mean(preds[protected_test == 0])) / (
                        1.0 - np.mean(preds[protected_test == 1]))
                else:
                    DI = 1
            else:
                if np.mean(preds[protected_test == 1]) > 0.0:
                    DI = np.mean(preds[protected_test == 0]) / np.mean(
                        preds[protected_test == 1])
                else:
                    DI = 1

            loggerdata(logger, "Mean ratio check")
            loggerdata(logger, str(DI))

            if DI < self.mean_protected_prediction_ratio_minimum:
                # Create a penalty proportional to the distance below the specified threshold
                len_preds = len(preds)
                num_penalty = min(
                    len_preds,
                    int((self.mean_protected_prediction_ratio_minimum - DI) /
                        self.mean_protected_prediction_ratio_minimum * len_preds))

                preds[0:num_penalty] = penalty
                loggerdata(logger, "num_penalty1")
                loggerdata(logger, "%s %s" % (num_penalty, num_penalty / len(preds)))

    self.is_train = False

    return preds
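# Toy illustration of the "mean ratio" (disparate impact) check above for the
# positive_target == 1 case: the mean predicted probability of the protected
# group is divided by the mean for everyone else, and values below the
# configured minimum trigger the penalty. The arrays here are invented.
import numpy as np

preds = np.array([0.2, 0.7, 0.4, 0.9, 0.3, 0.6])
protected_flag = np.array([0, 1, 0, 1, 0, 1])  # 0 = protected group, 1 = rest,
                                               # matching the encoding above
DI = preds[protected_flag == 0].mean() / preds[protected_flag == 1].mean()
print(DI)  # 0.3 / 0.733... ~= 0.41; below e.g. 0.8, so a penalty would apply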
def fit(self, X, y, sample_weight=None, eval_set=None, sample_weight_eval_set=None, **kwargs):
    # Example use of logger, with required import of:
    # from h2oaicore.systemutils import make_experiment_logger, loggerinfo
    # Can use loggerwarning, loggererror, etc. for different levels
    logger = None
    if self.context and self.context.experiment_id:
        logger = make_experiment_logger(experiment_id=self.context.experiment_id,
                                        tmp_dir=self.context.tmp_dir,
                                        experiment_tmp_dir=self.context.experiment_tmp_dir)
    loggerinfo(logger, "TestLOGGER: Fit CatBoost")

    # Example task sync operations
    if hasattr(self, 'test_count'):
        self.test_count += 1
    else:
        self.test_count = 0

    # The below generates a message in the GUI notifications panel
    if self.test_count == 0 and self.context and self.context.experiment_id:
        warning = "TestWarning: First CatBoost fit for this model instance"
        loggerwarning(logger, warning)
        task = kwargs.get('task')
        if task:
            task.sync(key=self.context.experiment_id,
                      progress=dict(type='warning', data=warning))
            task.flush()

    # The below generates a message in the GUI top-middle panel above the progress wheel
    if self.test_count == 0 and self.context and self.context.experiment_id:
        message = "TestMessage: CatBoost"
        loggerinfo(logger, message)
        task = kwargs.get('task')
        if task:
            task.sync(key=self.context.experiment_id,
                      progress=dict(type='update', message=message))
            task.flush()

    from catboost import CatBoostClassifier, CatBoostRegressor, EFstrType

    lb = LabelEncoder()
    if self.num_classes >= 2:
        lb.fit(self.labels)
        y = lb.transform(y)

    if isinstance(X, dt.Frame):
        orig_cols = list(X.names)
        # dt -> catboost internally using buffer leaks, so convert here
        # assume predict is after pipeline collection or in subprocess so needs no protection
        X = X.to_numpy()  # don't assign back to X so don't damage during predict
        X = np.ascontiguousarray(
            X, dtype=np.float32 if config.data_precision == "float32" else np.float64)
        if eval_set is not None:
            valid_X = eval_set[0][0].to_numpy()  # don't assign back to X so don't damage during predict
            valid_X = np.ascontiguousarray(
                valid_X,
                dtype=np.float32 if config.data_precision == "float32" else np.float64)
            valid_y = eval_set[0][1]
            if self.num_classes >= 2:
                valid_y = lb.transform(valid_y)
            eval_set[0] = (valid_X, valid_y)
    else:
        orig_cols = list(X.columns)

    if self.num_classes == 1:
        model = CatBoostRegressor(**self.params)
    else:
        model = CatBoostClassifier(**self.params)
    # Hit sometimes: Exception: catboost/libs/data_new/quantization.cpp:779: All features are either constant or ignored.

    if self.num_classes == 1:
        # assume not mae, which would use median
        # baseline = [np.mean(y)] * len(y)
        baseline = None
    else:
        baseline = None

    model.fit(X, y=y,
              sample_weight=sample_weight,
              baseline=baseline,
              eval_set=eval_set,
              early_stopping_rounds=kwargs.get('early_stopping_rounds', None),
              verbose=self.params.get('verbose', False))

    # need to move to wrapper
    if model.get_best_iteration() is not None:
        iterations = model.get_best_iteration() + 1
    else:
        iterations = self.params['iterations'] + 1
    # must always set best_iterations
    self.set_model_properties(model=model,
                              features=orig_cols,
                              importances=model.feature_importances_,
                              iterations=iterations)
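# Side sketch: why the validation target above is transformed with the same
# LabelEncoder fitted on self.labels - both splits must share one
# label-to-integer mapping, or the model would see inconsistent classes.
# The data here is invented.
from sklearn.preprocessing import LabelEncoder

labels = ['cat', 'dog', 'fish']                 # full label universe, as in self.labels
lb = LabelEncoder().fit(labels)
y_train = lb.transform(['dog', 'cat', 'dog'])   # -> [1, 0, 1]
y_valid = lb.transform(['fish', 'dog'])         # -> [2, 1], same mapping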
def fit(self, X: dt.Frame, y: np.array = None, **kwargs): """ Fits FB Prophet models (1 per time group) using historical target values contained in y Model fitting is distributed over a pool of processes and uses file storage to share the data with workers :param X: Datatable frame containing the features :param y: numpy array containing the historical values of the target :return: self """ # Get the logger if it exists logger = None if self.context and self.context.experiment_id: logger = make_experiment_logger( experiment_id=self.context.experiment_id, tmp_dir=self.context.tmp_dir, experiment_tmp_dir=self.context.experiment_tmp_dir ) try: # Add value of prophet_top_n in recipe_dict variable inside of config.toml file # eg1: recipe_dict="{'prophet_top_n': 200}" # eg2: recipe_dict="{'prophet_top_n':10}" self.top_n = config.recipe_dict['prophet_top_n'] except KeyError: self.top_n = 50 loggerinfo(logger, f"Prophet will use {self.top_n} groups as well as average target data.") tmp_folder = self._create_tmp_folder(logger) n_jobs = self._get_n_jobs(logger, **kwargs) # Reduce X to TGC tgc_wo_time = list(np.setdiff1d(self.tgc, self.time_column)) X = X[:, self.tgc].to_pandas() # Fill NaNs or None X = X.replace([None, np.nan], 0) # Add target, Label encoder is only used for Classif. which we don't support... if self.labels is not None: y = LabelEncoder().fit(self.labels).transform(y) X['y'] = np.array(y) self.nan_value = X['y'].mean() # Change date feature name to match Prophet requirements X.rename(columns={self.time_column: "ds"}, inplace=True) # Create a general scale now that will be used for unknown groups at prediction time # Can we do smarter than that ? self.general_scaler = MinMaxScaler().fit(X[['y', 'ds']].groupby('ds').median().values) # Go through groups and standard scale them if len(tgc_wo_time) > 0: X_groups = X.groupby(tgc_wo_time) else: X_groups = [([None], X)] self.scalers = {} scaled_ys = [] print(f'{datetime.now()} Start of group scaling') for key, X_grp in X_groups: # Create dict key to store the min max scaler grp_hash = self.get_hash(key) # Scale target for current group self.scalers[grp_hash] = MinMaxScaler() y_skl = self.scalers[grp_hash].fit_transform(X_grp[['y']].values) # Put back in a DataFrame to keep track of original index y_skl_df = pd.DataFrame(y_skl, columns=['y']) # (0, 'A') (1, 4) (100, 1) (100, 1) # print(grp_hash, X_grp.shape, y_skl.shape, y_skl_df.shape) y_skl_df.index = X_grp.index scaled_ys.append(y_skl_df) print(f'{datetime.now()} End of group scaling') # Set target back in original frame but keep original X['y_orig'] = X['y'] X['y'] = pd.concat(tuple(scaled_ys), axis=0) # Now Average groups X_avg = X[['ds', 'y']].groupby('ds').mean().reset_index() # Send that to Prophet params = { "country_holidays": self.country_holidays, "monthly_seasonality": self.monthly_seasonality } mod = importlib.import_module('fbprophet') Prophet = getattr(mod, "Prophet") self.model = Prophet(yearly_seasonality=True, weekly_seasonality=True, daily_seasonality=True) if params["country_holidays"] is not None: self.model.add_country_holidays(country_name=params["country_holidays"]) if params["monthly_seasonality"]: self.model.add_seasonality(name='monthly', period=30.5, fourier_order=5) with suppress_stdout_stderr(): self.model.fit(X[['ds', 'y']]) print(f'{datetime.now()} General Model Fitted') self.top_groups = None if len(tgc_wo_time) > 0: if self.top_n > 0: top_n_grp = X.groupby(tgc_wo_time).size().sort_values().reset_index()[tgc_wo_time].iloc[-self.top_n:].values self.top_groups 
= [ '_'.join(map(str, key)) for key in top_n_grp ] if self.top_groups: self.grp_models = {} self.priors = {} # Prepare for multi processing num_tasks = len(self.top_groups) def processor(out, res): out[res[0]] = res[1] pool_to_use = small_job_pool loggerinfo(logger, f"Prophet will use {n_jobs} workers for fitting.") loggerinfo(logger, "Prophet parameters holidays {} / monthly {}".format(self.country_holidays, self.monthly_seasonality)) pool = pool_to_use( logger=None, processor=processor, num_tasks=num_tasks, max_workers=n_jobs ) # # Fit 1 FB Prophet model per time group columns nb_groups = len(X_groups) # Put y back to its unscaled value for top groups X['y'] = X['y_orig'] for _i_g, (key, X) in enumerate(X_groups): # Just log where we are in the fitting process if (_i_g + 1) % max(1, nb_groups // 20) == 0: loggerinfo(logger, "FB Prophet : %d%% of groups fitted" % (100 * (_i_g + 1) // nb_groups)) X_path = os.path.join(tmp_folder, "fbprophet_X" + str(uuid.uuid4())) X = X.reset_index(drop=True) save_obj(X, X_path) grp_hash = self.get_hash(key) if grp_hash not in self.top_groups: continue self.priors[grp_hash] = X['y'].mean() params = { "country_holidays": self.country_holidays, "monthly_seasonality": self.monthly_seasonality } args = (X_path, grp_hash, tmp_folder, params) kwargs = {} pool.submit_tryget(None, MyParallelProphetTransformer_fit_async, args=args, kwargs=kwargs, out=self.grp_models) pool.finish() for k, v in self.grp_models.items(): self.grp_models[k] = load_obj(v) if v is not None else None remove(v) self._clean_tmp_folder(logger, tmp_folder) return self
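# Compact sketch of the per-group scaling pattern used in fit() above: one
# MinMaxScaler per time-group key, kept in a dict so that predictions can
# later be inverse-transformed group by group. The frame and group key are
# illustrative only.
import pandas as pd
from sklearn.preprocessing import MinMaxScaler

df = pd.DataFrame({'store': ['A', 'A', 'B', 'B'], 'y': [10.0, 30.0, 1.0, 5.0]})
scalers = {}
scaled = []
for key, grp in df.groupby('store'):
    scalers[key] = MinMaxScaler()
    part = pd.DataFrame(scalers[key].fit_transform(grp[['y']]),
                        columns=['y'], index=grp.index)  # keep original index
    scaled.append(part)
df['y_scaled'] = pd.concat(scaled).sort_index()['y']
# Later: scalers[key].inverse_transform(pred_df[['yhat']]) per group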
def fit(self, X, y, sample_weight=None, eval_set=None, sample_weight_eval_set=None, **kwargs): # Get TGC and time column self.tgc = self.params_base.get('tgc', None) self.time_column = self.params_base.get('time_column', None) self.nan_value = np.mean(y) self.cap = np.max( y ) * 1.5 # TODO Don't like this we should compute a cap from average yearly growth self.prior = np.mean(y) if self.time_column is None: self.time_column = self.tgc[0] # Get the logger if it exists logger = None if self.context and self.context.experiment_id: logger = make_experiment_logger( experiment_id=self.context.experiment_id, tmp_dir=self.context.tmp_dir, experiment_tmp_dir=self.context.experiment_tmp_dir) loggerinfo( logger, "Start Fitting Prophet Model with params : {}".format(self.params)) # Get temporary folders for multi process communication tmp_folder = self._create_tmp_folder(logger) n_jobs = self._get_n_jobs(logger, **kwargs) # Convert to pandas XX = X[:, self.tgc].to_pandas() XX = XX.replace([None, np.nan], 0) XX.rename(columns={self.time_column: "ds"}, inplace=True) # Make target available in the Frame XX['y'] = np.array(y) # Set target prior self.nan_value = np.mean(y) # Group the input by TGC (Time group column) excluding the time column itself tgc_wo_time = list(np.setdiff1d(self.tgc, self.time_column)) if len(tgc_wo_time) > 0: XX_grp = XX.groupby(tgc_wo_time) else: XX_grp = [([None], XX)] self.models = {} self.priors = {} # Prepare for multi processing num_tasks = len(XX_grp) def processor(out, res): out[res[0]] = res[1] pool_to_use = small_job_pool loggerdebug(logger, "Prophet will use {} workers for fitting".format(n_jobs)) pool = pool_to_use(logger=None, processor=processor, num_tasks=num_tasks, max_workers=n_jobs) # Fit 1 FB Prophet model per time group columns nb_groups = len(XX_grp) for _i_g, (key, X) in enumerate(XX_grp): # Just log where we are in the fitting process if (_i_g + 1) % max(1, nb_groups // 20) == 0: loggerinfo( logger, "FB Prophet : %d%% of groups fitted" % (100 * (_i_g + 1) // nb_groups)) X_path = os.path.join(tmp_folder, "fbprophet_X" + str(uuid.uuid4())) X = X.reset_index(drop=True) save_obj(X, X_path) key = key if isinstance(key, list) else [key] grp_hash = '_'.join(map(str, key)) self.priors[grp_hash] = X['y'].mean() args = (X_path, grp_hash, tmp_folder, self.params, self.cap) kwargs = {} pool.submit_tryget(None, MyParallelProphetTransformer_fit_async, args=args, kwargs=kwargs, out=self.models) pool.finish() for k, v in self.models.items(): self.models[k] = load_obj(v) if v is not None else None remove(v) self._clean_tmp_folder(logger, tmp_folder) return None
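# Quick illustration of the group construction above: the time group columns
# minus the time column drive a pandas groupby, and when no extra group
# column exists the whole frame is treated as a single pseudo-group.
# Data is invented; the tuple/list handling covers both pandas behaviors.
import numpy as np
import pandas as pd

tgc, time_column = ['store', 'date'], 'date'
XX = pd.DataFrame({'store': ['A', 'B'], 'date': ['2020-01-01', '2020-01-01']})
tgc_wo_time = list(np.setdiff1d(tgc, time_column))
XX_grp = XX.groupby(tgc_wo_time) if len(tgc_wo_time) > 0 else [([None], XX)]
for key, grp in XX_grp:
    key = list(key) if isinstance(key, (list, tuple)) else [key]
    print('_'.join(map(str, key)), grp.shape)  # the grp_hash used to index models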
def fit(self, X: dt.Frame, y: np.array = None, **kwargs): """ Fits ARIMA models (1 per time group) using historical target values contained in y Model fitting is distributed over a pool of processes and uses file storage to share the data with workers :param X: Datatable frame containing the features :param y: numpy array containing the historical values of the target :return: self """ # Get the logger if it exists logger = None tmp_folder = str(uuid.uuid4()) + "_arima_folder/" if self.context and self.context.experiment_id: logger = make_experiment_logger( experiment_id=self.context.experiment_id, tmp_dir=self.context.tmp_dir, experiment_tmp_dir=self.context.experiment_tmp_dir) tmp_folder = self._create_tmp_folder(logger) n_jobs = self._get_n_jobs(logger, **kwargs) # Import the ARIMA python module pm = importlib.import_module('pmdarima') # Init models self.models = {} # Convert to pandas X = X.to_pandas() XX = X[self.tgc].copy() XX['y'] = np.array(y) self.nan_value = np.mean(y) self.ntrain = X.shape[0] # Group the input by TGC (Time group column) excluding the time column itself tgc_wo_time = list(np.setdiff1d(self.tgc, self.time_column)) if len(tgc_wo_time) > 0: XX_grp = XX.groupby(tgc_wo_time) else: XX_grp = [([None], XX)] # Prepare for multi processing num_tasks = len(XX_grp) def processor(out, res): out[res[0]] = res[1] pool_to_use = small_job_pool loggerinfo( logger, "Arima will use {} workers for parallel processing".format(n_jobs)) pool = pool_to_use(logger=None, processor=processor, num_tasks=num_tasks, max_workers=n_jobs) # Build 1 ARIMA model per time group columns nb_groups = len(XX_grp) for _i_g, (key, X) in enumerate(XX_grp): # Just say where we are in the fitting process if (_i_g + 1) % max(1, nb_groups // 20) == 0: loggerinfo( logger, "Auto ARIMA : %d%% of groups fitted" % (100 * (_i_g + 1) // nb_groups)) X_path = os.path.join(tmp_folder, "autoarima_X" + str(uuid.uuid4())) X = X.reset_index(drop=True) save_obj(X, X_path) key = key if isinstance(key, list) else [key] grp_hash = '_'.join(map(str, key)) args = (X_path, grp_hash, self.time_column, tmp_folder) kwargs = {} pool.submit_tryget(None, MyParallelAutoArimaTransformer_fit_async, args=args, kwargs=kwargs, out=self.models) pool.finish() for k, v in self.models.items(): self.models[k] = load_obj(v) if v is not None else None remove(v) self._clean_tmp_folder(logger, tmp_folder) return self
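# The small_job_pool used above is a Driverless AI utility. As an
# assumption-laden analog, the same "fit each group in a worker, collect the
# results into a dict" shape can be sketched with concurrent.futures;
# fit_one_group is a hypothetical stand-in for the per-group ARIMA fit.
from concurrent.futures import ProcessPoolExecutor

def fit_one_group(args):
    grp_hash, data = args
    return grp_hash, sum(data) / len(data)  # stand-in for a fitted model

groups = {'A': [1.0, 2.0, 3.0], 'B': [4.0, 5.0]}
models = {}
if __name__ == '__main__':
    with ProcessPoolExecutor(max_workers=2) as pool:
        for grp_hash, model in pool.map(fit_one_group, groups.items()):
            models[grp_hash] = model  # mirrors processor(out, res) above
    print(models)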
def fit(self, X, y, sample_weight=None, eval_set=None, sample_weight_eval_set=None, **kwargs):
    orig_cols = list(X.names)
    import pandas as pd
    import numpy as np
    from skrules import SkopeRules
    from sklearn.preprocessing import OneHotEncoder
    from collections import Counter

    # Get the logger if it exists
    logger = None
    if self.context and self.context.experiment_id:
        logger = make_experiment_logger(experiment_id=self.context.experiment_id,
                                        tmp_dir=self.context.tmp_dir,
                                        experiment_tmp_dir=self.context.experiment_tmp_dir)

    # Set up temp folder
    tmp_folder = self._create_tmp_folder(logger)

    # Set up model
    if self.num_classes >= 2:
        lb = LabelEncoder()
        lb.fit(self.labels)
        y = lb.transform(y)

        model = SkopeRules(max_depth_duplication=self.params["max_depth_duplication"],
                           n_estimators=self.params["n_estimators"],
                           precision_min=self.params["precision_min"],
                           recall_min=self.params["recall_min"],
                           max_samples=self.params["max_samples"],
                           max_samples_features=self.params["max_samples_features"],
                           max_depth=self.params["max_depth"],
                           max_features=self.params["max_features"],
                           min_samples_split=self.params["min_samples_split"],
                           bootstrap=self.params["bootstrap"],
                           bootstrap_features=self.params["bootstrap_features"],
                           random_state=self.params["random_state"],
                           feature_names=orig_cols)
    else:
        # SkopeRules does not support regression
        loggerinfo(logger, "PASS, no SkopeRules model")
        pass

    # Find the datatypes
    X = X.to_pandas()
    X.columns = orig_cols
    X_datatypes = [str(item) for item in list(X.dtypes)]

    # Cast all float32 columns to float64
    for ii in range(len(X_datatypes)):
        if X_datatypes[ii] == 'float32':
            X = X.astype({orig_cols[ii]: np.float64})
    X_datatypes = [str(item) for item in list(X.dtypes)]

    # List the categorical and numerical features
    self.X_categorical = [orig_cols[col_count] for col_count in range(len(orig_cols))
                          if (X_datatypes[col_count] == 'category')
                          or (X_datatypes[col_count] == 'object')]
    self.X_numeric = [item for item in orig_cols if item not in self.X_categorical]

    # Find the levels and mode for each categorical feature
    # for use in the test set
    self.train_levels = {}
    self.train_mode = {}
    for item in self.X_categorical:
        self.train_levels[item] = list(set(X[item]))
        self.train_mode[item] = Counter(X[item]).most_common(1)[0][0]

    # One hot encode the categorical features
    # and replace missing values with a Missing category
    if len(self.X_categorical) > 0:
        loggerinfo(logger, "Categorical encode")
        for colname in self.X_categorical:
            X[colname] = list(X[colname].fillna("Missing"))

        self.enc = OneHotEncoder(handle_unknown='ignore')
        self.enc.fit(X[self.X_categorical])
        self.encoded_categories = list(self.enc.get_feature_names(input_features=self.X_categorical))

        X_enc = self.enc.transform(X[self.X_categorical]).toarray()
        X = pd.concat([X[self.X_numeric],
                       pd.DataFrame(X_enc, columns=self.encoded_categories)], axis=1)

    # Replace missing values with a missing value code
    if len(self.X_numeric) > 0:
        for colname in self.X_numeric:
            X[colname] = list(X[colname].fillna(-999))

    model.fit(np.array(X), np.array(y))

    # Find the rule list
    self.rule_list = model.rules_

    # Calculate feature importances
    var_imp = []
    for var in orig_cols:
        var_imp.append(sum(int(var in item[0]) for item in self.rule_list))

    if max(var_imp) != 0:
        importances = list(np.array(var_imp) / max(var_imp))
    else:
        importances = [1] * len(var_imp)

    pd.DataFrame(model.rules_, columns=['Rule', '(Precision, Recall, nb)']).to_csv(
        os.path.join(tmp_folder, 'Skope_rules.csv'), index=False)

    self.mean_target = np.array(sum(y) / len(y))

    # Set model properties
self.set_model_properties(model=model, features=list(X.columns), importances=importances, iterations=self.params['n_estimators'])
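# Toy version of the importance heuristic above: each feature's importance is
# the number of extracted rules whose condition string mentions it, rescaled
# by the maximum count. The rules below are fabricated for illustration.
import numpy as np

orig_cols = ['age', 'income', 'tenure']
rule_list = [('age > 30 and income <= 50000', (0.9, 0.4, 3)),
             ('age <= 30', (0.8, 0.2, 1))]
var_imp = [sum(int(var in rule[0]) for rule in rule_list) for var in orig_cols]
if max(var_imp) != 0:
    importances = list(np.array(var_imp) / max(var_imp))
else:
    importances = [1] * len(var_imp)
print(importances)  # age in 2 rules, income in 1, tenure in 0 -> [1.0, 0.5, 0.0]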
def fit(self, X, y, sample_weight=None, eval_set=None, sample_weight_eval_set=None, **kwargs): # Get TGC and time column self.tgc = self.params_base.get('tgc', None) self.time_column = self.params_base.get('time_column', None) self.nan_value = np.mean(y) self.cap = np.max( y ) * 1.5 # TODO Don't like this we should compute a cap from average yearly growth self.prior = np.mean(y) if self.time_column is None: self.time_column = self.tgc[0] # Get the logger if it exists logger = None if self.context and self.context.experiment_id: logger = make_experiment_logger( experiment_id=self.context.experiment_id, tmp_dir=self.context.tmp_dir, experiment_tmp_dir=self.context.experiment_tmp_dir) loggerinfo( logger, "Start Fitting Prophet Model with params : {}".format(self.params)) try: # Add value of prophet_top_n in recipe_dict variable inside of config.toml file # eg1: recipe_dict="{'prophet_top_n': 200}" # eg2: recipe_dict="{'prophet_top_n':10}" self.top_n = config.recipe_dict['prophet_top_n'] except KeyError: self.top_n = 50 loggerinfo( logger, f"Prophet will use {self.top_n} groups as well as average target data." ) # Get temporary folders for multi process communication tmp_folder = self._create_tmp_folder(logger) n_jobs = self._get_n_jobs(logger, **kwargs) # Reduce X to TGC tgc_wo_time = list(np.setdiff1d(self.tgc, self.time_column)) X = X[:, self.tgc].to_pandas() # Fill NaNs or None X = X.replace([None, np.nan], 0) # Add target, Label encoder is only used for Classif. which we don't support... if self.labels is not None: y = LabelEncoder().fit(self.labels).transform(y) X['y'] = np.array(y) self.nan_value = X['y'].mean() # Change date feature name to match Prophet requirements X.rename(columns={self.time_column: "ds"}, inplace=True) # Create a general scale now that will be used for unknown groups at prediction time # Can we do smarter than that ? 
general_scaler = MinMaxScaler().fit( X[['y', 'ds']].groupby('ds').median().values) # Go through groups and standard scale them if len(tgc_wo_time) > 0: X_groups = X.groupby(tgc_wo_time) else: X_groups = [([None], X)] scalers = {} scaled_ys = [] print('Number of groups : ', len(X_groups)) for g in tgc_wo_time: print(f'Number of groups in {g} groups : {X[g].unique().shape}') for key, X_grp in X_groups: # Create dict key to store the min max scaler grp_hash = self.get_hash(key) # Scale target for current group scalers[grp_hash] = MinMaxScaler() y_skl = scalers[grp_hash].fit_transform(X_grp[['y']].values) # Put back in a DataFrame to keep track of original index y_skl_df = pd.DataFrame(y_skl, columns=['y']) y_skl_df.index = X_grp.index scaled_ys.append(y_skl_df) # Set target back in original frame but keep original X['y_orig'] = X['y'] X['y'] = pd.concat(tuple(scaled_ys), axis=0) # Now Average groups X_avg = X[['ds', 'y']].groupby('ds').mean().reset_index() # Send that to Prophet mod = importlib.import_module('fbprophet') Prophet = getattr(mod, "Prophet") nrows = X[['ds', 'y']].shape[0] n_changepoints = max(1, int(nrows * (2 / 3))) if n_changepoints < 25: model = Prophet(yearly_seasonality=True, weekly_seasonality=True, daily_seasonality=True, n_changepoints=n_changepoints) else: model = Prophet(yearly_seasonality=True, weekly_seasonality=True, daily_seasonality=True) if self.params["country_holidays"] is not None: model.add_country_holidays( country_name=self.params["country_holidays"]) if self.params["monthly_seasonality"]: model.add_seasonality( name='monthly', period=30.5, fourier_order=self.params["monthly_seasonality"]) if self.params["quarterly_seasonality"]: model.add_seasonality( name='quarterly', period=92, fourier_order=self.params["quarterly_seasonality"]) with suppress_stdout_stderr(): model.fit(X[['ds', 'y']]) top_groups = None if len(tgc_wo_time) > 0: if self.top_n > 0: top_n_grp = X.groupby(tgc_wo_time).size().sort_values( ).reset_index()[tgc_wo_time].iloc[-self.top_n:].values top_groups = ['_'.join(map(str, key)) for key in top_n_grp] grp_models = {} priors = {} if top_groups: # Prepare for multi processing num_tasks = len(top_groups) def processor(out, res): out[res[0]] = res[1] pool_to_use = small_job_pool loggerinfo(logger, f"Prophet will use {n_jobs} workers for fitting.") pool = pool_to_use(logger=None, processor=processor, num_tasks=num_tasks, max_workers=n_jobs) # # Fit 1 FB Prophet model per time group columns nb_groups = len(X_groups) # Put y back to its unscaled value for top groups X['y'] = X['y_orig'] for _i_g, (key, X) in enumerate(X_groups): # Just log where we are in the fitting process if (_i_g + 1) % max(1, nb_groups // 20) == 0: loggerinfo( logger, "FB Prophet : %d%% of groups fitted" % (100 * (_i_g + 1) // nb_groups)) X_path = os.path.join(tmp_folder, "fbprophet_X" + str(uuid.uuid4())) X = X.reset_index(drop=True) save_obj(X, X_path) grp_hash = self.get_hash(key) if grp_hash not in top_groups: continue priors[grp_hash] = X['y'].mean() args = (X_path, grp_hash, tmp_folder, self.params, self.cap) kwargs = {} pool.submit_tryget(None, MyParallelProphetTransformer_fit_async, args=args, kwargs=kwargs, out=grp_models) pool.finish() for k, v in grp_models.items(): grp_models[k] = load_obj(v) if v is not None else None remove(v) self._clean_tmp_folder(logger, tmp_folder) self.set_model_properties( model={ 'avg': model, 'group': grp_models, 'priors': priors, 'topgroups': top_groups, 'skl': scalers, 'gen_scaler': general_scaler }, features=self.tgc, # Prophet uses time 
# and time group columns
            importances=np.ones(len(self.tgc)),
            iterations=-1  # Prophet does not have iterations
        )
        return None
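# Stand-alone sketch of the top-group selection above: groups are ranked by
# row count and only the self.top_n largest get their own Prophet model; the
# rest fall back to the average model. The data is invented.
import pandas as pd

X = pd.DataFrame({'store': ['A'] * 5 + ['B'] * 2 + ['C'] * 3})
top_n = 2
top_n_grp = (X.groupby(['store']).size().sort_values()
              .reset_index()[['store']].iloc[-top_n:].values)
top_groups = ['_'.join(map(str, key)) for key in top_n_grp]
print(top_groups)  # ['C', 'A'] - the two biggest groups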
def fit_transform(self, X: dt.Frame, y: np.array = None, **kwargs):
    X_original = X
    X = X[:, dt.f[int].extend(dt.f[float]).extend(dt.f[bool]).extend(dt.f[str])]
    if hasattr(self, 'run_count'):
        self.run_count += 1
    else:
        self.run_count = 0

    # Get the logger if it exists
    logger = None
    if self.context and self.context.experiment_id:
        logger = make_experiment_logger(
            experiment_id=self.context.experiment_id,
            tmp_dir=self.context.tmp_dir,
            experiment_tmp_dir=self.context.experiment_tmp_dir,
            username=self.context.username,
        )

    survival_event = self.__class__._survival_event
    if survival_event in X.names:
        raise ValueError("Consider renaming feature '{}'.".format(survival_event))

    # Bind y to X to use as event in CoxPH
    X[:, survival_event] = np.array(LabelEncoder().fit_transform(y))

    # Sanity check that the target is binary
    if X[survival_event].nunique()[0, 0] != 2:
        raise ValueError("Too many values {} in event column - must be exactly 2.".format(
            X[survival_event].nunique()[0, 0]))

    # Redress target values into 0, 1
    event_max = X[survival_event].max()[0, 0]
    X[dt.f[survival_event] != event_max, survival_event] = 0
    X[dt.f[survival_event] == event_max, survival_event] = 1

    stop_column_name = self.__class__._stop_column_name
    ignored_columns = self.__class__._ignored_columns
    if stop_column_name is None:
        raise ValueError("Stop column name can't be null.")

    main_message = "Survival Analysis CoxPH pre-transformer will use event '{}' and time '{}' columns.". \
        format(survival_event, stop_column_name)

    # In the acceptance test, simply return input X
    if stop_column_name not in X.names:
        loggerwarning(
            logger,
            "Survival Analysis CoxPH pre-transformer found no time column '{}'.".format(stop_column_name))
        return X_original

    if X[:, stop_column_name].stype not in [
            dt.bool8, dt.int8, dt.int16, dt.int32, dt.int64, dt.float32, dt.float64
    ]:
        raise ValueError("Stop column '{}' type must be numeric, but found '{}'".format(
            stop_column_name, X[:, stop_column_name].stype))

    # Remove stop column from X
    del X_original[:, stop_column_name]

    self._output_feature_names = list(X_original.names)
    self._feature_desc = list(X_original.names)

    if self.run_count == 0 and self.context and self.context.experiment_id:
        loggerinfo(logger, main_message)
        task = kwargs.get('task')
        if task and main_message is not None:
            task.sync(key=self.context.experiment_id,
                      progress=dict(type='update', message=main_message))
            task.flush()

    # Validate CoxPH requirements on the stop column
    if X[stop_column_name].min()[0, 0] < 0:
        X[dt.f[stop_column_name] < 0, stop_column_name] = 0
        loggerwarning(logger, "Stop column can't be negative: replaced negative values with 0.")
    if X[stop_column_name].countna()[0, 0] > 0:
        X[dt.isna(dt.f[stop_column_name]), stop_column_name] = 0
        loggerwarning(logger, "Stop column can't contain NULLs: replaced NULL with 0.")

    h2o.init(port=config.h2o_recipes_port, log_dir=self.my_log_dir)
    model = H2OCoxProportionalHazardsEstimator(stop_column=stop_column_name,
                                               ties=self.ties,
                                               max_iterations=self.max_iterations)
    frame = h2o.H2OFrame(X.to_pandas())
    model_path = None
    risk_frame = None
    try:
        model.train(y=survival_event, training_frame=frame, ignored_columns=ignored_columns)
        self.id = model.model_id
        model_path = os.path.join(temporary_files_path, "h2o_model." + str(uuid.uuid4()))
        model_path = h2o.save_model(model=model, path=model_path)
        with open(model_path, "rb") as f:
            self.raw_model_bytes = f.read()
        risk_frame = model.predict(frame)
        X_original[:, "risk_score_coxph_{}_{}".format(self.ties, self.max_iterations)] = \
            risk_frame.as_data_frame(header=False)
        self._output_feature_names.append(
            f"{self.display_name}{orig_feat_prefix}riskscore_coxph{extra_prefix}{self.ties}_{self.max_iterations}")
        self._feature_desc.append(
            f"CoxPH model risk score [ties={self.ties}, max.iter={self.max_iterations}]")
        return X_original
    finally:
        if model_path is not None:
            remove(model_path)
        h2o.remove(model)
        h2o.remove(frame)
        if risk_frame is not None:
            h2o.remove(risk_frame)
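# Small sketch of the event-redress step above: whatever two values the label
# encoder produced, the maximum becomes 1 and the other 0, which is the event
# indicator H2O's CoxPH expects. The frame contents are invented.
import datatable as dt

X = dt.Frame(event=[2, 5, 5, 2])
event_max = X['event'].max()[0, 0]
X[dt.f.event != event_max, 'event'] = 0
X[dt.f.event == event_max, 'event'] = 1
print(X.to_list())  # [[0, 1, 1, 0]]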