Example #1
    def transform(self, X: dt.Frame):
        logger = None
        if self.context and self.context.experiment_id:
            logger = make_experiment_logger(
                experiment_id=self.context.experiment_id,
                tmp_dir=self.context.tmp_dir,
                experiment_tmp_dir=self.context.experiment_tmp_dir)

        try:
            X = dt.Frame(X)
            original_zip_column_name = X.names[0]
            X.names = ['zip_key']
            X = X[:, str('zip_key')]
            zip_list = dt.unique(X[~dt.isna(dt.f.zip_key), 0]).to_list()[0]
            zip_features = [self.get_zipcode_features(x) for x in zip_list]
            X_g = dt.Frame({"zip_key": zip_list})
            X_g.cbind(dt.Frame(zip_features))
            X_g.key = 'zip_key'
            X_result = X[:, :, dt.join(X_g)]
            self._output_feature_names = [
                "{}.{}".format(original_zip_column_name, f)
                for f in list(X_result[:, 1:].names)
            ]
            self._feature_desc = [
                "Property '{}' of US zipcode found in '{}'".format(
                    f, original_zip_column_name)
                for f in list(X_result[:, 1:].names)
            ]
            return X_result[:, 1:]
        except Exception as ex:
            loggerwarning(
                logger, "USZipcodeDatabaseTransformer got exception {}".format(
                    type(ex).__name__))
            return np.zeros(X.shape[0])
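The `get_zipcode_features` helper called above is not included in this listing. As a rough, hypothetical sketch, assuming it is backed by the `uszipcode` package's `SearchEngine` lookup (the recipe's real implementation may expose a different set of properties):

    def get_zipcode_features(self, zipcode):
        # Hypothetical sketch: look the zip code up in the uszipcode database
        # and keep only its scalar properties as features.
        from uszipcode import SearchEngine
        search = SearchEngine()
        result = search.by_zipcode(zipcode)
        if result is None or result.zipcode is None:
            return {}
        return {k: v for k, v in result.to_dict().items()
                if not isinstance(v, (list, dict))}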
Example #2
 def get_experiment_logger(self):
     logger = None
     if self.context and self.context.experiment_id:
         logger = make_experiment_logger(
             experiment_id=self.context.experiment_id,
             tmp_dir=self.context.tmp_dir,
             experiment_tmp_dir=self.context.experiment_tmp_dir)
     return logger
 def logger(self):
     from h2oaicore import application_context
     from h2oaicore.systemutils import exp_dir
     # Don't assign to self, not picklable
     return make_experiment_logger(
         experiment_id=application_context.context.experiment_id,
         tmp_dir=None,
         experiment_tmp_dir=exp_dir())
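Either helper is then consumed through the `loggerinfo` / `loggerwarning` / `loggerdata` calls seen throughout these examples, which take the (possibly still `None`) logger as their first argument. A minimal usage sketch, assuming these helpers are imported from `h2oaicore.systemutils` alongside `make_experiment_logger`:

    def fit(self, X, y=None, **kwargs):
        # Sketch only: assumes loggerinfo/loggerwarning live in h2oaicore.systemutils.
        from h2oaicore.systemutils import loggerinfo, loggerwarning
        logger = self.get_experiment_logger()
        loggerinfo(logger, "Starting fit on frame of shape {}".format(X.shape))
        try:
            ...  # actual fitting work goes here
        except Exception as ex:
            loggerwarning(logger, "Fit failed with {}".format(type(ex).__name__))
            raise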
Example #4
    def _get_experiment_logger(self):
        # Get the logger if it exists
        logger = None
        if self.context and self.context.experiment_id:
            logger = make_experiment_logger(
                experiment_id=self.context.experiment_id,
                tmp_dir=self.context.tmp_dir,
                experiment_tmp_dir=self.context.experiment_tmp_dir)

        return logger
    def transform(self, X: dt.Frame):
        """
        Uses fitted models (1 per time group) to predict the target
        If self.is_train exists, it means we are doing in-sample predictions;
        if it does not, ARIMA is used to predict the future
        :param X: Datatable Frame containing the features
        :return: ARIMA predictions
        """
        X = X.to_pandas()
        XX = X[self.tgc].copy()
        tgc_wo_time = list(np.setdiff1d(self.tgc, self.time_column))
        if len(tgc_wo_time) > 0:
            XX_grp = XX.groupby(tgc_wo_time)
        else:
            XX_grp = [([None], XX)]

        # Get the logger if it exists
        logger = None
        if self.context and self.context.experiment_id:
            logger = make_experiment_logger(
                experiment_id=self.context.experiment_id,
                tmp_dir=self.context.tmp_dir,
                experiment_tmp_dir=self.context.experiment_tmp_dir)

        nb_groups = len(XX_grp)
        preds = []
        for _i_g, (key, X) in enumerate(XX_grp):
            # Just log where we are in the transform process
            if (_i_g + 1) % max(1, nb_groups // 20) == 0:
                loggerinfo(
                    logger, "Auto ARIMA : %d%% of groups transformed" %
                    (100 * (_i_g + 1) // nb_groups))

            key = key if isinstance(key, list) else [key]
            grp_hash = '_'.join(map(str, key))
            # print("auto arima - transforming data of shape: %s for group: %s" % (str(X.shape), grp_hash))
            order = np.argsort(X[self.time_column])
            if grp_hash in self.models:
                model = self.models[grp_hash]
                if model is not None:
                    yhat = model.predict_in_sample() \
                        if hasattr(self, 'is_train') else model.predict(n_periods=X.shape[0])
                    yhat = yhat[order]
                    XX = pd.DataFrame(yhat, columns=['yhat'])
                else:
                    XX = pd.DataFrame(np.full((X.shape[0], 1), self.nan_value),
                                      columns=['yhat'])  # invalid model
            else:
                XX = pd.DataFrame(np.full((X.shape[0], 1), self.nan_value),
                                  columns=['yhat'])  # unseen groups
            XX.index = X.index
            preds.append(XX)
        XX = pd.concat(tuple(preds), axis=0).sort_index()

        return XX
 def fit(self, X, y, sample_weight=None, eval_set=None, sample_weight_eval_set=None, **kwargs):
     logger = None
     if self.context and self.context.experiment_id:
         logger = make_experiment_logger(experiment_id=self.context.experiment_id, tmp_dir=self.context.tmp_dir,
                                         experiment_tmp_dir=self.context.experiment_tmp_dir)
     maybe_download_language_model(logger,
                                   save_directory=self.__class__._model_path,
                                   model_link=self.__class__._model_link,
                                   config_link=self.__class__._config_link,
                                   vocab_link=self.__class__._vocab_link)
     super().fit(X, y, sample_weight, eval_set, sample_weight_eval_set, **kwargs)
    def fit(self, X: dt.Frame, y: np.array = None):
        """
        Fits ARIMA models (1 per time group) using historical target values contained in y
        :param X: Datatable frame containing the features
        :param y: numpy array containing the historical values of the target
        :return: self
        """
        # Import the ARIMA python module
        pm = importlib.import_module('pmdarima')
        # Init models
        self.models = {}
        # Convert to pandas
        X = X.to_pandas()
        XX = X[self.tgc].copy()
        XX['y'] = np.array(y)
        self.nan_value = np.mean(y)
        self.ntrain = X.shape[0]

        # Group the input by TGC (Time group column) excluding the time column itself
        tgc_wo_time = list(np.setdiff1d(self.tgc, self.time_column))
        if len(tgc_wo_time) > 0:
            XX_grp = XX.groupby(tgc_wo_time)
        else:
            XX_grp = [([None], XX)]

        # Get the logger if it exists
        logger = None
        if self.context and self.context.experiment_id:
            logger = make_experiment_logger(
                experiment_id=self.context.experiment_id,
                tmp_dir=self.context.tmp_dir,
                experiment_tmp_dir=self.context.experiment_tmp_dir)

        # Build 1 ARIMA model per time group columns
        nb_groups = len(XX_grp)
        for _i_g, (key, X) in enumerate(XX_grp):
            # Just say where we are in the fitting process
            if (_i_g + 1) % max(1, nb_groups // 20) == 0:
                loggerinfo(
                    logger, "Auto ARIMA : %d%% of groups fitted" %
                    (100 * (_i_g + 1) // nb_groups))

            key = key if isinstance(key, list) else [key]
            grp_hash = '_'.join(map(str, key))
            # print("auto arima - fitting on data of shape: %s for group: %s" % (str(X.shape), grp_hash))
            order = np.argsort(X[self.time_column])
            try:
                model = pm.auto_arima(X['y'].values[order],
                                      error_action='ignore')
            except:
                model = None
            self.models[grp_hash] = model
        return self
Example #8
    def transform(self, X: dt.Frame):
        logger = None
        if self.context and self.context.experiment_id:
            logger = make_experiment_logger(
                experiment_id=self.context.experiment_id,
                tmp_dir=self.context.tmp_dir,
                experiment_tmp_dir=self.context.experiment_tmp_dir)

        X = dt.Frame(X)
        original_zip_column_name = X.names[0]
        X = X[:, dt.str64(dt.f[0])]
        X.names = ['zip_key']
        try:
            zip_list = dt.unique(X[~dt.isna(dt.f.zip_key),
                                   0]).to_list()[0] + ['79936']
            zip_features = [self.get_zipcode_features(x) for x in zip_list]
            X_g = dt.Frame({"zip_key": zip_list})
            X_g.cbind(dt.Frame(zip_features))
            X_g.key = 'zip_key'
            X_result = X[:, :, dt.join(X_g)]
            self._output_feature_names = [
                "{}:{}.{}".format(self.transformer_name,
                                  original_zip_column_name,
                                  self.replaceBannedCharacters(f))
                for f in list(X_result[:, 1:].names)
            ]
            self._feature_desc = [
                "Property '{}' of zipcode column ['{}'] from US zipcode database (recipe '{}')"
                .format(f, original_zip_column_name, self.transformer_name)
                for f in list(X_result[:, 1:].names)
            ]
            return X_result[:, 1:]
        except ValueError as ve:
            loggerinfo(
                logger, "Column '{}' is not a zipcode: {}".format(
                    original_zip_column_name, str(ve)))
            return self.get_zipcode_null_result(X, original_zip_column_name)
        except TypeError as te:
            loggerwarning(
                logger, "Column '{}' triggered TypeError: {}".format(
                    original_zip_column_name, str(te)))
            raise te
    def fit_transform(self, X: dt.Frame, y: np.array = None, **kwargs):

        X_original = X

        # Get the logger if it exists
        logger = None
        if self.context and self.context.experiment_id:
            logger = make_experiment_logger(
                experiment_id=self.context.experiment_id,
                tmp_dir=self.context.tmp_dir,
                experiment_tmp_dir=self.context.experiment_tmp_dir,
                username=self.context.username,
            )

        self._output_feature_names = [
            "pre:" + x for x in list(X_original.names)
        ]
        self._feature_desc = [
            "Pre-transformed feature " + x for x in list(X_original.names)
        ]

        return X_original
Example #10
    def mutate_params(self, accuracy, time_tolerance, interpretability,
                      **kwargs):

        logger = None
        if self.context and self.context.experiment_id:
            logger = make_experiment_logger(
                experiment_id=self.context.experiment_id,
                tmp_dir=self.context.tmp_dir,
                experiment_tmp_dir=self.context.experiment_tmp_dir)

        # Default version is do no mutation
        # Otherwise, change self.params for this model
        holiday_choice = [None, "US", "UK", "DE", "FRA"]
        if accuracy >= 8:
            weekly_choice = [False, 'auto', 5, 7, 10, 15]
            yearly_choice = [False, 'auto', 5, 10, 15, 20, 30]
            monthly_choice = [False, 3, 5, 7, 10]
            quarterly_choice = [False, 3, 5, 7, 10]
        elif accuracy >= 5:
            weekly_choice = [False, 'auto', 10, 20]
            yearly_choice = [False, 'auto', 10, 20]
            monthly_choice = [False, 5]
            quarterly_choice = [False, 5]
        else:
            # No alternative seasonality, and no seasonality override for weekly and yearly
            weekly_choice = [False, 'auto']
            yearly_choice = [False, 'auto']
            monthly_choice = [False]
            quarterly_choice = [False]

        self.params["country_holidays"] = np.random.choice(holiday_choice)
        self.params["seasonality_mode"] = np.random.choice(
            ["additive", "multiplicative"])
        self.params["weekly_seasonality"] = np.random.choice(weekly_choice)
        self.params["monthly_seasonality"] = np.random.choice(monthly_choice)
        self.params["quarterly_seasonality"] = np.random.choice(
            quarterly_choice)
        self.params["yearly_seasonality"] = np.random.choice(yearly_choice)
        self.params["growth"] = np.random.choice(["linear", "logistic"])
Example #11
    def fit(self,
            X,
            y,
            sample_weight=None,
            eval_set=None,
            sample_weight_eval_set=None,
            **kwargs):

        # Get column names
        orig_cols = list(X.names)

        from h2oaicore.tensorflow_dynamic import got_cpu_tf, got_gpu_tf
        import tensorflow as tf
        import shap
        import scipy
        import pandas as pd

        self.setup_keras_session()

        import h2oaicore.keras as keras
        import matplotlib.pyplot as plt

        if not hasattr(self, 'save_model_path'):
            model_id = str(uuid.uuid4())[:8]
            self.save_model_path = os.path.join(user_dir(),
                                                "custom_xnn_model.hdf5")

        np.random.seed(self.random_state)

        my_init = keras.initializers.RandomUniform(seed=self.random_state)

        # Get the logger if it exists
        logger = None
        if self.context and self.context.experiment_id:
            logger = make_experiment_logger(
                experiment_id=self.context.experiment_id,
                tmp_dir=self.context.tmp_dir,
                experiment_tmp_dir=self.context.experiment_tmp_dir)

        # Set up temp folder
        tmp_folder = self._create_tmp_folder(logger)

        # define base model
        def xnn_initialize(features,
                           ridge_functions=3,
                           arch=[20, 12],
                           learning_rate=0.01,
                           bg_samples=100,
                           beta1=0.9,
                           beta2=0.999,
                           dec=0.0,
                           ams=True,
                           bseed=None,
                           is_categorical=False):

            #
            # Prepare model architecture
            #
            # Input to the network, our observation containing all the features
            input = keras.layers.Input(shape=(features, ), name='main_input')

            # Record current column names
            loggerinfo(logger, "XNN LOG")
            loggerdata(logger, "Feature list:")
            loggerdata(logger, str(orig_cols))

            # Input to ridge function number i is the dot product of our original input vector times coefficients
            ridge_input = keras.layers.Dense(ridge_functions,
                                             name="projection_layer",
                                             activation='linear')(input)

            ridge_networks = []
            # Each subnetwork uses only 1 neuron from the projection layer as input so we need to split it
            ridge_inputs = SplitLayer(ridge_functions)(ridge_input)
            for i, ridge_input in enumerate(ridge_inputs):
                # Generate subnetwork i
                mlp = _mlp(ridge_input, i, arch)
                ridge_networks.append(mlp)

            added = keras.layers.Concatenate(
                name='concatenate_1')(ridge_networks)

            # Add the correct output layer for the problem
            if is_categorical:
                out = keras.layers.Dense(1,
                                         activation='sigmoid',
                                         input_shape=(ridge_functions, ),
                                         name='main_output')(added)
            else:
                out = keras.layers.Dense(1,
                                         activation='linear',
                                         input_shape=(ridge_functions, ),
                                         name='main_output')(added)

            model = keras.models.Model(inputs=input, outputs=out)

            optimizer = keras.optimizers.Adam(lr=learning_rate,
                                              beta_1=beta1,
                                              beta_2=beta2,
                                              decay=dec,
                                              amsgrad=ams)

            # Use the correct loss for the problem
            if is_categorical:
                model.compile(loss={'main_output': 'binary_crossentropy'},
                              optimizer=optimizer)
            else:
                model.compile(loss={'main_output': 'mean_squared_error'},
                              optimizer=optimizer)

            return model

        def _mlp(input, idx, arch=[20, 12], activation='relu'):
            # Set up a subnetwork

            # Hidden layers
            mlp = keras.layers.Dense(arch[0],
                                     activation=activation,
                                     name='mlp_{}_dense_0'.format(idx),
                                     kernel_initializer=my_init)(input)
            for i, layer in enumerate(arch[1:]):
                mlp = keras.layers.Dense(layer,
                                         activation=activation,
                                         name='mlp_{}_dense_{}'.format(
                                             idx, i + 1),
                                         kernel_initializer=my_init)(mlp)

            # Output of the MLP
            mlp = keras.layers.Dense(
                1,
                activation='linear',
                name='mlp_{}_dense_last'.format(idx),
                kernel_regularizer=keras.regularizers.l1(1e-3),
                kernel_initializer=my_init)(mlp)
            return mlp

        def get_shap(X, model):
            # Calculate the Shap values
            np.random.seed(24)
            bg_samples = min(X.shape[0], 1000)

            if isinstance(X, pd.DataFrame):
                background = X.iloc[np.random.choice(X.shape[0],
                                                     bg_samples,
                                                     replace=False)]
            else:
                background = X[np.random.choice(X.shape[0],
                                                bg_samples,
                                                replace=False)]

            # Explain predictions of the model on the subset
            explainer = shap.DeepExplainer(model, background)
            shap_values = explainer.shap_values(X)

            # Return the mean absolute value of each shap value for each dataset
            xnn_shap = np.abs(shap_values[0]).mean(axis=0)

            return xnn_shap

        # Initialize the xnn's
        features = X.shape[1]
        orig_cols = list(X.names)
        if self.num_classes >= 2:
            lb = LabelEncoder()
            lb.fit(self.labels)
            y = lb.transform(y)

            self.is_cat = True
            xnn1 = xnn_initialize(features=features,
                                  ridge_functions=features,
                                  arch=self.params["arch"],
                                  learning_rate=self.params["lr"],
                                  beta1=self.params["beta_1"],
                                  beta2=self.params["beta_1"],
                                  dec=self.params["decay"],
                                  ams=self.params["amsgrad"],
                                  is_categorical=self.is_cat)
            xnn = xnn_initialize(features=features,
                                 ridge_functions=features,
                                 arch=self.params["arch"],
                                 learning_rate=self.params["lr"],
                                 beta1=self.params["beta_1"],
                                 beta2=self.params["beta_1"],
                                 dec=self.params["decay"],
                                 ams=self.params["amsgrad"],
                                 is_categorical=self.is_cat)
        else:
            self.is_cat = False
            xnn1 = xnn_initialize(features=features,
                                  ridge_functions=features,
                                  arch=self.params["arch"],
                                  learning_rate=self.params["lr"],
                                  beta1=self.params["beta_1"],
                                  beta2=self.params["beta_1"],
                                  dec=self.params["decay"],
                                  ams=self.params["amsgrad"],
                                  is_categorical=self.is_cat)
            xnn = xnn_initialize(features=features,
                                 ridge_functions=features,
                                 arch=self.params["arch"],
                                 learning_rate=self.params["lr"],
                                 beta1=self.params["beta_1"],
                                 beta2=self.params["beta_1"],
                                 dec=self.params["decay"],
                                 ams=self.params["amsgrad"],
                                 is_categorical=self.is_cat)

        # Replace missing values with a value smaller than all observed values
        self.min = dict()
        for col in X.names:
            XX = X[:, col]
            self.min[col] = XX.min1()
            if self.min[col] is None or np.isnan(self.min[col]):
                self.min[col] = -1e10
            else:
                self.min[col] -= 1
            XX.replace(None, self.min[col])
            X[:, col] = XX
            assert X[dt.isna(dt.f[col]), col].nrows == 0
        X = X.to_numpy()

        inputs = {'main_input': X}
        validation_set = 0
        verbose = 0

        # Train the neural network once with early stopping and a validation set
        history = keras.callbacks.History()
        es = keras.callbacks.EarlyStopping(monitor='val_loss', mode='min')

        history = xnn1.fit(inputs,
                           y,
                           epochs=self.params["n_estimators"],
                           batch_size=self.params["batch_size"],
                           validation_split=0.3,
                           verbose=verbose,
                           callbacks=[history, es])

        # Train again on the full data
        number_of_epochs_it_ran = len(history.history['loss'])

        xnn.fit(inputs,
                y,
                epochs=number_of_epochs_it_ran,
                batch_size=self.params["batch_size"],
                validation_split=0.0,
                verbose=verbose)

        # Get the mean absolute Shapley values
        importances = np.array(get_shap(X, xnn))

        int_output = {}
        int_weights = {}
        int_bias = {}
        int_input = {}

        original_activations = {}

        x_labels = list(map(lambda x: 'x' + str(x), range(features)))

        intermediate_output = []

        # Record and plot the projection weights
        #
        weight_list = []
        for layer in xnn.layers:

            layer_name = layer.get_config()['name']
            if layer_name != "main_input":
                print(layer_name)
                weights = layer.get_weights()

                # Record the biases
                try:
                    bias = layer.get_weights()[1]
                    int_bias[layer_name] = bias
                except:
                    print("No Bias")

                # Record outputs for the test set
                intermediate_layer_model = keras.models.Model(
                    inputs=xnn.input, outputs=xnn.get_layer(layer_name).output)

                # Record the outputs from the training set
                if self.is_cat and (layer_name == 'main_output'):
                    original_activations[layer_name] = scipy.special.logit(
                        intermediate_layer_model.predict(X))
                    original_activations[
                        layer_name +
                        "_p"] = intermediate_layer_model.predict(X)
                else:
                    original_activations[
                        layer_name] = intermediate_layer_model.predict(X)

                # Record other weights, inputs, and outputs
                int_weights[layer_name] = weights
                int_input[layer_name] = layer.input
                int_output[layer_name] = layer.output

            # Plot the projection layers
            if "projection_layer" in layer.get_config()['name']:

                # print(layer.get_config()['name'])

                # Record the weights for each projection layer
                weights = [np.transpose(layer.get_weights()[0])]

                weight_list2 = []
                for i, weight in enumerate(weights[0]):
                    weight_list.append(weight)
                    weight_list2.append(
                        list(np.reshape(weight, (1, features))[0]))

                    # Plot weights
                    plt.bar(orig_cols,
                            abs(np.reshape(weight, (1, features))[0]),
                            1,
                            color="blue")
                    plt.ylabel("Coefficient value")
                    plt.title("Projection Layer Weights {}".format(i),
                              fontdict={'fontsize': 10})
                    plt.xticks(rotation=90)
                    plt.show()
                    plt.savefig(os.path.join(
                        tmp_folder, 'projection_layer_' + str(i) + '.png'),
                                bbox_inches="tight")
                    plt.clf()

            if "main_output" in layer.get_config()['name']:
                weights_main = layer.get_weights()
                print(weights_main)

        pd.DataFrame(weight_list2).to_csv(os.path.join(tmp_folder,
                                                       "projection_data.csv"),
                                          index=False)

        intermediate_output = []

        for feature_num in range(features):
            intermediate_layer_model = keras.models.Model(
                inputs=xnn.input,
                outputs=xnn.get_layer('mlp_' + str(feature_num) +
                                      '_dense_last').output)
            intermediate_output.append(intermediate_layer_model.predict(X))

        # Record and plot the ridge functions
        ridge_x = []
        ridge_y = []
        for weight_number in range(len(weight_list)):
            ridge_x.append(
                list(
                    sum(X[:, ii] * weight_list[weight_number][ii]
                        for ii in range(features))))
            ridge_y.append(list(intermediate_output[weight_number]))

            plt.plot(
                sum(X[:, ii] * weight_list[weight_number][ii]
                    for ii in range(features)),
                intermediate_output[weight_number], 'o')
            plt.xlabel("Input")
            plt.ylabel("Subnetwork " + str(weight_number))
            plt.title("Ridge Function {}".format(i), fontdict={'fontsize': 10})
            plt.show()
            plt.savefig(
                os.path.join(tmp_folder,
                             'ridge_' + str(weight_number) + '.png'))
            plt.clf()

        # Output the ridge function importance
        weights2 = np.array([item[0] for item in list(weights)[0]])

        output_activations = np.abs(
            np.array([
                item * weights2
                for item in list(original_activations["concatenate_1"])
            ])).mean(axis=0)
        loggerinfo(logger, str(output_activations))
        pd.DataFrame(output_activations).to_csv(os.path.join(
            tmp_folder, "ridge_weights.csv"),
                                                index=False)

        plt.bar(x_labels, output_activations, 1, color="blue")
        plt.xlabel("Ridge function number")
        plt.ylabel("Feature importance")
        plt.title("Ridge function importance", fontdict={'fontsize': 10})
        plt.show()
        plt.savefig(os.path.join(tmp_folder, 'Ridge_function_importance.png'))

        pd.DataFrame(ridge_y).applymap(lambda x: x[0]).to_csv(os.path.join(
            tmp_folder, "ridge_y.csv"),
                                                              index=False)
        pd.DataFrame(ridge_x).to_csv(os.path.join(tmp_folder, "ridge_x.csv"),
                                     index=False)

        pd.DataFrame(orig_cols).to_csv(os.path.join(tmp_folder,
                                                    "input_columns.csv"),
                                       index=False)

        self.set_model_properties(model=xnn,
                                  features=orig_cols,
                                  importances=importances.tolist(),
                                  iterations=self.params['n_estimators'])
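The model above relies on a custom `SplitLayer` (used as `SplitLayer(ridge_functions)(ridge_input)`) that is not part of this listing; per the comment in `xnn_initialize`, its role is to slice the projection output into single-neuron inputs, one per ridge subnetwork. A rough equivalent using a plain Keras `Lambda` layer, offered only as an assumption about its behavior:

    import h2oaicore.keras as keras

    def split_into_single_units(tensor, n_splits):
        # Slice a (batch, n_splits) tensor into n_splits tensors of shape (batch, 1),
        # mirroring what SplitLayer(ridge_functions)(ridge_input) yields above.
        return [keras.layers.Lambda(lambda t, i=i: t[:, i:i + 1])(tensor)
                for i in range(n_splits)]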
Example #12
    def transform(self, X: dt.Frame):
        """
        Uses fitted models (1 per time group) to predict the target
        If self.is_train exists, it means we are doing in-sample predictions;
        if it does not, ARIMA is used to predict the future
        :param X: Datatable Frame containing the features
        :return: ARIMA predictions
        """
        # Get the logger if it exists
        logger = None
        tmp_folder = str(uuid.uuid4()) + "_arima_folder/"
        if self.context and self.context.experiment_id:
            logger = make_experiment_logger(
                experiment_id=self.context.experiment_id,
                tmp_dir=self.context.tmp_dir,
                experiment_tmp_dir=self.context.experiment_tmp_dir
            )

            tmp_folder = self.context.experiment_tmp_dir + "/" + str(uuid.uuid4()) + "_arima_folder/"

        # Create a temp folder to store files used during the multiprocessing experiment
        # This temp folder will be removed at the end of the process
        loggerinfo(logger, "Arima temp folder {}".format(tmp_folder))
        try:
            os.mkdir(tmp_folder)
        except PermissionError:
            # This should not occur, so log a warning
            loggerwarning(logger, "Arima was denied temp folder creation rights")
            tmp_folder = temporary_files_path + "/" + str(uuid.uuid4()) + "_arima_folder/"
            os.mkdir(tmp_folder)
        except FileExistsError:
            # We should never be here since temp dir name is expected to be unique
            loggerwarning(logger, "Arima temp folder already exists")
            tmp_folder = self.context.experiment_tmp_dir + "/" + str(uuid.uuid4()) + "_arima_folder/"
            os.mkdir(tmp_folder)
        except:
            # Revert to temporary file path
            loggerwarning(logger, "Arima defaulted to create folder inside tmp directory.")
            tmp_folder = temporary_files_path + "/" + str(uuid.uuid4()) + "_arima_folder/"
            os.mkdir(tmp_folder)

        X = X.to_pandas()
        XX = X[self.tgc].copy()
        tgc_wo_time = list(np.setdiff1d(self.tgc, self.time_column))
        if len(tgc_wo_time) > 0:
            XX_grp = XX.groupby(tgc_wo_time)
        else:
            XX_grp = [([None], XX)]

        assert len(XX_grp) > 0
        num_tasks = len(XX_grp)

        def processor(out, res):
            out.append(res)

        pool_to_use = small_job_pool
        pool = pool_to_use(logger=None, processor=processor, num_tasks=num_tasks)
        XX_paths = []
        model_paths = []
        nb_groups = len(XX_grp)
        for _i_g, (key, X) in enumerate(XX_grp):
            # Just log where we are in the process of transforming groups
            if (_i_g + 1) % max(1, nb_groups // 20) == 0:
                loggerinfo(logger, "Auto ARIMA : %d%% of groups transformed" % (100 * (_i_g + 1) // nb_groups))

            # Create time group key to store and retrieve fitted models
            key = key if isinstance(key, list) else [key]
            grp_hash = '_'.join(map(str, key))
            # Create file path to store data and pass it to the fitting pool
            X_path = os.path.join(tmp_folder, "autoarima_Xt" + str(uuid.uuid4()))

            # Commented for performance, uncomment for debug
            # print("ARIMA - transforming data of shape: %s for group: %s" % (str(X.shape), grp_hash))
            if grp_hash in self.models:
                model = self.models[grp_hash]
                model_path = os.path.join(tmp_folder, "autoarima_modelt" + str(uuid.uuid4()))
                save_obj(model, model_path)
                save_obj(X, X_path)
                model_paths.append(model_path)

                args = (model_path, X_path, self.nan_value, hasattr(self, 'is_train'), self.time_column, tmp_folder)
                kwargs = {}
                pool.submit_tryget(None, MyParallelAutoArimaTransformer_transform_async, args=args, kwargs=kwargs,
                                   out=XX_paths)
            else:
                # Don't go through pools
                XX = pd.DataFrame(np.full((X.shape[0], 1), self.nan_value), columns=['yhat'])  # unseen groups
                # Sync indices
                XX.index = X.index
                save_obj(XX, X_path)
                XX_paths.append(X_path)
        pool.finish()
        XX = pd.concat((load_obj(XX_path) for XX_path in XX_paths), axis=0).sort_index()
        for p in XX_paths + model_paths:
            remove(p)

        try:
            shutil.rmtree(tmp_folder)
            loggerinfo(logger, "Arima cleaned up temporary file folder.")
        except:
            loggerwarning(logger, "Arima could not delete the temporary file folder.")

        return XX
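`MyParallelAutoArimaTransformer_transform_async` is not shown on this page. Combining the serial per-group logic from the earlier ARIMA example (in-sample predictions for training data, `predict(n_periods=...)` otherwise) with the save/load pattern above, a plausible sketch of the worker, offered only as an assumption, is:

def MyParallelAutoArimaTransformer_transform_async(model_path, X_path, nan_value,
                                                   is_train, time_column, tmp_folder):
    # Hypothetical worker sketch: load the fitted model and the group's frame,
    # predict, overwrite the frame file with the predictions and return its path.
    model = load_obj(model_path)
    X = load_obj(X_path)
    if model is not None:
        yhat = model.predict_in_sample() if is_train \
            else model.predict(n_periods=X.shape[0])
        yhat = yhat[np.argsort(X[time_column])]
        XX = pd.DataFrame(yhat, columns=['yhat'])
    else:
        XX = pd.DataFrame(np.full((X.shape[0], 1), nan_value), columns=['yhat'])
    XX.index = X.index
    save_obj(XX, X_path)
    return X_path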
Example #13
    def transform(self, X: dt.Frame, **kwargs):
        """
        Uses fitted models (1 per time group) to predict the target
        :param X: Datatable Frame containing the features
        :return: FB Prophet predictions
        """
        # Get the logger if it exists
        logger = None
        if self.context and self.context.experiment_id:
            logger = make_experiment_logger(
                experiment_id=self.context.experiment_id,
                tmp_dir=self.context.tmp_dir,
                experiment_tmp_dir=self.context.experiment_tmp_dir)

        tmp_folder = self._create_tmp_folder(logger)

        n_jobs = self._get_n_jobs(logger, **kwargs)

        XX = X[:, self.tgc].to_pandas()
        XX = XX.replace([None, np.nan], 0)
        XX.rename(columns={self.time_column: "ds"}, inplace=True)
        tgc_wo_time = list(np.setdiff1d(self.tgc, self.time_column))
        if len(tgc_wo_time) > 0:
            XX_grp = XX.groupby(tgc_wo_time)
        else:
            XX_grp = [([None], XX)]
        assert len(XX_grp) > 0
        num_tasks = len(XX_grp)

        def processor(out, res):
            out.append(res)

        pool_to_use = small_job_pool
        loggerinfo(logger,
                   "Prophet will use {} workers for transform".format(n_jobs))
        pool = pool_to_use(logger=None,
                           processor=processor,
                           num_tasks=num_tasks,
                           max_workers=n_jobs)
        XX_paths = []
        model_paths = []
        nb_groups = len(XX_grp)
        print("Nb Groups = ", nb_groups)
        for _i_g, (key, X) in enumerate(XX_grp):
            # Log where we are in the transformation of the dataset
            if (_i_g + 1) % max(1, nb_groups // 20) == 0:
                loggerinfo(
                    logger, "FB Prophet : %d%% of groups transformed" %
                    (100 * (_i_g + 1) // nb_groups))

            key = key if isinstance(key, list) else [key]
            grp_hash = '_'.join(map(str, key))
            X_path = os.path.join(tmp_folder,
                                  "fbprophet_Xt" + str(uuid.uuid4()))
            # Commented for performance, uncomment for debug
            # print("prophet - transforming data of shape: %s for group: %s" % (str(X.shape), grp_hash))
            if grp_hash in self.models:
                model = self.models[grp_hash]
                model_path = os.path.join(
                    tmp_folder, "fbprophet_modelt" + str(uuid.uuid4()))
                save_obj(model, model_path)
                save_obj(X, X_path)
                model_paths.append(model_path)

                args = (model_path, X_path, self.priors[grp_hash], tmp_folder)
                kwargs = {}
                pool.submit_tryget(
                    None,
                    MyParallelProphetTransformer_transform_async,
                    args=args,
                    kwargs=kwargs,
                    out=XX_paths)
            else:
                XX = pd.DataFrame(np.full((X.shape[0], 1), self.nan_value),
                                  columns=['yhat'])  # unseen groups
                XX.index = X.index
                save_obj(XX, X_path)
                XX_paths.append(X_path)
        pool.finish()
        XX = pd.concat((load_obj(XX_path) for XX_path in XX_paths),
                       axis=0).sort_index()
        for p in XX_paths + model_paths:
            remove(p)

        self._clean_tmp_folder(logger, tmp_folder)

        return XX
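`_create_tmp_folder` and `_clean_tmp_folder` are not included in this listing, but the ARIMA transform in Example #12 above inlines essentially the same logic, so a sketch reconstructed from it (an assumption, not the helpers' verbatim source) could look like:

    def _create_tmp_folder(self, logger):
        # Create a unique temp folder for the per-group files, preferring the
        # experiment tmp dir and falling back to temporary_files_path.
        if self.context and self.context.experiment_tmp_dir:
            tmp_folder = os.path.join(self.context.experiment_tmp_dir,
                                      str(uuid.uuid4()) + "_tmp_folder")
        else:
            tmp_folder = os.path.join(temporary_files_path,
                                      str(uuid.uuid4()) + "_tmp_folder")
        try:
            os.mkdir(tmp_folder)
        except OSError:
            loggerwarning(logger, "Could not create {}, using tmp path instead".format(tmp_folder))
            tmp_folder = os.path.join(temporary_files_path,
                                      str(uuid.uuid4()) + "_tmp_folder")
            os.mkdir(tmp_folder)
        loggerinfo(logger, "Using temp folder {}".format(tmp_folder))
        return tmp_folder

    def _clean_tmp_folder(self, logger, tmp_folder):
        # Remove the temp folder once all per-group files have been consumed.
        try:
            shutil.rmtree(tmp_folder)
            loggerinfo(logger, "Cleaned up temp folder {}".format(tmp_folder))
        except OSError:
            loggerwarning(logger, "Could not delete temp folder {}".format(tmp_folder))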
Example #14
    def fit(self, X: dt.Frame, y: np.array = None, **kwargs):
        """
        Fits FB Prophet models (1 per time group) using historical target values contained in y
        Model fitting is distributed over a pool of processes and uses file storage to share the data with workers
        :param X: Datatable frame containing the features
        :param y: numpy array containing the historical values of the target
        :return: self
        """

        # Get the logger if it exists
        logger = None
        if self.context and self.context.experiment_id:
            logger = make_experiment_logger(
                experiment_id=self.context.experiment_id,
                tmp_dir=self.context.tmp_dir,
                experiment_tmp_dir=self.context.experiment_tmp_dir)

        tmp_folder = self._create_tmp_folder(logger)

        n_jobs = self._get_n_jobs(logger, **kwargs)

        # Convert to pandas
        XX = X[:, self.tgc].to_pandas()
        XX = XX.replace([None, np.nan], 0)
        XX.rename(columns={self.time_column: "ds"}, inplace=True)
        # Make sure labels are numeric
        if self.labels is not None:
            y = LabelEncoder().fit(self.labels).transform(y)
        XX['y'] = np.array(y)
        # Set target prior
        self.nan_value = np.mean(y)

        # Group the input by TGC (Time group column) excluding the time column itself
        tgc_wo_time = list(np.setdiff1d(self.tgc, self.time_column))
        if len(tgc_wo_time) > 0:
            XX_grp = XX.groupby(tgc_wo_time)
        else:
            XX_grp = [([None], XX)]
        self.models = {}
        self.priors = {}

        # Prepare for multi processing
        num_tasks = len(XX_grp)

        def processor(out, res):
            out[res[0]] = res[1]

        pool_to_use = small_job_pool
        loggerinfo(logger,
                   "Prophet will use {} workers for fitting".format(n_jobs))
        loggerinfo(
            logger, "Prophet parameters holidays {} / monthly {}".format(
                self.country_holidays, self.monthly_seasonality))
        pool = pool_to_use(logger=None,
                           processor=processor,
                           num_tasks=num_tasks,
                           max_workers=n_jobs)

        # Fit 1 FB Prophet model per time group columns
        nb_groups = len(XX_grp)
        for _i_g, (key, X) in enumerate(XX_grp):
            # Just log where we are in the fitting process
            if (_i_g + 1) % max(1, nb_groups // 20) == 0:
                loggerinfo(
                    logger, "FB Prophet : %d%% of groups fitted" %
                    (100 * (_i_g + 1) // nb_groups))

            X_path = os.path.join(tmp_folder,
                                  "fbprophet_X" + str(uuid.uuid4()))
            X = X.reset_index(drop=True)
            save_obj(X, X_path)
            key = key if isinstance(key, list) else [key]
            grp_hash = '_'.join(map(str, key))

            self.priors[grp_hash] = X['y'].mean()

            params = {
                "country_holidays": self.country_holidays,
                "monthly_seasonality": self.monthly_seasonality
            }

            args = (X_path, grp_hash, tmp_folder, params)
            kwargs = {}
            pool.submit_tryget(None,
                               MyParallelProphetTransformer_fit_async,
                               args=args,
                               kwargs=kwargs,
                               out=self.models)
        pool.finish()
        for k, v in self.models.items():
            self.models[k] = load_obj(v) if v is not None else None
            remove(v)

        self._clean_tmp_folder(logger, tmp_folder)

        return self
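`MyParallelProphetTransformer_fit_async` is likewise not part of this listing. Given that the processor above stores `res[0] -> res[1]` into `self.models` and the values are later read back with `load_obj`, the worker presumably fits one Prophet model per group and returns `(grp_hash, model_path)`. A hypothetical sketch along those lines, assuming the fbprophet API; the recipe's real worker may differ:

def MyParallelProphetTransformer_fit_async(X_path, grp_hash, tmp_folder, params):
    # Hypothetical worker sketch, not the recipe's actual implementation.
    from fbprophet import Prophet
    X = load_obj(X_path)  # pandas frame with 'ds' and 'y' columns saved by fit()
    model = Prophet()
    if params.get("country_holidays"):
        model.add_country_holidays(country_name=params["country_holidays"])
    if params.get("monthly_seasonality"):
        model.add_seasonality(name="monthly", period=30.5, fourier_order=5)
    try:
        model.fit(X[['ds', 'y']])
    except Exception:
        model = None  # groups that fail to fit simply get no model
    model_path = None
    if model is not None:
        model_path = os.path.join(tmp_folder, "fbprophet_model" + str(uuid.uuid4()))
        save_obj(model, model_path)
    remove(X_path)
    return grp_hash, model_path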
Example #15
    def fit(self,
            X,
            y,
            sample_weight=None,
            eval_set=None,
            sample_weight_eval_set=None,
            **kwargs):

        # Specify these parameters for the dataset.
        #
        # Also set feature engineering effort to 0
        # under the features section of expert settings.
        ########################
        # Specify the protected column.
        # The protected column must be numeric.
        self.protected_name = "black"
        # Specify the level of the protected group in the protected column
        self.protected_label = 1
        # Specify the target level considered to be a positive outcome
        # Must be encoded as 0/1
        self.positive_target = 0
        # Set minimum mean protected ratio needed to avoid a penalty
        # (mean protected ratio = mean predictions for the protected group/mean predictions for all other groups)
        #
        # Try tuning this to values at or a little above
        # the mean of the positive target for the protected group
        # divided by the mean of the positive target for the unprotected group.
        # If it's set too large, the accuracy will be poor, so there
        # is a limit to the debiasing that can be obtained.
        self.mean_protected_prediction_ratio_minimum = 0.92
        ########################

        orig_cols = list(X.names)

        import pandas as pd
        import numpy as np
        from sklearn.preprocessing import OneHotEncoder
        from collections import Counter
        import xgboost as xgb

        # Get the logger if it exists
        logger = None
        if self.context and self.context.experiment_id:
            logger = make_experiment_logger(
                experiment_id=self.context.experiment_id,
                tmp_dir=self.context.tmp_dir,
                experiment_tmp_dir=self.context.experiment_tmp_dir)

        # Current mu value
        mu = self.params["mu"]

        def fair_metric(predt: np.ndarray, dtrain: xgb.DMatrix):
            ''' FairXGB Error Metric'''
            # predt is the prediction array

            # Find the right protected group vector
            if len(predt) == len(protected_train):
                protected_feature = np.array(protected_train.copy())

            elif len(predt) == len(protected_full):
                protected_feature = np.array(protected_full.copy())

            elif len(predt) == len(protected_valid):
                protected_feature = np.array(protected_valid.copy())

            else:
                protected_feature = 0

            y = dtrain.get_label()

            answer = -y * np.log(
                sigmoid(predt)) - (1 - y) * np.log(1 - sigmoid(predt))

            answer += mu * (
                protected_feature * np.log(sigmoid(predt)) +
                (1 - protected_feature) * np.log(1 - sigmoid(predt)))

            return 'Fair_Metric', float(np.sum(answer) / len(answer))

        def sigmoid(x):
            z = 1.0 / (1.0 + np.exp(-x))
            return z

        def gradient(predt: np.ndarray, dtrain: xgb.DMatrix):
            '''Fair Xgboost Gradient'''
            # predt is the prediction array

            # Find the right protected group vector
            if len(predt) == len(protected_train):
                protected_feature = np.array(protected_train.copy())

            elif len(predt) == len(protected_full):
                protected_feature = np.array(protected_full.copy())

            elif len(predt) == len(protected_valid):
                protected_feature = np.array(protected_valid.copy())

            else:
                protected_feature = 0

            y = dtrain.get_label()

            answer = sigmoid(predt) - y
            answer += mu * (protected_feature - sigmoid(predt))

            return answer

        def hessian(predt: np.ndarray, dtrain: xgb.DMatrix):
            '''Fair Xgboost Hessian'''
            # predt is the prediction array

            answer = (1 - mu) * sigmoid(predt) * (1 - sigmoid(predt))

            return answer

        def fair(predt: np.ndarray, dtrain: xgb.DMatrix):
            ''' Fair xgb objective function
            '''

            grad = gradient(predt, dtrain)
            hess = hessian(predt, dtrain)
            return grad, hess
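        # Derivation sketch: writing p = sigmoid(predt) and s = protected_feature,
        # the per-row loss computed in fair_metric() above is
        #     L = -y*log(p) - (1 - y)*log(1 - p) + mu*(s*log(p) + (1 - s)*log(1 - p))
        # Differentiating with respect to the raw margin predt gives
        #     dL/dpredt   = (p - y) + mu*(s - p)
        #     d2L/dpredt2 = (1 - mu)*p*(1 - p)
        # which is exactly what gradient() and hessian() return.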

        # Set up model

        if self.num_classes >= 2:
            lb = LabelEncoder()
            lb.fit(self.labels)
            y = lb.transform(y)

            params = {}
            params['eta'] = self.params["eta"]
            params['max_depth'] = self.params['max_depth']
            params['min_child_weight'] = self.params['min_child_weight']
            params['reg_lambda'] = self.params['reg_lambda']
            params['reg_alpha'] = self.params['reg_alpha']
            params['colsample_bytree'] = self.params['colsample_bytree']
            params['subsample'] = self.params['subsample']
            params['silent'] = 1
            params['seed'] = self.params['random_state']
        else:
            # fairxgb doesn't work for regression
            loggerinfo(logger, "PASS, no fairxgboost model")
            pass

        # Switch to pandas
        X = X.to_pandas()
        X.columns = orig_cols

        # Find the protected group column if it is present
        self.protected = "none"
        for col in X.columns:
            if col.find(self.protected_name) > -1:
                self.protected = col

        X_datatypes = [str(item) for item in list(X.dtypes)]

        # List the categorical and numerical features
        self.X_categorical = [
            orig_cols[col_count] for col_count in range(len(orig_cols))
            if (X_datatypes[col_count] == 'category') or (
                X_datatypes[col_count] == 'object')
        ]
        self.X_numeric = [
            item for item in orig_cols if item not in self.X_categorical
        ]
        self.encoded_categories = []

        # Find the levels and mode for each categorical feature
        # for use in the test set
        self.train_levels = {}
        self.train_mode = {}
        for item in self.X_categorical:
            self.train_levels[item] = list(set(X[item]))
            self.train_mode[item] = Counter(X[item]).most_common(1)[0][0]

        # One hot encode the categorical features
        # and replace missing values with a Missing category
        if len(self.X_categorical) > 0:
            loggerinfo(logger, "Categorical encode")

            for colname in self.X_categorical:
                X[colname] = list(X[colname].fillna("Missing"))

            self.enc = OneHotEncoder(handle_unknown='ignore')

            if self.protected in self.X_categorical:
                self.X_categorical.remove(self.protected)

            if len(self.X_categorical) > 0:
                self.enc.fit(X[self.X_categorical])
                self.encoded_categories = list(
                    self.enc.get_feature_names(
                        input_features=self.X_categorical))

                X_enc = self.enc.transform(X[self.X_categorical]).toarray()

                X = pd.concat([
                    X[self.X_numeric],
                    pd.DataFrame(X_enc, columns=self.encoded_categories)
                ],
                              axis=1)

        # Replace missing values with a missing value code
        if len(self.X_numeric) > 0:

            for colname in self.X_numeric:
                X[colname] = list(X[colname].fillna(-999))

        # Make sure the target that represents a positive outcome is 1
        if self.positive_target == 0:
            y = 1 - y
        X_full = X.copy()
        y_full = y.copy()

        # Set up a validation step to find the optimal number of trees
        X_valid = X.iloc[int(0.7 * len(X_full)):, :]
        y_valid = y[int(0.7 * len(X_full)):]
        X = X.iloc[0:int(0.7 * len(X_full)), :]
        y = y[0:int(0.7 * len(X_full))]

        if self.protected != "none":
            # Set the protected group to 0 and all others 1
            protected_full = [
                int(item) for item in
                ~(np.array(X_full[self.protected]) == self.protected_label)
            ]
            protected_train = [
                int(item) for item in
                ~(np.array(X[self.protected]) == self.protected_label)
            ]
            protected_valid = [
                int(item) for item in
                ~(np.array(X_valid[self.protected]) == self.protected_label)
            ]
        else:
            mu = 0
            protected_full = []
            protected_train = []
            protected_valid = []

        # Remove the protected value from the model
        if self.protected != "none":
            X = X.drop(self.protected, axis=1)

            X_full = X_full.drop(self.protected, axis=1)

            X_valid = X_valid.drop(self.protected, axis=1)

        d_train = xgb.DMatrix(X, label=y, missing=np.nan)

        d_valid = xgb.DMatrix(X_valid, label=y_valid, missing=np.nan)

        # Initial run to find the optimal number of trees
        num_iterations = 10000
        watchlist = [(d_train, 'train'), (d_valid, 'valid')]

        clf = xgb.train(params,
                        d_train,
                        num_iterations,
                        watchlist,
                        feval=fair_metric,
                        verbose_eval=10,
                        obj=fair,
                        early_stopping_rounds=10)

        # Second xgboost run with the full dataset and optimal number of trees
        attribute_dict = clf.attributes()
        new_iterations = int(attribute_dict['best_iteration'])

        d_train = xgb.DMatrix(X_full, label=y_full, missing=np.nan)
        watchlist = [(d_train, 'train')]
        clf = xgb.train(params,
                        d_train,
                        new_iterations,
                        watchlist,
                        feval=fair_metric,
                        verbose_eval=10,
                        obj=fair)

        # Calculate feature importances
        importances_dict = clf.get_score(importance_type='gain')

        # Make sure the protected group has high feature importance
        # so that it doesn't get dropped by Driverless AI
        if self.protected != "none":
            if len(importances_dict) > 0:
                importances_dict[self.protected] = max(
                    importances_dict.values())
            else:

                importances_dict[self.protected] = 1
                for col in list(X.columns):
                    importances_dict[col] = 1

        # Make sure any dropped columns are listed with 0 importance
        for col in list(X.columns):
            if col not in importances_dict:
                importances_dict[col] = 0

        self.mean_target = np.array(sum(y) / len(y))

        loggerinfo(logger, "End fair check")
        loggerinfo(logger, str(mu))
        loggerdata(logger, str(importances_dict))
        self.is_train = True

        # Set model properties
        self.set_model_properties(model=clf,
                                  features=list(importances_dict.keys()),
                                  importances=list(importances_dict.values()),
                                  iterations=num_iterations)
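The penalty described in the comments at the top of `fit` (the mean protected prediction ratio) is enforced by a companion scorer that is not part of this listing. Purely to illustrate the quantity defined there, a sketch of how it could be computed from predictions and a 0/1 protected indicator (1 meaning protected):

def mean_protected_prediction_ratio(preds, protected):
    # Illustrative only: mean prediction for the protected group divided by the
    # mean prediction for all other rows, as defined in the comments above.
    preds = np.asarray(preds, dtype=float)
    protected = np.asarray(protected).astype(bool)
    return preds[protected].mean() / preds[~protected].mean()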
Example #16
    def transform(self, X: dt.Frame, **kwargs):
        """
        Uses fitted models (1 per time group) to predict the target
        If self.is_train exists, it means we are doing in-sample predictions;
        if it does not, ARIMA is used to predict the future
        :param X: Datatable Frame containing the features
        :return: ARIMA predictions
        """
        # Get the logger if it exists
        logger = None
        if self.context and self.context.experiment_id:
            logger = make_experiment_logger(
                experiment_id=self.context.experiment_id,
                tmp_dir=self.context.tmp_dir,
                experiment_tmp_dir=self.context.experiment_tmp_dir)

        tmp_folder = self._create_tmp_folder(logger)

        n_jobs = self._get_n_jobs(logger, **kwargs)

        X = X.to_pandas()
        XX = X[self.tgc].copy()
        tgc_wo_time = list(np.setdiff1d(self.tgc, self.time_column))
        if len(tgc_wo_time) > 0:
            XX_grp = XX.groupby(tgc_wo_time)
        else:
            XX_grp = [([None], XX)]

        assert len(XX_grp) > 0
        num_tasks = len(XX_grp)

        def processor(out, res):
            out.append(res)

        pool_to_use = small_job_pool
        loggerinfo(logger,
                   "Arima will use {} workers for transform".format(n_jobs))
        pool = pool_to_use(logger=None,
                           processor=processor,
                           num_tasks=num_tasks,
                           max_workers=n_jobs)

        XX_paths = []
        model_paths = []
        nb_groups = len(XX_grp)
        for _i_g, (key, X) in enumerate(XX_grp):
            # Just log where we are in the process of transforming groups
            if (_i_g + 1) % max(1, nb_groups // 20) == 0:
                loggerinfo(
                    logger, "Auto ARIMA : %d%% of groups transformed" %
                    (100 * (_i_g + 1) // nb_groups))

            # Create time group key to store and retrieve fitted models
            key = key if isinstance(key, list) else [key]
            grp_hash = '_'.join(map(str, key))
            # Create file path to store data and pass it to the fitting pool
            X_path = os.path.join(tmp_folder,
                                  "autoarima_Xt" + str(uuid.uuid4()))

            # Commented for performance, uncomment for debug
            # print("ARIMA - transforming data of shape: %s for group: %s" % (str(X.shape), grp_hash))
            if grp_hash in self.models:
                model = self.models[grp_hash]
                model_path = os.path.join(
                    tmp_folder, "autoarima_modelt" + str(uuid.uuid4()))
                save_obj(model, model_path)
                save_obj(X, X_path)
                model_paths.append(model_path)

                args = (model_path, X_path, self.nan_value,
                        hasattr(self, 'is_train'), self.time_column,
                        self.pred_gap, tmp_folder)
                kwargs = {}
                pool.submit_tryget(
                    None,
                    MyParallelAutoArimaTransformer_transform_async,
                    args=args,
                    kwargs=kwargs,
                    out=XX_paths)
            else:
                # Don't go through pools
                XX = pd.DataFrame(np.full((X.shape[0], 1), self.nan_value),
                                  columns=['yhat'])  # unseen groups
                # Sync indices
                XX.index = X.index
                save_obj(XX, X_path)
                XX_paths.append(X_path)
        pool.finish()
        XX = pd.concat((load_obj(XX_path) for XX_path in XX_paths),
                       axis=0).sort_index()
        for p in XX_paths + model_paths:
            remove(p)

        self._clean_tmp_folder(logger, tmp_folder)

        return XX
Example #17
    def transform(self, X: dt.Frame, **kwargs):
        """
        Uses fitted models (1 per time group) to predict the target
        :param X: Datatable Frame containing the features
        :return: FB Prophet predictions
        """
        # Get the logger if it exists
        logger = None
        if self.context and self.context.experiment_id:
            logger = make_experiment_logger(
                experiment_id=self.context.experiment_id,
                tmp_dir=self.context.tmp_dir,
                experiment_tmp_dir=self.context.experiment_tmp_dir
            )

        tmp_folder = self._create_tmp_folder(logger)

        n_jobs = self._get_n_jobs(logger, **kwargs)

        # Reduce X to TGC
        tgc_wo_time = list(np.setdiff1d(self.tgc, self.time_column))
        X = X[:, self.tgc].to_pandas()

        # Fill NaNs or None
        X = X.replace([None, np.nan], 0)

        # Change date feature name to match Prophet requirements
        X.rename(columns={self.time_column: "ds"}, inplace=True)

        # Predict y using unique dates
        X_time = X[['ds']].groupby('ds').first().reset_index()
        with suppress_stdout_stderr():
            y_avg = self.model.predict(X_time)[['ds', 'yhat']]

        # Prophet converts the date column to datetime, so we need to transform it back to merge
        X_time.sort_values('ds', inplace=True)
        X_time['yhat'] = y_avg['yhat']
        X_time.sort_index(inplace=True)

        # Merge back into original frame on 'ds'
        # pd.merge wipes the index ... so keep it to provide it again
        indices = X.index
        X = pd.merge(
            left=X,
            right=X_time[['ds', 'yhat']],
            on='ds',
            how='left'
        )
        X.index = indices

        # Go through groups and invert the target scaling for known groups
        if len(tgc_wo_time) > 0:
            X_groups = X.groupby(tgc_wo_time)
        else:
            X_groups = [([None], X)]

        inverted_ys = []
        for key, X_grp in X_groups:
            grp_hash = self.get_hash(key)

            # Scale target for current group
            if grp_hash in self.scalers.keys():
                inverted_y = self.scalers[grp_hash].inverse_transform(X_grp[['yhat']])
            else:
                inverted_y = self.general_scaler.inverse_transform(X_grp[['yhat']])

            # Put back in a DataFrame to keep track of original index
            inverted_df = pd.DataFrame(inverted_y, columns=['yhat'])
            inverted_df.index = X_grp.index
            inverted_ys.append(inverted_df)

        XX_general = pd.concat(tuple(inverted_ys), axis=0).sort_index()

        if self.top_groups:
            # Go through the groups and predict only the top ones
            XX_paths = []
            model_paths = []

            def processor(out, res):
                out.append(res)
            num_tasks = len(self.top_groups)
            pool_to_use = small_job_pool
            pool = pool_to_use(logger=None, processor=processor, num_tasks=num_tasks, max_workers=n_jobs)

            nb_groups = len(X_groups)
            for _i_g, (key, X_grp) in enumerate(X_groups):

                # Just log where we are in the prediction process
                if (_i_g + 1) % max(1, nb_groups // 20) == 0:
                    loggerinfo(logger, "FB Prophet : %d%% of groups predicted" % (100 * (_i_g + 1) // nb_groups))

                # Create dict key to store the min max scaler
                grp_hash = self.get_hash(key)
                X_path = os.path.join(tmp_folder, "fbprophet_Xt" + str(uuid.uuid4()))

                if grp_hash not in self.top_groups:
                    XX = pd.DataFrame(np.full((X_grp.shape[0], 1), np.nan), columns=['yhat'])  # groups outside the top-N list
                    XX.index = X_grp.index
                    save_obj(XX, X_path)
                    XX_paths.append(X_path)
                    continue

                if self.grp_models[grp_hash] is None:
                    XX = pd.DataFrame(np.full((X_grp.shape[0], 1), np.nan), columns=['yhat'])  # groups whose model failed to fit
                    XX.index = X_grp.index
                    save_obj(XX, X_path)
                    XX_paths.append(X_path)
                    continue

                model = self.grp_models[grp_hash]
                model_path = os.path.join(tmp_folder, "fbprophet_modelt" + str(uuid.uuid4()))
                save_obj(model, model_path)
                save_obj(X_grp, X_path)
                model_paths.append(model_path)

                args = (model_path, X_path, self.priors[grp_hash], tmp_folder)
                kwargs = {}
                pool.submit_tryget(None, MyParallelProphetTransformer_transform_async, args=args, kwargs=kwargs,
                                   out=XX_paths)

            pool.finish()
            XX_top_groups = pd.concat((load_obj(XX_path) for XX_path in XX_paths), axis=0).sort_index()
            for p in XX_paths + model_paths:
                remove(p)

        self._clean_tmp_folder(logger, tmp_folder)

        features_df = pd.DataFrame()
        features_df[self.display_name + '_GrpAvg'] = XX_general['yhat']

        if self.top_groups:
            features_df[self.display_name + f'_Top{self.top_n}Grp'] = XX_top_groups['yhat']

        self._output_feature_names = list(features_df.columns)
        self._feature_desc = list(features_df.columns)

        return features_df
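The comment above about pd.merge wiping the index is worth a tiny standalone check (synthetic data, not part of the recipe):

# Synthetic illustration of why the transform stores the index before
# pd.merge and restores it afterwards: merge resets the index.
import pandas as pd

left = pd.DataFrame({"ds": ["2020-01-02", "2020-01-01"]}, index=[10, 7])
right = pd.DataFrame({"ds": ["2020-01-01", "2020-01-02"], "yhat": [1.0, 2.0]})

merged = pd.merge(left=left, right=right, on="ds", how="left")
print(merged.index.tolist())   # [0, 1] -- the original index [10, 7] is gone

merged.index = left.index      # restore it, as the transform above does
print(merged.index.tolist())   # [10, 7]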
Ejemplo n.º 18
0
    def fit(self, X: dt.Frame, y: np.array = None):
        """
        Fits ARIMA models (1 per time group) using historical target values contained in y
        Model fitting is distributed over a pool of processes and uses file storage to share the data with workers
        :param X: Datatable frame containing the features
        :param y: numpy array containing the historical values of the target
        :return: self
        """
        # Get the logger if it exists
        logger = None
        tmp_folder = str(uuid.uuid4()) + "_arima_folder/"
        if self.context and self.context.experiment_id:
            logger = make_experiment_logger(
                experiment_id=self.context.experiment_id,
                tmp_dir=self.context.tmp_dir,
                experiment_tmp_dir=self.context.experiment_tmp_dir
            )

            tmp_folder = self.context.experiment_tmp_dir + "/" + str(uuid.uuid4()) + "_arima_folder/"

        # Create a temp folder to store files used during multi processing experiment
        # This temp folder will be removed at the end of the process
        loggerinfo(logger, "Arima temp folder {}".format(tmp_folder))
        try:
            os.mkdir(tmp_folder)
        except PermissionError:
            # This should not occur, so log a warning
            loggerwarning(logger, "Arima was denied temp folder creation rights")
            tmp_folder = temporary_files_path + "/" + str(uuid.uuid4()) + "_arima_folder/"
            os.mkdir(tmp_folder)
        except FileExistsError:
            # We should never be here since temp dir name is expected to be unique
            loggerwarning(logger, "Arima temp folder already exists")
            tmp_folder = self.context.experiment_tmp_dir + "/" + str(uuid.uuid4()) + "_arima_folder/"
            os.mkdir(tmp_folder)
        except:
            # Revert to temporary file path
            tmp_folder = temporary_files_path + "/" + str(uuid.uuid4()) + "_arima_folder/"
            os.mkdir(tmp_folder)

        # Import the ARIMA python module
        pm = importlib.import_module('pmdarima')
        # Init models
        self.models = {}
        # Convert to pandas
        X = X.to_pandas()
        XX = X[self.tgc].copy()
        XX['y'] = np.array(y)
        self.nan_value = np.mean(y)
        self.ntrain = X.shape[0]

        # Group the input by TGC (Time group column) excluding the time column itself
        tgc_wo_time = list(np.setdiff1d(self.tgc, self.time_column))
        if len(tgc_wo_time) > 0:
            XX_grp = XX.groupby(tgc_wo_time)
        else:
            XX_grp = [([None], XX)]

        # Prepare for multi processing
        num_tasks = len(XX_grp)

        def processor(out, res):
            out[res[0]] = res[1]

        pool_to_use = small_job_pool
        if hasattr(self, "params_base"):
            max_workers = self.params_base['n_jobs']
        else:
            loggerinfo(logger, "Custom Recipe does not have a params_base attribute")
            # Beware not to use the disable_gpus keyword here. looks like cython does not like it
            # max_workers = get_max_workers(True)
            # Just set default to 2
            max_workers = 2

        loggerinfo(logger, "Arima will use {} workers for parallel processing".format(max_workers))
        pool = pool_to_use(
            logger=None, processor=processor,
            num_tasks=num_tasks, max_workers=max_workers
        )

        # Build 1 ARIMA model per time group columns
        nb_groups = len(XX_grp)
        for _i_g, (key, X) in enumerate(XX_grp):
            # Just say where we are in the fitting process
            if (_i_g + 1) % max(1, nb_groups // 20) == 0:
                loggerinfo(logger, "Auto ARIMA : %d%% of groups fitted" % (100 * (_i_g + 1) // nb_groups))

            X_path = os.path.join(tmp_folder, "autoarima_X" + str(uuid.uuid4()))
            X = X.reset_index(drop=True)
            save_obj(X, X_path)
            key = key if isinstance(key, list) else [key]
            grp_hash = '_'.join(map(str, key))
            args = (X_path, grp_hash, self.time_column, tmp_folder)
            kwargs = {}
            pool.submit_tryget(None, MyParallelAutoArimaTransformer_fit_async, args=args, kwargs=kwargs,
                               out=self.models)
        pool.finish()

        for k, v in self.models.items():
            self.models[k] = load_obj(v) if v is not None else None
            remove(v)

        try:
            shutil.rmtree(tmp_folder)
            loggerinfo(logger, "Arima cleaned up temporary file folder.")
        except:
            loggerwarning(logger, "Arima could not delete the temporary file folder.")

        return self
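The processor(out, res) callback above is how pool results reach the dict that collects them; in the fit case each worker is expected to hand back a (group hash, result) pair. A pure-Python stand-in, with no pool involved and hypothetical paths:

# Pure-Python stand-in for the pool's processor callback used in fit:
# each worker result is a (group_hash, value) pair stored in a shared dict.
models = {}

def processor(out, res):
    out[res[0]] = res[1]

# Hypothetical worker results: (grp_hash, path to the saved model)
for res in [("store_1", "/tmp/arima_a.pkl"), ("store_2", "/tmp/arima_b.pkl")]:
    processor(models, res)

print(models)   # {'store_1': '/tmp/arima_a.pkl', 'store_2': '/tmp/arima_b.pkl'}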
Ejemplo n.º 19
0
    def predict(self, X, **kwargs):

        model_config, _, _, _ = self.get_model_properties()

        models = model_config['models']
        cap = model_config['cap']
        priors = model_config['priors']
        prior = model_config['prior']

        if self.tgc is None or not all([x in X.names for x in self.tgc]):
            return np.ones(X.shape[0]) * self.nan_value

        logger = None
        if self.context and self.context.experiment_id:
            logger = make_experiment_logger(
                experiment_id=self.context.experiment_id,
                tmp_dir=self.context.tmp_dir,
                experiment_tmp_dir=self.context.experiment_tmp_dir)
        loggerinfo(logger, "Start Predicting with Prophet")

        # Reduce to TimeGroupColumns
        if isinstance(X, dt.Frame):
            # Convert to pandas
            XX = X[:, self.tgc].to_pandas()
        else:
            XX = X[:, self.tgc].copy()

        XX = XX.replace([None, np.nan], 0)
        XX.rename(columns={self.time_column: "ds"}, inplace=True)

        if self.params["growth"] == "logistic":
            XX["cap"] = cap

        # Compute groups
        # Group the input by TGC (Time group column) excluding the time column itself
        tgc_wo_time = list(np.setdiff1d(self.tgc, self.time_column))
        if len(tgc_wo_time) > 0:
            XX_grp = XX.groupby(tgc_wo_time)
        else:
            XX_grp = [([None], XX)]

        # Go through groups and predict
        nb_groups = len(XX_grp)
        preds = []
        for _i_g, (key, X) in enumerate(XX_grp):
            # Just log where we are in the prediction process
            if (_i_g + 1) % max(1, nb_groups // 20) == 0:
                loggerinfo(
                    logger, "FB Prophet Model : %d%% of groups transformed" %
                    (100 * (_i_g + 1) // nb_groups))

            key = key if isinstance(key, list) else [key]
            grp_hash = '_'.join(map(str, key))

            # Facebook Prophet returns the predictions ordered by time,
            # so keep track of each group's time order so that the
            # predictions can be realigned with the input frame
            order = np.argsort(pd.to_datetime(X["ds"]))
            if grp_hash in models.keys():
                model = models[grp_hash]
                if model is not None:
                    # Run prophet
                    yhat = model.predict(X)
                    XX = yhat
                else:
                    if grp_hash in priors.keys():
                        XX = pd.DataFrame(np.full((X.shape[0], 1),
                                                  priors[grp_hash]),
                                          columns=['yhat'])
                    else:
                        # This should not happen
                        loggerinfo(logger, "Group in models but not in priors")
                        XX = pd.DataFrame(np.full((X.shape[0], 1), prior),
                                          columns=['yhat'])
            else:
                # print("No Group")
                XX = pd.DataFrame(np.full((X.shape[0], 1), prior),
                                  columns=['yhat'])  # unseen groups

            # Reorder the index like prophet re-ordered the predictions
            XX.index = X.index[order]
            # print("Transformed Output for Group")
            # print(XX.sort_index().head(20), flush=True)
            preds.append(XX[['yhat']])

        XX = pd.concat(tuple(preds), axis=0).sort_index()

        return XX['yhat'].values
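The reindexing at the end of the loop is subtle: Prophet returns predictions in time order, so the recipe maps them back onto the original rows via argsort. A small synthetic check of that alignment:

# Synthetic check of the argsort trick used above: predictions come back in
# time order, and X.index[order] maps them onto the original row positions.
import numpy as np
import pandas as pd

X = pd.DataFrame({"ds": ["2020-03-01", "2020-01-01", "2020-02-01"]}, index=[5, 9, 2])
order = np.argsort(pd.to_datetime(X["ds"]).values)     # positions [1, 2, 0]: Jan, Feb, Mar
preds_in_time_order = pd.DataFrame({"yhat": [10.0, 20.0, 30.0]})  # Jan, Feb, Mar forecasts

preds_in_time_order.index = X.index[order]             # becomes [9, 2, 5]
print(preds_in_time_order.sort_index())
# index 2 -> 20.0 (Feb row), 5 -> 30.0 (Mar row), 9 -> 10.0 (Jan row)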
Ejemplo n.º 20
0
    def fit(self,
            X,
            y,
            sample_weight=None,
            eval_set=None,
            sample_weight_eval_set=None,
            **kwargs):

        orig_cols = list(X.names)

        import pandas as pd
        import numpy as np
        from sklearn.preprocessing import OneHotEncoder
        from collections import Counter
        import pygam
        from pygam import LinearGAM, LogisticGAM
        import matplotlib.pyplot as plt

        # Get the logger if it exists
        logger = None
        if self.context and self.context.experiment_id:
            logger = make_experiment_logger(
                experiment_id=self.context.experiment_id,
                tmp_dir=self.context.tmp_dir,
                experiment_tmp_dir=self.context.experiment_tmp_dir)

        # Set up temp folder
        tmp_folder = self._create_tmp_folder(logger)

        # Set up model
        if self.num_classes >= 2:
            lb = LabelEncoder()
            lb.fit(self.labels)
            y = lb.transform(y)

            clf = LogisticGAM(terms="auto",
                              lam=self.params["lam"],
                              max_iter=self.params["max_iter"])
            self.is_classifier = True

        else:
            clf = LinearGAM(terms="auto",
                            lam=self.params["lam"],
                            max_iter=self.params["max_iter"])
            self.is_classifier = False

        X = self.basic_impute(X)
        # Find the datatypes
        X = X.to_pandas()
        X.columns = orig_cols

        # Change continuous features to categorical
        X_datatypes = [str(item) for item in list(X.dtypes)]

        # Change all float32 values to float64
        for ii in range(len(X_datatypes)):
            if X_datatypes[ii] == 'float32':
                X = X.astype({orig_cols[ii]: np.float64})

        X_datatypes = [str(item) for item in list(X.dtypes)]

        # List the categorical and numerical features
        self.X_categorical = [
            orig_cols[col_count] for col_count in range(len(orig_cols))
            if (X_datatypes[col_count] == 'category') or (
                X_datatypes[col_count] == 'object')
        ]
        self.X_numeric = [
            item for item in orig_cols if item not in self.X_categorical
        ]

        # Find the levels and mode for each categorical feature
        # for use in the test set
        self.train_levels = {}
        self.train_mode = {}
        for item in self.X_categorical:
            self.train_levels[item] = list(set(X[item]))
            self.train_mode[item] = Counter(X[item]).most_common(1)[0][0]

        # One hot encode the categorical features
        # and replace missing values with a Missing category
        if len(self.X_categorical) > 0:
            X.loc[:, self.X_categorical] = X[self.X_categorical].fillna(
                "Missing").copy()
            self.enc = OneHotEncoder(handle_unknown='ignore')

            self.enc.fit(X[self.X_categorical])
            self.encoded_categories = list(
                self.enc.get_feature_names(input_features=self.X_categorical))

            X_enc = self.enc.transform(X[self.X_categorical]).toarray()

            X = pd.concat([
                X[self.X_numeric],
                pd.DataFrame(X_enc, columns=self.encoded_categories)
            ],
                          axis=1)

        # Replace missing numeric values with the training median
        self.median_train = {}

        if len(self.X_numeric) > 0:
            for colname in self.X_numeric:
                self.median_train[colname] = X[colname].quantile(0.5)
                X.loc[:, colname] = X[colname].fillna(
                    self.median_train[colname]).copy()

        try:
            clf.fit(X, y)
        except np.linalg.LinAlgError as e:
            raise IgnoreError("np.linalg.LinAlgError") from e
        except pygam.utils.OptimizationError as e:
            raise IgnoreError("pygam.utils.OptimizationError") from e
        except ValueError as e:
            if 'On entry to DLASCL parameter number' in str(e):
                raise IgnoreError('On entry to DLASCL parameter number') from e
            raise

        p_values = np.array(clf.statistics_['p_values'])

        # Plot the partial dependence plots for each feature
        for ii in range(X.shape[1]):
            XX = clf.generate_X_grid(term=ii)
            plt.figure()
            plt.plot(XX[:, ii], clf.partial_dependence(term=ii, X=XX))
            plt.plot(XX[:, ii],
                     clf.partial_dependence(term=ii, X=XX, width=.95)[1],
                     c='r',
                     ls='--')
            plt.title("Partial Dependence " + str(ii),
                      fontdict={'fontsize': 10})
            # Save before show() so the figure is not cleared by interactive backends
            plt.savefig(os.path.join(
                tmp_folder, 'Feature_partial_dependence_' + str(ii) + '.png'),
                        bbox_inches="tight")
            plt.show()

        if max(p_values[0:(len(p_values) - 1)]) > 0:
            importances = -np.log(p_values[0:(len(p_values) - 1)] + 10**(-16))

            importances = list(importances / max(importances))
        else:
            importances = [1] * (len(p_values) - 1)

        self.mean_target = np.array(sum(y) / len(y))

        self.set_model_properties(model=clf,
                                  features=list(X.columns),
                                  importances=importances,
                                  iterations=self.params['n_estimators'])
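The importance calculation above turns term p-values into scores in [0, 1] via a negative log transform; a quick numeric illustration with made-up p-values:

# Quick numeric illustration of the p-value -> importance mapping used above:
# smaller p-values yield larger importances, normalized to a maximum of 1.
import numpy as np

p_values = np.array([1e-6, 0.05, 0.5, 0.9])       # made-up term p-values
importances = -np.log(p_values + 10 ** (-16))
importances = importances / max(importances)
print(np.round(importances, 3))                    # [1.    0.217 0.05  0.008]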
    def fit(self,
            X,
            y,
            sample_weight=None,
            eval_set=None,
            sample_weight_eval_set=None,
            **kwargs):

        orig_cols = list(X.names)

        import pandas as pd
        import numpy as np
        from sklearn.preprocessing import OneHotEncoder
        from collections import Counter
        from sklearn.tree import DecisionTreeClassifier, DecisionTreeRegressor
        from sklearn.linear_model import LogisticRegression, LinearRegression
        from sklearn import tree
        import matplotlib.pyplot as plt

        # Get the logger if it exists
        logger = None
        if self.context and self.context.experiment_id:
            logger = make_experiment_logger(
                experiment_id=self.context.experiment_id,
                tmp_dir=self.context.tmp_dir,
                experiment_tmp_dir=self.context.experiment_tmp_dir)

        # Set up temp folder
        tmp_folder = self._create_tmp_folder(logger)

        # Set up model
        if self.num_classes >= 2:
            lb = LabelEncoder()
            lb.fit(self.labels)
            y = lb.transform(y)

            clf = DecisionTreeClassifier(random_state=42,
                                         max_depth=self.params["tree_depth"])
            self.is_classifier = True

        else:
            clf = DecisionTreeRegressor(random_state=42,
                                        max_depth=self.params["tree_depth"])
            self.is_classifier = False

        # Find the datatypes
        X = X.to_pandas()
        X.columns = orig_cols

        # Change continuous features to categorical
        X_datatypes = [str(item) for item in list(X.dtypes)]

        # Change all float32 values to float64
        for ii in range(len(X_datatypes)):
            if X_datatypes[ii] == 'float32':
                X = X.astype({orig_cols[ii]: np.float64})

        X_datatypes = [str(item) for item in list(X.dtypes)]

        # List the categorical and numerical features
        self.X_categorical = [
            orig_cols[col_count] for col_count in range(len(orig_cols))
            if (X_datatypes[col_count] == 'category') or (
                X_datatypes[col_count] == 'object')
        ]
        self.X_numeric = [
            item for item in orig_cols if item not in self.X_categorical
        ]

        # Find the levels and mode for each categorical feature
        # for use in the test set
        self.train_levels = {}
        self.train_mode = {}
        for item in self.X_categorical:
            self.train_levels[item] = list(set(X[item]))
            self.train_mode[item] = Counter(X[item]).most_common(1)[0][0]

        # One hot encode the categorical features
        # and replace missing values with a Missing category
        if len(self.X_categorical) > 0:
            X.loc[:, self.X_categorical] = X[self.X_categorical].fillna(
                "Missing").copy()
            self.enc = OneHotEncoder(handle_unknown='ignore')

            self.enc.fit(X[self.X_categorical])
            self.encoded_categories = list(
                self.enc.get_feature_names(input_features=self.X_categorical))

            X_enc = self.enc.transform(X[self.X_categorical]).toarray()

            X = pd.concat([
                X[self.X_numeric],
                pd.DataFrame(X_enc, columns=self.encoded_categories)
            ],
                          axis=1)

        # Replace missing values with a missing value code
        if len(self.X_numeric) > 0:
            X.loc[:, self.X_numeric] = X[self.X_numeric].fillna(-999).copy()

        # Fit the decision tree
        clf.fit(X, y)
        if self.is_classifier:
            yy = clf.predict_proba(X)

            p = np.round_(yy[:, 1], 5)
        else:
            yy = clf.predict(X)

            p = np.round_(yy, 5)

        self.leaf_categories = list(set(p))

        # Fit linear or logistic models to each leaf node
        model_array = {}
        equation_log = []
        for cat in self.leaf_categories:
            if self.is_classifier:
                if (np.mean(y[p == cat]) < 1) and (np.mean(y[p == cat]) > 0):

                    lm = LogisticRegression(random_state=42)

                    lm.fit(X[p == cat], y[p == cat])

                    model_array[cat] = lm
                    equation_log.append([[
                        int(round((1 - cat) * sum(p == cat))),
                        int(round(cat * sum(p == cat)))
                    ],
                                         sum(p == cat), lm.intercept_[0]] +
                                        list(lm.coef_[0]))
                else:
                    loggerinfo(logger, "No leaf fit")
                    model_array[cat] = "dt"
            else:
                try:
                    lm = LinearRegression()
                    lm.fit(X[p == cat], y[p == cat])

                    model_array[cat] = lm

                    equation_log.append(
                        [cat, sum(p == cat), lm.intercept_] + list(lm.coef_))
                except:
                    loggerinfo(logger, "No leaf fit")
                    model_array[cat] = "dt"

        # Save the leaf models
        pd.DataFrame(equation_log,
                     columns=['leaf value', 'number of samples', 'intercept'] +
                     list(X.columns)).to_csv(
                         os.path.join(tmp_folder, 'Leaf_model_coef.csv'))

        # Plot the decision tree
        fig, axes = plt.subplots(nrows=1, ncols=1, figsize=(8, 8), dpi=1600)
        tree.plot_tree(clf, feature_names=list(X.columns))
        fig.savefig(os.path.join(tmp_folder, 'Decision_tree_plot.png'))

        importances = clf.feature_importances_
        loggerinfo(logger, str(importances))

        self.mean_target = np.array(sum(y) / len(y))

        model = [clf, model_array]
        # Set model properties
        self.set_model_properties(model=model,
                                  features=list(X.columns),
                                  importances=importances,
                                  iterations=self.params['n_estimators'])
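The boolean masks p == cat above are the key device: the tree's rounded predictions act as leaf identifiers, and each mask selects the rows of one leaf so a separate linear model can be fit there. A compact synthetic illustration using scikit-learn only (not the recipe's full model):

# Synthetic illustration of the leaf-partitioning device used above: rounded
# tree predictions serve as leaf labels, and boolean masks select each leaf's
# rows so a separate linear model can be fit per leaf.
import numpy as np
from sklearn.linear_model import LinearRegression
from sklearn.tree import DecisionTreeRegressor

rng = np.random.default_rng(0)
X = rng.normal(size=(200, 3))
y = np.where(X[:, 0] > 0, 2.0 * X[:, 1], -3.0 * X[:, 2]) + rng.normal(scale=0.1, size=200)

tree = DecisionTreeRegressor(max_depth=1, random_state=42).fit(X, y)
p = np.round(tree.predict(X), 5)               # one rounded value per leaf

leaf_models = {}
for cat in set(p):
    mask = p == cat                            # rows that land in this leaf
    leaf_models[cat] = LinearRegression().fit(X[mask], y[mask])

print(len(leaf_models), "leaf models fitted")  # 2 with max_depth=1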
Ejemplo n.º 22
0
    def fit(self,
            X,
            y,
            sample_weight=None,
            eval_set=None,
            sample_weight_eval_set=None,
            **kwargs):
        logger = None
        if self._make_logger:
            # Example use of logger, with required import of:
            #  from h2oaicore.systemutils import make_experiment_logger, loggerinfo
            # Can use loggerwarning, loggererror, etc. for different levels
            if self.context and self.context.experiment_id:
                logger = make_experiment_logger(
                    experiment_id=self.context.experiment_id,
                    tmp_dir=self.context.tmp_dir,
                    experiment_tmp_dir=self.context.experiment_tmp_dir)

        if self._show_logger_test:
            loggerinfo(logger, "TestLOGGER: Fit CatBoost")

        if self._show_task_test:
            # Example task sync operations
            if hasattr(self, 'test_count'):
                self.test_count += 1
            else:
                self.test_count = 0

            # The below generates a message in the GUI notifications panel
            if self.test_count == 0 and self.context and self.context.experiment_id:
                warning = "TestWarning: First CatBoost fit for this model instance"
                loggerwarning(logger, warning)
                task = kwargs.get('task')
                if task:
                    task.sync(key=self.context.experiment_id,
                              progress=dict(type='warning', data=warning))
                    task.flush()

            # The below generates a message in the GUI top-middle panel above the progress wheel
            if self.test_count == 0 and self.context and self.context.experiment_id:
                message = "Tuning CatBoost"
                loggerinfo(logger, message)
                task = kwargs.get('task')
                if task:
                    task.sync(key=self.context.experiment_id,
                              progress=dict(type='update', message=message))
                    task.flush()

        from catboost import CatBoostClassifier, CatBoostRegressor, EFstrType

        # label encode target and setup type of problem
        lb = LabelEncoder()
        if self.num_classes >= 2:
            lb.fit(self.labels)
            y = lb.transform(y)
            if eval_set is not None:
                valid_X = eval_set[0][0]
                valid_y = eval_set[0][1]
                valid_y = lb.transform(valid_y)
                eval_set = [(valid_X, valid_y)]
            self.params.update({'objective': 'Logloss'})
        if self.num_classes > 2:
            self.params.update({'objective': 'MultiClass'})

        if isinstance(X, dt.Frame):
            orig_cols = list(X.names)
            numeric_cols = list(X[:, [bool, int, float]].names)
        else:
            orig_cols = list(X.columns)
            numeric_cols = list(X.select_dtypes([np.number]).columns)

        # Unlike LightGBM, which needs label-encoded categoricals, CatBoost can take raw strings etc.
        self.params['cat_features'] = [
            i for i, x in enumerate(orig_cols)
            if 'CatOrig:' in x or 'Cat:' in x or x not in numeric_cols
        ]

        if not self.get_uses_gpus(self.params):
            # monotonicity constraints not available for GPU for catboost
            # get names of columns in same order
            X_names = list(dt.Frame(X).names)
            X_numeric = self.get_X_ordered_numerics(X)
            X_numeric_names = list(X_numeric.names)
            _, _, constraints, _ = self.set_monotone_constraints(X=X_numeric, y=y)
            # if non-numerics, then fix those to have 0 constraint
            self.params['monotone_constraints'] = [0] * len(X_names)
            colnumi = 0
            for coli, coln in enumerate(X_names):
                if coln in X_numeric_names:
                    self.params['monotone_constraints'][coli] = constraints[colnumi]
                    colnumi += 1

        if isinstance(X, dt.Frame) and len(self.params['cat_features']) == 0:
            # dt -> catboost internally using buffer leaks, so convert here
            # assume predict is after pipeline collection or in subprocess so needs no protection
            X = X.to_numpy(
            )  # don't assign back to X so don't damage during predict
            X = np.ascontiguousarray(X,
                                     dtype=np.float32 if config.data_precision
                                     == "float32" else np.float64)
            if eval_set is not None:
                valid_X = eval_set[0][0].to_numpy(
                )  # don't assign back to X so don't damage during predict
                valid_X = np.ascontiguousarray(
                    valid_X,
                    dtype=np.float32
                    if config.data_precision == "float32" else np.float64)
                valid_y = eval_set[0][1]
                eval_set = [(valid_X, valid_y)]

        if eval_set is not None:
            valid_X_shape = eval_set[0][0].shape
        else:
            valid_X_shape = None

        X, eval_set = self.process_cats(X, eval_set, orig_cols)

        # modify self.params_base['gpu_id'] based upon actually-available GPU based upon training and valid shapes
        self.acquire_gpus_function(train_shape=X.shape,
                                   valid_shape=valid_X_shape)

        params = copy.deepcopy(self.params)  # keep separate, since values can then be pulled from the lightgbm-style params
        params = self.transcribe_params(params=params, **kwargs)

        if logger is not None:
            loggerdata(
                logger,
                "CatBoost parameters: params_base : %s params: %s catboost_params: %s"
                % (str(self.params_base), str(self.params), str(params)))

        if self.num_classes == 1:
            self.model = CatBoostRegressor(**params)
        else:
            self.model = CatBoostClassifier(**params)
        # Hit sometimes: Exception: catboost/libs/data_new/quantization.cpp:779: All features are either constant or ignored.
        if self.num_classes == 1:
            # assume not mae, which would use median
            # baseline = [np.mean(y)] * len(y)
            baseline = None
        else:
            baseline = None

        kwargs_fit = dict(baseline=baseline, eval_set=eval_set)
        pickle_path = None
        if config.debug_daimodel_level >= 2:
            self.uuid = str(uuid.uuid4())[:6]
            pickle_path = os.path.join(exp_dir(),
                                       "catboost%s.tmp.pickle" % self.uuid)
            save_obj((self.model, X, y, sample_weight, kwargs_fit),
                     pickle_path)

        # FIT (with migration safety before hyperopt/Optuna function added)
        try:
            if hasattr(self, 'dask_or_hyper_or_normal_fit'):
                self.dask_or_hyper_or_normal_fit(X,
                                                 y,
                                                 sample_weight=sample_weight,
                                                 kwargs=kwargs,
                                                 **kwargs_fit)
            else:
                self.model.fit(X, y, sample_weight=sample_weight, **kwargs_fit)
        except Exception as e:
            if "All features are either constant or ignored" in str(e):
                raise IgnoreEntirelyError(str(e))
            raise

        if config.debug_daimodel_level <= 2:
            remove(pickle_path)

        # https://catboost.ai/docs/concepts/python-reference_catboostclassifier.html
        # need to move to wrapper
        if self.model.get_best_iteration() is not None:
            iterations = self.model.get_best_iteration() + 1
        else:
            iterations = self.params['n_estimators']
        # must always set best_iterations
        self.model_path = None
        importances = copy.deepcopy(self.model.feature_importances_)
        if not self._save_by_pickle:
            self.uuid = str(uuid.uuid4())[:6]
            model_file = "catboost_%s.bin" % str(self.uuid)
            self.model_path = os.path.join(self.context.experiment_tmp_dir,
                                           model_file)
            self.model.save_model(self.model_path)
            with open(self.model_path, mode='rb') as f:
                model = f.read()
        else:
            model = self.model
        self.set_model_properties(
            model=model,  # overwrites self.model object with bytes if not using pickle
            features=orig_cols,
            importances=importances,
            iterations=iterations)
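A standalone view of the monotonicity-constraint alignment performed above, with synthetic column names (not from any experiment), may make the index bookkeeping clearer:

# Synthetic illustration of the constraint mapping done above: numeric columns
# receive their monotonicity constraint in original column order, while
# non-numeric columns are pinned to 0.
X_names = ["price", "category", "age", "city"]
X_numeric_names = ["price", "age"]
constraints = [1, -1]                  # e.g. increasing in price, decreasing in age

monotone_constraints = [0] * len(X_names)
colnumi = 0
for coli, name in enumerate(X_names):
    if name in X_numeric_names:
        monotone_constraints[coli] = constraints[colnumi]
        colnumi += 1

print(monotone_constraints)            # [1, 0, -1, 0]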
Ejemplo n.º 23
0
    def predict(self, X, **kwargs):
        orig_cols = list(X.names)
        import pandas as pd
        import xgboost as xgb
        import numpy as np

        def sigmoid(x):
            z = 1.0 / (1.0 + np.exp(-x))
            return z

        # Get the logger if it exists
        logger = None
        if self.context and self.context.experiment_id:
            logger = make_experiment_logger(
                experiment_id=self.context.experiment_id,
                tmp_dir=self.context.tmp_dir,
                experiment_tmp_dir=self.context.experiment_tmp_dir)

        X = dt.Frame(X)

        X = X.to_pandas()

        if self.protected in list(X.columns):
            # Set the protected group to 0 and all others 1
            loggerdebug(logger, "Protected test found")
            protected_test = np.array([
                int(item) for item in
                ~(np.array(X[self.protected]) == self.protected_label)
            ])

        else:
            loggerdebug(logger, "Protected test not found")
            protected_test = np.array([])

        if self.protected in list(X.columns):
            X = X.drop(self.protected, axis=1)

        # Replace missing values with a missing category
        # Replace categories that weren't in the training set with the mode
        if len(self.X_categorical) > 0:

            for colname in self.X_categorical:
                if colname in list(X.columns):
                    X[colname] = list(X[colname].fillna("Missing"))

            for label in self.X_categorical:
                if label in list(X.columns):
                    # Replace any category not seen in the training set with the training mode
                    train_categories = self.train_levels[label]
                    X_label = np.array(X[label])
                    mmode = self.train_mode[label]
                    X_label[~np.isin(X_label, train_categories)] = mmode
                    X[label] = X_label

        # Replace missing values with a missing value code
        if len(self.X_numeric) > 0:
            for colname in self.X_numeric:
                if colname in list(X.columns):
                    X[colname] = list(X[colname].fillna(-999))

        # Get model
        model, _, _, _ = self.get_model_properties()

        # Remove the protected group
        if self.protected in self.X_categorical:
            self.X_categorical.remove(self.protected)

        # One hot encode categorical features
        if len(self.X_categorical) > 0:
            X_enc = self.enc.transform(X[self.X_categorical]).toarray()
            X = pd.concat([
                X[self.X_numeric],
                pd.DataFrame(X_enc, columns=self.encoded_categories)
            ],
                          axis=1)

        d_test = xgb.DMatrix(X, missing=np.nan)

        # If the positive target was 0, change the final result to 1-p
        if self.positive_target == 0:
            preds = 1.0 - sigmoid(model.predict(d_test))
        else:
            preds = sigmoid(model.predict(d_test))

        mean_preds = np.mean(preds)

        # Set a penalty value to which some probabilities will be changed
        # if the fairness threshold isn't reached
        epsilon = 0.0001
        if mean_preds > 0.5:
            penalty = epsilon
        else:
            penalty = 1.0 - epsilon

        # Only apply penalties in the training stage
        if self.is_train:
            # If the protected value was removed, use the maximum penalty
            # by changing all probabilities to the penalty value
            # (the recipe needs to be able to use the protected values)
            if self.protected == "none":
                preds[0:len(preds)] = penalty
                loggerdata(logger, str(preds))
                loggerdata(logger, "Removal_penalty")

            else:
                # The mean ratio calculation for target=0 and target=1
                if self.positive_target == 0:
                    if np.mean(preds[protected_test == 1]) < 1.0:
                        DI = (1.0 - np.mean(preds[protected_test == 0])) / (
                            1.0 - np.mean(preds[protected_test == 1]))
                    else:
                        DI = 1
                else:
                    if np.mean(preds[protected_test == 1]) > 0.0:
                        DI = np.mean(preds[protected_test == 0]) / np.mean(
                            preds[protected_test == 1])
                    else:
                        DI = 1

                loggerdata(logger, "Mean ratio Check")
                loggerdata(logger, str(DI))

                if DI < self.mean_protected_prediction_ratio_minimum:
                    # Create a penalty proportional to the distance below the specified threshold
                    len_preds = len(preds)
                    num_penalty = min(
                        len_preds,
                        int((self.mean_protected_prediction_ratio_minimum - DI)
                            / self.mean_protected_prediction_ratio_minimum *
                            len_preds))

                    preds[0:num_penalty] = penalty
                    loggerdata(logger, "num_penalty1")
                    loggerdata(logger, str(num_penalty),
                               str(num_penalty / len(preds)))

        self.is_train = False

        return preds
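The mean-ratio (disparate impact) check and the proportional penalty that follows can be traced with made-up numbers; the threshold name below stands in for self.mean_protected_prediction_ratio_minimum:

# Made-up numbers illustrating the mean-ratio check and proportional penalty
# used above (positive_target == 1 branch).
import numpy as np

preds = np.array([0.70, 0.65, 0.60, 0.30, 0.25, 0.20])
# protected_test is 1 for rows outside the protected group, 0 inside it
protected_test = np.array([1, 1, 1, 0, 0, 0])
ratio_minimum = 0.8                     # stand-in for mean_protected_prediction_ratio_minimum

DI = np.mean(preds[protected_test == 0]) / np.mean(preds[protected_test == 1])
print(round(DI, 3))                     # 0.385, below the 0.8 threshold

num_penalty = min(len(preds),
                  int((ratio_minimum - DI) / ratio_minimum * len(preds)))
print(num_penalty)                      # 3 predictions get overwritten with the penalty value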
Ejemplo n.º 24
0
    def fit(self, X, y, sample_weight=None, eval_set=None, sample_weight_eval_set=None, **kwargs):

        # Example use of logger, with required import of:
        #  from h2oaicore.systemutils import make_experiment_logger, loggerinfo
        # Can use loggerwarning, loggererror, etc. for different levels
        logger = None
        if self.context and self.context.experiment_id:
            logger = make_experiment_logger(experiment_id=self.context.experiment_id, tmp_dir=self.context.tmp_dir,
                                            experiment_tmp_dir=self.context.experiment_tmp_dir)
        loggerinfo(logger, "TestLOGGER: Fit CatBoost")

        # Example task sync operations
        if hasattr(self, 'test_count'):
            self.test_count += 1
        else:
            self.test_count = 0

        # The below generates a message in the GUI notifications panel
        if self.test_count == 0 and self.context and self.context.experiment_id:
            warning = "TestWarning: First CatBoost fit for this model instance"
            loggerwarning(logger, warning)
            task = kwargs.get('task')
            if task:
                task.sync(key=self.context.experiment_id, progress=dict(type='warning', data=warning))
                task.flush()

        # The below generates a message in the GUI top-middle panel above the progress wheel
        if self.test_count == 0 and self.context and self.context.experiment_id:
            message = "TestMessage: CatBoost"
            loggerinfo(logger, message)
            task = kwargs.get('task')
            if task:
                task.sync(key=self.context.experiment_id, progress=dict(type='update', message=message))
                task.flush()

        from catboost import CatBoostClassifier, CatBoostRegressor, EFstrType
        lb = LabelEncoder()
        if self.num_classes >= 2:
            lb.fit(self.labels)
            y = lb.transform(y)

        if isinstance(X, dt.Frame):
            orig_cols = list(X.names)
            # dt -> catboost internally using buffer leaks, so convert here
            # assume predict is after pipeline collection or in subprocess so needs no protection
            X = X.to_numpy()  # don't assign back to X so don't damage during predict
            X = np.ascontiguousarray(X, dtype=np.float32 if config.data_precision == "float32" else np.float64)
            if eval_set is not None:
                valid_X = eval_set[0][0].to_numpy()  # don't assign back to X so don't damage during predict
                valid_X = np.ascontiguousarray(valid_X,
                                               dtype=np.float32 if config.data_precision == "float32" else np.float64)
                valid_y = eval_set[0][1]
                if self.num_classes >= 2:
                    valid_y = lb.transform(valid_y)
                eval_set[0] = (valid_X, valid_y)
        else:
            orig_cols = list(X.columns)

        if self.num_classes == 1:
            model = CatBoostRegressor(**self.params)
        else:
            model = CatBoostClassifier(**self.params)
        # Hit sometimes: Exception: catboost/libs/data_new/quantization.cpp:779: All features are either constant or ignored.
        if self.num_classes == 1:
            # assume not mae, which would use median
            # baseline = [np.mean(y)] * len(y)
            baseline = None
        else:
            baseline = None

        model.fit(X, y=y,
                  sample_weight=sample_weight,
                  baseline=baseline,
                  eval_set=eval_set,
                  early_stopping_rounds=kwargs.get('early_stopping_rounds', None),
                  verbose=self.params.get('verbose', False)
                  )

        # need to move to wrapper
        if model.get_best_iteration() is not None:
            iterations = model.get_best_iteration() + 1
        else:
            iterations = self.params['iterations'] + 1
        # must always set best_iterations
        self.set_model_properties(model=model,
                                  features=orig_cols,
                                  importances=model.feature_importances_,
                                  iterations=iterations)
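The label-encoding step above maps the experiment's class labels onto 0..K-1 before CatBoost sees them; a quick standalone check, assuming the recipe's LabelEncoder behaves like scikit-learn's:

# Standalone check of the label-encoding step used above, assuming a
# scikit-learn-compatible LabelEncoder.
import numpy as np
from sklearn.preprocessing import LabelEncoder

labels = ["no", "yes"]                  # the experiment's known class labels
y = np.array(["yes", "no", "yes", "yes"])

lb = LabelEncoder()
lb.fit(labels)
print(lb.transform(y))                  # [1 0 1 1]
print(lb.classes_)                      # ['no' 'yes']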
Ejemplo n.º 25
0
    def fit(self, X: dt.Frame, y: np.array = None, **kwargs):
        """
        Fits FB Prophet models (1 per time group) using historical target values contained in y
        Model fitting is distributed over a pool of processes and uses file storage to share the data with workers
        :param X: Datatable frame containing the features
        :param y: numpy array containing the historical values of the target
        :return: self
        """

        # Get the logger if it exists
        logger = None
        if self.context and self.context.experiment_id:
            logger = make_experiment_logger(
                experiment_id=self.context.experiment_id,
                tmp_dir=self.context.tmp_dir,
                experiment_tmp_dir=self.context.experiment_tmp_dir
            )

        try:
            # Add value of prophet_top_n in recipe_dict variable inside of config.toml file
            # eg1: recipe_dict="{'prophet_top_n': 200}"
            # eg2: recipe_dict="{'prophet_top_n':10}"
            self.top_n = config.recipe_dict['prophet_top_n']
        except KeyError:
            self.top_n = 50

        loggerinfo(logger, f"Prophet will use {self.top_n} groups as well as average target data.")

        tmp_folder = self._create_tmp_folder(logger)

        n_jobs = self._get_n_jobs(logger, **kwargs)

        # Reduce X to TGC
        tgc_wo_time = list(np.setdiff1d(self.tgc, self.time_column))
        X = X[:, self.tgc].to_pandas()

        # Fill NaNs or None
        X = X.replace([None, np.nan], 0)

        # Add target; the label encoder is only needed for classification, which this transformer does not support
        if self.labels is not None:
            y = LabelEncoder().fit(self.labels).transform(y)
        X['y'] = np.array(y)

        self.nan_value = X['y'].mean()

        # Change date feature name to match Prophet requirements
        X.rename(columns={self.time_column: "ds"}, inplace=True)

        # Create a general scale now that will be used for unknown groups at prediction time
        # Can we do smarter than that ?
        self.general_scaler = MinMaxScaler().fit(X[['y', 'ds']].groupby('ds').median().values)

        # Go through groups and standard scale them
        if len(tgc_wo_time) > 0:
            X_groups = X.groupby(tgc_wo_time)
        else:
            X_groups = [([None], X)]

        self.scalers = {}
        scaled_ys = []
        print(f'{datetime.now()} Start of group scaling')

        for key, X_grp in X_groups:
            # Create dict key to store the min max scaler
            grp_hash = self.get_hash(key)
            # Scale target for current group
            self.scalers[grp_hash] = MinMaxScaler()
            y_skl = self.scalers[grp_hash].fit_transform(X_grp[['y']].values)
            # Put back in a DataFrame to keep track of original index
            y_skl_df = pd.DataFrame(y_skl, columns=['y'])
            # (0, 'A') (1, 4) (100, 1) (100, 1)
            # print(grp_hash, X_grp.shape, y_skl.shape, y_skl_df.shape)

            y_skl_df.index = X_grp.index
            scaled_ys.append(y_skl_df)

        print(f'{datetime.now()} End of group scaling')
        # Set target back in original frame but keep original
        X['y_orig'] = X['y']
        X['y'] = pd.concat(tuple(scaled_ys), axis=0)

        # Now Average groups
        X_avg = X[['ds', 'y']].groupby('ds').mean().reset_index()

        # Send that to Prophet
        params = {
            "country_holidays": self.country_holidays,
            "monthly_seasonality": self.monthly_seasonality
        }
        mod = importlib.import_module('fbprophet')
        Prophet = getattr(mod, "Prophet")
        self.model = Prophet(yearly_seasonality=True, weekly_seasonality=True, daily_seasonality=True)

        if params["country_holidays"] is not None:
            self.model.add_country_holidays(country_name=params["country_holidays"])
        if params["monthly_seasonality"]:
            self.model.add_seasonality(name='monthly', period=30.5, fourier_order=5)

        with suppress_stdout_stderr():
            self.model.fit(X[['ds', 'y']])

        print(f'{datetime.now()} General Model Fitted')

        self.top_groups = None
        if len(tgc_wo_time) > 0:
            if self.top_n > 0:
                top_n_grp = X.groupby(tgc_wo_time).size().sort_values().reset_index()[tgc_wo_time].iloc[-self.top_n:].values
                self.top_groups = [
                    '_'.join(map(str, key))
                    for key in top_n_grp
                ]

        if self.top_groups:
            self.grp_models = {}
            self.priors = {}

            # Prepare for multi processing
            num_tasks = len(self.top_groups)

            def processor(out, res):
                out[res[0]] = res[1]

            pool_to_use = small_job_pool
            loggerinfo(logger, f"Prophet will use {n_jobs} workers for fitting.")
            loggerinfo(logger, "Prophet parameters holidays {} / monthly {}".format(self.country_holidays, self.monthly_seasonality))
            pool = pool_to_use(
                logger=None, processor=processor,
                num_tasks=num_tasks, max_workers=n_jobs
            )
            #
            # Fit 1 FB Prophet model per time group columns
            nb_groups = len(X_groups)

            # Put y back to its unscaled value for top groups
            X['y'] = X['y_orig']

            for _i_g, (key, X) in enumerate(X_groups):
                # Just log where we are in the fitting process
                if (_i_g + 1) % max(1, nb_groups // 20) == 0:
                    loggerinfo(logger, "FB Prophet : %d%% of groups fitted" % (100 * (_i_g + 1) // nb_groups))

                X_path = os.path.join(tmp_folder, "fbprophet_X" + str(uuid.uuid4()))
                X = X.reset_index(drop=True)
                save_obj(X, X_path)

                grp_hash = self.get_hash(key)

                if grp_hash not in self.top_groups:
                    continue

                self.priors[grp_hash] = X['y'].mean()

                params = {
                    "country_holidays": self.country_holidays,
                    "monthly_seasonality": self.monthly_seasonality
                }

                args = (X_path, grp_hash, tmp_folder, params)
                kwargs = {}
                pool.submit_tryget(None, MyParallelProphetTransformer_fit_async,
                                   args=args, kwargs=kwargs, out=self.grp_models)
            pool.finish()

            for k, v in self.grp_models.items():
                self.grp_models[k] = load_obj(v) if v is not None else None
                remove(v)

        self._clean_tmp_folder(logger, tmp_folder)

        return self
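The top-N selection above keeps the largest time groups by row count; a small synthetic check of that groupby/size/sort chain:

# Small synthetic check of the top-N group selection used above: groups are
# ranked by row count and the largest `top_n` ones are kept.
import pandas as pd

X = pd.DataFrame({"store": ["A", "A", "A", "C", "C", "B"],
                  "dept":  ["x", "x", "x", "y", "y", "z"]})
tgc_wo_time = ["store", "dept"]
top_n = 2

top_n_grp = (X.groupby(tgc_wo_time).size()
              .sort_values()
              .reset_index()[tgc_wo_time]
              .iloc[-top_n:]
              .values)
top_groups = ["_".join(map(str, key)) for key in top_n_grp]
print(top_groups)                        # ['C_y', 'A_x'], the two biggest groups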
Ejemplo n.º 26
0
    def fit(self,
            X,
            y,
            sample_weight=None,
            eval_set=None,
            sample_weight_eval_set=None,
            **kwargs):

        # Get TGC and time column
        self.tgc = self.params_base.get('tgc', None)
        self.time_column = self.params_base.get('time_column', None)
        self.nan_value = np.mean(y)
        self.cap = np.max(y) * 1.5  # TODO: compute the cap from average yearly growth rather than a flat 1.5 * max(y)
        self.prior = np.mean(y)

        if self.time_column is None:
            self.time_column = self.tgc[0]

        # Get the logger if it exists
        logger = None
        if self.context and self.context.experiment_id:
            logger = make_experiment_logger(
                experiment_id=self.context.experiment_id,
                tmp_dir=self.context.tmp_dir,
                experiment_tmp_dir=self.context.experiment_tmp_dir)
        loggerinfo(
            logger,
            "Start Fitting Prophet Model with params : {}".format(self.params))

        # Get temporary folders for multi process communication
        tmp_folder = self._create_tmp_folder(logger)

        n_jobs = self._get_n_jobs(logger, **kwargs)

        # Convert to pandas
        XX = X[:, self.tgc].to_pandas()
        XX = XX.replace([None, np.nan], 0)
        XX.rename(columns={self.time_column: "ds"}, inplace=True)
        # Make target available in the Frame
        XX['y'] = np.array(y)
        # Set target prior
        self.nan_value = np.mean(y)

        # Group the input by TGC (Time group column) excluding the time column itself
        tgc_wo_time = list(np.setdiff1d(self.tgc, self.time_column))
        if len(tgc_wo_time) > 0:
            XX_grp = XX.groupby(tgc_wo_time)
        else:
            XX_grp = [([None], XX)]

        self.models = {}
        self.priors = {}

        # Prepare for multi processing
        num_tasks = len(XX_grp)

        def processor(out, res):
            out[res[0]] = res[1]

        pool_to_use = small_job_pool
        loggerdebug(logger,
                    "Prophet will use {} workers for fitting".format(n_jobs))
        pool = pool_to_use(logger=None,
                           processor=processor,
                           num_tasks=num_tasks,
                           max_workers=n_jobs)

        # Fit 1 FB Prophet model per time group columns
        nb_groups = len(XX_grp)
        for _i_g, (key, X) in enumerate(XX_grp):
            # Just log where we are in the fitting process
            if (_i_g + 1) % max(1, nb_groups // 20) == 0:
                loggerinfo(
                    logger, "FB Prophet : %d%% of groups fitted" %
                    (100 * (_i_g + 1) // nb_groups))

            X_path = os.path.join(tmp_folder,
                                  "fbprophet_X" + str(uuid.uuid4()))
            X = X.reset_index(drop=True)
            save_obj(X, X_path)
            key = key if isinstance(key, list) else [key]
            grp_hash = '_'.join(map(str, key))

            self.priors[grp_hash] = X['y'].mean()

            args = (X_path, grp_hash, tmp_folder, self.params, self.cap)
            kwargs = {}
            pool.submit_tryget(None,
                               MyParallelProphetTransformer_fit_async,
                               args=args,
                               kwargs=kwargs,
                               out=self.models)
        pool.finish()
        for k, v in self.models.items():
            self.models[k] = load_obj(v) if v is not None else None
            remove(v)

        self._clean_tmp_folder(logger, tmp_folder)

        return None
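The grouping pattern used throughout these recipes (np.setdiff1d to drop the time column, with a single-group fallback keyed by [None]) behaves as follows on synthetic data:

# Synthetic illustration of the grouping pattern used throughout these
# recipes: the time column is removed from the TGC, and when nothing is
# left the whole frame is treated as a single group keyed by [None].
import numpy as np
import pandas as pd

XX = pd.DataFrame({"Date": ["2020-01-01", "2020-01-02"] * 2,
                   "store": ["A", "A", "B", "B"],
                   "y": [1.0, 2.0, 3.0, 4.0]})

# Case 1: time column plus one grouping column
print(list(np.setdiff1d(["Date", "store"], "Date")))    # ['store']

# Case 2: the TGC is the time column alone -> fall back to a single group
tgc_wo_time = list(np.setdiff1d(["Date"], "Date"))      # []
XX_grp = XX.groupby(tgc_wo_time) if len(tgc_wo_time) > 0 else [([None], XX)]
print(len(XX_grp))                                       # 1 group covering the whole frame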
Ejemplo n.º 27
0
    def fit(self, X: dt.Frame, y: np.array = None, **kwargs):
        """
        Fits ARIMA models (1 per time group) using historical target values contained in y
        Model fitting is distributed over a pool of processes and uses file storage to share the data with workers
        :param X: Datatable frame containing the features
        :param y: numpy array containing the historical values of the target
        :return: self
        """
        # Get the logger if it exists
        logger = None
        tmp_folder = str(uuid.uuid4()) + "_arima_folder/"
        if self.context and self.context.experiment_id:
            logger = make_experiment_logger(
                experiment_id=self.context.experiment_id,
                tmp_dir=self.context.tmp_dir,
                experiment_tmp_dir=self.context.experiment_tmp_dir)

        tmp_folder = self._create_tmp_folder(logger)

        n_jobs = self._get_n_jobs(logger, **kwargs)

        # Import the ARIMA python module
        pm = importlib.import_module('pmdarima')
        # Init models
        self.models = {}
        # Convert to pandas
        X = X.to_pandas()
        XX = X[self.tgc].copy()
        XX['y'] = np.array(y)
        self.nan_value = np.mean(y)
        self.ntrain = X.shape[0]

        # Group the input by TGC (Time group column) excluding the time column itself
        tgc_wo_time = list(np.setdiff1d(self.tgc, self.time_column))
        if len(tgc_wo_time) > 0:
            XX_grp = XX.groupby(tgc_wo_time)
        else:
            XX_grp = [([None], XX)]

        # Prepare for multi processing
        num_tasks = len(XX_grp)

        def processor(out, res):
            out[res[0]] = res[1]

        pool_to_use = small_job_pool
        loggerinfo(
            logger,
            "Arima will use {} workers for parallel processing".format(n_jobs))
        pool = pool_to_use(logger=None,
                           processor=processor,
                           num_tasks=num_tasks,
                           max_workers=n_jobs)

        # Build 1 ARIMA model per time group columns
        nb_groups = len(XX_grp)
        for _i_g, (key, X) in enumerate(XX_grp):
            # Just say where we are in the fitting process
            if (_i_g + 1) % max(1, nb_groups // 20) == 0:
                loggerinfo(
                    logger, "Auto ARIMA : %d%% of groups fitted" %
                    (100 * (_i_g + 1) // nb_groups))

            X_path = os.path.join(tmp_folder,
                                  "autoarima_X" + str(uuid.uuid4()))
            X = X.reset_index(drop=True)
            save_obj(X, X_path)
            key = key if isinstance(key, list) else [key]
            grp_hash = '_'.join(map(str, key))
            args = (X_path, grp_hash, self.time_column, tmp_folder)
            kwargs = {}
            pool.submit_tryget(None,
                               MyParallelAutoArimaTransformer_fit_async,
                               args=args,
                               kwargs=kwargs,
                               out=self.models)
        pool.finish()

        for k, v in self.models.items():
            self.models[k] = load_obj(v) if v is not None else None
            remove(v)

        self._clean_tmp_folder(logger, tmp_folder)

        return self
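
The fit above hands the actual per-group training to MyParallelAutoArimaTransformer_fit_async, whose body is not shown in this listing. Below is a minimal, hedged sketch of what such a worker could look like; it assumes the same save_obj/load_obj/remove helpers used in fit() are importable in the worker process and relies on pmdarima's auto_arima, so the argument handling is illustrative rather than the recipe's actual implementation:

def MyParallelAutoArimaTransformer_fit_async(X_path, grp_hash, time_column, tmp_folder):
    # Hedged sketch of a per-group worker: load the group frame saved by fit(),
    # fit one auto-ARIMA model on the target sorted by time, persist the model,
    # and return (group hash, model path) so the parent process can load it back.
    import os
    import uuid
    import importlib
    import numpy as np

    pm = importlib.import_module('pmdarima')

    model_path = None
    try:
        X = load_obj(X_path)  # helper assumed importable in the worker, as in fit()
        order = np.argsort(X[time_column].values)
        y = X['y'].values[order]
        model = pm.auto_arima(y, error_action='ignore', suppress_warnings=True)
        model_path = os.path.join(tmp_folder, "autoarima_model" + str(uuid.uuid4()))
        save_obj(model, model_path)
    except Exception:
        model_path = None  # the parent maps failed groups to None and uses the mean instead
    finally:
        remove(X_path)  # the serialized input frame is no longer needed
    return grp_hash, model_path

The returned tuple matches the processor(out, res) callback above, so self.models ends up mapping each group hash to a model path that the parent loads back with load_obj.
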
    def fit(self, X, y, sample_weight=None, eval_set=None, sample_weight_eval_set=None, **kwargs):

        orig_cols = list(X.names)

        import pandas as pd
        import numpy as np
        from skrules import SkopeRules
        from sklearn.preprocessing import LabelEncoder, OneHotEncoder
        from collections import Counter

        # Get the logger if it exists
        logger = None
        if self.context and self.context.experiment_id:
            logger = make_experiment_logger(experiment_id=self.context.experiment_id,
                                            tmp_dir=self.context.tmp_dir,
                                            experiment_tmp_dir=self.context.experiment_tmp_dir)

        # Set up temp folder
        tmp_folder = self._create_tmp_folder(logger)

        # Set up model
        if self.num_classes >= 2:
            lb = LabelEncoder()
            lb.fit(self.labels)
            y = lb.transform(y)

            model = SkopeRules(max_depth_duplication=self.params["max_depth_duplication"],
                               n_estimators=self.params["n_estimators"],
                               precision_min=self.params["precision_min"],
                               recall_min=self.params["recall_min"],
                               max_samples=self.params["max_samples"],
                               max_samples_features=self.params["max_samples_features"],
                               max_depth=self.params["max_depth"],
                               max_features=self.params["max_features"],
                               min_samples_split=self.params["min_samples_split"],
                               bootstrap=self.params["bootstrap"],
                               bootstrap_features=self.params["bootstrap_features"],
                               random_state=self.params["random_state"],
                               feature_names=orig_cols)
        else:
            # SkopeRules does not support regression, so no model is built
            loggerinfo(logger, "PASS, no SkopeRules model")

        # Find the datatypes
        X = X.to_pandas()
        X.columns = orig_cols

        # Change continuous features to categorical
        X_datatypes = [str(item) for item in list(X.dtypes)]

        # Change all float32 values to float64
        for ii in range(len(X_datatypes)):
            if X_datatypes[ii] == 'float32':
                X = X.astype({orig_cols[ii]: np.float64})

        X_datatypes = [str(item) for item in list(X.dtypes)]

        # List the categorical and numerical features
        self.X_categorical = [orig_cols[col_count] for col_count in range(len(orig_cols)) if
                              (X_datatypes[col_count] == 'category') or (X_datatypes[col_count] == 'object')]
        self.X_numeric = [item for item in orig_cols if item not in self.X_categorical]

        # Find the levels and mode for each categorical feature
        # for use in the test set
        self.train_levels = {}
        self.train_mode = {}
        for item in self.X_categorical:
            self.train_levels[item] = list(set(X[item]))
            self.train_mode[item] = Counter(X[item]).most_common(1)[0][0]

        # One-hot encode the categorical features
        # and replace missing values with a "Missing" category
        if len(self.X_categorical) > 0:
            loggerinfo(logger, "PCategorical encode")

            for colname in self.X_categorical:
                X[colname] = list(X[colname].fillna("Missing"))
            self.enc = OneHotEncoder(handle_unknown='ignore')

            self.enc.fit(X[self.X_categorical])
            self.encoded_categories = list(self.enc.get_feature_names(input_features=self.X_categorical))

            X_enc = self.enc.transform(X[self.X_categorical]).toarray()

            X = pd.concat([X[self.X_numeric], pd.DataFrame(X_enc, columns=self.encoded_categories)], axis=1)

        # Replace missing values with a missing value code
        if len(self.X_numeric) > 0:

            for colname in self.X_numeric:
                X[colname] = list(X[colname].fillna(-999))

        model.fit(np.array(X), np.array(y))

        # Find the rule list
        self.rule_list = model.rules_

        # Calculate feature importances
        var_imp = []
        for var in orig_cols:
            var_imp.append(sum(int(var in item[0]) for item in self.rule_list))

        if max(var_imp) != 0:
            importances = list(np.array(var_imp) / max(var_imp))
        else:
            importances = [1] * len(var_imp)

        pd.DataFrame(model.rules_, columns=['Rule', '(Precision, Recall, nb)']).to_csv(
            os.path.join(tmp_folder, 'Skope_rules.csv'), index=False)

        self.mean_target = np.array(sum(y) / len(y))

        # Set model properties
        self.set_model_properties(model=model,
                                  features=list(X.columns),
                                  importances=importances,
                                  iterations=self.params['n_estimators'])
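
The rule list drives both the Skope_rules.csv dump and the importance calculation above, so it helps to see what model.rules_ contains. The toy data and column names below are purely illustrative; only the skrules calls mirror the recipe:

import numpy as np
import pandas as pd
from skrules import SkopeRules

# Toy binary-classification data with named features (illustrative only)
rng = np.random.RandomState(0)
X_toy = pd.DataFrame({'age': rng.randint(18, 80, 500),
                      'income': rng.normal(50000, 15000, 500)})
y_toy = ((X_toy['age'] > 50) & (X_toy['income'] > 55000)).astype(int).values

clf = SkopeRules(n_estimators=20, precision_min=0.5, recall_min=0.05,
                 feature_names=list(X_toy.columns), random_state=42)
clf.fit(X_toy.values, y_toy)

# rules_ is a list of (rule_string, (precision, recall, nb)) tuples -- the same
# structure the fit above writes to Skope_rules.csv and scans for importances
for rule, perf in clf.rules_[:5]:
    print(rule, perf)
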
    def fit(self,
            X,
            y,
            sample_weight=None,
            eval_set=None,
            sample_weight_eval_set=None,
            **kwargs):

        # Get TGC and time column
        self.tgc = self.params_base.get('tgc', None)
        self.time_column = self.params_base.get('time_column', None)
        self.nan_value = np.mean(y)
        self.cap = np.max(y) * 1.5  # TODO: compute a cap from average yearly growth instead
        self.prior = np.mean(y)

        if self.time_column is None:
            self.time_column = self.tgc[0]

        # Get the logger if it exists
        logger = None
        if self.context and self.context.experiment_id:
            logger = make_experiment_logger(
                experiment_id=self.context.experiment_id,
                tmp_dir=self.context.tmp_dir,
                experiment_tmp_dir=self.context.experiment_tmp_dir)

        loggerinfo(
            logger,
            "Start Fitting Prophet Model with params : {}".format(self.params))

        try:
            # Add value of prophet_top_n in recipe_dict variable inside of config.toml file
            # eg1: recipe_dict="{'prophet_top_n': 200}"
            # eg2: recipe_dict="{'prophet_top_n':10}"
            self.top_n = config.recipe_dict['prophet_top_n']
        except KeyError:
            self.top_n = 50

        loggerinfo(
            logger,
            f"Prophet will use {self.top_n} groups as well as average target data."
        )

        # Get temporary folders for multi process communication
        tmp_folder = self._create_tmp_folder(logger)

        n_jobs = self._get_n_jobs(logger, **kwargs)

        # Reduce X to TGC
        tgc_wo_time = list(np.setdiff1d(self.tgc, self.time_column))
        X = X[:, self.tgc].to_pandas()

        # Fill NaNs or None
        X = X.replace([None, np.nan], 0)

        # Add target; the label encoder is only used for classification, which we don't support here
        if self.labels is not None:
            y = LabelEncoder().fit(self.labels).transform(y)
        X['y'] = np.array(y)

        self.nan_value = X['y'].mean()

        # Change date feature name to match Prophet requirements
        X.rename(columns={self.time_column: "ds"}, inplace=True)

        # Create a general scaler now that will be used for unknown groups at prediction time
        # TODO: could we do something smarter here?
        general_scaler = MinMaxScaler().fit(
            X[['y', 'ds']].groupby('ds').median().values)

        # Go through groups and standard scale them
        if len(tgc_wo_time) > 0:
            X_groups = X.groupby(tgc_wo_time)
        else:
            X_groups = [([None], X)]

        scalers = {}
        scaled_ys = []

        loggerinfo(logger, f'Number of groups : {len(X_groups)}')
        for g in tgc_wo_time:
            loggerinfo(logger, f'Number of levels in group column {g} : {X[g].unique().shape[0]}')

        for key, X_grp in X_groups:
            # Create dict key to store the min max scaler
            grp_hash = self.get_hash(key)
            # Scale target for current group
            scalers[grp_hash] = MinMaxScaler()
            y_skl = scalers[grp_hash].fit_transform(X_grp[['y']].values)
            # Put back in a DataFrame to keep track of original index
            y_skl_df = pd.DataFrame(y_skl, columns=['y'])

            y_skl_df.index = X_grp.index
            scaled_ys.append(y_skl_df)

        # Set target back in original frame but keep original
        X['y_orig'] = X['y']
        X['y'] = pd.concat(tuple(scaled_ys), axis=0)

        # Now Average groups
        X_avg = X[['ds', 'y']].groupby('ds').mean().reset_index()

        # Send that to Prophet
        mod = importlib.import_module('fbprophet')
        Prophet = getattr(mod, "Prophet")
        nrows = X[['ds', 'y']].shape[0]
        n_changepoints = max(1, int(nrows * (2 / 3)))
        if n_changepoints < 25:
            model = Prophet(yearly_seasonality=True,
                            weekly_seasonality=True,
                            daily_seasonality=True,
                            n_changepoints=n_changepoints)
        else:
            model = Prophet(yearly_seasonality=True,
                            weekly_seasonality=True,
                            daily_seasonality=True)

        if self.params["country_holidays"] is not None:
            model.add_country_holidays(
                country_name=self.params["country_holidays"])
        if self.params["monthly_seasonality"]:
            model.add_seasonality(
                name='monthly',
                period=30.5,
                fourier_order=self.params["monthly_seasonality"])
        if self.params["quarterly_seasonality"]:
            model.add_seasonality(
                name='quarterly',
                period=92,
                fourier_order=self.params["quarterly_seasonality"])

        with suppress_stdout_stderr():
            model.fit(X[['ds', 'y']])

        top_groups = None
        if len(tgc_wo_time) > 0:
            if self.top_n > 0:
                # Keep the self.top_n groups with the most observations
                top_n_grp = X.groupby(tgc_wo_time).size().sort_values().reset_index()
                top_n_grp = top_n_grp[tgc_wo_time].iloc[-self.top_n:].values
                top_groups = ['_'.join(map(str, key)) for key in top_n_grp]

        grp_models = {}
        priors = {}
        if top_groups:
            # Prepare for multi processing
            num_tasks = len(top_groups)

            def processor(out, res):
                out[res[0]] = res[1]

            pool_to_use = small_job_pool
            loggerinfo(logger,
                       f"Prophet will use {n_jobs} workers for fitting.")

            pool = pool_to_use(logger=None,
                               processor=processor,
                               num_tasks=num_tasks,
                               max_workers=n_jobs)
            # Fit 1 FB Prophet model per time group
            nb_groups = len(X_groups)

            # Put y back to its unscaled value for top groups
            X['y'] = X['y_orig']

            for _i_g, (key, X) in enumerate(X_groups):
                # Just log where we are in the fitting process
                if (_i_g + 1) % max(1, nb_groups // 20) == 0:
                    loggerinfo(
                        logger, "FB Prophet : %d%% of groups fitted" %
                        (100 * (_i_g + 1) // nb_groups))

                grp_hash = self.get_hash(key)
                # Only fit dedicated models for the top groups, skip the rest early
                if grp_hash not in top_groups:
                    continue

                X_path = os.path.join(tmp_folder,
                                      "fbprophet_X" + str(uuid.uuid4()))
                X = X.reset_index(drop=True)
                save_obj(X, X_path)

                priors[grp_hash] = X['y'].mean()

                args = (X_path, grp_hash, tmp_folder, self.params, self.cap)
                kwargs = {}
                pool.submit_tryget(None,
                                   MyParallelProphetTransformer_fit_async,
                                   args=args,
                                   kwargs=kwargs,
                                   out=grp_models)
            pool.finish()

            for k, v in grp_models.items():
                grp_models[k] = load_obj(v) if v is not None else None
                if v is not None:
                    remove(v)

        self._clean_tmp_folder(logger, tmp_folder)

        self.set_model_properties(
            model={
                'avg': model,
                'group': grp_models,
                'priors': priors,
                'topgroups': top_groups,
                'skl': scalers,
                'gen_scaler': general_scaler
            },
            features=self.tgc,  # Prophet uses time and timegroups
            importances=np.ones(len(self.tgc)),
            iterations=-1  # Does not have iterations
        )

        return None
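
As in the ARIMA recipe, the per-group fitting is delegated to MyParallelProphetTransformer_fit_async, which is defined elsewhere. A hedged sketch of such a worker is shown below; the exact arguments (including cap, which the real recipe passes but this sketch does not exercise) and the fallback behaviour are illustrative assumptions, not the recipe's actual implementation:

def MyParallelProphetTransformer_fit_async(X_path, grp_hash, tmp_folder, params, cap):
    # Hedged sketch of a per-group worker: load the group frame ('ds'/'y' columns)
    # saved by fit(), train one FB Prophet model on it, persist the model and
    # return (group hash, model path) for the parent process to load back.
    import os
    import uuid
    import importlib

    mod = importlib.import_module('fbprophet')
    Prophet = getattr(mod, 'Prophet')

    model_path = None
    try:
        X = load_obj(X_path)  # helper assumed importable in the worker, as in fit()
        model = Prophet(yearly_seasonality=True,
                        weekly_seasonality=True,
                        daily_seasonality=True)
        if params.get("country_holidays") is not None:
            model.add_country_holidays(country_name=params["country_holidays"])
        if params.get("monthly_seasonality"):
            model.add_seasonality(name='monthly', period=30.5,
                                  fourier_order=params["monthly_seasonality"])
        with suppress_stdout_stderr():
            model.fit(X[['ds', 'y']])
        model_path = os.path.join(tmp_folder, "fbprophet_model" + str(uuid.uuid4()))
        save_obj(model, model_path)
    except Exception:
        model_path = None  # the parent then falls back to the average model / stored prior
    finally:
        remove(X_path)
    return grp_hash, model_path
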
Example No. 30
    def fit_transform(self, X: dt.Frame, y: np.array = None, **kwargs):

        X_original = X

        X = X[:, dt.f[int].extend(dt.f[float]).extend(dt.f[bool]).
              extend(dt.f[str])]

        if hasattr(self, 'run_count'):
            self.run_count += 1
        else:
            self.run_count = 0

        # Get the logger if it exists
        logger = None
        if self.context and self.context.experiment_id:
            logger = make_experiment_logger(
                experiment_id=self.context.experiment_id,
                tmp_dir=self.context.tmp_dir,
                experiment_tmp_dir=self.context.experiment_tmp_dir,
                username=self.context.username,
            )

        survival_event = self.__class__._survival_event
        if survival_event in X.names:
            raise ValueError(
                "Consider renaming feature '{}'.".format(survival_event))

        # bind y to X to use as event in CoxPH
        X[:, survival_event] = np.array(LabelEncoder().fit_transform(y))

        # sanity check that target is binary
        if X[survival_event].nunique()[0, 0] != 2:
            raise ValueError(
                "Event column must contain exactly 2 distinct values, found {}.".
                format(X[survival_event].nunique()[0, 0]))

        # redress target values into 0, 1
        event_max = X[survival_event].max()[0, 0]
        X[dt.f[survival_event] != event_max, survival_event] = 0
        X[dt.f[survival_event] == event_max, survival_event] = 1

        stop_column_name = self.__class__._stop_column_name
        ignored_columns = self.__class__._ignored_columns

        if stop_column_name is None:
            raise ValueError("Stop column name can't be null.")

        main_message = "Survival Analysis CoxPH pre-transformer will use event '{}' and time '{}' columns.". \
            format(survival_event, stop_column_name)

        # In the acceptance test, simply return the input X
        if stop_column_name not in X.names:
            loggerwarning(
                logger,
                "Survival Analysis CoxPH pre-transformer found no time column '{}'."
                .format(stop_column_name))
            return X_original

        if X[:, stop_column_name].stype not in [
                dt.bool8, dt.int8, dt.int16, dt.int32, dt.int64, dt.float32,
                dt.float64
        ]:
            raise ValueError(
                "Stop column '{}' type must be numeric, but found '{}'".format(
                    stop_column_name, X[:, stop_column_name].stype))

        # remove stop column from X
        del X_original[:, stop_column_name]

        self._output_feature_names = list(X_original.names)
        self._feature_desc = list(X_original.names)

        if self.run_count == 0 and self.context and self.context.experiment_id:
            loggerinfo(logger, main_message)
            task = kwargs.get('task')
            if task and main_message is not None:
                task.sync(key=self.context.experiment_id,
                          progress=dict(type='update', message=main_message))
                task.flush()

        # Validate CoxPH requirements on stop column
        if X[stop_column_name].min()[0, 0] < 0:
            X[dt.f[stop_column_name] < 0, stop_column_name] = 0
            loggerwarning(
                logger,
                "Stop column can't be negative: replaced negative values with 0."
            )
        if X[stop_column_name].countna()[0, 0] > 0:
            X[dt.isna(dt.f[stop_column_name]), stop_column_name] = 0
            loggerwarning(
                logger,
                "Stop column can't contain NULLs: replaced NULL with 0.")

        h2o.init(port=config.h2o_recipes_port, log_dir=self.my_log_dir)
        model = H2OCoxProportionalHazardsEstimator(
            stop_column=stop_column_name,
            ties=self.ties,
            max_iterations=self.max_iterations)
        frame = h2o.H2OFrame(X.to_pandas())
        model_path = None
        risk_frame = None
        try:
            model.train(y=survival_event,
                        training_frame=frame,
                        ignored_columns=ignored_columns)
            self.id = model.model_id
            model_path = os.path.join(temporary_files_path,
                                      "h2o_model." + str(uuid.uuid4()))
            model_path = h2o.save_model(model=model, path=model_path)
            with open(model_path, "rb") as f:
                self.raw_model_bytes = f.read()
            risk_frame = model.predict(frame)
            X_original[:, "risk_score_coxph_{}_{}".format(
                self.ties, self.max_iterations)] = risk_frame.as_data_frame(
                    header=False)
            self._output_feature_names.append(
                f"{self.display_name}{orig_feat_prefix}riskscore_coxph{extra_prefix}{self.ties}_{self.max_iterations}"
            )
            self._feature_desc.append(
                f"CoxPH model risk score [ties={self.ties}, max.iter={self.max_iterations}]"
            )
            return X_original
        finally:
            if model_path is not None:
                remove(model_path)
            h2o.remove(model)
            h2o.remove(frame)
            if risk_frame is not None:
                h2o.remove(risk_frame)
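
Because only the raw model bytes (self.raw_model_bytes) and the model id survive pickling, the matching transform has to materialise the CoxPH estimator again before scoring new rows. A minimal sketch of that reload step, assuming the same temporary_files_path, config and remove helpers and a reachable H2O cluster (illustrative, not the recipe's actual transform code):

def reload_coxph_model(raw_model_bytes, model_id, log_dir=None):
    # Illustrative helper: write the stored model bytes back to disk under the
    # saved model id, then ask H2O to load that file again for scoring.
    h2o.init(port=config.h2o_recipes_port, log_dir=log_dir)
    model_path = os.path.join(temporary_files_path, model_id)
    with open(model_path, "wb") as f:
        f.write(raw_model_bytes)
    try:
        model = h2o.load_model(os.path.abspath(model_path))
    finally:
        remove(model_path)
    return model

Calling model.predict(h2o.H2OFrame(X.to_pandas())) on the reloaded estimator then yields the same risk score column that fit_transform appends above.
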