Example No. 1
    def predict(self, X: dt.Frame, **kwargs):
        """
        Uses fitted models (1 per time group) to predict the target
        :param X: Datatable Frame containing the features
        :return: FB Prophet predictions
        """
        # Get the logger if it exists
        logger = None
        if self.context and self.context.experiment_id:
            logger = make_experiment_logger(
                experiment_id=self.context.experiment_id,
                tmp_dir=self.context.tmp_dir,
                experiment_tmp_dir=self.context.experiment_tmp_dir)

        if self.tgc is None or not all([x in X.names for x in self.tgc]):
            loggerdebug(logger, "Return 0 predictions")
            return np.ones(X.shape[0]) * self.nan_value

        tmp_folder = self._create_tmp_folder(logger)

        n_jobs = self._get_n_jobs(logger, **kwargs)

        XX = X[:, self.tgc].to_pandas()
        XX = XX.replace([None, np.nan], 0)
        XX.rename(columns={self.time_column: "ds"}, inplace=True)

        if self.params["growth"] == "logistic":
            XX["cap"] = self.cap

        tgc_wo_time = list(np.setdiff1d(self.tgc, self.time_column))
        if len(tgc_wo_time) > 0:
            XX_grp = XX.groupby(tgc_wo_time)
        else:
            XX_grp = [([None], XX)]
        assert len(XX_grp) > 0
        num_tasks = len(XX_grp)

        def processor(out, res):
            out.append(res)

        pool_to_use = small_job_pool
        loggerdebug(logger,
                    "Prophet will use {} workers for transform".format(n_jobs))
        pool = pool_to_use(logger=None,
                           processor=processor,
                           num_tasks=num_tasks,
                           max_workers=n_jobs)
        XX_paths = []
        model_paths = []
        nb_groups = len(XX_grp)
        print("Nb Groups = ", nb_groups)
        for _i_g, (key, X) in enumerate(XX_grp):
            # Log where we are in the transformation of the dataset
            if (_i_g + 1) % max(1, nb_groups // 20) == 0:
                loggerinfo(
                    logger, "FB Prophet : %d%% of groups transformed" %
                    (100 * (_i_g + 1) // nb_groups))

            key = key if isinstance(key, list) else [key]
            grp_hash = '_'.join(map(str, key))
            X_path = os.path.join(tmp_folder,
                                  "fbprophet_Xt" + str(uuid.uuid4()))
            # Commented for performance, uncomment for debug
            # print("prophet - transforming data of shape: %s for group: %s" % (str(X.shape), grp_hash))
            if grp_hash in self.models:
                model = self.models[grp_hash]
                model_path = os.path.join(
                    tmp_folder, "fbprophet_modelt" + str(uuid.uuid4()))
                save_obj(model, model_path)
                save_obj(X, X_path)
                model_paths.append(model_path)

                args = (model_path, X_path, self.priors[grp_hash], tmp_folder)
                kwargs = {}
                pool.submit_tryget(
                    None,
                    MyParallelProphetTransformer_transform_async,
                    args=args,
                    kwargs=kwargs,
                    out=XX_paths)
            else:
                XX = pd.DataFrame(np.full((X.shape[0], 1), self.nan_value),
                                  columns=['yhat'])  # unseen groups
                XX.index = X.index
                save_obj(XX, X_path)
                XX_paths.append(X_path)
        pool.finish()
        XX = pd.concat((load_obj(XX_path) for XX_path in XX_paths),
                       axis=0).sort_index()
        for p in XX_paths + model_paths:
            remove(p)

        self._clean_tmp_folder(logger, tmp_folder)

        return XX['yhat'].values
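
The pool worker MyParallelProphetTransformer_transform_async is referenced above but not shown. The sketch below is a guess at its shape, inferred from how its results are consumed: the processor callback appends the returned path to XX_paths, which is later loaded and concatenated. The body is hypothetical, and it assumes load_obj, save_obj and suppress_stdout_stderr come from the recipe's utility imports.

def MyParallelProphetTransformer_transform_async(model_path, X_path, prior, tmp_folder):
    """Hypothetical sketch of the per-group prediction worker."""
    import numpy as np
    import pandas as pd

    model = load_obj(model_path)  # fitted Prophet model (or None) for this group
    X = load_obj(X_path)          # pandas frame with a 'ds' column for this group
    # Prophet may reorder predictions by time, so track the time order
    # to realign predictions with the original rows
    order = np.argsort(pd.to_datetime(X["ds"]))
    if model is not None:
        with suppress_stdout_stderr():  # silence Prophet/Stan chatter
            XX = model.predict(X)[['yhat']]
    else:
        # No model could be fit for this group: fall back to the group prior
        XX = pd.DataFrame(np.full((X.shape[0], 1), prior), columns=['yhat'])
    XX.index = X.index[order]
    save_obj(XX, X_path)          # overwrite the input path with the predictions
    return X_path                 # appended to XX_paths by the processor callback
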
Example No. 2
    def predict(self, X: dt.Frame, **kwargs):
        """
        Uses fitted models (1 per time group) to predict the target
        :param X: Datatable Frame containing the features
        :return: FB Prophet predictions
        """
        # Get the logger if it exists
        logger = None
        if self.context and self.context.experiment_id:
            logger = make_experiment_logger(
                experiment_id=self.context.experiment_id,
                tmp_dir=self.context.tmp_dir,
                experiment_tmp_dir=self.context.experiment_tmp_dir)

        if self.tgc is None or not all([x in X.names for x in self.tgc]):
            loggerdebug(logger, "Return 0 predictions")
            return np.ones(X.shape[0]) * self.nan_value

        models, _, _, _ = self.get_model_properties()

        model = models['avg']
        grp_models = models['group']
        priors = models['priors']
        top_groups = models['topgroups']
        scalers = models['skl']
        general_scaler = models['gen_scaler']

        tmp_folder = self._create_tmp_folder(logger)

        n_jobs = self._get_n_jobs(logger, **kwargs)

        # Reduce X to TGC
        tgc_wo_time = list(np.setdiff1d(self.tgc, self.time_column))
        X = X[:, self.tgc].to_pandas()

        # Fill NaNs or None
        X = X.replace([None, np.nan], 0)

        # Change date feature name to match Prophet requirements
        X.rename(columns={self.time_column: "ds"}, inplace=True)

        if self.params["growth"] == "logistic":
            X["cap"] = self.cap

        # Predict y using unique dates
        X_time = X[['ds']].groupby('ds').first().reset_index()
        with suppress_stdout_stderr():
            y_avg = model.predict(X_time)[['ds', 'yhat']]

        # Prophet converts the date column to datetime, so convert it back before merging
        X_time.sort_values('ds', inplace=True)
        X_time['yhat'] = y_avg['yhat']
        X_time.sort_index(inplace=True)

        # Merge back into original frame on 'ds'
        # pd.merge wipes the index ... so keep it to provide it again
        indices = X.index
        X = pd.merge(left=X, right=X_time[['ds', 'yhat']], on='ds', how='left')
        X.index = indices

        # Go through groups and recover the scaled target for known groups
        if len(tgc_wo_time) > 0:
            X_groups = X.groupby(tgc_wo_time)
        else:
            X_groups = [([None], X)]

        inverted_ys = []
        for key, X_grp in X_groups:
            grp_hash = self.get_hash(key)

            # Scale target for current group
            if grp_hash in scalers.keys():
                inverted_y = scalers[grp_hash].inverse_transform(
                    X_grp[['yhat']])
            else:
                inverted_y = general_scaler.inverse_transform(X_grp[['yhat']])

            # Put back in a DataFrame to keep track of original index
            inverted_df = pd.DataFrame(inverted_y, columns=['yhat'])
            inverted_df.index = X_grp.index
            inverted_ys.append(inverted_df)

        XX_general = pd.concat(tuple(inverted_ys), axis=0).sort_index()

        if top_groups:
            # Go through the groups and predict only the top ones
            XX_paths = []
            model_paths = []

            def processor(out, res):
                out.append(res)

            num_tasks = len(top_groups)
            pool_to_use = small_job_pool
            pool = pool_to_use(logger=None,
                               processor=processor,
                               num_tasks=num_tasks,
                               max_workers=n_jobs)

            nb_groups = len(X_groups)
            for _i_g, (key, X_grp) in enumerate(X_groups):

                # Just log where we are in the prediction process
                if (_i_g + 1) % max(1, nb_groups // 20) == 0:
                    loggerinfo(
                        logger, "FB Prophet : %d%% of groups predicted" %
                        (100 * (_i_g + 1) // nb_groups))

                # Create the dict key used to look up this group's model
                grp_hash = self.get_hash(key)
                X_path = os.path.join(tmp_folder,
                                      "fbprophet_Xt" + str(uuid.uuid4()))

                if grp_hash not in top_groups:
                    XX = pd.DataFrame(np.full((X_grp.shape[0], 1), np.nan),
                                      columns=['yhat'])  # not a top group
                    XX.index = X_grp.index
                    save_obj(XX, X_path)
                    XX_paths.append(X_path)
                    continue

                if grp_models[grp_hash] is None:
                    XX = pd.DataFrame(np.full((X_grp.shape[0], 1), np.nan),
                                      columns=['yhat'])  # no model was fit for this group
                    XX.index = X_grp.index
                    save_obj(XX, X_path)
                    XX_paths.append(X_path)
                    continue

                model = grp_models[grp_hash]
                model_path = os.path.join(
                    tmp_folder, "fbprophet_modelt" + str(uuid.uuid4()))
                save_obj(model, model_path)
                save_obj(X_grp, X_path)
                model_paths.append(model_path)

                args = (model_path, X_path, priors[grp_hash], tmp_folder)
                kwargs = {}
                pool.submit_tryget(
                    None,
                    MyParallelProphetTransformer_transform_async,
                    args=args,
                    kwargs=kwargs,
                    out=XX_paths)

            pool.finish()
            XX_top_groups = pd.concat((load_obj(XX_path)
                                       for XX_path in XX_paths),
                                      axis=0).sort_index()
            for p in XX_paths + model_paths:
                remove(p)

        self._clean_tmp_folder(logger, tmp_folder)

        features_df = pd.DataFrame()
        features_df['GrpAvg'] = XX_general['yhat']

        if top_groups:
            features_df[f'_Top{self.top_n}Grp'] = XX_top_groups['yhat']
            features_df.loc[features_df[f'_Top{self.top_n}Grp'].notnull(),
                            'GrpAvg'] = features_df.loc[
                                features_df[f'_Top{self.top_n}Grp'].notnull(),
                                f'_Top{self.top_n}Grp']

        # Models have to return a numpy array
        return features_df['GrpAvg'].values
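
The final override above replaces the inverse-scaled average-model prediction with the dedicated per-group prediction wherever a top-group model produced one. A tiny standalone illustration of that pattern, on toy data with hypothetical names and values:

import numpy as np
import pandas as pd

# Stand-ins for XX_general (average model, always available) and
# XX_top_groups (per-group models, NaN where no top-group model exists)
features_df = pd.DataFrame({'GrpAvg': [10.0, 20.0, 30.0, 40.0]})
features_df['_Top2Grp'] = [np.nan, 22.0, np.nan, 44.0]

# Prefer the dedicated per-group prediction wherever it exists
mask = features_df['_Top2Grp'].notnull()
features_df.loc[mask, 'GrpAvg'] = features_df.loc[mask, '_Top2Grp']

print(features_df['GrpAvg'].values)  # [10. 22. 30. 44.]
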
Example No. 3
    def fit(self,
            X,
            y,
            sample_weight=None,
            eval_set=None,
            sample_weight_eval_set=None,
            **kwargs):

        # Get TGC and time column
        self.tgc = self.params_base.get('tgc', None)
        self.time_column = self.params_base.get('time_column', None)
        self.nan_value = np.mean(y)
        # TODO: compute the cap from average yearly growth instead of this
        # arbitrary 1.5x multiplier
        self.cap = np.max(y) * 1.5
        self.prior = np.mean(y)

        if self.time_column is None:
            self.time_column = self.tgc[0]

        # Get the logger if it exists
        logger = None
        if self.context and self.context.experiment_id:
            logger = make_experiment_logger(
                experiment_id=self.context.experiment_id,
                tmp_dir=self.context.tmp_dir,
                experiment_tmp_dir=self.context.experiment_tmp_dir)
        loggerinfo(
            logger,
            "Start Fitting Prophet Model with params : {}".format(self.params))

        # Get temporary folders for multi process communication
        tmp_folder = self._create_tmp_folder(logger)

        n_jobs = self._get_n_jobs(logger, **kwargs)

        # Convert to pandas
        XX = X[:, self.tgc].to_pandas()
        XX = XX.replace([None, np.nan], 0)
        XX.rename(columns={self.time_column: "ds"}, inplace=True)
        # Make target available in the Frame
        XX['y'] = np.array(y)
        # Set target prior
        self.nan_value = np.mean(y)

        # Group the input by TGC (Time group column) excluding the time column itself
        tgc_wo_time = list(np.setdiff1d(self.tgc, self.time_column))
        if len(tgc_wo_time) > 0:
            XX_grp = XX.groupby(tgc_wo_time)
        else:
            XX_grp = [([None], XX)]

        self.models = {}
        self.priors = {}

        # Prepare for multi processing
        num_tasks = len(XX_grp)

        def processor(out, res):
            out[res[0]] = res[1]

        pool_to_use = small_job_pool
        loggerdebug(logger,
                    "Prophet will use {} workers for fitting".format(n_jobs))
        pool = pool_to_use(logger=None,
                           processor=processor,
                           num_tasks=num_tasks,
                           max_workers=n_jobs)

        # Fit 1 FB Prophet model per time group columns
        nb_groups = len(XX_grp)
        for _i_g, (key, X) in enumerate(XX_grp):
            # Just log where we are in the fitting process
            if (_i_g + 1) % max(1, nb_groups // 20) == 0:
                loggerinfo(
                    logger, "FB Prophet : %d%% of groups fitted" %
                    (100 * (_i_g + 1) // nb_groups))

            X_path = os.path.join(tmp_folder,
                                  "fbprophet_X" + str(uuid.uuid4()))
            X = X.reset_index(drop=True)
            save_obj(X, X_path)
            key = key if isinstance(key, list) else [key]
            grp_hash = '_'.join(map(str, key))

            self.priors[grp_hash] = X['y'].mean()

            args = (X_path, grp_hash, tmp_folder, self.params, self.cap)
            kwargs = {}
            pool.submit_tryget(None,
                               MyParallelProphetTransformer_fit_async,
                               args=args,
                               kwargs=kwargs,
                               out=self.models)
        pool.finish()
        for k, v in self.models.items():
            self.models[k] = load_obj(v) if v is not None else None
            if v is not None:
                remove(v)

        self._clean_tmp_folder(logger, tmp_folder)

        return None
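
As with transform, the fitting worker MyParallelProphetTransformer_fit_async is not shown. Since processor stores res[1] under key res[0] and the caller then loads each stored value with load_obj, the worker plausibly fits one Prophet model per group and returns (grp_hash, model_path). A hedged sketch, wiring through only the growth parameter from self.params (the real worker presumably forwards more of them), again assuming the recipe's load_obj, save_obj and suppress_stdout_stderr helpers:

def MyParallelProphetTransformer_fit_async(X_path, grp_hash, tmp_folder, params, cap):
    """Hypothetical sketch of the per-group fitting worker."""
    import os
    import uuid
    from fbprophet import Prophet  # older package name, matching this recipe

    X = load_obj(X_path)  # frame with 'ds' (date) and 'y' (target) columns
    if X['ds'].nunique() < 2:
        # Not enough distinct dates to fit a time series model for this group
        return grp_hash, None
    model = Prophet(growth=params["growth"])
    cols = ['ds', 'y']
    if params["growth"] == "logistic":
        X['cap'] = cap  # logistic growth needs a carrying capacity column
        cols.append('cap')
    with suppress_stdout_stderr():  # silence Prophet/Stan output
        model.fit(X[cols])
    model_path = os.path.join(tmp_folder, "fbprophet_model" + str(uuid.uuid4()))
    save_obj(model, model_path)
    return grp_hash, model_path
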
Example No. 4
    def predict(self, X, **kwargs):
        orig_cols = list(X.names)
        import pandas as pd
        import xgboost as xgb
        import numpy as np

        def sigmoid(x):
            z = 1.0 / (1.0 + np.exp(-x))
            return z

        # Get the logger if it exists
        logger = None
        if self.context and self.context.experiment_id:
            logger = make_experiment_logger(
                experiment_id=self.context.experiment_id,
                tmp_dir=self.context.tmp_dir,
                experiment_tmp_dir=self.context.experiment_tmp_dir)

        X = dt.Frame(X)

        X = X.to_pandas()

        if self.protected in list(X.columns):
            # Set the protected group to 0 and all others to 1
            loggerdebug(logger, "Protected test found")
            protected_test = np.array([
                int(item) for item in
                ~(np.array(X[self.protected]) == self.protected_label)
            ])
            X = X.drop(self.protected, axis=1)
        else:
            loggerdebug(logger, "Protected test not found")
            protected_test = np.array([])

        # Replace missing values with a missing category
        # Replace categories that weren't in the training set with the mode
        if len(self.X_categorical) > 0:

            for colname in self.X_categorical:
                if colname in list(X.columns):
                    X[colname] = list(X[colname].fillna("Missing"))

            for label in self.X_categorical:
                if label in list(X.columns):
                    # Replace anything not seen in the training set with the training mode
                    train_categories = self.train_levels[label]
                    X_label = np.array(X[label])
                    mmode = self.train_mode[label]
                    X_label[~np.isin(X_label, train_categories)] = mmode
                    X[label] = X_label

        # Replace missing values with a missing value code
        if len(self.X_numeric) > 0:
            for colname in self.X_numeric:
                if colname in list(X.columns):
                    X[colname] = list(X[colname].fillna(-999))

        # Get model
        model, _, _, _ = self.get_model_properties()

        # Remove the protected group
        if self.protected in self.X_categorical:
            self.X_categorical.remove(self.protected)

        # One hot encode categorical features
        if len(self.X_categorical) > 0:
            X_enc = self.enc.transform(X[self.X_categorical]).toarray()
            X = pd.concat([
                X[self.X_numeric],
                pd.DataFrame(X_enc, columns=self.encoded_categories)
            ],
                          axis=1)

        d_test = xgb.DMatrix(X, missing=np.nan)

        # If the positive target was 0, change the final result to 1-p
        if self.positive_target == 0:
            preds = 1.0 - sigmoid(model.predict(d_test))
        else:
            preds = sigmoid(model.predict(d_test))

        mean_preds = np.mean(preds)

        # Set a penalty value to which some probabilities will be changed
        # if the fairness threshold isn't reached
        epsilon = 0.0001
        if mean_preds > 0.5:
            penalty = epsilon
        else:
            penalty = 1.0 - epsilon

        # Only apply penalties in the training stage
        if self.is_train:
            # If the protected value was removed, use the maximum penalty
            # by changing all probabilities to the penalty value
            # (the recipe needs to be able to use the protected values)
            if self.protected == "none":
                preds[0:len(preds)] = penalty
                loggerdata(logger, str(preds))
                loggerdata(logger, "Removal_penalty")

            else:
                # The mean ratio calculation for target=0 and target=1
                if self.positive_target == 0:
                    if np.mean(preds[protected_test == 1]) < 1.0:
                        DI = (1.0 - np.mean(preds[protected_test == 0])) / (
                            1.0 - np.mean(preds[protected_test == 1]))
                    else:
                        DI = 1
                else:
                    if np.mean(preds[protected_test == 1]) > 0.0:
                        DI = np.mean(preds[protected_test == 0]) / np.mean(
                            preds[protected_test == 1])
                    else:
                        DI = 1

                loggerdata(logger, "Mean ratio Check")
                loggerdata(logger, str(DI))

                if DI < self.mean_protected_prediction_ratio_minimum:
                    # Create a penalty proportional to the distance below the specified threshold
                    len_preds = len(preds)
                    num_penalty = min(
                        len_preds,
                        int((self.mean_protected_prediction_ratio_minimum - DI)
                            / self.mean_protected_prediction_ratio_minimum *
                            len_preds))

                    preds[0:num_penalty] = penalty
                    loggerdata(logger, "num_penalty1")
                    loggerdata(logger, str(num_penalty) + " " +
                               str(num_penalty / len(preds)))

        self.is_train = False

        return preds
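
The "Mean ratio Check" above is a disparate impact style metric: the mean predicted probability of the protected group (protected_test == 0) divided by the mean for everyone else (protected_test == 1). A small worked example of the ratio and of the proportional penalty count, using a hypothetical threshold in place of mean_protected_prediction_ratio_minimum:

import numpy as np

preds = np.array([0.60, 0.55, 0.20, 0.25, 0.70, 0.65])
# 0 marks the protected group, 1 everyone else (matches protected_test above)
protected_test = np.array([1, 1, 0, 0, 1, 1])

# Disparate impact: protected group's mean prediction over the reference group's
DI = np.mean(preds[protected_test == 0]) / np.mean(preds[protected_test == 1])
print(round(DI, 2))  # 0.225 / 0.625 = 0.36

# Penalty count grows with the distance below the threshold
threshold = 0.8  # hypothetical value for mean_protected_prediction_ratio_minimum
num_penalty = min(len(preds), int((threshold - DI) / threshold * len(preds)))
print(num_penalty)  # int((0.8 - 0.36) / 0.8 * 6) = 3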