Ejemplo n.º 1
0
    def _fit_async(X_path, grp_hash, tmp_folder):
        """
        Fit one FB Prophet model on the data of a single time group.

        :param X_path: path to the stored frame of the group (columns 'ds' and 'y' are used)
        :param grp_hash: identifier of the time group
        :param tmp_folder: folder in which the fitted model is stored
        :return: tuple (grp_hash, model_path); model_path is None when the
                 group holds fewer than 20 rows, in which case nothing is fitted
        """
        # Seed both random generators with a fixed value before fitting
        np.random.seed(1234)
        random.seed(1234)

        frame = load_obj(X_path)
        too_small = frame.shape[0] < 20
        if too_small:
            # Small-data work-around: no model for this group
            return grp_hash, None

        # The fbprophet package is resolved by name at call time
        prophet_cls = getattr(importlib.import_module('fbprophet'), "Prophet")
        fitted = prophet_cls()
        with suppress_stdout_stderr():
            fitted.fit(frame[['ds', 'y']])

        target_path = os.path.join(tmp_folder, "fbprophet_model" + str(uuid.uuid4()))
        save_obj(fitted, target_path)
        remove(X_path)  # input file is removed to indicate success
        return grp_hash, target_path
Ejemplo n.º 2
0
    def _fit_async(X_path, grp_hash, time_column, tmp_folder):
        """
        Fits an ARIMA model for a particular time group
        :param X_path: Path to the data used to fit the ARIMA model
        :param grp_hash: Time group identifier
        :param time_column: Name of the time column in the input data
        :param tmp_folder: Folder in which the pickled model is stored
        :return: time group identifier and path to the pickled model
                 (the stored object is None when fitting failed)
        """
        np.random.seed(1234)
        random.seed(1234)
        X = load_obj(X_path)

        pm = importlib.import_module('pmdarima')
        with suppress_stdout_stderr():
            try:
                # The target values are passed to ARIMA in time order
                order = np.argsort(X[time_column])
                model = pm.auto_arima(X['y'].values[order],
                                      error_action='ignore')
            except Exception:
                # Best effort: a group that cannot be fitted gets no model.
                # Only Exception is caught so that KeyboardInterrupt and
                # SystemExit still propagate.
                model = None
        model_path = os.path.join(tmp_folder,
                                  "autoarima_model" + str(uuid.uuid4()))
        save_obj(model, model_path)
        remove(X_path)  # remove to indicate success
        return grp_hash, model_path
    def _transform_async(model_path, X_path, nan_value, has_is_train_attr, time_column):
        """
        Predicts target for a particular time group using a stored ARIMA model
        :param model_path: path to the stored model
        :param X_path: path to the stored data of the time group
        :param nan_value: value used for every row when the stored model is None
        :param has_is_train_attr: truthy to use in-sample predictions,
               falsy to forecast X.shape[0] periods
        :param time_column: Name of the time column in the input data
        :return: path to the stored single-column ('yhat') frame of predictions
        """
        model = load_obj(model_path)
        XX_path = os.path.join(temporary_files_path, "autoarima_XXt" + str(uuid.uuid4()))
        X = load_obj(X_path)
        # Predictions come back ordered by time, while the rows of X are in
        # input order. order[k] is the input position of the k-th row in time.
        order = np.asarray(np.argsort(X[time_column]))
        if model is not None:
            yhat = model.predict_in_sample() \
                if has_is_train_attr else model.predict(n_periods=X.shape[0])
            # Revert the time sort with the inverse permutation. Indexing with
            # `order` itself would sort a second time and misplace predictions.
            yhat = np.asarray(yhat)[np.argsort(order)]
            XX = pd.DataFrame(yhat, columns=['yhat'])

        else:
            XX = pd.DataFrame(np.full((X.shape[0], 1), nan_value), columns=['yhat'])  # invalid model

        # Sync index
        XX.index = X.index
        assert XX.shape[1] == 1
        save_obj(XX, XX_path)
        remove(model_path)  # indicates success, no longer need
        remove(X_path)  # indicates success, no longer need
        return XX_path
Ejemplo n.º 4
0
 def _transform_async(model_path, X_path, nan_value, tmp_folder):
     """
     Predict the target of one time group with its stored FB Prophet model.

     :param model_path: path to the stored model (the stored object may be None)
     :param X_path: path to the stored data of the time group
     :param nan_value: value used for every row when the stored model is None
     :param tmp_folder: folder in which the predictions are stored
     :return: path to the stored single-column ('yhat') frame of predictions
     """
     fitted = load_obj(model_path)
     out_path = os.path.join(tmp_folder, "fbprophet_XX" + str(uuid.uuid4()))
     frame = load_obj(X_path)

     # Prophet returns its predictions ordered by time, so remember the time
     # order of the input rows to label each prediction with the right index.
     time_order = np.argsort(pd.to_datetime(frame["ds"]))

     if fitted is None:
         # Invalid model: constant fallback
         values = np.full((frame.shape[0], 1), nan_value)
     else:
         values = fitted.predict(frame)['yhat'].values
     preds = pd.DataFrame(values, columns=['yhat'])

     preds.index = frame.index[time_order]
     assert preds.shape[1] == 1
     save_obj(preds, out_path)
     # Both inputs are removed to indicate success
     remove(model_path)
     remove(X_path)
     return out_path
Ejemplo n.º 5
0
    def _transform_async(model_path, X_path, nan_value, tmp_folder):
        """
        Predicts target for a particular time group
        :param model_path: path to the stored model
        :param X_path: Path to the data of the time group (must hold a 'ds' column)
        :param nan_value: Value of target prior, used when no fitted model has been found
        :param tmp_folder: Folder in which the predictions are stored
        :return: path to the stored single-column ('yhat') frame of predictions
        """
        model = load_obj(model_path)
        XX_path = os.path.join(tmp_folder, "fbprophet_XX" + str(uuid.uuid4()))
        X = load_obj(X_path)

        if model is None:
            # No fitted model for this group: fall back to the target prior
            # instead of failing on model.predict
            XX = pd.DataFrame(np.full((X.shape[0], 1), nan_value),
                              columns=['yhat'],
                              index=X.index)
        else:
            # Predict once per distinct date only
            X_time = X[['ds']].groupby('ds').first().reset_index()
            with suppress_stdout_stderr():
                y_avg = model.predict(X_time)[['ds', 'yhat']]

            # Prophet transforms the date column to datetime, so predictions
            # are attached by position to the original dates before merging back
            X_time.sort_values('ds', inplace=True)
            X_time['yhat'] = y_avg['yhat']
            X_time.sort_index(inplace=True)
            # Merge back into original frame on 'ds'
            # pd.merge wipes the index ... so keep it to provide it again
            indices = X.index
            X = pd.merge(left=X, right=X_time[['ds', 'yhat']], on='ds', how='left')
            X.index = indices
            XX = X[['yhat']]

        save_obj(XX, XX_path)
        remove(model_path)  # indicates success, no longer need
        remove(X_path)  # indicates success, no longer need

        return XX_path
Ejemplo n.º 6
0
    def _fit_async(X_path, grp_hash, tmp_folder, params):
        """
        Fit one FB Prophet model (yearly, weekly and daily seasonality enabled)
        on the data of a single time group.

        :param X_path: path to the stored frame of the group (columns 'ds' and 'y' are used)
        :param grp_hash: identifier of the time group
        :param tmp_folder: folder in which the fitted model is stored
        :param params: mapping read for "country_holidays" (country name or None)
                       and "monthly_seasonality" (truthy to add a monthly term)
        :return: tuple (grp_hash, model_path); model_path is None when the
                 group holds fewer than 20 rows
        """
        np.random.seed(1234)
        random.seed(1234)

        frame = load_obj(X_path)
        if frame.shape[0] < 20:
            # Small-data work-around: nothing is fitted for this group
            return grp_hash, None

        # The fbprophet package is resolved by name at call time
        prophet_cls = getattr(importlib.import_module('fbprophet'), "Prophet")
        seasonality_flags = dict(yearly_seasonality=True,
                                 weekly_seasonality=True,
                                 daily_seasonality=True)
        fitted = prophet_cls(**seasonality_flags)

        # Optional model components driven by params
        if params["country_holidays"] is not None:
            fitted.add_country_holidays(country_name=params["country_holidays"])
        if params["monthly_seasonality"]:
            fitted.add_seasonality(name='monthly', period=30.5, fourier_order=5)

        with suppress_stdout_stderr():
            fitted.fit(frame[['ds', 'y']])

        target_path = os.path.join(tmp_folder, "fbprophet_model" + str(uuid.uuid4()))
        save_obj(fitted, target_path)
        remove(X_path)  # input file is removed to indicate success
        return grp_hash, target_path
    def fit(self, X: dt.Frame, y: np.array = None):
        """
        Fit one FB Prophet model per time group.

        The time group columns (self.tgc) are extracted from X, the time
        column is renamed to 'ds' and the target is attached as 'y'. Each
        group is written to a temporary file and handed to a worker pool;
        the fitted models end up in self.models keyed by group hash.

        :param X: frame holding at least the columns listed in self.tgc
        :param y: target values, label-encoded first when self.labels is set
        :return: self
        """
        frame = X[:, self.tgc].to_pandas().replace([None, np.nan], 0)
        frame = frame.rename(columns={self.time_column: "ds"})
        if self.labels is not None:
            y = LabelEncoder().fit(self.labels).transform(y)
        frame['y'] = np.array(y)
        self.nan_value = np.mean(y)  # TODO - store mean per group, not just global

        # Group on every time group column except the time column itself
        group_cols = list(np.setdiff1d(self.tgc, self.time_column))
        groups = frame.groupby(group_cols) if len(group_cols) > 0 else [([None], frame)]
        self.models = {}

        def store_result(out, res):
            # res is the (grp_hash, model_path) pair returned by the worker
            out[res[0]] = res[1]

        pool = small_job_pool(logger=None, processor=store_result, num_tasks=len(groups))
        for grp_key, grp_frame in groups:
            data_path = os.path.join(temporary_files_path, "fbprophet_X" + str(uuid.uuid4()))
            save_obj(grp_frame.reset_index(drop=True), data_path)
            key_parts = grp_key if isinstance(grp_key, list) else [grp_key]
            grp_hash = '_'.join(str(part) for part in key_parts)
            pool.submit_tryget(None,
                               MyParallelProphetTransformer_fit_async,
                               args=(data_path, grp_hash),
                               kwargs={},
                               out=self.models)
        pool.finish()

        # Swap every stored model path for the model itself
        for model_key, stored in self.models.items():
            self.models[model_key] = None if stored is None else load_obj(stored)
            remove(stored)
        return self
Ejemplo n.º 8
0
    def _transform_async(model_path, X_path, nan_value, has_is_train_attr, time_column, tmp_folder):
        """
        Predicts target for a particular time group
        :param model_path: path to the stored model
        :param X_path: Path to the data of the time group
        :param nan_value: Value of target prior, used when no fitted model has been found
        :param has_is_train_attr: indicates if we predict in-sample or out-of-sample
        :param time_column: Name of the time column in the input data
        :param tmp_folder: Folder in which the predictions are stored
        :return: path to the stored single-column ('yhat') frame of predictions
        """
        model = load_obj(model_path)
        XX_path = os.path.join(tmp_folder, "autoarima_XXt" + str(uuid.uuid4()))
        X = load_obj(X_path)
        # Arima returns the predictions ordered by time, while the rows of X
        # are in input order. order[k] is the input position of the k-th row
        # in time.
        order = np.asarray(np.argsort(X[time_column]))
        if model is not None:
            yhat = model.predict_in_sample() \
                if has_is_train_attr else model.predict(n_periods=X.shape[0])
            # Revert the time sort with the inverse permutation. Indexing with
            # `order` itself would sort a second time and misplace predictions.
            yhat = np.asarray(yhat)[np.argsort(order)]
            XX = pd.DataFrame(yhat, columns=['yhat'])

        else:
            XX = pd.DataFrame(np.full((X.shape[0], 1), nan_value), columns=['yhat'])  # invalid model

        # Sync index
        XX.index = X.index
        assert XX.shape[1] == 1
        save_obj(XX, XX_path)
        remove(model_path)  # indicates success, no longer need
        remove(X_path)  # indicates success, no longer need
        return XX_path
 def _transform_async(model_path, X_path, nan_value):
     """
     Predict the target of one time group with its stored FB Prophet model,
     mapping predictions back to the input rows by date.

     :param model_path: path to the stored model (the stored object may be None)
     :param X_path: path to the stored data of the time group (needs a 'ds' column)
     :param nan_value: value used for every row when the stored model is None
     :return: path to the stored single-column ('yhat') frame of predictions
     """
     fitted = load_obj(model_path)
     out_path = os.path.join(temporary_files_path, "fbprophet_XXt" + str(uuid.uuid4()))
     frame = load_obj(X_path)

     if fitted is None:
         # Invalid model: constant fallback
         preds = pd.DataFrame(np.full((frame.shape[0], 1), nan_value), columns=['yhat'])
     else:
         # Prophet returns its predictions ordered by time, so predictions are
         # looked up by date rather than by position.
         dated = frame.copy()
         dated['ds'] = pd.to_datetime(dated['ds'])
         forecast = fitted.predict(frame)[['ds', 'yhat']]
         # A date may repeat inside a group; one value is kept per date
         # (the max of the values predicted for that date)
         yhat_by_date = forecast.groupby('ds').max()['yhat']
         dated['yhat'] = dated["ds"].map(yhat_by_date)
         preds = dated[['yhat']].reset_index(drop=True)

     preds.index = frame.index
     assert preds.shape[1] == 1
     save_obj(preds, out_path)
     # Both inputs are removed to indicate success
     remove(model_path)
     remove(X_path)
     return out_path
    def _fit_async(X_path, grp_hash, tmp_folder, params, cap):
        """
        Fit one FB Prophet model on the data of a single time group.

        :param X_path: path to the stored frame of the group (columns 'ds' and 'y' are used)
        :param grp_hash: identifier of the time group
        :param tmp_folder: folder in which the fitted model is stored
        :param params: mapping read for "growth", "country_holidays",
                       "monthly_seasonality" and "quarterly_seasonality"
        :param cap: value written to the 'cap' column when growth is "logistic"
        :return: tuple (grp_hash, model_path); model_path is None when the
                 group holds fewer than 20 rows
        """
        np.random.seed(1234)
        random.seed(1234)

        frame = load_obj(X_path)
        if frame.shape[0] < 20:
            return grp_hash, None

        # The fbprophet package is resolved by name at call time
        prophet_cls = getattr(importlib.import_module('fbprophet'), "Prophet")

        # Number of changepoints is two thirds of the rows; it is only passed
        # explicitly when it is below 25, otherwise the constructor default applies
        nrows = frame[['ds', 'y']].shape[0]
        n_changepoints = max(1, int(nrows * (2 / 3)))
        ctor_kwargs = {"growth": params["growth"]}
        if n_changepoints < 25:
            ctor_kwargs["n_changepoints"] = n_changepoints
        fitted = prophet_cls(**ctor_kwargs)

        # Optional holidays and extra seasonalities
        if params["country_holidays"] is not None:
            fitted.add_country_holidays(country_name=params["country_holidays"])
        extra_seasonalities = (
            ('monthly', 30.5, "monthly_seasonality"),
            ('quarterly', 92, "quarterly_seasonality"),
        )
        for season_name, season_period, param_key in extra_seasonalities:
            if params[param_key]:
                fitted.add_seasonality(name=season_name,
                                       period=season_period,
                                       fourier_order=params[param_key])

        fit_columns = ['ds', 'y']
        with suppress_stdout_stderr():
            if params["growth"] == "logistic":
                # Logistic growth is fitted with an additional 'cap' column
                frame["cap"] = cap
                fit_columns.append('cap')
            fitted.fit(frame[fit_columns])

        target_path = os.path.join(tmp_folder, "fbprophet_model" + str(uuid.uuid4()))
        save_obj(fitted, target_path)
        remove(X_path)  # input file is removed to indicate success
        return grp_hash, target_path
    def _fit_async(X_path, grp_hash, time_column):
        """
        Fits an ARIMA model for a particular time group
        :param X_path: Path to the data used to fit the ARIMA model
        :param grp_hash: Time group identifier
        :param time_column: Name of the time column in the input data
        :return: time group identifier and path to the pickled model
                 (the stored object is None when fitting failed)
        """
        np.random.seed(1234)
        random.seed(1234)
        X = load_obj(X_path)

        pm = importlib.import_module('pmdarima')
        with suppress_stdout_stderr():
            try:
                # The target values are passed to ARIMA in time order
                order = np.argsort(X[time_column])
                model = pm.auto_arima(X['y'].values[order], error_action='ignore')
            except Exception:
                # Best effort: a group that cannot be fitted gets no model.
                # Only Exception is caught so that KeyboardInterrupt and
                # SystemExit still propagate.
                model = None
        model_path = os.path.join(temporary_files_path, "autoarima_model" + str(uuid.uuid4()))
        save_obj(model, model_path)
        remove(X_path)  # remove to indicate success
        return grp_hash, model_path
 def _transform_async(model_path, X_path, nan_value):
     """
     Predicts target for a particular time group
     :param model_path: path to the stored model
     :param X_path: Path to the data of the time group (must hold a 'ds' column)
     :param nan_value: Value used for every row when the stored model is None
     :return: path to the stored single-column ('yhat') frame of predictions
     """
     model = load_obj(model_path)
     XX_path = os.path.join(temporary_files_path,
                            "fbprophet_XXt" + str(uuid.uuid4()))
     X = load_obj(X_path)
     if model is not None:
         # Facebook Prophet returns the predictions ordered by time, so they
         # must be put back in input order before taking over X's index.
         # order[k] is the input position of the k-th row in time; its
         # argsort is the inverse permutation.
         order = np.argsort(pd.to_datetime(X["ds"]).values, kind="stable")
         yhat = model.predict(X[['ds']])['yhat'].values
         XX = pd.DataFrame(yhat[np.argsort(order)], columns=['yhat'])
     else:
         XX = pd.DataFrame(np.full((X.shape[0], 1), nan_value),
                           columns=['yhat'])  # invalid models
     XX.index = X.index
     assert XX.shape[1] == 1
     save_obj(XX, XX_path)
     remove(model_path)  # indicates success, no longer need
     remove(X_path)  # indicates success, no longer need
     return XX_path
 def _fit_async(X_path, grp_hash):
     """
     Fit one FB Prophet model on the data of a single time group.

     :param X_path: path to the stored frame of the group (columns 'ds' and 'y' are used)
     :param grp_hash: identifier of the time group
     :return: tuple (grp_hash, model_path); model_path is None when the
              group holds fewer than 20 rows, in which case nothing is fitted
     """
     np.random.seed(1234)
     random.seed(1234)
     frame = load_obj(X_path)

     if frame.shape[0] >= 20:
         # The fbprophet package is resolved by name at call time
         prophet_cls = getattr(importlib.import_module('fbprophet'), "Prophet")
         fitted = prophet_cls()
         with suppress_stdout_stderr():
             fitted.fit(frame[['ds', 'y']])
         target_path = os.path.join(temporary_files_path, "fbprophet_model" + str(uuid.uuid4()))
         save_obj(fitted, target_path)
         remove(X_path)  # input file is removed to indicate success
         return grp_hash, target_path

     # Small-data work-around: no model for this group
     return grp_hash, None
Ejemplo n.º 14
0
    def _fit_async(data_path, grp_hash, tmp_folder, params):
        """
        Fit one FB Prophet model on the data of a single time group.

        The fit itself is delegated to fit_prophet_model, which receives the
        Prophet class, the loaded data and params.

        :param data_path: path to the stored data of the time group
        :param grp_hash: identifier of the time group
        :param tmp_folder: folder in which the fitted model is stored
        :param params: passed through unchanged to fit_prophet_model
        :return: tuple (grp_hash, path to the stored model)
        """
        # Seed both random generators with a fixed value before fitting
        np.random.seed(1234)
        random.seed(1234)

        frame = load_obj(data_path)

        # The fbprophet package is resolved by name at call time
        prophet_cls = getattr(importlib.import_module('fbprophet'), "Prophet")
        fitted = fit_prophet_model(prophet_cls, frame, params)

        target_path = os.path.join(tmp_folder, "fbprophet_model" + str(uuid.uuid4()))
        save_obj(fitted, target_path)
        remove(data_path)  # input file is removed to indicate success
        return grp_hash, target_path
    def fit(self, X: dt.Frame, y: np.array = None):
        """
        Fits ARIMA models (1 per time group) using the target values in y
        Each group is written to a temporary file and handed to a worker pool;
        the fitted models end up in self.models keyed by group hash
        :param X: frame holding at least the columns listed in self.tgc
        :param y: target values
        :return: self
        """
        # Imported by name; the module object is not used directly in this method
        pm = importlib.import_module('pmdarima')
        self.models = {}
        X = X.to_pandas()
        XX = X[self.tgc].copy()
        XX['y'] = np.array(y)
        self.nan_value = np.mean(y)
        self.ntrain = X.shape[0]
        # Group on every time group column except the time column itself
        tgc_wo_time = list(np.setdiff1d(self.tgc, self.time_column))
        if len(tgc_wo_time) > 0:
            XX_grp = XX.groupby(tgc_wo_time)
        else:
            XX_grp = [([None], XX)]

        num_tasks = len(XX_grp)

        def processor(out, res):
            # res is the (grp_hash, model_path) pair returned by the worker
            out[res[0]] = res[1]

        pool_to_use = small_job_pool
        pool = pool_to_use(logger=None, processor=processor, num_tasks=num_tasks)
        nb_groups = len(XX_grp)
        for _i_g, (key, X) in enumerate(XX_grp):
            # Report progress roughly every 5% of the groups
            if (_i_g + 1) % max(1, nb_groups // 20) == 0:
                # %-formatting is applied so that a single percent sign is printed
                print("Auto ARIMA - %d%% of Groups Fitted" % (100 * (_i_g + 1) // nb_groups))
            X_path = os.path.join(temporary_files_path, "autoarima_X" + str(uuid.uuid4()))
            X = X.reset_index(drop=True)
            save_obj(X, X_path)
            key = key if isinstance(key, list) else [key]
            grp_hash = '_'.join(map(str, key))
            args = (X_path, grp_hash, self.time_column,)
            kwargs = {}
            pool.submit_tryget(None, MyParallelAutoArimaTransformer_fit_async, args=args, kwargs=kwargs, out=self.models)
        pool.finish()

        # Swap every stored model path for the model itself
        for k, v in self.models.items():
            self.models[k] = load_obj(v) if v is not None else None
            remove(v)
        return self
    def transform(self, X: dt.Frame):
        """
        Uses the fitted models (1 per time group) to predict the target
        Groups with an entry in self.models are predicted in a worker pool;
        groups without one get self.nan_value for every row
        :param X: frame holding at least the columns listed in self.tgc
        :return: single-column ('yhat') pandas frame sorted by index
        """
        X = X.to_pandas()
        X = X.replace([None, np.nan], 0)
        XX = X[self.tgc].copy()
        XX.rename(columns={self.time_column: "ds"}, inplace=True)
        # Group on every time group column except the time column itself
        tgc_wo_time = list(np.setdiff1d(self.tgc, self.time_column))
        if len(tgc_wo_time) > 0:
            XX_grp = XX.groupby(tgc_wo_time)
        else:
            XX_grp = [([None], XX)]
        assert len(XX_grp) > 0
        num_tasks = len(XX_grp)

        def processor(out, res):
            # res is the path of the prediction file written by the worker
            out.append(res)

        pool_to_use = small_job_pool
        pool = pool_to_use(logger=None,
                           processor=processor,
                           max_workers=self.n_jobs,
                           num_tasks=num_tasks)
        XX_paths = []
        model_paths = []
        for key, X in XX_grp:
            key = key if isinstance(key, list) else [key]
            grp_hash = '_'.join(map(str, key))
            X_path = os.path.join(temporary_files_path,
                                  "fbprophet_Xt" + str(uuid.uuid4()))
            print("prophet - transforming data of shape: %s for group: %s" %
                  (str(X.shape), grp_hash))
            if grp_hash in self.models:
                model = self.models[grp_hash]
                model_path = os.path.join(
                    temporary_files_path,
                    "fbprophet_modelt" + str(uuid.uuid4()))
                save_obj(model, model_path)
                save_obj(X, X_path)
                model_paths.append(model_path)

                args = (model_path, X_path, self.nan_value)
                kwargs = {}
                pool.submit_tryget(
                    None,
                    MyParallelProphetTransformer_transform_async,
                    args=args,
                    kwargs=kwargs,
                    out=XX_paths)
            else:
                XX = pd.DataFrame(np.full((X.shape[0], 1), self.nan_value),
                                  columns=['yhat'])  # unseen groups
                # Sync the index with the group's rows; without this the frame
                # keeps a fresh 0..n-1 index and ends up on the wrong rows
                # after the concat + sort_index below
                XX.index = X.index
                save_obj(XX, X_path)
                XX_paths.append(X_path)
        pool.finish()
        # Reassemble all groups and restore the input row order
        XX = pd.concat((load_obj(XX_path) for XX_path in XX_paths),
                       axis=0).sort_index()
        for p in XX_paths + model_paths:
            remove(p)
        return XX
Ejemplo n.º 17
0
 def _transform_async(model_path, X_path, nan_value):
     """
     Predict the target of one time group with its stored FB Prophet model.

     :param model_path: path to the stored model (the stored object may be None)
     :param X_path: path to the stored data of the time group (needs a 'ds' column)
     :param nan_value: value used for every row when the stored model is None
     :return: path to the stored single-column ('yhat') frame of predictions
     """
     fitted = load_obj(model_path)
     out_path = os.path.join(temporary_files_path,
                             "fbprophet_XXt" + str(uuid.uuid4()))
     frame = load_obj(X_path)

     # Prophet returns its predictions ordered by time; the time order of the
     # input rows is kept so that each prediction gets the matching index label.
     time_order = np.argsort(pd.to_datetime(frame["ds"]))

     if fitted is not None:
         values = fitted.predict(frame)['yhat'].values
     else:
         # Invalid model: constant fallback
         values = np.full((frame.shape[0], 1), nan_value)
     preds = pd.DataFrame(values, columns=['yhat'])

     preds.index = frame.index[time_order]
     assert preds.shape[1] == 1
     save_obj(preds, out_path)
     # Both inputs are removed to indicate success
     remove(model_path)
     remove(X_path)
     return out_path
    def transform(self, X: dt.Frame):
        """
        Uses the fitted ARIMA models (1 per time group) to predict the target
        Groups with an entry in self.models are predicted in a worker pool;
        groups without one get self.nan_value for every row
        The presence of a self.is_train attribute is passed to the workers
        :param X: frame holding at least the columns listed in self.tgc
        :return: single-column ('yhat') pandas frame sorted by index
        """
        X = X.to_pandas()
        XX = X[self.tgc].copy()
        # Group on every time group column except the time column itself
        tgc_wo_time = list(np.setdiff1d(self.tgc, self.time_column))
        if len(tgc_wo_time) > 0:
            XX_grp = XX.groupby(tgc_wo_time)
        else:
            XX_grp = [([None], XX)]

        assert len(XX_grp) > 0
        num_tasks = len(XX_grp)

        def processor(out, res):
            # res is the path of the prediction file written by the worker
            out.append(res)

        pool_to_use = small_job_pool
        pool = pool_to_use(logger=None, processor=processor, num_tasks=num_tasks)
        XX_paths = []
        model_paths = []
        nb_groups = len(XX_grp)
        for _i_g, (key, X) in enumerate(XX_grp):
            # Report progress roughly every 5% of the groups
            if (_i_g + 1) % max(1, nb_groups // 20) == 0:
                # %-formatting is applied so that a single percent sign is printed
                print("Auto ARIMA - %d%% of Groups Transformed" % (100 * (_i_g + 1) // nb_groups))
            key = key if isinstance(key, list) else [key]
            grp_hash = '_'.join(map(str, key))
            X_path = os.path.join(temporary_files_path, "autoarima_Xt" + str(uuid.uuid4()))

            if grp_hash in self.models:
                model = self.models[grp_hash]
                model_path = os.path.join(temporary_files_path, "autoarima_modelt" + str(uuid.uuid4()))
                save_obj(model, model_path)
                save_obj(X, X_path)
                model_paths.append(model_path)

                args = (model_path, X_path, self.nan_value, hasattr(self, 'is_train'), self.time_column,)
                kwargs = {}
                pool.submit_tryget(None, MyParallelAutoArimaTransformer_transform_async, args=args, kwargs=kwargs,
                                   out=XX_paths)
            else:
                # Don't go through pools
                XX = pd.DataFrame(np.full((X.shape[0], 1), self.nan_value), columns=['yhat'])  # unseen groups
                # Sync indices
                XX.index = X.index
                save_obj(XX, X_path)
                XX_paths.append(X_path)
        pool.finish()
        # Reassemble all groups and restore the input row order
        XX = pd.concat((load_obj(XX_path) for XX_path in XX_paths), axis=0).sort_index()
        for p in XX_paths + model_paths:
            remove(p)
        return XX
Ejemplo n.º 19
0
    def fit(self,
            X,
            y,
            sample_weight=None,
            eval_set=None,
            sample_weight_eval_set=None,
            **kwargs):
        """
        Fits FB Prophet models (1 per time group) using the target values in y
        Each group is written to a file in a temporary folder and handed to a
        worker pool; fitted models are loaded back into self.models and the
        per-group target mean is kept in self.priors, both keyed by group hash
        :param X: frame holding at least the columns listed in self.tgc
        :param y: target values
        :param sample_weight: not used in this method
        :param eval_set: not used in this method
        :param sample_weight_eval_set: not used in this method
        :param kwargs: forwarded to self._get_n_jobs
        :return: None
        """

        # Get TGC and time column
        self.tgc = self.params_base.get('tgc', None)
        self.time_column = self.params_base.get('time_column', None)
        self.nan_value = np.mean(y)
        # Cap handed to the workers: 1.5 times the largest target value
        self.cap = np.max(
            y
        ) * 1.5  # TODO Don't like this we should compute a cap from average yearly growth
        self.prior = np.mean(y)

        # Fall back to the first time group column when no time column is given
        if self.time_column is None:
            self.time_column = self.tgc[0]

        # Get the logger if it exists
        logger = None
        if self.context and self.context.experiment_id:
            logger = make_experiment_logger(
                experiment_id=self.context.experiment_id,
                tmp_dir=self.context.tmp_dir,
                experiment_tmp_dir=self.context.experiment_tmp_dir)
        loggerinfo(
            logger,
            "Start Fitting Prophet Model with params : {}".format(self.params))

        # Get temporary folders for multi process communication
        tmp_folder = self._create_tmp_folder(logger)

        n_jobs = self._get_n_jobs(logger, **kwargs)

        # Convert to pandas
        XX = X[:, self.tgc].to_pandas()
        XX = XX.replace([None, np.nan], 0)
        XX.rename(columns={self.time_column: "ds"}, inplace=True)
        # Make target available in the Frame
        XX['y'] = np.array(y)
        # Set target prior (same value as already assigned above)
        self.nan_value = np.mean(y)

        # Group the input by TGC (Time group column) excluding the time column itself
        tgc_wo_time = list(np.setdiff1d(self.tgc, self.time_column))
        if len(tgc_wo_time) > 0:
            XX_grp = XX.groupby(tgc_wo_time)
        else:
            # No grouping column: the whole frame is a single group
            XX_grp = [([None], XX)]

        self.models = {}
        self.priors = {}

        # Prepare for multi processing
        num_tasks = len(XX_grp)

        def processor(out, res):
            # res is the (grp_hash, model_path) pair returned by the worker
            out[res[0]] = res[1]

        pool_to_use = small_job_pool
        loggerdebug(logger,
                    "Prophet will use {} workers for fitting".format(n_jobs))
        pool = pool_to_use(logger=None,
                           processor=processor,
                           num_tasks=num_tasks,
                           max_workers=n_jobs)

        # Fit 1 FB Prophet model per time group columns
        nb_groups = len(XX_grp)
        for _i_g, (key, X) in enumerate(XX_grp):
            # Just log where we are in the fitting process
            if (_i_g + 1) % max(1, nb_groups // 20) == 0:
                loggerinfo(
                    logger, "FB Prophet : %d%% of groups fitted" %
                    (100 * (_i_g + 1) // nb_groups))

            # Store the group's data in a file the worker can load
            X_path = os.path.join(tmp_folder,
                                  "fbprophet_X" + str(uuid.uuid4()))
            X = X.reset_index(drop=True)
            save_obj(X, X_path)
            # Build the group identifier from the group key
            key = key if isinstance(key, list) else [key]
            grp_hash = '_'.join(map(str, key))

            # Per-group target mean
            self.priors[grp_hash] = X['y'].mean()

            args = (X_path, grp_hash, tmp_folder, self.params, self.cap)
            kwargs = {}
            pool.submit_tryget(None,
                               MyParallelProphetTransformer_fit_async,
                               args=args,
                               kwargs=kwargs,
                               out=self.models)
        pool.finish()
        # Swap every stored model path for the model itself
        for k, v in self.models.items():
            self.models[k] = load_obj(v) if v is not None else None
            remove(v)

        self._clean_tmp_folder(logger, tmp_folder)

        return None
Ejemplo n.º 20
0
    def fit(self, X: dt.Frame, y: np.array = None, **kwargs):
        """
        Fits ARIMA models (1 per time group) using historical target values contained in y
        Model fitting is distributed over a pool of processes and uses file storage to share the data with workers
        :param X: Datatable frame containing the features
        :param y: numpy array containing the historical values of the target
        :param kwargs: forwarded to self._get_n_jobs
        :return: self
        """
        # Get the logger if it exists
        logger = None
        if self.context and self.context.experiment_id:
            logger = make_experiment_logger(
                experiment_id=self.context.experiment_id,
                tmp_dir=self.context.tmp_dir,
                experiment_tmp_dir=self.context.experiment_tmp_dir)

        # Temporary folder used to exchange data and models with the workers
        tmp_folder = self._create_tmp_folder(logger)

        n_jobs = self._get_n_jobs(logger, **kwargs)

        # Import the ARIMA python module (not used directly in this method)
        pm = importlib.import_module('pmdarima')
        # Init models
        self.models = {}
        # Convert to pandas
        X = X.to_pandas()
        XX = X[self.tgc].copy()
        XX['y'] = np.array(y)
        self.nan_value = np.mean(y)
        self.ntrain = X.shape[0]

        # Group the input by TGC (Time group column) excluding the time column itself
        tgc_wo_time = list(np.setdiff1d(self.tgc, self.time_column))
        if len(tgc_wo_time) > 0:
            XX_grp = XX.groupby(tgc_wo_time)
        else:
            # No grouping column: the whole frame is a single group
            XX_grp = [([None], XX)]

        # Prepare for multi processing
        num_tasks = len(XX_grp)

        def processor(out, res):
            # res is the (grp_hash, model_path) pair returned by the worker
            out[res[0]] = res[1]

        pool_to_use = small_job_pool
        loggerinfo(
            logger,
            "Arima will use {} workers for parallel processing".format(n_jobs))
        pool = pool_to_use(logger=None,
                           processor=processor,
                           num_tasks=num_tasks,
                           max_workers=n_jobs)

        # Build 1 ARIMA model per time group columns
        nb_groups = len(XX_grp)
        for _i_g, (key, X) in enumerate(XX_grp):
            # Just say where we are in the fitting process
            if (_i_g + 1) % max(1, nb_groups // 20) == 0:
                loggerinfo(
                    logger, "Auto ARIMA : %d%% of groups fitted" %
                    (100 * (_i_g + 1) // nb_groups))

            X_path = os.path.join(tmp_folder,
                                  "autoarima_X" + str(uuid.uuid4()))
            X = X.reset_index(drop=True)
            save_obj(X, X_path)
            key = key if isinstance(key, list) else [key]
            grp_hash = '_'.join(map(str, key))
            args = (X_path, grp_hash, self.time_column, tmp_folder)
            kwargs = {}
            pool.submit_tryget(None,
                               MyParallelAutoArimaTransformer_fit_async,
                               args=args,
                               kwargs=kwargs,
                               out=self.models)
        pool.finish()

        # Swap every stored model path for the model itself
        for k, v in self.models.items():
            self.models[k] = load_obj(v) if v is not None else None
            remove(v)

        self._clean_tmp_folder(logger, tmp_folder)

        return self
Ejemplo n.º 21
0
    def transform(self, X: dt.Frame, **kwargs):
        """
        Predict the target with the fitted ARIMA models (1 per time group).

        If self.is_train exists we are doing in-sample predictions, otherwise
        ARIMA is used to predict the future; that flag is simply forwarded to
        the worker function. Every group with a fitted model is dumped to the
        temporary folder and handed to a worker pool; groups without a model
        bypass the pool and receive a constant ``self.nan_value`` prediction.

        :param X: Datatable Frame containing the features
        :return: pandas DataFrame with a single 'yhat' column, sorted by index
        """
        # The experiment logger only exists when running inside an experiment
        logger = None
        if self.context and self.context.experiment_id:
            logger = make_experiment_logger(
                experiment_id=self.context.experiment_id,
                tmp_dir=self.context.tmp_dir,
                experiment_tmp_dir=self.context.experiment_tmp_dir)

        tmp_folder = self._create_tmp_folder(logger)
        n_jobs = self._get_n_jobs(logger, **kwargs)

        # Only the time group columns are needed to drive the predictions
        tgc_frame = X.to_pandas()[self.tgc].copy()
        group_cols = list(np.setdiff1d(self.tgc, self.time_column))
        if group_cols:
            groups = tgc_frame.groupby(group_cols)
        else:
            # No grouping column: the whole frame is one single group
            groups = [([None], tgc_frame)]

        nb_groups = len(groups)
        assert nb_groups > 0

        def processor(out, res):
            # Collect the path of each prediction file returned by a worker
            out.append(res)

        loggerinfo(logger,
                   "Arima will use {} workers for transform".format(n_jobs))
        pool = small_job_pool(logger=None,
                              processor=processor,
                              num_tasks=nb_groups,
                              max_workers=n_jobs)

        prediction_paths = []
        model_paths = []
        log_every = max(1, nb_groups // 20)
        for position, (key, grp_frame) in enumerate(groups, start=1):
            # Report progress roughly every 5% of the groups
            if position % log_every == 0:
                loggerinfo(
                    logger, "Auto ARIMA : %d%% of groups transformed" %
                    (100 * position // nb_groups))

            # Same key construction as used to store the fitted models
            if not isinstance(key, list):
                key = [key]
            grp_hash = '_'.join(str(part) for part in key)

            # File used either for the worker input or for the NaN filler
            X_path = os.path.join(tmp_folder,
                                  "autoarima_Xt" + str(uuid.uuid4()))

            if grp_hash not in self.models:
                # Unseen group: no pool round trip, just a constant prediction
                # that keeps the original row index
                filler = pd.DataFrame(
                    np.full((grp_frame.shape[0], 1), self.nan_value),
                    columns=['yhat'],
                    index=grp_frame.index)
                save_obj(filler, X_path)
                prediction_paths.append(X_path)
                continue

            # Known group: share both model and data with the worker via files
            model_path = os.path.join(
                tmp_folder, "autoarima_modelt" + str(uuid.uuid4()))
            save_obj(self.models[grp_hash], model_path)
            save_obj(grp_frame, X_path)
            model_paths.append(model_path)

            pool.submit_tryget(
                None,
                MyParallelAutoArimaTransformer_transform_async,
                args=(model_path, X_path, self.nan_value,
                      hasattr(self, 'is_train'), self.time_column,
                      self.pred_gap, tmp_folder),
                kwargs={},
                out=prediction_paths)
        pool.finish()

        # Stitch the per-group predictions back together in original row order
        predictions = pd.concat(
            [load_obj(path) for path in prediction_paths],
            axis=0).sort_index()
        for path in prediction_paths + model_paths:
            remove(path)

        self._clean_tmp_folder(logger, tmp_folder)

        return predictions
Ejemplo n.º 22
0
    def fit(self,
            X,
            y,
            sample_weight=None,
            eval_set=None,
            sample_weight_eval_set=None,
            **kwargs):
        """
        Fit a CatBoost classifier or regressor and register it on the instance.

        :param X: training features, either a datatable Frame or an object
            exposing ``columns`` / ``select_dtypes`` (pandas-like)
        :param y: training target; label encoded here when num_classes >= 2
        :param sample_weight: optional per-row training weights
        :param eval_set: optional list whose first item is (valid_X, valid_y)
        :param sample_weight_eval_set: accepted for interface compatibility,
            not used by this method
        :param kwargs: extra options; 'task' is used for GUI notifications and
            everything is forwarded to transcribe_params
        :raises IgnoreEntirelyError: when CatBoost reports that all features
            are either constant or ignored
        """
        logger = None
        if self._make_logger:
            # Example use of logger, with required import of:
            #  from h2oaicore.systemutils import make_experiment_logger, loggerinfo
            # Can use loggerwarning, loggererror, etc. for different levels
            if self.context and self.context.experiment_id:
                logger = make_experiment_logger(
                    experiment_id=self.context.experiment_id,
                    tmp_dir=self.context.tmp_dir,
                    experiment_tmp_dir=self.context.experiment_tmp_dir)

        if self._show_logger_test:
            loggerinfo(logger, "TestLOGGER: Fit CatBoost")

        if self._show_task_test:
            # Example task sync operations.
            # The attribute checked must be the one that is written below,
            # otherwise the counter is reset to 0 on every single fit.
            if hasattr(self, 'test_count'):
                self.test_count += 1
            else:
                self.test_count = 0

            # The below generates a message in the GUI notifications panel
            if self.test_count == 0 and self.context and self.context.experiment_id:
                warning = "TestWarning: First CatBoost fit for this model instance"
                loggerwarning(logger, warning)
                task = kwargs.get('task')
                if task:
                    task.sync(key=self.context.experiment_id,
                              progress=dict(type='warning', data=warning))
                    task.flush()

            # The below generates a message in the GUI top-middle panel above the progress wheel
            if self.test_count == 0 and self.context and self.context.experiment_id:
                message = "Tuning CatBoost"
                loggerinfo(logger, message)
                task = kwargs.get('task')
                if task:
                    task.sync(key=self.context.experiment_id,
                              progress=dict(type='update', message=message))
                    task.flush()

        from catboost import CatBoostClassifier, CatBoostRegressor, EFstrType

        # label encode target and setup type of problem
        lb = LabelEncoder()
        if self.num_classes >= 2:
            lb.fit(self.labels)
            y = lb.transform(y)
            if eval_set is not None:
                valid_X = eval_set[0][0]
                valid_y = eval_set[0][1]
                valid_y = lb.transform(valid_y)
                eval_set = [(valid_X, valid_y)]
            self.params.update({'objective': 'Logloss'})
        if self.num_classes > 2:
            self.params.update({'objective': 'MultiClass'})

        if isinstance(X, dt.Frame):
            orig_cols = list(X.names)
            numeric_cols = list(X[:, [bool, int, float]].names)
        else:
            orig_cols = list(X.columns)
            numeric_cols = list(X.select_dtypes([np.number]).columns)

        # unlike lightgbm that needs label encoded categoricals, catboost can take raw strings etc.
        self.params['cat_features'] = [
            i for i, x in enumerate(orig_cols)
            if 'CatOrig:' in x or 'Cat:' in x or x not in numeric_cols
        ]

        if not self.get_uses_gpus(self.params):
            # monotonicity constraints not available for GPU for catboost
            # get names of columns in same order
            X_names = list(dt.Frame(X).names)
            X_numeric = self.get_X_ordered_numerics(X)
            X_numeric_names = list(X_numeric.names)
            # NOTE(review): the original line used a comma instead of '=' and
            # therefore never bound `constraints`. This assumes the helper
            # returns a 3-tuple whose last item holds one constraint per
            # numeric column - confirm against set_monotone_constraints.
            _, _, constraints = self.set_monotone_constraints(X=X_numeric,
                                                              y=y)
            # if non-numerics, then fix those to have 0 constraint
            self.params['monotone_constraints'] = [0] * len(X_names)
            colnumi = 0
            # coli is the position among all columns, colnumi the position
            # among the numeric columns only
            for coli, col_name in enumerate(X_names):
                if col_name in X_numeric_names:
                    self.params['monotone_constraints'][coli] = constraints[
                        colnumi]
                    colnumi += 1

        if isinstance(X, dt.Frame) and len(self.params['cat_features']) == 0:
            # dt -> catboost internally using buffer leaks, so convert here
            # assume predict is after pipeline collection or in subprocess so needs no protection
            float_dtype = (np.float32 if config.data_precision == "float32"
                           else np.float64)
            # only the local name X is rebound, the caller's frame is untouched
            X = np.ascontiguousarray(X.to_numpy(), dtype=float_dtype)
            if eval_set is not None:
                valid_X = np.ascontiguousarray(eval_set[0][0].to_numpy(),
                                               dtype=float_dtype)
                valid_y = eval_set[0][1]
                eval_set = [(valid_X, valid_y)]

        if eval_set is not None:
            valid_X_shape = eval_set[0][0].shape
        else:
            valid_X_shape = None

        X, eval_set = self.process_cats(X, eval_set, orig_cols)

        # modify self.params_base['gpu_id'] based upon actually-available GPU based upon training and valid shapes
        self.acquire_gpus_function(train_shape=X.shape,
                                   valid_shape=valid_X_shape)

        params = copy.deepcopy(
            self.params
        )  # keep separate, since then can be pulled from lightgbm params
        params = self.transcribe_params(params=params, **kwargs)

        if logger is not None:
            loggerdata(
                logger,
                "CatBoost parameters: params_base : %s params: %s catboost_params: %s"
                % (str(self.params_base), str(self.params), str(params)))

        if self.num_classes == 1:
            self.model = CatBoostRegressor(**params)
        else:
            self.model = CatBoostClassifier(**params)
        # Hit sometimes: Exception: catboost/libs/data_new/quantization.cpp:779: All features are either constant or ignored.
        # No baseline is passed for either problem type; a mean baseline for
        # regression ([np.mean(y)] * len(y)) was considered but left disabled.
        baseline = None

        kwargs_fit = dict(baseline=baseline, eval_set=eval_set)
        pickle_path = None
        if config.debug_daimodel_level >= 2:
            self.uuid = str(uuid.uuid4())[:6]
            pickle_path = os.path.join(exp_dir(),
                                       "catboost%s.tmp.pickle" % self.uuid)
            save_obj((self.model, X, y, sample_weight, kwargs_fit),
                     pickle_path)

        # FIT (with migration safety before hyperopt/Optuna function added)
        try:
            if hasattr(self, 'dask_or_hyper_or_normal_fit'):
                self.dask_or_hyper_or_normal_fit(X,
                                                 y,
                                                 sample_weight=sample_weight,
                                                 kwargs=kwargs,
                                                 **kwargs_fit)
            else:
                self.model.fit(X, y, sample_weight=sample_weight, **kwargs_fit)
        except Exception as e:
            if "All features are either constant or ignored" in str(e):
                raise IgnoreEntirelyError(str(e))
            raise

        # Successful fit: drop the debug pickle unless the debug level asks to
        # keep it. Nothing to remove when no pickle was written.
        if pickle_path is not None and config.debug_daimodel_level <= 2:
            remove(pickle_path)

        # https://catboost.ai/docs/concepts/python-reference_catboostclassifier.html
        # need to move to wrapper
        if self.model.get_best_iteration() is not None:
            iterations = self.model.get_best_iteration() + 1
        else:
            iterations = self.params['n_estimators']
        # must always set best_iterations
        self.model_path = None
        importances = copy.deepcopy(self.model.feature_importances_)
        if not self._save_by_pickle:
            # Persist through CatBoost's own format and keep the raw bytes
            self.uuid = str(uuid.uuid4())[:6]
            model_file = "catboost_%s.bin" % str(self.uuid)
            self.model_path = os.path.join(self.context.experiment_tmp_dir,
                                           model_file)
            self.model.save_model(self.model_path)
            with open(self.model_path, mode='rb') as f:
                model = f.read()
        else:
            model = self.model
        self.set_model_properties(
            model=
            model,  # overwrites self.model object with bytes if not using pickle
            features=orig_cols,
            importances=importances,
            iterations=iterations)
Ejemplo n.º 23
0
    def transform(self, X: dt.Frame, **kwargs):
        """
        Build Prophet based features from the fitted models.

        Two features can be produced: the prediction of the general model
        (fitted on scaled targets) mapped back through each group's scaler,
        and - when top groups were fitted - the prediction of the dedicated
        per-group models, NaN for every other group.

        :param X: Datatable Frame containing the features
        :return: pandas DataFrame holding the FB Prophet predictions
        """
        # The experiment logger only exists when running inside an experiment
        logger = None
        if self.context and self.context.experiment_id:
            logger = make_experiment_logger(
                experiment_id=self.context.experiment_id,
                tmp_dir=self.context.tmp_dir,
                experiment_tmp_dir=self.context.experiment_tmp_dir
            )

        tmp_folder = self._create_tmp_folder(logger)
        n_jobs = self._get_n_jobs(logger, **kwargs)

        # Keep the time group columns only, with missing values set to 0
        group_cols = list(np.setdiff1d(self.tgc, self.time_column))
        X = X[:, self.tgc].to_pandas()
        X = X.replace([None, np.nan], 0)

        # Prophet expects the date column to be called "ds"
        X = X.rename(columns={self.time_column: "ds"})

        # The general model only needs each distinct date once
        X_time = X[['ds']].groupby('ds').first().reset_index()
        with suppress_stdout_stderr():
            y_avg = self.model.predict(X_time)[['ds', 'yhat']]

        # Prophet converts the date column to datetime, so the predictions
        # are attached by position to the date-sorted frame
        X_time = X_time.sort_values('ds')
        X_time['yhat'] = y_avg['yhat']
        X_time = X_time.sort_index()

        # pd.merge resets the index, so remember it and put it back afterwards
        original_index = X.index
        X = pd.merge(
            left=X,
            right=X_time[['ds', 'yhat']],
            on='ds',
            how='left'
        )
        X.index = original_index

        if group_cols:
            X_groups = X.groupby(group_cols)
        else:
            # No grouping column: the whole frame is one single group
            X_groups = [([None], X)]

        # Undo the target scaling: known groups use their own scaler, unknown
        # groups fall back to the general one
        unscaled_parts = []
        for key, X_grp in X_groups:
            grp_hash = self.get_hash(key)
            scaler = (self.scalers[grp_hash]
                      if grp_hash in self.scalers else self.general_scaler)
            unscaled_parts.append(
                pd.DataFrame(scaler.inverse_transform(X_grp[['yhat']]),
                             columns=['yhat'],
                             index=X_grp.index))

        XX_general = pd.concat(unscaled_parts, axis=0).sort_index()

        if self.top_groups:
            # Only groups that got a dedicated model are sent to the pool
            prediction_paths = []
            model_paths = []

            def processor(out, res):
                out.append(res)

            pool = small_job_pool(logger=None, processor=processor,
                                  num_tasks=len(self.top_groups),
                                  max_workers=n_jobs)

            nb_groups = len(X_groups)
            log_every = max(1, nb_groups // 20)
            for position, (key, X_grp) in enumerate(X_groups, start=1):
                # Report progress roughly every 5% of the groups
                if position % log_every == 0:
                    loggerinfo(logger, "FB Prophet : %d%% of groups predicted" % (100 * position // nb_groups))

                grp_hash = self.get_hash(key)
                X_path = os.path.join(tmp_folder, "fbprophet_Xt" + str(uuid.uuid4()))

                # Groups outside the top ones, or whose model is None, get a
                # NaN column aligned on the group's rows
                if grp_hash not in self.top_groups or self.grp_models[grp_hash] is None:
                    nan_frame = pd.DataFrame(np.full((X_grp.shape[0], 1), np.nan),
                                             columns=['yhat'],
                                             index=X_grp.index)
                    save_obj(nan_frame, X_path)
                    prediction_paths.append(X_path)
                    continue

                # Share model and data with the worker through files
                model_path = os.path.join(tmp_folder, "fbprophet_modelt" + str(uuid.uuid4()))
                save_obj(self.grp_models[grp_hash], model_path)
                save_obj(X_grp, X_path)
                model_paths.append(model_path)

                pool.submit_tryget(None, MyParallelProphetTransformer_transform_async,
                                   args=(model_path, X_path, self.priors[grp_hash], tmp_folder),
                                   kwargs={},
                                   out=prediction_paths)

            pool.finish()
            XX_top_groups = pd.concat([load_obj(path) for path in prediction_paths], axis=0).sort_index()
            for path in prediction_paths + model_paths:
                remove(path)

        self._clean_tmp_folder(logger, tmp_folder)

        features_df = pd.DataFrame()
        features_df[self.display_name + '_GrpAvg'] = XX_general['yhat']
        if self.top_groups:
            features_df[self.display_name + f'_Top{self.top_n}Grp'] = XX_top_groups['yhat']

        # Names and descriptions are both the plain column names
        self._output_feature_names = list(features_df.columns)
        self._feature_desc = list(features_df.columns)

        return features_df
Ejemplo n.º 24
0
    def predict(self, X, y=None, **kwargs):
        """
        Predict with the fitted CatBoost model.

        :param X: features to score (datatable Frame or array-like)
        :param y: optional target, only used as the Pool label when computing
            Shapley contributions
        :param kwargs: recognised options:
            - pred_contribs: return Shapley values instead of predictions
            - output_margin: return raw (RawFormulaVal) predictions
            - fast_approx: cap the number of trees at
              config.fast_approx_num_trees and use approximate Shapley values
        :return: numpy array of predictions; for classification with exactly
            two output columns only the second column is returned; for
            multiclass Shapley values the class and feature axes are flattened
        """
        # Single import for everything needed below
        from catboost import CatBoostClassifier, CatBoostRegressor, EFstrType, Pool

        model, features, importances, iterations = self.get_model_properties()
        if not self._save_by_pickle:
            # The stored model is raw bytes: write it back to disk and load it
            if self.num_classes >= 2:
                from_file = CatBoostClassifier()
            else:
                from_file = CatBoostRegressor()
            with open(self.model_path, mode='wb') as f:
                f.write(model)
            model = from_file.load_model(self.model_path)

        # FIXME: Do equivalent throttling of predict size like def _predict_internal(self, X, **kwargs), wrap-up.
        if isinstance(X, dt.Frame) and len(self.params['cat_features']) == 0:
            # dt -> catboost internally using buffer leaks, so convert here
            # assume predict is after pipeline collection or in subprocess so needs no protection
            # only the local name X is rebound, the caller's frame is untouched
            X = np.ascontiguousarray(X.to_numpy(),
                                     dtype=np.float32 if config.data_precision
                                     == "float32" else np.float64)

        X, _ = self.process_cats(X, None, self.feature_names_fitted)

        pred_contribs = kwargs.get('pred_contribs', False)
        output_margin = kwargs.get('output_margin', False)
        fast_approx = kwargs.pop('fast_approx', False)
        if fast_approx:
            iterations = min(config.fast_approx_num_trees, iterations)

        # -1 is not supported as thread_count, so default to the physical cores
        n_jobs = max(1, physical_cores_count)
        thread_count = self.params_base.get('n_jobs', n_jobs)

        if not pred_contribs and not output_margin:
            if self.num_classes >= 2:
                preds = model.predict_proba(
                    X,
                    ntree_start=0,
                    ntree_end=iterations,  # index of first tree *not* to be used
                    thread_count=thread_count,
                )

                if preds.shape[1] == 2:
                    return preds[:, 1]
                else:
                    return preds
            else:
                return model.predict(
                    X,
                    ntree_start=0,
                    ntree_end=iterations,  # index of first tree *not* to be used
                    thread_count=thread_count,
                )
        elif output_margin:
            # uses "predict" for raw for any class
            preds = model.predict(
                X,
                prediction_type="RawFormulaVal",
                ntree_start=0,
                ntree_end=iterations,  # index of first tree *not* to be used
                thread_count=thread_count,
            )
            if len(preds.shape
                   ) > 1 and preds.shape[1] == 2 and self.num_classes == 2:
                return preds[:, 1]
            else:
                return preds
        elif pred_contribs:
            # For Shapley, doesn't come from predict
            # For regression/binary, shap is shape of (rows, features + bias)
            # for multiclass, shap is shape of (rows, classes, features + bias)
            data = Pool(X, label=y, cat_features=self.params['cat_features'])
            if fast_approx:
                # https://github.com/catboost/catboost/issues/1146
                # https://github.com/catboost/catboost/issues/1535
                # can't specify trees, but they have approx version
                # Regular, Exact, or Approximate
                shap_calc_type = "Approximate"
            else:
                shap_calc_type = "Regular"
            # See also shap_mode
            # help(CatBoostClassifier.get_feature_importance)
            print_debug("shap_calc_type: %s" % shap_calc_type)

            pickle_path = None
            if config.debug_daimodel_level >= 2:
                self.uuid = str(uuid.uuid4())[:6]
                pickle_path = os.path.join(
                    exp_dir(), "catboost_shappredict%s.tmp.pickle" % self.uuid)
                model.save_model(
                    os.path.join(exp_dir(), "catshapproblem%s.catboost.model" %
                                 self.uuid))
                # save_obj((self, self.model, model, X, y, kwargs, shap_calc_type, self.params['cat_features']), pickle_path)
                save_obj((model, X, y, kwargs, shap_calc_type,
                          self.params['cat_features']), pickle_path)

            preds_shap = model.get_feature_importance(
                data=data,
                thread_count=thread_count,
                type=EFstrType.ShapValues,
                shap_calc_type=shap_calc_type,
            )
            # repair broken shap sum: https://github.com/catboost/catboost/issues/1125
            print_debug("shap_fix")
            preds_raw = model.predict(
                X,
                prediction_type="RawFormulaVal",
                ntree_start=0,
                ntree_end=iterations,  # index of first tree *not* to be used
                thread_count=thread_count,
            )
            # Last axis of the shap array holds features + bias
            if self.num_classes <= 2:
                axis = 1
            else:
                axis = 2
            orig_sum = np.sum(preds_shap, axis=axis)
            print_debug("shap_fix2")
            # avoid division by 0, need different trick, e.g. change baseline, to fix that case
            if axis == 1:
                orig_sum[orig_sum[:] == 0.0] = 1.0
                # rescale so that each row sums to the raw prediction
                preds_shap = preds_shap * preds_raw[:, None] / orig_sum[:,
                                                                        None]
            else:
                # each feature and each class must sum up
                orig_sum[orig_sum[:, :] == 0.0] = 1.0
                preds_shap = preds_shap * preds_raw[:, :,
                                                    None] / orig_sum[:, :,
                                                                     None]

            if config.hard_asserts and config.debug_daimodel_level >= 2:
                print_debug("shap_check")
                model.save_model(os.path.join(exp_dir(), "catshapproblem"))
                # Context manager so the dump file is flushed and closed
                with open(os.path.join(exp_dir(), "catshapproblem.pkl"),
                          "wb") as dump_file:
                    pickle.dump((X, y, self.params['cat_features']),
                                dump_file)
                preds_raw = model.predict(
                    X,
                    prediction_type="RawFormulaVal",
                    ntree_start=0,
                    ntree_end=iterations,  # index of first tree *not* to be used
                    thread_count=thread_count,
                )

                assert np.isclose(preds_raw, np.sum(
                    preds_shap, axis=axis)).all(
                    ), "catboost shapley does not sum up correctly"

            # Drop the debug pickle unless the debug level asks to keep it.
            # Nothing to remove when no pickle was written.
            if pickle_path is not None and config.debug_daimodel_level <= 2:
                remove(pickle_path)

            if axis == 1:
                return preds_shap
            else:
                # DAI expects (shape rows) * (classes x (features + 1)) with "columns" as blocks of
                # feature_0_class_0 feature_0_class_0 ... feature_0_class_1 feature_1_class_1 ...
                return preds_shap.reshape(
                    preds_shap.shape[0],
                    preds_shap.shape[1] * preds_shap.shape[2])
        else:
            raise RuntimeError("No such case")
Ejemplo n.º 25
0
    def fit(self, X: dt.Frame, y: np.array = None, **kwargs):
        """
        Fits one general FB Prophet model plus one model per top group.

        The general model is fitted on the per-date average of the targets,
        each group's target being min-max scaled first. The ``top_n`` groups
        with the most rows (``prophet_top_n`` in config.recipe_dict, default
        50) additionally get their own model, fitted on unscaled targets.
        Per-group fitting is distributed over a pool of processes and uses
        file storage to share the data with workers.

        :param X: Datatable frame containing the features
        :param y: numpy array containing the historical values of the target
        :return: self
        """

        # Get the logger if it exists
        logger = None
        if self.context and self.context.experiment_id:
            logger = make_experiment_logger(
                experiment_id=self.context.experiment_id,
                tmp_dir=self.context.tmp_dir,
                experiment_tmp_dir=self.context.experiment_tmp_dir
            )

        try:
            # Add value of prophet_top_n in recipe_dict variable inside of config.toml file
            # eg1: recipe_dict="{'prophet_top_n': 200}"
            # eg2: recipe_dict="{'prophet_top_n':10}"
            self.top_n = config.recipe_dict['prophet_top_n']
        except KeyError:
            self.top_n = 50

        loggerinfo(logger, f"Prophet will use {self.top_n} groups as well as average target data.")

        tmp_folder = self._create_tmp_folder(logger)

        n_jobs = self._get_n_jobs(logger, **kwargs)

        # Reduce X to TGC
        tgc_wo_time = list(np.setdiff1d(self.tgc, self.time_column))
        X = X[:, self.tgc].to_pandas()

        # Fill NaNs or None
        X = X.replace([None, np.nan], 0)

        # Add target, Label encoder is only used for Classif. which we don't support...
        if self.labels is not None:
            y = LabelEncoder().fit(self.labels).transform(y)
        X['y'] = np.array(y)

        self.nan_value = X['y'].mean()

        # Change date feature name to match Prophet requirements
        X.rename(columns={self.time_column: "ds"}, inplace=True)

        # Create a general scale now that will be used for unknown groups at prediction time
        # Can we do smarter than that ?
        self.general_scaler = MinMaxScaler().fit(X[['y', 'ds']].groupby('ds').median().values)

        # Go through groups and min-max scale them
        if len(tgc_wo_time) > 0:
            X_groups = X.groupby(tgc_wo_time)
        else:
            X_groups = [([None], X)]

        self.scalers = {}
        scaled_ys = []
        print(f'{datetime.now()} Start of group scaling')

        for key, X_grp in X_groups:
            # Create dict key to store the min max scaler
            grp_hash = self.get_hash(key)
            # Scale target for current group
            self.scalers[grp_hash] = MinMaxScaler()
            y_skl = self.scalers[grp_hash].fit_transform(X_grp[['y']].values)
            # Put back in a DataFrame to keep track of original index
            y_skl_df = pd.DataFrame(y_skl, columns=['y'])
            y_skl_df.index = X_grp.index
            scaled_ys.append(y_skl_df)

        print(f'{datetime.now()} End of group scaling')
        # Set scaled target in the frame but keep the original values aside
        X['y_orig'] = X['y']
        X['y'] = pd.concat(tuple(scaled_ys), axis=0)

        # Now Average groups: one scaled target value per date
        X_avg = X[['ds', 'y']].groupby('ds').mean().reset_index()

        # Send that to Prophet
        # Same settings are used for the general model and the group models
        params = {
            "country_holidays": self.country_holidays,
            "monthly_seasonality": self.monthly_seasonality
        }
        mod = importlib.import_module('fbprophet')
        Prophet = getattr(mod, "Prophet")
        self.model = Prophet(yearly_seasonality=True, weekly_seasonality=True, daily_seasonality=True)

        if params["country_holidays"] is not None:
            self.model.add_country_holidays(country_name=params["country_holidays"])
        if params["monthly_seasonality"]:
            self.model.add_seasonality(name='monthly', period=30.5, fourier_order=5)

        with suppress_stdout_stderr():
            # Fit on the per-date averages computed above rather than on every
            # row of every group
            self.model.fit(X_avg[['ds', 'y']])

        print(f'{datetime.now()} General Model Fitted')

        # Top groups are the top_n groups having the largest number of rows
        self.top_groups = None
        if len(tgc_wo_time) > 0:
            if self.top_n > 0:
                top_n_grp = X.groupby(tgc_wo_time).size().sort_values().reset_index()[tgc_wo_time].iloc[-self.top_n:].values
                self.top_groups = [
                    '_'.join(map(str, key))
                    for key in top_n_grp
                ]

        if self.top_groups:
            self.grp_models = {}
            self.priors = {}

            # Prepare for multi processing
            num_tasks = len(self.top_groups)

            def processor(out, res):
                # Workers return (group hash, path to the pickled model)
                out[res[0]] = res[1]

            pool_to_use = small_job_pool
            loggerinfo(logger, f"Prophet will use {n_jobs} workers for fitting.")
            loggerinfo(logger, "Prophet parameters holidays {} / monthly {}".format(self.country_holidays, self.monthly_seasonality))
            pool = pool_to_use(
                logger=None, processor=processor,
                num_tasks=num_tasks, max_workers=n_jobs
            )

            # Put y back to its unscaled value for top groups
            X['y'] = X['y_orig']

            # Group again now that y is unscaled. top_groups is only set when
            # there is at least one grouping column, so groupby is always valid
            top_X_groups = X.groupby(tgc_wo_time)
            nb_groups = len(top_X_groups)
            top_group_hashes = set(self.top_groups)

            # Fit 1 FB Prophet model per top group
            for _i_g, (key, X_grp) in enumerate(top_X_groups):
                # Just log where we are in the fitting process
                if (_i_g + 1) % max(1, nb_groups // 20) == 0:
                    loggerinfo(logger, "FB Prophet : %d%% of groups fitted" % (100 * (_i_g + 1) // nb_groups))

                grp_hash = self.get_hash(key)

                # Skip before touching the disk: only top groups are fitted
                if grp_hash not in top_group_hashes:
                    continue

                X_path = os.path.join(tmp_folder, "fbprophet_X" + str(uuid.uuid4()))
                X_grp = X_grp.reset_index(drop=True)
                save_obj(X_grp, X_path)

                # Mean of the group target, passed to the worker at transform time
                self.priors[grp_hash] = X_grp['y'].mean()

                args = (X_path, grp_hash, tmp_folder, params)
                pool.submit_tryget(None, MyParallelProphetTransformer_fit_async,
                                   args=args, kwargs={}, out=self.grp_models)
            pool.finish()

            # Replace the model paths returned by the workers with the models
            for k, v in self.grp_models.items():
                self.grp_models[k] = load_obj(v) if v is not None else None
                remove(v)

        self._clean_tmp_folder(logger, tmp_folder)

        return self
Ejemplo n.º 26
0
    def transform(self, X: dt.Frame, **kwargs):
        """
        Predict the target with the models fitted for each time group column.

        For every time group column (the time column excluded) the stored
        per-group models are applied through a worker pool. Groups without a
        usable model receive NaN, which is then replaced by the prediction of
        the average model. Finally every prediction column is passed through
        the inverse transform of the matching target scaler.

        :param X: Datatable Frame containing the features
        :return: prediction frame with 'average_pred' and one
            '<group column>_pred' column per time group column
        """
        # Experiment logger, when one exists
        logger = self.get_experiment_logger()

        # Scratch folder used to exchange pickled objects with the workers
        tmp_folder = self._create_tmp_folder(logger)

        n_jobs = self._get_n_jobs(logger, **kwargs)

        # Time group columns without the time column itself
        tgc_wo_time = list(np.setdiff1d(self.tgc, self.time_column))

        # Rename the date feature the way Prophet expects it
        X = self.convert_to_prophet(X)

        y_predictions = self.predict_with_average_model(X, tgc_wo_time)
        y_predictions.columns = ['average_pred']

        def collect_path(out, res):
            # Pool callback: keep whatever the worker returned
            out.append(res)

        # One pass per time group column
        for grp_col in tgc_wo_time:
            # Dates to predict, split by the values of the current column
            date_groups = X[['ds', grp_col]].groupby(grp_col)

            prediction_paths = []
            pickled_model_paths = []

            n_groups = len(date_groups)
            pool = small_job_pool(logger=None,
                                  processor=collect_path,
                                  num_tasks=n_groups,
                                  max_workers=n_jobs)

            for position, (key, X_grp) in enumerate(date_groups, start=1):

                # Progress report, roughly every 5% of the groups
                if position % max(1, n_groups // 20) == 0:
                    loggerinfo(
                        logger, "FB Prophet : %d%% of groups predicted" %
                        (100 * position // n_groups))

                # Key under which model and prior of this group are stored
                grp_hash = self.get_hash(key)
                X_path = os.path.join(tmp_folder,
                                      "fbprophet_Xt" + str(uuid.uuid4()))

                models_for_col = self.grp_models[grp_col]
                if (grp_hash not in models_for_col
                        or models_for_col[grp_hash] is None):
                    # Unseen group, or known group stored without a model:
                    # write a NaN column that keeps the original row index
                    placeholder = pd.DataFrame(
                        np.full((X_grp.shape[0], 1), np.nan),
                        columns=['yhat'])
                    placeholder.index = X_grp.index
                    save_obj(placeholder, X_path)
                    prediction_paths.append(X_path)
                    continue

                # Hand model and data over to a worker through files
                model_path = os.path.join(
                    tmp_folder, "fbprophet_modelt" + str(uuid.uuid4()))
                save_obj(models_for_col[grp_hash], model_path)
                save_obj(X_grp, X_path)
                pickled_model_paths.append(model_path)

                pool.submit_tryget(
                    None,
                    MyProphetOnSingleGroupsTransformer_transform_async,
                    args=(model_path, X_path,
                          self.priors[grp_col][grp_hash], tmp_folder),
                    kwargs={},
                    out=prediction_paths)

            pool.finish()

            # Stitch the per-group results back together in row order
            loaded = [load_obj(path) for path in prediction_paths]
            y_predictions[f'{grp_col}_pred'] = pd.concat(
                loaded, axis=0).sort_index()
            for path in prediction_paths + pickled_model_paths:
                remove(path)

        # Before inverting the scaling, get rid of the NaNs
        for grp_col in tgc_wo_time:
            # Group values are needed below to pick the right scaler
            y_predictions[grp_col] = X[grp_col]
            pred_col = f'{grp_col}_pred'
            y_predictions[pred_col] = y_predictions[pred_col].fillna(
                y_predictions['average_pred'])

        # Split the predictions by full time group to undo the scaling
        if len(tgc_wo_time) > 0:
            X_groups = y_predictions.groupby(tgc_wo_time)
        else:
            X_groups = [([None], y_predictions)]

        columns_to_invert = [f'{grp_col}_pred' for grp_col in tgc_wo_time]
        columns_to_invert.append('average_pred')
        for _f in columns_to_invert:
            inverted_ys = []
            for key, X_grp in X_groups:
                grp_hash = self.get_hash(key)
                # Group-specific scaler when known, general scaler otherwise
                if grp_hash in self.scalers.keys():
                    scaler = self.scalers[grp_hash]
                else:
                    scaler = self.general_scaler
                inverted_y = scaler.inverse_transform(X_grp[[_f]])

                # Wrap in a DataFrame so the original index is kept
                inverted_df = pd.DataFrame(inverted_y, columns=[_f])
                inverted_df.index = X_grp.index
                inverted_ys.append(inverted_df)
            y_predictions[_f] = pd.concat(tuple(inverted_ys),
                                          axis=0).sort_index()[_f]

        self._clean_tmp_folder(logger, tmp_folder)

        # The group columns were only needed to select the scalers
        y_predictions.drop(tgc_wo_time, axis=1, inplace=True)

        self._output_feature_names = [
            f'{self.display_name}_{_f}' for _f in y_predictions
        ]
        self._feature_desc = [
            f'{self.display_name}_{_f}' for _f in y_predictions
        ]

        return y_predictions
    def fit(self,
            X,
            y,
            sample_weight=None,
            eval_set=None,
            sample_weight_eval_set=None,
            **kwargs):
        """
        Fit a global Prophet model plus one Prophet model per top time group.

        The target is min-max scaled per time group and a Prophet model is
        fitted on the scaled target. The `top_n` largest groups (by row count)
        additionally get a dedicated model, fitted in a worker pool on the
        unscaled target. Models, priors, scalers and the list of top groups
        are stored through `set_model_properties`.

        :param X: frame holding at least the time group columns; it is sliced
            with `X[:, self.tgc]` and converted with `to_pandas()`
        :param y: target values, one per row of X
        :param sample_weight: not used by this method
        :param eval_set: not used by this method
        :param sample_weight_eval_set: not used by this method
        :param kwargs: forwarded to `self._get_n_jobs`
        :return: None
        """

        # Get TGC and time column
        self.tgc = self.params_base.get('tgc', None)
        self.time_column = self.params_base.get('time_column', None)
        self.cap = np.max(
            y
        ) * 1.5  # TODO Don't like this we should compute a cap from average yearly growth
        self.prior = np.mean(y)

        if self.time_column is None:
            self.time_column = self.tgc[0]

        # Get the logger if it exists
        logger = None
        if self.context and self.context.experiment_id:
            logger = make_experiment_logger(
                experiment_id=self.context.experiment_id,
                tmp_dir=self.context.tmp_dir,
                experiment_tmp_dir=self.context.experiment_tmp_dir)

        loggerinfo(
            logger,
            "Start Fitting Prophet Model with params : {}".format(self.params))

        try:
            # Add value of prophet_top_n in recipe_dict variable inside of config.toml file
            # eg1: recipe_dict="{'prophet_top_n': 200}"
            # eg2: recipe_dict="{'prophet_top_n':10}"
            self.top_n = config.recipe_dict['prophet_top_n']
        except KeyError:
            self.top_n = 50

        loggerinfo(
            logger,
            f"Prophet will use {self.top_n} groups as well as average target data."
        )

        # Get temporary folders for multi process communication
        tmp_folder = self._create_tmp_folder(logger)

        n_jobs = self._get_n_jobs(logger, **kwargs)

        # Reduce X to TGC
        tgc_wo_time = list(np.setdiff1d(self.tgc, self.time_column))
        X = X[:, self.tgc].to_pandas()

        # Fill NaNs or None
        X = X.replace([None, np.nan], 0)

        # Add target, Label encoder is only used for Classif. which we don't support...
        if self.labels is not None:
            y = LabelEncoder().fit(self.labels).transform(y)
        X['y'] = np.array(y)

        # Fallback value, computed on the (possibly label-encoded) target
        self.nan_value = X['y'].mean()

        # Change date feature name to match Prophet requirements
        X.rename(columns={self.time_column: "ds"}, inplace=True)

        # Create a general scale now that will be used for unknown groups at prediction time
        # Can we do smarter than that ?
        general_scaler = MinMaxScaler().fit(
            X[['y', 'ds']].groupby('ds').median().values)

        # Go through groups and min-max scale the target of each of them
        if len(tgc_wo_time) > 0:
            X_groups = X.groupby(tgc_wo_time)
        else:
            X_groups = [([None], X)]

        scalers = {}
        scaled_ys = []

        print('Number of groups : ', len(X_groups))
        for g in tgc_wo_time:
            print(f'Number of groups in {g} groups : {X[g].unique().shape}')

        for key, X_grp in X_groups:
            # Create dict key to store the min max scaler
            grp_hash = self.get_hash(key)
            # Scale target for current group
            scalers[grp_hash] = MinMaxScaler()
            y_skl = scalers[grp_hash].fit_transform(X_grp[['y']].values)
            # Put back in a DataFrame to keep track of original index
            y_skl_df = pd.DataFrame(y_skl, columns=['y'])

            y_skl_df.index = X_grp.index
            scaled_ys.append(y_skl_df)

        # Set target back in original frame but keep original
        X['y_orig'] = X['y']
        X['y'] = pd.concat(tuple(scaled_ys), axis=0)

        # NOTE(review): the model below is fitted on every (ds, y) row of the
        # scaled frame, not on per-date averages, although it is stored under
        # the 'avg' key - confirm this is the intended behavior.
        mod = importlib.import_module('fbprophet')
        Prophet = getattr(mod, "Prophet")
        nrows = X[['ds', 'y']].shape[0]
        n_changepoints = max(1, int(nrows * (2 / 3)))
        if n_changepoints < 25:
            # Only override the number of changepoints when it is below 25
            model = Prophet(yearly_seasonality=True,
                            weekly_seasonality=True,
                            daily_seasonality=True,
                            n_changepoints=n_changepoints)
        else:
            model = Prophet(yearly_seasonality=True,
                            weekly_seasonality=True,
                            daily_seasonality=True)

        if self.params["country_holidays"] is not None:
            model.add_country_holidays(
                country_name=self.params["country_holidays"])
        if self.params["monthly_seasonality"]:
            model.add_seasonality(
                name='monthly',
                period=30.5,
                fourier_order=self.params["monthly_seasonality"])
        if self.params["quarterly_seasonality"]:
            model.add_seasonality(
                name='quarterly',
                period=92,
                fourier_order=self.params["quarterly_seasonality"])

        with suppress_stdout_stderr():
            model.fit(X[['ds', 'y']])

        # Keep the top_n groups with the most rows
        top_groups = None
        if len(tgc_wo_time) > 0:
            if self.top_n > 0:
                top_n_grp = X.groupby(tgc_wo_time).size().sort_values(
                ).reset_index()[tgc_wo_time].iloc[-self.top_n:].values
                top_groups = ['_'.join(map(str, key)) for key in top_n_grp]

        grp_models = {}
        priors = {}
        if top_groups:
            # Set for constant-time membership tests inside the loop
            top_group_hashes = set(top_groups)

            # Prepare for multi processing
            num_tasks = len(top_groups)

            def processor(out, res):
                out[res[0]] = res[1]

            pool_to_use = small_job_pool
            loggerinfo(logger,
                       f"Prophet will use {n_jobs} workers for fitting.")

            pool = pool_to_use(logger=None,
                               processor=processor,
                               num_tasks=num_tasks,
                               max_workers=n_jobs)
            #
            # Fit 1 FB Prophet model per time group columns
            nb_groups = len(X_groups)

            # Put y back to its unscaled value for top groups
            X['y'] = X['y_orig']

            for _i_g, (key, X_grp) in enumerate(X_groups):
                # Just log where we are in the fitting process
                if (_i_g + 1) % max(1, nb_groups // 20) == 0:
                    loggerinfo(
                        logger, "FB Prophet : %d%% of groups fitted" %
                        (100 * (_i_g + 1) // nb_groups))

                grp_hash = self.get_hash(key)

                # Skip before touching the disk: only top groups get a model
                if grp_hash not in top_group_hashes:
                    continue

                X_path = os.path.join(tmp_folder,
                                      "fbprophet_X" + str(uuid.uuid4()))
                X_grp = X_grp.reset_index(drop=True)
                save_obj(X_grp, X_path)

                priors[grp_hash] = X_grp['y'].mean()

                args = (X_path, grp_hash, tmp_folder, self.params, self.cap)
                kwargs = {}
                pool.submit_tryget(None,
                                   MyParallelProphetTransformer_fit_async,
                                   args=args,
                                   kwargs=kwargs,
                                   out=grp_models)
            pool.finish()

            # Replace the returned paths by the fitted models; a None entry
            # means there is no file to load or delete
            for k, v in grp_models.items():
                if v is None:
                    continue
                grp_models[k] = load_obj(v)
                remove(v)

        self._clean_tmp_folder(logger, tmp_folder)

        self.set_model_properties(
            model={
                'avg': model,
                'group': grp_models,
                'priors': priors,
                'topgroups': top_groups,
                'skl': scalers,
                'gen_scaler': general_scaler
            },
            features=self.tgc,  # Prophet uses time and timegroups
            importances=np.ones(len(self.tgc)),
            iterations=-1  # Does not have iterations
        )

        return None
Ejemplo n.º 28
0
    def fit(self, X: dt.Frame, y: np.array = None, **kwargs):
        """
        Train one FB Prophet model per value of each time group column.

        After fitting the scalers and the average model, every time group
        column (time column excluded) is handled on its own: the target is
        averaged by date inside each group and the result is written to the
        temporary folder, from where a pool worker picks it up for fitting.

        :param X: Datatable frame containing the features
        :param y: numpy array containing the historical values of the target
        :return: self
        """

        # Experiment logger, when one exists
        logger = self.get_experiment_logger()

        loggerinfo(
            logger,
            "Prophet will use individual groups as well as average target data."
        )

        tmp_folder = self._create_tmp_folder(logger)

        n_jobs = self._get_n_jobs(logger, **kwargs)

        # Time group columns without the time column itself
        tgc_wo_time = list(np.setdiff1d(self.tgc, self.time_column))

        X = self.convert_to_prophet(X)

        # Attach the target; the label encoder only matters for
        # classification, which is not supported
        if self.labels is not None:
            y = LabelEncoder().fit(self.labels).transform(y)
        X['y'] = np.array(y)

        self.prior_value = X['y'].mean()

        self.general_scaler = self.fit_scaler_to_median_target(X)

        X = self.scale_target_for_each_time_group(X, tgc_wo_time)

        self.avg_model = self.fit_prophet_model_on_average_target(X)

        def store_result(out, res):
            # Pool callback: res is a (group hash, value) pair
            out[res[0]] = res[1]

        # Fit the models of one time group column at a time
        self.grp_models = {}
        self.priors = {}
        for grp_col in tgc_wo_time:
            col_models = {}
            col_priors = {}
            self.grp_models[grp_col] = col_models
            self.priors[grp_col] = col_priors

            grouped = X[['ds', 'y', grp_col]].groupby(grp_col)
            nb_groups = len(grouped)

            loggerinfo(logger,
                       f"Prophet will use {n_jobs} workers for fitting.")
            loggerinfo(
                logger, "Prophet parameters holidays {} / monthly {}".format(
                    self.country_holidays, self.monthly_seasonality))
            pool = small_job_pool(logger=None,
                                  processor=store_result,
                                  num_tasks=nb_groups,
                                  max_workers=n_jobs)

            for position, (key, X_grp) in enumerate(grouped, start=1):
                # Progress report, roughly every 5% of the groups
                if position % max(1, nb_groups // 20) == 0:
                    loggerinfo(
                        logger, "FB Prophet : %d%% of groups fitted" %
                        (100 * position // nb_groups))

                X_path = os.path.join(tmp_folder,
                                      "fbprophet_X" + str(uuid.uuid4()))

                # Remember the mean target of the group
                grp_hash = self.get_hash(key)
                col_priors[grp_hash] = X_grp['y'].mean()

                # One averaged target value per date, shared through a file
                per_date_mean = X_grp.groupby('ds')['y'].mean().reset_index()
                save_obj(per_date_mean, X_path)

                params = {
                    "country_holidays": self.country_holidays,
                    "monthly_seasonality": self.monthly_seasonality
                }

                pool.submit_tryget(
                    None,
                    MyProphetOnSingleGroupsTransformer_fit_async,
                    args=(X_path, grp_hash, tmp_folder, params),
                    kwargs={},
                    out=col_models)
            pool.finish()

            # Swap each stored value for the object loaded from it
            # (None stays None)
            for grp_hash, model_path in col_models.items():
                if model_path is None:
                    col_models[grp_hash] = None
                else:
                    col_models[grp_hash] = load_obj(model_path)
                remove(model_path)

        self._clean_tmp_folder(logger, tmp_folder)

        return self
Ejemplo n.º 29
0
    def transform(self, X: dt.Frame, **kwargs):
        """
        Predict the target with the per-group FB Prophet models.

        The rows are split by time group (time column excluded). Groups with
        an entry in `self.models` are sent to a pool worker together with
        their model; all other groups get `self.nan_value` as prediction.

        :param X: Datatable Frame containing the features
        :return: FB Prophet predictions, sorted by the original row index
        """
        # Experiment logger, when one exists
        logger = None
        context = self.context
        if context and context.experiment_id:
            logger = make_experiment_logger(
                experiment_id=context.experiment_id,
                tmp_dir=context.tmp_dir,
                experiment_tmp_dir=context.experiment_tmp_dir)

        tmp_folder = self._create_tmp_folder(logger)

        n_jobs = self._get_n_jobs(logger, **kwargs)

        # Keep the time group columns only and rename the date for Prophet
        frame = X[:, self.tgc].to_pandas()
        frame = frame.replace([None, np.nan], 0)
        frame.rename(columns={self.time_column: "ds"}, inplace=True)
        tgc_wo_time = list(np.setdiff1d(self.tgc, self.time_column))
        if len(tgc_wo_time) > 0:
            groups = frame.groupby(tgc_wo_time)
        else:
            groups = [([None], frame)]
        nb_groups = len(groups)
        assert nb_groups > 0

        def collect_path(out, res):
            # Pool callback: keep whatever the worker returned
            out.append(res)

        loggerinfo(logger,
                   "Prophet will use {} workers for transform".format(n_jobs))
        pool = small_job_pool(logger=None,
                              processor=collect_path,
                              num_tasks=nb_groups,
                              max_workers=n_jobs)
        prediction_paths = []
        pickled_model_paths = []
        print("Nb Groups = ", nb_groups)
        for position, (key, grp_frame) in enumerate(groups, start=1):
            # Progress report, roughly every 5% of the groups
            if position % max(1, nb_groups // 20) == 0:
                loggerinfo(
                    logger, "FB Prophet : %d%% of groups transformed" %
                    (100 * position // nb_groups))

            # Build the group identifier from the group key
            key_parts = key if isinstance(key, list) else [key]
            grp_hash = '_'.join(str(part) for part in key_parts)
            X_path = os.path.join(tmp_folder,
                                  "fbprophet_Xt" + str(uuid.uuid4()))

            if grp_hash not in self.models:
                # Unseen group: constant prediction, original index kept
                fallback = pd.DataFrame(
                    np.full((grp_frame.shape[0], 1), self.nan_value),
                    columns=['yhat'])
                fallback.index = grp_frame.index
                save_obj(fallback, X_path)
                prediction_paths.append(X_path)
                continue

            # Known group: hand model and data over to a worker through files
            model_path = os.path.join(
                tmp_folder, "fbprophet_modelt" + str(uuid.uuid4()))
            save_obj(self.models[grp_hash], model_path)
            save_obj(grp_frame, X_path)
            pickled_model_paths.append(model_path)

            pool.submit_tryget(
                None,
                MyParallelProphetTransformer_transform_async,
                args=(model_path, X_path, self.priors[grp_hash], tmp_folder),
                kwargs={},
                out=prediction_paths)
        pool.finish()

        # Stitch the per-group results back together in row order
        loaded = [load_obj(path) for path in prediction_paths]
        predictions = pd.concat(loaded, axis=0).sort_index()
        for path in prediction_paths + pickled_model_paths:
            remove(path)

        self._clean_tmp_folder(logger, tmp_folder)

        return predictions
Ejemplo n.º 30
0
    def fit(self, X: dt.Frame, y: np.array = None, **kwargs):
        """
        Train one FB Prophet model per time group.

        The rows are split by time group (time column excluded). Each group is
        written to the temporary folder and fitted by a pool worker; the value
        reported back for a group is then replaced by the object loaded from
        it.

        :param X: Datatable frame containing the features
        :param y: numpy array containing the historical values of the target
        :return: self
        """

        # Experiment logger, when one exists
        logger = None
        context = self.context
        if context and context.experiment_id:
            logger = make_experiment_logger(
                experiment_id=context.experiment_id,
                tmp_dir=context.tmp_dir,
                experiment_tmp_dir=context.experiment_tmp_dir)

        tmp_folder = self._create_tmp_folder(logger)

        n_jobs = self._get_n_jobs(logger, **kwargs)

        # Keep the time group columns only and rename the date for Prophet
        frame = X[:, self.tgc].to_pandas()
        frame = frame.replace([None, np.nan], 0)
        frame.rename(columns={self.time_column: "ds"}, inplace=True)
        # Labels have to be numeric
        if self.labels is not None:
            y = LabelEncoder().fit(self.labels).transform(y)
        frame['y'] = np.array(y)
        # Target prior
        self.nan_value = np.mean(y)

        # Split by time group columns, the time column itself left out
        tgc_wo_time = list(np.setdiff1d(self.tgc, self.time_column))
        if len(tgc_wo_time) > 0:
            groups = frame.groupby(tgc_wo_time)
        else:
            groups = [([None], frame)]
        self.models = {}
        self.priors = {}

        nb_groups = len(groups)

        def store_result(out, res):
            # Pool callback: res is a (group hash, value) pair
            out[res[0]] = res[1]

        loggerinfo(logger,
                   "Prophet will use {} workers for fitting".format(n_jobs))
        loggerinfo(
            logger, "Prophet parameters holidays {} / monthly {}".format(
                self.country_holidays, self.monthly_seasonality))
        pool = small_job_pool(logger=None,
                              processor=store_result,
                              num_tasks=nb_groups,
                              max_workers=n_jobs)

        # One fitting task per time group
        for position, (key, grp_frame) in enumerate(groups, start=1):
            # Progress report, roughly every 5% of the groups
            if position % max(1, nb_groups // 20) == 0:
                loggerinfo(
                    logger, "FB Prophet : %d%% of groups fitted" %
                    (100 * position // nb_groups))

            # Share the group data with the worker through a file
            X_path = os.path.join(tmp_folder,
                                  "fbprophet_X" + str(uuid.uuid4()))
            grp_frame = grp_frame.reset_index(drop=True)
            save_obj(grp_frame, X_path)

            # Build the group identifier from the group key
            key_parts = key if isinstance(key, list) else [key]
            grp_hash = '_'.join(str(part) for part in key_parts)

            self.priors[grp_hash] = grp_frame['y'].mean()

            params = {
                "country_holidays": self.country_holidays,
                "monthly_seasonality": self.monthly_seasonality
            }

            pool.submit_tryget(None,
                               MyParallelProphetTransformer_fit_async,
                               args=(X_path, grp_hash, tmp_folder, params),
                               kwargs={},
                               out=self.models)
        pool.finish()

        # Swap each stored value for the object loaded from it
        # (None stays None)
        fitted = self.models
        for grp_hash, model_path in fitted.items():
            if model_path is None:
                fitted[grp_hash] = None
            else:
                fitted[grp_hash] = load_obj(model_path)
            remove(model_path)

        self._clean_tmp_folder(logger, tmp_folder)

        return self