def transform(self, X: dt.Frame):
    """Transform features once grouped by Time Group Columns (TGC)"""
    # With col_type set to "all", X can contain text features,
    # so restrict to int, float and bool types.
    # This is easily done in datatable.
    X = X[:, [int, float, bool]]

    # If the filtering leaves no features, just return zero-valued features
    if X.ncols == 0:
        return np.zeros(X.nrows)

    # Move to pandas to use the apply method
    X = X.to_pandas()
    group_cols = [_f for _f in self.tgc if _f != self.time_column]

    # Check if we really have any group columns available
    if len(group_cols) == 0:
        # Apply MACD directly to the available features but drop the time column
        features = [_f for _f in X.columns if _f != self.time_column]
        return self.normalized_macd(X[features])

    # Get the data columns, i.e. neither the group columns nor the time column
    col = np.setdiff1d(X.columns, self.tgc)
    if len(col) > 0:
        # Group by the TGC and apply normalized MACD to the data.
        # pandas.apply is not time efficient, so this should move to datatable.
        res = X.groupby(group_cols)[col].apply(self.normalized_macd)
        res.index = X.index
        return res
    else:
        return np.zeros(X.nrows)
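# NOTE: `normalized_macd` is defined elsewhere in the recipe and is not shown in
# this section. As a rough, hypothetical sketch of the idea (the 12/26 spans and
# the normalization by the slow EMA are illustrative assumptions, not the
# recipe's actual settings):
def normalized_macd_sketch(df: pd.DataFrame, fast: int = 12, slow: int = 26) -> pd.DataFrame:
    """Hypothetical stand-in for the recipe's normalized_macd."""
    ema_fast = df.ewm(span=fast, adjust=False).mean()
    ema_slow = df.ewm(span=slow, adjust=False).mean()
    # Normalizing the MACD line by the slow EMA makes the indicator scale-free,
    # so values stay comparable across time groups
    return (ema_fast - ema_slow) / ema_slow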
def transform(self, X: dt.Frame, **kwargs):
    """
    Uses fitted models (1 per time group) to predict the target
    :param X: Datatable Frame containing the features
    :return: FB Prophet predictions
    """
    # Get the logger if it exists
    logger = None
    if self.context and self.context.experiment_id:
        logger = make_experiment_logger(
            experiment_id=self.context.experiment_id,
            tmp_dir=self.context.tmp_dir,
            experiment_tmp_dir=self.context.experiment_tmp_dir
        )

    tmp_folder = self._create_tmp_folder(logger)

    n_jobs = self._get_n_jobs(logger, **kwargs)

    # Reduce X to TGC
    tgc_wo_time = list(np.setdiff1d(self.tgc, self.time_column))
    X = X[:, self.tgc].to_pandas()

    # Fill NaNs or None
    X = X.replace([None, np.nan], 0)

    # Change date feature name to match Prophet requirements
    X.rename(columns={self.time_column: "ds"}, inplace=True)

    # Predict y using unique dates
    X_time = X[['ds']].groupby('ds').first().reset_index()
    with suppress_stdout_stderr():
        y_avg = self.model.predict(X_time)[['ds', 'yhat']]

    # Prophet converts the date column to datetime, so assign predictions back
    # onto X_time (which keeps the original 'ds' dtype) before merging back
    X_time.sort_values('ds', inplace=True)
    X_time['yhat'] = y_avg['yhat']
    X_time.sort_index(inplace=True)

    # Merge back into original frame on 'ds'
    # pd.merge wipes the index, so keep it to restore it afterwards
    indices = X.index
    X = pd.merge(
        left=X,
        right=X_time[['ds', 'yhat']],
        on='ds',
        how='left'
    )
    X.index = indices

    # Go through groups and recover the scaled target for known groups
    if len(tgc_wo_time) > 0:
        X_groups = X.groupby(tgc_wo_time)
    else:
        X_groups = [([None], X)]

    inverted_ys = []
    for key, X_grp in X_groups:
        grp_hash = self.get_hash(key)

        # Scale target for current group
        if grp_hash in self.scalers:
            inverted_y = self.scalers[grp_hash].inverse_transform(X_grp[['yhat']])
        else:
            inverted_y = self.general_scaler.inverse_transform(X_grp[['yhat']])

        # Put back in a DataFrame to keep track of original index
        inverted_df = pd.DataFrame(inverted_y, columns=['yhat'])
        inverted_df.index = X_grp.index
        inverted_ys.append(inverted_df)

    XX_general = pd.concat(tuple(inverted_ys), axis=0).sort_index()

    if self.top_groups:
        # Go through the groups and predict only the top ones
        XX_paths = []
        model_paths = []

        def processor(out, res):
            out.append(res)

        num_tasks = len(self.top_groups)
        pool_to_use = small_job_pool
        pool = pool_to_use(logger=None, processor=processor,
                           num_tasks=num_tasks, max_workers=n_jobs)

        nb_groups = len(X_groups)
        for _i_g, (key, X_grp) in enumerate(X_groups):

            # Just log where we are in the prediction process
            if (_i_g + 1) % max(1, nb_groups // 20) == 0:
                loggerinfo(logger, "FB Prophet : %d%% of groups predicted" % (100 * (_i_g + 1) // nb_groups))

            # Create dict key to store the min max scaler
            grp_hash = self.get_hash(key)
            X_path = os.path.join(tmp_folder, "fbprophet_Xt" + str(uuid.uuid4()))

            if grp_hash not in self.top_groups:
                XX = pd.DataFrame(np.full((X_grp.shape[0], 1), np.nan), columns=['yhat'])  # unseen groups
                XX.index = X_grp.index
                save_obj(XX, X_path)
                XX_paths.append(X_path)
                continue

            if self.grp_models[grp_hash] is None:
                XX = pd.DataFrame(np.full((X_grp.shape[0], 1), np.nan), columns=['yhat'])  # unseen groups
                XX.index = X_grp.index
                save_obj(XX, X_path)
                XX_paths.append(X_path)
                continue

            model = self.grp_models[grp_hash]
            model_path = os.path.join(tmp_folder, "fbprophet_modelt" + str(uuid.uuid4()))
            save_obj(model, model_path)
            save_obj(X_grp, X_path)
            model_paths.append(model_path)

            args = (model_path, X_path, self.priors[grp_hash], tmp_folder)
            kwargs = {}
            pool.submit_tryget(None, MyParallelProphetTransformer_transform_async,
                               args=args, kwargs=kwargs, out=XX_paths)
        pool.finish()

        XX_top_groups = pd.concat((load_obj(XX_path) for XX_path in XX_paths), axis=0).sort_index()
        for p in XX_paths + model_paths:
            remove(p)

    self._clean_tmp_folder(logger, tmp_folder)

    features_df = pd.DataFrame()
    features_df[self.display_name + '_GrpAvg'] = XX_general['yhat']

    if self.top_groups:
        features_df[self.display_name + f'_Top{self.top_n}Grp'] = XX_top_groups['yhat']

    self._output_feature_names = list(features_df.columns)
    self._feature_desc = list(features_df.columns)

    return features_df
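# NOTE: the index bookkeeping around pd.merge in transform above is what keeps
# the returned predictions aligned with the input rows: pd.merge always builds a
# fresh RangeIndex on its result. A minimal standalone illustration of the
# pattern (toy data, names are hypothetical):
def _merge_keeps_alignment_demo():
    df = pd.DataFrame({'ds': ['2020-01-02', '2020-01-01']}, index=[10, 11])
    preds = pd.DataFrame({'ds': ['2020-01-01', '2020-01-02'], 'yhat': [1.0, 2.0]})
    indices = df.index                             # pd.merge discards the left index...
    df = pd.merge(df, preds, on='ds', how='left')
    df.index = indices                             # ...so restore it to keep row alignment
    return df                                      # index is [10, 11] again, yhat row-aligned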
def fit(self, X: dt.Frame, y: np.array = None, **kwargs):
    """
    Fits FB Prophet models (1 per time group) using historical target values contained in y
    Model fitting is distributed over a pool of processes and uses file storage to share the data with workers
    :param X: Datatable frame containing the features
    :param y: numpy array containing the historical values of the target
    :return: self
    """
    # Get the logger if it exists
    logger = None
    if self.context and self.context.experiment_id:
        logger = make_experiment_logger(
            experiment_id=self.context.experiment_id,
            tmp_dir=self.context.tmp_dir,
            experiment_tmp_dir=self.context.experiment_tmp_dir
        )

    try:
        # Set the value of prophet_top_n in the recipe_dict variable of the config.toml file
        # eg1: recipe_dict="{'prophet_top_n': 200}"
        # eg2: recipe_dict="{'prophet_top_n':10}"
        self.top_n = config.recipe_dict['prophet_top_n']
    except KeyError:
        self.top_n = 50

    loggerinfo(logger, f"Prophet will use {self.top_n} groups as well as average target data.")

    tmp_folder = self._create_tmp_folder(logger)

    n_jobs = self._get_n_jobs(logger, **kwargs)

    # Reduce X to TGC
    tgc_wo_time = list(np.setdiff1d(self.tgc, self.time_column))
    X = X[:, self.tgc].to_pandas()

    # Fill NaNs or None
    X = X.replace([None, np.nan], 0)

    # Add the target. The label encoder is only used for classification, which we don't support...
    if self.labels is not None:
        y = LabelEncoder().fit(self.labels).transform(y)
    X['y'] = np.array(y)

    self.nan_value = X['y'].mean()

    # Change date feature name to match Prophet requirements
    X.rename(columns={self.time_column: "ds"}, inplace=True)

    # Create a general scaler now that will be used for unknown groups at prediction time
    # Can we do smarter than that?
    self.general_scaler = MinMaxScaler().fit(X[['y', 'ds']].groupby('ds').median().values)

    # Go through groups and standard scale them
    if len(tgc_wo_time) > 0:
        X_groups = X.groupby(tgc_wo_time)
    else:
        X_groups = [([None], X)]

    self.scalers = {}
    scaled_ys = []

    print(f'{datetime.now()} Start of group scaling')

    for key, X_grp in X_groups:
        # Create dict key to store the min max scaler
        grp_hash = self.get_hash(key)

        # Scale target for current group
        self.scalers[grp_hash] = MinMaxScaler()
        y_skl = self.scalers[grp_hash].fit_transform(X_grp[['y']].values)

        # Put back in a DataFrame to keep track of original index
        y_skl_df = pd.DataFrame(y_skl, columns=['y'])
        y_skl_df.index = X_grp.index
        scaled_ys.append(y_skl_df)

    print(f'{datetime.now()} End of group scaling')

    # Set the scaled target back in the original frame but keep the original values
    X['y_orig'] = X['y']
    X['y'] = pd.concat(tuple(scaled_ys), axis=0)

    # Now average the groups
    X_avg = X[['ds', 'y']].groupby('ds').mean().reset_index()

    # Send that to Prophet
    params = {
        "country_holidays": self.country_holidays,
        "monthly_seasonality": self.monthly_seasonality
    }
    mod = importlib.import_module('fbprophet')
    Prophet = getattr(mod, "Prophet")
    self.model = Prophet(yearly_seasonality=True, weekly_seasonality=True, daily_seasonality=True)

    if params["country_holidays"] is not None:
        self.model.add_country_holidays(country_name=params["country_holidays"])
    if params["monthly_seasonality"]:
        self.model.add_seasonality(name='monthly', period=30.5, fourier_order=5)

    with suppress_stdout_stderr():
        self.model.fit(X_avg[['ds', 'y']])

    print(f'{datetime.now()} General Model Fitted')

    self.top_groups = None
    if len(tgc_wo_time) > 0:
        if self.top_n > 0:
            top_n_grp = X.groupby(tgc_wo_time).size().sort_values().reset_index()[tgc_wo_time].iloc[-self.top_n:].values
            self.top_groups = [
                '_'.join(map(str, key))
                for key in top_n_grp
            ]

    if self.top_groups:
        self.grp_models = {}
        self.priors = {}

        # Prepare for multiprocessing
        num_tasks = len(self.top_groups)

        def processor(out, res):
            out[res[0]] = res[1]

        pool_to_use = small_job_pool
        loggerinfo(logger, f"Prophet will use {n_jobs} workers for fitting.")
        loggerinfo(logger, "Prophet parameters holidays {} / monthly {}".format(self.country_holidays, self.monthly_seasonality))
        pool = pool_to_use(
            logger=None, processor=processor,
            num_tasks=num_tasks, max_workers=n_jobs
        )

        # Fit 1 FB Prophet model per time group
        nb_groups = len(X_groups)

        # Put y back to its unscaled value for top groups
        X['y'] = X['y_orig']

        for _i_g, (key, X_grp) in enumerate(X_groups):
            # Just log where we are in the fitting process
            if (_i_g + 1) % max(1, nb_groups // 20) == 0:
                loggerinfo(logger, "FB Prophet : %d%% of groups fitted" % (100 * (_i_g + 1) // nb_groups))

            X_path = os.path.join(tmp_folder, "fbprophet_X" + str(uuid.uuid4()))
            X_grp = X_grp.reset_index(drop=True)
            save_obj(X_grp, X_path)

            grp_hash = self.get_hash(key)
            if grp_hash not in self.top_groups:
                continue

            self.priors[grp_hash] = X_grp['y'].mean()

            params = {
                "country_holidays": self.country_holidays,
                "monthly_seasonality": self.monthly_seasonality
            }

            args = (X_path, grp_hash, tmp_folder, params)
            kwargs = {}
            pool.submit_tryget(None, MyParallelProphetTransformer_fit_async,
                               args=args, kwargs=kwargs, out=self.grp_models)
        pool.finish()

        # Workers return file paths, so load the fitted models back into memory
        for k, v in self.grp_models.items():
            self.grp_models[k] = load_obj(v) if v is not None else None
            if v is not None:
                remove(v)

    self._clean_tmp_folder(logger, tmp_folder)

    return self
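# NOTE: `MyParallelProphetTransformer_fit_async` is a module-level worker (it
# must be picklable by the process pool) whose body is not shown in this
# section. A plausible, hypothetical sketch of its contract, assuming the
# recipe's pickle-based save_obj/load_obj helpers and that `processor` consumes
# a (group hash, model path) pair:
def MyParallelProphetTransformer_fit_async_sketch(X_path, grp_hash, tmp_folder, params):
    """Hypothetical sketch: fit one Prophet model for one group and save it to disk."""
    mod = importlib.import_module('fbprophet')
    Prophet = getattr(mod, "Prophet")
    model = Prophet(yearly_seasonality=True, weekly_seasonality=True, daily_seasonality=True)
    if params["country_holidays"] is not None:
        model.add_country_holidays(country_name=params["country_holidays"])
    if params["monthly_seasonality"]:
        model.add_seasonality(name='monthly', period=30.5, fourier_order=5)
    X = load_obj(X_path)  # assumed pickle-based helper from the recipe
    with suppress_stdout_stderr():
        model.fit(X[['ds', 'y']])
    model_path = os.path.join(tmp_folder, "fbprophet_model" + str(uuid.uuid4()))
    save_obj(model, model_path)
    remove(X_path)  # free the shared input file once the model is saved
    return grp_hash, model_path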