Example 1
 def fit_transform(self, X: dt.Frame, y: np.array = None):
     XX = X.to_pandas().iloc[:, 0].values
     is_na = np.isnan(XX)
     self._offset = -np.nanmin(XX) if np.nanmin(XX) < 0 else 0
     self._offset += 1e-3
     self._lmbda = None
     if not any(~is_na):
         return X
     self._lmbda = yeojohnson(self._offset + XX[~is_na], lmbda=self._lmbda)[1]  # compute lambda
     return self.transform(X)
 def transform(self, X: dt.Frame):
     XX = X.to_pandas().iloc[:, 0].values
     is_na = np.isnan(XX) | np.array(XX <= -self._offset)
     if not any(~is_na) or self._lmbda is None:
         return X
     ret = yeojohnson(
         self._offset + XX[~is_na],
         lmbda=self._lmbda)  # apply transform with pre-computed lambda
     XX[~is_na] = ret
     return XX
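
For context, scipy.stats.yeojohnson returns a (values, lmbda) tuple when called with lmbda=None and just the transformed array when a lambda is supplied, which is why fit_transform above takes element [1] while transform uses the result directly. A minimal standalone check:

import numpy as np
from scipy.stats import yeojohnson

x = np.array([0.5, 1.0, 2.0, 4.0])
values, lmbda = yeojohnson(x)        # lmbda omitted: returns (transformed values, fitted lambda)
again = yeojohnson(x, lmbda=lmbda)   # lmbda supplied: returns the transformed values only
assert np.allclose(values, again)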
Example 3
 def transform(self, X: dt.Frame):
     orig_col_name = X.names[0]
     X = dt.Frame(X).to_pandas().astype(str).fillna("NA")
     new_X = X.apply(lambda x: self.get_ne_count(x[orig_col_name]),
                     axis=1,
                     result_type='expand')
     new_X.columns = [
         f'{orig_col_name}_Count_{ne_type}' for ne_type in self.ne_types
     ]
     return new_X
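
get_ne_count and self.ne_types are defined elsewhere in the recipe; a hypothetical sketch using spaCy (an assumption, the recipe may use a different NER backend) could look like this:

import spacy
from collections import Counter

nlp = spacy.load("en_core_web_sm")      # assumes the small English model is installed
ne_types = ["PERSON", "ORG", "GPE"]     # hypothetical entity types of interest

def get_ne_count(text: str) -> list:
    # Count named entities of each configured type in one document
    counts = Counter(ent.label_ for ent in nlp(str(text)).ents)
    return [counts.get(ne_type, 0) for ne_type in ne_types]

print(get_ne_count("Angela Merkel met Siemens executives in Berlin."))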
Example 4
    def transform(self, X: dt.Frame):
        import pandas as pd

        ret_df = pd.DataFrame(
            [self.get_imports_features(x) for x in X.to_pandas().values[:, 0]])

        self._output_feature_names = ret_df.columns.to_list()
        self._feature_desc = self._output_feature_names

        return ret_df
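
get_imports_features is not shown; purely as an illustration of the expected shape (a dict of feature name to value per row, which pd.DataFrame then expands into columns), a hypothetical version that counts library names in a comma-separated import string:

from collections import Counter

def get_imports_features(imports_str) -> dict:
    # Hypothetical: frequency of each imported library in a comma-separated list
    libs = [lib.strip().lower() for lib in str(imports_str).split(",") if lib.strip()]
    return dict(Counter(libs))

print(get_imports_features("kernel32.dll, user32.dll, kernel32.dll"))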
Example 5
    def transform(self, X: dt.Frame, y: np.array = None):
        if ngpus_vis == 0:
            raise IgnoreEntirelyError("Transformer cannot run without GPUs")

        import cudf
        import cuml
        cuml.common.memory_utils.set_global_output_type('numpy')
        X = X.to_pandas().fillna(0)
        X = cudf.DataFrame(X)
        return self.model.predict(X)
    def fit(self, X: dt.Frame, y: np.array = None):
        """
        Fits ARIMA models (1 per time group) using historical target values contained in y
        :param X: Datatable frame containing the features
        :param y: numpy array containing the historical values of the target
        :return: self
        """
        # Import the ARIMA python module
        pm = importlib.import_module('pmdarima')

        # Create dictionary that will link models to groups
        self.models = {}

        # Convert to pandas
        X = X.to_pandas()
        # Keep the Time Group Columns
        XX = X[self.tgc].copy()
        # Add the target
        XX['y'] = np.array(y)

        self.mean_value = np.mean(y)
        self.ntrain = X.shape[0]

        # Get the logger if it exists
        logger = self._get_logger()

        # Group the input by TGC (Time group column) excluding the time column itself
        # We want to access the time series of each group
        # so that we can predict future sales for each store/department independently
        tgc_wo_time = list(np.setdiff1d(self.tgc, self.time_column))
        if len(tgc_wo_time) > 0:
            XX_grp = XX.groupby(tgc_wo_time)
        else:
            XX_grp = [([None], XX)]

        # Build 1 ARIMA model per time group
        nb_groups = len(XX_grp)
        for _i_g, (key, X) in enumerate(XX_grp):
            # Just say where we are in the fitting process
            if (_i_g + 1) % max(1, nb_groups // 20) == 0:
                loggerinfo(logger, "Auto ARIMA : %d%% of groups fitted" % (100 * (_i_g + 1) // nb_groups))

            key = key if isinstance(key, list) else [key]
            grp_hash = '_'.join(map(str, key))
            # print("auto arima - fitting on data of shape: %s for group: %s" % (str(X.shape), grp_hash))
            order = np.argsort(X[self.time_column])
            try:
                model = pm.auto_arima(X['y'].values[order], error_action='ignore')
            except Exception as e:
                loggerinfo(logger, "Auto ARIMA warning: {}".format(e))
                model = None

            self.models[grp_hash] = model

        return self
    def transform(self, X: dt.Frame):
        """
        Uses fitted models (1 per time group) to predict the target
        If self.is_train exists, it means we are doing in-sample predictions
        If it does not, ARIMA is used to predict the future
        :param X: Datatable Frame containing the features
        :return: ARIMA predictions
        """
        X = X.to_pandas()
        XX = X[self.tgc].copy()
        tgc_wo_time = list(np.setdiff1d(self.tgc, self.time_column))
        if len(tgc_wo_time) > 0:
            XX_grp = XX.groupby(tgc_wo_time)
        else:
            XX_grp = [([None], XX)]

        # Get the logger if it exists
        logger = None
        if self.context and self.context.experiment_id:
            logger = make_experiment_logger(
                experiment_id=self.context.experiment_id,
                tmp_dir=self.context.tmp_dir,
                experiment_tmp_dir=self.context.experiment_tmp_dir)

        nb_groups = len(XX_grp)
        preds = []
        for _i_g, (key, X) in enumerate(XX_grp):
            # Just say where we are in the transform process
            if (_i_g + 1) % max(1, nb_groups // 20) == 0:
                loggerinfo(
                    logger, "Auto ARIMA : %d%% of groups transformed" %
                    (100 * (_i_g + 1) // nb_groups))

            key = key if isinstance(key, list) else [key]
            grp_hash = '_'.join(map(str, key))
            # print("auto arima - transforming data of shape: %s for group: %s" % (str(X.shape), grp_hash))
            order = np.argsort(X[self.time_column])
            if grp_hash in self.models:
                model = self.models[grp_hash]
                if model is not None:
                    yhat = model.predict_in_sample() \
                        if hasattr(self, 'is_train') else model.predict(n_periods=X.shape[0])
                    yhat = yhat[order]
                    XX = pd.DataFrame(yhat, columns=['yhat'])
                else:
                    XX = pd.DataFrame(np.full((X.shape[0], 1), self.nan_value),
                                      columns=['yhat'])  # invalid model
            else:
                XX = pd.DataFrame(np.full((X.shape[0], 1), self.nan_value),
                                  columns=['yhat'])  # unseen groups
            XX.index = X.index
            preds.append(XX)
        XX = pd.concat(tuple(preds), axis=0).sort_index()

        return XX
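
Outside of Driverless AI, the pmdarima calls used above can be exercised on their own; a minimal sketch on synthetic data (hyperparameters left at their defaults, as in fit):

import numpy as np
import pmdarima as pm

y = np.sin(np.linspace(0, 20, 120)) + np.random.normal(scale=0.1, size=120)
model = pm.auto_arima(y, error_action='ignore')   # same call as in fit()
in_sample = model.predict_in_sample()             # what transform() uses when is_train is set
future = model.predict(n_periods=10)              # what transform() uses for unseen periods
print(len(in_sample), len(future))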
Example 8
def write_pset_table(pset_df, df_name, pset_name, df_dir):
    """
    Write a PSet table to a CSV file.

    @param pset_df: [`DataFrame`] A PSet DataFrame
    @param df_name: [`string`] The name of the table being written (used in the file name)
    @param pset_name: [`string`] The name of the PSet
    @param df_dir: [`string`] The name of the directory to hold all the PSet tables
    @return [`None`]
    """
    pset_path = os.path.join(df_dir, pset_name)
    # Make sure directory for this PSet exists
    if not os.path.exists(pset_path):
        os.mkdir(pset_path)

    # Convert to datatable Frame for fast write to disk
    pset_df = Frame(pset_df)

    print(f'Writing {df_name} table to {pset_path}...')
    # Use datatable to convert df to csv
    pset_df.to_csv(os.path.join(pset_path, f'{pset_name}_{df_name}.csv'))
    def create_data(X: dt.Frame = None):

        if X is None:
            return []
        fixup = process_tweets()

        X = dt.Frame(X).to_pandas()
        for text_colname in text_colnames:
            X["preprocessed_" + text_colname] = X[text_colname].astype(
                str).apply(lambda x: fixup.preprocess(x))

        temp_path = os.path.join(config.data_directory,
                                 config.contrib_relative_directory)
        os.makedirs(temp_path, exist_ok=True)

        # Save files to disk
        file_train = os.path.join(temp_path, output_dataset_name + ".csv")
        X.to_csv(file_train, index=False)

        return [file_train]
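
process_tweets, text_colnames and output_dataset_name are assumed to be defined at module level in the recipe. A hypothetical minimal process_tweets that lower-cases text and strips URLs and @-handles:

import re

class process_tweets:
    # Hypothetical preprocessor: lower-case, remove URLs and @mentions, squeeze whitespace
    url_re = re.compile(r"https?://\S+")
    handle_re = re.compile(r"@\w+")

    def preprocess(self, text: str) -> str:
        text = self.url_re.sub(" ", str(text).lower())
        text = self.handle_re.sub(" ", text)
        return " ".join(text.split())

print(process_tweets().preprocess("Loved it!! @user check https://example.com"))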
Example 10
    def transform(self, X: dt.Frame):
        col_names = X.names
        print(col_names)
        lat = []
        long = []
        for col in col_names:
            if col.find("latitude") > -1:
                lat.append(col)
            if (col.find("longitude") > -1):
                long.append(col)

        if len(lat) == 2 and len(long) == 2:
            return X.to_pandas().apply(
                lambda row: distance(row[lat[0]], row[long[0]],
                                     row[lat[1]], row[long[1]]),
                axis=1)
        else:
            return X.to_pandas().iloc[:, 0]
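
The distance helper is defined elsewhere; assuming it is meant to be a great-circle (haversine) distance between the two latitude/longitude pairs, a sketch could be:

import math

def distance(lat1, lon1, lat2, lon2, radius_km=6371.0):
    # Haversine great-circle distance in kilometres (sketch; the recipe's helper may differ)
    phi1, phi2 = math.radians(lat1), math.radians(lat2)
    dphi = math.radians(lat2 - lat1)
    dlmb = math.radians(lon2 - lon1)
    a = math.sin(dphi / 2) ** 2 + math.cos(phi1) * math.cos(phi2) * math.sin(dlmb / 2) ** 2
    return 2 * radius_km * math.asin(math.sqrt(a))

print(distance(48.137, 11.575, 52.520, 13.405))  # Munich to Berlin, roughly 500 km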
 def fit_transform(self, X: dt.Frame, y: np.array = None):
     X = X.to_pandas().astype(str).iloc[:, 0].fillna("NA")
     # Count Vectorizer
     self.cnt_vec = CountVectorizer(analyzer="char", ngram_range=(1,self.max_ngram))
     X = self.cnt_vec.fit_transform(X)
     # Truncated SVD
     if len(self.cnt_vec.vocabulary_) <= self.n_svd_comp:
         self.n_svd_comp = len(self.cnt_vec.vocabulary_) - 1
     self.truncated_svd = TruncatedSVD(n_components=self.n_svd_comp, random_state=2019)
     X = self.truncated_svd.fit_transform(X)
     return X
Example 12
    def transform(self, X: dt.Frame):
        # Keep date only
        X = X[:, self.time_column].to_pandas()
        # Transform to pandas date time
        X[self.time_column] = pd.to_datetime(X[self.time_column])
        # Create Year and day of year so that we can merge with stored holidays
        X['year'] = X[self.time_column].dt.year
        X['doy'] = X[self.time_column].dt.dayofyear

        # Country-wide holidays first
        holi_df = self.memos['country']
        holi_df['is_DE_holiday_country'] = 1
        X["is_DE_holiday_country"] = X.merge(
            self.memos['country'], on=['year', 'doy'],
            how='left').fillna(0)['is_DE_holiday_country']

        # Then the Länder (federal states)
        for prov in [
                'BW', 'BY', 'BE', 'BB', 'HB', 'HH', 'HE', 'MV', 'NI', 'NW',
                'RP', 'SL', 'SN', 'ST', 'SH', 'TH'
        ]:
            holi_df = self.memos[prov]
            holi_df[f'is_DE_holiday_{prov}'] = 1
            X[f'is_DE_holiday_{prov}'] = X.merge(
                holi_df, on=['year', 'doy'],
                how='left').fillna(0)[f'is_DE_holiday_{prov}']

        X.drop([self.time_column, 'year', 'doy'], axis=1, inplace=True)

        features = [
            f'is_DE_holiday{orig_feat_prefix}{self.time_column}{extra_prefix}{prov}'
            for prov in [
                 'country', 'BW', 'BY', 'BE', 'BB', 'HB', 'HH', 'HE', 'MV',
                 'NI', 'NW', 'RP', 'SL', 'SN', 'ST', 'SH', 'TH'
             ]
        ]
        self._output_feature_names = list(features)
        self._feature_desc = list(features)

        return X
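
Each entry of self.memos is assumed to be a lookup DataFrame with year and doy columns, one row per holiday, built during fit. A hypothetical construction for the country-wide table:

import pandas as pd

# Hypothetical: a handful of Germany-wide holidays keyed by (year, day of year)
dates = pd.to_datetime(["2020-01-01", "2020-05-01", "2020-10-03", "2020-12-25"])
country_memo = pd.DataFrame({"year": dates.year, "doy": dates.dayofyear})
print(country_memo)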
    def fit(self, X: dt.Frame, y: np.array = None):
        """
        Fits ARIMA models (1 per time group) using historical target values contained in y
        :param X: Datatable frame containing the features
        :param y: numpy array containing the historical values of the target
        :return: self
        """
        # Import the ARIMA python module
        pm = importlib.import_module('pmdarima')
        # Init models
        self.models = {}
        # Convert to pandas
        X = X.to_pandas()
        XX = X[self.tgc].copy()
        XX['y'] = np.array(y)
        self.nan_value = np.mean(y)
        self.ntrain = X.shape[0]

        # Group the input by TGC (Time group column) excluding the time column itself
        tgc_wo_time = list(np.setdiff1d(self.tgc, self.time_column))
        if len(tgc_wo_time) > 0:
            XX_grp = XX.groupby(tgc_wo_time)
        else:
            XX_grp = [([None], XX)]

        # Get the logger if it exists
        logger = None
        if self.context and self.context.experiment_id:
            logger = make_experiment_logger(
                experiment_id=self.context.experiment_id,
                tmp_dir=self.context.tmp_dir,
                experiment_tmp_dir=self.context.experiment_tmp_dir)

        # Build 1 ARIMA model per time group
        nb_groups = len(XX_grp)
        for _i_g, (key, X) in enumerate(XX_grp):
            # Just say where we are in the fitting process
            if (_i_g + 1) % max(1, nb_groups // 20) == 0:
                loggerinfo(
                    logger, "Auto ARIMA : %d%% of groups fitted" %
                    (100 * (_i_g + 1) // nb_groups))

            key = key if isinstance(key, list) else [key]
            grp_hash = '_'.join(map(str, key))
            # print("auto arima - fitting on data of shape: %s for group: %s" % (str(X.shape), grp_hash))
            order = np.argsort(X[self.time_column])
            try:
                model = pm.auto_arima(X['y'].values[order],
                                      error_action='ignore')
            except Exception:
                model = None
            self.models[grp_hash] = model
        return self
Example 14
 def transform(self, X: dt.Frame):
     h2o.init()
     model_path = os.path.join(temporary_files_path, self.id)
     with open(model_path, "wb") as f:
         f.write(self.raw_model_bytes)
     model = h2o.load_model(model_path)
     os.remove(model_path)
     frame = h2o.H2OFrame(X.to_pandas())
     try:
         return model.anomaly(frame).as_data_frame(header=False)
     finally:
         h2o.remove(self.id)
Example 15
 def transform(self, X: dt.Frame):
     X = dt.Frame(X)
     orig_col_name = X.names[0]
     new_X = X.to_pandas().astype(str).fillna("NA").iloc[:, 0].values
     new_X = [doc.split() for doc in new_X]
     new_X = [self.dictionary.doc2bow(doc) for doc in new_X]
     new_X = self.model.inference(new_X)[0]
     self._output_feature_names = [f'{self.display_name}{orig_feat_prefix}{orig_col_name}{extra_prefix}topic{i}'
                                   for i in range(new_X.shape[1])]
     self._feature_desc = [f'LDA Topic {i} of {self.n_topics} for {orig_col_name} column' for i in
                           range(new_X.shape[1])]
     return new_X
    def transform(self, X: dt.Frame):
        X = X.to_pandas()
        XX = X[self.tgc].copy()
        tgc_wo_time = list(np.setdiff1d(self.tgc, self.time_column))
        if len(tgc_wo_time) > 0:
            XX_grp = XX.groupby(tgc_wo_time)
        else:
            XX_grp = [([None], XX)]

        assert len(XX_grp) > 0
        num_tasks = len(XX_grp)

        def processor(out, res):
            out.append(res)

        pool_to_use = small_job_pool
        pool = pool_to_use(logger=None, processor=processor, num_tasks=num_tasks)
        XX_paths = []
        model_paths = []
        nb_groups = len(XX_grp)
        for _i_g, (key, X) in enumerate(XX_grp):
            if (_i_g + 1) % max(1, nb_groups // 20) == 0:
                print("Auto ARIMA - ", 100 * (_i_g + 1) // nb_groups, " %% of Groups Transformed")
            key = key if isinstance(key, list) else [key]
            grp_hash = '_'.join(map(str, key))
            X_path = os.path.join(temporary_files_path, "autoarima_Xt" + str(uuid.uuid4()))

            # Commented for performance, uncomment for debug
            # print("prophet - transforming data of shape: %s for group: %s" % (str(X.shape), grp_hash))
            if grp_hash in self.models:
                model = self.models[grp_hash]
                model_path = os.path.join(temporary_files_path, "autoarima_modelt" + str(uuid.uuid4()))
                save_obj(model, model_path)
                save_obj(X, X_path)
                model_paths.append(model_path)

                args = (model_path, X_path, self.nan_value, hasattr(self, 'is_train'), self.time_column,)
                kwargs = {}
                pool.submit_tryget(None, MyParallelAutoArimaTransformer_transform_async, args=args, kwargs=kwargs,
                                   out=XX_paths)
            else:
                # Don't go through pools
                XX = pd.DataFrame(np.full((X.shape[0], 1), self.nan_value), columns=['yhat'])  # unseen groups
                # Sync indices
                XX.index = X.index
                save_obj(XX, X_path)
                XX_paths.append(X_path)
        pool.finish()
        XX = pd.concat((load_obj(XX_path) for XX_path in XX_paths), axis=0).sort_index()
        for p in XX_paths + model_paths:
            remove(p)
        return XX
Example 17
 def transform(self, X: dt.Frame):
     XX = X.to_pandas().iloc[:, 0].values
     is_na = np.isnan(XX) | np.array(XX <= -self._offset)
     if not any(~is_na) or self._lmbda is None:
         return X
     x = self._offset + XX[~is_na]
     x = np.asarray(x)
     x[x <= 0] = 1e-3  # don't worry if not invertible, just ensure we can transform; valid transforms are kept valid
     ret = boxcox(
         x, lmbda=self._lmbda)  # apply transform with pre-computed lambda
     XX[~is_na] = ret
     return XX
Example 18
 def transform(self, X: dt.Frame):
     
     import pandas as pd
     orig_col_name = X.names[0]
     ret_df = pd.DataFrame(
             [
                 self.get_norm_byte_count(x)
                 for x in X.to_pandas().values[:,0]
             ]
         )
     self._output_feature_names = ['ByteNormCount_{}'.format(x) for x in range(ret_df.shape[1])]
     self._feature_desc = [f'Normalized Count of Byte value {x} for {orig_col_name} column' for x in range(ret_df.shape[1])]
     return ret_df
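
get_norm_byte_count is defined elsewhere; a plausible sketch that returns the normalized frequency of each of the 256 possible byte values in a blob (an assumption about what "normalized byte count" means here):

import numpy as np

def get_norm_byte_count(data) -> np.ndarray:
    # Sketch: normalized histogram over byte values 0..255
    raw = data if isinstance(data, bytes) else str(data).encode("utf-8", errors="ignore")
    counts = np.bincount(np.frombuffer(raw, dtype=np.uint8), minlength=256).astype(float)
    return counts / max(counts.sum(), 1.0)

print(get_norm_byte_count(b"hello world").shape)  # (256,)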
    def fit(self, X: dt.Frame, y: np.array = None):
        pm = importlib.import_module('pmdarima')
        self.models = {}
        X = X.to_pandas()
        XX = X[self.tgc].copy()
        XX['y'] = np.array(y)
        self.nan_value = np.mean(y)
        self.ntrain = X.shape[0]
        tgc_wo_time = list(np.setdiff1d(self.tgc, self.time_column))
        if len(tgc_wo_time) > 0:
            XX_grp = XX.groupby(tgc_wo_time)
        else:
            XX_grp = [([None], XX)]

        num_tasks = len(XX_grp)

        def processor(out, res):
            out[res[0]] = res[1]

        pool_to_use = small_job_pool
        pool = pool_to_use(logger=None, processor=processor, num_tasks=num_tasks)
        nb_groups = len(XX_grp)
        for _i_g, (key, X) in enumerate(XX_grp):
            if (_i_g + 1) % max(1, nb_groups // 20) == 0:
                print("Auto ARIMA - ", 100 * (_i_g + 1) // nb_groups, " %% of Groups Fitted")
            X_path = os.path.join(temporary_files_path, "autoarima_X" + str(uuid.uuid4()))
            X = X.reset_index(drop=True)
            save_obj(X, X_path)
            key = key if isinstance(key, list) else [key]
            grp_hash = '_'.join(map(str, key))
            args = (X_path, grp_hash, self.time_column,)
            kwargs = {}
            pool.submit_tryget(None, MyParallelAutoArimaTransformer_fit_async, args=args, kwargs=kwargs, out=self.models)
        pool.finish()

        for k, v in self.models.items():
            self.models[k] = load_obj(v) if v is not None else None
            remove(v)
        return self
    def create_data(X: dt.Frame = None) -> pd.DataFrame:
        if X is None:
            return []

        from mlxtend.feature_selection import ExhaustiveFeatureSelector as EFS

        X = X.to_pandas()
        y = X[TARGET_COLUMN].values
        X.drop(TARGET_COLUMN, axis=1, inplace=True)

        efs = EFS(ESTIMATOR,
                  min_features=MIN_FEATURES,
                  max_features=MAX_FEATURES,
                  scoring=SCORING,
                  cv=CV,
                  n_jobs=-1)

        efs.fit(X, y)

        X_fs = X.iloc[:, list(efs.best_idx_)]

        return X_fs
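
The module-level names used here (TARGET_COLUMN, ESTIMATOR, MIN_FEATURES, MAX_FEATURES, SCORING, CV) are assumed to be set near the top of the recipe; for example:

from sklearn.linear_model import LogisticRegression

# Hypothetical configuration for the exhaustive feature selector
TARGET_COLUMN = "target"
ESTIMATOR = LogisticRegression(max_iter=1000)
MIN_FEATURES = 1
MAX_FEATURES = 4
SCORING = "accuracy"
CV = 5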
Example 21
    def transform(self, X: dt.Frame):
        """Transform features once grouped by Time Group Columns (TGC)"""
        # With the col_type set to "all", X can contain text features
        # So restrict to int float and bool types
        # This is easily done in datatable
        X = X[:, [int, float, bool]]
        # If no features are left after the filtering, just return a zero-valued feature
        if X.ncols == 0:
            return np.zeros((X.nrows, 1))

        # Move to pandas to use the apply method
        X = X.to_pandas()

        group_cols = [_f for _f in self.tgc if _f != self.time_column]

        # Check if we really have any group columns available
        if len(group_cols) == 0:
            # Apply MACD directly on the available features but drop the time column
            features = [_f for _f in X.columns if _f != self.time_column]
            return self.normalized_macd(X[features])

        # Get the data columns, i.e. not the group columns or time column
        col = np.setdiff1d(X.columns, self.tgc)
        if len(col) > 0:
            # Group by the TGC and apply normalized MACD to the data
            # Pandas.apply is not time-efficient, so this should eventually move to datatable
            try:
                res = X.groupby(group_cols)[col].apply(self.normalized_macd)
            except KeyError:
                return np.zeros((X.shape[0], 1))

            res.index = X.index
            if res.shape[1] == 0:
                return np.zeros((X.shape[0], 1))
            else:
                return res
        else:
            return np.zeros((X.shape[0], 1))
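
normalized_macd is not shown; assuming the usual MACD definition (fast EMA minus slow EMA, here scaled by the slow EMA), a per-column sketch could be:

import pandas as pd

def normalized_macd(df: pd.DataFrame, fast: int = 12, slow: int = 26) -> pd.DataFrame:
    # Sketch: (EMA_fast - EMA_slow) / EMA_slow for every numeric column
    ema_fast = df.ewm(span=fast, adjust=False).mean()
    ema_slow = df.ewm(span=slow, adjust=False).mean()
    return (ema_fast - ema_slow) / ema_slow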
    def create_data(X: dt.Frame = None) -> pd.DataFrame:
        if X is None:
            return []

        from mlxtend.feature_selection import SequentialFeatureSelector as SFS

        X = X.to_pandas()
        y = X[TARGET_COLUMN].values
        X.drop(TARGET_COLUMN, axis=1, inplace=True)

        sfs = SFS(ESTIMATOR,
                  k_features=K_FEATURES,
                  forward=False,
                  floating=False,
                  scoring=SCORING,
                  cv=CV,
                  n_jobs=-1)

        sfs.fit(X, y)

        X_fs = X.iloc[:, list(sfs.k_feature_idx_)]

        return X_fs
 def transform(self, X: dt.Frame):
     XX = X.to_pandas().iloc[:, 0].values
     is_na = np.isnan(XX) | np.array(XX <= -self._offset)
     if not any(~is_na) or self._lmbda is None:
         return X
     ret = yeojohnson(
         self._offset + XX[~is_na],
         lmbda=self._lmbda)  # apply transform with pre-computed lambda
     XX[~is_na] = ret
     XX = dt.Frame(XX)
     # Don't leave inf/-inf
     XX.replace([math.inf, -math.inf], None)
     return XX
Example 24
    def fit_transform(self, X: dt.Frame, y: np.array = None):
        if ngpus_vis == 0:
            raise IgnoreEntirelyError("Transformer cannot run without GPUs")

        import cudf
        import cuml
        cuml.common.memory_utils.set_global_output_type('numpy')
        self.n_clusters = min(self.n_clusters, X.nrows)
        self.model = cuml.cluster.KMeans(n_clusters=self.n_clusters,
                                         max_iter=self.max_iters,
                                         tol=self.tol)
        X = X.to_pandas().fillna(0)
        X = cudf.DataFrame(X)
        return self.model.fit_predict(X)
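
cuML's KMeans mirrors the scikit-learn interface, so the same steps can be sketched on CPU for readers without GPUs (the hyperparameter values below are placeholders for self.n_clusters, self.max_iters and self.tol):

import numpy as np
from sklearn.cluster import KMeans

X = np.random.rand(200, 4)
n_clusters = min(8, X.shape[0])
model = KMeans(n_clusters=n_clusters, max_iter=300, tol=1e-4)
labels = model.fit_predict(X)   # same role as cuml.cluster.KMeans.fit_predict above
print(labels[:10])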
Example 25
    def transform(self, X: dt.Frame):

        import pandas as pd

        mels = X.to_pandas().iloc[:, 0].apply(lambda x: self.get_mfcc(x))

        col_names = ['X_' + str(i) for i in range(0, len(mels[0]))]
        rows = len(mels)
        cols = len(mels[0])
        output_df = pd.DataFrame(data=np.reshape(np.concatenate(mels),
                                                 (rows, cols)),
                                 columns=col_names)

        return output_df
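
get_mfcc is defined elsewhere; assuming the recipe uses librosa (an assumption), a hypothetical version that returns the time-averaged MFCC vector for an audio file path could be:

import numpy as np
import librosa

def get_mfcc(file_path: str, n_mfcc: int = 20) -> np.ndarray:
    # Sketch: load audio and average the MFCC matrix over time frames
    y, sr = librosa.load(file_path, sr=None)
    mfcc = librosa.feature.mfcc(y=y, sr=sr, n_mfcc=n_mfcc)  # shape (n_mfcc, n_frames)
    return mfcc.mean(axis=1)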
 def transform(self, X: dt.Frame):
     output = []
     X = X.to_pandas()
     text1_arr = X.iloc[:, 0].values
     text2_arr = X.iloc[:, 1].values
     for ind, text1 in enumerate(text1_arr):
         try:
             text1 = set(str(text1).lower().split())
             text2 = text2_arr[ind]
             text2 = set(str(text2).lower().split())
             output.append(len(text1.intersection(text2)) / len(text1.union(text2)))
         except Exception:
             output.append(-1)
     return np.array(output)
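
The snippet above computes a word-level Jaccard similarity between the two text columns; a quick check of the arithmetic on two toy strings:

text1 = set("the quick brown fox".lower().split())
text2 = set("the lazy brown dog".lower().split())
print(len(text1 & text2) / len(text1 | text2))  # 2 shared words / 6 distinct words = 0.333...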
Example 27
 def _do_ale_per_feature(self, X: dt.Frame, features: list) -> dict:
     ale_per_feature = dict()
     for feature in features:
         try:
             ale_per_feature[feature] = self._do_ale(
                 X=X.to_pandas(),
                 feature=feature,
                 bins=self.cfg_feature_bins.get(feature, self.cfg_bins),
             )
         except Exception as ex:
             self.logger.warning(f"ALE: skipping feature {feature}")
             self.logger.debug(
                 f"ALE: skipping feature {feature} as it failed with: {ex}")
     return ale_per_feature
Example 28
 def fit_transform(self, X: dt.Frame, y: np.array = None):
     import gensim
     from gensim import corpora
     X = dt.Frame(X)
     new_X = X.to_pandas().astype(str).fillna("NA").iloc[:, 0].values
     new_X = [doc.split() for doc in new_X]
     self.dictionary = corpora.Dictionary(new_X)
     new_X = [self.dictionary.doc2bow(doc) for doc in new_X]
     self.model = gensim.models.ldamodel.LdaModel(new_X,
                                                  num_topics=self.n_topics,
                                                  id2word=self.dictionary,
                                                  passes=10,
                                                  random_state=2019)
     return self.transform(X)
Example 29
    def fit(self, X: dt.Frame, y: np.array = None):
        XX = X[:, self.tgc].to_pandas()
        XX = XX.replace([None, np.nan], 0)
        XX.rename(columns={self.time_column: "ds"}, inplace=True)
        if self.labels is not None:
            y = LabelEncoder().fit(self.labels).transform(y)
        XX['y'] = np.array(y)
        self.nan_value = np.mean(
            y)  # TODO - store mean per group, not just global
        tgc_wo_time = list(np.setdiff1d(self.tgc, self.time_column))
        if len(tgc_wo_time) > 0:
            XX_grp = XX.groupby(tgc_wo_time)
        else:
            XX_grp = [([None], XX)]
        self.models = {}
        num_tasks = len(XX_grp)

        def processor(out, res):
            out[res[0]] = res[1]

        pool_to_use = small_job_pool
        pool = pool_to_use(logger=None,
                           processor=processor,
                           num_tasks=num_tasks)
        nb_groups = len(XX_grp)
        for _i_g, (key, X) in enumerate(XX_grp):
            if (_i_g + 1) % max(1, nb_groups // 20) == 0:
                print(100 * (_i_g + 1) // nb_groups, "% of Groups Fitted")
            X_path = os.path.join(temporary_files_path,
                                  "fbprophet_X" + str(uuid.uuid4()))
            X = X.reset_index(drop=True)
            save_obj(X, X_path)
            key = key if isinstance(key, list) else [key]
            grp_hash = '_'.join(map(str, key))
            args = (
                X_path,
                grp_hash,
            )
            kwargs = {}
            pool.submit_tryget(None,
                               MyParallelProphetTransformer_fit_async,
                               args=args,
                               kwargs=kwargs,
                               out=self.models)
        pool.finish()
        for k, v in self.models.items():
            self.models[k] = load_obj(v) if v is not None else None
            remove(v)
        return self
    def create_data(X: dt.Frame = None) -> Union[
        str, List[str],
        dt.Frame, List[dt.Frame],
        np.ndarray, List[np.ndarray],
        pd.DataFrame, List[pd.DataFrame],
        Dict[str, str],  # {data set names : paths}
        Dict[str, dt.Frame],  # {data set names : dt frames}
        Dict[str, np.ndarray],  # {data set names : np arrays}
        Dict[str, pd.DataFrame],  # {data set names : pd frames}
    ]:
        col_count = 2
        col_names = ["random_col_1", "random_col_2"]

        if col_count != len(col_names):
            raise ValueError("Number of column names must be equal to number of columns.")

        if X is None:
            return []

        rcol = dt.Frame(np.random.randint(0, 100, size=(X.shape[0], col_count)))
        rcol.names = col_names
        X.cbind(rcol)

        return X
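
Assuming create_data is reachable as a plain callable (in Driverless AI data recipes it is typically a static method of the recipe class), a quick check of its behavior on a small frame:

import numpy as np
import datatable as dt

X = dt.Frame({"existing_col": [1.0, 2.0, 3.0]})
out = create_data(X)   # called directly here for illustration; cbind appends to X in place
print(out.names)       # ('existing_col', 'random_col_1', 'random_col_2')
print(out.shape)       # (3, 3)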