def _transform_async(model_path, X_path, nan_value, has_is_train_attr, time_column):
        model = load_obj(model_path)
        XX_path = os.path.join(temporary_files_path, "autoarima_XXt" + str(uuid.uuid4()))
        X = load_obj(X_path)
        # ARIMA returns the predictions ordered by time,
        # so keep track of the time order for each group so that
        # predictions line up with the input frame

        order = np.argsort(X[time_column])
        if model is not None:
            yhat = model.predict_in_sample() \
                if has_is_train_attr else model.predict(n_periods=X.shape[0])
            yhat = yhat[np.argsort(order)]  # inverse permutation: time order back to input row order
            XX = pd.DataFrame(yhat, columns=['yhat'])

        else:
            XX = pd.DataFrame(np.full((X.shape[0], 1), nan_value), columns=['yhat'])  # invalid model

        # Sync index
        XX.index = X.index
        assert XX.shape[1] == 1
        save_obj(XX, XX_path)
        remove(model_path)  # indicates success, no longer need
        remove(X_path)  # indicates success, no longer need
        return XX_path
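
A minimal, self-contained sketch of the ordering trick the comments above describe (values here are illustrative, not from the recipe): np.argsort gives the time-sorted positions, and its inverse permutation maps time-ordered predictions back to the input row order.

import numpy as np

# Standalone sketch: a model returns predictions in time order, while the
# input rows may be in any order.
times = np.array([30, 10, 20])                # time column of the input rows
order = np.argsort(times)                     # time-sorted positions: [1, 2, 0]
yhat_time_sorted = np.array([1.0, 2.0, 3.0])  # predictions for times 10, 20, 30
# The inverse permutation puts predictions back into input-row order.
inverse = np.argsort(order)                   # [2, 0, 1]
yhat_input_order = yhat_time_sorted[inverse]  # [3.0, 1.0, 2.0]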
Example #2
    def _transform_async(model_path, X_path, nan_value, has_is_train_attr, time_column, tmp_folder):
        """
        Predicts target for a particular time group
        :param model_path: path to the stored model
        :param X_path: Path to the data to predict on
        :param nan_value: Value of the target prior, used when no fitted model is available
        :param has_is_train_attr: indicates whether to predict in-sample or out-of-sample
        :param time_column: Name of the time column in the input data
        :return: path to the saved frame of predictions
        """
        model = load_obj(model_path)
        XX_path = os.path.join(tmp_folder, "autoarima_XXt" + str(uuid.uuid4()))
        X = load_obj(X_path)
        # ARIMA returns the predictions ordered by time,
        # so keep track of the time order for each group so that
        # predictions line up with the input frame

        order = np.argsort(X[time_column])
        if model is not None:
            yhat = model.predict_in_sample() \
                if has_is_train_attr else model.predict(n_periods=X.shape[0])
            yhat = yhat[np.argsort(order)]  # inverse permutation: time order back to input row order
            XX = pd.DataFrame(yhat, columns=['yhat'])

        else:
            XX = pd.DataFrame(np.full((X.shape[0], 1), nan_value), columns=['yhat'])  # invalid model

        # Sync index
        XX.index = X.index
        assert XX.shape[1] == 1
        save_obj(XX, XX_path)
        remove(model_path)  # indicates success, no longer need
        remove(X_path)  # indicates success, no longer need
        return XX_path
    def predict(self, X, **kwargs):
        model, _, _, _ = self.get_model_properties()
        X = dt.Frame(X)
        X = self.inf_impute(X)
        h2o.init(port=config.h2o_recipes_port, log_dir=self.my_log_dir)
        model_path = os.path.join(user_dir(), self.id)
        model_file = os.path.join(model_path,
                                  "h2o_model." + str(uuid.uuid4()) + ".bin")
        os.makedirs(model_path, exist_ok=True)
        with open(model_file, "wb") as f:
            f.write(model)
        model = h2o.load_model(os.path.abspath(model_file))
        test_frame = h2o.H2OFrame(X.to_pandas(), column_types=self.col_types)
        preds_frame = None

        try:
            if kwargs.get("pred_contribs"):
                return model.predict_contributions(test_frame).as_data_frame(
                    header=False).values
            preds_frame = model.predict(test_frame)
            preds = preds_frame.as_data_frame(header=False)
            if self.num_classes == 1:
                return preds.values.ravel()
            elif self.num_classes == 2:
                return preds.iloc[:, -1].values.ravel()
            else:
                return preds.iloc[:, 1:].values
        finally:
            # h2o.remove(self.id) # Cannot remove id, do multiple predictions on same model
            h2o.remove(test_frame)
            remove(model_file)
            if preds_frame is not None:
                h2o.remove(preds_frame)
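
The slicing at the end of predict() above follows the h2o-3 prediction-frame layout for classifiers: a leading predicted-label column, then one probability column per class. A small pandas-only sketch (toy values) of why binary models take the last column and multiclass models take everything after the first:

import pandas as pd

# Stand-in for preds_frame.as_data_frame(header=False) on a binary model:
# column 0 is the predicted label, the rest are per-class probabilities.
preds = pd.DataFrame([["yes", 0.2, 0.8],
                      ["no", 0.7, 0.3]])
binary_scores = preds.iloc[:, -1].values.ravel()  # P(positive class)
multiclass_probs = preds.iloc[:, 1:].values       # all class probabilities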
Example #4
    def transform(self, X: dt.Frame):

        stop_column_name = self.__class__._stop_column_name
        if stop_column_name in X.names:
            del X[:, stop_column_name]
        else:
            return X

        if self.id is None:
            return X

        # self._output_feature_names = list(X.names)
        # self._feature_desc = list(X.names)

        h2o.init(port=config.h2o_recipes_port, log_dir=self.my_log_dir)
        model_path = os.path.join(temporary_files_path, self.id)
        with open(model_path, "wb") as f:
            f.write(self.raw_model_bytes)
        model = h2o.load_model(os.path.abspath(model_path))
        remove(model_path)

        frame = h2o.H2OFrame(X.to_pandas())
        risk_frame = None  # ensure it exists for the finally block even if predict raises
        try:
            risk_frame = model.predict(frame)
            X[:, "risk_score_coxph_{}_{}".format(self.ties, self.max_iterations
                                                 )] = risk_frame.as_data_frame(
                                                     header=False)
            return X
        finally:
            h2o.remove(self.id)
            h2o.remove(frame)
            if risk_frame is not None:
                h2o.remove(risk_frame)
Example #5
    def _transform_async(model_path, X_path, nan_value, tmp_folder):
        """
        Predicts target for a particular time group
        :param model_path: path to the stored model
        :param X_path: Path to the data to predict on
        :param nan_value: Value of the target prior, used when no fitted model is available
        :return: path to the saved frame of predictions
        """
        model = load_obj(model_path)
        XX_path = os.path.join(tmp_folder, "fbprophet_XX" + str(uuid.uuid4()))
        X = load_obj(X_path)

        X_time = X[['ds']].groupby('ds').first().reset_index()
        with suppress_stdout_stderr():
            y_avg = model.predict(X_time)[['ds', 'yhat']]

        # Prophet converts the date column to datetime, so align on 'ds' to merge back
        X_time.sort_values('ds', inplace=True)
        X_time['yhat'] = y_avg['yhat']
        X_time.sort_index(inplace=True)
        # Merge back into original frame on 'ds'
        # pd.merge wipes the index ... so keep it to provide it again
        indices = X.index
        X = pd.merge(left=X, right=X_time[['ds', 'yhat']], on='ds', how='left')
        X.index = indices

        save_obj(X[['yhat']], XX_path)
        remove(model_path)  # indicates success, no longer need
        remove(X_path)  # indicates success, no longer need

        return XX_path
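
A minimal sketch of the merge-back pattern used above, with toy data: predict once per unique date, then left-merge onto the original frame and restore its index, since pd.merge resets it.

import pandas as pd

X = pd.DataFrame({"ds": ["2020-01-02", "2020-01-01", "2020-01-02"]},
                 index=[10, 11, 12])
X_time = X[["ds"]].groupby("ds").first().reset_index()  # unique, sorted dates
X_time["yhat"] = [1.0, 2.0]        # stand-in for model.predict(X_time)[["yhat"]]
indices = X.index                  # pd.merge wipes the index, so keep it
X = pd.merge(left=X, right=X_time, on="ds", how="left")
X.index = indices                  # rows 10/11/12 now carry yhat 2.0/1.0/2.0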
Example #6
    def _fit_async(X_path, grp_hash, tmp_folder):
        """
        Fits a FB Prophet model for a particular time group
        :param X_path: Path to the data used to fit the FB Prophet model
        :param grp_hash: Time group identifier
        :return: time group identifier and path to the pickled model
        """
        np.random.seed(1234)
        random.seed(1234)
        X = load_obj(X_path)
        # Commented for performance, uncomment for debug
        # print("prophet - fitting on data of shape: %s for group: %s" % (str(X.shape), grp_hash))
        if X.shape[0] < 20:
            # print("prophet - small data work-around for group: %s" % grp_hash)
            return grp_hash, None
        # Import FB Prophet package
        mod = importlib.import_module('fbprophet')
        Prophet = getattr(mod, "Prophet")
        model = Prophet()

        with suppress_stdout_stderr():
            model.fit(X[['ds', 'y']])
        model_path = os.path.join(tmp_folder,
                                  "fbprophet_model" + str(uuid.uuid4()))
        save_obj(model, model_path)
        remove(X_path)  # remove to indicate success
        return grp_hash, model_path
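
suppress_stdout_stderr() is used throughout these snippets but defined elsewhere in the recipes; one plausible minimal implementation (an assumption, not the recipes' actual helper) swaps the standard streams for os.devnull while Prophet's Stan backend prints:

import os
import sys
from contextlib import contextmanager

@contextmanager
def suppress_stdout_stderr():
    # Redirect Python-level stdout/stderr to os.devnull for the duration
    # of the block, then restore them even if the body raises.
    with open(os.devnull, "w") as devnull:
        old_out, old_err = sys.stdout, sys.stderr
        sys.stdout, sys.stderr = devnull, devnull
        try:
            yield
        finally:
            sys.stdout, sys.stderr = old_out, old_err

Note this only silences Python-level writes; silencing C-level output from Stan would need file-descriptor redirection (os.dup2).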
    def fit(self, X: dt.Frame, y: np.array = None):
        XX = X[:, self.tgc].to_pandas()
        XX = XX.replace([None, np.nan], 0)
        XX.rename(columns={self.time_column: "ds"}, inplace=True)
        if self.labels is not None:
            y = LabelEncoder().fit(self.labels).transform(y)
        XX['y'] = np.array(y)
        self.nan_value = np.mean(y)  # TODO - store mean per group, not just global
        tgc_wo_time = list(np.setdiff1d(self.tgc, self.time_column))
        if len(tgc_wo_time) > 0:
            XX_grp = XX.groupby(tgc_wo_time)
        else:
            XX_grp = [([None], XX)]
        self.models = {}
        num_tasks = len(XX_grp)

        def processor(out, res):
            out[res[0]] = res[1]

        pool_to_use = small_job_pool
        pool = pool_to_use(logger=None, processor=processor, num_tasks=num_tasks)
        for key, X in XX_grp:
            X_path = os.path.join(temporary_files_path, "fbprophet_X" + str(uuid.uuid4()))
            X = X.reset_index(drop=True)
            save_obj(X, X_path)
            key = key if isinstance(key, list) else [key]
            grp_hash = '_'.join(map(str, key))
            args = (X_path, grp_hash,)
            kwargs = {}
            pool.submit_tryget(None, MyParallelProphetTransformer_fit_async, args=args, kwargs=kwargs, out=self.models)
        pool.finish()
        for k, v in self.models.items():
            self.models[k] = load_obj(v) if v is not None else None
            remove(v)
        return self
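
The pool plumbing above relies on a simple contract: each _fit_async task returns a (grp_hash, model_path) tuple, and the processor callback folds results into the out object handed to the pool. A tiny illustration with a plain dict (names illustrative):

def processor(out, res):
    # res is the (grp_hash, model_path) tuple returned by _fit_async
    out[res[0]] = res[1]

models = {}
processor(models, ("store1_dept7", "/tmp/fbprophet_model_abc"))
# models == {"store1_dept7": "/tmp/fbprophet_model_abc"}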
 def preprocess_image(self, source_img_path, check_only=False):
     try:
         final_img_path = os.path.join(user_dir(), self.uuid,
                                       os.path.basename(source_img_path))
     except Exception:  # we sometimes get np.float32 here, reason unknown
         return None
     delete = False
     if not os.path.exists(final_img_path):
         if not os.path.exists(source_img_path):
             try:
                 self.download(source_img_path, final_img_path)
             except requests.RequestException as e:
                 # print_debug("Error: %s for source_img_path: %s" % (str(e), str(source_img_path)))
                 return None
             delete = False  # keep the downloaded file to avoid re-download and races between procs
         else:
             final_img_path = source_img_path
     if not check_only:
         import h2oaicore.keras as keras
         importlib.reload(keras)
         img = keras.preprocessing.image.load_img(final_img_path,
                                                  target_size=(224, 224))
         if delete:
             remove(final_img_path)
         x = keras.preprocessing.image.img_to_array(img)
         x = np.expand_dims(x, axis=0)
         x = keras.applications.resnet50.preprocess_input(x)
         return x
     else:
         return True
Example #9
    def _fit_async(X_path, grp_hash, tmp_folder, params):
        """
        Fits a FB Prophet model for a particular time group
        :param X_path: Path to the data used to fit the FB Prophet model
        :param grp_hash: Time group identifier
        :return: time group identifier and path to the pickled model
        """
        np.random.seed(1234)
        random.seed(1234)
        X = load_obj(X_path)
        # Commented for performance, uncomment for debug
        # print("prophet - fitting on data of shape: %s for group: %s" % (str(X.shape), grp_hash))
        if X.shape[0] < 20:
            # print("prophet - small data work-around for group: %s" % grp_hash)
            return grp_hash, None
        # Import FB Prophet package
        mod = importlib.import_module('fbprophet')
        Prophet = getattr(mod, "Prophet")
        model = Prophet(yearly_seasonality=True, weekly_seasonality=True, daily_seasonality=True)

        if params["country_holidays"] is not None:
            model.add_country_holidays(country_name=params["country_holidays"])
        if params["monthly_seasonality"]:
            model.add_seasonality(name='monthly', period=30.5, fourier_order=5)

        with suppress_stdout_stderr():
            model.fit(X[['ds', 'y']])
        model_path = os.path.join(tmp_folder, "fbprophet_model" + str(uuid.uuid4()))
        save_obj(model, model_path)
        remove(X_path)  # remove to indicate success
        return grp_hash, model_path
 def _transform_async(model_path, X_path, nan_value):
     model = load_obj(model_path)
     XX_path = os.path.join(temporary_files_path, "fbprophet_XXt" + str(uuid.uuid4()))
     X = load_obj(X_path)
     if model is not None:
         # Facebook Prophet returns the predictions ordered by time,
         # so keep track of the times for each group so that
         # predictions line up with the input frame
         # Make a copy of the input dates
         X_ds = X.copy()
         X_ds['ds'] = pd.to_datetime(X_ds['ds'])
         # Predict with prophet, get the time and prediction and index by time as well
         # In the case date repeats inside of a group (this happens at least in acceptance test)
         # We groupby date and keep the max (prophet returns the same value for a given date)
         # XX will contain the predictions indexed by date
         XX = model.predict(X)[['ds', 'yhat']].groupby('ds').max()
         # Now put yhat in the right order, simply by mapping the dates to the predictions
         X_ds['yhat'] = X_ds["ds"].map(XX['yhat'])
         # Now set XX back to the predictions and drop the index
         XX = X_ds[['yhat']].reset_index(drop=True)
     else:
         XX = pd.DataFrame(np.full((X.shape[0], 1), nan_value), columns=['yhat'])  # invalid models
     XX.index = X.index
     assert XX.shape[1] == 1
     save_obj(XX, XX_path)
     remove(model_path)  # indicates success, no longer need
     remove(X_path)  # indicates success, no longer need
     return XX_path
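
A compact sketch of the map-back-by-date trick above (toy values): index the deduplicated predictions by date, then Series.map aligns one prediction per input row, including repeated dates.

import pandas as pd

preds = pd.DataFrame({"yhat": [1.0, 2.0]},
                     index=pd.to_datetime(["2020-01-01", "2020-01-02"]))
ds = pd.to_datetime(
    pd.Series(["2020-01-02", "2020-01-01", "2020-01-02"]))
yhat = ds.map(preds["yhat"])   # -> 2.0, 1.0, 2.0, one value per input row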
Example #11
 def _transform_async(model_path, X_path, nan_value, tmp_folder):
     """
     Predicts target for a particular time group
     :param model_path: path to the stored model
      :param X_path: Path to the data to predict on
      :param nan_value: Value of the target prior, used when no fitted model is available
      :return: path to the saved frame of predictions
     """
     model = load_obj(model_path)
     XX_path = os.path.join(tmp_folder, "fbprophet_XX" + str(uuid.uuid4()))
     X = load_obj(X_path)
      # Facebook Prophet returns the predictions ordered by time,
      # so keep track of the time order for each group so that
      # predictions line up with the input frame
     order = np.argsort(pd.to_datetime(X["ds"]))
     if model is not None:
         # Run prophet
         yhat = model.predict(X)['yhat'].values
         XX = pd.DataFrame(yhat, columns=['yhat'])
     else:
         XX = pd.DataFrame(np.full((X.shape[0], 1), nan_value), columns=['yhat'])  # invalid models
     XX.index = X.index[order]
     assert XX.shape[1] == 1
     save_obj(XX, XX_path)
     remove(model_path)  # indicates success, no longer need
     remove(X_path)  # indicates success, no longer need
     return XX_path
    def predict(self, X, **kwargs):
        model, _, _, _ = self.get_model_properties()
        X = dt.Frame(X)
        h2o.init(port=config.h2o_recipes_port, log_dir=self.my_log_dir)
        model_path = os.path.join(temporary_files_path, self.id)
        with open(model_path, "wb") as f:
            f.write(model)
        model = h2o.load_model(os.path.abspath(model_path))
        remove(model_path)
        test_frame = h2o.H2OFrame(X.to_pandas(), column_types=self.col_types)
        preds_frame = None

        try:
            if kwargs.get("pred_contribs"):
                return model.predict_contributions(test_frame).as_data_frame(
                    header=False).values
            preds_frame = model.predict(test_frame)
            preds = preds_frame.as_data_frame(header=False)
            if self.num_classes == 1:
                return preds.values.ravel()
            elif self.num_classes == 2:
                return preds.iloc[:, -1].values.ravel()
            else:
                return preds.iloc[:, 1:].values
        finally:
            h2o.remove(self.id)
            h2o.remove(test_frame)
            if preds_frame is not None:
                h2o.remove(preds_frame)
    def _fit_async(X_path, grp_hash, time_column, tmp_folder):
        """
        Fits an ARIMA model for a particular time group
        :param X_path: Path to the data used to fit the ARIMA model
        :param grp_hash: Time group identifier
        :param time_column: Name of the time column in the input data
        :return: time group identifier and path to the pickled model
        """
        np.random.seed(1234)
        random.seed(1234)
        X = load_obj(X_path)

        pm = importlib.import_module('pmdarima')
        with suppress_stdout_stderr():
            try:
                order = np.argsort(X[time_column])
                model = pm.auto_arima(X['y'].values[order],
                                      error_action='ignore')
            except Exception:
                model = None
        model_path = os.path.join(tmp_folder,
                                  "autoarima_model" + str(uuid.uuid4()))
        save_obj(model, model_path)
        remove(X_path)  # remove to indicate success
        return grp_hash, model_path
    def predict(self, X, **kwargs):
        model, _, _, _ = self.get_model_properties()
        X = dt.Frame(X)
        X = self.inf_impute(X)
        h2o.init(port=config.h2o_recipes_port, log_dir=self.my_log_dir)
        model_path = os.path.join(exp_dir(), self.id)
        model_file = os.path.join(model_path, "h2o_model." + str(uuid.uuid4()) + ".bin")
        os.makedirs(model_path, exist_ok=True)
        with open(model_file, "wb") as f:
            f.write(model)
        model = h2o.load_model(os.path.abspath(model_file))
        test_frame = h2o.H2OFrame(X.to_pandas(), column_types=self.col_types)
        preds_frame = None

        try:
            if kwargs.get("pred_contribs"):
                orig_cols = list(X.names)
                df_varimp_orig, df_varimp, df_varimp_merged = self.get_df_varimp(model, orig_cols)
                dfmap = {k: v for k, v in zip(df_varimp_orig.index, df_varimp.index)}
                preds_df = model.predict_contributions(test_frame).as_data_frame(header=False)
                # this only has to work for regression and binary since h2o-3 does not support multiclass shapley
                preds_df.columns = [dfmap.get(x, x) for x in preds_df.columns]
                preds_df = preds_df.groupby(preds_df.columns, axis=1).sum()
                return preds_df.values
            preds_frame = model.predict(test_frame)
            preds = preds_frame.as_data_frame(header=False)

            is_final = 'IS_FINAL' in kwargs
            struuid = str(uuid.uuid4())
            json_file = os.path.join(exp_dir(), 'stderr_is_final_%s_%s.json' % (is_final, struuid))

            if self.num_classes == 1:
                if self.doing_p_values():
                    df = preds.iloc[:, 1]
                    with open(json_file, "wt") as f:
                        pd.set_option('precision', 16)
                        f.write(json.dumps(json.loads(df.to_json()), indent=4))
                        pd.set_option('precision', 6)
                    return preds.iloc[:, 0].values.ravel()
                else:
                    return preds.values.ravel()
            elif self.num_classes == 2:
                if self.doing_p_values():
                    df = preds.iloc[:, 2]
                    with open(json_file, "wt") as f:
                        pd.set_option('precision', 16)
                        f.write(json.dumps(json.loads(df.to_json()), indent=4))
                        pd.set_option('precision', 6)
                    return preds.iloc[:, -1 - 1].values.ravel()
                else:
                    return preds.iloc[:, -1].values.ravel()
            else:
                return preds.iloc[:, 1:].values
        finally:
            # h2o.remove(self.id) # Cannot remove id, do multiple predictions on same model
            h2o.remove(test_frame)
            remove(model_file)
            if preds_frame is not None:
                h2o.remove(preds_frame)
Example #15
 def atomic_copy(self, src=None, dst=None):
     import uuid
     my_uuid = uuid.uuid4()
     src_tmp = src + str(my_uuid)
     shutil.copy(src, src_tmp)
     os.makedirs(os.path.dirname(dst), exist_ok=True)
     self.atomic_move(src_tmp, dst)
     remove(src_tmp)
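
atomic_move() is assumed above rather than shown; a plausible sketch (an assumption about the helper, not its actual source) builds on os.replace, which is atomic when src and dst live on the same filesystem:

import os
import shutil

def atomic_move(src, dst):
    try:
        os.replace(src, dst)    # atomic rename within one filesystem
    except OSError:
        shutil.move(src, dst)   # cross-device fallback, not atomic

Copying to a uniquely named temp file first (as atomic_copy does with a uuid suffix) means readers never observe a half-written dst.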
    def transform(self, X: dt.Frame):
        X = X.to_pandas()
        X = X.replace([None, np.nan], 0)
        XX = X[self.tgc].copy()
        XX.rename(columns={self.time_column: "ds"}, inplace=True)
        tgc_wo_time = list(np.setdiff1d(self.tgc, self.time_column))
        if len(tgc_wo_time) > 0:
            XX_grp = XX.groupby(tgc_wo_time)
        else:
            XX_grp = [([None], XX)]
        assert len(XX_grp) > 0
        num_tasks = len(XX_grp)

        def processor(out, res):
            out.append(res)

        pool_to_use = small_job_pool
        pool = pool_to_use(logger=None,
                           processor=processor,
                           max_workers=self.n_jobs,
                           num_tasks=num_tasks)
        XX_paths = []
        model_paths = []
        for key, X in XX_grp:
            key = key if isinstance(key, list) else [key]
            grp_hash = '_'.join(map(str, key))
            X_path = os.path.join(temporary_files_path,
                                  "fbprophet_Xt" + str(uuid.uuid4()))
            print("prophet - transforming data of shape: %s for group: %s" %
                  (str(X.shape), grp_hash))
            if grp_hash in self.models:
                model = self.models[grp_hash]
                model_path = os.path.join(
                    temporary_files_path,
                    "fbprophet_modelt" + str(uuid.uuid4()))
                save_obj(model, model_path)
                save_obj(X, X_path)
                model_paths.append(model_path)

                args = (model_path, X_path, self.nan_value)
                kwargs = {}
                pool.submit_tryget(
                    None,
                    MyParallelProphetTransformer_transform_async,
                    args=args,
                    kwargs=kwargs,
                    out=XX_paths)
            else:
                XX = pd.DataFrame(np.full((X.shape[0], 1), self.nan_value),
                                  columns=['yhat'])  # unseen groups
                save_obj(XX, X_path)
                XX_paths.append(X_path)
        pool.finish()
        XX = pd.concat((load_obj(XX_path) for XX_path in XX_paths),
                       axis=0).sort_index()
        for p in XX_paths + model_paths:
            remove(p)
        return XX
    def transform(self, X: dt.Frame):
        X = X.to_pandas()
        XX = X[self.tgc].copy()
        tgc_wo_time = list(np.setdiff1d(self.tgc, self.time_column))
        if len(tgc_wo_time) > 0:
            XX_grp = XX.groupby(tgc_wo_time)
        else:
            XX_grp = [([None], XX)]

        assert len(XX_grp) > 0
        num_tasks = len(XX_grp)

        def processor(out, res):
            out.append(res)

        pool_to_use = small_job_pool
        pool = pool_to_use(logger=None, processor=processor, num_tasks=num_tasks)
        XX_paths = []
        model_paths = []
        nb_groups = len(XX_grp)
        for _i_g, (key, X) in enumerate(XX_grp):
            if (_i_g + 1) % max(1, nb_groups // 20) == 0:
                print("Auto ARIMA - ", 100 * (_i_g + 1) // nb_groups, " % of Groups Transformed")
            key = key if isinstance(key, list) else [key]
            grp_hash = '_'.join(map(str, key))
            X_path = os.path.join(temporary_files_path, "autoarima_Xt" + str(uuid.uuid4()))

            # Commented for performance, uncomment for debug
            # print("prophet - transforming data of shape: %s for group: %s" % (str(X.shape), grp_hash))
            if grp_hash in self.models:
                model = self.models[grp_hash]
                model_path = os.path.join(temporary_files_path, "autoarima_modelt" + str(uuid.uuid4()))
                save_obj(model, model_path)
                save_obj(X, X_path)
                model_paths.append(model_path)

                args = (model_path, X_path, self.nan_value, hasattr(self, 'is_train'), self.time_column,)
                kwargs = {}
                pool.submit_tryget(None, MyParallelAutoArimaTransformer_transform_async, args=args, kwargs=kwargs,
                                   out=XX_paths)
            else:
                # Don't go through pools
                XX = pd.DataFrame(np.full((X.shape[0], 1), self.nan_value), columns=['yhat'])  # unseen groups
                # Sync indices
                XX.index = X.index
                save_obj(XX, X_path)
                XX_paths.append(X_path)
        pool.finish()
        XX = pd.concat((load_obj(XX_path) for XX_path in XX_paths), axis=0).sort_index()
        for p in XX_paths + model_paths:
            remove(p)
        return XX
    def _fit_async(X_path, grp_hash, tmp_folder, params, cap):
        """
        Fits a FB Prophet model for a particular time group
        :param X_path: Path to the data used to fit the FB Prophet model
        :param grp_hash: Time group identifier
        :return: time group identifier and path to the pickled model
        """
        np.random.seed(1234)
        random.seed(1234)
        X = load_obj(X_path)
        # Commented for performance, uncomment for debug
        # print("prophet - fitting on data of shape: %s for group: %s" % (str(X.shape), grp_hash))
        if X.shape[0] < 20:
            return grp_hash, None

        # Import FB Prophet package
        mod = importlib.import_module('fbprophet')
        Prophet = getattr(mod, "Prophet")

        # Fit current model and prior
        nrows = X[['ds', 'y']].shape[0]
        n_changepoints = max(1, int(nrows * (2 / 3)))
        if n_changepoints < 25:
            model = Prophet(growth=params["growth"],
                            n_changepoints=n_changepoints)
        else:
            model = Prophet(growth=params["growth"])
        # Add params
        if params["country_holidays"] is not None:
            model.add_country_holidays(country_name=params["country_holidays"])
        if params["monthly_seasonality"]:
            model.add_seasonality(name='monthly',
                                  period=30.5,
                                  fourier_order=params["monthly_seasonality"])
        if params["quarterly_seasonality"]:
            model.add_seasonality(
                name='quarterly',
                period=92,
                fourier_order=params["quarterly_seasonality"])

        with suppress_stdout_stderr():
            if params["growth"] == "logistic":
                X["cap"] = cap
                model.fit(X[['ds', 'y', 'cap']])
            else:
                model.fit(X[['ds', 'y']])

        model_path = os.path.join(tmp_folder,
                                  "fbprophet_model" + str(uuid.uuid4()))
        save_obj(model, model_path)
        remove(X_path)  # remove to indicate success
        return grp_hash, model_path
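
The growth == "logistic" branch above reflects a Prophet requirement: logistic growth needs a cap column giving the carrying capacity for every row, which is why the snippet adds X["cap"] = cap before fitting. A minimal sketch with toy data (assuming fbprophet is importable, as elsewhere in these snippets):

import pandas as pd
from fbprophet import Prophet

df = pd.DataFrame({"ds": pd.date_range("2020-01-01", periods=60),
                   "y": range(60)})
df["cap"] = 100.0                      # carrying capacity, required per row
model = Prophet(growth="logistic")
model.fit(df[["ds", "y", "cap"]])      # omitting 'cap' here raises an error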
 def __init__(self, **kwargs):
     super().__init__(**kwargs)
     import spacy
     env_dir = ContribLoader._env_dir.resolve()
     lock_file = os.path.join(env_dir, "spacy.lock")
     try:
         with filelock.FileLock(lock_file):
             from spacy.cli import download
             download('en_core_web_sm', False, "--install-option=--prefix=%s" % ContribLoader._env_dir.resolve())
             self.nlp = spacy.load('en_core_web_sm')
     finally:
         remove(lock_file)
     self.ne_types = {"PERSON", "ORG", "GPE", "LOC", "PRODUCT", "EVENT", "DATE"}
 def _transform_async(model_path, X_path, nan_value):
     model = load_obj(model_path)
     XX_path = os.path.join(temporary_files_path,
                            "fbprophet_XXt" + str(uuid.uuid4()))
     X = load_obj(X_path)
     if model is not None:
         XX = model.predict(X[['ds']])[['yhat']]
     else:
         XX = pd.DataFrame(np.full((X.shape[0], 1), nan_value),
                           columns=['yhat'])  # invalid models
     XX.index = X.index
     assert XX.shape[1] == 1
     save_obj(XX, XX_path)
     remove(model_path)  # indicates success, no longer need
     remove(X_path)  # indicates success, no longer need
     return XX_path
    def _fit_async(X_path, grp_hash, time_column):
        np.random.seed(1234)
        random.seed(1234)
        X = load_obj(X_path)

        pm = importlib.import_module('pmdarima')
        with suppress_stdout_stderr():
            try:
                order = np.argsort(X[time_column])
                model = pm.auto_arima(X['y'].values[order], error_action='ignore')
            except Exception:
                model = None
        model_path = os.path.join(temporary_files_path, "autoarima_model" + str(uuid.uuid4()))
        save_obj(model, model_path)
        remove(X_path)  # remove to indicate success
        return grp_hash, model_path
Example #22
    def transform(self, X: dt.Frame):
        h2o.init(port=config.h2o_recipes_port)
        model_path = os.path.join(temporary_files_path, self.id)
        with open(model_path, "wb") as f:
            f.write(self.raw_model_bytes)
        model = h2o.load_model(os.path.abspath(model_path))
        remove(model_path)
        frame = h2o.H2OFrame(X.to_pandas())
        anomaly_frame = None

        try:
            anomaly_frame = model.anomaly(frame)
            anomaly_frame_df = anomaly_frame.as_data_frame(header=False)
            return anomaly_frame_df
        finally:
            h2o.remove(self.id)
            h2o.remove(anomaly_frame)
 def _fit_async(X_path, grp_hash):
     np.random.seed(1234)
     random.seed(1234)
     X = load_obj(X_path)
     # Commented for performance, uncomment for debug
     # print("prophet - fitting on data of shape: %s for group: %s" % (str(X.shape), grp_hash))
     if X.shape[0] < 20:
         # print("prophet - small data work-around for group: %s" % grp_hash)
         return grp_hash, None
     mod = importlib.import_module('fbprophet')
     Prophet = getattr(mod, "Prophet")
     model = Prophet()
     with suppress_stdout_stderr():
         model.fit(X[['ds', 'y']])
     model_path = os.path.join(temporary_files_path, "fbprophet_model" + str(uuid.uuid4()))
     save_obj(model, model_path)
     remove(X_path)  # remove to indicate success
     return grp_hash, model_path
Example #24
    def transform(self, X: dt.Frame):
        h2o.init(port=config.h2o_recipes_port)
        model_path = os.path.join(user_dir(), self.id)
        model_file = os.path.join(model_path,
                                  "h2o_model." + str(uuid.uuid4()) + ".bin")
        os.makedirs(model_path, exist_ok=True)
        with open(model_file, "wb") as f:
            f.write(self.raw_model_bytes)
        model = h2o.load_model(os.path.abspath(model_file))
        frame = h2o.H2OFrame(X.to_pandas())
        anomaly_frame = None

        try:
            anomaly_frame = model.anomaly(frame)
            anomaly_frame_df = anomaly_frame.as_data_frame(header=False)
            return anomaly_frame_df
        finally:
            remove(model_file)
            h2o.remove(self.id)
            h2o.remove(anomaly_frame)
Example #25
    def _fit_async(data_path, grp_hash, tmp_folder, params):
        """
        Fits a FB Prophet model for a particular time group
        :param data_path: Path to the data used to fit the FB Prophet model
        :param grp_hash: Time group identifier
        :return: time group identifier and path to the pickled model
        """
        np.random.seed(1234)
        random.seed(1234)
        X = load_obj(data_path)

        # if X.shape[0] < 20:
        #     return grp_hash, None
        # Import FB Prophet package
        mod = importlib.import_module('fbprophet')
        Prophet = getattr(mod, "Prophet")
        model = fit_prophet_model(Prophet, X, params)
        model_path = os.path.join(tmp_folder, "fbprophet_model" + str(uuid.uuid4()))
        save_obj(model, model_path)
        remove(data_path)  # remove to indicate success
        return grp_hash, model_path
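
fit_prophet_model() is referenced above but defined elsewhere in the recipe; a hypothetical reconstruction consistent with the params dicts used in these snippets (growth, country_holidays, monthly_seasonality) might look like:

def fit_prophet_model(Prophet, X, params):
    # Hypothetical helper: build a Prophet instance from params and fit on ds/y.
    model = Prophet(growth=params.get("growth", "linear"))
    if params.get("country_holidays") is not None:
        model.add_country_holidays(country_name=params["country_holidays"])
    if params.get("monthly_seasonality"):
        model.add_seasonality(name="monthly", period=30.5,
                              fourier_order=params["monthly_seasonality"])
    model.fit(X[["ds", "y"]])
    return model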
    def fit(self, X: dt.Frame, y: np.array = None):
        pm = importlib.import_module('pmdarima')
        self.models = {}
        X = X.to_pandas()
        XX = X[self.tgc].copy()
        XX['y'] = np.array(y)
        self.nan_value = np.mean(y)
        self.ntrain = X.shape[0]
        tgc_wo_time = list(np.setdiff1d(self.tgc, self.time_column))
        if len(tgc_wo_time) > 0:
            XX_grp = XX.groupby(tgc_wo_time)
        else:
            XX_grp = [([None], XX)]

        num_tasks = len(XX_grp)

        def processor(out, res):
            out[res[0]] = res[1]

        pool_to_use = small_job_pool
        pool = pool_to_use(logger=None, processor=processor, num_tasks=num_tasks)
        nb_groups = len(XX_grp)
        for _i_g, (key, X) in enumerate(XX_grp):
            if (_i_g + 1) % max(1, nb_groups // 20) == 0:
                print("Auto ARIMA - ", 100 * (_i_g + 1) // nb_groups, " % of Groups Fitted")
            X_path = os.path.join(temporary_files_path, "autoarima_X" + str(uuid.uuid4()))
            X = X.reset_index(drop=True)
            save_obj(X, X_path)
            key = key if isinstance(key, list) else [key]
            grp_hash = '_'.join(map(str, key))
            args = (X_path, grp_hash, self.time_column,)
            kwargs = {}
            pool.submit_tryget(None, MyParallelAutoArimaTransformer_fit_async, args=args, kwargs=kwargs, out=self.models)
        pool.finish()

        for k, v in self.models.items():
            self.models[k] = load_obj(v) if v is not None else None
            remove(v)
        return self
Example #27
 def fit_transform(self, X: dt.Frame, y: np.array = None):
     h2o.init(port=config.h2o_recipes_port)
     model = H2OAutoEncoderEstimator(activation='tanh',
                                     epochs=1,
                                     hidden=[50, 50],
                                     reproducible=True,
                                     seed=1234)
     frame = h2o.H2OFrame(X.to_pandas())
     model_path = None
     try:
         model.train(x=list(range(X.ncols)), training_frame=frame)
         self.id = model.model_id
         model_path = os.path.join(temporary_files_path,
                                   "h2o_model." + str(uuid.uuid4()))
         model_path = h2o.save_model(model=model, path=model_path)
         with open(model_path, "rb") as f:
             self.raw_model_bytes = f.read()
         return model.anomaly(frame).as_data_frame(header=False)
     finally:
         if model_path is not None:
             remove(model_path)
         h2o.remove(model)
    def predict(self, X, **kwargs):
        model, _, _, _ = self.get_model_properties()
        X = dt.Frame(X)
        h2o.init(port=config.h2o_recipes_port, log_dir=self.my_log_dir)
        model_path = os.path.join(user_dir(), self.id)
        with open(model_path, "wb") as f:
            f.write(model)
        model = h2o.load_model(os.path.abspath(model_path))
        remove(model_path)
        test_frame = h2o.H2OFrame(X.to_pandas(), column_types=self.col_types)
        preds_frame = None

        try:
            preds_frame = model.predict(test_frame)
            preds = preds_frame.as_data_frame(header=False)

            return preds.values.ravel()

        finally:
            h2o.remove(self.id)
            h2o.remove(test_frame)
            if preds_frame is not None:
                h2o.remove(preds_frame)
Example #29
 def _transform_async(model_path, X_path, nan_value):
     model = load_obj(model_path)
     XX_path = os.path.join(temporary_files_path,
                            "fbprophet_XXt" + str(uuid.uuid4()))
     X = load_obj(X_path)
      # Facebook Prophet returns the predictions ordered by time,
      # so keep track of the time order for each group so that
      # predictions line up with the input frame
     order = np.argsort(pd.to_datetime(X["ds"]))
     if model is not None:
         # Run prophet
         yhat = model.predict(X)['yhat'].values
         XX = pd.DataFrame(yhat, columns=['yhat'])
     else:
         XX = pd.DataFrame(np.full((X.shape[0], 1), nan_value),
                           columns=['yhat'])  # invalid models
     XX.index = X.index[order]
     assert XX.shape[1] == 1
     save_obj(XX, XX_path)
     remove(model_path)  # indicates success, no longer need
     remove(X_path)  # indicates success, no longer need
     return XX_path
    def transform(self, X: dt.Frame, **kwargs):
        """
        Uses fitted models (1 per time group) to predict the target
        If self.is_train exists, we are doing in-sample predictions;
        if it does not, ARIMA is used to predict the future.
        :param X: Datatable Frame containing the features
        :return: ARIMA predictions
        """
        # Get the logger if it exists
        logger = None
        if self.context and self.context.experiment_id:
            logger = make_experiment_logger(
                experiment_id=self.context.experiment_id,
                tmp_dir=self.context.tmp_dir,
                experiment_tmp_dir=self.context.experiment_tmp_dir)

        tmp_folder = self._create_tmp_folder(logger)

        n_jobs = self._get_n_jobs(logger, **kwargs)

        X = X.to_pandas()
        XX = X[self.tgc].copy()
        tgc_wo_time = list(np.setdiff1d(self.tgc, self.time_column))
        if len(tgc_wo_time) > 0:
            XX_grp = XX.groupby(tgc_wo_time)
        else:
            XX_grp = [([None], XX)]

        assert len(XX_grp) > 0
        num_tasks = len(XX_grp)

        def processor(out, res):
            out.append(res)

        pool_to_use = small_job_pool
        loggerinfo(logger,
                   "Arima will use {} workers for transform".format(n_jobs))
        pool = pool_to_use(logger=None,
                           processor=processor,
                           num_tasks=num_tasks,
                           max_workers=n_jobs)

        XX_paths = []
        model_paths = []
        nb_groups = len(XX_grp)
        for _i_g, (key, X) in enumerate(XX_grp):
            # Just print where we are in the process of fitting models
            if (_i_g + 1) % max(1, nb_groups // 20) == 0:
                loggerinfo(
                    logger, "Auto ARIMA : %d%% of groups transformed" %
                    (100 * (_i_g + 1) // nb_groups))

            # Create time group key to store and retrieve fitted models
            key = key if isinstance(key, list) else [key]
            grp_hash = '_'.join(map(str, key))
            # Create file path to store data and pass it to the fitting pool
            X_path = os.path.join(tmp_folder,
                                  "autoarima_Xt" + str(uuid.uuid4()))

            # Commented for performance, uncomment for debug
            # print("ARIMA - transforming data of shape: %s for group: %s" % (str(X.shape), grp_hash))
            if grp_hash in self.models:
                model = self.models[grp_hash]
                model_path = os.path.join(
                    tmp_folder, "autoarima_modelt" + str(uuid.uuid4()))
                save_obj(model, model_path)
                save_obj(X, X_path)
                model_paths.append(model_path)

                args = (model_path, X_path, self.nan_value,
                        hasattr(self, 'is_train'), self.time_column,
                        self.pred_gap, tmp_folder)
                kwargs = {}
                pool.submit_tryget(
                    None,
                    MyParallelAutoArimaTransformer_transform_async,
                    args=args,
                    kwargs=kwargs,
                    out=XX_paths)
            else:
                # Don't go through pools
                XX = pd.DataFrame(np.full((X.shape[0], 1), self.nan_value),
                                  columns=['yhat'])  # unseen groups
                # Sync indices
                XX.index = X.index
                save_obj(XX, X_path)
                XX_paths.append(X_path)
        pool.finish()
        XX = pd.concat((load_obj(XX_path) for XX_path in XX_paths),
                       axis=0).sort_index()
        for p in XX_paths + model_paths:
            remove(p)

        self._clean_tmp_folder(logger, tmp_folder)

        return XX
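
The final reassembly works because every per-group result frame kept the original row indices (or had them reattached): concatenating the pieces and sorting by index restores the input row order regardless of which group finished first. In miniature:

import pandas as pd

part_a = pd.DataFrame({"yhat": [0.1, 0.3]}, index=[0, 2])  # one time group
part_b = pd.DataFrame({"yhat": [0.2]}, index=[1])          # another group
XX = pd.concat([part_a, part_b], axis=0).sort_index()
# XX["yhat"] -> 0.1, 0.2, 0.3, matching the original row order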