def _transform_async(model_path, X_path, nan_value, has_is_train_attr, time_column):
    model = load_obj(model_path)
    XX_path = os.path.join(temporary_files_path, "autoarima_XXt" + str(uuid.uuid4()))
    X = load_obj(X_path)
    # ARIMA returns the predictions ordered by time
    # So we should keep track of the time order for each group so that
    # predictions are ordered the same as the input frame
    # Keep track of the order
    order = np.argsort(X[time_column])
    if model is not None:
        yhat = model.predict_in_sample() \
            if has_is_train_attr else model.predict(n_periods=X.shape[0])
        yhat = yhat[order]
        XX = pd.DataFrame(yhat, columns=['yhat'])
    else:
        XX = pd.DataFrame(np.full((X.shape[0], 1), nan_value), columns=['yhat'])  # invalid model
    # Sync index
    XX.index = X.index
    assert XX.shape[1] == 1
    save_obj(XX, XX_path)
    remove(model_path)  # indicates success, no longer needed
    remove(X_path)  # indicates success, no longer needed
    return XX_path
def _transform_async(model_path, X_path, nan_value, has_is_train_attr, time_column, tmp_folder):
    """
    Predicts target for a particular time group
    :param model_path: path to the stored model
    :param X_path: Path to the data used to fit the ARIMA model
    :param nan_value: Value of target prior, used when no fitted model has been found
    :param has_is_train_attr: indicates if we predict in-sample or out-of-sample
    :param time_column: Name of the time column in the input data
    :return: path to the pickled predictions frame
    """
    model = load_obj(model_path)
    XX_path = os.path.join(tmp_folder, "autoarima_XXt" + str(uuid.uuid4()))
    X = load_obj(X_path)
    # ARIMA returns the predictions ordered by time
    # So we should keep track of the time order for each group so that
    # predictions are ordered the same as the input frame
    # Keep track of the order
    order = np.argsort(X[time_column])
    if model is not None:
        yhat = model.predict_in_sample() \
            if has_is_train_attr else model.predict(n_periods=X.shape[0])
        yhat = yhat[order]
        XX = pd.DataFrame(yhat, columns=['yhat'])
    else:
        XX = pd.DataFrame(np.full((X.shape[0], 1), nan_value), columns=['yhat'])  # invalid model
    # Sync index
    XX.index = X.index
    assert XX.shape[1] == 1
    save_obj(XX, XX_path)
    remove(model_path)  # indicates success, no longer needed
    remove(X_path)  # indicates success, no longer needed
    return XX_path
def predict(self, X, **kwargs):
    model, _, _, _ = self.get_model_properties()
    X = dt.Frame(X)
    X = self.inf_impute(X)
    h2o.init(port=config.h2o_recipes_port, log_dir=self.my_log_dir)
    model_path = os.path.join(user_dir(), self.id)
    model_file = os.path.join(model_path, "h2o_model." + str(uuid.uuid4()) + ".bin")
    os.makedirs(model_path, exist_ok=True)
    with open(model_file, "wb") as f:
        f.write(model)
    model = h2o.load_model(os.path.abspath(model_file))
    test_frame = h2o.H2OFrame(X.to_pandas(), column_types=self.col_types)
    preds_frame = None
    try:
        if kwargs.get("pred_contribs"):
            return model.predict_contributions(test_frame).as_data_frame(header=False).values
        preds_frame = model.predict(test_frame)
        preds = preds_frame.as_data_frame(header=False)
        if self.num_classes == 1:
            return preds.values.ravel()
        elif self.num_classes == 2:
            return preds.iloc[:, -1].values.ravel()
        else:
            return preds.iloc[:, 1:].values
    finally:
        # h2o.remove(self.id)  # Cannot remove id, do multiple predictions on same model
        h2o.remove(test_frame)
        remove(model_file)
        if preds_frame is not None:
            h2o.remove(preds_frame)
def transform(self, X: dt.Frame):
    stop_column_name = self.__class__._stop_column_name
    if stop_column_name in X.names:
        del X[:, stop_column_name]
    else:
        return X
    if self.id is None:
        return X
    # self._output_feature_names = list(X.names)
    # self._feature_desc = list(X.names)
    h2o.init(port=config.h2o_recipes_port, log_dir=self.my_log_dir)
    model_path = os.path.join(temporary_files_path, self.id)
    with open(model_path, "wb") as f:
        f.write(self.raw_model_bytes)
    model = h2o.load_model(os.path.abspath(model_path))
    remove(model_path)
    frame = h2o.H2OFrame(X.to_pandas())
    risk_frame = None  # initialize so the finally block never sees an unbound name
    try:
        risk_frame = model.predict(frame)
        X[:, "risk_score_coxph_{}_{}".format(self.ties, self.max_iterations)] = risk_frame.as_data_frame(header=False)
        return X
    finally:
        h2o.remove(self.id)
        h2o.remove(frame)
        if risk_frame is not None:
            h2o.remove(risk_frame)
def _transform_async(model_path, X_path, nan_value, tmp_folder):
    """
    Predicts target for a particular time group
    :param model_path: path to the stored model
    :param X_path: Path to the data used to fit the FB Prophet model
    :param nan_value: Value of target prior, used when no fitted model has been found
    :return: path to the pickled predictions frame
    """
    model = load_obj(model_path)
    XX_path = os.path.join(tmp_folder, "fbprophet_XX" + str(uuid.uuid4()))
    X = load_obj(X_path)
    X_time = X[['ds']].groupby('ds').first().reset_index()
    with suppress_stdout_stderr():
        y_avg = model.predict(X_time)[['ds', 'yhat']]
    # Prophet transforms the date column to datetime so we need to transform that to merge back
    X_time.sort_values('ds', inplace=True)
    X_time['yhat'] = y_avg['yhat']
    X_time.sort_index(inplace=True)
    # Merge back into original frame on 'ds'
    # pd.merge wipes the index ... so keep it to provide it again
    indices = X.index
    X = pd.merge(left=X, right=X_time[['ds', 'yhat']], on='ds', how='left')
    X.index = indices
    save_obj(X[['yhat']], XX_path)
    remove(model_path)  # indicates success, no longer needed
    remove(X_path)  # indicates success, no longer needed
    return XX_path
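# The index-preservation trick above is easy to get wrong, so here is a
# minimal, self-contained pandas sketch of it (toy data, not part of the
# recipe): pd.merge resets the index, so stash it before merging and
# reassign it afterwards.
import pandas as pd

left = pd.DataFrame({'ds': ['2020-01-01', '2020-01-02']}, index=[10, 20])
right = pd.DataFrame({'ds': ['2020-01-01', '2020-01-02'], 'yhat': [1.0, 2.0]})
indices = left.index
merged = pd.merge(left=left, right=right, on='ds', how='left')
merged.index = indices  # restore the original row labels
print(merged)  # rows are still labeled 10 and 20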
def _fit_async(X_path, grp_hash, tmp_folder):
    """
    Fits a FB Prophet model for a particular time group
    :param X_path: Path to the data used to fit the FB Prophet model
    :param grp_hash: Time group identifier
    :return: time group identifier and path to the pickled model
    """
    np.random.seed(1234)
    random.seed(1234)
    X = load_obj(X_path)
    # Commented for performance, uncomment for debug
    # print("prophet - fitting on data of shape: %s for group: %s" % (str(X.shape), grp_hash))
    if X.shape[0] < 20:
        # print("prophet - small data work-around for group: %s" % grp_hash)
        return grp_hash, None
    # Import FB Prophet package
    mod = importlib.import_module('fbprophet')
    Prophet = getattr(mod, "Prophet")
    model = Prophet()
    with suppress_stdout_stderr():
        model.fit(X[['ds', 'y']])
    model_path = os.path.join(tmp_folder, "fbprophet_model" + str(uuid.uuid4()))
    save_obj(model, model_path)
    remove(X_path)  # remove to indicate success
    return grp_hash, model_path
def fit(self, X: dt.Frame, y: np.array = None):
    XX = X[:, self.tgc].to_pandas()
    XX = XX.replace([None, np.nan], 0)
    XX.rename(columns={self.time_column: "ds"}, inplace=True)
    if self.labels is not None:
        y = LabelEncoder().fit(self.labels).transform(y)
    XX['y'] = np.array(y)
    self.nan_value = np.mean(y)  # TODO - store mean per group, not just global
    tgc_wo_time = list(np.setdiff1d(self.tgc, self.time_column))
    if len(tgc_wo_time) > 0:
        XX_grp = XX.groupby(tgc_wo_time)
    else:
        XX_grp = [([None], XX)]
    self.models = {}
    num_tasks = len(XX_grp)

    def processor(out, res):
        out[res[0]] = res[1]

    pool_to_use = small_job_pool
    pool = pool_to_use(logger=None, processor=processor, num_tasks=num_tasks)
    for key, X in XX_grp:
        X_path = os.path.join(temporary_files_path, "fbprophet_X" + str(uuid.uuid4()))
        X = X.reset_index(drop=True)
        save_obj(X, X_path)
        key = key if isinstance(key, list) else [key]
        grp_hash = '_'.join(map(str, key))
        args = (X_path, grp_hash,)
        kwargs = {}
        pool.submit_tryget(None, MyParallelProphetTransformer_fit_async,
                           args=args, kwargs=kwargs, out=self.models)
    pool.finish()
    for k, v in self.models.items():
        self.models[k] = load_obj(v) if v is not None else None
        remove(v)
    return self
def preprocess_image(self, source_img_path, check_only=False):
    try:
        final_img_path = os.path.join(user_dir(), self.uuid, os.path.basename(source_img_path))
    except:  # we are sometimes getting np.float32, why?
        return None
    delete = False
    if not os.path.exists(final_img_path):
        if not os.path.exists(source_img_path):
            try:
                self.download(source_img_path, final_img_path)
            except requests.RequestException as e:
                # print_debug("Error: %s for source_img_path: %s" % (str(e), str(source_img_path)))
                return None
            delete = False  # True to avoid re-download or a race condition between multiple procs
        else:
            final_img_path = source_img_path
    if not check_only:
        import h2oaicore.keras as keras
        importlib.reload(keras)
        img = keras.preprocessing.image.load_img(final_img_path, target_size=(224, 224))
        if delete:
            remove(final_img_path)
        x = keras.preprocessing.image.img_to_array(img)
        x = np.expand_dims(x, axis=0)
        x = keras.applications.resnet50.preprocess_input(x)
        return x
    else:
        return True
def _fit_async(X_path, grp_hash, tmp_folder, params):
    """
    Fits a FB Prophet model for a particular time group
    :param X_path: Path to the data used to fit the FB Prophet model
    :param grp_hash: Time group identifier
    :return: time group identifier and path to the pickled model
    """
    np.random.seed(1234)
    random.seed(1234)
    X = load_obj(X_path)
    # Commented for performance, uncomment for debug
    # print("prophet - fitting on data of shape: %s for group: %s" % (str(X.shape), grp_hash))
    if X.shape[0] < 20:
        # print("prophet - small data work-around for group: %s" % grp_hash)
        return grp_hash, None
    # Import FB Prophet package
    mod = importlib.import_module('fbprophet')
    Prophet = getattr(mod, "Prophet")
    model = Prophet(yearly_seasonality=True, weekly_seasonality=True, daily_seasonality=True)
    if params["country_holidays"] is not None:
        model.add_country_holidays(country_name=params["country_holidays"])
    if params["monthly_seasonality"]:
        model.add_seasonality(name='monthly', period=30.5, fourier_order=5)
    with suppress_stdout_stderr():
        model.fit(X[['ds', 'y']])
    model_path = os.path.join(tmp_folder, "fbprophet_model" + str(uuid.uuid4()))
    save_obj(model, model_path)
    remove(X_path)  # remove to indicate success
    return grp_hash, model_path
def _transform_async(model_path, X_path, nan_value): model = load_obj(model_path) XX_path = os.path.join(temporary_files_path, "fbprophet_XXt" + str(uuid.uuid4())) X = load_obj(X_path) if model is not None: # Facebook Prophet returns the predictions ordered by time # So we should keep track of the times for each group so that # predictions are ordered the same as the imput frame # Make a copy of the input dates X_ds = X.copy() X_ds['ds'] = pd.to_datetime(X_ds['ds']) # Predict with prophet, get the time and prediction and index by time as well # In the case date repeats inside of a group (this happens at least in acceptance test) # We groupby date and keep the max (prophet returns the same value for a given date) # XX will contain the predictions indexed by date XX = model.predict(X)[['ds', 'yhat']].groupby('ds').max() # Now put yhat in the right order, simply by maping the dates to the predictions X_ds['yhat'] = X_ds["ds"].map(XX['yhat']) # Now set XX back to the predictions and drop the index XX = X_ds[['yhat']].reset_index(drop=True) else: XX = pd.DataFrame(np.full((X.shape[0], 1), nan_value), columns=['yhat']) # invalid models XX.index = X.index assert XX.shape[1] == 1 save_obj(XX, XX_path) remove(model_path) # indicates success, no longer need remove(X_path) # indicates success, no longer need return XX_path
def _transform_async(model_path, X_path, nan_value, tmp_folder):
    """
    Predicts target for a particular time group
    :param model_path: path to the stored model
    :param X_path: Path to the data used to fit the FB Prophet model
    :param nan_value: Value of target prior, used when no fitted model has been found
    :return: path to the pickled predictions frame
    """
    model = load_obj(model_path)
    XX_path = os.path.join(tmp_folder, "fbprophet_XX" + str(uuid.uuid4()))
    X = load_obj(X_path)
    # Facebook Prophet returns the predictions ordered by time
    # So we should keep track of the time order for each group so that
    # predictions are ordered the same as the input frame
    # Keep track of the order
    order = np.argsort(pd.to_datetime(X["ds"]))
    if model is not None:
        # Run prophet
        yhat = model.predict(X)['yhat'].values
        XX = pd.DataFrame(yhat, columns=['yhat'])
    else:
        XX = pd.DataFrame(np.full((X.shape[0], 1), nan_value), columns=['yhat'])  # invalid models
    XX.index = X.index[order]
    assert XX.shape[1] == 1
    save_obj(XX, XX_path)
    remove(model_path)  # indicates success, no longer needed
    remove(X_path)  # indicates success, no longer needed
    return XX_path
def predict(self, X, **kwargs):
    model, _, _, _ = self.get_model_properties()
    X = dt.Frame(X)
    h2o.init(port=config.h2o_recipes_port, log_dir=self.my_log_dir)
    model_path = os.path.join(temporary_files_path, self.id)
    with open(model_path, "wb") as f:
        f.write(model)
    model = h2o.load_model(os.path.abspath(model_path))
    remove(model_path)
    test_frame = h2o.H2OFrame(X.to_pandas(), column_types=self.col_types)
    preds_frame = None
    try:
        if kwargs.get("pred_contribs"):
            return model.predict_contributions(test_frame).as_data_frame(header=False).values
        preds_frame = model.predict(test_frame)
        preds = preds_frame.as_data_frame(header=False)
        if self.num_classes == 1:
            return preds.values.ravel()
        elif self.num_classes == 2:
            return preds.iloc[:, -1].values.ravel()
        else:
            return preds.iloc[:, 1:].values
    finally:
        h2o.remove(self.id)
        h2o.remove(test_frame)
        if preds_frame is not None:
            h2o.remove(preds_frame)
def _fit_async(X_path, grp_hash, time_column, tmp_folder):
    """
    Fits an ARIMA model for a particular time group
    :param X_path: Path to the data used to fit the ARIMA model
    :param grp_hash: Time group identifier
    :param time_column: Name of the time column in the input data
    :return: time group identifier and path to the pickled model
    """
    np.random.seed(1234)
    random.seed(1234)
    X = load_obj(X_path)
    pm = importlib.import_module('pmdarima')
    with suppress_stdout_stderr():
        try:
            order = np.argsort(X[time_column])
            model = pm.auto_arima(X['y'].values[order], error_action='ignore')
        except:
            model = None
    model_path = os.path.join(tmp_folder, "autoarima_model" + str(uuid.uuid4()))
    save_obj(model, model_path)
    remove(X_path)  # remove to indicate success
    return grp_hash, model_path
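# Hedged, standalone sketch of the fitting step above (assumes pmdarima is
# installed; the data is illustrative): auto_arima expects observations in
# time order, hence the argsort on the time column before fitting.
import importlib

import numpy as np
import pandas as pd

pm = importlib.import_module('pmdarima')
dates = pd.date_range('2020-01-01', periods=30, freq='D')
X = pd.DataFrame({'date': dates[::-1], 'y': np.arange(30, dtype=float)})  # rows stored out of time order
order = np.argsort(X['date'])  # row positions that put the series in time order
model = pm.auto_arima(X['y'].values[order], error_action='ignore')
print(model.order)  # the (p, d, q) order auto_arima selected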
def predict(self, X, **kwargs):
    model, _, _, _ = self.get_model_properties()
    X = dt.Frame(X)
    X = self.inf_impute(X)
    h2o.init(port=config.h2o_recipes_port, log_dir=self.my_log_dir)
    model_path = os.path.join(exp_dir(), self.id)
    model_file = os.path.join(model_path, "h2o_model." + str(uuid.uuid4()) + ".bin")
    os.makedirs(model_path, exist_ok=True)
    with open(model_file, "wb") as f:
        f.write(model)
    model = h2o.load_model(os.path.abspath(model_file))
    test_frame = h2o.H2OFrame(X.to_pandas(), column_types=self.col_types)
    preds_frame = None
    try:
        if kwargs.get("pred_contribs"):
            orig_cols = list(X.names)
            df_varimp_orig, df_varimp, df_varimp_merged = self.get_df_varimp(model, orig_cols)
            dfmap = {k: v for k, v in zip(df_varimp_orig.index, df_varimp.index)}
            preds_df = model.predict_contributions(test_frame).as_data_frame(header=False)
            # this only has to work for regression and binary since h2o-3 does not support multiclass shapley
            preds_df.columns = [dfmap.get(x, x) for x in preds_df.columns]
            preds_df = preds_df.groupby(preds_df.columns, axis=1).sum()
            return preds_df.values
        preds_frame = model.predict(test_frame)
        preds = preds_frame.as_data_frame(header=False)
        is_final = 'IS_FINAL' in kwargs
        struuid = str(uuid.uuid4())
        json_file = os.path.join(exp_dir(), 'stderr_is_final_%s_%s.json' % (is_final, struuid))
        if self.num_classes == 1:
            if self.doing_p_values():
                df = preds.iloc[:, 1]
                with open(json_file, "wt") as f:
                    pd.set_option('precision', 16)
                    f.write(json.dumps(json.loads(df.to_json()), indent=4))
                    pd.set_option('precision', 6)
                return preds.iloc[:, 0].values.ravel()
            else:
                return preds.values.ravel()
        elif self.num_classes == 2:
            if self.doing_p_values():
                df = preds.iloc[:, 2]
                with open(json_file, "wt") as f:
                    pd.set_option('precision', 16)
                    f.write(json.dumps(json.loads(df.to_json()), indent=4))
                    pd.set_option('precision', 6)
                return preds.iloc[:, -1 - 1].values.ravel()
            else:
                return preds.iloc[:, -1].values.ravel()
        else:
            return preds.iloc[:, 1:].values
    finally:
        # h2o.remove(self.id)  # Cannot remove id, do multiple predictions on same model
        h2o.remove(test_frame)
        remove(model_file)
        if preds_frame is not None:
            h2o.remove(preds_frame)
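# Minimal pandas sketch of the Shapley-merging step above (toy columns, not
# the recipe's real mapping): contributions of encoded levels are renamed to
# their original feature and summed, so each original column ends up with a
# single contribution. groupby(axis=1) mirrors the code above; note it is
# deprecated in recent pandas releases.
import pandas as pd

contribs = pd.DataFrame([[0.1, 0.2, 0.3]], columns=['color.red', 'color.blue', 'size'])
dfmap = {'color.red': 'color', 'color.blue': 'color'}  # encoded -> original names
contribs.columns = [dfmap.get(c, c) for c in contribs.columns]
merged = contribs.groupby(contribs.columns, axis=1).sum()
print(merged)  # color = 0.3, size = 0.3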
def atomic_copy(self, src=None, dst=None):
    import uuid
    my_uuid = uuid.uuid4()
    src_tmp = src + str(my_uuid)
    shutil.copy(src, src_tmp)
    os.makedirs(os.path.dirname(dst), exist_ok=True)
    self.atomic_move(src_tmp, dst)
    remove(src_tmp)
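# Standalone sketch of the copy-then-atomic-rename pattern behind atomic_copy
# above, assuming a plain-os variant of atomic_move: os.replace is atomic only
# when the temp file and dst live on the same filesystem, which the recipe's
# atomic_move presumably handles more generally. Function name is hypothetical.
import os
import shutil
import uuid

def atomic_copy_sketch(src, dst):
    src_tmp = src + str(uuid.uuid4())  # unique temp name so concurrent copies never collide
    shutil.copy(src, src_tmp)  # write the full copy first
    os.makedirs(os.path.dirname(dst), exist_ok=True)
    os.replace(src_tmp, dst)  # atomic rename: readers never observe a partial dst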
def transform(self, X: dt.Frame):
    X = X.to_pandas()
    X = X.replace([None, np.nan], 0)
    XX = X[self.tgc].copy()
    XX.rename(columns={self.time_column: "ds"}, inplace=True)
    tgc_wo_time = list(np.setdiff1d(self.tgc, self.time_column))
    if len(tgc_wo_time) > 0:
        XX_grp = XX.groupby(tgc_wo_time)
    else:
        XX_grp = [([None], XX)]
    assert len(XX_grp) > 0
    num_tasks = len(XX_grp)

    def processor(out, res):
        out.append(res)

    pool_to_use = small_job_pool
    pool = pool_to_use(logger=None, processor=processor,
                       max_workers=self.n_jobs, num_tasks=num_tasks)
    XX_paths = []
    model_paths = []
    for key, X in XX_grp:
        key = key if isinstance(key, list) else [key]
        grp_hash = '_'.join(map(str, key))
        X_path = os.path.join(temporary_files_path, "fbprophet_Xt" + str(uuid.uuid4()))
        print("prophet - transforming data of shape: %s for group: %s" % (str(X.shape), grp_hash))
        if grp_hash in self.models:
            model = self.models[grp_hash]
            model_path = os.path.join(temporary_files_path, "fbprophet_modelt" + str(uuid.uuid4()))
            save_obj(model, model_path)
            save_obj(X, X_path)
            model_paths.append(model_path)
            args = (model_path, X_path, self.nan_value)
            kwargs = {}
            pool.submit_tryget(None, MyParallelProphetTransformer_transform_async,
                               args=args, kwargs=kwargs, out=XX_paths)
        else:
            XX = pd.DataFrame(np.full((X.shape[0], 1), self.nan_value), columns=['yhat'])  # unseen groups
            save_obj(XX, X_path)
            XX_paths.append(X_path)
    pool.finish()
    XX = pd.concat((load_obj(XX_path) for XX_path in XX_paths), axis=0).sort_index()
    for p in XX_paths + model_paths:
        remove(p)
    return XX
def transform(self, X: dt.Frame):
    X = X.to_pandas()
    XX = X[self.tgc].copy()
    tgc_wo_time = list(np.setdiff1d(self.tgc, self.time_column))
    if len(tgc_wo_time) > 0:
        XX_grp = XX.groupby(tgc_wo_time)
    else:
        XX_grp = [([None], XX)]
    assert len(XX_grp) > 0
    num_tasks = len(XX_grp)

    def processor(out, res):
        out.append(res)

    pool_to_use = small_job_pool
    pool = pool_to_use(logger=None, processor=processor, num_tasks=num_tasks)
    XX_paths = []
    model_paths = []
    nb_groups = len(XX_grp)
    for _i_g, (key, X) in enumerate(XX_grp):
        if (_i_g + 1) % max(1, nb_groups // 20) == 0:
            print("Auto ARIMA -", 100 * (_i_g + 1) // nb_groups, "% of groups transformed")
        key = key if isinstance(key, list) else [key]
        grp_hash = '_'.join(map(str, key))
        X_path = os.path.join(temporary_files_path, "autoarima_Xt" + str(uuid.uuid4()))
        # Commented for performance, uncomment for debug
        # print("ARIMA - transforming data of shape: %s for group: %s" % (str(X.shape), grp_hash))
        if grp_hash in self.models:
            model = self.models[grp_hash]
            model_path = os.path.join(temporary_files_path, "autoarima_modelt" + str(uuid.uuid4()))
            save_obj(model, model_path)
            save_obj(X, X_path)
            model_paths.append(model_path)
            args = (model_path, X_path, self.nan_value, hasattr(self, 'is_train'), self.time_column,)
            kwargs = {}
            pool.submit_tryget(None, MyParallelAutoArimaTransformer_transform_async,
                               args=args, kwargs=kwargs, out=XX_paths)
        else:
            # Don't go through pools
            XX = pd.DataFrame(np.full((X.shape[0], 1), self.nan_value), columns=['yhat'])  # unseen groups
            # Sync indices
            XX.index = X.index
            save_obj(XX, X_path)
            XX_paths.append(X_path)
    pool.finish()
    XX = pd.concat((load_obj(XX_path) for XX_path in XX_paths), axis=0).sort_index()
    for p in XX_paths + model_paths:
        remove(p)
    return XX
def _fit_async(X_path, grp_hash, tmp_folder, params, cap):
    """
    Fits a FB Prophet model for a particular time group
    :param X_path: Path to the data used to fit the FB Prophet model
    :param grp_hash: Time group identifier
    :return: time group identifier and path to the pickled model
    """
    np.random.seed(1234)
    random.seed(1234)
    X = load_obj(X_path)
    # Commented for performance, uncomment for debug
    # print("prophet - fitting on data of shape: %s for group: %s" % (str(X.shape), grp_hash))
    if X.shape[0] < 20:
        return grp_hash, None
    # Import FB Prophet package
    mod = importlib.import_module('fbprophet')
    Prophet = getattr(mod, "Prophet")
    # Fit current model and prior
    nrows = X[['ds', 'y']].shape[0]
    n_changepoints = max(1, int(nrows * (2 / 3)))
    if n_changepoints < 25:
        model = Prophet(growth=params["growth"], n_changepoints=n_changepoints)
    else:
        model = Prophet(growth=params["growth"])
    # Add params
    if params["country_holidays"] is not None:
        model.add_country_holidays(country_name=params["country_holidays"])
    if params["monthly_seasonality"]:
        model.add_seasonality(name='monthly', period=30.5, fourier_order=params["monthly_seasonality"])
    if params["quarterly_seasonality"]:
        model.add_seasonality(name='quarterly', period=92, fourier_order=params["quarterly_seasonality"])
    with suppress_stdout_stderr():
        if params["growth"] == "logistic":
            X["cap"] = cap
            model.fit(X[['ds', 'y', 'cap']])
        else:
            model.fit(X[['ds', 'y']])
    model_path = os.path.join(tmp_folder, "fbprophet_model" + str(uuid.uuid4()))
    save_obj(model, model_path)
    remove(X_path)  # remove to indicate success
    return grp_hash, model_path
def __init__(self, **kwargs):
    super().__init__(**kwargs)
    import spacy
    env_dir = ContribLoader._env_dir.resolve()
    lock_file = os.path.join(env_dir, "spacy.lock")
    try:
        with filelock.FileLock(lock_file):
            from spacy.cli import download
            download('en_core_web_sm', False,
                     "--install-option=--prefix=%s" % ContribLoader._env_dir.resolve())
            self.nlp = spacy.load('en_core_web_sm')
    finally:
        remove(lock_file)
    self.ne_types = {"PERSON", "ORG", "GPE", "LOC", "PRODUCT", "EVENT", "DATE"}
def _transform_async(model_path, X_path, nan_value): model = load_obj(model_path) XX_path = os.path.join(temporary_files_path, "fbprophet_XXt" + str(uuid.uuid4())) X = load_obj(X_path) if model is not None: XX = model.predict(X[['ds']])[['yhat']] else: XX = pd.DataFrame(np.full((X.shape[0], 1), nan_value), columns=['yhat']) # invalid models XX.index = X.index assert XX.shape[1] == 1 save_obj(XX, XX_path) remove(model_path) # indicates success, no longer need remove(X_path) # indicates success, no longer need return XX_path
def _fit_async(X_path, grp_hash, time_column):
    np.random.seed(1234)
    random.seed(1234)
    X = load_obj(X_path)
    pm = importlib.import_module('pmdarima')
    with suppress_stdout_stderr():
        try:
            order = np.argsort(X[time_column])
            model = pm.auto_arima(X['y'].values[order], error_action='ignore')
        except:
            model = None
    model_path = os.path.join(temporary_files_path, "autoarima_model" + str(uuid.uuid4()))
    save_obj(model, model_path)
    remove(X_path)  # remove to indicate success
    return grp_hash, model_path
def transform(self, X: dt.Frame):
    h2o.init(port=config.h2o_recipes_port)
    model_path = os.path.join(temporary_files_path, self.id)
    with open(model_path, "wb") as f:
        f.write(self.raw_model_bytes)
    model = h2o.load_model(os.path.abspath(model_path))
    remove(model_path)
    frame = h2o.H2OFrame(X.to_pandas())
    anomaly_frame = None
    try:
        anomaly_frame = model.anomaly(frame)
        anomaly_frame_df = anomaly_frame.as_data_frame(header=False)
        return anomaly_frame_df
    finally:
        h2o.remove(self.id)
        if anomaly_frame is not None:  # anomaly() may have raised before assigning
            h2o.remove(anomaly_frame)
def _fit_async(X_path, grp_hash):
    np.random.seed(1234)
    random.seed(1234)
    X = load_obj(X_path)
    # Commented for performance, uncomment for debug
    # print("prophet - fitting on data of shape: %s for group: %s" % (str(X.shape), grp_hash))
    if X.shape[0] < 20:
        # print("prophet - small data work-around for group: %s" % grp_hash)
        return grp_hash, None
    mod = importlib.import_module('fbprophet')
    Prophet = getattr(mod, "Prophet")
    model = Prophet()
    with suppress_stdout_stderr():
        model.fit(X[['ds', 'y']])
    model_path = os.path.join(temporary_files_path, "fbprophet_model" + str(uuid.uuid4()))
    save_obj(model, model_path)
    remove(X_path)  # remove to indicate success
    return grp_hash, model_path
def transform(self, X: dt.Frame):
    h2o.init(port=config.h2o_recipes_port)
    model_path = os.path.join(user_dir(), self.id)
    model_file = os.path.join(model_path, "h2o_model." + str(uuid.uuid4()) + ".bin")
    os.makedirs(model_path, exist_ok=True)
    with open(model_file, "wb") as f:
        f.write(self.raw_model_bytes)
    model = h2o.load_model(os.path.abspath(model_file))
    frame = h2o.H2OFrame(X.to_pandas())
    anomaly_frame = None
    try:
        anomaly_frame = model.anomaly(frame)
        anomaly_frame_df = anomaly_frame.as_data_frame(header=False)
        return anomaly_frame_df
    finally:
        remove(model_file)
        h2o.remove(self.id)
        if anomaly_frame is not None:  # anomaly() may have raised before assigning
            h2o.remove(anomaly_frame)
def _fit_async(data_path, grp_hash, tmp_folder, params):
    """
    Fits a FB Prophet model for a particular time group
    :param data_path: Path to the data used to fit the FB Prophet model
    :param grp_hash: Time group identifier
    :return: time group identifier and path to the pickled model
    """
    np.random.seed(1234)
    random.seed(1234)
    X = load_obj(data_path)
    # if X.shape[0] < 20:
    #     return grp_hash, None
    # Import FB Prophet package
    mod = importlib.import_module('fbprophet')
    Prophet = getattr(mod, "Prophet")
    model = fit_prophet_model(Prophet, X, params)
    model_path = os.path.join(tmp_folder, "fbprophet_model" + str(uuid.uuid4()))
    save_obj(model, model_path)
    remove(data_path)  # remove to indicate success
    return grp_hash, model_path
def fit(self, X: dt.Frame, y: np.array = None):
    pm = importlib.import_module('pmdarima')
    self.models = {}
    X = X.to_pandas()
    XX = X[self.tgc].copy()
    XX['y'] = np.array(y)
    self.nan_value = np.mean(y)
    self.ntrain = X.shape[0]
    tgc_wo_time = list(np.setdiff1d(self.tgc, self.time_column))
    if len(tgc_wo_time) > 0:
        XX_grp = XX.groupby(tgc_wo_time)
    else:
        XX_grp = [([None], XX)]
    num_tasks = len(XX_grp)

    def processor(out, res):
        out[res[0]] = res[1]

    pool_to_use = small_job_pool
    pool = pool_to_use(logger=None, processor=processor, num_tasks=num_tasks)
    nb_groups = len(XX_grp)
    for _i_g, (key, X) in enumerate(XX_grp):
        if (_i_g + 1) % max(1, nb_groups // 20) == 0:
            print("Auto ARIMA -", 100 * (_i_g + 1) // nb_groups, "% of groups fitted")
        X_path = os.path.join(temporary_files_path, "autoarima_X" + str(uuid.uuid4()))
        X = X.reset_index(drop=True)
        save_obj(X, X_path)
        key = key if isinstance(key, list) else [key]
        grp_hash = '_'.join(map(str, key))
        args = (X_path, grp_hash, self.time_column,)
        kwargs = {}
        pool.submit_tryget(None, MyParallelAutoArimaTransformer_fit_async,
                           args=args, kwargs=kwargs, out=self.models)
    pool.finish()
    for k, v in self.models.items():
        self.models[k] = load_obj(v) if v is not None else None
        remove(v)
    return self
def fit_transform(self, X: dt.Frame, y: np.array = None):
    h2o.init(port=config.h2o_recipes_port)
    model = H2OAutoEncoderEstimator(activation='tanh', epochs=1, hidden=[50, 50],
                                    reproducible=True, seed=1234)
    frame = h2o.H2OFrame(X.to_pandas())
    model_path = None
    try:
        model.train(x=list(range(X.ncols)), training_frame=frame)
        self.id = model.model_id
        model_path = os.path.join(temporary_files_path, "h2o_model." + str(uuid.uuid4()))
        model_path = h2o.save_model(model=model, path=model_path)
        with open(model_path, "rb") as f:
            self.raw_model_bytes = f.read()
        return model.anomaly(frame).as_data_frame(header=False)
    finally:
        if model_path is not None:
            remove(model_path)
        h2o.remove(model)
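# Hedged sketch of the serialize/deserialize pattern the H2O recipes above
# share: persist the binary model once, keep its bytes in memory, then write
# them back to disk and h2o.load_model() them in another process. Assumes a
# running H2O cluster; function names and paths are illustrative.
import os
import uuid

import h2o

def model_to_bytes(model, tmp_dir="/tmp"):
    # h2o.save_model returns the actual file path it wrote
    path = h2o.save_model(model=model, path=os.path.join(tmp_dir, "h2o_model." + str(uuid.uuid4())))
    try:
        with open(path, "rb") as f:
            return f.read()
    finally:
        os.remove(path)

def bytes_to_model(raw_model_bytes, tmp_dir="/tmp"):
    path = os.path.join(tmp_dir, "h2o_model." + str(uuid.uuid4()) + ".bin")
    with open(path, "wb") as f:
        f.write(raw_model_bytes)
    try:
        return h2o.load_model(os.path.abspath(path))
    finally:
        os.remove(path)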
def predict(self, X, **kwargs):
    model, _, _, _ = self.get_model_properties()
    X = dt.Frame(X)
    h2o.init(port=config.h2o_recipes_port, log_dir=self.my_log_dir)
    model_path = os.path.join(user_dir(), self.id)
    with open(model_path, "wb") as f:
        f.write(model)
    model = h2o.load_model(os.path.abspath(model_path))
    remove(model_path)
    test_frame = h2o.H2OFrame(X.to_pandas(), column_types=self.col_types)
    preds_frame = None
    try:
        preds_frame = model.predict(test_frame)
        preds = preds_frame.as_data_frame(header=False)
        return preds.values.ravel()
    finally:
        h2o.remove(self.id)
        h2o.remove(test_frame)
        if preds_frame is not None:
            h2o.remove(preds_frame)
def _transform_async(model_path, X_path, nan_value): model = load_obj(model_path) XX_path = os.path.join(temporary_files_path, "fbprophet_XXt" + str(uuid.uuid4())) X = load_obj(X_path) # Facebook Prophet returns the predictions ordered by time # So we should keep track of the time order for each group so that # predictions are ordered the same as the imput frame # Keep track of the order order = np.argsort(pd.to_datetime(X["ds"])) if model is not None: # Run prophet yhat = model.predict(X)['yhat'].values XX = pd.DataFrame(yhat, columns=['yhat']) else: XX = pd.DataFrame(np.full((X.shape[0], 1), nan_value), columns=['yhat']) # invalid models XX.index = X.index[order] assert XX.shape[1] == 1 save_obj(XX, XX_path) remove(model_path) # indicates success, no longer need remove(X_path) # indicates success, no longer need return XX_path
def transform(self, X: dt.Frame, **kwargs):
    """
    Uses fitted models (one per time group) to predict the target.
    If self.is_train exists, we are doing in-sample predictions;
    if it does not, ARIMA is used to predict the future.
    :param X: Datatable Frame containing the features
    :return: ARIMA predictions
    """
    # Get the logger if it exists
    logger = None
    if self.context and self.context.experiment_id:
        logger = make_experiment_logger(
            experiment_id=self.context.experiment_id,
            tmp_dir=self.context.tmp_dir,
            experiment_tmp_dir=self.context.experiment_tmp_dir)
    tmp_folder = self._create_tmp_folder(logger)
    n_jobs = self._get_n_jobs(logger, **kwargs)
    X = X.to_pandas()
    XX = X[self.tgc].copy()
    tgc_wo_time = list(np.setdiff1d(self.tgc, self.time_column))
    if len(tgc_wo_time) > 0:
        XX_grp = XX.groupby(tgc_wo_time)
    else:
        XX_grp = [([None], XX)]
    assert len(XX_grp) > 0
    num_tasks = len(XX_grp)

    def processor(out, res):
        out.append(res)

    pool_to_use = small_job_pool
    loggerinfo(logger, "Arima will use {} workers for transform".format(n_jobs))
    pool = pool_to_use(logger=None, processor=processor,
                       num_tasks=num_tasks, max_workers=n_jobs)
    XX_paths = []
    model_paths = []
    nb_groups = len(XX_grp)
    for _i_g, (key, X) in enumerate(XX_grp):
        # Just print where we are in the process of fitting models
        if (_i_g + 1) % max(1, nb_groups // 20) == 0:
            loggerinfo(logger, "Auto ARIMA : %d%% of groups transformed" % (100 * (_i_g + 1) // nb_groups))
        # Create time group key to store and retrieve fitted models
        key = key if isinstance(key, list) else [key]
        grp_hash = '_'.join(map(str, key))
        # Create file path to store data and pass it to the fitting pool
        X_path = os.path.join(tmp_folder, "autoarima_Xt" + str(uuid.uuid4()))
        # Commented for performance, uncomment for debug
        # print("ARIMA - transforming data of shape: %s for group: %s" % (str(X.shape), grp_hash))
        if grp_hash in self.models:
            model = self.models[grp_hash]
            model_path = os.path.join(tmp_folder, "autoarima_modelt" + str(uuid.uuid4()))
            save_obj(model, model_path)
            save_obj(X, X_path)
            model_paths.append(model_path)
            args = (model_path, X_path, self.nan_value, hasattr(self, 'is_train'),
                    self.time_column, self.pred_gap, tmp_folder)
            kwargs = {}
            pool.submit_tryget(None, MyParallelAutoArimaTransformer_transform_async,
                               args=args, kwargs=kwargs, out=XX_paths)
        else:
            # Don't go through pools
            XX = pd.DataFrame(np.full((X.shape[0], 1), self.nan_value), columns=['yhat'])  # unseen groups
            # Sync indices
            XX.index = X.index
            save_obj(XX, X_path)
            XX_paths.append(X_path)
    pool.finish()
    XX = pd.concat((load_obj(XX_path) for XX_path in XX_paths), axis=0).sort_index()
    for p in XX_paths + model_paths:
        remove(p)
    self._clean_tmp_folder(logger, tmp_folder)
    return XX