def _clean_tmp_folder(self, logger, tmp_folder):
    try:
        shutil.rmtree(tmp_folder)
        loggerinfo(logger, "Prophet cleaned up temporary file folder.")
    except:
        loggerwarning(logger, "Prophet could not delete the temporary file folder.")
def _create_tmp_folder(self, logger):
    # Create a temp folder to store xnn files
    # Set the default value without context available (required to pass acceptance test)
    tmp_folder = os.path.join(user_dir(), "%s_xnn_model_folder" % uuid.uuid4())
    # Make a real tmp folder when experiment is available
    if self.context and self.context.experiment_id:
        tmp_folder = os.path.join(self.context.experiment_tmp_dir, "%s_xnn_model_folder" % uuid.uuid4())

    # Now let's try to create that folder
    try:
        os.mkdir(tmp_folder)
    except PermissionError:
        # This should not occur, so log a warning
        loggerwarning(logger, "XNN was denied temp folder creation rights")
        tmp_folder = os.path.join(user_dir(), "%s_xnn_model_folder" % uuid.uuid4())
        os.mkdir(tmp_folder)
    except FileExistsError:
        # We should never be here since the temp dir name is expected to be unique
        loggerwarning(logger, "XNN temp folder already exists")
        tmp_folder = os.path.join(self.context.experiment_tmp_dir, "%s_xnn_model_folder" % uuid.uuid4())
        os.mkdir(tmp_folder)
    except:
        # Revert to temporary file path
        tmp_folder = os.path.join(user_dir(), "%s_xnn_model_folder" % uuid.uuid4())
        os.mkdir(tmp_folder)

    loggerinfo(logger, "XNN temp folder {}".format(tmp_folder))
    return tmp_folder
def _create_tmp_folder(self, logger):
    # Create a temp folder to store files used during the multiprocessing experiment
    # This temp folder will be removed at the end of the process
    # Set the default value without context available (required to pass acceptance test)
    tmp_folder = str(uuid.uuid4()) + "_prophet_folder/"
    # Make a real tmp folder when experiment is available
    if self.context and self.context.experiment_id:
        tmp_folder = self.context.experiment_tmp_dir + "/" + str(uuid.uuid4()) + "_prophet_folder/"

    # Now let's try to create that folder
    try:
        os.mkdir(tmp_folder)
    except PermissionError:
        # This should not occur, so log a warning
        loggerwarning(logger, "Prophet was denied temp folder creation rights")
        tmp_folder = temporary_files_path + "/" + str(uuid.uuid4()) + "_prophet_folder/"
        os.mkdir(tmp_folder)
    except FileExistsError:
        # We should never be here since the temp dir name is expected to be unique
        loggerwarning(logger, "Prophet temp folder already exists")
        tmp_folder = self.context.experiment_tmp_dir + "/" + str(uuid.uuid4()) + "_prophet_folder/"
        os.mkdir(tmp_folder)
    except:
        # Revert to temporary file path
        tmp_folder = temporary_files_path + "/" + str(uuid.uuid4()) + "_prophet_folder/"
        os.mkdir(tmp_folder)

    loggerinfo(logger, "Prophet temp folder {}".format(tmp_folder))
    return tmp_folder
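# --- Hedged usage sketch (not from the recipes above) ---
# The create/clean helpers are meant to bracket any work that writes
# intermediate files to disk. A minimal pairing, assuming a hypothetical
# fit body; only the two folder helpers are real:
def _fit_with_tmp_folder(self, X, y, logger):
    tmp_folder = self._create_tmp_folder(logger)
    try:
        # ... save per-group frames under tmp_folder and fit models ...
        pass
    finally:
        # always reclaim the scratch space, even if fitting raised
        self._clean_tmp_folder(logger, tmp_folder)
    return self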
def transform(self, X: dt.Frame):
    logger = None
    if self.context and self.context.experiment_id:
        logger = make_experiment_logger(
            experiment_id=self.context.experiment_id,
            tmp_dir=self.context.tmp_dir,
            experiment_tmp_dir=self.context.experiment_tmp_dir)

    X = dt.Frame(X)
    original_zip_column_name = X.names[0]
    X = X[:, dt.str64(dt.f[0])]
    X.names = ['zip_key']
    try:
        zip_list = dt.unique(X[~dt.isna(dt.f.zip_key), 0]).to_list()[0] + ['79936']
        zip_features = [self.get_zipcode_features(x) for x in zip_list]
        X_g = dt.Frame({"zip_key": zip_list})
        X_g.cbind(dt.Frame(zip_features))
        X_g.key = 'zip_key'
        X_result = X[:, :, dt.join(X_g)]
        self._output_feature_names = [
            "{}:{}.{}".format(self.transformer_name, original_zip_column_name,
                              self.replaceBannedCharacters(f))
            for f in list(X_result[:, 1:].names)
        ]
        self._feature_desc = [
            "Property '{}' of zipcode column ['{}'] from US zipcode database (recipe '{}')".format(
                f, original_zip_column_name, self.transformer_name)
            for f in list(X_result[:, 1:].names)
        ]
        return X_result[:, 1:]
    except ValueError as ve:
        loggerinfo(logger, "Column '{}' is not a zipcode: {}".format(
            original_zip_column_name, str(ve)))
        return self.get_zipcode_null_result(X, original_zip_column_name)
    except TypeError as te:
        loggerwarning(logger, "Column '{}' triggered TypeError: {}".format(
            original_zip_column_name, str(te)))
        raise te
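# --- Hedged sketch (assumption, not the recipe's actual helper) ---
# transform() above relies on self.get_zipcode_features(); a plausible shape
# for it, assuming the `uszipcode` package (SearchEngine and by_zipcode are
# real uszipcode APIs; the chosen fields are illustrative):
from uszipcode import SearchEngine

def get_zipcode_features(zip_key):
    search = SearchEngine()
    result = search.by_zipcode(zip_key)
    if result is None or result.zipcode is None:
        # transform() catches ValueError and falls back to a null result
        raise ValueError("unknown zipcode '{}'".format(zip_key))
    # return a fixed set of properties so the joined frame keeps a stable schema
    return {"population": result.population,
            "population_density": result.population_density,
            "median_home_value": result.median_home_value,
            "median_household_income": result.median_household_income}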
def fit(self, X, y, sample_weight=None, eval_set=None, sample_weight_eval_set=None, **kwargs):
    # Example use of logger, with required import of:
    #   from h2oaicore.systemutils import make_experiment_logger, loggerinfo
    # Can use loggerwarning, loggererror, etc. for different levels
    logger = None
    if self.context and self.context.experiment_id:
        logger = make_experiment_logger(experiment_id=self.context.experiment_id,
                                        tmp_dir=self.context.tmp_dir,
                                        experiment_tmp_dir=self.context.experiment_tmp_dir)
    loggerinfo(logger, "TestLOGGER: Fit CatBoost")

    # Example task sync operations
    if hasattr(self, 'test_count'):
        self.test_count += 1
    else:
        self.test_count = 0

    # The below generates a message in the GUI notifications panel
    if self.test_count == 0 and self.context and self.context.experiment_id:
        warning = "TestWarning: First CatBoost fit for this model instance"
        loggerwarning(logger, warning)
        task = kwargs.get('task')
        if task:
            task.sync(key=self.context.experiment_id, progress=dict(type='warning', data=warning))
            task.flush()

    # The below generates a message in the GUI top-middle panel above the progress wheel
    if self.test_count == 0 and self.context and self.context.experiment_id:
        message = "TestMessage: CatBoost"
        loggerinfo(logger, message)
        task = kwargs.get('task')
        if task:
            task.sync(key=self.context.experiment_id, progress=dict(type='update', message=message))
            task.flush()

    from catboost import CatBoostClassifier, CatBoostRegressor, EFstrType
    lb = LabelEncoder()
    if self.num_classes >= 2:
        lb.fit(self.labels)
        y = lb.transform(y)

    if isinstance(X, dt.Frame):
        orig_cols = list(X.names)
        # dt -> catboost internally using buffer leaks, so convert here
        # assume predict is after pipeline collection or in subprocess so needs no protection
        X = X.to_numpy()  # don't assign back to X so don't damage during predict
        X = np.ascontiguousarray(X, dtype=np.float32 if config.data_precision == "float32" else np.float64)
        if eval_set is not None:
            valid_X = eval_set[0][0].to_numpy()  # don't assign back to X so don't damage during predict
            valid_X = np.ascontiguousarray(valid_X,
                                           dtype=np.float32 if config.data_precision == "float32" else np.float64)
            valid_y = eval_set[0][1]
            if self.num_classes >= 2:
                valid_y = lb.transform(valid_y)
            eval_set[0] = (valid_X, valid_y)
    else:
        orig_cols = list(X.columns)

    if self.num_classes == 1:
        model = CatBoostRegressor(**self.params)
    else:
        model = CatBoostClassifier(**self.params)
    # Hit sometimes: Exception: catboost/libs/data_new/quantization.cpp:779: All features are either constant or ignored.
    if self.num_classes == 1:
        # assume not mae, which would use median
        # baseline = [np.mean(y)] * len(y)
        baseline = None
    else:
        baseline = None

    model.fit(X, y=y,
              sample_weight=sample_weight,
              baseline=baseline,
              eval_set=eval_set,
              early_stopping_rounds=kwargs.get('early_stopping_rounds', None),
              verbose=self.params.get('verbose', False))

    # need to move to wrapper
    if model.get_best_iteration() is not None:
        iterations = model.get_best_iteration() + 1
    else:
        iterations = self.params['iterations'] + 1
    # must always set best_iterations
    self.set_model_properties(model=model,
                              features=orig_cols,
                              importances=model.feature_importances_,
                              iterations=iterations)
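# --- Hedged sketch (plain scikit-learn, not the recipe's predict) ---
# fit() label-encodes the target for classification, so the matching
# predict() must map encoded class indices back to the original labels:
import numpy as np
from sklearn.preprocessing import LabelEncoder

lb = LabelEncoder()
lb.fit(["cat", "dog", "mouse"])        # plays the role of self.labels
y_enc = lb.transform(["dog", "cat"])   # array([1, 0]) used for training
pred = lb.inverse_transform(np.array([0, 2, 1]))  # array(['cat', 'mouse', 'dog'])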
def fit(self, X, y, sample_weight=None, eval_set=None, sample_weight_eval_set=None, **kwargs):
    logger = None
    if self._make_logger:
        # Example use of logger, with required import of:
        #   from h2oaicore.systemutils import make_experiment_logger, loggerinfo
        # Can use loggerwarning, loggererror, etc. for different levels
        if self.context and self.context.experiment_id:
            logger = make_experiment_logger(
                experiment_id=self.context.experiment_id,
                tmp_dir=self.context.tmp_dir,
                experiment_tmp_dir=self.context.experiment_tmp_dir)

    if self._show_logger_test:
        loggerinfo(logger, "TestLOGGER: Fit CatBoost")

    if self._show_task_test:
        # Example task sync operations
        if hasattr(self, 'test_count'):
            self.test_count += 1
        else:
            self.test_count = 0

        # The below generates a message in the GUI notifications panel
        if self.test_count == 0 and self.context and self.context.experiment_id:
            warning = "TestWarning: First CatBoost fit for this model instance"
            loggerwarning(logger, warning)
            task = kwargs.get('task')
            if task:
                task.sync(key=self.context.experiment_id, progress=dict(type='warning', data=warning))
                task.flush()

        # The below generates a message in the GUI top-middle panel above the progress wheel
        if self.test_count == 0 and self.context and self.context.experiment_id:
            message = "Tuning CatBoost"
            loggerinfo(logger, message)
            task = kwargs.get('task')
            if task:
                task.sync(key=self.context.experiment_id, progress=dict(type='update', message=message))
                task.flush()

    from catboost import CatBoostClassifier, CatBoostRegressor, EFstrType

    # label encode target and set up type of problem
    lb = LabelEncoder()
    if self.num_classes >= 2:
        lb.fit(self.labels)
        y = lb.transform(y)
        if eval_set is not None:
            valid_X = eval_set[0][0]
            valid_y = eval_set[0][1]
            valid_y = lb.transform(valid_y)
            eval_set = [(valid_X, valid_y)]
        self.params.update({'objective': 'Logloss'})
    if self.num_classes > 2:
        self.params.update({'objective': 'MultiClass'})

    if isinstance(X, dt.Frame):
        orig_cols = list(X.names)
        numeric_cols = list(X[:, [bool, int, float]].names)
    else:
        orig_cols = list(X.columns)
        numeric_cols = list(X.select_dtypes([np.number]).columns)

    # unlike lightgbm that needs label encoded categoricals, catboost can take raw strings etc.
    self.params['cat_features'] = [
        i for i, x in enumerate(orig_cols)
        if 'CatOrig:' in x or 'Cat:' in x or x not in numeric_cols
    ]

    if not self.get_uses_gpus(self.params):
        # monotonicity constraints not available for GPU for catboost
        # get names of columns in same order
        X_names = list(dt.Frame(X).names)
        X_numeric = self.get_X_ordered_numerics(X)
        X_numeric_names = list(X_numeric.names)
        constraints = self.set_monotone_constraints(X=X_numeric, y=y)
        # if non-numerics, then fix those to have 0 constraint
        self.params['monotone_constraints'] = [0] * len(X_names)
        colnumi = 0
        for coli, col in enumerate(X_names):
            if col in X_numeric_names:
                self.params['monotone_constraints'][coli] = constraints[colnumi]
                colnumi += 1

    if isinstance(X, dt.Frame) and len(self.params['cat_features']) == 0:
        # dt -> catboost internally using buffer leaks, so convert here
        # assume predict is after pipeline collection or in subprocess so needs no protection
        X = X.to_numpy()  # don't assign back to X so don't damage during predict
        X = np.ascontiguousarray(X, dtype=np.float32 if config.data_precision == "float32" else np.float64)
        if eval_set is not None:
            valid_X = eval_set[0][0].to_numpy()  # don't assign back to X so don't damage during predict
            valid_X = np.ascontiguousarray(valid_X,
                                           dtype=np.float32 if config.data_precision == "float32" else np.float64)
            valid_y = eval_set[0][1]
            eval_set = [(valid_X, valid_y)]

    if eval_set is not None:
        valid_X_shape = eval_set[0][0].shape
    else:
        valid_X_shape = None

    X, eval_set = self.process_cats(X, eval_set, orig_cols)

    # modify self.params_base['gpu_id'] based upon actually-available GPU based upon training and valid shapes
    self.acquire_gpus_function(train_shape=X.shape, valid_shape=valid_X_shape)

    params = copy.deepcopy(self.params)  # keep separate, since then can be pulled from lightgbm params
    params = self.transcribe_params(params=params, **kwargs)

    if logger is not None:
        loggerdata(logger,
                   "CatBoost parameters: params_base : %s params: %s catboost_params: %s" %
                   (str(self.params_base), str(self.params), str(params)))

    if self.num_classes == 1:
        self.model = CatBoostRegressor(**params)
    else:
        self.model = CatBoostClassifier(**params)
    # Hit sometimes: Exception: catboost/libs/data_new/quantization.cpp:779: All features are either constant or ignored.
    if self.num_classes == 1:
        # assume not mae, which would use median
        # baseline = [np.mean(y)] * len(y)
        baseline = None
    else:
        baseline = None

    kwargs_fit = dict(baseline=baseline, eval_set=eval_set)
    pickle_path = None
    if config.debug_daimodel_level >= 2:
        self.uuid = str(uuid.uuid4())[:6]
        pickle_path = os.path.join(exp_dir(), "catboost%s.tmp.pickle" % self.uuid)
        save_obj((self.model, X, y, sample_weight, kwargs_fit), pickle_path)

    # FIT (with migration safety before hyperopt/Optuna function added)
    try:
        if hasattr(self, 'dask_or_hyper_or_normal_fit'):
            self.dask_or_hyper_or_normal_fit(X, y, sample_weight=sample_weight, kwargs=kwargs, **kwargs_fit)
        else:
            self.model.fit(X, y, sample_weight=sample_weight, **kwargs_fit)
    except Exception as e:
        if "All features are either constant or ignored" in str(e):
            raise IgnoreEntirelyError(str(e))
        raise

    if config.debug_daimodel_level <= 2:
        remove(pickle_path)

    # https://catboost.ai/docs/concepts/python-reference_catboostclassifier.html
    # need to move to wrapper
    if self.model.get_best_iteration() is not None:
        iterations = self.model.get_best_iteration() + 1
    else:
        iterations = self.params['n_estimators']
    # must always set best_iterations

    self.model_path = None
    importances = copy.deepcopy(self.model.feature_importances_)
    if not self._save_by_pickle:
        self.uuid = str(uuid.uuid4())[:6]
        model_file = "catboost_%s.bin" % str(self.uuid)
        self.model_path = os.path.join(self.context.experiment_tmp_dir, model_file)
        self.model.save_model(self.model_path)
        with open(self.model_path, mode='rb') as f:
            model = f.read()
    else:
        model = self.model

    self.set_model_properties(
        model=model,  # overwrites self.model object with bytes if not using pickle
        features=orig_cols,
        importances=importances,
        iterations=iterations)
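# --- Hedged sketch (assumption; save_model/load_model are real CatBoost APIs) ---
# When _save_by_pickle is False, fit() stores the booster as raw .bin bytes,
# so the predict side has to write them back to disk and reload them
# (CatBoostRegressor would be used instead when num_classes == 1):
import os
from catboost import CatBoostClassifier

def load_catboost_from_bytes(model_bytes, tmp_dir):
    path = os.path.join(tmp_dir, "catboost_reload.bin")
    with open(path, "wb") as f:
        f.write(model_bytes)  # bytes captured by set_model_properties at fit time
    model = CatBoostClassifier()
    model.load_model(path)    # reload the native CatBoost binary format
    os.remove(path)
    return model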
def fit_transform(self, X: dt.Frame, y: np.array = None, **kwargs):
    X_original = X
    X = X[:, dt.f[int].extend(dt.f[float]).extend(dt.f[bool]).extend(dt.f[str])]
    if hasattr(self, 'run_count'):
        self.run_count += 1
    else:
        self.run_count = 0

    # Get the logger if it exists
    logger = None
    if self.context and self.context.experiment_id:
        logger = make_experiment_logger(
            experiment_id=self.context.experiment_id,
            tmp_dir=self.context.tmp_dir,
            experiment_tmp_dir=self.context.experiment_tmp_dir,
            username=self.context.username,
        )

    survival_event = self.__class__._survival_event
    if survival_event in X.names:
        raise ValueError("Consider renaming feature '{}'.".format(survival_event))

    # bind y to X to use as event in CoxPH
    X[:, survival_event] = np.array(LabelEncoder().fit_transform(y))

    # sanity check that target is binary
    if X[survival_event].nunique()[0, 0] != 2:
        raise ValueError("Too many values {} in event column - must be exactly 2.".format(
            X[survival_event].nunique()[0, 0]))

    # recode target values into 0, 1
    event_max = X[survival_event].max()[0, 0]
    X[dt.f[survival_event] != event_max, survival_event] = 0
    X[dt.f[survival_event] == event_max, survival_event] = 1

    stop_column_name = self.__class__._stop_column_name
    ignored_columns = self.__class__._ignored_columns
    if stop_column_name is None:
        raise ValueError("Stop column name can't be null.")

    main_message = "Survival Analysis CoxPH pre-transformer will use event '{}' and time '{}' columns.".format(
        survival_event, stop_column_name)

    # in acceptance test simply return input X
    if stop_column_name not in X.names:
        loggerwarning(logger,
                      "Survival Analysis CoxPH pre-transformer found no time column '{}'.".format(stop_column_name))
        return X_original

    if not X[:, stop_column_name].stype in [dt.bool8, dt.int8, dt.int16, dt.int32, dt.int64,
                                            dt.float32, dt.float64]:
        raise ValueError("Stop column '{}' type must be numeric, but found '{}'".format(
            stop_column_name, X[:, stop_column_name].stype))

    # remove stop column from X
    del X_original[:, stop_column_name]
    self._output_feature_names = list(X_original.names)
    self._feature_desc = list(X_original.names)

    if self.run_count == 0 and self.context and self.context.experiment_id:
        loggerinfo(logger, main_message)
        task = kwargs.get('task')
        if task and main_message is not None:
            task.sync(key=self.context.experiment_id, progress=dict(type='update', message=main_message))
            task.flush()

    # Validate CoxPH requirements on stop column
    if X[stop_column_name].min()[0, 0] < 0:
        X[dt.f[stop_column_name] < 0, stop_column_name] = 0
        loggerwarning(logger, "Stop column can't be negative: replaced negative values with 0.")
    if X[stop_column_name].countna()[0, 0] > 0:
        X[dt.isna(dt.f[stop_column_name]), stop_column_name] = 0
        loggerwarning(logger, "Stop column can't contain NULLs: replaced NULL with 0.")

    h2o.init(port=config.h2o_recipes_port, log_dir=self.my_log_dir)
    model = H2OCoxProportionalHazardsEstimator(stop_column=stop_column_name,
                                               ties=self.ties,
                                               max_iterations=self.max_iterations)
    frame = h2o.H2OFrame(X.to_pandas())
    model_path = None
    risk_frame = None
    try:
        model.train(y=survival_event, training_frame=frame, ignored_columns=ignored_columns)
        self.id = model.model_id
        model_path = os.path.join(temporary_files_path, "h2o_model." + str(uuid.uuid4()))
        model_path = h2o.save_model(model=model, path=model_path)
        with open(model_path, "rb") as f:
            self.raw_model_bytes = f.read()

        risk_frame = model.predict(frame)
        X_original[:, "risk_score_coxph_{}_{}".format(self.ties, self.max_iterations)] = \
            risk_frame.as_data_frame(header=False)
        self._output_feature_names.append(
            f"{self.display_name}{orig_feat_prefix}riskscore_coxph{extra_prefix}{self.ties}_{self.max_iterations}")
        self._feature_desc.append(
            f"CoxPH model risk score [ties={self.ties}, max.iter={self.max_iterations}]")
        return X_original
    finally:
        if model_path is not None:
            remove(model_path)
        h2o.remove(model)
        h2o.remove(frame)
        if risk_frame is not None:
            h2o.remove(risk_frame)
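# --- Hedged sketch (assumption; h2o.load_model is the real API) ---
# fit_transform() keeps the estimator as raw bytes in self.raw_model_bytes,
# so scoring later presumably writes them back to disk and reloads:
import os
import uuid
import h2o

def reload_coxph_model(raw_model_bytes, tmp_dir):
    path = os.path.join(tmp_dir, "h2o_model." + str(uuid.uuid4()))
    with open(path, "wb") as f:
        f.write(raw_model_bytes)
    try:
        return h2o.load_model(path)  # binary model saved via h2o.save_model
    finally:
        os.remove(path)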
def fit(self, X: dt.Frame, y: np.array = None):
    """
    Fits ARIMA models (1 per time group) using historical target values contained in y
    Model fitting is distributed over a pool of processes and uses file storage to share the data with workers
    :param X: Datatable frame containing the features
    :param y: numpy array containing the historical values of the target
    :return: self
    """
    # Get the logger if it exists
    logger = None
    tmp_folder = str(uuid.uuid4()) + "_arima_folder/"
    if self.context and self.context.experiment_id:
        logger = make_experiment_logger(
            experiment_id=self.context.experiment_id,
            tmp_dir=self.context.tmp_dir,
            experiment_tmp_dir=self.context.experiment_tmp_dir
        )
        tmp_folder = self.context.experiment_tmp_dir + "/" + str(uuid.uuid4()) + "_arima_folder/"

    # Create a temp folder to store files used during the multiprocessing experiment
    # This temp folder will be removed at the end of the process
    loggerinfo(logger, "Arima temp folder {}".format(tmp_folder))
    try:
        os.mkdir(tmp_folder)
    except PermissionError:
        # This should not occur, so log a warning
        loggerwarning(logger, "Arima was denied temp folder creation rights")
        tmp_folder = temporary_files_path + "/" + str(uuid.uuid4()) + "_arima_folder/"
        os.mkdir(tmp_folder)
    except FileExistsError:
        # We should never be here since the temp dir name is expected to be unique
        loggerwarning(logger, "Arima temp folder already exists")
        tmp_folder = self.context.experiment_tmp_dir + "/" + str(uuid.uuid4()) + "_arima_folder/"
        os.mkdir(tmp_folder)
    except:
        # Revert to temporary file path
        tmp_folder = temporary_files_path + "/" + str(uuid.uuid4()) + "_arima_folder/"
        os.mkdir(tmp_folder)

    # Import the ARIMA python module
    pm = importlib.import_module('pmdarima')

    # Init models
    self.models = {}

    # Convert to pandas
    X = X.to_pandas()
    XX = X[self.tgc].copy()
    XX['y'] = np.array(y)
    self.nan_value = np.mean(y)
    self.ntrain = X.shape[0]

    # Group the input by TGC (Time group columns) excluding the time column itself
    tgc_wo_time = list(np.setdiff1d(self.tgc, self.time_column))
    if len(tgc_wo_time) > 0:
        XX_grp = XX.groupby(tgc_wo_time)
    else:
        XX_grp = [([None], XX)]

    # Prepare for multiprocessing
    num_tasks = len(XX_grp)

    def processor(out, res):
        out[res[0]] = res[1]

    pool_to_use = small_job_pool
    if hasattr(self, "params_base"):
        max_workers = self.params_base['n_jobs']
    else:
        loggerinfo(logger, "Custom Recipe does not have a params_base attribute")
        # Beware not to use the disable_gpus keyword here, looks like cython does not like it
        # max_workers = get_max_workers(True)
        # Just set default to 2
        max_workers = 2
    loggerinfo(logger, "Arima will use {} workers for parallel processing".format(max_workers))
    pool = pool_to_use(
        logger=None,
        processor=processor,
        num_tasks=num_tasks,
        max_workers=max_workers
    )

    # Build 1 ARIMA model per time group
    nb_groups = len(XX_grp)
    for _i_g, (key, X) in enumerate(XX_grp):
        # Just say where we are in the fitting process
        if (_i_g + 1) % max(1, nb_groups // 20) == 0:
            loggerinfo(logger, "Auto ARIMA : %d%% of groups fitted" % (100 * (_i_g + 1) // nb_groups))

        X_path = os.path.join(tmp_folder, "autoarima_X" + str(uuid.uuid4()))
        X = X.reset_index(drop=True)
        save_obj(X, X_path)
        key = key if isinstance(key, list) else [key]
        grp_hash = '_'.join(map(str, key))

        args = (X_path, grp_hash, self.time_column, tmp_folder)
        kwargs = {}
        pool.submit_tryget(None, MyParallelAutoArimaTransformer_fit_async,
                           args=args, kwargs=kwargs, out=self.models)
    pool.finish()

    for k, v in self.models.items():
        self.models[k] = load_obj(v) if v is not None else None
        remove(v)

    try:
        shutil.rmtree(tmp_folder)
        loggerinfo(logger, "Arima cleaned up temporary file folder.")
    except:
        loggerwarning(logger, "Arima could not delete the temporary file folder.")

    return self
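# --- Hedged sketch (assumption, reconstructed from the pool wiring above) ---
# The processor stores res[1] under key res[0], and fit() later calls
# load_obj on each stored value, so the worker plausibly returns
# (grp_hash, model_path). pmdarima.arima.auto_arima is the real API;
# the error handling and settings are illustrative:
def MyParallelAutoArimaTransformer_fit_async(X_path, grp_hash, time_column, tmp_folder):
    import pmdarima as pm
    X = load_obj(X_path)  # per-group frame saved by fit()
    try:
        order = np.argsort(X[time_column])  # fit on the time-ordered target
        model = pm.arima.auto_arima(X['y'].values[order], error_action='ignore')
    except Exception:
        model = None  # failed groups map to None and later predict the mean
    model_path = os.path.join(tmp_folder, "autoarima_model" + str(uuid.uuid4()))
    save_obj(model, model_path)
    remove(X_path)
    return grp_hash, model_path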
def transform(self, X: dt.Frame):
    """
    Uses fitted models (1 per time group) to predict the target
    If self.is_train exists, it means we are doing in-sample predictions;
    if it does not exist, then ARIMA is used to predict the future
    :param X: Datatable Frame containing the features
    :return: ARIMA predictions
    """
    # Get the logger if it exists
    logger = None
    tmp_folder = str(uuid.uuid4()) + "_arima_folder/"
    if self.context and self.context.experiment_id:
        logger = make_experiment_logger(
            experiment_id=self.context.experiment_id,
            tmp_dir=self.context.tmp_dir,
            experiment_tmp_dir=self.context.experiment_tmp_dir
        )
        tmp_folder = self.context.experiment_tmp_dir + "/" + str(uuid.uuid4()) + "_arima_folder/"

    # Create a temp folder to store files used during the multiprocessing experiment
    # This temp folder will be removed at the end of the process
    loggerinfo(logger, "Arima temp folder {}".format(tmp_folder))
    try:
        os.mkdir(tmp_folder)
    except PermissionError:
        # This should not occur, so log a warning
        loggerwarning(logger, "Arima was denied temp folder creation rights")
        tmp_folder = temporary_files_path + "/" + str(uuid.uuid4()) + "_arima_folder/"
        os.mkdir(tmp_folder)
    except FileExistsError:
        # We should never be here since the temp dir name is expected to be unique
        loggerwarning(logger, "Arima temp folder already exists")
        tmp_folder = self.context.experiment_tmp_dir + "/" + str(uuid.uuid4()) + "_arima_folder/"
        os.mkdir(tmp_folder)
    except:
        # Revert to temporary file path
        loggerwarning(logger, "Arima defaulted to create folder inside tmp directory.")
        tmp_folder = temporary_files_path + "/" + str(uuid.uuid4()) + "_arima_folder/"
        os.mkdir(tmp_folder)

    X = X.to_pandas()
    XX = X[self.tgc].copy()
    tgc_wo_time = list(np.setdiff1d(self.tgc, self.time_column))
    if len(tgc_wo_time) > 0:
        XX_grp = XX.groupby(tgc_wo_time)
    else:
        XX_grp = [([None], XX)]

    assert len(XX_grp) > 0
    num_tasks = len(XX_grp)

    def processor(out, res):
        out.append(res)

    pool_to_use = small_job_pool
    pool = pool_to_use(logger=None, processor=processor, num_tasks=num_tasks)

    XX_paths = []
    model_paths = []
    nb_groups = len(XX_grp)
    for _i_g, (key, X) in enumerate(XX_grp):
        # Just print where we are in the process of transforming groups
        if (_i_g + 1) % max(1, nb_groups // 20) == 0:
            loggerinfo(logger, "Auto ARIMA : %d%% of groups transformed" % (100 * (_i_g + 1) // nb_groups))

        # Create time group key to store and retrieve fitted models
        key = key if isinstance(key, list) else [key]
        grp_hash = '_'.join(map(str, key))

        # Create file path to store data and pass it to the transform pool
        X_path = os.path.join(tmp_folder, "autoarima_Xt" + str(uuid.uuid4()))

        # Commented for performance, uncomment for debug
        # print("ARIMA - transforming data of shape: %s for group: %s" % (str(X.shape), grp_hash))
        if grp_hash in self.models:
            model = self.models[grp_hash]
            model_path = os.path.join(tmp_folder, "autoarima_modelt" + str(uuid.uuid4()))
            save_obj(model, model_path)
            save_obj(X, X_path)
            model_paths.append(model_path)

            args = (model_path, X_path, self.nan_value, hasattr(self, 'is_train'),
                    self.time_column, tmp_folder)
            kwargs = {}
            pool.submit_tryget(None, MyParallelAutoArimaTransformer_transform_async,
                               args=args, kwargs=kwargs, out=XX_paths)
        else:
            # Don't go through pools for unseen groups
            XX = pd.DataFrame(np.full((X.shape[0], 1), self.nan_value), columns=['yhat'])
            # Sync indices
            XX.index = X.index
            save_obj(XX, X_path)
            XX_paths.append(X_path)
    pool.finish()

    XX = pd.concat((load_obj(XX_path) for XX_path in XX_paths), axis=0).sort_index()
    for p in XX_paths + model_paths:
        remove(p)

    try:
        shutil.rmtree(tmp_folder)
        loggerinfo(logger, "Arima cleaned up temporary file folder.")
    except:
        loggerwarning(logger, "Arima could not delete the temporary file folder.")

    return XX
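# --- Hedged sketch (assumption; predict_in_sample and predict are real
# pmdarima methods, the rest mirrors the plumbing above) ---
# The transform-side worker presumably reloads the pickled model, scores
# in-sample when is_train is set or forecasts ahead otherwise, and returns
# the path to the saved yhat frame that the pool's processor collects:
def MyParallelAutoArimaTransformer_transform_async(model_path, X_path, nan_value,
                                                   is_train, time_column, tmp_folder):
    model = load_obj(model_path)
    X = load_obj(X_path)
    if model is not None:
        if is_train:
            yhat = model.predict_in_sample()            # in-sample predictions
        else:
            yhat = model.predict(n_periods=X.shape[0])  # forecast the future
        order = np.argsort(X[time_column])
        XX = pd.DataFrame(yhat, columns=['yhat'])
        XX.index = X.index[order]  # map time-ordered predictions back to rows
    else:
        XX = pd.DataFrame(np.full((X.shape[0], 1), nan_value), columns=['yhat'])
        XX.index = X.index
    save_obj(XX, X_path)
    remove(model_path)
    return X_path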