def test_for_older_version():
    """Check the warning behaviour when un-pickling an ARIMA.

    The same estimator is pushed through three scenarios, in order:

    1. Serialized/deserialized BEFORE it has been fit: no warning expected.
    2. Saved after being fit, but with its ``pkg_version_`` attribute
       removed (as for a very old version): a warning is expected.
    3. Saved after being fit with a ``pkg_version_`` that does not match
       the current one: a warning is expected.
    """
    model = ARIMA(order=(0, 0, 0), trend='c', suppress_warnings=True)
    pickle_file = 'model.pkl'

    # (case number, fit before pickling?, warning expected on load?)
    scenarios = (
        (1, False, False),
        (2, True, True),
        (3, True, True),
    )

    for case, do_fit, expect_warning in scenarios:
        if do_fit:
            model.fit(y)

        # Case 2 strips the version marker; case 3 makes it stale.
        if case == 2:
            delattr(model, 'pkg_version_')
        elif case == 3:
            model.pkg_version_ = '0.0.1'  # will always be < than current

        try:
            joblib.dump(model, pickle_file)

            # Reload while recording warnings, then compare with expectation.
            with warnings.catch_warnings(record=True) as caught:
                restored = joblib.load(pickle_file)  # type: ARIMA

                if expect_warning:
                    assert len(caught) > 0
                else:
                    assert not len(caught)

                # A fitted model must still be able to forecast after reload.
                if do_fit:
                    restored.predict(n_periods=4)
        finally:
            model._clear_cached_state()
            os.unlink(pickle_file)
def evaluate_arima(row, y, validation_size):
    """
    Evaluate the model with the params given in the row.

    An ARIMA built from ``get_arima_params(row)`` is fitted on ``y`` without
    its last ``validation_size`` points, asked to predict ``validation_size``
    steps, and the prediction is scored against the held-out tail of ``y``.

    :param row: The row to get the parameters from
    :param y: The y target vector
    :param validation_size: The validation distance to use for scoring
    :return: dict with ``params``, ``prediction``, ``fit_time``,
        ``prediction_time``, every entry returned by ``score_prediction``,
        and ``norm_factor``
    """
    used_y = y[:-validation_size]
    params = get_arima_params(row)

    times = dict()

    # time.clock() was removed in Python 3.8; perf_counter() is the
    # documented replacement for measuring elapsed intervals.
    start = time.perf_counter()
    model = ARIMA(**params)
    fit = model.fit(y=used_y)
    times['fit_time'] = time.perf_counter() - start

    start = time.perf_counter()
    prediction = fit.predict(validation_size)
    times['prediction_time'] = time.perf_counter() - start

    # Computed after the timer stops so it no longer inflates
    # 'prediction_time'.
    # NOTE(review): a constant ``y`` makes the range zero — confirm whether
    # callers can pass such a series.
    norm_factor = 1 / (y.max() - y.min())

    scores = score_prediction(y[-validation_size:], prediction, norm_factor)
    return {
        'params': params,
        'prediction': prediction,
        **times,
        **scores,
        'norm_factor': norm_factor
    }
def test_double_pickle():
    """Pickle one fitted ARIMA twice and check both copies stay usable.

    The model is dumped once uncompressed and once with ``compress=2``;
    both loaded copies must produce matching predictions. After the cached
    state of both loaded models is cleared, loading the first pickle again
    must raise an ``OSError`` whose message mentions the missing saved
    model state.
    """
    arima = ARIMA(order=(0, 0, 0), trend='c', suppress_warnings=True)
    arima.fit(y)

    # Now save it twice
    file_a = 'first.pkl'
    file_b = 'second.pkl'

    try:
        # No compression
        joblib.dump(arima, file_a)

        # Sleep between pickling so that the "pickle hash" for the ARIMA is
        # different by enough. We could theoretically also just use a UUID
        # for part of the hash to make sure it's unique?
        time.sleep(0.5)

        # Some compression
        joblib.dump(arima, file_b, compress=2)

        # Load both and prove they can both predict
        loaded_a = joblib.load(file_a)  # type: ARIMA
        loaded_b = joblib.load(file_b)  # type: ARIMA
        pred_a = loaded_a.predict(n_periods=5)
        pred_b = loaded_b.predict(n_periods=5)
        assert np.allclose(pred_a, pred_b)

        # Remove the caches from each
        loaded_a._clear_cached_state()
        loaded_b._clear_cached_state()

        # Test the previous condition where we removed the saved state of an
        # ARIMA from statsmodels and caused an OSError and a corrupted pickle
        with pytest.raises(OSError) as o:
            joblib.load(file_a)  # fails since no cached state there!

        # Inspect the raised exception itself (o.value), not the
        # ExceptionInfo wrapper, and do it after the ``with`` block so the
        # assertion is actually reached.
        msg = str(o.value)
        assert 'Could not read saved model state' in msg, msg

    # Always remove in case we fail in try, leaving residual files
    finally:
        # Guard each removal so a failure before a file was written is not
        # masked by FileNotFoundError, and so both files get a chance to be
        # cleaned up.
        for path in (file_a, file_b):
            if os.path.exists(path):
                os.unlink(path)
class PyramidWrapper(BaseRegressionWrapper):
    """Regression wrapper around ``pyramid.arima.ARIMA`` models.

    The fitted pyramid estimator is kept in ``self._raw``; its residuals in
    ``self._resid``; a flat dictionary of metrics and per-coefficient
    statistics in ``self._result``.
    """

    # Attributes.
    _name = 'PYRAMID'  # Label to add to the attributes when saving.

    def __init__(self, **kwargs):
        """Store the configuration and build the underlying ARIMA.

        Parameters
        ----------
        **kwargs :
            Forwarded both to the base wrapper constructor and to
            ``pyramid.arima.ARIMA``. When no keyword is given no model is
            created (``self._model`` is left unset).
        """
        # Library.
        from pyramid.arima import ARIMA as PYARIMA

        # Save config parameters.
        super(PyramidWrapper, self).__init__(**kwargs)

        # Create model
        if kwargs:
            self._model = PYARIMA(**kwargs)

    def _identifier(self):
        """This method creates a name that describes the model.

        Currently returns a fixed placeholder string.
        """
        # TODO: build the identifier from the model settings, e.g.
        #   "%s%sx%s [%s,%s]" % (self._name, self.order,
        #                        self.seasonal_order, self.trend, exogenous)
        # with exogenous = getattr(self, 'exogenous', None) is not None.
        return "name to do"

    # --------------------------------------------------------------------------
    #                             SET VARIABLES
    # --------------------------------------------------------------------------
    def _params_from_summary(self):
        """Gets parameters from the summary result of the raw object.

        The summary is rendered as CSV, its first line dropped, and the
        remaining cells flattened into one list; statistics are then read
        by their position counted from the end of that list.

        Returns
        -------
        dict :
            Keys prefixed with ``s_``. ``s_H_value`` and
            ``s_heteroskedasticity`` read the same cell; the omnibus
            entries are always None.
        """
        # Format summary
        summary = self._raw.summary().as_csv()
        summary = summary.split("\n", 1)[1]   # Remove first line.
        summary = summary.replace("\n", ",")  # Replace \n by comma.

        # Split in elements.
        elements = summary.split(",")
        elements = [self._cast_float(e.strip()) for e in elements]

        # Create series.
        d = {}

        # Add parameters.
        d['s_jb_value'] = elements[-13]
        d['s_jb_prob'] = elements[-9]
        d['s_skew'] = elements[-5]
        d['s_Q_value'] = elements[-15]
        d['s_Q_prob'] = elements[-11]
        d['s_H_value'] = elements[-7]
        d['s_H_prob'] = elements[-3]
        d['s_kurtosis'] = elements[-1]
        d['s_heteroskedasticity'] = elements[-7]
        d['s_omnibus_value'] = None
        d['s_omnibus_prob'] = None

        # Return
        return d

    def _init_result(self, alpha=0.05):
        """Collect metrics and coefficient statistics of the fitted model.

        Parameters
        ----------
        alpha : float
            Passed to ``arima_res_.conf_int`` when computing the
            coefficient confidence intervals.

        Returns
        -------
        dict :
            ``aic``, ``aicc``, ``bic``, ``hqic``, ``llf``; for every
            parameter name ``<name>_coef``, ``<name>_std``,
            ``<name>_tvalue``, ``<name>_tprob``, ``<name>_cil`` and
            ``<name>_ciu``; plus whatever ``self._resid_stats()`` returns.
        """
        # Create series.
        d = {}

        # Add generic metrics.
        d['aic'] = self._raw.aic()
        d['aicc'] = self._raw.aicc()
        d['bic'] = self._raw.bic()
        d['hqic'] = self._raw.hqic()

        arima_res = self._raw.arima_res_
        d['llf'] = arima_res.llf

        # Check if it is arima or sarimax and get corresponding values.
        # (No trailing comma here: a 1-tuple would make the zip below stop
        # after the first parameter and store the whole array as its
        # statistic.)
        if self._raw.seasonal_order is not None:
            statistic_values = arima_res.zvalues
        else:
            statistic_values = arima_res.tvalues

        # Create params information.
        params_data = zip(arima_res.data.param_names,
                          arima_res.params,
                          arima_res.bse,
                          statistic_values,
                          arima_res.pvalues,
                          arima_res.conf_int(alpha))

        # Add coefficients statistics to series.
        for name, coef, std, tvalue, pvalue, (cil, ciu) in params_data:
            d['%s_%s' % (name, 'coef')] = coef
            d['%s_%s' % (name, 'std')] = std
            d['%s_%s' % (name, 'tvalue')] = tvalue
            d['%s_%s' % (name, 'tprob')] = pvalue
            d['%s_%s' % (name, 'cil')] = cil
            d['%s_%s' % (name, 'ciu')] = ciu

        # Further statistics
        d.update(self._resid_stats())

        # We cannot use _params_from_summary because this wrapper stores
        # different models with different summaries. The right way to solve
        # this is by performing the statistics related with the residuals
        # in the regression_wrapper._resid_stats.

        # Return
        return d

    def _init_config(self):
        """This method initialises the configuration.

        The configuration is gathered from the arguments of ``fit`` on
        ``self._raw.arima_res_.model`` and of ``__init__`` on ``self._raw``
        (the ARIMA); entries from the latter win on name clashes.

        TODO: Handle if the instances passed to getargspecdict do not exist.
        """
        # Create dictionary.
        d = {}

        # Fill it.
        d.update(self._getargspecdict(self._raw.arima_res_.model, 'fit'))
        d.update(self._getargspecdict(self._raw, '__init__'))

        # Return
        return d

    # --------------------------------------------------------------------------
    #                            HELPER METHODS
    # --------------------------------------------------------------------------
    def as_summary(self, **kwargs):
        """This method displays the summary.

        The text summary of the raw model is trimmed after its last
        78-character ``=`` rule, lightly reformatted, and extended with a
        "Manual" section reporting the omnibus placeholder, Durbin-Watson
        and normality statistics held by this wrapper.

        Parameters
        ----------
        **kwargs :
            Forwarded to ``self._raw.summary``.

        Returns
        -------
        str :
            The assembled summary text.
        """
        # Elements to split by.
        find = "=" * 78

        # Split and fill.
        smry = find.join(
            self._raw.summary(**kwargs).as_text().split(find)[:-1])
        smry = smry.split("\n")
        smry[-6] = smry[-6].replace('=', '', 5)
        smry[-5:] = [v.replace(' ', '', 5) for v in smry[-5:]]
        smry = "\n".join(smry[:-1])

        # Variables (omnibus is not computed and is reported as 0.0).
        om, dw = 0.0, self.m_dw
        nm, nmp = self.m_nm_value, self.m_nm_prob

        # Add in new lines.
        smry += "\n%s\n%s\n%s\n" % ("=" * 78,
                                    "Manual".center(78, ' '),
                                    "-" * 78)
        smry += "Omnibus: %#25.3f Durbin-Watson: %#23.3f\n" % (om, dw)
        smry += "Normal (N): %#25.3f Prob(N): %#23.3f\n" % (nm, nmp)
        smry += "=" * 78 + "\n"
        smry += "Note that JB, P(JB), skew and kurtosis have different values.\n"
        smry += "Note that Prob(Q) tests no correlation of residuals."

        # Return
        return smry

    # --------------------------------------------------------------------------
    #                                 FIT
    # --------------------------------------------------------------------------
    def fit(self, **kwargs):
        """This method fits the specified arima model.

        Parameters
        ----------
        **kwargs :
            Recorded in ``self._config`` and forwarded unchanged to the
            ``fit`` method of the underlying model.

        Returns
        -------
        object : A PyramidWrapper object (``self``).
        """
        # Fill config.
        self._config.update(kwargs)

        # Set model
        self._raw = self._model.fit(**kwargs)

        # Set residuals as attribute.
        self._resid = self._raw.resid()

        # Set series with interesting params.
        self._result = self._init_result(alpha=0.05)

        # return object.
        return self

    # ---------------------------------------------------------------------------
    #                              PREDICTION
    # ---------------------------------------------------------------------------
    def get_prediction(self, **kwargs):
        """Compute the in-sample prediction with its confidence band.

        Parameters
        ----------
        **kwargs :
            Forwarded to ``self._raw.predict_in_sample`` and to
            ``self._time``.

        Returns
        -------
        np.ndarray :
            Four stacked rows: time, mean, lower bound and upper bound
            (bounds computed with ``alpha=0.05``).
        """
        # Compute prediction
        forecast = self._raw.predict_in_sample(**kwargs)

        # Get plotting values; the interval is computed once and sliced.
        mean = forecast.reshape(1, -1)
        conf_int = self.conf_int_insample(mean, alpha=0.05)
        cilo = conf_int[:, 0].reshape(1, -1)
        ciup = conf_int[:, 1].reshape(1, -1)

        # Time (named so that it does not shadow the ``time`` module).
        time_axis = self._time(forecast=mean, **kwargs).reshape(1, -1)

        # Get plotting values.
        return np.concatenate((time_axis, mean, cilo, ciup), axis=0)

    def _time(self, forecast, start=None, **kwargs):
        """Build the time index matching a forecast.

        Parameters
        ----------
        forecast : array with at least two dimensions
            Only ``forecast.shape[1]`` is used, as the number of steps.
        start : int, optional
            Offset added to the index. Defaults to the ``k_diff`` attribute
            of ``self._raw.arima_res_`` when present, otherwise 0.
        **kwargs :
            Ignored.

        Returns
        -------
        np.ndarray :
            ``np.arange(forecast.shape[1]) + start``.
        """
        # Get default start.
        if start is None:
            start = getattr(self._raw.arima_res_, 'k_diff', 0)

        # Return
        return np.arange(forecast.shape[1]) + start

    # ---------------------------------------------------------------------------
    #                               FIND AUTO
    # ---------------------------------------------------------------------------
    def from_instance(self, arima, **kwargs):
        """This method constructs a PyramidWrapper object from pyramid.ARIMA

        Parameters
        ----------
        arima :
            An already fitted pyramid ARIMA; it becomes ``_raw`` of the new
            wrapper.
        **kwargs :
            Ignored.

        Returns
        -------
        PyramidWrapper :
            A new wrapper (not ``self``) with residuals, results and
            configuration populated from ``arima``.
        """
        # Create object.
        instance = PyramidWrapper()

        # Set model.
        instance._raw = arima

        # Set residuals as attribute.
        instance._resid = arima.resid()

        # Set series with interesting params.
        instance._result = instance._init_result(alpha=0.05)

        # Set configuration parameters.
        instance._config = instance._init_config()

        # Return
        return instance

    def auto(self, **kwargs):
        """This method finds the best arima.

        @see pyramid.arima.auto_arima

        Parameters
        ----------
        **kwargs :
            Forwarded to ``pyramid.arima.auto_arima``.

        Returns
        -------
        list of PyramidWrapper, or None
            One wrapper per model returned by ``auto_arima`` (a single
            model yields a one-element list). None when the result is
            neither an ARIMA nor a list.
        """
        # Library.
        from pyramid.arima import auto_arima
        from pyramid.arima.arima import ARIMA

        # Compute auto_arima.
        results = auto_arima(**kwargs)

        # Return a single PyramidWrapper object.
        if isinstance(results, ARIMA):
            return [self.from_instance(results)]

        # Return an array of PyramidWrapper objects.
        if isinstance(results, list):
            return [self.from_instance(a) for a in results]

        # Unrecognised result type: nothing to wrap.
        return None
"2014-12-31", "%Y-%m-%d")] # Run ARIMA with found parameters stepwise = ARIMA(callback=None, disp=0, maxiter=50, method=None, order=(10, 1, 12), seasonal_order=(4, 1, 2, 52), solver="lbfgs", suppress_warnings=True, transparams=True, trend="c") # Fit and predict print("Fitting and Predicting...") stepwise.fit(train.drop("WeekEnding", axis=1)) future = stepwise.predict(n_periods=len(test.index)) # Merge predictions with raw data future = pd.DataFrame(future, index=test["WeekEnding"], columns=["Forecast"]) df = df.set_index("WeekEnding").join(future, how="outer") forecast = df.dropna() # Plot vs actual data plt.plot(df) plt.xlabel("Date") plt.ylabel("Lower 48 Inventory (Bcf)") plt.show()
def forecasting_sales():
    """Forecast the last ``period`` sales points and report the accuracy.

    Reads the ``period`` query argument, downloads the sales CSV, fits a
    seasonal ARIMA on the Box-Cox transformed ``VENDA`` column of the
    training rows, forecasts ``period`` steps, inverts the transform and
    compares against the held-out rows.

    Returns
    -------
    The ``jsonify`` response for a dict with keys ``acuracia``
    (100 minus the mean absolute percentage error, rounded), ``real``
    (observed values) and ``previsto`` (predicted values).

    Raises
    ------
    TypeError, ValueError
        From ``int`` when ``period`` is missing or not an integer string.
        Any other error from the download, fit or scoring propagates
        unchanged.
    """
    # Parse the horizon once, and before the download, so that a bad
    # request fails without fetching the CSV.
    period = int(request.args.get('period'))

    data = pd.read_csv(
        'http://robsonfernandes.net/mestrado/data/food-sp.csv')
    print('Passou 00')

    variavel = 'VENDA'
    data.index = data['DATA']

    # The last ``period`` of the first 96 rows are held out for testing.
    # NOTE(review): training starts at row 1, so row 0 is never used —
    # confirm this is intentional.
    interval = 96 - period
    # Copy so the new column is written to an independent frame rather
    # than to a slice of ``data`` (avoids chained-assignment issues).
    df_train = data.iloc[1:interval, ].copy()
    df_test = data.iloc[interval:96, ]

    df_train[variavel + '_box'], lmbda = stats.boxcox(df_train[variavel])
    print('Passou 01')

    # NOTE(review): an auto_arima search (start_p=0, start_q=0, max_p=5,
    # max_q=5, m=20, d=1, D=1, stepwise=True) used to be sketched here;
    # the fixed orders below presumably came from it — confirm.
    model = ARIMA(callback=None,
                  disp=0,
                  maxiter=50,
                  method=None,
                  order=(1, 1, 1),
                  out_of_sample_size=0,
                  scoring='mse',
                  scoring_args={},
                  seasonal_order=(2, 1, 1, 20),
                  solver='lbfgs',
                  start_params=None,
                  suppress_warnings=True,
                  transparams=True,
                  trend='c')
    model.fit(df_train[variavel + '_box'])

    # Forecast on the transformed scale, then map back to sales units.
    forecast = model.predict(n_periods=period)
    y_pred = invboxcox(forecast, lmbda)
    y_true = df_test[variavel].values
    print('Passou 02')

    acuracia = round(100 - mean_absolute_percentage_error(y_true, y_pred), 0)
    retorno = {
        'acuracia': acuracia,
        'real': y_true.tolist(),
        'previsto': y_pred.tolist()
    }
    return jsonify(retorno)
class AutoArima(SupervisedLearnerPrimitiveBase[Inputs, Outputs, ArimaParams, ArimaHyperparams]):
    # ARIMA forecasting primitive: wraps an ``ARIMA`` estimator configured
    # from the hyperparameters, trains it on the last column of the inputs
    # and, on produce, predicts as many periods as there are input rows.

    __author__ = 'USC ISI'
    metadata = hyperparams.base.PrimitiveMetadata({
        # Required
        "id": 'b2e4e8ea-76dc-439e-8e46-b377bf616a35',
        "version": config.VERSION,
        "name": "DSBox Arima Primitive",
        "description": "Arima primitive for timeseries data regression/forcasting problems, transferred from pyramid/Arima",
        "python_path": "d3m.primitives.time_series_forecasting.Arima.DSBOX",
        "primitive_family": "TIME_SERIES_FORECASTING",
        "algorithm_types": ["AUTOREGRESSIVE_INTEGRATED_MOVING_AVERAGE"],
        "source": {
            "name": config.D3M_PERFORMER_TEAM,
            "contact": config.D3M_CONTACT,
            "uris": [config.REPOSITORY]
        },
        "keywords": ["Transform", "Timeseries", "Aggregate"],
        "installation": [config.INSTALLATION],
        "precondition": ["NO_MISSING_VALUES", "NO_CATEGORICAL_VALUES"],
    })

    def __init__(self, *,
                 hyperparams: ArimaHyperparams,
                 random_seed: int = 0,
                 docker_containers: Dict[str, DockerContainer] = None,
                 _verbose: int = 0) -> None:
        """Build the underlying ARIMA from the hyperparameters.

        The seasonal order is only passed on when the ``is_seasonal``
        hyperparameter is truthy. ``_verbose`` is accepted but not used.
        """
        super().__init__(hyperparams=hyperparams,
                         random_seed=random_seed,
                         docker_containers=docker_containers)

        if self.hyperparams["is_seasonal"]:
            seasonal_order = self.hyperparams["seasonal_order"]
        else:
            seasonal_order = None

        self._clf = ARIMA(
            order=(self.hyperparams["P"],
                   self.hyperparams["D"],
                   self.hyperparams["Q"]),
            seasonal_order=seasonal_order,
            transparams=self.hyperparams["transparams"],
            method=self.hyperparams["method"],
            trend=self.hyperparams["trend"],
            solver=self.hyperparams["solver"],
            maxiter=self.hyperparams["maxiter"],
            disp=self.hyperparams["disp"],
            callback=None,
            suppress_warnings=self.hyperparams["suppress_warnings"],
            # Integer zero (previously the equal-valued ``False``).
            out_of_sample_size=0,
            scoring="mse",
            scoring_args=None
        )

        self._training_inputs = None
        self._training_outputs = None
        self._target_names = None
        self._training_indices = None
        self._fitted = False
        self._length_for_produce = 0

    def set_training_data(self, *, inputs: Inputs) -> None:
        """Use the last column of ``inputs`` as the series to train on.

        An empty series only prints a warning and leaves the previous
        state untouched. Otherwise the series is stored as both training
        inputs and outputs, its column label is remembered as the target
        name, and the primitive is marked as not fitted so that the next
        ``fit`` call trains on the new data.
        """
        inputs_timeseries = d3m_dataframe(inputs.iloc[:, -1])

        if len(inputs_timeseries) == 0:
            print(
                "Warning: Inputs timeseries data to timeseries_featurization primitive's length is 0.")
            return

        column_name = inputs_timeseries.columns[0]
        self._training_inputs, self._target_names = inputs_timeseries, column_name
        self._training_outputs = inputs_timeseries
        # New training data invalidates any earlier fit; without this,
        # ``fit`` would return early and keep the old model.
        self._fitted = False

    def fit(self, *, timeout: float = None, iterations: int = None) -> CallResult[None]:
        """Fit the ARIMA on the stored training series.

        Does nothing when already fitted. ``timeout`` and ``iterations``
        are accepted but not used.

        Raises
        ------
        ValueError
            If no training data has been set, or if the training output is
            neither one-dimensional nor a single column.
        """
        if self._fitted:
            return CallResult(None)

        if self._training_inputs is None or self._training_outputs is None:
            raise ValueError("Missing training data.")

        arima_training_output = d3m_ndarray(self._training_outputs)
        shape = arima_training_output.shape

        # Flatten a single column (or an already flat array); anything
        # else used to leave the variable unbound and fail with a
        # confusing NameError further down.
        if len(shape) == 1 or (len(shape) == 2 and shape[1] == 1):
            sk_training_output = np.ravel(arima_training_output)
        else:
            raise ValueError(
                "Expected a single training series, got shape %s." % (shape,))

        self._clf.fit(sk_training_output)
        self._fitted = True
        return CallResult(None)

    def produce(self, *, inputs: Inputs, timeout: float = None, iterations: int = None) -> CallResult[Outputs]:
        """Predict ``len(inputs)`` periods ahead.

        Only the length and the metadata of ``inputs`` are used. The
        prediction is wrapped in a dataframe whose columns are tagged as
        Target / PredictedTarget and named after the training column.
        ``timeout`` and ``iterations`` are accepted but not used.
        """
        # The former ``inputs.iloc[:, self._training_indices]`` lookup was
        # dropped: its result was never used and ``_training_indices`` is
        # only ever set to None in this class.
        sk_output = self._clf.predict(n_periods=len(inputs))

        output = d3m_dataframe(sk_output, generate_metadata=False, source=self)
        output.metadata = inputs.metadata.clear(
            source=self, for_value=output, generate_metadata=True)
        output.metadata = self._add_target_semantic_types(
            metadata=output.metadata, target_names=self._target_names, source=self)

        return CallResult(output)

    def get_params(self) -> ArimaParams:
        """Return the underlying ARIMA wrapped in the params container."""
        # NOTE(review): previously built with the bare name ``Params``;
        # ``ArimaParams`` is the params type this primitive is declared
        # with — confirm it accepts ``arima`` as its only field.
        return ArimaParams(arima=self._clf)

    def set_params(self, *, params: ArimaParams) -> None:
        """Replace the underlying ARIMA with ``params["arima"]``."""
        self._clf = params["arima"]

    @classmethod
    def _get_columns_to_fit(cls, inputs: Inputs, hyperparams: ArimaHyperparams):
        """Select the columns to train on.

        Returns ``(inputs, all column indices)`` when semantic types are
        not used; otherwise the columns chosen by
        ``common_utils.get_columns_to_use`` together with their indices.
        """
        if not hyperparams['use_semantic_types']:
            return inputs, list(range(len(inputs.columns)))

        inputs_metadata = inputs.metadata

        def can_produce_column(column_index: int) -> bool:
            return cls._can_produce_column(inputs_metadata, column_index, hyperparams)

        columns_to_produce, columns_not_to_produce = common_utils.get_columns_to_use(
            inputs_metadata,
            use_columns=hyperparams['use_columns'],
            exclude_columns=hyperparams['exclude_columns'],
            can_use_column=can_produce_column)
        return inputs.iloc[:, columns_to_produce], columns_to_produce

    @classmethod
    def _can_produce_column(cls, inputs_metadata: metadata_base.DataMetadata, column_index: int,
                            hyperparams: ArimaHyperparams) -> bool:
        """Tell whether a column carries the Attribute semantic type.

        A column without any semantic type is rejected after logging a
        warning.
        """
        column_metadata = inputs_metadata.query(
            (metadata_base.ALL_ELEMENTS, column_index))

        semantic_types = column_metadata.get('semantic_types', [])
        if len(semantic_types) == 0:
            cls.logger.warning("No semantic types found in column metadata")
            return False
        if "https://metadata.datadrivendiscovery.org/types/Attribute" in semantic_types:
            return True

        return False

    @classmethod
    def _get_targets(cls, data: d3m_dataframe, hyperparams: ArimaHyperparams):
        """Extract the TrueTarget columns and their names.

        Returns ``(data, [])`` when semantic types are not used; otherwise
        the target columns and, for each, its metadata ``name`` (falling
        back to the column index as a string).
        """
        if not hyperparams['use_semantic_types']:
            return data, []

        target_names = []
        target_column_indices = []
        metadata = data.metadata
        target_column_indices.extend(metadata.get_columns_with_semantic_type(
            'https://metadata.datadrivendiscovery.org/types/TrueTarget'))

        for column_index in target_column_indices:
            if column_index is metadata_base.ALL_ELEMENTS:
                continue
            column_index = typing.cast(
                metadata_base.SimpleSelectorSegment, column_index)
            column_metadata = metadata.query(
                (metadata_base.ALL_ELEMENTS, column_index))
            target_names.append(column_metadata.get('name', str(column_index)))

        targets = data.iloc[:, target_column_indices]
        return targets, target_names

    @classmethod
    def _add_target_semantic_types(cls, metadata: metadata_base.DataMetadata,
                                   source: typing.Any, target_names: List = None,) -> metadata_base.DataMetadata:
        """Tag every column as Target and PredictedTarget, and name it.

        ``target_names`` may be a list of names (one per column) or a
        single name given as a string. Columns for which no name is
        available keep their existing name.
        """
        # ``produce`` passes the training column label, which is a plain
        # string; indexing it per column would yield single characters.
        if isinstance(target_names, str):
            target_names = [target_names] if target_names else []

        n_columns = metadata.query((metadata_base.ALL_ELEMENTS,))['dimension']['length']
        for column_index in range(n_columns):
            selector = (metadata_base.ALL_ELEMENTS, column_index)
            metadata = metadata.add_semantic_type(
                selector,
                'https://metadata.datadrivendiscovery.org/types/Target',
                source=source)
            metadata = metadata.add_semantic_type(
                selector,
                'https://metadata.datadrivendiscovery.org/types/PredictedTarget',
                source=source)
            if target_names and column_index < len(target_names):
                metadata = metadata.update(selector, {
                    'name': target_names[column_index],
                }, source=source)
        return metadata

    # functions to fit in devel branch of d3m (2019-1-17)
    def fit_multi_produce(self, *, produce_methods: typing.Sequence[str], inputs: Inputs,
                          timeout: float = None, iterations: int = None) -> MultiCallResult:
        """
        A method calling ``fit`` and after that multiple produce methods at once.

        This override only narrows the signature to the arguments this
        primitive accepts (there is no ``outputs``) and delegates to
        ``self._fit_multi_produce``.

        Parameters
        ----------
        produce_methods : Sequence[str]
            A list of names of produce methods to call.
        inputs : Inputs
            The inputs given to ``set_training_data`` and all produce methods.
        timeout : float
            A maximum time this primitive should take to both fit the primitive and produce outputs
            for all produce methods listed in ``produce_methods`` argument, in seconds.
        iterations : int
            How many of internal iterations should the primitive do for both fitting and producing
            outputs of all produce methods.

        Returns
        -------
        MultiCallResult
            A dict of values for each produce method wrapped inside ``MultiCallResult``.
        """
        return self._fit_multi_produce(produce_methods=produce_methods,
                                       timeout=timeout,
                                       iterations=iterations,
                                       inputs=inputs)