def handle_formula_data(Y, X, formula, depth=0, missing='drop'):
    """
    Returns endog, exog, and the model specification from arrays and
    formula.

    Parameters
    ----------
    Y : array-like
        Either endog (the LHS) of a model specification or all of the data.
        Y must define __getitem__ for now.
    X : array-like
        Either exog or None. If all the data for the formula is provided in
        Y then you must explicitly set X to None.
    formula : str or patsy.model_desc
        You can pass a handler by importing formula_handler and adding a
        key-value pair where the key is the formula object class and the
        value is a function that returns endog, exog, formula object.

    Returns
    -------
    endog : array-like
        Should preserve the input type of Y,X.
    exog : array-like
        Should preserve the input type of Y,X. Could be None.
    """
    # half-hearted attempt to handle other formula objects
    if isinstance(formula, tuple(iterkeys(formula_handler))):
        return formula_handler[type(formula)]

    na_action = NAAction(on_NA=missing)

    # NOTE: the pandas and non-pandas branches were identical in the
    # original, so the _is_using_pandas checks are collapsed here.
    if X is not None:
        result = dmatrices(formula, (Y, X), depth, return_type='dataframe',
                           NA_action=na_action)
    else:
        result = dmatrices(formula, Y, depth, return_type='dataframe',
                           NA_action=na_action)

    # if missing == 'raise' there is no missing_mask
    missing_mask = getattr(na_action, 'missing_mask', None)
    if not np.any(missing_mask):
        missing_mask = None
    return result, missing_mask

def handle_formula_data(Y, X, formula, depth=0, missing='drop'):
    """
    Returns endog, exog, and the model specification from arrays and
    formula.

    Parameters
    ----------
    Y : array-like
        Either endog (the LHS) of a model specification or all of the data.
        Y must define __getitem__ for now.
    X : array-like
        Either exog or None. If all the data for the formula is provided in
        Y then you must explicitly set X to None.
    formula : str or patsy.model_desc
        You can pass a handler by importing formula_handler and adding a
        key-value pair where the key is the formula object class and the
        value is a function that returns endog, exog, formula object.

    Returns
    -------
    endog : array-like
        Should preserve the input type of Y,X.
    exog : array-like
        Should preserve the input type of Y,X. Could be None.
    """
    # half-hearted attempt to handle other formula objects
    if isinstance(formula, tuple(iterkeys(formula_handler))):
        return formula_handler[type(formula)]

    na_action = NAAction(on_NA=missing)

    # NOTE: the pandas and non-pandas branches were identical in the
    # original, so the _is_using_pandas checks are collapsed here.
    if X is not None:
        result = dmatrices(formula, (Y, X), depth, return_type='dataframe',
                           NA_action=na_action)
    else:
        result = dmatrices(formula, Y, depth, return_type='dataframe',
                           NA_action=na_action)

    # if missing == 'raise' there is no missing_mask
    missing_mask = getattr(na_action, 'missing_mask', None)
    if not np.any(missing_mask):
        missing_mask = None
    if len(result) > 1:  # have RHS design
        design_info = result[1].design_info  # detach it from DataFrame
    else:
        design_info = None
    # NOTE: is there ever a case where we'd need LHS design_info?
    return result, missing_mask, design_info

def concat(series, axis=0, allow_mix=False):
    """
    Concatenate a set of series.

    Parameters
    ----------
    series : iterable
        An iterable of series to be concatenated.
    axis : int, optional
        The axis along which to concatenate. Default is 0 (rows).
    allow_mix : bool
        Whether or not to allow a mix of pandas and non-pandas objects.
        Default is False. If true, the returned object is an ndarray, and
        additional pandas metadata (e.g. column names, indices, etc) is
        lost.

    Returns
    -------
    concatenated : array or pd.DataFrame
        The concatenated array. Will be a DataFrame if series are pandas
        objects.
    """
    is_pandas = np.r_[[_is_using_pandas(s, None) for s in series]]

    if np.all(is_pandas):
        concatenated = pd.concat(series, axis=axis)
    elif np.all(~is_pandas) or allow_mix:
        concatenated = np.concatenate(series, axis=axis)
    else:
        raise ValueError('Attempted to concatenate Pandas objects with'
                         ' non-Pandas objects with `allow_mix=False`.')

    return concatenated

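# A minimal usage sketch for `concat` above (assumes numpy and pandas are
# imported as `np`/`pd` and `_is_using_pandas` is in scope, as in the
# snippet): pandas inputs keep their metadata, while mixing pandas and
# non-pandas objects requires allow_mix=True and returns an ndarray.
s1 = pd.Series([1.0, 2.0], index=['a', 'b'])
s2 = pd.Series([3.0, 4.0], index=['c', 'd'])
both = concat([s1, s2], axis=0)            # pd.concat path, index preserved
arrs = concat([np.ones(2), np.zeros(2)])   # np.concatenate path
mixed = concat([s1, np.zeros(2)], allow_mix=True)  # ndarray, metadata lost
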
def sort(self, data, index=None):
    """Applies a (potentially hierarchical) sort operation on a numpy array
    or pandas series/dataframe based on the grouping index or a
    user-supplied index.  Returns an object of the same type as the
    original data as well as the matching (sorted) Pandas index.
    """
    if index is None:
        index = self.index
    if data_util._is_using_ndarray_type(data, None):
        if data.ndim == 1:
            out = pd.Series(data, index=index, copy=True)
            out = out.sort_index()
        else:
            out = pd.DataFrame(data, index=index)
            out = out.sort_index(inplace=False)  # copies
        return np.array(out), out.index
    elif data_util._is_using_pandas(data, None):
        out = data
        out = out.reindex(index)  # copies?
        out = out.sort_index()
        return out, out.index
    else:
        msg = 'data must be a Numpy array or a Pandas Series/DataFrame'
        raise ValueError(msg)

def handle_data(endog, exog, missing='none', hasconst=None, **kwargs):
    """
    Given the inputs, returns the appropriate ModelData subclass instance.
    """
    # deal with lists and tuples up-front
    if isinstance(endog, (list, tuple)):
        endog = np.asarray(endog)
    if isinstance(exog, (list, tuple)):
        exog = np.asarray(exog)

    if data_util._is_using_ndarray_type(endog, exog):
        klass = ModelData
    elif data_util._is_using_pandas(endog, exog):
        klass = PandasData
    elif data_util._is_using_patsy(endog, exog):
        klass = PatsyData
    # keep this check last
    elif data_util._is_using_ndarray(endog, exog):
        klass = ModelData
    else:
        raise ValueError('unrecognized data structures: %s / %s' %
                         (type(endog), type(exog)))
    return klass(endog, exog=exog, missing=missing, hasconst=hasconst,
                 **kwargs)

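# Hedged usage sketch for `handle_data` above: the dispatch wraps
# endog/exog in a ModelData subclass instance. Assumes numpy/pandas are
# imported as `np`/`pd` and the ModelData/PandasData classes from
# statsmodels.base.data are in scope, as in the snippet.
y = np.random.standard_normal(10)
X = np.ones((10, 2))
mdata = handle_data(y, X)                       # ndarray -> ModelData
pdata = handle_data(pd.Series(y), pd.DataFrame(X),
                    missing='drop')             # pandas -> PandasData
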
def __init__(self, fname, data, convert_dates=None, encoding="latin-1", byteorder=None): warnings.warn( "StataWriter is deprecated as of 0.10.0 and will be removed in a " "future version. Use pandas.DataFrame.to_stata or " "pandas.io.stata.StatWriter instead.", FutureWarning) self._convert_dates = convert_dates # attach nobs, nvars, data, varlist, typlist if data_util._is_using_pandas(data, None): self._prepare_pandas(data) elif data_util._is_array_like(data, None): data = np.asarray(data) if data_util._is_structured_ndarray(data): self._prepare_structured_array(data) else: if convert_dates is not None: raise ValueError("Not able to convert dates in a plain" " ndarray.") self._prepare_ndarray(data) else: # pragma : no cover raise ValueError("Type %s for data not understood" % type(data)) if byteorder is None: byteorder = sys.byteorder self._byteorder = _set_endianness(byteorder) self._encoding = encoding self._file = get_file_obj(fname, 'wb', encoding)
def handle_data(endog, exog):
    """
    Given the inputs, returns the appropriate ModelData subclass instance.
    """
    # deal with lists and tuples up-front
    if isinstance(endog, (list, tuple)):
        endog = np.asarray(endog)
    if isinstance(exog, (list, tuple)):
        exog = np.asarray(exog)

    if data_util._is_using_pandas(endog, exog):
        klass = PandasData
    elif data_util._is_using_larry(endog, exog):
        klass = LarryData
    elif data_util._is_using_timeseries(endog, exog):
        klass = TimeSeriesData
    elif data_util._is_using_patsy(endog, exog):
        klass = PatsyData
    # keep this check last
    elif data_util._is_using_ndarray(endog, exog):
        klass = ModelData
    else:
        raise ValueError("unrecognized data structures: %s / %s" %
                         (type(endog), type(exog)))

    return klass(endog, exog=exog)

def sort(self, data, index=None):
    '''Applies a (potentially hierarchical) sort operation on a numpy array
    or pandas series/dataframe based on the grouping index or a
    user-supplied index.  Returns an object of the same type as the
    original data as well as the matching (sorted) Pandas index.
    '''
    if index is None:
        index = self.index
    if data_util._is_using_ndarray_type(data, None):
        if data.ndim == 1:
            out = pd.Series(data, index=index, copy=True)
            out = out.sort_index()
        else:
            out = pd.DataFrame(data, index=index)
            # copies; DataFrame.sort was removed from pandas, sort_index
            # is the equivalent by-index sort
            out = out.sort_index(inplace=False)
        return np.array(out), out.index
    elif data_util._is_using_pandas(data, None):
        out = data
        out = out.reindex(index)  # copies?
        out = out.sort_index()
        return out, out.index
    else:
        msg = 'data must be a Numpy array or a Pandas Series/DataFrame'
        raise ValueError(msg)

def auto_arima(endog, freq=None, d=None, D=None, max_p=5, max_q=5, max_P=2,
               max_Q=2, max_order=5, max_d=2, max_D=1, start_p=2, start_q=2,
               start_P=1, start_Q=1, stationary=False, ic="aic",
               stepwise=True, trace=False, approximation=None, test="adf",
               seasonal_test="ch", allowdrift=True, allowmean=True,
               lambda_parameter=None, *args, **kwargs):
    # Parameter validity check
    if np.any(np.isnan(endog)):
        raise ValueError("Missing Values in Series")
    origin_endog = endog
    if _is_using_pandas(endog, None):
        endog = np.asarray(endog)
    if len(endog) <= 10:
        raise ValueError("There are too few observations.")
    if np.any(np.isnan(endog)):
        raise ValueError("NaN values in endogenous not allowed")
    if np.all(endog == endog[0]):
        raise ValueError("The endogenous variable is a constant")
    if (not isinstance(freq, int)) or freq <= 1:
        raise ValueError("The frequency parameter must be an integer"
                         " greater than 1")
    if lambda_parameter is not None:
        if lambda_parameter < 0:
            raise ValueError("The lambda parameter must be non-negative")
        if not np.all(endog > 0):
            raise ValueError("Box-Cox transformation can only be used on"
                             " positive series.")
        endog = boxcox(endog, lambda_parameter)

    # Cap the maximum orders at one third of the sample size
    max_p = min(max_p, floor(len(endog) / 3))
    max_q = min(max_q, floor(len(endog) / 3))
    max_P = min(max_P, floor(len(endog) / 3 / freq))
    max_Q = min(max_Q, floor(len(endog) / 3 / freq))

    if stationary:
        D = 0
        d = 0
    if freq == 1:

def __init__(self, fname, data, convert_dates=None, encoding="latin-1", byteorder=None): self._convert_dates = convert_dates # attach nobs, nvars, data, varlist, typlist if data_util._is_using_pandas(data, None): self._prepare_pandas(data) elif data_util._is_array_like(data, None): data = np.asarray(data) if data_util._is_structured_ndarray(data): self._prepare_structured_array(data) else: if convert_dates is not None: raise ValueError("Not able to convert dates in a plain" " ndarray.") self._prepare_ndarray(data) else: # pragma : no cover raise ValueError("Type %s for data not understood" % type(data)) if byteorder is None: byteorder = sys.byteorder self._byteorder = _set_endianness(byteorder) self._encoding = encoding self._file = get_file_obj(fname, 'wb', encoding)
def __init__(self, fname, data, convert_dates=None, encoding="latin-1", byteorder=None): self._convert_dates = convert_dates # attach nobs, nvars, data, varlist, typlist if data_util._is_using_pandas(data, None): self._prepare_pandas(data) elif data_util._is_array_like(data, None): data = np.asarray(data) if data_util._is_structured_ndarray(data): self._prepare_structured_array(data) else: if convert_dates is not None: raise ValueError("Not able to convert dates in a plain" " ndarray.") self._prepare_ndarray(data) else: # pragma : no cover raise ValueError("Type %s for data not understood" % type(data)) if byteorder is None: byteorder = sys.byteorder self._byteorder = _set_endianness(byteorder) self._encoding = encoding self._file = _open_file_binary_write(fname, encoding)
def add_constant(data, prepend=True, has_constant='skip'):
    """
    Add a column of ones to an array.

    Parameters
    ----------
    data : array_like
        A column-ordered design matrix.
    prepend : bool
        If true, the constant is in the first column. Else the constant is
        appended (last column).
    has_constant : str {'raise', 'add', 'skip'}
        Behavior if ``data`` already has a constant. The default will
        return data without adding another constant. If 'raise', will raise
        an error if any column has a constant value. Using 'add' will add a
        column of 1s if a constant column is present.

    Returns
    -------
    array_like
        The original values with a constant (column of ones) as the first
        or last column. Returned value type depends on input type.

    Notes
    -----
    When the input is a pandas Series or DataFrame, the added column's name
    is 'const'.
    """
    if _is_using_pandas(data, None):
        from statsmodels.tsa.tsatools import add_trend
        return add_trend(data, trend='c', prepend=prepend,
                         has_constant=has_constant)

    # Special case for NumPy
    x = np.asarray(data)
    ndim = x.ndim
    if ndim == 1:
        x = x[:, None]
    elif x.ndim > 2:
        raise ValueError('Only implemented for 2-dimensional arrays')

    is_nonzero_const = np.ptp(x, axis=0) == 0
    is_nonzero_const &= np.all(x != 0.0, axis=0)
    if is_nonzero_const.any():
        if has_constant == 'skip':
            return x
        elif has_constant == 'raise':
            if ndim == 1:
                raise ValueError("data is constant.")
            else:
                columns = np.arange(x.shape[1])
                cols = ",".join([str(c) for c in columns[is_nonzero_const]])
                raise ValueError(f"Column(s) {cols} are constant.")

    x = [np.ones(x.shape[0]), x]
    x = x if prepend else x[::-1]
    return np.column_stack(x)

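# Usage sketch for `add_constant` above (statsmodels.tools.tools exposes a
# function with this signature): a column of ones is prepended by default,
# and an existing constant column is skipped unless has_constant says
# otherwise.
X = np.arange(6.0).reshape(3, 2)
add_constant(X)                 # ones in the first column
add_constant(X, prepend=False)  # ones in the last column
Xc = np.column_stack([np.ones(3), X])
add_constant(Xc)                # unchanged: a constant is already present
# add_constant(Xc, has_constant='raise') would raise ValueError
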
def add_constant(data, prepend=True, has_constant='skip'):
    '''
    This appends a column of ones to an array if prepend==False.

    Parameters
    ----------
    data : array-like
        `data` is the column-ordered design matrix.
    prepend : bool
        True and the constant is prepended rather than appended.
    has_constant : str {'raise', 'add', 'skip'}
        Behavior if ``data`` already has a constant. The default will
        return data without adding another constant. If 'raise', will raise
        an error if a constant is present. Using 'add' will duplicate the
        constant, if one is present. Has no effect for structured or
        recarrays. There is no checking for a constant in this case.

    Returns
    -------
    data : array
        The original array with a constant (column of ones) as the first or
        last column.
    '''
    if _is_using_pandas(data, None):
        # work on a copy
        return _pandas_add_constant(data.copy(), prepend, has_constant)
    else:
        data = np.asarray(data)
    if not data.dtype.names:
        var0 = data.var(0) == 0
        if np.any(var0):
            if has_constant == 'raise':
                raise ValueError("data already contains a constant.")
            elif has_constant == 'skip':
                return data
            elif has_constant == 'add':
                pass
            else:
                raise ValueError("Option {0} not understood for "
                                 "has_constant.".format(has_constant))
        data = np.column_stack((data, np.ones((data.shape[0], 1))))
        if prepend:
            return np.roll(data, 1, 1)
    else:
        return_rec = data.__class__ is np.recarray
        if prepend:
            ones = np.ones((data.shape[0], 1), dtype=[('const', float)])
            data = nprf.append_fields(ones, data.dtype.names,
                                      [data[i] for i in data.dtype.names],
                                      usemask=False, asrecarray=return_rec)
        else:
            data = nprf.append_fields(data, 'const',
                                      np.ones(data.shape[0]),
                                      usemask=False, asrecarray=return_rec)
    return data

def _maybe_get_pandas_wrapper_freq(X, trim=None):
    if _is_using_pandas(X, None):
        index = X.index
        func = _get_pandas_wrapper(X, trim)
        freq = index.inferred_freq
        return func, freq
    else:
        return lambda x: x, None

def add_constant(data, prepend=True, has_constant='skip'):
    """
    Add a column of ones to an array.

    Parameters
    ----------
    data : array_like
        A column-ordered design matrix.
    prepend : bool
        If true, the constant is in the first column. Else the constant is
        appended (last column).
    has_constant : str {'raise', 'add', 'skip'}
        Behavior if ``data`` already has a constant. The default will
        return data without adding another constant. If 'raise', will raise
        an error if a constant is present. Using 'add' will duplicate the
        constant, if one is present.

    Returns
    -------
    array_like
        The original values with a constant (column of ones) as the first
        or last column. Returned value type depends on input type.

    Notes
    -----
    When the input is recarray or a pandas Series or DataFrame, the added
    column's name is 'const'.
    """
    if _is_using_pandas(data, None) or _is_recarray(data):
        if _is_recarray(data):
            # deprecated: remove recarray support after 0.12
            import warnings
            from statsmodels.tools.sm_exceptions import recarray_warning
            warnings.warn(recarray_warning, FutureWarning)
        from statsmodels.tsa.tsatools import add_trend
        return add_trend(data, trend='c', prepend=prepend,
                         has_constant=has_constant)

    # Special case for NumPy
    x = np.asanyarray(data)
    if x.ndim == 1:
        x = x[:, None]
    elif x.ndim > 2:
        raise ValueError('Only implemented for 2-dimensional arrays')

    is_nonzero_const = np.ptp(x, axis=0) == 0
    is_nonzero_const &= np.all(x != 0.0, axis=0)
    if is_nonzero_const.any():
        if has_constant == 'skip':
            return x
        elif has_constant == 'raise':
            raise ValueError("data already contains a constant")

    x = [np.ones(x.shape[0]), x]
    x = x if prepend else x[::-1]
    return np.column_stack(x)

def new_func(X, *args, **kwargs):
    # quick pass-through for do-nothing case
    if not _is_using_pandas(X, None):
        return func(X, *args, **kwargs)

    wrapper_func = _get_pandas_wrapper(X, trim_head, trim_tail, names)
    ret = func(X, *args, **kwargs)
    ret = wrapper_func(ret)
    return ret

def diff(series, k_diff=1, k_seasonal_diff=None, k_seasons=1):
    r"""
    Difference a series simply and/or seasonally along the zero-th axis.

    Given a series (denoted :math:`y_t`), performs the differencing
    operation

    .. math::

        \Delta^d \Delta_s^D y_t

    where :math:`d =` `k_diff`, :math:`s =` `k_seasons`,
    :math:`D =` `k_seasonal\_diff`, and :math:`\Delta` is the difference
    operator.

    Parameters
    ----------
    series : array_like
        The series to be differenced.
    k_diff : int, optional
        The number of simple differences to perform. Default is 1.
    k_seasonal_diff : int or None, optional
        The number of seasonal differences to perform. Default is no
        seasonal differencing.
    k_seasons : int, optional
        The seasonal lag. Default is 1. Unused if there is no seasonal
        differencing.

    Returns
    -------
    differenced : array
        The differenced array.
    """
    pandas = _is_using_pandas(series, None)
    differenced = np.asanyarray(series) if not pandas else series

    # Seasonal differencing
    if k_seasonal_diff is not None:
        while k_seasonal_diff > 0:
            if not pandas:
                differenced = (differenced[k_seasons:] -
                               differenced[:-k_seasons])
            else:
                differenced = differenced.diff(k_seasons)[k_seasons:]
            k_seasonal_diff -= 1

    # Simple differencing
    if not pandas:
        differenced = np.diff(differenced, k_diff, axis=0)
    else:
        while k_diff > 0:
            differenced = differenced.diff()[1:]
            k_diff -= 1

    return differenced

def _maybe_get_pandas_wrapper(X, trim_head=None, trim_tail=None):
    """
    If using pandas returns a function to wrap the results, e.g., wrapper(X)
    trim is an integer for the symmetric truncation of the series in some
    filters. Otherwise returns an identity function that leaves the input
    unchanged.
    """
    if _is_using_pandas(X, None):
        return _get_pandas_wrapper(X, trim_head, trim_tail)
    else:
        return lambda x: x

def diff(series, k_diff=1, k_seasonal_diff=None, seasonal_periods=1):
    r"""
    Difference a series simply and/or seasonally along the zero-th axis.

    Given a series (denoted :math:`y_t`), performs the differencing
    operation

    .. math::

        \Delta^d \Delta_s^D y_t

    where :math:`d =` `k_diff`, :math:`s =` `seasonal_periods`,
    :math:`D =` `k_seasonal\_diff`, and :math:`\Delta` is the difference
    operator.

    Parameters
    ----------
    series : array_like
        The series to be differenced.
    k_diff : int, optional
        The number of simple differences to perform. Default is 1.
    k_seasonal_diff : int or None, optional
        The number of seasonal differences to perform. Default is no
        seasonal differencing.
    seasonal_periods : int, optional
        The seasonal lag. Default is 1. Unused if there is no seasonal
        differencing.

    Returns
    -------
    differenced : array
        The differenced array.
    """
    pandas = _is_using_pandas(series, None)
    differenced = np.asanyarray(series) if not pandas else series

    # Seasonal differencing
    if k_seasonal_diff is not None:
        while k_seasonal_diff > 0:
            if not pandas:
                differenced = (differenced[seasonal_periods:] -
                               differenced[:-seasonal_periods])
            else:
                differenced = (
                    differenced.diff(seasonal_periods)[seasonal_periods:])
            k_seasonal_diff -= 1

    # Simple differencing
    if not pandas:
        differenced = np.diff(differenced, k_diff, axis=0)
    else:
        while k_diff > 0:
            differenced = differenced.diff()[1:]
            k_diff -= 1

    return differenced

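# Worked example for `diff` above: one simple difference plus one seasonal
# difference at lag 4 drops 1 + 4 observations from the front; pandas input
# stays pandas and keeps the trimmed index.
y = np.arange(12.0) ** 2
dy = diff(y, k_diff=1, k_seasonal_diff=1, seasonal_periods=4)
assert dy.shape[0] == 12 - 1 - 4
ys = pd.Series(y)
dys = diff(ys, k_diff=1, k_seasonal_diff=1, seasonal_periods=4)
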
def new_func(X, *args, **kwargs):
    # quick pass-through for do-nothing case
    if not _is_using_pandas(X, None):
        return func(X, *args, **kwargs)

    wrapper_func = _get_pandas_wrapper(X, trim_head, trim_tail, columns)
    index = X.index
    freq = index.inferred_freq
    kwargs.update({freq_kw: freq_to_period(freq)})
    ret = func(X, *args, **kwargs)
    ret = wrapper_func(ret)
    return ret

def add_constant(data, prepend=True):
    '''
    This appends a column of ones to an array if prepend==False.

    For ndarrays and pandas.DataFrames, checks to make sure a constant is
    not already included. If there is at least one column of ones then the
    original object is returned.  Does not check for a constant if a
    structured or recarray is given.

    Parameters
    ----------
    data : array-like
        `data` is the column-ordered design matrix.
    prepend : bool
        True and the constant is prepended rather than appended.

    Returns
    -------
    data : array
        The original array with a constant (column of ones) as the first
        or last column.
    '''
    if _is_using_pandas(data, None):
        # work on a copy
        return _pandas_add_constant(data.copy(), prepend)
    else:
        data = np.asarray(data)
    if not data.dtype.names:
        var0 = data.var(0) == 0
        if np.any(var0):
            return data
        data = np.column_stack((data, np.ones((data.shape[0], 1))))
        if prepend:
            return np.roll(data, 1, 1)
    else:
        return_rec = data.__class__ is np.recarray
        if prepend:
            ones = np.ones((data.shape[0], 1), dtype=[('const', float)])
            data = nprf.append_fields(ones, data.dtype.names,
                                      [data[i] for i in data.dtype.names],
                                      usemask=False, asrecarray=return_rec)
        else:
            data = nprf.append_fields(data, 'const',
                                      np.ones(data.shape[0]),
                                      usemask=False, asrecarray=return_rec)
    return data

def handle_formula_data(Y, X, formula, depth=0):
    """
    Returns endog, exog, and the model specification from arrays and
    formula.

    Parameters
    ----------
    Y : array-like
        Either endog (the LHS) of a model specification or all of the data.
        Y must define __getitem__ for now.
    X : array-like
        Either exog or None. If all the data for the formula is provided in
        Y then you must explicitly set X to None.
    formula : str or patsy.model_desc
        You can pass a handler by importing formula_handler and adding a
        key-value pair where the key is the formula object class and the
        value is a function that returns endog, exog, formula object.

    Returns
    -------
    endog : array-like
        Should preserve the input type of Y,X.
    exog : array-like
        Should preserve the input type of Y,X. Could be None.
    """
    # half-hearted attempt to handle other formula objects
    if isinstance(formula, tuple(iterkeys(formula_handler))):
        return formula_handler[type(formula)]

    # NOTE: `depth` is accepted but a fixed eval depth of 2 is passed to
    # dmatrices; the pandas and non-pandas branches were identical in the
    # original, so the _is_using_pandas checks are collapsed here.
    if X is not None:
        return dmatrices(formula, (Y, X), 2, return_type='dataframe')
    else:
        return dmatrices(formula, Y, 2, return_type='dataframe')

def __init__(self, endog, exog, **kwargs):
    # Standardize data
    if not _is_using_pandas(endog, None):
        endog = np.asanyarray(endog)

    exog_is_using_pandas = _is_using_pandas(exog, None)
    if not exog_is_using_pandas:
        exog = np.asarray(exog)

    # Make sure we have 2-dimensional array
    if exog.ndim == 1:
        if not exog_is_using_pandas:
            exog = exog[:, None]
        else:
            exog = pd.DataFrame(exog)

    self.k_exog = exog.shape[1]

    # Handle coefficient initialization
    # By default, do not calculate likelihood while it is controlled by
    # diffuse initial conditions.
    kwargs.setdefault('loglikelihood_burn', self.k_exog)
    kwargs.setdefault('initialization', 'approximate_diffuse')
    kwargs.setdefault('initial_variance', 1e9)

    # Initialize the state space representation
    super(RecursiveLS, self).__init__(endog, k_states=self.k_exog,
                                      exog=exog, **kwargs)

    # Setup the state space representation
    self['design'] = self.exog[:, :, None].T
    self['transition'] = np.eye(self.k_states)

    # Notice that the filter output does not depend on the measurement
    # variance, so we set it here to 1
    self['obs_cov', 0, 0] = 1.

def add_constant(data, prepend=True, has_constant='skip'):
    """
    Adds a column of ones to an array.

    Parameters
    ----------
    data : array-like
        `data` is the column-ordered design matrix.
    prepend : bool
        If true, the constant is in the first column. Else the constant is
        appended (last column).
    has_constant : str {'raise', 'add', 'skip'}
        Behavior if ``data`` already has a constant. The default will
        return data without adding another constant. If 'raise', will raise
        an error if a constant is present. Using 'add' will duplicate the
        constant, if one is present.

    Returns
    -------
    data : array, recarray or DataFrame
        The original values with a constant (column of ones) as the first
        or last column. Returned value depends on input type.

    Notes
    -----
    When the input is recarray or a pandas Series or DataFrame, the added
    column's name is 'const'.
    """
    if _is_using_pandas(data, None) or _is_recarray(data):
        from statsmodels.tsa.tsatools import add_trend
        return add_trend(data, trend='c', prepend=prepend,
                         has_constant=has_constant)

    # Special case for NumPy
    x = np.asanyarray(data)
    if x.ndim == 1:
        x = x[:, None]
    elif x.ndim > 2:
        raise ValueError('Only implemented for 2-dimensional arrays')

    is_nonzero_const = np.ptp(x, axis=0) == 0
    is_nonzero_const &= np.all(x != 0.0, axis=0)
    if is_nonzero_const.any():
        if has_constant == 'skip':
            return x
        elif has_constant == 'raise':
            raise ValueError("data already contains a constant")

    x = [np.ones(x.shape[0]), x]
    x = x if prepend else x[::-1]
    return np.column_stack(x)

def handle_data_class_factory(endog, exog):
    """
    Given the inputs, returns the appropriate ModelData subclass.
    """
    if data_util._is_using_ndarray_type(endog, exog):
        klass = ModelData
    elif data_util._is_using_pandas(endog, exog):
        klass = PandasData
    elif data_util._is_using_patsy(endog, exog):
        klass = PatsyData
    # keep this check last
    elif data_util._is_using_ndarray(endog, exog):
        klass = ModelData
    else:
        raise ValueError("unrecognized data structures: %s / %s" %
                         (type(endog), type(exog)))
    return klass

def _check_period_index(x, freq="M"): try: from pandas import PeriodIndex, DatetimeIndex except ImportError: # not sure min. version PeriodIndex = DatetimeIndex # HACK from statsmodels.tools.data import _is_using_pandas if not _is_using_pandas(x, None): raise ValueError("x must be a pandas object") if not isinstance(x.index, (DatetimeIndex, PeriodIndex)): raise ValueError("The index must be a DatetimeIndex or PeriodIndex") from statsmodels.tsa.base.datetools import _infer_freq inferred_freq = _infer_freq(x.index) if not inferred_freq.startswith(freq): raise ValueError("Expected frequency {}. Got {}".format( inferred_freq, freq))
def _check_period_index(x, freq="M"): try: from pandas import PeriodIndex, DatetimeIndex except ImportError: # not sure min. version PeriodIndex = DatetimeIndex # HACK from statsmodels.tools.data import _is_using_pandas if not _is_using_pandas(x, None): raise ValueError("x must be a pandas object") if not isinstance(x.index, (DatetimeIndex, PeriodIndex)): raise ValueError("The index must be a DatetimeIndex or PeriodIndex") from statsmodels.tsa.base.datetools import _infer_freq inferred_freq = _infer_freq(x.index) if not inferred_freq.startswith(freq): raise ValueError("Expected frequency {}. Got {}".format(inferred_freq, freq))
def _maybe_get_pandas_wrapper(X, trim=None):
    """
    If using pandas returns a function to wrap the results, e.g., wrapper(X)
    trim is an integer for the symmetric truncation of the series in some
    filters. Otherwise returns None.
    """
    if _is_using_pandas(X, None):
        index = X.index
        if trim is not None:
            index = X.index[trim:-trim]
        if hasattr(X, "columns"):
            return lambda x: X.__class__(x, index=index, columns=X.columns)
        else:
            return lambda x: X.__class__(x, index=index, name=X.name)
    else:
        return

def _transform_predict_exog(model, exog, design_info=None):
    """Transform exog for predict using design_info.

    Note: this is copied from base.model.Results.predict and converted to
    a standalone function with additional options.
    """
    is_pandas = _is_using_pandas(exog, None)
    exog_index = exog.index if is_pandas else None

    if design_info is None:
        design_info = getattr(model.data, 'design_info', None)

    if design_info is not None and (exog is not None):
        from patsy import dmatrix
        if isinstance(exog, pd.Series):
            # we are guessing whether it should be column or row
            if (hasattr(exog, 'name') and isinstance(exog.name, str) and
                    exog.name in design_info.describe()):
                # assume we need one column
                exog = pd.DataFrame(exog)
            else:
                # assume we need a row
                exog = pd.DataFrame(exog).T

        orig_exog_len = len(exog)
        is_dict = isinstance(exog, dict)
        exog = dmatrix(design_info, exog, return_type="dataframe")
        if orig_exog_len > len(exog) and not is_dict:
            import warnings
            if exog_index is None:
                warnings.warn('nan values have been dropped', ValueWarning)
            else:
                exog = exog.reindex(exog_index)
        exog_index = exog.index

    if exog is not None:
        exog = np.asarray(exog)
        if exog.ndim == 1 and (model.exog.ndim == 1 or
                               model.exog.shape[1] == 1):
            exog = exog[:, None]
        exog = np.atleast_2d(exog)  # needed in count model shape[1]

    return exog, exog_index

def _ensure_2d(x, ndarray=False):
    """
    Parameters
    ----------
    x : array, Series, DataFrame or None
        Input to verify dimensions, and to transform as necessary
    ndarray : bool
        Flag indicating whether to always return a NumPy array. Setting
        False will return a pandas DataFrame when the input is a Series or
        a DataFrame.

    Returns
    -------
    out : array, DataFrame or None
        array or DataFrame with 2 dimensions.  One-dimensional arrays are
        returned as nobs by 1. None is returned if x is None.
    names : list of str or None
        list containing variable names when the input is a pandas datatype.
        Returns None if the input is an ndarray.

    Notes
    -----
    Accepts None for simplicity
    """
    if x is None:
        return x, None  # keep the (out, names) return shape for callers
    is_pandas = _is_using_pandas(x, None)
    if x.ndim == 2:
        if is_pandas:
            return x, x.columns
        else:
            return x, None
    elif x.ndim > 2:
        raise ValueError('x must be 1 or 2-dimensional.')

    name = x.name if is_pandas else None
    if ndarray:
        return np.asarray(x)[:, None], name
    else:
        return pd.DataFrame(x), name

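# Sketch of the `_ensure_2d` contract above (a private helper, so the exact
# location and behavior may vary between versions): every branch returns an
# (out, names) pair, with one-dimensional input promoted to nobs-by-1.
out, names = _ensure_2d(pd.Series([1.0, 2.0], name='y'))
# out is a (2, 1) DataFrame, names == 'y'
out, names = _ensure_2d(np.arange(4.0), ndarray=True)
# out is a (4, 1) ndarray, names is None
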
def handle_data(endog, exog, missing="none", hasconst=None, **kwargs): """ Given inputs """ # deal with lists and tuples up-front if isinstance(endog, (list, tuple)): endog = np.asarray(endog) if isinstance(exog, (list, tuple)): exog = np.asarray(exog) if data_util._is_using_ndarray_type(endog, exog): klass = ModelData elif data_util._is_using_pandas(endog, exog): klass = PandasData elif data_util._is_using_patsy(endog, exog): klass = PatsyData # keep this check last elif data_util._is_using_ndarray(endog, exog): klass = ModelData else: raise ValueError("unrecognized data structures: %s / %s" % (type(endog), type(exog))) return klass(endog, exog=exog, missing=missing, hasconst=hasconst, **kwargs)
def handle_data(endog, exog):
    """
    Given the inputs, returns the appropriate ModelData subclass instance.
    """
    # deal with lists and tuples up-front
    if isinstance(endog, (list, tuple)):
        endog = np.asarray(endog)
    if isinstance(exog, (list, tuple)):
        exog = np.asarray(exog)

    if data_util._is_using_pandas(endog, exog):
        klass = PandasData
    elif data_util._is_using_larry(endog, exog):
        klass = LarryData
    elif data_util._is_using_timeseries(endog, exog):
        klass = TimeSeriesData
    # keep this check last
    elif data_util._is_using_ndarray(endog, exog):
        klass = ModelData
    else:
        raise ValueError('unrecognized data structures: %s / %s' %
                         (type(endog), type(exog)))

    return klass(endog, exog=exog)

def __init__(self, endog, exog, constraints=None, **kwargs):
    # Standardize data
    endog_using_pandas = _is_using_pandas(endog, None)
    if not endog_using_pandas:
        endog = np.asanyarray(endog)

    exog_is_using_pandas = _is_using_pandas(exog, None)
    if not exog_is_using_pandas:
        exog = np.asarray(exog)

    # Make sure we have 2-dimensional array
    if exog.ndim == 1:
        if not exog_is_using_pandas:
            exog = exog[:, None]
        else:
            exog = pd.DataFrame(exog)

    self.k_exog = exog.shape[1]

    # Handle constraints
    self.k_constraints = 0
    self._r_matrix = self._q_matrix = None
    if constraints is not None:
        from patsy import DesignInfo
        from statsmodels.base.data import handle_data
        data = handle_data(endog, exog, **kwargs)
        names = data.param_names
        LC = DesignInfo(names).linear_constraint(constraints)
        self._r_matrix, self._q_matrix = LC.coefs, LC.constants
        self.k_constraints = self._r_matrix.shape[0]

        constraint_endog = np.zeros((len(endog), len(self._r_matrix)))
        if endog_using_pandas:
            constraint_endog = pd.DataFrame(constraint_endog,
                                            index=endog.index)
            endog = concat([endog, constraint_endog], axis=1)
            endog.values[:, 1:] = self._q_matrix[:, 0]
        else:
            # the ndarray case must extend endog in the same way; the
            # concatenation step was missing in the flattened original
            endog = np.c_[endog, constraint_endog]
            endog[:, 1:] = self._q_matrix[:, 0]

    # Handle coefficient initialization
    kwargs.setdefault('initialization', 'diffuse')

    # Initialize the state space representation
    super(RecursiveLS, self).__init__(
        endog, k_states=self.k_exog, exog=exog, **kwargs)

    # Use univariate filtering by default
    self.ssm.filter_univariate = True

    # Concentrate the scale out of the likelihood function
    self.ssm.filter_concentrated = True

    # Setup the state space representation
    self['design'] = np.zeros((self.k_endog, self.k_states, self.nobs))
    self['design', 0] = self.exog[:, :, None].T
    if self._r_matrix is not None:
        self['design', 1:, :] = self._r_matrix[:, :, None]
    self['transition'] = np.eye(self.k_states)

    # Notice that the filter output does not depend on the measurement
    # variance, so we set it here to 1
    self['obs_cov', 0, 0] = 1.

    # Linear constraints are technically imposed by adding "fake" endog
    # variables that are used during filtering, but for all model- and
    # results-based purposes we want k_endog = 1.
    if self._r_matrix is not None:
        self.k_endog = 1

def __init__(self, endog, exog=None, order=(0, 0, 0),
             seasonal_order=(0, 0, 0, 0), trend=None,
             enforce_stationarity=True, enforce_invertibility=True,
             concentrate_scale=False, trend_offset=1, dates=None,
             freq=None, missing='none', validate_specification=True):
    # Default for trend
    # 'c' if there is no integration and 'n' otherwise
    # TODO: if trend='c', then we could alternatively use `demean=True` in
    # the estimation methods rather than setting up `exog` and using GLS.
    # Not sure if it's worth the trouble though.
    integrated = order[1] > 0 or seasonal_order[1] > 0
    if trend is None and not integrated:
        trend = 'c'
    elif trend is None:
        trend = 'n'

    # Construct the specification
    # (don't pass specific values of enforce stationarity/invertibility,
    # because we don't actually want to restrict the estimators based on
    # this criteria. Instead, we'll just make sure that the parameter
    # estimates from those methods satisfy the criteria.)
    self._spec_arima = SARIMAXSpecification(
        endog, exog=exog, order=order, seasonal_order=seasonal_order,
        trend=trend, enforce_stationarity=None, enforce_invertibility=None,
        concentrate_scale=concentrate_scale, trend_offset=trend_offset,
        dates=dates, freq=freq, missing=missing,
        validate_specification=validate_specification)
    exog = self._spec_arima._model.data.orig_exog

    # Raise an error if we have a constant in an integrated model
    has_trend = len(self._spec_arima.trend_terms) > 0
    if has_trend:
        lowest_trend = np.min(self._spec_arima.trend_terms)
        if lowest_trend < order[1] + seasonal_order[1]:
            raise ValueError(
                'In models with integration (`d > 0`) or seasonal'
                ' integration (`D > 0`), trend terms of lower order than'
                ' `d + D` cannot be included (as they would be eliminated'
                ' due to the differencing operation). For example, a'
                ' constant cannot be included in an ARIMA(1, 1, 1) model,'
                ' but including a linear trend, which would have the same'
                ' effect as fitting a constant to the differenced data,'
                ' is allowed.')

    # Keep the given `exog` by removing the prepended trend variables
    input_exog = None
    if exog is not None:
        if _is_using_pandas(exog, None):
            input_exog = exog.iloc[:, self._spec_arima.k_trend:]
        else:
            input_exog = exog[:, self._spec_arima.k_trend:]

    # Initialize the base SARIMAX class
    # Note: we don't pass in a trend value to the base class, since ARIMA
    # standardizes the trend to always be part of exog, while the base
    # SARIMAX class puts it in the transition equation.
    super(ARIMA, self).__init__(
        endog, exog, trend=None, order=order,
        seasonal_order=seasonal_order,
        enforce_stationarity=enforce_stationarity,
        enforce_invertibility=enforce_invertibility,
        concentrate_scale=concentrate_scale, dates=dates, freq=freq,
        missing=missing, validate_specification=validate_specification)
    self.trend = trend

    # Save the input exog and input exog names, so that we can refer to
    # them later (see especially `ARIMAResults.append`)
    self._input_exog = input_exog
    if exog is not None:
        self._input_exog_names = self.exog_names[self._spec_arima.k_trend:]
    else:
        self._input_exog_names = None

    # Override the public attributes for k_exog and k_trend to reflect the
    # distinction here (for the purpose of the superclass, these are both
    # combined as `k_exog`)
    self.k_exog = self._spec_arima.k_exog
    self.k_trend = self._spec_arima.k_trend

    # Remove some init kwargs that aren't used in this model
    unused = ['measurement_error', 'time_varying_regression',
              'mle_regression', 'simple_differencing',
              'hamilton_representation']
    self._init_keys = [key for key in self._init_keys
                       if key not in unused]

def lagmat2ds(x, maxlag0, maxlagex=None, dropex=0, trim='forward',
              use_pandas=False):
    """
    Generate lagmatrix for 2d array, columns arranged by variables.

    Parameters
    ----------
    x : array_like, 2d
        2d data, observations in rows and variables in columns.
    maxlag0 : int
        For the first variable, all lags from zero to maxlag are included.
    maxlagex : None or int
        Max lag for all other variables; all lags from zero to maxlag are
        included.
    dropex : int (default is 0)
        Exclude first dropex lags from other variables. For all variables,
        except the first, lags from dropex to maxlagex are included.
    trim : str
        * 'forward' : trim invalid observations in front.
        * 'backward' : trim invalid initial observations.
        * 'both' : trim invalid observations on both sides.
        * 'none' : no trimming of observations.
    use_pandas : bool, optional
        If true, returns a DataFrame when the input is a pandas Series or
        DataFrame. If false, return numpy ndarrays.

    Returns
    -------
    lagmat : 2d array
        Array with lagged observations, columns ordered by variable.

    Notes
    -----
    Inefficient implementation for unequal lags, implemented for
    convenience.
    """
    if maxlagex is None:
        maxlagex = maxlag0
    maxlag = max(maxlag0, maxlagex)
    is_pandas = _is_using_pandas(x, None)

    if x.ndim == 1:
        if is_pandas:
            x = pd.DataFrame(x)
        else:
            x = x[:, None]
    elif x.ndim == 0 or x.ndim > 2:
        raise ValueError('Only supports 1 and 2-dimensional data.')

    nobs, nvar = x.shape

    if is_pandas and use_pandas:
        lags = lagmat(x.iloc[:, 0], maxlag, trim=trim,
                      original='in', use_pandas=True)
        lagsli = [lags.iloc[:, :maxlag0 + 1]]
        for k in range(1, nvar):
            lags = lagmat(x.iloc[:, k], maxlag, trim=trim,
                          original='in', use_pandas=True)
            lagsli.append(lags.iloc[:, dropex:maxlagex + 1])
        return pd.concat(lagsli, axis=1)
    elif is_pandas:
        x = np.asanyarray(x)

    # reached for ndarray input, and for pandas input with
    # use_pandas=False after the conversion above
    lagsli = [lagmat(x[:, 0], maxlag, trim=trim,
                     original='in')[:, :maxlag0 + 1]]
    for k in range(1, nvar):
        lagsli.append(lagmat(x[:, k], maxlag, trim=trim,
                             original='in')[:, dropex:maxlagex + 1])
    return np.column_stack(lagsli)

def add_trend(x, trend="c", prepend=False, has_constant='skip'): """ Adds a trend and/or constant to an array. Parameters ---------- X : array-like Original array of data. trend : str {"c","t","ct","ctt"} "c" add constant only "t" add trend only "ct" add constant and linear trend "ctt" add constant and linear and quadratic trend. prepend : bool If True, prepends the new data to the columns of X. has_constant : str {'raise', 'add', 'skip'} Controls what happens when trend is 'c' and a constant already exists in X. 'raise' will raise an error. 'add' will duplicate a constant. 'skip' will return the data without change. 'skip' is the default. Returns ------- y : array, recarray or DataFrame The original data with the additional trend columns. If x is a recarray or pandas Series or DataFrame, then the trend column names are 'const', 'trend' and 'trend_squared'. Notes ----- Returns columns as ["ctt","ct","c"] whenever applicable. There is currently no checking for an existing trend. See also -------- statsmodels.tools.tools.add_constant """ # TODO: could be generalized for trend of aribitrary order trend = trend.lower() columns = ['const', 'trend', 'trend_squared'] if trend == "c": # handles structured arrays columns = columns[:1] trendorder = 0 elif trend == "ct" or trend == "t": columns = columns[:2] if trend == "t": columns = columns[1:2] trendorder = 1 elif trend == "ctt": trendorder = 2 else: raise ValueError("trend %s not understood" % trend) is_recarray = _is_recarray(x) is_pandas = _is_using_pandas(x, None) or is_recarray if is_pandas or is_recarray: if is_recarray: descr = x.dtype.descr x = pd.DataFrame.from_records(x) elif isinstance(x, pd.Series): x = pd.DataFrame(x) else: x = x.copy() else: x = np.asanyarray(x) nobs = len(x) trendarr = np.vander(np.arange(1, nobs + 1, dtype=np.float64), trendorder + 1) # put in order ctt trendarr = np.fliplr(trendarr) if trend == "t": trendarr = trendarr[:, 1] if "c" in trend: if is_pandas or is_recarray: # Mixed type protection def safe_is_const(s): try: return np.ptp(s) == 0.0 and np.any(s != 0.0) except: return False col_const = x.apply(safe_is_const, 0) else: col_const = np.logical_and(np.any(np.ptp(np.asanyarray(x), axis=0) == 0, axis=0), np.all(x != 0.0, axis=0)) if np.any(col_const): if has_constant == 'raise': raise ValueError("x already contains a constant") elif has_constant == 'skip': columns = columns[1:] trendarr = trendarr[:, 1:] order = 1 if prepend else -1 if is_recarray or is_pandas: trendarr = pd.DataFrame(trendarr, index=x.index, columns=columns) x = [trendarr, x] x = pd.concat(x[::order], 1) else: x = [trendarr, x] x = np.column_stack(x[::order]) if is_recarray: x = x.to_records(index=False, convert_datetime64=False) new_descr = x.dtype.descr extra_col = len(new_descr) - len(descr) descr = new_descr[:extra_col] + descr if prepend else descr + new_descr[-extra_col:] x = x.astype(np.dtype(descr)) return x
def add_constant(data, prepend=False):
    '''
    This appends a column of ones to an array if prepend==False.

    For ndarrays and pandas.DataFrames, checks to make sure a constant is
    not already included. If there is at least one column of ones then the
    original object is returned.  Does not check for a constant if a
    structured or recarray is given.

    Parameters
    ----------
    data : array-like
        `data` is the column-ordered design matrix.
    prepend : bool
        True and the constant is prepended rather than appended.

    Returns
    -------
    data : array
        The original array with a constant (column of ones) as the first
        or last column.

    Notes
    -----
    .. WARNING::
       The default of prepend will be changed to True in the next release
       of statsmodels. We recommend to use an explicit prepend in any
       permanent code.
    '''
    if not prepend:
        import inspect
        frame = inspect.currentframe().f_back
        info = inspect.getframeinfo(frame)
        try:  # info.code_context is None on python 2.6? Why?
            to_warn = (info.code_context is not None and
                       'prepend' not in '\n'.join(info.code_context))
        except:  # python 2.5 compatibility
            to_warn = 'prepend' not in '\n'.join(info[3])
        if to_warn:
            import warnings
            warnings.warn("The default of `prepend` will be changed to "
                          "True in 0.5.0, use explicit prepend",
                          FutureWarning)

    if _is_using_pandas(data, None):
        # work on a copy
        return _pandas_add_constant(data.copy(), prepend)
    else:
        data = np.asarray(data)
    if not data.dtype.names:
        var0 = data.var(0) == 0
        if np.any(var0):
            return data
        data = np.column_stack((data, np.ones((data.shape[0], 1))))
        if prepend:
            return np.roll(data, 1, 1)
    else:
        return_rec = data.__class__ is np.recarray
        if prepend:
            ones = np.ones((data.shape[0], 1), dtype=[('const', float)])
            data = nprf.append_fields(ones, data.dtype.names,
                                      [data[i] for i in data.dtype.names],
                                      usemask=False, asrecarray=return_rec)
        else:
            data = nprf.append_fields(data, 'const',
                                      np.ones(data.shape[0]),
                                      usemask=False, asrecarray=return_rec)
    return data

def __init__(self, endog, exog=None, smoother=None, alpha=0, family=None,
             offset=None, exposure=None, missing='none', **kwargs):
    # TODO: check usage of hasconst
    hasconst = kwargs.get('hasconst', None)
    xnames_linear = None
    if hasattr(exog, 'design_info'):
        self.design_info_linear = exog.design_info
        xnames_linear = self.design_info_linear.column_names

    is_pandas = _is_using_pandas(exog, None)

    # TODO: handle data is experimental, see #5469
    # This is a bit wasteful because we need to call `handle_data` twice
    self.data_linear = self._handle_data(endog, exog, missing, hasconst)
    if xnames_linear is None:
        xnames_linear = self.data_linear.xnames

    if exog is not None:
        exog_linear = np.asarray(exog)
        k_exog_linear = exog_linear.shape[1]
    else:
        exog_linear = None
        k_exog_linear = 0

    self.k_exog_linear = k_exog_linear
    # We need exog_linear for k-fold cross validation
    # TODO: alternative is to take columns from combined exog
    self.exog_linear = exog_linear

    self.smoother = smoother
    self.k_smooths = smoother.k_variables
    self.alpha = self._check_alpha(alpha)
    penal = MultivariateGamPenalty(smoother, alpha=self.alpha,
                                   start_idx=k_exog_linear)
    kwargs.pop('penal', None)
    if exog_linear is not None:
        exog = np.column_stack((exog_linear, smoother.basis))
    else:
        exog = smoother.basis

    # TODO: check: xnames_linear will be None instead of empty list
    # if no exog_linear
    # can smoother be empty? I guess not allowed.
    if xnames_linear is None:
        xnames_linear = []

    xnames = xnames_linear + self.smoother.col_names

    if is_pandas and exog_linear is not None:
        # we have a DataFrame, so we can get a PandasData instance for
        # wrapping
        exog = pd.DataFrame(exog, index=self.data_linear.row_labels,
                            columns=xnames)

    super(GLMGam, self).__init__(endog, exog=exog, family=family,
                                 offset=offset, exposure=exposure,
                                 penal=penal, missing=missing, **kwargs)

    if not is_pandas:
        # set exog names if not given by pandas DataFrame
        self.exog_names[:] = xnames

    # TODO: the generic data handling might attach the design_info from
    # the linear part, but this is incorrect for the full model and
    # causes problems in wald_test_terms
    if hasattr(self.data, 'design_info'):
        del self.data.design_info
    # formula also might be attached, which causes problems in predict
    if hasattr(self, 'formula'):
        self.formula_linear = self.formula
        self.formula = None
        del self.formula

def lagmat(
    x,
    maxlag: int,
    trim: Literal["forward", "backward", "both", "none"] = "forward",
    original: Literal["ex", "sep", "in"] = "ex",
    use_pandas: bool = False,
) -> (NDArray | DataFrame | tuple[NDArray, NDArray]
      | tuple[DataFrame, DataFrame]):
    """
    Create 2d array of lags.

    Parameters
    ----------
    x : array_like
        Data; if 2d, observation in rows and variables in columns.
    maxlag : int
        All lags from zero to maxlag are included.
    trim : {'forward', 'backward', 'both', 'none', None}
        The trimming method to use.

        * 'forward' : trim invalid observations in front.
        * 'backward' : trim invalid initial observations.
        * 'both' : trim invalid observations on both sides.
        * 'none', None : no trimming of observations.
    original : {'ex','sep','in'}
        How the original is treated.

        * 'ex' : drops the original array returning only the lagged values.
        * 'in' : returns the original array and the lagged values as a
          single array.
        * 'sep' : returns a tuple (original array, lagged values). The
          original array is truncated to have the same number of rows as
          the returned lagmat.
    use_pandas : bool
        If true, returns a DataFrame when the input is a pandas Series or
        DataFrame. If false, return numpy ndarrays.

    Returns
    -------
    lagmat : ndarray
        The array with lagged observations.
    y : ndarray, optional
        Only returned if original == 'sep'.

    Notes
    -----
    When using a pandas DataFrame or Series with use_pandas=True, trim can
    only be 'forward' or 'both' since it is not possible to consistently
    extend index values.

    Examples
    --------
    >>> from statsmodels.tsa.tsatools import lagmat
    >>> import numpy as np
    >>> X = np.arange(1, 7).reshape(-1, 2)
    >>> lagmat(X, maxlag=2, trim="forward", original='in')
    array([[ 1.,  2.,  0.,  0.,  0.,  0.],
           [ 3.,  4.,  1.,  2.,  0.,  0.],
           [ 5.,  6.,  3.,  4.,  1.,  2.]])

    >>> lagmat(X, maxlag=2, trim="backward", original='in')
    array([[ 5.,  6.,  3.,  4.,  1.,  2.],
           [ 0.,  0.,  5.,  6.,  3.,  4.],
           [ 0.,  0.,  0.,  0.,  5.,  6.]])

    >>> lagmat(X, maxlag=2, trim="both", original='in')
    array([[ 5.,  6.,  3.,  4.,  1.,  2.]])

    >>> lagmat(X, maxlag=2, trim="none", original='in')
    array([[ 1.,  2.,  0.,  0.,  0.,  0.],
           [ 3.,  4.,  1.,  2.,  0.,  0.],
           [ 5.,  6.,  3.,  4.,  1.,  2.],
           [ 0.,  0.,  5.,  6.,  3.,  4.],
           [ 0.,  0.,  0.,  0.,  5.,  6.]])
    """
    maxlag = int_like(maxlag, "maxlag")
    use_pandas = bool_like(use_pandas, "use_pandas")
    trim = string_like(
        trim,
        "trim",
        optional=True,
        options=("forward", "backward", "both", "none"),
    )
    original = string_like(original, "original",
                           options=("ex", "sep", "in"))

    # TODO: allow list of lags additional to maxlag
    orig = x
    x = array_like(x, "x", ndim=2, dtype=None)
    is_pandas = _is_using_pandas(orig, None) and use_pandas
    trim = "none" if trim is None else trim
    trim = trim.lower()
    if is_pandas and trim in ("none", "backward"):
        raise ValueError("trim cannot be 'none' or 'backward' when used "
                         "on Series or DataFrames")

    dropidx = 0
    nobs, nvar = x.shape
    if original in ["ex", "sep"]:
        dropidx = nvar
    if maxlag >= nobs:
        raise ValueError("maxlag should be < nobs")
    lm = np.zeros((nobs + maxlag, nvar * (maxlag + 1)))
    for k in range(0, int(maxlag + 1)):
        lm[
            maxlag - k:nobs + maxlag - k,
            nvar * (maxlag - k):nvar * (maxlag - k + 1),
        ] = x

    if trim in ("none", "forward"):
        startobs = 0
    elif trim in ("backward", "both"):
        startobs = maxlag
    else:
        raise ValueError("trim option not valid")

    if trim in ("none", "backward"):
        stopobs = len(lm)
    else:
        stopobs = nobs

    if is_pandas:
        x = orig
        x_columns = x.columns if isinstance(x, DataFrame) else [x.name]
        columns = [str(col) for col in x_columns]
        for lag in range(maxlag):
            lag_str = str(lag + 1)
            columns.extend([str(col) + ".L." + lag_str
                            for col in x_columns])
        lm = DataFrame(lm[:stopobs], index=x.index, columns=columns)
        lags = lm.iloc[startobs:]
        if original in ("sep", "ex"):
            leads = lags[x_columns]
            lags = lags.drop(x_columns, axis=1)
    else:
        lags = lm[startobs:stopobs, dropidx:]
        if original == "sep":
            leads = lm[startobs:stopobs, :dropidx]

    if original == "sep":
        return lags, leads
    else:
        return lags

def lagmat2ds(x, maxlag0, maxlagex=None, dropex=0, trim="forward", use_pandas=False): """ Generate lagmatrix for 2d array, columns arranged by variables. Parameters ---------- x : array_like Data, 2d. Observations in rows and variables in columns. maxlag0 : int The first variable all lags from zero to maxlag are included. maxlagex : {None, int} The max lag for all other variables all lags from zero to maxlag are included. dropex : int Exclude first dropex lags from other variables. For all variables, except the first, lags from dropex to maxlagex are included. trim : str The trimming method to use. * 'forward' : trim invalid observations in front. * 'backward' : trim invalid initial observations. * 'both' : trim invalid observations on both sides. * 'none' : no trimming of observations. use_pandas : bool If true, returns a DataFrame when the input is a pandas Series or DataFrame. If false, return numpy ndarrays. Returns ------- ndarray The array with lagged observations, columns ordered by variable. Notes ----- Inefficient implementation for unequal lags, implemented for convenience. """ maxlag0 = int_like(maxlag0, "maxlag0") maxlagex = int_like(maxlagex, "maxlagex", optional=True) trim = string_like( trim, "trim", optional=True, options=("forward", "backward", "both", "none"), ) if maxlagex is None: maxlagex = maxlag0 maxlag = max(maxlag0, maxlagex) is_pandas = _is_using_pandas(x, None) if x.ndim == 1: if is_pandas: x = pd.DataFrame(x) else: x = x[:, None] elif x.ndim == 0 or x.ndim > 2: raise ValueError("Only supports 1 and 2-dimensional data.") nobs, nvar = x.shape if is_pandas and use_pandas: lags = lagmat(x.iloc[:, 0], maxlag, trim=trim, original="in", use_pandas=True) lagsli = [lags.iloc[:, :maxlag0 + 1]] for k in range(1, nvar): lags = lagmat(x.iloc[:, k], maxlag, trim=trim, original="in", use_pandas=True) lagsli.append(lags.iloc[:, dropex:maxlagex + 1]) return pd.concat(lagsli, axis=1) elif is_pandas: x = np.asanyarray(x) lagsli = [ lagmat(x[:, 0], maxlag, trim=trim, original="in")[:, :maxlag0 + 1] ] for k in range(1, nvar): lagsli.append( lagmat(x[:, k], maxlag, trim=trim, original="in")[:, dropex:maxlagex + 1]) return np.column_stack(lagsli)
def add_trend(x, trend="c", prepend=False, has_constant="skip"): """ Add a trend and/or constant to an array. Parameters ---------- x : array_like Original array of data. trend : str {'n', 'c', 't', 'ct', 'ctt'} The trend to add. * 'n' add no trend. * 'c' add constant only. * 't' add trend only. * 'ct' add constant and linear trend. * 'ctt' add constant and linear and quadratic trend. prepend : bool If True, prepends the new data to the columns of X. has_constant : str {'raise', 'add', 'skip'} Controls what happens when trend is 'c' and a constant column already exists in x. 'raise' will raise an error. 'add' will add a column of 1s. 'skip' will return the data without change. 'skip' is the default. Returns ------- array_like The original data with the additional trend columns. If x is a pandas Series or DataFrame, then the trend column names are 'const', 'trend' and 'trend_squared'. See Also -------- statsmodels.tools.tools.add_constant Add a constant column to an array. Notes ----- Returns columns as ['ctt','ct','c'] whenever applicable. There is currently no checking for an existing trend. """ prepend = bool_like(prepend, "prepend") trend = string_like(trend, "trend", options=("n", "c", "t", "ct", "ctt")) has_constant = string_like(has_constant, "has_constant", options=("raise", "add", "skip")) # TODO: could be generalized for trend of aribitrary order columns = ["const", "trend", "trend_squared"] if trend == "n": return x.copy() elif trend == "c": # handles structured arrays columns = columns[:1] trendorder = 0 elif trend == "ct" or trend == "t": columns = columns[:2] if trend == "t": columns = columns[1:2] trendorder = 1 elif trend == "ctt": trendorder = 2 if _is_recarray(x): from statsmodels.tools.sm_exceptions import recarray_exception raise NotImplementedError(recarray_exception) is_pandas = _is_using_pandas(x, None) if is_pandas: if isinstance(x, pd.Series): x = pd.DataFrame(x) else: x = x.copy() else: x = np.asanyarray(x) nobs = len(x) trendarr = np.vander(np.arange(1, nobs + 1, dtype=np.float64), trendorder + 1) # put in order ctt trendarr = np.fliplr(trendarr) if trend == "t": trendarr = trendarr[:, 1] if "c" in trend: if is_pandas: # Mixed type protection def safe_is_const(s): try: return np.ptp(s) == 0.0 and np.any(s != 0.0) except: return False col_const = x.apply(safe_is_const, 0) else: ptp0 = np.ptp(np.asanyarray(x), axis=0) col_is_const = ptp0 == 0 nz_const = col_is_const & (x[0] != 0) col_const = nz_const if np.any(col_const): if has_constant == "raise": if x.ndim == 1: base_err = "x is constant." else: columns = np.arange(x.shape[1])[col_const] if isinstance(x, pd.DataFrame): columns = x.columns const_cols = ", ".join([str(c) for c in columns]) base_err = ( "x contains one or more constant columns. Column(s) " f"{const_cols} are constant.") msg = f"{base_err} Adding a constant with trend='{trend}' is not allowed." raise ValueError(msg) elif has_constant == "skip": columns = columns[1:] trendarr = trendarr[:, 1:] order = 1 if prepend else -1 if is_pandas: trendarr = pd.DataFrame(trendarr, index=x.index, columns=columns) x = [trendarr, x] x = pd.concat(x[::order], axis=1) else: x = [trendarr, x] x = np.column_stack(x[::order]) return x
def __init__(self, endog, exog=None, order=(1, 0), trend='c', error_cov_type='unstructured', measurement_error=False, enforce_stationarity=True, enforce_invertibility=True, **kwargs): # Model parameters self.error_cov_type = error_cov_type self.measurement_error = measurement_error self.enforce_stationarity = enforce_stationarity self.enforce_invertibility = enforce_invertibility # Save the given orders self.order = order self.trend = trend # Model orders self.k_ar = int(order[0]) self.k_ma = int(order[1]) self.k_trend = int(self.trend == 'c') # Check for valid model if trend not in ['c', 'nc']: raise ValueError('Invalid trend specification.') if error_cov_type not in ['diagonal', 'unstructured']: raise ValueError('Invalid error covariance matrix type' ' specification.') if self.k_ar == 0 and self.k_ma == 0: raise ValueError('Invalid VARMAX(p,q) specification; at least one' ' p,q must be greater than zero.') # Warn for VARMA model if self.k_ar > 0 and self.k_ma > 0: warn('Estimation of VARMA(p,q) models is not generically robust,' ' due especially to identification issues.') # Exogenous data self.k_exog = 0 if exog is not None: exog_is_using_pandas = _is_using_pandas(exog, None) if not exog_is_using_pandas: exog = np.asarray(exog) # Make sure we have 2-dimensional array if exog.ndim == 1: if not exog_is_using_pandas: exog = exog[:, None] else: exog = pd.DataFrame(exog) self.k_exog = exog.shape[1] # Note: at some point in the future might add state regression, as in # SARIMAX. self.mle_regression = self.k_exog > 0 # We need to have an array or pandas at this point if not _is_using_pandas(endog, None): endog = np.asanyarray(endog) # Model order # Used internally in various places _min_k_ar = max(self.k_ar, 1) self._k_order = _min_k_ar + self.k_ma # Number of states k_endog = endog.shape[1] k_posdef = k_endog k_states = k_endog * self._k_order # By default, initialize as stationary kwargs.setdefault('initialization', 'stationary') # By default, use LU decomposition kwargs.setdefault('inversion_method', INVERT_UNIVARIATE | SOLVE_LU) # Initialize the state space model super(VARMAX, self).__init__( endog, exog=exog, k_states=k_states, k_posdef=k_posdef, **kwargs ) # Initialize the parameters self.parameters = OrderedDict() self.parameters['trend'] = self.k_endog * self.k_trend self.parameters['ar'] = self.k_endog**2 * self.k_ar self.parameters['ma'] = self.k_endog**2 * self.k_ma self.parameters['regression'] = self.k_endog * self.k_exog if self.error_cov_type == 'diagonal': self.parameters['state_cov'] = self.k_endog # These parameters fill in a lower-triangular matrix which is then # dotted with itself to get a positive definite matrix. elif self.error_cov_type == 'unstructured': self.parameters['state_cov'] = ( int(self.k_endog * (self.k_endog + 1) / 2) ) self.parameters['obs_cov'] = self.k_endog * self.measurement_error self.k_params = sum(self.parameters.values()) # Initialize known elements of the state space matrices # If we have exog effects, then the state intercept needs to be # time-varying if self.k_exog > 0: self.ssm['state_intercept'] = np.zeros((self.k_states, self.nobs)) # The design matrix is just an identity for the first k_endog states idx = np.diag_indices(self.k_endog) self.ssm[('design',) + idx] = 1 # The transition matrix is described in four blocks, where the upper # left block is in companion form with the autoregressive coefficient # matrices (so it is shaped k_endog * k_ar x k_endog * k_ar) ... 
    if self.k_ar > 0:
        idx = np.diag_indices((self.k_ar - 1) * self.k_endog)
        idx = idx[0] + self.k_endog, idx[1]
        self.ssm[('transition',) + idx] = 1

    # ... and the lower right block is in companion form with zeros as the
    # coefficient matrices (it is shaped k_endog * k_ma x k_endog * k_ma).
    idx = np.diag_indices((self.k_ma - 1) * self.k_endog)
    idx = (idx[0] + (_min_k_ar + 1) * self.k_endog,
           idx[1] + _min_k_ar * self.k_endog)
    self.ssm[('transition',) + idx] = 1

    # The selection matrix is described in two blocks, where the upper block
    # selects all k_posdef errors in the first k_endog rows (the upper block
    # is shaped k_endog * k_ar x k) and the lower block also selects all
    # k_posdef errors in the first k_endog rows (the lower block is shaped
    # k_endog * k_ma x k).
    idx = np.diag_indices(self.k_endog)
    self.ssm[('selection',) + idx] = 1
    idx = idx[0] + _min_k_ar * self.k_endog, idx[1]
    if self.k_ma > 0:
        self.ssm[('selection',) + idx] = 1

    # Cache some indices
    if self.trend == 'c' and self.k_exog == 0:
        self._idx_state_intercept = np.s_['state_intercept', :k_endog]
    elif self.k_exog > 0:
        self._idx_state_intercept = np.s_['state_intercept', :k_endog, :]
    if self.k_ar > 0:
        self._idx_transition = np.s_['transition', :k_endog, :]
    else:
        self._idx_transition = np.s_['transition', :k_endog, k_endog:]
    if self.error_cov_type == 'diagonal':
        self._idx_state_cov = (
            ('state_cov',) + np.diag_indices(self.k_endog))
    elif self.error_cov_type == 'unstructured':
        self._idx_lower_state_cov = np.tril_indices(self.k_endog)
    if self.measurement_error:
        self._idx_obs_cov = ('obs_cov',) + np.diag_indices(self.k_endog)

    # Cache some slices
    def _slice(key, offset):
        length = self.parameters[key]
        param_slice = np.s_[offset:offset + length]
        offset += length
        return param_slice, offset

    offset = 0
    self._params_trend, offset = _slice('trend', offset)
    self._params_ar, offset = _slice('ar', offset)
    self._params_ma, offset = _slice('ma', offset)
    self._params_regression, offset = _slice('regression', offset)
    self._params_state_cov, offset = _slice('state_cov', offset)
    self._params_obs_cov, offset = _slice('obs_cov', offset)
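# Dimension bookkeeping sketch for the construction above (illustrative only;
# the helper name and numbers are assumptions). The state vector stacks the
# companion forms of the AR and MA parts, so the state dimension is
# k_endog * (max(k_ar, 1) + k_ma), matching the k_states computed in __init__.
def _demo_varmax_state_dim():
    k_endog, k_ar, k_ma = 2, 1, 1
    _min_k_ar = max(k_ar, 1)
    k_states = k_endog * (_min_k_ar + k_ma)
    assert k_states == 4  # 2 AR companion states plus 2 MA companion states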
def __init__(self, endog=None, exog=None, order=None,
             seasonal_order=None, ar_order=None, diff=None, ma_order=None,
             seasonal_ar_order=None, seasonal_diff=None,
             seasonal_ma_order=None, seasonal_periods=None, trend=None,
             enforce_stationarity=None, enforce_invertibility=None,
             concentrate_scale=None, trend_offset=1, dates=None, freq=None,
             missing='none', validate_specification=True):
    # Basic parameters
    self.enforce_stationarity = enforce_stationarity
    self.enforce_invertibility = enforce_invertibility
    self.concentrate_scale = concentrate_scale
    self.trend_offset = trend_offset

    # Validate that we were not given conflicting specifications
    has_order = order is not None
    has_specific_order = (ar_order is not None or diff is not None
                          or ma_order is not None)
    has_seasonal_order = seasonal_order is not None
    has_specific_seasonal_order = (seasonal_ar_order is not None
                                   or seasonal_diff is not None
                                   or seasonal_ma_order is not None
                                   or seasonal_periods is not None)
    if has_order and has_specific_order:
        raise ValueError('Cannot specify both `order` and any of'
                         ' `ar_order`, `diff`, or `ma_order`.')
    if has_seasonal_order and has_specific_seasonal_order:
        raise ValueError('Cannot specify both `seasonal_order` and any of'
                         ' `seasonal_ar_order`, `seasonal_diff`,'
                         ' `seasonal_ma_order`, or `seasonal_periods`.')

    # Compute `order`
    if has_specific_order:
        ar_order = 0 if ar_order is None else ar_order
        diff = 0 if diff is None else diff
        ma_order = 0 if ma_order is None else ma_order
        order = (ar_order, diff, ma_order)
    elif not has_order:
        order = (0, 0, 0)

    # Compute `seasonal_order`
    if has_specific_seasonal_order:
        seasonal_ar_order = (0 if seasonal_ar_order is None
                             else seasonal_ar_order)
        seasonal_diff = 0 if seasonal_diff is None else seasonal_diff
        seasonal_ma_order = (0 if seasonal_ma_order is None
                             else seasonal_ma_order)
        seasonal_periods = (0 if seasonal_periods is None
                            else seasonal_periods)
        seasonal_order = (seasonal_ar_order, seasonal_diff,
                          seasonal_ma_order, seasonal_periods)
    elif not has_seasonal_order:
        seasonal_order = (0, 0, 0, 0)

    # Validate shapes of `order`, `seasonal_order`
    if len(order) != 3:
        raise ValueError('`order` argument must be an iterable with three'
                         ' elements.')
    if len(seasonal_order) != 4:
        raise ValueError('`seasonal_order` argument must be an iterable'
                         ' with four elements.')

    # Validate differencing parameters
    if validate_specification:
        if order[1] < 0:
            raise ValueError('Cannot specify negative differencing.')
        if order[1] != int(order[1]):
            raise ValueError('Cannot specify fractional differencing.')
        if seasonal_order[1] < 0:
            raise ValueError('Cannot specify negative seasonal'
                             ' differencing.')
        if seasonal_order[1] != int(seasonal_order[1]):
            raise ValueError('Cannot specify fractional seasonal'
                             ' differencing.')
        if seasonal_order[3] < 0:
            raise ValueError('Cannot specify negative seasonal'
                             ' periodicity.')

    # Standardize to integers or lists of integers
    order = (standardize_lag_order(order[0], 'AR'),
             int(order[1]),
             standardize_lag_order(order[2], 'MA'))
    seasonal_order = (standardize_lag_order(seasonal_order[0],
                                            'seasonal AR'),
                      int(seasonal_order[1]),
                      standardize_lag_order(seasonal_order[2],
                                            'seasonal MA'),
                      int(seasonal_order[3]))

    # Validate seasonals
    if validate_specification:
        if seasonal_order[3] == 1:
            raise ValueError('Seasonal periodicity must be greater'
                             ' than 1.')
        if ((seasonal_order[0] != 0 or seasonal_order[1] != 0
                or seasonal_order[2] != 0) and seasonal_order[3] == 0):
            raise ValueError('Must include nonzero seasonal periodicity if'
                             ' including seasonal AR, MA, or'
                             ' differencing.')

    # Basic order
    self.order = order
self.ar_order, self.diff, self.ma_order = order self.seasonal_order = seasonal_order (self.seasonal_ar_order, self.seasonal_diff, self.seasonal_ma_order, self.seasonal_periods) = seasonal_order # Lists of included lags if isinstance(self.ar_order, list): self.ar_lags = self.ar_order else: self.ar_lags = np.arange(1, self.ar_order + 1).tolist() if isinstance(self.ma_order, list): self.ma_lags = self.ma_order else: self.ma_lags = np.arange(1, self.ma_order + 1).tolist() if isinstance(self.seasonal_ar_order, list): self.seasonal_ar_lags = self.seasonal_ar_order else: self.seasonal_ar_lags = (np.arange(1, self.seasonal_ar_order + 1).tolist()) if isinstance(self.seasonal_ma_order, list): self.seasonal_ma_lags = self.seasonal_ma_order else: self.seasonal_ma_lags = (np.arange(1, self.seasonal_ma_order + 1).tolist()) # Maximum lag orders self.max_ar_order = self.ar_lags[-1] if self.ar_lags else 0 self.max_ma_order = self.ma_lags[-1] if self.ma_lags else 0 self.max_seasonal_ar_order = (self.seasonal_ar_lags[-1] if self.seasonal_ar_lags else 0) self.max_seasonal_ma_order = (self.seasonal_ma_lags[-1] if self.seasonal_ma_lags else 0) self.max_reduced_ar_order = ( self.max_ar_order + self.max_seasonal_ar_order * self.seasonal_periods) self.max_reduced_ma_order = ( self.max_ma_order + self.max_seasonal_ma_order * self.seasonal_periods) # Check that we don't have duplicate AR or MA lags from the seasonal # component ar_lags = set(self.ar_lags) seasonal_ar_lags = set( np.array(self.seasonal_ar_lags) * self.seasonal_periods) duplicate_ar_lags = ar_lags.intersection(seasonal_ar_lags) if validate_specification and len(duplicate_ar_lags) > 0: raise ValueError('Invalid model: autoregressive lag(s) %s are' ' in both the seasonal and non-seasonal' ' autoregressive components.' % duplicate_ar_lags) ma_lags = set(self.ma_lags) seasonal_ma_lags = set( np.array(self.seasonal_ma_lags) * self.seasonal_periods) duplicate_ma_lags = ma_lags.intersection(seasonal_ma_lags) if validate_specification and len(duplicate_ma_lags) > 0: raise ValueError('Invalid model: moving average lag(s) %s are' ' in both the seasonal and non-seasonal' ' moving average components.' % duplicate_ma_lags) # Handle trend self.trend = trend self.trend_poly, _ = prepare_trend_spec(trend) # Check for a constant column in the provided exog exog_is_pandas = _is_using_pandas(exog, None) if (validate_specification and exog is not None and len(self.trend_poly) > 0 and self.trend_poly[0] == 1): # Figure out if we have any constant columns x = np.asanyarray(exog) ptp0 = np.ptp(x, axis=0) col_is_const = ptp0 == 0 nz_const = col_is_const & (x[0] != 0) col_const = nz_const # If we already have a constant column, raise an error if np.any(col_const): raise ValueError('A constant trend was included in the model' ' specification, but the `exog` data already' ' contains a column of constants.') # This contains the included exponents of the trend polynomial, # where e.g. the constant term has exponent 0, a linear trend has # exponent 1, etc. self.trend_terms = np.where(self.trend_poly == 1)[0] # Trend order is either the degree of the trend polynomial, if all # exponents are included, or a list of included exponents. Here we need # to make a distinction between a degree zero polynomial (i.e. a # constant) and the zero polynomial (i.e. not even a constant). The # former has `trend_order = 0`, while the latter has # `trend_order = None`. 
self.k_trend = len(self.trend_terms) if len(self.trend_terms) == 0: self.trend_order = None self.trend_degree = None elif np.all(self.trend_terms == np.arange(len(self.trend_terms))): self.trend_order = self.trend_terms[-1] self.trend_degree = self.trend_terms[-1] else: self.trend_order = self.trend_terms self.trend_degree = self.trend_terms[-1] # Handle endog / exog # Standardize exog self.k_exog, exog = prepare_exog(exog) # Standardize endog (including creating a faux endog if necessary) faux_endog = endog is None if endog is None: endog = [] if exog is None else np.zeros(len(exog)) * np.nan # Add trend data into exog nobs = len(endog) if exog is None else len(exog) if self.trend_order is not None: # Add in the data trend_data = self.construct_trend_data(nobs, trend_offset) if exog is None: exog = trend_data elif exog_is_pandas: trend_data = pd.DataFrame(trend_data, index=exog.index, columns=self.construct_trend_names()) exog = pd.concat([trend_data, exog], axis=1) else: exog = np.c_[trend_data, exog] # Create an underlying time series model, to handle endog / exog, # especially validating shapes, retrieving names, and potentially # providing us with a time series index self._model = TimeSeriesModel(endog, exog=exog, dates=dates, freq=freq, missing=missing) self.endog = None if faux_endog else self._model.endog self.exog = self._model.exog # Validate endog shape if (validate_specification and not faux_endog and self.endog.ndim > 1 and self.endog.shape[1] > 1): raise ValueError('SARIMAX models require univariate `endog`. Got' ' shape %s.' % str(self.endog.shape)) self._has_missing = (None if faux_endog else np.any( np.isnan(self.endog)))
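# Usage sketch of the two equivalent ways to specify an ARIMA order handled
# above (the demo function is hypothetical, and the import path assumes this
# class is exposed as statsmodels.tsa.arima.specification.SARIMAXSpecification).
def _demo_sarimax_specification():
    import numpy as np
    from statsmodels.tsa.arima.specification import SARIMAXSpecification

    endog = np.random.standard_normal(100)
    # Packed `order` and the individual ar_order / diff / ma_order arguments
    # produce the same specification; mixing the two styles raises.
    spec = SARIMAXSpecification(endog, order=(2, 1, 1))
    assert spec.ar_lags == [1, 2] and spec.ma_lags == [1]
    spec2 = SARIMAXSpecification(endog, ar_order=2, diff=1, ma_order=1)
    assert spec2.order == spec.order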
def __init__(self, endog, exog=None, order=(1, 0), trend='c', error_cov_type='unstructured', measurement_error=False, enforce_stationarity=True, enforce_invertibility=True, trend_offset=1, **kwargs): # Model parameters self.error_cov_type = error_cov_type self.measurement_error = measurement_error self.enforce_stationarity = enforce_stationarity self.enforce_invertibility = enforce_invertibility # Save the given orders self.order = order # Model orders self.k_ar = int(order[0]) self.k_ma = int(order[1]) # Check for valid model if error_cov_type not in ['diagonal', 'unstructured']: raise ValueError('Invalid error covariance matrix type' ' specification.') if self.k_ar == 0 and self.k_ma == 0: raise ValueError('Invalid VARMAX(p,q) specification; at least one' ' p,q must be greater than zero.') # Warn for VARMA model if self.k_ar > 0 and self.k_ma > 0: warn( 'Estimation of VARMA(p,q) models is not generically robust,' ' due especially to identification issues.', EstimationWarning) # Trend self.trend = trend self.trend_offset = trend_offset self.polynomial_trend, self.k_trend = prepare_trend_spec(self.trend) self._trend_is_const = (self.polynomial_trend.size == 1 and self.polynomial_trend[0] == 1) # Exogenous data (self.k_exog, exog) = prepare_exog(exog) # Note: at some point in the future might add state regression, as in # SARIMAX. self.mle_regression = self.k_exog > 0 # We need to have an array or pandas at this point if not _is_using_pandas(endog, None): endog = np.asanyarray(endog) # Model order # Used internally in various places _min_k_ar = max(self.k_ar, 1) self._k_order = _min_k_ar + self.k_ma # Number of states k_endog = endog.shape[1] k_posdef = k_endog k_states = k_endog * self._k_order # By default, initialize as stationary kwargs.setdefault('initialization', 'stationary') # By default, use LU decomposition kwargs.setdefault('inversion_method', INVERT_UNIVARIATE | SOLVE_LU) # Initialize the state space model super(VARMAX, self).__init__(endog, exog=exog, k_states=k_states, k_posdef=k_posdef, **kwargs) # Set as time-varying model if we have time-trend or exog if self.k_exog > 0 or (self.k_trend > 0 and not self._trend_is_const): self.ssm._time_invariant = False # Initialize the parameters self.parameters = OrderedDict() self.parameters['trend'] = self.k_endog * self.k_trend self.parameters['ar'] = self.k_endog**2 * self.k_ar self.parameters['ma'] = self.k_endog**2 * self.k_ma self.parameters['regression'] = self.k_endog * self.k_exog if self.error_cov_type == 'diagonal': self.parameters['state_cov'] = self.k_endog # These parameters fill in a lower-triangular matrix which is then # dotted with itself to get a positive definite matrix. 
    elif self.error_cov_type == 'unstructured':
        self.parameters['state_cov'] = (
            int(self.k_endog * (self.k_endog + 1) / 2))
    self.parameters['obs_cov'] = self.k_endog * self.measurement_error
    self.k_params = sum(self.parameters.values())

    # Initialize trend data
    self._trend_data = prepare_trend_data(self.polynomial_trend,
                                          self.k_trend, self.nobs,
                                          offset=self.trend_offset)

    # Initialize known elements of the state space matrices

    # If we have exog effects, then the state intercept needs to be
    # time-varying
    if (self.k_trend > 0 and not self._trend_is_const) or self.k_exog > 0:
        self.ssm['state_intercept'] = np.zeros((self.k_states, self.nobs))
        # self.ssm['obs_intercept'] = np.zeros((self.k_endog, self.nobs))

    # The design matrix is just an identity for the first k_endog states
    idx = np.diag_indices(self.k_endog)
    self.ssm[('design',) + idx] = 1

    # The transition matrix is described in four blocks, where the upper
    # left block is in companion form with the autoregressive coefficient
    # matrices (so it is shaped k_endog * k_ar x k_endog * k_ar) ...
    if self.k_ar > 0:
        idx = np.diag_indices((self.k_ar - 1) * self.k_endog)
        idx = idx[0] + self.k_endog, idx[1]
        self.ssm[('transition',) + idx] = 1

    # ... and the lower right block is in companion form with zeros as the
    # coefficient matrices (it is shaped k_endog * k_ma x k_endog * k_ma).
    idx = np.diag_indices((self.k_ma - 1) * self.k_endog)
    idx = (idx[0] + (_min_k_ar + 1) * self.k_endog,
           idx[1] + _min_k_ar * self.k_endog)
    self.ssm[('transition',) + idx] = 1

    # The selection matrix is described in two blocks, where the upper block
    # selects all k_posdef errors in the first k_endog rows (the upper block
    # is shaped k_endog * k_ar x k) and the lower block also selects all
    # k_posdef errors in the first k_endog rows (the lower block is shaped
    # k_endog * k_ma x k).
    idx = np.diag_indices(self.k_endog)
    self.ssm[('selection',) + idx] = 1
    idx = idx[0] + _min_k_ar * self.k_endog, idx[1]
    if self.k_ma > 0:
        self.ssm[('selection',) + idx] = 1

    # Cache some indices
    if self._trend_is_const and self.k_exog == 0:
        self._idx_state_intercept = np.s_['state_intercept', :k_endog, :]
    elif self.k_trend > 0 or self.k_exog > 0:
        self._idx_state_intercept = np.s_['state_intercept', :k_endog, :-1]
    if self.k_ar > 0:
        self._idx_transition = np.s_['transition', :k_endog, :]
    else:
        self._idx_transition = np.s_['transition', :k_endog, k_endog:]
    if self.error_cov_type == 'diagonal':
        self._idx_state_cov = (
            ('state_cov',) + np.diag_indices(self.k_endog))
    elif self.error_cov_type == 'unstructured':
        self._idx_lower_state_cov = np.tril_indices(self.k_endog)
    if self.measurement_error:
        self._idx_obs_cov = ('obs_cov',) + np.diag_indices(self.k_endog)

    # Cache some slices
    def _slice(key, offset):
        length = self.parameters[key]
        param_slice = np.s_[offset:offset + length]
        offset += length
        return param_slice, offset

    offset = 0
    self._params_trend, offset = _slice('trend', offset)
    self._params_ar, offset = _slice('ar', offset)
    self._params_ma, offset = _slice('ma', offset)
    self._params_regression, offset = _slice('regression', offset)
    self._params_state_cov, offset = _slice('state_cov', offset)
    self._params_obs_cov, offset = _slice('obs_cov', offset)

    # Update _init_keys attached by super
    self._init_keys += ['order', 'trend', 'error_cov_type',
                        'measurement_error', 'enforce_stationarity',
                        'enforce_invertibility'] + list(kwargs.keys())
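# Parameter-count sketch for the bookkeeping above (illustrative only; the
# helper name and numbers are assumptions). For a bivariate VARMAX(1, 1) with
# a constant trend and an unstructured error covariance, the parameter vector
# decomposes exactly as the `parameters` OrderedDict records it.
def _demo_varmax_param_count():
    k_endog, k_ar, k_ma, k_trend = 2, 1, 1, 1
    n_trend = k_endog * k_trend            # 2 intercepts
    n_ar = k_endog ** 2 * k_ar             # 4 AR coefficients
    n_ma = k_endog ** 2 * k_ma             # 4 MA coefficients
    n_cov = k_endog * (k_endog + 1) // 2   # 3 lower-triangular Cholesky terms
    assert n_trend + n_ar + n_ma + n_cov == 13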
def lagmat(x, maxlag, trim='forward', original='ex', use_pandas=False):
    """
    Create 2d array of lags.

    Parameters
    ----------
    x : array_like, 1d or 2d
        Data; if 2d, observations in rows and variables in columns.
    maxlag : int
        All lags from zero to maxlag are included.
    trim : str {'forward', 'backward', 'both', 'none'} or None
        * 'forward' : trim invalid observations in front.
        * 'backward' : trim invalid initial observations.
        * 'both' : trim invalid observations on both sides.
        * 'none', None : no trimming of observations.
    original : str {'ex','sep','in'}
        * 'ex' : drops the original array returning only the lagged values.
        * 'in' : returns the original array and the lagged values as a
          single array.
        * 'sep' : returns a tuple (original array, lagged values). The
          original array is truncated to have the same number of rows as
          the returned lagmat.
    use_pandas : bool, optional
        If true, returns a DataFrame when the input is a pandas Series or
        DataFrame. If false, return numpy ndarrays.

    Returns
    -------
    lagmat : 2d array
        The array with lagged observations.
    y : 2d array, optional
        Only returned if original == 'sep'.

    Examples
    --------
    >>> from statsmodels.tsa.tsatools import lagmat
    >>> import numpy as np
    >>> X = np.arange(1, 7).reshape(-1, 2)
    >>> lagmat(X, maxlag=2, trim="forward", original='in')
    array([[ 1.,  2.,  0.,  0.,  0.,  0.],
           [ 3.,  4.,  1.,  2.,  0.,  0.],
           [ 5.,  6.,  3.,  4.,  1.,  2.]])

    >>> lagmat(X, maxlag=2, trim="backward", original='in')
    array([[ 5.,  6.,  3.,  4.,  1.,  2.],
           [ 0.,  0.,  5.,  6.,  3.,  4.],
           [ 0.,  0.,  0.,  0.,  5.,  6.]])

    >>> lagmat(X, maxlag=2, trim="both", original='in')
    array([[ 5.,  6.,  3.,  4.,  1.,  2.]])

    >>> lagmat(X, maxlag=2, trim="none", original='in')
    array([[ 1.,  2.,  0.,  0.,  0.,  0.],
           [ 3.,  4.,  1.,  2.,  0.,  0.],
           [ 5.,  6.,  3.,  4.,  1.,  2.],
           [ 0.,  0.,  5.,  6.,  3.,  4.],
           [ 0.,  0.,  0.,  0.,  5.,  6.]])

    Notes
    -----
    When using a pandas DataFrame or Series with use_pandas=True, trim can
    only be 'forward' or 'both' since it is not possible to consistently
    extend index values.
    """
    # TODO: allow list of lags additional to maxlag
    is_pandas = _is_using_pandas(x, None) and use_pandas
    trim = 'none' if trim is None else trim
    trim = trim.lower()
    if is_pandas and trim in ('none', 'backward'):
        raise ValueError("trim cannot be 'none' or 'backward' when used on "
                         "Series or DataFrames")

    xa = np.asarray(x)
    dropidx = 0
    if xa.ndim == 1:
        xa = xa[:, None]
    nobs, nvar = xa.shape
    if original in ['ex', 'sep']:
        dropidx = nvar
    if maxlag >= nobs:
        raise ValueError("maxlag should be < nobs")
    lm = np.zeros((nobs + maxlag, nvar * (maxlag + 1)))
    for k in range(0, int(maxlag + 1)):
        lm[maxlag - k:nobs + maxlag - k,
           nvar * (maxlag - k):nvar * (maxlag - k + 1)] = xa

    if trim in ('none', 'forward'):
        startobs = 0
    elif trim in ('backward', 'both'):
        startobs = maxlag
    else:
        raise ValueError('trim option not valid')

    if trim in ('none', 'backward'):
        stopobs = len(lm)
    else:
        stopobs = nobs

    if is_pandas:
        x_columns = x.columns if isinstance(x, pd.DataFrame) else [x.name]
        columns = [str(col) for col in x_columns]
        for lag in range(maxlag):
            lag_str = str(lag + 1)
            columns.extend([str(col) + '.L.' + lag_str for col in x_columns])
        lm = pd.DataFrame(lm[:stopobs], index=x.index, columns=columns)
        lags = lm.iloc[startobs:]
        if original in ('sep', 'ex'):
            leads = lags[x_columns]
            lags = lags.drop(x_columns, axis=1)
    else:
        lags = lm[startobs:stopobs, dropidx:]
        if original == 'sep':
            leads = lm[startobs:stopobs, :dropidx]

    if original == 'sep':
        return lags, leads
    else:
        return lags
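# Usage sketch for the pandas path of lagmat (the demo function is
# hypothetical; the Series name and expected labels are assumptions for the
# example). With use_pandas=True, lag columns are labeled '<name>.L.<lag>'
# and the input's index is preserved on the trimmed rows.
def _demo_lagmat_pandas():
    import numpy as np
    import pandas as pd
    from statsmodels.tsa.tsatools import lagmat

    s = pd.Series(np.arange(5.0), name="y")
    lags = lagmat(s, maxlag=2, trim="both", original="in", use_pandas=True)
    # 'both' drops the first maxlag rows; columns keep the original first.
    assert list(lags.columns) == ["y", "y.L.1", "y.L.2"]
    assert lags.shape == (3, 3)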