Example #1
0
def handle_formula_data(Y, X, formula, depth=0, missing='drop'):
    """
    Returns endog, exog, and the model specification from arrays and formula

    Parameters
    ----------
    Y : array-like
        Either endog (the LHS) of a model specification or all of the data.
        Y must define __getitem__ for now.
    X : array-like
        Either exog or None. If all the data for the formula is provided in
        Y then you must explicitly set X to None.
    formula : str or patsy.model_desc
        You can pass a handler by import formula_handler and adding a
        key-value pair where the key is the formula object class and
        the value is a function that returns endog, exog, formula object

    Returns
    -------
    endog : array-like
        Should preserve the input type of Y,X
    exog : array-like
        Should preserve the input type of Y,X. Could be None.
    """
    # half ass attempt to handle other formula objects
    if isinstance(formula, tuple(iterkeys(formula_handler))):
        return formula_handler[type(formula)]

    na_action = NAAction(on_NA=missing)

    if X is not None:
        if data_util._is_using_pandas(Y, X):
            result = dmatrices(formula, (Y, X),
                               depth,
                               return_type='dataframe',
                               NA_action=na_action)
        else:
            result = dmatrices(formula, (Y, X),
                               depth,
                               return_type='dataframe',
                               NA_action=na_action)
    else:
        if data_util._is_using_pandas(Y, None):
            result = dmatrices(formula,
                               Y,
                               depth,
                               return_type='dataframe',
                               NA_action=na_action)
        else:
            result = dmatrices(formula,
                               Y,
                               depth,
                               return_type='dataframe',
                               NA_action=na_action)

    # if missing == 'raise' there's not missing_mask
    missing_mask = getattr(na_action, 'missing_mask', None)
    if not np.any(missing_mask):
        missing_mask = None
    return result, missing_mask
Example #2
0
def handle_formula_data(Y, X, formula, depth=0, missing='drop'):
    """
    Returns endog, exog, and the model specification from arrays and formula

    Parameters
    ----------
    Y : array-like
        Either endog (the LHS) of a model specification or all of the data.
        Y must define __getitem__ for now.
    X : array-like
        Either exog or None. If all the data for the formula is provided in
        Y then you must explicitly set X to None.
    formula : str or patsy.model_desc
        You can pass a handler by import formula_handler and adding a
        key-value pair where the key is the formula object class and
        the value is a function that returns endog, exog, formula object

    Returns
    -------
    endog : array-like
        Should preserve the input type of Y,X
    exog : array-like
        Should preserve the input type of Y,X. Could be None.
    """
    # half ass attempt to handle other formula objects
    if isinstance(formula, tuple(iterkeys(formula_handler))):
        return formula_handler[type(formula)]

    na_action = NAAction(on_NA=missing)

    if X is not None:
        if data_util._is_using_pandas(Y, X):
            result = dmatrices(formula, (Y, X), depth,
                               return_type='dataframe', NA_action=na_action)
        else:
            result = dmatrices(formula, (Y, X), depth,
                               return_type='dataframe', NA_action=na_action)
    else:
        if data_util._is_using_pandas(Y, None):
            result = dmatrices(formula, Y, depth, return_type='dataframe',
                               NA_action=na_action)
        else:
            result = dmatrices(formula, Y, depth, return_type='dataframe',
                               NA_action=na_action)

    # if missing == 'raise' there's not missing_mask
    missing_mask = getattr(na_action, 'missing_mask', None)
    if not np.any(missing_mask):
        missing_mask = None
    if len(result) > 1:  # have RHS design
        design_info = result[1].design_info  # detach it from DataFrame
    else:
        design_info = None
    # NOTE: is there ever a case where we'd need LHS design_info?
    return result, missing_mask, design_info
Example #3
0
def concat(series, axis=0, allow_mix=False):
    """
    Concatenate a set of series.

    Parameters
    ----------
    series : iterable
        An iterable of series to be concatenated
    axis : int, optional
        The axis along which to concatenate. Default is 1 (columns).
    allow_mix : bool
        Whether or not to allow a mix of pandas and non-pandas objects. Default
        is False. If true, the returned object is an ndarray, and additional
        pandas metadata (e.g. column names, indices, etc) is lost.

    Returns
    -------
    concatenated : array or pd.DataFrame
        The concatenated array. Will be a DataFrame if series are pandas
        objects.
    """
    is_pandas = np.r_[[_is_using_pandas(s, None) for s in series]]

    if np.all(is_pandas):
        concatenated = pd.concat(series, axis=axis)
    elif np.all(~is_pandas) or allow_mix:
        concatenated = np.concatenate(series, axis=axis)
    else:
        raise ValueError('Attempted to concatenate Pandas objects with'
                         ' non-Pandas objects with `allow_mix=False`.')

    return concatenated
    def sort(self, data, index=None):
        """Applies a (potentially hierarchical) sort operation on a numpy array
        or pandas series/dataframe based on the grouping index or a
        user-supplied index.  Returns an object of the same type as the
        original data as well as the matching (sorted) Pandas index.
        """

        if index is None:
            index = self.index
        if data_util._is_using_ndarray_type(data, None):
            if data.ndim == 1:
                out = pd.Series(data, index=index, copy=True)
                out = out.sort_index()
            else:
                out = pd.DataFrame(data, index=index)
                out = out.sort_index(inplace=False)  # copies
            return np.array(out), out.index
        elif data_util._is_using_pandas(data, None):
            out = data
            out = out.reindex(index)  # copies?
            out = out.sort_index()
            return out, out.index
        else:
            msg = 'data must be a Numpy array or a Pandas Series/DataFrame'
            raise ValueError(msg)
Example #5
0
def handle_data(endog, exog, missing='none', hasconst=None, **kwargs):
    """
    Given inputs
    """
    # deal with lists and tuples up-front
    if isinstance(endog, (list, tuple)):
        endog = np.asarray(endog)
    if isinstance(exog, (list, tuple)):
        exog = np.asarray(exog)

    if data_util._is_using_ndarray_type(endog, exog):
        klass = ModelData
    elif data_util._is_using_pandas(endog, exog):
        klass = PandasData
    elif data_util._is_using_patsy(endog, exog):
        klass = PatsyData
    # keep this check last
    elif data_util._is_using_ndarray(endog, exog):
        klass = ModelData
    else:
        raise ValueError('unrecognized data structures: %s / %s' %
                         (type(endog), type(exog)))

    return klass(endog,
                 exog=exog,
                 missing=missing,
                 hasconst=hasconst,
                 **kwargs)
Example #6
0
    def __init__(self, fname, data, convert_dates=None, encoding="latin-1",
                 byteorder=None):
        warnings.warn(
            "StataWriter is deprecated as of 0.10.0 and will be removed in a "
            "future version.  Use pandas.DataFrame.to_stata or "
            "pandas.io.stata.StatWriter instead.",
            FutureWarning)

        self._convert_dates = convert_dates
        # attach nobs, nvars, data, varlist, typlist
        if data_util._is_using_pandas(data, None):
            self._prepare_pandas(data)

        elif data_util._is_array_like(data, None):
            data = np.asarray(data)
            if data_util._is_structured_ndarray(data):
                self._prepare_structured_array(data)
            else:
                if convert_dates is not None:
                    raise ValueError("Not able to convert dates in a plain"
                                     " ndarray.")
                self._prepare_ndarray(data)

        else: # pragma : no cover
            raise ValueError("Type %s for data not understood" % type(data))


        if byteorder is None:
            byteorder = sys.byteorder
        self._byteorder = _set_endianness(byteorder)
        self._encoding = encoding
        self._file = get_file_obj(fname, 'wb', encoding)
Example #7
0
def handle_data(endog, exog):
    """
    Given inputs
    """
    # deal with lists and tuples up-front
    if isinstance(endog, (list, tuple)):
        endog = np.asarray(endog)
    if isinstance(exog, (list, tuple)):
        exog = np.asarray(exog)

    if data_util._is_using_pandas(endog, exog):
        klass = PandasData
    elif data_util._is_using_larry(endog, exog):
        klass = LarryData
    elif data_util._is_using_timeseries(endog, exog):
        klass = TimeSeriesData
    elif data_util._is_using_patsy(endog, exog):
        klass = PatsyData
    # keep this check last
    elif data_util._is_using_ndarray(endog, exog):
        klass = ModelData
    else:
        raise ValueError("unrecognized data structures: %s / %s" % (type(endog), type(exog)))

    return klass(endog, exog=exog)
Example #8
0
    def sort(self, data, index=None):
        '''Applies a (potentially hierarchical) sort operation on a numpy array
        or pandas series/dataframe based on the grouping index or a
        user-supplied index.  Returns an object of the same type as the
        original data as well as the matching (sorted) Pandas index.
        '''

        if index is None:
            index = self.index
        if data_util._is_using_ndarray_type(data, None):
            if data.ndim == 1:
                out = pd.Series(data, index=index, copy=True)
                out = out.sort_index()
            else:
                out = pd.DataFrame(data, index=index)
                out = out.sort(inplace=False)  # copies
            return np.array(out), out.index
        elif data_util._is_using_pandas(data, None):
            out = data
            out = out.reindex(index)  # copies?
            out = out.sort_index()
            return out, out.index
        else:
            msg = 'data must be a Numpy array or a Pandas Series/DataFrame'
            raise ValueError(msg)
    def __init__(self,
                 fname,
                 data,
                 convert_dates=None,
                 encoding="latin-1",
                 byteorder=None):
        warnings.warn(
            "StataWriter is deprecated as of 0.10.0 and will be removed in a "
            "future version.  Use pandas.DataFrame.to_stata or "
            "pandas.io.stata.StatWriter instead.", FutureWarning)

        self._convert_dates = convert_dates
        # attach nobs, nvars, data, varlist, typlist
        if data_util._is_using_pandas(data, None):
            self._prepare_pandas(data)

        elif data_util._is_array_like(data, None):
            data = np.asarray(data)
            if data_util._is_structured_ndarray(data):
                self._prepare_structured_array(data)
            else:
                if convert_dates is not None:
                    raise ValueError("Not able to convert dates in a plain"
                                     " ndarray.")
                self._prepare_ndarray(data)

        else:  # pragma : no cover
            raise ValueError("Type %s for data not understood" % type(data))

        if byteorder is None:
            byteorder = sys.byteorder
        self._byteorder = _set_endianness(byteorder)
        self._encoding = encoding
        self._file = get_file_obj(fname, 'wb', encoding)
Example #10
0
def auto_arima(endog, freq=None, d=None, D=None, max_p=5, max_q=5, max_P=2, max_Q=2, max_order=5, max_d=2, max_D=1, start_p=2, start_q=2, start_P=1, start_Q=1, stationary=False,
               ic="aic", stepwise=True, trace=False, approximation=None,
               test="adf", seasonal_test="ch", allowdrift=True, allowmean=True, lambda_parameter=None, *args, **kwargs):
        # Parameter Validity Check
    if np.any(np.isnan(endog)):
        raise ValueError("Missing Values in Series")
    origin_endog = endog
    if _is_using_pandas(endog, None):
        endog = np.asarray(endog)
    if len(endog) <= 10:
        raise ValueError("There are too few observations.")
    if np.any(np.isnan(endog)):
        raise ValueError("NaN values in endogenous not allowed")
    if np.all(endog == endog[0]):
        raise ValueError("The endogenous variable is a constant")
    if (not isinstance(freq, int)) or freq <= 1:
        raise ValueError("The frequency parameter must be a integer greater than 1")
    if lambda_parameter is not None:
        if lambda_parameter < 0:
            raise ValueError("The Lambda parameter must be positive")
        if not np.all(endog > 0):
            raise ValueError("Box-Cox Transformation can be only used on positive series.")
        endog = boxcox(endog, lambda_parameter)

    max_p = max_p if max_p <= floor(len(endog) / 3) else floor(len(endog) / 3)
    max_q = max_q if max_q <= floor(len(endog) / 3) else floor(len(endog) / 3)
    max_P = max_P if max_P <= floor(len(endog) / 3 / freq) else floor(len(endog) / 3 / freq)
    max_Q = max_Q if max_Q <= floor(len(endog) / 3 / freq) else floor(len(endog) / 3 / freq)
    if stationary:
        D = 0
        d = 0
    if freq == 1:
Example #11
0
    def __init__(self, fname, data, convert_dates=None, encoding="latin-1",
                 byteorder=None):

        self._convert_dates = convert_dates
        # attach nobs, nvars, data, varlist, typlist
        if data_util._is_using_pandas(data, None):
            self._prepare_pandas(data)

        elif data_util._is_array_like(data, None):
            data = np.asarray(data)
            if data_util._is_structured_ndarray(data):
                self._prepare_structured_array(data)
            else:
                if convert_dates is not None:
                    raise ValueError("Not able to convert dates in a plain"
                                     " ndarray.")
                self._prepare_ndarray(data)

        else: # pragma : no cover
            raise ValueError("Type %s for data not understood" % type(data))


        if byteorder is None:
            byteorder = sys.byteorder
        self._byteorder = _set_endianness(byteorder)
        self._encoding = encoding
        self._file = get_file_obj(fname, 'wb', encoding)
Example #12
0
    def __init__(self, fname, data, convert_dates=None, encoding="latin-1",
                 byteorder=None):

        self._convert_dates = convert_dates
        # attach nobs, nvars, data, varlist, typlist
        if data_util._is_using_pandas(data, None):
            self._prepare_pandas(data)

        elif data_util._is_array_like(data, None):
            data = np.asarray(data)
            if data_util._is_structured_ndarray(data):
                self._prepare_structured_array(data)
            else:
                if convert_dates is not None:
                    raise ValueError("Not able to convert dates in a plain"
                                     " ndarray.")
                self._prepare_ndarray(data)

        else: # pragma : no cover
            raise ValueError("Type %s for data not understood" % type(data))


        if byteorder is None:
            byteorder = sys.byteorder
        self._byteorder = _set_endianness(byteorder)
        self._encoding = encoding
        self._file = _open_file_binary_write(fname, encoding)
Example #13
0
def add_constant(data, prepend=True, has_constant='skip'):
    """
    Add a column of ones to an array.

    Parameters
    ----------
    data : array_like
        A column-ordered design matrix.
    prepend : bool
        If true, the constant is in the first column.  Else the constant is
        appended (last column).
    has_constant : str {'raise', 'add', 'skip'}
        Behavior if ``data`` already has a constant. The default will return
        data without adding another constant. If 'raise', will raise an
        error if any column has a constant value. Using 'add' will add a
        column of 1s if a constant column is present.

    Returns
    -------
    array_like
        The original values with a constant (column of ones) as the first or
        last column. Returned value type depends on input type.

    Notes
    -----
    When the input is a pandas Series or DataFrame, the added column's name
    is 'const'.
    """
    if _is_using_pandas(data, None):
        from statsmodels.tsa.tsatools import add_trend
        return add_trend(data,
                         trend='c',
                         prepend=prepend,
                         has_constant=has_constant)

    # Special case for NumPy
    x = np.asarray(data)
    ndim = x.ndim
    if ndim == 1:
        x = x[:, None]
    elif x.ndim > 2:
        raise ValueError('Only implemented for 2-dimensional arrays')

    is_nonzero_const = np.ptp(x, axis=0) == 0
    is_nonzero_const &= np.all(x != 0.0, axis=0)
    if is_nonzero_const.any():
        if has_constant == 'skip':
            return x
        elif has_constant == 'raise':
            if ndim == 1:
                raise ValueError("data is constant.")
            else:
                columns = np.arange(x.shape[1])
                cols = ",".join([str(c) for c in columns[is_nonzero_const]])
                raise ValueError(f"Column(s) {cols} are constant.")

    x = [np.ones(x.shape[0]), x]
    x = x if prepend else x[::-1]
    return np.column_stack(x)
Example #14
0
def add_constant(data, prepend=True, has_constant='skip'):
    '''
    This appends a column of ones to an array if prepend==False.

    Parameters
    ----------
    data : array-like
        `data` is the column-ordered design matrix
    prepend : bool
        True and the constant is prepended rather than appended.
    has_constant : str {'raise', 'add', 'skip'}
        Behavior if ``data'' already has a constant. The default will return
        data without adding another constant. If 'raise', will raise an
        error if a constant is present. Using 'add' will duplicate the
        constant, if one is present. Has no effect for structured or
        recarrays. There is no checking for a constant in this case.

    Returns
    -------
    data : array
        The original array with a constant (column of ones) as the first or
        last column.
    '''
    if _is_using_pandas(data, None):
        # work on a copy
        return _pandas_add_constant(data.copy(), prepend, has_constant)
    else:
        data = np.asarray(data)
    if not data.dtype.names:
        var0 = data.var(0) == 0
        if np.any(var0):
            if has_constant == 'raise':
                raise ValueError("data already contains a constant.")
            elif has_constant == 'skip':
                return data
            elif has_constant == 'add':
                pass
            else:
                raise ValueError("Option {0} not understood for "
                                 "has_constant.".format(has_constant))
        data = np.column_stack((data, np.ones((data.shape[0], 1))))
        if prepend:
            return np.roll(data, 1, 1)
    else:
        return_rec = data.__class__ is np.recarray
        if prepend:
            ones = np.ones((data.shape[0], 1), dtype=[('const', float)])
            data = nprf.append_fields(ones,
                                      data.dtype.names,
                                      [data[i] for i in data.dtype.names],
                                      usemask=False,
                                      asrecarray=return_rec)
        else:
            data = nprf.append_fields(data,
                                      'const',
                                      np.ones(data.shape[0]),
                                      usemask=False,
                                      asrecarray=return_rec)
    return data
Example #15
0
def _maybe_get_pandas_wrapper_freq(X, trim=None):
    if _is_using_pandas(X, None):
        index = X.index
        func = _get_pandas_wrapper(X, trim)
        freq = index.inferred_freq
        return func, freq
    else:
        return lambda x : x, None
Example #16
0
def _maybe_get_pandas_wrapper_freq(X, trim=None):
    if _is_using_pandas(X, None):
        index = X.index
        func = _get_pandas_wrapper(X, trim)
        freq = index.inferred_freq
        return func, freq
    else:
        return lambda x : x, None
Example #17
0
def add_constant(data, prepend=True, has_constant='skip'):
    """
    Add a column of ones to an array.

    Parameters
    ----------
    data : array_like
        A column-ordered design matrix.
    prepend : bool
        If true, the constant is in the first column.  Else the constant is
        appended (last column).
    has_constant : str {'raise', 'add', 'skip'}
        Behavior if ``data`` already has a constant. The default will return
        data without adding another constant. If 'raise', will raise an
        error if a constant is present. Using 'add' will duplicate the
        constant, if one is present.

    Returns
    -------
    array_like
        The original values with a constant (column of ones) as the first or
        last column. Returned value type depends on input type.

    Notes
    -----
    When the input is recarray or a pandas Series or DataFrame, the added
    column's name is 'const'.
    """
    if _is_using_pandas(data, None) or _is_recarray(data):
        if _is_recarray(data):
            # deprecated: remove recarray support after 0.12
            import warnings
            from statsmodels.tools.sm_exceptions import recarray_warning
            warnings.warn(recarray_warning, FutureWarning)
        from statsmodels.tsa.tsatools import add_trend
        return add_trend(data,
                         trend='c',
                         prepend=prepend,
                         has_constant=has_constant)

    # Special case for NumPy
    x = np.asanyarray(data)
    if x.ndim == 1:
        x = x[:, None]
    elif x.ndim > 2:
        raise ValueError('Only implementd 2-dimensional arrays')

    is_nonzero_const = np.ptp(x, axis=0) == 0
    is_nonzero_const &= np.all(x != 0.0, axis=0)
    if is_nonzero_const.any():
        if has_constant == 'skip':
            return x
        elif has_constant == 'raise':
            raise ValueError("data already contains a constant")

    x = [np.ones(x.shape[0]), x]
    x = x if prepend else x[::-1]
    return np.column_stack(x)
Example #18
0
    def new_func(X, *args, **kwargs):
        # quick pass-through for do nothing case
        if not _is_using_pandas(X, None):
            return func(X, *args, **kwargs)

        wrapper_func = _get_pandas_wrapper(X, trim_head, trim_tail, names)
        ret = func(X, *args, **kwargs)
        ret = wrapper_func(ret)
        return ret
Example #19
0
    def new_func(X, *args, **kwargs):
        # quick pass-through for do nothing case
        if not _is_using_pandas(X, None):
            return func(X, *args, **kwargs)

        wrapper_func = _get_pandas_wrapper(X, trim_head, trim_tail, names)
        ret = func(X, *args, **kwargs)
        ret = wrapper_func(ret)
        return ret
Example #20
0
def diff(series, k_diff=1, k_seasonal_diff=None, k_seasons=1):
    r"""
    Difference a series simply and/or seasonally along the zero-th axis.

    Given a series (denoted :math:`y_t`), performs the differencing operation

    .. math::

        \Delta^d \Delta_s^D y_t

    where :math:`d =` `diff`, :math:`s =` `k_seasons`,
    :math:`D =` `seasonal\_diff`, and :math:`\Delta` is the difference
    operator.

    Parameters
    ----------
    series : array_like
        The series to be differenced.
    diff : int, optional
        The number of simple differences to perform. Default is 1.
    seasonal_diff : int or None, optional
        The number of seasonal differences to perform. Default is no seasonal
        differencing.
    k_seasons : int, optional
        The seasonal lag. Default is 1. Unused if there is no seasonal
        differencing.

    Returns
    -------
    differenced : array
        The differenced array.
    """
    pandas = _is_using_pandas(series, None)
    differenced = np.asanyarray(series) if not pandas else series

    # Seasonal differencing
    if k_seasonal_diff is not None:
        while k_seasonal_diff > 0:
            if not pandas:
                differenced = (
                    differenced[k_seasons:] - differenced[:-k_seasons]
                )
            else:
                differenced = differenced.diff(k_seasons)[k_seasons:]
            k_seasonal_diff -= 1

    # Simple differencing
    if not pandas:
        differenced = np.diff(differenced, k_diff, axis=0)
    else:
        while k_diff > 0:
            differenced = differenced.diff()[1:]
            k_diff -= 1
    return differenced
Example #21
0
def _maybe_get_pandas_wrapper(X, trim_head=None, trim_tail=None):
    """
    If using pandas returns a function to wrap the results, e.g., wrapper(X)
    trim is an integer for the symmetric truncation of the series in some
    filters.
    otherwise returns None
    """
    if _is_using_pandas(X, None):
        return _get_pandas_wrapper(X, trim_head, trim_tail)
    else:
        return lambda x : x
Example #22
0
def diff(series, k_diff=1, k_seasonal_diff=None, seasonal_periods=1):
    r"""
    Difference a series simply and/or seasonally along the zero-th axis.

    Given a series (denoted :math:`y_t`), performs the differencing operation

    .. math::

        \Delta^d \Delta_s^D y_t

    where :math:`d =` `diff`, :math:`s =` `seasonal_periods`,
    :math:`D =` `seasonal\_diff`, and :math:`\Delta` is the difference
    operator.

    Parameters
    ----------
    series : array_like
        The series to be differenced.
    diff : int, optional
        The number of simple differences to perform. Default is 1.
    seasonal_diff : int or None, optional
        The number of seasonal differences to perform. Default is no seasonal
        differencing.
    seasonal_periods : int, optional
        The seasonal lag. Default is 1. Unused if there is no seasonal
        differencing.

    Returns
    -------
    differenced : array
        The differenced array.
    """
    pandas = _is_using_pandas(series, None)
    differenced = np.asanyarray(series) if not pandas else series

    # Seasonal differencing
    if k_seasonal_diff is not None:
        while k_seasonal_diff > 0:
            if not pandas:
                differenced = (
                    differenced[seasonal_periods:] - differenced[:-seasonal_periods]
                )
            else:
                differenced = differenced.diff(seasonal_periods)[seasonal_periods:]
            k_seasonal_diff -= 1

    # Simple differencing
    if not pandas:
        differenced = np.diff(differenced, k_diff, axis=0)
    else:
        while k_diff > 0:
            differenced = differenced.diff()[1:]
            k_diff -= 1
    return differenced
Example #23
0
def _maybe_get_pandas_wrapper(X, trim_head=None, trim_tail=None):
    """
    If using pandas returns a function to wrap the results, e.g., wrapper(X)
    trim is an integer for the symmetric truncation of the series in some
    filters.
    otherwise returns None
    """
    if _is_using_pandas(X, None):
        return _get_pandas_wrapper(X, trim_head, trim_tail)
    else:
        return lambda x : x
Example #24
0
def add_constant(data, prepend=True, has_constant='skip'):
    '''
    This appends a column of ones to an array if prepend==False.

    Parameters
    ----------
    data : array-like
        `data` is the column-ordered design matrix
    prepend : bool
        True and the constant is prepended rather than appended.
    has_constant : str {'raise', 'add', 'skip'}
        Behavior if ``data'' already has a constant. The default will return
        data without adding another constant. If 'raise', will raise an
        error if a constant is present. Using 'add' will duplicate the
        constant, if one is present. Has no effect for structured or
        recarrays. There is no checking for a constant in this case.

    Returns
    -------
    data : array
        The original array with a constant (column of ones) as the first or
        last column.
    '''
    if _is_using_pandas(data, None):
        # work on a copy
        return _pandas_add_constant(data.copy(), prepend, has_constant)
    else:
        data = np.asarray(data)
    if not data.dtype.names:
        var0 = data.var(0) == 0
        if np.any(var0):
            if has_constant == 'raise':
                raise ValueError("data already contains a constant.")
            elif has_constant == 'skip':
                return data
            elif has_constant == 'add':
                pass
            else:
                raise ValueError("Option {0} not understood for "
                                 "has_constant.".format(has_constant))
        data = np.column_stack((data, np.ones((data.shape[0], 1))))
        if prepend:
            return np.roll(data, 1, 1)
    else:
        return_rec = data.__class__ is np.recarray
        if prepend:
            ones = np.ones((data.shape[0], 1), dtype=[('const', float)])
            data = nprf.append_fields(ones, data.dtype.names,
                                      [data[i] for i in data.dtype.names],
                                      usemask=False, asrecarray=return_rec)
        else:
            data = nprf.append_fields(data, 'const', np.ones(data.shape[0]),
                                      usemask=False, asrecarray=return_rec)
    return data
Example #25
0
    def new_func(X, *args, **kwargs):
        # quick pass-through for do nothing case
        if not _is_using_pandas(X, None):
            return func(X, *args, **kwargs)

        wrapper_func = _get_pandas_wrapper(X, trim_head, trim_tail, columns)
        index = X.index
        freq = index.inferred_freq
        kwargs.update({freq_kw: freq_to_period(freq)})
        ret = func(X, *args, **kwargs)
        ret = wrapper_func(ret)
        return ret
Example #26
0
    def new_func(X, *args, **kwargs):
        # quick pass-through for do nothing case
        if not _is_using_pandas(X, None):
            return func(X, *args, **kwargs)

        wrapper_func = _get_pandas_wrapper(X, trim_head, trim_tail, columns)
        index = X.index
        freq = index.inferred_freq
        kwargs.update({freq_kw: freq_to_period(freq)})
        ret = func(X, *args, **kwargs)
        ret = wrapper_func(ret)
        return ret
Example #27
0
def add_constant(data, prepend=True):
    '''
    This appends a column of ones to an array if prepend==False.

    For ndarrays and pandas.DataFrames, checks to make sure a constant is not
    already included. If there is at least one column of ones then the
    original object is returned.  Does not check for a constant if a structured
    or recarray is
    given.

    Parameters
    ----------
    data : array-like
        `data` is the column-ordered design matrix
    prepend : bool
        True and the constant is prepended rather than appended.

    Returns
    -------
    data : array
        The original array with a constant (column of ones) as the first or
        last column.
    '''
    if _is_using_pandas(data, None):
        # work on a copy
        return _pandas_add_constant(data.copy(), prepend)
    else:
        data = np.asarray(data)
    if not data.dtype.names:
        var0 = data.var(0) == 0
        if np.any(var0):
            return data
        data = np.column_stack((data, np.ones((data.shape[0], 1))))
        if prepend:
            return np.roll(data, 1, 1)
    else:
        return_rec = data.__class__ is np.recarray
        if prepend:
            ones = np.ones((data.shape[0], 1), dtype=[('const', float)])
            data = nprf.append_fields(ones,
                                      data.dtype.names,
                                      [data[i] for i in data.dtype.names],
                                      usemask=False,
                                      asrecarray=return_rec)
        else:
            data = nprf.append_fields(data,
                                      'const',
                                      np.ones(data.shape[0]),
                                      usemask=False,
                                      asrecarray=return_rec)
    return data
Example #28
0
def handle_formula_data(Y, X, formula, depth=0):
    """
    Returns endog, exog, and the model specification from arrays and formula

    Parameters
    ----------
    Y : array-like
        Either endog (the LHS) of a model specification or all of the data.
        Y must define __getitem__ for now.
    X : array-like
        Either exog or None. If all the data for the formula is provided in
        Y then you must explicitly set X to None.
    formula : str or patsy.model_desc
        You can pass a handler by import formula_handler and adding a
        key-value pair where the key is the formula object class and
        the value is a function that returns endog, exog, formula object

    Returns
    -------
    endog : array-like
        Should preserve the input type of Y,X
    exog : array-like
        Should preserve the input type of Y,X. Could be None.
    """
    # half ass attempt to handle other formula objects
    if isinstance(formula, tuple(iterkeys(formula_handler))):
        return formula_handler[type(formula)]

    if X is not None:
        if data_util._is_using_pandas(Y, X):
            return dmatrices(formula, (Y, X), 2, return_type='dataframe')
        else:
            return dmatrices(formula, (Y, X), 2, return_type='dataframe')
    else:
        if data_util._is_using_pandas(Y, None):
            return dmatrices(formula, Y, 2, return_type='dataframe')
        else:
            return dmatrices(formula, Y, 2, return_type='dataframe')
Example #29
0
    def __init__(self, endog, exog, **kwargs):
        # Standardize data
        if not _is_using_pandas(endog, None):
            endog = np.asanyarray(endog)

        exog_is_using_pandas = _is_using_pandas(exog, None)
        if not exog_is_using_pandas:
            exog = np.asarray(exog)

        # Make sure we have 2-dimensional array
        if exog.ndim == 1:
            if not exog_is_using_pandas:
                exog = exog[:, None]
            else:
                exog = pd.DataFrame(exog)

        self.k_exog = exog.shape[1]

        # Handle coefficient initialization
        # By default, do not calculate likelihood while it is controlled by
        # diffuse initial conditions.
        kwargs.setdefault('loglikelihood_burn', self.k_exog)
        kwargs.setdefault('initialization', 'approximate_diffuse')
        kwargs.setdefault('initial_variance', 1e9)

        # Initialize the state space representation
        super(RecursiveLS, self).__init__(endog,
                                          k_states=self.k_exog,
                                          exog=exog,
                                          **kwargs)

        # Setup the state space representation
        self['design'] = self.exog[:, :, None].T
        self['transition'] = np.eye(self.k_states)

        # Notice that the filter output does not depend on the measurement
        # variance, so we set it here to 1
        self['obs_cov', 0, 0] = 1.
Example #30
0
def add_constant(data, prepend=True, has_constant='skip'):
    """
    Adds a column of ones to an array

    Parameters
    ----------
    data : array-like
        `data` is the column-ordered design matrix
    prepend : bool
        If true, the constant is in the first column.  Else the constant is
        appended (last column).
    has_constant : str {'raise', 'add', 'skip'}
        Behavior if ``data'' already has a constant. The default will return
        data without adding another constant. If 'raise', will raise an
        error if a constant is present. Using 'add' will duplicate the
        constant, if one is present.

    Returns
    -------
    data : array, recarray or DataFrame
        The original values with a constant (column of ones) as the first or
        last column. Returned value depends on input type.

    Notes
    -----
    When the input is recarray or a pandas Series or DataFrame, the added
    column's name is 'const'.
    """
    if _is_using_pandas(data, None) or _is_recarray(data):
        from statsmodels.tsa.tsatools import add_trend
        return add_trend(data, trend='c', prepend=prepend, has_constant=has_constant)

    # Special case for NumPy
    x = np.asanyarray(data)
    if x.ndim == 1:
        x = x[:,None]
    elif x.ndim > 2:
        raise ValueError('Only implementd 2-dimensional arrays')

    is_nonzero_const = np.ptp(x, axis=0) == 0
    is_nonzero_const &= np.all(x != 0.0, axis=0)
    if is_nonzero_const.any():
        if has_constant == 'skip':
            return x
        elif has_constant == 'raise':
            raise ValueError("data already contains a constant")

    x = [np.ones(x.shape[0]), x]
    x = x if prepend else x[::-1]
    return np.column_stack(x)
Example #31
0
    def __init__(self, endog, exog, **kwargs):
        # Standardize data
        if not _is_using_pandas(endog, None):
            endog = np.asanyarray(endog)

        exog_is_using_pandas = _is_using_pandas(exog, None)
        if not exog_is_using_pandas:
            exog = np.asarray(exog)

        # Make sure we have 2-dimensional array
        if exog.ndim == 1:
            if not exog_is_using_pandas:
                exog = exog[:, None]
            else:
                exog = pd.DataFrame(exog)

        self.k_exog = exog.shape[1]

        # Handle coefficient initialization
        # By default, do not calculate likelihood while it is controlled by
        # diffuse initial conditions.
        kwargs.setdefault('loglikelihood_burn', self.k_exog)
        kwargs.setdefault('initialization', 'approximate_diffuse')
        kwargs.setdefault('initial_variance', 1e9)

        # Initialize the state space representation
        super(RecursiveLS, self).__init__(
            endog, k_states=self.k_exog, exog=exog, **kwargs
        )

        # Setup the state space representation
        self['design'] = self.exog[:, :, None].T
        self['transition'] = np.eye(self.k_states)

        # Notice that the filter output does not depend on the measurement
        # variance, so we set it here to 1
        self['obs_cov', 0, 0] = 1.
Example #32
0
def handle_data_class_factory(endog, exog):
    """
    Given inputs
    """
    if data_util._is_using_ndarray_type(endog, exog):
        klass = ModelData
    elif data_util._is_using_pandas(endog, exog):
        klass = PandasData
    elif data_util._is_using_patsy(endog, exog):
        klass = PatsyData
    # keep this check last
    elif data_util._is_using_ndarray(endog, exog):
        klass = ModelData
    else:
        raise ValueError("unrecognized data structures: %s / %s" % (type(endog), type(exog)))
    return klass
Example #33
0
def _check_period_index(x, freq="M"):
    try:
        from pandas import PeriodIndex, DatetimeIndex
    except ImportError:  # not sure min. version
        PeriodIndex = DatetimeIndex  # HACK
    from statsmodels.tools.data import _is_using_pandas
    if not _is_using_pandas(x, None):
        raise ValueError("x must be a pandas object")
    if not isinstance(x.index, (DatetimeIndex, PeriodIndex)):
        raise ValueError("The index must be a DatetimeIndex or PeriodIndex")

    from statsmodels.tsa.base.datetools import _infer_freq
    inferred_freq = _infer_freq(x.index)
    if not inferred_freq.startswith(freq):
        raise ValueError("Expected frequency {}. Got {}".format(
            inferred_freq, freq))
Example #34
0
def _check_period_index(x, freq="M"):
    try:
        from pandas import PeriodIndex, DatetimeIndex
    except ImportError: # not sure min. version
        PeriodIndex = DatetimeIndex # HACK
    from statsmodels.tools.data import _is_using_pandas
    if not _is_using_pandas(x, None):
        raise ValueError("x must be a pandas object")
    if not isinstance(x.index, (DatetimeIndex, PeriodIndex)):
        raise ValueError("The index must be a DatetimeIndex or PeriodIndex")

    from statsmodels.tsa.base.datetools import _infer_freq
    inferred_freq = _infer_freq(x.index)
    if not inferred_freq.startswith(freq):
        raise ValueError("Expected frequency {}. Got {}".format(inferred_freq,
                                                                freq))
Example #35
0
def _maybe_get_pandas_wrapper(X, trim=None):
    """
    If using pandas returns a function to wrap the results, e.g., wrapper(X)
    trim is an integer for the symmetric truncation of the series in some
    filters.
    otherwise returns None
    """
    if _is_using_pandas(X, None):
        index = X.index
        if trim is not None:
            index = X.index[trim:-trim]
        if hasattr(X, "columns"):
            return lambda x: X.__class__(x, index=index, columns=X.columns)
        else:
            return lambda x: X.__class__(x, index=index, name=X.name)
    else:
        return
Example #36
0
def _transform_predict_exog(model, exog, design_info=None):
    """transform exog for predict using design_info

    Note: this is copied from base.model.Results.predict and converted to
    standalone function with additional options.


    """

    is_pandas = _is_using_pandas(exog, None)

    exog_index = exog.index if is_pandas else None

    if design_info is None:
        design_info = getattr(model.data, 'design_info', None)

    if design_info is not None and (exog is not None):
        from patsy import dmatrix
        if isinstance(exog, pd.Series):
            # we are guessing whether it should be column or row
            if (hasattr(exog, 'name') and isinstance(exog.name, str) and
                    exog.name in design_info.describe()):
                # assume we need one column
                exog = pd.DataFrame(exog)
            else:
                # assume we need a row
                exog = pd.DataFrame(exog).T
        orig_exog_len = len(exog)
        is_dict = isinstance(exog, dict)
        exog = dmatrix(design_info, exog, return_type="dataframe")
        if orig_exog_len > len(exog) and not is_dict:
            import warnings
            if exog_index is None:
                warnings.warn('nan values have been dropped', ValueWarning)
            else:
                exog = exog.reindex(exog_index)
        exog_index = exog.index

    if exog is not None:
        exog = np.asarray(exog)
        if exog.ndim == 1 and (model.exog.ndim == 1 or
                               model.exog.shape[1] == 1):
            exog = exog[:, None]
        exog = np.atleast_2d(exog)  # needed in count model shape[1]

    return exog, exog_index
def _transform_predict_exog(model, exog, design_info=None):
    """transform exog for predict using design_info

    Note: this is copied from base.model.Results.predict and converted to
    standalone function with additional options.


    """

    is_pandas = _is_using_pandas(exog, None)

    exog_index = exog.index if is_pandas else None

    if design_info is None:
        design_info = getattr(model.data, 'design_info', None)

    if design_info is not None and (exog is not None):
        from patsy import dmatrix
        if isinstance(exog, pd.Series):
            # we are guessing whether it should be column or row
            if (hasattr(exog, 'name') and isinstance(exog.name, str) and
                    exog.name in design_info.describe()):
                # assume we need one column
                exog = pd.DataFrame(exog)
            else:
                # assume we need a row
                exog = pd.DataFrame(exog).T
        orig_exog_len = len(exog)
        is_dict = isinstance(exog, dict)
        exog = dmatrix(design_info, exog, return_type="dataframe")
        if orig_exog_len > len(exog) and not is_dict:
            import warnings
            if exog_index is None:
                warnings.warn('nan values have been dropped', ValueWarning)
            else:
                exog = exog.reindex(exog_index)
        exog_index = exog.index

    if exog is not None:
        exog = np.asarray(exog)
        if exog.ndim == 1 and (model.exog.ndim == 1 or
                               model.exog.shape[1] == 1):
            exog = exog[:, None]
        exog = np.atleast_2d(exog)  # needed in count model shape[1]

    return exog, exog_index
Example #38
0
def handle_data_class_factory(endog, exog):
    """
    Given inputs
    """
    if data_util._is_using_ndarray_type(endog, exog):
        klass = ModelData
    elif data_util._is_using_pandas(endog, exog):
        klass = PandasData
    elif data_util._is_using_patsy(endog, exog):
        klass = PatsyData
    # keep this check last
    elif data_util._is_using_ndarray(endog, exog):
        klass = ModelData
    else:
        raise ValueError('unrecognized data structures: %s / %s' %
                         (type(endog), type(exog)))
    return klass
Example #39
0
def add_constant(data, prepend=True):
    '''
    This appends a column of ones to an array if prepend==False.

    For ndarrays and pandas.DataFrames, checks to make sure a constant is not
    already included. If there is at least one column of ones then the
    original object is returned.  Does not check for a constant if a structured
    or recarray is
    given.

    Parameters
    ----------
    data : array-like
        `data` is the column-ordered design matrix
    prepend : bool
        True and the constant is prepended rather than appended.

    Returns
    -------
    data : array
        The original array with a constant (column of ones) as the first or
        last column.
    '''
    if _is_using_pandas(data, None):
        # work on a copy
        return _pandas_add_constant(data.copy(), prepend)
    else:
        data = np.asarray(data)
    if not data.dtype.names:
        var0 = data.var(0) == 0
        if np.any(var0):
            return data
        data = np.column_stack((data, np.ones((data.shape[0], 1))))
        if prepend:
            return np.roll(data, 1, 1)
    else:
        return_rec = data.__class__ is np.recarray
        if prepend:
            ones = np.ones((data.shape[0], 1), dtype=[('const', float)])
            data = nprf.append_fields(ones, data.dtype.names,
                                      [data[i] for i in data.dtype.names],
                                      usemask=False, asrecarray=return_rec)
        else:
            data = nprf.append_fields(data, 'const', np.ones(data.shape[0]),
                                      usemask=False, asrecarray=return_rec)
    return data
Example #40
0
def _ensure_2d(x, ndarray=False):
    """

    Parameters
    ----------
    x : array, Series, DataFrame or None
        Input to verify dimensions, and to transform as necesary
    ndarray : bool
        Flag indicating whether to always return a NumPy array. Setting False
        will return an pandas DataFrame when the input is a Series or a
        DataFrame.

    Returns
    -------
    out : array, DataFrame or None
        array or DataFrame with 2 dimensiona.  One dimensional arrays are
        returned as nobs by 1. None is returned if x is None.
    names : list of str or None
        list containing variables names when the input is a pandas datatype.
        Returns None if the input is an ndarray.

    Notes
    -----
    Accepts None for simplicity
    """
    if x is None:
        return x
    is_pandas = _is_using_pandas(x, None)
    if x.ndim == 2:
        if is_pandas:
            return x, x.columns
        else:
            return x, None
    elif x.ndim > 2:
        raise ValueError('x mst be 1 or 2-dimensional.')

    name = x.name if is_pandas else None
    if ndarray:
        return np.asarray(x)[:, None], name
    else:
        return pd.DataFrame(x), name
Example #41
0
def _ensure_2d(x, ndarray=False):
    """

    Parameters
    ----------
    x : array, Series, DataFrame or None
        Input to verify dimensions, and to transform as necesary
    ndarray : bool
        Flag indicating whether to always return a NumPy array. Setting False
        will return an pandas DataFrame when the input is a Series or a
        DataFrame.

    Returns
    -------
    out : array, DataFrame or None
        array or DataFrame with 2 dimensiona.  One dimensional arrays are
        returned as nobs by 1. None is returned if x is None.
    names : list of str or None
        list containing variables names when the input is a pandas datatype.
        Returns None if the input is an ndarray.

    Notes
    -----
    Accepts None for simplicity
    """
    if x is None:
        return x
    is_pandas = _is_using_pandas(x, None)
    if x.ndim == 2:
        if is_pandas:
            return x, x.columns
        else:
            return x, None
    elif x.ndim > 2:
        raise ValueError('x mst be 1 or 2-dimensional.')

    name = x.name if is_pandas else None
    if ndarray:
        return np.asarray(x)[:, None], name
    else:
        return pd.DataFrame(x), name
Example #42
0
def handle_data(endog, exog, missing="none", hasconst=None, **kwargs):
    """
    Given inputs
    """
    # deal with lists and tuples up-front
    if isinstance(endog, (list, tuple)):
        endog = np.asarray(endog)
    if isinstance(exog, (list, tuple)):
        exog = np.asarray(exog)

    if data_util._is_using_ndarray_type(endog, exog):
        klass = ModelData
    elif data_util._is_using_pandas(endog, exog):
        klass = PandasData
    elif data_util._is_using_patsy(endog, exog):
        klass = PatsyData
    # keep this check last
    elif data_util._is_using_ndarray(endog, exog):
        klass = ModelData
    else:
        raise ValueError("unrecognized data structures: %s / %s" % (type(endog), type(exog)))

    return klass(endog, exog=exog, missing=missing, hasconst=hasconst, **kwargs)
Example #43
0
def handle_data(endog, exog):
    """
    Given inputs
    """
    # deal with lists and tuples up-front
    if isinstance(endog, (list, tuple)):
        endog = np.asarray(endog)
    if isinstance(exog, (list, tuple)):
        exog = np.asarray(exog)

    if data_util._is_using_pandas(endog, exog):
        klass = PandasData
    elif data_util._is_using_larry(endog, exog):
        klass = LarryData
    elif data_util._is_using_timeseries(endog, exog):
        klass = TimeSeriesData
    # keep this check last
    elif data_util._is_using_ndarray(endog, exog):
        klass = ModelData
    else:
        raise ValueError('unrecognized data structures: %s / %s' %
                         (type(endog), type(exog)))

    return klass(endog, exog=exog)
Example #44
0
    def __init__(self, endog, exog, constraints=None, **kwargs):
        # Standardize data
        endog_using_pandas = _is_using_pandas(endog, None)
        if not endog_using_pandas:
            endog = np.asanyarray(endog)

        exog_is_using_pandas = _is_using_pandas(exog, None)
        if not exog_is_using_pandas:
            exog = np.asarray(exog)

        # Make sure we have 2-dimensional array
        if exog.ndim == 1:
            if not exog_is_using_pandas:
                exog = exog[:, None]
            else:
                exog = pd.DataFrame(exog)

        self.k_exog = exog.shape[1]

        # Handle constraints
        self.k_constraints = 0
        self._r_matrix = self._q_matrix = None
        if constraints is not None:
            from patsy import DesignInfo
            from statsmodels.base.data import handle_data
            data = handle_data(endog, exog, **kwargs)
            names = data.param_names
            LC = DesignInfo(names).linear_constraint(constraints)
            self._r_matrix, self._q_matrix = LC.coefs, LC.constants
            self.k_constraints = self._r_matrix.shape[0]

            constraint_endog = np.zeros((len(endog), len(self._r_matrix)))
            if endog_using_pandas:
                constraint_endog = pd.DataFrame(constraint_endog,
                                                index=endog.index)
                endog = concat([endog, constraint_endog], axis=1)
                endog.values[:, 1:] = self._q_matrix[:, 0]
            else:
                endog[:, 1:] = self._q_matrix[:, 0]

        # Handle coefficient initialization
        kwargs.setdefault('initialization', 'diffuse')

        # Initialize the state space representation
        super(RecursiveLS, self).__init__(
            endog, k_states=self.k_exog, exog=exog, **kwargs)

        # Use univariate filtering by default
        self.ssm.filter_univariate = True

        # Concentrate the scale out of the likelihood function
        self.ssm.filter_concentrated = True

        # Setup the state space representation
        self['design'] = np.zeros((self.k_endog, self.k_states, self.nobs))
        self['design', 0] = self.exog[:, :, None].T
        if self._r_matrix is not None:
            self['design', 1:, :] = self._r_matrix[:, :, None]
        self['transition'] = np.eye(self.k_states)

        # Notice that the filter output does not depend on the measurement
        # variance, so we set it here to 1
        self['obs_cov', 0, 0] = 1.
        self['transition'] = np.eye(self.k_states)

        # Linear constraints are technically imposed by adding "fake" endog
        # variables that are used during filtering, but for all model- and
        # results-based purposes we want k_endog = 1.
        if self._r_matrix is not None:
            self.k_endog = 1
Example #45
0
    def __init__(self,
                 endog,
                 exog=None,
                 order=(0, 0, 0),
                 seasonal_order=(0, 0, 0, 0),
                 trend=None,
                 enforce_stationarity=True,
                 enforce_invertibility=True,
                 concentrate_scale=False,
                 trend_offset=1,
                 dates=None,
                 freq=None,
                 missing='none',
                 validate_specification=True):
        # Default for trend
        # 'c' if there is no integration and 'n' otherwise
        # TODO: if trend='c', then we could alternatively use `demean=True` in
        # the estimation methods rather than setting up `exog` and using GLS.
        # Not sure if it's worth the trouble though.
        integrated = order[1] > 0 or seasonal_order[1] > 0
        if trend is None and not integrated:
            trend = 'c'
        elif trend is None:
            trend = 'n'

        # Construct the specification
        # (don't pass specific values of enforce stationarity/invertibility,
        # because we don't actually want to restrict the estimators based on
        # this criteria. Instead, we'll just make sure that the parameter
        # estimates from those methods satisfy the criteria.)
        self._spec_arima = SARIMAXSpecification(
            endog,
            exog=exog,
            order=order,
            seasonal_order=seasonal_order,
            trend=trend,
            enforce_stationarity=None,
            enforce_invertibility=None,
            concentrate_scale=concentrate_scale,
            trend_offset=trend_offset,
            dates=dates,
            freq=freq,
            missing=missing,
            validate_specification=validate_specification)
        exog = self._spec_arima._model.data.orig_exog

        # Raise an error if we have a constant in an integrated model

        has_trend = len(self._spec_arima.trend_terms) > 0
        if has_trend:
            lowest_trend = np.min(self._spec_arima.trend_terms)
            if lowest_trend < order[1] + seasonal_order[1]:
                raise ValueError(
                    'In models with integration (`d > 0`) or seasonal'
                    ' integration (`D > 0`), trend terms of lower order than'
                    ' `d + D` cannot be (as they would be eliminated due to'
                    ' the differencing operation). For example, a constant'
                    ' cannot be included in an ARIMA(1, 1, 1) model, but'
                    ' including a linear trend, which would have the same'
                    ' effect as fitting a constant to the differenced data,'
                    ' is allowed.')

        # Keep the given `exog` by removing the prepended trend variables
        input_exog = None
        if exog is not None:
            if _is_using_pandas(exog, None):
                input_exog = exog.iloc[:, self._spec_arima.k_trend:]
            else:
                input_exog = exog[:, self._spec_arima.k_trend:]

        # Initialize the base SARIMAX class
        # Note: we don't pass in a trend value to the base class, since ARIMA
        # standardizes the trend to always be part of exog, while the base
        # SARIMAX class puts it in the transition equation.
        super(ARIMA,
              self).__init__(endog,
                             exog,
                             trend=None,
                             order=order,
                             seasonal_order=seasonal_order,
                             enforce_stationarity=enforce_stationarity,
                             enforce_invertibility=enforce_invertibility,
                             concentrate_scale=concentrate_scale,
                             dates=dates,
                             freq=freq,
                             missing=missing,
                             validate_specification=validate_specification)
        self.trend = trend

        # Save the input exog and input exog names, so that we can refer to
        # them later (see especially `ARIMAResults.append`)
        self._input_exog = input_exog
        if exog is not None:
            self._input_exog_names = self.exog_names[self._spec_arima.k_trend:]
        else:
            self._input_exog_names = None

        # Override the public attributes for k_exog and k_trend to reflect the
        # distinction here (for the purpose of the superclass, these are both
        # combined as `k_exog`)
        self.k_exog = self._spec_arima.k_exog
        self.k_trend = self._spec_arima.k_trend

        # Remove some init kwargs that aren't used in this model
        unused = [
            'measurement_error', 'time_varying_regression', 'mle_regression',
            'simple_differencing', 'hamilton_representation'
        ]
        self._init_keys = [key for key in self._init_keys if key not in unused]
Example #46
0
def lagmat2ds(x, maxlag0, maxlagex=None, dropex=0, trim='forward',
              use_pandas=False):
    """
    Generate lagmatrix for 2d array, columns arranged by variables

    Parameters
    ----------
    x : array_like, 2d
        2d data, observation in rows and variables in columns
    maxlag0 : int
        for first variable all lags from zero to maxlag are included
    maxlagex : None or int
        max lag for all other variables all lags from zero to maxlag are included
    dropex : int (default is 0)
        exclude first dropex lags from other variables
        for all variables, except the first, lags from dropex to maxlagex are
        included
    trim : string
        * 'forward' : trim invalid observations in front
        * 'backward' : trim invalid initial observations
        * 'both' : trim invalid observations on both sides
        * 'none' : no trimming of observations
    use_pandas : bool, optional
        If true, returns a DataFrame when the input is a pandas
        Series or DataFrame.  If false, return numpy ndarrays.

    Returns
    -------
    lagmat : 2d array
        array with lagged observations, columns ordered by variable

    Notes
    -----
    Inefficient implementation for unequal lags, implemented for convenience
    """

    if maxlagex is None:
        maxlagex = maxlag0
    maxlag = max(maxlag0, maxlagex)
    is_pandas = _is_using_pandas(x, None)

    if x.ndim == 1:
        if is_pandas:
            x = pd.DataFrame(x)
        else:
            x = x[:, None]
    elif x.ndim == 0 or x.ndim > 2:
        raise ValueError('Only supports 1 and 2-dimensional data.')

    nobs, nvar = x.shape

    if is_pandas and use_pandas:
        lags = lagmat(x.iloc[:, 0], maxlag, trim=trim,
                      original='in', use_pandas=True)
        lagsli = [lags.iloc[:, :maxlag0 + 1]]
        for k in range(1, nvar):
            lags = lagmat(x.iloc[:, k], maxlag, trim=trim,
                          original='in', use_pandas=True)
            lagsli.append(lags.iloc[:, dropex:maxlagex + 1])
        return pd.concat(lagsli, axis=1)
    elif is_pandas:
        x = np.asanyarray(x)

    lagsli = [lagmat(x[:, 0], maxlag, trim=trim, original='in')[:, :maxlag0 + 1]]
    for k in range(1, nvar):
        lagsli.append(lagmat(x[:, k], maxlag, trim=trim, original='in')[:, dropex:maxlagex + 1])
    return np.column_stack(lagsli)
Example #47
0
def add_trend(x, trend="c", prepend=False, has_constant='skip'):
    """
    Adds a trend and/or constant to an array.

    Parameters
    ----------
    X : array-like
        Original array of data.
    trend : str {"c","t","ct","ctt"}
        "c" add constant only
        "t" add trend only
        "ct" add constant and linear trend
        "ctt" add constant and linear and quadratic trend.
    prepend : bool
        If True, prepends the new data to the columns of X.
    has_constant : str {'raise', 'add', 'skip'}
        Controls what happens when trend is 'c' and a constant already
        exists in X. 'raise' will raise an error. 'add' will duplicate a
        constant. 'skip' will return the data without change. 'skip' is the
        default.

    Returns
    -------
    y : array, recarray or DataFrame
        The original data with the additional trend columns.  If x is a
        recarray or pandas Series or DataFrame, then the trend column names
        are 'const', 'trend' and 'trend_squared'.

    Notes
    -----
    Returns columns as ["ctt","ct","c"] whenever applicable. There is currently
    no checking for an existing trend.

    See also
    --------
    statsmodels.tools.tools.add_constant
    """
    # TODO: could be generalized for trend of aribitrary order
    trend = trend.lower()
    columns = ['const', 'trend', 'trend_squared']
    if trend == "c":  # handles structured arrays
        columns = columns[:1]
        trendorder = 0
    elif trend == "ct" or trend == "t":
        columns = columns[:2]
        if trend == "t":
            columns = columns[1:2]
        trendorder = 1
    elif trend == "ctt":
        trendorder = 2
    else:
        raise ValueError("trend %s not understood" % trend)

    is_recarray = _is_recarray(x)
    is_pandas = _is_using_pandas(x, None) or is_recarray
    if is_pandas or is_recarray:
        if is_recarray:
            descr = x.dtype.descr
            x = pd.DataFrame.from_records(x)
        elif isinstance(x, pd.Series):
            x = pd.DataFrame(x)
        else:
            x = x.copy()
    else:
        x = np.asanyarray(x)

    nobs = len(x)
    trendarr = np.vander(np.arange(1, nobs + 1, dtype=np.float64), trendorder + 1)
    # put in order ctt
    trendarr = np.fliplr(trendarr)
    if trend == "t":
        trendarr = trendarr[:, 1]

    if "c" in trend:
        if is_pandas or is_recarray:
            # Mixed type protection
            def safe_is_const(s):
                try:
                    return np.ptp(s) == 0.0 and np.any(s != 0.0)
                except:
                    return False
            col_const = x.apply(safe_is_const, 0)
        else:
            col_const = np.logical_and(np.any(np.ptp(np.asanyarray(x), axis=0) == 0, axis=0),
                                       np.all(x != 0.0, axis=0))
        if np.any(col_const):
            if has_constant == 'raise':
                raise ValueError("x already contains a constant")
            elif has_constant == 'skip':
                columns = columns[1:]
                trendarr = trendarr[:, 1:]

    order = 1 if prepend else -1
    if is_recarray or is_pandas:
        trendarr = pd.DataFrame(trendarr, index=x.index, columns=columns)
        x = [trendarr, x]
        x = pd.concat(x[::order], 1)
    else:
        x = [trendarr, x]
        x = np.column_stack(x[::order])

    if is_recarray:
        x = x.to_records(index=False, convert_datetime64=False)
        new_descr = x.dtype.descr
        extra_col = len(new_descr) - len(descr)
        descr = new_descr[:extra_col] + descr if prepend else descr + new_descr[-extra_col:]
        x = x.astype(np.dtype(descr))

    return x
Example #48
0
def add_constant(data, prepend=False):
    '''
    This appends a column of ones to an array if prepend==False.

    For ndarrays and pandas.DataFrames, checks to make sure a constant is not
    already included. If there is at least one column of ones then the
    original object is returned.  Does not check for a constant if a structured
    or recarray is
    given.

    Parameters
    ----------
    data : array-like
        `data` is the column-ordered design matrix
    prepend : bool
        True and the constant is prepended rather than appended.

    Returns
    -------
    data : array
        The original array with a constant (column of ones) as the first or
        last column.

    Notes
    -----

    .. WARNING::
       The default of prepend will be changed to True in the next release of
       statsmodels. We recommend to use an explicit prepend in any permanent
       code.
    '''
    if not prepend:
        import inspect
        frame = inspect.currentframe().f_back
        info = inspect.getframeinfo(frame)
        try: # info.code_context is None on python 2.6? Why?
            to_warn = (info.code_context is not None and
                       'prepend' not in '\n'.join(info.code_context))
        except: # python 2.5 compatibility
            to_warn = 'prepend' not in '\n'.join(info[3])
        if to_warn:
            import warnings
            warnings.warn("The default of `prepend` will be changed to True "
                          "in 0.5.0, use explicit prepend",
                          FutureWarning)

    if _is_using_pandas(data, None):
        # work on a copy
        return _pandas_add_constant(data.copy(), prepend)
    else:
        data = np.asarray(data)
    if not data.dtype.names:
        var0 = data.var(0) == 0
        if np.any(var0):
            return data
        data = np.column_stack((data, np.ones((data.shape[0], 1))))
        if prepend:
            return np.roll(data, 1, 1)
    else:
        return_rec = data.__class__ is np.recarray
        if prepend:
            ones = np.ones((data.shape[0], 1), dtype=[('const', float)])
            data = nprf.append_fields(ones, data.dtype.names, [data[i] for
                i in data.dtype.names], usemask=False, asrecarray=return_rec)
        else:
            data = nprf.append_fields(data, 'const', np.ones(data.shape[0]),
                    usemask=False, asrecarray = return_rec)
    return data
    def __init__(self, endog, exog=None, smoother=None, alpha=0, family=None,
                 offset=None, exposure=None, missing='none', **kwargs):

        # TODO: check usage of hasconst
        hasconst = kwargs.get('hasconst', None)
        xnames_linear = None
        if hasattr(exog, 'design_info'):
            self.design_info_linear = exog.design_info
            xnames_linear = self.design_info_linear.column_names

        is_pandas = _is_using_pandas(exog, None)

        # TODO: handle data is experimental, see #5469
        # This is a bit wasteful because we need to `handle_data twice`
        self.data_linear = self._handle_data(endog, exog, missing, hasconst)
        if xnames_linear is None:
            xnames_linear = self.data_linear.xnames
        if exog is not None:
            exog_linear = np.asarray(exog)
            k_exog_linear = exog_linear.shape[1]
        else:
            exog_linear = None
            k_exog_linear = 0
        self.k_exog_linear = k_exog_linear
        # We need exog_linear for k-fold cross validation
        # TODO: alternative is to take columns from combined exog
        self.exog_linear = exog_linear

        self.smoother = smoother
        self.k_smooths = smoother.k_variables
        self.alpha = self._check_alpha(alpha)
        penal = MultivariateGamPenalty(smoother, alpha=self.alpha,
                                       start_idx=k_exog_linear)
        kwargs.pop('penal', None)
        if exog_linear is not None:
            exog = np.column_stack((exog_linear, smoother.basis))
        else:
            exog = smoother.basis

        # TODO: check: xnames_linear will be None instead of empty list
        #       if no exog_linear
        # can smoother be empty ? I guess not allowed.
        if xnames_linear is None:
            xnames_linear = []
        xnames = xnames_linear + self.smoother.col_names

        if is_pandas and exog_linear is not None:
            # we a dataframe so we can get a PandasData instance for wrapping
            exog = pd.DataFrame(exog, index=self.data_linear.row_labels,
                                columns=xnames)

        super(GLMGam, self).__init__(endog, exog=exog, family=family,
                                     offset=offset, exposure=exposure,
                                     penal=penal, missing=missing, **kwargs)

        if not is_pandas:
            # set exog nanmes if not given by pandas DataFrame
            self.exog_names[:] = xnames

        # TODO: the generic data handling might attach the design_info from the
        #       linear part, but this is incorrect for the full model and
        #       causes problems in wald_test_terms

        if hasattr(self.data, 'design_info'):
            del self.data.design_info
        # formula also might be attached which causes problems in predict
        if hasattr(self, 'formula'):
            self.formula_linear = self.formula
            self.formula = None
            del self.formula
Example #50
0
def lagmat(
    x,
    maxlag: int,
    trim: Literal["forward", "backward", "both", "none"] = 'forward',
    original: Literal["ex", "sep", "in"] = "ex",
    use_pandas: bool = False
) -> NDArray | DataFrame | tuple[NDArray, NDArray] | tuple[DataFrame,
                                                           DataFrame]:
    """
    Create 2d array of lags.

    Parameters
    ----------
    x : array_like
        Data; if 2d, observation in rows and variables in columns.
    maxlag : int
        All lags from zero to maxlag are included.
    trim : {'forward', 'backward', 'both', 'none', None}
        The trimming method to use.

        * 'forward' : trim invalid observations in front.
        * 'backward' : trim invalid initial observations.
        * 'both' : trim invalid observations on both sides.
        * 'none', None : no trimming of observations.
    original : {'ex','sep','in'}
        How the original is treated.

        * 'ex' : drops the original array returning only the lagged values.
        * 'in' : returns the original array and the lagged values as a single
          array.
        * 'sep' : returns a tuple (original array, lagged values). The original
                  array is truncated to have the same number of rows as
                  the returned lagmat.
    use_pandas : bool
        If true, returns a DataFrame when the input is a pandas
        Series or DataFrame.  If false, return numpy ndarrays.

    Returns
    -------
    lagmat : ndarray
        The array with lagged observations.
    y : ndarray, optional
        Only returned if original == 'sep'.

    Notes
    -----
    When using a pandas DataFrame or Series with use_pandas=True, trim can only
    be 'forward' or 'both' since it is not possible to consistently extend
    index values.

    Examples
    --------
    >>> from statsmodels.tsa.tsatools import lagmat
    >>> import numpy as np
    >>> X = np.arange(1,7).reshape(-1,2)
    >>> lagmat(X, maxlag=2, trim="forward", original='in')
    array([[ 1.,  2.,  0.,  0.,  0.,  0.],
       [ 3.,  4.,  1.,  2.,  0.,  0.],
       [ 5.,  6.,  3.,  4.,  1.,  2.]])

    >>> lagmat(X, maxlag=2, trim="backward", original='in')
    array([[ 5.,  6.,  3.,  4.,  1.,  2.],
       [ 0.,  0.,  5.,  6.,  3.,  4.],
       [ 0.,  0.,  0.,  0.,  5.,  6.]])

    >>> lagmat(X, maxlag=2, trim="both", original='in')
    array([[ 5.,  6.,  3.,  4.,  1.,  2.]])

    >>> lagmat(X, maxlag=2, trim="none", original='in')
    array([[ 1.,  2.,  0.,  0.,  0.,  0.],
       [ 3.,  4.,  1.,  2.,  0.,  0.],
       [ 5.,  6.,  3.,  4.,  1.,  2.],
       [ 0.,  0.,  5.,  6.,  3.,  4.],
       [ 0.,  0.,  0.,  0.,  5.,  6.]])
    """
    maxlag = int_like(maxlag, "maxlag")
    use_pandas = bool_like(use_pandas, "use_pandas")
    trim = string_like(
        trim,
        "trim",
        optional=True,
        options=("forward", "backward", "both", "none"),
    )
    original = string_like(original, "original", options=("ex", "sep", "in"))

    # TODO:  allow list of lags additional to maxlag
    orig = x
    x = array_like(x, "x", ndim=2, dtype=None)
    is_pandas = _is_using_pandas(orig, None) and use_pandas
    trim = "none" if trim is None else trim
    trim = trim.lower()
    if is_pandas and trim in ("none", "backward"):
        raise ValueError("trim cannot be 'none' or 'backward' when used on "
                         "Series or DataFrames")

    dropidx = 0
    nobs, nvar = x.shape
    if original in ["ex", "sep"]:
        dropidx = nvar
    if maxlag >= nobs:
        raise ValueError("maxlag should be < nobs")
    lm = np.zeros((nobs + maxlag, nvar * (maxlag + 1)))
    for k in range(0, int(maxlag + 1)):
        lm[maxlag - k:nobs + maxlag - k,
           nvar * (maxlag - k):nvar * (maxlag - k + 1), ] = x

    if trim in ("none", "forward"):
        startobs = 0
    elif trim in ("backward", "both"):
        startobs = maxlag
    else:
        raise ValueError("trim option not valid")

    if trim in ("none", "backward"):
        stopobs = len(lm)
    else:
        stopobs = nobs

    if is_pandas:
        x = orig
        x_columns = x.columns if isinstance(x, DataFrame) else [x.name]
        columns = [str(col) for col in x_columns]
        for lag in range(maxlag):
            lag_str = str(lag + 1)
            columns.extend([str(col) + ".L." + lag_str for col in x_columns])
        lm = DataFrame(lm[:stopobs], index=x.index, columns=columns)
        lags = lm.iloc[startobs:]
        if original in ("sep", "ex"):
            leads = lags[x_columns]
            lags = lags.drop(x_columns, axis=1)
    else:
        lags = lm[startobs:stopobs, dropidx:]
        if original == "sep":
            leads = lm[startobs:stopobs, :dropidx]

    if original == "sep":
        return lags, leads
    else:
        return lags
Example #51
0
def lagmat2ds(x,
              maxlag0,
              maxlagex=None,
              dropex=0,
              trim="forward",
              use_pandas=False):
    """
    Generate lagmatrix for 2d array, columns arranged by variables.

    Parameters
    ----------
    x : array_like
        Data, 2d. Observations in rows and variables in columns.
    maxlag0 : int
        The first variable all lags from zero to maxlag are included.
    maxlagex : {None, int}
        The max lag for all other variables all lags from zero to maxlag are
        included.
    dropex : int
        Exclude first dropex lags from other variables. For all variables,
        except the first, lags from dropex to maxlagex are included.
    trim : str
        The trimming method to use.

        * 'forward' : trim invalid observations in front.
        * 'backward' : trim invalid initial observations.
        * 'both' : trim invalid observations on both sides.
        * 'none' : no trimming of observations.
    use_pandas : bool
        If true, returns a DataFrame when the input is a pandas
        Series or DataFrame.  If false, return numpy ndarrays.

    Returns
    -------
    ndarray
        The array with lagged observations, columns ordered by variable.

    Notes
    -----
    Inefficient implementation for unequal lags, implemented for convenience.
    """
    maxlag0 = int_like(maxlag0, "maxlag0")
    maxlagex = int_like(maxlagex, "maxlagex", optional=True)
    trim = string_like(
        trim,
        "trim",
        optional=True,
        options=("forward", "backward", "both", "none"),
    )
    if maxlagex is None:
        maxlagex = maxlag0
    maxlag = max(maxlag0, maxlagex)
    is_pandas = _is_using_pandas(x, None)

    if x.ndim == 1:
        if is_pandas:
            x = pd.DataFrame(x)
        else:
            x = x[:, None]
    elif x.ndim == 0 or x.ndim > 2:
        raise ValueError("Only supports 1 and 2-dimensional data.")

    nobs, nvar = x.shape

    if is_pandas and use_pandas:
        lags = lagmat(x.iloc[:, 0],
                      maxlag,
                      trim=trim,
                      original="in",
                      use_pandas=True)
        lagsli = [lags.iloc[:, :maxlag0 + 1]]
        for k in range(1, nvar):
            lags = lagmat(x.iloc[:, k],
                          maxlag,
                          trim=trim,
                          original="in",
                          use_pandas=True)
            lagsli.append(lags.iloc[:, dropex:maxlagex + 1])
        return pd.concat(lagsli, axis=1)
    elif is_pandas:
        x = np.asanyarray(x)

    lagsli = [
        lagmat(x[:, 0], maxlag, trim=trim, original="in")[:, :maxlag0 + 1]
    ]
    for k in range(1, nvar):
        lagsli.append(
            lagmat(x[:, k], maxlag, trim=trim,
                   original="in")[:, dropex:maxlagex + 1])
    return np.column_stack(lagsli)
Example #52
0
def add_trend(x, trend="c", prepend=False, has_constant="skip"):
    """
    Add a trend and/or constant to an array.

    Parameters
    ----------
    x : array_like
        Original array of data.
    trend : str {'n', 'c', 't', 'ct', 'ctt'}
        The trend to add.

        * 'n' add no trend.
        * 'c' add constant only.
        * 't' add trend only.
        * 'ct' add constant and linear trend.
        * 'ctt' add constant and linear and quadratic trend.
    prepend : bool
        If True, prepends the new data to the columns of X.
    has_constant : str {'raise', 'add', 'skip'}
        Controls what happens when trend is 'c' and a constant column already
        exists in x. 'raise' will raise an error. 'add' will add a column of
        1s. 'skip' will return the data without change. 'skip' is the default.

    Returns
    -------
    array_like
        The original data with the additional trend columns.  If x is a
        pandas Series or DataFrame, then the trend column names are 'const',
        'trend' and 'trend_squared'.

    See Also
    --------
    statsmodels.tools.tools.add_constant
        Add a constant column to an array.

    Notes
    -----
    Returns columns as ['ctt','ct','c'] whenever applicable. There is currently
    no checking for an existing trend.
    """
    prepend = bool_like(prepend, "prepend")
    trend = string_like(trend, "trend", options=("n", "c", "t", "ct", "ctt"))
    has_constant = string_like(has_constant,
                               "has_constant",
                               options=("raise", "add", "skip"))

    # TODO: could be generalized for trend of aribitrary order
    columns = ["const", "trend", "trend_squared"]
    if trend == "n":
        return x.copy()
    elif trend == "c":  # handles structured arrays
        columns = columns[:1]
        trendorder = 0
    elif trend == "ct" or trend == "t":
        columns = columns[:2]
        if trend == "t":
            columns = columns[1:2]
        trendorder = 1
    elif trend == "ctt":
        trendorder = 2

    if _is_recarray(x):
        from statsmodels.tools.sm_exceptions import recarray_exception

        raise NotImplementedError(recarray_exception)

    is_pandas = _is_using_pandas(x, None)
    if is_pandas:
        if isinstance(x, pd.Series):
            x = pd.DataFrame(x)
        else:
            x = x.copy()
    else:
        x = np.asanyarray(x)

    nobs = len(x)
    trendarr = np.vander(np.arange(1, nobs + 1, dtype=np.float64),
                         trendorder + 1)
    # put in order ctt
    trendarr = np.fliplr(trendarr)
    if trend == "t":
        trendarr = trendarr[:, 1]

    if "c" in trend:
        if is_pandas:
            # Mixed type protection
            def safe_is_const(s):
                try:
                    return np.ptp(s) == 0.0 and np.any(s != 0.0)
                except:
                    return False

            col_const = x.apply(safe_is_const, 0)
        else:
            ptp0 = np.ptp(np.asanyarray(x), axis=0)
            col_is_const = ptp0 == 0
            nz_const = col_is_const & (x[0] != 0)
            col_const = nz_const

        if np.any(col_const):
            if has_constant == "raise":
                if x.ndim == 1:
                    base_err = "x is constant."
                else:
                    columns = np.arange(x.shape[1])[col_const]
                    if isinstance(x, pd.DataFrame):
                        columns = x.columns
                    const_cols = ", ".join([str(c) for c in columns])
                    base_err = (
                        "x contains one or more constant columns. Column(s) "
                        f"{const_cols} are constant.")
                msg = f"{base_err} Adding a constant with trend='{trend}' is not allowed."
                raise ValueError(msg)
            elif has_constant == "skip":
                columns = columns[1:]
                trendarr = trendarr[:, 1:]

    order = 1 if prepend else -1
    if is_pandas:
        trendarr = pd.DataFrame(trendarr, index=x.index, columns=columns)
        x = [trendarr, x]
        x = pd.concat(x[::order], axis=1)
    else:
        x = [trendarr, x]
        x = np.column_stack(x[::order])

    return x
Example #53
0
    def __init__(self, endog, exog=None, order=(1, 0), trend='c',
                 error_cov_type='unstructured', measurement_error=False,
                 enforce_stationarity=True, enforce_invertibility=True,
                 **kwargs):

        # Model parameters
        self.error_cov_type = error_cov_type
        self.measurement_error = measurement_error
        self.enforce_stationarity = enforce_stationarity
        self.enforce_invertibility = enforce_invertibility

        # Save the given orders
        self.order = order
        self.trend = trend

        # Model orders
        self.k_ar = int(order[0])
        self.k_ma = int(order[1])
        self.k_trend = int(self.trend == 'c')

        # Check for valid model
        if trend not in ['c', 'nc']:
            raise ValueError('Invalid trend specification.')
        if error_cov_type not in ['diagonal', 'unstructured']:
            raise ValueError('Invalid error covariance matrix type'
                             ' specification.')
        if self.k_ar == 0 and self.k_ma == 0:
            raise ValueError('Invalid VARMAX(p,q) specification; at least one'
                             ' p,q must be greater than zero.')

        # Warn for VARMA model
        if self.k_ar > 0 and self.k_ma > 0:
            warn('Estimation of VARMA(p,q) models is not generically robust,'
                 ' due especially to identification issues.')

        # Exogenous data
        self.k_exog = 0
        if exog is not None:
            exog_is_using_pandas = _is_using_pandas(exog, None)
            if not exog_is_using_pandas:
                exog = np.asarray(exog)

            # Make sure we have 2-dimensional array
            if exog.ndim == 1:
                if not exog_is_using_pandas:
                    exog = exog[:, None]
                else:
                    exog = pd.DataFrame(exog)

            self.k_exog = exog.shape[1]

        # Note: at some point in the future might add state regression, as in
        # SARIMAX.
        self.mle_regression = self.k_exog > 0

        # We need to have an array or pandas at this point
        if not _is_using_pandas(endog, None):
            endog = np.asanyarray(endog)

        # Model order
        # Used internally in various places
        _min_k_ar = max(self.k_ar, 1)
        self._k_order = _min_k_ar + self.k_ma

        # Number of states
        k_endog = endog.shape[1]
        k_posdef = k_endog
        k_states = k_endog * self._k_order

        # By default, initialize as stationary
        kwargs.setdefault('initialization', 'stationary')

        # By default, use LU decomposition
        kwargs.setdefault('inversion_method', INVERT_UNIVARIATE | SOLVE_LU)

        # Initialize the state space model
        super(VARMAX, self).__init__(
            endog, exog=exog, k_states=k_states, k_posdef=k_posdef, **kwargs
        )

        # Initialize the parameters
        self.parameters = OrderedDict()
        self.parameters['trend'] = self.k_endog * self.k_trend
        self.parameters['ar'] = self.k_endog**2 * self.k_ar
        self.parameters['ma'] = self.k_endog**2 * self.k_ma
        self.parameters['regression'] = self.k_endog * self.k_exog
        if self.error_cov_type == 'diagonal':
            self.parameters['state_cov'] = self.k_endog
        # These parameters fill in a lower-triangular matrix which is then
        # dotted with itself to get a positive definite matrix.
        elif self.error_cov_type == 'unstructured':
            self.parameters['state_cov'] = (
                int(self.k_endog * (self.k_endog + 1) / 2)
            )
        self.parameters['obs_cov'] = self.k_endog * self.measurement_error
        self.k_params = sum(self.parameters.values())

        # Initialize known elements of the state space matrices

        # If we have exog effects, then the state intercept needs to be
        # time-varying
        if self.k_exog > 0:
            self.ssm['state_intercept'] = np.zeros((self.k_states, self.nobs))

        # The design matrix is just an identity for the first k_endog states
        idx = np.diag_indices(self.k_endog)
        self.ssm[('design',) + idx] = 1

        # The transition matrix is described in four blocks, where the upper
        # left block is in companion form with the autoregressive coefficient
        # matrices (so it is shaped k_endog * k_ar x k_endog * k_ar) ...
        if self.k_ar > 0:
            idx = np.diag_indices((self.k_ar - 1) * self.k_endog)
            idx = idx[0] + self.k_endog, idx[1]
            self.ssm[('transition',) + idx] = 1
        # ... and the  lower right block is in companion form with zeros as the
        # coefficient matrices (it is shaped k_endog * k_ma x k_endog * k_ma).
        idx = np.diag_indices((self.k_ma - 1) * self.k_endog)
        idx = (idx[0] + (_min_k_ar + 1) * self.k_endog,
               idx[1] + _min_k_ar * self.k_endog)
        self.ssm[('transition',) + idx] = 1

        # The selection matrix is described in two blocks, where the upper
        # block selects the all k_posdef errors in the first k_endog rows
        # (the upper block is shaped k_endog * k_ar x k) and the lower block
        # also selects all k_posdef errors in the first k_endog rows (the lower
        # block is shaped k_endog * k_ma x k).
        idx = np.diag_indices(self.k_endog)
        self.ssm[('selection',) + idx] = 1
        idx = idx[0] + _min_k_ar * self.k_endog, idx[1]
        if self.k_ma > 0:
            self.ssm[('selection',) + idx] = 1

        # Cache some indices
        if self.trend == 'c' and self.k_exog == 0:
            self._idx_state_intercept = np.s_['state_intercept', :k_endog]
        elif self.k_exog > 0:
            self._idx_state_intercept = np.s_['state_intercept', :k_endog, :]
        if self.k_ar > 0:
            self._idx_transition = np.s_['transition', :k_endog, :]
        else:
            self._idx_transition = np.s_['transition', :k_endog, k_endog:]
        if self.error_cov_type == 'diagonal':
            self._idx_state_cov = (
                ('state_cov',) + np.diag_indices(self.k_endog))
        elif self.error_cov_type == 'unstructured':
            self._idx_lower_state_cov = np.tril_indices(self.k_endog)
        if self.measurement_error:
            self._idx_obs_cov = ('obs_cov',) + np.diag_indices(self.k_endog)

        # Cache some slices
        def _slice(key, offset):
            length = self.parameters[key]
            param_slice = np.s_[offset:offset + length]
            offset += length
            return param_slice, offset

        offset = 0
        self._params_trend, offset = _slice('trend', offset)
        self._params_ar, offset = _slice('ar', offset)
        self._params_ma, offset = _slice('ma', offset)
        self._params_regression, offset = _slice('regression', offset)
        self._params_state_cov, offset = _slice('state_cov', offset)
        self._params_obs_cov, offset = _slice('obs_cov', offset)
Example #54
0
    def __init__(self,
                 endog=None,
                 exog=None,
                 order=None,
                 seasonal_order=None,
                 ar_order=None,
                 diff=None,
                 ma_order=None,
                 seasonal_ar_order=None,
                 seasonal_diff=None,
                 seasonal_ma_order=None,
                 seasonal_periods=None,
                 trend=None,
                 enforce_stationarity=None,
                 enforce_invertibility=None,
                 concentrate_scale=None,
                 trend_offset=1,
                 dates=None,
                 freq=None,
                 missing='none',
                 validate_specification=True):

        # Basic parameters
        self.enforce_stationarity = enforce_stationarity
        self.enforce_invertibility = enforce_invertibility
        self.concentrate_scale = concentrate_scale
        self.trend_offset = trend_offset

        # Validate that we were not given conflicting specifications
        has_order = order is not None
        has_specific_order = (ar_order is not None or diff is not None
                              or ma_order is not None)
        has_seasonal_order = seasonal_order is not None
        has_specific_seasonal_order = (seasonal_ar_order is not None
                                       or seasonal_diff is not None
                                       or seasonal_ma_order is not None
                                       or seasonal_periods is not None)
        if has_order and has_specific_order:
            raise ValueError('Cannot specify both `order` and either of'
                             ' `ar_order` or `ma_order`.')
        if has_seasonal_order and has_specific_seasonal_order:
            raise ValueError('Cannot specify both `seasonal_order` and any of'
                             ' `seasonal_ar_order`, `seasonal_ma_order`,'
                             ' or `seasonal_periods`.')

        # Compute `order`
        if has_specific_order:
            ar_order = 0 if ar_order is None else ar_order
            diff = 0 if diff is None else diff
            ma_order = 0 if ma_order is None else ma_order
            order = (ar_order, diff, ma_order)
        elif not has_order:
            order = (0, 0, 0)

        # Compute `seasonal_order`
        if has_specific_seasonal_order:
            seasonal_ar_order = (0 if seasonal_ar_order is None else
                                 seasonal_ar_order)
            seasonal_diff = 0 if seasonal_diff is None else seasonal_diff
            seasonal_ma_order = (0 if seasonal_ma_order is None else
                                 seasonal_ma_order)
            seasonal_periods = (0 if seasonal_periods is None else
                                seasonal_periods)
            seasonal_order = (seasonal_ar_order, seasonal_diff,
                              seasonal_ma_order, seasonal_periods)
        elif not has_seasonal_order:
            seasonal_order = (0, 0, 0, 0)

        # Validate shapes of `order`, `seasonal_order`
        if len(order) != 3:
            raise ValueError('`order` argument must be an iterable with three'
                             ' elements.')
        if len(seasonal_order) != 4:
            raise ValueError('`seasonal_order` argument must be an iterable'
                             ' with four elements.')

        # Validate differencing parameters
        if validate_specification:
            if order[1] < 0:
                raise ValueError('Cannot specify negative differencing.')
            if order[1] != int(order[1]):
                raise ValueError('Cannot specify fractional differencing.')
            if seasonal_order[1] < 0:
                raise ValueError('Cannot specify negative seasonal'
                                 ' differencing.')
            if seasonal_order[1] != int(seasonal_order[1]):
                raise ValueError('Cannot specify fractional seasonal'
                                 ' differencing.')
            if seasonal_order[3] < 0:
                raise ValueError('Cannot specify negative seasonal'
                                 ' periodicity.')

        # Standardize to integers or lists of integers
        order = (standardize_lag_order(order[0], 'AR'), int(order[1]),
                 standardize_lag_order(order[2], 'MA'))
        seasonal_order = (standardize_lag_order(seasonal_order[0],
                                                'seasonal AR'),
                          int(seasonal_order[1]),
                          standardize_lag_order(seasonal_order[2],
                                                'seasonal MA'),
                          int(seasonal_order[3]))

        # Validate seasonals
        if validate_specification:
            if seasonal_order[3] == 1:
                raise ValueError('Seasonal periodicity must be greater'
                                 ' than 1.')
            if ((seasonal_order[0] != 0 or seasonal_order[1] != 0
                 or seasonal_order[2] != 0) and seasonal_order[3] == 0):
                raise ValueError('Must include nonzero seasonal periodicity if'
                                 ' including seasonal AR, MA, or'
                                 ' differencing.')

        # Basic order
        self.order = order
        self.ar_order, self.diff, self.ma_order = order

        self.seasonal_order = seasonal_order
        (self.seasonal_ar_order, self.seasonal_diff, self.seasonal_ma_order,
         self.seasonal_periods) = seasonal_order

        # Lists of included lags
        if isinstance(self.ar_order, list):
            self.ar_lags = self.ar_order
        else:
            self.ar_lags = np.arange(1, self.ar_order + 1).tolist()
        if isinstance(self.ma_order, list):
            self.ma_lags = self.ma_order
        else:
            self.ma_lags = np.arange(1, self.ma_order + 1).tolist()

        if isinstance(self.seasonal_ar_order, list):
            self.seasonal_ar_lags = self.seasonal_ar_order
        else:
            self.seasonal_ar_lags = (np.arange(1, self.seasonal_ar_order +
                                               1).tolist())
        if isinstance(self.seasonal_ma_order, list):
            self.seasonal_ma_lags = self.seasonal_ma_order
        else:
            self.seasonal_ma_lags = (np.arange(1, self.seasonal_ma_order +
                                               1).tolist())

        # Maximum lag orders
        self.max_ar_order = self.ar_lags[-1] if self.ar_lags else 0
        self.max_ma_order = self.ma_lags[-1] if self.ma_lags else 0

        self.max_seasonal_ar_order = (self.seasonal_ar_lags[-1]
                                      if self.seasonal_ar_lags else 0)
        self.max_seasonal_ma_order = (self.seasonal_ma_lags[-1]
                                      if self.seasonal_ma_lags else 0)

        self.max_reduced_ar_order = (
            self.max_ar_order +
            self.max_seasonal_ar_order * self.seasonal_periods)
        self.max_reduced_ma_order = (
            self.max_ma_order +
            self.max_seasonal_ma_order * self.seasonal_periods)

        # Check that we don't have duplicate AR or MA lags from the seasonal
        # component
        ar_lags = set(self.ar_lags)
        seasonal_ar_lags = set(
            np.array(self.seasonal_ar_lags) * self.seasonal_periods)
        duplicate_ar_lags = ar_lags.intersection(seasonal_ar_lags)
        if validate_specification and len(duplicate_ar_lags) > 0:
            raise ValueError('Invalid model: autoregressive lag(s) %s are'
                             ' in both the seasonal and non-seasonal'
                             ' autoregressive components.' % duplicate_ar_lags)

        ma_lags = set(self.ma_lags)
        seasonal_ma_lags = set(
            np.array(self.seasonal_ma_lags) * self.seasonal_periods)
        duplicate_ma_lags = ma_lags.intersection(seasonal_ma_lags)
        if validate_specification and len(duplicate_ma_lags) > 0:
            raise ValueError('Invalid model: moving average lag(s) %s are'
                             ' in both the seasonal and non-seasonal'
                             ' moving average components.' % duplicate_ma_lags)

        # Handle trend
        self.trend = trend
        self.trend_poly, _ = prepare_trend_spec(trend)

        # Check for a constant column in the provided exog
        exog_is_pandas = _is_using_pandas(exog, None)
        if (validate_specification and exog is not None
                and len(self.trend_poly) > 0 and self.trend_poly[0] == 1):
            # Figure out if we have any constant columns
            x = np.asanyarray(exog)
            ptp0 = np.ptp(x, axis=0)
            col_is_const = ptp0 == 0
            nz_const = col_is_const & (x[0] != 0)
            col_const = nz_const

            # If we already have a constant column, raise an error
            if np.any(col_const):
                raise ValueError('A constant trend was included in the model'
                                 ' specification, but the `exog` data already'
                                 ' contains a column of constants.')

        # This contains the included exponents of the trend polynomial,
        # where e.g. the constant term has exponent 0, a linear trend has
        # exponent 1, etc.
        self.trend_terms = np.where(self.trend_poly == 1)[0]
        # Trend order is either the degree of the trend polynomial, if all
        # exponents are included, or a list of included exponents. Here we need
        # to make a distinction between a degree zero polynomial (i.e. a
        # constant) and the zero polynomial (i.e. not even a constant). The
        # former has `trend_order = 0`, while the latter has
        # `trend_order = None`.
        self.k_trend = len(self.trend_terms)
        if len(self.trend_terms) == 0:
            self.trend_order = None
            self.trend_degree = None
        elif np.all(self.trend_terms == np.arange(len(self.trend_terms))):
            self.trend_order = self.trend_terms[-1]
            self.trend_degree = self.trend_terms[-1]
        else:
            self.trend_order = self.trend_terms
            self.trend_degree = self.trend_terms[-1]

        # Handle endog / exog
        # Standardize exog
        self.k_exog, exog = prepare_exog(exog)

        # Standardize endog (including creating a faux endog if necessary)
        faux_endog = endog is None
        if endog is None:
            endog = [] if exog is None else np.zeros(len(exog)) * np.nan

        # Add trend data into exog
        nobs = len(endog) if exog is None else len(exog)
        if self.trend_order is not None:
            # Add in the data
            trend_data = self.construct_trend_data(nobs, trend_offset)
            if exog is None:
                exog = trend_data
            elif exog_is_pandas:
                trend_data = pd.DataFrame(trend_data,
                                          index=exog.index,
                                          columns=self.construct_trend_names())
                exog = pd.concat([trend_data, exog], axis=1)
            else:
                exog = np.c_[trend_data, exog]

        # Create an underlying time series model, to handle endog / exog,
        # especially validating shapes, retrieving names, and potentially
        # providing us with a time series index
        self._model = TimeSeriesModel(endog,
                                      exog=exog,
                                      dates=dates,
                                      freq=freq,
                                      missing=missing)
        self.endog = None if faux_endog else self._model.endog
        self.exog = self._model.exog

        # Validate endog shape
        if (validate_specification and not faux_endog and self.endog.ndim > 1
                and self.endog.shape[1] > 1):
            raise ValueError('SARIMAX models require univariate `endog`. Got'
                             ' shape %s.' % str(self.endog.shape))

        self._has_missing = (None if faux_endog else np.any(
            np.isnan(self.endog)))
Example #55
0
    def __init__(self, endog, exog, constraints=None, **kwargs):
        # Standardize data
        endog_using_pandas = _is_using_pandas(endog, None)
        if not endog_using_pandas:
            endog = np.asanyarray(endog)

        exog_is_using_pandas = _is_using_pandas(exog, None)
        if not exog_is_using_pandas:
            exog = np.asarray(exog)

        # Make sure we have 2-dimensional array
        if exog.ndim == 1:
            if not exog_is_using_pandas:
                exog = exog[:, None]
            else:
                exog = pd.DataFrame(exog)

        self.k_exog = exog.shape[1]

        # Handle constraints
        self.k_constraints = 0
        self._r_matrix = self._q_matrix = None
        if constraints is not None:
            from patsy import DesignInfo
            from statsmodels.base.data import handle_data
            data = handle_data(endog, exog, **kwargs)
            names = data.param_names
            LC = DesignInfo(names).linear_constraint(constraints)
            self._r_matrix, self._q_matrix = LC.coefs, LC.constants
            self.k_constraints = self._r_matrix.shape[0]

            constraint_endog = np.zeros((len(endog), len(self._r_matrix)))
            if endog_using_pandas:
                constraint_endog = pd.DataFrame(constraint_endog,
                                                index=endog.index)
                endog = concat([endog, constraint_endog], axis=1)
                endog.values[:, 1:] = self._q_matrix[:, 0]
            else:
                endog[:, 1:] = self._q_matrix[:, 0]

        # Handle coefficient initialization
        kwargs.setdefault('initialization', 'diffuse')

        # Initialize the state space representation
        super(RecursiveLS, self).__init__(
            endog, k_states=self.k_exog, exog=exog, **kwargs)

        # Use univariate filtering by default
        self.ssm.filter_univariate = True

        # Concentrate the scale out of the likelihood function
        self.ssm.filter_concentrated = True

        # Setup the state space representation
        self['design'] = np.zeros((self.k_endog, self.k_states, self.nobs))
        self['design', 0] = self.exog[:, :, None].T
        if self._r_matrix is not None:
            self['design', 1:, :] = self._r_matrix[:, :, None]
        self['transition'] = np.eye(self.k_states)

        # Notice that the filter output does not depend on the measurement
        # variance, so we set it here to 1
        self['obs_cov', 0, 0] = 1.
        self['transition'] = np.eye(self.k_states)

        # Linear constraints are technically imposed by adding "fake" endog
        # variables that are used during filtering, but for all model- and
        # results-based purposes we want k_endog = 1.
        if self._r_matrix is not None:
            self.k_endog = 1
Example #56
0
    def __init__(self,
                 endog,
                 exog=None,
                 order=(1, 0),
                 trend='c',
                 error_cov_type='unstructured',
                 measurement_error=False,
                 enforce_stationarity=True,
                 enforce_invertibility=True,
                 trend_offset=1,
                 **kwargs):

        # Model parameters
        self.error_cov_type = error_cov_type
        self.measurement_error = measurement_error
        self.enforce_stationarity = enforce_stationarity
        self.enforce_invertibility = enforce_invertibility

        # Save the given orders
        self.order = order

        # Model orders
        self.k_ar = int(order[0])
        self.k_ma = int(order[1])

        # Check for valid model
        if error_cov_type not in ['diagonal', 'unstructured']:
            raise ValueError('Invalid error covariance matrix type'
                             ' specification.')
        if self.k_ar == 0 and self.k_ma == 0:
            raise ValueError('Invalid VARMAX(p,q) specification; at least one'
                             ' p,q must be greater than zero.')

        # Warn for VARMA model
        if self.k_ar > 0 and self.k_ma > 0:
            warn(
                'Estimation of VARMA(p,q) models is not generically robust,'
                ' due especially to identification issues.', EstimationWarning)

        # Trend
        self.trend = trend
        self.trend_offset = trend_offset
        self.polynomial_trend, self.k_trend = prepare_trend_spec(self.trend)
        self._trend_is_const = (self.polynomial_trend.size == 1
                                and self.polynomial_trend[0] == 1)

        # Exogenous data
        (self.k_exog, exog) = prepare_exog(exog)

        # Note: at some point in the future might add state regression, as in
        # SARIMAX.
        self.mle_regression = self.k_exog > 0

        # We need to have an array or pandas at this point
        if not _is_using_pandas(endog, None):
            endog = np.asanyarray(endog)

        # Model order
        # Used internally in various places
        _min_k_ar = max(self.k_ar, 1)
        self._k_order = _min_k_ar + self.k_ma

        # Number of states
        k_endog = endog.shape[1]
        k_posdef = k_endog
        k_states = k_endog * self._k_order

        # By default, initialize as stationary
        kwargs.setdefault('initialization', 'stationary')

        # By default, use LU decomposition
        kwargs.setdefault('inversion_method', INVERT_UNIVARIATE | SOLVE_LU)

        # Initialize the state space model
        super(VARMAX, self).__init__(endog,
                                     exog=exog,
                                     k_states=k_states,
                                     k_posdef=k_posdef,
                                     **kwargs)

        # Set as time-varying model if we have time-trend or exog
        if self.k_exog > 0 or (self.k_trend > 0 and not self._trend_is_const):
            self.ssm._time_invariant = False

        # Initialize the parameters
        self.parameters = OrderedDict()
        self.parameters['trend'] = self.k_endog * self.k_trend
        self.parameters['ar'] = self.k_endog**2 * self.k_ar
        self.parameters['ma'] = self.k_endog**2 * self.k_ma
        self.parameters['regression'] = self.k_endog * self.k_exog
        if self.error_cov_type == 'diagonal':
            self.parameters['state_cov'] = self.k_endog
        # These parameters fill in a lower-triangular matrix which is then
        # dotted with itself to get a positive definite matrix.
        elif self.error_cov_type == 'unstructured':
            self.parameters['state_cov'] = (int(self.k_endog *
                                                (self.k_endog + 1) / 2))
        self.parameters['obs_cov'] = self.k_endog * self.measurement_error
        self.k_params = sum(self.parameters.values())

        # Initialize trend data
        self._trend_data = prepare_trend_data(self.polynomial_trend,
                                              self.k_trend,
                                              self.nobs,
                                              offset=self.trend_offset)

        # Initialize known elements of the state space matrices

        # If we have exog effects, then the state intercept needs to be
        # time-varying
        if (self.k_trend > 0 and not self._trend_is_const) or self.k_exog > 0:
            self.ssm['state_intercept'] = np.zeros((self.k_states, self.nobs))
            # self.ssm['obs_intercept'] = np.zeros((self.k_endog, self.nobs))

        # The design matrix is just an identity for the first k_endog states
        idx = np.diag_indices(self.k_endog)
        self.ssm[('design', ) + idx] = 1

        # The transition matrix is described in four blocks, where the upper
        # left block is in companion form with the autoregressive coefficient
        # matrices (so it is shaped k_endog * k_ar x k_endog * k_ar) ...
        if self.k_ar > 0:
            idx = np.diag_indices((self.k_ar - 1) * self.k_endog)
            idx = idx[0] + self.k_endog, idx[1]
            self.ssm[('transition', ) + idx] = 1
        # ... and the  lower right block is in companion form with zeros as the
        # coefficient matrices (it is shaped k_endog * k_ma x k_endog * k_ma).
        idx = np.diag_indices((self.k_ma - 1) * self.k_endog)
        idx = (idx[0] + (_min_k_ar + 1) * self.k_endog,
               idx[1] + _min_k_ar * self.k_endog)
        self.ssm[('transition', ) + idx] = 1

        # The selection matrix is described in two blocks, where the upper
        # block selects the all k_posdef errors in the first k_endog rows
        # (the upper block is shaped k_endog * k_ar x k) and the lower block
        # also selects all k_posdef errors in the first k_endog rows (the lower
        # block is shaped k_endog * k_ma x k).
        idx = np.diag_indices(self.k_endog)
        self.ssm[('selection', ) + idx] = 1
        idx = idx[0] + _min_k_ar * self.k_endog, idx[1]
        if self.k_ma > 0:
            self.ssm[('selection', ) + idx] = 1

        # Cache some indices
        if self._trend_is_const and self.k_exog == 0:
            self._idx_state_intercept = np.s_['state_intercept', :k_endog, :]
        elif self.k_trend > 0 or self.k_exog > 0:
            self._idx_state_intercept = np.s_['state_intercept', :k_endog, :-1]
        if self.k_ar > 0:
            self._idx_transition = np.s_['transition', :k_endog, :]
        else:
            self._idx_transition = np.s_['transition', :k_endog, k_endog:]
        if self.error_cov_type == 'diagonal':
            self._idx_state_cov = (('state_cov', ) +
                                   np.diag_indices(self.k_endog))
        elif self.error_cov_type == 'unstructured':
            self._idx_lower_state_cov = np.tril_indices(self.k_endog)
        if self.measurement_error:
            self._idx_obs_cov = ('obs_cov', ) + np.diag_indices(self.k_endog)

        # Cache some slices
        def _slice(key, offset):
            length = self.parameters[key]
            param_slice = np.s_[offset:offset + length]
            offset += length
            return param_slice, offset

        offset = 0
        self._params_trend, offset = _slice('trend', offset)
        self._params_ar, offset = _slice('ar', offset)
        self._params_ma, offset = _slice('ma', offset)
        self._params_regression, offset = _slice('regression', offset)
        self._params_state_cov, offset = _slice('state_cov', offset)
        self._params_obs_cov, offset = _slice('obs_cov', offset)

        # Update _init_keys attached by super
        self._init_keys += [
            'order', 'trend', 'error_cov_type', 'measurement_error',
            'enforce_stationarity', 'enforce_invertibility'
        ] + list(kwargs.keys())
Example #57
0
    def __init__(self, endog, exog=None, smoother=None, alpha=0, family=None,
                 offset=None, exposure=None, missing='none', **kwargs):

        # TODO: check usage of hasconst
        hasconst = kwargs.get('hasconst', None)
        xnames_linear = None
        if hasattr(exog, 'design_info'):
            self.design_info_linear = exog.design_info
            xnames_linear = self.design_info_linear.column_names

        is_pandas = _is_using_pandas(exog, None)

        # TODO: handle data is experimental, see #5469
        # This is a bit wasteful because we need to `handle_data twice`
        self.data_linear = self._handle_data(endog, exog, missing, hasconst)
        if xnames_linear is None:
            xnames_linear = self.data_linear.xnames
        if exog is not None:
            exog_linear = np.asarray(exog)
            k_exog_linear = exog_linear.shape[1]
        else:
            exog_linear = None
            k_exog_linear = 0
        self.k_exog_linear = k_exog_linear
        # We need exog_linear for k-fold cross validation
        # TODO: alternative is to take columns from combined exog
        self.exog_linear = exog_linear

        self.smoother = smoother
        self.k_smooths = smoother.k_variables
        self.alpha = self._check_alpha(alpha)
        penal = MultivariateGamPenalty(smoother, alpha=self.alpha,
                                       start_idx=k_exog_linear)
        kwargs.pop('penal', None)
        if exog_linear is not None:
            exog = np.column_stack((exog_linear, smoother.basis))
        else:
            exog = smoother.basis

        # TODO: check: xnames_linear will be None instead of empty list
        #       if no exog_linear
        # can smoother be empty ? I guess not allowed.
        if xnames_linear is None:
            xnames_linear = []
        xnames = xnames_linear + self.smoother.col_names

        if is_pandas and exog_linear is not None:
            # we a dataframe so we can get a PandasData instance for wrapping
            exog = pd.DataFrame(exog, index=self.data_linear.row_labels,
                                columns=xnames)

        super(GLMGam, self).__init__(endog, exog=exog, family=family,
                                     offset=offset, exposure=exposure,
                                     penal=penal, missing=missing, **kwargs)

        if not is_pandas:
            # set exog nanmes if not given by pandas DataFrame
            self.exog_names[:] = xnames

        # TODO: the generic data handling might attach the design_info from the
        #       linear part, but this is incorrect for the full model and
        #       causes problems in wald_test_terms

        if hasattr(self.data, 'design_info'):
            del self.data.design_info
        # formula also might be attached which causes problems in predict
        if hasattr(self, 'formula'):
            self.formula_linear = self.formula
            self.formula = None
            del self.formula
Example #58
0
def lagmat(x, maxlag, trim='forward', original='ex', use_pandas=False):
    """
    Create 2d array of lags

    Parameters
    ----------
    x : array_like, 1d or 2d
        data; if 2d, observation in rows and variables in columns
    maxlag : int
        all lags from zero to maxlag are included
    trim : str {'forward', 'backward', 'both', 'none'} or None
        * 'forward' : trim invalid observations in front
        * 'backward' : trim invalid initial observations
        * 'both' : trim invalid observations on both sides
        * 'none', None : no trimming of observations
    original : str {'ex','sep','in'}
        * 'ex' : drops the original array returning only the lagged values.
        * 'in' : returns the original array and the lagged values as a single
          array.
        * 'sep' : returns a tuple (original array, lagged values). The original
                  array is truncated to have the same number of rows as
                  the returned lagmat.
    use_pandas : bool, optional
        If true, returns a DataFrame when the input is a pandas
        Series or DataFrame.  If false, return numpy ndarrays.

    Returns
    -------
    lagmat : 2d array
        array with lagged observations
    y : 2d array, optional
        Only returned if original == 'sep'

    Examples
    --------
    >>> from statsmodels.tsa.tsatools import lagmat
    >>> import numpy as np
    >>> X = np.arange(1,7).reshape(-1,2)
    >>> lagmat(X, maxlag=2, trim="forward", original='in')
    array([[ 1.,  2.,  0.,  0.,  0.,  0.],
       [ 3.,  4.,  1.,  2.,  0.,  0.],
       [ 5.,  6.,  3.,  4.,  1.,  2.]])

    >>> lagmat(X, maxlag=2, trim="backward", original='in')
    array([[ 5.,  6.,  3.,  4.,  1.,  2.],
       [ 0.,  0.,  5.,  6.,  3.,  4.],
       [ 0.,  0.,  0.,  0.,  5.,  6.]])

    >>> lagmat(X, maxlag=2, trim="both", original='in')
    array([[ 5.,  6.,  3.,  4.,  1.,  2.]])

    >>> lagmat(X, maxlag=2, trim="none", original='in')
    array([[ 1.,  2.,  0.,  0.,  0.,  0.],
       [ 3.,  4.,  1.,  2.,  0.,  0.],
       [ 5.,  6.,  3.,  4.,  1.,  2.],
       [ 0.,  0.,  5.,  6.,  3.,  4.],
       [ 0.,  0.,  0.,  0.,  5.,  6.]])

    Notes
    -----
    When using a pandas DataFrame or Series with use_pandas=True, trim can only
    be 'forward' or 'both' since it is not possible to consistently extend index
    values.
    """
    # TODO:  allow list of lags additional to maxlag
    is_pandas = _is_using_pandas(x, None) and use_pandas
    trim = 'none' if trim is None else trim
    trim = trim.lower()
    if is_pandas and trim in ('none', 'backward'):
        raise ValueError("trim cannot be 'none' or 'forward' when used on "
                         "Series or DataFrames")

    xa = np.asarray(x)
    dropidx = 0
    if xa.ndim == 1:
        xa = xa[:, None]
    nobs, nvar = xa.shape
    if original in ['ex', 'sep']:
        dropidx = nvar
    if maxlag >= nobs:
        raise ValueError("maxlag should be < nobs")
    lm = np.zeros((nobs + maxlag, nvar * (maxlag + 1)))
    for k in range(0, int(maxlag + 1)):
        lm[maxlag - k:nobs + maxlag - k,
        nvar * (maxlag - k):nvar * (maxlag - k + 1)] = xa

    if trim in ('none', 'forward'):
        startobs = 0
    elif trim in ('backward', 'both'):
        startobs = maxlag
    else:
        raise ValueError('trim option not valid')

    if trim in ('none', 'backward'):
        stopobs = len(lm)
    else:
        stopobs = nobs

    if is_pandas:
        x_columns = x.columns if isinstance(x, DataFrame) else [x.name]
        columns = [str(col) for col in x_columns]
        for lag in range(maxlag):
            lag_str = str(lag + 1)
            columns.extend([str(col) + '.L.' + lag_str for col in x_columns])
        lm = DataFrame(lm[:stopobs], index=x.index, columns=columns)
        lags = lm.iloc[startobs:]
        if original in ('sep', 'ex'):
            leads = lags[x_columns]
            lags = lags.drop(x_columns, 1)
    else:
        lags = lm[startobs:stopobs, dropidx:]
        if original == 'sep':
            leads = lm[startobs:stopobs, :dropidx]

    if original == 'sep':
        return lags, leads
    else:
        return lags