Example #1
    def select_order(self, maxlags=None, verbose=True):
        """
        Compute lag order selections based on each of the available information
        criteria

        Parameters
        ----------
        maxlags : int
            if None, defaults to 12 * (nobs/100.)**(1./4)
        verbose : bool, default True
            If True, print table of info criteria and selected orders

        Returns
        -------
        selections : dict {info_crit -> selected_order}
        """
        if maxlags is None:
            maxlags = int(round(12*(len(self.endog)/100.)**(1/4.)))

        ics = defaultdict(list)
        for p in range(maxlags + 1):
            # exclude some periods so the same amount of data is used for
            # each lag order
            result = self._estimate_var(p, offset=maxlags-p)

            for k, v in iteritems(result.info_criteria):
                ics[k].append(v)

        selected_orders = dict((k, mat(v).argmin())
                               for k, v in iteritems(ics))

        if verbose:
            output.print_ic_table(ics, selected_orders)

        return selected_orders
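A minimal usage sketch for the method above, assuming the older statsmodels API in which VAR.select_order returns a plain dict of criterion -> selected order (newer releases return a results object instead):

import numpy as np
from statsmodels.tsa.api import VAR

np.random.seed(0)
data = np.random.randn(200, 3)  # toy trivariate series
orders = VAR(data).select_order(maxlags=8, verbose=False)
print(orders)  # e.g. {'aic': 0, 'bic': 0, 'hqic': 0, 'fpe': 0}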
Example #2
 def get_colwidths(self, output_format, **fmt_dict):
     """Return list, the widths of each column."""
     call_args = [output_format]
     for k, v in sorted(iteritems(fmt_dict)):
         if isinstance(v, list):
             call_args.append((k, tuple(v)))
         elif isinstance(v, dict):
             call_args.append((k, tuple(sorted(iteritems(v)))))
         else:
             call_args.append((k, v))
     key = tuple(call_args)
     try:
         return self._colwidths[key]
     except KeyError:
         self._colwidths[key] = self._get_colwidths(output_format, **fmt_dict)
         return self._colwidths[key]
Example #3
    def handle_missing(cls, endog, exog, missing, **kwargs):
        """
        This returns a dictionary with keys endog, exog and the keys of
        kwargs. It preserves Nones.
        """
        none_array_names = []

        if exog is not None:
            combined = (endog, exog)
            combined_names = ['endog', 'exog']
        else:
            combined = (endog,)
            combined_names = ['endog']
            none_array_names += ['exog']

        # deal with other arrays
        combined_2d = ()
        combined_2d_names = []
        if len(kwargs):
            for key, value_array in iteritems(kwargs):
                if value_array is None or value_array.ndim == 0:
                    none_array_names += [key]
                    continue
                # grab 1d arrays
                if value_array.ndim == 1:
                    combined += (value_array,)
                    combined_names += [key]
                elif value_array.squeeze().ndim == 1:
                    combined += (value_array,)
                    combined_names += [key]

                # grab 2d arrays that are _assumed_ to be symmetric
                elif value_array.ndim == 2:
                    combined_2d += (value_array,)
                    combined_2d_names += [key]
                else:
                    raise ValueError("Arrays with more than 2 dimensions "
                            "aren't yet handled")

        nan_mask = _nan_rows(*combined)
        if combined_2d:
            nan_mask = _nan_rows(*(nan_mask[:, None],) + combined_2d)

        if missing == 'raise' and np.any(nan_mask):
            raise MissingDataError("NaNs were encountered in the data")

        elif missing == 'drop':
            nan_mask = ~nan_mask
            drop_nans = lambda x: cls._drop_nans(x, nan_mask)
            drop_nans_2d = lambda x: cls._drop_nans_2d(x, nan_mask)
            combined = dict(zip(combined_names, lmap(drop_nans, combined)))
            if combined_2d:
                combined.update(dict(zip(combined_2d_names,
                                          lmap(drop_nans_2d, combined_2d))))
            if none_array_names:
                combined.update(dict(zip(none_array_names,
                                          [None] * len(none_array_names))))
            return combined, np.where(~nan_mask)[0].tolist()
        else:
            raise ValueError("missing option %s not understood" % missing)
Example #4
 def get_colwidths(self, output_format, **fmt_dict):
     """Return list, the widths of each column."""
     call_args = [output_format]
     for k, v in sorted(iteritems(fmt_dict)):
         if isinstance(v, list):
             call_args.append((k, tuple(v)))
         elif isinstance(v, dict):
             call_args.append((k, tuple(sorted(iteritems(v)))))
         else:
             call_args.append((k, v))
     key = tuple(call_args)
     try:
         return self._colwidths[key]
     except KeyError:
         self._colwidths[key] = self._get_colwidths(output_format,
                                                    **fmt_dict)
         return self._colwidths[key]
Example #5
def populate_wrapper(klass, wrapping):
    for meth, how in iteritems(klass._wrap_methods):
        if not hasattr(wrapping, meth):
            continue

        func = getattr(wrapping, meth)
        wrapper = make_wrapper(func, how)
        setattr(klass, meth, wrapper)
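A self-contained sketch of the pattern populate_wrapper implements, assuming the populate_wrapper above is in scope (with its iteritems import); the make_wrapper helper and the two classes below are hypothetical stand-ins, not statsmodels code:

def make_wrapper(func, how):
    def wrapper(self, *args, **kwargs):
        # call the wrapped method on the underlying results object and
        # post-process its return value with `how`
        return how(func(self._results, *args, **kwargs))
    return wrapper

class FakeResults:
    def __init__(self, x):
        self._x = x
    def doubled(self):
        return 2 * self._x

class FakeWrapper:
    _wrap_methods = {'doubled': str}
    def __init__(self, results):
        self._results = results

populate_wrapper(FakeWrapper, FakeResults)
print(FakeWrapper(FakeResults(21)).doubled())  # '42'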
Example #6
def _reduce_dict(count_dict, partial_key):
    """
    Compute a partial sum over a counter dict.
    Given a partial key matching the start of the category tuples, it sums
    the corresponding values.
    """
    L = len(partial_key)
    count = sum(v for k, v in iteritems(count_dict) if k[:L] == partial_key)
    return count
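A small, self-contained illustration of the partial-key summation that _reduce_dict performs (a plain dict stands in for the counter dict):

counts = {('male', 'old'): 3, ('male', 'young'): 5, ('female', 'old'): 2}
partial_key = ('male',)
L = len(partial_key)
print(sum(v for k, v in counts.items() if k[:L] == partial_key))  # 8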
Example #7
def _normalize_data(data, index):
    """normalize the data to a dict with tuples of strings as keys
    right now it works with:

        0 - dictionary (or equivalent mappable)
        1 - pandas.Series with simple or hierarchical indexes
        2 - numpy.ndarrays
        3 - everything that can be converted to a numpy array
        4 - pandas.DataFrame (via the _normalize_dataframe function)
    """
    # if data is a dataframe we need to take a completely new road
    # before coming back here. Use the hasattr to avoid importing
    # pandas explicitly
    if hasattr(data, 'pivot') and hasattr(data, 'groupby'):
        data = _normalize_dataframe(data, index)
        index = None
    # can it be used as a dictionary?
    try:
        items = list(iteritems(data))
    except AttributeError:
        # ok, I cannot use the data as a dictionary
        # Try to convert it to a numpy array, or die trying
        data = np.asarray(data)
        temp = OrderedDict()
        for idx in np.ndindex(data.shape):
            name = tuple(i for i in idx)
            temp[name] = data[idx]
        data = temp
        items = list(iteritems(data))
    # make all the keys a tuple, even if simple numbers
    data = OrderedDict([_tuplify(k), v] for k, v in items)
    categories_levels = _categories_level(list(iterkeys(data)))
    # fill the void in the counting dictionary
    indexes = product(*categories_levels)
    contingency = OrderedDict([(k, data.get(k, 0)) for k in indexes])
    data = contingency
    # reorder the keys order according to the one specified by the user
    # or if the index is None convert it into a simple list
    # right now it doesn't do any check, but can be modified in the future
    index = lrange(len(categories_levels)) if index is None else index
    contingency = OrderedDict()
    for key, value in iteritems(data):
        new_key = tuple(key[i] for i in index)
        contingency[new_key] = value
    data = contingency
    return data
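A quick look at what _normalize_data produces; note this is a private helper, and the import path below assumes statsmodels.graphics.mosaicplot and may vary across versions:

from statsmodels.graphics.mosaicplot import _normalize_data

print(_normalize_data({'a': 3, 'b': 1}, None))
# OrderedDict([(('a',), 3), (('b',), 1)])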
Example #8
def _normalize_data(data, index):
    """normalize the data to a dict with tuples of strings as keys
    right now it works with:

        0 - dictionary (or equivalent mappable)
        1 - pandas.Series with simple or hierarchical indexes
        2 - numpy.ndarrays
        3 - everything that can be converted to a numpy array
        4 - pandas.DataFrame (via the _normalize_dataframe function)
    """
    # if data is a dataframe we need to take a completely new road
    # before coming back here. Use the hasattr to avoid importing
    # pandas explicitly
    if hasattr(data, 'pivot') and hasattr(data, 'groupby'):
        data = _normalize_dataframe(data, index)
        index = None
    # can it be used as a dictionary?
    try:
        items = list(iteritems(data))
    except AttributeError:
        # ok, I cannot use the data as a dictionary
        # Try to convert it to a numpy array, or die trying
        data = np.asarray(data)
        temp = OrderedDict()
        for idx in np.ndindex(data.shape):
            name = tuple(i for i in idx)
            temp[name] = data[idx]
        data = temp
        items = list(iteritems(data))
    # make all the keys a tuple, even if simple numbers
    data = OrderedDict([_tuplify(k), v] for k, v in items)
    categories_levels = _categories_level(list(iterkeys(data)))
    # fill the void in the counting dictionary
    indexes = product(*categories_levels)
    contingency = OrderedDict([(k, data.get(k, 0)) for k in indexes])
    data = contingency
    # reorder the keys order according to the one specified by the user
    # or if the index is None convert it into a simple list
    # right now it doesn't do any check, but can be modified in the future
    index = lrange(len(categories_levels)) if index is None else index
    contingency = OrderedDict()
    for key, value in iteritems(data):
        new_key = tuple(key[i] for i in index)
        contingency[new_key] = value
    data = contingency
    return data
Example #9
def _reduce_dict(count_dict, partial_key):
    """
    Compute a partial sum over a counter dict.
    Given a partial key matching the start of the category tuples, it sums
    the corresponding values.
    """
    L = len(partial_key)
    count = sum(v for k, v in iteritems(count_dict) if k[:L] == partial_key)
    return count
Example #10
def populate_wrapper(klass, wrapping):
    for meth, how in iteritems(klass._wrap_methods):
        if not hasattr(wrapping, meth):
            continue

        func = getattr(wrapping, meth)
        wrapper = make_wrapper(func, how)
        setattr(klass, meth, wrapper)
Example #11
    def equations(self):
        eqs = {}
        for col, ts in iteritems(self.y):
            model = _window_ols(y=ts, x=self.x, window=self._window,
                                window_type=self._window_type,
                                min_periods=self._min_periods)

            eqs[col] = model

        return eqs
Example #12
    def equations(self):
        eqs = {}
        for col, ts in iteritems(self.y):
            model = _window_ols(y=ts, x=self.x, window=self._window,
                                window_type=self._window_type,
                                min_periods=self._min_periods)

            eqs[col] = model

        return eqs
Example #13
def get_lag(mod, endog, exog, start_lag, max_lag, method, model_args=()):
    results = {}
    method = method.lower()
    for lag in range(start_lag, start_lag + max_lag + 1):
        results[lag] = mod(endog, exog[:, :lag], *model_args).fit()
    if method == "aic":
        best_inf_crit, best_lag = min(
            (v.aic, k)
            for k, v in iteritems(results))  # iterate over the values from results
    else:
        raise ValueError("method %s not understood" % method)
    return best_inf_crit, best_lag
Example #14
def test_get_trendorder():
    results = {
        'c' : 1,
        'nc' : 0,
        'ct' : 2,
        'ctt' : 3
    }

    for t, trendorder in iteritems(results):
        assert(util.get_trendorder(t) == trendorder)
Example #15
    def select_order(self, maxlag, ic, trend='c', method='mle'):
        """
        Select the lag order according to the information criterion.

        Parameters
        ----------
        maxlag : int
            The highest lag length tried. See `AR.fit`.
        ic : str {'aic','bic','hqic','t-stat'}
            Criterion used for selecting the optimal lag length.
            See `AR.fit`.
        trend : str {'c','nc'}
            Whether to include a constant or not. 'c' - include constant.
            'nc' - no constant.
        method : str {'cmle', 'mle'}
            The method to use in estimation. See `AR.fit`.

        Returns
        -------
        bestlag : int
            Best lag according to IC.
        """
        endog = self.endog

        # make Y and X with same nobs to compare ICs
        Y = endog[maxlag:]
        self.Y = Y  # attach to get correct fit stats
        X = self._stackX(maxlag, trend)  # sets k_trend
        self.X = X
        k = self.k_trend  # k_trend set in _stackX
        k = max(1, k)  # handle if startlag is 0
        results = {}

        if ic != 't-stat':
            for lag in range(k, maxlag+1):
                # have to reinstantiate the model to keep comparable models
                endog_tmp = endog[maxlag-lag:]
                fit = AR(endog_tmp).fit(maxlag=lag, method=method,
                                        full_output=0, trend=trend,
                                        maxiter=100, disp=0)
                results[lag] = getattr(fit, ic)  # safer than eval('fit.'+ic)
            bestic, bestlag = min((res, k) for k, res in iteritems(results))

        else:  # choose by last t-stat.
            stop = 1.6448536269514722  # for t-stat, norm.ppf(.95)
            bestlag = 0  # fall back to 0 if no lag passes the t-test
            for lag in range(maxlag, k - 1, -1):
                # have to reinstantiate the model to keep comparable models
                endog_tmp = endog[maxlag - lag:]
                fit = AR(endog_tmp).fit(maxlag=lag, method=method,
                                        full_output=0, trend=trend,
                                        maxiter=35, disp=-1)

                if np.abs(fit.tvalues[-1]) >= stop:
                    bestlag = lag
                    break
        return bestlag
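A minimal usage sketch, assuming the older statsmodels AR class shown above (removed in later releases in favor of AutoReg/ar_select_order):

import numpy as np
from statsmodels.tsa.ar_model import AR

np.random.seed(0)
y = np.random.randn(200)
best = AR(y).select_order(maxlag=10, ic='aic', trend='c')
print(best)  # the lag order minimizing the AIC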
Example #16
    def equations(self):
        eqs = {}
        for col, ts in iteritems(self.y):
            # TODO: Remove in favor of statsmodels implemetation
            model = pd.ols(y=ts, x=self.x, window=self._window,
                           window_type=self._window_type,
                           min_periods=self._min_periods)

            eqs[col] = model

        return eqs
Example #17
    def equations(self):
        eqs = {}
        for col, ts in iteritems(self.y):
            # TODO: Remove in favor of statsmodels implemetation
            model = pd.ols(y=ts, x=self.x, window=self._window,
                           window_type=self._window_type,
                           min_periods=self._min_periods)

            eqs[col] = model

        return eqs
Example #18
def _statistical_coloring(data):
    """evaluate colors from the indipendence properties of the matrix
    It will encounter problem if one category has all zeros
    """
    data = _normalize_data(data, None)
    categories_levels = _categories_level(list(iterkeys(data)))
    Nlevels = len(categories_levels)
    total = 1.0 * sum(v for v in itervalues(data))
    # count the proportion of observations at each level
    # that have the given level name
    levels_count = []
    for level_idx in range(Nlevels):
        proportion = {}
        for level in categories_levels[level_idx]:
            proportion[level] = 0.0
            for key, value in iteritems(data):
                if level == key[level_idx]:
                    proportion[level] += value
            proportion[level] /= total
        levels_count.append(proportion)
    # for each key obtain the expected value
    # and its standard deviation from a binomial distribution
    # under the hypothesis of independence
    expected = {}
    for key, value in iteritems(data):
        base = 1.0
        for i, k in enumerate(key):
            base *= levels_count[i][k]
        expected[key] = base * total, np.sqrt(total * base * (1.0 - base))
    # now we have the standard deviation of distance from the
    # expected value for each tile. We create the colors from this
    sigmas = dict((k, (data[k] - m) / s) for k, (m, s) in iteritems(expected))
    props = {}
    for key, dev in iteritems(sigmas):
        red = 0.0 if dev < 0 else (dev / (1 + dev))
        blue = 0.0 if dev > 0 else (dev / (-1 + dev))
        green = (1.0 - red - blue) / 2.0
        hatch = 'x' if dev > 2 else 'o' if dev < -2 else ''
        props[key] = {'color': [red, green, blue], 'hatch': hatch}
    return props
Example #19
def _statistical_coloring(data):
    """evaluate colors from the indipendence properties of the matrix
    It will encounter problem if one category has all zeros
    """
    data = _normalize_data(data, None)
    categories_levels = _categories_level(list(iterkeys(data)))
    Nlevels = len(categories_levels)
    total = 1.0 * sum(v for v in itervalues(data))
    # count the proportion of observations at each level
    # that have the given level name
    levels_count = []
    for level_idx in range(Nlevels):
        proportion = {}
        for level in categories_levels[level_idx]:
            proportion[level] = 0.0
            for key, value in iteritems(data):
                if level == key[level_idx]:
                    proportion[level] += value
            proportion[level] /= total
        levels_count.append(proportion)
    # for each key obtain the expected value
    # and its standard deviation from a binomial distribution
    # under the hypothesis of independence
    expected = {}
    for key, value in iteritems(data):
        base = 1.0
        for i, k in enumerate(key):
            base *= levels_count[i][k]
        expected[key] = base * total, np.sqrt(total * base * (1.0 - base))
    # now we have the standard deviation of distance from the
    # expected value for each tile. We create the colors from this
    sigmas = dict((k, (data[k] - m) / s) for k, (m, s) in iteritems(expected))
    props = {}
    for key, dev in iteritems(sigmas):
        red = 0.0 if dev < 0 else (dev / (1 + dev))
        blue = 0.0 if dev > 0 else (dev / (-1 + dev))
        green = (1.0 - red - blue) / 2.0
        hatch = 'x' if dev > 2 else 'o' if dev < -2 else ''
        props[key] = {'color': [red, green, blue], 'hatch': hatch}
    return props
Example #20
    def coefs(self):
        """
        Return dynamic regression coefficients as Panel
        """
        data = {}
        for eq, result in iteritems(self.equations):
            data[eq] = result.beta

        panel = pd.Panel.fromDict(data)

        # Coefficient names become items
        return panel.swapaxes('items', 'minor')
Example #21
    def maxlag_selection(self):
        '''
        Select the maximum lag order.
        :return: the optimal lag order under each information criterion
        '''
        self.data_prepro()
        Y, Y_future, Y_exog, Y_exog_future = self.prepare_Y(0, -10)

        model = VAR(endog=Y, exog=Y_exog)
        ics = defaultdict(list)
        for p in range(self.maxlags + 1):
            result = model._estimate_var(p, offset=self.maxlags - p)
            for k, v in iteritems(result.info_criteria):
                ics[k].append(v)
        selected_orders = dict(
            (k, np.array(v).argmin()) for k, v in iteritems(ics))
        t_str = get_ic_table(ics, selected_orders)
        t_str += '\n%s' % str(selected_orders)
        self.model = model
        self.InfoText.delete(0.0, tkinter.END)
        self.InfoText.insert(0.0, t_str)
Example #22
    def coefs(self):
        """
        Return dynamic regression coefficients as Panel
        """
        data = {}
        for eq, result in iteritems(self.equations):
            data[eq] = result.beta

        panel = pd.Panel.fromDict(data)

        # Coefficient names become items
        return panel.swapaxes('items', 'minor')
Example #23
def test_mosaic_very_complex():
    # make a scattermatrix of mosaic plots to show the correlations between
    # each pair of variable in a dataset. Could be easily converted into a
    # new function that does this automatically based on the type of data
    key_name = ['gender', 'age', 'health', 'work']
    key_base = (['male', 'female'], ['old', 'young'],
                ['healty', 'ill'], ['work', 'unemployed'])
    keys = list(product(*key_base))
    data = OrderedDict(zip(keys, range(1, 1 + len(keys))))
    props = {}
    props[('male', 'old')] = {'color': 'r'}
    props[('female', )] = {'color': 'pink'}
    L = len(key_base)
    fig, axes = pylab.subplots(L, L)
    for i in range(L):
        for j in range(L):
            m = set(range(L)).difference(set((i, j)))
            if i == j:
                axes[i, i].text(0.5,
                                0.5,
                                key_name[i],
                                ha='center',
                                va='center')
                axes[i, i].set_xticks([])
                axes[i, i].set_xticklabels([])
                axes[i, i].set_yticks([])
                axes[i, i].set_yticklabels([])
            else:
                ji = max(i, j)
                ij = min(i, j)
                temp_data = OrderedDict([((k[ij], k[ji]) + tuple(k[r] for r in m), v)
                                         for k, v in iteritems(data)])

                keys = list(iterkeys(temp_data))
                for k in keys:
                    value = _reduce_dict(temp_data, k[:2])
                    temp_data[k[:2]] = value
                    del temp_data[k]
                mosaic(temp_data,
                       ax=axes[i, j],
                       axes_label=False,
                       properties=props,
                       gap=0.05,
                       horizontal=i > j)
    pylab.suptitle('old males should look bright red,  (plot 4 of 4)')
    #pylab.show()
    pylab.close('all')
Example #24
def summary_model(results):
    """
    Create a dict with information about the model
    """

    def time_now(*args, **kwds):
        now = datetime.datetime.now()
        return now.strftime('%Y-%m-%d %H:%M')

    info = {}
    info['Model:'] = lambda x: x.model.__class__.__name__
    info['Model Family:'] = lambda x: x.family.__class__.__name__
    info['Link Function:'] = lambda x: x.family.link.__class__.__name__
    info['Dependent Variable:'] = lambda x: x.model.endog_names
    info['Date:'] = time_now
    info['No. Observations:'] = lambda x: "%#6d" % x.nobs
    info['Df Model:'] = lambda x: "%#6d" % x.df_model
    info['Df Residuals:'] = lambda x: "%#6d" % x.df_resid
    info['Converged:'] = lambda x: x.mle_retvals['converged']
    info['No. Iterations:'] = lambda x: x.mle_retvals['iterations']
    info['Method:'] = lambda x: x.method
    info['Norm:'] = lambda x: x.fit_options['norm']
    info['Scale Est.:'] = lambda x: x.fit_options['scale_est']
    info['Cov. Type:'] = lambda x: x.fit_options['cov']

    rsquared_type = '' if results.k_constant else ' (uncentered)'
    info['R-squared' + rsquared_type + ':'] = lambda x: "%#8.3f" % x.rsquared
    info['Adj. R-squared' + rsquared_type + ':'] = lambda x: "%#8.3f" % x.rsquared_adj  # noqa:E501
    info['Pseudo R-squared:'] = lambda x: "%#8.3f" % x.prsquared
    info['AIC:'] = lambda x: "%8.4f" % x.aic
    info['BIC:'] = lambda x: "%8.4f" % x.bic
    info['Log-Likelihood:'] = lambda x: "%#8.5g" % x.llf
    info['LL-Null:'] = lambda x: "%#8.5g" % x.llnull
    info['LLR p-value:'] = lambda x: "%#8.5g" % x.llr_pvalue
    info['Deviance:'] = lambda x: "%#8.5g" % x.deviance
    info['Pearson chi2:'] = lambda x: "%#6.3g" % x.pearson_chi2
    info['F-statistic:'] = lambda x: "%#8.4g" % x.fvalue
    info['Prob (F-statistic):'] = lambda x: "%#6.3g" % x.f_pvalue
    info['Scale:'] = lambda x: "%#8.5g" % x.scale
    out = {}
    for key, func in iteritems(info):
        try:
            out[key] = func(results)
        except (AttributeError, KeyError, NotImplementedError):
            # NOTE: some models do not have loglike defined (RLM),
            #   so raise NotImplementedError
            pass
    return out
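A hedged usage sketch: summary_model expects any fitted results object, and keys whose attributes are missing are silently skipped. This assumes the summary_model above is in scope together with its datetime and iteritems imports:

import numpy as np
import statsmodels.api as sm

np.random.seed(0)
X = sm.add_constant(np.random.randn(100, 1))
y = X @ [1.0, 2.0] + np.random.randn(100)
res = sm.OLS(y, X).fit()
info = summary_model(res)
print(info['Model:'], info['R-squared:'])  # e.g. OLS and the formatted R-squared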
Example #25
def test_adf_autolag():
    #see issue #246
    #this is mostly a unit test
    d2 = macrodata.load().data

    for k_trend, tr in enumerate(['nc', 'c', 'ct', 'ctt']):
        #[None:'nc', 0:'c', 1:'ct', 2:'ctt']
        x = np.log(d2['realgdp'])
        xd = np.diff(x)

        #check exog
        adf3 = tsast.adfuller(x,
                              maxlag=None,
                              autolag='aic',
                              regression=tr,
                              store=True,
                              regresults=True)
        st2 = adf3[-1]

        assert_equal(len(st2.autolag_results), 15 + 1)  #+1 for lagged level
        for l, res in sorted(iteritems(st2.autolag_results))[:5]:
            lag = l - k_trend
            #assert correct design matrices in _autolag
            assert_equal(res.model.exog[-10:, k_trend], x[-11:-1])
            assert_equal(res.model.exog[-1, k_trend + 1:], xd[-lag:-1][::-1])
            #min-ic lag of dfgls in Stata is also 2, or 9 for maic with notrend
            assert_equal(st2.usedlag, 2)

        #same result with lag fixed at usedlag of autolag
        adf2 = tsast.adfuller(x, maxlag=2, autolag=None, regression=tr)
        assert_almost_equal(adf3[:2], adf2[:2], decimal=12)

    tr = 'c'
    #check maxlag with autolag
    adf3 = tsast.adfuller(x,
                          maxlag=5,
                          autolag='aic',
                          regression=tr,
                          store=True,
                          regresults=True)
    assert_equal(len(adf3[-1].autolag_results), 5 + 1)
    adf3 = tsast.adfuller(x,
                          maxlag=0,
                          autolag='aic',
                          regression=tr,
                          store=True,
                          regresults=True)
    assert_equal(len(adf3[-1].autolag_results), 0 + 1)
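For reference, a minimal adfuller call matching the options exercised in the test above:

import numpy as np
from statsmodels.tsa.stattools import adfuller

np.random.seed(0)
x = np.random.randn(200).cumsum()  # a random walk: should not reject a unit root
stat, pvalue, usedlag, nobs, crit, icbest = adfuller(x, autolag='aic')
print(pvalue)  # typically large for a unit-root process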
Example #26
def summary_model(results):
    '''Create a dict with information about the model
    '''

    def time_now(*args, **kwds):
        now = datetime.datetime.now()
        return now.strftime('%Y-%m-%d %H:%M')

    info = OrderedDict()
    info['Model:'] = lambda x: x.model.__class__.__name__
    info['Model Family:'] = lambda x: x.family.__class__.__name__
    info['Link Function:'] = lambda x: x.family.link.__class__.__name__
    info['Dependent Variable:'] = lambda x: x.model.endog_names
    info['Date:'] = time_now
    info['No. Observations:'] = lambda x: "%#6d" % x.nobs
    info['Df Model:'] = lambda x: "%#6d" % x.df_model
    info['Df Residuals:'] = lambda x: "%#6d" % x.df_resid
    info['Converged:'] = lambda x: x.mle_retvals['converged']
    info['No. Iterations:'] = lambda x: x.mle_retvals['iterations']
    info['Method:'] = lambda x: x.method
    info['Norm:'] = lambda x: x.fit_options['norm']
    info['Scale Est.:'] = lambda x: x.fit_options['scale_est']
    info['Cov. Type:'] = lambda x: x.fit_options['cov']

    rsquared_type = '' if results.k_constant else ' (uncentered)'
    info['R-squared' + rsquared_type + ':'] = lambda x: "%#8.3f" % x.rsquared
    info['Adj. R-squared' + rsquared_type + ':'] = lambda x: "%#8.3f" % x.rsquared_adj
    info['Pseudo R-squared:'] = lambda x: "%#8.3f" % x.prsquared
    info['AIC:'] = lambda x: "%8.4f" % x.aic
    info['BIC:'] = lambda x: "%8.4f" % x.bic
    info['Log-Likelihood:'] = lambda x: "%#8.5g" % x.llf
    info['LL-Null:'] = lambda x: "%#8.5g" % x.llnull
    info['LLR p-value:'] = lambda x: "%#8.5g" % x.llr_pvalue
    info['Deviance:'] = lambda x: "%#8.5g" % x.deviance
    info['Pearson chi2:'] = lambda x: "%#6.3g" % x.pearson_chi2
    info['F-statistic:'] = lambda x: "%#8.4g" % x.fvalue
    info['Prob (F-statistic):'] = lambda x: "%#6.3g" % x.f_pvalue
    info['Scale:'] = lambda x: "%#8.5g" % x.scale
    out = OrderedDict()
    for key, func in iteritems(info):
        try:
            out[key] = func(results)
        except (AttributeError, KeyError, NotImplementedError):
            # NOTE: some models don't have loglike defined (RLM),
            #   so raise NotImplementedError
            pass
    return out
Example #27
def _recode(x, levels):
    """ Recode categorial data to int factor.

    Parameters
    ----------
    x : array-like
        array-like object of categorically coded data that supports numpy
        array methods.
    levels : dict
        mapping of labels to integer-codings

    Returns
    -------
    out : instance numpy.ndarray

    """
    from pandas import Series
    name = None
    index = None

    if isinstance(x, Series):
        name = x.name
        index = x.index
        x = x.values

    if x.dtype.type not in [np.str_, np.object_]:
        raise ValueError('This is not a categorical factor.'
                         ' Array of str type required.')

    elif not isinstance(levels, dict):
        raise ValueError('This is not a valid value for levels.'
                         ' Dict required.')

    elif not (np.unique(x) == np.unique(list(iterkeys(levels)))).all():
        raise ValueError('The levels do not match the array values.')

    else:
        out = np.empty(x.shape[0], dtype=int)  # np.int is deprecated
        for level, coding in iteritems(levels):
            out[x == level] = coding

        if name:
            out = Series(out, name=name, index=index)

        return out
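A quick illustration of _recode (as defined above) on a small categorical array; the labels in `levels` must exactly match the unique array values:

import numpy as np

labels = np.array(['low', 'high', 'low', 'mid'])
print(_recode(labels, {'low': 0, 'mid': 1, 'high': 2}))  # [0 2 0 1]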
Example #28
def _recode(x, levels):
    """ Recode categorial data to int factor.

    Parameters
    ----------
    x : array-like
        array-like object of categorically coded data that supports numpy
        array methods.
    levels : dict
        mapping of labels to integer-codings

    Returns
    -------
    out : instance numpy.ndarray

    """
    from pandas import Series
    name = None
    index = None

    if isinstance(x, Series):
        name = x.name
        index = x.index
        x = x.values

    if x.dtype.type not in [np.str_, np.object_]:
        raise ValueError('This is not a categorical factor.'
                         ' Array of str type required.')

    elif not isinstance(levels, dict):
        raise ValueError('This is not a valid value for levels.'
                         ' Dict required.')

    elif not (np.unique(x) == np.unique(list(iterkeys(levels)))).all():
        raise ValueError('The levels do not match the array values.')

    else:
        out = np.empty(x.shape[0], dtype=int)  # np.int is deprecated
        for level, coding in iteritems(levels):
            out[x == level] = coding

        if name:
            out = Series(out, name=name, index=index)

        return out
Example #29
def _key_splitting(rect_dict, keys, values, key_subset, horizontal, gap):
    """
    Given a dictionary where each entry is a rectangle, plus lists of keys
    and values (the count of elements in each category), split each
    rectangle accordingly, as long as its key starts with the tuple
    key_subset.  The other keys are returned without modification.
    """
    result = OrderedDict()
    L = len(key_subset)
    for name, (x, y, w, h) in iteritems(rect_dict):
        if key_subset == name[:L]:
            # split based on the values given
            divisions = _split_rect(x, y, w, h, values, horizontal, gap)
            for key, rect in zip(keys, divisions):
                result[name + (key, )] = rect
        else:
            result[name] = (x, y, w, h)
    return result
Example #30
    def test_multi_pvalcorrection(self):
        #test against R package multtest mt.rawp2adjp

        res_multtest = self.res2
        pval0 = res_multtest[:,0]

        for k,v in iteritems(rmethods):
            if v[1] in self.methods:
                reject, pvalscorr = multipletests(pval0,
                                                  alpha=self.alpha,
                                                  method=v[1])[:2]
                assert_almost_equal(pvalscorr, res_multtest[:,v[0]], 15)
                assert_equal(reject, pvalscorr <= self.alpha)

        pvalscorr = np.sort(fdrcorrection(pval0, method='n')[1])
        assert_almost_equal(pvalscorr, res_multtest[:,7], 15)
        pvalscorr = np.sort(fdrcorrection(pval0, method='i')[1])
        assert_almost_equal(pvalscorr, res_multtest[:,6], 15)
Example #31
    def test_multi_pvalcorrection(self):
        #test against R package multtest mt.rawp2adjp

        res_multtest = self.res2
        pval0 = res_multtest[:, 0]

        for k, v in iteritems(rmethods):
            if v[1] in self.methods:
                reject, pvalscorr = multipletests(pval0,
                                                  alpha=self.alpha,
                                                  method=v[1])[:2]
                assert_almost_equal(pvalscorr, res_multtest[:, v[0]], 15)
                assert_equal(reject, pvalscorr <= self.alpha)

        pvalscorr = np.sort(fdrcorrection(pval0, method='n')[1])
        assert_almost_equal(pvalscorr, res_multtest[:, 7], 15)
        pvalscorr = np.sort(fdrcorrection(pval0, method='i')[1])
        assert_almost_equal(pvalscorr, res_multtest[:, 6], 15)
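A minimal, self-contained use of the function exercised by the test above (method names such as 'fdr_bh' are part of the public multipletests API):

import numpy as np
from statsmodels.stats.multitest import multipletests

pvals = np.array([0.01, 0.04, 0.03, 0.20])
reject, pvals_corr = multipletests(pvals, alpha=0.05, method='fdr_bh')[:2]
print(reject, pvals_corr)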
Example #32
def _key_splitting(rect_dict, keys, values, key_subset, horizontal, gap):
    """
    Given a dictionary where each entry is a rectangle, plus lists of keys
    and values (the count of elements in each category), split each
    rectangle accordingly, as long as its key starts with the tuple
    key_subset.  The other keys are returned without modification.
    """
    result = OrderedDict()
    L = len(key_subset)
    for name, (x, y, w, h) in iteritems(rect_dict):
        if key_subset == name[:L]:
            # split based on the values given
            divisions = _split_rect(x, y, w, h, values, horizontal, gap)
            for key, rect in zip(keys, divisions):
                result[name + (key,)] = rect
        else:
            result[name] = (x, y, w, h)
    return result
Example #33
def summary_model(results):
    '''Create a dict with information about the model
    '''
    def time_now(*args, **kwds):
        now = datetime.datetime.now()
        return now.strftime('%Y-%m-%d %H:%M')
    info = OrderedDict()
    info['Model:'] = lambda x: x.model.__class__.__name__
    info['Model Family:'] = lambda x: x.family.__class__.__name__
    info['Link Function:'] = lambda x: x.family.link.__class__.__name__
    info['Dependent Variable:'] = lambda x: x.model.endog_names
    info['Date:'] = time_now
    info['No. Observations:'] = lambda x: "%#6d" % x.nobs
    info['Df Model:'] = lambda x: "%#6d" % x.df_model
    info['Df Residuals:'] = lambda x: "%#6d" % x.df_resid
    info['Converged:'] = lambda x: x.mle_retvals['converged']
    info['No. Iterations:'] = lambda x: x.mle_retvals['iterations']
    info['Method:'] = lambda x: x.method
    info['Norm:'] = lambda x: x.fit_options['norm']
    info['Scale Est.:'] = lambda x: x.fit_options['scale_est']
    info['Cov. Type:'] = lambda x: x.fit_options['cov']
    info['R-squared:'] = lambda x: "%#8.3f" % x.rsquared
    info['Adj. R-squared:'] = lambda x: "%#8.3f" % x.rsquared_adj
    info['Pseudo R-squared:'] = lambda x: "%#8.3f" % x.prsquared
    info['AIC:'] = lambda x: "%8.4f" % x.aic
    info['BIC:'] = lambda x: "%8.4f" % x.bic
    info['Log-Likelihood:'] = lambda x: "%#8.5g" % x.llf
    info['LL-Null:'] = lambda x: "%#8.5g" % x.llnull
    info['LLR p-value:'] = lambda x: "%#8.5g" % x.llr_pvalue
    info['Deviance:'] = lambda x: "%#8.5g" % x.deviance
    info['Pearson chi2:'] = lambda x: "%#6.3g" % x.pearson_chi2
    info['F-statistic:'] = lambda x: "%#8.4g" % x.fvalue
    info['Prob (F-statistic):'] = lambda x: "%#6.3g" % x.f_pvalue
    info['Scale:'] = lambda x: "%#8.5g" % x.scale
    out = OrderedDict()
    for key, func in iteritems(info):
        try:
            out[key] = func(results)
        except (AttributeError, KeyError, NotImplementedError):
            pass
    return out
Example #34
def test_mosaic_very_complex():
    # make a scattermatrix of mosaic plots to show the correlations between
    # each pair of variable in a dataset. Could be easily converted into a
    # new function that does this automatically based on the type of data
    key_name = ['gender', 'age', 'health', 'work']
    key_base = (['male', 'female'], ['old', 'young'],
                    ['healty', 'ill'], ['work', 'unemployed'])
    keys = list(product(*key_base))
    data = OrderedDict(zip(keys, range(1, 1 + len(keys))))
    props = {}
    props[('male', 'old')] = {'color': 'r'}
    props[('female',)] = {'color': 'pink'}
    L = len(key_base)
    fig, axes = pylab.subplots(L, L)
    for i in range(L):
        for j in range(L):
            m = set(range(L)).difference(set((i, j)))
            if i == j:
                axes[i, i].text(0.5, 0.5, key_name[i],
                                ha='center', va='center')
                axes[i, i].set_xticks([])
                axes[i, i].set_xticklabels([])
                axes[i, i].set_yticks([])
                axes[i, i].set_yticklabels([])
            else:
                ji = max(i, j)
                ij = min(i, j)
                temp_data = OrderedDict([((k[ij], k[ji]) + tuple(k[r] for r in m), v)
                                            for k, v in iteritems(data)])

                keys = list(iterkeys(temp_data))
                for k in keys:
                    value = _reduce_dict(temp_data, k[:2])
                    temp_data[k[:2]] = value
                    del temp_data[k]
                mosaic(temp_data, ax=axes[i, j], axes_label=False,
                       properties=props, gap=0.05, horizontal=i > j)
    pylab.suptitle('old males should look bright red,  (plot 4 of 4)')
    #pylab.show()
    pylab.close('all')
Example #35
    def __init__(self, endog, exog, tree, paramsind):
        self.endog = endog
        self.datadict = exog
        self.tree = tree
        self.paramsind = paramsind

        self.branchsum = ''
        self.probs = {}
        self.probstxt = {}
        self.branchleaves = {}
        self.branchvalues = {}  #just to keep track of returns by branches
        self.branchsums = {}
        self.bprobs = {}
        self.branches, self.leaves, self.branches_degenerate  = getnodes(tree)
        self.nbranches = len(self.branches)

        #copied over but not quite sure yet
        #unique, parameter array names,
        #sorted alphabetically, order is/should be only internal

        self.paramsnames = (sorted(set([i for j in itervalues(paramsind)
                                       for i in j])) +
                            ['tau_%s' % bname for bname in self.branches])

        self.nparams = len(self.paramsnames)

        #mapping coefficient names to indices to unique/parameter array
        self.paramsidx = dict((name, idx) for (idx,name) in
                              enumerate(self.paramsnames))

        #mapping branch and leaf names to index in parameter array
        self.parinddict = dict((k, [self.paramsidx[j] for j in v])
                               for k,v in iteritems(self.paramsind))

        self.recursionparams = 1. + np.arange(len(self.paramsnames))
        #for testing that individual parameters are used in the right place
        self.recursionparams = np.zeros(len(self.paramsnames))
        #self.recursionparams[2] = 1
        self.recursionparams[-self.nbranches:] = 1  #values for tau's
Example #36
    def __init__(self, endog, exog, tree, paramsind):
        self.endog = endog
        self.datadict = exog
        self.tree = tree
        self.paramsind = paramsind

        self.branchsum = ''
        self.probs = {}
        self.probstxt = {}
        self.branchleaves = {}
        self.branchvalues = {}  #just to keep track of returns by branches
        self.branchsums = {}
        self.bprobs = {}
        self.branches, self.leaves, self.branches_degenerate = getnodes(tree)
        self.nbranches = len(self.branches)

        #copied over but not quite sure yet
        #unique, parameter array names,
        #sorted alphabetically, order is/should be only internal

        self.paramsnames = (
            sorted(set([i for j in itervalues(paramsind) for i in j])) +
            ['tau_%s' % bname for bname in self.branches])

        self.nparams = len(self.paramsnames)

        #mapping coefficient names to indices to unique/parameter array
        self.paramsidx = dict(
            (name, idx) for (idx, name) in enumerate(self.paramsnames))

        #mapping branch and leaf names to index in parameter array
        self.parinddict = dict((k, [self.paramsidx[j] for j in v])
                               for k, v in iteritems(self.paramsind))

        self.recursionparams = 1. + np.arange(len(self.paramsnames))
        #for testing that individual parameters are used in the right place
        self.recursionparams = np.zeros(len(self.paramsnames))
        #self.recursionparams[2] = 1
        self.recursionparams[-self.nbranches:] = 1  #values for tau's
Example #37
def test_adf_autolag():
    #see issue #246
    #this is mostly a unit test
    d2 = macrodata.load().data

    for k_trend, tr in enumerate(['nc', 'c', 'ct', 'ctt']):
        #[None:'nc', 0:'c', 1:'ct', 2:'ctt']
        x = np.log(d2['realgdp'])
        xd = np.diff(x)

        #check exog
        adf3 = tsast.adfuller(x, maxlag=None, autolag='aic',
                              regression=tr, store=True, regresults=True)
        st2 = adf3[-1]

        assert_equal(len(st2.autolag_results), 15 + 1)  #+1 for lagged level
        for l, res in sorted(iteritems(st2.autolag_results))[:5]:
            lag = l-k_trend
            #assert correct design matrices in _autolag
            assert_equal(res.model.exog[-10:,k_trend], x[-11:-1])
            assert_equal(res.model.exog[-1,k_trend+1:], xd[-lag:-1][::-1])
            #min-ic lag of dfgls in Stata is also 2, or 9 for maic with notrend
            assert_equal(st2.usedlag, 2)

        #same result with lag fixed at usedlag of autolag
        adf2 = tsast.adfuller(x, maxlag=2, autolag=None, regression=tr)
        assert_almost_equal(adf3[:2], adf2[:2], decimal=12)


    tr = 'c'
    #check maxlag with autolag
    adf3 = tsast.adfuller(x, maxlag=5, autolag='aic',
                          regression=tr, store=True, regresults=True)
    assert_equal(len(adf3[-1].autolag_results), 5 + 1)
    adf3 = tsast.adfuller(x, maxlag=0, autolag='aic',
                          regression=tr, store=True, regresults=True)
    assert_equal(len(adf3[-1].autolag_results), 0 + 1)
Example #38
 def __init__(self, **kwargs):
     for key, value in iteritems(kwargs):
         setattr(self, key, value)
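The same attributes-from-kwargs pattern in a self-contained class (the name Bunch below is only a conventional stand-in):

class Bunch(object):
    def __init__(self, **kwargs):
        for key, value in kwargs.items():
            setattr(self, key, value)

b = Bunch(observed=[1, 2, 3], name='demo')
print(b.name, b.observed)  # demo [1, 2, 3]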
Example #39
    def resid(self):
        data = {}
        for eq, result in iteritems(self.equations):
            data[eq] = result.resid

        return pd.DataFrame(data)
Example #40
 def r2(self):
     """Returns the r-squared values."""
     data = dict((eq, r.r2) for eq, r in iteritems(self.equations))
     return pd.DataFrame(data)
Example #41
 def nobs(self):
     # Stub, do I need this?
     data = dict((eq, r.nobs) for eq, r in iteritems(self.equations))
     return pd.DataFrame(data)
Example #42
    def resid(self):
        data = {}
        for eq, result in iteritems(self.equations):
            data[eq] = result.resid

        return pd.DataFrame(data)
Example #43
def _autolag(mod, endog, exog, startlag, maxlag, method, modargs=(),
             fitargs=(), regresults=False):
    """
    Returns the results for the lag length that minimizes the info criterion.

    Parameters
    ----------
    mod : Model class
        Model estimator class
    endog : array-like
        nobs array containing endogenous variable
    exog : array-like
        nobs by (startlag + maxlag) array containing lags and possibly other
        variables
    startlag : int
        The first zero-indexed column to hold a lag.  See Notes.
    maxlag : int
        The highest lag order for lag length selection.
    method : {'aic', 'bic', 't-stat'}
        aic - Akaike Information Criterion
        bic - Bayes Information Criterion
        t-stat - Based on last lag
    modargs : tuple, optional
        args to pass to model.  See notes.
    fitargs : tuple, optional
        args to pass to fit.  See notes.
    regresults : bool, optional
        Flag indicating to return optional return results

    Returns
    -------
    icbest : float
        Best information criteria.
    bestlag : int
        The lag length that minimizes the information criterion.
    results : dict, optional
        Dictionary containing all estimation results

    Notes
    -----
    Does estimation like mod(endog, exog[:,:i], *modargs).fit(*fitargs)
    where i goes from startlag to startlag + maxlag + 1.  Therefore, lags are
    assumed to be in contiguous columns from low to high lag length with
    the highest lag in the last column.
    """
    #TODO: can tcol be replaced by maxlag + 2?
    #TODO: This could be changed to laggedRHS and exog keyword arguments if
    #    this will be more general.

    results = {}
    method = method.lower()
    for lag in range(startlag, startlag + maxlag + 1):
        mod_instance = mod(endog, exog[:, :lag], *modargs)
        results[lag] = mod_instance.fit(*fitargs)

    if method == "aic":
        icbest, bestlag = min((v.aic, k) for k, v in iteritems(results))
    elif method == "bic":
        icbest, bestlag = min((v.bic, k) for k, v in iteritems(results))
    elif method == "t-stat":
        #stop = stats.norm.ppf(.95)
        stop = 1.6448536269514722
        for lag in range(startlag + maxlag, startlag - 1, -1):
            icbest = np.abs(results[lag].tvalues[-1])
            if icbest >= stop:
                bestlag = lag
                break
    else:
        raise ValueError("Information Criterion %s not understood.") % method

    if not regresults:
        return icbest, bestlag
    else:
        return icbest, bestlag, results
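The core selection idiom used above, in isolation: pair each candidate lag with its criterion value and take the minimum (ties break toward the smaller lag). The numbers below are made up for illustration:

aics = {1: 210.3, 2: 205.1, 3: 205.1, 4: 208.0}
icbest, bestlag = min((v, k) for k, v in aics.items())
print(icbest, bestlag)  # 205.1 2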
Example #44
    def solve_power(self, **kwds):
        '''solve for any one of the parameters of a t-test

        for t-test the keywords are:
            effect_size, nobs, alpha, power

        exactly one needs to be ``None``, all others need numeric values

        *attaches*

        cache_fit_res : list
            Cache of the result of the root finding procedure for the latest
            call to ``solve_power``, mainly for debugging purposes.
            The first element is the success indicator, one if successful.
            The remaining elements contain the return information of the up to
            three solvers that have been tried.


        '''
        #TODO: maybe use explicit kwds,
        #    nicer but requires inspect? and not generic across tests
        #    I'm duplicating this in the subclass to get informative docstring
        key = [k for k,v in iteritems(kwds) if v is None]
        #print kwds, key;
        if len(key) != 1:
            raise ValueError('need exactly one keyword that is None')
        key = key[0]

        if key == 'power':
            del kwds['power']
            return self.power(**kwds)

        self._counter = 0
        def func(x):
            kwds[key] = x
            fval = self._power_identity(**kwds)
            self._counter += 1
            #print self._counter,
            if self._counter > 500:
                raise RuntimeError('possible endless loop (500 NaNs)')
            if np.isnan(fval):
                return np.inf
            else:
                return fval

        #TODO: I'm using the following so I get a warning when start_ttp is not defined
        try:
            start_value = self.start_ttp[key]
        except KeyError:
            start_value = 0.9
            print('Warning: using default start_value for {0}'.format(key))

        fit_kwds = self.start_bqexp[key]
        fit_res = []
        #print vars()
        try:
            val, res = brentq_expanding(func, full_output=True, **fit_kwds)
            failed = False
            fit_res.append(res)
        except ValueError:
            failed = True
            fit_res.append(None)

        success = None
        if (not failed) and res.converged:
            success = 1
        else:
            # try backup
            #TODO: check more cases to make this robust
            val, infodict, ier, msg = optimize.fsolve(func, start_value,
                                                      full_output=True) #scalar
            #val = optimize.newton(func, start_value) #scalar
            fval = infodict['fvec']
            fit_res.append(infodict)
            if ier == 1 and np.abs(fval) < 1e-4 :
                success = 1
            else:
                #print infodict
                if key in ['alpha', 'power', 'effect_size']:
                    val, r = optimize.brentq(func, 1e-8, 1-1e-8,
                                             full_output=True) #scalar
                    success = 1 if r.converged else 0
                    fit_res.append(r)
                else:
                    success = 0

        if not success == 1:
            import warnings
            from statsmodels.tools.sm_exceptions import (ConvergenceWarning,
                convergence_doc)
            warnings.warn(convergence_doc, ConvergenceWarning)

        #attach fit_res, for reading only, should be needed only for debugging
        fit_res.insert(0, success)
        self.cache_fit_res = fit_res
        return val
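A hedged usage sketch of solve_power through a concrete power class: exactly one keyword is left as None and is solved for (TTestIndPower is the independent-samples t-test subclass in statsmodels.stats.power):

from statsmodels.stats.power import TTestIndPower

analysis = TTestIndPower()
n = analysis.solve_power(effect_size=0.5, nobs1=None, alpha=0.05, power=0.8)
print(round(n))  # roughly 64 observations per group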
Example #45
from statsmodels.datasets import macrodata

import statsmodels.tsa.stattools as tsa_stats

# some example data
mdata = macrodata.load_pandas().data
mdata = mdata[['realgdp','realcons']].values
data = mdata
data = np.diff(np.log(data), axis=0)

#R: lmtest:grangertest
r_result = [0.243097, 0.7844328, 195, 2]  #f_test
gr = tsa_stats.grangercausalitytests(data[:,1::-1], 2, verbose=False)
assert_almost_equal(r_result, gr[2][0]['ssr_ftest'], decimal=7)
assert_almost_equal(gr[2][0]['params_ftest'], gr[2][0]['ssr_ftest'],
                    decimal=7)

lag = 2
print('\nTest Results for %d lags' % lag)
print()
print('\n'.join(['%-20s statistic: %6.4f   p-value: %6.4f' % (k, res[0], res[1])
                 for k, res in iteritems(gr[lag][0]) ]))

print('\n Results for auxiliary restricted regression with two lags')
print()
print(gr[lag][1][0].summary())

print('\n Results for auxiliary unrestricted regression with two lags')
print()
print(gr[lag][1][1].summary())
Example #46
def arma_order_select_ic(y, max_ar=4, max_ma=2, ic='bic', trend='c',
                         model_kw={}, fit_kw={}):
    """
    Returns information criteria for many ARMA models

    Parameters
    ----------
    y : array-like
        Time-series data
    max_ar : int
        Maximum number of AR lags to use. Default 4.
    max_ma : int
        Maximum number of MA lags to use. Default 2.
    ic : str, list
        Information criteria to report. Either a single string or a list
        of different criteria is possible.
    trend : str
        The trend to use when fitting the ARMA models.
    model_kw : dict
        Keyword arguments to be passed to the ``ARMA`` model
    fit_kw : dict
        Keyword arguments to be passed to ``ARMA.fit``.

    Returns
    -------
    obj : Results object
        Each ic is an attribute with a DataFrame for the results. The AR order
        used is the row index. The ma order used is the column index. The
        minimum orders are available as ``ic_min_order``.

    Examples
    --------

    >>> from statsmodels.tsa.arima_process import arma_generate_sample
    >>> import statsmodels.api as sm
    >>> import numpy as np

    >>> arparams = np.array([.75, -.25])
    >>> maparams = np.array([.65, .35])
    >>> arparams = np.r_[1, -arparams]
    >>> maparam = np.r_[1, maparams]
    >>> nobs = 250
    >>> np.random.seed(2014)
    >>> y = arma_generate_sample(arparams, maparams, nobs)
    >>> res = sm.tsa.arma_order_select_ic(y, ic=['aic', 'bic'], trend='nc')
    >>> res.aic_min_order
    >>> res.bic_min_order

    Notes
    -----
    This method can be used to tentatively identify the order of an ARMA
    process, provided that the time series is stationary and invertible. This
    function computes the full exact MLE estimate of each model and can,
    therefore, be a little slow. An implementation using approximate estimates
    will be provided in the future. In the meantime, consider passing
    {method : 'css'} to fit_kw.
    """
    from pandas import DataFrame

    ar_range = lrange(0, max_ar + 1)
    ma_range = lrange(0, max_ma + 1)
    if isinstance(ic, string_types):
        ic = [ic]
    elif not isinstance(ic, (list, tuple)):
        raise ValueError("Need a list or a tuple for ic if not a string.")

    results = np.zeros((len(ic), max_ar + 1, max_ma + 1))

    for ar in ar_range:
        for ma in ma_range:
            if ar == 0 and ma == 0 and trend == 'nc':
                results[:, ar, ma] = np.nan
                continue

            mod = _safe_arma_fit(y, (ar, ma), model_kw, trend, fit_kw)
            if mod is None:
                results[:, ar, ma] = np.nan
                continue

            for i, criteria in enumerate(ic):
                results[i, ar, ma] = getattr(mod, criteria)

    dfs = [DataFrame(res, columns=ma_range, index=ar_range) for res in results]

    res = dict(zip(ic, dfs))

    # add the minimums to the results dict
    min_res = {}
    for i, result in iteritems(res):
        mins = np.where(result.min().min() == result)
        min_res.update({i + '_min_order' : (mins[0][0], mins[1][0])})
    res.update(min_res)

    return Bunch(**res)
Example #47
 def __init__(self, **kwargs):
     for key, value in iteritems(kwargs):
         setattr(self, key, value)
     self.nobs = len(self.observed)
Example #48
 def __init__(self, **kwargs):
     for key, value in iteritems(kwargs):
         setattr(self, key, value)
Example #49
 def __init__(self, **kwargs):
     for key, value in iteritems(kwargs):
         setattr(self, key, value)
     self.nobs = len(self.observed)
Example #50
def _autolag(mod, endog, exog, startlag, maxlag, method, modargs=(),
             fitargs=(), regresults=False):
    """
    Returns the results for the lag length that minimizes the info criterion.

    Parameters
    ----------
    mod : Model class
        Model estimator class.
    modargs : tuple
        args to pass to model.  See notes.
    fitargs : tuple
        args to pass to fit.  See notes.
    startlag : int
        The first zero-indexed column to hold a lag.  See Notes.
    maxlag : int
        The highest lag order for lag length selection.
    method : str {"aic","bic","t-stat"}
        aic - Akaike Information Criterion
        bic - Bayes Information Criterion
        t-stat - Based on last lag

    Returns
    -------
    icbest : float
        Best information criteria.
    bestlag : int
        The lag length that minimizes the information criterion.


    Notes
    -----
    Does estimation like mod(endog, exog[:,:i], *modargs).fit(*fitargs)
    where i goes from startlag to startlag + maxlag + 1.  Therefore, lags are
    assumed to be in contiguous columns from low to high lag length with
    the highest lag in the last column.
    """
    #TODO: can tcol be replaced by maxlag + 2?
    #TODO: This could be changed to laggedRHS and exog keyword arguments if
    #    this will be more general.

    results = {}
    method = method.lower()
    for lag in range(startlag, startlag + maxlag + 1):
        mod_instance = mod(endog, exog[:, :lag], *modargs)
        results[lag] = mod_instance.fit(*fitargs)

    if method == "aic":
        icbest, bestlag = min((v.aic, k) for k, v in iteritems(results))
    elif method == "bic":
        icbest, bestlag = min((v.bic, k) for k, v in iteritems(results))
    elif method == "t-stat":
        #stop = stats.norm.ppf(.95)
        stop = 1.6448536269514722
        for lag in range(startlag + maxlag, startlag - 1, -1):
            icbest = np.abs(results[lag].tvalues[-1])
            if icbest >= stop:
                bestlag = lag
                break
    else:
        raise ValueError("Information Criterion %s not understood.") % method

    if not regresults:
        return icbest, bestlag
    else:
        return icbest, bestlag, results
Example #51
    def handle_missing(cls, endog, exog, missing, **kwargs):
        """
        This returns a dictionary with keys endog, exog and the keys of
        kwargs. It preserves Nones.
        """
        none_array_names = []

        # patsy's already dropped NaNs in y/X
        missing_idx = kwargs.pop('missing_idx', None)

        if missing_idx is not None:
            # y, X already handled by patsy. add back in later.
            combined = ()
            combined_names = []
            if exog is None:
                none_array_names += ['exog']
        elif exog is not None:
            combined = (endog, exog)
            combined_names = ['endog', 'exog']
        else:
            combined = (endog, )
            combined_names = ['endog']
            none_array_names += ['exog']

        # deal with other arrays
        combined_2d = ()
        combined_2d_names = []
        if len(kwargs):
            for key, value_array in iteritems(kwargs):
                if value_array is None or value_array.ndim == 0:
                    none_array_names += [key]
                    continue
                # grab 1d arrays
                if value_array.ndim == 1:
                    combined += (np.asarray(value_array), )
                    combined_names += [key]
                elif value_array.squeeze().ndim == 1:
                    combined += (np.asarray(value_array), )
                    combined_names += [key]

                # grab 2d arrays that are _assumed_ to be symmetric
                elif value_array.ndim == 2:
                    combined_2d += (np.asarray(value_array), )
                    combined_2d_names += [key]
                else:
                    raise ValueError("Arrays with more than 2 dimensions "
                                     "aren't yet handled")

        if missing_idx is not None:
            nan_mask = missing_idx
            updated_row_mask = None
            if combined:  # there were extra arrays not handled by patsy
                combined_nans = _nan_rows(*combined)
                if combined_nans.shape[0] != nan_mask.shape[0]:
                    raise ValueError("Shape mismatch between endog/exog "
                                     "and extra arrays given to model.")
                # for going back and updating endog/exog
                updated_row_mask = combined_nans[~nan_mask]
                nan_mask |= combined_nans  # for updating extra arrays only
            if combined_2d:
                combined_2d_nans = _nan_rows(*combined_2d)
                if combined_2d_nans.shape[0] != nan_mask.shape[0]:
                    raise ValueError("Shape mismatch between endog/exog "
                                     "and extra 2d arrays given to model.")
                if updated_row_mask is not None:
                    updated_row_mask |= combined_2d_nans[~nan_mask]
                else:
                    updated_row_mask = combined_2d_nans[~nan_mask]
                nan_mask |= combined_2d_nans

        else:
            nan_mask = _nan_rows(*combined)
            if combined_2d:
                nan_mask = _nan_rows(*(nan_mask[:, None], ) + combined_2d)

        if not np.any(nan_mask):  # no missing values, don't do anything
            combined = dict(zip(combined_names, combined))
            if combined_2d:
                combined.update(dict(zip(combined_2d_names, combined_2d)))
            if none_array_names:
                combined.update(
                    dict(zip(none_array_names,
                             [None] * len(none_array_names))))

            if missing_idx is not None:
                combined.update({'endog': endog})
                if exog is not None:
                    combined.update({'exog': exog})

            return combined, []

        elif missing == 'raise':
            raise MissingDataError("NaNs were encountered in the data")

        elif missing == 'drop':
            nan_mask = ~nan_mask
            drop_nans = lambda x: cls._drop_nans(x, nan_mask)
            drop_nans_2d = lambda x: cls._drop_nans_2d(x, nan_mask)
            combined = dict(zip(combined_names, lmap(drop_nans, combined)))

            if missing_idx is not None:
                if updated_row_mask is not None:
                    updated_row_mask = ~updated_row_mask
                    # update endog/exog with this new information
                    endog = cls._drop_nans(endog, updated_row_mask)
                    if exog is not None:
                        exog = cls._drop_nans(exog, updated_row_mask)

                combined.update({'endog': endog})
                if exog is not None:
                    combined.update({'exog': exog})

            if combined_2d:
                combined.update(
                    dict(
                        zip(combined_2d_names, lmap(drop_nans_2d,
                                                    combined_2d))))
            if none_array_names:
                combined.update(
                    dict(zip(none_array_names,
                             [None] * len(none_array_names))))

            return combined, np.where(~nan_mask)[0].tolist()
        else:
            raise ValueError("missing option %s not understood" % missing)
Example #52
0
    def solve_power(self, **kwds):
        '''solve for any one of the parameters of a t-test

        for t-test the keywords are:
            effect_size, nobs, alpha, power

        exactly one needs to be ``None``, all others need numeric values

        *attaches*

        cache_fit_res : list
            Cache of the result of the root finding procedure for the latest
            call to ``solve_power``, mainly for debugging purposes.
            The first element is the success indicator, one if successful.
            The remaining elements contain the return information of the up to
            three solvers that have been tried.


        '''
        #TODO: maybe use explicit kwds,
        #    nicer but requires inspect? and not generic across tests
        #    I'm duplicating this in the subclass to get informative docstring
        key = [k for k, v in iteritems(kwds) if v is None]
        #print kwds, key
        if len(key) != 1:
            raise ValueError('need exactly one keyword that is None')
        key = key[0]

        # check the degenerate zero-effect case before the power shortcut,
        # so the key == 'power' branch below is reachable
        if kwds['effect_size'] == 0:
            import warnings
            from statsmodels.tools.sm_exceptions import HypothesisTestWarning
            warnings.warn('Warning: Effect size of 0 detected',
                          HypothesisTestWarning)
            # with an effect size of 0, power equals the significance level
            if key == 'power':
                return kwds['alpha']
            if key == 'alpha':
                return kwds['power']
            else:
                raise ValueError(
                    'Cannot solve for %s with an effect size of 0. '
                    'Try changing your effect size.' % key)

        if key == 'power':
            del kwds['power']
            return self.power(**kwds)

        self._counter = 0

        def func(x):
            kwds[key] = x
            fval = self._power_identity(**kwds)
            self._counter += 1
            #print self._counter,
            if self._counter > 500:
                raise RuntimeError('possible endless loop (500 NaNs)')
            if np.isnan(fval):
                return np.inf
            else:
                return fval

        #TODO: I'm using the following so I get a warning when start_ttp is not defined
        try:
            start_value = self.start_ttp[key]
        except KeyError:
            start_value = 0.9
            import warnings
            from statsmodels.tools.sm_exceptions import ValueWarning
            warnings.warn(
                'Warning: using default start_value for {0}'.format(key),
                ValueWarning)

        fit_kwds = self.start_bqexp[key]
        fit_res = []
        #print vars()
        try:
            val, res = brentq_expanding(func, full_output=True, **fit_kwds)
            failed = False
            fit_res.append(res)
        except ValueError:
            failed = True
            fit_res.append(None)

        success = None
        if (not failed) and res.converged:
            success = 1
        else:
            # try backup
            # TODO: check more cases to make this robust
            if not np.isnan(start_value):
                val, infodict, ier, msg = optimize.fsolve(
                    func, start_value, full_output=True)  #scalar
                #val = optimize.newton(func, start_value) #scalar
                fval = infodict['fvec']
                fit_res.append(infodict)
            else:
                ier = -1
                fval = 1
                fit_res.append([None])

            if ier == 1 and np.abs(fval) < 1e-4:
                success = 1
            else:
                #print infodict
                if key in ['alpha', 'power', 'effect_size']:
                    val, r = optimize.brentq(func,
                                             1e-8,
                                             1 - 1e-8,
                                             full_output=True)  #scalar
                    success = 1 if r.converged else 0
                    fit_res.append(r)
                else:
                    success = 0

        if success != 1:
            import warnings
            from statsmodels.tools.sm_exceptions import (ConvergenceWarning,
                                                         convergence_doc)
            warnings.warn(convergence_doc, ConvergenceWarning)

        #attach fit_res, for reading only, should be needed only for debugging
        fit_res.insert(0, success)
        self.cache_fit_res = fit_res
        return val
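As a usage sketch, assuming this method sits behind statsmodels' power
classes (e.g. TTestPower), solving for the sample size looks like the
following; exactly one keyword must be None, and that one is solved for.

from statsmodels.stats.power import TTestPower

analysis = TTestPower()
# solve for the sample size giving 80% power at alpha = 0.05
nobs = analysis.solve_power(effect_size=0.5, nobs=None, alpha=0.05,
                            power=0.8)
# after the call, analysis.cache_fit_res[0] == 1 signals a converged solve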
Example #54
0
def arma_order_select_ic(y, max_ar=4, max_ma=2, ic='bic', trend='c',
                         model_kw={}, fit_kw={}):
    """
    Returns information criteria for many ARMA models

    Parameters
    ----------
    y : array-like
        Time-series data
    max_ar : int
        Maximum number of AR lags to use. Default 4.
    max_ma : int
        Maximum number of MA lags to use. Default 2.
    ic : str, list
        Information criteria to report. Either a single string or a list
        of different criteria is possible.
    trend : str
        The trend to use when fitting the ARMA models.
    model_kw : dict
        Keyword arguments to be passed to the ``ARMA`` model
    fit_kw : dict
        Keyword arguments to be passed to ``ARMA.fit``.

    Returns
    -------
    obj : Results object
        Each ic is an attribute with a DataFrame for the results. The AR
        order is the row index and the MA order is the column index. The
        minimum orders are available as ``ic_min_order``.

    Examples
    --------

    >>> from statsmodels.tsa.arima_process import arma_generate_sample
    >>> import statsmodels.api as sm
    >>> import numpy as np

    >>> arparams = np.array([.75, -.25])
    >>> maparams = np.array([.65, .35])
    >>> arparams = np.r_[1, -arparams]
    >>> maparams = np.r_[1, maparams]
    >>> nobs = 250
    >>> np.random.seed(2014)
    >>> y = arma_generate_sample(arparams, maparams, nobs)
    >>> res = sm.tsa.arma_order_select_ic(y, ic=['aic', 'bic'], trend='nc')
    >>> res.aic_min_order
    >>> res.bic_min_order

    Notes
    -----
    This method can be used to tentatively identify the order of an ARMA
    process, provided that the time series is stationary and invertible. This
    function computes the full exact MLE for each model and can therefore be
    a little slow. An implementation using approximate estimates will be
    provided in the future. In the meantime, consider passing
    {'method': 'css'} as fit_kw (see the sketch after this function).
    """
    from pandas import DataFrame

    ar_range = lrange(0, max_ar + 1)
    ma_range = lrange(0, max_ma + 1)
    if isinstance(ic, string_types):
        ic = [ic]
    elif not isinstance(ic, (list, tuple)):
        raise ValueError("Need a list or a tuple for ic if not a string.")

    results = np.zeros((len(ic), max_ar + 1, max_ma + 1))

    for ar in ar_range:
        for ma in ma_range:
            if ar == 0 and ma == 0 and trend == 'nc':
                results[:, ar, ma] = np.nan
                continue

            mod = _safe_arma_fit(y, (ar, ma), model_kw, trend, fit_kw)
            if mod is None:
                results[:, ar, ma] = np.nan
                continue

            for i, criteria in enumerate(ic):
                results[i, ar, ma] = getattr(mod, criteria)

    dfs = [DataFrame(res, columns=ma_range, index=ar_range) for res in results]

    res = dict(zip(ic, dfs))

    # add the minimums to the results dict
    min_res = {}
    for i, result in iteritems(res):
        mins = np.where(result.min().min() == result)
        min_res.update({i + '_min_order' : (mins[0][0], mins[1][0])})
    res.update(min_res)

    return Bunch(**res)
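Following the Notes, a quicker approximate search can be sketched by passing
the CSS estimator through fit_kw, reusing y and sm from the docstring example
above.

res = sm.tsa.arma_order_select_ic(y, max_ar=4, max_ma=2, ic='bic',
                                  fit_kw={'method': 'css'})
print(res.bic_min_order)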
Example #55
0
 def set_options(self, **kwargs):
     options = ""
     for key, value in iteritems(kwargs):
         # record each option both as a "key=value" line and as an attribute
         options += "{0}={1}\n".format(key, value)
         self.__dict__.update({key: value})
     self.options = options
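A hypothetical usage sketch of the method above, wrapped in a made-up Config
class so it runs standalone (plain dict.items() stands in for the iteritems
compat helper).

class Config(object):
    def set_options(self, **kwargs):
        options = ""
        for key, value in kwargs.items():
            options += "{0}={1}\n".format(key, value)
            self.__dict__.update({key: value})
        self.options = options

cfg = Config()
cfg.set_options(tol=1e-6, maxiter=100)
print(cfg.tol)      # 1e-06
print(cfg.options)  # "tol=1e-06\nmaxiter=100\n" (keyword order may vary)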
Example #56
0
    "g": ["p", "hincg"],
    "B3": [],
    "h": ["consth", "p", "h"],
    "top": [],
}

# unique parameter array names, sorted alphabetically;
# the ordering is internal only

paramsnames = sorted(set([i for j in itervalues(paramsind) for i in j]))

# mapping coefficient names to indices to unique/parameter array
paramsidx = dict((name, idx) for (idx, name) in enumerate(paramsnames))

# mapping branch and leaf names to index in parameter array
inddict = dict((k, [paramsidx[j] for j in v]) for k, v in iteritems(paramsind))

"""
>>> paramsnames
['const2', 'consta', 'constb', 'conste', 'consth', 'constt', 'h', 'hince',
 'hincf', 'hincg', 'p', 'time', 'x2', 'x22']
>>> paramsidx
{'conste': 3, 'consta': 1, 'constb': 2, 'h': 6, 'time': 11, 'consth': 4,
 'p': 10, 'constt': 5, 'const2': 0, 'x2': 12, 'x22': 13, 'hince': 7,
 'hincg': 9, 'hincf': 8}
>>> inddict
{'a': [1, 10], 'c': [1, 10, 11], 'b': [2, 10], 'e': [3, 10, 7],
 'd': [1, 10, 11], 'g': [10, 9], 'f': [5, 10, 8], 'h': [4, 10, 6],
 'top': [], 'B22': [13], 'B21': [], 'B1': [], 'B2': [0, 12], 'B3': []}
>>> paramsind
{'a': ['consta', 'p'], 'c': ['consta', 'p', 'time'], 'b': ['constb', 'p'],
Example #57
0
 def r2(self):
     """Returns the r-squared values."""
     data = dict((eq, r.r2) for eq, r in iteritems(self.equations))
     return pd.DataFrame(data)
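To illustrate what r2 collects, a toy sketch; the equations mapping and the
_Res holder are made up here, standing in for fitted per-equation results
that expose an ``r2`` attribute.

import pandas as pd

class _Res(object):
    def __init__(self, r2):
        self.r2 = r2

equations = {'y1': _Res(0.82), 'y2': _Res(0.64)}
data = dict((eq, r.r2) for eq, r in equations.items())
print(pd.DataFrame(data, index=['r2']))   # one column per equation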