def test_wls_example():
    # example from the WLS docstring; a previously noted bug there should
    # now be fixed
    Y = [1,3,4,5,2,3,4]
    X = lrange(1,8)
    X = add_constant(X, prepend=False)
    wls_model = WLS(Y,X, weights=lrange(1,8)).fit()
    #taken from R lm.summary
    assert_almost_equal(wls_model.fvalue, 0.127337843215, 6)
    assert_almost_equal(wls_model.scale, 2.44608530786**2, 6)
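Every snippet on this page leans on `lrange` from statsmodels.compat.python. For running the examples standalone, a minimal stand-in (an assumption about the helper, not its exact source) is:

def lrange(*args, **kwargs):
    # a range materialized as a concrete list, so it supports
    # del, pop and fancy indexing on Python 3 as well
    return list(range(*args, **kwargs))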
Example #2
def test_arma_order_select_ic():
    # smoke test, assumes info-criteria are right
    from statsmodels.tsa.arima_process import arma_generate_sample

    arparams = np.array([.75, -.25])
    maparams = np.array([.65, .35])
    arparams = np.r_[1, -arparams]
    maparams = np.r_[1, maparams]
    nobs = 250
    np.random.seed(2014)
    y = arma_generate_sample(arparams, maparams, nobs)
    res = arma_order_select_ic(y, ic=['aic', 'bic'], trend='nc')
    # regression tests in case we change the algorithm to MINIC in SAS
    aic_x = np.array([[       np.nan,  552.7342255 ,  484.29687843],
                      [ 562.10924262,  485.5197969 ,  480.32858497],
                      [ 507.04581344,  482.91065829,  481.91926034],
                      [ 484.03995962,  482.14868032,  483.86378955],
                      [ 481.8849479 ,  483.8377379 ,  485.83756612]])
    bic_x = np.array([[       np.nan,  559.77714733,  494.86126118],
                      [ 569.15216446,  496.08417966,  494.41442864],
                      [ 517.61019619,  496.99650196,  499.52656493],
                      [ 498.12580329,  499.75598491,  504.99255506],
                      [ 499.49225249,  504.96650341,  510.48779255]])
    aic = DataFrame(aic_x, index=lrange(5), columns=lrange(3))
    bic = DataFrame(bic_x, index=lrange(5), columns=lrange(3))
    assert_almost_equal(res.aic.values, aic.values, 5)
    assert_almost_equal(res.bic.values, bic.values, 5)
    assert_equal(res.aic_min_order, (1, 2))
    assert_equal(res.bic_min_order, (1, 2))
    assert_(res.aic.index.equals(aic.index))
    assert_(res.aic.columns.equals(aic.columns))
    assert_(res.bic.index.equals(bic.index))
    assert_(res.bic.columns.equals(bic.columns))

    index = pd.date_range('2000-1-1', freq='M', periods=len(y))
    y_series = pd.Series(y, index=index)
    res_pd = arma_order_select_ic(y_series, max_ar=2, max_ma=1,
                                  ic=['aic', 'bic'], trend='nc')
    assert_almost_equal(res_pd.aic.values, aic.values[:3, :2], 5)
    assert_almost_equal(res_pd.bic.values, bic.values[:3, :2], 5)
    assert_equal(res_pd.aic_min_order, (2, 1))
    assert_equal(res_pd.bic_min_order, (1, 1))

    res = arma_order_select_ic(y, ic='aic', trend='nc')
    assert_almost_equal(res.aic.values, aic.values, 5)
    assert_(res.aic.index.equals(aic.index))
    assert_(res.aic.columns.equals(aic.columns))
    assert_equal(res.aic_min_order, (1, 2))
Example #3
    def _make_predict_dates(self):
        data = self.data
        dtstart = data.predict_start
        dtend = data.predict_end
        freq = data.freq

        if freq is not None:
            pandas_freq = _freq_to_pandas[freq]
            try:
                from pandas import DatetimeIndex
                dates = DatetimeIndex(start=dtstart, end=dtend,
                                        freq=pandas_freq)
            except ImportError:
                # very old pandas versions only provide DateRange
                from pandas import DateRange
                dates = DateRange(dtstart, dtend, offset=pandas_freq).values
        # handle integer start/end when no frequency is given
        elif freq is None and (isinstance(dtstart, int) and
                               isinstance(dtend, int)):
            from pandas import Index
            dates = Index(lrange(dtstart, dtend+1))
        # if freq is None and dtstart and dtend aren't integers, we're
        # in sample
        else:
            dates = self.data.dates
            start = self._get_dates_loc(dates, dtstart)
            end = self._get_dates_loc(dates, dtend)
            dates = dates[start:end+1] # is this index inclusive?
        self.data.predict_dates = dates
Example #4
 def maineffect_func(value, reference=reference):
     rvalue = []
     keep = lrange(value.shape[0])
     keep.pop(reference)
     for i in range(len(keep)):
         rvalue.append(value[keep[i]] - value[reference])
     return np.array(rvalue)
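A worked example of the contrast this transform computes, inlined on toy values with reference=0 (hypothetical input, not from the original source):

import numpy as np

value = np.array([[1., 2.],
                  [3., 4.],
                  [5., 6.]])
reference = 0
keep = list(range(value.shape[0]))
keep.pop(reference)
# each remaining row minus the reference row
print(np.array([value[i] - value[reference] for i in keep]))
# [[2. 2.]
#  [4. 4.]]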
Example #5
def print_ic_table(ics, selected_orders):
    """
    For VAR order selection

    """
    # Can factor this out into a utility method if so desired

    cols = sorted(ics)

    data = mat([["%#10.4g" % v for v in ics[c]] for c in cols],
               dtype=object).T

    # star the minimum value in each column
    for i, col in enumerate(cols):
        idx = int(selected_orders[col]), i
        data[idx] = data[idx] + '*'
        # data[idx] = data[idx][:-1] + '*' # super hack, ugh

    fmt = dict(_default_table_fmt,
               data_fmts=("%s",) * len(cols))

    buf = StringIO()
    table = SimpleTable(data, cols, lrange(len(data)),
                        title='VAR Order Selection', txt_fmt=fmt)
    buf.write(str(table) + '\n')
    buf.write('* Minimum' + '\n')

    print(buf.getvalue())
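A hypothetical call, assuming the helpers above (SimpleTable, _default_table_fmt, mat) are in scope; ics maps each criterion name to its per-lag values, and selected_orders gives the minimizing lag per criterion:

ics = {'aic': [10.20, 9.14, 9.48], 'bic': [10.25, 9.31, 9.77]}
selected_orders = {'aic': 1, 'bic': 1}
print_ic_table(ics, selected_orders)   # the lag-1 row is starred as minimum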
Example #6
def plot_with_error(y, error, x=None, axes=None, value_fmt='k',
                    error_fmt='k--', alpha=0.05, stderr_type='asym'):
    """
    Make plot with optional error bars

    Parameters
    ----------
    y : array_like
        values to plot
    error : array_like or None
        symmetric standard errors ('asym'), or a pair of lower/upper
        bands for the Monte Carlo error types

    """
    import matplotlib.pyplot as plt

    if axes is None:
        axes = plt.gca()

    x = x if x is not None else lrange(len(y))
    plot_action = lambda y, fmt: axes.plot(x, y, fmt)
    plot_action(y, value_fmt)

    if error is not None:
        if stderr_type == 'asym':
            q = util.norm_signif_level(alpha)
            plot_action(y - q * error, error_fmt)
            plot_action(y + q * error, error_fmt)
        elif stderr_type in ('mc', 'sz1', 'sz2', 'sz3'):
            plot_action(error[0], error_fmt)
            plot_action(error[1], error_fmt)
Example #7
def test_pickle():
    import tempfile
    from numpy.testing import assert_equal
    tmpdir = tempfile.mkdtemp(prefix='pickle')
    a = lrange(10)
    save_pickle(a, tmpdir+'/res.pkl')
    b = load_pickle(tmpdir+'/res.pkl')
    assert_equal(a, b)

    #cleanup, tested on Windows
    try:
        import os
        os.remove(tmpdir+'/res.pkl')
        os.rmdir(tmpdir)
    except (OSError, IOError):
        pass
    assert not os.path.exists(tmpdir)

    #test with file handle
    fh = BytesIO()
    save_pickle(a, fh)
    fh.seek(0,0)
    c = load_pickle(fh)
    fh.close()
    assert_equal(a, c)
Example #8
    def _make_predict_dates(self):
        data = self.data
        dtstart = data.predict_start
        dtend = data.predict_end
        freq = data.freq

        if freq is not None:
            pandas_freq = _freq_to_pandas[freq]
            # preserve PeriodIndex or DatetimeIndex
            dates = self.data.dates.__class__(start=dtstart,
                                              end=dtend,
                                              freq=pandas_freq)
        # handle integer start/end when no frequency is given
        elif freq is None and (isinstance(dtstart, (int, long)) and
                               isinstance(dtend, (int, long))):
            from pandas import Index
            dates = Index(lrange(dtstart, dtend+1))
        # if freq is None and dtstart and dtend aren't integers, we're
        # in sample
        else:
            dates = self.data.dates
            start = self._get_dates_loc(dates, dtstart)
            end = self._get_dates_loc(dates, dtend)
            dates = dates[start:end+1] # is this index inclusive?
        self.data.predict_dates = dates
Example #9
def _plot_leverage_resid2(results, influence, alpha=.05, ax=None,
                          **kwargs):

    from scipy.stats import zscore, norm
    fig, ax = utils.create_mpl_ax(ax)

    infl = influence
    leverage = infl.hat_matrix_diag
    resid = zscore(infl.resid)
    ax.plot(resid**2, leverage, 'o', **kwargs)
    ax.set_xlabel("Normalized residuals**2")
    ax.set_ylabel("Leverage")
    ax.set_title("Leverage vs. Normalized residuals squared")

    large_leverage = leverage > _high_leverage(results)
    #norm or t here if standardized?
    cutoff = norm.ppf(1.-alpha/2)
    large_resid = np.abs(resid) > cutoff
    labels = results.model.data.row_labels
    if labels is None:
        labels = lrange(int(results.nobs))
    index = np.where(np.logical_or(large_leverage, large_resid))[0]
    ax = utils.annotate_axes(index, labels, lzip(resid**2, leverage),
                             [(0, 5)]*int(results.nobs), "large",
                             ax=ax, ha="center", va="bottom")
    ax.margins(.075, .075)
    return fig
Example #10
def irf_grid_plot(values, stderr, impcol, rescol, names, title,
                  signif=0.05, hlines=None, subplot_params=None,
                  plot_params=None, figsize=(10,10), stderr_type='asym'):
    """
    Reusable function to make flexible grid plots of impulse responses and
    cumulative effects

    values : (T + 1) x k x k
    stderr : T x k x k
    hlines : k x k
    """
    import matplotlib.pyplot as plt

    if subplot_params is None:
        subplot_params = {}
    if plot_params is None:
        plot_params = {}

    nrows, ncols, to_plot = _get_irf_plot_config(names, impcol, rescol)

    fig, axes = plt.subplots(nrows=nrows, ncols=ncols, sharex=True,
                             squeeze=False, figsize=figsize)

    # fill out space
    adjust_subplots()

    fig.suptitle(title, fontsize=14)

    subtitle_temp = r'%s$\rightarrow$%s'

    k = len(names)

    rng = lrange(len(values))
    for (j, i, ai, aj) in to_plot:
        ax = axes[ai][aj]

        # HACK?
        if stderr is not None:
            if stderr_type == 'asym':
                sig = np.sqrt(stderr[:, j * k + i, j * k + i])
                plot_with_error(values[:, i, j], sig, x=rng, axes=ax,
                            alpha=signif, value_fmt='b', stderr_type=stderr_type)
            elif stderr_type in ('mc', 'sz1', 'sz2', 'sz3'):
                errs = stderr[0][:, i, j], stderr[1][:, i, j]
                plot_with_error(values[:, i, j], errs, x=rng, axes=ax,
                            alpha=signif, value_fmt='b', stderr_type=stderr_type)
        else:
            plot_with_error(values[:, i, j], None, x=rng, axes=ax,
                            value_fmt='b')

        ax.axhline(0, color='k')

        if hlines is not None:
            ax.axhline(hlines[i,j], color='k')

        sz = subplot_params.get('fontsize', 12)
        ax.set_title(subtitle_temp % (names[j], names[i]), fontsize=sz)

    return fig
Example #11
def _maybe_reset_index(data):
    """
    All the Rdatasets have the integer row.labels from R if there is no
    real index. Strip this for a zero-based index
    """
    if data.index.equals(Index(lrange(1, len(data) + 1))):
        data = data.reset_index(drop=True)
    return data
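For illustration, the same check inlined on a toy frame (a sketch, not statsmodels code):

import pandas as pd
from pandas import Index

df = pd.DataFrame({'x': [10, 20]}, index=[1, 2])  # R-style 1-based labels
if df.index.equals(Index(range(1, len(df) + 1))):
    df = df.reset_index(drop=True)
print(df.index.tolist())   # [0, 1]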
Example #12
def test__reduce_dict():
    data = OrderedDict(zip(list(product('mf', 'oy', 'wn')), [1] * 8))
    eq(_reduce_dict(data, ('m',)), 4)
    eq(_reduce_dict(data, ('m', 'o')), 2)
    eq(_reduce_dict(data, ('m', 'o', 'w')), 1)
    data = OrderedDict(zip(list(product('mf', 'oy', 'wn')), lrange(8)))
    eq(_reduce_dict(data, ('m',)), 6)
    eq(_reduce_dict(data, ('m', 'o')), 1)
    eq(_reduce_dict(data, ('m', 'o', 'w')), 0)
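The assertions pin down the contract of _reduce_dict: sum every count whose key starts with the given prefix. A minimal sketch consistent with the test (the actual statsmodels implementation may differ):

def _reduce_dict_sketch(count_dict, partial_key):
    # sum the values of all keys that begin with partial_key
    depth = len(partial_key)
    return sum(v for k, v in count_dict.items()
               if k[:depth] == tuple(partial_key))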
Example #13
 def variables(self):
     """
     Returns a list of the dataset's StataVariables objects.
     """
     return lmap(_StataVariable, zip(lrange(self._header['nvar']),
         self._header['typlist'], self._header['varlist'],
         self._header['srtlist'],
         self._header['fmtlist'], self._header['lbllist'],
         self._header['vlblist']))
Example #14
 def __iter__(self):
     n = self.n
     p = self.p
     comb = combinations(lrange(n), p)
     for idx in comb:
         test_index = np.zeros(n, dtype=bool)  # np.bool is removed in modern numpy
         test_index[np.array(idx)] = True
         train_index = np.logical_not(test_index)
         yield train_index, test_index
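This iterator implements leave-p-out cross-validation splits. A self-contained sketch of the same logic:

import numpy as np
from itertools import combinations

def leave_p_out(n, p):
    # yield boolean (train, test) masks for every size-p test set
    for idx in combinations(range(n), p):
        test_index = np.zeros(n, dtype=bool)
        test_index[list(idx)] = True
        yield ~test_index, test_index

for train, test in leave_p_out(4, 2):
    print(np.flatnonzero(train), np.flatnonzero(test))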
Example #15
def date_range_str(start, end=None, length=None):
    """
    Returns a list of abbreviated date strings.

    Parameters
    ----------
    start : str
        The first abbreviated date, for instance, '1965q1' or '1965m1'
    end : str, optional
        The last abbreviated date if length is None.
    length : int, optional
        The length of the returned array if end is None.

    Returns
    -------
    date_range : list
        List of strings
    """
    flags = re.IGNORECASE | re.VERBOSE
    #_check_range_inputs(end, length, freq)
    start = start.lower()
    if re.search(_m_pattern, start, flags):
        annual_freq = 12
        split = 'm'
    elif re.search(_q_pattern, start, flags):
        annual_freq = 4
        split = 'q'
    elif re.search(_y_pattern, start, flags):
        annual_freq = 1
        start += 'a1' # hack
        if end:
            end += 'a1'
        split = 'a'
    else:
        raise ValueError("Date %s not understood" % start)
    yr1, offset1 = lmap(int, start.replace(":","").split(split))
    if end is not None:
        end = end.lower()
        yr2, offset2 = lmap(int, end.replace(":","").split(split))
        length = (yr2 - yr1) * annual_freq + offset2
    elif length:
        yr2 = yr1 + length // annual_freq
        offset2 = length % annual_freq + (offset1 - 1)
    years = np.repeat(lrange(yr1+1, yr2), annual_freq).tolist()
    years = np.r_[[str(yr1)]*(annual_freq+1-offset1), years] # tack on first year
    years = np.r_[years, [str(yr2)]*offset2] # tack on last year
    if split != 'a':
        offset = np.tile(np.arange(1, annual_freq+1), yr2-yr1-1)
        offset = np.r_[np.arange(offset1, annual_freq+1).astype('a2'), offset]
        offset = np.r_[offset, np.arange(1,offset2+1).astype('a2')]
        date_arr_range = [''.join([i, split, asstr(j)]) for i,j in
                                                        zip(years, offset)]
    else:
        date_arr_range = years.tolist()
    return date_arr_range
Example #16
def interactions(terms, order=[1,2]):
    """
    Output all pairwise interactions of given order of a
    sequence of terms.

    The argument order is a sequence specifying which order
    of interactions should be generated -- the default
    creates main effects and two-way interactions. If order
    is an integer, it is changed to range(1,order+1), so
    order=3 is equivalent to order=[1,2,3], generating
    all one, two and three-way interactions.

    If any entry of order is greater than len(terms), it is
    effectively treated as len(terms).

    >>> print interactions([Term(l) for l in ['a', 'b', 'c']])
    <formula: a*b + a*c + b*c + a + b + c>
    >>>
    >>> print interactions([Term(l) for l in ['a', 'b', 'c']], order=list(range(5)))
    <formula: a*b + a*b*c + a*c + b*c + a + b + c>
    >>>

    """
    l = len(terms)

    values = {}

    if np.asarray(order).shape == ():
        order = lrange(1, int(order)+1)

    # build interaction terms for each requested order

    for o in order:
        I = np.indices((l,)*(o))
        I.shape = (I.shape[0], np.product(I.shape[1:]))
        for m in range(I.shape[1]):

            # only keep combinations that have unique entries

            if (np.unique(I[:,m]).shape == I[:,m].shape and
                    np.alltrue(np.equal(np.sort(I[:,m]), I[:,m]))):
                ll = [terms[j] for j in I[:,m]]
                v = ll[0]
                for ii in range(len(ll)-1):
                    v *= ll[ii+1]
                values[tuple(I[:,m])] = v

    key = list(iterkeys(values))[0]
    value = values[key]
    del(values[key])

    for v in itervalues(values):
        value += v
    return value
Example #17
    def summary(self):
        buf = StringIO()

        rng = lrange(self.periods)
        for i in range(self.neqs):
            ppm = output.pprint_matrix(self.decomp[i], rng, self.names)

            buf.write('FEVD for %s\n' % self.names[i])
            buf.write(ppm + '\n')

        print(buf.getvalue())
Example #18
 def check_index(self, is_sorted=True, unique=True, index=None):
     """Sanity checks"""
     if not index:
         index = self.index
     if is_sorted:
         test = pd.DataFrame(lrange(len(index)), index=index)
         test_sorted = test.sort()
         if not test.index.equals(test_sorted.index):
              raise Exception('Data is not sorted')
     if unique:
         if len(index) != len(index.unique()):
             raise Exception('Duplicate index entries')
Example #19
def cat2dummy(y, nonseq=0):
    if nonseq or (y.ndim == 2 and y.shape[1] > 1):
        ycat, uniques, unitransl =  convertlabels(y, lrange(y.shape[1]))
    else:
        ycat = y.copy()
        ymin = y.min()
        uniques = np.arange(ymin,y.max()+1)
    if ycat.ndim == 1:
        ycat = ycat[:,np.newaxis]
    # this builds matrix nobs*ncat
    dummy = (ycat == uniques).astype(int)
    return dummy
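The core of the encoding is a broadcast comparison against the unique values; standalone:

import numpy as np

y = np.array([0, 2, 1, 2])
uniques = np.arange(y.min(), y.max() + 1)
dummy = (y[:, np.newaxis] == uniques).astype(int)   # nobs x ncat indicators
print(dummy)
# [[1 0 0]
#  [0 0 1]
#  [0 1 0]
#  [0 0 1]]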
Example #20
def _influence_plot(results, influence, external=True, alpha=.05,
                    criterion="cooks", size=48, plot_alpha=.75, ax=None,
                    **kwargs):
    infl = influence
    fig, ax = utils.create_mpl_ax(ax)

    if criterion.lower().startswith('coo'):
        psize = infl.cooks_distance[0]
    elif criterion.lower().startswith('dff'):
        psize = np.abs(infl.dffits[0])
    else:
        raise ValueError("Criterion %s not understood" % criterion)

    # scale the variables
    #TODO: what is the correct scaling and the assumption here?
    #we want plots to be comparable across different plots
    #so we would need to use the expected distribution of criterion probably
    old_range = np.ptp(psize)
    new_range = size**2 - 8**2

    psize = (psize - psize.min()) * new_range/old_range + 8**2

    leverage = infl.hat_matrix_diag
    if external:
        resids = infl.resid_studentized_external
    else:
        resids = infl.resid_studentized

    from scipy import stats

    cutoff = stats.t.ppf(1.-alpha/2, results.df_resid)
    large_resid = np.abs(resids) > cutoff
    large_leverage = leverage > _high_leverage(results)
    large_points = np.logical_or(large_resid, large_leverage)

    ax.scatter(leverage, resids, s=psize, alpha=plot_alpha)

    # add point labels
    labels = results.model.data.row_labels
    if labels is None:
        labels = lrange(len(resids))
    ax = utils.annotate_axes(np.where(large_points)[0], labels,
                             lzip(leverage, resids),
                             lzip(-(psize/2)**.5, (psize/2)**.5), "x-large",
                             ax)

    #TODO: make configurable or let people do it ex-post?
    font = {"fontsize" : 16, "color" : "black"}
    ax.set_ylabel("Studentized Residuals", **font)
    ax.set_xlabel("H Leverage", **font)
    ax.set_title("Influence Plot", **font)
    return fig
Example #21
def detrend(x, order=1, axis=0):
    '''detrend an array with a trend of given order along axis 0 or 1

    Parameters
    ----------
    x : array_like, 1d or 2d
        data; if 2d, each row or column is detrended independently, using
        the same trend order but separately estimated trend coefficients
    order : int
        specifies the polynomial order of the trend: zero is constant, one is
        linear trend, two is quadratic trend
    axis : int
        for detrending with order > 0, axis can be either 0 (observations
        in rows) or 1 (observations in columns)

    Returns
    -------
    detrended data series : ndarray
        The detrended series is the residual of the linear regression of the
        data on the trend of given order.


    '''
    x = np.asarray(x)
    nobs = x.shape[0]
    if order == 0:
        return x - np.expand_dims(x.mean(axis), axis)
    else:
        if x.ndim == 2 and lrange(2)[axis]==1:
            x = x.T
        elif x.ndim > 2:
            raise NotImplementedError('x.ndim>2 is not implemented until it is needed')
        #could use a polynomial, but this should work also with 2d x, but maybe not yet
        trends = np.vander(np.arange(nobs).astype(float), N=order+1)
        beta = np.linalg.lstsq(trends, x)[0]
        resid = x - np.dot(trends, beta)
        if x.ndim == 2 and lrange(2)[axis]==1:
            resid = resid.T
        return resid
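A quick sanity check of the function above: detrending a purely linear series with order=1 should leave residuals that are numerically zero:

import numpy as np

x = 3.0 + 2.0 * np.arange(10)
print(np.allclose(detrend(x, order=1), 0.0))   # True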
Example #22
    def simulate(self):

        group_effect_var = self.dep_params[0]

        vcomp = self.dep_params[1:]
        vcomp.append(0)

        endog, exog, group, id_matrix = [], [], [], []

        for i in range(self.ngroups):

            iterators = [lrange(n) for n in self.nest_sizes]

            # The random effects
            variances = [np.sqrt(v)*np.random.normal(size=n)
                         for v,n in zip(vcomp, self.nest_sizes)]

            gpe = np.random.normal() * np.sqrt(group_effect_var)

            nest_all = []
            for j in self.nest_sizes:
                nest_all.append(set())

            for nest in product(*iterators):

                group.append(i)

                # The sum of all random effects that apply to this
                # unit
                ref = gpe + sum([v[j] for v,j in zip(variances, nest)])

                exog1 = np.random.normal(size=5)
                exog1[0] = 1
                exog.append(exog1)

                error = ref + self.error_sd * np.random.normal()

                endog1 = np.dot(exog1, self.params) + error
                endog.append(endog1)

                for j in range(len(nest)):
                    nest_all[j].add(tuple(nest[0:j+1]))

                nest1 = [len(x)-1 for x in nest_all]
                id_matrix.append(nest1[0:-1])

        self.exog = np.array(exog)
        self.endog = np.array(endog)
        self.group = np.array(group)
        self.id_matrix = np.array(id_matrix)
        self.time = np.zeros_like(self.endog)
Example #23
def coef_restriction_diffbase(n_coeffs, n_vars=None, position=0, base_idx=0):

    reduced = -np.eye(n_coeffs)  #make all rows, drop one row later
    reduced[:, base_idx] = 1

    keep = lrange(n_coeffs)
    del keep[base_idx]
    reduced = np.take(reduced, keep, axis=0)

    if n_vars is None:
        return reduced
    else:
        full = np.zeros((n_coeffs-1, n_vars))
        full[:, position:position+n_coeffs] = reduced
        return full
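Worked example: for n_coeffs=3 and base_idx=0, each remaining coefficient is contrasted against the base coefficient (toy values, inlined):

import numpy as np

reduced = -np.eye(3)
reduced[:, 0] = 1          # contrast everything against coefficient 0
reduced = reduced[[1, 2]]  # drop the base row itself
print(reduced)
# [[ 1. -1.  0.]   row tests beta_0 - beta_1 = 0
#  [ 1.  0. -1.]]  row tests beta_0 - beta_2 = 0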
Example #24
 def variables(self):
     """
     Returns a list of the dataset's StataVariables objects.
     """
     return lmap(
         _StataVariable,
         zip(
             lrange(self._header["nvar"]),
             self._header["typlist"],
             self._header["varlist"],
             self._header["srtlist"],
             self._header["fmtlist"],
             self._header["lbllist"],
             self._header["vlblist"],
         ),
     )
Example #25
 def _next(self):
     typlist = self._header['typlist']
     if self._has_string_data:
         data = [None]*self._header['nvar']
         for i in range(len(data)):
             if isinstance(typlist[i], int):
                 data[i] = self._null_terminate(self._file.read(typlist[i]),
                             self._encoding)
             else:
                 data[i] = self._unpack(typlist[i],
                         self._file.read(self._col_size(i)))
         return data
     else:
         return lmap(lambda i: self._unpack(typlist[i],
             self._file.read(self._col_size(i))),
             lrange(self._header['nvar']))
Example #26
    def __init__(self, endog, exog, r_matrix=None, q_matrix=None,
                 sigma_prior=None, sigma=None):
        super(TheilGLS, self).__init__(endog, exog, sigma=sigma)

        if r_matrix is not None:
            r_matrix = np.asarray(r_matrix)
        else:
            try:
                const_idx = self.data.const_idx
            except AttributeError:
                const_idx = None

            k_exog = exog.shape[1]
            r_matrix = np.eye(k_exog)
            if const_idx is not None:
                keep_idx = lrange(k_exog)
                del keep_idx[const_idx]
                r_matrix = r_matrix[keep_idx]  # delete row for constant

        k_constraints, k_exog = r_matrix.shape
        self.r_matrix = r_matrix
        if k_exog != self.exog.shape[1]:
            raise ValueError('r_matrix needs to have the same number of '
                             'columns as exog')

        if q_matrix is not None:
            self.q_matrix = atleast_2dcols(q_matrix)
        else:
            self.q_matrix = np.zeros(k_constraints)[:, None]
        if self.q_matrix.shape != (k_constraints, 1):
            raise ValueError('q_matrix has wrong shape')

        if sigma_prior is not None:
            sigma_prior = np.asarray(sigma_prior)
            if np.size(sigma_prior) == 1:
                sigma_prior = np.diag(sigma_prior * np.ones(k_constraints))
                #no numerical shortcuts are used for this case
            elif sigma_prior.ndim == 1:
                sigma_prior = np.diag(sigma_prior)
        else:
            sigma_prior = np.eye(k_constraints)

        if sigma_prior.shape != (k_constraints, k_constraints):
            raise ValueError('sigma_prior has wrong shape')

        self.sigma_prior = sigma_prior
        self.sigma_prior_inv = np.linalg.pinv(sigma_prior) #or inv
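When no r_matrix is supplied, the default shrinks every slope toward zero while leaving the constant unrestricted; the construction in isolation:

import numpy as np

k_exog, const_idx = 3, 0
r_matrix = np.eye(k_exog)
keep_idx = [i for i in range(k_exog) if i != const_idx]
print(r_matrix[keep_idx])   # identity with the constant's row removed
# [[0. 1. 0.]
#  [0. 0. 1.]]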
Example #27
 def _next(self):
     typlist = self._header['typlist']
     if self._has_string_data:
         data = [None] * self._header['nvar']
         for i in range(len(data)):
             if isinstance(typlist[i], int):
                 data[i] = self._null_terminate(self._file.read(typlist[i]),
                                                self._encoding)
             else:
                 data[i] = self._unpack(typlist[i],
                                        self._file.read(self._col_size(i)))
         return data
     else:
         return lmap(
             lambda i: self._unpack(typlist[i],
                                    self._file.read(self._col_size(i))),
             lrange(self._header['nvar']))
Example #28
def _normalize_data(data, index):
    """normalize the data to a dict with tuples of strings as keys
    right now it works with:

        0 - dictionary (or equivalent mappable)
        1 - pandas.Series with simple or hierarchical indexes
        2 - numpy.ndarrays
        3 - everything that can be converted to a numpy array
        4 - pandas.DataFrame (via the _normalize_dataframe function)
    """
    # if data is a dataframe we need to take a completely new road
    # before coming back here. Use the hasattr to avoid importing
    # pandas explicitly
    if hasattr(data, 'pivot') and hasattr(data, 'groupby'):
        data = _normalize_dataframe(data, index)
        index = None
    # can it be used as a dictionary?
    try:
        items = list(iteritems(data))
    except AttributeError:
        # ok, I cannot use the data as a dictionary
        # Try to convert it to a numpy array, or die trying
        data = np.asarray(data)
        temp = OrderedDict()
        for idx in np.ndindex(data.shape):
            name = tuple(i for i in idx)
            temp[name] = data[idx]
        data = temp
        items = list(iteritems(data))
    # make all the keys a tuple, even if simple numbers
    data = OrderedDict([_tuplify(k), v] for k, v in items)
    categories_levels = _categories_level(list(iterkeys(data)))
    # fill the void in the counting dictionary
    indexes = product(*categories_levels)
    contingency = OrderedDict([(k, data.get(k, 0)) for k in indexes])
    data = contingency
    # reorder the keys order according to the one specified by the user
    # or if the index is None convert it into a simple list
    # right now it doesn't do any check, but can be modified in the future
    index = lrange(len(categories_levels)) if index is None else index
    contingency = OrderedDict()
    for key, value in iteritems(data):
        new_key = tuple(key[i] for i in index)
        contingency[new_key] = value
    data = contingency
    return data
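For reference, the ndarray branch turns positional indices into tuple keys; a standalone sketch:

import numpy as np
from collections import OrderedDict

data = np.array([[1, 2], [3, 4]])
temp = OrderedDict()
for idx in np.ndindex(data.shape):
    temp[idx] = data[idx]
print(list(temp.items()))
# [((0, 0), 1), ((0, 1), 2), ((1, 0), 3), ((1, 1), 4)]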
Example #29
def plot_leverage_resid2(results, alpha=.05, label_kwargs={}, ax=None,
                         **kwargs):
    """
    Plots leverage statistics vs. normalized residuals squared

    Parameters
    ----------
    results : results instance
        A regression results instance
    alpha : float
        Specifies the cut-off for large-standardized residuals. Residuals
        are assumed to be distributed N(0, 1) with alpha=alpha.
    label_kwargs : dict
        The keywords to pass to annotate for the labels.
    ax : Axes instance
        Matplotlib Axes instance

    Returns
    -------
    fig : matplotlib Figure
        A matplotlib figure instance.
    """
    from scipy.stats import zscore, norm
    fig, ax = utils.create_mpl_ax(ax)

    infl = results.get_influence()
    leverage = infl.hat_matrix_diag
    resid = zscore(results.resid)
    ax.plot(resid**2, leverage, 'o', **kwargs)
    ax.set_xlabel("Normalized residuals**2")
    ax.set_ylabel("Leverage")
    ax.set_title("Leverage vs. Normalized residuals squared")

    large_leverage = leverage > _high_leverage(results)
    #norm or t here if standardized?
    cutoff = norm.ppf(1.-alpha/2)
    large_resid = np.abs(resid) > cutoff
    labels = results.model.data.row_labels
    if labels is None:
        labels = lrange(int(results.nobs))
    index = np.where(np.logical_or(large_leverage, large_resid))[0]
    ax = utils.annotate_axes(index, labels, lzip(resid**2, leverage),
                             [(0, 5)]*int(results.nobs), "large",
                             ax=ax, ha="center", va="bottom")
    ax.margins(.075, .075)
    return fig
Example #31
0
    def setup_class(cls):
        #DGP: simple polynomial
        order = 3
        nobs = 200
        lb, ub = -3.5, 3
        x1 = np.linspace(lb, ub, nobs)
        x2 = np.sin(2*x1)
        x = np.column_stack((x1/x1.max()*1, 1.*x2))
        exog = (x[:,:,None]**np.arange(order+1)[None, None, :]).reshape(nobs, -1)
        idx = lrange((order+1)*2)
        del idx[order+1]
        exog_reduced = exog[:,idx]  #remove duplicate constant
        y_true = exog.sum(1) #/ 4.
        #z = y_true #alias check
        #d = x

        cls.nobs = nobs
        cls.y_true, cls.x, cls.exog = y_true, x, exog_reduced
Example #32
    def dummy(self, drop_idx=None, sparse=False, dtype=int):
        """
        drop_idx is only available if sparse=False

        drop_idx is supposed to index into uni
        """
        uni = self.uni
        if drop_idx is not None:
            idx = lrange(len(uni))
            del idx[drop_idx]
            uni = uni[idx]

        group = self.group

        if not sparse:
            return (group[:, None] == uni[None, :]).astype(dtype)
        else:
            return dummy_sparse(self.group_int)
Example #34
def plot_leverage_resid2(results, alpha=.05, ax=None,
                         **kwargs):
    """
    Plots leverage statistics vs. normalized residuals squared

    Parameters
    ----------
    results : results instance
        A regression results instance
    alpha : float
        Specifies the cut-off for large-standardized residuals. Residuals
        are assumed to be distributed N(0, 1) with alpha=alpha.
    ax : Axes instance
        Matplotlib Axes instance

    Returns
    -------
    fig : matplotlib Figure
        A matplotlib figure instance.
    """
    from scipy.stats import zscore, norm
    fig, ax = utils.create_mpl_ax(ax)

    infl = results.get_influence()
    leverage = infl.hat_matrix_diag
    resid = zscore(results.resid)
    ax.plot(resid**2, leverage, 'o', **kwargs)
    ax.set_xlabel("Normalized residuals**2")
    ax.set_ylabel("Leverage")
    ax.set_title("Leverage vs. Normalized residuals squared")

    large_leverage = leverage > _high_leverage(results)
    #norm or t here if standardized?
    cutoff = norm.ppf(1.-alpha/2)
    large_resid = np.abs(resid) > cutoff
    labels = results.model.data.row_labels
    if labels is None:
        labels = lrange(int(results.nobs))
    index = np.where(np.logical_or(large_leverage, large_resid))[0]
    ax = utils.annotate_axes(index, labels, lzip(resid**2, leverage),
                             [(0, 5)]*int(results.nobs), "large",
                             ax=ax, ha="center", va="bottom")
    ax.margins(.075, .075)
    return fig
Example #35
    def main_effect(self, reference=None):
        """
        Return the 'main effect' columns of a factor, choosing
        an optional reference key.

        The reference key can be one of the keys of the Factor,
        or an integer, representing which column to remove.
        It defaults to 0.

        """

        names = self.names()

        if reference is None:
            reference = 0
        else:
            try:
                reference = self.keys.index(reference)
            except ValueError:
                reference = int(reference)

        def maineffect_func(value, reference=reference):
            rvalue = []
            keep = lrange(value.shape[0])
            keep.pop(reference)
            for i in range(len(keep)):
                rvalue.append(value[keep[i]] - value[reference])
            return np.array(rvalue)

        keep = lrange(len(self.names()))
        keep.pop(reference)
        __names = self.names()
        _names = [
            '%s-%s' % (__names[keep[i]], __names[reference])
            for i in range(len(keep))
        ]
        value = Quantitative(_names,
                             func=self,
                             termname='%s:maineffect' % self.termname,
                             transform=maineffect_func)
        value.namespace = self.namespace
        return value
Example #36
    def setup_class(cls):
        #DGP: simple polynomial
        order = 3
        nobs = 200
        lb, ub = -3.5, 3
        x1 = np.linspace(lb, ub, nobs)
        x2 = np.sin(2 * x1)
        x = np.column_stack((x1 / x1.max() * 1, 1. * x2))
        exog = (x[:, :,
                  None]**np.arange(order + 1)[None,
                                              None, :]).reshape(nobs, -1)
        idx = lrange((order + 1) * 2)
        del idx[order + 1]
        exog_reduced = exog[:, idx]  #remove duplicate constant
        y_true = exog.sum(1)  #/ 4.
        #z = y_true #alias check
        #d = x

        cls.nobs = nobs
        cls.y_true, cls.x, cls.exog = y_true, x, exog_reduced
Example #37
    def test_causality(self):
        causedby = self.ref.causality['causedby']

        for i, name in enumerate(self.names):
            variables = self.names[:i] + self.names[i + 1:]
            result = self.res.test_causality(name, variables, kind='f')
            assert_almost_equal(result['pvalue'], causedby[i], DECIMAL_4)

            rng = lrange(self.k)
            rng.remove(i)
            result2 = self.res.test_causality(i, rng, kind='f')
            assert_almost_equal(result['pvalue'], result2['pvalue'], DECIMAL_12)

            # make sure works
            result = self.res.test_causality(name, variables, kind='wald')

        # corner cases
        _ = self.res.test_causality(self.names[0], self.names[1])
        _ = self.res.test_causality(0, 1)

        assert_raises(Exception, self.res.test_causality, 0, 1, kind='foo')
Example #38
    def test_causality(self):
        causedby = self.ref.causality["causedby"]

        for i, name in enumerate(self.names):
            variables = self.names[:i] + self.names[i + 1:]
            result = self.res.test_causality(name, variables, kind="f")
            assert_almost_equal(result.pvalue, causedby[i], DECIMAL_4)

            rng = lrange(self.k)
            rng.remove(i)
            result2 = self.res.test_causality(i, rng, kind="f")
            assert_almost_equal(result.pvalue, result2.pvalue, DECIMAL_12)

            # make sure works
            result = self.res.test_causality(name, variables, kind="wald")

        # corner cases
        _ = self.res.test_causality(self.names[0], self.names[1])
        _ = self.res.test_causality(0, 1)

        with pytest.raises(Exception):
            self.res.test_causality(0, 1, kind="foo")
Example #39
def maybe_name_or_idx(idx, model):
    """
    Give a name or an integer and return the name and integer location of the
    column in a design matrix.
    """
    if idx is None:
        idx = lrange(model.exog.shape[1])
    if isinstance(idx, int):
        exog_name = model.exog_names[idx]
        exog_idx = idx
    # anticipate index as list and recurse
    elif isinstance(idx, (tuple, list)):
        exog_name = []
        exog_idx = []
        for item in idx:
            exog_name_item, exog_idx_item = maybe_name_or_idx(item, model)
            exog_name.append(exog_name_item)
            exog_idx.append(exog_idx_item)
    else:  # assume we've got a string variable
        exog_name = idx
        exog_idx = model.exog_names.index(idx)

    return exog_name, exog_idx
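A hypothetical call against a stand-in model object (only exog and exog_names are required):

import numpy as np

class _FakeModel:
    exog = np.empty((10, 3))
    exog_names = ['const', 'x1', 'x2']

print(maybe_name_or_idx('x1', _FakeModel()))        # ('x1', 1)
print(maybe_name_or_idx([0, 'x2'], _FakeModel()))   # (['const', 'x2'], [0, 2])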
Example #40
def test_pickle():
    tmpdir = tempfile.mkdtemp(prefix='pickle')
    a = lrange(10)
    save_pickle(a, tmpdir + '/res.pkl')
    b = load_pickle(tmpdir + '/res.pkl')
    assert_equal(a, b)

    # cleanup, tested on Windows
    try:
        import os
        os.remove(tmpdir + '/res.pkl')
        os.rmdir(tmpdir)
    except (OSError, IOError):
        pass
    assert not os.path.exists(tmpdir)

    # test with file handle
    fh = BytesIO()
    save_pickle(a, fh)
    fh.seek(0, 0)
    c = load_pickle(fh)
    fh.close()
    assert_equal(a, c)
Example #41
def get_ic_table(ics, selected_orders):
    '''
    Convert lag-order selection results into a table-formatted summary.
    :param ics: information-criterion values for each lag order
    :param selected_orders: the selected (minimizing) lag order per criterion
    :return: the lag-order selection results as a formatted table string
    '''
    _default_table_fmt = dict(empty_cell='',
                              colsep='  ',
                              row_pre='',
                              row_post='',
                              table_dec_above='=',
                              table_dec_below='=',
                              header_dec_below='-',
                              header_fmt='%s',
                              stub_fmt='%s',
                              title_align='c',
                              header_align='r',
                              data_aligns='r',
                              stubs_align='l',
                              fmt='txt')
    cols = sorted(ics)
    data = np.array([["%#10.4g" % v for v in ics[c]] for c in cols],
                    dtype=object).T
    for i, col in enumerate(cols):
        idx = int(selected_orders[col]), i
        data[idx] = data[idx] + '*'
    fmt = dict(_default_table_fmt, data_fmts=("%s", ) * len(cols))
    buf = StringIO()
    table = SimpleTable(data,
                        cols,
                        lrange(len(data)),
                        title='VAR Order Selection',
                        txt_fmt=fmt)
    buf.write(str(table) + '\n')
    buf.write('* Minimum' + '\n')
    return buf.getvalue()
Example #42
    def simulate(self):

        endog, exog, group, time = [], [], [], []

        for i in range(self.ngroups):

            gsize = np.random.randint(self.group_size_range[0],
                                      self.group_size_range[1])

            group.append([
                i,
            ] * gsize)

            time1 = np.random.normal(size=(gsize, 2))
            time.append(time1)

            exog1 = np.random.normal(size=(gsize, len(self.params[0])))
            exog.append(exog1)

            # Probabilities for each outcome
            prob = [np.exp(np.dot(exog1, p)) for p in self.params]
            prob = np.vstack(prob).T
            prob /= prob.sum(1)[:, None]

            m = len(self.params)
            endog1 = []
            for k in range(gsize):
                pdist = stats.rv_discrete(values=(lrange(m), prob[k, :]))
                endog1.append(pdist.rvs())

            endog.append(np.asarray(endog1))

        self.exog = np.concatenate(exog, axis=0)
        self.endog = np.concatenate(endog).astype(np.int32)
        self.time = np.concatenate(time, axis=0)
        self.group = np.concatenate(group)
        self.offset = np.zeros(len(self.endog), dtype=np.float64)
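The per-observation category draw relies on scipy's rv_discrete; in isolation:

from scipy import stats

pdist = stats.rv_discrete(values=([0, 1, 2], [0.2, 0.5, 0.3]))
print(pdist.rvs(size=5))   # e.g. [1 2 1 0 1]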
Example #43
    def _make_predict_dates(self):
        data = self.data
        dtstart = data.predict_start
        dtend = data.predict_end
        freq = data.freq

        if freq is not None:
            pandas_freq = _freq_to_pandas[freq]
            # preserve PeriodIndex or DatetimeIndex
            dates = self.data.dates.__class__(start=dtstart,
                                              end=dtend,
                                              freq=pandas_freq)

            if pandas_freq.freqstr == 'N':
                _dtend = dtend
                if isinstance(dates[-1], Period):
                    _dtend = pd.to_datetime(_dtend).to_period(dates.freq)
                if not dates[-1] == _dtend:
                    # TODO: this is a hack because a DatetimeIndex with
                    # nanosecond frequency does not include "end"
                    dtend = Timestamp(dtend.value + 1)
                    dates = self.data.dates.__class__(start=dtstart,
                                                      end=dtend,
                                                      freq=pandas_freq)
        # handle integer start/end when no frequency is given
        elif freq is None and (isinstance(dtstart, (int, long))
                               and isinstance(dtend, (int, long))):
            from pandas import Index
            dates = Index(lrange(dtstart, dtend + 1))
        # if freq is None and dtstart and dtend aren't integers, we're
        # in sample
        else:
            dates = self.data.dates
            start = self._get_dates_loc(dates, dtstart)
            end = self._get_dates_loc(dates, dtend)
            dates = dates[start:end + 1]  # is this index inclusive?
        self.data.predict_dates = dates
Example #44
    def __init__(self, model, P=None, periods=None):
        self.periods = periods

        self.model = model
        self.neqs = model.neqs
        self.names = model.model.endog_names

        self.irfobj = model.irf(var_decomp=P, periods=periods)
        self.orth_irfs = self.irfobj.orth_irfs

        # cumulative impulse responses
        irfs = (self.orth_irfs[:periods] ** 2).cumsum(axis=0)

        rng = lrange(self.neqs)
        mse = self.model.mse(periods)[:, rng, rng]

        # lag x equation x component
        fevd = np.empty_like(irfs)

        for i in range(periods):
            fevd[i] = (irfs[i].T / mse[i]).T

        # switch to equation x lag x component
        self.decomp = fevd.swapaxes(0, 1)
Example #45
def summary_col(results, float_format='%.4f', model_names=(), stars=False,
                info_dict=None, regressor_order=(), drop_omitted=False):
    """
    Summarize multiple results instances side-by-side (coefs and SEs)

    Parameters
    ----------
    results : statsmodels results instance or list of result instances
    float_format : str, optional
        float format for coefficients and standard errors
        Default : '%.4f'
    model_names : list[str], optional
        Must have same length as the number of results. If the names are not
        unique, a roman number will be appended to all model names
    stars : bool
        print significance stars
    info_dict : dict
        dict of functions to be applied to results instances to retrieve
        model info. To use specific information for different models, add a
        (nested) info_dict with model name as the key.
        Example: `info_dict = {"N":..., "R2": ..., "OLS":{"R2":...}}` would
        only show `R2` for OLS regression models, but additionally `N` for
        all other results.
        Default : None (use the info_dict specified in
        result.default_model_infos, if this property exists)
    regressor_order : list[str], optional
        list of names of the regressors in the desired order. All regressors
        not specified will be appended to the end of the list.
    drop_omitted : bool, optional
        If False, regressors not specified in regressor_order are appended
        to the end of the list. If True, only regressors in regressor_order
        are included.
    """

    if not isinstance(results, list):
        results = [results]

    cols = [_col_params(x, stars=stars, float_format=float_format) for x in
            results]

    # Unique column names (pandas has problems merging otherwise)
    if model_names:
        colnames = _make_unique(model_names)
    else:
        colnames = _make_unique([x.columns[0] for x in cols])
    for i in range(len(cols)):
        cols[i].columns = [colnames[i]]

    def merg(x, y):
        return x.merge(y, how='outer', right_index=True,
                       left_index=True)

    summ = reduce(merg, cols)

    if regressor_order:
        varnames = summ.index.get_level_values(0).tolist()
        ordered = [x for x in regressor_order if x in varnames]
        unordered = [x for x in varnames if x not in regressor_order + ['']]
        order = ordered + list(np.unique(unordered))

        def f(idx):
            return sum([[x + 'coef', x + 'stde'] for x in idx], [])

        summ.index = f(pd.unique(varnames))
        summ = summ.reindex(f(order))
        summ.index = [x[:-4] for x in summ.index]
        if drop_omitted:
            summ = summ.loc[regressor_order]

    idx = pd.Series(lrange(summ.shape[0])) % 2 == 1
    summ.index = np.where(idx, '', summ.index.get_level_values(0))

    # add infos about the models.
    if info_dict:
        cols = [_col_info(x, info_dict.get(x.model.__class__.__name__,
                                           info_dict)) for x in results]
    else:
        cols = [_col_info(x, getattr(x, "default_model_infos", None)) for x in
                results]
    # use unique column names, otherwise the merge will not succeed
    for df, name in zip(cols, _make_unique([df.columns[0] for df in cols])):
        df.columns = [name]

    def merg(x, y):
        return x.merge(y, how='outer', right_index=True,
                       left_index=True)

    info = reduce(merg, cols)
    dat = pd.DataFrame(np.vstack([summ, info]))  # pd.concat better, but error
    dat.columns = summ.columns
    dat.index = pd.Index(summ.index.tolist() + info.index.tolist())
    summ = dat

    summ = summ.fillna('')

    smry = Summary()
    smry._merge_latex = True
    smry.add_df(summ, header=True, align='l')
    smry.add_text('Standard errors in parentheses.')
    if stars:
        smry.add_text('* p<.1, ** p<.05, ***p<.01')

    return smry
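A hypothetical end-to-end use with two OLS fits (variable names chosen for illustration; summary_col is importable from statsmodels.iolib.summary2):

import numpy as np
import statsmodels.api as sm
from statsmodels.iolib.summary2 import summary_col

x = sm.add_constant(np.random.randn(100, 2))
y = x @ np.array([1.0, 2.0, -1.0]) + np.random.randn(100)
res_full = sm.OLS(y, x).fit()
res_small = sm.OLS(y, x[:, :2]).fit()
print(summary_col([res_full, res_small], stars=True,
                  model_names=['full', 'small']))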
Example #46
import numpy as np

from scipy import stats
from statsmodels.compat.python import lrange
from statsmodels.sandbox.tools.mctools import StatTestMC
from statsmodels.sandbox.stats.diagnostic import (
                    acorr_ljungbox, unitroot_adf)

def normalnoisesim(nobs=500, loc=0.0):
    return (loc+np.random.randn(nobs))


def lb(x):
    s,p = acorr_ljungbox(x, lags=4)
    return np.r_[s, p]


mc1 = StatTestMC(normalnoisesim, lb)
mc1.run(5000, statindices=lrange(4))

print(mc1.summary_quantiles([1,2,3], stats.chi2([2,3,4]).ppf,
                            varnames=['lag 1', 'lag 2', 'lag 3'],
                            title='acorr_ljungbox'))
print('\n\n')

frac = [0.01, 0.025, 0.05, 0.1, 0.975]
crit = stats.chi2([2,3,4]).ppf(np.atleast_2d(frac).T)
print(mc1.summary_cdf([1,2,3], frac, crit,
                      varnames=['lag 1', 'lag 2', 'lag 3'],
                      title='acorr_ljungbox'))
print(mc1.cdf(crit, [1,2,3])[1])

Example #47
    def lb4(x):
        s, p = acorr_ljungbox(x, lags=4)
        return s[-1], p[-1]

    def lb1(x):
        s, p = acorr_ljungbox(x, lags=1)
        return s[0], p[0]

    def lb(x):
        s, p = acorr_ljungbox(x, lags=4)
        return np.r_[s, p]

    print('Results with MC class')
    mc1 = StatTestMC(normalnoisesim, lb)
    mc1.run(10000, statindices=lrange(8))
    print(mc1.histogram(1, critval=[0.01, 0.025, 0.05, 0.1, 0.975]))
    print(mc1.quantiles(1))
    print(mc1.quantiles(0))
    print(mc1.histogram(0))

    #print(mc1.summary_quantiles([1], stats.chi2([2]).ppf, title='acorr_ljungbox')
    print(
        mc1.summary_quantiles([1, 2, 3],
                              stats.chi2([2, 3, 4]).ppf,
                              varnames=['lag 1', 'lag 2', 'lag 3'],
                              title='acorr_ljungbox'))
    print(mc1.cdf(0.1026, 1))
    print(mc1.cdf(0.7278, 3))

    print(mc1.cdf(0.7278, [1, 2, 3]))
Example #48
class StataReader(object):
    """
    Stata .dta file reader.

    Provides methods to return the metadata of a Stata .dta file and
    a generator for the data itself.

    Parameters
    ----------
    fname : file-like
        A file-like object representing a Stata .dta file.
    missing_values : bool
        If missing_values is True, parse missing_values and return a
        Missing Values object instead of None.
    encoding : string, optional
        Used for Python 3 only. Encoding to use when reading the .dta file.
        Defaults to `locale.getpreferredencoding`

    See Also
    --------
    statsmodels.iolib.foreign.genfromdta
    pandas.read_stata
    pandas.io.stata.StataReader

    Notes
    -----
    This is known only to work on file formats 113 (Stata 8/9), 114
    (Stata 10/11), and 115 (Stata 12).  Needs to be tested on older versions.
    Known not to work on format 104, 108. If you have the documentation for
    older formats, please contact the developers.

    For more information about the .dta format see
    http://www.stata.com/help.cgi?dta
    http://www.stata.com/help.cgi?dta_113
    """

    _header = {}
    _data_location = 0
    _col_sizes = ()
    _has_string_data = False
    _missing_values = False
    #type          code
    #--------------------
    #str1        1 = 0x01
    #str2        2 = 0x02
    #...
    #str244    244 = 0xf4
    #byte      251 = 0xfb  (sic)
    #int       252 = 0xfc
    #long      253 = 0xfd
    #float     254 = 0xfe
    #double    255 = 0xff
    #--------------------
    #NOTE: the byte type seems to be reserved for categorical variables
    # with a label, but the underlying variable is -127 to 100
    # we're going to drop the label and cast to int
    DTYPE_MAP = dict(lzip(lrange(1,245), ['a' + str(i) for i in range(1,245)]) + \
                    [(251, np.int16),(252, np.int32),(253, int),
                        (254, np.float32), (255, np.float64)])
    TYPE_MAP = lrange(251) + list('bhlfd')
    #NOTE: technically, some of these are wrong. there are more numbers
    # that can be represented. it's the 27 ABOVE and BELOW the max listed
    # numeric data type in [U] 12.2.2 of the 11.2 manual
    MISSING_VALUES = {
        'b': (-127, 100),
        'h': (-32767, 32740),
        'l': (-2147483647, 2147483620),
        'f': (-1.701e+38, +1.701e+38),
        'd': (-1.798e+308, +8.988e+307)
    }

    def __init__(self, fname, missing_values=False, encoding=None):
        warnings.warn(
            "StataReader is deprecated as of 0.10.0 and will be removed in a "
            "future version.  Use pandas.read_stata or "
            "pandas.io.stata.StataReader instead.", FutureWarning)

        if encoding is None:
            import locale
            self._encoding = locale.getpreferredencoding()
        else:
            self._encoding = encoding
        self._missing_values = missing_values
        self._parse_header(fname)

    def file_headers(self):
        """
        Returns all .dta file headers.

        out: dict
            Has keys typlist, data_label, lbllist, varlist, nvar, filetype,
            ds_format, nobs, fmtlist, vlblist, time_stamp, srtlist, byteorder
        """
        return self._header

    def file_format(self):
        """
        Returns the file format.

        Returns
        -------
        out : int

        Notes
        -----
        Format 113: Stata 8/9
        Format 114: Stata 10/11
        Format 115: Stata 12
        """
        return self._header['ds_format']

    def file_label(self):
        """
        Returns the dataset's label.

        Returns
        -------
        out: string
        """
        return self._header['data_label']

    def file_timestamp(self):
        """
        Returns the date and time Stata recorded on last file save.

        Returns
        -------
        out : str
        """
        return self._header['time_stamp']

    def variables(self):
        """
        Returns a list of the dataset's StataVariables objects.
        """
        return lmap(
            _StataVariable,
            zip(lrange(self._header['nvar']), self._header['typlist'],
                self._header['varlist'], self._header['srtlist'],
                self._header['fmtlist'], self._header['lbllist'],
                self._header['vlblist']))

    def dataset(self, as_dict=False):
        """
        Returns a Python generator object for iterating over the dataset.


        Parameters
        ----------
        as_dict : bool, optional
            If as_dict is True, yield each row of observations as a dict.
            If False, yields each row of observations as a list.

        Returns
        -------
        Generator object for iterating over the dataset.  Yields each row of
        observations as a list by default.

        Notes
        -----
        If missing_values is True during instantiation of StataReader then
        observations with StataMissingValue(s) are not filtered and should
        be handled by your application.
        """

        try:
            self._file.seek(self._data_location)
        except Exception:
            pass

        if as_dict:
            vars = lmap(str, self.variables())
            for i in range(len(self)):
                yield dict(zip(vars, self._next()))
        else:
            for i in range(self._header['nobs']):
                yield self._next()

    ### Python special methods

    def __len__(self):
        """
        Return the number of observations in the dataset.

        This value is taken directly from the header and includes observations
        with missing values.
        """
        return self._header['nobs']

    def __getitem__(self, k):
        """
        Seek to an observation indexed k in the file and return it, ordered
        by Stata's output to the .dta file.

        k is zero-indexed.  Prefer using R.data() for performance.
        """
        if not (isinstance(k, (int, long))) or k < 0 or k > len(self) - 1:
            raise IndexError(k)
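        # records are fixed width, so observation k lives at a computable
        # byte offset from the start of the data section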
        loc = self._data_location + sum(self._col_size()) * k
        if self._file.tell() != loc:
            self._file.seek(loc)
        return self._next()

    ### Private methods

    def _null_terminate(self, s, encoding):
        null_byte = asbytes('\x00')
        if PY3:  # have bytes not strings, so must decode
            try:
                s = s.lstrip(null_byte)[:s.index(null_byte)]
            except ValueError:  # no null byte found
                pass
            return s.decode(encoding)
        else:
            try:
                return s.lstrip(null_byte)[:s.index(null_byte)]
            except ValueError:  # no null byte found
                return s

    def _parse_header(self, file_object):
        self._file = file_object
        encoding = self._encoding

        # parse headers
        self._header['ds_format'] = unpack('b', self._file.read(1))[0]

        if self._header['ds_format'] not in [113, 114, 115]:
            raise ValueError("Only file formats 113 through 115 (Stata 8 "
                             "through 12) are supported.  Got format %s.  "
                             "Please report if you think this error is "
                             "incorrect." % self._header['ds_format'])
        byteorder = self._header['byteorder'] = (
            '>' if unpack('b', self._file.read(1))[0] == 0x1 else '<')
        self._header['filetype'] = unpack('b', self._file.read(1))[0]
        self._file.read(1)
        nvar = self._header['nvar'] = unpack(byteorder + 'h',
                                             self._file.read(2))[0]
        self._header['nobs'] = unpack(byteorder + 'i', self._file.read(4))[0]
        self._header['data_label'] = self._null_terminate(
            self._file.read(81), encoding)
        self._header['time_stamp'] = self._null_terminate(
            self._file.read(18), encoding)

        # parse descriptors
        typlist = [ord(self._file.read(1)) for i in range(nvar)]
        self._header['typlist'] = [self.TYPE_MAP[typ] for typ in typlist]
        self._header['dtyplist'] = [self.DTYPE_MAP[typ] for typ in typlist]
        self._header['varlist'] = [
            self._null_terminate(self._file.read(33), encoding)
            for i in range(nvar)
        ]
        self._header['srtlist'] = unpack(byteorder + ('h' * (nvar + 1)),
                                         self._file.read(2 * (nvar + 1)))[:-1]
        if self._header['ds_format'] <= 113:
            self._header['fmtlist'] = [
                self._null_terminate(self._file.read(12), encoding)
                for i in range(nvar)
            ]
        else:
            self._header['fmtlist'] = [
                self._null_terminate(self._file.read(49), encoding)
                for i in range(nvar)
            ]
        self._header['lbllist'] = [
            self._null_terminate(self._file.read(33), encoding)
            for i in range(nvar)
        ]
        self._header['vlblist'] = [
            self._null_terminate(self._file.read(81), encoding)
            for i in range(nvar)
        ]

        # ignore expansion fields
        # When reading, read five bytes; the last four bytes now tell you the
        # size of the next read, which you discard.  You then continue like
        # this until you read 5 bytes of zeros.

        while True:
            data_type = unpack(byteorder + 'b', self._file.read(1))[0]
            data_len = unpack(byteorder + 'i', self._file.read(4))[0]
            if data_type == 0:
                break
            self._file.read(data_len)

        # other state vars
        self._data_location = self._file.tell()
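        # typlist stores a string column's width as a plain int, so any int
        # entry means the file contains string data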
        self._has_string_data = len(
            lfilter(lambda x: isinstance(x, int), self._header['typlist'])) > 0
        self._col_size()

    def _calcsize(self, fmt):
        # string columns store their width as an int; other types carry a
        # struct format character
        if isinstance(fmt, int):
            return fmt
        return calcsize(self._header['byteorder'] + fmt)

    def _col_size(self, k=None):
        """Calculate size of a data record."""
        if len(self._col_sizes) == 0:
            self._col_sizes = lmap(lambda x: self._calcsize(x),
                                   self._header['typlist'])
        if k is None:
            return self._col_sizes
        else:
            return self._col_sizes[k]

    def _unpack(self, fmt, byt):
        d = unpack(self._header['byteorder'] + fmt, byt)[0]
        if fmt[-1] in self.MISSING_VALUES:
            nmin, nmax = self.MISSING_VALUES[fmt[-1]]
            if d < nmin or d > nmax:
                if self._missing_values:
                    return StataMissingValue(nmax, d)
                else:
                    return None
        return d

    def _next(self):
        typlist = self._header['typlist']
        if self._has_string_data:
            data = [None] * self._header['nvar']
            for i in range(len(data)):
                if isinstance(typlist[i], int):
                    data[i] = self._null_terminate(self._file.read(typlist[i]),
                                                   self._encoding)
                else:
                    data[i] = self._unpack(typlist[i],
                                           self._file.read(self._col_size(i)))
            return data
        else:
            return lmap(
                lambda i: self._unpack(typlist[i],
                                       self._file.read(self._col_size(i))),
                lrange(self._header['nvar']))
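
# A minimal usage sketch for the deprecated reader above; 'example.dta' is a
# hypothetical local path, and pandas.read_stata is the supported replacement.
def _stata_reader_demo(path='example.dta'):
    with open(path, 'rb') as f:
        reader = StataReader(f)
        print(reader.file_format(), reader.file_label())
        rows = list(reader.dataset())  # materialize the generator
        first = reader[0]              # random access by observation index
    return rows, first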
Example #49
def plot_partregress_grid(results, exog_idx=None, grid=None, fig=None):
    """Plot partial regression for a set of regressors.

    Parameters
    ----------
    results : results instance
        A regression model results instance
    exog_idx : None, list of ints, list of strings
        (column) indices of the exog used in the plot, default is all.
    grid : None or tuple of int (nrows, ncols)
        If grid is given, then it is used for the arrangement of the subplots.
        If grid is None, then ncol is one, if there are only 2 subplots, and
        the number of columns is two otherwise.
    fig : Matplotlib figure instance, optional
        If given, this figure is simply returned.  Otherwise a new figure is
        created.

    Returns
    -------
    fig : Matplotlib figure instance
        If `fig` is None, the created figure.  Otherwise `fig` itself.

    Notes
    -----
    A subplot is created for each explanatory variable given by exog_idx.
    The partial regression plot shows the relationship between the response
    and the given explanatory variable after removing the effect of all other
    explanatory variables in exog.

    See Also
    --------
    plot_partregress : Plot partial regression for a single regressor.
    plot_ccpr : Plot CCPR against one regressor

    Examples
    --------
    Using the state crime dataset, separately plot the effect of each
    variable on the outcome (murder rate) while accounting for the effect
    of all other variables in the model, visualized with a grid of partial
    regression plots.

    >>> from statsmodels.graphics.regressionplots import plot_partregress_grid
    >>> import statsmodels.api as sm
    >>> import matplotlib.pyplot as plt
    >>> import statsmodels.formula.api as smf

    >>> fig = plt.figure(figsize=(8, 6))
    >>> crime_data = sm.datasets.statecrime.load_pandas()
    >>> results = smf.ols('murder ~ hs_grad + urban + poverty + single',
    ...                   data=crime_data.data).fit()
    >>> plot_partregress_grid(results, fig=fig)
    >>> plt.show()

    .. plot:: plots/graphics_regression_partregress_grid.py

    References
    ----------
    See http://www.itl.nist.gov/div898/software/dataplot/refman1/auxillar/partregr.htm

    """
    import pandas
    fig = utils.create_mpl_fig(fig)

    exog_name, exog_idx = utils.maybe_name_or_idx(exog_idx, results.model)

    #maybe add option for using wendog, wexog instead
    y = pandas.Series(results.model.endog, name=results.model.endog_names)
    exog = results.model.exog

    k_vars = exog.shape[1]
    #this function doesn't make sense if k_vars=1

    if grid is not None:
        nrows, ncols = grid
    else:
        if len(exog_idx) > 2:
            nrows = int(np.ceil(len(exog_idx) / 2.))
            ncols = 2
            title_kwargs = {"fontdict": {"fontsize": 'small'}}
        else:
            nrows = len(exog_idx)
            ncols = 1
            title_kwargs = {}

    # for indexing purposes
    other_names = np.array(results.model.exog_names)
    for i, idx in enumerate(exog_idx):
        others = lrange(k_vars)
        others.pop(idx)
        exog_others = pandas.DataFrame(exog[:, others],
                                       columns=other_names[others])
        ax = fig.add_subplot(nrows, ncols, i + 1)
        plot_partregress(y,
                         pandas.Series(exog[:, idx], name=other_names[idx]),
                         exog_others,
                         ax=ax,
                         title_kwargs=title_kwargs,
                         obs_labels=False)
        ax.set_title("")

    fig.suptitle("Partial Regression Plot", fontsize="large")

    fig.tight_layout()

    fig.subplots_adjust(top=.95)
    return fig
Example #50
    def summary(self,
                yname=None,
                xname=None,
                title=0,
                alpha=.05,
                returns='text',
                model_info=None):
        # General part of the summary table
        if title == 0:
            title = 'High Dimensional Fixed Effect Regression Results'

        if isinstance(xname, str):
            xname = [xname]
        if isinstance(yname, str):
            yname = [yname]
        if xname is not None and len(xname) != len(self.xname):
            # GH 2298
            raise ValueError(
                'User supplied xnames must have the same number of '
                'entries as the number of model parameters '
                '({0})'.format(len(self.xname)))

        if yname is not None and len(yname) != len(self.yname):
            raise ValueError(
                'User supplied ynames must have the same number of '
                'entries as the number of model dependent variables '
                '({0})'.format(len(self.yname)))
        if xname is None:
            xname = self.xname
        if yname is None:
            yname = self.yname

        time_now = time.localtime()
        time_of_day = [time.strftime("%H:%M:%S", time_now)]
        date = time.strftime("%a, %d %b %Y", time_now)
        nobs = int(self.nobs)
        df_model = self.df
        resid_std_err = forg(self.resid_std_err, 4)
        Covariance_Type = self.Covariance_Type
        cluster_method = self.cluster_method
        gen_left = [
            ('Dep. Variable:', yname),
            ('No. Observations:',
             [nobs]),  # TODO: What happens with multiple names?
            ('DoF of residual:', [df_model]),
            ('Residual std err:', [resid_std_err]),
            ('Covariance Type:', [Covariance_Type]),
            ('Cluster Method:', [cluster_method])
        ]
        r_squared = forg(self.rsquared, 4)
        rsquared_adj = forg(self.rsquared_adj, 4)
        full_rsquared = forg(self.full_rsquared, 4)
        full_rsquared_adj = forg(self.full_rsquared_adj, 4)
        fvalue = forg(self.fvalue, 4)
        f_pvalue = forg(self.f_pvalue, 4)
        full_fvalue = forg(self.full_fvalue, 4)
        full_f_pvalue = forg(self.full_f_pvalue, 4)
        gen_right = [('R-squared(proj model):', [r_squared]),
                     ('Adj. R-squared(proj model):', [rsquared_adj]),
                     ('R-squared(full model):', [full_rsquared]),
                     ('Adj. R-squared(full model):', [full_rsquared_adj]),
                     ('F-statistic(proj model):', [fvalue]),
                     ('Prob (F-statistic (proj model)):', [f_pvalue]),
                     ('DoF of F-test (proj model):', [self.f_df_proj]),
                     ('F-statistic(full model):', [full_fvalue]),
                     ('Prob (F-statistic (full model)):', [full_f_pvalue]),
                     ('DoF of F-test (full model):', [self.f_df_full])]
        # pad both tables to equal number of rows
        if len(gen_right) < len(gen_left):
            # fill up with blank lines to same length
            gen_right += [(' ', ' ')] * (len(gen_left) - len(gen_right))
        elif len(gen_right) > len(gen_left):
            # fill up with blank lines to same length, just to keep it symmetric
            gen_left += [(' ', ' ')] * (len(gen_right) - len(gen_left))

        gen_stubs_left, gen_data_left = zip_longest(*gen_left)
        gen_title = title
        gen_header = None
        gen_table_left = SimpleTable(gen_data_left,
                                     gen_header,
                                     gen_stubs_left,
                                     title=gen_title,
                                     txt_fmt=gen_fmt)
        gen_stubs_right, gen_data_right = zip_longest(*gen_right)
        gen_table_right = SimpleTable(gen_data_right,
                                      gen_header,
                                      gen_stubs_right,
                                      title=gen_title,
                                      txt_fmt=gen_fmt)
        gen_table_left.extend_right(gen_table_right)
        self.general_table = gen_table_left

        # Parameters part of the summary table
        s_alp = alpha / 2
        c_alp = 1 - alpha / 2
        if Covariance_Type == 'nonrobust':
            self.std_err_name = 'nonrobust std err'
        elif Covariance_Type == 'robust':
            self.std_err_name = 'robust std err'
        elif Covariance_Type == 'clustered':
            self.std_err_name = 'cluster std err'
        else:
            self.std_err_name = 'std err'
        param_header = [
            'coef', self.std_err_name, 't', 'P>|t|', '[' + str(s_alp),
            str(c_alp) + ']'
        ]  # alp + ' Conf. Interval'
        params_stubs = xname
        params = self.params.copy()
        conf_int = self.conf_int(alpha)
        std_err = self.bse.copy()
        exog_len = lrange(len(xname))
        tstat = self.tvalues.copy()
        prob_stat = self.pvalues.copy()
        for i in range(len(self.params)):
            params[i] = forg(self.params[i], 5)
            std_err[i] = forg(self.bse[i], 5)
            tstat[i] = forg(self.tvalues[i], 4)
            prob_stat[i] = forg(self.pvalues[i], 4)

        # SimpleTable should be able to handle the formatting
        params_data = lzip(["%#6.5f" % (params[i]) for i in exog_len],
                           ["%#6.5f" % (std_err[i]) for i in exog_len],
                           ["%#6.4f" % (tstat[i]) for i in exog_len],
                           ["%#6.4f" % (prob_stat[i]) for i in exog_len],
                           ["%#6.4f" % conf_int[0][i] for i in exog_len],
                           ["%#6.4f" % conf_int[1][i] for i in exog_len])
        self.parameter_table = SimpleTable(params_data,
                                           param_header,
                                           params_stubs,
                                           title=None,
                                           txt_fmt=fmt_2)
        print(self.general_table)
        print(self.parameter_table)
        return
Example #51
def summary(self,
            yname=None,
            xname=None,
            title=0,
            alpha=.05,
            returns='text',
            model_info=None):
    """
    Parameters
    ----------
    yname : str
        Optional. Default is `Y`.
    xname : list[str]
        Optional. Default is `X.#` for # in p, the number of regressors.
    alpha : float
        Significance level, in (0, 1), for the confidence intervals.
    title : str
        Optional. Default is 'Generalized linear model'.
    returns : str
        One of 'text', 'table', 'csv', 'latex', 'html'.

    Returns
    -------
    returns='print' (default)
        Prints the summarized results.
    returns='text'
        Prints the summarized results.
    returns='table'
        SimpleTable instance summarizing the fit of a linear model.
    returns='csv'
        A string of csv of the results, to import into a spreadsheet.
    returns='latex'
        Not implemented yet.
    returns='html'
        Not implemented yet.


    Examples (needs updating)
    --------
    >>> import statsmodels.api as sm
    >>> data = sm.datasets.longley.load()
    >>> data.exog = sm.add_constant(data.exog)
    >>> ols_results = sm.OLS(data.endog, data.exog).fit()
    >>> print(ols_results.summary())
    ...

    Notes
    -----
    conf_int calculated from normal dist.
    """
    if title == 0:
        title = _model_types[self.model.__class__.__name__]

    if xname is not None and len(xname) != len(self.params):
        # GH 2298
        raise ValueError('User supplied xnames must have the same number of '
                         'entries as the number of model parameters '
                         '({0})'.format(len(self.params)))

    yname, xname = _getnames(self, yname, xname)

    time_now = time.localtime()
    time_of_day = [time.strftime("%H:%M:%S", time_now)]
    date = time.strftime("%a, %d %b %Y", time_now)
    modeltype = self.model.__class__.__name__
    nobs = self.nobs
    df_model = self.df_model
    df_resid = self.df_resid

    #General part of the summary table, Applicable to all? models
    #------------------------------------------------------------
    # TODO: define this generically, overwrite in model classes
    #replace definition of stubs data by single list
    #e.g.
    gen_left = [
        ('Model type:', [modeltype]),
        ('Date:', [date]),
        ('Dependent Variable:',
         yname),  # TODO: What happens with multiple names?
        ('df model', [df_model])
    ]
    gen_stubs_left, gen_data_left = zip_longest(*gen_left)  #transpose row col

    gen_title = title
    gen_header = None
    gen_table_left = SimpleTable(gen_data_left,
                                 gen_header,
                                 gen_stubs_left,
                                 title=gen_title,
                                 txt_fmt=gen_fmt)

    gen_stubs_right = ('Method:', 'Time:', 'Number of Obs:', 'df resid')
    gen_data_right = (
        [modeltype],  #was dist family need to look at more
        time_of_day,
        [nobs],
        [df_resid])
    gen_table_right = SimpleTable(gen_data_right,
                                  gen_header,
                                  gen_stubs_right,
                                  title=gen_title,
                                  txt_fmt=gen_fmt)
    gen_table_left.extend_right(gen_table_right)
    general_table = gen_table_left

    # Parameters part of the summary table
    # ------------------------------------
    # Note: this is not necessary since we standardized names,
    #  only t versus normal
    tstats = {
        'OLS': self.t(),
        'GLS': self.t(),
        'GLSAR': self.t(),
        'WLS': self.t(),
        'RLM': self.t(),
        'GLM': self.t()
    }
    prob_stats = {
        'OLS': self.pvalues,
        'GLS': self.pvalues,
        'GLSAR': self.pvalues,
        'WLS': self.pvalues,
        'RLM': self.pvalues,
        'GLM': self.pvalues
    }
    # Dictionary to store the header names for the parameter part of the
    # summary table. look up by modeltype
    alp = str((1 - alpha) * 100) + '%'
    param_header = {
        'OLS': ['coef', 'std err', 't', 'P>|t|', alp + ' Conf. Interval'],
        'GLS': ['coef', 'std err', 't', 'P>|t|', alp + ' Conf. Interval'],
        'GLSAR': ['coef', 'std err', 't', 'P>|t|', alp + ' Conf. Interval'],
        'WLS': ['coef', 'std err', 't', 'P>|t|', alp + ' Conf. Interval'],
        'GLM': ['coef', 'std err', 't', 'P>|t|',
                alp + ' Conf. Interval'],  #glm uses t-distribution
        'RLM': ['coef', 'std err', 'z', 'P>|z|',
                alp + ' Conf. Interval']  # check: z
    }
    params_stubs = xname
    params = self.params
    conf_int = self.conf_int(alpha)
    std_err = self.bse
    exog_len = lrange(len(xname))
    tstat = tstats[modeltype]
    prob_stat = prob_stats[modeltype]

    # SimpleTable should be able to handle the formatting
    params_data = lzip(["%#6.4g" % (params[i]) for i in exog_len],
                       ["%#6.4f" % (std_err[i]) for i in exog_len],
                       ["%#6.4f" % (tstat[i]) for i in exog_len],
                       ["%#6.4f" % (prob_stat[i]) for i in exog_len],
                       ["(%#5g, %#5g)" % tuple(conf_int[i]) for i in exog_len])
    parameter_table = SimpleTable(params_data,
                                  param_header[modeltype],
                                  params_stubs,
                                  title=None,
                                  txt_fmt=fmt_2)

    #special table
    #-------------
    #TODO: exists in linear_model, what about other models
    #residual diagnostics

    #output options
    #--------------
    #TODO: JP the rest needs to be fixed, similar to summary in linear_model

    def ols_printer():
        """
        print summary table for ols models
        """
        table = str(general_table) + '\n' + str(parameter_table)
        return table

    def glm_printer():
        table = str(general_table) + '\n' + str(parameter_table)
        return table

    printers = {'OLS': ols_printer, 'GLM': glm_printer}

    if returns == 'print':
        try:
            return printers[modeltype]()
        except KeyError:
            return printers['OLS']()
Example #52
def arma_order_select_ic(y, max_ar=4, max_ma=2, ic='bic', trend='c',
                         model_kw=None, fit_kw=None):
    """
    Returns information criteria for many ARMA models

    Parameters
    ----------
    y : array-like
        Time-series data
    max_ar : int
        Maximum number of AR lags to use. Default 4.
    max_ma : int
        Maximum number of MA lags to use. Default 2.
    ic : str, list
        Information criteria to report. Either a single string or a list
        of different criteria is possible.
    trend : str
        The trend to use when fitting the ARMA models.
    model_kw : dict, optional
        Keyword arguments to be passed to the ``ARMA`` model.
    fit_kw : dict, optional
        Keyword arguments to be passed to ``ARMA.fit``.

    Returns
    -------
    obj : Results object
        Each ic is an attribute with a DataFrame for the results. The AR order
        used is the row index. The MA order used is the column index. The
        minimum orders are available as ``ic_min_order``.

    Examples
    --------

    >>> from statsmodels.tsa.arima_process import arma_generate_sample
    >>> import statsmodels.api as sm
    >>> import numpy as np

    >>> arparams = np.array([.75, -.25])
    >>> maparams = np.array([.65, .35])
    >>> arparams = np.r_[1, -arparams]
    >>> maparams = np.r_[1, maparams]
    >>> nobs = 250
    >>> np.random.seed(2014)
    >>> y = arma_generate_sample(arparams, maparams, nobs)
    >>> res = sm.tsa.arma_order_select_ic(y, ic=['aic', 'bic'], trend='nc')
    >>> res.aic_min_order
    >>> res.bic_min_order

    Notes
    -----
    This method can be used to tentatively identify the order of an ARMA
    process, provided that the time series is stationary and invertible. This
    function computes the full exact MLE estimate of each model and can
    therefore be a little slow. An implementation using approximate estimates
    will be provided in the future. In the meantime, consider passing
    ``{'method': 'css'}`` to ``fit_kw``.
    """
    from pandas import DataFrame

    # avoid mutable default arguments
    model_kw = {} if model_kw is None else model_kw
    fit_kw = {} if fit_kw is None else fit_kw
    ar_range = lrange(0, max_ar + 1)
    ma_range = lrange(0, max_ma + 1)
    if isinstance(ic, string_types):
        ic = [ic]
    elif not isinstance(ic, (list, tuple)):
        raise ValueError("Need a list or a tuple for ic if not a string.")

    results = np.zeros((len(ic), max_ar + 1, max_ma + 1))

    for ar in ar_range:
        for ma in ma_range:
            if ar == 0 and ma == 0 and trend == 'nc':
                results[:, ar, ma] = np.nan
                continue

            mod = _safe_arma_fit(y, (ar, ma), model_kw, trend, fit_kw)
            if mod is None:
                results[:, ar, ma] = np.nan
                continue

            for i, criteria in enumerate(ic):
                results[i, ar, ma] = getattr(mod, criteria)

    dfs = [DataFrame(res, columns=ma_range, index=ar_range) for res in results]

    res = dict(zip(ic, dfs))

    # add the minimums to the results dict
    min_res = {}
    for i, result in iteritems(res):
        mins = np.where(result.min().min() == result)
        min_res.update({i + '_min_order' : (mins[0][0], mins[1][0])})
    res.update(min_res)

    return Bunch(**res)
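
# Sketch: the same minimum-order lookup done directly on the returned
# DataFrames; res is assumed to be the Bunch returned above, and
# DataFrame.stack() yields a Series indexed by (ar, ma) pairs.
def _min_order_demo(res):
    best_bic_order = res.bic.stack().idxmin()  # (ar, ma) with smallest BIC
    assert best_bic_order == res.bic_min_order
    return best_bic_order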
Example #53
from pandas import to_datetime
import numpy as np

_quarter_to_day = {
    "1": (3, 31),
    "2": (6, 30),
    "3": (9, 30),
    "4": (12, 31),
    "I": (3, 31),
    "II": (6, 30),
    "III": (9, 30),
    "IV": (12, 31)
}

_mdays = [31, 28, 31, 30, 31, 30, 31, 31, 30, 31, 30, 31]
_months_with_days = lzip(lrange(1, 13), _mdays)
_month_to_day = dict(zip(map(str, lrange(1, 13)), _months_with_days))
_month_to_day.update(
    dict(
        zip([
            "I", "II", "III", "IV", "V", "VI", "VII", "VIII", "IX", "X", "XI",
            "XII"
        ], _months_with_days)))
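
# Sketch: how the tables above turn a quarter or month token into the
# (month, day) pair marking the period's end; leap years are not handled
# by _mdays.
def _period_end_demo():
    assert _quarter_to_day["2"] == (6, 30)    # quarter 2 ends June 30
    assert _quarter_to_day["II"] == (6, 30)   # Roman numerals work too
    assert _month_to_day["II"] == (2, 28)     # month 2 ends February 28
    month, day = _quarter_to_day["4"]
    return to_datetime("1998-%d-%d" % (month, day))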

# regex patterns
_y_pattern = r'^\d?\d?\d?\d$'

_q_pattern = r'''
^               # beginning of string
\d?\d?\d?\d     # match any number 1-9999, includes leading zeros

(:?q)           # use q or a : as a separator

([1-4])         # match 1-4

$               # end of string
'''
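
# Sketch: the year pattern above is a plain regex, while the quarter pattern
# (as reconstructed here) relies on re.VERBOSE comments, so compile it with
# that flag.
import re
assert re.match(_y_pattern, '1998')
assert re.match(_q_pattern, '1998q2', re.VERBOSE)
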
def influence_plot(results,
                   external=True,
                   alpha=.05,
                   criterion="cooks",
                   size=48,
                   plot_alpha=.75,
                   ax=None,
                   **kwargs):
    """
    Plot of influence in regression. Plots studentized resids vs. leverage.

    Parameters
    ----------
    results : results instance
        A fitted model.
    external : bool
        Whether to use externally or internally studentized residuals. It is
        recommended to leave external as True.
    alpha : float
        The alpha value to identify large studentized residuals. Large means
        abs(resid_studentized) > t.ppf(1-alpha/2, dof=results.df_resid)
    criterion : str {'DFFITS', 'Cooks'}
        Which criterion to base the size of the points on. Options are
        DFFITS or Cook's D.
    size : float
        The range of `criterion` is mapped to marker areas between 8**2 and
        size**2 (in points**2).
    plot_alpha : float
        The `alpha` of the plotted points.
    ax : matplotlib Axes instance
        An instance of a matplotlib Axes.

    Returns
    -------
    fig : matplotlib figure
        The matplotlib figure that contains the Axes.

    Notes
    -----
    Observations are annotated with their row labels when the leverage,
    measured by the diagonal of the hat matrix, is high or the residuals are
    large, since the combination of large residuals and high leverage
    indicates an influential point. The threshold for large residuals can be
    controlled using the `alpha` parameter. Large leverage points are
    identified as hat_i > 2 * (df_model + 1)/nobs.
    """
    fig, ax = utils.create_mpl_ax(ax)

    infl = results.get_influence()

    if criterion.lower().startswith('coo'):
        psize = infl.cooks_distance[0]
    elif criterion.lower().startswith('dff'):
        psize = np.abs(infl.dffits[0])
    else:
        raise ValueError("Criterion %s not understood" % criterion)

    # scale the variables
    #TODO: what is the correct scaling and the assumption here?
    #we want plots to be comparable across different plots
    #so we would need to use the expected distribution of criterion probably
    old_range = np.ptp(psize)
    new_range = size**2 - 8**2
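    # linearly rescale the criterion values onto marker areas in
    # [8**2, size**2]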

    psize = (psize - psize.min()) * new_range / old_range + 8**2

    leverage = infl.hat_matrix_diag
    if external:
        resids = infl.resid_studentized_external
    else:
        resids = infl.resid_studentized_internal

    from scipy import stats

    cutoff = stats.t.ppf(1. - alpha / 2, results.df_resid)
    large_resid = np.abs(resids) > cutoff
    large_leverage = leverage > _high_leverage(results)
    large_points = np.logical_or(large_resid, large_leverage)

    ax.scatter(leverage, resids, s=psize, alpha=plot_alpha)

    # add point labels
    labels = results.model.data.row_labels
    if labels is None:
        labels = lrange(len(resids))
    ax = utils.annotate_axes(
        np.where(large_points)[0], labels, lzip(leverage, resids),
        lzip(-(psize / 2)**.5, (psize / 2)**.5), "x-large", ax)

    #TODO: make configurable or let people do it ex-post?
    font = {"fontsize": 16, "color": "black"}
    ax.set_ylabel("Studentized Residuals", **font)
    ax.set_xlabel("H Leverage", **font)
    ax.set_title("Influence Plot", **font)
    return fig
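
# Usage sketch for influence_plot; any fitted OLS results instance will do
# (the model passed in here is hypothetical).
def _influence_demo(results):
    import matplotlib.pyplot as plt
    fig = influence_plot(results, external=True, criterion="cooks")
    plt.show()
    return fig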
def plot_partregress_grid(results, exog_idx=None, grid=None, fig=None):
    """Plot partial regression for a set of regressors.

    Parameters
    ----------
    results : results instance
        A regression model results instance
    exog_idx : None, list of ints, list of strings
        (column) indices of the exog used in the plot, default is all.
    grid : None or tuple of int (nrows, ncols)
        If grid is given, then it is used for the arrangement of the subplots.
        If grid is None, then ncol is one, if there are only 2 subplots, and
        the number of columns is two otherwise.
    fig : Matplotlib figure instance, optional
        If given, this figure is simply returned.  Otherwise a new figure is
        created.

    Returns
    -------
    fig : Matplotlib figure instance
        If `fig` is None, the created figure.  Otherwise `fig` itself.

    Notes
    -----
    A subplot is created for each explanatory variable given by exog_idx.
    The partial regression plot shows the relationship between the response
    and the given explanatory variable after removing the effect of all other
    explanatory variables in exog.

    See Also
    --------
    plot_partregress : Plot partial regression for a single regressor.
    plot_ccpr : Plot CCPR against one regressor

    References
    ----------
    See http://www.itl.nist.gov/div898/software/dataplot/refman1/auxillar/partregr.htm

    """
    import pandas
    fig = utils.create_mpl_fig(fig)

    exog_name, exog_idx = utils.maybe_name_or_idx(exog_idx, results.model)

    #maybe add option for using wendog, wexog instead
    y = pandas.Series(results.model.endog, name=results.model.endog_names)
    exog = results.model.exog

    k_vars = exog.shape[1]
    #this function doesn't make sense if k_vars=1

    if grid is not None:
        nrows, ncols = grid
    else:
        if len(exog_idx) > 2:
            nrows = int(np.ceil(len(exog_idx) / 2.))
            ncols = 2
            title_kwargs = {"fontdict": {"fontsize": 'small'}}
        else:
            nrows = len(exog_idx)
            ncols = 1
            title_kwargs = {}

    # for indexing purposes
    other_names = np.array(results.model.exog_names)
    for i, idx in enumerate(exog_idx):
        others = lrange(k_vars)
        others.pop(idx)
        exog_others = pandas.DataFrame(exog[:, others],
                                       columns=other_names[others])
        ax = fig.add_subplot(nrows, ncols, i + 1)
        plot_partregress(y,
                         pandas.Series(exog[:, idx], name=other_names[idx]),
                         exog_others,
                         ax=ax,
                         title_kwargs=title_kwargs,
                         obs_labels=False)
        ax.set_title("")

    fig.suptitle("Partial Regression Plot", fontsize="large")

    fig.tight_layout()

    fig.subplots_adjust(top=.95)
    return fig
class StataWriter(object):
    """
    A class for writing Stata binary dta files from array-like objects

    Parameters
    ----------
    fname : file path or buffer
        Where to save the dta file.
    data : array-like
        Array-like input to save. Pandas objects are also accepted.
    convert_dates : dict
        Dictionary mapping column of datetime types to the stata internal
        format that you want to use for the dates. Options are
        'tc', 'td', 'tm', 'tw', 'th', 'tq', 'ty'. Column can be either a
        number or a name.
    encoding : str
        Default is latin-1. Note that Stata does not support unicode.
    byteorder : str
        Can be ">", "<", "little", or "big". The default is None which uses
        `sys.byteorder`

    Returns
    -------
    writer : StataWriter instance
        The StataWriter instance has a write_file method, which will
        write the file to the given `fname`.

    Examples
    --------
    >>> writer = StataWriter('./data_file.dta', data)
    >>> writer.write_file()

    Or with dates

    >>> writer = StataWriter('./date_data_file.dta', date, {2 : 'tw'})
    >>> writer.write_file()
    """
    #type          code
    #--------------------
    #str1        1 = 0x01
    #str2        2 = 0x02
    #...
    #str244    244 = 0xf4
    #byte      251 = 0xfb  (sic)
    #int       252 = 0xfc
    #long      253 = 0xfd
    #float     254 = 0xfe
    #double    255 = 0xff
    #--------------------
    #NOTE: the byte type seems to be reserved for categorical variables
    # with a label, but the underlying variable is -127 to 100
    # we're going to drop the label and cast to int
    DTYPE_MAP = dict(
        lzip(lrange(1, 245), ['a' + str(i) for i in range(1, 245)]) +
        [(251, np.int16), (252, np.int32), (253, int),
         (254, np.float32), (255, np.float64)])
    TYPE_MAP = lrange(251) + list('bhlfd')
    MISSING_VALUES = {
        'b': 101,
        'h': 32741,
        'l': 2147483621,
        'f': 1.7014118346046923e+38,
        'd': 8.98846567431158e+307
    }

    def __init__(self,
                 fname,
                 data,
                 convert_dates=None,
                 encoding="latin-1",
                 byteorder=None):
        warnings.warn(
            "StataWriter is deprecated as of 0.10.0 and will be removed in a "
            "future version.  Use pandas.DataFrame.to_stata or "
            "pandas.io.stata.StatWriter instead.", FutureWarning)

        self._convert_dates = convert_dates
        # attach nobs, nvars, data, varlist, typlist
        if data_util._is_using_pandas(data, None):
            self._prepare_pandas(data)

        elif data_util._is_array_like(data, None):
            data = np.asarray(data)
            if data_util._is_structured_ndarray(data):
                self._prepare_structured_array(data)
            else:
                if convert_dates is not None:
                    raise ValueError("Not able to convert dates in a plain"
                                     " ndarray.")
                self._prepare_ndarray(data)

        else:  # pragma : no cover
            raise ValueError("Type %s for data not understood" % type(data))

        if byteorder is None:
            byteorder = sys.byteorder
        self._byteorder = _set_endianness(byteorder)
        self._encoding = encoding
        self._file = get_file_obj(fname, 'wb', encoding)

    def _write(self, to_write):
        """
        Helper to call asbytes before writing to file for Python 3 compat.
        """
        self._file.write(asbytes(to_write))

    def _prepare_structured_array(self, data):
        self.nobs = len(data)
        self.nvar = len(data.dtype)
        self.data = data
        self.datarows = iter(data)
        dtype = data.dtype
        descr = dtype.descr
        if dtype.names is None:
            varlist = _default_names(self.nvar)
        else:
            varlist = dtype.names

        # check for datetime and change the type
        convert_dates = self._convert_dates
        if convert_dates is not None:
            convert_dates = _maybe_convert_to_int_keys(convert_dates, varlist)
            self._convert_dates = convert_dates
            for key in convert_dates:
                descr[key] = (descr[key][0],
                              _convert_datetime_to_stata_type(
                                  convert_dates[key]))
            dtype = np.dtype(descr)

        self.varlist = varlist
        self.typlist = [
            _dtype_to_stata_type(dtype[i]) for i in range(self.nvar)
        ]
        self.fmtlist = [
            _dtype_to_default_stata_fmt(dtype[i]) for i in range(self.nvar)
        ]
        # set the given format for the datetime cols
        if convert_dates is not None:
            for key in convert_dates:
                self.fmtlist[key] = convert_dates[key]

    def _prepare_ndarray(self, data):
        if data.ndim == 1:
            data = data[:, None]
        self.nobs, self.nvar = data.shape
        self.data = data
        self.datarows = iter(data)
        #TODO: this should be user settable
        dtype = data.dtype
        self.varlist = _default_names(self.nvar)
        self.typlist = [_dtype_to_stata_type(dtype) for i in range(self.nvar)]
        self.fmtlist = [
            _dtype_to_default_stata_fmt(dtype) for i in range(self.nvar)
        ]

    def _prepare_pandas(self, data):
        #NOTE: we might need a different API / class for pandas objects so
        # we can set different semantics - handle this with a PR to pandas.io
        class DataFrameRowIter(object):
            def __init__(self, data):
                self.data = data

            def __iter__(self):
                for i, row in data.iterrows():
                    yield row

        data = data.reset_index()
        self.datarows = DataFrameRowIter(data)
        self.nobs, self.nvar = data.shape
        self.data = data
        self.varlist = data.columns.tolist()
        dtypes = data.dtypes
        convert_dates = self._convert_dates
        if convert_dates is not None:
            convert_dates = _maybe_convert_to_int_keys(convert_dates,
                                                       self.varlist)
            self._convert_dates = convert_dates
            for key in convert_dates:
                new_type = _convert_datetime_to_stata_type(convert_dates[key])
                dtypes[key] = np.dtype(new_type)
        self.typlist = [_dtype_to_stata_type(dt) for dt in dtypes]
        self.fmtlist = [_dtype_to_default_stata_fmt(dt) for dt in dtypes]
        # set the given format for the datetime cols
        if convert_dates is not None:
            for key in convert_dates:
                self.fmtlist[key] = convert_dates[key]

    def write_file(self):
        self._write_header()
        self._write_descriptors()
        self._write_variable_labels()
        # write 5 zeros for expansion fields
        self._write(_pad_bytes("", 5))
        if self._convert_dates is None:
            self._write_data_nodates()
        else:
            self._write_data_dates()
        #self._write_value_labels()

    def _write_header(self, data_label=None, time_stamp=None):
        byteorder = self._byteorder
        # ds_format - just use 114
        self._write(pack("b", 114))
        # byteorder
        self._write("\x01" if byteorder == ">" else "\x02")
        # filetype
        self._write("\x01")
        # unused
        self._write("\x00")
        # number of vars, 2 bytes
        self._write(pack(byteorder + "h", self.nvar)[:2])
        # number of obs, 4 bytes
        self._write(pack(byteorder + "i", self.nobs)[:4])
        # data label 81 bytes, char, null terminated
        if data_label is None:
            self._write(
                self._null_terminate(_pad_bytes("", 80), self._encoding))
        else:
            self._write(
                self._null_terminate(_pad_bytes(data_label[:80], 80),
                                     self._encoding))
        # time stamp, 18 bytes, char, null terminated
        # format dd Mon yyyy hh:mm
        if time_stamp is None:
            time_stamp = datetime.datetime.now()
        elif not isinstance(time_stamp, datetime.datetime):
            raise ValueError("time_stamp should be a datetime instance")
        self._write(
            self._null_terminate(time_stamp.strftime("%d %b %Y %H:%M"),
                                 self._encoding))

    def _write_descriptors(self,
                           typlist=None,
                           varlist=None,
                           srtlist=None,
                           fmtlist=None,
                           lbllist=None):
        nvar = self.nvar
        # typlist, length nvar, format byte array
        for typ in self.typlist:
            self._write(typ)

        # varlist, length 33*nvar, char array, null terminated
        for name in self.varlist:
            name = self._null_terminate(name, self._encoding)
            name = _pad_bytes(asstr(name[:32]), 33)
            self._write(name)

        # srtlist, 2*(nvar+1), int array, encoded by byteorder
        srtlist = _pad_bytes("", (2 * (nvar + 1)))
        self._write(srtlist)

        # fmtlist, 49*nvar, char array
        for fmt in self.fmtlist:
            self._write(_pad_bytes(fmt, 49))

        # lbllist, 33*nvar, char array
        #NOTE: this is where you could get fancy with pandas categorical type
        for i in range(nvar):
            self._write(_pad_bytes("", 33))

    def _write_variable_labels(self, labels=None):
        nvar = self.nvar
        if labels is None:
            for i in range(nvar):
                self._write(_pad_bytes("", 81))

    def _write_data_nodates(self):
        data = self.datarows
        byteorder = self._byteorder
        TYPE_MAP = self.TYPE_MAP
        typlist = self.typlist
        for row in data:
            #row = row.squeeze().tolist() # needed for structured arrays
            for i, var in enumerate(row):
                typ = ord(typlist[i])
                if typ <= 244:  # we've got a string
                    if len(var) < typ:
                        var = _pad_bytes(asstr(var), len(var) + 1)
                    self._write(var)
                else:
                    try:
                        if typ in _type_converters:
                            var = _type_converters[typ](var)
                        self._write(pack(byteorder + TYPE_MAP[typ], var))
                    except struct_error:
                        # have to be strict about type pack won't do any
                        # kind of casting
                        self._write(
                            pack(byteorder + TYPE_MAP[typ],
                                 _type_converters[typ](var)))

    def _write_data_dates(self):
        convert_dates = self._convert_dates
        data = self.datarows
        byteorder = self._byteorder
        TYPE_MAP = self.TYPE_MAP
        MISSING_VALUES = self.MISSING_VALUES
        typlist = self.typlist
        for row in data:
            #row = row.squeeze().tolist() # needed for structured arrays
            for i, var in enumerate(row):
                typ = ord(typlist[i])
                #NOTE: If anyone finds this terribly slow, there is
                # a vectorized way to convert dates, see genfromdta for going
                # from int to datetime and reverse it. will copy data though
                if i in convert_dates:
                    var = _datetime_to_stata_elapsed(var, self.fmtlist[i])
                if typ <= 244:  # we've got a string
                    if isnull(var):
                        var = ""  # missing string
                    if len(var) < typ:
                        var = _pad_bytes(var, len(var) + 1)
                    self._write(var)
                else:
                    if isnull(var):  # this only matters for floats
                        var = MISSING_VALUES[typ]
                    self._write(pack(byteorder + TYPE_MAP[typ], var))

    def _null_terminate(self, s, encoding):
        null_byte = '\x00'
        if PY3:
            s += null_byte
            return s.encode(encoding)
        else:
            s += null_byte
            return s
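
# Round-trip sketch for the deprecated writer above; 'demo.dta' is a
# hypothetical path, and pandas.DataFrame.to_stata is the supported
# replacement.
def _stata_writer_demo(df, path='demo.dta'):
    writer = StataWriter(path, df)
    writer.write_file()
    import pandas as pd
    return pd.read_stata(path)  # verify what was written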
Example #57
def summary_params(results,
                   yname=None,
                   xname=None,
                   alpha=.05,
                   use_t=True,
                   skip_header=False,
                   title=None):
    '''create a summary table for the parameters

    Parameters
    ----------
    res : results instance
        some required information is directly taken from the result
        instance
    yname : {str, None}
        optional name for the endogenous variable, default is "y"
    xname : {list[str], None}
        optional names for the exogenous variables, default is "var_xx"
    alpha : float
        significance level for the confidence intervals
    use_t : bool
        indicator whether the p-values are based on the Student-t
        distribution (if True) or on the normal distribution (if False)
    skip_header : bool
        If False (default), the header row is added. If True, no header
        row is added.

    Returns
    -------
    params_table : SimpleTable instance
    '''

    # Parameters part of the summary table
    # ------------------------------------
    # Note: this is not necessary since we standardized names,
    #   only t versus normal

    if isinstance(results, tuple):
        # for multivariate endog
        # TODO: check whether I do not want to refactor this
        #we need to give parameter alpha to conf_int
        results, params, std_err, tvalues, pvalues, conf_int = results
    else:
        params = results.params
        std_err = results.bse
        tvalues = results.tvalues  # is this sometimes called zvalues
        pvalues = results.pvalues
        conf_int = results.conf_int(alpha)
    if params.size == 0:
        return SimpleTable([['No Model Parameters']])
    # Dictionary to store the header names for the parameter part of the
    # summary table. look up by modeltype
    if use_t:
        param_header = [
            'coef', 'std err', 't', 'P>|t|', '[' + str(alpha / 2),
            str(1 - alpha / 2) + ']'
        ]
    else:
        param_header = [
            'coef', 'std err', 'z', 'P>|z|', '[' + str(alpha / 2),
            str(1 - alpha / 2) + ']'
        ]

    if skip_header:
        param_header = None

    _, xname = _getnames(results, yname=yname, xname=xname)

    if len(xname) != len(params):
        raise ValueError('xnames and params do not have the same length')

    params_stubs = xname

    exog_idx = lrange(len(xname))

    params_data = lzip([forg(params[i], prec=4) for i in exog_idx],
                       [forg(std_err[i]) for i in exog_idx],
                       [forg(tvalues[i]) for i in exog_idx],
                       ["%#6.3f" % (pvalues[i]) for i in exog_idx],
                       [forg(conf_int[i, 0]) for i in exog_idx],
                       [forg(conf_int[i, 1]) for i in exog_idx])
    parameter_table = SimpleTable(params_data,
                                  param_header,
                                  params_stubs,
                                  title=title,
                                  txt_fmt=fmt_params)

    return parameter_table
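
# Usage sketch: build a parameter table from a fitted results instance
# (hypothetical random-data OLS fit shown).
def _summary_params_demo():
    import numpy as np
    import statsmodels.api as sm
    x = sm.add_constant(np.random.standard_normal((50, 2)))
    y = np.dot(x, [1.0, 2.0, -1.0]) + np.random.standard_normal(50)
    res = sm.OLS(y, x).fit()
    return summary_params(res, alpha=0.05, use_t=True)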
Example #58
def anova2_lm_single(model, design_info, n_rows, test, pr_test, robust):
    """
    Anova type II table for one fitted linear model.

    Parameters
    ----------
    model : fitted linear model results instance
        A fitted linear model

    **kwargs**

    scale : float
        Estimate of variance. If None, it will be estimated from the largest
        model. Default is None.
    test : str {"F", "Chisq", "Cp"} or None
        Test statistics to provide. Default is "F".
    Notes
    -----
    Use of this function is discouraged. Use anova_lm instead.

    Type II Sum of Squares compares the marginal contribution of terms. Thus,
    it is not particularly useful for models with significant interaction
    terms.
    """
    terms_info = design_info.terms[:] # copy
    terms_info = _remove_intercept_patsy(terms_info)

    names = ['sum_sq', 'df', test, pr_test]

    table = DataFrame(np.zeros((n_rows, 4)), columns = names)
    cov = _get_covariance(model, None)
    robust_cov = _get_covariance(model, robust)
    col_order = []
    index = []
    for i, term in enumerate(terms_info):
        # grab all variables except interaction effects that contain term
        # need two hypothesis matrices: L1 is most restrictive, i.e., term==0;
        # L2 is everything except term==0
        cols = design_info.slice(term)
        L1 = lrange(cols.start, cols.stop)
        L2 = []
        term_set = set(term.factors)
        for t in terms_info: # for the term you have
            other_set = set(t.factors)
            if term_set.issubset(other_set) and not term_set == other_set:
                col = design_info.slice(t)
                # on a higher order term containing current `term`
                L1.extend(lrange(col.start, col.stop))
                L2.extend(lrange(col.start, col.stop))

        L1 = np.eye(model.model.exog.shape[1])[L1]
        L2 = np.eye(model.model.exog.shape[1])[L2]

        if L2.size:
            LVL = np.dot(np.dot(L1,robust_cov),L2.T)
            from scipy import linalg
            orth_compl,_ = linalg.qr(LVL)
            r = L1.shape[0] - L2.shape[0]
            # L1|2
            # use the non-unique orthogonal completion since L12 is rank r
            L12 = np.dot(orth_compl[:,-r:].T, L1)
        else:
            L12 = L1
            r = L1.shape[0]
        if test == 'F':
            f = model.f_test(L12, cov_p=robust_cov)
            table.loc[table.index[i], test] = test_value = f.fvalue
            table.loc[table.index[i], pr_test] = f.pvalue

        # need to back out SSR from f_test
        table.loc[table.index[i], 'df'] = r
        col_order.append(cols.start)
        index.append(term.name())

    table.index = Index(index + ['Residual'])
    table = table.iloc[np.argsort(col_order + [model.model.exog.shape[1]+1])]
    # back out sum of squares from f_test
    ssr = table[test] * table['df'] * model.ssr/model.df_resid
    table['sum_sq'] = ssr
    # fill in residual
    table.loc['Residual', ['sum_sq','df', test, pr_test]] = (model.ssr,
                                                            model.df_resid,
                                                            np.nan, np.nan)

    return table
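
# As the Notes advise, prefer anova_lm with typ=2 over calling this helper
# directly; a minimal sketch using the statecrime dataset:
def _anova_typ2_demo():
    import statsmodels.api as sm
    import statsmodels.formula.api as smf
    crime_data = sm.datasets.statecrime.load_pandas()
    fit = smf.ols('murder ~ hs_grad + urban', data=crime_data.data).fit()
    return sm.stats.anova_lm(fit, typ=2)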
Example #59
def test_pandasacovf():
    s = Series(lrange(1, 11))
    assert_almost_equal(acovf(s), acovf(s.values))
Example #60
def plot_partregress(endog,
                     exog_i,
                     exog_others,
                     data=None,
                     title_kwargs={},
                     obs_labels=True,
                     label_kwargs={},
                     ax=None,
                     ret_coords=False,
                     **kwargs):
    """Plot partial regression for a single regressor.

    Parameters
    ----------
    endog : ndarray or string
       endogenous or response variable. If a string is given, you can use
       arbitrary transformations as with a formula.
    exog_i : ndarray or string
        exogenous, explanatory variable. If a string is given, you can use
        arbitrary transformations as with a formula.
    exog_others : ndarray or list of strings
        other exogenous, explanatory variables. If a list of strings is given,
        each item is a term in the formula. You can use arbitrary
        transformations as with a formula. The effect of these variables will
        be removed by OLS regression.
    data : DataFrame, dict, or recarray
        Some kind of data structure with names if the other variables are
        given as strings.
    title_kwargs : dict
        Keyword arguments to pass on for the title. The key to control the
        fonts is fontdict.
    obs_labels : bool or array-like
        Whether or not to annotate the plot points with their observation
        labels. If obs_labels is a boolean, the point labels will try to do
        the right thing. First it will try to use the index of data, then
        fall back to the index of exog_i. Alternatively, you may give an
        array-like object corresponding to the observation numbers.
    label_kwargs : dict
        Keyword arguments that control annotate for the observation labels.
    ax : Matplotlib AxesSubplot instance, optional
        If given, this subplot is used to plot in instead of a new figure being
        created.
    ret_coords : bool
        If True will return the coordinates of the points in the plot. You
        can use this to add your own annotations.
    kwargs
        The keyword arguments passed to plot for the points.

    Returns
    -------
    fig : Matplotlib figure instance
        If `ax` is None, the created figure.  Otherwise the figure to which
        `ax` is connected.
    coords : list, optional
        If ret_coords is True, return a tuple of arrays (x_coords, y_coords).

    Notes
    -----
    The slope of the fitted line is that of `exog_i` in the full
    multiple regression. The individual points can be used to assess the
    influence of points on the estimated coefficient.

    See Also
    --------
    plot_partregress_grid : Plot partial regression for a set of regressors.

    Examples
    --------
    Load the Statewide Crime data set and plot the partial regression of the
    murder rate (murder) on the rate of high school graduation (hs_grad).

    The effects of the percent of the population living in urban areas (urban),
    below the poverty line (poverty), and in a single person household (single)
    are removed by OLS regression.

    >>> import statsmodels.api as sm
    >>> import matplotlib.pyplot as plt

    >>> crime_data = sm.datasets.statecrime.load_pandas()
    >>> sm.graphics.plot_partregress(endog='murder', exog_i='hs_grad',
    ...                              exog_others=['urban', 'poverty', 'single'],
    ...                              data=crime_data.data, obs_labels=False)
    >>> plt.show()

    .. plot:: plots/graphics_regression_partregress.py

    More detailed examples can be found in the Regression Plots notebook
    on the examples page.

    """
    #NOTE: there is no interaction between possible missing data and
    #obs_labels yet, so this will need to be tweaked a bit for this case
    fig, ax = utils.create_mpl_ax(ax)

    # strings, use patsy to transform to data
    if isinstance(endog, string_types):
        endog = dmatrix(endog + "-1", data)

    if isinstance(exog_others, string_types):
        RHS = dmatrix(exog_others, data)
    elif isinstance(exog_others, list):
        RHS = "+".join(exog_others)
        RHS = dmatrix(RHS, data)
    else:
        RHS = exog_others
    RHS_isempty = False
    if isinstance(RHS, np.ndarray) and RHS.size == 0:
        RHS_isempty = True
    elif isinstance(RHS, pd.DataFrame) and RHS.empty:
        RHS_isempty = True
    if isinstance(exog_i, string_types):
        exog_i = dmatrix(exog_i + "-1", data)

    # all arrays or pandas-like

    if RHS_isempty:
        ax.plot(endog, exog_i, 'o', **kwargs)
        fitted_line = OLS(endog, exog_i).fit()
        x_axis_endog_name = 'x' if isinstance(exog_i,
                                              np.ndarray) else exog_i.name
        y_axis_endog_name = 'y' if isinstance(
            endog, np.ndarray) else endog.design_info.column_names[0]
    else:
        res_yaxis = OLS(endog, RHS).fit()
        res_xaxis = OLS(exog_i, RHS).fit()
        xaxis_resid = res_xaxis.resid
        yaxis_resid = res_yaxis.resid
        x_axis_endog_name = res_xaxis.model.endog_names
        y_axis_endog_name = res_yaxis.model.endog_names
        ax.plot(xaxis_resid, yaxis_resid, 'o', **kwargs)
        fitted_line = OLS(yaxis_resid, xaxis_resid).fit()

    fig = abline_plot(0, fitted_line.params[0], color='k', ax=ax)

    if x_axis_endog_name == 'y':  # for no names regression will just get a y
        x_axis_endog_name = 'x'  # this is misleading, so use x
    ax.set_xlabel("e(%s | X)" % x_axis_endog_name)
    ax.set_ylabel("e(%s | X)" % y_axis_endog_name)
    ax.set_title('Partial Regression Plot', **title_kwargs)

    #NOTE: if we want to get super fancy, we could annotate if a point is
    #clicked using this widget
    #http://stackoverflow.com/questions/4652439/
    #is-there-a-matplotlib-equivalent-of-matlabs-datacursormode/
    #4674445#4674445
    if obs_labels is True:
        if data is not None:
            obs_labels = data.index
        elif hasattr(exog_i, "index"):
            obs_labels = exog_i.index
        else:
            obs_labels = res_xaxis.model.data.row_labels
        #NOTE: row_labels can be None.
        #Maybe we should fix this to never be the case.
        if obs_labels is None:
            obs_labels = lrange(len(exog_i))

    if obs_labels is not False:  # could be array-like
        if len(obs_labels) != len(exog_i):
            raise ValueError("obs_labels does not match length of exog_i")
        label_kwargs.update(dict(ha="center", va="bottom"))
        ax = utils.annotate_axes(lrange(len(obs_labels)),
                                 obs_labels,
                                 lzip(res_xaxis.resid, res_yaxis.resid),
                                 [(0, 5)] * len(obs_labels),
                                 "x-large",
                                 ax=ax,
                                 **label_kwargs)

    if ret_coords:
        return fig, (res_xaxis.resid, res_yaxis.resid)
    else:
        return fig