def test_wls_example():
    # example from the docstring, there was a note about a bug, should
    # be fixed now
    Y = [1, 3, 4, 5, 2, 3, 4]
    X = lrange(1, 8)
    X = add_constant(X, prepend=False)
    wls_model = WLS(Y, X, weights=lrange(1, 8)).fit()
    # taken from R lm.summary
    assert_almost_equal(wls_model.fvalue, 0.127337843215, 6)
    assert_almost_equal(wls_model.scale, 2.44608530786**2, 6)
def test_arma_order_select_ic():
    # smoke test, assumes info-criteria are right
    from statsmodels.tsa.arima_process import arma_generate_sample

    arparams = np.array([.75, -.25])
    maparams = np.array([.65, .35])
    arparams = np.r_[1, -arparams]
    maparam = np.r_[1, maparams]
    nobs = 250
    np.random.seed(2014)
    y = arma_generate_sample(arparams, maparams, nobs)
    res = arma_order_select_ic(y, ic=['aic', 'bic'], trend='nc')
    # regression tests in case we change algorithm to minic in sas
    aic_x = np.array([[np.nan, 552.7342255, 484.29687843],
                      [562.10924262, 485.5197969, 480.32858497],
                      [507.04581344, 482.91065829, 481.91926034],
                      [484.03995962, 482.14868032, 483.86378955],
                      [481.8849479, 483.8377379, 485.83756612]])
    bic_x = np.array([[np.nan, 559.77714733, 494.86126118],
                      [569.15216446, 496.08417966, 494.41442864],
                      [517.61019619, 496.99650196, 499.52656493],
                      [498.12580329, 499.75598491, 504.99255506],
                      [499.49225249, 504.96650341, 510.48779255]])
    aic = DataFrame(aic_x, index=lrange(5), columns=lrange(3))
    bic = DataFrame(bic_x, index=lrange(5), columns=lrange(3))
    assert_almost_equal(res.aic.values, aic.values, 5)
    assert_almost_equal(res.bic.values, bic.values, 5)
    assert_equal(res.aic_min_order, (1, 2))
    assert_equal(res.bic_min_order, (1, 2))
    assert_(res.aic.index.equals(aic.index))
    assert_(res.aic.columns.equals(aic.columns))
    assert_(res.bic.index.equals(bic.index))
    assert_(res.bic.columns.equals(bic.columns))

    index = pd.date_range('2000-1-1', freq='M', periods=len(y))
    y_series = pd.Series(y, index=index)
    res_pd = arma_order_select_ic(y_series, max_ar=2, max_ma=1,
                                  ic=['aic', 'bic'], trend='nc')
    assert_almost_equal(res_pd.aic.values, aic.values[:3, :2], 5)
    assert_almost_equal(res_pd.bic.values, bic.values[:3, :2], 5)
    assert_equal(res_pd.aic_min_order, (2, 1))
    assert_equal(res_pd.bic_min_order, (1, 1))

    res = arma_order_select_ic(y, ic='aic', trend='nc')
    assert_almost_equal(res.aic.values, aic.values, 5)
    assert_(res.aic.index.equals(aic.index))
    assert_(res.aic.columns.equals(aic.columns))
    assert_equal(res.aic_min_order, (1, 2))
def _make_predict_dates(self):
    data = self.data
    dtstart = data.predict_start
    dtend = data.predict_end
    freq = data.freq

    if freq is not None:
        pandas_freq = _freq_to_pandas[freq]
        try:
            from pandas import DatetimeIndex
            dates = DatetimeIndex(start=dtstart, end=dtend,
                                  freq=pandas_freq)
        except ImportError as err:
            from pandas import DateRange
            dates = DateRange(dtstart, dtend, offset=pandas_freq).values
    # handle
    elif freq is None and (isinstance(dtstart, int) and
                           isinstance(dtend, int)):
        from pandas import Index
        dates = Index(lrange(dtstart, dtend + 1))
    # if freq is None and dtstart and dtend aren't integers, we're
    # in sample
    else:
        dates = self.data.dates
        start = self._get_dates_loc(dates, dtstart)
        end = self._get_dates_loc(dates, dtend)
        dates = dates[start:end + 1]  # is this index inclusive?
    self.data.predict_dates = dates
def maineffect_func(value, reference=reference):
    rvalue = []
    keep = lrange(value.shape[0])
    keep.pop(reference)
    for i in range(len(keep)):
        rvalue.append(value[keep[i]] - value[reference])
    return np.array(rvalue)
def print_ic_table(ics, selected_orders):
    """
    For VAR order selection
    """
    # Can factor this out into a utility method if so desired
    cols = sorted(ics)

    data = mat([["%#10.4g" % v for v in ics[c]] for c in cols],
               dtype=object).T

    # start minimums
    for i, col in enumerate(cols):
        idx = int(selected_orders[col]), i
        data[idx] = data[idx] + '*'
        # data[idx] = data[idx][:-1] + '*'  # super hack, ugh

    fmt = dict(_default_table_fmt,
               data_fmts=("%s",) * len(cols))

    buf = StringIO()
    table = SimpleTable(data, cols, lrange(len(data)),
                        title='VAR Order Selection', txt_fmt=fmt)
    buf.write(str(table) + '\n')
    buf.write('* Minimum' + '\n')
    print(buf.getvalue())
def plot_with_error(y, error, x=None, axes=None, value_fmt='k',
                    error_fmt='k--', alpha=0.05, stderr_type='asym'):
    """
    Make plot with optional error bars

    Parameters
    ----------
    y :
    error : array or None
    """
    import matplotlib.pyplot as plt

    if axes is None:
        axes = plt.gca()

    x = x if x is not None else lrange(len(y))
    plot_action = lambda y, fmt: axes.plot(x, y, fmt)
    plot_action(y, value_fmt)

    # changed this
    if error is not None:
        if stderr_type == 'asym':
            q = util.norm_signif_level(alpha)
            plot_action(y - q * error, error_fmt)
            plot_action(y + q * error, error_fmt)
        if stderr_type in ('mc', 'sz1', 'sz2', 'sz3'):
            plot_action(error[0], error_fmt)
            plot_action(error[1], error_fmt)
def test_pickle():
    import tempfile
    from numpy.testing import assert_equal
    tmpdir = tempfile.mkdtemp(prefix='pickle')
    a = lrange(10)
    save_pickle(a, tmpdir + '/res.pkl')
    b = load_pickle(tmpdir + '/res.pkl')
    assert_equal(a, b)

    # cleanup, tested on Windows
    try:
        import os
        os.remove(tmpdir + '/res.pkl')
        os.rmdir(tmpdir)
    except (OSError, IOError):
        pass
    assert not os.path.exists(tmpdir)

    # test with file handle
    fh = BytesIO()
    save_pickle(a, fh)
    fh.seek(0, 0)
    c = load_pickle(fh)
    fh.close()
    assert_equal(a, c)
def _make_predict_dates(self):
    data = self.data
    dtstart = data.predict_start
    dtend = data.predict_end
    freq = data.freq

    if freq is not None:
        pandas_freq = _freq_to_pandas[freq]
        # preserve PeriodIndex or DatetimeIndex
        dates = self.data.dates.__class__(start=dtstart,
                                          end=dtend,
                                          freq=pandas_freq)
    # handle
    elif freq is None and (isinstance(dtstart, (int, long)) and
                           isinstance(dtend, (int, long))):
        from pandas import Index
        dates = Index(lrange(dtstart, dtend + 1))
    # if freq is None and dtstart and dtend aren't integers, we're
    # in sample
    else:
        dates = self.data.dates
        start = self._get_dates_loc(dates, dtstart)
        end = self._get_dates_loc(dates, dtend)
        dates = dates[start:end + 1]  # is this index inclusive?
    self.data.predict_dates = dates
def _plot_leverage_resid2(results, influence, alpha=.05, ax=None, **kwargs):
    from scipy.stats import zscore, norm

    fig, ax = utils.create_mpl_ax(ax)
    infl = influence
    leverage = infl.hat_matrix_diag
    resid = zscore(infl.resid)
    ax.plot(resid**2, leverage, 'o', **kwargs)
    ax.set_xlabel("Normalized residuals**2")
    ax.set_ylabel("Leverage")
    ax.set_title("Leverage vs. Normalized residuals squared")

    large_leverage = leverage > _high_leverage(results)
    # norm or t here if standardized?
    cutoff = norm.ppf(1. - alpha / 2)
    large_resid = np.abs(resid) > cutoff
    labels = results.model.data.row_labels
    if labels is None:
        labels = lrange(int(results.nobs))
    index = np.where(np.logical_or(large_leverage, large_resid))[0]
    ax = utils.annotate_axes(index, labels, lzip(resid**2, leverage),
                             [(0, 5)] * int(results.nobs), "large",
                             ax=ax, ha="center", va="bottom")
    ax.margins(.075, .075)
    return fig
def irf_grid_plot(values, stderr, impcol, rescol, names, title,
                  signif=0.05, hlines=None, subplot_params=None,
                  plot_params=None, figsize=(10, 10), stderr_type='asym'):
    """
    Reusable function to make flexible grid plots of impulse responses
    and cumulative effects

    values : (T + 1) x k x k
    stderr : T x k x k
    hlines : k x k
    """
    import matplotlib.pyplot as plt

    if subplot_params is None:
        subplot_params = {}
    if plot_params is None:
        plot_params = {}

    nrows, ncols, to_plot = _get_irf_plot_config(names, impcol, rescol)

    fig, axes = plt.subplots(nrows=nrows, ncols=ncols, sharex=True,
                             squeeze=False, figsize=figsize)

    # fill out space
    adjust_subplots()

    fig.suptitle(title, fontsize=14)

    subtitle_temp = r'%s$\rightarrow$%s'

    k = len(names)
    rng = lrange(len(values))
    for (j, i, ai, aj) in to_plot:
        ax = axes[ai][aj]

        # HACK?
        if stderr is not None:
            if stderr_type == 'asym':
                sig = np.sqrt(stderr[:, j * k + i, j * k + i])
                plot_with_error(values[:, i, j], sig, x=rng, axes=ax,
                                alpha=signif, value_fmt='b',
                                stderr_type=stderr_type)
            if stderr_type in ('mc', 'sz1', 'sz2', 'sz3'):
                errs = stderr[0][:, i, j], stderr[1][:, i, j]
                plot_with_error(values[:, i, j], errs, x=rng, axes=ax,
                                alpha=signif, value_fmt='b',
                                stderr_type=stderr_type)
        else:
            plot_with_error(values[:, i, j], None, x=rng, axes=ax,
                            value_fmt='b')

        ax.axhline(0, color='k')

        if hlines is not None:
            ax.axhline(hlines[i, j], color='k')

        sz = subplot_params.get('fontsize', 12)
        ax.set_title(subtitle_temp % (names[j], names[i]), fontsize=sz)

    return fig
def _maybe_reset_index(data):
    """
    All the Rdatasets have the integer row.labels from R if there is no
    real index. Strip this for a zero-based index
    """
    if data.index.equals(Index(lrange(1, len(data) + 1))):
        data = data.reset_index(drop=True)
    return data
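# A minimal usage sketch for _maybe_reset_index (hypothetical data; assumes
# the module-level imports the function itself relies on, i.e. pandas' Index
# and the statsmodels lrange helper, which is just list(range(...))).
# A frame carrying R's default 1-based row labels gets a zero-based index;
# any other index would be left untouched.
import pandas as pd

df = pd.DataFrame({'x': [10, 20, 30]}, index=pd.Index([1, 2, 3]))
print(_maybe_reset_index(df).index.tolist())  # -> [0, 1, 2]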
def test__reduce_dict():
    data = OrderedDict(zip(list(product('mf', 'oy', 'wn')), [1] * 8))
    eq(_reduce_dict(data, ('m',)), 4)
    eq(_reduce_dict(data, ('m', 'o')), 2)
    eq(_reduce_dict(data, ('m', 'o', 'w')), 1)
    data = OrderedDict(zip(list(product('mf', 'oy', 'wn')), lrange(8)))
    eq(_reduce_dict(data, ('m',)), 6)
    eq(_reduce_dict(data, ('m', 'o')), 1)
    eq(_reduce_dict(data, ('m', 'o', 'w')), 0)
def variables(self):
    """
    Returns a list of the dataset's StataVariables objects.
    """
    return lmap(_StataVariable, zip(lrange(self._header['nvar']),
                                    self._header['typlist'],
                                    self._header['varlist'],
                                    self._header['srtlist'],
                                    self._header['fmtlist'],
                                    self._header['lbllist'],
                                    self._header['vlblist']))
def __iter__(self):
    n = self.n
    p = self.p
    comb = combinations(lrange(n), p)
    for idx in comb:
        test_index = np.zeros(n, dtype=np.bool)
        test_index[np.array(idx)] = True
        train_index = np.logical_not(test_index)
        yield train_index, test_index
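# Standalone sketch of the combinatorial split logic above (assumption: this
# __iter__ belongs to a leave-p-out style cross-validation class whose n is
# the number of observations and p the test-set size). Every size-p subset
# of the indices becomes the test mask exactly once.
import numpy as np
from itertools import combinations

n, p = 4, 2
for idx in combinations(range(n), p):
    test_index = np.zeros(n, dtype=bool)
    test_index[np.array(idx)] = True
    train_index = np.logical_not(test_index)
    # e.g. idx == (0, 1) gives test [T, T, F, F] and train [F, F, T, T]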
def date_range_str(start, end=None, length=None):
    """
    Returns a list of abbreviated date strings.

    Parameters
    ----------
    start : str
        The first abbreviated date, for instance, '1965q1' or '1965m1'
    end : str, optional
        The last abbreviated date if length is None.
    length : int, optional
        The length of the returned array if end is None.

    Returns
    -------
    date_range : list
        List of strings
    """
    flags = re.IGNORECASE | re.VERBOSE
    #_check_range_inputs(end, length, freq)
    start = start.lower()
    if re.search(_m_pattern, start, flags):
        annual_freq = 12
        split = 'm'
    elif re.search(_q_pattern, start, flags):
        annual_freq = 4
        split = 'q'
    elif re.search(_y_pattern, start, flags):
        annual_freq = 1
        start += 'a1'  # hack
        if end:
            end += 'a1'
        split = 'a'
    else:
        raise ValueError("Date %s not understood" % start)
    yr1, offset1 = lmap(int, start.replace(":", "").split(split))
    if end is not None:
        end = end.lower()
        yr2, offset2 = lmap(int, end.replace(":", "").split(split))
        length = (yr2 - yr1) * annual_freq + offset2
    elif length:
        yr2 = yr1 + length // annual_freq
        offset2 = length % annual_freq + (offset1 - 1)
    years = np.repeat(lrange(yr1 + 1, yr2), annual_freq).tolist()
    years = np.r_[[str(yr1)] * (annual_freq + 1 - offset1), years]  # tack on first year
    years = np.r_[years, [str(yr2)] * offset2]  # tack on last year
    if split != 'a':
        offset = np.tile(np.arange(1, annual_freq + 1), yr2 - yr1 - 1)
        offset = np.r_[np.arange(offset1, annual_freq + 1).astype('a2'),
                       offset]
        offset = np.r_[offset, np.arange(1, offset2 + 1).astype('a2')]
        date_arr_range = [''.join([i, split, asstr(j)])
                          for i, j in zip(years, offset)]
    else:
        date_arr_range = years.tolist()
    return date_arr_range
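# Expected behavior sketch for date_range_str, traced from the logic above
# (quarterly labels use annual_freq == 4, so the range spans whole quarters
# between the two endpoints inclusive):
#
#   date_range_str('1965q1', '1966q2')
#   # -> ['1965q1', '1965q2', '1965q3', '1965q4', '1966q1', '1966q2']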
def interactions(terms, order=[1, 2]):
    """
    Output all pairwise interactions of given order of a
    sequence of terms.

    The argument order is a sequence specifying which order
    of interactions should be generated -- the default
    creates main effects and two-way interactions. If order
    is an integer, it is changed to range(1,order+1), so
    order=3 is equivalent to order=[1,2,3], generating
    all one, two and three-way interactions.

    If any entry of order is greater than len(terms), it is
    effectively treated as len(terms).

    >>> print interactions([Term(l) for l in ['a', 'b', 'c']])
    <formula: a*b + a*c + b*c + a + b + c>
    >>>
    >>> print interactions([Term(l) for l in ['a', 'b', 'c']],
    ...                    order=list(range(5)))
    <formula: a*b + a*b*c + a*c + b*c + a + b + c>
    >>>
    """
    l = len(terms)
    values = {}

    if np.asarray(order).shape == ():
        order = lrange(1, int(order) + 1)

    # First order
    for o in order:
        I = np.indices((l,) * (o))
        I.shape = (I.shape[0], np.product(I.shape[1:]))
        for m in range(I.shape[1]):
            # only keep combinations that have unique entries
            if (np.unique(I[:, m]).shape == I[:, m].shape and
                    np.alltrue(np.equal(np.sort(I[:, m]), I[:, m]))):
                ll = [terms[j] for j in I[:, m]]
                v = ll[0]
                for ii in range(len(ll) - 1):
                    v *= ll[ii + 1]
                values[tuple(I[:, m])] = v

    key = list(iterkeys(values))[0]
    value = values[key]
    del(values[key])

    for v in itervalues(values):
        value += v
    return value
def summary(self):
    buf = StringIO()

    rng = lrange(self.periods)
    for i in range(self.neqs):
        ppm = output.pprint_matrix(self.decomp[i], rng, self.names)

        buf.write('FEVD for %s\n' % self.names[i])
        buf.write(ppm + '\n')

    print(buf.getvalue())
def check_index(self, is_sorted=True, unique=True, index=None):
    """Sanity checks"""
    if not index:
        index = self.index
    if is_sorted:
        test = pd.DataFrame(lrange(len(index)), index=index)
        test_sorted = test.sort()
        if not test.index.equals(test_sorted.index):
            raise Exception('Data is not sorted')
    if unique:
        if len(index) != len(index.unique()):
            raise Exception('Duplicate index entries')
def cat2dummy(y, nonseq=0):
    if nonseq or (y.ndim == 2 and y.shape[1] > 1):
        ycat, uniques, unitransl = convertlabels(y, lrange(y.shape[1]))
    else:
        ycat = y.copy()
        ymin = y.min()
        uniques = np.arange(ymin, y.max() + 1)
    if ycat.ndim == 1:
        ycat = ycat[:, np.newaxis]
    # this builds matrix nobs*ncat
    dummy = (ycat == uniques).astype(int)
    return dummy
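# The broadcasting trick cat2dummy relies on, in plain numpy (no statsmodels
# helpers needed): comparing an (nobs, 1) column of codes against an (ncat,)
# row of category values yields the one-hot indicator matrix.
import numpy as np

y = np.array([0, 1, 2, 1])
uniques = np.arange(y.min(), y.max() + 1)
dummy = (y[:, np.newaxis] == uniques).astype(int)
# dummy -> [[1, 0, 0],
#           [0, 1, 0],
#           [0, 0, 1],
#           [0, 1, 0]]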
def _influence_plot(results, influence, external=True, alpha=.05,
                    criterion="cooks", size=48, plot_alpha=.75, ax=None,
                    **kwargs):
    infl = influence
    fig, ax = utils.create_mpl_ax(ax)

    if criterion.lower().startswith('coo'):
        psize = infl.cooks_distance[0]
    elif criterion.lower().startswith('dff'):
        psize = np.abs(infl.dffits[0])
    else:
        raise ValueError("Criterion %s not understood" % criterion)

    # scale the variables
    # TODO: what is the correct scaling and the assumption here?
    # we want plots to be comparable across different plots
    # so we would need to use the expected distribution of criterion probably
    old_range = np.ptp(psize)
    new_range = size**2 - 8**2
    psize = (psize - psize.min()) * new_range / old_range + 8**2

    leverage = infl.hat_matrix_diag
    if external:
        resids = infl.resid_studentized_external
    else:
        resids = infl.resid_studentized

    from scipy import stats

    cutoff = stats.t.ppf(1. - alpha / 2, results.df_resid)
    large_resid = np.abs(resids) > cutoff
    large_leverage = leverage > _high_leverage(results)
    large_points = np.logical_or(large_resid, large_leverage)

    ax.scatter(leverage, resids, s=psize, alpha=plot_alpha)

    # add point labels
    labels = results.model.data.row_labels
    if labels is None:
        labels = lrange(len(resids))
    ax = utils.annotate_axes(np.where(large_points)[0], labels,
                             lzip(leverage, resids),
                             lzip(-(psize / 2)**.5, (psize / 2)**.5),
                             "x-large", ax)

    # TODO: make configurable or let people do it ex-post?
    font = {"fontsize": 16, "color": "black"}
    ax.set_ylabel("Studentized Residuals", **font)
    ax.set_xlabel("H Leverage", **font)
    ax.set_title("Influence Plot", **font)
    return fig
def detrend(x, order=1, axis=0):
    '''detrend an array with a trend of given order along axis 0 or 1

    Parameters
    ----------
    x : array_like, 1d or 2d
        data, if 2d, then each row or column is independently detrended with
        the same trendorder, but independent trend estimates
    order : int
        specifies the polynomial order of the trend, zero is constant, one is
        linear trend, two is quadratic trend
    axis : int
        for detrending with order > 0, axis can be either 0 observations by
        rows, or 1, observations by columns

    Returns
    -------
    detrended data series : ndarray
        The detrended series is the residual of the linear regression of the
        data on the trend of given order.
    '''
    x = np.asarray(x)
    nobs = x.shape[0]
    if order == 0:
        # demean; expand_dims keeps the reduced axis so the mean broadcasts
        return x - np.expand_dims(x.mean(axis), axis)
    else:
        if x.ndim == 2 and lrange(2)[axis] == 1:
            x = x.T
        elif x.ndim > 2:
            raise NotImplementedError(
                'x.ndim>2 is not implemented until it is needed')
        # could use a polynomial, but this should work also with 2d x, but
        # maybe not yet
        trends = np.vander(np.arange(nobs).astype(float), N=order + 1)
        beta = np.linalg.lstsq(trends, x)[0]
        resid = x - np.dot(trends, beta)
        if x.ndim == 2 and lrange(2)[axis] == 1:
            resid = resid.T
        return resid
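# Usage sketch for detrend (assumes the function above is importable along
# with numpy): removing a first-order trend from an exactly linear series
# leaves numerically zero residuals.
import numpy as np

series = 3.0 + 0.5 * np.arange(20)
resid = detrend(series, order=1)
assert np.allclose(resid, 0.0)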
def simulate(self):

    group_effect_var = self.dep_params[0]

    vcomp = self.dep_params[1:]
    vcomp.append(0)

    endog, exog, group, id_matrix = [], [], [], []

    for i in range(self.ngroups):

        iterators = [lrange(n) for n in self.nest_sizes]

        # The random effects
        variances = [np.sqrt(v) * np.random.normal(size=n)
                     for v, n in zip(vcomp, self.nest_sizes)]

        gpe = np.random.normal() * np.sqrt(group_effect_var)

        nest_all = []
        for j in self.nest_sizes:
            nest_all.append(set())

        for nest in product(*iterators):

            group.append(i)

            # The sum of all random effects that apply to this
            # unit
            ref = gpe + sum([v[j] for v, j in zip(variances, nest)])

            exog1 = np.random.normal(size=5)
            exog1[0] = 1
            exog.append(exog1)

            error = ref + self.error_sd * np.random.normal()

            endog1 = np.dot(exog1, self.params) + error
            endog.append(endog1)

            for j in range(len(nest)):
                nest_all[j].add(tuple(nest[0:j + 1]))

            nest1 = [len(x) - 1 for x in nest_all]
            id_matrix.append(nest1[0:-1])

    self.exog = np.array(exog)
    self.endog = np.array(endog)
    self.group = np.array(group)
    self.id_matrix = np.array(id_matrix)
    self.time = np.zeros_like(self.endog)
def coef_restriction_diffbase(n_coeffs, n_vars=None, position=0, base_idx=0):

    reduced = -np.eye(n_coeffs)  # make all rows, drop one row later
    reduced[:, base_idx] = 1

    keep = lrange(n_coeffs)
    del keep[base_idx]
    reduced = np.take(reduced, keep, axis=0)

    if n_vars is None:
        return reduced
    else:
        full = np.zeros((n_coeffs - 1, n_vars))
        full[:, position:position + n_coeffs] = reduced
        return full
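# Worked example for coef_restriction_diffbase: with n_coeffs=3 and the
# default base_idx=0, R @ b stacks the contrasts (b0 - b1) and (b0 - b2).
import numpy as np

R = coef_restriction_diffbase(3)
# R -> [[ 1., -1.,  0.],
#       [ 1.,  0., -1.]]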
def variables(self):
    """
    Returns a list of the dataset's StataVariables objects.
    """
    return lmap(
        _StataVariable,
        zip(
            lrange(self._header["nvar"]),
            self._header["typlist"],
            self._header["varlist"],
            self._header["srtlist"],
            self._header["fmtlist"],
            self._header["lbllist"],
            self._header["vlblist"],
        ),
    )
def _next(self):
    typlist = self._header['typlist']
    if self._has_string_data:
        data = [None] * self._header['nvar']
        for i in range(len(data)):
            if isinstance(typlist[i], int):
                data[i] = self._null_terminate(self._file.read(typlist[i]),
                                               self._encoding)
            else:
                data[i] = self._unpack(typlist[i],
                                       self._file.read(self._col_size(i)))
        return data
    else:
        return lmap(lambda i: self._unpack(typlist[i],
                                           self._file.read(self._col_size(i))),
                    lrange(self._header['nvar']))
def __init__(self, endog, exog, r_matrix=None, q_matrix=None,
             sigma_prior=None, sigma=None):
    super(TheilGLS, self).__init__(endog, exog, sigma=sigma)

    if r_matrix is not None:
        r_matrix = np.asarray(r_matrix)
    else:
        try:
            const_idx = self.data.const_idx
        except AttributeError:
            const_idx = None

        k_exog = exog.shape[1]
        r_matrix = np.eye(k_exog)
        if const_idx is not None:
            keep_idx = lrange(k_exog)
            del keep_idx[const_idx]
            r_matrix = r_matrix[keep_idx]  # delete row for constant

    k_constraints, k_exog = r_matrix.shape
    self.r_matrix = r_matrix
    if k_exog != self.exog.shape[1]:
        raise ValueError('r_matrix needs to have the same number of columns '
                         'as exog')

    if q_matrix is not None:
        self.q_matrix = atleast_2dcols(q_matrix)
    else:
        self.q_matrix = np.zeros(k_constraints)[:, None]
    if self.q_matrix.shape != (k_constraints, 1):
        raise ValueError('q_matrix has wrong shape')

    if sigma_prior is not None:
        sigma_prior = np.asarray(sigma_prior)
        if np.size(sigma_prior) == 1:
            sigma_prior = np.diag(sigma_prior * np.ones(k_constraints))
            # no numerical shortcuts are used for this case
        elif sigma_prior.ndim == 1:
            sigma_prior = np.diag(sigma_prior)
    else:
        sigma_prior = np.eye(k_constraints)

    if sigma_prior.shape != (k_constraints, k_constraints):
        raise ValueError('sigma_prior has wrong shape')

    self.sigma_prior = sigma_prior
    self.sigma_prior_inv = np.linalg.pinv(sigma_prior)  # or inv
def _next(self):
    typlist = self._header['typlist']
    if self._has_string_data:
        data = [None] * self._header['nvar']
        for i in range(len(data)):
            if isinstance(typlist[i], int):
                data[i] = self._null_terminate(self._file.read(typlist[i]),
                                               self._encoding)
            else:
                data[i] = self._unpack(typlist[i],
                                       self._file.read(self._col_size(i)))
        return data
    else:
        return lmap(
            lambda i: self._unpack(typlist[i],
                                   self._file.read(self._col_size(i))),
            lrange(self._header['nvar']))
def _normalize_data(data, index):
    """normalize the data to a dict with tuples of strings as keys
    right now it works with:

        0 - dictionary (or equivalent mappable)
        1 - pandas.Series with simple or hierarchical indexes
        2 - numpy.ndarrays
        3 - everything that can be converted to a numpy array
        4 - pandas.DataFrame (via the _normalize_dataframe function)
    """
    # if data is a dataframe we need to take a completely new road
    # before coming back here. Use the hasattr to avoid importing
    # pandas explicitly
    if hasattr(data, 'pivot') and hasattr(data, 'groupby'):
        data = _normalize_dataframe(data, index)
        index = None
    # can it be used as a dictionary?
    try:
        items = list(iteritems(data))
    except AttributeError:
        # ok, I cannot use the data as a dictionary
        # Try to convert it to a numpy array, or die trying
        data = np.asarray(data)
        temp = OrderedDict()
        for idx in np.ndindex(data.shape):
            name = tuple(i for i in idx)
            temp[name] = data[idx]
        data = temp
        items = list(iteritems(data))
    # make all the keys a tuple, even if simple numbers
    data = OrderedDict([_tuplify(k), v] for k, v in items)
    categories_levels = _categories_level(list(iterkeys(data)))

    # fill the void in the counting dictionary
    indexes = product(*categories_levels)
    contingency = OrderedDict([(k, data.get(k, 0)) for k in indexes])
    data = contingency

    # reorder the keys order according to the one specified by the user
    # or if the index is None convert it into a simple list
    # right now it doesn't do any check, but can be modified in the future
    index = lrange(len(categories_levels)) if index is None else index
    contingency = OrderedDict()
    for key, value in iteritems(data):
        new_key = tuple(key[i] for i in index)
        contingency[new_key] = value
    data = contingency
    return data
def plot_leverage_resid2(results, alpha=.05, label_kwargs={}, ax=None,
                         **kwargs):
    """
    Plots leverage statistics vs. normalized residuals squared

    Parameters
    ----------
    results : results instance
        A regression results instance
    alpha : float
        Specifies the cut-off for large-standardized residuals. Residuals
        are assumed to be distributed N(0, 1) with alpha=alpha.
    label_kwargs : dict
        The keywords to pass to annotate for the labels.
    ax : Axes instance
        Matplotlib Axes instance

    Returns
    -------
    fig : matplotlib Figure
        A matplotlib figure instance.
    """
    from scipy.stats import zscore, norm
    fig, ax = utils.create_mpl_ax(ax)

    infl = results.get_influence()
    leverage = infl.hat_matrix_diag
    resid = zscore(results.resid)
    ax.plot(resid**2, leverage, 'o', **kwargs)
    ax.set_xlabel("Normalized residuals**2")
    ax.set_ylabel("Leverage")
    ax.set_title("Leverage vs. Normalized residuals squared")

    large_leverage = leverage > _high_leverage(results)
    # norm or t here if standardized?
    cutoff = norm.ppf(1. - alpha / 2)
    large_resid = np.abs(resid) > cutoff
    labels = results.model.data.row_labels
    if labels is None:
        labels = lrange(int(results.nobs))
    index = np.where(np.logical_or(large_leverage, large_resid))[0]
    ax = utils.annotate_axes(index, labels, lzip(resid**2, leverage),
                             [(0, 5)] * int(results.nobs), "large",
                             ax=ax, ha="center", va="bottom")
    ax.margins(.075, .075)
    return fig
def setup_class(cls):
    # DGP: simple polynomial
    order = 3
    nobs = 200
    lb, ub = -3.5, 3
    x1 = np.linspace(lb, ub, nobs)
    x2 = np.sin(2 * x1)
    x = np.column_stack((x1 / x1.max() * 1, 1. * x2))
    exog = (x[:, :, None]**np.arange(order + 1)[None, None, :]).reshape(nobs, -1)
    idx = lrange((order + 1) * 2)
    del idx[order + 1]
    exog_reduced = exog[:, idx]  # remove duplicate constant

    y_true = exog.sum(1)  # / 4.
    # z = y_true  # alias check
    # d = x

    cls.nobs = nobs
    cls.y_true, cls.x, cls.exog = y_true, x, exog_reduced
def dummy(self, drop_idx=None, sparse=False, dtype=int):
    """
    drop_idx is only available if sparse=False

    drop_idx is supposed to index into uni
    """
    uni = self.uni
    if drop_idx is not None:
        idx = lrange(len(uni))
        del idx[drop_idx]
        uni = uni[idx]

    group = self.group

    if not sparse:
        return (group[:, None] == uni[None, :]).astype(dtype)
    else:
        return dummy_sparse(self.group_int)
def plot_leverage_resid2(results, alpha=.05, ax=None, **kwargs):
    """
    Plots leverage statistics vs. normalized residuals squared

    Parameters
    ----------
    results : results instance
        A regression results instance
    alpha : float
        Specifies the cut-off for large-standardized residuals. Residuals
        are assumed to be distributed N(0, 1) with alpha=alpha.
    ax : Axes instance
        Matplotlib Axes instance

    Returns
    -------
    fig : matplotlib Figure
        A matplotlib figure instance.
    """
    from scipy.stats import zscore, norm
    fig, ax = utils.create_mpl_ax(ax)

    infl = results.get_influence()
    leverage = infl.hat_matrix_diag
    resid = zscore(results.resid)
    ax.plot(resid**2, leverage, 'o', **kwargs)
    ax.set_xlabel("Normalized residuals**2")
    ax.set_ylabel("Leverage")
    ax.set_title("Leverage vs. Normalized residuals squared")

    large_leverage = leverage > _high_leverage(results)
    # norm or t here if standardized?
    cutoff = norm.ppf(1. - alpha / 2)
    large_resid = np.abs(resid) > cutoff
    labels = results.model.data.row_labels
    if labels is None:
        labels = lrange(int(results.nobs))
    index = np.where(np.logical_or(large_leverage, large_resid))[0]
    ax = utils.annotate_axes(index, labels, lzip(resid**2, leverage),
                             [(0, 5)] * int(results.nobs), "large",
                             ax=ax, ha="center", va="bottom")
    ax.margins(.075, .075)
    return fig
def main_effect(self, reference=None):
    """
    Return the 'main effect' columns of a factor, choosing
    an optional reference key.

    The reference key can be one of the keys of the Factor,
    or an integer, representing which column to remove. It
    defaults to 0.
    """
    names = self.names()

    if reference is None:
        reference = 0
    else:
        try:
            reference = self.keys.index(reference)
        except ValueError:
            reference = int(reference)

    def maineffect_func(value, reference=reference):
        rvalue = []
        keep = lrange(value.shape[0])
        keep.pop(reference)
        for i in range(len(keep)):
            rvalue.append(value[keep[i]] - value[reference])
        return np.array(rvalue)

    keep = lrange(len(self.names()))
    keep.pop(reference)
    __names = self.names()
    _names = ['%s-%s' % (__names[keep[i]], __names[reference])
              for i in range(len(keep))]
    value = Quantitative(_names, func=self,
                         termname='%s:maineffect' % self.termname,
                         transform=maineffect_func)
    value.namespace = self.namespace
    return value
def setup_class(cls):
    # DGP: simple polynomial
    order = 3
    nobs = 200
    lb, ub = -3.5, 3
    x1 = np.linspace(lb, ub, nobs)
    x2 = np.sin(2 * x1)
    x = np.column_stack((x1 / x1.max() * 1, 1. * x2))
    exog = (x[:, :, None]**np.arange(order + 1)[None, None, :]).reshape(
        nobs, -1)
    idx = lrange((order + 1) * 2)
    del idx[order + 1]
    exog_reduced = exog[:, idx]  # remove duplicate constant

    y_true = exog.sum(1)  # / 4.
    # z = y_true  # alias check
    # d = x

    cls.nobs = nobs
    cls.y_true, cls.x, cls.exog = y_true, x, exog_reduced
def test_causality(self):
    causedby = self.ref.causality['causedby']

    for i, name in enumerate(self.names):
        variables = self.names[:i] + self.names[i + 1:]
        result = self.res.test_causality(name, variables, kind='f')
        assert_almost_equal(result['pvalue'], causedby[i], DECIMAL_4)

        rng = lrange(self.k)
        rng.remove(i)
        result2 = self.res.test_causality(i, rng, kind='f')
        assert_almost_equal(result['pvalue'], result2['pvalue'], DECIMAL_12)

        # make sure works
        result = self.res.test_causality(name, variables, kind='wald')

    # corner cases
    _ = self.res.test_causality(self.names[0], self.names[1])
    _ = self.res.test_causality(0, 1)

    assert_raises(Exception, self.res.test_causality, 0, 1, kind='foo')
def test_causality(self):
    causedby = self.ref.causality["causedby"]

    for i, name in enumerate(self.names):
        variables = self.names[:i] + self.names[i + 1:]
        result = self.res.test_causality(name, variables, kind="f")
        assert_almost_equal(result.pvalue, causedby[i], DECIMAL_4)

        rng = lrange(self.k)
        rng.remove(i)
        result2 = self.res.test_causality(i, rng, kind="f")
        assert_almost_equal(result.pvalue, result2.pvalue, DECIMAL_12)

        # make sure works
        result = self.res.test_causality(name, variables, kind="wald")

    # corner cases
    _ = self.res.test_causality(self.names[0], self.names[1])
    _ = self.res.test_causality(0, 1)

    with pytest.raises(Exception):
        self.res.test_causality(0, 1, kind="foo")
def maybe_name_or_idx(idx, model):
    """
    Give a name or an integer and return the name and integer location of the
    column in a design matrix.
    """
    if idx is None:
        idx = lrange(model.exog.shape[1])
    if isinstance(idx, int):
        exog_name = model.exog_names[idx]
        exog_idx = idx
    # anticipate index as list and recurse
    elif isinstance(idx, (tuple, list)):
        exog_name = []
        exog_idx = []
        for item in idx:
            exog_name_item, exog_idx_item = maybe_name_or_idx(item, model)
            exog_name.append(exog_name_item)
            exog_idx.append(exog_idx_item)
    else:  # assume we've got a string variable
        exog_name = idx
        exog_idx = model.exog_names.index(idx)
    return exog_name, exog_idx
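# Usage sketch for maybe_name_or_idx with a hypothetical stand-in model that
# stubs only the two attributes the helper actually touches (exog and
# exog_names).
import numpy as np

class _FakeModel(object):
    exog = np.ones((5, 3))
    exog_names = ['const', 'x1', 'x2']

print(maybe_name_or_idx('x1', _FakeModel))       # -> ('x1', 1)
print(maybe_name_or_idx(2, _FakeModel))          # -> ('x2', 2)
print(maybe_name_or_idx(['x1', 2], _FakeModel))  # -> (['x1', 'x2'], [1, 2])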
def test_pickle():
    tmpdir = tempfile.mkdtemp(prefix='pickle')
    a = lrange(10)
    save_pickle(a, tmpdir + '/res.pkl')
    b = load_pickle(tmpdir + '/res.pkl')
    assert_equal(a, b)

    # cleanup, tested on Windows
    try:
        import os
        os.remove(tmpdir + '/res.pkl')
        os.rmdir(tmpdir)
    except (OSError, IOError):
        pass
    assert not os.path.exists(tmpdir)

    # test with file handle
    fh = BytesIO()
    save_pickle(a, fh)
    fh.seek(0, 0)
    c = load_pickle(fh)
    fh.close()
    assert_equal(a, c)
def get_ic_table(ics, selected_orders):
    '''
    Convert lag-order selection results into a formatted table.

    :param ics: information-criterion values per lag order
    :param selected_orders: the selected (minimum-IC) lag order for each
        criterion
    :return: the lag-order selection results as a formatted table string
    '''
    _default_table_fmt = dict(empty_cell='',
                              colsep=' ',
                              row_pre='',
                              row_post='',
                              table_dec_above='=',
                              table_dec_below='=',
                              header_dec_below='-',
                              header_fmt='%s',
                              stub_fmt='%s',
                              title_align='c',
                              header_align='r',
                              data_aligns='r',
                              stubs_align='l',
                              fmt='txt')
    cols = sorted(ics)
    data = np.array([["%#10.4g" % v for v in ics[c]] for c in cols],
                    dtype=object).T
    for i, col in enumerate(cols):
        idx = int(selected_orders[col]), i
        data[idx] = data[idx] + '*'
    fmt = dict(_default_table_fmt, data_fmts=("%s",) * len(cols))
    buf = StringIO()
    table = SimpleTable(data, cols, lrange(len(data)),
                        title='VAR Order Selection', txt_fmt=fmt)
    buf.write(str(table) + '\n')
    buf.write('* Minimum' + '\n')
    return buf.getvalue()
def simulate(self):

    endog, exog, group, time = [], [], [], []

    for i in range(self.ngroups):

        gsize = np.random.randint(self.group_size_range[0],
                                  self.group_size_range[1])

        group.append([i, ] * gsize)

        time1 = np.random.normal(size=(gsize, 2))
        time.append(time1)

        exog1 = np.random.normal(size=(gsize, len(self.params[0])))
        exog.append(exog1)

        # Probabilities for each outcome
        prob = [np.exp(np.dot(exog1, p)) for p in self.params]
        prob = np.vstack(prob).T
        prob /= prob.sum(1)[:, None]

        m = len(self.params)
        endog1 = []
        for k in range(gsize):
            pdist = stats.rv_discrete(values=(lrange(m), prob[k, :]))
            endog1.append(pdist.rvs())
        endog.append(np.asarray(endog1))

    self.exog = np.concatenate(exog, axis=0)
    self.endog = np.concatenate(endog).astype(np.int32)
    self.time = np.concatenate(time, axis=0)
    self.group = np.concatenate(group)
    self.offset = np.zeros(len(self.endog), dtype=np.float64)
def _make_predict_dates(self):
    data = self.data
    dtstart = data.predict_start
    dtend = data.predict_end
    freq = data.freq

    if freq is not None:
        pandas_freq = _freq_to_pandas[freq]
        # preserve PeriodIndex or DatetimeIndex
        dates = self.data.dates.__class__(start=dtstart,
                                          end=dtend,
                                          freq=pandas_freq)

        if pandas_freq.freqstr == 'N':
            _dtend = dtend
            if isinstance(dates[-1], Period):
                _dtend = pd.to_datetime(_dtend).to_period(dates.freq)
            if not dates[-1] == _dtend:
                # TODO: this is a hack because a DatetimeIndex with
                # nanosecond frequency does not include "end"
                dtend = Timestamp(dtend.value + 1)
                dates = self.data.dates.__class__(start=dtstart,
                                                  end=dtend,
                                                  freq=pandas_freq)
    # handle
    elif freq is None and (isinstance(dtstart, (int, long)) and
                           isinstance(dtend, (int, long))):
        from pandas import Index
        dates = Index(lrange(dtstart, dtend + 1))
    # if freq is None and dtstart and dtend aren't integers, we're
    # in sample
    else:
        dates = self.data.dates
        start = self._get_dates_loc(dates, dtstart)
        end = self._get_dates_loc(dates, dtend)
        dates = dates[start:end + 1]  # is this index inclusive?
    self.data.predict_dates = dates
def __init__(self, model, P=None, periods=None):
    self.periods = periods

    self.model = model
    self.neqs = model.neqs
    self.names = model.model.endog_names

    self.irfobj = model.irf(var_decomp=P, periods=periods)
    self.orth_irfs = self.irfobj.orth_irfs

    # cumulative impulse responses
    irfs = (self.orth_irfs[:periods]**2).cumsum(axis=0)

    rng = lrange(self.neqs)
    mse = self.model.mse(periods)[:, rng, rng]

    # lag x equation x component
    fevd = np.empty_like(irfs)

    for i in range(periods):
        fevd[i] = (irfs[i].T / mse[i]).T

    # switch to equation x lag x component
    self.decomp = fevd.swapaxes(0, 1)
def summary_col(results, float_format='%.4f', model_names=(), stars=False,
                info_dict=None, regressor_order=(), drop_omitted=False):
    """
    Summarize multiple results instances side-by-side (coefs and SEs)

    Parameters
    ----------
    results : statsmodels results instance or list of result instances
    float_format : str, optional
        float format for coefficients and standard errors
        Default : '%.4f'
    model_names : list[str], optional
        Must have same length as the number of results. If the names are not
        unique, a roman number will be appended to all model names
    stars : bool
        print significance stars
    info_dict : dict
        dict of functions to be applied to results instances to retrieve
        model info. To use specific information for different models, add a
        (nested) info_dict with model name as the key.
        Example: `info_dict = {"N":..., "R2": ..., "OLS":{"R2":...}}` would
        only show `R2` for OLS regression models, but additionally `N` for
        all other results.
        Default : None (use the info_dict specified in
        result.default_model_infos, if this property exists)
    regressor_order : list[str], optional
        list of names of the regressors in the desired order. All regressors
        not specified will be appended to the end of the list.
    drop_omitted : bool, optional
        Includes regressors that are not specified in regressor_order. If
        False, regressors not specified will be appended to end of the list.
        If True, only regressors in regressor_order will be included.
    """
    if not isinstance(results, list):
        results = [results]

    cols = [_col_params(x, stars=stars, float_format=float_format)
            for x in results]

    # Unique column names (pandas has problems merging otherwise)
    if model_names:
        colnames = _make_unique(model_names)
    else:
        colnames = _make_unique([x.columns[0] for x in cols])
    for i in range(len(cols)):
        cols[i].columns = [colnames[i]]

    def merg(x, y):
        return x.merge(y, how='outer', right_index=True, left_index=True)

    summ = reduce(merg, cols)

    if regressor_order:
        varnames = summ.index.get_level_values(0).tolist()
        ordered = [x for x in regressor_order if x in varnames]
        unordered = [x for x in varnames if x not in regressor_order + ['']]
        order = ordered + list(np.unique(unordered))

        def f(idx):
            return sum([[x + 'coef', x + 'stde'] for x in idx], [])

        summ.index = f(pd.unique(varnames))
        summ = summ.reindex(f(order))
        summ.index = [x[:-4] for x in summ.index]
        if drop_omitted:
            summ = summ.loc[regressor_order]

    idx = pd.Series(lrange(summ.shape[0])) % 2 == 1
    summ.index = np.where(idx, '', summ.index.get_level_values(0))

    # add infos about the models.
    if info_dict:
        cols = [_col_info(x, info_dict.get(x.model.__class__.__name__,
                                           info_dict)) for x in results]
    else:
        cols = [_col_info(x, getattr(x, "default_model_infos", None))
                for x in results]
    # use unique column names, otherwise the merge will not succeed
    for df, name in zip(cols, _make_unique([df.columns[0] for df in cols])):
        df.columns = [name]

    def merg(x, y):
        return x.merge(y, how='outer', right_index=True, left_index=True)

    info = reduce(merg, cols)
    dat = pd.DataFrame(np.vstack([summ, info]))  # pd.concat better, but error
    dat.columns = summ.columns
    dat.index = pd.Index(summ.index.tolist() + info.index.tolist())
    summ = dat

    summ = summ.fillna('')

    smry = Summary()
    smry._merge_latex = True
    smry.add_df(summ, header=True, align='l')
    smry.add_text('Standard errors in parentheses.')
    if stars:
        smry.add_text('* p<.1, ** p<.05, ***p<.01')
    return smry
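# Usage sketch for summary_col: two toy OLS fits shown side by side. The
# data here is made up for illustration; statsmodels' public version of this
# function lives in statsmodels.iolib.summary2.
import numpy as np
import statsmodels.api as sm

np.random.seed(0)
exog = sm.add_constant(np.random.randn(100, 2))
y1 = exog @ np.array([1.0, 0.5, -0.3]) + np.random.randn(100)
y2 = exog @ np.array([0.2, 1.5, 0.0]) + np.random.randn(100)
res1, res2 = sm.OLS(y1, exog).fit(), sm.OLS(y2, exog).fit()
print(summary_col([res1, res2], stars=True, model_names=['m1', 'm2']))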
import numpy as np
from scipy import stats

from statsmodels.compat.python import lrange
from statsmodels.sandbox.tools.mctools import StatTestMC
from statsmodels.sandbox.stats.diagnostic import (acorr_ljungbox,
                                                  unitroot_adf)


def normalnoisesim(nobs=500, loc=0.0):
    return (loc + np.random.randn(nobs))


def lb(x):
    s, p = acorr_ljungbox(x, lags=4)
    return np.r_[s, p]


mc1 = StatTestMC(normalnoisesim, lb)
mc1.run(5000, statindices=lrange(4))

print(mc1.summary_quantiles([1, 2, 3], stats.chi2([2, 3, 4]).ppf,
                            varnames=['lag 1', 'lag 2', 'lag 3'],
                            title='acorr_ljungbox'))
print('\n\n')

frac = [0.01, 0.025, 0.05, 0.1, 0.975]
crit = stats.chi2([2, 3, 4]).ppf(np.atleast_2d(frac).T)
print(mc1.summary_cdf([1, 2, 3], frac, crit,
                      varnames=['lag 1', 'lag 2', 'lag 3'],
                      title='acorr_ljungbox'))
print(mc1.cdf(crit, [1, 2, 3])[1])

#----------------------
def lb4(x):
    s, p = acorr_ljungbox(x, lags=4)
    return s[-1], p[-1]


def lb1(x):
    s, p = acorr_ljungbox(x, lags=1)
    return s[0], p[0]


def lb(x):
    s, p = acorr_ljungbox(x, lags=4)
    return np.r_[s, p]


print('Results with MC class')
mc1 = StatTestMC(normalnoisesim, lb)
mc1.run(10000, statindices=lrange(8))

print(mc1.histogram(1, critval=[0.01, 0.025, 0.05, 0.1, 0.975]))
print(mc1.quantiles(1))
print(mc1.quantiles(0))
print(mc1.histogram(0))

#print(mc1.summary_quantiles([1], stats.chi2([2]).ppf, title='acorr_ljungbox'))
print(mc1.summary_quantiles([1, 2, 3], stats.chi2([2, 3, 4]).ppf,
                            varnames=['lag 1', 'lag 2', 'lag 3'],
                            title='acorr_ljungbox'))
print(mc1.cdf(0.1026, 1))
print(mc1.cdf(0.7278, 3))
print(mc1.cdf(0.7278, [1, 2, 3]))
class StataReader(object):
    """
    Stata .dta file reader.

    Provides methods to return the metadata of a Stata .dta file and
    a generator for the data itself.

    Parameters
    ----------
    file : file-like
        A file-like object representing a Stata .dta file.
    missing_values : bool
        If missing_values is True, parse missing_values and return a
        Missing Values object instead of None.
    encoding : string, optional
        Used for Python 3 only. Encoding to use when reading the .dta file.
        Defaults to `locale.getpreferredencoding`

    See Also
    --------
    statsmodels.iolib.foreign.genfromdta
    pandas.read_stata
    pandas.io.stata.StataReader

    Notes
    -----
    This is known only to work on file formats 113 (Stata 8/9), 114
    (Stata 10/11), and 115 (Stata 12). Needs to be tested on older versions.
    Known not to work on format 104, 108. If you have the documentation for
    older formats, please contact the developers.

    For more information about the .dta format see
    http://www.stata.com/help.cgi?dta
    http://www.stata.com/help.cgi?dta_113
    """

    _header = {}
    _data_location = 0
    _col_sizes = ()
    _has_string_data = False
    _missing_values = False

    # type          code
    # --------------------
    # str1        1 = 0x01
    # str2        2 = 0x02
    # ...
    # str244    244 = 0xf4
    # byte      251 = 0xfb  (sic)
    # int       252 = 0xfc
    # long      253 = 0xfd
    # float     254 = 0xfe
    # double    255 = 0xff
    # --------------------
    # NOTE: the byte type seems to be reserved for categorical variables
    # with a label, but the underlying variable is -127 to 100
    # we're going to drop the label and cast to int
    DTYPE_MAP = dict(lzip(lrange(1, 245),
                          ['a' + str(i) for i in range(1, 245)]) +
                     [(251, np.int16), (252, np.int32), (253, int),
                      (254, np.float32), (255, np.float64)])
    TYPE_MAP = lrange(251) + list('bhlfd')
    # NOTE: technically, some of these are wrong. there are more numbers
    # that can be represented. it's the 27 ABOVE and BELOW the max listed
    # numeric data type in [U] 12.2.2 of the 11.2 manual
    MISSING_VALUES = {'b': (-127, 100), 'h': (-32767, 32740),
                      'l': (-2147483647, 2147483620),
                      'f': (-1.701e+38, +1.701e+38),
                      'd': (-1.798e+308, +8.988e+307)}

    def __init__(self, fname, missing_values=False, encoding=None):
        warnings.warn(
            "StataReader is deprecated as of 0.10.0 and will be removed in a "
            "future version. Use pandas.read_stata or "
            "pandas.io.stata.StataReader instead.",
            FutureWarning)

        if encoding is None:
            import locale
            self._encoding = locale.getpreferredencoding()
        else:
            self._encoding = encoding
        self._missing_values = missing_values
        self._parse_header(fname)

    def file_headers(self):
        """
        Returns all .dta file headers.

        out: dict
            Has keys typlist, data_label, lbllist, varlist, nvar, filetype,
            ds_format, nobs, fmtlist, vlblist, time_stamp, srtlist, byteorder
        """
        return self._header

    def file_format(self):
        """
        Returns the file format.

        Returns
        -------
        out : int

        Notes
        -----
        Format 113: Stata 8/9
        Format 114: Stata 10/11
        Format 115: Stata 12
        """
        return self._header['ds_format']

    def file_label(self):
        """
        Returns the dataset's label.

        Returns
        -------
        out : string
        """
        return self._header['data_label']

    def file_timestamp(self):
        """
        Returns the date and time Stata recorded on last file save.

        Returns
        -------
        out : str
        """
        return self._header['time_stamp']

    def variables(self):
        """
        Returns a list of the dataset's StataVariables objects.
        """
        return lmap(_StataVariable, zip(lrange(self._header['nvar']),
                                        self._header['typlist'],
                                        self._header['varlist'],
                                        self._header['srtlist'],
                                        self._header['fmtlist'],
                                        self._header['lbllist'],
                                        self._header['vlblist']))

    def dataset(self, as_dict=False):
        """
        Returns a Python generator object for iterating over the dataset.

        Parameters
        ----------
        as_dict : bool, optional
            If as_dict is True, yield each row of observations as a dict.
            If False, yields each row of observations as a list.

        Returns
        -------
        Generator object for iterating over the dataset.  Yields each row of
        observations as a list by default.

        Notes
        -----
        If missing_values is True during instantiation of StataReader then
        observations with StataMissingValue(s) are not filtered and should
        be handled by your application.
        """
        try:
            self._file.seek(self._data_location)
        except Exception:
            pass

        if as_dict:
            vars = lmap(str, self.variables())
            for i in range(len(self)):
                yield dict(zip(vars, self._next()))
        else:
            for i in range(self._header['nobs']):
                yield self._next()

    ### Python special methods

    def __len__(self):
        """
        Return the number of observations in the dataset.

        This value is taken directly from the header and includes
        observations with missing values.
        """
        return self._header['nobs']

    def __getitem__(self, k):
        """
        Seek to an observation indexed k in the file and return it, ordered
        by Stata's output to the .dta file.

        k is zero-indexed.  Prefer using R.data() for performance.
        """
        if not (isinstance(k, (int, long))) or k < 0 or k > len(self) - 1:
            raise IndexError(k)
        loc = self._data_location + sum(self._col_size()) * k
        if self._file.tell() != loc:
            self._file.seek(loc)
        return self._next()

    ### Private methods

    def _null_terminate(self, s, encoding):
        if PY3:  # have bytes not strings, so must decode
            null_byte = asbytes('\x00')
            try:
                s = s.lstrip(null_byte)[:s.index(null_byte)]
            except Exception:
                pass
            return s.decode(encoding)
        else:
            null_byte = asbytes('\x00')
            try:
                return s.lstrip(null_byte)[:s.index(null_byte)]
            except Exception:
                return s

    def _parse_header(self, file_object):
        self._file = file_object
        encoding = self._encoding

        # parse headers
        self._header['ds_format'] = unpack('b', self._file.read(1))[0]

        if self._header['ds_format'] not in [113, 114, 115]:
            raise ValueError("Only file formats >= 113 (Stata >= 9)"
                             " are supported.  Got format %s.  Please report "
                             "if you think this error is incorrect."
                             % self._header['ds_format'])
        byteorder = self._header['byteorder'] = unpack(
            'b', self._file.read(1))[0] == 0x1 and '>' or '<'
        self._header['filetype'] = unpack('b', self._file.read(1))[0]
        self._file.read(1)

        nvar = self._header['nvar'] = unpack(byteorder + 'h',
                                             self._file.read(2))[0]
        self._header['nobs'] = unpack(byteorder + 'i', self._file.read(4))[0]
        self._header['data_label'] = self._null_terminate(
            self._file.read(81), encoding)
        self._header['time_stamp'] = self._null_terminate(
            self._file.read(18), encoding)

        # parse descriptors
        typlist = [ord(self._file.read(1)) for i in range(nvar)]
        self._header['typlist'] = [self.TYPE_MAP[typ] for typ in typlist]
        self._header['dtyplist'] = [self.DTYPE_MAP[typ] for typ in typlist]
        self._header['varlist'] = [
            self._null_terminate(self._file.read(33), encoding)
            for i in range(nvar)]
        self._header['srtlist'] = unpack(byteorder + ('h' * (nvar + 1)),
                                         self._file.read(2 * (nvar + 1)))[:-1]
        if self._header['ds_format'] <= 113:
            self._header['fmtlist'] = \
                [self._null_terminate(self._file.read(12), encoding)
                 for i in range(nvar)]
        else:
            self._header['fmtlist'] = \
                [self._null_terminate(self._file.read(49), encoding)
                 for i in range(nvar)]
        self._header['lbllist'] = [
            self._null_terminate(self._file.read(33), encoding)
            for i in range(nvar)]
        self._header['vlblist'] = [
            self._null_terminate(self._file.read(81), encoding)
            for i in range(nvar)]

        # ignore expansion fields
        # When reading, read five bytes; the last four bytes now tell you
        # the size of the next read, which you discard.  You then continue
        # like this until you read 5 bytes of zeros.
        while True:
            data_type = unpack(byteorder + 'b', self._file.read(1))[0]
            data_len = unpack(byteorder + 'i', self._file.read(4))[0]
            if data_type == 0:
                break
            self._file.read(data_len)

        # other state vars
        self._data_location = self._file.tell()
        self._has_string_data = len(lfilter(lambda x: isinstance(x, int),
                                            self._header['typlist'])) > 0
        self._col_size()

    def _calcsize(self, fmt):
        return isinstance(fmt, int) and fmt or \
            calcsize(self._header['byteorder'] + fmt)

    def _col_size(self, k=None):
        """Calculate size of a data record."""
        if len(self._col_sizes) == 0:
            self._col_sizes = lmap(lambda x: self._calcsize(x),
                                   self._header['typlist'])
        if k is None:
            return self._col_sizes
        else:
            return self._col_sizes[k]

    def _unpack(self, fmt, byt):
        d = unpack(self._header['byteorder'] + fmt, byt)[0]
        if fmt[-1] in self.MISSING_VALUES:
            nmin, nmax = self.MISSING_VALUES[fmt[-1]]
            if d < nmin or d > nmax:
                if self._missing_values:
                    return StataMissingValue(nmax, d)
                else:
                    return None
        return d

    def _next(self):
        typlist = self._header['typlist']
        if self._has_string_data:
            data = [None] * self._header['nvar']
            for i in range(len(data)):
                if isinstance(typlist[i], int):
                    data[i] = self._null_terminate(
                        self._file.read(typlist[i]), self._encoding)
                else:
                    data[i] = self._unpack(
                        typlist[i], self._file.read(self._col_size(i)))
            return data
        else:
            return lmap(
                lambda i: self._unpack(typlist[i],
                                       self._file.read(self._col_size(i))),
                lrange(self._header['nvar']))
def plot_partregress_grid(results, exog_idx=None, grid=None, fig=None):
    """Plot partial regression for a set of regressors.

    Parameters
    ----------
    results : results instance
        A regression model results instance
    exog_idx : None, list of ints, list of strings
        (column) indices of the exog used in the plot, default is all.
    grid : None or tuple of int (nrows, ncols)
        If grid is given, then it is used for the arrangement of the
        subplots. If grid is None, then ncol is one, if there are only 2
        subplots, and the number of columns is two otherwise.
    fig : Matplotlib figure instance, optional
        If given, this figure is simply returned.  Otherwise a new figure is
        created.

    Returns
    -------
    fig : Matplotlib figure instance
        If `fig` is None, the created figure.  Otherwise `fig` itself.

    Notes
    -----
    A subplot is created for each explanatory variable given by exog_idx.
    The partial regression plot shows the relationship between the response
    and the given explanatory variable after removing the effect of all
    other explanatory variables in exog.

    See Also
    --------
    plot_partregress : Plot partial regression for a single regressor.
    plot_ccpr : Plot CCPR against one regressor

    Examples
    --------
    Using the state crime dataset separately plot the effect of each
    variable on the outcome, murder rate, while accounting for the effect
    of all other variables in the model visualized with a grid of partial
    regression plots.

    >>> from statsmodels.graphics.regressionplots import plot_partregress_grid
    >>> import statsmodels.api as sm
    >>> import matplotlib.pyplot as plt
    >>> import statsmodels.formula.api as smf

    >>> fig = plt.figure(figsize=(8, 6))
    >>> crime_data = sm.datasets.statecrime.load_pandas()
    >>> results = smf.ols('murder ~ hs_grad + urban + poverty + single',
    ...                   data=crime_data.data).fit()
    >>> plot_partregress_grid(results, fig=fig)
    >>> plt.show()

    .. plot:: plots/graphics_regression_partregress_grid.py

    References
    ----------
    See http://www.itl.nist.gov/div898/software/dataplot/refman1/auxillar/partregr.htm
    """
    import pandas
    fig = utils.create_mpl_fig(fig)

    exog_name, exog_idx = utils.maybe_name_or_idx(exog_idx, results.model)

    # maybe add option for using wendog, wexog instead
    y = pandas.Series(results.model.endog, name=results.model.endog_names)
    exog = results.model.exog

    k_vars = exog.shape[1]
    # this function doesn't make sense if k_vars=1

    title_kwargs = {}  # default; overridden for the crowded two-column layout
    if grid is not None:
        nrows, ncols = grid
    else:
        if len(exog_idx) > 2:
            nrows = int(np.ceil(len(exog_idx) / 2.))
            ncols = 2
            title_kwargs = {"fontdict": {"fontsize": 'small'}}
        else:
            nrows = len(exog_idx)
            ncols = 1

    # for indexing purposes
    other_names = np.array(results.model.exog_names)
    for i, idx in enumerate(exog_idx):
        others = lrange(k_vars)
        others.pop(idx)
        exog_others = pandas.DataFrame(exog[:, others],
                                       columns=other_names[others])
        ax = fig.add_subplot(nrows, ncols, i + 1)
        plot_partregress(y, pandas.Series(exog[:, idx],
                                          name=other_names[idx]),
                         exog_others, ax=ax, title_kwargs=title_kwargs,
                         obs_labels=False)
        ax.set_title("")

    fig.suptitle("Partial Regression Plot", fontsize="large")
    fig.tight_layout()
    fig.subplots_adjust(top=.95)
    return fig
def summary(self, yname=None, xname=None, title=0, alpha=.05,
            returns='text', model_info=None):
    # General part of the summary table
    if title == 0:
        title = 'High Dimensional Fixed Effect Regression Results'

    if type(xname) == str:
        xname = [xname]
    if type(yname) == str:
        yname = [yname]
    if xname is not None and len(xname) != len(self.xname):
        # GH 2298
        raise ValueError('User supplied xnames must have the same number of '
                         'entries as the number of model parameters '
                         '({0})'.format(len(self.xname)))
    if yname is not None and len(yname) != len(self.yname):
        raise ValueError('User supplied ynames must have the same number of '
                         'entries as the number of model dependent variables '
                         '({0})'.format(len(self.yname)))
    if xname is None:
        xname = self.xname
    if yname is None:
        yname = self.yname

    time_now = time.localtime()
    time_of_day = [time.strftime("%H:%M:%S", time_now)]
    date = time.strftime("%a, %d %b %Y", time_now)
    nobs = int(self.nobs)
    df_model = self.df
    resid_std_err = forg(self.resid_std_err, 4)
    Covariance_Type = self.Covariance_Type
    cluster_method = self.cluster_method
    gen_left = [('Dep. Variable:', yname),
                ('No. Observations:', [nobs]),  # TODO: What happens with multiple names?
                ('DoF of residual:', [df_model]),
                ('Residual std err:', [resid_std_err]),
                ('Covariance Type:', [Covariance_Type]),
                ('Cluster Method:', [cluster_method])]
    r_squared = forg(self.rsquared, 4)
    rsquared_adj = forg(self.rsquared_adj, 4)
    full_rsquared = forg(self.full_rsquared, 4)
    full_rsquared_adj = forg(self.full_rsquared_adj, 4)
    fvalue = forg(self.fvalue, 4)
    f_pvalue = forg(self.f_pvalue, 4)
    full_fvalue = forg(self.full_fvalue, 4)
    full_f_pvalue = forg(self.full_f_pvalue, 4)
    gen_right = [('R-squared(proj model):', [r_squared]),
                 ('Adj. R-squared(proj model):', [rsquared_adj]),
                 ('R-squared(full model):', [full_rsquared]),
                 ('Adj. R-squared(full model):', [full_rsquared_adj]),
                 ('F-statistic(proj model):', [fvalue]),
                 ('Prob (F-statistic (proj model)):', [f_pvalue]),
                 ('DoF of F-test (proj model):', [self.f_df_proj]),
                 ('F-statistic(full model):', [full_fvalue]),
                 ('Prob (F-statistic (full model)):', [full_f_pvalue]),
                 ('DoF of F-test (full model):', [self.f_df_full])]

    # pad both tables to equal number of rows
    if len(gen_right) < len(gen_left):
        # fill up with blank lines to same length
        gen_right += [(' ', ' ')] * (len(gen_left) - len(gen_right))
    elif len(gen_right) > len(gen_left):
        # fill up with blank lines to same length, just to keep it symmetric
        gen_left += [(' ', ' ')] * (len(gen_right) - len(gen_left))

    gen_stubs_left, gen_data_left = zip_longest(*gen_left)
    gen_title = title
    gen_header = None
    gen_table_left = SimpleTable(gen_data_left,
                                 gen_header,
                                 gen_stubs_left,
                                 title=gen_title,
                                 txt_fmt=gen_fmt)
    gen_stubs_right, gen_data_right = zip_longest(*gen_right)
    gen_table_right = SimpleTable(gen_data_right,
                                  gen_header,
                                  gen_stubs_right,
                                  title=gen_title,
                                  txt_fmt=gen_fmt)
    gen_table_left.extend_right(gen_table_right)
    self.general_table = gen_table_left

    # Parameters part of the summary table
    s_alp = alpha / 2
    c_alp = 1 - alpha / 2
    if Covariance_Type == 'nonrobust':
        self.std_err_name = 'nonrobust std err'
    elif Covariance_Type == 'robust':
        self.std_err_name = 'robust std err'
    elif Covariance_Type == 'clustered':
        self.std_err_name = 'cluster std err'
    else:
        self.std_err_name = 'std err'
    param_header = ['coef',
                    self.std_err_name,
                    't',
                    'P>|t|',
                    '[' + str(s_alp),
                    str(c_alp) + ']']  # alp + ' Conf. Interval'
    params_stubs = xname
    params = self.params.copy()
    conf_int = self.conf_int(alpha)
    std_err = self.bse.copy()
    exog_len = lrange(len(xname))
    tstat = self.tvalues.copy()
    prob_stat = self.pvalues.copy()
    for i in range(len(self.params)):
        params[i] = forg(self.params[i], 5)
        std_err[i] = forg(self.bse[i], 5)
        tstat[i] = forg(self.tvalues[i], 4)
        prob_stat[i] = forg(self.pvalues[i], 4)

    # Simpletable should be able to handle the formatting
    params_data = lzip(["%#6.5f" % (params[i]) for i in exog_len],
                       ["%#6.5f" % (std_err[i]) for i in exog_len],
                       ["%#6.4f" % (tstat[i]) for i in exog_len],
                       ["%#6.4f" % (prob_stat[i]) for i in exog_len],
                       ["%#6.4f" % conf_int[0][i] for i in exog_len],
                       ["%#6.4f" % conf_int[1][i] for i in exog_len])
    self.parameter_table = SimpleTable(params_data,
                                       param_header,
                                       params_stubs,
                                       title=None,
                                       txt_fmt=fmt_2)
    print(self.general_table)
    print(self.parameter_table)
    return
def summary(self, yname=None, xname=None, title=0, alpha=.05,
            returns='text', model_info=None):
    """
    Parameters
    ----------
    yname : str
        optional, Default is `Y`
    xname : list[str]
        optional, Default is `X.#` for # in p the number of regressors
    Confidence interval : (0, 1) not implemented
    title : str
        optional, Default is 'Generalized linear model'
    returns : str
        'text', 'table', 'csv', 'latex', 'html'

    Returns
    -------
    Default :
        returns='print'
            Prints the summarized results

    Option :
        returns='text'
            Prints the summarized results

    Option :
        returns='table'
            SimpleTable instance : summarizing the fit of a linear model.

    Option :
        returns='csv'
            returns a string of csv of the results, to import into a
            spreadsheet

    Option :
        returns='latex'
            Not implemented yet

    Option :
        returns='HTML'
            Not implemented yet

    Examples (needs updating)
    --------
    >>> import statsmodels as sm
    >>> data = sm.datasets.longley.load()
    >>> data.exog = sm.add_constant(data.exog)
    >>> ols_results = sm.OLS(data.endog, data.exog).results
    >>> print ols_results.summary()
    ...

    Notes
    -----
    conf_int calculated from normal dist.
    """
    if title == 0:
        title = _model_types[self.model.__class__.__name__]

    if xname is not None and len(xname) != len(self.params):
        # GH 2298
        raise ValueError('User supplied xnames must have the same number of '
                         'entries as the number of model parameters '
                         '({0})'.format(len(self.params)))

    yname, xname = _getnames(self, yname, xname)

    time_now = time.localtime()
    time_of_day = [time.strftime("%H:%M:%S", time_now)]
    date = time.strftime("%a, %d %b %Y", time_now)
    modeltype = self.model.__class__.__name__
    nobs = self.nobs
    df_model = self.df_model
    df_resid = self.df_resid

    # General part of the summary table, Applicable to all? models
    # ------------------------------------------------------------
    # TODO: define this generically, overwrite in model classes
    # replace definition of stubs data by single list
    # e.g.
    gen_left = [('Model type:', [modeltype]),
                ('Date:', [date]),
                ('Dependent Variable:', yname),  # TODO: What happens with multiple names?
                ('df model', [df_model])
                ]
    gen_stubs_left, gen_data_left = zip_longest(*gen_left)  # transpose row col

    gen_title = title
    gen_header = None
    gen_table_left = SimpleTable(gen_data_left,
                                 gen_header,
                                 gen_stubs_left,
                                 title=gen_title,
                                 txt_fmt=gen_fmt)

    gen_stubs_right = ('Method:',
                       'Time:',
                       'Number of Obs:',
                       'df resid')
    gen_data_right = ([modeltype],  # was dist family need to look at more
                      time_of_day,
                      [nobs],
                      [df_resid])
    gen_table_right = SimpleTable(gen_data_right,
                                  gen_header,
                                  gen_stubs_right,
                                  title=gen_title,
                                  txt_fmt=gen_fmt)
    gen_table_left.extend_right(gen_table_right)
    general_table = gen_table_left

    # Parameters part of the summary table
    # ------------------------------------
    # Note: this is not necessary since we standardized names,
    #   only t versus normal
    tstats = {'OLS': self.t(),
              'GLS': self.t(),
              'GLSAR': self.t(),
              'WLS': self.t(),
              'RLM': self.t(),
              'GLM': self.t()}
    prob_stats = {'OLS': self.pvalues,
                  'GLS': self.pvalues,
                  'GLSAR': self.pvalues,
                  'WLS': self.pvalues,
                  'RLM': self.pvalues,
                  'GLM': self.pvalues}
    # Dictionary to store the header names for the parameter part of the
    # summary table. look up by modeltype
    alp = str((1 - alpha) * 100) + '%'
    param_header = {
        'OLS': ['coef', 'std err', 't', 'P>|t|', alp + ' Conf. Interval'],
        'GLS': ['coef', 'std err', 't', 'P>|t|', alp + ' Conf. Interval'],
        'GLSAR': ['coef', 'std err', 't', 'P>|t|', alp + ' Conf. Interval'],
        'WLS': ['coef', 'std err', 't', 'P>|t|', alp + ' Conf. Interval'],
        'GLM': ['coef', 'std err', 't', 'P>|t|',
                alp + ' Conf. Interval'],  # glm uses t-distribution
        'RLM': ['coef', 'std err', 'z', 'P>|z|',
                alp + ' Conf. Interval']   # check z
        }
    params_stubs = xname
    params = self.params
    conf_int = self.conf_int(alpha)
    std_err = self.bse
    exog_len = lrange(len(xname))
    tstat = tstats[modeltype]
    prob_stat = prob_stats[modeltype]

    # SimpleTable should be able to handle the formatting
    params_data = lzip(["%#6.4g" % (params[i]) for i in exog_len],
                       ["%#6.4f" % (std_err[i]) for i in exog_len],
                       ["%#6.4f" % (tstat[i]) for i in exog_len],
                       ["%#6.4f" % (prob_stat[i]) for i in exog_len],
                       ["(%#5g, %#5g)" % tuple(conf_int[i])
                        for i in exog_len])
    parameter_table = SimpleTable(params_data,
                                  param_header[modeltype],
                                  params_stubs,
                                  title=None,
                                  txt_fmt=fmt_2)

    # special table
    # -------------
    # TODO: exists in linear_model, what about other models
    # residual diagnostics

    # output options
    # --------------
    # TODO: JP the rest needs to be fixed, similar to summary in linear_model

    def ols_printer():
        """
        print summary table for ols models
        """
        table = str(general_table) + '\n' + str(parameter_table)
        return table

    def glm_printer():
        table = str(general_table) + '\n' + str(parameter_table)
        return table

    printers = {'OLS': ols_printer, 'GLM': glm_printer}

    if returns == 'print':
        try:
            return printers[modeltype]()
        except KeyError:
            return printers['OLS']()
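# A minimal, hedged usage sketch for the summary machinery above. It goes
# through the public results API (``res.summary()``) rather than calling this
# module-level ``summary`` directly, since the latter depends on module
# globals (``_model_types``, ``gen_fmt``, ``fmt_2``) defined elsewhere; the
# helper name ``_demo_summary`` is illustrative, not part of the library.
def _demo_summary():
    import numpy as np
    import statsmodels.api as sm

    np.random.seed(0)
    x = sm.add_constant(np.random.standard_normal((50, 2)))
    y = np.dot(x, [1., 2., -1.]) + np.random.standard_normal(50)
    res = sm.OLS(y, x).fit()
    # renders both a general table (model type, date, nobs, df) and a
    # parameter table (coef, std err, t, P>|t|, conf. interval)
    print(res.summary())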
def arma_order_select_ic(y, max_ar=4, max_ma=2, ic='bic', trend='c',
                         model_kw={}, fit_kw={}):
    """
    Returns information criteria for many ARMA models

    Parameters
    ----------
    y : array-like
        Time-series data
    max_ar : int
        Maximum number of AR lags to use. Default 4.
    max_ma : int
        Maximum number of MA lags to use. Default 2.
    ic : str, list
        Information criteria to report. Either a single string or a list
        of different criteria is possible.
    trend : str
        The trend to use when fitting the ARMA models.
    model_kw : dict
        Keyword arguments to be passed to the ``ARMA`` model
    fit_kw : dict
        Keyword arguments to be passed to ``ARMA.fit``.

    Returns
    -------
    obj : Results object
        Each ic is an attribute with a DataFrame for the results. The AR
        order used is the row index. The MA order used is the column index.
        The minimum orders are available as ``ic_min_order``.

    Examples
    --------

    >>> from statsmodels.tsa.arima_process import arma_generate_sample
    >>> import statsmodels.api as sm
    >>> import numpy as np

    >>> arparams = np.array([.75, -.25])
    >>> maparams = np.array([.65, .35])
    >>> arparams = np.r_[1, -arparams]
    >>> maparams = np.r_[1, maparams]
    >>> nobs = 250
    >>> np.random.seed(2014)
    >>> y = arma_generate_sample(arparams, maparams, nobs)
    >>> res = sm.tsa.arma_order_select_ic(y, ic=['aic', 'bic'], trend='nc')
    >>> res.aic_min_order
    >>> res.bic_min_order

    Notes
    -----
    This method can be used to tentatively identify the order of an ARMA
    process, provided that the time series is stationary and invertible.
    This function computes the full exact MLE estimate of each model and
    can therefore be a little slow. An implementation using approximate
    estimates will be provided in the future. In the meantime, consider
    passing {'method': 'css'} to fit_kw.
    """
    from pandas import DataFrame

    ar_range = lrange(0, max_ar + 1)
    ma_range = lrange(0, max_ma + 1)
    if isinstance(ic, string_types):
        ic = [ic]
    elif not isinstance(ic, (list, tuple)):
        raise ValueError("Need a list or a tuple for ic if not a string.")

    results = np.zeros((len(ic), max_ar + 1, max_ma + 1))

    for ar in ar_range:
        for ma in ma_range:
            if ar == 0 and ma == 0 and trend == 'nc':
                results[:, ar, ma] = np.nan
                continue

            mod = _safe_arma_fit(y, (ar, ma), model_kw, trend, fit_kw)
            if mod is None:
                results[:, ar, ma] = np.nan
                continue

            for i, criteria in enumerate(ic):
                results[i, ar, ma] = getattr(mod, criteria)

    dfs = [DataFrame(res, columns=ma_range, index=ar_range)
           for res in results]

    res = dict(zip(ic, dfs))

    # add the minimums to the results dict
    min_res = {}
    for i, result in iteritems(res):
        mins = np.where(result.min().min() == result)
        min_res.update({i + '_min_order': (mins[0][0], mins[1][0])})
    res.update(min_res)

    return Bunch(**res)
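# A runnable usage sketch for ``arma_order_select_ic``, mirroring the
# docstring example; ``sm.tsa.arma_order_select_ic`` is the public alias.
# The helper name ``_demo_order_select`` is illustrative only.
def _demo_order_select():
    import numpy as np
    import statsmodels.api as sm
    from statsmodels.tsa.arima_process import arma_generate_sample

    np.random.seed(2014)
    ar = np.r_[1, -np.array([.75, -.25])]  # full AR polynomial, leading 1
    ma = np.r_[1, np.array([.65, .35])]    # full MA polynomial, leading 1
    y = arma_generate_sample(ar, ma, 250)

    res = sm.tsa.arma_order_select_ic(y, max_ar=3, max_ma=2,
                                      ic=['aic', 'bic'], trend='nc')
    print(res.bic)             # DataFrame: rows are AR order, cols MA order
    print(res.bic_min_order)   # (p, q) pair minimizing BIC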
from pandas import to_datetime

import numpy as np

_quarter_to_day = {"1": (3, 31), "2": (6, 30), "3": (9, 30), "4": (12, 31),
                   "I": (3, 31), "II": (6, 30), "III": (9, 30),
                   "IV": (12, 31)}

_mdays = [31, 28, 31, 30, 31, 30, 31, 31, 30, 31, 30, 31]
_months_with_days = lzip(lrange(1, 13), _mdays)
_month_to_day = dict(zip(map(str, lrange(1, 13)), _months_with_days))
_month_to_day.update(dict(zip(["I", "II", "III", "IV", "V", "VI", "VII",
                               "VIII", "IX", "X", "XI", "XII"],
                              _months_with_days)))

# regex patterns
_y_pattern = r'^\d?\d?\d?\d$'

_q_pattern = r'''
^               # beginning of string
\d?\d?\d?\d     # match any number 1-9999, includes leading zeros
def influence_plot(results, external=True, alpha=.05, criterion="cooks",
                   size=48, plot_alpha=.75, ax=None, **kwargs):
    """
    Plot of influence in regression. Plots studentized resids vs. leverage.

    Parameters
    ----------
    results : results instance
        A fitted model.
    external : bool
        Whether to use externally or internally studentized residuals. It is
        recommended to leave external as True.
    alpha : float
        The alpha value to identify large studentized residuals. Large means
        abs(resid_studentized) > t.ppf(1-alpha/2, dof=results.df_resid)
    criterion : str {'DFFITS', 'Cooks'}
        Which criterion to base the size of the points on. Options are
        DFFITS or Cook's D.
    size : float
        The range of `criterion` is mapped to 8**2 - size**2 in points.
    plot_alpha : float
        The `alpha` of the plotted points.
    ax : matplotlib Axes instance
        An instance of a matplotlib Axes.

    Returns
    -------
    fig : matplotlib figure
        The matplotlib figure that contains the Axes.

    Notes
    -----
    Row labels are added for the observations in which the leverage,
    measured by the diagonal of the hat matrix, is high or the residuals
    are large, as the combination of large residuals and a high influence
    value indicates an influence point. The value of large residuals can
    be controlled using the `alpha` parameter. Large leverage points are
    identified as hat_i > 2 * (df_model + 1)/nobs.
    """
    fig, ax = utils.create_mpl_ax(ax)

    infl = results.get_influence()

    if criterion.lower().startswith('coo'):
        psize = infl.cooks_distance[0]
    elif criterion.lower().startswith('dff'):
        psize = np.abs(infl.dffits[0])
    else:
        raise ValueError("Criterion %s not understood" % criterion)

    # scale the variables
    # TODO: what is the correct scaling and the assumption here?
    # we want plots to be comparable across different plots
    # so we would need to use the expected distribution of criterion probably
    old_range = np.ptp(psize)
    new_range = size**2 - 8**2

    psize = (psize - psize.min()) * new_range / old_range + 8**2

    leverage = infl.hat_matrix_diag
    if external:
        resids = infl.resid_studentized_external
    else:
        resids = infl.resid_studentized_internal

    from scipy import stats

    cutoff = stats.t.ppf(1. - alpha / 2, results.df_resid)
    large_resid = np.abs(resids) > cutoff
    large_leverage = leverage > _high_leverage(results)
    large_points = np.logical_or(large_resid, large_leverage)

    ax.scatter(leverage, resids, s=psize, alpha=plot_alpha)

    # add point labels
    labels = results.model.data.row_labels
    if labels is None:
        labels = lrange(len(resids))
    ax = utils.annotate_axes(np.where(large_points)[0], labels,
                             lzip(leverage, resids),
                             lzip(-(psize / 2)**.5, (psize / 2)**.5),
                             "x-large", ax)

    # TODO: make configurable or let people do it ex-post?
    font = {"fontsize": 16, "color": "black"}
    ax.set_ylabel("Studentized Residuals", **font)
    ax.set_xlabel("H Leverage", **font)
    ax.set_title("Influence Plot", **font)
    return fig
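# A short, hedged usage sketch for ``influence_plot``: fit OLS on the
# built-in statecrime dataset and mark high-leverage / large-residual
# observations, sizing points by Cook's distance (the default criterion).
# The helper name ``_demo_influence_plot`` is illustrative only.
def _demo_influence_plot():
    import statsmodels.api as sm

    data = sm.datasets.statecrime.load_pandas().data
    exog = sm.add_constant(data[['urban', 'poverty', 'hs_grad']])
    res = sm.OLS(data['murder'], exog).fit()
    fig = sm.graphics.influence_plot(res, criterion='cooks')
    fig.savefig('influence.png')  # or plt.show() interactively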
def plot_partregress_grid(results, exog_idx=None, grid=None, fig=None):
    """Plot partial regression for a set of regressors.

    Parameters
    ----------
    results : results instance
        A regression model results instance
    exog_idx : None, list of ints, list of strings
        (column) indices of the exog used in the plot, default is all.
    grid : None or tuple of int (nrows, ncols)
        If grid is given, then it is used for the arrangement of the
        subplots. If grid is None, then ncol is one, if there are only 2
        subplots, and the number of columns is two otherwise.
    fig : Matplotlib figure instance, optional
        If given, this figure is simply returned. Otherwise a new figure
        is created.

    Returns
    -------
    fig : Matplotlib figure instance
        If `fig` is None, the created figure. Otherwise `fig` itself.

    Notes
    -----
    A subplot is created for each explanatory variable given by exog_idx.
    The partial regression plot shows the relationship between the response
    and the given explanatory variable after removing the effect of all
    other explanatory variables in exog.

    See Also
    --------
    plot_partregress : Plot partial regression for a single regressor.
    plot_ccpr

    References
    ----------
    See http://www.itl.nist.gov/div898/software/dataplot/refman1/auxillar/partregr.htm
    """
    import pandas
    fig = utils.create_mpl_fig(fig)

    exog_name, exog_idx = utils.maybe_name_or_idx(exog_idx, results.model)

    # maybe add option for using wendog, wexog instead
    y = pandas.Series(results.model.endog, name=results.model.endog_names)
    exog = results.model.exog

    k_vars = exog.shape[1]
    # this function doesn't make sense if k_vars=1

    # default title font; shrunk below when there are many subplots
    title_kwargs = {}
    if grid is not None:
        nrows, ncols = grid
    else:
        if len(exog_idx) > 2:
            nrows = int(np.ceil(len(exog_idx) / 2.))
            ncols = 2
            title_kwargs = {"fontdict": {"fontsize": 'small'}}
        else:
            nrows = len(exog_idx)
            ncols = 1

    # for indexing purposes
    other_names = np.array(results.model.exog_names)
    for i, idx in enumerate(exog_idx):
        others = lrange(k_vars)
        others.pop(idx)
        exog_others = pandas.DataFrame(exog[:, others],
                                       columns=other_names[others])
        ax = fig.add_subplot(nrows, ncols, i + 1)
        plot_partregress(y, pandas.Series(exog[:, idx],
                                          name=other_names[idx]),
                         exog_others, ax=ax, title_kwargs=title_kwargs,
                         obs_labels=False)
        ax.set_title("")

    fig.suptitle("Partial Regression Plot", fontsize="large")
    fig.tight_layout()
    fig.subplots_adjust(top=.95)
    return fig
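# Hedged usage sketch: partial-regression panels for selected regressors of
# a small OLS fit; ``exog_idx`` accepts names or positions. The helper name
# ``_demo_partregress_grid`` is illustrative only.
def _demo_partregress_grid():
    import statsmodels.api as sm

    data = sm.datasets.statecrime.load_pandas().data
    exog = sm.add_constant(data[['urban', 'poverty', 'single']])
    res = sm.OLS(data['murder'], exog).fit()
    fig = sm.graphics.plot_partregress_grid(res,
                                            exog_idx=['urban', 'poverty'])
    fig.savefig('partregress_grid.png')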
class StataWriter(object):
    """
    A class for writing Stata binary dta files from array-like objects

    Parameters
    ----------
    fname : file path or buffer
        Where to save the dta file.
    data : array-like
        Array-like input to save. Pandas objects are also accepted.
    convert_dates : dict
        Dictionary mapping column of datetime types to the stata internal
        format that you want to use for the dates. Options are
        'tc', 'td', 'tm', 'tw', 'th', 'tq', 'ty'. Column can be either a
        number or a name.
    encoding : str
        Default is latin-1. Note that Stata does not support unicode.
    byteorder : str
        Can be ">", "<", "little", or "big". The default is None which uses
        `sys.byteorder`

    Returns
    -------
    writer : StataWriter instance
        The StataWriter instance has a write_file method, which will
        write the file to the given `fname`.

    Examples
    --------
    >>> writer = StataWriter('./data_file.dta', data)
    >>> writer.write_file()

    Or with dates

    >>> writer = StataWriter('./date_data_file.dta', date, {2 : 'tw'})
    >>> writer.write_file()
    """
    # type code
    # --------------------
    # str1        1 = 0x01
    # str2        2 = 0x02
    # ...
    # str244    244 = 0xf4
    # byte      251 = 0xfb  (sic)
    # int       252 = 0xfc
    # long      253 = 0xfd
    # float     254 = 0xfe
    # double    255 = 0xff
    # --------------------
    # NOTE: the byte type seems to be reserved for categorical variables
    #   with a label, but the underlying variable is -127 to 100
    #   we're going to drop the label and cast to int
    DTYPE_MAP = dict(lzip(lrange(1, 245),
                          ['a' + str(i) for i in range(1, 245)]) +
                     [(251, np.int16), (252, np.int32), (253, int),
                      (254, np.float32), (255, np.float64)])
    TYPE_MAP = lrange(251) + list('bhlfd')
    MISSING_VALUES = {'b': 101,
                      'h': 32741,
                      'l': 2147483621,
                      'f': 1.7014118346046923e+38,
                      'd': 8.98846567431158e+307}

    def __init__(self, fname, data, convert_dates=None, encoding="latin-1",
                 byteorder=None):
        warnings.warn(
            "StataWriter is deprecated as of 0.10.0 and will be removed in a "
            "future version. Use pandas.DataFrame.to_stata or "
            "pandas.io.stata.StataWriter instead.",
            FutureWarning)

        self._convert_dates = convert_dates
        # attach nobs, nvars, data, varlist, typlist
        if data_util._is_using_pandas(data, None):
            self._prepare_pandas(data)
        elif data_util._is_array_like(data, None):
            data = np.asarray(data)
            if data_util._is_structured_ndarray(data):
                self._prepare_structured_array(data)
            else:
                if convert_dates is not None:
                    raise ValueError("Not able to convert dates in a plain"
                                     " ndarray.")
                self._prepare_ndarray(data)
        else:  # pragma : no cover
            raise ValueError("Type %s for data not understood" % type(data))

        if byteorder is None:
            byteorder = sys.byteorder
        self._byteorder = _set_endianness(byteorder)
        self._encoding = encoding
        self._file = get_file_obj(fname, 'wb', encoding)

    def _write(self, to_write):
        """
        Helper to call asbytes before writing to file for Python 3 compat.
        """
        self._file.write(asbytes(to_write))

    def _prepare_structured_array(self, data):
        self.nobs = len(data)
        self.nvar = len(data.dtype)
        self.data = data
        self.datarows = iter(data)
        dtype = data.dtype
        descr = dtype.descr
        if dtype.names is None:
            varlist = _default_names(self.nvar)
        else:
            varlist = dtype.names

        # check for datetime and change the type
        convert_dates = self._convert_dates
        if convert_dates is not None:
            convert_dates = _maybe_convert_to_int_keys(convert_dates,
                                                       varlist)
            self._convert_dates = convert_dates
            for key in convert_dates:
                descr[key] = (descr[key][0],
                              _convert_datetime_to_stata_type(
                                  convert_dates[key]))
            dtype = np.dtype(descr)

        self.varlist = varlist
        self.typlist = [_dtype_to_stata_type(dtype[i])
                        for i in range(self.nvar)]
        self.fmtlist = [_dtype_to_default_stata_fmt(dtype[i])
                        for i in range(self.nvar)]
        # set the given format for the datetime cols
        if convert_dates is not None:
            for key in convert_dates:
                self.fmtlist[key] = convert_dates[key]

    def _prepare_ndarray(self, data):
        if data.ndim == 1:
            data = data[:, None]
        self.nobs, self.nvar = data.shape
        self.data = data
        self.datarows = iter(data)
        # TODO: this should be user settable
        dtype = data.dtype
        self.varlist = _default_names(self.nvar)
        self.typlist = [_dtype_to_stata_type(dtype)
                        for i in range(self.nvar)]
        self.fmtlist = [_dtype_to_default_stata_fmt(dtype)
                        for i in range(self.nvar)]

    def _prepare_pandas(self, data):
        # NOTE: we might need a different API / class for pandas objects so
        # we can set different semantics - handle this with a PR to pandas.io
        class DataFrameRowIter(object):
            def __init__(self, data):
                self.data = data

            def __iter__(self):
                for i, row in data.iterrows():
                    yield row

        data = data.reset_index()
        self.datarows = DataFrameRowIter(data)
        self.nobs, self.nvar = data.shape
        self.data = data
        self.varlist = data.columns.tolist()
        dtypes = data.dtypes
        convert_dates = self._convert_dates
        if convert_dates is not None:
            convert_dates = _maybe_convert_to_int_keys(convert_dates,
                                                       self.varlist)
            self._convert_dates = convert_dates
            for key in convert_dates:
                new_type = _convert_datetime_to_stata_type(
                    convert_dates[key])
                dtypes[key] = np.dtype(new_type)
        self.typlist = [_dtype_to_stata_type(dt) for dt in dtypes]
        self.fmtlist = [_dtype_to_default_stata_fmt(dt) for dt in dtypes]
        # set the given format for the datetime cols
        if convert_dates is not None:
            for key in convert_dates:
                self.fmtlist[key] = convert_dates[key]

    def write_file(self):
        self._write_header()
        self._write_descriptors()
        self._write_variable_labels()
        # write 5 zeros for expansion fields
        self._write(_pad_bytes("", 5))
        if self._convert_dates is None:
            self._write_data_nodates()
        else:
            self._write_data_dates()
        # self._write_value_labels()

    def _write_header(self, data_label=None, time_stamp=None):
        byteorder = self._byteorder
        # ds_format - just use 114
        self._write(pack("b", 114))
        # byteorder
        self._write(byteorder == ">" and "\x01" or "\x02")
        # filetype
        self._write("\x01")
        # unused
        self._write("\x00")
        # number of vars, 2 bytes
        self._write(pack(byteorder + "h", self.nvar)[:2])
        # number of obs, 4 bytes
        self._write(pack(byteorder + "i", self.nobs)[:4])
        # data label 81 bytes, char, null terminated
        if data_label is None:
            self._write(self._null_terminate(_pad_bytes("", 80),
                                             self._encoding))
        else:
            self._write(self._null_terminate(_pad_bytes(data_label[:80],
                                                        80),
                                             self._encoding))
        # time stamp, 18 bytes, char, null terminated
        # format dd Mon yyyy hh:mm
        if time_stamp is None:
            time_stamp = datetime.datetime.now()
        elif not isinstance(time_stamp, datetime.datetime):
            raise ValueError("time_stamp should be datetime type")
        self._write(self._null_terminate(
            time_stamp.strftime("%d %b %Y %H:%M"), self._encoding))

    def _write_descriptors(self, typlist=None, varlist=None, srtlist=None,
                           fmtlist=None, lbllist=None):
        nvar = self.nvar
        # typlist, length nvar, format byte array
        for typ in self.typlist:
            self._write(typ)

        # varlist, length 33*nvar, char array, null terminated
        for name in self.varlist:
            name = self._null_terminate(name, self._encoding)
            name = _pad_bytes(asstr(name[:32]), 33)
            self._write(name)

        # srtlist, 2*(nvar+1), int array, encoded by byteorder
        srtlist = _pad_bytes("", (2 * (nvar + 1)))
        self._write(srtlist)

        # fmtlist, 49*nvar, char array
        for fmt in self.fmtlist:
            self._write(_pad_bytes(fmt, 49))

        # lbllist, 33*nvar, char array
        # NOTE: this is where you could get fancy with pandas categorical
        # type
        for i in range(nvar):
            self._write(_pad_bytes("", 33))

    def _write_variable_labels(self, labels=None):
        nvar = self.nvar
        if labels is None:
            for i in range(nvar):
                self._write(_pad_bytes("", 81))

    def _write_data_nodates(self):
        data = self.datarows
        byteorder = self._byteorder
        TYPE_MAP = self.TYPE_MAP
        typlist = self.typlist
        for row in data:
            # row = row.squeeze().tolist()  # needed for structured arrays
            for i, var in enumerate(row):
                typ = ord(typlist[i])
                if typ <= 244:  # we've got a string
                    if len(var) < typ:
                        var = _pad_bytes(asstr(var), len(var) + 1)
                    self._write(var)
                else:
                    try:
                        if typ in _type_converters:
                            var = _type_converters[typ](var)
                        self._write(pack(byteorder + TYPE_MAP[typ], var))
                    except struct_error:
                        # have to be strict about type pack won't do any
                        # kind of casting
                        self._write(pack(byteorder + TYPE_MAP[typ],
                                         _type_converters[typ](var)))

    def _write_data_dates(self):
        convert_dates = self._convert_dates
        data = self.datarows
        byteorder = self._byteorder
        TYPE_MAP = self.TYPE_MAP
        MISSING_VALUES = self.MISSING_VALUES
        typlist = self.typlist
        for row in data:
            # row = row.squeeze().tolist()  # needed for structured arrays
            for i, var in enumerate(row):
                typ = ord(typlist[i])
                # NOTE: If anyone finds this terribly slow, there is
                # a vectorized way to convert dates, see genfromdta for
                # going from int to datetime and reverse it. will copy data
                # though
                if i in convert_dates:
                    var = _datetime_to_stata_elapsed(var, self.fmtlist[i])
                if typ <= 244:  # we've got a string
                    if isnull(var):
                        var = ""  # missing string
                    if len(var) < typ:
                        var = _pad_bytes(var, len(var) + 1)
                    self._write(var)
                else:
                    if isnull(var):  # this only matters for floats
                        var = MISSING_VALUES[typ]
                    self._write(pack(byteorder + TYPE_MAP[typ], var))

    def _null_terminate(self, s, encoding):
        null_byte = '\x00'
        if PY3:
            s += null_byte
            return s.encode(encoding)
        else:
            s += null_byte
            return s
def summary_params(results, yname=None, xname=None, alpha=.05, use_t=True,
                   skip_header=False, title=None):
    '''create a summary table for the parameters

    Parameters
    ----------
    results : results instance
        some required information is directly taken from the result
        instance
    yname : {str, None}
        optional name for the endogenous variable, default is "y"
    xname : {list[str], None}
        optional names for the exogenous variables, default is "var_xx"
    alpha : float
        significance level for the confidence intervals
    use_t : bool
        indicator whether the p-values are based on the Student-t
        distribution (if True) or on the normal distribution (if False)
    skip_header : bool
        If false (default), then the header row is added. If true, then no
        header row is added.
    title : {str, None}
        optional title for the table

    Returns
    -------
    params_table : SimpleTable instance
    '''

    # Parameters part of the summary table
    # ------------------------------------
    # Note: this is not necessary since we standardized names,
    #   only t versus normal

    if isinstance(results, tuple):
        # for multivariate endog
        # TODO: check whether I do not want to refactor this
        # we need to give parameter alpha to conf_int
        results, params, std_err, tvalues, pvalues, conf_int = results
    else:
        params = results.params
        std_err = results.bse
        tvalues = results.tvalues  # is this sometimes called zvalues
        pvalues = results.pvalues
        conf_int = results.conf_int(alpha)
    if params.size == 0:
        return SimpleTable([['No Model Parameters']])
    # Dictionary to store the header names for the parameter part of the
    # summary table. look up by modeltype
    if use_t:
        param_header = ['coef', 'std err', 't', 'P>|t|',
                        '[' + str(alpha / 2), str(1 - alpha / 2) + ']']
    else:
        param_header = ['coef', 'std err', 'z', 'P>|z|',
                        '[' + str(alpha / 2), str(1 - alpha / 2) + ']']

    if skip_header:
        param_header = None

    _, xname = _getnames(results, yname=yname, xname=xname)

    if len(xname) != len(params):
        raise ValueError('xnames and params do not have the same length')

    params_stubs = xname

    exog_idx = lrange(len(xname))

    params_data = lzip([forg(params[i], prec=4) for i in exog_idx],
                       [forg(std_err[i]) for i in exog_idx],
                       [forg(tvalues[i]) for i in exog_idx],
                       ["%#6.3f" % (pvalues[i]) for i in exog_idx],
                       [forg(conf_int[i, 0]) for i in exog_idx],
                       [forg(conf_int[i, 1]) for i in exog_idx])
    parameter_table = SimpleTable(params_data,
                                  param_header,
                                  params_stubs,
                                  title=title,
                                  txt_fmt=fmt_params)

    return parameter_table
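# Hedged usage sketch for ``summary_params``: render just the coefficient
# table of a fitted model as a SimpleTable. Assumes the function lives in
# ``statsmodels.iolib.summary``; ``_demo_summary_params`` is illustrative.
def _demo_summary_params():
    import numpy as np
    import statsmodels.api as sm
    from statsmodels.iolib.summary import summary_params

    np.random.seed(0)
    x = sm.add_constant(np.random.standard_normal((40, 2)))
    y = np.dot(x, [1., .5, -2.]) + np.random.standard_normal(40)
    res = sm.OLS(y, x).fit()
    print(summary_params(res, alpha=.05, use_t=True))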
def anova2_lm_single(model, design_info, n_rows, test, pr_test, robust):
    """
    Anova type II table for one fitted linear model.

    Parameters
    ----------
    model : fitted linear model results instance
        A fitted linear model

    **kwargs**

    scale : float
        Estimate of variance, If None, will be estimated from the largest
        model. Default is None.
    test : str {"F", "Chisq", "Cp"} or None
        Test statistics to provide. Default is "F".

    Notes
    -----
    Use of this function is discouraged. Use anova_lm instead.

    Type II Sum of Squares compares marginal contribution of terms. Thus,
    it is not particularly useful for models with significant interaction
    terms.
    """
    terms_info = design_info.terms[:]  # copy
    terms_info = _remove_intercept_patsy(terms_info)

    names = ['sum_sq', 'df', test, pr_test]

    table = DataFrame(np.zeros((n_rows, 4)), columns=names)
    cov = _get_covariance(model, None)
    robust_cov = _get_covariance(model, robust)
    col_order = []
    index = []
    for i, term in enumerate(terms_info):
        # grab all variables except interaction effects that contain term
        # need two hypothesis matrices; L1 is most restrictive, i.e.,
        # term==0, L2 is everything except term==0
        cols = design_info.slice(term)
        L1 = lrange(cols.start, cols.stop)
        L2 = []
        term_set = set(term.factors)
        for t in terms_info:  # for the term you have
            other_set = set(t.factors)
            if term_set.issubset(other_set) and not term_set == other_set:
                col = design_info.slice(t)
                # on a higher order term containing current `term`
                L1.extend(lrange(col.start, col.stop))
                L2.extend(lrange(col.start, col.stop))

        L1 = np.eye(model.model.exog.shape[1])[L1]
        L2 = np.eye(model.model.exog.shape[1])[L2]

        if L2.size:
            LVL = np.dot(np.dot(L1, robust_cov), L2.T)
            from scipy import linalg
            orth_compl, _ = linalg.qr(LVL)
            r = L1.shape[0] - L2.shape[0]
            # L1|2
            # use the non-unique orthogonal completion since L12 is rank r
            L12 = np.dot(orth_compl[:, -r:].T, L1)
        else:
            L12 = L1
            r = L1.shape[0]

        if test == 'F':
            f = model.f_test(L12, cov_p=robust_cov)
            table.loc[table.index[i], test] = test_value = f.fvalue
            table.loc[table.index[i], pr_test] = f.pvalue

        # need to back out SSR from f_test
        table.loc[table.index[i], 'df'] = r
        col_order.append(cols.start)
        index.append(term.name())

    table.index = Index(index + ['Residual'])
    table = table.iloc[np.argsort(col_order +
                                  [model.model.exog.shape[1] + 1])]
    # back out sum of squares from f_test
    ssr = table[test] * table['df'] * model.ssr / model.df_resid
    table['sum_sq'] = ssr
    # fill in residual
    table.loc['Residual', ['sum_sq', 'df', test, pr_test]] = (model.ssr,
                                                              model.df_resid,
                                                              np.nan, np.nan)

    return table
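# As the Notes above advise, use ``anova_lm`` with ``typ=2`` rather than
# calling this helper directly; ``anova_lm`` dispatches here for
# formula-based OLS fits. ``_demo_anova_typ2`` is an illustrative name.
def _demo_anova_typ2():
    import statsmodels.api as sm
    from statsmodels.formula.api import ols
    from statsmodels.stats.anova import anova_lm

    data = sm.datasets.statecrime.load_pandas().data
    model = ols('murder ~ urban + poverty + single', data=data).fit()
    print(anova_lm(model, typ=2))  # Type II sums of squares per term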
def test_pandasacovf():
    s = Series(lrange(1, 11))
    assert_almost_equal(acovf(s), acovf(s.values))
def plot_partregress(endog, exog_i, exog_others, data=None,
                     title_kwargs={}, obs_labels=True, label_kwargs={},
                     ax=None, ret_coords=False, **kwargs):
    """Plot partial regression for a single regressor.

    Parameters
    ----------
    endog : ndarray or string
        endogenous or response variable. If string is given, you can use
        arbitrary translations as with a formula.
    exog_i : ndarray or string
        exogenous, explanatory variable. If string is given, you can use
        arbitrary translations as with a formula.
    exog_others : ndarray or list of strings
        other exogenous, explanatory variables. If a list of strings is
        given, each item is a term in formula. You can use arbitrary
        translations as with a formula. The effect of these variables
        will be removed by OLS regression.
    data : DataFrame, dict, or recarray
        Some kind of data structure with names if the other variables are
        given as strings.
    title_kwargs : dict
        Keyword arguments to pass on for the title. The key to control the
        fonts is fontdict.
    obs_labels : bool or array-like
        Whether or not to annotate the plot points with their observation
        labels. If obs_labels is a boolean, the point labels will try to
        do the right thing. First it will try to use the index of data,
        then fall back to the index of exog_i. Alternatively, you may give
        an array-like object corresponding to the observation numbers.
    label_kwargs : dict
        Keyword arguments that control annotate for the observation
        labels.
    ax : Matplotlib AxesSubplot instance, optional
        If given, this subplot is used to plot in instead of a new figure
        being created.
    ret_coords : bool
        If True will return the coordinates of the points in the plot.
        You can use this to add your own annotations.
    kwargs
        The keyword arguments passed to plot for the points.

    Returns
    -------
    fig : Matplotlib figure instance
        If `ax` is None, the created figure. Otherwise the figure to which
        `ax` is connected.
    coords : list, optional
        If ret_coords is True, return a tuple of arrays (x_coords,
        y_coords).

    Notes
    -----
    The slope of the fitted line is that of `exog_i` in the full multiple
    regression. The individual points can be used to assess the influence
    of points on the estimated coefficient.

    See Also
    --------
    plot_partregress_grid : Plot partial regression for a set of
        regressors.

    Examples
    --------
    Load the Statewide Crime data set and plot partial regression of the
    rate of high school graduation (hs_grad) on the murder rate (murder).
    The effects of the percent of the population living in urban areas
    (urban), below the poverty line (poverty), and in a single person
    household (single) are removed by OLS regression.

    >>> import statsmodels.api as sm
    >>> import matplotlib.pyplot as plt

    >>> crime_data = sm.datasets.statecrime.load_pandas()
    >>> sm.graphics.plot_partregress(endog='murder', exog_i='hs_grad',
    ...                              exog_others=['urban', 'poverty', 'single'],
    ...                              data=crime_data.data, obs_labels=False)
    >>> plt.show()

    .. plot:: plots/graphics_regression_partregress.py

    More detailed examples can be found in the Regression Plots notebook
    on the examples page.
    """
    # NOTE: there is no interaction between possible missing data and
    # obs_labels yet, so this will need to be tweaked a bit for this case
    fig, ax = utils.create_mpl_ax(ax)

    # strings, use patsy to transform to data
    if isinstance(endog, string_types):
        endog = dmatrix(endog + "-1", data)

    if isinstance(exog_others, string_types):
        RHS = dmatrix(exog_others, data)
    elif isinstance(exog_others, list):
        RHS = "+".join(exog_others)
        RHS = dmatrix(RHS, data)
    else:
        RHS = exog_others
    RHS_isempty = False
    if isinstance(RHS, np.ndarray) and RHS.size == 0:
        RHS_isempty = True
    elif isinstance(RHS, pd.DataFrame) and RHS.empty:
        RHS_isempty = True
    if isinstance(exog_i, string_types):
        exog_i = dmatrix(exog_i + "-1", data)

    # all arrays or pandas-like
    if RHS_isempty:
        ax.plot(endog, exog_i, 'o', **kwargs)
        fitted_line = OLS(endog, exog_i).fit()
        x_axis_endog_name = ('x' if isinstance(exog_i, np.ndarray)
                             else exog_i.name)
        y_axis_endog_name = ('y' if isinstance(endog, np.ndarray)
                             else endog.design_info.column_names[0])
    else:
        res_yaxis = OLS(endog, RHS).fit()
        res_xaxis = OLS(exog_i, RHS).fit()
        xaxis_resid = res_xaxis.resid
        yaxis_resid = res_yaxis.resid
        x_axis_endog_name = res_xaxis.model.endog_names
        y_axis_endog_name = res_yaxis.model.endog_names
        ax.plot(xaxis_resid, yaxis_resid, 'o', **kwargs)
        fitted_line = OLS(yaxis_resid, xaxis_resid).fit()

    fig = abline_plot(0, fitted_line.params[0], color='k', ax=ax)

    if x_axis_endog_name == 'y':  # for no names regression will just get a y
        x_axis_endog_name = 'x'  # this is misleading, so use x
    ax.set_xlabel("e(%s | X)" % x_axis_endog_name)
    ax.set_ylabel("e(%s | X)" % y_axis_endog_name)
    ax.set_title('Partial Regression Plot', **title_kwargs)

    # NOTE: if we want to get super fancy, we could annotate if a point is
    # clicked using this widget
    # http://stackoverflow.com/questions/4652439/
    # is-there-a-matplotlib-equivalent-of-matlabs-datacursormode/
    # 4674445#4674445
    if obs_labels is True:
        if data is not None:
            obs_labels = data.index
        elif hasattr(exog_i, "index"):
            obs_labels = exog_i.index
        else:
            obs_labels = res_xaxis.model.data.row_labels
        # NOTE: row_labels can be None.
        # Maybe we should fix this to never be the case.
        if obs_labels is None:
            obs_labels = lrange(len(exog_i))

    if obs_labels is not False:  # could be array-like
        if len(obs_labels) != len(exog_i):
            raise ValueError("obs_labels does not match length of exog_i")
        label_kwargs.update(dict(ha="center", va="bottom"))
        ax = utils.annotate_axes(lrange(len(obs_labels)), obs_labels,
                                 lzip(res_xaxis.resid, res_yaxis.resid),
                                 [(0, 5)] * len(obs_labels), "x-large",
                                 ax=ax, **label_kwargs)

    if ret_coords:
        return fig, (res_xaxis.resid, res_yaxis.resid)
    else:
        return fig