def _plot_index(self, y, ylabel, threshold=None, title=None, ax=None, **kwds):
    from statsmodels.graphics import utils
    fig, ax = utils.create_mpl_ax(ax)
    if title is None:
        title = "Index Plot"
    nobs = len(self.endog)
    index = np.arange(nobs)
    ax.scatter(index, y, **kwds)

    if threshold == 'all':
        large_points = np.ones(nobs, np.bool_)
    else:
        large_points = np.abs(y) > threshold
    psize = 3 * np.ones(nobs)
    # add point labels
    labels = self.results.model.data.row_labels
    if labels is None:
        labels = np.arange(nobs)
    ax = utils.annotate_axes(np.where(large_points)[0], labels,
                             lzip(index, y), lzip(-psize, psize),
                             "large", ax)

    font = {"fontsize": 16, "color": "black"}
    ax.set_ylabel(ylabel, **font)
    ax.set_xlabel("Observation", **font)
    ax.set_title(title, **font)
    return fig
def summary_params_2d(result, extras=None, endog_names=None, exog_names=None,
                      title=None):
    '''create summary table of regression parameters with several equations

    This allows interleaving of parameters with bse and/or tvalues

    Parameters
    ----------
    result : result instance
        the result instance with params and attributes in extras
    extras : list of strings
        additional attributes to add below a parameter row, e.g. bse or
        tvalues
    endog_names : None or list of strings
        names for rows of the parameter array (multivariate endog)
    exog_names : None or list of strings
        names for columns of the parameter array (exog)
    alpha : float
        level for confidence intervals, default 0.95
    title : None or string

    Returns
    -------
    tables : list of SimpleTable
        this contains a list of all separate Subtables
    table_all : SimpleTable
        the merged table with results concatenated for each row of the
        parameter array
    '''
    if endog_names is None:
        # TODO: note the [1:] is specific to current MNLogit
        endog_names = ['endog_%d' % i for i in
                       np.unique(result.model.endog)[1:]]
    if exog_names is None:
        exog_names = ['var%d' % i for i in range(len(result.params))]

    # TODO: check formatting options with different values
    res_params = [[forg(item, prec=4) for item in row]
                  for row in result.params]
    if extras:
        extras_list = [[['%10s' % ('(' + forg(v, prec=3).strip() + ')')
                         for v in col]
                        for col in getattr(result, what)]
                       for what in extras]
        data = lzip(res_params, *extras_list)
        data = [i for j in data for i in j]  # flatten
        stubs = lzip(endog_names, *[[''] * len(endog_names)] * len(extras))
        stubs = [i for j in stubs for i in j]  # flatten
    else:
        data = res_params
        stubs = endog_names

    txt_fmt = copy.deepcopy(fmt_params)
    txt_fmt["data_fmts"] = ["%s"] * result.params.shape[1]

    return SimpleTable(data, headers=exog_names, stubs=stubs, title=title,
                       txt_fmt=txt_fmt)
def _influence_plot(results, influence, external=True, alpha=.05,
                    criterion="cooks", size=48, plot_alpha=.75, ax=None,
                    **kwargs):
    infl = influence
    fig, ax = utils.create_mpl_ax(ax)

    if criterion.lower().startswith('coo'):
        psize = infl.cooks_distance[0]
    elif criterion.lower().startswith('dff'):
        psize = np.abs(infl.dffits[0])
    else:
        raise ValueError("Criterion %s not understood" % criterion)

    # scale the variables
    # TODO: what is the correct scaling and the assumption here?
    # we want plots to be comparable across different plots
    # so we would need to use the expected distribution of criterion probably
    old_range = np.ptp(psize)
    new_range = size**2 - 8**2
    psize = (psize - psize.min()) * new_range/old_range + 8**2

    leverage = infl.hat_matrix_diag
    if external:
        resids = infl.resid_studentized_external
    else:
        resids = infl.resid_studentized

    from scipy import stats
    cutoff = stats.t.ppf(1.-alpha/2, results.df_resid)
    large_resid = np.abs(resids) > cutoff
    large_leverage = leverage > _high_leverage(results)
    large_points = np.logical_or(large_resid, large_leverage)

    ax.scatter(leverage, resids, s=psize, alpha=plot_alpha)

    # add point labels
    labels = results.model.data.row_labels
    if labels is None:
        labels = lrange(len(resids))
    ax = utils.annotate_axes(np.where(large_points)[0], labels,
                             lzip(leverage, resids),
                             lzip(-(psize/2)**.5, (psize/2)**.5),
                             "x-large", ax)

    # TODO: make configurable or let people do it ex-post?
    font = {"fontsize": 16, "color": "black"}
    ax.set_ylabel("Studentized Residuals", **font)
    ax.set_xlabel("H Leverage", **font)
    ax.set_title("Influence Plot", **font)
    return fig
def test_mcnemar_vectorized():
    ttk = np.random.randint(5, 15, size=(2, 2, 3))
    mcnemar(ttk)

    res = mcnemar(ttk, exact=False)
    res1 = lzip(*[mcnemar(ttk[:, :, i], exact=False) for i in range(3)])
    assert_allclose(res, res1, rtol=1e-13)

    res = mcnemar(ttk, exact=False, correction=False)
    res1 = lzip(*[mcnemar(ttk[:, :, i], exact=False, correction=False)
                  for i in range(3)])
    assert_allclose(res, res1, rtol=1e-13)

    res = mcnemar(ttk, exact=True)
    res1 = lzip(*[mcnemar(ttk[:, :, i], exact=True) for i in range(3)])
    assert_allclose(res, res1, rtol=1e-13)
def _plot_leverage_resid2(results, influence, alpha=.05, ax=None, **kwargs):
    from scipy.stats import zscore, norm
    fig, ax = utils.create_mpl_ax(ax)

    infl = influence
    leverage = infl.hat_matrix_diag
    resid = zscore(infl.resid)
    ax.plot(resid**2, leverage, 'o', **kwargs)
    ax.set_xlabel("Normalized residuals**2")
    ax.set_ylabel("Leverage")
    ax.set_title("Leverage vs. Normalized residuals squared")

    large_leverage = leverage > _high_leverage(results)
    # norm or t here if standardized?
    cutoff = norm.ppf(1.-alpha/2)
    large_resid = np.abs(resid) > cutoff
    labels = results.model.data.row_labels
    if labels is None:
        labels = lrange(int(results.nobs))
    index = np.where(np.logical_or(large_leverage, large_resid))[0]
    ax = utils.annotate_axes(index, labels, lzip(resid**2, leverage),
                             [(0, 5)] * int(results.nobs), "large",
                             ax=ax, ha="center", va="bottom")
    ax.margins(.075, .075)
    return fig
def _coef_table(self):
    model = self.model
    k = model.neqs

    Xnames = self.model.exog_names

    data = lzip(model.params.T.ravel(),
                model.stderr.T.ravel(),
                model.tvalues.T.ravel(),
                model.pvalues.T.ravel())

    header = ('coefficient', 'std. error', 't-stat', 'prob')

    buf = StringIO()
    dim = k * model.k_ar + model.k_trend
    for i in range(k):
        section = "Results for equation %s" % model.names[i]
        buf.write(section + '\n')
        #print >> buf, section

        table = SimpleTable(data[dim * i: dim * (i + 1)], header,
                            Xnames, title=None, txt_fmt=self.default_fmt)
        buf.write(str(table) + '\n')

        if i < k - 1:
            buf.write('\n')

    return buf.getvalue()
def __init__(self, group, name=''):
    super(self.__class__, self).__init__(group, name=name)
    idx = (np.nonzero(np.diff(group))[0] + 1).tolist()
    self.groupidx = groupidx = lzip([0] + idx, idx + [len(group)])
    ngroups = len(groupidx)
def add_dict(self, d, ncols=2, align='l', float_format="%.4f"):
    '''Add the contents of a Dict to summary table

    Parameters
    ----------
    d : dict
        Keys and values are automatically coerced to strings with str().
        Users are encouraged to format them before using add_dict.
    ncols : int
        Number of columns of the output table
    align : string
        Data alignment (l/c/r)
    '''
    keys = [_formatter(x, float_format) for x in iterkeys(d)]
    vals = [_formatter(x, float_format) for x in itervalues(d)]
    data = np.array(lzip(keys, vals))

    if data.shape[0] % ncols != 0:
        pad = ncols - (data.shape[0] % ncols)
        data = np.vstack([data, np.array(pad * [['', '']])])

    data = np.split(data, ncols)
    data = reduce(lambda x, y: np.hstack([x, y]), data)
    self.add_array(data, align=align)
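# --- Usage sketch (assumption: this `add_dict` is the method on
# statsmodels.iolib.summary2.Summary; the values shown are made up) ---
from statsmodels.iolib.summary2 import Summary

smry = Summary()
smry.add_dict({'Model': 'OLS', 'No. Observations': 100,
               'R-squared': 0.87, 'AIC': 212.4}, ncols=2)
print(smry)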
def _create_default_properties(data):
    """Create the default properties of the mosaic given the data.

    First it varies the color hue (first category), then the color
    saturation (second category) and then the color value (third category).
    If a fourth category is found, it will put decoration on the rectangle.
    Doesn't manage more than four levels of categories.
    """
    categories_levels = _categories_level(list(iterkeys(data)))
    Nlevels = len(categories_levels)
    # first level, the hue
    L = len(categories_levels[0])
    # hue = np.linspace(1.0, 0.0, L+1)[:-1]
    hue = np.linspace(0.0, 1.0, L + 2)[:-2]
    # second level, the saturation
    L = len(categories_levels[1]) if Nlevels > 1 else 1
    saturation = np.linspace(0.5, 1.0, L + 1)[:-1]
    # third level, the value
    L = len(categories_levels[2]) if Nlevels > 2 else 1
    value = np.linspace(0.5, 1.0, L + 1)[:-1]
    # fourth level, the hatch
    L = len(categories_levels[3]) if Nlevels > 3 else 1
    hatch = ['', '/', '-', '|', '+'][:L + 1]
    # convert in list and merge with the levels
    hue = lzip(list(hue), categories_levels[0])
    saturation = lzip(list(saturation),
                      categories_levels[1] if Nlevels > 1 else [''])
    value = lzip(list(value),
                 categories_levels[2] if Nlevels > 2 else [''])
    hatch = lzip(list(hatch),
                 categories_levels[3] if Nlevels > 3 else [''])
    # create the properties dictionary
    properties = {}
    for h, s, v, t in product(hue, saturation, value, hatch):
        hv, hn = h
        sv, sn = s
        vv, vn = v
        tv, tn = t
        level = (hn,) + ((sn,) if sn else tuple())
        level = level + ((vn,) if vn else tuple())
        level = level + ((tn,) if tn else tuple())
        hsv = array([hv, sv, vv])
        prop = {'color': _single_hsv_to_rgb(hsv), 'hatch': tv, 'lw': 0}
        properties[level] = prop
    return properties
def qqline(ax, line, x=None, y=None, dist=None, fmt='r-'):
    """
    Plot a reference line for a qqplot.

    Parameters
    ----------
    ax : matplotlib axes instance
        The axes on which to plot the line
    line : str {'45','r','s','q'}
        Options for the reference line to which the data is compared.:

        - '45' - 45-degree line
        - 's'  - standardized line, the expected order statistics are scaled
          by the standard deviation of the given sample and have the mean
          added to them
        - 'r'  - A regression line is fit
        - 'q'  - A line is fit through the quartiles.
        - None - By default no reference line is added to the plot.

    x : array
        X data for plot. Not needed if line is '45'.
    y : array
        Y data for plot. Not needed if line is '45'.
    dist : scipy.stats.distribution
        A scipy.stats distribution, needed if line is 'q'.

    Notes
    -----
    There is no return value. The line is plotted on the given `ax`.
    """
    if line == '45':
        end_pts = lzip(ax.get_xlim(), ax.get_ylim())
        end_pts[0] = min(end_pts[0])
        end_pts[1] = max(end_pts[1])
        ax.plot(end_pts, end_pts, fmt)
        ax.set_xlim(end_pts)
        ax.set_ylim(end_pts)
        return  # does this have any side effects?
    if x is None or y is None:
        raise ValueError("If line is not 45, x and y cannot be None.")
    elif line == 'r':
        # could use ax.lines[0].get_xdata(), get_ydata(),
        # but don't know axes are 'clean'
        y = OLS(y, add_constant(x)).fit().fittedvalues
        ax.plot(x, y, fmt)
    elif line == 's':
        m, b = y.std(), y.mean()
        ref_line = x*m + b
        ax.plot(x, ref_line, fmt)
    elif line == 'q':
        _check_for_ppf(dist)
        q25 = stats.scoreatpercentile(y, 25)
        q75 = stats.scoreatpercentile(y, 75)
        theoretical_quartiles = dist.ppf([0.25, 0.75])
        m = (q75 - q25) / np.diff(theoretical_quartiles)
        b = q25 - m*theoretical_quartiles[0]
        ax.plot(x, m*x + b, fmt)
def pval_table(self):
    '''create a (n_levels, n_levels) array with corrected p_values

    this needs to improve, similar to R pairwise output
    '''
    k = self.n_levels
    pvals_mat = np.zeros((k, k))
    # if we don't assume we have all pairs
    pvals_mat[lzip(*self.all_pairs)] = self.pval_corrected()
    return pvals_mat
def test_mcnemar_vectorized(reset_randomstate):
    ttk = np.random.randint(5, 15, size=(2, 2, 3))
    with pytest.deprecated_call():
        res = sbmcnemar(ttk, exact=False)
    with pytest.deprecated_call():
        res1 = lzip(*[sbmcnemar(ttk[:, :, i], exact=False)
                      for i in range(3)])
    assert_allclose(res, res1, rtol=1e-13)

    with pytest.deprecated_call():
        res = sbmcnemar(ttk, exact=False, correction=False)
    with pytest.deprecated_call():
        res1 = lzip(*[sbmcnemar(ttk[:, :, i], exact=False, correction=False)
                      for i in range(3)])
    assert_allclose(res, res1, rtol=1e-13)

    with pytest.deprecated_call():
        res = sbmcnemar(ttk, exact=True)
    with pytest.deprecated_call():
        res1 = lzip(*[sbmcnemar(ttk[:, :, i], exact=True) for i in range(3)])
    assert_allclose(res, res1, rtol=1e-13)
def summary_table(self, float_fmt="%6.3f"):
    '''create a summary table with all influence and outlier measures

    This does not currently distinguish between statistics that can be
    calculated from the original regression results and those for which a
    leave-one-observation-out loop is needed.

    Returns
    -------
    res : SimpleTable instance
        SimpleTable instance with the results, can be printed

    Notes
    -----
    This also attaches table_data to the instance.
    '''
    # print self.dfbetas

    # table_raw = [ np.arange(self.nobs),
    #               self.endog,
    #               self.fittedvalues,
    #               self.cooks_distance(),
    #               self.resid_studentized_internal,
    #               self.hat_matrix_diag,
    #               self.dffits_internal,
    #               self.resid_studentized_external,
    #               self.dffits,
    #               self.dfbetas
    #               ]
    table_raw = [('obs', np.arange(self.nobs)),
                 ('endog', self.endog),
                 ('fitted\nvalue', self.results.fittedvalues),
                 ("Cook's\nd", self.cooks_distance[0]),
                 ("student.\nresidual", self.resid_studentized_internal),
                 ('hat diag', self.hat_matrix_diag),
                 ('dffits \ninternal', self.dffits_internal[0]),
                 ("ext.stud.\nresidual", self.resid_studentized_external),
                 ('dffits', self.dffits[0])
                 ]
    colnames, data = lzip(*table_raw)  # unzip
    data = np.column_stack(data)
    self.table_data = data
    from statsmodels.iolib.table import SimpleTable, default_html_fmt
    from statsmodels.iolib.tableformatting import fmt_base
    from copy import deepcopy
    fmt = deepcopy(fmt_base)
    fmt_html = deepcopy(default_html_fmt)
    fmt['data_fmts'] = ["%4d"] + [float_fmt] * (data.shape[1] - 1)
    # fmt_html['data_fmts'] = fmt['data_fmts']
    return SimpleTable(data, headers=colnames, txt_fmt=fmt,
                       html_fmt=fmt_html)
def in_domain(self, xs, ys, x):
    """
    Returns the filtered (xs, ys) based on the Kernel domain centred on x
    """
    # Disable black-list functions: filter used for speed instead of
    # list-comprehension
    # pylint: disable-msg=W0141
    def isInDomain(xy):
        """Used for filter to check if point is in the domain"""
        u = (xy[0] - x) / self.h
        return u >= self.domain[0] and u <= self.domain[1]

    if self.domain is None:
        return (xs, ys)
    else:
        filtered = lfilter(isInDomain, lzip(xs, ys))
        if len(filtered) > 0:
            xs, ys = lzip(*filtered)
            return (xs, ys)
        else:
            return ([], [])
def getquotes(symbol, start, end):
    quotes = fin.quotes_historical_yahoo(symbol, start, end)
    dates, open, close, high, low, volume = lzip(*quotes)

    data = {
        'open': open,
        'close': close,
        'high': high,
        'low': low,
        'volume': volume
    }

    dates = pa.Index([dt.datetime.fromordinal(int(d)) for d in dates])
    return pa.DataFrame(data, index=dates)
def plot_leverage_resid2(results, alpha=.05, label_kwargs={}, ax=None,
                         **kwargs):
    """
    Plots leverage statistics vs. normalized residuals squared

    Parameters
    ----------
    results : results instance
        A regression results instance
    alpha : float
        Specifies the cut-off for large-standardized residuals. Residuals
        are assumed to be distributed N(0, 1) with alpha=alpha.
    label_kwargs : dict
        The keywords to pass to annotate for the labels.
    ax : Axes instance
        Matplotlib Axes instance

    Returns
    -------
    fig : matplotlib Figure
        A matplotlib figure instance.
    """
    from scipy.stats import zscore, norm
    fig, ax = utils.create_mpl_ax(ax)

    infl = results.get_influence()
    leverage = infl.hat_matrix_diag
    resid = zscore(results.resid)
    ax.plot(resid**2, leverage, 'o', **kwargs)
    ax.set_xlabel("Normalized residuals**2")
    ax.set_ylabel("Leverage")
    ax.set_title("Leverage vs. Normalized residuals squared")

    large_leverage = leverage > _high_leverage(results)
    # norm or t here if standardized?
    cutoff = norm.ppf(1.-alpha/2)
    large_resid = np.abs(resid) > cutoff
    labels = results.model.data.row_labels
    if labels is None:
        labels = lrange(int(results.nobs))
    index = np.where(np.logical_or(large_leverage, large_resid))[0]
    ax = utils.annotate_axes(index, labels, lzip(resid**2, leverage),
                             [(0, 5)] * int(results.nobs), "large",
                             ax=ax, ha="center", va="bottom")
    ax.margins(.075, .075)
    return fig
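# --- Usage sketch for the public plot_leverage_resid2 (assumes statsmodels
# and matplotlib are installed; the dataset choice is illustrative) ---
import matplotlib.pyplot as plt
import statsmodels.api as sm

data = sm.datasets.statecrime.load_pandas().data
exog = sm.add_constant(data[['hs_grad', 'urban', 'poverty']])
results = sm.OLS(data['murder'], exog).fit()
fig = sm.graphics.plot_leverage_resid2(results)
plt.show()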
def conf_int(self, alpha=.05):
    """
    Returns the confidence intervals of the marginal effects

    Parameters
    ----------
    alpha : float
        Number between 0 and 1. The confidence intervals have the
        probability 1-alpha.

    Returns
    -------
    conf_int : ndarray
        An array with lower, upper confidence intervals for the marginal
        effects.
    """
    _check_at_is_all(self.margeff_options)
    me_se = self.margeff_se
    q = norm.ppf(1 - alpha / 2)
    lower = self.margeff - q * me_se
    upper = self.margeff + q * me_se
    return np.asarray(lzip(lower, upper))
def proportions_chisquare_allpairs(count, nobs, multitest_method='hs'):
    '''chisquare test of proportions for all pairs of k samples

    Performs a chisquare test for proportions for all pairwise comparisons.
    The alternative is two-sided

    Parameters
    ----------
    count : integer or array_like
        the number of successes in nobs trials.
    nobs : integer
        the number of trials or observations.
    prop : float, optional
        The probability of success under the null hypothesis,
        `0 <= prop <= 1`. The default value is `prop = 0.5`
    multitest_method : string
        This chooses the method for the multiple testing p-value correction,
        that is used as default in the results.
        It can be any method that is available in ``multipletesting``.
        The default is Holm-Sidak 'hs'.

    Returns
    -------
    result : AllPairsResults instance
        The returned results instance has several statistics, such as
        p-values, attached, and additional methods for using a non-default
        ``multitest_method``.

    Notes
    -----
    Yates continuity correction is not available.
    '''
    # all_pairs = lmap(list, lzip(*np.triu_indices(4, 1)))
    all_pairs = lzip(*np.triu_indices(len(count), 1))
    pvals = [proportions_chisquare(count[list(pair)], nobs[list(pair)])[1]
             for pair in all_pairs]
    return AllPairsResults(pvals, all_pairs,
                           multitest_method=multitest_method)
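# --- Usage sketch (assumes statsmodels.stats.proportion exposes
# proportions_chisquare_allpairs as above; the counts are made up) ---
import numpy as np
from statsmodels.stats.proportion import proportions_chisquare_allpairs

count = np.array([30, 45, 22])   # successes in three samples
nobs = np.array([100, 110, 95])  # trials in three samples
res = proportions_chisquare_allpairs(count, nobs, multitest_method='hs')
print(res.all_pairs)          # [(0, 1), (0, 2), (1, 2)]
print(res.pval_corrected())   # Holm-Sidak corrected pairwise p-values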
def pacf(x, nlags=40, method='ywunbiased', alpha=None):
    '''Partial autocorrelation estimated

    Parameters
    ----------
    x : 1d array
        observations of time series for which pacf is calculated
    nlags : int
        largest lag for which pacf is returned
    method : 'ywunbiased' (default) or 'ywmle' or 'ols'
        specifies which method for the calculations to use:

        - yw or ywunbiased : yule walker with bias correction in denominator
          for acovf
        - ywm or ywmle : yule walker without bias correction
        - ols - regression of time series on lags of it and on constant
        - ld or ldunbiased : Levinson-Durbin recursion with bias correction
        - ldb or ldbiased : Levinson-Durbin recursion without bias correction

    alpha : scalar, optional
        If a number is given, the confidence intervals for the given level
        are returned. For instance if alpha=.05, 95 % confidence intervals
        are returned where the standard deviation is computed according to
        1/sqrt(len(x))

    Returns
    -------
    pacf : 1d array
        partial autocorrelations, nlags elements, including lag zero
    confint : array, optional
        Confidence intervals for the PACF. Returned if confint is not None.

    Notes
    -----
    This solves yule_walker equations or ols for each desired lag
    and contains currently duplicate calculations.
    '''
    if method == 'ols':
        ret = pacf_ols(x, nlags=nlags)
    elif method in ['yw', 'ywu', 'ywunbiased', 'yw_unbiased']:
        ret = pacf_yw(x, nlags=nlags, method='unbiased')
    elif method in ['ywm', 'ywmle', 'yw_mle']:
        ret = pacf_yw(x, nlags=nlags, method='mle')
    elif method in ['ld', 'ldu', 'ldunbiase', 'ld_unbiased']:
        acv = acovf(x, unbiased=True)
        ld_ = levinson_durbin(acv, nlags=nlags, isacov=True)
        # print 'ld', ld_
        ret = ld_[2]
    # inconsistent naming with ywmle
    elif method in ['ldb', 'ldbiased', 'ld_biased']:
        acv = acovf(x, unbiased=False)
        ld_ = levinson_durbin(acv, nlags=nlags, isacov=True)
        ret = ld_[2]
    else:
        raise ValueError('method not available')
    if alpha is not None:
        varacf = 1. / len(x)  # for all lags >=1
        interval = stats.norm.ppf(1. - alpha / 2.) * np.sqrt(varacf)
        confint = np.array(lzip(ret - interval, ret + interval))
        confint[0] = ret[0]  # fix confidence interval for lag 0 to varpacf=0
        return ret, confint
    else:
        return ret
def acf(x, unbiased=False, nlags=40, qstat=False, fft=False, alpha=None,
        missing='none'):
    """
    Autocorrelation function for 1d arrays.

    Parameters
    ----------
    x : array
        Time series data
    unbiased : bool
        If True, then denominators for autocovariance are n-k, otherwise n
    nlags : int, optional
        Number of lags to return autocorrelation for.
    qstat : bool, optional
        If True, returns the Ljung-Box q statistic for each autocorrelation
        coefficient.  See q_stat for more information.
    fft : bool, optional
        If True, computes the ACF via FFT.
    alpha : scalar, optional
        If a number is given, the confidence intervals for the given level
        are returned. For instance if alpha=.05, 95 % confidence intervals
        are returned where the standard deviation is computed according to
        Bartlett's formula.
    missing : str, optional
        A string in ['none', 'raise', 'conservative', 'drop'] specifying how
        the NaNs are to be treated.

    Returns
    -------
    acf : array
        autocorrelation function
    confint : array, optional
        Confidence intervals for the ACF. Returned if confint is not None.
    qstat : array, optional
        The Ljung-Box Q-Statistic.  Returned if q_stat is True.
    pvalues : array, optional
        The p-values associated with the Q-statistics.  Returned if q_stat
        is True.

    Notes
    -----
    The acf at lag 0 (ie., 1) is returned.

    This is based on np.correlate which does full convolution. For very long
    time series it is recommended to use fft convolution instead.

    If unbiased is true, the denominator for the autocovariance is adjusted
    but the autocorrelation is not an unbiased estimator.

    References
    ----------
    .. [1] Parzen, E., 1963. On spectral analysis with missing observations
       and amplitude modulation. Sankhya: The Indian Journal of
       Statistics, Series A, pp.383-392.
    """
    nobs = len(x)  # should this shrink for missing='drop' and NaNs in x?
    avf = acovf(x, unbiased=unbiased, demean=True, fft=fft, missing=missing)
    acf = avf[:nlags + 1] / avf[0]
    if not (qstat or alpha):
        return acf
    if alpha is not None:
        varacf = np.ones(nlags + 1) / nobs
        varacf[0] = 0
        varacf[1] = 1. / nobs
        varacf[2:] *= 1 + 2 * np.cumsum(acf[1:-1]**2)
        interval = stats.norm.ppf(1 - alpha / 2.) * np.sqrt(varacf)
        confint = np.array(lzip(acf - interval, acf + interval))
        if not qstat:
            return acf, confint
    if qstat:
        qstat, pvalue = q_stat(acf[1:], nobs=nobs)  # drop lag 0
        if alpha is not None:
            return acf, confint, qstat, pvalue
        else:
            return acf, qstat, pvalue
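# --- Usage sketch (assumes statsmodels.tsa.stattools.acf has the signature
# shown above; the series is synthetic) ---
import numpy as np
from statsmodels.tsa.stattools import acf

np.random.seed(0)
x = np.random.randn(200).cumsum()   # a random-walk series
r, confint, q, p = acf(x, nlags=20, alpha=0.05, qstat=True, fft=True)
print(r[:3])        # acf at lags 0, 1, 2
print(confint[1])   # Bartlett 95% interval around the lag-1 autocorrelation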
_quarter_to_day = {
    "1": (3, 31),
    "2": (6, 30),
    "3": (9, 30),
    "4": (12, 31),
    "I": (3, 31),
    "II": (6, 30),
    "III": (9, 30),
    "IV": (12, 31)
}

_mdays = [31, 28, 31, 30, 31, 30, 31, 31, 30, 31, 30, 31]
_months_with_days = lzip(lrange(1, 13), _mdays)
_month_to_day = dict(zip(map(str, lrange(1, 13)), _months_with_days))
_month_to_day.update(dict(zip(["I", "II", "III", "IV", "V", "VI",
                               "VII", "VIII", "IX", "X", "XI", "XII"],
                              _months_with_days)))

# regex patterns
_y_pattern = '^\d?\d?\d?\d$'

_q_pattern = '''
^               # beginning of string
\d?\d?\d?\d     # match any number 1-9999, includes leading zeros

(:?q)           # use q or a : as a separator

([1-4]|(I{1,3}V?))  # match 1-4 or I-IV roman numerals

$               # end of string
'''
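# --- Small illustrative check (standalone sketch; lzip and lrange here are
# assumed to come from statsmodels.compat.python, i.e. list-returning zip
# and range) ---
from statsmodels.compat.python import lrange, lzip

_mdays = [31, 28, 31, 30, 31, 30, 31, 31, 30, 31, 30, 31]
pairs = lzip(lrange(1, 13), _mdays)
print(pairs[:3])                                        # [(1, 31), (2, 28), (3, 31)]
print(dict(zip(map(str, lrange(1, 13)), pairs))["3"])   # (3, 31)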
def get_robustcov_results(self, cov_type='HC1', use_t=None, **kwds):
    """create new results instance with robust covariance as default

    Parameters
    ----------
    cov_type : string
        the type of robust sandwich estimator to use. see Notes below
    use_t : bool
        If true, then the t distribution is used for inference.
        If false, then the normal distribution is used.
    kwds : depends on cov_type
        Required or optional arguments for robust covariance calculation.
        see Notes below

    Returns
    -------
    results : results instance
        This method creates a new results instance with the requested
        robust covariance as the default covariance of the parameters.
        Inferential statistics like p-values and hypothesis tests will be
        based on this covariance matrix.

    Notes
    -----
    Warning: Some of the options and defaults in cov_kwds may be changed in
    a future version.

    The covariance keywords provide an option 'scaling_factor' to adjust the
    scaling of the covariance matrix, that is the covariance is multiplied
    by this factor if it is given and is not `None`. This allows the user to
    adjust the scaling of the covariance matrix to match other statistical
    packages.
    For example, `scaling_factor=(nobs - 1.) / (nobs - k_params)` provides a
    correction so that the robust covariance matrices match those of Stata
    in some models like GLM and discrete Models.

    The following covariance types and required or optional arguments are
    currently available:

    - 'HC0', 'HC1', 'HC2', 'HC3' and no keyword arguments:
      heteroscedasticity robust covariance
    - 'HAC' and keywords

        - `maxlag` integer (required) : number of lags to use
        - `kernel` string (optional) : kernel, default is Bartlett
        - `use_correction` bool (optional) : If true, use small sample
          correction

    - 'cluster' and required keyword `groups`, integer group indicator

        - `groups` array_like, integer (required) :
          index of clusters or groups
        - `use_correction` bool (optional) :
          If True the sandwich covariance is calculated with a small sample
          correction.
          If False the sandwich covariance is calculated without small
          sample correction.
        - `df_correction` bool (optional)
          If True (default), then the degrees of freedom for the inferential
          statistics and hypothesis tests, such as pvalues, f_pvalue,
          conf_int, and t_test and f_test, are based on the number of groups
          minus one instead of the total number of observations minus the
          number of explanatory variables. `df_resid` of the results
          instance is adjusted.  If False, then `df_resid` of the results
          instance is not adjusted.

    - 'hac-groupsum' Driscoll and Kraay, heteroscedasticity and
      autocorrelation robust standard errors in panel data
      keywords

        - `time` array_like (required) : index of time periods
        - `maxlag` integer (required) : number of lags to use
        - `kernel` string (optional) : kernel, default is Bartlett
        - `use_correction` False or string in ['hac', 'cluster'] (optional) :
          If False the sandwich covariance is calculated without small
          sample correction.
          If `use_correction = 'cluster'` (default), then the same small
          sample correction as in the case of 'covtype='cluster'' is used.
        - `df_correction` bool (optional)
          adjustment to df_resid, see cov_type 'cluster' above
          # TODO: we need more options here

    - 'hac-panel' heteroscedasticity and autocorrelation robust standard
      errors in panel data.
      The data needs to be sorted in this case, the time series for each
      panel unit or cluster need to be stacked. The membership to a
      timeseries of an individual or group can be either specified by group
      indicators or by increasing time periods.

      keywords

        - either `groups` or `time` : array_like (required)
          `groups` : indicator for groups
          `time` : index of time periods
        - `maxlag` integer (required) : number of lags to use
        - `kernel` string (optional) : kernel, default is Bartlett
        - `use_correction` False or string in ['hac', 'cluster'] (optional) :
          If False the sandwich covariance is calculated without small
          sample correction.
        - `df_correction` bool (optional)
          adjustment to df_resid, see cov_type 'cluster' above
          # TODO: we need more options here

    Reminder: `use_correction` in "hac-groupsum" and "hac-panel" is not
    bool, needs to be in [False, 'hac', 'cluster']

    TODO: Currently there is no check for extra or misspelled keywords,
    except in the case of cov_type `HCx`
    """
    import statsmodels.stats.sandwich_covariance as sw

    # normalize names
    if cov_type == 'nw-panel':
        cov_type = 'hac-panel'
    if cov_type == 'nw-groupsum':
        cov_type = 'hac-groupsum'
    if 'kernel' in kwds:
        kwds['weights_func'] = kwds.pop('kernel')

    # pop because HCx raises if any kwds
    sc_factor = kwds.pop('scaling_factor', None)

    # TODO: make separate function that returns a robust cov plus info
    use_self = kwds.pop('use_self', False)
    if use_self:
        res = self
    else:
        # this doesn't work for most models, use raw instance instead from fit
        res = self.__class__(self.model, self.params,
                             normalized_cov_params=self.normalized_cov_params,
                             scale=self.scale)

    res.cov_type = cov_type
    # use_t might already be defined by the class, and already set
    if use_t is None:
        use_t = self.use_t
    res.cov_kwds = {'use_t': use_t}  # store for information
    res.use_t = use_t

    adjust_df = False
    if cov_type in ['cluster', 'hac-panel', 'hac-groupsum']:
        df_correction = kwds.get('df_correction', None)
        # TODO: check also use_correction, do I need all combinations?
        if df_correction is not False:  # i.e. in [None, True]:
            # user didn't explicitly set it to False
            adjust_df = True

    res.cov_kwds['adjust_df'] = adjust_df

    # verify and set kwds, and calculate cov
    # TODO: this should be outsourced in a function so we can reuse it in
    # other models
    # TODO: make it DRYer   repeated code for checking kwds
    if cov_type.upper() in ('HC0', 'HC1', 'HC2', 'HC3'):
        if kwds:
            raise ValueError('heteroscedasticity robust covariance '
                             'does not use keywords')
        res.cov_kwds['description'] = ('Standard Errors are '
                                       'heteroscedasticity robust '
                                       '(' + cov_type + ')')
        res.cov_params_default = getattr(self, 'cov_' + cov_type.upper(),
                                         None)
        if res.cov_params_default is None:
            # results classes that don't have cov_HCx attribute
            res.cov_params_default = sw.cov_white_simple(
                self, use_correction=False)
    elif cov_type.lower() == 'hac':
        maxlags = kwds['maxlags']   # required?, default in cov_hac_simple
        res.cov_kwds['maxlags'] = maxlags
        weights_func = kwds.get('weights_func', sw.weights_bartlett)
        res.cov_kwds['weights_func'] = weights_func
        use_correction = kwds.get('use_correction', False)
        res.cov_kwds['use_correction'] = use_correction
        res.cov_kwds['description'] = (
            'Standard Errors are heteroscedasticity '
            'and autocorrelation robust (HAC) using %d lags and %s small '
            'sample correction') % (maxlags,
                                    ['without', 'with'][use_correction])

        res.cov_params_default = sw.cov_hac_simple(
            self, nlags=maxlags, weights_func=weights_func,
            use_correction=use_correction)
    elif cov_type.lower() == 'cluster':
        # cluster robust standard errors, one- or two-way
        groups = kwds['groups']
        if not hasattr(groups, 'shape'):
            groups = np.asarray(groups).T
        if groups.ndim >= 2:
            groups = groups.squeeze()
        res.cov_kwds['groups'] = groups
        use_correction = kwds.get('use_correction', True)
        res.cov_kwds['use_correction'] = use_correction
        if groups.ndim == 1:
            if adjust_df:
                # need to find number of groups
                # duplicate work
                self.n_groups = n_groups = len(np.unique(groups))
            res.cov_params_default = sw.cov_cluster(
                self, groups, use_correction=use_correction)
        elif groups.ndim == 2:
            if hasattr(groups, 'values'):
                groups = groups.values

            if adjust_df:
                # need to find number of groups
                # duplicate work
                n_groups0 = len(np.unique(groups[:, 0]))
                n_groups1 = len(np.unique(groups[:, 1]))
                self.n_groups = (n_groups0, n_groups1)
                n_groups = min(n_groups0, n_groups1)  # use for adjust_df

            # Note: sw.cov_cluster_2groups has 3 returns
            res.cov_params_default = sw.cov_cluster_2groups(
                self, groups, use_correction=use_correction)[0]
        else:
            raise ValueError('only two groups are supported')
        res.cov_kwds['description'] = ('Standard Errors are robust to '
                                       'cluster correlation '
                                       '(' + cov_type + ')')

    elif cov_type.lower() == 'hac-panel':
        # cluster robust standard errors
        res.cov_kwds['time'] = time = kwds.get('time', None)
        res.cov_kwds['groups'] = groups = kwds.get('groups', None)
        # TODO: nlags is currently required
        # nlags = kwds.get('nlags', True)
        # res.cov_kwds['nlags'] = nlags
        # TODO: `nlags` or `maxlags`
        res.cov_kwds['maxlags'] = maxlags = kwds['maxlags']
        use_correction = kwds.get('use_correction', 'hac')
        res.cov_kwds['use_correction'] = use_correction
        weights_func = kwds.get('weights_func', sw.weights_bartlett)
        res.cov_kwds['weights_func'] = weights_func
        # TODO: clumsy time index in cov_nw_panel
        if groups is not None:
            groups = np.asarray(groups)
            tt = (np.nonzero(groups[:-1] != groups[1:])[0] + 1).tolist()
            nobs_ = len(groups)
        elif time is not None:
            # TODO: clumsy time index in cov_nw_panel
            time = np.asarray(time)
            tt = (np.nonzero(time[1:] < time[:-1])[0] + 1).tolist()
            nobs_ = len(time)
        else:
            raise ValueError('either time or groups needs to be given')
        groupidx = lzip([0] + tt, tt + [nobs_])
        self.n_groups = n_groups = len(groupidx)
        res.cov_params_default = sw.cov_nw_panel(
            self, maxlags, groupidx, weights_func=weights_func,
            use_correction=use_correction)
        res.cov_kwds['description'] = ('Standard Errors are robust to '
                                       'cluster correlation '
                                       '(' + cov_type + ')')
    elif cov_type.lower() == 'hac-groupsum':
        # Driscoll-Kraay standard errors
        res.cov_kwds['time'] = time = kwds['time']
        # TODO: nlags is currently required
        # nlags = kwds.get('nlags', True)
        # res.cov_kwds['nlags'] = nlags
        # TODO: `nlags` or `maxlags`
        res.cov_kwds['maxlags'] = maxlags = kwds['maxlags']
        use_correction = kwds.get('use_correction', 'cluster')
        res.cov_kwds['use_correction'] = use_correction
        weights_func = kwds.get('weights_func', sw.weights_bartlett)
        res.cov_kwds['weights_func'] = weights_func
        if adjust_df:
            # need to find number of groups
            tt = (np.nonzero(time[1:] < time[:-1])[0] + 1)
            self.n_groups = n_groups = len(tt) + 1
        res.cov_params_default = sw.cov_nw_groupsum(
            self, maxlags, time, weights_func=weights_func,
            use_correction=use_correction)
        res.cov_kwds['description'] = (
            'Driscoll and Kraay Standard Errors are robust to '
            'cluster correlation ' + '(' + cov_type + ')')
    else:
        raise ValueError('cov_type not recognized. See docstring for '
                         'available options and spelling')

    # generic optional factor to scale covariance
    res.cov_kwds['scaling_factor'] = sc_factor
    if sc_factor is not None:
        res.cov_params_default *= sc_factor

    if adjust_df:
        # Note: df_resid is used for scale and others, add new attribute
        res.df_resid_inference = n_groups - 1

    return res
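# --- Usage sketch (assumes an OLS results instance from statsmodels; the
# same machinery is also reachable via fit(cov_type=..., cov_kwds=...);
# data are synthetic) ---
import numpy as np
import statsmodels.api as sm

rng = np.random.default_rng(0)
groups = np.repeat(np.arange(20), 5)              # 20 clusters of 5 obs each
x = sm.add_constant(rng.normal(size=100))
y = x @ np.array([1.0, 0.5]) + rng.normal(size=100)
ols_res = sm.OLS(y, x).fit()
robust = ols_res.get_robustcov_results(cov_type='cluster', groups=groups)
print(robust.bse)    # cluster-robust standard errors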
def __init__(self, group, name=''):
    super(self.__class__, self).__init__(group, name=name)
    idx = (np.nonzero(np.diff(group))[0] + 1).tolist()
    self.groupidx = lzip([0] + idx, idx + [len(group)])
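# --- Illustration of the start/end pairing built in __init__ above
# (standalone sketch; lzip is assumed to be statsmodels.compat.python.lzip) ---
import numpy as np
from statsmodels.compat.python import lzip

group = np.array([0, 0, 0, 1, 1, 2, 2, 2, 2])
idx = (np.nonzero(np.diff(group))[0] + 1).tolist()
print(lzip([0] + idx, idx + [len(group)]))   # [(0, 3), (3, 5), (5, 9)]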
def qqline(ax, line, x=None, y=None, dist=None, fmt="r-", **lineoptions):
    """
    Plot a reference line for a qqplot.

    Parameters
    ----------
    ax : matplotlib axes instance
        The axes on which to plot the line
    line : str {"45","r","s","q"}
        Options for the reference line to which the data is compared.:

        - "45" - 45-degree line
        - "s"  - standardized line, the expected order statistics are scaled
          by the standard deviation of the given sample and have the mean
          added to them
        - "r"  - A regression line is fit
        - "q"  - A line is fit through the quartiles.
        - None - By default no reference line is added to the plot.

    x : ndarray
        X data for plot. Not needed if line is "45".
    y : ndarray
        Y data for plot. Not needed if line is "45".
    dist : scipy.stats.distribution
        A scipy.stats distribution, needed if line is "q".
    fmt : str, optional
        Line format string passed to `plot`.
    **lineoptions
        Additional arguments to be passed to the `plot` command.

    Notes
    -----
    There is no return value. The line is plotted on the given `ax`.

    Examples
    --------
    Import the food expenditure dataset.  Plot annual food expenditure on
    x-axis and household income on y-axis.  Use qqline to add regression
    line into the plot.

    >>> import statsmodels.api as sm
    >>> import numpy as np
    >>> import matplotlib.pyplot as plt
    >>> from statsmodels.graphics.gofplots import qqline
    >>> foodexp = sm.datasets.engel.load()
    >>> x = foodexp.exog
    >>> y = foodexp.endog
    >>> ax = plt.subplot(111)
    >>> plt.scatter(x, y)
    >>> ax.set_xlabel(foodexp.exog_name[0])
    >>> ax.set_ylabel(foodexp.endog_name)
    >>> qqline(ax, "r", x, y)
    >>> plt.show()

    .. plot:: plots/graphics_gofplots_qqplot_qqline.py
    """
    lineoptions = lineoptions.copy()
    for ls in ("-", "--", "-.", ":"):
        if ls in fmt:
            lineoptions.setdefault("linestyle", ls)
            fmt = fmt.replace(ls, "")
            break
    markers = (".", ",", "o", "v", "^", "<", ">", "1", "2", "3", "4", "8",
               "s", "p", "P", "*", "h", "H", "+", "x", "X", "D", "d",
               "|", "_")
    for marker in markers:
        if marker in fmt:
            lineoptions.setdefault("marker", marker)
            fmt = fmt.replace(marker, "")
            break
    if fmt:
        lineoptions.setdefault("color", fmt)

    if line == "45":
        end_pts = lzip(ax.get_xlim(), ax.get_ylim())
        end_pts[0] = min(end_pts[0])
        end_pts[1] = max(end_pts[1])
        ax.plot(end_pts, end_pts, **lineoptions)
        ax.set_xlim(end_pts)
        ax.set_ylim(end_pts)
        return  # does this have any side effects?
    if x is None or y is None:
        raise ValueError("If line is not 45, x and y cannot be None.")
    x = np.array(x)
    y = np.array(y)
    if line == "r":
        # could use ax.lines[0].get_xdata(), get_ydata(),
        # but don't know axes are "clean"
        y = OLS(y, add_constant(x)).fit().fittedvalues
        ax.plot(x, y, **lineoptions)
    elif line == "s":
        m, b = np.std(y), np.mean(y)
        ref_line = x * m + b
        ax.plot(x, ref_line, **lineoptions)
    elif line == "q":
        _check_for(dist, "ppf")
        q25 = stats.scoreatpercentile(y, 25)
        q75 = stats.scoreatpercentile(y, 75)
        theoretical_quartiles = dist.ppf([0.25, 0.75])
        m = (q75 - q25) / np.diff(theoretical_quartiles)
        b = q25 - m * theoretical_quartiles[0]
        ax.plot(x, m * x + b, **lineoptions)
def pacf(x, nlags=40, method='ywunbiased', alpha=None):
    """
    Partial autocorrelation estimated

    Parameters
    ----------
    x : 1d array
        observations of time series for which pacf is calculated
    nlags : int
        largest lag for which pacf is returned
    method : {'ywunbiased', 'ywmle', 'ols'}
        specifies which method for the calculations to use:

        - yw or ywunbiased : yule walker with bias correction in denominator
          for acovf. Default.
        - ywm or ywmle : yule walker without bias correction
        - ols - regression of time series on lags of it and on constant
        - ld or ldunbiased : Levinson-Durbin recursion with bias correction
        - ldb or ldbiased : Levinson-Durbin recursion without bias correction

    alpha : float, optional
        If a number is given, the confidence intervals for the given level
        are returned. For instance if alpha=.05, 95 % confidence intervals
        are returned where the standard deviation is computed according to
        1/sqrt(len(x))

    Returns
    -------
    pacf : 1d array
        partial autocorrelations, nlags elements, including lag zero
    confint : array, optional
        Confidence intervals for the PACF. Returned if confint is not None.

    Notes
    -----
    This solves yule_walker equations or ols for each desired lag
    and contains currently duplicate calculations.
    """
    if method == 'ols':
        ret = pacf_ols(x, nlags=nlags)
    elif method in ['yw', 'ywu', 'ywunbiased', 'yw_unbiased']:
        ret = pacf_yw(x, nlags=nlags, method='unbiased')
    elif method in ['ywm', 'ywmle', 'yw_mle']:
        ret = pacf_yw(x, nlags=nlags, method='mle')
    elif method in ['ld', 'ldu', 'ldunbiase', 'ld_unbiased']:
        acv = acovf(x, unbiased=True)
        ld_ = levinson_durbin(acv, nlags=nlags, isacov=True)
        # print 'ld', ld_
        ret = ld_[2]
    # inconsistent naming with ywmle
    elif method in ['ldb', 'ldbiased', 'ld_biased']:
        acv = acovf(x, unbiased=False)
        ld_ = levinson_durbin(acv, nlags=nlags, isacov=True)
        ret = ld_[2]
    else:
        raise ValueError('method not available')
    if alpha is not None:
        varacf = 1. / len(x)  # for all lags >=1
        interval = stats.norm.ppf(1. - alpha / 2.) * np.sqrt(varacf)
        confint = np.array(lzip(ret - interval, ret + interval))
        confint[0] = ret[0]  # fix confidence interval for lag 0 to varpacf=0
        return ret, confint
    else:
        return ret
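# --- Usage sketch (assumes statsmodels.tsa.stattools.pacf and
# statsmodels.tsa.arima_process.arma_generate_sample are available) ---
import numpy as np
from statsmodels.tsa.arima_process import arma_generate_sample
from statsmodels.tsa.stattools import pacf

np.random.seed(0)
x = arma_generate_sample(ar=[1, -0.6], ma=[1], nsample=500)  # AR(1), phi=0.6
vals, confint = pacf(x, nlags=10, alpha=0.05)
print(vals[1])      # roughly 0.6 for this AR(1) process
print(confint[1])   # 95% interval around the lag-1 partial autocorrelation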
def genfromdta(fname, missing_flt=-999., encoding=None, pandas=False,
               convert_dates=True):
    """
    Returns an ndarray or DataFrame from a Stata .dta file.

    Parameters
    ----------
    fname : str or filehandle
        Stata .dta file.
    missing_flt : numeric
        The numeric value to replace missing values with. Will be used for
        any numeric value.
    encoding : string, optional
        Used for Python 3 only. Encoding to use when reading the .dta file.
        Defaults to `locale.getpreferredencoding`
    pandas : bool
        Optionally return a DataFrame instead of an ndarray
    convert_dates : bool
        If convert_dates is True, then Stata formatted dates will be
        converted to datetime types according to the variable's format.
    """
    if isinstance(fname, string_types):
        fhd = StataReader(open(fname, 'rb'), missing_values=False,
                          encoding=encoding)
    elif not hasattr(fname, 'read'):
        raise TypeError("The input should be a string or a filehandle. "
                        "(got %s instead)" % type(fname))
    else:
        fhd = StataReader(fname, missing_values=False, encoding=encoding)
    # validate_names = np.lib._iotools.NameValidator(
    #     excludelist=excludelist, deletechars=deletechars,
    #     case_sensitive=case_sensitive)

    # TODO: This needs to handle the byteorder?
    header = fhd.file_headers()
    types = header['dtyplist']
    nobs = header['nobs']
    numvars = header['nvar']
    varnames = header['varlist']
    fmtlist = header['fmtlist']
    dataname = header['data_label']
    labels = header['vlblist']  # labels are thrown away unless DataArray
                                # type is used
    data = np.zeros((nobs, numvars))
    stata_dta = fhd.dataset()

    dt = np.dtype(lzip(varnames, types))
    data = np.zeros((nobs), dtype=dt)  # init final array

    for rownum, line in enumerate(stata_dta):
        # doesn't handle missing value objects, just casts
        # None will only work without missing value object.
        if None in line:
            for i, val in enumerate(line):
                # NOTE: This will only be scalar types because missing
                # strings are empty not None in Stata
                if val is None:
                    line[i] = missing_flt
        data[rownum] = tuple(line)

    if pandas:
        from pandas import DataFrame
        data = DataFrame.from_records(data)
        if convert_dates:
            cols = np.where(lmap(lambda x: x in _date_formats, fmtlist))[0]
            for col in cols:
                i = col
                col = data.columns[col]
                data[col] = data[col].apply(_stata_elapsed_date_to_datetime,
                                            args=(fmtlist[i],))
    elif convert_dates:
        # date_cols = np.where(map(lambda x : x in _date_formats,
        #                          fmtlist))[0]
        # make the dtype for the datetime types
        cols = np.where(lmap(lambda x: x in _date_formats, fmtlist))[0]
        dtype = data.dtype.descr
        dtype = [(dt[0], object) if i in cols else dt
                 for i, dt in enumerate(dtype)]
        data = data.astype(dtype)  # have to copy

        for col in cols:
            def convert(x):
                return _stata_elapsed_date_to_datetime(x, fmtlist[col])
            data[data.dtype.names[col]] = lmap(convert,
                                               data[data.dtype.names[col]])
    return data
class StataReader(object):
    """
    Stata .dta file reader.

    Provides methods to return the metadata of a Stata .dta file and
    a generator for the data itself.

    Parameters
    ----------
    file : file-like
        A file-like object representing a Stata .dta file.
    missing_values : bool
        If missing_values is True, parse missing_values and return a
        Missing Values object instead of None.
    encoding : string, optional
        Used for Python 3 only. Encoding to use when reading the .dta file.
        Defaults to `locale.getpreferredencoding`

    See also
    --------
    statsmodels.lib.io.genfromdta

    Notes
    -----
    This is known only to work on file formats 113 (Stata 8/9),
    114 (Stata 10/11), and 115 (Stata 12).  Needs to be tested on older
    versions.  Known not to work on format 104, 108. If you have the
    documentation for older formats, please contact the developers.

    For more information about the .dta format see
    http://www.stata.com/help.cgi?dta
    http://www.stata.com/help.cgi?dta_113
    """

    _header = {}
    _data_location = 0
    _col_sizes = ()
    _has_string_data = False
    _missing_values = False
    # type          code
    # --------------------
    # str1        1 = 0x01
    # str2        2 = 0x02
    # ...
    # str244    244 = 0xf4
    # byte      251 = 0xfb  (sic)
    # int       252 = 0xfc
    # long      253 = 0xfd
    # float     254 = 0xfe
    # double    255 = 0xff
    # --------------------
    # NOTE: the byte type seems to be reserved for categorical variables
    # with a label, but the underlying variable is -127 to 100
    # we're going to drop the label and cast to int
    DTYPE_MAP = dict(lzip(lrange(1, 245),
                          ['a' + str(i) for i in range(1, 245)]) +
                     [(251, np.int16), (252, np.int32), (253, int),
                      (254, np.float32), (255, np.float64)])
    TYPE_MAP = lrange(251) + list('bhlfd')
    # NOTE: technically, some of these are wrong. there are more numbers
    # that can be represented. it's the 27 ABOVE and BELOW the max listed
    # numeric data type in [U] 12.2.2 of the 11.2 manual
    MISSING_VALUES = {'b': (-127, 100), 'h': (-32767, 32740),
                      'l': (-2147483647, 2147483620),
                      'f': (-1.701e+38, +1.701e+38),
                      'd': (-1.798e+308, +8.988e+307)}

    def __init__(self, fname, missing_values=False, encoding=None):
        if encoding is None:
            import locale
            self._encoding = locale.getpreferredencoding()
        else:
            self._encoding = encoding
        self._missing_values = missing_values
        self._parse_header(fname)

    def file_headers(self):
        """
        Returns all .dta file headers.

        out: dict
            Has keys typlist, data_label, lbllist, varlist, nvar, filetype,
            ds_format, nobs, fmtlist, vlblist, time_stamp, srtlist, byteorder
        """
        return self._header

    def file_format(self):
        """
        Returns the file format.

        Returns
        -------
        out : int

        Notes
        -----
        Format 113: Stata 8/9
        Format 114: Stata 10/11
        Format 115: Stata 12
        """
        return self._header['ds_format']

    def file_label(self):
        """
        Returns the dataset's label.

        Returns
        -------
        out : string
        """
        return self._header['data_label']

    def file_timestamp(self):
        """
        Returns the date and time Stata recorded on last file save.

        Returns
        -------
        out : str
        """
        return self._header['time_stamp']

    def variables(self):
        """
        Returns a list of the dataset's StataVariables objects.
        """
        return lmap(_StataVariable, zip(lrange(self._header['nvar']),
                                        self._header['typlist'],
                                        self._header['varlist'],
                                        self._header['srtlist'],
                                        self._header['fmtlist'],
                                        self._header['lbllist'],
                                        self._header['vlblist']))

    def dataset(self, as_dict=False):
        """
        Returns a Python generator object for iterating over the dataset.

        Parameters
        ----------
        as_dict : bool, optional
            If as_dict is True, yield each row of observations as a dict.
            If False, yields each row of observations as a list.

        Returns
        -------
        Generator object for iterating over the dataset.  Yields each row of
        observations as a list by default.

        Notes
        -----
        If missing_values is True during instantiation of StataReader then
        observations with _StataMissingValue(s) are not filtered and should
        be handled by your application.
        """
        try:
            self._file.seek(self._data_location)
        except Exception:
            pass

        if as_dict:
            vars = lmap(str, self.variables())
            for i in range(len(self)):
                yield dict(zip(vars, self._next()))
        else:
            for i in range(self._header['nobs']):
                yield self._next()

    ### Python special methods

    def __len__(self):
        """
        Return the number of observations in the dataset.

        This value is taken directly from the header and includes
        observations with missing values.
        """
        return self._header['nobs']

    def __getitem__(self, k):
        """
        Seek to an observation indexed k in the file and return it, ordered
        by Stata's output to the .dta file.

        k is zero-indexed.  Prefer using R.data() for performance.
        """
        if not (isinstance(k, (int, long))) or k < 0 or k > len(self) - 1:
            raise IndexError(k)
        loc = self._data_location + sum(self._col_size()) * k
        if self._file.tell() != loc:
            self._file.seek(loc)
        return self._next()

    ### Private methods

    def _null_terminate(self, s, encoding):
        if PY3:  # have bytes not strings, so must decode
            null_byte = asbytes('\x00')
            try:
                s = s.lstrip(null_byte)[:s.index(null_byte)]
            except:
                pass
            return s.decode(encoding)
        else:
            null_byte = asbytes('\x00')
            try:
                return s.lstrip(null_byte)[:s.index(null_byte)]
            except:
                return s

    def _parse_header(self, file_object):
        self._file = file_object
        encoding = self._encoding

        # parse headers
        self._header['ds_format'] = unpack('b', self._file.read(1))[0]

        if self._header['ds_format'] not in [113, 114, 115]:
            raise ValueError("Only file formats >= 113 (Stata >= 9)"
                             " are supported.  Got format %s.  Please report"
                             " if you think this error is incorrect." %
                             self._header['ds_format'])
        byteorder = self._header['byteorder'] = unpack(
            'b', self._file.read(1))[0] == 0x1 and '>' or '<'
        self._header['filetype'] = unpack('b', self._file.read(1))[0]
        self._file.read(1)

        nvar = self._header['nvar'] = unpack(byteorder + 'h',
                                             self._file.read(2))[0]
        self._header['nobs'] = unpack(byteorder + 'i',
                                      self._file.read(4))[0]
        self._header['data_label'] = self._null_terminate(
            self._file.read(81), encoding)
        self._header['time_stamp'] = self._null_terminate(
            self._file.read(18), encoding)

        # parse descriptors
        typlist = [ord(self._file.read(1)) for i in range(nvar)]
        self._header['typlist'] = [self.TYPE_MAP[typ] for typ in typlist]
        self._header['dtyplist'] = [self.DTYPE_MAP[typ] for typ in typlist]
        self._header['varlist'] = [
            self._null_terminate(self._file.read(33), encoding)
            for i in range(nvar)
        ]
        self._header['srtlist'] = unpack(byteorder + ('h' * (nvar + 1)),
                                         self._file.read(2 * (nvar + 1)))[:-1]
        if self._header['ds_format'] <= 113:
            self._header['fmtlist'] = \
                [self._null_terminate(self._file.read(12), encoding)
                 for i in range(nvar)]
        else:
            self._header['fmtlist'] = \
                [self._null_terminate(self._file.read(49), encoding)
                 for i in range(nvar)]
        self._header['lbllist'] = [
            self._null_terminate(self._file.read(33), encoding)
            for i in range(nvar)
        ]
        self._header['vlblist'] = [
            self._null_terminate(self._file.read(81), encoding)
            for i in range(nvar)
        ]

        # ignore expansion fields
        # When reading, read five bytes; the last four bytes now tell you
        # the size of the next read, which you discard.  You then continue
        # like this until you read 5 bytes of zeros.
        while True:
            data_type = unpack(byteorder + 'b', self._file.read(1))[0]
            data_len = unpack(byteorder + 'i', self._file.read(4))[0]
            if data_type == 0:
                break
            self._file.read(data_len)

        # other state vars
        self._data_location = self._file.tell()
        self._has_string_data = len(lfilter(lambda x: isinstance(x, int),
                                            self._header['typlist'])) > 0
        self._col_size()

    def _calcsize(self, fmt):
        return isinstance(fmt, int) and fmt or \
            calcsize(self._header['byteorder'] + fmt)

    def _col_size(self, k=None):
        """Calculate size of a data record."""
        if len(self._col_sizes) == 0:
            self._col_sizes = lmap(lambda x: self._calcsize(x),
                                   self._header['typlist'])
        if k is None:
            return self._col_sizes
        else:
            return self._col_sizes[k]

    def _unpack(self, fmt, byt):
        d = unpack(self._header['byteorder'] + fmt, byt)[0]
        if fmt[-1] in self.MISSING_VALUES:
            nmin, nmax = self.MISSING_VALUES[fmt[-1]]
            if d < nmin or d > nmax:
                if self._missing_values:
                    return _StataMissingValue(nmax, d)
                else:
                    return None
        return d

    def _next(self):
        typlist = self._header['typlist']
        if self._has_string_data:
            data = [None] * self._header['nvar']
            for i in range(len(data)):
                if isinstance(typlist[i], int):
                    data[i] = self._null_terminate(
                        self._file.read(typlist[i]), self._encoding)
                else:
                    data[i] = self._unpack(
                        typlist[i], self._file.read(self._col_size(i)))
            return data
        else:
            return lmap(
                lambda i: self._unpack(typlist[i],
                                       self._file.read(self._col_size(i))),
                lrange(self._header['nvar']))
class StataWriter(object):
    """
    A class for writing Stata binary dta files from array-like objects

    Parameters
    ----------
    fname : file path or buffer
        Where to save the dta file.
    data : array-like
        Array-like input to save. Pandas objects are also accepted.
    convert_dates : dict
        Dictionary mapping column of datetime types to the stata internal
        format that you want to use for the dates. Options are
        'tc', 'td', 'tm', 'tw', 'th', 'tq', 'ty'. Column can be either a
        number or a name.
    encoding : str
        Default is latin-1. Note that Stata does not support unicode.
    byteorder : str
        Can be ">", "<", "little", or "big". The default is None which uses
        `sys.byteorder`

    Returns
    -------
    writer : StataWriter instance
        The StataWriter instance has a write_file method, which will
        write the file to the given `fname`.

    Examples
    --------
    >>> writer = StataWriter('./data_file.dta', data)
    >>> writer.write_file()

    Or with dates

    >>> writer = StataWriter('./date_data_file.dta', date, {2 : 'tw'})
    >>> writer.write_file()
    """
    # type          code
    # --------------------
    # str1        1 = 0x01
    # str2        2 = 0x02
    # ...
    # str244    244 = 0xf4
    # byte      251 = 0xfb  (sic)
    # int       252 = 0xfc
    # long      253 = 0xfd
    # float     254 = 0xfe
    # double    255 = 0xff
    # --------------------
    # NOTE: the byte type seems to be reserved for categorical variables
    # with a label, but the underlying variable is -127 to 100
    # we're going to drop the label and cast to int
    DTYPE_MAP = dict(lzip(lrange(1, 245),
                          ['a' + str(i) for i in range(1, 245)]) +
                     [(251, np.int16), (252, np.int32), (253, int),
                      (254, np.float32), (255, np.float64)])
    TYPE_MAP = lrange(251) + list('bhlfd')
    MISSING_VALUES = {'b': 101,
                      'h': 32741,
                      'l': 2147483621,
                      'f': 1.7014118346046923e+38,
                      'd': 8.98846567431158e+307}

    def __init__(self, fname, data, convert_dates=None, encoding="latin-1",
                 byteorder=None):
        self._convert_dates = convert_dates
        # attach nobs, nvars, data, varlist, typlist
        if data_util._is_using_pandas(data, None):
            self._prepare_pandas(data)
        elif data_util._is_array_like(data, None):
            data = np.asarray(data)
            if data_util._is_structured_ndarray(data):
                self._prepare_structured_array(data)
            else:
                if convert_dates is not None:
                    raise ValueError("Not able to convert dates in a plain"
                                     " ndarray.")
                self._prepare_ndarray(data)
        else:  # pragma : no cover
            raise ValueError("Type %s for data not understood" % type(data))

        if byteorder is None:
            byteorder = sys.byteorder
        self._byteorder = _set_endianness(byteorder)
        self._encoding = encoding
        self._file = get_file_obj(fname, 'wb', encoding)

    def _write(self, to_write):
        """
        Helper to call asbytes before writing to file for Python 3 compat.
        """
        self._file.write(asbytes(to_write))

    def _prepare_structured_array(self, data):
        self.nobs = len(data)
        self.nvar = len(data.dtype)
        self.data = data
        self.datarows = iter(data)
        dtype = data.dtype
        descr = dtype.descr
        if dtype.names is None:
            varlist = _default_names(nvar)
        else:
            varlist = dtype.names

        # check for datetime and change the type
        convert_dates = self._convert_dates
        if convert_dates is not None:
            convert_dates = _maybe_convert_to_int_keys(convert_dates,
                                                       varlist)
            self._convert_dates = convert_dates
            for key in convert_dates:
                descr[key] = (
                    descr[key][0],
                    _convert_datetime_to_stata_type(convert_dates[key]))
            dtype = np.dtype(descr)

        self.varlist = varlist
        self.typlist = [_dtype_to_stata_type(dtype[i])
                        for i in range(self.nvar)]
        self.fmtlist = [_dtype_to_default_stata_fmt(dtype[i])
                        for i in range(self.nvar)]
        # set the given format for the datetime cols
        if convert_dates is not None:
            for key in convert_dates:
                self.fmtlist[key] = convert_dates[key]

    def _prepare_ndarray(self, data):
        if data.ndim == 1:
            data = data[:, None]
        self.nobs, self.nvar = data.shape
        self.data = data
        self.datarows = iter(data)
        # TODO: this should be user settable
        dtype = data.dtype
        self.varlist = _default_names(self.nvar)
        self.typlist = [_dtype_to_stata_type(dtype)
                        for i in range(self.nvar)]
        self.fmtlist = [_dtype_to_default_stata_fmt(dtype)
                        for i in range(self.nvar)]

    def _prepare_pandas(self, data):
        # NOTE: we might need a different API / class for pandas objects so
        # we can set different semantics - handle this with a PR to pandas.io

        class DataFrameRowIter(object):
            def __init__(self, data):
                self.data = data

            def __iter__(self):
                for i, row in data.iterrows():
                    yield row

        data = data.reset_index()
        self.datarows = DataFrameRowIter(data)
        self.nobs, self.nvar = data.shape
        self.data = data
        self.varlist = data.columns.tolist()
        dtypes = data.dtypes
        convert_dates = self._convert_dates
        if convert_dates is not None:
            convert_dates = _maybe_convert_to_int_keys(convert_dates,
                                                       self.varlist)
            self._convert_dates = convert_dates
            for key in convert_dates:
                new_type = _convert_datetime_to_stata_type(
                    convert_dates[key])
                dtypes[key] = np.dtype(new_type)
        self.typlist = [_dtype_to_stata_type(dt) for dt in dtypes]
        self.fmtlist = [_dtype_to_default_stata_fmt(dt) for dt in dtypes]
        # set the given format for the datetime cols
        if convert_dates is not None:
            for key in convert_dates:
                self.fmtlist[key] = convert_dates[key]

    def write_file(self):
        self._write_header()
        self._write_descriptors()
        self._write_variable_labels()
        # write 5 zeros for expansion fields
        self._write(_pad_bytes("", 5))
        if self._convert_dates is None:
            self._write_data_nodates()
        else:
            self._write_data_dates()
        # self._write_value_labels()

    def _write_header(self, data_label=None, time_stamp=None):
        byteorder = self._byteorder
        # ds_format - just use 114
        self._write(pack("b", 114))
        # byteorder
        self._write(byteorder == ">" and "\x01" or "\x02")
        # filetype
        self._write("\x01")
        # unused
        self._write("\x00")
        # number of vars, 2 bytes
        self._write(pack(byteorder + "h", self.nvar)[:2])
        # number of obs, 4 bytes
        self._write(pack(byteorder + "i", self.nobs)[:4])
        # data label 81 bytes, char, null terminated
        if data_label is None:
            self._write(self._null_terminate(_pad_bytes("", 80),
                                             self._encoding))
        else:
            self._write(self._null_terminate(_pad_bytes(data_label[:80], 80),
                                             self._encoding))
        # time stamp, 18 bytes, char, null terminated
        # format dd Mon yyyy hh:mm
        if time_stamp is None:
            time_stamp = datetime.datetime.now()
        elif not isinstance(time_stamp, datetime):
            raise ValueError("time_stamp should be datetime type")
        self._write(self._null_terminate(
            time_stamp.strftime("%d %b %Y %H:%M"), self._encoding))

    def _write_descriptors(self, typlist=None, varlist=None, srtlist=None,
                           fmtlist=None, lbllist=None):
        nvar = self.nvar
        # typlist, length nvar, format byte array
        for typ in self.typlist:
            self._write(typ)

        # varlist, length 33*nvar, char array, null terminated
        for name in self.varlist:
            name = self._null_terminate(name, self._encoding)
            name = _pad_bytes(asstr(name[:32]), 33)
            self._write(name)

        # srtlist, 2*(nvar+1), int array, encoded by byteorder
        srtlist = _pad_bytes("", (2 * (nvar + 1)))
        self._write(srtlist)

        # fmtlist, 49*nvar, char array
        for fmt in self.fmtlist:
            self._write(_pad_bytes(fmt, 49))

        # lbllist, 33*nvar, char array
        # NOTE: this is where you could get fancy with pandas categorical
        # type
        for i in range(nvar):
            self._write(_pad_bytes("", 33))

    def _write_variable_labels(self, labels=None):
        nvar = self.nvar
        if labels is None:
            for i in range(nvar):
                self._write(_pad_bytes("", 81))

    def _write_data_nodates(self):
        data = self.datarows
        byteorder = self._byteorder
        TYPE_MAP = self.TYPE_MAP
        typlist = self.typlist
        for row in data:
            # row = row.squeeze().tolist()  # needed for structured arrays
            for i, var in enumerate(row):
                typ = ord(typlist[i])
                if typ <= 244:  # we've got a string
                    if len(var) < typ:
                        var = _pad_bytes(asstr(var), len(var) + 1)
                    self._write(var)
                else:
                    try:
                        self._write(pack(byteorder + TYPE_MAP[typ], var))
                    except struct_error:
                        # have to be strict about type pack won't do any
                        # kind of casting
                        self._write(pack(byteorder + TYPE_MAP[typ],
                                         _type_converters[typ](var)))

    def _write_data_dates(self):
        convert_dates = self._convert_dates
        data = self.datarows
        byteorder = self._byteorder
        TYPE_MAP = self.TYPE_MAP
        MISSING_VALUES = self.MISSING_VALUES
        typlist = self.typlist
        for row in data:
            # row = row.squeeze().tolist()  # needed for structured arrays
            for i, var in enumerate(row):
                typ = ord(typlist[i])
                # NOTE: If anyone finds this terribly slow, there is
                # a vectorized way to convert dates, see genfromdta for
                # going from int to datetime and reverse it. will copy data
                # though
                if i in convert_dates:
                    var = _datetime_to_stata_elapsed(var, self.fmtlist[i])
                if typ <= 244:  # we've got a string
                    if isnull(var):
                        var = ""  # missing string
                    if len(var) < typ:
                        var = _pad_bytes(var, len(var) + 1)
                    self._write(var)
                else:
                    if isnull(var):  # this only matters for floats
                        var = MISSING_VALUES[typ]
                    self._write(pack(byteorder + TYPE_MAP[typ], var))

    def _null_terminate(self, s, encoding):
        null_byte = '\x00'
        if PY3:
            s += null_byte
            return s.encode(encoding)
        else:
            s += null_byte
            return s
def plot_partregress(endog, exog_i, exog_others, data=None, title_kwargs={}, obs_labels=True, label_kwargs={}, ax=None, ret_coords=False, **kwargs): """Plot partial regression for a single regressor. Parameters ---------- endog : {ndarray, str} endogenous or response variable. If string is given, you can use a arbitrary translations as with a formula. exog_i : {ndarray, str} exogenous, explanatory variable. If string is given, you can use a arbitrary translations as with a formula. exog_others : {ndarray, list[str]} other exogenous, explanatory variables. If a list of strings is given, each item is a term in formula. You can use a arbitrary translations as with a formula. The effect of these variables will be removed by OLS regression. data : DataFrame, dict, or recarray Some kind of data structure with names if the other variables are given as strings. title_kwargs : dict Keyword arguments to pass on for the title. The key to control the fonts is fontdict. obs_labels : bool or array_like Whether or not to annotate the plot points with their observation labels. If obs_labels is a boolean, the point labels will try to do the right thing. First it will try to use the index of data, then fall back to the index of exog_i. Alternatively, you may give an array-like object corresponding to the observation numbers. labels_kwargs : dict Keyword arguments that control annotate for the observation labels. ax : Matplotlib AxesSubplot instance, optional If given, this subplot is used to plot in instead of a new figure being created. ret_coords : bool If True will return the coordinates of the points in the plot. You can use this to add your own annotations. **kwargs The keyword arguments passed to plot for the points. Returns ------- fig : Figure If `ax` is None, the created figure. Otherwise the figure to which `ax` is connected. coords : list, optional If ret_coords is True, return a tuple of arrays (x_coords, y_coords). Notes ----- The slope of the fitted line is the that of `exog_i` in the full multiple regression. The individual points can be used to assess the influence of points on the estimated coefficient. See Also -------- plot_partregress_grid : Plot partial regression for a set of regressors. Examples -------- Load the Statewide Crime data set and plot partial regression of the rate of high school graduation (hs_grad) on the murder rate(murder). The effects of the percent of the population living in urban areas (urban), below the poverty line (poverty) , and in a single person household (single) are removed by OLS regression. >>> import statsmodels.api as sm >>> import matplotlib.pyplot as plt >>> crime_data = sm.datasets.statecrime.load_pandas() >>> sm.graphics.plot_partregress(endog='murder', exog_i='hs_grad', ... exog_others=['urban', 'poverty', 'single'], ... data=crime_data.data, obs_labels=False) >>> plt.show() .. plot:: plots/graphics_regression_partregress.py More detailed examples can be found in the Regression Plots notebook on the examples page. 
""" #NOTE: there is no interaction between possible missing data and #obs_labels yet, so this will need to be tweaked a bit for this case fig, ax = utils.create_mpl_ax(ax) # strings, use patsy to transform to data if isinstance(endog, str): endog = dmatrix(endog + "-1", data) if isinstance(exog_others, str): RHS = dmatrix(exog_others, data) elif isinstance(exog_others, list): RHS = "+".join(exog_others) RHS = dmatrix(RHS, data) else: RHS = exog_others RHS_isemtpy = False if isinstance(RHS, np.ndarray) and RHS.size == 0: RHS_isemtpy = True elif isinstance(RHS, pd.DataFrame) and RHS.empty: RHS_isemtpy = True if isinstance(exog_i, str): exog_i = dmatrix(exog_i + "-1", data) # all arrays or pandas-like if RHS_isemtpy: ax.plot(endog, exog_i, 'o', **kwargs) fitted_line = OLS(endog, exog_i).fit() x_axis_endog_name = 'x' if isinstance(exog_i, np.ndarray) else exog_i.name y_axis_endog_name = 'y' if isinstance( endog, np.ndarray) else endog.design_info.column_names[0] else: res_yaxis = OLS(endog, RHS).fit() res_xaxis = OLS(exog_i, RHS).fit() xaxis_resid = res_xaxis.resid yaxis_resid = res_yaxis.resid x_axis_endog_name = res_xaxis.model.endog_names y_axis_endog_name = res_yaxis.model.endog_names ax.plot(xaxis_resid, yaxis_resid, 'o', **kwargs) fitted_line = OLS(yaxis_resid, xaxis_resid).fit() fig = abline_plot(0, fitted_line.params[0], color='k', ax=ax) if x_axis_endog_name == 'y': # for no names regression will just get a y x_axis_endog_name = 'x' # this is misleading, so use x ax.set_xlabel("e(%s | X)" % x_axis_endog_name) ax.set_ylabel("e(%s | X)" % y_axis_endog_name) ax.set_title('Partial Regression Plot', **title_kwargs) #NOTE: if we want to get super fancy, we could annotate if a point is #clicked using this widget #http://stackoverflow.com/questions/4652439/ #is-there-a-matplotlib-equivalent-of-matlabs-datacursormode/ #4674445#4674445 if obs_labels is True: if data is not None: obs_labels = data.index elif hasattr(exog_i, "index"): obs_labels = exog_i.index else: obs_labels = res_xaxis.model.data.row_labels #NOTE: row_labels can be None. #Maybe we should fix this to never be the case. if obs_labels is None: obs_labels = lrange(len(exog_i)) if obs_labels is not False: # could be array_like if len(obs_labels) != len(exog_i): raise ValueError("obs_labels does not match length of exog_i") label_kwargs.update(dict(ha="center", va="bottom")) ax = utils.annotate_axes(lrange(len(obs_labels)), obs_labels, lzip(res_xaxis.resid, res_yaxis.resid), [(0, 5)] * len(obs_labels), "x-large", ax=ax, **label_kwargs) if ret_coords: return fig, (res_xaxis.resid, res_yaxis.resid) else: return fig
print(shannonentropy(Y)) p = [1e-5, 1e-4, .001, .01, .1, .15, .2, .25, .3, .35, .4, .45, .5] plt.subplot(111) plt.ylabel("Information") plt.xlabel("Probability") x = np.linspace(0, 1, 100001) plt.plot(x, shannoninfo(x)) # plt.show() plt.subplot(111) plt.ylabel("Entropy") plt.xlabel("Probability") x = np.linspace(0, 1, 101) plt.plot(x, lmap(shannonentropy, lzip(x, 1 - x))) # plt.show() # define a joint probability distribution # from Golan (2008) table 3.3 w = np.array([[0, 0, 1. / 3], [1 / 9., 1 / 9., 1 / 9.], [1 / 18., 1 / 9., 1 / 6.]]) # table 3.4 px = w.sum(0) py = w.sum(1) H_X = shannonentropy(px) H_Y = shannonentropy(py) H_XY = shannonentropy(w) H_XgivenY = condentropy(px, py, w) H_YgivenX = condentropy(py, px, w) # note that cross-entropy is not a distance measure as the following shows
def add_lag(x, col=None, lags=1, drop=False, insert=True): """ Returns an array with lags included given an array. Parameters ---------- x : array An array or NumPy ndarray subclass. Can be either a 1d or 2d array with observations in columns. col : 'string', int, or None If data is a structured array or a recarray, `col` can be a string that is the name of the column containing the variable. Or `col` can be an int of the zero-based column index. If it's a 1d array `col` can be None. lags : int The number of lags desired. drop : bool Whether to keep the contemporaneous variable for the data. insert : bool or int If True, inserts the lagged values after `col`. If False, appends the data. If int inserts the lags at int. Returns ------- array : ndarray Array with lags Examples -------- >>> import statsmodels.api as sm >>> data = sm.datasets.macrodata.load(as_pandas=False) >>> data = data.data[['year','quarter','realgdp','cpi']] >>> data = sm.tsa.add_lag(data, 'realgdp', lags=2) Notes ----- Trims the array both forward and backward, so that the array returned so that the length of the returned array is len(`X`) - lags. The lags are returned in increasing order, ie., t-1,t-2,...,t-lags """ if x.dtype.names: names = x.dtype.names if not col and np.squeeze(x).ndim > 1: raise IndexError("col is None and the input array is not 1d") elif len(names) == 1: col = names[0] if isinstance(col, (int, long)): col = x.dtype.names[col] if not PY3: # TODO: Get rid of this kludge. See GH # 3658 names = [bytes(name) if isinstance(name, unicode) # noqa:F821 else name for name in names] # Fail loudly if there is a non-ascii name. x.dtype.names = names if isinstance(col, unicode): # noqa:F821 col = bytes(col) contemp = x[col] # make names for lags tmp_names = [col + '_'+'L(%i)' % i for i in range(1, lags+1)] ndlags = lagmat(contemp, maxlag=lags, trim='Both') # get index for return if insert is True: ins_idx = list(names).index(col) + 1 elif insert is False: ins_idx = len(names) + 1 else: # insert is an int if insert > len(names): import warnings warnings.warn("insert > number of variables, inserting at the" " last position", ValueWarning) ins_idx = insert first_names = list(names[:ins_idx]) last_names = list(names[ins_idx:]) if drop: if col in first_names: first_names.pop(first_names.index(col)) else: last_names.pop(last_names.index(col)) if first_names: # only do this if x isn't "empty" # Workaround to avoid NumPy FutureWarning _x = recarray_select(x, first_names) first_arr = nprf.append_fields(_x[lags:], tmp_names, ndlags.T, usemask=False) else: first_arr = np.zeros(len(x)-lags, dtype=lzip(tmp_names, (x[col].dtype,)*lags)) for i,name in enumerate(tmp_names): first_arr[name] = ndlags[:,i] if last_names: return nprf.append_fields(first_arr, last_names, [x[name][lags:] for name in last_names], usemask=False) else: # lags for last variable return first_arr else: # we have an ndarray if x.ndim == 1: # make 2d if 1d x = x[:,None] if col is None: col = 0 # handle negative index if col < 0: col = x.shape[1] + col contemp = x[:,col] if insert is True: ins_idx = col + 1 elif insert is False: ins_idx = x.shape[1] else: if insert < 0: # handle negative index insert = x.shape[1] + insert + 1 if insert > x.shape[1]: insert = x.shape[1] import warnings warnings.warn("insert > number of variables, inserting at the" " last position", ValueWarning) ins_idx = insert ndlags = lagmat(contemp, lags, trim='Both') first_cols = lrange(ins_idx) last_cols = lrange(ins_idx,x.shape[1]) if drop: if col in first_cols: 
first_cols.pop(first_cols.index(col)) else: last_cols.pop(last_cols.index(col)) return np.column_stack((x[lags:,first_cols],ndlags, x[lags:,last_cols]))
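The Notes section above describes the trimming and the t-1, t-2, ... ordering of the lag columns. A tiny pure-numpy illustration (independent of add_lag itself) shows what that looks like for a 1d series with lags=2: the result has len(x) - lags rows, with the contemporaneous column followed by the lagged columns.

    import numpy as np

    x = np.arange(10.0)
    lags = 2
    lagged = np.column_stack([x[lags - k:len(x) - k] for k in range(lags + 1)])
    print(lagged[:3])
    # [[2. 1. 0.]
    #  [3. 2. 1.]
    #  [4. 3. 2.]]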
def add_lag(x, col=None, lags=1, drop=False, insert=True): """ Returns an array with lags included given an array. Parameters ---------- x : array An array or NumPy ndarray subclass. Can be either a 1d or 2d array with observations in columns. col : 'string', int, or None If data is a structured array or a recarray, `col` can be a string that is the name of the column containing the variable. Or `col` can be an int of the zero-based column index. If it's a 1d array `col` can be None. lags : int The number of lags desired. drop : bool Whether to keep the contemporaneous variable for the data. insert : bool or int If True, inserts the lagged values after `col`. If False, appends the data. If int inserts the lags at int. Returns ------- array : ndarray Array with lags Examples -------- >>> import statsmodels.api as sm >>> data = sm.datasets.macrodata.load() >>> data = data.data[['year','quarter','realgdp','cpi']] >>> data = sm.tsa.add_lag(data, 'realgdp', lags=2) Notes ----- Trims the array both forward and backward, so that the array returned so that the length of the returned array is len(`X`) - lags. The lags are returned in increasing order, ie., t-1,t-2,...,t-lags """ if x.dtype.names: names = x.dtype.names if not col and np.squeeze(x).ndim > 1: raise IndexError("col is None and the input array is not 1d") elif len(names) == 1: col = names[0] if isinstance(col, int): col = x.dtype.names[col] contemp = x[col] # make names for lags tmp_names = [col + '_'+'L(%i)' % i for i in range(1,lags+1)] ndlags = lagmat(contemp, maxlag=lags, trim='Both') # get index for return if insert is True: ins_idx = list(names).index(col) + 1 elif insert is False: ins_idx = len(names) + 1 else: # insert is an int if insert > len(names): import warnings warnings.warn("insert > number of variables, inserting at the" " last position", UserWarning) ins_idx = insert first_names = list(names[:ins_idx]) last_names = list(names[ins_idx:]) if drop: if col in first_names: first_names.pop(first_names.index(col)) else: last_names.pop(last_names.index(col)) if first_names: # only do this if x isn't "empty" first_arr = nprf.append_fields(x[first_names][lags:],tmp_names, ndlags.T, usemask=False) else: first_arr = np.zeros(len(x)-lags, dtype=lzip(tmp_names, (x[col].dtype,)*lags)) for i,name in enumerate(tmp_names): first_arr[name] = ndlags[:,i] if last_names: return nprf.append_fields(first_arr, last_names, [x[name][lags:] for name in last_names], usemask=False) else: # lags for last variable return first_arr else: # we have an ndarray if x.ndim == 1: # make 2d if 1d x = x[:,None] if col is None: col = 0 # handle negative index if col < 0: col = x.shape[1] + col contemp = x[:,col] if insert is True: ins_idx = col + 1 elif insert is False: ins_idx = x.shape[1] else: if insert < 0: # handle negative index insert = x.shape[1] + insert + 1 if insert > x.shape[1]: insert = x.shape[1] import warnings warnings.warn("insert > number of variables, inserting at the" " last position", UserWarning) ins_idx = insert ndlags = lagmat(contemp, lags, trim='Both') first_cols = lrange(ins_idx) last_cols = lrange(ins_idx,x.shape[1]) if drop: if col in first_cols: first_cols.pop(first_cols.index(col)) else: last_cols.pop(last_cols.index(col)) return np.column_stack((x[lags:,first_cols],ndlags, x[lags:,last_cols]))
def plot_partregress(endog, exog_i, exog_others, data=None, title_kwargs={}, obs_labels=True, label_kwargs={}, ax=None, ret_coords=False, **kwargs): """Plot partial regression for a single regressor. Parameters ---------- endog : ndarray or string endogenous or response variable. If string is given, you can use a arbitrary translations as with a formula. exog_i : ndarray or string exogenous, explanatory variable. If string is given, you can use a arbitrary translations as with a formula. exog_others : ndarray or list of strings other exogenous, explanatory variables. If a list of strings is given, each item is a term in formula. You can use a arbitrary translations as with a formula. The effect of these variables will be removed by OLS regression. data : DataFrame, dict, or recarray Some kind of data structure with names if the other variables are given as strings. title_kwargs : dict Keyword arguments to pass on for the title. The key to control the fonts is fontdict. obs_labels : bool or array-like Whether or not to annotate the plot points with their observation labels. If obs_labels is a boolean, the point labels will try to do the right thing. First it will try to use the index of data, then fall back to the index of exog_i. Alternatively, you may give an array-like object corresponding to the obseveration numbers. labels_kwargs : dict Keyword arguments that control annotate for the observation labels. ax : Matplotlib AxesSubplot instance, optional If given, this subplot is used to plot in instead of a new figure being created. ret_coords : bool If True will return the coordinates of the points in the plot. You can use this to add your own annotations. kwargs The keyword arguments passed to plot for the points. Returns ------- fig : Matplotlib figure instance If `ax` is None, the created figure. Otherwise the figure to which `ax` is connected. coords : list, optional If ret_coords is True, return a tuple of arrays (x_coords, y_coords). Notes ----- The slope of the fitted line is the that of `exog_i` in the full multiple regression. The individual points can be used to assess the influence of points on the estimated coefficient. See Also -------- plot_partregress_grid : Plot partial regression for a set of regressors. 
""" #NOTE: there is no interaction between possible missing data and #obs_labels yet, so this will need to be tweaked a bit for this case fig, ax = utils.create_mpl_ax(ax) # strings, use patsy to transform to data if isinstance(endog, string_types): endog = dmatrix(endog + "-1", data) if isinstance(exog_others, string_types): RHS = dmatrix(exog_others, data) elif isinstance(exog_others, list): RHS = "+".join(exog_others) RHS = dmatrix(RHS, data) else: RHS = exog_others RHS_isemtpy = False if isinstance(RHS, np.ndarray) and RHS.size==0: RHS_isemtpy = True elif isinstance(RHS, pd.DataFrame) and RHS.empty: RHS_isemtpy = True if isinstance(exog_i, string_types): exog_i = dmatrix(exog_i + "-1", data) # all arrays or pandas-like if RHS_isemtpy: ax.plot(endog, exog_i, 'o', **kwargs) fitted_line = OLS(endog, exog_i).fit() x_axis_endog_name = 'x' if isinstance(exog_i, np.ndarray) else exog_i.name y_axis_endog_name = 'y' if isinstance(endog, np.ndarray) else endog.design_info.column_names[0] else: res_yaxis = OLS(endog, RHS).fit() res_xaxis = OLS(exog_i, RHS).fit() xaxis_resid = res_xaxis.resid yaxis_resid = res_yaxis.resid x_axis_endog_name = res_xaxis.model.endog_names y_axis_endog_name = res_yaxis.model.endog_names ax.plot(xaxis_resid, yaxis_resid, 'o', **kwargs) fitted_line = OLS(yaxis_resid, xaxis_resid).fit() fig = abline_plot(0, fitted_line.params[0], color='k', ax=ax) if x_axis_endog_name == 'y': # for no names regression will just get a y x_axis_endog_name = 'x' # this is misleading, so use x ax.set_xlabel("e(%s | X)" % x_axis_endog_name) ax.set_ylabel("e(%s | X)" % y_axis_endog_name) ax.set_title('Partial Regression Plot', **title_kwargs) #NOTE: if we want to get super fancy, we could annotate if a point is #clicked using this widget #http://stackoverflow.com/questions/4652439/ #is-there-a-matplotlib-equivalent-of-matlabs-datacursormode/ #4674445#4674445 if obs_labels is True: if data is not None: obs_labels = data.index elif hasattr(exog_i, "index"): obs_labels = exog_i.index else: obs_labels = res_xaxis.model.data.row_labels #NOTE: row_labels can be None. #Maybe we should fix this to never be the case. if obs_labels is None: obs_labels = lrange(len(exog_i)) if obs_labels is not False: # could be array-like if len(obs_labels) != len(exog_i): raise ValueError("obs_labels does not match length of exog_i") label_kwargs.update(dict(ha="center", va="bottom")) ax = utils.annotate_axes(lrange(len(obs_labels)), obs_labels, lzip(res_xaxis.resid, res_yaxis.resid), [(0, 5)] * len(obs_labels), "x-large", ax=ax, **label_kwargs) if ret_coords: return fig, (res_xaxis.resid, res_yaxis.resid) else: return fig
def get_robustcov_results(self, cov_type='HC1', use_t=None, **kwds): """create new results instance with robust covariance as default Parameters ---------- cov_type : str the type of robust sandwich estimator to use. see Notes below use_t : bool If true, then the t distribution is used for inference. If false, then the normal distribution is used. kwds : depends on cov_type Required or optional arguments for robust covariance calculation. see Notes below Returns ------- results : results instance This method creates a new results instance with the requested robust covariance as the default covariance of the parameters. Inferential statistics like p-values and hypothesis tests will be based on this covariance matrix. Notes ----- Warning: Some of the options and defaults in cov_kwds may be changed in a future version. The covariance keywords provide an option 'scaling_factor' to adjust the scaling of the covariance matrix, that is the covariance is multiplied by this factor if it is given and is not `None`. This allows the user to adjust the scaling of the covariance matrix to match other statistical packages. For example, `scaling_factor=(nobs - 1.) / (nobs - k_params)` provides a correction so that the robust covariance matrices match those of Stata in some models like GLM and discrete Models. The following covariance types and required or optional arguments are currently available: - 'HC0', 'HC1', 'HC2', 'HC3': heteroscedasticity robust covariance - no keyword arguments - 'HAC': heteroskedasticity-autocorrelation robust covariance ``maxlags`` : integer, required number of lags to use ``kernel`` : {callable, str}, optional kernels currently available kernels are ['bartlett', 'uniform'], default is Bartlett ``use_correction``: bool, optional If true, use small sample correction - 'cluster': clustered covariance estimator ``groups`` : array_like[int], required : Integer-valued index of clusters or groups. ``use_correction``: bool, optional If True the sandwich covariance is calculated with a small sample correction. If False the sandwich covariance is calculated without small sample correction. ``df_correction``: bool, optional If True (default), then the degrees of freedom for the inferential statistics and hypothesis tests, such as pvalues, f_pvalue, conf_int, and t_test and f_test, are based on the number of groups minus one instead of the total number of observations minus the number of explanatory variables. `df_resid` of the results instance is also adjusted. When `use_t` is also True, then pvalues are computed using the Student's t distribution using the corrected values. These may differ substantially from p-values based on the normal is the number of groups is small. If False, then `df_resid` of the results instance is not adjusted. - 'hac-groupsum': Driscoll and Kraay, heteroscedasticity and autocorrelation robust covariance for panel data # TODO: more options needed here ``time`` : array_like, required index of time periods ``maxlags`` : integer, required number of lags to use ``kernel`` : {callable, str}, optional The available kernels are ['bartlett', 'uniform']. The default is Bartlett. ``use_correction`` : {False, 'hac', 'cluster'}, optional If False the the sandwich covariance is calculated without small sample correction. If `use_correction = 'cluster'` (default), then the same small sample correction as in the case of `covtype='cluster'` is used. 
``df_correction`` : bool, optional The adjustment to df_resid, see cov_type 'cluster' above - 'hac-panel': heteroscedasticity and autocorrelation robust standard errors in panel data. The data needs to be sorted in this case, the time series for each panel unit or cluster need to be stacked. The membership to a time series of an individual or group can be either specified by group indicators or by increasing time periods. One of ``groups`` or ``time`` is required. # TODO: we need more options here ``groups`` : array_like[int] indicator for groups ``time`` : array_like[int] index of time periods ``maxlags`` : int, required number of lags to use ``kernel`` : {callable, str}, optional Available kernels are ['bartlett', 'uniform'], default is Bartlett ``use_correction`` : {False, 'hac', 'cluster'}, optional If False the sandwich covariance is calculated without small sample correction. ``df_correction`` : bool, optional Adjustment to df_resid, see cov_type 'cluster' above **Reminder**: ``use_correction`` in "hac-groupsum" and "hac-panel" is not bool, needs to be in {False, 'hac', 'cluster'}. .. todo:: Currently there is no check for extra or misspelled keywords, except in the case of cov_type `HCx` """ import statsmodels.stats.sandwich_covariance as sw cov_type = normalize_cov_type(cov_type) if 'kernel' in kwds: kwds['weights_func'] = kwds.pop('kernel') if 'weights_func' in kwds and not callable(kwds['weights_func']): kwds['weights_func'] = sw.kernel_dict[kwds['weights_func']] # pop because HCx raises if any kwds sc_factor = kwds.pop('scaling_factor', None) # TODO: make separate function that returns a robust cov plus info use_self = kwds.pop('use_self', False) if use_self: res = self else: # this does not work for most models, use raw instance instead from fit res = self.__class__(self.model, self.params, normalized_cov_params=self.normalized_cov_params, scale=self.scale) res.cov_type = cov_type # use_t might already be defined by the class, and already set if use_t is None: use_t = self.use_t res.cov_kwds = {'use_t': use_t} # store for information res.use_t = use_t adjust_df = False if cov_type in ['cluster', 'hac-panel', 'hac-groupsum']: df_correction = kwds.get('df_correction', None) # TODO: check also use_correction, do I need all combinations? if df_correction is not False: # i.e. 
in [None, True]: # user did not explicitely set it to False adjust_df = True res.cov_kwds['adjust_df'] = adjust_df # verify and set kwds, and calculate cov # TODO: this should be outsourced in a function so we can reuse it in # other models # TODO: make it DRYer repeated code for checking kwds if cov_type.upper() in ('HC0', 'HC1', 'HC2', 'HC3'): if kwds: raise ValueError('heteroscedasticity robust covariance ' 'does not use keywords') res.cov_kwds['description'] = descriptions[cov_type.upper()] res.cov_params_default = getattr(self, 'cov_' + cov_type.upper(), None) if res.cov_params_default is None: # results classes that do not have cov_HCx attribute res.cov_params_default = sw.cov_white_simple(self, use_correction=False) elif cov_type.lower() == 'hac': maxlags = kwds['maxlags'] # required?, default in cov_hac_simple res.cov_kwds['maxlags'] = maxlags weights_func = kwds.get('weights_func', sw.weights_bartlett) res.cov_kwds['weights_func'] = weights_func use_correction = kwds.get('use_correction', False) res.cov_kwds['use_correction'] = use_correction res.cov_kwds['description'] = descriptions['HAC'].format( maxlags=maxlags, correction=['without', 'with'][use_correction]) res.cov_params_default = sw.cov_hac_simple( self, nlags=maxlags, weights_func=weights_func, use_correction=use_correction) elif cov_type.lower() == 'cluster': #cluster robust standard errors, one- or two-way groups = kwds['groups'] if not hasattr(groups, 'shape'): groups = np.asarray(groups).T if groups.ndim >= 2: groups = groups.squeeze() res.cov_kwds['groups'] = groups use_correction = kwds.get('use_correction', True) res.cov_kwds['use_correction'] = use_correction if groups.ndim == 1: if adjust_df: # need to find number of groups # duplicate work self.n_groups = n_groups = len(np.unique(groups)) res.cov_params_default = sw.cov_cluster( self, groups, use_correction=use_correction) elif groups.ndim == 2: if hasattr(groups, 'values'): groups = groups.values if adjust_df: # need to find number of groups # duplicate work n_groups0 = len(np.unique(groups[:, 0])) n_groups1 = len(np.unique(groups[:, 1])) self.n_groups = (n_groups0, n_groups1) n_groups = min(n_groups0, n_groups1) # use for adjust_df # Note: sw.cov_cluster_2groups has 3 returns res.cov_params_default = sw.cov_cluster_2groups( self, groups, use_correction=use_correction)[0] else: raise ValueError('only two groups are supported') res.cov_kwds['description'] = descriptions['cluster'] elif cov_type.lower() == 'hac-panel': #cluster robust standard errors res.cov_kwds['time'] = time = kwds.get('time', None) res.cov_kwds['groups'] = groups = kwds.get('groups', None) #TODO: nlags is currently required #nlags = kwds.get('nlags', True) #res.cov_kwds['nlags'] = nlags #TODO: `nlags` or `maxlags` res.cov_kwds['maxlags'] = maxlags = kwds['maxlags'] use_correction = kwds.get('use_correction', 'hac') res.cov_kwds['use_correction'] = use_correction weights_func = kwds.get('weights_func', sw.weights_bartlett) res.cov_kwds['weights_func'] = weights_func # TODO: clumsy time index in cov_nw_panel if groups is not None: groups = np.asarray(groups) tt = (np.nonzero(groups[:-1] != groups[1:])[0] + 1).tolist() nobs_ = len(groups) elif time is not None: # TODO: clumsy time index in cov_nw_panel time = np.asarray(time) tt = (np.nonzero(time[1:] < time[:-1])[0] + 1).tolist() nobs_ = len(time) else: raise ValueError('either time or groups needs to be given') groupidx = lzip([0] + tt, tt + [nobs_]) self.n_groups = n_groups = len(groupidx) res.cov_params_default = sw.cov_nw_panel(self, 
maxlags, groupidx, weights_func=weights_func, use_correction=use_correction) res.cov_kwds['description'] = descriptions['HAC-Panel'] elif cov_type.lower() == 'hac-groupsum': # Driscoll-Kraay standard errors res.cov_kwds['time'] = time = kwds['time'] #TODO: nlags is currently required #nlags = kwds.get('nlags', True) #res.cov_kwds['nlags'] = nlags #TODO: `nlags` or `maxlags` res.cov_kwds['maxlags'] = maxlags = kwds['maxlags'] use_correction = kwds.get('use_correction', 'cluster') res.cov_kwds['use_correction'] = use_correction weights_func = kwds.get('weights_func', sw.weights_bartlett) res.cov_kwds['weights_func'] = weights_func if adjust_df: # need to find number of groups tt = (np.nonzero(time[1:] < time[:-1])[0] + 1) self.n_groups = n_groups = len(tt) + 1 res.cov_params_default = sw.cov_nw_groupsum( self, maxlags, time, weights_func=weights_func, use_correction=use_correction) res.cov_kwds['description'] = descriptions['HAC-Groupsum'] else: raise ValueError('cov_type not recognized. See docstring for ' + 'available options and spelling') # generic optional factor to scale covariance res.cov_kwds['scaling_factor'] = sc_factor if sc_factor is not None: res.cov_params_default *= sc_factor if adjust_df: # Note: df_resid is used for scale and others, add new attribute res.df_resid_inference = n_groups - 1 return res
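A hedged usage sketch for the method documented above: the data are simulated and the cluster structure is invented, but the call pattern follows the docstring (cov_type plus whatever keywords that covariance type requires, e.g. groups for 'cluster' and no keywords at all for the HCx variants).

    import numpy as np
    import statsmodels.api as sm

    rng = np.random.RandomState(0)
    n = 500
    x = sm.add_constant(rng.randn(n, 2))
    groups = np.repeat(np.arange(50), 10)            # 50 clusters of size 10
    y = x @ np.array([1.0, 0.5, -0.25]) + rng.randn(n)

    res = sm.OLS(y, x).fit()

    # heteroscedasticity robust: no extra keywords allowed
    res_hc1 = res.get_robustcov_results(cov_type='HC1')

    # cluster robust: 'groups' is required, df_correction defaults to True
    res_clu = res.get_robustcov_results(cov_type='cluster', groups=groups)

    print(res.bse)
    print(res_hc1.bse)
    print(res_clu.bse)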
def summary_params_2d(result, extras=None, endog_names=None, exog_names=None, title=None): '''create summary table of regression parameters with several equations This allows interleaving of parameters with bse and/or tvalues Parameters ---------- result : result instance the result instance with params and attributes in extras extras : list of strings additional attributes to add below a parameter row, e.g. bse or tvalues endog_names : None or list of strings names for rows of the parameter array (multivariate endog) exog_names : None or list of strings names for columns of the parameter array (exog) alpha : float level for confidence intervals, default 0.95 title : None or string Returns ------- tables : list of SimpleTable this contains a list of all seperate Subtables table_all : SimpleTable the merged table with results concatenated for each row of the parameter array ''' if endog_names is None: # TODO: note the [1:] is specific to current MNLogit endog_names = [ 'endog_%d' % i for i in np.unique(result.model.endog)[1:] ] if exog_names is None: exog_names = ['var%d' % i for i in range(len(result.params))] # TODO: check formatting options with different values res_params = [[forg(item, prec=4) for item in row] for row in result.params] if extras: extras_list = [[[ '%10s' % ('(' + forg(v, prec=3).strip() + ')') for v in col ] for col in getattr(result, what)] for what in extras] data = lzip(res_params, *extras_list) data = [i for j in data for i in j] #flatten stubs = lzip(endog_names, *[[''] * len(endog_names)] * len(extras)) stubs = [i for j in stubs for i in j] #flatten else: data = res_params stubs = endog_names txt_fmt = copy.deepcopy(fmt_params) txt_fmt["data_fmts"] = ["%s"] * result.params.shape[1] return SimpleTable(data, headers=exog_names, stubs=stubs, title=title, txt_fmt=txt_fmt)
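The interleaving of parameter rows with their (bse) rows above boils down to a zip followed by a flatten. A tiny sketch with made-up strings, using plain zip in place of the lzip compatibility helper, shows how the rows end up alternating in the table body and how the stubs get blank entries under each equation name.

    params = [['0.12', '1.30'], ['-0.45', '2.01']]           # one row per equation
    bses = [['(0.05)', '(0.20)'], ['(0.07)', '(0.31)']]

    data = list(zip(params, bses))
    data = [row for pair in data for row in pair]            # flatten
    stubs = [s for name in ['endog_1', 'endog_2'] for s in (name, '')]

    for stub, row in zip(stubs, data):
        print(stub, row)
    # endog_1 ['0.12', '1.30']
    #  ['(0.05)', '(0.20)']
    # endog_2 ['-0.45', '2.01']
    #  ['(0.07)', '(0.31)']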
kern = kernels.Gaussian(h=h)
kde = KDE(x, kern)
print(kde.density(1.015469))
print(0.2034675)

Xs = np.arange(-10, 10, 0.1)
fig = plt.figure()
ax = fig.add_subplot(111)
ax.plot(Xs, kde(Xs), "-")
ax.set_ylim(-10, 10)
ax.set_ylim(0, 0.4)

# 2-D case
x = lzip(kdetest.faithfulData["eruptions"], kdetest.faithfulData["waiting"])
x = np.array(x)
x = (x - x.mean(0)) / x.std(0)
nobs = x.shape[0]
H = kdetest.Hpi
kern = kernels.NdKernel(2)
kde = KDE(x, kern)
print(kde.density(np.matrix([1, 2])))  # .T

plt.figure()
plt.plot(x[:, 0], x[:, 1], 'o')
n_grid = 50
xsp = np.linspace(x.min(0)[0], x.max(0)[0], n_grid)
ysp = np.linspace(x.min(0)[1], x.max(0)[1], n_grid)
# xsorted = np.sort(x)

def summary(self, yname=None, xname=None, title=0, alpha=.05, returns='text', model_info=None): """ Parameters ----------- yname : string optional, Default is `Y` xname : list of strings optional, Default is `X.#` for # in p the number of regressors Confidance interval : (0,1) not implimented title : string optional, Defualt is 'Generalized linear model' returns : string 'text', 'table', 'csv', 'latex', 'html' Returns ------- Default : returns='print' Prints the summarirized results Option : returns='text' Prints the summarirized results Option : returns='table' SimpleTable instance : summarizing the fit of a linear model. Option : returns='csv' returns a string of csv of the results, to import into a spreadsheet Option : returns='latex' Not implimented yet Option : returns='HTML' Not implimented yet Examples (needs updating) -------- >>> import statsmodels as sm >>> data = sm.datasets.longley.load(as_pandas=False) >>> data.exog = sm.add_constant(data.exog) >>> ols_results = sm.OLS(data.endog, data.exog).results >>> print ols_results.summary() ... Notes ----- conf_int calculated from normal dist. """ # TODO: Make sure all self.model.__class__.__name__ are listed model_types = { 'OLS': 'Ordinary least squares', 'GLS': 'Generalized least squares', 'GLSAR': 'Generalized least squares with AR(p)', 'WLS': 'Weighted least squares', 'RLM': 'Robust linear model', 'GLM': 'Generalized linear model' } if title == 0: title = model_types[self.model.__class__.__name__] yname, xname = _getnames(self, yname, xname) time_now = time.localtime() time_of_day = [time.strftime("%H:%M:%S", time_now)] date = time.strftime("%a, %d %b %Y", time_now) modeltype = self.model.__class__.__name__ nobs = self.nobs df_model = self.df_model df_resid = self.df_resid #General part of the summary table, Applicable to all? models #------------------------------------------------------------ # TODO: define this generically, overwrite in model classes #replace definition of stubs data by single list #e.g. gen_left = [ ('Model type:', [modeltype]), ('Date:', [date]), ('Dependent Variable:', yname), # TODO: What happens with multiple names? ('df model', [df_model]) ] gen_stubs_left, gen_data_left = zip_longest(*gen_left) #transpose row col gen_title = title gen_header = None gen_table_left = SimpleTable(gen_data_left, gen_header, gen_stubs_left, title=gen_title, txt_fmt=gen_fmt) gen_stubs_right = ('Method:', 'Time:', 'Number of Obs:', 'df resid') gen_data_right = ( [modeltype], #was dist family need to look at more time_of_day, [nobs], [df_resid]) gen_table_right = SimpleTable(gen_data_right, gen_header, gen_stubs_right, title=gen_title, txt_fmt=gen_fmt) gen_table_left.extend_right(gen_table_right) general_table = gen_table_left # Parameters part of the summary table # ------------------------------------ # Note: this is not necessary since we standardized names, # only t versus normal tstats = { 'OLS': self.t(), 'GLS': self.t(), 'GLSAR': self.t(), 'WLS': self.t(), 'RLM': self.t(), 'GLM': self.t() } prob_stats = { 'OLS': self.pvalues, 'GLS': self.pvalues, 'GLSAR': self.pvalues, 'WLS': self.pvalues, 'RLM': self.pvalues, 'GLM': self.pvalues } # Dictionary to store the header names for the parameter part of the # summary table. look up by modeltype alp = str((1 - alpha) * 100) + '%' param_header = { 'OLS': ['coef', 'std err', 't', 'P>|t|', alp + ' Conf. Interval'], 'GLS': ['coef', 'std err', 't', 'P>|t|', alp + ' Conf. Interval'], 'GLSAR': ['coef', 'std err', 't', 'P>|t|', alp + ' Conf. 
Interval'], 'WLS': ['coef', 'std err', 't', 'P>|t|', alp + ' Conf. Interval'], 'GLM': ['coef', 'std err', 't', 'P>|t|', alp + ' Conf. Interval'], #glm uses t-distribution 'RLM': ['coef', 'std err', 'z', 'P>|z|', alp + ' Conf. Interval'] #checke z } params_stubs = xname params = self.params conf_int = self.conf_int(alpha) std_err = self.bse exog_len = lrange(len(xname)) tstat = tstats[modeltype] prob_stat = prob_stats[modeltype] # Simpletable should be able to handle the formating params_data = lzip(["%#6.4g" % (params[i]) for i in exog_len], ["%#6.4f" % (std_err[i]) for i in exog_len], ["%#6.4f" % (tstat[i]) for i in exog_len], ["%#6.4f" % (prob_stat[i]) for i in exog_len], ["(%#5g, %#5g)" % tuple(conf_int[i]) for i in exog_len]) parameter_table = SimpleTable(params_data, param_header[modeltype], params_stubs, title=None, txt_fmt=fmt_2) #special table #------------- #TODO: exists in linear_model, what about other models #residual diagnostics #output options #-------------- #TODO: JP the rest needs to be fixed, similar to summary in linear_model def ols_printer(): """ print summary table for ols models """ table = str(general_table) + '\n' + str(parameter_table) return table def glm_printer(): table = str(general_table) + '\n' + str(parameter_table) return table printers = {'OLS': ols_printer, 'GLM': glm_printer} if returns == 'print': try: return printers[modeltype]() except KeyError: return printers['OLS']()
print(shannonentropy(Y)) p = [1e-5,1e-4,.001,.01,.1,.15,.2,.25,.3,.35,.4,.45,.5] plt.subplot(111) plt.ylabel("Information") plt.xlabel("Probability") x = np.linspace(0,1,100001) plt.plot(x, shannoninfo(x)) # plt.show() plt.subplot(111) plt.ylabel("Entropy") plt.xlabel("Probability") x = np.linspace(0,1,101) plt.plot(x, lmap(shannonentropy, lzip(x,1-x))) # plt.show() # define a joint probability distribution # from Golan (2008) table 3.3 w = np.array([[0,0,1./3],[1/9.,1/9.,1/9.],[1/18.,1/9.,1/6.]]) # table 3.4 px = w.sum(0) py = w.sum(1) H_X = shannonentropy(px) H_Y = shannonentropy(py) H_XY = shannonentropy(w) H_XgivenY = condentropy(px,py,w) H_YgivenX = condentropy(py,px,w) # note that cross-entropy is not a distance measure as the following shows D_YX = logbasechange(2,np.e)*stats.entropy(px, py)
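Continuing the point made by the comment above, a self-contained sketch with scipy only: the relative entropy computed by stats.entropy(p, q) is not symmetric in its arguments, so it cannot be a distance. The base-change factor is defined inline here and is only assumed to match the logbasechange helper used above (log of the second argument in the base of the first).

    import numpy as np
    from scipy import stats

    w = np.array([[0, 0, 1/3], [1/9, 1/9, 1/9], [1/18, 1/9, 1/6]])
    px = w.sum(0)
    py = w.sum(1)

    logbasechange = lambda a, b: np.log(b) / np.log(a)   # assumed definition; converts nats to bits for a=2, b=e

    D_YX = logbasechange(2, np.e) * stats.entropy(px, py)   # D(px || py) in bits
    D_XY = logbasechange(2, np.e) * stats.entropy(py, px)   # D(py || px) in bits
    print(D_YX, D_XY)   # the two values differ, so relative entropy is not symmetric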
xihistory = np.zeros((2, nobs))
for i in range(1, nobs):
    xihistory[:, i] = np.dot(F, xihistory[:, i-1]) + \
        np.dot(cholQ, np.random.randn(2, 1)).squeeze()
    # this makes an ARMA process?
    # check notes, do the math
y = np.dot(H.T, xihistory)
y = y.T

params = np.array([rho, sigma1, sigma2])
penalty = 1e5
upperbounds = np.array([.999, 100, 100])
lowerbounds = np.array([-.999, .001, .001])
xi10 = xihistory[:, 0]
ntrain = 1
bounds = lzip(lowerbounds, upperbounds)  # if you use fmin_l_bfgs_b

# results = optimize.fmin_bfgs(updatematrices, params,
#     args=(y, xi10, ntrain, penalty, upperbounds, lowerbounds),
#     gtol=1e-8, epsilon=1e-10)
# array([ 0.83111567,  1.2695249 ,  0.61436685])

F = lambda x: np.array([[x[0], 0], [0, 0]])

def Q(x):
    cholQ = np.array([[x[1], 0], [0, x[2]]])
    return np.dot(cholQ, cholQ.T)

H = np.ones((2, 1))

# ssm_model = StateSpaceModel(y)  # need to pass in Xi10!
# ssm_model.fit_kalman(start_params=params, xi10=xi10, F=F, Q=Q, H=H,
#                      upperbounds=upperbounds, lowerbounds=lowerbounds)
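The `bounds = lzip(lowerbounds, upperbounds)` line builds the list of (low, high) pairs that scipy's L-BFGS-B expects. A self-contained sketch on a toy objective (the quadratic objective is made up, not the Kalman filter log-likelihood) shows the call pattern with plain zip standing in for lzip.

    import numpy as np
    from scipy import optimize

    lowerbounds = np.array([-.999, .001, .001])
    upperbounds = np.array([.999, 100, 100])
    bounds = list(zip(lowerbounds, upperbounds))

    objective = lambda p: np.sum((p - np.array([0.8, 1.2, 0.6])) ** 2)
    res = optimize.minimize(objective, x0=np.array([0.0, 1.0, 1.0]),
                            method='L-BFGS-B', bounds=bounds)
    print(res.x)   # stays inside the box defined by the bounds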
def categorical(data, col=None, dictnames=False, drop=False, ): ''' Returns a dummy matrix given an array of categorical variables. Parameters ---------- data : array A structured array, recarray, or array. This can be either a 1d vector of the categorical variable or a 2d array with the column specifying the categorical variable specified by the col argument. col : 'string', int, or None If data is a structured array or a recarray, `col` can be a string that is the name of the column that contains the variable. For all arrays `col` can be an int that is the (zero-based) column index number. `col` can only be None for a 1d array. The default is None. dictnames : bool, optional If True, a dictionary mapping the column number to the categorical name is returned. Used to have information about plain arrays. drop : bool Whether or not keep the categorical variable in the returned matrix. Returns -------- dummy_matrix, [dictnames, optional] A matrix of dummy (indicator/binary) float variables for the categorical data. If dictnames is True, then the dictionary is returned as well. Notes ----- This returns a dummy variable for EVERY distinct variable. If a a structured or recarray is provided, the names for the new variable is the old variable name - underscore - category name. So if the a variable 'vote' had answers as 'yes' or 'no' then the returned array would have to new variables-- 'vote_yes' and 'vote_no'. There is currently no name checking. Examples -------- >>> import numpy as np >>> import statsmodels.api as sm Univariate examples >>> import string >>> string_var = [string.lowercase[0:5], string.lowercase[5:10], \ string.lowercase[10:15], string.lowercase[15:20], \ string.lowercase[20:25]] >>> string_var *= 5 >>> string_var = np.asarray(sorted(string_var)) >>> design = sm.tools.categorical(string_var, drop=True) Or for a numerical categorical variable >>> instr = np.floor(np.arange(10,60, step=2)/10) >>> design = sm.tools.categorical(instr, drop=True) With a structured array >>> num = np.random.randn(25,2) >>> struct_ar = np.zeros((25,1), dtype=[('var1', 'f4'),('var2', 'f4'), \ ('instrument','f4'),('str_instr','a5')]) >>> struct_ar['var1'] = num[:,0][:,None] >>> struct_ar['var2'] = num[:,1][:,None] >>> struct_ar['instrument'] = instr[:,None] >>> struct_ar['str_instr'] = string_var[:,None] >>> design = sm.tools.categorical(struct_ar, col='instrument', drop=True) Or >>> design2 = sm.tools.categorical(struct_ar, col='str_instr', drop=True) ''' if isinstance(col, (list, tuple)): try: assert len(col) == 1 col = col[0] except: raise ValueError("Can only convert one column at a time") # TODO: add a NameValidator function # catch recarrays and structured arrays if data.dtype.names or data.__class__ is np.recarray: if not col and np.squeeze(data).ndim > 1: raise IndexError("col is None and the input array is not 1d") if isinstance(col, int): col = data.dtype.names[col] if col is None and data.dtype.names and len(data.dtype.names) == 1: col = data.dtype.names[0] tmp_arr = np.unique(data[col]) # if the cols are shape (#,) vs (#,1) need to add an axis and flip _swap = True if data[col].ndim == 1: tmp_arr = tmp_arr[:, None] _swap = False tmp_dummy = (tmp_arr == data[col]).astype(float) if _swap: tmp_dummy = np.squeeze(tmp_dummy).swapaxes(1, 0) if not tmp_arr.dtype.names: # how do we get to this code path? 
tmp_arr = [asstr2(item) for item in np.squeeze(tmp_arr)] elif tmp_arr.dtype.names: tmp_arr = [asstr2(item) for item in np.squeeze(tmp_arr.tolist())] # prepend the varname and underscore, if col is numeric attribute # lookup is lost for recarrays... if col is None: try: col = data.dtype.names[0] except: col = 'var' # TODO: the above needs to be made robust because there could be many # var_yes, var_no varaibles for instance. tmp_arr = [col + '_' + item for item in tmp_arr] # TODO: test this for rec and structured arrays!!! if drop is True: if len(data.dtype) <= 1: if tmp_dummy.shape[0] < tmp_dummy.shape[1]: tmp_dummy = np.squeeze(tmp_dummy).swapaxes(1, 0) dt = lzip(tmp_arr, [tmp_dummy.dtype.str]*len(tmp_arr)) # preserve array type return np.array(lmap(tuple, tmp_dummy.tolist()), dtype=dt).view(type(data)) data = nprf.drop_fields(data, col, usemask=False, asrecarray=type(data) is np.recarray) data = nprf.append_fields(data, tmp_arr, data=tmp_dummy, usemask=False, asrecarray=type(data) is np.recarray) return data # handle ndarrays and catch array-like for an error elif data.__class__ is np.ndarray or not isinstance(data, np.ndarray): if not isinstance(data, np.ndarray): raise NotImplementedError("Array-like objects are not supported") if isinstance(col, int): offset = data.shape[1] # need error catching here? tmp_arr = np.unique(data[:, col]) tmp_dummy = (tmp_arr[:, np.newaxis] == data[:, col]).astype(float) tmp_dummy = tmp_dummy.swapaxes(1, 0) if drop is True: offset -= 1 data = np.delete(data, col, axis=1).astype(float) data = np.column_stack((data, tmp_dummy)) if dictnames is True: col_map = _make_dictnames(tmp_arr, offset) return data, col_map return data elif col is None and np.squeeze(data).ndim == 1: tmp_arr = np.unique(data) tmp_dummy = (tmp_arr[:, None] == data).astype(float) tmp_dummy = tmp_dummy.swapaxes(1, 0) if drop is True: if dictnames is True: col_map = _make_dictnames(tmp_arr) return tmp_dummy, col_map return tmp_dummy else: data = np.column_stack((data, tmp_dummy)) if dictnames is True: col_map = _make_dictnames(tmp_arr, offset=1) return data, col_map return data else: raise IndexError("The index %s is not understood" % col)
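The core of the 1d ndarray branch above is a single broadcast comparison against the unique levels. A tiny sketch with a made-up array shows how `(tmp_arr[:, None] == data)` followed by the swapaxes produces one dummy column per distinct value.

    import numpy as np

    data = np.array([1., 2., 1., 3., 2.])
    levels = np.unique(data)                       # tmp_arr in the code above
    dummies = (levels[:, None] == data).astype(float).swapaxes(1, 0)
    print(levels)
    # [1. 2. 3.]
    print(dummies)
    # [[1. 0. 0.]
    #  [0. 1. 0.]
    #  [1. 0. 0.]
    #  [0. 0. 1.]
    #  [0. 1. 0.]]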
stop_on_error = True filelist = ['example_glsar.py', 'example_wls.py', 'example_gls.py', 'example_glm.py', 'example_ols_tftest.py', #'example_rpy.py', 'example_ols.py', 'example_ols_minimal.py', 'example_rlm.py', 'example_discrete.py', 'example_predict.py', 'example_ols_table.py', 'tut_ols.py', 'tut_ols_rlm.py', 'tut_ols_wls.py'] use_glob = True if use_glob: import glob filelist = glob.glob('*.py') print(lzip(range(len(filelist)), filelist)) for fname in ['run_all.py', 'example_rpy.py']: filelist.remove(fname) #filelist = filelist[15:] #temporarily disable show plt_show = plt.show def noop(*args): pass plt.show = noop cont = input("""Are you sure you want to run all of the examples?
def read_ch(fname):
    with open(fname) as f:
        lines = f.readlines()
    ps, rs, vs, qs = lzip(*[L.split(',') for L in lines])
    return lmap(float, ps), lmap(float, rs), lmap(float, vs), lmap(float, qs)
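A hedged usage sketch for read_ch: the file name and its contents are invented; the function expects each line to hold four comma-separated numbers and returns them as four lists of floats.

    with open('channel.csv', 'w') as f:
        f.write('1.0,2.0,3.0,4.0\n')
        f.write('5.0,6.0,7.0,8.0\n')

    ps, rs, vs, qs = read_ch('channel.csv')
    print(ps)  # [1.0, 5.0]
    print(qs)  # [4.0, 8.0]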
def summary_params(results, yname=None, xname=None, alpha=.05, use_t=True,
                   skip_header=False, title=None):
    '''create a summary table for the parameters

    Parameters
    ----------
    results : results instance
        some required information is directly taken from the result instance
    yname : string or None
        optional name for the endogenous variable, default is "y"
    xname : list of strings or None
        optional names for the exogenous variables, default is "var_xx"
    alpha : float
        significance level for the confidence intervals
    use_t : bool
        indicator whether the p-values are based on the Student-t
        distribution (if True) or on the normal distribution (if False)
    skip_header : bool
        If False (default), then the header row is added. If True, then no
        header row is added.

    Returns
    -------
    params_table : SimpleTable instance
    '''

    # Parameters part of the summary table
    # ------------------------------------
    # Note: this is not necessary since we standardized names,
    # only t versus normal
    if isinstance(results, tuple):
        # for multivariate endog
        # TODO: check whether I don't want to refactor this
        # we need to give parameter alpha to conf_int
        results, params, std_err, tvalues, pvalues, conf_int = results
    else:
        params = results.params
        std_err = results.bse
        tvalues = results.tvalues  # is this sometimes called zvalues
        pvalues = results.pvalues
        conf_int = results.conf_int(alpha)

    # Dictionary to store the header names for the parameter part of the
    # summary table. look up by modeltype
    if use_t:
        param_header = ['coef', 'std err', 't', 'P>|t|',
                        '[' + str(alpha / 2), str(1 - alpha / 2) + ']']
    else:
        param_header = ['coef', 'std err', 'z', 'P>|z|',
                        '[' + str(alpha / 2), str(1 - alpha / 2) + ']']

    if skip_header:
        param_header = None

    _, xname = _getnames(results, yname=yname, xname=xname)

    if len(xname) != len(params):
        raise ValueError('xnames and params do not have the same length')

    params_stubs = xname

    exog_idx = lrange(len(xname))
    params_data = lzip([forg(params[i], prec=4) for i in exog_idx],
                       [forg(std_err[i]) for i in exog_idx],
                       [forg(tvalues[i]) for i in exog_idx],
                       ["%#6.3f" % (pvalues[i]) for i in exog_idx],
                       [forg(conf_int[i, 0]) for i in exog_idx],
                       [forg(conf_int[i, 1]) for i in exog_idx])
    parameter_table = SimpleTable(params_data,
                                  param_header,
                                  params_stubs,
                                  title=title,
                                  txt_fmt=fmt_params)

    return parameter_table
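A hedged usage sketch: build the parameter table for a fitted OLS model. The simulated data are illustrative only, and the import path assumes the helper lives in statsmodels.iolib.summary, its usual location.

    import numpy as np
    import statsmodels.api as sm
    from statsmodels.iolib.summary import summary_params

    rng = np.random.RandomState(0)
    x = sm.add_constant(rng.randn(100, 2))
    y = x @ np.array([1.0, 0.5, -0.25]) + rng.randn(100)
    res = sm.OLS(y, x).fit()

    table = summary_params(res, xname=['const', 'x1', 'x2'], alpha=0.05, use_t=True)
    print(table)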
def acf(x, unbiased=False, nlags=40, confint=None, qstat=False, fft=False, alpha=None): ''' Autocorrelation function for 1d arrays. Parameters ---------- x : array Time series data unbiased : bool If True, then denominators for autocovariance are n-k, otherwise n nlags: int, optional Number of lags to return autocorrelation for. confint : scalar, optional The use of confint is deprecated. See `alpha`. If a number is given, the confidence intervals for the given level are returned. For instance if confint=95, 95 % confidence intervals are returned where the standard deviation is computed according to Bartlett\'s formula. qstat : bool, optional If True, returns the Ljung-Box q statistic for each autocorrelation coefficient. See q_stat for more information. fft : bool, optional If True, computes the ACF via FFT. alpha : scalar, optional If a number is given, the confidence intervals for the given level are returned. For instance if alpha=.05, 95 % confidence intervals are returned where the standard deviation is computed according to Bartlett\'s formula. Returns ------- acf : array autocorrelation function confint : array, optional Confidence intervals for the ACF. Returned if confint is not None. qstat : array, optional The Ljung-Box Q-Statistic. Returned if q_stat is True. pvalues : array, optional The p-values associated with the Q-statistics. Returned if q_stat is True. Notes ----- The acf at lag 0 (ie., 1) is returned. This is based np.correlate which does full convolution. For very long time series it is recommended to use fft convolution instead. If unbiased is true, the denominator for the autocovariance is adjusted but the autocorrelation is not an unbiased estimtor. ''' nobs = len(x) d = nobs # changes if unbiased if not fft: avf = acovf(x, unbiased=unbiased, demean=True) #acf = np.take(avf/avf[0], range(1,nlags+1)) acf = avf[:nlags + 1] / avf[0] else: #JP: move to acovf x0 = x - x.mean() # ensure that we always use a power of 2 or 3 for zero-padding, # this way we'll ensure O(n log n) runtime of the fft. n = _next_regular(2 * nobs + 1) Frf = np.fft.fft(x0, n=n) # zero-pad for separability if unbiased: d = nobs - np.arange(nobs) acf = np.fft.ifft(Frf * np.conjugate(Frf))[:nobs] / d acf /= acf[0] #acf = np.take(np.real(acf), range(1,nlags+1)) acf = np.real(acf[:nlags + 1]) # keep lag 0 if not (confint or qstat or alpha): return acf if not confint is None: import warnings warnings.warn("confint is deprecated. Please use the alpha keyword", FutureWarning) varacf = np.ones(nlags + 1) / nobs varacf[0] = 0 varacf[1] = 1. / nobs varacf[2:] *= 1 + 2 * np.cumsum(acf[1:-1]**2) interval = stats.norm.ppf(1 - (100 - confint) / 200.) * np.sqrt(varacf) confint = np.array(lzip(acf - interval, acf + interval)) if not qstat: return acf, confint if alpha is not None: varacf = np.ones(nlags + 1) / nobs varacf[0] = 0 varacf[1] = 1. / nobs varacf[2:] *= 1 + 2 * np.cumsum(acf[1:-1]**2) interval = stats.norm.ppf(1 - alpha / 2.) * np.sqrt(varacf) confint = np.array(lzip(acf - interval, acf + interval)) if not qstat: return acf, confint if qstat: qstat, pvalue = q_stat(acf[1:], nobs=nobs) # drop lag 0 if (confint is not None or alpha is not None): return acf, confint, qstat, pvalue else: return acf, qstat, pvalue
freq=_freq_to_pandas[freq])) - 1 _quarter_to_day = { "1": (3, 31), "2": (6, 30), "3": (9, 30), "4": (12, 31), "I": (3, 31), "II": (6, 30), "III": (9, 30), "IV": (12, 31) } _mdays = [31, 28, 31, 30, 31, 30, 31, 31, 30, 31, 30, 31] _months_with_days = lzip(lrange(1, 13), _mdays) _month_to_day = dict(zip(map(str, lrange(1, 13)), _months_with_days)) _month_to_day.update( dict( zip([ "I", "II", "III", "IV", "V", "VI", "VII", "VIII", "IX", "X", "XI", "XII" ], _months_with_days))) # regex patterns _y_pattern = '^\d?\d?\d?\d$' _q_pattern = ''' ^ # beginning of string \d?\d?\d?\d # match any number 1-9999, includes leading zeros
def acf(x, unbiased=False, nlags=40, qstat=False, fft=False, alpha=None, missing='none'): """ Autocorrelation function for 1d arrays. Parameters ---------- x : array Time series data unbiased : bool If True, then denominators for autocovariance are n-k, otherwise n nlags: int, optional Number of lags to return autocorrelation for. qstat : bool, optional If True, returns the Ljung-Box q statistic for each autocorrelation coefficient. See q_stat for more information. fft : bool, optional If True, computes the ACF via FFT. alpha : scalar, optional If a number is given, the confidence intervals for the given level are returned. For instance if alpha=.05, 95 % confidence intervals are returned where the standard deviation is computed according to Bartlett\'s formula. missing : str, optional A string in ['none', 'raise', 'conservative', 'drop'] specifying how the NaNs are to be treated. Returns ------- acf : array autocorrelation function confint : array, optional Confidence intervals for the ACF. Returned if confint is not None. qstat : array, optional The Ljung-Box Q-Statistic. Returned if q_stat is True. pvalues : array, optional The p-values associated with the Q-statistics. Returned if q_stat is True. Notes ----- The acf at lag 0 (ie., 1) is returned. This is based np.correlate which does full convolution. For very long time series it is recommended to use fft convolution instead. If unbiased is true, the denominator for the autocovariance is adjusted but the autocorrelation is not an unbiased estimtor. References ---------- .. [*] Parzen, E., 1963. On spectral analysis with missing observations and amplitude modulation. Sankhya: The Indian Journal of Statistics, Series A, pp.383-392. """ nobs = len(x) # should this shrink for missing='drop' and NaNs in x? avf = acovf(x, unbiased=unbiased, demean=True, fft=fft, missing=missing) acf = avf[:nlags + 1] / avf[0] if not (qstat or alpha): return acf if alpha is not None: varacf = np.ones(nlags + 1) / nobs varacf[0] = 0 varacf[1] = 1. / nobs varacf[2:] *= 1 + 2 * np.cumsum(acf[1:-1]**2) interval = stats.norm.ppf(1 - alpha / 2.) * np.sqrt(varacf) confint = np.array(lzip(acf - interval, acf + interval)) if not qstat: return acf, confint if qstat: qstat, pvalue = q_stat(acf[1:], nobs=nobs) # drop lag 0 if alpha is not None: return acf, confint, qstat, pvalue else: return acf, qstat, pvalue
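A hedged usage sketch for the function above: the AR(1) series is simulated, and the unpacking assumes the (acf, confint, qstat, pvalue) return order documented when both alpha and qstat are requested.

    import numpy as np
    from statsmodels.tsa.stattools import acf

    rng = np.random.RandomState(0)
    n = 500
    x = np.zeros(n)
    for t in range(1, n):
        x[t] = 0.6 * x[t - 1] + rng.randn()

    acf_vals, confint, qstat, pvalues = acf(x, nlags=20, alpha=0.05, qstat=True, fft=True)
    print(acf_vals[:3])      # lag 0 is always 1
    print(confint.shape)     # (21, 2): lower and upper band for each lag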
def categorical(data, col=None, dictnames=False, drop=False): ''' Returns a dummy matrix given an array of categorical variables. Parameters ---------- data : array A structured array, recarray, array, Series or DataFrame. This can be either a 1d vector of the categorical variable or a 2d array with the column specifying the categorical variable specified by the col argument. col : {str, int, None} If data is a DataFrame col must in a column of data. If data is a Series, col must be either the name of the Series or None. If data is a structured array or a recarray, `col` can be a string that is the name of the column that contains the variable. For all other arrays `col` can be an int that is the (zero-based) column index number. `col` can only be None for a 1d array. The default is None. dictnames : bool, optional If True, a dictionary mapping the column number to the categorical name is returned. Used to have information about plain arrays. drop : bool Whether or not keep the categorical variable in the returned matrix. Returns ------- dummy_matrix, [dictnames, optional] A matrix of dummy (indicator/binary) float variables for the categorical data. If dictnames is True, then the dictionary is returned as well. Notes ----- This returns a dummy variable for EVERY distinct variable. If a a structured or recarray is provided, the names for the new variable is the old variable name - underscore - category name. So if the a variable 'vote' had answers as 'yes' or 'no' then the returned array would have to new variables-- 'vote_yes' and 'vote_no'. There is currently no name checking. Examples -------- >>> import numpy as np >>> import statsmodels.api as sm Univariate examples >>> import string >>> string_var = [string.ascii_lowercase[0:5], \ string.ascii_lowercase[5:10], \ string.ascii_lowercase[10:15], \ string.ascii_lowercase[15:20], \ string.ascii_lowercase[20:25]] >>> string_var *= 5 >>> string_var = np.asarray(sorted(string_var)) >>> design = sm.tools.categorical(string_var, drop=True) Or for a numerical categorical variable >>> instr = np.floor(np.arange(10,60, step=2)/10) >>> design = sm.tools.categorical(instr, drop=True) With a structured array >>> num = np.random.randn(25,2) >>> struct_ar = np.zeros((25,1), dtype=[('var1', 'f4'),('var2', 'f4'), \ ('instrument','f4'),('str_instr','a5')]) >>> struct_ar['var1'] = num[:,0][:,None] >>> struct_ar['var2'] = num[:,1][:,None] >>> struct_ar['instrument'] = instr[:,None] >>> struct_ar['str_instr'] = string_var[:,None] >>> design = sm.tools.categorical(struct_ar, col='instrument', drop=True) Or >>> design2 = sm.tools.categorical(struct_ar, col='str_instr', drop=True) ''' # TODO: add a NameValidator function if isinstance(col, (list, tuple)): if len(col) == 1: col = col[0] else: raise ValueError("Can only convert one column at a time") if (not isinstance(data, (pd.DataFrame, pd.Series)) and not isinstance(col, (string_types, int)) and col is not None): raise TypeError('col must be a str, int or None') # Pull out a Series from a DataFrame if provided if isinstance(data, pd.DataFrame): if col is None: raise TypeError('col must be a str or int when using a DataFrame') elif col not in data: raise ValueError('Column \'{0}\' not found in data'.format(col)) data = data[col] # Set col to None since we not have a Series col = None if isinstance(data, pd.Series): if col is not None and data.name != col: raise ValueError('data.name does not match col ' '\'{0}\''.format(col)) data_cat = pd.Categorical(data) dummies = pd.get_dummies(data_cat) col_map = { i: 
cat for i, cat in enumerate(data_cat.categories) if cat in dummies } if not drop: dummies.columns = list(dummies.columns) dummies = pd.concat([dummies, data], 1) if dictnames: return dummies, col_map return dummies # catch recarrays and structured arrays elif data.dtype.names or data.__class__ is np.recarray: if not col and np.squeeze(data).ndim > 1: raise IndexError("col is None and the input array is not 1d") if isinstance(col, (int, long)): col = data.dtype.names[col] if col is None and data.dtype.names and len(data.dtype.names) == 1: col = data.dtype.names[0] tmp_arr = np.unique(data[col]) # if the cols are shape (#,) vs (#,1) need to add an axis and flip _swap = True if data[col].ndim == 1: tmp_arr = tmp_arr[:, None] _swap = False tmp_dummy = (tmp_arr == data[col]).astype(float) if _swap: tmp_dummy = np.squeeze(tmp_dummy).swapaxes(1, 0) if not tmp_arr.dtype.names: # how do we get to this code path? tmp_arr = [asstr2(item) for item in np.squeeze(tmp_arr)] elif tmp_arr.dtype.names: tmp_arr = [asstr2(item) for item in np.squeeze(tmp_arr.tolist())] # prepend the varname and underscore, if col is numeric attribute # lookup is lost for recarrays... if col is None: try: col = data.dtype.names[0] except: col = 'var' # TODO: the above needs to be made robust because there could be many # var_yes, var_no varaibles for instance. tmp_arr = [col + '_' + item for item in tmp_arr] # TODO: test this for rec and structured arrays!!! if drop is True: if len(data.dtype) <= 1: if tmp_dummy.shape[0] < tmp_dummy.shape[1]: tmp_dummy = np.squeeze(tmp_dummy).swapaxes(1, 0) dt = lzip(tmp_arr, [tmp_dummy.dtype.str] * len(tmp_arr)) # preserve array type return np.array(lmap(tuple, tmp_dummy.tolist()), dtype=dt).view(type(data)) data = nprf.drop_fields(data, col, usemask=False, asrecarray=type(data) is np.recarray) data = nprf.append_fields(data, tmp_arr, data=tmp_dummy, usemask=False, asrecarray=type(data) is np.recarray) return data # Catch array_like for an error elif not isinstance(data, np.ndarray): raise NotImplementedError("array_like objects are not supported") else: if isinstance(col, (int, long)): offset = data.shape[1] # need error catching here? tmp_arr = np.unique(data[:, col]) tmp_dummy = (tmp_arr[:, np.newaxis] == data[:, col]).astype(float) tmp_dummy = tmp_dummy.swapaxes(1, 0) if drop is True: offset -= 1 data = np.delete(data, col, axis=1).astype(float) data = np.column_stack((data, tmp_dummy)) if dictnames is True: col_map = _make_dictnames(tmp_arr, offset) return data, col_map return data elif col is None and np.squeeze(data).ndim == 1: tmp_arr = np.unique(data) tmp_dummy = (tmp_arr[:, None] == data).astype(float) tmp_dummy = tmp_dummy.swapaxes(1, 0) if drop is True: if dictnames is True: col_map = _make_dictnames(tmp_arr) return tmp_dummy, col_map return tmp_dummy else: data = np.column_stack((data, tmp_dummy)) if dictnames is True: col_map = _make_dictnames(tmp_arr, offset=1) return data, col_map return data else: raise IndexError("The index %s is not understood" % col)
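The pandas branch above reduces to pd.Categorical plus pd.get_dummies. A small self-contained sketch with an invented Series shows the dummy matrix and the column map that branch would build.

    import pandas as pd

    s = pd.Series(['yes', 'no', 'yes', 'maybe'], name='vote')
    cat = pd.Categorical(s)
    dummies = pd.get_dummies(cat)
    col_map = {i: c for i, c in enumerate(cat.categories)}
    print(col_map)          # {0: 'maybe', 1: 'no', 2: 'yes'}
    print(dummies)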