def webuse(data, baseurl='http://www.stata-press.com/data/r11/', as_df=True):
    """
    Download a Stata example dataset and load it into memory.

    Parameters
    ----------
    data : str
        Name of dataset to fetch.
    baseurl : str
        The base URL to the stata datasets.
    as_df : bool
        If True, returns a `pandas.DataFrame`

    Returns
    -------
    dta : Record Array or DataFrame
        The output of ``genfromdta`` for the downloaded file.  When `as_df`
        is True that output is passed through ``DataFrame.from_records``.

    Examples
    --------
    >>> dta = webuse('auto')

    Notes
    -----
    Make sure baseurl has trailing forward slash. Doesn't do any error
    checking in response URLs.
    """
    # lazy imports
    from contextlib import closing
    from statsmodels.iolib import genfromdta

    url = urljoin(baseurl, data+'.dta')
    # Read the whole payload and release the HTTP response right away
    # instead of leaving the connection open until garbage collection.
    with closing(urlopen(url)) as response:
        dta = StringIO(response.read())  # make it truly file-like
    if as_df:  # could make this faster if we don't process dta twice?
        return DataFrame.from_records(genfromdta(dta))
    return genfromdta(dta)
def _coef_table(self):
    """Return the per-equation coefficient tables as one text block."""
    model = self.model
    neqs = model.neqs
    row_labels = self.model.exog_names

    # one (coefficient, std. error, t-stat, prob) tuple per parameter
    rows = lzip(model.params.T.ravel(),
                model.stderr.T.ravel(),
                model.tvalues.T.ravel(),
                model.pvalues.T.ravel())
    columns = ('coefficient', 'std. error', 't-stat', 'prob')

    out = StringIO()
    # number of consecutive rows sliced out for a single equation
    rows_per_eq = neqs * model.k_ar + model.k_trend

    for eq in range(neqs):
        out.write("Results for equation %s" % model.names[eq] + '\n')

        first = rows_per_eq * eq
        last = rows_per_eq * (eq + 1)
        table = SimpleTable(rows[first:last], columns, row_labels,
                            title=None, txt_fmt=self.default_fmt)
        out.write(str(table) + '\n')

        # blank separator line between equations, none after the last one
        if eq < neqs - 1:
            out.write('\n')

    return out.getvalue()
def _coef_table(self):
    """Build the text of the coefficient section, one table per equation."""
    model = self.model
    n_equations = model.neqs
    stubs = self.model.exog_names

    stats = lzip(model.params.T.ravel(), model.stderr.T.ravel(),
                 model.tvalues.T.ravel(), model.pvalues.T.ravel())
    col_titles = ('coefficient', 'std. error', 't-stat', 'prob')

    text = StringIO()
    # size of the slice of `stats` that is shown for each equation
    block = n_equations * model.k_ar + model.k_trend
    last_eq = n_equations - 1

    for eq_no in range(n_equations):
        heading = "Results for equation %s" % model.names[eq_no]
        text.write(heading + '\n')

        eq_rows = stats[block * eq_no:block * (eq_no + 1)]
        eq_table = SimpleTable(eq_rows, col_titles, stubs, title=None,
                               txt_fmt=self.default_fmt)
        text.write(str(eq_table) + '\n')

        if eq_no < last_eq:
            # separate consecutive equations with an empty line
            text.write('\n')

    return text.getvalue()
def make(self, endog_names=None, exog_names=None):
    """
    Assemble the full VAR summary text.

    The header, statistics, coefficient and residual sections are rendered
    in that order, each followed by a newline.  `endog_names` and
    `exog_names` are accepted but not used here.
    """
    sections = (
        self._header_table(),
        self._stats_table(),
        self._coef_table(),
        self._resid_info(),
    )
    out = StringIO()
    for section in sections:
        out.write(section + '\n')
    return out.getvalue()
def test_formula_labels():
    """Row labels of the input frame must survive the formula interface."""
    from pandas import read_csv

    # data(Duncan) from car in R; the header names four columns while each
    # row carries five fields
    raw = StringIO(
        '"type","income","education","prestige"\n'
        '"accountant","prof",62,86,82\n'
        '"pilot","prof",72,76,83\n'
        '"architect","prof",75,92,90\n'
        '"author","prof",55,90,76\n'
        '"chemist","prof",64,86,90\n'
        '"minister","prof",21,84,87\n'
        '"professor","prof",64,93,93\n'
        '"dentist","prof",80,100,90\n'
        '"reporter","wc",67,87,52\n'
        '"engineer","prof",72,86,88\n'
        '"undertaker","prof",42,74,57\n'
        '"lawyer","prof",76,98,89\n'
        '"physician","prof",76,97,97\n'
        '"welfare.worker","prof",41,84,59\n'
        '"teacher","prof",48,91,73\n'
        '"conductor","wc",76,34,38\n'
        '"contractor","prof",53,45,76\n'
        '"factory.owner","prof",60,56,81\n'
        '"store.manager","prof",42,44,45\n'
        '"banker","prof",78,82,92\n'
        '"bookkeeper","wc",29,72,39\n'
        '"mail.carrier","wc",48,55,34\n'
        '"insurance.agent","wc",55,71,41\n'
        '"store.clerk","wc",29,50,16\n'
        '"carpenter","bc",21,23,33\n'
        '"electrician","bc",47,39,53\n'
        '"RR.engineer","bc",81,28,67\n'
        '"machinist","bc",36,32,57\n'
        '"auto.repairman","bc",22,22,26\n'
        '"plumber","bc",44,25,29\n'
        '"gas.stn.attendant","bc",15,29,10\n'
        '"coal.miner","bc",7,7,15\n'
        '"streetcar.motorman","bc",42,26,19\n'
        '"taxi.driver","bc",9,19,10\n'
        '"truck.driver","bc",21,15,13\n'
        '"machine.operator","bc",21,20,24\n'
        '"barber","bc",16,26,20\n'
        '"bartender","bc",16,28,7\n'
        '"shoe.shiner","bc",9,17,3\n'
        '"cook","bc",14,22,16\n'
        '"soda.clerk","bc",12,30,6\n'
        '"watchman","bc",17,25,11\n'
        '"janitor","bc",7,20,8\n'
        '"policeman","bc",34,47,41\n'
        '"waiter","bc",8,32,10'
    )
    frame = read_csv(raw)
    fitted = ols("prestige ~ income + education", frame).fit()
    # the fitted values must be indexed exactly like the input frame
    assert_equal(fitted.fittedvalues.index, frame.index)
def test_formula_labels():
    """Row labels of the input frame must survive the formula interface."""
    from pandas import read_table

    # data(Duncan) from car in R; the header names four columns while each
    # row carries five fields
    raw = StringIO(
        '"type" "income" "education" "prestige"\n'
        '"accountant" "prof" 62 86 82\n'
        '"pilot" "prof" 72 76 83\n'
        '"architect" "prof" 75 92 90\n'
        '"author" "prof" 55 90 76\n'
        '"chemist" "prof" 64 86 90\n'
        '"minister" "prof" 21 84 87\n'
        '"professor" "prof" 64 93 93\n'
        '"dentist" "prof" 80 100 90\n'
        '"reporter" "wc" 67 87 52\n'
        '"engineer" "prof" 72 86 88\n'
        '"undertaker" "prof" 42 74 57\n'
        '"lawyer" "prof" 76 98 89\n'
        '"physician" "prof" 76 97 97\n'
        '"welfare.worker" "prof" 41 84 59\n'
        '"teacher" "prof" 48 91 73\n'
        '"conductor" "wc" 76 34 38\n'
        '"contractor" "prof" 53 45 76\n'
        '"factory.owner" "prof" 60 56 81\n'
        '"store.manager" "prof" 42 44 45\n'
        '"banker" "prof" 78 82 92\n'
        '"bookkeeper" "wc" 29 72 39\n'
        '"mail.carrier" "wc" 48 55 34\n'
        '"insurance.agent" "wc" 55 71 41\n'
        '"store.clerk" "wc" 29 50 16\n'
        '"carpenter" "bc" 21 23 33\n'
        '"electrician" "bc" 47 39 53\n'
        '"RR.engineer" "bc" 81 28 67\n'
        '"machinist" "bc" 36 32 57\n'
        '"auto.repairman" "bc" 22 22 26\n'
        '"plumber" "bc" 44 25 29\n'
        '"gas.stn.attendant" "bc" 15 29 10\n'
        '"coal.miner" "bc" 7 7 15\n'
        '"streetcar.motorman" "bc" 42 26 19\n'
        '"taxi.driver" "bc" 9 19 10\n'
        '"truck.driver" "bc" 21 15 13\n'
        '"machine.operator" "bc" 21 20 24\n'
        '"barber" "bc" 16 26 20\n'
        '"bartender" "bc" 16 28 7\n'
        '"shoe.shiner" "bc" 9 17 3\n'
        '"cook" "bc" 14 22 16\n'
        '"soda.clerk" "bc" 12 30 6\n'
        '"watchman" "bc" 17 25 11\n'
        '"janitor" "bc" 7 20 8\n'
        '"policeman" "bc" 34 47 41\n'
        '"waiter" "bc" 8 32 10'
    )
    frame = read_table(raw, sep=" ")
    fitted = ols("prestige ~ income + education", frame).fit()
    # the fitted values must be indexed exactly like the input frame
    assert_equal(fitted.fittedvalues.index, frame.index)
def print_ic_table(ics, selected_orders):
    """
    Print the VAR order-selection table.

    Every key of `ics` becomes a column (in sorted order) and the row given
    by `selected_orders` for that key is flagged with a trailing '*'.
    """
    # Can factor this out into a utility method if so desired
    cols = sorted(ics)

    formatted = []
    for name in cols:
        formatted.append(["%#10.4g" % value for value in ics[name]])
    # transpose so that each criterion is a column
    data = mat(formatted, dtype=object).T

    # flag the selected order of every criterion
    for col_no, name in enumerate(cols):
        cell = (int(selected_orders[name]), col_no)
        data[cell] = data[cell] + '*'

    table_fmt = dict(_default_table_fmt, data_fmts=("%s", ) * len(cols))
    table = SimpleTable(data, cols, lrange(len(data)),
                        title='VAR Order Selection', txt_fmt=table_fmt)

    out = StringIO()
    out.write(str(table) + '\n')
    out.write('* Minimum' + '\n')
    print(out.getvalue())
def pprint_matrix(values, rlabels, clabels, col_space=None):
    """
    Render `values` as fixed-width text with row and column labels.

    `values` is indexed as ``values[i, j]``.  When `col_space` is None each
    column is as wide as its label plus two, with a minimum of 10; otherwise
    `col_space` is used as the width of every column.  Returns the text.
    """
    # n_rows is not used below; only the column count is needed
    n_rows, n_cols = len(rlabels), len(clabels)

    if col_space is not None:
        widths = (col_space, ) * n_cols
    else:
        narrowest = 10
        widths = [max(len(str(label)) + 2, narrowest) for label in clabels]

    stub_width = max([len(str(label)) for label in rlabels]) + 2

    out = StringIO()

    # header: empty stub cell followed by the column labels
    header = _pfixed('', stub_width)
    for col, label in enumerate(clabels):
        header += _pfixed(label, widths[col])
    out.write(header + '\n')

    # body: left-justified row label followed by the formatted cells
    for row, label in enumerate(rlabels):
        text = ('%s' % label).ljust(stub_width)
        for col in range(n_cols):
            text += _pfixed(values[row, col], widths[col])
        out.write(text + '\n')

    return out.getvalue()
def _resid_info(self):
    """Return the residual correlation matrix section as text."""
    labels = self.model.names
    title = "Correlation matrix of residuals"
    # the same names label both the rows and the columns
    matrix_text = pprint_matrix(self.model.resid_corr, labels, labels)

    out = StringIO()
    for chunk in (title, matrix_text):
        out.write(chunk + '\n')
    return out.getvalue()
def load_basic_data():
    """
    Build the basic example frame from an embedded CSV.

    Returns a DataFrame with the parsed ``res`` and ``qual`` columns plus
    ``conc`` (a copy of ``res``) and ``censored`` (True where ``qual`` is
    'ND').
    """
    csv_text = (
        "res,qual\n"
        "2.00,=\n4.20,=\n4.62,=\n5.00,ND\n5.00,ND\n"
        "5.50,ND\n5.57,=\n5.66,=\n5.75,ND\n5.86,=\n"
        "6.65,=\n6.78,=\n6.79,=\n7.50,=\n7.50,=\n"
        "7.50,=\n8.63,=\n8.71,=\n8.99,=\n9.50,ND\n"
        "9.50,ND\n9.85,=\n10.82,=\n11.00,ND\n11.25,=\n"
        "11.25,=\n12.20,=\n14.92,=\n16.77,=\n17.81,=\n"
        "19.16,=\n19.19,=\n19.64,=\n20.18,=\n22.97,=\n"
    )
    parsed = pandas.read_csv(StringIO(csv_text))
    with_conc = parsed.assign(conc=lambda frame: frame['res'])
    result = with_conc.assign(censored=lambda frame: frame['qual'] == 'ND')
    return result
def _get_dataset_meta(dataname, package, cache):
    """
    Return the "Title" entry of the Rdatasets index for one dataset.

    The index CSV is fetched through `_urlopen_cached` and the row whose
    Item equals `dataname` and whose Package equals `package` is selected.
    """
    # get the index, you'll probably want this cached because you have
    # to download info about all the data to get info about any of the data...
    index_url = ("https://raw.githubusercontent.com/"
                 "vincentarelbundock/Rdatasets/master/datasets.csv")
    payload, _ = _urlopen_cached(index_url, cache)
    text = payload.decode('utf-8', 'strict')
    index = read_csv(StringIO(text))

    item_matches = index.Item == dataname
    package_matches = index.Package == package
    selected = index.loc[np.logical_and(item_matches, package_matches)]
    titles = selected["Title"]
    return titles.item()
def _get_data(base_url, dataname, cache, extension="csv"):
    """
    Fetch one dataset file and return it as a text buffer.

    Returns a ``(StringIO, from_cache)`` pair.  An HTTPError whose message
    contains '404' is turned into a ValueError; any other HTTPError is
    re-raised.
    """
    filename = (dataname + ".%s") % extension
    url = base_url + filename
    try:
        payload, from_cache = _urlopen_cached(url, cache)
    except HTTPError as err:
        if '404' not in str(err):
            raise err
        raise ValueError("Dataset %s was not found." % dataname)
    text = payload.decode('utf-8', 'strict')
    return StringIO(text), from_cache
def pprint_matrix(values, rlabels, clabels, col_space=None):
    """
    Format a labelled matrix as fixed-width text and return it.

    Cells are read as ``values[i, j]``.  Column widths default to the label
    length plus two (at least 10); a non-None `col_space` is applied to
    every column instead.
    """
    # the row count is computed but only the column count is used
    row_count, col_count = len(rlabels), len(clabels)

    if col_space is None:
        floor = 10
        col_widths = [max(len(str(name)) + 2, floor) for name in clabels]
    else:
        col_widths = (col_space,) * col_count

    label_width = max([len(str(name)) for name in rlabels]) + 2

    lines = StringIO()

    # first line: blank stub, then one cell per column label
    top = _pfixed('', label_width)
    for c, name in enumerate(clabels):
        top += _pfixed(name, col_widths[c])
    lines.write(top + '\n')

    # one line per row label
    for r, name in enumerate(rlabels):
        current = ('%s' % name).ljust(label_width)
        for c in range(col_count):
            current += _pfixed(values[r, c], col_widths[c])
        lines.write(current + '\n')

    return lines.getvalue()
def print_ic_table(ics, selected_orders):
    """
    Print a table of information criteria for VAR order selection.

    Columns are the sorted keys of `ics`; for each key the row index taken
    from `selected_orders` gets a '*' appended.
    """
    # Can factor this out into a utility method if so desired
    cols = sorted(ics)
    cells = [["%#10.4g" % entry for entry in ics[key]] for key in cols]
    data = mat(cells, dtype=object).T

    # append an asterisk to the chosen order in every column
    col_no = 0
    for key in cols:
        position = int(selected_orders[key]), col_no
        data[position] = data[position] + '*'
        col_no += 1

    fmt = dict(_default_table_fmt, data_fmts=("%s",) * len(cols))
    stubs = lrange(len(data))
    table = SimpleTable(data, cols, stubs, title='VAR Order Selection',
                        txt_fmt=fmt)

    text = StringIO()
    text.write(str(table) + '\n')
    text.write('* Minimum' + '\n')
    print(text.getvalue())
def summary(self):
    """Print one 'FEVD for <name>' matrix per equation."""
    out = StringIO()
    # row labels shared by every matrix
    period_labels = lrange(self.periods)

    for eq in range(self.neqs):
        matrix_text = output.pprint_matrix(self.decomp[eq], period_labels,
                                           self.names)
        out.write('FEVD for %s\n' % self.names[eq])
        out.write(matrix_text + '\n')

    print(out.getvalue())
def check_pickle(obj):
    """
    Round-trip `obj` through pickle using an in-memory buffer.

    Parameters
    ----------
    obj : object
        Any picklable object.

    Returns
    -------
    res : object
        The object obtained by loading the pickled data back.
    plen : int
        Position of the buffer right after dumping, i.e. how much was
        written for the pickle.
    """
    fh = StringIO()
    try:
        cPickle.dump(obj, fh)
        # tell() belongs to the file-object API; the ``pos`` attribute does
        # not (io.BytesIO / io.StringIO have no ``pos``).
        plen = fh.tell()
        fh.seek(0, 0)
        res = cPickle.load(fh)
    finally:
        # release the buffer even when pickling or unpickling fails
        fh.close()
    return res, plen
def get_ic_table(ics, selected_orders):
    '''
    Convert lag-order selection results into a text table.

    :param ics: lag-order results; each key becomes a column (sorted) and its
        values fill the rows
    :param selected_orders: for every key of ``ics``, the row index (lag
        order) whose cell is marked with a trailing '*'
    :return: the lag-order analysis rendered as a table, as a string
    '''
    base_fmt = {
        'empty_cell': '',
        'colsep': ' ',
        'row_pre': '',
        'row_post': '',
        'table_dec_above': '=',
        'table_dec_below': '=',
        'header_dec_below': '-',
        'header_fmt': '%s',
        'stub_fmt': '%s',
        'title_align': 'c',
        'header_align': 'r',
        'data_aligns': 'r',
        'stubs_align': 'l',
        'fmt': 'txt',
    }

    cols = sorted(ics)
    formatted = []
    for name in cols:
        formatted.append(["%#10.4g" % value for value in ics[name]])
    # transpose so that each criterion is a column
    data = np.array(formatted, dtype=object).T

    # flag the selected order of every criterion
    for col_no, name in enumerate(cols):
        cell = (int(selected_orders[name]), col_no)
        data[cell] = data[cell] + '*'

    table_fmt = dict(base_fmt, data_fmts=("%s", ) * len(cols))
    table = SimpleTable(data, cols, lrange(len(data)),
                        title='VAR Order Selection', txt_fmt=table_fmt)

    out = StringIO()
    out.write(str(table) + '\n')
    out.write('* Minimum' + '\n')
    return out.getvalue()
# In-memory text buffer holding a whitespace-separated table whose header
# names the columns Days, Duration, Weight and ID.
# NOTE(review): all records appear joined onto a single line here; confirm the
# original line breaks (one record of four fields per line?) before parsing.
kidney_table = StringIO("""Days Duration Weight ID 0.0 1 1 1 2.0 1 1 2 1.0 1 1 3 3.0 1 1 4 0.0 1 1 5 2.0 1 1 6 0.0 1 1 7 5.0 1 1 8 6.0 1 1 9 8.0 1 1 10 2.0 1 2 1 4.0 1 2 2 7.0 1 2 3 12.0 1 2 4 15.0 1 2 5 4.0 1 2 6 3.0 1 2 7 1.0 1 2 8 5.0 1 2 9 20.0 1 2 10 15.0 1 3 1 10.0 1 3 2 8.0 1 3 3 5.0 1 3 4 25.0 1 3 5 16.0 1 3 6 7.0 1 3 7 30.0 1 3 8 3.0 1 3 9 27.0 1 3 10 0.0 2 1 1 1.0 2 1 2 1.0 2 1 3 0.0 2 1 4 4.0 2 1 5 2.0 2 1 6 7.0 2 1 7 4.0 2 1 8 0.0 2 1 9 3.0 2 1 10 5.0 2 2 1 3.0 2 2 2 2.0 2 2 3 0.0 2 2 4 1.0 2 2 5 1.0 2 2 6 3.0 2 2 7 6.0 2 2 8 7.0 2 2 9 9.0 2 2 10 10.0 2 3 1 8.0 2 3 2 12.0 2 3 3 3.0 2 3 4 7.0 2 3 5 15.0 2 3 6 4.0 2 3 7 9.0 2 3 8 6.0 2 3 9 1.0 2 3 10 """)
class Test_ROS_RNADAdata(CheckROSMixin):
    """
    Fixture data for the checks inherited from CheckROSMixin.

    `df` is parsed from an embedded whitespace-separated table with a result
    column ('res') and a boolean censoring column ('cen').  `expected_final`
    and `expected_cohn` hold the reference values; presumably they are
    compared against computed results by the mixin -- confirm there.
    """
    decimal = 3
    datastring = StringIO(dedent("""\
        res cen
        0.090 True
        0.090 True
        0.090 True
        0.101 False
        0.136 False
        0.340 False
        0.457 False
        0.514 False
        0.629 False
        0.638 False
        0.774 False
        0.788 False
        0.900 True
        0.900 True
        0.900 True
        1.000 True
        1.000 True
        1.000 True
        1.000 True
        1.000 True
        1.000 False
        1.000 True
        1.000 True
        1.000 True
        1.000 True
        1.000 True
        1.000 True
        1.000 True
        1.000 True
        1.000 True
        1.000 True
        1.000 True
        1.000 True
        1.100 False
        2.000 False
        2.000 False
        2.404 False
        2.860 False
        3.000 False
        3.000 False
        3.705 False
        4.000 False
        5.000 False
        5.960 False
        6.000 False
        7.214 False
        16.000 False
        17.716 False
        25.000 False
        51.000 False"""
    ))
    rescol = 'res'
    cencol = 'cen'
    # raw string: '\s' is not a valid escape sequence in a plain literal
    df = pandas.read_csv(datastring, sep=r'\s+')
    expected_final = numpy.array([
        0.01907990, 0.03826254, 0.06080717, 0.10100000, 0.13600000,
        0.34000000, 0.45700000, 0.51400000, 0.62900000, 0.63800000,
        0.77400000, 0.78800000, 0.08745914, 0.25257575, 0.58544205,
        0.01711153, 0.03373885, 0.05287083, 0.07506079, 0.10081573,
        1.00000000, 0.13070334, 0.16539309, 0.20569039, 0.25257575,
        0.30725491, 0.37122555, 0.44636843, 0.53507405, 0.64042242,
        0.76644378, 0.91850581, 1.10390531, 1.10000000, 2.00000000,
        2.00000000, 2.40400000, 2.86000000, 3.00000000, 3.00000000,
        3.70500000, 4.00000000, 5.00000000, 5.96000000, 6.00000000,
        7.21400000, 16.00000000, 17.71600000, 25.00000000, 51.00000000
    ])
    expected_cohn = pandas.DataFrame({
        'nuncen_above': numpy.array([9., 0.0, 18., numpy.nan]),
        'nobs_below': numpy.array([3., 15., 32., numpy.nan]),
        'ncen_equal': numpy.array([3., 3., 17., numpy.nan]),
        'prob_exceedance': numpy.array([0.84, 0.36, 0.36, 0]),
    })
def hypothesis_test_table(results, title, null_hyp):
    """
    Render a hypothesis test result as text.

    The output is `title`, a one-row table (statistic, critical value,
    p-value, df), `null_hyp`, and a conclusion line with the significance
    level in percent.
    """
    number_fmts = ["%#15.6F", "%#15.6F", "%#15.3F", "%s"]
    fmt = dict(_default_table_fmt, data_fmts=number_fmts)
    out = StringIO()

    row = [results['statistic'], results['crit_value'], results['pvalue'],
           str(results['df'])]
    headers = ['Test statistic', 'Critical Value', 'p-value', 'df']
    table = SimpleTable([row], headers, [''], title=None, txt_fmt=fmt)

    out.write(title + '\n')
    out.write(str(table) + '\n')
    out.write(null_hyp + '\n')
    out.write("Conclusion: %s H_0" % results['conclusion'])
    # `signif` is a fraction; show it as a percentage
    percent = results['signif'] * 100
    out.write(" at %.2f%% significance level" % percent)
    return out.getvalue()
def open(self, filename):
    """
    Return the contents of the first member whose name contains `filename`.

    Members are taken from ``self.namelist()`` in order; the match is a
    substring test.  The data read for that member is wrapped in a StringIO.
    Raises IndexError when no member name matches.
    """
    candidates = []
    for member in self.namelist():
        if filename in member:
            candidates.append(member)
    first_match = candidates[0]
    return StringIO(self.read(first_match))
def hypothesis_test_table(results, title, null_hyp):
    """
    Build the text report for a hypothesis test.

    Writes `title`, a single-row table of the test statistic, critical
    value, p-value and df, then `null_hyp` and the conclusion together with
    the significance level expressed in percent.
    """
    fmt = dict(_default_table_fmt,
               data_fmts=["%#15.6F", "%#15.6F", "%#15.3F", "%s"])
    text = StringIO()

    values = [
        results['statistic'],
        results['crit_value'],
        results['pvalue'],
        str(results['df']),
    ]
    col_titles = ['Test statistic', 'Critical Value', 'p-value', 'df']
    table = SimpleTable([values], col_titles, [''], title=None, txt_fmt=fmt)

    for line in (title, str(table), null_hyp):
        text.write(line + '\n')

    conclusion = "Conclusion: %s H_0" % results['conclusion']
    text.write(conclusion)
    # the stored significance is a fraction, reported here as percent
    level = " at %.2f%% significance level" % (results['signif'] * 100)
    text.write(level)
    return text.getvalue()
]) cyl_labels = np.array([ 'USA', 'USA', 'USA', 'USA', 'USA', 'USA', 'USA', 'USA', 'USA', 'USA', 'France', 'USA', 'USA', 'USA', 'USA', 'USA', 'USA', 'USA', 'USA', 'USA', 'Japan', 'USA', 'USA', 'USA', 'Japan', 'Germany', 'France', 'Germany', 'Sweden', 'Germany', 'USA', 'USA', 'USA', 'USA', 'USA', 'Germany', 'USA', 'USA', 'France', 'USA', 'USA', 'USA', 'USA', 'USA', 'USA', 'USA', 'USA', 'USA', 'USA', 'Germany', 'Japan', 'USA', 'USA', 'USA', 'USA', 'Germany', 'Japan', 'Japan', 'USA', 'Sweden', 'USA', 'France', 'Japan', 'Germany', 'USA', 'USA', 'USA', 'USA', 'USA', 'USA', 'USA', 'USA', 'USA', 'USA', 'USA', 'USA', 'Germany', 'Japan', 'Japan', 'USA', 'USA', 'Japan', 'Japan', 'Japan', 'Japan', 'Japan', 'Japan', 'USA', 'USA', 'USA', 'USA', 'Japan', 'USA', 'USA', 'USA', 'Germany', 'USA', 'USA', 'USA' ]) dta = np.recfromtxt(StringIO(ss), names=("Rust", "Brand", "Replication")) dta2 = np.recfromtxt(StringIO(ss2), names=("idx", "Treatment", "StressReduction")) dta3 = np.recfromtxt(StringIO(ss3), names=("Brand", "Relief")) from statsmodels.sandbox.stats.multicomp import tukeyhsd import statsmodels.sandbox.stats.multicomp as multi #print tukeyhsd(dta['Brand'], dta['Rust']) def get_thsd(mci): var_ = np.var(mci.groupstats.groupdemean(), ddof=len(mci.groupsunique)) means = mci.groupstats.groupmean nobs = mci.groupstats.groupnobs resi = tukeyhsd(means, nobs,
DEBUG = False ss = '''\ agecat smokes deaths pyears 1 1 32 52407 2 1 104 43248 3 1 206 28612 4 1 186 12663 5 1 102 5317 1 0 2 18790 2 0 12 10673 3 0 28 5710 4 0 28 2585 5 0 31 1462''' data = pd.read_csv(StringIO(ss), delimiter='\t') data = data.astype(int) data['logpyears'] = np.log(data['pyears']) class CheckPoissonConstrainedMixin(object): def test_basic(self): res1 = self.res1 res2 = self.res2 assert_allclose(res1[0], res2.params[self.idx], rtol=1e-6) # see below Stata has nan, we have zero bse1 = np.sqrt(np.diag(res1[1])) mask = (bse1 == 0) & np.isnan(res2.bse[self.idx]) assert_allclose(bse1[~mask], res2.bse[self.idx][~mask], rtol=1e-6) def test_basic_method(self):
Means Simultaneous 95% Confidence Limits Sign. 2 - 3 4.340 0.691 7.989 *** 2 - 1 4.600 0.951 8.249 *** 3 - 2 -4.340 -7.989 -0.691 *** 3 - 1 0.260 -3.389 3.909 - 1 - 2 -4.600 -8.249 -0.951 *** 1 - 3 -0.260 -3.909 3.389 ''' ss5 = '''\ 2 - 3 4.340 0.691 7.989 *** 2 - 1 4.600 0.951 8.249 *** 3 - 2 -4.340 -7.989 -0.691 *** 3 - 1 0.260 -3.389 3.909 - 1 - 2 -4.600 -8.249 -0.951 *** 1 - 3 -0.260 -3.909 3.389 ''' dta5 = np.recfromtxt(StringIO(ss5), names=('pair', 'mean', 'lower', 'upper', 'sig'), delimiter='\t') sas_ = dta5[[1, 3, 2]] confint1 = res3[1][4] confint2 = sas_[['lower', 'upper']].view(float).reshape((3, 2)) assert_almost_equal(confint1, confint2, decimal=2) reject1 = res3[1][1] reject2 = sas_['sig'] == '***' assert_equal(reject1, reject2) meandiff1 = res3[1][2] meandiff2 = sas_['mean'] assert_almost_equal(meandiff1, meandiff2, decimal=14)
def setup_module():
    """
    Redirect ``sys.stdout`` into a fresh StringIO buffer.

    The previous stream is kept in the module-level `_orig_stdout`.
    """
    global _orig_stdout
    # remember the current stream and swap in the buffer in a single step
    _orig_stdout, sys.stdout = sys.stdout, StringIO()