def handle_missing(cls, endog, exog, missing, **kwargs):
    """
    This returns a dictionary with keys endog, exog and the keys of
    kwargs. It preserves Nones.
    """
    none_array_names = []

    if exog is not None:
        combined = (endog, exog)
        combined_names = ['endog', 'exog']
    else:
        combined = (endog,)
        combined_names = ['endog']
        none_array_names += ['exog']

    # deal with other arrays
    combined_2d = ()
    combined_2d_names = []
    if len(kwargs):
        for key, value_array in iteritems(kwargs):
            if value_array is None or value_array.ndim == 0:
                none_array_names += [key]
                continue
            # grab 1d arrays
            if value_array.ndim == 1:
                combined += (value_array,)
                combined_names += [key]
            elif value_array.squeeze().ndim == 1:
                combined += (value_array,)
                combined_names += [key]
            # grab 2d arrays that are _assumed_ to be symmetric
            elif value_array.ndim == 2:
                combined_2d += (value_array,)
                combined_2d_names += [key]
            else:
                raise ValueError("Arrays with more than 2 dimensions "
                                 "aren't yet handled")

    nan_mask = _nan_rows(*combined)
    if combined_2d:
        nan_mask = _nan_rows(*(nan_mask[:, None],) + combined_2d)

    if missing == 'raise' and np.any(nan_mask):
        raise MissingDataError("NaNs were encountered in the data")
    elif missing == 'drop':
        nan_mask = ~nan_mask
        drop_nans = lambda x: cls._drop_nans(x, nan_mask)
        drop_nans_2d = lambda x: cls._drop_nans_2d(x, nan_mask)
        combined = dict(zip(combined_names, lmap(drop_nans, combined)))
        if combined_2d:
            combined.update(dict(zip(combined_2d_names,
                                     lmap(drop_nans_2d, combined_2d))))
        if none_array_names:
            combined.update(dict(zip(none_array_names,
                                     [None] * len(none_array_names))))
        return combined, np.where(~nan_mask)[0].tolist()
    else:
        raise ValueError("missing option %s not understood" % missing)
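# A minimal, self-contained sketch of the row-mask idea handle_missing is
# built on (hedged: `nan_rows` below is a hypothetical stand-in for the
# internal `_nan_rows` helper, not its actual implementation): stack the
# arrays column-wise, flag every row holding a NaN, keep the complement.
import numpy as np

def nan_rows(*arrs):
    """Boolean mask of rows that contain a NaN in any of the arrays."""
    cols = [np.asarray(a, dtype=float).reshape(len(a), -1) for a in arrs]
    return np.isnan(np.column_stack(cols)).any(axis=1)

endog = np.array([1.0, np.nan, 3.0, 4.0])
exog = np.array([[1.0, 2.0], [3.0, 4.0], [np.nan, 6.0], [7.0, 8.0]])
mask = nan_rows(endog, exog)                      # [False, True, True, False]
print(endog[~mask], np.where(~mask)[0].tolist())  # kept rows and their indices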
def test__reduce_dict():
    data = OrderedDict(zip(list(product('mf', 'oy', 'wn')), [1] * 8))
    eq(_reduce_dict(data, ('m',)), 4)
    eq(_reduce_dict(data, ('m', 'o')), 2)
    eq(_reduce_dict(data, ('m', 'o', 'w')), 1)
    data = OrderedDict(zip(list(product('mf', 'oy', 'wn')), lrange(8)))
    eq(_reduce_dict(data, ('m',)), 6)
    eq(_reduce_dict(data, ('m', 'o')), 1)
    eq(_reduce_dict(data, ('m', 'o', 'w')), 0)
def simulate(self):

    group_effect_var = self.dep_params[0]

    vcomp = self.dep_params[1:]
    vcomp.append(0)

    endog, exog, group, id_matrix = [], [], [], []
    for i in range(self.ngroups):

        iterators = [lrange(n) for n in self.nest_sizes]

        # The random effects
        variances = [np.sqrt(v) * np.random.normal(size=n)
                     for v, n in zip(vcomp, self.nest_sizes)]

        gpe = np.random.normal() * np.sqrt(group_effect_var)

        nest_all = []
        for j in self.nest_sizes:
            nest_all.append(set())

        for nest in product(*iterators):

            group.append(i)

            # The sum of all random effects that apply to this unit
            ref = gpe + sum([v[j] for v, j in zip(variances, nest)])

            exog1 = np.random.normal(size=5)
            exog1[0] = 1
            exog.append(exog1)

            error = ref + self.error_sd * np.random.normal()

            endog1 = np.dot(exog1, self.params) + error
            endog.append(endog1)

            for j in range(len(nest)):
                nest_all[j].add(tuple(nest[0:j + 1]))

            nest1 = [len(x) - 1 for x in nest_all]
            id_matrix.append(nest1[0:-1])

    self.exog = np.array(exog)
    self.endog = np.array(endog)
    self.group = np.array(group)
    self.id_matrix = np.array(id_matrix)
    self.time = np.zeros_like(self.endog)
def test_freq_to_period():
    from pandas.tseries.frequencies import to_offset
    freqs = ['A', 'AS-MAR', 'Q', 'QS', 'QS-APR', 'W', 'W-MON', 'B']
    expected = [1, 1, 4, 4, 4, 52, 52, 52]
    for i, j in zip(freqs, expected):
        assert_equal(tools.freq_to_period(i), j)
        assert_equal(tools.freq_to_period(to_offset(i)), j)
def test_ic():
    # test information criteria

    # consistency check
    ics = [aic, aicc, bic, hqic]
    ics_sig = [aic_sigma, aicc_sigma, bic_sigma, hqic_sigma]

    for ic, ic_sig in zip(ics, ics_sig):
        assert_(ic(np.array(2), 10, 2).dtype == np.float, msg=repr(ic))
        assert_(ic_sig(np.array(2), 10, 2).dtype == np.float,
                msg=repr(ic_sig))

        assert_almost_equal(ic(-10. / 2. * np.log(2.), 10, 2) / 10,
                            ic_sig(2, 10, 2), decimal=14)

        assert_almost_equal(ic_sig(np.log(2.), 10, 2, islog=True),
                            ic_sig(2, 10, 2), decimal=14)

    # examples penalty directly from formula
    n, k = 10, 2
    assert_almost_equal(aic(0, 10, 2), 2 * k, decimal=14)
    # next see Wikipedia
    assert_almost_equal(aicc(0, 10, 2),
                        aic(0, n, k) + 2 * k * (k + 1.) / (n - k - 1.),
                        decimal=14)
    assert_almost_equal(bic(0, 10, 2), np.log(n) * k, decimal=14)
    assert_almost_equal(hqic(0, 10, 2), 2 * np.log(np.log(n)) * k, decimal=14)
def as_string(self, output_format='txt', **fmt_dict):
    """Return string: the formatted row.
    This is the default formatter for rows.
    Override this to get different formatting.
    A row formatter must accept as arguments
    a row (self) and an output format,
    one of ('html', 'txt', 'csv', 'latex').
    """
    fmt = self._get_fmt(output_format, **fmt_dict)

    # get column widths
    try:
        colwidths = self.table.get_colwidths(output_format, **fmt)
    except AttributeError:
        colwidths = fmt.get('colwidths')
    if colwidths is None:
        colwidths = (0,) * len(self)

    colsep = fmt['colsep']
    row_pre = fmt.get('row_pre', '')
    row_post = fmt.get('row_post', '')
    formatted_cells = []
    for cell, width in zip(self, colwidths):
        content = cell.format(width, output_format=output_format, **fmt)
        formatted_cells.append(content)
    formatted_row = row_pre + colsep.join(formatted_cells) + row_post
    formatted_row = self._decorate_below(formatted_row, output_format, **fmt)
    return formatted_row
def pooled_odds_ratio(self, tables):
    """
    Returns the pooled odds ratio for a list of 2x2 tables.

    The pooled odds ratio is the inverse variance weighted average
    of the sample odds ratios of the tables.
    """

    if len(tables) == 0:
        return 1.

    # Get the sampled odds ratios and variances
    log_oddsratio, var = [], []
    for table in tables:
        lor = np.log(table[1, 1]) + np.log(table[0, 0]) -\
              np.log(table[0, 1]) - np.log(table[1, 0])
        log_oddsratio.append(lor)
        var.append((1 / table.astype(np.float64)).sum())

    # Calculate the inverse variance weighted average
    wts = [1 / v for v in var]
    wtsum = sum(wts)
    wts = [w / wtsum for w in wts]
    log_pooled_or = sum([w * e for w, e in zip(wts, log_oddsratio)])

    return np.exp(log_pooled_or)
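# Worked example of the inverse-variance pooling above (hedged: the numbers
# are made up for illustration; the variance of a log odds ratio is the usual
# Woolf sum of reciprocal cell counts, which is what (1 / table).sum()
# computes).
import numpy as np

tables = [np.array([[20., 10.], [15., 30.]]),
          np.array([[12., 18.], [9., 25.]])]
log_or = [np.log(t[1, 1]) + np.log(t[0, 0])
          - np.log(t[0, 1]) - np.log(t[1, 0]) for t in tables]
var = [(1 / t).sum() for t in tables]
wts = np.array([1 / v for v in var])
wts /= wts.sum()                        # normalize the weights
print(np.exp(np.dot(wts, log_or)))      # pooled odds ratio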
def test_mosaic_simple():
    # display a simple plot of 4 categories of data, split in four
    # levels with increasing size for each group
    # creation of the levels
    key_set = (['male', 'female'], ['old', 'adult', 'young'],
               ['worker', 'unemployed'], ['healty', 'ill'])
    # the cartesian product of all the categories is
    # the complete set of categories
    keys = list(product(*key_set))
    data = OrderedDict(zip(keys, range(1, 1 + len(keys))))
    # which colours should I use for the various categories?
    # put it into a dict
    props = {}
    # males and females in blue and red
    props[('male',)] = {'color': 'b'}
    props[('female',)] = {'color': 'r'}
    # all the groups corresponding to ill groups have a different color
    for key in keys:
        if 'ill' in key:
            if 'male' in key:
                props[key] = {'color': 'BlueViolet', 'hatch': '+'}
            else:
                props[key] = {'color': 'Crimson', 'hatch': '+'}
    # mosaic of the data, with given gaps and colors
    mosaic(data, gap=0.05, properties=props, axes_label=False)
    pylab.suptitle('synthetic data, 4 categories (plot 2 of 4)')
    #pylab.show()
    pylab.close('all')
def dataset(self, as_dict=False):
    """
    Returns a Python generator object for iterating over the dataset.

    Parameters
    ----------
    as_dict : bool, optional
        If as_dict is True, yield each row of observations as a dict.
        If False, yields each row of observations as a list.

    Returns
    -------
    Generator object for iterating over the dataset.  Yields each row of
    observations as a list by default.

    Notes
    -----
    If missing_values is True during instantiation of StataReader then
    observations with _StataMissingValue(s) are not filtered and should
    be handled by your application.
    """

    try:
        self._file.seek(self._data_location)
    except Exception:
        pass

    if as_dict:
        vars = lmap(str, self.variables())
        for i in range(len(self)):
            yield dict(zip(vars, self._next()))
    else:
        for i in range(self._header['nobs']):
            yield self._next()
def plot2d(self, ix=0, iy=1, clf=True):
    """
    Generates a 2-dimensional plot of the data set and principal components
    using matplotlib.

    ix specifies which p-dimension to put on the x-axis of the plot
    and iy specifies which to put on the y-axis (0-indexed)
    """
    import matplotlib.pyplot as plt
    x, y = self.N[:, ix], self.N[:, iy]
    if clf:
        plt.clf()
    plt.scatter(x, y)
    vals, evs = self.getEigensystem()
    #evx, evy = evs[:, ix], evs[:, iy]
    xl, xu = plt.xlim()
    yl, yu = plt.ylim()
    dx, dy = (xu - xl), (yu - yl)
    for val, vec, c in zip(vals, evs.T, self._colors):
        plt.arrow(0, 0, val * vec[ix], val * vec[iy],
                  head_width=0.05 * (dx * dy / 4) ** 0.5, fc=c, ec=c)
    #plt.arrow(0, 0, vals[ix] * evs[ix, ix], vals[ix] * evs[iy, ix],
    #          head_width=0.05 * (dx * dy / 4) ** 0.5, fc='g', ec='g')
    #plt.arrow(0, 0, vals[iy] * evs[ix, iy], vals[iy] * evs[iy, iy],
    #          head_width=0.05 * (dx * dy / 4) ** 0.5, fc='r', ec='r')
    if self.names is not None:
        plt.xlabel('$' + self.names[ix] + '/\\sigma$')
        plt.ylabel('$' + self.names[iy] + '/\\sigma$')
def evaluate(self, xeval, order=None):
    xeval = self._transform(xeval)
    if order is None:
        order = len(self.polys)
    res = sum(c * p(xeval)
              for c, p in list(zip(self.coeffs, self.polys))[:order])
    res = self._correction(res)
    return res
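# Standalone sketch of the truncated-series evaluation done by `evaluate`
# (hedged: scipy's `legendre` stands in for `self.polys`, and the class's
# transform/correction steps are omitted).
import numpy as np
from scipy.special import legendre

polys = [legendre(i) for i in range(4)]     # callable orthogonal polynomials
coeffs = [1.0, 0.5, 0.25, 0.125]
xeval = np.linspace(-1, 1, 5)
order = 3                                   # evaluate only the first 3 terms
res = sum(c * p(xeval) for c, p in list(zip(coeffs, polys))[:order])
print(res)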
def setup_class(cls):
    data = sm.datasets.macrodata.load_pandas()
    cls.macro_df = data.data[['year', 'quarter', 'realgdp', 'cpi']]
    cls.random_data = np.random.randn(100)
    index = [str(int(yr)) + '-Q' + str(int(qu))
             for yr, qu in zip(cls.macro_df.year, cls.macro_df.quarter)]
    cls.macro_df.index = index
    cls.series = cls.macro_df.cpi
def test_freq_to_period():
    from pandas.tseries.frequencies import to_offset
    freqs = ["A", "AS-MAR", "Q", "QS", "QS-APR", "W", "W-MON", "B", "D", "H"]
    expected = [1, 1, 4, 4, 4, 52, 52, 5, 7, 24]
    for i, j in zip(freqs, expected):
        assert_equal(tools.freq_to_period(i), j)
        assert_equal(tools.freq_to_period(to_offset(i)), j)
def test_recursive_split():
    keys = list(product('mf'))
    data = OrderedDict(zip(keys, [1] * len(keys)))
    res = _hierarchical_split(data, gap=0)
    assert_(list(iterkeys(res)) == keys)
    res[('m',)] = (0.0, 0.0, 0.5, 1.0)
    res[('f',)] = (0.5, 0.0, 0.5, 1.0)
    keys = list(product('mf', 'yao'))
    data = OrderedDict(zip(keys, [1] * len(keys)))
    res = _hierarchical_split(data, gap=0)
    assert_(list(iterkeys(res)) == keys)
    res[('m', 'y')] = (0.0, 0.0, 0.5, 1 / 3)
    res[('m', 'a')] = (0.0, 1 / 3, 0.5, 1 / 3)
    res[('m', 'o')] = (0.0, 2 / 3, 0.5, 1 / 3)
    res[('f', 'y')] = (0.5, 0.0, 0.5, 1 / 3)
    res[('f', 'a')] = (0.5, 1 / 3, 0.5, 1 / 3)
    res[('f', 'o')] = (0.5, 2 / 3, 0.5, 1 / 3)
def contrast_labels(contrasts, names, reverse=False):
    if reverse:
        sl = slice(None, None, -1)
    else:
        sl = slice(None)
    # materialize the zip so it can be sliced (zip objects are not
    # subscriptable on Python 3)
    labels = [''.join(['%s%s' % (signstr(c, noplus=True), v)
                       for c, v in list(zip(row, names))[sl] if c != 0])
              for row in contrasts]
    return labels
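# Tiny demo of the label construction above (hedged: `signstr_demo` is a
# hypothetical stand-in for `signstr`, assumed to render the coefficient's
# sign as '-' or '').
def signstr_demo(c):
    return '-' if c < 0 else ''

row, names = [1, -1, 0], ['a', 'b', 'c']
label = ''.join('%s%s' % (signstr_demo(c), v)
                for c, v in zip(row, names) if c != 0)
print(label)  # 'a-b'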
def variables(self):
    """
    Returns a list of the dataset's StataVariables objects.
    """
    return lmap(_StataVariable,
                zip(lrange(self._header['nvar']),
                    self._header['typlist'], self._header['varlist'],
                    self._header['srtlist'], self._header['fmtlist'],
                    self._header['lbllist'], self._header['vlblist']))
def _categories_level(keys):
    """use the Ordered dict to implement a simple ordered set
    return each level of each category
    [[key_1_level_1, key_2_level_1], [key_1_level_2, key_2_level_2]]
    """
    res = []
    for i in zip(*keys):
        tuplefied = _tuplify(i)
        res.append(list(OrderedDict([(j, None) for j in tuplefied])))
    return res
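# The ordered-set trick above in isolation: an OrderedDict keyed by the items
# keeps first-seen order and drops duplicates, so transposing the keys with
# zip(*keys) yields the ordered levels of each category.
from collections import OrderedDict

keys = [('m', 'o'), ('m', 'y'), ('f', 'o'), ('f', 'y')]
levels = [list(OrderedDict((j, None) for j in level)) for level in zip(*keys)]
print(levels)  # [['m', 'f'], ['o', 'y']]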
def setUpClass(cls):
    data = sm.datasets.macrodata.load()
    cls.macro_data = data.data[["year", "quarter", "realgdp", "cpi"]]
    cls.random_data = np.random.randn(100)
    year = cls.macro_data["year"]
    quarter = cls.macro_data["quarter"]
    cls.macro_df = pd.DataFrame.from_records(cls.macro_data)
    index = [str(int(yr)) + "-Q" + str(int(qu))
             for yr, qu in zip(cls.macro_df.year, cls.macro_df.quarter)]
    cls.macro_df.index = index
    cls.series = cls.macro_df.cpi
def setUpClass(cls):
    data = sm.datasets.macrodata.load()
    cls.macro_df = pd.DataFrame.from_records(data.data)
    cls.macro_df = cls.macro_df[['year', 'quarter', 'realgdp', 'cpi']]
    cls.macro_data = cls.macro_df.to_records(index=False)
    cls.random_data = np.random.randn(100)
    index = [str(int(yr)) + '-Q' + str(int(qu))
             for yr, qu in zip(cls.macro_df.year, cls.macro_df.quarter)]
    cls.macro_df.index = index
    cls.series = cls.macro_df.cpi
def setUpClass(cls):
    data = sm.datasets.macrodata.load()
    cls.macro_df = pd.DataFrame.from_records(data.data)
    cls.macro_df = cls.macro_df[['year', 'quarter', 'realgdp', 'cpi']]
    cls.macro_data = cls.macro_df.to_records(index=False)
    cls.random_data = np.random.randn(100)
    year = cls.macro_data['year']
    quarter = cls.macro_data['quarter']
    index = [str(int(yr)) + '-Q' + str(int(qu))
             for yr, qu in zip(cls.macro_df.year, cls.macro_df.quarter)]
    cls.macro_df.index = index
    cls.series = cls.macro_df.cpi
def extend_right(self, table):
    """Return None.
    Extend each row of `self` with corresponding row of `table`.
    Does **not** import formatting from ``table``.
    This generally makes sense only if the two tables have
    the same number of rows, but that is not enforced.

    :note: To append a table below instead, just use `extend`,
        which is the ordinary list method.  This generally makes sense
        only if the two tables have the same number of columns,
        but that is not enforced.
    """
    for row1, row2 in zip(self, table):
        row1.extend(row2)
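# Sketch of the row-wise extension performed by extend_right, with plain
# lists standing in for SimpleTable rows: each row of the left table grows
# by the matching row of the right one.
left = [['a', 1], ['b', 2]]
right = [['x'], ['y']]
for row1, row2 in zip(left, right):
    row1.extend(row2)
print(left)  # [['a', 1, 'x'], ['b', 2, 'y']]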
def smooth(self, xs, ys, x):
    """Returns the kernel smoothing estimate for point x based on x-values
    xs and y-values ys.
    Not expected to be called by the user.
    """
    xs, ys = self.in_domain(xs, ys, x)

    if len(xs) > 0:
        w = np.sum(self((xs - x) / self.h))
        #TODO: change the below to broadcasting when shape is sorted
        v = np.sum([yy * self((xx - x) / self.h) for xx, yy in zip(xs, ys)])
        return v / w
    else:
        return np.nan
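# Self-contained sketch of the Nadaraya-Watson estimate that `smooth`
# computes (hedged: a Gaussian kernel and fixed bandwidth replace the
# instance's kernel, bandwidth and domain handling).
import numpy as np

def nw_smooth(xs, ys, x, h=0.1):
    w = np.exp(-0.5 * ((xs - x) / h) ** 2)  # Gaussian kernel weights
    return np.sum(w * ys) / np.sum(w)

xs = np.linspace(0, 1, 20)
ys = np.sin(2 * np.pi * xs)
print(nw_smooth(xs, ys, 0.5))  # close to sin(pi) = 0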
def smoothvar(self, xs, ys, x):
    """Returns the kernel smoothing estimate of the variance at point x.
    """
    xs, ys = self.in_domain(xs, ys, x)

    if len(xs) > 0:
        fittedvals = np.array([self.smooth(xs, ys, xx) for xx in xs])
        sqresid = square(subtract(ys, fittedvals))
        w = np.sum(self((xs - x) / self.h))
        v = np.sum([rr * self((xx - x) / self.h)
                    for xx, rr in zip(xs, sqresid)])
        return v / w
    else:
        return np.nan
def test_mosaic_very_complex():
    # make a scattermatrix of mosaic plots to show the correlations between
    # each pair of variables in a dataset. Could be easily converted into a
    # new function that does this automatically based on the type of data
    key_name = ['gender', 'age', 'health', 'work']
    key_base = (['male', 'female'], ['old', 'young'],
                ['healty', 'ill'], ['work', 'unemployed'])
    keys = list(product(*key_base))
    data = OrderedDict(zip(keys, range(1, 1 + len(keys))))
    props = {}
    props[('male', 'old')] = {'color': 'r'}
    props[('female',)] = {'color': 'pink'}
    L = len(key_base)
    fig, axes = pylab.subplots(L, L)
    for i in range(L):
        for j in range(L):
            m = set(range(L)).difference(set((i, j)))
            if i == j:
                axes[i, i].text(0.5, 0.5, key_name[i],
                                ha='center', va='center')
                axes[i, i].set_xticks([])
                axes[i, i].set_xticklabels([])
                axes[i, i].set_yticks([])
                axes[i, i].set_yticklabels([])
            else:
                ji = max(i, j)
                ij = min(i, j)
                temp_data = OrderedDict(
                    [((k[ij], k[ji]) + tuple(k[r] for r in m), v)
                     for k, v in iteritems(data)])

                keys = list(iterkeys(temp_data))
                for k in keys:
                    value = _reduce_dict(temp_data, k[:2])
                    temp_data[k[:2]] = value
                    del temp_data[k]
                mosaic(temp_data, ax=axes[i, j], axes_label=False,
                       properties=props, gap=0.05, horizontal=i > j)
    pylab.suptitle('old males should look bright red, (plot 4 of 4)')
    #pylab.show()
    pylab.close('all')
def test_axes_labeling(close_figures):
    from numpy.random import rand
    key_set = (['male', 'female'], ['old', 'adult', 'young'],
               ['worker', 'unemployed'], ['yes', 'no'])
    # the cartesian product of all the categories is
    # the complete set of categories
    keys = list(product(*key_set))
    data = OrderedDict(zip(keys, rand(len(keys))))
    lab = lambda k: ''.join(s[0] for s in k)
    fig, (ax1, ax2) = plt.subplots(1, 2, figsize=(16, 8))
    mosaic(data, ax=ax1, labelizer=lab, horizontal=True, label_rotation=45)
    mosaic(data, ax=ax2, labelizer=lab, horizontal=False,
           label_rotation=[0, 45, 90, 0])
    #fig.tight_layout()
    fig.suptitle("correct alignment of the axes labels")
def summary(self):
    '''returns text summarizing the results

    uses the default pvalue correction of the instance stored in
    ``self.multitest_method``
    '''
    import statsmodels.stats.multitest as smt
    maxlevel = max((len(ss) for ss in self.all_pairs_names))

    text = ('Corrected p-values using %s p-value correction\n\n'
            % smt.multitest_methods_names[self.multitest_method])
    text += 'Pairs' + (' ' * (maxlevel - 5 + 1)) + 'p-values\n'
    text += '\n'.join(('%s %6.4g' % (pairs, pv) for (pairs, pv)
                       in zip(self.all_pairs_names, self.pval_corrected())))
    return text
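# Hedged sketch of the correction this summary reports: multipletests is the
# statsmodels helper behind such corrections; the pairs and p-values here are
# invented for illustration.
import statsmodels.stats.multitest as smt

pvals = [0.01, 0.04, 0.20]
reject, pvals_corrected, _, _ = smt.multipletests(pvals, method='hs')
for pair, pv in zip(['a-b', 'a-c', 'b-c'], pvals_corrected):
    print('%s %6.4g' % (pair, pv))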
def setup_class(cls):
    data = sm.datasets.macrodata.load_pandas()
    cls.macro_df = data.data[['year', 'quarter', 'realgdp', 'cpi']]
    cols = list(cls.macro_df.columns)
    cls.realgdp_loc = cols.index('realgdp')
    cls.cpi_loc = cols.index('cpi')
    cls.random_data = np.random.randn(100)
    year = cls.macro_df['year'].values
    quarter = cls.macro_df['quarter'].values
    index = [str(int(yr)) + '-Q' + str(int(qu))
             for yr, qu in zip(cls.macro_df.year, cls.macro_df.quarter)]
    cls.macro_df.index = index
    cls.series = cls.macro_df.cpi
def ftest_summary(self):
    '''run all ftests on the joint model

    Returns
    -------
    fres : str
        a string that lists the results of all individual f-tests
    summarytable : list of tuples
        contains (pair, (fvalue, pvalue, df_denom, df_num)) for each f-test

    Note
    ----
    These are the raw results and not formatted for nice printing.
    '''
    if not hasattr(self, 'lsjoint'):
        self.fitjoint()
    txt = []
    summarytable = []

    txt.append('F-test for equality of coefficients across groups')
    fres = self.lsjoint.f_test(self.contrasts['all'])
    txt.append(fres.__str__())
    summarytable.append(('all', (fres.fvalue, fres.pvalue,
                                 fres.df_denom, fres.df_num)))

    #for group in self.unique[1:]:  #replace with group1, group2 in sorted(keys)
    #    txt.append('F-test for equality of coefficients between group'
    #               ' %s and group %s' % (group, '0'))
    #    fres = self.lsjoint.f_test(self.contrasts[group])
    #    txt.append(fres.__str__())
    #    summarytable.append((group, (fres.fvalue, fres.pvalue,
    #                                 fres.df_denom, fres.df_num)))
    pairs = np.triu_indices(len(self.unique), 1)
    for ind1, ind2 in zip(*pairs):  #replace with group1, group2 in sorted(keys)
        g1 = self.unique[ind1]
        g2 = self.unique[ind2]
        txt.append('F-test for equality of coefficients between group'
                   ' %s and group %s' % (g1, g2))
        group = (g1, g2)
        fres = self.lsjoint.f_test(self.contrasts[group])
        txt.append(fres.__str__())
        summarytable.append((group, (fres.fvalue, fres.pvalue,
                                     fres.df_denom, fres.df_num)))

    self.summarytable = summarytable
    return '\n'.join(txt), summarytable
def _key_splitting(rect_dict, keys, values, key_subset, horizontal, gap):
    """
    Given a dictionary where each entry is a rectangle, together with a list
    of keys and values (the count of elements in each category), split each
    rectangle accordingly, as long as its key starts with the tuple
    key_subset.  The other keys are returned without modification.
    """
    result = OrderedDict()
    L = len(key_subset)
    for name, (x, y, w, h) in iteritems(rect_dict):
        if key_subset == name[:L]:
            # split base on the values given
            divisions = _split_rect(x, y, w, h, values, horizontal, gap)
            for key, rect in zip(keys, divisions):
                result[name + (key,)] = rect
        else:
            result[name] = (x, y, w, h)
    return result
def density_orthopoly(x, polybase, order=5, xeval=None):
    #polybase = legendre  #chebyt  #hermitenorm
    #polybase = chebyt
    #polybase = FPoly
    #polybase = ChtPoly
    #polybase = hermite
    #polybase = HPoly

    if xeval is None:
        xeval = np.linspace(x.min(), x.max(), 50)

    #polys = [legendre(i) for i in range(order)]
    polys = [polybase(i) for i in range(order)]
    #coeffs = [(p(x)*(1-x**2)**(-1/2.)).mean() for p in polys]
    #coeffs = [(p(x)*np.exp(-x*x)).mean() for p in polys]
    coeffs = [(p(x)).mean() for p in polys]
    res = sum(c * p(xeval) for c, p in zip(coeffs, polys))
    #res *= (1-xeval**2)**(-1/2.)
    #res *= np.exp(-xeval**2./2)
    return res, xeval, coeffs, polys
def _get_colwidths(self, output_format, **fmt_dict):
    """Return list, the calculated widths of each column."""
    output_format = get_output_format(output_format)
    fmt = self.output_formats[output_format].copy()
    fmt.update(fmt_dict)
    ncols = max(len(row) for row in self)
    request = fmt.get('colwidths')
    if request == 0:  # assume no extra space desired (e.g., CSV)
        return [0] * ncols
    elif request is None:  # assume no extra space desired (e.g., CSV)
        request = [0] * ncols
    elif isinstance(request, (int, long)):
        request = [request] * ncols
    elif len(request) < ncols:
        request = [request[i % len(request)] for i in range(ncols)]
    min_widths = []
    for col in zip(*self):
        maxwidth = max(len(c.format(0, output_format, **fmt)) for c in col)
        min_widths.append(maxwidth)
    result = lmap(max, min_widths, request)
    return result
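# The width rule above in miniature: take the per-column maximum cell width,
# then an elementwise max against the requested widths, which is exactly what
# lmap(max, min_widths, request) does.
rows = [['alpha', '1'], ['b', '22']]
request = [4, 4]
min_widths = [max(len(c) for c in col) for col in zip(*rows)]
print(list(map(max, min_widths, request)))  # [5, 4]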
def _split_rect(x, y, width, height, proportion, horizontal=True, gap=0.05):
    """
    Split the given rectangle in n segments whose proportion is specified
    along the given axis.  If a gap is inserted, they will be separated by a
    certain amount of space, retaining the relative proportion between them:
    a gap of 1 corresponds to a plot that is half void and the remaining half
    space is proportionally divided among the pieces.
    """
    x, y, w, h = float(x), float(y), float(width), float(height)
    if (w < 0) or (h < 0):
        raise ValueError("dimension of the square less than "
                         "zero w={} h={}".format(w, h))
    proportions = _normalize_split(proportion)

    # extract the starting point and the dimension of each subdivision
    # in respect to the unit square
    starting = proportions[:-1]
    amplitude = proportions[1:] - starting

    # how much each extrema is going to be displaced due to gaps
    starting += gap * np.arange(len(proportions) - 1)
    # how much the squares plus the gaps are extended
    extension = starting[-1] + amplitude[-1] - starting[0]
    # normalize everything for fit again in the original dimension
    starting /= extension
    amplitude /= extension
    # bring everything to the original square
    starting = (x if horizontal else y) + starting * (w if horizontal else h)
    amplitude = amplitude * (w if horizontal else h)

    # create each 4-tuple for each new block
    results = [(s, y, a, h) if horizontal else (x, s, w, a)
               for s, a in zip(starting, amplitude)]
    return results
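# Quick sketch of the proportional split above with a hand-rolled cumulative
# proportion standing in for _normalize_split: three blocks in ratio 1:2:1
# across a unit square, horizontal, no gap.
import numpy as np

counts = np.array([1.0, 2.0, 1.0])
proportions = np.r_[0.0, np.cumsum(counts / counts.sum())]
starting, amplitude = proportions[:-1], np.diff(proportions)
print([(s, 0.0, a, 1.0) for s, a in zip(starting, amplitude)])
# [(0.0, 0.0, 0.25, 1.0), (0.25, 0.0, 0.5, 1.0), (0.75, 0.0, 0.25, 1.0)]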
def smoothconf(self, xs, ys, x, alpha=0.05):
    """Returns the kernel smoothing estimate with confidence 1sigma bounds
    """
    xs, ys = self.in_domain(xs, ys, x)

    if len(xs) > 0:
        fittedvals = np.array([self.smooth(xs, ys, xx) for xx in xs])
        #fittedvals = self.smooth(xs, ys, x)  # x or xs in Haerdle
        sqresid = square(subtract(ys, fittedvals))
        w = np.sum(self((xs - x) / self.h))
        #var = sqresid.sum() / (len(sqresid) - 0)  # nonlocal var ? JP just trying
        v = np.sum([rr * self((xx - x) / self.h)
                    for xx, rr in zip(xs, sqresid)])
        var = v / w
        sd = np.sqrt(var)
        K = self.L2Norm
        yhat = self.smooth(xs, ys, x)
        from scipy import stats
        crit = stats.norm.isf(alpha / 2)
        err = crit * sd * np.sqrt(K) / np.sqrt(w * self.h * self.norm_const)
        return (yhat - err, yhat, yhat + err)
    else:
        return (np.nan, np.nan, np.nan)
def arma_order_select_ic(y, max_ar=4, max_ma=2, ic='bic', trend='c',
                         model_kw={}, fit_kw={}):
    """
    Returns information criteria for many ARMA models

    Parameters
    ----------
    y : array-like
        Time-series data
    max_ar : int
        Maximum number of AR lags to use. Default 4.
    max_ma : int
        Maximum number of MA lags to use. Default 2.
    ic : str, list
        Information criteria to report. Either a single string or a list
        of different criteria is possible.
    trend : str
        The trend to use when fitting the ARMA models.
    model_kw : dict
        Keyword arguments to be passed to the ``ARMA`` model
    fit_kw : dict
        Keyword arguments to be passed to ``ARMA.fit``.

    Returns
    -------
    obj : Results object
        Each ic is an attribute with a DataFrame for the results. The AR
        order used is the row index. The ma order used is the column index.
        The minimum orders are available as ``ic_min_order``.

    Examples
    --------

    >>> from statsmodels.tsa.arima_process import arma_generate_sample
    >>> import statsmodels.api as sm
    >>> import numpy as np

    >>> arparams = np.array([.75, -.25])
    >>> maparams = np.array([.65, .35])
    >>> arparams = np.r_[1, -arparams]
    >>> maparams = np.r_[1, maparams]
    >>> nobs = 250
    >>> np.random.seed(2014)
    >>> y = arma_generate_sample(arparams, maparams, nobs)
    >>> res = sm.tsa.arma_order_select_ic(y, ic=['aic', 'bic'], trend='nc')
    >>> res.aic_min_order
    >>> res.bic_min_order

    Notes
    -----
    This method can be used to tentatively identify the order of an ARMA
    process, provided that the time series is stationary and invertible.
    This function computes the full exact MLE estimate of each model and can
    therefore be a little slow. An implementation using approximate
    estimates will be provided in the future. In the meantime, consider
    passing {method : 'css'} to fit_kw.
    """
    from pandas import DataFrame

    ar_range = lrange(0, max_ar + 1)
    ma_range = lrange(0, max_ma + 1)
    if isinstance(ic, string_types):
        ic = [ic]
    elif not isinstance(ic, (list, tuple)):
        raise ValueError("Need a list or a tuple for ic if not a string.")

    results = np.zeros((len(ic), max_ar + 1, max_ma + 1))

    for ar in ar_range:
        for ma in ma_range:
            if ar == 0 and ma == 0 and trend == 'nc':
                results[:, ar, ma] = np.nan
                continue

            mod = _safe_arma_fit(y, (ar, ma), model_kw, trend, fit_kw)
            if mod is None:
                results[:, ar, ma] = np.nan
                continue

            for i, criteria in enumerate(ic):
                results[i, ar, ma] = getattr(mod, criteria)

    dfs = [DataFrame(res, columns=ma_range, index=ar_range)
           for res in results]

    res = dict(zip(ic, dfs))

    # add the minimums to the results dict
    min_res = {}
    for i, result in iteritems(res):
        mins = np.where(result.min().min() == result)
        min_res.update({i + '_min_order': (mins[0][0], mins[1][0])})
    res.update(min_res)

    return Bunch(**res)
import numpy as np

_quarter_to_day = {
    "1": (3, 31),
    "2": (6, 30),
    "3": (9, 30),
    "4": (12, 31),
    "I": (3, 31),
    "II": (6, 30),
    "III": (9, 30),
    "IV": (12, 31)
}

_mdays = [31, 28, 31, 30, 31, 30, 31, 31, 30, 31, 30, 31]
_months_with_days = lzip(lrange(1, 13), _mdays)
_month_to_day = dict(zip(map(str, lrange(1, 13)), _months_with_days))
_month_to_day.update(
    dict(zip(["I", "II", "III", "IV", "V", "VI", "VII", "VIII", "IX", "X",
              "XI", "XII"], _months_with_days)))

# regex patterns
_y_pattern = r'^\d?\d?\d?\d$'

_q_pattern = r'''
^               # beginning of string
\d?\d?\d?\d     # match any number 1-9999, includes leading zeros
(:?q)           # use q or a : as a separator
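# How the lookup tables above are intended to be used (a hedged sketch with a
# local copy of the quarter map): a quarter token maps to the (month, day) on
# which the period ends, from which a date can be built.
import datetime

quarter_to_day = {"1": (3, 31), "2": (6, 30), "3": (9, 30), "4": (12, 31)}
year, quarter = 1999, "3"
month, day = quarter_to_day[quarter]
print(datetime.date(year, month, day))  # 1999-09-30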
endog = 5  # dummy place holder

##############  Example similar to Greene

#get pickled data
#endog3, xifloat3 = cPickle.load(open('xifloat2.pickle','rb'))

tree0 = ('top',
         [('Fly', ['Air']),
          ('Ground', ['Train', 'Car', 'Bus'])])

''' this is with real data from Greene's clogit example
datadict = dict(zip(['Air', 'Train', 'Bus', 'Car'],
                    [xifloat[i] for i in range(4)]))
'''

#for testing only (mock that returns its own name)
datadict = dict(zip(['Air', 'Train', 'Bus', 'Car'],
                    ['Airdata', 'Traindata', 'Busdata', 'Cardata']))

if testxb:
    datadict = dict(zip(['Air', 'Train', 'Bus', 'Car'], np.arange(4)))

datadict.update({'top': [], 'Fly': [], 'Ground': []})

paramsind = {
    'top': [],
    'Fly': [],
    'Ground': [],
    'Air': ['GC', 'Ttme', 'ConstA', 'Hinc'],
    'Train': ['GC', 'Ttme', 'ConstT'],
    'Bus': ['GC', 'Ttme', 'ConstB'],
    'Car': ['GC', 'Ttme']
}
plt.figure()
plt.plot(exog)
#plt.plot(p, '.', lw=2)
plt.plot(y_true, lw=2)

y_pred = m.results.mu  # + m.results.alpha  #m.results.predict(d)
plt.figure()
plt.subplot(2, 2, 1)
plt.plot(p, '.')
plt.plot(yp, 'b-', label='true')
plt.plot(y_pred, 'r-', label='GAM')
plt.legend(loc='upper left')
plt.title('gam.GAM Poisson')

counter = 2
for ii, xx in zip(['z', 'x1', 'x2'], [z, x[:, 0], x[:, 1]]):
    sortidx = np.argsort(xx)
    #plt.figure()
    plt.subplot(2, 2, counter)
    plt.plot(xx[sortidx], p[sortidx], 'k.', alpha=0.5)
    plt.plot(xx[sortidx], yp[sortidx], 'b.', label='true')
    plt.plot(xx[sortidx], y_pred[sortidx], 'r.', label='GAM')
    plt.legend(loc='upper left')
    plt.title('gam.GAM Poisson ' + ii)
    counter += 1

res = GLM(p, exog_reduced, family=f).fit()

#plot component, compared to true component
x1 = x[:, 0]
x2 = x[:, 1]
def summary_col(results, float_format='%.4f', model_names=[], stars=False,
                info_dict=None, regressor_order=[]):
    """
    Summarize multiple results instances side-by-side (coefs and SEs)

    Parameters
    ----------
    results : statsmodels results instance or list of result instances
    float_format : string
        float format for coefficients and standard errors
        Default : '%.4f'
    model_names : list of strings of length len(results)
        if the names are not unique, a roman number will be appended to all
        model names
    stars : bool
        print significance stars
    info_dict : dict
        dict of lambda functions to be applied to results instances to
        retrieve model info. To use specific information for different
        models, add a (nested) info_dict with model name as the key.
        Example: `info_dict = {"N":..., "R2": ..., "OLS":{"R2":...}}` would
        only show `R2` for OLS regression models, but additionally `N` for
        all other results.
        Default : None (use the info_dict specified in
        result.default_model_infos, if this property exists)
    regressor_order : list of strings
        list of names of the regressors in the desired order. All regressors
        not specified will be appended to the end of the list.
    """

    if not isinstance(results, list):
        results = [results]

    cols = [_col_params(x, stars=stars, float_format=float_format)
            for x in results]

    # Unique column names (pandas has problems merging otherwise)
    if model_names:
        colnames = _make_unique(model_names)
    else:
        colnames = _make_unique([x.columns[0] for x in cols])
    for i in range(len(cols)):
        cols[i].columns = [colnames[i]]

    merg = lambda x, y: x.merge(y, how='outer', right_index=True,
                                left_index=True)
    summ = reduce(merg, cols)

    if regressor_order:
        varnames = summ.index.get_level_values(0).tolist()
        ordered = [x for x in regressor_order if x in varnames]
        unordered = [x for x in varnames if x not in regressor_order + ['']]
        order = ordered + list(np.unique(unordered))

        f = lambda idx: sum([[x + 'coef', x + 'stde'] for x in idx], [])
        summ.index = f(np.unique(varnames))
        summ = summ.reindex(f(order))
        summ.index = [x[:-4] for x in summ.index]

    idx = pd.Series(lrange(summ.shape[0])) % 2 == 1
    summ.index = np.where(idx, '', summ.index.get_level_values(0))

    # add infos about the models.
    if info_dict:
        cols = [_col_info(x, info_dict.get(x.model.__class__.__name__,
                                           info_dict)) for x in results]
    else:
        cols = [_col_info(x, getattr(x, "default_model_infos", None))
                for x in results]
    # use unique column names, otherwise the merge will not succeed
    for df, name in zip(cols, _make_unique([df.columns[0] for df in cols])):
        df.columns = [name]
    merg = lambda x, y: x.merge(y, how='outer', right_index=True,
                                left_index=True)
    info = reduce(merg, cols)
    dat = pd.DataFrame(np.vstack([summ, info]))  # pd.concat better, but error
    dat.columns = summ.columns
    dat.index = pd.Index(summ.index.tolist() + info.index.tolist())
    summ = dat

    summ = summ.fillna('')

    smry = Summary()
    smry.add_df(summ, header=True, align='l')
    smry.add_text('Standard errors in parentheses.')
    if stars:
        smry.add_text('* p<.1, ** p<.05, ***p<.01')

    return smry
def handle_missing(cls, endog, exog, missing, **kwargs):
    """
    This returns a dictionary with keys endog, exog and the keys of
    kwargs. It preserves Nones.
    """
    none_array_names = []

    # patsy's already dropped NaNs in y/X
    missing_idx = kwargs.pop('missing_idx', None)

    if missing_idx is not None:
        # y, X already handled by patsy. add back in later.
        combined = ()
        combined_names = []
        if exog is None:
            none_array_names += ['exog']
    elif exog is not None:
        combined = (endog, exog)
        combined_names = ['endog', 'exog']
    else:
        combined = (endog,)
        combined_names = ['endog']
        none_array_names += ['exog']

    # deal with other arrays
    combined_2d = ()
    combined_2d_names = []
    if len(kwargs):
        for key, value_array in iteritems(kwargs):
            if value_array is None or value_array.ndim == 0:
                none_array_names += [key]
                continue
            # grab 1d arrays
            if value_array.ndim == 1:
                combined += (np.asarray(value_array),)
                combined_names += [key]
            elif value_array.squeeze().ndim == 1:
                combined += (np.asarray(value_array),)
                combined_names += [key]
            # grab 2d arrays that are _assumed_ to be symmetric
            elif value_array.ndim == 2:
                combined_2d += (np.asarray(value_array),)
                combined_2d_names += [key]
            else:
                raise ValueError("Arrays with more than 2 dimensions "
                                 "aren't yet handled")

    if missing_idx is not None:
        nan_mask = missing_idx
        updated_row_mask = None
        if combined:  # there were extra arrays not handled by patsy
            combined_nans = _nan_rows(*combined)
            if combined_nans.shape[0] != nan_mask.shape[0]:
                raise ValueError("Shape mismatch between endog/exog "
                                 "and extra arrays given to model.")
            # for going back and updated endog/exog
            updated_row_mask = combined_nans[~nan_mask]
            nan_mask |= combined_nans

        # for updating extra arrays only
        if combined_2d:
            # unpack the extra 2d arrays for the row-wise NaN check
            combined_2d_nans = _nan_rows(*combined_2d)
            if combined_2d_nans.shape[0] != nan_mask.shape[0]:
                raise ValueError("Shape mismatch between endog/exog "
                                 "and extra 2d arrays given to model.")
            if updated_row_mask is not None:
                updated_row_mask |= combined_2d_nans[~nan_mask]
            else:
                updated_row_mask = combined_2d_nans[~nan_mask]
            nan_mask |= combined_2d_nans
    else:
        nan_mask = _nan_rows(*combined)
        if combined_2d:
            nan_mask = _nan_rows(*(nan_mask[:, None],) + combined_2d)

    if not np.any(nan_mask):  # no missing don't do anything
        combined = dict(zip(combined_names, combined))
        if combined_2d:
            combined.update(dict(zip(combined_2d_names, combined_2d)))
        if none_array_names:
            combined.update(dict(zip(none_array_names,
                                     [None] * len(none_array_names))))
        if missing_idx is not None:
            combined.update({'endog': endog})
            if exog is not None:
                combined.update({'exog': exog})
        return combined, []

    elif missing == 'raise':
        raise MissingDataError("NaNs were encountered in the data")

    elif missing == 'drop':
        nan_mask = ~nan_mask
        drop_nans = lambda x: cls._drop_nans(x, nan_mask)
        drop_nans_2d = lambda x: cls._drop_nans_2d(x, nan_mask)
        combined = dict(zip(combined_names, lmap(drop_nans, combined)))

        if missing_idx is not None:
            if updated_row_mask is not None:
                updated_row_mask = ~updated_row_mask
                # update endog/exog with this new information
                endog = cls._drop_nans(endog, updated_row_mask)
                if exog is not None:
                    exog = cls._drop_nans(exog, updated_row_mask)

            combined.update({'endog': endog})
            if exog is not None:
                combined.update({'exog': exog})

        if combined_2d:
            combined.update(dict(zip(combined_2d_names,
                                     lmap(drop_nans_2d, combined_2d))))
        if none_array_names:
            combined.update(dict(zip(none_array_names,
                                     [None] * len(none_array_names))))

        return combined, np.where(~nan_mask)[0].tolist()
    else:
        raise ValueError("missing option %s not understood" % missing)
>>> contrast.Contrast(formula.Term('(ff==a)'), fac).matrix
Traceback (most recent call last):
  File "c:\...\scikits\statsmodels\sandbox\contrast_old.py", line 112, in _get_matrix
    self.compute_matrix()
  File "c:\...\scikits\statsmodels\sandbox\contrast_old.py", line 91, in compute_matrix
    T = np.transpose(np.array(t(*args, **kw)))
  File "c:\...\scikits\statsmodels\sandbox\formula.py", line 150, in __call__
    If the term has no 'func' attribute, it returns
KeyError: '(ff==a)'
'''

#convert factor to formula
f7 = formula.Formula(fac)
# explicit updating of namespace with
f7.namespace.update(dict(zip(fac.names(), fac())))

# contrast matrix with 2 of 3 terms
contrast.Contrast(formula.Term('(ff==b)') + formula.Term('(ff==a)'), f7).matrix
#array([[ 1.,  0.,  0.],
#       [ 0.,  1.,  0.]])

# contrast matrix for all terms
contrast.Contrast(f7, f7).matrix
#array([[ 1.,  0.,  0.],
#       [ 0.,  1.,  0.],
#       [ 0.,  0.,  1.]])

# contrast matrix for difference groups 1,2 versus group 0
contrast.Contrast(formula.Term('(ff==b)') + formula.Term('(ff==c)'),
                  f7).matrix - contrast.Contrast(formula.Term('(ff==a)'),
sige = 0.1
nobs, k_vars = 500, 3
x = np.random.uniform(-1, 1, size=nobs)
x.sort()
exog = np.vander(x, k_vars + 1)[:, ::-1]
mix = 0.1 * stats.norm.pdf(x[:, None], loc=np.linspace(-0.5, 0.75, 4),
                           scale=0.01).sum(1)
y = exog.sum(1) + mix + sige * (np.random.randn(nobs) / 2 + 1) ** 3

p = 0.5
res_qr = QuantReg(y, exog).fit(p)
res_qr2 = QuantReg(y, exog).fit(0.1)
res_qr3 = QuantReg(y, exog).fit(0.75)
res_ols = sm.OLS(y, exog).fit()

params = [res_ols.params, res_qr2.params, res_qr.params, res_qr3.params]
labels = ['ols', 'qr 0.1', 'qr 0.5', 'qr 0.75']

plt.figure()
plt.plot(x, y, '.', alpha=0.5)
for lab, beta in zip(labels, params):
    print('%-8s' % lab, np.round(beta, 4))
    fitted = np.dot(exog, beta)
    lw = 2
    plt.plot(x, fitted, lw=lw, label=lab)
plt.legend()
plt.title('Quantile Regression')
plt.show()
def hdrboxplot(data, ncomp=2, alpha=None, threshold=0.95, bw=None,
               xdata=None, labels=None, ax=None, use_brute=False, seed=None):
    """
    High Density Region boxplot

    Parameters
    ----------
    data : sequence of ndarrays or 2-D ndarray
        The vectors of functions to create a functional boxplot from.  If a
        sequence of 1-D arrays, these should all be the same size.
        The first axis is the function index, the second axis the one along
        which the function is defined.  So ``data[0, :]`` is the first
        functional curve.
    ncomp : int, optional
        Number of components to use.  If None, returns the as many as the
        smaller of the number of rows or columns in data.
    alpha : list of floats between 0 and 1, optional
        Extra quantile values to compute. Default is None
    threshold : float between 0 and 1, optional
        Percentile threshold value for outliers detection. High value means
        a lower sensitivity to outliers. Default is `0.95`.
    bw : array_like or str, optional
        If an array, it is a fixed user-specified bandwidth. If `None`, set
        to `normal_reference`. If a string, should be one of:

            - normal_reference: normal reference rule of thumb (default)
            - cv_ml: cross validation maximum likelihood
            - cv_ls: cross validation least squares

    xdata : ndarray, optional
        The independent variable for the data. If not given, it is assumed
        to be an array of integers 0..N-1 with N the length of the vectors
        in `data`.
    labels : sequence of scalar or str, optional
        The labels or identifiers of the curves in `data`. If not given,
        outliers are labeled in the plot with array indices.
    ax : Matplotlib AxesSubplot instance, optional
        If given, this subplot is used to plot in instead of a new figure
        being created.
    use_brute : bool
        Use the brute force optimizer instead of the default differential
        evolution to find the curves. Default is False.
    seed : {None, int, np.random.RandomState}
        Seed value to pass to scipy.optimize.differential_evolution. Can
        be an integer or RandomState instance. If None, then the default
        RandomState provided by np.random is used.

    Returns
    -------
    fig : Matplotlib figure instance
        If `ax` is None, the created figure.  Otherwise the figure to which
        `ax` is connected.
    hdr_res : HdrResults instance
        An `HdrResults` instance with the following attributes:

            - 'median', array. Median curve.
            - 'hdr_50', array. 50% quantile band. [sup, inf] curves
            - 'hdr_90', list of array. 90% quantile band. [sup, inf] curves.
            - 'extra_quantiles', list of array. Extra quantile band.
              [sup, inf] curves.
            - 'outliers', ndarray. Outlier curves.

    Notes
    -----
    The median curve is the curve with the highest probability on the
    reduced space of a Principal Component Analysis (PCA).

    Outliers are defined as curves that fall outside the band corresponding
    to the quantile given by `threshold`.

    The non-outlying region is defined as the band made up of all the
    non-outlying curves.

    Behind the scene, the dataset is represented as a matrix. Each line
    corresponding to a 1D curve. This matrix is then decomposed using
    Principal Components Analysis (PCA). This allows to represent the data
    using a finite number of modes, or components. This compression process
    allows to turn the functional representation into a scalar
    representation of the matrix. In other words, you can visualize each
    curve from its components. Each curve is thus a point in this reduced
    space. With 2 components, this is called a bivariate plot (2D plot).

    In this plot, if some points are adjacent (similar components), it means
    that back in the original space, the curves are similar.

    Then, finding the median curve means finding the higher density region
    (HDR) in the reduced space. Moreover, the more you get away from this
    HDR, the more the curve is unlikely to be similar to the other curves.

    Using a kernel smoothing technique, the probability density function
    (PDF) of the multivariate space can be recovered. From this PDF, it is
    possible to compute the density probability linked to the cluster of
    points and plot its contours.

    Finally, using these contours, the different quantiles can be extracted
    along with the median curve and the outliers.

    Steps to produce the HDR boxplot include:

    1. Compute a multivariate kernel density estimation
    2. Compute contour lines for quantiles 90%, 50% and `alpha` %
    3. Plot the bivariate plot
    4. Compute median curve along with quantiles and outliers curves.

    References
    ----------
    [1] R.J. Hyndman and H.L. Shang, "Rainbow Plots, Bagplots, and Boxplots
        for Functional Data", vol. 19, pp. 29-45, 2010.

    Examples
    --------
    Load the El Nino dataset.  Consists of 60 years worth of Pacific Ocean
    sea surface temperature data.

    >>> import matplotlib.pyplot as plt
    >>> import statsmodels.api as sm
    >>> data = sm.datasets.elnino.load(as_pandas=False)

    Create a functional boxplot.  We see that the years 1982-83 and 1997-98
    are outliers; these are the years where El Nino (a climate pattern
    characterized by warming up of the sea surface and higher air pressures)
    occurred with unusual intensity.

    >>> fig = plt.figure()
    >>> ax = fig.add_subplot(111)
    >>> res = sm.graphics.hdrboxplot(data.raw_data[:, 1:],
    ...                              labels=data.raw_data[:, 0].astype(int),
    ...                              ax=ax)

    >>> ax.set_xlabel("Month of the year")
    >>> ax.set_ylabel("Sea surface temperature (C)")
    >>> ax.set_xticks(np.arange(13, step=3) - 1)
    >>> ax.set_xticklabels(["", "Mar", "Jun", "Sep", "Dec"])
    >>> ax.set_xlim([-0.2, 11.2])
    >>> plt.show()

    .. plot:: plots/graphics_functional_hdrboxplot.py

    See Also
    --------
    banddepth, rainbowplot, fboxplot
    """
    fig, ax = utils.create_mpl_ax(ax)

    if labels is None:
        # For use with pandas, get the labels
        if hasattr(data, 'index'):
            labels = data.index
        else:
            labels = np.arange(len(data))

    data = np.asarray(data)
    if xdata is None:
        xdata = np.arange(data.shape[1])

    n_samples, dim = data.shape
    # PCA and bivariate plot
    pca = PCA(data, ncomp=ncomp)
    data_r = pca.factors

    # Create gaussian kernel
    ks_gaussian = KDEMultivariate(data_r, bw=bw,
                                  var_type='c' * data_r.shape[1])

    # Boundaries of the n-variate space
    bounds = np.array([data_r.min(axis=0), data_r.max(axis=0)]).T

    # Compute contour line of pvalue linked to a given probability level
    if alpha is None:
        alpha = [threshold, 0.9, 0.5]
    else:
        alpha.extend([threshold, 0.9, 0.5])
        alpha = list(set(alpha))
    alpha.sort(reverse=True)

    n_quantiles = len(alpha)
    pdf_r = ks_gaussian.pdf(data_r).flatten()
    pvalues = [np.percentile(pdf_r, (1 - alpha[i]) * 100,
                             interpolation='linear')
               for i in range(n_quantiles)]

    # Find mean, outliers curves
    if have_de_optim and not use_brute:
        median = differential_evolution(lambda x: -ks_gaussian.pdf(x),
                                        bounds=bounds, maxiter=5,
                                        seed=seed).x
    else:
        median = brute(lambda x: -ks_gaussian.pdf(x),
                       ranges=bounds, finish=fmin)

    outliers_idx = np.where(pdf_r < pvalues[alpha.index(threshold)])[0]
    labels_outlier = [labels[i] for i in outliers_idx]
    outliers = data[outliers_idx]

    # Find HDR given some quantiles

    def _band_quantiles(band, use_brute=use_brute, seed=seed):
        """
        Find extreme curves for a quantile band.

        From the `band` of quantiles, the associated PDF extrema values
        are computed. If `min_alpha` is not provided (single quantile
        value), `max_pdf` is set to `1E6` in order not to constrain the
        problem on high values.

        An optimization is performed per component in order to find the
        min and max curves. This is done by comparing the PDF value of a
        given curve with the band PDF.

        Parameters
        ----------
        band : array_like
            alpha values ``(max_alpha, min_alpha)`` ex: ``[0.9, 0.5]``
        use_brute : bool
            Use the brute force optimizer instead of the default
            differential evolution to find the curves. Default is False.
        seed : {None, int, np.random.RandomState}
            Seed value to pass to scipy.optimize.differential_evolution.
            Can be an integer or RandomState instance. If None, then the
            default RandomState provided by np.random is used.

        Returns
        -------
        band_quantiles : list of 1-D array
            ``(max_quantile, min_quantile)`` (2, n_features)
        """
        min_pdf = pvalues[alpha.index(band[0])]
        try:
            max_pdf = pvalues[alpha.index(band[1])]
        except IndexError:
            max_pdf = 1E6
        band = [min_pdf, max_pdf]

        pool = Pool()
        data = zip(range(dim),
                   itertools.repeat((band, pca, bounds,
                                     ks_gaussian, seed, use_brute)))
        band_quantiles = pool.map(_min_max_band, data)
        pool.terminate()
        pool.close()

        band_quantiles = list(zip(*band_quantiles))

        return band_quantiles

    extra_alpha = [i for i in alpha
                   if 0.5 != i and 0.9 != i and threshold != i]
    if len(extra_alpha) > 0:
        extra_quantiles = []
        for x in extra_alpha:
            for y in _band_quantiles([x], use_brute=use_brute, seed=seed):
                extra_quantiles.append(y)
    else:
        extra_quantiles = []

    # Inverse transform from n-variate plot to the dataset's shape
    median = _inverse_transform(pca, median)[0]
    hdr_90 = _band_quantiles([0.9, 0.5], use_brute=use_brute, seed=seed)
    hdr_50 = _band_quantiles([0.5], use_brute=use_brute, seed=seed)

    hdr_res = HdrResults({
        "median": median,
        "hdr_50": hdr_50,
        "hdr_90": hdr_90,
        "extra_quantiles": extra_quantiles,
        "outliers": outliers,
        "outliers_idx": outliers_idx
    })

    # Plots
    ax.plot(np.array([xdata] * n_samples).T, data.T,
            c='c', alpha=.1, label=None)
    ax.plot(xdata, median, c='k', label='Median')
    fill_betweens = []
    fill_betweens.append(ax.fill_between(xdata, *hdr_50, color='gray',
                                         alpha=.4, label='50% HDR'))
    fill_betweens.append(ax.fill_between(xdata, *hdr_90, color='gray',
                                         alpha=.3, label='90% HDR'))

    if len(extra_quantiles) != 0:
        ax.plot(np.array([xdata] * len(extra_quantiles)).T,
                np.array(extra_quantiles).T,
                c='y', ls='-.', alpha=.4, label='Extra quantiles')

    if len(outliers) != 0:
        for ii, outlier in enumerate(outliers):
            if labels_outlier is None:
                label = 'Outliers'
            else:
                label = str(labels_outlier[ii])
            ax.plot(xdata, outlier, ls='--', alpha=0.7, label=label)

    handles, labels = ax.get_legend_handles_labels()

    # Proxy artist for fill_between legend entry
    # See http://matplotlib.org/1.3.1/users/legend_guide.html
    plt = _import_mpl()
    for label, fill_between in zip(['50% HDR', '90% HDR'], fill_betweens):
        p = plt.Rectangle((0, 0), 1, 1, fc=fill_between.get_facecolor()[0])
        handles.append(p)
        labels.append(label)

    by_label = OrderedDict(zip(labels, handles))
    if len(outliers) != 0:
        by_label.pop('Median')
        by_label.pop('50% HDR')
        by_label.pop('90% HDR')

    ax.legend(by_label.values(), by_label.keys(), loc='best')

    return fig, hdr_res
ax.axhline(0, color='k')
ax.set_ylim([-1, 1])

# hack?
ax.set_xlim([-1, xs[-1] + 1])
mpl.rcParams['font.size'] = old_size

#Example TSA descriptive
data = sm.datasets.macrodata.load()
mdata = data.data
df = DataFrame.from_records(mdata)
quarter_end = frequencies.BQuarterEnd()
df.index = [quarter_end.rollforward(datetime(int(y), int(q) * 3, 1))
            for y, q in zip(df.pop('year'), df.pop('quarter'))]
logged = np.log(df.ix[:, ['m1', 'realgdp', 'cpi']])
logged.plot(subplots=True)

log_difference = logged.diff().dropna()
plot_acf_multiple(log_difference.values)

#Example TSA VAR
model = tsa.VAR(log_difference, freq='D')
print(model.select_order())

res = model.fit(2)
print(res.summary())
print(res.is_stable())
        print('final branch with', tree, ''.join(tree), leavessum)  #sum(tree)
        if testxb:
            return leavessum  #sum(xb[tree])
        else:
            return ''.join(tree)  #sum(tree)

    print('working on branch', tree, branchsum)
    return branchsum


tree = [[0, 1], [[2, 3], [4, 5, 6]], [7]]
tree2 = ('top',
         [('B1', ['a', 'b']),
          ('B2', [('B21', ['c', 'd']),
                  ('B22', ['e', 'f', 'g'])]),
          ('B3', ['h'])])

data2 = dict([i for i in zip('abcdefgh', lrange(8))])
#data2.update({'top':1000, 'B1':100, 'B2':200, 'B21':300,'B22':400, 'B3':400})
data2.update({
    'top': 1000,
    'B1': 100,
    'B2': 200,
    'B21': 21,
    'B22': 22,
    'B3': 300
})
#data2
#{'a': 0, 'c': 2, 'b': 1, 'e': 4, 'd': 3, 'g': 6, 'f': 5, 'h': 7,
# 'top': 1000, 'B22': 22, 'B21': 21, 'B1': 100, 'B2': 200, 'B3': 300}

print('\n tree with dictionary data')
def grangercausalitytests(x, mxlg, autolag=None, alpha=0.0001, max_iter=1e5,
                          addconst=True, verbose=True):
    """four tests for granger non causality of 2 timeseries

    all four tests give similar results
    `params_ftest` and `ssr_ftest` are equivalent based on F test which is
    identical to lmtest:grangertest in R

    Parameters
    ----------
    x : array, 2d
        data for test whether the time series in the second column Granger
        causes the time series in the first column
    mxlg : int
        the Granger causality test results are calculated for all lags up
        to mxlg
    autolag :
        If 'aic' the lag which minimizes the information criterion is used
        from the lags
    verbose : bool
        print results if true

    Returns
    -------
    results : dictionary
        all test results, dictionary keys are the number of lags. For each
        lag the values are a tuple, with the first element a dictionary with
        test statistic, pvalues, degrees of freedom, the second element are
        the OLS estimation results for the restricted model, the
        unrestricted model and the restriction (contrast) matrix for the
        parameter f_test.

    Notes
    -----
    TODO: convert to class and attach results properly

    The Null hypothesis for grangercausalitytests is that the time series in
    the second column, x2, does NOT Granger cause the time series in the
    first column, x1. Granger causality means that past values of x2 have a
    statistically significant effect on the current value of x1, taking past
    values of x1 into account as regressors. We reject the null hypothesis
    that x2 does not Granger cause x1 if the pvalues are below a desired
    size of the test.

    The null hypothesis for all four test is that the coefficients
    corresponding to past values of the second time series are zero.

    'params_ftest', 'ssr_ftest' are based on F distribution

    'ssr_chi2test', 'lrtest' are based on chi-square distribution

    References
    ----------
    http://en.wikipedia.org/wiki/Granger_causality
    Greene: Econometric Analysis
    """
    from scipy import stats
    from sklearn.linear_model import Lasso

    x = np.asarray(x)

    if x.shape[0] <= 3 * mxlg + int(addconst):
        raise ValueError("Insufficient observations. Maximum allowable "
                         "lag is {0}"
                         .format(int((x.shape[0] - int(addconst)) / 3) - 1))

    result = {}
    if verbose:
        print('\nGranger Causality')
        print('number of lags (no zero)', mxlg)

    # create lagmat of both time series
    dta = lagmat2ds(x, mxlg, trim='both', dropex=1)

    # add constant
    if addconst:
        dtajoint = add_constant(dta[:, 1:], prepend=False)
    else:
        raise NotImplementedError('Not Implemented')

    # Run Lasso on all variables
    lassoreg = Lasso(alpha=alpha, normalize=True, max_iter=max_iter)
    lassoreg.fit(dtajoint[:, 1:], dta[:, 0])
    pred = lassoreg.predict(dtajoint[:, 1:])
    actual = dta[:, 0]
    errors = [abs(i - j) for i, j in zip(actual, pred)]
    step_size = 10
    window_size = 30
    avg_errors = []
    # use integer division so that range() receives an int on Python 3
    for i in range(len(actual) // step_size):
        err = 0
        for j in range(i * step_size, (i * step_size) + window_size):
            if j >= len(actual):
                break
            err += errors[j]
        avg_errors += [err]
    rmse = np.mean(avg_errors)
    result = lassoreg.coef_
    non_zeros = [(i, x) for i, x in enumerate(result) if x != 0]
    non_zero_vars = {}
    best_vars = {}
    for (i, x) in non_zeros:
        k = (i + 1) // mxlg  # integer division, as in the original Python 2
        if k not in non_zero_vars or abs(x) > abs(best_vars[k]):
            non_zero_vars[k] = (i + 1) % mxlg
            best_vars[k] = x
    return (rmse, non_zero_vars, best_vars)
def contrast_product(names1, names2, intgroup1=None, intgroup2=None,
                     pairs=False):
    '''build contrast matrices for products of two categorical variables

    this is an experimental script and should be converted to a class

    Parameters
    ----------
    names1, names2 : lists of strings
        contains the list of level labels for each categorical variable
    intgroup1, intgroup2 : ndarrays     TODO: this part not tested, finished yet
        categorical variable

    Notes
    -----
    This creates a full rank matrix. It does not do all pairwise
    comparisons, parameterization is using contrast_all_one to get
    differences with first level.

    ? does contrast_all_pairs work as a plugin to get all pairs ?
    '''

    n1 = len(names1)
    n2 = len(names2)
    names_prod = ['%s_%s' % (i, j) for i in names1 for j in names2]
    ee1 = np.zeros((1, n1))
    ee1[0, 0] = 1
    if not pairs:
        dd = np.r_[ee1, -contrast_all_one(n1)]
    else:
        dd = np.r_[ee1, -contrast_allpairs(n1)]

    contrast_prod = np.kron(dd[1:], np.eye(n2))
    names_contrast_prod0 = contrast_labels(contrast_prod, names_prod,
                                           reverse=True)
    # materialize the zips so they can be reversed with [::-1] on Python 3
    names_contrast_prod = [''.join(['%s%s' % (signstr(c, noplus=True), v)
                                    for c, v in list(zip(row, names_prod))[::-1]
                                    if c != 0])
                           for row in contrast_prod]

    ee2 = np.zeros((1, n2))
    ee2[0, 0] = 1
    #dd2 = np.r_[ee2, -contrast_all_one(n2)]
    if not pairs:
        dd2 = np.r_[ee2, -contrast_all_one(n2)]
    else:
        dd2 = np.r_[ee2, -contrast_allpairs(n2)]

    contrast_prod2 = np.kron(np.eye(n1), dd2[1:])
    names_contrast_prod2 = [''.join(['%s%s' % (signstr(c, noplus=True), v)
                                     for c, v in list(zip(row, names_prod))[::-1]
                                     if c != 0])
                            for row in contrast_prod2]

    # the original checked intgroup1 twice; both groups must be given
    if (intgroup1 is not None) and (intgroup2 is not None):
        d1, _ = dummy_1d(intgroup1)
        d2, _ = dummy_1d(intgroup2)
        dummy = dummy_product(d1, d2)
    else:
        dummy = None

    return (names_prod, contrast_prod, names_contrast_prod,
            contrast_prod2, names_contrast_prod2, dummy)
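# Sketch of the Kronecker construction used above (hedged: `dd` is written
# out by hand instead of calling contrast_all_one): differences-from-first-
# level contrasts for factor 1, expanded over the identity of factor 2.
import numpy as np

n1, n2 = 3, 2
# rows compare levels 2..n1 of factor 1 against level 1
dd = np.column_stack([-np.ones(n1 - 1), np.eye(n1 - 1)])
contrast_prod = np.kron(dd, np.eye(n2))
print(contrast_prod.shape)  # (4, 6): (n1-1)*n2 contrasts over n1*n2 cells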
def summary_col(results, float_format='%.4f', model_names=[], stars=True,
                more_info=None, regressor_order=[], show='t', title=None):
    # I added the parameter 'show' and changed the default of 'stars' to
    # 'True', then renamed the dict parameter 'info_dict' as a list one
    # 'more_info', and finally assigned regressor_order an initial value
    # ['const']
    """
    Summarize multiple results instances side-by-side (coefs and SEs)

    Parameters
    ----------
    results : statsmodels results instance or list of result instances
    float_format : string
        float format for coefficients and standard errors
        Default : '%.4f'
    model_names : list of strings of length len(results)
        if the names are not unique, a roman number will be appended to all
        model names
    stars : bool
        print significance stars
    info_dict : dict
        dict of lambda functions to be applied to results instances to
        retrieve model info. To use specific information for different
        models, add a (nested) info_dict with model name as the key.
        Example: `info_dict = {"N":..., "R2": ..., "OLS":{"R2":...}}` would
        only show `R2` for OLS regression models, but additionally `N` for
        all other results.
        Default : None (use the info_dict specified in
        result.default_model_infos, if this property exists)
    regressor_order : list of strings
        list of names of the regressors in the desired order. All regressors
        not specified will be appended to the end of the list.
    """

    if not isinstance(results, list):
        results = [results]

    cols = [_col_params(x, stars=stars, float_format=float_format, show=show)
            for x in results]

    # Unique column names (pandas has problems merging otherwise)
    if model_names:
        colnames = _make_unique(model_names)
    else:
        colnames = _make_unique([x.columns[0] for x in cols])
    for i in range(len(cols)):
        cols[i].columns = [colnames[i]]

    merg = lambda x, y: x.merge(y, how='outer', right_index=True,
                                left_index=True)
    summ = reduce(merg, cols)

    # if regressor_order:
    if not regressor_order:
        regressor_order = ['const']

    varnames = summ.index.get_level_values(0).tolist()
    ordered = [x for x in regressor_order if x in varnames]
    unordered = [x for x in varnames if x not in regressor_order + ['']]

    # Note: np.unique can disrupt the original order of the list
    # 'unordered'; pd.Series().unique() preserves it.
    # order = ordered + list(np.unique(unordered))
    order = ordered + list(pd.Series(unordered).unique())

    f = lambda idx: sum([[x + 'coef', x + 'stde'] for x in idx], [])
    # summ.index = f(np.unique(varnames))
    summ.index = f(pd.Series(varnames).unique())
    summ = summ.reindex(f(order))
    summ.index = [x[:-4] for x in summ.index]

    idx = pd.Series(lrange(summ.shape[0])) % 2 == 1
    summ.index = np.where(idx, '', summ.index.get_level_values(0))
    summ = summ.fillna('')

    # add infos about the models.
    cols = [_col_info(x, more_info=more_info) for x in results]
    # use unique column names, otherwise the merge will not succeed
    for df, name in zip(cols, _make_unique([df.columns[0] for df in cols])):
        df.columns = [name]
    merg = lambda x, y: x.merge(y, how='outer', right_index=True,
                                left_index=True)
    info = reduce(merg, cols)
    info.columns = summ.columns
    info = info.fillna('')

    if show == 'se':
        note = ['\t Std. error in parentheses.']
    elif show == 'p':
        note = ['\t pvalues in parentheses.']
    else:  # default: t statistics
        note = ['\t t statistics in parentheses.']
    if stars:
        note += ['\t * p<.1, ** p<.05, ***p<.01']

    # Put the note text into the index column; placing extra text there
    # prints better than adding it as a data column.
    note_df = pd.DataFrame([], index=['note:'] + note,
                           columns=summ.columns).fillna('')

    if title is not None:
        title = str(title)
    else:
        title = '\t Results Summary'
    # The title is likewise carried in the index column of a one-row frame.
    title_df = pd.DataFrame([], index=[title], columns=summ.columns).fillna('')

    smry = Summary()
    smry.add_df(title_df, header=False, align='l')
    smry.add_df(summ, header=True, align='l')
    smry.add_df(info, header=False, align='l')
    smry.add_df(note_df, header=False, align='l')
    return smry
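# A short usage sketch for summary_col, assuming statsmodels' OLS is
# available and that summary_col as defined above is importable. The data
# below are synthetic and purely illustrative.
import numpy as np
import statsmodels.api as sm

np.random.seed(0)
x = sm.add_constant(np.random.normal(size=(100, 2)))
y1 = x @ [1.0, 0.5, -0.2] + np.random.normal(size=100)
y2 = x @ [0.8, 0.0, 0.3] + np.random.normal(size=100)

res1 = sm.OLS(y1, x).fit()
res2 = sm.OLS(y2, x).fit()

# side-by-side table with t statistics in parentheses and stars
print(summary_col([res1, res2], show='t', stars=True,
                  model_names=['m1', 'm2'], title='Two OLS fits'))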
kde = KDE(x, kern)
print(kde.density(np.matrix([1, 2])))  # .T

# scatter plot of the sample with a contour plot of the density estimate
plt.figure()
plt.plot(x[:, 0], x[:, 1], 'o')

n_grid = 50
xsp = np.linspace(x.min(0)[0], x.max(0)[0], n_grid)
ysp = np.linspace(x.min(0)[1], x.max(0)[1], n_grid)
# alternative: restrict the grid to the interquartile range
# xsorted = np.sort(x)
# xlow = xsorted[nobs / 4]
# xupp = xsorted[3 * nobs / 4]
# xsp = np.linspace(xlow[0], xupp[0], n_grid)
# ysp = np.linspace(xlow[1], xupp[1], n_grid)

# evaluate the density at every grid point
xr, yr = np.meshgrid(xsp, ysp)
kde_vals = np.array([kde.density(np.matrix([xi, yi]))
                     for xi, yi in zip(xr.ravel(), yr.ravel())])
plt.contour(xsp, ysp, kde_vals.reshape(n_grid, n_grid))

plt.show()

# 5 D case
# random.seed(142)
# mu = [1.0, 4.0, 3.5, -2.4, 0.0]
# sigma = np.matrix([[0.6 - 0.1 * abs(i - j) if i != j else 1.0
#                     for j in range(5)] for i in range(5)])
# x = random.multivariate_normal(mu, sigma, size=100)
# kern = kernel.Gaussian()
# kde = KernelEstimate(x, kern)
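# A sketch of the setup the 2-D script above assumes: a bivariate normal
# sample `x` and a Gaussian kernel `kern`. The `kernel.Gaussian` class and
# sample-generation pattern are taken from the commented 5-D example; the
# specific mean and covariance here are illustrative assumptions.
import numpy as np
from numpy import random

random.seed(142)
mu = [1.0, 3.5]
sigma = np.array([[1.0, 0.5],
                  [0.5, 1.0]])
nobs = 100
x = random.multivariate_normal(mu, sigma, size=nobs)
kern = kernel.Gaussian()  # assumed: same kernel class as in the 5-D example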
def interaction_plot(x, trace, response, func=np.mean, ax=None, plottype='b',
                     xlabel=None, ylabel=None, colors=None, markers=None,
                     linestyles=None, legendloc='best', legendtitle=None,
                     **kwargs):
    """
    Interaction plot for factor level statistics.

    Note: if categorical factors are supplied, levels will be internally
    recoded to integers. This ensures matplotlib compatibility.

    Uses a pandas.DataFrame to calculate an `aggregate` statistic for each
    level of the factor or group given by `trace`.

    Parameters
    ----------
    x : array-like
        The `x` factor levels constitute the x-axis. If a `pandas.Series` is
        given its name will be used in `xlabel` if `xlabel` is None.
    trace : array-like
        The `trace` factor levels will be drawn as lines in the plot.
        If `trace` is a `pandas.Series` its name will be used as the
        `legendtitle` if `legendtitle` is None.
    response : array-like
        The response or dependent variable. If a `pandas.Series` is given
        its name will be used in `ylabel` if `ylabel` is None.
    func : function
        Anything accepted by `pandas.DataFrame.aggregate`. This is applied to
        the response variable grouped by the trace levels.
    plottype : str {'line', 'scatter', 'both'}, optional
        The type of plot to return. Can be abbreviated as 'l', 's', or 'b'.
    ax : axes, optional
        Matplotlib axes instance
    xlabel : str, optional
        Label to use for `x`. Default is 'X'. If `x` is a `pandas.Series` its
        series name will be used.
    ylabel : str, optional
        Label to use for `response`. Default is 'func of response'. If
        `response` is a `pandas.Series` its series name will be used.
    colors : list, optional
        If given, must have length == number of levels in trace.
    linestyles : list, optional
        If given, must have length == number of levels in trace.
    markers : list, optional
        If given, must have length == number of levels in trace.
    kwargs
        These will be passed to the plot command used, either `plot` or
        `scatter`. Use kwargs to control the overall plotting options.

    Returns
    -------
    fig : Figure
        The figure given by `ax.figure` or a new instance.

    Examples
    --------
    >>> import numpy as np
    >>> np.random.seed(12345)
    >>> weight = np.random.randint(1,4,size=60)
    >>> duration = np.random.randint(1,3,size=60)
    >>> days = np.log(np.random.randint(1,30, size=60))
    >>> fig = interaction_plot(weight, duration, days,
    ...             colors=['red','blue'], markers=['D','^'], ms=10)
    >>> import matplotlib.pyplot as plt
    >>> plt.show()

    .. plot::
       import numpy as np
       from statsmodels.graphics.factorplots import interaction_plot
       np.random.seed(12345)
       weight = np.random.randint(1,4,size=60)
       duration = np.random.randint(1,3,size=60)
       days = np.log(np.random.randint(1,30, size=60))
       fig = interaction_plot(weight, duration, days,
                   colors=['red','blue'], markers=['D','^'], ms=10)
       import matplotlib.pyplot as plt
       #plt.show()
    """
    from pandas import DataFrame
    fig, ax = utils.create_mpl_ax(ax)

    response_name = ylabel or getattr(response, 'name', 'response')
    ylabel = '%s of %s' % (get_function_name(func), response_name)
    xlabel = xlabel or getattr(x, 'name', 'X')
    legendtitle = legendtitle or getattr(trace, 'name', 'Trace')

    ax.set_ylabel(ylabel)
    ax.set_xlabel(xlabel)

    x_values = x_levels = None
    if isinstance(x[0], str):
        x_levels = [l for l in np.unique(x)]
        x_values = lrange(len(x_levels))
        x = _recode(x, dict(zip(x_levels, x_values)))

    data = DataFrame(dict(x=x, trace=trace, response=response))
    plot_data = data.groupby(['trace', 'x']).aggregate(func).reset_index()

    # check plot args
    n_trace = len(plot_data['trace'].unique())

    linestyles = ['-'] * n_trace if linestyles is None else linestyles
    markers = ['.'] * n_trace if markers is None else markers
    colors = rainbow(n_trace) if colors is None else colors

    if len(linestyles) != n_trace:
        raise ValueError("Must be a linestyle for each trace level")
    if len(markers) != n_trace:
        raise ValueError("Must be a marker for each trace level")
    if len(colors) != n_trace:
        raise ValueError("Must be a color for each trace level")

    if plottype == 'both' or plottype == 'b':
        for i, (values, group) in enumerate(plot_data.groupby(['trace'])):
            # trace label
            label = str(group['trace'].values[0])
            ax.plot(group['x'], group['response'], color=colors[i],
                    marker=markers[i], label=label,
                    linestyle=linestyles[i], **kwargs)
    elif plottype == 'line' or plottype == 'l':
        for i, (values, group) in enumerate(plot_data.groupby(['trace'])):
            # trace label
            label = str(group['trace'].values[0])
            ax.plot(group['x'], group['response'], color=colors[i],
                    label=label, linestyle=linestyles[i], **kwargs)
    elif plottype == 'scatter' or plottype == 's':
        for i, (values, group) in enumerate(plot_data.groupby(['trace'])):
            # trace label
            label = str(group['trace'].values[0])
            ax.scatter(group['x'], group['response'], color=colors[i],
                       label=label, marker=markers[i], **kwargs)
    else:
        raise ValueError("Plot type %s not understood" % plottype)
    ax.legend(loc=legendloc, title=legendtitle)
    ax.margins(.1)
    if all([x_levels, x_values]):
        ax.set_xticks(x_values)
        ax.set_xticklabels(x_levels)
    return fig
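# A short usage sketch exercising the string-recoding branch of
# interaction_plot above: the x factor is categorical, so its levels are
# internally recoded to integers before plotting. The data are synthetic
# and the factor names are illustrative assumptions.
import numpy as np

np.random.seed(42)
dose = np.random.choice(['low', 'mid', 'high'], size=60)   # string x factor
drug = np.random.randint(0, 2, size=60)                    # trace factor
outcome = np.random.normal(size=60)

fig = interaction_plot(dose, drug, outcome, func=np.mean,
                       plottype='b', xlabel='dose')
fig.savefig('interaction.png')  # or plt.show() in an interactive session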