def getrlm(self):
    """Copy the rlm quantities out of the parsed results onto the instance.

    Reads ``self.results`` and ``self.rsum`` and sets ``self.k2``,
    ``self.weights``, ``self.stddev`` and ``self.wresid``.

    When ``self.results['w']`` is a dict, its keys are interpreted as
    1-based positions (anything ``int()`` accepts) and the values are
    placed into a float array in positional order; otherwise the value is
    stored unchanged.
    """
    self.k2 = self.results['k2']
    raw_weights = self.results['w']
    if isinstance(raw_weights, dict):
        # Keys are 1-based, so shift by one to get the array slot.
        ordered = np.zeros(len(raw_weights))
        for position, weight in raw_weights.items():
            ordered[int(position) - 1] = weight
        self.weights = ordered
    else:
        self.weights = raw_weights
    self.stddev = self.rsum['stddev']  # Don't know what this is yet
    self.wresid = None  # these equal resids always?
def getrlm(self):
    """Populate the rlm attributes from ``self.results`` and ``self.rsum``.

    Sets ``self.k2``, ``self.weights``, ``self.stddev`` and ``self.wresid``.

    If ``self.results['w']`` is a dict, its keys are treated as 1-based
    positions (converted with ``int()``) and the values are written into a
    float array at the corresponding 0-based index. Any other value is
    stored as-is.
    """
    self.k2 = self.results['k2']
    w = self.results['w']
    if isinstance(w, dict):
        # One slot per entry; key "1" goes to index 0, and so on.
        weights = np.zeros(len(w))
        for key, value in w.items():
            weights[int(key) - 1] = value
        self.weights = weights
    else:
        self.weights = w
    self.stddev = self.rsum['stddev']  # Don't know what this is yet
    self.wresid = None  # these equal resids always?
def test_pickle_wrapper(self):
    """Round-trip the wrapped and unwrapped results through save/load.

    Checks that the reloaded objects have the same type as the originals
    and that the attribute dictionaries (wrapper, inner results, model and
    cache) expose the same keys before and after.
    """

    def _check_same_keys(original, restored):
        # Compare sorted key lists so the message shows both sides in order.
        before = sorted(iterkeys(original))
        after = sorted(iterkeys(restored))
        assert_(before == after, msg='not equal %r and %r' % (before, after))

    fh = BytesIO()  # use cPickle with binary content

    # test unwrapped results load save pickle
    self.results._results.save(fh)
    fh.seek(0, 0)
    res_unpickled = self.results._results.__class__.load(fh)
    assert_(type(res_unpickled) is type(self.results._results))

    # test wrapped results load save
    fh.seek(0, 0)
    self.results.save(fh)
    fh.seek(0, 0)
    res_unpickled = self.results.__class__.load(fh)
    fh.close()
    assert_(type(res_unpickled) is type(self.results))

    _check_same_keys(self.results.__dict__, res_unpickled.__dict__)
    _check_same_keys(self.results._results.__dict__,
                     res_unpickled._results.__dict__)
    _check_same_keys(self.results.model.__dict__,
                     res_unpickled.model.__dict__)
    _check_same_keys(self.results._cache, res_unpickled._cache)
def test_pickle_wrapper(self):
    """Save and reload both the inner and the wrapped results object.

    After each round trip the reloaded object must have exactly the same
    type as the source, and the key sets of the wrapper ``__dict__``, the
    inner results ``__dict__``, the model ``__dict__`` and the ``_cache``
    must be unchanged.
    """

    def _assert_keys_match(source, reloaded):
        # Sorted lists give a stable, readable failure message.
        before = sorted(iterkeys(source))
        after = sorted(iterkeys(reloaded))
        assert_(before == after, msg='not equal %r and %r' % (before, after))

    fh = BytesIO()  # use pickle with binary content

    # test unwrapped results load save pickle
    self.results._results.save(fh)
    fh.seek(0, 0)
    res_unpickled = self.results._results.__class__.load(fh)
    assert type(res_unpickled) is type(self.results._results)  # noqa: E721

    # test wrapped results load save
    fh.seek(0, 0)
    self.results.save(fh)
    fh.seek(0, 0)
    res_unpickled = self.results.__class__.load(fh)
    fh.close()
    assert type(res_unpickled) is type(self.results)  # noqa: E721

    _assert_keys_match(self.results.__dict__, res_unpickled.__dict__)
    _assert_keys_match(self.results._results.__dict__,
                       res_unpickled._results.__dict__)
    _assert_keys_match(self.results.model.__dict__,
                       res_unpickled.model.__dict__)
    _assert_keys_match(self.results._cache, res_unpickled._cache)
def score(self, b, ties='breslow'):
    """Accumulate the partial-likelihood score over all failure times.

    Parameters
    ----------
    b : array-like
        Coefficient(s); combined with each cached design via ``np.dot``.
    ties : {'breslow', 'efron'}
        Tie-handling method. ``'cox'`` and any unknown value raise.

    Returns
    -------
    ndarray
        One-element array holding the accumulated score.

    Raises
    ------
    NotImplementedError
        If ``ties`` is ``'cox'`` or not recognized.

    Notes
    -----
    For each failure time the contribution is the sum of the design
    entries of the failing subjects minus the weighted mean(s) over the
    risk set, as computed by ``Discrete(...).mean()``.

    Fixes relative to the previous version of the Efron branch: the loop
    referenced an undefined name ``i`` instead of the loop variable; the
    weights were modified in place through an alias, so the adjustment
    accumulated across iterations; and the failure sum was added a second
    time although it is already added once before the branch.
    """
    score = 0
    for t in iterkeys(self.failures):
        fail = self.failures[t]
        d = len(fail)
        risk = self.risk[t]
        Z = self.design[t]

        score += Z[fail].sum()

        if ties == 'breslow':
            w = np.exp(np.dot(Z, b))
            rv = Discrete(Z[risk], w=w[risk])
            score -= rv.mean() * d
        elif ties == 'efron':
            w = np.exp(np.dot(Z, b))
            for j in range(d):
                # Work on a copy: the j-th term shrinks the weights of the
                # tied failures by j/d relative to the ORIGINAL weights.
                efron_w = w.copy()
                efron_w[fail] -= j * w[fail] / float(d)
                rv = Discrete(Z[risk], w=efron_w[risk])
                score -= rv.mean()
        elif ties == 'cox':
            raise NotImplementedError('Cox tie breaking method not '
                                      'implemented')
        else:
            raise NotImplementedError('tie breaking method not recognized')
    return np.array([score])
def loglike(self, b, ties='breslow'):
    """Accumulate the partial log-likelihood over all failure times.

    Parameters
    ----------
    b : array-like
        Coefficient(s); combined with each cached design via ``np.dot``.
    ties : {'breslow', 'efron'}
        Tie-handling method. ``'cox'`` and any unknown value raise.

    Returns
    -------
    float
        The accumulated log-likelihood (the integer ``0`` when there are
        no failure times).

    Raises
    ------
    NotImplementedError
        If ``ties`` is ``'cox'`` or not recognized.

    Notes
    -----
    With ``s`` the sum of ``exp(Zb)`` over the risk set, ``r`` the same
    sum over the ``d`` tied failures:

    * breslow subtracts ``d * log(s)``;
    * efron subtracts ``log(s - j * r / d)`` for ``j = 0 .. d-1``.
    """
    logL = 0
    for t, fail in self.failures.items():
        d = len(fail)
        risk = self.risk[t]
        Zb = np.dot(self.design[t], b)

        logL += Zb[fail].sum()

        if ties == 'breslow':
            s = np.exp(Zb[risk]).sum()
            logL -= np.log(s) * d
        elif ties == 'efron':
            s = np.exp(Zb[risk]).sum()
            r = np.exp(Zb[fail]).sum()
            for j in range(d):
                logL -= np.log(s - j * r / d)
        elif ties == 'cox':
            raise NotImplementedError('Cox tie breaking method not '
                                      'implemented')
        else:
            raise NotImplementedError('tie breaking method not recognized')
    return logL
def cache(self):
    """Build ``self.design`` and ``self.risk`` for every failure time.

    For each key ``t`` of ``self.failures``:

    * ``self.design[t]`` holds the values ``s(self.formula, time=t)`` for
      every subject ``s`` in ``self.subjects``;
    * ``self.risk[t]`` holds the row indices of the subjects for which
      ``s.atrisk(t)`` is true.

    When ``self.time_dependent`` is true, each design is written to its
    own file inside a fresh temporary directory (``self.cachedir``) and
    exposed as an ``np.memmap`` with an added trailing axis. Otherwise the
    design is evaluated once, at the first failure time, and the same
    array object is shared by all failure times.

    Fixes relative to the previous version: the Python 2 only ``file()``
    builtin is no longer used, the temporary file is written in binary
    mode without leaking the descriptor returned by ``mkstemp``, and the
    non-time-dependent design is really reused (the old ``first`` flag was
    never cleared, so the design was rebuilt for every failure time).
    """
    if self.time_dependent:
        self.cachedir = tempfile.mkdtemp()

    self.design = {}
    self.risk = {}
    shared_design = None
    for t in self.failures:
        if self.time_dependent:
            d = np.array([s(self.formula, time=t)
                          for s in self.subjects]).astype(float)[:, None]
            dshape = d.shape
            # Reserve a unique file name, close the handle, then let numpy
            # write the raw bytes to that path.
            with tempfile.NamedTemporaryFile(dir=self.cachedir,
                                             delete=False) as handle:
                path = handle.name
            d.tofile(path)
            del d
            self.design[t] = np.memmap(path,
                                       dtype=np.dtype(float),
                                       shape=dshape)
        else:
            if shared_design is None:
                shared_design = np.array(
                    [s(self.formula, time=t)
                     for s in self.subjects]).astype(np.float64)
            self.design[t] = shared_design
        self.risk[t] = np.compress([s.atrisk(t) for s in self.subjects],
                                   np.arange(self.design[t].shape[0]),
                                   axis=-1)
def handle_formula_data(Y, X, formula, depth=0, missing='drop'):
    """
    Build the design matrices for ``formula`` from ``Y`` (and ``X``).

    Parameters
    ----------
    Y : array-like
        Either endog (the LHS) of a model specification or all of the data.
        Y must define __getitem__ for now.
    X : array-like
        Either exog or None. If all the data for the formula is provided in
        Y then you must explicitly set X to None.
    formula : str or patsy.model_desc
        You can pass a handler by import formula_handler and adding a
        key-value pair where the key is the formula object class and
        the value is a function that returns endog, exog, formula object
    depth : int
        Passed through positionally to ``dmatrices``.
    missing : str
        Passed to ``NAAction`` as ``on_NA``.

    Returns
    -------
    result
        Whatever ``dmatrices`` returns for ``return_type='dataframe'``.
    missing_mask
        ``na_action.missing_mask`` if that attribute exists and has at
        least one truthy entry, otherwise None.

    Notes
    -----
    If ``formula`` is an instance of a class registered in
    ``formula_handler``, the registered entry is returned as-is (it is not
    called) and nothing else is done.
    NOTE(review): returning the handler without calling it looks
    unintended given the parameter description - confirm with callers.
    """
    # minimal attempt to handle other formula objects
    if isinstance(formula, tuple(iterkeys(formula_handler))):
        return formula_handler[type(formula)]

    na_action = NAAction(on_NA=missing)

    # The pandas and non-pandas code paths were identical, so only the
    # presence of X decides what is handed to dmatrices.
    data = Y if X is None else (Y, X)
    result = dmatrices(formula, data, depth, return_type='dataframe',
                       NA_action=na_action)

    # if missing == 'raise' there's not missing_mask
    missing_mask = getattr(na_action, 'missing_mask', None)
    if not np.any(missing_mask):
        missing_mask = None
    return result, missing_mask
def information(self, b, ties='breslow'):
    """Accumulate the observed information over all failure times.

    Parameters
    ----------
    b : array-like
        Coefficient(s); combined with each cached design via ``np.dot``.
    ties : {'breslow', 'efron'}
        Tie-handling method. ``'cox'`` and any unknown value raise.

    Returns
    -------
    The sum of the weighted covariances ``Discrete(...).cov()`` over the
    risk sets (the integer ``0`` when there are no failure times).

    Raises
    ------
    NotImplementedError
        If ``ties`` is ``'cox'`` or not recognized.

    Notes
    -----
    Fixes relative to the previous version: the function returned an
    unrelated ``score`` accumulator instead of ``info``; the Efron loop
    referenced an undefined name ``i`` and modified the weights in place
    through an alias; and the Breslow term lacked the factor ``d`` (the
    number of tied failures) that the matching Breslow log-likelihood
    term ``d * log(sum of weights)`` implies.
    """
    info = 0
    for t in iterkeys(self.failures):
        fail = self.failures[t]
        d = len(fail)
        risk = self.risk[t]
        Z = self.design[t]

        if ties == 'breslow':
            w = np.exp(np.dot(Z, b))
            rv = Discrete(Z[risk], w=w[risk])
            info += rv.cov() * d
        elif ties == 'efron':
            w = np.exp(np.dot(Z, b))
            for j in range(d):
                # Copy so each term is adjusted from the original weights.
                efron_w = w.copy()
                efron_w[fail] -= j * w[fail] / float(d)
                rv = Discrete(Z[risk], w=efron_w[risk])
                info += rv.cov()
        elif ties == 'cox':
            raise NotImplementedError('Cox tie breaking method not '
                                      'implemented')
        else:
            raise NotImplementedError('tie breaking method not recognized')
    return info
def score(self, b, ties='breslow'):
    """Return the partial-likelihood score summed over failure times.

    Parameters
    ----------
    b : array-like
        Coefficient(s); combined with each cached design via ``np.dot``.
    ties : {'breslow', 'efron'}
        Tie-handling method. ``'cox'`` and any unknown value raise.

    Returns
    -------
    ndarray
        One-element array holding the accumulated score.

    Raises
    ------
    NotImplementedError
        If ``ties`` is ``'cox'`` or not recognized.

    Notes
    -----
    Each failure time contributes the sum of the design entries of the
    failing subjects minus weighted means over the risk set, obtained
    from ``Discrete(...).mean()``: ``d`` times a single mean for breslow,
    one mean per tied failure (with progressively reduced failure weights)
    for efron.

    The Efron branch previously used an undefined name ``i``, mutated the
    shared weight array through an alias, and added the failure sum a
    second time; all three are corrected here.
    """
    total = 0
    for t in iterkeys(self.failures):
        fail = self.failures[t]
        d = len(fail)
        risk = self.risk[t]
        Z = self.design[t]

        total += Z[fail].sum()

        if ties == 'breslow':
            w = np.exp(np.dot(Z, b))
            total -= Discrete(Z[risk], w=w[risk]).mean() * d
        elif ties == 'efron':
            w = np.exp(np.dot(Z, b))
            for j in range(d):
                # Fresh copy per term: the reduction is j/d of the
                # original failure weights, not cumulative.
                efron_w = w.copy()
                efron_w[fail] -= j * w[fail] / float(d)
                total -= Discrete(Z[risk], w=efron_w[risk]).mean()
        elif ties == 'cox':
            raise NotImplementedError('Cox tie breaking method not '
                                      'implemented')
        else:
            raise NotImplementedError('tie breaking method not recognized')
    return np.array([total])
def loglike(self, b, ties='breslow'):
    """Return the partial log-likelihood summed over failure times.

    Parameters
    ----------
    b : array-like
        Coefficient(s); combined with each cached design via ``np.dot``.
    ties : {'breslow', 'efron'}
        Tie-handling method. ``'cox'`` and any unknown value raise.

    Returns
    -------
    float
        The accumulated log-likelihood (the integer ``0`` when there are
        no failure times).

    Raises
    ------
    NotImplementedError
        If ``ties`` is ``'cox'`` or not recognized.

    Notes
    -----
    ``risk_sum`` is the sum of ``exp(Zb)`` over the risk set and
    ``fail_sum`` the same sum over the ``d`` tied failures. Breslow
    subtracts ``d * log(risk_sum)``; Efron subtracts
    ``log(risk_sum - j * fail_sum / d)`` for ``j = 0 .. d-1``.
    """
    logL = 0
    for t, fail in self.failures.items():
        d = len(fail)
        risk = self.risk[t]
        Zb = np.dot(self.design[t], b)

        logL += Zb[fail].sum()

        if ties == 'breslow':
            risk_sum = np.exp(Zb[risk]).sum()
            logL -= np.log(risk_sum) * d
        elif ties == 'efron':
            risk_sum = np.exp(Zb[risk]).sum()
            fail_sum = np.exp(Zb[fail]).sum()
            for j in range(d):
                logL -= np.log(risk_sum - j * fail_sum / d)
        elif ties == 'cox':
            raise NotImplementedError('Cox tie breaking method not '
                                      'implemented')
        else:
            raise NotImplementedError('tie breaking method not recognized')
    return logL
def cache(self):
    """Populate ``self.design`` and ``self.risk`` keyed by failure time.

    For each key ``t`` of ``self.failures``, ``self.design[t]`` stores the
    values ``s(self.formula, time=t)`` of all subjects and ``self.risk[t]``
    stores the row indices of the subjects with ``s.atrisk(t)`` true.

    If ``self.time_dependent`` is true, a temporary directory is created
    (``self.cachedir``) and every design is dumped to its own file there
    and re-opened as an ``np.memmap`` with a trailing axis added.
    Otherwise the design is computed a single time, for the first failure
    time, and that one array is stored under every failure time.

    Changes from the previous version: ``file()`` (Python 2 only) is gone,
    the temp file is no longer opened in text mode and the descriptor from
    ``mkstemp`` no longer leaks, and the single-design reuse now actually
    happens (the ``first`` flag was never reset before).
    """
    if self.time_dependent:
        self.cachedir = tempfile.mkdtemp()

    self.design = {}
    self.risk = {}
    static_design = None
    for t in self.failures:
        if self.time_dependent:
            values = np.array([s(self.formula, time=t)
                               for s in self.subjects]).astype(float)[:, None]
            shape = values.shape
            # Create the file only to obtain a unique path; numpy writes
            # the binary content once the handle is closed.
            with tempfile.NamedTemporaryFile(dir=self.cachedir,
                                             delete=False) as tmp:
                path = tmp.name
            values.tofile(path)
            del values
            self.design[t] = np.memmap(path,
                                       dtype=np.dtype(float),
                                       shape=shape)
        else:
            if static_design is None:
                static_design = np.array(
                    [s(self.formula, time=t)
                     for s in self.subjects]).astype(np.float64)
            self.design[t] = static_design
        at_risk = [s.atrisk(t) for s in self.subjects]
        self.risk[t] = np.compress(at_risk,
                                   np.arange(self.design[t].shape[0]),
                                   axis=-1)
def information(self, b, ties='breslow'):
    """Return the observed information summed over failure times.

    Parameters
    ----------
    b : array-like
        Coefficient(s); combined with each cached design via ``np.dot``.
    ties : {'breslow', 'efron'}
        Tie-handling method. ``'cox'`` and any unknown value raise.

    Returns
    -------
    The accumulated weighted covariances ``Discrete(...).cov()`` over the
    risk sets (the integer ``0`` when there are no failure times).

    Raises
    ------
    NotImplementedError
        If ``ties`` is ``'cox'`` or not recognized.

    Notes
    -----
    Corrected defects: the old code returned a ``score`` accumulator
    rather than ``info``; its Efron loop used an undefined name ``i`` and
    changed the weight array in place via an alias; its Breslow term
    omitted the multiplicity ``d`` of tied failures, which the Breslow
    log-likelihood term ``d * log(sum of weights)`` requires.
    """
    info = 0
    for t in iterkeys(self.failures):
        fail = self.failures[t]
        d = len(fail)
        risk = self.risk[t]
        Z = self.design[t]
        w = None

        if ties == 'breslow':
            w = np.exp(np.dot(Z, b))
            info += Discrete(Z[risk], w=w[risk]).cov() * d
        elif ties == 'efron':
            w = np.exp(np.dot(Z, b))
            for j in range(d):
                # Each term starts again from the unmodified weights.
                efron_w = w.copy()
                efron_w[fail] -= j * w[fail] / float(d)
                info += Discrete(Z[risk], w=efron_w[risk]).cov()
        elif ties == 'cox':
            raise NotImplementedError('Cox tie breaking method not '
                                      'implemented')
        else:
            raise NotImplementedError('tie breaking method not recognized')
    return info
def add_dict(self, d, ncols=2, align='l', float_format="%.4f"):
    '''Add the contents of a Dict to summary table

    Parameters
    ----------
    d : dict
        Keys and values are automatically coerced to strings with str().
        Users are encouraged to format them before using add_dict.
    ncols: int
        Number of columns of the output table
    align : string
        Data alignment (l/c/r)
    float_format : str
        Format handed to ``_formatter`` for every key and value.
    '''
    labels = [_formatter(key, float_format) for key in iterkeys(d)]
    entries = [_formatter(val, float_format) for val in itervalues(d)]
    table = np.array(lzip(labels, entries))

    # Pad with empty (key, value) rows so the rows divide evenly by ncols.
    leftover = table.shape[0] % ncols
    if leftover != 0:
        filler = np.array((ncols - leftover) * [['', '']])
        table = np.vstack([table, filler])

    # Cut into ncols equal row-blocks and lay them out side by side.
    chunks = np.split(table, ncols)
    merged = chunks[0]
    for chunk in chunks[1:]:
        merged = np.hstack([merged, chunk])
    self.add_array(merged, align=align)
def add_dict(self, d, ncols=2, align='l', float_format="%.4f"):
    '''Add the contents of a Dict to summary table

    Parameters
    ----------
    d : dict
        Keys and values are automatically coerced to strings with str().
        Users are encouraged to format them before using add_dict.
    ncols: int
        Number of columns of the output table
    align : str
        Data alignment (l/c/r)
    float_format : str
        Format handed to ``_formatter`` for every key and value.
    '''
    formatted_keys = [_formatter(k, float_format) for k in iterkeys(d)]
    formatted_vals = [_formatter(v, float_format) for v in itervalues(d)]
    cells = np.array(lzip(formatted_keys, formatted_vals))

    # Top up with blank pairs until the row count is a multiple of ncols.
    remainder = cells.shape[0] % ncols
    if remainder != 0:
        blanks = np.array((ncols - remainder) * [['', '']])
        cells = np.vstack([cells, blanks])

    # Equal row-blocks are joined horizontally, left to right.
    blocks = np.split(cells, ncols)
    wide = blocks[0]
    for block in blocks[1:]:
        wide = np.hstack([wide, block])
    self.add_array(wide, align=align)
def test_recursive_split():
    """Check that ``_hierarchical_split`` preserves the key order.

    After each check the expected rectangles are written into the result;
    nothing is compared against them afterwards.
    """
    # one level: ('m',), ('f',)
    keys = list(product('mf'))
    data = OrderedDict((key, 1) for key in keys)
    res = _hierarchical_split(data, gap=0)
    assert_(list(iterkeys(res)) == keys)
    one_level = [
        (('m',), (0.0, 0.0, 0.5, 1.0)),
        (('f',), (0.5, 0.0, 0.5, 1.0)),
    ]
    for key, rect in one_level:
        res[key] = rect

    # two levels: gender x age class
    keys = list(product('mf', 'yao'))
    data = OrderedDict((key, 1) for key in keys)
    res = _hierarchical_split(data, gap=0)
    assert_(list(iterkeys(res)) == keys)
    two_levels = [
        (('m', 'y'), (0.0, 0.0, 0.5, 1 / 3)),
        (('m', 'a'), (0.0, 1 / 3, 0.5, 1 / 3)),
        (('m', 'o'), (0.0, 2 / 3, 0.5, 1 / 3)),
        (('f', 'y'), (0.5, 0.0, 0.5, 1 / 3)),
        (('f', 'a'), (0.5, 1 / 3, 0.5, 1 / 3)),
        (('f', 'o'), (0.5, 2 / 3, 0.5, 1 / 3)),
    ]
    for key, rect in two_levels:
        res[key] = rect
def test_recursive_split():
    """Verify key ordering of ``_hierarchical_split`` for one and two levels.

    The rectangles listed after each assertion are only stored into the
    result mapping; they are not compared with anything.
    """

    def _split_and_check(*levels):
        # Unit weight for every key combination, in product order.
        keys = list(product(*levels))
        data = OrderedDict(zip(keys, [1] * len(keys)))
        res = _hierarchical_split(data, gap=0)
        assert_(list(iterkeys(res)) == keys)
        return res

    res = _split_and_check('mf')
    res[('m', )] = (0.0, 0.0, 0.5, 1.0)
    res[('f', )] = (0.5, 0.0, 0.5, 1.0)

    res = _split_and_check('mf', 'yao')
    for gender, left in (('m', 0.0), ('f', 0.5)):
        res[(gender, 'y')] = (left, 0.0, 0.5, 1 / 3)
        res[(gender, 'a')] = (left, 1 / 3, 0.5, 1 / 3)
        res[(gender, 'o')] = (left, 2 / 3, 0.5, 1 / 3)
def test__key_splitting():
    """Exercise ``_key_splitting`` on three small rectangle layouts."""
    # subdivide starting with an empty tuple
    root = tuple()
    res = _key_splitting({root: (0, 0, 1, 1)}, ['a', 'b'], [1, 1], root,
                         True, 0)
    assert_(list(iterkeys(res)) == [('a',), ('b',)])
    for key, rect in ((('a',), (0, 0, 0.5, 1)),
                      (('b',), (0.5, 0, 0.5, 1))):
        eq(res[key], rect)

    # subdivide a in two sublevel
    res_bis = _key_splitting(res, ['c', 'd'], [1, 1], ('a',), False, 0)
    assert_(list(iterkeys(res_bis)) == [('a', 'c'), ('a', 'd'), ('b',)])
    for key, rect in ((('a', 'c'), (0.0, 0.0, 0.5, 0.5)),
                      (('a', 'd'), (0.0, 0.5, 0.5, 0.5)),
                      (('b',), (0.5, 0, 0.5, 1))):
        eq(res_bis[key], rect)

    # starting with a non empty tuple and uneven distribution
    parent = ('total',)
    res = _key_splitting({parent: (0, 0, 1, 1)}, ['a', 'b'], [1, 2], parent,
                         True, 0)
    assert_(list(iterkeys(res)) == [parent + (e,) for e in ['a', 'b']])
    for key, rect in ((('total', 'a'), (0, 0, 1 / 3, 1)),
                      (('total', 'b'), (1 / 3, 0, 2 / 3, 1))):
        eq(res[key], rect)
def test__key_splitting():
    """Check the keys and rectangles produced by ``_key_splitting``."""

    def _check(result, expected):
        # ``expected`` is an ordered sequence of (key, rectangle) pairs.
        assert_(list(iterkeys(result)) == [key for key, _ in expected])
        for key, rect in expected:
            eq(result[key], rect)

    # subdivide starting with an empty tuple
    base_rect = {tuple(): (0, 0, 1, 1)}
    res = _key_splitting(base_rect, ['a', 'b'], [1, 1], tuple(), True, 0)
    _check(res, [
        (('a', ), (0, 0, 0.5, 1)),
        (('b', ), (0.5, 0, 0.5, 1)),
    ])

    # subdivide a in two sublevel
    res_bis = _key_splitting(res, ['c', 'd'], [1, 1], ('a', ), False, 0)
    _check(res_bis, [
        (('a', 'c'), (0.0, 0.0, 0.5, 0.5)),
        (('a', 'd'), (0.0, 0.5, 0.5, 0.5)),
        (('b', ), (0.5, 0, 0.5, 1)),
    ])

    # starting with a non empty tuple and uneven distribution
    base_rect = {('total', ): (0, 0, 1, 1)}
    res = _key_splitting(base_rect, ['a', 'b'], [1, 2], ('total', ), True, 0)
    _check(res, [
        (('total', 'a'), (0, 0, 1 / 3, 1)),
        (('total', 'b'), (1 / 3, 0, 2 / 3, 1)),
    ])
def verify(self):
    '''load the saved module and verify the data

    This tries several ways of comparing the saved and the attached data,
    but might not work for all possible data structures.

    Returns
    -------
    all_correct : bool
        true if no differences are found, for floating point numbers
        rtol=1e-16, atol=1e-16 is used to determine equality (allclose)
    correctli : list
        list of attribute names that compare as equal
    incorrectli : list
        list of attribute names that did not compare as equal, either
        because they differ or because the comparison does not handle
        the data structure correctly

    Raises
    ------
    NotImplementedError
        If ``self._useinstance`` is false.

    Notes
    -----
    Each name in ``self._what`` is read from ``self`` (the attached data)
    and from the object called ``self.name`` inside the imported module
    (the saved data). The previous version read both sides from the saved
    object, referenced undefined names, and listed every attribute as
    correct regardless of the outcome.
    '''
    import warnings

    module = __import__(self._filename.replace('.py', ''))
    if not self._useinstance:
        raise NotImplementedError('currently only implemented when '
                                  'useinstance is true')
    data = getattr(module, self.name)
    correctli = []
    incorrectli = []

    for d in self._what:
        self_item = getattr(self, d)
        saved_item = getattr(data, d)

        # try simple equality; incomparable structures count as unequal
        try:
            correct = bool(np.all(self_item == saved_item))
        except (ValueError, TypeError):
            correct = False

        # try allclose (not meaningful for object arrays)
        if not correct and np.asarray(self_item).dtype != np.dtype('object'):
            try:
                correct = bool(np.allclose(self_item, saved_item,
                                           rtol=1e-16, atol=1e-16))
            except (ValueError, TypeError):
                correct = False
            if not correct:
                warnings.warn("inexact precision in " + d, RuntimeWarning)

        # try iterating, if object array wrapping a mapping
        if not correct:
            try:
                self_map = self_item.item()
                saved_map = saved_item.item()
                correct = (len(self_map) == len(saved_map) and
                           all(bool(np.all(saved_map[k] == self_map[k]))
                               for k in saved_map))
            except (AttributeError, KeyError, TypeError, ValueError):
                correct = False

        if correct:
            correctli.append(d)
        else:
            incorrectli.append(d)

    return len(incorrectli) == 0, correctli, incorrectli
def handle_formula_data(Y, X, formula, depth=0, missing='drop'):
    """
    Build the design matrices for ``formula`` from ``Y`` (and ``X``).

    Parameters
    ----------
    Y : array-like
        Either endog (the LHS) of a model specification or all of the data.
        Y must define __getitem__ for now.
    X : array-like
        Either exog or None. If all the data for the formula is provided in
        Y then you must explicitly set X to None.
    formula : str or patsy.model_desc
        You can pass a handler by import formula_handler and adding a
        key-value pair where the key is the formula object class and
        the value is a function that returns endog, exog, formula object
    depth : int
        Passed through positionally to ``dmatrices``.
    missing : str
        Passed to ``NAAction`` as ``on_NA``.

    Returns
    -------
    result
        Whatever ``dmatrices`` returns for ``return_type='dataframe'``.
    missing_mask
        ``na_action.missing_mask`` if that attribute exists and has at
        least one truthy entry, otherwise None.
    design_info
        ``result[1].design_info`` when ``result`` has more than one
        element, otherwise None.

    Notes
    -----
    If ``formula`` is an instance of a class registered in
    ``formula_handler``, the registered entry is returned as-is (it is not
    called) and nothing else is done.
    NOTE(review): returning the handler without calling it looks
    unintended given the parameter description - confirm with callers.
    """
    # minimal attempt to handle other formula objects
    if isinstance(formula, tuple(iterkeys(formula_handler))):
        return formula_handler[type(formula)]

    na_action = NAAction(on_NA=missing)

    # The pandas and non-pandas code paths were identical, so only the
    # presence of X decides what is handed to dmatrices.
    data = Y if X is None else (Y, X)
    result = dmatrices(formula, data, depth, return_type='dataframe',
                       NA_action=na_action)

    # if missing == 'raise' there's not missing_mask
    missing_mask = getattr(na_action, 'missing_mask', None)
    if not np.any(missing_mask):
        missing_mask = None

    if len(result) > 1:  # have RHS design
        design_info = result[1].design_info  # detach it from DataFrame
    else:
        design_info = None
    # NOTE: is there ever a case where we'd need LHS design_info?
    return result, missing_mask, design_info
def interactions(terms, order=(1, 2)):
    """
    Output all pairwise interactions of given order of a
    sequence of terms.

    The argument order is a sequence specifying which order
    of interactions should be generated -- the default
    creates main effects and two-way interactions. If order
    is an integer, it is changed to range(1,order+1), so
    order=3 is equivalent to order=[1,2,3], generating
    all one, two and three-way interactions.

    Entries of order smaller than 1 or greater than len(terms) contribute
    no interactions.

    Parameters
    ----------
    terms : sequence
        Objects supporting ``*=`` and ``+=`` with each other.
    order : int or sequence of int
        Interaction order(s) to generate.

    Returns
    -------
    The sum (``+=``) of the products (``*=``) of every selected
    combination of terms.

    Raises
    ------
    ValueError
        If no interaction is selected (for example ``terms`` is empty).

    >>> print(interactions([Term(l) for l in ['a', 'b', 'c']]))
    <formula: a*b + a*c + b*c + a + b + c>
    >>>
    >>> print(interactions([Term(l) for l in ['a', 'b', 'c']], order=list(range(5))))
    <formula: a*b + a*b*c + a*c + b*c + a + b + c>
    >>>
    """
    from itertools import combinations

    n_terms = len(terms)
    if np.asarray(order).shape == ():
        order = range(1, int(order) + 1)

    # Keyed by index tuple so a repeated order does not add duplicates;
    # insertion order follows the order entries, then lexicographic index.
    values = {}
    for o in order:
        if o < 1:
            continue
        # Strictly increasing index tuples == combinations without repeats.
        for combo in combinations(range(n_terms), o):
            v = terms[combo[0]]
            for j in combo[1:]:
                v *= terms[j]
            values[combo] = v

    if not values:
        raise ValueError("no interactions to generate for the given "
                         "terms and order")

    remaining = iter(values.values())
    value = next(remaining)
    for v in remaining:
        value += v
    return value
def interactions(terms, order=(1, 2)):
    """
    Output all pairwise interactions of given order of a
    sequence of terms.

    The argument order is a sequence specifying which order
    of interactions should be generated -- the default
    creates main effects and two-way interactions. If order
    is an integer, it is changed to range(1,order+1), so
    order=3 is equivalent to order=[1,2,3], generating
    all one, two and three-way interactions.

    Entries of order smaller than 1 or greater than len(terms) contribute
    no interactions.

    Parameters
    ----------
    terms : sequence
        Objects supporting ``*=`` and ``+=`` with each other.
    order : int or sequence of int
        Interaction order(s) to generate.

    Returns
    -------
    The sum (``+=``) of the products (``*=``) of every selected
    combination of terms.

    Raises
    ------
    ValueError
        If no interaction is selected (for example ``terms`` is empty).

    >>> print(interactions([Term(l) for l in ['a', 'b', 'c']]))
    <formula: a*b + a*c + b*c + a + b + c>
    >>>
    >>> print(interactions([Term(l) for l in ['a', 'b', 'c']], order=list(range(5))))
    <formula: a*b + a*b*c + a*c + b*c + a + b + c>
    >>>
    """
    from itertools import combinations

    if np.asarray(order).shape == ():
        order = range(1, int(order) + 1)

    positions = range(len(terms))
    # Index tuple -> product; the dict removes duplicates that a repeated
    # order entry would otherwise create and keeps first-seen ordering.
    products = {}
    for size in order:
        if size < 1:
            continue
        # combinations() yields exactly the strictly increasing index
        # tuples the full index grid used to be filtered down to.
        for picked in combinations(positions, size):
            prod = terms[picked[0]]
            for idx in picked[1:]:
                prod *= terms[idx]
            products[picked] = prod

    if not products:
        raise ValueError("no interactions to generate for the given "
                         "terms and order")

    pending = iter(products.values())
    value = next(pending)
    for prod in pending:
        value += prod
    return value
def __init__(self, sys, indep_endog=None, instruments=None):
    """Set up the system of equations and whiten the instrumented columns.

    Parameters
    ----------
    sys : list
        Alternating endogenous and exogenous variables, i.e.
        ``[endog1, exog1, endog2, exog2, ...]``; must have even length.
    indep_endog : dict or None
        Maps an equation index to an iterable of column indices into that
        equation's exog. ``None`` is treated as an empty dict.
    instruments : object
        Stored unchanged on ``self.instruments``.

    Raises
    ------
    ValueError
        If ``sys`` has odd length.
    TypeError
        If a value of ``indep_endog`` is not iterable.

    Notes
    -----
    Sets ``endog``, ``exog``, ``_M`` (number of equations), ``_K`` (matrix
    rank of each exog), ``instruments``, ``_indep_endog`` and ``wexog``
    (the result of ``self.whiten`` on the selected columns).
    """
    if len(sys) % 2 != 0:
        raise ValueError("sys must be a list of pairs of endogenous and "
                         "exogenous variables. Got length %s" % len(sys))
    # The lists are probably a bad idea
    self.endog = sys[::2]  # these are just list containers
    self.exog = sys[1::2]
    self._M = len(self.exog)
    self._K = [np.linalg.matrix_rank(exog) for exog in self.exog]
    self.instruments = instruments

    if indep_endog is None:
        indep_endog = {}

    # Validate before use so the caller gets the descriptive error instead
    # of a bare "object is not iterable" from the loop below.
    for eq_key in indep_endog:
        try:
            iter(indep_endog[eq_key])
        except TypeError:
            raise TypeError("The values of the indep_endog dict must be "
                            "iterable. Got type %s for equation %s" %
                            (type(indep_endog[eq_key]), eq_key))

    # Keep the Y_j's in a container to get IVs
    instr_endog = {}
    for eq_key in indep_endog:
        instr_endog[eq_key] = [self.exog[eq_key][:, varcol]
                               for varcol in indep_endog[eq_key]]

    self._indep_endog = indep_endog
    self.wexog = self.whiten(instr_endog)
def observed_crude_oddsratio(self):
    """The crude odds ratio is obtained by pooling all data corresponding
    to a given pair of cut points (c,c'), then forming the inverse
    variance weighted average of these odds ratios to obtain a single OR.
    Since the covariate effects are ignored, this OR will generally be
    greater than the stratified OR.
    """
    cut_pairs = self.cpp
    clusters = self.model.endog_li

    # One 2x2 contingency table per (c,c') key of the first cluster.
    tables = {}
    for pair in iterkeys(cut_pairs[0]):
        tables[pair] = np.zeros((2, 2), dtype=np.float64)

    for idx, yvec in enumerate(clusters):
        if len(yvec) == 0:
            continue

        # Joint indicator matrices for the current cluster, tagged with
        # the table cell each one is accumulated into.
        not_y = 1 - yvec
        joint = (
            ((1, 1), np.outer(yvec, yvec)),
            ((1, 0), np.outer(yvec, not_y)),
            ((0, 1), np.outer(not_y, yvec)),
            ((0, 0), np.outer(not_y, not_y)),
        )

        cluster_pairs = cut_pairs[idx]
        for pair in iterkeys(cluster_pairs):
            ix = cluster_pairs[pair]
            rows, cols = ix[:, 0], ix[:, 1]
            for cell, indicator in joint:
                tables[pair][cell] += indicator[rows, cols].sum()

    return self.pooled_odds_ratio(list(itervalues(tables)))
def getglm(self):
    """Copy the glm quantities out of the parsed results onto the instance.

    Reads ``self.rsum``, ``self.results`` and ``self.nobs`` and sets
    ``deviance``, ``resid``, ``predict``, ``fittedvalues``, ``weights``,
    ``resid_deviance`` and ``null_deviance``.

    The per-observation entries of ``self.results`` are looked up with the
    string keys ``'1'`` .. ``str(self.nobs)`` and stored as lists in that
    order. ``self.rsum['deviance.resid']`` is converted to a float array
    in positional order when it is a dict with 1-based keys, and stored
    unchanged otherwise.
    """

    def _in_observation_order(name):
        # Keys are the 1-based observation numbers as strings.
        column = self.results[name]
        return [column[str(k)] for k in range(1, 1 + self.nobs)]

    self.deviance = self.rsum['deviance']
    # resid is always a list here, so no dict conversion is needed (the
    # old isinstance(..., dict) branch could never run).
    self.resid = _in_observation_order('residuals')
    self.predict = _in_observation_order('linear.predictors')
    self.fittedvalues = _in_observation_order('fitted.values')
    self.weights = _in_observation_order('weights')

    self.resid_deviance = self.rsum['deviance.resid']
    if isinstance(self.resid_deviance, dict):
        tmp = np.zeros(len(self.resid_deviance))
        for key, value in self.resid_deviance.items():
            tmp[int(key) - 1] = value
        self.resid_deviance = tmp
    self.null_deviance = self.rsum['null.deviance']
def initialize(self, subjects):
    """Group the indices of failing subjects by their failure time.

    Parameters
    ----------
    subjects : sequence
        Objects exposing ``delta`` (truthy when the subject failed) and
        ``time``.

    Sets ``self.failures`` (``time -> list of subject indices``, in input
    order) and ``self.failure_times`` (the sorted distinct failure times).
    """
    print('called initialize')
    self.failures = {}
    for i, s in enumerate(subjects):
        if s.delta:
            self.failures.setdefault(s.time, []).append(i)

    self.failure_times = sorted(self.failures)
def initialize(self, subjects):
    """Index the failing subjects by failure time.

    Parameters
    ----------
    subjects : sequence
        Objects exposing ``delta`` (truthy when the subject failed) and
        ``time``.

    After the call ``self.failures`` maps each failure time to the list of
    positions (within ``subjects``) of the subjects failing at that time,
    and ``self.failure_times`` is the sorted list of those times.
    """
    print('called initialize')
    failures = {}
    for position, subject in enumerate(subjects):
        if not subject.delta:
            continue
        failures.setdefault(subject.time, []).append(position)

    self.failures = failures
    self.failure_times = sorted(failures)
def getglm(self):
    """Populate the glm attributes from ``self.rsum`` and ``self.results``.

    Sets ``deviance``, ``resid``, ``predict``, ``fittedvalues``,
    ``weights``, ``resid_deviance`` and ``null_deviance``.

    Per-observation values are read from ``self.results`` under the string
    keys ``'1'`` .. ``str(self.nobs)`` and kept as lists in that order.
    When ``self.rsum['deviance.resid']`` is a dict, its 1-based keys are
    used to place the values into a float array; any other value is stored
    unchanged.
    """
    obs_keys = [str(k) for k in range(1, 1 + self.nobs)]

    self.deviance = self.rsum['deviance']
    # These are built as lists, so the former dict-conversion branch for
    # resid was unreachable and has been removed.
    self.resid = [self.results['residuals'][k] for k in obs_keys]
    self.predict = [self.results['linear.predictors'][k] for k in obs_keys]
    self.fittedvalues = [self.results['fitted.values'][k] for k in obs_keys]
    self.weights = [self.results['weights'][k] for k in obs_keys]

    deviance_resid = self.rsum['deviance.resid']
    self.resid_deviance = deviance_resid
    if isinstance(deviance_resid, dict):
        ordered = np.zeros(len(deviance_resid))
        for key, value in deviance_resid.items():
            ordered[int(key) - 1] = value
        self.resid_deviance = ordered
    self.null_deviance = self.rsum['null.deviance']
def observed_crude_oddsratio(self):
    """The crude odds ratio is obtained by pooling all data corresponding
    to a given pair of cut points (c,c'), then forming the inverse
    variance weighted average of these odds ratios to obtain a single OR.
    Since the covariate effects are ignored, this OR will generally be
    greater than the stratified OR.
    """
    cpp = self.cpp
    endog = self.model.endog_li

    # Storage for the contingency tables for each (c,c')
    tables = dict((key, np.zeros((2, 2), dtype=np.float64))
                  for key in iterkeys(cpp[0]))

    # Get the observed crude OR
    for cluster, yvec in enumerate(endog):
        if len(yvec) == 0:
            continue

        # Complement of the observed values, used for the "0" outcomes.
        comp = 1 - yvec
        outcome = {1: yvec, 0: comp}

        pairs = cpp[cluster]
        for key in iterkeys(pairs):
            ix = pairs[key]
            first, second = ix[:, 0], ix[:, 1]
            table = tables[key]
            # Same accumulation order as before: 11, 10, 01, 00.
            for a, b in ((1, 1), (1, 0), (0, 1), (0, 0)):
                joint = np.outer(outcome[a], outcome[b])
                table[a, b] += joint[first, second].sum()

    return self.pooled_odds_ratio(list(itervalues(tables)))
def verify(self):
    '''load the saved module and verify the data

    This tries several ways of comparing the saved and the attached data,
    but might not work for all possible data structures.

    Returns
    -------
    all_correct : bool
        true if no differences are found, for floating point numbers
        rtol=1e-16, atol=1e-16 is used to determine equality (allclose)
    correctli : list
        list of attribute names that compare as equal
    incorrectli : list
        list of attribute names that did not compare as equal, either
        because they differ or because the comparison does not handle
        the data structure correctly

    Raises
    ------
    NotImplementedError
        If ``self._useinstance`` is false.

    Notes
    -----
    The attached value of each name in ``self._what`` is taken from
    ``self``; the saved value is taken from the attribute ``self.name`` of
    the module imported from ``self._filename``. Earlier code compared the
    saved object with itself, used names that were never defined, and put
    every attribute into ``correctli`` unconditionally.
    '''
    import warnings

    def _equal(attached, saved):
        # Three attempts, from strict to lenient; any structure the
        # comparison cannot handle is reported as not equal.
        try:
            if bool(np.all(attached == saved)):
                return True
        except (ValueError, TypeError):
            pass

        if np.asarray(attached).dtype != np.dtype('object'):
            try:
                close = bool(np.allclose(attached, saved,
                                         rtol=1e-16, atol=1e-16))
            except (ValueError, TypeError):
                close = False
            if close:
                return True
            warnings.warn("inexact precision in " + d, RuntimeWarning)

        # object array wrapping a mapping: compare entry by entry
        try:
            attached_map = attached.item()
            saved_map = saved.item()
            return (len(attached_map) == len(saved_map) and
                    all(bool(np.all(saved_map[k] == attached_map[k]))
                        for k in saved_map))
        except (AttributeError, KeyError, TypeError, ValueError):
            return False

    module = __import__(self._filename.replace('.py', ''))
    if not self._useinstance:
        raise NotImplementedError('currently only implemented when '
                                  'useinstance is true')
    data = getattr(module, self.name)
    correctli = []
    incorrectli = []

    for d in self._what:
        if _equal(getattr(self, d), getattr(data, d)):
            correctli.append(d)
        else:
            incorrectli.append(d)

    return len(incorrectli) == 0, correctli, incorrectli
def test_all_to_tbl(self):
    """Compare ``qsturng`` with every tabulated value from ``make_tbls``.

    The relative error must not exceed 3% for any (p, r, v) entry.
    """
    from statsmodels.stats.libqsturng.make_tbls import T,R

    # One row per table entry: (p, r, v, tabulated q).
    grid = [(p, r, v, T[p][v][R[r]])
            for p in T
            for v in T[p]
            for r in iterkeys(R)]
    ps = [row[0] for row in grid]
    rs = [row[1] for row in grid]
    vs = [row[2] for row in grid]
    qs = np.array([row[3] for row in grid])

    rel_err = np.abs(qs - qsturng(ps, rs, vs)) / qs
    too_large = np.where(rel_err > .03)[0]
    assert_equal(np.array([]), too_large)
def __init__(self, sys, indep_endog=None, instruments=None):
    """Store the equation system and whiten the instrumented columns.

    Parameters
    ----------
    sys : list
        Alternating endogenous and exogenous variables, i.e.
        ``[endog1, exog1, endog2, exog2, ...]``; must have even length.
    indep_endog : dict or None
        Maps an equation index to an iterable of column indices into that
        equation's exog. ``None`` is treated as an empty dict.
    instruments : object
        Stored unchanged on ``self.instruments``.

    Raises
    ------
    ValueError
        If ``sys`` has odd length.
    TypeError
        If a value of ``indep_endog`` is not iterable.

    Notes
    -----
    Sets ``endog``, ``exog``, ``_M`` (number of equations), ``_K`` (matrix
    rank of each exog), ``instruments``, ``_indep_endog`` and ``wexog``
    (the result of ``self.whiten`` on the selected columns).
    """
    if len(sys) % 2 != 0:
        raise ValueError("sys must be a list of pairs of endogenous and "
                         "exogenous variables. Got length %s" % len(sys))
    # The lists are probably a bad idea
    endog_list = sys[::2]  # these are just list containers
    exog_list = sys[1::2]
    self.endog = endog_list
    self.exog = exog_list
    self._M = len(exog_list)
    self._K = [np.linalg.matrix_rank(x) for x in exog_list]
    self.instruments = instruments

    selection = {} if indep_endog is None else indep_endog

    # Keep the Y_j's in a container to get IVs. The iterability check
    # comes first so a bad value produces the descriptive error below
    # rather than failing inside the column extraction.
    instr_endog = {}
    for eq_key in selection:
        columns = selection[eq_key]
        try:
            iter(columns)
        except TypeError:
            raise TypeError("The values of the indep_endog dict must be "
                            "iterable. Got type %s for equation %s" %
                            (type(columns), eq_key))
        instr_endog[eq_key] = [exog_list[eq_key][:, varcol]
                               for varcol in columns]

    self._indep_endog = selection
    self.wexog = self.whiten(instr_endog)
def t_est_all_to_tbl(self):
    """Compare ``qsturng`` with every tabulated value from ``make_tbls``.

    The relative error must not exceed 3% for any (p, r, v) entry.
    """
    from statsmodels.stats.libqsturng.make_tbls import T,R

    # Enumerate the (p, v, r) triples once, then derive each argument list.
    triples = [(p, v, r) for p in T for v in T[p] for r in iterkeys(R)]
    ps = [p for p, _, _ in triples]
    vs = [v for _, v, _ in triples]
    rs = [r for _, _, r in triples]
    qs = np.array([T[p][v][R[r]] for p, v, r in triples])

    errors = np.abs(qs - qsturng(ps, rs, vs)) / qs
    offenders = np.where(errors > .03)[0]
    assert_equal(np.array([]), offenders)
def test_mosaic_very_complex():
    """Draw a grid of mosaic plots, one for each pair of variables.

    Diagonal cells only show the variable name; every off-diagonal cell
    shows the mosaic of the data reduced to the two variables of that
    cell. Could be easily converted into a new function that does this
    automatically based on the type of data.
    """
    key_name = ['gender', 'age', 'health', 'work']
    key_base = (['male', 'female'], ['old', 'young'],
                ['healty', 'ill'], ['work', 'unemployed'])
    keys = list(product(*key_base))
    data = OrderedDict(zip(keys, range(1, 1 + len(keys))))
    props = {
        ('male', 'old'): {'color': 'r'},
        ('female', ): {'color': 'pink'},
    }
    L = len(key_base)
    fig, axes = pylab.subplots(L, L)
    for i in range(L):
        for j in range(L):
            if i == j:
                # Diagonal: label only, no ticks.
                cell = axes[i, i]
                cell.text(0.5, 0.5, key_name[i], ha='center', va='center')
                cell.set_xticks([])
                cell.set_xticklabels([])
                cell.set_yticks([])
                cell.set_yticklabels([])
                continue

            # Indices of the variables that are NOT plotted in this cell.
            others = set(range(L)).difference(set((i, j)))
            lo = min(i, j)
            hi = max(i, j)
            # Move the two plotted variables to the front of every key.
            reordered = OrderedDict(
                [((k[lo], k[hi]) + tuple(k[r] for r in others), v)
                 for k, v in iteritems(data)])

            # Collapse each full key into its two-variable prefix.
            for full_key in list(iterkeys(reordered)):
                pair = full_key[:2]
                reordered[pair] = _reduce_dict(reordered, pair)
                del reordered[full_key]

            mosaic(reordered, ax=axes[i, j], axes_label=False,
                   properties=props, gap=0.05, horizontal=i > j)
    pylab.suptitle('old males should look bright red,  (plot 4 of 4)')
    #pylab.show()
    pylab.close('all')
def _normalize_data(data, index):
    """
    Normalize the data to an ordered dict with tuple keys.

    Right now it works with:

        0 - dictionary (or equivalent mappable)
        1 - pandas.Series with simple or hierarchical indexes
        2 - numpy.ndarrays
        3 - everything that can be converted to a numpy array
        4 - pandas.DataFrame (via the _normalize_dataframe function)

    Every combination of the category levels found in the keys is present
    in the result (missing ones get 0), and each key is reordered according
    to ``index`` (identity order when ``index`` is None).
    """
    # A dataframe takes a completely separate road before coming back
    # here. Duck-typing avoids importing pandas explicitly.
    looks_like_frame = hasattr(data, 'pivot') and hasattr(data, 'groupby')
    if looks_like_frame:
        data = _normalize_dataframe(data, index)
        index = None

    # can it be used as a dictionary?
    try:
        items = list(iteritems(data))
    except AttributeError:
        # Not a mapping: convert it to a numpy array, or die trying, and
        # key each element by its position.
        arr = np.asarray(data)
        data = OrderedDict((tuple(pos), arr[pos])
                           for pos in np.ndindex(arr.shape))
        items = list(iteritems(data))

    # make all the keys a tuple, even if simple numbers
    data = OrderedDict([_tuplify(k), v] for k, v in items)
    categories_levels = _categories_level(list(iterkeys(data)))

    # fill the void in the counting dictionary
    filled = OrderedDict()
    for combo in product(*categories_levels):
        filled[combo] = data.get(combo, 0)

    # Reorder the keys as requested by the user; no validation of the
    # index is done right now.
    if index is None:
        index = lrange(len(categories_levels))
    return OrderedDict((tuple(key[i] for i in index), value)
                       for key, value in iteritems(filled))
def _normalize_data(data, index):
    """
    Normalize the data to an ordered dict with tuple keys.

    Right now it works with:

        0 - dictionary (or equivalent mappable)
        1 - pandas.Series with simple or hierarchical indexes
        2 - numpy.ndarrays
        3 - everything that can be converted to a numpy array
        4 - pandas.DataFrame (via the _normalize_dataframe function)

    Every combination of the category levels found in the keys is present
    in the result (missing ones get 0), and each key is reordered according
    to ``index`` (identity order when ``index`` is None).
    """
    # A dataframe takes a completely separate road before coming back
    # here. Duck-typing avoids importing pandas explicitly.
    looks_like_frame = hasattr(data, 'pivot') and hasattr(data, 'groupby')
    if looks_like_frame:
        data = _normalize_dataframe(data, index)
        index = None

    # can it be used as a dictionary?
    try:
        items = list(iteritems(data))
    except AttributeError:
        # Not a mapping: convert it to a numpy array, or die trying, and
        # key each element by its position.
        arr = np.asarray(data)
        data = OrderedDict((tuple(pos), arr[pos])
                           for pos in np.ndindex(arr.shape))
        items = list(iteritems(data))

    # make all the keys a tuple, even if simple numbers
    data = OrderedDict([_tuplify(k), v] for k, v in items)
    categories_levels = _categories_level(list(iterkeys(data)))

    # fill the void in the counting dictionary
    filled = OrderedDict()
    for combo in product(*categories_levels):
        filled[combo] = data.get(combo, 0)

    # Reorder the keys as requested by the user; no validation of the
    # index is done right now.
    if index is None:
        index = lrange(len(categories_levels))
    return OrderedDict((tuple(key[i] for i in index), value)
                       for key, value in iteritems(filled))
def _recode(x, levels): """ Recode categorial data to int factor. Parameters ---------- x : array-like array like object supporting with numpy array methods of categorially coded data. levels : dict mapping of labels to integer-codings Returns ------- out : instance numpy.ndarray """ from pandas import Series name = None index = None if isinstance(x, Series): name = x.name index = x.index x = x.values if x.dtype.type not in [np.str_, np.object_]: raise ValueError('This is not a categorial factor.' ' Array of str type required.') elif not isinstance(levels, dict): raise ValueError('This is not a valid value for levels.' ' Dict required.') elif not (np.unique(x) == np.unique(list(iterkeys(levels)))).all(): raise ValueError('The levels do not match the array values.') else: out = np.empty(x.shape[0], dtype=np.int) for level, coding in iteritems(levels): out[x == level] = coding if name: out = Series(out, name=name, index=index) return out
def _create_default_properties(data):
    """
    Create the default tile properties of the mosaic given the data.

    The first category level varies the color hue, the second the color
    saturation and the third the color value. If a fourth level is found,
    it selects a hatch decoration for the rectangle. More than four levels
    of categories are not managed.

    Returns a dict mapping each key tuple to
    ``{'color': ..., 'hatch': ..., 'lw': 0}``.
    """
    categories_levels = _categories_level(list(iterkeys(data)))
    n_levels = len(categories_levels)

    def names_at(depth):
        # a missing level behaves like a single unnamed ('') category
        return categories_levels[depth] if n_levels > depth else ['']

    hue_names = categories_levels[0]
    sat_names = names_at(1)
    val_names = names_at(2)
    hatch_names = names_at(3)

    # pair every level name with the numeric/hatch setting it gets
    hue = lzip(list(np.linspace(0.0, 1.0, len(hue_names) + 2)[:-2]),
               hue_names)
    saturation = lzip(list(np.linspace(0.5, 1.0, len(sat_names) + 1)[:-1]),
                      sat_names)
    value = lzip(list(np.linspace(0.5, 1.0, len(val_names) + 1)[:-1]),
                 val_names)
    hatch = lzip(list(['', '/', '-', '|', '+'][:len(hatch_names) + 1]),
                 hatch_names)

    # create the properties dictionary
    properties = {}
    combos = product(hue, saturation, value, hatch)
    for (hv, hn), (sv, sn), (vv, vn), (tv, tn) in combos:
        # unnamed placeholder levels do not take part in the key
        level = (hn,) + tuple(part for part in (sn, vn, tn) if part)
        hsv = array([hv, sv, vv])
        properties[level] = {'color': _single_hsv_to_rgb(hsv),
                             'hatch': tv, 'lw': 0}
    return properties
def _create_default_properties(data):
    """
    Create the default tile properties of the mosaic given the data.

    The first category level varies the color hue, the second the color
    saturation and the third the color value. If a fourth level is found,
    it selects a hatch decoration for the rectangle. More than four levels
    of categories are not managed.

    Returns a dict mapping each key tuple to
    ``{'color': ..., 'hatch': ..., 'lw': 0}``.
    """
    categories_levels = _categories_level(list(iterkeys(data)))
    n_levels = len(categories_levels)

    def names_at(depth):
        # a missing level behaves like a single unnamed ('') category
        return categories_levels[depth] if n_levels > depth else ['']

    hue_names = categories_levels[0]
    sat_names = names_at(1)
    val_names = names_at(2)
    hatch_names = names_at(3)

    # pair every level name with the numeric/hatch setting it gets
    hue = lzip(list(np.linspace(0.0, 1.0, len(hue_names) + 2)[:-2]),
               hue_names)
    saturation = lzip(list(np.linspace(0.5, 1.0, len(sat_names) + 1)[:-1]),
                      sat_names)
    value = lzip(list(np.linspace(0.5, 1.0, len(val_names) + 1)[:-1]),
                 val_names)
    hatch = lzip(list(['', '/', '-', '|', '+'][:len(hatch_names) + 1]),
                 hatch_names)

    # create the properties dictionary
    properties = {}
    combos = product(hue, saturation, value, hatch)
    for (hv, hn), (sv, sn), (vv, vn), (tv, tn) in combos:
        # unnamed placeholder levels do not take part in the key
        level = (hn,) + tuple(part for part in (sn, vn, tn) if part)
        hsv = array([hv, sv, vv])
        properties[level] = {'color': _single_hsv_to_rgb(hsv),
                             'hatch': tv, 'lw': 0}
    return properties
def _recode(x, levels): """ Recode categorial data to int factor. Parameters ---------- x : array-like array like object supporting with numpy array methods of categorially coded data. levels : dict mapping of labels to integer-codings Returns ------- out : instance numpy.ndarray """ from pandas import Series name = None index = None if isinstance(x, Series): name = x.name index = x.index x = x.values if x.dtype.type not in [np.str_, np.object_]: raise ValueError('This is not a categorial factor.' ' Array of str type required.') elif not isinstance(levels, dict): raise ValueError('This is not a valid value for levels.' ' Dict required.') elif not (np.unique(x) == np.unique(list(iterkeys(levels)))).all(): raise ValueError('The levels do not match the array values.') else: out = np.empty(x.shape[0], dtype=np.int) for level, coding in iteritems(levels): out[x == level] = coding if name: out = Series(out, name=name, index=index) return out
def update(self, params):
    """
    Update the global odds ratio based on the current value of params.

    Accumulates one 2x2 table per key of ``self.cpp[0]`` from the cached
    group means, pools them with ``self.pooled_odds_ratio`` and rescales
    ``self.dep_params`` by ``self.crude_or / pooled``. A non-finite result
    is reset to 1 with a ConvergenceWarning. ``params`` itself is not read
    here.
    """
    model = self.model
    endog = model.endog_li
    cpp = self.cpp
    cached_means = model.cached_means

    # This will happen if all the clusters have only
    # one observation
    if len(cpp[0]) == 0:
        return

    tables = {}
    for key in cpp[0]:
        tables[key] = np.zeros((2, 2), dtype=np.float64)

    for i in range(model.num_group):
        endog_expval, _ = cached_means[i]

        emat_11 = self.get_eyy(endog_expval, i)
        emat_10 = endog_expval[:, None] - emat_11
        emat_01 = -emat_11 + endog_expval
        emat_00 = 1. - (emat_11 + emat_10 + emat_01)
        # (row, col) cell of the 2x2 table fed by each matrix
        cells = ((1, 1, emat_11), (1, 0, emat_10),
                 (0, 1, emat_01), (0, 0, emat_00))

        group_pairs = cpp[i]
        for ky in iterkeys(group_pairs):
            ix = group_pairs[ky]
            first, second = ix[:, 0], ix[:, 1]
            table = tables[ky]
            for row, col, emat in cells:
                table[row, col] += emat[first, second].sum()

    cor_expval = self.pooled_odds_ratio(list(itervalues(tables)))

    self.dep_params *= self.crude_or / cor_expval
    if np.isfinite(self.dep_params):
        return
    self.dep_params = 1.
    warnings.warn("dep_params became inf, resetting to 1",
                  ConvergenceWarning)
def update(self, params):
    """
    Update the global odds ratio based on the current value of params.

    Accumulates one 2x2 table per key of ``self.cpp[0]`` from the cached
    group means, pools them with ``self.pooled_odds_ratio`` and rescales
    ``self.dep_params`` by ``self.crude_or / pooled``. A non-finite result
    is reset to 1 with a ConvergenceWarning. ``params`` itself is not read
    here.
    """
    model = self.model
    endog = model.endog_li
    cpp = self.cpp
    cached_means = model.cached_means

    # This will happen if all the clusters have only
    # one observation
    if len(cpp[0]) == 0:
        return

    tables = {}
    for key in cpp[0]:
        tables[key] = np.zeros((2, 2), dtype=np.float64)

    for i in range(model.num_group):
        endog_expval, _ = cached_means[i]

        emat_11 = self.get_eyy(endog_expval, i)
        emat_10 = endog_expval[:, None] - emat_11
        emat_01 = -emat_11 + endog_expval
        emat_00 = 1. - (emat_11 + emat_10 + emat_01)
        # (row, col) cell of the 2x2 table fed by each matrix
        cells = ((1, 1, emat_11), (1, 0, emat_10),
                 (0, 1, emat_01), (0, 0, emat_00))

        group_pairs = cpp[i]
        for ky in iterkeys(group_pairs):
            ix = group_pairs[ky]
            first, second = ix[:, 0], ix[:, 1]
            table = tables[ky]
            for row, col, emat in cells:
                table[row, col] += emat[first, second].sum()

    cor_expval = self.pooled_odds_ratio(list(itervalues(tables)))

    self.dep_params *= self.crude_or / cor_expval
    if np.isfinite(self.dep_params):
        return
    self.dep_params = 1.
    warnings.warn("dep_params became inf, resetting to 1",
                  ConvergenceWarning)
def _statistical_coloring(data):
    """
    Evaluate tile colors from the independence properties of the table.

    Each tile is colored by how many binomial standard deviations its
    count lies from the count expected under independence of the category
    levels: red for excess, blue for deficit, with a hatch beyond two
    deviations. It will run into problems if one category has all zeros.
    """
    data = _normalize_data(data, None)
    categories_levels = _categories_level(list(iterkeys(data)))
    total = 1.0 * sum(v for v in itervalues(data))

    # Proportion of observations carrying each level name, per position.
    levels_count = []
    for level_idx, level_names in enumerate(categories_levels):
        proportion = {}
        for level in level_names:
            matching = (value for key, value in iteritems(data)
                        if level == key[level_idx])
            proportion[level] = sum(matching, 0.0) / total
        levels_count.append(proportion)

    # Expected count and its binomial standard deviation for every key,
    # under the hypothesis of independence.
    expected = {}
    for key, value in iteritems(data):
        base = 1.0
        for position, name in enumerate(key):
            base *= levels_count[position][name]
        mean = base * total
        std = np.sqrt(total * base * (1.0 - base))
        expected[key] = mean, std

    # Distance from the expected value in standard deviations, per tile.
    sigmas = dict((k, (data[k] - m) / s) for k, (m, s) in iteritems(expected))

    props = {}
    for key, dev in iteritems(sigmas):
        if dev < 0:
            red = 0.0
        else:
            red = dev / (1 + dev)
        if dev > 0:
            blue = 0.0
        else:
            blue = dev / (-1 + dev)
        green = (1.0 - red - blue) / 2.0
        if dev > 2:
            hatch = 'x'
        elif dev < -2:
            hatch = 'o'
        else:
            hatch = ''
        props[key] = {'color': [red, green, blue], 'hatch': hatch}
    return props
def _statistical_coloring(data):
    """
    Evaluate tile colors from the independence properties of the table.

    Each tile is colored by how many binomial standard deviations its
    count lies from the count expected under independence of the category
    levels: red for excess, blue for deficit, with a hatch beyond two
    deviations. It will run into problems if one category has all zeros.
    """
    data = _normalize_data(data, None)
    categories_levels = _categories_level(list(iterkeys(data)))
    total = 1.0 * sum(v for v in itervalues(data))

    # Proportion of observations carrying each level name, per position.
    levels_count = []
    for level_idx, level_names in enumerate(categories_levels):
        proportion = {}
        for level in level_names:
            matching = (value for key, value in iteritems(data)
                        if level == key[level_idx])
            proportion[level] = sum(matching, 0.0) / total
        levels_count.append(proportion)

    # Expected count and its binomial standard deviation for every key,
    # under the hypothesis of independence.
    expected = {}
    for key, value in iteritems(data):
        base = 1.0
        for position, name in enumerate(key):
            base *= levels_count[position][name]
        mean = base * total
        std = np.sqrt(total * base * (1.0 - base))
        expected[key] = mean, std

    # Distance from the expected value in standard deviations, per tile.
    sigmas = dict((k, (data[k] - m) / s) for k, (m, s) in iteritems(expected))

    props = {}
    for key, dev in iteritems(sigmas):
        if dev < 0:
            red = 0.0
        else:
            red = dev / (1 + dev)
        if dev > 0:
            blue = 0.0
        else:
            blue = dev / (-1 + dev)
        green = (1.0 - red - blue) / 2.0
        if dev > 2:
            hatch = 'x'
        elif dev < -2:
            hatch = 'o'
        else:
            hatch = ''
        props[key] = {'color': [red, green, blue], 'hatch': hatch}
    return props
def test_mosaic_very_complex():
    """
    Smoke-test a scatter-matrix of mosaic plots.

    Builds one mosaic per ordered pair of the four variables (variable
    names on the diagonal) to show the pairwise correlations. Could be
    turned into a function that does this automatically for any dataset.
    """
    key_name = ['gender', 'age', 'health', 'work']
    key_base = (['male', 'female'], ['old', 'young'],
                ['healty', 'ill'], ['work', 'unemployed'])
    combos = list(product(*key_base))
    data = OrderedDict(zip(combos, range(1, 1 + len(combos))))
    props = {('male', 'old'): {'color': 'r'},
             ('female',): {'color': 'pink'}}
    n_vars = len(key_base)
    fig, axes = pylab.subplots(n_vars, n_vars)
    for i, j in product(range(n_vars), repeat=2):
        if i == j:
            # diagonal cell: only the variable name, no ticks
            cell = axes[i, i]
            cell.text(0.5, 0.5, key_name[i], ha='center', va='center')
            cell.set_xticks([])
            cell.set_xticklabels([])
            cell.set_yticks([])
            cell.set_yticklabels([])
            continue

        others = set(range(n_vars)).difference(set((i, j)))
        hi = max(i, j)
        lo = min(i, j)
        # put the two plotted variables first in every key
        temp_data = OrderedDict([((k[lo], k[hi]) + tuple(k[r] for r in others),
                                  v)
                                 for k, v in iteritems(data)])
        # collapse each full key onto its two-variable prefix
        for full_key in list(iterkeys(temp_data)):
            pair = full_key[:2]
            temp_data[pair] = _reduce_dict(temp_data, pair)
            del temp_data[full_key]
        mosaic(temp_data, ax=axes[i, j], axes_label=False,
               properties=props, gap=0.05, horizontal=i > j)
    pylab.suptitle('old males should look bright red, (plot 4 of 4)')
    #pylab.show()
    pylab.close('all')
def handle_formula_data(Y, X, formula, depth=0):
    """
    Returns endog, exog, and the model specification from arrays and formula

    Parameters
    ----------
    Y : array-like
        Either endog (the LHS) of a model specification or all of the data.
        Y must define __getitem__ for now.
    X : array-like
        Either exog or None. If all the data for the formula is provided in
        Y then you must explicitly set X to None.
    formula : str or patsy.model_desc
        You can pass a handler by importing formula_handler and adding a
        key-value pair where the key is the formula object class and
        the value is a function that returns endog, exog, formula object.
    depth : int
        Accepted for interface compatibility; currently not used (the
        evaluation depth passed to ``dmatrices`` is fixed at 2).

    Returns
    -------
    The result of ``dmatrices(..., return_type='dataframe')``. If
    ``formula`` is an instance of a class registered in
    ``formula_handler``, the registered handler object itself is returned
    instead; it is not called here.
    """
    # half ass attempt to handle other formula objects
    if isinstance(formula, tuple(iterkeys(formula_handler))):
        return formula_handler[type(formula)]

    # The pandas and non-pandas paths issued the very same call, so the
    # only thing that matters is whether X was supplied.
    data = Y if X is None else (Y, X)
    return dmatrices(formula, data, 2, return_type='dataframe')
def _hierarchical_split(count_dict, horizontal=True, gap=0.05):
    """
    Split a square in a hierarchical way given a contingency table.

    Hierarchically split the unit square in alternate directions
    in proportion to the subdivision contained in the contingency table
    count_dict. This is the function that actually performs the tiling
    for the creation of the mosaic plot. If the gap array has been
    specified it will insert a corresponding amount of space (proportional
    to the unit length), while retaining the proportionality of the tiles.

    Parameters
    ----------
    count_dict : dict
        Dictionary containing the contingency table.
        Each category should contain a non-negative number
        with a tuple as index. All combinations of the category levels
        are looked up with ``count_dict[k]``.
        NOTE(review): with a plain dict a missing combination therefore
        raises KeyError rather than counting as 0 - confirm that callers
        normalize the table first.
    horizontal : bool
        The starting direction of the split (by default along
        the horizontal axis)
    gap : float or array of floats
        The list of gaps to be applied on each subdivision.
        If the length of the given array is less than the number
        of subcategories (or if it's a single number) it will be extended
        with exponentially decreasing gaps

    Returns
    -------
    base_rect : dict
        A dictionary containing the result of the split.
        To each key is associated a 4-tuple of coordinates
        that are required to create the corresponding rectangle:

            0 - x position of the lower left corner
            1 - y position of the lower left corner
            2 - width of the rectangle
            3 - height of the rectangle
    """
    # this is the unit square that we are going to divide
    base_rect = OrderedDict([(tuple(), (0, 0, 1, 1))])
    # get the list of each possible value for each level
    categories_levels = _categories_level(list(iterkeys(count_dict)))
    L = len(categories_levels)

    # recreate the gaps vector starting from an int
    if not np.iterable(gap):
        gap = [gap / 1.5**idx for idx in range(L)]
    # extend if it's too short
    if len(gap) < L:
        last = gap[-1]
        # list(gap), not list(*gap): unpacking the gaps as arguments to
        # list() raises TypeError for every non-empty sequence.
        gap = list(gap) + [last / 1.5**idx for idx in range(L)]
    # trim if it's too long
    gap = gap[:L]
    # put the count dictionary in order for the keys
    # this will allow some code simplification
    count_ordered = OrderedDict([(k, count_dict[k])
                                 for k in list(product(*categories_levels))])
    for cat_idx, cat_enum in enumerate(categories_levels):
        # get the partial key up to the actual level
        base_keys = list(product(*categories_levels[:cat_idx]))
        for key in base_keys:
            # for each partial and each value calculate how many
            # observation we have in the counting dictionary
            part_count = [_reduce_dict(count_ordered, key + (partial, ))
                          for partial in cat_enum]
            # reduce the gap for subsequent levels
            new_gap = gap[cat_idx]
            # split the given subkeys in the rectangle dictionary
            base_rect = _key_splitting(base_rect, cat_enum, part_count, key,
                                       horizontal, new_gap)
        horizontal = not horizontal
    return base_rect
def __init__(self, y, design, model_type=r.lm, **kwds):
    """
    Set up and estimate R model with data and design.

    Parameters
    ----------
    y : array-like
        Response passed to R as column ``y`` of the data frame.
    design : array-like
        Design matrix; its columns become ``x.1``, ``x.2``, ... in the R
        formula. No intercept is added by the formula.
    model_type : R function
        R fitting function, called as ``model_type(formula, data=frame,
        **kwds)``.
    **kwds
        Extra keyword arguments forwarded to ``model_type``.
    """
    r.library('MASS')  # still needs to be in test, but also here for
                       # logical tests at the end not to show an error
    self.y = np.array(y)
    self.design = np.array(design)
    self.model_type = model_type
    self._design_cols = ['x.%d' % (i+1)
                         for i in range(self.design.shape[1])]
    # Note the '-1' for no intercept - this is included in the design
    self.formula = r('y ~ %s-1' % '+'.join(self._design_cols))
    self.frame = r.data_frame(y=y, x=self.design)
    rpy.set_default_mode(rpy.NO_CONVERSION)
    results = self.model_type(self.formula,
                              data=self.frame, **kwds)
    self.robj = results  # keep the Robj model so it can be
                         # used in the tests
    rpy.set_default_mode(rpy.BASIC_CONVERSION)
    rsum = r.summary(results)
    self.rsum = rsum
    # Provide compatible interface with scipy models
    self.results = results.as_py()

    residuals = self.results['residuals']
    self.nobs = len(residuals)
    if isinstance(residuals, dict):
        # Size from the dict itself: iterkeys() may hand back an iterator,
        # which has no len().
        self.resid = np.zeros((len(residuals)))
        for i in iterkeys(residuals):
            # keys are 1-based positions
            self.resid[int(i)-1] = residuals[i]
    else:
        self.resid = residuals
    self.fittedvalues = self.results['fitted.values']
    self.df_resid = self.results['df.residual']
    self.params = rsum['coefficients'][:, 0]
    self.bse = rsum['coefficients'][:, 1]
    self.bt = rsum['coefficients'][:, 2]
    try:
        self.pvalues = rsum['coefficients'][:, 3]
    except Exception:
        # Best effort: leave ``pvalues`` unset when the fourth column
        # cannot be read.
        pass
    self.rsquared = rsum.setdefault('r.squared', None)
    self.rsquared_adj = rsum.setdefault('adj.r.squared', None)
    self.aic_R = rsum.setdefault('aic', None)
    self.fvalue = rsum.setdefault('fstatistic', None)
    if self.fvalue and isinstance(self.fvalue, dict):
        self.fvalue = self.fvalue.setdefault('value', None)  # for wls
    df = rsum.setdefault('df', None)
    if df:  # for RLM, works for other models?
        self.df_model = df[0]-1  # R counts intercept
        self.df_resid = df[1]
    self.bcov_unscaled = rsum.setdefault('cov.unscaled', None)
    self.bcov = rsum.setdefault('cov.scaled', None)
    if 'sigma' in rsum:
        self.scale = rsum['sigma']
    elif 'dispersion' in rsum:
        self.scale = rsum['dispersion']
    else:
        self.scale = None
    self.llf = r.logLik(results)

    # model-specific extras
    if model_type == r.glm:
        self.getglm()
    if model_type == r.rlm:
        self.getrlm()
form.termcolumns('C') File "C:\Josef\eclipsegworkspace\statsmodels-josef-experimental\scikits\statsmodels\sandbox\formula.py", line 494, in termcolumns raise ValueError('term not in formula') ValueError: term not in formula ''' print(form.hasterm('C')) print(form.termcolumns(formula.Term('C'))) #doesn't work with string argument #Example: use two columns and get contrast f2 = (form['A'] + form['B']) print(f2) print(repr(f2)) list(iterkeys(f2.namespace)) #namespace is still empty f2.namespace = namespace #associate data iterkeys(f2.namespace) f2.design().shape contrast.Contrast(formula.Term('A'), f2).matrix ''' >>> f2 = (form['A']+form['B']) >>> print f2 <formula: A + B> >>> print repr(f2) <statsmodels.sandbox.formula.Formula object at 0x036BAE70> >>> f2.namespace.keys() #namespace is still empty [] >>> f2.namespace = namespace #associate data >>> f2.namespace.keys() ['A', 'C', 'B', 'E', 'D', 'G', 'F', 'I', 'H', 'J']
return len(incorrectli)==0, correctli, incorrectli if __name__ == '__main__': data = np.load(r"E:\Josef\eclipsegworkspace\statsmodels-josef-experimental-030\dist\statsmodels-0.3.0dev_with_Winhelp_a2\statsmodels-0.3.0dev\scikits\statsmodels\tsa\vector_ar\tests\results\vars_results.npz") res_var = HoldIt('var_results') for d in data: setattr(res_var, d, data[d]) np.set_printoptions(precision=120, linewidth=100) res_var.save(filename='testsave.py', header=True, comment='VAR test data converted from vars_results.npz') import testsave for d in data: print(d) correct = np.all(data[d] == getattr(testsave.var_results, d)) if not correct and not data[d].dtype == np.dtype('object'): correct = np.allclose(data[d], getattr(testsave.var_results, d), rtol=1e-16, atol=1e-16) if not correct: print("inexact precision") if not correct: correlem =[np.all(data[d].item()[k] == getattr(testsave.var_results, d).item()[k]) for k in iterkeys(data[d].item())] if not correlem: print(d, "wrong") print(res_var.verify())
def summary(self, stats='basic', columns='all', orientation='auto'): """ Return a summary of descriptive statistics. Parameters ---------- stats: list or str The desired statistics, Accepts 'basic' or 'all' or a list. 'basic' = ('obs', 'mean', 'std', 'min', 'max') 'all' = ('obs', 'mean', 'std', 'min', 'max', 'ptp', 'var', 'mode', 'meadian', 'skew', 'uss', 'kurtosis', 'percentiles') columns : list or str The columns/variables to report the statistics, default is 'all' If an object with named columns is given, you may specify the column names. For example """ #NOTE # standard array: Specifiy column numbers (NEED TO TEST) # percentiles currently broken # mode requires mode_val and mode_bin separately if self._arraytype is None: self._array_typer() if stats == 'basic': stats = ('obs', 'mean', 'std', 'min', 'max') elif stats == 'all': #stats = self.univariate.keys() #dict doesn't keep an order, use full list instead stats = ['obs', 'mean', 'std', 'min', 'max', 'ptp', 'var', 'mode_val', 'mode_bin', 'median', 'uss', 'skew', 'kurtosis', 'percentiles'] else: for astat in stats: pass #assert astat in self.univariate #hack around percentiles multiple output #bad naming import scipy.stats #BUG: the following has all per the same per=99 ##perdict = dict(('perc_%2d'%per, [lambda x: # scipy.stats.scoreatpercentile(x, per), None, None]) ## for per in (1,5,10,25,50,75,90,95,99)) def _fun(per): return lambda x: scipy.stats.scoreatpercentile(x, per) perdict = dict(('perc_%02d' % per, [_fun(per), None, None]) for per in (1,5,10,25,50,75,90,95,99)) if 'percentiles' in stats: self.univariate.update(perdict) idx = stats.index('percentiles') stats[idx:idx+1] = sorted(iterkeys(perdict)) #JP: this doesn't allow a change in sequence, sequence in stats is #ignored #this is just an if condition if any([aitem[1] for aitem in iteritems(self.univariate) if aitem[0] in stats]): if columns == 'all': self._columns_list = [] if self._arraytype == 'sctruct': self._columns_list = 
self.dataset.dtype.names #self._columns_list = [col for col in # self.dataset.dtype.names if # (self._is_dtype_like(col)=='number')] else: self._columns_list = lrange(self.dataset.shape[1]) else: self._columns_list = columns if self._arraytype == 'sctruct': for col in self._columns_list: assert (col in self.dataset.dtype.names) else: assert self._is_dtype_like(self.dataset) == 'number' columstypes = self.dataset.dtype #TODO: do we need to make sure they dtype is float64 ? for astat in stats: calc = self.univariate[astat] if self._arraytype == 'sctruct': calc[1] = self._columns_list calc[2] = [calc[0](self.dataset[col]) for col in self._columns_list if (self._is_dtype_like(col) == 'number')] #calc[2].append([len(np.unique(self.dataset[col])) for col # in self._columns_list if # self._is_dtype_like(col)=='string'] else: calc[1] = ['Col '+str(col) for col in self._columns_list] calc[2] = [calc[0](self.dataset[:,col]) for col in self._columns_list] return self.print_summary(stats, orientation=orientation) else: return self.print_summary(stats, orientation=orientation)
) res_var = HoldIt('var_results') for d in data: setattr(res_var, d, data[d]) np.set_printoptions(precision=120, linewidth=100) res_var.save(filename='testsave.py', header=True, comment='VAR test data converted from vars_results.npz') import testsave for d in data: print(d) correct = np.all(data[d] == getattr(testsave.var_results, d)) if not correct and not data[d].dtype == np.dtype('object'): correct = np.allclose(data[d], getattr(testsave.var_results, d), rtol=1e-16, atol=1e-16) if not correct: print("inexact precision") if not correct: correlem = [ np.all(data[d].item()[k] == getattr(testsave.var_results, d).item()[k]) for k in iterkeys(data[d].item()) ] if not correlem: print(d, "wrong") print(res_var.verify())
def _create_labels(rects, horizontal, ax, rotation):
    """
    Find the position of the label for each value of each category.

    Right now it supports only up to four categories. The tick positions
    and tick labels are set directly on ``ax`` and on two twin axes
    created from it.

    Parameters
    ----------
    rects : dict
        Maps each key tuple to an ``(x, y, w, h)`` rectangle.
    horizontal : bool
        Direction of the first split; selects which axis side gets which
        level.
    ax : matplotlib axes
        The axis on which the labels should be applied.
    rotation : sequence
        The label rotation for each level, indexed by level position.

    Returns
    -------
    labels : dict
        Currently always empty; nothing in this function populates it.

    Raises
    ------
    ValueError
        If the keys of ``rects`` have more than four levels.
    """
    categories = _categories_level(list(iterkeys(rects)))
    if len(categories) > 4:
        msg = ("maximum of 4 level supported for axes labeling... and 4 "
               "is already a lot of levels, are you sure you need them all?")
        raise ValueError(msg)
    labels = {}
    # keep it fixed as will be used a lot of times
    items = list(iteritems(rects))
    vertical = not horizontal

    # get the axis ticks and labels locator to put the correct values!
    ax2 = ax.twinx()
    ax3 = ax.twiny()
    # this is the order of execution for horizontal disposition
    ticks_pos = [ax.set_xticks, ax.set_yticks, ax3.set_xticks, ax2.set_yticks]
    ticks_lab = [ax.set_xticklabels, ax.set_yticklabels,
                 ax3.set_xticklabels, ax2.set_yticklabels]
    # for the vertical one, rotate it by one
    if vertical:
        ticks_pos = ticks_pos[1:] + ticks_pos[:1]
        ticks_lab = ticks_lab[1:] + ticks_lab[:1]
    # clean them
    for pos, lab in zip(ticks_pos, ticks_lab):
        pos([])
        lab([])
    # for each level, for each value in the level, take the mean of all
    # the sublevel that correspond to that partial key
    for level_idx, level in enumerate(categories):
        # this dictionary keeps the labels only for this level
        level_ticks = dict()

        # To which level should it refer to get the preceding values of
        # labels? It depends on the side. It's a very crude management
        # but I couldn't think of a more general way... The choice does
        # not depend on the value, so it is made once per level.
        if horizontal:
            index_select = [-1, -1, -1] if level_idx == 3 else [+0, -1, -1]
        else:
            index_select = [+0, -1, +0] if level_idx == 3 else [-1, -1, -1]

        for value in level:
            # now I create the base key name and append the current value
            # It will search on all the rects to find the corresponding one
            # and use them to evaluate the mean position
            basekey = tuple(categories[i][index_select[i]]
                            for i in range(level_idx))
            basekey = basekey + (value, )
            subset = dict((k, v) for k, v in items
                          if basekey == k[:level_idx + 1])
            # now I extract the center of all the tiles and make a weighted
            # mean of all these center on the area of the tile
            # this should give me the (more or less) correct position
            # of the center of the category
            vals = list(itervalues(subset))
            W = sum(w * h for (x, y, w, h) in vals)
            x_lab = sum(_get_position(x, w, h, W) for (x, y, w, h) in vals)
            y_lab = sum(_get_position(y, h, w, W) for (x, y, w, h) in vals)
            # now base on the ordering, select which position to keep
            # needs to be written in a more general form of 4 level are enough?
            # should give also the horizontal and vertical alignment
            side = (level_idx + vertical) % 4
            level_ticks[value] = y_lab if side % 2 else x_lab
        # now we add the labels of this level to the correct axis
        ticks_pos[level_idx](list(itervalues(level_ticks)))
        ticks_lab[level_idx](list(iterkeys(level_ticks)),
                             rotation=rotation[level_idx])
    return labels
def _hierarchical_split(count_dict, horizontal=True, gap=0.05):
    """
    Split a square in a hierarchical way given a contingency table.

    Hierarchically split the unit square in alternate directions
    in proportion to the subdivision contained in the contingency table
    count_dict. This is the function that actually performs the tiling
    for the creation of the mosaic plot. If the gap array has been
    specified it will insert a corresponding amount of space (proportional
    to the unit length), while retaining the proportionality of the tiles.

    Parameters
    ----------
    count_dict : dict
        Dictionary containing the contingency table.
        Each category should contain a non-negative number
        with a tuple as index. All combinations of the category levels
        are looked up with ``count_dict[k]``.
        NOTE(review): with a plain dict a missing combination therefore
        raises KeyError rather than counting as 0 - confirm that callers
        normalize the table first.
    horizontal : bool
        The starting direction of the split (by default along
        the horizontal axis)
    gap : float or array of floats
        The list of gaps to be applied on each subdivision.
        If the length of the given array is less than the number
        of subcategories (or if it's a single number) it will be extended
        with exponentially decreasing gaps

    Returns
    -------
    base_rect : dict
        A dictionary containing the result of the split.
        To each key is associated a 4-tuple of coordinates
        that are required to create the corresponding rectangle:

            0 - x position of the lower left corner
            1 - y position of the lower left corner
            2 - width of the rectangle
            3 - height of the rectangle
    """
    # this is the unit square that we are going to divide
    base_rect = OrderedDict([(tuple(), (0, 0, 1, 1))])
    # get the list of each possible value for each level
    categories_levels = _categories_level(list(iterkeys(count_dict)))
    L = len(categories_levels)

    # recreate the gaps vector starting from an int
    if not np.iterable(gap):
        gap = [gap / 1.5 ** idx for idx in range(L)]
    # extend if it's too short
    if len(gap) < L:
        last = gap[-1]
        # list(gap), not list(*gap): unpacking the gaps as arguments to
        # list() raises TypeError for every non-empty sequence.
        gap = list(gap) + [last / 1.5 ** idx for idx in range(L)]
    # trim if it's too long
    gap = gap[:L]
    # put the count dictionary in order for the keys
    # this will allow some code simplification
    count_ordered = OrderedDict([(k, count_dict[k])
                                 for k in list(product(*categories_levels))])
    for cat_idx, cat_enum in enumerate(categories_levels):
        # get the partial key up to the actual level
        base_keys = list(product(*categories_levels[:cat_idx]))
        for key in base_keys:
            # for each partial and each value calculate how many
            # observation we have in the counting dictionary
            part_count = [_reduce_dict(count_ordered, key + (partial,))
                          for partial in cat_enum]
            # reduce the gap for subsequent levels
            new_gap = gap[cat_idx]
            # split the given subkeys in the rectangle dictionary
            base_rect = _key_splitting(base_rect, cat_enum, part_count, key,
                                       horizontal, new_gap)
        horizontal = not horizontal
    return base_rect
form.termcolumns('C') File "C:\Josef\eclipsegworkspace\statsmodels-josef-experimental\scikits\statsmodels\sandbox\formula.py", line 494, in termcolumns raise ValueError('term not in formula') ValueError: term not in formula ''' print(form.hasterm('C')) print(form.termcolumns(formula.Term('C'))) #doesn't work with string argument #Example: use two columns and get contrast f2 = (form['A']+form['B']) print(f2) print(repr(f2)) list(iterkeys(f2.namespace)) #namespace is still empty f2.namespace = namespace #associate data iterkeys(f2.namespace) f2.design().shape contrast.Contrast(formula.Term('A'), f2).matrix ''' >>> f2 = (form['A']+form['B']) >>> print f2 <formula: A + B> >>> print repr(f2) <statsmodels.sandbox.formula.Formula object at 0x036BAE70> >>> f2.namespace.keys() #namespace is still empty [] >>> f2.namespace = namespace #associate data >>> f2.namespace.keys()
from __future__ import print_function
from statsmodels.compat.python import iterkeys
from rpy import r
import numpy as np
import statsmodels.api as sm

# Which of the examples below to run.
examples = [1, 2]

# Example 1: fit the Longley data with R's lm (no R-side intercept; the
# constant column is appended to the design matrix instead).
if 1 in examples:
    data = sm.datasets.longley.load(as_pandas=False)
    y = data.endog
    x = sm.add_constant(data.exog, prepend=False)
    # R names the columns of the matrix ``x`` as x.1, x.2, ...
    des_cols = ['x.%d' % (i + 1) for i in range(x.shape[1])]
    formula = r('y~%s-1' % '+'.join(des_cols))
    frame = r.data_frame(y=y, x=x)
    results = r.lm(formula, data=frame)
    result_names = list(iterkeys(results))
    print(result_names)
    print(results['coefficients'])

# Example 2: fit the Star98 data with R's glm using a binomial family.
if 2 in examples:
    data2 = sm.datasets.star98.load(as_pandas=False)
    y2 = data2.endog
    x2 = sm.add_constant(data2.exog, prepend=False)
    import rpy
    # First endog column divided by the row totals.
    y2 = y2[:, 0] / y2.sum(axis=1)
    des_cols2 = ['x.%d' % (i + 1) for i in range(x2.shape[1])]
    formula2 = r('y~%s-1' % '+'.join(des_cols2))
    frame2 = r.data_frame(y=y2, x=x2)
    results2 = r.glm(formula2, data=frame2, family='binomial')
    # Report the coefficients ordered by their (sorted) names.
    coefficients2 = results2['coefficients']
    params_est = [coefficients2[k] for k in sorted(coefficients2)]
    print(params_est)
def summary(self, stats='basic', columns='all', orientation='auto'): """ Return a summary of descriptive statistics. Parameters ----------- stats: list or str The desired statistics, Accepts 'basic' or 'all' or a list. 'basic' = ('obs', 'mean', 'std', 'min', 'max') 'all' = ('obs', 'mean', 'std', 'min', 'max', 'ptp', 'var', 'mode', 'meadian', 'skew', 'uss', 'kurtosis', 'percentiles') columns : list or str The columns/variables to report the statistics, default is 'all' If an object with named columns is given, you may specify the column names. For example """ #NOTE # standard array: Specifiy column numbers (NEED TO TEST) # percentiles currently broken # mode requires mode_val and mode_bin separately if self._arraytype == None: self._array_typer() if stats == 'basic': stats = ('obs', 'mean', 'std', 'min', 'max') elif stats == 'all': #stats = self.univariate.keys() #dict doesn't keep an order, use full list instead stats = [ 'obs', 'mean', 'std', 'min', 'max', 'ptp', 'var', 'mode_val', 'mode_bin', 'median', 'uss', 'skew', 'kurtosis', 'percentiles' ] else: for astat in stats: pass #assert astat in self.univariate #hack around percentiles multiple output #bad naming import scipy.stats #BUG: the following has all per the same per=99 ##perdict = dict(('perc_%2d'%per, [lambda x: # scipy.stats.scoreatpercentile(x, per), None, None]) ## for per in (1,5,10,25,50,75,90,95,99)) def _fun(per): return lambda x: scipy.stats.scoreatpercentile(x, per) perdict = dict(('perc_%02d' % per, [_fun(per), None, None]) for per in (1, 5, 10, 25, 50, 75, 90, 95, 99)) if 'percentiles' in stats: self.univariate.update(perdict) idx = stats.index('percentiles') stats[idx:idx + 1] = sorted(iterkeys(perdict)) #JP: this doesn't allow a change in sequence, sequence in stats is #ignored #this is just an if condition if any([ aitem[1] for aitem in iteritems(self.univariate) if aitem[0] in stats ]): if columns == 'all': self._columns_list = [] if self._arraytype == 'sctruct': self._columns_list = 
self.dataset.dtype.names #self._columns_list = [col for col in # self.dataset.dtype.names if #(self._is_dtype_like(col)=='number')] else: self._columns_list = lrange(self.dataset.shape[1]) else: self._columns_list = columns if self._arraytype == 'sctruct': for col in self._columns_list: assert (col in self.dataset.dtype.names) else: assert self._is_dtype_like(self.dataset) == 'number' columstypes = self.dataset.dtype #TODO: do we need to make sure they dtype is float64 ? for astat in stats: calc = self.univariate[astat] if self._arraytype == 'sctruct': calc[1] = self._columns_list calc[2] = [ calc[0](self.dataset[col]) for col in self._columns_list if (self._is_dtype_like(col) == 'number') ] #calc[2].append([len(np.unique(self.dataset[col])) for col #in self._columns_list if #self._is_dtype_like(col)=='string'] else: calc[1] = ['Col ' + str(col) for col in self._columns_list] calc[2] = [ calc[0](self.dataset[:, col]) for col in self._columns_list ] return self.print_summary(stats, orientation=orientation) else: return self.print_summary(stats, orientation=orientation)
def _create_labels(rects, horizontal, ax, rotation):
    """Place tick labels for every value of every category level.

    For each level, the tick position of a value is the sum of
    ``_get_position`` over all tiles whose key starts with a reference
    prefix followed by that value. Ticks and labels of the successive
    levels are assigned to the sides of the plot in a fixed order; at most
    four levels are supported.

    rects: mapping from category-key tuples to (x, y, width, height) tiles
    horizontal: orientation flag; its negation shifts the side order by one
    ax: the axis on which the labels are applied (twin axes are created
        from it for the remaining two sides)
    rotation: the rotation list for each side

    Returns the ``labels`` dict, which this function never populates.
    """
    categories = _categories_level(list(iterkeys(rects)))
    if len(categories) > 4:
        msg = ("maximum of 4 level supported for axes labeling..and 4"
               "is alreay a lot of level, are you sure you need them all?")
        raise NotImplementedError(msg)
    labels = {}
    # materialised once because it is scanned for every value of every level
    tiles = list(iteritems(rects))
    vertical = not horizontal

    # twin axes give access to the remaining two sides of the plot
    ax2 = ax.twinx()
    ax3 = ax.twiny()
    # (tick setter, label setter) per side, in the horizontal order
    sides = [
        (ax.set_xticks, ax.set_xticklabels),
        (ax.set_yticks, ax.set_yticklabels),
        (ax3.set_xticks, ax3.set_xticklabels),
        (ax2.set_yticks, ax2.set_yticklabels),
    ]
    ticks_pos = [setter for setter, _ in sides]
    ticks_lab = [labeller for _, labeller in sides]
    # the vertical disposition uses the same sides rotated by one
    if vertical:
        ticks_pos = ticks_pos[1:] + ticks_pos[:1]
        ticks_lab = ticks_lab[1:] + ticks_lab[:1]
    # start from empty ticks and labels on every side
    for set_ticks, set_labels in zip(ticks_pos, ticks_lab):
        set_ticks([])
        set_labels([])

    for level_idx, level in enumerate(categories):
        # Which value of each preceding level forms the reference prefix
        # depends only on the orientation and on the level, so it is
        # chosen once per level.
        if horizontal:
            index_select = [-1, -1, -1] if level_idx == 3 else [+0, -1, -1]
        else:
            index_select = [+0, -1, +0] if level_idx == 3 else [-1, -1, -1]
        # odd sides take the y coordinate, even sides the x coordinate
        side = (level_idx + vertical) % 4
        use_y = side % 2
        key_len = level_idx + 1

        # tick position of every value of this level
        level_ticks = dict()
        for value in level:
            prefix = tuple(categories[i][index_select[i]]
                           for i in range(level_idx))
            basekey = prefix + (value,)
            subset = {}
            for key, tile in tiles:
                if basekey == key[:key_len]:
                    subset[key] = tile
            # combine the matching tiles; W is their total area
            vals = list(itervalues(subset))
            W = sum(w * h for (x, y, w, h) in vals)
            x_lab = sum(_get_position(x, w, h, W) for (x, y, w, h) in vals)
            y_lab = sum(_get_position(y, h, w, W) for (x, y, w, h) in vals)
            if use_y:
                level_ticks[value] = y_lab
            else:
                level_ticks[value] = x_lab

        # push the ticks and labels of this level to its side
        tick_values = list(itervalues(level_ticks))
        tick_names = list(iterkeys(level_ticks))
        ticks_pos[level_idx](tick_values)
        ticks_lab[level_idx](tick_names, rotation=rotation[level_idx])
    return labels