def test_mosaic_very_complex():
    """Draw a scatter-matrix of mosaic plots, one cell per pair of variables.

    Diagonal cells only display the name of the variable; each off-diagonal
    cell holds the mosaic of the two corresponding variables. Could be turned
    into a function that builds this automatically from the type of data.
    """
    names = ["gender", "age", "health", "work"]
    levels = (["male", "female"], ["old", "young"],
              ["healty", "ill"], ["work", "unemployed"])
    combos = list(product(*levels))
    # counts 1, 2, 3, ... assigned to the combinations in product order
    data = OrderedDict((combo, count)
                       for count, combo in enumerate(combos, 1))
    props = {("male", "old"): {"color": "r"},
             ("female",): {"color": "pink"}}
    size = len(levels)
    fig, axes = pylab.subplots(size, size)
    for row, col in product(range(size), repeat=2):
        if row == col:
            # diagonal: just a label, no ticks
            cell = axes[row, row]
            cell.text(0.5, 0.5, names[row], ha="center", va="center")
            cell.set_xticks([])
            cell.set_xticklabels([])
            cell.set_yticks([])
            cell.set_yticklabels([])
            continue
        # positions of the variables that are not plotted in this cell
        others = set(range(size)).difference(set((row, col)))
        low, high = min(row, col), max(row, col)
        # move the two plotted variables to the front of every key
        pair_data = OrderedDict(
            [((key[low], key[high]) + tuple(key[r] for r in others), count)
             for key, count in list(data.items())])
        # collapse every full key onto its leading pair; iterate a snapshot
        # because the dictionary is modified inside the loop
        for full_key in list(pair_data.keys()):
            pair = full_key[:2]
            pair_data[pair] = _reduce_dict(pair_data, pair)
            del pair_data[full_key]
        mosaic(pair_data, ax=axes[row, col], axes_label=False,
               properties=props, gap=0.05, horizontal=row > col)
    pylab.suptitle("old males should look bright red, (plot 4 of 4)")
def test__reduce_dict():
    """Check the partial sums returned by ``_reduce_dict`` on a 2x2x2 table."""
    combos = list(product('mf', 'oy', 'wn'))

    # every cell holds 1
    data = OrderedDict(zip(combos, [1] * 8))
    uniform_cases = ((('m', ), 4), (('m', 'o'), 2), (('m', 'o', 'w'), 1))
    for partial_key, expected in uniform_cases:
        eq(_reduce_dict(data, partial_key), expected)

    # cells hold 0..7 in product order
    data = OrderedDict(zip(combos, range(8)))
    ramp_cases = ((('m', ), 6), (('m', 'o'), 1), (('m', 'o', 'w'), 0))
    for partial_key, expected in ramp_cases:
        eq(_reduce_dict(data, partial_key), expected)
def test_mosaic_simple():
    """Plot synthetic data with four categorical variables.

    The counts grow with the position of the combination, males/females get
    a base colour and every combination containing 'ill' gets its own colour
    and a hatch.
    """
    # levels of each of the four variables
    key_set = (['male', 'female'], ['old', 'adult', 'young'],
               ['worker', 'unemployed'], ['healty', 'ill'])
    # the cartesian product of the levels is the complete set of categories
    keys = list(product(*key_set))
    data = OrderedDict(zip(keys, range(1, 1 + len(keys))))

    # base colours: males in blue, females in red
    props = {('male', ): {'color': 'b'}, ('female', ): {'color': 'r'}}
    # combinations that include 'ill' are drawn with a different colour
    for key in keys:
        if 'ill' not in key:
            continue
        shade = 'BlueViolet' if 'male' in key else 'Crimson'
        props[key] = {'color': shade, 'hatch': '+'}

    # mosaic of the data, with the given gap and colours
    mosaic(data, gap=0.05, properties=props, axes_label=False)
    pylab.suptitle('syntetic data, 4 categories (plot 2 of 4)')
def _normalize_data(data, index):
    """Turn the input into an ordered dict keyed by tuples.

    Accepted inputs, as listed by the original author:
        0 - dictionary (or equivalent mappable)
        1 - pandas.Series with simple or hierarchical indexes
        2 - numpy.ndarrays
        3 - everything that can be converted to a numpy array
        4 - pandas.DataFrame (via the _normalize_dataframe function)

    ``index`` gives the order in which the key positions must appear in the
    result; ``None`` keeps the original order.
    """
    # duck-typed DataFrame detection, so pandas need not be imported here;
    # the dataframe helper already receives the index, so it is cleared
    if hasattr(data, 'pivot') and hasattr(data, 'groupby'):
        data = _normalize_dataframe(data, index)
        index = None

    try:
        pairs = list(data.items())
    except AttributeError:
        # not a mapping: go through a numpy array and use the position of
        # each element as its key
        array = np.asarray(data)
        pairs = [(tuple(position), array[position])
                 for position in np.ndindex(array.shape)]

    # every key becomes a tuple, even when it is a simple number
    tuple_keyed = OrderedDict([_tuplify(key), value] for key, value in pairs)
    categories_levels = _categories_level(list(tuple_keyed.keys()))

    # combinations that are absent from the data get a count of 0
    filled = OrderedDict([(combo, tuple_keyed.get(combo, 0))
                          for combo in product(*categories_levels)])

    # reorder the positions inside each key as requested by the user;
    # no validation of ``index`` is performed
    if index is None:
        index = list(range(len(categories_levels)))
    return OrderedDict((tuple(key[i] for i in index), value)
                       for key, value in list(filled.items()))
def test_recursive_split():
    """Check that ``_hierarchical_split`` keeps the keys in product order.

    The test runs on a one-level and on a two-level table, both with
    uniform counts and no gap.
    """
    keys = list(product('mf'))
    data = OrderedDict(zip(keys, [1] * len(keys)))
    res = _hierarchical_split(data, gap=0)
    # compare as lists: on Python 3 ``dict.keys()`` is a view object that
    # never compares equal to a list
    assert_(list(res.keys()) == keys)
    # NOTE(review): the statements below overwrite the result instead of
    # checking it; presumably they document the expected rectangles
    res[('m', )] = (0.0, 0.0, 0.5, 1.0)
    res[('f', )] = (0.5, 0.0, 0.5, 1.0)

    keys = list(product('mf', 'yao'))
    data = OrderedDict(zip(keys, [1] * len(keys)))
    res = _hierarchical_split(data, gap=0)
    assert_(list(res.keys()) == keys)
    # NOTE(review): same as above, these are assignments, not assertions
    res[('m', 'y')] = (0.0, 0.0, 0.5, 1 / 3)
    res[('m', 'a')] = (0.0, 1 / 3, 0.5, 1 / 3)
    res[('m', 'o')] = (0.0, 2 / 3, 0.5, 1 / 3)
    res[('f', 'y')] = (0.5, 0.0, 0.5, 1 / 3)
    res[('f', 'a')] = (0.5, 1 / 3, 0.5, 1 / 3)
    res[('f', 'o')] = (0.5, 2 / 3, 0.5, 1 / 3)
def _categories_level(keys): """use the Ordered dict to implement a simple ordered set return each level of each category [[key_1_level_1,key_2_level_1],[key_1_level_2,key_2_level_2]] """ res = [] for i in zip(*(keys)): tuplefied = _tuplify(i) res.append(list(OrderedDict([(j, None) for j in tuplefied]))) return res
def summary_model(results):
    """Create a dict with information about the model.

    Parameters
    ----------
    results : object
        Results object to describe. Each entry is computed from the
        attributes it exposes; an entry whose lookup or formatting fails
        is left out of the output.

    Returns
    -------
    OrderedDict
        Maps each label (e.g. ``"Model:"``) to its value, in the order in
        which the entries are declared below.
    """

    def time_now(*args, **kwrds):
        # takes (and ignores) the results object so that it can be called
        # exactly like the other entries of ``info``
        now = datetime.datetime.now()
        return now.strftime("%Y-%m-%d %H:%M")

    info = OrderedDict()
    info["Model:"] = lambda x: x.model.__class__.__name__
    info["Model Family:"] = lambda x: x.family.__class__.__name__
    info["Link Function:"] = lambda x: x.family.link.__class__.__name__
    info["Dependent Variable:"] = lambda x: x.model.endog_names
    # store the callable itself: it is invoked in the loop below
    info["Date:"] = time_now
    info["No. Observations:"] = lambda x: "%#6d" % x.nobs
    info["Df Model:"] = lambda x: "%#6d" % x.df_model
    info["Df Residuals:"] = lambda x: "%#6d" % x.df_resid
    info["Converged:"] = lambda x: x.mle_retvals["converged"]
    info["No. Iterations:"] = lambda x: x.mle_retvals["iterations"]
    info["Method:"] = lambda x: x.method
    info["Norm:"] = lambda x: x.fit_options["norm"]
    info["Scale Est.:"] = lambda x: x.fit_options["scale_est"]
    info["Cov. Type:"] = lambda x: x.fit_options["cov"]
    info["R-squared:"] = lambda x: "%#8.3f" % x.rsquared
    info["Adj. R-squared:"] = lambda x: "%#8.3f" % x.rsquared_adj
    info["Pseudo R-squared:"] = lambda x: "%#8.3f" % x.prsquared
    info["AIC:"] = lambda x: "%8.4f" % x.aic
    info["BIC:"] = lambda x: "%8.4f" % x.bic
    info["Log-Likelihood:"] = lambda x: "%#8.5g" % x.llf
    info["LL-Null:"] = lambda x: "%#8.5g" % x.llnull
    info["LLR p-value:"] = lambda x: "%#8.5g" % x.llr_pvalue
    info["Deviance:"] = lambda x: "%#8.5g" % x.deviance
    info["Pearson chi2:"] = lambda x: "%#6.3g" % x.pearson_chi2
    info["F-statistic:"] = lambda x: "%#8.4g" % x.fvalue
    info["Prob (F-statistic):"] = lambda x: "%#6.3g" % x.f_pvalue
    info["Scale:"] = lambda x: "%#8.5g" % x.scale

    out = OrderedDict()
    for key, func in info.items():
        try:
            out[key] = func(results)
        except Exception:
            # best effort: this results object does not provide the entry
            pass
    return out
def summary_model(results):
    '''Create a dict with information about the model.

    Parameters
    ----------
    results : object
        Results object to describe. Each entry is computed from the
        attributes it exposes; an entry whose lookup or formatting fails
        is left out of the output.

    Returns
    -------
    OrderedDict
        Maps each label (e.g. ``'Model:'``) to its value, in the order in
        which the entries are declared below.
    '''

    def time_now(*args, **kwrds):
        # takes (and ignores) the results object so that it can be called
        # exactly like the other entries of ``info``
        now = datetime.datetime.now()
        return now.strftime('%Y-%m-%d %H:%M')

    info = OrderedDict()
    info['Model:'] = lambda x: x.model.__class__.__name__
    info['Model Family:'] = lambda x: x.family.__class__.__name__
    info['Link Function:'] = lambda x: x.family.link.__class__.__name__
    info['Dependent Variable:'] = lambda x: x.model.endog_names
    # store the callable itself: it is invoked in the loop below
    info['Date:'] = time_now
    info['No. Observations:'] = lambda x: "%#6d" % x.nobs
    info['Df Model:'] = lambda x: "%#6d" % x.df_model
    info['Df Residuals:'] = lambda x: "%#6d" % x.df_resid
    info['Converged:'] = lambda x: x.mle_retvals['converged']
    info['No. Iterations:'] = lambda x: x.mle_retvals['iterations']
    info['Method:'] = lambda x: x.method
    info['Norm:'] = lambda x: x.fit_options['norm']
    info['Scale Est.:'] = lambda x: x.fit_options['scale_est']
    info['Cov. Type:'] = lambda x: x.fit_options['cov']
    info['R-squared:'] = lambda x: "%#8.3f" % x.rsquared
    info['Adj. R-squared:'] = lambda x: "%#8.3f" % x.rsquared_adj
    info['Pseudo R-squared:'] = lambda x: "%#8.3f" % x.prsquared
    info['AIC:'] = lambda x: "%8.4f" % x.aic
    info['BIC:'] = lambda x: "%8.4f" % x.bic
    info['Log-Likelihood:'] = lambda x: "%#8.5g" % x.llf
    info['LL-Null:'] = lambda x: "%#8.5g" % x.llnull
    info['LLR p-value:'] = lambda x: "%#8.5g" % x.llr_pvalue
    info['Deviance:'] = lambda x: "%#8.5g" % x.deviance
    info['Pearson chi2:'] = lambda x: "%#6.3g" % x.pearson_chi2
    info['F-statistic:'] = lambda x: "%#8.4g" % x.fvalue
    info['Prob (F-statistic):'] = lambda x: "%#6.3g" % x.f_pvalue
    info['Scale:'] = lambda x: "%#8.5g" % x.scale

    out = OrderedDict()
    for key, func in info.items():
        try:
            out[key] = func(results)
        except Exception:
            # best effort: this results object does not provide the entry
            pass
    return out
def summary_model(results):
    '''Create a dict with information about the model.

    Parameters
    ----------
    results : object
        Results object to describe. Each entry is computed from the
        attributes it exposes; an entry whose lookup or formatting fails
        is left out of the output.

    Returns
    -------
    OrderedDict
        Maps each label (e.g. ``'Model:'``) to its value, in the order in
        which the entries are declared below.
    '''

    def time_now(*args, **kwrds):
        # takes (and ignores) the results object so that it can be called
        # exactly like the other entries of ``info``
        now = datetime.datetime.now()
        return now.strftime('%Y-%m-%d %H:%M')

    info = OrderedDict()
    info['Model:'] = lambda x: x.model.__class__.__name__
    info['Model Family:'] = lambda x: x.family.__class__.__name__
    info['Link Function:'] = lambda x: x.family.link.__class__.__name__
    info['Dependent Variable:'] = lambda x: x.model.endog_names
    # store the callable itself: it is invoked in the loop below
    info['Date:'] = time_now
    info['No. Observations:'] = lambda x: "%#6d" % x.nobs
    info['Df Model:'] = lambda x: "%#6d" % x.df_model
    info['Df Residuals:'] = lambda x: "%#6d" % x.df_resid
    info['Converged:'] = lambda x: x.mle_retvals['converged']
    info['No. Iterations:'] = lambda x: x.mle_retvals['iterations']
    info['Method:'] = lambda x: x.method
    info['Norm:'] = lambda x: x.fit_options['norm']
    info['Scale Est.:'] = lambda x: x.fit_options['scale_est']
    info['Cov. Type:'] = lambda x: x.fit_options['cov']
    info['R-squared:'] = lambda x: "%#8.3f" % x.rsquared
    info['Adj. R-squared:'] = lambda x: "%#8.3f" % x.rsquared_adj
    info['Pseudo R-squared:'] = lambda x: "%#8.3f" % x.prsquared
    info['AIC:'] = lambda x: "%8.4f" % x.aic
    info['BIC:'] = lambda x: "%8.4f" % x.bic
    info['Log-Likelihood:'] = lambda x: "%#8.5g" % x.llf
    info['LL-Null:'] = lambda x: "%#8.5g" % x.llnull
    info['LLR p-value:'] = lambda x: "%#8.5g" % x.llr_pvalue
    info['Deviance:'] = lambda x: "%#8.5g" % x.deviance
    info['Pearson chi2:'] = lambda x: "%#6.3g" % x.pearson_chi2
    info['F-statistic:'] = lambda x: "%#8.4g" % x.fvalue
    info['Prob (F-statistic):'] = lambda x: "%#6.3g" % x.f_pvalue
    info['Scale:'] = lambda x: "%#8.5g" % x.scale

    out = OrderedDict()
    for key, func in list(info.items()):
        try:
            out[key] = func(results)
        except Exception:
            # best effort: this results object does not provide the entry
            pass
    return out
def test_mosaic_very_complex():
    """Draw a scatter-matrix of mosaic plots, one cell per pair of variables.

    Shows the correlations between each pair of variables in a dataset.
    Could be easily converted into a new function that does this
    automatically based on the type of data.
    """
    key_name = ['gender', 'age', 'health', 'work']
    key_base = (['male', 'female'], ['old', 'young'],
                ['healty', 'ill'], ['work', 'unemployed'])
    keys = list(product(*key_base))
    data = OrderedDict(zip(keys, range(1, 1 + len(keys))))
    props = {}
    props[('male', 'old')] = {'color': 'r'}
    props[('female',)] = {'color': 'pink'}
    L = len(key_base)
    fig, axes = pylab.subplots(L, L)
    for i in range(L):
        for j in range(L):
            # positions of the variables not shown in this cell
            m = set(range(L)).difference(set((i, j)))
            if i == j:
                # diagonal: only the name of the variable, no ticks
                axes[i, i].text(0.5, 0.5, key_name[i],
                                ha='center', va='center')
                axes[i, i].set_xticks([])
                axes[i, i].set_xticklabels([])
                axes[i, i].set_yticks([])
                axes[i, i].set_yticklabels([])
            else:
                ji = max(i, j)
                ij = min(i, j)
                # move the two plotted variables to the front of each key
                temp_data = OrderedDict(
                    [((k[ij], k[ji]) + tuple(k[r] for r in m), v)
                     for k, v in data.items()])
                # snapshot the keys: the dictionary is modified inside the
                # loop, which a live key view does not allow on Python 3
                keys = list(temp_data.keys())
                for k in keys:
                    value = _reduce_dict(temp_data, k[:2])
                    temp_data[k[:2]] = value
                    del temp_data[k]
                mosaic(temp_data, ax=axes[i, j], axes_label=False,
                       properties=props, gap=0.05, horizontal=i > j)
    pylab.suptitle('old males should look bright red, (plot 4 of 4)')
def _normalize_data(data, index):
    """normalize the data to a dict with tuples of strings as keys

    right now it works with:

        0 - dictionary (or equivalent mappable)
        1 - pandas.Series with simple or hierarchical indexes
        2 - numpy.ndarrays
        3 - everything that can be converted to a numpy array
        4 - pandas.DataFrame (via the _normalize_dataframe function)

    ``index`` is the order in which the positions of each key must appear
    in the result; ``None`` keeps the original order.
    """
    # if data is a dataframe we need to take a completely new road
    # before coming back here. Use the hasattr to avoid importing
    # pandas explicitly
    if hasattr(data, 'pivot') and hasattr(data, 'groupby'):
        data = _normalize_dataframe(data, index)
        index = None
    # can it be used as a dictionary?
    try:
        # ``items`` (not ``iteritems``): Python 3 mappings have no
        # ``iteritems``, which would send plain dicts to the array fallback
        items = list(data.items())
    except AttributeError:
        # ok, I cannot use the data as a dictionary
        # Try to convert it to a numpy array, or die trying
        data = np.asarray(data)
        temp = OrderedDict()
        for idx in np.ndindex(data.shape):
            name = tuple(i for i in idx)
            temp[name] = data[idx]
        data = temp
        items = list(data.items())
    # make all the keys a tuple, even if simple numbers
    data = OrderedDict([_tuplify(k), v] for k, v in items)
    categories_levels = _categories_level(list(data.keys()))
    # fill the void in the counting dictionary
    indexes = product(*categories_levels)
    contingency = OrderedDict([(k, data.get(k, 0)) for k in indexes])
    data = contingency
    # reorder the keys order according to the one specified by the user
    # or if the index is None convert it into a simple list
    # right now it doesn't do any check, but can be modified in the future
    index = list(range(len(categories_levels))) if index is None else index
    contingency = OrderedDict()
    for key, value in data.items():
        new_key = tuple(key[i] for i in index)
        contingency[new_key] = value
    data = contingency
    return data
def _key_splitting(rect_dict, keys, values, key_subset, horizontal, gap): """ Given a dictionary where each entry is a rectangle, a list of key and value (count of elements in each category) it split each rect accordingly, as long as the key start with the tuple key_subset. The other keys are returned without modification. """ result = OrderedDict() L = len(key_subset) for name, (x, y, w, h) in rect_dict.items(): if key_subset == name[:L]: # split base on the values given divisions = _split_rect(x, y, w, h, values, horizontal, gap) for key, rect in zip(keys, divisions): result[name + (key, )] = rect else: result[name] = (x, y, w, h) return result
def test_axes_labeling():
    """Draw the same random data horizontally and vertically.

    The two mosaics use a labelizer built from the initials of each level
    and different label rotations, to check the alignment of axes labels.
    """
    from numpy.random import rand

    key_set = (['male', 'female'], ['old', 'adult', 'young'],
               ['worker', 'unemployed'], ['yes', 'no'])
    # the cartesian product of the levels is the complete set of categories
    keys = list(product(*key_set))
    data = OrderedDict(zip(keys, rand(len(keys))))

    def lab(k):
        # label of a tile: first letter of each level in its key
        return ''.join(s[0] for s in k)

    fig, (ax1, ax2) = pylab.subplots(1, 2, figsize=(16, 8))
    layouts = ((ax1, True, 45), (ax2, False, [0, 45, 90, 0]))
    for axis, is_horizontal, rotation in layouts:
        mosaic(data, ax=axis, labelizer=lab, horizontal=is_horizontal,
               label_rotation=rotation)
    fig.suptitle("correct alignment of the axes labels")
def test_mosaic_very_complex():
    """Draw a scatter-matrix of mosaic plots, one cell per pair of variables.

    Shows the correlations between each pair of variables in a dataset.
    Could be easily converted into a new function that does this
    automatically based on the type of data.
    """
    key_name = ['gender', 'age', 'health', 'work']
    key_base = (['male', 'female'], ['old', 'young'],
                ['healty', 'ill'], ['work', 'unemployed'])
    keys = list(product(*key_base))
    data = OrderedDict(zip(keys, range(1, 1 + len(keys))))
    props = {}
    props[('male', 'old')] = {'color': 'r'}
    props[('female', )] = {'color': 'pink'}
    L = len(key_base)
    fig, axes = pylab.subplots(L, L)
    for i in range(L):
        for j in range(L):
            # positions of the variables not shown in this cell
            m = set(range(L)).difference(set((i, j)))
            if i == j:
                # diagonal: only the name of the variable, no ticks
                axes[i, i].text(0.5, 0.5, key_name[i],
                                ha='center', va='center')
                axes[i, i].set_xticks([])
                axes[i, i].set_xticklabels([])
                axes[i, i].set_yticks([])
                axes[i, i].set_yticklabels([])
            else:
                ji = max(i, j)
                ij = min(i, j)
                # move the two plotted variables to the front of each key
                temp_data = OrderedDict(
                    [((k[ij], k[ji]) + tuple(k[r] for r in m), v)
                     for k, v in data.items()])
                # snapshot the keys: the dictionary is modified inside the
                # loop, which a live key view does not allow on Python 3
                keys = list(temp_data.keys())
                for k in keys:
                    value = _reduce_dict(temp_data, k[:2])
                    temp_data[k[:2]] = value
                    del temp_data[k]
                mosaic(temp_data, ax=axes[i, j], axes_label=False,
                       properties=props, gap=0.05, horizontal=i > j)
    pylab.suptitle('old males should look bright red, (plot 4 of 4)')
'ho': 'Hommel', 'fdr_bh': 'FDR Benjamini-Hochberg', 'fdr_by': 'FDR Benjamini-Yekutieli', 'fdr_tsbh': 'FDR 2-stage Benjamini-Hochberg', 'fdr_tsbky': 'FDR 2-stage Benjamini-Krieger-Yekutieli', 'fdr_gbs': 'FDR adaptive Gavrilov-Benjamini-Sarkar' } _alias_list = [['b', 'bonf', 'bonferroni'], ['s', 'sidak'], ['h', 'holm'], ['hs', 'holm-sidak'], ['sh', 'simes-hochberg'], ['ho', 'hommel'], ['fdr_bh', 'fdr_i', 'fdr_p', 'fdri', 'fdrp'], ['fdr_by', 'fdr_n', 'fdr_c', 'fdrn', 'fdrcorr'], ['fdr_tsbh', 'fdr_2sbh'], ['fdr_tsbky', 'fdr_2sbky', 'fdr_twostage'], ['fdr_gbs']] multitest_alias = OrderedDict() for m in _alias_list: multitest_alias[m[0]] = m[0] for a in m[1:]: multitest_alias[a] = m[0] def multipletests(pvals, alpha=0.05, method='hs', returnsorted=False): '''test results and p-value correction for multiple tests Parameters ---------- pvals : array_like uncorrected p-values alpha : float
def _hierarchical_split(count_dict, horizontal=True, gap=0.05):
    """
    Split a square in a hierarchical way given a contingency table.

    Hierarchically split the unit square in alternate directions
    in proportion to the subdivision contained in the contingency table
    count_dict. This is the function that actually performs the tiling
    for the creation of the mosaic plot. If the gap array has been specified
    it will insert a corresponding amount of space (proportional to the
    unit length), while retaining the proportionality of the tiles.

    Parameters
    ----------
    count_dict : dict
        Dictionary containing the contingency table.
        Each category should contain a non-negative number
        with a tuple as index. It expects all the combinations
        of keys to be represented; if that is not true, it will
        automatically consider the missing values as 0
    horizontal : bool
        The starting direction of the split (by default along
        the horizontal axis)
    gap : float or array of floats
        The list of gaps to be applied on each subdivision.
        If the length of the given array is less than the number
        of subcategories (or if it's a single number) it will extend
        it with exponentially decreasing gaps

    Returns
    -------
    base_rect : dict
        A dictionary containing the result of the split.
        To each key is associated a 4-tuple of coordinates
        that are required to create the corresponding rectangle:

            0 - x position of the lower left corner
            1 - y position of the lower left corner
            2 - width of the rectangle
            3 - height of the rectangle
    """
    # this is the unit square that we are going to divide
    base_rect = OrderedDict([(tuple(), (0, 0, 1, 1))])
    # get the list of each possible value for each level
    categories_levels = _categories_level(list(count_dict.keys()))
    L = len(categories_levels)

    # recreate the gaps vector starting from an int
    if not np.iterable(gap):
        gap = [gap / 1.5**idx for idx in range(L)]
    # extend if it's too short
    if len(gap) < L:
        last = gap[-1]
        # list(gap), not list(*gap): unpacking hands every gap to list()
        # as a separate argument and raises TypeError
        gap = list(gap) + [last / 1.5**idx for idx in range(L)]
    # trim if it's too long
    gap = gap[:L]
    # put the count dictionary in order for the keys
    # this will allow some code simplification; combinations that are
    # missing from count_dict count as 0, as stated in the docstring
    count_ordered = OrderedDict([(k, count_dict.get(k, 0))
                                 for k in product(*categories_levels)])
    for cat_idx, cat_enum in enumerate(categories_levels):
        # get the partial key up to the actual level
        base_keys = list(product(*categories_levels[:cat_idx]))
        for key in base_keys:
            # for each partial and each value calculate how many
            # observation we have in the counting dictionary
            part_count = [_reduce_dict(count_ordered, key + (partial, ))
                          for partial in cat_enum]
            # reduce the gap for subsequents levels
            new_gap = gap[cat_idx]
            # split the given subkeys in the rectangle dictionary
            base_rect = _key_splitting(base_rect, cat_enum, part_count, key,
                                       horizontal, new_gap)
        # alternate the direction of the split at every level
        horizontal = not horizontal
    return base_rect