def _normalize_data(data, index): """normalize the data to a dict with tuples of strings as keys right now it works with: 0 - dictionary (or equivalent mappable) 1 - pandas.Series with simple or hierarchical indexes 2 - numpy.ndarrays 3 - everything that can be converted to a numpy array 4 - pandas.DataFrame (via the _normalize_dataframe function) """ # if data is a dataframe we need to take a completely new road # before coming back here. Use the hasattr to avoid importing # pandas explicitly if hasattr(data, 'pivot') and hasattr(data, 'groupby'): data = _normalize_dataframe(data, index) index = None # can it be used as a dictionary? try: items = list(data.items()) except AttributeError: # ok, I cannot use the data as a dictionary # Try to convert it to a numpy array, or die trying data = np.asarray(data) temp = OrderedDict() for idx in np.ndindex(data.shape): name = tuple(i for i in idx) temp[name] = data[idx] data = temp items = list(data.items()) # make all the keys a tuple, even if simple numbers data = OrderedDict([_tuplify(k), v] for k, v in items) categories_levels = _categories_level(list(data.keys())) # fill the void in the counting dictionary indexes = product(*categories_levels) contingency = OrderedDict([(k, data.get(k, 0)) for k in indexes]) data = contingency # reorder the keys order according to the one specified by the user # or if the index is None convert it into a simple list # right now it doesn't do any check, but can be modified in the future index = list(range(len(categories_levels))) if index is None else index contingency = OrderedDict() for key, value in list(data.items()): new_key = tuple(key[i] for i in index) contingency[new_key] = value data = contingency return data
def test_mosaic_very_complex(): # make a scattermatrix of mosaic plots to show the correlations between # each pair of variable in a dataset. Could be easily converted into a # new function that does this automatically based on the type of data key_name = ['gender', 'age', 'health', 'work'] key_base = (['male', 'female'], ['old', 'young'], ['healty', 'ill'], ['work', 'unemployed']) keys = list(product(*key_base)) data = OrderedDict(zip(keys, range(1, 1 + len(keys)))) props = {} props[('male', 'old')] = {'color': 'r'} props[('female', )] = {'color': 'pink'} L = len(key_base) fig, axes = pylab.subplots(L, L) for i in range(L): for j in range(L): m = set(range(L)).difference(set((i, j))) if i == j: axes[i, i].text(0.5, 0.5, key_name[i], ha='center', va='center') axes[i, i].set_xticks([]) axes[i, i].set_xticklabels([]) axes[i, i].set_yticks([]) axes[i, i].set_yticklabels([]) else: ji = max(i, j) ij = min(i, j) temp_data = OrderedDict([((k[ij], k[ji]) + tuple(k[r] for r in m), v) for k, v in data.items()]) keys = temp_data.keys() for k in keys: value = _reduce_dict(temp_data, k[:2]) temp_data[k[:2]] = value del temp_data[k] mosaic(temp_data, ax=axes[i, j], axes_label=False, properties=props, gap=0.05, horizontal=i > j) pylab.suptitle('old males should look bright red, (plot 4 of 4)')
def _normalize_data(data, index): """normalize the data to a dict with tuples of strings as keys right now it works with: 0 - dictionary (or equivalent mappable) 1 - pandas.Series with simple or hierarchical indexes 2 - numpy.ndarrays 3 - everything that can be converted to a numpy array 4 - pandas.DataFrame (via the _normalize_dataframe function) """ # if data is a dataframe we need to take a completely new road # before coming back here. Use the hasattr to avoid importing # pandas explicitly if hasattr(data, 'pivot') and hasattr(data, 'groupby'): data = _normalize_dataframe(data, index) index = None # can it be used as a dictionary? try: items = list(data.iteritems()) except AttributeError: # ok, I cannot use the data as a dictionary # Try to convert it to a numpy array, or die trying data = np.asarray(data) temp = OrderedDict() for idx in np.ndindex(data.shape): name = tuple(i for i in idx) temp[name] = data[idx] data = temp items = data.items() # make all the keys a tuple, even if simple numbers data = OrderedDict([_tuplify(k), v] for k, v in items) categories_levels = _categories_level(list(data.keys())) # fill the void in the counting dictionary indexes = product(*categories_levels) contingency = OrderedDict([(k, data.get(k, 0)) for k in indexes]) data = contingency # reorder the keys order according to the one specified by the user # or if the index is None convert it into a simple list # right now it doesn't do any check, but can be modified in the future index = list(range(len(categories_levels))) if index is None else index contingency = OrderedDict() for key, value in data.items(): new_key = tuple(key[i] for i in index) contingency[new_key] = value data = contingency return data
def test_mosaic_very_complex(): # make a scattermatrix of mosaic plots to show the correlations between # each pair of variable in a dataset. Could be easily converted into a # new function that does this automatically based on the type of data key_name = ['gender', 'age', 'health', 'work'] key_base = (['male', 'female'], ['old', 'young'], ['healty', 'ill'], ['work', 'unemployed']) keys = list(product(*key_base)) data = OrderedDict(zip(keys, range(1, 1 + len(keys)))) props = {} props[('male', 'old')] = {'color': 'r'} props[('female',)] = {'color': 'pink'} L = len(key_base) fig, axes = pylab.subplots(L, L) for i in range(L): for j in range(L): m = set(range(L)).difference(set((i, j))) if i == j: axes[i, i].text(0.5, 0.5, key_name[i], ha='center', va='center') axes[i, i].set_xticks([]) axes[i, i].set_xticklabels([]) axes[i, i].set_yticks([]) axes[i, i].set_yticklabels([]) else: ji = max(i, j) ij = min(i, j) temp_data = OrderedDict([((k[ij], k[ji]) + tuple(k[r] for r in m), v) for k, v in data.items()]) keys = temp_data.keys() for k in keys: value = _reduce_dict(temp_data, k[:2]) temp_data[k[:2]] = value del temp_data[k] mosaic(temp_data, ax=axes[i, j], axes_label=False, properties=props, gap=0.05, horizontal=i > j) pylab.suptitle('old males should look bright red, (plot 4 of 4)')
def test_mosaic_very_complex(): # make a scattermatrix of mosaic plots to show the correlations between # each pair of variable in a dataset. Could be easily converted into a # new function that does this automatically based on the type of data key_name = ["gender", "age", "health", "work"] key_base = (["male", "female"], ["old", "young"], ["healty", "ill"], ["work", "unemployed"]) keys = list(product(*key_base)) data = OrderedDict(list(zip(keys, list(range(1, 1 + len(keys)))))) props = {} props[("male", "old")] = {"color": "r"} props[("female",)] = {"color": "pink"} L = len(key_base) fig, axes = pylab.subplots(L, L) for i in range(L): for j in range(L): m = set(range(L)).difference(set((i, j))) if i == j: axes[i, i].text(0.5, 0.5, key_name[i], ha="center", va="center") axes[i, i].set_xticks([]) axes[i, i].set_xticklabels([]) axes[i, i].set_yticks([]) axes[i, i].set_yticklabels([]) else: ji = max(i, j) ij = min(i, j) temp_data = OrderedDict([((k[ij], k[ji]) + tuple(k[r] for r in m), v) for k, v in list(data.items())]) keys = list(temp_data.keys()) for k in keys: value = _reduce_dict(temp_data, k[:2]) temp_data[k[:2]] = value del temp_data[k] mosaic(temp_data, ax=axes[i, j], axes_label=False, properties=props, gap=0.05, horizontal=i > j) pylab.suptitle("old males should look bright red, (plot 4 of 4)")