def heatmap(attr1, attr2, annot=True):
    """Save a heatmap of how ``attr2`` is distributed within each ``attr1`` category.

    Every column of the plotted table belongs to one category of ``attr1`` and
    holds the value counts of ``attr2`` among the rows in that category,
    divided by the number of rows in that category.  The figure is written to
    ``heatmaps/<attr1>_and_<attr2>.png`` and the current figure is cleared
    afterwards.

    Args:
        attr1: Column whose categories become the heatmap columns.
        attr2: Column whose values become the heatmap rows.
        annot: Passed through to ``sns.heatmap`` as its ``annot`` argument.
    """
    frame = shroom_dealer.get_data_frame()
    column_labels = shroom_dealer.get_attribute_dictionary()[attr1]
    row_labels = shroom_dealer.get_attribute_dictionary()[attr2]

    categories = frame[attr1].cat.categories
    shares = []
    for category in categories:
        in_category = frame[attr1] == category
        # Counts of attr2 inside this category, scaled by the category's row count.
        counts = frame[attr2][in_category].value_counts()
        shares.append(counts / len(frame[in_category]))

    table = pd.concat(shares, axis=1)
    table.columns = [column_labels[category] for category in categories]
    tick_labels = [row_labels[value] for value in table.index]

    sns.heatmap(table, annot=annot, yticklabels=tick_labels, fmt='.2f')
    plt.title("{} and {}".format(attr1, attr2))
    plt.yticks(rotation=0)
    plt.tight_layout()
    plt.savefig("heatmaps/{}_and_{}.png".format(attr1, attr2))
    plt.clf()
def plot_comparative_data(attribute, plot=True, save=False):
    """Draw side-by-side bars of edible vs. poisonous counts for one attribute.

    Args:
        attribute: Name of the attribute to chart.  It is passed to ``data``
            for the counts and used to look up the tick labels in the
            attribute dictionary.
        plot: When true, display the figure with ``plt.show()``.
        save: When true, write the figure to
            ``comparative_barcharts/<attribute>.png``.
    """
    # One call yields both groups; calling data() twice would redo all counts.
    counts = data(attribute)
    edible_data = counts["edible"]
    poisonous_data = counts["poisonous"]
    labels = shroom_dealer.get_attribute_dictionary()[attribute]

    # Read both groups through one explicit key order so that bars and tick
    # labels are guaranteed to line up, and hand matplotlib real lists.
    keys = list(edible_data.keys())
    edible_counts = [edible_data[key] for key in keys]
    poisonous_counts = [poisonous_data[key] for key in keys]

    index = np.arange(len(keys))
    bar_width = 0.35
    opacity = 0.4

    fig, _ = plt.subplots()
    plt.bar(index, edible_counts, bar_width, align='center', color='b',
            label='edible', alpha=opacity)
    plt.bar(index + bar_width, poisonous_counts, bar_width, align='center',
            color='r', label='poisonous', alpha=opacity)
    plt.xlabel('Attributes')
    plt.ylabel('Frequency')
    plt.title('Frequency by attribute and edibility ({})'.format(attribute))
    plt.xticks(index + bar_width / 2, [labels[key] for key in keys])
    plt.legend()
    plt.tight_layout()

    # Save before showing: once a blocking show() returns the figure may be
    # gone, and savefig() would then write an empty canvas.
    if save:
        plt.savefig('comparative_barcharts/{}.png'.format(attribute))
    if plot:
        plt.show()
    # Always release the figure so repeated calls do not accumulate figures.
    plt.close(fig)
def observed_data(attr1, attr2):
    """Build the table of observed ``attr2`` counts for each ``attr1`` category.

    Args:
        attr1: Column whose categories become the columns of the result.
        attr2: Column whose values are counted within each category.

    Returns:
        The column-wise concatenation of ``value_counts()`` of ``attr2`` for
        every category of ``attr1``, with each column renamed to the
        category's entry in the attribute dictionary.
    """
    frame = shroom_dealer.get_data_frame()
    column_labels = shroom_dealer.get_attribute_dictionary()[attr1]
    # Result unused; the lookup is kept to preserve the original call sequence.
    _ = shroom_dealer.get_attribute_dictionary()[attr2]

    categories = frame[attr1].cat.categories
    per_category = [
        frame[attr2][frame[attr1] == category].value_counts()
        for category in categories
    ]

    observed = pd.concat(per_category, axis=1)
    observed.columns = [column_labels[category] for category in categories]
    return observed
def get_corr():
    """Return correlations between 0/1 indicators of every attribute value.

    For each attribute in the attribute dictionary and each of its values, an
    indicator column is built that is 1 on the rows of the data frame holding
    that value and 0 elsewhere.  The column is named after the value's entry
    in the attribute dictionary.

    Returns:
        ``DataFrame.corr()`` of the indicator columns, after
        ``dropna(thresh=1)`` and with the row labelled ``"distant"`` dropped.
    """
    df = shroom_dealer.get_data_frame()
    attr_dict = shroom_dealer.get_attribute_dictionary()

    names = []
    indicators = []
    for attribute in attr_dict:
        column = df[attribute]
        for sub_attr in attr_dict[attribute]:
            names.append(attr_dict[attribute][sub_attr])
            indicators.append([1 if x == sub_attr else 0 for x in column])

    # Each entry of `indicators` is one indicator over all rows; transposing
    # gives one column per indicator.  Names are passed as a list so labels
    # that repeat across attributes stay separate columns.
    onehot = pd.DataFrame(np.array(indicators).transpose(), columns=names)
    return onehot.corr().dropna(thresh=1).drop("distant")
def data(attribute):
    """Count how often each value of ``attribute`` occurs, split by edibility.

    Args:
        attribute: Column name; also the key used to fetch the attribute's
            values from the attribute dictionary.

    Returns:
        ``{"poisonous": {...}, "edible": {...}}`` where each inner dict maps
        every key of the attribute's dictionary entry to the number of rows
        with that value whose ``'poisonous'`` column is ``'p'`` / ``'e'``.
    """
    df = shroom_dealer.get_data_frame()
    attribute_values = shroom_dealer.get_attribute_dictionary()[attribute]

    column = df[attribute]
    # Row masks are loop-invariant, so build them once.
    is_poisonous = df['poisonous'] == 'p'
    is_edible = df['poisonous'] == 'e'

    poisonous_data = {}
    edible_data = {}
    for a in attribute_values:
        has_value = column == a
        # Combine masks with & instead of chaining two boolean selections,
        # which would apply a full-length mask to an already-filtered Series.
        poisonous_data[a] = column[is_poisonous & has_value].count()
        edible_data[a] = column[is_edible & has_value].count()

    return {"poisonous": poisonous_data, "edible": edible_data}
def get_pie_data():
    """Tally label frequencies per attribute, overall and for poisonous rows.

    Returns:
        A pair ``(pie_data, poison_pie_data)``.  Both map each attribute name
        to a dict keyed by the attribute's labels from the attribute
        dictionary.  The first counts all rows carrying that label; the second
        counts only rows whose ``"poisonous"`` column equals ``"p"``.
    """
    att_map = shroom_dealer.get_attribute_dictionary()
    df = shroom_dealer.get_data_frame()

    pie_data = {}
    poison_pie_data = {}
    for attribute in att_map:
        value_labels = att_map[attribute]
        # Start every label at zero so labels that never occur still appear.
        totals = {value_labels[value]: 0 for value in value_labels}
        poisonous_totals = {value_labels[value]: 0 for value in value_labels}

        for val, poison in zip(df[attribute], df["poisonous"]):
            label = value_labels[val]
            totals[label] += 1
            if poison == "p":
                poisonous_totals[label] += 1

        pie_data[attribute] = totals
        poison_pie_data[attribute] = poisonous_totals

    return pie_data, poison_pie_data
import sys
import os
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import pandas as pd

# Put the parent directory on the import path before importing the
# project-local `preprocessing` package.
sys.path.append(os.path.abspath('..'))
from preprocessing import shroom_dealer

a_map = shroom_dealer.get_attribute_dictionary()
df = shroom_dealer.get_data_frame()

# Rewrite every column in place using that column's entry in the attribute
# dictionary (presumably raw codes -> display labels; confirm against
# shroom_dealer).
# NOTE(review): replace(..., inplace=True) returns None, so this print is
# expected to emit "None" once per column -- confirm whether it is leftover
# debugging output.
for col in df.columns:
    print df[col].replace(a_map[col], inplace=True)

# Stacked bar graphs.
# For every ordered pair of distinct columns, excluding "poisonous" itself,
# cross-tabulate the pair against the "poisonous" column; crosstab gives the
# per-combination counts directly.
for valY in df.axes[1]:
    for valX in df.axes[1]:
        if valY != valX and valY != "poisonous" and valX != "poisonous":
            poison = pd.crosstab([df[valY], df[valX]], df.poisonous.apply(str))
            # Divide each row by its own total so every bar shows proportions.
            poison_norm = poison.div(poison.sum(1).astype(float), axis=0)
            ax = poison_norm.plot(kind='barh', stacked=True)
            # One image per ordered pair, named "<valY>--<valX>.png".
            plt.savefig('stackedbar/%s.png' % (valY + '--' + valX), bbox_inches='tight')
            plt.close()
def heatmap_all():
    """Generate a heatmap for every unordered pair of attribute names."""
    names = shroom_dealer.get_attribute_dictionary().keys()
    for pair in itertools.combinations(names, 2):
        first, second = pair
        heatmap(first, second)
def save_all():
    """Save the comparative bar chart of every attribute without showing it."""
    for name in shroom_dealer.get_attribute_dictionary().keys():
        plot_comparative_data(name, save=True, plot=False)
def expected_data(observed):
    """Return the expected counts for a contingency table under independence.

    Each cell is ``row_total * column_total / grand_total``, computed in
    floating point for tables with any number of rows and columns.

    Args:
        observed: DataFrame of observed counts.

    Returns:
        A DataFrame of expected counts with the same index and columns as
        ``observed``.
    """
    # Convert to float up front so the division is never integer division.
    row_totals = observed.sum(axis=1).values.astype(float)
    col_totals = observed.sum().values.astype(float)
    total = col_totals.sum()
    # Outer product fills every (row, column) cell in one step.
    expected = np.outer(row_totals, col_totals) / total
    return pd.DataFrame(expected, index=observed.index,
                        columns=observed.columns)


cat_names = shroom_dealer.get_attribute_dictionary().keys()

# Chi-square statistic of every attribute against the 'poisonous' column.
chisqrs = []
for cat in cat_names:
    if cat != 'poisonous':
        observed = observed_data(cat, 'poisonous')
        expected = expected_data(observed)
        chisqr = (((observed - expected)**2) / expected).sum().sum()
        chisqrs.append((chisqr, cat))

# Keep the ten largest statistics, largest first.
chisqrs = sorted(chisqrs)[::-1]
chisqrs = chisqrs[:10]

values = [d[0] for d in chisqrs]
# Break hyphenated attribute names across lines.
labels = [d[1].replace("-", "\n") for d in chisqrs]
#index = np.arange(len(chisqrs))
import json import sys import os sys.path.append(os.path.abspath('..')) from preprocessing import shroom_dealer attr_names= shroom_dealer.get_attribute_dictionary() nodes = {} df = shroom_dealer.get_data_frame() for index, row in df.iterrows(): for attr in attr_names: attr_id = attr+"-"+attr_names[attr][row[attr]] for attr2 in attr_names: if(attr == attr2): continue attr_id2 = attr2+"-"+attr_names[attr2][row[attr2]] if(attr_id not in nodes): nodes[attr_id] = {} if(attr_id2 not in nodes[attr_id]): nodes[attr_id][attr_id2] = 0 nodes[attr_id][attr_id2]+=1 for MIN in range(0,159):