コード例 #1
0
def heatmap(attr1, attr2, annot=True):
    """Draw and save a heatmap of attr2 value frequencies per attr1 category.

    Each column corresponds to one category of ``attr1`` and holds, for every
    value of ``attr2``, its count among the rows in that category divided by
    the number of rows in that category.  The figure is written to
    ``heatmaps/<attr1>_and_<attr2>.png`` and the current figure is cleared
    afterwards.

    Args:
        attr1: Name of the categorical column that supplies the heatmap
            columns.
        attr2: Name of the column whose values supply the heatmap rows.
        annot: Forwarded to ``sns.heatmap``; when true the cells are
            annotated with their value.
    """
    frame = shroom_dealer.get_data_frame()

    column_labels = shroom_dealer.get_attribute_dictionary()[attr1]
    row_labels = shroom_dealer.get_attribute_dictionary()[attr2]

    categories = frame[attr1].cat.categories

    proportions = []
    for category in categories:
        in_category = frame[attr1] == category
        counts = frame[attr2][in_category].value_counts()
        proportions.append(counts / len(frame[in_category]))

    table = pd.concat(proportions, axis=1)
    table.columns = [column_labels[category] for category in categories]

    # Row tick labels are the readable names of the attr2 values.
    row_ticks = [row_labels[value] for value in table.index]

    sns.heatmap(table, annot=annot, yticklabels=row_ticks, fmt='.2f')

    plt.title("{} and {}".format(attr1, attr2))
    plt.yticks(rotation=0)
    plt.tight_layout()
    plt.savefig("heatmaps/{}_and_{}.png".format(attr1, attr2))
    plt.clf()
コード例 #2
0
def plot_comparative_data(attribute, plot=True, save=False):
    """Plot edible vs. poisonous counts for every value of an attribute.

    Draws two side-by-side bar series (edible in blue, poisonous in red),
    one pair of bars per attribute value.

    Args:
        attribute: Name of the attribute to plot.
        plot: When true, display the figure with ``plt.show()``.
        save: When true, write the figure to
            ``comparative_barcharts/<attribute>.png``.
    """
    # Fetch the counts once; both series come from the same call.
    counts = data(attribute)
    edible_data = counts["edible"]
    poisonous_data = counts["poisonous"]

    labels = shroom_dealer.get_attribute_dictionary()[attribute]

    # Fix one key order and read both series through it, so the bars and the
    # tick labels are aligned regardless of dict iteration order, and so that
    # plain lists (not dict views) are handed to matplotlib.
    keys = list(edible_data.keys())
    edible_counts = [edible_data[key] for key in keys]
    poisonous_counts = [poisonous_data[key] for key in keys]

    index = np.arange(len(keys))
    bar_width = 0.35
    opacity = 0.4

    plt.subplots()

    plt.bar(index, edible_counts, bar_width, align='center',
            color='b', label='edible', alpha=opacity)
    plt.bar(index + bar_width, poisonous_counts, bar_width,
            align='center', color='r', label='poisonous', alpha=opacity)

    plt.xlabel('Attributes')
    plt.ylabel('Frequency')
    plt.title('Frequency by attribute and edibility ({})'.format(attribute))
    plt.xticks(index + bar_width / 2, [labels[key] for key in keys])

    plt.legend()

    plt.tight_layout()

    # Save before showing: once the window opened by show() is closed the
    # figure may already be torn down, which would produce an empty image.
    if save:
        plt.savefig('comparative_barcharts/{}.png'.format(attribute))

    if plot:
        plt.show()

    plt.close()
コード例 #3
0
def observed_data(attr1, attr2):
    """Build the table of observed attr2 counts per attr1 category.

    Args:
        attr1: Name of the categorical column whose categories become the
            columns of the result (labelled with their readable names).
        attr2: Name of the column whose value counts fill each column.

    Returns:
        A ``pandas.DataFrame`` with one column per category of ``attr1``;
        each column is the ``value_counts()`` of ``attr2`` restricted to the
        rows in that category.
    """
    df = shroom_dealer.get_data_frame()
    labels1 = shroom_dealer.get_attribute_dictionary()[attr1]

    # Evaluate the category list once; it drives both the data columns and
    # their labels, so the two must stay in the same order.
    categories = df[attr1].cat.categories

    counts = [df[attr2][df[attr1] == category].value_counts()
              for category in categories]

    observed = pd.concat(counts, axis=1)
    observed.columns = [labels1[category] for category in categories]

    return observed
コード例 #4
0
def get_corr():
    """Return the correlation matrix of indicator columns for all values.

    For every attribute and every value listed for it in the attribute
    dictionary, a 0/1 column is built marking the rows whose attribute equals
    that value; the column is named after the value's readable label.  The
    pairwise correlation of these columns is computed, rows without a single
    non-missing entry are dropped, and the row labelled "distant" is removed.
    """
    frame = shroom_dealer.get_data_frame()
    attribute_labels = shroom_dealer.get_attribute_dictionary()

    names = []
    indicators = []
    for attribute in attribute_labels:
        for code in attribute_labels[attribute]:
            names.append(attribute_labels[attribute][code])
            indicators.append(
                [1 if value == code else 0 for value in frame[attribute]])

    # The indicators are collected one per value (row-wise); transpose so
    # each of them becomes a column of the frame.
    encoded = pd.DataFrame(np.array(indicators).transpose(), columns=names)
    correlations = encoded.corr().dropna(thresh=1)
    return correlations.drop("distant")
コード例 #5
0
def data(attribute):
    """Count poisonous and edible rows for every value of an attribute.

    Args:
        attribute: Name of the attribute (data-frame column) to tally.

    Returns:
        A dict ``{"poisonous": {...}, "edible": {...}}`` where each inner
        dict maps an attribute value (a key of the attribute dictionary
        entry) to the number of rows having that value and a 'poisonous'
        column equal to 'p' or 'e' respectively.
    """
    df = shroom_dealer.get_data_frame()
    attribute_values = shroom_dealer.get_attribute_dictionary()[attribute]

    # The edibility masks do not depend on the attribute value, so build
    # them once instead of on every loop iteration.
    column = df[attribute]
    is_poisonous = df['poisonous'] == 'p'
    is_edible = df['poisonous'] == 'e'

    poisonous_data = {}
    edible_data = {}

    for a in attribute_values:
        has_value = column == a
        # Combine the full-length masks with `&` rather than chaining two
        # boolean selections, which would apply a full-length mask to an
        # already filtered Series.
        poisonous_data[a] = int((has_value & is_poisonous).sum())
        edible_data[a] = int((has_value & is_edible).sum())

    return {"poisonous": poisonous_data, "edible": edible_data}
コード例 #6
0
def get_pie_data():
    """Tally label frequencies per attribute, overall and for poisonous rows.

    Returns:
        A pair ``(pie_data, poison_pie_data)``.  Both are dicts keyed by
        attribute name; each value is a dict mapping a readable value label
        to a count.  ``pie_data`` counts every row, ``poison_pie_data`` only
        the rows whose "poisonous" column equals "p".  Labels that never
        occur keep a count of 0.
    """
    att_map = shroom_dealer.get_attribute_dictionary()
    frame = shroom_dealer.get_data_frame()

    pie_data = dict.fromkeys(att_map)
    poison_pie_data = dict.fromkeys(att_map)

    for attribute in att_map:
        value_labels = att_map[attribute]
        totals = {value_labels[code]: 0 for code in value_labels}
        poisonous_totals = {value_labels[code]: 0 for code in value_labels}

        for code, edibility in zip(frame[attribute], frame["poisonous"]):
            totals[value_labels[code]] += 1
            if edibility == "p":
                poisonous_totals[value_labels[code]] += 1

        pie_data[attribute] = totals
        poison_pie_data[attribute] = poisonous_totals

    return pie_data, poison_pie_data
コード例 #7
0
import sys
import os
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import pandas as pd

sys.path.append(os.path.abspath('..'))
from preprocessing import shroom_dealer

a_map = shroom_dealer.get_attribute_dictionary()
df = shroom_dealer.get_data_frame()
# Swap the raw codes in every column for their readable labels.  The result
# is assigned back to the column explicitly: an in-place replace on a column
# selection returns None (so there is nothing useful to print) and is not
# guaranteed to write through to the parent frame.
for col in df.columns:
    df[col] = df[col].replace(a_map[col])

# Stacked bar graphs: for every ordered pair of distinct attributes other
# than "poisonous", plot the share of each "poisonous" value within each
# combination of the two attributes.  The cross-tab operator provides an
# easy way to get these numbers.
for valY in df.axes[1]:
    for valX in df.axes[1]:
        if valY == valX or "poisonous" in (valY, valX):
            continue

        poison = pd.crosstab([df[valY], df[valX]], df.poisonous.apply(str))
        # Divide every row by its own total so each bar sums to 1.
        poison_norm = poison.div(poison.sum(1).astype(float), axis=0)
        poison_norm.plot(kind='barh', stacked=True)

        plt.savefig('stackedbar/%s.png' % (valY + '--' + valX),
                    bbox_inches='tight')
        plt.close()
コード例 #8
0
def heatmap_all():
    """Call ``heatmap`` once for every unordered pair of attribute names."""
    attribute_names = shroom_dealer.get_attribute_dictionary().keys()
    for first, second in itertools.combinations(attribute_names, 2):
        heatmap(first, second)
コード例 #9
0
def save_all():
    """Save (without displaying) the comparative bar chart of every attribute."""
    for attribute in shroom_dealer.get_attribute_dictionary().keys():
        plot_comparative_data(attribute, save=True, plot=False)
コード例 #10
0
def expected_data(observed):
    """Compute the expected counts of a contingency table under independence.

    Every cell is ``row_total * column_total / grand_total``.  The table may
    have any number of rows and columns; missing cells are ignored when the
    totals are computed (the pandas default for ``sum``).

    Args:
        observed: ``pandas.DataFrame`` of observed counts.

    Returns:
        A ``pandas.DataFrame`` of floats with the same index and columns as
        ``observed``.
    """
    # Work on plain float arrays: this avoids label-vs-position lookups on
    # the totals and keeps the division a true division.
    row_totals = observed.sum(axis=1).values.astype(float)
    col_totals = observed.sum(axis=0).values.astype(float)
    total = col_totals.sum()

    # The outer product yields row_total * col_total for every cell at once.
    expected = np.outer(row_totals, col_totals) / total

    return pd.DataFrame(expected,
                        index=observed.index,
                        columns=observed.columns)


# Score every attribute except 'poisonous' by the chi-squared statistic of
# its observed counts against 'poisonous'.
cat_names = shroom_dealer.get_attribute_dictionary().keys()

chisqrs = []
for cat in cat_names:
    if cat == 'poisonous':
        continue
    observed = observed_data(cat, 'poisonous')
    expected = expected_data(observed)
    squared_error = (observed - expected) ** 2
    chisqr = (squared_error / expected).sum().sum()
    chisqrs.append((chisqr, cat))

# Largest statistic first, keeping only the top ten.
chisqrs.sort()
chisqrs.reverse()
del chisqrs[10:]

values = [statistic for statistic, _ in chisqrs]
# Break hyphenated attribute names over several lines.
labels = [name.replace("-", "\n") for _, name in chisqrs]

#index = np.arange(len(chisqrs))
コード例 #11
0
import json
import sys
import os
sys.path.append(os.path.abspath('..'))

from preprocessing import shroom_dealer

attr_names = shroom_dealer.get_attribute_dictionary()

df = shroom_dealer.get_data_frame()

# nodes[a][b] is the number of rows in which the "<attribute>-<label>" ids
# a and b (belonging to two different attributes) occur together.
nodes = {}

for index, row in df.iterrows():
    for attr in attr_names:
        attr_id = attr + "-" + attr_names[attr][row[attr]]

        for attr2 in attr_names:
            if attr2 != attr:
                attr_id2 = attr2 + "-" + attr_names[attr2][row[attr2]]

                # Create the inner table / counter on first sight, then bump.
                neighbours = nodes.setdefault(attr_id, {})
                neighbours[attr_id2] = neighbours.get(attr_id2, 0) + 1

for MIN in range(0,159):