Beispiel #1
0
def distribution(excel_rows, item_name, items, file_name):
    """Estimate the 'Category' CPD per item and export the rows to Excel.

    For every entry of ``items`` the matching rows are selected from the
    module-level ``monkey_df`` (``item_name == "Monkey"``) or ``gender_df``
    (``item_name == "gender"``). A ``BayesianEstimator`` is built on that
    subset with the module-level ``model``, and for every pair drawn from the
    module-level ``conditions`` and ``categories`` whose estimated probability
    is positive, one row is appended to ``excel_rows``.

    The confidence bounds use the conservative half-width
    ``z * sqrt(0.5 * 0.5 / n)`` where ``n`` is the number of rows of the
    subset. The lower bound is clipped at 0; the upper bound is not clipped.

    Args:
        excel_rows: List of rows, mutated in place. ``excel_rows[0]`` is used
            as the header and must contain a ``'Probability'`` column and a
            column named like ``item_name``. Each appended row is
            ``[item, condition, category, count, prob, lower_ci, upper_ci,
            alpha]``.
        item_name: ``"Monkey"`` or ``"gender"``; selects the source frame and
            the column the 'prefference' sheet is de-duplicated on.
        items: Iterable of values to filter the source frame by.
        file_name: Output path without extension; ``".xlsx"`` is appended.

    Returns:
        The DataFrame built from ``excel_rows`` (header row excluded).

    Raises:
        ValueError: If ``item_name`` is not supported and ``items`` is
            non-empty.
    """
    alpha = 1 - 0.95
    # Two-sided 95% interval: (1 - 0.95) / 2 = 0.025 in each tail.
    z_score = abs(st.norm.ppf(0.025))

    for item in items:
        if item_name == "Monkey":
            df = monkey_df[monkey_df.Monkey == item]
        elif item_name == "gender":
            df = gender_df[gender_df.gender == item]
        else:
            # Previously `df` stayed unbound (or stale) for other names.
            raise ValueError(
                "Unsupported item_name %r; expected 'Monkey' or 'gender'" %
                (item_name,))

        n_rows = df.shape[0]
        if n_rows == 0:
            # No observations for this item: nothing to estimate, and the
            # half-width below would divide by zero.
            continue

        z = BayesianEstimator(model, df)
        cat_cpd = z.estimate_cpd('Category',
                                 prior_type="bdeu",
                                 equivalent_sample_size=0)
        # Both values depend only on the item, so compute them once instead
        # of once per (condition, category) pair.
        state_counts = z.state_counts('Category')
        # p_hat = q_hat = 0.5 is the conservative choice without prior data.
        margin = z_score * math.sqrt((0.5 * 0.5) / n_rows)

        for condition in conditions:
            for category in categories:
                try:
                    count = list(
                        state_counts[condition].to_dict().values()
                    )[0][category]
                    prob = cat_cpd.get_value(**{
                        'Condition': condition,
                        'Category': category
                    })
                except KeyError:
                    # This combination is not present for the item.
                    continue

                if not isNaN(prob) and prob > 0:
                    excel_rows.append([
                        item, condition, category, count, prob,
                        max(prob - margin, 0),  # a probability cannot be < 0
                        prob + margin, alpha
                    ])

    prob_df = pd.DataFrame.from_records(excel_rows[1:], columns=excel_rows[0])
    # The context manager saves and closes the workbook even if a write
    # fails; ExcelWriter.save() no longer exists in pandas >= 2.0.
    with pd.ExcelWriter(file_name + ".xlsx") as writer:
        prob_df.to_excel(writer, sheet_name='Distribution')
        # One row per item: the first one after sorting by ascending
        # probability, i.e. the lowest-probability row.
        prob_df.sort_values('Probability', ascending=True).drop_duplicates(
            [item_name]).to_excel(writer, sheet_name='prefference')
    return prob_df
Beispiel #2
0
def distribution(excel_rows, item_name, items, file_name, df_cols,
                 groupby_cols, bp_group):
    """Estimate the 'Orientation' CPD per item, export it and box-plot counts.

    For every entry of ``items`` the matching rows are selected from the
    module-level ``monkey_df`` (``item_name == "Monkey"``) or ``gender_df``
    (``item_name == "gender"``). A ``BayesianEstimator`` is built on that
    subset with the module-level ``model``, and for every
    (left category, right category, orientation) combination whose estimated
    probability is positive, one row is appended to ``excel_rows``. The
    categories come from the module-level ``categories``; the orientations
    are ``"left"`` and ``"right"``.

    The confidence bounds use the conservative half-width
    ``z * sqrt(0.5 * 0.5 / n)`` where ``n`` is the number of rows of the
    subset. The lower bound is clipped at 0; the upper bound is not clipped.

    Side effects: writes ``file_name + ".xlsx"`` (sheets 'Distribution' and
    'prefference'), saves the box plot to ``file_name + ".png"``, shows it
    and clears the current figure.

    Args:
        excel_rows: List of rows, mutated in place. ``excel_rows[0]`` is used
            as the header and must contain ``'Count'``, ``'Probability'`` and
            a column named like ``item_name``. Each appended row is
            ``[item, left, right, orientation, count, prob, lower_ci,
            upper_ci, alpha]``.
        item_name: ``"Monkey"`` or ``"gender"``; selects the source frame and
            the column the 'prefference' sheet is de-duplicated on.
        items: Iterable of values to filter the source frame by.
        file_name: Output path without extension.
        df_cols: Columns of the result frame kept for the box-plot data.
        groupby_cols: Columns the kept data is grouped by before summing
            ``'Count'``.
        bp_group: Grouping passed as ``by`` to ``DataFrame.boxplot``.

    Returns:
        The DataFrame built from ``excel_rows`` (header row excluded).

    Raises:
        ValueError: If ``item_name`` is not supported and ``items`` is
            non-empty.
    """
    alpha = 1 - 0.95
    # Two-sided 95% interval: (1 - 0.95) / 2 = 0.025 in each tail.
    z_score = abs(st.norm.ppf(0.025))
    orientations = ["left", "right"]

    for item in items:
        if item_name == "Monkey":
            df = monkey_df[monkey_df.Monkey == item]
        elif item_name == "gender":
            df = gender_df[gender_df.gender == item]
        else:
            # Previously `df` stayed unbound (or stale) for other names.
            raise ValueError(
                "Unsupported item_name %r; expected 'Monkey' or 'gender'" %
                (item_name,))

        n_rows = df.shape[0]
        if n_rows == 0:
            # No observations for this item: nothing to estimate, and the
            # half-width below would divide by zero.
            continue

        z = BayesianEstimator(model, df)
        cat_cpd = z.estimate_cpd('Orientation',
                                 prior_type="bdeu",
                                 equivalent_sample_size=6)
        # Both values depend only on the item, so compute them once instead
        # of once per combination.
        state_counts = z.state_counts('Orientation')
        # p_hat = q_hat = 0.5 is the conservative choice without prior data.
        margin = z_score * math.sqrt((0.5 * 0.5) / n_rows)

        for left in categories:
            for right in categories:
                for cat in orientations:
                    try:
                        count = state_counts[left][right][cat]
                        prob = cat_cpd.get_value(
                            **{
                                'Left_categ': left,
                                'Right_categ': right,
                                'Orientation': cat
                            })
                    except KeyError:
                        # This combination is not present for the item.
                        continue

                    if not isNaN(prob) and prob > 0:
                        excel_rows.append([
                            item, left, right, cat, count, prob,
                            max(prob - margin, 0),  # probability can't be < 0
                            prob + margin, alpha
                        ])

    prob_df = pd.DataFrame.from_records(excel_rows[1:], columns=excel_rows[0])
    gen_df = prob_df[df_cols].groupby(groupby_cols)['Count'].agg(['sum'])

    # The return value (axes/artists) is not needed; the title and labels
    # below are applied through the pyplot state machine.
    gen_df.boxplot(rot=90,
                   fontsize=12,
                   figsize=(16, 10),
                   column=['sum'],
                   by=bp_group)
    plt.title(item_name.capitalize() + " Box plot grouped by : " +
              str(bp_group))
    plt.suptitle('')  # drop the automatic "grouped by" super-title
    plt.ylabel("sum")

    # The context manager saves and closes the workbook even if a write
    # fails; ExcelWriter.save() no longer exists in pandas >= 2.0.
    with pd.ExcelWriter(file_name + ".xlsx") as writer:
        prob_df.to_excel(writer, sheet_name='Distribution')
        # One row per item: the first one after sorting by descending
        # probability, i.e. the highest-probability row.
        prob_df.sort_values('Probability', ascending=False).drop_duplicates(
            [item_name]).to_excel(writer, sheet_name='prefference')

    # Save before show(): showing may leave an empty figure behind.
    plt.savefig(file_name + ".png", dpi=100)
    plt.show()
    plt.clf()
    return prob_df