def feature_importance(spn, target_id, rang=None, value_dict=None, numeric_prec=50):
    """Score every remaining feature by how its distribution varies with the target.

    For each value index of the target feature, the SPN is restricted to that
    value via ``fn.marg_rang`` and the overall population of the restricted
    SPN is computed. For every feature in the scope of the last restricted
    SPN, the per-target-value distributions are handed to
    ``_compare_distributions`` together with that feature's ``value_dict``
    entry.

    Args:
        spn: The SPN to analyse.
        target_id: Feature id of the target; ``value_dict[target_id][2]``
            determines how many target values are iterated.
        rang: Optional list of ranges applied to ``spn`` up front. The entry
            for ``target_id`` must be ``None``.
        value_dict: Feature meta data indexed by feature id. Built with
            ``fn.generate_adhoc_value_dict(spn)`` when omitted.
        numeric_prec: Forwarded to ``fn.get_overall_population``.

    Returns:
        A list with one ``_compare_distributions`` result per feature, in the
        iteration order of the restricted SPN's scope.
    """
    if value_dict is None:
        value_dict = fn.generate_adhoc_value_dict(spn)

    if rang is not None:
        # The target itself must stay unconstrained; it is conditioned below.
        assert(rang[target_id] is None)
        _, spn = fn.marg_rang(spn, rang)

    # One [p, population] pair per target value.
    # NOTE(review): p is presumably the probability of the condition - confirm
    # against fn.marg_rang.
    weighted_pops = []
    for value in range(len(value_dict[target_id][2])):
        cond = [None] * (np.max(spn.scope)+1)
        cond[target_id] = NominalRange([value])
        prob, cond_spn = fn.marg_rang(spn, cond)
        population = fn.get_overall_population(cond_spn,
                                               value_dict=value_dict,
                                               numeric_prec=numeric_prec)
        weighted_pops.append([prob, population])

    # The scope is taken from the SPN of the last target value.
    return [
        _compare_distributions(
            [[prob, population[f_id]] for prob, population in weighted_pops],
            value_dict[f_id])
        for f_id in cond_spn.scope
    ]
def visualize_overall_distribution(spn,
                                   value_dict=None,
                                   rang=None,
                                   numeric_prec=50,
                                   save_path=None):
    """Plot the overall distribution of every feature of ``spn`` in one row.

    One subplot is created per feature in ``spn.scope``. Distributions whose
    ``"feature_type"`` is ``"discrete"`` are drawn with
    ``viz_helper.bar_plot``, ``"numeric"`` ones with ``viz_helper.line_plot``;
    the square root of ``"y_vars"`` is passed as the error values.

    Args:
        spn: The SPN to visualize.
        value_dict: Feature meta data indexed by feature id; entry ``[1]`` is
            used as the column title. Built with
            ``fn.generate_adhoc_value_dict(spn)`` when omitted.
        rang: Optional list of ranges; when given, ``spn`` is replaced by the
            SPN returned from ``fn.marg_rang(spn, rang)`` before plotting.
        numeric_prec: Forwarded to ``fn.get_overall_population``.
        save_path: When ``None`` the figure is shown with ``plt.show()``,
            otherwise it is written with ``plt.savefig(save_path)``.

    Raises:
        Exception: If a distribution has a feature type other than
            ``"discrete"`` or ``"numeric"``.
    """
    if value_dict is None:
        value_dict = fn.generate_adhoc_value_dict(spn)
    if rang is not None:
        _, spn = fn.marg_rang(spn, rang)
    overall_population = fn.get_overall_population(spn,
                                                   value_dict=value_dict,
                                                   numeric_prec=numeric_prec)

    ncols = len(spn.scope)
    nrows = 1
    # squeeze=False keeps `axes` two-dimensional even for a single column.
    fig, axes = plt.subplots(nrows,
                             ncols,
                             figsize=(ncols * 3, nrows * 3),
                             squeeze=False)

    for i, f_id in enumerate(sorted(overall_population)):
        dist = overall_population[f_id]
        feature_type = dist["feature_type"]

        if feature_type == "discrete":
            viz_helper.bar_plot(axes[0][i],
                                dist["y_means"],
                                dist["x_labels"],
                                y_err=np.sqrt(dist["y_vars"]),
                                y_label="probability",
                                ylim=[0, 1])
        elif feature_type == "numeric":
            viz_helper.line_plot(axes[0][i],
                                 dist["x_vals"],
                                 dist["y_means"],
                                 y_errs=np.sqrt(dist["y_vars"]),
                                 y_label="density")
        else:
            # `dist` is a plain dict and has no `.scope`; report the feature's
            # value_dict entry through the id we are iterating over.
            raise Exception("Unknown attribute-type: " +
                            str(value_dict[f_id]))

    # Column titles, placed slightly above each axes.
    pad_col = 5
    feature_names = [value_dict[x][1] for x in sorted(spn.scope)]
    for ax, col in zip(axes[0], feature_names):
        ax.annotate(col,
                    xy=(0.5, 1),
                    xytext=(0, pad_col),
                    xycoords='axes fraction',
                    textcoords='offset points',
                    size='large',
                    ha='center',
                    va='baseline')

    plt.tight_layout()
    # Leave head room for the column titles added above.
    fig.subplots_adjust(top=0.9)

    if save_path is None:
        plt.show()
    else:
        plt.savefig(save_path)
def visualize_target_based_conds_overall_distribution_compact(
        spn,
        target_conds,
        value_dict=None,
        rang=None,
        target_names=None,
        numeric_prec=50,
        save_path=None):
    '''
    Plot, for every non-target feature, its distribution under each target
    condition in a single shared subplot.

    Every entry of ``target_conds`` is applied to the SPN via
    ``fn.marg_rang``; the overall population of each resulting SPN is
    collected per feature. Discrete features (``value_dict[f_id][0] ==
    "discrete"``) are drawn with ``viz_helper.multiple_bar_plot``, numeric
    ones with one ``viz_helper.line_plot`` call per condition. Variances
    (``"y_vars"``) are not drawn in this compact view.

    Args:
        spn: The SPN to visualize.
        target_conds: Non-empty sequence of dicts mapping a target feature id
            to the range it is conditioned on.
        value_dict: Feature meta data indexed by feature id (``[0]`` feature
            type, ``[1]`` display name). Built with
            ``fn.generate_adhoc_value_dict(spn)`` when omitted.
        rang: Optional list of ranges applied to ``spn`` up front. The
            entries of all target features must be ``None``.
        target_names: Optional labels, one per entry of ``target_conds``.
            When omitted, no labels are passed to the plot helpers.
        numeric_prec: Forwarded to ``fn.get_overall_population``.
        save_path: When ``None`` the figure is shown with ``plt.show()``,
            otherwise it is written with ``plt.savefig(save_path)``.

    Raises:
        ValueError: If ``target_conds`` is empty.
        Exception: If a feature type is neither ``"discrete"`` nor
            ``"numeric"``.
    '''
    n_vals = len(target_conds)
    if n_vals == 0:
        raise ValueError("target_conds must contain at least one condition")

    if value_dict is None:
        value_dict = fn.generate_adhoc_value_dict(spn)
    target_ids = {target_id for conds in target_conds for target_id in conds}
    if rang is not None:
        # Targets must stay unconstrained; they are conditioned below.
        for target_id in target_ids:
            assert (rang[target_id] is None)
        _, spn = fn.marg_rang(spn, rang)

    # One list of distributions (one per condition) for every plotted feature.
    plot_data = {f_id: [] for f_id in spn.scope if f_id not in target_ids}

    # One column per plotted feature. Counting plot_data instead of
    # `len(scope) - 1` stays correct when more than one target is conditioned.
    ncols = len(plot_data)
    nrows = 1
    fig, axes = plt.subplots(nrows,
                             ncols,
                             figsize=(ncols * 3, nrows * 3),
                             squeeze=False)

    ps = []
    for conds in target_conds:
        tmp_rang = [None] * (np.max(spn.scope) + 1)
        for target_id, cond in conds.items():
            tmp_rang[target_id] = cond

        p, spn1 = fn.marg_rang(spn, tmp_rang)
        ps.append(p)
        overall_population = fn.get_overall_population(
            spn1, value_dict=value_dict, numeric_prec=numeric_prec)
        for f_id in spn1.scope:
            plot_data[f_id].append(overall_population[f_id])

    # Plot and title the columns from the same sorted id list so that a title
    # can never end up above another feature's subplot.
    plotted_ids = sorted(plot_data)
    for i, f_id in enumerate(plotted_ids):
        dists = plot_data[f_id]
        feature_type = value_dict[f_id][0]

        if feature_type == "discrete":
            y_means = [dist["y_means"] for dist in dists]
            # The x labels are taken from the last condition's distribution.
            viz_helper.multiple_bar_plot(axes[0][i],
                                         y_means,
                                         dists[-1]["x_labels"],
                                         legend_labels=target_names,
                                         y_label="probability",
                                         ylim=[0, 1])

        elif feature_type == "numeric":
            for j, dist in enumerate(dists):
                # target_names is optional; do not index into None.
                # NOTE(review): assumes viz_helper.line_plot accepts
                # label=None - confirm.
                label = None if target_names is None else target_names[j]
                viz_helper.line_plot(axes[0][i],
                                     dist["x_vals"],
                                     dist["y_means"],
                                     label=label,
                                     y_label="density")
        else:
            raise Exception("Unknown attribute-type: " +
                            str(value_dict[f_id]))

    # Column titles, placed slightly above each axes.
    pad_col = 5
    feature_names = [value_dict[f_id][1] for f_id in plotted_ids]
    for ax, col in zip(axes[0], feature_names):
        ax.annotate(col,
                    xy=(0.5, 1),
                    xytext=(0, pad_col),
                    xycoords='axes fraction',
                    textcoords='offset points',
                    size='large',
                    ha='center',
                    va='baseline')

    # A single legend below the first subplot serves the whole row.
    axes[0][0].legend(loc='upper center', bbox_to_anchor=(0.5, -0.25))
    plt.tight_layout()

    if save_path is None:
        plt.show()
    else:
        plt.savefig(save_path)
def visualize_target_based_overall_distribution_single(spn,
                                                       target_id,
                                                       value_dict=None,
                                                       rang=None,
                                                       numeric_prec=50,
                                                       save_path=None):
    """Plot a grid of feature distributions, one row per target value.

    For every value index of the target feature, the SPN is restricted to
    that value via ``fn.marg_rang`` and the overall population of the
    restricted SPN is drawn in its own row: ``"discrete"`` distributions with
    ``viz_helper.bar_plot``, ``"numeric"`` ones with
    ``viz_helper.line_plot``. Each row is annotated with the value returned
    alongside the restricted SPN (as a percentage) and ``<name>=<value>`` of
    the target.

    Args:
        spn: The SPN to visualize.
        target_id: Feature id of the target; ``value_dict[target_id][2]``
            determines the rows.
        value_dict: Feature meta data indexed by feature id (``[1]`` display
            name, ``[2]`` values). Built with
            ``fn.generate_adhoc_value_dict(spn)`` when omitted.
        rang: Optional list of ranges applied to ``spn`` up front. The entry
            for ``target_id`` must be ``None``.
        numeric_prec: Forwarded to ``fn.get_overall_population``.
        save_path: When ``None`` the figure is shown with ``plt.show()``,
            otherwise it is written with ``plt.savefig(save_path)``.

    Raises:
        Exception: If a distribution has a feature type other than
            ``"discrete"`` or ``"numeric"``.
    """
    if value_dict is None:
        value_dict = fn.generate_adhoc_value_dict(spn)
    if rang is not None:
        # The target itself must stay unconstrained; it is conditioned below.
        assert (rang[target_id] is None)
        _, spn = fn.marg_rang(spn, rang)

    n_vals = len(value_dict[target_id][2])

    # One column per non-target feature, one row per target value.
    ncols = len(spn.scope) - 1
    nrows = n_vals
    fig, axes = plt.subplots(nrows,
                             ncols,
                             figsize=(ncols * 3, nrows * 2),
                             squeeze=False)

    ps = []
    for v in range(n_vals):
        tmp_rang = [None] * (np.max(spn.scope) + 1)
        tmp_rang[target_id] = NominalRange([v])

        p, spn1 = fn.marg_rang(spn, tmp_rang)
        ps.append(p)
        overall_population = fn.get_overall_population(
            spn1, value_dict=value_dict, numeric_prec=numeric_prec)

        for i, f_id in enumerate(sorted(spn1.scope)):
            dist = overall_population[f_id]
            feature_type = dist["feature_type"]

            if feature_type == "discrete":
                viz_helper.bar_plot(axes[v][i],
                                    dist["y_means"],
                                    dist["x_labels"],
                                    y_err=np.sqrt(dist["y_vars"]),
                                    y_label="probability",
                                    ylim=[0, 1])

            elif feature_type == "numeric":
                viz_helper.line_plot(axes[v][i],
                                     dist["x_vals"],
                                     dist["y_means"],
                                     y_errs=np.sqrt(dist["y_vars"]),
                                     y_label="density")
            else:
                # `dist` is a plain dict and has no `.scope`; report the
                # feature's value_dict entry through the id being plotted.
                raise Exception("Unknown attribute-type: " +
                                str(value_dict[f_id]))

    # Column titles above the first row; the scope of the last restricted SPN
    # provides the feature ids.
    pad_col = 5
    feature_names = [value_dict[x][1] for x in sorted(spn1.scope)]
    for ax, col in zip(axes[0], feature_names):
        ax.annotate(col,
                    xy=(0.5, 1),
                    xytext=(0, pad_col),
                    xycoords='axes fraction',
                    textcoords='offset points',
                    size='large',
                    ha='center',
                    va='baseline')

    # Row labels to the left of the first column.
    pad_row = 5
    for i, p in enumerate(ps):
        # str() keeps the label working when the target name or its values
        # are not strings (e.g. integer category codes).
        row_label = (str(round(p * 100, 4)) + "%\n" +
                     str(value_dict[target_id][1]) + "=" +
                     str(value_dict[target_id][2][i]))
        axes[i][0].annotate(row_label,
                            xy=(0, 0.5),
                            xytext=(-axes[i][0].yaxis.labelpad - pad_row, 0),
                            xycoords=axes[i][0].yaxis.label,
                            textcoords='offset points',
                            size='large',
                            ha='right',
                            va='center')

    plt.tight_layout()
    # Leave room for the row labels (left) and column titles (top).
    fig.subplots_adjust(left=0.15, top=0.9)

    if save_path is None:
        plt.show()
    else:
        plt.savefig(save_path)
def test_get_overall_population():
    """Smoke check: print the overall population of the example gender SPN."""
    gender_spn = example_spns.get_gender_spn()
    print(fn.get_overall_population(gender_spn))
Example #6
0
    feature_scope = {2}
    data = np.array([np.nan, np.nan, np.nan])
    expect = fn.expect_spnflow(spn, feature_scope, data)
    print(expect)

    #Sub-population
    sub_pops = fn.get_sub_populations(spn)
    print(sub_pops)

    #Value_dict
    val_dict = fn.generate_adhoc_value_dict(spn)
    print(val_dict)

    #overall_population
    overall_pop = fn.get_overall_population(spn)

    #Titanic
    spn, value_dict, _ = spn_handler.load_spn(dataset_name, rdc_threshold,
                                              min_instances_slice)

    #Classify
    ranges = np.array([[
        NominalRange([1]),
        NominalRange([1]), None, None, None, None, None, None
    ],
                       [
                           NominalRange([1]),
                           NominalRange([0]), None, None, None, None, None,
                           None
                       ],