Esempio n. 1
0
def interestingness_matrix(spn, value_dict=None, numeric_prec=20):
    """Score every feature of every sub-population of ``spn``.

    The feature ids in ``spn.scope`` are visited in ascending order. For the
    k-th smallest feature id, the distribution found at position ``k`` of each
    sub-population is collected and the whole group is passed to
    ``_compare_distributions``.

    Args:
        spn: Model exposing a ``scope`` of feature ids; handed unchanged to
            ``fn.get_sub_populations``.
        value_dict: Per-feature description indexed by feature id. Built with
            ``fn.generate_adhoc_value_dict(spn)`` when ``None``.
        numeric_prec: Forwarded verbatim to ``_compare_distributions``.

    Returns:
        Tuple ``(sub_pops, scores)``: the sub-populations exactly as returned
        by ``fn.get_sub_populations`` and the transposed array of the
        per-feature results.
    """
    if value_dict is None:
        value_dict = fn.generate_adhoc_value_dict(spn)
    sub_pops = fn.get_sub_populations(spn)

    # One result per feature; each compares that feature's distribution
    # across all sub-populations.
    per_feature = [
        _compare_distributions(
            np.array([leaves[position] for _, leaves in sub_pops]),
            value_dict[feature_id],
            numeric_prec,
        )
        for position, feature_id in enumerate(sorted(spn.scope))
    ]

    # Transpose so the feature axis comes last instead of first.
    return sub_pops, np.array(per_feature).T
Esempio n. 2
0
def topdown_interesting_rules(
        spn,
        value_dict,
        metrics=None,
        full_value_dict=None,
        beta=1.,
        labeled=True):
    """Derive rules from the interesting leaves of every sub-population.

    For each sub-population of ``spn`` the (up to six) leaves reported by
    ``get_interesting_leaves`` are collected, ordered by their weight, turned
    into rules with ``get_leaf_rules`` and kept only when
    ``head_compatible_body`` accepts the head/body pair. Every surviving rule
    is scored with ``rule_stats`` and retained when its ``'F'`` score exceeds
    0.03.

    Args:
        spn: Model passed to ``fn.get_sub_populations``,
            ``get_interesting_leaves`` and ``rule_stats``.
        value_dict: Value dictionary used for leaf selection, compatibility
            checking (as ``one_hot_vd``) and labelling.
        metrics: Names of the statistics requested from ``rule_stats``; they
            also become the trailing column names. Defaults to
            ``['sup', 'conf', 'head_sup', 'F', 'cosine_distance']``. Must
            contain ``'F'`` because the filter looks that entry up.
        full_value_dict: Forwarded to ``head_compatible_body``.
        beta: Forwarded to ``rule_stats``.
        labeled: When true, head and body are converted with
            ``get_labeled_rule``; otherwise the raw head/body objects are put
            in the frame.

    Returns:
        ``pandas.DataFrame`` with columns ``['head', 'body', *metrics]`` and
        duplicate ``(body, head)`` pairs removed.

    Raises:
        ValueError: If a rule is scored and ``'F'`` is not in ``metrics``.
    """
    # A fresh list per call avoids sharing one mutable default between calls.
    if metrics is None:
        metrics = ['sup', 'conf', 'head_sup', 'F', 'cosine_distance']

    candidates = []
    for sub in fn.get_sub_populations(spn):
        candidates.extend(get_interesting_leaves(spn, sub, value_dict, top=6))
    # Sort in place by weight (third tuple entry). The previous code called
    # sorted() and threw the result away, so no ordering was ever applied.
    candidates.sort(key=lambda entry: entry[2])

    # Keep only (body, head) pairs whose head is compatible with the body.
    compatible = []
    for leaf, _diff, _weight in candidates:
        for leaf_rule in get_leaf_rules(leaf):
            if head_compatible_body(leaf_rule[1],
                                    leaf_rule[0],
                                    one_hot_vd=value_dict,
                                    full_value_dict=full_value_dict):
                compatible.append(leaf_rule)

    final_rules = []
    for rule, head in compatible:
        if len(rule) == 0 or len(head) == 0:
            continue
        stats = rule_stats(
            spn,
            rule,
            head,
            metrics=metrics,
            beta=beta,
        )
        if stats[metrics.index('F')] > 0.03:
            final_rules.append((head, rule, *stats))

    if labeled:
        rows = [(*get_labeled_rule(r[0], r[1], value_dict), *r[2:])
                for r in final_rules]
    else:
        # Previously this path raised NameError because the labelled list was
        # the only one ever bound. NOTE(review): drop_duplicates below needs
        # the raw head/body objects to be hashable — confirm for this path.
        rows = final_rules

    rule_df = pd.DataFrame(rows, columns=['head', 'body', *metrics])
    return rule_df.drop_duplicates(['body', 'head'])
def visualized_target_based_expected_sub_populations(spn,
                                                     target_id,
                                                     value_dict=None,
                                                     top=None,
                                                     rang=None,
                                                     numeric_prec=10,
                                                     save_path=None):
    """Plot expected sub-population profiles, one subplot per target value.

    For every index ``v`` in ``range(len(value_dict[target_id][2]))`` the SPN
    is restricted with ``fn.marg_rang`` to ``NominalRange([v])`` on the target
    feature. The sub-populations of the restricted SPN are then drawn as
    jittered poly-lines through the normalised expectation of each of their
    distributions. Line width is 100 times the product of the weight returned
    by ``fn.marg_rang`` and the sub-population weight.

    Args:
        spn: Model exposing ``scope``.
        target_id: Feature id whose values define the subplot rows.
        value_dict: Per-feature entries indexed by feature id, where ``[0]``
            is ``"discrete"`` or ``"numeric"``, ``[1]`` is the display name
            and ``[2]`` holds the value labels (discrete) or the
            ``(min, max)`` bounds (numeric). Built with
            ``fn.generate_adhoc_value_dict(spn)`` when ``None``.
        top: Forwarded to ``fn.get_sub_populations``.
        rang: Optional range list applied to ``spn`` via ``fn.marg_rang``
            before anything else.
        numeric_prec: Number of interpolation points per line segment.
        save_path: Where to save the figure; the figure is shown when
            ``None``.

    Raises:
        Exception: If a feature type is neither ``"discrete"`` nor
            ``"numeric"``.

    Note:
        The jitter comes from ``np.random.normal``, so the picture is only
        reproducible when the caller seeds NumPy's global generator.
    """
    if value_dict is None:
        value_dict = fn.generate_adhoc_value_dict(spn)
    if rang is not None:
        # fn.marg_rang is unpacked as a (weight, spn) pair further down;
        # assigning the whole pair to `spn` broke every later `spn.scope`.
        _, spn = fn.marg_rang(spn, rang)

    n_vals = len(value_dict[target_id][2])
    rang_size = np.max(spn.scope) + 1

    target_weights = []
    all_lines = []
    for v in range(n_vals):
        target_rang = [None] * rang_size
        target_rang[target_id] = NominalRange([v])
        target_weight, cond_spn = fn.marg_rang(spn, target_rang)
        target_weights.append(target_weight)

        lines = []
        for sub_weight, dists in fn.get_sub_populations(cond_spn,
                                                        sort=True,
                                                        top=top):
            line = []
            for dist in dists:
                f_id = dist.scope[0]
                f_type = value_dict[f_id][0]
                if f_type not in ("discrete", "numeric"):
                    raise Exception("Unknown attribute-type: " +
                                    str(value_dict[f_id]))

                # A fresh all-None range per call: no restriction applied.
                expect = fn.expect(dist, f_id, [None] * rang_size)
                if f_type == "discrete":
                    # Grid position selected by the truncated expectation.
                    grid = np.linspace(0, 1, len(value_dict[f_id][2]))
                    line.append(grid[int(expect)])
                else:
                    mi = value_dict[f_id][2][0]
                    ma = value_dict[f_id][2][1]
                    line.append((expect - mi) / (ma - mi))

            lines.append([target_weight * sub_weight, line])
        all_lines.append(lines)

    fig, axes = plt.subplots(n_vals,
                             1,
                             figsize=(16, 6 * n_vals),
                             squeeze=False)
    x_feature_ids = sorted(set(spn.scope) - {target_id})

    for row, lines in enumerate(all_lines):
        plot = axes[row][0]
        plot.set_yticklabels([])

        for weight, line in lines:
            x_vals = []
            y_vals = []
            # Interpolate between neighbouring features and add jitter so
            # overlapping lines stay distinguishable.
            for seg, (y_val, next_y_val) in enumerate(zip(line, line[1:])):
                for r in np.linspace(0, 1, numeric_prec):
                    x_vals.append(seg + r)
                    y_vals.append(y_val + (next_y_val - y_val) * r +
                                  np.random.normal() * 0.025)
            plot.plot(x_vals, y_vals, linewidth=weight * 100)

        plot.set_xticks(np.arange(len(x_feature_ids)))
        plot.set_xticklabels(
            [value_dict[f_id][1] for f_id in x_feature_ids])

        for col, feature_id in enumerate(x_feature_ids):
            f_type = value_dict[feature_id][0]
            if f_type == "discrete":
                values = value_dict[feature_id][2]
                for idx, y_val in enumerate(np.linspace(0, 1, len(values))):
                    plot.text(col, y_val, values[idx])
            elif f_type == "numeric":
                mi = value_dict[feature_id][2][0]
                ma = value_dict[feature_id][2][1]
                for y_val in np.linspace(0, 1, 5):
                    plot.text(col, y_val, round(y_val * (ma - mi) + mi, 4))
            else:
                raise Exception(
                    "Not implemented for other than discrete or numeric")

    pad_row = 5
    for v, (ax, target_weight) in enumerate(zip(axes[:, 0], target_weights)):
        # format() instead of "+" so non-string value labels do not raise.
        info = "{}={} {}%\n".format(value_dict[target_id][1],
                                    value_dict[target_id][2][v],
                                    round(target_weight * 100, 4))
        ax.annotate(info,
                    xy=(0, 0.5),
                    xytext=(-ax.yaxis.labelpad - pad_row, 0),
                    xycoords=ax.yaxis.label,
                    textcoords='offset points',
                    size='large',
                    ha='right',
                    va='center')
    plt.tight_layout()
    fig.subplots_adjust(left=0.15)

    if save_path is None:
        plt.show()
    else:
        plt.savefig(save_path)
def visualize_expected_sub_populations(spn,
                                       value_dict=None,
                                       top=None,
                                       rang=None,
                                       numeric_prec=10,
                                       save_path=None):
    """Plot every sub-population as a line through its expected values.

    Each sub-population from ``fn.get_sub_populations(spn, sort=True,
    top=top)`` becomes one jittered poly-line. Its vertices are the
    expectations of the sub-population's distributions, normalised to
    ``[0, 1]``, and its width is 100 times the sub-population weight.

    Args:
        spn: Model exposing ``scope``.
        value_dict: Per-feature entries indexed by feature id, where ``[0]``
            is ``"discrete"`` or ``"numeric"``, ``[1]`` is the display name
            and ``[2]`` holds the value labels (discrete) or the
            ``(min, max)`` bounds (numeric). Built with
            ``fn.generate_adhoc_value_dict(spn)`` when ``None``.
        top: Forwarded to ``fn.get_sub_populations``.
        rang: Optional range list applied to ``spn`` via ``fn.marg_rang``
            before the sub-populations are extracted.
        numeric_prec: Number of interpolation points per line segment.
        save_path: Where to save the figure; the figure is shown when
            ``None``.

    Raises:
        Exception: If a feature type is neither ``"discrete"`` nor
            ``"numeric"``.

    Note:
        The jitter comes from ``np.random.normal``, so the picture is only
        reproducible when the caller seeds NumPy's global generator.
    """
    if value_dict is None:
        value_dict = fn.generate_adhoc_value_dict(spn)
    if rang is not None:
        marginalized = fn.marg_rang(spn, rang)
        # NOTE(review): another function in this file unpacks fn.marg_rang
        # as a (weight, spn) pair. Accept that shape as well as a bare SPN so
        # `spn.scope` below keeps working — confirm the real return type.
        if isinstance(marginalized, tuple):
            spn = marginalized[1]
        else:
            spn = marginalized
    sub_pops = fn.get_sub_populations(spn, sort=True, top=top)

    fig, axes = plt.subplots(1, 1, figsize=(16, 6), squeeze=False)

    lines = []
    for prob, dists in sub_pops:
        line = []
        for dist in dists:
            f_id = dist.scope[0]
            f_type = value_dict[f_id][0]
            if f_type not in ("discrete", "numeric"):
                raise Exception("Unknown attribute-type: " +
                                str(value_dict[f_id]))

            # A fresh all-None range per call (kept out of the `rang`
            # parameter, which the old code overwrote here).
            no_restriction = [None] * (np.max(spn.scope) + 1)
            expect = fn.expect(dist, f_id, no_restriction)
            if f_type == "discrete":
                # Grid position selected by the truncated expectation.
                grid = np.linspace(0, 1, len(value_dict[f_id][2]))
                line.append(grid[int(expect)])
            else:
                mi = value_dict[f_id][2][0]
                ma = value_dict[f_id][2][1]
                line.append((expect - mi) / (ma - mi))
        lines.append([prob, line])

    plot = axes[0][0]
    plot.set_yticklabels([])
    for prob, line in lines:
        x_vals = []
        y_vals = []
        # Interpolate between neighbouring features and add jitter so
        # overlapping lines stay distinguishable.
        for seg, (y_val, next_y_val) in enumerate(zip(line, line[1:])):
            for r in np.linspace(0, 1, numeric_prec):
                x_vals.append(seg + r)
                y_vals.append(y_val + (next_y_val - y_val) * r +
                              np.random.normal() * 0.025)
        plot.plot(x_vals, y_vals, linewidth=prob * 100)

    plot.set_xticks(np.arange(len(spn.scope)))
    # value_dict can no longer be None here, so the labels are always set.
    plot.set_xticklabels([value_dict[f_id][1] for f_id in spn.scope])

    for col, feature_id in enumerate(spn.scope):
        f_type = value_dict[feature_id][0]
        if f_type == "discrete":
            values = value_dict[feature_id][2]
            for idx, y_val in enumerate(np.linspace(0, 1, len(values))):
                plot.text(col, y_val, values[idx])
        elif f_type == "numeric":
            mi = value_dict[feature_id][2][0]
            ma = value_dict[feature_id][2][1]
            for y_val in np.linspace(0, 1, 5):
                plot.text(col, y_val, round(y_val * (ma - mi) + mi, 4))
        else:
            raise Exception(
                "Not implemented for other than discrete or numeric")

    plt.tight_layout()

    if save_path is None:
        plt.show()
    else:
        plt.savefig(save_path)
def visualize_sub_populations(spn,
                              value_dict=None,
                              top=None,
                              rang=None,
                              numeric_prec=50,
                              save_path=None):
    """Draw a grid of per-feature distributions, one row per sub-population.

    Rows are the sub-populations from ``fn.get_sub_populations(spn,
    sort=True, top=top)``; the j-th distribution of a sub-population goes in
    column j. Discrete features are drawn with ``viz_helper.bar_plot`` and
    numeric ones with ``viz_helper.line_plot``. Column headers are the feature
    names in ascending feature-id order. Each row is annotated with the
    sub-population weight rounded to six decimals.

    Args:
        spn: Model exposing ``scope``.
        value_dict: Per-feature entries indexed by feature id, where ``[0]``
            is ``"discrete"`` or ``"numeric"`` and ``[1]`` is the display
            name. ``[2]`` is a dict of value -> label for discrete features
            and the ``(min, max)`` bounds for numeric ones. Built with
            ``fn.generate_adhoc_value_dict(spn)`` when ``None``.
        top: Forwarded to ``fn.get_sub_populations``.
        rang: Optional range list applied to ``spn`` via ``fn.marg_rang``
            before the sub-populations are extracted.
        numeric_prec: Number of sample points for numeric densities.
        save_path: Where to save the figure; the figure is shown when
            ``None``.

    Raises:
        Exception: If a feature type is neither ``"discrete"`` nor
            ``"numeric"``.
    """
    if value_dict is None:
        value_dict = fn.generate_adhoc_value_dict(spn)
    if rang is not None:
        marginalized = fn.marg_rang(spn, rang)
        # NOTE(review): another function in this file unpacks fn.marg_rang
        # as a (weight, spn) pair. Accept that shape as well as a bare SPN so
        # `spn.scope` below keeps working — confirm the real return type.
        if isinstance(marginalized, tuple):
            spn = marginalized[1]
        else:
            spn = marginalized
    sub_pops = fn.get_sub_populations(spn, sort=True, top=top)

    ncols = len(spn.scope)
    nrows = len(sub_pops)
    fig, axes = plt.subplots(nrows,
                             ncols,
                             figsize=(ncols * 3, nrows * 2),
                             squeeze=False)

    for row, (_, dists) in enumerate(sub_pops):
        for col, dist in enumerate(dists):
            ax = axes[row][col]
            f_id = dist.scope[0]
            f_type = value_dict[f_id][0]

            if f_type == "discrete":
                # Sort by value so bars and tick labels line up.
                val_pairs = sorted(value_dict[f_id][2].items(),
                                   key=lambda pair: pair[0])
                y_vals = fn.evaluate_discrete_leaf(
                    dist, f_vals=[pair[0] for pair in val_pairs])
                viz_helper.bar_plot(ax,
                                    y_vals,
                                    x_tick_labels=[pair[1]
                                                   for pair in val_pairs],
                                    y_label="probability",
                                    ylim=[0, 1])

            elif f_type == "numeric":
                x_vals = np.linspace(value_dict[f_id][2][0],
                                     value_dict[f_id][2][1],
                                     num=numeric_prec)
                y_vals = fn.evaluate_numeric_density_leaf(dist, x_vals)
                viz_helper.line_plot(ax, x_vals, y_vals, y_label="density")

            else:
                raise Exception("Unknown attribute-type: " +
                                str(value_dict[f_id]))

    # Column headers. value_dict is never None at this point, so the old
    # "Feature <id>" fallback was unreachable and has been dropped.
    pad_col = 5
    feature_names = [value_dict[f_id][1] for f_id in sorted(spn.scope)]
    for ax, col_name in zip(axes[0], feature_names):
        ax.annotate(col_name,
                    xy=(0.5, 1),
                    xytext=(0, pad_col),
                    xycoords='axes fraction',
                    textcoords='offset points',
                    size='large',
                    ha='center',
                    va='baseline')

    # Row headers: the rounded sub-population weight.
    pad_row = 5
    row_labels = [round(weight, 6) for weight, _ in sub_pops]
    for ax, row_label in zip(axes[:, 0], row_labels):
        ax.annotate(row_label,
                    xy=(0, 0.5),
                    xytext=(-ax.yaxis.labelpad - pad_row, 0),
                    xycoords=ax.yaxis.label,
                    textcoords='offset points',
                    size='large',
                    ha='right',
                    va='center')
    plt.tight_layout()
    fig.subplots_adjust(left=0.15, top=0.95)

    if save_path is None:
        plt.show()
    else:
        plt.savefig(save_path)
def test_get_subpopulations():
    """Smoke test: print the sub-populations of the example gender SPN."""
    gender_spn = example_spns.get_gender_spn()
    print(fn.get_sub_populations(gender_spn))
Esempio n. 7
0
# Show the pairwise correlations of the raw data, labelled by feature name.
df = pd.DataFrame(data, columns=[value_dict[i][1] for i in range(num_vars)])
print(df.corr())

# parameters for the construction
rdc_threshold = 0.1
min_instances_slice = 0.1
if not spn_handler.exist_spn(dataset_name, rdc_threshold, min_instances_slice):
    print("Creating SPN ...")

    # save=False: the freshly learned SPN is used directly, not persisted.
    spn, value_dict, _ = spn_handler.create_parametric_spns(
        data,
        data_types,
        dataset_name, [rdc_threshold], [min_instances_slice],
        value_dict,
        save=False)
else:
    # Load the stored SPN. Without this branch `spn` stayed undefined
    # whenever a matching SPN already existed, and the statements below
    # failed with NameError.
    spn, value_dict, _ = spn_handler.load_spn(dataset_name, rdc_threshold,
                                              min_instances_slice)

# Print some statistics
fn.print_statistics(spn)
# The third positional argument is `top` (10) in both visualisations.
visualize_expected_sub_populations(spn, value_dict, 10)
visualize_sub_populations(spn, value_dict, 10)
subpops = fn.get_sub_populations(spn, )

print(subpops)
print('============')
pprint(subpops)
fn.plot_spn(spn, "icecream_spn.pdf", value_dict)