def test_expect():
    spn = example_spns.get_gender_spn()

    rang = [None, None, None]
    expect = fn.expect(spn, feature_id=2, rang=rang)
    print(expect)

    rang = [NominalRange([0]), None, None]
    expect = fn.expect(spn, feature_id=2, rang=rang)
    print(expect)

    rang = [NominalRange([1]), None, None]
    expect = fn.expect(spn, feature_id=2, rang=rang)
    print(expect)

    rang = [None, NominalRange([0]), None]
    expect = fn.expect(spn, feature_id=2, rang=rang)
    print(expect)

    feature_scope = {2}
    data = np.array([[np.nan, np.nan, np.nan]])
    expect = fn.expects_spnflow(spn, feature_scope, data)
    print(expect)

    feature_scope = {2}
    data = np.array([np.nan, np.nan, np.nan])
    expect = fn.expect_spnflow(spn, feature_scope, data)
    print(expect)
def test_marg():
    spn = example_spns.get_gender_spn()

    spn1 = fn.marg(spn, [2])
    fn.plot_spn(spn1, "marg1.pdf")

    spn2 = fn.marg(spn, [0])
    fn.plot_spn(spn2, "marg2.pdf")

    spn3 = fn.marg(spn, [1])
    fn.plot_spn(spn3, "marg3.pdf")

    spn4 = fn.marg(spn, [1, 2])
    fn.plot_spn(spn4, "marg4.pdf")

    rang = [None, NominalRange([1]), None]
    prob, spn5 = fn.marg_rang(spn, rang)
    fn.plot_spn(spn5, "marg5.pdf")

    rang = [None, NominalRange([1]), NumericRange([[10, 12]])]
    prob, spn6 = fn.marg_rang(spn, rang)
    fn.plot_spn(spn6, "marg6.pdf")

    rang = [NominalRange([0]), NominalRange([1]), None]
    prob = fn.prob(spn, rang)
    print(prob)

    prob = fn.prob(spn6, rang)
    print(prob)
def test_classify():
    from util import io
    from data import real_data

    loc = "_spns"
    ident = "rdc=" + str(0.3) + "_mis=" + str(0.1)
    spn, _ = io.load(ident, "titanic", loc)
    value_dict = real_data.get_titanic_value_dict()
    # spn = fn.marg(spn, keep=[0, 1, 2, 4, 5, 7])

    ranges = np.array(
        [[None, NominalRange([1]), None, None, None, None, None, None],
         [None, NominalRange([0]), None, None, None, None, None, None],
         [None, NominalRange([0]), None, None, None, None, None, None]])
    res = fn.classifies(spn, target_id=0, ranges=ranges, value_dict=value_dict)
    print(res)

    res = fn.classify(spn, target_id=0)
    print(res)

    df, _ = real_data.get_titanic()
    a = {v[1]: v[2] for _, v in value_dict.items() if v[0] == "discrete"}
    df = df.replace(a)
    preds = fn.classify_dataset(spn, target_id=0, df=df, transform=True, value_dict=value_dict)
    print(preds)
def explore_1():
    dataset_name = "rki_ed_1"
    rdc_threshold = 0.3
    min_instances_slice = 0.01

    if not spn_handler.exist_spn(dataset_name, rdc_threshold, min_instances_slice):
        df, value_dict, parametric_types = ed_data.get_rki_ed_1()
        spn_handler.create_parametric_spns(df.values, parametric_types, dataset_name,
                                           [rdc_threshold], [min_instances_slice], value_dict)

    spn, value_dict, _ = spn_handler.load_spn(dataset_name, rdc_threshold, min_instances_slice)
    spn = fn.marg(spn, keep=[0, 2, 3, 4, 5])
    fn.print_statistics(spn)

    p = io.get_path("_results/ed_data_explore")
    # vz.visualize_overall_distribution(spn, value_dict)

    from spn.experiments.AQP.Ranges import NominalRange
    target_conds = [{0: NominalRange([5, 6])}, {0: NominalRange([0, 1, 2, 3, 4])}]
    # target_conds = [{0: NominalRange([5, 6]), 1: NominalRange([0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11])},
    #                 {0: NominalRange([0, 1, 2, 3, 4]), 1: NominalRange([0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11])}]
    vz.visualize_target_based_conds_overall_distribution_compact(
        spn, target_conds, value_dict,
        target_names=["Weekend", "During the week"],
        save_path=p + dataset_name + "_weekend_measures.pdf")
def test_sampling():
    spn = example_spns.get_gender_spn()

    # Always the same random number generator
    samples = fn.sampling(spn, n_samples=10, random_seed=1)
    print(samples)

    samples = fn.sampling_rang(spn, rang=[None, None, None, None], n_samples=10, random_seed=1)
    print(samples)

    samples = fn.sampling_rang(spn, rang=[None, None, NumericRange([[10, 11], [29, 30]])],
                               n_samples=10, random_seed=1)
    print(samples)

    samples = fn.sampling_rang(spn, rang=[NominalRange([0]), None, NumericRange([[14, 15], [29, 30]])],
                               n_samples=10, random_seed=1)
    print(samples)
def reduce_spn(spn):
    # NOTE: the original took no argument but used `spn`; it is made a parameter here.
    from spn.experiments.AQP.Ranges import NominalRange, NumericRange
    from spn.structure.Base import Sum, Product
    from spn.algorithms.Inference import sum_likelihood, prod_likelihood
    from spn.structure.leaves.parametric.Parametric import Gaussian, Categorical
    from spn.structure.leaves.parametric.InferenceRange import categorical_likelihood_range
    from simple_spn.UpdateRange import categorical_update_range

    evidence = [NominalRange([0]), None, None, None]
    inference_support_ranges = {Gaussian: None,
                                Categorical: categorical_likelihood_range,
                                Sum: sum_likelihood,
                                Product: prod_likelihood}
    distribution_update_ranges = {Gaussian: None,
                                  Categorical: categorical_update_range}

    # spn_util.plot_spn(spn, "old.pdf")
    prob, spn = spn_for_evidence(spn, evidence,
                                 node_likelihood=inference_support_ranges,
                                 distribution_update_ranges=distribution_update_ranges)
    print(prob)
def classifies(spn, target_id, ranges, value_dict=None):
    if value_dict is None:
        value_dict = generate_adhoc_value_dict(spn)
    if ranges is None:
        ranges = np.array([[None] * (np.max(spn.scope) + 1)])
    assert not any(ranges[:, target_id])

    # Evaluate every candidate target value and pick the most probable one per row.
    ps = []
    for v in range(len(value_dict[target_id][2])):
        ranges[:, target_id] = NominalRange([v])
        ps.append(probs(spn, ranges))
    return np.argmax(ps, axis=0)
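# A minimal usage sketch for classifies(), assuming the toy gender SPN from the
# tests above (feature 0: nominal target, feature 1: nominal, feature 2: numeric).
# The return value is one predicted value index per row of `ranges`.
def demo_classifies():
    spn = example_spns.get_gender_spn()
    ranges = np.array([[None, NominalRange([1]), None],
                       [None, None, NumericRange([[20, 30]])]])
    preds = classifies(spn, target_id=0, ranges=ranges)
    print(preds)  # e.g. array([0, 1]) -- one class index per instance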
def naive_approach(spn, min_support=0.1, value_dict=None):
    if value_dict is None:
        value_dict = fn.generate_adhoc_value_dict(spn)

    # Start with one candidate itemset per binary feature (value 1 = "present").
    n_rv = np.max(spn.scope) + 1
    ranges = np.full(shape=(n_rv, n_rv), fill_value=None)
    for i in range(len(ranges)):
        if len(value_dict[i][2]) == 2:
            ranges[i][i] = NominalRange([1])

    freq_sets = []
    for iteration in range(len(spn.scope)):
        print("Iteration: " + str(iteration))
        if len(ranges) == 0:
            break
        probs = fn.probs(spn, ranges)

        # Keep every candidate whose support clears the threshold.
        new_freq_sets = []
        for row, prob in enumerate(probs):
            if prob >= min_support:
                ids = [j for j, cond in enumerate(ranges[row]) if cond is not None]
                new_freq_sets.append([prob, ids])
        freq_sets += new_freq_sets

        # Extend each surviving itemset by one additional feature.
        ranges = []
        for prob, ids in new_freq_sets:
            for i in range(n_rv):
                if i not in ids:
                    rang = np.array([None] * n_rv)
                    rang[ids] = NominalRange([1])
                    rang[i] = NominalRange([1])
                    ranges.append(rang)
        ranges = np.array(ranges)

    return freq_sets
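# Usage sketch for naive_approach() (hypothetical spn): assumes an SPN over
# binary features where value 1 encodes "item present"; each returned entry is
# [support, list_of_feature_ids].
def demo_naive_approach(spn):
    freq_sets = naive_approach(spn, min_support=0.2)
    for support, ids in sorted(freq_sets, key=lambda x: x[0], reverse=True):
        print(round(support, 3), ids)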
def not_rang(rang, value_dict):
    # Complement each condition: the result matches exactly the values the
    # input condition excludes. Unconditioned positions stay None.
    assert len(rang) == len(value_dict)
    res = [None] * len(rang)
    for i, r in enumerate(rang):
        if r is None or r is np.nan:
            continue
        vals = list(value_dict[i][2].keys())
        if isinstance(r, Range):
            excluded = set(r.get_ranges())
        else:
            # r is a plain value, not a Range object.
            excluded = {r}
        # NOTE: the original removed entries with vals.pop(index), which breaks
        # once more than one value is excluded; a set difference is used instead.
        res[i] = NominalRange([v for v in vals if v not in excluded])
    return res
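# Example for not_rang(), with a hypothetical value_dict in the library's
# {feature_id: (type, name, {value: label})} format; unconditioned positions
# are marked with np.nan and stay None in the complement.
def demo_not_rang():
    value_dict = {0: ("discrete", "gender", {0: "male", 1: "female"}),
                  1: ("discrete", "student", {0: "no", 1: "yes"})}
    rang = [NominalRange([0]), np.nan]
    print(not_rang(rang, value_dict))  # [NominalRange([1]), None]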
def test_prob():
    spn = example_spns.get_gender_spn()

    rang = [None, None, None]
    prob = fn.prob(spn, rang)
    print(prob)

    rang = [NominalRange([0]), NominalRange([1]), NumericRange([[20]])]
    prob = fn.prob(spn, rang)
    print(prob)

    ranges = np.array([[None, None, NumericRange([[0, 20]])],
                       [NominalRange([0]), None, None],
                       [None, NominalRange([1]), None]])
    probs = fn.probs(spn, ranges)
    print(probs)

    inst = [0, np.nan, np.nan]
    prob = fn.prob_spflow(spn, inst)
    print(prob)

    data = np.array([[0, np.nan, np.nan], [0, 1, np.nan]])
    probs = fn.probs_spflow(spn, data)
    print(probs)
def _generate_conds(target_id, value_dict, numeric_intervals=10):
    conds = []
    labels = []
    if value_dict[target_id][0] == "discrete":
        for val in sorted(value_dict[target_id][2]):
            conds.append(NominalRange([val]))
            labels.append(value_dict[target_id][2][val])
    elif value_dict[target_id][0] == "numeric":
        val_space = np.linspace(value_dict[target_id][2][0], value_dict[target_id][2][1],
                                numeric_intervals + 1)
        # NOTE: intervals are built as [low, high]; the original zipped the
        # bounds in reverse order.
        for interval in zip(val_space[:-1], val_space[1:]):
            conds.append(NumericRange([list(interval)]))
            labels.append(str(list(interval)))
    else:
        raise Exception("Not implemented for other than discrete or numeric ...: " +
                        str(value_dict[target_id][0]))
    return conds, labels
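# Example for _generate_conds() with a hypothetical value_dict: a discrete
# target yields one NominalRange per value, a numeric target yields
# numeric_intervals equal-width NumericRanges over its bounds.
def demo_generate_conds():
    value_dict = {0: ("discrete", "student", {0: "no", 1: "yes"}),
                  1: ("numeric", "p_value", (0.0, 1.0))}
    print(_generate_conds(0, value_dict))  # ([NominalRange([0]), NominalRange([1])], ["no", "yes"])
    print(_generate_conds(1, value_dict, numeric_intervals=2))  # intervals [0.0, 0.5] and [0.5, 1.0]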
def classify_dataset(spn, target_id, df, transform=False, value_dict=None, epsilon=0.01):
    if value_dict is None:
        value_dict = generate_adhoc_value_dict(spn)
    sorted_scope = sorted(spn.scope)

    # Optionally map discrete string labels back to their value indices.
    if transform:
        inv_val_dict = {v[1]: {v2: k2 for k2, v2 in v[2].items()}
                        for _, v in value_dict.items() if v[0] == "discrete"}
        for col_name, map_dict in inv_val_dict.items():
            df[col_name] = df[col_name].map(map_dict)

    values = np.array(df.values)
    ranges = np.full(shape=(len(values), np.max(spn.scope) + 1), fill_value=None)
    for i, col in enumerate(values.T):
        f_id = sorted_scope[i]
        if f_id == target_id:
            continue
        if value_dict[f_id][0] == "discrete":
            for j, v in enumerate(col):
                ranges[j, f_id] = NominalRange([v])
        elif value_dict[f_id][0] == "numeric":
            # Numeric evidence becomes a small interval around the observed value.
            bound = epsilon * (value_dict[f_id][2][1] - value_dict[f_id][2][0])
            for j, v in enumerate(col):
                ranges[j, f_id] = NumericRange([[v - bound, v + bound]])
        else:
            raise Exception("Unknown attribute-type: " + str(value_dict[f_id][0]))

    return classifies(spn, target_id, ranges, value_dict)
def feature_importance(spn, target_id, rang=None, value_dict=None, numeric_prec=50):
    if value_dict is None:
        value_dict = fn.generate_adhoc_value_dict(spn)
    if rang is not None:
        assert rang[target_id] is None
        _, spn = fn.marg_rang(spn, rang)

    # Collect the overall population conditioned on each target value.
    n_vals = len(value_dict[target_id][2])
    overall_pops = []
    for v in range(n_vals):
        tmp_rang = [None] * (np.max(spn.scope) + 1)
        tmp_rang[target_id] = NominalRange([v])
        p, spn1 = fn.marg_rang(spn, tmp_rang)
        overall_pop = fn.get_overall_population(spn1, value_dict=value_dict, numeric_prec=numeric_prec)
        overall_pops.append([p, overall_pop])

    # Score each feature by how much its class-conditional distributions differ.
    fis = []
    for f_id in spn1.scope:
        dists = [[p, overall_pop[f_id]] for p, overall_pop in overall_pops]
        fi = _compare_distributions(dists, value_dict[f_id])
        fis.append(fi)
    return fis
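# Usage sketch for feature_importance() (hypothetical spn/value_dict): one score
# per non-target feature, higher meaning the class-conditional distributions
# differ more. The pairing below assumes the conditioned SPN's scope is the
# sorted non-target scope.
def demo_feature_importance(spn, value_dict):
    scores = feature_importance(spn, target_id=0, value_dict=value_dict)
    for f_id, score in zip(sorted(set(spn.scope) - {0}), scores):
        print(value_dict[f_id][1], score)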
def visualize_Density(spn):
    from spn.experiments.AQP.Ranges import NominalRange, NumericRange
    from spn.algorithms import Inference
    from simple_spn.InferenceRange import categorical_likelihood_range, gaussian_likelihood_range
    from spn.structure.Base import Sum, Product
    from spn.algorithms.Inference import sum_likelihood, prod_likelihood
    from spn.structure.leaves.parametric.Parametric import Gaussian, Categorical
    from simple_spn.UpdateRange import categorical_update_range
    import matplotlib.pyplot as plt

    inference_support_ranges = {Gaussian: gaussian_likelihood_range,
                                Categorical: categorical_likelihood_range,
                                Sum: sum_likelihood,
                                Product: prod_likelihood}
    distribution_update_ranges = {Gaussian: None,
                                  Categorical: categorical_update_range}

    _, axes = plt.subplots(1, 5, figsize=(15, 10), squeeze=False, sharey=False, sharex=True)
    space_start = 0.00
    space_end = 1.0
    steps = 100
    max_y = 5

    def density_curve(target_spn, i):
        # Evaluate the 1-d density of feature i on a grid; the original
        # repeated this block three times verbatim.
        x_vals = np.linspace(space_start, space_end, num=steps)
        ranges = np.array([[None] * i + [NumericRange([[x]])] + [None] * (5 - i)
                           for x in x_vals])
        y_vals = Inference.likelihood(target_spn, data=ranges, dtype=np.float64,
                                      node_likelihood=inference_support_ranges)[:, 0]
        return x_vals, y_vals

    for i in range(5):
        x_vals, y_vals = density_curve(spn, i)
        axes[0][i].plot(x_vals, y_vals)
        axes[0][i].set_title("Method " + str(i) + " All")
        axes[0][i].set_ylim([0, max_y])

    evidence = [None, None, None, None, None, NominalRange([0])]
    prob_no_alarm, spn_no_alarm = spn_for_evidence(spn, evidence,
                                                   node_likelihood=inference_support_ranges,
                                                   distribution_update_ranges=distribution_update_ranges)
    print(prob_no_alarm)
    for i in range(5):
        x_vals, y_vals = density_curve(spn_no_alarm, i)
        axes[0][i].plot(x_vals, y_vals, label="No Alarm", linestyle=":")

    evidence = [None, None, None, None, None, NominalRange([1])]
    prob_alarm, spn_alarm = spn_for_evidence(spn, evidence,
                                             node_likelihood=inference_support_ranges,
                                             distribution_update_ranges=distribution_update_ranges)
    print(prob_alarm)
    for i in range(5):
        x_vals, y_vals = density_curve(spn_alarm, i)
        axes[0][i].plot(x_vals, y_vals, label="Alarm")

    plt.legend()
    plt.tight_layout()
    plt.savefig("pdp.pdf")
    plt.show()

    spn_util.plot_spn(spn, "pval.pdf")
    tmp = get_nodes_with_weight(spn, 5)
    for (weight, node) in tmp:
        print(str(round(node.p[1], 2)) + "\t" + str(weight))
def visualize_Density_2d(spn):
    from spn.experiments.AQP.Ranges import NominalRange, NumericRange
    from spn.algorithms import Inference
    from simple_spn.InferenceRange import categorical_likelihood_range, gaussian_likelihood_range
    from simple_spn.UpdateRange import categorical_update_range
    from spn.structure.Base import Sum, Product
    from spn.algorithms.Inference import sum_likelihood, prod_likelihood
    from spn.structure.leaves.parametric.Parametric import Gaussian, Categorical
    import matplotlib.pyplot as plt

    distribution_update_ranges = {Gaussian: None,
                                  Categorical: categorical_update_range}
    inference_support_ranges = {Gaussian: gaussian_likelihood_range,
                                Categorical: categorical_likelihood_range,
                                Sum: sum_likelihood,
                                Product: prod_likelihood}

    _, axes = plt.subplots(1, 3, figsize=(15, 10), squeeze=False, sharey=False, sharex=True)
    x_vals = np.linspace(0, 1, num=50)
    y_vals = np.linspace(0, 1, num=50)
    X, Y = np.meshgrid(x_vals, y_vals)

    def density_grid(target_spn):
        # Evaluate the joint density of features 0 and 1 on the grid,
        # clipped at 5; the original repeated this block three times.
        vals = []
        for y_val in y_vals:
            ranges = np.array([[NumericRange([[x_val]]), NumericRange([[y_val]]),
                                None, None, None, None] for x_val in x_vals])
            densities = Inference.likelihood(target_spn, data=ranges, dtype=np.float64,
                                             node_likelihood=inference_support_ranges)[:, 0]
            vals.append(np.minimum(densities, 5))
        return np.array(vals)

    axes[0][0].contour(X, Y, density_grid(spn))
    axes[0][0].set_xlabel("Method1")
    axes[0][0].set_ylabel("Method2")
    axes[0][0].set_title("Overall")

    evidence = [None, None, None, None, None, NominalRange([0])]
    prob_no_alarm, spn_no_alarm = spn_for_evidence(spn, evidence,
                                                   node_likelihood=inference_support_ranges,
                                                   distribution_update_ranges=distribution_update_ranges)
    print(prob_no_alarm)
    axes[0][1].contour(X, Y, density_grid(spn_no_alarm))
    axes[0][1].set_xlabel("Method1")
    axes[0][1].set_ylabel("Method2")
    axes[0][1].set_title("No epidemic")

    evidence = [None, None, None, None, None, NominalRange([1])]
    prob_alarm, spn_alarm = spn_for_evidence(spn, evidence,
                                             node_likelihood=inference_support_ranges,
                                             distribution_update_ranges=distribution_update_ranges)
    print(prob_alarm)
    axes[0][2].contour(X, Y, density_grid(spn_alarm))
    axes[0][2].set_xlabel("Method1")
    axes[0][2].set_ylabel("Method2")
    axes[0][2].set_title("Epidemic")

    plt.savefig("cdp.pdf")
    plt.show()
def visualized_target_based_expected_sub_populations(spn, target_id, value_dict=None, top=None,
                                                     rang=None, numeric_prec=10, save_path=None):
    if value_dict is None:
        value_dict = fn.generate_adhoc_value_dict(spn)
    if rang is not None:
        # marg_rang returns (prob, spn); the original dropped the tuple unpacking here.
        _, spn = fn.marg_rang(spn, rang)

    n_vals = len(value_dict[target_id][2])
    ps = []
    all_lines = []
    for v in range(n_vals):
        tmp_rang = [None] * (np.max(spn.scope) + 1)
        tmp_rang[target_id] = NominalRange([v])
        p, spn1 = fn.marg_rang(spn, tmp_rang)
        ps.append(p)

        sub_pops = fn.get_sub_populations(spn1, sort=True, top=top)
        sub_pops = [[p * p1, dists] for p1, dists in sub_pops]

        # Map each sub-population to a normalized [0, 1] coordinate per feature.
        lines = []
        for [p_sub, dists] in sub_pops:
            line = []
            for dist in dists:
                f_id = dist.scope[0]
                rang_all = [None] * (np.max(spn.scope) + 1)
                expect = fn.expect(dist, f_id, rang_all)
                if value_dict[f_id][0] == "discrete":
                    y_val = np.linspace(0, 1, len(value_dict[f_id][2]))[int(expect)]
                elif value_dict[f_id][0] == "numeric":
                    mi, ma = value_dict[f_id][2][0], value_dict[f_id][2][1]
                    y_val = (expect - mi) / (ma - mi)
                else:
                    raise Exception("Unknown attribute-type: " + str(value_dict[f_id][0]))
                line.append(y_val)
            lines.append([p_sub, line])
        all_lines.append(lines)

    fig, axes = plt.subplots(n_vals, 1, figsize=(16, 6 * n_vals), squeeze=False)
    for row, lines in enumerate(all_lines):
        plot = axes[row][0]
        plot.set_yticklabels([])
        for [p_line, line] in lines:
            x_vals = []
            y_vals = []
            for seg in range(len(line) - 1):
                y_val = line[seg]
                next_y_val = line[seg + 1]
                for r in np.linspace(0, 1, numeric_prec):
                    x_vals.append(seg + r)
                    y_vals.append(y_val + (next_y_val - y_val) * r + np.random.normal() * 0.025)
            plot.plot(x_vals, y_vals, linewidth=p_line * 100)

        x_feature_ids = sorted(set(spn.scope) - {target_id})
        plot.set_xticks(np.arange(len(x_feature_ids)))
        if value_dict is not None:
            plot.set_xticklabels([value_dict[scope][1] for scope in x_feature_ids])

        for j, feature_id in enumerate(x_feature_ids):
            if value_dict[feature_id][0] == "discrete":
                for k, y_val in enumerate(np.linspace(0, 1, len(value_dict[feature_id][2]))):
                    plot.text(j, y_val, value_dict[feature_id][2][k])
            elif value_dict[feature_id][0] == "numeric":
                mi, ma = value_dict[feature_id][2][0], value_dict[feature_id][2][1]
                for y_val in np.linspace(0, 1, 5):
                    plot.text(j, y_val, round(y_val * (ma - mi) + mi, 4))
            else:
                raise Exception("Not implemented for other than discrete or numeric")

    pad_row = 5
    for i, (ax, p) in enumerate(zip(axes[:, 0], ps)):
        info = value_dict[target_id][1] + "=" + value_dict[target_id][2][i] + \
            " " + str(round(p * 100, 4)) + "%\n"
        ax.annotate(info, xy=(0, 0.5), xytext=(-ax.yaxis.labelpad - pad_row, 0),
                    xycoords=ax.yaxis.label, textcoords='offset points',
                    size='large', ha='right', va='center')

    plt.tight_layout()
    fig.subplots_adjust(left=0.15)
    if save_path is None:
        plt.show()
    else:
        plt.savefig(save_path)
def demo_visualize_density():
    # data, parametric_types = real_data.get_p_value_dataset()
    # learn_SPN.create_parametric_spns(data, parametric_types, [0.3], [0.01], folder="p_value_test")

    plot_dir = os.path.dirname(os.path.realpath(__file__)) + \
        "/../../../_plots/interpretability/blackbox/"

    loc = "_spns"
    ident = "rdc=" + str(0.3) + "_mis=" + str(0.01)
    spn, _ = io.load(ident, "p_value_test", loc)
    value_dict = real_data.get_p_value_test_value_dict()

    visualize_density(spn, value_dict, rang=None, max_density=10,
                      save_path=plot_dir + "density1.pdf")

    rang = [None] * 5 + [NominalRange([0])]
    visualize_density(spn, value_dict, rang=rang, max_density=10,
                      save_path=plot_dir + "density2.pdf")

    visualize_density_target(spn, 5, value_dict, rang=None, max_density=10,
                             save_path=plot_dir + "density3.pdf")

    spn, _ = io.load(ident, "titanic", loc)
    value_dict = real_data.get_titanic_value_dict()

    visualize_density(spn, value_dict, max_density=0.1,
                      save_path=plot_dir + "density5.pdf")
    visualize_density_target(spn, 0, value_dict, max_density=0.1,
                             save_path=plot_dir + "density6.pdf")
    visualize_density_target(spn, 2, value_dict, max_density=0.1,
                             save_path=plot_dir + "density7.pdf")

    # NOTE: this rang was constructed but never passed in the original call.
    rang = [None] * 2 + [NominalRange([0])] + [None] * 5
    visualize_density_target(spn, 0, value_dict, max_density=0.1,
                             save_path=plot_dir + "density8.pdf")
def visualize_target_based_overall_distribution_single(spn, target_id, value_dict=None, rang=None,
                                                       numeric_prec=50, save_path=None):
    if value_dict is None:
        value_dict = fn.generate_adhoc_value_dict(spn)
    if rang is not None:
        assert rang[target_id] is None
        _, spn = fn.marg_rang(spn, rang)

    n_vals = len(value_dict[target_id][2])
    ncols = len(spn.scope) - 1
    nrows = n_vals
    fig, axes = plt.subplots(nrows, ncols, figsize=(ncols * 3, nrows * 2), squeeze=False)

    ps = []
    for v in range(n_vals):
        tmp_rang = [None] * (np.max(spn.scope) + 1)
        tmp_rang[target_id] = NominalRange([v])
        p, spn1 = fn.marg_rang(spn, tmp_rang)
        ps.append(p)
        overall_population = fn.get_overall_population(spn1, value_dict=value_dict,
                                                       numeric_prec=numeric_prec)

        for i, f_id in enumerate(sorted(spn1.scope)):
            dist = overall_population[f_id]
            if dist["feature_type"] == "discrete":
                viz_helper.bar_plot(axes[v][i], dist["y_means"], dist["x_labels"],
                                    y_err=np.sqrt(dist["y_vars"]),
                                    y_label="probability", ylim=[0, 1])
            elif dist["feature_type"] == "numeric":
                viz_helper.line_plot(axes[v][i], dist["x_vals"], dist["y_means"],
                                     y_errs=np.sqrt(dist["y_vars"]), y_label="density")
            else:
                # dist is a dict here; the original wrongly accessed dist.scope.
                raise Exception("Unknown attribute-type: " + str(dist["feature_type"]))

    pad_col = 5
    feature_names = [value_dict[x][1] for x in sorted(spn1.scope)]
    for ax, col in zip(axes[0], feature_names):
        ax.annotate(col, xy=(0.5, 1), xytext=(0, pad_col), xycoords='axes fraction',
                    textcoords='offset points', size='large', ha='center', va='baseline')

    pad_row = 5
    for i, p in enumerate(ps):
        axes[i][0].annotate(str(round(p * 100, 4)) + "%\n" + value_dict[target_id][1] +
                            "=" + value_dict[target_id][2][i],
                            xy=(0, 0.5), xytext=(-axes[i][0].yaxis.labelpad - pad_row, 0),
                            xycoords=axes[i][0].yaxis.label, textcoords='offset points',
                            size='large', ha='right', va='center')

    plt.tight_layout()
    fig.subplots_adjust(left=0.15, top=0.9)
    if save_path is None:
        plt.show()
    else:
        plt.savefig(save_path)
def visualize_target_based_overall_distribution_compact(spn, target_id, value_dict=None, rang=None,
                                                        numeric_prec=50, save_path=None):
    if value_dict is None:
        value_dict = fn.generate_adhoc_value_dict(spn)
    if rang is not None:
        assert rang[target_id] is None
        _, spn = fn.marg_rang(spn, rang)

    n_vals = len(value_dict[target_id][2])
    ncols = len(spn.scope) - 1
    nrows = 1
    fig, axes = plt.subplots(nrows, ncols, figsize=(ncols * 3, nrows * 3), squeeze=False)

    ps = []
    plot_data = {f_id: [] for f_id in spn.scope if f_id != target_id}
    for v in range(n_vals):
        tmp_rang = [None] * (np.max(spn.scope) + 1)
        tmp_rang[target_id] = NominalRange([v])
        p, spn1 = fn.marg_rang(spn, tmp_rang)
        ps.append(p)
        overall_population = fn.get_overall_population(spn1, value_dict=value_dict,
                                                       numeric_prec=numeric_prec)
        for f_id in spn1.scope:
            plot_data[f_id].append(overall_population[f_id])

    for i, f_id in enumerate(plot_data):
        if value_dict[f_id][0] == "discrete":
            y_means = []
            y_errs = []
            legend_labels = []
            for j, dist in enumerate(plot_data[f_id]):
                y_means.append(dist["y_means"])
                y_errs.append(dist["y_vars"])
                legend_labels.append(str(value_dict[target_id][1]) + "=" +
                                     str(value_dict[target_id][2][j]))
            viz_helper.multiple_bar_plot(axes[0][i], y_means, dist["x_labels"],
                                         y_errs=np.sqrt(y_errs), legend_labels=legend_labels,
                                         y_label="probability", ylim=[0, 1])
        elif value_dict[f_id][0] == "numeric":
            for j, dist in enumerate(plot_data[f_id]):
                viz_helper.line_plot(axes[0][i], dist["x_vals"], dist["y_means"],
                                     y_errs=np.sqrt(dist["y_vars"]),
                                     label=str(value_dict[target_id][1]) + "=" +
                                     str(value_dict[target_id][2][j]),
                                     y_label="density")
        else:
            raise Exception("Unknown attribute-type: " + str(value_dict[f_id][0]))

    pad_col = 5
    feature_names = [value_dict[x][1] for x in sorted(spn1.scope)]
    for ax, col in zip(axes[0], feature_names):
        ax.annotate(col, xy=(0.5, 1), xytext=(0, pad_col), xycoords='axes fraction',
                    textcoords='offset points', size='large', ha='center', va='baseline')

    pad_row = 5
    info = ""
    for i, prob in enumerate(ps):
        info += str(value_dict[target_id][1]) + "=" + str(value_dict[target_id][2][i]) + \
            " " + str(round(prob * 100, 4)) + "%\n"
    axes[0][0].annotate(info, xy=(0, 0.5),
                        xytext=(-axes[0][0].yaxis.labelpad - pad_row, 0),
                        xycoords=axes[0][0].yaxis.label, textcoords='offset points',
                        size='large', ha='right', va='center')

    axes[0][0].legend(loc='upper center', bbox_to_anchor=(0.5, -0.25))
    plt.tight_layout()
    fig.subplots_adjust(left=0.15, top=0.9)
    if save_path is None:
        plt.show()
    else:
        plt.savefig(save_path)
def visualize_density_target(spn, target_id, value_dict, rang=None, n_steps=50,
                             max_density=None, save_path=None):
    # Only select numeric features
    selected_features = []
    for feature_id in spn.scope:
        if value_dict[feature_id][0] == "numeric":
            selected_features.append(feature_id)

    # Create ranges: one grid of evidence per target value
    if rang is None:
        rang = np.array([None] * (max(spn.scope) + 1))

    results = []
    assert value_dict[target_id][0] == "discrete"
    for v in value_dict[target_id][2]:
        rang[target_id] = NominalRange([v])
        ranges = []
        for feature_id in selected_features:
            for x_val in np.linspace(value_dict[feature_id][2][0],
                                     value_dict[feature_id][2][1], num=n_steps):
                n_rang = rang.copy()
                n_rang[feature_id] = NumericRange([[x_val]])
                ranges.append(n_rang)
        results.append(fn.probs(spn, np.array(ranges)))

    # Visualize
    ncols = len(results)
    nrows = len(selected_features)
    _, axes = plt.subplots(nrows, ncols, figsize=(16, 6 * len(selected_features)),
                           squeeze=False, sharey=True, sharex=False)
    for j, res in enumerate(results):
        for i, feature_id in enumerate(selected_features):
            plot = axes[i][j]
            x_vals = np.linspace(value_dict[feature_id][2][0],
                                 value_dict[feature_id][2][1], num=n_steps)
            y_vals = res[n_steps * i: n_steps * i + n_steps]
            plot.plot(x_vals, y_vals)
            if max_density is not None:
                plot.set_ylim(0, max_density)
            plot.set_title(value_dict[feature_id][1] + " - " +
                           value_dict[target_id][1] + "=" + value_dict[target_id][2][j])

    plt.tight_layout()
    if save_path is None:
        plt.show()
    else:
        plt.savefig(save_path)
def evaluate_discrete_leaf(leaf, f_vals):
    # Build one range row per requested value of the leaf's (single) feature.
    f_id = leaf.scope[0]
    ranges = np.array([f_id * [None] + [NominalRange([x])] for x in f_vals])
    return probs(leaf, ranges)
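# Example for evaluate_discrete_leaf(): probability of each requested value
# under a single discrete leaf. The Categorical leaf below is hypothetical.
def demo_evaluate_discrete_leaf():
    from spn.structure.leaves.parametric.Parametric import Categorical
    leaf = Categorical(p=[0.3, 0.7], scope=[2])
    print(evaluate_discrete_leaf(leaf, f_vals=[0, 1]))  # approx. [0.3, 0.7]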
from spn.structure.Base import Sum, Product
from spn.algorithms.Inference import sum_likelihood, prod_likelihood
from spn.structure.leaves.parametric.Parametric import Gaussian, Categorical
from spn.structure.leaves.parametric.InferenceRange import categorical_likelihood_range
from simple_spn.internal.UpdateRange import categorical_update_range

inference_support_ranges = {Gaussian: None,
                            Categorical: categorical_likelihood_range,
                            Sum: sum_likelihood,
                            Product: prod_likelihood}
distribution_update_ranges = {Gaussian: None,
                              Categorical: categorical_update_range}

# flat_spn and spn_for_evidence are provided by the surrounding module.
evidence = [None, NominalRange([1]), None, None]
prob, pos_spn = spn_for_evidence(flat_spn, evidence,
                                 node_likelihood=inference_support_ranges,
                                 distribution_update_ranges=distribution_update_ranges)
# spn_util.plot_spn(pos_spn, "positive_flat_rule_spn.pdf")

evidence = [None, NominalRange([0]), None, None]
prob, neg_spn = spn_for_evidence(flat_spn, evidence,
                                 node_likelihood=inference_support_ranges,
                                 distribution_update_ranges=distribution_update_ranges)
# spn_util.plot_spn(neg_spn, "negative_flat_rule_spn.pdf")
def extract_rules(spn, feature_id=1):
    from spn.experiments.AQP.Ranges import NominalRange
    from spn.algorithms import Inference
    from simple_spn.internal.InferenceRange import categorical_likelihood_range
    from spn.structure.Base import Sum, Product
    from spn.algorithms.Inference import sum_likelihood, prod_likelihood
    from spn.structure.leaves.parametric.Parametric import Categorical

    inference_support_ranges = {Categorical: categorical_likelihood_range,
                                Sum: sum_likelihood,
                                Product: prod_likelihood}

    freq_items = get_frequent_items(spn, min_support=0.0)
    # Optionally keep only itemsets that mention feature_id:
    # freq_items = filter(lambda x: any(cond[0] == feature_id for cond in x[1]), freq_items)
    freq_items_sorted = sorted(freq_items, key=lambda x: x[0], reverse=True)

    feature_dict = {0: ("g", ("m ", "w ")),
                    1: ("c", ("no ", "yes")),
                    2: ("s", ("no ", "yes")),
                    3: ("w", ("no ", "yes"))}

    # Compare the counted support of each frequent set with the SPN's estimate.
    freq_sets = []
    for (sup, conds) in freq_items_sorted:
        str_conds = []
        ranges = [None] * len(spn.scope)
        for cond in conds:
            ranges[cond[0]] = NominalRange([cond[1]])
            str_conds.append(feature_dict[cond[0]][0] + "=" + feature_dict[cond[0]][1][cond[1]])
        ranges = np.array([ranges])
        sup_spn = Inference.likelihood(spn, data=ranges, dtype=np.float64,
                                       node_likelihood=inference_support_ranges)[:, 0][0]
        freq_sets.append(["(" + ", ".join(str_conds) + ")", sup, sup_spn])

    rules = sorted(freq_sets, key=lambda x: x[2], reverse=True)
    rule_df = pd.DataFrame(rules, columns=["frequent set", "s_support", "g_support"])
    io.print_pretty_table(rule_df.head(400))
    exit()  # NOTE: early exit kept from the original; the rule scoring below is unreachable.

    rules = []
    for (sup, conds) in freq_items_sorted:
        rule_body = []
        rule_head = []
        conf = np.nan
        ranges = [None] * len(spn.scope)
        for cond in conds:
            if cond[0] == feature_id:
                rule_head.append(feature_dict[cond[0]][0] + "=" + feature_dict[cond[0]][1][cond[1]])
            else:
                rule_body.append(feature_dict[cond[0]][0] + "=" + feature_dict[cond[0]][1][cond[1]])
            ranges[cond[0]] = NominalRange([cond[1]])

        # Optimization possible: both likelihood calls share most of the evidence.
        ranges = np.array([ranges])
        prob_with_feature = Inference.likelihood(spn, data=ranges, dtype=np.float64,
                                                 node_likelihood=inference_support_ranges)[:, 0][0]
        ranges[0][feature_id] = None
        prob_without_feature = Inference.likelihood(spn, data=ranges, dtype=np.float64,
                                                    node_likelihood=inference_support_ranges)[:, 0][0]
        spn_sup = prob_without_feature
        spn_conf = prob_with_feature / prob_without_feature
        rules.append([" AND ".join(rule_body) + " --> " + " AND ".join(rule_head),
                      sup, conf, spn_sup, spn_conf, spn_sup * spn_conf])

    rules = sorted(rules, key=lambda x: x[5], reverse=True)
    rule_df = pd.DataFrame(rules, columns=["Rule", "c_Support", "c_Confidence",
                                           "spn_Support", "spn_Confidence", "score"])
    # rule_df.drop_duplicates(subset=["Rule"], keep="first", inplace=True)
    io.print_pretty_table(rule_df.head(400))
from spn.experiments.AQP.Ranges import NominalRange, NumericRange

# Import inference
from spn.algorithms import Inference
from spn.algorithms.Inference import sum_likelihood, prod_likelihood

inference_support_ranges = {PiecewiseLinear: piecewise_likelihood_range,
                            Categorical: categorical_likelihood_range,
                            IdentityNumeric: identity_likelihood_range,
                            Sum: sum_likelihood,
                            Product: prod_likelihood}

# Use None instead of np.nan
ranges = np.array([
    [None, None, None],                                                   # without any conditions
    [NominalRange([0]), None, None],                                      # only male
    [NominalRange([0]), NominalRange([1]), None],                         # only male and student
    [NominalRange([0]), NominalRange([1]), NumericRange([[21, 100]])],    # only male and student and older than 21
    [NominalRange([0]), NominalRange([1]), NumericRange([[10, 15], [25, 100]])],  # only male and student and age between 10 and 15 or 25 and 100
])

probabilities = Inference.likelihood(root_node, ranges, dtype=np.float64,
                                     node_likelihood=inference_support_ranges)
print("Probabilities:")
print(probabilities)
print()

# Sampling for given ranges
from spn.algorithms import SamplingRange
from spn.structure.leaves.piecewise.SamplingRange import sample_piecewise_node
x = [0., 1., 2., 3., 4.]
y = [0., 0., 0., 10., 0.]
x, y = np.array(x), np.array(y)
auc = np.trapz(y, x)
y = y / auc  # normalize to a proper density
node4 = PiecewiseLinear(x_range=x, y_range=y, bin_repr_points=x[1:-1], scope=[1])

root_node = 0.49 * (node1 * node3) + 0.51 * (node2 * node4)

# Set context
# meta_types = [MetaType.DISCRETE, MetaType.REAL]
# domains = [[0, 1], [0., 4.]]
# ds_context = Context(meta_types=meta_types, domains=domains)

inference_support_ranges = {PiecewiseLinear: piecewise_likelihood_range,
                            Categorical: categorical_likelihood_range}
node_sample = {Categorical: sample_categorical_node,
               PiecewiseLinear: sample_piecewise_node}

ranges = [NominalRange([0]), None]
samples = SamplingRange.sample_instances(root_node, 2, 30, rand_gen, ranges=ranges,
                                         node_sample=node_sample,
                                         node_likelihood=inference_support_ranges)
print("Samples: " + str(samples))

ranges = [NominalRange([0]), NumericRange([[3., 3.1], [3.5, 4.]])]
samples = SamplingRange.sample_instances(root_node, 2, 30, rand_gen, ranges=ranges,
                                         node_sample=node_sample,
                                         node_likelihood=inference_support_ranges)
print("Samples: " + str(samples))
samples = fn.sampling_rang(spn, rang=[None, None, None, None], n_samples=10, random_seed=1)
print(samples)

samples = fn.sampling_rang(spn, rang=[None, None, NumericRange([[10, 11], [29, 30]])],
                           n_samples=10, random_seed=1)
print(samples)

samples = fn.sampling_rang(spn, rang=[NominalRange([0]), None, NumericRange([[14, 15], [29, 30]])],
                           n_samples=10, random_seed=1)
print(samples)

# Test probabilities
rang = [None, None, None]
prob = fn.prob(spn, rang)
print(prob)

rang = [NominalRange([0]), NominalRange([1]), NumericRange([[20]])]
prob = fn.prob(spn, rang)
print(prob)

# Completed from the identical sequence in test_prob above; the original
# fragment was cut off mid-expression.
ranges = np.array([[None, None, NumericRange([[0, 20]])],
                   [NominalRange([0]), None, None],
                   [None, NominalRange([1]), None]])
probs = fn.probs(spn, ranges)
print(probs)