def learn_cond(data, ds_context, scope, cols, rows, min_instances_slice, threshold, ohe):
    """Pick the splitters named by `cols`/`rows` and delegate to `learn_structure`.

    Only ``cols == "ci"`` is accepted. ``rows`` may be ``"rand_hp"`` or
    ``"kmeans"``. `threshold` and `ohe` are part of the signature but are not
    used in this body. `np`, `leaves`, `get_split_rows_KMeans`,
    `get_next_operation` and `learn_structure` are resolved from the
    surrounding scope.

    Returns:
        Whatever `learn_structure` returns for the given data, context,
        splitters, leaf factory, next-operation callable and `scope`.

    Raises:
        ValueError: if `cols` or `rows` names an unsupported method.
    """
    # `cols` is checked first, so an unsupported independence test is
    # reported before `rows` is even looked at.
    if cols != "ci":
        raise ValueError('invalid independence test')

    from spn.algorithms.splitting.RCoT import getCIGroup

    # Fixed seed 17. Original note next to this call: (data, scope, threshold)
    col_splitter = getCIGroup(np.random.RandomState(17))

    if rows == "rand_hp":
        from spn.algorithms.splitting.Random import get_split_rows_random_partition

        # Fixed seed 17. Original note next to this call: (data, scope, threshold)
        row_splitter = get_split_rows_random_partition(np.random.RandomState(17))
    elif rows == "kmeans":
        row_splitter = get_split_rows_KMeans()
    else:
        # todo add other clustering?
        raise ValueError('invalid clustering method')

    next_op = get_next_operation(min_instances_slice)
    return learn_structure(data, ds_context, row_splitter, col_splitter, leaves, next_op, scope)
def l_mspn(data, ds_context, cols, rows, min_instances_slice, threshold, ohe):
    """Resolve the splitters via `get_splitting_functions` and run `learn_structure`.

    `rand_gen`, `cpus` and `leaves` are not parameters; they are resolved from
    the surrounding scope, as are the helper functions called here.

    Returns:
        The result of `learn_structure`.
    """
    # get_splitting_functions hands back the pair as (column splitter, row splitter).
    col_splitter, row_splitter = get_splitting_functions(
        cols, rows, ohe, threshold, rand_gen, cpus
    )
    next_op = get_next_operation(min_instances_slice)

    # learn_structure takes the row splitter before the column splitter.
    return learn_structure(data, ds_context, row_splitter, col_splitter, leaves, next_op)
def learn_param(data, ds_context, cols, rows, min_instances_slice, threshold, ohe):
    """Resolve the splitters via `get_splitting_functions` and run `learn_structure`.

    Besides `min_instances_slice`, the next-operation callable is configured
    with `min_features_slice`, `multivariate_leaf` and `cluster_univariate`.
    Those three, together with `rand_gen`, `cpus` and `leaves`, are resolved
    from the surrounding scope rather than passed in.

    Returns:
        The result of `learn_structure`.
    """
    # get_splitting_functions hands back the pair as (column splitter, row splitter).
    col_splitter, row_splitter = get_splitting_functions(
        cols, rows, ohe, threshold, rand_gen, cpus
    )
    next_op = get_next_operation(
        min_instances_slice,
        min_features_slice,
        multivariate_leaf,
        cluster_univariate,
    )

    # learn_structure takes the row splitter before the column splitter.
    return learn_structure(data, ds_context, row_splitter, col_splitter, leaves, next_op)
def learn(data, ds_context, cols, rows, min_instances_slice, threshold, linear, ohe):
    """Run `learn_structure` with KMeans row splits and histogram leaves.

    Args:
        data: passed through to `learn_structure`.
        ds_context: passed through to `learn_structure`.
        cols: ``"rdc"`` builds the column splitter with `get_split_cols_RDC`;
            for any other value the column splitter stays ``None`` and is
            handed to `learn_structure` as such (unchanged from the original).
        rows: must be ``"kmeans"``.
        min_instances_slice: forwarded to `get_next_operation`.
        threshold, linear, ohe: forwarded to `get_split_cols_RDC`.

    Returns:
        The result of `learn_structure`.

    Raises:
        ValueError: if `rows` is not ``"kmeans"``. Previously the row splitter
            was simply never bound in that case and the call ended in an
            UnboundLocalError.
    """
    # Fail fast with a clear error instead of leaving `split_rows` unbound.
    if rows != "kmeans":
        raise ValueError('invalid clustering method: {!r}'.format(rows))

    split_cols = None
    if cols == "rdc":
        split_cols = get_split_cols_RDC(threshold, ohe, linear)

    split_rows = get_split_rows_KMeans()
    leaves = create_histogram_leaf
    nextop = get_next_operation(min_instances_slice)
    return learn_structure(data, ds_context, split_rows, split_cols, leaves, nextop)
def learn(data, ds_context, cols, rows, min_instances_slice, threshold, linear, ohe):
    """Run `learn_structure` with the Python RDC splitters.

    Args:
        data: passed through to `learn_structure`.
        ds_context: passed through to `learn_structure`.
        cols: ``"rdc"`` builds the column splitter with `get_split_cols_RDC_py`;
            for any other value the column splitter stays ``None`` and is
            handed to `learn_structure` as such (unchanged from the original).
        rows: must be ``"kmeans"``. Note that this value selects
            `get_split_rows_RDC_py` with ``n_clusters=2``, exactly as before.
        min_instances_slice: forwarded to `get_next_operation`.
        threshold: forwarded to `get_split_cols_RDC_py`.
        linear, ohe: accepted but not used; the splitters are always built
            with ``ohe=True``.

    `rand_gen` and `leaves` are resolved from the surrounding scope.

    Returns:
        The result of `learn_structure`.

    Raises:
        ValueError: if `rows` is not ``"kmeans"``. Previously the row splitter
            was never bound in that case and the call ended in an
            UnboundLocalError.
    """
    # Fail fast with a clear error instead of leaving `split_rows` unbound.
    if rows != "kmeans":
        raise ValueError('invalid clustering method: {!r}'.format(rows))

    # Both splitters are configured with the same RDC settings.
    rdc_options = dict(
        ohe=True,
        k=10,
        s=1 / 6,
        non_linearity=np.sin,
        n_jobs=1,
        rand_gen=rand_gen,
    )

    split_cols = None
    if cols == "rdc":
        split_cols = get_split_cols_RDC_py(threshold, **rdc_options)

    split_rows = get_split_rows_RDC_py(n_clusters=2, **rdc_options)
    nextop = get_next_operation(min_instances_slice)
    return learn_structure(data, ds_context, split_rows, split_cols, leaves, nextop)
def learn(data, ds_context, min_instances_slice=200, threshold=0.00000001, linear=False):
    """Run `learn_structure` with KMeans row splits, RDC column splits and histogram leaves.

    Two small adapters are built around `split_cols_RDC` and `next_operation`
    so that `threshold`, `linear` and `min_instances_slice` from this call are
    baked in before they are handed to `learn_structure`.

    Returns:
        The value returned by `learn_structure`.
    """

    def split_cols(data, ds_context, scope):
        # Column split with this call's threshold/linear settings fixed.
        return split_cols_RDC(data, ds_context, scope, threshold=threshold, linear=linear)

    def nextop(
        data,
        no_clusters=False,
        no_independencies=False,
        is_first=False,
        cluster_first=True,
        cluster_univariate=False,
        min_instances_slice=min_instances_slice,
    ):
        # The default of min_instances_slice is captured when `learn` is
        # called; the arguments are forwarded positionally, in this order.
        return next_operation(
            data,
            no_clusters,
            no_independencies,
            is_first,
            cluster_first,
            cluster_univariate,
            min_instances_slice,
        )

    return learn_structure(
        data, ds_context, split_rows_KMeans, split_cols, create_histogram_leaf, nextop
    )
def learn(data, ds_context, min_instances_slice, rand_gen):
    """Run `learn_structure` with binary random-partition splitters.

    The random generator (a ``RandomState`` seeded with 17 when `rand_gen` is
    ``None``) is stored on ``ds_context.rand_gen``. `col_threshold`, `col_a`,
    `col_b`, `row_a` and `row_b` are resolved from the surrounding scope.

    Returns:
        The value returned by `learn_structure`.
    """
    # Side effect: the context object carries the generator from here on.
    ds_context.rand_gen = np.random.RandomState(17) if rand_gen is None else rand_gen

    col_splitter = get_split_cols_binary_random_partition(
        threshold=col_threshold, beta_a=col_a, beta_b=col_b
    )
    row_splitter = get_split_rows_binary_random_partition(beta_a=row_a, beta_b=row_b)

    # The original keeps create_random_parametric_leaf as a commented-out alternative.
    leaf_factory = create_random_unconstrained_type_mixture_leaf

    next_op = get_next_operation(min_instances_slice)
    return learn_structure(data, ds_context, row_splitter, col_splitter, leaf_factory, next_op)
def learn_param(data, ds_context, cols, rows, min_instances_slice, threshold, ohe, initial_scope, l_rfft, is_2d):
    """Resolve the splitters via `get_splitting_functions` and run `learn_structure`.

    `initial_scope` is passed to `learn_structure` positionally after the
    next-operation callable; `l_rfft` and `is_2d` are forwarded by keyword.
    `rand_gen`, `cpus`, `leaves`, `min_features_slice` and `multivariate_leaf`
    are resolved from the surrounding scope.

    Returns:
        The result of `learn_structure`.
    """
    # get_splitting_functions hands back the pair as (column splitter, row splitter).
    col_splitter, row_splitter = get_splitting_functions(
        cols, rows, ohe, threshold, rand_gen, cpus
    )
    next_op = get_next_operation(min_instances_slice, min_features_slice, multivariate_leaf)

    return learn_structure(
        data,
        ds_context,
        row_splitter,
        col_splitter,
        leaves,
        next_op,
        initial_scope,
        l_rfft=l_rfft,
        is_2d=is_2d,
    )
def learn_param(data, ds_context, cols, rows, min_instances_slice, threshold, ohe):
    """Build the splitters selected by `cols`/`rows` and run `learn_structure`.

    Args:
        data: passed through to `learn_structure`.
        ds_context: passed through to `learn_structure`.
        cols: must be ``"rdc"`` (uses `get_split_cols_RDC_py`).
        rows: ``"rdc"`` (uses `get_split_rows_RDC_py`) or ``"kmeans"``
            (uses `get_split_rows_KMeans`).
        min_instances_slice: forwarded to `get_next_operation`.
        threshold: forwarded to `get_split_cols_RDC_py`.
        ohe: forwarded to the RDC splitters.

    `rand_gen`, `cpus` and `leaves` are resolved from the surrounding scope.

    Returns:
        The result of `learn_structure`.

    Raises:
        ValueError: if `cols` or `rows` is not one of the supported values.
            Previously the corresponding splitter was never bound and the
            call ended in an UnboundLocalError.
    """
    # Validate both selectors before building anything, so an unsupported
    # value is reported clearly instead of surfacing as an unbound local.
    if cols != "rdc":
        raise ValueError('invalid independence test: {!r}'.format(cols))
    if rows not in ("rdc", "kmeans"):
        raise ValueError('invalid clustering method: {!r}'.format(rows))

    split_cols = get_split_cols_RDC_py(threshold, rand_gen=rand_gen, ohe=ohe, n_jobs=cpus)

    if rows == "rdc":
        split_rows = get_split_rows_RDC_py(rand_gen=rand_gen, ohe=ohe, n_jobs=cpus)
    else:
        split_rows = get_split_rows_KMeans()

    nextop = get_next_operation(min_instances_slice)
    return learn_structure(data, ds_context, split_rows, split_cols, leaves, nextop)
def l_mspn_missing(data, ds_context, cols, rows, min_instances_slice, threshold, linear, ohe):
    """Build the splitters selected by `cols`/`rows` and run `learn_structure`.

    Args:
        data: passed through to `learn_structure`.
        ds_context: passed through to `learn_structure`.
        cols: must be ``"rdc"`` (uses `get_split_cols_RDC_py`).
        rows: ``"rdc"`` (uses `get_split_rows_RDC_py`) or ``"kmeans"``
            (uses `get_split_rows_KMeans`).
        min_instances_slice: forwarded to `get_next_operation`.
        threshold: forwarded to `get_split_cols_RDC_py`.
        linear: accepted but not used in this body.
        ohe: forwarded to the RDC splitters.

    `rand_gen`, `cpus` and `leaves` are resolved from the surrounding scope.
    When `leaves` is ``None``, `create_histogram_leaf` is used instead.

    Returns:
        The result of `learn_structure`.

    Raises:
        ValueError: if `cols` or `rows` is not one of the supported values.
            Previously the corresponding splitter was never bound and the
            call ended in an UnboundLocalError.
    """
    # Validate both selectors before building anything, so an unsupported
    # value is reported clearly instead of surfacing as an unbound local.
    if cols != "rdc":
        raise ValueError('invalid independence test: {!r}'.format(cols))
    if rows not in ("rdc", "kmeans"):
        raise ValueError('invalid clustering method: {!r}'.format(rows))

    split_cols = get_split_cols_RDC_py(threshold, rand_gen=rand_gen, ohe=ohe, n_jobs=cpus)

    if rows == "rdc":
        split_rows = get_split_rows_RDC_py(rand_gen=rand_gen, ohe=ohe, n_jobs=cpus)
    else:
        split_rows = get_split_rows_KMeans()

    # The original assigned to `leaves` here, which makes `leaves` local to
    # this function and turns the `is None` test into a read of an unbound
    # local. Binding the result to a new name keeps `leaves` an outer lookup.
    leaf_factory = create_histogram_leaf if leaves is None else leaves

    nextop = get_next_operation(min_instances_slice)
    return learn_structure(data, ds_context, split_rows, split_cols, leaf_factory, nextop)
def learn(data, ds_context, min_instances_slice, threshold, linear, ohe, rand_gen=None):
    """Run `learn_structure` with the Python RDC splitters and type leaves.

    The random generator (a ``RandomState`` seeded with 17 when `rand_gen` is
    ``None``) is stored on ``ds_context.rand_gen`` and shared by both
    splitters. `linear` and `ohe` are accepted but not used: the splitters
    are always built with ``ohe=True``.

    Returns:
        The value returned by `learn_structure`.
    """
    generator = np.random.RandomState(17) if rand_gen is None else rand_gen
    # Side effect: the context object carries the generator from here on.
    ds_context.rand_gen = generator

    # FIXME (kept from the original): adopt the python version of RDC,
    # allowing to deal with missing values. The non-`_py` splitters
    # get_split_cols_RDC / get_split_rows_RDC were the earlier choice.
    rdc_options = dict(
        ohe=True,
        k=10,
        s=1 / 6,
        non_linearity=np.sin,
        n_jobs=1,
        rand_gen=generator,
    )
    col_splitter = get_split_cols_RDC_py(threshold, **rdc_options)
    row_splitter = get_split_rows_RDC_py(n_clusters=2, **rdc_options)

    leaf_factory = create_type_leaf
    next_op = get_next_operation(min_instances_slice)
    return learn_structure(data, ds_context, row_splitter, col_splitter, leaf_factory, next_op)
@author: Alejandro Molina
'''
from spn.algorithms import Inference
from spn.algorithms.StructureLearning import learn_structure
from spn.algorithms.splitting.Clustering import get_split_rows_KMeans
from spn.algorithms.splitting.RDC import get_split_cols_RDC
from spn.data.datasets import get_nips_data
from spn.structure.Base import Context
from spn.structure.leaves.Histograms import add_domains, create_histogram_leaf

# Script entry point: load the NIPS data, learn a structure over it and print
# the likelihood of the first 100 rows.
if __name__ == '__main__':
    import numpy as np

    # Only `words` and `data` are used below; the other returned values are ignored.
    ds_name, words, data, train, _, statistical_type, _ = get_nips_data()

    print(words)
    print(data)

    # Every column is declared "discrete" before the domains are added to the context.
    ds_context = Context()
    ds_context.statistical_type = np.asarray(["discrete"] * data.shape[1])

    add_domains(data, ds_context)

    # Both splitter factories are called with their default arguments.
    spn = learn_structure(data, ds_context, get_split_rows_KMeans(), get_split_cols_RDC(), create_histogram_leaf)

    # print(to_str_equation(spn, words))

    print(Inference.likelihood(spn, data[0:100, :]))
def build_spn(numpy_data, feature_types, spn_params, rand_gen):
    """Learn an SPN over `numpy_data` using RDC splitters and custom leaves.

    Args:
        numpy_data: tabular data; it is converted to a float64 numpy array.
        feature_types: one entry per column, ``"discrete"`` or
            ``"continuous"``.
        spn_params: mapping that must contain the keys ``"rdc_threshold"``,
            ``"cols"``, ``"rows"``, ``"min_instances_slice"``, ``"ohe"``,
            ``"prior_weight"`` and ``"identity_numeric"``.
        rand_gen: random generator forwarded to the RDC splitters.

    `np`, `MetaType` and `Context` are resolved from the surrounding module.

    Returns:
        The root node returned by `learn_structure`.

    Raises:
        KeyError: if a required key is missing from `spn_params`.
        ValueError: if ``spn_params["cols"]`` or ``spn_params["rows"]`` is not
            ``"rdc"``. Previously the matching splitter was never bound and
            the call ended in an UnboundLocalError after all other work.
        Exception: if `feature_types` contains an unknown feature type, or if
            a leaf is requested for an unsupported meta type.
    """
    # Read every setting up front so a bad configuration fails before any
    # data is copied or any spn module is imported.
    rdc_threshold = spn_params["rdc_threshold"]
    cols = spn_params["cols"]
    rows = spn_params["rows"]
    min_instances_slice = spn_params["min_instances_slice"]
    ohe = spn_params["ohe"]
    prior_weight = spn_params["prior_weight"]
    identity_numeric = spn_params["identity_numeric"]

    # Only the RDC splitters are wired up below; reject anything else here
    # instead of leaving split_cols / split_rows unbound.
    if cols != "rdc":
        raise ValueError("Unknown column splitting method for SPN: " + str(cols))
    if rows != "rdc":
        raise ValueError("Unknown row splitting method for SPN: " + str(rows))

    from spn.algorithms.StructureLearning import get_next_operation, learn_structure
    from spn.algorithms.splitting.RDC import get_split_cols_RDC_py, get_split_rows_RDC_py
    from spn.structure.leaves.parametric.Parametric import Categorical
    from spn.structure.leaves.piecewise.PiecewiseLinear import create_piecewise_leaf
    from spn.experiments.AQP.leaves.identity.IdentityNumeric import create_identity_leaf

    # cast may not be necessary
    numpy_data = np.array(numpy_data, np.float64)

    # Generate meta_type array
    meta_types = []
    for feature_type in feature_types:
        if feature_type == "discrete":
            meta_types.append(MetaType.DISCRETE)
        elif feature_type == "continuous":
            meta_types.append(MetaType.REAL)
        else:
            raise Exception("Unknown feature type for SPN: " + feature_type)

    # Create information about the domains: [min, max] for continuous
    # columns, the distinct values for discrete ones.
    domains = []
    for col in range(numpy_data.shape[1]):
        feature_type = feature_types[col]
        if feature_type == 'continuous':
            domains.append([np.min(numpy_data[:, col]), np.max(numpy_data[:, col])])
        elif feature_type in {'discrete', 'categorical'}:
            domains.append(np.unique(numpy_data[:, col]))

    # Create context
    ds_context = Context(meta_types=meta_types, domains=domains)

    # Method to create leaves in the SPN
    def create_leaf(data, ds_context, scope):
        idx = scope[0]
        meta_type = ds_context.meta_types[idx]

        if meta_type == MetaType.REAL:
            if identity_numeric:
                return create_identity_leaf(data, scope)
            # A prior weight of exactly zero is handed over as None.
            piecewise_prior = None if prior_weight == 0. else prior_weight
            return create_piecewise_leaf(data, ds_context, scope, prior_weight=piecewise_prior)

        if meta_type == MetaType.DISCRETE:
            unique, counts = np.unique(data[:, 0], return_counts=True)

            # Each observed value is used directly as an index into the count
            # vector, whose length is the number of domain values.
            # NOTE(review): this presumes discrete values are encoded as
            # 0..len(domain)-1 - confirm against the callers.
            sorted_counts = np.zeros(len(ds_context.domains[idx]), dtype=np.float64)
            for value, count in zip(unique, counts):
                sorted_counts[int(value)] = count

            p = sorted_counts / data.shape[0]

            # Do regularization
            if prior_weight > 0.:
                p += prior_weight
            p = p / np.sum(p)

            return Categorical(p, scope)

        raise Exception("Mehtod learn_mspn_for_aqp(...) cannot create leaf for " + str(meta_type))

    # Set method to create leaves
    leaves = create_leaf

    # Set methods to cluster and to do the independence test
    # (cols and rows were validated to be "rdc" above).
    split_cols = get_split_cols_RDC_py(rdc_threshold, ohe=ohe, k=10, s=1 / 6, non_linearity=np.sin, n_jobs=1, rand_gen=rand_gen)
    split_rows = get_split_rows_RDC_py(n_clusters=2, ohe=ohe, k=10, s=1 / 6, non_linearity=np.sin, n_jobs=1, rand_gen=rand_gen)

    # This chooses which operation is performed
    nextop = get_next_operation(min_instances_slice)

    # Learn the SPN
    root_node = learn_structure(numpy_data, ds_context, split_rows, split_cols, leaves, nextop)

    return root_node