def learn_MSPN():
    import numpy as np

    np.random.seed(123)
    a = np.random.randint(2, size=1000).reshape(-1, 1)
    b = np.random.randint(3, size=1000).reshape(-1, 1)
    c = np.r_[np.random.normal(10, 5, (300, 1)), np.random.normal(20, 10, (700, 1))]
    d = 5 * a + 3 * b + c
    train_data = np.c_[a, b, c, d]

    from spn.structure.Base import Context
    from spn.structure.StatisticalTypes import MetaType

    ds_context = Context(meta_types=[
        MetaType.DISCRETE, MetaType.DISCRETE, MetaType.REAL, MetaType.REAL
    ]).add_domains(train_data)

    from spn.algorithms.LearningWrappers import learn_mspn

    mspn = learn_mspn(train_data, ds_context, min_instances_slice=20)

    from spn.algorithms.Statistics import get_structure_stats

    print(get_structure_stats(mspn))
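
# A minimal follow-up sketch (not part of the snippet above): once the MSPN is
# learned, per-instance log-likelihoods can be computed with SPFlow's inference
# API. The function name is illustrative; `mspn` and `train_data` are the
# objects built inside learn_MSPN().
def evaluate_mspn(mspn, train_data):
    import numpy as np
    from spn.algorithms.Inference import log_likelihood

    ll = log_likelihood(mspn, train_data)  # one log-likelihood per row, shape (N, 1)
    print("mean log-likelihood:", np.mean(ll))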
def test_histogram_samples(self):
    import numpy as np
    from numpy.random.mtrand import RandomState
    from spn.algorithms.Sampling import sample_instances
    from spn.structure.Base import Context
    from spn.structure.StatisticalTypes import MetaType
    from spn.algorithms.LearningWrappers import learn_mspn

    np.random.seed(123)
    a = np.random.randint(2, size=10000).reshape(-1, 1)
    b = np.random.randint(3, size=10000).reshape(-1, 1)
    c = np.r_[np.random.normal(10, 5, (3000, 1)), np.random.normal(20, 10, (7000, 1))]
    d = 5 * a + 3 * b + c
    train_data = np.c_[a, b, c, d]

    ds_context = Context(meta_types=[
        MetaType.DISCRETE, MetaType.DISCRETE, MetaType.REAL, MetaType.REAL
    ]).add_domains(train_data)

    mspn = learn_mspn(train_data, ds_context, min_instances_slice=200)

    samples = sample_instances(
        mspn,
        np.array([np.nan, np.nan, np.nan, np.nan] * 100).reshape(-1, 4),
        RandomState(123))
    print(np.max(samples, axis=0), np.min(samples, axis=0))
    print(ds_context.domains)
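
# Hedged variant of the test above: in SPFlow's sample_instances, NaN entries
# are sampled while non-NaN entries act as evidence, so fixing a column yields
# conditional samples. The evidence value 1.0 below is an illustrative choice.
def sample_conditional(mspn):
    import numpy as np
    from numpy.random.mtrand import RandomState
    from spn.algorithms.Sampling import sample_instances

    evidence = np.full((100, 4), np.nan)
    evidence[:, 0] = 1.0  # condition on the first (binary) feature
    cond_samples = sample_instances(mspn, evidence, RandomState(123))
    print(np.mean(cond_samples, axis=0))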
def apply(train_datasets, ds_contexts, test_datasets, n_folds, result_path, filename, foldLog):
    # Comment these filters out if you are interested in seeing the warnings. We observed
    # that many informative warnings are thrown here, but saw nothing suspicious when
    # simply executing SPFlow's MSPN method.
    warnings.filterwarnings("ignore", category=RuntimeWarning)
    warnings.filterwarnings("ignore", category=UserWarning)

    print("\n========================")
    print("MSPN")
    print("========================")

    results = {}
    folds = {}
    avg_learning_time = 0
    avg_test_ll = 0

    for i in range(1, n_folds + 1):
        index = i - 1

        # Only for MSPN:
        ds_contexts[index].add_domains(train_datasets[index])

        init_time = time.time() * 1000
        model = learn_mspn(train_datasets[index], ds_contexts[index], min_instances_slice=20)
        end_time = time.time() * 1000
        learning_time = end_time - init_time

        test_ll = log_likelihood(model, test_datasets[index])
        test_ll = np.sum(test_ll)

        fold_result = {"test_ll": test_ll, "learning_time": learning_time}
        folds["fold_" + str(i)] = fold_result

        avg_learning_time = avg_learning_time + learning_time
        avg_test_ll = avg_test_ll + test_ll

        if foldLog:
            print("----------------------------------------")
            print("Fold (" + str(i) + "): ")
            print("Test LL: " + str(test_ll))
            print("Learning time: " + str(learning_time))

    # Generate the average results, store them in the dictionary, then write them to a JSON file
    avg_test_ll = avg_test_ll / n_folds
    avg_learning_time = avg_learning_time / n_folds / 1000  # in seconds
    results["average_test_ll"] = avg_test_ll
    results["average_learning_time"] = avg_learning_time
    results["folds"] = folds
    store_json(results, result_path, filename)

    print("----------------------------------------")
    print("----------------------------------------")
    print("Average Test LL: " + str(avg_test_ll))
    print("Average learning time: " + str(avg_learning_time))
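
# store_json is called above but its definition is not shown; a minimal sketch
# of what such a helper might look like (signature and JSON layout are assumptions):
def store_json(results, result_path, filename):
    import json
    import os

    os.makedirs(result_path, exist_ok=True)
    with open(os.path.join(result_path, filename), "w") as f:
        json.dump(results, f, indent=4)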
def run_experiment(dataset, top_n_features, linear=False):
    ds_name, words, data, train, _, statistical_type, _ = dataset

    data = data[:, 0:top_n_features]
    words = words[0:top_n_features]
    train = train[:, 0:top_n_features]

    ds_context = Context()
    ds_context.statistical_type = statistical_type
    add_domains(data, ds_context)

    spn = learn_mspn(train, ds_context, linear=linear, memory=memory)

    save_exp(spn, ds_name, top_n_features, words, data)
def test_singular_domain(self):
    import numpy as np

    np.random.seed(123)
    b = np.random.randint(3, size=1000).reshape(-1, 1)
    d = np.random.randint(2, size=1000).reshape(-1, 1)
    train_data = np.c_[b, d]

    from spn.structure.Base import Context
    from spn.structure.StatisticalTypes import MetaType

    ds_context = Context(meta_types=[MetaType.DISCRETE, MetaType.BINARY])
    ds_context.add_domains(train_data)

    from spn.algorithms.LearningWrappers import learn_mspn

    mspn = learn_mspn(train_data, ds_context, min_instances_slice=20)
def _fit(self, var_types=None, **kwargs):
    df = self.data.copy()

    # Exchange all object columns for their codes
    for key, value in self._categorical_variables.items():
        df[key] = value['categorical'].codes

    self._nameToVarType = var_types

    # Check if variable types are given
    if self._nameToVarType is None:
        raise ValueError("missing argument 'var_types'")

    self._initial_names = self.names.copy()
    self._initial_names_count = len(self._initial_names)
    self._initial_names_to_index = {self._initial_names[i]: i for i in range(self._initial_names_count)}

    # Initialize _density_mask with np.nan
    self._density_mask = np.array(
        [np.nan for i in self._initial_names]
    ).reshape(-1, self._initial_names_count).astype(float)

    # Initialize _condition with np.nan
    self._condition = np.repeat(
        np.nan, self._initial_names_count
    ).reshape(-1, self._initial_names_count).astype(float)

    self._marginalized = set()
    self._conditioned = set()

    try:
        var_types = [self._nameToVarType[name] for name in self.names]
    except KeyError as err:
        raise ValueError('missing var type information for dimension {}.'.format(err.args[0]))

    if self._spn_type == 'spn':
        context = Context(parametric_types=var_types).add_domains(df.values)
        self._spn = learn_parametric(df.values, context)
    elif self._spn_type == 'mspn':
        context = Context(meta_types=var_types).add_domains(df.values)
        self._spn = learn_mspn(df.values, context)
    else:
        raise Exception("Type of SPN not known: " + self._spn_type)

    return self._unbound_updater,
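
# Hedged usage sketch for _fit above: var_types maps every column name to an
# SPFlow type (MetaTypes for an 'mspn' model, parametric types for 'spn').
# The model variable and column names below are illustrative.
#
#   from spn.structure.StatisticalTypes import MetaType
#   var_types = {'age': MetaType.REAL, 'sex': MetaType.DISCRETE}
#   model._fit(var_types=var_types)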
def test_eval_histogram(self):
    np.random.seed(17)
    data = np.random.normal(10, 0.01, size=2000).tolist() + np.random.normal(30, 10, size=2000).tolist()
    data = np.array(data).reshape((-1, 10))
    data[data < 0] = 0
    data = data.astype(int)

    ds_context = Context(meta_types=[MetaType.DISCRETE] * data.shape[1])
    ds_context.add_domains(data)

    spn = learn_mspn(data, ds_context)

    ll = log_likelihood(spn, data)
    tf_ll = eval_tf(spn, data)
    self.assertTrue(np.all(np.isclose(ll, tf_ll)))
def run_experiment_binary(ds_file, min_instances=200, threshold=0.3):
    ds_name, words, data, train, _, statistical_type, _ = get_binary_data(ds_file)

    ds_context = Context()
    ds_context.statistical_type = statistical_type
    add_domains(data, ds_context)

    print("train data shape", train.shape)
    spn = learn_mspn(train, ds_context, min_instances_slice=min_instances,
                     threshold=threshold, linear=True, memory=memory)

    print(fpga_count_ops(spn))

    save_exp(spn, ds_name, min_instances, words, data)
def _fit(self, var_types=None, **kwargs):
    if self._spn_type is None:
        raise Exception("No SPN-type provided")

    if var_types is not None:
        self.var_types = var_types
    else:
        var_types = self.var_types

    df = self.data.copy()

    # Exchange all object columns for their codes, as SPFlow cannot deal with strings
    for key, value in self._categorical_variables.items():
        df[key] = value['categorical'].codes

    self._nameToVarType = var_types

    # Check if variable types are given
    if self._nameToVarType is None:
        raise ValueError("missing argument 'var_types'")

    self._initial_names = self.names.copy()
    self._initial_names_count = len(self._initial_names)
    self._initial_names_to_index = {
        self._initial_names[i]: i for i in range(self._initial_names_count)
    }

    # Initialize _state_mask with np.nan
    self._state_mask = np.array([
        np.nan for i in self._initial_names
    ]).reshape(-1, self._initial_names_count).astype(float)

    # Initialize _condition with np.nan
    self._condition = np.repeat(np.nan, self._initial_names_count).reshape(
        -1, self._initial_names_count).astype(float)

    self._marginalized = set()
    self._conditioned = set()

    try:
        var_types = [self._nameToVarType[name] for name in self.names]
    except KeyError as err:
        raise ValueError('missing var type information for dimension: {}.'.format(err.args[0]))

    if self._spn_type == 'spn':
        context = Context(parametric_types=var_types).add_domains(df.values)
        self._spn = learn_parametric(df.values, context)
    elif self._spn_type == 'mspn':
        context = Context(meta_types=var_types).add_domains(df.values)
        self._spn = learn_mspn(df.values, context)
    else:
        raise Exception("Type of SPN not known: " + self._spn_type)

    # TODO: DEBUG OUTPUT for NIPS2020
    if self._spn:
        plot_spn(self._spn, fname=Path(f"../../bin/experiments/spn_graphs/{self.name}.pdf"))
        plot_spn_to_svg(self._spn, fname=Path(f"../../bin/experiments/spn_graphs/{self.name}.svg"))

    return self._unbound_updater,
import numpy as np

from spn.structure.Base import Context
from spn.structure.StatisticalTypes import MetaType
from spn.algorithms.LearningWrappers import learn_mspn
from spn.io.Graphics import plot_spn

np.random.seed(123)
a = np.random.randint(2, size=1000).reshape(-1, 1)
b = np.random.randint(3, size=1000).reshape(-1, 1)
c = np.r_[np.random.normal(10, 5, (300, 1)), np.random.normal(20, 10, (700, 1))]
d = 5 * a + 3 * b + c
train_data = np.c_[a, b, c, d]

ds_context = Context(meta_types=[
    MetaType.DISCRETE, MetaType.DISCRETE, MetaType.REAL, MetaType.REAL
])
ds_context.add_domains(train_data)

mspn = learn_mspn(train_data, ds_context, min_instances_slice=20)
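
# plot_spn is imported above but never used; a plausible completion (the output
# filename is an illustrative choice) renders the learned structure to disk:
plot_spn(mspn, "mspn_structure.png")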
types = [
    MetaType.DISCRETE, MetaType.DISCRETE, MetaType.DISCRETE, MetaType.DISCRETE
]
ds_context = Context(meta_types=types)
ds_context.parametric_types = [Gaussian, Gaussian, Gaussian, Categorical]
ds_context.add_domains(data)

num_classes = len(np.unique(data[:, 3]))

# spn = learn_mspn(data, ds_context, min_instances_slice=10, leaves=create_leaf, threshold=0.3)

# One MSPN branch per class label, weighted by the empirical class frequency
spn = Sum()
for label, count in zip(*np.unique(data[:, 3], return_counts=True)):
    branch = learn_mspn(data[data[:, 3] == label, :], ds_context,
                        min_instances_slice=10, leaves=create_leaf, threshold=0.1)
    spn.children.append(branch)
    spn.weights.append(count / data.shape[0])
spn.scope.extend(branch.scope)
print("learned")

prediction = []
cls_data = np.zeros((num_classes, 4))
cls_data[:, 3] = np.arange(num_classes)

for i, x in enumerate(data):  # (snippet truncated here; a hedged completion follows)
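    # Hedged completion of the truncated loop (the original body is not shown):
    # following SPFlow's usual classification pattern, copy the instance's
    # features into every candidate-class row and pick the most likely label.
    cls_data[:, 0:3] = x[0:3]
    prediction.append(np.argmax(log_likelihood(spn, cls_data)))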
def learn_spn(dataset="data/iris", precision=25, independence=0.1, header=0, date=None, isotonic=False, histogram=True, types=False): ''' Learning wrapper for the jupyter notebooks. Performs some preliminary feature analysis on the given dataset and prepares the data for the mspn :param dataset: file location of the dataset :param precision: the precision used for calculating the notebooks spn TODO: find better name :param independence: the maximum dependence a statistical independence test may show to split columns :param header: the location of the csv header :param date: location of datetime columns in the csv :param isotonic: whether to use isotonic nodes :param histogram: whether to ue histogram nodes :param types: whether the csv contains type annotations :return: a valid spn and a data dictionary containing preprocessing information ''' skiprows = [1] if types else [] df = pd.read_csv(dataset, delimiter=",", header=header, parse_dates=date, skiprows=skiprows) df = df.dropna(axis=0, how='any') feature_names = df.columns.values.tolist() if header == 0 else [ "X_{}".format(i) for i in range(len(df.columns)) ] dtypes = df.dtypes def to_feature_types(types): feature_types = [] for feature_type in types: if feature_type.kind == 'O': feature_types.append(Categorical) elif feature_type.kind == 'f': feature_types.append(PiecewiseLinear) elif feature_type.kind == np.dtype('i'): feature_types.append(PiecewiseLinear) else: feature_types.append(PiecewiseLinear) return feature_types if not types: feature_types = to_feature_types(dtypes) # TODO: Build Context wrapper according to README.md, this should work pretty well data_dictionary = { 'features': [{ "name": name, "type": typ, "pandas_type": dtypes[i] } for i, (name, typ) in enumerate(zip(feature_names, feature_types))], 'num_entries': len(df) } idx = df.columns for id, name in enumerate(idx): if feature_types[id] == Categorical: lb = LabelEncoder() data_dictionary['features'][id]["encoder"] = lb df[name] = df[name].astype('category') df[name] = lb.fit_transform(df[name]) data_dictionary['features'][id]["values"] = lb.transform( lb.classes_) if dtypes[id].kind == 'M': df[name] = (df[name] - df[name].min()) / np.timedelta64(1, 'D') data = np.array(df) # TODO: PiecewiseLinear Types do not work like the ParametricNodes spn = learn_mspn(data, Context(parametric_types=feature_types).add_domains(data), cols="rdc", rows="kmeans", min_instances_slice=200, threshold=0.3, ohe=False, leaves=None) spn.name = dataset return spn, data_dictionary
def learn_spmn_structure(train_data, index, scope_index, params):
    curr_var_set = params.partial_order[index]

    if params.partial_order[index][0] in params.decision_nodes:
        decision_node = params.partial_order[index][0]
        cl, dec_vals = split_on_decision_node(train_data, curr_var_set)

        spn0 = []
        index = index + 1
        set_next_operation("None")

        for c in cl:
            if index < len(params.partial_order):
                spn0.append(learn_spmn_structure(c, index, scope_index, params))
                spn = Max(dec_values=dec_vals, children=spn0, feature_name=decision_node)
            else:
                spn = Max(dec_values=dec_vals, children=None, feature_name=decision_node)

        assign_ids(spn)
        rebuild_scopes_bottom_up(spn)
        return spn

    else:
        curr_train_data_prod, curr_train_data = get_curr_train_data_prod(train_data, curr_var_set)

        split_cols = get_split_cols_RDC_py()
        scope_prod = get_scope_prod(curr_train_data_prod, scope_index, params.feature_names)
        ds_context_prod = get_ds_context_prod(curr_train_data_prod, scope_prod, index, scope_index, params)
        data_slices_prod = split_cols(curr_train_data_prod, ds_context_prod, scope_prod)

        curr_op = get_next_operation()

        if len(data_slices_prod) > 1 or curr_op == "Prod" or index == len(params.partial_order):
            set_next_operation("Sum")

            if params.util_to_bin:
                spn0 = learn_parametric(curr_train_data_prod, ds_context_prod,
                                        min_instances_slice=20, initial_scope=scope_prod)
            else:
                spn0 = learn_mspn(curr_train_data_prod, ds_context_prod,
                                  min_instances_slice=20, initial_scope=scope_prod)

            index = index + 1
            scope_index = scope_index + curr_train_data_prod.shape[1]

            if index < len(params.partial_order):
                spn1 = learn_spmn_structure(curr_train_data, index, scope_index, params)
                spn = Product(children=[spn0, spn1])
            else:
                spn = spn0

            assign_ids(spn)
            rebuild_scopes_bottom_up(spn)

        else:
            split_rows = get_split_rows_KMeans()
            scope_sum = list(range(train_data.shape[1]))
            ds_context_sum = get_ds_context_sum(train_data, scope_sum, index, scope_index, params)
            data_slices_sum = split_rows(train_data, ds_context_sum, scope_sum)

            spn0 = []
            weights = []
            if index < len(params.partial_order):
                for cl, scop, weight in data_slices_sum:
                    set_next_operation("Prod")
                    spn0.append(learn_spmn_structure(cl, index, scope_index, params))
                    weights.append(weight)
                spn = Sum(weights=weights, children=spn0)

            assign_ids(spn)
            rebuild_scopes_bottom_up(spn)

        return spn
def learn_wrapper(data, ds_context):
    return learn_mspn(data, ds_context, min_instances_slice=100,
                      leaves=create_leaf, memory=memory)