Example #1
def learn_MSPN():
    import numpy as np

    np.random.seed(123)

    a = np.random.randint(2, size=1000).reshape(-1, 1)
    b = np.random.randint(3, size=1000).reshape(-1, 1)
    c = np.r_[np.random.normal(10, 5, (300, 1)),
              np.random.normal(20, 10, (700, 1))]
    d = 5 * a + 3 * b + c
    train_data = np.c_[a, b, c, d]

    from spn.structure.Base import Context
    from spn.structure.StatisticalTypes import MetaType

    ds_context = Context(meta_types=[
        MetaType.DISCRETE, MetaType.DISCRETE, MetaType.REAL, MetaType.REAL
    ]).add_domains(train_data)

    from spn.algorithms.LearningWrappers import learn_mspn

    mspn = learn_mspn(train_data, ds_context, min_instances_slice=20)

    from spn.algorithms.Statistics import get_structure_stats

    print(get_structure_stats(mspn))
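
    # Follow-up sketch (an addition, not in the original snippet): evaluate the
    # per-instance log-likelihood of the training data under the learned MSPN.
    from spn.algorithms.Inference import log_likelihood
    ll = log_likelihood(mspn, train_data)
    print(ll.shape, ll.mean())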
Example #2
    def test_histogram_samples(self):
        import numpy as np
        from numpy.random.mtrand import RandomState
        from spn.algorithms.Sampling import sample_instances
        from spn.structure.Base import Context
        from spn.structure.StatisticalTypes import MetaType
        from spn.algorithms.LearningWrappers import learn_mspn

        np.random.seed(123)
        a = np.random.randint(2, size=10000).reshape(-1, 1)
        b = np.random.randint(3, size=10000).reshape(-1, 1)
        c = np.r_[np.random.normal(10, 5, (3000, 1)),
                  np.random.normal(20, 10, (7000, 1))]
        d = 5 * a + 3 * b + c
        train_data = np.c_[a, b, c, d]

        ds_context = Context(meta_types=[
            MetaType.DISCRETE, MetaType.DISCRETE, MetaType.REAL, MetaType.REAL
        ]).add_domains(train_data)
        mspn = learn_mspn(train_data, ds_context, min_instances_slice=200)

        samples = sample_instances(
            mspn,
            np.array([np.nan, np.nan, np.nan, np.nan] * 100).reshape(-1, 4),
            RandomState(123))
        print(np.max(samples, axis=0), np.min(samples, axis=0))
        print(ds_context.domains)
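
        # Follow-up sketch (an addition, not in the original test): the same
        # call performs conditional sampling when some entries are observed.
        # Fixing the first column to 1 samples the remaining columns from
        # P( . | a = 1); np.nan marks the entries to be sampled.
        evidence = np.array([1.0, np.nan, np.nan, np.nan] * 100).reshape(-1, 4)
        cond_samples = sample_instances(mspn, evidence, RandomState(17))
        print(np.max(cond_samples, axis=0), np.min(cond_samples, axis=0))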
Example #3
import time
import warnings

import numpy as np

from spn.algorithms.Inference import log_likelihood
from spn.algorithms.LearningWrappers import learn_mspn

# store_json is assumed to be a small helper from the enclosing module that
# writes the results dict to <result_path>/<filename>.


def apply(train_datasets, ds_contexts, test_datasets, n_folds, result_path, filename, foldLog):

    # Comment these lines out if you are interested in seeing the warnings. We
    # observed that many informative warnings are thrown here, but saw nothing
    # suspicious when simply executing SPFlow's MSPN method.
    warnings.filterwarnings("ignore", category=RuntimeWarning)
    warnings.filterwarnings("ignore", category=UserWarning)

    print("\n========================")
    print("MSPN")
    print("========================")

    results = {}
    folds = {}
    avg_learning_time = 0
    avg_test_ll = 0
    for i in range(1, n_folds + 1):

        index = i-1

        # Only for MSPN:
        ds_contexts[index].add_domains(train_datasets[index])

        init_time = time.time()*1000
        model = learn_mspn(train_datasets[index], ds_contexts[index], min_instances_slice=20)
        end_time = time.time()*1000

        learning_time = end_time - init_time
        test_ll = log_likelihood(model, test_datasets[index])
        test_ll = np.sum(test_ll)

        fold_result = {"test_ll": test_ll, "learning_time": learning_time}

        folds["fold_" + str(i)] = fold_result
        avg_learning_time = avg_learning_time + learning_time
        avg_test_ll = avg_test_ll + test_ll

        if foldLog:
            print("----------------------------------------")
            print("Fold (" + str(i) + "): ")
            print("Test LL: " + str(test_ll))
            print("Learning time: " + str(learning_time))

    # Generate the average results and store them in the dictionary, then store them in a JSON file
    avg_test_ll = avg_test_ll / n_folds
    avg_learning_time = avg_learning_time / n_folds / 1000  # in seconds
    results["average_test_ll"] = avg_test_ll
    results["average_learning_time"] = avg_learning_time
    results["folds"] = folds
    store_json(results, result_path, filename)

    print("----------------------------------------")
    print("----------------------------------------")
    print("Average Test LL: " + str(avg_test_ll))
    print("Average learning time: " + str(avg_learning_time))
Example #4
# Note: memory, save_exp, and add_domains are helpers defined elsewhere in the
# original module; only the SPFlow imports are reproduced here.
from spn.algorithms.LearningWrappers import learn_mspn
from spn.structure.Base import Context


def run_experiment(dataset, top_n_features, linear=False):
    ds_name, words, data, train, _, statistical_type, _ = dataset

    data = data[:, 0:top_n_features]
    words = words[0:top_n_features]
    train = train[:, 0:top_n_features]

    ds_context = Context()
    ds_context.statistical_type = statistical_type
    add_domains(data, ds_context)

    spn = learn_mspn(train, ds_context, linear=linear, memory=memory)
    save_exp(spn, ds_name, top_n_features, words, data)
Example #5
    def test_singular_domain(self):
        import numpy as np
        np.random.seed(123)

        b = np.random.randint(3, size=1000).reshape(-1, 1)
        d = np.random.randint(2, size=1000).reshape(-1, 1)
        train_data = np.c_[b, d]

        from spn.structure.Base import Context
        from spn.structure.StatisticalTypes import MetaType

        ds_context = Context(meta_types=[MetaType.DISCRETE, MetaType.BINARY])
        ds_context.add_domains(train_data)

        from spn.algorithms.LearningWrappers import learn_mspn

        mspn = learn_mspn(train_data, ds_context, min_instances_slice=20)
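
        # Follow-up sketch (an addition, not in the original test): summarize
        # the learned structure, as in Example #1.
        from spn.algorithms.Statistics import get_structure_stats
        print(get_structure_stats(mspn))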
Example #6
    def _fit(self, var_types=None, **kwargs):
        df = self.data.copy()
        # Exchange all object columns for their codes
        for key, value in self._categorical_variables.items():
            df[key] = value['categorical'].codes

        self._nameToVarType = var_types

        # Check if variable types are given
        if self._nameToVarType is None:
            raise ValueError("missing argument 'var_types'")

        self._initial_names = self.names.copy()
        self._initial_names_count = len(self._initial_names)
        self._initial_names_to_index = {self._initial_names[i]: i for i in range(self._initial_names_count)}

        # Initialize _density_mask with np.nan
        self._density_mask = np.array(
            [np.nan for _ in self._initial_names]
        ).reshape(-1, self._initial_names_count).astype(float)

        # Initialize _condition with np.nan
        self._condition = np.repeat(
            np.nan,
            self._initial_names_count
        ).reshape(-1, self._initial_names_count).astype(float)

        self._marginalized = set()
        self._conditioned = set()

        try:
            var_types = [self._nameToVarType[name] for name in self.names]
        except KeyError as err:
            raise ValueError('missing var type information for dimension: {}.'.format(err.args[0]))

        if self._spn_type == 'spn':
            context = Context(parametric_types=var_types).add_domains(df.values)
            self._spn = learn_parametric(df.values, context)

        elif self._spn_type == 'mspn':
            context = Context(meta_types=var_types).add_domains(df.values)
            self._spn = learn_mspn(df.values, context)
        else:
            raise Exception("Type of SPN not known: " + self._spn_type)
        return self._unbound_updater,
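
For illustration, a hypothetical var_types argument for the 'mspn' branch maps column names to SPFlow meta-types; the column names below are invented:

from spn.structure.StatisticalTypes import MetaType

# Hypothetical mapping; keys must match the model's column names.
var_types = {'age': MetaType.REAL, 'income': MetaType.REAL, 'sex': MetaType.DISCRETE}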
Example #7
    # Module-level imports assumed: numpy as np, Context, MetaType, learn_mspn,
    # log_likelihood, and SPFlow's TensorFlow bridge providing eval_tf.
    def test_eval_histogram(self):
        np.random.seed(17)
        data = np.random.normal(10, 0.01,
                                size=2000).tolist() + np.random.normal(
                                    30, 10, size=2000).tolist()
        data = np.array(data).reshape((-1, 10))
        data[data < 0] = 0
        data = data.astype(int)

        ds_context = Context(meta_types=[MetaType.DISCRETE] * data.shape[1])
        ds_context.add_domains(data)

        spn = learn_mspn(data, ds_context)

        ll = log_likelihood(spn, data)

        tf_ll = eval_tf(spn, data)

        self.assertTrue(np.all(np.isclose(ll, tf_ll)))
Example #8
def run_experiment_binary(ds_file, min_instances=200, threshold=0.3):
    ds_name, words, data, train, _, statistical_type, _ = get_binary_data(
        ds_file)

    ds_context = Context()
    ds_context.statistical_type = statistical_type
    add_domains(data, ds_context)

    print("train data shape", train.shape)
    spn = learn_mspn(train,
                     ds_context,
                     min_instances_slice=min_instances,
                     threshold=threshold,
                     linear=True,
                     memory=memory)

    print(fpga_count_ops(spn))

    save_exp(spn, ds_name, min_instances, words, data)
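
The memory argument passed through to learn_mspn is presumably a joblib cache, so repeated structure-learning runs on the same inputs are memoized; a minimal sketch of how it might be constructed in the enclosing module:

from joblib import Memory

# Hypothetical cache setup; the directory name is an assumption.
memory = Memory("./spn_cache", verbose=0)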
Example #9
    def _fit(self, var_types=None, **kwargs):
        if self._spn_type is None:
            raise Exception("No SPN-type provided")

        if var_types is not None:
            self.var_types = var_types
        else:
            var_types = self.var_types

        df = self.data.copy()
        # Exchange all object columns for their codes as SPFLOW cannot deal with Strings
        for key, value in self._categorical_variables.items():
            df[key] = value['categorical'].codes

        self._nameToVarType = var_types

        # Check if variable types are given
        if self._nameToVarType is None:
            raise ValueError("missing argument 'var_types'")

        self._initial_names = self.names.copy()
        self._initial_names_count = len(self._initial_names)
        self._initial_names_to_index = {
            self._initial_names[i]: i
            for i in range(self._initial_names_count)
        }

        # Initialize _state_mask with np.nan
        self._state_mask = np.array([
            np.nan for _ in self._initial_names
        ]).reshape(-1, self._initial_names_count).astype(float)

        # Initialize _condition with np.nan
        self._condition = np.repeat(np.nan, self._initial_names_count).reshape(
            -1, self._initial_names_count).astype(float)

        self._marginalized = set()
        self._conditioned = set()

        try:
            var_types = [self._nameToVarType[name] for name in self.names]
        except KeyError as err:
            raise ValueError(
                'missing var type information for dimension: {}.'.format(
                    err.args[0]))

        if self._spn_type == 'spn':
            context = Context(parametric_types=var_types).add_domains(
                df.values)
            self._spn = learn_parametric(df.values, context)

        elif self._spn_type == 'mspn':
            context = Context(meta_types=var_types).add_domains(df.values)
            self._spn = learn_mspn(df.values, context)
        else:
            raise Exception("Type of SPN not known: " + self._spn_type)

        # TODO: DEBUG OUTPUT for NIPS2020
        if self._spn:
            plot_spn(self._spn,
                     fname=Path(
                         f"../../bin/experiments/spn_graphs/{self.name}.pdf"))
            plot_spn_to_svg(
                self._spn,
                fname=Path(
                    f"../../bin/experiments/spn_graphs/{self.name}.svg"))
        return self._unbound_updater,
Example #10
import numpy as np
from spn.structure.Base import Context
from spn.structure.StatisticalTypes import MetaType
from spn.algorithms.LearningWrappers import learn_mspn
from spn.io.Graphics import plot_spn

np.random.seed(123)

a = np.random.randint(2, size=1000).reshape(-1, 1)
b = np.random.randint(3, size=1000).reshape(-1, 1)
c = np.r_[np.random.normal(10, 5, (300, 1)),
          np.random.normal(20, 10, (700, 1))]
d = 5 * a + 3 * b + c
train_data = np.c_[a, b, c, d]

ds_context = Context(meta_types=[
    MetaType.DISCRETE, MetaType.DISCRETE, MetaType.REAL, MetaType.REAL
])
ds_context.add_domains(train_data)

mspn = learn_mspn(train_data, ds_context, min_instances_slice=20)
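
The snippet imports plot_spn but never calls it; following SPFlow's documented usage, a natural closing line renders the learned structure to an image file:

plot_spn(mspn, 'basicspn.png')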
Example #11
types = [
    MetaType.DISCRETE, MetaType.DISCRETE, MetaType.DISCRETE, MetaType.DISCRETE
]

ds_context = Context(meta_types=types)
ds_context.parametric_types = [Gaussian, Gaussian, Gaussian, Categorical]
ds_context.add_domains(data)

num_classes = len(np.unique(data[:, 3]))

#spn = learn_mspn(data, ds_context, min_instances_slice=10, leaves=create_leaf, threshold=0.3)

spn = Sum()
for label, count in zip(*np.unique(data[:, 3], return_counts=True)):
    branch = learn_mspn(data[data[:, 3] == label, :],
                        ds_context,
                        min_instances_slice=10,
                        leaves=create_leaf,
                        threshold=0.1)
    spn.children.append(branch)
    spn.weights.append(count / data.shape[0])

spn.scope.extend(branch.scope)

print("learned")

prediction = []

cls_data = np.zeros((num_classes, 4))
cls_data[:, 3] = np.arange(num_classes)

for i, x in enumerate(data):
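    # Plausible completion of the loop, which the listing cuts off here (an
    # assumption, not the original code): copy the instance's features into
    # every candidate-class row, score all classes under the SPN, and keep
    # the most likely label.
    cls_data[:, 0:3] = x[0:3]
    prediction.append(np.argmax(log_likelihood(spn, cls_data)))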
Example #12
import numpy as np
import pandas as pd
from sklearn.preprocessing import LabelEncoder

from spn.algorithms.LearningWrappers import learn_mspn
from spn.structure.Base import Context
from spn.structure.leaves.parametric.Parametric import Categorical
from spn.structure.leaves.piecewise.PiecewiseLinear import PiecewiseLinear


def learn_spn(dataset="data/iris",
              precision=25,
              independence=0.1,
              header=0,
              date=None,
              isotonic=False,
              histogram=True,
              types=False):
    '''
    Learning wrapper for the jupyter notebooks. Performs some preliminary
    feature analysis on the given dataset and prepares the data for the MSPN.
    :param dataset: file location of the dataset
    :param precision: the precision used for calculating the notebooks spn TODO: find better name
    :param independence: the maximum dependence a statistical independence test
                         may show to split columns
    :param header: the location of the csv header
    :param date: location of datetime columns in the csv
    :param isotonic: whether to use isotonic nodes
    :param histogram: whether to use histogram nodes
    :param types: whether the csv contains type annotations
    :return: a valid spn and a data dictionary containing preprocessing information
    '''
    skiprows = [1] if types else []
    df = pd.read_csv(dataset,
                     delimiter=",",
                     header=header,
                     parse_dates=date,
                     skiprows=skiprows)
    df = df.dropna(axis=0, how='any')
    feature_names = df.columns.values.tolist() if header == 0 else [
        "X_{}".format(i) for i in range(len(df.columns))
    ]

    dtypes = df.dtypes

    def to_feature_types(types):
        feature_types = []
        for feature_type in types:
            if feature_type.kind == 'O':
                feature_types.append(Categorical)
            elif feature_type.kind == 'f':
                feature_types.append(PiecewiseLinear)
            elif feature_type.kind == 'i':
                feature_types.append(PiecewiseLinear)
            else:
                feature_types.append(PiecewiseLinear)
        return feature_types

    if not types:
        feature_types = to_feature_types(dtypes)

    # TODO: Build Context wrapper according to README.md, this should work pretty well

    data_dictionary = {
        'features': [{
            "name": name,
            "type": typ,
            "pandas_type": dtypes[i]
        } for i, (name, typ) in enumerate(zip(feature_names, feature_types))],
        'num_entries': len(df)
    }

    idx = df.columns

    for col_id, name in enumerate(idx):
        if feature_types[col_id] == Categorical:
            lb = LabelEncoder()
            data_dictionary['features'][col_id]["encoder"] = lb
            df[name] = df[name].astype('category')
            df[name] = lb.fit_transform(df[name])
            data_dictionary['features'][col_id]["values"] = lb.transform(
                lb.classes_)
        if dtypes[col_id].kind == 'M':
            df[name] = (df[name] - df[name].min()) / np.timedelta64(1, 'D')

    data = np.array(df)

    # TODO: PiecewiseLinear Types do not work like the ParametricNodes

    spn = learn_mspn(data,
                     Context(parametric_types=feature_types).add_domains(data),
                     cols="rdc",
                     rows="kmeans",
                     min_instances_slice=200,
                     threshold=0.3,
                     ohe=False,
                     leaves=None)

    spn.name = dataset
    return spn, data_dictionary
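
A minimal invocation with the defaults, using the CSV path already named in the signature:

spn, data_dictionary = learn_spn(dataset="data/iris")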
Example #13
def learn_spmn_structure(train_data, index, scope_index, params):

    curr_var_set = params.partial_order[index]

    if params.partial_order[index][0] in params.decision_nodes:

        decision_node = params.partial_order[index][0]
        cl, dec_vals = split_on_decision_node(train_data, curr_var_set)
        spn0 = []
        index = index + 1
        set_next_operation("None")

        for c in cl:

            if index < len(params.partial_order):

                spn0.append(learn_spmn_structure(c, index, scope_index, params))
                spn = Max(dec_values=dec_vals, children=spn0, feature_name=decision_node)

            else:
                spn = Max(dec_values=dec_vals, children=None, feature_name=decision_node)

        assign_ids(spn)
        rebuild_scopes_bottom_up(spn)
        return spn

    else:

        curr_train_data_prod, curr_train_data = get_curr_train_data_prod(train_data, curr_var_set)

        split_cols = get_split_cols_RDC_py()
        scope_prod = get_scope_prod(curr_train_data_prod, scope_index, params.feature_names)

        ds_context_prod = get_ds_context_prod(curr_train_data_prod, scope_prod, index, scope_index, params)

        data_slices_prod = split_cols(curr_train_data_prod, ds_context_prod, scope_prod)
        curr_op = get_next_operation()

        if len(data_slices_prod) > 1 or curr_op == "Prod" or index == len(params.partial_order):
            set_next_operation("Sum")

            if params.util_to_bin:
                spn0 = learn_parametric(curr_train_data_prod, ds_context_prod, min_instances_slice=20,
                                        initial_scope=scope_prod)
            else:
                spn0 = learn_mspn(curr_train_data_prod, ds_context_prod, min_instances_slice=20,
                                  initial_scope=scope_prod)

            index = index + 1
            scope_index = scope_index + curr_train_data_prod.shape[1]

            if index < len(params.partial_order):

                spn1 = learn_spmn_structure(curr_train_data, index, scope_index, params)
                spn = Product(children=[spn0, spn1])

                assign_ids(spn)
                rebuild_scopes_bottom_up(spn)

            else:
                spn = spn0
                assign_ids(spn)
                rebuild_scopes_bottom_up(spn)

        else:

            split_rows = get_split_rows_KMeans()
            scope_sum = list(range(train_data.shape[1]))

            ds_context_sum = get_ds_context_sum(train_data, scope_sum, index, scope_index, params)
            data_slices_sum = split_rows(train_data, ds_context_sum, scope_sum)

            spn0 = []
            weights = []

            if index < len(params.partial_order):

                for cl, scop, weight in data_slices_sum:

                    set_next_operation("Prod")
                    spn0.append(learn_spmn_structure(cl, index, scope_index, params))
                    weights.append(weight)

                spn = Sum(weights=weights, children=spn0)
                assign_ids(spn)
                rebuild_scopes_bottom_up(spn)

        assign_ids(spn)
        rebuild_scopes_bottom_up(spn)
        return spn
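
Note on the design: this recursive SPMN learner coordinates the alternation between product and sum splits through module-level state (set_next_operation / get_next_operation) rather than an explicit parameter, so the traversal order of the recursive calls is significant.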
Example #14
# create_leaf and memory come from the enclosing module of the original script.
def learn_wrapper(data, ds_context):
    return learn_mspn(data,
                      ds_context,
                      min_instances_slice=100,
                      leaves=create_leaf,
                      memory=memory)