def estimate_kernel_density_spn(data_slice,
                                feature_sizes,
                                data,
                                alpha,
                                node_id_assoc,
                                building_stack,
                                slices_to_process):
    """
    A mixture with one component for each instance
    """

    instance_ids = data_slice.instance_ids
    feature_ids = data_slice.feature_ids
    current_id = data_slice.id

    n_instances = len(instance_ids)
    n_features = len(feature_ids)

    logging.info('Adding a kernel density estimation ' +
                 'over a slice {0} X {1}'.format(n_instances, n_features))

    #
    # create sum node
    root_sum_node = SumNode(var_scope=frozenset(feature_ids))
    data_slice.type = SumNode
    building_stack.append(data_slice)

    root_sum_node.id = current_id
    node_id_assoc[current_id] = root_sum_node

    #
    # for each instance
    for i in instance_ids:
        #
        # create a slice
        instance_slice = DataSlice(numpy.array([i]), feature_ids)
        slices_to_process.append(instance_slice)

        #
        # linking with appropriate weight
        data_slice.add_child(instance_slice, 1.0 / n_instances)

    return root_sum_node, node_id_assoc, building_stack, slices_to_process
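
# Illustrative sketch (not part of the original module): how the KDE helper above
# can be driven on its own. The toy data values and alpha=0.1 are assumptions for
# the example only; it relies on the same names used above (numpy, deque, DataSlice).
def _example_kde_mixture_usage():
    data = numpy.array([[0, 1],
                        [1, 1],
                        [1, 0]])
    feature_sizes = [2, 2]
    whole_slice = DataSlice.whole_slice(data.shape[0], data.shape[1])

    root, node_id_assoc, building_stack, slices_to_process = \
        estimate_kernel_density_spn(whole_slice, feature_sizes, data, 0.1,
                                    {}, deque(), deque())

    # the root is a sum node with one child slice per instance, each weighted
    # 1 / n_instances; the per-instance slices are left on the queue so the
    # caller can later turn them into leaf distributions
    assert len(slices_to_process) == data.shape[0]
    return root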
def fit_structure(self, data):

    #
    # a queue containing the data slices to process
    slices_to_process = deque()
    # a stack for building nodes
    building_stack = deque()
    # a dict to keep track of id->nodes
    node_id_assoc = {}

    # creating the first slice
    whole_slice = DataSlice.whole_slice(data.shape[0], data.shape[1])
    slices_to_process.append(whole_slice)

    cluster_first = self._cluster_first

    #
    # iteratively process & split slices
    #
    while slices_to_process:

        # process a slice
        current_slice = slices_to_process.popleft()

        # pointers to the current data slice
        current_instances = current_slice.instance_ids
        current_features = current_slice.feature_ids
        current_id = current_slice.id

        n_features = len(current_features)

        # if n_features > 1:
        #     # print("removing Zeros")
        #     datarowsIdx = numpy.sum(data[current_instances, :][:, current_features], 1) > 0
        #     if not any(datarowsIdx):
        #         datarowsIdx[0] = True
        #     current_instances = current_slice.instance_ids[datarowsIdx]

        n_instances = len(current_instances)

        # if n_instances == 0:
        #     # too strong, cutting the zeroes
        #     current_instances = [current_slice.instance_ids[0]]
        #     n_instances = len(current_instances)

        slice_data_rows = data[current_instances, :]
        current_slice_data = slice_data_rows[:, current_features]

        # is this a leaf node or can we split?
        if n_features == 1 and (current_slice.doNotCluster or
                                n_instances <= self._min_instances_slice):

            (feature_id, ) = current_features

            if self.family == "poisson":
                leaf_node = PoissonNode(data, current_instances, current_features)
            elif self.family == "gaussian":
                leaf_node = GaussianNode(data, current_instances, current_features)

            # storing links
            # input_nodes.append(leaf_node)
            leaf_node.id = current_id
            node_id_assoc[current_id] = leaf_node

        # elif (current_slice_data.shape[0] < self._min_instances_slice):
        # elif ((n_instances <= self._min_instances_slice and n_features > 1) and
        #       current_slice_data.shape[0] < self._min_instances_slice):
        # elif ((n_instances <= self._min_instances_slice and n_features > 1)):
        elif n_features > 1 and (current_slice.doNotCluster or
                                 n_instances <= self._min_instances_slice):
            #
            # naive factorization: one child slice per feature
            child_slices = [DataSlice(current_instances, [feature_id])
                            for feature_id in current_features]
            slices_to_process.extend(child_slices)

            # children_ids = [child.id for child in child_slices]

            for child_slice in child_slices:
                child_slice.doNotCluster = current_slice.doNotCluster
                current_slice.add_child(child_slice)

            current_slice.type = ProductNode
            building_stack.append(current_slice)

            prod_node = ProductNode(data, current_instances, current_features)
            prod_node.id = current_id
            node_id_assoc[current_id] = prod_node

        else:

            split_on_features = False
            # first_run = False

            #
            # first run is a split on rows
            if n_features == 1 or cluster_first:
                cluster_first = False
            else:
                if self._ind_test_method == "pairwise_treeglm" or self._ind_test_method == "subsample":

                    fcdata = current_slice_data

                    if self._ind_test_method == "subsample":
                        # sampled_rows = 2000
                        # sampled_rows = math.floor(current_slice_data.shape[0] * 10 / 100)
                        sampled_rows = self._sub_sample_rows
                        if sampled_rows < current_slice_data.shape[0]:
                            fcdata = current_slice_data[
                                numpy.random.choice(current_slice_data.shape[0],
                                                    sampled_rows,
                                                    replace=False)]
                        else:
                            fcdata = current_slice_data

                    # Using R
                    # from pdn.independenceptest import getIndependentGroups
                    # feature_clusters = retrieve_clustering(
                    #     getIndependentGroups(fcdata, alpha=self._alpha, family=self.family),
                    #     current_features)
                    feature_clusters = retrieve_clustering(
                        getIndependentGroupsStabilityTest(fcdata, alpha=self._alpha),
                        current_features)

                elif self._ind_test_method == "KMeans":
                    # clustering the transposed slice groups features (not instances),
                    # so the cluster labels are mapped back to feature ids
                    feature_clusters = retrieve_clustering(
                        cluster_rows(
                            (data[current_instances, :][:, current_features]).T,
                            n_clusters=2,
                            cluster_method=self._row_cluster_method,
                            n_iters=self._n_iters,
                            n_restarts=self._n_restarts,
                            cluster_prep_method="sqrt",
                            cluster_penalty=self._cluster_penalty,
                            rand_gen=self._rand_gen,
                            sklearn_args=self._sklearn_args),
                        current_features)

                split_on_features = len(feature_clusters) > 1

            #
            # have independent components been found?
            if split_on_features:
                #
                # splitting on columns
                slices = [DataSlice(current_instances, cluster)
                          for cluster in feature_clusters]
                slices_to_process.extend(slices)

                current_slice.type = ProductNode
                building_stack.append(current_slice)
                for child_slice in slices:
                    current_slice.add_child(child_slice)

                prod_node = ProductNode(data, current_instances, current_features)
                prod_node.id = current_id
                node_id_assoc[current_id] = prod_node

            else:
                #
                # splitting on rows
                k_row_clusters = min(self._n_cluster_splits, n_instances - 1)

                if n_features == 1:
                    # do one kmeans run with K large enough to split into
                    # slices of at most min_instances_slice rows
                    k_row_clusters = math.floor(n_instances / self._min_instances_slice) + 1
                    k_row_clusters = min(k_row_clusters, n_instances - 1)

                clustering = retrieve_clustering(
                    cluster_rows(
                        data[current_instances, :][:, current_features],
                        n_clusters=k_row_clusters,
                        cluster_method=self._row_cluster_method,
                        n_iters=self._n_iters,
                        n_restarts=self._n_restarts,
                        cluster_prep_method=self._cluster_prep_method,
                        cluster_penalty=self._cluster_penalty,
                        rand_gen=self._rand_gen,
                        sklearn_args=self._sklearn_args),
                    current_instances)

                cluster_slices = [DataSlice(cluster, current_features)
                                  for cluster in clustering]

                if len(clustering) < k_row_clusters:
                    for cluster_slice in cluster_slices:
                        cluster_slice.doNotCluster = True

                n_instances_clusters = sum([len(cluster) for cluster in clustering])
                cluster_weights = [len(cluster) / n_instances_clusters
                                   for cluster in clustering]

                slices_to_process.extend(cluster_slices)

                current_slice.type = SumNode
                building_stack.append(current_slice)
                for child_slice, child_weight in zip(cluster_slices,
                                                     cluster_weights):
                    current_slice.add_child(child_slice, child_weight)

                sum_node = SumNode(data, current_instances, current_features)
                sum_node.id = current_id
                node_id_assoc[current_id] = sum_node

    root_node = SpnFactory.pruned_spn_from_slices(node_id_assoc,
                                                  building_stack,
                                                  True)

    spn = SpnFactory.layered_linked_spn(root_node, data, self.config)

    return spn
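
# Illustrative sketch (not from the original codebase): a typical way to drive the
# structure learner above. The class name `MixedLeafSPN` and its constructor
# arguments are hypothetical stand-ins for illustration only; the only interface
# taken from the code above is fit_structure(data).
def _example_fit_structure_mixed_leaves():
    rand_gen = numpy.random.RandomState(1337)
    # synthetic count data, suitable for Poisson leaves
    data = rand_gen.poisson(lam=2.0, size=(100, 5))

    learner = MixedLeafSPN(family="poisson",            # hypothetical: PoissonNode leaves
                           min_instances_slice=50,      # hypothetical: stop splitting small slices
                           ind_test_method="subsample", # hypothetical: feature independence test
                           rand_gen=rand_gen)
    spn = learner.fit_structure(data)
    return spn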
def test_layered_pruned_linked_spn_cltree():
    #
    # creating all the data slices
    # the slicing is a fake stub
    rows = 5
    cols = 5
    var = 1
    values = 2
    vars = [2, 3]
    var_values = [2, 2]
    s_data = numpy.array([[0, 1],
                          [1, 1],
                          [1, 0],
                          [0, 0]])

    node_1 = SumNode()
    node_1.id = 1

    node_2 = ProductNode()
    node_2.id = 2

    node_3 = SumNode()
    node_3.id = 3

    # adding first level
    weight_12 = 0.4
    weight_13 = 0.6
    node_1.add_child(node_2, weight_12)
    node_1.add_child(node_3, weight_13)

    node_4 = ProductNode()
    node_4.id = 4

    leaf_5 = CategoricalSmoothedNode(var, values)
    leaf_5.id = 5

    # not adding the slice to the stack
    node_2.add_child(node_4)
    node_2.add_child(leaf_5)

    node_6 = SumNode()
    node_6.id = 6

    node_7 = SumNode()
    node_7.id = 7

    weight_36 = 0.1
    weight_37 = 0.9
    node_3.add_child(node_6, weight_36)
    node_3.add_child(node_7, weight_37)

    node_8 = ProductNode()
    node_8.id = 8

    #
    # this is a cltree
    leaf_15 = CLTreeNode(vars=vars,
                         var_values=var_values,
                         data=s_data)
    leaf_15.id = 15

    node_4.add_child(node_8)
    node_4.add_child(leaf_15)

    leaf_13 = CategoricalSmoothedNode(var, values)
    leaf_13.id = 13

    leaf_14 = CLTreeNode(vars=vars,
                         var_values=var_values,
                         data=s_data)
    leaf_14.id = 14

    node_8.add_child(leaf_13)
    node_8.add_child(leaf_14)

    leaf_9 = CLTreeNode(vars=vars,
                        var_values=var_values,
                        data=s_data)
    leaf_9.id = 9

    node_10 = ProductNode()
    node_10.id = 10

    leaf_18 = CategoricalSmoothedNode(var, values)
    leaf_18.id = 18

    leaf_19 = CategoricalSmoothedNode(var, values)
    leaf_19.id = 19

    node_10.add_child(leaf_18)
    node_10.add_child(leaf_19)

    weight_69 = 0.3
    weight_610 = 0.7
    node_6.add_child(leaf_9, weight_69)
    node_6.add_child(node_10, weight_610)

    node_11 = ProductNode()
    node_11.id = 11

    leaf_20 = CategoricalSmoothedNode(var, values)
    leaf_20.id = 20

    leaf_21 = CategoricalSmoothedNode(var, values)
    leaf_21.id = 21

    node_11.add_child(leaf_20)
    node_11.add_child(leaf_21)

    node_12 = ProductNode()
    node_12.id = 12

    leaf_22 = CLTreeNode(vars=vars,
                         var_values=var_values,
                         data=s_data)
    leaf_22.id = 22

    leaf_23 = CategoricalSmoothedNode(var, values)
    leaf_23.id = 23

    node_12.add_child(leaf_22)
    node_12.add_child(leaf_23)

    weight_711 = 0.5
    weight_712 = 0.5
    node_7.add_child(node_11, weight_711)
    node_7.add_child(node_12, weight_712)

    print('Added nodes')

    root_node = SpnFactory.layered_pruned_linked_spn(node_1)
    print('ROOT node', root_node)

    spn = SpnFactory.layered_linked_spn(root_node)
    print('SPN', spn)

    assert spn.n_layers() == 3

    for i, layer in enumerate(spn.top_down_layers()):
        if i == 0:
            assert layer.n_nodes() == 1
        elif i == 1:
            assert layer.n_nodes() == 4
        elif i == 2:
            assert layer.n_nodes() == 10
def test_pruned_spn_from_slices():
    #
    # creating all the data slices
    # the slicing is a fake stub
    rows = 5
    cols = 5
    var = 1
    values = 2

    node_assoc = {}
    building_stack = deque()

    slice_1 = DataSlice.whole_slice(rows, cols)
    slice_1.type = SumNode
    node_1 = SumNode()
    node_1.id = slice_1.id
    node_assoc[node_1.id] = node_1
    building_stack.append(slice_1)

    slice_2 = DataSlice.whole_slice(rows, cols)
    slice_2.type = ProductNode
    node_2 = ProductNode()
    node_2.id = slice_2.id
    node_assoc[node_2.id] = node_2
    building_stack.append(slice_2)

    slice_3 = DataSlice.whole_slice(rows, cols)
    slice_3.type = SumNode
    node_3 = SumNode()
    node_3.id = slice_3.id
    node_assoc[node_3.id] = node_3
    building_stack.append(slice_3)

    # adding first level
    slice_1.add_child(slice_2, 0.8)
    slice_1.add_child(slice_3, 0.2)

    slice_4 = DataSlice.whole_slice(rows, cols)
    slice_4.type = ProductNode
    node_4 = ProductNode()
    node_4.id = slice_4.id
    node_assoc[node_4.id] = node_4
    building_stack.append(slice_4)

    leaf_5 = CategoricalSmoothedNode(var, values)
    slice_5 = DataSlice.whole_slice(rows, cols)
    leaf_5.id = slice_5.id
    node_assoc[leaf_5.id] = leaf_5
    # not adding the slice to the stack

    slice_2.add_child(slice_4)
    slice_2.add_child(slice_5)

    slice_6 = DataSlice.whole_slice(rows, cols)
    slice_6.type = SumNode
    node_6 = SumNode()
    node_6.id = slice_6.id
    node_assoc[node_6.id] = node_6
    building_stack.append(slice_6)

    slice_7 = DataSlice.whole_slice(rows, cols)
    slice_7.type = SumNode
    node_7 = SumNode()
    node_7.id = slice_7.id
    node_assoc[node_7.id] = node_7
    building_stack.append(slice_7)

    slice_3.add_child(slice_6, 0.4)
    slice_3.add_child(slice_7, 0.6)

    slice_8 = DataSlice.whole_slice(rows, cols)
    slice_8.type = ProductNode
    node_8 = ProductNode()
    node_8.id = slice_8.id
    node_assoc[node_8.id] = node_8
    building_stack.append(slice_8)

    leaf_15 = CategoricalSmoothedNode(var, values)
    slice_15 = DataSlice.whole_slice(rows, cols)
    leaf_15.id = slice_15.id
    node_assoc[leaf_15.id] = leaf_15

    slice_4.add_child(slice_8)
    slice_4.add_child(slice_15)

    leaf_13 = CategoricalSmoothedNode(var, values)
    slice_13 = DataSlice.whole_slice(rows, cols)
    leaf_13.id = slice_13.id
    node_assoc[leaf_13.id] = leaf_13

    leaf_14 = CategoricalSmoothedNode(var, values)
    slice_14 = DataSlice.whole_slice(rows, cols)
    leaf_14.id = slice_14.id
    node_assoc[leaf_14.id] = leaf_14

    slice_8.add_child(slice_13)
    slice_8.add_child(slice_14)

    slice_9 = DataSlice.whole_slice(rows, cols)
    slice_9.type = ProductNode
    node_9 = ProductNode()
    node_9.id = slice_9.id
    node_assoc[node_9.id] = node_9
    building_stack.append(slice_9)

    leaf_16 = CategoricalSmoothedNode(var, values)
    slice_16 = DataSlice.whole_slice(rows, cols)
    leaf_16.id = slice_16.id
    node_assoc[leaf_16.id] = leaf_16

    leaf_17 = CategoricalSmoothedNode(var, values)
    slice_17 = DataSlice.whole_slice(rows, cols)
    leaf_17.id = slice_17.id
    node_assoc[leaf_17.id] = leaf_17

    slice_9.add_child(slice_16)
    slice_9.add_child(slice_17)

    slice_10 = DataSlice.whole_slice(rows, cols)
    slice_10.type = ProductNode
    node_10 = ProductNode()
    node_10.id = slice_10.id
    node_assoc[node_10.id] = node_10
    building_stack.append(slice_10)

    leaf_18 = CategoricalSmoothedNode(var, values)
    slice_18 = DataSlice.whole_slice(rows, cols)
    leaf_18.id = slice_18.id
    node_assoc[leaf_18.id] = leaf_18

    leaf_19 = CategoricalSmoothedNode(var, values)
    slice_19 = DataSlice.whole_slice(rows, cols)
    leaf_19.id = slice_19.id
    node_assoc[leaf_19.id] = leaf_19

    slice_10.add_child(slice_18)
    slice_10.add_child(slice_19)

    slice_6.add_child(slice_9, 0.1)
    slice_6.add_child(slice_10, 0.9)

    slice_11 = DataSlice.whole_slice(rows, cols)
    slice_11.type = ProductNode
    node_11 = ProductNode()
    node_11.id = slice_11.id
    node_assoc[node_11.id] = node_11
    building_stack.append(slice_11)

    leaf_20 = CategoricalSmoothedNode(var, values)
    slice_20 = DataSlice.whole_slice(rows, cols)
    leaf_20.id = slice_20.id
    node_assoc[leaf_20.id] = leaf_20

    leaf_21 = CategoricalSmoothedNode(var, values)
    slice_21 = DataSlice.whole_slice(rows, cols)
    leaf_21.id = slice_21.id
    node_assoc[leaf_21.id] = leaf_21

    slice_11.add_child(slice_20)
    slice_11.add_child(slice_21)

    slice_12 = DataSlice.whole_slice(rows, cols)
    slice_12.type = ProductNode
    node_12 = ProductNode()
    node_12.id = slice_12.id
    node_assoc[node_12.id] = node_12
    building_stack.append(slice_12)

    leaf_22 = CategoricalSmoothedNode(var, values)
    slice_22 = DataSlice.whole_slice(rows, cols)
    leaf_22.id = slice_22.id
    node_assoc[leaf_22.id] = leaf_22

    leaf_23 = CategoricalSmoothedNode(var, values)
    slice_23 = DataSlice.whole_slice(rows, cols)
    leaf_23.id = slice_23.id
    node_assoc[leaf_23.id] = leaf_23

    slice_12.add_child(slice_22)
    slice_12.add_child(slice_23)

    slice_7.add_child(slice_11, 0.2)
    slice_7.add_child(slice_12, 0.7)

    root_node = SpnFactory.pruned_spn_from_slices(node_assoc, building_stack)
    print('ROOT node', root_node)

    spn = SpnFactory.layered_linked_spn(root_node)
    print('SPN', spn)

    assert spn.n_layers() == 3

    for i, layer in enumerate(spn.top_down_layers()):
        if i == 0:
            assert layer.n_nodes() == 1
        elif i == 1:
            assert layer.n_nodes() == 5
        elif i == 2:
            assert layer.n_nodes() == 12
def fit_structure(self, data, feature_sizes):
    """
    data is a numpy array of size {n_instances X n_features}
    feature_sizes is an array of integers representing feature ranges
    """

    #
    # resetting the data slice ids (just in case)
    DataSlice.reset_id_counter()

    tot_n_instances = data.shape[0]
    tot_n_features = data.shape[1]

    logging.info('Learning SPN structure on a (%d X %d) dataset',
                 tot_n_instances, tot_n_features)
    learn_start_t = perf_counter()

    #
    # a queue containing the data slices to process
    slices_to_process = deque()
    # a stack for building nodes
    building_stack = deque()
    # a dict to keep track of id->nodes
    node_id_assoc = {}

    # creating the first slice
    whole_slice = DataSlice.whole_slice(tot_n_instances, tot_n_features)
    slices_to_process.append(whole_slice)

    first_run = True

    #
    # iteratively process & split slices
    #
    while slices_to_process:

        # process a slice
        current_slice = slices_to_process.popleft()

        # pointers to the current data slice
        current_instances = current_slice.instance_ids
        current_features = current_slice.feature_ids
        current_id = current_slice.id

        n_instances = len(current_instances)
        n_features = len(current_features)

        logging.info('\n*** Processing slice %d (%d X %d)',
                     current_id, n_instances, n_features)
        logging.debug('\tinstances:%s\n\tfeatures:%s',
                      current_instances, current_features)

        #
        # is this a leaf node or can we split?
        if n_features == 1:
            logging.info('---> Adding a leaf (just one feature)')

            (feature_id, ) = current_features
            feature_size = feature_sizes[feature_id]

            # slicing from the original dataset
            slice_data_rows = data[current_instances, :]
            current_slice_data = slice_data_rows[:, current_features]

            # create the node
            leaf_node = CategoricalSmoothedNode(var=feature_id,
                                                var_values=feature_size,
                                                data=current_slice_data,
                                                instances=current_instances,
                                                alpha=self._alpha)
            # print('lnvf', leaf_node._var_freqs)

            # storing links
            # input_nodes.append(leaf_node)
            leaf_node.id = current_id
            node_id_assoc[current_id] = leaf_node

            logging.debug('\tCreated Smooth Node %s', leaf_node)

        elif (n_instances <= self._min_instances_slice and n_features > 1):
            #
            # splitting the slice on each feature
            logging.info('---> Few instances (%d), decompose all features',
                         n_instances)
            #
            # shall we put a cltree, or
            if self._cltree_leaves:
                logging.info('into a Chow-Liu tree')
                #
                # slicing data
                slice_data_rows = data[current_instances, :]
                current_slice_data = slice_data_rows[:, current_features]

                current_feature_sizes = [feature_sizes[i]
                                         for i in current_features]
                #
                # creating a Chow-Liu tree as leaf
                leaf_node = CLTreeNode(vars=current_features,
                                       var_values=current_feature_sizes,
                                       data=current_slice_data,
                                       alpha=self._alpha)
                #
                # storing links
                leaf_node.id = current_id
                node_id_assoc[current_id] = leaf_node

                logging.debug('\tCreated Chow-Liu Tree Node %s', leaf_node)

            elif self._kde and n_instances > 1:
                estimate_kernel_density_spn(current_slice,
                                            feature_sizes,
                                            data,
                                            self._alpha,
                                            node_id_assoc,
                                            building_stack,
                                            slices_to_process)

            # elif n_instances == 1:  # FIXME: there is a bug here
            else:
                current_slice, slices_to_process, building_stack, node_id_assoc = \
                    self.make_naive_factorization(current_slice,
                                                  slices_to_process,
                                                  building_stack,
                                                  node_id_assoc)
        else:
            #
            # slicing from the original dataset
            slice_data_rows = data[current_instances, :]
            current_slice_data = slice_data_rows[:, current_features]

            split_on_features = False
            #
            # first run is a split on rows
            if first_run:
                logging.info('-- FIRST RUN --')
                first_run = False
            else:
                #
                # try clustering on cols
                # logging.debug('...trying to split on columns')
                split_start_t = perf_counter()
                print(data.shape)
                dependent_features, other_features = \
                    greedy_feature_split(data,
                                         current_slice,
                                         feature_sizes,
                                         self._g_factor,
                                         self._rand_gen)
                split_end_t = perf_counter()
                logging.info('...tried to split on columns in {}'.format(
                    split_end_t - split_start_t))

                if len(other_features) > 0:
                    split_on_features = True
            #
            # have dependent components been found?
            if split_on_features:
                #
                # splitting on columns
                logging.info('---> Splitting on features' +
                             ' {} -> ({}, {})'.format(len(current_features),
                                                      len(dependent_features),
                                                      len(other_features)))

                #
                # creating two new data slices and putting them on queue
                first_slice = DataSlice(current_instances, dependent_features)
                second_slice = DataSlice(current_instances, other_features)
                slices_to_process.append(first_slice)
                slices_to_process.append(second_slice)

                children_ids = [first_slice.id, second_slice.id]

                #
                # storing link parent -> children
                current_slice.type = ProductNode
                building_stack.append(current_slice)
                current_slice.add_child(first_slice)
                current_slice.add_child(second_slice)

                #
                # creating product node
                prod_node = ProductNode(var_scope=frozenset(current_features))
                prod_node.id = current_id
                node_id_assoc[current_id] = prod_node
                logging.debug('\tCreated Prod Node %s (with children %s)',
                              prod_node, children_ids)

            else:
                #
                # clustering on rows
                logging.info('---> Splitting on rows')

                #
                # at most n_rows clusters, for sklearn
                k_row_clusters = min(self._n_cluster_splits, n_instances - 1)

                clustering = cluster_rows(data,
                                          current_slice,
                                          n_clusters=k_row_clusters,
                                          cluster_method=self._row_cluster_method,
                                          n_iters=self._n_iters,
                                          n_restarts=self._n_restarts,
                                          cluster_penalty=self._cluster_penalty,
                                          rand_gen=self._rand_gen,
                                          sklearn_args=self._sklearn_args)

                if len(clustering) < 2:
                    logging.info('\n\n\nLess than 2 clusters\n\n (%d)',
                                 len(clustering))
                    logging.info('forcing a naive factorization')
                    current_slice, slices_to_process, building_stack, node_id_assoc = \
                        self.make_naive_factorization(current_slice,
                                                      slices_to_process,
                                                      building_stack,
                                                      node_id_assoc)
                else:
                    # logging.debug('obtained clustering %s', clustering)
                    logging.info('clustered into %d parts (min %d)',
                                 len(clustering), k_row_clusters)

                    # splitting
                    cluster_slices = [DataSlice(cluster, current_features)
                                      for cluster in clustering]
                    cluster_slices_ids = [slice.id for slice in cluster_slices]

                    # cluster_prior = 5.0
                    # cluster_weights = [(slice.n_instances() + cluster_prior) /
                    #                    (n_instances + cluster_prior * len(cluster_slices))
                    #                    for slice in cluster_slices]
                    cluster_weights = [slice.n_instances() / n_instances
                                       for slice in cluster_slices]

                    #
                    # appending for processing
                    slices_to_process.extend(cluster_slices)

                    #
                    # storing links
                    # current_slice.children = cluster_slices_ids
                    # current_slice.weights = cluster_weights
                    current_slice.type = SumNode
                    building_stack.append(current_slice)
                    for child_slice, child_weight in zip(cluster_slices,
                                                         cluster_weights):
                        current_slice.add_child(child_slice, child_weight)

                    #
                    # building a sum node
                    SCOPES_DICT[frozenset(current_features)] += 1
                    sum_node = SumNode(var_scope=frozenset(current_features))
                    sum_node.id = current_id
                    node_id_assoc[current_id] = sum_node
                    logging.debug('\tCreated Sum Node %s (with children %s)',
                                  sum_node, cluster_slices_ids)

    learn_end_t = perf_counter()
    logging.info('\n\n\tStructure learned in %f secs',
                 (learn_end_t - learn_start_t))

    #
    # linking the spn graph (parent -> children)
    #
    logging.info('===> Building tree')

    link_start_t = perf_counter()
    root_build_node = building_stack[0]
    root_node = node_id_assoc[root_build_node.id]
    logging.debug('root node: %s', root_node)

    root_node = SpnFactory.pruned_spn_from_slices(node_id_assoc, building_stack)
    link_end_t = perf_counter()
    logging.info('\tLinked the spn in %f secs (root_node %s)',
                 (link_end_t - link_start_t), root_node)

    #
    # building layers
    #
    logging.info('===> Layering spn')
    layer_start_t = perf_counter()
    spn = SpnFactory.layered_linked_spn(root_node)
    layer_end_t = perf_counter()
    logging.info('\tLayered the spn in %f secs',
                 (layer_end_t - layer_start_t))

    logging.info('\nLearned SPN\n\n%s', spn.stats())
    # logging.info('%s', SCOPES_DICT.most_common(30))

    return spn
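
# Illustrative sketch (not from the original codebase): a typical call to the
# LearnSPN-style structure learner above on a small binary dataset. The class name
# `SpnLearner` and its keyword arguments are hypothetical; fit_structure(data,
# feature_sizes) is the only interface taken from the code above.
def _example_learn_spn_structure():
    rand_gen = numpy.random.RandomState(1337)
    data = rand_gen.binomial(n=1, p=0.5, size=(200, 4))
    feature_sizes = [2, 2, 2, 2]                  # each feature is binary

    learner = SpnLearner(g_factor=5.0,            # hypothetical: threshold for greedy_feature_split
                         min_instances_slice=50,  # hypothetical: when to stop splitting on rows
                         cltree_leaves=True,      # hypothetical: multivariate Chow-Liu tree leaves
                         alpha=0.1,               # hypothetical: Laplace smoothing for leaves
                         rand_gen=rand_gen)
    spn = learner.fit_structure(data, feature_sizes)
    print(spn.stats())
    return spn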
def test_categorical_to_indicator_input_layer():
    #
    # creating all the data slices
    # the slicing is a fake stub
    # rows = 5
    # cols = 5
    var_1 = 0
    values_1 = 2
    var_2 = 1
    values_2 = 3
    var_3 = 2
    values_3 = 4

    node_1 = SumNode()
    node_1.id = 1

    node_2 = ProductNode()
    node_2.id = 2

    node_3 = SumNode()
    node_3.id = 3

    # adding first level
    weight_12 = 0.4
    weight_13 = 0.6
    node_1.add_child(node_2, weight_12)
    node_1.add_child(node_3, weight_13)

    node_4 = ProductNode()
    node_4.id = 4

    leaf_5 = CategoricalSmoothedNode(var_1, values_1)
    leaf_5.id = 5

    # not adding the slice to the stack
    node_2.add_child(node_4)
    node_2.add_child(leaf_5)

    node_6 = SumNode()
    node_6.id = 6

    node_7 = SumNode()
    node_7.id = 7

    weight_36 = 0.1
    weight_37 = 0.9
    node_3.add_child(node_6, weight_36)
    node_3.add_child(node_7, weight_37)

    node_8 = ProductNode()
    node_8.id = 8

    leaf_15 = CategoricalSmoothedNode(var_2, values_2)
    leaf_15.id = 15

    node_4.add_child(node_8)
    node_4.add_child(leaf_15)

    leaf_13 = CategoricalSmoothedNode(var_3, values_3)
    leaf_13.id = 13

    leaf_14 = CategoricalSmoothedNode(var_1, values_1)
    leaf_14.id = 14

    node_8.add_child(leaf_13)
    node_8.add_child(leaf_14)

    node_9 = ProductNode()
    node_9.id = 9

    leaf_16 = CategoricalSmoothedNode(var_2, values_2)
    leaf_16.id = 16

    leaf_17 = CategoricalSmoothedNode(var_3, values_3)
    leaf_17.id = 17

    node_9.add_child(leaf_16)
    node_9.add_child(leaf_17)

    node_10 = ProductNode()
    node_10.id = 10

    leaf_18 = CategoricalSmoothedNode(var_2, values_2)
    leaf_18.id = 18

    leaf_19 = CategoricalSmoothedNode(var_2, values_2)
    leaf_19.id = 19

    node_10.add_child(leaf_18)
    node_10.add_child(leaf_19)

    weight_69 = 0.3
    weight_610 = 0.7
    node_6.add_child(node_9, weight_69)
    node_6.add_child(node_10, weight_610)

    node_11 = ProductNode()
    node_11.id = 11

    leaf_20 = CategoricalSmoothedNode(var_1, values_1)
    leaf_20.id = 20

    leaf_21 = CategoricalSmoothedNode(var_3, values_3)
    leaf_21.id = 21

    node_11.add_child(leaf_20)
    node_11.add_child(leaf_21)

    node_12 = ProductNode()
    node_12.id = 12

    leaf_22 = CategoricalSmoothedNode(var_1, values_1)
    leaf_22.id = 22

    leaf_23 = CategoricalSmoothedNode(var_3, values_3)
    leaf_23.id = 23

    node_12.add_child(leaf_22)
    node_12.add_child(leaf_23)

    weight_711 = 0.5
    weight_712 = 0.5
    node_7.add_child(node_11, weight_711)
    node_7.add_child(node_12, weight_712)

    root_node = SpnFactory.layered_pruned_linked_spn(node_1)
    print('ROOT node', root_node)

    spn = SpnFactory.layered_linked_spn(root_node)
    print('SPN', spn)

    assert spn.n_layers() == 3

    for i, layer in enumerate(spn.top_down_layers()):
        if i == 0:
            assert layer.n_nodes() == 1
        elif i == 1:
            assert layer.n_nodes() == 5
        elif i == 2:
            assert layer.n_nodes() == 12

    #
    # changing input layer
    spn = linked_categorical_input_to_indicators(spn)
    print('Changed input layer to indicator variables')
    print(spn)