Exemple #1
0
def test_cltree_node_eval():
    #
    # testing for the correctness of data masking while evaluating
    # shall I move this test to CLTree?
    s_data = numpy.random.binomial(n=1, p=0.5, size=(1000, 100))
    print(s_data)

    random_features = numpy.random.choice(s_data.shape[1], 20)
    sub_s_data = s_data[:, random_features]

    clt_node = CLTreeNode(vars=random_features,
                          data=sub_s_data,
                          var_values=numpy.array([2 for i in
                                                  range(sub_s_data.shape[1])]))
    # evaluating on all s_data
    lls = []
    for instance in s_data:
        clt_node.eval(instance)
        lls.append(clt_node.log_val)

    #
    # now do one on the only sub
    clt_node = CLTreeNode(vars=[i for i in range(sub_s_data.shape[1])],
                          data=sub_s_data,
                          var_values=numpy.array([2 for i in
                                                  range(sub_s_data.shape[1])]))
    lls_s = []
    for instance in sub_s_data:
        clt_node.eval(instance)
        lls_s.append(clt_node.log_val)

    # print(lls)
    # print(lls_s)
    assert_array_almost_equal(numpy.array(lls), numpy.array(lls_s))
Exemple #2
0
def test_categorical_clt_input_layer_eval():
    #
    # just a little test to see how mixed nodes layers
    # dispatch eval
    s_data = numpy.array([[1, 1, 1, 0],
                          [0, 0, 1, 0],
                          [0, 0, 1, 0],
                          [1, 1, 0, 0],
                          [1, 0, 1, 0],
                          [0, 1, 1, 0]])
    features = [0, 2, 3]
    feature_vals = [2, 2, 2]
    clt_node = CLTreeNode(data=s_data[:, features],
                          vars=features,
                          var_values=feature_vals,
                          alpha=0.0)

    #
    # creating a categorical smoothed node
    cs_node = CategoricalSmoothedNode(var=1,
                                      var_values=2,
                                      alpha=0.,
                                      data=s_data[:, [1]])

    clti_layer = CategoricalCLInputLayer(nodes=[cs_node, clt_node])

    nico_cltree_subtree = numpy.array([-1,  0,  1])
    nico_cltree_sublls = numpy.array([-1.09861228867,
                                      -0.69314718056,
                                      -0.69314718056,
                                      -1.79175946923,
                                      -1.09861228867,
                                      -0.69314718056])

    assert_array_equal(numpy.array(features), clt_node.vars)
    assert_array_equal(nico_cltree_subtree,
                       clt_node._cltree._tree)

    s_log_prob = numpy.log(0.5)

    #
    # evaluating the layer
    for i, instance in enumerate(s_data):
        clti_layer.eval(instance)
        for node in clti_layer.nodes():
            if isinstance(node, CLTreeNode):
                assert_almost_equal(nico_cltree_sublls[i], node.log_val)
            else:
                # in the other case is 0.5
                assert_almost_equal(s_log_prob, node.log_val)
Exemple #3
0
def test_cltree_node_init_and_eval():
    s_data = numpy.array([[1, 1, 1, 0],
                          [0, 0, 1, 0],
                          [0, 0, 1, 0],
                          [1, 0, 0, 0],
                          [1, 0, 1, 0],
                          [0, 1, 1, 0]])

    features = [0, 1, 2, 3]
    feature_vals = [2, 2, 2, 2]
    clt_node = CLTreeNode(data=s_data,
                          vars=features,
                          var_values=feature_vals,
                          alpha=0.0)

    assert_array_equal(numpy.array(features), clt_node.vars)
    assert_almost_equal(clt_node._alpha, 0.0)
    assert_array_almost_equal(s_data, clt_node._data)

    nico_cltree_tree = numpy.array([-1,  2,  0,  2])
    nico_cltree_lls = numpy.array([-2.01490302054,
                                   -1.20397280433,
                                   -1.20397280433,
                                   -1.79175946923,
                                   -1.60943791243,
                                   -1.60943791243])

    assert_array_equal(nico_cltree_tree, clt_node._cltree._tree)

    print('Created node')
    print(clt_node)

    #
    # evaluating
    for i, instance in enumerate(s_data):
        clt_node.eval(instance)
        assert_almost_equal(nico_cltree_lls[i], clt_node.log_val)
        print(clt_node.log_val, nico_cltree_lls[i])

    #
    # creating now with a subset from data
    features = [0, 2, 3]
    feature_vals = [2, 2, 2]
    clt_node = CLTreeNode(data=s_data[:, features],
                          vars=features,
                          var_values=feature_vals,
                          alpha=0.0)

    nico_cltree_subtree = numpy.array([-1,  0,  1])
    nico_cltree_sublls = numpy.array([-1.09861228867,
                                      -0.69314718056,
                                      -0.69314718056,
                                      -1.79175946923,
                                      -1.09861228867,
                                      -0.69314718056])

    assert_array_equal(numpy.array(features), clt_node.vars)
    assert_array_equal(nico_cltree_subtree,
                       clt_node._cltree._tree)
    for i, instance in enumerate(s_data):
        clt_node.eval(instance)
        assert_almost_equal(nico_cltree_sublls[i], clt_node.log_val)
        print(clt_node.log_val, nico_cltree_sublls[i])
Exemple #4
0
def test_layered_pruned_linked_spn_cltree():
    #
    # creating all the data slices
    # the slicing is a fake stub
    rows = 5
    cols = 5
    var = 1
    values = 2

    vars = [2, 3]
    var_values = [2, 2]
    s_data = numpy.array([[0, 1], [1, 1], [1, 0], [0, 0]])

    node_1 = SumNode()
    node_1.id = 1

    node_2 = ProductNode()
    node_2.id = 2

    node_3 = SumNode()
    node_3.id = 3

    # adding first level
    weight_12 = 0.4
    weight_13 = 0.6
    node_1.add_child(node_2, weight_12)
    node_1.add_child(node_3, weight_13)

    node_4 = ProductNode()
    node_4.id = 4

    leaf_5 = CategoricalSmoothedNode(var,
                                     values)
    leaf_5.id = 5

    # not adding the slice to the stack

    node_2.add_child(node_4)
    node_2.add_child(leaf_5)

    node_6 = SumNode()
    node_6.id = 6

    node_7 = SumNode()
    node_7.id = 7

    weight_36 = 0.1
    weight_37 = 0.9
    node_3.add_child(node_6, weight_36)
    node_3.add_child(node_7, weight_37)

    node_8 = ProductNode()
    node_8.id = 8

    #
    # this is a cltree
    leaf_15 = CLTreeNode(vars=vars,
                         var_values=var_values,
                         data=s_data)
    leaf_15.id = 15

    node_4.add_child(node_8)
    node_4.add_child(leaf_15)

    leaf_13 = CategoricalSmoothedNode(var,
                                      values)
    leaf_13.id = 13

    leaf_14 = CLTreeNode(vars=vars,
                         var_values=var_values,
                         data=s_data)
    leaf_14.id = 14

    node_8.add_child(leaf_13)
    node_8.add_child(leaf_14)

    leaf_9 = CLTreeNode(vars=vars,
                        var_values=var_values,
                        data=s_data)
    leaf_9.id = 9

    node_10 = ProductNode()
    node_10.id = 10

    leaf_18 = CategoricalSmoothedNode(var,
                                      values)
    leaf_18.id = 18

    leaf_19 = CategoricalSmoothedNode(var,
                                      values)
    leaf_19.id = 19

    node_10.add_child(leaf_18)
    node_10.add_child(leaf_19)

    weight_69 = 0.3
    weight_610 = 0.7
    node_6.add_child(leaf_9, weight_69)
    node_6.add_child(node_10, weight_610)

    node_11 = ProductNode()
    node_11.id = 11

    leaf_20 = CategoricalSmoothedNode(var,
                                      values)
    leaf_20.id = 20

    leaf_21 = CategoricalSmoothedNode(var,
                                      values)
    leaf_21.id = 21

    node_11.add_child(leaf_20)
    node_11.add_child(leaf_21)

    node_12 = ProductNode()
    node_12.id = 12

    leaf_22 = CLTreeNode(vars=vars,
                         var_values=var_values,
                         data=s_data)
    leaf_22.id = 22

    leaf_23 = CategoricalSmoothedNode(var,
                                      values)
    leaf_23.id = 23

    node_12.add_child(leaf_22)
    node_12.add_child(leaf_23)

    weight_711 = 0.5
    weight_712 = 0.5
    node_7.add_child(node_11, weight_711)
    node_7.add_child(node_12, weight_712)

    print('Added nodes')

    root_node = SpnFactory.layered_pruned_linked_spn(node_1)

    print('ROOT nODE', root_node)

    spn = SpnFactory.layered_linked_spn(root_node)

    print('SPN', spn)

    assert spn.n_layers() == 3

    for i, layer in enumerate(spn.top_down_layers()):
        if i == 0:
            assert layer.n_nodes() == 1
        elif i == 1:
            assert layer.n_nodes() == 4
        elif i == 2:
            assert layer.n_nodes() == 10
Exemple #5
0
    def fit_structure(self, data, feature_sizes):
        """
        data is a numpy array of size {n_instances X n_features}
        feature_sizes is an array of integers representing feature ranges
        """

        #
        # resetting the data slice ids (just in case)
        DataSlice.reset_id_counter()

        tot_n_instances = data.shape[0]
        tot_n_features = data.shape[1]

        logging.info('Learning SPN structure on a (%d X %d) dataset',
                     tot_n_instances, tot_n_features)
        learn_start_t = perf_counter()

        #
        # a queue containing the data slices to process
        slices_to_process = deque()

        # a stack for building nodes
        building_stack = deque()

        # a dict to keep track of id->nodes
        node_id_assoc = {}

        # creating the first slice
        whole_slice = DataSlice.whole_slice(tot_n_instances, tot_n_features)
        slices_to_process.append(whole_slice)

        first_run = True

        #
        # iteratively process & split slices
        #
        while slices_to_process:

            # process a slice
            current_slice = slices_to_process.popleft()

            # pointers to the current data slice
            current_instances = current_slice.instance_ids
            current_features = current_slice.feature_ids
            current_id = current_slice.id

            n_instances = len(current_instances)
            n_features = len(current_features)

            logging.info('\n*** Processing slice %d (%d X %d)', current_id,
                         n_instances, n_features)
            logging.debug('\tinstances:%s\n\tfeatures:%s', current_instances,
                          current_features)

            #
            # is this a leaf node or we can split?
            if n_features == 1:
                logging.info('---> Adding a leaf (just one feature)')

                (feature_id, ) = current_features
                feature_size = feature_sizes[feature_id]

                # slicing from the original dataset
                slice_data_rows = data[current_instances, :]
                current_slice_data = slice_data_rows[:, current_features]

                # create the node
                leaf_node = CategoricalSmoothedNode(
                    var=feature_id,
                    var_values=feature_size,
                    data=current_slice_data,
                    instances=current_instances,
                    alpha=self._alpha)
                # print('lnvf', leaf_node._var_freqs)
                # storing links
                # input_nodes.append(leaf_node)
                leaf_node.id = current_id
                node_id_assoc[current_id] = leaf_node

                logging.debug('\tCreated Smooth Node %s', leaf_node)

            elif (n_instances <= self._min_instances_slice and n_features > 1):
                #
                # splitting the slice on each feature
                logging.info('---> Few instances (%d), decompose all features',
                             n_instances)
                #
                # shall put a cltree or
                if self._cltree_leaves:
                    logging.info('into a Chow-Liu tree')
                    #
                    # slicing data
                    slice_data_rows = data[current_instances, :]
                    current_slice_data = slice_data_rows[:, current_features]

                    current_feature_sizes = [
                        feature_sizes[i] for i in current_features
                    ]
                    #
                    # creating a Chow-Liu tree as leaf
                    leaf_node = CLTreeNode(vars=current_features,
                                           var_values=current_feature_sizes,
                                           data=current_slice_data,
                                           alpha=self._alpha)
                    #
                    # storing links
                    leaf_node.id = current_id
                    node_id_assoc[current_id] = leaf_node

                    logging.debug('\tCreated Chow-Liu Tree Node %s', leaf_node)

                elif self._kde and n_instances > 1:
                    estimate_kernel_density_spn(current_slice, feature_sizes,
                                                data, self._alpha,
                                                node_id_assoc, building_stack,
                                                slices_to_process)

                # elif n_instances == 1:  # FIXME: there is a bug here
                else:
                    current_slice, slices_to_process, building_stack, node_id_assoc = \
                        self.make_naive_factorization(current_slice,
                                                      slices_to_process,
                                                      building_stack,
                                                      node_id_assoc)
            else:

                #
                # slicing from the original dataset
                slice_data_rows = data[current_instances, :]
                current_slice_data = slice_data_rows[:, current_features]

                split_on_features = False
                #
                # first run is a split on rows
                if first_run:
                    logging.info('-- FIRST RUN --')
                    first_run = False
                else:
                    #
                    # try clustering on cols
                    # logging.debug('...trying to split on columns')
                    split_start_t = perf_counter()
                    print(data.shape)
                    dependent_features, other_features = greedy_feature_split(
                        data, current_slice, feature_sizes, self._g_factor,
                        self._rand_gen)
                    split_end_t = perf_counter()
                    logging.info('...tried to split on columns in {}'.format(
                        split_end_t - split_start_t))
                    if len(other_features) > 0:
                        split_on_features = True
                #
                # have dependent components been found?
                if split_on_features:
                    #
                    # splitting on columns
                    logging.info(
                        '---> Splitting on features' +
                        ' {} -> ({}, {})'.format(len(current_features),
                                                 len(dependent_features),
                                                 len(other_features)))

                    #
                    # creating two new data slices and putting them on queue
                    first_slice = DataSlice(current_instances,
                                            dependent_features)
                    second_slice = DataSlice(current_instances, other_features)
                    slices_to_process.append(first_slice)
                    slices_to_process.append(second_slice)

                    children_ids = [first_slice.id, second_slice.id]

                    #
                    # storing link parent children
                    current_slice.type = ProductNode
                    building_stack.append(current_slice)
                    current_slice.add_child(first_slice)
                    current_slice.add_child(second_slice)

                    #
                    # creating product node
                    prod_node = ProductNode(
                        var_scope=frozenset(current_features))
                    prod_node.id = current_id
                    node_id_assoc[current_id] = prod_node
                    logging.debug('\tCreated Prod Node %s (with children %s)',
                                  prod_node, children_ids)

                else:
                    #
                    # clustering on rows
                    logging.info('---> Splitting on rows')

                    #
                    # at most n_rows clusters, for sklearn
                    k_row_clusters = min(self._n_cluster_splits,
                                         n_instances - 1)

                    clustering = cluster_rows(
                        data,
                        current_slice,
                        n_clusters=k_row_clusters,
                        cluster_method=self._row_cluster_method,
                        n_iters=self._n_iters,
                        n_restarts=self._n_restarts,
                        cluster_penalty=self._cluster_penalty,
                        rand_gen=self._rand_gen,
                        sklearn_args=self._sklearn_args)

                    if len(clustering) < 2:
                        logging.info('\n\n\nLess than 2 clusters\n\n (%d)',
                                     len(clustering))

                        logging.info('forcing a naive factorization')
                        current_slice, slices_to_process, building_stack, node_id_assoc = \
                            self.make_naive_factorization(current_slice,
                                                          slices_to_process,
                                                          building_stack,
                                                          node_id_assoc)

                    else:
                        # logging.debug('obtained clustering %s', clustering)
                        logging.info('clustered into %d parts (min %d)',
                                     len(clustering), k_row_clusters)
                        # splitting
                        cluster_slices = [
                            DataSlice(cluster, current_features)
                            for cluster in clustering
                        ]
                        cluster_slices_ids = [
                            slice.id for slice in cluster_slices
                        ]

                        # cluster_prior = 5.0
                        # cluster_weights = [(slice.n_instances() + cluster_prior) /
                        #                    (n_instances + cluster_prior * len(cluster_slices))
                        #                    for slice in cluster_slices]
                        cluster_weights = [
                            slice.n_instances() / n_instances
                            for slice in cluster_slices
                        ]

                        #
                        # appending for processing
                        slices_to_process.extend(cluster_slices)

                        #
                        # storing links
                        # current_slice.children = cluster_slices_ids
                        # current_slice.weights = cluster_weights
                        current_slice.type = SumNode
                        building_stack.append(current_slice)
                        for child_slice, child_weight in zip(
                                cluster_slices, cluster_weights):
                            current_slice.add_child(child_slice, child_weight)

                        #
                        # building a sum node
                        SCOPES_DICT[frozenset(current_features)] += 1
                        sum_node = SumNode(
                            var_scope=frozenset(current_features))
                        sum_node.id = current_id
                        node_id_assoc[current_id] = sum_node
                        logging.debug(
                            '\tCreated Sum Node %s (with children %s)',
                            sum_node, cluster_slices_ids)

        learn_end_t = perf_counter()

        logging.info('\n\n\tStructure learned in %f secs',
                     (learn_end_t - learn_start_t))

        #
        # linking the spn graph (parent -> children)
        #
        logging.info('===> Building tree')

        link_start_t = perf_counter()
        root_build_node = building_stack[0]
        root_node = node_id_assoc[root_build_node.id]
        logging.debug('root node: %s', root_node)

        root_node = SpnFactory.pruned_spn_from_slices(node_id_assoc,
                                                      building_stack)
        link_end_t = perf_counter()
        logging.info('\tLinked the spn in %f secs (root_node %s)',
                     (link_end_t - link_start_t), root_node)

        #
        # building layers
        #
        logging.info('===> Layering spn')
        layer_start_t = perf_counter()
        spn = SpnFactory.layered_linked_spn(root_node)
        layer_end_t = perf_counter()
        logging.info('\tLayered the spn in %f secs',
                     (layer_end_t - layer_start_t))

        logging.info('\nLearned SPN\n\n%s', spn.stats())
        #logging.info('%s', SCOPES_DICT.most_common(30))

        return spn
Exemple #6
0
def test_layered_pruned_linked_spn_cltree():
    #
    # creating all the data slices
    # the slicing is a fake stub
    rows = 5
    cols = 5
    var = 1
    values = 2

    vars = [2, 3]
    var_values = [2, 2]
    s_data = numpy.array([[0, 1], [1, 1], [1, 0], [0, 0]])

    node_1 = SumNode()
    node_1.id = 1

    node_2 = ProductNode()
    node_2.id = 2

    node_3 = SumNode()
    node_3.id = 3

    # adding first level
    weight_12 = 0.4
    weight_13 = 0.6
    node_1.add_child(node_2, weight_12)
    node_1.add_child(node_3, weight_13)

    node_4 = ProductNode()
    node_4.id = 4

    leaf_5 = CategoricalSmoothedNode(var, values)
    leaf_5.id = 5

    # not adding the slice to the stack

    node_2.add_child(node_4)
    node_2.add_child(leaf_5)

    node_6 = SumNode()
    node_6.id = 6

    node_7 = SumNode()
    node_7.id = 7

    weight_36 = 0.1
    weight_37 = 0.9
    node_3.add_child(node_6, weight_36)
    node_3.add_child(node_7, weight_37)

    node_8 = ProductNode()
    node_8.id = 8

    #
    # this is a cltree
    leaf_15 = CLTreeNode(vars=vars, var_values=var_values, data=s_data)
    leaf_15.id = 15

    node_4.add_child(node_8)
    node_4.add_child(leaf_15)

    leaf_13 = CategoricalSmoothedNode(var, values)
    leaf_13.id = 13

    leaf_14 = CLTreeNode(vars=vars, var_values=var_values, data=s_data)
    leaf_14.id = 14

    node_8.add_child(leaf_13)
    node_8.add_child(leaf_14)

    leaf_9 = CLTreeNode(vars=vars, var_values=var_values, data=s_data)
    leaf_9.id = 9

    node_10 = ProductNode()
    node_10.id = 10

    leaf_18 = CategoricalSmoothedNode(var, values)
    leaf_18.id = 18

    leaf_19 = CategoricalSmoothedNode(var, values)
    leaf_19.id = 19

    node_10.add_child(leaf_18)
    node_10.add_child(leaf_19)

    weight_69 = 0.3
    weight_610 = 0.7
    node_6.add_child(leaf_9, weight_69)
    node_6.add_child(node_10, weight_610)

    node_11 = ProductNode()
    node_11.id = 11

    leaf_20 = CategoricalSmoothedNode(var, values)
    leaf_20.id = 20

    leaf_21 = CategoricalSmoothedNode(var, values)
    leaf_21.id = 21

    node_11.add_child(leaf_20)
    node_11.add_child(leaf_21)

    node_12 = ProductNode()
    node_12.id = 12

    leaf_22 = CLTreeNode(vars=vars, var_values=var_values, data=s_data)
    leaf_22.id = 22

    leaf_23 = CategoricalSmoothedNode(var, values)
    leaf_23.id = 23

    node_12.add_child(leaf_22)
    node_12.add_child(leaf_23)

    weight_711 = 0.5
    weight_712 = 0.5
    node_7.add_child(node_11, weight_711)
    node_7.add_child(node_12, weight_712)

    print('Added nodes')

    root_node = SpnFactory.layered_pruned_linked_spn(node_1)

    print('ROOT nODE', root_node)

    spn = SpnFactory.layered_linked_spn(root_node)

    print('SPN', spn)

    assert spn.n_layers() == 3

    for i, layer in enumerate(spn.top_down_layers()):
        if i == 0:
            assert layer.n_nodes() == 1
        elif i == 1:
            assert layer.n_nodes() == 4
        elif i == 2:
            assert layer.n_nodes() == 10