Example 1
File: cltree.py Project: Rhuax/dcsn
    def __GRASP(self, forest_approach, vdata):

        times = 3
        k = 3  # Best k edges

        if len(forest_approach) > 1:
            times = int(forest_approach[1])
            if len(forest_approach) > 2:
                k = int(forest_approach[2])

        """GRASP"""
        t = 0
        while t < times:

            """CONSTRUCT"""
            mst = minimum_spanning_tree_K(-(self.MI), k)  # modified version of Kruskal's algorithm

            dfs_tree = depth_first_order(mst, directed=False, i_start=0)
            initial_tree = self.create_tree(dfs_tree)
            """End Construct"""

            """ Local Search"""
            initial_valid_ll = self.score_samples_log_proba_v(vdata, initial_tree)
            initial_num_tree = 1
            improved = True
            while improved:
                improved = False
                best_ll = -np.inf
                best_edge = None
                valid_edges = np.where(initial_tree != -1)[0]  # nodes that still have a parent edge
                if valid_edges.size > 0:
                    for i in valid_edges:
                        new = np.copy(initial_tree)
                        new[i] = -1
                        valid_ll = self.score_samples_log_proba_v(vdata, new)
                        if valid_ll > best_ll:
                            best_edge = i
                            best_ll = valid_ll
                    if best_ll > initial_valid_ll:
                        initial_valid_ll = best_ll
                        initial_num_tree += 1
                        initial_tree[best_edge] = -1
                        improved = True

            """End local search"""

            if initial_valid_ll > self.current_best_validationll:
                self.current_best_validationll = initial_valid_ll
                self.num_trees = initial_num_tree
                self.tree = initial_tree
                # Now we can compute the log factors
                self.log_factors = np.zeros((self.n_features, 2, 2))
                self.log_factors = compute_log_factors(self.tree, self.n_features, self.log_probs, self.log_c_probs,
                                                       self.log_factors)

            t += 1
Example 2
File: cltree.py Project: Rhuax/dcsn
    def _Minimum_SPTree_log_probs(self, log_probs, log_c_probs):
        """ the tree is represented as a sequence of parents"""
        mst = minimum_spanning_tree(-(self.MI))
        dfs_tree = depth_first_order(mst, directed=False, i_start=0)
        self.df_order = dfs_tree[0]
        self.tree = self.create_tree(dfs_tree)

        # computing the factored representation
        self.log_factors = np.zeros((self.n_features, 2, 2))
        self.log_factors = compute_log_factors(self.tree, self.n_features, log_probs, log_c_probs, self.log_factors)
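
All of these CLTree snippets share one pattern: negate the pairwise mutual-information matrix so that minimum_spanning_tree returns the maximum-weight spanning tree, then read each node's parent off the predecessor array returned by depth_first_order. A minimal, self-contained sketch of that pattern (the 4x4 MI matrix below is invented for illustration):

import numpy as np
from scipy.sparse.csgraph import minimum_spanning_tree, depth_first_order

# Invented pairwise similarity (e.g. mutual information) for 4 variables
MI = np.array([[0.0, 0.9, 0.1, 0.2],
               [0.9, 0.0, 0.8, 0.1],
               [0.1, 0.8, 0.0, 0.7],
               [0.2, 0.1, 0.7, 0.0]])

# Negating turns the minimum spanning tree into a maximum-weight one
mst = minimum_spanning_tree(-MI)

# DFS from node 0; the predecessor array gives each node's parent
order, preds = depth_first_order(mst, i_start=0, directed=False,
                                 return_predecessors=True)

tree = preds.copy()
tree[0] = -1  # mark the root with -1, as the snippets above do
print("visit order:", order)  # [0 1 2 3]
print("parents    :", tree)   # [-1  0  1  2]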
Example 3
def ordered_hull_idx_2d(hull):
    n = hull.simplices.shape[0]
    
    # determine order of edges in the convex hull
    v = coo_matrix((np.ones(2*n), (np.repeat(np.arange(n), 2), hull.neighbors.ravel())))
    facet_order = csgraph.depth_first_order(v, 0, return_predecessors=False)
    facet_vidx = hull.simplices[facet_order]
    
    # pick one vertex for each edge, based on which direction the walk went
    m = hull.neighbors[facet_order][:-1] == facet_order[1:,None]
    i = np.arange(n)
    j = np.r_[np.where(m)[1], 0] 
    
    ordered_vertex_idx = facet_vidx[i, j]
    
    # sanity check
    assert np.all(np.unique(ordered_vertex_idx) == np.unique(hull.simplices.ravel()))
 
    return ordered_vertex_idx
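
A possible usage sketch for the function above, assuming ordered_hull_idx_2d and its imports (numpy as np, coo_matrix from scipy.sparse, csgraph from scipy.sparse) are in scope; the point cloud is invented:

import numpy as np
from scipy.spatial import ConvexHull

rng = np.random.default_rng(0)
points = rng.random((30, 2))     # invented 2D point cloud
hull = ConvexHull(points)

idx = ordered_hull_idx_2d(hull)  # hull vertex indices in boundary-walk order
print(points[idx])               # the hull polygon, one vertex per edge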
Example 4
    def __init__(self, data, features_name=None, alpha=1.0):

        self.data = data
        self.alpha = alpha

        self.n_features = data.shape[1]
        if features_name is None:
            self.features = [i for i in range(self.n_features)]
        else:
            self.features = features_name

        self.num_instances = data.shape[0]

        (self.log_probs, self.log_j_probs) = self.log_p_jp(
            self.data, self.n_features, self.num_instances)
        self.log_c_probs = self.log_cp(
            self.n_features, self.log_probs, self.log_j_probs)

        self.MI = self.cMI(self.n_features, self.log_probs, self.log_j_probs)

        " the tree is represented as a sequence of parents"

        mst = minimum_spanning_tree(-(self.MI + 1))
        dfs_tree = depth_first_order(mst, directed=False, i_start=0)

        self.tree = np.zeros(self.n_features, dtype=int)
        self.tree[0] = -1
        for p in range(1, self.n_features):
            self.tree[p] = dfs_tree[1][p]

        # computing the factored representation
        self.factors = np.zeros((self.n_features, 2, 2))
        self.factors = self.log_factors()

        self.data = None
        self.MI = None
        self.log_j_probs = None
        self.log_probs = None
        self.log_c_probs = None
        mst = None
        dfs_tree = None
Example 5
    def compute(self):
        cov = cov3D(self.k_neighbors)
        u, s, v = np.linalg.svd(cov)

        normals = u[:, :, -1]

        # Orient normals as in "Surface Reconstruction from Unorganized Points"

        max_z = self.pyntcloud.xyz.argmax(0)[-1]
        if normals[max_z, 2] < 0:
            normals[max_z] = -normals[max_z]
        
        # Dot product between each point's normal and the normals of its neighbours
        dot3D = 1 - abs(np.einsum("ij, ikj -> ik",
                                  normals,
                                  normals[self.k_neighbors_idx]))

        n = self.pyntcloud.xyz.shape[0]
        graph = np.zeros((n, n), dtype=np.float32)
        for i in range(n):
            graph[i, self.k_neighbors_idx[i]] = dot3D[i]

        MST = minimum_spanning_tree(csr_matrix(graph))
        DFO = depth_first_order(MST, max_z,
                                directed=False,
                                return_predecessors=False)
        """
        for i in range(1, len(DFO)):
            n1 = normals[DFO[i - 1]]
            n2 = normals[DFO[i]]
            if np.dot(n1, n2) < 0:
                normals[DFO[i]] *= -1
        """
        nx = normals[:, 0]
        ny = normals[:, 1]
        nz = normals[:, 2]

        k = self.k_neighbors.shape[1]
        self.to_be_added["nx({})".format(k)] = nx
        self.to_be_added["ny({})".format(k)] = ny
        self.to_be_added["nz({})".format(k)] = nz
Example 6
    def group_branches(self, graph, root, directed=False):
        """

        :param graph:
        :type graph: csr_matrix
        :param root:
        :type root: int
        :param directed: whether to treat the graph as directed
        :type directed: bool
        :return:
        :rtype: list(list(int))
        """

        dft, preds = sp.depth_first_order(graph, root, directed=directed,
                                          return_predecessors=True)
        branches = []
        current_branch = -1
        for node in dft[1:]:
            if preds[node] == root:
                current_branch += 1
                branches.append([])
            branches[current_branch].append(node)

        return branches
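
To see why opening a new branch whenever preds[node] == root works, here is a small standalone check (the 4-node tree is invented; sp is scipy.sparse.csgraph, as in the snippet above):

import numpy as np
import scipy.sparse.csgraph as sp
from scipy.sparse import csr_matrix

# A tree rooted at 0 with two branches: 0-1-2 and 0-3
graph = csr_matrix((np.ones(3), ([0, 0, 1], [1, 3, 2])), shape=(4, 4))

dft, preds = sp.depth_first_order(graph, 0, directed=False,
                                  return_predecessors=True)
print(dft)    # [0 1 2 3]: DFS exhausts one branch before starting the next
print(preds)  # nodes 1 and 3 have the root as predecessor, so two branches
# group_branches(graph, 0) would therefore return [[1, 2], [3]]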
Example 7
    def fit(self, X, m_priors, j_priors, alpha=1.0, sample_weight=None, scope=None, and_leaves=False, multilabel = False, n_labels=0, ml_tree_structure=0):
        """Fit the model to the data.

        Parameters
        ----------
        X : ndarray, shape=(n, m)
        The data array.

        m_priors: 
        the marginal priors for each feature
        
        j_priors: 
        the joint priors for each couple of features

        alpha: float, default=1.0
        the constant for the smoothing

        sample_weight: ndarray, shape=(n,)
        The weight of each sample.

        scope: 
        unique identifiers for the features

        and_leaves: boolean, default=False

        multilabel: boolean, default=False
        whether the CLTree is used for multilabel classification
        problems when imported by mlcsn.py

        n_labels: integer, default=0
        in case of multilabel classification problems, indicates the number of labels,
        assumed to be the last n_labels columns of X

        ml_tree_structure: integer, default=0
        in case of multilabel classification problems, indicates the structure of the
        tree to be learned. The set of features F is the union of A (the attributes)
        and Y (the labels):
        - 0, no constraint on the resulting tree
        - 1, the parent of each variable in Y must be in Y, while the parent of each
        variable in A can be in A or in Y: a label variable depends on a label
        variable; an attribute variable can depend on a label variable or on an attribute variable
        - 2, the parent of each variable in Y must be in Y, and the parent of each
        variable in A must be in Y: a label variable depends on a label variable; an
        attribute variable depends on a label variable
        
        """


        self.alpha = alpha
        self.and_leaves = and_leaves
        self.n_features = X.shape[1]

        rootTree = False
        if scope is None:
            self.scope = np.array([i for i in range(self.n_features)])
            rootTree = True
        else:
            self.scope = scope

        if sample_weight is None:
            self.n_samples = X.shape[0]
        else:
            self.n_samples = np.sum(sample_weight)


        (log_probs, log_j_probs) = self.compute_log_probs(X, sample_weight, m_priors, j_priors)


        MI = self.cMI(log_probs, log_j_probs)


        if multilabel:
            if ml_tree_structure == 1:
                MI[-n_labels:,-n_labels:] += np.max(MI)
            elif ml_tree_structure == 2:
                MI[-n_labels:,-n_labels:] += np.max(MI)
                MI[:-n_labels,:-n_labels] = 0
            elif ml_tree_structure == 3:
                MI[:-n_labels,:-n_labels] = 0
        
        " the tree is represented as a sequence of parents"

        mst = minimum_spanning_tree(-(MI))
        dfs_tree = depth_first_order(mst, directed=False, i_start=0)

        self.df_order = dfs_tree[0]
        self.post_order = dfs_tree[0][::-1]
        self.tree = np.zeros(self.n_features, dtype=int)
        self.tree[0] = -1
        for p in range(1, self.n_features):
            self.tree[p] = dfs_tree[1][p]

        
        penalization = logr(X.shape[0])/(2*X.shape[0])

        if self.and_leaves:
            for p in range(1, self.n_features):
                if MI[self.tree[p], p] < penalization:
                    self.tree[p] = -1
                    self.num_trees += 1
            if self.num_trees > 1:
                self._forest = True

        """
        selected_MI = []
        for p in range(1,self.n_features):
            selected_MI.append((p,MI[self.tree[p],p]))
        selected_MI.sort(key=lambda mi: mi[1], reverse=True)
        for p in range(10,self.n_features-1):
            self.tree[selected_MI[p][0]]=-1
        """

        if multilabel and rootTree:
            pX = 0
            for i in range(self.n_features-n_labels):
                if self.tree[i]>=(self.n_features-n_labels):
                    pX += 1
            pY = 0
            for i in range(self.n_features-n_labels,self.n_features):
                if self.tree[i]>=(self.n_features-n_labels):
                    pY += 1
                    
            print("Xs with Y parent: ", pX)
            print("Ys with Y parent: ", pY)            

        self.num_edges = self.n_features - self.num_trees
        # computing the factored representation
        self.log_factors = np.zeros((self.n_features, 2, 2))
        self.log_factors = compute_log_factors(self.tree, self.n_features, log_probs, log_j_probs, self.log_factors)
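
The effect of the ml_tree_structure biasing on the mutual information matrix is easiest to see on a toy case. A small sketch, assuming 4 features of which the last n_labels = 2 are the labels Y (the MI values are invented):

import numpy as np

n_labels = 2
MI = np.array([[0.0, 0.5, 0.2, 0.1],
               [0.5, 0.0, 0.3, 0.4],
               [0.2, 0.3, 0.0, 0.6],
               [0.1, 0.4, 0.6, 0.0]])  # last 2 rows/columns belong to Y

# ml_tree_structure == 1: boost the label-label block so the maximum
# spanning tree prefers connecting labels to labels
MI1 = MI.copy()
MI1[-n_labels:, -n_labels:] += np.max(MI1)

# ml_tree_structure == 2: additionally zero the attribute-attribute block,
# so attributes can only attach to labels
MI2 = MI1.copy()
MI2[:-n_labels, :-n_labels] = 0
print(MI2)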
Example 8
    def _learn_from_data(self,
                         data,
                         features=None,
                         n_feature_vals=2,
                         feature_vals=None,
                         alpha=0.1,
                         sparse=True,
                         mem_free=True):
        """
        Chow and Liu learning algorithm
        """
        #
        # this trick helps for sparse matrices
        # TODO: check if this cond is needed or the sparse dot is equal to
        # the dense one performance-wise
        if sparse:
            self._data = scipy.sparse.csr_matrix(data)
        else:
            self._data = data

        self._alpha = alpha
        self._n_features = data.shape[1]
        self._n_instances = data.shape[0]

        self.features = features

        #
        # assuming homogeneous features this could be restrictive
        # TODO: extend the whole code to categorical non homogeneous features
        self._feature_vals = feature_vals

        if self._feature_vals is None:
            self._feature_vals = \
                numpy.array([n_feature_vals
                             for i in range(self._n_features)])

        #
        # getting the max to pre-allocate the memory
        self._n_feature_vals = n_feature_vals
        if self._n_feature_vals is None:
            self._n_feature_vals = max(self._feature_vals)

        if self.features is None:
            self.features = numpy.array([i for i in range(self._n_features)])

        #
        # pre-allocating arrays for freqs and probs
        # self._marg_freqs = numpy.zeros(self._n_features)
        self._joint_freqs = numpy.zeros((self._n_features,
                                         self._n_features,
                                         self._n_feature_vals,
                                         self._n_feature_vals))
        self._log_marg_probs = numpy.zeros((self._n_features,
                                            self._n_feature_vals))
        self._log_joint_probs = numpy.zeros((self._n_features,
                                             self._n_features,
                                             self._n_feature_vals,
                                             self._n_feature_vals))
        self._log_cond_probs = numpy.zeros((self._n_features,
                                            self._n_features,
                                            self._n_feature_vals,
                                            self._n_feature_vals))
        self._mutual_info = numpy.zeros((self._n_features,
                                         self._n_features))

        #
        # computing freqs and probs (and smoothing)
        co_occ_matrix = self._data.T.dot(self._data)
        #
        # marginal frequencies
        if sparse:
            co_occ_matrix = numpy.array(co_occ_matrix.todense())
            self._marg_freqs = co_occ_matrix.diagonal()
        else:
            self._marg_freqs = co_occ_matrix.diagonal()

        self._log_marg_probs = self.log_marg_probs(self._marg_freqs,
                                                   self._log_marg_probs)
        #
        # joint estimation
        self._joint_freqs = self.joint_freqs(self._joint_freqs,
                                             co_occ_matrix)
        self._log_joint_probs = self.log_joint_probs(self._joint_freqs,
                                                     self._log_joint_probs)
        #
        # conditional estimation
        self._log_cond_probs = self.log_cond_probs(self._log_marg_probs,
                                                   self._log_joint_probs,
                                                   self._log_cond_probs)
        self._mutual_info = self.mutual_information(self._log_marg_probs,
                                                    self._log_joint_probs,
                                                    self._mutual_info)

        #
        # computing the MST (this way we are not overwriting mutual_info)
        # this can be useful for testing but not for efficiency
        # mst = minimum_spanning_tree(-self._mutual_info, copy=copy_mi)
        mst = minimum_spanning_tree(-(self._mutual_info + 1))
        dfs_tree = depth_first_order(mst, directed=False, i_start=0)

        #
        # representing the CLTree as a sequence of parents ids
        self._tree = numpy.zeros(self._n_features, dtype=int)

        # self._tree[0] = -1
        # the root is its parent
        self._tree[0] = 0

        for feature in range(1, self._n_features):
            self._tree[feature] = dfs_tree[1][feature]

        #
        # computing the factored representation
        self._factors = numpy.zeros((self._n_features,
                                     self._n_feature_vals,
                                     self._n_feature_vals))
        self._factors = self.log_factors(self._log_marg_probs,
                                         self._log_joint_probs,
                                         self._factors)

        #
        # removing references, this is optional for test purposes
        if mem_free:
            self._mutual_info = None
            self._joint_freqs = None
            self._log_marg_probs = None
            self._log_joint_probs = None
            self._log_cond_probs = None
            self._marg_freqs = None
            self._data = None
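
The co-occurrence trick that drives the frequency estimation above is worth isolating: for binary data, X.T.dot(X) counts, for each pair of features, the rows where both are 1, and its diagonal holds the marginal counts. A minimal sketch with invented data:

import numpy as np

X = np.array([[1, 1, 0],
              [1, 0, 1],
              [1, 1, 1],
              [0, 1, 0]])

co_occ = X.T.dot(X)
print(co_occ)             # co_occ[i, j] = rows where features i and j are both 1
print(co_occ.diagonal())  # [3 3 2]: the marginal count of each feature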
Example 9
import numpy as np
from scipy import sparse
import scipy.sparse.csgraph as csg

import depth_first  # local module being benchmarked against scipy

i, j = zip((0, 1),
           (0, 2),
           (1, 2),
           (2, 3),
           (2, 5),
           (3, 4),
           (3, 6),
           (4, 6),
           (5, 7))

g = sparse.csr_matrix((np.ones(len(i)),(i,j)), shape=(8,8))
g = g + g.T

import timeit

o, p = depth_first.depth_first_order(g, 0)
print("order:", o)
print("pred :", p)

o, p = csg.depth_first_order(g, 0)
print("order:", o)
print("pred :", p)

# timing
print("my:", timeit.timeit('o, p = depth_first.depth_first_order(g, 0)', setup='from __main__ import *', number=100000))
print("scipy:", timeit.timeit('o, p = csg.depth_first_order(g, 0)', setup='from __main__ import *', number=100000))