Exemple #1
0
    def page_rank_nibble(self, g, ref_node, vol, phi = 0.5, algorithm = 'fista', epsilon = 1.0e-2, max_iter = 10000, max_time = 100, cpp = True):
        """
           DESCRIPTION
           -----------

           Page Rank Nibble Algorithm. For details please refer to:
           R. Andersen, F. Chung and K. Lang. Local Graph Partitioning using PageRank Vectors
           link: http://www.cs.cmu.edu/afs/cs/user/glmiller/public/Scientific-Computing/F-11/RelatedWork/local_partitioning_full.pdf
           The algorithm works on the connected component that the given reference node belongs to.

           The teleportation parameter alpha is derived from phi and m = nnz(g.A)/2,
           and the l1-regularization parameter rho is derived from vol and m. The
           selected solver produces a vector p, which is then rounded by a sweep cut.

           PARAMETERS (mandatory)
           ----------------------

           g:         graph object
                      Must expose the sparse adjacency matrix as g.A; the C++ FISTA
                      path additionally reads g.d, g.d_sqrt and g.dn_sqrt.

           ref_node:  integer
                      The reference node, i.e., node of interest around which
                      we are looking for a target cluster.

           vol:       float, double
                      Lower bound for the volume of the output cluster.
                      Must be non-negative; 0 is treated as 1 when deriving rho.

           PARAMETERS (optional)
           ---------------------

           phi: float, double
                default == 0.5
                Target conductance for the output cluster.

           algorithm: string
                      default == 'fista'
                      Algorithm for spectral local graph clustering
                      Options: 'fista', 'ista', 'acl'.

           epsilon: float, double
                    default = 1.0e-2
                    Termination tolerance for l1-regularized PageRank, i.e., applies to FISTA and ISTA algorithms
                    (it is not passed to ACL).

           max_iter: integer
                     default = 10000
                     Maximum number of iterations of FISTA, ISTA or ACL.

           max_time: float, double
                     default = 100
                     Maximum time in seconds

           cpp: boolean
                default = True
                Use the faster C++ version of FISTA or not. Independently of the
                chosen algorithm, this flag also selects the sweep routine:
                sweep_cut_cpp when True, sweep_normalized when False.

           RETURNS
           -------

           On success nothing is returned; the output can be accessed from the
           localCluster object that calls this function.

           If cpp = False then the output is:

               node_embedding_nibble: numpy array, float
                                      Approximate personalized PageRank vector

               best_cluster_nibble: list
                                    A list of nodes that correspond to the cluster with the best
                                    conductance that was found by the algorithm.

               best_conductance_nibble: float
                                        Conductance value that corresponds to the cluster with the best
                                        conductance that was found by the algorithm.

               sweep_profile_nibble: list of objects
                                     A two dimensional list of objects. For example,
                                     sweep_profile[0] contains a numpy array with all conductances for all
                                     clusters that were calculated by sweep_cut.
                                     sweep_profile[1] is a multidimensional list that contains the indices
                                     of all clusters that were calculated by sweep_cut. For example,
                                     sweep_profile[1][5] is a list that contains the indices of the 5th cluster
                                     that was calculated by sweep_cut. The set of indices in sweep_profile[1][5] also
                                     corresponds to the conductance in sweep_profile[0][5]. The number of clusters is
                                     unknown a priori and depends on the data and the parameter setting of the algorithm.

               volume_profile_nibble: list of objects
                                     A two dimensional list of objects which stores information about clusters
                                     which have volume larger than the input vol and less than 2/3 of the volume
                                     of the whole graph. For example, volume_profile[0] contains a list
                                     with all conductances for all clusters that were calculated by sweep_cut and
                                     also satisfy the previous volume constraint.
                                     volume_profile[1] is a multidimensional list that contains the indices
                                     of all clusters that were calculated by sweep_cut and also satisfy the previous
                                     volume constraint. For example, volume_profile[1][5] is a list that contains the
                                     indices of the 5th cluster that was calculated by sweep_cut and also satisfies
                                     the previous volume constraint. The set of indices in volume_profile[1][5] also
                                     corresponds to the conductance in volume_profile[0][5]. The number of clusters is
                                     unknown a priori and depends on the data and the parameter setting of the algorithm.

           If cpp = True then the output is:

               node_embedding_nibble: numpy array, float
                                      Approximate personalized PageRank vector

               best_cluster_nibble: list
                                    A list of nodes that correspond to the cluster with the best
                                    conductance that was found by the algorithm.

               best_conductance_nibble: float
                                        Conductance value that corresponds to the cluster with the best
                                        conductance that was found by the algorithm.

           ERROR RETURNS
           -------------

           If vol < 0 a message is printed and a tuple of five empty lists is returned.
           If algorithm is not one of the supported options a message is printed and a
           tuple of four empty lists is returned. In both cases no attribute is set.
        """
        # Reject an invalid volume before doing any work on the graph.
        if vol < 0:
            print("The input volume must be non-negative")
            return [], [], [], [], []

        n = g.A.shape[0]
        nodes = range(n)

        # Half the number of stored non-zeros of the adjacency matrix.
        m = g.A.count_nonzero()/2
        B = np.log2(m)

        # A zero volume would give log2(0); fall back to 1 in that case.
        vol_user = 1 if vol == 0 else vol
        b = min(1 + np.log2(vol_user), B)

        alpha = (phi**2)/(225*np.log(100*np.sqrt(m)))
        rho = (1/(2**b))*(1/(48*B))

        if algorithm == 'fista':
            if not cpp:
                p = fista_dinput_dense(ref_node, g, alpha = alpha, rho = rho, epsilon = epsilon, max_iter = max_iter, max_time = max_time)
            else:
                uint_indptr = np.uint32(g.A.indptr)
                uint_indices = np.uint32(g.A.indices)

                # Only the third element of the solver output (the vector p) is used.
                (_, _, p) = proxl1PRaccel(uint_indptr, uint_indices, g.A.data, ref_node, g.d, g.d_sqrt, g.dn_sqrt, alpha = alpha, rho = rho, epsilon = epsilon, maxiter = max_iter, max_time = max_time)
        elif algorithm == 'ista':
            p = ista_dinput_dense(ref_node, g, alpha = alpha, rho = rho, epsilon = epsilon, max_iter = max_iter, max_time = max_time)
        elif algorithm == 'acl':
            p = acl_list(ref_node, g, alpha = alpha, rho = rho, max_iter = max_iter, max_time = max_time)
        else:
            # NOTE(review): four empty lists here versus five for a negative
            # volume; the arity is kept as is for backward compatibility.
            print("There is no such algorithm provided")
            return [], [], [], []

        sweep = sweepCut()

        if not cpp:
            sweep.sweep_normalized(p, g, vol)

            # Indexing range(n) returns the index itself for in-range,
            # non-negative j; the remapping is kept so the stored lists are
            # built exactly as before.
            for i in range(len(sweep.sweep_profile[0])):
                sweep.sweep_profile[1][i] = [nodes[j] for j in sweep.sweep_profile[1][i]]

            for i in range(len(sweep.volume_profile[0])):
                sweep.volume_profile[1][i] = [nodes[j] for j in sweep.volume_profile[1][i]]

            sweep.best_cluster = [nodes[i] for i in sweep.best_cluster]

            # The profiles are only exposed by the Python sweep.
            self.sweep_profile_nibble = sweep.sweep_profile
            self.volume_profile_nibble = sweep.volume_profile
        else:
            sweep.sweep_cut_cpp(p, g)

        # Results common to both sweep implementations.
        self.node_embedding_nibble = p
        self.best_cluster_nibble = sweep.best_cluster
        self.best_conductance_nibble = sweep.best_conductance
Exemple #2
0
    def fista(self, ref_node, g, alpha = 0.15, rho = 1.0e-5, epsilon = 1.0e-6, max_iter = 10000, vol_G = -1, max_time = 100, cpp = True):
        """DESCRIPTION
           -----------

           Fast Iterative Soft Thresholding Algorithm (FISTA). This algorithm solves the l1-regularized
           personalized PageRank problem using an accelerated version of ISTA. It rounds the solution
           using sweep cut.

           The l1-regularized personalized PageRank problem is defined as

           min rho*||p||_1 + <c,p> + <p,Q*p>

           where p is the PageRank vector, ||p||_1 is the l1-norm of p, rho is the regularization parameter
           of the l1-norm, c is the right hand side of the personalized PageRank linear system and Q is the
           symmetrized personalized PageRank matrix.

           For details please refer to:
           K. Fountoulakis, F. Roosta-Khorasani, J. Shun, X. Cheng and M. Mahoney. Variational
           Perspective on Local Graph Clustering. arXiv:1602.01886, 2017.
           arXiv link:https://arxiv.org/abs/1602.01886

           PARAMETERS (mandatory)
           ----------------------

           ref_node: integer
                     The reference node, i.e., node of interest around which
                     we are looking for a target cluster.

           g: graph object
              The C++ path reads g.A (indptr, indices, data), g.d, g.d_sqrt and g.dn_sqrt.

           PARAMETERS (optional)
           ---------------------

           alpha: float, double
                  default == 0.15
                  Teleportation parameter of the personalized PageRank linear system.
                  The smaller the more global the personalized PageRank vector is.

           rho:   float, double
                  default == 1.0e-5
                  Regularization parameter for the l1-norm of the model.

           For details of these parameters please refer to: K. Fountoulakis, F. Roosta-Khorasani,
           J. Shun, X. Cheng and M. Mahoney. Variational Perspective on Local Graph Clustering. arXiv:1602.01886, 2017
           arXiv link:https://arxiv.org/abs/1602.01886

           epsilon: float, double
                    default == 1.0e-6
                    Tolerance for FISTA for solving the l1-regularized personalized PageRank problem.

           max_iter: integer
                     default = 10000
                     Maximum number of iterations of FISTA.

           vol_G: float, double
                  default = -1
                  Not used by this method; it is only kept in the signature.

           max_time: float, double
                     default = 100
                     Maximum time in seconds

           cpp: boolean
                default = True
                Use the faster C++ version of FISTA (and of the sweep cut) or not.

           RETURNS
           -------

           Nothing is returned; the output can be accessed from the localCluster
           object that calls this function.

           If cpp = False then the output is:

               node_embedding_fista: numpy array, float
                                     Approximate personalized PageRank vector

               best_cluster_fista: list
                                   A list of nodes that correspond to the cluster with the best
                                   conductance that was found by FISTA.

               best_conductance_fista: float, double
                                       Conductance value that corresponds to the cluster with the best
                                       conductance that was found by FISTA.

               sweep_profile_fista: list of objects
                                    A two dimensional list of objects. For example,
                                    sweep_profile[0] contains a numpy array with all conductances for all
                                    clusters that were calculated by sweep_cut.
                                    sweep_profile[1] is a multidimensional list that contains the indices
                                    of all clusters that were calculated by sweep_cut. For example,
                                    sweep_profile[1][5] is a list that contains the indices of the 5th cluster
                                    that was calculated by sweep_cut. The set of indices in sweep_profile[1][5] also
                                    corresponds to the conductance in sweep_profile[0][5]. The number of clusters is
                                    unknown a priori and depends on the data and the parameter setting of FISTA.

           If cpp = True then the output is:

               node_embedding_fista: numpy array, float
                                     Approximate personalized PageRank vector

               best_cluster_fista: list
                                   A list of nodes that correspond to the cluster with the best
                                   conductance that was found by FISTA.

               best_conductance_fista: float, double
                                       Conductance value that corresponds to the cluster with the best
                                       conductance that was found by FISTA.
        """
        sweep = sweepCut()

        if not cpp:
            self.node_embedding_fista = fista_dinput_dense(ref_node, g, alpha = alpha, rho = rho, epsilon = epsilon, max_iter = max_iter, max_time = max_time)

            sweep.sweep_normalized(self.node_embedding_fista, g)

            # The sweep profile is only exposed by the Python sweep.
            self.sweep_profile_fista = sweep.sweep_profile
        else:
            uint_indptr = np.uint32(g.A.indptr)
            uint_indices = np.uint32(g.A.indices)

            # Only the third element of the solver output (the vector) is kept.
            (_, _, self.node_embedding_fista) = proxl1PRaccel(uint_indptr, uint_indices, g.A.data, ref_node, g.d, g.d_sqrt, g.dn_sqrt, alpha = alpha, rho = rho, epsilon = epsilon, maxiter = max_iter, max_time = max_time)

            sweep.sweep_cut_cpp(self.node_embedding_fista, g)

        # Results common to both sweep implementations.
        self.best_cluster_fista = sweep.best_cluster
        self.best_conductance_fista = sweep.best_conductance
def page_rank_nibble_algo(g,
                          ref_node,
                          vol,
                          phi=0.5,
                          algorithm='fista',
                          epsilon=1.0e-2,
                          max_iter=10000,
                          max_time=100,
                          cpp=True):
    """
    Page Rank Nibble Algorithm. For details please refer to: 
    R. Andersen, F. Chung and K. Lang. Local Graph Partitioning using PageRank Vectors
    link: http://www.cs.cmu.edu/afs/cs/user/glmiller/public/Scientific-Computing/F-11/RelatedWork/local_partitioning_full.pdf
    The algorithm works on the connected component that the given reference node belongs to.

    The teleportation parameter alpha is derived from phi and
    m = nnz(g.adjacency_matrix)/2, and the l1-regularization parameter rho is
    derived from vol and m. The selected solver is then run with these values
    and its output vector is returned; no sweep cut is performed here.

    Parameters (mandatory)
    ----------------------

    g: graph object
        Must expose the sparse adjacency matrix as g.adjacency_matrix; the C++
        FISTA path additionally reads g.d, g.d_sqrt, g.dn_sqrt and g.lib.

    ref_node:  integer
        The reference node, i.e., node of interest around which
        we are looking for a target cluster.

    vol: float, double
        Lower bound for the volume of the output cluster.
        Must be non-negative; 0 is treated as 1 when deriving rho.

    Parameters (optional)
    ---------------------

    phi: float64
        Default == 0.5
        Target conductance for the output cluster.

    algorithm: string
        Default == 'fista'
        Algorithm for spectral local graph clustering
        Options: 'fista', 'ista', 'acl'.

    epsilon: float64
        Default = 1.0e-2
        Termination tolerance for l1-regularized PageRank, i.e., applies to FISTA and ISTA algorithms

    max_iter: int
        default = 10000
        Maximum number of iterations of FISTA, ISTA or ACL.

    max_time: float64
        default = 100
        Maximum time in seconds

    cpp: bool
        default = True
        Use the faster C++ version of FISTA or not.
        Only consulted when algorithm == 'fista'.

    Returns
    -------

    The vector produced by the selected solver. For the C++ FISTA path the
    element-wise absolute value of the solver output is returned.

    If vol < 0 a message is printed and a tuple of five empty lists is
    returned instead (kept for backward compatibility).

    Raises
    ------

    Exception
        If algorithm is not one of 'fista', 'ista', 'acl'.
    """
    # Reject an invalid volume before doing any work on the graph.
    # NOTE(review): this returns a tuple of lists rather than raising, unlike
    # the unknown-algorithm case below; behaviour kept as is for callers.
    if vol < 0:
        print("The input volume must be non-negative")
        return [], [], [], [], []

    # Half the number of stored non-zeros of the adjacency matrix.
    m = g.adjacency_matrix.count_nonzero() / 2
    B = np.log2(m)

    # A zero volume would give log2(0); fall back to 1 in that case.
    vol_user = 1 if vol == 0 else vol
    b = min(1 + np.log2(vol_user), B)

    alpha = (phi**2) / (225 * np.log(100 * np.sqrt(m)))
    rho = (1 / (2**b)) * (1 / (48 * B))

    if algorithm == 'fista':
        if not cpp:
            p = fista_dinput_dense(ref_node,
                                   g,
                                   alpha=alpha,
                                   rho=rho,
                                   epsilon=epsilon,
                                   max_iter=max_iter,
                                   max_time=max_time)
        else:
            uint_indptr = np.uint32(g.adjacency_matrix.indptr)
            uint_indices = np.uint32(g.adjacency_matrix.indices)

            # Only the third element of the solver output (the vector) is used.
            (_, _, p) = proxl1PRaccel(uint_indptr,
                                      uint_indices,
                                      g.adjacency_matrix.data,
                                      ref_node,
                                      g.d,
                                      g.d_sqrt,
                                      g.dn_sqrt,
                                      g.lib,
                                      alpha=alpha,
                                      rho=rho,
                                      epsilon=epsilon,
                                      maxiter=max_iter,
                                      max_time=max_time)
            p = np.abs(p)
    elif algorithm == 'ista':
        p = ista_dinput_dense(ref_node,
                              g,
                              alpha=alpha,
                              rho=rho,
                              epsilon=epsilon,
                              max_iter=max_iter,
                              max_time=max_time)
    elif algorithm == 'acl':
        p = acl_list(ref_node,
                     g,
                     alpha=alpha,
                     rho=rho,
                     max_iter=max_iter,
                     max_time=max_time)
    else:
        raise Exception("There is no such algorithm provided")

    return p
    def produce(self,
                inputs: Sequence[Input],
                ref_nodes: Sequence[int],
                ys: Sequence[Sequence[float]] = None,
                timeout: float = 100,
                iterations: int = 1000,
                alpha: float = 0.15,
                rho: float = 1.0e-6,
                epsilon: float = 1.0e-2,
                cpp: bool = True) -> Sequence[Output]:
        """
        Computes an l1-regularized PageRank vector. 
        
        Uses the Fast Iterative Soft Thresholding Algorithm (FISTA). This algorithm solves the l1-regularized
        personalized PageRank problem.

        The l1-regularized personalized PageRank problem is defined as

        min rho*||p||_1 + <c,p> + <p,Q*p>

        where p is the PageRank vector, ||p||_1 is the l1-norm of p, rho is the regularization parameter 
        of the l1-norm, c is the right hand side of the personalized PageRank linear system and Q is the 
        symmetrized personalized PageRank matrix.

        For details please refer to: 
        K. Fountoulakis, F. Roosta-Khorasani, J. Shun, X. Cheng and M. Mahoney. Variational 
        Perspective on Local Graph Clustering. arXiv:1602.01886, 2017.
        arXiv link:https://arxiv.org/abs/1602.01886

        Parameters
        ----------

        inputs: Sequence[Graph]
            One graph per problem. The C++ path reads adjacency_matrix
            (indptr, indices, data), d, d_sqrt and dn_sqrt from each graph.

        ref_nodes: Sequence[int]
            A sequence of reference nodes, i.e., nodes of interest around which
            we are looking for a target cluster. ref_nodes[i] is paired with inputs[i].

        Parameters (optional)
        ---------------------

        ys: Sequence[Sequence[float]]
            Default == None
            Initial solutions for l1-regularized PageRank algorithm; ys[i] is
            paired with inputs[i]. When omitted, no initial solution is passed to
            the solver (presumably it then starts from zero - TODO confirm).
            This is only used for the C++ version of FISTA.

        alpha: float
            Default == 0.15
            Teleportation parameter of the personalized PageRank linear system.
            The smaller the more global the personalized PageRank vector is.

        rho: float
            Default == 1.0e-6
            Regularization parameter for the l1-norm of the model.
            
        epsilon: float64
            Default == 1.0e-2
            Tolerance for FISTA for solving the l1-regularized personalized PageRank problem.
            
        iterations: int
            Default = 1000
            Maximum number of iterations of FISTA algorithm.
                     
        timeout: float
            Default = 100
            Maximum time in seconds.
            
        cpp: boolean
            Default = True
            Use the faster C++ version of FISTA or not.            
            
        Returns
        -------
        
        A list with one entry per graph in inputs, in the same order:
        the output of fista_dinput_dense when cpp is False, or the third
        element of the proxl1PRaccel output when cpp is True
        (an np.ndarray 1D embedding of the nodes of that graph).
        """

        if not cpp:
            return [
                fista_dinput_dense(ref_nodes[i],
                                   inputs[i],
                                   alpha=alpha,
                                   rho=rho,
                                   epsilon=epsilon,
                                   max_iter=iterations,
                                   max_time=timeout)
                for i in range(len(inputs))
            ]

        embeddings = []
        for i in range(len(inputs)):
            graph = inputs[i]
            solver_args = [
                np.uint32(graph.adjacency_matrix.indptr),
                np.uint32(graph.adjacency_matrix.indices),
                graph.adjacency_matrix.data,
                ref_nodes[i],
                graph.d,
                graph.d_sqrt,
                graph.dn_sqrt,
            ]
            # Identity test on purpose: `ys == None` is evaluated element-wise
            # for NumPy arrays and cannot be used as a condition.
            if ys is not None:
                solver_args.append(ys[i])

            # Only the third element of the solver output (the vector) is kept.
            embeddings.append(
                proxl1PRaccel(*solver_args,
                              alpha=alpha,
                              rho=rho,
                              epsilon=epsilon,
                              maxiter=iterations,
                              max_time=timeout)[2])
        return embeddings
def multiclass_label_prediction_algo(labels, g, alpha = 0.15, rho = 1.0e-10, epsilon = 1.0e-2, max_iter = 10000, max_time = 100, cpp = True):
    """
    This function predicts labels for unlabelled nodes. For details refer to:
    D. Gleich and M. Mahoney.
    Using Local Spectral Methods to Robustify Graph-Based Learning Algorithms. SIGKDD 2015.
    https://www.stat.berkeley.edu/~mmahoney/pubs/robustifying-kdd15.pdf

    For every class one l1-regularized personalized PageRank problem is solved
    with the labelled nodes of that class as reference nodes. The nodes are
    ranked by decreasing diffusion value per class and each node is assigned
    to the class in which it is ranked best.

    Parameters (mandatory)
    ----------------------

    labels: list of lists
        Each list of this list corresponds to indices of nodes that are assumed to belong in
        a certain class. For example, list[i] is a list of indices of nodes that are assumed to 
        belong in class i.

    g: graph object
        Must expose the sparse adjacency matrix as g.adjacency_matrix; the C++
        path additionally reads g.d, g.d_sqrt and g.dn_sqrt.

    Parameters (optional)
    ---------------------

    alpha: float, double
        Default == 0.15
        Teleportation parameter of the personalized PageRank linear system.
        The smaller the more global the personalized PageRank vector is.

    rho: float, double
        Default == 1.0e-10
        Regularization parameter for the l1-norm of the model.

    epsilon: float, double
        Default == 1.0e-2
        Tolerance for FISTA for solving the l1-regularized personalized PageRank problem.

    max_iter: integer
        Default = 10000
        Maximum number of iterations of FISTA

    max_time: float, double
        Default = 100
        Maximum time in seconds

    cpp: bool
        default = True
        Use the faster C++ version of FISTA or not.

    Returns
    -------

    A list of three objects.

    output 0: a list of numpy arrays. Array j is the diffusion vector
        returned by personalized PageRank for class j. For details see [1].

    output 1: a list of numpy integer arrays. Array j holds the rank of
        every node for class j (rank 0 is the largest diffusion value).
        For details see [1].

    output 2: list of indices that holds the class for each node.
        For example output[2][i] is the class of node i, i.e. the first class
        in which node i attains its smallest rank. If labels is empty every
        node is assigned the value len(labels) + 1, i.e. 1.

    [1] D. Gleich and M. Mahoney.
    Using Local Spectral Methods to Robustify Graph-Based Learning Algorithms. SIGKDD 2015.
    https://www.stat.berkeley.edu/~mmahoney/pubs/robustifying-kdd15.pdf
    """

    n = g.adjacency_matrix.shape[0]

    output = [[],[],[]]

    if cpp:
        # The index arrays do not depend on the class, so convert them once.
        uint_indptr = np.uint32(g.adjacency_matrix.indptr)
        uint_indices = np.uint32(g.adjacency_matrix.indices)

    for labels_i in labels:

        if not cpp:
            output_fista = fista_dinput_dense(labels_i, g, alpha = alpha, rho = rho, epsilon = epsilon, max_iter = max_iter, max_time = max_time)
        else:
            # Only the third element of the solver output (the vector) is used.
            (_, _, output_fista) = proxl1PRaccel(uint_indptr, uint_indices, g.adjacency_matrix.data, labels_i, g.d, g.d_sqrt, g.dn_sqrt, alpha = alpha, rho = rho, epsilon = epsilon, maxiter = max_iter, max_time = max_time)

        # Float copy of the first n entries of the solver output.
        p = np.array(output_fista[:n], dtype=float)

        output[0].append(p)

        # rank[v] is the position of node v when nodes are sorted by decreasing p.
        index = (-p).argsort(axis=0)
        rank = np.empty(n, int)
        rank[index] = np.arange(n)

        output[1].append(rank)

    l_labels = len(labels)

    if l_labels:
        # argmin returns the first minimum, matching a left-to-right scan
        # that only accepts strictly smaller ranks.
        output[2].extend(np.argmin(np.vstack(output[1]), axis=0).tolist())
    else:
        # No class to choose from: keep the historical placeholder value.
        output[2].extend([l_labels + 1] * n)

    return output