Ejemplo n.º 1
0
def factorize(data, seed, post, norm, gt, rank, max_iter=200):
    """
    Factorize ``data`` with NMF so that data ~= W * H.

    The cost function is never evaluated (it is too slow to compute), so the
    iteration count is the only stopping criterion.

    :param data: matrix to factorize; handed unchanged to initialize() and
        euclidean_update().
    :param seed: initialization scheme, forwarded to initialize().
    :param post: post-processing scheme, forwarded to dl.get_targets() when
        monitoring is enabled.
    :param norm: normalization scheme, forwarded to initialize() and
        euclidean_update().
    :param gt: groundtruth cluster assignment, or None. It is only used to
        monitor accuracy / F1 of each iteration; None disables monitoring.
    :param rank: number of latent factors, forwarded to initialize().
    :param max_iter (type: int): iteration bound. The bound is inclusive
        (kept from the original ``iter <= max_iter`` loop), so the update
        rules are applied max_iter + 1 times.

    Return: W (m*k) and H (k*n) matrix.
    """
    V = data
    W, H = initialize(V, rank, seed=seed, norm=norm)

    # Identity test on purpose: `gt != None` is evaluated element-wise when gt
    # is a NumPy array, which makes the `if` raise "truth value is ambiguous".
    monitor = gt is not None

    for iteration in range(max_iter + 1):
        if monitor:
            # The clustering is only needed for the metrics, so it is skipped
            # entirely when no groundtruth is given.
            targets = dl.get_targets(W.T, post)
            accuracy = metrics.accuracy(gt, targets)
            f1 = metrics.f_measure(gt, targets)
            # Debug hook: enable the next line to trace convergence.
            # print("Iter = %d, Acc = %f, F1 = %f" % (iteration, accuracy, f1))

        W, H = euclidean_update(V, W, H, norm)
        W, H = _adjustment(W, H)

    return W, H
Ejemplo n.º 2
0
def conmf(datas,
          cluster_size,
          weights,
          regu_weights,
          norm="l2",
          seed="k-means",
          post="direct",
          method="pair-wise",
          gt=None,
          max_iter=100):
    """
    CoNMF on multi-view dataset (represented as datas).

    :param datas (type: list<csr_matrix>). [data_k]
        Each element data_k is a sparse matrix: `scipy.sparse` of format csr
    :param cluster_size (type: int). Number of clusters
    :param weights (type: list<int>). [weight_s].
        Each element weight_s is an integer, denoting the weight of the view in CoNMF factorization (i.e. lambda_s in paper [1])
    :param regu_weights (type: list, 2-dimension). [[weight_st]]
        Each element weight_st is an integer, denoting the weight of view pair <s,t> in CoNMF regularization (i.e. lambda_st in paper [1])
    :param norm (type: string). Normalization scheme in CoNMF initialization and factorization. Values can be 'l2', 'l1' and 'l0':
        'l2': each item vector is normalized by its Euclidean length (i.e. l2 distance).
        'l1': each item vector is normalized by its sum of all elements (i.e. l1 distance).
        'l0': the whole matrix is normalized by the sum of all elements (i.e. l1 normalization on the whole matrix).
    :param seed (type: string). Initialization method in CoNMF. Values can be 'k-means' and 'random':
        'k-means': initialize W and H matrix using k-means results. The details are seen in paper [1] Section 4.5
        'random': randomly initialize W and H matrix.
    :param post (type: string). Post processing on W matrix (m*k) to generate clustering result. Values can be 'direct' and 'k-means':
        'direct': for each item vector (m*1), use the element with largest value as its cluster assignment.
        'k-means': perform k-means on W matrix to get cluster assignment.
    :param method (type: string). Regularization method of CoNMF. Currently support two ways:
        'pair-wise': pair-wise NMF, details in paper [1] Section 4.3
        'cluster-wise': cluster-wise NMF, details in paper [1] Section 4.4
        Note: experiments in paper [1] indicate 'pair-wise' performs slightly better than 'cluster-wise'. And also, 'pair-wise' is more efficient.
    :param gt (type: list). Groundtruth of clustering. Each element gt_i represents the cluster assignment of item_i.
        Note: this variable is only used for monitoring the performance in each iteration of CoNMF.
        It is forwarded as-is to conmf_factorize(); whether None is tolerated depends on that function.
    :param max_iter (type: int). Iteration bound forwarded to conmf_factorize(). Defaults to 100, the value
        that used to be hard-coded here.

    Return:
    None if the inputs are rejected (length mismatch between datas and weights, or a method
    that conmf_factorize() does not support). Otherwise a tuple of:
    targets (type: list<int>): Cluster assignment of items. Each element represents the item's cluster id.
    Ws (type: list<csr_matrix>): W matrix of each view after CoNMF. Each element is a sparse matrix denoting the view's W matrix.
    Hs (type: list<csr_matrix>): H matrix of each view after CoNMF. Each element is a sparse matrix denoting the view's H matrix.
    """

    if len(datas) != len(weights):
        # Single-argument print(...) prints the same text as the Python 2
        # print statement and is also valid Python 3.
        print("Error! Length of datas != length of weights.")
        return None

    # Normalize the data of each view.
    datas_norm = [dl.norm_data(data, norm) for data in datas]

    factors = conmf_factorize(method, datas_norm, weights, regu_weights, seed,
                              post, norm, max_iter, cluster_size, gt)
    if factors is None:
        # conmf_factorize() already reported the problem (unsupported method);
        # propagate the failure instead of crashing on tuple unpacking.
        return None
    Ws, Hs = factors

    # By default, use the clustering result in last view as output for eval.
    targets = dl.get_targets(Ws[-1].T, post)
    return targets, Ws, Hs
Ejemplo n.º 3
0
def svd(data, k, norm, post="k-means", latent_K=-1):
    """
    Cluster ``data`` from the left singular vectors of a truncated SVD.

    :param data: input matrix, normalized with dl.norm_data() before the SVD.
    :param k: number of clusters; passed to dl.get_targets() as ``k`` and also
        used as the number of singular triplets when latent_K is -1.
    :param norm: normalization scheme forwarded to dl.norm_data().
    :param post: post-processing scheme forwarded to dl.get_targets().
    :param latent_K: number of singular triplets to compute; -1 means "use k".

    Return: the result of dl.get_targets() on the transposed left singular
    vectors.
    """
    normalized = dl.norm_data(data, norm)

    # -1 is the sentinel for "no separate latent dimension requested".
    n_factors = k if latent_K == -1 else latent_K

    # Only the left singular vectors are needed; singular values and the
    # right singular vectors are discarded.
    left_vectors = svds(normalized, n_factors, which='LM')[0]

    return dl.get_targets(sp.csr_matrix(left_vectors).T, post, k=k)
Ejemplo n.º 4
0
def nmf(data, k, norm="l2", seed="random", post="direct", gt=None):
    """
    NMF with Euclidean distance as the cost function.

    :param data: input matrix; it is normalized with dl.norm_data() before
        being factorized.
    :param k: rank of the factorization, passed to factorize().
    :param norm: normalization scheme. It is applied both to the input data
        and inside factorize(), so the two stay consistent (previously the
        data was always normalized with "l2" regardless of this argument).
    :param seed: initialization scheme, forwarded to factorize().
    :param post: post-processing scheme used to derive the cluster assignment
        from W.
    :param gt: groundtruth forwarded to factorize() for monitoring.

    For the accepted values of norm, seed and post, please refer to
    conmf.conmf().

    Return: (targets, W, H) where targets is the result of dl.get_targets()
    on W.T, W is m*k and H is k*n.
    """
    data_norm = dl.norm_data(data, norm)

    W, H = factorize(data_norm, seed, post, norm, gt, k)  # W is m*k, H is k*n

    targets = dl.get_targets(W.T, post)  # clustering results.

    return targets, W, H
Ejemplo n.º 5
0
def conmf_factorize(method,
                    datas,
                    weights,
                    regu_weights,
                    seed,
                    post,
                    norm,
                    max_iter,
                    rank,
                    gt=None):
    """
    Factorization process of CoNMF.

    :param method (type: string). Regularization method; must be "pair-wise" or "cluster-wise".
    :param max_iter (type: int). Maximum iterations of executing CoNMF update rules.
        The bound is inclusive (kept from the original ``iter_num <= max_iter`` loop), so the
        update rules are applied max_iter + 1 times.
    :param rank (type: int). Number of latent factors in NMF factorization. For clustering application, it is typically set as the number of clusters.
    :param gt. Groundtruth cluster assignment used only to print accuracy / F1 before the first
        update ("CoNMF Inits") and before the last update ("CoNMF Ends"). When it is None the
        metrics and both printouts are skipped instead of crashing inside the metric functions.

    Other parameters are with same meaning of conmf().

    Return: (Ws, Hs) after the updates, or None when ``method`` is not supported.
    """
    if method not in ("pair-wise", "cluster-wise"):
        # Single-argument print(...) prints the same text as the Python 2
        # print statement and is also valid Python 3.
        print("Error! Method not in [pair-wise, cluster-wise]!")
        return None

    Ws, Hs = conmf_initialize(datas, rank, seed, weights, norm)

    monitor = gt is not None
    for iter_num in range(max_iter + 1):
        is_first = iter_num == 0
        is_last = iter_num == max_iter

        # The metrics are only ever printed for the first and last iteration,
        # so clustering every view on every iteration would be wasted work.
        report = monitor and (is_first or is_last)
        if report:
            targets = [dl.get_targets(W.T, post) for W in Ws]
            As = [
                "{0:.4f}".format(metrics.accuracy(gt, target))
                for target in targets
            ]
            F1s = [
                "{0:.4f}".format(metrics.f_measure(gt, target))
                for target in targets
            ]

        if report and is_first:
            print("\t\t CoNMF Inits \t Acc = %s;\t F1 = %s " % (str(As),
                                                                str(F1s)))

        Ws, Hs = conmf_update(datas, Ws, Hs, weights, regu_weights, norm,
                              method)

        # As in the original, the "Ends" line reports the metrics measured
        # just before this final update was applied.
        if report and is_last:
            print("\t\t CoNMF Ends \t Acc = %s;\t F1 = %s " % (str(As),
                                                               str(F1s)))
    return Ws, Hs