def kmeans(data, k, norm="l2", n_init=1):
    """Cluster the rows of `data` with k-means.

    :param data: matrix of shape #item * #feature.
    :param k: number of clusters.
    :param norm: normalization scheme handed to `dl.norm_data`. When it is
        None the data is clustered as-is and the raw centroids are returned.
    :param n_init: number of k-means restarts (forwarded to `KMeans`).

    Return:
        labels: cluster assignment of each item (`KMeans.labels_`).
        centers: when `norm` is None, `KMeans.cluster_centers_` unchanged;
            otherwise the centroids shifted by 0.1 and re-normalized with
            `dl.norm_data(..., norm)`.
    """
    # `n_jobs` is intentionally not passed: it was deprecated in
    # scikit-learn 0.23 and removed in 1.0, and a single job was already the
    # default behaviour on the older releases.
    km_model = KMeans(n_clusters=k, init='random', max_iter=500,
                      n_init=n_init, verbose=False)

    if norm is None:
        km_model.fit(data)
        return km_model.labels_, km_model.cluster_centers_

    km_model.fit(dl.norm_data(data, norm))

    # cluster_centers_ is a k*N numpy.ndarray. The csr_matrix -> todense()
    # round trip is kept so that H has the same dense matrix type as before
    # when it is handed to dl.norm_data.
    H = csr_matrix(km_model.cluster_centers_).todense()
    # Add a small number to each element of the centroid matrix.
    H = H + 0.1
    H_norm = dl.norm_data(H, norm)
    return km_model.labels_, H_norm
def conmf(datas, cluster_size, weights, regu_weights, norm="l2",
          seed="k-means", post="direct", method="pair-wise", gt=None):
    """
    CoNMF on multi-view dataset (represented as datas).

    :param datas (type: list<csr_matrix>). [data_k]
        Each element data_k is a sparse matrix: `scipy.sparse` of format csr.
    :param cluster_size (type: int). Number of clusters.
    :param weights (type: list<int>). [weight_s]
        Each element weight_s is an integer, denoting the weight of the view
        in CoNMF factorization (i.e. lambda_s in paper [1]).
        Must have the same length as `datas`.
    :param regu_weights (type: list, 2-dimension). [[weight_st]]
        Each element weight_st is an integer, denoting the weight of view
        pair <s,t> in CoNMF regularization (i.e. lambda_st in paper [1]).
    :param norm (type: string). Normalization scheme in CoNMF initialization
        and factorization. Values can be 'l2', 'l1' and 'l0':
        'l2': each item vector is normalized by its Euclidean length
              (i.e. l2 distance).
        'l1': each item vector is normalized by its sum of all elements
              (i.e. l1 distance).
        'l0': the whole matrix is normalized by the sum of all elements
              (i.e. l1 normalization on the whole matrix).
    :param seed (type: string). Initialization method in CoNMF.
        Values can be 'k-means' and 'random':
        'k-means': initialize W and H matrix using k-means results.
                   The details are seen in paper [1] Section 4.5.
        'random': randomly initialize W and H matrix.
    :param post (type: string). Post processing on W matrix (m*k) to
        generate clustering result. Values can be 'direct' and 'k-means':
        'direct': for each item vector (m*1), use the element with largest
                  value as its cluster assignment.
        'k-means': perform k-means on W matrix to get cluster assignment.
    :param method (type: string). Regularization method of CoNMF.
        Currently support two ways:
        'pair-wise': pair-wise NMF, details in paper [1] Section 4.3.
        'cluster-wise': cluster-wise NMF, details in paper [1] Section 4.4.
        Note: experiments in paper [1] indicate 'pair-wise' performs
        slightly better than 'cluster-wise', and 'pair-wise' is also more
        efficient.
    :param gt (type: list). Groundtruth of clustering. Each element gt_i
        represents the cluster assignment of item_i.
        Note: this variable is only used for monitoring the performance in
        each iteration of CoNMF. If the value is None, remember to comment
        out the code that uses the variable, otherwise the program may crash.

    Return:
        None if len(datas) != len(weights) (an error message is printed).
        Otherwise a 3-tuple:
        targets (type: list<int>): Cluster assignment of items. Each element
            represents the item's cluster id.
        Ws (type: list<csr_matrix>): W matrix of each view after CoNMF.
        Hs (type: list<csr_matrix>): H matrix of each view after CoNMF.
    """
    if len(datas) != len(weights):
        # Single-argument call form prints identically on Python 2 and 3.
        print("Error! Length of datas != length of weights.")
        return None

    # Normalize the data of each view.
    datas_norm = [dl.norm_data(data, norm) for data in datas]

    # NOTE(review): the literal 100 is forwarded positionally; presumably an
    # iteration limit -- confirm against conmf_factorize's signature.
    Ws, Hs = conmf_factorize(method, datas_norm, weights, regu_weights,
                             seed, post, norm, 100, cluster_size, gt)

    # By default, use the clustering result in last view as output for eval.
    targets = dl.get_targets(Ws[-1].T, post)
    return targets, Ws, Hs
def svd(data, k, norm, post="k-means", latent_K=-1):
    """Cluster items from the left singular vectors of a truncated SVD.

    :param data: input matrix, normalized with `dl.norm_data(data, norm)`
        before decomposition.
    :param k: number of clusters; also the number of singular triplets
        computed when `latent_K` is -1.
    :param norm: normalization scheme handed to `dl.norm_data`.
    :param post: post-processing mode handed to `dl.get_targets`.
    :param latent_K: number of singular triplets to compute; -1 means
        "use k".

    Return:
        targets: result of `dl.get_targets` on the transposed U matrix.
    """
    normalized = dl.norm_data(data, norm)
    rank = k if latent_K == -1 else latent_K
    # Only the left singular vectors are needed; the singular values and
    # right vectors are discarded.
    left_vectors, _, _ = svds(normalized, rank, which='LM')
    left_sparse = sp.csr_matrix(left_vectors)
    return dl.get_targets(left_sparse.T, post, k=k)
def nmf(data, k, norm="l2", seed="random", post="direct", gt=None):
    """
    NMF with Euclidean distance as the cost function.

    For comments on input parameters, please refer to conmf.conmf().

    :param data: input matrix; normalized with `dl.norm_data(data, norm)`
        before factorization.
    :param k: number of clusters, forwarded to `factorize`.

    Return:
        targets: clustering result from `dl.get_targets(W.T, post)`.
        W: factor matrix, m*k.
        H: factor matrix, k*n.
    """
    # Normalize with the caller's scheme. This used to be hard-coded to
    # "l2", which silently ignored `norm` here while still passing it on
    # to factorize().
    data_norm = dl.norm_data(data, norm)
    # W is m*k, H is k*n.
    W, H = factorize(data_norm, seed, post, norm, gt, k)
    targets = dl.get_targets(W.T, post)  # clustering results.
    return targets, W, H