# 1) First, traditional NMF as the baseline.
# Shift the PCA features up by their global minimum when any entry is
# negative, so the matrix handed to NMF has no negative values.
X_for_nmf = X_pca - np.min(X_pca) if np.min(X_pca) < 0 else X_pca
X_for_nmf = normalize(X_for_nmf) * _NF

# Time the NMF clustering run on its own.
t1 = time.time()
D_nmf, _, label_nmf = my_nmf_clustering(
    X_for_nmf, k_cluster, numIter=numIter)
t2 = time.time()
t_nmf = t2 - t1

# Score the NMF labels against the reference labels Y.
acc_nmf, AMI_nmf = evaluation_clustering(label_nmf, Y)
print(' ------ nmf final accuracy = {:.4f}, AMI = {:.4f}'.format(acc_nmf,
                                                                  AMI_nmf))

# Rescale the learned dictionary the same way as the data and wrap it in a
# DataFrame with one human-readable label per centroid row.
D_nmf_pca = normalize(D_nmf) * _NF
df_centroids_nmf = pd.DataFrame(D_nmf_pca, columns=pca_cols)
df_centroids_nmf['label'] = [
    'NMF cell type {}'.format(cell_type)
    for cell_type in range(1, k_cluster + 1)
]

# Marked "test" in the original: persist the NMF centroids to disk.
df_centroids_nmf.to_pickle('results_logging/nmf_centroid_df')

# Data matrix used by the steps that follow this section.
X = normalize(X_pca) * _NF
def cvx_online_dict_learning(X, y_true, n_hat, k_cluster, T, lmda, eps,
                             flag=True, version='Rr'):
    '''
    Online convex dictionary learning over a stream of rows sampled from X.

    Each iteration solves the sparse-coding subproblem

        min ||x_i - X_hat * W_hat * alpha|| + lambda * ||alpha||

    for one randomly drawn sample x_i, where the dictionary is the product
    D_t = X_hat * W_hat of a stored candidate set X_hat and its weights
    W_hat (instead of the full-data problem
    min ||x_i - X.T * W * alpha|| + lambda * ||alpha||).

    Shapes (n samples, m features, k = k_cluster):
        X     : R^(n * m), one sample per row
        x_i   : R^m
        alpha : R^k
        X_hat : R^(m * n_hat)
        W_hat : R^(n_hat * k)

    Parameters
    ----------
    X : 2-D array, shape (n, m)
        Data matrix; rows are drawn uniformly at random with replacement.
    y_true : sequence of length n
        Reference labels. Only passed to evaluation_clustering for the
        progress reports, so it is unused when flag is False.
    n_hat : int
        Number of rows of X sampled to build the initial candidate matrix.
    k_cluster : int
        Number of clusters / dictionary atoms.
    T : int
        Number of online iterations.
    lmda : float
        L1 penalty: the LassoLars alpha, and the weight of ||alpha||_1 in
        the accumulated objective term.
    eps : float
        Forwarded unchanged to update_W_X_hat.
        NOTE(review): its meaning is defined there - confirm.
    flag : bool
        When True, print clustering accuracy / AMI every 50 iterations.
    version : str
        'Rr' (restricted, heuristic): assign the sample to the cluster with
        the largest coefficient in alpha.
        Any other value (intended: 'Ru', uniform): assign the sample to a
        cluster drawn uniformly at random.

    Returns
    -------
    (W_hat, X_hat, representative_size_count, X_0, W_0)
        Final weights and candidate set, the per-cluster representative
        counts from initialize_X_W_hat (n_1_hat, ..., n_k_hat; passed to
        every update_W_X_hat call), and copies of the *initial* X_hat and
        W_hat.
    '''
    n_dim, m_dim = X.shape

    # Running statistics handed to update_W_X_hat on every iteration:
    # sum of alpha*alpha^T, sum of x*alpha^T, sum of ||x||^2 and
    # sum of lmda*||alpha||_1.
    A_t = np.zeros((k_cluster, k_cluster))
    B_t = np.zeros((m_dim, k_cluster))
    x_sum = 0
    alpha_sum = 0

    # Step 1: sample n_hat rows of X (with replacement) as the columns of
    # the initial candidate matrix.
    X_0 = np.zeros((m_dim, n_hat))
    for idx in range(n_hat):
        sample_idx = np.random.randint(0, n_dim)
        X_0[:, idx] = X[sample_idx, :]

    # Initialise X_hat (including cluster info) and W_hat from the sampled
    # points, then snapshot both; from here on X_0 / W_0 hold the initial
    # X_hat / W_hat rather than the raw sample.
    X_hat, W_hat, representative_size_count = initialize_X_W_hat(
        X_0, k_cluster)
    X_0, W_0 = X_hat.copy(), W_hat.copy()

    # The solver's hyper-parameters never change, so build it once; every
    # fit() call below recomputes coef_ from scratch.
    lars_lasso = LassoLars(alpha=lmda, max_iter=500)

    # Step 2: alternately update alpha, W_hat and X_hat.
    # t_begin marks the start of the whole run; t_start is reset at every
    # progress report so it measures only the interval between reports.
    t_begin = time.time()
    t_start = t_begin
    print(lmda, _NF, eps)
    for t in range(T):
        if t % 50 == 0 and flag:
            D_t = np.matmul(X_hat, W_hat)

            tmp_assignment = get_clustering_assignment_1(X, D_t, k_cluster)
            tmp_acc, tmp_AMI = evaluation_clustering(tmp_assignment, y_true)
            print('1)iteration {}, distance acc = {:.4f}, AMI = {:.4f}'.format(
                t, tmp_acc, tmp_AMI))

            tmp_assignment = get_clustering_assignment_2(
                X, D_t, k_cluster, lmda)
            tmp_acc, tmp_AMI = evaluation_clustering(tmp_assignment, y_true)
            print('2)iteration {}, kmeans of weights acc = {:.4f}, AMI = {:.4f}'.format(
                t, tmp_acc, tmp_AMI))

            t_end = time.time()
            print('time elapse = {:.4f}s'.format(t_end - t_start))
            t_start = t_end
            print('-' * 7)

        # Draw the next sample of the stream.
        sample_idx = np.random.randint(0, n_dim)
        x_sample = X[sample_idx, :]

        # Update alpha: sparse-code the sample against the current
        # dictionary D_t = X_hat * W_hat.
        D_t = np.matmul(X_hat, W_hat)
        lars_lasso.fit(D_t, x_sample)
        alpha_t = lars_lasso.coef_

        # Choose the cluster whose representatives the sample may replace.
        if version == 'Rr':
            cluster_of_x_i = np.argmax(alpha_t)
        else:
            cluster_of_x_i = int(np.random.uniform(0, k_cluster))

        # Accumulate the running statistics.
        A_t += np.outer(alpha_t, alpha_t)
        B_t += np.outer(x_sample, alpha_t)
        x_sum += np.linalg.norm(x_sample) ** 2
        alpha_sum += lmda * np.linalg.norm(alpha_t, 1)

        # Update W_hat and X_hat.
        W_hat, X_hat = update_W_X_hat(
            W_hat, X_hat, representative_size_count, x_sample,
            cluster_of_x_i, A_t, B_t, x_sum, alpha_sum, t, eps)

    # Report the total run time; measuring from t_start would only cover the
    # time since the last progress report when flag is True.
    print('Dcitionary update done! Time elapse {:.04f}s'.format(
        time.time() - t_begin))

    return W_hat, X_hat, representative_size_count, X_0, W_0
Y, n_hat, k_cluster, numIter, lmda, eps, flag=False, version='Ru') t2 = time.time() t_ocmf += (t2 - t1) D_final_tmp = np.matmul(X_hat_tmp, W_hat_tmp) # clustered_label = get_clustering_assignment_1(X, D_final) clustered_label_ocmf = get_clustering_assignment_2( X, D_final_tmp, k_cluster, lmda) acc_tmp, AMI_tmp = evaluation_clustering(clustered_label_ocmf, Y) acc_array.append(acc_tmp) if acc_tmp >= acc: W_hat = W_hat_tmp X_hat = X_hat_tmp X_0 = X_0_tmp W_0 = W_0_tmp D_final = D_final_tmp acc = acc_tmp AMI = AMI_tmp repre_size_count = repre_size_count_tmp if acc >= 0.9: break acc_aver = np.mean(acc_array) t_ocmf_Ru = t_ocmf / (round_num + 1) print(' ------ Ru ocmf final accuracy = {:.4f}, AMI = {:.4f}'.format(