Exemple #1
0
    # 1) First, traditional NMF
    # D_nmf_pca = X_pca_all[-k_cluster:, :]
    if np.min(X_pca) < 0:
        X_for_nmf = X_pca - np.min(X_pca)
    else:
        X_for_nmf = X_pca
    X_for_nmf = normalize(X_for_nmf) * _NF
    # X_for_nmf = my_normalize(X_for_nmf) * _NF
    # D_nmf, _, label_nmf = nmf_clustering(X_for_nmf, k_cluster, numIter = 1000)
    # D_nmf_pca = pca.transform(D_nmf)
    # ipdb.set_trace()
    t1 = time.time()
    D_nmf, _, label_nmf = my_nmf_clustering(X_for_nmf, k_cluster, numIter = numIter)
    t2 = time.time()
    t_nmf = t2 - t1
    acc_nmf, AMI_nmf = evaluation_clustering(label_nmf, Y)

    print(' ------ nmf final accuracy = {:.4f}, AMI = {:.4f}'.format(acc_nmf, AMI_nmf))

    # D_nmf_pca = my_normalize(D_nmf) * _NF
    D_nmf_pca = normalize(D_nmf) * _NF
    df_centroids_nmf = pd.DataFrame(D_nmf_pca, columns = pca_cols)
    df_centroids_nmf['label'] = ['NMF cell type {}'.format(x) for x in range(1, k_cluster + 1)]

    # test
    df_centroids_nmf.to_pickle('results_logging/nmf_centroid_df')

    X = normalize(X_pca) * _NF
    # X = my_normalize(X_pca) * _NF
    # X = X_pca
Exemple #2
0
def cvx_online_dict_learning(X, y_true, n_hat, k_cluster, T, lmda, eps,
        flag=True, version='Rr'):
    '''
    Online convex dictionary learning over the rows of X.

    Notation
        X       : R^(n * m), one sample per row
        x_i     : R^m, a single sampled row of X
        alpha   : R^k, sparse code of x_i
        X_hat   : R^(m * n_hat), stored candidate (representative) set
        W_hat   : R^(n_hat * k)

    The batch cvx problem is
        min ||x_i - X.T * W * alpha|| + lambda * ||alpha||

    In the online setting there is no full X in (n * m); instead a
    candidate set is stored and the following subproblem is solved:
        min ||x_i - X_hat * W_hat * alpha|| + lambda * ||alpha||

    Parameters
        X         : 2-D array; rows are sampled uniformly with replacement.
        y_true    : reference labels, only passed to evaluation_clustering
                    for the progress report (used when flag is true).
        n_hat     : number of columns of the initial candidate matrix.
        k_cluster : number of clusters / dictionary columns.
        T         : number of online iterations.
        lmda      : L1 penalty; used as the LassoLars `alpha` and as the
                    weight of the accumulated L1 term.
        eps       : forwarded unchanged to update_W_X_hat.
        flag      : when true, print clustering accuracy/AMI and the time
                    since the previous report every 50 iterations.
        version   : 'Rr' assigns the sample to the cluster with the largest
                    coefficient (restricted, heuristic approach); any other
                    value ('Ru') assigns it to a uniformly random cluster.

    Returns
        (W_hat, X_hat, representative_size_count, X_0, W_0), where X_0 and
        W_0 are copies of X_hat and W_hat taken right after initialization.
    '''
    n_dim, m_dim = X.shape

    # Running statistics handed to update_W_X_hat on every iteration:
    #   A_t = sum alpha alpha^T, B_t = sum x alpha^T,
    #   x_sum = sum ||x||_2^2,   alpha_sum = sum lmda * ||alpha||_1.
    A_t = np.zeros((k_cluster, k_cluster))
    B_t = np.zeros((m_dim, k_cluster))
    x_sum = 0
    alpha_sum = 0

    # step 1a: draw n_hat rows of X (with replacement) as columns of X_0.
    X_0 = np.zeros((m_dim, n_hat))
    for idx in range(n_hat):
        sample_idx = np.random.randint(0, n_dim)
        X_0[:, idx] = X[sample_idx, :]

    # step 1b: initialization, get X_hat (including clusters info) and W_hat
    # from X_0. representative_size_count is n_1_hat, n_2_hat, ..., n_k_hat.
    # NOTE(review): presumably the same init as in CNMF -- confirm against
    # initialize_X_W_hat.
    X_hat, W_hat, representative_size_count = initialize_X_W_hat(X_0, k_cluster)
    X_0, W_0 = X_hat.copy(), W_hat.copy()

    # The solver configuration never changes, so build the estimator once
    # and refit it on every iteration.
    lars_lasso = LassoLars(alpha=lmda, max_iter=500)

    # step 2: update alpha, W_hat and X_hat alternately.
    # t_begin times the whole loop; t_checkpoint times the span between two
    # progress reports. Keeping them separate makes the final message report
    # the total time even when flag is true.
    t_begin = time.time()
    t_checkpoint = t_begin
    print(lmda, _NF, eps)
    for t in range(T):
        # Current dictionary; shared by the progress report and the lasso fit.
        D_t = np.matmul(X_hat, W_hat)

        if flag and t % 50 == 0:
            tmp_assignment = get_clustering_assignment_1(X, D_t, k_cluster)
            tmp_acc, tmp_AMI = evaluation_clustering(tmp_assignment, y_true)
            print('1)iteration {}, distance acc = {:.4f}, AMI = {:.4f}'.format(t, tmp_acc, tmp_AMI))

            tmp_assignment = get_clustering_assignment_2(X, D_t, k_cluster, lmda)
            tmp_acc, tmp_AMI = evaluation_clustering(tmp_assignment, y_true)
            print('2)iteration {}, kmeans of weights acc = {:.4f}, AMI = {:.4f}'.format(t, tmp_acc, tmp_AMI))
            t_now = time.time()
            print('time elapse = {:.4f}s'.format(t_now - t_checkpoint))
            t_checkpoint = t_now

            print('-' * 7)

        sample_idx = np.random.randint(0, n_dim)
        x_sample = X[sample_idx, :]

        # update alpha: sparse code of the sample w.r.t. the current dictionary
        lars_lasso.fit(D_t, x_sample)
        alpha_t = lars_lasso.coef_

        # choose the cluster whose representatives the sample may replace
        if version == 'Rr':
            cluster_of_x_i = np.argmax(alpha_t)
        else:
            # 'Ru': uniform random assignment
            cluster_of_x_i = int(np.random.uniform(0, k_cluster))

        # accumulate the running statistics
        A_t += np.outer(alpha_t, alpha_t)
        B_t += np.outer(x_sample, alpha_t)
        x_sum += (np.linalg.norm(x_sample) ** 2)
        alpha_sum += lmda * np.linalg.norm(alpha_t, 1)

        # update W_hat and X_hat
        W_hat, X_hat = update_W_X_hat(W_hat, X_hat, representative_size_count, x_sample, cluster_of_x_i,
                A_t, B_t, x_sum, alpha_sum, t, eps)

    print('Dcitionary update done! Time elapse {:.04f}s'.format(time.time() - t_begin))

    return W_hat, X_hat, representative_size_count, X_0, W_0
            Y,
            n_hat,
            k_cluster,
            numIter,
            lmda,
            eps,
            flag=False,
            version='Ru')
        t2 = time.time()
        t_ocmf += (t2 - t1)
        D_final_tmp = np.matmul(X_hat_tmp, W_hat_tmp)

        # clustered_label = get_clustering_assignment_1(X, D_final)
        clustered_label_ocmf = get_clustering_assignment_2(
            X, D_final_tmp, k_cluster, lmda)
        acc_tmp, AMI_tmp = evaluation_clustering(clustered_label_ocmf, Y)
        acc_array.append(acc_tmp)
        if acc_tmp >= acc:
            W_hat = W_hat_tmp
            X_hat = X_hat_tmp
            X_0 = X_0_tmp
            W_0 = W_0_tmp
            D_final = D_final_tmp
            acc = acc_tmp
            AMI = AMI_tmp
            repre_size_count = repre_size_count_tmp
        if acc >= 0.9:
            break
    acc_aver = np.mean(acc_array)
    t_ocmf_Ru = t_ocmf / (round_num + 1)
    print(' ------ Ru ocmf final accuracy = {:.4f}, AMI = {:.4f}'.format(