def test_perform_kmeans(self):
    """ check that k-means run on the consensus matrix built from a known
        clustering gives back that clustering.

        A random label vector is drawn, random subsets of it are accumulated
        into an indicator matrix and a linkage matrix, and the labels that
        kn.perform_kmeans returns for their ratio are compared with the drawn
        labels through sets_a_eq_b.
    """
    n_samples = 11
    n_clusters = 3
    n_repeats = 33
    n_test_perm = 5

    # reference labels: one independent draw from [0, n_clusters) per sample
    cluster_set = np.array(
        [np.random.randint(n_clusters) for _ in range(n_samples)],
        dtype=np.int_)

    indicator_matrix = np.zeros((n_samples, n_samples))
    linkage_matrix = np.zeros((n_samples, n_samples))

    for _ in range(n_repeats):
        # keep only the first n_test_perm entries of a fresh random permutation
        subset = np.random.permutation(n_samples)[:n_test_perm]
        subset_labels = cluster_set[subset]
        indicator_matrix = kn.update_indicator_matrix(subset, indicator_matrix)
        linkage_matrix = kn.update_linkage_matrix(subset_labels, subset,
                                                  linkage_matrix)

    # the floor on the denominator keeps the division defined where the
    # indicator matrix is zero
    consensus_matrix = linkage_matrix / np.maximum(indicator_matrix, 1e-15)

    label_set = kn.perform_kmeans(consensus_matrix, n_clusters)

    self.assertTrue(sets_a_eq_b(cluster_set, label_set),
                    msg='kemans sets differ from cluster')
def run_nmf(run_parameters):
    """ wrapper: call sequence to perform non-negative matrix factorization and write results.

    Args:
        run_parameters: parameter set dictionary. The keys read here are
            'number_of_clusters' and 'spreadsheet_name_full_path'; the whole
            dictionary is also handed to the factorization and save helpers.
    """
    number_of_clusters = run_parameters['number_of_clusters']
    spreadsheet_name_full_path = run_parameters['spreadsheet_name_full_path']

    spreadsheet_df = kn.get_spreadsheet_df(spreadsheet_name_full_path)
    # DataFrame.as_matrix() was removed in pandas 1.0; .values is the
    # replacement and yields the same array.
    spreadsheet_mat = spreadsheet_df.values
    spreadsheet_mat = kn.get_quantile_norm_matrix(spreadsheet_mat)
    number_of_samples = spreadsheet_mat.shape[1]

    h_mat = kn.perform_nmf(spreadsheet_mat, run_parameters)

    # A single factorization over every sample: start from an all-zero
    # samples x samples matrix and update it once with the identity ordering.
    linkage_matrix = np.zeros((number_of_samples, number_of_samples))
    sample_perm = np.arange(0, number_of_samples)
    linkage_matrix = kn.update_linkage_matrix(h_mat, sample_perm,
                                              linkage_matrix)
    labels = kn.perform_kmeans(linkage_matrix, number_of_clusters)

    sample_names = spreadsheet_df.columns
    save_consensus_clustering(linkage_matrix, sample_names, labels,
                              run_parameters)
    save_final_samples_clustering(sample_names, labels, run_parameters)
    save_spreadsheet_and_variance_heatmap(spreadsheet_df, labels,
                                          run_parameters)
Example #3
0
def run_kmeans(run_parameters):
    """ wrapper: call sequence to perform kmeans clustering and save the results.

    Args:
        run_parameters: parameter set dictionary. The keys read here are
            'number_of_clusters' and 'spreadsheet_name_full_path'; the whole
            dictionary is also handed to the save helpers.

    Returns:
        labels: the value returned by kn.perform_kmeans for the transposed
            spreadsheet (one row per spreadsheet column).
    """
    number_of_clusters = run_parameters['number_of_clusters']
    spreadsheet_name_full_path = run_parameters['spreadsheet_name_full_path']

    spreadsheet_df = kn.get_spreadsheet_df(spreadsheet_name_full_path)
    # Transpose so that each row corresponds to one spreadsheet column; the
    # column labels are what is saved as sample names below.
    spreadsheet_mat_T = spreadsheet_df.values.T

    distance_matrix = pairwise_distances(spreadsheet_mat_T)
    labels = kn.perform_kmeans(spreadsheet_mat_T, number_of_clusters)
    sample_names = spreadsheet_df.columns

    save_clustering_scores(distance_matrix, sample_names, labels,
                           run_parameters)
    save_final_samples_clustering(sample_names, labels, run_parameters)
    save_spreadsheet_and_variance_heatmap(spreadsheet_df, labels,
                                          run_parameters)

    return labels
Example #4
0
def run_cc_kmeans_clusters_worker(spreadsheet_mat, run_parameters, sample):
    """ worker: cluster one random sub-sample of the spreadsheet with kmeans and
        save the result to the tmp area.

    Args:
        spreadsheet_mat: genes x samples matrix.
        run_parameters: dictionary of run-time parameters; the keys read here are
            'rows_sampling_fraction', 'cols_sampling_fraction' and
            'number_of_clusters'.
        sample: index of this run; seeds numpy's random generator and is passed
            on when the clustering is saved.

    Returns:
        None
    """
    # imported inside the function, as in the original worker
    # NOTE(review): presumably so the worker is self-contained when shipped to
    # another process - confirm before moving these to module level.
    import knpackage.toolbox as kn
    import numpy as np

    # seeding with the run index makes each run's sub-sample reproducible
    np.random.seed(sample)

    row_fraction = run_parameters["rows_sampling_fraction"]
    col_fraction = run_parameters["cols_sampling_fraction"]
    n_clusters = run_parameters["number_of_clusters"]

    sampled_mat, sample_permutation = kn.sample_a_matrix(
        spreadsheet_mat, row_fraction, col_fraction)

    # kmeans is given the transpose of the sampled matrix
    labels = kn.perform_kmeans(sampled_mat.T, n_clusters)

    kn.save_a_clustering_to_tmp(labels_to_hmat(labels, n_clusters),
                                sample_permutation, run_parameters, sample)
def run_cc_nmf(run_parameters):
    """ wrapper: call sequence to perform non-negative matrix factorization with
        consensus clustering and write results.

    Args:
        run_parameters: parameter set dictionary. The keys read here are
            'processing_method', 'number_of_bootstraps', 'number_of_clusters',
            'spreadsheet_name_full_path', 'tmp_directory' and, for the
            'distribute' method only, 'cluster_ip_address'.

    Raises:
        ValueError: if 'processing_method' is not one of 'serial', 'parallel'
            or 'distribute'.
    """
    tmp_dir = 'tmp_cc_nmf'
    run_parameters = update_tmp_directory(run_parameters, tmp_dir)

    processing_method = run_parameters['processing_method']
    number_of_bootstraps = run_parameters['number_of_bootstraps']
    number_of_clusters = run_parameters['number_of_clusters']
    spreadsheet_name_full_path = run_parameters['spreadsheet_name_full_path']

    spreadsheet_df = kn.get_spreadsheet_df(spreadsheet_name_full_path)
    # DataFrame.as_matrix() was removed in pandas 1.0; .values is the
    # replacement and yields the same array.
    spreadsheet_mat = spreadsheet_df.values
    spreadsheet_mat = kn.get_quantile_norm_matrix(spreadsheet_mat)
    number_of_samples = spreadsheet_mat.shape[1]

    # Run the bootstrap workers in the requested mode.
    if processing_method == 'serial':
        for sample in range(0, number_of_bootstraps):
            run_cc_nmf_clusters_worker(spreadsheet_mat, run_parameters, sample)

    elif processing_method == 'parallel':
        find_and_save_cc_nmf_clusters_parallel(spreadsheet_mat, run_parameters,
                                               number_of_bootstraps)

    elif processing_method == 'distribute':
        func_args = [spreadsheet_mat, run_parameters]
        dependency_list = [
            run_cc_nmf_clusters_worker, save_a_clustering_to_tmp,
            dstutil.determine_parallelism_locally
        ]
        dstutil.execute_distribute_computing_job(
            run_parameters['cluster_ip_address'], number_of_bootstraps,
            func_args, find_and_save_cc_nmf_clusters_parallel, dependency_list)
    else:
        raise ValueError('processing_method contains bad value.')

    consensus_matrix = form_consensus_matrix(run_parameters, number_of_samples)
    labels = kn.perform_kmeans(consensus_matrix, number_of_clusters)

    sample_names = spreadsheet_df.columns
    save_consensus_clustering(consensus_matrix, sample_names, labels,
                              run_parameters)
    save_final_samples_clustering(sample_names, labels, run_parameters)
    save_spreadsheet_and_variance_heatmap(spreadsheet_df, labels,
                                          run_parameters)

    # The tmp directory is only removed after everything above has succeeded.
    kn.remove_dir(run_parameters["tmp_directory"])
def run_net_nmf(run_parameters):
    """ wrapper: run network based stratification once over all samples and
        write the results.

    Args:
        run_parameters: parameter set dictionary; the keys read here are
            'number_of_clusters', 'gg_network_name_full_path' and
            'spreadsheet_name_full_path'.
    """
    # fixed seed for numpy's global random generator
    np.random.seed(0)

    n_clusters = run_parameters['number_of_clusters']
    network_path = run_parameters['gg_network_name_full_path']
    spreadsheet_path = run_parameters['spreadsheet_name_full_path']

    # network: load, normalize, then form its two laplacian components
    network_mat, unique_gene_names = kn.get_sparse_network_matrix(network_path)
    network_mat = kn.normalize_sparse_mat_by_diagonal(network_mat)
    lap_diag, lap_pos = kn.form_network_laplacian_matrix(network_mat)

    # spreadsheet: load and update it against the network's gene names
    spreadsheet_df = kn.update_spreadsheet_df(
        kn.get_spreadsheet_df(spreadsheet_path), unique_gene_names)
    sample_names = spreadsheet_df.columns

    # the second value returned by the smoothing call is not used here
    smoothed_mat, _ = kn.smooth_matrix_with_rwr(spreadsheet_df.values,
                                                network_mat, run_parameters)
    spreadsheet_mat = kn.get_quantile_norm_matrix(smoothed_mat)
    n_samples = spreadsheet_mat.shape[1]

    h_mat = kn.perform_net_nmf(spreadsheet_mat, lap_pos, lap_diag,
                               run_parameters)

    # one update of an all-zero samples x samples matrix, identity ordering
    linkage_matrix = kn.update_linkage_matrix(
        h_mat, np.arange(0, n_samples), np.zeros((n_samples, n_samples)))
    labels = kn.perform_kmeans(linkage_matrix, n_clusters)

    # h_mat.T is passed as [n_samples, n_features]; n_jobs=-1 uses all cores
    distance_matrix = pairwise_distances(h_mat.T, n_jobs=-1)

    save_consensus_clustering(linkage_matrix, sample_names, labels,
                              run_parameters)
    calculate_and_save_silhouette_scores(distance_matrix, sample_names, labels,
                                         run_parameters)
    save_final_samples_clustering(sample_names, labels, run_parameters)
    save_spreadsheet_and_variance_heatmap(spreadsheet_df, labels,
                                          run_parameters)
def form_consensus_matrix_graphic(consensus_matrix, k=3):
    """ reorder a consensus matrix by K-means label for graphic display.

    Args:
        consensus_matrix: calculated consensus matrix in samples x samples order.
        k: number of clusters requested from K-means (default 3).

    Returns:
        cc_cm: copy of consensus_matrix with rows and columns both arranged in
            the argsort order of the K-means labels.
    """
    # snapshot the input before it is handed to the clustering call
    snapshot = consensus_matrix.copy()

    order = np.argsort(kn.perform_kmeans(consensus_matrix, k))

    # np.ix_ builds the open mesh that applies `order` to rows and columns alike
    cc_cm = snapshot[np.ix_(order, order)]
    return cc_cm
def run_nmf(run_parameters):
    """ wrapper: run non-negative matrix factorization once over all samples and
        write the results.

    Args:
        run_parameters: parameter set dictionary; the keys read here are
            'number_of_clusters' and 'spreadsheet_name_full_path'.
    """
    # fixed seed for numpy's global random generator
    np.random.seed(0)

    n_clusters = run_parameters['number_of_clusters']
    spreadsheet_path = run_parameters['spreadsheet_name_full_path']

    spreadsheet_df = kn.get_spreadsheet_df(spreadsheet_path)
    sample_names = spreadsheet_df.columns

    spreadsheet_mat = kn.get_quantile_norm_matrix(spreadsheet_df.values)
    n_samples = spreadsheet_mat.shape[1]

    h_mat = kn.perform_nmf(spreadsheet_mat, run_parameters)

    # one update of an all-zero samples x samples matrix, identity ordering
    linkage_matrix = kn.update_linkage_matrix(
        h_mat, np.arange(0, n_samples), np.zeros((n_samples, n_samples)))
    labels = kn.perform_kmeans(linkage_matrix, n_clusters)

    # h_mat.T is passed as [n_samples, n_features]; n_jobs=-1 uses all cores
    distance_matrix = pairwise_distances(h_mat.T, n_jobs=-1)

    save_consensus_clustering(linkage_matrix, sample_names, labels,
                              run_parameters)
    calculate_and_save_silhouette_scores(distance_matrix, sample_names, labels,
                                         run_parameters)
    save_final_samples_clustering(sample_names, labels, run_parameters)
    save_spreadsheet_and_variance_heatmap(spreadsheet_df, labels,
                                          run_parameters)
def run_cc_net_nmf(run_parameters):
    """ wrapper: run network based stratification with consensus clustering and
        write the results.

    Args:
        run_parameters: parameter set dictionary; the keys read here are
            'processing_method', 'number_of_clusters', 'number_of_bootstraps',
            'gg_network_name_full_path', 'spreadsheet_name_full_path',
            'tmp_directory' and, for the 'distribute' method only,
            'cluster_ip_address'.

    Raises:
        ValueError: if 'processing_method' is not 'serial', 'parallel' or
            'distribute'.
    """
    run_parameters = update_tmp_directory(run_parameters, 'tmp_cc_net_nmf')

    method = run_parameters['processing_method']
    n_clusters = run_parameters['number_of_clusters']
    n_bootstraps = run_parameters['number_of_bootstraps']
    network_path = run_parameters['gg_network_name_full_path']
    spreadsheet_path = run_parameters['spreadsheet_name_full_path']

    # network: load, normalize, then form its two laplacian components
    network_mat, unique_gene_names = kn.get_sparse_network_matrix(network_path)
    network_mat = kn.normalize_sparse_mat_by_diagonal(network_mat)
    lap_diag, lap_pos = kn.form_network_laplacian_matrix(network_mat)

    # spreadsheet: load and update it against the network's gene names
    spreadsheet_df = kn.update_spreadsheet_df(
        kn.get_spreadsheet_df(spreadsheet_path), unique_gene_names)
    spreadsheet_mat = spreadsheet_df.values
    n_samples = spreadsheet_mat.shape[1]
    sample_names = spreadsheet_df.columns

    # leading arguments common to every way of running the bootstraps
    shared_args = [
        network_mat, spreadsheet_mat, lap_diag, lap_pos, run_parameters
    ]

    if method == 'serial':
        for bootstrap_number in range(0, n_bootstraps):
            run_cc_net_nmf_clusters_worker(*shared_args, bootstrap_number)

    elif method == 'parallel':
        find_and_save_cc_net_nmf_clusters_parallel(*shared_args, n_bootstraps)

    elif method == 'distribute':
        dependency_list = [
            run_cc_net_nmf_clusters_worker, save_a_clustering_to_tmp,
            dstutil.determine_parallelism_locally
        ]
        dstutil.execute_distribute_computing_job(
            run_parameters['cluster_ip_address'], n_bootstraps, shared_args,
            find_and_save_cc_net_nmf_clusters_parallel, dependency_list)

    else:
        raise ValueError('processing_method contains bad value.')

    consensus_matrix = form_consensus_matrix(run_parameters, n_samples)
    # consensus matrix passed as [n_samples, n_samples]; n_jobs=-1 uses all cores
    distance_matrix = pairwise_distances(consensus_matrix, n_jobs=-1)
    labels = kn.perform_kmeans(consensus_matrix, n_clusters)

    save_consensus_clustering(consensus_matrix, sample_names, labels,
                              run_parameters)
    calculate_and_save_silhouette_scores(distance_matrix, sample_names, labels,
                                         run_parameters)
    save_final_samples_clustering(sample_names, labels, run_parameters)
    save_spreadsheet_and_variance_heatmap(spreadsheet_df, labels,
                                          run_parameters, network_mat)

    # the tmp directory is removed only after all results have been saved
    kn.remove_dir(run_parameters["tmp_directory"])