Example #1
def run_bootstrap_net_correlation(run_parameters):
    """ perform gene prioritization using bootstrap sampling and network smoothing

    Args:
        run_parameters: parameter set dictionary.
    """
    run_parameters["results_tmp_directory"] = kn.create_dir(run_parameters["results_directory"], 'tmp')
    gg_network_name_full_path = run_parameters['gg_network_name_full_path']
    network_mat, unique_gene_names = kn.get_sparse_network_matrix(gg_network_name_full_path)

    network_mat = normalize(network_mat, norm="l1", axis=0)

    phenotype_df = kn.get_spreadsheet_df(run_parameters["phenotype_name_full_path"])
    spreadsheet_df = kn.get_spreadsheet_df(run_parameters["spreadsheet_name_full_path"])
    spreadsheet_genes_as_input = spreadsheet_df.index.values
    phenotype_df = phenotype_df.T

    spreadsheet_df = kn.update_spreadsheet_df(spreadsheet_df, unique_gene_names)
    spreadsheet_df = zscore_dataframe(spreadsheet_df)
    sample_smooth, iterations = kn.smooth_matrix_with_rwr(spreadsheet_df.values, network_mat.T, run_parameters)
    spreadsheet_df = pd.DataFrame(sample_smooth, index=spreadsheet_df.index, columns=spreadsheet_df.columns)

    baseline_array = np.ones(network_mat.shape[0]) / network_mat.shape[0]
    baseline_array = kn.smooth_matrix_with_rwr(baseline_array, network_mat, run_parameters)[0]

    number_of_jobs = len(phenotype_df.index)
    jobs_id = range(0, number_of_jobs)
    zipped_arguments = dstutil.zip_parameters(run_parameters, spreadsheet_df, phenotype_df, network_mat,
                                              spreadsheet_genes_as_input, baseline_array, jobs_id)
    dstutil.parallelize_processes_locally(run_bootstrap_net_correlation_worker, zipped_arguments, number_of_jobs)

    write_phenotype_data_all(run_parameters)
    kn.remove_dir(run_parameters["results_tmp_directory"])
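Example #1 z-scores each gene row (zscore_dataframe) before the random-walk smoothing. A minimal sketch of that per-row standardization with plain pandas/numpy, assuming a genes-by-samples spreadsheet; the toolbox helper itself may guard edge cases differently.

import numpy as np
import pandas as pd

def zscore_rows(df, epsilon=1e-15):
    """Z-score each row (gene) of a genes-by-samples dataframe.

    epsilon avoids division by zero for constant rows.
    """
    row_mean = df.mean(axis=1)
    row_std = df.std(axis=1, ddof=0)
    return df.sub(row_mean, axis=0).div(row_std + epsilon, axis=0)

# toy usage
toy = pd.DataFrame(np.arange(12, dtype=float).reshape(3, 4),
                   index=['gene_a', 'gene_b', 'gene_c'],
                   columns=['s1', 's2', 's3', 's4'])
print(zscore_rows(toy).round(3))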
Example #2
def run_correlation(run_parameters):
    """ perform feature prioritization

    Args:
        run_parameters: parameter set dictionary.
    """
    max_cpu = run_parameters["max_cpu"]
    run_parameters["results_tmp_directory"] = kn.create_dir(
        run_parameters["results_directory"], 'tmp')

    phenotype_df = kn.get_spreadsheet_df(
        run_parameters["phenotype_name_full_path"])
    spreadsheet_df = kn.get_spreadsheet_df(
        run_parameters["spreadsheet_name_full_path"])
    phenotype_df = phenotype_df.T

    len_phenotype = len(phenotype_df.index)
    array_of_jobs = range(0, len_phenotype)

    for i in range(0, len_phenotype, max_cpu):
        jobs_id = array_of_jobs[i:i + max_cpu]
        number_of_jobs = len(jobs_id)

        zipped_arguments = dstutil.zip_parameters(run_parameters,
                                                  spreadsheet_df, phenotype_df,
                                                  jobs_id)
        dstutil.parallelize_processes_locally(run_correlation_worker,
                                              zipped_arguments, number_of_jobs)
    write_phenotype_data_all(run_parameters)
    kn.remove_dir(run_parameters["results_tmp_directory"])
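Every run function above is driven by a single run_parameters dictionary. A hypothetical minimal parameter set for run_correlation, with placeholder paths and values that are not taken from the source:

run_parameters = {
    # input/output locations (placeholder paths, not from the source)
    "spreadsheet_name_full_path": "data/spreadsheet.tsv",
    "phenotype_name_full_path": "data/phenotype.tsv",
    "results_directory": "results",
    # phenotypes are dispatched to local workers in chunks of max_cpu
    "max_cpu": 4,
}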
Example #3
def run_correlation(run_parameters):
    """ perform gene prioritization

    Args:
        run_parameters: parameter set dictionary.
    """

    run_parameters["results_tmp_directory"] = kn.create_dir(run_parameters["results_directory"], 'tmp')

    results_tmp_directory      = run_parameters["results_tmp_directory"     ]
    phenotype_name_full_path   = run_parameters["phenotype_name_full_path"  ]
    spreadsheet_name_full_path = run_parameters["spreadsheet_name_full_path"]

    spreadsheet_df             = kn.get_spreadsheet_df(spreadsheet_name_full_path)
    phenotype_df               = kn.get_spreadsheet_df(phenotype_name_full_path  )
    phenotype_df               = phenotype_df.T

    number_of_jobs             = len(phenotype_df.index)
    jobs_id                    = range(0, number_of_jobs)
    zipped_arguments           = dstutil.zip_parameters( run_parameters
                                                       , spreadsheet_df
                                                       , phenotype_df
                                                       , jobs_id
                                                       )

    dstutil.parallelize_processes_locally( run_correlation_worker
                                         , zipped_arguments
                                         , number_of_jobs
                                         )

    write_phenotype_data_all(run_parameters       )
    kn.remove_dir           (results_tmp_directory)
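dstutil.zip_parameters pairs the shared arguments with each job id, and parallelize_processes_locally fans a worker out over those tuples. A rough standard-library illustration of the same fan-out pattern; this is not dstutil's actual API, just a sketch of the idea.

from multiprocessing import Pool

def correlation_worker(args):
    """Hypothetical worker: shared arguments plus the id of one phenotype column."""
    run_parameters, spreadsheet_df, phenotype_df, job_id = args
    # ... correlate every gene with phenotype column `job_id` and write a tmp file ...
    return job_id

def parallelize_locally(worker, shared_args, jobs_id, processes=4):
    """Zip shared arguments with each job id and map the worker over a process pool."""
    zipped_arguments = [tuple(shared_args) + (job,) for job in jobs_id]
    with Pool(processes=processes) as pool:
        return pool.map(worker, zipped_arguments)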
Example #4
def run_cc_net_similarity(run_parameters):
    """ wrapper: call sequence to perform signature analysis with
        random walk smoothing and bootstrapped similarity and save results.

    Args:
        run_parameters: parameter set dictionary.
    """
    tmp_dir = 'tmp_cc_similarity_'
    run_parameters = update_tmp_directory(run_parameters, tmp_dir)

    expression_name      = run_parameters["spreadsheet_name_full_path"]
    signature_name       = run_parameters["signature_name_full_path"  ]
    gg_network_name      = run_parameters['gg_network_name_full_path' ]
    similarity_measure   = run_parameters["similarity_measure"        ]
    number_of_bootstraps = run_parameters['number_of_bootstraps'      ]
    processing_method    = run_parameters['processing_method'         ]

    expression_df        = kn.get_spreadsheet_df(expression_name)
    signature_df         = kn.get_spreadsheet_df(signature_name )

    samples_names        = expression_df.columns
    signatures_names     =  signature_df.columns
    signatures_names     = [i.split('.')[0] for i in signatures_names]
    signature_df.columns = signatures_names

    network_mat, unique_gene_names = kn.get_sparse_network_matrix(gg_network_name)
    # network_mat                    = kn.normalize_sparse_mat_by_diagonal(network_mat)
    
    expression_df                  = kn.update_spreadsheet_df(expression_df, unique_gene_names)
    signature_df                   = kn.update_spreadsheet_df(signature_df, unique_gene_names)

    expression_mat                 = expression_df.values
    signature_mat                  = signature_df.values

    expression_mat, iterations = kn.smooth_matrix_with_rwr(expression_mat, network_mat, run_parameters)
    signature_mat,  iterations = kn.smooth_matrix_with_rwr(signature_mat,  network_mat, run_parameters)

    expression_df.iloc[:] = expression_mat
    signature_df.iloc[:]  = signature_mat

    if   processing_method == 'serial':
         for sample in range(0, number_of_bootstraps):
            run_cc_similarity_signature_worker(expression_df, signature_df, run_parameters, sample)

    elif processing_method == 'parallel':
         find_and_save_cc_similarity_parallel(expression_df, signature_df, run_parameters, number_of_bootstraps)

    else:
        raise ValueError('processing_method contains bad value.')

    # consensus_df = form_consensus_df(run_parameters, expression_df, signature_df)
    similarity_df = assemble_similarity_df(expression_df, signature_df, run_parameters)
    similarity_df  = pd.DataFrame(similarity_df.values, index=samples_names, columns=signatures_names)
    save_final_samples_signature(similarity_df, run_parameters)
    save_best_match_signature(similarity_df, run_parameters)

    kn.remove_dir(run_parameters["tmp_directory"])
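assemble_similarity_df reduces the two smoothed spreadsheets to a samples-by-signatures similarity matrix. A minimal sketch of what a cosine similarity_measure could look like, assuming genes are the rows of both dataframes; the pipeline's own implementation may differ (e.g. Spearman-based measures).

import pandas as pd
from sklearn.metrics.pairwise import cosine_similarity

def cosine_similarity_df(expression_df, signature_df):
    """Cosine similarity of every sample column against every signature column."""
    common_genes = expression_df.index.intersection(signature_df.index)
    sim = cosine_similarity(expression_df.loc[common_genes].T.values,
                            signature_df.loc[common_genes].T.values)
    return pd.DataFrame(sim, index=expression_df.columns, columns=signature_df.columns)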
Example #5
def run_bootstrap_correlation(run_parameters):
    """ perform gene prioritization using bootstrap sampling

    Args:
        run_parameters: parameter set dictionary.
    """

    max_cpu = run_parameters["max_cpu"]
    run_parameters["results_tmp_directory"] = kn.create_dir(
        run_parameters["results_directory"], 'tmp')

    results_tmp_directory = run_parameters["results_tmp_directory"]
    n_bootstraps = run_parameters["number_of_bootstraps"]
    phenotype_df = kn.get_spreadsheet_df(
        run_parameters["phenotype_name_full_path"])
    spreadsheet_df = kn.get_spreadsheet_df(
        run_parameters["spreadsheet_name_full_path"])

    phenotype_df = phenotype_df.T

    #-----------------------------------------------------------------------------------------
    #   Partition the phenotype dataframe (partition size = MaxCPU)
    #-----------------------------------------------------------------------------------------
    len_phenotype = len(phenotype_df.index)
    array_of_jobs = range(0, len_phenotype)

    if len_phenotype <= max_cpu:
        jobs_id = array_of_jobs
        number_of_jobs = len(jobs_id)
        #-----------------------------------------------------------------------------------------
        zipped_arguments = dstutil.zip_parameters(run_parameters,
                                                  spreadsheet_df, phenotype_df,
                                                  n_bootstraps, jobs_id)

        dstutil.parallelize_processes_locally(run_bootstrap_correlation_worker,
                                              zipped_arguments, number_of_jobs)
        #-----------------------------------------------------------------------------------------

    else:
        for i in range(0, len_phenotype, max_cpu):
            jobs_id = array_of_jobs[i:i + max_cpu]
            number_of_jobs = len(jobs_id)
            #-----------------------------------------------------------------------------------------
            zipped_arguments = dstutil.zip_parameters(run_parameters,
                                                      spreadsheet_df, phenotype_df,
                                                      n_bootstraps, jobs_id)

            dstutil.parallelize_processes_locally(run_bootstrap_correlation_worker,
                                                  zipped_arguments, number_of_jobs)

    write_phenotype_data_all(run_parameters)
    #-----------------------------------------------------------------------------------------

    kn.remove_dir(results_tmp_directory)
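The else branch above dispatches the phenotype columns in max_cpu-sized batches, zipping and parallelizing each batch inside the loop. The same partitioning, pulled out as a small helper for clarity (a sketch, not part of the toolbox):

def iter_job_chunks(number_of_phenotypes, max_cpu):
    """Yield ranges of job ids, each at most max_cpu long."""
    all_jobs = range(number_of_phenotypes)
    for start in range(0, number_of_phenotypes, max_cpu):
        yield all_jobs[start:start + max_cpu]

# e.g. 7 phenotypes on 3 CPUs -> batches of 3, 3 and 1 jobs
for chunk in iter_job_chunks(7, 3):
    print(list(chunk))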
Example #6
def run_cc_nmf(run_parameters):
    """ wrapper: call sequence to perform non-negative matrix factorization with
        consensus clustering and write results.

    Args:
        run_parameters: parameter set dictionary.
    """
    tmp_dir = 'tmp_cc_nmf'
    run_parameters = update_tmp_directory(run_parameters, tmp_dir)

    processing_method = run_parameters['processing_method']
    number_of_bootstraps = run_parameters['number_of_bootstraps']
    number_of_clusters = run_parameters['number_of_clusters']
    spreadsheet_name_full_path = run_parameters['spreadsheet_name_full_path']

    spreadsheet_df = kn.get_spreadsheet_df(spreadsheet_name_full_path)
    spreadsheet_mat = spreadsheet_df.values
    spreadsheet_mat = kn.get_quantile_norm_matrix(spreadsheet_mat)
    number_of_samples = spreadsheet_mat.shape[1]

    if processing_method == 'serial':
        for sample in range(0, number_of_bootstraps):
            run_cc_nmf_clusters_worker(spreadsheet_mat, run_parameters, sample)

    elif processing_method == 'parallel':
        find_and_save_cc_nmf_clusters_parallel(spreadsheet_mat, run_parameters,
                                               number_of_bootstraps)

    elif processing_method == 'distribute':
        func_args = [spreadsheet_mat, run_parameters]
        dependency_list = [
            run_cc_nmf_clusters_worker, save_a_clustering_to_tmp,
            dstutil.determine_parallelism_locally
        ]
        dstutil.execute_distribute_computing_job(
            run_parameters['cluster_ip_address'], number_of_bootstraps,
            func_args, find_and_save_cc_nmf_clusters_parallel, dependency_list)
    else:
        raise ValueError('processing_method contains bad value.')

    consensus_matrix = form_consensus_matrix(run_parameters, number_of_samples)
    labels = kn.perform_kmeans(consensus_matrix, number_of_clusters)

    sample_names = spreadsheet_df.columns
    save_consensus_clustering(consensus_matrix, sample_names, labels,
                              run_parameters)
    save_final_samples_clustering(sample_names, labels, run_parameters)
    save_spreadsheet_and_variance_heatmap(spreadsheet_df, labels,
                                          run_parameters)

    kn.remove_dir(run_parameters["tmp_directory"])
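kn.get_quantile_norm_matrix makes the sample columns comparable before NMF. A minimal numpy version of standard quantile normalization for reference; ties are broken arbitrarily here, and the toolbox helper may differ in detail.

import numpy as np

def quantile_normalize(mat):
    """Give every column the same distribution: the mean of the column-wise sorted values."""
    ranks = np.argsort(np.argsort(mat, axis=0), axis=0)   # 0-based rank of each entry in its column
    reference = np.sort(mat, axis=0).mean(axis=1)         # reference distribution
    return reference[ranks]

mat = np.array([[5.0, 4.0, 3.0],
                [2.0, 1.0, 4.0],
                [3.0, 4.0, 6.0],
                [4.0, 2.0, 8.0]])
print(quantile_normalize(mat))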
Example #7
def run_cc_similarity(run_parameters):
    """ Performs similarity analysis with bootstraps and saves the similarity matrix.

    Args:
        run_parameters: parameter set dictionary.
    """
    tmp_dir = 'tmp_cc_similarity'
    run_parameters = update_tmp_directory(run_parameters, tmp_dir)

    expression_name = run_parameters["spreadsheet_name_full_path"]
    signature_name = run_parameters["signature_name_full_path"]
    similarity_measure = run_parameters["similarity_measure"]
    number_of_bootstraps = run_parameters['number_of_bootstraps']
    processing_method = run_parameters['processing_method']

    expression_df = kn.get_spreadsheet_df(expression_name)
    signature_df = kn.get_spreadsheet_df(signature_name)

    samples_names = expression_df.columns
    signatures_names = signature_df.columns
    signatures_names = [i.split('.')[0] for i in signatures_names]
    signature_df.columns = signatures_names

    expression_mat = expression_df.values
    signature_mat = signature_df.values
    if processing_method == 'serial':
        for sample in range(0, number_of_bootstraps):
            run_cc_similarity_signature_worker(expression_df, signature_df,
                                               run_parameters, sample)

    elif processing_method == 'parallel':
        find_and_save_cc_similarity_parallel(expression_df, signature_df,
                                             run_parameters,
                                             number_of_bootstraps)

    else:
        raise ValueError('processing_method contains bad value.')

    # consensus_df = form_consensus_df(run_parameters, expression_df, signature_df)
    similarity_df = assemble_similarity_df(expression_df, signature_df,
                                           run_parameters)

    similarity_df = pd.DataFrame(similarity_df.values,
                                 index=samples_names,
                                 columns=signatures_names)
    save_final_samples_signature(similarity_df, run_parameters)
    save_best_match_signature(similarity_df, run_parameters)

    kn.remove_dir(run_parameters["tmp_directory"])
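save_best_match_signature reduces the samples-by-signatures similarity matrix to one winning signature per sample. A plausible sketch of that reduction (the saved file layout is not reproduced here):

import pandas as pd

def best_match_per_sample(similarity_df):
    """For each sample (row), return the highest-scoring signature and its score."""
    return pd.DataFrame({
        "best_signature": similarity_df.idxmax(axis=1),
        "similarity": similarity_df.max(axis=1),
    })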
Example #8
def run_bootstrap_correlation(run_parameters):
    """ perform feature prioritization using bootstrap sampling

    Args:
        run_parameters: parameter set dictionary.
    """
    run_parameters["results_tmp_directory"] = kn.create_dir(run_parameters["results_directory"], 'tmp')

    phenotype_df        = kn.get_spreadsheet_df(run_parameters["phenotype_name_full_path"])
    spreadsheet_df      = kn.get_spreadsheet_df(run_parameters["spreadsheet_name_full_path"])
    phenotype_df        = phenotype_df.T
    n_bootstraps        = run_parameters["number_of_bootstraps"]
    number_of_jobs      = len(phenotype_df.index)
    jobs_id             = range(0, number_of_jobs)
    zipped_arguments    = dstutil.zip_parameters(run_parameters, spreadsheet_df, phenotype_df, n_bootstraps, jobs_id)

    dstutil.parallelize_processes_locally(run_bootstrap_correlation_worker, zipped_arguments, number_of_jobs)
    write_phenotype_data_all(run_parameters)
    kn.remove_dir(run_parameters["results_tmp_directory"])
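Each bootstrap worker resamples the sample columns and re-scores the gene/phenotype correlation, and the per-bootstrap scores are then aggregated. A compact numpy sketch of that loop for one phenotype using absolute Pearson correlation, assuming a genes-by-samples matrix; the actual workers also handle sample percentages and other correlation measures.

import numpy as np

def bootstrap_pearson(spreadsheet_mat, phenotype_vec, n_bootstraps=100, seed=None):
    """Mean absolute Pearson correlation of each gene with one phenotype
    over bootstrap resamples of the sample columns."""
    rng = np.random.default_rng(seed)
    n_genes, n_samples = spreadsheet_mat.shape
    scores = np.zeros(n_genes)
    for _ in range(n_bootstraps):
        idx = rng.integers(0, n_samples, size=n_samples)   # resample with replacement
        x = spreadsheet_mat[:, idx] - spreadsheet_mat[:, idx].mean(axis=1, keepdims=True)
        y = phenotype_vec[idx] - phenotype_vec[idx].mean()
        denom = np.sqrt((x ** 2).sum(axis=1) * (y ** 2).sum()) + 1e-15
        scores += np.abs(x @ y) / denom
    return scores / n_bootstraps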
Example #9
 def test_create_dir_AND_remove_dir(self):
     """ assert that the functions work togeather to create and remove a directory
         even when files have been added
     """
     dir_name = 'tmp_test'
     dir_path = self.run_parameters['test_directory']
     new_directory_name = kn.create_dir(dir_path, dir_name)
     self.assertTrue(os.path.exists(new_directory_name), msg='create_dir function exception')
     A = np.random.rand(10, 10)
     time_stamp = '123456789'
     a_name = os.path.join(new_directory_name, 'temp_test' + time_stamp)
     with open(a_name, 'wb') as fh:
         A.dump(fh)
     A_back = np.load(a_name, allow_pickle=True)  # ndarray.dump writes a pickle
     if os.path.isfile(a_name):
         os.remove(a_name)
     A_diff = A - A_back
     A_diff = A_diff.sum()
     self.assertEqual(A_diff, 0, msg='write / read directory exception')
     kn.remove_dir(new_directory_name)
     self.assertFalse(os.path.exists(new_directory_name), msg='remove_dir function exception')
Example #10
def run_net_correlation(run_parameters):
    """ perform gene prioritization with network smoothing

    Args:
        run_parameters: parameter set dictionary.
    """
    max_cpu = run_parameters["max_cpu"]
    run_parameters["results_tmp_directory"] = kn.create_dir(
        run_parameters["results_directory"], 'tmp')
    gg_network_name_full_path = run_parameters['gg_network_name_full_path']
    network_mat, unique_gene_names = kn.get_sparse_network_matrix(
        gg_network_name_full_path)

    network_mat = normalize(network_mat, norm="l1", axis=0)

    phenotype_df = kn.get_spreadsheet_df(
        run_parameters["phenotype_name_full_path"])
    spreadsheet_df = kn.get_spreadsheet_df(
        run_parameters["spreadsheet_name_full_path"])
    spreadsheet_genes_as_input = spreadsheet_df.index.values
    phenotype_df = phenotype_df.T

    spreadsheet_df = kn.update_spreadsheet_df(spreadsheet_df,
                                              unique_gene_names)
    spreadsheet_df = zscore_dataframe(spreadsheet_df)

    sample_smooth, iterations = kn.smooth_matrix_with_rwr(
        spreadsheet_df.values, network_mat.T, run_parameters)
    spreadsheet_df = pd.DataFrame(sample_smooth,
                                  index=spreadsheet_df.index,
                                  columns=spreadsheet_df.columns)

    baseline_array = np.ones(network_mat.shape[0]) / network_mat.shape[0]
    baseline_array = kn.smooth_matrix_with_rwr(baseline_array, network_mat,
                                               run_parameters)[0]

    #-----------------------------------------------------------------------------------------
    #   Partition the phenotype dataframe (partition size = MaxCPU)
    #-----------------------------------------------------------------------------------------

    len_phenotype = len(phenotype_df.index)
    array_of_jobs = range(0, len_phenotype)

    if len_phenotype <= max_cpu:
        jobs_id = array_of_jobs
        number_of_jobs = len(jobs_id)

        zipped_arguments = dstutil.zip_parameters(run_parameters,
                                                  spreadsheet_df, phenotype_df,
                                                  network_mat,
                                                  spreadsheet_genes_as_input,
                                                  baseline_array, jobs_id)
        dstutil.parallelize_processes_locally(run_net_correlation_worker,
                                              zipped_arguments, number_of_jobs)

    else:
        for i in range(0, len_phenotype, max_cpu):
            jobs_id = array_of_jobs[i:i + max_cpu]
            number_of_jobs = len(jobs_id)
            #-----------------------------------------------------------------------------------------
            zipped_arguments = dstutil.zip_parameters(run_parameters,
                                                      spreadsheet_df, phenotype_df,
                                                      network_mat,
                                                      spreadsheet_genes_as_input,
                                                      baseline_array, jobs_id)
            dstutil.parallelize_processes_locally(run_net_correlation_worker,
                                                  zipped_arguments, number_of_jobs)

    write_phenotype_data_all(run_parameters)
    #-----------------------------------------------------------------------------------------

    kn.remove_dir(run_parameters["results_tmp_directory"])
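kn.smooth_matrix_with_rwr is a random walk with restart over the gene-gene network: with a column-normalized adjacency W, restart probability r and initial profile F0, it iterates F <- (1 - r) * W @ F + r * F0 to (near) convergence. A compact sketch of that fixed-point iteration; the toolbox version also reports the iteration count and reads r and the tolerance from run_parameters.

import numpy as np

def smooth_with_rwr(f0, network_mat, restart_prob=0.5, tol=1e-8, max_iter=100):
    """Iterate F <- (1 - r) * W @ F + r * F0 until the largest change is below tol."""
    f = f0.copy()
    for iteration in range(1, max_iter + 1):
        f_next = (1 - restart_prob) * (network_mat @ f) + restart_prob * f0
        if np.abs(f_next - f).max() < tol:
            return f_next, iteration
        f = f_next
    return f, max_iter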
Example #11
def run_cc_net_nmf(run_parameters):
    """ wrapper: call sequence to perform network based stratification with consensus clustering
        and write results.

    Args:
        run_parameters: parameter set dictionary.
    """

    tmp_dir = 'tmp_cc_net_nmf'
    run_parameters = update_tmp_directory(run_parameters, tmp_dir)

    processing_method = run_parameters['processing_method']
    number_of_clusters = run_parameters['number_of_clusters']
    number_of_bootstraps = run_parameters['number_of_bootstraps']
    gg_network_name_full_path = run_parameters['gg_network_name_full_path']
    spreadsheet_name_full_path = run_parameters['spreadsheet_name_full_path']

    network_mat,               \
             unique_gene_names = kn.get_sparse_network_matrix(gg_network_name_full_path)
    network_mat = kn.normalize_sparse_mat_by_diagonal(network_mat)
    lap_diag, lap_pos = kn.form_network_laplacian_matrix(network_mat)

    spreadsheet_df = kn.get_spreadsheet_df(spreadsheet_name_full_path)
    spreadsheet_df = kn.update_spreadsheet_df(spreadsheet_df,
                                              unique_gene_names)

    spreadsheet_mat = spreadsheet_df.values
    number_of_samples = spreadsheet_mat.shape[1]
    sample_names = spreadsheet_df.columns

    if processing_method == 'serial':
        for sample in range(0, number_of_bootstraps):
            run_cc_net_nmf_clusters_worker(network_mat, spreadsheet_mat,
                                           lap_diag, lap_pos, run_parameters,
                                           sample)

    elif processing_method == 'parallel':
        find_and_save_cc_net_nmf_clusters_parallel(network_mat,
                                                   spreadsheet_mat, lap_diag,
                                                   lap_pos, run_parameters,
                                                   number_of_bootstraps)

    elif processing_method == 'distribute':
        func_args = [
            network_mat, spreadsheet_mat, lap_diag, lap_pos, run_parameters
        ]
        dependency_list = [
            run_cc_net_nmf_clusters_worker, save_a_clustering_to_tmp,
            dstutil.determine_parallelism_locally
        ]
        cluster_ip_address = run_parameters['cluster_ip_address']
        dstutil.execute_distribute_computing_job(
            cluster_ip_address, number_of_bootstraps, func_args,
            find_and_save_cc_net_nmf_clusters_parallel, dependency_list)
    else:
        raise ValueError('processing_method contains bad value.')

    consensus_matrix = form_consensus_matrix(run_parameters, number_of_samples)
    distance_matrix = pairwise_distances(
        consensus_matrix,
        n_jobs=-1)  # [n_samples, n_samples] use all available cores
    labels = kn.perform_kmeans(consensus_matrix, number_of_clusters)

    save_consensus_clustering(consensus_matrix, sample_names, labels,
                              run_parameters)
    calculate_and_save_silhouette_scores(distance_matrix, sample_names, labels,
                                         run_parameters)
    save_final_samples_clustering(sample_names, labels, run_parameters)
    save_spreadsheet_and_variance_heatmap(spreadsheet_df, labels,
                                          run_parameters, network_mat)

    kn.remove_dir(run_parameters["tmp_directory"])
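form_consensus_matrix aggregates the bootstrap clusterings written to the tmp directory: entry (i, j) is the fraction of bootstraps in which samples i and j ended up in the same cluster, out of the bootstraps where both were drawn. A toy in-memory sketch of that aggregation, ignoring the file I/O and assuming each bootstrap drew a set of distinct sample indices.

import numpy as np

def consensus_from_labels(bootstrap_results, n_samples):
    """bootstrap_results: list of (sampled_indices, labels) pairs, one per bootstrap,
    where sampled_indices are distinct sample positions and labels are numpy arrays
    of cluster ids for those samples."""
    together = np.zeros((n_samples, n_samples))   # times i and j shared a cluster
    sampled = np.zeros((n_samples, n_samples))    # times i and j were both drawn
    for indices, labels in bootstrap_results:
        same_cluster = (labels[:, None] == labels[None, :]).astype(float)
        together[np.ix_(indices, indices)] += same_cluster
        sampled[np.ix_(indices, indices)] += 1.0
    return together / np.maximum(sampled, 1.0)    # avoid division by zero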
Example #12
 def tearDown(self):
     kn.remove_dir(self.run_parameters["tmp_directory"])
     del self.run_parameters
Example #13
def run_cc_link_hclust(run_parameters):
    """ wrapper: call sequence to perform hclust with
        consensus clustering and write results.

    Args:
        run_parameters: parameter set dictionary.
    """

    tmp_dir = 'tmp_cc_link_hclust'
    run_parameters = update_tmp_directory(run_parameters, tmp_dir)

    processing_method = run_parameters['processing_method']
    number_of_bootstraps = run_parameters['number_of_bootstraps']
    number_of_clusters = run_parameters['number_of_clusters']
    nearest_neighbors = run_parameters['nearest_neighbors']
    affinity_metric = run_parameters['affinity_metric']
    linkage_criterion = run_parameters['linkage_criterion']

    spreadsheet_name_full_path = run_parameters['spreadsheet_name_full_path']

    spreadsheet_df = kn.get_spreadsheet_df(spreadsheet_name_full_path)
    spreadsheet_mat = spreadsheet_df.values
    number_of_samples = spreadsheet_mat.shape[1]

    if processing_method == 'serial':
        for sample in range(0, number_of_bootstraps):
            run_cc_link_hclust_clusters_worker(spreadsheet_mat, run_parameters,
                                               sample)

    elif processing_method == 'parallel':
        find_and_save_cc_link_hclust_clusters_parallel(spreadsheet_mat,
                                                       run_parameters,
                                                       number_of_bootstraps)

    elif processing_method == 'distribute':

        func_args = [spreadsheet_mat, run_parameters]

        dependency_list = [ run_cc_link_hclust_clusters_worker     \
                          , kn.save_a_clustering_to_tmp            \
                          , dstutil.determine_parallelism_locally  ]

        dstutil.execute_distribute_computing_job(
            run_parameters['cluster_ip_address'], number_of_bootstraps,
            func_args, find_and_save_cc_link_hclust_clusters_parallel,
            dependency_list)
    else:
        raise ValueError('processing_method contains bad value.')

    consensus_matrix = kn.form_consensus_matrix(run_parameters,
                                                number_of_samples)

    labels, \
    distance_matrix = perform_link_hclust( consensus_matrix
                                         , number_of_clusters
                                         , nearest_neighbors
                                         , affinity_metric
                                         , linkage_criterion)

    sample_names = spreadsheet_df.columns

    save_consensus_matrix(consensus_matrix, sample_names, labels,
                          run_parameters)
    save_clustering_scores(distance_matrix, sample_names, labels,
                           run_parameters)

    save_final_samples_clustering(sample_names, labels, run_parameters)
    save_spreadsheet_and_variance_heatmap(spreadsheet_df, labels,
                                          run_parameters)

    kn.remove_dir(run_parameters["tmp_directory"])
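perform_link_hclust clusters the consensus matrix with connectivity-constrained agglomerative linkage. A rough scikit-learn sketch of that step, assuming the consensus matrix rows are used as feature vectors, nearest_neighbors builds the connectivity graph, and linkage_criterion maps onto scikit-learn's linkage argument; the pipeline's own helper may build its distance matrix differently.

from sklearn.cluster import AgglomerativeClustering
from sklearn.metrics import pairwise_distances
from sklearn.neighbors import kneighbors_graph

def link_hclust_sketch(consensus_matrix, number_of_clusters,
                       nearest_neighbors=10, linkage_criterion='ward'):
    """Connectivity-constrained hierarchical clustering of a consensus matrix."""
    connectivity = kneighbors_graph(consensus_matrix,
                                    n_neighbors=nearest_neighbors,
                                    include_self=False)
    model = AgglomerativeClustering(n_clusters=number_of_clusters,
                                    connectivity=connectivity,
                                    linkage=linkage_criterion)
    labels = model.fit_predict(consensus_matrix)
    distance_matrix = pairwise_distances(consensus_matrix)
    return labels, distance_matrix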