def run_correlation_worker(run_parameters, spreadsheet_df, phenotype_df,
                           job_id):
    """ Correlate the spreadsheet against one phenotype row and emit the result.

    Core function for the parallel run_correlation: each call handles the
    single phenotype row located at position ``job_id``.

    Args:
        run_parameters:  dict of parameters; forwarded unchanged to
                         get_correlation and generate_correlation_output.
        spreadsheet_df:  spreadsheet data frame; its index supplies the
                         feature names passed to the output step.
        phenotype_df:    phenotype data frame; only row ``job_id`` is used.
        job_id:          parallel iteration number; used both as the numpy
                         random seed and as the positional phenotype row.

    Returns:
        None. The result is handed to generate_correlation_output.
    """
    # Seed per job so any randomness downstream is reproducible per phenotype.
    np.random.seed(job_id)

    # The list index ([job_id]) keeps the selection a one-row DataFrame.
    single_phenotype = phenotype_df.iloc[[job_id], :]

    checked = datacln.check_input_value_for_gene_prioritazion(
        spreadsheet_df, single_phenotype)
    # The third returned item is not used by this worker.
    checked_spreadsheet, checked_phenotype, _ = checked

    correlation = get_correlation(
        checked_spreadsheet.values,
        checked_phenotype.values[0],
        run_parameters,
    )

    generate_correlation_output(
        correlation,
        checked_phenotype.index.values[0],
        checked_spreadsheet.index,
        run_parameters,
    )
# Example #2
# 0
def run_bootstrap_net_correlation_worker(run_parameters, spreadsheet_df, phenotype_df, network_mat,
                                             spreadsheet_genes_as_input, baseline_array, job_id):
    """ worker for bootstrap network parallelization.

    For the phenotype row at position ``job_id``: computes the correlation on
    the full spreadsheet once, then repeats a sample / correlate / trim /
    network-smooth cycle ``number_of_bootstraps`` times, accumulating a borda
    count and a per-row count of how often the trimmed correlation was
    non-zero. The averaged results go to generate_net_correlation_output.

    Args:
        run_parameters:  dict of parameters; reads "number_of_bootstraps",
                         "cols_sampling_fraction" and "top_beta_of_sort", and
                         is forwarded to the helper functions.
        spreadsheet_df:  spreadsheet data frame
        phenotype_df:    phenotype data frame; only row ``job_id`` is used.
        network_mat:     adjacency matrix
        spreadsheet_genes_as_input: list of genes; rows of spreadsheet_df whose
                         index is not in this list get a zero correlation
                         before trimming.
        baseline_array:  network smoothed baseline array, subtracted from each
                         smoothed bootstrap result.
        job_id:          parallel iteration number; used both as the numpy
                         random seed and as the positional phenotype row.

    Returns:
        None. Results are handed to generate_net_correlation_output.
    """

    np.random.seed(job_id)

    n_bootstraps = run_parameters["number_of_bootstraps"]
    cols_sampling_fraction = run_parameters["cols_sampling_fraction"]
    top_beta_of_sort = run_parameters["top_beta_of_sort"]

    # Accumulators are sized by the network; the boolean indexing below assumes
    # the correlation array has the same length -- TODO confirm with caller.
    restart_accumulator = np.zeros(network_mat.shape[0])
    borda_count = np.zeros(network_mat.shape[0])

    # The list index ([job_id]) keeps the selection a one-row DataFrame.
    phenotype_df = phenotype_df.iloc[[job_id], :]
    spreadsheet_df, phenotype_df, _ = datacln.check_input_value_for_gene_prioritazion(
        spreadsheet_df, phenotype_df)

    # DataFrame.as_matrix() no longer exists in pandas >= 1.0; .values is the
    # accessor the sibling workers already use.
    sample_smooth = spreadsheet_df.values
    pearson_array = get_correlation(sample_smooth, phenotype_df.values[0], run_parameters)

    # Loop-invariant: depends only on the spreadsheet index and the gene list.
    mask = np.in1d(spreadsheet_df.index, spreadsheet_genes_as_input)

    for _ in range(n_bootstraps):
        sample_random, sample_permutation = sample_a_matrix_pearson(
            sample_smooth, 1.0, cols_sampling_fraction)

        # [0, None] keeps the row two-dimensional so that the permutation
        # selects along its second axis.
        phenotype_response = phenotype_df.values[0, None]
        phenotype_response = phenotype_response[0, sample_permutation]

        pc_array = get_correlation(sample_random, phenotype_response, run_parameters)
        pc_array[~mask] = 0.0
        pc_array = np.abs(trim_to_top_beta(pc_array, top_beta_of_sort))

        # Count the rows that survived masking and trimming in this bootstrap.
        restart_accumulator[pc_array != 0] += 1.0

        # Lower-bounding the divisor with EPSILON_0 avoids dividing by a zero
        # sum (EPSILON_0 is presumably a small positive constant -- verify).
        pc_array = pc_array / max(sum(pc_array), EPSILON_0)
        pc_array = kn.smooth_matrix_with_rwr(pc_array, network_mat, run_parameters)[0]
        pc_array = pc_array - baseline_array
        borda_count = sum_array_ranking_to_borda_count(borda_count, pc_array)

    restart_accumulator = restart_accumulator / n_bootstraps
    borda_count = borda_count / n_bootstraps

    # Min-max rescale of the averaged borda count.
    # NOTE(review): the denominator is zero when all borda counts are equal.
    viz_score = (borda_count - min(borda_count)) / (max(borda_count) - min(borda_count))

    phenotype_name = phenotype_df.index.values[0]
    gene_name_list = spreadsheet_df.index
    gene_orig_list = spreadsheet_genes_as_input
    quantitative_score = borda_count

    generate_net_correlation_output(pearson_array, quantitative_score, viz_score, restart_accumulator,
                                    phenotype_name, gene_name_list, gene_orig_list, run_parameters)
def run_net_correlation_worker(run_parameters, spreadsheet_df, phenotype_df,
                               network_mat, spreadsheet_genes_as_input,
                               baseline_array, job_id):
    """  Network-smoothed correlation for one phenotype row.

    Core function for the parallel run_net_correlation: correlates the
    spreadsheet with the phenotype row at position ``job_id``, zeroes rows
    not listed in ``spreadsheet_genes_as_input``, trims via trim_to_top_beta,
    smooths the normalised result with kn.smooth_matrix_with_rwr, subtracts
    the baseline and passes everything to generate_net_correlation_output.

    Args:
        run_parameters:  dict of parameters; reads "top_beta_of_sort" and is
                         forwarded to the helper functions.
        spreadsheet_df:  spreadsheet data frame
        phenotype_df:    phenotype data frame; only row ``job_id`` is used.
        network_mat:     adjacency matrix
        spreadsheet_genes_as_input: list of genes
        baseline_array:  network smoothed baseline array
        job_id:          parallel iteration number; used both as the numpy
                         random seed and as the positional phenotype row.

    Returns:
        None. Results are handed to generate_net_correlation_output.
    """

    np.random.seed(job_id)

    # The list index ([job_id]) keeps the selection a one-row DataFrame.
    one_phenotype = phenotype_df.iloc[[job_id], :]
    checked = datacln.check_input_value_for_gene_prioritazion(
        spreadsheet_df, one_phenotype)
    # The third returned item is not used by this worker.
    spreadsheet_df, phenotype_df, _ = checked

    working = get_correlation(
        spreadsheet_df.values, phenotype_df.values[0], run_parameters)

    # Snapshot the untouched correlation before the in-place masking below.
    pearson_array = working.copy()

    is_input_gene = np.in1d(spreadsheet_df.index, spreadsheet_genes_as_input)
    working[~is_input_gene] = 0.0
    trimmed = np.abs(
        trim_to_top_beta(working, run_parameters["top_beta_of_sort"]))

    # Indicator array: 1 wherever the trimmed correlation is non-zero.
    restart_accumulator = trimmed.copy()
    restart_accumulator[restart_accumulator != 0] = 1

    # The divisor is lower-bounded by EPSILON_0 so a zero sum cannot divide by 0.
    divisor = max(sum(trimmed), EPSILON_0)
    restart_vector = trimmed / divisor
    smoothed = kn.smooth_matrix_with_rwr(
        restart_vector, network_mat, run_parameters)[0]

    quantitative_score = smoothed - baseline_array

    # Min-max rescale of the baseline-corrected score.
    lowest = min(quantitative_score)
    highest = max(quantitative_score)
    viz_score = (quantitative_score - lowest) / (highest - lowest)

    generate_net_correlation_output(
        pearson_array,
        quantitative_score,
        viz_score,
        restart_accumulator,
        phenotype_df.index.values[0],
        spreadsheet_df.index,
        spreadsheet_genes_as_input,
        run_parameters,
    )
def run_bootstrap_correlation_worker(run_parameters, spreadsheet_df,
                                     phenotype_df, n_bootstraps, job_id):
    """  core function for parallel run_bootstrap_correlation

    For the phenotype row at position ``job_id``: computes the correlation on
    the full spreadsheet once, then ``n_bootstraps`` times draws a sample via
    sample_a_matrix_pearson, correlates it with the matching phenotype
    entries and adds the absolute correlations into a borda count. The
    averaged borda count, its min-max rescaling and the full-data correlation
    go to generate_bootstrap_correlation_output.

    Args:
        run_parameters:  dict of parameters; reads "cols_sampling_fraction"
                         and is forwarded to the helper functions.
        spreadsheet_df:  spreadsheet data frame
        phenotype_df:    phenotype data frame; only row ``job_id`` is used.
        n_bootstraps:    number of bootstrap samples to use (must be >= 1)
        job_id:          parallel iteration number; used both as the numpy
                         random seed and as the positional phenotype row.

    Returns:
        None. Results are handed to generate_bootstrap_correlation_output.

    Raises:
        ValueError: if n_bootstraps is smaller than 1.
    """
    # The averaging below divides by n_bootstraps; fail loudly instead of
    # writing NaN output when no bootstrap would run.
    if n_bootstraps < 1:
        raise ValueError(
            "n_bootstraps must be at least 1, got {}".format(n_bootstraps))

    np.random.seed(job_id)

    # The list index ([job_id]) keeps the selection a one-row DataFrame.
    phenotype_df = phenotype_df.iloc[[job_id], :]
    spreadsheet_df, phenotype_df, _ = datacln.check_input_value_for_gene_prioritazion(
        spreadsheet_df, phenotype_df)

    # Extract the matrix once and reuse it for every bootstrap draw.
    sample_matrix = spreadsheet_df.values
    pearson_array = get_correlation(sample_matrix, phenotype_df.values[0],
                                    run_parameters)
    cols_sampling_fraction = run_parameters["cols_sampling_fraction"]
    borda_count = np.zeros(spreadsheet_df.shape[0])

    for _ in range(n_bootstraps):
        sample_random, sample_permutation = sample_a_matrix_pearson(
            sample_matrix, 1.0, cols_sampling_fraction)

        # [0, None] keeps the row two-dimensional so that the permutation
        # selects along its second axis.
        phenotype_response = phenotype_df.values[0, None]
        phenotype_response = phenotype_response[0, sample_permutation]

        pc_array = get_correlation(sample_random, phenotype_response,
                                   run_parameters)
        borda_count = sum_array_ranking_to_borda_count(borda_count,
                                                       np.abs(pc_array))

    borda_count = borda_count / n_bootstraps
    phenotype_name = phenotype_df.index.values[0]
    gene_name_list = spreadsheet_df.index

    # Min-max rescale of the averaged borda count.
    # NOTE(review): the denominator is zero when all borda counts are equal.
    viz_score = (borda_count - min(borda_count)) / (max(borda_count) -
                                                    min(borda_count))

    generate_bootstrap_correlation_output(borda_count, viz_score,
                                          pearson_array, phenotype_name,
                                          gene_name_list, run_parameters)