Ejemplo n.º 1
0
def main(args):
    """Evaluate a network against a collection of gene sets and save the results.

    Loads the network and gene sets, builds a propagation kernel, computes
    AUPRC values for the real network and for a series of shuffled (null)
    networks, and then writes three files prefixed with ``args.outname``:

    * ``<outname>_performance_score.csv`` -- output of
      ``nef.calculate_network_performance_score`` (tab-separated).
    * ``<outname>_performance_gain.csv`` -- output of
      ``nef.calculate_network_performance_gain`` (tab-separated).
    * ``<outname>_summary`` -- node count, edge count, average node degree
      and edge density, one ``name:<TAB>value`` pair per line.

    Args:
        args: Parsed command-line namespace. The attributes read are
            ``infile`` (network file), ``diseasefile`` (gene set file),
            ``outname`` (output prefix), ``cores``, ``verbose`` and ``size``
            (``'s'`` selects the small-network AUPRC wrapper; any other
            value selects the large-network wrapper).
    """
    input_network_file = args.infile  # Input gene interaction set
    gene_set_file = args.diseasefile
    outname = args.outname
    n_cores = args.cores
    is_verbose = args.verbose
    large = args.size != 's'

    # Settings shared by the real-network and null-network evaluations.
    n_sub_samples = 30
    n_null_networks = 10

    # Load network (We choose a smaller network here for the example's sake)
    network = dit.load_network_file(input_network_file, verbose=is_verbose)

    # Load gene sets for analysis
    genesets = dit.load_node_sets(gene_set_file)

    # Calculate geneset sub-sample rate
    genesets_p = nef.calculate_p(network, genesets)

    # Determine optimal alpha for network (can also be done automatically by next step)
    alpha = prop.calculate_alpha(network)

    # Calculate network kernel for propagation
    kernel = nef.construct_prop_kernel(network,
                                       alpha=alpha,
                                       verbose=is_verbose)

    # Both wrappers are called with identical arguments, so pick the
    # implementation once and reuse it for the real and the null networks.
    if large:
        auprc_wrapper = nef.large_network_AUPRC_wrapper
    else:
        auprc_wrapper = nef.small_network_AUPRC_wrapper

    # Might want to tweak values here to speed up calculation
    # Calculate the AUPRC values for each gene set
    AUPRC_values = auprc_wrapper(kernel,
                                 genesets,
                                 genesets_p,
                                 n=n_sub_samples,
                                 cores=n_cores,
                                 verbose=is_verbose)

    # Construct null networks and calculate the AUPRC of the gene sets of the null networks
    # We can use the AUPRC wrapper function for this
    null_AUPRCs = []
    for i in range(n_null_networks):
        shuffNet = nef.shuffle_network(network,
                                       max_tries_n=10,
                                       verbose=is_verbose)
        shuffNet_kernel = nef.construct_prop_kernel(shuffNet,
                                                    alpha=alpha,
                                                    verbose=is_verbose)
        shuffNet_AUPRCs = auprc_wrapper(shuffNet_kernel,
                                        genesets,
                                        genesets_p,
                                        n=n_sub_samples,
                                        cores=n_cores,
                                        verbose=is_verbose)
        null_AUPRCs.append(shuffNet_AUPRCs)
        # A single pre-built string prints the same text under both the
        # Python 2 print statement and the Python 3 print function.
        print('shuffNet ' + repr(i + 1) + ' AUPRCs calculated')

    # Construct table of null AUPRCs
    null_AUPRCs_table = pd.concat(null_AUPRCs, axis=1)
    null_AUPRCs_table.columns = [
        'shuffNet' + repr(i + 1) for i in range(len(null_AUPRCs))
    ]

    # Calculate performance metric of gene sets; This is the Z-score
    network_performance = nef.calculate_network_performance_score(
        AUPRC_values, null_AUPRCs_table, verbose=is_verbose)
    network_performance.name = 'Test Network'
    network_performance.to_csv(outname + '_performance_score.csv', sep='\t')

    # Calculate network performance gain over median null AUPRC;
    network_perf_gain = nef.calculate_network_performance_gain(
        AUPRC_values, null_AUPRCs_table, verbose=is_verbose)
    network_perf_gain.name = 'Test Network'
    network_perf_gain.to_csv(outname + '_performance_gain.csv', sep='\t')

    # NOTE: ranking this network against a previously evaluated set of
    # networks is not performed here; only the score, gain and summary files
    # are produced.

    # Construct network summary table
    n_nodes = int(len(network.nodes()))
    n_edges = int(len(network.edges()))
    network_summary = {}
    network_summary['Nodes'] = n_nodes
    network_summary['Edges'] = n_edges
    # dict.values() is a view on Python 3, which np.mean cannot average
    # directly, so materialize it as a list first. float() keeps the repr()
    # written below a bare number regardless of how NumPy prints its scalars.
    network_summary['Avg Node Degree'] = float(
        np.mean(list(dict(network.degree()).values())))
    network_summary['Edge Density'] = 2 * n_edges / float(
        n_nodes * (n_nodes - 1))

    # Fixed key order keeps the summary file layout stable.
    with open(outname + '_summary', 'w') as f:
        for item in ['Nodes', 'Edges', 'Avg Node Degree', 'Edge Density']:
            f.write(item + ':\t' + repr(network_summary[item]) + '\n')
Ejemplo n.º 2
0
        genesets_p = nef.calculate_p(network, genesets)
    else:
        genesets_p = {geneset: args.sample_p for geneset in genesets}
    if args.verbose:
        print('Gene set sub-sample rates set')

    ############################################
    ##### Network Performance Calculations #####
    ############################################

    # Calculate AUPRC for each gene set on actual network (large networks are >=10k nodes)
    if network_size < 10000:
        actual_AUPRC_values = nef.small_network_AUPRC_wrapper(
            kernel,
            genesets,
            genesets_p,
            n=args.sub_sample_iter,
            cores=args.cores,
            bg=background_nodes,
            verbose=False)
    else:
        actual_AUPRC_values = nef.large_network_AUPRC_wrapper(
            kernel,
            genesets,
            genesets_p,
            n=args.sub_sample_iter,
            cores=args.cores,
            bg=background_nodes,
            verbose=False)

    # Save the actual network's AUPRC values
    if args.verbose:
Ejemplo n.º 3
0
#%%
import networkx as nx
# Report the network size, then list its connected components as subgraphs.
print(len(network.nodes))
# nx.connected_component_subgraphs was removed in NetworkX 2.4. Build the
# per-component subgraphs from nx.connected_components instead; .copy() makes
# each one an independent graph rather than a view onto `network`.
subs = [network.subgraph(c).copy() for c in nx.connected_components(network)]
print(subs)
#%%

# Build the propagation kernel for the network using the chosen alpha
kernel = nef.construct_prop_kernel(
    network,
    alpha=alpha,
    verbose=True,
)
#%%
# Inspect the kernel's index next to the loaded gene sets
print(kernel.index)
print(genesets)
#%%

# Compute AUPRC values for every gene set (small-network wrapper, 4 cores)
AUPRC_values = nef.small_network_AUPRC_wrapper(
    kernel,
    genesets,
    genesets_p,
    n=30,
    cores=4,
    verbose=True,
)

#%% md

#**Note about the above cell:** There are several options for this particular step depending on the computational resources available and network size. If the network is sufficiently small (<250k edges), it is recommended to use the 'small_network_AUPRC_wrapper' function as it can be much faster, especially when run in parallel (at least 8G per core is recommended). If you would like to parallelize the AUPRC calculation with a larger network (between 250K and 2.5M edges), at least 16G per core is recommended, 32G per core if the network contains more than 2.5M edges. For larger networks, it is recommended to use the 'large_network_AUPRC_wrapper', which may be a slightly slower function, but is better equipped to handle the larger memory footprint required. To change the parallelization status of the function, change the 'cores' option to the number of threads you would like to utilize.

#%%

# Construct null networks and calculate the AUPRC of the gene sets of the null networks
# We can use the AUPRC wrapper function for this
if os.path.exists('null_AUPRCs.pickle'):
    null_AUPRCs = pickle.load(open('null_AUPRCs.pickle','rb'))
else:
    null_AUPRCs = []
    for i in range(10):
        shuffNet = nef.shuffle_network(network, max_tries_n=10, verbose=True)