def write_enrichment_files(in_fname, out_fname, go_dct):
    '''
    Loads the cluster dictionary stored in in_fname and hands it, together
    with go_dct, to compute_go_enrichments, passing out_fname as the
    destination for the results.
    '''
    clusters = file_operations.get_cluster_dictionary(in_fname)
    compute_go_enrichments(out_fname, clusters, go_dct)
def write_summary(clus_fname, net_fname, eval_fname, enrich_fname, out_fname):
    '''
    Writes a tab-separated summary of a clustering run to out_fname.

    The output starts with a header/value pair of lines describing the whole
    network (number of genes, gene-gene edges and gene-GO edges), followed by
    a column header line and one line per cluster found in the density
    dictionary.

    Arguments:
        clus_fname: cluster file, read with
            file_operations.get_cluster_dictionary.
        net_fname: network file, read with file_operations.get_network_stats.
        eval_fname: cluster evaluation file, read with
            file_operations.get_cluster_densities.
        enrich_fname: enrichment file, read with
            file_operations.get_enrichment_dct.
        out_fname: path of the summary file to create (overwritten).

    Raises:
        KeyError: if a cluster ID from the density dictionary is missing from
            the cluster dictionary or the enrichment dictionary.
    '''
    # Get cluster dictionary. Values are lists of nodes (genes and GO terms).
    clst_go_dct = file_operations.get_cluster_dictionary(clus_fname)
    # Get the number of gene-gene and gene-GO edges from the network.
    num_genes_net, num_gg_net, num_ggo_net = file_operations.get_network_stats(
        net_fname)
    # Get density dictionary. Values are (in-density, out-density, ratio).
    density_dct = file_operations.get_cluster_densities(eval_fname)
    # Find the best p-value GO enrichments for each cluster.
    enrichment_dct = file_operations.get_enrichment_dct(enrich_fname)

    # Per-cluster edge counting is not performed; the two edge-count columns
    # are kept (always 0) so that the layout of the output file is unchanged.
    num_gg, num_ggo = 0, 0

    # The context manager closes the file even if a lookup below raises.
    with open(out_fname, 'w') as out:
        out.write('num_genes_in_net\tnum_g_g_net\tnum_g_go_net\n')
        out.write('%s\t%d\t%d\n' % (num_genes_net, num_gg_net, num_ggo_net))
        out.write('cluster_number\tin_dens\tout_dens\tratio\t')
        out.write('num_genes\tnum_go_terms_in\tnum_g_g_edges\tnum_g_go_edges\t')
        out.write('top_enrichment_p\tgo_terms_in\n')
        for cid in density_dct:
            clus = clst_go_dct[cid]
            num_genes = len(clus)
            # Any node that does not carry an Ensembl gene prefix is counted
            # as a GO term.
            cluster_go_terms = [
                node for node in clus
                if 'ENSMUSG' not in node and 'ENSG' not in node]
            num_go = len(cluster_go_terms)

            in_dens, out_dens, ratio = density_dct[cid]
            out.write('%s\t%g\t%g\t%g\t' % (cid, in_dens, out_dens, ratio))
            out.write('%d\t%d\t%d\t%d\t' % (num_genes, num_go, num_gg,
                num_ggo))
            out.write('%s\t%s\n' % (enrichment_dct[cid], '\t'.join(
                cluster_go_terms)))
def write_summary_tables(net_type):
    '''
    Builds the filenames of the result files of a particular clustering run,
    reads them, and writes a tab-separated per-cluster summary table.

    The results folder and filenames are derived from the module-level
    globals species and n_clusters plus the net_type argument. The summary
    is written to '<results_folder>/<n_clusters>_clusters_summary_<net_type>.tsv'
    with one row per cluster, ordered by numeric cluster ID.

    Arguments:
        net_type: network type label used in the filenames. For 'none' the
            plain cluster file is read; for anything else the '_clean'
            variant is read.

    Raises:
        KeyError: if a cluster ID from the density dictionary is missing from
            the cluster dictionary or the DBGAP enrichment dictionary.
        ZeroDivisionError: if a cluster has in-density + out-density == 0.
    '''
    results_folder = './results/%s_results' % species
    # Generate the format string.
    format_str = (results_folder, n_clusters, net_type)

    # Simulated annealing cluster results filename.
    if net_type == 'none':
        clus_fname = '%s/%s_clusters_%s.txt' % format_str
    else:
        clus_fname = '%s/%s_clusters_%s_clean.txt' % format_str
    clus_dct = file_operations.get_cluster_dictionary(clus_fname)

    # NOTE(review): get_cluster_densities and get_enrichment_dct are called
    # unqualified here, while write_summary calls the file_operations
    # versions; the value shapes used below (2-tuples, nested sequences)
    # differ from those in write_summary, so confirm these names resolve to
    # the intended helpers.
    # Perl script evaluation results filename.
    eval_fname = '%s/%s_cluster_eval_%s.txt' % format_str
    density_dct = get_cluster_densities(eval_fname)

    # DBGAP enrichment results filename.
    dbgap_fname = '%s/%s_clusters_dbgap_terms_%s.txt' % format_str
    dbgap_dct = get_enrichment_dct(dbgap_fname)

    out_fname = '%s/%s_clusters_summary_%s.tsv' % format_str
    # The context manager closes the file even if a row below raises.
    with open(out_fname, 'w') as out:
        out.write('Cluster ID\tIn-Density\tOut-Density\tIn/(In+Out)\t'
            'Gene size\tTop DBGAP p-value\tTop DBGAP term\n')
        # Cluster IDs are sorted by their integer value, not lexically.
        for cid in sorted(density_dct, key=int):
            num_genes = len(clus_dct[cid])
            in_dens, out_dens = density_dct[cid]
            # Index 0 of the enrichment entry is read as the terms and index
            # 1 as the matching p-values; the first of each is reported.
            best_dbgap_p = dbgap_dct[cid][1][0]
            best_dbgap_term = dbgap_dct[cid][0][0]
            out.write(
                '%s\t%g\t%g\t%g\t%d\t%s\t%s\n' %
                (cid, in_dens, out_dens, in_dens / (in_dens + out_dens),
                 num_genes, best_dbgap_p, best_dbgap_term))
def compute_label_enrichments(in_fname, out_fname):
    '''
    Reads the clusters in in_fname and writes the top label enrichments of
    each cluster to out_fname.

    For every cluster three lines are written: 'Cluster <id>', the
    tab-separated label terms of the first five entries returned by
    get_sorted_fisher_dct, and their tab-separated p-values.

    Side effect: rebinds the module-level global gene_universe to the set of
    all genes that appear in any cluster (presumably read by
    get_sorted_fisher_dct -- confirm against that function).

    Arguments:
        in_fname: cluster file, read with
            file_operations.get_cluster_dictionary.
        out_fname: path of the enrichment file to create (overwritten).
    '''
    global gene_universe

    cluster_dct = file_operations.get_cluster_dictionary(in_fname)
    # The gene universe is the union of the genes of all clusters.
    gene_universe = set(
        gene for clus_genes in cluster_dct.values() for gene in clus_genes)

    # The context manager closes the file even if an enrichment call raises.
    with open(out_fname, 'w') as out:
        # Loop through the clusters.
        for clus_id in cluster_dct:
            clus_genes = set(cluster_dct[clus_id])
            sorted_fisher_dct = get_sorted_fisher_dct(clus_genes)
            # Keep the first 5 (label term, p-value) pairs. The p-values are
            # written as-is; no log transform is applied.
            top_hits = sorted_fisher_dct[:5]
            top_label_terms = [label_term for label_term, _ in top_hits]
            top_p_values = [str(p_value) for _, p_value in top_hits]
            out.write(
                'Cluster %s\n%s\n%s\n' %
                (clus_id, '\t'.join(top_label_terms),
                 '\t'.join(top_p_values)))