def write_enrichment_files(in_fname, out_fname, go_dct):
    '''
    Loads the clusters stored in in_fname and hands them, together with
    go_dct and out_fname, to compute_go_enrichments.

    Intended for clusterings of networks built without GO terms (per the
    original author's note); the output is presumably written to out_fname
    by compute_go_enrichments.
    '''
    compute_go_enrichments(
        out_fname,
        file_operations.get_cluster_dictionary(in_fname),
        go_dct)
def write_summary(clus_fname, net_fname, eval_fname, enrich_fname, out_fname):
    '''
    Writes a tab-separated summary of a clustering run to out_fname.

    The output starts with a header line and one row of network-wide
    statistics, followed by a second header line and one row per cluster ID
    found in the density dictionary.

    clus_fname: cluster file, read into a dictionary of cluster ID -> nodes.
    net_fname: network file the network-wide statistics are read from.
    eval_fname: evaluation file, read into a dictionary of cluster ID ->
        (in-density, out-density, ratio).
    enrich_fname: enrichment file, read into a dictionary keyed by cluster ID.
    out_fname: path of the summary file to write.

    Cluster IDs from the density dictionary are looked up directly in the
    cluster and enrichment dictionaries.
    '''
    # Get cluster dictionary. Values are lists of nodes (genes and GO terms).
    clst_go_dct = file_operations.get_cluster_dictionary(clus_fname)

    # Get the number of gene-gene and gene-GO edges from the network.
    num_genes_net, num_gg_net, num_ggo_net = file_operations.get_network_stats(
        net_fname)

    # Get density dictionary. Values are in-density, out-density and ratio.
    density_dct = file_operations.get_cluster_densities(eval_fname)

    # Get the enrichment entry for each cluster.
    enrichment_dct = file_operations.get_enrichment_dct(enrich_fname)

    # Per-cluster edge counting is not performed here, so these two columns
    # are always written as 0.
    num_gg, num_ggo = 0, 0

    # The context manager closes the file even if a lookup or a format
    # operation below raises part-way through.
    with open(out_fname, 'w') as out:
        out.write('num_genes_in_net\tnum_g_g_net\tnum_g_go_net\n')
        out.write('%s\t%d\t%d\n' % (num_genes_net, num_gg_net, num_ggo_net))
        out.write('cluster_number\tin_dens\tout_dens\tratio\t')
        out.write('num_genes\tnum_go_terms_in\tnum_g_g_edges\tnum_g_go_edges\t')
        out.write('top_enrichment_p\tgo_terms_in\n')
        for cid in density_dct:
            clus = clst_go_dct[cid]
            # num_genes is the full cluster size, GO-term nodes included.
            num_genes = len(clus)
            # Any node without an Ensembl gene prefix (mouse or human) is
            # treated as a GO term.
            cluster_go_terms = [
                node for node in clus
                if 'ENSMUSG' not in node and 'ENSG' not in node]
            num_go = len(cluster_go_terms)
            in_dens, out_dens, ratio = density_dct[cid]

            out.write('%s\t%g\t%g\t%g\t' % (cid, in_dens, out_dens, ratio))
            out.write('%d\t%d\t%d\t%d\t' % (
                num_genes, num_go, num_gg, num_ggo))
            out.write('%s\t%s\n' % (
                enrichment_dct[cid], '\t'.join(cluster_go_terms)))
def write_summary_tables(net_type):
    '''
    Writes a per-cluster summary TSV for one clustering run.

    Builds the cluster, evaluation, DBGAP-enrichment and output filenames
    from net_type together with the `species` and `n_clusters` names defined
    outside this function, reads the three inputs, and writes one row per
    cluster ID (in ascending numeric order) to the output file.

    net_type: network type label used in every filename; for any value other
        than 'none' the cluster file with the '_clean' suffix is read.
    '''
    results_folder = './results/%s_results' % species

    # (results folder, number of clusters, network type) fills every
    # filename template below.
    format_str = (results_folder, n_clusters, net_type)
    # Simulated annealing cluster results filename.
    if net_type == 'none':
        clus_fname = '%s/%s_clusters_%s.txt' % format_str
    else:
        clus_fname = '%s/%s_clusters_%s_clean.txt' % format_str
    clus_dct = file_operations.get_cluster_dictionary(clus_fname)
    # Perl script evaluation results filename.
    eval_fname = '%s/%s_cluster_eval_%s.txt' % format_str
    density_dct = get_cluster_densities(eval_fname)

    # DBGAP enrichment results filename.
    dbgap_fname = '%s/%s_clusters_dbgap_terms_%s.txt' % format_str
    dbgap_dct = get_enrichment_dct(dbgap_fname)

    out_fname = '%s/%s_clusters_summary_%s.tsv' % format_str

    # The context manager closes the file even if a row fails to format.
    with open(out_fname, 'w') as out:
        out.write('Cluster ID\tIn-Density\tOut-Density\tIn/(In+Out)\t'
                  'Gene size\tTop DBGAP p-value\tTop DBGAP term\n')

        # Cluster IDs are strings holding integers; sort them numerically.
        for cid in sorted(density_dct, key=int):
            num_genes = len(clus_dct[cid])
            # Only the first two values are used. Slicing also tolerates
            # entries that carry a trailing ratio, as unpacked in
            # write_summary.
            in_dens, out_dens = density_dct[cid][:2]

            # Each enrichment entry is indexed as [0] = terms and
            # [1] = p-values; the first element of each is reported.
            best_dbgap_p = dbgap_dct[cid][1][0]
            best_dbgap_term = dbgap_dct[cid][0][0]

            out.write(
                '%s\t%g\t%g\t%g\t%d\t%s\t%s\n' %
                (cid, in_dens, out_dens, in_dens / (in_dens + out_dens),
                 num_genes, best_dbgap_p, best_dbgap_term))
def compute_label_enrichments(in_fname, out_fname):
    '''
    Writes the top five label enrichments of every cluster to out_fname.

    Reads the clusters from in_fname and rebinds the module-level
    gene_universe to the set of all genes appearing in any cluster. Then,
    for each cluster, writes three lines: 'Cluster <id>', the tab-separated
    top terms, and their tab-separated p-values.

    NOTE(review): get_sorted_fisher_dct is assumed to return a sequence of
    (term, p_value) pairs ordered best-first, and presumably reads
    gene_universe as its background set -- confirm against its definition.
    '''
    global gene_universe
    cluster_dct = file_operations.get_cluster_dictionary(in_fname)
    # Rebound before any get_sorted_fisher_dct call, as in the original
    # statement order.
    gene_universe = {
        gene for genes in cluster_dct.values() for gene in genes}

    # The context manager closes the file even if an enrichment computation
    # raises part-way through.
    with open(out_fname, 'w') as out:
        # Loop through the clusters.
        for clus_id in cluster_dct:
            clus_genes = set(cluster_dct[clus_id])
            sorted_fisher_dct = get_sorted_fisher_dct(clus_genes)

            # Keep the first five (term, p-value) pairs; p-values are
            # written as-is, with no log transform.
            top_pairs = sorted_fisher_dct[:5]
            top_label_terms = [label_term for (label_term, _) in top_pairs]
            top_p_values = [str(p_value) for (_, p_value) in top_pairs]
            out.write(
                'Cluster %s\n%s\n%s\n' %
                (clus_id, '\t'.join(top_label_terms),
                 '\t'.join(top_p_values)))