def main():
    """Write the thresholded gene co-expression network for one data set.

    Command line: ``python <script> mouse`` or ``python <script> <index>``.
    A numeric argument is used as an index into
    ``file_operations.get_tcga_disease_list()`` to obtain the data type.

    For every unordered pair of high-standard-deviation genes whose
    correlation coefficient (as returned by ``corrcoef``) is at least the
    data set's threshold and not exactly 1, one line
    ``gene_a<TAB>gene_b<TAB>coefficient`` is written to
    ``./data/<data_type>_data/high_std_network.txt``.
    """
    if len(sys.argv) != 2:
        print("Usage:python %s mouse/tcga_cancer_index" % sys.argv[0])
        # sys.exit() instead of the site-provided exit(), which is not
        # guaranteed to exist (e.g. when running with ``python -S``).
        sys.exit()
    data_type = sys.argv[1]
    assert data_type == "mouse" or data_type.isdigit()
    if data_type.isdigit():
        data_type = file_operations.get_tcga_disease_list()[int(data_type)]

    # TCGA coefficients are worse, so they get a looser cutoff than mouse.
    pcc_threshold = 0.9 if data_type == "mouse" else 0.5

    # Read in the tsv file.
    gene_exp_dct = file_operations.get_gene_expression_dct(data_type)
    high_std_genes = file_operations.get_high_std_genes(data_type)

    gene_exp_matrix = create_gene_exp_matrix(gene_exp_dct, high_std_genes)

    # p is only needed by the p-value filter, which is currently disabled
    # (it used to skip pairs with p[row_idx][col_idx] > P_VALUE_THRESOLD).
    r, p = corrcoef(gene_exp_matrix)

    # The context manager closes the file even if writing fails part-way.
    with open("./data/%s_data/high_std_network.txt" % data_type, "w") as out:
        for row_idx, row in enumerate(r):
            for col_idx, pcc in enumerate(row):
                # Keep only the upper triangle so each pair appears once,
                # and drop weak or perfectly correlated pairs.
                # NOTE(review): a NaN coefficient passes both comparisons
                # and would be written out; confirm corrcoef never yields NaN.
                if col_idx <= row_idx or pcc < pcc_threshold or pcc == 1:
                    continue
                # Row/column i of r corresponds to high_std_genes[i].
                gene_a = high_std_genes[row_idx]
                gene_b = high_std_genes[col_idx]
                # abs() is a no-op here: anything below the (positive)
                # threshold has already been skipped.
                out.write("%s\t%s\t%f\n" % (gene_a, gene_b, abs(pcc)))
def get_go_domains(data_type):
    '''
    Returns two dictionaries, one for the biological process (BP) GO domain
    and one for the molecular function (MF) GO domain, built from the PANTHER
    gene list of the given data type ('mouse', otherwise the TCGA list is
    used). Only genes returned by file_operations.get_high_std_genes() are
    included.

    NOTE(review): the dictionaries are filled by
    add_to_dictionary(gene, term_list, dct); which of gene / GO term ends up
    as the key is decided there, so confirm the orientation against that
    helper.
    '''
    # Membership is tested once per Ensembl ID, so build a set for O(1)
    # lookups instead of scanning the gene sequence every time.
    high_std_genes = set(file_operations.get_high_std_genes(data_type))

    # bp = biological process, mf = molecular function.
    bp_go_gene_dct = {}
    mf_go_gene_dct = {}

    if data_type == 'mouse':
        panther_fname = './data/mouse_data/pantherGeneList_mouse.txt'
    else:
        panther_fname = './data/tcga_data/pantherGeneList_tcga.txt'

    # The context manager closes the file even if a malformed line makes the
    # unpacking or the assertion below fail.
    with open(panther_fname, 'r') as f:
        for line in f:
            # Each line must have exactly eight tab-separated columns; any
            # other count raises ValueError here.
            # NOTE(review): the last column still carries the line's trailing
            # newline; presumably process_go_term_list copes with it.
            (gene_id, ensembl_id_list, ortholog, family, protein_class,
                species, bp_term_list, mf_term_list) = line.split('\t')

            # Process the GO terms.
            bp_term_list = process_go_term_list(bp_term_list)
            mf_term_list = process_go_term_list(mf_term_list)

            # Associate each GO term list with the corresponding gene.
            for gene in ensembl_id_list.split(','):
                assert ('ENSMUSG' in gene or 'ENSG' in gene)
                if gene not in high_std_genes:
                    continue
                add_to_dictionary(gene, bp_term_list, bp_go_gene_dct)
                add_to_dictionary(gene, mf_term_list, mf_go_gene_dct)

    return bp_go_gene_dct, mf_go_gene_dct
def main():
    """Write GO enrichment files for one clustering run.

    Command line: ``python <script> data_type objective_function run_num``.
    ``data_type`` is 'mouse' or a numeric index into
    ``file_operations.get_tcga_disease_list()``; ``objective_function`` is one
    of 'oclode', 'schaeffer', 'wlogv' or 'prosnet'; ``run_num`` is numeric.

    The parsed arguments and the high-standard-deviation gene list are
    published as the module globals ``data_type``, ``objective_function``,
    ``run_num`` and ``gene_universe``. For domain index 0,
    ``write_enrichment_files`` is called once for the clusters built without
    GO and once for the clusters built with GO, after the GO terms listed by
    ``file_operations.read_go_overlap`` have been removed from the dictionary.
    """
    global data_type, objective_function, run_num, gene_universe

    if len(sys.argv) != 4:
        print('Usage:python %s data_type objective_function run_num' % sys.argv[0])
        exit()

    data_type = sys.argv[1]
    assert data_type == 'mouse' or data_type.isdigit()
    objective_function = sys.argv[2]
    assert objective_function in ('oclode', 'schaeffer', 'wlogv', 'prosnet')
    run_num = sys.argv[3]
    assert run_num.isdigit()

    # A numeric data type selects a TCGA disease by position.
    if data_type.isdigit():
        tcga_diseases = file_operations.get_tcga_disease_list()
        data_type = tcga_diseases[int(data_type)]

    go_dct_list = read_go_dictionaries()
    gene_universe = file_operations.get_high_std_genes(data_type)

    # Pairs of overlapping MF and BP terms.
    overlap_list = file_operations.read_go_overlap(data_type)

    # Both results paths share the same leading format arguments.
    run_key = (data_type, objective_function)

    # Only evaluate on BP (index 0) for now.
    for domain_index in (0,):
        go_dct = go_dct_list[domain_index]

        # Drop this domain's overlapping terms; a term missing from the
        # dictionary raises KeyError.
        for shared_term in set(pair[domain_index] for pair in overlap_list):
            go_dct.pop(shared_term)

        # Clusters computed without the GO network.
        plain_clusters = (
            './results/%s_results/%s/clusters_no_go/clusters_no_go_%s.txt'
            % (data_type, objective_function, run_num))
        plain_terms = (
            './results/%s_results/%s/cluster_enrichment_terms_no_go/cluster_' % run_key
            + 'enrichment_terms_no_go_%s_%d.txt' % (run_num, domain_index))
        write_enrichment_files(plain_clusters, plain_terms, go_dct)

        # Clusters computed with the GO network.
        go_clusters = (
            './results/%s_results/%s/clusters_go/clusters_go_clean_%s_%d.txt'
            % (data_type, objective_function, run_num, domain_index))
        go_terms = (
            './results/%s_results/%s/cluster_enrichment_terms_go/cluster_' % run_key
            + 'enrichment_terms_go_%s_%d.txt' % (run_num, domain_index))
        write_enrichment_files(go_clusters, go_terms, go_dct)