def main():
    """Dump the bp and mf GO dictionaries of one data set to JSON files.

    Command line: ``python <script> mouse/tcga_cancers``. The single argument
    is either the literal ``mouse`` or a numeric index into the list returned
    by ``file_operations.get_tcga_disease_list()``.

    Writes ``./data/<data_type>_data/bp_dct.json`` and
    ``./data/<data_type>_data/mf_dct.json``.

    Raises:
        AssertionError: if the argument is neither ``mouse`` nor all digits.
    """
    if len(sys.argv) != 2:
        print('Usage:python %s mouse/tcga_cancers' % sys.argv[0])
        # sys.exit() instead of the site-provided exit() helper, which is
        # meant for the interactive shell. Exit status is unchanged.
        sys.exit()
    data_type = sys.argv[1]
    assert data_type == 'mouse' or data_type.isdigit()
    if data_type.isdigit():
        data_type = file_operations.get_tcga_disease_list()[int(data_type)]

    bp_go_gene_dct, mf_go_gene_dct = get_go_domains(data_type)

    for prefix, go_gene_dct in (('bp', bp_go_gene_dct),
                                ('mf', mf_go_gene_dct)):
        # Delete the empty key, if present.
        go_gene_dct.pop('', None)
        # The with-statement closes the file; no explicit close() is needed.
        with open('./data/%s_data/%s_dct.json' % (data_type, prefix),
                  'w') as fp:
            json.dump(go_gene_dct, fp)
def main():
    """Compute GO enrichments of the clusters for the bp and mf domains.

    Command line: ``<script> data_type genes_only/pca/mean/median``.
    ``data_type`` is ``mouse`` or a numeric index into
    ``file_operations.get_tcga_disease_list()``.
    """
    if len(sys.argv) != 3:
        print('Usage: %s data_type genes_only/pca/mean/median' % sys.argv[0])
        exit()
    data_type, network_type = sys.argv[1], sys.argv[2]
    assert data_type == 'mouse' or data_type.isdigit()
    assert network_type in ['genes_only', 'pca', 'mean', 'median']

    # A numeric data type is an index into the TCGA disease list.
    if data_type.isdigit():
        data_type = file_operations.get_tcga_disease_list()[int(data_type)]

    for domain_index, go_domain in enumerate(('bp', 'mf')):
        go_dct = get_go_gene_dct(go_domain, data_type)

        # Each overlap tuple is indexed by domain position (0 for bp, 1 for
        # mf); drop this domain's overlapping terms from the dictionary.
        overlap_list = read_go_overlap(data_type)
        for overlapping_go in set(tup[domain_index] for tup in overlap_list):
            del go_dct[overlapping_go]

        # The genes_only network has a single clustering keyed by its own
        # name; every other network type has one clustering per GO domain.
        if network_type == 'genes_only':
            cluster_key = network_type
        else:
            cluster_key = go_domain
        cluster_wgcna_dct = get_cluster_dictionary(data_type, network_type,
            cluster_key)

        compute_go_enrichments(data_type, network_type, go_dct,
            cluster_wgcna_dct, go_domain)
Ejemplo n.º 3
0
def main():
    """Write the thresholded gene-gene correlation network of the high-std genes.

    Command line: ``python <script> mouse/tcga_cancer_index``. The argument is
    either ``mouse`` or a numeric index into
    ``file_operations.get_tcga_disease_list()``.

    Each gene pair is visited once (column index greater than row index). A
    pair is written to ``./data/<data_type>_data/high_std_network.txt`` as
    ``gene_a<TAB>gene_b<TAB>coefficient`` when its coefficient is at least the
    threshold (0.9 for mouse, 0.5 otherwise) and not exactly 1.

    Raises:
        AssertionError: if the argument is neither ``mouse`` nor all digits.
    """
    if len(sys.argv) != 2:
        print("Usage:python %s mouse/tcga_cancer_index" % sys.argv[0])
        # sys.exit() instead of the interactive-shell exit() helper; the exit
        # status is unchanged.
        sys.exit()
    data_type = sys.argv[1]
    assert data_type == "mouse" or data_type.isdigit()
    if data_type.isdigit():
        data_type = file_operations.get_tcga_disease_list()[int(data_type)]

    if data_type == "mouse":
        pcc_threshold = 0.9
    else:
        # TCGA coefficients are worse.
        pcc_threshold = 0.5

    # Read in the expression data and the genes to keep.
    gene_exp_dct = file_operations.get_gene_expression_dct(data_type)
    high_std_genes = file_operations.get_high_std_genes(data_type)

    gene_exp_matrix = create_gene_exp_matrix(gene_exp_dct, high_std_genes)

    # The second value returned by corrcoef is unused. An earlier, disabled
    # filter compared it against a P_VALUE_THRESOLD constant, so it is
    # presumably the matrix of p-values -- confirm against corrcoef.
    r, _ = corrcoef(gene_exp_matrix)

    # The context manager closes the file even if a write or lookup fails.
    with open("./data/%s_data/high_std_network.txt" % data_type, "w") as out:
        for row_idx, row in enumerate(r):
            for col_idx, pcc in enumerate(row):
                # Keep only the strict upper triangle, and skip weak or
                # perfect correlations.
                if col_idx <= row_idx or pcc < pcc_threshold or pcc == 1:
                    continue
                # Rows/columns of r are looked up by position in
                # high_std_genes.
                gene_a = high_std_genes[row_idx]
                gene_b = high_std_genes[col_idx]
                # abs() is kept from the original; values reaching this point
                # are already >= the positive threshold.
                out.write("%s\t%s\t%f\n" % (gene_a, gene_b, abs(pcc)))
def main():
    """Convert module-membership tables into cluster files.

    Command line: ``<script> data_type genes_only/pca/mean/median``.
    ``data_type`` is ``mouse`` or a numeric index into
    ``file_operations.get_tcga_disease_list()``.

    For each domain (only ``genes_only`` for that method, otherwise ``bp``,
    ``cc`` and ``mf``) reads
    ``./data/<data_type>_module_membership_<domain>.txt`` and writes
    ``./results/<data_type>_results/<go_method>/clusters_<domain>.txt``: a
    dummy header line followed by one ``Species 0<TAB>Gene <id><TAB>Cluster
    <module>`` line per gene that is not in module ``0``.

    Raises:
        AssertionError: if an argument fails validation.
        ValueError: if a data line does not have exactly four
            whitespace-separated columns.
    """
    if len(sys.argv) != 3:
        print('Usage: %s data_type genes_only/pca/mean/median' % sys.argv[0])
        # sys.exit() instead of the interactive-shell exit() helper; the exit
        # status is unchanged.
        sys.exit()
    data_type = sys.argv[1]
    assert data_type == 'mouse' or data_type.isdigit()
    go_method = sys.argv[2]
    assert go_method in ['genes_only', 'pca', 'mean', 'median']

    if data_type.isdigit():
        data_type = file_operations.get_tcga_disease_list()[int(data_type)]

    if go_method == 'genes_only':
        domain_list = [go_method]
    else:
        domain_list = ['bp', 'cc', 'mf']

    # os.makedirs creates the intermediate ./results/<data_type>_results/
    # folder as well, so a single call covers both levels.
    results_folder = './results/%s_results/%s/' % (data_type, go_method)
    if not os.path.exists(results_folder):
        os.makedirs(results_folder)

    for go_domain in domain_list:
        in_fname = './data/%s_module_membership_%s.txt' % (data_type,
            go_domain)
        out_fname = '%sclusters_%s.txt' % (results_folder, go_domain)
        # Context managers close both files even if a line fails to parse.
        with open(in_fname, 'r') as f, open(out_fname, 'w') as out:
            # Write dummy header line.
            out.write('dummy_header\n')

            for i, line in enumerate(f):
                # Skip header.
                if i == 0:
                    continue

                # The color and membership columns are ignored, but unpacking
                # all four still validates the column count.
                node, module, _color, _membership = line.split()

                # Skip garbage module.
                if module == '0':
                    continue

                # Remove quotation marks around the gene ID.
                node = node.strip('"')

                # Don't add in GO terms; keep only ENSMUSG/ENSG identifiers.
                if 'GO:' in node or ('ENSMUSG' not in node
                                     and 'ENSG' not in node):
                    continue

                # Write out the line.
                out.write('Species 0\tGene %s\tCluster %s\n' % (node, module))
def main():
    """Write the per-cluster information table for the WGCNA clusters.

    Command line:
    ``<script> data_type genes_only/pca/mean/median network_to_compare``.
    ``data_type`` is ``mouse`` or a numeric index into
    ``file_operations.get_tcga_disease_list()``; ``network_to_compare`` must
    be all digits.

    Sets the module-level global ``data_type``; the helpers called here do not
    receive it as an argument, so they presumably read that global.

    Output goes to
    ``./results/<data_type>_results/<go_method>/clus_info_<go_method>_bp.txt``:
    a network summary (two lines) followed by a header and one line per
    cluster ``1..N`` with densities, sizes and the top enrichment value.

    Raises:
        AssertionError: if an argument fails validation.
        KeyError: if a cluster id in ``1..N`` is missing from the cluster,
            density or enrichment dictionary.
    """
    if len(sys.argv) != 4:
        print("Usage: %s data_type genes_only/pca/mean/median network_to_compare" % sys.argv[0])
        # sys.exit() instead of the interactive-shell exit() helper; the exit
        # status is unchanged.
        sys.exit()
    global data_type
    data_type = sys.argv[1]
    assert data_type == "mouse" or data_type.isdigit()
    go_method = sys.argv[2]
    assert go_method in ["genes_only", "pca", "mean", "median"]
    comparing_network = sys.argv[3]
    assert comparing_network.isdigit()

    if data_type.isdigit():
        data_type = file_operations.get_tcga_disease_list()[int(data_type)]

    num_genes_net, num_gg_net = get_network_statistics(int(comparing_network))

    domain_list = ["bp"]

    for go_domain in domain_list:
        # genes_only clusterings are keyed by the method name itself rather
        # than by a GO domain.
        cluster_key = go_method if go_method == "genes_only" else go_domain
        cluster_wgcna_dct = get_cluster_dictionary(go_method, cluster_key)
        density_dct = get_density_dct(go_method, cluster_key)

        enrichment_dct = get_enrichment_dct(go_method, go_domain)

        out_fname = "./results/%s_results/%s/clus_info_%s_%s.txt" % (
            data_type, go_method, go_method, go_domain)
        # The context manager closes the file even if a cluster lookup fails.
        with open(out_fname, "w") as out:
            out.write("num_genes_in_net\tnum_g_g_net\tnum_g_go_net\n")
            # Automatically write 0 for the last column, since there are no
            # gene-GO edges for WGCNA.
            out.write("%s\t%d\t0\n" % (num_genes_net, num_gg_net))
            out.write("cluster_number\tin_dens\tout_dens\tweighted_in_out_ratio\t")
            out.write("num_genes\tnum_go_terms_in\tnum_g_g_edges\tnum_g_go_edges\t")
            out.write("top_enrichment_p\n")

            # Cluster ids are the strings "1" .. "N".
            for cluster_num in range(1, len(cluster_wgcna_dct) + 1):
                cid = str(cluster_num)
                num_genes = len(cluster_wgcna_dct[cid])

                in_dens, out_dens, weighted_ratio, num_gg_edges = density_dct[cid]
                out.write("%s\t%g\t%g\t%g\t" % (cid, in_dens, out_dens, weighted_ratio))
                # GO-term and gene-GO edge counts are always 0 here.
                out.write("%d\t0\t%d\t0\t" % (num_genes, num_gg_edges))
                out.write("%s\n" % enrichment_dct[cid])
Ejemplo n.º 6
0
def main():
    """Write enrichment files for the no-GO and GO clusterings of one run.

    Command line: ``python <script> data_type objective_function run_num``.
    Sets the module-level globals ``data_type``, ``objective_function``,
    ``run_num`` and ``gene_universe``.
    """
    global data_type, objective_function, run_num, gene_universe

    if len(sys.argv) != 4:
        print('Usage:python %s data_type objective_function run_num' % sys.argv[0])
        exit()

    data_type = sys.argv[1]
    assert data_type == 'mouse' or data_type.isdigit()
    objective_function = sys.argv[2]
    assert objective_function in ['oclode', 'schaeffer', 'wlogv', 'prosnet']
    run_num = sys.argv[3]
    assert run_num.isdigit()

    # A numeric data type is an index into the TCGA disease list.
    if data_type.isdigit():
        data_type = file_operations.get_tcga_disease_list()[int(data_type)]

    go_dct_list = read_go_dictionaries()
    gene_universe = file_operations.get_high_std_genes(data_type)

    # Tuples of overlapping terms, indexed by domain position.
    overlap_list = file_operations.read_go_overlap(data_type)

    base = './results/%s_results/%s' % (data_type, objective_function)

    # Only the first domain (index 0) is evaluated for now.
    for domain_index in (0,):
        go_dct = go_dct_list[domain_index]

        # Drop this domain's overlapping terms.
        for overlapping_go in set(tup[domain_index] for tup in overlap_list):
            del go_dct[overlapping_go]

        # Clustering produced without GO terms in the network.
        no_go_cluster_fname = '%s/clusters_no_go/clusters_no_go_%s.txt' % (
            base, run_num)
        no_go_fname = (
            '%s/cluster_enrichment_terms_no_go/'
            'cluster_enrichment_terms_no_go_%s_%d.txt' % (
                base, run_num, domain_index))
        write_enrichment_files(no_go_cluster_fname, no_go_fname, go_dct)

        # Clustering produced with GO terms in the network.
        cluster_fname = '%s/clusters_go/clusters_go_clean_%s_%d.txt' % (
            base, run_num, domain_index)
        go_fname = (
            '%s/cluster_enrichment_terms_go/'
            'cluster_enrichment_terms_go_%s_%d.txt' % (
                base, run_num, domain_index))
        write_enrichment_files(cluster_fname, go_fname, go_dct)
Ejemplo n.º 7
0
def main():
    """Run evaluate_clustering.pl on the GO and no-GO clusterings of one run.

    Command line: ``python <script> data_type objective_function run_num``.
    Each perl invocation has its standard output redirected by the shell into
    the matching ``cluster_eval_*`` results file.
    """
    if len(sys.argv) != 4:
        print('Usage:python %s data_type objective_function run_num' % sys.argv[0])
        exit()
    data_type, objective_function, run_num = sys.argv[1:4]
    assert data_type == 'mouse' or data_type.isdigit()
    assert objective_function in ['oclode', 'schaeffer', 'wlogv', 'prosnet']
    assert run_num.isdigit()

    # A numeric data type is an index into the TCGA disease list.
    if data_type.isdigit():
        data_type = file_operations.get_tcga_disease_list()[int(data_type)]

    evaluator = 'perl ./evaluate_clustering.pl'

    # GO networks: only domain index 0 is evaluated.
    for go_domain_num in (0,):
        # The evaluation reads a cleaned copy of the cluster file.
        file_operations.create_clean_go_file(data_type, objective_function,
            run_num, go_domain_num)
        go_parts = [
            evaluator,
            '"./results/%s_results/%s/clusters_go/clusters_go_clean_%s_%d.txt"' % (
                data_type, objective_function, run_num, go_domain_num),
            '"./data/%s_data/networks_go/real_network_go_%s_%d.txt"' % (
                data_type, run_num, go_domain_num),
            '> "./results/%s_results/%s/cluster_eval_go/cluster_eval_go_%s_%d.txt"' % (
                data_type, objective_function, run_num, go_domain_num),
        ]
        subprocess.call(' '.join(go_parts), shell=True)

    # Network without GO terms.
    no_go_parts = [
        evaluator,
        '"./results/%s_results/%s/clusters_no_go/clusters_no_go_%s.txt"' % (
            data_type, objective_function, run_num),
        '"./data/%s_data/networks_no_go/real_network_no_go_%s.txt"' % (
            data_type, run_num),
        '> "./results/%s_results/%s/cluster_eval_no_go/cluster_eval_no_go_%s.txt"' % (
            data_type, objective_function, run_num),
    ]
    subprocess.call(' '.join(no_go_parts), shell=True)
Ejemplo n.º 8
0
def main():
    """Run the clustering binary of one objective function on one network.

    Command line:
    ``python <script> data_type objective_function run_num go/no_go [go_num]``.
    ``go_num`` is required when the network is ``go`` and ignored otherwise.
    ``data_type`` is ``mouse`` or a numeric index into
    ``file_operations.get_tcga_disease_list()``.

    Creates the results folder tree for the chosen objective function if any
    part of it is missing, builds the shell command, prints it and executes
    it. Standard output of the binary is redirected to
    ``./results/<data_type>_results/<objective_function>/clusters_<network>/``
    and standard error to ``./log``.

    Raises:
        AssertionError: if an argument fails validation.
    """
    usage = ('Usage:python %s data_type objective_function run_num go/no_go '
             'go_num <if go>' % sys.argv[0])
    if len(sys.argv) not in [5, 6]:
        print(usage)
        # sys.exit() instead of the interactive-shell exit() helper; the exit
        # status is unchanged.
        sys.exit()

    # Sort out command line arguments.
    data_type, objective_function, run_num, network = sys.argv[1:5]
    assert data_type == 'mouse' or data_type.isdigit()
    assert objective_function in ['oclode', 'schaeffer', 'wlogv']
    assert run_num.isdigit()
    assert network in ['go', 'no_go']

    # A GO network needs the extra go_num argument. Check it up front, before
    # any folder is created, instead of failing later with an IndexError.
    if network == 'go':
        if len(sys.argv) != 6:
            print(usage)
            sys.exit()
        go_suffix = '_%s' % sys.argv[5]
    else:
        go_suffix = ''

    if data_type.isdigit():
        data_type = file_operations.get_tcga_disease_list()[int(data_type)]

    # Construct the results folders if it's the first time. The folder is
    # named after the selected objective function: the command below
    # redirects its output into that folder, so a fixed 'wlogv/' name left
    # the oclode and schaeffer targets uncreated.
    results_folder = './results/%s_results/' % data_type
    obj_func_folder = '%s%s/' % (results_folder, objective_function)
    needed_folders = [obj_func_folder + name for name in (
        'clus_info_no_go/',
        'clus_info_go/',
        'cluster_enrichment_terms_go/',
        'cluster_enrichment_terms_no_go/',
        'cluster_eval_no_go/',
        'clusters_go/',
        'cluster_eval_go/',
        'clusters_no_go/',
    )]
    needed_folders.append(results_folder + 'comparison_plots/')
    # Check every folder on its own so a partially created tree is repaired.
    # os.makedirs also creates the intermediate parent folders.
    for folder in needed_folders:
        if not os.path.exists(folder):
            os.makedirs(folder)

    config_dct = file_operations.read_config_file(data_type)[run_num]
    temp = config_dct['temp']
    num_clusters = config_dct['num_clusters']

    # Get the binary associated with the desired objective function.
    if objective_function == 'oclode':
        binary = './OCLODE/makedir/OCLODE_efficient'
    elif objective_function == 'schaeffer':
        binary = './SchaefferScore/makedir/SchaefferImplementNotWeighted'
    else:
        binary = './WlogV/makedir/WlogVImplement'

    # Build the command. The shell is needed for the redirections; go_num is
    # interpolated as given on the command line.
    command = '%s %s 1 0 "./data/%s_data/orth.txt" 1 ' % (binary, num_clusters,
        data_type)
    command += '"./data/%s_data/networks_%s/network_%s_%s' % (data_type,
        network, network, run_num)
    command += go_suffix
    command += '.txt" -t %s 2>log > "./results/%s_results/%s/clusters_%s/clusters_%s_%s' % (
        temp, data_type, objective_function, network, network, run_num)
    command += go_suffix
    command += '.txt"'

    # Execute the command in the shell.
    print(command)
    subprocess.call(command, shell=True)
Ejemplo n.º 9
0
# Select the Agg backend before pylab is imported below (presumably so plots
# can be rendered without a display -- confirm).
matplotlib.use('Agg')
# NOTE(review): THRESHOLD_MULT is not referenced in the visible part of this
# script.
THRESHOLD_MULT = 0.8
import pylab

# Script entry point. Expects two arguments: data_type ("mouse" or a numeric
# index into file_operations.get_tcga_disease_list()) and an all-digit run_num.
# NOTE(review): the script appears truncated -- the per-mode loop body only
# builds file names and an empty point list, and nothing is plotted here.
if __name__ == '__main__':
    if len(sys.argv) != 3:
        print 'Usage: %s data_type run_num' % sys.argv[0]
        exit()
    data_type = sys.argv[1]
    assert data_type == 'mouse' or data_type.isdigit()
    run_num = sys.argv[2]
    assert run_num.isdigit()
    go_domain_list = ['bp', 'mf']

    # A numeric data type is replaced by the matching TCGA disease name.
    if data_type.isdigit():
        data_type = file_operations.get_tcga_disease_list()[int(data_type)]

    # Only the first GO domain ('bp') is processed.
    for go_domain_index in [0]:
        go_domain = go_domain_list[go_domain_index]
        # NOTE(review): seven colors are listed for five modes; the earlier
        # seven-mode list is kept commented out below.
        colors = ['blue', 'purple', 'red', 'black', 'orange', 'green', 'yellow']
        # for mode_index, mode in enumerate(['wgcna', 'wlogv_go', 'wlogv_no_go',
        #     'oclode_go', 'oclode_no_go', 'schaeffer_go', 'schaeffer_no_go']):
        for mode_index, mode in enumerate(['wgcna', 'wlogv_go', 'wlogv_no_go',
            'prosnet_go', 'prosnet_no_go']):
            # Reset for every mode; not filled in the visible code.
            pts = []

            # Relative names of the cluster-info files for this run and domain.
            no_go_fname = 'clus_info_no_go/clus_info_no_go_%s_%d.txt' % (
                run_num, go_domain_index)
            go_fname = 'clus_info_go/clus_info_go_%s_%d.txt' % (
                run_num, go_domain_index)
Ejemplo n.º 10
0
def main():
    """Write the expression rows of the high-std genes, one file per domain.

    Command line: ``<script> data_type genes_only/pca/mean/median``.
    ``data_type`` is ``mouse`` or a numeric index into
    ``file_operations.get_tcga_disease_list()``.

    For each domain (only ``genes_only`` for that method, otherwise ``bp``,
    ``cc`` and ``mf``) the expression file of the data set is copied to a
    per-domain output file, keeping the header line and only the rows whose
    gene is returned by ``get_high_std_genes``.

    NOTE: apart from selecting the domain list, the method argument has no
    effect. An earlier version also appended one "super gene" expression
    vector per GO term (mean, median or first principal component of the
    annotated genes' expression); that code was disabled and is not
    reproduced here.

    Raises:
        AssertionError: if an argument fails validation, or if a data row's
            first column contains neither ``ENSMUSG`` nor ``ENSG``.
    """
    if len(sys.argv) != 3:
        print('Usage: %s data_type genes_only/pca/mean/median' % sys.argv[0])
        # sys.exit() instead of the interactive-shell exit() helper; the exit
        # status is unchanged.
        sys.exit()
    data_type = sys.argv[1]
    assert data_type == 'mouse' or data_type.isdigit()
    go_method = sys.argv[2]
    assert go_method in ['genes_only', 'pca', 'mean', 'median']

    if data_type.isdigit():
        data_type = file_operations.get_tcga_disease_list()[int(data_type)]

    # A set makes the per-line membership test constant-time (assumes the
    # helper returns an iterable of gene id strings).
    high_std_genes = set(get_high_std_genes(data_type))

    if go_method == 'genes_only':
        go_domain_list = [go_method]
    else:
        go_domain_list = ['bp', 'cc', 'mf']

    for go_domain in go_domain_list:
        # NOTE(review): inputs are read from ../data while outputs go to
        # ./data -- confirm this is intended.
        if data_type == 'mouse':
            in_fname = '../data/mouse_data/mm_mrsb_log2_expression.tsv'
            out_fname = './data/mm_mrsb_log2_expression_%s.tsv' % go_domain
        else:
            in_fname = '../data/%s_data/expr.txt' % data_type
            out_fname = './data/%s_expr_%s.txt' % (data_type, go_domain)

        # Context managers close both files exactly once, even on failure.
        with open(in_fname, 'r') as f, open(out_fname, 'w') as out:
            for i, line in enumerate(f):
                # Copy the header line through unchanged.
                if i == 0:
                    out.write(line)
                    continue
                gene = line.split()[0]
                assert 'ENSMUSG' in gene or 'ENSG' in gene

                # Skip genes with low standard deviation.
                if gene not in high_std_genes:
                    continue
                out.write(line)