def main():
    """Write the high-standard-deviation gene co-expression network.

    Command line: ``python <script> mouse/tcga_cancer_index``. The single
    argument is either the literal ``mouse`` or a string of digits that is
    used as an index into ``file_operations.get_tcga_disease_list()``.

    For every gene pair (a, b) in the strict upper triangle of the
    correlation matrix whose coefficient is at least the threshold (0.9 for
    mouse, 0.5 otherwise) and not exactly 1, one line
    ``gene_a<TAB>gene_b<TAB>coefficient`` is written to
    ``./data/<data_type>_data/high_std_network.txt``.

    Raises:
        ValueError: if the argument is neither ``mouse`` nor all digits.
    """
    if len(sys.argv) != 2:
        print("Usage:python %s mouse/tcga_cancer_index" % sys.argv[0])
        sys.exit()

    data_type = sys.argv[1]
    # Explicit check instead of ``assert`` so the validation survives
    # ``python -O``.
    if not (data_type == "mouse" or data_type.isdigit()):
        raise ValueError(
            "data_type must be 'mouse' or a TCGA cancer index, got %r"
            % data_type)
    if data_type.isdigit():
        data_type = file_operations.get_tcga_disease_list()[int(data_type)]

    # TCGA coefficients are worse, so they get a looser cutoff.
    pcc_threshold = 0.9 if data_type == "mouse" else 0.5

    # Read in the tsv file.
    gene_exp_dct = file_operations.get_gene_expression_dct(data_type)
    high_std_genes = file_operations.get_high_std_genes(data_type)
    gene_exp_matrix = create_gene_exp_matrix(gene_exp_dct, high_std_genes)
    # corrcoef is unpacked into two values; only the first (coefficients) is
    # used. NOTE(review): the second looks like a p-value matrix, and
    # filtering on it is currently disabled -- confirm before re-enabling.
    r, p = corrcoef(gene_exp_matrix)

    # ``with`` guarantees the file is flushed and closed even if a write
    # raises part-way through.
    with open("./data/%s_data/high_std_network.txt" % data_type, "w") as out:
        for row_idx, row in enumerate(r):
            for col_idx, pcc in enumerate(row):
                # Keep only the strict upper triangle (each pair once, no
                # self-pairs), drop weak correlations and exact 1s.
                if col_idx <= row_idx or pcc < pcc_threshold or pcc == 1:
                    continue
                # Write out gene information. Row/column i of the matrix
                # is paired with high_std_genes[i].
                gene_a = high_std_genes[row_idx]
                gene_b = high_std_genes[col_idx]
                out.write("%s\t%s\t%f\n" % (gene_a, gene_b, abs(pcc)))
def get_go_domains(data_type):
    '''
    Returns two dictionaries, one for each of the biological process (BP)
    and molecular function (MF) GO domains, built from the PANTHER gene list
    file for the given data type.

    data_type: 'mouse' selects the mouse PANTHER file; any other value
        selects the TCGA PANTHER file.

    Each line of the PANTHER file must contain exactly eight tab-separated
    fields (otherwise the unpacking raises ValueError). Only genes present
    in file_operations.get_high_std_genes(data_type) are recorded.

    The dictionaries are filled by add_to_dictionary(gene, term_list, dct).
    NOTE(review): the earlier documentation said keys are genes and values
    are GO terms -- confirm against add_to_dictionary.

    Returns the tuple (bp_go_gene_dct, mf_go_gene_dct).
    '''
    # A set makes the per-gene membership test below O(1) instead of a
    # linear scan for every Ensembl ID in the file.
    high_std_genes = set(file_operations.get_high_std_genes(data_type))

    # bp = biological process, mf = molecular function.
    bp_go_gene_dct = {}
    mf_go_gene_dct = {}

    if data_type == 'mouse':
        panther_fname = './data/mouse_data/pantherGeneList_mouse.txt'
    else:
        panther_fname = './data/tcga_data/pantherGeneList_tcga.txt'

    # ``with`` closes the file even if a malformed line raises.
    with open(panther_fname, 'r') as f:
        for line in f:
            # The 8-way unpack doubles as a column-count check.
            (gene_id, ensembl_id_list, ortholog, family, protein_class,
                species, bp_term_list, mf_term_list) = line.split('\t')

            # Process the GO terms.
            bp_term_list = process_go_term_list(bp_term_list)
            mf_term_list = process_go_term_list(mf_term_list)

            # Associate each GO term list with the corresponding gene.
            for gene in ensembl_id_list.split(','):
                assert ('ENSMUSG' in gene or 'ENSG' in gene)
                if gene not in high_std_genes:
                    continue
                add_to_dictionary(gene, bp_term_list, bp_go_gene_dct)
                add_to_dictionary(gene, mf_term_list, mf_go_gene_dct)

    return bp_go_gene_dct, mf_go_gene_dct
def main():
    '''
    Command-line entry point: writes GO enrichment term files for the
    clusters produced with and without the GO network.

    Command line: python <script> data_type objective_function run_num
        data_type: 'mouse', or a string of digits used as an index into
            file_operations.get_tcga_disease_list().
        objective_function: one of 'oclode', 'schaeffer', 'wlogv',
            'prosnet'.
        run_num: a string of digits identifying the run.

    Sets the module-level globals data_type, objective_function, run_num
    and gene_universe, then calls write_enrichment_files once for the
    no-GO clusters and once for the GO clusters (BP domain only for now).

    Raises ValueError if any argument fails the checks above.
    '''
    if len(sys.argv) != 4:
        print('Usage:python %s data_type objective_function run_num'
              % sys.argv[0])
        sys.exit()

    global data_type, objective_function, run_num
    # Explicit checks instead of ``assert`` so the validation survives
    # ``python -O``.
    data_type = sys.argv[1]
    if not (data_type == 'mouse' or data_type.isdigit()):
        raise ValueError(
            "data_type must be 'mouse' or a TCGA cancer index, got %r"
            % data_type)
    objective_function = sys.argv[2]
    if objective_function not in ['oclode', 'schaeffer', 'wlogv', 'prosnet']:
        raise ValueError(
            'unknown objective_function: %r' % objective_function)
    run_num = sys.argv[3]
    if not run_num.isdigit():
        raise ValueError('run_num must be all digits, got %r' % run_num)

    if data_type.isdigit():
        data_type = file_operations.get_tcga_disease_list()[int(data_type)]

    go_dct_list = read_go_dictionaries()

    global gene_universe
    gene_universe = file_operations.get_high_std_genes(data_type)

    # Get the overlapping MF and BP terms.
    overlap_list = file_operations.read_go_overlap(data_type)

    # Shared prefix of every results path written below.
    results_dir = './results/%s_results/%s' % (data_type, objective_function)

    # Only evaluate on BP for now.
    for domain_index in [0]:
        go_dct = go_dct_list[domain_index]

        # Remove the overlapping BP terms. The set removes duplicates so each
        # term is deleted once.
        # NOTE(review): ``del`` raises KeyError if an overlap term is absent
        # from go_dct -- presumably the overlap list is derived from these
        # dictionaries; confirm.
        overlapping_go_terms = set(tup[domain_index] for tup in overlap_list)
        for overlapping_go in overlapping_go_terms:
            del go_dct[overlapping_go]

        # No GO network.
        no_go_cluster_fname = '%s/clusters_no_go/clusters_no_go_%s.txt' % (
            results_dir, run_num)
        no_go_fname = (
            '%s/cluster_enrichment_terms_no_go/'
            'cluster_enrichment_terms_no_go_%s_%d.txt' % (
                results_dir, run_num, domain_index))
        write_enrichment_files(no_go_cluster_fname, no_go_fname, go_dct)

        # GO network.
        cluster_fname = '%s/clusters_go/clusters_go_clean_%s_%d.txt' % (
            results_dir, run_num, domain_index)
        go_fname = (
            '%s/cluster_enrichment_terms_go/'
            'cluster_enrichment_terms_go_%s_%d.txt' % (
                results_dir, run_num, domain_index))
        write_enrichment_files(cluster_fname, go_fname, go_dct)