def main():
    """Dump the BP and MF GO dictionaries of one data set to JSON files.

    Command line: ``python <script> mouse/tcga_cancers``. A numeric argument
    is used as an index into ``file_operations.get_tcga_disease_list()``.

    The two dictionaries returned by ``get_go_domains`` are written to
    ``./data/<data_type>_data/bp_dct.json`` and
    ``./data/<data_type>_data/mf_dct.json`` once any entry keyed by the empty
    string has been removed.
    """
    argv = sys.argv
    if len(argv) != 2:
        print('Usage:python %s mouse/tcga_cancers' % argv[0])
        exit()

    data_type = argv[1]
    assert data_type.isdigit() or data_type == 'mouse'
    if data_type.isdigit():
        # A number selects a disease by its position in the TCGA list.
        data_type = file_operations.get_tcga_disease_list()[int(data_type)]

    bp_go_gene_dct, mf_go_gene_dct = get_go_domains(data_type)
    targets = (
        ('./data/%s_data/bp_dct.json', bp_go_gene_dct),
        ('./data/%s_data/mf_dct.json', mf_go_gene_dct),
    )

    # Drop the empty-string key from both dictionaries before writing anything.
    for _, go_gene_dct in targets:
        if '' in go_gene_dct:
            del go_gene_dct['']

    # One JSON file per dictionary; the with-statement closes each file.
    for path_template, go_gene_dct in targets:
        with open(path_template % data_type, 'w') as fp:
            json.dump(go_gene_dct, fp)
def main():
    """Compute GO enrichments of the WGCNA clusters for the BP and MF domains.

    Command line: ``<script> data_type genes_only/pca/mean/median``. A numeric
    ``data_type`` is used as an index into
    ``file_operations.get_tcga_disease_list()``.

    For each domain the GO dictionary is loaded, the terms listed for that
    domain in the overlap list are deleted from it, the matching cluster
    dictionary is fetched and everything is handed to
    ``compute_go_enrichments``.
    """
    if len(sys.argv) != 3:
        print('Usage: %s data_type genes_only/pca/mean/median' % sys.argv[0])
        exit()

    data_type, network_type = sys.argv[1], sys.argv[2]
    assert data_type == 'mouse' or data_type.isdigit()
    assert network_type in ['genes_only', 'pca', 'mean', 'median']
    if data_type.isdigit():
        data_type = file_operations.get_tcga_disease_list()[int(data_type)]

    for domain_index, go_domain in enumerate(['bp', 'mf']):
        go_dct = get_go_gene_dct(go_domain, data_type)

        # Each overlap tuple is indexed by the domain's position (bp=0, mf=1);
        # the term found there is removed from this domain's dictionary.
        overlap_list = read_go_overlap(data_type)
        for shared_term in set(pair[domain_index] for pair in overlap_list):
            del go_dct[shared_term]

        # The genes-only network is looked up under its own name instead of a
        # GO domain.
        if network_type == 'genes_only':
            cluster_key = network_type
        else:
            cluster_key = go_domain
        cluster_wgcna_dct = get_cluster_dictionary(data_type, network_type,
                                                   cluster_key)

        compute_go_enrichments(data_type, network_type, go_dct,
                               cluster_wgcna_dct, go_domain)
def main():
    """Write the thresholded gene-gene correlation network for one data set.

    Command line: ``python <script> mouse/tcga_cancer_index``. A numeric
    argument is used as an index into
    ``file_operations.get_tcga_disease_list()``.

    Correlations between the high-standard-deviation genes are obtained from
    ``corrcoef``. Every pair from the upper triangle whose coefficient reaches
    the threshold (0.9 for mouse, 0.5 for everything else) and is not exactly
    1 is written as ``gene_a<TAB>gene_b<TAB>weight`` to
    ``./data/<data_type>_data/high_std_network.txt``.

    Exits with status 1 when the argument count is wrong.
    """
    if len(sys.argv) != 2:
        print("Usage:python %s mouse/tcga_cancer_index" % sys.argv[0])
        # sys.exit instead of the interactive-only exit() helper; non-zero so
        # calling scripts can detect the misuse.
        sys.exit(1)

    data_type = sys.argv[1]
    assert data_type == "mouse" or data_type.isdigit()
    if data_type.isdigit():
        data_type = file_operations.get_tcga_disease_list()[int(data_type)]

    if data_type == "mouse":
        pcc_threshold = 0.9
    else:
        # TCGA coefficients are worse.
        pcc_threshold = 0.5

    # Read in the expression data and correlate the high-variance genes.
    gene_exp_dct = file_operations.get_gene_expression_dct(data_type)
    high_std_genes = file_operations.get_high_std_genes(data_type)
    gene_exp_matrix = create_gene_exp_matrix(gene_exp_dct, high_std_genes)
    # p is only needed by the disabled p-value filter noted below.
    r, p = corrcoef(gene_exp_matrix)

    # The context manager guarantees the file is flushed and closed even if
    # writing fails part-way through.
    with open("./data/%s_data/high_std_network.txt" % data_type, "w") as out:
        for row_idx, row in enumerate(r):
            for col_idx, pcc in enumerate(row):
                # Keep the strict upper triangle only, and drop weak or
                # exactly-1 correlations.
                if col_idx <= row_idx or pcc < pcc_threshold or pcc == 1:
                    continue
                # Disabled filter kept from the original:
                # if p[row_idx][col_idx] > P_VALUE_THRESOLD:
                #     continue
                gene_a = high_std_genes[row_idx]
                gene_b = high_std_genes[col_idx]
                out.write("%s\t%s\t%f\n" % (gene_a, gene_b, abs(pcc)))
def main():
    """Convert module membership tables into cluster files.

    Command line: ``<script> data_type genes_only/pca/mean/median``. A numeric
    ``data_type`` is used as an index into
    ``file_operations.get_tcga_disease_list()``.

    For every domain (``genes_only`` alone, or ``bp``/``cc``/``mf``) the file
    ``./data/<data_type>_module_membership_<domain>.txt`` is read and
    ``./results/<data_type>_results/<go_method>/clusters_<domain>.txt`` is
    written: a dummy header followed by one
    ``Species 0<TAB>Gene <id><TAB>Cluster <module>`` line per kept node.
    Nodes in module ``0``, GO-term nodes and nodes whose id contains neither
    ``ENSMUSG`` nor ``ENSG`` are skipped.

    Exits with status 1 when the argument count is wrong.
    """
    if len(sys.argv) != 3:
        print('Usage: %s data_type genes_only/pca/mean/median' % sys.argv[0])
        # sys.exit instead of the interactive-only exit() helper; non-zero so
        # calling scripts can detect the misuse.
        sys.exit(1)

    data_type = sys.argv[1]
    assert data_type == 'mouse' or data_type.isdigit()
    go_method = sys.argv[2]
    assert go_method in ['genes_only', 'pca', 'mean', 'median']
    if data_type.isdigit():
        data_type = file_operations.get_tcga_disease_list()[int(data_type)]

    if go_method == 'genes_only':
        domain_list = [go_method]
    else:
        domain_list = ['bp', 'cc', 'mf']

    # os.makedirs creates the intermediate ./results/<data_type>_results/
    # folder as well, so one call covers both levels.
    results_folder = './results/%s_results/%s/' % (data_type, go_method)
    if not os.path.exists(results_folder):
        os.makedirs(results_folder)

    for go_domain in domain_list:
        membership_path = './data/%s_module_membership_%s.txt' % (
            data_type, go_domain)
        clusters_path = '%sclusters_%s.txt' % (results_folder, go_domain)
        # Both handles are closed even if a malformed line raises.
        with open(membership_path, 'r') as f, open(clusters_path, 'w') as out:
            # Write dummy header line.
            out.write('dummy_header\n')
            # Skip the input header.
            next(f, None)
            for line in f:
                fields = line.split()
                # A blank line (e.g. at the end of the file) carries no node.
                if not fields:
                    continue
                # The color and membership columns are not used.
                node, module, _color, _membership = fields
                # Skip garbage module.
                if module == '0':
                    continue
                # Remove quotation marks around the gene ID.
                node = node.strip('"')
                # Don't add in GO terms or anything that is not a gene ID.
                if 'GO:' in node or ('ENSMUSG' not in node
                                     and 'ENSG' not in node):
                    continue
                out.write('Species 0\tGene %s\tCluster %s\n' % (node, module))
def main():
    """Write the per-cluster information table for the WGCNA clusters.

    Command line:
    ``<script> data_type genes_only/pca/mean/median network_to_compare``.
    A numeric ``data_type`` is used as an index into
    ``file_operations.get_tcga_disease_list()``; ``network_to_compare`` must be
    an integer and is passed to ``get_network_statistics``.

    The resolved ``data_type`` is stored in the module-level global of the
    same name (presumably read by the helper functions, which are not passed
    it). For the ``bp`` domain the table is written to
    ``./results/<data_type>_results/<go_method>/clus_info_<go_method>_bp.txt``:
    a network summary, then one row per cluster id ``"1"`` .. ``"N"`` with
    densities, sizes and the top enrichment value.

    Exits with status 1 when the argument count is wrong.
    """
    global data_type

    if len(sys.argv) != 4:
        print("Usage: %s data_type genes_only/pca/mean/median network_to_compare" % sys.argv[0])
        # sys.exit instead of the interactive-only exit() helper; non-zero so
        # calling scripts can detect the misuse.
        sys.exit(1)

    data_type = sys.argv[1]
    assert data_type == "mouse" or data_type.isdigit()
    go_method = sys.argv[2]
    assert go_method in ["genes_only", "pca", "mean", "median"]
    comparing_network = sys.argv[3]
    assert comparing_network.isdigit()
    if data_type.isdigit():
        data_type = file_operations.get_tcga_disease_list()[int(data_type)]

    num_genes_net, num_gg_net = get_network_statistics(int(comparing_network))

    domain_list = ["bp"]
    for go_domain in domain_list:
        # The genes-only run is stored under its own name rather than a domain.
        if go_method == "genes_only":
            cluster_key = go_method
        else:
            cluster_key = go_domain
        cluster_wgcna_dct = get_cluster_dictionary(go_method, cluster_key)
        density_dct = get_density_dct(go_method, cluster_key)
        enrichment_dct = get_enrichment_dct(go_method, go_domain)

        out_path = "./results/%s_results/%s/clus_info_%s_%s.txt" % (
            data_type, go_method, go_method, go_domain)
        # The context manager closes the file even if a cluster id is missing
        # from one of the dictionaries.
        with open(out_path, "w") as out:
            out.write("num_genes_in_net\tnum_g_g_net\tnum_g_go_net\n")
            # Automatically write 0 for the last column, since there are no
            # gene-GO edges for WGCNA.
            out.write("%s\t%d\t0\n" % (num_genes_net, num_gg_net))
            out.write("cluster_number\tin_dens\tout_dens\tweighted_in_out_ratio\t")
            out.write("num_genes\tnum_go_terms_in\tnum_g_g_edges\tnum_g_go_edges\t")
            out.write("top_enrichment_p\n")

            # Cluster ids are the strings "1" .. "N"; walking the numbers
            # keeps the rows in numeric order.
            for cluster_number in range(1, len(cluster_wgcna_dct) + 1):
                cid = str(cluster_number)
                num_genes = len(cluster_wgcna_dct[cid])
                in_dens, out_dens, weighted_ratio, num_gg_edges = density_dct[cid]
                out.write("%s\t%g\t%g\t%g\t" % (cid, in_dens, out_dens, weighted_ratio))
                # GO-term and gene-GO edge counts are always 0 here.
                out.write("%d\t0\t%d\t0\t" % (num_genes, num_gg_edges))
                out.write("%s\n" % enrichment_dct[cid])
def main():
    """Write GO enrichment files for the no-GO and GO clusterings of one run.

    Command line: ``python <script> data_type objective_function run_num``.
    A numeric ``data_type`` is used as an index into
    ``file_operations.get_tcga_disease_list()``.

    ``data_type``, ``objective_function``, ``run_num`` and ``gene_universe``
    are assigned as module-level globals (presumably read by the helpers
    called from here). Only domain index 0 (BP) is evaluated: its overlapping
    terms are deleted from the GO dictionary, then ``write_enrichment_files``
    runs once for the no-GO cluster file and once for the cleaned GO cluster
    file.
    """
    global data_type, objective_function, run_num, gene_universe

    if len(sys.argv) != 4:
        print('Usage:python %s data_type objective_function run_num' % sys.argv[0])
        exit()

    data_type = sys.argv[1]
    assert data_type == 'mouse' or data_type.isdigit()
    objective_function = sys.argv[2]
    assert objective_function in ['oclode', 'schaeffer', 'wlogv', 'prosnet']
    run_num = sys.argv[3]
    assert run_num.isdigit()
    if data_type.isdigit():
        data_type = file_operations.get_tcga_disease_list()[int(data_type)]

    go_dct_list = read_go_dictionaries()
    gene_universe = file_operations.get_high_std_genes(data_type)
    # Terms shared between MF and BP.
    overlap_list = file_operations.read_go_overlap(data_type)

    # Only evaluate on BP for now.
    for domain_index in [0]:
        go_dct = go_dct_list[domain_index]

        # Delete this domain's side of every overlapping pair.
        for shared_term in set(pair[domain_index] for pair in overlap_list):
            del go_dct[shared_term]

        # Clustering of the network without GO terms.
        no_go_cluster_fname = (
            './results/%s_results/%s/clusters_no_go/clusters_no_go_%s.txt'
            % (data_type, objective_function, run_num))
        no_go_fname = ''.join([
            './results/%s_results/%s/cluster_enrichment_terms_no_go/cluster_'
            % (data_type, objective_function),
            'enrichment_terms_no_go_%s_%d.txt' % (run_num, domain_index),
        ])
        write_enrichment_files(no_go_cluster_fname, no_go_fname, go_dct)

        # Clustering of the network with GO terms.
        cluster_fname = (
            './results/%s_results/%s/clusters_go/clusters_go_clean_%s_%d.txt'
            % (data_type, objective_function, run_num, domain_index))
        go_fname = ''.join([
            './results/%s_results/%s/cluster_enrichment_terms_go/cluster_'
            % (data_type, objective_function),
            'enrichment_terms_go_%s_%d.txt' % (run_num, domain_index),
        ])
        write_enrichment_files(cluster_fname, go_fname, go_dct)
def main():
    """Run ``evaluate_clustering.pl`` on the GO and no-GO clusterings of a run.

    Command line: ``python <script> data_type objective_function run_num``.
    A numeric ``data_type`` is used as an index into
    ``file_operations.get_tcga_disease_list()``.

    For GO domain 0 a clean cluster file is created first, then the perl
    script is invoked through the shell twice (GO network, then no-GO
    network) with its standard output redirected into the matching
    ``cluster_eval_*`` file.
    """
    if len(sys.argv) != 4:
        print('Usage:python %s data_type objective_function run_num' % sys.argv[0])
        exit()

    data_type = sys.argv[1]
    assert data_type == 'mouse' or data_type.isdigit()
    objective_function = sys.argv[2]
    assert objective_function in ['oclode', 'schaeffer', 'wlogv', 'prosnet']
    run_num = sys.argv[3]
    assert run_num.isdigit()
    if data_type.isdigit():
        data_type = file_operations.get_tcga_disease_list()[int(data_type)]

    for go_domain_num in [0]:
        # The GO evaluation reads the cleaned cluster file, so build it first.
        file_operations.create_clean_go_file(data_type, objective_function,
                                             run_num, go_domain_num)

        # Arguments: cluster file, network file, then the stdout redirect.
        go_command = ''.join([
            'perl ./evaluate_clustering.pl ',
            '"./results/%s_results/%s/clusters_go/clusters_go_clean_%s_%d.txt" ' % (
                data_type, objective_function, run_num, go_domain_num),
            '"./data/%s_data/networks_go/real_network_go_%s_%d.txt" ' % (
                data_type, run_num, go_domain_num),
            '> "./results/%s_results/%s/cluster_eval_go/cluster_eval_go_%s_%d.txt"' % (
                data_type, objective_function, run_num, go_domain_num),
        ])
        subprocess.call(go_command, shell=True)

        # Same evaluation for the network without GO terms.
        no_go_command = ''.join([
            'perl ./evaluate_clustering.pl ',
            '"./results/%s_results/%s/clusters_no_go/clusters_no_go_%s.txt" ' % (
                data_type, objective_function, run_num),
            '"./data/%s_data/networks_no_go/real_network_no_go_%s.txt" ' % (
                data_type, run_num),
            '> "./results/%s_results/%s/cluster_eval_no_go/cluster_eval_no_go_%s.txt"' % (
                data_type, objective_function, run_num),
        ])
        subprocess.call(no_go_command, shell=True)
def main():
    """Run the clustering binary of one objective function on one network.

    Command line:
    ``python <script> data_type objective_function run_num go/no_go [go_num]``.
    ``go_num`` is required when the network is ``go``. A numeric ``data_type``
    is used as an index into ``file_operations.get_tcga_disease_list()``.

    The result folders for the chosen objective function are created when
    missing, ``temp`` and ``num_clusters`` are read from the run's entry in
    the config file, and the binary is executed through the shell with its
    standard output redirected to
    ``./results/<data_type>_results/<objective_function>/clusters_<network>/``
    and its standard error to ``log``.

    Exits with status 1 when the arguments do not match the usage line.
    """
    usage = 'Usage:python %s data_type objective_function run_num go/no_go go_num <if go>' % sys.argv[0]
    if len(sys.argv) not in [5, 6]:
        print(usage)
        # sys.exit instead of the interactive-only exit() helper; non-zero so
        # calling scripts can detect the misuse.
        sys.exit(1)

    # Sort out command line arguments.
    data_type, objective_function, run_num, network = sys.argv[1:5]
    assert data_type == 'mouse' or data_type.isdigit()
    assert objective_function in ['oclode', 'schaeffer', 'wlogv']
    assert run_num.isdigit()
    assert network in ['go', 'no_go']
    if network == 'go' and len(sys.argv) != 6:
        # Without this check the missing go_num surfaced later as an
        # IndexError on sys.argv[5].
        print(usage)
        sys.exit(1)
    go_num = sys.argv[5] if network == 'go' else None

    if data_type.isdigit():
        data_type = file_operations.get_tcga_disease_list()[int(data_type)]

    # Construct the results folders. The tree is built for the objective
    # function actually being run (it used to be hard-coded to 'wlogv/', so
    # the output redirect of the other objective functions had no folder),
    # and every folder is checked on its own so a partially created tree is
    # completed.
    results_folder = './results/%s_results/' % data_type
    obj_func_folder = results_folder + objective_function + '/'
    needed_folders = [results_folder, obj_func_folder]
    for subfolder in ('clus_info_no_go/', 'clus_info_go/',
                      'cluster_enrichment_terms_go/',
                      'cluster_enrichment_terms_no_go/',
                      'cluster_eval_no_go/', 'clusters_go/',
                      'cluster_eval_go/', 'clusters_no_go/'):
        needed_folders.append(obj_func_folder + subfolder)
    needed_folders.append(results_folder + 'comparison_plots/')
    for folder in needed_folders:
        if not os.path.exists(folder):
            os.makedirs(folder)

    config_dct = file_operations.read_config_file(data_type)[run_num]
    temp = config_dct['temp']
    num_clusters = config_dct['num_clusters']

    # Get the binary associated with the desired objective function; WlogV
    # stays the fallback, as in the original if/elif/else chain.
    binaries = {
        'oclode': './OCLODE/makedir/OCLODE_efficient',
        'schaeffer': './SchaefferScore/makedir/SchaefferImplementNotWeighted',
    }
    binary = binaries.get(objective_function, './WlogV/makedir/WlogVImplement')

    # Network and cluster files share the same "<network>_<run>[_<go_num>]" tag.
    file_tag = '%s_%s' % (network, run_num)
    if go_num is not None:
        file_tag += '_%s' % go_num

    # Build the command.
    command = '%s %s 1 0 "./data/%s_data/orth.txt" 1 ' % (
        binary, num_clusters, data_type)
    command += '"./data/%s_data/networks_%s/network_%s.txt" ' % (
        data_type, network, file_tag)
    command += '-t %s 2>log > ' % temp
    command += '"./results/%s_results/%s/clusters_%s/clusters_%s.txt"' % (
        data_type, objective_function, network, file_tag)

    # Execute the command in the shell (needed for the redirections).
    print(command)
    subprocess.call(command, shell=True)
# Select the 'Agg' backend; this is done before pylab is imported below.
matplotlib.use('Agg')
# NOTE(review): THRESHOLD_MULT is not referenced in the visible part of this
# script -- confirm where it is used.
THRESHOLD_MULT = 0.8
import pylab

# Script entry point. Command line: <script> data_type run_num
if __name__ == '__main__':
    if len(sys.argv) != 3:
        print 'Usage: %s data_type run_num' % sys.argv[0]
        exit()
    data_type = sys.argv[1]
    assert data_type == 'mouse' or data_type.isdigit()
    run_num = sys.argv[2]
    assert run_num.isdigit()

    go_domain_list = ['bp', 'mf']

    # A numeric data_type is an index into the TCGA disease list.
    if data_type.isdigit():
        data_type = file_operations.get_tcga_disease_list()[int(data_type)]

    # Only the first GO domain ('bp') is processed.
    for go_domain_index in [0]:
        go_domain = go_domain_list[go_domain_index]
        colors = ['blue', 'purple', 'red', 'black', 'orange', 'green', 'yellow']
        # Previous mode list, kept for reference:
        # for mode_index, mode in enumerate(['wgcna', 'wlogv_go', 'wlogv_no_go',
        #     'oclode_go', 'oclode_no_go', 'schaeffer_go', 'schaeffer_no_go']):
        for mode_index, mode in enumerate(['wgcna', 'wlogv_go', 'wlogv_no_go', 'prosnet_go', 'prosnet_no_go']):
            pts = []
            # Relative names of the cluster-info files for this run and domain.
            no_go_fname = 'clus_info_no_go/clus_info_no_go_%s_%d.txt' % (
                run_num, go_domain_index)
            go_fname = 'clus_info_go/clus_info_go_%s_%d.txt' % (
                run_num, go_domain_index)
            # NOTE(review): the loop body appears to continue beyond this
            # point in the full script; nothing further is visible here.
def main():
    """Write expression files restricted to the high-standard-deviation genes.

    Command line: ``<script> data_type genes_only/pca/mean/median``. A numeric
    ``data_type`` is used as an index into
    ``file_operations.get_tcga_disease_list()``.

    For every domain (``genes_only`` alone, or ``bp``/``cc``/``mf``) the
    expression table of the data set is copied to a per-domain output file:
    the header line is copied as-is and a gene row is kept only when its gene
    is returned by ``get_high_std_genes``. The code that appended GO-term
    "super gene" rows (mean / PCA / median over annotated genes) is disabled
    and kept below as comments.

    Exits with status 1 when the argument count is wrong.
    """
    if len(sys.argv) != 3:
        print('Usage: %s data_type genes_only/pca/mean/median' % sys.argv[0])
        # sys.exit instead of the interactive-only exit() helper; non-zero so
        # calling scripts can detect the misuse.
        sys.exit(1)

    data_type = sys.argv[1]
    assert data_type == 'mouse' or data_type.isdigit()
    go_method = sys.argv[2]
    assert go_method in ['genes_only', 'pca', 'mean', 'median']
    if data_type.isdigit():
        data_type = file_operations.get_tcga_disease_list()[int(data_type)]

    # Hashed once so the membership test done for every expression row is a
    # constant-time lookup whatever container the helper returns.
    high_std_genes = frozenset(get_high_std_genes(data_type))

    if go_method == 'genes_only':
        go_domain_list = [go_method]
    else:
        go_domain_list = ['bp', 'cc', 'mf']

    # go_domain is the domain we hold out.
    for go_domain in go_domain_list:
        ## Leave one out code. This block adds two GO domains into the network,
        ## and evaluates on the domain that is not added.
        # go_domain_list_train = go_domain_list[:]
        # go_domain_list_train.remove(go_domain)
        # go_gene_dct = {}
        # for go_domain_train in go_domain_list_train:
        #     go_gene_dct.update(get_go_gene_dct(go_domain_train))
        # # This line is to both train and evaluate on the same domain.
        # if go_domain != 'genes_only':
        #     go_gene_dct = get_go_gene_dct(go_domain)

        if data_type == 'mouse':
            in_path = '../data/mouse_data/mm_mrsb_log2_expression.tsv'
            out_path = './data/mm_mrsb_log2_expression_%s.tsv' % go_domain
        else:
            in_path = '../data/%s_data/expr.txt' % data_type
            out_path = './data/%s_expr_%s.txt' % (data_type, go_domain)

        # Both files are closed exactly once, even if a row fails the gene-id
        # assertion below.
        with open(in_path, 'r') as f, open(out_path, 'w') as out:
            # Write out the gene expression vectors for genes.
            # gene_expression_dct = {}
            for i, line in enumerate(f):
                # Directly write out the header line.
                if i == 0:
                    out.write(line)
                    continue
                gene = line.split()[0]
                assert 'ENSMUSG' in gene or 'ENSG' in gene
                # Skip genes with low standard deviation.
                if gene not in high_std_genes:
                    continue
                out.write(line)
                # exp_vals = [float(val) for val in line.split()[1:]]
                # assert gene not in gene_expression_dct
                # gene_expression_dct[gene] = exp_vals

            # if go_domain == 'genes_only':
            #     break
            # # Run PCA and generate gene expression vectors for GO terms.
            # for go_term in go_gene_dct:
            #     annotated_gene_list = go_gene_dct[go_term]
            #     # Skip bad GO terms.
            #     num_annotated_gene_list = len(annotated_gene_list)
            #     if num_annotated_gene_list < 10 or num_annotated_gene_list > 1000:
            #         continue
            #     # Construct expression matrix for a GO term on the genes it
            #     # annotates.
            #     super_gene_matrix = []
            #     for annotated_gene in annotated_gene_list:
            #         ann_gene_expression = gene_expression_dct[annotated_gene]
            #         super_gene_matrix += [ann_gene_expression]
            #     if go_method == 'mean':
            #         # Get the average across each sample.
            #         super_gene_matrix = np.array(super_gene_matrix)
            #         super_gene = np.mean(super_gene_matrix, axis=0)
            #     elif go_method == 'pca':
            #         # Get most principal component.
            #         pca = PCA()
            #         pca.fit(super_gene_matrix)
            #         super_gene = pca.components_[0] + pca.mean_
            #     elif go_method == 'median':
            #         # Get median across each sample.
            #         super_gene_matrix = np.array(super_gene_matrix)
            #         super_gene = np.median(super_gene_matrix, axis=0)
            #     # Write out the vector to file.
            #     out.write(go_term + '\t')
            #     out.write('\t'.join(map(str, super_gene)) + '\n')