def data_fetch(self, curs, splat_table, mcl_table, crs_no=0, output_fname=None):
    """
    Stream every cluster joined from mcl_table and splat_table to output_fname.

    04-17-05
        fetch cluster_dstructures for all clusters(Jasmine's request)
    04-19-05
        1. return a mcl_id2cluster_dstructure
        2. crs_no
    08-31-05
        output clusters directly to output_fname
    09-01-05
        add the last []

    Arguments:
        curs: database cursor used for every query issued here.
        splat_table, mcl_table: table names interpolated into the SQL text.
        crs_no: suffix of the server-side cursor name ("crs<crs_no>").
        output_fname: path of the file to write. The file is opened
            unconditionally, so the default of None is not usable.

    The file receives "r:=[" followed by one "<cluster>," entry per row and
    a closing "[]]:".

    Returns mcl_id2cluster_dstructure, which is always empty here because
    clusters are written to the file instead of being collected.
    """
    gene_no2gene_id = get_gene_no2gene_id(curs)
    outf = open(output_fname, 'w')
    try:
        outf.write("r:=[")

        # Kept only for the return value; nothing is stored in it any more.
        mcl_id2cluster_dstructure = {}
        no_of_total_genes = get_no_of_total_genes(curs)
        sys.stderr.write("Getting the basic information for all clusters...\n")
        # 06-20-05: connectivity_original is faked to be 0 by the query itself.
        curs.execute(
            "DECLARE crs%s CURSOR FOR select m.mcl_id, m.vertex_set, m.connectivity, 0, "
            "m.recurrence_array, s.edge_set, s.connectivity, m.cooccurrent_cluster_id "
            "from %s m, %s s where m.splat_id=s.splat_id"
            % (crs_no, mcl_table, splat_table))

        fetch_sql = "fetch 5000 from crs%s" % crs_no
        curs.execute(fetch_sql)
        rows = curs.fetchall()
        while rows:
            for row in rows:
                unit = cluster_dstructure()
                unit.cluster_id = row[0]
                # The first and last characters of the text columns are
                # dropped before splitting on commas.
                unit.vertex_set = [int(vertex) for vertex in row[1][1:-1].split(',')]
                unit.connectivity = row[2]
                unit.connectivity_original = row[3]
                unit.recurrence_array = [float(value) for value in row[4][1:-1].split(',')]
                unit.edge_set = parse_splat_table_edge_set(row[5])
                unit.splat_connectivity = row[6]
                unit.cooccurrent_cluster_id = row[7]
                unit.go_no2association_genes = self.get_go_functions_of_this_gene_set(
                    curs, unit.vertex_set)
                # jasmine wants to cut some go-nos, hence the p-value cut-off.
                unit.go_no2information = self.get_information_of_go_functions(
                    curs, unit.go_no2association_genes, len(unit.vertex_set),
                    no_of_total_genes, p_value_cut_off=0.05)
                unit.edge_cor_2d_list, unit.edge_sig_2d_list = self.get_cor_sig_2d_list(
                    curs, unit.edge_set)

                str_tmp = self.return_string_form_of_cluster_dstructure(unit, gene_no2gene_id)
                outf.write("%s," % str_tmp)
            curs.execute(fetch_sql)
            rows = curs.fetchall()
        # 09-01-05: the trailing blank [] absorbs the comma after the last cluster.
        outf.write("[]]:")
    finally:
        # Close explicitly so the handle is released even when a query or a
        # helper raises half-way through.
        outf.close()
    sys.stderr.write("Done.\n")
    return mcl_id2cluster_dstructure
def cluster_dstructure_output_with_both_hierarchy(self, curs, output_fname,
        pre_2nd_cc_hierarchy, mcl_id2cluster_dstructure,
        mcl_id_2nd_order2cluster_dstructure):
    """
    Write 2nd-order clusters together with their connected components.

    04-19-05
        jasmine wants to put 2nd-order clusters and its connected components into one file.

    Arguments:
        curs: database cursor handed to get_gene_no2gene_id().
        output_fname: path of the file to (over)write.
        pre_2nd_cc_hierarchy: dictionary whose values are dictionaries
            mapping a 2nd-order mcl_id to an iterable of mcl_ids.
        mcl_id2cluster_dstructure: lookup for the mcl_ids in those iterables.
        mcl_id_2nd_order2cluster_dstructure: lookup for the 2nd-order mcl_ids.

    The output is a single nested array "r:=[[[...],[...]],...]:" where each
    innermost list starts with the 2nd-order cluster followed by its members.
    A missing id in either lookup raises KeyError.
    """
    from codense.common import get_gene_no2gene_id
    gene_no2gene_id = get_gene_no2gene_id(curs)
    sys.stderr.write("Outputting cluster information...")
    outf = open(output_fname, 'w')
    try:
        to_string = self.return_string_form_of_cluster_dstructure
        pregraph_strings = []    # one entry per 1st-level key
        for mcl_id_2nd_order_dict in pre_2nd_cc_hierarchy.itervalues():
            second_order_strings = []    # one entry per 2nd-order cluster
            for mcl_id_2nd_order, mcl_id_set in mcl_id_2nd_order_dict.iteritems():
                # The first element is the 2nd-order cluster itself.
                component_strings = [to_string(
                    mcl_id_2nd_order2cluster_dstructure[mcl_id_2nd_order],
                    gene_no2gene_id)]
                for mcl_id in mcl_id_set:
                    component_strings.append(
                        to_string(mcl_id2cluster_dstructure[mcl_id], gene_no2gene_id))
                second_order_strings.append("[%s]" % ','.join(component_strings))
            pregraph_strings.append("[%s]" % ",".join(second_order_strings))
        # 'r:=' is for directly read in as an array
        outf.write("r:=[%s]:" % ",".join(pregraph_strings))
    finally:
        # Release the handle even when a lookup or the formatter raises.
        outf.close()
    sys.stderr.write("Done.\n")
def dstruc_loadin(self, curs):
    """
    Fill the lookup dictionaries stored on self from the database.

    Sets go_no2go_id, go_no2go_name, gene_no2gene_id, gene_id2gene_no,
    global_gene_to_go_dict and label_dict. When self.type is 3 it also sets
    no_of_datasets from array_upper(recurrence_array, 1) of one row of
    self.table.
    """
    sys.stderr.write("Loading Data STructure...\n")
    from codense.common import get_go_no2go_id
    from codense.common import get_gene_no2gene_id
    from codense.common import get_go_no2name
    from codense.common import get_gene_id2gene_no
    from codense.common import get_gene_no2go_no

    self.go_no2go_id = get_go_no2go_id(curs)
    self.go_no2go_name = get_go_no2name(curs)
    self.gene_no2gene_id = get_gene_no2gene_id(curs)
    self.gene_id2gene_no = get_gene_id2gene_no(curs)
    self.global_gene_to_go_dict = get_gene_no2go_no(curs)

    # 04-01-05 the second kind in label_dict: every gene_no labelled by itself
    identity_labels = dict([(gene_no, gene_no) for gene_no in self.gene_no2gene_id])
    self.label_dict = {1: self.gene_no2gene_id, 2: identity_labels}

    # NOTE(review): the result of this query is never fetched in this method.
    curs.execute("select gene_no,go_functions from gene")
    if self.type == 3:
        curs.execute("select array_upper(recurrence_array,1) from %s limit 1" % self.table)
        first_row = curs.fetchall()[0]
        self.no_of_datasets = int(first_row[0])
    sys.stderr.write("Done\n")
def on_button_cluster_info_clicked(self, button_cluster_info, *args):
    """
    Prepare and show the two cluster-info windows.

    Reads the number of datasets from self.entry_no_of_datasets, initialises
    the tree view with it, loads the gene_no2gene_id and gene_no2go_no
    dictionaries through self.curs and shows both windows.

    Returns None without touching any state when there is no database cursor
    yet or when the entry text is not an integer; a short message is printed
    in either case.
    """
    if self.curs is None:
        print("db_connect first")
        return
    try:
        no_of_datasets = int(self.entry_no_of_datasets.get_text())
    except ValueError:
        # Empty or non-numeric entry text: report it instead of letting the
        # exception escape from the signal handler.
        print("no_of_datasets must be an integer")
        return
    self.no_of_datasets = no_of_datasets
    self.treeview_init(self.no_of_datasets)
    self.gene_no2gene_id = get_gene_no2gene_id(self.curs)
    self.gene_no2go_no = get_gene_no2go_no(self.curs)
    self.window_cluster_info1.show()
    self.window_cluster_info2.show()
def dstruc_loadin(self, curs):
    """
    Load the lookup dictionaries and build self.gene_prediction_dict.

    03-09-05
        get the context from mcl_table via linking through mcl_id of p_gene_table
        context_dict is set

    Sets known_genes_dict, go_no2go_id, go_no2go_name and gene_no2gene_id on
    self, then adds one function_struc per (gene_no, go_no) row to
    self.gene_prediction_dict, which must already exist. When self.type is 2
    only genes found in known_genes_dict are kept; when it is 3 only genes
    absent from it are kept.
    """
    from codense.common import get_known_genes_dict, get_go_no2go_id, get_go_no2name, get_gene_no2gene_id
    self.known_genes_dict = get_known_genes_dict(curs)
    self.go_no2go_id = get_go_no2go_id(curs)
    self.go_no2go_name = get_go_no2name(curs)
    self.gene_no2gene_id = get_gene_no2gene_id(curs)

    sys.stderr.write("Setting up gene_prediction_dict...")
    # setup self.gene_prediction_dict
    curs.execute(
        "select p.gene_no, p.go_no, p.is_correct, p.is_correct_l1, p.is_correct_lca, m.vertex_set "
        "from %s p, %s g, %s m where g.p_gene_id=p.p_gene_id and m.mcl_id=p.mcl_id"
        % (self.gene_table, self.table, self.mcl_table)
    )
    for row in curs.fetchall():
        gene_no = row[0]
        if self.type == 2 and gene_no not in self.known_genes_dict:
            # I only want the known genes, but this gene is unknown
            continue
        if self.type == 3 and gene_no in self.known_genes_dict:
            # I only want the unknown genes, but this gene is known
            continue
        go_no = row[1]

        item = function_struc()
        item.is_correct = row[2]
        item.is_correct_l1 = row[3]
        item.is_correct_lca = row[4]
        # vertex_set is text: drop the first and last character, split on
        # commas, and store the integers as a set in context_dict.
        item.context_dict = Set([int(vertex) for vertex in row[5][1:-1].split(",")])

        if gene_no not in self.gene_prediction_dict:
            self.gene_prediction_dict[gene_no] = gene_prediction()
        self.gene_prediction_dict[gene_no].p_functions_struc_dict[go_no] = item
    sys.stderr.write("Done\n")
def run(self):
    """
    Drive the MPI job; each rank takes one of three roles.

    Rank 0 connects to the database, pickles the lookup tables, sends
    gene2enc_array to every computing rank and the two gene dictionaries to
    the last rank, and after the synchronisation point declares the cursor
    "crs" and calls input_node(). Ranks 1 .. size-2 receive gene2enc_array
    and call computing_node(). The last rank receives the gene dictionaries,
    creates self.pic_output_dir when needed and calls output_node() with a
    tab-delimited csv writer on
    "<pic_output_dir>/<good_cluster_table>_p<p_value_cut_off>".
    """
    communicator = MPI.world.duplicate()
    node_rank = communicator.rank
    output_rank = communicator.size - 1
    free_computing_nodes = range(1, output_rank)    # exclude the last node

    # Phase 1: rank 0 distributes the shared data; the others receive it.
    if node_rank == 0:
        (conn, curs) = db_connect(self.hostname, self.dbname, self.schema)
        schema_instance = form_schema_tables(self.fname, self.acc_cutoff, self.lm_bit)
        gene_id2no = get_gene_id2gene_no(curs)
        gene2enc_array = self.get_gene2enc_array(self.gim_inputfname, gene_id2no)
        gene2enc_array_pickle = cPickle.dumps(gene2enc_array, -1)

        gene_no2id = get_gene_no2gene_id(curs)
        gene_no2go_no = get_gene_no2go_no(curs)
        gene_no2id_pickle = cPickle.dumps(gene_no2id, -1)
        gene_no2go_no_pickle = cPickle.dumps(gene_no2go_no, -1)
        for node in free_computing_nodes:    # send it to the computing_node
            communicator.send(gene2enc_array_pickle, node, 0)

        # The receiver unpickles these in the same order they are sent.
        communicator.send(gene_no2id_pickle, output_rank, 0)
        communicator.send(gene_no2go_no_pickle, output_rank, 0)
    elif node_rank in free_computing_nodes:
        data, source, tag = communicator.receiveString(0, 0)
        gene2enc_array = cPickle.loads(data)    # take the data
    elif node_rank == output_rank:
        schema_instance = form_schema_tables(self.fname, self.acc_cutoff, self.lm_bit)
        data, source, tag = communicator.receiveString(0, 0)
        gene_no2id = cPickle.loads(data)
        data, source, tag = communicator.receiveString(0, 0)
        gene_no2go_no = cPickle.loads(data)

    mpi_synchronize(communicator)

    # Phase 2: input / computing / output roles.
    if node_rank == 0:
        curs.execute(
            "DECLARE crs CURSOR FOR SELECT p.id, p.vertex_set, p.edge_set, p.recurrence_array, "
            "g.go_no_list from %s p, %s g where g.mcl_id=p.id"
            % (schema_instance.pattern_table, schema_instance.good_cluster_table))
        input_node(communicator, curs, free_computing_nodes, self.message_size, self.report)
    elif node_rank in free_computing_nodes:
        parameter_list = [gene2enc_array, self.dataset_signature_set, self.p_value_cut_off]
        computing_node(communicator, parameter_list, self.computing_node_handler,
                       report=self.report)
    elif node_rank == output_rank:
        if not os.path.isdir(self.pic_output_dir):
            os.makedirs(self.pic_output_dir)
        cluster_info_instance = cluster_info()
        ofname = os.path.join(
            self.pic_output_dir,
            '%s_p%s' % (schema_instance.good_cluster_table, self.p_value_cut_off))
        outf = open(ofname, 'w')
        try:
            writer = csv.writer(outf, delimiter='\t')
            parameter_list = [self.pic_output_dir, cluster_info_instance, gene_no2id,
                              gene_no2go_no, writer]
            output_node(communicator, free_computing_nodes, parameter_list,
                        self.output_node_handler, self.report)
        finally:
            # Close explicitly so buffered rows are flushed even when
            # output_node() raises.
            outf.close()
def output(self, curs, outf, known_gene_no2p_gene_id_src, unknown_gene_no2p_gene_id_src, p_gene_id_src_map):
    """
    Write the prediction table to outf as tab-delimited rows.

    03-03-05
        loop over gene_no2p_gene_id_src and p_gene_id_src_map
    03-13-05
        add a column, #clusters in the output file

        --output_one_gene()
        --output_function_group()

    Known genes are written first, then unknown genes. Each gene gets its
    own rows from output_one_gene(), a column-header row, one
    output_function_group() call per p_gene_id_src and a blank row.
    """
    # lookup tables shared by every row
    gene_no2gene_id = get_gene_no2gene_id(curs)
    gene_no2direct_go = get_gene_no2direct_go(curs)
    go_no2go_id = get_go_no2go_id(curs)
    go_no2name = get_go_no2name(curs)
    go_no2accuracy, go_no2accuracy_pair = self.get_go_no2accuracy(
        curs, self.p_gene_table, self.gene_p_table)

    sys.stderr.write("Outputing prediction table...")
    writer = csv.writer(outf, delimiter='\t')
    header = ['go_no', 'go_id', 'go_name', 'is_correct', 'is_correct_L1', 'is_correct_lca',
              'p_value_list', '#clusters', 'mcl_id_list', 'e_acc', 'e_acc_pair',
              'cluster_context']

    # first the known genes, second the unknown genes
    for gene_no2p_gene_id_src in (known_gene_no2p_gene_id_src, unknown_gene_no2p_gene_id_src):
        for gene_no, p_gene_id_src_list in gene_no2p_gene_id_src.iteritems():
            self.output_one_gene(curs, writer, gene_no, gene_no2gene_id, gene_no2direct_go)
            writer.writerow(header)
            for p_gene_id_src in p_gene_id_src_list:
                self.output_function_group(
                    curs, writer, p_gene_id_src_map[p_gene_id_src], gene_no2gene_id,
                    go_no2go_id, go_no2name, go_no2accuracy, go_no2accuracy_pair)
            writer.writerow([])
    del writer
    sys.stderr.write("Done\n")
def output1(self, curs, outf, known_gene_no2p_gene_id_src, unknown_gene_no2p_gene_id_src, p_gene_id_src_map):
    """
    Write the prediction table to outf as tab-delimited rows.

    03-15-05
        copied from output()

    Known genes are written first, then unknown genes. The two sections use
    different column headers, and output_function_group1() receives
    prediction_pair2lca_list and gene_no only for known genes.
    """
    # lookup tables shared by every row
    gene_no2gene_id = get_gene_no2gene_id(curs)
    gene_no2direct_go = get_gene_no2direct_go(curs)
    go_no2go_id = get_go_no2go_id(curs)
    go_no2name = get_go_no2name(curs)
    go_no2accuracy, go_no2accuracy_pair = self.get_go_no2accuracy(
        curs, self.p_gene_table, self.gene_p_table)
    from codense.common import get_prediction_pair2lca_list
    prediction_pair2lca_list = get_prediction_pair2lca_list(curs, p_gene_table=self.p_gene_table)

    sys.stderr.write("Outputing prediction table...")
    writer = csv.writer(outf, delimiter='\t')

    known_header = ['go_id', 'go_name', 'is_correct_lca', 'lca_list', 'p_value_list',
                    '#clusters', 'e_acc']
    unknown_header = ['go_id', 'go_name', 'p_value_list', '#clusters', 'e_acc']
    # (gene dictionary, header row, whether the genes are known), in output order
    sections = (
        (known_gene_no2p_gene_id_src, known_header, True),
        (unknown_gene_no2p_gene_id_src, unknown_header, False),
    )

    for gene_no2p_gene_id_src, header, is_known in sections:
        for gene_no, p_gene_id_src_list in gene_no2p_gene_id_src.iteritems():
            self.output_one_gene1(curs, writer, gene_no, gene_no2gene_id, gene_no2direct_go)
            writer.writerow(header)
            # NOTE: the arguments passed to output_function_group1() differ
            # between known and unknown genes.
            if is_known:
                extra_args = (prediction_pair2lca_list, gene_no)
            else:
                extra_args = ()
            for p_gene_id_src in p_gene_id_src_list:
                self.output_function_group1(
                    curs, writer, p_gene_id_src_map[p_gene_id_src], gene_no2gene_id,
                    go_no2go_id, go_no2name, go_no2accuracy, go_no2accuracy_pair,
                    *extra_args)
            writer.writerow([])
    del writer
    sys.stderr.write("Done\n")
def cluster_dstructure_output(self, curs, output_fname, order_1st_id2all_clusters):
    """
    Write all clusters as one nested array.

    04-17-05
        output it in the format Jasmine's Darwin can read

    Arguments:
        curs: database cursor handed to get_gene_no2gene_id().
        output_fname: path of the file to (over)write.
        order_1st_id2all_clusters: dictionary whose values are dictionaries
            mapping an order_2nd_id to a list of clusters.

    The output is "r:=[[[c,c,...],[...]],...]:" with one innermost list per
    order_2nd_id and one middle list per order_1st_id.
    """
    from codense.common import get_gene_no2gene_id
    gene_no2gene_id = get_gene_no2gene_id(curs)
    sys.stderr.write("Outputting cluster information...")
    outf = open(output_fname, 'w')
    try:
        to_string = self.return_string_form_of_cluster_dstructure
        first_order_strings = []    # one entry per 1st-order id
        for all_2nd_order_clusters in order_1st_id2all_clusters.itervalues():
            second_order_strings = []    # one entry per 2nd-order id
            for cluster_list in all_2nd_order_clusters.itervalues():
                # the connected components of this 2nd-order cluster
                component_strings = [to_string(cluster, gene_no2gene_id)
                                     for cluster in cluster_list]
                second_order_strings.append("[%s]" % ','.join(component_strings))
            first_order_strings.append("[%s]" % ",".join(second_order_strings))
        # 'r:=' is for directly read in as an array
        outf.write("r:=[%s]:" % ",".join(first_order_strings))
    finally:
        # Release the handle even when the formatter raises.
        outf.close()
    sys.stderr.write("Done.\n")
def run(self):
    """
    Compare the predictions of two settings and write samples to self.ofname.

    10-17-05
        bit control whether that setting has linear model

    The p_gene ids are split into four categories: only in setting 1, in
    both, only in setting 2, and in neither (relative to the p_gene_table of
    setting 2). A sample of each category is passed to
    output_p_gene_id_list() together with a tab-delimited csv writer and a
    per-category directory "cat<N>" that is created when missing.
    self.bit[0] and self.bit[1] select whether the linear-model results of
    setting 1 and setting 2 are loaded and written.
    """
    schema_instance1 = form_schema_tables(self.fname1, self.acc_cutoff1, self.lm_bit1)
    schema_instance2 = form_schema_tables(self.fname2, self.acc_cutoff2, self.lm_bit2)
    (conn, curs) = db_connect(self.hostname, self.dbname, self.schema)
    p_gene_id_set1 = p_gene_id_set_from_gene_p_table(curs, schema_instance1.gene_p_table)
    p_gene_id_set2 = p_gene_id_set_from_gene_p_table(curs, schema_instance2.gene_p_table)
    p_gene_id_set_total = p_gene_id_set_from_gene_p_table(curs, schema_instance2.p_gene_table)

    category_sets = [
        p_gene_id_set1 - p_gene_id_set2,                            # category I
        p_gene_id_set1 & p_gene_id_set2,                            # category II
        p_gene_id_set2 - p_gene_id_set1,                            # category III
        p_gene_id_set_total - (p_gene_id_set1 | p_gene_id_set2),    # category IV
    ]
    sample_ls_ls = [self.sample_p_gene_id_set(p_gene_id_set, self.no_of_samples)
                    for p_gene_id_set in category_sets]

    outf = open(self.ofname, 'w')
    try:
        writer = csv.writer(outf, delimiter='\t')
        writer.writerow(['linear model coeffs of two settings'])
        writer.writerow([])
        writer.writerow(['No.', 'intercept', 'coeff1', 'coeff2', 'coeff3', 'coeff4', 'coeff5',
                         'intercept_p_value', 'coeff1_p_value', 'coeff2_p_value',
                         'coeff3_p_value', 'coeff4_p_value', 'coeff5_p_value',
                         'score_cut_off'])

        # fetch linear model coefficients
        # 10-17-05 default is nothing, none of them have linear model
        pga_instance_list = [None, None]
        for index, schema_instance in enumerate((schema_instance1, schema_instance2)):
            if self.bit[index] != '1':
                continue
            pga_instance = p_gene_analysis()
            pga_instance.go_no2lm_results, lm_results_2d_list = \
                pga_instance.get_go_no2lm_results(curs, schema_instance.lm_table)
            pga_instance.general_lm_results = \
                pga_instance.get_general_lm_results(lm_results_2d_list)
            pga_instance_list[index] = pga_instance
            self.output_lm_model(curs, schema_instance, writer)

        # following is for drawing graph in output_p_gene_id_list()
        self.gene_no2gene_id = get_gene_no2gene_id(curs)
        self.gene_no2go_no = get_gene_no2go_no(curs)

        cluster_info_instance = cluster_info()

        for index, sample_ls in enumerate(sample_ls_ls):
            cat_no = index + 1
            sys.stderr.write("Category %s...\n" % cat_no)
            writer.writerow(['Category %s' % cat_no])
            writer.writerow([self.category_no2information[cat_no]])
            cat_dir = 'cat%s' % cat_no
            if not os.path.isdir(cat_dir):
                os.makedirs(cat_dir)
            if index == 0:
                # this is different, prediction only in schema_instance1, so swap it
                self.output_p_gene_id_list(
                    curs, schema_instance2, schema_instance1, sample_ls, writer, cat_dir,
                    pga_instance_list[1], pga_instance_list[0], cluster_info_instance,
                    self.simple)
            else:
                self.output_p_gene_id_list(
                    curs, schema_instance1, schema_instance2, sample_ls, writer, cat_dir,
                    pga_instance_list[0], pga_instance_list[1], cluster_info_instance,
                    self.simple)
            sys.stderr.write("End Category %s.\n" % cat_no)
    finally:
        # The file was previously never closed; do it explicitly so buffered
        # rows reach disk even when a category fails.
        outf.close()