Esempio n. 1
0
	def data_fetch(self, curs, splat_table, mcl_table, crs_no=0, output_fname=None):
		"""
		Stream every cluster found by joining mcl_table and splat_table into output_fname.

		Rows are read 5000 at a time through a server-side cursor. Each row becomes
		a cluster_dstructure which is annotated with its GO associations and its
		edge correlation/significance lists, converted to its string form and
		written as one comma-terminated element of an "r:=[ ... []]:" array.

		curs: database cursor used for every query.
		splat_table, mcl_table: names of the two tables joined on splat_id.
		crs_no: suffix of the server-side cursor name ("crs<crs_no>").
		output_fname: path of the file to write. It is opened unconditionally,
			so the default of None cannot actually be used.

		Returns mcl_id2cluster_dstructure, which is always empty because the
		clusters are written out instead of being stored (see 08-31-05).

		04-17-05
			fetch cluster_dstructures for all clusters(Jasmine's request)
		04-19-05
			1. return a mcl_id2cluster_dstructure
			2. crs_no
		08-31-05
			output clusters directly to output_fname
		09-01-05
			add the last []
		"""
		gene_no2gene_id = get_gene_no2gene_id(curs)	#08-31-05
		mcl_id2cluster_dstructure = {}	#never filled since 08-31-05, kept for the return value
		outf = open(output_fname, 'w')	#08-31-05
		try:	#the file must be closed even if a query or a helper raises
			outf.write("r:=[")	#08-31-05

			no_of_total_genes = get_no_of_total_genes(curs)
			sys.stderr.write("Getting the basic information for all clusters...\n")
			curs.execute("DECLARE crs%s CURSOR FOR select m.mcl_id, m.vertex_set, m.connectivity, 0,\
			m.recurrence_array, s.edge_set, s.connectivity, m.cooccurrent_cluster_id from %s m, %s s where \
			m.splat_id=s.splat_id"\
				%(crs_no, mcl_table, splat_table))	#06-20-05	connectivity_original faked to be 0
			fetch_command = "fetch 5000 from crs%s"%crs_no
			curs.execute(fetch_command)
			rows = curs.fetchall()
			while rows:
				for row in rows:
					unit = cluster_dstructure()
					unit.cluster_id = row[0]
					#vertex_set and recurrence_array arrive as "{a,b,c}"-like strings; strip the two end characters
					unit.vertex_set = [int(gene_no) for gene_no in row[1][1:-1].split(',')]
					unit.connectivity = row[2]
					unit.connectivity_original = row[3]
					unit.recurrence_array = [float(value) for value in row[4][1:-1].split(',')]
					unit.edge_set = parse_splat_table_edge_set(row[5])
					unit.splat_connectivity = row[6]
					unit.cooccurrent_cluster_id = row[7]
					unit.go_no2association_genes = self.get_go_functions_of_this_gene_set(curs, unit.vertex_set)
					unit.go_no2information = self.get_information_of_go_functions(curs, \
						unit.go_no2association_genes, len(unit.vertex_set), no_of_total_genes, p_value_cut_off=0.05)	#jasmine wants to cut some go-nos.
					unit.edge_cor_2d_list, unit.edge_sig_2d_list = self.get_cor_sig_2d_list(curs, unit.edge_set)

					str_tmp = self.return_string_form_of_cluster_dstructure(unit, gene_no2gene_id)	#08-31-05
					outf.write("%s,"%str_tmp)
					#the cluster is intentionally neither stored in mcl_id2cluster_dstructure
					#nor grouped into self.order_1st_id2all_clusters any more; it only goes to the file.
				curs.execute(fetch_command)
				rows = curs.fetchall()
			outf.write("[]]:")	#08-31-05, 09-01-05 add the last blank []
		finally:
			outf.close()
		sys.stderr.write("Done.\n")
		return mcl_id2cluster_dstructure
Esempio n. 2
0
	def cluster_dstructure_output_with_both_hierarchy(self, curs, output_fname, \
		pre_2nd_cc_hierarchy, mcl_id2cluster_dstructure, mcl_id_2nd_order2cluster_dstructure):
		"""
		Write 2nd-order clusters together with their connected components to output_fname.

		pre_2nd_cc_hierarchy maps pregraph_id -> {mcl_id_2nd_order: mcl_id_set}.
		For every mcl_id_2nd_order one "[...]" group is produced whose first entry
		is the 2nd-order cluster itself (looked up in
		mcl_id_2nd_order2cluster_dstructure) followed by one entry per mcl_id of
		mcl_id_set (looked up in mcl_id2cluster_dstructure). The groups of one
		pregraph_id are wrapped in another "[...]" and everything is written as a
		single "r:=[...]:" statement.

		curs: database cursor, only used to load gene_no2gene_id.
		output_fname: path of the file to (over)write.

		A missing id in either lookup dictionary raises KeyError; the output file
		is closed in that case as well.

		04-19-05
			jasmine wants to put 2nd-order clusters and its connected components into one file.
		"""
		from codense.common import get_gene_no2gene_id
		gene_no2gene_id = get_gene_no2gene_id(curs)
		sys.stderr.write("Outputting cluster information...")
		outf = open(output_fname, 'w')
		try:	#make sure the handle is released even if a lookup or conversion fails
			to_string = self.return_string_form_of_cluster_dstructure
			str_tmp_list0 = []	#hold the 1st-order clusters
			for pregraph_id,mcl_id_2nd_order_dict in pre_2nd_cc_hierarchy.iteritems():
				str_tmp_list1 = []	#hold the 2nd-order clusters
				for mcl_id_2nd_order,mcl_id_set in mcl_id_2nd_order_dict.iteritems():
					#first one is the 2nd-order cluster, the rest are its connected components
					str_tmp_list2 = [to_string(mcl_id_2nd_order2cluster_dstructure[mcl_id_2nd_order], gene_no2gene_id)]
					for mcl_id in mcl_id_set:
						str_tmp_list2.append(to_string(mcl_id2cluster_dstructure[mcl_id], gene_no2gene_id))
					str_tmp_list1.append("[%s]"%','.join(str_tmp_list2))
				str_tmp_list0.append("[%s]"%",".join(str_tmp_list1))
			#'r:=' is for directly read in as an array
			outf.write("r:=[%s]:"%",".join(str_tmp_list0))
		finally:
			outf.close()
		sys.stderr.write("Done.\n")
Esempio n. 3
0
	def dstruc_loadin(self, curs):
		'''
		Load the lookup dictionaries of this object from the database.

		Sets go_no2go_id, go_no2go_name, gene_no2gene_id, gene_id2gene_no,
		global_gene_to_go_dict and label_dict. When self.type is 3,
		no_of_datasets is additionally read from the upper bound of
		recurrence_array in the first row of self.table.

		curs: database cursor used for every query.
		'''
		sys.stderr.write("Loading Data STructure...\n")

		from codense.common import get_go_no2go_id, get_go_no2name
		from codense.common import get_gene_no2gene_id, get_gene_id2gene_no
		from codense.common import get_gene_no2go_no
		self.go_no2go_id = get_go_no2go_id(curs)
		self.go_no2go_name = get_go_no2name(curs)
		self.gene_no2gene_id = get_gene_no2gene_id(curs)
		self.gene_id2gene_no = get_gene_id2gene_no(curs)
		self.global_gene_to_go_dict = get_gene_no2go_no(curs)

		#04-01-05 the second kind in label_dict: every gene_no labelled by itself
		identity_labels = dict([(gene_no, gene_no) for gene_no in self.gene_no2gene_id])
		self.label_dict = {
			1: self.gene_no2gene_id,
			2: identity_labels}

		#NOTE(review): the result of this query is never fetched here - confirm whether it is still needed
		curs.execute("select gene_no,go_functions from gene")

		if self.type == 3:
			curs.execute("select array_upper(recurrence_array,1) from %s limit 1"%self.table)
			first_row = curs.fetchall()[0]
			self.no_of_datasets = int(first_row[0])

		sys.stderr.write("Done\n")
Esempio n. 4
0
	def on_button_cluster_info_clicked(self, button_cluster_info, *args):
		"""
		Button handler: prepare and show the two cluster-info windows.

		Does nothing but print a reminder when no database cursor is available
		(self.curs is None). Otherwise it reads the number of datasets from
		entry_no_of_datasets, initialises the treeview with it, reloads
		gene_no2gene_id and gene_no2go_no from the database and shows both
		windows.

		button_cluster_info, *args: passed in by the signal, not used here.
		"""
		#identity test: a cursor object must never be asked to compare itself with None
		if self.curs is None:
			#single parenthesised string prints identically as statement or function
			print("db_connect first")
			return
		self.no_of_datasets = int(self.entry_no_of_datasets.get_text())
		self.treeview_init(self.no_of_datasets)
		self.gene_no2gene_id = get_gene_no2gene_id(self.curs)
		self.gene_no2go_no = get_gene_no2go_no(self.curs)

		self.window_cluster_info1.show()
		self.window_cluster_info2.show()
Esempio n. 5
0
    def dstruc_loadin(self, curs):
        """
		
		03-09-05
			get the context from mcl_table via linking through mcl_id of p_gene_table
			context_dict is set
		"""
        from codense.common import get_known_genes_dict, get_go_no2go_id, get_go_no2name, get_gene_no2gene_id

        self.known_genes_dict = get_known_genes_dict(curs)
        self.go_no2go_id = get_go_no2go_id(curs)
        self.go_no2go_name = get_go_no2name(curs)
        self.gene_no2gene_id = get_gene_no2gene_id(curs)

        sys.stderr.write("Setting up gene_prediction_dict...")
        # setup self.gene_prediction_dict
        curs.execute(
            "select p.gene_no, p.go_no, p.is_correct, p.is_correct_l1, p.is_correct_lca, m.vertex_set\
			from %s p, %s g, %s m where g.p_gene_id=p.p_gene_id and m.mcl_id=p.mcl_id"
            % (self.gene_table, self.table, self.mcl_table)
        )
        rows = curs.fetchall()
        for row in rows:
            gene_no = row[0]
            if self.type == 2 and gene_no not in self.known_genes_dict:
                # I only want the known genes, but this gene is unknown
                continue
            elif self.type == 3 and gene_no in self.known_genes_dict:
                # i only want the unknown genes, but this gene is known
                continue
            go_no = row[1]
            is_correct = row[2]
            is_correct_l1 = row[3]
            is_correct_lca = row[4]
            vertex_set = row[5][1:-1].split(",")
            vertex_set = map(int, vertex_set)

            item = function_struc()
            item.is_correct = is_correct
            item.is_correct_l1 = is_correct_l1
            item.is_correct_lca = is_correct_lca
            # context_dict is a set
            item.context_dict = Set(vertex_set)
            if gene_no not in self.gene_prediction_dict:
                self.gene_prediction_dict[gene_no] = gene_prediction()
                self.gene_prediction_dict[gene_no].p_functions_struc_dict[go_no] = item
            else:
                self.gene_prediction_dict[gene_no].p_functions_struc_dict[go_no] = item

        sys.stderr.write("Done\n")

        """
Esempio n. 6
0
	def run(self):
		"""
		Entry point executed by every MPI process; the role depends on the rank.

		rank 0
			connects to the database, builds gene2enc_array from
			self.gim_inputfname and sends it (pickled) to every computing node,
			sends gene_no2id and gene_no2go_no (pickled) to the last node, then
			declares the cursor "crs" and calls input_node().
		ranks 1 .. size-2 (free_computing_nodes)
			receive gene2enc_array and call computing_node() with
			self.computing_node_handler.
		rank size-1 (the last node)
			receives gene_no2id and gene_no2go_no, makes sure
			self.pic_output_dir exists, opens a tab-delimited csv writer on
			'<good_cluster_table>_p<p_value_cut_off>' inside it and calls
			output_node() with self.output_node_handler.

		All processes meet at mpi_synchronize() between the distribution phase
		and the processing phase.
		"""
		communicator = MPI.world.duplicate()
		node_rank = communicator.rank	
		free_computing_nodes = range(1,communicator.size-1)	#exclude the last node
		if node_rank == 0:
			#phase 1, rank 0: load everything from the database and distribute it
			(conn, curs) =  db_connect(self.hostname, self.dbname, self.schema)
			schema_instance = form_schema_tables(self.fname, self.acc_cutoff, self.lm_bit)
			gene_id2no = get_gene_id2gene_no(curs)
			gene2enc_array = self.get_gene2enc_array(self.gim_inputfname, gene_id2no)
			gene2enc_array_pickle = cPickle.dumps(gene2enc_array, -1)
			
			gene_no2id = get_gene_no2gene_id(curs)
			gene_no2go_no = get_gene_no2go_no(curs)
			gene_no2id_pickle = cPickle.dumps(gene_no2id, -1)
			gene_no2go_no_pickle = cPickle.dumps(gene_no2go_no, -1)
			for node in free_computing_nodes:	#send it to the computing_node
				communicator.send(gene2enc_array_pickle, node, 0)
			
			#the last node gets the two dictionaries, in this order
			communicator.send(gene_no2id_pickle, communicator.size-1, 0)
			communicator.send(gene_no2go_no_pickle, communicator.size-1, 0)
		elif node_rank in free_computing_nodes:
			data, source, tag = communicator.receiveString(0, 0)
			gene2enc_array = cPickle.loads(data)	#take the data
		elif node_rank==communicator.size-1:
			schema_instance = form_schema_tables(self.fname, self.acc_cutoff, self.lm_bit)
			#receive order must match the send order on rank 0
			data, source, tag = communicator.receiveString(0, 0)
			gene_no2id = cPickle.loads(data)
			data, source, tag = communicator.receiveString(0, 0)
			gene_no2go_no = cPickle.loads(data)
			
		mpi_synchronize(communicator)
		#phase 2: each rank runs its own role
		if node_rank == 0:
			curs.execute("DECLARE crs CURSOR FOR SELECT p.id, p.vertex_set, p.edge_set, p.recurrence_array,\
			g.go_no_list from %s p, %s g where g.mcl_id=p.id"%(schema_instance.pattern_table, schema_instance.good_cluster_table))
			input_node(communicator, curs, free_computing_nodes, self.message_size, self.report)
		elif node_rank in free_computing_nodes:
			parameter_list = [gene2enc_array, self.dataset_signature_set, self.p_value_cut_off]
			computing_node(communicator, parameter_list, self.computing_node_handler, report=self.report)
		elif node_rank==communicator.size-1:
			if not os.path.isdir(self.pic_output_dir):
				os.makedirs(self.pic_output_dir)
			cluster_info_instance = cluster_info()
			ofname = os.path.join(self.pic_output_dir, '%s_p%s'%(schema_instance.good_cluster_table, self.p_value_cut_off))
			#NOTE(review): the file object is only reachable through writer; it is never closed explicitly
			writer = csv.writer(open(ofname, 'w'), delimiter='\t')
			parameter_list = [self.pic_output_dir, cluster_info_instance, gene_no2id, gene_no2go_no, writer]
			output_node(communicator, free_computing_nodes, parameter_list, self.output_node_handler, self.report)
			del writer
Esempio n. 7
0
	def output(self, curs, outf, known_gene_no2p_gene_id_src, unknown_gene_no2p_gene_id_src, p_gene_id_src_map):
		"""
		Write the prediction table to outf as tab-delimited text.

		For every gene (all known genes first, then all unknown genes) the
		output consists of the gene block written by output_one_gene(), a header
		row, one output_function_group() block per p_gene_id_src of that gene
		and an empty separator row.

		curs: database cursor used to load the lookup dictionaries.
		outf: writable file object handed to csv.writer.
		known_gene_no2p_gene_id_src, unknown_gene_no2p_gene_id_src:
			gene_no -> list of p_gene_id_src.
		p_gene_id_src_map: p_gene_id_src -> value passed to output_function_group().

		03-03-05
			loop over gene_no2p_gene_id_src and p_gene_id_src_map
		03-13-05
			add a column, #clusters in the output file

			--output_one_gene()
			--output_function_group()
		"""
		#lookup dictionaries
		gene_no2gene_id = get_gene_no2gene_id(curs)
		gene_no2direct_go = get_gene_no2direct_go(curs)
		go_no2go_id = get_go_no2go_id(curs)
		go_no2name = get_go_no2name(curs)
		go_no2accuracy, go_no2accuracy_pair = self.get_go_no2accuracy(curs, self.p_gene_table, self.gene_p_table)

		sys.stderr.write("Outputing prediction table...")
		writer = csv.writer(outf, delimiter='\t')
		header = ['go_no', 'go_id', 'go_name', 'is_correct', 'is_correct_L1', 'is_correct_lca', 'p_value_list', '#clusters', 'mcl_id_list', \
			'e_acc', 'e_acc_pair', 'cluster_context']
		#known genes come first, unknown genes second; both are written the same way
		for gene_no2p_gene_id_src in (known_gene_no2p_gene_id_src, unknown_gene_no2p_gene_id_src):
			for gene_no, p_gene_id_src_list in gene_no2p_gene_id_src.iteritems():
				self.output_one_gene(curs, writer, gene_no, gene_no2gene_id, gene_no2direct_go)
				writer.writerow(header)
				for p_gene_id_src in p_gene_id_src_list:
					self.output_function_group(curs, writer, p_gene_id_src_map[p_gene_id_src], gene_no2gene_id,\
						go_no2go_id, go_no2name, go_no2accuracy, go_no2accuracy_pair)
				writer.writerow([])
		del writer
		sys.stderr.write("Done\n")
Esempio n. 8
0
	def output1(self, curs, outf, known_gene_no2p_gene_id_src, unknown_gene_no2p_gene_id_src, p_gene_id_src_map):
		"""
		Write the prediction table to outf as tab-delimited text (variant of output()).

		Known genes are written first, with the lca columns and with
		prediction_pair2lca_list and gene_no passed on to
		output_function_group1(). Unknown genes follow with a shorter header and
		without those two extra arguments.

		curs: database cursor used to load the lookup dictionaries.
		outf: writable file object handed to csv.writer.
		known_gene_no2p_gene_id_src, unknown_gene_no2p_gene_id_src:
			gene_no -> list of p_gene_id_src.
		p_gene_id_src_map: p_gene_id_src -> value passed to output_function_group1().

		03-15-05
			copied from output()
		"""
		#lookup dictionaries
		gene_no2gene_id = get_gene_no2gene_id(curs)
		gene_no2direct_go = get_gene_no2direct_go(curs)
		go_no2go_id = get_go_no2go_id(curs)
		go_no2name = get_go_no2name(curs)
		go_no2accuracy, go_no2accuracy_pair = self.get_go_no2accuracy(curs, self.p_gene_table, self.gene_p_table)
		from codense.common import get_prediction_pair2lca_list
		prediction_pair2lca_list = get_prediction_pair2lca_list(curs,p_gene_table=self.p_gene_table)

		sys.stderr.write("Outputing prediction table...")
		writer = csv.writer(outf, delimiter='\t')
		known_header = ['go_id', 'go_name', 'is_correct_lca', 'lca_list', 'p_value_list', '#clusters',\
			'e_acc']
		unknown_header = ['go_id', 'go_name', 'p_value_list', '#clusters', 'e_acc']

		#first output the known genes
		for gene_no, p_gene_id_src_list in known_gene_no2p_gene_id_src.iteritems():
			self.output_one_gene1(curs, writer, gene_no, gene_no2gene_id, gene_no2direct_go)
			writer.writerow(known_header)
			for p_gene_id_src in p_gene_id_src_list:
				#NOTE: the arguments passed to this function is different between known and unknown genes.
				function_group = p_gene_id_src_map[p_gene_id_src]
				self.output_function_group1(curs, writer, function_group, gene_no2gene_id,\
					go_no2go_id, go_no2name, go_no2accuracy, go_no2accuracy_pair, prediction_pair2lca_list, gene_no)
			writer.writerow([])

		#second output the unknown genes
		for gene_no, p_gene_id_src_list in unknown_gene_no2p_gene_id_src.iteritems():
			self.output_one_gene1(curs, writer, gene_no, gene_no2gene_id, gene_no2direct_go)
			writer.writerow(unknown_header)
			for p_gene_id_src in p_gene_id_src_list:
				function_group = p_gene_id_src_map[p_gene_id_src]
				self.output_function_group1(curs, writer, function_group, gene_no2gene_id,\
					go_no2go_id, go_no2name, go_no2accuracy, go_no2accuracy_pair)
			writer.writerow([])
		del writer
		sys.stderr.write("Done\n")
Esempio n. 9
0
	def cluster_dstructure_output(self, curs, output_fname, order_1st_id2all_clusters):
		"""
		Write all clusters to output_fname as one nested "r:=[...]:" array.

		order_1st_id2all_clusters maps order_1st_id -> {order_2nd_id: [cluster, ...]}.
		The clusters of one order_2nd_id form the innermost "[...]" group, the
		groups of one order_1st_id are wrapped in another "[...]", and all of
		them together make up the array.

		curs: database cursor, only used to load gene_no2gene_id.
		output_fname: path of the file to (over)write; it is closed even if
			converting a cluster raises.

		04-17-05
			output it in the format Jasmine's Darwin can read
		"""
		from codense.common import get_gene_no2gene_id
		gene_no2gene_id = get_gene_no2gene_id(curs)
		sys.stderr.write("Outputting cluster information...")
		outf = open(output_fname, 'w')
		try:	#make sure the handle is released even if a conversion fails
			str_tmp_list0 = []	#hold the 1st-order clusters
			for order_1st_id,all_2nd_order_clusters in order_1st_id2all_clusters.iteritems():
				str_tmp_list1 = []	#hold the 2nd-order clusters
				for order_2nd_id,cluster_list in all_2nd_order_clusters.iteritems():
					#one string per connected component
					str_tmp_list2 = [self.return_string_form_of_cluster_dstructure(cluster, gene_no2gene_id) \
						for cluster in cluster_list]
					str_tmp_list1.append("[%s]"%','.join(str_tmp_list2))
				str_tmp_list0.append("[%s]"%",".join(str_tmp_list1))
			#'r:=' is for directly read in as an array
			outf.write("r:=[%s]:"%",".join(str_tmp_list0))
		finally:
			outf.close()
		sys.stderr.write("Done.\n")
Esempio n. 10
0
	def run(self):
		"""
		Compare the predictions of two settings and write a sampled report to self.ofname.

		The p_gene ids are split into four categories:
			I	only in the gene_p_table of setting 1
			II	in the gene_p_table of both settings
			III	only in the gene_p_table of setting 2
			IV	in the p_gene_table of setting 2 but in neither gene_p_table
		From each category self.no_of_samples ids are sampled and written by
		output_p_gene_id_list(); a directory "cat<N>" is created in the current
		working directory for each category if it does not exist yet.

		self.bit[0] / self.bit[1] being '1' tells whether setting 1 / setting 2
		has a linear model; only then are its coefficients loaded and written.

		The output file is closed before returning, also when an error occurs.

		10-17-05
			bit control whether that setting has linear model
		"""
		schema_instance1 = form_schema_tables(self.fname1, self.acc_cutoff1, self.lm_bit1)
		schema_instance2 = form_schema_tables(self.fname2, self.acc_cutoff2, self.lm_bit2)
		schema_instance_list = [schema_instance1, schema_instance2]
		(conn, curs) =  db_connect(self.hostname, self.dbname, self.schema)
		p_gene_id_set1 = p_gene_id_set_from_gene_p_table(curs, schema_instance1.gene_p_table)
		p_gene_id_set2 = p_gene_id_set_from_gene_p_table(curs, schema_instance2.gene_p_table)
		p_gene_id_set_total = p_gene_id_set_from_gene_p_table(curs, schema_instance2.p_gene_table)

		catI_set = p_gene_id_set1 - p_gene_id_set2
		catII_set = p_gene_id_set1 & p_gene_id_set2
		catIII_set = p_gene_id_set2 - p_gene_id_set1
		catIV_set = p_gene_id_set_total-(p_gene_id_set1|p_gene_id_set2)

		sample_ls_ls = [self.sample_p_gene_id_set(p_gene_id_set, self.no_of_samples) \
			for p_gene_id_set in [catI_set, catII_set, catIII_set, catIV_set]]

		outf = open(self.ofname, 'w')
		try:	#keep the handle so the report is flushed and closed deterministically
			writer = csv.writer(outf, delimiter = '\t')
			writer.writerow(['linear model coeffs of two settings'])
			writer.writerow([])
			writer.writerow(['No.','intercept', 'coeff1', 'coeff2', 'coeff3', 'coeff4', 'coeff5', 'intercept_p_value',\
				'coeff1_p_value', 'coeff2_p_value', 'coeff3_p_value', 'coeff4_p_value', 'coeff5_p_value',\
				'score_cut_off'])

			#fetch linear model coefficients
			pga_instance_list = [None, None]	#10-17-05 default is nothing, none of them have linear model
			for setting_index, schema_instance in enumerate(schema_instance_list):
				if self.bit[setting_index] == '1':
					pga_instance = p_gene_analysis()
					pga_instance.go_no2lm_results, lm_results_2d_list = pga_instance.get_go_no2lm_results(curs, schema_instance.lm_table)
					pga_instance.general_lm_results = pga_instance.get_general_lm_results(lm_results_2d_list)
					pga_instance_list[setting_index] = pga_instance
					self.output_lm_model(curs, schema_instance, writer)

			#following is for drawing graph in output_p_gene_id_list()
			self.gene_no2gene_id = get_gene_no2gene_id(curs)
			self.gene_no2go_no = get_gene_no2go_no(curs)

			cluster_info_instance = cluster_info()

			for i, sample_ls in enumerate(sample_ls_ls):
				cat_no = i+1
				sys.stderr.write("Category %s...\n"%cat_no)
				writer.writerow(['Category %s'%cat_no])
				writer.writerow([self.category_no2information[cat_no]])
				cat_dir = 'cat%s'%cat_no
				if not os.path.isdir(cat_dir):
					os.makedirs(cat_dir)
				if i==0:	#this is different, prediction only in schema_instance1, so swap it
					first, second = 1, 0
				else:
					first, second = 0, 1
				self.output_p_gene_id_list(curs, schema_instance_list[first], schema_instance_list[second], sample_ls, writer, cat_dir, \
					pga_instance_list[first], pga_instance_list[second], cluster_info_instance, self.simple)
				sys.stderr.write("End Category %s.\n"%cat_no)
		finally:
			outf.close()