Ejemplo n.º 1
0
	def dstruc_loadin(self, curs):
		'''
		Load the lookup dictionaries used by this object from the database.

		curs: an open database cursor. It is passed to the codense.common
			helper functions and used directly for the query issued when
			self.type is 3.

		Attributes set:
			self.go_no2go_id, self.go_no2go_name, self.gene_no2gene_id,
			self.gene_id2gene_no, self.global_gene_to_go_dict: results of the
				corresponding codense.common get_*() helpers.
			self.label_dict: {1: self.gene_no2gene_id, 2: identity mapping over
				the keys of self.gene_no2gene_id}.
			self.no_of_datasets: only when self.type is 3; the upper bound of
				recurrence_array taken from the first row of self.table.

		Raises IndexError if self.type is 3 and self.table returns no row.
		'''
		sys.stderr.write("Loading Data STructure...\n")
		
		from codense.common import get_go_no2go_id, get_gene_no2gene_id, get_go_no2name, get_gene_id2gene_no, get_gene_no2go_no
		self.go_no2go_id = get_go_no2go_id(curs)
		self.go_no2go_name = get_go_no2name(curs)
		self.gene_no2gene_id = get_gene_no2gene_id(curs)
		self.gene_id2gene_no = get_gene_id2gene_no(curs)
		self.global_gene_to_go_dict = get_gene_no2go_no(curs)
		
		#04-01-05 the second kind in label_dict: every gene_no labels itself
		gene_no2no = {}
		for gene_no in self.gene_no2gene_id:
			gene_no2no[gene_no] = gene_no
		self.label_dict = {1:self.gene_no2gene_id,
			2: gene_no2no}
		
		if self.type == 3:
			#the table name cannot be bound as a query parameter, so it is
			#interpolated; self.table must come from a trusted source
			curs.execute("select array_upper(recurrence_array,1) from %s limit 1"%self.table)
			rows = curs.fetchall()
			self.no_of_datasets = int(rows[0][0])
			
		sys.stderr.write("Done\n")
Ejemplo n.º 2
0
	def run(self):
		"""
		Entry point executed by every MPI rank; the role depends on the rank.

		rank 0 (input node):
			connects to the database, builds gene2enc_array from
			self.gim_inputfname and sends it (pickled) to every computing node,
			sends gene_no2id and gene_no2go_no (pickled) to the last rank, then
			declares the server-side cursor 'crs' and hands it to input_node().
		ranks 1 .. size-2 (computing nodes):
			receive gene2enc_array and run computing_node() with
			self.computing_node_handler.
		rank size-1 (output node):
			receives gene_no2id and gene_no2go_no, makes sure
			self.pic_output_dir exists, opens a tab-delimited output file in it
			and runs output_node() with self.output_node_handler.

		With fewer than three ranks free_computing_nodes is empty, i.e. there
		is no computing node.
		"""
		communicator = MPI.world.duplicate()
		node_rank = communicator.rank
		free_computing_nodes = range(1,communicator.size-1)	#exclude the last node
		if node_rank == 0:
			(conn, curs) =  db_connect(self.hostname, self.dbname, self.schema)
			schema_instance = form_schema_tables(self.fname, self.acc_cutoff, self.lm_bit)
			gene_id2no = get_gene_id2gene_no(curs)
			gene2enc_array = self.get_gene2enc_array(self.gim_inputfname, gene_id2no)
			gene2enc_array_pickle = cPickle.dumps(gene2enc_array, -1)
			
			gene_no2id = get_gene_no2gene_id(curs)
			gene_no2go_no = get_gene_no2go_no(curs)
			gene_no2id_pickle = cPickle.dumps(gene_no2id, -1)
			gene_no2go_no_pickle = cPickle.dumps(gene_no2go_no, -1)
			for node in free_computing_nodes:	#send it to the computing_node
				communicator.send(gene2enc_array_pickle, node, 0)
			
			#the output node expects exactly these two messages, in this order
			communicator.send(gene_no2id_pickle, communicator.size-1, 0)
			communicator.send(gene_no2go_no_pickle, communicator.size-1, 0)
		elif node_rank in free_computing_nodes:
			#NOTE: unpickling is only acceptable here because the data comes from rank 0 of the same job
			data, source, tag = communicator.receiveString(0, 0)
			gene2enc_array = cPickle.loads(data)	#take the data
		elif node_rank==communicator.size-1:
			schema_instance = form_schema_tables(self.fname, self.acc_cutoff, self.lm_bit)
			data, source, tag = communicator.receiveString(0, 0)
			gene_no2id = cPickle.loads(data)
			data, source, tag = communicator.receiveString(0, 0)
			gene_no2go_no = cPickle.loads(data)
			
		mpi_synchronize(communicator)
		if node_rank == 0:
			curs.execute("DECLARE crs CURSOR FOR SELECT p.id, p.vertex_set, p.edge_set, p.recurrence_array,\
			g.go_no_list from %s p, %s g where g.mcl_id=p.id"%(schema_instance.pattern_table, schema_instance.good_cluster_table))
			input_node(communicator, curs, free_computing_nodes, self.message_size, self.report)
		elif node_rank in free_computing_nodes:
			parameter_list = [gene2enc_array, self.dataset_signature_set, self.p_value_cut_off]
			computing_node(communicator, parameter_list, self.computing_node_handler, report=self.report)
		elif node_rank==communicator.size-1:
			if not os.path.isdir(self.pic_output_dir):
				os.makedirs(self.pic_output_dir)
			cluster_info_instance = cluster_info()
			ofname = os.path.join(self.pic_output_dir, '%s_p%s'%(schema_instance.good_cluster_table, self.p_value_cut_off))
			outf = open(ofname, 'w')
			try:
				writer = csv.writer(outf, delimiter='\t')
				parameter_list = [self.pic_output_dir, cluster_info_instance, gene_no2id, gene_no2go_no, writer]
				output_node(communicator, free_computing_nodes, parameter_list, self.output_node_handler, self.report)
			finally:
				#close explicitly so buffered rows are flushed even if output_node() raises;
				#deleting the writer would not do it, parameter_list still references it
				outf.close()
Ejemplo n.º 3
0
	def parse_cluster_fname(self, curs, cluster_fname, gim_inputfname, cluster_id_set, schema_instance):
		"""
		01-24-06
			a lot of analogy to codense2db.py's run()

		Parse the selected rows of cluster_fname and submit the resulting
		clusters to the database.

		curs: database cursor used to create tables, look up genes and submit clusters.
		cluster_fname: tab-delimited cluster file; the 1-based line number of a
			row is its cluster_id.
		gim_inputfname: file handed to get_gene_no2incidence_array().
		cluster_id_set: line numbers (cluster ids) of the rows to parse.
		schema_instance: supplies the splat_table, mcl_table and pattern_table names.

		Returns a dictionary
			cluster_id -> [connectivity, unknown_gene_ratio, vertex_set].
		If one row yields several clusters, the last one wins, because they
		all share the row's cluster_id.

		Reading stops as soon as every row listed in cluster_id_set has been seen.
		"""
		sys.stderr.write("Parsing cluster_fname: %s ...\n"%os.path.basename(cluster_fname))
		codense2db_instance  = codense2db()
		codense2db_instance.create_tables(curs, schema_instance.splat_table, \
			schema_instance.mcl_table, schema_instance.pattern_table)
		gene_id2gene_no = get_gene_id2gene_no(curs)
		gene_no2incidence_array = get_gene_no2incidence_array(gim_inputfname, gene_id2gene_no)
		known_gene_no2go_no_set = get_known_genes_dict(curs)
		counter = 0	#rows read so far, i.e. the current line number
		real_counter = 0	#clusters submitted so far (only used for reporting)
		no_of_matched_rows = 0	#rows whose line number is in cluster_id_set
		no_of_wanted_rows = len(cluster_id_set)
		cluster_id2properties = {}	#additional properties for prediction_pair2instance
		inf = open(cluster_fname, 'r')
		try:
			reader = csv.reader(inf, delimiter='\t')
			for row in reader:
				counter += 1
				#only those who are in cluster_id_set
				if counter in cluster_id_set:	#cluster_id starts from 1
					no_of_matched_rows += 1
					cluster_list = codense2db_instance.fimbfs_parser(row, gene_no2incidence_array, curs)
					for cluster in cluster_list:
						real_counter += 1
						cluster.unknown_gene_ratio = codense2db_instance.calculate_unknown_gene_ratio(cluster.vertex_set, \
							known_gene_no2go_no_set)
						cluster.cluster_id = counter	#line number is the cluster_id
						codense2db_instance.db_submit(curs, cluster, schema_instance.pattern_table)
						
						cluster_id2properties[cluster.cluster_id] = [cluster.connectivity, cluster.unknown_gene_ratio, cluster.vertex_set]
				#compare rows, not clusters: a row may yield zero or several clusters,
				#so the number of clusters says nothing about how many wanted rows were seen
				if no_of_matched_rows==no_of_wanted_rows:
					#all relevant clusters have been got, ignore remaining clusters
					break
				if self.report and counter%2000==0:
					sys.stderr.write("%s%s/%s"%('\x08'*20, counter, real_counter))
		finally:
			#release the file handle even if parsing or db submission raises
			inf.close()
		if self.report:
			sys.stderr.write("%s%s/%s"%('\x08'*20, counter, real_counter))
		sys.stderr.write("Done.\n")
		return cluster_id2properties