Beispiel #1
0
	def core_from_files(self, curs):
		"""
		Run the leave-one-out gene statistics over rows read from a file.

		Every non-empty line of the tab-delimited file at self.dir_files is
		parsed (columns 0 and 1 become int, columns 3 and 4 become float),
		extended with the recurrence_array and vertex_set fetched from
		self.mcl_table for the row's mcl_id (column 0), and then handed to
		self._gene_stat_leave_one_out().

		curs: database cursor used for the per-row lookup; it is also passed
			on to _gene_stat_leave_one_out().

		Raises IndexError when a row has fewer than five columns or when its
		mcl_id has no match in self.mcl_table, and ValueError when a numeric
		column cannot be converted.

		05-19-05
			It's outdated compared with core(). So update it.
			Reads from a single file now; previously every file found in the
			directory self.dir_files was chained together with fileinput.
		08-13-05
			row[4] is unknown_gene_ratio
		"""
		sys.stderr.write("Starting gene-stat...\n")
		from gene_p_map_redundancy import gene_p_map_redundancy
		node_distance_class = gene_p_map_redundancy()
		self.files = open(self.dir_files, 'r')
		try:
			#wrap it with a reader
			self.reader = csv.reader(self.files, delimiter='\t')
			for row in self.reader:
				if not row:
					#csv.reader yields [] for a blank line; skip it rather than crash on row[0]
					continue
				row[0] = int(row[0])
				row[1] = int(row[1])
				row[3] = float(row[3])
				row[4] = float(row[4])
				#row[0] is already an int here, so %d formats it safely
				curs.execute("select recurrence_array, vertex_set from %s where mcl_id=%d"%(self.mcl_table, row[0]) )
				rows = curs.fetchall()
				#first append the recurrence_array
				row.append(rows[0][0])
				#second append the vertex_set
				row.append(rows[0][1])
				#only leave_one_out
				self._gene_stat_leave_one_out(row, node_distance_class, curs)

				#self.no_of_records is presumably advanced by _gene_stat_leave_one_out -- TODO confirm
				if self.report and self.no_of_records%2000==0:
					sys.stderr.write('%s%s'%('\x08'*20, self.no_of_records))
		finally:
			#release the file handle on success and on error alike
			self.files.close()
		if self.report:
			sys.stderr.write('%s%s'%('\x08'*20, self.no_of_records))
		sys.stderr.write("Done.\n")
Beispiel #2
0
	def core(self, curs):
		"""
		The central function of the class.

		Declares a database cursor named crs over the rows to analyse, then
		pulls them 5000 at a time and passes every row to the gene-stat
		routine selected by self.leave_one_out. The running value of
		self.no_of_records is echoed to stderr after each batch when
		self.report is set.

		curs: database cursor used to declare and fetch from crs; it is also
			passed on to _gene_stat_leave_one_out().

		03-14-05
			load go_no2distance on demand
		"""
		sys.stderr.write("Starting gene-stat...\n")
		from gene_p_map_redundancy import gene_p_map_redundancy
		node_distance_class = gene_p_map_redundancy()
		if not self.leave_one_out:
			#without leave_one_out the data comes only from the mcl_result-like table
			declare_stmt = "DECLARE crs CURSOR FOR select mcl_id, vertex_set, p_value_min, go_no_vector, unknown_gene_ratio, \
				recurrence_array from %s where connectivity>=%f and p_value_min notnull and array_upper(recurrence_array, 1)>=%d\
				and array_upper(vertex_set, 1)<=%d"%(self.mcl_table, self.connectivity_cut_off, self.recurrence_cut_off, self.cluster_size_cut_off)
		else:
			#leave_one_out joins the cluster_stat-like table with the mcl_result-like table
			declare_stmt = "DECLARE crs CURSOR FOR select c.mcl_id, c.leave_one_out, c.p_value_vector, \
				 c.connectivity, m.recurrence_array, m.vertex_set from %s c, %s m where c.mcl_id=m.mcl_id"%(self.table, self.mcl_table)
		curs.execute(declare_stmt)
		
		while True:
			curs.execute("fetch 5000 from crs")
			batch = curs.fetchall()
			if not batch:
				#an empty batch means the cursor is exhausted
				break
			for record in batch:
				if self.leave_one_out:
					#one row predicts the function of a single gene
					self._gene_stat_leave_one_out(record, node_distance_class, curs)
				else:
					#one row predicts the function of every vertex in the cluster
					self._gene_stat_no_leave_one_out(record)
			if self.report:
				sys.stderr.write('%s%s'%('\x08'*20, self.no_of_records))
		sys.stderr.write("Done.\n")
Beispiel #3
0
	def return_go_no_map(self, go_no_list, curs, distance_table):
		"""
		Map every go_no in go_no_list onto a representative go_no.

		Walking the list in order, a go_no that has not been mapped yet
		becomes its own representative; every later go_no whose
		jasmine_distance to it is 0 (parent-child) is then mapped to it.

		go_no_list: a list of go_nos
		curs: used to get go_no2term_id and the nodes' pairwise distance
		distance_table: passed through to get_distance()

		Returns the dictionary go_no -> representative go_no.

		03-06-05
		"""
		sys.stderr.write("Mapping go_nos...")
		from gene_p_map_redundancy import gene_p_map_redundancy
		from codense.common import get_go_no2term_id
		borrowed_instance = gene_p_map_redundancy()
		go_no_map = {}
		go_no2term_id = get_go_no2term_id(curs)
		#distance cache keyed by the ordered pair (smaller go_no, larger go_no)
		go_no2distance = {}
		
		total = len(go_no_list)
		for position, representative in enumerate(go_no_list):
			if representative in go_no_map:
				#already flagged by an earlier representative
				continue
			go_no_map[representative] = representative
			for later_position in range(position+1, total):
				candidate = go_no_list[later_position]
				if not representative < candidate:
					pair = (candidate, representative)
				else:
					pair = (representative, candidate)
				if pair not in go_no2distance:
					jasmine_distance = borrowed_instance.get_distance(curs, representative, candidate, distance_table, go_no2distance, go_no2term_id)
				else:
					#index 2 of the cached entry holds the jasmine_distance
					jasmine_distance = go_no2distance[pair][2]
				if jasmine_distance == 0:
					#jasmine_distance=0 means they are parent-child
					go_no_map[candidate] = representative
		sys.stderr.write("done.\n")
		return go_no_map	
Beispiel #4
0
	def run(self):
		"""
		Entry point executed by every MPI process; the rank of the process in
		the duplicated world communicator decides which role it plays.

		Roles, as assigned by the rank tests in the body:
			rank 0 (input node): loads gene_no2go_no, go_no2depth and
				go_no2gene_no_set from the database and sends them pickled to
				the computing nodes; afterwards opens self.input_fname as a
				tab-delimited csv reader and passes it to input_node().
			ranks 1 .. size-3 (computing nodes): receive the three
				dictionaries from rank 0 and go_no2edge_counter_list from rank
				size-1, pick the recurrence functor, then build a
				GradientScorePrediction instance and call computing_node().
			rank size-2 (judge node): creates a gene_stat instance and a
				gene_p_map_redundancy instance, then calls self.judge_node().
			rank size-1 (output node): loads or computes
				go_no2edge_counter_list and sends it to the computing nodes;
				afterwards opens self.jnput_fname as a tab-delimited csv
				writer and passes it to output_node().

		All ranks meet at mpi_synchronize() between the set-up phase and the
		processing phase.

		The call tree below is the original author's note; some of the calls
		it lists (form_schema_tables, MpiPredictionFilter) now appear only in
		the disabled string blocks dated 01-02-06.

		09-05-05
		10-23-05
			create views from old schema
			result goes to the new schema's p_gene_table
		
			(input_node)
				--db_connect()
				--form_schema_tables()
				--form_schema_tables()
				--get_gene_no2go_no_set()
				--get_go_no2depth()
				(pass data to computing_node)
			(computing_node)
				(take data from other nodes, 0 and size-1)
			(judge_node)
				--gene_stat()
				--db_connect()
				--gene_p_map_redundancy()
			(output_node)
				--db_connect()
				--form_schema_tables()
				--form_schema_tables()
				--MpiPredictionFilter()
				--MpiPredictionFilter_instance.createGeneTable()
				--get_go_no2edge_counter_list()(if necessary)
				(pass go_no2edge_counter_list to computing_node)
			
			(input_node)
				--fetch_cluster_block()
			(computing_node)
				--get_no_of_unknown_genes()
				--node_fire_handler()
				--cleanup_handler()
			--judge_node()
				--gene_stat_instance.(match functions)
			--output_node()
				--output_node_handler()
					--MpiPredictionFilter_instance.submit_to_p_gene_table()
		"""
		communicator = MPI.world.duplicate()
		node_rank = communicator.rank
		#---- phase 1: per-role set-up and distribution of the shared dictionaries ----
		if node_rank == 0:
			#input node
			(conn, curs) =  db_connect(self.hostname, self.dbname, self.schema)
			#the bare string below is disabled code kept by the original author
			"""
			#01-02-06
			old_schema_instance = form_schema_tables(self.input_fname)
			new_schema_instance = form_schema_tables(self.jnput_fname)
			"""
			gene_no2go_no = get_gene_no2go_no_set(curs)
			gene_no2go_no_pickle = cPickle.dumps(gene_no2go_no, -1)	#-1 means use the highest protocol
			go_no2depth = get_go_no2depth(curs)
			go_no2depth_pickle = cPickle.dumps(go_no2depth, -1)
			go_no2gene_no_set = get_go_no2gene_no_set(curs)
			go_no2gene_no_set_pickle = cPickle.dumps(go_no2gene_no_set, -1)
			#range(1, size-2) covers ranks 1 .. size-3, i.e. exactly the computing nodes
			for node in range(1, communicator.size-2):	#send it to the computing_node
				communicator.send(gene_no2go_no_pickle, node, 0)
				communicator.send(go_no2depth_pickle, node, 0)
				communicator.send(go_no2gene_no_set_pickle, node, 0)
		elif node_rank<=communicator.size-3:	#WATCH: last 2 nodes are not here.
			#computing node: the three receives below are in the same order as rank 0's sends
			data, source, tag = communicator.receiveString(0, 0)
			gene_no2go_no = cPickle.loads(data)	#take the data
			data, source, tag = communicator.receiveString(0, 0)
			go_no2depth = cPickle.loads(data)
			data, source, tag = communicator.receiveString(0, 0)
			go_no2gene_no_set = cPickle.loads(data)
			data, source, tag = communicator.receiveString(communicator.size-1, 0)	#from the last node
			go_no2edge_counter_list = cPickle.loads(data)
			#choose a functor for recurrence_array
			#type 0: no functor; type 1: 1 if x>=self.recurrence_x else 0; type 2: x raised to self.recurrence_x
			#any other self.recurrence_x_type raises KeyError on the lookup below
			functor_dict = {0: None,
				1: lambda x: int(x>=self.recurrence_x),
				2: lambda x: math.pow(x, self.recurrence_x)}
			functor = functor_dict[self.recurrence_x_type]
		elif node_rank == communicator.size-2:	#judge node
			gene_stat_instance = gene_stat(depth_cut_off=self.depth)
			(conn, curs) =  db_connect(self.hostname, self.dbname, self.schema)
			gene_stat_instance.dstruc_loadin(curs)
			from gene_p_map_redundancy import gene_p_map_redundancy
			node_distance_class = gene_p_map_redundancy()			
		elif node_rank==communicator.size-1:	#establish connection before pursuing
			#output node
			(conn, curs) =  db_connect(self.hostname, self.dbname, self.schema)
			#the bare string below is disabled code kept by the original author
			"""
			#01-02-06, input and output are all directed to files
			old_schema_instance = form_schema_tables(self.input_fname)
			new_schema_instance = form_schema_tables(self.jnput_fname)
			MpiPredictionFilter_instance = MpiPredictionFilter()
			MpiPredictionFilter_instance.view_from_table(curs, old_schema_instance.splat_table, new_schema_instance.splat_table)
			MpiPredictionFilter_instance.view_from_table(curs, old_schema_instance.mcl_table, new_schema_instance.mcl_table)
			MpiPredictionFilter_instance.view_from_table(curs, old_schema_instance.pattern_table, new_schema_instance.pattern_table)
			if self.new_table:
				MpiPredictionFilter_instance.createGeneTable(curs, new_schema_instance.p_gene_table)
			"""
			if self.go_no2edge_counter_list_fname:
				#a pre-computed pickle file takes precedence over the database
				#NOTE(review): the file is opened in text mode and never closed explicitly
				go_no2edge_counter_list = cPickle.load(open(self.go_no2edge_counter_list_fname,'r'))
			else:
				if self.eg_d_type==2:
					#with eg_d_type 2 no list is built; None is pickled and sent instead
					go_no2edge_counter_list = None
				else:
					gene_no2go_no = get_gene_no2go_no_set(curs)
					go_no2edge_counter_list = get_go_no2edge_counter_list(curs, gene_no2go_no, self.edge_type2index)
			go_no2edge_counter_list_pickle = cPickle.dumps(go_no2edge_counter_list, -1)
			for node in range(1, communicator.size-2):	#send it to the computing_node
				communicator.send(go_no2edge_counter_list_pickle, node, 0)
		
		#every rank calls this before the processing phase starts
		mpi_synchronize(communicator)
		
		#---- phase 2: processing ----
		#ranks 1 .. size-3; this excludes the judge node (size-2) as well as the last node
		free_computing_nodes = range(1,communicator.size-2)	#exclude the last node
		if node_rank == 0:
			#the bare string below is the disabled database cursor that the file reader replaced
			"""
			curs.execute("DECLARE crs CURSOR FOR SELECT id, vertex_set, edge_set, no_of_edges,\
			connectivity, unknown_gene_ratio, recurrence_array, d_matrix from %s"%(old_schema_instance.pattern_table))
			"""
			self.counter = 0	#01-02-06 counter is used as id
			reader = csv.reader(open(self.input_fname, 'r'), delimiter='\t')
			parameter_list = [reader]
			input_node(communicator, parameter_list, free_computing_nodes, self.message_size, \
				self.report, input_handler=self.input_handler)
			#NOTE(review): the underlying file is never closed explicitly; only the reader reference is dropped
			del reader
		elif node_rank in free_computing_nodes:
			no_of_unknown_genes = get_no_of_unknown_genes(gene_no2go_no)
			GradientScorePrediction_instance = GradientScorePrediction(gene_no2go_no, go_no2gene_no_set, go_no2depth, \
				go_no2edge_counter_list, no_of_unknown_genes, self.depth, self.min_layer1_associated_genes, \
				self.min_layer1_ratio, self.min_layer2_associated_genes, self.min_layer2_ratio, self.exponent, \
				self.score_list, self.max_layer, self.norm_exp, self.eg_d_type, self.debug)
			#functor was chosen in phase 1 and may be None (recurrence_x_type 0)
			parameter_list = [GradientScorePrediction_instance, functor]
			computing_node(communicator, parameter_list, self.node_fire_handler, self.cleanup_handler, self.report)
		elif node_rank == communicator.size-2:
			self.judge_node(communicator, curs, gene_stat_instance, node_distance_class)
		elif node_rank==communicator.size-1:
			#01-02-06 output goes to plain file, not database
			writer = csv.writer(open(self.jnput_fname, 'w'), delimiter='\t')
			parameter_list = [writer]
			output_node(communicator, free_computing_nodes, parameter_list, self.output_node_handler, self.report)
			#NOTE(review): the output file is never closed explicitly; only the writer reference is dropped
			del writer
Beispiel #5
0
	def output(self, curs, gene_no2go_id_set_list, go_id_set_list, support, prefix, gene_no2id, go_id2name, schema_list):
		"""
		Write the genes and the functions that occur in exactly `support`
		schemas to two tab-delimited files, '<prefix>.gene' and
		'<prefix>.function'.

		curs: database cursor used to load go_id2term_id and
			go_term_id2depth, and passed to p_go_id_set_list_distinct()
		gene_no2go_id_set_list: one dictionary gene_no -> go_id set per schema
		go_id_set_list: one go_id set per schema, parallel to the list above
		support: the exact number of schemas a gene (or go_id) must occur in
			to be written out
		prefix: path prefix of the two output files
		gene_no2id: dictionary gene_no -> identifier used as the row label
		go_id2name: dictionary go_id -> name
		schema_list: column headers; both files start with [''] + schema_list

		A gene row is written only when self.p_go_id_set_list_distinct()
		returns true for the go_id sets collected for that gene. Its cells
		hold the ';'-joined names of the go_ids predicted in each schema, or
		'' for a schema that lacks the gene. A function row is labelled
		'name(go_id)' and holds '1' or '0' per schema.

		Both output files are closed before returning, also when an error
		occurs part-way through.

		07-06-05
		"""
		sys.stderr.write("Outputing...")
		
		#get the total set
		total_gene_no_set = Set()
		total_go_id_set = Set()
		for i in range(len(gene_no2go_id_set_list)):
			total_gene_no_set |= Set(gene_no2go_id_set_list[i].keys())
			total_go_id_set |= go_id_set_list[i]
		#emits the same text the former print statement did (note the two spaces)
		sys.stdout.write("the total number of genes is  %s\n"%len(total_gene_no_set))
		gene_ofname = '%s.gene'%prefix
		function_ofname = '%s.function'%prefix
		
		#every handle opened below is registered here so that the finally clause can close it
		opened_files = []
		try:
			gene_outf = open(gene_ofname,'w')
			opened_files.append(gene_outf)
			function_outf = open(function_ofname, 'w')
			opened_files.append(function_outf)
			gene_writer = csv.writer(gene_outf, delimiter='\t')
			function_writer = csv.writer(function_outf, delimiter='\t')
			gene_writer.writerow(['']+schema_list)
			function_writer.writerow([''] + schema_list)
			
			from gene_p_map_redundancy import gene_p_map_redundancy
			node_distance_class = gene_p_map_redundancy()
			
			go_id2term_id = get_go_id2term_id(curs)
			go_term_id2depth = get_go_term_id2depth(curs)
			
			#output the gene-oriented information
			for gene_no in total_gene_no_set:
				p_go_id_set_list = []
				for gene_no2go_id_set in gene_no2go_id_set_list:
					if gene_no in gene_no2go_id_set:
						p_go_id_set_list.append(gene_no2go_id_set[gene_no])
				#the number of collected sets is the number of schemas containing the gene
				if len(p_go_id_set_list) != support:
					continue
				if not self.p_go_id_set_list_distinct(curs, p_go_id_set_list, node_distance_class, go_term_id2depth, go_id2term_id):
					continue
				row = [gene_no2id[gene_no]]
				for gene_no2go_id_set in gene_no2go_id_set_list:
					if gene_no in gene_no2go_id_set:
						go_name_list = dict_map(go_id2name, gene_no2go_id_set[gene_no])
						row.append(';'.join(go_name_list))
					else:
						row.append('')
				gene_writer.writerow(row)
			
			#output the function_oriented information
			for go_id in total_go_id_set:
				presence = []
				freq = 0
				for go_id_set in go_id_set_list:
					if go_id in go_id_set:
						presence.append('1')
						freq += 1
					else:
						presence.append('0')
				if freq == support:
					function_writer.writerow(['%s(%s)'%(go_id2name[go_id],go_id)] + presence)
		finally:
			#flush and release the output files whether or not an error occurred
			for outf in opened_files:
				outf.close()
		
		sys.stderr.write("Done.\n")