Example 1
	def run(self):
		communicator = MPI.world.duplicate()
		node_rank = communicator.rank
		free_computing_nodes = range(1,communicator.size-1)	#exclude the last node
		if node_rank == 0:
			(conn, curs) =  db_connect(self.hostname, self.dbname, self.schema)
			schema_instance = form_schema_tables(self.fname, self.acc_cutoff, self.lm_bit)
			gene_id2no = get_gene_id2gene_no(curs)
			gene2enc_array = self.get_gene2enc_array(self.gim_inputfname, gene_id2no)
			gene2enc_array_pickle = cPickle.dumps(gene2enc_array, -1)
			
			gene_no2id = get_gene_no2gene_id(curs)
			gene_no2go_no = get_gene_no2go_no(curs)
			gene_no2id_pickle = cPickle.dumps(gene_no2id, -1)
			gene_no2go_no_pickle = cPickle.dumps(gene_no2go_no, -1)
			for node in free_computing_nodes:	#send it to the computing_node
				communicator.send(gene2enc_array_pickle, node, 0)
			
			communicator.send(gene_no2id_pickle, communicator.size-1, 0)
			communicator.send(gene_no2go_no_pickle, communicator.size-1, 0)
		elif node_rank in free_computing_nodes:
			data, source, tag = communicator.receiveString(0, 0)
			gene2enc_array = cPickle.loads(data)	#take the data
		elif node_rank==communicator.size-1:
			schema_instance = form_schema_tables(self.fname, self.acc_cutoff, self.lm_bit)
			data, source, tag = communicator.receiveString(0, 0)
			gene_no2id = cPickle.loads(data)
			data, source, tag = communicator.receiveString(0, 0)
			gene_no2go_no = cPickle.loads(data)
			
		mpi_synchronize(communicator)
		if node_rank == 0:
			curs.execute("DECLARE crs CURSOR FOR SELECT p.id, p.vertex_set, p.edge_set, p.recurrence_array,\
			g.go_no_list from %s p, %s g where g.mcl_id=p.id"%(schema_instance.pattern_table, schema_instance.good_cluster_table))
			input_node(communicator, curs, free_computing_nodes, self.message_size, self.report)
		elif node_rank in free_computing_nodes:
			parameter_list = [gene2enc_array, self.dataset_signature_set, self.p_value_cut_off]
			computing_node(communicator, parameter_list, self.computing_node_handler, report=self.report)
		elif node_rank==communicator.size-1:
			if not os.path.isdir(self.pic_output_dir):
				os.makedirs(self.pic_output_dir)
			cluster_info_instance = cluster_info()
			ofname = os.path.join(self.pic_output_dir, '%s_p%s'%(schema_instance.good_cluster_table, self.p_value_cut_off))
			writer = csv.writer(open(ofname, 'w'), delimiter='\t')
			parameter_list = [self.pic_output_dir, cluster_info_instance, gene_no2id, gene_no2go_no, writer]
			output_node(communicator, free_computing_nodes, parameter_list, self.output_node_handler, self.report)
			del writer
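Example 1 splits the MPI ranks into three roles: rank 0 reads from the database and feeds patterns, the middle ranks compute, and the last rank writes the output file. The snippet uses the older Scientific Python MPI API (MPI.world.duplicate(), receiveString). Below is a minimal sketch of the same three-role layout written against mpi4py instead; the payloads and tags are made up for illustration, and the real handlers (computing_node_handler, output_node_handler) are not reproduced.

# Run with e.g. "mpiexec -n 4 python sketch.py"; needs at least 3 ranks to exercise all roles.
from mpi4py import MPI

def run():
    comm = MPI.COMM_WORLD
    rank, size = comm.rank, comm.size
    workers = list(range(1, size - 1))        # every rank between the input and output node
    if rank == 0:                             # input node
        shared = {'note': 'stand-in for gene2enc_array and friends'}
        for w in workers:
            comm.send(shared, dest=w, tag=0)  # mpi4py pickles Python objects automatically
        for w in workers:
            comm.send(None, dest=w, tag=1)    # sentinel: no more work
    elif rank == size - 1:                    # output node: wait until every worker reports done
        done = 0
        while done < len(workers):
            if comm.recv(source=MPI.ANY_SOURCE, tag=1) is None:
                done += 1
    else:                                     # computing node
        shared = comm.recv(source=0, tag=0)
        task = comm.recv(source=0, tag=1)     # a real worker would loop over incoming tasks here
        comm.send(None, dest=size - 1, tag=1) # report "finished" to the output node

if __name__ == '__main__':
    run()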
Example 2
	def run(self):
		"""
		11-09-05
		11-09-05 add rpart_cp
		11-10-05 add need_cal_hg_p_value
		
			--db_connect()
			--form_schema_tables()
			--form_schema_tables()
			--get_no_of_total_genes()
			--get_go_no2gene_no_set()
			--data_fetch()
				--get_vertex_list()
				--cal_hg_p_value()
			--rpart_fit_and_predict()
			--MpiPredictionFilter_instance....()
			--record_data()
		"""
		(conn, curs) =  db_connect(self.hostname, self.dbname, self.schema)
		old_schema_instance = form_schema_tables(self.fname1)
		new_schema_instance = form_schema_tables(self.fname2)
		
		no_of_total_genes = get_no_of_total_genes(curs)
		go_no2gene_no_set = get_go_no2gene_no_set(curs)
		
		prediction_ls, all_data, known_data = self.data_fetch(curs, old_schema_instance, self.filter_type, self.is_correct_type, \
			no_of_total_genes, go_no2gene_no_set, need_cal_hg_p_value)
		
		"""
		testing_acc_ls, training_acc_ls = self.rpart_validation(known_data, self.no_of_buckets, self.rpart_cp, \
			self.loss_matrix, self.prior_prob)
		print testing_acc_ls
		print training_acc_ls
		"""
		pred, pred_training = self.rpart_fit_and_predict(all_data, known_data, self.rpart_cp, self.loss_matrix, self.prior_prob)
		
		MpiPredictionFilter_instance = MpiPredictionFilter()
		MpiPredictionFilter_instance.view_from_table(curs, old_schema_instance.splat_table, new_schema_instance.splat_table)
		MpiPredictionFilter_instance.view_from_table(curs, old_schema_instance.mcl_table, new_schema_instance.mcl_table)
		MpiPredictionFilter_instance.view_from_table(curs, old_schema_instance.pattern_table, new_schema_instance.pattern_table)
		MpiPredictionFilter_instance.createGeneTable(curs, new_schema_instance.p_gene_table)
		self.record_data(curs, MpiPredictionFilter_instance, prediction_ls, pred, new_schema_instance)
		if self.commit:
			curs.execute("end")
Example 3
	def get_known_data(self, curs, fname, filter_type, is_correct_type, need_cal_hg_p_value):
		schema_instance = form_schema_tables(fname)
		
		no_of_total_genes = get_no_of_total_genes(curs)
		go_no2gene_no_set = get_go_no2gene_no_set(curs)
		
		prediction_ls, all_data, known_data = self.data_fetch(curs, schema_instance, filter_type, is_correct_type, \
			no_of_total_genes, go_no2gene_no_set, need_cal_hg_p_value)
		del prediction_ls, all_data
		return known_data
Example 4
	def run(self):
		if self.ofname and self.acc_cut_off and self.lm_bit:
			schema_instance = form_schema_tables(self.ofname, self.acc_cut_off, self.lm_bit)
			
		else:
			sys.stderr.write("ofname: %s and acc_cut_off: %s and lm_bit %s, NOT VALID\n"%(self.ofname, self.acc_cut_off, self.lm_bit))
			sys.exit(2)
		conn, curs = db_connect(self.hostname, self.dbname, self.schema)
		self._cluster_darwin_format(curs, schema_instance.good_cluster_table, self.gene_no2id, self.go_no2id, self.output_fname)
		del conn, curs
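Every example on this page starts by calling form_schema_tables(fname[, acc_cutoff, lm_bit]) and then reads table names off the returned object (splat_table, mcl_table, pattern_table, p_gene_table, gene_p_table, good_cluster_table, lm_table). The real helper lives in the surrounding codebase and is not shown here; the stand-in below only illustrates the idea of deriving a bundle of table names from a filename, and every naming rule in it is a guess.

import os

class SchemaTables(object):
    """Hypothetical stand-in for the object returned by form_schema_tables()."""
    def __init__(self, fname, acc_cutoff=None, lm_bit=None):
        base = os.path.splitext(os.path.basename(fname))[0]
        suffix = '' if acc_cutoff is None else '_%s_%s' % (lm_bit, acc_cutoff)
        self.splat_table = 'splat_%s' % base
        self.mcl_table = 'mcl_%s' % base
        self.pattern_table = 'pattern_%s' % base
        self.p_gene_table = 'p_gene_%s%s' % (base, suffix)
        self.gene_p_table = 'gene_p_%s%s' % (base, suffix)
        self.good_cluster_table = 'good_cluster_%s%s' % (base, suffix)
        self.lm_table = 'lm_%s%s' % (base, suffix)

def form_schema_tables(fname, acc_cutoff=None, lm_bit=None):
    return SchemaTables(fname, acc_cutoff, lm_bit)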
Example 5
	def run(self):
		"""
		01-24-06
			
		"""
		(conn, curs) =  db_connect(self.hostname, self.dbname, self.schema)
		schema_instance = form_schema_tables(self.jnput_fname)
		prediction_pair2instance = self.parse_stat_fname(self.stat_fname, self.filter_type)
		cluster_id_set = self.get_cluster_id_set(prediction_pair2instance)
		cluster_id2properties = self.parse_cluster_fname(curs, self.inputfile, self.gim_inputfname, cluster_id_set, schema_instance)
		self.submit_predictions(curs, schema_instance, prediction_pair2instance, cluster_id2properties)
		if self.commit:
			curs.execute("end")
Example 6
	def get_data(self, curs, fname, filter_type, is_correct_type, need_cal_hg_p_value):
		"""
		11-19-05
			data_fetch() of rpart_prediction.py changed
			return unknown_data
		"""
		schema_instance = form_schema_tables(fname)
		
		no_of_total_genes = get_no_of_total_genes(curs)
		go_no2gene_no_set = get_go_no2gene_no_set(curs)
		unknown_prediction_ls, known_prediction_ls, unknown_data, known_data = self.data_fetch(curs, schema_instance, \
			filter_type, is_correct_type, no_of_total_genes, go_no2gene_no_set, need_cal_hg_p_value)
		del unknown_prediction_ls, known_prediction_ls
		return unknown_data, known_data
Example 7
	def run(self):
		"""
		10-17-05
			bit control whether that setting has linear model
		"""
		schema_instance1 = form_schema_tables(self.fname1, self.acc_cutoff1, self.lm_bit1)
		schema_instance2 = form_schema_tables(self.fname2, self.acc_cutoff2, self.lm_bit2)
		(conn, curs) =  db_connect(self.hostname, self.dbname, self.schema)
		p_gene_id_set1 = p_gene_id_set_from_gene_p_table(curs, schema_instance1.gene_p_table)
		p_gene_id_set2 = p_gene_id_set_from_gene_p_table(curs, schema_instance2.gene_p_table)
		p_gene_id_set_total = p_gene_id_set_from_gene_p_table(curs, schema_instance2.p_gene_table)
		
		catI_set = p_gene_id_set1 - p_gene_id_set2
		catII_set = p_gene_id_set1 & p_gene_id_set2
		catIII_set = p_gene_id_set2 - p_gene_id_set1
		catIV_set = p_gene_id_set_total-(p_gene_id_set1|p_gene_id_set2)
		
		sample_ls_ls = []
		for p_gene_id_set in [catI_set, catII_set, catIII_set, catIV_set]:
			sample_ls_ls.append(self.sample_p_gene_id_set(p_gene_id_set, self.no_of_samples))
		
		writer = csv.writer(open(self.ofname, 'w'), delimiter = '\t')
		writer.writerow(['linear model coeffs of two settings'])
		writer.writerow([])
		writer.writerow(['No.','intercept', 'coeff1', 'coeff2', 'coeff3', 'coeff4', 'coeff5', 'intercept_p_value',\
			'coeff1_p_value', 'coeff2_p_value', 'coeff3_p_value', 'coeff4_p_value', 'coeff5_p_value',\
			'score_cut_off'])
		
		#fetch linear model coefficients
		pga_instance_list = [None, None]	#10-17-05 default is nothing, none of them have linear model
		if self.bit[0] == '1':
			pga_instance1 = p_gene_analysis()
			pga_instance1.go_no2lm_results, lm_results_2d_list = pga_instance1.get_go_no2lm_results(curs, schema_instance1.lm_table)
			pga_instance1.general_lm_results = pga_instance1.get_general_lm_results(lm_results_2d_list)
			pga_instance_list[0] = pga_instance1
			self.output_lm_model(curs, schema_instance1, writer)
		if self.bit[1] == '1':
			pga_instance2 = p_gene_analysis()
			pga_instance2.go_no2lm_results, lm_results_2d_list = pga_instance2.get_go_no2lm_results(curs, schema_instance2.lm_table)
			pga_instance2.general_lm_results = pga_instance2.get_general_lm_results(lm_results_2d_list)
			pga_instance_list[1] = pga_instance2
			self.output_lm_model(curs, schema_instance2, writer)
		
		#following is for drawing graph in output_p_gene_id_list()
		self.gene_no2gene_id = get_gene_no2gene_id(curs)
		self.gene_no2go_no = get_gene_no2go_no(curs)

		cluster_info_instance = cluster_info()
		
		for i in range(len(sample_ls_ls)):
			cat_no = i+1
			sys.stderr.write("Category %s...\n"%cat_no)
			writer.writerow(['Category %s'%cat_no])
			writer.writerow([self.category_no2information[cat_no]])
			cat_dir = 'cat%s'%cat_no
			if not os.path.isdir(cat_dir):
				os.makedirs(cat_dir)
			if i==0:	#this is different, prediction only in schema_instance1, so swap it
				self.output_p_gene_id_list(curs, schema_instance2, schema_instance1, sample_ls_ls[i], writer, cat_dir, \
					pga_instance_list[1], pga_instance_list[0], cluster_info_instance, self.simple)
			else:
				self.output_p_gene_id_list(curs, schema_instance1, schema_instance2, sample_ls_ls[i], writer, cat_dir, \
					pga_instance_list[0], pga_instance_list[1], cluster_info_instance, self.simple)
			sys.stderr.write("End Category %s.\n"%cat_no)
Example 8
    def run(self):
        """
		11-09-05
		11-09-05 add rpart_cp
		11-10-05 add need_cal_hg_p_value
		11-23-05
			rpart_fit_and_predict() is split
		2006-12-05
			add need_output_data_for_R flag
		
			--db_connect()
			--form_schema_tables()
			--form_schema_tables()
			--get_no_of_total_genes()
			--get_go_no2gene_no_set()
			--data_fetch()
				--get_vertex_list()
				--cal_hg_p_value()
			--output_data_for_R()
			
			--rpart_fit()
			--rpart_predict()
			--rpart_predict()
			--MpiPredictionFilter_instance....()
			--record_data()
		"""
        (conn, curs) = db_connect(self.hostname, self.dbname, self.schema)
        old_schema_instance = form_schema_tables(self.fname1)
        new_schema_instance = form_schema_tables(self.fname2)

        no_of_total_genes = get_no_of_total_genes(curs)
        go_no2gene_no_set = get_go_no2gene_no_set(curs)

        unknown_prediction_ls, known_prediction_ls, unknown_data, known_data = self.data_fetch(
            curs,
            old_schema_instance,
            self.filter_type,
            self.is_correct_type,
            no_of_total_genes,
            go_no2gene_no_set,
            need_cal_hg_p_value,
        )

        if self.need_output_data_for_R:  # 2006-12-05
            self.output_data_for_R(known_data, "%s.known" % self.fname1)
            self.output_data_for_R(unknown_data, "%s.unknown" % self.fname1)
        """
		testing_acc_ls, training_acc_ls = self.rpart_validation(known_data, self.training_perc, self.rpart_cp, \
			self.loss_matrix, self.prior_prob)
		print testing_acc_ls
		print training_acc_ls
		"""
        fit_model = self.fit_function_dict[self.type](known_data, self.parameter_list_dict[self.type], self.bit_string)
        known_pred = self.predict_function_dict[self.type](fit_model, known_data)
        unknown_pred = self.predict_function_dict[self.type](fit_model, unknown_data)

        if self.debug:
            if self.type == 2:
                # randomForest's model has its own oob prediction
                fit_model_py = fit_model.as_py(BASIC_CONVERSION)
                print self.cal_accuracy(known_data, fit_model_py["predicted"], pred_type=1)
            print self.cal_accuracy(known_data, known_pred, pred_type=self.type)
            print self.cal_accuracy(unknown_data, unknown_pred, pred_type=self.type)

        if self.commit:
            MpiPredictionFilter_instance = MpiPredictionFilter()
            MpiPredictionFilter_instance.view_from_table(
                curs, old_schema_instance.splat_table, new_schema_instance.splat_table
            )
            MpiPredictionFilter_instance.view_from_table(
                curs, old_schema_instance.mcl_table, new_schema_instance.mcl_table
            )
            MpiPredictionFilter_instance.view_from_table(
                curs, old_schema_instance.pattern_table, new_schema_instance.pattern_table
            )
            MpiPredictionFilter_instance.createGeneTable(curs, new_schema_instance.p_gene_table)
            self.record_data(
                curs,
                MpiPredictionFilter_instance,
                unknown_prediction_ls,
                unknown_pred,
                new_schema_instance,
                pred_type=self.type,
            )
            if (
                self.type == 2
            ):  # 2006-10-31 randomForest's model has its own oob prediction, but use rpart's way of storing prediction
                fit_model_py = fit_model.as_py(BASIC_CONVERSION)
                known_pred = fit_model_py["predicted"]
                self.record_data(
                    curs,
                    MpiPredictionFilter_instance,
                    known_prediction_ls,
                    known_pred,
                    new_schema_instance,
                    pred_type=1,
                )
            else:
                self.record_data(
                    curs,
                    MpiPredictionFilter_instance,
                    known_prediction_ls,
                    known_pred,
                    new_schema_instance,
                    pred_type=self.type,
                )
            curs.execute("end")
Example 9
	def run(self):
		"""
		10-31-05
		2006-09-26
			modify it to be compatible with the modified pipeline from haifeng
		2006-11-06
			add type
		2006-12-13
			use font_path and font_size
			
			--form_schema_tables()
			--db_connect()
			--get_char_dimension()
			
			--get_no_of_p_funcs_gene_no_go_no_list()
			--get_recurrence_go_no_rec_array_cluster_id_ls()
			--get_go_no2name()
			--draw_function_map()
			
			--draw_gene_function_map()

			--get_recurrence_rec_array_bs_no_list()
			--get_mt_no2tf_name()
			--draw_tf_map()
		"""
		schema_instance = form_schema_tables(self.inputfname, self.acc_cutoff, self.lm_bit)
		(conn, curs) =  db_connect(self.hostname, self.dbname, self.schema)
		font = ImageFont.truetype(self.font_path, self.font_size)
		char_dimension = font.getsize('a')
		#char_dimension = get_char_dimension()
		
		#go_no2name = get_go_no2name(curs)
		go_no2name = get_go_id2name(curs)
		if self.type==1:
			go_no2go_id = get_go_no2go_id(curs)
			given_p_gene_set = p_gene_id_set_from_gene_p_table(curs, schema_instance.gene_p_table)
			no_of_p_funcs_gene_no_go_no_list, mcl_id2go_no_set = self.get_no_of_p_funcs_gene_no_go_no_list_from_db(curs, \
				schema_instance.p_gene_table, given_p_gene_set, go_no2go_id)
		elif self.type==2:
			no_of_p_funcs_gene_no_go_no_list, mcl_id2go_no_set = self.get_no_of_p_funcs_gene_no_go_no_list_from_file(self.inputfname)
		
		
		recurrence_go_no_rec_array_cluster_id_ls, no_of_datasets, mcl_id2enc_recurrence = \
			self.get_recurrence_go_no_rec_array_cluster_id_ls(curs, self.pattern_table, mcl_id2go_no_set)
		
		no_of_functions = len(recurrence_go_no_rec_array_cluster_id_ls)
		function_map_output_fname = '%s.function_map.png'%self.output_prefix
		go_no2index, function_name_region = self.draw_function_map(recurrence_go_no_rec_array_cluster_id_ls, no_of_datasets,\
			go_no2name, function_map_output_fname, self.function_name_length, char_dimension, no_of_functions, font)				
		
		gene_function_map_output_fname = '%s.gene_function_map.png'%self.output_prefix
		self.draw_gene_function_map(no_of_p_funcs_gene_no_go_no_list, go_no2index, function_name_region,\
			gene_function_map_output_fname, self.function_name_length, char_dimension, no_of_functions, font)
		
		
		#tf_map requires mcl_id2enc_recurrence and no_of_datasets from above
		recurrence_rec_array_bs_no_list = self.get_recurrence_rec_array_bs_no_list(curs, self.cluster_bs_table, mcl_id2enc_recurrence)
		mt_no2tf_name = get_gene_id2gene_symbol(curs, tax_id=9606)
		#mt_no2tf_name = get_mt_no2tf_name()
		tf_map_output_fname = '%s.tf_map.png'%self.output_prefix
		self.draw_tf_map(recurrence_rec_array_bs_no_list, no_of_datasets, mt_no2tf_name, \
			tf_map_output_fname, self.function_name_length, char_dimension, font)
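Example 9 measures character dimensions with PIL's ImageFont before laying out the maps. Note that ImageFont.getsize() has been removed in recent Pillow releases; below is a minimal sketch using the current getbbox() API instead (the text and output filename are made up):

from PIL import Image, ImageDraw, ImageFont

font = ImageFont.load_default()
left, top, right, bottom = font.getbbox('a')
char_dimension = (right - left, bottom - top)    # width/height of one character

img = Image.new('RGB', (240, 40), 'white')
draw = ImageDraw.Draw(img)
draw.text((5, 5), 'function map', fill='black', font=font)
img.save('function_map_label.png')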
Example 10
	def run(self):
		"""
		10-05-05
		10-12-05
			use max_layer to control whether to turn on the gradient or not
		10-16-05
			transformed to MPI version
		
			if node_rank==0
				--db_connect()
				--form_schema_tables()
				--form_schema_tables()
				--get_gene_no2go_no_set()
				--get_mcl_id2accuracy()
			elif computing_node:
				(prepare data)
			elif output_node:
				--db_connect()
				--form_schema_tables()
				--form_schema_tables()
				--view_from_table()
				--view_from_table()
				--view_from_table()
				--createGeneTable()
			
			--mpi_synchronize()
			
			if input_node:
				--input_node()
					--fetch_predictions()
			elif computing_node:
				--computing_node()
					--node_fire()
						--gradient_class()
			elif output_node:
				--output_node()
					--output_node_handler()
						--submit_to_p_gene_table()
		"""		
		communicator = MPI.world.duplicate()
		node_rank = communicator.rank
		if node_rank == 0:
			(conn, curs) =  db_connect(self.hostname, self.dbname, self.schema)
			old_schema_instance = form_schema_tables(self.input_fname)
			new_schema_instance = form_schema_tables(self.jnput_fname)
			gene_no2go = get_gene_no2go_no_set(curs)
			gene_no2go_pickle = cPickle.dumps(gene_no2go, -1)	#-1 means use the highest protocol
			
			if self.max_layer:
				crs_sentence = 'DECLARE crs CURSOR FOR SELECT p.p_gene_id, p.gene_no, p.go_no, p.is_correct, p.is_correct_l1, \
				p.is_correct_lca, p.avg_p_value, p.no_of_clusters, p.cluster_array, p.p_value_cut_off, p.recurrence_cut_off, \
				p.connectivity_cut_off, p.cluster_size_cut_off, p.unknown_cut_off, p.depth_cut_off, p.mcl_id, p.lca_list, \
				p.vertex_gradient, p.edge_gradient, p2.vertex_set, p2.edge_set, p2.d_matrix, p2.recurrence_array from %s p, %s p2 where \
				p.mcl_id=p2.id'%(old_schema_instance.p_gene_table, old_schema_instance.pattern_table)
			else:
				crs_sentence = "DECLARE crs CURSOR FOR SELECT p.p_gene_id, p.gene_no, p.go_no, p.is_correct, p.is_correct_l1, \
				p.is_correct_lca, p.avg_p_value, p.no_of_clusters, p.cluster_array, p.p_value_cut_off, p.recurrence_cut_off, \
				p.connectivity_cut_off, p.cluster_size_cut_off, p.unknown_cut_off, p.depth_cut_off, p.mcl_id, p.lca_list, p.vertex_gradient,\
				p.edge_gradient, 'vertex_set', 'edge_set', 'd_matrix', 'recurrence_array' \
				from %s p"%(old_schema_instance.p_gene_table)
				
				#some placeholders 'vertex_set', 'edge_set', 'd_matrix' for prediction_attributes()
			
			if self.acc_cut_off:
				mcl_id2accuracy = self.get_mcl_id2accuracy(curs, old_schema_instance.p_gene_table, crs_sentence, self.is_correct_type)
			else:
				mcl_id2accuracy = None
			mcl_id2accuracy_pickle = cPickle.dumps(mcl_id2accuracy, -1)	#-1 means use the highest protocol
			for node in range(1, communicator.size-1):	#send it to the computing_node
				communicator.send(gene_no2go_pickle, node, 0)
			for node in range(1, communicator.size-1):	#send it to the computing_node
				communicator.send(mcl_id2accuracy_pickle, node, 0)
		elif node_rank<=communicator.size-2:	#exclude the last node
			data, source, tag = communicator.receiveString(0, 0)
			gene_no2go = cPickle.loads(data)	#take the data
			data, source, tag = communicator.receiveString(0, 0)
			mcl_id2accuracy = cPickle.loads(data)	#take the data
			#choose a functor for recurrence_array
			functor_dict = {0: None,
				1: lambda x: int(x>=self.recurrence_x),
				2: lambda x: math.pow(x, self.recurrence_x)}
			functor = functor_dict[self.recurrence_x_type]
		elif node_rank==communicator.size-1:
			(conn, curs) =  db_connect(self.hostname, self.dbname, self.schema)
			old_schema_instance = form_schema_tables(self.input_fname)
			new_schema_instance = form_schema_tables(self.jnput_fname)
			self.view_from_table(curs, old_schema_instance.splat_table, new_schema_instance.splat_table)
			self.view_from_table(curs, old_schema_instance.mcl_table, new_schema_instance.mcl_table)
			self.view_from_table(curs, old_schema_instance.pattern_table, new_schema_instance.pattern_table)
			self.createGeneTable(curs, new_schema_instance.p_gene_table)
		
		mpi_synchronize(communicator)
		
		if node_rank == 0:
			self.input_node(communicator, curs, old_schema_instance, crs_sentence, self.size)
		elif node_rank<=communicator.size-2:	#exclude the last node
			self.computing_node(communicator, gene_no2go, self.exponent, self.score_list, \
				self.max_layer, self.norm_exp, self.eg_d_type, mcl_id2accuracy, self.acc_cut_off, functor)
		elif node_rank==communicator.size-1:
			parameter_list = [curs, new_schema_instance.p_gene_table]
			free_computing_nodes = range(1,communicator.size-1)
			output_node(communicator, free_computing_nodes, parameter_list, self.output_node_handler)
			if self.commit:
				curs.execute("end")