Example #1
	def run(self):
		"""
		10-22-05
			
		"""
		communicator = MPI.world.duplicate()
		node_rank = communicator.rank
		free_computing_nodes = range(1,communicator.size-1)
		print "this is node",node_rank
		if node_rank == 0:
			(conn, curs) =  db_connect(self.hostname, self.dbname, self.schema)
			edge2occurrence, no_of_datasets = get_edge2occurrence(curs, self.min_sup, self.max_sup)
			edge2occurrence_pickle = cPickle.dumps((edge2occurrence, no_of_datasets), -1)
			for node in free_computing_nodes:	#send it to the computing_node
				communicator.send(edge2occurrence_pickle, node, 0)
			del conn, curs
		elif node_rank in free_computing_nodes:	#exclude the last node
			data, source, tag = communicator.receiveString(0, 0)
			edge2occurrence, no_of_datasets = cPickle.loads(data)
		
		mpi_synchronize(communicator)
		if node_rank == 0:
			inf = csv.reader(open(self.inputfile,'r'), delimiter='\t')
			parameter_list = [inf]
			input_node(communicator, parameter_list, free_computing_nodes, self.message_size, self.report, input_handler=self.input_handler)
			del inf
		elif node_rank in free_computing_nodes:
			parameter_list = [self.min_size, self.alpha, edge2occurrence, no_of_datasets]
			computing_node(communicator, parameter_list, self.node_fire, report=self.report)
		elif node_rank == communicator.size-1:
			writer = csv.writer(open(self.outputfile, 'w'), delimiter='\t')
			parameter_list = [writer]
			output_node(communicator, free_computing_nodes, parameter_list, self.output_handler, self.report)
			del writer
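All of these examples share the same three-role MPI layout: rank 0 reads the input and dispatches work, ranks 1 through size-2 compute, and the last rank writes the output. Below is a minimal sketch of Example #1's rank-0 fan-out step, written against mpi4py as a stand-in for the Scientific.MPI-style communicator used here; the payload dict and counts are hypothetical.

import cPickle
from mpi4py import MPI

comm = MPI.COMM_WORLD
node_rank = comm.rank
free_computing_nodes = range(1, comm.size - 1)	#ranks 1..size-2 do the computing

if node_rank == 0:
	edge2occurrence = {('gene_a', 'gene_b'): 12}	#hypothetical payload
	no_of_datasets = 38	#hypothetical count
	pickle = cPickle.dumps((edge2occurrence, no_of_datasets), -1)
	for node in free_computing_nodes:	#same point-to-point fan-out as above
		comm.send(pickle, dest=node, tag=0)
elif node_rank in free_computing_nodes:
	data = comm.recv(source=0, tag=0)
	edge2occurrence, no_of_datasets = cPickle.loads(data)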
Example #2
	def run(self):
		"""
		10-07-05
		10-09-05 input_node() add mcl_table
		10-24-05 create new views for splat_table and mcl_table
		10-28-05 no views, no new pattern_table, read from inputfile, write to outputfile
		01-24-06 copy a whole block from MpiFromDatasetSignatureToPattern.py to read in edge sig matrix
			
			(rank==0)
				--get_no_of_datasets()
				--sendEdgeSigMatrix()
			elif free_computing_nodes:
				--PostFim()
				--receiveEdgeSigMatrix()
			
			mpi_synchronize()
			
			--input_node()
				--input_handler()
			--computing_node()
				--node_fire()
				--cleanup_handler()
			--output_node()
				--output_handler()
		"""
		communicator = MPI.world.duplicate()
		node_rank = communicator.rank		
		free_computing_nodes = range(1,communicator.size-1)	#exclude the last node
		
		#01-24-06 following block is directly copied from MpiFromDatasetSignatureToPattern.py
		block_size = 10000
		MpiFromDatasetSignatureToPattern_instance = MpiFromDatasetSignatureToPattern()
		if communicator.rank == 0:
			no_of_datasets = MpiFromDatasetSignatureToPattern_instance.get_no_of_datasets(self.sig_vector_fname)
				#no_of_datasets is used in fillEdgeSigMatrix() and patternFormation()
			for node in free_computing_nodes:
				communicator.send(str(no_of_datasets), node, 0)
			MpiFromDatasetSignatureToPattern_instance.sendEdgeSigMatrix(communicator, free_computing_nodes, self.sig_vector_fname, \
				no_of_datasets, self.min_sup, self.max_sup, block_size)
		elif communicator.rank in free_computing_nodes:
			data, source, tag = communicator.receiveString(0, 0)
			no_of_datasets = int(data)	#take the data
			j_instance = johnson_sp(no_of_datasets)
			MpiFromDatasetSignatureToPattern_instance.receiveEdgeSigMatrix(communicator, j_instance, no_of_datasets, block_size)
		
		mpi_synchronize(communicator)
		
		if node_rank == 0:
			inf = csv.reader(open(self.inputfile,'r'), delimiter='\t')
			parameter_list = [inf]
			input_node(communicator, parameter_list, free_computing_nodes, self.size, self.report, input_handler=self.input_handler)
			del inf
		elif node_rank in free_computing_nodes:	#exclude the last node
			parameter_list = [j_instance, self.parser_type]
			computing_node(communicator, parameter_list, self.node_fire, self.cleanup_handler, self.report)
		elif node_rank==communicator.size-1:
			writer = csv.writer(open(self.outputfile, 'w'), delimiter='\t')
			parameter_list = [writer]
			output_node(communicator, free_computing_nodes, parameter_list, self.output_handler, self.report)
			del writer
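The block_size = 10000 constant above caps each MPI message: sendEdgeSigMatrix() ships the edge-signature matrix in fixed-size blocks rather than as one giant pickle. A hedged sketch of that chunking idea (generate_blocks() is a hypothetical helper, not the original implementation):

def generate_blocks(rows, block_size=10000):
	"""Yield fixed-size blocks from an iterable of edge rows."""
	block = []
	for row in rows:
		block.append(row)
		if len(block) >= block_size:
			yield block
			block = []
	if block:	#flush the final partial block
		yield block

#usage: each yielded block would be pickled and sent to one computing node
for block in generate_blocks(xrange(25000)):
	pass	#communicator.send(cPickle.dumps(block, -1), node, 0)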
Example #3
    def run(self):
        """
		09-05-05
			Watch: when sending via MPI, tag 0 means the message comes from node 0; tag 1 means it goes to the last node.
		10-21-05
			replace output_node() with the one from codense.common for better scheduling
			
			--fill_edge2encodedOccurrence()
			
			--input_node()
				--get_cluster_block()
			--computing_node()
				--node_fire()
			--output_node()
				--output_cluster()
			
			--uniqueSort()
		"""
        communicator = MPI.world.duplicate()
        node_rank = communicator.rank
        intermediateFile = "%s.unsorted" % self.outputfile  # intermediateFile to store concatenated results
        if communicator.rank == (communicator.size - 1):
            edge2encodedOccurrence = {}
            no_of_datasets = self.fill_edge2encodedOccurrence(
                self.hostname, self.dbname, self.schema, edge2encodedOccurrence, self.min_sup, self.max_sup
            )

        mpi_synchronize(communicator)

        if node_rank == 0:
            self.input_node(
                communicator, self.inputfile, self.min_size, self.cluster_block_size, self.cluster_block_edges
            )
        elif node_rank <= communicator.size - 2:  # exclude the last node
            self.computing_node(communicator, self.cluster_block_size, self.min_size, self.min_con)
        elif node_rank == communicator.size - 1:
            codense2db_instance = codense2db()
            free_computing_nodes = range(1, communicator.size - 1)
            writer = csv.writer(open(intermediateFile, "w"), delimiter="\t")
            parameter_list = [writer, codense2db_instance, edge2encodedOccurrence, no_of_datasets]
            output_node(
                communicator,
                free_computing_nodes,
                parameter_list,
                self.output_cluster,
                report=self.report,
                type=Numeric.Int,
            )
            del writer
            # 10-21-05 self.output_node(communicator, intermediateFile, codense2db_instance, edge2encodedOccurrence, no_of_datasets)

        mpi_synchronize(communicator)
        # collecting
        if node_rank == 0:
            MpiFromDatasetSignatureToPattern_instance = MpiFromDatasetSignatureToPattern()
            MpiFromDatasetSignatureToPattern_instance.uniqueSort(intermediateFile, self.outputfile)
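uniqueSort() turns the unsorted concatenated results into the final output file. The real method is not shown in this listing; a minimal in-memory stand-in, assuming the intermediate file fits in RAM, would be:

def unique_sort(intermediate_fname, output_fname):
	"""Sort the intermediate file and drop duplicate lines (in-memory sketch)."""
	inf = open(intermediate_fname, 'r')
	lines = sorted(set(inf.readlines()))
	inf.close()
	outf = open(output_fname, 'w')
	outf.writelines(lines)
	outf.close()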
Example #4
	def run(self):
		"""
		11-16-05
			
			--computing_handler()
				--is_site_confirmed()
					--get_no_of_mismatches_allowed()
					--get_no_of_mismatches_for_consensus()
						--is_good_consensus()
					--get_no_of_mismatches_for_site()
		"""
		communicator = MPI.world.duplicate()
		node_rank = communicator.rank	
		free_computing_nodes = range(1,communicator.size-1)	#exclude the last node
		if node_rank == 0:
			(conn, curs) =  db_connect(self.hostname, self.dbname, self.schema)
			if self.profile_filename:
				mt_id_set = get_mt_id_set_from_profile(self.profile_filename)
			else:
				mt_id_set = None
			mt_id2sites_ls = get_mt_id2sites_ls(curs, mt_id_set)
			mt_id2sites_ls_pickle = cPickle.dumps(mt_id2sites_ls, -1)
			for node in free_computing_nodes:	#send it to the computing_node
				communicator.send(mt_id2sites_ls_pickle, node, 0)
			
			input_files = os.listdir(self.inputdir)
			for i in range(len(input_files)):	#attach the directory path to the files
				input_files[i] = os.path.join(self.inputdir, input_files[i])
			#the following information is just header info inserted at the top of the output_file
			match_output_header = self.get_match_output_header(input_files[0])
			communicator.send(match_output_header, communicator.size-1, 0)
		elif node_rank in free_computing_nodes:
			data, source, tag = communicator.receiveString(0, 0)
			mt_id2sites_ls = cPickle.loads(data)	#take the data
		elif node_rank==communicator.size-1:
			outf = open(self.output_file, 'w')
			match_output_header, source, tag = communicator.receiveString(0, 0)
			outf.write(match_output_header)
			
		mpi_synchronize(communicator)
		if node_rank == 0:
			aggregated_inf = fileinput.input(input_files)
			parameter_list = [0, aggregated_inf]
			input_node(communicator, parameter_list, free_computing_nodes, self.message_size, self.report, \
				input_handler=self.input_handler)
			del aggregated_inf
		elif node_rank in free_computing_nodes:
			parameter_list = [mt_id2sites_ls, max_mis_match_perc, min_no_of_mismatches, max_esc_length]
			computing_node(communicator, parameter_list, self.computing_handler, report=self.report)
		elif node_rank==communicator.size-1:
			parameter_list = [outf]
			output_node(communicator, free_computing_nodes, parameter_list, self.output_handler, self.report)
			del outf
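Example #4's input node concatenates every file in inputdir into one virtual stream via fileinput before dispatching lines. A short self-contained sketch of that aggregation (the directory name is hypothetical):

import os
import fileinput

inputdir = '/tmp/match_output'	#hypothetical directory
input_files = [os.path.join(inputdir, fn) for fn in os.listdir(inputdir)]
aggregated_inf = fileinput.input(input_files)
for line in aggregated_inf:
	pass	#each line would be batched up and shipped to a computing node
aggregated_inf.close()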
Example #5
	def run(self):
		communicator = MPI.world.duplicate()
		node_rank = communicator.rank	
		free_computing_nodes = range(1,communicator.size-1)	#exclude the last node
		if node_rank == 0:
			(conn, curs) =  db_connect(self.hostname, self.dbname, self.schema)
			schema_instance = form_schema_tables(self.fname, self.acc_cutoff, self.lm_bit)
			gene_id2no = get_gene_id2gene_no(curs)
			gene2enc_array = self.get_gene2enc_array(self.gim_inputfname, gene_id2no)
			gene2enc_array_pickle = cPickle.dumps(gene2enc_array, -1)
			
			gene_no2id = get_gene_no2gene_id(curs)
			gene_no2go_no = get_gene_no2go_no(curs)
			gene_no2id_pickle = cPickle.dumps(gene_no2id, -1)
			gene_no2go_no_pickle = cPickle.dumps(gene_no2go_no, -1)
			for node in free_computing_nodes:	#send it to the computing_node
				communicator.send(gene2enc_array_pickle, node, 0)
			
			communicator.send(gene_no2id_pickle, communicator.size-1, 0)
			communicator.send(gene_no2go_no_pickle, communicator.size-1, 0)
		elif node_rank in free_computing_nodes:
			data, source, tag = communicator.receiveString(0, 0)
			gene2enc_array = cPickle.loads(data)	#take the data
		elif node_rank==communicator.size-1:
			schema_instance = form_schema_tables(self.fname, self.acc_cutoff, self.lm_bit)
			data, source, tag = communicator.receiveString(0, 0)
			gene_no2id = cPickle.loads(data)
			data, source, tag = communicator.receiveString(0, 0)
			gene_no2go_no = cPickle.loads(data)
			
		mpi_synchronize(communicator)
		if node_rank == 0:
			curs.execute("DECLARE crs CURSOR FOR SELECT p.id, p.vertex_set, p.edge_set, p.recurrence_array,\
			g.go_no_list from %s p, %s g where g.mcl_id=p.id"%(schema_instance.pattern_table, schema_instance.good_cluster_table))
			input_node(communicator, curs, free_computing_nodes, self.message_size, self.report)
		elif node_rank in free_computing_nodes:
			parameter_list = [gene2enc_array, self.dataset_signature_set, self.p_value_cut_off]
			computing_node(communicator, parameter_list, self.computing_node_handler, report=self.report)
		elif node_rank==communicator.size-1:
			if not os.path.isdir(self.pic_output_dir):
				os.makedirs(self.pic_output_dir)
			cluster_info_instance = cluster_info()
			ofname = os.path.join(self.pic_output_dir, '%s_p%s'%(schema_instance.good_cluster_table, self.p_value_cut_off))
			writer = csv.writer(open(ofname, 'w'), delimiter='\t')
			parameter_list = [self.pic_output_dir, cluster_info_instance, gene_no2id, gene_no2go_no, writer]
			output_node(communicator, free_computing_nodes, parameter_list, self.output_node_handler, self.report)
			del writer
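Rank 0 here streams patterns through a named server-side cursor (DECLARE crs CURSOR ...) so the whole join never sits in memory at once. A hedged sketch of that idiom as a generator over any DB-API cursor against PostgreSQL (the helper name and batch size are illustrative):

def stream_rows(curs, select_sql, batch_size=1000):
	"""Yield rows from a named server-side cursor in fixed-size batches."""
	curs.execute("DECLARE crs CURSOR FOR %s" % select_sql)
	while True:
		curs.execute("FETCH %d FROM crs" % batch_size)
		rows = curs.fetchall()
		if not rows:
			break
		for row in rows:
			yield row
	curs.execute("CLOSE crs")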
Example #6
	def run(self):
		"""
		11-16-05
			
			--computing_handler()
				--is_site_confirmed()
					--get_no_of_mismatches_allowed()
					--get_no_of_mismatches_for_consensus()
						--is_good_consensus()
					--get_no_of_mismatches_for_site()
		"""
		communicator = MPI.world.duplicate()
		node_rank = communicator.rank	
		free_computing_nodes = range(1,communicator.size-1)	#exclude the last node
		if node_rank == 0:
			(conn, curs) =  db_connect(self.hostname, self.dbname, self.schema)
			known_data = self.get_known_data(curs, self.fname, self.filter_type, self.is_correct_type, self.need_cal_hg_p_value)
			known_data_pickle = cPickle.dumps(known_data, -1)
			for node in free_computing_nodes:	#send it to the computing_node
				communicator.send(known_data_pickle, node, 0)
		elif node_rank in free_computing_nodes:
			data, source, tag = communicator.receiveString(0, 0)
			known_data = cPickle.loads(data)	#take the data
		elif node_rank==communicator.size-1:
			writer = csv.writer(open(self.output_file, 'w'), delimiter='\t')
			#write down the header
			writer.writerow(['rpart_cp', 'loss_matrix', 'prior_prob', 'type', 'accuracy_avg','accuracy_std', 'no_of_predictions_avg',\
				'no_of_predictions_std', 'no_of_genes_avg', 'no_of_genes_std'])
			
		mpi_synchronize(communicator)
		if node_rank == 0:
			setting_ls = self.form_setting_ls(self.rpart_cp_ls, self.loss_matrix_ls, self.prior_prob_ls)
			self.input_node(communicator, setting_ls, free_computing_nodes, self.report)
		elif node_rank in free_computing_nodes:
			parameter_list = [known_data, self.no_of_buckets]
			computing_node(communicator, parameter_list, self.computing_handler, report=self.report)
		elif node_rank==communicator.size-1:
			parameter_list = [writer]
			output_node(communicator, free_computing_nodes, parameter_list, self.output_handler, self.report)
			del writer
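form_setting_ls() is not shown in this example; given the header row written above, a plausible reading is a cross product of the three parameter lists, one work unit per combination. A hedged sketch of that reading:

import itertools

def form_setting_ls(rpart_cp_ls, loss_matrix_ls, prior_prob_ls):
	"""One setting per (rpart_cp, loss_matrix, prior_prob) combination (sketch)."""
	return [list(t) for t in itertools.product(rpart_cp_ls, loss_matrix_ls, prior_prob_ls)]

#usage: form_setting_ls([0.01, 0.05], ['0,1,1,0'], [0.3, 0.5]) yields 4 settings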
Example #7
	def run(self):
		"""
		09-05-05
		2006-09-21 add fuzzyDense_flag
		2006-11-02 add tfbs_association_type
		2006-11-02 differentiate good_cluster_table as pattern_xxx or good_xxx for pattern id
		
			--db_connect()
			--get_gene_no2bs_no_block()
			--construct_two_dicts()
			
			--input_node()
				--fetch_cluster_block()
			--computing_node()
				--node_fire()
					--cluster_bs_analysis()
			--create_cluster_bs_table()
			--output_node()
				--submit_cluster_bs_table()
		"""
		communicator = MPI.world.duplicate()
		node_rank = communicator.rank
		free_computing_nodes = range(1,communicator.size-1)
		print self.tfbs_association_type
		if node_rank == 0:
			(conn, curs) =  db_connect(self.hostname, self.dbname, self.schema)
			if self.tfbs_association_type==1:	#2006-11-02
				gene_no2bs_no_block = self.get_gene_no2bs_no_block(curs)
			elif self.tfbs_association_type==2:
				gene_no2bs_no_block = get_gene_no2bs_no_block_from_expt_tf_mapping(curs)
			for node in range(1, communicator.size-1):	#send it to the computing_node
				communicator.send(gene_no2bs_no_block, node, 0)
			if self.fuzzyDense_flag:	#2006-09-21 add fuzzyDense_flag
				#12-18-05 get edge2encodedOccurrence
				MpiCrackSplat_instance = MpiCrackSplat()
				edge2encodedOccurrence = {}
				min_sup = 5	#need to expose them
				max_sup = 40
				total_vertex_set = self.return_total_vertex_set(curs, self.good_cluster_table)
				edge2encodedOccurrence, no_of_datasets = self.fill_edge2encodedOccurrence(\
					self.sig_vector_fname, min_sup, max_sup, total_vertex_set)
				edge2encodedOccurrence_pickle = cPickle.dumps(edge2encodedOccurrence, -1)
				for node in free_computing_nodes:	#send it to the computing_node
					communicator.send(edge2encodedOccurrence_pickle, node, 0)
		elif node_rank>0 and node_rank<communicator.size-1:
			data, source, tag, count = communicator.receive(Numeric.Int, 0, 0)
			gene_no2bs_no_set, bs_no2gene_no_set = self.construct_two_dicts(node_rank, data)
			if self.fuzzyDense_flag:	#2006-09-21
				#12-18-05
				data, source, tag = communicator.receiveString(0, 0)
				edge2encodedOccurrence = cPickle.loads(data)
			
		elif node_rank==communicator.size-1:	#establish connection before pursuing
			(conn, curs) =  db_connect(self.hostname, self.dbname, self.schema)
			
			#12-20-05 for darwin output
			gene_id2symbol = get_gene_id2gene_symbol(curs, self.tax_id)
			dataset_no2desc = get_dataset_no2desc(curs)
			
			
		mpi_synchronize(communicator)
		
		if node_rank == 0:
			if self.good_cluster_table.find('pattern')!=-1:	#2006-11-02 it's pattern_xxx table, use id as pattern_id
				curs.execute("DECLARE crs CURSOR FOR select distinct id, vertex_set, recurrence_array\
					from %s "%(self.good_cluster_table))
			else:	#2006-11-02 it's good_xxx table, use mcl_id as pattern_id
				curs.execute("DECLARE crs CURSOR FOR select distinct mcl_id, vertex_set, recurrence_array\
					from %s "%(self.good_cluster_table))
			input_node(communicator, curs, free_computing_nodes, self.size, self.report)
			curs.execute("close crs")
			
		elif node_rank<=communicator.size-2:	#exclude the last node
			if self.fuzzyDense_flag:	#2006-09-21
				fuzzyDense_instance = fuzzyDense(edge2encodedOccurrence)
			else:
				fuzzyDense_instance = None
			parameter_list = [gene_no2bs_no_set, bs_no2gene_no_set, self.ratio_cutoff, \
				self.top_number, self.p_value_cut_off, fuzzyDense_instance, self.degree_cut_off, self.fuzzyDense_flag]
			computing_node(communicator, parameter_list, self.computing_node_handler, report=self.report)
			
		elif node_rank==communicator.size-1:
			
			#12-20-05 comment out
			if self.new_table:
				self.create_cluster_bs_table(curs, self.cluster_bs_table)
			parameter_list = [curs, self.cluster_bs_table]
			output_node(communicator, free_computing_nodes, parameter_list, self.submit_cluster_bs_table, report=self.report)
			if self.commit:
				curs.execute("end")
			"""
Example #8
	def run(self):
		"""
		09-05-05
		10-23-05
			create views from old schema
			result goes to the new schema's p_gene_table
		
			(input_node)
				--db_connect()
				--form_schema_tables()
				--form_schema_tables()
				--get_gene_no2go_no_set()
				--get_go_no2depth()
				(pass data to computing_node)
			(computing_node)
				(take data from other nodes, 0 and size-1)
			(judge_node)
				--gene_stat()
				--db_connect()
				--gene_p_map_redundancy()
			(output_node)
				--db_connect()
				--form_schema_tables()
				--form_schema_tables()
				--MpiPredictionFilter()
				--MpiPredictionFilter_instance.createGeneTable()
				--get_go_no2edge_counter_list()(if necessary)
				(pass go_no2edge_counter_list to computing_node)
			
			(input_node)
				--fetch_cluster_block()
			(computing_node)
				--get_no_of_unknown_genes()
				--node_fire_handler()
				--cleanup_handler()
			--judge_node()
				--gene_stat_instance.(match functions)
			--output_node()
				--output_node_handler()
					--MpiPredictionFilter_instance.submit_to_p_gene_table()
		"""
		communicator = MPI.world.duplicate()
		node_rank = communicator.rank
		if node_rank == 0:
			(conn, curs) =  db_connect(self.hostname, self.dbname, self.schema)
			"""
			#01-02-06
			old_schema_instance = form_schema_tables(self.input_fname)
			new_schema_instance = form_schema_tables(self.jnput_fname)
			"""
			gene_no2go_no = get_gene_no2go_no_set(curs)
			gene_no2go_no_pickle = cPickle.dumps(gene_no2go_no, -1)	#-1 means use the highest protocol
			go_no2depth = get_go_no2depth(curs)
			go_no2depth_pickle = cPickle.dumps(go_no2depth, -1)
			go_no2gene_no_set = get_go_no2gene_no_set(curs)
			go_no2gene_no_set_pickle = cPickle.dumps(go_no2gene_no_set, -1)
			for node in range(1, communicator.size-2):	#send it to the computing_node
				communicator.send(gene_no2go_no_pickle, node, 0)
				communicator.send(go_no2depth_pickle, node, 0)
				communicator.send(go_no2gene_no_set_pickle, node, 0)
		elif node_rank<=communicator.size-3:	#WATCH: last 2 nodes are not here.
			data, source, tag = communicator.receiveString(0, 0)
			gene_no2go_no = cPickle.loads(data)	#take the data
			data, source, tag = communicator.receiveString(0, 0)
			go_no2depth = cPickle.loads(data)
			data, source, tag = communicator.receiveString(0, 0)
			go_no2gene_no_set = cPickle.loads(data)
			data, source, tag = communicator.receiveString(communicator.size-1, 0)	#from the last node
			go_no2edge_counter_list = cPickle.loads(data)
			#choose a functor for recurrence_array
			functor_dict = {0: None,
				1: lambda x: int(x>=self.recurrence_x),
				2: lambda x: math.pow(x, self.recurrence_x)}
			functor = functor_dict[self.recurrence_x_type]
		elif node_rank == communicator.size-2:	#judge node
			gene_stat_instance = gene_stat(depth_cut_off=self.depth)
			(conn, curs) =  db_connect(self.hostname, self.dbname, self.schema)
			gene_stat_instance.dstruc_loadin(curs)
			from gene_p_map_redundancy import gene_p_map_redundancy
			node_distance_class = gene_p_map_redundancy()			
		elif node_rank==communicator.size-1:	#establish connection before pursuing
			(conn, curs) =  db_connect(self.hostname, self.dbname, self.schema)
			"""
			#01-02-06, input and output are all directed to files
			old_schema_instance = form_schema_tables(self.input_fname)
			new_schema_instance = form_schema_tables(self.jnput_fname)
			MpiPredictionFilter_instance = MpiPredictionFilter()
			MpiPredictionFilter_instance.view_from_table(curs, old_schema_instance.splat_table, new_schema_instance.splat_table)
			MpiPredictionFilter_instance.view_from_table(curs, old_schema_instance.mcl_table, new_schema_instance.mcl_table)
			MpiPredictionFilter_instance.view_from_table(curs, old_schema_instance.pattern_table, new_schema_instance.pattern_table)
			if self.new_table:
				MpiPredictionFilter_instance.createGeneTable(curs, new_schema_instance.p_gene_table)
			"""
			if self.go_no2edge_counter_list_fname:
				go_no2edge_counter_list = cPickle.load(open(self.go_no2edge_counter_list_fname,'r'))
			else:
				if self.eg_d_type==2:
					go_no2edge_counter_list = None
				else:
					gene_no2go_no = get_gene_no2go_no_set(curs)
					go_no2edge_counter_list = get_go_no2edge_counter_list(curs, gene_no2go_no, self.edge_type2index)
			go_no2edge_counter_list_pickle = cPickle.dumps(go_no2edge_counter_list, -1)
			for node in range(1, communicator.size-2):	#send it to the computing_node
				communicator.send(go_no2edge_counter_list_pickle, node, 0)
		
		mpi_synchronize(communicator)
		
		free_computing_nodes = range(1,communicator.size-2)	#exclude the last 2 nodes (judge and output)
		if node_rank == 0:
			"""
			curs.execute("DECLARE crs CURSOR FOR SELECT id, vertex_set, edge_set, no_of_edges,\
			connectivity, unknown_gene_ratio, recurrence_array, d_matrix from %s"%(old_schema_instance.pattern_table))
			"""
			self.counter = 0	#01-02-06 counter is used as id
			reader = csv.reader(open(self.input_fname, 'r'), delimiter='\t')
			parameter_list = [reader]
			input_node(communicator, parameter_list, free_computing_nodes, self.message_size, \
				self.report, input_handler=self.input_handler)
			del reader
		elif node_rank in free_computing_nodes:
			no_of_unknown_genes = get_no_of_unknown_genes(gene_no2go_no)
			GradientScorePrediction_instance = GradientScorePrediction(gene_no2go_no, go_no2gene_no_set, go_no2depth, \
				go_no2edge_counter_list, no_of_unknown_genes, self.depth, self.min_layer1_associated_genes, \
				self.min_layer1_ratio, self.min_layer2_associated_genes, self.min_layer2_ratio, self.exponent, \
				self.score_list, self.max_layer, self.norm_exp, self.eg_d_type, self.debug)
			parameter_list = [GradientScorePrediction_instance, functor]
			computing_node(communicator, parameter_list, self.node_fire_handler, self.cleanup_handler, self.report)
		elif node_rank == communicator.size-2:
			self.judge_node(communicator, curs, gene_stat_instance, node_distance_class)
		elif node_rank==communicator.size-1:
			#01-02-06 output goes to plain file, not database
			writer = csv.writer(open(self.jnput_fname, 'w'), delimiter='\t')
			parameter_list = [writer]
			output_node(communicator, free_computing_nodes, parameter_list, self.output_node_handler, self.report)
			del writer
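Example #8 splits the last two ranks into separate judge and output roles, so the rank arithmetic differs from the three-role examples. A small helper makes the mapping explicit (a sketch, not part of the original module):

def node_role(rank, size):
	"""Map an MPI rank to its role in Example #8's four-role layout."""
	if rank == 0:
		return 'input'
	elif rank <= size - 3:
		return 'computing'	#ranks 1..size-3, i.e. free_computing_nodes
	elif rank == size - 2:
		return 'judge'
	else:
		return 'output'	#rank size-1

#e.g. size=6 maps ranks 0..5 to input, computing, computing, computing, judge, output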
Example #9
	def run(self):
		"""
		11-16-05
		11-19-05
			use no_of_validations to multiply each setting (so one setting's validations
				are spread across different nodes)
			the extra setting copy is for a non-validation real model fit
			
			--computing_handler()
				--is_site_confirmed()
					--get_no_of_mismatches_allowed()
					--get_no_of_mismatches_for_consensus()
						--is_good_consensus()
					--get_no_of_mismatches_for_site()
		"""
		communicator = MPI.world.duplicate()
		node_rank = communicator.rank
		free_computing_nodes = range(1,communicator.size-1)	#exclude the last node
		if node_rank == 0:
			(conn, curs) =  db_connect(self.hostname, self.dbname, self.schema)
			unknown_data, known_data = self.get_data(curs, self.fname, self.filter_type, self.is_correct_type, self.need_cal_hg_p_value)
			known_data_pickle = cPickle.dumps(known_data, -1)
			for node in free_computing_nodes:	#send it to the computing_node
				communicator.send(known_data_pickle, node, 0)
			unknown_data_pickle = cPickle.dumps(unknown_data, -1)
			for node in free_computing_nodes:	#send it to the computing_node
				communicator.send(unknown_data_pickle, node, 0)
		elif node_rank in free_computing_nodes:
			data, source, tag = communicator.receiveString(0, 0)
			known_data = cPickle.loads(data)	#take the data
			"""
			#11-19-05 shuffle data to check
			index_ls = range(len(known_data))
			random.shuffle(index_ls)
			for i in range(len(index_ls)):
				index_ls[i] = known_data[i]
			known_data = index_ls
			"""
			data, source, tag = communicator.receiveString(0, 0)
			unknown_data = cPickle.loads(data)	#take the data
			"""
			#11-19-05 shuffle data to check
			index_ls = range(len(unknown_data))
			random.shuffle(index_ls)
			for i in range(len(index_ls)):
				index_ls[i] = unknown_data[i]
			unknown_data = index_ls
			"""
		elif node_rank==communicator.size-1:
			writer = csv.writer(open(self.output_file, 'w'), delimiter='\t')
			#write down the header
			writer.writerow(['rpart_cp', 'loss_matrix', 'prior_prob', 'type', 'accuracy_avg','accuracy_std', 'no_of_predictions_avg',\
				'no_of_predictions_std', 'no_of_genes_avg', 'no_of_genes_std'])
			
		mpi_synchronize(communicator)
		if node_rank == 0:
			if self.type==1:
				setting_ls = self.form_setting_ls(self.rpart_cp_ls, self.loss_matrix_ls, self.prior_prob_ls, self.no_of_validations)
			elif self.type==2:
				#randomForest replaces rpart_cp_ls with mty_ls, others are ignored later
				setting_ls = self.form_setting_ls(self.mty_ls, self.loss_matrix_ls, self.prior_prob_ls, self.no_of_validations)
			else:
				sys.stderr.write("type %s not supported.\n"%self.type)
				sys.exit(3)
			self.input_node(communicator, setting_ls, free_computing_nodes, self.report)
		elif node_rank in free_computing_nodes:
			parameter_list = [unknown_data, known_data, self.training_perc, self.no_of_validations, self.type, self.bit_string]	#03-17-06 add type, bit_string
			computing_node(communicator, parameter_list, self.computing_handler, report=self.report)
		elif node_rank==communicator.size-1:
			setting2validation_stat = {}
			setting2unknown_known_acc_ls = {}
			parameter_list = [writer, setting2validation_stat, setting2unknown_known_acc_ls, self.no_of_validations]
			output_node(communicator, free_computing_nodes, parameter_list, self.output_handler, self.report)
			#cPickle.dump([setting2validation_stat, setting2unknown_known_acc_ls], open('/home/yuhuang/MpiRpartValidation.setting2result.pickle','w'))	#11-23-05
			del writer
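Per the 11-19-05 note, each setting is replicated so its validation folds land on different nodes, plus one extra copy for the final non-validation fit. A hedged sketch of that expansion (the fold-index convention is an assumption):

def expand_settings(setting_ls, no_of_validations):
	"""Replicate each setting once per fold, plus one copy for the real fit (sketch)."""
	expanded = []
	for setting in setting_ls:
		for fold in range(no_of_validations):
			expanded.append(setting + [fold])	#one validation fold per work unit
		expanded.append(setting + [-1])	#extra copy: non-validation model fit
	return expanded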
Example #10
	def run(self):
		"""
		10-05-05
		10-12-05
			use max_layer to control whether to turn on the gradient or not
		10-16-05
			transformed to MPI version
		
			if node_rank==0
				--db_connect()
				--form_schema_tables()
				--form_schema_tables()
				--get_gene_no2go_no_set()
				--get_mcl_id2accuracy()
			elif computing_node:
				(prepare data)
			elif output_node:
				--db_connect()
				--form_schema_tables()
				--form_schema_tables()
				--view_from_table()
				--view_from_table()
				--view_from_table()
				--createGeneTable()
			
			--mpi_synchronize()
			
			if input_node:
				--input_node()
					--fetch_predictions()
			elif computing_node:
				--computing_node()
					--node_fire()
						--gradient_class()
			elif output_node:
				--output_node()
					--output_node_handler()
						--submit_to_p_gene_table()
		"""		
		communicator = MPI.world.duplicate()
		node_rank = communicator.rank
		if node_rank == 0:
			(conn, curs) =  db_connect(self.hostname, self.dbname, self.schema)
			old_schema_instance = form_schema_tables(self.input_fname)
			new_schema_instance = form_schema_tables(self.jnput_fname)
			gene_no2go = get_gene_no2go_no_set(curs)
			gene_no2go_pickle = cPickle.dumps(gene_no2go, -1)	#-1 means use the highest protocol
			
			if self.max_layer:
				crs_sentence = 'DECLARE crs CURSOR FOR SELECT p.p_gene_id, p.gene_no, p.go_no, p.is_correct, p.is_correct_l1, \
				p.is_correct_lca, p.avg_p_value, p.no_of_clusters, p.cluster_array, p.p_value_cut_off, p.recurrence_cut_off, \
				p.connectivity_cut_off, p.cluster_size_cut_off, p.unknown_cut_off, p.depth_cut_off, p.mcl_id, p.lca_list, \
				p.vertex_gradient, p.edge_gradient, p2.vertex_set, p2.edge_set, p2.d_matrix, p2.recurrence_array from %s p, %s p2 where \
				p.mcl_id=p2.id'%(old_schema_instance.p_gene_table, old_schema_instance.pattern_table)
			else:
				crs_sentence = "DECLARE crs CURSOR FOR SELECT p.p_gene_id, p.gene_no, p.go_no, p.is_correct, p.is_correct_l1, \
				p.is_correct_lca, p.avg_p_value, p.no_of_clusters, p.cluster_array, p.p_value_cut_off, p.recurrence_cut_off, \
				p.connectivity_cut_off, p.cluster_size_cut_off, p.unknown_cut_off, p.depth_cut_off, p.mcl_id, p.lca_list, p.vertex_gradient,\
				p.edge_gradient, 'vertex_set', 'edge_set', 'd_matrix', 'recurrence_array' \
				from %s p"%(old_schema_instance.p_gene_table)
				
				#some placeholders 'vertex_set', 'edge_set', 'd_matrix' for prediction_attributes()
			
			if self.acc_cut_off:
				mcl_id2accuracy = self.get_mcl_id2accuracy(curs, old_schema_instance.p_gene_table, crs_sentence, self.is_correct_type)
			else:
				mcl_id2accuracy = None
			mcl_id2accuracy_pickle = cPickle.dumps(mcl_id2accuracy, -1)	#-1 means use the highest protocol
			for node in range(1, communicator.size-1):	#send it to the computing_node
				communicator.send(gene_no2go_pickle, node, 0)
			for node in range(1, communicator.size-1):	#send it to the computing_node
				communicator.send(mcl_id2accuracy_pickle, node, 0)
		elif node_rank<=communicator.size-2:	#exclude the last node
			data, source, tag = communicator.receiveString(0, 0)
			gene_no2go = cPickle.loads(data)	#take the data
			data, source, tag = communicator.receiveString(0, 0)
			mcl_id2accuracy = cPickle.loads(data)	#take the data
			#choose a functor for recurrence_array
			functor_dict = {0: None,
				1: lambda x: int(x>=self.recurrence_x),
				2: lambda x: math.pow(x, self.recurrence_x)}
			functor = functor_dict[self.recurrence_x_type]
		elif node_rank==communicator.size-1:
			(conn, curs) =  db_connect(self.hostname, self.dbname, self.schema)
			old_schema_instance = form_schema_tables(self.input_fname)
			new_schema_instance = form_schema_tables(self.jnput_fname)
			self.view_from_table(curs, old_schema_instance.splat_table, new_schema_instance.splat_table)
			self.view_from_table(curs, old_schema_instance.mcl_table, new_schema_instance.mcl_table)
			self.view_from_table(curs, old_schema_instance.pattern_table, new_schema_instance.pattern_table)
			self.createGeneTable(curs, new_schema_instance.p_gene_table)
		
		mpi_synchronize(communicator)
		
		if node_rank == 0:
			self.input_node(communicator, curs, old_schema_instance, crs_sentence, self.size)
		elif node_rank<=communicator.size-2:	#exclude the last node
			self.computing_node(communicator, gene_no2go, self.exponent, self.score_list, \
				self.max_layer, self.norm_exp, self.eg_d_type, mcl_id2accuracy, self.acc_cut_off, functor)
		elif node_rank==communicator.size-1:
			parameter_list = [curs, new_schema_instance.p_gene_table]
			free_computing_nodes = range(1,communicator.size-1)
			output_node(communicator, free_computing_nodes, parameter_list, self.output_node_handler)
			if self.commit:
				curs.execute("end")
Example #11
    def run(self):
        """
		2007-04-16
			(rank==0)
				--get_chr_start_ls()
			elif free_computing_nodes:
				-- (receive data)
			
			--mpi_synchronize()
			
			(rank==0)
				--input_node()
					--input_handler()
			elif free_computing_nodes:
				--computing_node()
					--computing_node_handler()
						--identify_ancestry_with_min_jumps()
							--initialize_score_trace_matrix()
								--is_child_heterozygous_SNP_compatible_with_parents()
							(for loop)
								--identify_ancestry_of_one_chr_with_DP()
									--is_child_heterozygous_SNP_compatible_with_parents()
							--trace()
								--recursive_trace()
			else:
				--output_node()
					--output_node_handler()
		"""
        node_rank = self.communicator.rank
        free_computing_nodes = range(1, self.communicator.size - 1)  # exclude the 1st and last node
        if node_rank == 0:
            FilterStrainSNPMatrix_instance = FilterStrainSNPMatrix()
            header, strain_acc_list, category_list, data_matrix = FilterStrainSNPMatrix_instance.read_data(
                self.input_fname
            )
            snp_acc_list = header[2:]
            data_matrix = Numeric.array(data_matrix)
            no_of_strains = data_matrix.shape[0]
            (conn, curs) = db_connect(self.hostname, self.dbname, self.schema, password="******", user="******")

            # 2007-09-17 send strain_acc_list to the output_node
            strain_acc_list_pickle = cPickle.dumps(strain_acc_list, -1)
            self.communicator.send(strain_acc_list_pickle, self.communicator.size - 1, 0)

            chr_start_ls = self.get_chr_start_ls(curs, snp_acc_list, self.snp_locus_table)

            chr_start_ls_pickle = cPickle.dumps(chr_start_ls, -1)  # -1 means use the highest protocol
            data_matrix_pickle = cPickle.dumps(data_matrix, -1)
            for node in free_computing_nodes:  # send it to the computing_node
                self.communicator.send(chr_start_ls_pickle, node, 0)
                self.communicator.send(data_matrix_pickle, node, 0)
        elif node_rank in free_computing_nodes:
            data, source, tag = self.communicator.receiveString(0, 0)
            chr_start_ls = cPickle.loads(data)  # take the data
            data, source, tag = self.communicator.receiveString(0, 0)
            data_matrix = cPickle.loads(data)
        else:
            data, source, tag = self.communicator.receiveString(0, 0)
            strain_acc_list = cPickle.loads(data)

        mpi_synchronize(self.communicator)

        if node_rank == 0:
            parameter_list = [no_of_strains]
            self.input_node(self.communicator, parameter_list, free_computing_nodes, self.message_size, self.report)
        elif node_rank in free_computing_nodes:
            trio_arrangement_ls = [
                [0, 1, 2],
                [1, 2, 0],
                [2, 0, 1],
            ]  # three different ways to pick the parent-set and the child
            parameter_list = [data_matrix, chr_start_ls, trio_arrangement_ls]
            computing_node(self.communicator, parameter_list, self.computing_node_handler, report=self.report)
        else:
            writer = csv.writer(open(self.output_fname, "w"), delimiter="\t")
            parameter_list = [writer, strain_acc_list]
            output_node(self.communicator, free_computing_nodes, parameter_list, self.output_node_handler, self.report)
            del writer
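trio_arrangement_ls enumerates the three rotations of a strain trio so each member takes a turn as the child. A sketch of how a handler might consume it; the strain ids and the (parent, parent, child) ordering are assumptions, not taken from the original handler:

trio = ['strainA', 'strainB', 'strainC']	#hypothetical strain ids
trio_arrangement_ls = [[0, 1, 2], [1, 2, 0], [2, 0, 1]]
for p1, p2, child in trio_arrangement_ls:	#ordering assumed (parent, parent, child)
	parents = (trio[p1], trio[p2])
	#...score the ancestry of trio[child] against parents...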
Example #12
	def run(self):
		"""
		08-06-05
		08-24-05
			read all edge data into matrix
		08-31-05
			the integer returned by encodeOccurrenceBv() could be 138 bits wide (the human no_of_datasets),
			and Numeric.Int is only 32-bit, so the edge_sig_matrix format was changed.
		12-31-05
			no database connection any more
			2 threads on computing node
		01-08-06
			no threads
			back to edge_sig_matrix
		01-11-06
			use the cc module, PostFim
		01-15-06
			add min_line_number and max_line_number
			
			(rank==0)
				--get_no_of_datasets()
				--sendEdgeSigMatrix()
			elif free_computing_nodes:
				--PostFim()
				--receiveEdgeSigMatrix()
			
			--mpi_synchronize()
			
			(rank==0)
				--input_node()
					--input_handler()
			elif free_computing_nodes:
				--computing_node()
					--computing_node_handler()
			else:
				--output_node()
					--output_node_handler()
			
			--mpi_synchronize()
			
			(rank==0)
				--receive node_outputfile
				--netmine_wrapper()
				--collect_and_merge_output()
				--uniqueSort()
			else:
				--return node_outputfile
			
		"""
		communicator = MPI.world.duplicate()
		free_computing_nodes = range(1,communicator.size-1)	#exclude the 1st and last node
		block_size = 10000
		if communicator.rank == 0:
			no_of_datasets = self.get_no_of_datasets(self.sig_vector_fname)
				#no_of_datasets is used in fillEdgeSigMatrix() and patternFormation()
			for node in free_computing_nodes:
				communicator.send(str(no_of_datasets), node, 0)
			self.sendEdgeSigMatrix(communicator, free_computing_nodes, self.sig_vector_fname, \
				no_of_datasets, self.min_sup, self.max_sup, block_size=10000)
		elif communicator.rank in free_computing_nodes:
			data, source, tag = communicator.receiveString(0, 0)
			no_of_datasets = int(data)	#take the data
			offset = communicator.rank - 1
			node_outputfile = '%s.%s'%(self.outputfile, offset)
			PostFim_instance = PostFim(self.no_cc, no_of_datasets, self.min_cluster_size, node_outputfile)
			self.receiveEdgeSigMatrix(communicator, PostFim_instance, no_of_datasets, block_size)
		
		mpi_synchronize(communicator)
		
		if communicator.rank == 0:
			reader = csv.reader(open(self.inputfile, 'r'), delimiter=' ')
			parameter_list = [reader, self.min_line_number, self.max_line_number]	#01-15-06
			self.line_number = 0	#01-15-06	used in input_handler()
			input_node(communicator, parameter_list, free_computing_nodes, self.queue_size, \
				self.report, input_handler=self.input_handler)
			del reader
		elif communicator.rank in free_computing_nodes:
			parameter_list = [PostFim_instance]
			computing_node(communicator, parameter_list, self.computing_node_handler, report=self.report)
		else:
			parameter_list = []
			output_node(communicator, free_computing_nodes, parameter_list, self.output_node_handler, self.report)
			
		mpi_synchronize(communicator)
		
		if communicator.rank == 0:
			#12-31-05 wait until of_name_list is full
			of_name_list = []
			while len(of_name_list)<len(free_computing_nodes):
				data, source, tag = communicator.receiveString(None, 1)
				of_name_list.append(data)
			#collecting
			intermediateFile = '%s.unsorted'%self.outputfile	#intermediateFile to store concatenated results
			netmine_wrapper_instance = netmine_wrapper()
			netmine_wrapper_instance.collect_and_merge_output(of_name_list, intermediateFile)
			self.uniqueSort(intermediateFile, self.outputfile, self.tmpdir)
		elif communicator.rank in free_computing_nodes:
			communicator.send(node_outputfile, 0, 1)	#send back the outputfile
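After the second barrier, rank 0 collects one output-file name per computing node (tag 1) and concatenates the files before the unique sort. The real merge is delegated to netmine_wrapper.collect_and_merge_output(); a plain-Python stand-in:

def collect_and_merge(of_name_list, merged_fname):
	"""Concatenate the per-node output files into one intermediate file (sketch)."""
	outf = open(merged_fname, 'w')
	for fname in of_name_list:
		inf = open(fname, 'r')
		for line in inf:
			outf.write(line)
		inf.close()
	outf.close()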
Example #13
    def run(self):
        """
		2007-04-16
			(rank==0)
				--get_chr_start_ls()
			elif free_computing_nodes:
				-- (receive data)
			
			--mpi_synchronize()
			
			(rank==0)
				--input_node()
					--input_handler()
			elif free_computing_nodes:
				--computing_node()
					--computing_node_handler()
						--identify_ancestry_with_min_jumps()
							--initialize_score_trace_matrix()
								--is_child_heterozygous_SNP_compatible_with_parents()
							(for loop)
								--identify_ancestry_of_one_chr_with_DP()
									--is_child_heterozygous_SNP_compatible_with_parents()
							--trace()
								--recursive_trace()
			else:
				--output_node()
					--output_node_handler()
		"""
        node_rank = self.communicator.rank
        free_computing_nodes = range(1, self.communicator.size - 1)  #exclude the 1st and last node
        if node_rank == 0:
            FilterStrainSNPMatrix_instance = FilterStrainSNPMatrix()
            header, strain_acc_list, category_list, data_matrix = FilterStrainSNPMatrix_instance.read_data(
                self.input_fname)
            snp_acc_list = header[2:]
            data_matrix = Numeric.array(data_matrix)
            no_of_strains = data_matrix.shape[0]
            (conn, curs) = db_connect(self.hostname,
                                      self.dbname,
                                      self.schema,
                                      password='******',
                                      user='******')

            #2007-09-17 send strain_acc_list to the output_node
            strain_acc_list_pickle = cPickle.dumps(strain_acc_list, -1)
            self.communicator.send(strain_acc_list_pickle,
                                   self.communicator.size - 1, 0)

            chr_start_ls = self.get_chr_start_ls(curs, snp_acc_list,
                                                 self.snp_locus_table)

            chr_start_ls_pickle = cPickle.dumps(chr_start_ls, -1)  #-1 means use the highest protocol
            data_matrix_pickle = cPickle.dumps(data_matrix, -1)
            for node in free_computing_nodes:  #send it to the computing_node
                self.communicator.send(chr_start_ls_pickle, node, 0)
                self.communicator.send(data_matrix_pickle, node, 0)
        elif node_rank in free_computing_nodes:
            data, source, tag = self.communicator.receiveString(0, 0)
            chr_start_ls = cPickle.loads(data)  #take the data
            data, source, tag = self.communicator.receiveString(0, 0)
            data_matrix = cPickle.loads(data)
        else:
            data, source, tag = self.communicator.receiveString(0, 0)
            strain_acc_list = cPickle.loads(data)

        mpi_synchronize(self.communicator)

        if node_rank == 0:
            parameter_list = [no_of_strains]
            self.input_node(self.communicator, parameter_list, free_computing_nodes, self.message_size, self.report)
        elif node_rank in free_computing_nodes:
            trio_arrangement_ls = [[0, 1, 2], [1, 2, 0], [2, 0, 1]]  #three different ways to pick the parent-set and the child
            parameter_list = [data_matrix, chr_start_ls, trio_arrangement_ls]
            computing_node(self.communicator,
                           parameter_list,
                           self.computing_node_handler,
                           report=self.report)
        else:
            writer = csv.writer(open(self.output_fname, 'w'), delimiter='\t')
            parameter_list = [writer, strain_acc_list]
            output_node(self.communicator, free_computing_nodes,
                        parameter_list, self.output_node_handler, self.report)
            del writer
Example #14
    def run(self):
        """
		11-16-05
		11-19-05
			use no_of_validations to multiply each setting (so one setting's validations
				are spread across different nodes)
			the extra setting copy is for a non-validation real model fit
			
			--computing_handler()
				--is_site_confirmed()
					--get_no_of_mismatches_allowed()
					--get_no_of_mismatches_for_consensus()
						--is_good_consensus()
					--get_no_of_mismatches_for_site()
		"""
        communicator = MPI.world.duplicate()
        node_rank = communicator.rank
        free_computing_nodes = range(1, communicator.size - 1)  #exclude the last node
        if node_rank == 0:
            (conn, curs) = db_connect(self.hostname, self.dbname, self.schema)
            unknown_data, known_data = self.get_data(curs, self.fname,
                                                     self.filter_type,
                                                     self.is_correct_type,
                                                     self.need_cal_hg_p_value)
            known_data_pickle = cPickle.dumps(known_data, -1)
            for node in free_computing_nodes:  #send it to the computing_node
                communicator.send(known_data_pickle, node, 0)
            unknown_data_pickle = cPickle.dumps(unknown_data, -1)
            for node in free_computing_nodes:  #send it to the computing_node
                communicator.send(unknown_data_pickle, node, 0)
        elif node_rank in free_computing_nodes:
            data, source, tag = communicator.receiveString(0, 0)
            known_data = cPickle.loads(data)  #take the data
            """
			#11-19-05 shuffle data to check
			index_ls = range(len(known_data))
			random.shuffle(index_ls)
			for i in range(len(index_ls)):
				index_ls[i] = known_data[i]
			known_data = index_ls
			"""
            data, source, tag = communicator.receiveString(0, 0)
            unknown_data = cPickle.loads(data)  #take the data
            """
			#11-19-05 shuffle data to check
			index_ls = range(len(unknown_data))
			random.shuffle(index_ls)
			for i in range(len(index_ls)):
				index_ls[i] = unknown_data[i]
			unknown_data = index_ls
			"""
        elif node_rank == communicator.size - 1:
            writer = csv.writer(open(self.output_file, 'w'), delimiter='\t')
            #write down the header
            writer.writerow(['rpart_cp', 'loss_matrix', 'prior_prob', 'type', 'accuracy_avg','accuracy_std', 'no_of_predictions_avg',\
             'no_of_predictions_std', 'no_of_genes_avg', 'no_of_genes_std'])

        mpi_synchronize(communicator)
        if node_rank == 0:
            if self.type == 1:
                setting_ls = self.form_setting_ls(self.rpart_cp_ls,
                                                  self.loss_matrix_ls,
                                                  self.prior_prob_ls,
                                                  self.no_of_validations)
            elif self.type == 2:
                #randomForest replaces rpart_cp_ls with mty_ls, others are ignored later
                setting_ls = self.form_setting_ls(self.mty_ls,
                                                  self.loss_matrix_ls,
                                                  self.prior_prob_ls,
                                                  self.no_of_validations)
            else:
                sys.stderr.write("type %s not supported.\n" % self.type)
                sys.exit(3)
            self.input_node(communicator, setting_ls, free_computing_nodes,
                            self.report)
        elif node_rank in free_computing_nodes:
            parameter_list = [
                unknown_data, known_data, self.training_perc,
                self.no_of_validations, self.type, self.bit_string
            ]  #03-17-06 add type, bit_string
            computing_node(communicator,
                           parameter_list,
                           self.computing_handler,
                           report=self.report)
        elif node_rank == communicator.size - 1:
            setting2validation_stat = {}
            setting2unknown_known_acc_ls = {}
            parameter_list = [
                writer, setting2validation_stat, setting2unknown_known_acc_ls,
                self.no_of_validations
            ]
            output_node(communicator, free_computing_nodes, parameter_list,
                        self.output_handler, self.report)
            #cPickle.dump([setting2validation_stat, setting2unknown_known_acc_ls], open('/home/yuhuang/MpiRpartValidation.setting2result.pickle','w'))	#11-23-05
            del writer