def run(self): """ 10-22-05 """ communicator = MPI.world.duplicate() node_rank = communicator.rank free_computing_nodes = range(1,communicator.size-1) print "this is node",node_rank if node_rank == 0: (conn, curs) = db_connect(self.hostname, self.dbname, self.schema) edge2occurrrence, no_of_datasets = get_edge2occurrence(curs, self.min_sup, self.max_sup) edge2occurrrence_pickle = cPickle.dumps((edge2occurrrence, no_of_datasets), -1) for node in free_computing_nodes: #send it to the computing_node communicator.send(edge2occurrrence_pickle, node, 0) del conn, curs elif node_rank in free_computing_nodes: #exclude the last node data, source, tag = communicator.receiveString(0, 0) edge2occurrrence, no_of_datasets = cPickle.loads(data) mpi_synchronize(communicator) if node_rank == 0: inf = csv.reader(open(self.inputfile,'r'), delimiter='\t') parameter_list = [inf] input_node(communicator, parameter_list, free_computing_nodes, self.message_size, self.report, input_handler=self.input_handler) del inf elif node_rank in free_computing_nodes: parameter_list = [self.min_size, self.alpha, edge2occurrrence, no_of_datasets] computing_node(communicator, parameter_list, self.node_fire, report=self.report) elif node_rank == communicator.size-1: writer = csv.writer(open(self.outputfile, 'w'), delimiter='\t') parameter_list = [writer] output_node(communicator, free_computing_nodes, parameter_list, self.output_handler, self.report) del writer
def run(self): """ 10-07-05 10-09-05 input_node() add mcl_table 10-24-05 create new views for splat_table and mcl_table 10-28-05 no views, no new pattern_table, read from inputfile, write to outputfile 01-24-06 copy a whole block from MpiFromDatasetSignatureToPattern.py to read in edge sig matrix (rank==0) --get_no_of_datasets() --sendEdgeSigMatrix() elif free_computing_nodes: --PostFim() --receiveEdgeSigMatrix() mpi_synchronize() --input_node() --input_handler() --computing_node() --node_fire() --cleanup_handler() --output_node() --output_handler() """ communicator = MPI.world.duplicate() node_rank = communicator.rank free_computing_nodes = range(1,communicator.size-1) #exclude the last node #01-24-06 following block is directly copied from MpiFromDatasetSignatureToPattern.py block_size = 10000 MpiFromDatasetSignatureToPattern_instance = MpiFromDatasetSignatureToPattern() if communicator.rank == 0: no_of_datasets = MpiFromDatasetSignatureToPattern_instance.get_no_of_datasets(self.sig_vector_fname) #no_of_datasets is used in fillEdgeSigMatrix() and patternFormation() for node in free_computing_nodes: communicator.send(str(no_of_datasets), node, 0) MpiFromDatasetSignatureToPattern_instance.sendEdgeSigMatrix(communicator, free_computing_nodes, self.sig_vector_fname, \ no_of_datasets, self.min_sup, self.max_sup, block_size) elif communicator.rank in free_computing_nodes: data, source, tag = communicator.receiveString(0, 0) no_of_datasets = int(data) #take the data j_instance = johnson_sp(no_of_datasets) MpiFromDatasetSignatureToPattern_instance.receiveEdgeSigMatrix(communicator, j_instance, no_of_datasets, block_size) mpi_synchronize(communicator) if node_rank == 0: inf = csv.reader(open(self.inputfile,'r'), delimiter='\t') parameter_list = [inf] input_node(communicator, parameter_list, free_computing_nodes, self.size, self.report, input_handler=self.input_handler) del inf elif node_rank in free_computing_nodes: #exclude the last node parameter_list = [j_instance, self.parser_type] computing_node(communicator, parameter_list, self.node_fire, self.cleanup_handler, self.report) elif node_rank==communicator.size-1: writer = csv.writer(open(self.outputfile, 'w'), delimiter='\t') parameter_list = [writer] output_node(communicator, free_computing_nodes, parameter_list, self.output_handler, self.report) del writer
def run(self): """ 09-05-05 Watch: when sending via MPI, tag 0 means from node 0, tag 1 means goes to the last node. 10-21-05 replace output_node() with the one from codense.common for better scheduling --fill_edge2encodedOccurrence() --input_node() --get_cluster_block() --computing_node() --node_fire() --output_node() --output_cluster() --uniqueSort() """ communicator = MPI.world.duplicate() node_rank = communicator.rank intermediateFile = "%s.unsorted" % self.outputfile # intermediateFile to store concatenated results if communicator.rank == (communicator.size - 1): edge2encodedOccurrence = {} no_of_datasets = self.fill_edge2encodedOccurrence( self.hostname, self.dbname, self.schema, edge2encodedOccurrence, self.min_sup, self.max_sup ) mpi_synchronize(communicator) if node_rank == 0: self.input_node( communicator, self.inputfile, self.min_size, self.cluster_block_size, self.cluster_block_edges ) elif node_rank <= communicator.size - 2: # exclude the last node self.computing_node(communicator, self.cluster_block_size, self.min_size, self.min_con) elif node_rank == communicator.size - 1: codense2db_instance = codense2db() free_computing_nodes = range(1, communicator.size - 1) writer = csv.writer(open(intermediateFile, "w"), delimiter="\t") parameter_list = [writer, codense2db_instance, edge2encodedOccurrence, no_of_datasets] output_node( communicator, free_computing_nodes, parameter_list, self.output_cluster, report=self.report, type=Numeric.Int, ) del writer # 10-21-05self.output_node(communicator, intermediateFile, codense2db_instance, edge2encodedOccurrence, no_of_datasets) mpi_synchronize(communicator) # collecting if node_rank == 0: MpiFromDatasetSignatureToPattern_instance = MpiFromDatasetSignatureToPattern() MpiFromDatasetSignatureToPattern_instance.uniqueSort(intermediateFile, self.outputfile)
def run(self): """ 11-16-05 --computing_handler() --is_site_confirmed() --get_no_of_mismatches_allowed() --get_no_of_mismatches_for_consensus() --is_good_consensus() --get_no_of_mismatches_for_site() """ communicator = MPI.world.duplicate() node_rank = communicator.rank free_computing_nodes = range(1,communicator.size-1) #exclude the last node if node_rank == 0: (conn, curs) = db_connect(self.hostname, self.dbname, self.schema) if self.profile_filename: mt_id_set = get_mt_id_set_from_profile(self.profile_filename) else: mt_id_set = None mt_id2sites_ls = get_mt_id2sites_ls(curs, mt_id_set) mt_id2sites_ls_pickle = cPickle.dumps(mt_id2sites_ls, -1) for node in free_computing_nodes: #send it to the computing_node communicator.send(mt_id2sites_ls_pickle, node, 0) input_files = os.listdir(self.inputdir) for i in range(len(input_files)): #attach the directory path to the files input_files[i] = os.path.join(self.inputdir, input_files[i]) #the following infomation is just header info inserted into the top of the output_file match_output_header = self.get_match_output_header(input_files[0]) communicator.send(match_output_header, communicator.size-1, 0) elif node_rank in free_computing_nodes: data, source, tag = communicator.receiveString(0, 0) mt_id2sites_ls = cPickle.loads(data) #take the data elif node_rank==communicator.size-1: outf = open(self.output_file, 'w') match_output_header, source, tag = communicator.receiveString(0, 0) outf.write(match_output_header) mpi_synchronize(communicator) if node_rank == 0: aggregated_inf = fileinput.input(input_files) parameter_list = [0, aggregated_inf] input_node(communicator, parameter_list, free_computing_nodes, self.message_size, self.report, \ input_handler=self.input_handler) del aggregated_inf elif node_rank in free_computing_nodes: parameter_list = [mt_id2sites_ls, max_mis_match_perc, min_no_of_mismatches, max_esc_length] computing_node(communicator, parameter_list, self.computing_handler, report=self.report) elif node_rank==communicator.size-1: parameter_list = [outf] output_node(communicator, free_computing_nodes, parameter_list, self.output_handler, self.report) del outf
def run(self):
    communicator = MPI.world.duplicate()
    node_rank = communicator.rank
    free_computing_nodes = range(1, communicator.size-1)    #exclude the last node
    if node_rank == 0:
        (conn, curs) = db_connect(self.hostname, self.dbname, self.schema)
        schema_instance = form_schema_tables(self.fname, self.acc_cutoff, self.lm_bit)
        gene_id2no = get_gene_id2gene_no(curs)
        gene2enc_array = self.get_gene2enc_array(self.gim_inputfname, gene_id2no)
        gene2enc_array_pickle = cPickle.dumps(gene2enc_array, -1)

        gene_no2id = get_gene_no2gene_id(curs)
        gene_no2go_no = get_gene_no2go_no(curs)
        gene_no2id_pickle = cPickle.dumps(gene_no2id, -1)
        gene_no2go_no_pickle = cPickle.dumps(gene_no2go_no, -1)
        for node in free_computing_nodes:    #send it to the computing_node
            communicator.send(gene2enc_array_pickle, node, 0)
        communicator.send(gene_no2id_pickle, communicator.size-1, 0)
        communicator.send(gene_no2go_no_pickle, communicator.size-1, 0)
    elif node_rank in free_computing_nodes:
        data, source, tag = communicator.receiveString(0, 0)
        gene2enc_array = cPickle.loads(data)    #take the data
    elif node_rank == communicator.size-1:
        schema_instance = form_schema_tables(self.fname, self.acc_cutoff, self.lm_bit)
        data, source, tag = communicator.receiveString(0, 0)
        gene_no2id = cPickle.loads(data)
        data, source, tag = communicator.receiveString(0, 0)
        gene_no2go_no = cPickle.loads(data)

    mpi_synchronize(communicator)
    if node_rank == 0:
        curs.execute("DECLARE crs CURSOR FOR SELECT p.id, p.vertex_set, p.edge_set, p.recurrence_array,\
            g.go_no_list from %s p, %s g where g.mcl_id=p.id"%(schema_instance.pattern_table, schema_instance.good_cluster_table))
        input_node(communicator, curs, free_computing_nodes, self.message_size, self.report)
    elif node_rank in free_computing_nodes:
        parameter_list = [gene2enc_array, self.dataset_signature_set, self.p_value_cut_off]
        computing_node(communicator, parameter_list, self.computing_node_handler, report=self.report)
    elif node_rank == communicator.size-1:
        if not os.path.isdir(self.pic_output_dir):
            os.makedirs(self.pic_output_dir)
        cluster_info_instance = cluster_info()
        ofname = os.path.join(self.pic_output_dir, '%s_p%s'%(schema_instance.good_cluster_table, self.p_value_cut_off))
        writer = csv.writer(open(ofname, 'w'), delimiter='\t')
        parameter_list = [self.pic_output_dir, cluster_info_instance, gene_no2id, gene_no2go_no, writer]
        output_node(communicator, free_computing_nodes, parameter_list, self.output_node_handler, self.report)
        del writer
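# The "DECLARE crs CURSOR" statement above creates a server-side PostgreSQL
# cursor, so the input node can stream the pattern/good-cluster join without
# holding the whole result set in memory. A minimal sketch of the fetch loop
# that input_node() presumably runs over it (DB-API calls only;
# fetch_cluster_blocks_sketch is a hypothetical name, not the codense.common
# implementation):
def fetch_cluster_blocks_sketch(curs, fetch_size=100):
    curs.execute("fetch %s from crs"%fetch_size)
    rows = curs.fetchall()
    while rows:
        yield rows    #one block of up to fetch_size rows
        curs.execute("fetch %s from crs"%fetch_size)
        rows = curs.fetchall()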
def run(self): """ 11-16-05 --computing_handler() --is_site_confirmed() --get_no_of_mismatches_allowed() --get_no_of_mismatches_for_consensus() --is_good_consensus() --get_no_of_mismatches_for_site() """ communicator = MPI.world.duplicate() node_rank = communicator.rank free_computing_nodes = range(1,communicator.size-1) #exclude the last node if node_rank == 0: (conn, curs) = db_connect(self.hostname, self.dbname, self.schema) known_data = self.get_known_data(curs, self.fname, self.filter_type, self.is_correct_type, self.need_cal_hg_p_value) known_data_pickle = cPickle.dumps(known_data, -1) for node in free_computing_nodes: #send it to the computing_node communicator.send(known_data_pickle, node, 0) elif node_rank in free_computing_nodes: data, source, tag = communicator.receiveString(0, 0) known_data = cPickle.loads(data) #take the data elif node_rank==communicator.size-1: writer = csv.writer(open(self.output_file, 'w'), delimiter='\t') #write down the header writer.writerow(['rpart_cp', 'loss_matrix', 'prior_prob', 'type', 'accuracy_avg','accuracy_std', 'no_of_predictions_avg',\ 'no_of_predictions_std', 'no_of_genes_avg', 'no_of_genes_std']) mpi_synchronize(communicator) if node_rank == 0: setting_ls = self.form_setting_ls(self.rpart_cp_ls, self.loss_matrix_ls, self.prior_prob_ls) self.input_node(communicator, setting_ls, free_computing_nodes, self.report) elif node_rank in free_computing_nodes: parameter_list = [known_data, self.no_of_buckets] computing_node(communicator, parameter_list, self.computing_handler, report=self.report) elif node_rank==communicator.size-1: parameter_list = [writer] output_node(communicator, free_computing_nodes, parameter_list, self.output_handler, self.report) del writer
def run(self): """ 09-05-05 2006-09-21 add fuzzyDense_flag 2006-11-02 add tfbs_association_type 2006-11-02 differentiate good_cluster_table as pattern_xxx or good_xxx for pattern id --db_connect() --get_gene_no2bs_no_block() --construct_two_dicts() --input_node() --fetch_cluster_block() --computing_node() --node_fire() --cluster_bs_analysis() --create_cluster_bs_table() --output_node() --submit_cluster_bs_table() """ communicator = MPI.world.duplicate() node_rank = communicator.rank free_computing_nodes = range(1,communicator.size-1) print self.tfbs_association_type if node_rank == 0: (conn, curs) = db_connect(self.hostname, self.dbname, self.schema) if self.tfbs_association_type==1: #2006-11-02 gene_no2bs_no_block = self.get_gene_no2bs_no_block(curs) elif self.tfbs_association_type==2: gene_no2bs_no_block = get_gene_no2bs_no_block_from_expt_tf_mapping(curs) for node in range(1, communicator.size-1): #send it to the computing_node communicator.send(gene_no2bs_no_block, node, 0) if self.fuzzyDense_flag: #2006-09-21 add fuzzyDense_flag #12-18-05 get edge2encodedOccurrence MpiCrackSplat_instance = MpiCrackSplat() edge2encodedOccurrence = {} min_sup = 5 #need to expose them max_sup = 40 total_vertex_set = self.return_total_vertex_set(curs, self.good_cluster_table) edge2encodedOccurrence, no_of_datasets = self.fill_edge2encodedOccurrence(\ self.sig_vector_fname, min_sup, max_sup, total_vertex_set) edge2encodedOccurrence_pickle = cPickle.dumps(edge2encodedOccurrence, -1) for node in free_computing_nodes: #send it to the computing_node communicator.send(edge2encodedOccurrence_pickle, node, 0) elif node_rank>0 and node_rank<communicator.size-1: data, source, tag, count = communicator.receive(Numeric.Int, 0, 0) gene_no2bs_no_set, bs_no2gene_no_set = self.construct_two_dicts(node_rank, data) if self.fuzzyDense_flag: #2006-09-21 #12-18-05 data, source, tag = communicator.receiveString(0, 0) edge2encodedOccurrence = cPickle.loads(data) elif node_rank==communicator.size-1: #establish connection before pursuing (conn, curs) = db_connect(self.hostname, self.dbname, self.schema) #12-20-05 for darwin output gene_id2symbol = get_gene_id2gene_symbol(curs, self.tax_id) dataset_no2desc = get_dataset_no2desc(curs) mpi_synchronize(communicator) if node_rank == 0: if self.good_cluster_table.find('pattern')!=-1: #2006-11-02 it's pattern_xxx table, use id as pattern_id curs.execute("DECLARE crs CURSOR FOR select distinct id, vertex_set, recurrence_array\ from %s "%(self.good_cluster_table)) else: #2006-11-02 it's good_xxx table, use mcl_id as pattern_id curs.execute("DECLARE crs CURSOR FOR select distinct mcl_id, vertex_set, recurrence_array\ from %s "%(self.good_cluster_table)) input_node(communicator, curs, free_computing_nodes, self.size, self.report) curs.execute("close crs") elif node_rank<=communicator.size-2: #exclude the last node if self.fuzzyDense_flag: #2006-09-21 fuzzyDense_instance = fuzzyDense(edge2encodedOccurrence) else: fuzzyDense_instance = None parameter_list = [gene_no2bs_no_set, bs_no2gene_no_set, self.ratio_cutoff, \ self.top_number, self.p_value_cut_off, fuzzyDense_instance, self.degree_cut_off, self.fuzzyDense_flag] computing_node(communicator, parameter_list, self.computing_node_handler, report=self.report) elif node_rank==communicator.size-1: #12-20-05 comment out if self.new_table: self.create_cluster_bs_table(curs, self.cluster_bs_table) parameter_list = [curs, self.cluster_bs_table] output_node(communicator, free_computing_nodes, parameter_list, self.submit_cluster_bs_table, 
report=self.report) if self.commit: curs.execute("end") """
def run(self): """ 09-05-05 10-23-05 create views from old schema result goes to the new schema's p_gene_table (input_node) --db_connect() --form_schema_tables() --form_schema_tables() --get_gene_no2go_no_set() --get_go_no2depth() (pass data to computing_node) (computing_node) (take data from other nodes, 0 and size-1) (judge_node) --gene_stat() --db_connect() --gene_p_map_redundancy() (output_node) --db_connect() --form_schema_tables() --form_schema_tables() --MpiPredictionFilter() --MpiPredictionFilter_instance.createGeneTable() --get_go_no2edge_counter_list()(if necessary) (pass go_no2edge_counter_list to computing_node) (input_node) --fetch_cluster_block() (computing_node) --get_no_of_unknown_genes() --node_fire_handler() --cleanup_handler() --judge_node() --gene_stat_instance.(match functions) --output_node() --output_node_handler() --MpiPredictionFilter_instance.submit_to_p_gene_table() """ communicator = MPI.world.duplicate() node_rank = communicator.rank if node_rank == 0: (conn, curs) = db_connect(self.hostname, self.dbname, self.schema) """ #01-02-06 old_schema_instance = form_schema_tables(self.input_fname) new_schema_instance = form_schema_tables(self.jnput_fname) """ gene_no2go_no = get_gene_no2go_no_set(curs) gene_no2go_no_pickle = cPickle.dumps(gene_no2go_no, -1) #-1 means use the highest protocol go_no2depth = get_go_no2depth(curs) go_no2depth_pickle = cPickle.dumps(go_no2depth, -1) go_no2gene_no_set = get_go_no2gene_no_set(curs) go_no2gene_no_set_pickle = cPickle.dumps(go_no2gene_no_set, -1) for node in range(1, communicator.size-2): #send it to the computing_node communicator.send(gene_no2go_no_pickle, node, 0) communicator.send(go_no2depth_pickle, node, 0) communicator.send(go_no2gene_no_set_pickle, node, 0) elif node_rank<=communicator.size-3: #WATCH: last 2 nodes are not here. 
data, source, tag = communicator.receiveString(0, 0) gene_no2go_no = cPickle.loads(data) #take the data data, source, tag = communicator.receiveString(0, 0) go_no2depth = cPickle.loads(data) data, source, tag = communicator.receiveString(0, 0) go_no2gene_no_set = cPickle.loads(data) data, source, tag = communicator.receiveString(communicator.size-1, 0) #from the last node go_no2edge_counter_list = cPickle.loads(data) #choose a functor for recurrence_array functor_dict = {0: None, 1: lambda x: int(x>=self.recurrence_x), 2: lambda x: math.pow(x, self.recurrence_x)} functor = functor_dict[self.recurrence_x_type] elif node_rank == communicator.size-2: #judge node gene_stat_instance = gene_stat(depth_cut_off=self.depth) (conn, curs) = db_connect(self.hostname, self.dbname, self.schema) gene_stat_instance.dstruc_loadin(curs) from gene_p_map_redundancy import gene_p_map_redundancy node_distance_class = gene_p_map_redundancy() elif node_rank==communicator.size-1: #establish connection before pursuing (conn, curs) = db_connect(self.hostname, self.dbname, self.schema) """ #01-02-06, input and output are all directed to files old_schema_instance = form_schema_tables(self.input_fname) new_schema_instance = form_schema_tables(self.jnput_fname) MpiPredictionFilter_instance = MpiPredictionFilter() MpiPredictionFilter_instance.view_from_table(curs, old_schema_instance.splat_table, new_schema_instance.splat_table) MpiPredictionFilter_instance.view_from_table(curs, old_schema_instance.mcl_table, new_schema_instance.mcl_table) MpiPredictionFilter_instance.view_from_table(curs, old_schema_instance.pattern_table, new_schema_instance.pattern_table) if self.new_table: MpiPredictionFilter_instance.createGeneTable(curs, new_schema_instance.p_gene_table) """ if self.go_no2edge_counter_list_fname: go_no2edge_counter_list = cPickle.load(open(self.go_no2edge_counter_list_fname,'r')) else: if self.eg_d_type==2: go_no2edge_counter_list = None else: gene_no2go_no = get_gene_no2go_no_set(curs) go_no2edge_counter_list = get_go_no2edge_counter_list(curs, gene_no2go_no, self.edge_type2index) go_no2edge_counter_list_pickle = cPickle.dumps(go_no2edge_counter_list, -1) for node in range(1, communicator.size-2): #send it to the computing_node communicator.send(go_no2edge_counter_list_pickle, node, 0) mpi_synchronize(communicator) free_computing_nodes = range(1,communicator.size-2) #exclude the last node if node_rank == 0: """ curs.execute("DECLARE crs CURSOR FOR SELECT id, vertex_set, edge_set, no_of_edges,\ connectivity, unknown_gene_ratio, recurrence_array, d_matrix from %s"%(old_schema_instance.pattern_table)) """ self.counter = 0 #01-02-06 counter is used as id reader = csv.reader(open(self.input_fname, 'r'), delimiter='\t') parameter_list = [reader] input_node(communicator, parameter_list, free_computing_nodes, self.message_size, \ self.report, input_handler=self.input_handler) del reader elif node_rank in free_computing_nodes: no_of_unknown_genes = get_no_of_unknown_genes(gene_no2go_no) GradientScorePrediction_instance = GradientScorePrediction(gene_no2go_no, go_no2gene_no_set, go_no2depth, \ go_no2edge_counter_list, no_of_unknown_genes, self.depth, self.min_layer1_associated_genes, \ self.min_layer1_ratio, self.min_layer2_associated_genes, self.min_layer2_ratio, self.exponent, \ self.score_list, self.max_layer, self.norm_exp, self.eg_d_type, self.debug) parameter_list = [GradientScorePrediction_instance, functor] computing_node(communicator, parameter_list, self.node_fire_handler, self.cleanup_handler, self.report) elif 
node_rank == communicator.size-2: self.judge_node(communicator, curs, gene_stat_instance, node_distance_class) elif node_rank==communicator.size-1: #01-02-06 output goes to plain file, not database writer = csv.writer(open(self.jnput_fname, 'w'), delimiter='\t') parameter_list = [writer] output_node(communicator, free_computing_nodes, parameter_list, self.output_node_handler, self.report) del writer
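# The recurrence_x_type/recurrence_x pair above picks how each entry of
# recurrence_array is transformed before scoring: 0 leaves it alone, 1
# thresholds it into {0,1}, 2 raises it to a power. A standalone illustration
# (the cutoff and exponent values here are arbitrary):
import math

functor_dict = {0: None,
    1: lambda x: int(x >= 0.8),    #recurrence_x used as a cutoff
    2: lambda x: math.pow(x, 2)}    #recurrence_x used as an exponent
recurrence_array = [0.5, 0.9, 1.0]
functor = functor_dict[1]
if functor:
    recurrence_array = map(functor, recurrence_array)    #-> [0, 1, 1]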
def run(self): """ 11-16-05 11-19-05 use no_of_validations to multiply the setting(separate the one setting's validations to different nodes) the extra setting copy is for a non-validation real model fitting --computing_handler() --is_site_confirmed() --get_no_of_mismatches_allowed() --get_no_of_mismatches_for_consensus() --is_good_consensus() --get_no_of_mismatches_for_site() """ communicator = MPI.world.duplicate() node_rank = communicator.rank free_computing_nodes = range(1,communicator.size-1) #exclude the last node if node_rank == 0: (conn, curs) = db_connect(self.hostname, self.dbname, self.schema) unknown_data, known_data = self.get_data(curs, self.fname, self.filter_type, self.is_correct_type, self.need_cal_hg_p_value) known_data_pickle = cPickle.dumps(known_data, -1) for node in free_computing_nodes: #send it to the computing_node communicator.send(known_data_pickle, node, 0) unknown_data_pickle = cPickle.dumps(unknown_data, -1) for node in free_computing_nodes: #send it to the computing_node communicator.send(unknown_data_pickle, node, 0) elif node_rank in free_computing_nodes: data, source, tag = communicator.receiveString(0, 0) known_data = cPickle.loads(data) #take the data """ #11-19-05 shuffle data to check index_ls = range(len(known_data)) random.shuffle(index_ls) for i in range(len(index_ls)): index_ls[i] = known_data[i] known_data = index_ls """ data, source, tag = communicator.receiveString(0, 0) unknown_data = cPickle.loads(data) #take the data """ #11-19-05 shuffle data to check index_ls = range(len(unknown_data)) random.shuffle(index_ls) for i in range(len(index_ls)): index_ls[i] = unknown_data[i] unknown_data = index_ls """ elif node_rank==communicator.size-1: writer = csv.writer(open(self.output_file, 'w'), delimiter='\t') #write down the header writer.writerow(['rpart_cp', 'loss_matrix', 'prior_prob', 'type', 'accuracy_avg','accuracy_std', 'no_of_predictions_avg',\ 'no_of_predictions_std', 'no_of_genes_avg', 'no_of_genes_std']) mpi_synchronize(communicator) if node_rank == 0: if self.type==1: setting_ls = self.form_setting_ls(self.rpart_cp_ls, self.loss_matrix_ls, self.prior_prob_ls, self.no_of_validations) elif self.type==2: #randomForest replaces rpart_cp_ls with mty_ls, others are ignored later setting_ls = self.form_setting_ls(self.mty_ls, self.loss_matrix_ls, self.prior_prob_ls, self.no_of_validations) else: sys.stderr.write("type %s not supported.\n"%self.type) sys.exit(3) self.input_node(communicator, setting_ls, free_computing_nodes, self.report) elif node_rank in free_computing_nodes: parameter_list = [unknown_data, known_data, self.training_perc, self.no_of_validations, self.type, self.bit_string] #03-17-06 add type, bit_string computing_node(communicator, parameter_list, self.computing_handler, report=self.report) elif node_rank==communicator.size-1: setting2validation_stat = {} setting2unknown_known_acc_ls = {} parameter_list = [writer, setting2validation_stat, setting2unknown_known_acc_ls, self.no_of_validations] output_node(communicator, free_computing_nodes, parameter_list, self.output_handler, self.report) #cPickle.dump([setting2validation_stat, setting2unknown_known_acc_ls], open('/home/yuhuang/MpiRpartValidation.setting2result.pickle','w')) #11-23-05 del writer
def run(self): """ 10-05-05 10-12-05 use max_layer to control whether to turn on the gradient or not 10-16-05 transformed to MPI version if node_rank==0 --db_connect() --form_schema_tables() --form_schema_tables() --get_gene_no2go_no_set() --get_mcl_id2accuracy() elif computing_node: (prepare data) elif output_node: --db_connect() --form_schema_tables() --form_schema_tables() --view_from_table() --view_from_table() --view_from_table() --createGeneTable() --mpi_synchronize() if input_node: --input_node() --fetch_predictions() elif computing_node: --computing_node() --node_fire() --gradient_class() elif output_node: --output_node() --output_node_handler() --submit_to_p_gene_table() """ communicator = MPI.world.duplicate() node_rank = communicator.rank if node_rank == 0: (conn, curs) = db_connect(self.hostname, self.dbname, self.schema) old_schema_instance = form_schema_tables(self.input_fname) new_schema_instance = form_schema_tables(self.jnput_fname) gene_no2go = get_gene_no2go_no_set(curs) gene_no2go_pickle = cPickle.dumps(gene_no2go, -1) #-1 means use the highest protocol if self.max_layer: crs_sentence = 'DECLARE crs CURSOR FOR SELECT p.p_gene_id, p.gene_no, p.go_no, p.is_correct, p.is_correct_l1, \ p.is_correct_lca, p.avg_p_value, p.no_of_clusters, p.cluster_array, p.p_value_cut_off, p.recurrence_cut_off, \ p.connectivity_cut_off, p.cluster_size_cut_off, p.unknown_cut_off, p.depth_cut_off, p.mcl_id, p.lca_list, \ p.vertex_gradient, p.edge_gradient, p2.vertex_set, p2.edge_set, p2.d_matrix, p2.recurrence_array from %s p, %s p2 where \ p.mcl_id=p2.id'%(old_schema_instance.p_gene_table, old_schema_instance.pattern_table) else: crs_sentence = "DECLARE crs CURSOR FOR SELECT p.p_gene_id, p.gene_no, p.go_no, p.is_correct, p.is_correct_l1, \ p.is_correct_lca, p.avg_p_value, p.no_of_clusters, p.cluster_array, p.p_value_cut_off, p.recurrence_cut_off, \ p.connectivity_cut_off, p.cluster_size_cut_off, p.unknown_cut_off, p.depth_cut_off, p.mcl_id, p.lca_list, p.vertex_gradient,\ p.edge_gradient, 'vertex_set', 'edge_set', 'd_matrix', 'recurrence_array' \ from %s p"%(old_schema_instance.p_gene_table) #some placeholders 'vertex_set', 'edge_set', 'd_matrix' for prediction_attributes() if self.acc_cut_off: mcl_id2accuracy = self.get_mcl_id2accuracy(curs, old_schema_instance.p_gene_table, crs_sentence, self.is_correct_type) else: mcl_id2accuracy = None mcl_id2accuracy_pickle = cPickle.dumps(mcl_id2accuracy, -1) #-1 means use the highest protocol for node in range(1, communicator.size-1): #send it to the computing_node communicator.send(gene_no2go_pickle, node, 0) for node in range(1, communicator.size-1): #send it to the computing_node communicator.send(mcl_id2accuracy_pickle, node, 0) elif node_rank<=communicator.size-2: #exclude the last node data, source, tag = communicator.receiveString(0, 0) gene_no2go = cPickle.loads(data) #take the data data, source, tag = communicator.receiveString(0, 0) mcl_id2accuracy = cPickle.loads(data) #take the data #choose a functor for recurrence_array functor_dict = {0: None, 1: lambda x: int(x>=self.recurrence_x), 2: lambda x: math.pow(x, self.recurrence_x)} functor = functor_dict[self.recurrence_x_type] elif node_rank==communicator.size-1: (conn, curs) = db_connect(self.hostname, self.dbname, self.schema) old_schema_instance = form_schema_tables(self.input_fname) new_schema_instance = form_schema_tables(self.jnput_fname) self.view_from_table(curs, old_schema_instance.splat_table, new_schema_instance.splat_table) self.view_from_table(curs, old_schema_instance.mcl_table, 
new_schema_instance.mcl_table) self.view_from_table(curs, old_schema_instance.pattern_table, new_schema_instance.pattern_table) self.createGeneTable(curs, new_schema_instance.p_gene_table) mpi_synchronize(communicator) if node_rank == 0: self.input_node(communicator, curs, old_schema_instance, crs_sentence, self.size) elif node_rank<=communicator.size-2: #exclude the last node self.computing_node(communicator, gene_no2go, self.exponent, self.score_list, \ self.max_layer, self.norm_exp, self.eg_d_type, mcl_id2accuracy, self.acc_cut_off, functor) elif node_rank==communicator.size-1: parameter_list = [curs, new_schema_instance.p_gene_table] free_computing_nodes = range(1,communicator.size-1) output_node(communicator, free_computing_nodes, parameter_list, self.output_node_handler) if self.commit: curs.execute("end")
def run(self): """ 2007-04-16 (rank==0) --get_chr_start_ls() elif free_computing_nodes: -- (receive data) --mpi_synchronize() (rank==0) --input_node() --input_handler() elif free_computing_nodes: --computing_node() --computing_node_handler() --identify_ancestry_with_min_jumps() --initialize_score_trace_matrix() --is_child_heterozygous_SNP_compatible_with_parents() (for loop) --identify_ancestry_of_one_chr_with_DP() --is_child_heterozygous_SNP_compatible_with_parents() --trace() --recursive_trace() else: --output_node() --output_node_handler() """ node_rank = self.communicator.rank free_computing_nodes = range(1, self.communicator.size - 1) # exclude the 1st and last node if node_rank == 0: FilterStrainSNPMatrix_instance = FilterStrainSNPMatrix() header, strain_acc_list, category_list, data_matrix = FilterStrainSNPMatrix_instance.read_data( self.input_fname ) snp_acc_list = header[2:] data_matrix = Numeric.array(data_matrix) no_of_strains = data_matrix.shape[0] (conn, curs) = db_connect(self.hostname, self.dbname, self.schema, password="******", user="******") # 2007-09-17 send strain_acc_list to the output_node strain_acc_list_pickle = cPickle.dumps(strain_acc_list, -1) self.communicator.send(strain_acc_list_pickle, self.communicator.size - 1, 0) chr_start_ls = self.get_chr_start_ls(curs, snp_acc_list, self.snp_locus_table) chr_start_ls_pickle = cPickle.dumps(chr_start_ls, -1) # -1 means use the highest protocol data_matrix_pickle = cPickle.dumps(data_matrix, -1) for node in free_computing_nodes: # send it to the computing_node self.communicator.send(chr_start_ls_pickle, node, 0) self.communicator.send(data_matrix_pickle, node, 0) elif node_rank in free_computing_nodes: data, source, tag = self.communicator.receiveString(0, 0) chr_start_ls = cPickle.loads(data) # take the data data, source, tag = self.communicator.receiveString(0, 0) data_matrix = cPickle.loads(data) else: data, source, tag = self.communicator.receiveString(0, 0) strain_acc_list = cPickle.loads(data) mpi_synchronize(self.communicator) if node_rank == 0: parameter_list = [no_of_strains] self.input_node(self.communicator, parameter_list, free_computing_nodes, self.message_size, self.report) elif node_rank in free_computing_nodes: trio_arrangement_ls = [ [0, 1, 2], [1, 2, 0], [2, 0, 1], ] # three different ways to pick the parent-set and the child parameter_list = [data_matrix, chr_start_ls, trio_arrangement_ls] computing_node(self.communicator, parameter_list, self.computing_node_handler, report=self.report) else: writer = csv.writer(open(self.output_fname, "w"), delimiter="\t") parameter_list = [writer, strain_acc_list] output_node(self.communicator, free_computing_nodes, parameter_list, self.output_node_handler, self.report) del writer
def run(self): """ 08-06-05 08-24-05 read all edge data into matrix 08-31-05 the integer returned by encodeOccurrenceBv() could be 138-bit(human no_of_datasets) And Numeric.Int is only 32 bit. So Change edge_sig_matrix format. 12-31-05 no database connection any more 2 threads on computing node 01-08-06 no threads back to edge_sig_matrix 01-11-06 use the cc module, PostFim 01-15-06 add min_line_number and max_line_number (rank==0) --get_no_of_datasets() --sendEdgeSigMatrix() elif free_computing_nodes: --PostFim() --receiveEdgeSigMatrix() --mpi_synchronize() (rank==0) --input_node() --input_handler() elif free_computing_nodes: --computing_node() --computing_node_handler() else: --output_node() --output_node_handler() --mpi_synchronize() (rank==0) --receive node_outputfile --netmine_wrapper() --collect_and_merge_output() --uniqueSort() else: --return node_outputfile """ communicator = MPI.world.duplicate() free_computing_nodes = range(1,communicator.size-1) #exclude the 1st and last node block_size = 10000 if communicator.rank == 0: no_of_datasets = self.get_no_of_datasets(self.sig_vector_fname) #no_of_datasets is used in fillEdgeSigMatrix() and patternFormation() for node in free_computing_nodes: communicator.send(str(no_of_datasets), node, 0) self.sendEdgeSigMatrix(communicator, free_computing_nodes, self.sig_vector_fname, \ no_of_datasets, self.min_sup, self.max_sup, block_size=10000) elif communicator.rank in free_computing_nodes: data, source, tag = communicator.receiveString(0, 0) no_of_datasets = int(data) #take the data offset = communicator.rank - 1 node_outputfile = '%s.%s'%(self.outputfile, offset) PostFim_instance = PostFim(self.no_cc, no_of_datasets, self.min_cluster_size, node_outputfile) self.receiveEdgeSigMatrix(communicator, PostFim_instance, no_of_datasets, block_size) mpi_synchronize(communicator) if communicator.rank == 0: reader = csv.reader(open(self.inputfile, 'r'), delimiter=' ') parameter_list = [reader, self.min_line_number, self.max_line_number] #01-15-06 self.line_number = 0 #01-15-06 used in input_handler() input_node(communicator, parameter_list, free_computing_nodes, self.queue_size, \ self.report, input_handler=self.input_handler) del reader elif communicator.rank in free_computing_nodes: parameter_list = [PostFim_instance] computing_node(communicator, parameter_list, self.computing_node_handler, report=self.report) else: parameter_list = [] output_node(communicator, free_computing_nodes, parameter_list, self.output_node_handler, self.report) mpi_synchronize(communicator) if communicator.rank == 0: #12-31-05 wait until of_name_list is full of_name_list = [] while len(of_name_list)<len(free_computing_nodes): data, source, tag = communicator.receiveString(None, 1) of_name_list.append(data) #collecting intermediateFile = '%s.unsorted'%self.outputfile #intermediateFile to store concatenated results netmine_wrapper_instance = netmine_wrapper() netmine_wrapper_instance.collect_and_merge_output(of_name_list, intermediateFile) self.uniqueSort(intermediateFile, self.outputfile, self.tmpdir) elif communicator.rank in free_computing_nodes: communicator.send(node_outputfile, 0, 1) #send back the outputfile
def run(self): """ 2007-04-16 (rank==0) --get_chr_start_ls() elif free_computing_nodes: -- (receive data) --mpi_synchronize() (rank==0) --input_node() --input_handler() elif free_computing_nodes: --computing_node() --computing_node_handler() --identify_ancestry_with_min_jumps() --initialize_score_trace_matrix() --is_child_heterozygous_SNP_compatible_with_parents() (for loop) --identify_ancestry_of_one_chr_with_DP() --is_child_heterozygous_SNP_compatible_with_parents() --trace() --recursive_trace() else: --output_node() --output_node_handler() """ node_rank = self.communicator.rank free_computing_nodes = range(1, self.communicator.size - 1) #exclude the 1st and last node if node_rank == 0: FilterStrainSNPMatrix_instance = FilterStrainSNPMatrix() header, strain_acc_list, category_list, data_matrix = FilterStrainSNPMatrix_instance.read_data( self.input_fname) snp_acc_list = header[2:] data_matrix = Numeric.array(data_matrix) no_of_strains = data_matrix.shape[0] (conn, curs) = db_connect(self.hostname, self.dbname, self.schema, password='******', user='******') #2007-09-17 send strain_acc_list to the output_node strain_acc_list_pickle = cPickle.dumps(strain_acc_list, -1) self.communicator.send(strain_acc_list_pickle, self.communicator.size - 1, 0) chr_start_ls = self.get_chr_start_ls(curs, snp_acc_list, self.snp_locus_table) chr_start_ls_pickle = cPickle.dumps( chr_start_ls, -1) #-1 means use the highest protocol data_matrix_pickle = cPickle.dumps(data_matrix, -1) for node in free_computing_nodes: #send it to the computing_node self.communicator.send(chr_start_ls_pickle, node, 0) self.communicator.send(data_matrix_pickle, node, 0) elif node_rank in free_computing_nodes: data, source, tag = self.communicator.receiveString(0, 0) chr_start_ls = cPickle.loads(data) #take the data data, source, tag = self.communicator.receiveString(0, 0) data_matrix = cPickle.loads(data) else: data, source, tag = self.communicator.receiveString(0, 0) strain_acc_list = cPickle.loads(data) mpi_synchronize(self.communicator) if node_rank == 0: parameter_list = [no_of_strains] self.input_node(self.communicator, parameter_list, free_computing_nodes, self.message_size, \ self.report) elif node_rank in free_computing_nodes: trio_arrangement_ls = [[0, 1, 2], [1, 2, 0], [ 2, 0, 1 ]] #three different ways to pick the parent-set and the child parameter_list = [data_matrix, chr_start_ls, trio_arrangement_ls] computing_node(self.communicator, parameter_list, self.computing_node_handler, report=self.report) else: writer = csv.writer(open(self.output_fname, 'w'), delimiter='\t') parameter_list = [writer, strain_acc_list] output_node(self.communicator, free_computing_nodes, parameter_list, self.output_node_handler, self.report) del writer
def run(self): """ 11-16-05 11-19-05 use no_of_validations to multiply the setting(separate the one setting's validations to different nodes) the extra setting copy is for a non-validation real model fitting --computing_handler() --is_site_confirmed() --get_no_of_mismatches_allowed() --get_no_of_mismatches_for_consensus() --is_good_consensus() --get_no_of_mismatches_for_site() """ communicator = MPI.world.duplicate() node_rank = communicator.rank free_computing_nodes = range(1, communicator.size - 1) #exclude the last node if node_rank == 0: (conn, curs) = db_connect(self.hostname, self.dbname, self.schema) unknown_data, known_data = self.get_data(curs, self.fname, self.filter_type, self.is_correct_type, self.need_cal_hg_p_value) known_data_pickle = cPickle.dumps(known_data, -1) for node in free_computing_nodes: #send it to the computing_node communicator.send(known_data_pickle, node, 0) unknown_data_pickle = cPickle.dumps(unknown_data, -1) for node in free_computing_nodes: #send it to the computing_node communicator.send(unknown_data_pickle, node, 0) elif node_rank in free_computing_nodes: data, source, tag = communicator.receiveString(0, 0) known_data = cPickle.loads(data) #take the data """ #11-19-05 shuffle data to check index_ls = range(len(known_data)) random.shuffle(index_ls) for i in range(len(index_ls)): index_ls[i] = known_data[i] known_data = index_ls """ data, source, tag = communicator.receiveString(0, 0) unknown_data = cPickle.loads(data) #take the data """ #11-19-05 shuffle data to check index_ls = range(len(unknown_data)) random.shuffle(index_ls) for i in range(len(index_ls)): index_ls[i] = unknown_data[i] unknown_data = index_ls """ elif node_rank == communicator.size - 1: writer = csv.writer(open(self.output_file, 'w'), delimiter='\t') #write down the header writer.writerow(['rpart_cp', 'loss_matrix', 'prior_prob', 'type', 'accuracy_avg','accuracy_std', 'no_of_predictions_avg',\ 'no_of_predictions_std', 'no_of_genes_avg', 'no_of_genes_std']) mpi_synchronize(communicator) if node_rank == 0: if self.type == 1: setting_ls = self.form_setting_ls(self.rpart_cp_ls, self.loss_matrix_ls, self.prior_prob_ls, self.no_of_validations) elif self.type == 2: #randomForest replaces rpart_cp_ls with mty_ls, others are ignored later setting_ls = self.form_setting_ls(self.mty_ls, self.loss_matrix_ls, self.prior_prob_ls, self.no_of_validations) else: sys.stderr.write("type %s not supported.\n" % self.type) sys.exit(3) self.input_node(communicator, setting_ls, free_computing_nodes, self.report) elif node_rank in free_computing_nodes: parameter_list = [ unknown_data, known_data, self.training_perc, self.no_of_validations, self.type, self.bit_string ] #03-17-06 add type, bit_string computing_node(communicator, parameter_list, self.computing_handler, report=self.report) elif node_rank == communicator.size - 1: setting2validation_stat = {} setting2unknown_known_acc_ls = {} parameter_list = [ writer, setting2validation_stat, setting2unknown_known_acc_ls, self.no_of_validations ] output_node(communicator, free_computing_nodes, parameter_list, self.output_handler, self.report) #cPickle.dump([setting2validation_stat, setting2unknown_known_acc_ls], open('/home/yuhuang/MpiRpartValidation.setting2result.pickle','w')) #11-23-05 del writer