def run(self):
	"""
	MPI driver with three roles, chosen by rank:
	  - rank 0 (input node): reads gene/GO mappings from the DB, pickles
	    them, sends gene2enc_array to every computing node and the id/go
	    mappings to the output node, then feeds pattern rows to the
	    computing nodes via input_node().
	  - ranks 1..size-2 (computing nodes): receive gene2enc_array and run
	    computing_node() with self.computing_node_handler.
	  - rank size-1 (output node): receives the id/go mappings and writes
	    results (and pictures) via output_node() with
	    self.output_node_handler.
	All ranks meet at mpi_synchronize() before the work phase starts, so
	the send/receive pairing above must stay matched.
	"""
	communicator = MPI.world.duplicate()
	node_rank = communicator.rank
	free_computing_nodes = range(1,communicator.size-1)	#exclude the last node
	if node_rank == 0:
		(conn, curs) = db_connect(self.hostname, self.dbname, self.schema)
		schema_instance = form_schema_tables(self.fname, self.acc_cutoff, self.lm_bit)
		gene_id2no = get_gene_id2gene_no(curs)
		gene2enc_array = self.get_gene2enc_array(self.gim_inputfname, gene_id2no)
		gene2enc_array_pickle = cPickle.dumps(gene2enc_array, -1)	# -1: highest pickle protocol
		gene_no2id = get_gene_no2gene_id(curs)
		gene_no2go_no = get_gene_no2go_no(curs)
		gene_no2id_pickle = cPickle.dumps(gene_no2id, -1)
		gene_no2go_no_pickle = cPickle.dumps(gene_no2go_no, -1)
		for node in free_computing_nodes:	#send it to the computing_node
			communicator.send(gene2enc_array_pickle, node, 0)
		# the two mappings below go only to the output node (last rank)
		communicator.send(gene_no2id_pickle, communicator.size-1, 0)
		communicator.send(gene_no2go_no_pickle, communicator.size-1, 0)
	elif node_rank in free_computing_nodes:
		data, source, tag = communicator.receiveString(0, 0)
		gene2enc_array = cPickle.loads(data)	#take the data
	elif node_rank==communicator.size-1:
		# output node builds its own schema_instance; receives the two
		# mappings in the same order rank 0 sent them
		schema_instance = form_schema_tables(self.fname, self.acc_cutoff, self.lm_bit)
		data, source, tag = communicator.receiveString(0, 0)
		gene_no2id = cPickle.loads(data)
		data, source, tag = communicator.receiveString(0, 0)
		gene_no2go_no = cPickle.loads(data)

	mpi_synchronize(communicator)

	if node_rank == 0:
		# cursor over patterns joined with their good clusters; rows are
		# streamed to computing nodes by input_node()
		curs.execute("DECLARE crs CURSOR FOR SELECT p.id, p.vertex_set, p.edge_set, p.recurrence_array,\
			g.go_no_list from %s p, %s g where g.mcl_id=p.id"%(schema_instance.pattern_table, schema_instance.good_cluster_table))
		input_node(communicator, curs, free_computing_nodes, self.message_size, self.report)
	elif node_rank in free_computing_nodes:
		parameter_list = [gene2enc_array, self.dataset_signature_set, self.p_value_cut_off]
		computing_node(communicator, parameter_list, self.computing_node_handler, report=self.report)
	elif node_rank==communicator.size-1:
		if not os.path.isdir(self.pic_output_dir):
			os.makedirs(self.pic_output_dir)
		cluster_info_instance = cluster_info()
		ofname = os.path.join(self.pic_output_dir, '%s_p%s'%(schema_instance.good_cluster_table, self.p_value_cut_off))
		writer = csv.writer(open(ofname, 'w'), delimiter='\t')
		parameter_list = [self.pic_output_dir, cluster_info_instance, gene_no2id, gene_no2go_no, writer]
		output_node(communicator, free_computing_nodes, parameter_list, self.output_node_handler, self.report)
		del writer
def run(self):
	"""
	Fit an rpart model on known predictions and record the predictions
	for all data into a fresh set of schema tables.

	11-09-05
	11-09-05 add rpart_cp
	11-10-05 add need_cal_hg_p_value
	--db_connect()
	--form_schema_tables()
	--form_schema_tables()
	--get_no_of_total_genes()
	--get_go_no2gene_no_set()
	--data_fetch()
		--get_vertex_list()
		--cal_hg_p_value()
	--rpart_fit_and_predict()
	--MpiPredictionFilter_instance....()
	--record_data()
	"""
	(conn, curs) = db_connect(self.hostname, self.dbname, self.schema)
	old_schema_instance = form_schema_tables(self.fname1)
	new_schema_instance = form_schema_tables(self.fname2)
	no_of_total_genes = get_no_of_total_genes(curs)
	go_no2gene_no_set = get_go_no2gene_no_set(curs)
	# BUGFIX: need_cal_hg_p_value was a bare name here (NameError unless a
	# same-named global happens to exist); every other option in this call
	# is read off self, so use the instance attribute.
	prediction_ls, all_data, known_data = self.data_fetch(curs, old_schema_instance, self.filter_type, self.is_correct_type, \
		no_of_total_genes, go_no2gene_no_set, self.need_cal_hg_p_value)
	pred, pred_training = self.rpart_fit_and_predict(all_data, known_data, self.rpart_cp, self.loss_matrix, self.prior_prob)
	# mirror the old splat/mcl/pattern tables as views under the new names,
	# create a fresh p_gene table, then store the predictions into it
	MpiPredictionFilter_instance = MpiPredictionFilter()
	MpiPredictionFilter_instance.view_from_table(curs, old_schema_instance.splat_table, new_schema_instance.splat_table)
	MpiPredictionFilter_instance.view_from_table(curs, old_schema_instance.mcl_table, new_schema_instance.mcl_table)
	MpiPredictionFilter_instance.view_from_table(curs, old_schema_instance.pattern_table, new_schema_instance.pattern_table)
	MpiPredictionFilter_instance.createGeneTable(curs, new_schema_instance.p_gene_table)
	self.record_data(curs, MpiPredictionFilter_instance, prediction_ls, pred, new_schema_instance)
	if self.commit:
		curs.execute("end")
def get_known_data(self, curs, fname, filter_type, is_correct_type, need_cal_hg_p_value):
	"""
	Return only the known-label portion of the data fetched for fname.

	Builds the schema-table bundle for fname, gathers the gene totals and
	GO-to-gene mapping that data_fetch() needs, and discards everything
	data_fetch() returns except known_data.
	"""
	schema_instance = form_schema_tables(fname)
	total_gene_count = get_no_of_total_genes(curs)
	go2gene_set = get_go_no2gene_no_set(curs)
	prediction_ls, all_data, known_data = self.data_fetch(curs, schema_instance, filter_type,
		is_correct_type, total_gene_count, go2gene_set, need_cal_hg_p_value)
	# release the parts callers do not need
	del prediction_ls, all_data
	return known_data
def run(self):
	"""
	Export the good-cluster table to darwin format.

	Requires ofname, acc_cut_off and lm_bit to be set (they identify the
	schema tables); exits with status 2 otherwise.
	"""
	# guard clause: all three pieces are needed to name the schema tables
	if not (self.ofname and self.acc_cut_off and self.lm_bit):
		sys.stderr.write("ofname: %s and acc_cut_off: %s and lm_bit %s, NOT VALID\n"%(self.ofname, self.acc_cut_off, self.lm_bit))
		sys.exit(2)
	schema_instance = form_schema_tables(self.ofname, self.acc_cut_off, self.lm_bit)
	conn, curs = db_connect(self.hostname, self.dbname, self.schema)
	self._cluster_darwin_format(curs, schema_instance.good_cluster_table,
		self.gene_no2id, self.go_no2id, self.output_fname)
	del conn, curs
def run(self):
	"""
	01-24-06

	Parse the stat file into predictions, look up the properties of every
	cluster those predictions reference, and submit the matched-up
	predictions to the database.
	"""
	conn, curs = db_connect(self.hostname, self.dbname, self.schema)
	schema_instance = form_schema_tables(self.jnput_fname)
	# predictions keyed by (gene, function) pair, filtered per filter_type
	prediction_pair2instance = self.parse_stat_fname(self.stat_fname, self.filter_type)
	cluster_id_set = self.get_cluster_id_set(prediction_pair2instance)
	cluster_id2properties = self.parse_cluster_fname(curs, self.inputfile, self.gim_inputfname,
		cluster_id_set, schema_instance)
	self.submit_predictions(curs, schema_instance, prediction_pair2instance, cluster_id2properties)
	if self.commit:
		curs.execute("end")
def get_data(self, curs, fname, filter_type, is_correct_type, need_cal_hg_p_value):
	"""
	Fetch the unknown and known data for fname, dropping the prediction lists.

	11-19-05 data_fetch() of rpart_prediction.py changed
		return unknown_data
	"""
	schema_instance = form_schema_tables(fname)
	gene_total = get_no_of_total_genes(curs)
	go2genes = get_go_no2gene_no_set(curs)
	fetch_result = self.data_fetch(curs, schema_instance, filter_type, is_correct_type,
		gene_total, go2genes, need_cal_hg_p_value)
	# data_fetch() returns (unknown_prediction_ls, known_prediction_ls,
	# unknown_data, known_data); only the last two are needed here
	unknown_data, known_data = fetch_result[2], fetch_result[3]
	return unknown_data, known_data
def run(self):
	"""
	Compare the predictions of two settings (schema_instance1 vs 2).

	Partitions p_gene ids into four categories (only-in-1, in-both,
	only-in-2, in-neither), samples each category, writes the linear-model
	coefficients of whichever settings have one (per self.bit), and then
	dumps the sampled predictions per category, drawing graphs under
	cat1..cat4 directories.

	10-17-05
		bit control whether that setting has linear model
	"""
	schema_instance1 = form_schema_tables(self.fname1, self.acc_cutoff1, self.lm_bit1)
	schema_instance2 = form_schema_tables(self.fname2, self.acc_cutoff2, self.lm_bit2)
	(conn, curs) = db_connect(self.hostname, self.dbname, self.schema)
	p_gene_id_set1 = p_gene_id_set_from_gene_p_table(curs, schema_instance1.gene_p_table)
	p_gene_id_set2 = p_gene_id_set_from_gene_p_table(curs, schema_instance2.gene_p_table)
	# NOTE(review): the "total" set is read from schema_instance2's p_gene
	# table — presumably both settings share the same underlying p_gene
	# population; confirm against how fname1/fname2 are produced.
	p_gene_id_set_total = p_gene_id_set_from_gene_p_table(curs, schema_instance2.p_gene_table)
	catI_set = p_gene_id_set1 - p_gene_id_set2	# only in setting 1
	catII_set = p_gene_id_set1 & p_gene_id_set2	# in both
	catIII_set = p_gene_id_set2 - p_gene_id_set1	# only in setting 2
	catIV_set = p_gene_id_set_total-(p_gene_id_set1|p_gene_id_set2)	# in neither
	sample_ls_ls = []
	for p_gene_id_set in [catI_set, catII_set, catIII_set, catIV_set]:
		sample_ls_ls.append(self.sample_p_gene_id_set(p_gene_id_set, self.no_of_samples))
	writer = csv.writer(open(self.ofname, 'w'), delimiter = '\t')
	writer.writerow(['linear model coeffs of two settings'])
	writer.writerow([])
	writer.writerow(['No.','intercept', 'coeff1', 'coeff2', 'coeff3', 'coeff4', 'coeff5', 'intercept_p_value',\
		'coeff1_p_value', 'coeff2_p_value', 'coeff3_p_value', 'coeff4_p_value', 'coeff5_p_value',\
		'score_cut_off'])
	#fetch linear model coefficients
	pga_instance_list = [None, None]	#10-17-05 default is nothing, none of them have linear model
	if self.bit[0] == '1':
		pga_instance1 = p_gene_analysis()
		pga_instance1.go_no2lm_results, lm_results_2d_list = pga_instance1.get_go_no2lm_results(curs, schema_instance1.lm_table)
		pga_instance1.general_lm_results = pga_instance1.get_general_lm_results(lm_results_2d_list)
		pga_instance_list[0] = pga_instance1
		self.output_lm_model(curs, schema_instance1, writer)
	if self.bit[1] == '1':
		pga_instance2 = p_gene_analysis()
		pga_instance2.go_no2lm_results, lm_results_2d_list = pga_instance2.get_go_no2lm_results(curs, schema_instance2.lm_table)
		pga_instance2.general_lm_results = pga_instance2.get_general_lm_results(lm_results_2d_list)
		pga_instance_list[1] = pga_instance2
		self.output_lm_model(curs, schema_instance2, writer)
	#following is for drawing graph in output_p_gene_id_list()
	self.gene_no2gene_id = get_gene_no2gene_id(curs)
	self.gene_no2go_no = get_gene_no2go_no(curs)
	cluster_info_instance = cluster_info()
	for i in range(len(sample_ls_ls)):
		cat_no = i+1
		sys.stderr.write("Category %s...\n"%cat_no)
		writer.writerow(['Category %s'%cat_no])
		writer.writerow([self.category_no2information[cat_no]])
		cat_dir = 'cat%s'%cat_no	# per-category output directory for graphs
		if not os.path.isdir(cat_dir):
			os.makedirs(cat_dir)
		if i==0:	#this is different, prediction only in schema_instance1, so swap it
			self.output_p_gene_id_list(curs, schema_instance2, schema_instance1, sample_ls_ls[i], writer, cat_dir, \
				pga_instance_list[1], pga_instance_list[0], cluster_info_instance, self.simple)
		else:
			self.output_p_gene_id_list(curs, schema_instance1, schema_instance2, sample_ls_ls[i], writer, cat_dir, \
				pga_instance_list[0], pga_instance_list[1], cluster_info_instance, self.simple)
		sys.stderr.write("End Category %s.\n"%cat_no)
def run(self): """ 11-09-05 11-09-05 add rpart_cp 11-10-05 add need_cal_hg_p_value 11-23-05 rpart_fit_and_predict() is split 2006-12-05 add need_output_data_for_R flag --db_connect() --form_schema_tables() --form_schema_tables() --get_no_of_total_genes() --get_go_no2gene_no_set() --data_fetch() --get_vertex_list() --cal_hg_p_value() --output_data_for_R() --rpart_fit() --rpart_predict() --rpart_predict() --MpiPredictionFilter_instance....() --record_data() """ (conn, curs) = db_connect(self.hostname, self.dbname, self.schema) old_schema_instance = form_schema_tables(self.fname1) new_schema_instance = form_schema_tables(self.fname2) no_of_total_genes = get_no_of_total_genes(curs) go_no2gene_no_set = get_go_no2gene_no_set(curs) unknown_prediction_ls, known_prediction_ls, unknown_data, known_data = self.data_fetch( curs, old_schema_instance, self.filter_type, self.is_correct_type, no_of_total_genes, go_no2gene_no_set, need_cal_hg_p_value, ) if self.need_output_data_for_R: # 2006-12-05 self.output_data_for_R(known_data, "%s.known" % self.fname1) self.output_data_for_R(unknown_data, "%s.unknown" % self.fname1) """ testing_acc_ls, training_acc_ls = self.rpart_validation(known_data, self.training_perc, self.rpart_cp, \ self.loss_matrix, self.prior_prob) print testing_acc_ls print training_acc_ls """ fit_model = self.fit_function_dict[self.type](known_data, self.parameter_list_dict[self.type], self.bit_string) known_pred = self.predict_function_dict[self.type](fit_model, known_data) unknown_pred = self.predict_function_dict[self.type](fit_model, unknown_data) if self.debug: if self.type == 2: # randomForest's model has its own oob prediction fit_model_py = fit_model.as_py(BASIC_CONVERSION) print self.cal_accuracy(known_data, fit_model_py["predicted"], pred_type=1) print self.cal_accuracy(known_data, known_pred, pred_type=self.type) print self.cal_accuracy(unknown_data, unknown_pred, pred_type=self.type) if self.commit: MpiPredictionFilter_instance = MpiPredictionFilter() 
MpiPredictionFilter_instance.view_from_table( curs, old_schema_instance.splat_table, new_schema_instance.splat_table ) MpiPredictionFilter_instance.view_from_table( curs, old_schema_instance.mcl_table, new_schema_instance.mcl_table ) MpiPredictionFilter_instance.view_from_table( curs, old_schema_instance.pattern_table, new_schema_instance.pattern_table ) MpiPredictionFilter_instance.createGeneTable(curs, new_schema_instance.p_gene_table) self.record_data( curs, MpiPredictionFilter_instance, unknown_prediction_ls, unknown_pred, new_schema_instance, pred_type=self.type, ) if ( self.type == 2 ): # 2006-10-31 randomForest's model has its own oob prediction, but use rpart's way of storing prediction fit_model_py = fit_model.as_py(BASIC_CONVERSION) known_pred = fit_model_py["predicted"] self.record_data( curs, MpiPredictionFilter_instance, known_prediction_ls, known_pred, new_schema_instance, pred_type=1, ) else: self.record_data( curs, MpiPredictionFilter_instance, known_prediction_ls, known_pred, new_schema_instance, pred_type=self.type, ) curs.execute("end")
def run(self):
	"""
	Draw three maps as PNGs under self.output_prefix: a function map,
	a gene-function map, and a TF map.

	10-31-05
	2006-09-26 modify it to be compatible with the modified pipeline from haifeng
	2006-11-06 add type
	2006-12-13 use font_path and font_size
	--form_schema_tables()
	--db_connect()
	--get_char_dimension()
	--get_no_of_p_funcs_gene_no_go_no_list()
	--get_recurrence_go_no_rec_array_cluster_id_ls()
	--get_go_no2name()
	--draw_function_map()
	--draw_gene_function_map()
	--get_recurrence_rec_array_bs_no_list()
	--get_mt_no2tf_name()
	--draw_tf_map()
	"""
	schema_instance = form_schema_tables(self.inputfname, self.acc_cutoff, self.lm_bit)
	(conn, curs) = db_connect(self.hostname, self.dbname, self.schema)
	font = ImageFont.truetype(self.font_path, self.font_size)
	# width/height of one character, used to size the map cells
	char_dimension = font.getsize('a')
	#char_dimension = get_char_dimension()
	#go_no2name = get_go_no2name(curs)
	go_no2name = get_go_id2name(curs)
	# type 1: read predicted functions from the DB; type 2: from a file
	if self.type==1:
		go_no2go_id = get_go_no2go_id(curs)
		given_p_gene_set = p_gene_id_set_from_gene_p_table(curs, schema_instance.gene_p_table)
		no_of_p_funcs_gene_no_go_no_list, mcl_id2go_no_set = self.get_no_of_p_funcs_gene_no_go_no_list_from_db(curs, \
			schema_instance.p_gene_table, given_p_gene_set, go_no2go_id)
	elif self.type==2:
		no_of_p_funcs_gene_no_go_no_list, mcl_id2go_no_set = self.get_no_of_p_funcs_gene_no_go_no_list_from_file(self.inputfname)
	recurrence_go_no_rec_array_cluster_id_ls, no_of_datasets, mcl_id2enc_recurrence = \
		self.get_recurrence_go_no_rec_array_cluster_id_ls(curs, self.pattern_table, mcl_id2go_no_set)
	no_of_functions = len(recurrence_go_no_rec_array_cluster_id_ls)
	function_map_output_fname = '%s.function_map.png'%self.output_prefix
	# draw_function_map also returns the column index per GO no and the
	# name-region geometry reused by the gene-function map below
	go_no2index, function_name_region = self.draw_function_map(recurrence_go_no_rec_array_cluster_id_ls, no_of_datasets,\
		go_no2name, function_map_output_fname, self.function_name_length, char_dimension, no_of_functions, font)
	gene_function_map_output_fname = '%s.gene_function_map.png'%self.output_prefix
	self.draw_gene_function_map(no_of_p_funcs_gene_no_go_no_list, go_no2index, function_name_region,\
		gene_function_map_output_fname, self.function_name_length, char_dimension, no_of_functions, font)
	#tf_map requires mcl_id2enc_recurrence and no_of_datasets from above
	recurrence_rec_array_bs_no_list = self.get_recurrence_rec_array_bs_no_list(curs, self.cluster_bs_table, mcl_id2enc_recurrence)
	# tax_id=9606 is human; TF names come from the gene-symbol table
	mt_no2tf_name = get_gene_id2gene_symbol(curs, tax_id=9606)
	#mt_no2tf_name = get_mt_no2tf_name()
	tf_map_output_fname = '%s.tf_map.png'%self.output_prefix
	self.draw_tf_map(recurrence_rec_array_bs_no_list, no_of_datasets, mt_no2tf_name, \
		tf_map_output_fname, self.function_name_length, char_dimension, font)
def run(self):
	"""
	10-05-05
	10-12-05 use max_layer to control whether to turn on the gradient or not
	10-16-05 transformed to MPI version

	if node_rank==0
		--db_connect()
		--form_schema_tables()
		--form_schema_tables()
		--get_gene_no2go_no_set()
		--get_mcl_id2accuracy()
	elif computing_node:
		(prepare data)
	elif output_node:
		--db_connect()
		--form_schema_tables()
		--form_schema_tables()
		--view_from_table()
		--view_from_table()
		--view_from_table()
		--createGeneTable()
	--mpi_synchronize()
	if input_node:
		--input_node()
			--fetch_predictions()
	elif computing_node:
		--computing_node()
			--node_fire()
				--gradient_class()
	elif output_node:
		--output_node()
			--output_node_handler()
				--submit_to_p_gene_table()
	"""
	communicator = MPI.world.duplicate()
	node_rank = communicator.rank
	if node_rank == 0:
		# input node: build the cursor SQL and broadcast the pickled
		# gene_no2go and mcl_id2accuracy mappings to all computing nodes
		(conn, curs) = db_connect(self.hostname, self.dbname, self.schema)
		old_schema_instance = form_schema_tables(self.input_fname)
		new_schema_instance = form_schema_tables(self.jnput_fname)
		gene_no2go = get_gene_no2go_no_set(curs)
		gene_no2go_pickle = cPickle.dumps(gene_no2go, -1)	#-1 means use the highest protocol
		if self.max_layer:
			# gradient on: join the pattern table to pull the real
			# vertex/edge/d_matrix/recurrence columns
			crs_sentence = 'DECLARE crs CURSOR FOR SELECT p.p_gene_id, p.gene_no, p.go_no, p.is_correct, p.is_correct_l1, \
				p.is_correct_lca, p.avg_p_value, p.no_of_clusters, p.cluster_array, p.p_value_cut_off, p.recurrence_cut_off, \
				p.connectivity_cut_off, p.cluster_size_cut_off, p.unknown_cut_off, p.depth_cut_off, p.mcl_id, p.lca_list, \
				p.vertex_gradient, p.edge_gradient, p2.vertex_set, p2.edge_set, p2.d_matrix, p2.recurrence_array from %s p, %s p2 where \
				p.mcl_id=p2.id'%(old_schema_instance.p_gene_table, old_schema_instance.pattern_table)
		else:
			crs_sentence = "DECLARE crs CURSOR FOR SELECT p.p_gene_id, p.gene_no, p.go_no, p.is_correct, p.is_correct_l1, \
				p.is_correct_lca, p.avg_p_value, p.no_of_clusters, p.cluster_array, p.p_value_cut_off, p.recurrence_cut_off, \
				p.connectivity_cut_off, p.cluster_size_cut_off, p.unknown_cut_off, p.depth_cut_off, p.mcl_id, p.lca_list, p.vertex_gradient,\
				p.edge_gradient, 'vertex_set', 'edge_set', 'd_matrix', 'recurrence_array' \
				from %s p"%(old_schema_instance.p_gene_table)
			#some placeholders 'vertex_set', 'edge_set', 'd_matrix' for prediction_attributes()
		if self.acc_cut_off:
			mcl_id2accuracy = self.get_mcl_id2accuracy(curs, old_schema_instance.p_gene_table, crs_sentence, self.is_correct_type)
		else:
			mcl_id2accuracy = None	# no accuracy filtering requested
		mcl_id2accuracy_pickle = cPickle.dumps(mcl_id2accuracy, -1)	#-1 means use the highest protocol
		for node in range(1, communicator.size-1):	#send it to the computing_node
			communicator.send(gene_no2go_pickle, node, 0)
		for node in range(1, communicator.size-1):	#send it to the computing_node
			communicator.send(mcl_id2accuracy_pickle, node, 0)
	elif node_rank<=communicator.size-2:	#exclude the last node
		# computing node: receive the two pickles in the same order they
		# were sent by rank 0
		data, source, tag = communicator.receiveString(0, 0)
		gene_no2go = cPickle.loads(data)	#take the data
		data, source, tag = communicator.receiveString(0, 0)
		mcl_id2accuracy = cPickle.loads(data)	#take the data
		#choose a functor for recurrence_array
		functor_dict = {0: None,
			1: lambda x: int(x>=self.recurrence_x),
			2: lambda x: math.pow(x, self.recurrence_x)}
		functor = functor_dict[self.recurrence_x_type]
	elif node_rank==communicator.size-1:
		# output node: prepare the destination tables (views of the old
		# splat/mcl/pattern tables plus a fresh p_gene table)
		(conn, curs) = db_connect(self.hostname, self.dbname, self.schema)
		old_schema_instance = form_schema_tables(self.input_fname)
		new_schema_instance = form_schema_tables(self.jnput_fname)
		self.view_from_table(curs, old_schema_instance.splat_table, new_schema_instance.splat_table)
		self.view_from_table(curs, old_schema_instance.mcl_table, new_schema_instance.mcl_table)
		self.view_from_table(curs, old_schema_instance.pattern_table, new_schema_instance.pattern_table)
		self.createGeneTable(curs, new_schema_instance.p_gene_table)

	mpi_synchronize(communicator)

	if node_rank == 0:
		self.input_node(communicator, curs, old_schema_instance, crs_sentence, self.size)
	elif node_rank<=communicator.size-2:	#exclude the last node
		self.computing_node(communicator, gene_no2go, self.exponent, self.score_list, \
			self.max_layer, self.norm_exp, self.eg_d_type, mcl_id2accuracy, self.acc_cut_off, functor)
	elif node_rank==communicator.size-1:
		parameter_list = [curs, new_schema_instance.p_gene_table]
		free_computing_nodes = range(1,communicator.size-1)
		output_node(communicator, free_computing_nodes, parameter_list, self.output_node_handler)
		if self.commit:
			curs.execute("end")