def submit_predictions(self, curs, schema_instance, prediction_pair2instance, cluster_id2properties):
    """
    Fill in the cluster-derived attributes of every prediction and insert
    it into schema_instance.p_gene_table.

    curs: database cursor handed to the table-creation, lookup and
        insertion helpers.
    schema_instance: object whose p_gene_table attribute names the
        destination table.
    prediction_pair2instance: mapping whose values are the prediction
        objects to submit (the keys are not used here).
    cluster_id2properties: mapping from mcl_id to an indexable record;
        index 0 becomes connectivity_cut_off, index 1 becomes
        unknown_cut_off and index 2 is the cluster's vertex set.

    Progress is written to stderr every 2000 predictions and once more at
    the end when self.report is true.
    """
    sys.stderr.write("Submitting predictions...\n")

    prediction_filter = MpiPredictionFilter()
    p_gene_table = schema_instance.p_gene_table
    prediction_filter.createGeneTable(curs, p_gene_table)

    total_gene_count = get_no_of_total_genes(curs)
    go_no2gene_no_set = get_go_no2gene_no_set(curs)

    # Backspaces rewind the terminal so the running count overwrites itself.
    rewind = '\x08' * 20
    submitted = 0
    for _pair, prediction in prediction_pair2instance.iteritems():
        cluster_properties = cluster_id2properties[prediction.mcl_id]
        cluster_vertices = cluster_properties[2]

        # The same p-value is stored as both the cut-off and the average.
        hg_p_value = cal_hg_p_value(
            prediction.gene_no,
            prediction.go_no,
            cluster_vertices,
            total_gene_count,
            go_no2gene_no_set,
            r,
        )
        prediction.p_value_cut_off = hg_p_value
        prediction.avg_p_value = hg_p_value
        prediction.connectivity_cut_off = cluster_properties[0]
        prediction.cluster_size_cut_off = len(cluster_vertices)
        prediction.unknown_cut_off = cluster_properties[1]

        prediction_filter.submit_to_p_gene_table(curs, p_gene_table, prediction)

        submitted += 1
        if self.report and submitted % 2000 == 0:
            sys.stderr.write("%s%s" % (rewind, submitted))

    if self.report:
        sys.stderr.write("%s%s" % (rewind, submitted))
    sys.stderr.write("Done.\n")
def data_fetch(
    self,
    curs,
    schema_instance,
    filter_type,
    is_correct_type,
    no_of_total_genes,
    go_no2gene_no_set,
    need_cal_hg_p_value=0,
):
    """
    Read every prediction from schema_instance.p_gene_table, keep one
    prediction per (gene_no, go_no) pair and split the survivors into
    known and unknown predictions.

    11-09-05
        1st get the data from p_gene_table and remove redundancy given filter_type
        2nd transform the data to three lists
    11-10-05 add a chunk of code to get hg p-value(leave one out) for the prediction
        mcl_id2vertex_list might blow the memory.(?)
    11-19-05 separate predictions totally into known and unknown
    2006-10-30, add avg_degree(vertex_gradient) and unknown_cut_off

    curs: database cursor; a server-side cursor named "crs" is declared,
        read in batches of 10000 rows and closed again.
    schema_instance: object whose p_gene_table attribute names the table.
    filter_type: how to choose between redundant predictions of one pair;
        the prediction with the strictly larger score wins:
            1 - recurrence_cut_off
            2 - edge_gradient
            3 - recurrence_cut_off + edge_gradient
    is_correct_type: key into each prediction's is_correct_dict.
    no_of_total_genes, go_no2gene_no_set: forwarded to cal_hg_p_value.
    need_cal_hg_p_value: when true, p_value_cut_off of every kept
        prediction is overwritten with the result of cal_hg_p_value.

    Returns (unknown_prediction_ls, known_prediction_ls, unknown_data,
    known_data). A prediction is "unknown" when its is_correct value is -1.
    Each data row is [p_value_cut_off, recurrence_cut_off,
    connectivity_cut_off, cluster_size_cut_off, edge_gradient,
    vertex_gradient, unknown_cut_off, gene_no, go_no, is_correct].

    Raises ValueError when redundant predictions are met and filter_type
    is not 1, 2 or 3.
    """

    def redundancy_score(instance):
        # Score used to pick the winner among predictions of the same pair.
        if filter_type == 1:
            return instance.recurrence_cut_off
        if filter_type == 2:
            return instance.edge_gradient
        if filter_type == 3:
            return instance.recurrence_cut_off + instance.edge_gradient
        raise ValueError(
            "Unsupported filter_type: %r (expected 1, 2 or 3)" % (filter_type,)
        )

    sys.stderr.write("Fetching data from old p_gene_table...\n")
    prediction_pair2instance = {}
    # The column order must match what prediction_attributes(row, type=2) expects.
    curs.execute(
        "DECLARE crs CURSOR FOR SELECT p.p_gene_id, p.gene_no, p.go_no, "
        "p.is_correct, p.is_correct_l1, p.is_correct_lca, p.avg_p_value, "
        "p.no_of_clusters, p.cluster_array, p.p_value_cut_off, "
        "p.recurrence_cut_off, p.connectivity_cut_off, p.cluster_size_cut_off, "
        "p.unknown_cut_off, p.depth_cut_off, p.mcl_id, p.lca_list, "
        "p.vertex_gradient, p.edge_gradient from %s p"
        % (schema_instance.p_gene_table,)
    )
    curs.execute("fetch 10000 from crs")
    rows = curs.fetchall()
    counter = 0  # rows read
    real_counter = 0  # distinct (gene_no, go_no) pairs seen
    while rows:
        for row in rows:
            p_attr_instance = prediction_attributes(row, type=2)
            prediction_pair = (p_attr_instance.gene_no, p_attr_instance.go_no)
            if prediction_pair not in prediction_pair2instance:
                prediction_pair2instance[prediction_pair] = p_attr_instance
                real_counter += 1
            else:
                # remove redundancy: replace only on a strictly better score
                current = prediction_pair2instance[prediction_pair]
                if redundancy_score(p_attr_instance) > redundancy_score(current):
                    prediction_pair2instance[prediction_pair] = p_attr_instance
            counter += 1
        if self.report:
            sys.stderr.write("%s%s/%s" % ("\x08" * 20, counter, real_counter))
        curs.execute("fetch 10000 from crs")
        rows = curs.fetchall()
    # Release the server-side cursor so the name "crs" can be declared again
    # within the same transaction.
    curs.execute("close crs")

    unknown_prediction_ls = []
    known_prediction_ls = []
    unknown_data = []  # 11-19-05
    known_data = []
    for p_attr_instance in prediction_pair2instance.values():
        # 11-10-05
        if need_cal_hg_p_value:
            # Looked up per prediction and not retained; caching by mcl_id
            # would trade memory for fewer lookups.
            vertex_list = self.get_vertex_list(
                curs, schema_instance, p_attr_instance.mcl_id
            )
            p_attr_instance.p_value_cut_off = cal_hg_p_value(
                p_attr_instance.gene_no,
                p_attr_instance.go_no,
                vertex_list,
                no_of_total_genes,
                go_no2gene_no_set,
                r,
            )
        is_correct = p_attr_instance.is_correct_dict[is_correct_type]
        # 2006-10-30, add avg_degree(vertex_gradient) and unknown_cut_off
        data_row = [
            p_attr_instance.p_value_cut_off,
            p_attr_instance.recurrence_cut_off,
            p_attr_instance.connectivity_cut_off,
            p_attr_instance.cluster_size_cut_off,
            p_attr_instance.edge_gradient,
            p_attr_instance.vertex_gradient,
            p_attr_instance.unknown_cut_off,
            p_attr_instance.gene_no,
            p_attr_instance.go_no,
            is_correct,
        ]
        if is_correct != -1:
            known_data.append(data_row)  # to do fitting
            known_prediction_ls.append(p_attr_instance)
        else:
            unknown_data.append(data_row)
            unknown_prediction_ls.append(p_attr_instance)
    sys.stderr.write("Done fetching data.\n")
    return unknown_prediction_ls, known_prediction_ls, unknown_data, known_data