def parse_stat_fname(self, stat_fname, filter_type):
    sys.stderr.write("Parsing stat_fname: %s ...\n"%os.path.basename(stat_fname))
    reader = csv.reader(open(stat_fname), delimiter='\t')
    prediction_pair2instance = {}
    #following are temporary default values for columns absent from the stat file
    p_gene_id = -2    #-2 means no database submission for p_gene_id
    avg_p_value = -1
    no_of_clusters = 1
    p_value_cut_off = -1
    connectivity_cut_off = -1
    cluster_size_cut_off = -1
    unknown_cut_off = -1
    vertex_gradient = -1
    counter = 0
    real_counter = 0
    for row in reader:
        cluster_id, gene_no, go_no, go_no_depth, recurrence, gradient_score, edge_gradient, \
            is_correct, is_correct_L1, is_correct_lca, lca_list_string = row
        cluster_id = int(cluster_id)
        gene_no = int(gene_no)
        go_no = int(go_no)
        go_no_depth = int(go_no_depth)
        recurrence = float(recurrence)
        edge_gradient = float(edge_gradient)    #gradient_score is read from the row but not used
        is_correct = int(is_correct)
        is_correct_L1 = int(is_correct_L1)
        is_correct_lca = int(is_correct_lca)
        cluster_array = '{%s}'%cluster_id
        new_row = [p_gene_id, gene_no, go_no, is_correct, is_correct_L1, \
            is_correct_lca, avg_p_value, no_of_clusters, cluster_array, p_value_cut_off, recurrence, \
            connectivity_cut_off, cluster_size_cut_off, unknown_cut_off, go_no_depth, cluster_id, \
            lca_list_string, vertex_gradient, edge_gradient]
        p_attr_instance = prediction_attributes(new_row, type=2)
        prediction_pair = (gene_no, go_no)
        if prediction_pair not in prediction_pair2instance:
            prediction_pair2instance[prediction_pair] = p_attr_instance
            real_counter += 1
        else:    #remove redundancy, keep the prediction with the larger comparison value
            if filter_type==1:
                new_cmp_value = p_attr_instance.recurrence_cut_off
                old_cmp_value = prediction_pair2instance[prediction_pair].recurrence_cut_off
            elif filter_type==2:
                new_cmp_value = p_attr_instance.edge_gradient
                old_cmp_value = prediction_pair2instance[prediction_pair].edge_gradient
            elif filter_type==3:
                new_cmp_value = p_attr_instance.recurrence_cut_off + p_attr_instance.edge_gradient
                old_cmp_value = prediction_pair2instance[prediction_pair].recurrence_cut_off + \
                    prediction_pair2instance[prediction_pair].edge_gradient
            if new_cmp_value>old_cmp_value:    #filter_type is assumed to be 1, 2 or 3
                prediction_pair2instance[prediction_pair] = p_attr_instance
        counter += 1
        if self.report and counter%10000==0:
            sys.stderr.write("%s%s/%s"%('\x08'*20, counter, real_counter))
    if self.report:
        sys.stderr.write("%s%s/%s"%('\x08'*20, counter, real_counter))
    del reader
    sys.stderr.write("Done.\n")
    return prediction_pair2instance
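#Illustrative sketch, not part of the original module: the redundancy removal in
#parse_stat_fname() keeps, for each (gene_no, go_no) pair, the prediction with the
#larger comparison value -- recurrence (filter_type=1), edge_gradient (filter_type=2),
#or their sum (filter_type=3).  The hypothetical helpers below restate that rule;
#they only assume the objects expose recurrence_cut_off and edge_gradient, as
#prediction_attributes instances do.
def _prediction_cmp_value(p, filter_type):
    #comparison value used for redundancy removal
    if filter_type == 1:
        return p.recurrence_cut_off
    elif filter_type == 2:
        return p.edge_gradient
    else:    #filter_type==3 is the only other value the callers are expected to pass
        return p.recurrence_cut_off + p.edge_gradient

def _pick_better_prediction(old_instance, new_instance, filter_type):
    #keep whichever prediction has the larger comparison value (ties keep the old one)
    if _prediction_cmp_value(new_instance, filter_type) > _prediction_cmp_value(old_instance, filter_type):
        return new_instance
    return old_instance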
def output_p_gene_id_list(self, curs, schema_instance1, schema_instance2, p_gene_id_list, writer, pic_output_dir,\
    pga_instance1, pga_instance2, cluster_info_instance, simple):
    """
    10-15-05
        add score1 and is_accepted1
    10-17-05
        score and is_accepted depend on whether pga_instance is None or not
    10-17-05
        add simple to allow no graph pictures output
        also get prediction from schema_instance1 and calculate the score if prediction is available
    10-18-05
        sort the p_gene_id_list first
    """
    #10-15-05 following sentence is slightly different from PredictionFilterByClusterSize.py in the trailing edge_gradient
    #and d_matrix is a placeholder
    sql_sentence1 = "SELECT p.p_gene_id, p.gene_no, p.go_no, p.is_correct, p.is_correct_l1, \
        p.is_correct_lca, p.avg_p_value, p.no_of_clusters, p.cluster_array, p.p_value_cut_off, p.recurrence_cut_off, \
        p.connectivity_cut_off, p.cluster_size_cut_off, p.unknown_cut_off, p.depth_cut_off, p.mcl_id, p.lca_list, \
        p.vertex_gradient, p.edge_gradient, m.vertex_set, s.edge_set, 'd_matrix', 'r' from %s p, %s s, %s m where \
        p.mcl_id=s.splat_id and p.mcl_id=m.mcl_id"%(schema_instance1.p_gene_table, \
        schema_instance1.splat_table, schema_instance1.mcl_table)
    sql_sentence2 = "SELECT p.p_gene_id, p.gene_no, p.go_no, p.is_correct, p.is_correct_l1, \
        p.is_correct_lca, p.avg_p_value, p.no_of_clusters, p.cluster_array, p.p_value_cut_off, p.recurrence_cut_off, \
        p.connectivity_cut_off, p.cluster_size_cut_off, p.unknown_cut_off, p.depth_cut_off, p.mcl_id, p.lca_list, \
        p.vertex_gradient, p.edge_gradient, m.vertex_set, s.edge_set, 'd_matrix', 'r' from %s p, %s s, %s m where \
        p.mcl_id=s.splat_id and p.mcl_id=m.mcl_id"%(schema_instance2.p_gene_table, \
        schema_instance2.splat_table, schema_instance2.mcl_table)
    writer.writerow(['p_gene_id', 'gene_no', 'go_no', 'is_correct_lca', 'p_value', 'recurrence', 'connectivity',\
        'cluster_size', 'unknown_ratio', 'mcl_id', 'lca_list', 'edge_gradient', 'score1', 'is_accepted1', 'score2', 'is_accepted2'])
    p_gene_id_list.sort()    #10-18-05
    for p_gene_id in p_gene_id_list:
        #sql_sentence1's prediction information is not going to be displayed
        curs.execute("%s and p.p_gene_id=%s"%(sql_sentence1, p_gene_id))
        rows = curs.fetchall()
        if rows:
            p_attr_instance1 = prediction_attributes(rows[0], type=3)
        else:
            p_attr_instance1 = None
        #sql_sentence2's prediction information is going to be displayed
        curs.execute("%s and p.p_gene_id=%s"%(sql_sentence2, p_gene_id))
        rows = curs.fetchall()
        if rows:
            p_attr_instance2 = prediction_attributes(rows[0], type=3)
            if pga_instance1 and p_attr_instance1:
                (is_accepted1, score1) = pga_instance1.prediction_accepted(p_attr_instance1.go_no, \
                    [-math.log(p_attr_instance1.p_value_cut_off), p_attr_instance1.recurrence_cut_off, \
                    p_attr_instance1.connectivity_cut_off, p_attr_instance1.cluster_size_cut_off, \
                    p_attr_instance1.edge_gradient])
            else:
                is_accepted1, score1 = None, None
            if pga_instance2:
                (is_accepted2, score2) = pga_instance2.prediction_accepted(p_attr_instance2.go_no, \
                    [-math.log(p_attr_instance2.p_value_cut_off), p_attr_instance2.recurrence_cut_off, \
                    p_attr_instance2.connectivity_cut_off, p_attr_instance2.cluster_size_cut_off, \
                    p_attr_instance2.edge_gradient])
            else:
                is_accepted2, score2 = None, None
            writer.writerow([p_attr_instance2.p_gene_id, p_attr_instance2.gene_no, p_attr_instance2.go_no, \
                p_attr_instance2.is_correct_lca, p_attr_instance2.avg_p_value, p_attr_instance2.recurrence_cut_off,\
                p_attr_instance2.connectivity_cut_off, p_attr_instance2.cluster_size_cut_off, p_attr_instance2.unknown_cut_off,\
                p_attr_instance2.mcl_id, p_attr_instance2.lca_list, p_attr_instance2.edge_gradient, score1, is_accepted1, \
                score2, is_accepted2])
            if not simple:
                #prepare vertex_set and edge_set to draw graphs
                vertex_set = p_attr_instance2.vertex_set[1:-1].split(',')
                vertex_set = map(int, vertex_set)
                edge_set = p_attr_instance2.edge_set[2:-2].split('},{')
                for i in range(len(edge_set)):
                    edge_set[i] = edge_set[i].split(',')
                    edge_set[i] = map(int, edge_set[i])
                #following copied from GuiAnalyzer.py
                subgraph = cluster_info_instance.graph_from_node_edge_set(vertex_set, edge_set)
                graphSrcFname = '/tmp/GuiAnalyzer.dot'
                graphFname = os.path.join(pic_output_dir, '%s_%s_%s_%s.png'%(p_attr_instance2.p_gene_id, \
                    p_attr_instance2.gene_no, p_attr_instance2.go_no, p_attr_instance2.mcl_id))
                graphSrcF = open(graphSrcFname, 'w')
                graphDotOutput(graphSrcF, subgraph, \
                    self.gene_no2gene_id, self.gene_no2go_no, \
                    centralnode=p_attr_instance2.gene_no, function=p_attr_instance2.go_no, weighted=0)
                graphSrcF.close()
                plot_type_command = 'neato -Goverlap=false'
                commandline = '%s -Tpng %s -o %s'%(plot_type_command, graphSrcFname, graphFname)
                system_call(commandline)
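#Illustrative sketch, not part of the original module: vertex_set and edge_set come out of
#the database as array-style strings shaped like '{1,2,3}' and '{{1,2},{2,3}}', which is
#what the slicing in output_p_gene_id_list() assumes.  The hypothetical helpers below
#restate that inline parsing on its own before the cluster graph is drawn.
def _parse_vertex_set(vertex_set_string):
    #'{1,2,3}' -> [1, 2, 3]
    return map(int, vertex_set_string[1:-1].split(','))

def _parse_edge_set(edge_set_string):
    #'{{1,2},{2,3}}' -> [[1, 2], [2, 3]]
    edge_list = edge_set_string[2:-2].split('},{')
    return [map(int, edge.split(',')) for edge in edge_list]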
def data_fetch(
    self,
    curs,
    schema_instance,
    filter_type,
    is_correct_type,
    no_of_total_genes,
    go_no2gene_no_set,
    need_cal_hg_p_value=0,
):
    """
    11-09-05
        1st get the data from p_gene_table and remove redundancy given filter_type
        2nd transform the data to lists
    11-10-05
        add a chunk of code to get hg p-value (leave one out) for the prediction
        mcl_id2vertex_list might blow the memory.(?)
    11-19-05
        separate predictions totally into known and unknown
    2006-10-30, add avg_degree(vertex_gradient) and unknown_cut_off
    """
    sys.stderr.write("Fetching data from old p_gene_table...\n")
    prediction_pair2instance = {}
    curs.execute(
        "DECLARE crs CURSOR FOR SELECT p.p_gene_id, p.gene_no, p.go_no, p.is_correct, p.is_correct_l1, \
        p.is_correct_lca, p.avg_p_value, p.no_of_clusters, p.cluster_array, p.p_value_cut_off, p.recurrence_cut_off, \
        p.connectivity_cut_off, p.cluster_size_cut_off, p.unknown_cut_off, p.depth_cut_off, p.mcl_id, p.lca_list, p.vertex_gradient,\
        p.edge_gradient from %s p" % (schema_instance.p_gene_table)
    )
    curs.execute("fetch 10000 from crs")
    rows = curs.fetchall()
    counter = 0
    real_counter = 0
    while rows:
        for row in rows:
            p_attr_instance = prediction_attributes(row, type=2)
            prediction_pair = (p_attr_instance.gene_no, p_attr_instance.go_no)
            if prediction_pair not in prediction_pair2instance:
                prediction_pair2instance[prediction_pair] = p_attr_instance
                real_counter += 1
            else:  # remove redundancy, keep the prediction with the larger comparison value
                if filter_type == 1:
                    new_cmp_value = p_attr_instance.recurrence_cut_off
                    old_cmp_value = prediction_pair2instance[prediction_pair].recurrence_cut_off
                elif filter_type == 2:
                    new_cmp_value = p_attr_instance.edge_gradient
                    old_cmp_value = prediction_pair2instance[prediction_pair].edge_gradient
                elif filter_type == 3:
                    new_cmp_value = p_attr_instance.recurrence_cut_off + p_attr_instance.edge_gradient
                    old_cmp_value = (
                        prediction_pair2instance[prediction_pair].recurrence_cut_off
                        + prediction_pair2instance[prediction_pair].edge_gradient
                    )
                if new_cmp_value > old_cmp_value:
                    prediction_pair2instance[prediction_pair] = p_attr_instance
            counter += 1
        if self.report:
            sys.stderr.write("%s%s/%s" % ("\x08" * 20, counter, real_counter))
        curs.execute("fetch 10000 from crs")
        rows = curs.fetchall()
    unknown_prediction_ls = []
    known_prediction_ls = []
    unknown_data = []  # 11-19-05
    known_data = []
    mcl_id2vertex_list = {}  # 11-10-05 cache of vertex lists, kept outside the loop so lookups are reused across predictions
    for prediction_pair, p_attr_instance in prediction_pair2instance.iteritems():
        if need_cal_hg_p_value:
            mcl_id = p_attr_instance.mcl_id
            if mcl_id not in mcl_id2vertex_list:
                mcl_id2vertex_list[mcl_id] = self.get_vertex_list(curs, schema_instance, mcl_id)
            p_attr_instance.p_value_cut_off = cal_hg_p_value(
                p_attr_instance.gene_no,
                p_attr_instance.go_no,
                mcl_id2vertex_list[mcl_id],
                no_of_total_genes,
                go_no2gene_no_set,
                r,  # `r` is assumed to be the module-level R interface required by cal_hg_p_value
            )
        is_correct = p_attr_instance.is_correct_dict[is_correct_type]
        # 2006-10-30, add avg_degree(vertex_gradient) and unknown_cut_off
        data_row = [
            p_attr_instance.p_value_cut_off,
            p_attr_instance.recurrence_cut_off,
            p_attr_instance.connectivity_cut_off,
            p_attr_instance.cluster_size_cut_off,
            p_attr_instance.edge_gradient,
            p_attr_instance.vertex_gradient,
            p_attr_instance.unknown_cut_off,
            p_attr_instance.gene_no,
            p_attr_instance.go_no,
            is_correct,
        ]
        if is_correct != -1:
            known_data.append(data_row)  # to do fitting
            known_prediction_ls.append(p_attr_instance)
        else:
            unknown_data.append(data_row)
            unknown_prediction_ls.append(p_attr_instance)
    sys.stderr.write("Done fetching data.\n")
    return unknown_prediction_ls, known_prediction_ls, unknown_data, known_data
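# Usage sketch (hypothetical, not part of the original module): with a live database cursor
# and schema instance, data_fetch() returns the de-duplicated predictions split by whether
# their correctness is known (is_correct != -1); the argument values below are illustrative.
#
#     unknown_prediction_ls, known_prediction_ls, unknown_data, known_data = \
#         instance.data_fetch(curs, schema_instance, filter_type=3, is_correct_type=2,
#             no_of_total_genes=no_of_total_genes, go_no2gene_no_set=go_no2gene_no_set)
#
# Each row of known_data/unknown_data is
#     [p_value, recurrence, connectivity, cluster_size, edge_gradient,
#      vertex_gradient(avg_degree), unknown_ratio, gene_no, go_no, is_correct]
# and known_data is what the downstream fitting step consumes.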