예제 #1
0
	def parse_stat_fname(self, stat_fname, filter_type):
		"""
		Parse a tab-delimited stat file into a dictionary of non-redundant predictions.
		
		Every row must hold exactly 11 columns, in this order:
			cluster_id, gene_no, go_no, go_no_depth, recurrence, gradient_score, edge_gradient,
			is_correct, is_correct_L1, is_correct_lca, lca_list_string
		Each row is turned into a prediction_attributes instance (type=2); the fields the
		file does not provide are filled with the placeholder values defined below.
		
		When several rows share one (gene_no, go_no) pair, only one is kept. filter_type picks
		the value that is compared, and a later row replaces the stored one only if its value
		is strictly larger:
			1: recurrence_cut_off
			2: edge_gradient
			3: recurrence_cut_off + edge_gradient
		
		Progress is written to stderr every 10000 rows when self.report is true.
		
		Returns a dictionary mapping (gene_no, go_no) to the prediction_attributes instance kept.
		Raises ValueError when a redundant pair shows up and filter_type is none of 1, 2, 3.
		"""
		sys.stderr.write("Parsing stat_fname: %s ...\n"%os.path.basename(stat_fname))
		prediction_pair2instance = {}
		
		#placeholders for the prediction_attributes fields that the stat file does not carry
		p_gene_id = -2	#-2 means no database submission for p_gene_id
		avg_p_value = -1
		no_of_clusters = 1
		p_value_cut_off = -1
		connectivity_cut_off = -1
		cluster_size_cut_off = -1
		unknown_cut_off = -1
		vertex_gradient = -1
		
		counter = 0	#number of rows read
		real_counter = 0	#number of distinct (gene_no, go_no) pairs
		#the context manager closes the file even when a row fails to parse
		with open(stat_fname) as inf:
			reader = csv.reader(inf, delimiter='\t')
			for row in reader:
				#gradient_score is part of the row layout but is not used below
				cluster_id, gene_no, go_no, go_no_depth, recurrence, gradient_score, edge_gradient, \
					is_correct, is_correct_L1, is_correct_lca, lca_list_string = row
				cluster_id = int(cluster_id)
				gene_no = int(gene_no)
				go_no = int(go_no)
				go_no_depth = int(go_no_depth)
				recurrence = float(recurrence)
				edge_gradient = float(edge_gradient)
				is_correct = int(is_correct)
				is_correct_L1 = int(is_correct_L1)
				is_correct_lca = int(is_correct_lca)
				cluster_array = '{%s}'%cluster_id
				new_row = [p_gene_id, gene_no, go_no, is_correct, is_correct_L1, \
					is_correct_lca, avg_p_value, no_of_clusters, cluster_array, p_value_cut_off, recurrence, \
					connectivity_cut_off, cluster_size_cut_off, unknown_cut_off, go_no_depth, cluster_id, \
					lca_list_string, vertex_gradient, edge_gradient]
				p_attr_instance = prediction_attributes(new_row, type=2)
				prediction_pair = (gene_no, go_no)
				if prediction_pair not in prediction_pair2instance:
					prediction_pair2instance[prediction_pair] = p_attr_instance
					real_counter += 1
				else:	#remove redundancy
					old_instance = prediction_pair2instance[prediction_pair]
					if filter_type==1:
						new_cmp_value = p_attr_instance.recurrence_cut_off
						old_cmp_value = old_instance.recurrence_cut_off
					elif filter_type==2:
						new_cmp_value = p_attr_instance.edge_gradient
						old_cmp_value = old_instance.edge_gradient
					elif filter_type==3:
						new_cmp_value = p_attr_instance.recurrence_cut_off + p_attr_instance.edge_gradient
						old_cmp_value = old_instance.recurrence_cut_off + old_instance.edge_gradient
					else:
						#without this branch the comparison below would hit unbound variables
						raise ValueError("Unsupported filter_type: %r (expected 1, 2 or 3)"%(filter_type,))
					if new_cmp_value>old_cmp_value:
						prediction_pair2instance[prediction_pair] = p_attr_instance
				counter += 1
				if self.report and counter%10000==0:
					sys.stderr.write("%s%s/%s"%('\x08'*20, counter, real_counter))
		if self.report:
			sys.stderr.write("%s%s/%s"%('\x08'*20, counter, real_counter))
		sys.stderr.write("Done.\n")
		return prediction_pair2instance
예제 #2
0
	def output_p_gene_id_list(self, curs, schema_instance1, schema_instance2, p_gene_id_list, writer, pic_output_dir,\
		pga_instance1, pga_instance2, cluster_info_instance, simple):
		"""
		Write one row per p_gene_id to writer and, unless simple is true, draw its cluster graph.
		
		For every id in p_gene_id_list (sorted in place first) the prediction is looked up in
		the tables of schema_instance1 and of schema_instance2. Only the prediction found via
		schema_instance2 is written out; the one from schema_instance1 is used solely to
		compute score1/is_accepted1. An id missing from schema_instance2 produces no output.
		
		Arguments:
			curs: database cursor offering execute() and fetchall()
			schema_instance1, schema_instance2: objects supplying p_gene_table, splat_table
				and mcl_table names
			p_gene_id_list: list of p_gene_ids; sorted in place
			writer: object with writerow(); a header row is written first
			pic_output_dir: directory receiving the .png graph pictures
			pga_instance1, pga_instance2: objects with prediction_accepted(), or a false
				value, in which case the matching score/is_accepted columns are None
			cluster_info_instance: supplies graph_from_node_edge_set()
			simple: when true, no graph pictures are produced
		
		10-15-05
			add score1 and is_accepted1
		10-17-05
			score and is_accepted depend on whether pga_instance is None or not
		10-17-05 add simple to allow no graph pictures output
			also get prediction from schema_instance1 and calculate the score if prediction is available
		10-18-05
			sort the p_gene_id_list first
		"""
		def accept(pga_instance, p_attr_instance):
			#ask pga_instance to judge one prediction; the result is unpacked as (is_accepted, score)
			return pga_instance.prediction_accepted(p_attr_instance.go_no, \
				[-math.log(p_attr_instance.p_value_cut_off), p_attr_instance.recurrence_cut_off, \
				p_attr_instance.connectivity_cut_off, p_attr_instance.cluster_size_cut_off, \
				p_attr_instance.edge_gradient])
		
		#10-15-05 following sentence slightly different from PredictionFilterByClusterSize.py in the trailing edge_gradient
			#and d_matrix is a placeholder
		#one template serves both schemas; only the three table names differ
		sql_template = "SELECT p.p_gene_id, p.gene_no, p.go_no, p.is_correct, p.is_correct_l1, \
			p.is_correct_lca, p.avg_p_value, p.no_of_clusters, p.cluster_array, p.p_value_cut_off, p.recurrence_cut_off, \
			p.connectivity_cut_off, p.cluster_size_cut_off, p.unknown_cut_off, p.depth_cut_off, p.mcl_id, p.lca_list, \
			p.vertex_gradient, p.edge_gradient, m.vertex_set, s.edge_set, 'd_matrix', 'r' from %s p, %s s, %s m where \
			p.mcl_id=s.splat_id and p.mcl_id=m.mcl_id"
		sql_sentence1 = sql_template%(schema_instance1.p_gene_table, \
			schema_instance1.splat_table, schema_instance1.mcl_table)
		sql_sentence2 = sql_template%(schema_instance2.p_gene_table, \
			schema_instance2.splat_table, schema_instance2.mcl_table)
		writer.writerow(['p_gene_id', 'gene_no', 'go_no', 'is_correct_lca', 'p_value', 'recurrence', 'connectivity',\
			'cluster_size', 'unknown_ratio', 'mcl_id', 'lca_list', 'edge_gradient', 'score1', 'is_accepted1', 'score2', 'is_accepted2'])
		p_gene_id_list.sort()
		for p_gene_id in p_gene_id_list:
			#sql_sentence1's prediction information is not going to be displayed
			curs.execute("%s and p.p_gene_id=%s"%(sql_sentence1, p_gene_id))
			rows = curs.fetchall()
			if rows:
				p_attr_instance1 = prediction_attributes(rows[0], type=3)
			else:
				p_attr_instance1 = None
			
			#sql_sentence2's prediction information is going to be displayed
			curs.execute("%s and p.p_gene_id=%s"%(sql_sentence2, p_gene_id))
			rows = curs.fetchall()
			if not rows:
				#nothing to display for this p_gene_id
				continue
			p_attr_instance2 = prediction_attributes(rows[0], type=3)
			if pga_instance1 and p_attr_instance1:
				(is_accepted1, score1) = accept(pga_instance1, p_attr_instance1)
			else:
				is_accepted1, score1 = None, None
			if pga_instance2:
				(is_accepted2, score2) = accept(pga_instance2, p_attr_instance2)
			else:
				is_accepted2, score2 = None, None
			writer.writerow([p_attr_instance2.p_gene_id, p_attr_instance2.gene_no, p_attr_instance2.go_no, \
				p_attr_instance2.is_correct_lca, p_attr_instance2.avg_p_value, p_attr_instance2.recurrence_cut_off,\
				p_attr_instance2.connectivity_cut_off, p_attr_instance2.cluster_size_cut_off, p_attr_instance2.unknown_cut_off,\
				p_attr_instance2.mcl_id, p_attr_instance2.lca_list, p_attr_instance2.edge_gradient, score1, is_accepted1, \
				score2, is_accepted2])
			if simple:
				continue
			#prepare vertex_set and edge_set to draw graphs
			#vertex_set text looks like '{1,2,3}', edge_set text like '{{1,2},{2,3}}'; strip the outer braces
			vertex_set = [int(vertex) for vertex in p_attr_instance2.vertex_set[1:-1].split(',')]
			edge_set = [[int(vertex) for vertex in edge.split(',')] \
				for edge in p_attr_instance2.edge_set[2:-2].split('},{')]
			
			#following copied from GuiAnalyzer.py
			subgraph = cluster_info_instance.graph_from_node_edge_set(vertex_set, edge_set)
			#NOTE(review): this fixed path is overwritten for every prediction
			graphSrcFname = '/tmp/GuiAnalyzer.dot'
			graphFname = os.path.join(pic_output_dir, '%s_%s_%s_%s.png'%(p_attr_instance2.p_gene_id, \
				p_attr_instance2.gene_no, p_attr_instance2.go_no, p_attr_instance2.mcl_id))
			#the context manager closes the dot file even if graphDotOutput raises
			with open(graphSrcFname, 'w') as graphSrcF:
				graphDotOutput(graphSrcF, subgraph, \
					self.gene_no2gene_id, self.gene_no2go_no, \
					centralnode=p_attr_instance2.gene_no, function=p_attr_instance2.go_no, weighted=0, )
			plot_type_command='neato -Goverlap=false'
			commandline = '%s -Tpng %s -o %s'%(plot_type_command, graphSrcFname, graphFname)
			system_call(commandline)
예제 #3
0
    def data_fetch(
        self,
        curs,
        schema_instance,
        filter_type,
        is_correct_type,
        no_of_total_genes,
        go_no2gene_no_set,
        need_cal_hg_p_value=0,
    ):
        """
		11-09-05
			1st get the data from p_gene_table and remove redundancy given filter_type
			2nd transform the data to three lists
		11-10-05 add a chunk of code to get hg p-value(leave one out) for the prediction
			mcl_id2vertex_list might blow the memory.(?)
		11-19-05
			separate predictions totally into known and unknown
		2006-10-30, add avg_degree(vertex_gradient) and unknown_cut_off
		"""
        sys.stderr.write("Fetching data from old p_gene_table...\n")
        prediction_pair2instance = {}
        curs.execute(
            "DECLARE crs CURSOR FOR SELECT p.p_gene_id, p.gene_no, p.go_no, p.is_correct, p.is_correct_l1, \
			p.is_correct_lca, p.avg_p_value, p.no_of_clusters, p.cluster_array, p.p_value_cut_off, p.recurrence_cut_off, \
			p.connectivity_cut_off, p.cluster_size_cut_off, p.unknown_cut_off, p.depth_cut_off, p.mcl_id, p.lca_list, p.vertex_gradient,\
			p.edge_gradient from %s p"
            % (schema_instance.p_gene_table)
        )
        curs.execute("fetch 10000 from crs")
        rows = curs.fetchall()
        counter = 0
        real_counter = 0
        while rows:
            for row in rows:
                p_attr_instance = prediction_attributes(row, type=2)
                prediction_pair = (p_attr_instance.gene_no, p_attr_instance.go_no)
                if prediction_pair not in prediction_pair2instance:
                    prediction_pair2instance[prediction_pair] = p_attr_instance
                    real_counter += 1
                else:  # remove redundancy
                    if filter_type == 1:
                        new_cmp_value = p_attr_instance.recurrence_cut_off
                        old_cmp_value = prediction_pair2instance[prediction_pair].recurrence_cut_off
                    elif filter_type == 2:
                        new_cmp_value = p_attr_instance.edge_gradient
                        old_cmp_value = prediction_pair2instance[prediction_pair].edge_gradient
                    elif filter_type == 3:
                        new_cmp_value = p_attr_instance.recurrence_cut_off + p_attr_instance.edge_gradient
                        old_cmp_value = (
                            prediction_pair2instance[prediction_pair].recurrence_cut_off
                            + prediction_pair2instance[prediction_pair].edge_gradient
                        )
                    if new_cmp_value > old_cmp_value:
                        prediction_pair2instance[prediction_pair] = p_attr_instance
                counter += 1
            if self.report:
                sys.stderr.write("%s%s/%s" % ("\x08" * 20, counter, real_counter))
            curs.execute("fetch 10000 from crs")
            rows = curs.fetchall()
        unknown_prediction_ls = []
        known_prediction_ls = []
        unknown_data = []  # 11-19-05
        known_data = []
        for prediction_pair, p_attr_instance in prediction_pair2instance.iteritems():
            # 11-10-05
            mcl_id2vertex_list = {}
            if need_cal_hg_p_value:
                mcl_id = p_attr_instance.mcl_id
                if mcl_id not in mcl_id2vertex_list:
                    mcl_id2vertex_list[mcl_id] = self.get_vertex_list(curs, schema_instance, mcl_id)
                p_attr_instance.p_value_cut_off = cal_hg_p_value(
                    p_attr_instance.gene_no,
                    p_attr_instance.go_no,
                    mcl_id2vertex_list[mcl_id],
                    no_of_total_genes,
                    go_no2gene_no_set,
                    r,
                )

            is_correct = p_attr_instance.is_correct_dict[is_correct_type]
            # 2006-10-30, add avg_degree(vertex_gradient) and unknown_cut_off
            data_row = [
                p_attr_instance.p_value_cut_off,
                p_attr_instance.recurrence_cut_off,
                p_attr_instance.connectivity_cut_off,
                p_attr_instance.cluster_size_cut_off,
                p_attr_instance.edge_gradient,
                p_attr_instance.vertex_gradient,
                p_attr_instance.unknown_cut_off,
                p_attr_instance.gene_no,
                p_attr_instance.go_no,
                is_correct,
            ]
            if is_correct != -1:
                known_data.append(data_row)  # to do fitting
                known_prediction_ls.append(p_attr_instance)
            else:
                unknown_data.append(data_row)
                unknown_prediction_ls.append(p_attr_instance)

        sys.stderr.write("Done fetching data.\n")
        return unknown_prediction_ls, known_prediction_ls, unknown_data, known_data