Ejemplo n.º 1
0
	def submit_predictions(self, curs, schema_instance, prediction_pair2instance, cluster_id2properties):
		"""
		Complete every prediction with cluster-derived attributes and submit it
		to the table named by schema_instance.p_gene_table.

		curs
			database cursor handed to every helper called here
		schema_instance
			object whose p_gene_table attribute names the target table
		prediction_pair2instance
			dictionary whose values are the prediction objects to submit
			(the keys are not used here)
		cluster_id2properties
			dictionary: mcl_id -> sequence where index 0 becomes
			connectivity_cut_off, index 1 becomes unknown_cut_off and
			index 2 is the cluster's vertex set

		Returns nothing. Progress and status messages go to sys.stderr.

		NOTE(review): `r` is not defined in this method; it has to exist in the
		enclosing module (presumably the R interface used by cal_hg_p_value,
		confirm against the file's imports).
		"""
		sys.stderr.write("Submitting predictions...\n")
		prediction_filter = MpiPredictionFilter()
		prediction_filter.createGeneTable(curs, schema_instance.p_gene_table)

		total_gene_count = get_no_of_total_genes(curs)
		go_no2gene_no_set = get_go_no2gene_no_set(curs)
		# 20 backspaces rewind the terminal so the progress figure overwrites itself
		rewind = '\x08'*20
		no_of_submitted = 0
		for _pair, prediction in prediction_pair2instance.iteritems():
			# fill the attributes that are still empty from the cluster's properties
			cluster_properties = cluster_id2properties[prediction.mcl_id]
			cluster_vertices = cluster_properties[2]
			hg_p_value = cal_hg_p_value(
				prediction.gene_no, prediction.go_no, cluster_vertices,
				total_gene_count, go_no2gene_no_set, r)
			prediction.p_value_cut_off = hg_p_value
			# avg_p_value is set to the very same value as p_value_cut_off
			prediction.avg_p_value = prediction.p_value_cut_off
			prediction.connectivity_cut_off = cluster_properties[0]
			prediction.cluster_size_cut_off = len(cluster_vertices)
			prediction.unknown_cut_off = cluster_properties[1]
			prediction_filter.submit_to_p_gene_table(curs, schema_instance.p_gene_table, prediction)
			no_of_submitted += 1
			if self.report:
				# only report every 2000 submissions
				if no_of_submitted % 2000 == 0:
					sys.stderr.write("%s%s"%(rewind, no_of_submitted))
		if self.report:
			# final figure, whatever the total turned out to be
			sys.stderr.write("%s%s"%(rewind, no_of_submitted))
		sys.stderr.write("Done.\n")
Ejemplo n.º 2
0
    def data_fetch(
        self,
        curs,
        schema_instance,
        filter_type,
        is_correct_type,
        no_of_total_genes,
        go_no2gene_no_set,
        need_cal_hg_p_value=0,
    ):
        """
		11-09-05
			1st get the data from p_gene_table and remove redundancy given filter_type
			2nd transform the data to three lists
		11-10-05 add a chunk of code to get hg p-value(leave one out) for the prediction
			mcl_id2vertex_list might blow the memory.(?)
		11-19-05
			separate predictions totally into known and unknown
		2006-10-30, add avg_degree(vertex_gradient) and unknown_cut_off
		"""
        sys.stderr.write("Fetching data from old p_gene_table...\n")
        prediction_pair2instance = {}
        curs.execute(
            "DECLARE crs CURSOR FOR SELECT p.p_gene_id, p.gene_no, p.go_no, p.is_correct, p.is_correct_l1, \
			p.is_correct_lca, p.avg_p_value, p.no_of_clusters, p.cluster_array, p.p_value_cut_off, p.recurrence_cut_off, \
			p.connectivity_cut_off, p.cluster_size_cut_off, p.unknown_cut_off, p.depth_cut_off, p.mcl_id, p.lca_list, p.vertex_gradient,\
			p.edge_gradient from %s p"
            % (schema_instance.p_gene_table)
        )
        curs.execute("fetch 10000 from crs")
        rows = curs.fetchall()
        counter = 0
        real_counter = 0
        while rows:
            for row in rows:
                p_attr_instance = prediction_attributes(row, type=2)
                prediction_pair = (p_attr_instance.gene_no, p_attr_instance.go_no)
                if prediction_pair not in prediction_pair2instance:
                    prediction_pair2instance[prediction_pair] = p_attr_instance
                    real_counter += 1
                else:  # remove redundancy
                    if filter_type == 1:
                        new_cmp_value = p_attr_instance.recurrence_cut_off
                        old_cmp_value = prediction_pair2instance[prediction_pair].recurrence_cut_off
                    elif filter_type == 2:
                        new_cmp_value = p_attr_instance.edge_gradient
                        old_cmp_value = prediction_pair2instance[prediction_pair].edge_gradient
                    elif filter_type == 3:
                        new_cmp_value = p_attr_instance.recurrence_cut_off + p_attr_instance.edge_gradient
                        old_cmp_value = (
                            prediction_pair2instance[prediction_pair].recurrence_cut_off
                            + prediction_pair2instance[prediction_pair].edge_gradient
                        )
                    if new_cmp_value > old_cmp_value:
                        prediction_pair2instance[prediction_pair] = p_attr_instance
                counter += 1
            if self.report:
                sys.stderr.write("%s%s/%s" % ("\x08" * 20, counter, real_counter))
            curs.execute("fetch 10000 from crs")
            rows = curs.fetchall()
        unknown_prediction_ls = []
        known_prediction_ls = []
        unknown_data = []  # 11-19-05
        known_data = []
        for prediction_pair, p_attr_instance in prediction_pair2instance.iteritems():
            # 11-10-05
            mcl_id2vertex_list = {}
            if need_cal_hg_p_value:
                mcl_id = p_attr_instance.mcl_id
                if mcl_id not in mcl_id2vertex_list:
                    mcl_id2vertex_list[mcl_id] = self.get_vertex_list(curs, schema_instance, mcl_id)
                p_attr_instance.p_value_cut_off = cal_hg_p_value(
                    p_attr_instance.gene_no,
                    p_attr_instance.go_no,
                    mcl_id2vertex_list[mcl_id],
                    no_of_total_genes,
                    go_no2gene_no_set,
                    r,
                )

            is_correct = p_attr_instance.is_correct_dict[is_correct_type]
            # 2006-10-30, add avg_degree(vertex_gradient) and unknown_cut_off
            data_row = [
                p_attr_instance.p_value_cut_off,
                p_attr_instance.recurrence_cut_off,
                p_attr_instance.connectivity_cut_off,
                p_attr_instance.cluster_size_cut_off,
                p_attr_instance.edge_gradient,
                p_attr_instance.vertex_gradient,
                p_attr_instance.unknown_cut_off,
                p_attr_instance.gene_no,
                p_attr_instance.go_no,
                is_correct,
            ]
            if is_correct != -1:
                known_data.append(data_row)  # to do fitting
                known_prediction_ls.append(p_attr_instance)
            else:
                unknown_data.append(data_row)
                unknown_prediction_ls.append(p_attr_instance)

        sys.stderr.write("Done fetching data.\n")
        return unknown_prediction_ls, known_prediction_ls, unknown_data, known_data