def run(self):
		"""
		Print a candidate-gene enrichment p-value for each result in self.result_id_ls.

		Per result id:
			1. fetch the candidate gene set for self.list_type_id;
			2. fetch the result's peaks through self.getResultPeak();
			3. translate the peaks into cumulative positions and back into
				chromosome positions;
			4. run self.prepareDataForPermutationRankTest() and use the number of
				captured candidate genes as the candidate sample size;
			5. run self.get_enrichment_pvalue_by_gw_looping() and write
				"<result_id> pvalue: <pvalue>." to stderr.

		Returns None. When self.debug is set the method first stops in pdb;
		when self.commit is set the stock_250k session is flushed at the end.

		Design note carried over from 2010-8-15:
			create a RBDict of gene-forms [chr, start, stop] with min_overlap_ratio=1.
				value is a sub-RBDict of the gene structure (UTR, non-UTR-exon, intron)
			given any CNV, use RBDict.findNodes() to find all gene-forms.
				WATCH: use an alternative comparison function.
		"""
		if self.debug:
			import pdb
			pdb.set_trace()

		# Both databases share driver, credentials and host; only the database name differs.
		connection_args = dict(
			drivername=self.drivername,
			username=self.db_user,
			password=self.db_passwd,
			hostname=self.hostname,
		)

		genome_db = GenomeDB.GenomeDatabase(database=self.genome_dbname, **connection_args)
		genome_db.setup(create_tables=False)

		stock_db = Stock_250kDB.Stock_250kDB(database=self.dbname, **connection_args)
		stock_db.setup(create_tables=False)

		genome_data = genome_db.getOneGenomeData(tax_id=self.tax_id, chr_gap=0)
		cumu_span_to_chr = genome_data.cumuSpan2ChrRBDict
		genome_rbdict = genome_db.dealWithGenomeRBDict(
			self.genomeRBDictPickleFname,
			tax_id=self.tax_id,
			max_distance=self.max_distance,
			debug=self.debug,
		)

		# Parameter bundle handed to the helper methods below.
		run_settings = PassingData(
			min_MAF=self.min_MAF,
			starting_rank=0,
			need_chr_pos_ls=0,
			need_candidate_association=False,
			min_big_overlap=self.min_big_overlap,
			no_of_permutations=self.no_of_permutations,
			no_of_min_breaks=self.no_of_min_breaks,
		)

		for result_id in self.result_id_ls:
			# Looked up on every iteration; the original comment says the call caches internally.
			candidate_genes = stock_db.dealWithCandidateGeneList(self.list_type_id, return_set=True)
			run_settings.candidate_gene_set = candidate_genes

			# Loci come from the stored result peaks of type self.result_peak_type_id.
			peaks = self.getResultPeak(result_id, self.result_peak_type_id, run_settings)

			peaks_in_cumu_pos = self.translateChrPosDataObjectIntoCumuPos(
				peaks, genome_data.chr_id2cumu_start)
			peaks_in_chr_pos = self.translateCumuPosIntoChrPos(peaks_in_cumu_pos, cumu_span_to_chr)
			perm_data = self.prepareDataForPermutationRankTest(
				peaks_in_chr_pos, genome_rbdict, run_settings, report=True)

			n_candidate = len(perm_data.captured_candidate_gene_set)
			# Read but not used further in this method.
			n_non_candidate = len(perm_data.non_candidate_gene_snp_rank_ls)

			enrichment = self.get_enrichment_pvalue_by_gw_looping(
				n_candidate,
				peaks_in_cumu_pos,
				candidate_genes,
				genome_rbdict,
				cumuSpan2ChrRBDict=cumu_span_to_chr,
				no_of_permutations=run_settings.no_of_permutations,
				no_of_min_breaks=run_settings.no_of_min_breaks,
				param_data=run_settings,
			)
			pvalue = enrichment.pvalue
			# The two counters below are fetched but only the p-value is reported.
			tests_run = enrichment.no_of_tests
			tests_passed = enrichment.no_of_tests_passed
			sys.stderr.write("%s pvalue: %s.\n"%(result_id, pvalue))

		if self.commit:
			stock_db.session.flush()
Beispiel #2
0
    def run(self):
        """
        Print a candidate-gene enrichment p-value for each CNV result in self.result_id_ls.

        Per result id:
            1. load the ResultsMethod row; skip it (message on stderr) when it
               has no cnv_method_id;
            2. make sure the cnv.id -> chr_pos map on the stock_250k database
               object and the background-loci translation structures exist;
            3. fetch the candidate gene set for self.list_type_id;
            4. load the result content and ask it for its top loci, passing
               self.no_of_top_loci and self.min_score;
            5. translate the loci into cumulative positions and back into
               chromosome positions;
            6. run self.prepareDataForPermutationRankTest() and use the number
               of captured candidate genes as the candidate sample size;
            7. run self.get_enrichment_pvalue_by_gw_looping() and write
               "<result_id> pvalue: <pvalue>." to stderr.

        Returns None. When self.debug is set the method first stops in pdb;
        when self.commit is set the stock_250k session is flushed at the end.
        Calls sys.exit(3) when the translation structure's
        chrSpan2cumuStartRBDict is empty.

        Design note carried over from 2010-8-15:
            create a RBDict of gene-forms [chr, start, stop] with min_overlap_ratio=1.
                value is a sub-RBDict of the gene structure (UTR, non-UTR-exon, intron)
            given any CNV, use RBDict.findNodes() to find all gene-forms.
                WATCH: use an alternative comparison function.
        """
        if self.debug:
            import pdb
            pdb.set_trace()

        genome_db = GenomeDB.GenomeDatabase(
            drivername=self.drivername,
            username=self.db_user,
            password=self.db_passwd,
            hostname=self.hostname,
            database=self.genome_dbname,
        )
        genome_db.setup(create_tables=False)

        db_250k = Stock_250kDB.Stock_250kDB(
            drivername=self.drivername,
            username=self.db_user,
            password=self.db_passwd,
            hostname=self.hostname,
            database=self.dbname,
        )
        db_250k.setup(create_tables=False)

        # The return value is not used in this method; the call is kept in case
        # it has side effects on genome_db. NOTE(review): confirm and drop if not.
        genome_db.getOneGenomeData(tax_id=self.tax_id, chr_gap=0)
        genomeRBDict = genome_db.dealWithGenomeRBDict(
            self.genomeRBDictPickleFname,
            tax_id=self.tax_id,
            max_distance=self.max_distance,
            debug=self.debug,
        )

        # Parameter bundle handed to the helper methods below.
        pd = PassingData(
            min_MAF=self.min_MAF,
            min_score=self.min_score,
            results_directory=self.results_directory,
            no_of_top_loci=self.no_of_top_loci,
            starting_rank=0,
            need_chr_pos_ls=0,
            need_candidate_association=False,
            min_big_overlap=self.min_big_overlap,
            no_of_permutations=self.no_of_permutations,
            no_of_min_breaks=self.no_of_min_breaks,
        )

        # NOTE(review): the original comment here read "any overlap is an
        # overlap", yet min_overlap_len is 100 -- confirm the intended threshold.
        compareIns = CNVCompareByOverlapLen(min_overlap_len=100)

        translationData = None
        for result_id in self.result_id_ls:
            rm = Stock_250kDB.ResultsMethod.get(result_id)
            if not rm.cnv_method_id:
                sys.stderr.write(
                    "ResultsMethod %s doesn't have cnv_method_id. Skip.\n" %
                    (result_id))
                continue

            # Establish the map from cnv.id to chr_pos if it is not there yet.
            map_rebuilt = not db_250k._cnv_id2chr_pos
            if map_rebuilt:
                db_250k.cnv_id2chr_pos = rm.cnv_method_id

            # Build the translation structures whenever the map was (re)built,
            # and also when they are still missing although the map already
            # exists; previously the latter case left translationData as None
            # and failed further down with an AttributeError.
            # NOTE(review): map and translation data come from the first usable
            # result's cnv_method_id and are reused for later results even if
            # their cnv_method_id differs -- confirm that is intended.
            if map_rebuilt or translationData is None:
                translationData = self.getTranslationDataStructureForBackgroundLoci(
                    db_250k,
                    cnv_method_id=rm.cnv_method_id,
                    min_MAF=self.min_MAF)
                if not translationData.chrSpan2cumuStartRBDict:
                    sys.stderr.write(
                        "Error: translationData.chrSpan2cumuStartRBDict is empty for cnv method %s. exit.\n"
                        % (rm.cnv_method_id))
                    sys.exit(3)
            pd.db_id2chr_pos = db_250k.cnv_id2chr_pos

            # Looked up on every iteration; the original comment says the call caches internally.
            candidate_gene_set = db_250k.dealWithCandidateGeneList(
                self.list_type_id, return_set=True)
            pd.candidate_gene_set = candidate_gene_set

            gwr = db_250k.getResultMethodContent(
                result_id, pdata=pd, min_value_cutoff=self.min_score)

            top_loci = gwr.getTopLoci(no_of_top_loci=self.no_of_top_loci,
                                      min_score=self.min_score)
            top_loci_in_cumu_pos = self.translateChrPosDataObjectIntoCumuPos(
                top_loci, translationData.chrSpan2cumuStartRBDict)
            top_loci_in_chr_pos = self.translateCumuPosIntoChrPos(
                top_loci_in_cumu_pos,
                translationData.cumuSpan2ChrSpanRBDict,
                compareIns=compareIns)
            permData = self.prepareDataForPermutationRankTest(
                top_loci_in_chr_pos, genomeRBDict, pd, report=True)

            candidate_sample_size = len(permData.captured_candidate_gene_set)

            return_data = self.get_enrichment_pvalue_by_gw_looping(
                candidate_sample_size,
                top_loci_in_cumu_pos,
                candidate_gene_set,
                genomeRBDict,
                cumuSpan2ChrSpanRBDict=translationData.cumuSpan2ChrSpanRBDict,
                no_of_permutations=pd.no_of_permutations,
                no_of_min_breaks=pd.no_of_min_breaks,
                param_data=pd,
                compareIns=compareIns)
            sys.stderr.write("%s pvalue: %s.\n" % (result_id, return_data.pvalue))

        if self.commit:
            db_250k.session.flush()