def fetch(self, **kwargs):
    ''' Fetch intervals in a given region.

    Keyword arguments:
        chrom: chromosome name (required).
        start: forwarded to wWigIO.getIntervals; defaults to 0.
        stop:  forwarded to wWigIO.getIntervals; defaults to 0.

    Returns whatever wWigIO.getIntervals returns for self.fname and the
    requested region.

    Raises ValueError when 'chrom' is missing, or when getIntervals hands
    back a string (treated here as its failure indicator).
    '''
    if 'chrom' not in kwargs:
        raise ValueError("Chromosome not provided.")

    wigs = wWigIO.getIntervals(self.fname,
                               kwargs['chrom'],
                               kwargs.get('start', 0),
                               kwargs.get('stop', 0))

    # `basestring` only exists on Python 2; fall back to the Python 3
    # text/bytes types so the bad-value check works on both.
    try:
        string_types = basestring
    except NameError:
        string_types = (str, bytes)

    if isinstance(wigs, string_types):  # bad value
        raise ValueError("Couldn't get intervals.")
    return wigs
def fetch(self, **kwargs):
    ''' Fetch intervals in a given region.

    Keyword arguments:
        chrom: chromosome name (required).
        start: forwarded to wWigIO.getIntervals; defaults to 0.
        stop:  forwarded to wWigIO.getIntervals; defaults to 0.

    Returns whatever wWigIO.getIntervals returns for self.fname and the
    requested region.

    Raises ValueError when 'chrom' is missing, or when getIntervals hands
    back a string (treated here as its failure indicator).
    '''
    if 'chrom' not in kwargs:
        raise ValueError("Chromosome not provided.")

    chrom = kwargs['chrom']
    start = kwargs.get('start', 0)
    stop = kwargs.get('stop', 0)
    wigs = wWigIO.getIntervals(self.fname, chrom, start, stop)

    # `basestring` only exists on Python 2; fall back to the Python 3
    # text/bytes types so the bad-value check works on both.
    try:
        string_types = basestring
    except NameError:
        string_types = (str, bytes)

    if isinstance(wigs, string_types):  # bad value
        raise ValueError("Couldn't get intervals.")
    return wigs
# ------------------------------------
# Misc functions
# ------------------------------------

# ------------------------------------
# Classes
# ------------------------------------

# ------------------------------------
# Main
# ------------------------------------

if __name__ == "__main__":
    # Get information from the bigwig file. The handle is released even if
    # one of the queries raises.
    wWigIO.open('test.bw')
    try:
        chroms = wWigIO.getChromSize('test.bw')
        wigs = wWigIO.getIntervals('test.bw', 'chr1', 10, 200)
    finally:
        wWigIO.close('test.bw')
    # Single-argument print(...) behaves the same on Python 2 and 3.
    print(wigs)

    # bigwig -> wig
    wWigIO.bigWigToWig('test.bw', 'test.wig')

    # Write the chrom sizes into test.sizes, one "<chrom>\t<size>" per line.
    with open('test.sizes', 'w') as fh:
        for chrom in chroms:
            fh.write(chrom + "\t" + str(chroms[chrom]) + "\n")

    # wig -> bigwig
    wWigIO.wigToBigWig('test.wig', 'test.sizes', 'test2.bw')
def _pwm_max_score(pwm, length):
    """Return the sum, over the first ``length`` columns, of the best base score."""
    return sum(
        max(pwm['A'][index], pwm['C'][index], pwm['T'][index], pwm['G'][index])
        for index in range(length))


def _mean_interval_value(intervals):
    """Return the mean of element ``[2]`` of each interval, or 0 if there are none."""
    count = len(intervals)
    if count == 0:
        return 0
    return sum(interval[2] for interval in intervals) / count


def main(sim_fastq, U2_GTAG_5_file, U2_GTAG_3_file, phylop_vertebrates,
         phylop_primates, exon_scores):
    """Score candidate exons from ``sim_fastq`` against reference exon scores.

    Args:
        sim_fastq: tab-delimited file. Rows whose first field starts with
            "@" are parsed as ``<SJ>_<ME_seq>_<estart>_<eend>_<coverage>_<n>``;
            all other rows are ignored.
        U2_GTAG_5_file: file handed to ``PWM_to_dict``; 13 columns are used.
        U2_GTAG_3_file: file handed to ``PWM_to_dict``; 17 columns are used.
        phylop_vertebrates: bigWig path opened and queried through ``wWigIO``.
        phylop_primates: bigWig path opened and queried through ``wWigIO``.
        exon_scores: space-delimited file with 7 fields per row; fields 5-7
            (U2 score, vertebrate mean, primate mean) form the reference
            distributions.

    Prints one space-separated line per unique candidate: chrom, estart,
    eend, strand, U2 score, U2 percentile, vertebrate mean, vertebrate
    percentile, primate mean, primate percentile, P_ME, overall score.
    """
    wWigIO.open(phylop_vertebrates)
    wWigIO.open(phylop_primates)

    U2_GTAG_5 = PWM_to_dict(U2_GTAG_5_file)
    U2_GTAG_3 = PWM_to_dict(U2_GTAG_3_file)
    TOTAL_U2_max_score = (_pwm_max_score(U2_GTAG_5, 13) +
                          _pwm_max_score(U2_GTAG_3, 17))

    # Reference distributions used for the percentile look-ups below.
    gencode_U2_scores = []
    gencode_mean_conservation_vertebrates = []
    gencode_mean_conservation_primates = []
    with open(exon_scores) as handle:
        for row in csv.reader(handle, delimiter=' '):
            (_chrom, _estart, _eend, _strand, ref_U2_score,
             ref_mean_vertebrates, ref_mean_primates) = row
            gencode_U2_scores.append(float(ref_U2_score))
            gencode_mean_conservation_vertebrates.append(
                float(ref_mean_vertebrates))
            gencode_mean_conservation_primates.append(
                float(ref_mean_primates))

    # Collect unique candidates; the set removes repeated headers.
    MEs = set()
    with open(sim_fastq) as handle:
        for row in csv.reader(handle, delimiter='\t'):
            # Blank rows / empty first fields are skipped instead of raising.
            # NOTE(review): any row starting with "@" is treated as a header,
            # including a quality line that happens to start with "@".
            if not row or not row[0].startswith("@"):
                continue
            SJ, ME_seq, estart, eend, _total_coverage, _n = row[0].split("_")
            len_ME = len(ME_seq)
            SJ = SJ[1:]  # drop the leading "@"
            SJ_chr, SJ_istart, SJ_iend = re.findall(r"[\w']+", SJ)
            SJ_len = int(SJ_iend) - int(SJ_istart)
            Kmer = SJ_len - (len_ME + 1)
            # NOTE(review): 4**len_ME + 4 parses as (4**len_ME) + 4; confirm
            # that 4**(len_ME + 4) was not intended.
            P_ME = 1 - (1 - (float(1) / float(4**len_ME + 4)))**Kmer
            strand = "-" if "-" in SJ else "+"
            MEs.add((SJ_chr, strand, int(estart), int(eend), P_ME))

    for chrom, strand, estart, eend, P_ME in MEs:
        estart, eend = sorted([estart, eend])

        # E5 is the 17-base window scored with the 3' PWM, E3 the 13-base
        # window scored with the 5' PWM; minus-strand windows are
        # reverse-complemented.
        if strand == "-":
            E5 = str(Genome[chrom][eend - 3:eend + 14]
                     .reverse_complement()).upper()
            E3 = str(Genome[chrom][estart - 10:estart + 3]
                     .reverse_complement()).upper()
        else:
            E5 = str(Genome[chrom][estart - 14:estart + 3]).upper()
            E3 = str(Genome[chrom][eend - 3:eend + 10]).upper()

        # "N" bases add nothing but still occupy their PWM column.
        U2_score = 0
        for pos, base in enumerate(E5):
            if base != "N":
                U2_score += U2_GTAG_3[base][pos]
        for pos, base in enumerate(E3):
            if base != "N":
                U2_score += U2_GTAG_5[base][pos]
        U2_score = percent(U2_score, TOTAL_U2_max_score)

        mean_conservation_vertebrates = _mean_interval_value(
            wWigIO.getIntervals(phylop_vertebrates, chrom,
                                estart - 2, eend + 2))
        mean_conservation_primates = _mean_interval_value(
            wWigIO.getIntervals(phylop_primates, chrom,
                                estart - 2, eend + 2))

        ME_percentil_U2_score = stats.percentileofscore(
            gencode_U2_scores, U2_score)
        # Each mean is ranked against its own reference distribution (the
        # two look-ups were previously crossed).
        ME_percentil_mean_conservation_vertebrates = stats.percentileofscore(
            gencode_mean_conservation_vertebrates,
            mean_conservation_vertebrates)
        ME_percentil_mean_conservation_primates = stats.percentileofscore(
            gencode_mean_conservation_primates,
            mean_conservation_primates)

        # The larger of the two conservation percentiles drives the score.
        best_conservation_percentile = max(
            ME_percentil_mean_conservation_vertebrates,
            ME_percentil_mean_conservation_primates)
        overall_score = (P_ME *
                         (1 - ME_percentil_U2_score / 100.0) *
                         (1 - best_conservation_percentile / 100.0))

        fields = (chrom, estart, eend, strand, U2_score,
                  ME_percentil_U2_score,
                  mean_conservation_vertebrates,
                  ME_percentil_mean_conservation_vertebrates,
                  mean_conservation_primates,
                  ME_percentil_mean_conservation_primates,
                  P_ME, overall_score)
        # Space-joined str() matches the Python 2 print statement's output.
        print(" ".join(str(value) for value in fields))
def _pwm_max_score(pwm, length):
    """Return the sum, over the first ``length`` columns, of the best base score."""
    return sum(
        max(pwm['A'][index], pwm['C'][index], pwm['T'][index], pwm['G'][index])
        for index in range(length))


def _mean_interval_value(intervals):
    """Return the mean of element ``[2]`` of each interval, or 0 if there are none."""
    count = len(intervals)
    if count == 0:
        return 0
    return sum(interval[2] for interval in intervals) / count


def main(gencode_bed, U2_GTAG_5_file, U2_GTAG_3_file, phylop_vertebrates,
         phylop_primates):
    """Score internal exons of a BED12-style file and print their conservation.

    Args:
        gencode_bed: tab-delimited file; columns 0, 1, 5, 10 and 11 are read
            as chromosome, start, strand, block sizes and block starts.
        U2_GTAG_5_file: file handed to ``PWM_to_dict``; 13 columns are used.
        U2_GTAG_3_file: file handed to ``PWM_to_dict``; 17 columns are used.
        phylop_vertebrates: bigWig path opened and queried through ``wWigIO``.
        phylop_primates: bigWig path opened and queried through ``wWigIO``.

    Only exons flanked by "AG" ... "GT" in the extracted windows are kept.
    Prints one space-separated line per unique exon: chrom, estart, eend,
    strand, U2 score, vertebrate mean, primate mean.
    """
    wWigIO.open(phylop_vertebrates)
    wWigIO.open(phylop_primates)

    U2_GTAG_5 = PWM_to_dict(U2_GTAG_5_file)
    U2_GTAG_3 = PWM_to_dict(U2_GTAG_3_file)
    TOTAL_U2_max_score = (_pwm_max_score(U2_GTAG_5, 13) +
                          _pwm_max_score(U2_GTAG_3, 17))

    # Raise the limit before the reader parses anything, so it also covers
    # the first row.
    csv.field_size_limit(1000000000)

    exons = set()
    with open(gencode_bed) as handle:
        for row in csv.reader(handle, delimiter='\t'):
            # Lists (not map objects) so the slices below work on Python 3.
            qstarts = [int(v) for v in row[11].strip(",").split(",")]
            blocksizes = [int(v) for v in row[10].strip(",").split(",")]
            start = int(row[1])
            strand = row[5]
            chrom = row[0]

            # [1:-1] drops the first and last block of every record.
            for q1, b in zip(qstarts[1:-1], blocksizes[1:-1]):
                estart = start + q1
                eend = estart + b

                if strand == "-":
                    E5 = str(Genome[chrom][eend - 3:eend + 14]
                             .reverse_complement()).upper()
                    E3 = str(Genome[chrom][estart - 10:estart + 3]
                             .reverse_complement()).upper()
                else:
                    E5 = str(Genome[chrom][estart - 14:estart + 3]).upper()
                    E3 = str(Genome[chrom][eend - 3:eend + 10]).upper()

                # Non-canonical exons are never stored, so skip them before
                # doing any scoring work.
                if E5[-5:-3] != "AG" or E3[3:5] != "GT":
                    continue

                # "N" bases add nothing but still occupy their PWM column
                # (without this guard an "N" raised KeyError).
                U2_score = 0
                for pos, base in enumerate(E5):
                    if base != "N":
                        U2_score += U2_GTAG_3[base][pos]
                for pos, base in enumerate(E3):
                    if base != "N":
                        U2_score += U2_GTAG_5[base][pos]
                U2_score = percent(U2_score, TOTAL_U2_max_score)

                exons.add((chrom, estart, eend, strand, U2_score))

    for chrom, estart, eend, strand, U2_score in exons:
        mean_conservation_vertebrates = _mean_interval_value(
            wWigIO.getIntervals(phylop_vertebrates, chrom,
                                estart - 2, eend + 2))
        mean_conservation_primates = _mean_interval_value(
            wWigIO.getIntervals(phylop_primates, chrom,
                                estart - 2, eend + 2))

        fields = (chrom, estart, eend, strand, U2_score,
                  mean_conservation_vertebrates, mean_conservation_primates)
        # Space-joined str() matches the Python 2 print statement's output.
        print(" ".join(str(value) for value in fields))
def __NDR_bin_detect(self):
    """Inspect all covered bins.

    For every bin index in ``self.l_umt_met`` (ascending) ``self.__bin_info``
    is called first. Bins whose index is below ``self.bin_idx_min`` are then
    tested, reported on stdout as one tab-separated line, and removed from
    ``self.l_umt_met`` and ``self.l_rat_umt_met``.

    Reported statistics are -log10 p-values:
      * chi-square of the bin's two counts against ``self.np_RATIO``;
      * the same test on counts collected from ``self.tb_file`` within
        +/- 10 * ``self.step_len`` of the bin centre;
      * unequal-variance t-tests of the bin's ratios against the extended
        window and against the two flanking neighbourhoods.
    A p-value is reported with a negative sign when the bin's second count
    (or mean ratio) is below the reference it is compared with.
    """
    for idx in sorted(self.l_umt_met):
        self.__bin_info(idx)

        # Only bins that may be significant and that will receive no
        # further reads are evaluated.
        if idx >= self.bin_idx_min:
            continue

        np_obs = self.l_umt_met[idx]
        np_exp = self.np_RATIO * np.sum(np_obs)
        chisquare = np.sum((np_obs - np_exp)**2 / np_exp)
        pval = -1 * np.log10(1 - scipy.stats.chi2.cdf(chisquare, 1))
        if np_obs[1] < np_exp[1]:
            pval = -1 * pval

        # presumably bin_begin / bin_endin are set by __bin_info -- confirm
        pos_center = int((self.bin_begin + self.bin_endin) / 2)
        bin_up = pos_center - self.step_len * 10
        bin_down = pos_center + self.step_len * 10

        np_val_ext = np.zeros(2)
        l_rat_umt_met_ext = []
        l_rat_umt_met_sur1 = []
        l_rat_umt_met_sur2 = []
        try:
            record = self.tb_file.query(self.chrom, bin_up, bin_down)
            for rec in record:
                pos = (int(rec[1]) + int(rec[2])) / 2
                count_a = int(rec[3])
                count_b = int(rec[4])
                total = count_a + count_b
                if total < self.depth:
                    continue
                np_val_ext[0] += count_a
                np_val_ext[1] += count_b
                # float() keeps this a true fraction even under Python 2
                # integer-division rules.
                ratio = count_b / float(total)
                l_rat_umt_met_ext.append(ratio)
                if abs(pos - (self.bin_begin - 40)) < 40:
                    l_rat_umt_met_sur1.append(ratio)
                if abs(pos - (self.bin_endin + 40)) < 40:
                    l_rat_umt_met_sur2.append(ratio)
        except Exception:
            # Best effort: a failed query leaves whatever was collected so
            # far. Unlike a bare ``except``, this lets KeyboardInterrupt and
            # SystemExit propagate.
            pass

        np_obs_ext = np_val_ext
        np_exp_ext = self.np_RATIO * np.sum(np_obs_ext)

        val_reg = np.array(self.l_rat_umt_met[idx]).mean()
        val_ext = np.array(l_rat_umt_met_ext).mean()
        val_sur1 = np.array(l_rat_umt_met_sur1).mean()
        val_sur2 = np.array(l_rat_umt_met_sur2).mean()

        chisquare_ext = np.sum((np_obs_ext - np_exp_ext)**2 / np_exp_ext)
        pval_ext = -1 * np.log10(
            1 - scipy.stats.chi2.cdf(chisquare_ext, 1))
        if np_obs_ext[1] < np_exp_ext[1]:
            pval_ext = -1 * pval_ext

        pval_ttest = -1 * np.log10(
            scipy.stats.ttest_ind(self.l_rat_umt_met[idx],
                                  l_rat_umt_met_ext,
                                  equal_var=False)[1])
        if val_reg < val_ext:
            pval_ttest = -1 * pval_ttest

        pval_ttest2l = -1 * np.log10(
            scipy.stats.ttest_ind(self.l_rat_umt_met[idx],
                                  l_rat_umt_met_sur1,
                                  equal_var=False)[1])
        if val_reg < val_sur1:
            pval_ttest2l = -1 * pval_ttest2l

        pval_ttest2r = -1 * np.log10(
            scipy.stats.ttest_ind(self.l_rat_umt_met[idx],
                                  l_rat_umt_met_sur2,
                                  equal_var=False)[1])
        if val_reg < val_sur2:
            pval_ttest2r = -1 * pval_ttest2r

        # Mean bigWig value across the bin; ``in_bw`` is a name defined
        # outside this method.
        np_bw = np.array([
            f[2] for f in wWigIO.getIntervals(
                in_bw, self.chrom, self.bin_begin, self.bin_endin)
        ])
        mean_bw = np_bw.mean()

        # Single-argument print(...) behaves the same on Python 2 and 3.
        print("%s\t%d\t%d\t%1.2f\t%1.2f\t%1.2f\t%1.2f\t%1.2f\t%1.2f\t%d,%d\t%d,%d\t%1.2f,%1.2f\t%1.2f,%1.2f\t%1.2f,%1.2f,%1.2f,%1.2f" % (
            self.chrom, self.bin_begin, self.bin_endin, mean_bw,
            pval, pval_ext, pval_ttest, pval_ttest2l, pval_ttest2r,
            self.l_umt_met[idx][0], self.l_umt_met[idx][1],
            np_val_ext[0], np_val_ext[1],
            np_exp[0], np_exp[1],
            np_exp_ext[0], np_exp_ext[1],
            val_reg, val_ext, val_sur1, val_sur2))

        # The bin has been reported; drop its accumulators.
        del self.l_umt_met[idx]
        del self.l_rat_umt_met[idx]
# ------------------------------------
# Misc functions
# ------------------------------------

# ------------------------------------
# Classes
# ------------------------------------

# ------------------------------------
# Main
# ------------------------------------

if __name__ == "__main__":
    bigwig_path = 'test.bw'

    # Get information from the bigwig file. The handle is released even if
    # one of the queries raises.
    wWigIO.open(bigwig_path)
    try:
        chroms = wWigIO.getChromSize(bigwig_path)
        wigs = wWigIO.getIntervals(bigwig_path, 'chr1', 10, 200)
    finally:
        wWigIO.close(bigwig_path)
    # Single-argument print(...) behaves the same on Python 2 and 3.
    print(wigs)

    # bigwig -> wig
    wWigIO.bigWigToWig(bigwig_path, 'test.wig')

    # Write the chrom sizes into test.sizes, one "<chrom>\t<size>" per line.
    with open('test.sizes', 'w') as fh:
        for chrom in chroms:
            fh.write(chrom + "\t" + str(chroms[chrom]) + "\n")

    # wig -> bigwig
    wWigIO.wigToBigWig('test.wig', 'test.sizes', 'test2.bw')
def _pwm_max_score(pwm, length):
    """Return the sum, over the first ``length`` columns, of the best base score."""
    return sum(
        max(pwm['A'][index], pwm['C'][index], pwm['T'][index], pwm['G'][index])
        for index in range(length))


def _mean_interval_value(intervals):
    """Return the mean of element ``[2]`` of each interval, or 0 if there are none."""
    count = len(intervals)
    if count == 0:
        return 0
    return sum(interval[2] for interval in intervals) / count


def main(sim_fastq, U2_GTAG_5_file, U2_GTAG_3_file, phylop_vertebrates,
         phylop_primates):
    """Score every exon row with forced canonical AG/GT splice dinucleotides.

    Args:
        sim_fastq: despite the name, a tab-delimited table with exactly six
            fields per row: chrom, estart, eend, exon, exon_len, strand.
        U2_GTAG_5_file: file handed to ``PWM_to_dict``; 13 columns are used.
        U2_GTAG_3_file: file handed to ``PWM_to_dict``; 17 columns are used.
        phylop_vertebrates: bigWig path opened and queried through ``wWigIO``.
        phylop_primates: bigWig path opened and queried through ``wWigIO``.

    Prints one space-separated line per row: chrom, estart, eend, strand,
    combined U2 score, 3'-PWM score of the upstream window, 5'-PWM score of
    the downstream window, vertebrate mean, primate mean.
    """
    wWigIO.open(phylop_vertebrates)
    wWigIO.open(phylop_primates)

    U2_GTAG_5 = PWM_to_dict(U2_GTAG_5_file)
    U2_GTAG_3 = PWM_to_dict(U2_GTAG_3_file)
    U2_GTAG_5_max_score = _pwm_max_score(U2_GTAG_5, 13)
    U2_GTAG_3_max_score = _pwm_max_score(U2_GTAG_3, 17)
    TOTAL_U2_max_score = U2_GTAG_5_max_score + U2_GTAG_3_max_score

    with open(sim_fastq) as handle:
        for row in csv.reader(handle, delimiter='\t'):
            chrom, estart, eend, _exon, _exon_len, strand = row
            estart = int(estart)
            eend = int(eend)

            if strand == "-":
                E5 = str(Genome[chrom][eend - 3:eend + 14]
                         .reverse_complement()).upper()
                E3 = str(Genome[chrom][estart - 10:estart + 3]
                         .reverse_complement()).upper()
            else:
                E5 = str(Genome[chrom][estart - 14:estart + 3]).upper()
                E3 = str(Genome[chrom][eend - 3:eend + 10]).upper()

            # Overwrite the splice dinucleotides with the canonical AG / GT
            # regardless of what the genome holds at those positions.
            E5 = E5[:-5] + "AG" + E5[-3:]
            E3 = E3[:3] + "GT" + E3[5:]

            # "N" bases add nothing but still occupy their PWM column.
            U2_score = 0
            ME5_U2_score = 0
            ME3_U2_score = 0
            for pos, base in enumerate(E5):
                if base != "N":
                    U2_score += U2_GTAG_3[base][pos]
                    ME5_U2_score += U2_GTAG_3[base][pos]
            for pos, base in enumerate(E3):
                if base != "N":
                    U2_score += U2_GTAG_5[base][pos]
                    ME3_U2_score += U2_GTAG_5[base][pos]

            ME3_U2_score = percent(ME3_U2_score, U2_GTAG_5_max_score)
            ME5_U2_score = percent(ME5_U2_score, U2_GTAG_3_max_score)
            U2_score = percent(U2_score, TOTAL_U2_max_score)

            mean_conservation_vertebrates = _mean_interval_value(
                wWigIO.getIntervals(phylop_vertebrates, chrom,
                                    estart - 2, eend + 2))
            mean_conservation_primates = _mean_interval_value(
                wWigIO.getIntervals(phylop_primates, chrom,
                                    estart - 2, eend + 2))

            fields = (chrom, estart, eend, strand, U2_score,
                      ME5_U2_score, ME3_U2_score,
                      mean_conservation_vertebrates,
                      mean_conservation_primates)
            # Space-joined str() matches the Python 2 print statement's output.
            print(" ".join(str(value) for value in fields))
def _pwm_max_score(pwm, length):
    """Return the sum, over the first ``length`` columns, of the best base score."""
    return sum(
        max(pwm['A'][index], pwm['C'][index], pwm['T'][index], pwm['G'][index])
        for index in range(length))


def _mean_interval_value(intervals):
    """Return the mean of element ``[2]`` of each interval, or 0 if there are none."""
    count = len(intervals)
    if count == 0:
        return 0
    return sum(interval[2] for interval in intervals) / count


def main(sim_fastq, U2_GTAG_5_file, U2_GTAG_3_file, phylop_vertebrates,
         phylop_primates):
    """Score every exon row with forced canonical AG/GT splice dinucleotides.

    Args:
        sim_fastq: despite the name, a tab-delimited table with exactly six
            fields per row: chrom, estart, eend, exon, exon_len, strand.
        U2_GTAG_5_file: file handed to ``PWM_to_dict``; 13 columns are used.
        U2_GTAG_3_file: file handed to ``PWM_to_dict``; 17 columns are used.
        phylop_vertebrates: bigWig path opened and queried through ``wWigIO``.
        phylop_primates: bigWig path opened and queried through ``wWigIO``.

    Prints one space-separated line per row: chrom, estart, eend, strand,
    combined U2 score, 3'-PWM score of the upstream window, 5'-PWM score of
    the downstream window, vertebrate mean, primate mean.
    """
    wWigIO.open(phylop_vertebrates)
    wWigIO.open(phylop_primates)

    U2_GTAG_5 = PWM_to_dict(U2_GTAG_5_file)
    U2_GTAG_3 = PWM_to_dict(U2_GTAG_3_file)
    U2_GTAG_5_max_score = _pwm_max_score(U2_GTAG_5, 13)
    U2_GTAG_3_max_score = _pwm_max_score(U2_GTAG_3, 17)
    TOTAL_U2_max_score = U2_GTAG_5_max_score + U2_GTAG_3_max_score

    with open(sim_fastq) as handle:
        for row in csv.reader(handle, delimiter='\t'):
            chrom, estart, eend, _exon, _exon_len, strand = row
            estart = int(estart)
            eend = int(eend)

            if strand == "-":
                E5 = str(Genome[chrom][eend - 3:eend + 14]
                         .reverse_complement()).upper()
                E3 = str(Genome[chrom][estart - 10:estart + 3]
                         .reverse_complement()).upper()
            else:
                E5 = str(Genome[chrom][estart - 14:estart + 3]).upper()
                E3 = str(Genome[chrom][eend - 3:eend + 10]).upper()

            # Overwrite the splice dinucleotides with the canonical AG / GT
            # regardless of what the genome holds at those positions.
            E5 = E5[:-5] + "AG" + E5[-3:]
            E3 = E3[:3] + "GT" + E3[5:]

            # "N" bases add nothing but still occupy their PWM column.
            U2_score = 0
            ME5_U2_score = 0
            ME3_U2_score = 0
            for pos, base in enumerate(E5):
                if base != "N":
                    U2_score += U2_GTAG_3[base][pos]
                    ME5_U2_score += U2_GTAG_3[base][pos]
            for pos, base in enumerate(E3):
                if base != "N":
                    U2_score += U2_GTAG_5[base][pos]
                    ME3_U2_score += U2_GTAG_5[base][pos]

            ME3_U2_score = percent(ME3_U2_score, U2_GTAG_5_max_score)
            ME5_U2_score = percent(ME5_U2_score, U2_GTAG_3_max_score)
            U2_score = percent(U2_score, TOTAL_U2_max_score)

            mean_conservation_vertebrates = _mean_interval_value(
                wWigIO.getIntervals(phylop_vertebrates, chrom,
                                    estart - 2, eend + 2))
            mean_conservation_primates = _mean_interval_value(
                wWigIO.getIntervals(phylop_primates, chrom,
                                    estart - 2, eend + 2))

            fields = (chrom, estart, eend, strand, U2_score,
                      ME5_U2_score, ME3_U2_score,
                      mean_conservation_vertebrates,
                      mean_conservation_primates)
            # Space-joined str() matches the Python 2 print statement's output.
            print(" ".join(str(value) for value in fields))
def _pwm_max_score(pwm, length):
    """Return the sum, over the first ``length`` columns, of the best base score."""
    return sum(
        max(pwm['A'][index], pwm['C'][index], pwm['T'][index], pwm['G'][index])
        for index in range(length))


def _mean_interval_value(intervals):
    """Return the mean of element ``[2]`` of each interval, or 0 if there are none."""
    count = len(intervals)
    if count == 0:
        return 0
    return sum(interval[2] for interval in intervals) / count


def main(gencode_bed, U2_GTAG_5_file, U2_GTAG_3_file, phylop_vertebrates,
         phylop_primates):
    """Score internal exons of a BED12-style file and print their conservation.

    Args:
        gencode_bed: tab-delimited file; columns 0, 1, 5, 10 and 11 are read
            as chromosome, start, strand, block sizes and block starts.
        U2_GTAG_5_file: file handed to ``PWM_to_dict``; 13 columns are used.
        U2_GTAG_3_file: file handed to ``PWM_to_dict``; 17 columns are used.
        phylop_vertebrates: bigWig path opened and queried through ``wWigIO``.
        phylop_primates: bigWig path opened and queried through ``wWigIO``.

    Only exons flanked by "AG" ... "GT" in the extracted windows are kept.
    Prints one space-separated line per unique exon: chrom, estart, eend,
    strand, U2 score, vertebrate mean, primate mean.
    """
    wWigIO.open(phylop_vertebrates)
    wWigIO.open(phylop_primates)

    U2_GTAG_5 = PWM_to_dict(U2_GTAG_5_file)
    U2_GTAG_3 = PWM_to_dict(U2_GTAG_3_file)
    TOTAL_U2_max_score = (_pwm_max_score(U2_GTAG_5, 13) +
                          _pwm_max_score(U2_GTAG_3, 17))

    # Raise the limit before the reader parses anything, so it also covers
    # the first row.
    csv.field_size_limit(1000000000)

    exons = set()
    with open(gencode_bed) as handle:
        for row in csv.reader(handle, delimiter='\t'):
            # Lists (not map objects) so the slices below work on Python 3.
            qstarts = [int(v) for v in row[11].strip(",").split(",")]
            blocksizes = [int(v) for v in row[10].strip(",").split(",")]
            start = int(row[1])
            strand = row[5]
            chrom = row[0]

            # [1:-1] drops the first and last block of every record.
            for q1, b in zip(qstarts[1:-1], blocksizes[1:-1]):
                estart = start + q1
                eend = estart + b

                if strand == "-":
                    E5 = str(Genome[chrom][eend - 3:eend + 14]
                             .reverse_complement()).upper()
                    E3 = str(Genome[chrom][estart - 10:estart + 3]
                             .reverse_complement()).upper()
                else:
                    E5 = str(Genome[chrom][estart - 14:estart + 3]).upper()
                    E3 = str(Genome[chrom][eend - 3:eend + 10]).upper()

                # Non-canonical exons are never stored, so skip them before
                # doing any scoring work.
                if E5[-5:-3] != "AG" or E3[3:5] != "GT":
                    continue

                # "N" bases add nothing but still occupy their PWM column
                # (without this guard an "N" raised KeyError).
                U2_score = 0
                for pos, base in enumerate(E5):
                    if base != "N":
                        U2_score += U2_GTAG_3[base][pos]
                for pos, base in enumerate(E3):
                    if base != "N":
                        U2_score += U2_GTAG_5[base][pos]
                U2_score = percent(U2_score, TOTAL_U2_max_score)

                exons.add((chrom, estart, eend, strand, U2_score))

    for chrom, estart, eend, strand, U2_score in exons:
        mean_conservation_vertebrates = _mean_interval_value(
            wWigIO.getIntervals(phylop_vertebrates, chrom,
                                estart - 2, eend + 2))
        mean_conservation_primates = _mean_interval_value(
            wWigIO.getIntervals(phylop_primates, chrom,
                                estart - 2, eend + 2))

        fields = (chrom, estart, eend, strand, U2_score,
                  mean_conservation_vertebrates, mean_conservation_primates)
        # Space-joined str() matches the Python 2 print statement's output.
        print(" ".join(str(value) for value in fields))