def main():
    """Label every BED interval by the exon/intron annotations it received.

    Reads the tab-separated file named by ``sys.argv[1]``. For each row,
    columns 0-5 are parsed as a ``Bed`` and columns 6-17 as a ``genebed``.
    ``bed_in_bedlist`` is first asked about the gene's exons and, when that
    returns an empty string, about its introns; if both are empty the script
    exits with an error message.

    After all rows are collected, one line per bed id is printed to stdout:
    the six original BED columns, the ``|``-joined annotations, a string of
    ``E``/``I`` codes (all exon hits first, then all intron hits) and a label:
    ``Alternative_ExonIntron``, ``Constitutive_Exon``, ``Constitutive_Intron``
    or ``Unclassified`` (no annotation mentioned "Exon" or "Intron").
    """
    bed2anno = defaultdict(list)
    bedid2bed = {}

    # Context manager so the input handle is closed even on sys.exit().
    with open(sys.argv[1], "r") as handle:
        for row in handle:
            fields = row.strip().split("\t")
            bed = Bed(fields[0:6])
            if bed.id not in bedid2bed:
                bedid2bed[bed.id] = fields[0:6]
            gene = genebed(fields[6:18])

            # Exons take precedence; introns are only consulted as a fallback.
            hit = bed_in_bedlist(bed, gene.Exons())
            if hit == "":
                hit = bed_in_bedlist(bed, gene.Introns())
            if hit == "":
                sys.exit("### intersection is wrong")
            bed2anno[bed.id].append(hit)

    for bed_id, annotations in bed2anno.items():
        bed_columns = "\t".join(bedid2bed[bed_id])
        codes = ["E" for anno in annotations if "Exon" in anno]
        codes += ["I" for anno in annotations if "Intron" in anno]
        has_exon = "E" in codes
        has_intron = "I" in codes

        if has_exon and has_intron:
            alternative = "Alternative_ExonIntron"
        elif has_exon:
            alternative = "Constitutive_Exon"
        elif has_intron:
            alternative = "Constitutive_Intron"
        else:
            # Previously this branch left `alternative` unbound (NameError on
            # the first row) or silently reused the previous row's label.
            alternative = "Unclassified"

        print("{0}\t{1}\t{2}\t{3}".format(bed_columns, "|".join(annotations),
                                          "".join(codes), alternative))
def gene_utr_cds_length(genebed_file):
    """Report the mean 5'UTR, CDS, 3'UTR and ncRNA lengths of a GeneBed file.

    Every line of ``genebed_file`` that does not start with ``#`` is split on
    tabs, wrapped in ``genebed`` and measured with ``ORF_UTR_size_of``, which
    yields ``(utr5, cds, utr3)``. Records with ``cds < 10`` contribute their
    total length to the ncRNA pool; records with ``cds > 10`` contribute to
    the three coding-feature pools.

    Args:
        genebed_file: path of the GeneBed file to read.

    Returns:
        Tuple ``(mean_utr5, mean_cds, mean_utr3, mean_ncRNA)`` of floats.
        A pool with no entries yields ``0.0``. The same four values are also
        printed to stdout.
    """
    UTR5 = []
    ORF = []
    UTR3 = []
    ncRNA = []

    with open(genebed_file, "r") as handle:
        for row in handle:
            if row.startswith("#"):
                continue
            g = genebed(row.strip().split("\t"))
            utr5, cds, utr3 = ORF_UTR_size_of(g)
            if cds < 10:
                ncRNA.append(utr5 + cds + utr3)
            elif cds > 10:
                UTR5.append(utr5)
                ORF.append(cds)
                UTR3.append(utr3)
            # NOTE(review): cds == 10 lands in neither pool, exactly as in the
            # original code. Confirm whether ">= 10" was intended.

    def _mean_or_zero(values):
        # Test emptiness *before* calling np.mean: np.where() evaluated both
        # branches, so np.mean([]) ran anyway and emitted a RuntimeWarning.
        return float(np.mean(values)) if values else 0.0

    mean_utr5 = _mean_or_zero(UTR5)
    mean_utr3 = _mean_or_zero(UTR3)
    mean_cds = _mean_or_zero(ORF)
    mean_ncRNA = _mean_or_zero(ncRNA)

    print("##########################")
    print("Average length for features: UTR5, CDS, UTR3, and ncRNA")
    print(mean_utr5, mean_cds, mean_utr3, mean_ncRNA)
    return mean_utr5, mean_cds, mean_utr3, mean_ncRNA
def Cluster2pIntronRetention(Cluster):
    """Write junctions that partially overlap an exon to ``fo_pfIR``.

    ``Cluster`` maps a cluster id to a dict whose keys are tab-separated
    junction strings (e.g. "chrX<TAB>166458208<TAB>166462315") and whose
    values are written out unchanged (floats in the original example).
    Only clusters holding more than two junctions are examined.

    Each junction is intersected with the module-level ``Genes`` BedTool.
    For every exon of every hit gene that overlaps the junction by at least
    10 bases but by less than ``exon.length() - 2``, the overlap region is
    passed to ``m6AORnot`` and a "pfIntronRetention" tag is recorded.
    A row is written only if at least one exon qualified and all
    ``m6AORnot`` results for the junction agree.
    """
    for cluster_id, junctions in Cluster.items():
        if len(junctions) <= 2:
            continue

        for junction_key, junction_value in junctions.items():
            junction_tool = BedTool(junction_key, from_string=True)
            junction_bed = Bed(junction_key.split("\t"))
            gene_hits = junction_tool.intersect(Genes, wb=True)

            m6a_labels = []
            splicing_labels = []

            if len(gene_hits) >= 1:
                for hit in gene_hits:
                    transcript = genebed(hit[3:])
                    for exon in transcript.Exons():
                        if not junction_bed.overlap(junction_bed, exon):
                            continue
                        shared = junction_bed.overlapLength(exon)
                        # An overlap equal to the exon length would mean the
                        # whole exon sits inside the intron; those are skipped.
                        if not (10 <= shared < exon.length() - 2):
                            continue
                        exon_tool = BedTool(str(exon), from_string=True)
                        retained = junction_tool.intersect(exon_tool)
                        splicing_labels.append("pfIntronRetention")
                        m6a_labels.append(m6AORnot(retained))

            unique_m6a = set(m6a_labels)
            if len(unique_m6a) == 1 and len(splicing_labels) > 0:
                fo_pfIR.write("{0}\t{1}\t{2}\t{3}\t{4}\n".format(
                    "|".join(set(splicing_labels)), "|".join(unique_m6a),
                    cluster_id, junction_key, junction_value))
def main():
    """Print an observed and a random-control ``bed_in_Intron`` value per row.

    Loads ``summit2bed`` from the refined narrowPeak file, then reads the
    tab-separated file named by ``sys.argv[1]``. For each row:

    * column 3 is looked up in ``summit2bed`` (the script exits if the lookup
      fails);
    * ``f1`` is ``bed_in_Intron`` of the row's BED interval (columns 0-5)
      against the interval built from columns 6-11;
    * ``f2`` is the same measure for a 1-bp interval starting at a random
      coordinate drawn with ``random.randint(gene.start, gene.end)``, on the
      looked-up bed's chromosome and strand.

    ``f1`` and ``f2`` are printed tab-separated to stdout.
    """
    summit2bed = summitTobed("../m6AIP_Cfg1_Cfg2.narrowPeak.refined.bed")

    with open(sys.argv[1], "r") as handle:
        for row in handle:
            fields = row.strip().split("\t")

            # Map the bed id in column 3 back to its bed interval.
            try:
                bed = summit2bed[fields[3]]
            except (KeyError, IndexError):
                # Narrowed from a bare `except:` so that KeyboardInterrupt and
                # unrelated errors are no longer hidden behind this message.
                sys.exit("# Something is wrong with the summit2bed")

            gene = genebed(fields[6:18])
            # The unused `random.randint(0, len(Introns) - 1)` draw was
            # dropped: it raised ValueError for genes without introns even
            # though its result was never read.

            f1 = bed_in_Intron(Bed(fields[0:6]), [Bed(fields[6:12])])

            # NOTE(review): randint is inclusive on both ends, so the control
            # point can start at gene.end -- confirm that is intended.
            random_start = random.randint(gene.start, gene.end)
            control = Bed([
                bed.chr, random_start, random_start + 1, "None", 0,
                bed.strand
            ])
            f2 = bed_in_Intron(control, [Bed(fields[6:12])])

            print("{0}\t{1}".format(f1, f2))
def main():
    """Print each peak/intron overlap next to a random same-length window.

    Loads ``summit2bed`` from the refined narrowPeak file, then reads the
    tab-separated file named by ``sys.argv[1]``. Column 3 of each row is
    looked up in ``summit2bed`` (the script exits if the lookup fails) and
    columns 6-17 are parsed as a ``genebed``.

    For every intron of the gene that overlaps the looked-up bed, a random
    window of the overlap length ``L`` is drawn inside that intron. If the
    intron is too short for that, the preceding introns are tried from the
    nearest one backwards. When ``bed_in_Intron`` for the row's BED interval
    is greater than -1, one tab-separated line is printed: bed, intron, that
    value, overlap start/end, random start/end, the intron length and the
    length of one randomly chosen intron of the gene. The random start/end
    are printed as ``NA`` when no intron could host the window.
    """
    summit2bed = summitTobed("../m6AIP_Cfg1_Cfg2.narrowPeak.refined.bed")

    with open(sys.argv[1], "r") as handle:
        for row in handle:
            fields = row.strip().split("\t")

            # Map the bed id in column 3 back to its bed interval.
            try:
                bed = summit2bed[fields[3]]
            except (KeyError, IndexError):
                sys.exit("# Something is wrong with the summit2bed")

            gene = genebed(fields[6:18])
            introns = gene.Introns()
            if not introns:
                # Nothing to report; previously randint(0, -1) raised here.
                continue
            random_n = random.randint(0, len(introns) - 1)

            for i, intron in enumerate(introns):
                if not bed.overlap(bed, intron):
                    continue

                L = bed.overlapLength(intron)
                f = bed_in_Intron(Bed(fields[0:6]), [intron])
                overlap_start = max(bed.start, intron.start)
                overlap_end = min(bed.end, intron.end)

                # Reset per intron so a failed search can never leak the
                # window drawn for an earlier intron or an earlier row.
                random_start = None
                try:
                    random_start = random.randint(
                        0, intron.length() - 1 - L) + intron.start
                except ValueError:
                    # Intron shorter than L + 1: fall back to the closest
                    # preceding intron that is long enough.
                    for j in range(i - 1, -1, -1):
                        candidate = introns[j]
                        if candidate.length() >= L + 1:
                            random_start = random.randint(
                                0, candidate.length() - 1 - L) + candidate.start
                            break

                if random_start is None:
                    shown_start = shown_end = "NA"
                else:
                    shown_start = random_start
                    shown_end = random_start + L

                if f + 1 > 0:
                    print("{0}\t{1}\t{2}\t{3}\t{4}\t{5}\t{6}\t{7}\t{8}".format(
                        bed, intron, f, overlap_start, overlap_end,
                        shown_start, shown_end, intron.length(),
                        introns[random_n].length()))
def utr5_cds_utr3_rPos(intersect_file):
    """Classify intersected peaks by gene feature and collect their positions.

    Per the original description: when the overlapped genebed is a
    protein-coding gene, the relative position to the feature start is
    recorded, e.g. UTR5 is [0, 1220] and the single point is 125, giving
    125 / 1220 = 0.10245902.

    Each non-comment line of ``intersect_file`` is split on tabs; columns 0-5
    form a ``Bed`` and columns 6-17 a ``genebed``. Classification order:

    1. If the gene has introns and ``bed_in_Intron`` returns a value >= 0, the
       value goes to the intron list and the row is written to the
       module-level ``fo`` handle tagged ``intron``.
    2. Otherwise, genes with ``cds < 10`` are treated as ncRNA and measured
       against all exons.
    3. Otherwise the bed must lie entirely inside the CDS, 5'UTR or 3'UTR
       span (tested in that order) to be measured; the row is written tagged
       ``protein_coding``/``exon`` whether or not one of them matched.

    Args:
        intersect_file: path of the bed/genebed intersection file.

    Returns:
        ``(UTR5_rpos, CDS_rpos, UTR3_rpos, ncRNA_rpos, Intron_rpos)`` lists.
        The list sizes are also printed to stdout.
    """
    UTR5_rpos = []
    CDS_rpos = []
    UTR3_rpos = []
    ncRNA_rpos = []
    Intron_rpos = []

    # `with` closes the input handle; the original left it open.
    with open(intersect_file, "r") as handle:
        for row in handle:
            if row.startswith("#"):
                continue
            line = row.strip().split("\t")
            b = Bed(line[0:6])
            g = genebed(line[6:18])
            utr5, cds, utr3 = ORF_UTR_size_of(g)

            #### Introns
            introns = g.Introns()
            if introns:
                f = bed_in_Intron(b, introns)
                if f >= 0:
                    Intron_rpos.append(f)
                    gene_class = "ncRNA" if cds < 10 else "protein_coding"
                    fo.write("{0}\t{1}\t{2}\n".format("\t".join(line),
                                                      gene_class, "intron"))
                    continue

            #### Exons
            if cds < 10:
                ncRNA_rpos.append(bed_in_bedlist(b, g.Exons()))
                fo.write("{0}\t{1}\t{2}\n".format("\t".join(line), "ncRNA",
                                                  "exon"))
            else:
                try:
                    if g.CDS().start <= b.start < b.end <= g.CDS().end:
                        CDS_rpos.append(
                            bed_in_bedlist(bed=b, bedlist=g.CDS().Exons()))
                    elif (utr5 > 0
                          and g.UTR5().start <= b.start < b.end <= g.UTR5().end):
                        UTR5_rpos.append(
                            bed_in_bedlist(bed=b, bedlist=g.UTR5().Exons()))
                    elif (utr3 > 0
                          and g.UTR3().start <= b.start < b.end <= g.UTR3().end):
                        UTR3_rpos.append(
                            bed_in_bedlist(bed=b, bedlist=g.UTR3().Exons()))
                    fo.write("{0}\t{1}\t{2}\n".format("\t".join(line),
                                                      "protein_coding", "exon"))
                except AttributeError:
                    # Kept from the original: rows whose feature lookup raises
                    # AttributeError are skipped without being written.
                    # NOTE(review): presumably CDS()/UTR5()/UTR3() can return
                    # None -- confirm against the genebed class.
                    pass

    print("####################")
    print("# Intron peaks:")
    print(len(Intron_rpos))
    print("# ncRNA peaks: ")
    print(len(ncRNA_rpos))
    print("# 5UTR peaks:")
    print(len(UTR5_rpos))
    print("# CDS peaks:")
    print(len(CDS_rpos))
    print("# 3UTR peaks:")
    print(len(UTR3_rpos))
    return UTR5_rpos, CDS_rpos, UTR3_rpos, ncRNA_rpos, Intron_rpos
def main():
    """Pick one representative transcript per gene and write it as GeneBed.

    Command-line arguments come from ``parse_argument()``: ``args.genebed`` is
    the input GeneBed file and ``args.mode`` selects the strategy:

    * ``"m"`` -- the transcript with the largest ORF; ties are broken by the
      longest transcript, then by order of appearance. Output goes to
      ``<input>.MaxORF_LongestNcRNA.GeneBed``.
    * ``"r"`` -- a uniformly random transcript. Output goes to
      ``<input>.randomTranscript.GeneBed``.

    ``<input>`` is the input path with a trailing ``.GeneBed`` removed. Any
    other mode terminates the script with an error message.

    Genes are read from ``geneid_genename_transcriptid_genetype.txt`` in the
    working directory (columns: gene id, gene name, ``|``-separated transcript
    ids, optional gene type). Transcripts absent from the GeneBed file are
    reported on stderr and ignored; a gene with no usable transcript is
    skipped. In each output row the name column is replaced by
    ``transcript|genename[|geneid][|genetype]``, where the gene id is added
    only when it differs from the gene name and the gene type only when it is
    non-empty.
    """
    genebed_suffix = ".GeneBed"
    size_of_ORF = {}
    size_of_RNA = {}
    transcript_bed = {}

    args = parse_argument()
    mode = args.mode
    genefile = args.genebed

    if mode == "m":
        output_suffix = ".MaxORF_LongestNcRNA.GeneBed"
        banner = "## Searching the maxORF and Longest Transcript ... "
    elif mode == "r":
        output_suffix = ".randomTranscript.GeneBed"
        banner = "## Searching the random Transcript ... "
    else:
        # Previously only a message was printed and the run crashed later on
        # the unbound output handle; stop immediately instead.
        sys.exit("## Error in the isoform selection mode ")

    # str.strip(".GeneBed") removes any of those *characters* from both ends
    # (e.g. "dmel.GeneBed" -> "mel"); drop the literal suffix instead.
    if genefile.endswith(genebed_suffix):
        basename = genefile[:-len(genebed_suffix)]
    else:
        basename = genefile

    with open(basename + output_suffix, "w") as fo:
        print(banner, file=sys.stderr)
        print("## Calculating the size of ORF and gene length ... ",
              file=sys.stderr)

        ## loading the GeneBed file
        for x in Iter_filehandle(filename=genefile):
            gene = genebed(x)
            ## gene.id in genebed class is actually transcript_id
            transcript_bed[gene.id] = x
            size_of_ORF[gene.id] = ORF_UTR_size_of_v2(gene)[1]
            size_of_RNA[gene.id] = length_of_transcript(gene)

        ## geneid_genename_transcriptid_genetype.txt file
        for x in Iter_filehandle(
                filename="geneid_genename_transcriptid_genetype.txt"):
            #### Geneid, GeneName, Transcripts, GeneType
            geneid = x[0]
            genename = x[1]
            transcripts = x[2].split('|')
            genetype = x[3] if len(x) > 3 else ""

            # Keep only transcripts present in the GeneBed file. The original
            # skipped missing ones while still indexing `transcripts` with
            # positions from the shorter ORF list, selecting the wrong isoform.
            candidates = []
            for t in transcripts:
                if t in size_of_ORF:
                    candidates.append(t)
                else:
                    print("## Error for calculating the ORF size for, ", t,
                          file=sys.stderr)
            if not candidates:
                print("## No usable transcript for gene, ", genename,
                      file=sys.stderr)
                continue

            if mode == "m":
                # max() returns the first maximal item, so ties on
                # (ORF size, RNA length) resolve to the earliest transcript.
                select_one = max(
                    candidates,
                    key=lambda t: (size_of_ORF[t], size_of_RNA[t]))
            else:
                select_one = candidates[random.randint(0,
                                                       len(candidates) - 1)]

            select_genebed = transcript_bed[select_one]
            name_parts = [select_one, genename]
            if geneid != genename:
                name_parts.append(geneid)
            if genetype:
                name_parts.append(genetype)

            fo.write("{0}\t{1}\t{2}\n".format(
                "\t".join(select_genebed[0:3]), "|".join(name_parts),
                "\t".join(select_genebed[4:])))

        print("## Done ", file=sys.stderr)