Beispiel #1
0
def main():
    """Summarise, per BED interval, whether it hits exons, introns, or both.

    Reads the tab-separated file named by ``sys.argv[1]``.  For every row,
    columns 0-5 are handed to ``Bed`` and columns 6-17 to ``genebed``; the
    interval is looked up first in the gene's exons and, if that yields an
    empty string, in its introns (via ``bed_in_bedlist``).  If both lookups
    return an empty string the script exits with an error message.

    After the whole file is read, one line per distinct ``bed.id`` is printed
    to stdout with four tab-separated parts:

    * the six BED columns of the first row seen for that id,
    * all annotation strings joined with ``|``,
    * a compact code string: one ``E`` per annotation containing "Exon"
      followed by one ``I`` per annotation containing "Intron",
    * a label: ``Alternative_ExonIntron``, ``Constitutive_Exon``,
      ``Constitutive_Intron`` or ``Unclassified``.
    """
    bed2anno = defaultdict(list)
    bedid2bed = {}
    # Context manager so the input handle is closed even when sys.exit fires.
    with open(sys.argv[1], "r") as handle:
        for raw in handle:
            line = raw.strip().split("\t")
            bed = Bed(line[0:6])
            if bed.id not in bedid2bed:
                bedid2bed[bed.id] = line[0:6]
            gene = genebed(line[6:18])
            s = bed_in_bedlist(bed, gene.Exons())
            if s == "":
                s = bed_in_bedlist(bed, gene.Introns())
            if s == "":
                sys.exit("### intersection is wrong")
            bed2anno[bed.id].append(s)
    for k, v in bed2anno.items():
        x = "\t".join(bedid2bed[k])
        # All exon codes come first, then all intron codes (not input order).
        ss = ["E" for e in v if "Exon" in e]
        ss += ["I" for e in v if "Intron" in e]
        has_exon = "E" in ss
        has_intron = "I" in ss
        if has_exon and has_intron:
            alternative = "Alternative_ExonIntron"
        elif has_exon:
            alternative = "Constitutive_Exon"
        elif has_intron:
            alternative = "Constitutive_Intron"
        else:
            # Previously this branch left `alternative` unbound (NameError on
            # the first interval) or silently reused the previous interval's
            # label; emit an explicit placeholder instead.
            alternative = "Unclassified"
        print("{0}\t{1}\t{2}\t{3}".format(x, "|".join(v), "".join(ss),
                                          alternative))
Beispiel #2
0
def gene_utr_cds_length(genebed_file):
    """Print and return the mean 5'UTR, CDS, 3'UTR and ncRNA lengths.

    Each non-comment line (not starting with ``#``) of *genebed_file* is split
    on tabs and passed to ``genebed``; ``ORF_UTR_size_of`` then supplies the
    ``(utr5, cds, utr3)`` sizes.  Transcripts with ``cds < 10`` are counted as
    ncRNA (their length is ``utr5 + cds + utr3``); all others contribute to
    the coding-feature averages.

    Args:
        genebed_file: path to the tab-separated gene file.

    Returns:
        Tuple ``(mean_utr5, mean_cds, mean_utr3, mean_ncRNA)`` of Python
        floats; a category with no members yields ``0.0``.
    """
    UTR5 = []
    ORF = []
    UTR3 = []
    ncRNA = []
    with open(genebed_file, "r") as handle:
        for line in handle:
            if line.startswith("#"):
                continue
            g = genebed(line.strip().split("\t"))
            utr5, cds, utr3 = ORF_UTR_size_of(g)
            if cds < 10:
                ncRNA.append(utr5 + cds + utr3)
            else:
                # cds == 10 used to fall through and be dropped entirely;
                # treat it as coding, matching the `cds < 10` -> ncRNA /
                # otherwise protein_coding split used by utr5_cds_utr3_rPos.
                UTR5.append(utr5)
                ORF.append(cds)
                UTR3.append(utr3)

    def _mean_or_zero(values):
        # np.where() evaluated np.mean([]) eagerly and triggered a
        # "Mean of empty slice" RuntimeWarning; only average non-empty lists.
        return float(np.mean(values)) if values else 0.0

    mean_utr5 = _mean_or_zero(UTR5)
    mean_utr3 = _mean_or_zero(UTR3)
    mean_cds = _mean_or_zero(ORF)
    mean_ncRNA = _mean_or_zero(ncRNA)
    print("##########################")
    print("Average length for features: UTR5, CDS, UTR3, and ncRNA")
    print(mean_utr5, mean_cds, mean_utr3, mean_ncRNA)
    return mean_utr5, mean_cds, mean_utr3, mean_ncRNA
def Cluster2pIntronRetention(Cluster):
    """Report junctions that partially retain an exon ("pfIntronRetention").

    *Cluster* maps a cluster id to a dict of ``junction -> value`` where the
    junction key is a tab-separated "chrom<TAB>start<TAB>end" string, e.g.
    ``'clu_5077': {'chrX<TAB>166458208<TAB>166462315': 0.0195...,
    'chrX<TAB>166458343<TAB>166462315': -0.0195...}``.

    Only clusters holding more than two junctions are examined.  Each junction
    is intersected with the module-level ``Genes``; for every exon of every hit
    gene whose overlap with the junction is at least 10 and less than the exon
    length minus 2, the overlap region is classified with ``m6AORnot``.  When
    all classifications for a junction agree, one line is written to the
    module-level ``fo_pfIR`` handle.
    """
    for cluster_id, junctions in Cluster.items():
        if len(junctions) <= 2:
            continue
        for junction_key, junction_value in junctions.items():
            junction_tool = BedTool(junction_key, from_string=True)
            junction_bed = Bed(junction_key.split("\t"))
            gene_hits = junction_tool.intersect(Genes, wb=True)
            m6a_calls = []
            event_types = []
            if len(gene_hits) >= 1:
                for hit in gene_hits:
                    transcript = genebed(hit[3:])
                    for exon in transcript.Exons():
                        if not junction_bed.overlap(junction_bed, exon):
                            continue
                        shared = junction_bed.overlapLength(exon)
                        # An overlap (almost) equal to the exon length means
                        # the whole exon lies inside the intron; skip those.
                        if not (10 <= shared < exon.length() - 2):
                            continue
                        exon_tool = BedTool(str(exon), from_string=True)
                        retained = junction_tool.intersect(exon_tool)
                        event_types.append("pfIntronRetention")
                        m6a_calls.append(m6AORnot(retained))
            distinct_calls = set(m6a_calls)
            if len(distinct_calls) == 1 and len(event_types) > 0:
                record = "{0}\t{1}\t{2}\t{3}\t{4}\n".format(
                    "|".join(set(event_types)), "|".join(distinct_calls),
                    cluster_id, junction_key, junction_value)
                fo_pfIR.write(record)
def main():
    """Print, per input row, the peak's score and that of a random gene position.

    Summits are loaded once with ``summitTobed`` from a hard-coded path.  Each
    row of the tab-separated file ``sys.argv[1]`` supplies a peak (columns
    0-5, id in column 3) and a gene (columns 6-17).  Two values are printed,
    tab-separated:

    * ``bed_in_Intron`` of the peak against the gene's own six BED columns,
    * ``bed_in_Intron`` of a 1-bp interval placed uniformly at random in
      ``[gene.start, gene.end]`` against the same gene interval.

    The exact meaning of the ``bed_in_Intron`` value is defined elsewhere
    (presumably a relative position -- confirm against that helper).

    Exits with an error message when the peak id is missing from the summit
    table or the row has fewer than four columns.
    """
    summit2bed = summitTobed("../m6AIP_Cfg1_Cfg2.narrowPeak.refined.bed")
    with open(sys.argv[1], 'r') as handle:
        for raw in handle:
            line = raw.strip().split("\t")
            ### from the bed.id to bed interval
            try:
                bed = summit2bed[line[3]]
            except (KeyError, IndexError):
                # Narrowed from a bare except so that Ctrl-C and genuine bugs
                # are no longer reported as a summit-table problem.
                sys.exit("# Something is wrong with the summit2bed")
            gene = genebed(line[6:18])
            # NOTE: the former `random_n = random.randint(0, len(Introns) - 1)`
            # was never used and raised ValueError for genes without introns,
            # so it was removed.  This shifts the random stream for a fixed
            # seed but not the distribution of the output.
            gene_span = [Bed(line[6:12])]
            f1 = bed_in_Intron(Bed(line[0:6]), gene_span)
            random_start = random.randint(gene.start, gene.end)
            random_point = Bed([
                bed.chr, random_start, random_start + 1, "None", 0, bed.strand
            ])
            f2 = bed_in_Intron(random_point, [Bed(line[6:12])])
            print("{0}\t{1}".format(f1, f2))
Beispiel #5
0
def main():
    """For each peak/intron overlap, print the overlap and a random control window.

    Summits are loaded once with ``summitTobed`` from a hard-coded path.  Each
    row of the tab-separated file ``sys.argv[1]`` supplies a peak (columns
    0-5, id in column 3) and a gene (columns 6-17).  For every intron of the
    gene that overlaps the summit interval, a window of the same length as the
    overlap is placed at random inside that intron; if the intron is too short
    (needs ``length >= overlap + 1``), the nearest preceding intron that is
    long enough is used instead.

    One tab-separated line is printed per overlap whose ``bed_in_Intron``
    value ``f`` satisfies ``f + 1 > 0``: summit interval, intron, ``f``,
    overlap start/end, random window start/end, the intron's length and the
    length of one randomly chosen intron of the gene.

    Exits with an error message when the peak id is missing from the summit
    table or the row has fewer than four columns.
    """
    summit2bed = summitTobed("../m6AIP_Cfg1_Cfg2.narrowPeak.refined.bed")
    with open(sys.argv[1], 'r') as handle:
        for raw in handle:
            line = raw.strip().split("\t")
            ### from the bed.id to bed interval
            try:
                bed = summit2bed[line[3]]
            except (KeyError, IndexError):
                sys.exit("# Something is wrong with the summit2bed")
            gene = genebed(line[6:18])
            Introns = gene.Introns()
            if not Introns:
                # random.randint(0, -1) would raise ValueError, and there is
                # nothing to report for an intron-less gene anyway.
                continue
            random_n = random.randint(0, len(Introns) - 1)
            for i, intron in enumerate(Introns):
                if not bed.overlap(bed, intron):
                    continue
                L = bed.overlapLength(intron)
                f = bed_in_Intron(Bed(line[0:6]), [intron])
                overlap_start = max(bed.start, intron.start)
                overlap_end = min(bed.end, intron.end)
                # Candidate hosts for the random window: this intron first,
                # then the preceding introns from nearest to farthest.
                host = None
                for candidate in [intron] + Introns[:i][::-1]:
                    if candidate.length() >= L + 1:
                        host = candidate
                        break
                if host is None:
                    # Previously this case reused random_start/random_end from
                    # an earlier overlap (or raised NameError); skip instead of
                    # emitting a bogus control window.
                    print("## No intron long enough for a random window of "
                          "length {0}; skipping {1}".format(L, line[3]),
                          file=sys.stderr)
                    continue
                random_start = random.randint(
                    0,
                    host.length() - 1 - L) + host.start
                random_end = random_start + L
                if f + 1 > 0:
                    print("{0}\t{1}\t{2}\t{3}\t{4}\t{5}\t{6}\t{7}\t{8}".format(
                        bed, intron, f,
                        overlap_start, overlap_end, random_start, random_end,
                        intron.length(), Introns[random_n].length()))
Beispiel #6
0
def utr5_cds_utr3_rPos(intersect_file):
    """Assign each peak to intron / ncRNA / 5'UTR / CDS / 3'UTR and collect positions.

    Every non-comment row of *intersect_file* (tab-separated) provides a peak
    (columns 0-5) and a gene (columns 6-17).  Per the original author's note,
    the collected values are positions relative to the feature start, e.g. a
    single-point peak at 125 in a 5'UTR spanning [0, 1220] gives
    125/1220 = 0.10245902.

    Classification order for each row:

    1. If the gene has introns and ``bed_in_Intron`` returns a value >= 0, the
       peak is intronic.
    2. Otherwise, genes with ``cds < 10`` are treated as ncRNA.
    3. Otherwise the peak is tested for full containment in the CDS, then the
       5'UTR, then the 3'UTR.

    Each classified row is also written to the module-level ``fo`` handle with
    two extra columns: gene class (``ncRNA`` / ``protein_coding``) and region
    (``intron`` / ``exon``).  Per-category counts are printed to stdout.

    Args:
        intersect_file: path to the peak/gene intersection file.

    Returns:
        Tuple of lists ``(UTR5_rpos, CDS_rpos, UTR3_rpos, ncRNA_rpos,
        Intron_rpos)``.
    """
    UTR5_rpos = []
    CDS_rpos = []
    UTR3_rpos = []
    ncRNA_rpos = []
    Intron_rpos = []
    # Context manager: the original left the input handle open.
    with open(intersect_file, "r") as handle:
        for raw in handle:
            if raw.startswith("#"):
                continue
            line = raw.strip().split("\t")
            b = Bed(line[0:6])
            g = genebed(line[6:18])
            utr5, cds, utr3 = ORF_UTR_size_of(g)
            #### Introns
            introns = g.Introns()
            if introns:
                f = bed_in_Intron(b, introns)
                if f >= 0:
                    Intron_rpos.append(f)
                    gene_class = "ncRNA" if cds < 10 else "protein_coding"
                    fo.write("{0}\t{1}\t{2}\n".format(
                        "\t".join(line), gene_class, "intron"))
                    continue
            ########
            if cds < 10:
                ncRNA_rpos.append(bed_in_bedlist(b, g.Exons()))
                fo.write("{0}\t{1}\t{2}\n".format("\t".join(line), "ncRNA",
                                                  "exon"))
                continue
            try:
                if g.CDS().start <= b.start < b.end <= g.CDS().end:
                    CDS_rpos.append(
                        bed_in_bedlist(bed=b, bedlist=g.CDS().Exons()))
                elif utr5 > 0 and g.UTR5(
                ).start <= b.start < b.end <= g.UTR5().end:
                    UTR5_rpos.append(
                        bed_in_bedlist(bed=b, bedlist=g.UTR5().Exons()))
                elif utr3 > 0 and g.UTR3(
                ).start <= b.start < b.end <= g.UTR3().end:
                    UTR3_rpos.append(
                        bed_in_bedlist(bed=b, bedlist=g.UTR3().Exons()))
                # Peaks straddling a feature boundary are not collected but
                # are still reported as protein_coding exon below.
                fo.write("{0}\t{1}\t{2}\n".format("\t".join(line),
                                                  "protein_coding",
                                                  "exon"))
            except AttributeError:
                # Deliberate best-effort kept from the original: rows whose
                # feature lookup fails with AttributeError (presumably a
                # missing CDS/UTR object -- confirm) are skipped silently.
                pass
    print("####################")
    print("# Intron peaks:")
    print(len(Intron_rpos))
    print("# ncRNA peaks: ")
    print(len(ncRNA_rpos))
    print("# 5UTR peaks:")
    print(len(UTR5_rpos))
    print("# CDS peaks:")
    print(len(CDS_rpos))
    print("# 3UTR peaks:")
    print(len(UTR3_rpos))
    return UTR5_rpos, CDS_rpos, UTR3_rpos, ncRNA_rpos, Intron_rpos
def _strip_genebed_suffix(path):
    """Return *path* without a trailing ".GeneBed" extension, if present."""
    suffix = ".GeneBed"
    if path.endswith(suffix):
        return path[:-len(suffix)]
    return path


def main():
    """Select one representative transcript per gene and write it as GeneBed.

    Command-line options come from ``parse_argument()``: ``mode`` is ``"m"``
    (largest ORF; ties broken by longest transcript) or ``"r"`` (a random
    transcript), and ``genebed`` is the input GeneBed path.  Gene-to-transcript
    membership is read from ``geneid_genename_transcriptid_genetype.txt`` in
    the working directory (fields: gene id, gene name, ``|``-joined transcript
    ids, optional gene type).

    The output file is named after the input with its ``.GeneBed`` extension
    replaced by ``.MaxORF_LongestNcRNA.GeneBed`` or
    ``.randomTranscript.GeneBed``.  Each output row is the chosen transcript's
    record with field 3 replaced by
    ``transcript|genename[|geneid][|genetype]`` (gene id only when it differs
    from the gene name, gene type only when non-empty).

    Transcripts absent from the GeneBed file are reported on stderr and
    ignored; a gene with no usable transcript is skipped.  An unknown mode
    terminates the program.
    """
    size_of_ORF = {}
    size_of_RNA = {}
    transcript_bed = {}

    args = parse_argument()
    mode = args.mode
    genefile = args.genebed

    # str.strip(".GeneBed") removes any of those *characters* from both ends
    # (e.g. the leading ".." of a relative path); drop the suffix explicitly.
    basename = _strip_genebed_suffix(genefile)
    if mode == "m":
        out_path = basename + ".MaxORF_LongestNcRNA.GeneBed"
        print("## Searching the maxORF and Longest Transcript ... ",
              file=sys.stderr)
    elif mode == "r":
        out_path = basename + ".randomTranscript.GeneBed"
        print("## Searching the random Transcript ... ", file=sys.stderr)
    else:
        # Previously execution continued and crashed later on the unbound
        # output handle; stop here with the same message.
        sys.exit("## Error in the isoform selection mode ")

    with open(out_path, "w") as fo:
        print("## Calculating the size of ORF and gene length ... ",
              file=sys.stderr)
        ## loading the GeneBed file
        for x in Iter_filehandle(filename=genefile):
            gene = genebed(x)
            ## gene.id in genebed class is actually transcript_id
            transcript_bed[gene.id] = x
            size_of_ORF[gene.id] = ORF_UTR_size_of_v2(gene)[1]
            size_of_RNA[gene.id] = length_of_transcript(gene)

        ### geneid_genename_transcriptid_genetype.txt file
        for x in Iter_filehandle(
                filename="geneid_genename_transcriptid_genetype.txt"):
            #### Geneid, GeneName, Transcripts, GeneType
            genename = x[1]
            transcripts = x[2].split('|')
            gene_type = x[3] if len(x) > 3 else ""

            # Keep only transcripts present in the GeneBed file so that sizes
            # and ids stay aligned (the old parallel `orf` list went out of
            # step with `transcripts` whenever a lookup failed).
            known = []
            for t in transcripts:
                if t in size_of_ORF:
                    known.append(t)
                else:
                    print("## Error for calculating the ORF size for, ",
                          t,
                          file=sys.stderr)
            if not known:
                # max() on an empty list used to raise ValueError here.
                continue

            if mode == "m":
                ## max ORF, if equal orf, long transcript; max() returns the
                ## first maximal element, matching the original tie-breaking.
                select_one = max(
                    known, key=lambda t: (size_of_ORF[t], size_of_RNA[t]))
            else:
                select_one = known[random.randint(0, len(known) - 1)]

            select_genebed = transcript_bed[select_one]
            name_fields = [select_one, genename]
            if x[0] != x[1]:
                name_fields.append(x[0])
            if gene_type:
                name_fields.append(gene_type)
            fo.write("{0}\t{1}\t{2}\n".format(
                "\t".join(select_genebed[0:3]),
                "|".join(name_fields),
                "\t".join(select_genebed[4:])))
    print("## Done ", file=sys.stderr)