Esempio n. 1
0
    def __init__(self, species, sex, tissue, replicate):
        """Load the PacBio and annotation inputs of one sample and summarise them.

        The sample name is ``<species>_<sex>_<tissue>_<replicate>``. Every file
        is read with ``get_lines`` and a progress message is printed before
        each step. Attributes suffixed ``A`` relate to
        ``<species>.genePred.bed`` / ``<name>.bam.gtf.intersect_gtf``; those
        suffixed ``B`` relate to ``<species>.v3.genePred.bed`` /
        ``<name>.bam.gtf.intersect_v3gtf``.

        Args:
            species: species component of the sample name.
            sex: sex component of the sample name.
            tissue: tissue component of the sample name.
            replicate: replicate component of the sample name.
        """
        pacbio_dir = "../data/pacbio"
        annotation_dir = "../data/annotation"

        self.species = species
        self.sex = sex
        self.tissue = tissue
        self.replicate = replicate
        self.name = "_".join((species, sex, tissue, replicate))

        # PacBio reads in BED form.
        self.filenamePacBioBed = self.name + ".bam.bed"
        print("get lines of " + self.filenamePacBioBed)
        self.linesPacBioBed = get_lines(pacbio_dir, self.filenamePacBioBed)
        print("get isoseqid to exonlen ...")
        pacbio_exonlen, pacbio_line = get_id2exonlen_id2line(
            self.linesPacBioBed)
        self.isoseqid2exonlen = pacbio_exonlen
        self.isoseqid2line = pacbio_line

        # Annotation "A": <species>.genePred.bed
        self.filenameAnnotationBedA = self.species + ".genePred.bed"
        print("get lines of " + self.filenameAnnotationBedA)
        self.linesAnnotationBedA = get_lines(annotation_dir,
                                             self.filenameAnnotationBedA)
        print("get gtf's transid to exonlen ...")
        exonlen_a, line_a = get_id2exonlen_id2line(self.linesAnnotationBedA)
        self.transidA2exonlen = exonlen_a
        self.transidA2line = line_a

        # Annotation "B": <species>.v3.genePred.bed
        self.filenameAnnotationBedB = self.species + ".v3.genePred.bed"
        print("get lines of " + self.filenameAnnotationBedB)
        self.linesAnnotationBedB = get_lines(annotation_dir,
                                             self.filenameAnnotationBedB)
        print("get v3.gtf's transid to exonlen ...")
        exonlen_b, line_b = get_id2exonlen_id2line(self.linesAnnotationBedB)
        self.transidB2exonlen = exonlen_b
        self.transidB2line = line_b

        # Intersection tables of the PacBio reads against each annotation.
        self.filenameA = self.name + ".bam.gtf.intersect_gtf"
        self.filenameB = self.name + ".bam.gtf.intersect_v3gtf"
        print("get lines of " + self.filenameA)
        self.linesA = get_lines(pacbio_dir, self.filenameA)
        print("get lines of " + self.filenameB)
        self.linesB = get_lines(pacbio_dir, self.filenameB)
        print("get intersection A info ...")
        self.isoseqid2transidA, self.transidA2isoseqid = \
            self.get_intersectinfo("A")
        print("get intersection B info ...")
        self.isoseqid2transidB, self.transidB2isoseqid = \
            self.get_intersectinfo("B")

        self.isoseqid2besttransidA = self.get_isoseqid2besttransid("A")
        self.isoseqid2besttransidB = self.get_isoseqid2besttransid("B")

        # Jaccard info (intersection, union, jaccard) for each isoseqid.
        print("get jaccard A info ...")
        self.jaccard_infoA = self.get_jaccard_info("A")
        print("get jaccard B info ...")
        self.jaccard_infoB = self.get_jaccard_info("B")

        # End info for each isoseqid.
        print("get end A info ...")
        self.end_infoA = self.get_end_info("A")
        print("get end B info ...")
        self.end_infoB = self.get_end_info("B")
Esempio n. 2
0
 def __init__(self, sample):
     """Load the salmon tables and exon maps for one sample.

     Args:
         sample: sample name made of exactly four ``_``-separated parts,
             ``<species>_<sex>_<tissue>_<replicate>``; any other number of
             parts raises ``ValueError`` when unpacking.
     """
     self.sample = sample
     name_parts = self.sample.split("_")
     self.species, self.sex, self.tissue, self.replicate = name_parts

     # Exon maps at transcript level, one per annotation version.
     self.v3exonmap = self.get_exonmap("v3")
     self.fbexonmap = self.get_exonmap("fb")

     # The same file name is looked up under both annotation directories.
     salmon_file = sample + ".salmon.txt"
     self.v3salmonlines = get_lines("../data/salmon/FB2017_03_v3/",
                                    salmon_file)
     self.fbsalmonlines = get_lines("../data/salmon/FB2017_03/",
                                    salmon_file)

     # Derived lookups; order kept because later steps may use earlier ones.
     self.fbsalmon = self.get_fbsalmon()
     self.DxxxTID2YOtrID = self.get_DxxxTID2YOtrID()
     self.DxxxTID2FBtrID = self.get_DxxxTID2FBtrID()
     self.DxxxTID2jaccard = self.get_DxxxTID2jaccard()
Esempio n. 3
0
    def __init__(self, species):
        """Load the expression tables of ``species`` for both annotations.

        Reads ``<species>.expression.nrc.tab`` and
        ``<species>.expression.onrc.tab`` from ``../data/expression`` and
        parses each with ``get_geneid2expression``.

        Args:
            species: species prefix of the expression file names.
        """
        expression_dir = "../data/expression"
        self.species = species

        # v3 updated annotation
        self.filename = species + ".expression.nrc.tab"
        self.lines = get_lines(expression_dir, self.filename)
        # NOTE(review): the argument presumably selects the new (1) vs old (0)
        # table inside get_geneid2expression - confirm against that method.
        new_header, new_table = self.get_geneid2expression(1)
        self.expressionheader = new_header
        self.geneid2expression = new_table

        # FB2017_03 old annotation
        self.oldfilename = species + ".expression.onrc.tab"
        self.oldlines = get_lines(expression_dir, self.oldfilename)
        old_header, old_table = self.get_geneid2expression(0)
        self.oldexpressionheader = old_header
        self.oldgeneid2expression = old_table
Esempio n. 4
0
 def __init__(self, sample):
     """Load the htseq count tables and exon maps for one sample.

     Args:
         sample: sample name made of exactly four ``_``-separated parts,
             ``<species>_<sex>_<tissue>_<replicate>``; any other number of
             parts raises ``ValueError`` when unpacking.
     """
     self.sample = sample
     name_parts = self.sample.split("_")
     self.species, self.sex, self.tissue, self.replicate = name_parts

     # Exon maps, one per annotation version.
     self.v3exonmap = self.get_exonmap("v3")
     self.fbexonmap = self.get_exonmap("fb")

     # The same file name is looked up under both annotation directories.
     htseq_file = sample + ".htseq_reverse_HiSAT2.txt"
     self.v3htseqlines = get_lines("../data/htseq/FB2017_03_v3/",
                                   htseq_file)
     self.fbhtseqlines = get_lines("../data/htseq/FB2017_03/",
                                   htseq_file)

     # Derived lookups; order kept because later steps may use earlier ones.
     self.fbhtseq = self.get_fbhtseq()
     self.DxxxGID2YOgnID = self.get_DxxxGID2YOgnID()
     self.DxxxGID2FBgnID = self.get_DxxxGID2FBgnID()
     self.DxxxGID2jaccard = self.get_DxxxGID2jaccard()

     # Final step: runs after every lookup above has been populated.
     self.update_v3htseq_lines()
Esempio n. 5
0
 def __init__(self, species, sex, tissue):
     """Load the merged event file of one species/sex/tissue combination.

     Reads ``<species>_<sex>_<tissue>.merged.events`` from ``../data/event``
     and parses it with ``get_eventinfo``.

     Args:
         species: species component of the file name.
         sex: sex component of the file name.
         tissue: tissue component of the file name.
     """
     self.species = species
     self.sex = sex
     self.tissue = tissue

     prefix = "_".join((self.species, sex, tissue))
     self.filename = prefix + ".merged.events"
     self.lines = get_lines("../data/event", self.filename)
     self.eventinfo = self.get_eventinfo()
Esempio n. 6
0
 def __init__(self, species, sex, tissue):
     """Load the junction BED file of one species/sex/tissue combination.

     Reads ``<species>_<sex>_<tissue>.sorted.junc.bed`` from
     ``../data/junction`` and parses it with ``get_juncinfo``.

     Args:
         species: species component of the file name.
         sex: sex component of the file name.
         tissue: tissue component of the file name.
     """
     self.species = species
     self.sex = sex
     self.tissue = tissue

     prefix = "_".join((self.species, sex, tissue))
     # An earlier version read the spanki output "<prefix>.merged.juncs".
     self.filename = prefix + ".sorted.junc.bed"
     self.lines = get_lines("../data/junction", self.filename)
     self.juncinfo = self.get_juncinfo()
Esempio n. 7
0
 def get_geneidtransid2strand(self):
     """Map ``"<gene_id>|<transcript_id>"`` to the strand given in the v3 GTF.

     Reads ``../data/annotation/<species>.v3.gtf`` through ``get_lines``.
     Blank lines and ``#`` comment/header lines are skipped; every other
     line must consist of the nine tab-separated GTF columns.

     Returns:
         dict: key is ``gene_id + "|" + transcript_id`` (both extracted from
         the attribute column with ``get_id``), value is the strand column.
         When a key occurs on several lines the last line wins.

     Raises:
         ValueError: if a data line does not have exactly nine columns.
     """
     dct = dict()
     for line in get_lines("../data/annotation/", self.species + ".v3.gtf"):
         record = line.rstrip()
         # GTF files may carry "#" header lines; they hold no feature data.
         if not record or record.startswith("#"):
             continue
         fields = record.split("\t")
         if len(fields) != 9:
             raise ValueError(
                 "expected 9 tab-separated GTF columns, got %d: %r" %
                 (len(fields), record))
         # Column 7 is the strand, column 9 the attribute string.
         strand = fields[6]
         attributes = fields[8]
         this_geneid = get_id(attributes, 'gene_id')
         this_transid = get_id(attributes, 'transcript_id')
         dct[this_geneid + "|" + this_transid] = strand
     return dct
Esempio n. 8
0
 def __init__(self, species, sex, tissue, replicate):
     """Load the strand map and tracking file of one PacBio sample.

     The sample name is ``<species>_<sex>_<tissue>_<replicate>``; the
     tracking file read is
     ``../data/pacbio/jaccard_zero.<sample>.exon.gtf.tracking``.

     Args:
         species: species component of the sample name.
         sex: sex component of the sample name.
         tissue: tissue component of the sample name.
         replicate: replicate component of the sample name.
     """
     self.species = species
     self.sex = sex
     self.tissue = tissue
     self.replicate = replicate
     self.sample = "_".join((species, sex, tissue, replicate))

     self.geneidtransid2strand = self.get_geneidtransid2strand()

     tracking_file = "jaccard_zero." + self.sample + ".exon.gtf.tracking"
     self.trackinglines = get_lines("../data/pacbio/", tracking_file)
     self.isoseqid2overlaptype = self.get_isoseqid2overlaptype()
Esempio n. 9
0
 def __init__(self, filename):
     """Open the peptide FASTA, its signalp output and the manual-info table.

     Args:
         filename: base name without extension, looked up under
             ``../data/pacbio``; for example
             ``pacbio_new_gene_model.all_phase_peptide``.
     """
     self.filename = filename

     fasta_name = filename + ".fasta"
     self.peptidefasta = Fasta("../data/pacbio/" + fasta_name)
     print(self.peptidefasta)

     # signalp output sits next to the FASTA with an extra ".signalp" suffix.
     self.signalplines = get_lines("../data/pacbio", fasta_name + ".signalp")

     # This table path is fixed and does not depend on `filename`.
     manual_table = "../data/pacbio/pacbio_new_gene_model.tab"
     self.position2manualinfo = get_position2manualinfo(manual_table)
Esempio n. 10
0
 def get_DxxxGID2FBgnID(self):
     """Build the DxxxGID -> FBgn lookup from ``<species>.ee``.

     Each line of ``../data/ortholog/<species>.ee`` is split on whitespace;
     the first column becomes the key and the second the value. A later
     line overrides an earlier one with the same first column.

     Returns:
         dict: first column mapped to second column.

     Raises:
         IndexError: if a line has fewer than two columns.
     """
     ortholog_lines = get_lines("../data/ortholog/", self.species + ".ee")
     # split() without arguments already ignores trailing whitespace.
     rows = (entry.split() for entry in ortholog_lines)
     return {row[0]: row[1] for row in rows}
Esempio n. 11
0
 def get_isoseqid2position(self):
     """Map every isoseqid to the position part of its BED name field.

     Reads ``../data/pacbio/<sample>.bam.bed``. The fourth column must have
     the form ``<position>.<isoseqid>`` with exactly one dot.

     Returns:
         dict: isoseqid mapped to position (both strings).

     Raises:
         ValueError: if a line does not have exactly twelve tab-separated
             columns, or the name field does not split into two parts.
     """
     isoseqid2position = {}
     for record in get_lines("../data/pacbio", self.sample + ".bam.bed"):
         # Twelve unpack targets keep the strict column-count check.
         (_chrom, _chrom_start, _chrom_end, name, _score, _strand,
          _thick_start, _thick_end, _item_rgb, _block_count, _block_sizes,
          _block_starts) = record.rstrip().split("\t")
         position, isoseqid = name.split(".")
         isoseqid2position[isoseqid] = position
     return isoseqid2position
Esempio n. 12
0
 def get_jaccard_plus_isoseqid(self):
     """Collect the isoseqids whose jaccard value is greater than zero.

     Reads ``../data/output/<species>_<sex>_<tissue>_<replicate>.B.txt``.
     Each line must hold five tab-separated columns, the first of the form
     ``<position>.<isoseqid>`` and the last a number.

     Returns:
         set: isoseqids with ``float(jaccard) > 0``.
     """
     sample = "_".join(
         (self.species, self.sex, self.tissue, self.replicate))
     positive_ids = set()
     for record in get_lines("../data/output", sample + ".B.txt"):
         (name, _transid, _intersection, _union,
          jaccard) = record.rstrip().split("\t")
         _position, isoseqid = name.split(".")
         # Keep the "> 0" form: a NaN value must not be added.
         if float(jaccard) > 0:
             positive_ids.add(isoseqid)
     return positive_ids
Esempio n. 13
0
 def get_isoseqid2intronnum(self):
     """Map every isoseqid to its intron count derived from the bam.bed file.

     Reads ``../data/pacbio/<sample>.bam.bed``. For each line the exons are
     obtained from ``focalintersect.get_exons`` and the intron count is the
     number of exons minus one.

     Returns:
         dict: isoseqid mapped to ``len(exons) - 1``.

     Raises:
         ValueError: if a line does not have exactly twelve tab-separated
             columns, the name field is not ``<position>.<isoseqid>``, or
             the start/end columns are not integers.
     """
     isoseqid2intronnum = {}
     for record in get_lines("../data/pacbio", self.sample + ".bam.bed"):
         # Twelve unpack targets keep the strict column-count check.
         (_chrom, chrom_start, chrom_end, name, _score, _strand,
          _thick_start, _thick_end, _item_rgb, _block_count, block_sizes,
          block_starts) = record.rstrip().split("\t")
         _position, isoseqid = name.split(".")
         exons = focalintersect.get_exons(int(chrom_start), int(chrom_end),
                                          block_sizes, block_starts)
         isoseqid2intronnum[isoseqid] = len(exons) - 1
     return isoseqid2intronnum
Esempio n. 14
0
    def __init__(self, species):
        """Load the v3 and old annotations of ``species`` plus ortholog tables.

        All files are read with ``get_lines`` from ``../data/annotation`` and
        ``../data/ortholog``. The steps run in a fixed order because the
        parsing methods read the ``self.*lines`` attributes set just before
        they are called.

        Args:
            species: species prefix used for every file name; when it equals
                ``"dmel"`` the FBgn-to-symbol table is loaded as well.
        """
        self.species = species
        # self.filename = species + ".SVGpredAdded.v2.gtf"
        self.filename = species + ".v3.gtf"
        self.lines = get_lines("../data/annotation", self.filename)
        self.GdxxxID = self.get_geneid_from_annotation(ver="new")

        # old annotation (to compare)
        self.oldfilename = species + ".gtf"
        self.oldlines = get_lines("../data/annotation", self.oldfilename)
        self.dxxxFBgn = self.get_geneid_from_annotation(ver="old")

        # compare updated annotation with old
        # (positive: v3 vs old gtf; negative: old gtf vs v3)
        self.statspositivelines = get_lines("../data/annotation",
                                            species + ".statspositive")
        self.statsnegativelines = get_lines("../data/annotation",
                                            species + ".statsnegative")
        self.stats = self.get_stats()
        self.tmaplines = get_lines("../data/annotation",
                                   species + ".v3.gtf.tmap")
        self.dxxxFBgn2GdxxxID, self.GdxxxID2dxxxFBgn = self.get_tmap()

        # one-to-one orthologs (GdxxxID -> dxxxFBgn -> dmelFBgn)
        self.filenameolo121 = species + ".concise.one2one"
        self.linesolo121 = get_lines("../data/ortholog", self.filenameolo121)
        (self.GdxxxID2dmelFBgn,
         self.dxxxFBgn2dmelFBgn) = self.get_olo121_info()

        # these dmelFBgn lost their one-to-one status due to the annotation
        # update, so the dxxxFBgn has to be used for them
        self.lost1_dmelFBgn = set(self.dxxxFBgn2dmelFBgn.values()) - set(
            self.GdxxxID2dmelFBgn.values())
        # Invert the mapping once; it does not depend on the loop variable,
        # so rebuilding it per element would only repeat the same work.
        dmelFBgn2dxxxFBgn = switch_key_value_in_dict(self.dxxxFBgn2dmelFBgn)
        self.lost1_dxxxFBgn = {
            dmelFBgn2dxxxFBgn[i] for i in self.lost1_dmelFBgn
        }

        # newly identified one-to-one orthologs
        self.filenameneo121 = species + ".neo121"
        self.linesneo121 = get_lines("../data/ortholog", self.filenameneo121)
        self.add_neo_to_GdxxxID2dmelFBgn()

        if species == "dmel":
            self.dmelFBgn2dmelSymbol = self.get_dmelFBgn2dmelSymbol()