def __init__(self, species, sex, tissue, replicate):
    """Load every input needed to compare one PacBio sample to two annotations.

    The sample name is ``species_sex_tissue_replicate`` and all sample-level
    file names are derived from it; the two annotation BED files are derived
    from ``species`` alone.  Progress messages are written with ``print``.

    Args:
        species: species part of the sample name (string).
        sex: sex part of the sample name (string).
        tissue: tissue part of the sample name (string).
        replicate: replicate part of the sample name (string).

    NOTE(review): the ``self.get_*`` helpers are defined outside this block
    and presumably read the attributes assigned before each call, so the
    statement order below must be kept.
    """
    pacbio_dir = "../data/pacbio"
    annotation_dir = "../data/annotation"

    self.species = species
    self.sex = sex
    self.tissue = tissue
    self.replicate = replicate
    self.name = "_".join((species, sex, tissue, replicate))

    # PacBio BED file of the sample.
    self.filenamePacBioBed = self.name + ".bam.bed"
    print("get lines of " + self.filenamePacBioBed)
    self.linesPacBioBed = get_lines(pacbio_dir, self.filenamePacBioBed)
    print("get isoseqid to exonlen ...")
    (self.isoseqid2exonlen,
     self.isoseqid2line) = get_id2exonlen_id2line(self.linesPacBioBed)

    # Annotation "A": <species>.genePred.bed
    self.filenameAnnotationBedA = self.species + ".genePred.bed"
    print("get lines of " + self.filenameAnnotationBedA)
    self.linesAnnotationBedA = get_lines(annotation_dir,
                                         self.filenameAnnotationBedA)
    print("get gtf's transid to exonlen ...")
    (self.transidA2exonlen,
     self.transidA2line) = get_id2exonlen_id2line(self.linesAnnotationBedA)

    # Annotation "B": <species>.v3.genePred.bed
    self.filenameAnnotationBedB = self.species + ".v3.genePred.bed"
    print("get lines of " + self.filenameAnnotationBedB)
    self.linesAnnotationBedB = get_lines(annotation_dir,
                                         self.filenameAnnotationBedB)
    print("get v3.gtf's transid to exonlen ...")
    (self.transidB2exonlen,
     self.transidB2line) = get_id2exonlen_id2line(self.linesAnnotationBedB)

    # Intersect files of the sample against annotation A and annotation B.
    self.filenameA = self.name + ".bam.gtf.intersect_gtf"
    self.filenameB = self.name + ".bam.gtf.intersect_v3gtf"
    print("get lines of " + self.filenameA)
    self.linesA = get_lines(pacbio_dir, self.filenameA)
    print("get lines of " + self.filenameB)
    self.linesB = get_lines(pacbio_dir, self.filenameB)

    print("get intersection A info ...")
    self.isoseqid2transidA, self.transidA2isoseqid = \
        self.get_intersectinfo("A")
    print("get intersection B info ...")
    self.isoseqid2transidB, self.transidB2isoseqid = \
        self.get_intersectinfo("B")

    self.isoseqid2besttransidA = self.get_isoseqid2besttransid("A")
    self.isoseqid2besttransidB = self.get_isoseqid2besttransid("B")

    # Jaccard info (such as intersection, union, jaccard) for each isoseqid.
    print("get jaccard A info ...")
    self.jaccard_infoA = self.get_jaccard_info("A")
    print("get jaccard B info ...")
    self.jaccard_infoB = self.get_jaccard_info("B")

    # End info for each isoseqid.
    print("get end A info ...")
    self.end_infoA = self.get_end_info("A")
    print("get end B info ...")
    self.end_infoB = self.get_end_info("B")
def __init__(self, sample):
    """Load the salmon outputs of one sample for both annotation versions.

    Args:
        sample: sample name; it must split on ``"_"`` into exactly four
            parts, which become ``species``, ``sex``, ``tissue`` and
            ``replicate`` (otherwise the unpacking raises ``ValueError``).

    NOTE(review): the ``self.get_*`` helpers live outside this block and
    presumably depend on the attributes set before them; keep the order.
    """
    self.sample = sample
    species, sex, tissue, replicate = self.sample.split("_")
    self.species = species
    self.sex = sex
    self.tissue = tissue
    self.replicate = replicate

    # Exon maps, transcript level, for the "v3" and "fb" annotations.
    self.v3exonmap = self.get_exonmap("v3")
    self.fbexonmap = self.get_exonmap("fb")

    # Same file name in both salmon result directories.
    salmon_name = sample + ".salmon.txt"
    self.v3salmonlines = get_lines("../data/salmon/FB2017_03_v3/",
                                   salmon_name)
    self.fbsalmonlines = get_lines("../data/salmon/FB2017_03/", salmon_name)

    self.fbsalmon = self.get_fbsalmon()
    self.DxxxTID2YOtrID = self.get_DxxxTID2YOtrID()
    self.DxxxTID2FBtrID = self.get_DxxxTID2FBtrID()
    self.DxxxTID2jaccard = self.get_DxxxTID2jaccard()
def __init__(self, species):
    """Load the expression tables of ``species`` for both annotations.

    Args:
        species: species prefix of the ``.expression.*.tab`` file names.

    NOTE(review): ``get_geneid2expression`` is defined outside this block;
    the meaning of its ``1`` / ``0`` argument is presumably "updated" versus
    "old" table -- confirm against that method.
    """
    expression_dir = "../data/expression"

    self.species = species

    # v3 updated annotation
    self.filename = species + ".expression.nrc.tab"
    self.lines = get_lines(expression_dir, self.filename)
    (self.expressionheader,
     self.geneid2expression) = self.get_geneid2expression(1)

    # FB2017_03 old annotation
    self.oldfilename = species + ".expression.onrc.tab"
    self.oldlines = get_lines(expression_dir, self.oldfilename)
    (self.oldexpressionheader,
     self.oldgeneid2expression) = self.get_geneid2expression(0)
def __init__(self, sample):
    """Load the htseq outputs of one sample for both annotation versions.

    Args:
        sample: sample name; it must split on ``"_"`` into exactly four
            parts, which become ``species``, ``sex``, ``tissue`` and
            ``replicate`` (otherwise the unpacking raises ``ValueError``).

    NOTE(review): the ``self.get_*`` / ``self.update_*`` helpers live outside
    this block and presumably depend on the attributes set before them; keep
    the order.
    """
    self.sample = sample
    species, sex, tissue, replicate = self.sample.split("_")
    self.species = species
    self.sex = sex
    self.tissue = tissue
    self.replicate = replicate

    # Exon maps for the "v3" and "fb" annotations.
    self.v3exonmap = self.get_exonmap("v3")
    self.fbexonmap = self.get_exonmap("fb")

    # Same file name in both htseq result directories.
    htseq_name = sample + ".htseq_reverse_HiSAT2.txt"
    self.v3htseqlines = get_lines("../data/htseq/FB2017_03_v3/", htseq_name)
    self.fbhtseqlines = get_lines("../data/htseq/FB2017_03/", htseq_name)

    self.fbhtseq = self.get_fbhtseq()
    self.DxxxGID2YOgnID = self.get_DxxxGID2YOgnID()
    self.DxxxGID2FBgnID = self.get_DxxxGID2FBgnID()
    self.DxxxGID2jaccard = self.get_DxxxGID2jaccard()

    # Last step: runs after all the mappings above have been built.
    self.update_v3htseq_lines()
def __init__(self, species, sex, tissue):
    """Load the merged events file of one species/sex/tissue combination.

    Args:
        species: species part of the file name (string).
        sex: sex part of the file name (string).
        tissue: tissue part of the file name (string).
    """
    self.species = species
    self.sex = sex
    self.tissue = tissue

    # File name: <species>_<sex>_<tissue>.merged.events
    prefix = "_".join((self.species, sex, tissue))
    self.filename = prefix + ".merged.events"
    self.lines = get_lines("../data/event", self.filename)

    # get_eventinfo is defined outside this block.
    self.eventinfo = self.get_eventinfo()
def __init__(self, species, sex, tissue):
    """Load the junction BED file of one species/sex/tissue combination.

    Args:
        species: species part of the file name (string).
        sex: sex part of the file name (string).
        tissue: tissue part of the file name (string).
    """
    self.species = species
    self.sex = sex
    self.tissue = tissue

    # File name: <species>_<sex>_<tissue>.sorted.junc.bed
    # (an earlier version used the spanki "<...>.merged.juncs" file instead).
    prefix = "_".join((self.species, sex, tissue))
    self.filename = prefix + ".sorted.junc.bed"
    self.lines = get_lines("../data/junction", self.filename)

    # get_juncinfo is defined outside this block.
    self.juncinfo = self.get_juncinfo()
def get_geneidtransid2strand(self):
    """Map ``"<gene_id>|<transcript_id>"`` to strand for the v3 GTF.

    Reads ``../data/annotation/<species>.v3.gtf`` through ``get_lines``.
    Every line must consist of exactly nine tab-separated columns, otherwise
    the unpacking raises ``ValueError``.  When the same gene/transcript pair
    appears on several lines, the strand of the last one is kept.

    Returns:
        dict: key ``gene_id + "|" + transcript_id`` -> strand column.
    """
    gtf_lines = get_lines("../data/annotation/", self.species + ".v3.gtf")

    strand_by_key = {}
    for gtf_line in gtf_lines:
        # Only the 7th (strand) and 9th (attributes) columns are used.
        (_scaffold, _tag, _feature, _start, _end, _score, strand, _dot,
         attributes) = gtf_line.rstrip().split("\t")
        gene_id = get_id(attributes, 'gene_id')
        transcript_id = get_id(attributes, 'transcript_id')
        strand_by_key[gene_id + "|" + transcript_id] = strand
    return strand_by_key
def __init__(self, species, sex, tissue, replicate):
    """Load the ``jaccard_zero`` tracking file of one sample.

    Args:
        species: species part of the sample name (string).
        sex: sex part of the sample name (string).
        tissue: tissue part of the sample name (string).
        replicate: replicate part of the sample name (string).

    NOTE(review): ``get_isoseqid2overlaptype`` is defined outside this block
    and presumably reads the attributes assigned before it; keep the order.
    """
    self.species = species
    self.sex = sex
    self.tissue = tissue
    self.replicate = replicate
    self.sample = "_".join((species, sex, tissue, replicate))

    self.geneidtransid2strand = self.get_geneidtransid2strand()

    # File name: jaccard_zero.<sample>.exon.gtf.tracking
    tracking_name = "jaccard_zero." + self.sample + ".exon.gtf.tracking"
    self.trackinglines = get_lines("../data/pacbio/", tracking_name)

    self.isoseqid2overlaptype = self.get_isoseqid2overlaptype()
def __init__(self, filename):
    """Load a peptide FASTA, its signalp output and the manual-info table.

    Args:
        filename: base name without extension, for example
            ``pacbio_new_gene_model.all_phase_peptide``.
    """
    self.filename = filename

    # <filename>.fasta is opened with the Fasta class (defined elsewhere);
    # the resulting object is printed.
    fasta_path = "../data/pacbio/" + filename + ".fasta"
    self.peptidefasta = Fasta(fasta_path)
    print(self.peptidefasta)

    # <filename>.fasta.signalp in the same directory.
    self.signalplines = get_lines("../data/pacbio",
                                  filename + ".fasta.signalp")

    # This table path is fixed; it does not depend on ``filename``.
    self.position2manualinfo = get_position2manualinfo(
        "../data/pacbio/pacbio_new_gene_model.tab")
def get_DxxxGID2FBgnID(self):
    """Build the DxxxGID -> FBgn mapping from ``<species>.ee``.

    Each line of ``../data/ortholog/<species>.ee`` is split on whitespace;
    the first field is used as key and the second as value.  A line with
    fewer than two fields raises ``IndexError``; for a repeated key the last
    line wins.

    Returns:
        dict: first field -> second field.
    """
    ortholog_lines = get_lines("../data/ortholog/", self.species + ".ee")
    split_lines = (raw.rstrip().split() for raw in ortholog_lines)
    return {fields[0]: fields[1] for fields in split_lines}
def get_isoseqid2position(self):
    """Map each isoseqid to the position prefix of its BED name field.

    Reads ``../data/pacbio/<sample>.bam.bed``.  Every line must have exactly
    twelve tab-separated columns, and the fourth column must contain exactly
    one ``"."`` separating ``position`` from ``isoseqid``; otherwise a
    ``ValueError`` is raised by the unpacking.

    Returns:
        dict: isoseqid -> position.
    """
    bed_lines = get_lines("../data/pacbio", self.sample + ".bam.bed")

    position_by_isoseqid = {}
    for bed_line in bed_lines:
        # Only the 4th column is used.
        (_chrom, _chrom_start, _chrom_end, name_field, _score, _strand,
         _thick_start, _thick_end, _item_rgb, _block_count, _block_sizes,
         _block_starts) = bed_line.rstrip().split("\t")
        position, isoseqid = name_field.split(".")
        position_by_isoseqid[isoseqid] = position
    return position_by_isoseqid
def get_jaccard_plus_isoseqid(self):
    """Collect the isoseqids whose jaccard value is greater than zero.

    Reads ``../data/output/<species>_<sex>_<tissue>_<replicate>.B.txt``.
    Every line must have exactly five tab-separated columns, and the first
    column must contain exactly one ``"."`` separating ``position`` from
    ``isoseqid``.

    Returns:
        set: isoseqids having at least one line with ``float(jaccard) > 0``.
    """
    table_name = (self.species + "_" + self.sex + "_" + self.tissue + "_" +
                  self.replicate + ".B.txt")
    table_lines = get_lines("../data/output", table_name)

    positive_ids = set()
    for table_line in table_lines:
        (name_field, _transid, _intersection, _union,
         jaccard) = table_line.rstrip().split("\t")
        _position, isoseqid = name_field.split(".")
        if float(jaccard) <= 0:
            continue
        positive_ids.add(isoseqid)
    return positive_ids
def get_isoseqid2intronnum(self):
    """Map each isoseqid to (number of exons - 1) taken from the bam.bed file.

    Reads ``../data/pacbio/<sample>.bam.bed``.  Every line must have exactly
    twelve tab-separated columns, and the fourth column must contain exactly
    one ``"."`` separating ``position`` from ``isoseqid``.  Exons are
    obtained from ``focalintersect.get_exons`` (defined elsewhere) using the
    integer start/end and the raw block size / block start columns.

    Returns:
        dict: isoseqid -> ``len(exons) - 1``.
    """
    bed_lines = get_lines("../data/pacbio", self.sample + ".bam.bed")

    intron_count_by_isoseqid = {}
    for bed_line in bed_lines:
        (_chrom, chrom_start, chrom_end, name_field, _score, _strand,
         _thick_start, _thick_end, _item_rgb, _block_count, block_sizes,
         block_starts) = bed_line.rstrip().split("\t")
        _position, isoseqid = name_field.split(".")
        exon_list = focalintersect.get_exons(int(chrom_start), int(chrom_end),
                                             block_sizes, block_starts)
        intron_count_by_isoseqid[isoseqid] = len(exon_list) - 1
    return intron_count_by_isoseqid
def __init__(self, species):
    """Load the updated and old annotations of ``species`` and ortholog tables.

    Args:
        species: species prefix used to build every input file name; when it
            equals ``"dmel"`` the FBgn -> symbol mapping is loaded as well.

    NOTE(review): the ``self.get_*`` / ``self.add_*`` helpers are defined
    outside this block and presumably read the attributes assigned before
    each call, so the statement order below must be kept.
    """
    annotation_dir = "../data/annotation"
    ortholog_dir = "../data/ortholog"

    self.species = species

    # Updated annotation (an earlier version read
    # species + ".SVGpredAdded.v2.gtf" instead).
    self.filename = species + ".v3.gtf"
    self.lines = get_lines(annotation_dir, self.filename)
    self.GdxxxID = self.get_geneid_from_annotation(ver="new")

    # Old annotation (to compare).
    self.oldfilename = species + ".gtf"
    self.oldlines = get_lines(annotation_dir, self.oldfilename)
    self.dxxxFBgn = self.get_geneid_from_annotation(ver="old")

    # Comparison of the updated annotation with the old one
    # (positive: v3 vs old gtf; negative: old gtf vs v3).
    self.statspositivelines = get_lines(annotation_dir,
                                        species + ".statspositive")
    self.statsnegativelines = get_lines(annotation_dir,
                                        species + ".statsnegative")
    self.stats = self.get_stats()

    self.tmaplines = get_lines(annotation_dir, species + ".v3.gtf.tmap")
    self.dxxxFBgn2GdxxxID, self.GdxxxID2dxxxFBgn = self.get_tmap()

    # One-to-one orthologs (GdxxxID -> dxxxFBgn -> dmelFBgn).
    self.filenameolo121 = species + ".concise.one2one"
    self.linesolo121 = get_lines(ortholog_dir, self.filenameolo121)
    self.GdxxxID2dmelFBgn, self.dxxxFBgn2dmelFBgn = self.get_olo121_info()

    # dmelFBgn values reachable from dxxxFBgn but not from GdxxxID: these
    # lost their one-to-one relation due to the annotation update, so the
    # dxxxFBgn has to be used for them.
    dmel_from_old_ids = set(self.dxxxFBgn2dmelFBgn.values())
    dmel_from_new_ids = set(self.GdxxxID2dmelFBgn.values())
    self.lost1_dmelFBgn = dmel_from_old_ids - dmel_from_new_ids
    # NOTE(review): switch_key_value_in_dict is invoked once per element, as
    # in the original; it could be hoisted if the helper is side-effect free.
    self.lost1_dxxxFBgn = {
        switch_key_value_in_dict(self.dxxxFBgn2dmelFBgn)[dmel_id]
        for dmel_id in self.lost1_dmelFBgn
    }

    # Newly identified one-to-one orthologs.
    self.filenameneo121 = species + ".neo121"
    self.linesneo121 = get_lines(ortholog_dir, self.filenameneo121)
    self.add_neo_to_GdxxxID2dmelFBgn()

    if species == "dmel":
        self.dmelFBgn2dmelSymbol = self.get_dmelFBgn2dmelSymbol()