def _scan_sequence_for_tinyexon_pattern(sfmpat, fname, sequence,
        min_start, max_end):
    """
    Run ScanForMatches with a single pattern on a single DNA sequence

    @type  sfmpat: string
    @param sfmpat: ScanForMatches pattern; written to file `fname`

    @type  fname: string
    @param fname: name of the temporary pattern file; it is removed again
                  before this function returns (also when an error occurs)

    @type  sequence: string
    @param sequence: DNA sequence that is piped into ScanForMatches

    @type  min_start: integer
    @param min_start: only hits with a start coordinate > min_start are kept

    @type  max_end: integer
    @param max_end: only hits with an end coordinate < max_end are kept

    @rtype:  object returned by parseFasta
    @return: the parsed, coordinate-filtered ScanForMatches hits
    """
    with open(fname, 'w') as fh:
        fh.write(sfmpat + "\n")
    try:
        # the tr/sed/awk pipeline rewrites the ScanForMatches output into
        # fasta-like records with a ">myseq:[start,end]" header and keeps
        # only the hits within the requested coordinate window
        command = (
            """echo ">myseq\n%s" | %s %s | tr "[,]" "\t\t#" | """
            """tr -d "\n " | sed "s/>/\\n>/g" | tr "#" "\t" | """
            """awk -F'\t' '{ if (NF==4 && $2>%s && $3<%s) """
            """{ print $1"["$2","$3"]\\n"$4 } }' """
        ) % (sequence, EXECUTABLE_SFM, fname, min_start, max_end)
        co = osPopen(command)
        try:
            return parseFasta(co.readlines())
        finally:
            co.close()
    finally:
        # file cleanup; the hits are already parsed into memory
        osRemove(fname)


def _merge_pacbporfs_by_tinyexon_and_two_introns(pacbporfD, pacbporfA,
        orfSetObject, queryorsbjct, verbose=False, **kwargs):
    """
    Merge 2 PacbPORF objects by a projected tinyexon and two introns

    For each donor site / acceptor site pair whose projected distance on the
    other sequence is MIN_TINYEXON_NT_LENGTH..MAX_TINYEXON_NT_LENGTH-1 nt, a
    ScanForMatches pattern is built from the projected sequence and searched
    for in the genomic sequence of the donor Orf. Hits that lie on an
    elegiable Orf in the proper phase and that have splice sites of high
    enough PSSM score are candidates; the first candidate is converted into
    a tinyexon PacbPORF flanked by two introns.

    @attention: see pacb.connecting.merge_orfs_with_intron for **kwargs)
    @attention: **kwargs is updated with KWARGS_PROJECTED_TINYEXON; keys read
                here: allow_non_canonical_donor, allow_non_canonical_acceptor,
                min_donor_pssm_score, min_acceptor_pssm_score,
                non_canonical_min_donor_pssm_score,
                non_canonical_min_acceptor_pssm_score,
                min_intron_nt_length, max_intron_nt_length

    @type  pacbporfD: PacbPORF object
    @param pacbporfD: PacbPORF object that has to deliver PSSM donor objects

    @type  pacbporfA: PacbPORF object
    @param pacbporfA: PacbPORF object that has to deliver PSSM acceptor objects

    @type  orfSetObject: object with elegiable Orfs
    @param orfSetObject: object with elegiable Orfs

    @type  queryorsbjct: string
    @param queryorsbjct: literal string 'query' or 'sbjct'

    @type  verbose: Boolean
    @param verbose: print debugging info to STDOUT when True

    @rtype:  list
    @return: empty list when no tinyexon is found, otherwise a list with a
             single tuple ( intron1, intron2, tinyexon PacbPORF )

    @raise InproperlyAppliedArgument: queryorsbjct not 'query' or 'sbjct'
    """
    # input validation
    IsPacbPORF(pacbporfD)
    IsPacbPORF(pacbporfA)

    # edit **kwargs dictionary for some forced attributes
    _update_kwargs(kwargs, KWARGS_PROJECTED_TINYEXON)

    MAX_TINYEXON_NT_LENGTH = 33
    MIN_TINYEXON_NT_LENGTH = 6

    tinyexons = []
    if queryorsbjct == "query":
        donorOrf = pacbporfD.orfQ
        accepOrf = pacbporfA.orfQ
        prjctOrf = pacbporfD.orfS
        alignedDonorRange = pacbporfD.alignment_dna_range_query()
        alignedAccepRange = pacbporfA.alignment_dna_range_query()
        dnapositionD = pacbporfD.dnaposition_query
        dnapositionA = pacbporfA.dnaposition_query
    elif queryorsbjct == "sbjct":
        donorOrf = pacbporfD.orfS
        accepOrf = pacbporfA.orfS
        prjctOrf = pacbporfD.orfQ
        alignedDonorRange = pacbporfD.alignment_dna_range_sbjct()
        alignedAccepRange = pacbporfA.alignment_dna_range_sbjct()
        dnapositionD = pacbporfD.dnaposition_sbjct
        dnapositionA = pacbporfA.dnaposition_sbjct
    else:
        message = "'queryorsbjct' (%s), not 'query' or 'sbjct'" % queryorsbjct
        raise InproperlyAppliedArgument(message)

    for dObj in donorOrf._donor_sites:
        # do not make a projection OVER the aligned area
        if dObj.pos < min(alignedDonorRange):
            continue
        (dPos, dPhase) = dnapositionD(dObj.pos, forced_return=True)
        try:
            algDobj = pacbporfD._positions[dPos]
        except IndexError:
            # site out of range of PacbPORF -> break
            break

        for aObj in accepOrf._acceptor_sites:
            # do not make a projection OVER the aligned area
            if aObj.pos > max(alignedAccepRange):
                continue
            (aPos, aPhase) = dnapositionA(aObj.pos, forced_return=True)
            try:
                algAobj = pacbporfA._positions[aPos]
            except IndexError:
                # site out of range of PacbPORF -> break
                break

            # project both splice sites onto the other sequence
            if queryorsbjct == "query":
                posDsbjct = algDobj.sbjct_dna_start + dPhase
                posAsbjct = algAobj.sbjct_dna_start + aPhase
            else:
                posDsbjct = algDobj.query_dna_start + dPhase
                posAsbjct = algAobj.query_dna_start + aPhase
            distance = posAsbjct - posDsbjct
            if distance >= MAX_TINYEXON_NT_LENGTH:
                break
            if distance < MIN_TINYEXON_NT_LENGTH:
                continue

            ####################################################
            # generate a ScanForMatches pattern file
            ####################################################
            # example pattern: 6...6 AG NNGNNANNANNGN[2,0,0] GT 3...3
            query = list(prjctOrf.inputgenomicsequence[posDsbjct:posAsbjct])
            # mask all non-phase0 nucleotides to N residues;
            # this represents the regularexpression for a specific
            # peptide sequence
            # NOTE(review): `3 - dPhase % 3` evaluates as 3 - (dPhase % 3),
            # so for dPhase == 0 the first kept position is 3 and not 0;
            # (3 - dPhase) % 3 may have been intended -- left unchanged
            firstphasepositions = set(range(3 - dPhase % 3, len(query), 3))
            for pos in range(len(query)):
                if pos not in firstphasepositions:
                    query[pos] = "N"
            # calculate a ~50% mismatch number (integer division)
            mismatches = max([0, (len(query) - query.count("N")) // 2])
            # write the pattern to string and subsequently to file
            # example pattern: 6...6 AG NNGNNANNANNGN[2,0,0] GT 3...3
            if kwargs['allow_non_canonical_donor']:
                sfmpat = "%s...%s AG %s[%s,0,0] G (T | C) %s...%s" % (
                    AUSO, AUSO, "".join(query), mismatches, DDSO, DDSO)
            else:
                sfmpat = "%s...%s AG %s[%s,0,0] GT %s...%s" % (
                    AUSO, AUSO, "".join(query), mismatches, DDSO, DDSO)

            ####################################################
            if verbose:
                print("%s %s %s %s" % (
                    (pacbporfD.orfQ.id, pacbporfA.orfQ.id),
                    distance, dObj, aObj))
                print(sfmpat)
            ####################################################

            fname = "sfmpat_tinyexon_%s_%s_%s_%s" % (
                donorOrf.id,
                accepOrf.id,
                posDsbjct,
                posAsbjct,
            )

            ####################################################
            # run ScanForMatches
            ####################################################
            # the coordinate window enforces both flanking introns to be
            # at least (about) min_intron_nt_length long
            matches = _scan_sequence_for_tinyexon_pattern(
                sfmpat, fname, donorOrf.inputgenomicsequence,
                dObj.pos + (kwargs['min_intron_nt_length'] - 3),
                aObj.pos - (kwargs['min_intron_nt_length'] - 3),
            )

            # filter matches for:
            # (1) correct donor & acceptor phase
            # (2) high enough donor & acceptor site scores
            for hdr, seqmatch in matches.items():
                # header looks like 'myseq:[start,end]'
                startQ, stopQ = [
                    int(item) for item in hdr.split(":")[1][1:-1].split(",")]
                # strip the flanking offsets and the AG / GT dinucleotides
                exonQstart = startQ + AUSO + 2 - 1
                exonQstop = stopQ - DDSO - 2

                ####################################
                # get Orf object of tinyexon
                ####################################
                tinyexonorf = None
                # select the Orf on which the tinyexon is located
                for orfObj in orfSetObject.get_elegiable_orfs(
                        max_orf_start=exonQstart, min_orf_end=exonQstop):
                    orfPhase = (exonQstart - orfObj.startPY) % 3
                    if orfPhase == dPhase:
                        tinyexonorf = orfObj
                        break
                else:
                    # No tinyexonorf assigned!! In case a regex matched
                    # over a STOP-codon or the regex length is smaller
                    # then the smallest Orf, no Orf can be assigned
                    continue

                # filter for donor & acceptor score
                dScore = _score_splice_site(
                    seqmatch[-9:], splicetype='donor')
                aScore = _score_splice_site(
                    seqmatch[0:11], splicetype='acceptor')
                if dScore < kwargs['min_donor_pssm_score']:
                    continue
                if aScore < kwargs['min_acceptor_pssm_score']:
                    continue

                # scan Orf for splicesites
                tinyexonorf.scan_orf_for_pssm_splice_sites(
                    splicetype="donor",
                    min_pssm_score=kwargs['min_donor_pssm_score'],
                    allow_non_canonical=kwargs['allow_non_canonical_donor'],
                    non_canonical_min_pssm_score=kwargs[
                        'non_canonical_min_donor_pssm_score'])
                tinyexonorf.scan_orf_for_pssm_splice_sites(
                    splicetype="acceptor",
                    min_pssm_score=kwargs['min_acceptor_pssm_score'],
                    allow_non_canonical=kwargs['allow_non_canonical_acceptor'],
                    non_canonical_min_pssm_score=kwargs[
                        'non_canonical_min_acceptor_pssm_score'])

                # get the acceptor object of the 1th intron
                intron1_aObj = None
                for a in tinyexonorf._acceptor_sites:
                    if a.pos == exonQstart:
                        intron1_aObj = a
                        break
                else:
                    # pseudo-acceptorsite as found by the SFM regex
                    # is not a valid acceptor site of high enough score
                    # continue to next iteration of (hdr,seqmatch) pair
                    continue

                # get the donor object of the 2th intron
                intron2_dObj = None
                for d in tinyexonorf._donor_sites:
                    if d.pos == exonQstop:
                        intron2_dObj = d
                        break
                else:
                    # pseudo-donorsite as found by the SFM regex
                    # is not a valid donor site of high enough score
                    # continue to next iteration of (hdr,seqmatch) pair
                    continue

                # check if introns are of elegiable lengths
                if (intron1_aObj.pos - dObj.pos) > \
                        kwargs['max_intron_nt_length']:
                    continue
                if (aObj.pos - intron2_dObj.pos) > \
                        kwargs['max_intron_nt_length']:
                    continue

                ####################################################
                if verbose:
                    # if here, a candidate!!!
                    print("%s %s %s %s" % (
                        (pacbporfD.orfQ.id, tinyexonorf.id,
                         pacbporfA.orfQ.id),
                        hdr, dScore, aScore))
                    print(seqmatch)
                ####################################################

                # append to found tinyexons
                query_data = (tinyexonorf, exonQstart, exonQstop)
                sbjct_data = (prjctOrf, posDsbjct, posAsbjct)
                splicesite_data = (dObj, intron1_aObj, intron2_dObj, aObj)
                tinyexons.append((query_data, sbjct_data, splicesite_data))

    # return - End Of Function - if no tinyexons are found
    if not tinyexons:
        return []

    ####################################
    # select the **best** tinyexon
    ####################################
    # no ranking is performed: the first candidate found is taken
    (query_data, sbjct_data, splicesite_data) = tinyexons[0]
    orfQ, query_dna_start, query_dna_end = query_data
    orfS, sbjct_dna_start, sbjct_dna_end = sbjct_data
    (intron1_dObj, intron1_aObj, intron2_dObj, intron2_aObj) = splicesite_data
    # NOTE(review): when queryorsbjct == 'sbjct', `orfQ` is the tinyexon Orf
    # on the sbjct sequence and `orfS` the projected Orf of the query; unlike
    # _merge_pacbporfs_by_two_tinyexons no query <-> sbjct swap is done here
    # -- confirm that this is intended

    ####################################################
    if verbose:
        print("tinyexon orf: %s" % (orfQ,))
        print("tinyexon orf: %s" % (intron1_aObj,))
        print("tinyexon orf: %s" % (intron2_dObj,))
    ####################################################

    ####################################
    # make tinyexon PacbPORF
    ####################################
    startQaa = orfQ.dnapos2aapos(query_dna_start) - 1
    startSaa = orfS.dnapos2aapos(sbjct_dna_start) - 1
    stopQaa = orfQ.dnapos2aapos(query_dna_end) + 1
    stopSaa = orfS.dnapos2aapos(sbjct_dna_end) + 1
    # check for directly leading stop codon on tinyexon
    while startQaa <= orfQ.protein_startPY:
        startQaa += 1
        startSaa += 1
        query_dna_start += 3
        sbjct_dna_start += 3
    while startSaa <= orfS.protein_startPY:
        startQaa += 1
        startSaa += 1
        query_dna_start += 3
        sbjct_dna_start += 3
    # check for directly tailing stop codon on tinyexon
    while stopQaa > orfQ.protein_endPY:
        stopQaa -= 1
        stopSaa -= 1
        query_dna_end -= 3
        sbjct_dna_end -= 3
    while stopSaa > orfS.protein_endPY:
        stopQaa -= 1
        stopSaa -= 1
        query_dna_end -= 3
        sbjct_dna_end -= 3
    # get sequences
    qAAseq = orfQ.getaas(abs_pos_start=startQaa, abs_pos_end=stopQaa)
    sAAseq = orfS.getaas(abs_pos_start=startSaa, abs_pos_end=stopSaa)

    ####################################################
    if verbose or len(qAAseq) != len(sAAseq):
        # if unequal lengths, error will be raised upon PacbP.__init__()
        print("%s %s %s %s %s %s" % (
            orfQ, qAAseq, startQaa, stopQaa, (stopQaa - startQaa),
            (query_dna_start, query_dna_end)))
        print("%s %s %s %s %s %s" % (
            orfS, sAAseq, startSaa, stopSaa, (stopSaa - startSaa),
            (sbjct_dna_start, sbjct_dna_end)))
        print(orfQ.inputgenomicsequence[query_dna_start - 2:query_dna_end + 2])
        print(orfS.inputgenomicsequence[sbjct_dna_start - 2:sbjct_dna_end + 2])
    ####################################################

    # initialize extended tinyexon PacbPORF
    from pacb import PacbP
    pacbp = PacbP(input=(qAAseq, sAAseq, startQaa, startSaa))
    pacbp.strip_unmatched_ends()
    pacbporf = pacbp2pacbporf(pacbp, orfQ, orfS)
    pacbporf.extend_pacbporf_after_stops()
    pacbporf.source = 'ABGPprojectingTE'

    ####################################
    # make introns
    ####################################
    intron1 = IntronConnectingOrfs(
        intron1_dObj, intron1_aObj, None,
        donorOrf, pacbporf.orfQ)
    intron2 = IntronConnectingOrfs(
        intron2_dObj, intron2_aObj, None,
        pacbporf.orfQ, accepOrf)

    ################################################################
    # set some meta-data properties to the intron objects
    ################################################################
    # add distance score to intron
    intron1._distance = 0
    intron2._distance = 0

    # add Alignment Positional Periphery Score into objects
    if queryorsbjct == "query":
        set_apps_intron_query(intron1, pacbporfD, pacbporf)
        set_apps_intron_query(intron2, pacbporf, pacbporfA)
    else:
        set_apps_intron_sbjct(intron1, pacbporfD, pacbporf)
        set_apps_intron_sbjct(intron2, pacbporf, pacbporfA)

    # set GFF fsource attribute for recognition of intron sources
    intron1._gff['fsource'] = "ABGPprojectingTE"
    intron2._gff['fsource'] = "ABGPprojectingTE"

    # create _linked_to_xxx attributes
    intron1._linked_to_pacbporfs = [pacbporf]
    intron2._linked_to_pacbporfs = [pacbporf]
    intron1._linked_to_introns = [intron2]
    intron2._linked_to_introns = [intron1]

    ####################################################
    if verbose:
        print(pacbporf)
        pacbporf.print_protein_and_dna()
        print(intron1)
        print(intron2)
    ####################################################

    # return introns and intermediate tinyexon PacbPORF
    return [(intron1, intron2, pacbporf)]
def _merge_pacbporfs_by_two_tinyexons(pacbporfD, pacbporfA,
        orfSetObject, queryorsbjct, verbose=False, **kwargs):
    """
    Merge 2 PacbPORF objects by two projected tinyexons and three introns

    All combinations of two tinyexons between the donor and acceptor Orf are
    obtained from merge_orfs_with_two_tinyexons(). For each donor site /
    acceptor site pair, the combinations whose summed length equals the
    projected distance on the other sequence are translated and compared to
    the projected protein sequence. Sufficiently similar combinations are
    converted into two tinyexon PacbPORFs connected by three introns.

    @attention: **kwargs is updated with KWARGS_PROJECTED_TINYEXON; keys read
                here: max_tinyexon_nt_length, min_tinyexon_nt_length,
                min_tinyexon_intron_nt_length

    @type  pacbporfD: PacbPORF object
    @param pacbporfD: PacbPORF object that has to deliver donor objects

    @type  pacbporfA: PacbPORF object
    @param pacbporfA: PacbPORF object that has to deliver acceptor objects

    @type  orfSetObject: object with an `orfs` attribute
    @param orfSetObject: Orfs that can harbour the tinyexons

    @type  queryorsbjct: string
    @param queryorsbjct: literal string 'query' or 'sbjct'

    @type  verbose: Boolean
    @param verbose: print debugging info to STDOUT when True

    @rtype:  list
    @return: list of tuples ( preceding_intron, intron, subsequent_intron,
             tinypacbporf1, tinypacbporf2 ); empty when nothing is found

    @raise InproperlyAppliedArgument: queryorsbjct not 'query' or 'sbjct'
    """
    # thresholds for accepting the alignment of the two translated
    # tinyexons to the projected protein sequence
    MIN_IDENTITYSCORE = 0.60
    MIN_ALIGNED_RATIO = 0.70

    # edit **kwargs dictionary for some forced attributes
    _update_kwargs(kwargs, KWARGS_PROJECTED_TINYEXON)

    sposD = pacbporfD._get_original_alignment_pos_start()
    eposA = pacbporfA._get_original_alignment_pos_end()
    if queryorsbjct == "query":
        donorOrf = pacbporfD.orfQ
        accepOrf = pacbporfA.orfQ
        prjctOrf = pacbporfD.orfS
        dStart = sposD.query_dna_start
        aEnd = eposA.query_dna_end
        dnapositionD = pacbporfD.dnaposition_query
        dnapositionA = pacbporfA.dnaposition_query
        set_apps_intron = set_apps_intron_query
    elif queryorsbjct == "sbjct":
        donorOrf = pacbporfD.orfS
        accepOrf = pacbporfA.orfS
        prjctOrf = pacbporfD.orfQ
        dStart = sposD.sbjct_dna_start
        aEnd = eposA.sbjct_dna_end
        dnapositionD = pacbporfD.dnaposition_sbjct
        dnapositionA = pacbporfA.dnaposition_sbjct
        set_apps_intron = set_apps_intron_sbjct
    else:
        message = "'queryorsbjct' (%s), not 'query' or 'sbjct'" % queryorsbjct
        raise InproperlyAppliedArgument(message)

    # imported once here instead of in every iteration of the inner loop
    from pacb import PacbP

    # get all potential combinations of two tinyexons
    tinyexoncombis = merge_orfs_with_two_tinyexons(
        donorOrf, accepOrf,
        donorOrf._donor_sites,
        accepOrf._acceptor_sites,
        orfSetObject.orfs,
    )

    results = []

    for dObj in donorOrf._donor_sites:
        (dPos, dPhase) = dnapositionD(dObj.pos, forced_return=True)
        try:
            algDobj = pacbporfD._positions[dPos]
        except IndexError:
            # site out of range of PacbPORF -> break
            break

        # check if dObj is on pfD;
        # introns of tinyexons can be projected outside of pfD/pfA area
        if dObj.pos < dStart:
            continue

        for aObj in accepOrf._acceptor_sites:
            (aPos, aPhase) = dnapositionA(aObj.pos, forced_return=True)
            try:
                algAobj = pacbporfA._positions[aPos]
            except IndexError:
                # site out of range of PacbPORF -> break
                break

            # check if aObj is on pfA;
            # introns of tinyexons can be projected outside of pfD/pfA area
            if aObj.pos > aEnd:
                continue

            # INCLUDE the final AA which is broken by the splicesite
            correctionA = 0
            if aObj.phase != 0:
                correctionA = 1

            # project both splice sites onto the other sequence; algDobj
            # and algAobj are the alignment positions of dObj and aObj
            if queryorsbjct == "query":
                posDsbjct = algDobj.sbjct_dna_start + dPhase
                posAsbjct = algAobj.sbjct_dna_start + aPhase
                start = algDobj.sbjct_pos
                stop = algAobj.sbjct_pos + correctionA
            else:
                posDsbjct = algDobj.query_dna_start + dPhase
                posAsbjct = algAobj.query_dna_start + aPhase
                start = algDobj.query_pos
                stop = algAobj.query_pos + correctionA
            distance = posAsbjct - posDsbjct
            if distance >= (kwargs['max_tinyexon_nt_length'] * 2):
                break
            if distance < (kwargs['min_tinyexon_nt_length'] * 2):
                continue

            filtered_tinyexoncombis = _filter_tinyexoncombis(
                tinyexoncombis,
                min_length=distance,
                max_length=distance,
                min_first_acceptor_pos=(
                    dObj.pos + kwargs['min_tinyexon_intron_nt_length']),
                max_final_donor_pos=(
                    aObj.pos - kwargs['min_tinyexon_intron_nt_length']),
                phase_final_donor=aObj.phase,
                phase_first_acceptor=dObj.phase,
            )

            if not filtered_tinyexoncombis:
                continue

            ####################################################################
            if verbose:
                print("%s %s %s %s %s" % (
                    distance, dObj, aObj, len(tinyexoncombis),
                    len(filtered_tinyexoncombis)))
            ####################################################################

            if stop <= start:
                # tinyexon is so tiny that is does not have a single
                # full aligned AA -> discard here. This holds for every
                # combination of this dObj/aObj pair, so skip them at once
                continue

            # get the prjctOrf sequence for comparison (identical for all
            # combinations of this dObj/aObj pair)
            aaseq = prjctOrf.getaas(abs_pos_start=start, abs_pos_end=stop)

            for exon1, intron, exon2 in filtered_tinyexoncombis:
                # make preceding intron
                preceding_intron = IntronConnectingOrfs(
                    dObj, exon1.acceptor,
                    None, donorOrf, exon1.orf)

                # make subsequent intron
                subsequent_intron = IntronConnectingOrfs(
                    exon2.donor, aObj,
                    None, exon2.orf, accepOrf)

                ################################################################
                if verbose:
                    print("\t%s %s %s %s %s %s %s %s" % (
                        exon1, exon1.proteinsequence(),
                        preceding_intron.phase, exon1.donor.phase,
                        subsequent_intron.phase, preceding_intron.shared_aa,
                        intron.shared_aa, subsequent_intron.shared_aa))
                    print("\t%s %s" % (exon2, exon2.proteinsequence()))
                ################################################################

                # protein sequence of both tinyexons, including the AAs
                # that are shared over the three introns
                seqparts = [
                    preceding_intron.shared_aa,
                    exon1.proteinsequence(),
                    intron.shared_aa,
                    exon2.proteinsequence(),
                    subsequent_intron.shared_aa,
                ]
                tinyexonseq = "".join(seqparts)

                ################################################################
                if verbose or len(tinyexonseq) != len(aaseq):
                    # a length mismatch is always reported, also when not
                    # in verbose mode
                    print(pacbporfD)
                    print("%s %s %s" % (exon1.orf, exon2.orf, prjctOrf))
                    print(pacbporfA)
                    print(seqparts)
                    print("%s %s %s %s" % (
                        aaseq, len(aaseq), len(tinyexonseq), (start, stop)))
                    print("'%s' Q %s S %s" % (
                        queryorsbjct,
                        (algDobj.query_pos, algAobj.query_pos),
                        (algDobj.sbjct_pos, algAobj.sbjct_pos)))
                    print("distance: %s %s %s Q-dna: %s S-dna: %s" % (
                        distance, kwargs['max_tinyexon_nt_length'],
                        (posDsbjct, posAsbjct),
                        (algDobj.query_dna_start, dPhase,
                         algAobj.query_dna_start, aPhase),
                        (algDobj.sbjct_dna_start, dPhase,
                         algAobj.sbjct_dna_start, aPhase)))
                ################################################################

                # ignore by continue when sequences not identical in length
                if len(tinyexonseq) != len(aaseq):
                    continue

                # initialize a PacbP for the combination of both tinyexons
                # and check if it is a convincing alignment
                testpacbp = PacbP(input=(tinyexonseq, aaseq, 0, 0))
                testpacbp.strip_unmatched_ends()

                if not (testpacbp.identityscore > MIN_IDENTITYSCORE and
                        (float(testpacbp.length) / len(aaseq)) >
                        MIN_ALIGNED_RATIO):
                    # not a very convincing alignment
                    continue

                ################################################################
                if verbose:
                    print(testpacbp)
                    testpacbp.print_protein()
                ################################################################

                # if here, succesfully mapped 2 tiny exons!!
                # get all sequences/coordinates in place for
                # pacbporf formation
                orfQ1 = exon1.orf
                orfS1 = prjctOrf
                orfQ2 = exon2.orf
                orfS2 = prjctOrf
                seqQ1 = exon1.proteinsequence()
                seqQ2 = exon2.proteinsequence()
                # integer division: DNA coordinate -> AA coordinate
                coordQ1 = exon1.acceptor.pos // 3
                coordS1 = start
                coordQ2 = exon2.acceptor.pos // 3
                coordS2 = start + len(seqparts[0]) + len(seqparts[1]) + \
                    len(seqparts[2])
                seqS1 = aaseq[0:(len(seqparts[0]) + len(seqparts[1]))]
                # slice from an explicit start offset; a negative slice
                # aaseq[-n:] would return the complete string when n == 0
                tail_length = len(seqparts[3]) + len(seqparts[4])
                seqS2 = aaseq[len(aaseq) - tail_length:]
                # trim the AAs that are shared with the flanking introns
                if len(seqparts[0]):
                    seqS1 = seqS1[1:]
                    coordS1 += 1
                if len(seqparts[4]):
                    seqS2 = seqS2[:-1]

                if queryorsbjct == "sbjct":
                    # swap query <-> sbjct
                    orfQ1, orfS1 = orfS1, orfQ1
                    orfQ2, orfS2 = orfS2, orfQ2
                    seqQ1, seqS1 = seqS1, seqQ1
                    seqQ2, seqS2 = seqS2, seqQ2
                    coordQ1, coordS1 = coordS1, coordQ1
                    coordQ2, coordS2 = coordS2, coordQ2

                ################################################################
                if verbose:
                    print("tinypacbporf1: %s %s %s %s" % (
                        seqQ1, seqQ2, coordQ1, coordQ2))
                    print("tinypacbporf2: %s %s %s %s" % (
                        seqS1, seqS2, coordS1, coordS2))
                ################################################################

                # make pacbporfs
                pacbp1 = PacbP(input=(seqQ1, seqS1, coordQ1, coordS1))
                pacbp1.strip_unmatched_ends()
                tinypacbporf1 = pacbp2pacbporf(pacbp1, orfQ1, orfS1)
                tinypacbporf1.extend_pacbporf_after_stops()
                pacbp2 = PacbP(input=(seqQ2, seqS2, coordQ2, coordS2))
                pacbp2.strip_unmatched_ends()
                tinypacbporf2 = pacbp2pacbporf(pacbp2, orfQ2, orfS2)
                tinypacbporf2.extend_pacbporf_after_stops()

                ################################################################
                if verbose:
                    print(tinypacbporf1)
                    tinypacbporf1.print_protein_and_dna()
                    print(tinypacbporf2)
                    tinypacbporf2.print_protein_and_dna()
                ################################################################

                ################################################################
                # set some meta-data properties to the intron objects
                ################################################################
                introns = (preceding_intron, intron, subsequent_intron)

                # add distance score to intron
                for intronObj in introns:
                    intronObj._distance = 0

                # add Alignment Positional Periphery Score into objects
                set_apps_intron(preceding_intron, pacbporfD, tinypacbporf1)
                set_apps_intron(intron, tinypacbporf1, tinypacbporf2)
                set_apps_intron(subsequent_intron, tinypacbporf2, pacbporfA)

                for intronObj in introns:
                    # set GFF fsource attribute for recognition
                    # of intron sources
                    intronObj._gff['fsource'] = "ABGPprojectingTE"
                    # create _linked_to_pacbporfs attribute
                    intronObj._linked_to_pacbporfs = [
                        tinypacbporf1, tinypacbporf2]

                # create _linked_to_introns attributes
                preceding_intron._linked_to_introns = [
                    intron, subsequent_intron]
                intron._linked_to_introns = [
                    preceding_intron, subsequent_intron]
                subsequent_intron._linked_to_introns = [
                    intron, preceding_intron]

                ################################################################
                # append to results
                ################################################################
                results.append((
                    preceding_intron,
                    intron,
                    subsequent_intron,
                    tinypacbporf1,
                    tinypacbporf2,
                ))

    # return 3 introns and 2 intermediate tinyexon PacbPORFs (per row)
    return results
def _merge_pacbporfs_by_two_tinyexons(pacbporfD, pacbporfA,
        orfSetObject, queryorsbjct, verbose=False, **kwargs):
    """
    Merge 2 PacbPORF objects by two projected tinyexons and three introns

    All combinations of two tinyexons between the donor and acceptor Orf are
    obtained from merge_orfs_with_two_tinyexons(). For each donor site /
    acceptor site pair, the combinations whose summed length equals the
    projected distance on the other sequence are translated and compared to
    the projected protein sequence. Sufficiently similar combinations are
    converted into two tinyexon PacbPORFs connected by three introns.

    @attention: **kwargs is updated with KWARGS_PROJECTED_TINYEXON; keys read
                here: max_tinyexon_nt_length, min_tinyexon_nt_length,
                min_tinyexon_intron_nt_length

    @type  pacbporfD: PacbPORF object
    @param pacbporfD: PacbPORF object that has to deliver donor objects

    @type  pacbporfA: PacbPORF object
    @param pacbporfA: PacbPORF object that has to deliver acceptor objects

    @type  orfSetObject: object with an `orfs` attribute
    @param orfSetObject: Orfs that can harbour the tinyexons

    @type  queryorsbjct: string
    @param queryorsbjct: literal string 'query' or 'sbjct'

    @type  verbose: Boolean
    @param verbose: print debugging info to STDOUT when True

    @rtype:  list
    @return: list of tuples ( preceding_intron, intron, subsequent_intron,
             tinypacbporf1, tinypacbporf2 ); empty when nothing is found

    @raise InproperlyAppliedArgument: queryorsbjct not 'query' or 'sbjct'
    """
    # thresholds for accepting the alignment of the two translated
    # tinyexons to the projected protein sequence
    MIN_IDENTITYSCORE = 0.60
    MIN_ALIGNED_RATIO = 0.70

    # edit **kwargs dictionary for some forced attributes
    _update_kwargs(kwargs, KWARGS_PROJECTED_TINYEXON)

    sposD = pacbporfD._get_original_alignment_pos_start()
    eposA = pacbporfA._get_original_alignment_pos_end()
    if queryorsbjct == "query":
        donorOrf = pacbporfD.orfQ
        accepOrf = pacbporfA.orfQ
        prjctOrf = pacbporfD.orfS
        dStart = sposD.query_dna_start
        aEnd = eposA.query_dna_end
        dnapositionD = pacbporfD.dnaposition_query
        dnapositionA = pacbporfA.dnaposition_query
        set_apps_intron = set_apps_intron_query
    elif queryorsbjct == "sbjct":
        donorOrf = pacbporfD.orfS
        accepOrf = pacbporfA.orfS
        prjctOrf = pacbporfD.orfQ
        dStart = sposD.sbjct_dna_start
        aEnd = eposA.sbjct_dna_end
        dnapositionD = pacbporfD.dnaposition_sbjct
        dnapositionA = pacbporfA.dnaposition_sbjct
        set_apps_intron = set_apps_intron_sbjct
    else:
        message = "'queryorsbjct' (%s), not 'query' or 'sbjct'" % queryorsbjct
        raise InproperlyAppliedArgument(message)

    # imported once here instead of in every iteration of the inner loop
    from pacb import PacbP

    # get all potential combinations of two tinyexons
    tinyexoncombis = merge_orfs_with_two_tinyexons(
        donorOrf, accepOrf,
        donorOrf._donor_sites,
        accepOrf._acceptor_sites,
        orfSetObject.orfs,
    )

    results = []

    for dObj in donorOrf._donor_sites:
        (dPos, dPhase) = dnapositionD(dObj.pos, forced_return=True)
        try:
            algDobj = pacbporfD._positions[dPos]
        except IndexError:
            # site out of range of PacbPORF -> break
            break

        # check if dObj is on pfD;
        # introns of tinyexons can be projected outside of pfD/pfA area
        if dObj.pos < dStart:
            continue

        for aObj in accepOrf._acceptor_sites:
            (aPos, aPhase) = dnapositionA(aObj.pos, forced_return=True)
            try:
                algAobj = pacbporfA._positions[aPos]
            except IndexError:
                # site out of range of PacbPORF -> break
                break

            # check if aObj is on pfA;
            # introns of tinyexons can be projected outside of pfD/pfA area
            if aObj.pos > aEnd:
                continue

            # INCLUDE the final AA which is broken by the splicesite
            correctionA = 0
            if aObj.phase != 0:
                correctionA = 1

            # project both splice sites onto the other sequence; algDobj
            # and algAobj are the alignment positions of dObj and aObj
            if queryorsbjct == "query":
                posDsbjct = algDobj.sbjct_dna_start + dPhase
                posAsbjct = algAobj.sbjct_dna_start + aPhase
                start = algDobj.sbjct_pos
                stop = algAobj.sbjct_pos + correctionA
            else:
                posDsbjct = algDobj.query_dna_start + dPhase
                posAsbjct = algAobj.query_dna_start + aPhase
                start = algDobj.query_pos
                stop = algAobj.query_pos + correctionA
            distance = posAsbjct - posDsbjct
            if distance >= (kwargs['max_tinyexon_nt_length'] * 2):
                break
            if distance < (kwargs['min_tinyexon_nt_length'] * 2):
                continue

            filtered_tinyexoncombis = _filter_tinyexoncombis(
                tinyexoncombis,
                min_length=distance,
                max_length=distance,
                min_first_acceptor_pos=(
                    dObj.pos + kwargs['min_tinyexon_intron_nt_length']),
                max_final_donor_pos=(
                    aObj.pos - kwargs['min_tinyexon_intron_nt_length']),
                phase_final_donor=aObj.phase,
                phase_first_acceptor=dObj.phase,
            )

            if not filtered_tinyexoncombis:
                continue

            ####################################################################
            if verbose:
                print("%s %s %s %s %s" % (
                    distance, dObj, aObj, len(tinyexoncombis),
                    len(filtered_tinyexoncombis)))
            ####################################################################

            if stop <= start:
                # tinyexon is so tiny that is does not have a single
                # full aligned AA -> discard here. This holds for every
                # combination of this dObj/aObj pair, so skip them at once
                continue

            # get the prjctOrf sequence for comparison (identical for all
            # combinations of this dObj/aObj pair)
            aaseq = prjctOrf.getaas(abs_pos_start=start, abs_pos_end=stop)

            for exon1, intron, exon2 in filtered_tinyexoncombis:
                # make preceding intron
                preceding_intron = IntronConnectingOrfs(
                    dObj, exon1.acceptor,
                    None, donorOrf, exon1.orf)

                # make subsequent intron
                subsequent_intron = IntronConnectingOrfs(
                    exon2.donor, aObj,
                    None, exon2.orf, accepOrf)

                ################################################################
                if verbose:
                    print("\t%s %s %s %s %s %s %s %s" % (
                        exon1, exon1.proteinsequence(),
                        preceding_intron.phase, exon1.donor.phase,
                        subsequent_intron.phase, preceding_intron.shared_aa,
                        intron.shared_aa, subsequent_intron.shared_aa))
                    print("\t%s %s" % (exon2, exon2.proteinsequence()))
                ################################################################

                # protein sequence of both tinyexons, including the AAs
                # that are shared over the three introns
                seqparts = [
                    preceding_intron.shared_aa,
                    exon1.proteinsequence(),
                    intron.shared_aa,
                    exon2.proteinsequence(),
                    subsequent_intron.shared_aa,
                ]
                tinyexonseq = "".join(seqparts)

                ################################################################
                if verbose or len(tinyexonseq) != len(aaseq):
                    # a length mismatch is always reported, also when not
                    # in verbose mode
                    print(pacbporfD)
                    print("%s %s %s" % (exon1.orf, exon2.orf, prjctOrf))
                    print(pacbporfA)
                    print(seqparts)
                    print("%s %s %s %s" % (
                        aaseq, len(aaseq), len(tinyexonseq), (start, stop)))
                    print("'%s' Q %s S %s" % (
                        queryorsbjct,
                        (algDobj.query_pos, algAobj.query_pos),
                        (algDobj.sbjct_pos, algAobj.sbjct_pos)))
                    print("distance: %s %s %s Q-dna: %s S-dna: %s" % (
                        distance, kwargs['max_tinyexon_nt_length'],
                        (posDsbjct, posAsbjct),
                        (algDobj.query_dna_start, dPhase,
                         algAobj.query_dna_start, aPhase),
                        (algDobj.sbjct_dna_start, dPhase,
                         algAobj.sbjct_dna_start, aPhase)))
                ################################################################

                # ignore by continue when sequences not identical in length
                if len(tinyexonseq) != len(aaseq):
                    continue

                # initialize a PacbP for the combination of both tinyexons
                # and check if it is a convincing alignment
                testpacbp = PacbP(input=(tinyexonseq, aaseq, 0, 0))
                testpacbp.strip_unmatched_ends()

                if not (testpacbp.identityscore > MIN_IDENTITYSCORE and
                        (float(testpacbp.length) / len(aaseq)) >
                        MIN_ALIGNED_RATIO):
                    # not a very convincing alignment
                    continue

                ################################################################
                if verbose:
                    print(testpacbp)
                    testpacbp.print_protein()
                ################################################################

                # if here, succesfully mapped 2 tiny exons!!
                # get all sequences/coordinates in place for
                # pacbporf formation
                orfQ1 = exon1.orf
                orfS1 = prjctOrf
                orfQ2 = exon2.orf
                orfS2 = prjctOrf
                seqQ1 = exon1.proteinsequence()
                seqQ2 = exon2.proteinsequence()
                # integer division: DNA coordinate -> AA coordinate
                coordQ1 = exon1.acceptor.pos // 3
                coordS1 = start
                coordQ2 = exon2.acceptor.pos // 3
                coordS2 = start + len(seqparts[0]) + len(seqparts[1]) + \
                    len(seqparts[2])
                seqS1 = aaseq[0:(len(seqparts[0]) + len(seqparts[1]))]
                # slice from an explicit start offset; a negative slice
                # aaseq[-n:] would return the complete string when n == 0
                tail_length = len(seqparts[3]) + len(seqparts[4])
                seqS2 = aaseq[len(aaseq) - tail_length:]
                # trim the AAs that are shared with the flanking introns
                if len(seqparts[0]):
                    seqS1 = seqS1[1:]
                    coordS1 += 1
                if len(seqparts[4]):
                    seqS2 = seqS2[:-1]

                if queryorsbjct == "sbjct":
                    # swap query <-> sbjct
                    orfQ1, orfS1 = orfS1, orfQ1
                    orfQ2, orfS2 = orfS2, orfQ2
                    seqQ1, seqS1 = seqS1, seqQ1
                    seqQ2, seqS2 = seqS2, seqQ2
                    coordQ1, coordS1 = coordS1, coordQ1
                    coordQ2, coordS2 = coordS2, coordQ2

                ################################################################
                if verbose:
                    print("tinypacbporf1: %s %s %s %s" % (
                        seqQ1, seqQ2, coordQ1, coordQ2))
                    print("tinypacbporf2: %s %s %s %s" % (
                        seqS1, seqS2, coordS1, coordS2))
                ################################################################

                # make pacbporfs
                pacbp1 = PacbP(input=(seqQ1, seqS1, coordQ1, coordS1))
                pacbp1.strip_unmatched_ends()
                tinypacbporf1 = pacbp2pacbporf(pacbp1, orfQ1, orfS1)
                tinypacbporf1.extend_pacbporf_after_stops()
                pacbp2 = PacbP(input=(seqQ2, seqS2, coordQ2, coordS2))
                pacbp2.strip_unmatched_ends()
                tinypacbporf2 = pacbp2pacbporf(pacbp2, orfQ2, orfS2)
                tinypacbporf2.extend_pacbporf_after_stops()

                ################################################################
                if verbose:
                    print(tinypacbporf1)
                    tinypacbporf1.print_protein_and_dna()
                    print(tinypacbporf2)
                    tinypacbporf2.print_protein_and_dna()
                ################################################################

                ################################################################
                # set some meta-data properties to the intron objects
                ################################################################
                introns = (preceding_intron, intron, subsequent_intron)

                # add distance score to intron
                for intronObj in introns:
                    intronObj._distance = 0

                # add Alignment Positional Periphery Score into objects
                set_apps_intron(preceding_intron, pacbporfD, tinypacbporf1)
                set_apps_intron(intron, tinypacbporf1, tinypacbporf2)
                set_apps_intron(subsequent_intron, tinypacbporf2, pacbporfA)

                for intronObj in introns:
                    # set GFF fsource attribute for recognition
                    # of intron sources
                    intronObj._gff['fsource'] = "ABGPprojectingTE"
                    # create _linked_to_pacbporfs attribute
                    intronObj._linked_to_pacbporfs = [
                        tinypacbporf1, tinypacbporf2]

                # create _linked_to_introns attributes
                preceding_intron._linked_to_introns = [
                    intron, subsequent_intron]
                intron._linked_to_introns = [
                    preceding_intron, subsequent_intron]
                subsequent_intron._linked_to_introns = [
                    intron, preceding_intron]

                ################################################################
                # append to results
                ################################################################
                results.append((
                    preceding_intron,
                    intron,
                    subsequent_intron,
                    tinypacbporf1,
                    tinypacbporf2,
                ))

    # return 3 introns and 2 intermediate tinyexon PacbPORFs (per row)
    return results
def _merge_pacbporfs_by_tinyexon_and_two_introns(pacbporfD, pacbporfA, orfSetObject, queryorsbjct, verbose=False, **kwargs): """ Merge 2 PacbPORF objects by introns @attention: see pacb.connecting.merge_orfs_with_intron for **kwargs) @type pacbporfD: PacbPORF object @param pacbporfD: PacbPORF object that has to deliver PSSM donor objects @type pacbporfA: PacbPORF object @param pacbporfA: PacbPORF object that has to deliver PSSM acceptor objects @type orfSetObject: object with elegiable Orfs @param orfSetObject: object with elegiable Orfs @type queryorsbjct: string @param queryorsbjct: literal string 'query' or 'sbjct' @type verbose: Boolean @param verbose: print debugging info to STDOUT when True @rtype: list @return: list with ( intron, ExonOnOrf, intron ) on the query sequence """ # input validation IsPacbPORF(pacbporfD) IsPacbPORF(pacbporfA) # edit **kwargs dictionary for some forced attributes _update_kwargs(kwargs, KWARGS_PROJECTED_TINYEXON) MAX_TINYEXON_NT_LENGTH = 33 MIN_TINYEXON_NT_LENGTH = 6 tinyexons = [] if queryorsbjct == "query": donorOrf = pacbporfD.orfQ accepOrf = pacbporfA.orfQ prjctOrf = pacbporfD.orfS alignedDonorRange = pacbporfD.alignment_dna_range_query() alignedAccepRange = pacbporfA.alignment_dna_range_query() elif queryorsbjct == "sbjct": donorOrf = pacbporfD.orfS accepOrf = pacbporfA.orfS prjctOrf = pacbporfD.orfQ alignedDonorRange = pacbporfD.alignment_dna_range_sbjct() alignedAccepRange = pacbporfA.alignment_dna_range_sbjct() else: message = "'queryorsbjct' (%s), not 'query' or 'sbjct'" % queryorsbjct raise InproperlyAppliedArgument, message for dObj in donorOrf._donor_sites: # do not make a projection OVER the aligned area if dObj.pos < min(alignedDonorRange): continue if queryorsbjct == "query": (dPos, dPhase) = pacbporfD.dnaposition_query(dObj.pos, forced_return=True) else: (dPos, dPhase) = pacbporfD.dnaposition_sbjct(dObj.pos, forced_return=True) try: algDobj = pacbporfD._positions[dPos] except IndexError: # site out of range of 
PacbPORF -> break break for aObj in accepOrf._acceptor_sites: # do not make a projection OVER the aligned area if aObj.pos > max(alignedAccepRange): continue if queryorsbjct == "query": (aPos, aPhase) = pacbporfA.dnaposition_query(aObj.pos, forced_return=True) else: (aPos, aPhase) = pacbporfA.dnaposition_sbjct(aObj.pos, forced_return=True) try: algAobj = pacbporfA._positions[aPos] except IndexError: # site out of range of PacbPORF -> break break if queryorsbjct == "query": posDsbjct = algDobj.sbjct_dna_start + dPhase posAsbjct = algAobj.sbjct_dna_start + aPhase else: posDsbjct = algDobj.query_dna_start + dPhase posAsbjct = algAobj.query_dna_start + aPhase distance = posAsbjct - posDsbjct if distance >= MAX_TINYEXON_NT_LENGTH: break if distance < MIN_TINYEXON_NT_LENGTH: continue #################################################### # generate a ScanForMatches pattern file #################################################### # example pattern: 6...6 AG NNGNNANNANNGN[2,0,0] GT 3...3 query = list(prjctOrf.inputgenomicsequence[posDsbjct:posAsbjct]) # mask all non-phase0 nucleotides to N residues; # this represents the regularexpression for a specific # peptide sequence firstphasepositions = range(3 - dPhase % 3, len(query), 3) for pos in range(0, len(query)): if pos not in firstphasepositions: query[pos] = "N" # calculate a ~50% mismatch number mismatches = max([0, (len(query) - query.count("N")) / 2]) # write the pattern to string and subsequently to file # example pattern: 6...6 AG NNGNNANNANNGN[2,0,0] GT 3...3 if kwargs['allow_non_canonical_donor']: sfmpat = "%s...%s AG %s[%s,0,0] G (T | C) %s...%s" % ( AUSO, AUSO, "".join(query), mismatches, DDSO, DDSO) else: sfmpat = "%s...%s AG %s[%s,0,0] GT %s...%s" % ( AUSO, AUSO, "".join(query), mismatches, DDSO, DDSO) #################################################### if verbose: print(pacbporfD.orfQ.id, pacbporfA.orfQ.id), print distance, dObj, aObj print sfmpat #################################################### fname = 
"sfmpat_tinyexon_%s_%s_%s_%s" % ( donorOrf.id, accepOrf.id, posDsbjct, posAsbjct, ) fh = open(fname, 'w') fh.write(sfmpat + "\n") fh.close() #################################################### # run ScanForMatches #################################################### command = """echo ">myseq\n%s" | %s %s | tr "[,]" "\t\t#" | """ +\ """tr -d "\n " | sed "s/>/\\n>/g" | tr "#" "\t" | """ +\ """awk -F'\t' '{ if (NF==4 && $2>%s && $3<%s) """ +\ """{ print $1"["$2","$3"]\\n"$4 } }' """ command = command % (donorOrf.inputgenomicsequence, EXECUTABLE_SFM, fname, dObj.pos + (kwargs['min_intron_nt_length'] - 3), aObj.pos - (kwargs['min_intron_nt_length'] - 3)) co = osPopen(command) matches = parseFasta(co.readlines()) co.close() # filter matches for: # (1) correct donor & acceptor phase # (2) high enough donor & acceptor site scores for hdr, seqmatch in matches.iteritems(): startQ, stopQ = [ int(item) for item in hdr.split(":")[1][1:-1].split(",") ] exonQstart = startQ + AUSO + 2 - 1 exonQstop = stopQ - DDSO - 2 #################################### # get Orf object of tinyexon #################################### tinyexonorf = None # select the Orf on which the tinyexon is located for orfObj in orfSetObject.get_eligible_orfs( max_orf_start=exonQstart, min_orf_end=exonQstop): orfPhase = (exonQstart - orfObj.startPY) % 3 if orfPhase == dPhase: tinyexonorf = orfObj break else: # No tinyexonorf assigned!! 
Iin case a regex matched # over a STOP-codon or the regex length is smaller # then the smallest Orf, no Orf can be assigned continue # filter for donor & acceptor score dScore = _score_splice_site(seqmatch[-9:], splicetype='donor') aScore = _score_splice_site(seqmatch[0:11], splicetype='acceptor') if dScore < kwargs['min_donor_pssm_score']: continue if aScore < kwargs['min_acceptor_pssm_score']: continue # scan Orf for splicesites tinyexonorf.scan_orf_for_pssm_splice_sites( splicetype="donor", min_pssm_score=kwargs['min_donor_pssm_score'], allow_non_canonical=kwargs['allow_non_canonical_donor'], non_canonical_min_pssm_score=kwargs[ 'non_canonical_min_donor_pssm_score']) tinyexonorf.scan_orf_for_pssm_splice_sites( splicetype="acceptor", min_pssm_score=kwargs['min_acceptor_pssm_score'], allow_non_canonical=kwargs['allow_non_canonical_acceptor'], non_canonical_min_pssm_score=kwargs[ 'non_canonical_min_acceptor_pssm_score']) # get 1th intron donor object intron1_aObj = None for a in tinyexonorf._acceptor_sites: if a.pos == exonQstart: intron1_aObj = a break else: # pseudo-acceptorsite as found be SFM regex # is not a valid acceptor site of high enough score # continue to next iteration of (hdr,seqmatch) pair continue # get 2th intron donor object intron2_dObj = None for d in tinyexonorf._donor_sites: if d.pos == exonQstop: intron2_dObj = d break else: # pseudo-donorsite as found be SFM regex # is not a valid acceptor site of high enough score # continue to next iteration of (hdr,seqmatch) pair continue # check if introns are of elegiable lengths if (intron1_aObj.pos - dObj.pos) > kwargs['max_intron_nt_length']: continue if (aObj.pos - intron2_dObj.pos) > kwargs['max_intron_nt_length']: continue #################################################### if True or verbose: # if here, a candidate!!! 
print(pacbporfD.orfQ.id, tinyexonorf.id, pacbporfA.orfQ.id), print hdr, dScore, aScore print seqmatch #################################################### # append to found tinyexons query_data = (tinyexonorf, exonQstart, exonQstop) sbjct_data = (prjctOrf, posDsbjct, posAsbjct) splicesite_data = (dObj, intron1_aObj, intron2_dObj, aObj) tinyexons.append((query_data, sbjct_data, splicesite_data)) # file cleanup osRemove(fname) # return - End Of Function - if no tinyexons are found if not tinyexons: return [] #################################### # select the **best** tinyexon #################################### (query_data, sbjct_data, splicesite_data) = tinyexons[0] orfQ, query_dna_start, query_dna_end = query_data orfS, sbjct_dna_start, sbjct_dna_end = sbjct_data (intron1_dObj, intron1_aObj, intron2_dObj, intron2_aObj) = splicesite_data #################################################### if verbose: print "tinyexon orf:", orfQ print "tinyexon orf:", intron1_aObj print "tinyexon orf:", intron2_dObj #################################################### #################################### # make tinyexon PacbPORF #################################### startQaa = orfQ.dnapos2aapos(query_dna_start) - 1 startSaa = orfS.dnapos2aapos(sbjct_dna_start) - 1 stopQaa = orfQ.dnapos2aapos(query_dna_end) + 1 stopSaa = orfS.dnapos2aapos(sbjct_dna_end) + 1 # check for directly leading stop codon on tinyexon while startQaa <= orfQ.protein_startPY: startQaa += 1 startSaa += 1 query_dna_start += 3 sbjct_dna_start += 3 while startSaa <= orfS.protein_startPY: startQaa += 1 startSaa += 1 query_dna_start += 3 sbjct_dna_start += 3 # check for directly tailing stop codon on tinyexon while stopQaa > orfQ.protein_endPY: stopQaa -= 1 stopSaa -= 1 query_dna_end -= 3 sbjct_dna_end -= 3 while stopSaa > orfS.protein_endPY: stopQaa -= 1 stopSaa -= 1 query_dna_end -= 3 sbjct_dna_end -= 3 # get sequences qAAseq = orfQ.getaas(abs_pos_start=startQaa, abs_pos_end=stopQaa) sAAseq = 
orfS.getaas(abs_pos_start=startSaa, abs_pos_end=stopSaa) #################################################### if verbose or len(qAAseq) != len(sAAseq): # if unequal lengths, error will be raised upon PacbP.__init__() print orfQ, qAAseq, startQaa, stopQaa, (stopQaa - startQaa), print(query_dna_start, query_dna_end) print orfS, sAAseq, startSaa, stopSaa, (stopSaa - startSaa), print(sbjct_dna_start, sbjct_dna_end) print orfQ.inputgenomicsequence[query_dna_start - 2:query_dna_end + 2] print orfS.inputgenomicsequence[sbjct_dna_start - 2:sbjct_dna_end + 2] #################################################### # initialize extended tinyexon PacbPORF from pacb import PacbP pacbp = PacbP(input=(qAAseq, sAAseq, startQaa, startSaa)) pacbp.strip_unmatched_ends() pacbporf = pacbp2pacbporf(pacbp, orfQ, orfS) pacbporf.extend_pacbporf_after_stops() pacbporf.source = 'ABGPprojectingTE' #################################### # make introns #################################### intron1 = IntronConnectingOrfs(intron1_dObj, intron1_aObj, None, donorOrf, pacbporf.orfQ) intron2 = IntronConnectingOrfs(intron2_dObj, intron2_aObj, None, pacbporf.orfQ, accepOrf) ################################################################ # set some meta-data properties to the intron objects ################################################################ # add distance score to intron intron1._distance = 0 intron2._distance = 0 # add Alignment Positional Periphery Score into objects if queryorsbjct == "query": succes = set_apps_intron_query(intron1, pacbporfD, pacbporf) succes = set_apps_intron_query(intron2, pacbporf, pacbporfA) else: succes = set_apps_intron_sbjct(intron1, pacbporfD, pacbporf) succes = set_apps_intron_sbjct(intron2, pacbporf, pacbporfA) # set GFF fsource attribute for recognition of intron sources intron1._gff['fsource'] = "ABGPprojectingTE" intron2._gff['fsource'] = "ABGPprojectingTE" # create _linked_to_xxx attributes intron1._linked_to_pacbporfs = [pacbporf] 
intron2._linked_to_pacbporfs = [pacbporf] intron1._linked_to_introns = [intron2] intron2._linked_to_introns = [intron1] #################################################### if verbose: print pacbporf pacbporf.print_protein_and_dna() print intron1 print intron2 if False: # printing data when this function needs to be debugged: print "" print intron1 print intron2 print "" print pacbporfD pacbporfD.print_protein_and_dna() print "" print pacbporf pacbporf.print_protein_and_dna() print "" print pacbporfA pacbporfA.print_protein_and_dna() import sys sys.exit() #################################################### # return introns and intermediate tinyexon PacbPORF return [(intron1, intron2, pacbporf)]
def merge_orfs_with_two_tinyexons(preceding_orf, subsequent_orf,
                                  preceding_donor_sites=[],
                                  subsequent_acceptor_sites=[],
                                  orflist=[], **kwargs):
    """
    Bridge two `neighbouring` Orfs by TWO tinyexon by applying
    preceding donors and subsequent acceptors

    @type  preceding_orf: Orf object
    @param preceding_orf: Orf object that contains preceding_donor_site(s)

    @type  subsequent_orf: Orf object
    @param subsequent_orf: Orf object that contains
                           subsequent_acceptor_site(s)

    @type  preceding_donor_sites: list
    @param preceding_donor_sites: list with SpliceDonorGT and/or
                                  SpliceDonor objects

    @type  subsequent_acceptor_sites: list
    @param subsequent_acceptor_sites: list with SpliceAcceptorAG and/or
                                      SpliceAcceptor objects

    @type  orflist: list
    @param orflist: list with Orf objects

    @attention: see get_potential_tiny_exons_on_orf for additional **kwargs
    @attention: the list defaults are only read in here, never mutated

    @rtype:  list
    @return: list of tuples ( tinyexon1, central_intron, tinyexon2 ),
             ordered on the summed length of both tinyexons (shortest
             first); empty list when one of the input lists is empty
    """
    if not preceding_donor_sites:
        return []
    if not subsequent_acceptor_sites:
        return []
    if not orflist:
        return []

    # edit **kwargs dictionary for some forced attributes
    _update_kwargs(kwargs, KWARGS_PROJECTED_TINYEXON)

    tinyexoncollection = []
    tinyexoncombis = []

    # outermost positions in between which a tinyexon Orf must be located;
    # these do not depend on the Orf, so calculate them only once
    min_pos = min([d.pos for d in preceding_donor_sites]) + \
        kwargs['min_tinyexon_intron_nt_length']
    max_pos = max([a.pos for a in subsequent_acceptor_sites]) - \
        kwargs['min_tinyexon_intron_nt_length']

    for orfX in orflist:
        # orf is not correctly positioned towards the splice sites'
        # extremes -> do not check this Orf
        if orfX.endPY <= min_pos:
            continue
        if orfX.startPY >= max_pos:
            continue
        # extend the tinyexoncollection
        tinyexoncollection.extend(
            get_potential_tiny_exons_on_orf(orfX, **kwargs))

    # make tinyexoncollection ordered on start pos
    tinyexoncollection = _order_intron_list(tinyexoncollection,
                                            order_by='donor_pos')
    # donor_pos makes REVERSE ordering; restore this by reversing
    tinyexoncollection.reverse()

    # make 2-elemented tuples of tinyexons which can co-occur together
    for tinyexon1 in tinyexoncollection:
        # walk from the last tinyexon backwards and stop at the first
        # one that has its donor in front of tinyexon1's donor
        for tinyexon2 in reversed(tinyexoncollection):
            if tinyexon2.donor.pos < tinyexon1.donor.pos:
                break
            intron_length = tinyexon2.acceptor.pos - tinyexon1.donor.pos
            if intron_length < kwargs['min_tinyexon_intron_nt_length']:
                continue
            if intron_length > kwargs['max_tinyexon_intron_nt_length']:
                continue
            if tinyexon1.donor.phase != tinyexon2.acceptor.phase:
                continue
            # if here, elegiable combi!
            # NOTE(review): the central intron is created with
            # preceding_orf / subsequent_orf and not with the Orfs of
            # both tinyexons -- confirm that this is intended
            intron = IntronConnectingOrfs(
                tinyexon1.donor, tinyexon2.acceptor,
                get_shared_nucleotides_at_splicesite(
                    subsequent_orf, preceding_orf,
                    tinyexon2.acceptor, tinyexon1.donor),
                preceding_orf, subsequent_orf)
            totlen = tinyexon1.length + tinyexon2.length
            tinyexoncombis.append((totlen, tinyexon1, intron, tinyexon2))

    # return an ordered list based on length; sort on the length only,
    # so combis of equal length keep their order of discovery and the
    # exon/intron objects themselves are never compared
    tinyexoncombis.sort(key=lambda combi: combi[0])
    return [(exon1, intron, exon2)
            for (_totlen, exon1, intron, exon2) in tinyexoncombis]
def _splice_site_is_eligible(site, allow_non_canonical, min_pssm_score,
                             non_canonical_min_pssm_score, min_pos, max_pos):
    """
    Check a splice site against the canonical/score/position thresholds

    @type  site: splice site object (donor or acceptor)
    @param site: object with is_canonical(), pssm_score and pos

    @type  allow_non_canonical: Boolean
    @param allow_non_canonical: accept non-canonical sites too

    @type  min_pssm_score: float
    @param min_pssm_score: minimal pssm score of a canonical site

    @type  non_canonical_min_pssm_score: float
    @param non_canonical_min_pssm_score: minimal pssm score of a
                                         non-canonical site

    @type  min_pos: integer or None
    @param min_pos: lowest allowed site position (0 is a valid limit)

    @type  max_pos: integer or None
    @param max_pos: highest allowed site position (0 is a valid limit)

    @rtype:  Boolean
    @return: True when the site passes all thresholds
    """
    if site.is_canonical():
        if site.pssm_score < min_pssm_score:
            return False
    else:
        if not allow_non_canonical:
            return False
        if site.pssm_score < non_canonical_min_pssm_score:
            return False
    # a position limit of 0 must be applied as well, hence `or x == 0`
    if (min_pos or min_pos == 0) and site.pos < min_pos:
        return False
    if (max_pos or max_pos == 0) and site.pos > max_pos:
        return False
    return True


def merge_orfs_with_intron(
        orfD, orfA,
        max_intron_nt_length=MAX_INTRON_NT_LENGTH,
        min_intron_nt_length=MIN_INTRON_NT_LENGTH,
        min_donor_pssm_score=MIN_DONOR_PSSM_SCORE,
        min_acceptor_pssm_score=MIN_ACCEPTOR_PSSM_SCORE,
        allow_non_canonical_donor=ALLOW_NON_CANONICAL_DONOR,
        allow_non_canonical_acceptor=ALLOW_NON_CANONICAL_ACCEPTOR,
        non_canonical_min_donor_pssm_score=NON_CANONICAL_MIN_DONOR_PSSM_SCORE,
        non_canonical_min_acceptor_pssm_score=NON_CANONICAL_MIN_ACCEPTOR_PSSM_SCORE,
        min_donor_pos=None,
        max_donor_pos=None,
        min_acceptor_pos=None,
        max_acceptor_pos=None,
        order_by='length',
        **kwargs):
    """
    Merge 2 Orf objects by introns

    @attention: **kwargs can contain other (here) unnecessarily arguments

    @type  orfD: Orf object
    @param orfD: Orf object that has to deliver a PSSM donor object

    @type  orfA: Orf object
    @param orfA: Orf object that has to deliver a PSSM acceptor object

    @type  max_intron_nt_length: integer
    @param max_intron_nt_length: maximal length (nt) of the intron;
                                 a False-like value disables the check

    @type  min_intron_nt_length: integer
    @param min_intron_nt_length: minimal length (nt) of the intron;
                                 a False-like value disables the check

    @type  min_donor_pssm_score: float
    @param min_donor_pssm_score: minimal pssm score of donor splice site

    @type  min_acceptor_pssm_score: float
    @param min_acceptor_pssm_score: minimal pssm score of acceptor
                                    splice site

    @type  allow_non_canonical_donor: Boolean
    @param allow_non_canonical_donor: search for non-canonical donor
                                      sites too

    @type  allow_non_canonical_acceptor: Boolean
    @param allow_non_canonical_acceptor: search for non-canonical
                                         acceptor splice sites too

    @type  non_canonical_min_donor_pssm_score: float
    @param non_canonical_min_donor_pssm_score: minimal pssm score of
                                               non-canonical donor

    @type  non_canonical_min_acceptor_pssm_score: float
    @param non_canonical_min_acceptor_pssm_score: minimal pssm score of
                                                  non-canonical acceptor

    @type  min_donor_pos: integer or None
    @param min_donor_pos: lowest allowed donor position

    @type  max_donor_pos: integer or None
    @param max_donor_pos: highest allowed donor position

    @type  min_acceptor_pos: integer or None
    @param min_acceptor_pos: lowest allowed acceptor position

    @type  max_acceptor_pos: integer or None
    @param max_acceptor_pos: highest allowed acceptor position

    @type  order_by: string
    @param order_by: ordering key, passed to _order_intron_list()

    @rtype:  list
    @return: list with introns
    """
    # input validation
    IsOrf(orfD)
    IsOrf(orfA)

    # scan for splice sites (if not already done -> is checked in function)
    orfD.scan_orf_for_pssm_splice_sites(
        splicetype="donor",
        min_pssm_score=min_donor_pssm_score,
        allow_non_canonical=allow_non_canonical_donor,
        non_canonical_min_pssm_score=non_canonical_min_donor_pssm_score)
    orfA.scan_orf_for_pssm_splice_sites(
        splicetype="acceptor",
        min_pssm_score=min_acceptor_pssm_score,
        allow_non_canonical=allow_non_canonical_acceptor,
        non_canonical_min_pssm_score=non_canonical_min_acceptor_pssm_score)

    # return list with introns
    introns = []

    # most quickest scan possible: are there donors & acceptors?
    if not orfD._donor_sites or not orfA._acceptor_sites:
        # no introns possible because splice sites are missing
        return introns

    # very quick scan: are exons not to far from each other?
    # NOTE(review): only the first donor and first acceptor are compared;
    # this presumably relies on the ordering of both lists -- confirm
    if max_intron_nt_length and \
            (orfA._acceptor_sites[0].pos - orfD._donor_sites[0].pos) > \
            max_intron_nt_length:
        # no introns possible that can bridge this gap
        return introns

    # acceptor eligibility does not depend on the donor: filter once
    # in stead of once for every donor site
    acceptors = [
        acceptor for acceptor in orfA._acceptor_sites
        if _splice_site_is_eligible(
            acceptor, allow_non_canonical_acceptor,
            min_acceptor_pssm_score, non_canonical_min_acceptor_pssm_score,
            min_acceptor_pos, max_acceptor_pos)
    ]

    for donor in orfD._donor_sites:
        if not _splice_site_is_eligible(
                donor, allow_non_canonical_donor,
                min_donor_pssm_score, non_canonical_min_donor_pssm_score,
                min_donor_pos, max_donor_pos):
            continue

        for acceptor in acceptors:
            intron_length = acceptor.pos - donor.pos

            # check phase compatibilty (1) of splice sites
            if donor.phase != acceptor.phase:
                continue
            # check phase compatibilty (2) of splice sites:
            # intron phase must bridge the frames of both Orfs
            if (intron_length % 3 + orfD.frame) % 3 != orfA.frame % 3:
                continue
            # check if intron length is in between the boundaries
            if max_intron_nt_length and intron_length > max_intron_nt_length:
                continue
            if min_intron_nt_length and intron_length < min_intron_nt_length:
                continue

            # okay, if we reach this point, we have a valid intron
            shared_nts = get_shared_nucleotides_at_splicesite(
                orfA, orfD, acceptor, donor)

            # make a IntronConnectingOrfs object
            introns.append(
                IntronConnectingOrfs(donor, acceptor, shared_nts, orfD, orfA))

    # return ordered intron list
    return _order_intron_list(introns, order_by=order_by)
def merge_orfs_with_tinyexon(
        preceding_orf, subsequent_orf,
        preceding_donor_sites=[],
        subsequent_acceptor_sites=[],
        orflist=[],
        max_tinyexon_nt_length=TINYEXON_MAX_NT_LENGTH,
        min_tinyexon_nt_length=TINYEXON_MIN_NT_LENGTH,
        max_tinyexon_intron_nt_length=TINYEXON_MAX_INTRON_NT_LENGTH,
        min_tinyexon_intron_nt_length=TINYEXON_MIN_INTRON_NT_LENGTH,
        min_donor_pssm_score=TINYEXON_MIN_DONOR_PSSM_SCORE,
        min_acceptor_pssm_score=TINYEXON_MIN_ACCEPTOR_PSSM_SCORE,
        min_total_pssm_score=TINYEXON_MIN_TOTAL_PSSM_SCORE,
        **kwargs):
    """
    Bridge two `neighbouring` Orfs by a single tinyexon

    Every Orf in orflist is tried as carrier of a tinyexon in between
    each (preceding donor, subsequent acceptor) pair. Each tinyexon that
    find_tiny_exon_on_orf() reports is returned with its two introns.

    @type  preceding_orf: Orf object
    @param preceding_orf: Orf object that contains preceding_donor_site(s)

    @type  subsequent_orf: Orf object
    @param subsequent_orf: Orf object that contains
                           subsequent_acceptor_site(s)

    @type  preceding_donor_sites: list
    @param preceding_donor_sites: list with SpliceDonorGT and/or
                                  SpliceDonor objects

    @type  subsequent_acceptor_sites: list
    @param subsequent_acceptor_sites: list with SpliceAcceptorAG and/or
                                      SpliceAcceptor objects

    @type  orflist: list
    @param orflist: list with Orf objects

    @type  max_tinyexon_nt_length: integer
    @param max_tinyexon_nt_length: positive integer, largest length of
                                   tinyexon in nt

    @type  min_tinyexon_nt_length: integer
    @param min_tinyexon_nt_length: positive integer, smallest length of
                                   tinyexon in nt

    @type  max_tinyexon_intron_nt_length: integer
    @param max_tinyexon_intron_nt_length: positive integer, largest length
                                          of intron around tinyexon in nt

    @type  min_tinyexon_intron_nt_length: integer
    @param min_tinyexon_intron_nt_length: positive integer, smallest length
                                          of intron around tinyexon in nt

    @type  min_total_pssm_score: float or None
    @param min_total_pssm_score: minimal sum of donor - acceptor pssm
                                 score pair of tinyexon

    @type  min_donor_pssm_score: float or None
    @param min_donor_pssm_score: minimal donor pssm score of tinyexon

    @type  min_acceptor_pssm_score: float or None
    @param min_acceptor_pssm_score: minimal acceptor pssm score of tinyexon

    @rtype:  list
    @return: list of tuples ( preceding_intron, tinyexon,
             subsequent_intron ); empty when an input list is empty

    @attention: Global vars that have to be set upon usage:
        MIN_DONOR_PSSM_SCORE
        MIN_ACCEPTOR_PSSM_SCORE
        # and all TINYEXON variable named
        TINYEXON_MAX_NT_LENGTH
        TINYEXON_MIN_NT_LENGTH
        TINYEXON_MAX_INTRON_NT_LENGTH
        TINYEXON_MIN_INTRON_NT_LENGTH
        TINYEXON_MIN_PSSM_SCORE
        TINYEXON_MIN_DONOR_PSSM_SCORE
        TINYEXON_MIN_ACCEPTOR_PSSM_SCORE
        TINYEXON_ALLOW_NON_CANONICAL_DONOR
        TINYEXON_ALLOW_NON_CANONICAL_ACCEPTOR
        TINYEXON_NON_CANONICAL_MIN_PSSM_SCORE
        TINYEXON_NON_CANONICAL_MIN_DONOR_PSSM_SCORE
        TINYEXON_NON_CANONICAL_MIN_ACCEPTOR_PSSM_SCORE
    """
    # without donors, acceptors and candidate Orfs nothing can be bridged
    if not (preceding_donor_sites and subsequent_acceptor_sites and orflist):
        return []

    # splice sites' extremes; an Orf must reach into this interval
    lowest_donor_pos = min(site.pos for site in preceding_donor_sites)
    highest_acceptor_pos = max(site.pos for site in subsequent_acceptor_sites)

    # list with (intron,tinyexon,intron) tuples that is returned
    bridges = []

    for candidate_orf in orflist:
        # skip Orfs that are not positioned in between the extremes
        if (candidate_orf.endPY <= lowest_donor_pos or
                candidate_orf.startPY >= highest_acceptor_pos):
            continue

        for donor in preceding_donor_sites:
            # Orf must end behind the donor, and the donor must score
            # high enough.
            # TODO: this is in fact the donor on the normal, large orf
            # TODO: do we want to check this pssm score?
            if (candidate_orf.endPY <= donor.pos or
                    donor.pssm_score < min_donor_pssm_score):
                continue

            for acceptor in subsequent_acceptor_sites:
                # Orf must start in front of the acceptor, and the
                # acceptor must score high enough.
                # TODO: this is in fact the acceptor on the normal, large orf
                # TODO: do we want to check this pssm score?
                if (candidate_orf.startPY >= acceptor.pos or
                        acceptor.pssm_score < min_acceptor_pssm_score):
                    continue

                # okay, now try to bridge it!
                search_options = dict(
                    order_by='total_pssm',
                    max_tinyexon_nt_length=max_tinyexon_nt_length,
                    min_tinyexon_nt_length=min_tinyexon_nt_length,
                    max_tinyexon_intron_nt_length=max_tinyexon_intron_nt_length,
                    min_tinyexon_intron_nt_length=min_tinyexon_intron_nt_length,
                    min_donor_pssm_score=min_donor_pssm_score,
                    min_acceptor_pssm_score=min_acceptor_pssm_score,
                    min_total_pssm_score=min_total_pssm_score,
                    preceding_donor=donor,
                    subsequent_acceptor=acceptor,
                )
                for tinyexon in find_tiny_exon_on_orf(candidate_orf,
                                                      **search_options):
                    # intron from the preceding Orf into the tinyexon
                    upstream_shared_nts = \
                        get_shared_nucleotides_at_splicesite(
                            tinyexon.orf, preceding_orf,
                            tinyexon.acceptor, donor)
                    upstream_intron = IntronConnectingOrfs(
                        donor, tinyexon.acceptor, upstream_shared_nts,
                        preceding_orf, tinyexon.orf)

                    # intron from the tinyexon into the subsequent Orf
                    downstream_shared_nts = \
                        get_shared_nucleotides_at_splicesite(
                            subsequent_orf, tinyexon.orf,
                            acceptor, tinyexon.donor)
                    downstream_intron = IntronConnectingOrfs(
                        tinyexon.donor, acceptor, downstream_shared_nts,
                        tinyexon.orf, subsequent_orf)

                    bridges.append(
                        (upstream_intron, tinyexon, downstream_intron))

    # and return the list of intron/exon/intron
    return bridges