def merge_pacbporfs_with_query_intron_bridgeing(pacbporfD, pacbporfA,
        verbose=False, **kwargs):
    """
    Bridge the query Orfs of two PacbPORFs by introns predicted on the query

    Candidate introns between pacbporfD.orfQ and pacbporfA.orfQ are obtained
    from merge_orfs_with_intron() and are subsequently filtered by:
      - _filter_introns_on_entropy(), only when both PacbPORFs have an
        identityscore > 0.55
      - _filter_introns_on_pssm_entropy_combination()
      - presence of a branchpoint AND a polypyrimidine tract; when that
        leaves no intron at all, a branchpoint OR a tract is sufficient

    @attention: see orfs.merge_orfs_with_intron for **kwargs

    @type  pacbporfD: PacbPORF object
    @param pacbporfD: PacbPORF object that has to deliver PSSM donor objects

    @type  pacbporfA: PacbPORF object
    @param pacbporfA: PacbPORF object that has to deliver PSSM acceptor objects

    @type  verbose: Boolean
    @param verbose: print status/debugging messages to STDOUT

    @rtype:  list
    @return: list with the intron objects (on the query) that passed all
             filters; can be empty
    """
    # input validation
    IsPacbPORF(pacbporfD)
    IsPacbPORF(pacbporfA)

    # edit **kwargs dictionary for some forced attributes
    _update_kwargs(kwargs, KWARGS_MAPPED_INTRON)
    if 'aligned_site_max_triplet_distance' not in kwargs:
        kwargs['aligned_site_max_triplet_distance'] = kwargs['max_aa_offset']

    # calculate maximal/minimal donor/acceptor site position based on the
    # alignment: at most ELEGIABLE_SPLICE_SITE_AA_RANGE AAs (*3 nt) back from
    # the donor alignment end / forward from the acceptor alignment start
    ELEGIABLE_SPLICE_SITE_AA_RANGE = 75
    max_range_nt = ELEGIABLE_SPLICE_SITE_AA_RANGE * 3
    qdr = pacbporfD.alignment_dna_range_query()
    qar = pacbporfA.alignment_dna_range_query()
    min_donor_query_pos = max([min(qdr), max(qdr) - max_range_nt])
    max_accep_query_pos = min([max(qar), min(qar) + max_range_nt])

    # get list of introns
    intronlist = merge_orfs_with_intron(
            pacbporfD.orfQ, pacbporfA.orfQ,
            min_donor_pos=min_donor_query_pos,
            max_acceptor_pos=max_accep_query_pos,
            **kwargs)

    # filter on entropy, but only for well-conserved PacbPORFs
    if min([pacbporfD.identityscore, pacbporfA.identityscore]) > 0.55:
        intronlist = _filter_introns_on_entropy(
                intronlist, pacbporfD, pacbporfA,
                min_donor_site_entropy=0.01,
                min_acceptor_site_entropy=0.01)
    else:
        # do not filter, but do not forget to store apps data to intron(s)
        for intron in intronlist:
            set_apps_intron_query(intron, pacbporfD, pacbporfA)

    for intron in intronlist:
        # NOTE(review): the original author marked this value with '??'
        intron._distance = 0
        # set GFF fsource attribute for recognition of intron sources
        intron._gff['fsource'] = 'ABGPbridgeing'

    ############################################################################
    if verbose:
        # unique donors & acceptors; only needed for the debugging output
        donor = olba(list(Set([intron.donor for intron in intronlist])),
                order_by='pos')
        accep = olba(list(Set([intron.acceptor for intron in intronlist])),
                order_by='pos')
        print("dQ1 %s aQ1 %s" % (
                [d.pos for d in donor], [a.pos for a in accep]))
    ############################################################################

    intronlist = _filter_introns_on_pssm_entropy_combination(intronlist)

    ############################################################################
    if verbose:
        # same overview, now after the PSSM/entropy combination filter
        donor = olba(list(Set([intron.donor for intron in intronlist])),
                order_by='pos')
        accep = olba(list(Set([intron.acceptor for intron in intronlist])),
                order_by='pos')
        print("dQ2 %s aQ2 %s" % (
                [d.pos for d in donor], [a.pos for a in accep]))
    ############################################################################

    # keep introns that have a branchpoint AND at least one polyppt
    filtered_intron_list = []
    for intron in intronlist:
        intron.assign_bp_and_ppts()
        if intron.branchpoint and (intron.ppt5p or intron.ppt3p):
            filtered_intron_list.append(intron)

    # check if list is emptied due to branchpoint filtering
    # in that case, filter for either branchpoint OR polyppt
    if not filtered_intron_list:
        filtered_intron_list = [
                intron for intron in intronlist
                if intron.branchpoint or intron.ppt5p or intron.ppt3p]

    # return list of filtered introns
    return filtered_intron_list
def merge_pacbporfs_with_query_intron_bridgeing(pacbporfD, pacbporfA,
        verbose=False, **kwargs):
    """
    Bridge the query Orfs of two PacbPORFs by introns predicted on the query

    Candidate introns between pacbporfD.orfQ and pacbporfA.orfQ are obtained
    from merge_orfs_with_intron() and are subsequently filtered by:
      - _filter_introns_on_entropy(), only when both PacbPORFs have an
        identityscore > 0.55
      - _filter_introns_on_pssm_entropy_combination()
      - presence of a branchpoint AND a polypyrimidine tract; when that
        leaves no intron at all, a branchpoint OR a tract is sufficient

    @attention: see orfs.merge_orfs_with_intron for **kwargs

    @type  pacbporfD: PacbPORF object
    @param pacbporfD: PacbPORF object that has to deliver PSSM donor objects

    @type  pacbporfA: PacbPORF object
    @param pacbporfA: PacbPORF object that has to deliver PSSM acceptor objects

    @type  verbose: Boolean
    @param verbose: print status/debugging messages to STDOUT

    @rtype:  list
    @return: list with the intron objects (on the query) that passed all
             filters; can be empty
    """
    # input validation
    IsPacbPORF(pacbporfD)
    IsPacbPORF(pacbporfA)

    # edit **kwargs dictionary for some forced attributes
    _update_kwargs(kwargs, KWARGS_MAPPED_INTRON)
    if 'aligned_site_max_triplet_distance' not in kwargs:
        kwargs['aligned_site_max_triplet_distance'] = kwargs['max_aa_offset']

    # calculate maximal/minimal donor/acceptor site position based on the
    # alignment: at most ELEGIABLE_SPLICE_SITE_AA_RANGE AAs (*3 nt) back from
    # the donor alignment end / forward from the acceptor alignment start
    ELEGIABLE_SPLICE_SITE_AA_RANGE = 75
    max_range_nt = ELEGIABLE_SPLICE_SITE_AA_RANGE * 3
    qdr = pacbporfD.alignment_dna_range_query()
    qar = pacbporfA.alignment_dna_range_query()
    min_donor_query_pos = max([min(qdr), max(qdr) - max_range_nt])
    max_accep_query_pos = min([max(qar), min(qar) + max_range_nt])

    # get list of introns
    intronlist = merge_orfs_with_intron(
            pacbporfD.orfQ, pacbporfA.orfQ,
            min_donor_pos=min_donor_query_pos,
            max_acceptor_pos=max_accep_query_pos,
            **kwargs)

    # filter on entropy, but only for well-conserved PacbPORFs
    if min([pacbporfD.identityscore, pacbporfA.identityscore]) > 0.55:
        intronlist = _filter_introns_on_entropy(
                intronlist, pacbporfD, pacbporfA,
                min_donor_site_entropy=0.01,
                min_acceptor_site_entropy=0.01)
    else:
        # do not filter, but do not forget to store apps data to intron(s)
        for intron in intronlist:
            set_apps_intron_query(intron, pacbporfD, pacbporfA)

    for intron in intronlist:
        # NOTE(review): the original author marked this value with '??'
        intron._distance = 0
        # set GFF fsource attribute for recognition of intron sources
        intron._gff['fsource'] = 'ABGPbridgeing'

    ############################################################################
    if verbose:
        # unique donors & acceptors; only needed for the debugging output
        donor = olba(list(Set([intron.donor for intron in intronlist])),
                order_by='pos')
        accep = olba(list(Set([intron.acceptor for intron in intronlist])),
                order_by='pos')
        print("dQ1 %s aQ1 %s" % (
                [d.pos for d in donor], [a.pos for a in accep]))
    ############################################################################

    intronlist = _filter_introns_on_pssm_entropy_combination(intronlist)

    ############################################################################
    if verbose:
        # same overview, now after the PSSM/entropy combination filter
        donor = olba(list(Set([intron.donor for intron in intronlist])),
                order_by='pos')
        accep = olba(list(Set([intron.acceptor for intron in intronlist])),
                order_by='pos')
        print("dQ2 %s aQ2 %s" % (
                [d.pos for d in donor], [a.pos for a in accep]))
    ############################################################################

    # keep introns that have a branchpoint AND at least one polyppt
    filtered_intron_list = []
    for intron in intronlist:
        intron.assign_bp_and_ppts()
        if intron.branchpoint and (intron.ppt5p or intron.ppt3p):
            filtered_intron_list.append(intron)

    # check if list is emptied due to branchpoint filtering
    # in that case, filter for either branchpoint OR polyppt
    if not filtered_intron_list:
        filtered_intron_list = [
                intron for intron in intronlist
                if intron.branchpoint or intron.ppt5p or intron.ppt3p]

    # return list of filtered introns
    return filtered_intron_list
def _merge_pacbporfs_by_intron(pfD, pfA, queryorsbjct, verbose=False, **kwargs):
    """
    Predict introns in one organism and project them on the other organism

    Introns are predicted between the donor and acceptor Orf of the organism
    selected by `queryorsbjct`. Both PacbPORFs must have an identical Orf in
    the other organism; accepted introns are projected onto that Orf.

    @type  pfD: PacbPORF object
    @param pfD: PacbPORF object that has to deliver (aligned) donor sites

    @type  pfA: PacbPORF object
    @param pfA: PacbPORF object that has to deliver (aligned) acceptor sites

    @type  queryorsbjct: string
    @param queryorsbjct: literal string 'query' or 'sbjct'; the organism in
                         which the introns are predicted

    @type  verbose: Boolean
    @param verbose: print debugging info to STDOUT when True

    @attention: see pacb.connecting.merge_orfs_with_intron for **kwargs)

    @rtype:  list
    @return: list with ProjectedIntronConnectingOrfs objects, as returned by
             _filter_projected_introns()

    @raise InproperlyAppliedArgument: `queryorsbjct` not 'query' or 'sbjct'
    """
    # input validation
    IsPacbPORF(pfD)
    IsPacbPORF(pfA)

    # edit **kwargs dictionary for some forced attributes
    _update_kwargs(kwargs, KWARGS_PROJECTED_INTRON)

    ELEGIABLE_SPLICE_SITE_AA_RANGE = 75
    max_range_nt = ELEGIABLE_SPLICE_SITE_AA_RANGE * 3

    sposD = pfD._get_original_alignment_pos_start()
    eposA = pfA._get_original_alignment_pos_end()

    if queryorsbjct == "query":
        # Orfs of SBJCT must be identical
        IsIdenticalOrfs(pfD.orfS, pfA.orfS)
        donorOrf = pfD.orfQ
        accepOrf = pfA.orfQ
        prjctOrf = pfD.orfS                 # pfD.orfS == pfA.orfS
        dStart = sposD.query_dna_start      # ALIGNED start of donorPacbPORF
        dEnd = pfD.query_dna_end            # ABSOLUTE end of donorPacbPORF
        aStart = pfA.query_dna_start        # ABSOLUTE start of acceptorPacbPORF
        aEnd = eposA.query_dna_end          # ALIGNED end of acceptorPacbPORF
        outOfAlignmentAttribute = "sbjct_dna_start"
        donor_range = pfD.alignment_dna_range_query()
        accep_range = pfA.alignment_dna_range_query()
    elif queryorsbjct == "sbjct":
        # Orfs of QUERY must be identical
        IsIdenticalOrfs(pfD.orfQ, pfA.orfQ)
        donorOrf = pfD.orfS
        accepOrf = pfA.orfS
        prjctOrf = pfD.orfQ                 # pfD.orfQ == pfA.orfQ
        dStart = sposD.sbjct_dna_start      # ALIGNED start of donorPacbPORF
        dEnd = pfD.sbjct_dna_end            # ABSOLUTE end of donorPacbPORF
        aStart = pfA.sbjct_dna_start        # ABSOLUTE start of acceptorPacbPORF
        aEnd = eposA.sbjct_dna_end          # ALIGNED end of acceptorPacbPORF
        outOfAlignmentAttribute = "query_dna_start"
        donor_range = pfD.alignment_dna_range_sbjct()
        accep_range = pfA.alignment_dna_range_sbjct()
    else:
        message = "'queryorsbjct' (%s), not 'query' or 'sbjct'" % queryorsbjct
        raise InproperlyAppliedArgument(message)

    # calculate elegiable splice site range
    min_donor_pos = max([min(donor_range), max(donor_range) - max_range_nt])
    max_accep_pos = min([max(accep_range), min(accep_range) + max_range_nt])

    # predict introns only in `queryorsbjct` Orfs
    # introns is a list of IntronConnectingOrfs objects
    introns = merge_orfs_with_intron(
            donorOrf, accepOrf,
            min_donor_pos=min_donor_pos,
            max_acceptor_pos=max_accep_pos,
            order_by='length', **kwargs)

    # return list with projected introns
    projected_introns = []

    for intron in introns:
        # break if intron is too large; relies on the order_by='length'
        # ordering requested above (presumably ascending -- TODO confirm)
        if kwargs['max_intron_nt_length'] and \
        intron.length > kwargs['max_intron_nt_length']:
            break
        # continue if intron is too small
        if kwargs['min_intron_nt_length'] and \
        intron.length < kwargs['min_intron_nt_length']:
            continue

        # check if intron.start is on pfD;
        # inframe-introns can be projected outside of pfD/pfA area
        if intron.start <= dStart:
            continue
        if intron.start >= dEnd:
            continue

        # check if intron.end is on pfA;
        # inframe-introns can be projected outside of pfD/pfA area
        if intron.end <= aStart:
            continue
        if intron.end >= aEnd:
            continue

        # get positions of donor & acceptor in the PacbPORF alignment and
        # the AA distance between both sites in the OTHER organism
        if queryorsbjct == "sbjct":
            donorPositionPos, phaseD = pfD.dnaposition_sbjct(
                    intron.donor.pos, forced_return=True)
            accepPositionPos, phaseA = pfA.dnaposition_sbjct(
                    intron.acceptor.pos, forced_return=True)
            aaDistance = pfA._positions[accepPositionPos].query_pos - \
                    pfD._positions[donorPositionPos].query_pos
        else:
            donorPositionPos, phaseD = pfD.dnaposition_query(
                    intron.donor.pos, forced_return=True)
            accepPositionPos, phaseA = pfA.dnaposition_query(
                    intron.acceptor.pos, forced_return=True)
            aaDistance = pfA._positions[accepPositionPos].sbjct_pos - \
                    pfD._positions[donorPositionPos].sbjct_pos

        # calculate binary entropy score (on the not yet gap-corrected
        # alignment positions)
        entropyDonorSbjct = pfD.alignment_entropy(
                donorPositionPos, method='donor')
        entropyAcceptorSbjct = pfA.alignment_entropy(
                accepPositionPos, method='acceptor')

        # do distance check upon (projected) intron acceptance
        if abs(aaDistance) <= kwargs['max_aa_offset']:

            # check if we've runned out of the aligned part
            outofalignedpacbporf = False

            # get the projected donor position; walk towards the alignment
            # start until a non-gap position is found
            while pfD._positions[donorPositionPos].isa_gap and \
            donorPositionPos > 0:
                donorPositionPos -= 1
            projected_donor_position = getattr(
                    pfD._positions[donorPositionPos],
                    outOfAlignmentAttribute) + phaseD
            if donorPositionPos == 0 and \
            pfD._positions[donorPositionPos].isa_gap:
                print("WarningThatIsTackled::outofalignedpacbporf::donor")
                outofalignedpacbporf = True

            # get the projected acceptor position; walk towards the
            # alignment end until a non-gap position is found
            while pfA._positions[accepPositionPos].isa_gap and \
            len(pfA._positions) > accepPositionPos + 1:
                accepPositionPos += 1
            projected_accep_position = getattr(
                    pfA._positions[accepPositionPos],
                    outOfAlignmentAttribute) + phaseA
            if accepPositionPos == len(pfA._positions) - 1 and \
            pfA._positions[accepPositionPos].isa_gap:
                print("WarningThatIsTackled::outofalignedpacbporf::acceptor")
                outofalignedpacbporf = True

            if not outofalignedpacbporf:
                ################################################################
                # set some meta-data properties to the intron object
                ################################################################
                # add distance score to intron (in nt)
                intron._distance = abs(aaDistance) * 3

                # add Alignment Positional Periphery Score into objects
                if queryorsbjct == "query":
                    set_apps_intron_query(intron, pfD, pfA)
                else:
                    set_apps_intron_sbjct(intron, pfD, pfA)

                # set GFF fsource attribute for recognition of intron sources
                intron._gff['fsource'] = "ABGPprojecting"

                # make a ProjectedIntronConnectingOrfs object
                pico = ProjectedIntronConnectingOrfs(
                        prjctOrf,
                        projected_donor_position,
                        projected_accep_position)
                intron.binary_entropy_donor = entropyDonorSbjct
                intron.binary_entropy_acceptor = entropyAcceptorSbjct
                pico.add_projected_intron(intron)
                pico.phase = intron.phase
                projected_introns.append(pico)

                ################################################################
                if verbose:
                    print(" ".join([
                        "PROJ::",
                        str(intron._distance),
                        str((pfD.orfQ.id, pfA.orfQ.id)),
                        str((pfD.orfS.id, pfA.orfS.id)),
                        "%s-%snt" % (intron.donor.pos, intron.acceptor.pos),
                        "%2.1f,%2.1f" % (
                            intron.donor.pssm_score,
                            intron.acceptor.pssm_score),
                        "%2.1f,%2.1f" % (
                            intron.binary_entropy_donor,
                            intron.binary_entropy_acceptor),
                        ]))
                ################################################################

        if aaDistance > kwargs['max_aa_offset']:
            # break out; ordered by length can never result in
            # a proper projected intron
            break

    # filter out less relevant ones compared to complete set of results
    projected_introns = _filter_projected_introns(projected_introns)

    # and return a list of ProjectedIntronConnectingOrfs
    return projected_introns
def merge_pacbporfs_with_introns(pacbporfD, pacbporfA, verbose=False, **kwargs):
    """
    Merge 2 PacbPORF objects by introns

    Introns are predicted independently on the query Orfs and on the sbjct
    Orfs of both PacbPORFs. A (query intron, sbjct intron) pair is reported
    when its donor sites and its acceptor sites are alignable, as decided by
    _filter_for_alignable_splice_sites() and _filter_for_entropy().

    @attention: see orfs.merge_orfs_with_intron for **kwargs
    @attention: see functions._filter_for_alignable_splice_sites for **kwargs
    @attention: see functions._filter_for_entropy for **kwargs

    @type  pacbporfD: PacbPORF object
    @param pacbporfD: PacbPORF object that has to deliver PSSM donor objects

    @type  pacbporfA: PacbPORF object
    @param pacbporfA: PacbPORF object that has to deliver PSSM acceptor objects

    @type  verbose: Boolean
    @param verbose: print status/debugging messages to STDOUT

    @rtype:  list
    @return: list with ( intron, intron ), in query and sbjct
    """
    # input validation
    IsPacbPORF(pacbporfD)
    IsPacbPORF(pacbporfA)

    # edit **kwargs dictionary for some forced attributes
    _update_kwargs(kwargs, KWARGS_MAPPED_INTRON)
    if 'aligned_site_max_triplet_distance' not in kwargs:
        kwargs['aligned_site_max_triplet_distance'] = kwargs['max_aa_offset']

    # settings for minimal alignment entropy score
    min_donor_site_alignment_entropy = 0.0
    min_acceptor_site_alignment_entropy = 0.0

    # get list of introns; no min_donor_pos / max_acceptor_pos restriction
    # is passed to merge_orfs_with_intron() in this function
    intronsQ = merge_orfs_with_intron(pacbporfD.orfQ, pacbporfA.orfQ, **kwargs)
    intronsS = merge_orfs_with_intron(pacbporfD.orfS, pacbporfA.orfS, **kwargs)

    # get unique list of donors & acceptors
    donorQ = olba(list(Set([inQ.donor for inQ in intronsQ])), order_by='pos')
    donorS = olba(list(Set([inS.donor for inS in intronsS])), order_by='pos')
    accepQ = olba(list(Set([inQ.acceptor for inQ in intronsQ])), order_by='pos')
    accepS = olba(list(Set([inS.acceptor for inS in intronsS])), order_by='pos')

    ############################################################################
    if verbose:
        print("dQ1 %s aQ1 %s" % (
                [d.pos for d in donorQ], [a.pos for a in accepQ]))
        print("dS1 %s aS1 %s" % (
                [d.pos for d in donorS], [a.pos for a in accepS]))
    ############################################################################

    # filter for alignable donor & acceptor sites; 'allow_non_canonical' is
    # set to the donor- resp. acceptor-specific value just before each call
    kwargs['allow_non_canonical'] = kwargs['allow_non_canonical_donor']
    algdonors = _filter_for_alignable_splice_sites(
            donorQ, donorS, pacbporfD, **kwargs)
    kwargs['allow_non_canonical'] = kwargs['allow_non_canonical_acceptor']
    algacceps = _filter_for_alignable_splice_sites(
            accepQ, accepS, pacbporfA, **kwargs)

    ############################################################################
    if verbose:
        print("dQ2 %s aQ2 %s" % (
                [_dq.pos for (_dq, _ds) in algdonors],
                [_aq.pos for (_aq, _as) in algacceps]))
        print("dS2 %s aS2 %s" % (
                [_ds.pos for (_dq, _ds) in algdonors],
                [_as.pos for (_aq, _as) in algacceps]))
    ############################################################################

    # remove sites with to low alignment entropy
    algdonors = _filter_for_entropy(algdonors, pacbporfD, 'donor',
            min_alignment_entropy=min_donor_site_alignment_entropy)
    algacceps = _filter_for_entropy(algacceps, pacbporfA, 'acceptor',
            min_alignment_entropy=min_acceptor_site_alignment_entropy)

    ############################################################################
    if verbose:
        print("dQ3 %s aQ3 %s" % (
                [_dq.pos for (_dq, _ds) in algdonors],
                [_aq.pos for (_aq, _as) in algacceps]))
        print("dS3 %s aS3 %s" % (
                [_ds.pos for (_dq, _ds) in algdonors],
                [_as.pos for (_aq, _as) in algacceps]))
    ############################################################################

    # make unique position lists for quick lookup in intron lists
    dQpl = Set([dQ.pos for dQ, dS in algdonors])
    dSpl = Set([dS.pos for dQ, dS in algdonors])
    aQpl = Set([aQ.pos for aQ, aS in algacceps])
    aSpl = Set([aS.pos for aQ, aS in algacceps])

    # exterior boundaries of the PacbPORFs: aligned start of the donor
    # PacbPORF and aligned end of the acceptor PacbPORF
    sposD = pacbporfD._get_original_alignment_pos_start()
    eposA = pacbporfA._get_original_alignment_pos_end()

    # Sbjct introns that fall within the PacbPORF aligned area and that use
    # alignable splice sites. This selection does not depend on the query
    # intron, so it is made once instead of once per query intron.
    candidatesS = []
    for intS in intronsS:
        if intS.donor.pos <= sposD.sbjct_dna_start:
            continue
        if intS.acceptor.pos >= eposA.sbjct_dna_end:
            continue
        if intS.donor.pos in dSpl and intS.acceptor.pos in aSpl:
            candidatesS.append(intS)

    # now make list of aligable introns
    algintrons = []
    for intQ in intronsQ:
        # check if intron falls within the PacbPORF aligned area
        if intQ.donor.pos <= sposD.query_dna_start:
            continue
        if intQ.acceptor.pos >= eposA.query_dna_end:
            continue
        # check if Query intron occurs in list of alignable splice sites
        if not (intQ.donor.pos in dQpl and intQ.acceptor.pos in aQpl):
            continue
        for intS in candidatesS:
            if (intQ.donor, intS.donor) in algdonors and \
            (intQ.acceptor, intS.acceptor) in algacceps:
                # Sbjct & Query Donor & Acceptor are alignable!
                algintrons.append((intQ, intS))

    ############################################################################
    # set some meta-data properties to the intron objects
    ############################################################################
    for intQ, intS in algintrons:
        distDnt = pacbporfD.get_distance_aligned_nucleotide_positions(
                query=intQ.donor.pos, sbjct=intS.donor.pos)
        distAnt = pacbporfA.get_distance_aligned_nucleotide_positions(
                query=intQ.acceptor.pos, sbjct=intS.acceptor.pos)

        # final distance check. kwargs['aligned_site_max_triplet_distance']
        # is applied on donor and acceptor site. This distance measured on the
        # protein sequence can be DOUBLED in case distDnt / distAnt are
        # opposite (+ and -). Check here if the protein sequence gap is
        # as well <= kwargs['aligned_site_max_triplet_distance'].
        # NOTE(review): a pair failing this check only skips the meta-data
        # assignment below; it is NOT removed from algintrons and is still
        # returned -- confirm whether that is intended.
        if abs(distAnt - distDnt) > \
        kwargs['aligned_site_max_triplet_distance'] * 3:
            continue

        # add distance score to introns
        intQ._distance = abs(distDnt) + abs(distAnt)
        intS._distance = abs(distDnt) + abs(distAnt)

        # add Alignment Positional Periphery Score into objects
        set_apps_intron_query(intQ, pacbporfD, pacbporfA)
        set_apps_intron_sbjct(intS, pacbporfD, pacbporfA)

        # set GFF fsource attribute for recognition of intron sources
        intQ._gff['fsource'] = "ABGPmapping"
        intS._gff['fsource'] = "ABGPmapping"

        ########################################################################
        if verbose:
            # some printing....
            print(" ".join([
                "Aligned introns:",
                str((intQ.donor.pos, intQ.acceptor.pos)),
                str((intS.donor.pos, intS.acceptor.pos)),
                "DIST:",
                str(distDnt),
                str(distAnt),
                "[%s]" % kwargs['aligned_site_max_triplet_distance'],
                "ENTROPY: %1.2f %1.2f" % (intQ._apps_donor, intQ._apps_accep),
                "PSSM: (%1.2f %1.2f) (%1.2f %1.2f)" % (
                    intQ.donor.pssm_score, intS.donor.pssm_score,
                    intQ.acceptor.pssm_score, intS.acceptor.pssm_score,
                    ),
                ]))
        ########################################################################

    # return lists of aligned introns
    return algintrons
def _merge_pacbporfs_by_intron(pfD, pfA, queryorsbjct, verbose=False, **kwargs):
    """
    Predict introns in one organism and project them on the other organism

    Introns are predicted between the donor and acceptor Orf of the organism
    selected by `queryorsbjct`. Both PacbPORFs must have an identical Orf in
    the other organism; accepted introns are projected onto that Orf.

    @type  pfD: PacbPORF object
    @param pfD: PacbPORF object that has to deliver (aligned) donor sites

    @type  pfA: PacbPORF object
    @param pfA: PacbPORF object that has to deliver (aligned) acceptor sites

    @type  queryorsbjct: string
    @param queryorsbjct: literal string 'query' or 'sbjct'; the organism in
                         which the introns are predicted

    @type  verbose: Boolean
    @param verbose: print debugging info to STDOUT when True

    @attention: see pacb.connecting.merge_orfs_with_intron for **kwargs)

    @rtype:  list
    @return: list with ProjectedIntronConnectingOrfs objects, as returned by
             _filter_projected_introns()

    @raise InproperlyAppliedArgument: `queryorsbjct` not 'query' or 'sbjct'
    """
    # input validation
    IsPacbPORF(pfD)
    IsPacbPORF(pfA)

    # edit **kwargs dictionary for some forced attributes
    _update_kwargs(kwargs, KWARGS_PROJECTED_INTRON)

    ELEGIABLE_SPLICE_SITE_AA_RANGE = 75
    max_range_nt = ELEGIABLE_SPLICE_SITE_AA_RANGE * 3

    sposD = pfD._get_original_alignment_pos_start()
    eposA = pfA._get_original_alignment_pos_end()

    if queryorsbjct == "query":
        # Orfs of SBJCT must be identical
        IsIdenticalOrfs(pfD.orfS, pfA.orfS)
        donorOrf = pfD.orfQ
        accepOrf = pfA.orfQ
        prjctOrf = pfD.orfS                 # pfD.orfS == pfA.orfS
        dStart = sposD.query_dna_start      # ALIGNED start of donorPacbPORF
        dEnd = pfD.query_dna_end            # ABSOLUTE end of donorPacbPORF
        aStart = pfA.query_dna_start        # ABSOLUTE start of acceptorPacbPORF
        aEnd = eposA.query_dna_end          # ALIGNED end of acceptorPacbPORF
        outOfAlignmentAttribute = "sbjct_dna_start"
        donor_range = pfD.alignment_dna_range_query()
        accep_range = pfA.alignment_dna_range_query()
    elif queryorsbjct == "sbjct":
        # Orfs of QUERY must be identical
        IsIdenticalOrfs(pfD.orfQ, pfA.orfQ)
        donorOrf = pfD.orfS
        accepOrf = pfA.orfS
        prjctOrf = pfD.orfQ                 # pfD.orfQ == pfA.orfQ
        dStart = sposD.sbjct_dna_start      # ALIGNED start of donorPacbPORF
        dEnd = pfD.sbjct_dna_end            # ABSOLUTE end of donorPacbPORF
        aStart = pfA.sbjct_dna_start        # ABSOLUTE start of acceptorPacbPORF
        aEnd = eposA.sbjct_dna_end          # ALIGNED end of acceptorPacbPORF
        outOfAlignmentAttribute = "query_dna_start"
        donor_range = pfD.alignment_dna_range_sbjct()
        accep_range = pfA.alignment_dna_range_sbjct()
    else:
        message = "'queryorsbjct' (%s), not 'query' or 'sbjct'" % queryorsbjct
        raise InproperlyAppliedArgument(message)

    # calculate elegiable splice site range
    min_donor_pos = max([min(donor_range), max(donor_range) - max_range_nt])
    max_accep_pos = min([max(accep_range), min(accep_range) + max_range_nt])

    # predict introns only in `queryorsbjct` Orfs
    # introns is a list of IntronConnectingOrfs objects
    introns = merge_orfs_with_intron(
            donorOrf, accepOrf,
            min_donor_pos=min_donor_pos,
            max_acceptor_pos=max_accep_pos,
            order_by='length', **kwargs)

    # return list with projected introns
    projected_introns = []

    for intron in introns:
        # break if intron is too large; relies on the order_by='length'
        # ordering requested above (presumably ascending -- TODO confirm)
        if kwargs['max_intron_nt_length'] and \
        intron.length > kwargs['max_intron_nt_length']:
            break
        # continue if intron is too small
        if kwargs['min_intron_nt_length'] and \
        intron.length < kwargs['min_intron_nt_length']:
            continue

        # check if intron.start is on pfD;
        # inframe-introns can be projected outside of pfD/pfA area
        if intron.start <= dStart:
            continue
        if intron.start >= dEnd:
            continue

        # check if intron.end is on pfA;
        # inframe-introns can be projected outside of pfD/pfA area
        if intron.end <= aStart:
            continue
        if intron.end >= aEnd:
            continue

        # get positions of donor & acceptor in the PacbPORF alignment and
        # the AA distance between both sites in the OTHER organism
        if queryorsbjct == "sbjct":
            donorPositionPos, phaseD = pfD.dnaposition_sbjct(
                    intron.donor.pos, forced_return=True)
            accepPositionPos, phaseA = pfA.dnaposition_sbjct(
                    intron.acceptor.pos, forced_return=True)
            aaDistance = pfA._positions[accepPositionPos].query_pos - \
                    pfD._positions[donorPositionPos].query_pos
        else:
            donorPositionPos, phaseD = pfD.dnaposition_query(
                    intron.donor.pos, forced_return=True)
            accepPositionPos, phaseA = pfA.dnaposition_query(
                    intron.acceptor.pos, forced_return=True)
            aaDistance = pfA._positions[accepPositionPos].sbjct_pos - \
                    pfD._positions[donorPositionPos].sbjct_pos

        # calculate binary entropy score (on the not yet gap-corrected
        # alignment positions)
        entropyDonorSbjct = pfD.alignment_entropy(
                donorPositionPos, method='donor')
        entropyAcceptorSbjct = pfA.alignment_entropy(
                accepPositionPos, method='acceptor')

        # do distance check upon (projected) intron acceptance
        if abs(aaDistance) <= kwargs['max_aa_offset']:

            # check if we've runned out of the aligned part
            outofalignedpacbporf = False

            # get the projected donor position; walk towards the alignment
            # start until a non-gap position is found
            while pfD._positions[donorPositionPos].isa_gap and \
            donorPositionPos > 0:
                donorPositionPos -= 1
            projected_donor_position = getattr(
                    pfD._positions[donorPositionPos],
                    outOfAlignmentAttribute) + phaseD
            if donorPositionPos == 0 and \
            pfD._positions[donorPositionPos].isa_gap:
                print("WarningThatIsTackled::outofalignedpacbporf::donor")
                outofalignedpacbporf = True

            # get the projected acceptor position; walk towards the
            # alignment end until a non-gap position is found
            while pfA._positions[accepPositionPos].isa_gap and \
            len(pfA._positions) > accepPositionPos + 1:
                accepPositionPos += 1
            projected_accep_position = getattr(
                    pfA._positions[accepPositionPos],
                    outOfAlignmentAttribute) + phaseA
            if accepPositionPos == len(pfA._positions) - 1 and \
            pfA._positions[accepPositionPos].isa_gap:
                print("WarningThatIsTackled::outofalignedpacbporf::acceptor")
                outofalignedpacbporf = True

            if not outofalignedpacbporf:
                ################################################################
                # set some meta-data properties to the intron object
                ################################################################
                # add distance score to intron (in nt)
                intron._distance = abs(aaDistance) * 3

                # add Alignment Positional Periphery Score into objects
                if queryorsbjct == "query":
                    set_apps_intron_query(intron, pfD, pfA)
                else:
                    set_apps_intron_sbjct(intron, pfD, pfA)

                # set GFF fsource attribute for recognition of intron sources
                intron._gff['fsource'] = "ABGPprojecting"

                # make a ProjectedIntronConnectingOrfs object
                pico = ProjectedIntronConnectingOrfs(
                        prjctOrf,
                        projected_donor_position,
                        projected_accep_position)
                intron.binary_entropy_donor = entropyDonorSbjct
                intron.binary_entropy_acceptor = entropyAcceptorSbjct
                pico.add_projected_intron(intron)
                pico.phase = intron.phase
                projected_introns.append(pico)

                ################################################################
                if verbose:
                    print(" ".join([
                        "PROJ::",
                        str(intron._distance),
                        str((pfD.orfQ.id, pfA.orfQ.id)),
                        str((pfD.orfS.id, pfA.orfS.id)),
                        "%s-%snt" % (intron.donor.pos, intron.acceptor.pos),
                        "%2.1f,%2.1f" % (
                            intron.donor.pssm_score,
                            intron.acceptor.pssm_score),
                        "%2.1f,%2.1f" % (
                            intron.binary_entropy_donor,
                            intron.binary_entropy_acceptor),
                        ]))
                ################################################################

        if aaDistance > kwargs['max_aa_offset']:
            # break out; ordered by length can never result in
            # a proper projected intron
            break

    # filter out less relevant ones compared to complete set of results
    projected_introns = _filter_projected_introns(projected_introns)

    # and return a list of ProjectedIntronConnectingOrfs
    return projected_introns
def merge_pacbporfs_with_introns(pacbporfD, pacbporfA, verbose=False, **kwargs): """ Merge 2 PacbPORF objects by introns @attention: see orfs.merge_orfs_with_intron for **kwargs @attention: see functions._filter_for_alignable_splice_sites for **kwargs @attention: see functions._filter_for_entropy for **kwargs @type pacbporfD: PacbPORF object @param pacbporfD: PacbPORF object that has to deliver PSSM donor objects @type pacbporfA: PacbPORF object @param pacbporfA: PacbPORF object that has to deliver PSSM acceptor objects @type verbose: Boolean @param verbose: print status/debugging messages to STDOUT @rtype: list @return: list with ( intron, intron ), in query and sbjct """ # input validation IsPacbPORF(pacbporfD) IsPacbPORF(pacbporfA) # edit **kwargs dictionary for some forced attributes _update_kwargs(kwargs, KWARGS_MAPPED_INTRON) if not kwargs.has_key('aligned_site_max_triplet_distance'): kwargs['aligned_site_max_triplet_distance'] = kwargs['max_aa_offset'] # settings for minimal alignment entropy score min_donor_site_alignment_entropy = 0.0 min_acceptor_site_alignment_entropy = 0.0 # calculate maximal/minimal donor/acceptor site position based on alignment ELEGIABLE_SPLICE_SITE_AA_RANGE = 75 qdr = pacbporfD.alignment_dna_range_query() qar = pacbporfA.alignment_dna_range_query() min_donor_query_pos = max( [min(qdr), max(qdr) - (ELEGIABLE_SPLICE_SITE_AA_RANGE * 3)]) max_accep_query_pos = min( [max(qar), min(qar) + (ELEGIABLE_SPLICE_SITE_AA_RANGE * 3)]) sdr = pacbporfD.alignment_dna_range_sbjct() sar = pacbporfA.alignment_dna_range_sbjct() min_donor_sbjct_pos = max( [min(sdr), max(sdr) - (ELEGIABLE_SPLICE_SITE_AA_RANGE * 3)]) max_accep_sbjct_pos = min( [max(sar), min(sar) + (ELEGIABLE_SPLICE_SITE_AA_RANGE * 3)]) # get list of introns #intronsQ = merge_orfs_with_intron(pacbporfD.orfQ,pacbporfA.orfQ, # min_donor_pos =min_donor_query_pos, # max_acceptor_pos=max_accep_query_pos,**kwargs) #intronsS = merge_orfs_with_intron(pacbporfD.orfS,pacbporfA.orfS, # min_donor_pos 
=min_donor_sbjct_pos, # max_acceptor_pos=max_accep_sbjct_pos,**kwargs) # get list of introns intronsQ = merge_orfs_with_intron(pacbporfD.orfQ, pacbporfA.orfQ, **kwargs) intronsS = merge_orfs_with_intron(pacbporfD.orfS, pacbporfA.orfS, **kwargs) # get unique list of donors & acceptors donorQ = olba(list(Set([inQ.donor for inQ in intronsQ])), order_by='pos') donorS = olba(list(Set([inS.donor for inS in intronsS])), order_by='pos') accepQ = olba(list(Set([inQ.acceptor for inQ in intronsQ])), order_by='pos') accepS = olba(list(Set([inS.acceptor for inS in intronsS])), order_by='pos') ############################################################################ if verbose: print "dQ1", [d.pos for d in donorQ], "aQ1", [a.pos for a in accepQ] print "dS1", [d.pos for d in donorS], "aS1", [a.pos for a in accepS] ############################################################################ # filter for alignable donor & acceptor sites kwargs['allow_non_canonical'] = kwargs['allow_non_canonical_donor'] algdonors = _filter_for_alignable_splice_sites(donorQ, donorS, pacbporfD, **kwargs) kwargs['allow_non_canonical'] = kwargs['allow_non_canonical_acceptor'] algacceps = _filter_for_alignable_splice_sites(accepQ, accepS, pacbporfA, **kwargs) ############################################################################ if verbose: print "dQ2", [_dq.pos for (_dq, _ds) in algdonors], print "aQ2", [_aq.pos for (_aq, _as) in algacceps] print "dS2", [_ds.pos for (_dq, _ds) in algdonors], print "aS2", [_as.pos for (_aq, _as) in algacceps] ############################################################################ # remove sites with to low alignment entropy algdonors = _filter_for_entropy( algdonors, pacbporfD, 'donor', min_alignment_entropy=min_donor_site_alignment_entropy) algacceps = _filter_for_entropy( algacceps, pacbporfA, 'acceptor', min_alignment_entropy=min_acceptor_site_alignment_entropy) ############################################################################ if verbose: 
print "dQ3", [_dq.pos for (_dq, _ds) in algdonors], print "aQ3", [_aq.pos for (_aq, _as) in algacceps] print "dS3", [_ds.pos for (_dq, _ds) in algdonors], print "aS3", [_as.pos for (_aq, _as) in algacceps] ############################################################################ # make unique position lists for quick lookup in intron lists dQpl = Set([dQ.pos for dQ, dS in algdonors]) dSpl = Set([dS.pos for dQ, dS in algdonors]) aQpl = Set([aQ.pos for aQ, aS in algacceps]) aSpl = Set([aS.pos for aQ, aS in algacceps]) # check exterior boundaries of PacbPORFs sposD = pacbporfD._get_original_alignment_pos_start() eposD = pacbporfD._get_original_alignment_pos_end() sposA = pacbporfA._get_original_alignment_pos_start() eposA = pacbporfA._get_original_alignment_pos_end() # now make list of aligable introns algintrons = [] for intQ in intronsQ: # check if intron falls within the PacbPORF aligned area if intQ.donor.pos <= sposD.query_dna_start: continue if intQ.acceptor.pos >= eposA.query_dna_end: continue if intQ.donor.pos in dQpl and intQ.acceptor.pos in aQpl: # Query intron occurs in list of alignable splice sites! for intS in intronsS: # check if intron falls within the PacbPORF aligned area if intS.donor.pos <= sposD.sbjct_dna_start: continue if intS.acceptor.pos >= eposA.sbjct_dna_end: continue if intS.donor.pos in dSpl and intS.acceptor.pos in aSpl: # Sbjct intron occurs as well in alignable splice sites! if (intQ.donor,intS.donor) in algdonors and\ (intQ.acceptor,intS.acceptor) in algacceps: # Sbjct & Query Donor & Acceptor are alignable! 
algintrons.append((intQ, intS)) ############################################################################ # set some meta-data properties to the intron objects ############################################################################ for intQ, intS in algintrons: distDnt = pacbporfD.get_distance_aligned_nucleotide_positions( query=intQ.donor.pos, sbjct=intS.donor.pos) distAnt = pacbporfA.get_distance_aligned_nucleotide_positions( query=intQ.acceptor.pos, sbjct=intS.acceptor.pos) # final distance check. kwargs['aligned_site_max_triplet_distance'] # is applied on donor and acceptor site. This distance measured on the # protein sequence can be DOUBLED in case distDnt / distAnt are # opposite (+ and -). Check here if the protein sequence gap is # as well <= kwargs['aligned_site_max_triplet_distance']. if abs(distAnt - distDnt) > kwargs['aligned_site_max_triplet_distance'] * 3: continue # add distance score to introns intQ._distance = abs(distDnt) + abs(distAnt) intS._distance = abs(distDnt) + abs(distAnt) # add Alignment Positional Periphery Score into objects succes = set_apps_intron_query(intQ, pacbporfD, pacbporfA) succes = set_apps_intron_sbjct(intS, pacbporfD, pacbporfA) # set GFF fsource attribute for recognition of intron sources intQ._gff['fsource'] = "ABGPmapping" intS._gff['fsource'] = "ABGPmapping" ######################################################################## if verbose: # some printing.... print "Aligned introns:", (intQ.donor.pos, intQ.acceptor.pos), print(intS.donor.pos, intS.acceptor.pos), print "DIST:", distDnt, distAnt, print "[%s]" % kwargs['aligned_site_max_triplet_distance'], print "ENTROPY: %1.2f %1.2f" % (intQ._apps_donor, intQ._apps_accep), print "PSSM: (%1.2f %1.2f) (%1.2f %1.2f)" % ( intQ.donor.pssm_score, intS.donor.pssm_score, intQ.acceptor.pssm_score, intS.acceptor.pssm_score, ) ######################################################################## # return lists of aligned introns return algintrons