def merge_pacbporfs_with_phase_shift_introns(pacbporfD, pacbporfA,
        verbose=False, **kwargs):
    """
    Merge 2 PacbPORF objects by introns of which one underwent a phase shift

    The regular merge_pacbporfs_with_introns() function is run with
    kwargs['allow_phase_shift'] forced to True; from its result only those
    intron pairs are kept whose query and sbjct donor phases differ.

    @attention: see pacb.connecting.merge_orfs_with_intron for **kwargs)

    @type  pacbporfD: PacbPORF object
    @param pacbporfD: PacbPORF object that has to deliver PSSM donor objects

    @type  pacbporfA: PacbPORF object
    @param pacbporfA: PacbPORF object that has to deliver PSSM acceptor objects

    @type  verbose: Boolean
    @param verbose: print status/debugging messages to STDOUT
                    (not used in this function body and not forwarded)

    @rtype:  list
    @return: list with ( intronQ, intronS ) tuples
    """
    # input validation
    IsPacbPORF(pacbporfD)
    IsPacbPORF(pacbporfA)

    # edit **kwargs dictionary; phase shifts are always allowed here
    kwargs['allow_phase_shift'] = True
    _update_kwargs(kwargs, KWARGS_PHASE_SHIFT_INTRON)
    if 'aligned_site_max_triplet_distance' not in kwargs:
        # NOTE(review): raises KeyError when 'max_aa_distance' is absent
        # after _update_kwargs() -- confirm the defaults provide it
        kwargs['aligned_site_max_triplet_distance'] = kwargs['max_aa_distance']

    # run regular merge_pacbporfs_with_introns function
    alg_introns = merge_pacbporfs_with_introns(pacbporfD, pacbporfA, **kwargs)

    psh_introns = []
    for intQ, intS in alg_introns:
        # check phase equilibrium -> if equal, no phase shift
        if intQ.donor.phase == intS.donor.phase:
            continue

        # attribute _distance and attribute(s) ~APPS are already set in
        # merge_pacbporfs_with_introns; only the GFF fsource attribute is
        # overwritten for recognition of the intron source
        intQ._gff['fsource'] = "ABGPphs"
        intS._gff['fsource'] = "ABGPphs"

        # putatively a phase shifted intron pair
        psh_introns.append((intQ, intS))

    # return list of phase shifted intron pairs
    return psh_introns
def merge_pacbporfs_with_phase_shift_introns(pacbporfD,pacbporfA,
        verbose=False,**kwargs):
    """
    Merge 2 PacbPORF objects by introns of which one underwent a phase shift

    Runs merge_pacbporfs_with_introns() with kwargs['allow_phase_shift']
    forced to True and keeps only the intron pairs of which the donor phase
    in the query differs from the donor phase in the sbjct.

    @attention: see pacb.connecting.merge_orfs_with_intron for **kwargs)

    @type  pacbporfD: PacbPORF object
    @param pacbporfD: PacbPORF object that has to deliver PSSM donor objects

    @type  pacbporfA: PacbPORF object
    @param pacbporfA: PacbPORF object that has to deliver PSSM acceptor objects

    @type  verbose: Boolean
    @param verbose: print status/debugging messages to STDOUT
                    (not used in this function body and not forwarded)

    @rtype:  list
    @return: list with ( intronQ, intronS ) tuples
    """
    # input validation
    IsPacbPORF(pacbporfD)
    IsPacbPORF(pacbporfA)

    # edit **kwargs dictionary; phase shifts are always allowed here
    kwargs['allow_phase_shift'] = True
    _update_kwargs(kwargs,KWARGS_PHASE_SHIFT_INTRON)
    if 'aligned_site_max_triplet_distance' not in kwargs:
        # NOTE(review): raises KeyError when 'max_aa_distance' is absent
        # after _update_kwargs() -- confirm the defaults provide it
        kwargs['aligned_site_max_triplet_distance'] = kwargs['max_aa_distance']

    # run regular merge_pacbporfs_with_introns function
    alg_introns = merge_pacbporfs_with_introns(pacbporfD,pacbporfA,**kwargs)

    psh_introns = []
    for intQ,intS in alg_introns:
        # equal donor phases -> no phase shift -> not of interest here
        if intQ.donor.phase == intS.donor.phase:
            continue

        # _distance and ~APPS attributes are already set in
        # merge_pacbporfs_with_introns; relabel the GFF fsource attribute
        # for recognition of the intron source
        intQ._gff['fsource'] = "ABGPphs"
        intS._gff['fsource'] = "ABGPphs"

        # putatively a phase shifted intron pair
        psh_introns.append( ( intQ, intS ) )

    # return list of phase shifted intron pairs
    return psh_introns
def find_stopless3n_introns_on_orf(orfObj, has_branchpoint=False,
        has_polypyrimidine=False, order_by='length', **kwargs):
    """
    Find potential stopless3n introns on this orf

    @attention: **kwargs can contain other (here) unnecessarily arguments
    @attention: **kwargs are required in the merge_orfs_with_intron() function

    @type  orfObj: Orf object
    @param orfObj: Orf object which is scanned for stopless3n introns

    @type  has_branchpoint: Boolean
    @param has_branchpoint: when True, keep only introns that have a
                branchpoint whose distance lies within
                MAXIMAL_OPTIMAL_BRACNHPOINT_TO_ACCEPTOR_DISTANCE of one of
                the OPTIMAL_BRACNHPOINT_TO_ACCEPTOR_DISTANCE offsets

    @type  has_polypyrimidine: Boolean
    @param has_polypyrimidine: when True, keep only introns that have a
                ppt5p or a ppt3p attribute set

    @type  order_by: string
    @param order_by: passed to _order_intron_list() for the final ordering

    @rtype:  list
    @return: list with introns
    """
    # input validation
    IsOrf(orfObj)

    # edit **kwargs dictionary for some forced attributes
    _update_kwargs(kwargs, KWARGS_STOPLESS_3N_INTRONS)

    # find stopless3nintrons (the Orf is merged with itself)
    stopless3nintrons = merge_orfs_with_intron(orfObj, orfObj, **kwargs)

    # no filtering requested -> return ordered intron list
    if not (has_branchpoint or has_polypyrimidine):
        return _order_intron_list(stopless3nintrons, order_by=order_by)

    # filter for presence of branchpoint / polypyrimidine tracks
    filtered = []
    for intron in stopless3nintrons:
        intron.assign_bp_and_ppts()

        if has_branchpoint:
            # The distance is evaluated only when a branchpoint is demanded;
            # doing it unconditionally fails on `offset - None` for introns
            # without a branchpoint when only has_polypyrimidine is set.
            if not intron.branchpoint:
                continue
            intron_bp_dist = intron.get_branchpoint_nt_distance()
            if intron_bp_dist is None:
                continue
            intron_bp_optimality = min([
                abs(offset - intron_bp_dist) for offset in
                OPTIMAL_BRACNHPOINT_TO_ACCEPTOR_DISTANCE])
            if intron_bp_optimality > \
            MAXIMAL_OPTIMAL_BRACNHPOINT_TO_ACCEPTOR_DISTANCE:
                continue

        if has_polypyrimidine and not (intron.ppt5p or intron.ppt3p):
            continue

        # if here, accepted!
        filtered.append(intron)

    # return ordered intron list
    return _order_intron_list(filtered, order_by=order_by)
def _get_tinyexon_dict(input, omit_identifier_list=[], **kwargs):
    """
    Collect potential tiny exons per identifier, ordered by length

    @attention: **kwargs are completed with KWARGS_TINYEXON_PAIRWISE and
                passed on to get_potential_tiny_exons_on_orf()

    @type  input: dict
    @param input: dict-like object; input[identifier]['orfs'].orfs must be
                  an iterable of Orf objects

    @type  omit_identifier_list: list
    @param omit_identifier_list: identifiers (keys of input) to skip;
                  only read here, never modified

    @rtype:  dict
    @return: dict with identifiers as keys and, as values, the result of
             order_list_by_attribute(..., order_by='length') on the
             collected tiny exons
    """
    _update_kwargs(kwargs, KWARGS_TINYEXON_PAIRWISE)

    exons_per_identifier = {}
    for identifier in input.keys():
        if identifier not in omit_identifier_list:
            # gather the candidates of all Orfs of this identifier
            candidates = []
            for orf in input[identifier]['orfs'].orfs:
                candidates.extend(
                    get_potential_tiny_exons_on_orf(orf, **kwargs))
            # and store them ordered on length
            exons_per_identifier[identifier] = order_list_by_attribute(
                candidates, order_by='length')

    # return dict with predicted tinyexons
    return exons_per_identifier
def _get_tinyexon_dict(input,omit_identifier_list=[],**kwargs):
    """
    Predict tiny exons on all Orfs of each identifier in the input

    @attention: **kwargs are completed with KWARGS_TINYEXON_PAIRWISE and
                passed on to get_potential_tiny_exons_on_orf()

    @type  input: dict
    @param input: dict-like object; input[identifier]['orfs'].orfs must be
                  an iterable of Orf objects

    @type  omit_identifier_list: list
    @param omit_identifier_list: identifiers (keys of input) to skip;
                  only read here, never modified

    @rtype:  dict
    @return: dict with identifiers as keys and, as values, the tiny exons
             as returned by order_list_by_attribute(...,order_by='length')
    """
    _update_kwargs(kwargs,KWARGS_TINYEXON_PAIRWISE)

    result = {}
    for key in input.keys():
        if key in omit_identifier_list:
            # identifier explicitly excluded by the caller
            continue
        found = []
        for orfObj in input[key]['orfs'].orfs:
            found.extend( get_potential_tiny_exons_on_orf(orfObj,**kwargs) )
        result[key] = order_list_by_attribute(found,order_by='length')

    # return dict with predicted tinyexons
    return result
def find_stopless3n_introns_on_orf(orfObj,
    has_branchpoint = False,
    has_polypyrimidine = False,
    order_by = 'length',**kwargs):
    """
    Find potential stopless3n introns on this orf

    @attention: **kwargs can contain other (here) unnecessarily arguments
    @attention: **kwargs are required in the merge_orfs_with_intron() function

    @type  orfObj: Orf object
    @param orfObj: Orf object which is scanned for stopless3n introns

    @type  has_branchpoint: Boolean
    @param has_branchpoint: when True, keep only introns that have a
                branchpoint whose distance lies within
                MAXIMAL_OPTIMAL_BRACNHPOINT_TO_ACCEPTOR_DISTANCE of one of
                the OPTIMAL_BRACNHPOINT_TO_ACCEPTOR_DISTANCE offsets

    @type  has_polypyrimidine: Boolean
    @param has_polypyrimidine: when True, keep only introns that have a
                ppt5p or a ppt3p attribute set

    @type  order_by: string
    @param order_by: passed to _order_intron_list() for the final ordering

    @rtype:  list
    @return: list with introns
    """
    # input validation
    IsOrf(orfObj)

    # edit **kwargs dictionary for some forced attributes
    _update_kwargs(kwargs,KWARGS_STOPLESS_3N_INTRONS)

    # find stopless3nintrons (the Orf is merged with itself)
    stopless3nintrons = merge_orfs_with_intron(orfObj,orfObj,**kwargs)

    # filter for presence of branchpoint / polypyrimidine tracks
    if has_branchpoint or has_polypyrimidine:
        filtered = []
        for intron in stopless3nintrons:
            intron.assign_bp_and_ppts()
            if has_branchpoint:
                # Branchpoint checks are done only on request; computing
                # the distance unconditionally fails on `offset - None`
                # for branchpoint-less introns when only
                # has_polypyrimidine is requested.
                if not intron.branchpoint:
                    continue
                intron_bp_dist = intron.get_branchpoint_nt_distance()
                if intron_bp_dist is None:
                    continue
                intron_bp_optimality = min([
                    abs(offset-intron_bp_dist) for offset in
                    OPTIMAL_BRACNHPOINT_TO_ACCEPTOR_DISTANCE ])
                if intron_bp_optimality >\
                MAXIMAL_OPTIMAL_BRACNHPOINT_TO_ACCEPTOR_DISTANCE:
                    continue
            if has_polypyrimidine and not (intron.ppt5p or intron.ppt3p):
                continue
            # if here, accepted!
            filtered.append( intron )
    else:
        filtered = stopless3nintrons

    # return ordered intron list
    return _order_intron_list(filtered,order_by=order_by)
def _merge_pacbporfs_by_intron(pfD,pfA,queryorsbjct,verbose=False,**kwargs):
    """
    Project splicesites from SBJCT intron on continuous QUERY PacbPORFs

    Introns are predicted between the donor and acceptor Orf of the
    `queryorsbjct` side; each accepted intron is wrapped in a
    ProjectedIntronConnectingOrfs object located on the Orf of the other
    side (which must be identical in both PacbPORFs).

    @type  pfD: PacbPORF object
    @param pfD: PacbPORF object that has to deliver (aligned) donor sites

    @type  pfA: PacbPORF object
    @param pfA: PacbPORF object that has to deliver (aligned) acceptor sites

    @type  queryorsbjct: string
    @param queryorsbjct: literal string 'query' or 'sbjct'

    @type  verbose: Boolean
    @param verbose: print debugging info to STDOUT when True

    @attention: see pacb.connecting.merge_orfs_with_intron for **kwargs)
    @attention: raises InproperlyAppliedArgument when queryorsbjct is
                neither 'query' nor 'sbjct'

    @rtype:  list
    @return: list with ProjectedIntrons (from Sbjct on Query), after
             _filter_projected_introns() has been applied
    """
    # input validation
    IsPacbPORF(pfD)
    IsPacbPORF(pfA)

    # edit **kwargs dictionary for some forced attributes
    _update_kwargs(kwargs,KWARGS_PROJECTED_INTRON)
    ### if not kwargs.has_key('projected_intron_max_nt_offset'):
    ###    kwargs['projected_intron_max_nt_offset'] = PROJECTED_INTRON_MAX_NT_OFFSET
    ### if not kwargs.has_key('projected_intron_max_aa_offset'):
    ###    kwargs['projected_intron_max_aa_offset'] = PROJECTED_INTRON_MAX_AA_OFFSET

    # settings for minimal alignment entropy score
    # NOTE(review): both thresholds are assigned but not referenced again
    # in this function
    min_donor_site_alignment_entropy = 0.0
    min_acceptor_site_alignment_entropy = 0.0

    # size (in AAs) of the window, measured from the alignment end of pfD
    # and the alignment start of pfA, in which splice sites are searched
    ELEGIABLE_SPLICE_SITE_AA_RANGE = 75

    # NOTE(review): eposD and sposA are assigned but not used below
    sposD = pfD._get_original_alignment_pos_start()
    eposD = pfD._get_original_alignment_pos_end()
    sposA = pfA._get_original_alignment_pos_start()
    eposA = pfA._get_original_alignment_pos_end()
    if queryorsbjct == "query":
        # Orfs of SBJCT must be identical
        IsIdenticalOrfs(pfD.orfS,pfA.orfS)
        donorOrf = pfD.orfQ
        accepOrf = pfA.orfQ
        prjctOrf = pfD.orfS # pfD.orfS == pfA.orfS
        dStart = sposD.query_dna_start # ALIGNED start of donorPacbPORF
        dEnd = pfD.query_dna_end # ABSOLUTE end of donorPacbPORF
        aStart = pfA.query_dna_start # ABSOLUTE start of acceptorPacbPORF
        aEnd = eposA.query_dna_end # ALIGNED end of acceptorPacbPORF
        # attribute read (via getattr) from alignment positions to obtain
        # the projected coordinate on the other (sbjct) sequence
        outOfAlignmentAttribute = "sbjct_dna_start"
        # calculate elegiable splice site range
        qdr = pfD.alignment_dna_range_query()
        qar = pfA.alignment_dna_range_query()
        min_donor_pos = max([ min(qdr), max(qdr)-(ELEGIABLE_SPLICE_SITE_AA_RANGE*3) ])
        max_accep_pos = min([ max(qar), min(qar)+(ELEGIABLE_SPLICE_SITE_AA_RANGE*3) ])
    elif queryorsbjct == "sbjct":
        # Orfs of QUERY must be identical
        IsIdenticalOrfs(pfD.orfQ,pfA.orfQ)
        donorOrf = pfD.orfS
        accepOrf = pfA.orfS
        prjctOrf = pfD.orfQ # pfD.orfQ == pfA.orfQ
        dStart = sposD.sbjct_dna_start # ALIGNED start of donorPacbPORF
        dEnd = pfD.sbjct_dna_end # ABSOLUTE end of donorPacbPORF
        aStart = pfA.sbjct_dna_start # ABSOLUTE start of acceptorPacbPORF
        aEnd = eposA.sbjct_dna_end # ALIGNED end of acceptorPacbPORF
        # attribute read (via getattr) from alignment positions to obtain
        # the projected coordinate on the other (query) sequence
        outOfAlignmentAttribute = "query_dna_start"
        # calculate elegiable splice site range
        sdr = pfD.alignment_dna_range_sbjct()
        sar = pfA.alignment_dna_range_sbjct()
        min_donor_pos = max([ min(sdr), max(sdr)-(ELEGIABLE_SPLICE_SITE_AA_RANGE*3) ])
        max_accep_pos = min([ max(sar), min(sar)+(ELEGIABLE_SPLICE_SITE_AA_RANGE*3) ])
    else:
        message = "'queryorsbjct' (%s), not 'query' or 'sbjct'" % queryorsbjct
        raise InproperlyAppliedArgument, message

    # predict introns only in `queryorsbjct` Orfs
    # introns is a list of IntronConnectingOrfs objects
    # (requested with order_by='length'; the `break` statements in the
    # loop below rely on that ordering)
    introns = merge_orfs_with_intron(donorOrf,accepOrf,
            min_donor_pos=min_donor_pos,
            max_acceptor_pos=max_accep_pos,
            order_by='length',**kwargs)

    # return list with projected introns
    projected_introns = []

    # gather unique donor and acceptor positions from list
    # of IntronConnectingOrfs
    for intron in introns:
        # break if intron is to large
        if kwargs['max_intron_nt_length'] and intron.length > kwargs['max_intron_nt_length']: break
        # continue if intron is to small
        if kwargs['min_intron_nt_length'] and intron.length < kwargs['min_intron_nt_length']: continue
        # continue if intron has non-canonical features

        # check if intron.start is on pfD;
        # inframe-introns can be projected outside of pfD/pfA area
        if intron.start <= dStart: continue
        if intron.start >= dEnd: continue

        # check if intron.end is on pfA;
        # inframe-introns can be projected outside of pfD/pfA area
        if intron.end <= aStart: continue
        if intron.end >= aEnd: continue

        if queryorsbjct == "sbjct":
            # get positions of donor & acceptor in the PacbPORF alignment
            donorPositionPos, phaseD = pfD.dnaposition_sbjct(intron.donor.pos,forced_return=True)
            accepPositionPos, phaseA = pfA.dnaposition_sbjct(intron.acceptor.pos,forced_return=True)
            # calculate projected distance on QUERY
            posDposQuery = pfD._positions[donorPositionPos].query_pos
            posAposQuery = pfA._positions[accepPositionPos].query_pos
            aaDistance = posAposQuery - posDposQuery
        else:
            # get positions of donor & acceptor in the PacbPORF alignment
            donorPositionPos, phaseD = pfD.dnaposition_query(intron.donor.pos,forced_return=True)
            accepPositionPos, phaseA = pfA.dnaposition_query(intron.acceptor.pos,forced_return=True)
            # calculate projected distance on SBJCT
            posDposSbjct = pfD._positions[donorPositionPos].sbjct_pos
            posAposSbjct = pfA._positions[accepPositionPos].sbjct_pos
            aaDistance = posAposSbjct - posDposSbjct

        # calculate binary entropy score
        entropyDonorSbjct = pfD.alignment_entropy(donorPositionPos,method='donor')
        entropyAcceptorSbjct= pfA.alignment_entropy(accepPositionPos,method='acceptor')

        # do distance check upon (projected) intron acceptance
        if abs(aaDistance) <= kwargs['max_aa_offset']:

            # check if we've runned out of the aligned part
            outofalignedpacbporf = False

            # get the projected donor position; mind the gap on this spot ;-)
            # (walk towards the alignment start until a non-gap position;
            #  the loop has no `break`, so the else-suite always runs)
            while pfD._positions[donorPositionPos].isa_gap and donorPositionPos > 0 :
                donorPositionPos -= 1
            else:
                projected_donor_position = getattr(pfD._positions[donorPositionPos],outOfAlignmentAttribute) + phaseD
                if donorPositionPos == 0 and pfD._positions[donorPositionPos].isa_gap:
                    print "WarningThatIsTackled::outofalignedpacbporf::donor"
                    outofalignedpacbporf = True

            # get the projected acceptor position; mind the gap on this spot ;-)
            # (walk towards the alignment end until a non-gap position;
            #  the loop has no `break`, so the else-suite always runs)
            while pfA._positions[accepPositionPos].isa_gap and len(pfA._positions) > accepPositionPos+1:
                accepPositionPos += 1
            else:
                projected_accep_position = getattr(pfA._positions[accepPositionPos],outOfAlignmentAttribute) + phaseA
                if accepPositionPos == len(pfA._positions)-1 and pfA._positions[accepPositionPos].isa_gap:
                    print "WarningThatIsTackled::outofalignedpacbporf::acceptor"
                    outofalignedpacbporf = True

            if not outofalignedpacbporf:
                ################################################################
                # set some meta-data properties to the intron object
                ################################################################
                # add distance score to intron
                intron._distance = abs(aaDistance)*3

                # add Alignment Positional Periphery Score into objects
                # (the returned value `succes` is not inspected)
                if queryorsbjct == "query":
                    succes = set_apps_intron_query(intron,pfD,pfA)
                else:
                    succes = set_apps_intron_sbjct(intron,pfD,pfA)

                # set GFF fsource attribute for recognition of intron sources
                intron._gff['fsource'] = "ABGPprojecting"

                # make a ProjectedIntronConnectingOrfs object
                pico = ProjectedIntronConnectingOrfs(prjctOrf,
                        projected_donor_position,
                        projected_accep_position)
                intron.binary_entropy_donor = entropyDonorSbjct
                intron.binary_entropy_acceptor = entropyAcceptorSbjct
                pico.add_projected_intron( intron )
                pico.phase = intron.phase
                projected_introns.append( pico )

                ################################################################
                if verbose:
                    print "PROJ::", intron._distance,
                    print (pfD.orfQ.id, pfA.orfQ.id),
                    print (pfD.orfS.id, pfA.orfS.id),
                    print "%s-%snt" % (intron.donor.pos, intron.acceptor.pos),
                    print "%2.1f,%2.1f" % (intron.donor.pssm_score, intron.acceptor.pssm_score),
                    print "%2.1f,%2.1f" % (intron.binary_entropy_donor,intron.binary_entropy_acceptor)
                ################################################################

        if aaDistance > kwargs['max_aa_offset']:
            # break out; ordered by length can never result in
            # a proper projected intron
            break

    # filter out less relevant ones compared to complete set of results
    projected_introns = _filter_projected_introns(projected_introns)

    # and return a list of ProjectedIntronConnectingOrfs
    return projected_introns
def merge_orfs_with_two_tinyexons(preceding_orf,subsequent_orf,
    preceding_donor_sites=[],
    subsequent_acceptor_sites=[],
    orflist=[],**kwargs):
    """
    Bridge two `neighbouring` Orfs by TWO tinyexon by applying preceding donors and subsequent acceptors

    Tiny exons are collected from the Orfs in orflist that lie between the
    extremes of the given splice sites; each pair of tiny exons that can be
    connected by a central intron of elegiable length and matching phase
    is returned.

    @type  preceding_orf: Orf object
    @param preceding_orf: Orf object that contains preceding_donor_site(s)

    @type  subsequent_orf: Orf object
    @param subsequent_orf: Orf object that contains subsequent_acceptor_site(s)

    @type  preceding_donor_sites: list
    @param preceding_donor_sites: list with SpliceDonorGT and/or SpliceDonor objects
                (only their minimal position is used here)

    @type  subsequent_acceptor_sites: list
    @param subsequent_acceptor_sites: list with SpliceAcceptorAG and/or SpliceAcceptor objects
                (only their maximal position is used here)

    @type  orflist: list
    @param orflist: list with Orf objects

    @attention: see get_potential_tiny_exons_on_orf for additional **kwargs
    @attention: the list-type default arguments are only read, never modified

    @rtype:  list
    @return: list of tuples ( tinyexon1, central_intron, tinyexon2 ),
             ordered on the summed length of both tinyexons; combinations
             of equal summed length keep the order in which they were found
    """
    if not preceding_donor_sites: return []
    if not subsequent_acceptor_sites: return []
    if not orflist: return []

    # edit **kwargs dictionary for some forced attributes
    _update_kwargs(kwargs,KWARGS_PROJECTED_TINYEXON)

    tinyexoncollection = []
    tinyexoncombis = []
    min_preceding_donor_sites_pos = min([ d.pos for d in preceding_donor_sites ])
    max_subsequent_acceptor_sites_pos = max([ a.pos for a in subsequent_acceptor_sites ])

    # range in which an Orf must (partially) lie to be able to deliver a
    # tinyexon; identical for all Orfs, so computed once
    min_pos = min_preceding_donor_sites_pos + kwargs['min_tinyexon_intron_nt_length']
    max_pos = max_subsequent_acceptor_sites_pos - kwargs['min_tinyexon_intron_nt_length']

    for orfX in orflist:
        # skip Orfs that are not correctly positioned towards
        # the splice sites' extremes
        if orfX.endPY <= min_pos: continue
        if orfX.startPY >= max_pos: continue
        # extend the tinyexoncollection
        tinyexoncollection.extend( get_potential_tiny_exons_on_orf(orfX,**kwargs) )

    # make tinyexoncollection ordered on start pos
    tinyexoncollection = _order_intron_list(tinyexoncollection,order_by='donor_pos')
    # donor_pos makes REVERSE ordering; restore this by reversing
    tinyexoncollection.reverse()

    # make 2-elemented tuples of tinyexons which can co-occur together
    for tinyexon1 in tinyexoncollection:
        # walk the collection from its end backwards, until tinyexons
        # are reached that lie before tinyexon1
        for tinyexon2 in reversed(tinyexoncollection):
            if tinyexon2.donor.pos < tinyexon1.donor.pos: break
            intron_length = tinyexon2.acceptor.pos - tinyexon1.donor.pos
            if intron_length < kwargs['min_tinyexon_intron_nt_length']: continue
            if intron_length > kwargs['max_tinyexon_intron_nt_length']: continue
            if tinyexon1.donor.phase != tinyexon2.acceptor.phase: continue
            # if here, elegiable combi!
            intron = IntronConnectingOrfs(
                tinyexon1.donor,tinyexon2.acceptor,
                get_shared_nucleotides_at_splicesite(
                        subsequent_orf,preceding_orf,
                        tinyexon2.acceptor,tinyexon1.donor
                        ),
                preceding_orf,subsequent_orf)
            totlen = tinyexon1.length+tinyexon2.length
            combi = ( totlen, tinyexon1, intron, tinyexon2 )
            tinyexoncombis.append( combi )

    # return an ordered list based on length; sort on the length only, so
    # equal lengths never fall through to comparing exon/intron objects
    tinyexoncombis.sort(key=lambda combi: combi[0])
    return [ (exon1,intron,exon2) for l,exon1,intron,exon2 in tinyexoncombis ]
def merge_pacbporfs_with_query_intron_bridgeing(pacbporfD,pacbporfA,verbose=False,**kwargs):
    """
    Merge query Orfs in PacbPORF by **best** intron

    Only the query Orfs of both PacbPORFs are used for intron prediction.
    The predicted introns are filtered on alignment entropy (only when
    both PacbPORFs have an identityscore > 0.55), on the PSSM/entropy
    combination and finally on presence of branchpoint and polypyrimidine
    tracks.

    @attention: see orfs.merge_orfs_with_intron for **kwargs

    @type  pacbporfD: PacbPORF object
    @param pacbporfD: PacbPORF object that has to deliver PSSM donor objects

    @type  pacbporfA: PacbPORF object
    @param pacbporfA: PacbPORF object that has to deliver PSSM acceptor objects

    @type  verbose: Boolean
    @param verbose: print status/debugging messages to STDOUT

    @rtype:  list
    @return: list with introns (on the query) that passed all filters
    """
    # input validation
    IsPacbPORF(pacbporfD)
    IsPacbPORF(pacbporfA)

    # edit **kwargs dictionary for some forced attributes
    _update_kwargs(kwargs,KWARGS_MAPPED_INTRON)
    if 'aligned_site_max_triplet_distance' not in kwargs:
        kwargs['aligned_site_max_triplet_distance'] = kwargs['max_aa_offset']

    # calculate maximal/minimal donor/acceptor site position based on alignment
    ELEGIABLE_SPLICE_SITE_AA_RANGE = 75
    qdr = pacbporfD.alignment_dna_range_query()
    qar = pacbporfA.alignment_dna_range_query()
    min_donor_query_pos = max([ min(qdr), max(qdr)-(ELEGIABLE_SPLICE_SITE_AA_RANGE*3) ])
    max_accep_query_pos = min([ max(qar), min(qar)+(ELEGIABLE_SPLICE_SITE_AA_RANGE*3) ])

    # get list of introns
    intronlist = merge_orfs_with_intron(pacbporfD.orfQ,pacbporfA.orfQ,
            min_donor_pos   =min_donor_query_pos,
            max_acceptor_pos=max_accep_query_pos,**kwargs)

    # filter on entropy, but only for well-conserved PacbPORFs
    if min([pacbporfD.identityscore,pacbporfA.identityscore]) > 0.55:
        # settings for minimal alignment entropy score
        min_donor_site_entropy = 0.01
        min_acceptor_site_entropy = 0.01
        intronlist = _filter_introns_on_entropy(intronlist,pacbporfD,pacbporfA,
                min_donor_site_entropy=min_donor_site_entropy,
                min_acceptor_site_entropy=min_acceptor_site_entropy)
    else:
        # do not filter, but do not forget to store apps data to intron(s)
        for intron in intronlist:
            set_apps_intron_query(intron,pacbporfD,pacbporfA)

    for intron in intronlist:
        intron._distance = 0 # ??
        # set GFF fsource attribute for recognition of intron sources
        intron._gff['fsource'] = 'ABGPbridgeing'

    ############################################################################
    if verbose:
        # unique, position-ordered donors & acceptors; debugging output only
        donor = olba( list(Set([intron.donor for intron in intronlist ])), order_by='pos')
        accep = olba( list(Set([intron.acceptor for intron in intronlist ])), order_by='pos')
        print("dQ1 %s aQ1 %s" % ([d.pos for d in donor],[a.pos for a in accep]))
    ############################################################################

    intronlist = _filter_introns_on_pssm_entropy_combination(intronlist)

    ############################################################################
    if verbose:
        # same listing again, now after the pssm/entropy filtering
        donor = olba( list(Set([intron.donor for intron in intronlist ])), order_by='pos')
        accep = olba( list(Set([intron.acceptor for intron in intronlist ])), order_by='pos')
        print("dQ1 %s aQ1 %s" % ([d.pos for d in donor],[a.pos for a in accep]))
    ############################################################################

    # strict filter: branchpoint AND (5' or 3') polypyrimidine track
    filtered_intron_list = []
    for intron in intronlist:
        intron.assign_bp_and_ppts()
        if intron.branchpoint and (intron.ppt5p or intron.ppt3p):
            filtered_intron_list.append( intron )

    # check if list is emptied due to branchpoint filtering
    # in that case, filter for either branchpoint OR polyppt
    if not filtered_intron_list and intronlist:
        for intron in intronlist:
            if intron.branchpoint or (intron.ppt5p or intron.ppt3p):
                filtered_intron_list.append( intron )

    # return list of filtered introns
    return filtered_intron_list
def _merge_pacbporfs_by_tinyexon_and_two_introns(pacbporfD, pacbporfA,
                                                 orfSetObject, queryorsbjct,
                                                 verbose=False, **kwargs):
    """
    Merge 2 PacbPORF objects by a tinyexon and two introns

    For each donor/acceptor pair whose projected distance on the other
    sequence is tinyexon-sized, a ScanForMatches pattern is written to a
    file and searched for in the genomic sequence of the donor Orf. The
    first match that passes all checks is turned into a tinyexon PacbPORF,
    flanked by two introns.

    @attention: see pacb.connecting.merge_orfs_with_intron for **kwargs)
    @attention: writes (and removes) pattern files named
                sfmpat_tinyexon_* and runs an external command
    @attention: raises InproperlyAppliedArgument when queryorsbjct is
                neither 'query' nor 'sbjct'

    @type  pacbporfD: PacbPORF object
    @param pacbporfD: PacbPORF object that has to deliver PSSM donor objects

    @type  pacbporfA: PacbPORF object
    @param pacbporfA: PacbPORF object that has to deliver PSSM acceptor objects

    @type  orfSetObject: object with elegiable Orfs
    @param orfSetObject: object with elegiable Orfs
                (must provide a get_eligible_orfs() method)

    @type  queryorsbjct: string
    @param queryorsbjct: literal string 'query' or 'sbjct'

    @type  verbose: Boolean
    @param verbose: print debugging info to STDOUT when True

    @rtype:  list
    @return: empty list when no tinyexon is found, otherwise a list with a
             single tuple ( intron1, intron2, tinyexon PacbPORF )
    """
    # input validation
    IsPacbPORF(pacbporfD)
    IsPacbPORF(pacbporfA)

    # edit **kwargs dictionary for some forced attributes
    _update_kwargs(kwargs, KWARGS_PROJECTED_TINYEXON)

    # bounds on the projected donor-to-acceptor distance (in nt) for which
    # a tinyexon is searched
    MAX_TINYEXON_NT_LENGTH = 33
    MIN_TINYEXON_NT_LENGTH = 6

    tinyexons = []
    if queryorsbjct == "query":
        donorOrf = pacbporfD.orfQ
        accepOrf = pacbporfA.orfQ
        prjctOrf = pacbporfD.orfS
        alignedDonorRange = pacbporfD.alignment_dna_range_query()
        alignedAccepRange = pacbporfA.alignment_dna_range_query()
    elif queryorsbjct == "sbjct":
        donorOrf = pacbporfD.orfS
        accepOrf = pacbporfA.orfS
        prjctOrf = pacbporfD.orfQ
        alignedDonorRange = pacbporfD.alignment_dna_range_sbjct()
        alignedAccepRange = pacbporfA.alignment_dna_range_sbjct()
    else:
        message = "'queryorsbjct' (%s), not 'query' or 'sbjct'" % queryorsbjct
        raise InproperlyAppliedArgument, message

    for dObj in donorOrf._donor_sites:
        # do not make a projection OVER the aligned area
        if dObj.pos < min(alignedDonorRange): continue
        if queryorsbjct == "query":
            (dPos, dPhase) = pacbporfD.dnaposition_query(dObj.pos,
                                                         forced_return=True)
        else:
            (dPos, dPhase) = pacbporfD.dnaposition_sbjct(dObj.pos,
                                                         forced_return=True)
        try:
            algDobj = pacbporfD._positions[dPos]
        except IndexError:
            # site out of range of PacbPORF -> break
            break
        for aObj in accepOrf._acceptor_sites:
            # do not make a projection OVER the aligned area
            if aObj.pos > max(alignedAccepRange): continue
            if queryorsbjct == "query":
                (aPos, aPhase) = pacbporfA.dnaposition_query(
                    aObj.pos, forced_return=True)
            else:
                (aPos, aPhase) = pacbporfA.dnaposition_sbjct(
                    aObj.pos, forced_return=True)
            try:
                algAobj = pacbporfA._positions[aPos]
            except IndexError:
                # site out of range of PacbPORF -> break
                break
            # project both sites onto the other sequence
            # (variables are named ...sbjct in both modes)
            if queryorsbjct == "query":
                posDsbjct = algDobj.sbjct_dna_start + dPhase
                posAsbjct = algAobj.sbjct_dna_start + aPhase
            else:
                posDsbjct = algDobj.query_dna_start + dPhase
                posAsbjct = algAobj.query_dna_start + aPhase
            distance = posAsbjct - posDsbjct
            if distance >= MAX_TINYEXON_NT_LENGTH:
                break
            if distance < MIN_TINYEXON_NT_LENGTH:
                continue

            ####################################################
            # generate a ScanForMatches pattern file
            ####################################################
            # example pattern: 6...6 AG NNGNNANNANNGN[2,0,0] GT 3...3
            query = list(prjctOrf.inputgenomicsequence[posDsbjct:posAsbjct])
            # mask all non-phase0 nucleotides to N residues;
            # this represents the regularexpression for a specific
            # peptide sequence
            firstphasepositions = range(3 - dPhase % 3, len(query), 3)
            for pos in range(0, len(query)):
                if pos not in firstphasepositions:
                    query[pos] = "N"
            # calculate a ~50% mismatch number
            # (both operands are ints, so `/` floors under Python 2)
            mismatches = max([0, (len(query) - query.count("N")) / 2])
            # write the pattern to string and subsequently to file
            # example pattern: 6...6 AG NNGNNANNANNGN[2,0,0] GT 3...3
            if kwargs['allow_non_canonical_donor']:
                sfmpat = "%s...%s AG %s[%s,0,0] G (T | C) %s...%s" % (
                    AUSO, AUSO, "".join(query), mismatches, DDSO, DDSO)
            else:
                sfmpat = "%s...%s AG %s[%s,0,0] GT %s...%s" % (
                    AUSO, AUSO, "".join(query), mismatches, DDSO, DDSO)

            ####################################################
            if verbose:
                print(pacbporfD.orfQ.id, pacbporfA.orfQ.id),
                print distance, dObj, aObj
                print sfmpat
            ####################################################

            # pattern file is created in the current working directory
            fname = "sfmpat_tinyexon_%s_%s_%s_%s" % (
                donorOrf.id,
                accepOrf.id,
                posDsbjct,
                posAsbjct,
            )
            fh = open(fname, 'w')
            fh.write(sfmpat + "\n")
            fh.close()

            ####################################################
            # run ScanForMatches
            ####################################################
            # NOTE(review): the complete genomic sequence and the file name
            # are interpolated into a shell pipeline; osPopen is presumably
            # os.popen -- confirm against the file's imports
            command = """echo ">myseq\n%s" | %s %s | tr "[,]" "\t\t#" | """ +\
                      """tr -d "\n " | sed "s/>/\\n>/g" | tr "#" "\t" | """ +\
                      """awk -F'\t' '{ if (NF==4 && $2>%s && $3<%s) """ +\
                      """{ print $1"["$2","$3"]\\n"$4 } }' """
            command = command % (donorOrf.inputgenomicsequence,
                                 EXECUTABLE_SFM, fname,
                                 dObj.pos + (kwargs['min_intron_nt_length'] - 3),
                                 aObj.pos - (kwargs['min_intron_nt_length'] - 3))
            co = osPopen(command)
            matches = parseFasta(co.readlines())
            co.close()

            # filter matches for:
            # (1) correct donor & acceptor phase
            # (2) high enough donor & acceptor site scores
            for hdr, seqmatch in matches.iteritems():
                # the header carries the match coordinates as `[start,stop]`
                # behind a colon (see the awk print statement above)
                startQ, stopQ = [
                    int(item) for item in hdr.split(":")[1][1:-1].split(",")
                ]
                exonQstart = startQ + AUSO + 2 - 1
                exonQstop = stopQ - DDSO - 2

                ####################################
                # get Orf object of tinyexon
                ####################################
                tinyexonorf = None
                # select the Orf on which the tinyexon is located
                for orfObj in orfSetObject.get_eligible_orfs(
                        max_orf_start=exonQstart, min_orf_end=exonQstop):
                    orfPhase = (exonQstart - orfObj.startPY) % 3
                    if orfPhase == dPhase:
                        tinyexonorf = orfObj
                        break
                else:
                    # No tinyexonorf assigned!! In case a regex matched
                    # over a STOP-codon or the regex length is smaller
                    # then the smallest Orf, no Orf can be assigned
                    continue

                # filter for donor & acceptor score
                dScore = _score_splice_site(seqmatch[-9:], splicetype='donor')
                aScore = _score_splice_site(seqmatch[0:11],
                                            splicetype='acceptor')
                if dScore < kwargs['min_donor_pssm_score']:
                    continue
                if aScore < kwargs['min_acceptor_pssm_score']:
                    continue

                # scan Orf for splicesites
                tinyexonorf.scan_orf_for_pssm_splice_sites(
                    splicetype="donor",
                    min_pssm_score=kwargs['min_donor_pssm_score'],
                    allow_non_canonical=kwargs['allow_non_canonical_donor'],
                    non_canonical_min_pssm_score=kwargs[
                        'non_canonical_min_donor_pssm_score'])

                tinyexonorf.scan_orf_for_pssm_splice_sites(
                    splicetype="acceptor",
                    min_pssm_score=kwargs['min_acceptor_pssm_score'],
                    allow_non_canonical=kwargs['allow_non_canonical_acceptor'],
                    non_canonical_min_pssm_score=kwargs[
                        'non_canonical_min_acceptor_pssm_score'])

                # get the acceptor object of the 1th intron
                intron1_aObj = None
                for a in tinyexonorf._acceptor_sites:
                    if a.pos == exonQstart:
                        intron1_aObj = a
                        break
                else:
                    # pseudo-acceptorsite as found be SFM regex
                    # is not a valid acceptor site of high enough score
                    # continue to next iteration of (hdr,seqmatch) pair
                    continue

                # get the donor object of the 2th intron
                intron2_dObj = None
                for d in tinyexonorf._donor_sites:
                    if d.pos == exonQstop:
                        intron2_dObj = d
                        break
                else:
                    # pseudo-donorsite as found be SFM regex
                    # is not a valid donor site of high enough score
                    # continue to next iteration of (hdr,seqmatch) pair
                    continue

                # check if introns are of elegiable lengths
                if (intron1_aObj.pos - dObj.pos) > kwargs[
                        'max_intron_nt_length']:
                    continue
                if (aObj.pos - intron2_dObj.pos) > kwargs[
                        'max_intron_nt_length']:
                    continue

                ####################################################
                # NOTE(review): `True or verbose` is always true, so every
                # candidate is printed regardless of the verbose argument
                if True or verbose:
                    # if here, a candidate!!!
                    print(pacbporfD.orfQ.id, tinyexonorf.id,
                          pacbporfA.orfQ.id),
                    print hdr, dScore, aScore
                    print seqmatch
                ####################################################

                # append to found tinyexons
                query_data = (tinyexonorf, exonQstart, exonQstop)
                sbjct_data = (prjctOrf, posDsbjct, posAsbjct)
                splicesite_data = (dObj, intron1_aObj, intron2_dObj, aObj)
                tinyexons.append((query_data, sbjct_data, splicesite_data))

            # file cleanup
            # NOTE(review): not reached when an exception is raised after
            # the pattern file was written
            osRemove(fname)

    # return - End Of Function - if no tinyexons are found
    if not tinyexons:
        return []

    ####################################
    # select the **best** tinyexon
    ####################################
    # (the first collected candidate is taken; no ranking is applied here)
    (query_data, sbjct_data, splicesite_data) = tinyexons[0]
    orfQ, query_dna_start, query_dna_end = query_data
    orfS, sbjct_dna_start, sbjct_dna_end = sbjct_data
    (intron1_dObj, intron1_aObj, intron2_dObj, intron2_aObj) = splicesite_data

    ####################################################
    if verbose:
        print "tinyexon orf:", orfQ
        print "tinyexon orf:", intron1_aObj
        print "tinyexon orf:", intron2_dObj
    ####################################################

    ####################################
    # make tinyexon PacbPORF
    ####################################
    # take one extra AA position on both sides of the tinyexon ...
    startQaa = orfQ.dnapos2aapos(query_dna_start) - 1
    startSaa = orfS.dnapos2aapos(sbjct_dna_start) - 1
    stopQaa = orfQ.dnapos2aapos(query_dna_end) + 1
    stopSaa = orfS.dnapos2aapos(sbjct_dna_end) + 1
    # ... and shrink again (on both sequences simultaneously) where this
    # runs over the protein boundaries of either Orf
    # check for directly leading stop codon on tinyexon
    while startQaa <= orfQ.protein_startPY:
        startQaa += 1
        startSaa += 1
        query_dna_start += 3
        sbjct_dna_start += 3
    while startSaa <= orfS.protein_startPY:
        startQaa += 1
        startSaa += 1
        query_dna_start += 3
        sbjct_dna_start += 3
    # check for directly tailing stop codon on tinyexon
    while stopQaa > orfQ.protein_endPY:
        stopQaa -= 1
        stopSaa -= 1
        query_dna_end -= 3
        sbjct_dna_end -= 3
    while stopSaa > orfS.protein_endPY:
        stopQaa -= 1
        stopSaa -= 1
        query_dna_end -= 3
        sbjct_dna_end -= 3
    # get sequences
    qAAseq = orfQ.getaas(abs_pos_start=startQaa, abs_pos_end=stopQaa)
    sAAseq = orfS.getaas(abs_pos_start=startSaa, abs_pos_end=stopSaa)

    ####################################################
    if verbose or len(qAAseq) != len(sAAseq):
        # if unequal lengths, error will be raised upon PacbP.__init__()
        print orfQ, qAAseq, startQaa, stopQaa, (stopQaa - startQaa),
        print(query_dna_start, query_dna_end)
        print orfS, sAAseq, startSaa, stopSaa, (stopSaa - startSaa),
        print(sbjct_dna_start, sbjct_dna_end)
        print orfQ.inputgenomicsequence[query_dna_start - 2:query_dna_end + 2]
        print orfS.inputgenomicsequence[sbjct_dna_start - 2:sbjct_dna_end + 2]
    ####################################################

    # initialize extended tinyexon PacbPORF
    from pacb import PacbP
    pacbp = PacbP(input=(qAAseq, sAAseq, startQaa, startSaa))
    pacbp.strip_unmatched_ends()
    pacbporf = pacbp2pacbporf(pacbp, orfQ, orfS)
    pacbporf.extend_pacbporf_after_stops()
    pacbporf.source = 'ABGPprojectingTE'

    ####################################
    # make introns
    ####################################
    intron1 = IntronConnectingOrfs(intron1_dObj, intron1_aObj, None, donorOrf,
                                   pacbporf.orfQ)
    intron2 = IntronConnectingOrfs(intron2_dObj, intron2_aObj, None,
                                   pacbporf.orfQ, accepOrf)

    ################################################################
    # set some meta-data properties to the intron objects
    ################################################################
    # add distance score to intron
    intron1._distance = 0
    intron2._distance = 0

    # add Alignment Positional Periphery Score into objects
    # (the returned value `succes` is not inspected)
    if queryorsbjct == "query":
        succes = set_apps_intron_query(intron1, pacbporfD, pacbporf)
        succes = set_apps_intron_query(intron2, pacbporf, pacbporfA)
    else:
        succes = set_apps_intron_sbjct(intron1, pacbporfD, pacbporf)
        succes = set_apps_intron_sbjct(intron2, pacbporf, pacbporfA)

    # set GFF fsource attribute for recognition of intron sources
    intron1._gff['fsource'] = "ABGPprojectingTE"
    intron2._gff['fsource'] = "ABGPprojectingTE"

    # create _linked_to_xxx attributes
    intron1._linked_to_pacbporfs = [pacbporf]
    intron2._linked_to_pacbporfs = [pacbporf]
    intron1._linked_to_introns = [intron2]
    intron2._linked_to_introns = [intron1]

    ####################################################
    if verbose:
        print pacbporf
        pacbporf.print_protein_and_dna()
        print intron1
        print intron2
        # disabled (`if False`) debugging aid; exits the program when enabled
        if False:
            # printing data when this function needs to be debugged:
            print ""
            print intron1
            print intron2
            print ""
            print pacbporfD
            pacbporfD.print_protein_and_dna()
            print ""
            print pacbporf
            pacbporf.print_protein_and_dna()
            print ""
            print pacbporfA
            pacbporfA.print_protein_and_dna()
            import sys
            sys.exit()
    ####################################################

    # return introns and intermediate tinyexon PacbPORF
    return [(intron1, intron2, pacbporf)]
def _merge_pacbporfs_by_intron(pfD, pfA, queryorsbjct, verbose=False, **kwargs): """ Project splicesites from SBJCT intron on continious QUERY PacbPORFs @type pfD: PacbPORF object @param pfD: PacbPORF object that has to deliver (aligned) donor sites @type pfA: PacbPORF object @param pfA: PacbPORF object that has to deliver (aligned) acceptor sites @type queryorsbjct: string @param queryorsbjct: literal string 'query' or 'sbjct' @type verbose: Boolean @param verbose: print debugging info to STDOUT when True @attention: see pacb.connecting.merge_orfs_with_intron for **kwargs) @rtype: list @return: list with ProjectedIntrons (from Sbjct on Query) """ # input validation IsPacbPORF(pfD) IsPacbPORF(pfA) # edit **kwargs dictionary for some forced attributes _update_kwargs(kwargs, KWARGS_PROJECTED_INTRON) ### if not kwargs.has_key('projected_intron_max_nt_offset'): ### kwargs['projected_intron_max_nt_offset'] = PROJECTED_INTRON_MAX_NT_OFFSET ### if not kwargs.has_key('projected_intron_max_aa_offset'): ### kwargs['projected_intron_max_aa_offset'] = PROJECTED_INTRON_MAX_AA_OFFSET # settings for minimal alignment entropy score min_donor_site_alignment_entropy = 0.0 min_acceptor_site_alignment_entropy = 0.0 ELEGIABLE_SPLICE_SITE_AA_RANGE = 75 sposD = pfD._get_original_alignment_pos_start() eposD = pfD._get_original_alignment_pos_end() sposA = pfA._get_original_alignment_pos_start() eposA = pfA._get_original_alignment_pos_end() if queryorsbjct == "query": # Orfs of SBJCT must be identical IsIdenticalOrfs(pfD.orfS, pfA.orfS) donorOrf = pfD.orfQ accepOrf = pfA.orfQ prjctOrf = pfD.orfS # pfD.orfS == pfA.orfS dStart = sposD.query_dna_start # ALIGNED start of donorPacbPORF dEnd = pfD.query_dna_end # ABSOLUTE end of donorPacbPORF aStart = pfA.query_dna_start # ABSOLUTE start of acceptorPacbPORF aEnd = eposA.query_dna_end # ALIGNED end of acceptorPacbPORF outOfAlignmentAttribute = "sbjct_dna_start" # calculate elegiable splice site range qdr = pfD.alignment_dna_range_query() qar = 
pfA.alignment_dna_range_query() min_donor_pos = max( [min(qdr), max(qdr) - (ELEGIABLE_SPLICE_SITE_AA_RANGE * 3)]) max_accep_pos = min( [max(qar), min(qar) + (ELEGIABLE_SPLICE_SITE_AA_RANGE * 3)]) elif queryorsbjct == "sbjct": # Orfs of QUERY must be identical IsIdenticalOrfs(pfD.orfQ, pfA.orfQ) donorOrf = pfD.orfS accepOrf = pfA.orfS prjctOrf = pfD.orfQ # pfD.orfQ == pfA.orfQ dStart = sposD.sbjct_dna_start # ALIGNED start of donorPacbPORF dEnd = pfD.sbjct_dna_end # ABSOLUTE end of donorPacbPORF aStart = pfA.sbjct_dna_start # ABSOLUTE start of acceptorPacbPORF aEnd = eposA.sbjct_dna_end # ALIGNED end of acceptorPacbPORF outOfAlignmentAttribute = "query_dna_start" # calculate elegiable splice site range sdr = pfD.alignment_dna_range_sbjct() sar = pfA.alignment_dna_range_sbjct() min_donor_pos = max( [min(sdr), max(sdr) - (ELEGIABLE_SPLICE_SITE_AA_RANGE * 3)]) max_accep_pos = min( [max(sar), min(sar) + (ELEGIABLE_SPLICE_SITE_AA_RANGE * 3)]) else: message = "'queryorsbjct' (%s), not 'query' or 'sbjct'" % queryorsbjct raise InproperlyAppliedArgument, message # predict introns only in `queryorsbjct` Orfs # introns is a list of IntronConnectingOrfs objects introns = merge_orfs_with_intron(donorOrf, accepOrf, min_donor_pos=min_donor_pos, max_acceptor_pos=max_accep_pos, order_by='length', **kwargs) # return list with projected introns projected_introns = [] # gather unique donor and acceptor positions from list # of IntronConnectingOrfs for intron in introns: # break if intron is to large if kwargs['max_intron_nt_length'] and intron.length > kwargs[ 'max_intron_nt_length']: break # continue if intron is to small if kwargs['min_intron_nt_length'] and intron.length < kwargs[ 'min_intron_nt_length']: continue # continue if intron has non-canonical features # check if intron.start is on pfD; # inframe-introns can be projected outside of pfD/pfA area if intron.start <= dStart: continue if intron.start >= dEnd: continue # check if intron.end is on pfA; # inframe-introns can be 
projected outside of pfD/pfA area if intron.end <= aStart: continue if intron.end >= aEnd: continue if queryorsbjct == "sbjct": # get positions of donor & acceptor in the PacbPORF alignment donorPositionPos, phaseD = pfD.dnaposition_sbjct( intron.donor.pos, forced_return=True) accepPositionPos, phaseA = pfA.dnaposition_sbjct( intron.acceptor.pos, forced_return=True) # calculate projected distance on QUERY posDposQuery = pfD._positions[donorPositionPos].query_pos posAposQuery = pfA._positions[accepPositionPos].query_pos aaDistance = posAposQuery - posDposQuery else: # get positions of donor & acceptor in the PacbPORF alignment donorPositionPos, phaseD = pfD.dnaposition_query( intron.donor.pos, forced_return=True) accepPositionPos, phaseA = pfA.dnaposition_query( intron.acceptor.pos, forced_return=True) # calculate binary entropy from projected position on SBJCT posDposSbjct = pfD._positions[donorPositionPos].sbjct_pos posAposSbjct = pfA._positions[accepPositionPos].sbjct_pos aaDistance = posAposSbjct - posDposSbjct # calculate binary entropy score entropyDonorSbjct = pfD.alignment_entropy(donorPositionPos, method='donor') entropyAcceptorSbjct = pfA.alignment_entropy(accepPositionPos, method='acceptor') # do distance check upon (projected) intron acceptance if abs(aaDistance) <= kwargs['max_aa_offset']: # check if we've runned out of the aligned part outofalignedpacbporf = False # get the projected donor position; mind the gap on this spot ;-) while pfD._positions[ donorPositionPos].isa_gap and donorPositionPos > 0: donorPositionPos -= 1 else: projected_donor_position = getattr( pfD._positions[donorPositionPos], outOfAlignmentAttribute) + phaseD if donorPositionPos == 0 and pfD._positions[ donorPositionPos].isa_gap: print "WarningThatIsTackled::outofalignedpacbporf::donor" outofalignedpacbporf = True # get the projected acceptor position; mind the gap on this spot ;-) while pfA._positions[accepPositionPos].isa_gap and len( pfA._positions) > accepPositionPos + 1: 
accepPositionPos += 1 else: projected_accep_position = getattr( pfA._positions[accepPositionPos], outOfAlignmentAttribute) + phaseA if accepPositionPos == len( pfA._positions ) - 1 and pfA._positions[accepPositionPos].isa_gap: print "WarningThatIsTackled::outofalignedpacbporf::acceptor" outofalignedpacbporf = True if not outofalignedpacbporf: ################################################################ # set some meta-data properties to the intron object ################################################################ # add distance score to intron intron._distance = abs(aaDistance) * 3 # add Alignment Positional Periphery Score into objects if queryorsbjct == "query": succes = set_apps_intron_query(intron, pfD, pfA) else: succes = set_apps_intron_sbjct(intron, pfD, pfA) # set GFF fsource attribute for recognition of intron sources intron._gff['fsource'] = "ABGPprojecting" # make a ProjectedIntronConnectingOrfs object pico = ProjectedIntronConnectingOrfs(prjctOrf, projected_donor_position, projected_accep_position) intron.binary_entropy_donor = entropyDonorSbjct intron.binary_entropy_acceptor = entropyAcceptorSbjct pico.add_projected_intron(intron) pico.phase = intron.phase projected_introns.append(pico) ################################################################ if verbose: print "PROJ::", intron._distance, print(pfD.orfQ.id, pfA.orfQ.id), print(pfD.orfS.id, pfA.orfS.id), print "%s-%snt" % (intron.donor.pos, intron.acceptor.pos), print "%2.1f,%2.1f" % (intron.donor.pssm_score, intron.acceptor.pssm_score), print "%2.1f,%2.1f" % (intron.binary_entropy_donor, intron.binary_entropy_acceptor) ################################################################ if aaDistance > kwargs['max_aa_offset']: # break out; ordered by length can never result in # a proper projected intron break # filter out less relevant ones compared to complete set of results projected_introns = _filter_projected_introns(projected_introns) # and return a list of 
ProjectedIntronConnectingOrfs return projected_introns
def merge_pacbporfs_with_closeby_independant_introns(pacbporfD, pacbporfA, verbose=False, **kwargs): """ Merge 2 PacbPORF objects by closeby independant gained introns @attention: see pacb.connecting.merge_orfs_with_intron for **kwargs) @type pacbporfD: PacbPORF object @param pacbporfD: PacbPORF object that has to deliver PSSM donor objects @type pacbporfA: PacbPORF object @param pacbporfA: PacbPORF object that has to deliver PSSM acceptor objects @type verbose: Boolean @param verbose: print status/debugging messages to STDOUT @rtype: list @return: list with ( intronQ, intronS, CIGexonPacbPORF ) """ # input validation IsPacbPORF(pacbporfD) IsPacbPORF(pacbporfA) # edit **kwargs dictionary for some forced attributes kwargs['allow_phase_shift'] = True _update_kwargs(kwargs, KWARGS_CLOSEBY_INDEPENDANT_INTRON_GAIN) if not kwargs.has_key('aligned_site_max_triplet_distance'): kwargs['aligned_site_max_triplet_distance'] = kwargs[ 'cig_max_aa_length'] # run regular merge_pacbporfs_with_introns function alg_introns = merge_pacbporfs_with_introns(pacbporfD, pacbporfA, verbose=verbose, **kwargs) cig_introns = [] if verbose: print "introns::", len(alg_introns), "cig_max_aa_length:", kwargs[ 'cig_max_aa_length'], kwargs['aligned_site_max_triplet_distance'] # check if there is length congruence between the cig_introns for intQ, intS in alg_introns: dQpos, dQphase = pacbporfD.dnaposition_query(intQ.donor.pos, forced_return=True) dSpos, dSphase = pacbporfD.dnaposition_sbjct(intS.donor.pos, forced_return=True) aQpos, aQphase = pacbporfA.dnaposition_query(intQ.acceptor.pos, forced_return=True) aSpos, aSphase = pacbporfA.dnaposition_sbjct(intS.acceptor.pos, forced_return=True) distDnt = (dQpos * 3 + dQphase) - (dSpos * 3 + dSphase) distAnt = (aQpos * 3 + aQphase) - (aSpos * 3 + aSphase) ######################################################################## if verbose: print(intQ.donor.pos, intQ.acceptor.pos), print(intS.donor.pos, intS.acceptor.pos), print distDnt, distAnt, 
kwargs['max_nt_offset'] ######################################################################## if abs(distDnt - distAnt) > kwargs['max_nt_offset']: # intermediate ciigPacbPORF has query vs sbjct length discrepancy # *3 for AA2nt coordinate conversion, +2 to allow different phases # e.g. phase difference can give 1AA+2nt difference continue if intQ.donor.phase == intS.donor.phase and\ (distDnt/3) <= kwargs['aligned_site_max_triplet_distance']: # a regularly merged intron combination continue if intQ.acceptor.phase == intS.acceptor.phase and\ (distAnt/3) <= kwargs['aligned_site_max_triplet_distance']: # a regularly merged intron combination continue if abs(distDnt) <= 5 or abs(distDnt) <= 5: # most likely a splice site phase shift, not a c.i.g. continue if abs(distDnt/3) >= kwargs['cig_min_aa_length'] and\ abs(distAnt/3) >= kwargs['cig_min_aa_length'] and\ abs(distDnt/3) <= kwargs['cig_max_aa_length'] and\ abs(distAnt/3) <= kwargs['cig_max_aa_length']: # putatively a closeby independant (intron) gain cig_introns.append((intQ, intS)) ############################################################################ if verbose: for intQ, intS in cig_introns: print "cig?:", (intQ.donor.pos, intQ.acceptor.pos), print(intS.donor.pos, intS.acceptor.pos) ############################################################################ # return variable to store found positive cases of CIG into found_cig_list = [] # check if there is some sequence similarity for intQ, intS in cig_introns: # get alignment positions around query & sbjcts splice sites dQpos, dQphase = pacbporfD.dnaposition_query(intQ.donor.pos, forced_return=True) dSpos, dSphase = pacbporfD.dnaposition_sbjct(intS.donor.pos, forced_return=True) aQpos, aQphase = pacbporfA.dnaposition_query(intQ.acceptor.pos, forced_return=True) aSpos, aSphase = pacbporfA.dnaposition_sbjct(intS.acceptor.pos, forced_return=True) distD = dQpos - dSpos distA = aQpos - aSpos distDnt = (dQpos * 3 + dQphase) - (dSpos * 3 + dSphase) distAnt = 
(aQpos * 3 + aQphase) - (aSpos * 3 + aSphase) if distDnt > 0: # then, distAnt is as well > 0 # QUERY is extended on the donor side #mode = "SQ" #qStart = pacbporfD._positions[dSpos].query_pos #qEnd = qStart + distD #sStart = pacbporfA._positions[aSpos].sbjct_pos #sEnd = sStart + distD #qSeq = pacbporfD.orfQ.getaas(abs_pos_start=qStart,abs_pos_end=qEnd) #sSeq = pacbporfA.orfS.getaas(abs_pos_start=sStart,abs_pos_end=sEnd) mode = "SQ" qEnd = pacbporfD.orfQ.dnapos2aapos(intQ.donor.pos) qStart = qEnd - max([distA, distD]) sStart = pacbporfA.orfS.dnapos2aapos(intS.acceptor.pos) sEnd = sStart + max([distA, distD]) qSeq = pacbporfD.orfQ.getaas(abs_pos_start=qStart, abs_pos_end=qEnd) sSeq = pacbporfA.orfS.getaas(abs_pos_start=sStart, abs_pos_end=sEnd) else: # distDnt and distAnt are < 0 ## SBJCT is extended on the donor site #mode = "QS" #qStart = pacbporfA._positions[aQpos].query_pos #qEnd = qStart - distA #sStart = pacbporfD._positions[dQpos].sbjct_pos #sEnd = sStart - distA #qSeq = pacbporfA.orfQ.getaas(abs_pos_start=qStart, abs_pos_end=qEnd) #sSeq = pacbporfD.orfS.getaas(abs_pos_start=sStart, abs_pos_end=sEnd) mode = "QS" qStart = pacbporfA.orfQ.dnapos2aapos(intQ.acceptor.pos) qEnd = qStart - min([distA, distD]) sEnd = pacbporfD.orfS.dnapos2aapos(intS.donor.pos) sStart = sEnd + min([distA, distD]) qSeq = pacbporfA.orfQ.getaas(abs_pos_start=qStart, abs_pos_end=qEnd) sSeq = pacbporfD.orfS.getaas(abs_pos_start=sStart, abs_pos_end=sEnd) headerQ = "query_%s_%s_%s" % (qStart, qEnd, qSeq) headerS = "sbjct_%s_%s_%s" % (sStart, sEnd, sSeq) headerQ = headerQ[0:20] # truncate to prevent error headerS = headerS[0:20] # truncate to prevent error if verbose: print mode, ( distD, distA), qSeq, sSeq, headerQ, headerS, distDnt, distAnt, print dQpos, aQpos, dSpos, aSpos if not qSeq: continue # superfluous check-doublecheck for sequence if not sSeq: continue # superfluous check-doublecheck for sequence #################################################### # make PacbPORF with ClustalW 
#################################################### # align the sequences with clustalw seqs = {headerQ: qSeq, headerS: sSeq} (alignedseqs, alignment) = clustalw(seqs=seqs) # make pacbp from clustalw alignment pacbp = pacbp_from_clustalw(alignment=(alignedseqs[headerQ], alignment, alignedseqs[headerS]), coords=(qStart, qEnd, sStart, sEnd)) if not pacbp: continue # strip unaligned fraction of this pacbp object, then check length pacbp.strip_unmatched_ends() if len(pacbp) < kwargs['cig_min_aa_length']: continue if len(pacbp) > kwargs['cig_max_aa_length']: continue if pacbp: # initialize extended tiny PacbPORF caused by c.i.g. if distDnt > 0: cig_pacbporf = pacbp2pacbporf(pacbp, pacbporfD.orfQ, pacbporfA.orfS) else: cig_pacbporf = pacbp2pacbporf(pacbp, pacbporfA.orfQ, pacbporfD.orfS) cig_pacbporf.extend_pacbporf_after_stops() #################################################################### if verbose: print pacbp, len(pacbp) print cig_pacbporf print "CIG:", intQ print "CIG:", intS print distD, distA, distDnt, distAnt cig_pacbporf.print_protein_and_dna() #################################################################### #################################################################### # set some meta-data properties to the intron objects #################################################################### # add distance score to introns # The distance set in merge_pacbporfs_with_introns is large; # it is the actual distance between the splice sites. 
In CIG, # the measure for distance is the length difference between # the offset between query and sbjct measured on the cig_pacbporf intQ._distance = abs(distDnt - distAnt) intS._distance = abs(distDnt - distAnt) if distDnt > 0: # then, distAnt is as well > 0 # QUERY is extended on the donor side # add Alignment Positional Periphery Score into objects succes = set_apps_intron_query(intQ, cig_pacbporf, pacbporfA) succes = set_apps_intron_sbjct(intS, pacbporfD, cig_pacbporf) else: # SBJCT is extended on the donor side # add Alignment Positional Periphery Score into objects succes = set_apps_intron_query(intQ, pacbporfD, cig_pacbporf) succes = set_apps_intron_sbjct(intS, cig_pacbporf, pacbporfA) # set GFF fsource attribute for recognition of intron sources intQ._gff['fsource'] = "ABGPcig" intS._gff['fsource'] = "ABGPcig" # create _linked_to_xxx attributes intQ._linked_to_pacbporfs = [cig_pacbporf] intS._linked_to_pacbporfs = [cig_pacbporf] # append to found_cig_list found_cig_list.append((intQ, intS, cig_pacbporf)) else: # no alignment possible -> try next continue # return lists of closeby_independant_introns return found_cig_list
def merge_pacbporfs_by_tinyexons(pacbporfD, pacbporfA, orfSetObjQ, orfSetObjS, verbose=False, **kwargs): """ """ # input validation IsPacbPORF(pacbporfD) IsPacbPORF(pacbporfA) # edit **kwargs dictionary for some forced attributes _update_kwargs(kwargs, KWARGS_MAPPED_INTRON) if not kwargs.has_key('aligned_site_max_triplet_distance'): kwargs['aligned_site_max_triplet_distance'] = kwargs['max_aa_offset'] # settings for minimal alignment entropy score min_donor_site_alignment_entropy = 0.0 min_acceptor_site_alignment_entropy = 0.0 resultlistQ = merge_orfs_with_tinyexon( pacbporfD.orfQ, pacbporfA.orfQ, preceding_donor_sites=pacbporfD.orfQ._donor_sites, subsequent_acceptor_sites=pacbporfA.orfQ._acceptor_sites, orflist=orfSetObjQ.orfs, **kwargs) resultlistS = merge_orfs_with_tinyexon( pacbporfD.orfS, pacbporfA.orfS, preceding_donor_sites=pacbporfD.orfS._donor_sites, subsequent_acceptor_sites=pacbporfA.orfS._acceptor_sites, orflist=orfSetObjS.orfs, **kwargs) # translate resultlists to dict: key == exon, value = [ {intronsD},{intronsS} ] resultdictQ, key2exonQ = _tinyexon_list_2_dict(resultlistQ) resultdictS, key2exonS = _tinyexon_list_2_dict(resultlistS) # get unique list of donors & acceptors donorQ = olba(list(Set([inD.donor for inD, te, inA in resultlistQ])), order_by='pos') donorS = olba(list(Set([inD.donor for inD, te, inA in resultlistS])), order_by='pos') accepQ = olba(list(Set([inA.acceptor for inD, te, inA in resultlistQ])), order_by='pos') accepS = olba(list(Set([inA.acceptor for inD, te, inA in resultlistS])), order_by='pos') ## filter for alignable donor & acceptor sites kwargs['allow_non_canonical'] = True # True kwargs['aligned_site_max_triplet_distance'] = 0 # 2 algdonors = _filter_for_alignable_splice_sites(donorQ, donorS, pacbporfD, **kwargs) algacceps = _filter_for_alignable_splice_sites(accepQ, accepS, pacbporfA, **kwargs) # settings for minimal alignment entropy score # TODO TODO -> THIS MUST BE FIXED TO A NICE THRESHOLD VALUE!!! 
min_donor_site_alignment_entropy = 0.1 min_acceptor_site_alignment_entropy = 0.1 # remove sites with to low alignment entropy algdonors = _filter_for_entropy( algdonors, pacbporfD, 'donor', min_alignment_entropy=min_donor_site_alignment_entropy) algacceps = _filter_for_entropy( algacceps, pacbporfA, 'acceptor', min_alignment_entropy=min_acceptor_site_alignment_entropy) # return list: intronQD,intronSD,tinyexon,intronAQ,intronAS return_list = [] ############################################################################ if verbose: print "bridges constructed: ORFS:", print(pacbporfD.orfQ.id, pacbporfA.orfQ.id), print(pacbporfD.orfS.id, pacbporfA.orfS.id), print len(resultdictQ), len(resultdictS), print(len(resultlistQ), len(donorQ), len(accepQ)), print(len(resultlistS), len(donorS), len(accepS)), print(len(algdonors), len(algacceps)) ############################################################################ for keyQ, tinyexonQ in key2exonQ.iteritems(): for keyS, tinyexonS in key2exonS.iteritems(): if tinyexonQ.donor.phase != tinyexonS.donor.phase: continue if tinyexonQ.acceptor.phase != tinyexonS.acceptor.phase: continue if tinyexonQ.length != tinyexonS.length: continue # if here, then tinyexons of identical structure #################################################################### if verbose: print tinyexonQ.length, tinyexonQ.donor.phase, print(len(resultdictQ[keyQ][0]), len(resultdictQ[keyQ][1])), print(len(resultdictS[keyS][0]), len(resultdictS[keyS][1])), print tinyexonQ, print tinyexonQ.proteinsequence(), tinyexonS.proteinsequence(), print tinyexonS.acceptor.pssm_score + tinyexonS.donor.pssm_score #################################################################### donor_introns = [] acceptor_introns = [] for intronDQkey, intronDQ in resultdictQ[keyQ][0].iteritems(): if intronDQ.donor.pos not in [dQ.pos for dQ, dS in algdonors]: continue for intronDSkey, intronDS in resultdictS[keyS][0].iteritems(): if intronDS.donor.pos not in [ dS.pos for dQ, dS in 
algdonors ]: continue # check if they exists as aligned sites alignedkey = (intronDQ.donor.pos, intronDS.donor.pos) if alignedkey not in [(dQ.pos, dS.pos) for dQ, dS in algdonors]: continue # if here, we have a set of introns 5' of the tinyexon # which are perfectly alignable! donor_introns.append((intronDQ, intronDS)) for intronAQkey, intronAQ in resultdictQ[keyQ][1].iteritems(): if intronAQ.acceptor.pos not in [ aQ.pos for aQ, aS in algacceps ]: continue for intronASkey, intronAS in resultdictS[keyS][1].iteritems(): if intronAS.acceptor.pos not in [ aS.pos for aQ, aS in algacceps ]: continue # check if they exists as aligned sites alignedkey = (intronAQ.acceptor.pos, intronAS.acceptor.pos) if alignedkey not in [(aQ.pos, aS.pos) for aQ, aS in algacceps]: continue # if here, we have a set of introns 3' of the tinyexon # which are perfectly alignable! acceptor_introns.append((intronAQ, intronAS)) if not len(donor_introns) or not len(acceptor_introns): # no aligned 5' && aligned 3' introns continue # initialize extended tinyexon PacbPORF from pacb import PacbP pacbp = PacbP(input=( tinyexonQ.proteinsequence(), tinyexonS.proteinsequence(), tinyexonQ.protein_start(), tinyexonS.protein_start(), )) pacbp.strip_unmatched_ends() # continue if no fraction could be aligned if len(pacbp) == 0: continue tinypacbporf = pacbp2pacbporf(pacbp, tinyexonQ.orf, tinyexonS.orf) tinypacbporf.extend_pacbporf_after_stops() #################################################################### if verbose: print tinypacbporf tinypacbporf.print_protein_and_dna() print len(donor_introns), len(acceptor_introns), print max([ dQ.donor.pssm_score + dS.donor.pssm_score for dQ, dS in donor_introns ]), print max([ aQ.acceptor.pssm_score + aS.acceptor.pssm_score for aQ, aS in acceptor_introns ]) #################################################################### # if here, we have accepted tinyexon bridges! 
# gather them and store to return_list for intronDQkey, intronDQ in resultdictQ[keyQ][0].iteritems(): if intronDQ.donor.pos not in [dQ.pos for dQ, dS in algdonors]: continue for intronDSkey, intronDS in resultdictS[keyS][0].iteritems(): if intronDS.donor.pos not in [ dS.pos for dQ, dS in algdonors ]: continue for intronAQkey, intronAQ in resultdictQ[keyQ][ 1].iteritems(): if intronAQ.acceptor.pos not in [ aQ.pos for aQ, aS in algacceps ]: continue for intronASkey, intronAS in resultdictS[keyS][ 1].iteritems(): if intronAS.acceptor.pos not in [ aS.pos for aQ, aS in algacceps ]: continue #################################################### # set some meta-data properties to the intron objects #################################################### _score_introns_obtained_by_mapping( intronDQ, intronDS, pacbporfD, tinypacbporf, source='ABGPmappingTE') _score_introns_obtained_by_mapping( intronAQ, intronAS, tinypacbporf, pacbporfA, source='ABGPmappingTE') # create _linked_to_xxx attributes intronDQ._linked_to_pacbporfs = [tinypacbporf] intronAQ._linked_to_pacbporfs = [tinypacbporf] intronDS._linked_to_pacbporfs = [tinypacbporf] intronAS._linked_to_pacbporfs = [tinypacbporf] intronDQ._linked_to_introns = [intronAQ] intronAQ._linked_to_introns = [intronDQ] intronDS._linked_to_introns = [intronAS] intronAS._linked_to_introns = [intronDS] # append to tmp result list return_list.append( (intronDQ, intronDS, tinypacbporf, intronAQ, intronAS)) # check if there are >1 candidate tiny exons # currently, we choose only to return the **best** mapped tinyexon if len(return_list) == 0: pass elif len(return_list) == 1: pass else: # only take the highest scoring candidate here min_distance = min([(a._distance + d._distance) for a, b, c, d, e in return_list]) pos2score = [] for (intronDQ, intronDS, tinypacbporf, intronAQ, intronAS) in return_list: if (intronDQ._distance + intronAQ._distance) > min_distance: pos2score.append(0.0) else: # calculate overall pssm score total_pssm = 0.0 
total_pssm += intronDQ.donor.pssm_score total_pssm += intronDQ.acceptor.pssm_score total_pssm += intronDS.donor.pssm_score total_pssm += intronDS.acceptor.pssm_score total_pssm += intronAQ.donor.pssm_score total_pssm += intronAQ.acceptor.pssm_score total_pssm += intronAS.donor.pssm_score total_pssm += intronAS.acceptor.pssm_score pos2score.append(total_pssm) # get highest score and linked tinyexon max_score = max(pos2score) return_list = [return_list[pos2score.index(max_score)]] ############################################################################ # some printing in verbose mode if verbose and return_list: (intronDQ, intronDS, tinypacbporf, intronAQ, intronAS) = return_list[0] print "BEST MAPPED TINYEXON:" print tinypacbporf print tinypacbporf.query, intronDQ._distance, intronAQ._distance, print(intronDQ.donor.pos, intronDQ.acceptor.pos), print(intronDS.donor.pos, intronDS.acceptor.pos), print(intronAQ.donor.pos, intronAQ.acceptor.pos), print(intronAS.donor.pos, intronAS.acceptor.pos) ############################################################################ # return the result list return return_list
def merge_pacbporfs_with_introns(pacbporfD, pacbporfA, verbose=False, **kwargs): """ Merge 2 PacbPORF objects by introns @attention: see orfs.merge_orfs_with_intron for **kwargs @attention: see functions._filter_for_alignable_splice_sites for **kwargs @attention: see functions._filter_for_entropy for **kwargs @type pacbporfD: PacbPORF object @param pacbporfD: PacbPORF object that has to deliver PSSM donor objects @type pacbporfA: PacbPORF object @param pacbporfA: PacbPORF object that has to deliver PSSM acceptor objects @type verbose: Boolean @param verbose: print status/debugging messages to STDOUT @rtype: list @return: list with ( intron, intron ), in query and sbjct """ # input validation IsPacbPORF(pacbporfD) IsPacbPORF(pacbporfA) # edit **kwargs dictionary for some forced attributes _update_kwargs(kwargs, KWARGS_MAPPED_INTRON) if not kwargs.has_key('aligned_site_max_triplet_distance'): kwargs['aligned_site_max_triplet_distance'] = kwargs['max_aa_offset'] # settings for minimal alignment entropy score min_donor_site_alignment_entropy = 0.0 min_acceptor_site_alignment_entropy = 0.0 # calculate maximal/minimal donor/acceptor site position based on alignment ELEGIABLE_SPLICE_SITE_AA_RANGE = 75 qdr = pacbporfD.alignment_dna_range_query() qar = pacbporfA.alignment_dna_range_query() min_donor_query_pos = max( [min(qdr), max(qdr) - (ELEGIABLE_SPLICE_SITE_AA_RANGE * 3)]) max_accep_query_pos = min( [max(qar), min(qar) + (ELEGIABLE_SPLICE_SITE_AA_RANGE * 3)]) sdr = pacbporfD.alignment_dna_range_sbjct() sar = pacbporfA.alignment_dna_range_sbjct() min_donor_sbjct_pos = max( [min(sdr), max(sdr) - (ELEGIABLE_SPLICE_SITE_AA_RANGE * 3)]) max_accep_sbjct_pos = min( [max(sar), min(sar) + (ELEGIABLE_SPLICE_SITE_AA_RANGE * 3)]) # get list of introns #intronsQ = merge_orfs_with_intron(pacbporfD.orfQ,pacbporfA.orfQ, # min_donor_pos =min_donor_query_pos, # max_acceptor_pos=max_accep_query_pos,**kwargs) #intronsS = merge_orfs_with_intron(pacbporfD.orfS,pacbporfA.orfS, # min_donor_pos 
=min_donor_sbjct_pos, # max_acceptor_pos=max_accep_sbjct_pos,**kwargs) # get list of introns intronsQ = merge_orfs_with_intron(pacbporfD.orfQ, pacbporfA.orfQ, **kwargs) intronsS = merge_orfs_with_intron(pacbporfD.orfS, pacbporfA.orfS, **kwargs) # get unique list of donors & acceptors donorQ = olba(list(Set([inQ.donor for inQ in intronsQ])), order_by='pos') donorS = olba(list(Set([inS.donor for inS in intronsS])), order_by='pos') accepQ = olba(list(Set([inQ.acceptor for inQ in intronsQ])), order_by='pos') accepS = olba(list(Set([inS.acceptor for inS in intronsS])), order_by='pos') ############################################################################ if verbose: print "dQ1", [d.pos for d in donorQ], "aQ1", [a.pos for a in accepQ] print "dS1", [d.pos for d in donorS], "aS1", [a.pos for a in accepS] ############################################################################ # filter for alignable donor & acceptor sites kwargs['allow_non_canonical'] = kwargs['allow_non_canonical_donor'] algdonors = _filter_for_alignable_splice_sites(donorQ, donorS, pacbporfD, **kwargs) kwargs['allow_non_canonical'] = kwargs['allow_non_canonical_acceptor'] algacceps = _filter_for_alignable_splice_sites(accepQ, accepS, pacbporfA, **kwargs) ############################################################################ if verbose: print "dQ2", [_dq.pos for (_dq, _ds) in algdonors], print "aQ2", [_aq.pos for (_aq, _as) in algacceps] print "dS2", [_ds.pos for (_dq, _ds) in algdonors], print "aS2", [_as.pos for (_aq, _as) in algacceps] ############################################################################ # remove sites with to low alignment entropy algdonors = _filter_for_entropy( algdonors, pacbporfD, 'donor', min_alignment_entropy=min_donor_site_alignment_entropy) algacceps = _filter_for_entropy( algacceps, pacbporfA, 'acceptor', min_alignment_entropy=min_acceptor_site_alignment_entropy) ############################################################################ if verbose: 
print "dQ3", [_dq.pos for (_dq, _ds) in algdonors], print "aQ3", [_aq.pos for (_aq, _as) in algacceps] print "dS3", [_ds.pos for (_dq, _ds) in algdonors], print "aS3", [_as.pos for (_aq, _as) in algacceps] ############################################################################ # make unique position lists for quick lookup in intron lists dQpl = Set([dQ.pos for dQ, dS in algdonors]) dSpl = Set([dS.pos for dQ, dS in algdonors]) aQpl = Set([aQ.pos for aQ, aS in algacceps]) aSpl = Set([aS.pos for aQ, aS in algacceps]) # check exterior boundaries of PacbPORFs sposD = pacbporfD._get_original_alignment_pos_start() eposD = pacbporfD._get_original_alignment_pos_end() sposA = pacbporfA._get_original_alignment_pos_start() eposA = pacbporfA._get_original_alignment_pos_end() # now make list of aligable introns algintrons = [] for intQ in intronsQ: # check if intron falls within the PacbPORF aligned area if intQ.donor.pos <= sposD.query_dna_start: continue if intQ.acceptor.pos >= eposA.query_dna_end: continue if intQ.donor.pos in dQpl and intQ.acceptor.pos in aQpl: # Query intron occurs in list of alignable splice sites! for intS in intronsS: # check if intron falls within the PacbPORF aligned area if intS.donor.pos <= sposD.sbjct_dna_start: continue if intS.acceptor.pos >= eposA.sbjct_dna_end: continue if intS.donor.pos in dSpl and intS.acceptor.pos in aSpl: # Sbjct intron occurs as well in alignable splice sites! if (intQ.donor,intS.donor) in algdonors and\ (intQ.acceptor,intS.acceptor) in algacceps: # Sbjct & Query Donor & Acceptor are alignable! 
algintrons.append((intQ, intS)) ############################################################################ # set some meta-data properties to the intron objects ############################################################################ for intQ, intS in algintrons: distDnt = pacbporfD.get_distance_aligned_nucleotide_positions( query=intQ.donor.pos, sbjct=intS.donor.pos) distAnt = pacbporfA.get_distance_aligned_nucleotide_positions( query=intQ.acceptor.pos, sbjct=intS.acceptor.pos) # final distance check. kwargs['aligned_site_max_triplet_distance'] # is applied on donor and acceptor site. This distance measured on the # protein sequence can be DOUBLED in case distDnt / distAnt are # opposite (+ and -). Check here if the protein sequence gap is # as well <= kwargs['aligned_site_max_triplet_distance']. if abs(distAnt - distDnt) > kwargs['aligned_site_max_triplet_distance'] * 3: continue # add distance score to introns intQ._distance = abs(distDnt) + abs(distAnt) intS._distance = abs(distDnt) + abs(distAnt) # add Alignment Positional Periphery Score into objects succes = set_apps_intron_query(intQ, pacbporfD, pacbporfA) succes = set_apps_intron_sbjct(intS, pacbporfD, pacbporfA) # set GFF fsource attribute for recognition of intron sources intQ._gff['fsource'] = "ABGPmapping" intS._gff['fsource'] = "ABGPmapping" ######################################################################## if verbose: # some printing.... print "Aligned introns:", (intQ.donor.pos, intQ.acceptor.pos), print(intS.donor.pos, intS.acceptor.pos), print "DIST:", distDnt, distAnt, print "[%s]" % kwargs['aligned_site_max_triplet_distance'], print "ENTROPY: %1.2f %1.2f" % (intQ._apps_donor, intQ._apps_accep), print "PSSM: (%1.2f %1.2f) (%1.2f %1.2f)" % ( intQ.donor.pssm_score, intS.donor.pssm_score, intQ.acceptor.pssm_score, intS.acceptor.pssm_score, ) ######################################################################## # return lists of aligned introns return algintrons
def _intron_position_keys(intronlist):
    """ Return the ( donor.pos, acceptor.pos ) tuple of each intron in a list """
    return [(intron.donor.pos, intron.acceptor.pos) for intron in intronlist]


def _store_unseen_intron_groups(introns, groups):
    """
    Store groups of linked introns of which none is stored already

    @type  introns: dictionary
    @param introns: { 'query': [], 'sbjct': [] } intron store, edited in place

    @type  groups: list
    @param groups: ( query introns, sbjct introns ) tuples; a None entry
                   denotes an absent intron and is never stored
    """
    # Keys are a snapshot taken *before* this batch is stored; groups within
    # a single batch are deliberately not compared to each other.
    keysQ = _intron_position_keys(introns['query'])
    keysS = _intron_position_keys(introns['sbjct'])
    for (groupQ, groupS) in groups:
        groupQ = [intron for intron in groupQ if intron is not None]
        groupS = [intron for intron in groupS if intron is not None]
        seenQ = [k for k in _intron_position_keys(groupQ) if k in keysQ]
        seenS = [k for k in _intron_position_keys(groupS) if k in keysS]
        if seenQ or seenS:
            # all-or-nothing: linked introns are only useful together
            continue
        introns['query'].extend(groupQ)
        introns['sbjct'].extend(groupS)


def _append_introns_with_new_coords(intronlist, candidates):
    """ Append each candidate intron whose coords() is not yet in intronlist """
    keys = [intron.coords() for intron in intronlist]
    for intron in candidates:
        if intron.coords() not in keys:
            intronlist.append(intron)
            # keep the keys up to date with the addition just made
            keys.append(intron.coords())


def merge_pacbporfs(pacbporfD, pacbporfA, queryOrfSetObj, sbjctOrfSetObj,
                    allow_query_projecting=True, allow_sbjct_projecting=True,
                    allow_query_mapping=True, allow_sbjct_mapping=True,
                    allow_projecting=True, allow_mapping=True, verbose=False):
    """
    Merge 2 PacbPORF objects with an interface into a gene structure

    Which merging functions are applied depends on whether the query Orfs
    and/or the sbjct Orfs of both PacbPORFs have identical ids.

    @type  pacbporfD: PacbPORF object
    @param pacbporfD: PacbPORF object that has to deliver PSSM donor objects

    @type  pacbporfA: PacbPORF object
    @param pacbporfA: PacbPORF object that has to deliver PSSM acceptor objects

    @type  queryOrfSetObj: object
    @param queryOrfSetObj: passed on to the merging functions that search
                           for intermediate (tiny)exons in the query

    @type  sbjctOrfSetObj: object
    @param sbjctOrfSetObj: passed on to the merging functions that search
                           for intermediate (tiny)exons in the sbjct

    @type  allow_query_projecting: Boolean
    @param allow_query_projecting: accepted for interface compatibility;
                                   not consulted by this function

    @type  allow_sbjct_projecting: Boolean
    @param allow_sbjct_projecting: accepted for interface compatibility;
                                   not consulted by this function

    @type  allow_query_mapping: Boolean
    @param allow_query_mapping: allow intron mapping in the query

    @type  allow_sbjct_mapping: Boolean
    @param allow_sbjct_mapping: allow intron mapping in the sbjct

    @type  allow_projecting: Boolean
    @param allow_projecting: accepted for interface compatibility;
                             not consulted by this function

    @type  allow_mapping: Boolean
    @param allow_mapping: when False, both allow_query_mapping and
                          allow_sbjct_mapping are forced to False

    @type  verbose: Boolean
    @param verbose: print status/debugging messages to STDOUT

    @rtype:  dictionary
    @return: { 'query': [ introns ], 'sbjct': [ introns ] }
    """
    # input validation
    IsPacbPORF(pacbporfD)
    IsPacbPORF(pacbporfA)

    # edit/create **kwargs dictionary for some forced attributes
    kwargs = {}
    _update_kwargs(kwargs, KWARGS_SPLICESITES)

    # deal with allow_xxx attributes
    if not allow_mapping:
        allow_query_mapping = False
        allow_sbjct_mapping = False

    # check if Orf objects of PacbPORFS are identical
    queryOrfsIdentical = pacbporfD.orfQ.id == pacbporfA.orfQ.id
    sbjctOrfsIdentical = pacbporfD.orfS.id == pacbporfA.orfS.id

    # return data structure of introns
    introns = {'query': [], 'sbjct': []}

    # Scan Orfs for splice sites.
    # This has probably been performed before, but when not done,
    # cached donor & acceptor sites lists seems to be empty -> no introns
    for (orfObj, splicetype) in (
            (pacbporfD.orfQ, "donor"),
            (pacbporfD.orfS, "donor"),
            (pacbporfA.orfQ, "acceptor"),
            (pacbporfA.orfS, "acceptor")):
        orfObj.scan_orf_for_pssm_splice_sites(
            splicetype=splicetype,
            min_pssm_score=kwargs['min_%s_pssm_score' % splicetype],
            allow_non_canonical=kwargs[
                'allow_non_canonical_%s' % splicetype],
            non_canonical_min_pssm_score=kwargs[
                'non_canonical_min_%s_pssm_score' % splicetype])

    if not queryOrfsIdentical and not sbjctOrfsIdentical:
        introns1 = merge_pacbporfs_with_introns(pacbporfD, pacbporfA)
        # filter for **best** candidates based on PSSM/entropy combination
        introns1 = _filter_aligned_introns_on_pssm_entropy_combination(
            introns1)

        if (pacbporfD.gap_ratio_score() < PACBPORF_HIGH_GAP_RATIO_THRESHOLD
                and pacbporfA.gap_ratio_score() <
                PACBPORF_HIGH_GAP_RATIO_THRESHOLD):
            introns2 = merge_pacbporfs_with_closeby_independant_introns(
                pacbporfD, pacbporfA)
            introns3 = merge_pacbporfs_with_phase_shift_introns(
                pacbporfD, pacbporfA)
            introns4 = merge_pacbporfs_by_tinyexons(
                pacbporfD, pacbporfA, queryOrfSetObj, sbjctOrfSetObj)
            introns5 = merge_pacbporfs_by_query_tinyexon_and_sbjct_intron(
                pacbporfD, pacbporfA, queryOrfSetObj)
            introns6 = merge_pacbporfs_by_sbjct_tinyexon_and_query_intron(
                pacbporfD, pacbporfA, sbjctOrfSetObj)
            introns7 = \
                merge_pacbporfs_by_sbjct_equal_length_exon_and_query_intron(
                    pacbporfD, pacbporfA, sbjctOrfSetObj)
            introns8 = \
                merge_pacbporfs_by_query_equal_length_exon_and_sbjct_intron(
                    pacbporfD, pacbporfA, queryOrfSetObj)
        else:
            # do not allow more complex intron merging
            introns2 = []
            introns3 = []
            introns4 = []
            introns5 = []
            introns6 = []
            introns7 = []
            introns8 = []

        introns9 = merge_pacbporfs_with_conserved_acceptor_introns(
            pacbporfD, pacbporfA)
        # filter for **best** candidates based on PSSM/entropy combination
        introns9 = _filter_aligned_introns_on_pssm_entropy_combination(
            introns9)

        introns10 = merge_pacbporfs_with_conserved_donor_introns(
            pacbporfD, pacbporfA)
        # filter for **best** candidates based on PSSM/entropy combination
        introns10 = _filter_aligned_introns_on_pssm_entropy_combination(
            introns10)

        # store introns obtained by most simplest case projecting/mapping
        introns['query'].extend(Set([intrQ for (intrQ, intrS) in introns1]))
        introns['sbjct'].extend(Set([intrS for (intrQ, intrS) in introns1]))

        # From the more complex cases, only store (groups of) introns that
        # are NOT encountered already. Each batch is compared to the introns
        # stored by the batches before it.
        _store_unseen_intron_groups(introns, [
            ((intrQ,), (intrS,))
            for (intrQ, intrS, cigpacbp) in introns2])
        _store_unseen_intron_groups(introns, [
            ((intrQ,), (intrS,))
            for (intrQ, intrS) in introns3])
        _store_unseen_intron_groups(introns, [
            ((intrQ, intrQ2), (intrS, intrS2))
            for (intrQ, intrS, pacbporf, intrQ2, intrS2) in introns4])
        # introns5 and introns6 can contain None for an absent intron;
        # these are skipped by the storing helper. The introns5 batch was
        # previously never stored because introns4 was iterated twice.
        _store_unseen_intron_groups(introns, [
            ((intrQ, intrQ2), (intrS, intrS2))
            for (intrQ, intrS, pacbporf, intrQ2, intrS2) in introns5])
        _store_unseen_intron_groups(introns, [
            ((intrQ, intrQ2), (intrS, intrS2))
            for (intrQ, intrS, pacbporf, intrQ2, intrS2) in introns6])
        _store_unseen_intron_groups(introns, [
            ((intrQ,), (intrS, intrS2))
            for (intrS, pacbporf1, intrQ, pacbporf2, intrS2) in introns7])
        _store_unseen_intron_groups(introns, [
            ((intrQ, intrQ2), (intrS,))
            for (intrQ, pacbporf1, intrS, pacbporf2, intrQ2) in introns8])

        # conserved acceptor / conserved donor introns:
        # do NOT check if any of the introns is present yet;
        # allow addition of each of these on query and sbjct independently
        for alignedintrons in (introns9, introns10):
            keysQ = _intron_position_keys(introns['query'])
            keysS = _intron_position_keys(introns['sbjct'])
            for (intrQ, intrS) in alignedintrons:
                k1 = (intrQ.donor.pos, intrQ.acceptor.pos)
                k2 = (intrS.donor.pos, intrS.acceptor.pos)
                if k1 not in keysQ:
                    introns['query'].append(intrQ)
                if k2 not in keysS:
                    introns['sbjct'].append(intrS)

        # finally, do the bridging thingy
        introns0 = merge_pacbporfs_with_query_intron_bridgeing(
            pacbporfD, pacbporfA)
        # only store introns from introns0 that are NOT encountered already
        keysQ = _intron_position_keys(introns['query'])
        for intrQ in introns0:
            if intrQ.coords() not in keysQ:
                introns['query'].append(intrQ)

    elif not queryOrfsIdentical:
        seqerror = merge_pacbporf_with_sequenceerror_in_query(
            pacbporfD, pacbporfA)
        introns1 = merge_pacbporfs_by_intron_in_query(pacbporfD, pacbporfA)
        if (pacbporfD.gap_ratio_score() < PACBPORF_HIGH_GAP_RATIO_THRESHOLD
                and pacbporfA.gap_ratio_score() <
                PACBPORF_HIGH_GAP_RATIO_THRESHOLD):
            introns2 = merge_pacbporfs_by_intron_tinyexon_intron_in_query(
                pacbporfD, pacbporfA, queryOrfSetObj)
            introns3 = merge_pacbporfs_by_two_tinyexons_in_query(
                pacbporfD, pacbporfA, queryOrfSetObj)
        else:
            # do not allow more complex intron merging
            introns2 = []
            introns3 = []

        # store sequencerror if it exists
        if seqerror:
            introns['query'].append(seqerror)

        # store introns obtained by most simplest case projecting/mapping
        introns['query'].extend(
            [prj.projected_introns[0] for prj in introns1])

        # only store intron groups that are NOT encountered already
        _store_unseen_intron_groups(introns, [
            ((intr1, intr2), ())
            for (intr1, intr2, exon) in introns2])
        _store_unseen_intron_groups(introns, [
            ((intr1, intr2, intr3), ())
            for (intr1, intr2, intr3, exon1, exon2) in introns3])

        if not introns['query'] and allow_sbjct_mapping and \
                allow_query_mapping:
            # just bridge Orfs by **best** intron(s).
            introns0 = merge_pacbporfs_with_query_intron_bridgeing(
                pacbporfD, pacbporfA)

            # potential stopless 3n intron in SBJCT
            introns1 = merge_pacbporfs_with_introns(pacbporfD, pacbporfA)
            # filter for **best** candidates based on PSSM/entropy combination
            introns1 = _filter_aligned_introns_on_pssm_entropy_combination(
                introns1)
            # apply stopless3n intron filtering
            introns1 = _filter_aligned_stopless_3n_introns(introns1)

            introns2 = merge_pacbporfs_with_closeby_independant_introns(
                pacbporfD, pacbporfA)

            if (pacbporfD.gap_ratio_score() <
                    PACBPORF_HIGH_GAP_RATIO_THRESHOLD and
                    pacbporfA.gap_ratio_score() <
                    PACBPORF_HIGH_GAP_RATIO_THRESHOLD):
                introns3 = merge_pacbporfs_with_phase_shift_introns(
                    pacbporfD, pacbporfA)
                # filter for **best** candidates based on PSSM/entropy
                introns3 = \
                    _filter_aligned_introns_on_pssm_entropy_combination(
                        introns3)
                # apply stopless3n intron filtering
                introns3 = _filter_aligned_stopless_3n_introns(introns3)
            else:
                # do not allow more complex intron merging
                introns3 = []

            # only store query introns that are NOT encountered already
            _append_introns_with_new_coords(
                introns['query'],
                [intrQ for (intrQ, intrS) in introns1])
            _append_introns_with_new_coords(
                introns['query'],
                [intrQ for (intrQ, intrS, cigpacbp) in introns2])
            _append_introns_with_new_coords(
                introns['query'],
                [intrQ for (intrQ, intrS) in introns3])
            _append_introns_with_new_coords(introns['query'], introns0)

            # Only store sbjct introns that are NOT encountered already.
            # These belong in the 'sbjct' list; they used to be appended to
            # the 'query' list, mixing sbjct coordinates into query introns.
            _append_introns_with_new_coords(
                introns['sbjct'],
                [intrS for (intrQ, intrS) in introns1])
            _append_introns_with_new_coords(
                introns['sbjct'],
                [intrS for (intrQ, intrS, cigpacbp) in introns2])
            _append_introns_with_new_coords(
                introns['sbjct'],
                [intrS for (intrQ, intrS) in introns3])

        elif not introns['query']:
            # just bridge Orfs by **best** intron(s).
            introns0 = merge_pacbporfs_with_query_intron_bridgeing(
                pacbporfD, pacbporfA)
            # only store introns from that are NOT encountered already
            keys = [intron.coords() for intron in introns['query']]
            for intron in introns0:
                if intron.coords() not in keys:
                    introns['query'].append(intron)
        else:
            # projecting introns yielded results; do not try mapping
            pass

    elif not sbjctOrfsIdentical:
        introns1 = merge_pacbporfs_by_intron_in_sbjct(pacbporfD, pacbporfA)
        if (pacbporfD.gap_ratio_score() < PACBPORF_HIGH_GAP_RATIO_THRESHOLD
                and pacbporfA.gap_ratio_score() <
                PACBPORF_HIGH_GAP_RATIO_THRESHOLD):
            introns2 = merge_pacbporfs_by_intron_tinyexon_intron_in_sbjct(
                pacbporfD, pacbporfA, sbjctOrfSetObj)
            introns3 = merge_pacbporfs_by_two_tinyexons_in_sbjct(
                pacbporfD, pacbporfA, sbjctOrfSetObj)
        else:
            # do not allow more complex intron merging
            introns2 = []
            introns3 = []

        # store introns obtained by most simplest case projecting/mapping
        introns['sbjct'].extend(
            [prj.projected_introns[0] for prj in introns1])

        # only store intron groups that are NOT encountered already
        _store_unseen_intron_groups(introns, [
            ((), (intr1, intr2))
            for (intr1, intr2, exon) in introns2])
        _store_unseen_intron_groups(introns, [
            ((), (intr1, intr2, intr3))
            for (intr1, intr2, intr3, exon1, exon2) in introns3])

        if not introns['sbjct'] and allow_sbjct_mapping and \
                allow_query_mapping:
            # potential stopless 3n intron in QUERY
            introns1 = merge_pacbporfs_with_introns(pacbporfD, pacbporfA)
            # filter for **best** candidates based on PSSM/entropy combination
            introns1 = _filter_aligned_introns_on_pssm_entropy_combination(
                introns1)
            # apply stopless3n intron filtering
            introns1 = _filter_aligned_stopless_3n_introns(introns1)

            introns2 = merge_pacbporfs_with_closeby_independant_introns(
                pacbporfD, pacbporfA)

            if (pacbporfD.gap_ratio_score() <
                    PACBPORF_HIGH_GAP_RATIO_THRESHOLD and
                    pacbporfA.gap_ratio_score() <
                    PACBPORF_HIGH_GAP_RATIO_THRESHOLD):
                introns3 = merge_pacbporfs_with_phase_shift_introns(
                    pacbporfD, pacbporfA)
                # filter for **best** candidates based on PSSM/entropy
                introns3 = \
                    _filter_aligned_introns_on_pssm_entropy_combination(
                        introns3)
                # apply stopless3n intron filtering
                introns3 = _filter_aligned_stopless_3n_introns(introns3)
            else:
                # do not allow more complex intron merging
                introns3 = []

            # store introns
            introns['query'].extend(
                Set([intrQ for (intrQ, intrS) in introns1]))
            introns['sbjct'].extend(
                Set([intrS for (intrQ, intrS) in introns1]))
            introns['query'].extend(
                [intrQ for (intrQ, intrS, cigpacbp) in introns2])
            introns['query'].extend(
                [intrQ for (intrQ, intrS) in introns3])
            introns['sbjct'].extend(
                [intrS for (intrQ, intrS, cigpacbp) in introns2])
            introns['sbjct'].extend(
                [intrS for (intrQ, intrS) in introns3])
        else:
            # projecting introns yielded results; do not try mapping
            pass

    else:
        # query Orfs identical AND sbjct Orfs identical
        if allow_query_mapping:
            introns1 = merge_pacbporfs_by_inframe_intron_in_query(
                pacbporfD, pacbporfA)
        else:
            # no mapping (unigene or continious alignment provided)
            introns1 = []
        if allow_sbjct_mapping:
            introns2 = merge_pacbporfs_by_inframe_intron_in_sbjct(
                pacbporfD, pacbporfA)
        else:
            # no mapping (unigene or continious alignment provided)
            introns2 = []
        if allow_sbjct_mapping and allow_query_mapping:
            introns3 = merge_pacbporfs_with_introns(pacbporfD, pacbporfA)
            # filter for **best** candidates based on PSSM/entropy combination
            introns3 = _filter_aligned_introns_on_pssm_entropy_combination(
                introns3)
            # apply stopless3n intron filtering
            introns3 = _filter_aligned_stopless_3n_introns(introns3)
        else:
            # no mapping (unigene or continious alignment provided)
            introns3 = []

        introns['query'].extend(
            [prj.projected_introns[0] for prj in introns1])
        introns['sbjct'].extend(
            [prj.projected_introns[0] for prj in introns2])
        introns['query'].extend([intrQ for (intrQ, intrS) in introns3])
        introns['sbjct'].extend([intrS for (intrQ, intrS) in introns3])

    # Filter for stopless3n introns
    introns['query'] = _filter_stopless_3n_introns(introns['query'])
    introns['sbjct'] = _filter_stopless_3n_introns(introns['sbjct'])

    # return dictionary with lists of introns
    return introns
def _scan_for_tinyexon_matches(sfmpat, fname, genomicsequence,
                               min_start, max_end):
    """
    Run ScanForMatches with a tinyexon pattern and return the parsed hits

    @type  sfmpat: string
    @param sfmpat: ScanForMatches pattern

    @type  fname: string
    @param fname: name of the (temporary) pattern file to write

    @type  genomicsequence: string
    @param genomicsequence: sequence that is piped into ScanForMatches

    @type  min_start: integer
    @param min_start: only hits with a start coordinate > min_start are kept

    @type  max_end: integer
    @param max_end: only hits with an end coordinate < max_end are kept

    @rtype:  dictionary
    @return: result of parseFasta() on the filtered ScanForMatches output
    """
    fh = open(fname, 'w')
    try:
        fh.write(sfmpat + "\n")
    finally:
        fh.close()
    try:
        command = (
            """echo ">myseq\n%s" | %s %s | tr "[,]" "\t\t#" | """ +
            """tr -d "\n " | sed "s/>/\\n>/g" | tr "#" "\t" | """ +
            """awk -F'\t' '{ if (NF==4 && $2>%s && $3<%s) """ +
            """{ print $1"["$2","$3"]\\n"$4 } }' """)
        command = command % (
            genomicsequence, EXECUTABLE_SFM, fname, min_start, max_end)
        co = osPopen(command)
        try:
            return parseFasta(co.readlines())
        finally:
            co.close()
    finally:
        # file cleanup; also when running or parsing failed
        osRemove(fname)


def _merge_pacbporfs_by_tinyexon_and_two_introns(pacbporfD,pacbporfA,
    orfSetObject,queryorsbjct,verbose = False, **kwargs):
    """
    Merge 2 PacbPORF objects by a projected tinyexon and two introns

    @attention: see pacb.connecting.merge_orfs_with_intron for **kwargs)

    @type  pacbporfD: PacbPORF object
    @param pacbporfD: PacbPORF object that has to deliver PSSM donor objects

    @type  pacbporfA: PacbPORF object
    @param pacbporfA: PacbPORF object that has to deliver PSSM acceptor objects

    @type  orfSetObject: object with elegiable Orfs
    @param orfSetObject: object with elegiable Orfs

    @type  queryorsbjct: string
    @param queryorsbjct: literal string 'query' or 'sbjct'

    @type  verbose: Boolean
    @param verbose: print debugging info to STDOUT when True

    @rtype:  list
    @return: empty list when no tinyexon is found, otherwise a list with a
             single ( intron, intron, tinyexon PacbPORF ) tuple
    """
    # input validation
    IsPacbPORF(pacbporfD)
    IsPacbPORF(pacbporfA)

    # edit **kwargs dictionary for some forced attributes
    _update_kwargs(kwargs,KWARGS_PROJECTED_TINYEXON)

    MAX_TINYEXON_NT_LENGTH = 33
    MIN_TINYEXON_NT_LENGTH = 6

    tinyexons = []
    if queryorsbjct == "query":
        donorOrf = pacbporfD.orfQ
        accepOrf = pacbporfA.orfQ
        prjctOrf = pacbporfD.orfS
        alignedDonorRange = pacbporfD.alignment_dna_range_query()
        alignedAccepRange = pacbporfA.alignment_dna_range_query()
    elif queryorsbjct == "sbjct":
        donorOrf = pacbporfD.orfS
        accepOrf = pacbporfA.orfS
        prjctOrf = pacbporfD.orfQ
        alignedDonorRange = pacbporfD.alignment_dna_range_sbjct()
        alignedAccepRange = pacbporfA.alignment_dna_range_sbjct()
    else:
        message = "'queryorsbjct' (%s), not 'query' or 'sbjct'" % queryorsbjct
        raise InproperlyAppliedArgument(message)

    for dObj in donorOrf._donor_sites:
        # do not make a projection OVER the aligned area
        if dObj.pos < min(alignedDonorRange): continue
        if queryorsbjct == "query":
            (dPos,dPhase) = pacbporfD.dnaposition_query(
                dObj.pos,forced_return=True)
        else:
            (dPos,dPhase) = pacbporfD.dnaposition_sbjct(
                dObj.pos,forced_return=True)
        try:
            algDobj = pacbporfD._positions[dPos]
        except IndexError:
            # site out of range of PacbPORF -> break
            break

        for aObj in accepOrf._acceptor_sites:
            # do not make a projection OVER the aligned area
            if aObj.pos > max(alignedAccepRange): continue
            if queryorsbjct == "query":
                (aPos,aPhase) = pacbporfA.dnaposition_query(
                    aObj.pos,forced_return=True)
            else:
                (aPos,aPhase) = pacbporfA.dnaposition_sbjct(
                    aObj.pos,forced_return=True)
            try:
                algAobj = pacbporfA._positions[aPos]
            except IndexError:
                # site out of range of PacbPORF -> break
                break

            # positions of the splice sites on the *other* sequence, on
            # which the tinyexon is projected
            if queryorsbjct == "query":
                posDsbjct = algDobj.sbjct_dna_start + dPhase
                posAsbjct = algAobj.sbjct_dna_start + aPhase
            else:
                posDsbjct = algDobj.query_dna_start + dPhase
                posAsbjct = algAobj.query_dna_start + aPhase
            distance = posAsbjct - posDsbjct
            if distance >= MAX_TINYEXON_NT_LENGTH:
                break
            if distance < MIN_TINYEXON_NT_LENGTH:
                continue

            ####################################################
            # generate a ScanForMatches pattern file
            ####################################################
            # example pattern: 6...6 AG NNGNNANNANNGN[2,0,0] GT 3...3
            query = list(prjctOrf.inputgenomicsequence[posDsbjct:posAsbjct])
            # mask all non-phase0 nucleotides to N residues;
            # this represents the regularexpression for a specific
            # peptide sequence
            firstphasepositions = range( 3-dPhase % 3, len(query), 3)
            for pos in range(0,len(query)):
                if pos not in firstphasepositions:
                    query[pos] = "N"
            # calculate a ~50% mismatch number (floor division, as integer
            # division did before)
            mismatches =  max([ 0, (len(query) - query.count("N"))//2 ])
            # write the pattern to string and subsequently to file
            # example pattern: 6...6 AG NNGNNANNANNGN[2,0,0] GT 3...3
            if kwargs['allow_non_canonical_donor']:
                sfmpat = "%s...%s AG %s[%s,0,0] G (T | C) %s...%s" % (
                    AUSO,AUSO,"".join(query),mismatches,DDSO,DDSO)
            else:
                sfmpat = "%s...%s AG %s[%s,0,0] GT %s...%s" % (
                    AUSO,AUSO,"".join(query),mismatches,DDSO,DDSO)

            ####################################################
            if verbose:
                print("%s %s %s %s" % (
                    (pacbporfD.orfQ.id,pacbporfA.orfQ.id),
                    distance, dObj, aObj))
                print(sfmpat)
            ####################################################

            fname = "sfmpat_tinyexon_%s_%s_%s_%s" % (
                donorOrf.id,
                accepOrf.id,
                posDsbjct,
                posAsbjct,
                )

            ####################################################
            # run ScanForMatches; hits must leave room for introns of at
            # least kwargs['min_intron_nt_length'] on both sides
            ####################################################
            matches = _scan_for_tinyexon_matches(
                sfmpat, fname,
                donorOrf.inputgenomicsequence,
                dObj.pos+(kwargs['min_intron_nt_length']-3),
                aObj.pos-(kwargs['min_intron_nt_length']-3)
                )

            # filter matches for:
            # (1) correct donor & acceptor phase
            # (2) high enough donor & acceptor site scores
            for hdr,seqmatch in matches.iteritems():
                startQ,stopQ = [ int(item) for item in
                    hdr.split(":")[1][1:-1].split(",") ]
                exonQstart = startQ + AUSO + 2 - 1
                exonQstop  = stopQ  - DDSO - 2

                ####################################
                # get Orf object of tinyexon
                ####################################
                tinyexonorf = None
                # select the Orf on which the tinyexon is located
                for orfObj in orfSetObject.get_elegiable_orfs(
                        max_orf_start=exonQstart,min_orf_end=exonQstop):
                    orfPhase = (exonQstart - orfObj.startPY) % 3
                    if orfPhase == dPhase:
                        tinyexonorf = orfObj
                        break
                else:
                    # No tinyexonorf assigned!! In case a regex matched
                    # over a STOP-codon or the regex length is smaller
                    # than the smallest Orf, no Orf can be assigned
                    continue

                # filter for donor & acceptor score
                dScore = _score_splice_site(seqmatch[-9:],splicetype='donor')
                aScore = _score_splice_site(seqmatch[0:11],
                    splicetype='acceptor')
                if dScore < kwargs['min_donor_pssm_score']:
                    continue
                if aScore < kwargs['min_acceptor_pssm_score']:
                    continue

                # scan Orf for splicesites
                tinyexonorf.scan_orf_for_pssm_splice_sites(
                    splicetype="donor",
                    min_pssm_score=kwargs['min_donor_pssm_score'],
                    allow_non_canonical=kwargs['allow_non_canonical_donor'],
                    non_canonical_min_pssm_score=kwargs[
                        'non_canonical_min_donor_pssm_score'])
                tinyexonorf.scan_orf_for_pssm_splice_sites(
                    splicetype="acceptor",
                    min_pssm_score=kwargs['min_acceptor_pssm_score'],
                    allow_non_canonical=kwargs[
                        'allow_non_canonical_acceptor'],
                    non_canonical_min_pssm_score=kwargs[
                        'non_canonical_min_acceptor_pssm_score'])

                # get acceptor object of the 1th intron
                intron1_aObj = None
                for a in tinyexonorf._acceptor_sites:
                    if a.pos == exonQstart:
                        intron1_aObj = a
                        break
                else:
                    # pseudo-acceptorsite as found by SFM regex
                    # is not a valid acceptor site of high enough score
                    # continue to next iteration of (hdr,seqmatch) pair
                    continue

                # get donor object of the 2th intron
                intron2_dObj = None
                for d in tinyexonorf._donor_sites:
                    if d.pos == exonQstop:
                        intron2_dObj = d
                        break
                else:
                    # pseudo-donorsite as found by SFM regex
                    # is not a valid donor site of high enough score
                    # continue to next iteration of (hdr,seqmatch) pair
                    continue

                # check if introns are of elegiable lengths
                if (intron1_aObj.pos-dObj.pos) > \
                        kwargs['max_intron_nt_length']:
                    continue
                if (aObj.pos-intron2_dObj.pos) > \
                        kwargs['max_intron_nt_length']:
                    continue

                ####################################################
                if verbose:
                    # if here, a candidate!!!
                    print("%s %s %s %s" % (
                        (pacbporfD.orfQ.id,tinyexonorf.id,
                         pacbporfA.orfQ.id),
                        hdr, dScore, aScore))
                    print(seqmatch)
                ####################################################

                # append to found tinyexons
                query_data      = ( tinyexonorf, exonQstart, exonQstop )
                sbjct_data      = ( prjctOrf, posDsbjct, posAsbjct )
                splicesite_data = ( dObj,intron1_aObj, intron2_dObj, aObj )
                tinyexons.append( ( query_data, sbjct_data, splicesite_data ) )

    # return - End Of Function - if no tinyexons are found
    if not tinyexons:
        return []

    ####################################
    # select the **best** tinyexon
    ####################################
    # NOTE: no ranking is applied; the first candidate found is taken
    (query_data,sbjct_data,splicesite_data) = tinyexons[0]
    orfQ,query_dna_start,query_dna_end = query_data
    orfS,sbjct_dna_start,sbjct_dna_end = sbjct_data
    (intron1_dObj,intron1_aObj,intron2_dObj,intron2_aObj) = splicesite_data

    ####################################################
    if verbose:
        print("tinyexon orf: %s" % (orfQ,))
        print("tinyexon orf: %s" % (intron1_aObj,))
        print("tinyexon orf: %s" % (intron2_dObj,))
    ####################################################

    ####################################
    # make tinyexon PacbPORF
    ####################################
    startQaa = orfQ.dnapos2aapos(query_dna_start) -1
    startSaa = orfS.dnapos2aapos(sbjct_dna_start) -1
    stopQaa  = orfQ.dnapos2aapos(query_dna_end) +1
    stopSaa  = orfS.dnapos2aapos(sbjct_dna_end) +1
    # check for directly leading stop codon on tinyexon
    while startQaa <= orfQ.protein_startPY:
        startQaa+=1
        startSaa+=1
        query_dna_start+=3
        sbjct_dna_start+=3
    while startSaa <= orfS.protein_startPY:
        startQaa+=1
        startSaa+=1
        query_dna_start+=3
        sbjct_dna_start+=3
    # check for directly tailing stop codon on tinyexon
    while stopQaa > orfQ.protein_endPY:
        stopQaa-=1
        stopSaa-=1
        query_dna_end-=3
        sbjct_dna_end-=3
    while stopSaa > orfS.protein_endPY:
        stopQaa-=1
        stopSaa-=1
        query_dna_end-=3
        sbjct_dna_end-=3
    # get sequences
    qAAseq = orfQ.getaas(abs_pos_start=startQaa,abs_pos_end=stopQaa)
    sAAseq = orfS.getaas(abs_pos_start=startSaa,abs_pos_end=stopSaa)

    ####################################################
    if verbose or len(qAAseq) != len(sAAseq):
        # if unequal lengths, error will be raised upon PacbP.__init__()
        print("%s %s %s %s %s %s" % (
            orfQ, qAAseq, startQaa, stopQaa, (stopQaa-startQaa),
            (query_dna_start,query_dna_end)))
        print("%s %s %s %s %s %s" % (
            orfS, sAAseq, startSaa, stopSaa, (stopSaa-startSaa),
            (sbjct_dna_start,sbjct_dna_end)))
        print(orfQ.inputgenomicsequence[query_dna_start-2:query_dna_end+2])
        print(orfS.inputgenomicsequence[sbjct_dna_start-2:sbjct_dna_end+2])
    ####################################################

    # initialize extended tinyexon PacbPORF
    from pacb import PacbP
    pacbp = PacbP(input=( qAAseq, sAAseq, startQaa, startSaa ) )
    pacbp.strip_unmatched_ends()
    pacbporf = pacbp2pacbporf(pacbp,orfQ,orfS)
    pacbporf.extend_pacbporf_after_stops()
    pacbporf.source = 'ABGPprojectingTE'

    ####################################
    # make introns
    ####################################
    intron1 = IntronConnectingOrfs(
                intron1_dObj, intron1_aObj, None,
                donorOrf,pacbporf.orfQ )
    intron2 = IntronConnectingOrfs(
                intron2_dObj, intron2_aObj, None,
                pacbporf.orfQ, accepOrf )

    ################################################################
    # set some meta-data properties to the intron objects
    ################################################################
    # add distance score to intron
    intron1._distance = 0
    intron2._distance = 0

    # add Alignment Positional Periphery Score into objects
    if queryorsbjct == "query":
        set_apps_intron_query(intron1,pacbporfD,pacbporf)
        set_apps_intron_query(intron2,pacbporf,pacbporfA)
    else:
        set_apps_intron_sbjct(intron1,pacbporfD,pacbporf)
        set_apps_intron_sbjct(intron2,pacbporf,pacbporfA)

    # set GFF fsource attribute for recognition of intron sources
    intron1._gff['fsource'] = "ABGPprojectingTE"
    intron2._gff['fsource'] = "ABGPprojectingTE"

    # create _linked_to_xxx attributes
    intron1._linked_to_pacbporfs = [ pacbporf ]
    intron2._linked_to_pacbporfs = [ pacbporf ]
    intron1._linked_to_introns   = [ intron2 ]
    intron2._linked_to_introns   = [ intron1 ]

    ####################################################
    if verbose:
        print(pacbporf)
        pacbporf.print_protein_and_dna()
        print(intron1)
        print(intron2)
    ####################################################

    # return introns and intermediate tinyexon PacbPORF
    return [(intron1,intron2,pacbporf)]
def _merge_pacbporfs_by_two_tinyexons(pacbporfD,pacbporfA, orfSetObject,queryorsbjct,verbose = False, **kwargs): """ """ # edit **kwargs dictionary for some forced attributes _update_kwargs(kwargs,KWARGS_PROJECTED_TINYEXON) tinyexons = [] sposD = pacbporfD._get_original_alignment_pos_start() eposD = pacbporfD._get_original_alignment_pos_end() sposA = pacbporfA._get_original_alignment_pos_start() eposA = pacbporfA._get_original_alignment_pos_end() if queryorsbjct == "query": donorOrf = pacbporfD.orfQ accepOrf = pacbporfA.orfQ prjctOrf = pacbporfD.orfS dStart,dEnd = sposD.query_dna_start, eposD.query_dna_end aStart,aEnd = sposA.query_dna_start, eposA.query_dna_end elif queryorsbjct == "sbjct": donorOrf = pacbporfD.orfS accepOrf = pacbporfA.orfS prjctOrf = pacbporfD.orfQ dStart,dEnd = sposD.sbjct_dna_start, eposD.sbjct_dna_end aStart,aEnd = sposA.sbjct_dna_start, eposA.sbjct_dna_end else: message = "'queryorsbjct' (%s), not 'query' or 'sbjct'" % queryorsbjct raise InproperlyAppliedArgument, message # get all potential combinations of two tinyexons tinyexoncombis = merge_orfs_with_two_tinyexons( donorOrf, accepOrf, donorOrf._donor_sites, accepOrf._acceptor_sites, orfSetObject.orfs, ) results = [] for dObj in donorOrf._donor_sites: if queryorsbjct == "query": (dPos,dPhase) = pacbporfD.dnaposition_query(dObj.pos,forced_return=True) else: (dPos,dPhase) = pacbporfD.dnaposition_sbjct(dObj.pos,forced_return=True) try: algDobj = pacbporfD._positions[dPos] except IndexError: # site out of range of PacbPORF -> break break # check if dObj is on pfD; # introns of tinyexons can be projected outside of pfD/pfA area if dObj.pos < dStart: continue for aObj in accepOrf._acceptor_sites: if queryorsbjct == "query": (aPos,aPhase) = pacbporfA.dnaposition_query(aObj.pos,forced_return=True) else: (aPos,aPhase) = pacbporfA.dnaposition_sbjct(aObj.pos,forced_return=True) try: algAobj = pacbporfA._positions[aPos] except IndexError: # site out of range of PacbPORF -> break break # check if aObj 
is on pfA; # introns of tinyexons can be projected outside of pfD/pfA area if aObj.pos > aEnd: continue if queryorsbjct == "query": posDsbjct = algDobj.sbjct_dna_start + dPhase posAsbjct = algAobj.sbjct_dna_start + aPhase else: posDsbjct = algDobj.query_dna_start + dPhase posAsbjct = algAobj.query_dna_start + aPhase distance = posAsbjct - posDsbjct if distance >= (kwargs['max_tinyexon_nt_length']*2): break if distance < (kwargs['min_tinyexon_nt_length']*2): continue filtered_tinyexoncombis = _filter_tinyexoncombis(tinyexoncombis, min_length = distance, max_length = distance, min_first_acceptor_pos = dObj.pos + kwargs['min_tinyexon_intron_nt_length'], max_final_donor_pos = aObj.pos - kwargs['min_tinyexon_intron_nt_length'], phase_final_donor = aObj.phase, phase_first_acceptor= dObj.phase, ) if not filtered_tinyexoncombis: continue #################################################################### if verbose: print distance, dObj, aObj, len(tinyexoncombis), print len(filtered_tinyexoncombis) #################################################################### for exon1,intron,exon2 in filtered_tinyexoncombis: # make preceding intron preceding_intron = IntronConnectingOrfs( dObj,exon1.acceptor, None,donorOrf,exon1.orf ) # make subsequent intron subsequent_intron = IntronConnectingOrfs( exon2.donor, aObj, None,exon2.orf,accepOrf) ################################################################ if verbose: print "\t", exon1, exon1.proteinsequence(), print preceding_intron.phase, exon1.donor.phase, print subsequent_intron.phase, preceding_intron.shared_aa, print intron.shared_aa, subsequent_intron.shared_aa print "\t", exon2, exon2.proteinsequence() ################################################################ # get prjctOrf sequence for comparison correctionA = 0 if aObj.phase != 0: # INCLUDE the final AA which is broken by the splicesite correctionA=1 if queryorsbjct == "query": startPos,_phase = pacbporfD.dnaposition_query(dObj.pos,forced_return=True) 
stopPos,_phase = pacbporfA.dnaposition_query(aObj.pos,forced_return=True) start = pacbporfD._positions[startPos].sbjct_pos stop = pacbporfA._positions[stopPos].sbjct_pos + correctionA else: startPos,_phase = pacbporfD.dnaposition_sbjct(dObj.pos,forced_return=True) stopPos,_phase = pacbporfA.dnaposition_sbjct(aObj.pos,forced_return=True) start = pacbporfD._positions[startPos].query_pos stop = pacbporfA._positions[stopPos].query_pos + correctionA if stop <= start: # tinyexon is so tiny that is does not have a single # full aligned AA -> discard here continue # actually get the prjctOrf sequence aaseq = prjctOrf.getaas(abs_pos_start=start,abs_pos_end=stop) # initialize a PacbP for the combination of both tinyexons # afterwards, check if the indentityscore is > 0.XX from pacb import PacbP seqparts = [ preceding_intron.shared_aa, exon1.proteinsequence(), intron.shared_aa, exon2.proteinsequence(), subsequent_intron.shared_aa ] ################################################################ if verbose or len("".join(seqparts)) != len(aaseq): print pacbporfD print exon1.orf, exon2.orf, prjctOrf print pacbporfA print seqparts print aaseq, len(aaseq), len("".join(seqparts)), (start,stop) print "'%s'" % queryorsbjct, print "Q", (algDobj.query_pos, algAobj.query_pos), print "S", (algDobj.sbjct_pos, algAobj.sbjct_pos) print "distance:", distance, kwargs['max_tinyexon_nt_length'], print (posDsbjct, posAsbjct), print "Q-dna:", ( algDobj.query_dna_start, dPhase, algAobj.query_dna_start, aPhase ), print "S-dna:", ( algDobj.sbjct_dna_start, dPhase, algAobj.sbjct_dna_start, aPhase ) ################################################################ # ignore by continue when sequences not identical in length if len("".join(seqparts)) != len(aaseq): continue testpacbp = PacbP(input=( "".join(seqparts), aaseq, 0, 0) ) testpacbp.strip_unmatched_ends() if not ( testpacbp.identityscore > 0.60 and\ (float(testpacbp.length) / len(aaseq)) > 0.70 ): # not a very convincing alignment continue 
################################################################ if verbose: print testpacbp testpacbp.print_protein() ################################################################ # if here, succesfully mapped 2 tiny exons!! # get all sequences/coordinates in place for # pacbporf formation orfQ1 = exon1.orf orfS1 = prjctOrf orfQ2 = exon2.orf orfS2 = prjctOrf seqQ1 = exon1.proteinsequence() seqQ2 = exon2.proteinsequence() coordQ1 = exon1.acceptor.pos / 3 coordS1 = start coordQ2 = exon2.acceptor.pos / 3 coordS2 = start + len(seqparts[0]) + len(seqparts[1]) + len(seqparts[2]) seqS1 = aaseq[0:(len(seqparts[0])+len(seqparts[1]))] seqS2 = aaseq[-(len(seqparts[3])+len(seqparts[4])):] if len(seqparts[0]): seqS1 = seqS1[1:] coordS1 += 1 if len(seqparts[4]): seqS2 = seqS2[:-1] if queryorsbjct == "sbjct": # swap query <-> sbjct orfQ1,orfS1 = orfS1,orfQ1 orfQ2,orfS2 = orfS2,orfQ2 seqQ1,seqS1 = seqS1,seqQ1 seqQ2,seqS2 = seqS2,seqQ2 coordQ1,coordS1 = coordS1,coordQ1 coordQ2,coordS2 = coordS2,coordQ2 ################################################################ if verbose: print "tinypacbporf1:", seqQ1, seqQ2, coordQ1, coordQ2 print "tinypacbporf2:", seqS1, seqS2, coordS1, coordS2 ################################################################ # make pacbporfs pacbp1 = PacbP(input=( seqQ1, seqS1, coordQ1, coordS1) ) pacbp1.strip_unmatched_ends() tinypacbporf1 = pacbp2pacbporf(pacbp1,orfQ1,orfS1) tinypacbporf1.extend_pacbporf_after_stops() pacbp2 = PacbP(input=( seqQ2, seqS2, coordQ2, coordS2) ) pacbp2.strip_unmatched_ends() tinypacbporf2 = pacbp2pacbporf(pacbp2,orfQ2,orfS2) tinypacbporf2.extend_pacbporf_after_stops() ################################################################ if verbose: print tinypacbporf1 tinypacbporf1.print_protein_and_dna() print tinypacbporf2 tinypacbporf2.print_protein_and_dna() ################################################################ ################################################################ # set some meta-data properties to the 
intron objects ################################################################ # add distance score to intron preceding_intron._distance = 0 intron._distance = 0 subsequent_intron._distance = 0 # add Alignment Positional Periphery Score into objects if queryorsbjct == "query": succes = set_apps_intron_query(preceding_intron,pacbporfD,tinypacbporf1) succes = set_apps_intron_query(intron,tinypacbporf1,tinypacbporf2) succes = set_apps_intron_query(subsequent_intron,tinypacbporf2,pacbporfA) else: succes = set_apps_intron_sbjct(preceding_intron,pacbporfD,tinypacbporf1) succes = set_apps_intron_sbjct(intron,tinypacbporf1,tinypacbporf2) succes = set_apps_intron_sbjct(subsequent_intron,tinypacbporf2,pacbporfA) # set GFF fsource attribute for recognition of intron sources preceding_intron._gff['fsource'] = "ABGPprojectingTE" intron._gff['fsource'] = "ABGPprojectingTE" subsequent_intron._gff['fsource'] = "ABGPprojectingTE" # create _linked_to_xxx attributes preceding_intron._linked_to_pacbporfs = [ tinypacbporf1, tinypacbporf2 ] intron._linked_to_pacbporfs = [ tinypacbporf1, tinypacbporf2 ] subsequent_intron._linked_to_pacbporfs = [ tinypacbporf1, tinypacbporf2 ] preceding_intron._linked_to_introns = [ intron,subsequent_intron ] intron._linked_to_introns = [ preceding_intron,subsequent_intron ] subsequent_intron._linked_to_introns = [ intron,preceding_intron ] ################################################################ # append to results ################################################################ results.append( ( preceding_intron, intron, subsequent_intron, tinypacbporf1, tinypacbporf2, ) ) # return 3 introns and 2 intermediate tinyexon PacbPORFs (per row) return results
def _merge_pacbporfs_by_two_tinyexons(pacbporfD, pacbporfA, orfSetObject, queryorsbjct, verbose=False, **kwargs): """ """ # edit **kwargs dictionary for some forced attributes _update_kwargs(kwargs, KWARGS_PROJECTED_TINYEXON) tinyexons = [] sposD = pacbporfD._get_original_alignment_pos_start() eposD = pacbporfD._get_original_alignment_pos_end() sposA = pacbporfA._get_original_alignment_pos_start() eposA = pacbporfA._get_original_alignment_pos_end() if queryorsbjct == "query": donorOrf = pacbporfD.orfQ accepOrf = pacbporfA.orfQ prjctOrf = pacbporfD.orfS dStart, dEnd = sposD.query_dna_start, eposD.query_dna_end aStart, aEnd = sposA.query_dna_start, eposA.query_dna_end elif queryorsbjct == "sbjct": donorOrf = pacbporfD.orfS accepOrf = pacbporfA.orfS prjctOrf = pacbporfD.orfQ dStart, dEnd = sposD.sbjct_dna_start, eposD.sbjct_dna_end aStart, aEnd = sposA.sbjct_dna_start, eposA.sbjct_dna_end else: message = "'queryorsbjct' (%s), not 'query' or 'sbjct'" % queryorsbjct raise InproperlyAppliedArgument, message # get all potential combinations of two tinyexons tinyexoncombis = merge_orfs_with_two_tinyexons( donorOrf, accepOrf, donorOrf._donor_sites, accepOrf._acceptor_sites, orfSetObject.orfs, ) results = [] for dObj in donorOrf._donor_sites: if queryorsbjct == "query": (dPos, dPhase) = pacbporfD.dnaposition_query(dObj.pos, forced_return=True) else: (dPos, dPhase) = pacbporfD.dnaposition_sbjct(dObj.pos, forced_return=True) try: algDobj = pacbporfD._positions[dPos] except IndexError: # site out of range of PacbPORF -> break break # check if dObj is on pfD; # introns of tinyexons can be projected outside of pfD/pfA area if dObj.pos < dStart: continue for aObj in accepOrf._acceptor_sites: if queryorsbjct == "query": (aPos, aPhase) = pacbporfA.dnaposition_query(aObj.pos, forced_return=True) else: (aPos, aPhase) = pacbporfA.dnaposition_sbjct(aObj.pos, forced_return=True) try: algAobj = pacbporfA._positions[aPos] except IndexError: # site out of range of PacbPORF -> break break # 
check if aObj is on pfA; # introns of tinyexons can be projected outside of pfD/pfA area if aObj.pos > aEnd: continue if queryorsbjct == "query": posDsbjct = algDobj.sbjct_dna_start + dPhase posAsbjct = algAobj.sbjct_dna_start + aPhase else: posDsbjct = algDobj.query_dna_start + dPhase posAsbjct = algAobj.query_dna_start + aPhase distance = posAsbjct - posDsbjct if distance >= (kwargs['max_tinyexon_nt_length'] * 2): break if distance < (kwargs['min_tinyexon_nt_length'] * 2): continue filtered_tinyexoncombis = _filter_tinyexoncombis( tinyexoncombis, min_length=distance, max_length=distance, min_first_acceptor_pos=dObj.pos + kwargs['min_tinyexon_intron_nt_length'], max_final_donor_pos=aObj.pos - kwargs['min_tinyexon_intron_nt_length'], phase_final_donor=aObj.phase, phase_first_acceptor=dObj.phase, ) if not filtered_tinyexoncombis: continue #################################################################### if verbose: print distance, dObj, aObj, len(tinyexoncombis), print len(filtered_tinyexoncombis) #################################################################### for exon1, intron, exon2 in filtered_tinyexoncombis: # make preceding intron preceding_intron = IntronConnectingOrfs( dObj, exon1.acceptor, None, donorOrf, exon1.orf) # make subsequent intron subsequent_intron = IntronConnectingOrfs( exon2.donor, aObj, None, exon2.orf, accepOrf) ################################################################ if verbose: print "\t", exon1, exon1.proteinsequence(), print preceding_intron.phase, exon1.donor.phase, print subsequent_intron.phase, preceding_intron.shared_aa, print intron.shared_aa, subsequent_intron.shared_aa print "\t", exon2, exon2.proteinsequence() ################################################################ # get prjctOrf sequence for comparison correctionA = 0 if aObj.phase != 0: # INCLUDE the final AA which is broken by the splicesite correctionA = 1 if queryorsbjct == "query": startPos, _phase = pacbporfD.dnaposition_query( dObj.pos, 
forced_return=True) stopPos, _phase = pacbporfA.dnaposition_query( aObj.pos, forced_return=True) start = pacbporfD._positions[startPos].sbjct_pos stop = pacbporfA._positions[stopPos].sbjct_pos + correctionA else: startPos, _phase = pacbporfD.dnaposition_sbjct( dObj.pos, forced_return=True) stopPos, _phase = pacbporfA.dnaposition_sbjct( aObj.pos, forced_return=True) start = pacbporfD._positions[startPos].query_pos stop = pacbporfA._positions[stopPos].query_pos + correctionA if stop <= start: # tinyexon is so tiny that is does not have a single # full aligned AA -> discard here continue # actually get the prjctOrf sequence aaseq = prjctOrf.getaas(abs_pos_start=start, abs_pos_end=stop) # initialize a PacbP for the combination of both tinyexons # afterwards, check if the indentityscore is > 0.XX from pacb import PacbP seqparts = [ preceding_intron.shared_aa, exon1.proteinsequence(), intron.shared_aa, exon2.proteinsequence(), subsequent_intron.shared_aa ] ################################################################ if verbose or len("".join(seqparts)) != len(aaseq): print pacbporfD print exon1.orf, exon2.orf, prjctOrf print pacbporfA print seqparts print aaseq, len(aaseq), len("".join(seqparts)), (start, stop) print "'%s'" % queryorsbjct, print "Q", (algDobj.query_pos, algAobj.query_pos), print "S", (algDobj.sbjct_pos, algAobj.sbjct_pos) print "distance:", distance, kwargs[ 'max_tinyexon_nt_length'], print(posDsbjct, posAsbjct), print "Q-dna:", (algDobj.query_dna_start, dPhase, algAobj.query_dna_start, aPhase), print "S-dna:", (algDobj.sbjct_dna_start, dPhase, algAobj.sbjct_dna_start, aPhase) ################################################################ # ignore by continue when sequences not identical in length if len("".join(seqparts)) != len(aaseq): continue testpacbp = PacbP(input=("".join(seqparts), aaseq, 0, 0)) testpacbp.strip_unmatched_ends() if not ( testpacbp.identityscore > 0.60 and\ (float(testpacbp.length) / len(aaseq)) > 0.70 ): # not a very 
convincing alignment continue ################################################################ if verbose: print testpacbp testpacbp.print_protein() ################################################################ # if here, succesfully mapped 2 tiny exons!! # get all sequences/coordinates in place for # pacbporf formation orfQ1 = exon1.orf orfS1 = prjctOrf orfQ2 = exon2.orf orfS2 = prjctOrf seqQ1 = exon1.proteinsequence() seqQ2 = exon2.proteinsequence() coordQ1 = exon1.acceptor.pos / 3 coordS1 = start coordQ2 = exon2.acceptor.pos / 3 coordS2 = start + len(seqparts[0]) + len(seqparts[1]) + len( seqparts[2]) seqS1 = aaseq[0:(len(seqparts[0]) + len(seqparts[1]))] seqS2 = aaseq[-(len(seqparts[3]) + len(seqparts[4])):] if len(seqparts[0]): seqS1 = seqS1[1:] coordS1 += 1 if len(seqparts[4]): seqS2 = seqS2[:-1] if queryorsbjct == "sbjct": # swap query <-> sbjct orfQ1, orfS1 = orfS1, orfQ1 orfQ2, orfS2 = orfS2, orfQ2 seqQ1, seqS1 = seqS1, seqQ1 seqQ2, seqS2 = seqS2, seqQ2 coordQ1, coordS1 = coordS1, coordQ1 coordQ2, coordS2 = coordS2, coordQ2 ################################################################ if verbose: print "tinypacbporf1:", seqQ1, seqQ2, coordQ1, coordQ2 print "tinypacbporf2:", seqS1, seqS2, coordS1, coordS2 ################################################################ # make pacbporfs pacbp1 = PacbP(input=(seqQ1, seqS1, coordQ1, coordS1)) pacbp1.strip_unmatched_ends() tinypacbporf1 = pacbp2pacbporf(pacbp1, orfQ1, orfS1) tinypacbporf1.extend_pacbporf_after_stops() pacbp2 = PacbP(input=(seqQ2, seqS2, coordQ2, coordS2)) pacbp2.strip_unmatched_ends() tinypacbporf2 = pacbp2pacbporf(pacbp2, orfQ2, orfS2) tinypacbporf2.extend_pacbporf_after_stops() ################################################################ if verbose: print tinypacbporf1 tinypacbporf1.print_protein_and_dna() print tinypacbporf2 tinypacbporf2.print_protein_and_dna() ################################################################ 
################################################################ # set some meta-data properties to the intron objects ################################################################ # add distance score to intron preceding_intron._distance = 0 intron._distance = 0 subsequent_intron._distance = 0 # add Alignment Positional Periphery Score into objects if queryorsbjct == "query": succes = set_apps_intron_query(preceding_intron, pacbporfD, tinypacbporf1) succes = set_apps_intron_query(intron, tinypacbporf1, tinypacbporf2) succes = set_apps_intron_query(subsequent_intron, tinypacbporf2, pacbporfA) else: succes = set_apps_intron_sbjct(preceding_intron, pacbporfD, tinypacbporf1) succes = set_apps_intron_sbjct(intron, tinypacbporf1, tinypacbporf2) succes = set_apps_intron_sbjct(subsequent_intron, tinypacbporf2, pacbporfA) # set GFF fsource attribute for recognition of intron sources preceding_intron._gff['fsource'] = "ABGPprojectingTE" intron._gff['fsource'] = "ABGPprojectingTE" subsequent_intron._gff['fsource'] = "ABGPprojectingTE" # create _linked_to_xxx attributes preceding_intron._linked_to_pacbporfs = [ tinypacbporf1, tinypacbporf2 ] intron._linked_to_pacbporfs = [tinypacbporf1, tinypacbporf2] subsequent_intron._linked_to_pacbporfs = [ tinypacbporf1, tinypacbporf2 ] preceding_intron._linked_to_introns = [ intron, subsequent_intron ] intron._linked_to_introns = [ preceding_intron, subsequent_intron ] subsequent_intron._linked_to_introns = [ intron, preceding_intron ] ################################################################ # append to results ################################################################ results.append(( preceding_intron, intron, subsequent_intron, tinypacbporf1, tinypacbporf2, )) # return 3 introns and 2 intermediate tinyexon PacbPORFs (per row) return results
def merge_pacbporfs_with_query_intron_bridgeing(pacbporfD, pacbporfA, verbose=False, **kwargs): """ Merge query Orfs in PacbPORF by **best** intron @attention: see orfs.merge_orfs_with_intron for **kwargs @type pacbporfD: PacbPORF object @param pacbporfD: PacbPORF object that has to deliver PSSM donor objects @type pacbporfA: PacbPORF object @param pacbporfA: PacbPORF object that has to deliver PSSM acceptor objects @type verbose: Boolean @param verbose: print status/debugging messages to STDOUT @rtype: list @return: list with ( intron, intron ), in query and sbjct """ # input validation IsPacbPORF(pacbporfD) IsPacbPORF(pacbporfA) # edit **kwargs dictionary for some forced attributes _update_kwargs(kwargs, KWARGS_MAPPED_INTRON) if not kwargs.has_key('aligned_site_max_triplet_distance'): kwargs['aligned_site_max_triplet_distance'] = kwargs['max_aa_offset'] # calculate maximal/minimal donor/acceptor site position based on alignment ELEGIABLE_SPLICE_SITE_AA_RANGE = 75 qdr = pacbporfD.alignment_dna_range_query() qar = pacbporfA.alignment_dna_range_query() min_donor_query_pos = max( [min(qdr), max(qdr) - (ELEGIABLE_SPLICE_SITE_AA_RANGE * 3)]) max_accep_query_pos = min( [max(qar), min(qar) + (ELEGIABLE_SPLICE_SITE_AA_RANGE * 3)]) # get list of introns intronlist = merge_orfs_with_intron(pacbporfD.orfQ, pacbporfA.orfQ, min_donor_pos=min_donor_query_pos, max_acceptor_pos=max_accep_query_pos, **kwargs) # filter on entropy # settings for minimal alignment entropy score if min([pacbporfD.identityscore, pacbporfA.identityscore]) > 0.55: min_donor_site_entropy = 0.01 min_acceptor_site_entropy = 0.01 intronlist = _filter_introns_on_entropy( intronlist, pacbporfD, pacbporfA, min_donor_site_entropy=min_donor_site_entropy, min_acceptor_site_entropy=min_acceptor_site_entropy) else: # do not filter, but do not forget to store apps data to intron(s) for intron in intronlist: succes = set_apps_intron_query(intron, pacbporfD, pacbporfA) for intron in intronlist: intron._distance = 0 # 
?? # set GFF fsource attribute for recognition of intron sources intron._gff['fsource'] = 'ABGPbridgeing' # get unique list of donors & acceptors donor = olba(list(Set([intron.donor for intron in intronlist])), order_by='pos') accep = olba(list(Set([intron.acceptor for intron in intronlist])), order_by='pos') ############################################################################ if verbose: print "dQ1", [d.pos for d in donor], "aQ1", [a.pos for a in accep] ############################################################################ intronlist = _filter_introns_on_pssm_entropy_combination(intronlist) # get unique list of donors & acceptors donor = olba(list(Set([intron.donor for intron in intronlist])), order_by='pos') accep = olba(list(Set([intron.acceptor for intron in intronlist])), order_by='pos') ############################################################################ if verbose: print "dQ1", [d.pos for d in donor], "aQ1", [a.pos for a in accep] ############################################################################ filtered_intron_list = [] for intron in intronlist: intron.assign_bp_and_ppts() if intron.branchpoint and (intron.ppt5p or intron.ppt3p): filtered_intron_list.append(intron) else: pass # check if list is emptied due to branchpoint filtering # in that case, filter for either branchpoint OR polyppt if not filtered_intron_list and intronlist: for intron in intronlist: if intron.branchpoint or (intron.ppt5p or intron.ppt3p): filtered_intron_list.append(intron) # return list of filtered introns return filtered_intron_list
def merge_pacbporfs_with_introns(pacbporfD,pacbporfA,verbose=False,**kwargs): """ Merge 2 PacbPORF objects by introns @attention: see orfs.merge_orfs_with_intron for **kwargs @attention: see functions._filter_for_alignable_splice_sites for **kwargs @attention: see functions._filter_for_entropy for **kwargs @type pacbporfD: PacbPORF object @param pacbporfD: PacbPORF object that has to deliver PSSM donor objects @type pacbporfA: PacbPORF object @param pacbporfA: PacbPORF object that has to deliver PSSM acceptor objects @type verbose: Boolean @param verbose: print status/debugging messages to STDOUT @rtype: list @return: list with ( intron, intron ), in query and sbjct """ # input validation IsPacbPORF(pacbporfD) IsPacbPORF(pacbporfA) # edit **kwargs dictionary for some forced attributes _update_kwargs(kwargs,KWARGS_MAPPED_INTRON) if not kwargs.has_key('aligned_site_max_triplet_distance'): kwargs['aligned_site_max_triplet_distance'] = kwargs['max_aa_offset'] # settings for minimal alignment entropy score min_donor_site_alignment_entropy = 0.0 min_acceptor_site_alignment_entropy = 0.0 # calculate maximal/minimal donor/acceptor site position based on alignment ELEGIABLE_SPLICE_SITE_AA_RANGE = 75 qdr = pacbporfD.alignment_dna_range_query() qar = pacbporfA.alignment_dna_range_query() min_donor_query_pos = max([ min(qdr), max(qdr)-(ELEGIABLE_SPLICE_SITE_AA_RANGE*3) ]) max_accep_query_pos = min([ max(qar), min(qar)+(ELEGIABLE_SPLICE_SITE_AA_RANGE*3) ]) sdr = pacbporfD.alignment_dna_range_sbjct() sar = pacbporfA.alignment_dna_range_sbjct() min_donor_sbjct_pos = max([ min(sdr), max(sdr)-(ELEGIABLE_SPLICE_SITE_AA_RANGE*3) ]) max_accep_sbjct_pos = min([ max(sar), min(sar)+(ELEGIABLE_SPLICE_SITE_AA_RANGE*3) ]) # get list of introns #intronsQ = merge_orfs_with_intron(pacbporfD.orfQ,pacbporfA.orfQ, # min_donor_pos =min_donor_query_pos, # max_acceptor_pos=max_accep_query_pos,**kwargs) #intronsS = merge_orfs_with_intron(pacbporfD.orfS,pacbporfA.orfS, # min_donor_pos 
=min_donor_sbjct_pos, # max_acceptor_pos=max_accep_sbjct_pos,**kwargs) # get list of introns intronsQ = merge_orfs_with_intron(pacbporfD.orfQ,pacbporfA.orfQ,**kwargs) intronsS = merge_orfs_with_intron(pacbporfD.orfS,pacbporfA.orfS,**kwargs) # get unique list of donors & acceptors donorQ = olba( list(Set([inQ.donor for inQ in intronsQ ])), order_by='pos') donorS = olba( list(Set([inS.donor for inS in intronsS ])), order_by='pos') accepQ = olba( list(Set([inQ.acceptor for inQ in intronsQ ])), order_by='pos') accepS = olba( list(Set([inS.acceptor for inS in intronsS ])), order_by='pos') ############################################################################ if verbose: print "dQ1", [ d.pos for d in donorQ ], "aQ1", [ a.pos for a in accepQ ] print "dS1", [ d.pos for d in donorS ], "aS1", [ a.pos for a in accepS ] ############################################################################ # filter for alignable donor & acceptor sites kwargs['allow_non_canonical'] = kwargs['allow_non_canonical_donor'] algdonors = _filter_for_alignable_splice_sites(donorQ,donorS,pacbporfD,**kwargs) kwargs['allow_non_canonical'] = kwargs['allow_non_canonical_acceptor'] algacceps = _filter_for_alignable_splice_sites(accepQ,accepS,pacbporfA,**kwargs) ############################################################################ if verbose: print "dQ2", [ _dq.pos for (_dq,_ds) in algdonors ], print "aQ2", [ _aq.pos for (_aq,_as) in algacceps ] print "dS2", [ _ds.pos for (_dq,_ds) in algdonors ], print "aS2", [ _as.pos for (_aq,_as) in algacceps ] ############################################################################ # remove sites with to low alignment entropy algdonors = _filter_for_entropy(algdonors,pacbporfD,'donor', min_alignment_entropy=min_donor_site_alignment_entropy) algacceps = _filter_for_entropy(algacceps,pacbporfA,'acceptor', min_alignment_entropy=min_acceptor_site_alignment_entropy) ############################################################################ if verbose: 
print "dQ3", [ _dq.pos for (_dq,_ds) in algdonors ], print "aQ3", [ _aq.pos for (_aq,_as) in algacceps ] print "dS3", [ _ds.pos for (_dq,_ds) in algdonors ], print "aS3", [ _as.pos for (_aq,_as) in algacceps ] ############################################################################ # make unique position lists for quick lookup in intron lists dQpl = Set([ dQ.pos for dQ,dS in algdonors ]) dSpl = Set([ dS.pos for dQ,dS in algdonors ]) aQpl = Set([ aQ.pos for aQ,aS in algacceps ]) aSpl = Set([ aS.pos for aQ,aS in algacceps ]) # check exterior boundaries of PacbPORFs sposD = pacbporfD._get_original_alignment_pos_start() eposD = pacbporfD._get_original_alignment_pos_end() sposA = pacbporfA._get_original_alignment_pos_start() eposA = pacbporfA._get_original_alignment_pos_end() # now make list of aligable introns algintrons = [] for intQ in intronsQ: # check if intron falls within the PacbPORF aligned area if intQ.donor.pos <= sposD.query_dna_start: continue if intQ.acceptor.pos >= eposA.query_dna_end: continue if intQ.donor.pos in dQpl and intQ.acceptor.pos in aQpl: # Query intron occurs in list of alignable splice sites! for intS in intronsS: # check if intron falls within the PacbPORF aligned area if intS.donor.pos <= sposD.sbjct_dna_start: continue if intS.acceptor.pos >= eposA.sbjct_dna_end: continue if intS.donor.pos in dSpl and intS.acceptor.pos in aSpl: # Sbjct intron occurs as well in alignable splice sites! if (intQ.donor,intS.donor) in algdonors and\ (intQ.acceptor,intS.acceptor) in algacceps: # Sbjct & Query Donor & Acceptor are alignable! 
algintrons.append( ( intQ, intS ) ) ############################################################################ # set some meta-data properties to the intron objects ############################################################################ for intQ,intS in algintrons: distDnt = pacbporfD.get_distance_aligned_nucleotide_positions( query = intQ.donor.pos, sbjct = intS.donor.pos ) distAnt = pacbporfA.get_distance_aligned_nucleotide_positions( query = intQ.acceptor.pos, sbjct = intS.acceptor.pos ) # final distance check. kwargs['aligned_site_max_triplet_distance'] # is applied on donor and acceptor site. This distance measured on the # protein sequence can be DOUBLED in case distDnt / distAnt are # opposite (+ and -). Check here if the protein sequence gap is # as well <= kwargs['aligned_site_max_triplet_distance']. if abs(distAnt - distDnt) > kwargs['aligned_site_max_triplet_distance']*3: continue # add distance score to introns intQ._distance = abs(distDnt) + abs(distAnt) intS._distance = abs(distDnt) + abs(distAnt) # add Alignment Positional Periphery Score into objects succes = set_apps_intron_query(intQ,pacbporfD,pacbporfA) succes = set_apps_intron_sbjct(intS,pacbporfD,pacbporfA) # set GFF fsource attribute for recognition of intron sources intQ._gff['fsource'] = "ABGPmapping" intS._gff['fsource'] = "ABGPmapping" ######################################################################## if verbose: # some printing.... print "Aligned introns:", ( intQ.donor.pos, intQ.acceptor.pos ) , print ( intS.donor.pos, intS.acceptor.pos ), print "DIST:", distDnt, distAnt, print "[%s]" % kwargs['aligned_site_max_triplet_distance'], print "ENTROPY: %1.2f %1.2f" % (intQ._apps_donor, intQ._apps_accep), print "PSSM: (%1.2f %1.2f) (%1.2f %1.2f)" % ( intQ.donor.pssm_score, intS.donor.pssm_score, intQ.acceptor.pssm_score, intS.acceptor.pssm_score, ) ######################################################################## # return lists of aligned introns return algintrons
def merge_pacbporfs_by_tinyexons(pacbporfD,pacbporfA, orfSetObjQ,orfSetObjS,verbose=False,**kwargs): """ """ # input validation IsPacbPORF(pacbporfD) IsPacbPORF(pacbporfA) # edit **kwargs dictionary for some forced attributes _update_kwargs(kwargs,KWARGS_MAPPED_INTRON) if not kwargs.has_key('aligned_site_max_triplet_distance'): kwargs['aligned_site_max_triplet_distance'] = kwargs['max_aa_offset'] # settings for minimal alignment entropy score min_donor_site_alignment_entropy = 0.0 min_acceptor_site_alignment_entropy = 0.0 resultlistQ = merge_orfs_with_tinyexon( pacbporfD.orfQ,pacbporfA.orfQ, preceding_donor_sites=pacbporfD.orfQ._donor_sites, subsequent_acceptor_sites=pacbporfA.orfQ._acceptor_sites, orflist=orfSetObjQ.orfs,**kwargs) resultlistS = merge_orfs_with_tinyexon( pacbporfD.orfS,pacbporfA.orfS, preceding_donor_sites=pacbporfD.orfS._donor_sites, subsequent_acceptor_sites=pacbporfA.orfS._acceptor_sites, orflist=orfSetObjS.orfs,**kwargs) # translate resultlists to dict: key == exon, value = [ {intronsD},{intronsS} ] resultdictQ,key2exonQ = _tinyexon_list_2_dict(resultlistQ) resultdictS,key2exonS = _tinyexon_list_2_dict(resultlistS) # get unique list of donors & acceptors donorQ = olba( list(Set([inD.donor for inD,te,inA in resultlistQ ])), order_by='pos') donorS = olba( list(Set([inD.donor for inD,te,inA in resultlistS ])), order_by='pos') accepQ = olba( list(Set([inA.acceptor for inD,te,inA in resultlistQ ])), order_by='pos') accepS = olba( list(Set([inA.acceptor for inD,te,inA in resultlistS ])), order_by='pos') ## filter for alignable donor & acceptor sites kwargs['allow_non_canonical'] = True # True kwargs['aligned_site_max_triplet_distance'] = 0 # 2 algdonors = _filter_for_alignable_splice_sites(donorQ,donorS,pacbporfD,**kwargs) algacceps = _filter_for_alignable_splice_sites(accepQ,accepS,pacbporfA,**kwargs) # settings for minimal alignment entropy score # TODO TODO -> THIS MUST BE FIXED TO A NICE THRESHOLD VALUE!!! 
min_donor_site_alignment_entropy = 0.1 min_acceptor_site_alignment_entropy = 0.1 # remove sites with to low alignment entropy algdonors = _filter_for_entropy(algdonors,pacbporfD,'donor', min_alignment_entropy=min_donor_site_alignment_entropy) algacceps = _filter_for_entropy(algacceps,pacbporfA,'acceptor', min_alignment_entropy=min_acceptor_site_alignment_entropy) # return list: intronQD,intronSD,tinyexon,intronAQ,intronAS return_list = [] ############################################################################ if verbose: print "bridges constructed: ORFS:", print (pacbporfD.orfQ.id,pacbporfA.orfQ.id), print (pacbporfD.orfS.id,pacbporfA.orfS.id), print len(resultdictQ), len(resultdictS), print ( len(resultlistQ), len(donorQ), len(accepQ) ), print ( len(resultlistS), len(donorS), len(accepS) ), print ( len(algdonors), len(algacceps) ) ############################################################################ for keyQ,tinyexonQ in key2exonQ.iteritems(): for keyS,tinyexonS in key2exonS.iteritems(): if tinyexonQ.donor.phase != tinyexonS.donor.phase: continue if tinyexonQ.acceptor.phase != tinyexonS.acceptor.phase: continue if tinyexonQ.length != tinyexonS.length: continue # if here, then tinyexons of identical structure #################################################################### if verbose: print tinyexonQ.length, tinyexonQ.donor.phase, print ( len(resultdictQ[keyQ][0]), len(resultdictQ[keyQ][1]) ), print ( len(resultdictS[keyS][0]), len(resultdictS[keyS][1]) ), print tinyexonQ, print tinyexonQ.proteinsequence(), tinyexonS.proteinsequence(), print tinyexonS.acceptor.pssm_score + tinyexonS.donor.pssm_score #################################################################### donor_introns = [] acceptor_introns = [] for intronDQkey, intronDQ in resultdictQ[keyQ][0].iteritems(): if intronDQ.donor.pos not in [ dQ.pos for dQ,dS in algdonors ]: continue for intronDSkey, intronDS in resultdictS[keyS][0].iteritems(): if intronDS.donor.pos not in [ dS.pos for dQ,dS 
in algdonors ]: continue # check if they exists as aligned sites alignedkey = ( intronDQ.donor.pos, intronDS.donor.pos ) if alignedkey not in [ (dQ.pos, dS.pos) for dQ,dS in algdonors ]: continue # if here, we have a set of introns 5' of the tinyexon # which are perfectly alignable! donor_introns.append((intronDQ,intronDS)) for intronAQkey, intronAQ in resultdictQ[keyQ][1].iteritems(): if intronAQ.acceptor.pos not in [ aQ.pos for aQ,aS in algacceps ]: continue for intronASkey, intronAS in resultdictS[keyS][1].iteritems(): if intronAS.acceptor.pos not in [ aS.pos for aQ,aS in algacceps ]: continue # check if they exists as aligned sites alignedkey = ( intronAQ.acceptor.pos, intronAS.acceptor.pos ) if alignedkey not in [ (aQ.pos, aS.pos) for aQ,aS in algacceps ]: continue # if here, we have a set of introns 3' of the tinyexon # which are perfectly alignable! acceptor_introns.append((intronAQ,intronAS)) if not len(donor_introns) or not len(acceptor_introns): # no aligned 5' && aligned 3' introns continue # initialize extended tinyexon PacbPORF from pacb import PacbP pacbp = PacbP(input=( tinyexonQ.proteinsequence(), tinyexonS.proteinsequence(), tinyexonQ.protein_start(), tinyexonS.protein_start(), ) ) pacbp.strip_unmatched_ends() # continue if no fraction could be aligned if len(pacbp) == 0: continue tinypacbporf = pacbp2pacbporf(pacbp,tinyexonQ.orf,tinyexonS.orf) tinypacbporf.extend_pacbporf_after_stops() #################################################################### if verbose: print tinypacbporf tinypacbporf.print_protein_and_dna() print len(donor_introns), len(acceptor_introns), print max([ dQ.donor.pssm_score+dS.donor.pssm_score for dQ,dS in donor_introns]), print max([ aQ.acceptor.pssm_score+aS.acceptor.pssm_score for aQ,aS in acceptor_introns]) #################################################################### # if here, we have accepted tinyexon bridges! 
# gather them and store to return_list for intronDQkey, intronDQ in resultdictQ[keyQ][0].iteritems(): if intronDQ.donor.pos not in [ dQ.pos for dQ,dS in algdonors ]: continue for intronDSkey, intronDS in resultdictS[keyS][0].iteritems(): if intronDS.donor.pos not in [ dS.pos for dQ,dS in algdonors ]: continue for intronAQkey, intronAQ in resultdictQ[keyQ][1].iteritems(): if intronAQ.acceptor.pos not in [ aQ.pos for aQ,aS in algacceps ]: continue for intronASkey, intronAS in resultdictS[keyS][1].iteritems(): if intronAS.acceptor.pos not in [ aS.pos for aQ,aS in algacceps ]: continue #################################################### # set some meta-data properties to the intron objects #################################################### _score_introns_obtained_by_mapping( intronDQ,intronDS,pacbporfD, tinypacbporf,source='ABGPmappingTE') _score_introns_obtained_by_mapping( intronAQ,intronAS,tinypacbporf, pacbporfA,source='ABGPmappingTE') # create _linked_to_xxx attributes intronDQ._linked_to_pacbporfs = [ tinypacbporf ] intronAQ._linked_to_pacbporfs = [ tinypacbporf ] intronDS._linked_to_pacbporfs = [ tinypacbporf ] intronAS._linked_to_pacbporfs = [ tinypacbporf ] intronDQ._linked_to_introns = [ intronAQ ] intronAQ._linked_to_introns = [ intronDQ ] intronDS._linked_to_introns = [ intronAS ] intronAS._linked_to_introns = [ intronDS ] # append to tmp result list return_list.append( (intronDQ,intronDS,tinypacbporf,intronAQ,intronAS) ) # check if there are >1 candidate tiny exons # currently, we choose only to return the **best** mapped tinyexon if len(return_list) == 0: pass elif len(return_list) == 1: pass else: # only take the highest scoring candidate here min_distance = min([ (a._distance+d._distance) for a,b,c,d,e in return_list ]) pos2score = [] for (intronDQ,intronDS,tinypacbporf,intronAQ,intronAS) in return_list: if (intronDQ._distance + intronAQ._distance) > min_distance: pos2score.append( 0.0 ) else: # calculate overall pssm score total_pssm = 0.0 
total_pssm += intronDQ.donor.pssm_score total_pssm += intronDQ.acceptor.pssm_score total_pssm += intronDS.donor.pssm_score total_pssm += intronDS.acceptor.pssm_score total_pssm += intronAQ.donor.pssm_score total_pssm += intronAQ.acceptor.pssm_score total_pssm += intronAS.donor.pssm_score total_pssm += intronAS.acceptor.pssm_score pos2score.append( total_pssm ) # get highest score and linked tinyexon max_score = max(pos2score) return_list = [ return_list[pos2score.index(max_score)] ] ############################################################################ # some printing in verbose mode if verbose and return_list: (intronDQ,intronDS,tinypacbporf,intronAQ,intronAS) = return_list[0] print "BEST MAPPED TINYEXON:" print tinypacbporf print tinypacbporf.query, intronDQ._distance, intronAQ._distance, print ( intronDQ.donor.pos, intronDQ.acceptor.pos ), print ( intronDS.donor.pos, intronDS.acceptor.pos ), print ( intronAQ.donor.pos, intronAQ.acceptor.pos ), print ( intronAS.donor.pos, intronAS.acceptor.pos ) ############################################################################ # return the result list return return_list
def merge_pacbporfs_with_closeby_independant_introns(pacbporfD,pacbporfA, verbose=False,**kwargs): """ Merge 2 PacbPORF objects by closeby independant gained introns @attention: see pacb.connecting.merge_orfs_with_intron for **kwargs) @type pacbporfD: PacbPORF object @param pacbporfD: PacbPORF object that has to deliver PSSM donor objects @type pacbporfA: PacbPORF object @param pacbporfA: PacbPORF object that has to deliver PSSM acceptor objects @type verbose: Boolean @param verbose: print status/debugging messages to STDOUT @rtype: list @return: list with ( intronQ, intronS, CIGexonPacbPORF ) """ # input validation IsPacbPORF(pacbporfD) IsPacbPORF(pacbporfA) # edit **kwargs dictionary for some forced attributes kwargs['allow_phase_shift'] = True _update_kwargs(kwargs,KWARGS_CLOSEBY_INDEPENDANT_INTRON_GAIN) if not kwargs.has_key('aligned_site_max_triplet_distance'): kwargs['aligned_site_max_triplet_distance'] = kwargs['cig_max_aa_length'] # run regular merge_pacbporfs_with_introns function alg_introns = merge_pacbporfs_with_introns(pacbporfD,pacbporfA,verbose=verbose,**kwargs) cig_introns = [] if verbose: print "introns::", len(alg_introns), "cig_max_aa_length:", kwargs['cig_max_aa_length'], kwargs['aligned_site_max_triplet_distance'] # check if there is length congruence between the cig_introns for intQ,intS in alg_introns: dQpos, dQphase = pacbporfD.dnaposition_query(intQ.donor.pos,forced_return=True) dSpos, dSphase = pacbporfD.dnaposition_sbjct(intS.donor.pos,forced_return=True) aQpos, aQphase = pacbporfA.dnaposition_query(intQ.acceptor.pos,forced_return=True) aSpos, aSphase = pacbporfA.dnaposition_sbjct(intS.acceptor.pos,forced_return=True) distDnt = (dQpos*3 + dQphase) - (dSpos*3 + dSphase) distAnt = (aQpos*3 + aQphase) - (aSpos*3 + aSphase) ######################################################################## if verbose: print (intQ.donor.pos, intQ.acceptor.pos), print (intS.donor.pos, intS.acceptor.pos), print distDnt, distAnt, kwargs['max_nt_offset'] 
######################################################################## if abs(distDnt-distAnt) > kwargs['max_nt_offset']: # intermediate ciigPacbPORF has query vs sbjct length discrepancy # *3 for AA2nt coordinate conversion, +2 to allow different phases # e.g. phase difference can give 1AA+2nt difference continue if intQ.donor.phase == intS.donor.phase and\ (distDnt/3) <= kwargs['aligned_site_max_triplet_distance']: # a regularly merged intron combination continue if intQ.acceptor.phase == intS.acceptor.phase and\ (distAnt/3) <= kwargs['aligned_site_max_triplet_distance']: # a regularly merged intron combination continue if abs(distDnt) <= 5 or abs(distDnt) <= 5: # most likely a splice site phase shift, not a c.i.g. continue if abs(distDnt/3) >= kwargs['cig_min_aa_length'] and\ abs(distAnt/3) >= kwargs['cig_min_aa_length'] and\ abs(distDnt/3) <= kwargs['cig_max_aa_length'] and\ abs(distAnt/3) <= kwargs['cig_max_aa_length']: # putatively a closeby independant (intron) gain cig_introns.append( ( intQ, intS ) ) ############################################################################ if verbose: for intQ,intS in cig_introns: print "cig?:", (intQ.donor.pos, intQ.acceptor.pos), print (intS.donor.pos, intS.acceptor.pos) ############################################################################ # return variable to store found positive cases of CIG into found_cig_list = [] # check if there is some sequence similarity for intQ,intS in cig_introns: # get alignment positions around query & sbjcts splice sites dQpos, dQphase = pacbporfD.dnaposition_query(intQ.donor.pos,forced_return=True) dSpos, dSphase = pacbporfD.dnaposition_sbjct(intS.donor.pos,forced_return=True) aQpos, aQphase = pacbporfA.dnaposition_query(intQ.acceptor.pos,forced_return=True) aSpos, aSphase = pacbporfA.dnaposition_sbjct(intS.acceptor.pos,forced_return=True) distD = dQpos - dSpos distA = aQpos - aSpos distDnt = (dQpos*3 + dQphase) - (dSpos*3 + dSphase) distAnt = (aQpos*3 + aQphase) - (aSpos*3 + 
aSphase) if distDnt > 0: # then, distAnt is as well > 0 # QUERY is extended on the donor side #mode = "SQ" #qStart = pacbporfD._positions[dSpos].query_pos #qEnd = qStart + distD #sStart = pacbporfA._positions[aSpos].sbjct_pos #sEnd = sStart + distD #qSeq = pacbporfD.orfQ.getaas(abs_pos_start=qStart,abs_pos_end=qEnd) #sSeq = pacbporfA.orfS.getaas(abs_pos_start=sStart,abs_pos_end=sEnd) mode = "SQ" qEnd = pacbporfD.orfQ.dnapos2aapos(intQ.donor.pos) qStart= qEnd - max([distA,distD]) sStart= pacbporfA.orfS.dnapos2aapos(intS.acceptor.pos) sEnd = sStart + max([distA,distD]) qSeq = pacbporfD.orfQ.getaas(abs_pos_start=qStart,abs_pos_end=qEnd) sSeq = pacbporfA.orfS.getaas(abs_pos_start=sStart,abs_pos_end=sEnd) else: # distDnt and distAnt are < 0 ## SBJCT is extended on the donor site #mode = "QS" #qStart = pacbporfA._positions[aQpos].query_pos #qEnd = qStart - distA #sStart = pacbporfD._positions[dQpos].sbjct_pos #sEnd = sStart - distA #qSeq = pacbporfA.orfQ.getaas(abs_pos_start=qStart, abs_pos_end=qEnd) #sSeq = pacbporfD.orfS.getaas(abs_pos_start=sStart, abs_pos_end=sEnd) mode = "QS" qStart= pacbporfA.orfQ.dnapos2aapos(intQ.acceptor.pos) qEnd = qStart - min([distA,distD]) sEnd = pacbporfD.orfS.dnapos2aapos(intS.donor.pos) sStart= sEnd + min([distA,distD]) qSeq = pacbporfA.orfQ.getaas(abs_pos_start=qStart,abs_pos_end=qEnd) sSeq = pacbporfD.orfS.getaas(abs_pos_start=sStart,abs_pos_end=sEnd) headerQ = "query_%s_%s_%s" % (qStart,qEnd,qSeq) headerS = "sbjct_%s_%s_%s" % (sStart,sEnd,sSeq) headerQ = headerQ[0:20] # truncate to prevent error headerS = headerS[0:20] # truncate to prevent error if verbose: print mode, (distD,distA), qSeq, sSeq, headerQ, headerS, distDnt, distAnt, print dQpos, aQpos, dSpos, aSpos if not qSeq: continue # superfluous check-doublecheck for sequence if not sSeq: continue # superfluous check-doublecheck for sequence #################################################### # make PacbPORF with ClustalW #################################################### # 
align the sequences with clustalw seqs = { headerQ: qSeq, headerS: sSeq } (alignedseqs,alignment) = clustalw(seqs=seqs) # make pacbp from clustalw alignment pacbp = pacbp_from_clustalw( alignment=( alignedseqs[headerQ], alignment, alignedseqs[headerS] ), coords=(qStart,qEnd,sStart,sEnd) ) if not pacbp: continue # strip unaligned fraction of this pacbp object, then check length pacbp.strip_unmatched_ends() if len(pacbp) < kwargs['cig_min_aa_length']: continue if len(pacbp) > kwargs['cig_max_aa_length']: continue if pacbp: # initialize extended tiny PacbPORF caused by c.i.g. if distDnt > 0: cig_pacbporf = pacbp2pacbporf(pacbp,pacbporfD.orfQ,pacbporfA.orfS) else: cig_pacbporf = pacbp2pacbporf(pacbp,pacbporfA.orfQ,pacbporfD.orfS) cig_pacbporf.extend_pacbporf_after_stops() #################################################################### if verbose: print pacbp, len(pacbp) print cig_pacbporf print "CIG:", intQ print "CIG:", intS print distD, distA, distDnt, distAnt cig_pacbporf.print_protein_and_dna() #################################################################### #################################################################### # set some meta-data properties to the intron objects #################################################################### # add distance score to introns # The distance set in merge_pacbporfs_with_introns is large; # it is the actual distance between the splice sites. 
In CIG, # the measure for distance is the length difference between # the offset between query and sbjct measured on the cig_pacbporf intQ._distance = abs(distDnt-distAnt) intS._distance = abs(distDnt-distAnt) if distDnt > 0: # then, distAnt is as well > 0 # QUERY is extended on the donor side # add Alignment Positional Periphery Score into objects succes = set_apps_intron_query(intQ,cig_pacbporf,pacbporfA) succes = set_apps_intron_sbjct(intS,pacbporfD,cig_pacbporf) else: # SBJCT is extended on the donor side # add Alignment Positional Periphery Score into objects succes = set_apps_intron_query(intQ,pacbporfD,cig_pacbporf) succes = set_apps_intron_sbjct(intS,cig_pacbporf,pacbporfA) # set GFF fsource attribute for recognition of intron sources intQ._gff['fsource'] = "ABGPcig" intS._gff['fsource'] = "ABGPcig" # create _linked_to_xxx attributes intQ._linked_to_pacbporfs = [ cig_pacbporf ] intS._linked_to_pacbporfs = [ cig_pacbporf ] # append to found_cig_list found_cig_list.append( ( intQ, intS, cig_pacbporf ) ) else: # no alignment possible -> try next continue # return lists of closeby_independant_introns return found_cig_list
def merge_pacbporfs( pacbporfD, pacbporfA, queryOrfSetObj, sbjctOrfSetObj, allow_query_projecting=True, allow_sbjct_projecting=True, allow_query_mapping=True, allow_sbjct_mapping=True, allow_projecting=True, allow_mapping=True, verbose=False, ): """ Merge 2 PacbPORF objects with an interface into a gene structure @type pacbporfD: PacbPORF object @param pacbporfD: PacbPORF object that has to deliver PSSM donor objects @type pacbporfA: PacbPORF object @param pacbporfA: PacbPORF object that has to deliver PSSM acceptor objects @type verbose: Boolean @param verbose: print status/debugging messages to STDOUT @rtype: list @return: list with ( intron, intron ), in query and sbjct """ # input validation IsPacbPORF(pacbporfD) IsPacbPORF(pacbporfA) # edit/create **kwargs dictionary for some forced attributes kwargs = {} _update_kwargs(kwargs, KWARGS_SPLICESITES) # deal with allow_xxx attributes if not allow_projecting: allow_query_projecting = False allow_sbjct_projecting = False if not allow_mapping: allow_query_mapping = False allow_sbjct_mapping = False # check if Orf objects of PacbPORFS are identical queryOrfsIdentical = pacbporfD.orfQ.id == pacbporfA.orfQ.id sbjctOrfsIdentical = pacbporfD.orfS.id == pacbporfA.orfS.id # return data structure of introns introns = {"query": [], "sbjct": []} # Scan Orfs for splice sites. 
# This has probably been performed before, but when not done, # cached donor & acceptor sites lists seems to be empty -> no introns pacbporfD.orfQ.scan_orf_for_pssm_splice_sites( splicetype="donor", min_pssm_score=kwargs["min_donor_pssm_score"], allow_non_canonical=kwargs["allow_non_canonical_donor"], non_canonical_min_pssm_score=kwargs["non_canonical_min_donor_pssm_score"], ) pacbporfD.orfS.scan_orf_for_pssm_splice_sites( splicetype="donor", min_pssm_score=kwargs["min_donor_pssm_score"], allow_non_canonical=kwargs["allow_non_canonical_donor"], non_canonical_min_pssm_score=kwargs["non_canonical_min_donor_pssm_score"], ) pacbporfA.orfQ.scan_orf_for_pssm_splice_sites( splicetype="acceptor", min_pssm_score=kwargs["min_acceptor_pssm_score"], allow_non_canonical=kwargs["allow_non_canonical_acceptor"], non_canonical_min_pssm_score=kwargs["non_canonical_min_acceptor_pssm_score"], ) pacbporfA.orfS.scan_orf_for_pssm_splice_sites( splicetype="acceptor", min_pssm_score=kwargs["min_acceptor_pssm_score"], allow_non_canonical=kwargs["allow_non_canonical_acceptor"], non_canonical_min_pssm_score=kwargs["non_canonical_min_acceptor_pssm_score"], ) if not queryOrfsIdentical and not sbjctOrfsIdentical: introns1 = merge_pacbporfs_with_introns(pacbporfD, pacbporfA) # filter for **best** candidates based on PSSM/entropy combination introns1 = _filter_aligned_introns_on_pssm_entropy_combination(introns1) if ( pacbporfD.gap_ratio_score() < PACBPORF_HIGH_GAP_RATIO_THRESHOLD and pacbporfA.gap_ratio_score() < PACBPORF_HIGH_GAP_RATIO_THRESHOLD ): introns2 = merge_pacbporfs_with_closeby_independant_introns(pacbporfD, pacbporfA) introns3 = merge_pacbporfs_with_phase_shift_introns(pacbporfD, pacbporfA) introns4 = merge_pacbporfs_by_tinyexons(pacbporfD, pacbporfA, queryOrfSetObj, sbjctOrfSetObj) introns5 = merge_pacbporfs_by_query_tinyexon_and_sbjct_intron(pacbporfD, pacbporfA, queryOrfSetObj) introns6 = merge_pacbporfs_by_sbjct_tinyexon_and_query_intron(pacbporfD, pacbporfA, sbjctOrfSetObj) 
introns7 = merge_pacbporfs_by_sbjct_equal_length_exon_and_query_intron(pacbporfD, pacbporfA, sbjctOrfSetObj) introns8 = merge_pacbporfs_by_query_equal_length_exon_and_sbjct_intron(pacbporfD, pacbporfA, queryOrfSetObj) else: # do not allow more complex intron merging introns2 = {} introns3 = {} introns4 = {} introns5 = {} introns6 = {} introns7 = {} introns8 = {} introns9 = merge_pacbporfs_with_conserved_acceptor_introns(pacbporfD, pacbporfA) # filter for **best** candidates based on PSSM/entropy combination introns9 = _filter_aligned_introns_on_pssm_entropy_combination(introns9) introns10 = merge_pacbporfs_with_conserved_donor_introns(pacbporfD, pacbporfA) # filter for **best** candidates based on PSSM/entropy combination introns10 = _filter_aligned_introns_on_pssm_entropy_combination(introns10) # store introns obtained by most simplest case projecting/mapping introns["query"].extend(Set([intrQ for (intrQ, intrS) in introns1])) introns["sbjct"].extend(Set([intrS for (intrQ, intrS) in introns1])) # only store introns from intron2 that are NOT encountered already in introns1 keysQ = [(intron.donor.pos, intron.acceptor.pos) for intron in introns["query"]] keysS = [(intron.donor.pos, intron.acceptor.pos) for intron in introns["sbjct"]] for (intrQ, intrS, cigpacbp) in introns2: k1 = (intrQ.donor.pos, intrQ.acceptor.pos) k2 = (intrS.donor.pos, intrS.acceptor.pos) if k1 not in keysQ and k2 not in keysS: introns["query"].append(intrQ) introns["sbjct"].append(intrS) # only store introns from intron3 that are NOT encountered already in introns1 keysQ = [(intron.donor.pos, intron.acceptor.pos) for intron in introns["query"]] keysS = [(intron.donor.pos, intron.acceptor.pos) for intron in introns["sbjct"]] for (intrQ, intrS) in introns3: k1 = (intrQ.donor.pos, intrQ.acceptor.pos) k2 = (intrS.donor.pos, intrS.acceptor.pos) if k1 not in keysQ and k2 not in keysS: introns["query"].append(intrQ) introns["sbjct"].append(intrS) # only store introns from intron4 that are NOT 
encountered already in introns1 keysQ = [(intron.donor.pos, intron.acceptor.pos) for intron in introns["query"]] keysS = [(intron.donor.pos, intron.acceptor.pos) for intron in introns["sbjct"]] for (intrQ, intrS, pacbporf, intrQ2, intrS2) in introns4: k1 = (intrQ.donor.pos, intrQ.acceptor.pos) k2 = (intrS.donor.pos, intrS.acceptor.pos) k3 = (intrQ2.donor.pos, intrQ2.acceptor.pos) k4 = (intrS2.donor.pos, intrS2.acceptor.pos) if k1 not in keysQ and k2 not in keysS and k3 not in keysQ and k4 not in keysS: introns["query"].append(intrQ) introns["sbjct"].append(intrS) introns["query"].append(intrQ2) introns["sbjct"].append(intrS2) # only store introns from intron5 that are NOT encountered already in introns1 keysQ = [(intron.donor.pos, intron.acceptor.pos) for intron in introns["query"]] keysS = [(intron.donor.pos, intron.acceptor.pos) for intron in introns["sbjct"]] for (intrQ, intrS, pacbporf, intrQ2, intrS2) in introns4: if intrQ: k1 = (intrQ.donor.pos, intrQ.acceptor.pos) else: k1 = None if intrS: k2 = (intrS.donor.pos, intrS.acceptor.pos) else: k2 = None if intrQ2: k3 = (intrQ2.donor.pos, intrQ2.acceptor.pos) else: k3 = None if intrS2: k4 = (intrS2.donor.pos, intrS2.acceptor.pos) else: k4 = None if k1 not in keysQ and k2 not in keysS and k3 not in keysQ and k4 not in keysS: introns["query"].append(intrQ) introns["sbjct"].append(intrS) introns["query"].append(intrQ2) introns["sbjct"].append(intrS2) # only store introns from intron6 that are NOT encountered already in introns1 keysQ = [(intron.donor.pos, intron.acceptor.pos) for intron in introns["query"]] keysS = [(intron.donor.pos, intron.acceptor.pos) for intron in introns["sbjct"]] for (intrQ, intrS, pacbporf, intrQ2, intrS2) in introns6: if intrQ: k1 = (intrQ.donor.pos, intrQ.acceptor.pos) else: k1 = None if intrS: k2 = (intrS.donor.pos, intrS.acceptor.pos) else: k2 = None if intrQ2: k3 = (intrQ2.donor.pos, intrQ2.acceptor.pos) else: k3 = None if intrS2: k4 = (intrS2.donor.pos, intrS2.acceptor.pos) else: k4 = 
None if k1 not in keysQ and k2 not in keysS and k3 not in keysQ and k4 not in keysS: introns["query"].append(intrQ) introns["sbjct"].append(intrS) introns["query"].append(intrQ2) introns["sbjct"].append(intrS2) # remove the 'None' in introns['sbjct'] due to latest addition while None in introns["query"]: introns["query"].remove(None) while None in introns["sbjct"]: introns["sbjct"].remove(None) # only store introns from intron7 that are NOT encountered already in introns1 keysQ = [(intron.donor.pos, intron.acceptor.pos) for intron in introns["query"]] keysS = [(intron.donor.pos, intron.acceptor.pos) for intron in introns["sbjct"]] for (intrS, pacbporf1, intrQ, pacbporf2, intrS2) in introns7: k1 = (intrQ.donor.pos, intrQ.acceptor.pos) k2 = (intrS.donor.pos, intrS.acceptor.pos) k3 = (intrS2.donor.pos, intrS2.acceptor.pos) if k1 not in keysQ and k2 not in keysS and k3 not in keysS: introns["query"].append(intrQ) introns["sbjct"].append(intrS) introns["sbjct"].append(intrS2) # only store introns from intron8 that are NOT encountered already in introns1 keysQ = [(intron.donor.pos, intron.acceptor.pos) for intron in introns["query"]] keysS = [(intron.donor.pos, intron.acceptor.pos) for intron in introns["sbjct"]] for (intrQ, pacbporf1, intrS, pacbporf2, intrQ2) in introns8: k1 = (intrQ.donor.pos, intrQ.acceptor.pos) k2 = (intrS.donor.pos, intrS.acceptor.pos) k3 = (intrQ2.donor.pos, intrQ2.acceptor.pos) if k1 not in keysQ and k2 not in keysS and k3 not in keysQ: introns["query"].append(intrQ) introns["query"].append(intrQ2) introns["sbjct"].append(intrS) # only store introns from introns9 that are NOT encountered already in introns1 keysQ = [(intron.donor.pos, intron.acceptor.pos) for intron in introns["query"]] keysS = [(intron.donor.pos, intron.acceptor.pos) for intron in introns["sbjct"]] for (intrQ, intrS) in introns9: k1 = (intrQ.donor.pos, intrQ.acceptor.pos) k2 = (intrS.donor.pos, intrS.acceptor.pos) if k1 == (2163, 2283): print "STRACC", k1, intrQ, k1 not in keysQ 
print "STRACC", k1, intrS, k2 not in keysS # do NOT check if any of the introns is present yet; # allow addition of each of these if k1 not in keysQ: introns["query"].append(intrQ) if k2 not in keysS: introns["sbjct"].append(intrS) # only store introns from introns10 that are NOT encountered already in introns1 keysQ = [(intron.donor.pos, intron.acceptor.pos) for intron in introns["query"]] keysS = [(intron.donor.pos, intron.acceptor.pos) for intron in introns["sbjct"]] for (intrQ, intrS) in introns10: k1 = (intrQ.donor.pos, intrQ.acceptor.pos) k2 = (intrS.donor.pos, intrS.acceptor.pos) if k1 == (1642, 1858): print "STRDON", k1, intrQ, k1 not in keysQ print "STRDON", k1, intrS, k2 not in keysS # do NOT check if any of the introns is present yet; # allow addition of each of these if k1 not in keysQ: introns["query"].append(intrQ) if k2 not in keysS: introns["sbjct"].append(intrS) # finally, do the bridging thingy introns0 = merge_pacbporfs_with_query_intron_bridgeing(pacbporfD, pacbporfA) # only store introns from introns0 that are NOT encountered already in introns1 keysQ = [(intron.donor.pos, intron.acceptor.pos) for intron in introns["query"]] for intrQ in introns0: if intrQ.coords() not in keysQ: introns["query"].append(intrQ) # introns['query'].extend([ intrQ for (intrQ,intrS) in introns1 ] ) # introns['query'].extend([ intrQ for (intrQ,intrS,cigpacbp) in introns2 ] ) # introns['query'].extend([ intrQ for (intrQ,intrS) in introns3 ] ) # introns['query'].extend([ intrQ for (intrQ,a,b,c,d) in introns4 ] ) # introns['query'].extend([ intrQ for (a,b,c,intrQ,d) in introns4 ] ) # introns['query'].extend([ intrQ for (intrQ,a,b,c,d) in introns5 ] ) # introns['query'].extend([ intrQ for (a,b,c,intrQ,d) in introns5 ] ) # introns['sbjct'].extend([ intrS for (intrQ,intrS) in introns1 ] ) # introns['sbjct'].extend([ intrS for (intrQ,intrS,cigpacbp) in introns2 ] ) # introns['sbjct'].extend([ intrS for (intrQ,intrS) in introns3 ] ) # introns['sbjct'].extend([ intrS for 
(a,intrS,b,c,d) in introns4 ] ) # introns['sbjct'].extend([ intrS for (a,b,c,d,intrS) in introns4 ] ) # introns['sbjct'].extend([ intrS for (a,intrS,b,c,d) in introns5 ] ) # introns['sbjct'].extend([ intrS for (a,b,c,d,intrS) in introns5 ] ) # remove the 'None' in introns['sbjct'] due to latest addition while None in introns["query"]: introns["query"].remove(None) while None in introns["sbjct"]: introns["sbjct"].remove(None) elif not queryOrfsIdentical: seqerror = merge_pacbporf_with_sequenceerror_in_query(pacbporfD, pacbporfA) introns1 = merge_pacbporfs_by_intron_in_query(pacbporfD, pacbporfA) if ( pacbporfD.gap_ratio_score() < PACBPORF_HIGH_GAP_RATIO_THRESHOLD and pacbporfA.gap_ratio_score() < PACBPORF_HIGH_GAP_RATIO_THRESHOLD ): introns2 = merge_pacbporfs_by_intron_tinyexon_intron_in_query(pacbporfD, pacbporfA, queryOrfSetObj) introns3 = merge_pacbporfs_by_two_tinyexons_in_query(pacbporfD, pacbporfA, queryOrfSetObj) else: # do not allow more complex intron merging introns2 = {} introns3 = {} # store sequencerror if it exists if seqerror: introns["query"].append(seqerror) # store introns obtained by most simplest case projecting/mapping introns["query"].extend([prj.projected_introns[0] for prj in introns1]) # only store introns from intron2 that are NOT encountered already in introns1 keys = [(intron.donor.pos, intron.acceptor.pos) for intron in introns["query"]] for (intr1, intr2, exon) in introns2: k1 = (intr1.donor.pos, intr1.acceptor.pos) k2 = (intr2.donor.pos, intr2.acceptor.pos) if k1 not in keys and k2 not in keys: introns["query"].append(intr1) introns["query"].append(intr2) # only store introns from intron2 that are NOT encountered already in introns1 keys = [(intron.donor.pos, intron.acceptor.pos) for intron in introns["query"]] for (intr1, intr2, intr3, exon1, exon2) in introns3: k1 = (intr1.donor.pos, intr1.acceptor.pos) k2 = (intr2.donor.pos, intr2.acceptor.pos) k3 = (intr3.donor.pos, intr3.acceptor.pos) if k1 not in keys and k2 not in keys and k3 
not in keys: introns["query"].append(intr1) introns["query"].append(intr2) introns["query"].append(intr3) if not introns["query"] and allow_sbjct_mapping and allow_query_mapping: # just bridge Orfs by **best** intron(s). introns0 = merge_pacbporfs_with_query_intron_bridgeing(pacbporfD, pacbporfA) # potential stopless 3n intron in SBJCT introns1 = merge_pacbporfs_with_introns(pacbporfD, pacbporfA) # filter for **best** candidates based on PSSM/entropy combination introns1 = _filter_aligned_introns_on_pssm_entropy_combination(introns1) # apply stopless3n intron filtering introns1 = _filter_aligned_stopless_3n_introns(introns1) introns2 = merge_pacbporfs_with_closeby_independant_introns(pacbporfD, pacbporfA) if ( pacbporfD.gap_ratio_score() < PACBPORF_HIGH_GAP_RATIO_THRESHOLD and pacbporfA.gap_ratio_score() < PACBPORF_HIGH_GAP_RATIO_THRESHOLD ): introns3 = merge_pacbporfs_with_phase_shift_introns(pacbporfD, pacbporfA) # filter for **best** candidates based on PSSM/entropy combination introns3 = _filter_aligned_introns_on_pssm_entropy_combination(introns3) # apply stopless3n intron filtering introns3 = _filter_aligned_stopless_3n_introns(introns3) else: # do not allow more complex intron merging introns3 = {} # only store introns from that are NOT encountered already keys = [intron.coords() for intron in introns["query"]] for intrQ, intrS in introns1: if intrQ.coords() not in keys: introns["query"].append(intrQ) keys = [intron.coords() for intron in introns["query"]] for (intrQ, intrS, cigpacbp) in introns2: if intrQ.coords() not in keys: introns["query"].append(intrQ) keys = [intron.coords() for intron in introns["query"]] for intrQ, intrS in introns3: if intrQ.coords() not in keys: introns["query"].append(intrQ) keys = [intron.coords() for intron in introns["query"]] for intron in introns0: if intron.coords() not in keys: introns["query"].append(intron) keys = [intron.coords() for intron in introns["query"]] keys = [intron.coords() for intron in introns["sbjct"]] for 
intrQ, intrS in introns1: if intrS.coords() not in keys: introns["query"].append(intrS) keys = [intron.coords() for intron in introns["sbjct"]] for (intrQ, intrS, cigpacbp) in introns2: if intrS.coords() not in keys: introns["query"].append(intrS) keys = [intron.coords() for intron in introns["sbjct"]] for intrQ, intrS in introns3: if intrS.coords() not in keys: introns["query"].append(intrS) keys = [intron.coords() for intron in introns["sbjct"]] elif not introns["query"]: # just bridge Orfs by **best** intron(s). introns0 = merge_pacbporfs_with_query_intron_bridgeing(pacbporfD, pacbporfA) # only store introns from that are NOT encountered already keys = [intron.coords() for intron in introns["query"]] for intron in introns0: if intron.coords() not in keys: introns["query"].append(intron) else: # projecting introns yielded results; do not try mapping pass elif not sbjctOrfsIdentical: introns1 = merge_pacbporfs_by_intron_in_sbjct(pacbporfD, pacbporfA) if ( pacbporfD.gap_ratio_score() < PACBPORF_HIGH_GAP_RATIO_THRESHOLD and pacbporfA.gap_ratio_score() < PACBPORF_HIGH_GAP_RATIO_THRESHOLD ): introns2 = merge_pacbporfs_by_intron_tinyexon_intron_in_sbjct(pacbporfD, pacbporfA, sbjctOrfSetObj) introns3 = merge_pacbporfs_by_two_tinyexons_in_sbjct(pacbporfD, pacbporfA, sbjctOrfSetObj) else: # do not allow more complex intron merging introns2 = {} introns3 = {} # store introns obtained by most simplest case projecting/mapping introns["sbjct"].extend([prj.projected_introns[0] for prj in introns1]) # only store introns from intron2 that are NOT encountered already in introns1 keys = [(intron.donor.pos, intron.acceptor.pos) for intron in introns["sbjct"]] for (intr1, intr2, exon) in introns2: k1 = (intr1.donor.pos, intr1.acceptor.pos) k2 = (intr2.donor.pos, intr2.acceptor.pos) if k1 not in keys and k2 not in keys: introns["sbjct"].append(intr1) introns["sbjct"].append(intr2) # only store introns from intron2 that are NOT encountered already in introns1 keys = 
[(intron.donor.pos, intron.acceptor.pos) for intron in introns["sbjct"]] for (intr1, intr2, intr3, exon1, exon2) in introns3: k1 = (intr1.donor.pos, intr1.acceptor.pos) k2 = (intr2.donor.pos, intr2.acceptor.pos) k3 = (intr3.donor.pos, intr3.acceptor.pos) if k1 not in keys and k2 not in keys and k3 not in keys: introns["sbjct"].append(intr1) introns["sbjct"].append(intr2) introns["sbjct"].append(intr3) if not introns["sbjct"] and allow_sbjct_mapping and allow_query_mapping: # potential stopless 3n intron in QUERY introns1 = merge_pacbporfs_with_introns(pacbporfD, pacbporfA) # filter for **best** candidates based on PSSM/entropy combination introns1 = _filter_aligned_introns_on_pssm_entropy_combination(introns1) # apply stopless3n intron filtering introns1 = _filter_aligned_stopless_3n_introns(introns1) introns2 = merge_pacbporfs_with_closeby_independant_introns(pacbporfD, pacbporfA) if ( pacbporfD.gap_ratio_score() < PACBPORF_HIGH_GAP_RATIO_THRESHOLD and pacbporfA.gap_ratio_score() < PACBPORF_HIGH_GAP_RATIO_THRESHOLD ): introns3 = merge_pacbporfs_with_phase_shift_introns(pacbporfD, pacbporfA) # filter for **best** candidates based on PSSM/entropy combination introns3 = _filter_aligned_introns_on_pssm_entropy_combination(introns3) # apply stopless3n intron filtering introns3 = _filter_aligned_stopless_3n_introns(introns3) else: # do not allow more complex intron merging introns3 = {} # store introns introns["query"].extend(Set([intrQ for (intrQ, intrS) in introns1])) introns["sbjct"].extend(Set([intrS for (intrQ, intrS) in introns1])) introns["query"].extend([intrQ for (intrQ, intrS, cigpacbp) in introns2]) introns["query"].extend([intrQ for (intrQ, intrS) in introns3]) introns["sbjct"].extend([intrS for (intrQ, intrS, cigpacbp) in introns2]) introns["sbjct"].extend([intrS for (intrQ, intrS) in introns3]) else: # projecting introns yielded results; do not try mapping pass elif queryOrfsIdentical and sbjctOrfsIdentical: if allow_query_mapping: introns1 = 
merge_pacbporfs_by_inframe_intron_in_query(pacbporfD, pacbporfA) else: # no mapping (unigene or continious alignment provided) introns1 = [] if allow_sbjct_mapping: introns2 = merge_pacbporfs_by_inframe_intron_in_sbjct(pacbporfD, pacbporfA) else: # no mapping (unigene or continious alignment provided) introns2 = [] if allow_sbjct_mapping and allow_query_mapping: introns3 = merge_pacbporfs_with_introns(pacbporfD, pacbporfA) # filter for **best** candidates based on PSSM/entropy combination introns3 = _filter_aligned_introns_on_pssm_entropy_combination(introns3) # apply stopless3n intron filtering introns3 = _filter_aligned_stopless_3n_introns(introns3) else: # no mapping (unigene or continious alignment provided) introns3 = [] # introns4 = merge_pacbporfs_with_closeby_independant_introns( # pacbporfD,pacbporfA) # introns5 = merge_pacbporfs_with_phase_shift_introns( # pacbporfD,pacbporfA) introns["query"].extend([prj.projected_introns[0] for prj in introns1]) introns["sbjct"].extend([prj.projected_introns[0] for prj in introns2]) introns["query"].extend([intrQ for (intrQ, intrS) in introns3]) introns["sbjct"].extend([intrS for (intrQ, intrS) in introns3]) else: # none of these cases; allow_projecting or allow_mapping == False! pass # Filter for stopless3n introns introns["query"] = _filter_stopless_3n_introns(introns["query"]) introns["sbjct"] = _filter_stopless_3n_introns(introns["sbjct"]) # return list of introns return introns
def merge_orfs_with_two_tinyexons(preceding_orf, subsequent_orf,
        preceding_donor_sites=[],
        subsequent_acceptor_sites=[],
        orflist=[], **kwargs):
    """
    Bridge two `neighbouring` Orfs by TWO tinyexons by applying preceding
    donors and subsequent acceptors

    @type  preceding_orf: Orf object
    @param preceding_orf: Orf object that contains preceding_donor_site(s)

    @type  subsequent_orf: Orf object
    @param subsequent_orf: Orf object that contains subsequent_acceptor_site(s)

    @type  preceding_donor_sites: list
    @param preceding_donor_sites: list with SpliceDonorGT and/or SpliceDonor
            objects; in this function only their `pos` attribute is read, to
            obtain the lowest donor position

    @type  subsequent_acceptor_sites: list
    @param subsequent_acceptor_sites: list with SpliceAcceptorAG and/or
            SpliceAcceptor objects; in this function only their `pos`
            attribute is read, to obtain the highest acceptor position

    @type  orflist: list
    @param orflist: list with Orf objects that are scanned for tinyexons

    @attention: see get_potential_tiny_exons_on_orf for additional **kwargs
    @attention: the default (empty) lists are only read, never modified
    @attention: an empty list is returned directly when any of
            preceding_donor_sites, subsequent_acceptor_sites or orflist
            is empty

    @rtype:  list
    @return: list of tuples ( tinyexon1, central_intron, tinyexon2 ), sorted
            ascending on the summed length of both tinyexons
    """
    # nothing to bridge when donors, acceptors or candidate Orfs are missing
    if not preceding_donor_sites:
        return []
    if not subsequent_acceptor_sites:
        return []
    if not orflist:
        return []

    # edit **kwargs dictionary for some forced attributes
    _update_kwargs(kwargs, KWARGS_PROJECTED_TINYEXON)

    # outermost splice site positions; tinyexons are only searched on Orfs
    # that overlap the window in between, shrunk on both sides by the
    # minimal intron length
    min_preceding_donor_sites_pos = min(
            [donor.pos for donor in preceding_donor_sites])
    max_subsequent_acceptor_sites_pos = max(
            [acceptor.pos for acceptor in subsequent_acceptor_sites])
    min_intron_nt_length = kwargs['min_tinyexon_intron_nt_length']
    min_pos = min_preceding_donor_sites_pos + min_intron_nt_length
    max_pos = max_subsequent_acceptor_sites_pos - min_intron_nt_length

    # gather all potential tinyexons on the Orfs inside this window
    tinyexoncollection = []
    for orfX in orflist:
        # Orf is not correctly positioned towards the splice sites' extremes;
        # do not check this Orf
        if orfX.endPY <= min_pos:
            continue
        if orfX.startPY >= max_pos:
            continue
        # extend the tinyexoncollection
        tinyexoncollection.extend(
                get_potential_tiny_exons_on_orf(orfX, **kwargs))

    # make tinyexoncollection ordered on start pos
    tinyexoncollection = _order_intron_list(tinyexoncollection,
            order_by='donor_pos')
    # donor_pos makes REVERSE ordering; restore this by reversing
    tinyexoncollection.reverse()

    # make 2-elemented combinations of tinyexons which can co-occur together
    tinyexoncombis = []
    for tinyexon1 in tinyexoncollection:
        # walk the collection from its end towards its start and stop at the
        # first tinyexon whose donor lies before the donor of tinyexon1
        # (presumably all remaining ones do as well, given the ordering
        # on donor position above)
        for tinyexon2 in reversed(tinyexoncollection):
            if tinyexon2.donor.pos < tinyexon1.donor.pos:
                break
            intron_length = tinyexon2.acceptor.pos - tinyexon1.donor.pos
            if intron_length < min_intron_nt_length:
                continue
            if intron_length > kwargs['max_tinyexon_intron_nt_length']:
                continue
            # donor and acceptor of the central intron must agree in phase
            if tinyexon1.donor.phase != tinyexon2.acceptor.phase:
                continue
            # if here, eligible combi!
            # NOTE(review): the central intron is created with preceding_orf
            # and subsequent_orf, not with the Orfs the two tinyexons are
            # located on -- confirm that this is intended
            intron = IntronConnectingOrfs(
                    tinyexon1.donor, tinyexon2.acceptor,
                    get_shared_nucleotides_at_splicesite(
                            subsequent_orf, preceding_orf,
                            tinyexon2.acceptor, tinyexon1.donor),
                    preceding_orf, subsequent_orf)
            totlen = tinyexon1.length + tinyexon2.length
            tinyexoncombis.append((totlen, tinyexon1, intron, tinyexon2))

    # return an ordered list based on summed tinyexon length; the tuples are
    # compared element-wise, so combis of equal length are ordered by
    # comparing the tinyexon / intron objects themselves
    tinyexoncombis.sort()
    return [(exon1, intron, exon2)
            for (totlen, exon1, intron, exon2) in tinyexoncombis]