def merge_pacbporfs_with_query_intron_bridgeing(pacbporfD, pacbporfA, verbose=False, **kwargs): """ Merge query Orfs in PacbPORF by **best** intron @attention: see orfs.merge_orfs_with_intron for **kwargs @type pacbporfD: PacbPORF object @param pacbporfD: PacbPORF object that has to deliver PSSM donor objects @type pacbporfA: PacbPORF object @param pacbporfA: PacbPORF object that has to deliver PSSM acceptor objects @type verbose: Boolean @param verbose: print status/debugging messages to STDOUT @rtype: list @return: list with ( intron, intron ), in query and sbjct """ # input validation IsPacbPORF(pacbporfD) IsPacbPORF(pacbporfA) # edit **kwargs dictionary for some forced attributes _update_kwargs(kwargs, KWARGS_MAPPED_INTRON) if not kwargs.has_key('aligned_site_max_triplet_distance'): kwargs['aligned_site_max_triplet_distance'] = kwargs['max_aa_offset'] # calculate maximal/minimal donor/acceptor site position based on alignment ELEGIABLE_SPLICE_SITE_AA_RANGE = 75 qdr = pacbporfD.alignment_dna_range_query() qar = pacbporfA.alignment_dna_range_query() min_donor_query_pos = max( [min(qdr), max(qdr) - (ELEGIABLE_SPLICE_SITE_AA_RANGE * 3)]) max_accep_query_pos = min( [max(qar), min(qar) + (ELEGIABLE_SPLICE_SITE_AA_RANGE * 3)]) # get list of introns intronlist = merge_orfs_with_intron(pacbporfD.orfQ, pacbporfA.orfQ, min_donor_pos=min_donor_query_pos, max_acceptor_pos=max_accep_query_pos, **kwargs) # filter on entropy # settings for minimal alignment entropy score if min([pacbporfD.identityscore, pacbporfA.identityscore]) > 0.55: min_donor_site_entropy = 0.01 min_acceptor_site_entropy = 0.01 intronlist = _filter_introns_on_entropy( intronlist, pacbporfD, pacbporfA, min_donor_site_entropy=min_donor_site_entropy, min_acceptor_site_entropy=min_acceptor_site_entropy) else: # do not filter, but do not forget to store apps data to intron(s) for intron in intronlist: succes = set_apps_intron_query(intron, pacbporfD, pacbporfA) for intron in intronlist: intron._distance = 0 # ?? # set GFF fsource attribute for recognition of intron sources intron._gff['fsource'] = 'ABGPbridgeing' # get unique list of donors & acceptors donor = olba(list(Set([intron.donor for intron in intronlist])), order_by='pos') accep = olba(list(Set([intron.acceptor for intron in intronlist])), order_by='pos') ############################################################################ if verbose: print "dQ1", [d.pos for d in donor], "aQ1", [a.pos for a in accep] ############################################################################ intronlist = _filter_introns_on_pssm_entropy_combination(intronlist) # get unique list of donors & acceptors donor = olba(list(Set([intron.donor for intron in intronlist])), order_by='pos') accep = olba(list(Set([intron.acceptor for intron in intronlist])), order_by='pos') ############################################################################ if verbose: print "dQ1", [d.pos for d in donor], "aQ1", [a.pos for a in accep] ############################################################################ filtered_intron_list = [] for intron in intronlist: intron.assign_bp_and_ppts() if intron.branchpoint and (intron.ppt5p or intron.ppt3p): filtered_intron_list.append(intron) else: pass # check if list is emptied due to branchpoint filtering # in that case, filter for either branchpoint OR polyppt if not filtered_intron_list and intronlist: for intron in intronlist: if intron.branchpoint or (intron.ppt5p or intron.ppt3p): filtered_intron_list.append(intron) # return list of filtered introns return filtered_intron_list
def merge_pacbporfs_with_query_intron_bridgeing(pacbporfD,pacbporfA,verbose=False,**kwargs): """ Merge query Orfs in PacbPORF by **best** intron @attention: see orfs.merge_orfs_with_intron for **kwargs @type pacbporfD: PacbPORF object @param pacbporfD: PacbPORF object that has to deliver PSSM donor objects @type pacbporfA: PacbPORF object @param pacbporfA: PacbPORF object that has to deliver PSSM acceptor objects @type verbose: Boolean @param verbose: print status/debugging messages to STDOUT @rtype: list @return: list with ( intron, intron ), in query and sbjct """ # input validation IsPacbPORF(pacbporfD) IsPacbPORF(pacbporfA) # edit **kwargs dictionary for some forced attributes _update_kwargs(kwargs,KWARGS_MAPPED_INTRON) if not kwargs.has_key('aligned_site_max_triplet_distance'): kwargs['aligned_site_max_triplet_distance'] = kwargs['max_aa_offset'] # calculate maximal/minimal donor/acceptor site position based on alignment ELEGIABLE_SPLICE_SITE_AA_RANGE = 75 qdr = pacbporfD.alignment_dna_range_query() qar = pacbporfA.alignment_dna_range_query() min_donor_query_pos = max([ min(qdr), max(qdr)-(ELEGIABLE_SPLICE_SITE_AA_RANGE*3) ]) max_accep_query_pos = min([ max(qar), min(qar)+(ELEGIABLE_SPLICE_SITE_AA_RANGE*3) ]) # get list of introns intronlist = merge_orfs_with_intron(pacbporfD.orfQ,pacbporfA.orfQ, min_donor_pos =min_donor_query_pos, max_acceptor_pos=max_accep_query_pos,**kwargs) # filter on entropy # settings for minimal alignment entropy score if min([pacbporfD.identityscore,pacbporfA.identityscore]) > 0.55: min_donor_site_entropy = 0.01 min_acceptor_site_entropy = 0.01 intronlist = _filter_introns_on_entropy(intronlist,pacbporfD,pacbporfA, min_donor_site_entropy=min_donor_site_entropy, min_acceptor_site_entropy=min_acceptor_site_entropy) else: # do not filter, but do not forget to store apps data to intron(s) for intron in intronlist: succes = set_apps_intron_query(intron,pacbporfD,pacbporfA) for intron in intronlist: intron._distance = 0 # ?? # set GFF fsource attribute for recognition of intron sources intron._gff['fsource'] = 'ABGPbridgeing' # get unique list of donors & acceptors donor = olba( list(Set([intron.donor for intron in intronlist ])), order_by='pos') accep = olba( list(Set([intron.acceptor for intron in intronlist ])), order_by='pos') ############################################################################ if verbose: print "dQ1",[d.pos for d in donor],"aQ1",[a.pos for a in accep] ############################################################################ intronlist = _filter_introns_on_pssm_entropy_combination(intronlist) # get unique list of donors & acceptors donor = olba( list(Set([intron.donor for intron in intronlist ])), order_by='pos') accep = olba( list(Set([intron.acceptor for intron in intronlist ])), order_by='pos') ############################################################################ if verbose: print "dQ1",[d.pos for d in donor],"aQ1",[a.pos for a in accep] ############################################################################ filtered_intron_list = [] for intron in intronlist: intron.assign_bp_and_ppts() if intron.branchpoint and (intron.ppt5p or intron.ppt3p): filtered_intron_list.append( intron ) else: pass # check if list is emptied due to branchpoint filtering # in that case, filter for either branchpoint OR polyppt if not filtered_intron_list and intronlist: for intron in intronlist: if intron.branchpoint or (intron.ppt5p or intron.ppt3p): filtered_intron_list.append( intron ) # return list of filtered introns return filtered_intron_list
def merge_pacbporfs_by_tinyexons(pacbporfD,pacbporfA, orfSetObjQ,orfSetObjS,verbose=False,**kwargs): """ """ # input validation IsPacbPORF(pacbporfD) IsPacbPORF(pacbporfA) # edit **kwargs dictionary for some forced attributes _update_kwargs(kwargs,KWARGS_MAPPED_INTRON) if not kwargs.has_key('aligned_site_max_triplet_distance'): kwargs['aligned_site_max_triplet_distance'] = kwargs['max_aa_offset'] # settings for minimal alignment entropy score min_donor_site_alignment_entropy = 0.0 min_acceptor_site_alignment_entropy = 0.0 resultlistQ = merge_orfs_with_tinyexon( pacbporfD.orfQ,pacbporfA.orfQ, preceding_donor_sites=pacbporfD.orfQ._donor_sites, subsequent_acceptor_sites=pacbporfA.orfQ._acceptor_sites, orflist=orfSetObjQ.orfs,**kwargs) resultlistS = merge_orfs_with_tinyexon( pacbporfD.orfS,pacbporfA.orfS, preceding_donor_sites=pacbporfD.orfS._donor_sites, subsequent_acceptor_sites=pacbporfA.orfS._acceptor_sites, orflist=orfSetObjS.orfs,**kwargs) # translate resultlists to dict: key == exon, value = [ {intronsD},{intronsS} ] resultdictQ,key2exonQ = _tinyexon_list_2_dict(resultlistQ) resultdictS,key2exonS = _tinyexon_list_2_dict(resultlistS) # get unique list of donors & acceptors donorQ = olba( list(Set([inD.donor for inD,te,inA in resultlistQ ])), order_by='pos') donorS = olba( list(Set([inD.donor for inD,te,inA in resultlistS ])), order_by='pos') accepQ = olba( list(Set([inA.acceptor for inD,te,inA in resultlistQ ])), order_by='pos') accepS = olba( list(Set([inA.acceptor for inD,te,inA in resultlistS ])), order_by='pos') ## filter for alignable donor & acceptor sites kwargs['allow_non_canonical'] = True # True kwargs['aligned_site_max_triplet_distance'] = 0 # 2 algdonors = _filter_for_alignable_splice_sites(donorQ,donorS,pacbporfD,**kwargs) algacceps = _filter_for_alignable_splice_sites(accepQ,accepS,pacbporfA,**kwargs) # settings for minimal alignment entropy score # TODO TODO -> THIS MUST BE FIXED TO A NICE THRESHOLD VALUE!!! min_donor_site_alignment_entropy = 0.1 min_acceptor_site_alignment_entropy = 0.1 # remove sites with to low alignment entropy algdonors = _filter_for_entropy(algdonors,pacbporfD,'donor', min_alignment_entropy=min_donor_site_alignment_entropy) algacceps = _filter_for_entropy(algacceps,pacbporfA,'acceptor', min_alignment_entropy=min_acceptor_site_alignment_entropy) # return list: intronQD,intronSD,tinyexon,intronAQ,intronAS return_list = [] ############################################################################ if verbose: print "bridges constructed: ORFS:", print (pacbporfD.orfQ.id,pacbporfA.orfQ.id), print (pacbporfD.orfS.id,pacbporfA.orfS.id), print len(resultdictQ), len(resultdictS), print ( len(resultlistQ), len(donorQ), len(accepQ) ), print ( len(resultlistS), len(donorS), len(accepS) ), print ( len(algdonors), len(algacceps) ) ############################################################################ for keyQ,tinyexonQ in key2exonQ.iteritems(): for keyS,tinyexonS in key2exonS.iteritems(): if tinyexonQ.donor.phase != tinyexonS.donor.phase: continue if tinyexonQ.acceptor.phase != tinyexonS.acceptor.phase: continue if tinyexonQ.length != tinyexonS.length: continue # if here, then tinyexons of identical structure #################################################################### if verbose: print tinyexonQ.length, tinyexonQ.donor.phase, print ( len(resultdictQ[keyQ][0]), len(resultdictQ[keyQ][1]) ), print ( len(resultdictS[keyS][0]), len(resultdictS[keyS][1]) ), print tinyexonQ, print tinyexonQ.proteinsequence(), tinyexonS.proteinsequence(), print tinyexonS.acceptor.pssm_score + tinyexonS.donor.pssm_score #################################################################### donor_introns = [] acceptor_introns = [] for intronDQkey, intronDQ in resultdictQ[keyQ][0].iteritems(): if intronDQ.donor.pos not in [ dQ.pos for dQ,dS in algdonors ]: continue for intronDSkey, intronDS in resultdictS[keyS][0].iteritems(): if intronDS.donor.pos not in [ dS.pos for dQ,dS in algdonors ]: continue # check if they exists as aligned sites alignedkey = ( intronDQ.donor.pos, intronDS.donor.pos ) if alignedkey not in [ (dQ.pos, dS.pos) for dQ,dS in algdonors ]: continue # if here, we have a set of introns 5' of the tinyexon # which are perfectly alignable! donor_introns.append((intronDQ,intronDS)) for intronAQkey, intronAQ in resultdictQ[keyQ][1].iteritems(): if intronAQ.acceptor.pos not in [ aQ.pos for aQ,aS in algacceps ]: continue for intronASkey, intronAS in resultdictS[keyS][1].iteritems(): if intronAS.acceptor.pos not in [ aS.pos for aQ,aS in algacceps ]: continue # check if they exists as aligned sites alignedkey = ( intronAQ.acceptor.pos, intronAS.acceptor.pos ) if alignedkey not in [ (aQ.pos, aS.pos) for aQ,aS in algacceps ]: continue # if here, we have a set of introns 3' of the tinyexon # which are perfectly alignable! acceptor_introns.append((intronAQ,intronAS)) if not len(donor_introns) or not len(acceptor_introns): # no aligned 5' && aligned 3' introns continue # initialize extended tinyexon PacbPORF from pacb import PacbP pacbp = PacbP(input=( tinyexonQ.proteinsequence(), tinyexonS.proteinsequence(), tinyexonQ.protein_start(), tinyexonS.protein_start(), ) ) pacbp.strip_unmatched_ends() # continue if no fraction could be aligned if len(pacbp) == 0: continue tinypacbporf = pacbp2pacbporf(pacbp,tinyexonQ.orf,tinyexonS.orf) tinypacbporf.extend_pacbporf_after_stops() #################################################################### if verbose: print tinypacbporf tinypacbporf.print_protein_and_dna() print len(donor_introns), len(acceptor_introns), print max([ dQ.donor.pssm_score+dS.donor.pssm_score for dQ,dS in donor_introns]), print max([ aQ.acceptor.pssm_score+aS.acceptor.pssm_score for aQ,aS in acceptor_introns]) #################################################################### # if here, we have accepted tinyexon bridges! # gather them and store to return_list for intronDQkey, intronDQ in resultdictQ[keyQ][0].iteritems(): if intronDQ.donor.pos not in [ dQ.pos for dQ,dS in algdonors ]: continue for intronDSkey, intronDS in resultdictS[keyS][0].iteritems(): if intronDS.donor.pos not in [ dS.pos for dQ,dS in algdonors ]: continue for intronAQkey, intronAQ in resultdictQ[keyQ][1].iteritems(): if intronAQ.acceptor.pos not in [ aQ.pos for aQ,aS in algacceps ]: continue for intronASkey, intronAS in resultdictS[keyS][1].iteritems(): if intronAS.acceptor.pos not in [ aS.pos for aQ,aS in algacceps ]: continue #################################################### # set some meta-data properties to the intron objects #################################################### _score_introns_obtained_by_mapping( intronDQ,intronDS,pacbporfD, tinypacbporf,source='ABGPmappingTE') _score_introns_obtained_by_mapping( intronAQ,intronAS,tinypacbporf, pacbporfA,source='ABGPmappingTE') # create _linked_to_xxx attributes intronDQ._linked_to_pacbporfs = [ tinypacbporf ] intronAQ._linked_to_pacbporfs = [ tinypacbporf ] intronDS._linked_to_pacbporfs = [ tinypacbporf ] intronAS._linked_to_pacbporfs = [ tinypacbporf ] intronDQ._linked_to_introns = [ intronAQ ] intronAQ._linked_to_introns = [ intronDQ ] intronDS._linked_to_introns = [ intronAS ] intronAS._linked_to_introns = [ intronDS ] # append to tmp result list return_list.append( (intronDQ,intronDS,tinypacbporf,intronAQ,intronAS) ) # check if there are >1 candidate tiny exons # currently, we choose only to return the **best** mapped tinyexon if len(return_list) == 0: pass elif len(return_list) == 1: pass else: # only take the highest scoring candidate here min_distance = min([ (a._distance+d._distance) for a,b,c,d,e in return_list ]) pos2score = [] for (intronDQ,intronDS,tinypacbporf,intronAQ,intronAS) in return_list: if (intronDQ._distance + intronAQ._distance) > min_distance: pos2score.append( 0.0 ) else: # calculate overall pssm score total_pssm = 0.0 total_pssm += intronDQ.donor.pssm_score total_pssm += intronDQ.acceptor.pssm_score total_pssm += intronDS.donor.pssm_score total_pssm += intronDS.acceptor.pssm_score total_pssm += intronAQ.donor.pssm_score total_pssm += intronAQ.acceptor.pssm_score total_pssm += intronAS.donor.pssm_score total_pssm += intronAS.acceptor.pssm_score pos2score.append( total_pssm ) # get highest score and linked tinyexon max_score = max(pos2score) return_list = [ return_list[pos2score.index(max_score)] ] ############################################################################ # some printing in verbose mode if verbose and return_list: (intronDQ,intronDS,tinypacbporf,intronAQ,intronAS) = return_list[0] print "BEST MAPPED TINYEXON:" print tinypacbporf print tinypacbporf.query, intronDQ._distance, intronAQ._distance, print ( intronDQ.donor.pos, intronDQ.acceptor.pos ), print ( intronDS.donor.pos, intronDS.acceptor.pos ), print ( intronAQ.donor.pos, intronAQ.acceptor.pos ), print ( intronAS.donor.pos, intronAS.acceptor.pos ) ############################################################################ # return the result list return return_list
def merge_pacbporfs_with_introns(pacbporfD,pacbporfA,verbose=False,**kwargs): """ Merge 2 PacbPORF objects by introns @attention: see orfs.merge_orfs_with_intron for **kwargs @attention: see functions._filter_for_alignable_splice_sites for **kwargs @attention: see functions._filter_for_entropy for **kwargs @type pacbporfD: PacbPORF object @param pacbporfD: PacbPORF object that has to deliver PSSM donor objects @type pacbporfA: PacbPORF object @param pacbporfA: PacbPORF object that has to deliver PSSM acceptor objects @type verbose: Boolean @param verbose: print status/debugging messages to STDOUT @rtype: list @return: list with ( intron, intron ), in query and sbjct """ # input validation IsPacbPORF(pacbporfD) IsPacbPORF(pacbporfA) # edit **kwargs dictionary for some forced attributes _update_kwargs(kwargs,KWARGS_MAPPED_INTRON) if not kwargs.has_key('aligned_site_max_triplet_distance'): kwargs['aligned_site_max_triplet_distance'] = kwargs['max_aa_offset'] # settings for minimal alignment entropy score min_donor_site_alignment_entropy = 0.0 min_acceptor_site_alignment_entropy = 0.0 # calculate maximal/minimal donor/acceptor site position based on alignment ELEGIABLE_SPLICE_SITE_AA_RANGE = 75 qdr = pacbporfD.alignment_dna_range_query() qar = pacbporfA.alignment_dna_range_query() min_donor_query_pos = max([ min(qdr), max(qdr)-(ELEGIABLE_SPLICE_SITE_AA_RANGE*3) ]) max_accep_query_pos = min([ max(qar), min(qar)+(ELEGIABLE_SPLICE_SITE_AA_RANGE*3) ]) sdr = pacbporfD.alignment_dna_range_sbjct() sar = pacbporfA.alignment_dna_range_sbjct() min_donor_sbjct_pos = max([ min(sdr), max(sdr)-(ELEGIABLE_SPLICE_SITE_AA_RANGE*3) ]) max_accep_sbjct_pos = min([ max(sar), min(sar)+(ELEGIABLE_SPLICE_SITE_AA_RANGE*3) ]) # get list of introns #intronsQ = merge_orfs_with_intron(pacbporfD.orfQ,pacbporfA.orfQ, # min_donor_pos =min_donor_query_pos, # max_acceptor_pos=max_accep_query_pos,**kwargs) #intronsS = merge_orfs_with_intron(pacbporfD.orfS,pacbporfA.orfS, # min_donor_pos =min_donor_sbjct_pos, # max_acceptor_pos=max_accep_sbjct_pos,**kwargs) # get list of introns intronsQ = merge_orfs_with_intron(pacbporfD.orfQ,pacbporfA.orfQ,**kwargs) intronsS = merge_orfs_with_intron(pacbporfD.orfS,pacbporfA.orfS,**kwargs) # get unique list of donors & acceptors donorQ = olba( list(Set([inQ.donor for inQ in intronsQ ])), order_by='pos') donorS = olba( list(Set([inS.donor for inS in intronsS ])), order_by='pos') accepQ = olba( list(Set([inQ.acceptor for inQ in intronsQ ])), order_by='pos') accepS = olba( list(Set([inS.acceptor for inS in intronsS ])), order_by='pos') ############################################################################ if verbose: print "dQ1", [ d.pos for d in donorQ ], "aQ1", [ a.pos for a in accepQ ] print "dS1", [ d.pos for d in donorS ], "aS1", [ a.pos for a in accepS ] ############################################################################ # filter for alignable donor & acceptor sites kwargs['allow_non_canonical'] = kwargs['allow_non_canonical_donor'] algdonors = _filter_for_alignable_splice_sites(donorQ,donorS,pacbporfD,**kwargs) kwargs['allow_non_canonical'] = kwargs['allow_non_canonical_acceptor'] algacceps = _filter_for_alignable_splice_sites(accepQ,accepS,pacbporfA,**kwargs) ############################################################################ if verbose: print "dQ2", [ _dq.pos for (_dq,_ds) in algdonors ], print "aQ2", [ _aq.pos for (_aq,_as) in algacceps ] print "dS2", [ _ds.pos for (_dq,_ds) in algdonors ], print "aS2", [ _as.pos for (_aq,_as) in algacceps ] ############################################################################ # remove sites with to low alignment entropy algdonors = _filter_for_entropy(algdonors,pacbporfD,'donor', min_alignment_entropy=min_donor_site_alignment_entropy) algacceps = _filter_for_entropy(algacceps,pacbporfA,'acceptor', min_alignment_entropy=min_acceptor_site_alignment_entropy) ############################################################################ if verbose: print "dQ3", [ _dq.pos for (_dq,_ds) in algdonors ], print "aQ3", [ _aq.pos for (_aq,_as) in algacceps ] print "dS3", [ _ds.pos for (_dq,_ds) in algdonors ], print "aS3", [ _as.pos for (_aq,_as) in algacceps ] ############################################################################ # make unique position lists for quick lookup in intron lists dQpl = Set([ dQ.pos for dQ,dS in algdonors ]) dSpl = Set([ dS.pos for dQ,dS in algdonors ]) aQpl = Set([ aQ.pos for aQ,aS in algacceps ]) aSpl = Set([ aS.pos for aQ,aS in algacceps ]) # check exterior boundaries of PacbPORFs sposD = pacbporfD._get_original_alignment_pos_start() eposD = pacbporfD._get_original_alignment_pos_end() sposA = pacbporfA._get_original_alignment_pos_start() eposA = pacbporfA._get_original_alignment_pos_end() # now make list of aligable introns algintrons = [] for intQ in intronsQ: # check if intron falls within the PacbPORF aligned area if intQ.donor.pos <= sposD.query_dna_start: continue if intQ.acceptor.pos >= eposA.query_dna_end: continue if intQ.donor.pos in dQpl and intQ.acceptor.pos in aQpl: # Query intron occurs in list of alignable splice sites! for intS in intronsS: # check if intron falls within the PacbPORF aligned area if intS.donor.pos <= sposD.sbjct_dna_start: continue if intS.acceptor.pos >= eposA.sbjct_dna_end: continue if intS.donor.pos in dSpl and intS.acceptor.pos in aSpl: # Sbjct intron occurs as well in alignable splice sites! if (intQ.donor,intS.donor) in algdonors and\ (intQ.acceptor,intS.acceptor) in algacceps: # Sbjct & Query Donor & Acceptor are alignable! algintrons.append( ( intQ, intS ) ) ############################################################################ # set some meta-data properties to the intron objects ############################################################################ for intQ,intS in algintrons: distDnt = pacbporfD.get_distance_aligned_nucleotide_positions( query = intQ.donor.pos, sbjct = intS.donor.pos ) distAnt = pacbporfA.get_distance_aligned_nucleotide_positions( query = intQ.acceptor.pos, sbjct = intS.acceptor.pos ) # final distance check. kwargs['aligned_site_max_triplet_distance'] # is applied on donor and acceptor site. This distance measured on the # protein sequence can be DOUBLED in case distDnt / distAnt are # opposite (+ and -). Check here if the protein sequence gap is # as well <= kwargs['aligned_site_max_triplet_distance']. if abs(distAnt - distDnt) > kwargs['aligned_site_max_triplet_distance']*3: continue # add distance score to introns intQ._distance = abs(distDnt) + abs(distAnt) intS._distance = abs(distDnt) + abs(distAnt) # add Alignment Positional Periphery Score into objects succes = set_apps_intron_query(intQ,pacbporfD,pacbporfA) succes = set_apps_intron_sbjct(intS,pacbporfD,pacbporfA) # set GFF fsource attribute for recognition of intron sources intQ._gff['fsource'] = "ABGPmapping" intS._gff['fsource'] = "ABGPmapping" ######################################################################## if verbose: # some printing.... print "Aligned introns:", ( intQ.donor.pos, intQ.acceptor.pos ) , print ( intS.donor.pos, intS.acceptor.pos ), print "DIST:", distDnt, distAnt, print "[%s]" % kwargs['aligned_site_max_triplet_distance'], print "ENTROPY: %1.2f %1.2f" % (intQ._apps_donor, intQ._apps_accep), print "PSSM: (%1.2f %1.2f) (%1.2f %1.2f)" % ( intQ.donor.pssm_score, intS.donor.pssm_score, intQ.acceptor.pssm_score, intS.acceptor.pssm_score, ) ######################################################################## # return lists of aligned introns return algintrons
def merge_pacbporfs_by_tinyexons(pacbporfD, pacbporfA, orfSetObjQ, orfSetObjS, verbose=False, **kwargs): """ """ # input validation IsPacbPORF(pacbporfD) IsPacbPORF(pacbporfA) # edit **kwargs dictionary for some forced attributes _update_kwargs(kwargs, KWARGS_MAPPED_INTRON) if not kwargs.has_key('aligned_site_max_triplet_distance'): kwargs['aligned_site_max_triplet_distance'] = kwargs['max_aa_offset'] # settings for minimal alignment entropy score min_donor_site_alignment_entropy = 0.0 min_acceptor_site_alignment_entropy = 0.0 resultlistQ = merge_orfs_with_tinyexon( pacbporfD.orfQ, pacbporfA.orfQ, preceding_donor_sites=pacbporfD.orfQ._donor_sites, subsequent_acceptor_sites=pacbporfA.orfQ._acceptor_sites, orflist=orfSetObjQ.orfs, **kwargs) resultlistS = merge_orfs_with_tinyexon( pacbporfD.orfS, pacbporfA.orfS, preceding_donor_sites=pacbporfD.orfS._donor_sites, subsequent_acceptor_sites=pacbporfA.orfS._acceptor_sites, orflist=orfSetObjS.orfs, **kwargs) # translate resultlists to dict: key == exon, value = [ {intronsD},{intronsS} ] resultdictQ, key2exonQ = _tinyexon_list_2_dict(resultlistQ) resultdictS, key2exonS = _tinyexon_list_2_dict(resultlistS) # get unique list of donors & acceptors donorQ = olba(list(Set([inD.donor for inD, te, inA in resultlistQ])), order_by='pos') donorS = olba(list(Set([inD.donor for inD, te, inA in resultlistS])), order_by='pos') accepQ = olba(list(Set([inA.acceptor for inD, te, inA in resultlistQ])), order_by='pos') accepS = olba(list(Set([inA.acceptor for inD, te, inA in resultlistS])), order_by='pos') ## filter for alignable donor & acceptor sites kwargs['allow_non_canonical'] = True # True kwargs['aligned_site_max_triplet_distance'] = 0 # 2 algdonors = _filter_for_alignable_splice_sites(donorQ, donorS, pacbporfD, **kwargs) algacceps = _filter_for_alignable_splice_sites(accepQ, accepS, pacbporfA, **kwargs) # settings for minimal alignment entropy score # TODO TODO -> THIS MUST BE FIXED TO A NICE THRESHOLD VALUE!!! min_donor_site_alignment_entropy = 0.1 min_acceptor_site_alignment_entropy = 0.1 # remove sites with to low alignment entropy algdonors = _filter_for_entropy( algdonors, pacbporfD, 'donor', min_alignment_entropy=min_donor_site_alignment_entropy) algacceps = _filter_for_entropy( algacceps, pacbporfA, 'acceptor', min_alignment_entropy=min_acceptor_site_alignment_entropy) # return list: intronQD,intronSD,tinyexon,intronAQ,intronAS return_list = [] ############################################################################ if verbose: print "bridges constructed: ORFS:", print(pacbporfD.orfQ.id, pacbporfA.orfQ.id), print(pacbporfD.orfS.id, pacbporfA.orfS.id), print len(resultdictQ), len(resultdictS), print(len(resultlistQ), len(donorQ), len(accepQ)), print(len(resultlistS), len(donorS), len(accepS)), print(len(algdonors), len(algacceps)) ############################################################################ for keyQ, tinyexonQ in key2exonQ.iteritems(): for keyS, tinyexonS in key2exonS.iteritems(): if tinyexonQ.donor.phase != tinyexonS.donor.phase: continue if tinyexonQ.acceptor.phase != tinyexonS.acceptor.phase: continue if tinyexonQ.length != tinyexonS.length: continue # if here, then tinyexons of identical structure #################################################################### if verbose: print tinyexonQ.length, tinyexonQ.donor.phase, print(len(resultdictQ[keyQ][0]), len(resultdictQ[keyQ][1])), print(len(resultdictS[keyS][0]), len(resultdictS[keyS][1])), print tinyexonQ, print tinyexonQ.proteinsequence(), tinyexonS.proteinsequence(), print tinyexonS.acceptor.pssm_score + tinyexonS.donor.pssm_score #################################################################### donor_introns = [] acceptor_introns = [] for intronDQkey, intronDQ in resultdictQ[keyQ][0].iteritems(): if intronDQ.donor.pos not in [dQ.pos for dQ, dS in algdonors]: continue for intronDSkey, intronDS in resultdictS[keyS][0].iteritems(): if intronDS.donor.pos not in [ dS.pos for dQ, dS in algdonors ]: continue # check if they exists as aligned sites alignedkey = (intronDQ.donor.pos, intronDS.donor.pos) if alignedkey not in [(dQ.pos, dS.pos) for dQ, dS in algdonors]: continue # if here, we have a set of introns 5' of the tinyexon # which are perfectly alignable! donor_introns.append((intronDQ, intronDS)) for intronAQkey, intronAQ in resultdictQ[keyQ][1].iteritems(): if intronAQ.acceptor.pos not in [ aQ.pos for aQ, aS in algacceps ]: continue for intronASkey, intronAS in resultdictS[keyS][1].iteritems(): if intronAS.acceptor.pos not in [ aS.pos for aQ, aS in algacceps ]: continue # check if they exists as aligned sites alignedkey = (intronAQ.acceptor.pos, intronAS.acceptor.pos) if alignedkey not in [(aQ.pos, aS.pos) for aQ, aS in algacceps]: continue # if here, we have a set of introns 3' of the tinyexon # which are perfectly alignable! acceptor_introns.append((intronAQ, intronAS)) if not len(donor_introns) or not len(acceptor_introns): # no aligned 5' && aligned 3' introns continue # initialize extended tinyexon PacbPORF from pacb import PacbP pacbp = PacbP(input=( tinyexonQ.proteinsequence(), tinyexonS.proteinsequence(), tinyexonQ.protein_start(), tinyexonS.protein_start(), )) pacbp.strip_unmatched_ends() # continue if no fraction could be aligned if len(pacbp) == 0: continue tinypacbporf = pacbp2pacbporf(pacbp, tinyexonQ.orf, tinyexonS.orf) tinypacbporf.extend_pacbporf_after_stops() #################################################################### if verbose: print tinypacbporf tinypacbporf.print_protein_and_dna() print len(donor_introns), len(acceptor_introns), print max([ dQ.donor.pssm_score + dS.donor.pssm_score for dQ, dS in donor_introns ]), print max([ aQ.acceptor.pssm_score + aS.acceptor.pssm_score for aQ, aS in acceptor_introns ]) #################################################################### # if here, we have accepted tinyexon bridges! # gather them and store to return_list for intronDQkey, intronDQ in resultdictQ[keyQ][0].iteritems(): if intronDQ.donor.pos not in [dQ.pos for dQ, dS in algdonors]: continue for intronDSkey, intronDS in resultdictS[keyS][0].iteritems(): if intronDS.donor.pos not in [ dS.pos for dQ, dS in algdonors ]: continue for intronAQkey, intronAQ in resultdictQ[keyQ][ 1].iteritems(): if intronAQ.acceptor.pos not in [ aQ.pos for aQ, aS in algacceps ]: continue for intronASkey, intronAS in resultdictS[keyS][ 1].iteritems(): if intronAS.acceptor.pos not in [ aS.pos for aQ, aS in algacceps ]: continue #################################################### # set some meta-data properties to the intron objects #################################################### _score_introns_obtained_by_mapping( intronDQ, intronDS, pacbporfD, tinypacbporf, source='ABGPmappingTE') _score_introns_obtained_by_mapping( intronAQ, intronAS, tinypacbporf, pacbporfA, source='ABGPmappingTE') # create _linked_to_xxx attributes intronDQ._linked_to_pacbporfs = [tinypacbporf] intronAQ._linked_to_pacbporfs = [tinypacbporf] intronDS._linked_to_pacbporfs = [tinypacbporf] intronAS._linked_to_pacbporfs = [tinypacbporf] intronDQ._linked_to_introns = [intronAQ] intronAQ._linked_to_introns = [intronDQ] intronDS._linked_to_introns = [intronAS] intronAS._linked_to_introns = [intronDS] # append to tmp result list return_list.append( (intronDQ, intronDS, tinypacbporf, intronAQ, intronAS)) # check if there are >1 candidate tiny exons # currently, we choose only to return the **best** mapped tinyexon if len(return_list) == 0: pass elif len(return_list) == 1: pass else: # only take the highest scoring candidate here min_distance = min([(a._distance + d._distance) for a, b, c, d, e in return_list]) pos2score = [] for (intronDQ, intronDS, tinypacbporf, intronAQ, intronAS) in return_list: if (intronDQ._distance + intronAQ._distance) > min_distance: pos2score.append(0.0) else: # calculate overall pssm score total_pssm = 0.0 total_pssm += intronDQ.donor.pssm_score total_pssm += intronDQ.acceptor.pssm_score total_pssm += intronDS.donor.pssm_score total_pssm += intronDS.acceptor.pssm_score total_pssm += intronAQ.donor.pssm_score total_pssm += intronAQ.acceptor.pssm_score total_pssm += intronAS.donor.pssm_score total_pssm += intronAS.acceptor.pssm_score pos2score.append(total_pssm) # get highest score and linked tinyexon max_score = max(pos2score) return_list = [return_list[pos2score.index(max_score)]] ############################################################################ # some printing in verbose mode if verbose and return_list: (intronDQ, intronDS, tinypacbporf, intronAQ, intronAS) = return_list[0] print "BEST MAPPED TINYEXON:" print tinypacbporf print tinypacbporf.query, intronDQ._distance, intronAQ._distance, print(intronDQ.donor.pos, intronDQ.acceptor.pos), print(intronDS.donor.pos, intronDS.acceptor.pos), print(intronAQ.donor.pos, intronAQ.acceptor.pos), print(intronAS.donor.pos, intronAS.acceptor.pos) ############################################################################ # return the result list return return_list
def merge_pacbporfs_with_introns(pacbporfD, pacbporfA, verbose=False, **kwargs): """ Merge 2 PacbPORF objects by introns @attention: see orfs.merge_orfs_with_intron for **kwargs @attention: see functions._filter_for_alignable_splice_sites for **kwargs @attention: see functions._filter_for_entropy for **kwargs @type pacbporfD: PacbPORF object @param pacbporfD: PacbPORF object that has to deliver PSSM donor objects @type pacbporfA: PacbPORF object @param pacbporfA: PacbPORF object that has to deliver PSSM acceptor objects @type verbose: Boolean @param verbose: print status/debugging messages to STDOUT @rtype: list @return: list with ( intron, intron ), in query and sbjct """ # input validation IsPacbPORF(pacbporfD) IsPacbPORF(pacbporfA) # edit **kwargs dictionary for some forced attributes _update_kwargs(kwargs, KWARGS_MAPPED_INTRON) if not kwargs.has_key('aligned_site_max_triplet_distance'): kwargs['aligned_site_max_triplet_distance'] = kwargs['max_aa_offset'] # settings for minimal alignment entropy score min_donor_site_alignment_entropy = 0.0 min_acceptor_site_alignment_entropy = 0.0 # calculate maximal/minimal donor/acceptor site position based on alignment ELEGIABLE_SPLICE_SITE_AA_RANGE = 75 qdr = pacbporfD.alignment_dna_range_query() qar = pacbporfA.alignment_dna_range_query() min_donor_query_pos = max( [min(qdr), max(qdr) - (ELEGIABLE_SPLICE_SITE_AA_RANGE * 3)]) max_accep_query_pos = min( [max(qar), min(qar) + (ELEGIABLE_SPLICE_SITE_AA_RANGE * 3)]) sdr = pacbporfD.alignment_dna_range_sbjct() sar = pacbporfA.alignment_dna_range_sbjct() min_donor_sbjct_pos = max( [min(sdr), max(sdr) - (ELEGIABLE_SPLICE_SITE_AA_RANGE * 3)]) max_accep_sbjct_pos = min( [max(sar), min(sar) + (ELEGIABLE_SPLICE_SITE_AA_RANGE * 3)]) # get list of introns #intronsQ = merge_orfs_with_intron(pacbporfD.orfQ,pacbporfA.orfQ, # min_donor_pos =min_donor_query_pos, # max_acceptor_pos=max_accep_query_pos,**kwargs) #intronsS = merge_orfs_with_intron(pacbporfD.orfS,pacbporfA.orfS, # min_donor_pos =min_donor_sbjct_pos, # max_acceptor_pos=max_accep_sbjct_pos,**kwargs) # get list of introns intronsQ = merge_orfs_with_intron(pacbporfD.orfQ, pacbporfA.orfQ, **kwargs) intronsS = merge_orfs_with_intron(pacbporfD.orfS, pacbporfA.orfS, **kwargs) # get unique list of donors & acceptors donorQ = olba(list(Set([inQ.donor for inQ in intronsQ])), order_by='pos') donorS = olba(list(Set([inS.donor for inS in intronsS])), order_by='pos') accepQ = olba(list(Set([inQ.acceptor for inQ in intronsQ])), order_by='pos') accepS = olba(list(Set([inS.acceptor for inS in intronsS])), order_by='pos') ############################################################################ if verbose: print "dQ1", [d.pos for d in donorQ], "aQ1", [a.pos for a in accepQ] print "dS1", [d.pos for d in donorS], "aS1", [a.pos for a in accepS] ############################################################################ # filter for alignable donor & acceptor sites kwargs['allow_non_canonical'] = kwargs['allow_non_canonical_donor'] algdonors = _filter_for_alignable_splice_sites(donorQ, donorS, pacbporfD, **kwargs) kwargs['allow_non_canonical'] = kwargs['allow_non_canonical_acceptor'] algacceps = _filter_for_alignable_splice_sites(accepQ, accepS, pacbporfA, **kwargs) ############################################################################ if verbose: print "dQ2", [_dq.pos for (_dq, _ds) in algdonors], print "aQ2", [_aq.pos for (_aq, _as) in algacceps] print "dS2", [_ds.pos for (_dq, _ds) in algdonors], print "aS2", [_as.pos for (_aq, _as) in algacceps] ############################################################################ # remove sites with to low alignment entropy algdonors = _filter_for_entropy( algdonors, pacbporfD, 'donor', min_alignment_entropy=min_donor_site_alignment_entropy) algacceps = _filter_for_entropy( algacceps, pacbporfA, 'acceptor', min_alignment_entropy=min_acceptor_site_alignment_entropy) ############################################################################ if verbose: print "dQ3", [_dq.pos for (_dq, _ds) in algdonors], print "aQ3", [_aq.pos for (_aq, _as) in algacceps] print "dS3", [_ds.pos for (_dq, _ds) in algdonors], print "aS3", [_as.pos for (_aq, _as) in algacceps] ############################################################################ # make unique position lists for quick lookup in intron lists dQpl = Set([dQ.pos for dQ, dS in algdonors]) dSpl = Set([dS.pos for dQ, dS in algdonors]) aQpl = Set([aQ.pos for aQ, aS in algacceps]) aSpl = Set([aS.pos for aQ, aS in algacceps]) # check exterior boundaries of PacbPORFs sposD = pacbporfD._get_original_alignment_pos_start() eposD = pacbporfD._get_original_alignment_pos_end() sposA = pacbporfA._get_original_alignment_pos_start() eposA = pacbporfA._get_original_alignment_pos_end() # now make list of aligable introns algintrons = [] for intQ in intronsQ: # check if intron falls within the PacbPORF aligned area if intQ.donor.pos <= sposD.query_dna_start: continue if intQ.acceptor.pos >= eposA.query_dna_end: continue if intQ.donor.pos in dQpl and intQ.acceptor.pos in aQpl: # Query intron occurs in list of alignable splice sites! for intS in intronsS: # check if intron falls within the PacbPORF aligned area if intS.donor.pos <= sposD.sbjct_dna_start: continue if intS.acceptor.pos >= eposA.sbjct_dna_end: continue if intS.donor.pos in dSpl and intS.acceptor.pos in aSpl: # Sbjct intron occurs as well in alignable splice sites! if (intQ.donor,intS.donor) in algdonors and\ (intQ.acceptor,intS.acceptor) in algacceps: # Sbjct & Query Donor & Acceptor are alignable! algintrons.append((intQ, intS)) ############################################################################ # set some meta-data properties to the intron objects ############################################################################ for intQ, intS in algintrons: distDnt = pacbporfD.get_distance_aligned_nucleotide_positions( query=intQ.donor.pos, sbjct=intS.donor.pos) distAnt = pacbporfA.get_distance_aligned_nucleotide_positions( query=intQ.acceptor.pos, sbjct=intS.acceptor.pos) # final distance check. kwargs['aligned_site_max_triplet_distance'] # is applied on donor and acceptor site. This distance measured on the # protein sequence can be DOUBLED in case distDnt / distAnt are # opposite (+ and -). Check here if the protein sequence gap is # as well <= kwargs['aligned_site_max_triplet_distance']. if abs(distAnt - distDnt) > kwargs['aligned_site_max_triplet_distance'] * 3: continue # add distance score to introns intQ._distance = abs(distDnt) + abs(distAnt) intS._distance = abs(distDnt) + abs(distAnt) # add Alignment Positional Periphery Score into objects succes = set_apps_intron_query(intQ, pacbporfD, pacbporfA) succes = set_apps_intron_sbjct(intS, pacbporfD, pacbporfA) # set GFF fsource attribute for recognition of intron sources intQ._gff['fsource'] = "ABGPmapping" intS._gff['fsource'] = "ABGPmapping" ######################################################################## if verbose: # some printing.... print "Aligned introns:", (intQ.donor.pos, intQ.acceptor.pos), print(intS.donor.pos, intS.acceptor.pos), print "DIST:", distDnt, distAnt, print "[%s]" % kwargs['aligned_site_max_triplet_distance'], print "ENTROPY: %1.2f %1.2f" % (intQ._apps_donor, intQ._apps_accep), print "PSSM: (%1.2f %1.2f) (%1.2f %1.2f)" % ( intQ.donor.pssm_score, intS.donor.pssm_score, intQ.acceptor.pssm_score, intS.acceptor.pssm_score, ) ######################################################################## # return lists of aligned introns return algintrons