Beispiel #1
0
def merge_pacbporfs_with_query_intron_bridgeing(pacbporfD,
                                                pacbporfA,
                                                verbose=False,
                                                **kwargs):
    """
    Merge query Orfs in PacbPORF by **best** intron

    @attention: see orfs.merge_orfs_with_intron for **kwargs

    @type  pacbporfD: PacbPORF object
    @param pacbporfD: PacbPORF object that has to deliver PSSM donor objects

    @type  pacbporfA: PacbPORF object
    @param pacbporfA: PacbPORF object that has to deliver PSSM acceptor objects

    @type  verbose: Boolean
    @param verbose: print status/debugging messages to STDOUT

    @rtype:  list
    @return: list with ( intron, intron ), in query and sbjct
    """
    # input validation
    IsPacbPORF(pacbporfD)
    IsPacbPORF(pacbporfA)

    # edit **kwargs dictionary for some forced attributes
    _update_kwargs(kwargs, KWARGS_MAPPED_INTRON)
    if not kwargs.has_key('aligned_site_max_triplet_distance'):
        kwargs['aligned_site_max_triplet_distance'] = kwargs['max_aa_offset']

    # calculate maximal/minimal donor/acceptor site position based on alignment
    ELEGIABLE_SPLICE_SITE_AA_RANGE = 75

    qdr = pacbporfD.alignment_dna_range_query()
    qar = pacbporfA.alignment_dna_range_query()
    min_donor_query_pos = max(
        [min(qdr), max(qdr) - (ELEGIABLE_SPLICE_SITE_AA_RANGE * 3)])
    max_accep_query_pos = min(
        [max(qar), min(qar) + (ELEGIABLE_SPLICE_SITE_AA_RANGE * 3)])

    # get list of introns
    intronlist = merge_orfs_with_intron(pacbporfD.orfQ,
                                        pacbporfA.orfQ,
                                        min_donor_pos=min_donor_query_pos,
                                        max_acceptor_pos=max_accep_query_pos,
                                        **kwargs)

    # filter on entropy
    # settings for minimal alignment entropy score
    if min([pacbporfD.identityscore, pacbporfA.identityscore]) > 0.55:
        min_donor_site_entropy = 0.01
        min_acceptor_site_entropy = 0.01
        intronlist = _filter_introns_on_entropy(
            intronlist,
            pacbporfD,
            pacbporfA,
            min_donor_site_entropy=min_donor_site_entropy,
            min_acceptor_site_entropy=min_acceptor_site_entropy)
    else:
        # do not filter, but do not forget to store apps data to intron(s)
        for intron in intronlist:
            succes = set_apps_intron_query(intron, pacbporfD, pacbporfA)

    for intron in intronlist:
        intron._distance = 0  # ??
        # set GFF fsource attribute for recognition of intron sources
        intron._gff['fsource'] = 'ABGPbridgeing'

    # get unique list of donors & acceptors
    donor = olba(list(Set([intron.donor for intron in intronlist])),
                 order_by='pos')
    accep = olba(list(Set([intron.acceptor for intron in intronlist])),
                 order_by='pos')

    ############################################################################
    if verbose:
        print "dQ1", [d.pos for d in donor], "aQ1", [a.pos for a in accep]
    ############################################################################

    intronlist = _filter_introns_on_pssm_entropy_combination(intronlist)

    # get unique list of donors & acceptors
    donor = olba(list(Set([intron.donor for intron in intronlist])),
                 order_by='pos')
    accep = olba(list(Set([intron.acceptor for intron in intronlist])),
                 order_by='pos')

    ############################################################################
    if verbose:
        print "dQ1", [d.pos for d in donor], "aQ1", [a.pos for a in accep]
    ############################################################################

    filtered_intron_list = []
    for intron in intronlist:
        intron.assign_bp_and_ppts()
        if intron.branchpoint and (intron.ppt5p or intron.ppt3p):
            filtered_intron_list.append(intron)
        else:
            pass

    # check if list is emptied due to branchpoint filtering
    # in that case, filter for either branchpoint OR polyppt
    if not filtered_intron_list and intronlist:
        for intron in intronlist:
            if intron.branchpoint or (intron.ppt5p or intron.ppt3p):
                filtered_intron_list.append(intron)

    # return list of filtered introns
    return filtered_intron_list
Beispiel #2
0
def merge_pacbporfs_with_query_intron_bridgeing(pacbporfD,pacbporfA,verbose=False,**kwargs):
    """
    Merge query Orfs in PacbPORF by **best** intron

    @attention: see orfs.merge_orfs_with_intron for **kwargs

    @type  pacbporfD: PacbPORF object
    @param pacbporfD: PacbPORF object that has to deliver PSSM donor objects

    @type  pacbporfA: PacbPORF object
    @param pacbporfA: PacbPORF object that has to deliver PSSM acceptor objects

    @type  verbose: Boolean
    @param verbose: print status/debugging messages to STDOUT

    @rtype:  list
    @return: list with ( intron, intron ), in query and sbjct
    """
    # input validation
    IsPacbPORF(pacbporfD)
    IsPacbPORF(pacbporfA)

    # edit **kwargs dictionary for some forced attributes
    _update_kwargs(kwargs,KWARGS_MAPPED_INTRON)
    if not kwargs.has_key('aligned_site_max_triplet_distance'):
        kwargs['aligned_site_max_triplet_distance'] = kwargs['max_aa_offset']

    # calculate maximal/minimal donor/acceptor site position based on alignment
    ELEGIABLE_SPLICE_SITE_AA_RANGE = 75

    qdr = pacbporfD.alignment_dna_range_query()
    qar = pacbporfA.alignment_dna_range_query()
    min_donor_query_pos = max([ min(qdr), max(qdr)-(ELEGIABLE_SPLICE_SITE_AA_RANGE*3) ])
    max_accep_query_pos = min([ max(qar), min(qar)+(ELEGIABLE_SPLICE_SITE_AA_RANGE*3) ])

    # get list of introns
    intronlist = merge_orfs_with_intron(pacbporfD.orfQ,pacbporfA.orfQ,
            min_donor_pos   =min_donor_query_pos,
            max_acceptor_pos=max_accep_query_pos,**kwargs)


    # filter on entropy
    # settings for minimal alignment entropy score
    if min([pacbporfD.identityscore,pacbporfA.identityscore]) > 0.55:
        min_donor_site_entropy = 0.01
        min_acceptor_site_entropy = 0.01
        intronlist = _filter_introns_on_entropy(intronlist,pacbporfD,pacbporfA,
                min_donor_site_entropy=min_donor_site_entropy,
                min_acceptor_site_entropy=min_acceptor_site_entropy)
    else:
        # do not filter, but do not forget to store apps data to intron(s)
        for intron in intronlist:
            succes = set_apps_intron_query(intron,pacbporfD,pacbporfA)


    for intron in intronlist:
        intron._distance = 0 # ??
        # set GFF fsource attribute for recognition of intron sources
        intron._gff['fsource'] = 'ABGPbridgeing'

    # get unique list of donors & acceptors
    donor = olba( list(Set([intron.donor for intron in intronlist ])), order_by='pos')
    accep = olba( list(Set([intron.acceptor for intron in intronlist ])), order_by='pos')

    ############################################################################
    if verbose: print "dQ1",[d.pos for d in donor],"aQ1",[a.pos for a in accep]
    ############################################################################

    intronlist = _filter_introns_on_pssm_entropy_combination(intronlist)

    # get unique list of donors & acceptors
    donor = olba( list(Set([intron.donor for intron in intronlist ])), order_by='pos')
    accep = olba( list(Set([intron.acceptor for intron in intronlist ])), order_by='pos')

    ############################################################################
    if verbose: print "dQ1",[d.pos for d in donor],"aQ1",[a.pos for a in accep]
    ############################################################################

    filtered_intron_list = []
    for intron in intronlist:
        intron.assign_bp_and_ppts()
        if intron.branchpoint and (intron.ppt5p or intron.ppt3p):
            filtered_intron_list.append( intron )
        else:
            pass

    # check if list is emptied due to branchpoint filtering
    # in that case, filter for either branchpoint OR polyppt
    if not filtered_intron_list and intronlist:
        for intron in intronlist:
            if intron.branchpoint or (intron.ppt5p or intron.ppt3p):
                filtered_intron_list.append( intron )

    # return list of filtered introns
    return filtered_intron_list
Beispiel #3
0
def _merge_pacbporfs_by_intron(pfD,pfA,queryorsbjct,verbose=False,**kwargs):
    """
    Project splicesites from SBJCT intron on continious QUERY PacbPORFs

    @type  pfD: PacbPORF object
    @param pfD: PacbPORF object that has to deliver (aligned) donor sites

    @type  pfA: PacbPORF object
    @param pfA: PacbPORF object that has to deliver (aligned) acceptor sites

    @type  queryorsbjct: string
    @param queryorsbjct: literal string 'query' or 'sbjct'

    @type  verbose: Boolean
    @param verbose: print debugging info to STDOUT when True

    @attention: see pacb.connecting.merge_orfs_with_intron for **kwargs)

    @rtype:  list
    @return: list with ProjectedIntrons (from Sbjct on Query)
    """
    # input validation
    IsPacbPORF(pfD)
    IsPacbPORF(pfA)

    # edit **kwargs dictionary for some forced attributes
    _update_kwargs(kwargs,KWARGS_PROJECTED_INTRON)

    ### if not kwargs.has_key('projected_intron_max_nt_offset'):
    ###    kwargs['projected_intron_max_nt_offset'] = PROJECTED_INTRON_MAX_NT_OFFSET
    ### if not kwargs.has_key('projected_intron_max_aa_offset'):
    ###    kwargs['projected_intron_max_aa_offset'] = PROJECTED_INTRON_MAX_AA_OFFSET

    # settings for minimal alignment entropy score
    min_donor_site_alignment_entropy = 0.0
    min_acceptor_site_alignment_entropy = 0.0


    ELEGIABLE_SPLICE_SITE_AA_RANGE = 75

    sposD = pfD._get_original_alignment_pos_start()
    eposD = pfD._get_original_alignment_pos_end()
    sposA = pfA._get_original_alignment_pos_start()
    eposA = pfA._get_original_alignment_pos_end()
    if queryorsbjct == "query":
        # Orfs of SBJCT must be identical
        IsIdenticalOrfs(pfD.orfS,pfA.orfS)
        donorOrf = pfD.orfQ
        accepOrf = pfA.orfQ
        prjctOrf = pfD.orfS # pfD.orfS == pfA.orfS
        dStart = sposD.query_dna_start  # ALIGNED start of donorPacbPORF
        dEnd   = pfD.query_dna_end      # ABSOLUTE end of donorPacbPORF
        aStart = pfA.query_dna_start    # ABSOLUTE start of acceptorPacbPORF
        aEnd   = eposA.query_dna_end    # ALIGNED end of acceptorPacbPORF
        outOfAlignmentAttribute = "sbjct_dna_start"
        # calculate elegiable splice site range
        qdr = pfD.alignment_dna_range_query()
        qar = pfA.alignment_dna_range_query()
        min_donor_pos = max([ min(qdr), max(qdr)-(ELEGIABLE_SPLICE_SITE_AA_RANGE*3) ])
        max_accep_pos = min([ max(qar), min(qar)+(ELEGIABLE_SPLICE_SITE_AA_RANGE*3) ])

    elif queryorsbjct == "sbjct":
        # Orfs of QUERY  must be identical
        IsIdenticalOrfs(pfD.orfQ,pfA.orfQ)
        donorOrf = pfD.orfS
        accepOrf = pfA.orfS
        prjctOrf = pfD.orfQ # pfD.orfQ == pfA.orfQ
        dStart = sposD.sbjct_dna_start  # ALIGNED start of donorPacbPORF
        dEnd   = pfD.sbjct_dna_end      # ABSOLUTE end of donorPacbPORF
        aStart = pfA.sbjct_dna_start    # ABSOLUTE start of acceptorPacbPORF
        aEnd   = eposA.sbjct_dna_end    # ALIGNED end of acceptorPacbPORF
        outOfAlignmentAttribute = "query_dna_start"
        # calculate elegiable splice site range
        sdr = pfD.alignment_dna_range_sbjct()
        sar = pfA.alignment_dna_range_sbjct()
        min_donor_pos = max([ min(sdr), max(sdr)-(ELEGIABLE_SPLICE_SITE_AA_RANGE*3) ])
        max_accep_pos = min([ max(sar), min(sar)+(ELEGIABLE_SPLICE_SITE_AA_RANGE*3) ])

    else:
        message = "'queryorsbjct' (%s), not 'query' or 'sbjct'" % queryorsbjct
        raise InproperlyAppliedArgument, message

    # predict introns only in `queryorsbjct` Orfs
    # introns is a list of IntronConnectingOrfs objects
    introns = merge_orfs_with_intron(donorOrf,accepOrf,
            min_donor_pos=min_donor_pos,
            max_acceptor_pos=max_accep_pos,
            order_by='length',**kwargs)

    # return list with projected introns
    projected_introns = []

    # gather unique donor and acceptor positions from list
    # of IntronConnectingOrfs
    for intron in introns:
        # break if intron is to large
        if kwargs['max_intron_nt_length'] and intron.length > kwargs['max_intron_nt_length']: break
        # continue if intron is to small
        if kwargs['min_intron_nt_length'] and intron.length < kwargs['min_intron_nt_length']: continue
        # continue if intron has non-canonical features


        # check if intron.start is on pfD;
        # inframe-introns can be projected outside of pfD/pfA area
        if intron.start <= dStart: continue
        if intron.start >= dEnd:   continue

        # check if intron.end is on pfA;
        # inframe-introns can be projected outside of pfD/pfA area
        if intron.end <= aStart: continue
        if intron.end >= aEnd:   continue

        if queryorsbjct == "sbjct":
            # get positions of donor & acceptor in the PacbPORF alignment
            donorPositionPos, phaseD = pfD.dnaposition_sbjct(intron.donor.pos,forced_return=True)
            accepPositionPos, phaseA = pfA.dnaposition_sbjct(intron.acceptor.pos,forced_return=True)
            # calculate projected distance on QUERY
            posDposQuery = pfD._positions[donorPositionPos].query_pos
            posAposQuery = pfA._positions[accepPositionPos].query_pos
            aaDistance   = posAposQuery - posDposQuery
        else:
            # get positions of donor & acceptor in the PacbPORF alignment
            donorPositionPos, phaseD = pfD.dnaposition_query(intron.donor.pos,forced_return=True)
            accepPositionPos, phaseA = pfA.dnaposition_query(intron.acceptor.pos,forced_return=True)
            # calculate binary entropy from projected position on SBJCT
            posDposSbjct = pfD._positions[donorPositionPos].sbjct_pos
            posAposSbjct = pfA._positions[accepPositionPos].sbjct_pos
            aaDistance   = posAposSbjct - posDposSbjct

        # calculate binary entropy score
        entropyDonorSbjct   = pfD.alignment_entropy(donorPositionPos,method='donor')
        entropyAcceptorSbjct= pfA.alignment_entropy(accepPositionPos,method='acceptor')

        # do distance check upon (projected) intron acceptance
        if abs(aaDistance) <= kwargs['max_aa_offset']:

            # check if we've runned out of the aligned part
            outofalignedpacbporf = False

            # get the projected donor position; mind the gap on this spot ;-)
            while pfD._positions[donorPositionPos].isa_gap and donorPositionPos > 0 :
                donorPositionPos -= 1
            else:
                projected_donor_position = getattr(pfD._positions[donorPositionPos],outOfAlignmentAttribute) + phaseD
                if donorPositionPos == 0 and pfD._positions[donorPositionPos].isa_gap:
                    print "WarningThatIsTackled::outofalignedpacbporf::donor"
                    outofalignedpacbporf = True

            # get the projected acceptor position; mind the gap on this spot ;-)
            while pfA._positions[accepPositionPos].isa_gap and len(pfA._positions) > accepPositionPos+1:
                accepPositionPos += 1
            else:
                projected_accep_position = getattr(pfA._positions[accepPositionPos],outOfAlignmentAttribute) + phaseA
                if accepPositionPos == len(pfA._positions)-1 and pfA._positions[accepPositionPos].isa_gap:
                    print "WarningThatIsTackled::outofalignedpacbporf::acceptor"
                    outofalignedpacbporf = True

            if not outofalignedpacbporf:
                ################################################################
                # set some meta-data properties to the intron object
                ################################################################
                # add distance score to intron
                intron._distance = abs(aaDistance)*3

                # add Alignment Positional Periphery Score into objects
                if queryorsbjct == "query":
                    succes = set_apps_intron_query(intron,pfD,pfA)
                else:
                    succes = set_apps_intron_sbjct(intron,pfD,pfA)
        
                # set GFF fsource attribute for recognition of intron sources
                intron._gff['fsource'] = "ABGPprojecting"

                # make a ProjectedIntronConnectingOrfs object
                pico = ProjectedIntronConnectingOrfs(prjctOrf,
                        projected_donor_position,
                        projected_accep_position)
                intron.binary_entropy_donor = entropyDonorSbjct
                intron.binary_entropy_acceptor = entropyAcceptorSbjct
                pico.add_projected_intron( intron )
                pico.phase = intron.phase
                projected_introns.append( pico )

                ################################################################
                if verbose:
                    print "PROJ::", intron._distance,
                    print (pfD.orfQ.id, pfA.orfQ.id),
                    print (pfD.orfS.id, pfA.orfS.id),
                    print "%s-%snt" % (intron.donor.pos, intron.acceptor.pos),
                    print "%2.1f,%2.1f" % (intron.donor.pssm_score, intron.acceptor.pssm_score),
                    print "%2.1f,%2.1f" % (intron.binary_entropy_donor,intron.binary_entropy_acceptor)
                ################################################################

        if aaDistance > kwargs['max_aa_offset']:
            # break out; ordered by length can never result in
            # a proper projected intron
            break


    # filter out less relevant ones compared to complete set of results
    projected_introns = _filter_projected_introns(projected_introns)

    # and return a list of ProjectedIntronConnectingOrfs
    return projected_introns
Beispiel #4
0
def merge_pacbporfs_with_introns(pacbporfD,pacbporfA,verbose=False,**kwargs):
    """
    Merge 2 PacbPORF objects by introns

    @attention: see orfs.merge_orfs_with_intron for **kwargs
    @attention: see functions._filter_for_alignable_splice_sites for **kwargs
    @attention: see functions._filter_for_entropy for **kwargs

    @type  pacbporfD: PacbPORF object
    @param pacbporfD: PacbPORF object that has to deliver PSSM donor objects

    @type  pacbporfA: PacbPORF object
    @param pacbporfA: PacbPORF object that has to deliver PSSM acceptor objects

    @type  verbose: Boolean
    @param verbose: print status/debugging messages to STDOUT

    @rtype:  list
    @return: list with ( intron, intron ), in query and sbjct
    """
    # input validation
    IsPacbPORF(pacbporfD)
    IsPacbPORF(pacbporfA)

    # edit **kwargs dictionary for some forced attributes
    _update_kwargs(kwargs,KWARGS_MAPPED_INTRON)
    if not kwargs.has_key('aligned_site_max_triplet_distance'):
        kwargs['aligned_site_max_triplet_distance'] = kwargs['max_aa_offset']

    # settings for minimal alignment entropy score
    min_donor_site_alignment_entropy = 0.0
    min_acceptor_site_alignment_entropy = 0.0

    # calculate maximal/minimal donor/acceptor site position based on alignment
    ELEGIABLE_SPLICE_SITE_AA_RANGE = 75

    qdr = pacbporfD.alignment_dna_range_query()
    qar = pacbporfA.alignment_dna_range_query()
    min_donor_query_pos = max([ min(qdr), max(qdr)-(ELEGIABLE_SPLICE_SITE_AA_RANGE*3) ])
    max_accep_query_pos = min([ max(qar), min(qar)+(ELEGIABLE_SPLICE_SITE_AA_RANGE*3) ])

    sdr = pacbporfD.alignment_dna_range_sbjct()
    sar = pacbporfA.alignment_dna_range_sbjct()
    min_donor_sbjct_pos = max([ min(sdr), max(sdr)-(ELEGIABLE_SPLICE_SITE_AA_RANGE*3) ])
    max_accep_sbjct_pos = min([ max(sar), min(sar)+(ELEGIABLE_SPLICE_SITE_AA_RANGE*3) ])

    # get list of introns
    #intronsQ = merge_orfs_with_intron(pacbporfD.orfQ,pacbporfA.orfQ,
    #        min_donor_pos   =min_donor_query_pos,
    #        max_acceptor_pos=max_accep_query_pos,**kwargs)
    #intronsS = merge_orfs_with_intron(pacbporfD.orfS,pacbporfA.orfS,
    #        min_donor_pos   =min_donor_sbjct_pos,
    #        max_acceptor_pos=max_accep_sbjct_pos,**kwargs)

    # get list of introns
    intronsQ = merge_orfs_with_intron(pacbporfD.orfQ,pacbporfA.orfQ,**kwargs)
    intronsS = merge_orfs_with_intron(pacbporfD.orfS,pacbporfA.orfS,**kwargs)

    # get unique list of donors & acceptors
    donorQ = olba( list(Set([inQ.donor for inQ in intronsQ ])), order_by='pos')
    donorS = olba( list(Set([inS.donor for inS in intronsS ])), order_by='pos')
    accepQ = olba( list(Set([inQ.acceptor for inQ in intronsQ ])), order_by='pos')
    accepS = olba( list(Set([inS.acceptor for inS in intronsS ])), order_by='pos')

    ############################################################################
    if verbose:
        print "dQ1", [ d.pos for d in donorQ ], "aQ1", [ a.pos for a in accepQ ]
        print "dS1", [ d.pos for d in donorS ], "aS1", [ a.pos for a in accepS ]
    ############################################################################

    # filter for alignable donor & acceptor sites
    kwargs['allow_non_canonical'] = kwargs['allow_non_canonical_donor']
    algdonors = _filter_for_alignable_splice_sites(donorQ,donorS,pacbporfD,**kwargs)
    kwargs['allow_non_canonical'] = kwargs['allow_non_canonical_acceptor']
    algacceps = _filter_for_alignable_splice_sites(accepQ,accepS,pacbporfA,**kwargs)

    ############################################################################
    if verbose:
        print "dQ2", [ _dq.pos for (_dq,_ds) in algdonors ],
        print "aQ2", [ _aq.pos for (_aq,_as) in algacceps ]
        print "dS2", [ _ds.pos for (_dq,_ds) in algdonors ],
        print "aS2", [ _as.pos for (_aq,_as) in algacceps ]
    ############################################################################

    # remove sites with to low alignment entropy
    algdonors = _filter_for_entropy(algdonors,pacbporfD,'donor',
                min_alignment_entropy=min_donor_site_alignment_entropy)
    algacceps = _filter_for_entropy(algacceps,pacbporfA,'acceptor',
                min_alignment_entropy=min_acceptor_site_alignment_entropy)

    ############################################################################
    if verbose:
        print "dQ3", [ _dq.pos for (_dq,_ds) in algdonors ],
        print "aQ3", [ _aq.pos for (_aq,_as) in algacceps ]
        print "dS3", [ _ds.pos for (_dq,_ds) in algdonors ],
        print "aS3", [ _as.pos for (_aq,_as) in algacceps ]
    ############################################################################


    # make unique position lists for quick lookup in intron lists
    dQpl = Set([ dQ.pos for dQ,dS in algdonors ])
    dSpl = Set([ dS.pos for dQ,dS in algdonors ])
    aQpl = Set([ aQ.pos for aQ,aS in algacceps ])
    aSpl = Set([ aS.pos for aQ,aS in algacceps ])

    # check exterior boundaries of PacbPORFs
    sposD = pacbporfD._get_original_alignment_pos_start()
    eposD = pacbporfD._get_original_alignment_pos_end()
    sposA = pacbporfA._get_original_alignment_pos_start()
    eposA = pacbporfA._get_original_alignment_pos_end()

    # now make list of aligable introns
    algintrons = []
    for intQ in intronsQ:
        # check if intron falls within the PacbPORF aligned area
        if intQ.donor.pos <= sposD.query_dna_start: continue
        if intQ.acceptor.pos >= eposA.query_dna_end: continue
        if intQ.donor.pos in dQpl and intQ.acceptor.pos in aQpl:
            # Query intron occurs in list of alignable splice sites!
            for intS in intronsS:
                # check if intron falls within the PacbPORF aligned area
                if intS.donor.pos <= sposD.sbjct_dna_start: continue
                if intS.acceptor.pos >= eposA.sbjct_dna_end: continue
                if intS.donor.pos in dSpl and intS.acceptor.pos in aSpl:
                    # Sbjct intron occurs as well in alignable splice sites!
                    if (intQ.donor,intS.donor) in algdonors and\
                    (intQ.acceptor,intS.acceptor) in algacceps:
                        # Sbjct & Query Donor & Acceptor are alignable!
                        algintrons.append( ( intQ, intS ) )

    ############################################################################
    # set some meta-data properties to the intron objects
    ############################################################################
    for intQ,intS in algintrons:
        distDnt = pacbporfD.get_distance_aligned_nucleotide_positions(
                        query = intQ.donor.pos, sbjct = intS.donor.pos
                        )
        distAnt = pacbporfA.get_distance_aligned_nucleotide_positions(
                        query = intQ.acceptor.pos, sbjct = intS.acceptor.pos
                        )

        # final distance check. kwargs['aligned_site_max_triplet_distance']
        # is applied on donor and acceptor site. This distance measured on the
        # protein sequence can be DOUBLED in case distDnt / distAnt are
        # opposite (+ and -). Check here if the protein sequence gap is
        # as well <= kwargs['aligned_site_max_triplet_distance'].
        if abs(distAnt - distDnt) > kwargs['aligned_site_max_triplet_distance']*3:
            continue

        # add distance score to introns
        intQ._distance = abs(distDnt) + abs(distAnt)
        intS._distance = abs(distDnt) + abs(distAnt)

        # add Alignment Positional Periphery Score into objects
        succes = set_apps_intron_query(intQ,pacbporfD,pacbporfA)
        succes = set_apps_intron_sbjct(intS,pacbporfD,pacbporfA)

        # set GFF fsource attribute for recognition of intron sources
        intQ._gff['fsource'] = "ABGPmapping"
        intS._gff['fsource'] = "ABGPmapping"

        ########################################################################
        if verbose:
            # some printing....
            print "Aligned introns:", ( intQ.donor.pos, intQ.acceptor.pos ) ,
            print ( intS.donor.pos, intS.acceptor.pos ),
            print "DIST:", distDnt, distAnt,
            print "[%s]" % kwargs['aligned_site_max_triplet_distance'],
            print "ENTROPY: %1.2f %1.2f" % (intQ._apps_donor, intQ._apps_accep),
            print "PSSM: (%1.2f %1.2f) (%1.2f %1.2f)" % (
                intQ.donor.pssm_score, intS.donor.pssm_score,
                intQ.acceptor.pssm_score, intS.acceptor.pssm_score,
                )
        ########################################################################

    # return lists of aligned introns
    return algintrons
Beispiel #5
0
def _merge_pacbporfs_by_intron(pfD,
                               pfA,
                               queryorsbjct,
                               verbose=False,
                               **kwargs):
    """
    Project splicesites from SBJCT intron on continious QUERY PacbPORFs

    @type  pfD: PacbPORF object
    @param pfD: PacbPORF object that has to deliver (aligned) donor sites

    @type  pfA: PacbPORF object
    @param pfA: PacbPORF object that has to deliver (aligned) acceptor sites

    @type  queryorsbjct: string
    @param queryorsbjct: literal string 'query' or 'sbjct'

    @type  verbose: Boolean
    @param verbose: print debugging info to STDOUT when True

    @attention: see pacb.connecting.merge_orfs_with_intron for **kwargs)

    @rtype:  list
    @return: list with ProjectedIntrons (from Sbjct on Query)
    """
    # input validation
    IsPacbPORF(pfD)
    IsPacbPORF(pfA)

    # edit **kwargs dictionary for some forced attributes
    _update_kwargs(kwargs, KWARGS_PROJECTED_INTRON)

    ### if not kwargs.has_key('projected_intron_max_nt_offset'):
    ###    kwargs['projected_intron_max_nt_offset'] = PROJECTED_INTRON_MAX_NT_OFFSET
    ### if not kwargs.has_key('projected_intron_max_aa_offset'):
    ###    kwargs['projected_intron_max_aa_offset'] = PROJECTED_INTRON_MAX_AA_OFFSET

    # settings for minimal alignment entropy score
    min_donor_site_alignment_entropy = 0.0
    min_acceptor_site_alignment_entropy = 0.0

    ELEGIABLE_SPLICE_SITE_AA_RANGE = 75

    sposD = pfD._get_original_alignment_pos_start()
    eposD = pfD._get_original_alignment_pos_end()
    sposA = pfA._get_original_alignment_pos_start()
    eposA = pfA._get_original_alignment_pos_end()
    if queryorsbjct == "query":
        # Orfs of SBJCT must be identical
        IsIdenticalOrfs(pfD.orfS, pfA.orfS)
        donorOrf = pfD.orfQ
        accepOrf = pfA.orfQ
        prjctOrf = pfD.orfS  # pfD.orfS == pfA.orfS
        dStart = sposD.query_dna_start  # ALIGNED start of donorPacbPORF
        dEnd = pfD.query_dna_end  # ABSOLUTE end of donorPacbPORF
        aStart = pfA.query_dna_start  # ABSOLUTE start of acceptorPacbPORF
        aEnd = eposA.query_dna_end  # ALIGNED end of acceptorPacbPORF
        outOfAlignmentAttribute = "sbjct_dna_start"
        # calculate elegiable splice site range
        qdr = pfD.alignment_dna_range_query()
        qar = pfA.alignment_dna_range_query()
        min_donor_pos = max(
            [min(qdr),
             max(qdr) - (ELEGIABLE_SPLICE_SITE_AA_RANGE * 3)])
        max_accep_pos = min(
            [max(qar),
             min(qar) + (ELEGIABLE_SPLICE_SITE_AA_RANGE * 3)])

    elif queryorsbjct == "sbjct":
        # Orfs of QUERY  must be identical
        IsIdenticalOrfs(pfD.orfQ, pfA.orfQ)
        donorOrf = pfD.orfS
        accepOrf = pfA.orfS
        prjctOrf = pfD.orfQ  # pfD.orfQ == pfA.orfQ
        dStart = sposD.sbjct_dna_start  # ALIGNED start of donorPacbPORF
        dEnd = pfD.sbjct_dna_end  # ABSOLUTE end of donorPacbPORF
        aStart = pfA.sbjct_dna_start  # ABSOLUTE start of acceptorPacbPORF
        aEnd = eposA.sbjct_dna_end  # ALIGNED end of acceptorPacbPORF
        outOfAlignmentAttribute = "query_dna_start"
        # calculate elegiable splice site range
        sdr = pfD.alignment_dna_range_sbjct()
        sar = pfA.alignment_dna_range_sbjct()
        min_donor_pos = max(
            [min(sdr),
             max(sdr) - (ELEGIABLE_SPLICE_SITE_AA_RANGE * 3)])
        max_accep_pos = min(
            [max(sar),
             min(sar) + (ELEGIABLE_SPLICE_SITE_AA_RANGE * 3)])

    else:
        message = "'queryorsbjct' (%s), not 'query' or 'sbjct'" % queryorsbjct
        raise InproperlyAppliedArgument, message

    # predict introns only in `queryorsbjct` Orfs
    # introns is a list of IntronConnectingOrfs objects
    introns = merge_orfs_with_intron(donorOrf,
                                     accepOrf,
                                     min_donor_pos=min_donor_pos,
                                     max_acceptor_pos=max_accep_pos,
                                     order_by='length',
                                     **kwargs)

    # return list with projected introns
    projected_introns = []

    # gather unique donor and acceptor positions from list
    # of IntronConnectingOrfs
    for intron in introns:
        # break if intron is to large
        if kwargs['max_intron_nt_length'] and intron.length > kwargs[
                'max_intron_nt_length']:
            break
        # continue if intron is to small
        if kwargs['min_intron_nt_length'] and intron.length < kwargs[
                'min_intron_nt_length']:
            continue
        # continue if intron has non-canonical features

        # check if intron.start is on pfD;
        # inframe-introns can be projected outside of pfD/pfA area
        if intron.start <= dStart: continue
        if intron.start >= dEnd: continue

        # check if intron.end is on pfA;
        # inframe-introns can be projected outside of pfD/pfA area
        if intron.end <= aStart: continue
        if intron.end >= aEnd: continue

        if queryorsbjct == "sbjct":
            # get positions of donor & acceptor in the PacbPORF alignment
            donorPositionPos, phaseD = pfD.dnaposition_sbjct(
                intron.donor.pos, forced_return=True)
            accepPositionPos, phaseA = pfA.dnaposition_sbjct(
                intron.acceptor.pos, forced_return=True)
            # calculate projected distance on QUERY
            posDposQuery = pfD._positions[donorPositionPos].query_pos
            posAposQuery = pfA._positions[accepPositionPos].query_pos
            aaDistance = posAposQuery - posDposQuery
        else:
            # get positions of donor & acceptor in the PacbPORF alignment
            donorPositionPos, phaseD = pfD.dnaposition_query(
                intron.donor.pos, forced_return=True)
            accepPositionPos, phaseA = pfA.dnaposition_query(
                intron.acceptor.pos, forced_return=True)
            # calculate binary entropy from projected position on SBJCT
            posDposSbjct = pfD._positions[donorPositionPos].sbjct_pos
            posAposSbjct = pfA._positions[accepPositionPos].sbjct_pos
            aaDistance = posAposSbjct - posDposSbjct

        # calculate binary entropy score
        entropyDonorSbjct = pfD.alignment_entropy(donorPositionPos,
                                                  method='donor')
        entropyAcceptorSbjct = pfA.alignment_entropy(accepPositionPos,
                                                     method='acceptor')

        # do distance check upon (projected) intron acceptance
        if abs(aaDistance) <= kwargs['max_aa_offset']:

            # check if we've runned out of the aligned part
            outofalignedpacbporf = False

            # get the projected donor position; mind the gap on this spot ;-)
            while pfD._positions[
                    donorPositionPos].isa_gap and donorPositionPos > 0:
                donorPositionPos -= 1
            else:
                projected_donor_position = getattr(
                    pfD._positions[donorPositionPos],
                    outOfAlignmentAttribute) + phaseD
                if donorPositionPos == 0 and pfD._positions[
                        donorPositionPos].isa_gap:
                    print "WarningThatIsTackled::outofalignedpacbporf::donor"
                    outofalignedpacbporf = True

            # get the projected acceptor position; mind the gap on this spot ;-)
            while pfA._positions[accepPositionPos].isa_gap and len(
                    pfA._positions) > accepPositionPos + 1:
                accepPositionPos += 1
            else:
                projected_accep_position = getattr(
                    pfA._positions[accepPositionPos],
                    outOfAlignmentAttribute) + phaseA
                if accepPositionPos == len(
                        pfA._positions
                ) - 1 and pfA._positions[accepPositionPos].isa_gap:
                    print "WarningThatIsTackled::outofalignedpacbporf::acceptor"
                    outofalignedpacbporf = True

            if not outofalignedpacbporf:
                ################################################################
                # set some meta-data properties to the intron object
                ################################################################
                # add distance score to intron
                intron._distance = abs(aaDistance) * 3

                # add Alignment Positional Periphery Score into objects
                if queryorsbjct == "query":
                    succes = set_apps_intron_query(intron, pfD, pfA)
                else:
                    succes = set_apps_intron_sbjct(intron, pfD, pfA)

                # set GFF fsource attribute for recognition of intron sources
                intron._gff['fsource'] = "ABGPprojecting"

                # make a ProjectedIntronConnectingOrfs object
                pico = ProjectedIntronConnectingOrfs(prjctOrf,
                                                     projected_donor_position,
                                                     projected_accep_position)
                intron.binary_entropy_donor = entropyDonorSbjct
                intron.binary_entropy_acceptor = entropyAcceptorSbjct
                pico.add_projected_intron(intron)
                pico.phase = intron.phase
                projected_introns.append(pico)

                ################################################################
                if verbose:
                    print "PROJ::", intron._distance,
                    print(pfD.orfQ.id, pfA.orfQ.id),
                    print(pfD.orfS.id, pfA.orfS.id),
                    print "%s-%snt" % (intron.donor.pos, intron.acceptor.pos),
                    print "%2.1f,%2.1f" % (intron.donor.pssm_score,
                                           intron.acceptor.pssm_score),
                    print "%2.1f,%2.1f" % (intron.binary_entropy_donor,
                                           intron.binary_entropy_acceptor)
                ################################################################

        if aaDistance > kwargs['max_aa_offset']:
            # break out; ordered by length can never result in
            # a proper projected intron
            break

    # filter out less relevant ones compared to complete set of results
    projected_introns = _filter_projected_introns(projected_introns)

    # and return a list of ProjectedIntronConnectingOrfs
    return projected_introns
Beispiel #6
0
def merge_pacbporfs_with_introns(pacbporfD,
                                 pacbporfA,
                                 verbose=False,
                                 **kwargs):
    """
    Merge 2 PacbPORF objects by introns

    @attention: see orfs.merge_orfs_with_intron for **kwargs
    @attention: see functions._filter_for_alignable_splice_sites for **kwargs
    @attention: see functions._filter_for_entropy for **kwargs

    @type  pacbporfD: PacbPORF object
    @param pacbporfD: PacbPORF object that has to deliver PSSM donor objects

    @type  pacbporfA: PacbPORF object
    @param pacbporfA: PacbPORF object that has to deliver PSSM acceptor objects

    @type  verbose: Boolean
    @param verbose: print status/debugging messages to STDOUT

    @rtype:  list
    @return: list with ( intron, intron ), in query and sbjct
    """
    # input validation
    IsPacbPORF(pacbporfD)
    IsPacbPORF(pacbporfA)

    # edit **kwargs dictionary for some forced attributes
    _update_kwargs(kwargs, KWARGS_MAPPED_INTRON)
    if not kwargs.has_key('aligned_site_max_triplet_distance'):
        kwargs['aligned_site_max_triplet_distance'] = kwargs['max_aa_offset']

    # settings for minimal alignment entropy score
    min_donor_site_alignment_entropy = 0.0
    min_acceptor_site_alignment_entropy = 0.0

    # calculate maximal/minimal donor/acceptor site position based on alignment
    ELEGIABLE_SPLICE_SITE_AA_RANGE = 75

    qdr = pacbporfD.alignment_dna_range_query()
    qar = pacbporfA.alignment_dna_range_query()
    min_donor_query_pos = max(
        [min(qdr), max(qdr) - (ELEGIABLE_SPLICE_SITE_AA_RANGE * 3)])
    max_accep_query_pos = min(
        [max(qar), min(qar) + (ELEGIABLE_SPLICE_SITE_AA_RANGE * 3)])

    sdr = pacbporfD.alignment_dna_range_sbjct()
    sar = pacbporfA.alignment_dna_range_sbjct()
    min_donor_sbjct_pos = max(
        [min(sdr), max(sdr) - (ELEGIABLE_SPLICE_SITE_AA_RANGE * 3)])
    max_accep_sbjct_pos = min(
        [max(sar), min(sar) + (ELEGIABLE_SPLICE_SITE_AA_RANGE * 3)])

    # get list of introns
    #intronsQ = merge_orfs_with_intron(pacbporfD.orfQ,pacbporfA.orfQ,
    #        min_donor_pos   =min_donor_query_pos,
    #        max_acceptor_pos=max_accep_query_pos,**kwargs)
    #intronsS = merge_orfs_with_intron(pacbporfD.orfS,pacbporfA.orfS,
    #        min_donor_pos   =min_donor_sbjct_pos,
    #        max_acceptor_pos=max_accep_sbjct_pos,**kwargs)

    # get list of introns
    intronsQ = merge_orfs_with_intron(pacbporfD.orfQ, pacbporfA.orfQ, **kwargs)
    intronsS = merge_orfs_with_intron(pacbporfD.orfS, pacbporfA.orfS, **kwargs)

    # get unique list of donors & acceptors
    donorQ = olba(list(Set([inQ.donor for inQ in intronsQ])), order_by='pos')
    donorS = olba(list(Set([inS.donor for inS in intronsS])), order_by='pos')
    accepQ = olba(list(Set([inQ.acceptor for inQ in intronsQ])),
                  order_by='pos')
    accepS = olba(list(Set([inS.acceptor for inS in intronsS])),
                  order_by='pos')

    ############################################################################
    if verbose:
        print "dQ1", [d.pos for d in donorQ], "aQ1", [a.pos for a in accepQ]
        print "dS1", [d.pos for d in donorS], "aS1", [a.pos for a in accepS]
    ############################################################################

    # filter for alignable donor & acceptor sites
    kwargs['allow_non_canonical'] = kwargs['allow_non_canonical_donor']
    algdonors = _filter_for_alignable_splice_sites(donorQ, donorS, pacbporfD,
                                                   **kwargs)
    kwargs['allow_non_canonical'] = kwargs['allow_non_canonical_acceptor']
    algacceps = _filter_for_alignable_splice_sites(accepQ, accepS, pacbporfA,
                                                   **kwargs)

    ############################################################################
    if verbose:
        print "dQ2", [_dq.pos for (_dq, _ds) in algdonors],
        print "aQ2", [_aq.pos for (_aq, _as) in algacceps]
        print "dS2", [_ds.pos for (_dq, _ds) in algdonors],
        print "aS2", [_as.pos for (_aq, _as) in algacceps]
    ############################################################################

    # remove sites with to low alignment entropy
    algdonors = _filter_for_entropy(
        algdonors,
        pacbporfD,
        'donor',
        min_alignment_entropy=min_donor_site_alignment_entropy)
    algacceps = _filter_for_entropy(
        algacceps,
        pacbporfA,
        'acceptor',
        min_alignment_entropy=min_acceptor_site_alignment_entropy)

    ############################################################################
    if verbose:
        print "dQ3", [_dq.pos for (_dq, _ds) in algdonors],
        print "aQ3", [_aq.pos for (_aq, _as) in algacceps]
        print "dS3", [_ds.pos for (_dq, _ds) in algdonors],
        print "aS3", [_as.pos for (_aq, _as) in algacceps]
    ############################################################################

    # make unique position lists for quick lookup in intron lists
    dQpl = Set([dQ.pos for dQ, dS in algdonors])
    dSpl = Set([dS.pos for dQ, dS in algdonors])
    aQpl = Set([aQ.pos for aQ, aS in algacceps])
    aSpl = Set([aS.pos for aQ, aS in algacceps])

    # check exterior boundaries of PacbPORFs
    sposD = pacbporfD._get_original_alignment_pos_start()
    eposD = pacbporfD._get_original_alignment_pos_end()
    sposA = pacbporfA._get_original_alignment_pos_start()
    eposA = pacbporfA._get_original_alignment_pos_end()

    # now make list of aligable introns
    algintrons = []
    for intQ in intronsQ:
        # check if intron falls within the PacbPORF aligned area
        if intQ.donor.pos <= sposD.query_dna_start: continue
        if intQ.acceptor.pos >= eposA.query_dna_end: continue
        if intQ.donor.pos in dQpl and intQ.acceptor.pos in aQpl:
            # Query intron occurs in list of alignable splice sites!
            for intS in intronsS:
                # check if intron falls within the PacbPORF aligned area
                if intS.donor.pos <= sposD.sbjct_dna_start: continue
                if intS.acceptor.pos >= eposA.sbjct_dna_end: continue
                if intS.donor.pos in dSpl and intS.acceptor.pos in aSpl:
                    # Sbjct intron occurs as well in alignable splice sites!
                    if (intQ.donor,intS.donor) in algdonors and\
                    (intQ.acceptor,intS.acceptor) in algacceps:
                        # Sbjct & Query Donor & Acceptor are alignable!
                        algintrons.append((intQ, intS))

    ############################################################################
    # set some meta-data properties to the intron objects
    ############################################################################
    for intQ, intS in algintrons:
        distDnt = pacbporfD.get_distance_aligned_nucleotide_positions(
            query=intQ.donor.pos, sbjct=intS.donor.pos)
        distAnt = pacbporfA.get_distance_aligned_nucleotide_positions(
            query=intQ.acceptor.pos, sbjct=intS.acceptor.pos)

        # final distance check. kwargs['aligned_site_max_triplet_distance']
        # is applied on donor and acceptor site. This distance measured on the
        # protein sequence can be DOUBLED in case distDnt / distAnt are
        # opposite (+ and -). Check here if the protein sequence gap is
        # as well <= kwargs['aligned_site_max_triplet_distance'].
        if abs(distAnt -
               distDnt) > kwargs['aligned_site_max_triplet_distance'] * 3:
            continue

        # add distance score to introns
        intQ._distance = abs(distDnt) + abs(distAnt)
        intS._distance = abs(distDnt) + abs(distAnt)

        # add Alignment Positional Periphery Score into objects
        succes = set_apps_intron_query(intQ, pacbporfD, pacbporfA)
        succes = set_apps_intron_sbjct(intS, pacbporfD, pacbporfA)

        # set GFF fsource attribute for recognition of intron sources
        intQ._gff['fsource'] = "ABGPmapping"
        intS._gff['fsource'] = "ABGPmapping"

        ########################################################################
        if verbose:
            # some printing....
            print "Aligned introns:", (intQ.donor.pos, intQ.acceptor.pos),
            print(intS.donor.pos, intS.acceptor.pos),
            print "DIST:", distDnt, distAnt,
            print "[%s]" % kwargs['aligned_site_max_triplet_distance'],
            print "ENTROPY: %1.2f %1.2f" % (intQ._apps_donor,
                                            intQ._apps_accep),
            print "PSSM: (%1.2f %1.2f) (%1.2f %1.2f)" % (
                intQ.donor.pssm_score,
                intS.donor.pssm_score,
                intQ.acceptor.pssm_score,
                intS.acceptor.pssm_score,
            )
        ########################################################################

    # return lists of aligned introns
    return algintrons