Esempio n. 1
0
def merge_pacbporfs_with_phase_shift_introns(pacbporfD,
                                             pacbporfA,
                                             verbose=False,
                                             **kwargs):
    """
    Merge 2 PacbPORF objects by introns of which one underwent a phase shift

    Calls merge_pacbporfs_with_introns() with 'allow_phase_shift' forced to
    True and keeps only those intron pairs of which the donor phase of the
    query intron differs from the donor phase of the sbjct intron.

    @attention: see pacb.connecting.merge_orfs_with_intron for **kwargs)
    @attention: _gff['fsource'] of each returned intron is set to "ABGPphs"

    @type  pacbporfD: PacbPORF object
    @param pacbporfD: PacbPORF object that has to deliver PSSM donor objects

    @type  pacbporfA: PacbPORF object
    @param pacbporfA: PacbPORF object that has to deliver PSSM acceptor objects

    @type  verbose: Boolean
    @param verbose: print status/debugging messages to STDOUT
                    (not referenced in this function; it is not forwarded)

    @rtype:  list
    @return: list with ( intronQ, intronS ) tuples
    """
    # input validation
    IsPacbPORF(pacbporfD)
    IsPacbPORF(pacbporfA)

    # edit **kwargs dictionary; phase shifts must be allowed in the
    # underlying intron merging, regardless of what the caller passed
    kwargs['allow_phase_shift'] = True
    _update_kwargs(kwargs, KWARGS_PHASE_SHIFT_INTRON)
    # `in` replaces the deprecated dict.has_key(); identical result
    if 'aligned_site_max_triplet_distance' not in kwargs:
        kwargs['aligned_site_max_triplet_distance'] = kwargs['max_aa_distance']

    # run regular merge_pacbporfs_with_introns function
    alg_introns = merge_pacbporfs_with_introns(pacbporfD, pacbporfA, **kwargs)
    psh_introns = []

    # keep only the intron pairs that differ in donor phase
    for intQ, intS in alg_introns:
        # check phase equilibrium -> if equal, no phase shift
        if intQ.donor.phase == intS.donor.phase:
            continue

        ########################################################################
        # set some meta-data properties to the intron objects
        # attribute _distance is already set in merge_pacbporfs_with_introns
        # attribute(s) ~APPS are already set in merge_pacbporfs_with_introns
        ########################################################################

        # set GFF fsource attribute for recognition of intron sources
        intQ._gff['fsource'] = "ABGPphs"
        intS._gff['fsource'] = "ABGPphs"

        # putatively a phase shifted intron pair
        psh_introns.append((intQ, intS))

    # return list of phase shifted intron pairs
    return psh_introns
Esempio n. 2
0
def merge_pacbporfs_with_phase_shift_introns(pacbporfD, pacbporfA,
                                             verbose=False, **kwargs):
    """
    Merge 2 PacbPORF objects by introns of which one underwent a phase shift

    The regular merge_pacbporfs_with_introns() function is run with
    'allow_phase_shift' forced to True. From its result, only the
    (query intron, sbjct intron) pairs with unequal donor phases are kept.

    @attention: see pacb.connecting.merge_orfs_with_intron for **kwargs)
    @attention: _gff['fsource'] of each returned intron is set to "ABGPphs"

    @type  pacbporfD: PacbPORF object
    @param pacbporfD: PacbPORF object that has to deliver PSSM donor objects

    @type  pacbporfA: PacbPORF object
    @param pacbporfA: PacbPORF object that has to deliver PSSM acceptor objects

    @type  verbose: Boolean
    @param verbose: print status/debugging messages to STDOUT
                    (not referenced in this function; it is not forwarded)

    @rtype:  list
    @return: list with ( intronQ, intronS ) tuples
    """
    # input validation
    IsPacbPORF(pacbporfD)
    IsPacbPORF(pacbporfA)

    # edit **kwargs dictionary; allowing phase shifts is not optional here
    kwargs['allow_phase_shift'] = True
    _update_kwargs(kwargs, KWARGS_PHASE_SHIFT_INTRON)
    # membership test instead of the deprecated dict.has_key()
    if 'aligned_site_max_triplet_distance' not in kwargs:
        kwargs['aligned_site_max_triplet_distance'] = kwargs['max_aa_distance']

    # run regular merge_pacbporfs_with_introns function
    alg_introns = merge_pacbporfs_with_introns(pacbporfD, pacbporfA, **kwargs)

    psh_introns = []
    for intQ, intS in alg_introns:
        # equal donor phase in query and sbjct -> no phase shift -> skip
        if intQ.donor.phase == intS.donor.phase:
            continue

        # attribute _distance and the ~APPS attribute(s) are already set
        # in merge_pacbporfs_with_introns; only label the intron source here
        intQ._gff['fsource'] = "ABGPphs"
        intS._gff['fsource'] = "ABGPphs"

        # putatively a phase shifted intron pair
        psh_introns.append((intQ, intS))

    # return list of phase shifted intron pairs
    return psh_introns
Esempio n. 3
0
def find_stopless3n_introns_on_orf(orfObj,
                                   has_branchpoint=False,
                                   has_polypyrimidine=False,
                                   order_by='length',
                                   **kwargs):
    """
    Find potential stopless3n introns on this orf

    Introns are predicted by merge_orfs_with_intron() with the Orf as both
    donor and acceptor Orf, optionally filtered on branchpoint and/or
    polypyrimidine tract presence, and returned ordered.

    @attention: **kwargs can contain other (here) unnecessarily arguments
    @attention: **kwargs are required in the merge_orfs_with_intron() function

    @type  orfObj: Orf object
    @param orfObj: Orf object which is scanned for stopless3n introns

    @type  has_branchpoint: Boolean
    @param has_branchpoint: when True, only keep introns that have a
            branchpoint whose distance deviates at most
            MAXIMAL_OPTIMAL_BRACNHPOINT_TO_ACCEPTOR_DISTANCE from one of the
            values in OPTIMAL_BRACNHPOINT_TO_ACCEPTOR_DISTANCE

    @type  has_polypyrimidine: Boolean
    @param has_polypyrimidine: when True, only keep introns that have a
            ppt5p or ppt3p polypyrimidine tract

    @type  order_by: string
    @param order_by: passed on to _order_intron_list()

    @rtype:  list
    @return: list with introns
    """
    # input validation
    IsOrf(orfObj)

    # edit **kwargs dictionary for some forced attributes
    _update_kwargs(kwargs, KWARGS_STOPLESS_3N_INTRONS)

    # find stopless3nintrons
    stopless3nintrons = merge_orfs_with_intron(orfObj, orfObj, **kwargs)

    # filter for presence of branchpoint / polypyrimidine tracks
    if has_branchpoint or has_polypyrimidine:
        filtered = []
        for intron in stopless3nintrons:
            intron.assign_bp_and_ppts()
            if has_branchpoint:
                # All branchpoint checks live inside this guard. Previously
                # the optimality was computed unconditionally, which raised
                # a TypeError (int - None) for introns without a branchpoint
                # distance when only has_polypyrimidine was requested.
                if not intron.branchpoint:
                    continue
                intron_bp_dist = intron.get_branchpoint_nt_distance()
                if intron_bp_dist is None:
                    continue
                intron_bp_optimality = min([
                    abs(offset - intron_bp_dist)
                    for offset in OPTIMAL_BRACNHPOINT_TO_ACCEPTOR_DISTANCE
                ])
                if intron_bp_optimality > MAXIMAL_OPTIMAL_BRACNHPOINT_TO_ACCEPTOR_DISTANCE:
                    continue
            if has_polypyrimidine and not (intron.ppt5p or intron.ppt3p):
                continue
            # if here, accepted!
            filtered.append(intron)
    else:
        filtered = stopless3nintrons

    # return ordered intron list
    return _order_intron_list(filtered, order_by=order_by)
Esempio n. 4
0
def _get_tinyexon_dict(input, omit_identifier_list=[], **kwargs):
    """
    Collect the potential tinyexons of all Orfs, per identifier

    @attention: **kwargs are first updated with KWARGS_TINYEXON_PAIRWISE and
                then passed to get_potential_tiny_exons_on_orf()

    @type  input: dict
    @param input: mapping of identifier -> data; for each identifier,
                  input[identifier]['orfs'].orfs is iterated for Orf objects

    @type  omit_identifier_list: list
    @param omit_identifier_list: identifiers to skip (absent from the result)

    @rtype:  dict
    @return: identifier -> list of tinyexons, as ordered by
             order_list_by_attribute(..., order_by='length')
    """
    _update_kwargs(kwargs, KWARGS_TINYEXON_PAIRWISE)
    tinyexondata = {}
    for orgid in input.keys():
        # skip identifiers the caller asked to leave out
        if orgid in omit_identifier_list:
            continue
        # gather tinyexons of all Orfs of this identifier in a single list
        collected = []
        for orfObj in input[orgid]['orfs'].orfs:
            collected.extend(get_potential_tiny_exons_on_orf(orfObj, **kwargs))
        tinyexondata[orgid] = order_list_by_attribute(collected,
                                                      order_by='length')
    # return dict with predicted tinyexons
    return tinyexondata
Esempio n. 5
0
def _get_tinyexon_dict(input,omit_identifier_list=[],**kwargs):
    """
    Predict tinyexons on every Orf and group them by identifier

    @attention: **kwargs are first updated with KWARGS_TINYEXON_PAIRWISE and
                then passed to get_potential_tiny_exons_on_orf()

    @type  input: dict
    @param input: mapping of identifier -> data; for each identifier,
                  input[identifier]['orfs'].orfs is iterated for Orf objects

    @type  omit_identifier_list: list
    @param omit_identifier_list: identifiers to skip (absent from the result)

    @rtype:  dict
    @return: identifier -> list of tinyexons, as ordered by
             order_list_by_attribute(..., order_by='length')
    """
    _update_kwargs(kwargs, KWARGS_TINYEXON_PAIRWISE)
    tinyexondata = {}
    for orgid in input.keys():
        if orgid not in omit_identifier_list:
            exons_of_org = []
            for orfObj in input[orgid]['orfs'].orfs:
                exons_on_orf = get_potential_tiny_exons_on_orf(orfObj, **kwargs)
                exons_of_org.extend(exons_on_orf)
            # store the tinyexons of this identifier ordered on length
            tinyexondata[orgid] = order_list_by_attribute(
                exons_of_org, order_by='length')
    # return dict with predicted tinyexons
    return tinyexondata
Esempio n. 6
0
def find_stopless3n_introns_on_orf(orfObj,
    has_branchpoint = False,
    has_polypyrimidine = False,
    order_by = 'length',**kwargs):
    """
    Find potential stopless3n introns on this orf

    Introns are predicted by merge_orfs_with_intron() with the Orf as both
    donor and acceptor Orf, optionally filtered on branchpoint and/or
    polypyrimidine tract presence, and returned ordered.

    @attention: **kwargs can contain other (here) unnecessarily arguments
    @attention: **kwargs are required in the merge_orfs_with_intron() function

    @type  orfObj: Orf object
    @param orfObj: Orf object which is scanned for stopless3n introns

    @type  has_branchpoint: Boolean
    @param has_branchpoint: when True, only keep introns that have a
            branchpoint whose distance deviates at most
            MAXIMAL_OPTIMAL_BRACNHPOINT_TO_ACCEPTOR_DISTANCE from one of the
            values in OPTIMAL_BRACNHPOINT_TO_ACCEPTOR_DISTANCE

    @type  has_polypyrimidine: Boolean
    @param has_polypyrimidine: when True, only keep introns that have a
            ppt5p or ppt3p polypyrimidine tract

    @type  order_by: string
    @param order_by: passed on to _order_intron_list()

    @rtype:  list
    @return: list with introns
    """
    # input validation
    IsOrf(orfObj)

    # edit **kwargs dictionary for some forced attributes
    _update_kwargs(kwargs,KWARGS_STOPLESS_3N_INTRONS)

    # find stopless3nintrons
    stopless3nintrons = merge_orfs_with_intron(orfObj,orfObj,**kwargs)

    # no filtering requested: only ordering remains to be done
    if not (has_branchpoint or has_polypyrimidine):
        return _order_intron_list(stopless3nintrons,order_by=order_by)

    # filter for presence of branchpoint / polypyrimidine tracks
    filtered = []
    for intron in stopless3nintrons:
        intron.assign_bp_and_ppts()
        if has_branchpoint:
            # The branchpoint distance is only evaluated when the branchpoint
            # filter is requested. Computing it unconditionally raised a
            # TypeError (int - None) for introns without a branchpoint
            # distance when filtering on polypyrimidine tracts only.
            if not intron.branchpoint:
                continue
            intron_bp_dist = intron.get_branchpoint_nt_distance()
            if intron_bp_dist is None:
                continue
            intron_bp_optimality = min([
                abs(offset-intron_bp_dist)
                for offset in OPTIMAL_BRACNHPOINT_TO_ACCEPTOR_DISTANCE ])
            if intron_bp_optimality > MAXIMAL_OPTIMAL_BRACNHPOINT_TO_ACCEPTOR_DISTANCE:
                continue
        if has_polypyrimidine and not (intron.ppt5p or intron.ppt3p):
            continue
        # if here, accepted!
        filtered.append( intron )

    # return ordered intron list
    return _order_intron_list(filtered,order_by=order_by)
Esempio n. 7
0
def _merge_pacbporfs_by_intron(pfD,pfA,queryorsbjct,verbose=False,**kwargs):
    """
    Predict introns in the `queryorsbjct` Orfs and project them on the other Orf

    Introns are predicted with merge_orfs_with_intron() between the donor
    and acceptor Orf of the sequence named by `queryorsbjct`. The Orf of the
    other sequence must be identical in both PacbPORFs. Each intron that lies
    within the PacbPORF boundaries, and whose donor and acceptor positions
    are at most kwargs['max_aa_offset'] apart on the other sequence, is
    wrapped in a ProjectedIntronConnectingOrfs object.

    @type  pfD: PacbPORF object
    @param pfD: PacbPORF object that has to deliver (aligned) donor sites

    @type  pfA: PacbPORF object
    @param pfA: PacbPORF object that has to deliver (aligned) acceptor sites

    @type  queryorsbjct: string
    @param queryorsbjct: literal string 'query' or 'sbjct'

    @type  verbose: Boolean
    @param verbose: print debugging info to STDOUT when True

    @attention: see pacb.connecting.merge_orfs_with_intron for **kwargs)
    @attention: kwargs keys read here: 'max_intron_nt_length',
                'min_intron_nt_length' and 'max_aa_offset'
    @attention: accepted introns get the attributes _distance,
                binary_entropy_donor, binary_entropy_acceptor and
                _gff['fsource'] ("ABGPprojecting") assigned

    @raise InproperlyAppliedArgument: `queryorsbjct` not 'query' or 'sbjct'

    @rtype:  list
    @return: list with ProjectedIntronConnectingOrfs objects, as returned
             by _filter_projected_introns()
    """
    # input validation
    IsPacbPORF(pfD)
    IsPacbPORF(pfA)

    # edit **kwargs dictionary for some forced attributes
    _update_kwargs(kwargs,KWARGS_PROJECTED_INTRON)

    # disabled legacy default handling, kept for reference
    ### if not kwargs.has_key('projected_intron_max_nt_offset'):
    ###    kwargs['projected_intron_max_nt_offset'] = PROJECTED_INTRON_MAX_NT_OFFSET
    ### if not kwargs.has_key('projected_intron_max_aa_offset'):
    ###    kwargs['projected_intron_max_aa_offset'] = PROJECTED_INTRON_MAX_AA_OFFSET

    # settings for minimal alignment entropy score
    # NOTE(review): both values are not referenced anywhere else in this function
    min_donor_site_alignment_entropy = 0.0
    min_acceptor_site_alignment_entropy = 0.0


    # size (in AA) of the window at the PacbPORF edge in which
    # splice sites are eligible; multiplied by 3 for nt coordinates
    ELEGIABLE_SPLICE_SITE_AA_RANGE = 75

    # NOTE(review): eposD and sposA are not referenced further in this function
    sposD = pfD._get_original_alignment_pos_start()
    eposD = pfD._get_original_alignment_pos_end()
    sposA = pfA._get_original_alignment_pos_start()
    eposA = pfA._get_original_alignment_pos_end()
    if queryorsbjct == "query":
        # Orfs of SBJCT must be identical
        IsIdenticalOrfs(pfD.orfS,pfA.orfS)
        donorOrf = pfD.orfQ
        accepOrf = pfA.orfQ
        prjctOrf = pfD.orfS # pfD.orfS == pfA.orfS
        dStart = sposD.query_dna_start  # ALIGNED start of donorPacbPORF
        dEnd   = pfD.query_dna_end      # ABSOLUTE end of donorPacbPORF
        aStart = pfA.query_dna_start    # ABSOLUTE start of acceptorPacbPORF
        aEnd   = eposA.query_dna_end    # ALIGNED end of acceptorPacbPORF
        # attribute that holds the projected coordinate on the other sequence
        outOfAlignmentAttribute = "sbjct_dna_start"
        # calculate eligible splice site range: donors at most 75 AA before
        # the end of pfD's aligned range, acceptors at most 75 AA after the
        # start of pfA's aligned range
        qdr = pfD.alignment_dna_range_query()
        qar = pfA.alignment_dna_range_query()
        min_donor_pos = max([ min(qdr), max(qdr)-(ELEGIABLE_SPLICE_SITE_AA_RANGE*3) ])
        max_accep_pos = min([ max(qar), min(qar)+(ELEGIABLE_SPLICE_SITE_AA_RANGE*3) ])

    elif queryorsbjct == "sbjct":
        # Orfs of QUERY  must be identical
        IsIdenticalOrfs(pfD.orfQ,pfA.orfQ)
        donorOrf = pfD.orfS
        accepOrf = pfA.orfS
        prjctOrf = pfD.orfQ # pfD.orfQ == pfA.orfQ
        dStart = sposD.sbjct_dna_start  # ALIGNED start of donorPacbPORF
        dEnd   = pfD.sbjct_dna_end      # ABSOLUTE end of donorPacbPORF
        aStart = pfA.sbjct_dna_start    # ABSOLUTE start of acceptorPacbPORF
        aEnd   = eposA.sbjct_dna_end    # ALIGNED end of acceptorPacbPORF
        # attribute that holds the projected coordinate on the other sequence
        outOfAlignmentAttribute = "query_dna_start"
        # calculate eligible splice site range (same windows as for 'query')
        sdr = pfD.alignment_dna_range_sbjct()
        sar = pfA.alignment_dna_range_sbjct()
        min_donor_pos = max([ min(sdr), max(sdr)-(ELEGIABLE_SPLICE_SITE_AA_RANGE*3) ])
        max_accep_pos = min([ max(sar), min(sar)+(ELEGIABLE_SPLICE_SITE_AA_RANGE*3) ])

    else:
        message = "'queryorsbjct' (%s), not 'query' or 'sbjct'" % queryorsbjct
        raise InproperlyAppliedArgument, message

    # predict introns only in `queryorsbjct` Orfs
    # introns is a list of IntronConnectingOrfs objects
    introns = merge_orfs_with_intron(donorOrf,accepOrf,
            min_donor_pos=min_donor_pos,
            max_acceptor_pos=max_accep_pos,
            order_by='length',**kwargs)

    # return list with projected introns
    projected_introns = []

    # evaluate each IntronConnectingOrfs object for projection; the
    # `break` statements below rely on the order_by='length' ordering
    for intron in introns:
        # break if intron is too large
        if kwargs['max_intron_nt_length'] and intron.length > kwargs['max_intron_nt_length']: break
        # continue if intron is too small
        if kwargs['min_intron_nt_length'] and intron.length < kwargs['min_intron_nt_length']: continue
        # NOTE(review): a check on non-canonical features was announced
        # here, but no code implements it


        # check if intron.start is on pfD;
        # inframe-introns can be projected outside of pfD/pfA area
        if intron.start <= dStart: continue
        if intron.start >= dEnd:   continue

        # check if intron.end is on pfA;
        # inframe-introns can be projected outside of pfD/pfA area
        if intron.end <= aStart: continue
        if intron.end >= aEnd:   continue

        if queryorsbjct == "sbjct":
            # get positions of donor & acceptor in the PacbPORF alignment
            donorPositionPos, phaseD = pfD.dnaposition_sbjct(intron.donor.pos,forced_return=True)
            accepPositionPos, phaseA = pfA.dnaposition_sbjct(intron.acceptor.pos,forced_return=True)
            # calculate projected distance on QUERY
            posDposQuery = pfD._positions[donorPositionPos].query_pos
            posAposQuery = pfA._positions[accepPositionPos].query_pos
            aaDistance   = posAposQuery - posDposQuery
        else:
            # get positions of donor & acceptor in the PacbPORF alignment
            donorPositionPos, phaseD = pfD.dnaposition_query(intron.donor.pos,forced_return=True)
            accepPositionPos, phaseA = pfA.dnaposition_query(intron.acceptor.pos,forced_return=True)
            # calculate projected distance on SBJCT
            posDposSbjct = pfD._positions[donorPositionPos].sbjct_pos
            posAposSbjct = pfA._positions[accepPositionPos].sbjct_pos
            aaDistance   = posAposSbjct - posDposSbjct

        # calculate binary entropy score at both splice site positions;
        # done before the gap-walking below changes the position indexes
        entropyDonorSbjct   = pfD.alignment_entropy(donorPositionPos,method='donor')
        entropyAcceptorSbjct= pfA.alignment_entropy(accepPositionPos,method='acceptor')

        # do distance check upon (projected) intron acceptance
        if abs(aaDistance) <= kwargs['max_aa_offset']:

            # check if we've run out of the aligned part
            outofalignedpacbporf = False

            # get the projected donor position; mind the gap on this spot ;-)
            # walk towards the start of pfD until a non-gap position is found.
            # The loop has no `break`, so its else-branch always runs once
            # the loop condition becomes False.
            while pfD._positions[donorPositionPos].isa_gap and donorPositionPos > 0 :
                donorPositionPos -= 1
            else:
                projected_donor_position = getattr(pfD._positions[donorPositionPos],outOfAlignmentAttribute) + phaseD
                # still a gap at the very first position: no usable projection
                if donorPositionPos == 0 and pfD._positions[donorPositionPos].isa_gap:
                    print "WarningThatIsTackled::outofalignedpacbporf::donor"
                    outofalignedpacbporf = True

            # get the projected acceptor position; mind the gap on this spot ;-)
            # walk towards the end of pfA until a non-gap position is found
            # (again a while/else without `break`: the else-branch always runs)
            while pfA._positions[accepPositionPos].isa_gap and len(pfA._positions) > accepPositionPos+1:
                accepPositionPos += 1
            else:
                projected_accep_position = getattr(pfA._positions[accepPositionPos],outOfAlignmentAttribute) + phaseA
                # still a gap at the very last position: no usable projection
                if accepPositionPos == len(pfA._positions)-1 and pfA._positions[accepPositionPos].isa_gap:
                    print "WarningThatIsTackled::outofalignedpacbporf::acceptor"
                    outofalignedpacbporf = True

            if not outofalignedpacbporf:
                ################################################################
                # set some meta-data properties to the intron object
                ################################################################
                # add distance score to intron (AA distance expressed in nt)
                intron._distance = abs(aaDistance)*3

                # add Alignment Positional Periphery Score into objects
                # (the returned value `succes` is not inspected)
                if queryorsbjct == "query":
                    succes = set_apps_intron_query(intron,pfD,pfA)
                else:
                    succes = set_apps_intron_sbjct(intron,pfD,pfA)

                # set GFF fsource attribute for recognition of intron sources
                intron._gff['fsource'] = "ABGPprojecting"

                # make a ProjectedIntronConnectingOrfs object
                pico = ProjectedIntronConnectingOrfs(prjctOrf,
                        projected_donor_position,
                        projected_accep_position)
                intron.binary_entropy_donor = entropyDonorSbjct
                intron.binary_entropy_acceptor = entropyAcceptorSbjct
                pico.add_projected_intron( intron )
                pico.phase = intron.phase
                projected_introns.append( pico )

                ################################################################
                if verbose:
                    print "PROJ::", intron._distance,
                    print (pfD.orfQ.id, pfA.orfQ.id),
                    print (pfD.orfS.id, pfA.orfS.id),
                    print "%s-%snt" % (intron.donor.pos, intron.acceptor.pos),
                    print "%2.1f,%2.1f" % (intron.donor.pssm_score, intron.acceptor.pssm_score),
                    print "%2.1f,%2.1f" % (intron.binary_entropy_donor,intron.binary_entropy_acceptor)
                ################################################################

        # note: signed comparison here, whereas the acceptance check
        # above uses abs(aaDistance)
        if aaDistance > kwargs['max_aa_offset']:
            # break out; ordered by length can never result in
            # a proper projected intron
            break


    # filter out less relevant ones compared to complete set of results
    projected_introns = _filter_projected_introns(projected_introns)

    # and return a list of ProjectedIntronConnectingOrfs
    return projected_introns
Esempio n. 8
0
def merge_orfs_with_two_tinyexons(preceding_orf,subsequent_orf,
    preceding_donor_sites=[],
    subsequent_acceptor_sites=[],
    orflist=[],**kwargs):
    """
    Bridge two `neighbouring` Orfs by TWO tinyexon by applying preceding donors and subsequent acceptors

    Tinyexons are collected from all Orfs in `orflist` that are positioned
    between the outermost donor and acceptor sites. All pairs of tinyexons
    that can be connected by an intron of allowed length and compatible
    phase are returned.

    @type  preceding_orf: Orf object
    @param preceding_orf: Orf object that contains preceding_donor_site(s)

    @type  subsequent_orf: Orf object
    @param subsequent_orf: Orf object that contains subsequent_acceptor_site(s)

    @type  preceding_donor_sites: list
    @param preceding_donor_sites: list with SpliceDonorGT and/or SpliceDonor objects

    @type  subsequent_acceptor_sites: list
    @param subsequent_acceptor_sites: list with SpliceAcceptorAG and/or SpliceAcceptor objects

    @type  orflist: list
    @param orflist: list with Orf objects

    @attention: see get_potential_tiny_exons_on_orf for additional **kwargs
    @attention: the list defaults are never modified in this function
    @attention: kwargs keys read here: 'min_tinyexon_intron_nt_length' and
                'max_tinyexon_intron_nt_length'

    @rtype:  list
    @return: list of tuples ( tinyexon1, central_intron, tinyexon2 ), sorted
             on the summed length of both tinyexons; empty list when one of
             the three input lists is empty

    """
    if not preceding_donor_sites:
        return []
    if not subsequent_acceptor_sites:
        return []
    if not orflist:
        return []

    # edit **kwargs dictionary for some forced attributes
    _update_kwargs(kwargs,KWARGS_PROJECTED_TINYEXON)

    tinyexoncollection = []
    tinyexoncombis = []
    min_preceding_donor_sites_pos     = min([ d.pos for d in preceding_donor_sites ])
    max_subsequent_acceptor_sites_pos = max([ a.pos for a in subsequent_acceptor_sites ])

    # positional boundaries for elegiable Orfs; identical for every Orf,
    # so calculated once outside of the loop
    min_intron_nt_length = kwargs['min_tinyexon_intron_nt_length']
    min_pos = min_preceding_donor_sites_pos + min_intron_nt_length
    max_pos = max_subsequent_acceptor_sites_pos - min_intron_nt_length

    for orfX in orflist:
        # skip Orfs that are not positioned in between the splice sites' extremes
        if orfX.endPY   <= min_pos: continue
        if orfX.startPY >= max_pos: continue
        # extend the tinyexoncollection
        tinyexoncollection.extend( get_potential_tiny_exons_on_orf(orfX,**kwargs) )

    # make tinyexoncollection ordered on start pos
    tinyexoncollection = _order_intron_list(tinyexoncollection,order_by='donor_pos')
    # donor_pos makes REVERSE ordering; restore this by reversing
    tinyexoncollection.reverse()

    # make combinations of 2 tinyexons which can co-occur together
    for tinyexon1 in tinyexoncollection:
        # walk the collection back to front; stop as soon as the
        # donor of tinyexon2 is positioned before the donor of tinyexon1
        for tinyexon2 in reversed(tinyexoncollection):
            if tinyexon2.donor.pos < tinyexon1.donor.pos: break
            intron_length = tinyexon2.acceptor.pos - tinyexon1.donor.pos
            if intron_length < min_intron_nt_length: continue
            if intron_length > kwargs['max_tinyexon_intron_nt_length']: continue
            if tinyexon1.donor.phase != tinyexon2.acceptor.phase: continue
            # if here, elegiable combi!
            intron = IntronConnectingOrfs(
                    tinyexon1.donor,tinyexon2.acceptor,
                    get_shared_nucleotides_at_splicesite(
                            subsequent_orf,preceding_orf,
                            tinyexon2.acceptor,tinyexon1.donor
                            ),
                    preceding_orf,subsequent_orf)
            totlen = tinyexon1.length+tinyexon2.length
            tinyexoncombis.append( ( totlen, tinyexon1, intron, tinyexon2 ) )

    # return an ordered list based on length
    # NOTE(review): for equal total lengths the tuple sort falls back on
    # comparing the tinyexon/intron objects themselves
    tinyexoncombis.sort()
    return [ (exon1,intron,exon2) for _totlen,exon1,intron,exon2 in tinyexoncombis ]
Esempio n. 9
0
def merge_pacbporfs_with_query_intron_bridgeing(pacbporfD,pacbporfA,verbose=False,**kwargs):
    """
    Merge query Orfs in PacbPORF by **best** intron

    Introns are predicted on the query Orfs only, with splice sites limited
    to a window near the edges of the aligned regions. The list is filtered
    on alignment entropy (only when both PacbPORFs have an identityscore
    above 0.55), on the PSSM/entropy combination, and finally on branchpoint
    and polypyrimidine tract presence.

    @attention: see orfs.merge_orfs_with_intron for **kwargs
    @attention: every intron gets _distance set to 0 and _gff['fsource']
                set to 'ABGPbridgeing'

    @type  pacbporfD: PacbPORF object
    @param pacbporfD: PacbPORF object that has to deliver PSSM donor objects

    @type  pacbporfA: PacbPORF object
    @param pacbporfA: PacbPORF object that has to deliver PSSM acceptor objects

    @type  verbose: Boolean
    @param verbose: print status/debugging messages to STDOUT

    @rtype:  list
    @return: list with intron objects on the query sequence; introns with a
             branchpoint AND a polypyrimidine tract, or, when no intron has
             both, introns with a branchpoint OR a polypyrimidine tract
    """
    # input validation
    IsPacbPORF(pacbporfD)
    IsPacbPORF(pacbporfA)

    # edit **kwargs dictionary for some forced attributes
    _update_kwargs(kwargs,KWARGS_MAPPED_INTRON)
    # membership test instead of the deprecated dict.has_key()
    if 'aligned_site_max_triplet_distance' not in kwargs:
        kwargs['aligned_site_max_triplet_distance'] = kwargs['max_aa_offset']

    def _print_unique_site_positions(introns):
        """ Print the ordered unique donor & acceptor positions (debugging) """
        donor = olba( list(Set([intron.donor for intron in introns ])), order_by='pos')
        accep = olba( list(Set([intron.acceptor for intron in introns ])), order_by='pos')
        # single, parenthesized expression: same output as the print
        # statement with 4 comma-separated items
        print("dQ1 %s aQ1 %s" % ([d.pos for d in donor],[a.pos for a in accep]))

    # calculate maximal/minimal donor/acceptor site position based on alignment
    # the window is 75 AA wide; multiplied by 3 for nt coordinates
    ELEGIABLE_SPLICE_SITE_AA_RANGE = 75

    qdr = pacbporfD.alignment_dna_range_query()
    qar = pacbporfA.alignment_dna_range_query()
    min_donor_query_pos = max([ min(qdr), max(qdr)-(ELEGIABLE_SPLICE_SITE_AA_RANGE*3) ])
    max_accep_query_pos = min([ max(qar), min(qar)+(ELEGIABLE_SPLICE_SITE_AA_RANGE*3) ])

    # get list of introns
    intronlist = merge_orfs_with_intron(pacbporfD.orfQ,pacbporfA.orfQ,
            min_donor_pos   =min_donor_query_pos,
            max_acceptor_pos=max_accep_query_pos,**kwargs)

    # filter on entropy, but only when both PacbPORFs are similar enough
    if min([pacbporfD.identityscore,pacbporfA.identityscore]) > 0.55:
        # settings for minimal alignment entropy score
        min_donor_site_entropy = 0.01
        min_acceptor_site_entropy = 0.01
        intronlist = _filter_introns_on_entropy(intronlist,pacbporfD,pacbporfA,
                min_donor_site_entropy=min_donor_site_entropy,
                min_acceptor_site_entropy=min_acceptor_site_entropy)
    else:
        # do not filter, but do not forget to store apps data to intron(s)
        for intron in intronlist:
            set_apps_intron_query(intron,pacbporfD,pacbporfA)

    for intron in intronlist:
        intron._distance = 0 # ??
        # set GFF fsource attribute for recognition of intron sources
        intron._gff['fsource'] = 'ABGPbridgeing'

    ############################################################################
    # the unique donor/acceptor lists only serve the debugging output,
    # so they are no longer built when verbose is False
    if verbose: _print_unique_site_positions(intronlist)
    ############################################################################

    intronlist = _filter_introns_on_pssm_entropy_combination(intronlist)

    ############################################################################
    if verbose: _print_unique_site_positions(intronlist)
    ############################################################################

    # keep introns that have both a branchpoint and a polypyrimidine tract
    filtered_intron_list = []
    for intron in intronlist:
        intron.assign_bp_and_ppts()
        if intron.branchpoint and (intron.ppt5p or intron.ppt3p):
            filtered_intron_list.append( intron )

    # check if list is emptied due to branchpoint filtering
    # in that case, filter for either branchpoint OR polyppt
    if not filtered_intron_list and intronlist:
        for intron in intronlist:
            if intron.branchpoint or (intron.ppt5p or intron.ppt3p):
                filtered_intron_list.append( intron )

    # return list of filtered introns
    return filtered_intron_list
Esempio n. 10
0
def _merge_pacbporfs_by_tinyexon_and_two_introns(pacbporfD,
                                                 pacbporfA,
                                                 orfSetObject,
                                                 queryorsbjct,
                                                 verbose=False,
                                                 **kwargs):
    """
    Merge 2 PacbPORF objects by introns

    @attention: see pacb.connecting.merge_orfs_with_intron for **kwargs)

    @type  pacbporfD: PacbPORF object
    @param pacbporfD: PacbPORF object that has to deliver PSSM donor objects

    @type  pacbporfA: PacbPORF object
    @param pacbporfA: PacbPORF object that has to deliver PSSM acceptor objects

    @type  orfSetObject: object with elegiable Orfs
    @param orfSetObject: object with elegiable Orfs

    @type  queryorsbjct: string
    @param queryorsbjct: literal string 'query' or 'sbjct'

    @type  verbose: Boolean
    @param verbose: print debugging info to STDOUT when True

    @rtype:  list
    @return: list with ( intron, ExonOnOrf, intron ) on the query sequence
    """
    # input validation
    IsPacbPORF(pacbporfD)
    IsPacbPORF(pacbporfA)

    # edit **kwargs dictionary for some forced attributes
    _update_kwargs(kwargs, KWARGS_PROJECTED_TINYEXON)

    MAX_TINYEXON_NT_LENGTH = 33
    MIN_TINYEXON_NT_LENGTH = 6

    tinyexons = []
    if queryorsbjct == "query":
        donorOrf = pacbporfD.orfQ
        accepOrf = pacbporfA.orfQ
        prjctOrf = pacbporfD.orfS
        alignedDonorRange = pacbporfD.alignment_dna_range_query()
        alignedAccepRange = pacbporfA.alignment_dna_range_query()
    elif queryorsbjct == "sbjct":
        donorOrf = pacbporfD.orfS
        accepOrf = pacbporfA.orfS
        prjctOrf = pacbporfD.orfQ
        alignedDonorRange = pacbporfD.alignment_dna_range_sbjct()
        alignedAccepRange = pacbporfA.alignment_dna_range_sbjct()
    else:
        message = "'queryorsbjct' (%s), not 'query' or 'sbjct'" % queryorsbjct
        raise InproperlyAppliedArgument, message

    for dObj in donorOrf._donor_sites:
        # do not make a projection OVER the aligned area
        if dObj.pos < min(alignedDonorRange): continue
        if queryorsbjct == "query":
            (dPos, dPhase) = pacbporfD.dnaposition_query(dObj.pos,
                                                         forced_return=True)
        else:
            (dPos, dPhase) = pacbporfD.dnaposition_sbjct(dObj.pos,
                                                         forced_return=True)
        try:
            algDobj = pacbporfD._positions[dPos]
        except IndexError:
            # site out of range of PacbPORF -> break
            break
        for aObj in accepOrf._acceptor_sites:
            # do not make a projection OVER the aligned area
            if aObj.pos > max(alignedAccepRange): continue
            if queryorsbjct == "query":
                (aPos,
                 aPhase) = pacbporfA.dnaposition_query(aObj.pos,
                                                       forced_return=True)
            else:
                (aPos,
                 aPhase) = pacbporfA.dnaposition_sbjct(aObj.pos,
                                                       forced_return=True)
            try:
                algAobj = pacbporfA._positions[aPos]
            except IndexError:
                # site out of range of PacbPORF -> break
                break
            if queryorsbjct == "query":
                posDsbjct = algDobj.sbjct_dna_start + dPhase
                posAsbjct = algAobj.sbjct_dna_start + aPhase
            else:
                posDsbjct = algDobj.query_dna_start + dPhase
                posAsbjct = algAobj.query_dna_start + aPhase
            distance = posAsbjct - posDsbjct
            if distance >= MAX_TINYEXON_NT_LENGTH:
                break
            if distance < MIN_TINYEXON_NT_LENGTH:
                continue

            ####################################################
            # generate a ScanForMatches pattern file
            ####################################################
            # example pattern: 6...6 AG NNGNNANNANNGN[2,0,0] GT 3...3
            query = list(prjctOrf.inputgenomicsequence[posDsbjct:posAsbjct])
            # mask all non-phase0 nucleotides to N residues;
            # this represents the regularexpression for a specific
            # peptide sequence
            firstphasepositions = range(3 - dPhase % 3, len(query), 3)
            for pos in range(0, len(query)):
                if pos not in firstphasepositions:
                    query[pos] = "N"
            # calculate a ~50% mismatch number
            mismatches = max([0, (len(query) - query.count("N")) / 2])
            # write the pattern to string and subsequently to file
            # example pattern: 6...6 AG NNGNNANNANNGN[2,0,0] GT 3...3
            if kwargs['allow_non_canonical_donor']:
                sfmpat = "%s...%s AG %s[%s,0,0] G (T | C) %s...%s" % (
                    AUSO, AUSO, "".join(query), mismatches, DDSO, DDSO)
            else:
                sfmpat = "%s...%s AG %s[%s,0,0] GT %s...%s" % (
                    AUSO, AUSO, "".join(query), mismatches, DDSO, DDSO)

            ####################################################
            if verbose:
                print(pacbporfD.orfQ.id, pacbporfA.orfQ.id),
                print distance, dObj, aObj
                print sfmpat
            ####################################################

            fname = "sfmpat_tinyexon_%s_%s_%s_%s" % (
                donorOrf.id,
                accepOrf.id,
                posDsbjct,
                posAsbjct,
            )
            fh = open(fname, 'w')
            fh.write(sfmpat + "\n")
            fh.close()

            ####################################################
            # run ScanForMatches
            ####################################################
            command = """echo ">myseq\n%s" | %s %s | tr "[,]" "\t\t#" | """ +\
                      """tr -d "\n " | sed "s/>/\\n>/g" | tr "#" "\t" | """ +\
                      """awk -F'\t' '{ if (NF==4 && $2>%s && $3<%s) """ +\
                      """{ print $1"["$2","$3"]\\n"$4 } }' """
            command = command % (donorOrf.inputgenomicsequence, EXECUTABLE_SFM,
                                 fname, dObj.pos +
                                 (kwargs['min_intron_nt_length'] - 3),
                                 aObj.pos -
                                 (kwargs['min_intron_nt_length'] - 3))
            co = osPopen(command)
            matches = parseFasta(co.readlines())
            co.close()

            # filter matches for:
            # (1) correct donor & acceptor phase
            # (2) high enough donor & acceptor site scores
            for hdr, seqmatch in matches.iteritems():
                startQ, stopQ = [
                    int(item) for item in hdr.split(":")[1][1:-1].split(",")
                ]
                exonQstart = startQ + AUSO + 2 - 1
                exonQstop = stopQ - DDSO - 2

                ####################################
                # get Orf object of tinyexon
                ####################################
                tinyexonorf = None
                # select the Orf on which the tinyexon is located
                for orfObj in orfSetObject.get_eligible_orfs(
                        max_orf_start=exonQstart, min_orf_end=exonQstop):
                    orfPhase = (exonQstart - orfObj.startPY) % 3
                    if orfPhase == dPhase:
                        tinyexonorf = orfObj
                        break
                else:
                    # No tinyexonorf assigned!! Iin case a regex matched
                    # over a STOP-codon or the regex length is smaller
                    # then the smallest Orf, no Orf can be assigned
                    continue

                # filter for donor & acceptor score
                dScore = _score_splice_site(seqmatch[-9:], splicetype='donor')
                aScore = _score_splice_site(seqmatch[0:11],
                                            splicetype='acceptor')
                if dScore < kwargs['min_donor_pssm_score']:
                    continue
                if aScore < kwargs['min_acceptor_pssm_score']:
                    continue

                # scan Orf for splicesites
                tinyexonorf.scan_orf_for_pssm_splice_sites(
                    splicetype="donor",
                    min_pssm_score=kwargs['min_donor_pssm_score'],
                    allow_non_canonical=kwargs['allow_non_canonical_donor'],
                    non_canonical_min_pssm_score=kwargs[
                        'non_canonical_min_donor_pssm_score'])
                tinyexonorf.scan_orf_for_pssm_splice_sites(
                    splicetype="acceptor",
                    min_pssm_score=kwargs['min_acceptor_pssm_score'],
                    allow_non_canonical=kwargs['allow_non_canonical_acceptor'],
                    non_canonical_min_pssm_score=kwargs[
                        'non_canonical_min_acceptor_pssm_score'])

                # get 1th intron donor object
                intron1_aObj = None
                for a in tinyexonorf._acceptor_sites:
                    if a.pos == exonQstart:
                        intron1_aObj = a
                        break
                else:
                    # pseudo-acceptorsite as found be SFM regex
                    # is not a valid acceptor site of high enough score
                    # continue to next iteration of (hdr,seqmatch) pair
                    continue

                # get 2th intron donor object
                intron2_dObj = None
                for d in tinyexonorf._donor_sites:
                    if d.pos == exonQstop:
                        intron2_dObj = d
                        break
                else:
                    # pseudo-donorsite as found be SFM regex
                    # is not a valid acceptor site of high enough score
                    # continue to next iteration of (hdr,seqmatch) pair
                    continue

                # check if introns are of elegiable lengths
                if (intron1_aObj.pos -
                        dObj.pos) > kwargs['max_intron_nt_length']:
                    continue
                if (aObj.pos -
                        intron2_dObj.pos) > kwargs['max_intron_nt_length']:
                    continue

                ####################################################
                if True or verbose:
                    # if here, a candidate!!!
                    print(pacbporfD.orfQ.id, tinyexonorf.id,
                          pacbporfA.orfQ.id),
                    print hdr, dScore, aScore
                    print seqmatch
                ####################################################

                # append to found tinyexons
                query_data = (tinyexonorf, exonQstart, exonQstop)
                sbjct_data = (prjctOrf, posDsbjct, posAsbjct)
                splicesite_data = (dObj, intron1_aObj, intron2_dObj, aObj)
                tinyexons.append((query_data, sbjct_data, splicesite_data))

            # file cleanup
            osRemove(fname)

    # return - End Of Function - if no tinyexons are found
    if not tinyexons:
        return []

    ####################################
    # select the **best** tinyexon
    ####################################
    (query_data, sbjct_data, splicesite_data) = tinyexons[0]
    orfQ, query_dna_start, query_dna_end = query_data
    orfS, sbjct_dna_start, sbjct_dna_end = sbjct_data
    (intron1_dObj, intron1_aObj, intron2_dObj, intron2_aObj) = splicesite_data

    ####################################################
    if verbose:
        print "tinyexon orf:", orfQ
        print "tinyexon orf:", intron1_aObj
        print "tinyexon orf:", intron2_dObj
    ####################################################

    ####################################
    # make tinyexon PacbPORF
    ####################################
    startQaa = orfQ.dnapos2aapos(query_dna_start) - 1
    startSaa = orfS.dnapos2aapos(sbjct_dna_start) - 1
    stopQaa = orfQ.dnapos2aapos(query_dna_end) + 1
    stopSaa = orfS.dnapos2aapos(sbjct_dna_end) + 1
    # check for directly leading stop codon on tinyexon
    while startQaa <= orfQ.protein_startPY:
        startQaa += 1
        startSaa += 1
        query_dna_start += 3
        sbjct_dna_start += 3
    while startSaa <= orfS.protein_startPY:
        startQaa += 1
        startSaa += 1
        query_dna_start += 3
        sbjct_dna_start += 3
    # check for directly tailing stop codon on tinyexon
    while stopQaa > orfQ.protein_endPY:
        stopQaa -= 1
        stopSaa -= 1
        query_dna_end -= 3
        sbjct_dna_end -= 3
    while stopSaa > orfS.protein_endPY:
        stopQaa -= 1
        stopSaa -= 1
        query_dna_end -= 3
        sbjct_dna_end -= 3
    # get sequences
    qAAseq = orfQ.getaas(abs_pos_start=startQaa, abs_pos_end=stopQaa)
    sAAseq = orfS.getaas(abs_pos_start=startSaa, abs_pos_end=stopSaa)

    ####################################################
    if verbose or len(qAAseq) != len(sAAseq):
        # if unequal lengths, error will be raised upon PacbP.__init__()
        print orfQ, qAAseq, startQaa, stopQaa, (stopQaa - startQaa),
        print(query_dna_start, query_dna_end)
        print orfS, sAAseq, startSaa, stopSaa, (stopSaa - startSaa),
        print(sbjct_dna_start, sbjct_dna_end)
        print orfQ.inputgenomicsequence[query_dna_start - 2:query_dna_end + 2]
        print orfS.inputgenomicsequence[sbjct_dna_start - 2:sbjct_dna_end + 2]
    ####################################################

    # initialize extended tinyexon PacbPORF
    from pacb import PacbP
    pacbp = PacbP(input=(qAAseq, sAAseq, startQaa, startSaa))
    pacbp.strip_unmatched_ends()
    pacbporf = pacbp2pacbporf(pacbp, orfQ, orfS)
    pacbporf.extend_pacbporf_after_stops()
    pacbporf.source = 'ABGPprojectingTE'

    ####################################
    # make introns
    ####################################
    intron1 = IntronConnectingOrfs(intron1_dObj, intron1_aObj, None, donorOrf,
                                   pacbporf.orfQ)
    intron2 = IntronConnectingOrfs(intron2_dObj, intron2_aObj, None,
                                   pacbporf.orfQ, accepOrf)

    ################################################################
    # set some meta-data properties to the intron objects
    ################################################################
    # add distance score to intron
    intron1._distance = 0
    intron2._distance = 0

    # add Alignment Positional Periphery Score into objects
    if queryorsbjct == "query":
        succes = set_apps_intron_query(intron1, pacbporfD, pacbporf)
        succes = set_apps_intron_query(intron2, pacbporf, pacbporfA)
    else:
        succes = set_apps_intron_sbjct(intron1, pacbporfD, pacbporf)
        succes = set_apps_intron_sbjct(intron2, pacbporf, pacbporfA)

    # set GFF fsource attribute for recognition of intron sources
    intron1._gff['fsource'] = "ABGPprojectingTE"
    intron2._gff['fsource'] = "ABGPprojectingTE"

    # create _linked_to_xxx attributes
    intron1._linked_to_pacbporfs = [pacbporf]
    intron2._linked_to_pacbporfs = [pacbporf]
    intron1._linked_to_introns = [intron2]
    intron2._linked_to_introns = [intron1]

    ####################################################
    if verbose:
        print pacbporf
        pacbporf.print_protein_and_dna()
        print intron1
        print intron2
        if False:
            # printing data when this function needs to be debugged:
            print ""
            print intron1
            print intron2
            print ""
            print pacbporfD
            pacbporfD.print_protein_and_dna()
            print ""
            print pacbporf
            pacbporf.print_protein_and_dna()
            print ""
            print pacbporfA
            pacbporfA.print_protein_and_dna()
            import sys
            sys.exit()
    ####################################################

    # return introns and intermediate tinyexon PacbPORF
    return [(intron1, intron2, pacbporf)]
Esempio n. 11
0
def _merge_pacbporfs_by_intron(pfD,
                               pfA,
                               queryorsbjct,
                               verbose=False,
                               **kwargs):
    """
    Project splicesites from SBJCT intron on continious QUERY PacbPORFs

    @type  pfD: PacbPORF object
    @param pfD: PacbPORF object that has to deliver (aligned) donor sites

    @type  pfA: PacbPORF object
    @param pfA: PacbPORF object that has to deliver (aligned) acceptor sites

    @type  queryorsbjct: string
    @param queryorsbjct: literal string 'query' or 'sbjct'

    @type  verbose: Boolean
    @param verbose: print debugging info to STDOUT when True

    @attention: see pacb.connecting.merge_orfs_with_intron for **kwargs)

    @rtype:  list
    @return: list with ProjectedIntrons (from Sbjct on Query)
    """
    # input validation
    IsPacbPORF(pfD)
    IsPacbPORF(pfA)

    # edit **kwargs dictionary for some forced attributes
    _update_kwargs(kwargs, KWARGS_PROJECTED_INTRON)

    ### if not kwargs.has_key('projected_intron_max_nt_offset'):
    ###    kwargs['projected_intron_max_nt_offset'] = PROJECTED_INTRON_MAX_NT_OFFSET
    ### if not kwargs.has_key('projected_intron_max_aa_offset'):
    ###    kwargs['projected_intron_max_aa_offset'] = PROJECTED_INTRON_MAX_AA_OFFSET

    # settings for minimal alignment entropy score
    min_donor_site_alignment_entropy = 0.0
    min_acceptor_site_alignment_entropy = 0.0

    ELEGIABLE_SPLICE_SITE_AA_RANGE = 75

    sposD = pfD._get_original_alignment_pos_start()
    eposD = pfD._get_original_alignment_pos_end()
    sposA = pfA._get_original_alignment_pos_start()
    eposA = pfA._get_original_alignment_pos_end()
    if queryorsbjct == "query":
        # Orfs of SBJCT must be identical
        IsIdenticalOrfs(pfD.orfS, pfA.orfS)
        donorOrf = pfD.orfQ
        accepOrf = pfA.orfQ
        prjctOrf = pfD.orfS  # pfD.orfS == pfA.orfS
        dStart = sposD.query_dna_start  # ALIGNED start of donorPacbPORF
        dEnd = pfD.query_dna_end  # ABSOLUTE end of donorPacbPORF
        aStart = pfA.query_dna_start  # ABSOLUTE start of acceptorPacbPORF
        aEnd = eposA.query_dna_end  # ALIGNED end of acceptorPacbPORF
        outOfAlignmentAttribute = "sbjct_dna_start"
        # calculate elegiable splice site range
        qdr = pfD.alignment_dna_range_query()
        qar = pfA.alignment_dna_range_query()
        min_donor_pos = max(
            [min(qdr),
             max(qdr) - (ELEGIABLE_SPLICE_SITE_AA_RANGE * 3)])
        max_accep_pos = min(
            [max(qar),
             min(qar) + (ELEGIABLE_SPLICE_SITE_AA_RANGE * 3)])

    elif queryorsbjct == "sbjct":
        # Orfs of QUERY  must be identical
        IsIdenticalOrfs(pfD.orfQ, pfA.orfQ)
        donorOrf = pfD.orfS
        accepOrf = pfA.orfS
        prjctOrf = pfD.orfQ  # pfD.orfQ == pfA.orfQ
        dStart = sposD.sbjct_dna_start  # ALIGNED start of donorPacbPORF
        dEnd = pfD.sbjct_dna_end  # ABSOLUTE end of donorPacbPORF
        aStart = pfA.sbjct_dna_start  # ABSOLUTE start of acceptorPacbPORF
        aEnd = eposA.sbjct_dna_end  # ALIGNED end of acceptorPacbPORF
        outOfAlignmentAttribute = "query_dna_start"
        # calculate elegiable splice site range
        sdr = pfD.alignment_dna_range_sbjct()
        sar = pfA.alignment_dna_range_sbjct()
        min_donor_pos = max(
            [min(sdr),
             max(sdr) - (ELEGIABLE_SPLICE_SITE_AA_RANGE * 3)])
        max_accep_pos = min(
            [max(sar),
             min(sar) + (ELEGIABLE_SPLICE_SITE_AA_RANGE * 3)])

    else:
        message = "'queryorsbjct' (%s), not 'query' or 'sbjct'" % queryorsbjct
        raise InproperlyAppliedArgument, message

    # predict introns only in `queryorsbjct` Orfs
    # introns is a list of IntronConnectingOrfs objects
    introns = merge_orfs_with_intron(donorOrf,
                                     accepOrf,
                                     min_donor_pos=min_donor_pos,
                                     max_acceptor_pos=max_accep_pos,
                                     order_by='length',
                                     **kwargs)

    # return list with projected introns
    projected_introns = []

    # gather unique donor and acceptor positions from list
    # of IntronConnectingOrfs
    for intron in introns:
        # break if intron is to large
        if kwargs['max_intron_nt_length'] and intron.length > kwargs[
                'max_intron_nt_length']:
            break
        # continue if intron is to small
        if kwargs['min_intron_nt_length'] and intron.length < kwargs[
                'min_intron_nt_length']:
            continue
        # continue if intron has non-canonical features

        # check if intron.start is on pfD;
        # inframe-introns can be projected outside of pfD/pfA area
        if intron.start <= dStart: continue
        if intron.start >= dEnd: continue

        # check if intron.end is on pfA;
        # inframe-introns can be projected outside of pfD/pfA area
        if intron.end <= aStart: continue
        if intron.end >= aEnd: continue

        if queryorsbjct == "sbjct":
            # get positions of donor & acceptor in the PacbPORF alignment
            donorPositionPos, phaseD = pfD.dnaposition_sbjct(
                intron.donor.pos, forced_return=True)
            accepPositionPos, phaseA = pfA.dnaposition_sbjct(
                intron.acceptor.pos, forced_return=True)
            # calculate projected distance on QUERY
            posDposQuery = pfD._positions[donorPositionPos].query_pos
            posAposQuery = pfA._positions[accepPositionPos].query_pos
            aaDistance = posAposQuery - posDposQuery
        else:
            # get positions of donor & acceptor in the PacbPORF alignment
            donorPositionPos, phaseD = pfD.dnaposition_query(
                intron.donor.pos, forced_return=True)
            accepPositionPos, phaseA = pfA.dnaposition_query(
                intron.acceptor.pos, forced_return=True)
            # calculate binary entropy from projected position on SBJCT
            posDposSbjct = pfD._positions[donorPositionPos].sbjct_pos
            posAposSbjct = pfA._positions[accepPositionPos].sbjct_pos
            aaDistance = posAposSbjct - posDposSbjct

        # calculate binary entropy score
        entropyDonorSbjct = pfD.alignment_entropy(donorPositionPos,
                                                  method='donor')
        entropyAcceptorSbjct = pfA.alignment_entropy(accepPositionPos,
                                                     method='acceptor')

        # do distance check upon (projected) intron acceptance
        if abs(aaDistance) <= kwargs['max_aa_offset']:

            # check if we've runned out of the aligned part
            outofalignedpacbporf = False

            # get the projected donor position; mind the gap on this spot ;-)
            while pfD._positions[
                    donorPositionPos].isa_gap and donorPositionPos > 0:
                donorPositionPos -= 1
            else:
                projected_donor_position = getattr(
                    pfD._positions[donorPositionPos],
                    outOfAlignmentAttribute) + phaseD
                if donorPositionPos == 0 and pfD._positions[
                        donorPositionPos].isa_gap:
                    print "WarningThatIsTackled::outofalignedpacbporf::donor"
                    outofalignedpacbporf = True

            # get the projected acceptor position; mind the gap on this spot ;-)
            while pfA._positions[accepPositionPos].isa_gap and len(
                    pfA._positions) > accepPositionPos + 1:
                accepPositionPos += 1
            else:
                projected_accep_position = getattr(
                    pfA._positions[accepPositionPos],
                    outOfAlignmentAttribute) + phaseA
                if accepPositionPos == len(
                        pfA._positions
                ) - 1 and pfA._positions[accepPositionPos].isa_gap:
                    print "WarningThatIsTackled::outofalignedpacbporf::acceptor"
                    outofalignedpacbporf = True

            if not outofalignedpacbporf:
                ################################################################
                # set some meta-data properties to the intron object
                ################################################################
                # add distance score to intron
                intron._distance = abs(aaDistance) * 3

                # add Alignment Positional Periphery Score into objects
                if queryorsbjct == "query":
                    succes = set_apps_intron_query(intron, pfD, pfA)
                else:
                    succes = set_apps_intron_sbjct(intron, pfD, pfA)

                # set GFF fsource attribute for recognition of intron sources
                intron._gff['fsource'] = "ABGPprojecting"

                # make a ProjectedIntronConnectingOrfs object
                pico = ProjectedIntronConnectingOrfs(prjctOrf,
                                                     projected_donor_position,
                                                     projected_accep_position)
                intron.binary_entropy_donor = entropyDonorSbjct
                intron.binary_entropy_acceptor = entropyAcceptorSbjct
                pico.add_projected_intron(intron)
                pico.phase = intron.phase
                projected_introns.append(pico)

                ################################################################
                if verbose:
                    print "PROJ::", intron._distance,
                    print(pfD.orfQ.id, pfA.orfQ.id),
                    print(pfD.orfS.id, pfA.orfS.id),
                    print "%s-%snt" % (intron.donor.pos, intron.acceptor.pos),
                    print "%2.1f,%2.1f" % (intron.donor.pssm_score,
                                           intron.acceptor.pssm_score),
                    print "%2.1f,%2.1f" % (intron.binary_entropy_donor,
                                           intron.binary_entropy_acceptor)
                ################################################################

        if aaDistance > kwargs['max_aa_offset']:
            # break out; ordered by length can never result in
            # a proper projected intron
            break

    # filter out less relevant ones compared to complete set of results
    projected_introns = _filter_projected_introns(projected_introns)

    # and return a list of ProjectedIntronConnectingOrfs
    return projected_introns
Esempio n. 12
0
def merge_pacbporfs_with_closeby_independant_introns(pacbporfD,
                                                     pacbporfA,
                                                     verbose=False,
                                                     **kwargs):
    """
    Merge 2 PacbPORF objects by closeby independant gained introns

    @attention: see pacb.connecting.merge_orfs_with_intron for **kwargs)

    @type  pacbporfD: PacbPORF object
    @param pacbporfD: PacbPORF object that has to deliver PSSM donor objects

    @type  pacbporfA: PacbPORF object
    @param pacbporfA: PacbPORF object that has to deliver PSSM acceptor objects

    @type  verbose: Boolean
    @param verbose: print status/debugging messages to STDOUT

    @rtype:  list
    @return: list with ( intronQ, intronS, CIGexonPacbPORF )
    """
    # input validation
    IsPacbPORF(pacbporfD)
    IsPacbPORF(pacbporfA)

    # edit **kwargs dictionary for some forced attributes
    kwargs['allow_phase_shift'] = True
    _update_kwargs(kwargs, KWARGS_CLOSEBY_INDEPENDANT_INTRON_GAIN)
    if not kwargs.has_key('aligned_site_max_triplet_distance'):
        kwargs['aligned_site_max_triplet_distance'] = kwargs[
            'cig_max_aa_length']

    # run regular merge_pacbporfs_with_introns function
    alg_introns = merge_pacbporfs_with_introns(pacbporfD,
                                               pacbporfA,
                                               verbose=verbose,
                                               **kwargs)
    cig_introns = []

    if verbose:
        print "introns::", len(alg_introns), "cig_max_aa_length:", kwargs[
            'cig_max_aa_length'], kwargs['aligned_site_max_triplet_distance']

    # check if there is length congruence between the cig_introns
    for intQ, intS in alg_introns:
        dQpos, dQphase = pacbporfD.dnaposition_query(intQ.donor.pos,
                                                     forced_return=True)
        dSpos, dSphase = pacbporfD.dnaposition_sbjct(intS.donor.pos,
                                                     forced_return=True)
        aQpos, aQphase = pacbporfA.dnaposition_query(intQ.acceptor.pos,
                                                     forced_return=True)
        aSpos, aSphase = pacbporfA.dnaposition_sbjct(intS.acceptor.pos,
                                                     forced_return=True)
        distDnt = (dQpos * 3 + dQphase) - (dSpos * 3 + dSphase)
        distAnt = (aQpos * 3 + aQphase) - (aSpos * 3 + aSphase)
        ########################################################################
        if verbose:
            print(intQ.donor.pos, intQ.acceptor.pos),
            print(intS.donor.pos, intS.acceptor.pos),
            print distDnt, distAnt, kwargs['max_nt_offset']
        ########################################################################
        if abs(distDnt - distAnt) > kwargs['max_nt_offset']:
            # intermediate ciigPacbPORF has query vs sbjct length discrepancy
            # *3 for AA2nt coordinate conversion, +2 to allow different phases
            # e.g. phase difference can give 1AA+2nt difference
            continue
        if intQ.donor.phase == intS.donor.phase and\
        (distDnt/3) <= kwargs['aligned_site_max_triplet_distance']:
            # a regularly merged intron combination
            continue
        if intQ.acceptor.phase == intS.acceptor.phase and\
        (distAnt/3) <= kwargs['aligned_site_max_triplet_distance']:
            # a regularly merged intron combination
            continue
        if abs(distDnt) <= 5 or abs(distDnt) <= 5:
            # most likely a splice site phase shift, not a c.i.g.
            continue

        if abs(distDnt/3) >= kwargs['cig_min_aa_length'] and\
        abs(distAnt/3) >= kwargs['cig_min_aa_length'] and\
        abs(distDnt/3) <= kwargs['cig_max_aa_length'] and\
        abs(distAnt/3) <= kwargs['cig_max_aa_length']:
            # putatively a closeby independant (intron) gain
            cig_introns.append((intQ, intS))

    ############################################################################
    if verbose:
        for intQ, intS in cig_introns:
            print "cig?:", (intQ.donor.pos, intQ.acceptor.pos),
            print(intS.donor.pos, intS.acceptor.pos)
    ############################################################################

    # return variable to store found positive cases of CIG into
    found_cig_list = []

    # check if there is some sequence similarity
    for intQ, intS in cig_introns:
        # get alignment positions around query & sbjcts splice sites
        dQpos, dQphase = pacbporfD.dnaposition_query(intQ.donor.pos,
                                                     forced_return=True)
        dSpos, dSphase = pacbporfD.dnaposition_sbjct(intS.donor.pos,
                                                     forced_return=True)
        aQpos, aQphase = pacbporfA.dnaposition_query(intQ.acceptor.pos,
                                                     forced_return=True)
        aSpos, aSphase = pacbporfA.dnaposition_sbjct(intS.acceptor.pos,
                                                     forced_return=True)
        distD = dQpos - dSpos
        distA = aQpos - aSpos
        distDnt = (dQpos * 3 + dQphase) - (dSpos * 3 + dSphase)
        distAnt = (aQpos * 3 + aQphase) - (aSpos * 3 + aSphase)

        if distDnt > 0:  # then, distAnt is as well > 0
            # QUERY is extended on the donor side
            #mode   = "SQ"
            #qStart = pacbporfD._positions[dSpos].query_pos
            #qEnd   = qStart + distD
            #sStart = pacbporfA._positions[aSpos].sbjct_pos
            #sEnd   = sStart + distD
            #qSeq = pacbporfD.orfQ.getaas(abs_pos_start=qStart,abs_pos_end=qEnd)
            #sSeq = pacbporfA.orfS.getaas(abs_pos_start=sStart,abs_pos_end=sEnd)
            mode = "SQ"
            qEnd = pacbporfD.orfQ.dnapos2aapos(intQ.donor.pos)
            qStart = qEnd - max([distA, distD])
            sStart = pacbporfA.orfS.dnapos2aapos(intS.acceptor.pos)
            sEnd = sStart + max([distA, distD])
            qSeq = pacbporfD.orfQ.getaas(abs_pos_start=qStart,
                                         abs_pos_end=qEnd)
            sSeq = pacbporfA.orfS.getaas(abs_pos_start=sStart,
                                         abs_pos_end=sEnd)

        else:  # distDnt and distAnt are < 0
            ## SBJCT is extended on the donor site
            #mode   = "QS"
            #qStart = pacbporfA._positions[aQpos].query_pos
            #qEnd   = qStart - distA
            #sStart = pacbporfD._positions[dQpos].sbjct_pos
            #sEnd   = sStart - distA
            #qSeq = pacbporfA.orfQ.getaas(abs_pos_start=qStart, abs_pos_end=qEnd)
            #sSeq = pacbporfD.orfS.getaas(abs_pos_start=sStart, abs_pos_end=sEnd)
            mode = "QS"
            qStart = pacbporfA.orfQ.dnapos2aapos(intQ.acceptor.pos)
            qEnd = qStart - min([distA, distD])
            sEnd = pacbporfD.orfS.dnapos2aapos(intS.donor.pos)
            sStart = sEnd + min([distA, distD])
            qSeq = pacbporfA.orfQ.getaas(abs_pos_start=qStart,
                                         abs_pos_end=qEnd)
            sSeq = pacbporfD.orfS.getaas(abs_pos_start=sStart,
                                         abs_pos_end=sEnd)

        headerQ = "query_%s_%s_%s" % (qStart, qEnd, qSeq)
        headerS = "sbjct_%s_%s_%s" % (sStart, sEnd, sSeq)
        headerQ = headerQ[0:20]  # truncate to prevent error
        headerS = headerS[0:20]  # truncate to prevent error
        if verbose:
            print mode, (
                distD, distA), qSeq, sSeq, headerQ, headerS, distDnt, distAnt,
            print dQpos, aQpos, dSpos, aSpos
        if not qSeq: continue  # superfluous check-doublecheck for sequence
        if not sSeq: continue  # superfluous check-doublecheck for sequence

        ####################################################
        # make PacbPORF with ClustalW
        ####################################################
        # align the sequences with clustalw
        seqs = {headerQ: qSeq, headerS: sSeq}
        (alignedseqs, alignment) = clustalw(seqs=seqs)

        # make pacbp from clustalw alignment
        pacbp = pacbp_from_clustalw(alignment=(alignedseqs[headerQ], alignment,
                                               alignedseqs[headerS]),
                                    coords=(qStart, qEnd, sStart, sEnd))

        if not pacbp: continue

        # strip unaligned fraction of this pacbp object, then check length
        pacbp.strip_unmatched_ends()

        if len(pacbp) < kwargs['cig_min_aa_length']:
            continue
        if len(pacbp) > kwargs['cig_max_aa_length']:
            continue

        if pacbp:
            # initialize extended tiny PacbPORF caused by c.i.g.
            if distDnt > 0:
                cig_pacbporf = pacbp2pacbporf(pacbp, pacbporfD.orfQ,
                                              pacbporfA.orfS)
            else:
                cig_pacbporf = pacbp2pacbporf(pacbp, pacbporfA.orfQ,
                                              pacbporfD.orfS)
            cig_pacbporf.extend_pacbporf_after_stops()
            ####################################################################
            if verbose:
                print pacbp, len(pacbp)
                print cig_pacbporf
                print "CIG:", intQ
                print "CIG:", intS
                print distD, distA, distDnt, distAnt
                cig_pacbporf.print_protein_and_dna()
            ####################################################################

            ####################################################################
            # set some meta-data properties to the intron objects
            ####################################################################

            # add distance score to introns
            # The distance set in merge_pacbporfs_with_introns is large;
            # it is the actual distance between the splice sites. In CIG,
            # the measure for distance is the length difference between
            # the offset between query and sbjct measured on the cig_pacbporf
            intQ._distance = abs(distDnt - distAnt)
            intS._distance = abs(distDnt - distAnt)

            if distDnt > 0:  # then, distAnt is as well > 0
                # QUERY is extended on the donor side
                # add Alignment Positional Periphery Score into objects
                succes = set_apps_intron_query(intQ, cig_pacbporf, pacbporfA)
                succes = set_apps_intron_sbjct(intS, pacbporfD, cig_pacbporf)
            else:
                # SBJCT is extended on the donor side
                # add Alignment Positional Periphery Score into objects
                succes = set_apps_intron_query(intQ, pacbporfD, cig_pacbporf)
                succes = set_apps_intron_sbjct(intS, cig_pacbporf, pacbporfA)

            # set GFF fsource attribute for recognition of intron sources
            intQ._gff['fsource'] = "ABGPcig"
            intS._gff['fsource'] = "ABGPcig"

            # create _linked_to_xxx attributes
            intQ._linked_to_pacbporfs = [cig_pacbporf]
            intS._linked_to_pacbporfs = [cig_pacbporf]

            # append to found_cig_list
            found_cig_list.append((intQ, intS, cig_pacbporf))

        else:
            # no alignment possible -> try next
            continue

    # return lists of closeby_independant_introns
    return found_cig_list
Esempio n. 13
0
def merge_pacbporfs_by_tinyexons(pacbporfD,
                                 pacbporfA,
                                 orfSetObjQ,
                                 orfSetObjS,
                                 verbose=False,
                                 **kwargs):
    """ """
    # input validation
    IsPacbPORF(pacbporfD)
    IsPacbPORF(pacbporfA)

    # edit **kwargs dictionary for some forced attributes
    _update_kwargs(kwargs, KWARGS_MAPPED_INTRON)
    if not kwargs.has_key('aligned_site_max_triplet_distance'):
        kwargs['aligned_site_max_triplet_distance'] = kwargs['max_aa_offset']

    # settings for minimal alignment entropy score
    min_donor_site_alignment_entropy = 0.0
    min_acceptor_site_alignment_entropy = 0.0

    resultlistQ = merge_orfs_with_tinyexon(
        pacbporfD.orfQ,
        pacbporfA.orfQ,
        preceding_donor_sites=pacbporfD.orfQ._donor_sites,
        subsequent_acceptor_sites=pacbporfA.orfQ._acceptor_sites,
        orflist=orfSetObjQ.orfs,
        **kwargs)
    resultlistS = merge_orfs_with_tinyexon(
        pacbporfD.orfS,
        pacbporfA.orfS,
        preceding_donor_sites=pacbporfD.orfS._donor_sites,
        subsequent_acceptor_sites=pacbporfA.orfS._acceptor_sites,
        orflist=orfSetObjS.orfs,
        **kwargs)

    # translate resultlists to dict: key == exon, value = [ {intronsD},{intronsS} ]
    resultdictQ, key2exonQ = _tinyexon_list_2_dict(resultlistQ)
    resultdictS, key2exonS = _tinyexon_list_2_dict(resultlistS)

    # get unique list of donors & acceptors
    donorQ = olba(list(Set([inD.donor for inD, te, inA in resultlistQ])),
                  order_by='pos')
    donorS = olba(list(Set([inD.donor for inD, te, inA in resultlistS])),
                  order_by='pos')
    accepQ = olba(list(Set([inA.acceptor for inD, te, inA in resultlistQ])),
                  order_by='pos')
    accepS = olba(list(Set([inA.acceptor for inD, te, inA in resultlistS])),
                  order_by='pos')

    ## filter for alignable donor & acceptor sites
    kwargs['allow_non_canonical'] = True  # True
    kwargs['aligned_site_max_triplet_distance'] = 0  # 2
    algdonors = _filter_for_alignable_splice_sites(donorQ, donorS, pacbporfD,
                                                   **kwargs)
    algacceps = _filter_for_alignable_splice_sites(accepQ, accepS, pacbporfA,
                                                   **kwargs)

    # settings for minimal alignment entropy score
    # TODO TODO -> THIS MUST BE FIXED TO A NICE THRESHOLD VALUE!!!
    min_donor_site_alignment_entropy = 0.1
    min_acceptor_site_alignment_entropy = 0.1

    # remove sites with to low alignment entropy
    algdonors = _filter_for_entropy(
        algdonors,
        pacbporfD,
        'donor',
        min_alignment_entropy=min_donor_site_alignment_entropy)
    algacceps = _filter_for_entropy(
        algacceps,
        pacbporfA,
        'acceptor',
        min_alignment_entropy=min_acceptor_site_alignment_entropy)

    # return list: intronQD,intronSD,tinyexon,intronAQ,intronAS
    return_list = []

    ############################################################################
    if verbose:
        print "bridges constructed: ORFS:",
        print(pacbporfD.orfQ.id, pacbporfA.orfQ.id),
        print(pacbporfD.orfS.id, pacbporfA.orfS.id),
        print len(resultdictQ), len(resultdictS),
        print(len(resultlistQ), len(donorQ), len(accepQ)),
        print(len(resultlistS), len(donorS), len(accepS)),
        print(len(algdonors), len(algacceps))
    ############################################################################

    for keyQ, tinyexonQ in key2exonQ.iteritems():
        for keyS, tinyexonS in key2exonS.iteritems():
            if tinyexonQ.donor.phase != tinyexonS.donor.phase:
                continue
            if tinyexonQ.acceptor.phase != tinyexonS.acceptor.phase:
                continue
            if tinyexonQ.length != tinyexonS.length:
                continue
            # if here, then tinyexons of identical structure

            ####################################################################
            if verbose:
                print tinyexonQ.length, tinyexonQ.donor.phase,
                print(len(resultdictQ[keyQ][0]), len(resultdictQ[keyQ][1])),
                print(len(resultdictS[keyS][0]), len(resultdictS[keyS][1])),
                print tinyexonQ,
                print tinyexonQ.proteinsequence(), tinyexonS.proteinsequence(),
                print tinyexonS.acceptor.pssm_score + tinyexonS.donor.pssm_score
            ####################################################################

            donor_introns = []
            acceptor_introns = []
            for intronDQkey, intronDQ in resultdictQ[keyQ][0].iteritems():
                if intronDQ.donor.pos not in [dQ.pos for dQ, dS in algdonors]:
                    continue
                for intronDSkey, intronDS in resultdictS[keyS][0].iteritems():
                    if intronDS.donor.pos not in [
                            dS.pos for dQ, dS in algdonors
                    ]:
                        continue
                    # check if they exists as aligned sites
                    alignedkey = (intronDQ.donor.pos, intronDS.donor.pos)
                    if alignedkey not in [(dQ.pos, dS.pos)
                                          for dQ, dS in algdonors]:
                        continue
                    # if here, we have a set of introns 5' of the tinyexon
                    # which are perfectly alignable!
                    donor_introns.append((intronDQ, intronDS))

            for intronAQkey, intronAQ in resultdictQ[keyQ][1].iteritems():
                if intronAQ.acceptor.pos not in [
                        aQ.pos for aQ, aS in algacceps
                ]:
                    continue
                for intronASkey, intronAS in resultdictS[keyS][1].iteritems():
                    if intronAS.acceptor.pos not in [
                            aS.pos for aQ, aS in algacceps
                    ]:
                        continue
                    # check if they exists as aligned sites
                    alignedkey = (intronAQ.acceptor.pos, intronAS.acceptor.pos)
                    if alignedkey not in [(aQ.pos, aS.pos)
                                          for aQ, aS in algacceps]:
                        continue
                    # if here, we have a set of introns 3' of the tinyexon
                    # which are perfectly alignable!
                    acceptor_introns.append((intronAQ, intronAS))

            if not len(donor_introns) or not len(acceptor_introns):
                # no aligned 5' && aligned 3' introns
                continue

            # initialize extended tinyexon PacbPORF
            from pacb import PacbP
            pacbp = PacbP(input=(
                tinyexonQ.proteinsequence(),
                tinyexonS.proteinsequence(),
                tinyexonQ.protein_start(),
                tinyexonS.protein_start(),
            ))
            pacbp.strip_unmatched_ends()
            # continue if no fraction could be aligned
            if len(pacbp) == 0: continue
            tinypacbporf = pacbp2pacbporf(pacbp, tinyexonQ.orf, tinyexonS.orf)
            tinypacbporf.extend_pacbporf_after_stops()

            ####################################################################
            if verbose:
                print tinypacbporf
                tinypacbporf.print_protein_and_dna()
                print len(donor_introns), len(acceptor_introns),
                print max([
                    dQ.donor.pssm_score + dS.donor.pssm_score
                    for dQ, dS in donor_introns
                ]),
                print max([
                    aQ.acceptor.pssm_score + aS.acceptor.pssm_score
                    for aQ, aS in acceptor_introns
                ])
            ####################################################################

            # if here, we have accepted tinyexon bridges!
            # gather them and store to return_list
            for intronDQkey, intronDQ in resultdictQ[keyQ][0].iteritems():
                if intronDQ.donor.pos not in [dQ.pos for dQ, dS in algdonors]:
                    continue
                for intronDSkey, intronDS in resultdictS[keyS][0].iteritems():
                    if intronDS.donor.pos not in [
                            dS.pos for dQ, dS in algdonors
                    ]:
                        continue
                    for intronAQkey, intronAQ in resultdictQ[keyQ][
                            1].iteritems():
                        if intronAQ.acceptor.pos not in [
                                aQ.pos for aQ, aS in algacceps
                        ]:
                            continue
                        for intronASkey, intronAS in resultdictS[keyS][
                                1].iteritems():
                            if intronAS.acceptor.pos not in [
                                    aS.pos for aQ, aS in algacceps
                            ]:
                                continue
                            ####################################################
                            # set some meta-data properties to the intron objects
                            ####################################################
                            _score_introns_obtained_by_mapping(
                                intronDQ,
                                intronDS,
                                pacbporfD,
                                tinypacbporf,
                                source='ABGPmappingTE')
                            _score_introns_obtained_by_mapping(
                                intronAQ,
                                intronAS,
                                tinypacbporf,
                                pacbporfA,
                                source='ABGPmappingTE')
                            # create _linked_to_xxx attributes
                            intronDQ._linked_to_pacbporfs = [tinypacbporf]
                            intronAQ._linked_to_pacbporfs = [tinypacbporf]
                            intronDS._linked_to_pacbporfs = [tinypacbporf]
                            intronAS._linked_to_pacbporfs = [tinypacbporf]
                            intronDQ._linked_to_introns = [intronAQ]
                            intronAQ._linked_to_introns = [intronDQ]
                            intronDS._linked_to_introns = [intronAS]
                            intronAS._linked_to_introns = [intronDS]
                            # append to tmp result list
                            return_list.append(
                                (intronDQ, intronDS, tinypacbporf, intronAQ,
                                 intronAS))

    # check if there are >1 candidate tiny exons
    # currently, we choose only to return the **best** mapped tinyexon
    if len(return_list) == 0:
        pass
    elif len(return_list) == 1:
        pass
    else:
        # only take the highest scoring candidate here
        min_distance = min([(a._distance + d._distance)
                            for a, b, c, d, e in return_list])
        pos2score = []
        for (intronDQ, intronDS, tinypacbporf, intronAQ,
             intronAS) in return_list:
            if (intronDQ._distance + intronAQ._distance) > min_distance:
                pos2score.append(0.0)
            else:
                # calculate overall pssm score
                total_pssm = 0.0
                total_pssm += intronDQ.donor.pssm_score
                total_pssm += intronDQ.acceptor.pssm_score
                total_pssm += intronDS.donor.pssm_score
                total_pssm += intronDS.acceptor.pssm_score
                total_pssm += intronAQ.donor.pssm_score
                total_pssm += intronAQ.acceptor.pssm_score
                total_pssm += intronAS.donor.pssm_score
                total_pssm += intronAS.acceptor.pssm_score
                pos2score.append(total_pssm)
        # get highest score and linked tinyexon
        max_score = max(pos2score)
        return_list = [return_list[pos2score.index(max_score)]]

    ############################################################################
    # some printing in verbose mode
    if verbose and return_list:
        (intronDQ, intronDS, tinypacbporf, intronAQ, intronAS) = return_list[0]
        print "BEST MAPPED TINYEXON:"
        print tinypacbporf
        print tinypacbporf.query, intronDQ._distance, intronAQ._distance,
        print(intronDQ.donor.pos, intronDQ.acceptor.pos),
        print(intronDS.donor.pos, intronDS.acceptor.pos),
        print(intronAQ.donor.pos, intronAQ.acceptor.pos),
        print(intronAS.donor.pos, intronAS.acceptor.pos)
    ############################################################################

    # return the result list
    return return_list
Esempio n. 14
0
def merge_pacbporfs_with_introns(pacbporfD,
                                 pacbporfA,
                                 verbose=False,
                                 **kwargs):
    """
    Merge 2 PacbPORF objects by introns

    @attention: see orfs.merge_orfs_with_intron for **kwargs
    @attention: see functions._filter_for_alignable_splice_sites for **kwargs
    @attention: see functions._filter_for_entropy for **kwargs

    @type  pacbporfD: PacbPORF object
    @param pacbporfD: PacbPORF object that has to deliver PSSM donor objects

    @type  pacbporfA: PacbPORF object
    @param pacbporfA: PacbPORF object that has to deliver PSSM acceptor objects

    @type  verbose: Boolean
    @param verbose: print status/debugging messages to STDOUT

    @rtype:  list
    @return: list with ( intron, intron ), in query and sbjct
    """
    # input validation
    IsPacbPORF(pacbporfD)
    IsPacbPORF(pacbporfA)

    # edit **kwargs dictionary for some forced attributes
    _update_kwargs(kwargs, KWARGS_MAPPED_INTRON)
    if not kwargs.has_key('aligned_site_max_triplet_distance'):
        kwargs['aligned_site_max_triplet_distance'] = kwargs['max_aa_offset']

    # settings for minimal alignment entropy score
    min_donor_site_alignment_entropy = 0.0
    min_acceptor_site_alignment_entropy = 0.0

    # calculate maximal/minimal donor/acceptor site position based on alignment
    ELEGIABLE_SPLICE_SITE_AA_RANGE = 75

    qdr = pacbporfD.alignment_dna_range_query()
    qar = pacbporfA.alignment_dna_range_query()
    min_donor_query_pos = max(
        [min(qdr), max(qdr) - (ELEGIABLE_SPLICE_SITE_AA_RANGE * 3)])
    max_accep_query_pos = min(
        [max(qar), min(qar) + (ELEGIABLE_SPLICE_SITE_AA_RANGE * 3)])

    sdr = pacbporfD.alignment_dna_range_sbjct()
    sar = pacbporfA.alignment_dna_range_sbjct()
    min_donor_sbjct_pos = max(
        [min(sdr), max(sdr) - (ELEGIABLE_SPLICE_SITE_AA_RANGE * 3)])
    max_accep_sbjct_pos = min(
        [max(sar), min(sar) + (ELEGIABLE_SPLICE_SITE_AA_RANGE * 3)])

    # get list of introns
    #intronsQ = merge_orfs_with_intron(pacbporfD.orfQ,pacbporfA.orfQ,
    #        min_donor_pos   =min_donor_query_pos,
    #        max_acceptor_pos=max_accep_query_pos,**kwargs)
    #intronsS = merge_orfs_with_intron(pacbporfD.orfS,pacbporfA.orfS,
    #        min_donor_pos   =min_donor_sbjct_pos,
    #        max_acceptor_pos=max_accep_sbjct_pos,**kwargs)

    # get list of introns
    intronsQ = merge_orfs_with_intron(pacbporfD.orfQ, pacbporfA.orfQ, **kwargs)
    intronsS = merge_orfs_with_intron(pacbporfD.orfS, pacbporfA.orfS, **kwargs)

    # get unique list of donors & acceptors
    donorQ = olba(list(Set([inQ.donor for inQ in intronsQ])), order_by='pos')
    donorS = olba(list(Set([inS.donor for inS in intronsS])), order_by='pos')
    accepQ = olba(list(Set([inQ.acceptor for inQ in intronsQ])),
                  order_by='pos')
    accepS = olba(list(Set([inS.acceptor for inS in intronsS])),
                  order_by='pos')

    ############################################################################
    if verbose:
        print "dQ1", [d.pos for d in donorQ], "aQ1", [a.pos for a in accepQ]
        print "dS1", [d.pos for d in donorS], "aS1", [a.pos for a in accepS]
    ############################################################################

    # filter for alignable donor & acceptor sites
    kwargs['allow_non_canonical'] = kwargs['allow_non_canonical_donor']
    algdonors = _filter_for_alignable_splice_sites(donorQ, donorS, pacbporfD,
                                                   **kwargs)
    kwargs['allow_non_canonical'] = kwargs['allow_non_canonical_acceptor']
    algacceps = _filter_for_alignable_splice_sites(accepQ, accepS, pacbporfA,
                                                   **kwargs)

    ############################################################################
    if verbose:
        print "dQ2", [_dq.pos for (_dq, _ds) in algdonors],
        print "aQ2", [_aq.pos for (_aq, _as) in algacceps]
        print "dS2", [_ds.pos for (_dq, _ds) in algdonors],
        print "aS2", [_as.pos for (_aq, _as) in algacceps]
    ############################################################################

    # remove sites with to low alignment entropy
    algdonors = _filter_for_entropy(
        algdonors,
        pacbporfD,
        'donor',
        min_alignment_entropy=min_donor_site_alignment_entropy)
    algacceps = _filter_for_entropy(
        algacceps,
        pacbporfA,
        'acceptor',
        min_alignment_entropy=min_acceptor_site_alignment_entropy)

    ############################################################################
    if verbose:
        print "dQ3", [_dq.pos for (_dq, _ds) in algdonors],
        print "aQ3", [_aq.pos for (_aq, _as) in algacceps]
        print "dS3", [_ds.pos for (_dq, _ds) in algdonors],
        print "aS3", [_as.pos for (_aq, _as) in algacceps]
    ############################################################################

    # make unique position lists for quick lookup in intron lists
    dQpl = Set([dQ.pos for dQ, dS in algdonors])
    dSpl = Set([dS.pos for dQ, dS in algdonors])
    aQpl = Set([aQ.pos for aQ, aS in algacceps])
    aSpl = Set([aS.pos for aQ, aS in algacceps])

    # check exterior boundaries of PacbPORFs
    sposD = pacbporfD._get_original_alignment_pos_start()
    eposD = pacbporfD._get_original_alignment_pos_end()
    sposA = pacbporfA._get_original_alignment_pos_start()
    eposA = pacbporfA._get_original_alignment_pos_end()

    # now make list of aligable introns
    algintrons = []
    for intQ in intronsQ:
        # check if intron falls within the PacbPORF aligned area
        if intQ.donor.pos <= sposD.query_dna_start: continue
        if intQ.acceptor.pos >= eposA.query_dna_end: continue
        if intQ.donor.pos in dQpl and intQ.acceptor.pos in aQpl:
            # Query intron occurs in list of alignable splice sites!
            for intS in intronsS:
                # check if intron falls within the PacbPORF aligned area
                if intS.donor.pos <= sposD.sbjct_dna_start: continue
                if intS.acceptor.pos >= eposA.sbjct_dna_end: continue
                if intS.donor.pos in dSpl and intS.acceptor.pos in aSpl:
                    # Sbjct intron occurs as well in alignable splice sites!
                    if (intQ.donor,intS.donor) in algdonors and\
                    (intQ.acceptor,intS.acceptor) in algacceps:
                        # Sbjct & Query Donor & Acceptor are alignable!
                        algintrons.append((intQ, intS))

    ############################################################################
    # set some meta-data properties to the intron objects
    ############################################################################
    for intQ, intS in algintrons:
        distDnt = pacbporfD.get_distance_aligned_nucleotide_positions(
            query=intQ.donor.pos, sbjct=intS.donor.pos)
        distAnt = pacbporfA.get_distance_aligned_nucleotide_positions(
            query=intQ.acceptor.pos, sbjct=intS.acceptor.pos)

        # final distance check. kwargs['aligned_site_max_triplet_distance']
        # is applied on donor and acceptor site. This distance measured on the
        # protein sequence can be DOUBLED in case distDnt / distAnt are
        # opposite (+ and -). Check here if the protein sequence gap is
        # as well <= kwargs['aligned_site_max_triplet_distance'].
        if abs(distAnt -
               distDnt) > kwargs['aligned_site_max_triplet_distance'] * 3:
            continue

        # add distance score to introns
        intQ._distance = abs(distDnt) + abs(distAnt)
        intS._distance = abs(distDnt) + abs(distAnt)

        # add Alignment Positional Periphery Score into objects
        succes = set_apps_intron_query(intQ, pacbporfD, pacbporfA)
        succes = set_apps_intron_sbjct(intS, pacbporfD, pacbporfA)

        # set GFF fsource attribute for recognition of intron sources
        intQ._gff['fsource'] = "ABGPmapping"
        intS._gff['fsource'] = "ABGPmapping"

        ########################################################################
        if verbose:
            # some printing....
            print "Aligned introns:", (intQ.donor.pos, intQ.acceptor.pos),
            print(intS.donor.pos, intS.acceptor.pos),
            print "DIST:", distDnt, distAnt,
            print "[%s]" % kwargs['aligned_site_max_triplet_distance'],
            print "ENTROPY: %1.2f %1.2f" % (intQ._apps_donor,
                                            intQ._apps_accep),
            print "PSSM: (%1.2f %1.2f) (%1.2f %1.2f)" % (
                intQ.donor.pssm_score,
                intS.donor.pssm_score,
                intQ.acceptor.pssm_score,
                intS.acceptor.pssm_score,
            )
        ########################################################################

    # return lists of aligned introns
    return algintrons
Esempio n. 15
0
def _intron_pos_key(intron):
    """Return the (donor.pos, acceptor.pos) tuple that identifies an intron."""
    return (intron.donor.pos, intron.acceptor.pos)


def _optional_intron_pos_key(intron):
    """Return the position key of an intron, or None for a missing intron."""
    if intron:
        return (intron.donor.pos, intron.acceptor.pos)
    return None


def _intron_pos_keys(intronlist):
    """Return a snapshot list with the position keys of all listed introns."""
    return [(intron.donor.pos, intron.acceptor.pos) for intron in intronlist]


def _pacbporfs_have_low_gap_ratio(pacbporfD, pacbporfA):
    """Check that both PacbPORFs score below the high gap ratio threshold."""
    return (pacbporfD.gap_ratio_score() < PACBPORF_HIGH_GAP_RATIO_THRESHOLD
            and
            pacbporfA.gap_ratio_score() < PACBPORF_HIGH_GAP_RATIO_THRESHOLD)


def _store_unseen_intron_pairs(introns, pairs):
    """Store (query, sbjct) intron pairs of which neither intron is stored yet."""
    # keys are a snapshot taken before the loop; pairs within the same
    # candidate list are not checked against each other
    keysQ = _intron_pos_keys(introns['query'])
    keysS = _intron_pos_keys(introns['sbjct'])
    for (intrQ, intrS) in pairs:
        k1 = _intron_pos_key(intrQ)
        k2 = _intron_pos_key(intrS)
        if k1 not in keysQ and k2 not in keysS:
            introns['query'].append(intrQ)
            introns['sbjct'].append(intrS)


def _store_unseen_intron_quartets(introns, candidates, keyfunc):
    """Store 2 query + 2 sbjct introns per candidate when all four are new."""
    keysQ = _intron_pos_keys(introns['query'])
    keysS = _intron_pos_keys(introns['sbjct'])
    for (intrQ, intrS, pacbporf, intrQ2, intrS2) in candidates:
        k1 = keyfunc(intrQ)
        k2 = keyfunc(intrS)
        k3 = keyfunc(intrQ2)
        k4 = keyfunc(intrS2)
        if k1 in keysQ or k2 in keysS or k3 in keysQ or k4 in keysS:
            continue
        # never store a missing (None) intron; a None in the lists would
        # break the position key snapshots of the subsequent candidates
        for intron in (intrQ, intrQ2):
            if intron is not None:
                introns['query'].append(intron)
        for intron in (intrS, intrS2):
            if intron is not None:
                introns['sbjct'].append(intron)


def _store_unseen_introns_independently(introns, pairs):
    """Store query and sbjct introns of pairs, each judged on its own."""
    keysQ = _intron_pos_keys(introns['query'])
    keysS = _intron_pos_keys(introns['sbjct'])
    for (intrQ, intrS) in pairs:
        k1 = _intron_pos_key(intrQ)
        k2 = _intron_pos_key(intrS)
        # do NOT require both introns to be new; add whichever one is new
        if k1 not in keysQ:
            introns['query'].append(intrQ)
        if k2 not in keysS:
            introns['sbjct'].append(intrS)


def _store_unseen_tinyexon_chains(intronlist, one_tinyexon, two_tinyexons):
    """Store introns flanking one or two tiny exons when all are new."""
    keys = _intron_pos_keys(intronlist)
    for (intr1, intr2, exon) in one_tinyexon:
        k1 = _intron_pos_key(intr1)
        k2 = _intron_pos_key(intr2)
        if k1 not in keys and k2 not in keys:
            intronlist.append(intr1)
            intronlist.append(intr2)

    # refresh the snapshot so the second series is checked against the first
    keys = _intron_pos_keys(intronlist)
    for (intr1, intr2, intr3, exon1, exon2) in two_tinyexons:
        k1 = _intron_pos_key(intr1)
        k2 = _intron_pos_key(intr2)
        k3 = _intron_pos_key(intr3)
        if k1 not in keys and k2 not in keys and k3 not in keys:
            intronlist.append(intr1)
            intronlist.append(intr2)
            intronlist.append(intr3)


def _append_introns_with_new_coords(intronlist, candidates):
    """Append each candidate intron whose coords() are not in the list yet."""
    keys = [intron.coords() for intron in intronlist]
    for intron in candidates:
        coords = intron.coords()
        if coords not in keys:
            intronlist.append(intron)
            # keep the keys up to date -> duplicates among candidates are
            # stored only once
            keys.append(coords)


def merge_pacbporfs(pacbporfD,
                    pacbporfA,
                    queryOrfSetObj,
                    sbjctOrfSetObj,
                    allow_query_projecting=True,
                    allow_sbjct_projecting=True,
                    allow_query_mapping=True,
                    allow_sbjct_mapping=True,
                    allow_projecting=True,
                    allow_mapping=True,
                    verbose=False):
    """
    Merge 2 PacbPORF objects with an interface into a gene structure

    Which merging strategies are tried depends on whether the query Orfs
    and/or the sbjct Orfs of both PacbPORFs are identical (compared by id):
      - both differ:        introns are searched in query and sbjct
      - only query differs: intron(s) are searched in the query; aligned
                            introns are only tried when nothing was found
                            and mapping is allowed for query and sbjct
      - only sbjct differs: mirror image of the previous case
      - both identical:     inframe / stopless 3n introns, when mapping
                            is allowed

    @type  pacbporfD: PacbPORF object
    @param pacbporfD: PacbPORF object that has to deliver PSSM donor objects

    @type  pacbporfA: PacbPORF object
    @param pacbporfA: PacbPORF object that has to deliver PSSM acceptor objects

    @type  queryOrfSetObj: object with elegiable Orfs
    @param queryOrfSetObj: passed to the tinyexon merging functions (query)

    @type  sbjctOrfSetObj: object with elegiable Orfs
    @param sbjctOrfSetObj: passed to the tinyexon merging functions (sbjct)

    @type  allow_query_mapping: Boolean
    @param allow_query_mapping: allow the mapping strategies for the query

    @type  allow_sbjct_mapping: Boolean
    @param allow_sbjct_mapping: allow the mapping strategies for the sbjct

    @type  allow_mapping: Boolean
    @param allow_mapping: when False, both allow_query_mapping and
                          allow_sbjct_mapping are overruled to False

    @type  allow_projecting: Boolean
    @param allow_projecting: accepted for interface compatibility; just as
                          allow_query_projecting and allow_sbjct_projecting
                          it is not consulted in this function

    @type  verbose: Boolean
    @param verbose: print status/debugging messages to STDOUT
                    (currently not used in this function)

    @rtype:  dict
    @return: { 'query': [ intron, .. ], 'sbjct': [ intron, .. ] }, both lists
             after filtering with _filter_stopless_3n_introns
    """
    # input validation
    IsPacbPORF(pacbporfD)
    IsPacbPORF(pacbporfA)

    # create **kwargs dictionary with the splice site scanning thresholds
    kwargs = {}
    _update_kwargs(kwargs, KWARGS_SPLICESITES)

    # allow_mapping == False overrules the query/sbjct specific attributes
    if not allow_mapping:
        allow_query_mapping = False
        allow_sbjct_mapping = False

    # check if Orf objects of PacbPORFS are identical
    queryOrfsIdentical = pacbporfD.orfQ.id == pacbporfA.orfQ.id
    sbjctOrfsIdentical = pacbporfD.orfS.id == pacbporfA.orfS.id

    # return data structure of introns
    introns = {'query': [], 'sbjct': []}

    # Scan Orfs for splice sites.
    # This has probably been performed before, but when not done,
    # cached donor & acceptor sites lists seems to be empty -> no introns
    for orf in (pacbporfD.orfQ, pacbporfD.orfS):
        orf.scan_orf_for_pssm_splice_sites(
            splicetype="donor",
            min_pssm_score=kwargs['min_donor_pssm_score'],
            allow_non_canonical=kwargs['allow_non_canonical_donor'],
            non_canonical_min_pssm_score=kwargs[
                'non_canonical_min_donor_pssm_score'])
    for orf in (pacbporfA.orfQ, pacbporfA.orfS):
        orf.scan_orf_for_pssm_splice_sites(
            splicetype="acceptor",
            min_pssm_score=kwargs['min_acceptor_pssm_score'],
            allow_non_canonical=kwargs['allow_non_canonical_acceptor'],
            non_canonical_min_pssm_score=kwargs[
                'non_canonical_min_acceptor_pssm_score'])

    if not queryOrfsIdentical and not sbjctOrfsIdentical:
        ####################################################################
        # different Orfs in query AND sbjct: introns in both sequences
        ####################################################################
        introns1 = merge_pacbporfs_with_introns(pacbporfD, pacbporfA)
        # filter for **best** candidates based on PSSM/entropy combination
        introns1 = _filter_aligned_introns_on_pssm_entropy_combination(
            introns1)

        if _pacbporfs_have_low_gap_ratio(pacbporfD, pacbporfA):
            introns2 = merge_pacbporfs_with_closeby_independant_introns(
                pacbporfD, pacbporfA)
            introns3 = merge_pacbporfs_with_phase_shift_introns(
                pacbporfD, pacbporfA)
            introns4 = merge_pacbporfs_by_tinyexons(pacbporfD, pacbporfA,
                                                    queryOrfSetObj,
                                                    sbjctOrfSetObj)
            introns5 = merge_pacbporfs_by_query_tinyexon_and_sbjct_intron(
                pacbporfD, pacbporfA, queryOrfSetObj)
            introns6 = merge_pacbporfs_by_sbjct_tinyexon_and_query_intron(
                pacbporfD, pacbporfA, sbjctOrfSetObj)
            introns7 = merge_pacbporfs_by_sbjct_equal_length_exon_and_query_intron(
                pacbporfD, pacbporfA, sbjctOrfSetObj)
            introns8 = merge_pacbporfs_by_query_equal_length_exon_and_sbjct_intron(
                pacbporfD, pacbporfA, queryOrfSetObj)
        else:
            # do not allow more complex intron merging
            introns2 = []
            introns3 = []
            introns4 = []
            introns5 = []
            introns6 = []
            introns7 = []
            introns8 = []

        introns9 = merge_pacbporfs_with_conserved_acceptor_introns(
            pacbporfD, pacbporfA)
        # filter for **best** candidates based on PSSM/entropy combination
        introns9 = _filter_aligned_introns_on_pssm_entropy_combination(
            introns9)

        introns10 = merge_pacbporfs_with_conserved_donor_introns(
            pacbporfD, pacbporfA)
        # filter for **best** candidates based on PSSM/entropy combination
        introns10 = _filter_aligned_introns_on_pssm_entropy_combination(
            introns10)

        # store introns obtained by most simplest case projecting/mapping
        introns['query'].extend(Set([intrQ for (intrQ, intrS) in introns1]))
        introns['sbjct'].extend(Set([intrS for (intrQ, intrS) in introns1]))

        # From here on, only store introns that are NOT encountered already.
        # introns2: closeby independant introns
        _store_unseen_intron_pairs(
            introns, [(intrQ, intrS) for (intrQ, intrS, cigpacbp) in introns2])

        # introns3: phase shift introns
        _store_unseen_intron_pairs(
            introns, [(intrQ, intrS) for (intrQ, intrS) in introns3])

        # introns4: tinyexons in query and sbjct; all 4 introns must exist
        _store_unseen_intron_quartets(introns, introns4, _intron_pos_key)

        # introns5: query tinyexon & sbjct intron; some introns can be None
        _store_unseen_intron_quartets(introns, introns5,
                                      _optional_intron_pos_key)

        # introns6: sbjct tinyexon & query intron; some introns can be None
        _store_unseen_intron_quartets(introns, introns6,
                                      _optional_intron_pos_key)

        # introns7: 1 query intron and 2 sbjct introns per candidate
        keysQ = _intron_pos_keys(introns['query'])
        keysS = _intron_pos_keys(introns['sbjct'])
        for (intrS, pacbporf1, intrQ, pacbporf2, intrS2) in introns7:
            k1 = _intron_pos_key(intrQ)
            k2 = _intron_pos_key(intrS)
            k3 = _intron_pos_key(intrS2)
            if k1 not in keysQ and k2 not in keysS and k3 not in keysS:
                introns['query'].append(intrQ)
                introns['sbjct'].append(intrS)
                introns['sbjct'].append(intrS2)

        # introns8: 2 query introns and 1 sbjct intron per candidate
        keysQ = _intron_pos_keys(introns['query'])
        keysS = _intron_pos_keys(introns['sbjct'])
        for (intrQ, pacbporf1, intrS, pacbporf2, intrQ2) in introns8:
            k1 = _intron_pos_key(intrQ)
            k2 = _intron_pos_key(intrS)
            k3 = _intron_pos_key(intrQ2)
            if k1 not in keysQ and k2 not in keysS and k3 not in keysQ:
                introns['query'].append(intrQ)
                introns['query'].append(intrQ2)
                introns['sbjct'].append(intrS)

        # introns9 & introns10: conserved acceptor / conserved donor introns
        _store_unseen_introns_independently(introns, introns9)
        _store_unseen_introns_independently(introns, introns10)

        # finally, do the bridging thingy
        introns0 = merge_pacbporfs_with_query_intron_bridgeing(
            pacbporfD, pacbporfA)

        # only store introns from introns0 that are NOT encountered already
        keysQ = _intron_pos_keys(introns['query'])
        for intrQ in introns0:
            if intrQ.coords() not in keysQ:
                introns['query'].append(intrQ)

    elif not queryOrfsIdentical:
        ####################################################################
        # identical sbjct Orf, different query Orfs: intron in the query
        ####################################################################
        seqerror = merge_pacbporf_with_sequenceerror_in_query(
            pacbporfD, pacbporfA)
        introns1 = merge_pacbporfs_by_intron_in_query(pacbporfD, pacbporfA)

        if _pacbporfs_have_low_gap_ratio(pacbporfD, pacbporfA):
            introns2 = merge_pacbporfs_by_intron_tinyexon_intron_in_query(
                pacbporfD, pacbporfA, queryOrfSetObj)
            introns3 = merge_pacbporfs_by_two_tinyexons_in_query(
                pacbporfD, pacbporfA, queryOrfSetObj)
        else:
            # do not allow more complex intron merging
            introns2 = []
            introns3 = []

        # store sequencerror if it exists
        if seqerror:
            introns['query'].append(seqerror)

        # store introns obtained by most simplest case projecting/mapping
        introns['query'].extend([prj.projected_introns[0] for prj in introns1])

        # only store tinyexon introns that are NOT encountered already
        _store_unseen_tinyexon_chains(introns['query'], introns2, introns3)

        if not introns['query'] and allow_sbjct_mapping and allow_query_mapping:
            # just bridge Orfs by **best** intron(s).
            introns0 = merge_pacbporfs_with_query_intron_bridgeing(
                pacbporfD, pacbporfA)

            # potential stopless 3n intron in SBJCT
            introns1 = merge_pacbporfs_with_introns(pacbporfD, pacbporfA)
            # filter for **best** candidates based on PSSM/entropy combination
            introns1 = _filter_aligned_introns_on_pssm_entropy_combination(
                introns1)
            # apply stopless3n intron filtering
            introns1 = _filter_aligned_stopless_3n_introns(introns1)

            introns2 = merge_pacbporfs_with_closeby_independant_introns(
                pacbporfD, pacbporfA)

            if _pacbporfs_have_low_gap_ratio(pacbporfD, pacbporfA):
                introns3 = merge_pacbporfs_with_phase_shift_introns(
                    pacbporfD, pacbporfA)
                # filter for **best** candidates based on PSSM/entropy combination
                introns3 = _filter_aligned_introns_on_pssm_entropy_combination(
                    introns3)
                # apply stopless3n intron filtering
                introns3 = _filter_aligned_stopless_3n_introns(introns3)
            else:
                # do not allow more complex intron merging
                introns3 = []

            # only store QUERY introns that are NOT encountered already
            _append_introns_with_new_coords(
                introns['query'], [intrQ for (intrQ, intrS) in introns1])
            _append_introns_with_new_coords(
                introns['query'],
                [intrQ for (intrQ, intrS, cigpacbp) in introns2])
            _append_introns_with_new_coords(
                introns['query'], [intrQ for (intrQ, intrS) in introns3])
            _append_introns_with_new_coords(
                introns['query'], [intron for intron in introns0])

            # only store SBJCT introns that are NOT encountered already;
            # these belong in introns['sbjct'], the list they are checked
            # against (and not in introns['query'])
            _append_introns_with_new_coords(
                introns['sbjct'], [intrS for (intrQ, intrS) in introns1])
            _append_introns_with_new_coords(
                introns['sbjct'],
                [intrS for (intrQ, intrS, cigpacbp) in introns2])
            _append_introns_with_new_coords(
                introns['sbjct'], [intrS for (intrQ, intrS) in introns3])

        elif not introns['query']:
            # mapping not allowed: just bridge Orfs by **best** intron(s).
            introns0 = merge_pacbporfs_with_query_intron_bridgeing(
                pacbporfD, pacbporfA)
            # only store introns from that are NOT encountered already;
            # keys are a snapshot taken before the loop
            keys = [intron.coords() for intron in introns['query']]
            for intron in introns0:
                if intron.coords() not in keys:
                    introns['query'].append(intron)
        else:
            # projecting introns yielded results; do not try mapping
            pass

    elif not sbjctOrfsIdentical:
        ####################################################################
        # identical query Orf, different sbjct Orfs: intron in the sbjct
        ####################################################################
        introns1 = merge_pacbporfs_by_intron_in_sbjct(pacbporfD, pacbporfA)

        if _pacbporfs_have_low_gap_ratio(pacbporfD, pacbporfA):
            introns2 = merge_pacbporfs_by_intron_tinyexon_intron_in_sbjct(
                pacbporfD, pacbporfA, sbjctOrfSetObj)
            introns3 = merge_pacbporfs_by_two_tinyexons_in_sbjct(
                pacbporfD, pacbporfA, sbjctOrfSetObj)
        else:
            # do not allow more complex intron merging
            introns2 = []
            introns3 = []

        # store introns obtained by most simplest case projecting/mapping
        introns['sbjct'].extend([prj.projected_introns[0] for prj in introns1])

        # only store tinyexon introns that are NOT encountered already
        _store_unseen_tinyexon_chains(introns['sbjct'], introns2, introns3)

        if not introns['sbjct'] and allow_sbjct_mapping and allow_query_mapping:
            # potential stopless 3n intron in QUERY
            introns1 = merge_pacbporfs_with_introns(pacbporfD, pacbporfA)
            # filter for **best** candidates based on PSSM/entropy combination
            introns1 = _filter_aligned_introns_on_pssm_entropy_combination(
                introns1)
            # apply stopless3n intron filtering
            introns1 = _filter_aligned_stopless_3n_introns(introns1)

            introns2 = merge_pacbporfs_with_closeby_independant_introns(
                pacbporfD, pacbporfA)

            if _pacbporfs_have_low_gap_ratio(pacbporfD, pacbporfA):
                introns3 = merge_pacbporfs_with_phase_shift_introns(
                    pacbporfD, pacbporfA)
                # filter for **best** candidates based on PSSM/entropy combination
                introns3 = _filter_aligned_introns_on_pssm_entropy_combination(
                    introns3)
                # apply stopless3n intron filtering
                introns3 = _filter_aligned_stopless_3n_introns(introns3)
            else:
                # do not allow more complex intron merging
                introns3 = []

            # store introns
            introns['query'].extend(Set([intrQ
                                         for (intrQ, intrS) in introns1]))
            introns['sbjct'].extend(Set([intrS
                                         for (intrQ, intrS) in introns1]))
            introns['query'].extend(
                [intrQ for (intrQ, intrS, cigpacbp) in introns2])
            introns['query'].extend([intrQ for (intrQ, intrS) in introns3])
            introns['sbjct'].extend(
                [intrS for (intrQ, intrS, cigpacbp) in introns2])
            introns['sbjct'].extend([intrS for (intrQ, intrS) in introns3])
        else:
            # projecting introns yielded results; do not try mapping
            pass

    else:
        ####################################################################
        # identical Orfs in query AND sbjct: inframe / stopless 3n introns
        ####################################################################
        if allow_query_mapping:
            introns1 = merge_pacbporfs_by_inframe_intron_in_query(
                pacbporfD, pacbporfA)
        else:
            # no mapping (unigene or continious alignment provided)
            introns1 = []

        if allow_sbjct_mapping:
            introns2 = merge_pacbporfs_by_inframe_intron_in_sbjct(
                pacbporfD, pacbporfA)
        else:
            # no mapping (unigene or continious alignment provided)
            introns2 = []

        if allow_sbjct_mapping and allow_query_mapping:
            introns3 = merge_pacbporfs_with_introns(pacbporfD, pacbporfA)
            # filter for **best** candidates based on PSSM/entropy combination
            introns3 = _filter_aligned_introns_on_pssm_entropy_combination(
                introns3)
            # apply stopless3n intron filtering
            introns3 = _filter_aligned_stopless_3n_introns(introns3)
        else:
            # no mapping (unigene or continious alignment provided)
            introns3 = []

        introns['query'].extend([prj.projected_introns[0] for prj in introns1])
        introns['sbjct'].extend([prj.projected_introns[0] for prj in introns2])
        introns['query'].extend([intrQ for (intrQ, intrS) in introns3])
        introns['sbjct'].extend([intrS for (intrQ, intrS) in introns3])

    # Filter for stopless3n introns
    introns['query'] = _filter_stopless_3n_introns(introns['query'])
    introns['sbjct'] = _filter_stopless_3n_introns(introns['sbjct'])

    # return dictionary with lists of introns
    return introns
Esempio n. 16
0
def _merge_pacbporfs_by_tinyexon_and_two_introns(pacbporfD,pacbporfA,
    orfSetObject,queryorsbjct,verbose = False, **kwargs):
    """
    Merge 2 PacbPORF objects by introns

    @attention: see pacb.connecting.merge_orfs_with_intron for **kwargs)

    @type  pacbporfD: PacbPORF object
    @param pacbporfD: PacbPORF object that has to deliver PSSM donor objects

    @type  pacbporfA: PacbPORF object
    @param pacbporfA: PacbPORF object that has to deliver PSSM acceptor objects

    @type  orfSetObject: object with elegiable Orfs
    @param orfSetObject: object with elegiable Orfs

    @type  queryorsbjct: string
    @param queryorsbjct: literal string 'query' or 'sbjct'

    @type  verbose: Boolean
    @param verbose: print debugging info to STDOUT when True

    @rtype:  list
    @return: list with ( intron, ExonOnOrf, intron ) on the query sequence
    """
    # input validation
    IsPacbPORF(pacbporfD)
    IsPacbPORF(pacbporfA)

    # edit **kwargs dictionary for some forced attributes
    _update_kwargs(kwargs,KWARGS_PROJECTED_TINYEXON)

    MAX_TINYEXON_NT_LENGTH = 33
    MIN_TINYEXON_NT_LENGTH = 6

    tinyexons = []
    if queryorsbjct == "query":
        donorOrf = pacbporfD.orfQ
        accepOrf = pacbporfA.orfQ
        prjctOrf = pacbporfD.orfS
        alignedDonorRange = pacbporfD.alignment_dna_range_query()
        alignedAccepRange = pacbporfA.alignment_dna_range_query()
    elif queryorsbjct == "sbjct":
        donorOrf = pacbporfD.orfS
        accepOrf = pacbporfA.orfS
        prjctOrf = pacbporfD.orfQ
        alignedDonorRange = pacbporfD.alignment_dna_range_sbjct()
        alignedAccepRange = pacbporfA.alignment_dna_range_sbjct()
    else:
        message = "'queryorsbjct' (%s), not 'query' or 'sbjct'" % queryorsbjct
        raise InproperlyAppliedArgument, message

    for dObj in donorOrf._donor_sites:
        # do not make a projection OVER the aligned area
        if dObj.pos < min(alignedDonorRange): continue
        if queryorsbjct == "query":
            (dPos,dPhase) = pacbporfD.dnaposition_query(dObj.pos,forced_return=True)
        else:
            (dPos,dPhase) = pacbporfD.dnaposition_sbjct(dObj.pos,forced_return=True)
        try:
            algDobj = pacbporfD._positions[dPos]
        except IndexError:
            # site out of range of PacbPORF -> break
            break
        for aObj in accepOrf._acceptor_sites:
            # do not make a projection OVER the aligned area
            if aObj.pos > max(alignedAccepRange): continue
            if queryorsbjct == "query":
                (aPos,aPhase) = pacbporfA.dnaposition_query(aObj.pos,forced_return=True)
            else:
                (aPos,aPhase) = pacbporfA.dnaposition_sbjct(aObj.pos,forced_return=True)
            try:
                algAobj = pacbporfA._positions[aPos]
            except IndexError:
                # site out of range of PacbPORF -> break
                break
            if queryorsbjct == "query":
                posDsbjct = algDobj.sbjct_dna_start + dPhase
                posAsbjct = algAobj.sbjct_dna_start + aPhase
            else:
                posDsbjct = algDobj.query_dna_start + dPhase
                posAsbjct = algAobj.query_dna_start + aPhase
            distance = posAsbjct - posDsbjct
            if distance >= MAX_TINYEXON_NT_LENGTH:
                break
            if distance < MIN_TINYEXON_NT_LENGTH:
                continue

            ####################################################
            # generate a ScanForMatches pattern file
            ####################################################
            # example pattern: 6...6 AG NNGNNANNANNGN[2,0,0] GT 3...3
            query = list(prjctOrf.inputgenomicsequence[posDsbjct:posAsbjct])
            # mask all non-phase0 nucleotides to N residues;
            # this represents the regularexpression for a specific
            # peptide sequence
            firstphasepositions = range( 3-dPhase % 3, len(query), 3)
            for pos in range(0,len(query)):
                if pos not in firstphasepositions:
                    query[pos] = "N"
            # calculate a ~50% mismatch number
            mismatches =  max([ 0, (len(query) - query.count("N"))/2 ])
            # write the pattern to string and subsequently to file
            # example pattern: 6...6 AG NNGNNANNANNGN[2,0,0] GT 3...3
            if kwargs['allow_non_canonical_donor']:
                sfmpat = "%s...%s AG %s[%s,0,0] G (T | C) %s...%s" % (
                    AUSO,AUSO,"".join(query),mismatches,DDSO,DDSO)
            else:
                sfmpat = "%s...%s AG %s[%s,0,0] GT %s...%s" % (
                    AUSO,AUSO,"".join(query),mismatches,DDSO,DDSO)

            ####################################################
            if verbose:
                print (pacbporfD.orfQ.id,pacbporfA.orfQ.id),
                print distance, dObj, aObj
                print sfmpat
            ####################################################

            fname = "sfmpat_tinyexon_%s_%s_%s_%s" % (
                        donorOrf.id,
                        accepOrf.id,
                        posDsbjct,
                        posAsbjct,
                        )
            fh = open(fname,'w')
            fh.write(sfmpat+"\n")
            fh.close()

            ####################################################
            # run ScanForMatches
            ####################################################
            command = """echo ">myseq\n%s" | %s %s | tr "[,]" "\t\t#" | """ +\
                      """tr -d "\n " | sed "s/>/\\n>/g" | tr "#" "\t" | """ +\
                      """awk -F'\t' '{ if (NF==4 && $2>%s && $3<%s) """ +\
                      """{ print $1"["$2","$3"]\\n"$4 } }' """
            command = command % (
                        donorOrf.inputgenomicsequence,
                        EXECUTABLE_SFM,fname,
                        dObj.pos+(kwargs['min_intron_nt_length']-3),
                        aObj.pos-(kwargs['min_intron_nt_length']-3) )
            co = osPopen(command)
            matches = parseFasta(co.readlines())
            co.close()

            # filter matches for:
            # (1) correct donor & acceptor phase
            # (2) high enough donor & acceptor site scores
            for hdr,seqmatch in matches.iteritems():
                startQ,stopQ = [ int(item) for item in hdr.split(":")[1][1:-1].split(",") ]
                exonQstart   = startQ + AUSO + 2 - 1
                exonQstop    = stopQ  - DDSO - 2

                ####################################
                # get Orf object of tinyexon
                ####################################
                tinyexonorf = None
                # select the Orf on which the tinyexon is located
                for orfObj in orfSetObject.get_elegiable_orfs(
                max_orf_start=exonQstart,min_orf_end=exonQstop):
                    orfPhase = (exonQstart - orfObj.startPY) % 3
                    if orfPhase == dPhase:               
                        tinyexonorf = orfObj
                        break
                else:
                    # No tinyexonorf assigned!! Iin case a regex matched
                    # over a STOP-codon or the regex length is smaller
                    # then the smallest Orf, no Orf can be assigned
                    continue

                # filter for donor & acceptor score            
                dScore = _score_splice_site(seqmatch[-9:],splicetype='donor')
                aScore = _score_splice_site(seqmatch[0:11],splicetype='acceptor')
                if dScore < kwargs['min_donor_pssm_score']:
                    continue
                if aScore < kwargs['min_acceptor_pssm_score']:
                    continue

                # scan Orf for splicesites
                tinyexonorf.scan_orf_for_pssm_splice_sites(
                        splicetype="donor",
                        min_pssm_score=kwargs['min_donor_pssm_score'],
                        allow_non_canonical=kwargs['allow_non_canonical_donor'],
                        non_canonical_min_pssm_score=kwargs['non_canonical_min_donor_pssm_score'])
                tinyexonorf.scan_orf_for_pssm_splice_sites(
                        splicetype="acceptor",
                        min_pssm_score=kwargs['min_acceptor_pssm_score'],
                        allow_non_canonical=kwargs['allow_non_canonical_acceptor'],
                        non_canonical_min_pssm_score=kwargs['non_canonical_min_acceptor_pssm_score'])

                # get 1th intron donor object
                intron1_aObj = None
                for a in tinyexonorf._acceptor_sites:
                    if a.pos == exonQstart:
                        intron1_aObj = a
                        break
                else:
                    # pseudo-acceptorsite as found be SFM regex
                    # is not a valid acceptor site of high enough score
                    # continue to next iteration of (hdr,seqmatch) pair
                    continue

                # get 2th intron donor object
                intron2_dObj = None
                for d in tinyexonorf._donor_sites:
                    if d.pos == exonQstop:
                        intron2_dObj = d
                        break
                else:
                    # pseudo-donorsite as found be SFM regex
                    # is not a valid acceptor site of high enough score
                    # continue to next iteration of (hdr,seqmatch) pair
                    continue


                # check if introns are of elegiable lengths
                if (intron1_aObj.pos-dObj.pos) > kwargs['max_intron_nt_length']:
                    continue
                if (aObj.pos-intron2_dObj.pos) > kwargs['max_intron_nt_length']:
                    continue

                ####################################################
                if True or verbose:
                    # if here, a candidate!!!
                    print (pacbporfD.orfQ.id,tinyexonorf.id,pacbporfA.orfQ.id),
                    print hdr, dScore, aScore
                    print seqmatch
                ####################################################

                # append to found tinyexons
                query_data      = ( tinyexonorf, exonQstart, exonQstop )
                sbjct_data      = ( prjctOrf, posDsbjct, posAsbjct )
                splicesite_data = ( dObj,intron1_aObj, intron2_dObj, aObj )
                tinyexons.append( ( query_data, sbjct_data, splicesite_data ) )


            # file cleanup
            osRemove(fname)

    # return - End Of Function - if no tinyexons are found
    if not tinyexons:
        return []

    ####################################
    # select the **best** tinyexon
    ####################################
    (query_data,sbjct_data,splicesite_data) = tinyexons[0]
    orfQ,query_dna_start,query_dna_end = query_data
    orfS,sbjct_dna_start,sbjct_dna_end = sbjct_data
    (intron1_dObj,intron1_aObj,intron2_dObj,intron2_aObj) = splicesite_data

    ####################################################
    if verbose:
        print "tinyexon orf:", orfQ
        print "tinyexon orf:", intron1_aObj
        print "tinyexon orf:", intron2_dObj
    ####################################################

    ####################################
    # make tinyexon PacbPORF
    ####################################
    startQaa = orfQ.dnapos2aapos(query_dna_start) -1
    startSaa = orfS.dnapos2aapos(sbjct_dna_start) -1
    stopQaa  = orfQ.dnapos2aapos(query_dna_end) +1
    stopSaa  = orfS.dnapos2aapos(sbjct_dna_end) +1
    # check for directly leading stop codon on tinyexon
    while startQaa <= orfQ.protein_startPY:
        startQaa+=1
        startSaa+=1
        query_dna_start+=3
        sbjct_dna_start+=3
    while startSaa <= orfS.protein_startPY:
        startQaa+=1
        startSaa+=1
        query_dna_start+=3
        sbjct_dna_start+=3
    # check for directly tailing stop codon on tinyexon
    while stopQaa > orfQ.protein_endPY:
        stopQaa-=1
        stopSaa-=1
        query_dna_end-=3
        sbjct_dna_end-=3
    while stopSaa > orfS.protein_endPY:
        stopQaa-=1
        stopSaa-=1
        query_dna_end-=3
        sbjct_dna_end-=3
    # get sequences
    qAAseq = orfQ.getaas(abs_pos_start=startQaa,abs_pos_end=stopQaa)
    sAAseq = orfS.getaas(abs_pos_start=startSaa,abs_pos_end=stopSaa)

    ####################################################
    if verbose or len(qAAseq) != len(sAAseq):
        # if unequal lengths, error will be raised upon PacbP.__init__()
        print orfQ, qAAseq, startQaa, stopQaa, (stopQaa-startQaa),
        print (query_dna_start,query_dna_end)
        print orfS, sAAseq, startSaa, stopSaa, (stopSaa-startSaa),
        print (sbjct_dna_start,sbjct_dna_end)
        print orfQ.inputgenomicsequence[query_dna_start-2:query_dna_end+2]
        print orfS.inputgenomicsequence[sbjct_dna_start-2:sbjct_dna_end+2]
    ####################################################

    # initialize extended tinyexon PacbPORF
    from pacb import PacbP
    pacbp = PacbP(input=( qAAseq, sAAseq, startQaa, startSaa ) )
    pacbp.strip_unmatched_ends()
    pacbporf = pacbp2pacbporf(pacbp,orfQ,orfS)
    pacbporf.extend_pacbporf_after_stops()
    pacbporf.source = 'ABGPprojectingTE'

    ####################################
    # make introns
    ####################################
    intron1 = IntronConnectingOrfs(
                intron1_dObj, intron1_aObj, None,
                donorOrf,pacbporf.orfQ )
    intron2 = IntronConnectingOrfs(
                intron2_dObj, intron2_aObj, None,
                pacbporf.orfQ, accepOrf )


    ################################################################
    # set some meta-data properties to the intron objects
    ################################################################
    # add distance score to intron
    intron1._distance = 0
    intron2._distance = 0

    # add Alignment Positional Periphery Score into objects
    if queryorsbjct == "query":
        succes = set_apps_intron_query(intron1,pacbporfD,pacbporf)
        succes = set_apps_intron_query(intron2,pacbporf,pacbporfA)
    else:
        succes = set_apps_intron_sbjct(intron1,pacbporfD,pacbporf)
        succes = set_apps_intron_sbjct(intron2,pacbporf,pacbporfA)

    # set GFF fsource attribute for recognition of intron sources
    intron1._gff['fsource'] = "ABGPprojectingTE"
    intron2._gff['fsource'] = "ABGPprojectingTE"

    # create _linked_to_xxx attributes
    intron1._linked_to_pacbporfs = [ pacbporf ]
    intron2._linked_to_pacbporfs = [ pacbporf ]
    intron1._linked_to_introns   = [ intron2 ]
    intron2._linked_to_introns   = [ intron1 ]

    ####################################################
    if verbose:
        print pacbporf
        pacbporf.print_protein_and_dna()
        print intron1
        print intron2
        if False:
            # printing data when this function needs to be debugged:
            print ""
            print intron1
            print intron2
            print ""
            print pacbporfD
            pacbporfD.print_protein_and_dna()
            print ""
            print pacbporf
            pacbporf.print_protein_and_dna()
            print ""
            print pacbporfA
            pacbporfA.print_protein_and_dna()
            import sys
            sys.exit()
    ####################################################

    # return introns and intermediate tinyexon PacbPORF
    return [(intron1,intron2,pacbporf)]
Esempio n. 17
0
def _merge_pacbporfs_by_two_tinyexons(pacbporfD,pacbporfA,
    orfSetObject,queryorsbjct,verbose = False, **kwargs):
    """ """
    # edit **kwargs dictionary for some forced attributes
    _update_kwargs(kwargs,KWARGS_PROJECTED_TINYEXON)

    tinyexons = []
    sposD = pacbporfD._get_original_alignment_pos_start()
    eposD = pacbporfD._get_original_alignment_pos_end()
    sposA = pacbporfA._get_original_alignment_pos_start()
    eposA = pacbporfA._get_original_alignment_pos_end()
    if queryorsbjct == "query":
        donorOrf = pacbporfD.orfQ
        accepOrf = pacbporfA.orfQ
        prjctOrf = pacbporfD.orfS
        dStart,dEnd = sposD.query_dna_start, eposD.query_dna_end
        aStart,aEnd = sposA.query_dna_start, eposA.query_dna_end
    elif queryorsbjct == "sbjct":
        donorOrf = pacbporfD.orfS
        accepOrf = pacbporfA.orfS
        prjctOrf = pacbporfD.orfQ
        dStart,dEnd = sposD.sbjct_dna_start, eposD.sbjct_dna_end
        aStart,aEnd = sposA.sbjct_dna_start, eposA.sbjct_dna_end
    else:
        message = "'queryorsbjct' (%s), not 'query' or 'sbjct'" % queryorsbjct
        raise InproperlyAppliedArgument, message

    # get all potential combinations of two tinyexons
    tinyexoncombis = merge_orfs_with_two_tinyexons(
                donorOrf, accepOrf,
                donorOrf._donor_sites,
                accepOrf._acceptor_sites,
                orfSetObject.orfs,
                )

    results = []

    for dObj in donorOrf._donor_sites:
        if queryorsbjct == "query":
            (dPos,dPhase) = pacbporfD.dnaposition_query(dObj.pos,forced_return=True)
        else:
            (dPos,dPhase) = pacbporfD.dnaposition_sbjct(dObj.pos,forced_return=True)
        try:
            algDobj = pacbporfD._positions[dPos]
        except IndexError:
            # site out of range of PacbPORF -> break
            break

        # check if dObj is on pfD;
        # introns of tinyexons can be projected outside of pfD/pfA area
        if dObj.pos < dStart: continue

        for aObj in accepOrf._acceptor_sites:
            if queryorsbjct == "query":
                (aPos,aPhase) = pacbporfA.dnaposition_query(aObj.pos,forced_return=True)
            else:
                (aPos,aPhase) = pacbporfA.dnaposition_sbjct(aObj.pos,forced_return=True)
            try:
                algAobj = pacbporfA._positions[aPos]
            except IndexError:
                # site out of range of PacbPORF -> break
                break

            # check if aObj is on pfA;
            # introns of tinyexons can be projected outside of pfD/pfA area
            if aObj.pos > aEnd: continue

            if queryorsbjct == "query":
                posDsbjct = algDobj.sbjct_dna_start + dPhase
                posAsbjct = algAobj.sbjct_dna_start + aPhase
            else:
                posDsbjct = algDobj.query_dna_start + dPhase
                posAsbjct = algAobj.query_dna_start + aPhase
            distance = posAsbjct - posDsbjct
            if distance >= (kwargs['max_tinyexon_nt_length']*2):
                break
            if distance < (kwargs['min_tinyexon_nt_length']*2):
                continue

            filtered_tinyexoncombis = _filter_tinyexoncombis(tinyexoncombis,
                    min_length = distance,
                    max_length = distance,
                    min_first_acceptor_pos = dObj.pos + kwargs['min_tinyexon_intron_nt_length'],
                    max_final_donor_pos = aObj.pos - kwargs['min_tinyexon_intron_nt_length'],
                    phase_final_donor = aObj.phase,
                    phase_first_acceptor= dObj.phase,
                    )

            if not filtered_tinyexoncombis: continue

            ####################################################################
            if verbose:
                print distance, dObj, aObj, len(tinyexoncombis),
                print len(filtered_tinyexoncombis)
            ####################################################################

            for exon1,intron,exon2 in filtered_tinyexoncombis:
                # make preceding intron
                preceding_intron = IntronConnectingOrfs(
                    dObj,exon1.acceptor,
                    None,donorOrf,exon1.orf )

                # make subsequent intron
                subsequent_intron = IntronConnectingOrfs(
                    exon2.donor, aObj,
                    None,exon2.orf,accepOrf)

                ################################################################
                if verbose:
                    print "\t", exon1, exon1.proteinsequence(),
                    print preceding_intron.phase, exon1.donor.phase,
                    print subsequent_intron.phase, preceding_intron.shared_aa,
                    print intron.shared_aa, subsequent_intron.shared_aa 
                    print "\t", exon2, exon2.proteinsequence()
                ################################################################

                # get prjctOrf sequence for comparison
                correctionA = 0
                if aObj.phase != 0:
                    # INCLUDE the final AA which is broken by the splicesite
                    correctionA=1
                if queryorsbjct == "query":
                    startPos,_phase = pacbporfD.dnaposition_query(dObj.pos,forced_return=True)
                    stopPos,_phase  = pacbporfA.dnaposition_query(aObj.pos,forced_return=True)
                    start = pacbporfD._positions[startPos].sbjct_pos
                    stop  = pacbporfA._positions[stopPos].sbjct_pos + correctionA
                else:
                    startPos,_phase = pacbporfD.dnaposition_sbjct(dObj.pos,forced_return=True)
                    stopPos,_phase  = pacbporfA.dnaposition_sbjct(aObj.pos,forced_return=True)
                    start = pacbporfD._positions[startPos].query_pos
                    stop  = pacbporfA._positions[stopPos].query_pos + correctionA

                if stop <= start:
                    # tinyexon is so tiny that is does not have a single
                    # full aligned AA -> discard here
                    continue

                # actually get the prjctOrf sequence
                aaseq = prjctOrf.getaas(abs_pos_start=start,abs_pos_end=stop)

                # initialize a PacbP for the combination of both tinyexons
                # afterwards, check if the indentityscore is > 0.XX
                from pacb import PacbP
                seqparts = [ preceding_intron.shared_aa,
                             exon1.proteinsequence(),
                             intron.shared_aa,
                             exon2.proteinsequence(),
                             subsequent_intron.shared_aa ]

                ################################################################
                if verbose or len("".join(seqparts)) != len(aaseq):
                    print pacbporfD
                    print exon1.orf, exon2.orf, prjctOrf
                    print pacbporfA
                    print seqparts
                    print aaseq, len(aaseq), len("".join(seqparts)), (start,stop)
                    print "'%s'" % queryorsbjct,
                    print "Q", (algDobj.query_pos, algAobj.query_pos),
                    print "S", (algDobj.sbjct_pos, algAobj.sbjct_pos)
                    print "distance:", distance, kwargs['max_tinyexon_nt_length'],
                    print (posDsbjct, posAsbjct),
                    print "Q-dna:", ( algDobj.query_dna_start, dPhase, algAobj.query_dna_start, aPhase ),
                    print "S-dna:", ( algDobj.sbjct_dna_start, dPhase, algAobj.sbjct_dna_start, aPhase )
                ################################################################

                # ignore by continue when sequences not identical in length
                if len("".join(seqparts)) != len(aaseq): continue

                testpacbp = PacbP(input=( "".join(seqparts), aaseq, 0, 0) )
                testpacbp.strip_unmatched_ends()

                if not ( testpacbp.identityscore > 0.60 and\
                (float(testpacbp.length) / len(aaseq)) > 0.70 ):
                    # not a very convincing alignment
                    continue

                ################################################################
                if verbose:
                    print testpacbp
                    testpacbp.print_protein()
                ################################################################

                # if here, succesfully mapped 2 tiny exons!!
                # get all sequences/coordinates in place for
                # pacbporf formation
                orfQ1   = exon1.orf
                orfS1   = prjctOrf
                orfQ2   = exon2.orf
                orfS2   = prjctOrf
                seqQ1   = exon1.proteinsequence()
                seqQ2   = exon2.proteinsequence()
                coordQ1 = exon1.acceptor.pos / 3
                coordS1 = start
                coordQ2 = exon2.acceptor.pos / 3
                coordS2 = start + len(seqparts[0]) + len(seqparts[1]) + len(seqparts[2])
                seqS1   = aaseq[0:(len(seqparts[0])+len(seqparts[1]))]
                seqS2   = aaseq[-(len(seqparts[3])+len(seqparts[4])):]
                if len(seqparts[0]):
                    seqS1 = seqS1[1:]
                    coordS1 += 1
                if len(seqparts[4]):
                    seqS2 = seqS2[:-1]

                if queryorsbjct == "sbjct": 
                    # swap query <-> sbjct
                    orfQ1,orfS1 = orfS1,orfQ1 
                    orfQ2,orfS2 = orfS2,orfQ2
                    seqQ1,seqS1 = seqS1,seqQ1
                    seqQ2,seqS2 = seqS2,seqQ2
                    coordQ1,coordS1 = coordS1,coordQ1
                    coordQ2,coordS2 = coordS2,coordQ2

                ################################################################
                if verbose:
                    print "tinypacbporf1:", seqQ1, seqQ2, coordQ1, coordQ2
                    print "tinypacbporf2:", seqS1, seqS2, coordS1, coordS2
                ################################################################


                # make pacbporfs
                pacbp1 = PacbP(input=( seqQ1, seqS1, coordQ1, coordS1) )
                pacbp1.strip_unmatched_ends()
                tinypacbporf1 = pacbp2pacbporf(pacbp1,orfQ1,orfS1)
                tinypacbporf1.extend_pacbporf_after_stops()
                pacbp2 = PacbP(input=( seqQ2, seqS2, coordQ2, coordS2) )
                pacbp2.strip_unmatched_ends()
                tinypacbporf2 = pacbp2pacbporf(pacbp2,orfQ2,orfS2)
                tinypacbporf2.extend_pacbporf_after_stops()

                ################################################################
                if verbose:
                    print tinypacbporf1
                    tinypacbporf1.print_protein_and_dna()
                    print tinypacbporf2
                    tinypacbporf2.print_protein_and_dna()
                ################################################################


                ################################################################
                # set some meta-data properties to the intron objects
                ################################################################
                # add distance score to intron
                preceding_intron._distance  = 0
                intron._distance            = 0
                subsequent_intron._distance = 0
            
                # add Alignment Positional Periphery Score into objects
                if queryorsbjct == "query":
                    succes = set_apps_intron_query(preceding_intron,pacbporfD,tinypacbporf1)
                    succes = set_apps_intron_query(intron,tinypacbporf1,tinypacbporf2)
                    succes = set_apps_intron_query(subsequent_intron,tinypacbporf2,pacbporfA)
                else:
                    succes = set_apps_intron_sbjct(preceding_intron,pacbporfD,tinypacbporf1)
                    succes = set_apps_intron_sbjct(intron,tinypacbporf1,tinypacbporf2)
                    succes = set_apps_intron_sbjct(subsequent_intron,tinypacbporf2,pacbporfA)
            
                # set GFF fsource attribute for recognition of intron sources
                preceding_intron._gff['fsource']  = "ABGPprojectingTE"
                intron._gff['fsource']            = "ABGPprojectingTE"
                subsequent_intron._gff['fsource'] = "ABGPprojectingTE"


                # create _linked_to_xxx attributes
                preceding_intron._linked_to_pacbporfs = [ tinypacbporf1, tinypacbporf2 ]
                intron._linked_to_pacbporfs = [ tinypacbporf1, tinypacbporf2 ]
                subsequent_intron._linked_to_pacbporfs = [ tinypacbporf1, tinypacbporf2 ]
                preceding_intron._linked_to_introns   = [ intron,subsequent_intron ]
                intron._linked_to_introns             = [ preceding_intron,subsequent_intron ]
                subsequent_intron._linked_to_introns  = [ intron,preceding_intron ]

                ################################################################
                # append to results
                ################################################################
                results.append( (
                    preceding_intron,
                    intron,
                    subsequent_intron,
                    tinypacbporf1,
                    tinypacbporf2,
                    ) )


    # return 3 introns and 2 intermediate tinyexon PacbPORFs (per row)
    return results
Esempio n. 18
0
def _merge_pacbporfs_by_two_tinyexons(pacbporfD,
                                      pacbporfA,
                                      orfSetObject,
                                      queryorsbjct,
                                      verbose=False,
                                      **kwargs):
    """
    Bridge 2 PacbPORF objects by 2 consecutive tiny exons on a single side

    For every donor site on the donor Orf and every acceptor site on the
    acceptor Orf, the nt distance between both sites measured on the *other*
    (projected) sequence is used as the exact combined length the two tiny
    exons must have. Matching tinyexon pairs (exon1 - intron - exon2) are
    translated and compared to the protein sequence of the projected Orf;
    pairs that are similar enough are converted into 2 tiny PacbPORFs which
    are connected by 3 introns.

    @attention: **kwargs is first passed through
                _update_kwargs(kwargs, KWARGS_PROJECTED_TINYEXON); the keys
                'max_tinyexon_nt_length', 'min_tinyexon_nt_length' and
                'min_tinyexon_intron_nt_length' are read in this function
    @attention: diagnostics are printed to STDOUT even when verbose is False
                in case the assembled protein sequence and the projected
                protein sequence differ in length
    @attention: pacbporfD and pacbporfA are not validated in this function

    @type  pacbporfD: PacbPORF object
    @param pacbporfD: PacbPORF object that has to deliver the donor site

    @type  pacbporfA: PacbPORF object
    @param pacbporfA: PacbPORF object that has to deliver the acceptor site

    @type  orfSetObject: object with an `orfs` attribute
    @param orfSetObject: its `orfs` are offered as tinyexon candidates to
                         merge_orfs_with_two_tinyexons

    @type  queryorsbjct: string
    @param queryorsbjct: 'query' or 'sbjct'; the side on which the tiny exons
                         are searched (the opposite side is projected)

    @type  verbose: Boolean
    @param verbose: print status/debugging messages to STDOUT

    @rtype:  list
    @return: list with ( preceding_intron, intron, subsequent_intron,
             tinypacbporf1, tinypacbporf2 ) tuples

    @raise InproperlyAppliedArgument: queryorsbjct is not 'query' or 'sbjct'
    """
    # edit **kwargs dictionary for some forced attributes
    _update_kwargs(kwargs, KWARGS_PROJECTED_TINYEXON)

    # NOTE(review): `tinyexons` is never used in this function
    tinyexons = []
    sposD = pacbporfD._get_original_alignment_pos_start()
    eposD = pacbporfD._get_original_alignment_pos_end()
    sposA = pacbporfA._get_original_alignment_pos_start()
    eposA = pacbporfA._get_original_alignment_pos_end()
    # select the Orfs that deliver the splice sites (donorOrf/accepOrf), the
    # Orf of the opposite side that is projected upon (prjctOrf) and the nt
    # boundaries of the original alignments on the selected side.
    # NOTE(review): of these boundaries only dStart and aEnd are used below
    if queryorsbjct == "query":
        donorOrf = pacbporfD.orfQ
        accepOrf = pacbporfA.orfQ
        prjctOrf = pacbporfD.orfS
        dStart, dEnd = sposD.query_dna_start, eposD.query_dna_end
        aStart, aEnd = sposA.query_dna_start, eposA.query_dna_end
    elif queryorsbjct == "sbjct":
        donorOrf = pacbporfD.orfS
        accepOrf = pacbporfA.orfS
        prjctOrf = pacbporfD.orfQ
        dStart, dEnd = sposD.sbjct_dna_start, eposD.sbjct_dna_end
        aStart, aEnd = sposA.sbjct_dna_start, eposA.sbjct_dna_end
    else:
        message = "'queryorsbjct' (%s), not 'query' or 'sbjct'" % queryorsbjct
        raise InproperlyAppliedArgument, message

    # get all potential combinations of two tinyexons
    tinyexoncombis = merge_orfs_with_two_tinyexons(
        donorOrf,
        accepOrf,
        donorOrf._donor_sites,
        accepOrf._acceptor_sites,
        orfSetObject.orfs,
    )

    results = []

    for dObj in donorOrf._donor_sites:
        # translate the donor nt position into an index in the PacbPORF
        # alignment (dPos) plus the offset within that position (dPhase)
        if queryorsbjct == "query":
            (dPos, dPhase) = pacbporfD.dnaposition_query(dObj.pos,
                                                         forced_return=True)
        else:
            (dPos, dPhase) = pacbporfD.dnaposition_sbjct(dObj.pos,
                                                         forced_return=True)
        try:
            algDobj = pacbporfD._positions[dPos]
        except IndexError:
            # site out of range of PacbPORF -> break
            # NOTE(review): breaking (instead of continuing) presumably relies
            # on _donor_sites being ordered on position -- confirm
            break

        # check if dObj is on pfD;
        # introns of tinyexons can be projected outside of pfD/pfA area
        if dObj.pos < dStart: continue

        for aObj in accepOrf._acceptor_sites:
            # same translation as above, now for the acceptor on pacbporfA
            if queryorsbjct == "query":
                (aPos,
                 aPhase) = pacbporfA.dnaposition_query(aObj.pos,
                                                       forced_return=True)
            else:
                (aPos,
                 aPhase) = pacbporfA.dnaposition_sbjct(aObj.pos,
                                                       forced_return=True)
            try:
                algAobj = pacbporfA._positions[aPos]
            except IndexError:
                # site out of range of PacbPORF -> break
                break

            # check if aObj is on pfA;
            # introns of tinyexons can be projected outside of pfD/pfA area
            if aObj.pos > aEnd: continue

            # nt coordinates of both sites on the *opposite* sequence; despite
            # their names, posDsbjct/posAsbjct hold query coordinates when
            # queryorsbjct == 'sbjct'
            if queryorsbjct == "query":
                posDsbjct = algDobj.sbjct_dna_start + dPhase
                posAsbjct = algAobj.sbjct_dna_start + aPhase
            else:
                posDsbjct = algDobj.query_dna_start + dPhase
                posAsbjct = algAobj.query_dna_start + aPhase
            # the projected gap that both tiny exons together have to fill
            distance = posAsbjct - posDsbjct
            if distance >= (kwargs['max_tinyexon_nt_length'] * 2):
                # NOTE(review): break presumably assumes that the distance
                # only grows for the remaining acceptor sites -- confirm
                break
            if distance < (kwargs['min_tinyexon_nt_length'] * 2):
                continue

            # keep only tinyexon pairs of exactly this length, with introns
            # of at least the minimal length on either flank and with phases
            # that fit the current donor and acceptor site
            filtered_tinyexoncombis = _filter_tinyexoncombis(
                tinyexoncombis,
                min_length=distance,
                max_length=distance,
                min_first_acceptor_pos=dObj.pos +
                kwargs['min_tinyexon_intron_nt_length'],
                max_final_donor_pos=aObj.pos -
                kwargs['min_tinyexon_intron_nt_length'],
                phase_final_donor=aObj.phase,
                phase_first_acceptor=dObj.phase,
            )

            if not filtered_tinyexoncombis: continue

            ####################################################################
            if verbose:
                print distance, dObj, aObj, len(tinyexoncombis),
                print len(filtered_tinyexoncombis)
            ####################################################################

            for exon1, intron, exon2 in filtered_tinyexoncombis:
                # make preceding intron
                preceding_intron = IntronConnectingOrfs(
                    dObj, exon1.acceptor, None, donorOrf, exon1.orf)

                # make subsequent intron
                subsequent_intron = IntronConnectingOrfs(
                    exon2.donor, aObj, None, exon2.orf, accepOrf)

                ################################################################
                if verbose:
                    print "\t", exon1, exon1.proteinsequence(),
                    print preceding_intron.phase, exon1.donor.phase,
                    print subsequent_intron.phase, preceding_intron.shared_aa,
                    print intron.shared_aa, subsequent_intron.shared_aa
                    print "\t", exon2, exon2.proteinsequence()
                ################################################################

                # get prjctOrf sequence for comparison
                correctionA = 0
                if aObj.phase != 0:
                    # INCLUDE the final AA which is broken by the splicesite
                    correctionA = 1
                # start/stop are AA coordinates on the projected sequence.
                # The dnaposition_xxx() calls repeat the lookups that
                # produced dPos / aPos in the enclosing loops
                if queryorsbjct == "query":
                    startPos, _phase = pacbporfD.dnaposition_query(
                        dObj.pos, forced_return=True)
                    stopPos, _phase = pacbporfA.dnaposition_query(
                        aObj.pos, forced_return=True)
                    start = pacbporfD._positions[startPos].sbjct_pos
                    stop = pacbporfA._positions[stopPos].sbjct_pos + correctionA
                else:
                    startPos, _phase = pacbporfD.dnaposition_sbjct(
                        dObj.pos, forced_return=True)
                    stopPos, _phase = pacbporfA.dnaposition_sbjct(
                        aObj.pos, forced_return=True)
                    start = pacbporfD._positions[startPos].query_pos
                    stop = pacbporfA._positions[stopPos].query_pos + correctionA

                if stop <= start:
                    # tinyexon is so tiny that is does not have a single
                    # full aligned AA -> discard here
                    continue

                # actually get the prjctOrf sequence
                aaseq = prjctOrf.getaas(abs_pos_start=start, abs_pos_end=stop)

                # initialize a PacbP for the combination of both tinyexons
                # afterwards, check if the identityscore is > 0.XX
                from pacb import PacbP
                # protein sequence of the tinyexon side, in order:
                # [0] AA shared with the preceding intron, [1] exon1,
                # [2] AA shared with the central intron,   [3] exon2,
                # [4] AA shared with the subsequent intron
                seqparts = [
                    preceding_intron.shared_aa,
                    exon1.proteinsequence(), intron.shared_aa,
                    exon2.proteinsequence(), subsequent_intron.shared_aa
                ]

                ################################################################
                # diagnostics; printed as well when NOT verbose in case both
                # protein sequences differ in length
                if verbose or len("".join(seqparts)) != len(aaseq):
                    print pacbporfD
                    print exon1.orf, exon2.orf, prjctOrf
                    print pacbporfA
                    print seqparts
                    print aaseq, len(aaseq), len("".join(seqparts)), (start,
                                                                      stop)
                    print "'%s'" % queryorsbjct,
                    print "Q", (algDobj.query_pos, algAobj.query_pos),
                    print "S", (algDobj.sbjct_pos, algAobj.sbjct_pos)
                    print "distance:", distance, kwargs[
                        'max_tinyexon_nt_length'],
                    print(posDsbjct, posAsbjct),
                    print "Q-dna:", (algDobj.query_dna_start, dPhase,
                                     algAobj.query_dna_start, aPhase),
                    print "S-dna:", (algDobj.sbjct_dna_start, dPhase,
                                     algAobj.sbjct_dna_start, aPhase)
                ################################################################

                # ignore by continue when sequences not identical in length
                if len("".join(seqparts)) != len(aaseq): continue

                # ungapped test alignment of the tinyexon protein sequence
                # versus the projected protein sequence
                testpacbp = PacbP(input=("".join(seqparts), aaseq, 0, 0))
                testpacbp.strip_unmatched_ends()

                # require > 0.60 identityscore and that > 70% of the projected
                # sequence is still covered after stripping unmatched ends
                if not ( testpacbp.identityscore > 0.60 and\
                (float(testpacbp.length) / len(aaseq)) > 0.70 ):
                    # not a very convincing alignment
                    continue

                ################################################################
                if verbose:
                    print testpacbp
                    testpacbp.print_protein()
                ################################################################

                # if here, successfully mapped 2 tiny exons!!
                # get all sequences/coordinates in place for
                # pacbporf formation
                # (xxxQ variables describe the tinyexon side and xxxS the
                #  projected side; swapped further below for 'sbjct')
                orfQ1 = exon1.orf
                orfS1 = prjctOrf
                orfQ2 = exon2.orf
                orfS2 = prjctOrf
                seqQ1 = exon1.proteinsequence()
                seqQ2 = exon2.proteinsequence()
                # nt -> AA coordinate; integer division under Python 2,
                # assuming acceptor.pos is an integer
                coordQ1 = exon1.acceptor.pos / 3
                coordS1 = start
                coordQ2 = exon2.acceptor.pos / 3
                # projected AA coordinate where the exon2 part starts
                coordS2 = start + len(seqparts[0]) + len(seqparts[1]) + len(
                    seqparts[2])
                # slices of the projected sequence opposite exon1 / exon2,
                # still including the AAs shared with the flanking introns.
                # NOTE(review): when seqparts[3] and seqparts[4] are both
                # empty, the aaseq[-0:] slice returns the complete aaseq
                seqS1 = aaseq[0:(len(seqparts[0]) + len(seqparts[1]))]
                seqS2 = aaseq[-(len(seqparts[3]) + len(seqparts[4])):]
                # the shared AAs are not part of seqQ1 / seqQ2; trim the
                # projected slices accordingly (presumably shared_aa holds at
                # most a single residue -- confirm)
                if len(seqparts[0]):
                    seqS1 = seqS1[1:]
                    coordS1 += 1
                if len(seqparts[4]):
                    seqS2 = seqS2[:-1]

                if queryorsbjct == "sbjct":
                    # swap query <-> sbjct
                    orfQ1, orfS1 = orfS1, orfQ1
                    orfQ2, orfS2 = orfS2, orfQ2
                    seqQ1, seqS1 = seqS1, seqQ1
                    seqQ2, seqS2 = seqS2, seqQ2
                    coordQ1, coordS1 = coordS1, coordQ1
                    coordQ2, coordS2 = coordS2, coordQ2

                ################################################################
                # NOTE(review): the labels are misleading; the first line
                # prints the query data of BOTH tinyexons, the second line
                # the sbjct data of both
                if verbose:
                    print "tinypacbporf1:", seqQ1, seqQ2, coordQ1, coordQ2
                    print "tinypacbporf2:", seqS1, seqS2, coordS1, coordS2
                ################################################################

                # make pacbporfs
                pacbp1 = PacbP(input=(seqQ1, seqS1, coordQ1, coordS1))
                pacbp1.strip_unmatched_ends()
                tinypacbporf1 = pacbp2pacbporf(pacbp1, orfQ1, orfS1)
                tinypacbporf1.extend_pacbporf_after_stops()
                pacbp2 = PacbP(input=(seqQ2, seqS2, coordQ2, coordS2))
                pacbp2.strip_unmatched_ends()
                tinypacbporf2 = pacbp2pacbporf(pacbp2, orfQ2, orfS2)
                tinypacbporf2.extend_pacbporf_after_stops()

                ################################################################
                if verbose:
                    print tinypacbporf1
                    tinypacbporf1.print_protein_and_dna()
                    print tinypacbporf2
                    tinypacbporf2.print_protein_and_dna()
                ################################################################

                ################################################################
                # set some meta-data properties to the intron objects
                ################################################################
                # add distance score to intron
                preceding_intron._distance = 0
                intron._distance = 0
                subsequent_intron._distance = 0

                # add Alignment Positional Periphery Score into objects
                # (the return value `succes` is not inspected)
                if queryorsbjct == "query":
                    succes = set_apps_intron_query(preceding_intron, pacbporfD,
                                                   tinypacbporf1)
                    succes = set_apps_intron_query(intron, tinypacbporf1,
                                                   tinypacbporf2)
                    succes = set_apps_intron_query(subsequent_intron,
                                                   tinypacbporf2, pacbporfA)
                else:
                    succes = set_apps_intron_sbjct(preceding_intron, pacbporfD,
                                                   tinypacbporf1)
                    succes = set_apps_intron_sbjct(intron, tinypacbporf1,
                                                   tinypacbporf2)
                    succes = set_apps_intron_sbjct(subsequent_intron,
                                                   tinypacbporf2, pacbporfA)

                # set GFF fsource attribute for recognition of intron sources
                preceding_intron._gff['fsource'] = "ABGPprojectingTE"
                intron._gff['fsource'] = "ABGPprojectingTE"
                subsequent_intron._gff['fsource'] = "ABGPprojectingTE"

                # create _linked_to_xxx attributes: every intron refers to
                # both tiny PacbPORFs and to the two other introns
                preceding_intron._linked_to_pacbporfs = [
                    tinypacbporf1, tinypacbporf2
                ]
                intron._linked_to_pacbporfs = [tinypacbporf1, tinypacbporf2]
                subsequent_intron._linked_to_pacbporfs = [
                    tinypacbporf1, tinypacbporf2
                ]
                preceding_intron._linked_to_introns = [
                    intron, subsequent_intron
                ]
                intron._linked_to_introns = [
                    preceding_intron, subsequent_intron
                ]
                subsequent_intron._linked_to_introns = [
                    intron, preceding_intron
                ]

                ################################################################
                # append to results
                ################################################################
                results.append((
                    preceding_intron,
                    intron,
                    subsequent_intron,
                    tinypacbporf1,
                    tinypacbporf2,
                ))

    # return 3 introns and 2 intermediate tinyexon PacbPORFs (per row)
    return results
Esempio n. 19
0
def merge_pacbporfs_with_query_intron_bridgeing(pacbporfD,
                                                pacbporfA,
                                                verbose=False,
                                                **kwargs):
    """
    Merge query Orfs in PacbPORF by **best** intron

    @attention: see orfs.merge_orfs_with_intron for **kwargs

    @type  pacbporfD: PacbPORF object
    @param pacbporfD: PacbPORF object that has to deliver PSSM donor objects

    @type  pacbporfA: PacbPORF object
    @param pacbporfA: PacbPORF object that has to deliver PSSM acceptor objects

    @type  verbose: Boolean
    @param verbose: print status/debugging messages to STDOUT

    @rtype:  list
    @return: list with ( intron, intron ), in query and sbjct
    """
    # input validation
    IsPacbPORF(pacbporfD)
    IsPacbPORF(pacbporfA)

    # edit **kwargs dictionary for some forced attributes
    _update_kwargs(kwargs, KWARGS_MAPPED_INTRON)
    if not kwargs.has_key('aligned_site_max_triplet_distance'):
        kwargs['aligned_site_max_triplet_distance'] = kwargs['max_aa_offset']

    # calculate maximal/minimal donor/acceptor site position based on alignment
    ELEGIABLE_SPLICE_SITE_AA_RANGE = 75

    qdr = pacbporfD.alignment_dna_range_query()
    qar = pacbporfA.alignment_dna_range_query()
    min_donor_query_pos = max(
        [min(qdr), max(qdr) - (ELEGIABLE_SPLICE_SITE_AA_RANGE * 3)])
    max_accep_query_pos = min(
        [max(qar), min(qar) + (ELEGIABLE_SPLICE_SITE_AA_RANGE * 3)])

    # get list of introns
    intronlist = merge_orfs_with_intron(pacbporfD.orfQ,
                                        pacbporfA.orfQ,
                                        min_donor_pos=min_donor_query_pos,
                                        max_acceptor_pos=max_accep_query_pos,
                                        **kwargs)

    # filter on entropy
    # settings for minimal alignment entropy score
    if min([pacbporfD.identityscore, pacbporfA.identityscore]) > 0.55:
        min_donor_site_entropy = 0.01
        min_acceptor_site_entropy = 0.01
        intronlist = _filter_introns_on_entropy(
            intronlist,
            pacbporfD,
            pacbporfA,
            min_donor_site_entropy=min_donor_site_entropy,
            min_acceptor_site_entropy=min_acceptor_site_entropy)
    else:
        # do not filter, but do not forget to store apps data to intron(s)
        for intron in intronlist:
            succes = set_apps_intron_query(intron, pacbporfD, pacbporfA)

    for intron in intronlist:
        intron._distance = 0  # ??
        # set GFF fsource attribute for recognition of intron sources
        intron._gff['fsource'] = 'ABGPbridgeing'

    # get unique list of donors & acceptors
    donor = olba(list(Set([intron.donor for intron in intronlist])),
                 order_by='pos')
    accep = olba(list(Set([intron.acceptor for intron in intronlist])),
                 order_by='pos')

    ############################################################################
    if verbose:
        print "dQ1", [d.pos for d in donor], "aQ1", [a.pos for a in accep]
    ############################################################################

    intronlist = _filter_introns_on_pssm_entropy_combination(intronlist)

    # get unique list of donors & acceptors
    donor = olba(list(Set([intron.donor for intron in intronlist])),
                 order_by='pos')
    accep = olba(list(Set([intron.acceptor for intron in intronlist])),
                 order_by='pos')

    ############################################################################
    if verbose:
        print "dQ1", [d.pos for d in donor], "aQ1", [a.pos for a in accep]
    ############################################################################

    filtered_intron_list = []
    for intron in intronlist:
        intron.assign_bp_and_ppts()
        if intron.branchpoint and (intron.ppt5p or intron.ppt3p):
            filtered_intron_list.append(intron)
        else:
            pass

    # check if list is emptied due to branchpoint filtering
    # in that case, filter for either branchpoint OR polyppt
    if not filtered_intron_list and intronlist:
        for intron in intronlist:
            if intron.branchpoint or (intron.ppt5p or intron.ppt3p):
                filtered_intron_list.append(intron)

    # return list of filtered introns
    return filtered_intron_list
Esempio n. 20
0
def merge_pacbporfs_with_introns(pacbporfD,pacbporfA,verbose=False,**kwargs):
    """
    Merge 2 PacbPORF objects by introns

    @attention: see orfs.merge_orfs_with_intron for **kwargs
    @attention: see functions._filter_for_alignable_splice_sites for **kwargs
    @attention: see functions._filter_for_entropy for **kwargs

    @type  pacbporfD: PacbPORF object
    @param pacbporfD: PacbPORF object that has to deliver PSSM donor objects

    @type  pacbporfA: PacbPORF object
    @param pacbporfA: PacbPORF object that has to deliver PSSM acceptor objects

    @type  verbose: Boolean
    @param verbose: print status/debugging messages to STDOUT

    @rtype:  list
    @return: list with ( intron, intron ), in query and sbjct
    """
    # input validation
    IsPacbPORF(pacbporfD)
    IsPacbPORF(pacbporfA)

    # edit **kwargs dictionary for some forced attributes
    _update_kwargs(kwargs,KWARGS_MAPPED_INTRON)
    if not kwargs.has_key('aligned_site_max_triplet_distance'):
        kwargs['aligned_site_max_triplet_distance'] = kwargs['max_aa_offset']

    # settings for minimal alignment entropy score
    min_donor_site_alignment_entropy = 0.0
    min_acceptor_site_alignment_entropy = 0.0

    # calculate maximal/minimal donor/acceptor site position based on alignment
    ELEGIABLE_SPLICE_SITE_AA_RANGE = 75

    qdr = pacbporfD.alignment_dna_range_query()
    qar = pacbporfA.alignment_dna_range_query()
    min_donor_query_pos = max([ min(qdr), max(qdr)-(ELEGIABLE_SPLICE_SITE_AA_RANGE*3) ])
    max_accep_query_pos = min([ max(qar), min(qar)+(ELEGIABLE_SPLICE_SITE_AA_RANGE*3) ])

    sdr = pacbporfD.alignment_dna_range_sbjct()
    sar = pacbporfA.alignment_dna_range_sbjct()
    min_donor_sbjct_pos = max([ min(sdr), max(sdr)-(ELEGIABLE_SPLICE_SITE_AA_RANGE*3) ])
    max_accep_sbjct_pos = min([ max(sar), min(sar)+(ELEGIABLE_SPLICE_SITE_AA_RANGE*3) ])

    # get list of introns
    #intronsQ = merge_orfs_with_intron(pacbporfD.orfQ,pacbporfA.orfQ,
    #        min_donor_pos   =min_donor_query_pos,
    #        max_acceptor_pos=max_accep_query_pos,**kwargs)
    #intronsS = merge_orfs_with_intron(pacbporfD.orfS,pacbporfA.orfS,
    #        min_donor_pos   =min_donor_sbjct_pos,
    #        max_acceptor_pos=max_accep_sbjct_pos,**kwargs)

    # get list of introns
    intronsQ = merge_orfs_with_intron(pacbporfD.orfQ,pacbporfA.orfQ,**kwargs)
    intronsS = merge_orfs_with_intron(pacbporfD.orfS,pacbporfA.orfS,**kwargs)

    # get unique list of donors & acceptors
    donorQ = olba( list(Set([inQ.donor for inQ in intronsQ ])), order_by='pos')
    donorS = olba( list(Set([inS.donor for inS in intronsS ])), order_by='pos')
    accepQ = olba( list(Set([inQ.acceptor for inQ in intronsQ ])), order_by='pos')
    accepS = olba( list(Set([inS.acceptor for inS in intronsS ])), order_by='pos')

    ############################################################################
    if verbose:
        print "dQ1", [ d.pos for d in donorQ ], "aQ1", [ a.pos for a in accepQ ]
        print "dS1", [ d.pos for d in donorS ], "aS1", [ a.pos for a in accepS ]
    ############################################################################

    # filter for alignable donor & acceptor sites
    kwargs['allow_non_canonical'] = kwargs['allow_non_canonical_donor']
    algdonors = _filter_for_alignable_splice_sites(donorQ,donorS,pacbporfD,**kwargs)
    kwargs['allow_non_canonical'] = kwargs['allow_non_canonical_acceptor']
    algacceps = _filter_for_alignable_splice_sites(accepQ,accepS,pacbporfA,**kwargs)

    ############################################################################
    if verbose:
        print "dQ2", [ _dq.pos for (_dq,_ds) in algdonors ],
        print "aQ2", [ _aq.pos for (_aq,_as) in algacceps ]
        print "dS2", [ _ds.pos for (_dq,_ds) in algdonors ],
        print "aS2", [ _as.pos for (_aq,_as) in algacceps ]
    ############################################################################

    # remove sites with to low alignment entropy
    algdonors = _filter_for_entropy(algdonors,pacbporfD,'donor',
                min_alignment_entropy=min_donor_site_alignment_entropy)
    algacceps = _filter_for_entropy(algacceps,pacbporfA,'acceptor',
                min_alignment_entropy=min_acceptor_site_alignment_entropy)

    ############################################################################
    if verbose:
        print "dQ3", [ _dq.pos for (_dq,_ds) in algdonors ],
        print "aQ3", [ _aq.pos for (_aq,_as) in algacceps ]
        print "dS3", [ _ds.pos for (_dq,_ds) in algdonors ],
        print "aS3", [ _as.pos for (_aq,_as) in algacceps ]
    ############################################################################


    # make unique position lists for quick lookup in intron lists
    dQpl = Set([ dQ.pos for dQ,dS in algdonors ])
    dSpl = Set([ dS.pos for dQ,dS in algdonors ])
    aQpl = Set([ aQ.pos for aQ,aS in algacceps ])
    aSpl = Set([ aS.pos for aQ,aS in algacceps ])

    # check exterior boundaries of PacbPORFs
    sposD = pacbporfD._get_original_alignment_pos_start()
    eposD = pacbporfD._get_original_alignment_pos_end()
    sposA = pacbporfA._get_original_alignment_pos_start()
    eposA = pacbporfA._get_original_alignment_pos_end()

    # now make list of aligable introns
    algintrons = []
    for intQ in intronsQ:
        # check if intron falls within the PacbPORF aligned area
        if intQ.donor.pos <= sposD.query_dna_start: continue
        if intQ.acceptor.pos >= eposA.query_dna_end: continue
        if intQ.donor.pos in dQpl and intQ.acceptor.pos in aQpl:
            # Query intron occurs in list of alignable splice sites!
            for intS in intronsS:
                # check if intron falls within the PacbPORF aligned area
                if intS.donor.pos <= sposD.sbjct_dna_start: continue
                if intS.acceptor.pos >= eposA.sbjct_dna_end: continue
                if intS.donor.pos in dSpl and intS.acceptor.pos in aSpl:
                    # Sbjct intron occurs as well in alignable splice sites!
                    if (intQ.donor,intS.donor) in algdonors and\
                    (intQ.acceptor,intS.acceptor) in algacceps:
                        # Sbjct & Query Donor & Acceptor are alignable!
                        algintrons.append( ( intQ, intS ) )

    ############################################################################
    # set some meta-data properties to the intron objects
    ############################################################################
    for intQ,intS in algintrons:
        distDnt = pacbporfD.get_distance_aligned_nucleotide_positions(
                        query = intQ.donor.pos, sbjct = intS.donor.pos
                        )
        distAnt = pacbporfA.get_distance_aligned_nucleotide_positions(
                        query = intQ.acceptor.pos, sbjct = intS.acceptor.pos
                        )

        # final distance check. kwargs['aligned_site_max_triplet_distance']
        # is applied on donor and acceptor site. This distance measured on the
        # protein sequence can be DOUBLED in case distDnt / distAnt are
        # opposite (+ and -). Check here if the protein sequence gap is
        # as well <= kwargs['aligned_site_max_triplet_distance'].
        if abs(distAnt - distDnt) > kwargs['aligned_site_max_triplet_distance']*3:
            continue

        # add distance score to introns
        intQ._distance = abs(distDnt) + abs(distAnt)
        intS._distance = abs(distDnt) + abs(distAnt)

        # add Alignment Positional Periphery Score into objects
        succes = set_apps_intron_query(intQ,pacbporfD,pacbporfA)
        succes = set_apps_intron_sbjct(intS,pacbporfD,pacbporfA)

        # set GFF fsource attribute for recognition of intron sources
        intQ._gff['fsource'] = "ABGPmapping"
        intS._gff['fsource'] = "ABGPmapping"

        ########################################################################
        if verbose:
            # some printing....
            print "Aligned introns:", ( intQ.donor.pos, intQ.acceptor.pos ) ,
            print ( intS.donor.pos, intS.acceptor.pos ),
            print "DIST:", distDnt, distAnt,
            print "[%s]" % kwargs['aligned_site_max_triplet_distance'],
            print "ENTROPY: %1.2f %1.2f" % (intQ._apps_donor, intQ._apps_accep),
            print "PSSM: (%1.2f %1.2f) (%1.2f %1.2f)" % (
                intQ.donor.pssm_score, intS.donor.pssm_score,
                intQ.acceptor.pssm_score, intS.acceptor.pssm_score,
                )
        ########################################################################

    # return lists of aligned introns
    return algintrons
Esempio n. 21
0
def merge_pacbporfs_by_tinyexons(pacbporfD,pacbporfA,
    orfSetObjQ,orfSetObjS,verbose=False,**kwargs):
    """ """
    # input validation
    IsPacbPORF(pacbporfD)
    IsPacbPORF(pacbporfA)

    # edit **kwargs dictionary for some forced attributes
    _update_kwargs(kwargs,KWARGS_MAPPED_INTRON)
    if not kwargs.has_key('aligned_site_max_triplet_distance'):
        kwargs['aligned_site_max_triplet_distance'] = kwargs['max_aa_offset']

    # settings for minimal alignment entropy score
    min_donor_site_alignment_entropy = 0.0
    min_acceptor_site_alignment_entropy = 0.0

    resultlistQ = merge_orfs_with_tinyexon(
            pacbporfD.orfQ,pacbporfA.orfQ,
            preceding_donor_sites=pacbporfD.orfQ._donor_sites,
            subsequent_acceptor_sites=pacbporfA.orfQ._acceptor_sites,
            orflist=orfSetObjQ.orfs,**kwargs)
    resultlistS = merge_orfs_with_tinyexon(
            pacbporfD.orfS,pacbporfA.orfS,
            preceding_donor_sites=pacbporfD.orfS._donor_sites,
            subsequent_acceptor_sites=pacbporfA.orfS._acceptor_sites,
            orflist=orfSetObjS.orfs,**kwargs)

    # translate resultlists to dict: key == exon, value = [ {intronsD},{intronsS} ]
    resultdictQ,key2exonQ = _tinyexon_list_2_dict(resultlistQ)
    resultdictS,key2exonS = _tinyexon_list_2_dict(resultlistS)

    # get unique list of donors & acceptors
    donorQ = olba( list(Set([inD.donor for inD,te,inA in resultlistQ ])), order_by='pos')
    donorS = olba( list(Set([inD.donor for inD,te,inA in resultlistS ])), order_by='pos')
    accepQ = olba( list(Set([inA.acceptor for inD,te,inA in resultlistQ ])), order_by='pos')
    accepS = olba( list(Set([inA.acceptor for inD,te,inA in resultlistS ])), order_by='pos')

    ## filter for alignable donor & acceptor sites
    kwargs['allow_non_canonical']               = True # True
    kwargs['aligned_site_max_triplet_distance'] = 0     # 2
    algdonors = _filter_for_alignable_splice_sites(donorQ,donorS,pacbporfD,**kwargs)
    algacceps = _filter_for_alignable_splice_sites(accepQ,accepS,pacbporfA,**kwargs)

    # settings for minimal alignment entropy score
    # TODO TODO -> THIS MUST BE FIXED TO A NICE THRESHOLD VALUE!!!
    min_donor_site_alignment_entropy = 0.1
    min_acceptor_site_alignment_entropy = 0.1


    # remove sites with to low alignment entropy
    algdonors = _filter_for_entropy(algdonors,pacbporfD,'donor',
                min_alignment_entropy=min_donor_site_alignment_entropy)
    algacceps = _filter_for_entropy(algacceps,pacbporfA,'acceptor',
                min_alignment_entropy=min_acceptor_site_alignment_entropy)

    # return list: intronQD,intronSD,tinyexon,intronAQ,intronAS
    return_list = []

    ############################################################################
    if verbose:
        print "bridges constructed: ORFS:",
        print (pacbporfD.orfQ.id,pacbporfA.orfQ.id),
        print (pacbporfD.orfS.id,pacbporfA.orfS.id),
        print len(resultdictQ), len(resultdictS),
        print ( len(resultlistQ), len(donorQ), len(accepQ) ),
        print ( len(resultlistS), len(donorS), len(accepS) ),
        print ( len(algdonors), len(algacceps) )
    ############################################################################

    for keyQ,tinyexonQ in key2exonQ.iteritems():
        for keyS,tinyexonS in key2exonS.iteritems():
            if tinyexonQ.donor.phase != tinyexonS.donor.phase:
                continue
            if tinyexonQ.acceptor.phase != tinyexonS.acceptor.phase:
                continue
            if tinyexonQ.length != tinyexonS.length:
                continue
            # if here, then tinyexons of identical structure


            ####################################################################
            if verbose:
                print tinyexonQ.length, tinyexonQ.donor.phase,
                print ( len(resultdictQ[keyQ][0]), len(resultdictQ[keyQ][1]) ),
                print ( len(resultdictS[keyS][0]), len(resultdictS[keyS][1]) ),
                print tinyexonQ,
                print tinyexonQ.proteinsequence(), tinyexonS.proteinsequence(),
                print tinyexonS.acceptor.pssm_score + tinyexonS.donor.pssm_score
            ####################################################################

            donor_introns = []
            acceptor_introns = []
            for intronDQkey, intronDQ in resultdictQ[keyQ][0].iteritems():
                if intronDQ.donor.pos not in [ dQ.pos for dQ,dS in algdonors ]:
                    continue
                for intronDSkey, intronDS in resultdictS[keyS][0].iteritems():
                    if intronDS.donor.pos not in [ dS.pos for dQ,dS in algdonors ]:
                        continue
                    # check if they exists as aligned sites
                    alignedkey = ( intronDQ.donor.pos, intronDS.donor.pos )
                    if alignedkey not in [ (dQ.pos, dS.pos) for dQ,dS in algdonors ]:
                        continue
                    # if here, we have a set of introns 5' of the tinyexon
                    # which are perfectly alignable!
                    donor_introns.append((intronDQ,intronDS))

            for intronAQkey, intronAQ in resultdictQ[keyQ][1].iteritems():
                if intronAQ.acceptor.pos not in [ aQ.pos for aQ,aS in algacceps ]:
                    continue
                for intronASkey, intronAS in resultdictS[keyS][1].iteritems():
                    if intronAS.acceptor.pos not in [ aS.pos for aQ,aS in algacceps ]:
                        continue
                    # check if they exists as aligned sites
                    alignedkey = ( intronAQ.acceptor.pos, intronAS.acceptor.pos )
                    if alignedkey not in [ (aQ.pos, aS.pos) for aQ,aS in algacceps ]:
                        continue
                    # if here, we have a set of introns 3' of the tinyexon
                    # which are perfectly alignable!
                    acceptor_introns.append((intronAQ,intronAS))

            if not len(donor_introns) or not len(acceptor_introns):
                # no aligned 5' && aligned 3' introns
                continue

            # initialize extended tinyexon PacbPORF
            from pacb import PacbP
            pacbp = PacbP(input=( 
                    tinyexonQ.proteinsequence(),
                    tinyexonS.proteinsequence(),
                    tinyexonQ.protein_start(),
                    tinyexonS.protein_start(),
                    ) )
            pacbp.strip_unmatched_ends()
            # continue if no fraction could be aligned
            if len(pacbp) == 0: continue
            tinypacbporf = pacbp2pacbporf(pacbp,tinyexonQ.orf,tinyexonS.orf)
            tinypacbporf.extend_pacbporf_after_stops()

            ####################################################################
            if verbose:
                print tinypacbporf
                tinypacbporf.print_protein_and_dna()
                print len(donor_introns), len(acceptor_introns),
                print max([ dQ.donor.pssm_score+dS.donor.pssm_score for dQ,dS in donor_introns]),
                print max([ aQ.acceptor.pssm_score+aS.acceptor.pssm_score for aQ,aS in acceptor_introns])
            ####################################################################


            # if here, we have accepted tinyexon bridges!
            # gather them and store to return_list
            for intronDQkey, intronDQ in resultdictQ[keyQ][0].iteritems():
                if intronDQ.donor.pos not in [ dQ.pos for dQ,dS in algdonors ]:
                    continue
                for intronDSkey, intronDS in resultdictS[keyS][0].iteritems():
                    if intronDS.donor.pos not in [ dS.pos for dQ,dS in algdonors ]:
                        continue
                    for intronAQkey, intronAQ in resultdictQ[keyQ][1].iteritems():
                        if intronAQ.acceptor.pos not in [ aQ.pos for aQ,aS in algacceps ]:
                            continue
                        for intronASkey, intronAS in resultdictS[keyS][1].iteritems():
                            if intronAS.acceptor.pos not in [ aS.pos for aQ,aS in algacceps ]:
                                continue
                            ####################################################
                            # set some meta-data properties to the intron objects
                            ####################################################
                            _score_introns_obtained_by_mapping(
                                    intronDQ,intronDS,pacbporfD,
                                    tinypacbporf,source='ABGPmappingTE')
                            _score_introns_obtained_by_mapping(
                                    intronAQ,intronAS,tinypacbporf,
                                    pacbporfA,source='ABGPmappingTE')
                            # create _linked_to_xxx attributes
                            intronDQ._linked_to_pacbporfs = [ tinypacbporf ]
                            intronAQ._linked_to_pacbporfs = [ tinypacbporf ]
                            intronDS._linked_to_pacbporfs = [ tinypacbporf ]
                            intronAS._linked_to_pacbporfs = [ tinypacbporf ]
                            intronDQ._linked_to_introns   = [ intronAQ ]
                            intronAQ._linked_to_introns   = [ intronDQ ]
                            intronDS._linked_to_introns   = [ intronAS ]
                            intronAS._linked_to_introns   = [ intronDS ]
                            # append to tmp result list
                            return_list.append(
                                (intronDQ,intronDS,tinypacbporf,intronAQ,intronAS)
                                )

    # check if there are >1 candidate tiny exons
    # currently, we choose only to return the **best** mapped tinyexon 
    if len(return_list) == 0:
        pass
    elif len(return_list) == 1:
        pass
    else:
        # only take the highest scoring candidate here 
        min_distance = min([ (a._distance+d._distance) for a,b,c,d,e in return_list ])
        pos2score = []
        for (intronDQ,intronDS,tinypacbporf,intronAQ,intronAS) in return_list:
            if (intronDQ._distance + intronAQ._distance) > min_distance:
                pos2score.append( 0.0 )
            else:
                # calculate overall pssm score
                total_pssm = 0.0
                total_pssm += intronDQ.donor.pssm_score
                total_pssm += intronDQ.acceptor.pssm_score
                total_pssm += intronDS.donor.pssm_score
                total_pssm += intronDS.acceptor.pssm_score
                total_pssm += intronAQ.donor.pssm_score
                total_pssm += intronAQ.acceptor.pssm_score
                total_pssm += intronAS.donor.pssm_score
                total_pssm += intronAS.acceptor.pssm_score
                pos2score.append( total_pssm )
        # get highest score and linked tinyexon
        max_score = max(pos2score)
        return_list = [ return_list[pos2score.index(max_score)] ]

    ############################################################################
    # some printing in verbose mode
    if verbose and return_list:
        (intronDQ,intronDS,tinypacbporf,intronAQ,intronAS) = return_list[0]
        print "BEST MAPPED TINYEXON:"
        print tinypacbporf
        print tinypacbporf.query, intronDQ._distance, intronAQ._distance,
        print ( intronDQ.donor.pos, intronDQ.acceptor.pos ),
        print ( intronDS.donor.pos, intronDS.acceptor.pos ),
        print ( intronAQ.donor.pos, intronAQ.acceptor.pos ),
        print ( intronAS.donor.pos, intronAS.acceptor.pos )
    ############################################################################

    # return the result list
    return return_list
Esempio n. 22
0
def merge_pacbporfs_with_closeby_independant_introns(pacbporfD,pacbporfA,
    verbose=False,**kwargs):
    """
    Merge 2 PacbPORF objects by closeby independant gained introns

    @attention: see pacb.connecting.merge_orfs_with_intron for **kwargs)

    @type  pacbporfD: PacbPORF object
    @param pacbporfD: PacbPORF object that has to deliver PSSM donor objects

    @type  pacbporfA: PacbPORF object
    @param pacbporfA: PacbPORF object that has to deliver PSSM acceptor objects

    @type  verbose: Boolean
    @param verbose: print status/debugging messages to STDOUT

    @rtype:  list
    @return: list with ( intronQ, intronS, CIGexonPacbPORF )
    """
    # input validation
    IsPacbPORF(pacbporfD)
    IsPacbPORF(pacbporfA)

    # edit **kwargs dictionary for some forced attributes
    kwargs['allow_phase_shift'] = True
    _update_kwargs(kwargs,KWARGS_CLOSEBY_INDEPENDANT_INTRON_GAIN)
    if not kwargs.has_key('aligned_site_max_triplet_distance'):
        kwargs['aligned_site_max_triplet_distance'] = kwargs['cig_max_aa_length']

    # run regular merge_pacbporfs_with_introns function
    alg_introns = merge_pacbporfs_with_introns(pacbporfD,pacbporfA,verbose=verbose,**kwargs)
    cig_introns = []

    if verbose:
        print "introns::", len(alg_introns), "cig_max_aa_length:", kwargs['cig_max_aa_length'], kwargs['aligned_site_max_triplet_distance']

    # check if there is length congruence between the cig_introns
    for intQ,intS in alg_introns:
        dQpos, dQphase = pacbporfD.dnaposition_query(intQ.donor.pos,forced_return=True)
        dSpos, dSphase = pacbporfD.dnaposition_sbjct(intS.donor.pos,forced_return=True)
        aQpos, aQphase = pacbporfA.dnaposition_query(intQ.acceptor.pos,forced_return=True)
        aSpos, aSphase = pacbporfA.dnaposition_sbjct(intS.acceptor.pos,forced_return=True)
        distDnt = (dQpos*3 + dQphase) - (dSpos*3 + dSphase)
        distAnt = (aQpos*3 + aQphase) - (aSpos*3 + aSphase)
        ########################################################################
        if verbose:
            print (intQ.donor.pos, intQ.acceptor.pos),
            print (intS.donor.pos, intS.acceptor.pos),
            print distDnt, distAnt, kwargs['max_nt_offset']
        ########################################################################
        if abs(distDnt-distAnt) > kwargs['max_nt_offset']:
            # intermediate ciigPacbPORF has query vs sbjct length discrepancy
            # *3 for AA2nt coordinate conversion, +2 to allow different phases
            # e.g. phase difference can give 1AA+2nt difference
            continue
        if intQ.donor.phase == intS.donor.phase and\
        (distDnt/3) <= kwargs['aligned_site_max_triplet_distance']:
            # a regularly merged intron combination
            continue
        if intQ.acceptor.phase == intS.acceptor.phase and\
        (distAnt/3) <= kwargs['aligned_site_max_triplet_distance']:
            # a regularly merged intron combination
            continue
        if abs(distDnt) <= 5 or abs(distDnt) <= 5:
            # most likely a splice site phase shift, not a c.i.g.
            continue

        if abs(distDnt/3) >= kwargs['cig_min_aa_length'] and\
        abs(distAnt/3) >= kwargs['cig_min_aa_length'] and\
        abs(distDnt/3) <= kwargs['cig_max_aa_length'] and\
        abs(distAnt/3) <= kwargs['cig_max_aa_length']:
            # putatively a closeby independant (intron) gain
            cig_introns.append( ( intQ, intS ) )

    ############################################################################
    if verbose:
        for intQ,intS in cig_introns:
            print "cig?:", (intQ.donor.pos, intQ.acceptor.pos),
            print (intS.donor.pos, intS.acceptor.pos)
    ############################################################################


    # return variable to store found positive cases of CIG into
    found_cig_list = []

    # check if there is some sequence similarity
    for intQ,intS in cig_introns:
        # get alignment positions around query & sbjcts splice sites
        dQpos, dQphase = pacbporfD.dnaposition_query(intQ.donor.pos,forced_return=True)
        dSpos, dSphase = pacbporfD.dnaposition_sbjct(intS.donor.pos,forced_return=True)
        aQpos, aQphase = pacbporfA.dnaposition_query(intQ.acceptor.pos,forced_return=True)
        aSpos, aSphase = pacbporfA.dnaposition_sbjct(intS.acceptor.pos,forced_return=True)
        distD = dQpos - dSpos
        distA = aQpos - aSpos
        distDnt = (dQpos*3 + dQphase) - (dSpos*3 + dSphase)
        distAnt = (aQpos*3 + aQphase) - (aSpos*3 + aSphase)

        if distDnt > 0:   # then, distAnt is as well > 0
            # QUERY is extended on the donor side
            #mode   = "SQ"
            #qStart = pacbporfD._positions[dSpos].query_pos
            #qEnd   = qStart + distD
            #sStart = pacbporfA._positions[aSpos].sbjct_pos
            #sEnd   = sStart + distD
            #qSeq = pacbporfD.orfQ.getaas(abs_pos_start=qStart,abs_pos_end=qEnd)
            #sSeq = pacbporfA.orfS.getaas(abs_pos_start=sStart,abs_pos_end=sEnd)
            mode  = "SQ"
            qEnd  = pacbporfD.orfQ.dnapos2aapos(intQ.donor.pos)
            qStart= qEnd - max([distA,distD])
            sStart= pacbporfA.orfS.dnapos2aapos(intS.acceptor.pos)
            sEnd  = sStart + max([distA,distD])
            qSeq  = pacbporfD.orfQ.getaas(abs_pos_start=qStart,abs_pos_end=qEnd)
            sSeq  = pacbporfA.orfS.getaas(abs_pos_start=sStart,abs_pos_end=sEnd)

        else: # distDnt and distAnt are < 0
            ## SBJCT is extended on the donor site
            #mode   = "QS"
            #qStart = pacbporfA._positions[aQpos].query_pos
            #qEnd   = qStart - distA
            #sStart = pacbporfD._positions[dQpos].sbjct_pos
            #sEnd   = sStart - distA
            #qSeq = pacbporfA.orfQ.getaas(abs_pos_start=qStart, abs_pos_end=qEnd)
            #sSeq = pacbporfD.orfS.getaas(abs_pos_start=sStart, abs_pos_end=sEnd)
            mode  = "QS"
            qStart= pacbporfA.orfQ.dnapos2aapos(intQ.acceptor.pos)
            qEnd  = qStart - min([distA,distD])
            sEnd  = pacbporfD.orfS.dnapos2aapos(intS.donor.pos)
            sStart= sEnd + min([distA,distD])
            qSeq  = pacbporfA.orfQ.getaas(abs_pos_start=qStart,abs_pos_end=qEnd)
            sSeq  = pacbporfD.orfS.getaas(abs_pos_start=sStart,abs_pos_end=sEnd)


        headerQ = "query_%s_%s_%s" % (qStart,qEnd,qSeq)
        headerS = "sbjct_%s_%s_%s" % (sStart,sEnd,sSeq)
        headerQ = headerQ[0:20] # truncate to prevent error
        headerS = headerS[0:20] # truncate to prevent error
        if verbose:
            print mode, (distD,distA), qSeq, sSeq, headerQ, headerS, distDnt, distAnt,
            print dQpos, aQpos, dSpos, aSpos
        if not qSeq: continue # superfluous check-doublecheck for sequence
        if not sSeq: continue # superfluous check-doublecheck for sequence

        ####################################################
        # make PacbPORF with ClustalW
        ####################################################
        # align the sequences with clustalw
        seqs = { headerQ: qSeq, headerS: sSeq }
        (alignedseqs,alignment) = clustalw(seqs=seqs)

        # make pacbp from clustalw alignment
        pacbp = pacbp_from_clustalw(
                    alignment=(
                            alignedseqs[headerQ],
                            alignment,
                            alignedseqs[headerS]
                            ),
                    coords=(qStart,qEnd,sStart,sEnd)
                    )

        if not pacbp: continue

        # strip unaligned fraction of this pacbp object, then check length
        pacbp.strip_unmatched_ends()

        if len(pacbp) < kwargs['cig_min_aa_length']:
            continue
        if len(pacbp) > kwargs['cig_max_aa_length']:
            continue

        if pacbp:
            # initialize extended tiny PacbPORF caused by c.i.g.
            if distDnt > 0:
                cig_pacbporf = pacbp2pacbporf(pacbp,pacbporfD.orfQ,pacbporfA.orfS)
            else:
                cig_pacbporf = pacbp2pacbporf(pacbp,pacbporfA.orfQ,pacbporfD.orfS)
            cig_pacbporf.extend_pacbporf_after_stops()
            ####################################################################
            if verbose:
                print pacbp, len(pacbp)
                print cig_pacbporf
                print "CIG:", intQ
                print "CIG:", intS
                print distD, distA, distDnt, distAnt
                cig_pacbporf.print_protein_and_dna()
            ####################################################################

            ####################################################################
            # set some meta-data properties to the intron objects
            ####################################################################


            # add distance score to introns
            # The distance set in merge_pacbporfs_with_introns is large;
            # it is the actual distance between the splice sites. In CIG,
            # the measure for distance is the length difference between
            # the offset between query and sbjct measured on the cig_pacbporf
            intQ._distance = abs(distDnt-distAnt)
            intS._distance = abs(distDnt-distAnt)
    
            if distDnt > 0:   # then, distAnt is as well > 0
                # QUERY is extended on the donor side
                # add Alignment Positional Periphery Score into objects
                succes = set_apps_intron_query(intQ,cig_pacbporf,pacbporfA)
                succes = set_apps_intron_sbjct(intS,pacbporfD,cig_pacbporf)
            else:
                # SBJCT is extended on the donor side
                # add Alignment Positional Periphery Score into objects
                succes = set_apps_intron_query(intQ,pacbporfD,cig_pacbporf)
                succes = set_apps_intron_sbjct(intS,cig_pacbporf,pacbporfA)

            # set GFF fsource attribute for recognition of intron sources
            intQ._gff['fsource'] = "ABGPcig"
            intS._gff['fsource'] = "ABGPcig"

            # create _linked_to_xxx attributes
            intQ._linked_to_pacbporfs = [ cig_pacbporf ]
            intS._linked_to_pacbporfs = [ cig_pacbporf ]


            # append to found_cig_list
            found_cig_list.append( ( intQ, intS, cig_pacbporf ) )

        else:
            # no alignment possible -> try next
            continue
    
    # return lists of closeby_independant_introns
    return found_cig_list
Esempio n. 23
0
def merge_pacbporfs(
    pacbporfD,
    pacbporfA,
    queryOrfSetObj,
    sbjctOrfSetObj,
    allow_query_projecting=True,
    allow_sbjct_projecting=True,
    allow_query_mapping=True,
    allow_sbjct_mapping=True,
    allow_projecting=True,
    allow_mapping=True,
    verbose=False,
):
    """
    Merge 2 PacbPORF objects with an interface into a gene structure

    @type  pacbporfD: PacbPORF object
    @param pacbporfD: PacbPORF object that has to deliver PSSM donor objects

    @type  pacbporfA: PacbPORF object
    @param pacbporfA: PacbPORF object that has to deliver PSSM acceptor objects

    @type  verbose: Boolean
    @param verbose: print status/debugging messages to STDOUT

    @rtype:  list
    @return: list with ( intron, intron ), in query and sbjct
    """
    # input validation
    IsPacbPORF(pacbporfD)
    IsPacbPORF(pacbporfA)

    # edit/create **kwargs dictionary for some forced attributes
    kwargs = {}
    _update_kwargs(kwargs, KWARGS_SPLICESITES)

    # deal with allow_xxx attributes
    if not allow_projecting:
        allow_query_projecting = False
        allow_sbjct_projecting = False
    if not allow_mapping:
        allow_query_mapping = False
        allow_sbjct_mapping = False

    # check if Orf objects of PacbPORFS are identical
    queryOrfsIdentical = pacbporfD.orfQ.id == pacbporfA.orfQ.id
    sbjctOrfsIdentical = pacbporfD.orfS.id == pacbporfA.orfS.id

    # return data structure of introns
    introns = {"query": [], "sbjct": []}

    # Scan Orfs for splice sites.
    # This has probably been performed before, but when not done,
    # cached donor & acceptor sites lists seems to be empty -> no introns
    pacbporfD.orfQ.scan_orf_for_pssm_splice_sites(
        splicetype="donor",
        min_pssm_score=kwargs["min_donor_pssm_score"],
        allow_non_canonical=kwargs["allow_non_canonical_donor"],
        non_canonical_min_pssm_score=kwargs["non_canonical_min_donor_pssm_score"],
    )
    pacbporfD.orfS.scan_orf_for_pssm_splice_sites(
        splicetype="donor",
        min_pssm_score=kwargs["min_donor_pssm_score"],
        allow_non_canonical=kwargs["allow_non_canonical_donor"],
        non_canonical_min_pssm_score=kwargs["non_canonical_min_donor_pssm_score"],
    )
    pacbporfA.orfQ.scan_orf_for_pssm_splice_sites(
        splicetype="acceptor",
        min_pssm_score=kwargs["min_acceptor_pssm_score"],
        allow_non_canonical=kwargs["allow_non_canonical_acceptor"],
        non_canonical_min_pssm_score=kwargs["non_canonical_min_acceptor_pssm_score"],
    )
    pacbporfA.orfS.scan_orf_for_pssm_splice_sites(
        splicetype="acceptor",
        min_pssm_score=kwargs["min_acceptor_pssm_score"],
        allow_non_canonical=kwargs["allow_non_canonical_acceptor"],
        non_canonical_min_pssm_score=kwargs["non_canonical_min_acceptor_pssm_score"],
    )

    if not queryOrfsIdentical and not sbjctOrfsIdentical:

        introns1 = merge_pacbporfs_with_introns(pacbporfD, pacbporfA)
        # filter for **best** candidates based on PSSM/entropy combination
        introns1 = _filter_aligned_introns_on_pssm_entropy_combination(introns1)

        if (
            pacbporfD.gap_ratio_score() < PACBPORF_HIGH_GAP_RATIO_THRESHOLD
            and pacbporfA.gap_ratio_score() < PACBPORF_HIGH_GAP_RATIO_THRESHOLD
        ):
            introns2 = merge_pacbporfs_with_closeby_independant_introns(pacbporfD, pacbporfA)
            introns3 = merge_pacbporfs_with_phase_shift_introns(pacbporfD, pacbporfA)
            introns4 = merge_pacbporfs_by_tinyexons(pacbporfD, pacbporfA, queryOrfSetObj, sbjctOrfSetObj)

            introns5 = merge_pacbporfs_by_query_tinyexon_and_sbjct_intron(pacbporfD, pacbporfA, queryOrfSetObj)

            introns6 = merge_pacbporfs_by_sbjct_tinyexon_and_query_intron(pacbporfD, pacbporfA, sbjctOrfSetObj)

            introns7 = merge_pacbporfs_by_sbjct_equal_length_exon_and_query_intron(pacbporfD, pacbporfA, sbjctOrfSetObj)

            introns8 = merge_pacbporfs_by_query_equal_length_exon_and_sbjct_intron(pacbporfD, pacbporfA, queryOrfSetObj)
        else:
            # do not allow more complex intron merging
            introns2 = {}
            introns3 = {}
            introns4 = {}
            introns5 = {}
            introns6 = {}
            introns7 = {}
            introns8 = {}

        introns9 = merge_pacbporfs_with_conserved_acceptor_introns(pacbporfD, pacbporfA)
        # filter for **best** candidates based on PSSM/entropy combination
        introns9 = _filter_aligned_introns_on_pssm_entropy_combination(introns9)

        introns10 = merge_pacbporfs_with_conserved_donor_introns(pacbporfD, pacbporfA)
        # filter for **best** candidates based on PSSM/entropy combination
        introns10 = _filter_aligned_introns_on_pssm_entropy_combination(introns10)

        # store introns obtained by most simplest case projecting/mapping
        introns["query"].extend(Set([intrQ for (intrQ, intrS) in introns1]))
        introns["sbjct"].extend(Set([intrS for (intrQ, intrS) in introns1]))

        # only store introns from intron2 that are NOT encountered already in introns1
        keysQ = [(intron.donor.pos, intron.acceptor.pos) for intron in introns["query"]]
        keysS = [(intron.donor.pos, intron.acceptor.pos) for intron in introns["sbjct"]]
        for (intrQ, intrS, cigpacbp) in introns2:
            k1 = (intrQ.donor.pos, intrQ.acceptor.pos)
            k2 = (intrS.donor.pos, intrS.acceptor.pos)
            if k1 not in keysQ and k2 not in keysS:
                introns["query"].append(intrQ)
                introns["sbjct"].append(intrS)

        # only store introns from intron3 that are NOT encountered already in introns1
        keysQ = [(intron.donor.pos, intron.acceptor.pos) for intron in introns["query"]]
        keysS = [(intron.donor.pos, intron.acceptor.pos) for intron in introns["sbjct"]]
        for (intrQ, intrS) in introns3:
            k1 = (intrQ.donor.pos, intrQ.acceptor.pos)
            k2 = (intrS.donor.pos, intrS.acceptor.pos)
            if k1 not in keysQ and k2 not in keysS:
                introns["query"].append(intrQ)
                introns["sbjct"].append(intrS)

        # only store introns from intron4 that are NOT encountered already in introns1
        keysQ = [(intron.donor.pos, intron.acceptor.pos) for intron in introns["query"]]
        keysS = [(intron.donor.pos, intron.acceptor.pos) for intron in introns["sbjct"]]
        for (intrQ, intrS, pacbporf, intrQ2, intrS2) in introns4:
            k1 = (intrQ.donor.pos, intrQ.acceptor.pos)
            k2 = (intrS.donor.pos, intrS.acceptor.pos)
            k3 = (intrQ2.donor.pos, intrQ2.acceptor.pos)
            k4 = (intrS2.donor.pos, intrS2.acceptor.pos)
            if k1 not in keysQ and k2 not in keysS and k3 not in keysQ and k4 not in keysS:
                introns["query"].append(intrQ)
                introns["sbjct"].append(intrS)
                introns["query"].append(intrQ2)
                introns["sbjct"].append(intrS2)

        # only store introns from intron5 that are NOT encountered already in introns1
        keysQ = [(intron.donor.pos, intron.acceptor.pos) for intron in introns["query"]]
        keysS = [(intron.donor.pos, intron.acceptor.pos) for intron in introns["sbjct"]]
        for (intrQ, intrS, pacbporf, intrQ2, intrS2) in introns4:
            if intrQ:
                k1 = (intrQ.donor.pos, intrQ.acceptor.pos)
            else:
                k1 = None
            if intrS:
                k2 = (intrS.donor.pos, intrS.acceptor.pos)
            else:
                k2 = None
            if intrQ2:
                k3 = (intrQ2.donor.pos, intrQ2.acceptor.pos)
            else:
                k3 = None
            if intrS2:
                k4 = (intrS2.donor.pos, intrS2.acceptor.pos)
            else:
                k4 = None
            if k1 not in keysQ and k2 not in keysS and k3 not in keysQ and k4 not in keysS:
                introns["query"].append(intrQ)
                introns["sbjct"].append(intrS)
                introns["query"].append(intrQ2)
                introns["sbjct"].append(intrS2)

        # only store introns from intron6 that are NOT encountered already in introns1
        keysQ = [(intron.donor.pos, intron.acceptor.pos) for intron in introns["query"]]
        keysS = [(intron.donor.pos, intron.acceptor.pos) for intron in introns["sbjct"]]
        for (intrQ, intrS, pacbporf, intrQ2, intrS2) in introns6:
            if intrQ:
                k1 = (intrQ.donor.pos, intrQ.acceptor.pos)
            else:
                k1 = None
            if intrS:
                k2 = (intrS.donor.pos, intrS.acceptor.pos)
            else:
                k2 = None
            if intrQ2:
                k3 = (intrQ2.donor.pos, intrQ2.acceptor.pos)
            else:
                k3 = None
            if intrS2:
                k4 = (intrS2.donor.pos, intrS2.acceptor.pos)
            else:
                k4 = None
            if k1 not in keysQ and k2 not in keysS and k3 not in keysQ and k4 not in keysS:
                introns["query"].append(intrQ)
                introns["sbjct"].append(intrS)
                introns["query"].append(intrQ2)
                introns["sbjct"].append(intrS2)

        # remove the 'None' in introns['sbjct'] due to latest addition
        while None in introns["query"]:
            introns["query"].remove(None)
        while None in introns["sbjct"]:
            introns["sbjct"].remove(None)

        # only store introns from intron7 that are NOT encountered already in introns1
        keysQ = [(intron.donor.pos, intron.acceptor.pos) for intron in introns["query"]]
        keysS = [(intron.donor.pos, intron.acceptor.pos) for intron in introns["sbjct"]]
        for (intrS, pacbporf1, intrQ, pacbporf2, intrS2) in introns7:
            k1 = (intrQ.donor.pos, intrQ.acceptor.pos)
            k2 = (intrS.donor.pos, intrS.acceptor.pos)
            k3 = (intrS2.donor.pos, intrS2.acceptor.pos)
            if k1 not in keysQ and k2 not in keysS and k3 not in keysS:
                introns["query"].append(intrQ)
                introns["sbjct"].append(intrS)
                introns["sbjct"].append(intrS2)

        # only store introns from intron8 that are NOT encountered already in introns1
        keysQ = [(intron.donor.pos, intron.acceptor.pos) for intron in introns["query"]]
        keysS = [(intron.donor.pos, intron.acceptor.pos) for intron in introns["sbjct"]]
        for (intrQ, pacbporf1, intrS, pacbporf2, intrQ2) in introns8:
            k1 = (intrQ.donor.pos, intrQ.acceptor.pos)
            k2 = (intrS.donor.pos, intrS.acceptor.pos)
            k3 = (intrQ2.donor.pos, intrQ2.acceptor.pos)
            if k1 not in keysQ and k2 not in keysS and k3 not in keysQ:
                introns["query"].append(intrQ)
                introns["query"].append(intrQ2)
                introns["sbjct"].append(intrS)

        # only store introns from introns9 that are NOT encountered already in introns1
        keysQ = [(intron.donor.pos, intron.acceptor.pos) for intron in introns["query"]]
        keysS = [(intron.donor.pos, intron.acceptor.pos) for intron in introns["sbjct"]]
        for (intrQ, intrS) in introns9:
            k1 = (intrQ.donor.pos, intrQ.acceptor.pos)
            k2 = (intrS.donor.pos, intrS.acceptor.pos)
            if k1 == (2163, 2283):
                print "STRACC", k1, intrQ, k1 not in keysQ
                print "STRACC", k1, intrS, k2 not in keysS
            # do NOT check if any of the introns is present yet;
            # allow addition of each of these
            if k1 not in keysQ:
                introns["query"].append(intrQ)
            if k2 not in keysS:
                introns["sbjct"].append(intrS)

        # only store introns from introns10 that are NOT encountered already in introns1
        keysQ = [(intron.donor.pos, intron.acceptor.pos) for intron in introns["query"]]
        keysS = [(intron.donor.pos, intron.acceptor.pos) for intron in introns["sbjct"]]
        for (intrQ, intrS) in introns10:
            k1 = (intrQ.donor.pos, intrQ.acceptor.pos)
            k2 = (intrS.donor.pos, intrS.acceptor.pos)
            if k1 == (1642, 1858):
                print "STRDON", k1, intrQ, k1 not in keysQ
                print "STRDON", k1, intrS, k2 not in keysS
            # do NOT check if any of the introns is present yet;
            # allow addition of each of these
            if k1 not in keysQ:
                introns["query"].append(intrQ)
            if k2 not in keysS:
                introns["sbjct"].append(intrS)

        # finally, do the bridging thingy
        introns0 = merge_pacbporfs_with_query_intron_bridgeing(pacbporfD, pacbporfA)

        # only store introns from introns0 that are NOT encountered already in introns1
        keysQ = [(intron.donor.pos, intron.acceptor.pos) for intron in introns["query"]]
        for intrQ in introns0:
            if intrQ.coords() not in keysQ:
                introns["query"].append(intrQ)

        # introns['query'].extend([ intrQ for (intrQ,intrS) in introns1 ] )
        # introns['query'].extend([ intrQ for (intrQ,intrS,cigpacbp) in introns2 ] )
        # introns['query'].extend([ intrQ for (intrQ,intrS) in introns3 ] )
        # introns['query'].extend([ intrQ for (intrQ,a,b,c,d) in introns4 ] )
        # introns['query'].extend([ intrQ for (a,b,c,intrQ,d) in introns4 ] )
        # introns['query'].extend([ intrQ for (intrQ,a,b,c,d) in introns5 ] )
        # introns['query'].extend([ intrQ for (a,b,c,intrQ,d) in introns5 ] )
        # introns['sbjct'].extend([ intrS for (intrQ,intrS) in introns1 ] )
        # introns['sbjct'].extend([ intrS for (intrQ,intrS,cigpacbp) in introns2 ] )
        # introns['sbjct'].extend([ intrS for (intrQ,intrS) in introns3 ] )
        # introns['sbjct'].extend([ intrS for (a,intrS,b,c,d) in introns4 ] )
        # introns['sbjct'].extend([ intrS for (a,b,c,d,intrS) in introns4 ] )
        # introns['sbjct'].extend([ intrS for (a,intrS,b,c,d) in introns5 ] )
        # introns['sbjct'].extend([ intrS for (a,b,c,d,intrS) in introns5 ] )

        # remove the 'None' in introns['sbjct'] due to latest addition
        while None in introns["query"]:
            introns["query"].remove(None)
        while None in introns["sbjct"]:
            introns["sbjct"].remove(None)

    elif not queryOrfsIdentical:
        seqerror = merge_pacbporf_with_sequenceerror_in_query(pacbporfD, pacbporfA)
        introns1 = merge_pacbporfs_by_intron_in_query(pacbporfD, pacbporfA)

        if (
            pacbporfD.gap_ratio_score() < PACBPORF_HIGH_GAP_RATIO_THRESHOLD
            and pacbporfA.gap_ratio_score() < PACBPORF_HIGH_GAP_RATIO_THRESHOLD
        ):
            introns2 = merge_pacbporfs_by_intron_tinyexon_intron_in_query(pacbporfD, pacbporfA, queryOrfSetObj)
            introns3 = merge_pacbporfs_by_two_tinyexons_in_query(pacbporfD, pacbporfA, queryOrfSetObj)
        else:
            # do not allow more complex intron merging
            introns2 = {}
            introns3 = {}

        # store sequencerror if it exists
        if seqerror:
            introns["query"].append(seqerror)

        # store introns obtained by most simplest case projecting/mapping
        introns["query"].extend([prj.projected_introns[0] for prj in introns1])

        # only store introns from intron2 that are NOT encountered already in introns1
        keys = [(intron.donor.pos, intron.acceptor.pos) for intron in introns["query"]]
        for (intr1, intr2, exon) in introns2:
            k1 = (intr1.donor.pos, intr1.acceptor.pos)
            k2 = (intr2.donor.pos, intr2.acceptor.pos)
            if k1 not in keys and k2 not in keys:
                introns["query"].append(intr1)
                introns["query"].append(intr2)

        # only store introns from intron2 that are NOT encountered already in introns1
        keys = [(intron.donor.pos, intron.acceptor.pos) for intron in introns["query"]]
        for (intr1, intr2, intr3, exon1, exon2) in introns3:
            k1 = (intr1.donor.pos, intr1.acceptor.pos)
            k2 = (intr2.donor.pos, intr2.acceptor.pos)
            k3 = (intr3.donor.pos, intr3.acceptor.pos)
            if k1 not in keys and k2 not in keys and k3 not in keys:
                introns["query"].append(intr1)
                introns["query"].append(intr2)
                introns["query"].append(intr3)

        if not introns["query"] and allow_sbjct_mapping and allow_query_mapping:
            # just bridge Orfs by **best** intron(s).
            introns0 = merge_pacbporfs_with_query_intron_bridgeing(pacbporfD, pacbporfA)

            # potential stopless 3n intron in SBJCT
            introns1 = merge_pacbporfs_with_introns(pacbporfD, pacbporfA)
            # filter for **best** candidates based on PSSM/entropy combination
            introns1 = _filter_aligned_introns_on_pssm_entropy_combination(introns1)
            # apply stopless3n intron filtering
            introns1 = _filter_aligned_stopless_3n_introns(introns1)

            introns2 = merge_pacbporfs_with_closeby_independant_introns(pacbporfD, pacbporfA)

            if (
                pacbporfD.gap_ratio_score() < PACBPORF_HIGH_GAP_RATIO_THRESHOLD
                and pacbporfA.gap_ratio_score() < PACBPORF_HIGH_GAP_RATIO_THRESHOLD
            ):
                introns3 = merge_pacbporfs_with_phase_shift_introns(pacbporfD, pacbporfA)
                # filter for **best** candidates based on PSSM/entropy combination
                introns3 = _filter_aligned_introns_on_pssm_entropy_combination(introns3)
                # apply stopless3n intron filtering
                introns3 = _filter_aligned_stopless_3n_introns(introns3)

            else:
                # do not allow more complex intron merging
                introns3 = {}

            # only store introns from that are NOT encountered already
            keys = [intron.coords() for intron in introns["query"]]
            for intrQ, intrS in introns1:
                if intrQ.coords() not in keys:
                    introns["query"].append(intrQ)
                    keys = [intron.coords() for intron in introns["query"]]
            for (intrQ, intrS, cigpacbp) in introns2:
                if intrQ.coords() not in keys:
                    introns["query"].append(intrQ)
                    keys = [intron.coords() for intron in introns["query"]]
            for intrQ, intrS in introns3:
                if intrQ.coords() not in keys:
                    introns["query"].append(intrQ)
                    keys = [intron.coords() for intron in introns["query"]]
            for intron in introns0:
                if intron.coords() not in keys:
                    introns["query"].append(intron)
                    keys = [intron.coords() for intron in introns["query"]]

            keys = [intron.coords() for intron in introns["sbjct"]]
            for intrQ, intrS in introns1:
                if intrS.coords() not in keys:
                    introns["query"].append(intrS)
                    keys = [intron.coords() for intron in introns["sbjct"]]
            for (intrQ, intrS, cigpacbp) in introns2:
                if intrS.coords() not in keys:
                    introns["query"].append(intrS)
                    keys = [intron.coords() for intron in introns["sbjct"]]
            for intrQ, intrS in introns3:
                if intrS.coords() not in keys:
                    introns["query"].append(intrS)
                    keys = [intron.coords() for intron in introns["sbjct"]]

        elif not introns["query"]:

            # just bridge Orfs by **best** intron(s).
            introns0 = merge_pacbporfs_with_query_intron_bridgeing(pacbporfD, pacbporfA)
            # only store introns from that are NOT encountered already
            keys = [intron.coords() for intron in introns["query"]]
            for intron in introns0:
                if intron.coords() not in keys:
                    introns["query"].append(intron)
        else:
            # projecting introns yielded results; do not try mapping
            pass

    elif not sbjctOrfsIdentical:
        introns1 = merge_pacbporfs_by_intron_in_sbjct(pacbporfD, pacbporfA)

        if (
            pacbporfD.gap_ratio_score() < PACBPORF_HIGH_GAP_RATIO_THRESHOLD
            and pacbporfA.gap_ratio_score() < PACBPORF_HIGH_GAP_RATIO_THRESHOLD
        ):
            introns2 = merge_pacbporfs_by_intron_tinyexon_intron_in_sbjct(pacbporfD, pacbporfA, sbjctOrfSetObj)
            introns3 = merge_pacbporfs_by_two_tinyexons_in_sbjct(pacbporfD, pacbporfA, sbjctOrfSetObj)
        else:
            # do not allow more complex intron merging
            introns2 = {}
            introns3 = {}

        # store introns obtained by most simplest case projecting/mapping
        introns["sbjct"].extend([prj.projected_introns[0] for prj in introns1])

        # only store introns from intron2 that are NOT encountered already in introns1
        keys = [(intron.donor.pos, intron.acceptor.pos) for intron in introns["sbjct"]]
        for (intr1, intr2, exon) in introns2:
            k1 = (intr1.donor.pos, intr1.acceptor.pos)
            k2 = (intr2.donor.pos, intr2.acceptor.pos)
            if k1 not in keys and k2 not in keys:
                introns["sbjct"].append(intr1)
                introns["sbjct"].append(intr2)

        # only store introns from intron2 that are NOT encountered already in introns1
        keys = [(intron.donor.pos, intron.acceptor.pos) for intron in introns["sbjct"]]
        for (intr1, intr2, intr3, exon1, exon2) in introns3:
            k1 = (intr1.donor.pos, intr1.acceptor.pos)
            k2 = (intr2.donor.pos, intr2.acceptor.pos)
            k3 = (intr3.donor.pos, intr3.acceptor.pos)
            if k1 not in keys and k2 not in keys and k3 not in keys:
                introns["sbjct"].append(intr1)
                introns["sbjct"].append(intr2)
                introns["sbjct"].append(intr3)

        if not introns["sbjct"] and allow_sbjct_mapping and allow_query_mapping:
            # potential stopless 3n intron in QUERY
            introns1 = merge_pacbporfs_with_introns(pacbporfD, pacbporfA)
            # filter for **best** candidates based on PSSM/entropy combination
            introns1 = _filter_aligned_introns_on_pssm_entropy_combination(introns1)
            # apply stopless3n intron filtering
            introns1 = _filter_aligned_stopless_3n_introns(introns1)

            introns2 = merge_pacbporfs_with_closeby_independant_introns(pacbporfD, pacbporfA)

            if (
                pacbporfD.gap_ratio_score() < PACBPORF_HIGH_GAP_RATIO_THRESHOLD
                and pacbporfA.gap_ratio_score() < PACBPORF_HIGH_GAP_RATIO_THRESHOLD
            ):
                introns3 = merge_pacbporfs_with_phase_shift_introns(pacbporfD, pacbporfA)
                # filter for **best** candidates based on PSSM/entropy combination
                introns3 = _filter_aligned_introns_on_pssm_entropy_combination(introns3)
                # apply stopless3n intron filtering
                introns3 = _filter_aligned_stopless_3n_introns(introns3)
            else:
                # do not allow more complex intron merging
                introns3 = {}

            # store introns
            introns["query"].extend(Set([intrQ for (intrQ, intrS) in introns1]))
            introns["sbjct"].extend(Set([intrS for (intrQ, intrS) in introns1]))
            introns["query"].extend([intrQ for (intrQ, intrS, cigpacbp) in introns2])
            introns["query"].extend([intrQ for (intrQ, intrS) in introns3])
            introns["sbjct"].extend([intrS for (intrQ, intrS, cigpacbp) in introns2])
            introns["sbjct"].extend([intrS for (intrQ, intrS) in introns3])
        else:
            # projecting introns yielded results; do not try mapping
            pass

    elif queryOrfsIdentical and sbjctOrfsIdentical:
        if allow_query_mapping:
            introns1 = merge_pacbporfs_by_inframe_intron_in_query(pacbporfD, pacbporfA)
        else:
            # no mapping (unigene or continious alignment provided)
            introns1 = []

        if allow_sbjct_mapping:
            introns2 = merge_pacbporfs_by_inframe_intron_in_sbjct(pacbporfD, pacbporfA)
        else:
            # no mapping (unigene or continious alignment provided)
            introns2 = []

        if allow_sbjct_mapping and allow_query_mapping:
            introns3 = merge_pacbporfs_with_introns(pacbporfD, pacbporfA)
            # filter for **best** candidates based on PSSM/entropy combination
            introns3 = _filter_aligned_introns_on_pssm_entropy_combination(introns3)
            # apply stopless3n intron filtering
            introns3 = _filter_aligned_stopless_3n_introns(introns3)

        else:
            # no mapping (unigene or continious alignment provided)
            introns3 = []

        # introns4 = merge_pacbporfs_with_closeby_independant_introns(
        #                pacbporfD,pacbporfA)
        # introns5 = merge_pacbporfs_with_phase_shift_introns(
        #                pacbporfD,pacbporfA)

        introns["query"].extend([prj.projected_introns[0] for prj in introns1])
        introns["sbjct"].extend([prj.projected_introns[0] for prj in introns2])
        introns["query"].extend([intrQ for (intrQ, intrS) in introns3])
        introns["sbjct"].extend([intrS for (intrQ, intrS) in introns3])

    else:
        # none of these cases; allow_projecting or allow_mapping == False!
        pass

    # Filter for stopless3n introns
    introns["query"] = _filter_stopless_3n_introns(introns["query"])
    introns["sbjct"] = _filter_stopless_3n_introns(introns["sbjct"])

    # return list of introns
    return introns
Esempio n. 24
0
def merge_orfs_with_two_tinyexons(preceding_orf,
                                  subsequent_orf,
                                  preceding_donor_sites=None,
                                  subsequent_acceptor_sites=None,
                                  orflist=None,
                                  **kwargs):
    """
    Bridge two `neighbouring` Orfs by TWO tinyexons by applying preceding donors and subsequent acceptors

    Candidate tinyexons are collected from all Orfs in `orflist` that are
    positioned between the outermost donor and acceptor sites. Each pair of
    tinyexons that can be joined by a central intron of allowed length and
    with compatible phase is reported.

    @type  preceding_orf: Orf object
    @param preceding_orf: Orf object that contains preceding_donor_site(s)

    @type  subsequent_orf: Orf object
    @param subsequent_orf: Orf object that contains subsequent_acceptor_site(s)

    @type  preceding_donor_sites: list
    @param preceding_donor_sites: list with SpliceDonorGT and/or SpliceDonor objects

    @type  subsequent_acceptor_sites: list
    @param subsequent_acceptor_sites: list with SpliceAcceptorAG and/or SpliceAcceptor objects

    @type  orflist: list
    @param orflist: list with Orf objects

    @attention: see get_potential_tiny_exons_on_orf for additional **kwargs
    @attention: kwargs 'min_tinyexon_intron_nt_length' and
                'max_tinyexon_intron_nt_length' are read after the kwargs are
                updated with KWARGS_PROJECTED_TINYEXON

    @rtype:  list
    @return: list of tuples ( tinyexon1, central_intron, tinyexon2 ), ordered
             ascending on the summed length of both tinyexons; an empty list
             when no donors, no acceptors or no Orfs are applied
    """
    # nothing to bridge without donors, acceptors or candidate Orfs
    if not preceding_donor_sites or not subsequent_acceptor_sites or not orflist:
        return []

    # edit **kwargs dictionary for some forced attributes
    _update_kwargs(kwargs, KWARGS_PROJECTED_TINYEXON)
    min_intron_nt_length = kwargs['min_tinyexon_intron_nt_length']

    # outermost positions in between which a tinyexon-bearing Orf must lie;
    # a minimal sized intron must fit on either side of it
    min_pos = min([d.pos for d in preceding_donor_sites]) + min_intron_nt_length
    max_pos = max([a.pos for a in subsequent_acceptor_sites]) - min_intron_nt_length

    tinyexoncollection = []
    for orfX in orflist:
        # skip Orfs that end before / start after the splice sites' extremes
        if orfX.endPY <= min_pos or orfX.startPY >= max_pos:
            continue
        # extend the tinyexoncollection
        tinyexoncollection.extend(
            get_potential_tiny_exons_on_orf(orfX, **kwargs))

    # make tinyexoncollection ordered on start pos
    tinyexoncollection = _order_intron_list(tinyexoncollection,
                                            order_by='donor_pos')
    # donor_pos makes REVERSE ordering; restore this by reversing
    tinyexoncollection.reverse()

    # make combinations of 2 tinyexons which can co-occur together
    tinyexoncombis = []
    for tinyexon1 in tinyexoncollection:
        # walk the collection from its end backwards; as soon as tinyexon2 has
        # a donor position in front of tinyexon1, stop scanning for tinyexon1
        for tinyexon2 in reversed(tinyexoncollection):
            if tinyexon2.donor.pos < tinyexon1.donor.pos:
                break
            # central intron runs from donor of exon1 to acceptor of exon2
            intron_length = tinyexon2.acceptor.pos - tinyexon1.donor.pos
            # chained comparison keeps the original lookup order: the max
            # length is only looked up once the min length check has passed
            if not (min_intron_nt_length <= intron_length <=
                    kwargs['max_tinyexon_intron_nt_length']):
                continue
            if tinyexon1.donor.phase != tinyexon2.acceptor.phase:
                continue
            # if here, eligible combi!
            # NOTE(review): argument order (subsequent_orf, preceding_orf,
            # acceptor, donor) and the Orfs assigned to the intron are kept
            # exactly as in the original code -- confirm against the
            # signature of get_shared_nucleotides_at_splicesite
            intron = IntronConnectingOrfs(
                tinyexon1.donor, tinyexon2.acceptor,
                get_shared_nucleotides_at_splicesite(subsequent_orf,
                                                     preceding_orf,
                                                     tinyexon2.acceptor,
                                                     tinyexon1.donor),
                preceding_orf, subsequent_orf)
            totlen = tinyexon1.length + tinyexon2.length
            tinyexoncombis.append((totlen, tinyexon1, intron, tinyexon2))

    # order on summed tinyexon length ONLY; sorting the complete tuples would
    # compare the tinyexon/intron objects themselves whenever lengths are equal
    tinyexoncombis.sort(key=lambda combi: combi[0])
    return [(exon1, intron, exon2)
            for (_totlen, exon1, intron, exon2) in tinyexoncombis]