Example #1
0
def merge_orfs_with_two_tinyexons(preceding_orf,subsequent_orf,
    preceding_donor_sites=[],
    subsequent_acceptor_sites=[],
    orflist=[],**kwargs):
    """
    Collect pairs of tinyexons that can co-occur between two `neighbouring` Orfs

    @type  preceding_orf: Orf object
    @param preceding_orf: Orf object that contains preceding_donor_site(s)

    @type  subsequent_orf: Orf object
    @param subsequent_orf: Orf object that contains subsequent_acceptor_site(s)

    @type  preceding_donor_sites: list
    @param preceding_donor_sites: list with SpliceDonorGT and/or SpliceDonor objects;
                                  only the lowest .pos of these sites is used here

    @type  subsequent_acceptor_sites: list
    @param subsequent_acceptor_sites: list with SpliceAcceptorAG and/or SpliceAcceptor objects;
                                      only the highest .pos of these sites is used here

    @type  orflist: list
    @param orflist: list with Orf objects that are scanned for tinyexons

    @attention: see get_potential_tiny_exons_on_orf for additional **kwargs
    @attention: _update_kwargs(kwargs,KWARGS_PROJECTED_TINYEXON) is applied first; afterwards
                the keys 'min_tinyexon_intron_nt_length' and 'max_tinyexon_intron_nt_length'
                are read from **kwargs
    @attention: the default lists are never modified by this function

    @rtype:  list
    @return: list of tuples ( tinyexon1, central_intron, tinyexon2 ), ordered on the
             summed length of both tinyexons (shortest first); empty list when any of
             the three input lists is empty

    """
    if not preceding_donor_sites:
        return []
    if not subsequent_acceptor_sites:
        return []
    if not orflist:
        return []

    # edit **kwargs dictionary for some forced attributes
    _update_kwargs(kwargs,KWARGS_PROJECTED_TINYEXON)

    # positional window in which an Orf must (partially) lie to be able to
    # deliver a tinyexon; identical for every Orf, so computed only once
    min_intron_nt_length = kwargs['min_tinyexon_intron_nt_length']
    min_pos = min([ d.pos for d in preceding_donor_sites ]) + min_intron_nt_length
    max_pos = max([ a.pos for a in subsequent_acceptor_sites ]) - min_intron_nt_length

    tinyexoncollection = []
    for orfX in orflist:
        # Orf lies completely outside of the window -> do not check this Orf
        if orfX.endPY   <= min_pos: continue
        if orfX.startPY >= max_pos: continue
        # extend the tinyexoncollection
        tinyexoncollection.extend( get_potential_tiny_exons_on_orf(orfX,**kwargs) )

    # make tinyexoncollection ordered on start pos
    tinyexoncollection = _order_intron_list(tinyexoncollection,order_by='donor_pos')
    # donor_pos makes REVERSE ordering; restore this by reversing
    tinyexoncollection.reverse()

    # make 2-elemented combinations of tinyexons which can co-occur together
    tinyexoncombis = []
    for tinyexon1 in tinyexoncollection:
        # walk the collection from its end; as soon as tinyexon2 has its donor
        # before the donor of tinyexon1, stop (relies on the ordering above)
        for tinyexon2 in reversed(tinyexoncollection):
            if tinyexon2.donor.pos < tinyexon1.donor.pos: break
            intron_length = tinyexon2.acceptor.pos - tinyexon1.donor.pos
            if intron_length < min_intron_nt_length: continue
            if intron_length > kwargs['max_tinyexon_intron_nt_length']: continue
            if tinyexon1.donor.phase != tinyexon2.acceptor.phase: continue
            # if here, eligible combi!
            # NOTE(review): the central intron is created with preceding_orf and
            # subsequent_orf, not with the Orfs the tinyexons are located on;
            # confirm that this is intended
            intron = IntronConnectingOrfs(
                    tinyexon1.donor,tinyexon2.acceptor,
                    get_shared_nucleotides_at_splicesite(
                            subsequent_orf,preceding_orf,
                            tinyexon2.acceptor,tinyexon1.donor
                            ),
                    preceding_orf,subsequent_orf)
            totlen = tinyexon1.length+tinyexon2.length
            tinyexoncombis.append( ( totlen, tinyexon1, intron, tinyexon2 ) )

    # return an ordered list based on summed length. Sort on the length only:
    # sorting the bare tuples would compare the tinyexon/intron objects
    # themselves whenever two combinations have the same length
    tinyexoncombis.sort(key=lambda combi: combi[0])
    return [ (exon1,intron,exon2) for (totlen,exon1,intron,exon2) in tinyexoncombis ]
Example #2
0
def merge_orfs_with_intron(orfD,orfA,
    max_intron_nt_length         = MAX_INTRON_NT_LENGTH,
    min_intron_nt_length         = MIN_INTRON_NT_LENGTH,
    min_donor_pssm_score         = MIN_DONOR_PSSM_SCORE,
    min_acceptor_pssm_score      = MIN_ACCEPTOR_PSSM_SCORE,
    allow_non_canonical_donor    = ALLOW_NON_CANONICAL_DONOR,
    allow_non_canonical_acceptor = ALLOW_NON_CANONICAL_ACCEPTOR,
    non_canonical_min_donor_pssm_score    = NON_CANONICAL_MIN_DONOR_PSSM_SCORE,
    non_canonical_min_acceptor_pssm_score = NON_CANONICAL_MIN_ACCEPTOR_PSSM_SCORE,
    min_donor_pos=None,
    max_donor_pos=None,
    min_acceptor_pos=None,
    max_acceptor_pos=None,
    order_by = 'length',**kwargs):
    """
    Merge 2 Orf objects by introns

    Every donor site on orfD is combined with every acceptor site on orfA;
    a combination is reported as intron when both sites pass the filters
    below, their phases are compatible and the intron length is within bounds.

    @attention: **kwargs can contain other (here) unnecessarily arguments; they are ignored

    @type  orfD: Orf object
    @param orfD: Orf object that has to deliver a PSSM donor object

    @type  orfA: Orf object
    @param orfA: Orf object that has to deliver a PSSM acceptor object

    @type  max_intron_nt_length: integer
    @param max_intron_nt_length: maximal length (nt) of the intron; a False-like value disables this check

    @type  min_intron_nt_length: integer
    @param min_intron_nt_length: minimal length (nt) of the intron; a False-like value disables this check

    @type  min_donor_pssm_score: float
    @param min_donor_pssm_score: minimal pssm score of donor splice site

    @type  min_acceptor_pssm_score: float
    @param min_acceptor_pssm_score: minimal pssm score of acceptor splice site

    @type  allow_non_canonical_donor: Boolean
    @param allow_non_canonical_donor: search for non-canonical donor sites too

    @type  allow_non_canonical_acceptor: Boolean
    @param allow_non_canonical_acceptor: search for non-canonical acceptor splice sites too

    @type  non_canonical_min_donor_pssm_score: float
    @param non_canonical_min_donor_pssm_score: minimal pssm score of non-canonical donor

    @type  non_canonical_min_acceptor_pssm_score: float
    @param non_canonical_min_acceptor_pssm_score: minimal pssm score of non-canonical acceptor

    @type  min_donor_pos: integer or None
    @param min_donor_pos: donors with a lower .pos are ignored (None: no limit)

    @type  max_donor_pos: integer or None
    @param max_donor_pos: donors with a higher .pos are ignored (None: no limit)

    @type  min_acceptor_pos: integer or None
    @param min_acceptor_pos: acceptors with a lower .pos are ignored (None: no limit)

    @type  max_acceptor_pos: integer or None
    @param max_acceptor_pos: acceptors with a higher .pos are ignored (None: no limit)

    @type  order_by: string
    @param order_by: ordering criterion, handed over to _order_intron_list

    @rtype:  list
    @return: list with IntronConnectingOrfs objects as ordered by _order_intron_list;
             empty list when no intron is possible
    """
    # input validation
    IsOrf(orfD)
    IsOrf(orfA)

    # scan for splice sites (if not already done -> is checked in function)
    orfD.scan_orf_for_pssm_splice_sites(
            splicetype="donor",
            min_pssm_score=min_donor_pssm_score,
            allow_non_canonical=allow_non_canonical_donor,
            non_canonical_min_pssm_score=non_canonical_min_donor_pssm_score)
    orfA.scan_orf_for_pssm_splice_sites(
            splicetype="acceptor",
            min_pssm_score=min_acceptor_pssm_score,
            allow_non_canonical=allow_non_canonical_acceptor,
            non_canonical_min_pssm_score=non_canonical_min_acceptor_pssm_score)

    donor_sites    = orfD._donor_sites
    acceptor_sites = orfA._acceptor_sites

    # most quickest scan possible: are there donors & acceptors?
    if not donor_sites or not acceptor_sites:
        # no introns possible because splice sites are missing
        return []

    # very quick scan: are exons not to far from each other?
    # The shortest intron that any donor/acceptor pair can form runs from the
    # most downstream donor to the most upstream acceptor. This is computed
    # with min/max so it does not depend on how the site lists are ordered.
    if max_intron_nt_length:
        shortest_intron = min([ a.pos for a in acceptor_sites ]) -\
                          max([ d.pos for d in donor_sites ])
        if shortest_intron > max_intron_nt_length:
            # no introns possible that can bridge this gap
            return []

    # the acceptor filters do not depend on the donor: apply them only once
    accepted_acceptors = []
    for acceptor in acceptor_sites:
        if acceptor.is_canonical():
            if acceptor.pssm_score < min_acceptor_pssm_score: continue
        else:
            if not allow_non_canonical_acceptor: continue
            if acceptor.pssm_score < non_canonical_min_acceptor_pssm_score: continue
        if min_acceptor_pos is not None and acceptor.pos < min_acceptor_pos: continue
        if max_acceptor_pos is not None and acceptor.pos > max_acceptor_pos: continue
        accepted_acceptors.append(acceptor)

    # return list with introns
    introns = []

    for donor in donor_sites:
        if donor.is_canonical():
            if donor.pssm_score < min_donor_pssm_score: continue
        else:
            if not allow_non_canonical_donor: continue
            if donor.pssm_score < non_canonical_min_donor_pssm_score: continue
        if min_donor_pos is not None and donor.pos < min_donor_pos: continue
        if max_donor_pos is not None and donor.pos > max_donor_pos: continue

        # donor site accepted; combine it with all accepted acceptors
        for acceptor in accepted_acceptors:
            # generate intron length and phase variable
            intron_length = acceptor.pos - donor.pos
            intron_phase  = intron_length % 3

            # check phase compatibilty (1) of splice sites
            if donor.phase != acceptor.phase: continue
            # check phase compatibilty (2) of splice sites: the intron must
            # shift the frame of orfD into the frame of orfA
            if ( intron_phase + orfD.frame ) % 3 != orfA.frame % 3: continue

            # check if intron length is in between the boundaries
            if max_intron_nt_length and intron_length > max_intron_nt_length: continue
            if min_intron_nt_length and intron_length < min_intron_nt_length: continue

            # okay, if we reach this point, we have a valid intron
            shared_nts = get_shared_nucleotides_at_splicesite(
                    orfA,orfD,acceptor,donor
                    )

            # make a IntronConnectingOrfs object
            introns.append( IntronConnectingOrfs(donor,acceptor,shared_nts,orfD,orfA) )

    # return ordered intron list
    return _order_intron_list(introns,order_by=order_by)
Example #3
0
def merge_orfs_with_tinyexon(preceding_orf,subsequent_orf,
    preceding_donor_sites=[],
    subsequent_acceptor_sites=[],
    orflist=[],
    max_tinyexon_nt_length=TINYEXON_MAX_NT_LENGTH,
    min_tinyexon_nt_length=TINYEXON_MIN_NT_LENGTH,
    max_tinyexon_intron_nt_length=TINYEXON_MAX_INTRON_NT_LENGTH,
    min_tinyexon_intron_nt_length=TINYEXON_MIN_INTRON_NT_LENGTH,
    min_donor_pssm_score=TINYEXON_MIN_DONOR_PSSM_SCORE,
    min_acceptor_pssm_score=TINYEXON_MIN_ACCEPTOR_PSSM_SCORE,
    min_total_pssm_score=TINYEXON_MIN_TOTAL_PSSM_SCORE,
    **kwargs):
    """
    Bridge two `neighbouring` Orfs by a tinyexon by applying preceding donors and subsequent acceptors

    For every Orf in orflist, every combination of a preceding donor and a
    subsequent acceptor is handed to find_tiny_exon_on_orf; each tinyexon that
    is returned is reported together with its two flanking introns.

    @type  preceding_orf: Orf object
    @param preceding_orf: Orf object that contains preceding_donor_site(s)

    @type  subsequent_orf: Orf object
    @param subsequent_orf: Orf object that contains subsequent_acceptor_site(s)

    @type  preceding_donor_sites: list
    @param preceding_donor_sites: list with SpliceDonorGT and/or SpliceDonor objects

    @type  subsequent_acceptor_sites: list
    @param subsequent_acceptor_sites: list with SpliceAcceptorAG and/or SpliceAcceptor objects

    @type  orflist: list
    @param orflist: list with Orf objects

    @type  max_tinyexon_nt_length: integer
    @param max_tinyexon_nt_length: positive integer, largest length of tinyexon in nt

    @type  min_tinyexon_nt_length: integer
    @param min_tinyexon_nt_length: positive integer, smallest length of tinyexon in nt

    @type  max_tinyexon_intron_nt_length: integer
    @param max_tinyexon_intron_nt_length: positive integer, largest length of intron around tinyexon in nt

    @type  min_tinyexon_intron_nt_length: integer
    @param min_tinyexon_intron_nt_length: positive integer, smallest length of intron around tinyexon in nt

    @type  min_total_pssm_score: float or None
    @param min_total_pssm_score: minimal sum of donor - acceptor pssm score pair of tinyexon

    @type  min_donor_pssm_score: float or None
    @param min_donor_pssm_score: minimal donor pssm score of tinyexon; also applied to the
                                 preceding donor sites (None: those are not filtered)

    @type  min_acceptor_pssm_score: float or None
    @param min_acceptor_pssm_score: minimal acceptor pssm score of tinyexon; also applied to the
                                    subsequent acceptor sites (None: those are not filtered)

    @rtype:  list
    @return: list of tuples ( preceding_intron, tinyexon, subsequent_intron )

    @attention: **kwargs is accepted but not used
    @attention: the default lists are never modified by this function
    @attention: Global vars that have to be set upon usage (used as defaults):
        TINYEXON_MAX_NT_LENGTH
        TINYEXON_MIN_NT_LENGTH
        TINYEXON_MAX_INTRON_NT_LENGTH
        TINYEXON_MIN_INTRON_NT_LENGTH
        TINYEXON_MIN_DONOR_PSSM_SCORE
        TINYEXON_MIN_ACCEPTOR_PSSM_SCORE
        TINYEXON_MIN_TOTAL_PSSM_SCORE

    """
    if not preceding_donor_sites:
        return []
    if not subsequent_acceptor_sites:
        return []
    if not orflist:
        return []

    # check pssm_score of the donor / acceptor sites. This check does not
    # depend on the Orf, so it is done once. A threshold of None (allowed
    # according to the documentation) means: do not filter.
    # TODO: these are in fact the sites on the normal, large orfs
    # TODO: do we want to check these pssm scores?
    if min_donor_pssm_score is None:
        donors = list(preceding_donor_sites)
    else:
        donors = [ d for d in preceding_donor_sites
                   if not d.pssm_score < min_donor_pssm_score ]
    if min_acceptor_pssm_score is None:
        acceptors = list(subsequent_acceptor_sites)
    else:
        acceptors = [ a for a in subsequent_acceptor_sites
                      if not a.pssm_score < min_acceptor_pssm_score ]
    if not donors or not acceptors:
        # no usable splice site left on one of both sides
        return []

    # return list with (intron,tinyexon,intron) tuples
    returnexons = []
    min_donor_pos    = min([ d.pos for d in donors ])
    max_acceptor_pos = max([ a.pos for a in acceptors ])
    for orfX in orflist:
        # check if orf is correctly positions towards the splice sites' extremes
        if orfX.endPY   <= min_donor_pos: continue
        if orfX.startPY >= max_acceptor_pos: continue

        # if here, we can try to make a bridge by a tinyexon
        for donor in donors:
            # orf not correctly positions towards the donor site
            if orfX.endPY <= donor.pos: continue

            for acceptor in acceptors:
                # orf not correctly positions towards the acceptor site
                if orfX.startPY >= acceptor.pos: continue

                # okay, now try to bridge it!
                exons = find_tiny_exon_on_orf(orfX,order_by='total_pssm',
                        max_tinyexon_nt_length=max_tinyexon_nt_length,
                        min_tinyexon_nt_length=min_tinyexon_nt_length,
                        max_tinyexon_intron_nt_length=max_tinyexon_intron_nt_length,
                        min_tinyexon_intron_nt_length=min_tinyexon_intron_nt_length,
                        min_donor_pssm_score=min_donor_pssm_score,
                        min_acceptor_pssm_score=min_acceptor_pssm_score,
                        min_total_pssm_score=min_total_pssm_score,
                        preceding_donor=donor,
                        subsequent_acceptor=acceptor
                        )
                # and append to returnexons
                for tinyexon in exons:

                    # make preceding intron: preceding donor -> tinyexon acceptor
                    shared_nts_A = get_shared_nucleotides_at_splicesite(
                            tinyexon.orf,preceding_orf,
                            tinyexon.acceptor,donor
                            )
                    preceding_intron = IntronConnectingOrfs(
                        donor,tinyexon.acceptor,
                        shared_nts_A,preceding_orf,tinyexon.orf )

                    # make subsequent intron: tinyexon donor -> subsequent acceptor
                    shared_nts_B = get_shared_nucleotides_at_splicesite(
                            subsequent_orf,tinyexon.orf,
                            acceptor,tinyexon.donor
                            )
                    subsequent_intron = IntronConnectingOrfs(
                        tinyexon.donor, acceptor,
                        shared_nts_B,tinyexon.orf,subsequent_orf )

                    # and append to exons
                    returnexons.append( ( preceding_intron, tinyexon, subsequent_intron ) )

    # and return the list of intron/exon/intron
    return returnexons
Example #4
0
def merge_orfs_with_two_tinyexons(preceding_orf,
                                  subsequent_orf,
                                  preceding_donor_sites=[],
                                  subsequent_acceptor_sites=[],
                                  orflist=[],
                                  **kwargs):
    """
    Collect pairs of tinyexons that can co-occur between two `neighbouring` Orfs

    @type  preceding_orf: Orf object
    @param preceding_orf: Orf object that contains preceding_donor_site(s)

    @type  subsequent_orf: Orf object
    @param subsequent_orf: Orf object that contains subsequent_acceptor_site(s)

    @type  preceding_donor_sites: list
    @param preceding_donor_sites: list with SpliceDonorGT and/or SpliceDonor objects;
                                  only the lowest .pos of these sites is used here

    @type  subsequent_acceptor_sites: list
    @param subsequent_acceptor_sites: list with SpliceAcceptorAG and/or SpliceAcceptor objects;
                                      only the highest .pos of these sites is used here

    @type  orflist: list
    @param orflist: list with Orf objects that are scanned for tinyexons

    @attention: see get_potential_tiny_exons_on_orf for additional **kwargs
    @attention: _update_kwargs(kwargs, KWARGS_PROJECTED_TINYEXON) is applied first; afterwards
                the keys 'min_tinyexon_intron_nt_length' and 'max_tinyexon_intron_nt_length'
                are read from **kwargs
    @attention: the default lists are never modified by this function

    @rtype:  list
    @return: list of tuples ( tinyexon1, central_intron, tinyexon2 ), ordered on the
             summed length of both tinyexons (shortest first); empty list when any of
             the three input lists is empty

    """
    if not preceding_donor_sites:
        return []
    if not subsequent_acceptor_sites:
        return []
    if not orflist:
        return []

    # edit **kwargs dictionary for some forced attributes
    _update_kwargs(kwargs, KWARGS_PROJECTED_TINYEXON)

    # positional window in which an Orf must (partially) lie to be able to
    # deliver a tinyexon; identical for every Orf, so computed only once
    min_intron_nt_length = kwargs['min_tinyexon_intron_nt_length']
    min_pos = (min([d.pos for d in preceding_donor_sites]) +
               min_intron_nt_length)
    max_pos = (max([a.pos for a in subsequent_acceptor_sites]) -
               min_intron_nt_length)

    tinyexoncollection = []
    for orfX in orflist:
        # Orf lies completely outside of the window -> do not check this Orf
        if orfX.endPY <= min_pos:
            continue
        if orfX.startPY >= max_pos:
            continue
        # extend the tinyexoncollection
        tinyexoncollection.extend(
            get_potential_tiny_exons_on_orf(orfX, **kwargs))

    # make tinyexoncollection ordered on start pos
    tinyexoncollection = _order_intron_list(tinyexoncollection,
                                            order_by='donor_pos')
    # donor_pos makes REVERSE ordering; restore this by reversing
    tinyexoncollection.reverse()

    # make 2-elemented combinations of tinyexons which can co-occur together
    tinyexoncombis = []
    for tinyexon1 in tinyexoncollection:
        # walk the collection from its end; as soon as tinyexon2 has its donor
        # before the donor of tinyexon1, stop (relies on the ordering above)
        for tinyexon2 in reversed(tinyexoncollection):
            if tinyexon2.donor.pos < tinyexon1.donor.pos:
                break
            intron_length = tinyexon2.acceptor.pos - tinyexon1.donor.pos
            if intron_length < min_intron_nt_length:
                continue
            if intron_length > kwargs['max_tinyexon_intron_nt_length']:
                continue
            if tinyexon1.donor.phase != tinyexon2.acceptor.phase:
                continue
            # if here, eligible combi!
            # NOTE(review): the central intron is created with preceding_orf
            # and subsequent_orf, not with the Orfs the tinyexons are located
            # on; confirm that this is intended
            shared_nts = get_shared_nucleotides_at_splicesite(
                subsequent_orf, preceding_orf, tinyexon2.acceptor,
                tinyexon1.donor)
            intron = IntronConnectingOrfs(tinyexon1.donor,
                                          tinyexon2.acceptor, shared_nts,
                                          preceding_orf, subsequent_orf)
            totlen = tinyexon1.length + tinyexon2.length
            tinyexoncombis.append((totlen, tinyexon1, intron, tinyexon2))

    # return an ordered list based on summed length. Sort on the length only:
    # sorting the bare tuples would compare the tinyexon/intron objects
    # themselves whenever two combinations have the same length
    tinyexoncombis.sort(key=lambda combi: combi[0])
    return [(exon1, intron, exon2)
            for (totlen, exon1, intron, exon2) in tinyexoncombis]
Example #5
0
def merge_orfs_with_intron(
        orfD,
        orfA,
        max_intron_nt_length=MAX_INTRON_NT_LENGTH,
        min_intron_nt_length=MIN_INTRON_NT_LENGTH,
        min_donor_pssm_score=MIN_DONOR_PSSM_SCORE,
        min_acceptor_pssm_score=MIN_ACCEPTOR_PSSM_SCORE,
        allow_non_canonical_donor=ALLOW_NON_CANONICAL_DONOR,
        allow_non_canonical_acceptor=ALLOW_NON_CANONICAL_ACCEPTOR,
        non_canonical_min_donor_pssm_score=NON_CANONICAL_MIN_DONOR_PSSM_SCORE,
        non_canonical_min_acceptor_pssm_score=NON_CANONICAL_MIN_ACCEPTOR_PSSM_SCORE,
        min_donor_pos=None,
        max_donor_pos=None,
        min_acceptor_pos=None,
        max_acceptor_pos=None,
        order_by='length',
        **kwargs):
    """
    Merge 2 Orf objects by introns

    Every donor site on orfD is combined with every acceptor site on orfA;
    a combination is reported as intron when both sites pass the filters
    below, their phases are compatible and the intron length is within bounds.

    @attention: **kwargs can contain other (here) unnecessarily arguments; they are ignored

    @type  orfD: Orf object
    @param orfD: Orf object that has to deliver a PSSM donor object

    @type  orfA: Orf object
    @param orfA: Orf object that has to deliver a PSSM acceptor object

    @type  max_intron_nt_length: integer
    @param max_intron_nt_length: maximal length (nt) of the intron; a False-like value disables this check

    @type  min_intron_nt_length: integer
    @param min_intron_nt_length: minimal length (nt) of the intron; a False-like value disables this check

    @type  min_donor_pssm_score: float
    @param min_donor_pssm_score: minimal pssm score of donor splice site

    @type  min_acceptor_pssm_score: float
    @param min_acceptor_pssm_score: minimal pssm score of acceptor splice site

    @type  allow_non_canonical_donor: Boolean
    @param allow_non_canonical_donor: search for non-canonical donor sites too

    @type  allow_non_canonical_acceptor: Boolean
    @param allow_non_canonical_acceptor: search for non-canonical acceptor splice sites too

    @type  non_canonical_min_donor_pssm_score: float
    @param non_canonical_min_donor_pssm_score: minimal pssm score of non-canonical donor

    @type  non_canonical_min_acceptor_pssm_score: float
    @param non_canonical_min_acceptor_pssm_score: minimal pssm score of non-canonical acceptor

    @type  min_donor_pos: integer or None
    @param min_donor_pos: donors with a lower .pos are ignored (None: no limit)

    @type  max_donor_pos: integer or None
    @param max_donor_pos: donors with a higher .pos are ignored (None: no limit)

    @type  min_acceptor_pos: integer or None
    @param min_acceptor_pos: acceptors with a lower .pos are ignored (None: no limit)

    @type  max_acceptor_pos: integer or None
    @param max_acceptor_pos: acceptors with a higher .pos are ignored (None: no limit)

    @type  order_by: string
    @param order_by: ordering criterion, handed over to _order_intron_list

    @rtype:  list
    @return: list with IntronConnectingOrfs objects as ordered by _order_intron_list;
             empty list when no intron is possible
    """
    # input validation
    IsOrf(orfD)
    IsOrf(orfA)

    # scan for splice sites (if not already done -> is checked in function)
    orfD.scan_orf_for_pssm_splice_sites(
        splicetype="donor",
        min_pssm_score=min_donor_pssm_score,
        allow_non_canonical=allow_non_canonical_donor,
        non_canonical_min_pssm_score=non_canonical_min_donor_pssm_score)
    orfA.scan_orf_for_pssm_splice_sites(
        splicetype="acceptor",
        min_pssm_score=min_acceptor_pssm_score,
        allow_non_canonical=allow_non_canonical_acceptor,
        non_canonical_min_pssm_score=non_canonical_min_acceptor_pssm_score)

    donor_sites = orfD._donor_sites
    acceptor_sites = orfA._acceptor_sites

    # most quickest scan possible: are there donors & acceptors?
    if not donor_sites or not acceptor_sites:
        # no introns possible because splice sites are missing
        return []

    # very quick scan: are exons not to far from each other?
    # The shortest intron that any donor/acceptor pair can form runs from the
    # most downstream donor to the most upstream acceptor. This is computed
    # with min/max so it does not depend on how the site lists are ordered.
    if max_intron_nt_length:
        shortest_intron = (min([a.pos for a in acceptor_sites]) -
                           max([d.pos for d in donor_sites]))
        if shortest_intron > max_intron_nt_length:
            # no introns possible that can bridge this gap
            return []

    # the acceptor filters do not depend on the donor: apply them only once
    accepted_acceptors = []
    for acceptor in acceptor_sites:
        if acceptor.is_canonical():
            if acceptor.pssm_score < min_acceptor_pssm_score:
                continue
        else:
            if not allow_non_canonical_acceptor:
                continue
            if acceptor.pssm_score < non_canonical_min_acceptor_pssm_score:
                continue
        if min_acceptor_pos is not None and acceptor.pos < min_acceptor_pos:
            continue
        if max_acceptor_pos is not None and acceptor.pos > max_acceptor_pos:
            continue
        accepted_acceptors.append(acceptor)

    # return list with introns
    introns = []

    for donor in donor_sites:
        if donor.is_canonical():
            if donor.pssm_score < min_donor_pssm_score:
                continue
        else:
            if not allow_non_canonical_donor:
                continue
            if donor.pssm_score < non_canonical_min_donor_pssm_score:
                continue
        if min_donor_pos is not None and donor.pos < min_donor_pos:
            continue
        if max_donor_pos is not None and donor.pos > max_donor_pos:
            continue

        # donor site accepted; combine it with all accepted acceptors
        for acceptor in accepted_acceptors:
            # generate intron length and phase variable
            intron_length = acceptor.pos - donor.pos
            intron_phase = intron_length % 3

            # check phase compatibilty (1) of splice sites
            if donor.phase != acceptor.phase:
                continue
            # check phase compatibilty (2) of splice sites: the intron must
            # shift the frame of orfD into the frame of orfA
            if (intron_phase + orfD.frame) % 3 != orfA.frame % 3:
                continue

            # check if intron length is in between the boundaries
            if max_intron_nt_length and intron_length > max_intron_nt_length:
                continue
            if min_intron_nt_length and intron_length < min_intron_nt_length:
                continue

            # okay, if we reach this point, we have a valid intron
            shared_nts = get_shared_nucleotides_at_splicesite(
                orfA, orfD, acceptor, donor)

            # make a IntronConnectingOrfs object
            introns.append(
                IntronConnectingOrfs(donor, acceptor, shared_nts, orfD, orfA))

    # return ordered intron list
    return _order_intron_list(introns, order_by=order_by)
Example #6
0
def merge_orfs_with_tinyexon(
        preceding_orf,
        subsequent_orf,
        preceding_donor_sites=[],
        subsequent_acceptor_sites=[],
        orflist=[],
        max_tinyexon_nt_length=TINYEXON_MAX_NT_LENGTH,
        min_tinyexon_nt_length=TINYEXON_MIN_NT_LENGTH,
        max_tinyexon_intron_nt_length=TINYEXON_MAX_INTRON_NT_LENGTH,
        min_tinyexon_intron_nt_length=TINYEXON_MIN_INTRON_NT_LENGTH,
        min_donor_pssm_score=TINYEXON_MIN_DONOR_PSSM_SCORE,
        min_acceptor_pssm_score=TINYEXON_MIN_ACCEPTOR_PSSM_SCORE,
        min_total_pssm_score=TINYEXON_MIN_TOTAL_PSSM_SCORE,
        **kwargs):
    """
    Bridge two `neighbouring` Orfs by a tinyexon by applying preceding donors and subsequent acceptors

    For every Orf in orflist, every combination of a preceding donor and a
    subsequent acceptor is handed to find_tiny_exon_on_orf; each tinyexon that
    is returned is reported together with its two flanking introns.

    @type  preceding_orf: Orf object
    @param preceding_orf: Orf object that contains preceding_donor_site(s)

    @type  subsequent_orf: Orf object
    @param subsequent_orf: Orf object that contains subsequent_acceptor_site(s)

    @type  preceding_donor_sites: list
    @param preceding_donor_sites: list with SpliceDonorGT and/or SpliceDonor objects

    @type  subsequent_acceptor_sites: list
    @param subsequent_acceptor_sites: list with SpliceAcceptorAG and/or SpliceAcceptor objects

    @type  orflist: list
    @param orflist: list with Orf objects

    @type  max_tinyexon_nt_length: integer
    @param max_tinyexon_nt_length: positive integer, largest length of tinyexon in nt

    @type  min_tinyexon_nt_length: integer
    @param min_tinyexon_nt_length: positive integer, smallest length of tinyexon in nt

    @type  max_tinyexon_intron_nt_length: integer
    @param max_tinyexon_intron_nt_length: positive integer, largest length of intron around tinyexon in nt

    @type  min_tinyexon_intron_nt_length: integer
    @param min_tinyexon_intron_nt_length: positive integer, smallest length of intron around tinyexon in nt

    @type  min_total_pssm_score: float or None
    @param min_total_pssm_score: minimal sum of donor - acceptor pssm score pair of tinyexon

    @type  min_donor_pssm_score: float or None
    @param min_donor_pssm_score: minimal donor pssm score of tinyexon; also applied to the
                                 preceding donor sites (None: those are not filtered)

    @type  min_acceptor_pssm_score: float or None
    @param min_acceptor_pssm_score: minimal acceptor pssm score of tinyexon; also applied to the
                                    subsequent acceptor sites (None: those are not filtered)

    @rtype:  list
    @return: list of tuples ( preceding_intron, tinyexon, subsequent_intron )

    @attention: **kwargs is accepted but not used
    @attention: the default lists are never modified by this function
    @attention: Global vars that have to be set upon usage (used as defaults):
        TINYEXON_MAX_NT_LENGTH
        TINYEXON_MIN_NT_LENGTH
        TINYEXON_MAX_INTRON_NT_LENGTH
        TINYEXON_MIN_INTRON_NT_LENGTH
        TINYEXON_MIN_DONOR_PSSM_SCORE
        TINYEXON_MIN_ACCEPTOR_PSSM_SCORE
        TINYEXON_MIN_TOTAL_PSSM_SCORE

    """
    if not preceding_donor_sites:
        return []
    if not subsequent_acceptor_sites:
        return []
    if not orflist:
        return []

    # check pssm_score of the donor / acceptor sites. This check does not
    # depend on the Orf, so it is done once. A threshold of None (allowed
    # according to the documentation) means: do not filter.
    # TODO: these are in fact the sites on the normal, large orfs
    # TODO: do we want to check these pssm scores?
    if min_donor_pssm_score is None:
        donors = list(preceding_donor_sites)
    else:
        donors = [
            d for d in preceding_donor_sites
            if not d.pssm_score < min_donor_pssm_score
        ]
    if min_acceptor_pssm_score is None:
        acceptors = list(subsequent_acceptor_sites)
    else:
        acceptors = [
            a for a in subsequent_acceptor_sites
            if not a.pssm_score < min_acceptor_pssm_score
        ]
    if not donors or not acceptors:
        # no usable splice site left on one of both sides
        return []

    # return list with (intron,tinyexon,intron) tuples
    returnexons = []
    min_donor_pos = min([d.pos for d in donors])
    max_acceptor_pos = max([a.pos for a in acceptors])
    for orfX in orflist:
        # check if orf is correctly positions towards the splice sites' extremes
        if orfX.endPY <= min_donor_pos:
            continue
        if orfX.startPY >= max_acceptor_pos:
            continue

        # if here, we can try to make a bridge by a tinyexon
        for donor in donors:
            # orf not correctly positions towards the donor site
            if orfX.endPY <= donor.pos:
                continue

            for acceptor in acceptors:
                # orf not correctly positions towards the acceptor site
                if orfX.startPY >= acceptor.pos:
                    continue

                # okay, now try to bridge it!
                exons = find_tiny_exon_on_orf(
                    orfX,
                    order_by='total_pssm',
                    max_tinyexon_nt_length=max_tinyexon_nt_length,
                    min_tinyexon_nt_length=min_tinyexon_nt_length,
                    max_tinyexon_intron_nt_length=max_tinyexon_intron_nt_length,
                    min_tinyexon_intron_nt_length=min_tinyexon_intron_nt_length,
                    min_donor_pssm_score=min_donor_pssm_score,
                    min_acceptor_pssm_score=min_acceptor_pssm_score,
                    min_total_pssm_score=min_total_pssm_score,
                    preceding_donor=donor,
                    subsequent_acceptor=acceptor)
                # and append to returnexons
                for tinyexon in exons:

                    # make preceding intron: preceding donor -> tinyexon acceptor
                    shared_nts_A = get_shared_nucleotides_at_splicesite(
                        tinyexon.orf, preceding_orf, tinyexon.acceptor, donor)
                    preceding_intron = IntronConnectingOrfs(
                        donor, tinyexon.acceptor, shared_nts_A, preceding_orf,
                        tinyexon.orf)

                    # make subsequent intron: tinyexon donor -> subsequent acceptor
                    shared_nts_B = get_shared_nucleotides_at_splicesite(
                        subsequent_orf, tinyexon.orf, acceptor, tinyexon.donor)
                    subsequent_intron = IntronConnectingOrfs(
                        tinyexon.donor, acceptor, shared_nts_B, tinyexon.orf,
                        subsequent_orf)

                    # and append to exons
                    returnexons.append(
                        (preceding_intron, tinyexon, subsequent_intron))

    # and return the list of intron/exon/intron
    return returnexons