Esempio n. 1
0
def _merge_pacbporfs_by_tinyexon_and_two_introns(pacbporfD,pacbporfA,
    orfSetObject,queryorsbjct,verbose = False, **kwargs):
    """
    Merge 2 PacbPORF objects by introns

    @attention: see pacb.connecting.merge_orfs_with_intron for **kwargs)

    @type  pacbporfD: PacbPORF object
    @param pacbporfD: PacbPORF object that has to deliver PSSM donor objects

    @type  pacbporfA: PacbPORF object
    @param pacbporfA: PacbPORF object that has to deliver PSSM acceptor objects

    @type  orfSetObject: object with elegiable Orfs
    @param orfSetObject: object with elegiable Orfs

    @type  queryorsbjct: string
    @param queryorsbjct: literal string 'query' or 'sbjct'

    @type  verbose: Boolean
    @param verbose: print debugging info to STDOUT when True

    @rtype:  list
    @return: list with ( intron, ExonOnOrf, intron ) on the query sequence
    """
    # input validation
    IsPacbPORF(pacbporfD)
    IsPacbPORF(pacbporfA)

    # edit **kwargs dictionary for some forced attributes
    _update_kwargs(kwargs,KWARGS_PROJECTED_TINYEXON)

    MAX_TINYEXON_NT_LENGTH = 33
    MIN_TINYEXON_NT_LENGTH = 6

    tinyexons = []
    if queryorsbjct == "query":
        donorOrf = pacbporfD.orfQ
        accepOrf = pacbporfA.orfQ
        prjctOrf = pacbporfD.orfS
        alignedDonorRange = pacbporfD.alignment_dna_range_query()
        alignedAccepRange = pacbporfA.alignment_dna_range_query()
    elif queryorsbjct == "sbjct":
        donorOrf = pacbporfD.orfS
        accepOrf = pacbporfA.orfS
        prjctOrf = pacbporfD.orfQ
        alignedDonorRange = pacbporfD.alignment_dna_range_sbjct()
        alignedAccepRange = pacbporfA.alignment_dna_range_sbjct()
    else:
        message = "'queryorsbjct' (%s), not 'query' or 'sbjct'" % queryorsbjct
        raise InproperlyAppliedArgument, message

    for dObj in donorOrf._donor_sites:
        # do not make a projection OVER the aligned area
        if dObj.pos < min(alignedDonorRange): continue
        if queryorsbjct == "query":
            (dPos,dPhase) = pacbporfD.dnaposition_query(dObj.pos,forced_return=True)
        else:
            (dPos,dPhase) = pacbporfD.dnaposition_sbjct(dObj.pos,forced_return=True)
        try:
            algDobj = pacbporfD._positions[dPos]
        except IndexError:
            # site out of range of PacbPORF -> break
            break
        for aObj in accepOrf._acceptor_sites:
            # do not make a projection OVER the aligned area
            if aObj.pos > max(alignedAccepRange): continue
            if queryorsbjct == "query":
                (aPos,aPhase) = pacbporfA.dnaposition_query(aObj.pos,forced_return=True)
            else:
                (aPos,aPhase) = pacbporfA.dnaposition_sbjct(aObj.pos,forced_return=True)
            try:
                algAobj = pacbporfA._positions[aPos]
            except IndexError:
                # site out of range of PacbPORF -> break
                break
            if queryorsbjct == "query":
                posDsbjct = algDobj.sbjct_dna_start + dPhase
                posAsbjct = algAobj.sbjct_dna_start + aPhase
            else:
                posDsbjct = algDobj.query_dna_start + dPhase
                posAsbjct = algAobj.query_dna_start + aPhase
            distance = posAsbjct - posDsbjct
            if distance >= MAX_TINYEXON_NT_LENGTH:
                break
            if distance < MIN_TINYEXON_NT_LENGTH:
                continue

            ####################################################
            # generate a ScanForMatches pattern file
            ####################################################
            # example pattern: 6...6 AG NNGNNANNANNGN[2,0,0] GT 3...3
            query = list(prjctOrf.inputgenomicsequence[posDsbjct:posAsbjct])
            # mask all non-phase0 nucleotides to N residues;
            # this represents the regularexpression for a specific
            # peptide sequence
            firstphasepositions = range( 3-dPhase % 3, len(query), 3)
            for pos in range(0,len(query)):
                if pos not in firstphasepositions:
                    query[pos] = "N"
            # calculate a ~50% mismatch number
            mismatches =  max([ 0, (len(query) - query.count("N"))/2 ])
            # write the pattern to string and subsequently to file
            # example pattern: 6...6 AG NNGNNANNANNGN[2,0,0] GT 3...3
            if kwargs['allow_non_canonical_donor']:
                sfmpat = "%s...%s AG %s[%s,0,0] G (T | C) %s...%s" % (
                    AUSO,AUSO,"".join(query),mismatches,DDSO,DDSO)
            else:
                sfmpat = "%s...%s AG %s[%s,0,0] GT %s...%s" % (
                    AUSO,AUSO,"".join(query),mismatches,DDSO,DDSO)

            ####################################################
            if verbose:
                print (pacbporfD.orfQ.id,pacbporfA.orfQ.id),
                print distance, dObj, aObj
                print sfmpat
            ####################################################

            fname = "sfmpat_tinyexon_%s_%s_%s_%s" % (
                        donorOrf.id,
                        accepOrf.id,
                        posDsbjct,
                        posAsbjct,
                        )
            fh = open(fname,'w')
            fh.write(sfmpat+"\n")
            fh.close()

            ####################################################
            # run ScanForMatches
            ####################################################
            command = """echo ">myseq\n%s" | %s %s | tr "[,]" "\t\t#" | """ +\
                      """tr -d "\n " | sed "s/>/\\n>/g" | tr "#" "\t" | """ +\
                      """awk -F'\t' '{ if (NF==4 && $2>%s && $3<%s) """ +\
                      """{ print $1"["$2","$3"]\\n"$4 } }' """
            command = command % (
                        donorOrf.inputgenomicsequence,
                        EXECUTABLE_SFM,fname,
                        dObj.pos+(kwargs['min_intron_nt_length']-3),
                        aObj.pos-(kwargs['min_intron_nt_length']-3) )
            co = osPopen(command)
            matches = parseFasta(co.readlines())
            co.close()

            # filter matches for:
            # (1) correct donor & acceptor phase
            # (2) high enough donor & acceptor site scores
            for hdr,seqmatch in matches.iteritems():
                startQ,stopQ = [ int(item) for item in hdr.split(":")[1][1:-1].split(",") ]
                exonQstart   = startQ + AUSO + 2 - 1
                exonQstop    = stopQ  - DDSO - 2

                ####################################
                # get Orf object of tinyexon
                ####################################
                tinyexonorf = None
                # select the Orf on which the tinyexon is located
                for orfObj in orfSetObject.get_elegiable_orfs(
                max_orf_start=exonQstart,min_orf_end=exonQstop):
                    orfPhase = (exonQstart - orfObj.startPY) % 3
                    if orfPhase == dPhase:               
                        tinyexonorf = orfObj
                        break
                else:
                    # No tinyexonorf assigned!! Iin case a regex matched
                    # over a STOP-codon or the regex length is smaller
                    # then the smallest Orf, no Orf can be assigned
                    continue

                # filter for donor & acceptor score            
                dScore = _score_splice_site(seqmatch[-9:],splicetype='donor')
                aScore = _score_splice_site(seqmatch[0:11],splicetype='acceptor')
                if dScore < kwargs['min_donor_pssm_score']:
                    continue
                if aScore < kwargs['min_acceptor_pssm_score']:
                    continue

                # scan Orf for splicesites
                tinyexonorf.scan_orf_for_pssm_splice_sites(
                        splicetype="donor",
                        min_pssm_score=kwargs['min_donor_pssm_score'],
                        allow_non_canonical=kwargs['allow_non_canonical_donor'],
                        non_canonical_min_pssm_score=kwargs['non_canonical_min_donor_pssm_score'])
                tinyexonorf.scan_orf_for_pssm_splice_sites(
                        splicetype="acceptor",
                        min_pssm_score=kwargs['min_acceptor_pssm_score'],
                        allow_non_canonical=kwargs['allow_non_canonical_acceptor'],
                        non_canonical_min_pssm_score=kwargs['non_canonical_min_acceptor_pssm_score'])

                # get 1th intron donor object
                intron1_aObj = None
                for a in tinyexonorf._acceptor_sites:
                    if a.pos == exonQstart:
                        intron1_aObj = a
                        break
                else:
                    # pseudo-acceptorsite as found be SFM regex
                    # is not a valid acceptor site of high enough score
                    # continue to next iteration of (hdr,seqmatch) pair
                    continue

                # get 2th intron donor object
                intron2_dObj = None
                for d in tinyexonorf._donor_sites:
                    if d.pos == exonQstop:
                        intron2_dObj = d
                        break
                else:
                    # pseudo-donorsite as found be SFM regex
                    # is not a valid acceptor site of high enough score
                    # continue to next iteration of (hdr,seqmatch) pair
                    continue


                # check if introns are of elegiable lengths
                if (intron1_aObj.pos-dObj.pos) > kwargs['max_intron_nt_length']:
                    continue
                if (aObj.pos-intron2_dObj.pos) > kwargs['max_intron_nt_length']:
                    continue

                ####################################################
                if True or verbose:
                    # if here, a candidate!!!
                    print (pacbporfD.orfQ.id,tinyexonorf.id,pacbporfA.orfQ.id),
                    print hdr, dScore, aScore
                    print seqmatch
                ####################################################

                # append to found tinyexons
                query_data      = ( tinyexonorf, exonQstart, exonQstop )
                sbjct_data      = ( prjctOrf, posDsbjct, posAsbjct )
                splicesite_data = ( dObj,intron1_aObj, intron2_dObj, aObj )
                tinyexons.append( ( query_data, sbjct_data, splicesite_data ) )


            # file cleanup
            osRemove(fname)

    # return - End Of Function - if no tinyexons are found
    if not tinyexons:
        return []

    ####################################
    # select the **best** tinyexon
    ####################################
    (query_data,sbjct_data,splicesite_data) = tinyexons[0]
    orfQ,query_dna_start,query_dna_end = query_data
    orfS,sbjct_dna_start,sbjct_dna_end = sbjct_data
    (intron1_dObj,intron1_aObj,intron2_dObj,intron2_aObj) = splicesite_data

    ####################################################
    if verbose:
        print "tinyexon orf:", orfQ
        print "tinyexon orf:", intron1_aObj
        print "tinyexon orf:", intron2_dObj
    ####################################################

    ####################################
    # make tinyexon PacbPORF
    ####################################
    startQaa = orfQ.dnapos2aapos(query_dna_start) -1
    startSaa = orfS.dnapos2aapos(sbjct_dna_start) -1
    stopQaa  = orfQ.dnapos2aapos(query_dna_end) +1
    stopSaa  = orfS.dnapos2aapos(sbjct_dna_end) +1
    # check for directly leading stop codon on tinyexon
    while startQaa <= orfQ.protein_startPY:
        startQaa+=1
        startSaa+=1
        query_dna_start+=3
        sbjct_dna_start+=3
    while startSaa <= orfS.protein_startPY:
        startQaa+=1
        startSaa+=1
        query_dna_start+=3
        sbjct_dna_start+=3
    # check for directly tailing stop codon on tinyexon
    while stopQaa > orfQ.protein_endPY:
        stopQaa-=1
        stopSaa-=1
        query_dna_end-=3
        sbjct_dna_end-=3
    while stopSaa > orfS.protein_endPY:
        stopQaa-=1
        stopSaa-=1
        query_dna_end-=3
        sbjct_dna_end-=3
    # get sequences
    qAAseq = orfQ.getaas(abs_pos_start=startQaa,abs_pos_end=stopQaa)
    sAAseq = orfS.getaas(abs_pos_start=startSaa,abs_pos_end=stopSaa)

    ####################################################
    if verbose or len(qAAseq) != len(sAAseq):
        # if unequal lengths, error will be raised upon PacbP.__init__()
        print orfQ, qAAseq, startQaa, stopQaa, (stopQaa-startQaa),
        print (query_dna_start,query_dna_end)
        print orfS, sAAseq, startSaa, stopSaa, (stopSaa-startSaa),
        print (sbjct_dna_start,sbjct_dna_end)
        print orfQ.inputgenomicsequence[query_dna_start-2:query_dna_end+2]
        print orfS.inputgenomicsequence[sbjct_dna_start-2:sbjct_dna_end+2]
    ####################################################

    # initialize extended tinyexon PacbPORF
    from pacb import PacbP
    pacbp = PacbP(input=( qAAseq, sAAseq, startQaa, startSaa ) )
    pacbp.strip_unmatched_ends()
    pacbporf = pacbp2pacbporf(pacbp,orfQ,orfS)
    pacbporf.extend_pacbporf_after_stops()
    pacbporf.source = 'ABGPprojectingTE'

    ####################################
    # make introns
    ####################################
    intron1 = IntronConnectingOrfs(
                intron1_dObj, intron1_aObj, None,
                donorOrf,pacbporf.orfQ )
    intron2 = IntronConnectingOrfs(
                intron2_dObj, intron2_aObj, None,
                pacbporf.orfQ, accepOrf )


    ################################################################
    # set some meta-data properties to the intron objects
    ################################################################
    # add distance score to intron
    intron1._distance = 0
    intron2._distance = 0

    # add Alignment Positional Periphery Score into objects
    if queryorsbjct == "query":
        succes = set_apps_intron_query(intron1,pacbporfD,pacbporf)
        succes = set_apps_intron_query(intron2,pacbporf,pacbporfA)
    else:
        succes = set_apps_intron_sbjct(intron1,pacbporfD,pacbporf)
        succes = set_apps_intron_sbjct(intron2,pacbporf,pacbporfA)

    # set GFF fsource attribute for recognition of intron sources
    intron1._gff['fsource'] = "ABGPprojectingTE"
    intron2._gff['fsource'] = "ABGPprojectingTE"

    # create _linked_to_xxx attributes
    intron1._linked_to_pacbporfs = [ pacbporf ]
    intron2._linked_to_pacbporfs = [ pacbporf ]
    intron1._linked_to_introns   = [ intron2 ]
    intron2._linked_to_introns   = [ intron1 ]

    ####################################################
    if verbose:
        print pacbporf
        pacbporf.print_protein_and_dna()
        print intron1
        print intron2
        if False:
            # printing data when this function needs to be debugged:
            print ""
            print intron1
            print intron2
            print ""
            print pacbporfD
            pacbporfD.print_protein_and_dna()
            print ""
            print pacbporf
            pacbporf.print_protein_and_dna()
            print ""
            print pacbporfA
            pacbporfA.print_protein_and_dna()
            import sys
            sys.exit()
    ####################################################

    # return introns and intermediate tinyexon PacbPORF
    return [(intron1,intron2,pacbporf)]
Esempio n. 2
0
def _merge_pacbporfs_by_two_tinyexons(pacbporfD,pacbporfA,
    orfSetObject,queryorsbjct,verbose = False, **kwargs):
    """ """
    # edit **kwargs dictionary for some forced attributes
    _update_kwargs(kwargs,KWARGS_PROJECTED_TINYEXON)

    tinyexons = []
    sposD = pacbporfD._get_original_alignment_pos_start()
    eposD = pacbporfD._get_original_alignment_pos_end()
    sposA = pacbporfA._get_original_alignment_pos_start()
    eposA = pacbporfA._get_original_alignment_pos_end()
    if queryorsbjct == "query":
        donorOrf = pacbporfD.orfQ
        accepOrf = pacbporfA.orfQ
        prjctOrf = pacbporfD.orfS
        dStart,dEnd = sposD.query_dna_start, eposD.query_dna_end
        aStart,aEnd = sposA.query_dna_start, eposA.query_dna_end
    elif queryorsbjct == "sbjct":
        donorOrf = pacbporfD.orfS
        accepOrf = pacbporfA.orfS
        prjctOrf = pacbporfD.orfQ
        dStart,dEnd = sposD.sbjct_dna_start, eposD.sbjct_dna_end
        aStart,aEnd = sposA.sbjct_dna_start, eposA.sbjct_dna_end
    else:
        message = "'queryorsbjct' (%s), not 'query' or 'sbjct'" % queryorsbjct
        raise InproperlyAppliedArgument, message

    # get all potential combinations of two tinyexons
    tinyexoncombis = merge_orfs_with_two_tinyexons(
                donorOrf, accepOrf,
                donorOrf._donor_sites,
                accepOrf._acceptor_sites,
                orfSetObject.orfs,
                )

    results = []

    for dObj in donorOrf._donor_sites:
        if queryorsbjct == "query":
            (dPos,dPhase) = pacbporfD.dnaposition_query(dObj.pos,forced_return=True)
        else:
            (dPos,dPhase) = pacbporfD.dnaposition_sbjct(dObj.pos,forced_return=True)
        try:
            algDobj = pacbporfD._positions[dPos]
        except IndexError:
            # site out of range of PacbPORF -> break
            break

        # check if dObj is on pfD;
        # introns of tinyexons can be projected outside of pfD/pfA area
        if dObj.pos < dStart: continue

        for aObj in accepOrf._acceptor_sites:
            if queryorsbjct == "query":
                (aPos,aPhase) = pacbporfA.dnaposition_query(aObj.pos,forced_return=True)
            else:
                (aPos,aPhase) = pacbporfA.dnaposition_sbjct(aObj.pos,forced_return=True)
            try:
                algAobj = pacbporfA._positions[aPos]
            except IndexError:
                # site out of range of PacbPORF -> break
                break

            # check if aObj is on pfA;
            # introns of tinyexons can be projected outside of pfD/pfA area
            if aObj.pos > aEnd: continue

            if queryorsbjct == "query":
                posDsbjct = algDobj.sbjct_dna_start + dPhase
                posAsbjct = algAobj.sbjct_dna_start + aPhase
            else:
                posDsbjct = algDobj.query_dna_start + dPhase
                posAsbjct = algAobj.query_dna_start + aPhase
            distance = posAsbjct - posDsbjct
            if distance >= (kwargs['max_tinyexon_nt_length']*2):
                break
            if distance < (kwargs['min_tinyexon_nt_length']*2):
                continue

            filtered_tinyexoncombis = _filter_tinyexoncombis(tinyexoncombis,
                    min_length = distance,
                    max_length = distance,
                    min_first_acceptor_pos = dObj.pos + kwargs['min_tinyexon_intron_nt_length'],
                    max_final_donor_pos = aObj.pos - kwargs['min_tinyexon_intron_nt_length'],
                    phase_final_donor = aObj.phase,
                    phase_first_acceptor= dObj.phase,
                    )

            if not filtered_tinyexoncombis: continue

            ####################################################################
            if verbose:
                print distance, dObj, aObj, len(tinyexoncombis),
                print len(filtered_tinyexoncombis)
            ####################################################################

            for exon1,intron,exon2 in filtered_tinyexoncombis:
                # make preceding intron
                preceding_intron = IntronConnectingOrfs(
                    dObj,exon1.acceptor,
                    None,donorOrf,exon1.orf )

                # make subsequent intron
                subsequent_intron = IntronConnectingOrfs(
                    exon2.donor, aObj,
                    None,exon2.orf,accepOrf)

                ################################################################
                if verbose:
                    print "\t", exon1, exon1.proteinsequence(),
                    print preceding_intron.phase, exon1.donor.phase,
                    print subsequent_intron.phase, preceding_intron.shared_aa,
                    print intron.shared_aa, subsequent_intron.shared_aa 
                    print "\t", exon2, exon2.proteinsequence()
                ################################################################

                # get prjctOrf sequence for comparison
                correctionA = 0
                if aObj.phase != 0:
                    # INCLUDE the final AA which is broken by the splicesite
                    correctionA=1
                if queryorsbjct == "query":
                    startPos,_phase = pacbporfD.dnaposition_query(dObj.pos,forced_return=True)
                    stopPos,_phase  = pacbporfA.dnaposition_query(aObj.pos,forced_return=True)
                    start = pacbporfD._positions[startPos].sbjct_pos
                    stop  = pacbporfA._positions[stopPos].sbjct_pos + correctionA
                else:
                    startPos,_phase = pacbporfD.dnaposition_sbjct(dObj.pos,forced_return=True)
                    stopPos,_phase  = pacbporfA.dnaposition_sbjct(aObj.pos,forced_return=True)
                    start = pacbporfD._positions[startPos].query_pos
                    stop  = pacbporfA._positions[stopPos].query_pos + correctionA

                if stop <= start:
                    # tinyexon is so tiny that is does not have a single
                    # full aligned AA -> discard here
                    continue

                # actually get the prjctOrf sequence
                aaseq = prjctOrf.getaas(abs_pos_start=start,abs_pos_end=stop)

                # initialize a PacbP for the combination of both tinyexons
                # afterwards, check if the indentityscore is > 0.XX
                from pacb import PacbP
                seqparts = [ preceding_intron.shared_aa,
                             exon1.proteinsequence(),
                             intron.shared_aa,
                             exon2.proteinsequence(),
                             subsequent_intron.shared_aa ]

                ################################################################
                if verbose or len("".join(seqparts)) != len(aaseq):
                    print pacbporfD
                    print exon1.orf, exon2.orf, prjctOrf
                    print pacbporfA
                    print seqparts
                    print aaseq, len(aaseq), len("".join(seqparts)), (start,stop)
                    print "'%s'" % queryorsbjct,
                    print "Q", (algDobj.query_pos, algAobj.query_pos),
                    print "S", (algDobj.sbjct_pos, algAobj.sbjct_pos)
                    print "distance:", distance, kwargs['max_tinyexon_nt_length'],
                    print (posDsbjct, posAsbjct),
                    print "Q-dna:", ( algDobj.query_dna_start, dPhase, algAobj.query_dna_start, aPhase ),
                    print "S-dna:", ( algDobj.sbjct_dna_start, dPhase, algAobj.sbjct_dna_start, aPhase )
                ################################################################

                # ignore by continue when sequences not identical in length
                if len("".join(seqparts)) != len(aaseq): continue

                testpacbp = PacbP(input=( "".join(seqparts), aaseq, 0, 0) )
                testpacbp.strip_unmatched_ends()

                if not ( testpacbp.identityscore > 0.60 and\
                (float(testpacbp.length) / len(aaseq)) > 0.70 ):
                    # not a very convincing alignment
                    continue

                ################################################################
                if verbose:
                    print testpacbp
                    testpacbp.print_protein()
                ################################################################

                # if here, succesfully mapped 2 tiny exons!!
                # get all sequences/coordinates in place for
                # pacbporf formation
                orfQ1   = exon1.orf
                orfS1   = prjctOrf
                orfQ2   = exon2.orf
                orfS2   = prjctOrf
                seqQ1   = exon1.proteinsequence()
                seqQ2   = exon2.proteinsequence()
                coordQ1 = exon1.acceptor.pos / 3
                coordS1 = start
                coordQ2 = exon2.acceptor.pos / 3
                coordS2 = start + len(seqparts[0]) + len(seqparts[1]) + len(seqparts[2])
                seqS1   = aaseq[0:(len(seqparts[0])+len(seqparts[1]))]
                seqS2   = aaseq[-(len(seqparts[3])+len(seqparts[4])):]
                if len(seqparts[0]):
                    seqS1 = seqS1[1:]
                    coordS1 += 1
                if len(seqparts[4]):
                    seqS2 = seqS2[:-1]

                if queryorsbjct == "sbjct": 
                    # swap query <-> sbjct
                    orfQ1,orfS1 = orfS1,orfQ1 
                    orfQ2,orfS2 = orfS2,orfQ2
                    seqQ1,seqS1 = seqS1,seqQ1
                    seqQ2,seqS2 = seqS2,seqQ2
                    coordQ1,coordS1 = coordS1,coordQ1
                    coordQ2,coordS2 = coordS2,coordQ2

                ################################################################
                if verbose:
                    print "tinypacbporf1:", seqQ1, seqQ2, coordQ1, coordQ2
                    print "tinypacbporf2:", seqS1, seqS2, coordS1, coordS2
                ################################################################


                # make pacbporfs
                pacbp1 = PacbP(input=( seqQ1, seqS1, coordQ1, coordS1) )
                pacbp1.strip_unmatched_ends()
                tinypacbporf1 = pacbp2pacbporf(pacbp1,orfQ1,orfS1)
                tinypacbporf1.extend_pacbporf_after_stops()
                pacbp2 = PacbP(input=( seqQ2, seqS2, coordQ2, coordS2) )
                pacbp2.strip_unmatched_ends()
                tinypacbporf2 = pacbp2pacbporf(pacbp2,orfQ2,orfS2)
                tinypacbporf2.extend_pacbporf_after_stops()

                ################################################################
                if verbose:
                    print tinypacbporf1
                    tinypacbporf1.print_protein_and_dna()
                    print tinypacbporf2
                    tinypacbporf2.print_protein_and_dna()
                ################################################################


                ################################################################
                # set some meta-data properties to the intron objects
                ################################################################
                # add distance score to intron
                preceding_intron._distance  = 0
                intron._distance            = 0
                subsequent_intron._distance = 0
            
                # add Alignment Positional Periphery Score into objects
                if queryorsbjct == "query":
                    succes = set_apps_intron_query(preceding_intron,pacbporfD,tinypacbporf1)
                    succes = set_apps_intron_query(intron,tinypacbporf1,tinypacbporf2)
                    succes = set_apps_intron_query(subsequent_intron,tinypacbporf2,pacbporfA)
                else:
                    succes = set_apps_intron_sbjct(preceding_intron,pacbporfD,tinypacbporf1)
                    succes = set_apps_intron_sbjct(intron,tinypacbporf1,tinypacbporf2)
                    succes = set_apps_intron_sbjct(subsequent_intron,tinypacbporf2,pacbporfA)
            
                # set GFF fsource attribute for recognition of intron sources
                preceding_intron._gff['fsource']  = "ABGPprojectingTE"
                intron._gff['fsource']            = "ABGPprojectingTE"
                subsequent_intron._gff['fsource'] = "ABGPprojectingTE"


                # create _linked_to_xxx attributes
                preceding_intron._linked_to_pacbporfs = [ tinypacbporf1, tinypacbporf2 ]
                intron._linked_to_pacbporfs = [ tinypacbporf1, tinypacbporf2 ]
                subsequent_intron._linked_to_pacbporfs = [ tinypacbporf1, tinypacbporf2 ]
                preceding_intron._linked_to_introns   = [ intron,subsequent_intron ]
                intron._linked_to_introns             = [ preceding_intron,subsequent_intron ]
                subsequent_intron._linked_to_introns  = [ intron,preceding_intron ]

                ################################################################
                # append to results
                ################################################################
                results.append( (
                    preceding_intron,
                    intron,
                    subsequent_intron,
                    tinypacbporf1,
                    tinypacbporf2,
                    ) )


    # return 3 introns and 2 intermediate tinyexon PacbPORFs (per row)
    return results
Esempio n. 3
0
def _merge_pacbporfs_by_two_tinyexons(pacbporfD,
                                      pacbporfA,
                                      orfSetObject,
                                      queryorsbjct,
                                      verbose=False,
                                      **kwargs):
    """ """
    # edit **kwargs dictionary for some forced attributes
    _update_kwargs(kwargs, KWARGS_PROJECTED_TINYEXON)

    tinyexons = []
    sposD = pacbporfD._get_original_alignment_pos_start()
    eposD = pacbporfD._get_original_alignment_pos_end()
    sposA = pacbporfA._get_original_alignment_pos_start()
    eposA = pacbporfA._get_original_alignment_pos_end()
    if queryorsbjct == "query":
        donorOrf = pacbporfD.orfQ
        accepOrf = pacbporfA.orfQ
        prjctOrf = pacbporfD.orfS
        dStart, dEnd = sposD.query_dna_start, eposD.query_dna_end
        aStart, aEnd = sposA.query_dna_start, eposA.query_dna_end
    elif queryorsbjct == "sbjct":
        donorOrf = pacbporfD.orfS
        accepOrf = pacbporfA.orfS
        prjctOrf = pacbporfD.orfQ
        dStart, dEnd = sposD.sbjct_dna_start, eposD.sbjct_dna_end
        aStart, aEnd = sposA.sbjct_dna_start, eposA.sbjct_dna_end
    else:
        message = "'queryorsbjct' (%s), not 'query' or 'sbjct'" % queryorsbjct
        raise InproperlyAppliedArgument, message

    # get all potential combinations of two tinyexons
    tinyexoncombis = merge_orfs_with_two_tinyexons(
        donorOrf,
        accepOrf,
        donorOrf._donor_sites,
        accepOrf._acceptor_sites,
        orfSetObject.orfs,
    )

    results = []

    for dObj in donorOrf._donor_sites:
        if queryorsbjct == "query":
            (dPos, dPhase) = pacbporfD.dnaposition_query(dObj.pos,
                                                         forced_return=True)
        else:
            (dPos, dPhase) = pacbporfD.dnaposition_sbjct(dObj.pos,
                                                         forced_return=True)
        try:
            algDobj = pacbporfD._positions[dPos]
        except IndexError:
            # site out of range of PacbPORF -> break
            break

        # check if dObj is on pfD;
        # introns of tinyexons can be projected outside of pfD/pfA area
        if dObj.pos < dStart: continue

        for aObj in accepOrf._acceptor_sites:
            if queryorsbjct == "query":
                (aPos,
                 aPhase) = pacbporfA.dnaposition_query(aObj.pos,
                                                       forced_return=True)
            else:
                (aPos,
                 aPhase) = pacbporfA.dnaposition_sbjct(aObj.pos,
                                                       forced_return=True)
            try:
                algAobj = pacbporfA._positions[aPos]
            except IndexError:
                # site out of range of PacbPORF -> break
                break

            # check if aObj is on pfA;
            # introns of tinyexons can be projected outside of pfD/pfA area
            if aObj.pos > aEnd: continue

            if queryorsbjct == "query":
                posDsbjct = algDobj.sbjct_dna_start + dPhase
                posAsbjct = algAobj.sbjct_dna_start + aPhase
            else:
                posDsbjct = algDobj.query_dna_start + dPhase
                posAsbjct = algAobj.query_dna_start + aPhase
            distance = posAsbjct - posDsbjct
            if distance >= (kwargs['max_tinyexon_nt_length'] * 2):
                break
            if distance < (kwargs['min_tinyexon_nt_length'] * 2):
                continue

            filtered_tinyexoncombis = _filter_tinyexoncombis(
                tinyexoncombis,
                min_length=distance,
                max_length=distance,
                min_first_acceptor_pos=dObj.pos +
                kwargs['min_tinyexon_intron_nt_length'],
                max_final_donor_pos=aObj.pos -
                kwargs['min_tinyexon_intron_nt_length'],
                phase_final_donor=aObj.phase,
                phase_first_acceptor=dObj.phase,
            )

            if not filtered_tinyexoncombis: continue

            ####################################################################
            if verbose:
                print distance, dObj, aObj, len(tinyexoncombis),
                print len(filtered_tinyexoncombis)
            ####################################################################

            for exon1, intron, exon2 in filtered_tinyexoncombis:
                # make preceding intron
                preceding_intron = IntronConnectingOrfs(
                    dObj, exon1.acceptor, None, donorOrf, exon1.orf)

                # make subsequent intron
                subsequent_intron = IntronConnectingOrfs(
                    exon2.donor, aObj, None, exon2.orf, accepOrf)

                ################################################################
                if verbose:
                    print "\t", exon1, exon1.proteinsequence(),
                    print preceding_intron.phase, exon1.donor.phase,
                    print subsequent_intron.phase, preceding_intron.shared_aa,
                    print intron.shared_aa, subsequent_intron.shared_aa
                    print "\t", exon2, exon2.proteinsequence()
                ################################################################

                # get prjctOrf sequence for comparison
                correctionA = 0
                if aObj.phase != 0:
                    # INCLUDE the final AA which is broken by the splicesite
                    correctionA = 1
                if queryorsbjct == "query":
                    startPos, _phase = pacbporfD.dnaposition_query(
                        dObj.pos, forced_return=True)
                    stopPos, _phase = pacbporfA.dnaposition_query(
                        aObj.pos, forced_return=True)
                    start = pacbporfD._positions[startPos].sbjct_pos
                    stop = pacbporfA._positions[stopPos].sbjct_pos + correctionA
                else:
                    startPos, _phase = pacbporfD.dnaposition_sbjct(
                        dObj.pos, forced_return=True)
                    stopPos, _phase = pacbporfA.dnaposition_sbjct(
                        aObj.pos, forced_return=True)
                    start = pacbporfD._positions[startPos].query_pos
                    stop = pacbporfA._positions[stopPos].query_pos + correctionA

                if stop <= start:
                    # tinyexon is so tiny that is does not have a single
                    # full aligned AA -> discard here
                    continue

                # actually get the prjctOrf sequence
                aaseq = prjctOrf.getaas(abs_pos_start=start, abs_pos_end=stop)

                # initialize a PacbP for the combination of both tinyexons
                # afterwards, check if the indentityscore is > 0.XX
                from pacb import PacbP
                seqparts = [
                    preceding_intron.shared_aa,
                    exon1.proteinsequence(), intron.shared_aa,
                    exon2.proteinsequence(), subsequent_intron.shared_aa
                ]

                ################################################################
                if verbose or len("".join(seqparts)) != len(aaseq):
                    print pacbporfD
                    print exon1.orf, exon2.orf, prjctOrf
                    print pacbporfA
                    print seqparts
                    print aaseq, len(aaseq), len("".join(seqparts)), (start,
                                                                      stop)
                    print "'%s'" % queryorsbjct,
                    print "Q", (algDobj.query_pos, algAobj.query_pos),
                    print "S", (algDobj.sbjct_pos, algAobj.sbjct_pos)
                    print "distance:", distance, kwargs[
                        'max_tinyexon_nt_length'],
                    print(posDsbjct, posAsbjct),
                    print "Q-dna:", (algDobj.query_dna_start, dPhase,
                                     algAobj.query_dna_start, aPhase),
                    print "S-dna:", (algDobj.sbjct_dna_start, dPhase,
                                     algAobj.sbjct_dna_start, aPhase)
                ################################################################

                # ignore by continue when sequences not identical in length
                if len("".join(seqparts)) != len(aaseq): continue

                testpacbp = PacbP(input=("".join(seqparts), aaseq, 0, 0))
                testpacbp.strip_unmatched_ends()

                if not ( testpacbp.identityscore > 0.60 and\
                (float(testpacbp.length) / len(aaseq)) > 0.70 ):
                    # not a very convincing alignment
                    continue

                ################################################################
                if verbose:
                    print testpacbp
                    testpacbp.print_protein()
                ################################################################

                # if here, succesfully mapped 2 tiny exons!!
                # get all sequences/coordinates in place for
                # pacbporf formation
                orfQ1 = exon1.orf
                orfS1 = prjctOrf
                orfQ2 = exon2.orf
                orfS2 = prjctOrf
                seqQ1 = exon1.proteinsequence()
                seqQ2 = exon2.proteinsequence()
                coordQ1 = exon1.acceptor.pos / 3
                coordS1 = start
                coordQ2 = exon2.acceptor.pos / 3
                coordS2 = start + len(seqparts[0]) + len(seqparts[1]) + len(
                    seqparts[2])
                seqS1 = aaseq[0:(len(seqparts[0]) + len(seqparts[1]))]
                seqS2 = aaseq[-(len(seqparts[3]) + len(seqparts[4])):]
                if len(seqparts[0]):
                    seqS1 = seqS1[1:]
                    coordS1 += 1
                if len(seqparts[4]):
                    seqS2 = seqS2[:-1]

                if queryorsbjct == "sbjct":
                    # swap query <-> sbjct
                    orfQ1, orfS1 = orfS1, orfQ1
                    orfQ2, orfS2 = orfS2, orfQ2
                    seqQ1, seqS1 = seqS1, seqQ1
                    seqQ2, seqS2 = seqS2, seqQ2
                    coordQ1, coordS1 = coordS1, coordQ1
                    coordQ2, coordS2 = coordS2, coordQ2

                ################################################################
                if verbose:
                    print "tinypacbporf1:", seqQ1, seqQ2, coordQ1, coordQ2
                    print "tinypacbporf2:", seqS1, seqS2, coordS1, coordS2
                ################################################################

                # make pacbporfs
                pacbp1 = PacbP(input=(seqQ1, seqS1, coordQ1, coordS1))
                pacbp1.strip_unmatched_ends()
                tinypacbporf1 = pacbp2pacbporf(pacbp1, orfQ1, orfS1)
                tinypacbporf1.extend_pacbporf_after_stops()
                pacbp2 = PacbP(input=(seqQ2, seqS2, coordQ2, coordS2))
                pacbp2.strip_unmatched_ends()
                tinypacbporf2 = pacbp2pacbporf(pacbp2, orfQ2, orfS2)
                tinypacbporf2.extend_pacbporf_after_stops()

                ################################################################
                if verbose:
                    print tinypacbporf1
                    tinypacbporf1.print_protein_and_dna()
                    print tinypacbporf2
                    tinypacbporf2.print_protein_and_dna()
                ################################################################

                ################################################################
                # set some meta-data properties to the intron objects
                ################################################################
                # add distance score to intron
                preceding_intron._distance = 0
                intron._distance = 0
                subsequent_intron._distance = 0

                # add Alignment Positional Periphery Score into objects
                if queryorsbjct == "query":
                    succes = set_apps_intron_query(preceding_intron, pacbporfD,
                                                   tinypacbporf1)
                    succes = set_apps_intron_query(intron, tinypacbporf1,
                                                   tinypacbporf2)
                    succes = set_apps_intron_query(subsequent_intron,
                                                   tinypacbporf2, pacbporfA)
                else:
                    succes = set_apps_intron_sbjct(preceding_intron, pacbporfD,
                                                   tinypacbporf1)
                    succes = set_apps_intron_sbjct(intron, tinypacbporf1,
                                                   tinypacbporf2)
                    succes = set_apps_intron_sbjct(subsequent_intron,
                                                   tinypacbporf2, pacbporfA)

                # set GFF fsource attribute for recognition of intron sources
                preceding_intron._gff['fsource'] = "ABGPprojectingTE"
                intron._gff['fsource'] = "ABGPprojectingTE"
                subsequent_intron._gff['fsource'] = "ABGPprojectingTE"

                # create _linked_to_xxx attributes
                preceding_intron._linked_to_pacbporfs = [
                    tinypacbporf1, tinypacbporf2
                ]
                intron._linked_to_pacbporfs = [tinypacbporf1, tinypacbporf2]
                subsequent_intron._linked_to_pacbporfs = [
                    tinypacbporf1, tinypacbporf2
                ]
                preceding_intron._linked_to_introns = [
                    intron, subsequent_intron
                ]
                intron._linked_to_introns = [
                    preceding_intron, subsequent_intron
                ]
                subsequent_intron._linked_to_introns = [
                    intron, preceding_intron
                ]

                ################################################################
                # append to results
                ################################################################
                results.append((
                    preceding_intron,
                    intron,
                    subsequent_intron,
                    tinypacbporf1,
                    tinypacbporf2,
                ))

    # return 3 introns and 2 intermediate tinyexon PacbPORFs (per row)
    return results
Esempio n. 4
0
def _merge_pacbporfs_by_tinyexon_and_two_introns(pacbporfD,
                                                 pacbporfA,
                                                 orfSetObject,
                                                 queryorsbjct,
                                                 verbose=False,
                                                 **kwargs):
    """
    Merge 2 PacbPORF objects by introns

    @attention: see pacb.connecting.merge_orfs_with_intron for **kwargs)

    @type  pacbporfD: PacbPORF object
    @param pacbporfD: PacbPORF object that has to deliver PSSM donor objects

    @type  pacbporfA: PacbPORF object
    @param pacbporfA: PacbPORF object that has to deliver PSSM acceptor objects

    @type  orfSetObject: object with elegiable Orfs
    @param orfSetObject: object with elegiable Orfs

    @type  queryorsbjct: string
    @param queryorsbjct: literal string 'query' or 'sbjct'

    @type  verbose: Boolean
    @param verbose: print debugging info to STDOUT when True

    @rtype:  list
    @return: list with ( intron, ExonOnOrf, intron ) on the query sequence
    """
    # input validation
    IsPacbPORF(pacbporfD)
    IsPacbPORF(pacbporfA)

    # edit **kwargs dictionary for some forced attributes
    _update_kwargs(kwargs, KWARGS_PROJECTED_TINYEXON)

    MAX_TINYEXON_NT_LENGTH = 33
    MIN_TINYEXON_NT_LENGTH = 6

    tinyexons = []
    if queryorsbjct == "query":
        donorOrf = pacbporfD.orfQ
        accepOrf = pacbporfA.orfQ
        prjctOrf = pacbporfD.orfS
        alignedDonorRange = pacbporfD.alignment_dna_range_query()
        alignedAccepRange = pacbporfA.alignment_dna_range_query()
    elif queryorsbjct == "sbjct":
        donorOrf = pacbporfD.orfS
        accepOrf = pacbporfA.orfS
        prjctOrf = pacbporfD.orfQ
        alignedDonorRange = pacbporfD.alignment_dna_range_sbjct()
        alignedAccepRange = pacbporfA.alignment_dna_range_sbjct()
    else:
        message = "'queryorsbjct' (%s), not 'query' or 'sbjct'" % queryorsbjct
        raise InproperlyAppliedArgument, message

    for dObj in donorOrf._donor_sites:
        # do not make a projection OVER the aligned area
        if dObj.pos < min(alignedDonorRange): continue
        if queryorsbjct == "query":
            (dPos, dPhase) = pacbporfD.dnaposition_query(dObj.pos,
                                                         forced_return=True)
        else:
            (dPos, dPhase) = pacbporfD.dnaposition_sbjct(dObj.pos,
                                                         forced_return=True)
        try:
            algDobj = pacbporfD._positions[dPos]
        except IndexError:
            # site out of range of PacbPORF -> break
            break
        for aObj in accepOrf._acceptor_sites:
            # do not make a projection OVER the aligned area
            if aObj.pos > max(alignedAccepRange): continue
            if queryorsbjct == "query":
                (aPos,
                 aPhase) = pacbporfA.dnaposition_query(aObj.pos,
                                                       forced_return=True)
            else:
                (aPos,
                 aPhase) = pacbporfA.dnaposition_sbjct(aObj.pos,
                                                       forced_return=True)
            try:
                algAobj = pacbporfA._positions[aPos]
            except IndexError:
                # site out of range of PacbPORF -> break
                break
            if queryorsbjct == "query":
                posDsbjct = algDobj.sbjct_dna_start + dPhase
                posAsbjct = algAobj.sbjct_dna_start + aPhase
            else:
                posDsbjct = algDobj.query_dna_start + dPhase
                posAsbjct = algAobj.query_dna_start + aPhase
            distance = posAsbjct - posDsbjct
            if distance >= MAX_TINYEXON_NT_LENGTH:
                break
            if distance < MIN_TINYEXON_NT_LENGTH:
                continue

            ####################################################
            # generate a ScanForMatches pattern file
            ####################################################
            # example pattern: 6...6 AG NNGNNANNANNGN[2,0,0] GT 3...3
            query = list(prjctOrf.inputgenomicsequence[posDsbjct:posAsbjct])
            # mask all non-phase0 nucleotides to N residues;
            # this represents the regularexpression for a specific
            # peptide sequence
            firstphasepositions = range(3 - dPhase % 3, len(query), 3)
            for pos in range(0, len(query)):
                if pos not in firstphasepositions:
                    query[pos] = "N"
            # calculate a ~50% mismatch number
            mismatches = max([0, (len(query) - query.count("N")) / 2])
            # write the pattern to string and subsequently to file
            # example pattern: 6...6 AG NNGNNANNANNGN[2,0,0] GT 3...3
            if kwargs['allow_non_canonical_donor']:
                sfmpat = "%s...%s AG %s[%s,0,0] G (T | C) %s...%s" % (
                    AUSO, AUSO, "".join(query), mismatches, DDSO, DDSO)
            else:
                sfmpat = "%s...%s AG %s[%s,0,0] GT %s...%s" % (
                    AUSO, AUSO, "".join(query), mismatches, DDSO, DDSO)

            ####################################################
            if verbose:
                print(pacbporfD.orfQ.id, pacbporfA.orfQ.id),
                print distance, dObj, aObj
                print sfmpat
            ####################################################

            fname = "sfmpat_tinyexon_%s_%s_%s_%s" % (
                donorOrf.id,
                accepOrf.id,
                posDsbjct,
                posAsbjct,
            )
            fh = open(fname, 'w')
            fh.write(sfmpat + "\n")
            fh.close()

            ####################################################
            # run ScanForMatches
            ####################################################
            command = """echo ">myseq\n%s" | %s %s | tr "[,]" "\t\t#" | """ +\
                      """tr -d "\n " | sed "s/>/\\n>/g" | tr "#" "\t" | """ +\
                      """awk -F'\t' '{ if (NF==4 && $2>%s && $3<%s) """ +\
                      """{ print $1"["$2","$3"]\\n"$4 } }' """
            command = command % (donorOrf.inputgenomicsequence, EXECUTABLE_SFM,
                                 fname, dObj.pos +
                                 (kwargs['min_intron_nt_length'] - 3),
                                 aObj.pos -
                                 (kwargs['min_intron_nt_length'] - 3))
            co = osPopen(command)
            matches = parseFasta(co.readlines())
            co.close()

            # filter matches for:
            # (1) correct donor & acceptor phase
            # (2) high enough donor & acceptor site scores
            for hdr, seqmatch in matches.iteritems():
                startQ, stopQ = [
                    int(item) for item in hdr.split(":")[1][1:-1].split(",")
                ]
                exonQstart = startQ + AUSO + 2 - 1
                exonQstop = stopQ - DDSO - 2

                ####################################
                # get Orf object of tinyexon
                ####################################
                tinyexonorf = None
                # select the Orf on which the tinyexon is located
                for orfObj in orfSetObject.get_eligible_orfs(
                        max_orf_start=exonQstart, min_orf_end=exonQstop):
                    orfPhase = (exonQstart - orfObj.startPY) % 3
                    if orfPhase == dPhase:
                        tinyexonorf = orfObj
                        break
                else:
                    # No tinyexonorf assigned!! Iin case a regex matched
                    # over a STOP-codon or the regex length is smaller
                    # then the smallest Orf, no Orf can be assigned
                    continue

                # filter for donor & acceptor score
                dScore = _score_splice_site(seqmatch[-9:], splicetype='donor')
                aScore = _score_splice_site(seqmatch[0:11],
                                            splicetype='acceptor')
                if dScore < kwargs['min_donor_pssm_score']:
                    continue
                if aScore < kwargs['min_acceptor_pssm_score']:
                    continue

                # scan Orf for splicesites
                tinyexonorf.scan_orf_for_pssm_splice_sites(
                    splicetype="donor",
                    min_pssm_score=kwargs['min_donor_pssm_score'],
                    allow_non_canonical=kwargs['allow_non_canonical_donor'],
                    non_canonical_min_pssm_score=kwargs[
                        'non_canonical_min_donor_pssm_score'])
                tinyexonorf.scan_orf_for_pssm_splice_sites(
                    splicetype="acceptor",
                    min_pssm_score=kwargs['min_acceptor_pssm_score'],
                    allow_non_canonical=kwargs['allow_non_canonical_acceptor'],
                    non_canonical_min_pssm_score=kwargs[
                        'non_canonical_min_acceptor_pssm_score'])

                # get 1th intron donor object
                intron1_aObj = None
                for a in tinyexonorf._acceptor_sites:
                    if a.pos == exonQstart:
                        intron1_aObj = a
                        break
                else:
                    # pseudo-acceptorsite as found be SFM regex
                    # is not a valid acceptor site of high enough score
                    # continue to next iteration of (hdr,seqmatch) pair
                    continue

                # get 2th intron donor object
                intron2_dObj = None
                for d in tinyexonorf._donor_sites:
                    if d.pos == exonQstop:
                        intron2_dObj = d
                        break
                else:
                    # pseudo-donorsite as found be SFM regex
                    # is not a valid acceptor site of high enough score
                    # continue to next iteration of (hdr,seqmatch) pair
                    continue

                # check if introns are of elegiable lengths
                if (intron1_aObj.pos -
                        dObj.pos) > kwargs['max_intron_nt_length']:
                    continue
                if (aObj.pos -
                        intron2_dObj.pos) > kwargs['max_intron_nt_length']:
                    continue

                ####################################################
                if True or verbose:
                    # if here, a candidate!!!
                    print(pacbporfD.orfQ.id, tinyexonorf.id,
                          pacbporfA.orfQ.id),
                    print hdr, dScore, aScore
                    print seqmatch
                ####################################################

                # append to found tinyexons
                query_data = (tinyexonorf, exonQstart, exonQstop)
                sbjct_data = (prjctOrf, posDsbjct, posAsbjct)
                splicesite_data = (dObj, intron1_aObj, intron2_dObj, aObj)
                tinyexons.append((query_data, sbjct_data, splicesite_data))

            # file cleanup
            osRemove(fname)

    # return - End Of Function - if no tinyexons are found
    if not tinyexons:
        return []

    ####################################
    # select the **best** tinyexon
    ####################################
    (query_data, sbjct_data, splicesite_data) = tinyexons[0]
    orfQ, query_dna_start, query_dna_end = query_data
    orfS, sbjct_dna_start, sbjct_dna_end = sbjct_data
    (intron1_dObj, intron1_aObj, intron2_dObj, intron2_aObj) = splicesite_data

    ####################################################
    if verbose:
        print "tinyexon orf:", orfQ
        print "tinyexon orf:", intron1_aObj
        print "tinyexon orf:", intron2_dObj
    ####################################################

    ####################################
    # make tinyexon PacbPORF
    ####################################
    startQaa = orfQ.dnapos2aapos(query_dna_start) - 1
    startSaa = orfS.dnapos2aapos(sbjct_dna_start) - 1
    stopQaa = orfQ.dnapos2aapos(query_dna_end) + 1
    stopSaa = orfS.dnapos2aapos(sbjct_dna_end) + 1
    # check for directly leading stop codon on tinyexon
    while startQaa <= orfQ.protein_startPY:
        startQaa += 1
        startSaa += 1
        query_dna_start += 3
        sbjct_dna_start += 3
    while startSaa <= orfS.protein_startPY:
        startQaa += 1
        startSaa += 1
        query_dna_start += 3
        sbjct_dna_start += 3
    # check for directly tailing stop codon on tinyexon
    while stopQaa > orfQ.protein_endPY:
        stopQaa -= 1
        stopSaa -= 1
        query_dna_end -= 3
        sbjct_dna_end -= 3
    while stopSaa > orfS.protein_endPY:
        stopQaa -= 1
        stopSaa -= 1
        query_dna_end -= 3
        sbjct_dna_end -= 3
    # get sequences
    qAAseq = orfQ.getaas(abs_pos_start=startQaa, abs_pos_end=stopQaa)
    sAAseq = orfS.getaas(abs_pos_start=startSaa, abs_pos_end=stopSaa)

    ####################################################
    if verbose or len(qAAseq) != len(sAAseq):
        # if unequal lengths, error will be raised upon PacbP.__init__()
        print orfQ, qAAseq, startQaa, stopQaa, (stopQaa - startQaa),
        print(query_dna_start, query_dna_end)
        print orfS, sAAseq, startSaa, stopSaa, (stopSaa - startSaa),
        print(sbjct_dna_start, sbjct_dna_end)
        print orfQ.inputgenomicsequence[query_dna_start - 2:query_dna_end + 2]
        print orfS.inputgenomicsequence[sbjct_dna_start - 2:sbjct_dna_end + 2]
    ####################################################

    # initialize extended tinyexon PacbPORF
    from pacb import PacbP
    pacbp = PacbP(input=(qAAseq, sAAseq, startQaa, startSaa))
    pacbp.strip_unmatched_ends()
    pacbporf = pacbp2pacbporf(pacbp, orfQ, orfS)
    pacbporf.extend_pacbporf_after_stops()
    pacbporf.source = 'ABGPprojectingTE'

    ####################################
    # make introns
    ####################################
    intron1 = IntronConnectingOrfs(intron1_dObj, intron1_aObj, None, donorOrf,
                                   pacbporf.orfQ)
    intron2 = IntronConnectingOrfs(intron2_dObj, intron2_aObj, None,
                                   pacbporf.orfQ, accepOrf)

    ################################################################
    # set some meta-data properties to the intron objects
    ################################################################
    # add distance score to intron
    intron1._distance = 0
    intron2._distance = 0

    # add Alignment Positional Periphery Score into objects
    if queryorsbjct == "query":
        succes = set_apps_intron_query(intron1, pacbporfD, pacbporf)
        succes = set_apps_intron_query(intron2, pacbporf, pacbporfA)
    else:
        succes = set_apps_intron_sbjct(intron1, pacbporfD, pacbporf)
        succes = set_apps_intron_sbjct(intron2, pacbporf, pacbporfA)

    # set GFF fsource attribute for recognition of intron sources
    intron1._gff['fsource'] = "ABGPprojectingTE"
    intron2._gff['fsource'] = "ABGPprojectingTE"

    # create _linked_to_xxx attributes
    intron1._linked_to_pacbporfs = [pacbporf]
    intron2._linked_to_pacbporfs = [pacbporf]
    intron1._linked_to_introns = [intron2]
    intron2._linked_to_introns = [intron1]

    ####################################################
    if verbose:
        print pacbporf
        pacbporf.print_protein_and_dna()
        print intron1
        print intron2
        if False:
            # printing data when this function needs to be debugged:
            print ""
            print intron1
            print intron2
            print ""
            print pacbporfD
            pacbporfD.print_protein_and_dna()
            print ""
            print pacbporf
            pacbporf.print_protein_and_dna()
            print ""
            print pacbporfA
            pacbporfA.print_protein_and_dna()
            import sys
            sys.exit()
    ####################################################

    # return introns and intermediate tinyexon PacbPORF
    return [(intron1, intron2, pacbporf)]
Esempio n. 5
0
def merge_orfs_with_two_tinyexons(preceding_orf,
                                  subsequent_orf,
                                  preceding_donor_sites=[],
                                  subsequent_acceptor_sites=[],
                                  orflist=[],
                                  **kwargs):
    """
    Find pairs of tinyexons (joined by a central intron) between two Orfs

    @type  preceding_orf: Orf object
    @param preceding_orf: Orf object that contains preceding_donor_site(s)

    @type  subsequent_orf: Orf object
    @param subsequent_orf: Orf object that contains subsequent_acceptor_site(s)

    @type  preceding_donor_sites: list
    @param preceding_donor_sites: list with SpliceDonorGT and/or SpliceDonor
                objects; only the smallest `pos` is used here, to discard
                Orfs that lie too far upstream

    @type  subsequent_acceptor_sites: list
    @param subsequent_acceptor_sites: list with SpliceAcceptorAG and/or
                SpliceAcceptor objects; only the largest `pos` is used here,
                to discard Orfs that lie too far downstream

    @type  orflist: list
    @param orflist: list with Orf objects that are scanned for tinyexons

    @attention: see get_potential_tiny_exons_on_orf for additional **kwargs
    @attention: **kwargs is first passed through _update_kwargs() together
                with KWARGS_PROJECTED_TINYEXON; afterwards the keys
                'min_tinyexon_intron_nt_length' and
                'max_tinyexon_intron_nt_length' are read from it
    @attention: the list defaults are only read, never modified

    @rtype:  list
    @return: list of tuples ( tinyexon1, central_intron, tinyexon2 ), ordered
             ascending on the summed length of both tinyexons. An empty list
             is returned when any of the three input lists is empty.
    """
    if not preceding_donor_sites:
        return []
    if not subsequent_acceptor_sites:
        return []
    if not orflist:
        return []

    # edit **kwargs dictionary for some forced attributes
    _update_kwargs(kwargs, KWARGS_PROJECTED_TINYEXON)

    # Positional window an Orf must overlap to be worth scanning. It depends
    # only on the splice site extremes, so compute it once, not per Orf.
    min_intron_nt_length = kwargs['min_tinyexon_intron_nt_length']
    min_pos = min(d.pos for d in preceding_donor_sites) + min_intron_nt_length
    max_pos = (max(a.pos for a in subsequent_acceptor_sites) -
               min_intron_nt_length)

    # gather all potential tinyexons on the Orfs that overlap the window
    tinyexoncollection = []
    for orfX in orflist:
        if orfX.endPY <= min_pos: continue
        if orfX.startPY >= max_pos: continue
        tinyexoncollection.extend(
            get_potential_tiny_exons_on_orf(orfX, **kwargs))

    # make tinyexoncollection ordered on start pos
    tinyexoncollection = _order_intron_list(tinyexoncollection,
                                            order_by='donor_pos')
    # donor_pos makes REVERSE ordering; restore this by reversing
    tinyexoncollection.reverse()

    # make combinations of two tinyexons which can co-occur together
    tinyexoncombis = []
    for tinyexon1 in tinyexoncollection:
        # walk from the back of the list and stop at the first tinyexon
        # whose donor lies before the donor of tinyexon1
        for tinyexon2 in reversed(tinyexoncollection):
            if tinyexon2.donor.pos < tinyexon1.donor.pos: break
            intron_length = tinyexon2.acceptor.pos - tinyexon1.donor.pos
            if intron_length < min_intron_nt_length:
                continue
            if intron_length > kwargs['max_tinyexon_intron_nt_length']:
                continue
            if tinyexon1.donor.phase != tinyexon2.acceptor.phase: continue
            # if here, elegiable combi!
            # NOTE(review): the central intron is created with preceding_orf
            # and subsequent_orf instead of the Orfs of both tinyexons;
            # merge_orfs_with_tinyexon uses tinyexon.orf at the equivalent
            # spot. Kept unchanged here -- confirm this is intended.
            intron = IntronConnectingOrfs(
                tinyexon1.donor, tinyexon2.acceptor,
                get_shared_nucleotides_at_splicesite(subsequent_orf,
                                                     preceding_orf,
                                                     tinyexon2.acceptor,
                                                     tinyexon1.donor),
                preceding_orf, subsequent_orf)
            totlen = tinyexon1.length + tinyexon2.length
            tinyexoncombis.append((totlen, tinyexon1, intron, tinyexon2))

    # Order on summed length ONLY. A plain sort() on the tuples falls through
    # to comparing the exon/intron objects whenever two lengths are equal;
    # the key keeps ties in their (stable) order of creation instead.
    tinyexoncombis.sort(key=lambda combi: combi[0])
    return [(exon1, intron, exon2)
            for _totlen, exon1, intron, exon2 in tinyexoncombis]
Esempio n. 6
0
def _splice_site_passes_filters(site, allow_non_canonical, min_pssm_score,
                                non_canonical_min_pssm_score, min_pos,
                                max_pos):
    """
    Check a single splice site against PSSM score and position thresholds

    @type  site: splice site object (donor or acceptor)
    @param site: object providing is_canonical(), pssm_score and pos

    @type  allow_non_canonical: Boolean
    @param allow_non_canonical: when False, non-canonical sites are rejected

    @type  min_pssm_score: float
    @param min_pssm_score: minimal pssm score of a canonical site

    @type  non_canonical_min_pssm_score: float
    @param non_canonical_min_pssm_score: minimal pssm score of a
                non-canonical site

    @type  min_pos: integer or None
    @param min_pos: reject sites with pos < min_pos; None disables the check

    @type  max_pos: integer or None
    @param max_pos: reject sites with pos > max_pos; None disables the check

    @rtype:  Boolean
    @return: True when the site passes all filters, False otherwise
    """
    if site.is_canonical():
        if site.pssm_score < min_pssm_score:
            return False
    elif not allow_non_canonical:
        return False
    elif site.pssm_score < non_canonical_min_pssm_score:
        return False
    # 0 is a valid boundary, so test against None and not on truthiness
    if min_pos is not None and site.pos < min_pos:
        return False
    if max_pos is not None and site.pos > max_pos:
        return False
    return True


def merge_orfs_with_intron(
        orfD,
        orfA,
        max_intron_nt_length=MAX_INTRON_NT_LENGTH,
        min_intron_nt_length=MIN_INTRON_NT_LENGTH,
        min_donor_pssm_score=MIN_DONOR_PSSM_SCORE,
        min_acceptor_pssm_score=MIN_ACCEPTOR_PSSM_SCORE,
        allow_non_canonical_donor=ALLOW_NON_CANONICAL_DONOR,
        allow_non_canonical_acceptor=ALLOW_NON_CANONICAL_ACCEPTOR,
        non_canonical_min_donor_pssm_score=NON_CANONICAL_MIN_DONOR_PSSM_SCORE,
        non_canonical_min_acceptor_pssm_score=NON_CANONICAL_MIN_ACCEPTOR_PSSM_SCORE,
        min_donor_pos=None,
        max_donor_pos=None,
        min_acceptor_pos=None,
        max_acceptor_pos=None,
        order_by='length',
        **kwargs):
    """
    Merge 2 Orf objects by introns

    Every donor site on orfD is combined with every acceptor site on orfA.
    A combination becomes an IntronConnectingOrfs object when both sites
    pass the score/position filters, their phases agree, the intron length
    is compatible with the frames of both Orfs and the intron length lies
    within the requested boundaries.

    @attention: **kwargs can contain other (here) unnecessarily arguments

    @type  orfD: Orf object
    @param orfD: Orf object that has to deliver a PSSM donor object

    @type  orfA: Orf object
    @param orfA: Orf object that has to deliver a PSSM acceptor object

    @type  max_intron_nt_length: integer
    @param max_intron_nt_length: maximal length (nt) of the intron; a false
                value (None, 0) disables this check

    @type  min_intron_nt_length: integer
    @param min_intron_nt_length: minimal length (nt) of the intron; a false
                value (None, 0) disables this check

    @type  min_donor_pssm_score: float
    @param min_donor_pssm_score: minimal pssm score of donor splice site

    @type  min_acceptor_pssm_score: float
    @param min_acceptor_pssm_score: minimal pssm score of acceptor splice site

    @type  allow_non_canonical_donor: Boolean
    @param allow_non_canonical_donor: search for non-canonical donor sites too

    @type  allow_non_canonical_acceptor: Boolean
    @param allow_non_canonical_acceptor: search for non-canonical acceptor
                splice sites too

    @type  non_canonical_min_donor_pssm_score: float
    @param non_canonical_min_donor_pssm_score: minimal pssm score of
                non-canonical donor

    @type  non_canonical_min_acceptor_pssm_score: float
    @param non_canonical_min_acceptor_pssm_score: minimal pssm score of
                non-canonical acceptor

    @type  min_donor_pos: integer or None
    @param min_donor_pos: ignore donors with pos < min_donor_pos

    @type  max_donor_pos: integer or None
    @param max_donor_pos: ignore donors with pos > max_donor_pos

    @type  min_acceptor_pos: integer or None
    @param min_acceptor_pos: ignore acceptors with pos < min_acceptor_pos

    @type  max_acceptor_pos: integer or None
    @param max_acceptor_pos: ignore acceptors with pos > max_acceptor_pos

    @type  order_by: string
    @param order_by: ordering criterion, passed on to _order_intron_list()

    @rtype:  list
    @return: list with introns, ordered by _order_intron_list(); an empty
             list when no intron can be made
    """
    # input validation
    IsOrf(orfD)
    IsOrf(orfA)

    # scan for splice sites (if not already done -> is checked in function)
    orfD.scan_orf_for_pssm_splice_sites(
        splicetype="donor",
        min_pssm_score=min_donor_pssm_score,
        allow_non_canonical=allow_non_canonical_donor,
        non_canonical_min_pssm_score=non_canonical_min_donor_pssm_score)
    orfA.scan_orf_for_pssm_splice_sites(
        splicetype="acceptor",
        min_pssm_score=min_acceptor_pssm_score,
        allow_non_canonical=allow_non_canonical_acceptor,
        non_canonical_min_pssm_score=non_canonical_min_acceptor_pssm_score)

    # return list with introns
    introns = []

    # most quickest scan possible: are there donors & acceptors?
    if not orfD._donor_sites or not orfA._acceptor_sites:
        # no introns possible because splice sites are missing
        return introns

    # very quick scan: are exons not to far from each other?
    # NOTE(review): only the FIRST donor and FIRST acceptor are compared, so
    # this shortcut is only safe when the ordering of both site lists makes
    # that pair the closest one -- verify against how the lists are built.
    if max_intron_nt_length and\
    (orfA._acceptor_sites[0].pos - orfD._donor_sites[0].pos) > max_intron_nt_length:
        # no introns possible that can bridge this gap
        return introns

    # The acceptor filters do not depend on the donor: apply them once
    # instead of repeating them for every single donor site.
    acceptors = [
        acceptor for acceptor in orfA._acceptor_sites
        if _splice_site_passes_filters(
            acceptor, allow_non_canonical_acceptor, min_acceptor_pssm_score,
            non_canonical_min_acceptor_pssm_score, min_acceptor_pos,
            max_acceptor_pos)
    ]

    for donor in orfD._donor_sites:
        if not _splice_site_passes_filters(
                donor, allow_non_canonical_donor, min_donor_pssm_score,
                non_canonical_min_donor_pssm_score, min_donor_pos,
                max_donor_pos):
            continue

        for acceptor in acceptors:
            # generate intron length and phase variable
            intron_length = acceptor.pos - donor.pos
            intron_phase = intron_length % 3

            # check phase compatibilty (1) of splice sites
            if donor.phase != acceptor.phase: continue
            # check phase compatibilty (2) of splice sites
            if (intron_phase + orfD.frame) % 3 != orfA.frame % 3: continue

            # check if intron length is in between the boundaries
            if max_intron_nt_length and intron_length > max_intron_nt_length:
                continue
            if min_intron_nt_length and intron_length < min_intron_nt_length:
                continue

            # okay, if we reach this point, we have a valid intron
            shared_nts = get_shared_nucleotides_at_splicesite(
                orfA, orfD, acceptor, donor)

            # make a IntronConnectingOrfs object
            introns.append(
                IntronConnectingOrfs(donor, acceptor, shared_nts, orfD, orfA))

    # return ordered intron list
    return _order_intron_list(introns, order_by=order_by)
Esempio n. 7
0
def merge_orfs_with_tinyexon(
        preceding_orf,
        subsequent_orf,
        preceding_donor_sites=[],
        subsequent_acceptor_sites=[],
        orflist=[],
        max_tinyexon_nt_length=TINYEXON_MAX_NT_LENGTH,
        min_tinyexon_nt_length=TINYEXON_MIN_NT_LENGTH,
        max_tinyexon_intron_nt_length=TINYEXON_MAX_INTRON_NT_LENGTH,
        min_tinyexon_intron_nt_length=TINYEXON_MIN_INTRON_NT_LENGTH,
        min_donor_pssm_score=TINYEXON_MIN_DONOR_PSSM_SCORE,
        min_acceptor_pssm_score=TINYEXON_MIN_ACCEPTOR_PSSM_SCORE,
        min_total_pssm_score=TINYEXON_MIN_TOTAL_PSSM_SCORE,
        **kwargs):
    """
    Connect two `neighbouring` Orfs through a single tinyexon

    For every Orf in orflist, each preceding donor is paired with each
    subsequent acceptor and find_tiny_exon_on_orf() is asked for tinyexons
    on that Orf that fit in between. Each tinyexon found is returned
    together with the two introns that flank it.

    @type  preceding_orf: Orf object
    @param preceding_orf: Orf object that contains preceding_donor_site(s)

    @type  subsequent_orf: Orf object
    @param subsequent_orf: Orf object that contains subsequent_acceptor_site(s)

    @type  preceding_donor_sites: list
    @param preceding_donor_sites: list with SpliceDonorGT and/or SpliceDonor
                objects

    @type  subsequent_acceptor_sites: list
    @param subsequent_acceptor_sites: list with SpliceAcceptorAG and/or
                SpliceAcceptor objects

    @type  orflist: list
    @param orflist: list with Orf objects to search for tinyexons

    @type  max_tinyexon_nt_length: integer
    @param max_tinyexon_nt_length: positive integer, largest length of
                tinyexon in nt

    @type  min_tinyexon_nt_length: integer
    @param min_tinyexon_nt_length: positive integer, smallest length of
                tinyexon in nt

    @type  max_tinyexon_intron_nt_length: integer
    @param max_tinyexon_intron_nt_length: positive integer, largest length of
                intron around tinyexon in nt

    @type  min_tinyexon_intron_nt_length: integer
    @param min_tinyexon_intron_nt_length: positive integer, smallest length
                of intron around tinyexon in nt

    @type  min_total_pssm_score: float or None
    @param min_total_pssm_score: minimal sum of donor - acceptor pssm score
                pair of tinyexon

    @type  min_donor_pssm_score: float or None
    @param min_donor_pssm_score: minimal donor pssm score; applied here to
                the preceding donors and also handed to
                find_tiny_exon_on_orf()

    @type  min_acceptor_pssm_score: float or None
    @param min_acceptor_pssm_score: minimal acceptor pssm score; applied here
                to the subsequent acceptors and also handed to
                find_tiny_exon_on_orf()

    @rtype:  list
    @return: list of tuples ( preceding_intron, tinyexon, subsequent_intron );
             empty when any of the three input lists is empty

    @attention: **kwargs is accepted but not used in this function
    @attention: Global vars that have to be set upon usage:
        MIN_DONOR_PSSM_SCORE
        MIN_ACCEPTOR_PSSM_SCORE
        # and all TINYEXON variable named
        TINYEXON_MAX_NT_LENGTH
        TINYEXON_MIN_NT_LENGTH
        TINYEXON_MAX_INTRON_NT_LENGTH
        TINYEXON_MIN_INTRON_NT_LENGTH
        TINYEXON_MIN_PSSM_SCORE
        TINYEXON_MIN_DONOR_PSSM_SCORE
        TINYEXON_MIN_ACCEPTOR_PSSM_SCORE
        TINYEXON_ALLOW_NON_CANONICAL_DONOR
        TINYEXON_ALLOW_NON_CANONICAL_ACCEPTOR
        TINYEXON_NON_CANONICAL_MIN_PSSM_SCORE
        TINYEXON_NON_CANONICAL_MIN_DONOR_PSSM_SCORE
        TINYEXON_NON_CANONICAL_MIN_ACCEPTOR_PSSM_SCORE

    """
    # nothing to bridge when one of the ingredients is missing
    if not (preceding_donor_sites and subsequent_acceptor_sites and orflist):
        return []

    # outermost splice site positions, for a cheap rejection of whole Orfs
    leftmost_donor_pos = min(d.pos for d in preceding_donor_sites)
    rightmost_acceptor_pos = max(a.pos for a in subsequent_acceptor_sites)

    # settings that are the same for every tinyexon search below
    search_settings = dict(
        order_by='total_pssm',
        max_tinyexon_nt_length=max_tinyexon_nt_length,
        min_tinyexon_nt_length=min_tinyexon_nt_length,
        max_tinyexon_intron_nt_length=max_tinyexon_intron_nt_length,
        min_tinyexon_intron_nt_length=min_tinyexon_intron_nt_length,
        min_donor_pssm_score=min_donor_pssm_score,
        min_acceptor_pssm_score=min_acceptor_pssm_score,
        min_total_pssm_score=min_total_pssm_score)

    bridges = []
    for candidate_orf in orflist:
        # Orf must reach beyond the first donor and start before the last
        # acceptor, otherwise it cannot lie in between any site pair
        if (candidate_orf.endPY <= leftmost_donor_pos
                or candidate_orf.startPY >= rightmost_acceptor_pos):
            continue

        for donor in preceding_donor_sites:
            # Orf must end after this donor, and the donor must score well
            # enough.
            # TODO: this is in fact the donor on the normal, large orf
            # TODO: do we want to check this pssm score?
            if (candidate_orf.endPY <= donor.pos
                    or donor.pssm_score < min_donor_pssm_score):
                continue

            for acceptor in subsequent_acceptor_sites:
                # Orf must start before this acceptor, and the acceptor must
                # score well enough.
                # TODO: this is in fact the acceptor on the normal, large orf
                # TODO: do we want to check this pssm score?
                if (candidate_orf.startPY >= acceptor.pos
                        or acceptor.pssm_score < min_acceptor_pssm_score):
                    continue

                # okay, now try to bridge it!
                found = find_tiny_exon_on_orf(candidate_orf,
                                              preceding_donor=donor,
                                              subsequent_acceptor=acceptor,
                                              **search_settings)
                for tinyexon in found:
                    # intron from the preceding Orf into the tinyexon
                    upstream_shared_nts = get_shared_nucleotides_at_splicesite(
                        tinyexon.orf, preceding_orf, tinyexon.acceptor, donor)
                    upstream_intron = IntronConnectingOrfs(
                        donor, tinyexon.acceptor, upstream_shared_nts,
                        preceding_orf, tinyexon.orf)

                    # intron from the tinyexon into the subsequent Orf
                    downstream_shared_nts = get_shared_nucleotides_at_splicesite(
                        subsequent_orf, tinyexon.orf, acceptor, tinyexon.donor)
                    downstream_intron = IntronConnectingOrfs(
                        tinyexon.donor, acceptor, downstream_shared_nts,
                        tinyexon.orf, subsequent_orf)

                    bridges.append(
                        (upstream_intron, tinyexon, downstream_intron))

    # and return the list of intron/exon/intron
    return bridges