Ejemplo n.º 1
0
def CodingBlockGraph2GeneTreeGraph(cbg):
    """
    Convert CodingBlockGraph 2 GeneTree

    @attention: function just converts, error check is not performed here!

    @type  cbg: CodingBlockGraph
    @param cbg: CodingBlockGraph instance

    @rtype:  GeneTreeGraph
    @return: GeneTreeGraph instance
    """
    gtg = GeneTreeGraph()
    cbgnode2orgnode = {}
    for node in cbg.get_nodes():
        org = cbg._organism_from_node(node)
        gtg.add_node(org)
        # add node/org combi to mapping dict
        cbgnode2orgnode[ node ] = org
    # now add all the edges
    omsr = cbg.overall_minimal_spanning_range()
    for (n1,n2) in cbg.pairwisecrosscombinations_node():
        if cbg.has_edge(n1,n2):
            # get pacbp(orf) object
            thepacbp = cbg.get_pacbps_by_nodes(node1=n1,node2=n2)[0]
            # get relative coordinates of the OMSR part of the alignment
            omsrQs = thepacbp.alignmentposition_by_query_pos( min( omsr[n1] ) )
            omsrQe = thepacbp.alignmentposition_by_query_pos( max( omsr[n1] ) )

            # CHECK these coordinates; pacb.exceptions.CoordinateOutOfRange can occur
            # in freaky cases. They shouldn't, but do without discovered reason.
            # However, in the majority of cases, it is just a 1/few aa offset, which
            # can be easily corrected here.
            if str(omsrQs) == str(pacb.exceptions.CoordinateOutOfRange):
                if thepacbp.__class__.__name__ == 'PacbP':
                    # solve by taking thepacbp.query_start 
                    omsrQs = thepacbp.alignmentposition_by_query_pos( thepacbp.query_start )
                else:
                    # thepacbp.__class__.__name__ in ['PacbPDNA','PacbPORF']:
                    # solve by taking orginal alignment position start
                    omsrQs = thepacbp.alignmentposition_by_query_pos(
                        thepacbp._get_original_alignment_pos_start().query_pos
                        )

                ###########################################################################
                ## print warning message(s)
                #print "WARNING: pacb.exceptions.CoordinateOutOfRange (omsrQs, ", 
                #print "node %s in CodingBlockGraph2GeneTreeGraph" % ( str(n1) )
                #print "WARNING: min(omsr(", min( omsr[n1] ), ")", min(omsr[n1]),
                #print max(omsr[n1]), " taken ->", thepacbp.query_start, omsrQs
                #print "WARNING: ", thepacbp
                ###########################################################################

            if str(omsrQe) == str(pacb.exceptions.CoordinateOutOfRange):
                if thepacbp.__class__.__name__ == 'PacbP':
                    # solve by taking thepacbp.query_end
                    omsrQe = thepacbp.alignmentposition_by_query_pos( thepacbp.query_end )
                else:
                    # thepacbp.__class__.__name__ in ['PacbPDNA','PacbPORF']:
                    # solve by taking orginal alignment position end 
                    omsrQe = thepacbp.alignmentposition_by_query_pos(
                        thepacbp._get_original_alignment_pos_end().query_pos
                        ) + 1  # add +1 to create a python list range coordinate

                ###########################################################################
                ## print warning message(s)
                #print "WARNING: pacb.exceptions.CoordinateOutOfRange (omsrQe, ",
                #print node %s in CodingBlockGraph2GeneTreeGraph" % ( str(n1) )
                #print "WARNING: max(omsr(", max( omsr[n1] ), ")", min(omsr[n1]),
                #print max(omsr[n1]), " taken ->", thepacbp.query_end, omsrQe
                #print "WARNING: ", thepacbp
                ###########################################################################

            else:
                # omsrQe was nicely an integer; add +1 because max(OMSR) is not a range coord
                omsrQe += 1

            # calculate identityscore
            identityscore = pacb.calculate_identityscore( thepacbp.alignment[omsrQs:omsrQe] )
        else:
            # this edge is absent in the CBG!
            # TODO -> this will cause a crash a few lines later
            # by definition, a CBG MUST HAVE ALL EDGES at this stage!
            print "about to crash!!!!"
            print cbg
            print cbg.node_count(), cbg.edge_count(), "missing:", (n1,n2) 
            identityscore = 0.0
        # get organism identifyers from node and add edge
        o1,o2 = cbgnode2orgnode[ n1 ], cbgnode2orgnode[ n2 ]

        # Wt used is identityscore == Identity + 0.5* Similarity
        gtg.add_edge( o1, o2, wt=identityscore )

        # add additional statistics to gtg object. Wt used is
        # identitypercentage is TRUE aa indentity %
        identityperc = pacb.calculate_identity( thepacbp.alignment[omsrQs:omsrQe] )
        gtg._aa_identity_percentages[(o1,o2)] = identityperc
        gtg._aa_identity_percentages[(o2,o1)] = identityperc

        # bitscoreratio is ratio of bits / max bits
        bitscoreratio = pacb.calculate_bitscoreratio(
                thepacbp.query[omsrQs:omsrQe],
                thepacbp.sbjct[omsrQs:omsrQe],
                matrix = thepacbp.MATRIX
                )
        gtg._bitscore_ratios[(o1,o2)] = bitscoreratio
        gtg._bitscore_ratios[(o2,o1)] = bitscoreratio

      
        # ntidentity is obviously nt identity%
        dnaQseq, dnaSseq = thepacbp.get_unextended_aligned_dna_sequences()
        ntidentity = sequence_identity_ratio(dnaQseq,dnaSseq)
        gtg._nt_identity_percentages[(o1,o2)] = ntidentity
        gtg._nt_identity_percentages[(o2,o1)] = ntidentity

    # check if the graph is saturated (complete)
    # if not (organism/node/orf missing), add this as a zero-wt edge
    gtg.makecompletegraph(wt=0.0)
    # and return this new genetree graph
    return gtg
Ejemplo n.º 2
0
def cexpander_checkCBG4omsrbordergaps(cbg,
    omit5pside = False, omit3pside = False,
    max_bitscoreratio_threshold =\
            CBG_CEXPANDER_OMSRBORDERGAPS_MAX_BITSCORERATIO_THRESHOLD,
    nonuniform_aa_offset = CBG_CEXPANDER_OMSRBORDERGAPS_NONUNIFORM_AA_OFFSET,
    gap_size = CBG_CEXPANDER_OMSRBORDERGAPS_GAP_SIZE,
    verbose = False):
    """
    Check the area directly around the OMSR of a CBG for non-uniform alignments

    @type  cbg: CodingBlockGraph
    @param cbg: CodingBlockGraph instance to optimize 

    @type  omit5pside: Boolean
    @param omit5pside: Do not process the 5' side (left) of the CBG

    @type  omit3pside: Boolean
    @param omit3pside: Do not process the 3' side (rigth) of the CBG

    @type  nonuniform_aa_offset: integer
    @param nonuniform_aa_offset: area of the nonuniform stretch to check for gaps

    @type  gap_size: integer
    @param gap_size: continuous gap length to occur in the nonuniform_aa_offset in order to shorten the CBG

    @type  max_bitscoreratio_threshold: float
    @param max_bitscoreratio_threshold: maximal bitscore ratio of Q vs. S slice to enforce a CBG shortening

    @type  verbose: Boolean 
    @param verbose: print debugging/intermediate information to STDOUT 

    @rtype:  Boolean ( or NoOverallMinimalSpanningRange or ZeroUniformlyAlignedPositions exception )
    @return: status weather or not the CBG was shortened
    """
    hasconsistency = cbg._cexpander.binarystring.count("1") >= 1
    has5Pomsrflaw = cbg._cexpander.binarystring[0] == "0"
    PACBPS_CORRECTED = 0

    if not hasconsistency:
        # a priori error. CBGs must have at least a single Uniformly Aligned AA position
        raise ZeroUniformlyAlignedPositions

    if not omit5pside and (hasconsistency, has5Pomsrflaw) == (True, True):
        # start correction on the 5' side of the OMSR
        omsr = cbg.overall_minimal_spanning_range()
        replacements = {}
        ########################################################################
        if verbose:
            print "STARTING cexpander_checkCBG4omsrbordergaps 5p side"
            print cbg
            print "cexp::", cbg._cexpander.binarystring, cbg._cexpander.header
        ########################################################################
        for (currentkey, nodeQ, nodeS), pacbporf in cbg.pacbps.iteritems():
            # get slice of the pacbporf around the max(OMSR) query value
            orgQ = cbg.organism_by_node(nodeQ)
            cexpQstr = cbg._cexpander.get_transferblock(orgQ).binarystring
            endQpos = min(omsr[nodeQ]) + cexpQstr.find("1")
            staQpos = endQpos - nonuniform_aa_offset

            # get slice of the pacbporf around the max(OMSR) sbjct value
            orgS = cbg.organism_by_node(nodeS)
            cexpSstr = cbg._cexpander.get_transferblock(orgS).binarystring
            endSpos = min(omsr[nodeS]) + cexpSstr.find("1")
            staSpos = endSpos - nonuniform_aa_offset

            # correct staQpos if < pacbporf.orfQ.protein_startPY
            staQpos = max([pacbporf.orfQ.protein_startPY, staQpos])
            editedQ = staQpos != endQpos - nonuniform_aa_offset

            # correct staSpos if < pacbporf.orfS.protein_startPY
            staQpos = max([pacbporf.orfS.protein_startPY, staSpos])
            editedS = staSpos != endSpos - nonuniform_aa_offset

            if editedQ and editedS:
                if ( endSpos - (staSpos + nonuniform_aa_offset) ) >=\
                (endQpos - (staQpos + nonuniform_aa_offset) ):
                    # editing on Sbjct is gte as on Query -> take Sbjct
                    (q, m, s, coords) = pacbporf.alignmentpart_by_sbjct(
                        staSpos, endSpos)
                else:
                    # other way around -> take Query
                    (q, m, s, coords) = pacbporf.alignmentpart_by_query(
                        staQpos, endQpos)
            elif editedS:
                # take by sbjct coords
                (q, m, s,
                 coords) = pacbporf.alignmentpart_by_sbjct(staSpos, endSpos)
            else:
                # unedited or edited Query -> take by query coords
                (q, m, s,
                 coords) = pacbporf.alignmentpart_by_query(staQpos, endQpos)

            # check minval of coords; CBGs at the far 5' end of the
            # input DNA sequence can get negative coords for their
            # non-existing Orf frontal STOPcodon (up to -3)
            if min(coords) < 0: continue

            # get bitscore-ratio of this Query/Sbjct slice
            (qS, qE, sS, sE) = coords
            bitscoreratio = pacb.calculate_bitscoreratio(q, s)

            # get bitscore-ratio of this Query/Sbjct slice
            bitscoreratio = pacb.calculate_bitscoreratio(q, s)

            # if more gaps in this alignment slice then expected -> a pacbp split will follow
            # the slice is of size (2*omsr_offset)+1
            if q.find('-' * gap_size) >= 0:
                # convert (back) to pacbp and obtain position where to split
                pacbp = pacb.conversion.pacbporf2pacbp(pacbporf)
                pos = q.find('-' * gap_size)
                while pos + gap_size < len(q) and q[pos + gap_size] == "-":
                    pos += 1
                splitpos = qS + pos + gap_size
                # correct splitpos by pacbp.query_start
                splitpos = splitpos - pacbp.query_start

            elif s.find('-' * gap_size) >= 0:
                # convert (back) to pacbp and obtain position where to split
                pacbp = pacb.conversion.pacbporf2pacbp(pacbporf)
                pos = s.find('-' * gap_size)
                while pos + gap_size < len(s) and s[pos + gap_size] == "-":
                    pos += 1
                splitpos = sS + pos + gap_size
                # correct splitpos by pacbp.sbjct_start
                splitpos = splitpos - pacbp.sbjct_start

            elif bitscoreratio <= max_bitscoreratio_threshold:
                # convert (back) to pacbp and obtain position where to split
                pacbp = pacb.conversion.pacbporf2pacbp(pacbporf)
                # correct for matches on the rigth of the match string
                splitpos = qE - (len(m) - m.rfind(" ") - 1)
                # correct splitpos by pacbp.query_start
                splitpos = splitpos - pacbp.query_start

            else:
                ################################################################
                if verbose:
                    print nodeQ, nodeS, "'%s' '%s' '%s'" % (q, m, s), coords,
                    print "settings:", (nonuniform_aa_offset, gap_size),
                    print "bitsratio: %1.3f" % bitscoreratio
                ################################################################
                # not passing the cut-off for splitting this pacbp
                continue

            ####################################################################
            if verbose:
                print "5p,", nodeQ, nodeS, (q, m, s, coords),
                print "bitsratio: %1.3f (thr:%1.3f)" % (
                    bitscoreratio, max_bitscoreratio_threshold)
                print pacbp, "relative splitpos:", splitpos
                pacbp.print_protein(_linesize=120)
            ####################################################################

            # now split the pacbp on this position and recreate the pacbporf
            pacbpR = pacb.splitting.split_pacb_on_coordinates(
                pacbp, (splitpos, splitpos), returnside='rigth')

            if pacbpR:
                newpacbporf = pacb.conversion.pacbp2pacbporf(
                    pacbpR, pacbporf.orfQ, pacbporf.orfS)
                newpacbporf.extend_pacbporf_after_stops()
                # store to replacements dict
                replacements[(currentkey, nodeQ, nodeS)] = newpacbporf
                ################################################################
                if verbose:
                    print pacbpR
                    pacbpR.print_protein(_linesize=120)
                    print newpacbporf
                ################################################################
                # increase counter for how much pacbps are corrected
                PACBPS_CORRECTED += 1

        # do the replacements of 5' PacbP corrections
        status = _update_cbg_with_pacbporf_replacements(cbg, replacements)
        if status == True:
            pass  # cbg succesfully updated; still an OMSR
        elif status == False:
            # raise a NoOverallMinimalSpanningRange Exception
            print "WARNING: NoOverallMinimalSpanningRange", cbg
            raise NoOverallMinimalSpanningRange, str(cbg)
        else:
            pass

    # check (again!) if the is any consistency and if there is a 3' inconsistency
    hasconsistency = cbg._cexpander.binarystring.count("1") >= 1
    has3Pomsrflaw = cbg._cexpander.binarystring[-1] == "0"

    if not hasconsistency:
        # due to 5' optimization, the complete CBG alignment collapsed!
        raise ZeroUniformlyAlignedPositions

    if not omit3pside and (hasconsistency, has3Pomsrflaw) == (True, True):
        # start correction on the 3' side of the OMSR
        omsr = cbg.overall_minimal_spanning_range()
        replacements = {}
        ########################################################################
        if verbose:
            print "STARTING cexpander_checkCBG4omsrbordergaps 3p side"
            print cbg, "\ncexp::", cbg._cexpander.binarystring,
            print cbg._cexpander.header
        ########################################################################

        for (currentkey, nodeQ, nodeS), pacbporf in cbg.pacbps.iteritems():
            # get slice of the pacbporf around the max(OMSR) query value
            orgQ = cbg.organism_by_node(nodeQ)
            cexpQstr = cbg._cexpander.get_transferblock(orgQ).binarystring
            staQpos = max(omsr[nodeQ]) - (len(cexpQstr) - cexpQstr.rfind("1"))
            endQpos = staQpos + nonuniform_aa_offset

            # get slice of the pacbporf around the max(OMSR) sbjct value
            orgS = cbg.organism_by_node(nodeS)
            cexpSstr = cbg._cexpander.get_transferblock(orgS).binarystring
            staSpos = max(omsr[nodeS]) - (len(cexpSstr) - cexpSstr.rfind("1"))
            endSpos = staSpos + nonuniform_aa_offset

            # correct endQpos if > pacbporf.orfQ.protein_endPY
            endQpos = min([pacbporf.orfQ.protein_endPY, endQpos])
            editedQ = endQpos != staQpos + nonuniform_aa_offset

            # correct endSpos if > pacbporf.orfQ.protein_endPY
            endSpos = min([pacbporf.orfS.protein_endPY, endSpos])
            editedS = endSpos != staSpos + nonuniform_aa_offset

            if editedQ and editedS:
                if ( endSpos - (staSpos + nonuniform_aa_offset) ) >=\
                (endQpos - (staQpos + nonuniform_aa_offset) ):
                    # editing on Sbjct is gte as on Query -> take Sbjct
                    (q, m, s, coords) = pacbporf.alignmentpart_by_sbjct(
                        staSpos, endSpos)
                else:
                    # other way around -> take Query
                    (q, m, s, coords) = pacbporf.alignmentpart_by_query(
                        staQpos, endQpos)
            elif editedS:
                # take by sbjct coords
                (q, m, s,
                 coords) = pacbporf.alignmentpart_by_sbjct(staSpos, endSpos)
            else:
                # unedited or edited Query -> take by query coords
                (q, m, s,
                 coords) = pacbporf.alignmentpart_by_query(staQpos, endQpos)

            # get bitscore-ratio of this Query/Sbjct slice
            (qS, qE, sS, sE) = coords
            bitscoreratio = pacb.calculate_bitscoreratio(q, s)

            # if more gaps in this alignment slice then expected -> a pacbp split will follow
            # the slice is of size (2*omsr_offset)+1
            if q.find('-' * gap_size) >= 0:
                # convert (back) to pacbp and obtain position where to split
                pacbp = pacb.conversion.pacbporf2pacbp(pacbporf)
                splitpos = qS + q.find('-' * gap_size)
                # correct splitpos by pacbp.query_start
                splitpos = splitpos - pacbp.query_start

            elif s.find('-' * gap_size) >= 0:
                # convert (back) to pacbp and obtain position where to split
                pacbp = pacb.conversion.pacbporf2pacbp(pacbporf)
                splitpos = sS + s.find('-' * gap_size)
                # correct splitpos by pacbp.sbjct_start
                splitpos = splitpos - pacbp.sbjct_start

            elif bitscoreratio <= max_bitscoreratio_threshold:
                # convert (back) to pacbp and obtain position where to split
                pacbp = pacb.conversion.pacbporf2pacbp(pacbporf)
                # correct for matches on the left of the match string
                splitpos = qS + m.find(" ")
                # correct splitpos by pacbp.query_start
                splitpos = splitpos - pacbp.query_start

            else:
                ################################################################
                if verbose:
                    print nodeQ, nodeS, "'%s' '%s' '%s'" % (q, m, s), coords,
                    print "settings:", (nonuniform_aa_offset, gap_size),
                    print "bitsratio: %1.3f" % bitscoreratio
                ################################################################
                # not passing the cut-off for splitting this pacbp
                continue

            ####################################################################
            if verbose:
                print "3p,", nodeQ, nodeS, (q, m, s, coords),
                print "bitsratio: %1.3f (thr:%1.3f)" % (
                    bitscoreratio, max_bitscoreratio_threshold)
                print pacbp, "relative splitpos:", splitpos
                pacbp.print_protein(_linesize=120)
            ####################################################################

            # now split the pacbp on this position and recreate the pacbporf
            pacbpL = pacb.splitting.split_pacb_on_coordinates(
                pacbp, (splitpos, splitpos), returnside='left')

            if pacbpL:
                newpacbporf = pacb.conversion.pacbp2pacbporf(
                    pacbpL, pacbporf.orfQ, pacbporf.orfS)
                newpacbporf.extend_pacbporf_after_stops()
                # store to replacements dict
                replacements[(currentkey, nodeQ, nodeS)] = newpacbporf
                ################################################################
                if verbose:
                    print pacbpL
                    pacbpL.print_protein(_linesize=120)
                    print newpacbporf
                ################################################################
                # increase counter for how much pacbps are corrected
                PACBPS_CORRECTED += 1

        # do the replacements of 3' PacbP corrections
        status = _update_cbg_with_pacbporf_replacements(cbg, replacements)
        if status == True:
            pass  # cbg succesfully updated; still an OMSR
        elif status == False:
            # raise a NoOverallMinimalSpanningRange Exception
            raise NoOverallMinimalSpanningRange, str(cbg)
        elif status == None:
            pass  # no updates done at all
        else:
            # NOT POSSIBLE -> status isa NoneBoolean
            pass

    ####################################################################
    if verbose and PACBPS_CORRECTED:
        print "REPLACEMENTS DONE:", PACBPS_CORRECTED, "omit5pside:",
        print omit5pside, "omit3pside", omit3pside
        print cbg
        cbg.printmultiplealignment()
    ####################################################################

    # return if there is something improved
    if PACBPS_CORRECTED: return True
    else: return False
Ejemplo n.º 3
0
def cexpander_checkCBG4omsrbordergaps(cbg,
    omit5pside = False, omit3pside = False,
    max_bitscoreratio_threshold =\
            CBG_CEXPANDER_OMSRBORDERGAPS_MAX_BITSCORERATIO_THRESHOLD,
    nonuniform_aa_offset = CBG_CEXPANDER_OMSRBORDERGAPS_NONUNIFORM_AA_OFFSET,
    gap_size = CBG_CEXPANDER_OMSRBORDERGAPS_GAP_SIZE,
    verbose = False):
    """
    Check the area directly around the OMSR of a CBG for non-uniform alignments

    @type  cbg: CodingBlockGraph
    @param cbg: CodingBlockGraph instance to optimize 

    @type  omit5pside: Boolean
    @param omit5pside: Do not process the 5' side (left) of the CBG

    @type  omit3pside: Boolean
    @param omit3pside: Do not process the 3' side (rigth) of the CBG

    @type  nonuniform_aa_offset: integer
    @param nonuniform_aa_offset: area of the nonuniform stretch to check for gaps

    @type  gap_size: integer
    @param gap_size: continuous gap length to occur in the nonuniform_aa_offset in order to shorten the CBG

    @type  max_bitscoreratio_threshold: float
    @param max_bitscoreratio_threshold: maximal bitscore ratio of Q vs. S slice to enforce a CBG shortening

    @type  verbose: Boolean 
    @param verbose: print debugging/intermediate information to STDOUT 

    @rtype:  Boolean ( or NoOverallMinimalSpanningRange or ZeroUniformlyAlignedPositions exception )
    @return: status weather or not the CBG was shortened
    """
    hasconsistency = cbg._cexpander.binarystring.count("1") >= 1
    has5Pomsrflaw  = cbg._cexpander.binarystring[0] == "0"
    PACBPS_CORRECTED = 0

    if not hasconsistency:
        # a priori error. CBGs must have at least a single Uniformly Aligned AA position
        raise ZeroUniformlyAlignedPositions

    if not omit5pside and (hasconsistency,has5Pomsrflaw) == (True,True):
        # start correction on the 5' side of the OMSR
        omsr = cbg.overall_minimal_spanning_range()
        replacements = {}
        ########################################################################
        if verbose:
            print "STARTING cexpander_checkCBG4omsrbordergaps 5p side"
            print cbg
            print "cexp::", cbg._cexpander.binarystring, cbg._cexpander.header
        ########################################################################
        for (currentkey,nodeQ,nodeS),pacbporf in cbg.pacbps.iteritems():
            # get slice of the pacbporf around the max(OMSR) query value 
            orgQ     = cbg.organism_by_node(nodeQ) 
            cexpQstr = cbg._cexpander.get_transferblock(orgQ).binarystring 
            endQpos  = min(omsr[nodeQ]) + cexpQstr.find("1")
            staQpos  = endQpos - nonuniform_aa_offset

            # get slice of the pacbporf around the max(OMSR) sbjct value 
            orgS     = cbg.organism_by_node(nodeS) 
            cexpSstr = cbg._cexpander.get_transferblock(orgS).binarystring 
            endSpos  = min(omsr[nodeS]) + cexpSstr.find("1")
            staSpos  = endSpos - nonuniform_aa_offset

            # correct staQpos if < pacbporf.orfQ.protein_startPY
            staQpos  = max([ pacbporf.orfQ.protein_startPY, staQpos ])
            editedQ  = staQpos != endQpos - nonuniform_aa_offset

            # correct staSpos if < pacbporf.orfS.protein_startPY
            staQpos  = max([ pacbporf.orfS.protein_startPY, staSpos ])
            editedS  = staSpos != endSpos - nonuniform_aa_offset

            if editedQ and editedS: 
                if ( endSpos - (staSpos + nonuniform_aa_offset) ) >=\
                (endQpos - (staQpos + nonuniform_aa_offset) ): 
                    # editing on Sbjct is gte as on Query -> take Sbjct 
                    (q,m,s,coords) = pacbporf.alignmentpart_by_sbjct( staSpos, endSpos ) 
                else: 
                    # other way around -> take Query 
                    (q,m,s,coords) = pacbporf.alignmentpart_by_query( staQpos, endQpos ) 
            elif editedS: 
                # take by sbjct coords 
                (q,m,s,coords) = pacbporf.alignmentpart_by_sbjct( staSpos, endSpos ) 
            else: 
                # unedited or edited Query -> take by query coords 
                (q,m,s,coords) = pacbporf.alignmentpart_by_query( staQpos, endQpos ) 

            # check minval of coords; CBGs at the far 5' end of the
            # input DNA sequence can get negative coords for their
            # non-existing Orf frontal STOPcodon (up to -3)
            if min(coords) < 0: continue

            # get bitscore-ratio of this Query/Sbjct slice
            (qS,qE,sS,sE)  = coords
            bitscoreratio = pacb.calculate_bitscoreratio(q,s)

            # get bitscore-ratio of this Query/Sbjct slice
            bitscoreratio = pacb.calculate_bitscoreratio(q,s)

            # if more gaps in this alignment slice then expected -> a pacbp split will follow
            # the slice is of size (2*omsr_offset)+1
            if q.find('-'*gap_size) >= 0:
                # convert (back) to pacbp and obtain position where to split
                pacbp = pacb.conversion.pacbporf2pacbp(pacbporf)
                pos = q.find('-'*gap_size)
                while pos+gap_size < len(q) and q[pos+gap_size] == "-":
                    pos+=1
                splitpos = qS + pos + gap_size
                # correct splitpos by pacbp.query_start
                splitpos = splitpos - pacbp.query_start

            elif s.find('-'*gap_size) >= 0:
                # convert (back) to pacbp and obtain position where to split
                pacbp = pacb.conversion.pacbporf2pacbp(pacbporf)
                pos = s.find('-'*gap_size)
                while pos+gap_size < len(s) and s[pos+gap_size] == "-":
                    pos+=1
                splitpos = sS + pos + gap_size
                # correct splitpos by pacbp.sbjct_start
                splitpos = splitpos - pacbp.sbjct_start

            elif bitscoreratio <= max_bitscoreratio_threshold:
                # convert (back) to pacbp and obtain position where to split
                pacbp = pacb.conversion.pacbporf2pacbp(pacbporf)
                # correct for matches on the rigth of the match string
                splitpos = qE - ( len(m) - m.rfind(" ") - 1)
                # correct splitpos by pacbp.query_start
                splitpos = splitpos - pacbp.query_start

            else:
                ################################################################
                if verbose:
                    print nodeQ, nodeS, "'%s' '%s' '%s'" % (q,m,s), coords,
                    print "settings:", (nonuniform_aa_offset, gap_size),
                    print "bitsratio: %1.3f" % bitscoreratio
                ################################################################
                # not passing the cut-off for splitting this pacbp
                continue

            ####################################################################
            if verbose:
                print "5p,", nodeQ,nodeS, (q,m,s,coords),
                print "bitsratio: %1.3f (thr:%1.3f)" % (
                    bitscoreratio,max_bitscoreratio_threshold)
                print pacbp, "relative splitpos:", splitpos
                pacbp.print_protein(_linesize=120)
            ####################################################################

            # now split the pacbp on this position and recreate the pacbporf
            pacbpR = pacb.splitting.split_pacb_on_coordinates(pacbp,(
                        splitpos,splitpos),returnside='rigth')

            if pacbpR:
                newpacbporf = pacb.conversion.pacbp2pacbporf(
                                pacbpR,pacbporf.orfQ,pacbporf.orfS)
                newpacbporf.extend_pacbporf_after_stops()
                # store to replacements dict
                replacements[(currentkey,nodeQ,nodeS)] = newpacbporf
                ################################################################
                if verbose:
                    print pacbpR
                    pacbpR.print_protein(_linesize=120)
                    print newpacbporf
                ################################################################
                # increase counter for how much pacbps are corrected
                PACBPS_CORRECTED+=1

        # do the replacements of 5' PacbP corrections
        status = _update_cbg_with_pacbporf_replacements(cbg,replacements)
        if status == True:
            pass    # cbg succesfully updated; still an OMSR
        elif status == False:
            # raise a NoOverallMinimalSpanningRange Exception
            print "WARNING: NoOverallMinimalSpanningRange", cbg
            raise NoOverallMinimalSpanningRange, str(cbg)
        else:
            pass 

    # check (again!) if the is any consistency and if there is a 3' inconsistency
    hasconsistency = cbg._cexpander.binarystring.count("1") >= 1
    has3Pomsrflaw  = cbg._cexpander.binarystring[-1] == "0"

    if not hasconsistency:
        # due to 5' optimization, the complete CBG alignment collapsed!
        raise ZeroUniformlyAlignedPositions


    if not omit3pside and (hasconsistency,has3Pomsrflaw) == (True,True):
        # start correction on the 3' side of the OMSR
        omsr = cbg.overall_minimal_spanning_range()
        replacements = {}
        ########################################################################
        if verbose:
            print "STARTING cexpander_checkCBG4omsrbordergaps 3p side"
            print cbg, "\ncexp::", cbg._cexpander.binarystring,
            print cbg._cexpander.header
        ########################################################################

        for (currentkey,nodeQ,nodeS),pacbporf in cbg.pacbps.iteritems():
            # get slice of the pacbporf around the max(OMSR) query value
            orgQ     = cbg.organism_by_node(nodeQ)
            cexpQstr = cbg._cexpander.get_transferblock(orgQ).binarystring
            staQpos  = max(omsr[nodeQ]) - ( len(cexpQstr) - cexpQstr.rfind("1") )
            endQpos  = staQpos + nonuniform_aa_offset

            # get slice of the pacbporf around the max(OMSR) sbjct value
            orgS     = cbg.organism_by_node(nodeS)
            cexpSstr = cbg._cexpander.get_transferblock(orgS).binarystring
            staSpos  = max(omsr[nodeS]) - ( len(cexpSstr) - cexpSstr.rfind("1") )
            endSpos  = staSpos + nonuniform_aa_offset
            
            # correct endQpos if > pacbporf.orfQ.protein_endPY
            endQpos = min([ pacbporf.orfQ.protein_endPY, endQpos ])
            editedQ = endQpos != staQpos + nonuniform_aa_offset

            # correct endSpos if > pacbporf.orfS.protein_endPY
            endSpos = min([ pacbporf.orfS.protein_endPY, endSpos ])
            editedS = endSpos != staSpos + nonuniform_aa_offset
            
            if editedQ and editedS:
                if ( endSpos - (staSpos + nonuniform_aa_offset) ) >=\
                (endQpos - (staQpos + nonuniform_aa_offset) ):
                    # editing on Sbjct is gte as on Query -> take Sbjct
                    (q,m,s,coords) = pacbporf.alignmentpart_by_sbjct( staSpos, endSpos )
                else:
                    # other way around -> take Query
                    (q,m,s,coords) = pacbporf.alignmentpart_by_query( staQpos, endQpos )
            elif editedS:
                # take by sbjct coords
                (q,m,s,coords) = pacbporf.alignmentpart_by_sbjct( staSpos, endSpos )
            else:
                # unedited or edited Query -> take by query coords
                (q,m,s,coords) = pacbporf.alignmentpart_by_query( staQpos, endQpos )

            # get bitscore-ratio of this Query/Sbjct slice
            (qS,qE,sS,sE)  = coords
            bitscoreratio = pacb.calculate_bitscoreratio(q,s)

            # if more gaps in this alignment slice then expected -> a pacbp split will follow
            # the slice is of size (2*omsr_offset)+1
            if q.find('-'*gap_size) >= 0:
                # convert (back) to pacbp and obtain position where to split
                pacbp = pacb.conversion.pacbporf2pacbp(pacbporf)
                splitpos = qS + q.find('-'*gap_size)
                # correct splitpos by pacbp.query_start
                splitpos = splitpos - pacbp.query_start

            elif s.find('-'*gap_size) >= 0:
                # convert (back) to pacbp and obtain position where to split
                pacbp = pacb.conversion.pacbporf2pacbp(pacbporf)
                splitpos = sS + s.find('-'*gap_size)
                # correct splitpos by pacbp.sbjct_start
                splitpos = splitpos - pacbp.sbjct_start

            elif bitscoreratio <= max_bitscoreratio_threshold:
                # convert (back) to pacbp and obtain position where to split
                pacbp = pacb.conversion.pacbporf2pacbp(pacbporf)
                # correct for matches on the left of the match string
                splitpos = qS + m.find(" ")
                # correct splitpos by pacbp.query_start
                splitpos = splitpos - pacbp.query_start

            else:
                ################################################################
                if verbose:
                    print nodeQ, nodeS, "'%s' '%s' '%s'" % (q,m,s), coords,
                    print "settings:", (nonuniform_aa_offset, gap_size), 
                    print "bitsratio: %1.3f" % bitscoreratio
                ################################################################
                # not passing the cut-off for splitting this pacbp
                continue

            ####################################################################
            if verbose:
                print "3p,", nodeQ,nodeS, (q,m,s,coords),
                print "bitsratio: %1.3f (thr:%1.3f)" % (
                            bitscoreratio,max_bitscoreratio_threshold)
                print pacbp, "relative splitpos:", splitpos
                pacbp.print_protein(_linesize=120)
            ####################################################################

            # now split the pacbp on this position and recreate the pacbporf
            pacbpL = pacb.splitting.split_pacb_on_coordinates(
                            pacbp,(splitpos,splitpos),returnside='left')

            if pacbpL:
                newpacbporf = pacb.conversion.pacbp2pacbporf(
                                pacbpL,pacbporf.orfQ,pacbporf.orfS)
                newpacbporf.extend_pacbporf_after_stops()
                # store to replacements dict
                replacements[(currentkey,nodeQ,nodeS)] = newpacbporf
                ################################################################
                if verbose:
                    print pacbpL
                    pacbpL.print_protein(_linesize=120)
                    print newpacbporf
                ################################################################
                # increase counter for how much pacbps are corrected
                PACBPS_CORRECTED+=1


        # do the replacements of 3' PacbP corrections
        status = _update_cbg_with_pacbporf_replacements(cbg,replacements)
        if status == True:
            pass    # cbg succesfully updated; still an OMSR
        elif status == False:
            # raise a NoOverallMinimalSpanningRange Exception
            raise NoOverallMinimalSpanningRange, str(cbg)
        elif status == None:
            pass    # no updates done at all
        else:
            # NOT POSSIBLE -> status isa NoneBoolean
            pass

    ####################################################################
    if verbose and PACBPS_CORRECTED:
        print "REPLACEMENTS DONE:", PACBPS_CORRECTED, "omit5pside:",
        print omit5pside, "omit3pside", omit3pside
        print cbg
        cbg.printmultiplealignment()
    ####################################################################


    # return if there is something improved
    if PACBPS_CORRECTED: return True
    else:                return False
Ejemplo n.º 4
0
def CodingBlockGraph2GeneTreeGraph(cbg):
    """
    Convert CodingBlockGraph 2 GeneTree

    @attention: function just converts, error check is not performed here!

    @type  cbg: CodingBlockGraph
    @param cbg: CodingBlockGraph instance

    @rtype:  GeneTreeGraph
    @return: GeneTreeGraph instance
    """
    gtg = GeneTreeGraph()
    cbgnode2orgnode = {}
    for node in cbg.get_nodes():
        org = cbg._organism_from_node(node)
        gtg.add_node(org)
        # add node/org combi to mapping dict
        cbgnode2orgnode[node] = org
    # now add all the edges
    omsr = cbg.overall_minimal_spanning_range()
    for (n1, n2) in cbg.pairwisecrosscombinations_node():
        if cbg.has_edge(n1, n2):
            # get pacbp(orf) object
            thepacbp = cbg.get_pacbps_by_nodes(node1=n1, node2=n2)[0]
            # get relative coordinates of the OMSR part of the alignment
            omsrQs = thepacbp.alignmentposition_by_query_pos(min(omsr[n1]))
            omsrQe = thepacbp.alignmentposition_by_query_pos(max(omsr[n1]))

            # CHECK these coordinates; pacb.exceptions.CoordinateOutOfRange can occur
            # in freaky cases. They shouldn't, but do without discovered reason.
            # However, in the majority of cases, it is just a 1/few aa offset, which
            # can be easily corrected here.
            if str(omsrQs) == str(pacb.exceptions.CoordinateOutOfRange):
                if thepacbp.__class__.__name__ == 'PacbP':
                    # solve by taking thepacbp.query_start
                    omsrQs = thepacbp.alignmentposition_by_query_pos(
                        thepacbp.query_start)
                else:
                    # thepacbp.__class__.__name__ in ['PacbPDNA','PacbPORF']:
                    # solve by taking orginal alignment position start
                    omsrQs = thepacbp.alignmentposition_by_query_pos(
                        thepacbp._get_original_alignment_pos_start().query_pos)

                ###########################################################################
                ## print warning message(s)
                #print "WARNING: pacb.exceptions.CoordinateOutOfRange (omsrQs, ",
                #print "node %s in CodingBlockGraph2GeneTreeGraph" % ( str(n1) )
                #print "WARNING: min(omsr(", min( omsr[n1] ), ")", min(omsr[n1]),
                #print max(omsr[n1]), " taken ->", thepacbp.query_start, omsrQs
                #print "WARNING: ", thepacbp
                ###########################################################################

            if str(omsrQe) == str(pacb.exceptions.CoordinateOutOfRange):
                if thepacbp.__class__.__name__ == 'PacbP':
                    # solve by taking thepacbp.query_end
                    omsrQe = thepacbp.alignmentposition_by_query_pos(
                        thepacbp.query_end)
                else:
                    # thepacbp.__class__.__name__ in ['PacbPDNA','PacbPORF']:
                    # solve by taking orginal alignment position end
                    omsrQe = thepacbp.alignmentposition_by_query_pos(
                        thepacbp._get_original_alignment_pos_end().query_pos
                    ) + 1  # add +1 to create a python list range coordinate

                ###########################################################################
                ## print warning message(s)
                #print "WARNING: pacb.exceptions.CoordinateOutOfRange (omsrQe, ",
                #print node %s in CodingBlockGraph2GeneTreeGraph" % ( str(n1) )
                #print "WARNING: max(omsr(", max( omsr[n1] ), ")", min(omsr[n1]),
                #print max(omsr[n1]), " taken ->", thepacbp.query_end, omsrQe
                #print "WARNING: ", thepacbp
                ###########################################################################

            else:
                # omsrQe was nicely an integer; add +1 because max(OMSR) is not a range coord
                omsrQe += 1

            # calculate identityscore
            identityscore = pacb.calculate_identityscore(
                thepacbp.alignment[omsrQs:omsrQe])
        else:
            # this edge is absent in the CBG!
            # TODO -> this will cause a crash a few lines later
            # by definition, a CBG MUST HAVE ALL EDGES at this stage!
            print "about to crash!!!!"
            print cbg
            print cbg.node_count(), cbg.edge_count(), "missing:", (n1, n2)
            identityscore = 0.0
        # get organism identifyers from node and add edge
        o1, o2 = cbgnode2orgnode[n1], cbgnode2orgnode[n2]

        # Wt used is identityscore == Identity + 0.5* Similarity
        gtg.add_edge(o1, o2, wt=identityscore)

        # add additional statistics to gtg object. Wt used is
        # identitypercentage is TRUE aa indentity %
        identityperc = pacb.calculate_identity(
            thepacbp.alignment[omsrQs:omsrQe])
        gtg._aa_identity_percentages[(o1, o2)] = identityperc
        gtg._aa_identity_percentages[(o2, o1)] = identityperc

        # bitscoreratio is ratio of bits / max bits
        bitscoreratio = pacb.calculate_bitscoreratio(
            thepacbp.query[omsrQs:omsrQe],
            thepacbp.sbjct[omsrQs:omsrQe],
            matrix=thepacbp.MATRIX)
        gtg._bitscore_ratios[(o1, o2)] = bitscoreratio
        gtg._bitscore_ratios[(o2, o1)] = bitscoreratio

        # ntidentity is obviously nt identity%
        dnaQseq, dnaSseq = thepacbp.get_unextended_aligned_dna_sequences()
        ntidentity = sequence_identity_ratio(dnaQseq, dnaSseq)
        gtg._nt_identity_percentages[(o1, o2)] = ntidentity
        gtg._nt_identity_percentages[(o2, o1)] = ntidentity

    # check if the graph is saturated (complete)
    # if not (organism/node/orf missing), add this as a zero-wt edge
    gtg.makecompletegraph(wt=0.0)
    # and return this new genetree graph
    return gtg