Exemple #1
0
def main(argv=None):
    """Script entry point: turn FASTA records into block alignments.

    Reads FASTA records from stdin and, for every record, writes one
    tab-separated line to ``options.stdout`` consisting of the record
    title, the literal ``ref`` and the block-format representation of an
    alignment built from the record's sequence (with blanks removed)
    against a run of ``X`` of the same length.  A summary of the counters
    goes to ``options.stdlog`` when the log level is at least 1.

    *argv* defaults to ``sys.argv``.
    """
    # NOTE(review): argv is resolved here but is not passed on to
    # E.Start() below -- confirm whether E.Start should receive it.
    if argv is None:
        argv = sys.argv

    version_string = "%prog version: $Id: sequence2alignment.py 2782 2009-09-10 11:40:29Z andreas $"
    parser = E.OptionParser(version=version_string,
                            usage=globals()["__doc__"])
    parser.set_defaults()

    (options, args) = E.Start(parser)

    records = FastaIterator.FastaIterator(sys.stdin)

    # nskipped is reported in the summary but never incremented here.
    ninput = 0
    noutput = 0
    nskipped = 0

    options.stdout.write(
        "query\tsbjct\tquery_from\tquery_to\tsbjct_from\tsbjct_to\tquery_starts\tsbjct_starts\tblock_sizes\n"
    )

    while True:
        try:
            record = records.next()
        except StopIteration:
            break

        ninput += 1

        residues = re.sub(" ", "", record.sequence)
        placeholder = "X" * len(residues)

        # the explicit-format alignment is copied into a fresh vector,
        # which is then rendered in block format.
        alignment = alignlib_lite.py_makeAlignmentVector()
        explicit = alignlib_lite.py_AlignmentFormatExplicit(
            0, residues, 0, placeholder)
        explicit.copy(alignment)

        fields = (record.title,
                  "ref",
                  str(alignlib_lite.py_AlignmentFormatBlocks(alignment)))
        options.stdout.write("\t".join(fields) + "\n")

        noutput += 1

    if options.loglevel >= 1:
        summary = "# ninput=%i, noutput=%i, nskipped=%i.\n" % (
            ninput, noutput, nskipped)
        options.stdlog.write(summary)

    E.Stop()
def main(argv=None):
    """Convert FASTA sequences on stdin into block-format alignment rows.

    For each FASTA record one line is written to ``options.stdout``:
    the record title, the literal ``ref`` and the block-format string of
    an alignment created from the sequence (spaces stripped) and an
    equally long string of ``X``.  Input/output counters are logged to
    ``options.stdlog`` if ``options.loglevel`` is 1 or higher.

    *argv* falls back to ``sys.argv`` when not given.
    """
    # NOTE(review): the resolved argv is not handed to E.Start() --
    # confirm whether that is intended.
    if argv is None:
        argv = sys.argv

    parser = E.OptionParser(
        version="%prog version: $Id: sequence2alignment.py 2782 2009-09-10 11:40:29Z andreas $",
        usage=globals()["__doc__"],
    )
    parser.set_defaults()

    (options, args) = E.Start(parser)

    fasta_records = FastaIterator.FastaIterator(sys.stdin)

    # nskipped stays at zero; it only appears in the final summary.
    ninput = noutput = nskipped = 0

    column_header = "query\tsbjct\tquery_from\tquery_to\tsbjct_from\tsbjct_to\tquery_starts\tsbjct_starts\tblock_sizes\n"
    options.stdout.write(column_header)

    while True:
        try:
            entry = fasta_records.next()
        except StopIteration:
            break

        ninput += 1

        stripped = re.sub(" ", "", entry.sequence)
        seq_length = len(stripped)

        map_sequence2mali = alignlib_lite.py_makeAlignmentVector()

        # build the explicit alignment and copy it into the vector
        alignlib_lite.py_AlignmentFormatExplicit(
            0, stripped, 0, "X" * seq_length).copy(map_sequence2mali)

        row = (
            entry.title,
            "ref",
            str(alignlib_lite.py_AlignmentFormatBlocks(map_sequence2mali)),
        )
        options.stdout.write("\t".join(row) + "\n")

        noutput += 1

    if options.loglevel >= 1:
        counts = (ninput, noutput, nskipped)
        options.stdlog.write(
            "# ninput=%i, noutput=%i, nskipped=%i.\n" % counts)

    E.Stop()
def PrintPrettyAlignment( seq_wobble, seq_cds, seq_pep, map_p2c, options ):
    """Write a three-row text view of an alignment to ``options.stdlog``.

    The rows are the aligned wobble sequence, the residues of *seq_pep*
    and the aligned cds sequence, each prefixed with ``#`` and followed
    by a separator line ``#``.  Rows are flushed whenever the wobble row
    holds more than 120 entries and its length is a multiple of three,
    and once more at the end (even if the rows are empty).
    """

    def write_rows(row_wobble, row_pep, row_cds):
        # one group: wobble row, peptide row, cds row, separator
        options.stdlog.write("#" + "".join(row_wobble) + "\n")
        options.stdlog.write("#" + "".join(row_pep) + "\n")
        options.stdlog.write("#" + "".join(row_cds) + "\n")
        options.stdlog.write("#\n")

    explicit = alignlib_lite.py_AlignmentFormatExplicit(
        map_p2c, seq_wobble, seq_cds)
    wobble_ali = explicit.mRowAlignment
    cds_ali = explicit.mColAlignment

    n_wobble = 0   # non-gap wobble characters seen so far
    n_pep = 0      # index of the next peptide residue to print
    row_w, row_c, row_p = [], [], []

    for column in range(len(wobble_ali)):

        # The spacer/peptide prefix is emitted on every column for which
        # the wobble counter is a multiple of three; this includes
        # consecutive gap columns, because gaps do not advance the counter.
        if n_wobble % 3 == 0:
            if n_pep < len(seq_pep):
                row_p.append("  %s " % seq_pep[n_pep])
            row_w.append(" ")
            row_c.append(" ")
            n_pep += 1

        wobble_char = wobble_ali[column]
        row_w.append(wobble_char)
        row_c.append(cds_ali[column])
        if wobble_char != "-":
            n_wobble += 1

        width = len(row_w)
        if width > 120 and width % 3 == 0:
            write_rows(row_w, row_p, row_c)
            row_w, row_c, row_p = [], [], []

    write_rows(row_w, row_p, row_c)
Exemple #4
0
def printPrettyAlignment(seq_wobble, seq_cds, seq_pep, map_p2c, options):
    """Log a human-readable, three-row rendering of an alignment.

    Output goes to ``options.stdlog``: the aligned wobble row, the
    peptide residues taken from *seq_pep* and the aligned cds row, every
    line starting with ``#``, followed by a ``#`` separator line.  A
    group is written when the wobble row exceeds 120 entries with a
    length divisible by three, and a final group is always written.
    """

    def emit(rows):
        # rows arrive in output order: wobble, peptide, cds
        for row in rows:
            options.stdlog.write("#" + "".join(row) + "\n")
        options.stdlog.write("#\n")

    formatted = alignlib_lite.py_AlignmentFormatExplicit(
        map_p2c, seq_wobble, seq_cds)
    wobble_ali, cds_ali = formatted.mRowAlignment, formatted.mColAlignment

    wobble_count, pep_index = 0, 0
    wobble_row, cds_row, pep_row = [], [], []

    for pos in range(len(wobble_ali)):

        # start-of-codon prefix; gap columns do not advance wobble_count,
        # so the prefix repeats while the counter stays a multiple of 3.
        if wobble_count % 3 == 0:
            if pep_index < len(seq_pep):
                pep_row.append("  %s " % seq_pep[pep_index])
            wobble_row.append(" ")
            cds_row.append(" ")
            pep_index += 1

        current = wobble_ali[pos]
        wobble_row.append(current)
        cds_row.append(cds_ali[pos])
        if current != "-":
            wobble_count += 1

        if len(wobble_row) > 120 and len(wobble_row) % 3 == 0:
            emit((wobble_row, pep_row, cds_row))
            wobble_row, cds_row, pep_row = [], [], []

    emit((wobble_row, pep_row, cds_row))
Exemple #5
0
def AlignPair(pair, anchor=0):
    """Align the two intron sequences held by *pair* and store the result.

    The aligner is selected through the module-level ``param_method``
    and progress messages are controlled by the module-level
    ``param_loglevel``.

    :param pair: object providing ``mToken1/2``, ``mIntronId1/2`` and
        ``mAlignedSequence1/2``.  On success its ``mFrom1/2``,
        ``mAlignedSequence1/2``, ``mTo1/2``, ``mMethod``, ``mNumGaps``,
        ``mLength`` and ``mAligned`` attributes are overwritten.
    :param anchor: number of ``A`` characters added to both ends of both
        sequences before aligning; the anchored regions are removed from
        the alignment afterwards.
    :returns: ``True`` if a non-empty alignment was obtained, ``False``
        otherwise (*pair* is then left untouched).
    :raises NotImplementedError: if ``param_method`` is ``"clusaligned"``.
    """

    map_intron_a2b = alignlib_lite.py_makeAlignmentVector()

    if param_loglevel >= 1:
        # single parenthesised argument: same output under the Python 2
        # print statement and the Python 3 print function.
        print("# aligning %s-%i with %s-%i: lengths %i and %i" % (
            pair.mToken1, pair.mIntronId1, pair.mToken2, pair.mIntronId2,
            len(pair.mAlignedSequence1), len(pair.mAlignedSequence2)))
        sys.stdout.flush()

    flank = "A" * anchor
    s1 = flank + pair.mAlignedSequence1 + flank
    s2 = flank + pair.mAlignedSequence2 + flank

    # An unrecognised param_method leaves the alignment empty, which is
    # reported as a failure further down.
    if param_method == "dialigned":
        dialign.Align(s1, s2, map_intron_a2b)
    elif param_method == "dialignedlgs":
        dialignlgs.Align(s1, s2, map_intron_a2b)
    elif param_method == "dbaligned":
        dba.Align(s1, s2, map_intron_a2b)
    elif param_method == "clusaligned":
        # the call to clustal.Align() that used to follow was unreachable
        raise NotImplementedError("clustalw wrapper not up-to-date")

    if anchor:
        # strip the anchored flanks and shift the alignment back
        map_intron_a2b.removeRowRegion(
            anchor + len(pair.mAlignedSequence1) + 1,
            map_intron_a2b.getRowTo())
        map_intron_a2b.removeRowRegion(1, anchor)
        map_intron_a2b.removeColRegion(
            anchor + len(pair.mAlignedSequence2) + 1,
            map_intron_a2b.getColTo())
        map_intron_a2b.removeColRegion(1, anchor)
        map_intron_a2b.moveAlignment(-anchor, -anchor)

    if map_intron_a2b.getLength() == 0:
        if param_loglevel >= 1:
            print("# Error: empty intron alignment")
        return False

    seq1 = alignlib_lite.py_makeSequence(pair.mAlignedSequence1)
    seq2 = alignlib_lite.py_makeSequence(pair.mAlignedSequence2)

    data = alignlib_lite.py_AlignmentFormatExplicit(map_intron_a2b, seq1, seq2)

    pair.mFrom1, pair.mAlignedSequence1, pair.mTo1 = data.mRowFrom, data.mRowAlignment, data.mRowTo
    pair.mFrom2, pair.mAlignedSequence2, pair.mTo2 = data.mColFrom, data.mColAlignment, data.mColTo
    pair.mMethod = param_method

    pair.mNumGaps, pair.mLength = (map_intron_a2b.getNumGaps(),
                                   map_intron_a2b.getLength())
    pair.mAligned = pair.mLength - pair.mNumGaps

    if param_loglevel >= 2:
        print("# alignment success %s %s" % (pair.mAlignedSequence1,
                                             pair.mAlignedSequence2))

    return True
Exemple #6
0
def AlignPair(pair, anchor=0):
    """Align the pair of intron sequences stored on *pair*.

    Which aligner runs depends on the module-level ``param_method``;
    the amount of diagnostic output depends on the module-level
    ``param_loglevel``.

    :param pair: object with ``mToken1/2``, ``mIntronId1/2`` and
        ``mAlignedSequence1/2``.  After a successful alignment its
        ``mFrom1/2``, ``mAlignedSequence1/2``, ``mTo1/2``, ``mMethod``,
        ``mNumGaps``, ``mLength`` and ``mAligned`` attributes are set.
    :param anchor: number of ``A`` characters padded onto both ends of
        each sequence before aligning; the padded regions are removed
        from the alignment afterwards.
    :returns: ``True`` when the alignment is non-empty, else ``False``
        (in which case *pair* is not modified).
    :raises NotImplementedError: if ``param_method`` is ``"clusaligned"``.
    """

    map_intron_a2b = alignlib_lite.py_makeAlignmentVector()

    if param_loglevel >= 1:
        # print with one parenthesised argument behaves identically as a
        # Python 2 statement and as a Python 3 function call.
        print("# aligning %s-%i with %s-%i: lengths %i and %i" % (
            pair.mToken1, pair.mIntronId1,
            pair.mToken2, pair.mIntronId2,
            len(pair.mAlignedSequence1),
            len(pair.mAlignedSequence2)))
        sys.stdout.flush()

    padding = "A" * anchor
    s1 = padding + pair.mAlignedSequence1 + padding
    s2 = padding + pair.mAlignedSequence2 + padding

    # If param_method matches none of the branches the alignment stays
    # empty and the function returns False below.
    if param_method == "dialigned":
        dialign.Align(s1, s2, map_intron_a2b)
    elif param_method == "dialignedlgs":
        dialignlgs.Align(s1, s2, map_intron_a2b)
    elif param_method == "dbaligned":
        dba.Align(s1, s2, map_intron_a2b)
    elif param_method == "clusaligned":
        # the former clustal.Align() call after this raise never ran
        raise NotImplementedError("clustalw wrapper not up-to-date")

    if anchor:
        # remove the padded ends, then move the alignment back by anchor
        map_intron_a2b.removeRowRegion(
            anchor + len(pair.mAlignedSequence1) + 1, map_intron_a2b.getRowTo())
        map_intron_a2b.removeRowRegion(1, anchor)
        map_intron_a2b.removeColRegion(
            anchor + len(pair.mAlignedSequence2) + 1, map_intron_a2b.getColTo())
        map_intron_a2b.removeColRegion(1, anchor)
        map_intron_a2b.moveAlignment(-anchor, -anchor)

    if map_intron_a2b.getLength() == 0:
        if param_loglevel >= 1:
            print("# Error: empty intron alignment")
        return False

    seq1 = alignlib_lite.py_makeSequence(pair.mAlignedSequence1)
    seq2 = alignlib_lite.py_makeSequence(pair.mAlignedSequence2)

    data = alignlib_lite.py_AlignmentFormatExplicit(map_intron_a2b, seq1, seq2)

    pair.mFrom1, pair.mAlignedSequence1, pair.mTo1 = data.mRowFrom, data.mRowAlignment, data.mRowTo
    pair.mFrom2, pair.mAlignedSequence2, pair.mTo2 = data.mColFrom, data.mColAlignment, data.mColTo
    pair.mMethod = param_method

    pair.mNumGaps, pair.mLength = (map_intron_a2b.getNumGaps(),
                                   map_intron_a2b.getLength())
    pair.mAligned = pair.mLength - pair.mNumGaps

    if param_loglevel >= 2:
        print("# alignment success %s %s" % (pair.mAlignedSequence1,
                                             pair.mAlignedSequence2))

    return True
def AlignCodonBased( seq_wobble, seq_cds, seq_peptide, map_p2c, options,
                     diag_width = 2, max_advance = 2 ):
    """advance in codons in seq_wobble and match to nucleotides in seq_cds.

    Walks through seq_wobble (position x) and seq_cds (position y) in
    parallel and adds a pair to map_p2c for each matched position.
    Wobble positions holding "X", "N" or "-" are skipped without
    advancing y. Whenever the substitution score of the current pair is
    not positive, a window around the current position is re-aligned with
    AlignExhaustive() and the result replaces that region of map_p2c.

    Due to alignlib this is all in one-based coordinates.
    NOTE(review): x and y start at 0 below - confirm which convention
    actually applies.
    Takes care of frameshifts.

    map_p2c is cleared on entry and filled in place; nothing is returned.
    seq_peptide is only used for log output. diag_width and max_advance
    are not referenced in the function body.

    Raises ValueError if the same re-alignment window start is hit twice
    in a row, if a re-aligned or stepped pair scores below zero, or if the
    re-alignment is shorter than the window while more than four wobble
    residues remain.
    """
    
    map_p2c.clear()

    # gop and gep are not referenced again in this function.
    gop, gep = -1.0, -1.0
    matrix = alignlib_lite.py_makeSubstitutionMatrixBackTranslation( 1, -10, 1, alignlib_lite.py_getDefaultEncoder() )

    pep_seq = seq_peptide.asString()
    cds_seq = seq_cds.asString()
    wobble_seq = seq_wobble.asString()
    
    lcds = seq_cds.getLength()
    lwobble = seq_wobble.getLength()
    y = 0
    x = 0

    # start of the previous re-alignment window, used to detect a window
    # that is retried without progress.
    last_start = None

    while x < lwobble and y < lcds:

        xr = seq_wobble.asResidue( x )
        # skip over masked chars in wobble - these are gaps
        if seq_wobble.asChar(x) == "X": 
            x += 1
            continue

        # skip over masked chars in wobble - these are from
        # masked chars in the peptide sequence
        # Note to self: do not see all implications of this change
        # check later.
        if seq_wobble.asChar(x) == "N": 
            x += 1
            continue

        # skip over gaps in wobble 
        if seq_wobble.asChar(x) == "-": 
            x += 1
            continue

        s = matrix.getValue( xr, seq_cds.asResidue(y) )

        if options.loglevel >= 6:
            if (x % 3 == 0):
                # log the cds triplet starting at y next to the peptide
                # residue at index x / 3
                c = seq_cds.asChar(y) + seq_cds.asChar(y+1) + seq_cds.asChar(y+2)
                options.stdlog.write( "# c=%s, x=%i, y=%i, aa=%s target=%s\n" % (c, x, y,
                                                                                 Genomics.MapCodon2AA( c ),
                                                                                 pep_seq[int(x/3)]) )
                                      
            options.stdlog.write( "# x=%i\twob=%s\ty=%i\tcds=%s\txr=%s\tcds=%i\tscore=%s\n" % \
                                      (x, seq_wobble.asChar(x), y, seq_cds.asChar(y), xr, seq_cds.asResidue(y), str(s) ))
            
        # deal with mismatches
        if s <= 0:

            tmp_map_p2c = alignlib_lite.py_makeAlignmentVector()

            ## backtrack to previous three codons and align
            ## three codons for double frameshifts that span two codons and
            ## produce two X's and six WWWWWW.

            ## number of nucleotides to extend (should be multiple of 3)
            ## less than 12 caused failure for some peptides.
            d = 15
            
            # extend by amount dx
            dx = (x % 3) + d
            
            x_start = max(0, x - dx )
            # map to ensure that no ambiguous residue mappings
            # exist after re-alignment
            y_start = max(0, map_p2c.mapRowToCol( x_start, alignlib_lite.py_RIGHT ))

            # the same window start as in the previous re-alignment means
            # no progress was made
            if (x_start, y_start) == last_start:
                raise ValueError( "infinite loop detected" )

            last_start = (x_start, y_start)

            # the window covers at most 2 * d residues of each sequence
            x_end = min(x_start + 2 * d, len(wobble_seq) )
            y_end = min(y_start + 2 * d, len(cds_seq) )

            wobble_fragment = alignlib_lite.py_makeSequence(wobble_seq[x_start:x_end])
            cds_fragment = alignlib_lite.py_makeSequence(cds_seq[y_start:y_end])
            
            AlignExhaustive( wobble_fragment, cds_fragment, "", tmp_map_p2c, options )

            if options.loglevel >= 10:
                 options.stdlog.write("# fragmented alignment from %i-%i, %i-%i:\n%s\n" % (x_start, x_end,
                                                                                           y_start, y_end,
                                                                                           str(alignlib_lite.py_AlignmentFormatExplicit( tmp_map_p2c,
                                                                                                                                 wobble_fragment, 
                                                                                                                                 cds_fragment ))))
                 
                 options.stdlog.flush()

            ## clear alignment
            map_p2c.removeRowRegion( x_start, x_end )
            # number of consecutive wobble positions without a partner
            ngap = 0
            last_x, last_y = None, None
            # copy the window alignment back into map_p2c, shifting the
            # fragment coordinates by the window start
            for xxx in range( tmp_map_p2c.getRowFrom(), tmp_map_p2c.getRowTo() ):
                yyy = tmp_map_p2c.mapRowToCol(xxx)

                if yyy >= 0:
                    # note: this overwrites the outer loop's x, y and xr
                    x = xxx + x_start
                    y = yyy + y_start
                    xr = seq_wobble.asResidue(x)
                    s = matrix.getValue( seq_wobble.asResidue(x), seq_cds.asResidue(y) )
                    if s < 0:
                        raise ValueError("mismatched residue wobble: %i (%s), cds: %i (%s)" % (x, seq_wobble.asChar(x), y, seq_cds.asChar(y)))
                    
                    map_p2c.addPair( x, y, s)
                    last_x, last_y = x, y
                    if options.loglevel >= 6:
                        options.stdlog.write( "# reset: x=%i\twob=%s\ty=%i\tcds=%s\txr=%s\tcds=%i\tscore=%i\n" % \
                                              (x, seq_wobble.asChar(x), y, seq_cds.asChar(y), xr, seq_cds.asResidue(y), s ))
                        options.stdlog.flush()
                    ngap = 0
                else:
                    ngap += 1

                # treat special case of double frameshifts. They might cause a peptide/wobble residue
                # to be eliminated and thus the translated sequences will differ.
                # simply delete the last residue between x and y and move to next codon.
                # NOTE(review): last_x is still None here if the first three
                # window positions are all unaligned - confirm this cannot happen.
                if ngap == 3:
                    map_p2c.removeRowRegion( last_x, last_x + 1 )

                    last_x += 1
                    map_p2c.addPair( last_x, last_y )
                    if options.loglevel >= 6:
                        # xr and s still hold the values of the last aligned pair
                        options.stdlog.write( "# double: x=%i\twob=%s\ty=%i\tcds=%s\txr=%s\tcds=%i\tscore=%i\n" % \
                                              (last_x, seq_wobble.asChar(last_x), last_y, seq_cds.asChar(last_y), xr, seq_cds.asResidue(last_y), s ))
                        options.stdlog.flush()                    
                    ngap = 0
                    
            ## exit condition if alignment is shorter than problematic residue
            ## need to catch this to avoid infinite loop.
            if tmp_map_p2c.getRowTo() < d:
                if lwobble - x <= 4:
                    ## only last codon is missing, so ok
                    break
                else:
                    raise ValueError("failure to align in designated window.")
                    
            # this value is overwritten by the assignment right below
            s = 0
            
        # after a re-alignment x, y and xr are those of the last pair that
        # was copied back in the loop above (if any)
        s = matrix.getValue( xr, seq_cds.asResidue(y) )

        if s < 0:
            raise ValueError("mis-matching residues.")
        
        map_p2c.addPair( x, y, float(s) )
        
        # advance to next residues
        x += 1
        y += 1

    # sanity checks
    assert( map_p2c.getRowTo() <= seq_wobble.getLength() )
    assert( map_p2c.getColTo() <= seq_cds.getLength() )
Exemple #8
0
    parser.add_option("-o",
                      "--options",
                      dest="options",
                      type="string",
                      help="BlastZ options.")

    parser.set_defaults(input_filename_seq1=None,
                        input_filename_seq2=None,
                        options="B=0 C=2")

    (options, args) = E.Start(parser)

    wrapper = BlastZ(options.options)

    import alignlib_lite
    seqs1 = Genomics.ReadPeptideSequences(
        open(options.input_filename_seq1, "r"))
    seqs2 = Genomics.ReadPeptideSequences(
        open(options.input_filename_seq2, "r"))
    seq1 = seqs1[seqs1.keys()[0]]
    seq2 = seqs2[seqs2.keys()[0]]
    result = alignlib_lite.py_makeAlignmentVector()
    wrapper.Align(seq1, seq2, result)

    print str(
        alignlib_lite.py_AlignmentFormatExplicit(
            result, alignlib_lite.py_makeSequence(seq1),
            alignlib_lite.py_makeSequence(seq2)))

    E.Stop()
Exemple #9
0
def AlignCodonBased(seq_wobble,
                    seq_cds,
                    seq_peptide,
                    map_p2c,
                    options,
                    diag_width=2,
                    max_advance=2):
    """advance in codons in seq_wobble and match to nucleotides in seq_cds.

    Walks through seq_wobble (position x) and seq_cds (position y) in
    parallel and adds a pair to map_p2c for each matched position.
    Wobble positions holding "X", "N" or "-" are skipped without
    advancing y. Whenever the substitution score of the current pair is
    not positive, a window around the current position is re-aligned with
    AlignExhaustive() and the result replaces that region of map_p2c.

    Due to alignlib this is all in one-based coordinates.
    NOTE(review): x and y start at 0 below - confirm which convention
    actually applies.
    Takes care of frameshifts.

    map_p2c is cleared on entry and filled in place; nothing is returned.
    seq_peptide is only used for log output. diag_width and max_advance
    are not referenced in the function body.

    Raises ValueError if the same re-alignment window start is hit twice
    in a row, if a re-aligned or stepped pair scores below zero, or if the
    re-alignment is shorter than the window while more than four wobble
    residues remain.
    """

    map_p2c.clear()

    # gop and gep are not referenced again in this function.
    gop, gep = -1.0, -1.0
    matrix = alignlib_lite.py_makeSubstitutionMatrixBackTranslation(
        1, -10, 1, alignlib_lite.py_getDefaultEncoder())

    pep_seq = seq_peptide.asString()
    cds_seq = seq_cds.asString()
    wobble_seq = seq_wobble.asString()

    lcds = seq_cds.getLength()
    lwobble = seq_wobble.getLength()
    y = 0
    x = 0

    # start of the previous re-alignment window, used to detect a window
    # that is retried without progress.
    last_start = None

    while x < lwobble and y < lcds:

        xr = seq_wobble.asResidue(x)
        # skip over masked chars in wobble - these are gaps
        if seq_wobble.asChar(x) == "X":
            x += 1
            continue

        # skip over masked chars in wobble - these are from
        # masked chars in the peptide sequence
        # Note to self: do not see all implications of this change
        # check later.
        if seq_wobble.asChar(x) == "N":
            x += 1
            continue

        # skip over gaps in wobble
        if seq_wobble.asChar(x) == "-":
            x += 1
            continue

        s = matrix.getValue(xr, seq_cds.asResidue(y))

        if options.loglevel >= 6:
            if (x % 3 == 0):
                # log the cds triplet starting at y next to the peptide
                # residue at index x / 3
                c = seq_cds.asChar(y) + seq_cds.asChar(y +
                                                       1) + seq_cds.asChar(y +
                                                                           2)
                options.stdlog.write(
                    "# c=%s, x=%i, y=%i, aa=%s target=%s\n" %
                    (c, x, y, Genomics.MapCodon2AA(c), pep_seq[int(x / 3)]))

            options.stdlog.write(
                "# x=%i\twob=%s\ty=%i\tcds=%s\txr=%s\tcds=%i\tscore=%s\n" %
                (x, seq_wobble.asChar(x), y, seq_cds.asChar(y), xr,
                 seq_cds.asResidue(y), str(s)))

        # deal with mismatches
        if s <= 0:

            tmp_map_p2c = alignlib_lite.py_makeAlignmentVector()

            # backtrack to previous three codons and align
            # three codons for double frameshifts that span two codons and
            # produce two X's and six WWWWWW.

            # number of nucleotides to extend (should be multiple of 3)
            # less than 12 caused failure for some peptides.
            d = 15

            # extend by amount dx
            dx = (x % 3) + d

            x_start = max(0, x - dx)
            # map to ensure that no ambiguous residue mappings
            # exist after re-alignment
            y_start = max(0,
                          map_p2c.mapRowToCol(x_start, alignlib_lite.py_RIGHT))

            # the same window start as in the previous re-alignment means
            # no progress was made
            if (x_start, y_start) == last_start:
                raise ValueError("infinite loop detected")

            last_start = (x_start, y_start)

            # the window covers at most 2 * d residues of each sequence
            x_end = min(x_start + 2 * d, len(wobble_seq))
            y_end = min(y_start + 2 * d, len(cds_seq))

            wobble_fragment = alignlib_lite.py_makeSequence(
                wobble_seq[x_start:x_end])
            cds_fragment = alignlib_lite.py_makeSequence(
                cds_seq[y_start:y_end])

            AlignExhaustive(wobble_fragment, cds_fragment, "", tmp_map_p2c,
                            options)

            if options.loglevel >= 10:
                options.stdlog.write(
                    "# fragmented alignment from %i-%i, %i-%i:\n%s\n" %
                    (x_start, x_end, y_start, y_end,
                     str(
                         alignlib_lite.py_AlignmentFormatExplicit(
                             tmp_map_p2c, wobble_fragment, cds_fragment))))

                options.stdlog.flush()

            # clear alignment
            map_p2c.removeRowRegion(x_start, x_end)
            # number of consecutive wobble positions without a partner
            ngap = 0
            last_x, last_y = None, None
            # copy the window alignment back into map_p2c, shifting the
            # fragment coordinates by the window start
            for xxx in range(tmp_map_p2c.getRowFrom(), tmp_map_p2c.getRowTo()):
                yyy = tmp_map_p2c.mapRowToCol(xxx)

                if yyy >= 0:
                    # note: this overwrites the outer loop's x, y and xr
                    x = xxx + x_start
                    y = yyy + y_start
                    xr = seq_wobble.asResidue(x)
                    s = matrix.getValue(seq_wobble.asResidue(x),
                                        seq_cds.asResidue(y))
                    if s < 0:
                        raise ValueError(
                            "mismatched residue wobble: %i (%s), cds: %i (%s)"
                            % (x, seq_wobble.asChar(x), y, seq_cds.asChar(y)))

                    map_p2c.addPair(x, y, s)
                    last_x, last_y = x, y
                    if options.loglevel >= 6:
                        options.stdlog.write(
                            "# reset: x=%i\twob=%s\ty=%i\tcds=%s\txr=%s\tcds=%i\tscore=%i\n"
                            % (x, seq_wobble.asChar(x), y, seq_cds.asChar(y),
                               xr, seq_cds.asResidue(y), s))
                        options.stdlog.flush()
                    ngap = 0
                else:
                    ngap += 1

                # treat special case of double frameshifts. They might cause a peptide/wobble residue
                # to be eliminated and thus the translated sequences will differ.
                # simply delete the last residue between x and y and move to
                # next codon.
                # NOTE(review): last_x is still None here if the first three
                # window positions are all unaligned - confirm this cannot happen.
                if ngap == 3:
                    map_p2c.removeRowRegion(last_x, last_x + 1)

                    last_x += 1
                    map_p2c.addPair(last_x, last_y)
                    if options.loglevel >= 6:
                        # xr and s still hold the values of the last aligned pair
                        options.stdlog.write(
                            "# double: x=%i\twob=%s\ty=%i\tcds=%s\txr=%s\tcds=%i\tscore=%i\n"
                            % (last_x, seq_wobble.asChar(last_x), last_y,
                               seq_cds.asChar(last_y), xr,
                               seq_cds.asResidue(last_y), s))
                        options.stdlog.flush()
                    ngap = 0

            # exit condition if alignment is shorter than problematic residue
            # need to catch this to avoid infinite loop.
            if tmp_map_p2c.getRowTo() < d:
                if lwobble - x <= 4:
                    # only last codon is missing, so ok
                    break
                else:
                    raise ValueError("failure to align in designated window.")

            # this value is overwritten by the assignment right below
            s = 0

        # after a re-alignment x, y and xr are those of the last pair that
        # was copied back in the loop above (if any)
        s = matrix.getValue(xr, seq_cds.asResidue(y))

        if s < 0:
            raise ValueError("mis-matching residues.")

        map_p2c.addPair(x, y, float(s))

        # advance to next residues
        x += 1
        y += 1

    # sanity checks
    assert (map_p2c.getRowTo() <= seq_wobble.getLength())
    assert (map_p2c.getColTo() <= seq_cds.getLength())
Exemple #10
0
    def Align(self, method, anchor=0, loglevel=1):
        """Align self.mSequence1 against self.mSequence2 and store the result.

        get rid of this and use a method class instead in the future

        :param method: name of a built-in aligner ("dialign", "blastz",
            "dialignlgs", "dba", "nw", "sw") or, for any other value, a
            callable invoked as ``method(s1, s2, alignment)`` that fills
            the alignment in place.
        :param anchor: number of ``A`` characters added to both ends of
            both sequences before aligning; the anchored regions are
            removed from the alignment afterwards.
        :param loglevel: accepted for interface compatibility; not used
            in this method.
        :raises NotImplementedError: if *method* is "clustal".
        :raises AlignmentError: if the aligner returns an empty alignment.

        On success the alignment and derived values are stored on the
        instance (``mMethod``, ``mAlignment``, ``mAlignedSequence1/2``,
        ``mAlignment1/2``, ``mAlignmentFrom1/2``, ``mAlignmentTo1/2``,
        ``mNumGaps``, ``mLength``, ``mAligned``, ``strand``) and
        ``SetPercentIdentity()`` and ``SetBlockSizes()`` are called.
        """

        map_a2b = alignlib_lite.py_makeAlignmentVector()
        flank = "A" * anchor
        s1 = flank + self.mSequence1 + flank
        s2 = flank + self.mSequence2 + flank

        self.strand = "+"

        if method == "dialign":
            dialign = WrapperDialign.Dialign(self.mOptionsDialign)
            dialign.Align(s1, s2, map_a2b)
        elif method == "blastz":
            blastz = WrapperBlastZ.BlastZ(self.mOptionsBlastZ)
            blastz.Align(s1, s2, map_a2b)
            if blastz.isReverseComplement():
                # record the strand and replace the second sequence with
                # the result of Genomics.complement()
                self.strand = "-"
                self.mSequence2 = Genomics.complement(self.mSequence2)
        elif method == "dialignlgs":
            dialignlgs = WrapperDialign.Dialign(self.mOptionsDialignLGS)
            dialignlgs.Align(s1, s2, map_a2b)
        elif method == "dba":
            dba = WrapperDBA.DBA()
            dba.Align(s1, s2, map_a2b)
        elif method == "clustal":
            # the wrapper calls that used to follow this raise were
            # unreachable and have been removed
            raise NotImplementedError("clustal wrapper needs to be updated")
        elif method == "nw":
            seq1 = alignlib_lite.py_makeSequence(s1)
            seq2 = alignlib_lite.py_makeSequence(s2)
            alignator = alignlib_lite.py_makeAlignatorDPFull(
                alignlib_lite.py_ALIGNMENT_GLOBAL,
                gop=-12.0,
                gep=-2.0)
            alignator.align(map_a2b, seq1, seq2)
        elif method == "sw":
            seq1 = alignlib_lite.py_makeSequence(s1)
            seq2 = alignlib_lite.py_makeSequence(s2)
            # alignator_sw and min_score_sw come from the enclosing module
            alignlib_lite.py_performIterativeAlignment(
                map_a2b, seq1, seq2, alignator_sw, min_score_sw)
        else:
            # use callback function
            method(s1, s2, map_a2b)

        if map_a2b.getLength() == 0:
            raise AlignmentError("empty alignment")

        if anchor:
            # strip the anchored flanks and shift the alignment back
            map_a2b.removeRowRegion(anchor + len(self.mSequence1) + 1,
                                    map_a2b.getRowTo())
            map_a2b.removeRowRegion(1, anchor)
            map_a2b.removeColRegion(anchor + len(self.mSequence2) + 1,
                                    map_a2b.getColTo())
            map_a2b.removeColRegion(1, anchor)
            map_a2b.moveAlignment(-anchor, -anchor)

        f = alignlib_lite.py_AlignmentFormatExplicit(
            map_a2b,
            alignlib_lite.py_makeSequence(self.mSequence1),
            alignlib_lite.py_makeSequence(self.mSequence2))

        self.mMethod = method
        self.mAlignment = map_a2b
        self.mAlignedSequence1, self.mAlignedSequence2 = f.mRowAlignment, f.mColAlignment
        f = alignlib_lite.py_AlignmentFormatEmissions(map_a2b)
        self.mAlignment1, self.mAlignment2 = f.mRowAlignment, f.mColAlignment
        self.mAlignmentFrom1 = map_a2b.getRowFrom()
        self.mAlignmentTo1 = map_a2b.getRowTo()
        self.mAlignmentFrom2 = map_a2b.getColFrom()
        self.mAlignmentTo2 = map_a2b.getColTo()
        self.mNumGaps, self.mLength = map_a2b.getNumGaps(), map_a2b.getLength()
        self.mAligned = self.mLength - self.mNumGaps

        self.SetPercentIdentity()
        self.SetBlockSizes()
        input_filename_seq2 = None,
        options = "B=0 C=2")
    
    (options, args) = E.Start( parser ) 
    
    wrapper = BlastZ( options.options )

    import alignlib_lite
    seqs1 = Genomics.ReadPeptideSequences( open(options.input_filename_seq1, "r") )
    seqs2 = Genomics.ReadPeptideSequences( open(options.input_filename_seq2, "r") )
    seq1 = seqs1[seqs1.keys()[0]]
    seq2 = seqs2[seqs2.keys()[0]]    
    result = alignlib_lite.py_makeAlignmentVector()
    wrapper.Align( seq1, seq2, result) 

    print str( alignlib_lite.py_AlignmentFormatExplicit( result,
                                                 alignlib_lite.py_makeSequence( seq1 ),
                                                 alignlib_lite.py_makeSequence( seq2 ) ) )
    
    E.Stop()
        
            
        
                                 
        
        
        
        
    
        
def main( argv = None ):
    """script main.

    parses command line options in sys.argv, unless *argv* is given.

    Links are read from stdin through ``BlastAlignments.iterator_links``.
    For every link whose query and sbjct tokens are both present in the
    sequences given with ``--sequences`` an alignment is built, optionally
    combined with the per-token maps given with ``--map``, and passed to
    ``Write`` -- once for the whole alignment or, if ``--exons`` was given,
    once per pair of matching exons. All other links are counted as skipped.

    Raises ValueError if an alignment extends beyond one of its sequences or
    if ``--codons`` is set and the number of aligned positions is not a
    multiple of three. Raises IndexError if exons were supplied but one of
    the tokens of a link has no exon entry.
    """

    if argv is None: argv = sys.argv

    parser = E.OptionParser( version = "%prog version: $Id: links2fasta.py 2446 2009-01-27 16:32:35Z andreas $", usage = globals()["__doc__"] )

    parser.add_option( "-s", "--sequences", dest="filename_sequences", type="string",
                       help="peptide sequence [Default=%default]" )

    parser.add_option( "-f", "--format", dest="format", type="string",
                       help="output format [Default=%default]" )

    parser.add_option( "-e", "--expand",  dest="expand", action="store_true",
                       help="expand positions from peptide to nucleotide alignment [Default=%default]")

    parser.add_option( "-m", "--map",  dest="filename_map", type="string",
                       help="map alignments [Default=%default]")

    parser.add_option( "-c", "--codons",  dest="require_codons", action="store_true",
                       help="require codons [Default=%default]")

    parser.add_option( "--one-based-coordinates",  dest="one_based_coordinates", action="store_true",
                       help="expect one-based coordinates. The default are zero based coordinates [Default=%default].")

    parser.add_option( "--no-identical",  dest="no_identical", action="store_true",
                       help="do not output identical pairs [Default=%default]" )

    parser.add_option( "-g", "--no-gaps",  dest="no_gaps", action="store_true",
                       help="remove all gaps from aligned sequences [Default=%default]")

    parser.add_option( "-x", "--exons",  dest="filename_exons", type="string",
                       help="filename with exon boundaries [Default=%default]")

    parser.add_option( "-o", "--outfile",  dest="filename_outfile", type="string",
                       help="filename to save links [Default=%default]")

    parser.add_option( "--min-length",  dest="min_length", type="int",
                       help="minimum length of alignment [Default=%default]")

    parser.add_option( "--filter",  dest="filename_filter", type="string",
                       help="given a set of previous alignments, only write new pairs [Default=%default].")

    parser.set_defaults(
        filename_sequences = None,
        filename_exons = None,
        filename_map = None,
        filename_outfile = None,
        no_gaps = False,
        format = "fasta",
        expand = False,
        require_codons = False,
        no_identical = False,
        min_length = 0,
        report_step = 100,
        one_based_coordinates = False,
        filename_filter = None)

    (options, args) = E.Start( parser, add_mysql_options = True )

    ## ------------------------------------------------------------------
    ## read input data: sequences, exon boundaries, maps and the filter
    t0 = time.time()
    if options.filename_sequences:
        # the result is used as an in-memory mapping below, so the file
        # can be closed as soon as reading has finished.
        with open(options.filename_sequences, "r") as infile:
            sequences = Genomics.ReadPeptideSequences( infile )
    else:
        sequences = {}

    if options.loglevel >= 1:
        options.stdlog.write( "# read %i sequences\n" % len(sequences) )
        sys.stdout.flush()

    if options.filename_exons:
        with open(options.filename_exons, "r") as infile:
            exons = Exons.ReadExonBoundaries( infile )
    else:
        exons = {}

    if options.loglevel >= 1:
        options.stdlog.write( "# read %i exons\n" % len(exons) )
        sys.stdout.flush()

    map_old2new = {}
    if options.filename_map:
        with open(options.filename_map, "r") as infile:
            for line in infile:
                if line.startswith("#"): continue
                m = Map()
                m.read( line )
                map_old2new[m.mToken] = m

    if options.loglevel >= 1:
        options.stdlog.write( "# read %i maps\n" % len(map_old2new) )
        sys.stdout.flush()

    if options.filename_filter:
        if options.loglevel >= 1:
            options.stdlog.write( "# reading filtering information.\n" )
            sys.stdout.flush()

        map_pair2hids = {}

        # a missing filter file is not an error: the filter just stays empty
        if os.path.exists( options.filename_filter ):

            infile = open(options.filename_filter, "r")
            try:
                iterator = FastaIterator.FastaIterator( infile )

                # records come in pairs: first the row, then the col sequence
                while 1:
                    # the end of input is accepted both as a None record
                    # and as StopIteration.
                    try:
                        record1 = iterator.next()
                        if record1 is None: break
                        record2 = iterator.next()
                        if record2 is None: break
                    except StopIteration:
                        break

                    identifier1 = re.match(r"(\S+)", record1.title).groups()[0]
                    identifier2 = re.match(r"(\S+)", record2.title).groups()[0]

                    pair_id = "%s-%s" % (identifier1, identifier2)
                    hid = Genomics.GetHID(record1.sequence + ";" + record2.sequence)

                    map_pair2hids.setdefault( pair_id, [] ).append( hid )
            finally:
                infile.close()

        if options.loglevel >= 1:
            options.stdlog.write( "# read filtering information for %i pairs.\n" % len(map_pair2hids) )
            sys.stdout.flush()
    else:
        map_pair2hids = None

    if options.loglevel >= 1:
        options.stdlog.write( "# finished input in %i seconds.\n" % (time.time() - t0))

    if options.filename_outfile:
        outfile = open(options.filename_outfile, "w")
    else:
        outfile = None

    ## ------------------------------------------------------------------
    ## process links
    # both alignment containers are re-used (cleared) for every link
    map_row2col = alignlib_lite.py_makeAlignmentVector()
    tmp1_map_row2col = alignlib_lite.py_makeAlignmentVector()
    # number of Write() calls per returned status
    counts = {}

    iterations = 0

    t1 = time.time()
    ninput, nskipped, noutput = 0, 0, 0

    for link in BlastAlignments.iterator_links( sys.stdin ):

        iterations += 1
        ninput += 1

        if options.loglevel >= 1 and iterations % options.report_step == 0:
            options.stdlog.write( "# iterations: %i in %i seconds.\n" % (iterations, time.time() - t1) )
            sys.stdout.flush()

        # links without sequence for either partner can not be written
        if link.mQueryToken not in sequences or \
           link.mSbjctToken not in sequences:
            nskipped += 1
            continue

        if options.loglevel >= 3:
            options.stdlog.write( "# read link %s\n" %  str(link) )

        row_seq = alignlib_lite.py_makeSequence( sequences[link.mQueryToken] )
        col_seq = alignlib_lite.py_makeSequence( sequences[link.mSbjctToken] )

        if options.one_based_coordinates:
            link.mQueryFrom -= 1
            link.mSbjctFrom -= 1

        if options.expand:
            # peptide -> nucleotide coordinates: three positions per residue
            link.mQueryFrom = link.mQueryFrom * 3
            link.mSbjctFrom = link.mSbjctFrom * 3
            link.mQueryAli = ScaleAlignment( link.mQueryAli, 3 )
            link.mSbjctAli = ScaleAlignment( link.mSbjctAli, 3 )

        map_row2col.clear()

        alignlib_lite.py_AlignmentFormatEmissions(
            link.mQueryFrom, link.mQueryAli,
            link.mSbjctFrom, link.mSbjctAli ).copy(  map_row2col )

        if link.mQueryToken in map_old2new:
            row_map = map_old2new[link.mQueryToken]
            tmp1_map_row2col.clear()
            row_map.expand()
            if options.loglevel >= 3:
                options.stdlog.write( "# combining in row with %s\n" %\
                                      str(alignlib_lite.py_AlignmentFormatEmissions( row_map.mMapOld2New ) ))

            alignlib_lite.py_combineAlignment( tmp1_map_row2col,
                                      row_map.mMapOld2New,
                                      map_row2col,
                                      alignlib_lite.py_RR )
            row_map.clear()
            alignlib_lite.py_copyAlignment( map_row2col, tmp1_map_row2col )

        if link.mSbjctToken in map_old2new:
            col_map = map_old2new[link.mSbjctToken]
            tmp1_map_row2col.clear()
            col_map.expand()
            if options.loglevel >= 3:
                options.stdlog.write( "# combining in col with %s\n" %\
                                      str(alignlib_lite.py_AlignmentFormatEmissions( col_map.mMapOld2New ) ))

            alignlib_lite.py_combineAlignment( tmp1_map_row2col,
                                       map_row2col,
                                       col_map.mMapOld2New,
                                       alignlib_lite.py_CR )
            col_map.clear()
            alignlib_lite.py_copyAlignment( map_row2col, tmp1_map_row2col )

        # the alignment must not extend beyond either sequence
        dr = row_seq.getLength() - map_row2col.getRowTo()
        dc = col_seq.getLength() - map_row2col.getColTo()
        if dr < 0 or dc < 0:
            raise ValueError("out of bounds alignment: %s-%s: alignment out of bounds. row=%i col=%i ali=%s" %\
                                          (link.mQueryToken,
                                           link.mSbjctToken,
                                           row_seq.getLength(),
                                           col_seq.getLength(),
                                           str(alignlib_lite.py_AlignmentFormatEmissions(map_row2col))))

        if options.loglevel >= 2:
            options.stdlog.write( str( alignlib_lite.py_AlignmentFormatExplicit( map_row2col,
                                                                         row_seq,
                                                                         col_seq )) + "\n" )

        ## check for incomplete codons
        if options.require_codons:

            naligned = map_row2col.getNumAligned()

            # NOTE: an earlier comment described this check as turned off
            # while alignlib_lite was being fixed, but it is active.
            if naligned % 3 != 0:
                options.stdlog.write( "# %s\n" % str(map_row2col) )
                options.stdlog.write( "# %s\n" % str(link) )
                # a token need not have a map; .get() keeps a KeyError from
                # hiding the ValueError raised below.
                options.stdlog.write( "# %s\n" % str(map_old2new.get(link.mQueryToken)) )
                options.stdlog.write( "# %s\n" % str(map_old2new.get(link.mSbjctToken)) )
                options.stdlog.write( "#\n%s\n" % alignlib_lite.py_AlignmentFormatExplicit( map_row2col,
                                                                                    row_seq,
                                                                                    col_seq ) )

                raise ValueError("incomplete codons %i in pair %s - %s" % (naligned, link.mQueryToken, link.mSbjctToken))

        ## if so desired, write on a per exon level:
        if exons:
            if link.mQueryToken not in exons:
                raise IndexError("%s not found in exons" % (link.mQueryToken))
            if link.mSbjctToken not in exons:
                raise IndexError("%s not found in exons" % (link.mSbjctToken))
            exons1 = exons[link.mQueryToken]
            exons2 = exons[link.mSbjctToken]

            ## Get overlapping segments
            segments = Exons.MatchExons( map_row2col, exons1, exons2 )

            for a,b in segments:
                tmp1_map_row2col.clear()

                # make sure you got codon boundaries. Note that frameshifts
                # in previous exons will cause the codons to start at positions
                # different from mod 3. The problem is that I don't know where
                # the frameshifts occur exactly. The exon boundaries are given
                # with respect to the cds, which include the frame shifts.
                # Unfortunately, phase information seems to be incomplete in the input files.

                from1, to1 = GetAdjustedBoundaries( a, exons1 )
                from2, to2 = GetAdjustedBoundaries( b, exons2 )

                alignlib_lite.py_copyAlignment( tmp1_map_row2col, map_row2col,
                                       from1+1, to1, from2+1, to2 )

                # pair_filter used to be the undefined name map_pair2hid,
                # which raised a NameError as soon as exons were used.
                mode = Write( tmp1_map_row2col, row_seq, col_seq, link,
                              no_gaps = options.no_gaps,
                              no_identical = options.no_identical,
                              min_length = options.min_length,
                              suffix1="_%s" % str(a),
                              suffix2="_%s" % str(b),
                              outfile = outfile,
                              pair_filter = map_pair2hids,
                              format = options.format )

                counts[mode] = counts.get( mode, 0 ) + 1

        else:
            mode = Write( map_row2col, row_seq, col_seq, link,
                          min_length = options.min_length,
                          no_gaps = options.no_gaps,
                          no_identical = options.no_identical,
                          outfile = outfile,
                          pair_filter = map_pair2hids,
                          format = options.format )

            counts[mode] = counts.get( mode, 0 ) + 1

        noutput += 1

    if outfile: outfile.close()

    if options.loglevel >= 1:
        options.stdlog.write("# %s\n" % ", ".join( "%s=%i" % (mode, n) for mode, n in counts.items() ))
        options.stdlog.write("# ninput=%i, noutput=%i, nskipped=%i\n" % (ninput, noutput, nskipped) )

    E.Stop()
Exemple #13
0
def Write(map_row2col,
          row_seq,
          col_seq,
          link,
          no_gaps=False,
          no_identical=False,
          min_length=0,
          suffix1="",
          suffix2="",
          outfile=None,
          pair_filter=None,
          format="fasta"):
    """write alignment based on map_row2col.

    The aligned pair is rendered from *map_row2col* over *row_seq* and
    *col_seq*, classified, and -- unless it is already listed in
    *pair_filter* -- printed to stdout.

    Arguments
    ---------
    map_row2col : alignment between row and col sequence.
    row_seq, col_seq : sequence objects used to render the alignment.
    link : object providing ``mQueryToken`` and ``mSbjctToken``.
    no_gaps : if set, drop every column with a gap in either sequence.
    no_identical : if set, pairs of identical aligned strings get the
        status "identical" and are not printed.
    min_length : pairs whose aligned row string is shorter get the status
        "length" and are not printed.
    suffix1, suffix2 : appended to the query/sbjct token in all output.
    outfile : optional file-like object receiving one tab-separated
        summary line per call.
    pair_filter : optional mapping of "query-sbjct" to a collection of
        ``Genomics.GetHID`` values of pairs written previously; such pairs
        are reported as "old" and not printed again.
    format : "fasta" prints the pair, "dummy" prints nothing.

    Returns
    -------
    status : "empty", "length", "identical", "stops" or "success".

    Raises
    ------
    ValueError : if a pair is to be printed and *format* is unknown.
    """

    status = None
    filter_status = "new"

    if map_row2col.getLength() == 0:
        status = "empty"
    else:
        f = alignlib_lite.py_AlignmentFormatExplicit(map_row2col, row_seq,
                                                     col_seq)

        row_from = map_row2col.getRowFrom()
        row_to = map_row2col.getRowTo()
        col_from = map_row2col.getColFrom()
        col_to = map_row2col.getColTo()
        row_ali, col_ali = f.mRowAlignment, f.mColAlignment

        if no_gaps:
            # keep only columns in which both sequences have a residue
            columns = [(r, c) for r, c in zip(row_ali, col_ali)
                       if r != "-" and c != "-"]
            row_ali = "".join(r for r, c in columns)
            col_ali = "".join(c for r, c in columns)

        if len(row_ali) < min_length:
            status = "length"
        elif no_identical and row_ali == col_ali:
            status = "identical"

    if not status:

        if pair_filter:
            pair_id = "%s-%s" % (link.mQueryToken, link.mSbjctToken)
            if pair_id in pair_filter:
                h = Genomics.GetHID(row_ali + ";" + col_ali)
                if h in pair_filter[pair_id]:
                    filter_status = "old"

        translation1 = Genomics.TranslateDNA2Protein(row_ali)
        translation2 = Genomics.TranslateDNA2Protein(col_ali)

        # The check used to look for "X" in the first and "x" in the second
        # translation only. Both sequences are now tested for both symbols,
        # which still flags every pair that was flagged before.
        # NOTE(review): presumably X/x marks a stop in the translation --
        # confirm against Genomics.TranslateDNA2Protein.
        has_stop = False
        for translation in (translation1, translation2):
            if "X" in translation or "x" in translation:
                has_stop = True

        if has_stop:
            status = "stops"
        else:
            status = "success"

        if filter_status == "new":
            if format == "fasta":
                # single-argument print(): same output on python 2 and 3
                print(">%s%s %s %s\n%s\n>%s%s %s %s\n%s" % (
                    link.mQueryToken, suffix1, row_from, row_to, row_ali,
                    link.mSbjctToken, suffix2, col_from, col_to, col_ali))
            elif format != "dummy":
                raise ValueError("unknown format")

    # the summary line is written for every pair, printed or not
    if outfile:
        outfile.write("%s%s\t%s%s\t%s\t%i\t%s\n" %
                      (link.mQueryToken, suffix1, link.mSbjctToken, suffix2,
                       status, map_row2col.getLength(), filter_status))

    return status
Exemple #14
0
def main(argv=None):
    """script main.

    parses command line options in sys.argv, unless *argv* is given.

    Links are read from stdin through ``BlastAlignments.iterator_links``.
    For every link whose query and sbjct tokens are both present in the
    sequences given with ``--sequences`` an alignment is built, optionally
    combined with the per-token maps given with ``--map``, and passed to
    ``Write`` -- once for the whole alignment or, if ``--exons`` was given,
    once per pair of matching exons. All other links are counted as skipped.

    Raises ValueError if an alignment extends beyond one of its sequences or
    if ``--codons`` is set and the number of aligned positions is not a
    multiple of three. Raises IndexError if exons were supplied but one of
    the tokens of a link has no exon entry.
    """

    if argv is None:
        argv = sys.argv

    parser = E.OptionParser(
        version=
        "%prog version: $Id: links2fasta.py 2446 2009-01-27 16:32:35Z andreas $",
        usage=globals()["__doc__"])

    parser.add_option("-s",
                      "--sequences",
                      dest="filename_sequences",
                      type="string",
                      help="peptide sequence [Default=%default]")

    parser.add_option("-f",
                      "--format",
                      dest="format",
                      type="string",
                      help="output format [Default=%default]")

    parser.add_option(
        "-e",
        "--expand",
        dest="expand",
        action="store_true",
        help=
        "expand positions from peptide to nucleotide alignment [Default=%default]"
    )

    parser.add_option("-m",
                      "--map",
                      dest="filename_map",
                      type="string",
                      help="map alignments [Default=%default]")

    parser.add_option("-c",
                      "--codons",
                      dest="require_codons",
                      action="store_true",
                      help="require codons [Default=%default]")

    parser.add_option(
        "--one-based-coordinates",
        dest="one_based_coordinates",
        action="store_true",
        help=
        "expect one-based coordinates. The default are zero based coordinates [Default=%default]."
    )

    parser.add_option("--no-identical",
                      dest="no_identical",
                      action="store_true",
                      help="do not output identical pairs [Default=%default]")

    parser.add_option(
        "-g",
        "--no-gaps",
        dest="no_gaps",
        action="store_true",
        help="remove all gaps from aligned sequences [Default=%default]")

    parser.add_option("-x",
                      "--exons",
                      dest="filename_exons",
                      type="string",
                      help="filename with exon boundaries [Default=%default]")

    parser.add_option("-o",
                      "--outfile",
                      dest="filename_outfile",
                      type="string",
                      help="filename to save links [Default=%default]")

    parser.add_option("--min-length",
                      dest="min_length",
                      type="int",
                      help="minimum length of alignment [Default=%default]")

    parser.add_option(
        "--filter",
        dest="filename_filter",
        type="string",
        help=
        "given a set of previous alignments, only write new pairs [Default=%default]."
    )

    parser.set_defaults(filename_sequences=None,
                        filename_exons=None,
                        filename_map=None,
                        filename_outfile=None,
                        no_gaps=False,
                        format="fasta",
                        expand=False,
                        require_codons=False,
                        no_identical=False,
                        min_length=0,
                        report_step=100,
                        one_based_coordinates=False,
                        filename_filter=None)

    (options, args) = E.Start(parser, add_mysql_options=True)

    # ------------------------------------------------------------------
    # read input data: sequences, exon boundaries, maps and the filter
    t0 = time.time()
    if options.filename_sequences:
        # the result is used as an in-memory mapping below, so the file
        # can be closed as soon as reading has finished.
        with open(options.filename_sequences, "r") as infile:
            sequences = Genomics.ReadPeptideSequences(infile)
    else:
        sequences = {}

    if options.loglevel >= 1:
        options.stdlog.write("# read %i sequences\n" % len(sequences))
        sys.stdout.flush()

    if options.filename_exons:
        with open(options.filename_exons, "r") as infile:
            exons = Exons.ReadExonBoundaries(infile)
    else:
        exons = {}

    if options.loglevel >= 1:
        options.stdlog.write("# read %i exons\n" % len(exons))
        sys.stdout.flush()

    map_old2new = {}
    if options.filename_map:
        with open(options.filename_map, "r") as infile:
            for line in infile:
                if line.startswith("#"):
                    continue
                m = Map()
                m.read(line)
                map_old2new[m.mToken] = m

    if options.loglevel >= 1:
        options.stdlog.write("# read %i maps\n" % len(map_old2new))
        sys.stdout.flush()

    if options.filename_filter:
        if options.loglevel >= 1:
            options.stdlog.write("# reading filtering information.\n")
            sys.stdout.flush()

        map_pair2hids = {}

        # a missing filter file is not an error: the filter just stays empty
        if os.path.exists(options.filename_filter):

            infile = open(options.filename_filter, "r")
            try:
                iterator = FastaIterator.FastaIterator(infile)

                # records come in pairs: first the row, then the col sequence
                while 1:
                    # the end of input is accepted both as a None record
                    # and as StopIteration.
                    try:
                        record1 = iterator.next()
                        if record1 is None:
                            break
                        record2 = iterator.next()
                        if record2 is None:
                            break
                    except StopIteration:
                        break

                    identifier1 = re.match(r"(\S+)",
                                           record1.title).groups()[0]
                    identifier2 = re.match(r"(\S+)",
                                           record2.title).groups()[0]

                    pair_id = "%s-%s" % (identifier1, identifier2)
                    hid = Genomics.GetHID(record1.sequence + ";" +
                                          record2.sequence)

                    map_pair2hids.setdefault(pair_id, []).append(hid)
            finally:
                infile.close()

        if options.loglevel >= 1:
            options.stdlog.write(
                "# read filtering information for %i pairs.\n" %
                len(map_pair2hids))
            sys.stdout.flush()
    else:
        map_pair2hids = None

    if options.loglevel >= 1:
        options.stdlog.write("# finished input in %i seconds.\n" %
                             (time.time() - t0))

    if options.filename_outfile:
        outfile = open(options.filename_outfile, "w")
    else:
        outfile = None

    # ------------------------------------------------------------------
    # process links
    # both alignment containers are re-used (cleared) for every link
    map_row2col = alignlib_lite.py_makeAlignmentVector()
    tmp1_map_row2col = alignlib_lite.py_makeAlignmentVector()
    # number of Write() calls per returned status
    counts = {}

    iterations = 0

    t1 = time.time()
    ninput, nskipped, noutput = 0, 0, 0

    for link in BlastAlignments.iterator_links(sys.stdin):

        iterations += 1
        ninput += 1

        if options.loglevel >= 1 and \
                iterations % options.report_step == 0:
            options.stdlog.write("# iterations: %i in %i seconds.\n" %
                                 (iterations, time.time() - t1))
            sys.stdout.flush()

        # links without sequence for either partner can not be written
        if link.mQueryToken not in sequences or \
           link.mSbjctToken not in sequences:
            nskipped += 1
            continue

        if options.loglevel >= 3:
            options.stdlog.write("# read link %s\n" % str(link))

        row_seq = alignlib_lite.py_makeSequence(sequences[link.mQueryToken])
        col_seq = alignlib_lite.py_makeSequence(sequences[link.mSbjctToken])

        if options.one_based_coordinates:
            link.mQueryFrom -= 1
            link.mSbjctFrom -= 1

        if options.expand:
            # peptide -> nucleotide coordinates: three positions per residue
            link.mQueryFrom = link.mQueryFrom * 3
            link.mSbjctFrom = link.mSbjctFrom * 3
            link.mQueryAli = ScaleAlignment(link.mQueryAli, 3)
            link.mSbjctAli = ScaleAlignment(link.mSbjctAli, 3)

        map_row2col.clear()

        alignlib_lite.py_AlignmentFormatEmissions(
            link.mQueryFrom, link.mQueryAli, link.mSbjctFrom,
            link.mSbjctAli).copy(map_row2col)

        if link.mQueryToken in map_old2new:
            row_map = map_old2new[link.mQueryToken]
            tmp1_map_row2col.clear()
            row_map.expand()
            if options.loglevel >= 3:
                options.stdlog.write("# combining in row with %s\n" % str(
                    alignlib_lite.py_AlignmentFormatEmissions(
                        row_map.mMapOld2New)))

            alignlib_lite.py_combineAlignment(
                tmp1_map_row2col, row_map.mMapOld2New,
                map_row2col, alignlib_lite.py_RR)
            row_map.clear()
            alignlib_lite.py_copyAlignment(map_row2col, tmp1_map_row2col)

        if link.mSbjctToken in map_old2new:
            col_map = map_old2new[link.mSbjctToken]
            tmp1_map_row2col.clear()
            col_map.expand()
            if options.loglevel >= 3:
                options.stdlog.write("# combining in col with %s\n" % str(
                    alignlib_lite.py_AlignmentFormatEmissions(
                        col_map.mMapOld2New)))

            alignlib_lite.py_combineAlignment(
                tmp1_map_row2col, map_row2col,
                col_map.mMapOld2New, alignlib_lite.py_CR)
            col_map.clear()
            alignlib_lite.py_copyAlignment(map_row2col, tmp1_map_row2col)

        # the alignment must not extend beyond either sequence
        dr = row_seq.getLength() - map_row2col.getRowTo()
        dc = col_seq.getLength() - map_row2col.getColTo()
        if dr < 0 or dc < 0:
            raise ValueError(
                "out of bounds alignment: %s-%s: alignment out of bounds. row=%i col=%i ali=%s"
                %
                (link.mQueryToken, link.mSbjctToken, row_seq.getLength(),
                 col_seq.getLength(),
                 str(alignlib_lite.py_AlignmentFormatEmissions(map_row2col))))

        if options.loglevel >= 2:
            options.stdlog.write(
                str(
                    alignlib_lite.py_AlignmentFormatExplicit(
                        map_row2col, row_seq, col_seq)) + "\n")

        # check for incomplete codons
        if options.require_codons:

            naligned = map_row2col.getNumAligned()

            # NOTE: an earlier comment described this check as turned off
            # while alignlib_lite was being fixed, but it is active.
            if naligned % 3 != 0:
                options.stdlog.write("# %s\n" % str(map_row2col))
                options.stdlog.write("# %s\n" % str(link))
                # a token need not have a map; .get() keeps a KeyError from
                # hiding the ValueError raised below.
                options.stdlog.write(
                    "# %s\n" % str(map_old2new.get(link.mQueryToken)))
                options.stdlog.write(
                    "# %s\n" % str(map_old2new.get(link.mSbjctToken)))
                options.stdlog.write("#\n%s\n" %
                                     alignlib_lite.py_AlignmentFormatExplicit(
                                         map_row2col, row_seq, col_seq))

                raise ValueError(
                    "incomplete codons %i in pair %s - %s" %
                    (naligned, link.mQueryToken, link.mSbjctToken))

        # if so desired, write on a per exon level:
        if exons:
            if link.mQueryToken not in exons:
                raise IndexError("%s not found in exons" % (link.mQueryToken))
            if link.mSbjctToken not in exons:
                raise IndexError("%s not found in exons" % (link.mSbjctToken))
            exons1 = exons[link.mQueryToken]
            exons2 = exons[link.mSbjctToken]

            # Get overlapping segments
            segments = Exons.MatchExons(map_row2col, exons1, exons2)

            for a, b in segments:
                tmp1_map_row2col.clear()

                # make sure you got codon boundaries. Note that frameshifts
                # in previous exons will cause the codons to start at positions
                # different from mod 3. The problem is that I don't know where
                # the frameshifts occur exactly. The exon boundaries are given
                # with respect to the cds, which include the frame shifts.
                # Unfortunately, phase information seems to be incomplete in
                # the input files.

                from1, to1 = GetAdjustedBoundaries(a, exons1)
                from2, to2 = GetAdjustedBoundaries(b, exons2)

                alignlib_lite.py_copyAlignment(tmp1_map_row2col, map_row2col,
                                               from1 + 1, to1, from2 + 1, to2)

                # pair_filter used to be the undefined name map_pair2hid,
                # which raised a NameError as soon as exons were used.
                mode = Write(tmp1_map_row2col,
                             row_seq,
                             col_seq,
                             link,
                             no_gaps=options.no_gaps,
                             no_identical=options.no_identical,
                             min_length=options.min_length,
                             suffix1="_%s" % str(a),
                             suffix2="_%s" % str(b),
                             outfile=outfile,
                             pair_filter=map_pair2hids,
                             format=options.format)

                counts[mode] = counts.get(mode, 0) + 1

        else:
            mode = Write(map_row2col,
                         row_seq,
                         col_seq,
                         link,
                         min_length=options.min_length,
                         no_gaps=options.no_gaps,
                         no_identical=options.no_identical,
                         outfile=outfile,
                         pair_filter=map_pair2hids,
                         format=options.format)

            counts[mode] = counts.get(mode, 0) + 1

        noutput += 1

    if outfile:
        outfile.close()

    if options.loglevel >= 1:
        options.stdlog.write("# %s\n" % ", ".join(
            "%s=%i" % (mode, n) for mode, n in counts.items()))
        options.stdlog.write("# ninput=%i, noutput=%i, nskipped=%i\n" %
                             (ninput, noutput, nskipped))

    E.Stop()
Exemple #15
0
                if unaligned_pair and \
                        unaligned_pair.mToken1 == pair.mToken1 and \
                        unaligned_pair.mToken2 == pair.mToken2 and \
                        unaligned_pair.mIntronId1 == pair.mIntronId1:

                    map_a2b = alignlib_lite.py_makeAlignmentVector()
                    f = AlignmentFormatEmissions(
                        pair.mFrom1,
                        pair.mAlignedSequence1,
                        pair.mFrom2,
                        pair.mAlignedSequence2).copy(map_a2b)
                    map_a2b.moveAlignment(-unaligned_pair.mFrom1 +
                                          1, -unaligned_pair.mFrom2 + 1)

                    data = alignlib_lite.py_AlignmentFormatExplicit(map_a2b,
                                                                    alignlib_lite.py_makeSequence(
                                                                        unaligned_pair.mAlignedSequence1),
                                                                    alignlib_lite.py_makeSequence(unaligned_pair.mAlignedSequence2))

                    from1, ali1, to1 = data.mRowFrom, data.mRowAlignment, data.mRowTo
                    from2, ali2, to2 = data.mColFrom, data.mColAlignment, data.mColTo

                    pair.mAlignedSequence1 = ali1
                    pair.mAlignedSequence2 = ali2

                else:
                    raise "sequence not found for pair %s" % str(pair)

            if param_do_gblocks:
                if param_loglevel >= 4:
                    print "# length before: %i %i" % (len(pair.mAlignedSequence1), pair.mAligned)
                pair.mAlignedSequence1, pair.mAlignedSequence2 = gblocks.GetBlocks(
def Write(map_row2col, row_seq, col_seq, link,
          no_gaps=False, no_identical=False,
          min_length=0,
          suffix1="", suffix2="",
          outfile=None,
          pair_filter=None,
          format="fasta"):
    """Write the pairwise alignment described by *map_row2col*.

    The alignment is expanded into two gapped strings, optionally
    filtered, and printed to stdout. One summary line per call is
    appended to *outfile* if it is given.

    Arguments
    ---------
    map_row2col : alignment object providing ``getLength``, ``getRowFrom``,
        ``getRowTo``, ``getColFrom`` and ``getColTo``.
    row_seq, col_seq : sequence objects passed on to
        ``alignlib_lite.py_AlignmentFormatExplicit``.
    link : object with ``mQueryToken`` and ``mSbjctToken`` attributes,
        used to label the output.
    no_gaps : if true, drop every column in which either row has a ``-``.
    no_identical : if true, pairs with identical aligned strings get the
        status ``"identical"`` and are not printed.
    min_length : aligned strings shorter than this get status ``"length"``.
    suffix1, suffix2 : appended to the query / sbjct token in all output.
    outfile : optional file-like object receiving the summary line
        ``query, sbjct, status, alignment length, filter status``.
    pair_filter : optional mapping ``"query-sbjct"`` -> collection of
        ``Genomics.GetHID`` hashes; pairs whose hash is already present
        are marked ``"old"`` and not printed.
    format : ``"fasta"`` prints the pair, ``"dummy"`` prints nothing.

    Returns
    -------
    One of ``"empty"``, ``"length"``, ``"identical"``, ``"stops"`` or
    ``"success"``.

    Raises
    ------
    ValueError
        If a new pair is to be printed and *format* is neither
        ``"fasta"`` nor ``"dummy"``.
    """

    status = None
    filter_status = "new"

    if map_row2col.getLength() == 0:
        status = "empty"
    else:
        explicit = alignlib_lite.py_AlignmentFormatExplicit(
            map_row2col, row_seq, col_seq)

        row_from = map_row2col.getRowFrom()
        row_to = map_row2col.getRowTo()
        col_from = map_row2col.getColFrom()
        col_to = map_row2col.getColTo()
        row_ali, col_ali = explicit.mRowAlignment, explicit.mColAlignment

        if no_gaps:
            # keep only columns that are ungapped in both rows
            kept = [(r, c) for r, c in zip(row_ali, col_ali)
                    if r != "-" and c != "-"]
            row_ali = "".join([r for r, c in kept])
            col_ali = "".join([c for r, c in kept])

        if len(row_ali) < min_length:
            status = "length"
        elif no_identical and row_ali == col_ali:
            status = "identical"

    if status is None:

        if pair_filter:
            pair_key = "%s-%s" % (link.mQueryToken, link.mSbjctToken)
            if pair_key in pair_filter:
                hid = Genomics.GetHID(row_ali + ";" + col_ali)
                if hid in pair_filter[pair_key]:
                    filter_status = "old"

        translation1 = Genomics.TranslateDNA2Protein(row_ali)
        translation2 = Genomics.TranslateDNA2Protein(col_ali)

        # The second translation used to be tested for a lowercase "x"
        # only, so an uppercase "X" in it was never reported. Both
        # translations are now checked without regard to case.
        if "X" in translation1.upper() or "X" in translation2.upper():
            status = "stops"
        else:
            status = "success"

        if filter_status == "new":
            if format == "fasta":
                # single parenthesised argument: valid as a Python 2 print
                # statement and as a Python 3 print call
                print(">%s%s %s %s\n%s\n>%s%s %s %s\n%s" % (
                    link.mQueryToken, suffix1, row_from, row_to, row_ali,
                    link.mSbjctToken, suffix2, col_from, col_to, col_ali))
            elif format != "dummy":
                raise ValueError("unknown format")

    if outfile:
        outfile.write("%s%s\t%s%s\t%s\t%i\t%s\n" % (
            link.mQueryToken, suffix1, link.mSbjctToken, suffix2,
            status, map_row2col.getLength(), filter_status))

    return status
Exemple #17
0
def getAlignmentFull(m, q, t, options):
    """Return the gapped query and target strings for alignment *m*.

    *q* and *t* are converted with ``alignlib_lite.py_makeSequence`` and
    combined with *m* through ``alignlib_lite.py_AlignmentFormatExplicit``.
    The result is the tuple ``(row alignment, column alignment)``.
    *options* is accepted but not used here.
    """
    query_sequence = alignlib_lite.py_makeSequence(q)
    target_sequence = alignlib_lite.py_makeSequence(t)
    explicit = alignlib_lite.py_AlignmentFormatExplicit(
        m, query_sequence, target_sequence)
    return explicit.mRowAlignment, explicit.mColAlignment
Exemple #18
0
            if param_is_compressed:
                if unaligned_pair and \
                        unaligned_pair.mToken1 == pair.mToken1 and \
                        unaligned_pair.mToken2 == pair.mToken2 and \
                        unaligned_pair.mIntronId1 == pair.mIntronId1:

                    map_a2b = alignlib_lite.py_makeAlignmentVector()
                    f = AlignmentFormatEmissions(
                        pair.mFrom1, pair.mAlignedSequence1, pair.mFrom2,
                        pair.mAlignedSequence2).copy(map_a2b)
                    map_a2b.moveAlignment(-unaligned_pair.mFrom1 + 1,
                                          -unaligned_pair.mFrom2 + 1)

                    data = alignlib_lite.py_AlignmentFormatExplicit(
                        map_a2b,
                        alignlib_lite.py_makeSequence(
                            unaligned_pair.mAlignedSequence1),
                        alignlib_lite.py_makeSequence(
                            unaligned_pair.mAlignedSequence2))

                    from1, ali1, to1 = data.mRowFrom, data.mRowAlignment, data.mRowTo
                    from2, ali2, to2 = data.mColFrom, data.mColAlignment, data.mColTo

                    pair.mAlignedSequence1 = ali1
                    pair.mAlignedSequence2 = ali2

                else:
                    raise "sequence not found for pair %s" % str(pair)

            if param_do_gblocks:
                if param_loglevel >= 4:
                    print "# length before: %i %i" % (len(
Exemple #19
0
    def read(self, line):
        """Fill this entry from one tab-separated prediction line.

        Four layouts are accepted, distinguished by the number of fields:

        * 26: prediction id, the 23 core fields, alignment string,
          number assembled
        * 25: prediction id, the 23 core fields, alignment string
        * 24: the 23 core fields, alignment string
        * 23: the 23 core fields; ``mAlignmentString`` is set to ``""``

        Layouts without ``mPredictionId`` or ``mNAssembled`` leave those
        attributes as they are; because both are passed through ``int``
        afterwards they must already exist on the instance.

        After parsing, ``score``, ``mQueryCoverage``, ``mPercentIdentity``
        and ``mPercentSimilarity`` are converted to ``float`` and the
        coordinate and count fields to ``int``. If ``self.mExpand`` is
        true, ``mMapPeptide2Translation`` and ``mMapPeptide2Genome`` are
        built as well.

        Raises ``ValueError`` if the number of fields matches no layout
        (or if a numeric field cannot be converted).
        """

        # Strip only a real line terminator. Unconditionally dropping the
        # last character would corrupt the final field of a line that has
        # no trailing newline (e.g. the last line of a file).
        if line.endswith("\n"):
            line = line[:-1]
        data = line.split("\t")

        core_fields = (
            "mQueryToken",
            "mSbjctToken",
            "mSbjctStrand",
            "mRank",
            "score",
            "mQueryFrom",
            "mQueryTo",
            "mQueryAli",
            "mSbjctFrom",
            "mSbjctTo",
            "mSbjctAli",
            "mQueryLength",
            "mQueryCoverage",
            "mNGaps",
            "mNFrameShifts",
            "mNIntrons",
            "mNSplits",
            "mNStopCodons",
            "mPercentIdentity",
            "mPercentSimilarity",
            "mTranslation",
            "mSbjctGenomeFrom",
            "mSbjctGenomeTo",
        )
        layouts = {
            26: ("mPredictionId",) + core_fields +
                ("mAlignmentString", "mNAssembled"),
            25: ("mPredictionId",) + core_fields + ("mAlignmentString",),
            24: core_fields + ("mAlignmentString",),
            23: core_fields,
        }

        field_names = layouts.get(len(data))
        if field_names is None:
            raise ValueError("unknown format: %i fields in line %s" % (
                len(data), line))

        for field_name, value in zip(field_names, data):
            setattr(self, field_name, value)

        if len(data) == 23:
            # shortest layout carries no alignment string
            self.mAlignmentString = ""

        for field_name in ("score", "mQueryCoverage",
                           "mPercentIdentity", "mPercentSimilarity"):
            setattr(self, field_name, float(getattr(self, field_name)))

        for field_name in ("mPredictionId", "mQueryFrom", "mQueryTo",
                           "mQueryLength", "mSbjctFrom", "mSbjctTo",
                           "mSbjctGenomeFrom", "mSbjctGenomeTo", "mNGaps",
                           "mNIntrons", "mNSplits", "mNStopCodons",
                           "mNFrameShifts", "mNAssembled"):
            setattr(self, field_name, int(getattr(self, field_name)))

        if self.mExpand:
            self.mMapPeptide2Translation = \
                alignlib_lite.py_makeAlignmentVector()

            # the map stays empty unless both alignment strings are given
            if self.mQueryAli != "" and self.mSbjctAli != "":

                alignlib_lite.py_AlignmentFormatExplicit(
                    self.mQueryFrom, self.mQueryAli, self.mSbjctFrom,
                    self.mSbjctAli).copy(self.mMapPeptide2Translation)

            self.mMapPeptide2Genome = Genomics.String2Alignment(
                self.mAlignmentString)
Exemple #20
0
def getAlignmentFull(m, q, t, options):
    """Return ``(row alignment, column alignment)`` for the map *m*.

    The query *q* and target *t* are wrapped with
    ``alignlib_lite.py_makeSequence`` before being handed, together with
    *m*, to ``alignlib_lite.py_AlignmentFormatExplicit``. Nothing is
    printed; *options* is not used by this function.
    """
    row_sequence = alignlib_lite.py_makeSequence(q)
    col_sequence = alignlib_lite.py_makeSequence(t)
    formatted = alignlib_lite.py_AlignmentFormatExplicit(
        m, row_sequence, col_sequence)
    return formatted.mRowAlignment, formatted.mColAlignment
    def read(self, line):
        """Fill this entry from one tab-separated prediction line.

        The number of fields selects the layout:

        * 26: prediction id, the 23 core fields, alignment string,
          number assembled
        * 25: prediction id, the 23 core fields, alignment string
        * 24: the 23 core fields, alignment string
        * 23: the 23 core fields; ``mAlignmentString`` is set to ``""``

        ``mPredictionId`` and ``mNAssembled`` are left untouched by the
        layouts that do not contain them, but both are still converted
        with ``int`` below, so they must already be set on the instance.

        ``score``, ``mQueryCoverage``, ``mPercentIdentity`` and
        ``mPercentSimilarity`` end up as ``float``; coordinates and counts
        end up as ``int``. With ``self.mExpand`` true the alignment maps
        ``mMapPeptide2Translation`` and ``mMapPeptide2Genome`` are built.

        Raises ``ValueError`` for an unrecognised number of fields (or a
        numeric field that cannot be converted).
        """

        # Only remove an actual newline: slicing off the last character
        # unconditionally truncates the final field of unterminated lines.
        if line.endswith("\n"):
            line = line[:-1]
        data = line.split("\t")

        core_fields = (
            "mQueryToken", "mSbjctToken", "mSbjctStrand",
            "mRank", "score",
            "mQueryFrom", "mQueryTo", "mQueryAli",
            "mSbjctFrom", "mSbjctTo", "mSbjctAli",
            "mQueryLength", "mQueryCoverage",
            "mNGaps", "mNFrameShifts", "mNIntrons",
            "mNSplits", "mNStopCodons",
            "mPercentIdentity", "mPercentSimilarity",
            "mTranslation",
            "mSbjctGenomeFrom", "mSbjctGenomeTo",
        )
        layouts = {
            26: ("mPredictionId",) + core_fields +
                ("mAlignmentString", "mNAssembled"),
            25: ("mPredictionId",) + core_fields + ("mAlignmentString",),
            24: core_fields + ("mAlignmentString",),
            23: core_fields,
        }

        field_names = layouts.get(len(data))
        if field_names is None:
            raise ValueError("unknown format: %i fields in line %s" % (
                len(data), line))

        for field_name, value in zip(field_names, data):
            setattr(self, field_name, value)

        if len(data) == 23:
            # shortest layout carries no alignment string
            self.mAlignmentString = ""

        for field_name in ("score", "mQueryCoverage",
                           "mPercentIdentity", "mPercentSimilarity"):
            setattr(self, field_name, float(getattr(self, field_name)))

        for field_name in ("mPredictionId",
                           "mQueryFrom", "mQueryTo", "mQueryLength",
                           "mSbjctFrom", "mSbjctTo",
                           "mSbjctGenomeFrom", "mSbjctGenomeTo",
                           "mNGaps", "mNIntrons", "mNSplits",
                           "mNStopCodons",
                           "mNFrameShifts", "mNAssembled"):
            setattr(self, field_name, int(getattr(self, field_name)))

        if self.mExpand:
            self.mMapPeptide2Translation = \
                alignlib_lite.py_makeAlignmentVector()

            # the map stays empty unless both alignment strings are given
            if self.mQueryAli != "" and self.mSbjctAli != "":

                alignlib_lite.py_AlignmentFormatExplicit(
                    self.mQueryFrom, self.mQueryAli,
                    self.mSbjctFrom, self.mSbjctAli).copy(
                        self.mMapPeptide2Translation)

            self.mMapPeptide2Genome = Genomics.String2Alignment(
                self.mAlignmentString)