Beispiel #1
0
def prune_aln(aln, what, fh_out=sys.stdout):
    """Write the alignment to fh_out with the requested columns removed.

    what selects which columns are dropped:
    'any_gap' drops columns in which bioutils.isgap() flags at least
    one residue, 'all_gap' drops columns in which it flags every
    residue and 'identical' drops columns whose uppercased residues
    are all the same. The remaining columns are written as
    concatenated fasta, one record per sequence in aln.
    """

    assert what in ['any_gap', 'all_gap', 'identical']

    retained = []
    for col_idx in xrange(aln.get_alignment_length()):
        # distinct residues of this column, compared case-insensitively
        residues = set(rec.seq[col_idx].upper() for rec in aln)

        drop = False
        if what == 'identical':
            drop = len(residues) == 1
        elif what in ('any_gap', 'all_gap'):
            gap_flags = [bioutils.isgap(res) for res in residues]
            drop = any(gap_flags) if what == 'any_gap' else all(gap_flags)

        if not drop:
            retained.append(col_idx)

    # FIXME add support for proper alignment output, not just
    # concatenated fasta
    # columns are reported 1-based in the log
    col_list = ', '.join(str(col_idx + 1) for col_idx in retained)
    LOG.info("Keeping the following columns: %s" % col_list)
    for rec in aln:
        fh_out.write(">%s\n" % rec.id)
        fh_out.write(
            '%s\n' % ''.join(rec.seq[col_idx] for col_idx in retained))
def prune_aln(aln, what, fh_out=sys.stdout):
    """Drop the columns selected by what and print the pruned alignment.

    Supported values for what are 'any_gap' (column contains a residue
    for which bioutils.isgap() is true), 'all_gap' (bioutils.isgap() is
    true for every residue of the column) and 'identical' (all
    uppercased residues of the column are equal). Every sequence is
    written to fh_out as a fasta record made up of the kept columns.
    """

    assert what in ['any_gap', 'all_gap', 'identical']

    def gap_flags(chars):
        return [bioutils.isgap(c) for c in chars]

    # one test per mode; each gets the distinct uppercased residues of
    # a column and tells whether that column is pruned
    prune_tests = {
        'any_gap': lambda chars: any(gap_flags(chars)),
        'all_gap': lambda chars: all(gap_flags(chars)),
        'identical': lambda chars: len(chars) == 1,
    }
    # an unknown mode (only reachable with asserts disabled) prunes nothing
    is_pruned = prune_tests.get(what, lambda chars: False)

    keep_cols = [
        i for i in xrange(aln.get_alignment_length())
        if not is_pruned(set(sr.seq[i].upper() for sr in aln))]

    # FIXME add support for proper alignment output, not just
    # concatenated fasta
    one_based = [str(i + 1) for i in keep_cols]
    LOG.info("Keeping the following columns: %s" % ', '.join(one_based))
    for sr in aln:
        fh_out.write(">%s\n" % sr.id)
        pruned_seq = ''.join([sr.seq[i] for i in keep_cols])
        fh_out.write('%s\n' % pruned_seq)
Beispiel #3
0
def comp_pairwise_ident_matrix(seqrecs):
    """Return the pairwise identities of seqrecs as a triangular 2d list.

    Row i of the result holds the identities of record i against
    records 0..i-1 (as computed by pairwise_identity() on the
    uppercased sequence strings), followed by None for the undefined
    self comparison. Valid index range is therefore mx[i][j] with
    0 <= j <= i < len(seqrecs), and mx[i][i] is always None.

    seqrecs is a sequence of records whose seq attribute can be turned
    into a string with str().
    """
    # uppercase every sequence once instead of once per comparison
    seqs = [str(rec.seq).upper() for rec in seqrecs]

    # intentionally a list, not a matrix, because numpy doesn't know
    # about symmetric arrays
    mx = []
    for i, s1 in enumerate(seqs):
        row = [pairwise_identity(s1, s2) for s2 in seqs[:i]]
        row.append(None)  # self comparison not defined
        mx.append(row)
    return mx
Beispiel #4
0
def comp_pairwise_ident_matrix(seqrecs):
    """Return the pairwise identities of seqrecs as a triangular 2d list.

    Row i of the result holds the identities of record i against
    records 0..i-1 (as computed by pairwise_identity() on the
    uppercased sequence strings), followed by None for the undefined
    self comparison. Valid index range is therefore mx[i][j] with
    0 <= j <= i < len(seqrecs), and mx[i][i] is always None.

    seqrecs is a sequence of records whose seq attribute can be turned
    into a string with str().
    """
    # uppercase every sequence once instead of once per comparison
    seqs = [str(rec.seq).upper() for rec in seqrecs]

    # intentionally a list, not a matrix, because numpy doesn't know
    # about symmetric arrays
    mx = []
    for i, s1 in enumerate(seqs):
        row = [pairwise_identity(s1, s2) for s2 in seqs[:i]]
        row.append(None)  # self comparison not defined
        mx.append(row)
    return mx
Beispiel #5
0
def pairwise_identity(s1, s2):
    """Return fractional pairwise identity between two aligned strings.

    Identity is defined here as the number of alignment columns in
    which both strings carry the same non-gap residue (case
    sensitive), divided by the length of the shorter of the two
    ungapped sequences.

    Uppercase your sequences for case insensitivity. For mixed RNA/DNA
    you might want to replace T's with U's or vice versa.

    Based on ideas from
    http://code.activestate.com/recipes/499304-hamming-distance/

    Args:
        s1: first aligned sequence string.
        s2: second aligned sequence string, same length as s1.

    Returns:
        The identity as a float. 0.0 is returned if either sequence is
        empty after ungapping, since there is nothing to compare.

    Raises:
        AssertionError: if s1 and s2 differ in length.
    """

    assert len(s1) == len(s2), "aligned sequences must have equal length"

    min_ungapped_len = min(len(bioutils.ungap(s1)), len(bioutils.ungap(s2)))
    if min_ungapped_len == 0:
        # all-gap or empty sequence: avoid a ZeroDivisionError below
        return 0.0

    # only columns where neither string has a gap can count as identical
    idents = sum(c1 == c2 for c1, c2 in izip(s1, s2)
                 if not bioutils.isgap(c1) and not bioutils.isgap(c2))
    return idents / float(min_ungapped_len)
Beispiel #6
0
def pairwise_identity(s1, s2):
    """Return fractional pairwise identity between two aligned strings.

    Identity is defined here as the number of alignment columns in
    which both strings carry the same non-gap residue (case
    sensitive), divided by the length of the shorter of the two
    ungapped sequences.

    Uppercase your sequences for case insensitivity. For mixed RNA/DNA
    you might want to replace T's with U's or vice versa.

    Based on ideas from
    http://code.activestate.com/recipes/499304-hamming-distance/

    Args:
        s1: first aligned sequence string.
        s2: second aligned sequence string, same length as s1.

    Returns:
        The identity as a float. 0.0 is returned if either sequence is
        empty after ungapping, since there is nothing to compare.

    Raises:
        AssertionError: if s1 and s2 differ in length.
    """

    assert len(s1) == len(s2), "aligned sequences must have equal length"

    min_ungapped_len = min(len(bioutils.ungap(s1)), len(bioutils.ungap(s2)))
    if min_ungapped_len == 0:
        # all-gap or empty sequence: avoid a ZeroDivisionError below
        return 0.0

    # only columns where neither string has a gap can count as identical
    idents = sum(c1 == c2 for c1, c2 in izip(s1, s2)
                 if not bioutils.isgap(c1) and not bioutils.isgap(c2))
    return idents / float(min_ungapped_len)