def prune_aln(aln, what, fh_out=sys.stdout):
    """Prune columns from an alignment and write the result as FASTA.

    Every column of `aln` is inspected (residues are uppercased first)
    and dropped if it matches the criterion selected by `what`:

    - 'any_gap': at least one residue is a gap according to
      bioutils.isgap
    - 'all_gap': every residue is a gap according to bioutils.isgap
    - 'identical': all (uppercased) residues in the column are the same

    The surviving columns are logged (1-based) via LOG.info and each
    record is then written to `fh_out` as a '>id' header line followed
    by its pruned sequence on a single line.

    Args:
        aln: alignment providing get_alignment_length() and iteration
            over records that have `.id` and `.seq`.
        what: one of 'any_gap', 'all_gap' or 'identical'.
        fh_out: writable file-like object; defaults to sys.stdout.

    Raises:
        AssertionError: if `what` is not one of the supported modes.
    """
    # Predicates receive the set of distinct uppercased residues of one
    # column and return True if that column has to be dropped.
    drop_tests = {
        'any_gap': lambda residues: any(bioutils.isgap(c) for c in residues),
        'all_gap': lambda residues: all(bioutils.isgap(c) for c in residues),
        'identical': lambda residues: len(residues) == 1,
    }
    # Raised explicitly (instead of using an assert statement) so the
    # check still runs under "python -O"; the exception type is the one
    # callers have always seen.
    if what not in drop_tests:
        raise AssertionError(
            "what must be one of %s, got %r" % (sorted(drop_tests), what))
    drop_column = drop_tests[what]

    records = list(aln)
    # Convert each sequence to a plain string once; indexing these is
    # cheaper than indexing (and uppercasing) record.seq for every column.
    raw_seqs = [str(rec.seq) for rec in records]
    upper_seqs = [seq.upper() for seq in raw_seqs]

    keep_cols = [i for i in xrange(aln.get_alignment_length())
                 if not drop_column(set(seq[i] for seq in upper_seqs))]

    # FIXME add support for proper alignment output, not just
    # concatenated fasta
    LOG.info("Keeping the following columns: %s" % (
        ', '.join([str(x + 1) for x in keep_cols])))
    for rec, seq in zip(records, raw_seqs):
        fh_out.write(">%s\n" % rec.id)
        fh_out.write('%s\n' % ''.join([seq[i] for i in keep_cols]))
def prune_aln(aln, what, fh_out=sys.stdout):
    """Prune columns from an alignment and write the result as FASTA.

    Every column of `aln` is inspected (residues are uppercased first)
    and dropped if it matches the criterion selected by `what`:

    - 'any_gap': at least one residue is a gap according to
      bioutils.isgap
    - 'all_gap': every residue is a gap according to bioutils.isgap
    - 'identical': all (uppercased) residues in the column are the same

    The surviving columns are logged (1-based) via LOG.info and each
    record is then written to `fh_out` as a '>id' header line followed
    by its pruned sequence on a single line.

    Args:
        aln: alignment providing get_alignment_length() and iteration
            over records that have `.id` and `.seq`.
        what: one of 'any_gap', 'all_gap' or 'identical'.
        fh_out: writable file-like object; defaults to sys.stdout.

    Raises:
        AssertionError: if `what` is not one of the supported modes.
    """
    # Predicates receive the set of distinct uppercased residues of one
    # column and return True if that column has to be dropped.
    drop_tests = {
        'any_gap': lambda residues: any(bioutils.isgap(c) for c in residues),
        'all_gap': lambda residues: all(bioutils.isgap(c) for c in residues),
        'identical': lambda residues: len(residues) == 1,
    }
    # Raised explicitly (instead of using an assert statement) so the
    # check still runs under "python -O"; the exception type is the one
    # callers have always seen.
    if what not in drop_tests:
        raise AssertionError(
            "what must be one of %s, got %r" % (sorted(drop_tests), what))
    drop_column = drop_tests[what]

    records = list(aln)
    # Convert each sequence to a plain string once; indexing these is
    # cheaper than indexing (and uppercasing) record.seq for every column.
    raw_seqs = [str(rec.seq) for rec in records]
    upper_seqs = [seq.upper() for seq in raw_seqs]

    keep_cols = [i for i in xrange(aln.get_alignment_length())
                 if not drop_column(set(seq[i] for seq in upper_seqs))]

    # FIXME add support for proper alignment output, not just
    # concatenated fasta
    LOG.info("Keeping the following columns: %s" % (
        ', '.join([str(x + 1) for x in keep_cols])))
    for rec, seq in zip(records, raw_seqs):
        fh_out.write(">%s\n" % rec.id)
        fh_out.write('%s\n' % ''.join([seq[i] for i in keep_cols]))
def comp_pairwise_ident_matrix(seqrecs):
    """Return a lower-triangular "matrix" of pairwise identities.

    The result is a list of rows, not a real matrix: row i has i + 1
    entries. For j < i, mx[i][j] is pairwise_identity() of the
    uppercased sequences i and j; the diagonal entry mx[i][i] is None
    because self comparison is not defined. The valid index range is
    therefore [i][j] with 0 <= j <= i < number of sequences.

    Args:
        seqrecs: iterable of records whose `.seq` can be converted with
            str().

    Returns:
        list of lists as described above (empty for empty input).
    """
    # Uppercase every sequence once up front instead of re-deriving both
    # strings for each of the O(n^2) pairs.
    upper_seqs = [str(rec.seq).upper() for rec in seqrecs]

    # Intentionally a list of rows rather than a numpy matrix, because
    # numpy doesn't know about symmetric arrays.
    mx = []
    for i, s1 in enumerate(upper_seqs):
        row = [pairwise_identity(s1, s2) for s2 in upper_seqs[:i]]
        row.append(None)  # self comparison not defined
        mx.append(row)
    return mx
def comp_pairwise_ident_matrix(seqrecs):
    """Return a lower-triangular "matrix" of pairwise identities.

    The result is a list of rows, not a real matrix: row i has i + 1
    entries. For j < i, mx[i][j] is pairwise_identity() of the
    uppercased sequences i and j; the diagonal entry mx[i][i] is None
    because self comparison is not defined. The valid index range is
    therefore [i][j] with 0 <= j <= i < number of sequences.

    Args:
        seqrecs: iterable of records whose `.seq` can be converted with
            str().

    Returns:
        list of lists as described above (empty for empty input).
    """
    # Uppercase every sequence once up front instead of re-deriving both
    # strings for each of the O(n^2) pairs.
    upper_seqs = [str(rec.seq).upper() for rec in seqrecs]

    # Intentionally a list of rows rather than a numpy matrix, because
    # numpy doesn't know about symmetric arrays.
    mx = []
    for i, s1 in enumerate(upper_seqs):
        row = [pairwise_identity(s1, s2) for s2 in upper_seqs[:i]]
        row.append(None)  # self comparison not defined
        mx.append(row)
    return mx
def pairwise_identity(s1, s2):
    """Return the fractional pairwise identity of two aligned strings.

    Identity is defined here as the number of columns in which both
    strings carry the same non-gap character (case sensitive; gaps as
    judged by bioutils.isgap), divided by the length of the shorter of
    the two ungapped sequences (as produced by bioutils.ungap).

    Uppercase your sequences beforehand for a case-insensitive
    comparison. For mixed RNA/DNA you might want to replace T's with
    U's or vice versa first.

    Based on ideas from
    http://code.activestate.com/recipes/499304-hamming-distance/

    Args:
        s1: first aligned sequence string.
        s2: second aligned sequence string, same length as `s1`.

    Returns:
        The identity as a float. If either sequence has no ungapped
        residues the ratio is undefined and 0.0 is returned instead of
        failing with a ZeroDivisionError.

    Raises:
        AssertionError: if the two strings differ in length.
    """
    # Raised explicitly (instead of using an assert statement) so the
    # check still runs under "python -O"; the exception type is unchanged.
    if len(s1) != len(s2):
        raise AssertionError(
            "aligned sequences differ in length (%d vs %d)"
            % (len(s1), len(s2)))

    min_ungapped_len = min(len(bioutils.ungap(s1)),
                           len(bioutils.ungap(s2)))
    if min_ungapped_len == 0:
        # Nothing to normalise by, e.g. an all-gap row in the alignment.
        return 0.0

    # Only columns where neither sequence has a gap can count as matches.
    idents = sum(c1 == c2 for c1, c2 in izip(s1, s2)
                 if not bioutils.isgap(c1) and not bioutils.isgap(c2))
    return idents / float(min_ungapped_len)