def test_squeeze_seq(self):
    """squeeze_seq reduces every homopolymer run to a single nucleotide."""
    # (input sequence, expected squeezed sequence)
    cases = (
        ("AAAGGGAAACCCGGGA", "AGACGA"),
        ("AAAATATTTAGGC", "ATATAGC"),
        # empty input stays empty
        ("", ""),
        # no adjacent repeats: sequence comes back unchanged
        ("ATGCATGCATGC", "ATGCATGCATGC"),
    )
    for raw, expected in cases:
        self.assertEqual(squeeze_seq(raw), expected)
def prefix_filter_flowgrams(flowgrams, squeeze=False):
    """Group flowgrams by common sequence prefixes.

    flowgrams: iterable source of flowgrams; each item must provide a
               ``Name`` attribute and a ``toSeq(truncate=True)`` method.
    squeeze: if True, each sequence is passed through ``squeeze_seq``
             (collapse all poly-X to X) before the prefix map is built.

    Returns a 3-tuple ``(n_keys, n_total, mapping)`` where ``mapping`` is
    the result of ``build_prefix_map``, ``n_keys`` is ``len(mapping)`` and
    ``n_total`` is ``n_keys`` plus the summed lengths of all values in
    ``mapping``.
    """
    def _name_and_seq(flowgram):
        # Build the (name, sequence) pair for a single flowgram.
        name = flowgram.Name
        seq = str(flowgram.toSeq(truncate=True))
        if squeeze:
            seq = squeeze_seq(seq)
        return (name, seq)

    # Stream the pairs through imap instead of materializing a list; the
    # result is equivalent to a list comprehension but more efficient.
    named_seqs = imap(_name_and_seq, flowgrams)

    # get prefix mappings
    prefix_map = build_prefix_map(named_seqs)

    n_keys = len(prefix_map)
    n_mapped = sum(len(members) for members in prefix_map.values())
    n_total = n_mapped + n_keys
    return (n_keys, n_total, prefix_map)