def test_aligned(self):
    """Check pairwise distances computed over already-aligned sequences.

    Three configurations are exercised against ``self.expected``:
    raw distances with gaps ignored, per-site distances with gaps
    counted (expected value divided by 6.0), and per-site distances
    with gaps ignored (expected value divided by 4.0). Every run must
    end with a last comparison index of 2, i.e. three comparisons.
    """
    # Raw (not per-site) distances, gaps ignored: exact match expected.
    comparisons = seqstats.pairwise_distance_iter(
            seq_iter=self.seqs,
            per_site=False,
            aligned=True,
            ignore_gaps=True)
    for index, (first, second, dist, _rc_dist) in enumerate(comparisons):
        wanted = self.expected[first.id][second.id]
        self.assertEqual(wanted, dist)
    self.assertEqual(index, 2)

    # Per-site distances with gaps counted.
    comparisons = seqstats.pairwise_distance_iter(
            seq_iter=self.seqs,
            per_site=True,
            aligned=True,
            ignore_gaps=False)
    for index, (first, second, dist, _rc_dist) in enumerate(comparisons):
        wanted = self.expected[first.id][second.id] / 6.0
        self.assertAlmostEqual(wanted, dist)
    self.assertEqual(index, 2)

    # Per-site distances with gaps ignored.
    comparisons = seqstats.pairwise_distance_iter(
            seq_iter=self.seqs,
            per_site=True,
            aligned=True,
            ignore_gaps=True)
    for index, (first, second, dist, _rc_dist) in enumerate(comparisons):
        wanted = self.expected[first.id][second.id] / 4.0
        self.assertAlmostEqual(wanted, dist)
    self.assertEqual(index, 2)
def test_unaligned_muscle(self):
    """Check pairwise distances for unaligned input aligned via muscle.

    The test is skipped when ``functions.which('muscle')`` finds no
    executable. Otherwise raw distances must equal ``self.expected``
    and per-site distances (gaps ignored) must equal the expected
    value divided by 4.0; each run must end with a last comparison
    index of 2, i.e. three comparisons.
    """
    if not functions.which('muscle'):
        _LOG.warning('muscle not found... skipping tests.')
        # Raise SkipTest so the runner reports a skip instead of counting
        # a test that never ran as a pass.
        self.skipTest('muscle not found')

    # Raw (not per-site) distances.
    distance_iter = seqstats.pairwise_distance_iter(
            seq_iter=self.seqs,
            per_site=False,
            aligned=False,
            ignore_gaps=True,
            aligner_tools=['muscle'])
    for i, (seq1, seq2, d, drc) in enumerate(distance_iter):
        self.assertEqual(
                self.expected[seq1.id][seq2.id],
                d)
    self.assertEqual(i, 2)

    # Per-site distances, gaps ignored.
    distance_iter = seqstats.pairwise_distance_iter(
            seq_iter=self.seqs,
            per_site=True,
            aligned=False,
            ignore_gaps=True,
            aligner_tools=['muscle'])
    for i, (seq1, seq2, d, drc) in enumerate(distance_iter):
        self.assertAlmostEqual(
                self.expected[seq1.id][seq2.id] / 4.0,
                d)
    self.assertEqual(i, 2)
def test_amino_acid_seqs(self):
    """Check raw pairwise distances between three protein sequences.

    Uses ``alphabets.ProteinAlphabet()`` on pre-aligned input. For every
    comparison the fourth yielded value must equal ``None`` and the
    distance must match the hand-written table; the last comparison
    index must be 2, i.e. three comparisons.
    """
    records = [
        SeqRecord(Seq('MILV*XQP*'), id='1'),
        SeqRecord(Seq('MILV*XQQ*'), id='2'),
        SeqRecord(Seq('MILV*XPP*'), id='3'),
    ]
    # Expected distance for each ordered pair of record ids.
    expected = {
        '1': {'2': 1, '3': 1},
        '2': {'1': 1, '3': 2},
        '3': {'1': 1, '2': 2},
    }
    comparisons = seqstats.pairwise_distance_iter(
            seq_iter=records,
            alphabet=alphabets.ProteinAlphabet(),
            per_site=False,
            aligned=True,
            ignore_gaps=True)
    for index, (first, second, dist, rc_dist) in enumerate(comparisons):
        self.assertEqual(rc_dist, None)
        wanted = expected[first.id][second.id]
        self.assertEqual(wanted, dist)
    self.assertEqual(index, 2)
def summarize_distances(seq_iter,
        sample_size=0,
        per_site=True,
        aligned=False,
        ignore_gaps=True,
        alphabet=None,
        do_full_alignment=False,
        full_alignment_out_path=None,
        aligner_tools=['mafft', 'muscle'],
        full_aligner_tools=None,
        rng=None,
        log_frequency=0):
    """Collect per-sequence summaries of pairwise distances.

    If the input is not flagged as aligned and `do_full_alignment` is
    true, `seq_iter` is first passed through ``align.align`` (using
    `full_aligner_tools`, or `aligner_tools` when the former is empty,
    and `full_alignment_out_path` as ``out_path``) and treated as aligned
    from then on.

    Distances come from ``seqstats.sample_distance_iter`` when
    `sample_size` is greater than zero, otherwise from
    ``seqstats.pairwise_distance_iter``. In the sampled case each distance
    is credited to the first sequence of the comparison only; in the
    pairwise case it is credited to both sequences.

    When `log_frequency` is greater than zero, a progress message is
    logged every `log_frequency` comparisons.

    Returns a tuple ``(distances, rev_comp_errors)``:

    - ``distances`` maps a sequence id to a ``stats.SampleSummarizer``
      holding the distances credited to that sequence.
    - ``rev_comp_errors`` is a list of ``(seq1.id, seq2.id, d, drc)``
      tuples for every comparison whose reverse-complement distance `drc`
      is not ``None`` and is smaller than `d`; each is also logged as a
      warning.
    """
    # NOTE(review): the `aligner_tools` default is a mutable list shared
    # across calls. It is only forwarded here, never mutated in this
    # function, and is kept as-is to preserve the interface.
    if (not aligned) and do_full_alignment:
        if not full_aligner_tools:
            full_aligner_tools = aligner_tools
        seq_iter = align.align(seq_iter,
                tools=full_aligner_tools,
                out_path=full_alignment_out_path)
        aligned = True

    sampling = sample_size > 0
    if sampling:
        distance_iter = seqstats.sample_distance_iter(
                seq_iter=seq_iter,
                sample_size=sample_size,
                aligned=aligned,
                ignore_gaps=ignore_gaps,
                per_site=per_site,
                alphabet=alphabet,
                aligner_tools=aligner_tools,
                rng=rng)
    else:
        distance_iter = seqstats.pairwise_distance_iter(
                seq_iter=seq_iter,
                aligned=aligned,
                ignore_gaps=ignore_gaps,
                per_site=per_site,
                alphabet=alphabet,
                aligner_tools=aligner_tools)

    distances = {}
    rev_comp_errors = []
    for i, (seq1, seq2, d, drc) in enumerate(distance_iter):
        if (log_frequency > 0) and (((i + 1) % log_frequency) == 0):
            _LOG.info('{0}: Calulating distance for comparison {1}...'.format(
                    datetime.datetime.now(),
                    (i + 1)))
        # Identity test: None is a singleton and must not go through `!=`.
        if (drc is not None) and (drc < d):
            _LOG.warning('reverse complement of {0} is more similar to '
                    '{1} ({2:.5f} vs {3:.5f})'.format(
                            seq1.id, seq2.id, drc, d))
            rev_comp_errors.append((seq1.id, seq2.id, d, drc))
        # Sampled comparisons credit only the first sequence; exhaustive
        # pairwise comparisons credit both, first sequence first.
        seq_ids = (seq1.id,) if sampling else (seq1.id, seq2.id)
        for seq_id in seq_ids:
            if seq_id in distances:
                distances[seq_id].add_sample(d)
            else:
                distances[seq_id] = stats.SampleSummarizer(samples=[d])
    return distances, rev_comp_errors
def summarize_distances(seq_iter,
        sample_size=0,
        per_site=True,
        aligned=False,
        ignore_gaps=True,
        alphabet=None,
        do_full_alignment=False,
        full_alignment_out_path=None,
        aligner_tools=['mafft', 'muscle'],
        full_aligner_tools=None,
        rng=None,
        log_frequency=0):
    """Collect per-sequence summaries of pairwise distances.

    If the input is not flagged as aligned and `do_full_alignment` is
    true, `seq_iter` is first passed through ``align.align`` (using
    `full_aligner_tools`, or `aligner_tools` when the former is empty,
    and `full_alignment_out_path` as ``out_path``) and treated as aligned
    from then on.

    Distances come from ``seqstats.sample_distance_iter`` when
    `sample_size` is greater than zero, otherwise from
    ``seqstats.pairwise_distance_iter``. In the sampled case each distance
    is credited to the first sequence of the comparison only; in the
    pairwise case it is credited to both sequences.

    When `log_frequency` is greater than zero, a progress message is
    logged every `log_frequency` comparisons.

    Returns a tuple ``(distances, rev_comp_errors)``:

    - ``distances`` maps a sequence id to a ``stats.SampleSummarizer``
      holding the distances credited to that sequence.
    - ``rev_comp_errors`` is a list of ``(seq1.id, seq2.id, d, drc)``
      tuples for every comparison whose reverse-complement distance `drc`
      is not ``None`` and is smaller than `d`; each is also logged as a
      warning.
    """
    # NOTE(review): the `aligner_tools` default is a mutable list shared
    # across calls. It is only forwarded here, never mutated in this
    # function, and is kept as-is to preserve the interface.
    if (not aligned) and do_full_alignment:
        if not full_aligner_tools:
            full_aligner_tools = aligner_tools
        seq_iter = align.align(seq_iter,
                tools=full_aligner_tools,
                out_path=full_alignment_out_path)
        aligned = True

    sampling = sample_size > 0
    if sampling:
        distance_iter = seqstats.sample_distance_iter(
                seq_iter=seq_iter,
                sample_size=sample_size,
                aligned=aligned,
                ignore_gaps=ignore_gaps,
                per_site=per_site,
                alphabet=alphabet,
                aligner_tools=aligner_tools,
                rng=rng)
    else:
        distance_iter = seqstats.pairwise_distance_iter(
                seq_iter=seq_iter,
                aligned=aligned,
                ignore_gaps=ignore_gaps,
                per_site=per_site,
                alphabet=alphabet,
                aligner_tools=aligner_tools)

    distances = {}
    rev_comp_errors = []
    for i, (seq1, seq2, d, drc) in enumerate(distance_iter):
        if (log_frequency > 0) and (((i + 1) % log_frequency) == 0):
            _LOG.info('{0}: Calulating distance for comparison {1}...'.format(
                    datetime.datetime.now(),
                    (i + 1)))
        # Identity test: None is a singleton and must not go through `!=`.
        if (drc is not None) and (drc < d):
            _LOG.warning('reverse complement of {0} is more similar to '
                    '{1} ({2:.5f} vs {3:.5f})'.format(
                            seq1.id, seq2.id, drc, d))
            rev_comp_errors.append((seq1.id, seq2.id, d, drc))
        # Sampled comparisons credit only the first sequence; exhaustive
        # pairwise comparisons credit both, first sequence first.
        seq_ids = (seq1.id,) if sampling else (seq1.id, seq2.id)
        for seq_id in seq_ids:
            if seq_id in distances:
                distances[seq_id].add_sample(d)
            else:
                distances[seq_id] = stats.SampleSummarizer(samples=[d])
    return distances, rev_comp_errors