Example #1
0
    def test_aligned(self):
        """Check pairwise distances computed from pre-aligned sequences.

        Three configurations are exercised against ``self.expected``: raw
        difference counts, per-site distances with gaps counted, and
        per-site distances with gaps ignored. Every configuration must
        yield exactly three comparisons.
        """
        def check(per_site, ignore_gaps, num_sites):
            # ``num_sites`` is the divisor applied to the expected raw
            # count; None means the raw count itself is compared exactly.
            distance_iter = seqstats.pairwise_distance_iter(
                    seq_iter = self.seqs,
                    per_site = per_site,
                    aligned = True,
                    ignore_gaps = ignore_gaps)
            # Count explicitly instead of reading the loop index after the
            # loop: a leaked index is undefined when nothing is yielded and
            # would otherwise carry over from a previous loop, letting the
            # count assertion pass vacuously.
            num_comparisons = 0
            for seq1, seq2, d, drc in distance_iter:
                num_comparisons += 1
                raw = self.expected[seq1.id][seq2.id]
                if num_sites is None:
                    self.assertEqual(raw, d)
                else:
                    self.assertAlmostEqual(raw / num_sites, d)
            self.assertEqual(num_comparisons, 3)

        check(per_site = False, ignore_gaps = True, num_sites = None)
        # NOTE(review): 6.0 and 4.0 are presumably the number of alignment
        # columns with and without gapped sites in the fixture -- confirm
        # against the class setUp, which is not visible here.
        check(per_site = True, ignore_gaps = False, num_sites = 6.0)
        check(per_site = True, ignore_gaps = True, num_sites = 4.0)
Example #2
0
    def test_unaligned_muscle(self):
        """Check pairwise distances when sequences are aligned with muscle.

        The test body is skipped (with a logged warning) when no ``muscle``
        executable is found. Otherwise raw and per-site distances are
        compared against ``self.expected``, and each run must yield exactly
        three comparisons.
        """
        if not functions.which('muscle'):
            _LOG.warning('muscle not found... skipping tests.')
            return

        # Raw difference counts.
        distance_iter = seqstats.pairwise_distance_iter(
                seq_iter = self.seqs,
                per_site = False,
                aligned = False,
                ignore_gaps = True,
                aligner_tools = ['muscle'])
        # Use an explicit counter per run: a leaked loop index is undefined
        # for an empty iterator and would go stale across the two loops.
        raw_comparisons = 0
        for seq1, seq2, d, drc in distance_iter:
            raw_comparisons += 1
            self.assertEqual(
                    self.expected[seq1.id][seq2.id],
                    d)
        self.assertEqual(raw_comparisons, 3)

        # Per-site distances with gapped sites ignored.
        distance_iter = seqstats.pairwise_distance_iter(
                seq_iter = self.seqs,
                per_site = True,
                aligned = False,
                ignore_gaps = True,
                aligner_tools = ['muscle'])
        per_site_comparisons = 0
        for seq1, seq2, d, drc in distance_iter:
            per_site_comparisons += 1
            # NOTE(review): 4.0 is presumably the number of ungapped sites
            # in the fixture alignment -- confirm against setUp.
            self.assertAlmostEqual(
                    self.expected[seq1.id][seq2.id] / 4.0,
                    d)
        self.assertEqual(per_site_comparisons, 3)
Example #3
0
    def test_amino_acid_seqs(self):
        """Check raw pairwise distances for protein sequences.

        Uses three nine-residue records and a protein alphabet. For every
        comparison the fourth yielded value (``drc``) must be None and the
        distance must match the hand-computed difference count. Exactly
        three comparisons are expected.
        """
        seqs = [
                SeqRecord(Seq('MILV*XQP*'), id='1'),
                SeqRecord(Seq('MILV*XQQ*'), id='2'),
                SeqRecord(Seq('MILV*XPP*'), id='3'),
                ]
        # Number of differing positions between each pair of records.
        expected = {
                '1': {'2': 1, '3': 1},
                '2': {'1': 1, '3': 2},
                '3': {'1': 1, '2': 2},
                }

        distance_iter = seqstats.pairwise_distance_iter(
                seq_iter = seqs,
                alphabet = alphabets.ProteinAlphabet(),
                per_site = False,
                aligned = True,
                ignore_gaps = True)
        # Count explicitly so an empty iterator produces a normal assertion
        # failure rather than a NameError on an unassigned loop index.
        num_comparisons = 0
        for seq1, seq2, d, drc in distance_iter:
            num_comparisons += 1
            self.assertEqual(drc, None)
            self.assertEqual(
                    expected[seq1.id][seq2.id],
                    d)
        self.assertEqual(num_comparisons, 3)
Example #4
0
def summarize_distances(seq_iter,
                        sample_size=0,
                        per_site=True,
                        aligned=False,
                        ignore_gaps=True,
                        alphabet=None,
                        do_full_alignment=False,
                        full_alignment_out_path=None,
                        aligner_tools=['mafft', 'muscle'],
                        full_aligner_tools=None,
                        rng=None,
                        log_frequency=0):
    """Collect per-sequence summaries of pairwise distances.

    Args:
        seq_iter: Sequences to compare; passed to the ``seqstats`` distance
            iterators (after ``align.align`` when a full alignment is
            requested).
        sample_size: When greater than 0, distances come from
            ``seqstats.sample_distance_iter`` and only the first sequence
            of each comparison is summarized. Otherwise all pairs from
            ``seqstats.pairwise_distance_iter`` are used and both sequences
            of each comparison are summarized.
        per_site, ignore_gaps, alphabet: Forwarded unchanged to the
            distance iterator.
        aligned: Forwarded to the distance iterator; forced to True once a
            full alignment has been performed here.
        do_full_alignment: When true and ``aligned`` is false, align all
            sequences up front with ``align.align``.
        full_alignment_out_path: ``out_path`` given to ``align.align``.
        aligner_tools: Forwarded to the distance iterator; also used for
            the full alignment when ``full_aligner_tools`` is empty. The
            default list is never modified by this function.
        full_aligner_tools: Tools for the full alignment.
        rng: Forwarded to ``seqstats.sample_distance_iter`` only.
        log_frequency: When greater than 0, log progress every
            ``log_frequency`` comparisons.

    Returns:
        A ``(distances, rev_comp_errors)`` tuple. ``distances`` maps
        sequence IDs to ``stats.SampleSummarizer`` objects holding the
        distances recorded for that ID. ``rev_comp_errors`` lists
        ``(seq1.id, seq2.id, d, drc)`` for each comparison in which ``drc``
        was not None and smaller than ``d``.
    """
    if (not aligned) and do_full_alignment:
        if not full_aligner_tools:
            full_aligner_tools = aligner_tools
        seq_iter = align.align(seq_iter,
                               tools=full_aligner_tools,
                               out_path=full_alignment_out_path)
        aligned = True

    sampling = sample_size > 0
    if sampling:
        distance_iter = seqstats.sample_distance_iter(
            seq_iter=seq_iter,
            sample_size=sample_size,
            aligned=aligned,
            ignore_gaps=ignore_gaps,
            per_site=per_site,
            alphabet=alphabet,
            aligner_tools=aligner_tools,
            rng=rng)
    else:
        distance_iter = seqstats.pairwise_distance_iter(
            seq_iter=seq_iter,
            aligned=aligned,
            ignore_gaps=ignore_gaps,
            per_site=per_site,
            alphabet=alphabet,
            aligner_tools=aligner_tools)

    distances = {}
    rev_comp_errors = []

    def _record(seq_id, distance):
        # Start a summarizer on first sight of an ID, extend it afterwards.
        if seq_id in distances:
            distances[seq_id].add_sample(distance)
        else:
            distances[seq_id] = stats.SampleSummarizer(samples=[distance])

    for i, (seq1, seq2, d, drc) in enumerate(distance_iter):
        if (log_frequency > 0) and (((i + 1) % log_frequency) == 0):
            _LOG.info('{0}: Calulating distance for comparison {1}...'.format(
                datetime.datetime.now(), (i + 1)))
        # drc may be None (no second distance available); compare by
        # identity so only a genuine None skips the check.
        if (drc is not None) and (drc < d):
            _LOG.warning('reverse complement of {0} is more similar to '
                         '{1} ({2:.5f} vs {3:.5f})'.format(
                             seq1.id, seq2.id, drc, d))
            rev_comp_errors.append((seq1.id, seq2.id, d, drc))
        _record(seq1.id, d)
        if not sampling:
            # All-pairs mode: the distance counts toward both sequences.
            _record(seq2.id, d)
    return distances, rev_comp_errors
Example #5
0
def summarize_distances(seq_iter,
        sample_size = 0,
        per_site = True,
        aligned = False,
        ignore_gaps = True,
        alphabet = None,
        do_full_alignment = False,
        full_alignment_out_path = None,
        aligner_tools = ['mafft', 'muscle'],
        full_aligner_tools = None,
        rng = None,
        log_frequency = 0):
    """Summarize pairwise distances for each sequence ID.

    When ``aligned`` is false and ``do_full_alignment`` is true, the
    sequences are first aligned with ``align.align`` (using
    ``full_aligner_tools``, or ``aligner_tools`` when that is empty, and
    writing to ``full_alignment_out_path``) and treated as aligned from
    then on.

    With ``sample_size`` greater than 0, comparisons come from
    ``seqstats.sample_distance_iter`` (which also receives ``rng``) and
    each distance is recorded only under the first sequence's ID.
    Otherwise comparisons come from ``seqstats.pairwise_distance_iter`` and
    each distance is recorded under both IDs. ``per_site``,
    ``ignore_gaps``, ``alphabet`` and ``aligner_tools`` are forwarded to
    whichever iterator is used. The default ``aligner_tools`` list is never
    modified here.

    If ``log_frequency`` is greater than 0, a progress message is logged
    every ``log_frequency`` comparisons.

    Returns a tuple ``(distances, rev_comp_errors)``: a dict mapping
    sequence IDs to ``stats.SampleSummarizer`` objects, and a list of
    ``(seq1.id, seq2.id, d, drc)`` tuples for comparisons where ``drc`` was
    not None and less than ``d``.
    """
    if (not aligned) and do_full_alignment:
        if not full_aligner_tools:
            full_aligner_tools = aligner_tools
        seq_iter = align.align(seq_iter,
                tools = full_aligner_tools,
                out_path = full_alignment_out_path)
        aligned = True
    if sample_size > 0:
        distance_iter = seqstats.sample_distance_iter(
                seq_iter = seq_iter,
                sample_size = sample_size,
                aligned = aligned,
                ignore_gaps = ignore_gaps,
                per_site = per_site,
                alphabet = alphabet,
                aligner_tools = aligner_tools,
                rng = rng)
    else:
        distance_iter = seqstats.pairwise_distance_iter(
                seq_iter = seq_iter,
                aligned = aligned,
                ignore_gaps = ignore_gaps,
                per_site = per_site,
                alphabet = alphabet,
                aligner_tools = aligner_tools)
    distances = {}
    rev_comp_errors = []
    for i, (seq1, seq2, d, drc) in enumerate(distance_iter):
        if (log_frequency > 0) and (((i + 1) % log_frequency) == 0):
            _LOG.info('{0}: Calulating distance for comparison {1}...'.format(
                    datetime.datetime.now(),
                    (i + 1)))
        # Identity test: drc is None when no second distance was computed.
        if (drc is not None) and (drc < d):
            _LOG.warning('reverse complement of {0} is more similar to '
                    '{1} ({2:.5f} vs {3:.5f})'.format(seq1.id, seq2.id, drc, d))
            rev_comp_errors.append((seq1.id, seq2.id, d, drc))
        # Sampling mode attributes the distance to the first sequence only;
        # all-pairs mode attributes it to both members of the pair.
        if sample_size > 0:
            ids_to_update = (seq1.id,)
        else:
            ids_to_update = (seq1.id, seq2.id)
        for seq_id in ids_to_update:
            if seq_id in distances:
                distances[seq_id].add_sample(d)
            else:
                distances[seq_id] = stats.SampleSummarizer(samples = [d])
    return distances, rev_comp_errors