コード例 #1
0
ファイル: test_alignment.py プロジェクト: BANSHEE-/scikit-bio
    def test_to_phylip_no_positions(self):
        d1 = DNASequence('', id="d1")
        d2 = DNASequence('', id="d2")
        a = Alignment([d1, d2])

        with self.assertRaises(SequenceCollectionError):
            a.to_phylip()
コード例 #2
0
    def __call__(self,
                 seq_path,
                 result_path=None,
                 log_path=None,
                 failure_path=None):
        # load candidate sequences
        seq_file = open(seq_path, 'U')
        candidate_sequences = parse_fasta(seq_file)

        # load template sequences
        template_alignment = []
        template_alignment_fp = self.Params['template_filepath']
        for seq_id, seq in parse_fasta(open(template_alignment_fp)):
            # replace '.' characters with '-' characters
            template_alignment.append((seq_id, seq.replace('.', '-').upper()))
        template_alignment = Alignment.from_fasta_records(template_alignment,
                                                          DNASequence,
                                                          validate=True)

        # initialize_logger
        logger = NastLogger(log_path)

        # get function for pairwise alignment method
        pairwise_alignment_f = pairwise_alignment_methods[
            self.Params['pairwise_alignment_method']]

        pynast_aligned, pynast_failed = pynast_seqs(
            candidate_sequences,
            template_alignment,
            min_pct=self.Params['min_pct'],
            min_len=self.Params['min_len'],
            align_unaligned_seqs_f=pairwise_alignment_f,
            logger=logger,
            temp_dir=get_qiime_temp_dir())

        logger.record(str(self))

        for i, seq in enumerate(pynast_failed):
            skb_seq = DNASequence(str(seq), id=seq.Name)
            pynast_failed[i] = skb_seq
        pynast_failed = SequenceCollection(pynast_failed)

        for i, seq in enumerate(pynast_aligned):
            skb_seq = DNASequence(str(seq), id=seq.Name)
            pynast_aligned[i] = skb_seq
        pynast_aligned = Alignment(pynast_aligned)

        if failure_path is not None:
            fail_file = open(failure_path, 'w')
            fail_file.write(pynast_failed.to_fasta())
            fail_file.close()

        if result_path is not None:
            result_file = open(result_path, 'w')
            result_file.write(pynast_aligned.to_fasta())
            result_file.close()
            return None
        else:
            return pynast_aligned
コード例 #3
0
ファイル: test_alignment.py プロジェクト: BANSHEE-/scikit-bio
    def test_to_phylip_unequal_sequence_lengths(self):
        d1 = DNASequence('A-CT', id="d1")
        d2 = DNASequence('TTA', id="d2")
        d3 = DNASequence('.-AC', id="d3")
        a = Alignment([d1, d2, d3])

        with self.assertRaises(SequenceCollectionError):
            a.to_phylip()
コード例 #4
0
ファイル: align_seqs.py プロジェクト: Bonder-MJ/qiime
    def __call__(self, seq_path, result_path=None, log_path=None,
                 failure_path=None):
        # load candidate sequences
        seq_file = open(seq_path, 'U')
        candidate_sequences = parse_fasta(seq_file)

        # load template sequences
        template_alignment = []
        template_alignment_fp = self.Params['template_filepath']
        for seq_id, seq in parse_fasta(open(template_alignment_fp)):
            # replace '.' characters with '-' characters
            template_alignment.append((seq_id, seq.replace('.', '-').upper()))
        template_alignment = Alignment.from_fasta_records(
                    template_alignment, DNASequence, validate=True)

        # initialize_logger
        logger = NastLogger(log_path)

        # get function for pairwise alignment method
        pairwise_alignment_f = pairwise_alignment_methods[
            self.Params['pairwise_alignment_method']]

        pynast_aligned, pynast_failed = pynast_seqs(
            candidate_sequences,
            template_alignment,
            min_pct=self.Params['min_pct'],
            min_len=self.Params['min_len'],
            align_unaligned_seqs_f=pairwise_alignment_f,
            logger=logger,
            temp_dir=get_qiime_temp_dir())

        logger.record(str(self))

        for i, seq in enumerate(pynast_failed):
            skb_seq = DNASequence(str(seq), identifier=seq.Name)
            pynast_failed[i] = skb_seq
        pynast_failed = SequenceCollection(pynast_failed)

        for i, seq in enumerate(pynast_aligned):
            skb_seq = DNASequence(str(seq), identifier=seq.Name)
            pynast_aligned[i] = skb_seq
        pynast_aligned = Alignment(pynast_aligned)

        if failure_path is not None:
            fail_file = open(failure_path, 'w')
            fail_file.write(pynast_failed.to_fasta())
            fail_file.close()

        if result_path is not None:
            result_file = open(result_path, 'w')
            result_file.write(pynast_aligned.to_fasta())
            result_file.close()
            return None
        else:
            return pynast_aligned
コード例 #5
0
    def test_validate_lengths(self):
        """
        """
        self.assertTrue(self.a1._validate_lengths())
        self.assertTrue(self.a2._validate_lengths())
        self.assertTrue(self.empty._validate_lengths())

        self.assertTrue(
            Alignment([DNASequence('TTT', id="d1")])._validate_lengths())
        self.assertFalse(
            Alignment(
                [DNASequence('TTT', id="d1"),
                 DNASequence('TT', id="d2")])._validate_lengths())
コード例 #6
0
    def test_to_phylip(self):
        """to_phylip functions as expected
        """
        d1 = DNASequence('..ACC-GTTGG..', id="d1")
        d2 = DNASequence('TTACCGGT-GGCC', id="d2")
        d3 = DNASequence('.-ACC-GTTGC--', id="d3")
        a = Alignment([d1, d2, d3])

        phylip_str, id_map = a.to_phylip(map_labels=False)
        self.assertEqual(id_map, {'d1': 'd1', 'd3': 'd3', 'd2': 'd2'})
        expected = "\n".join([
            "3 13", "d1 ..ACC-GTTGG..", "d2 TTACCGGT-GGCC", "d3 .-ACC-GTTGC--"
        ])
        self.assertEqual(phylip_str, expected)
コード例 #7
0
    def test_to_phylip_map_labels(self):
        """to_phylip functions as expected with label mapping
        """
        d1 = DNASequence('..ACC-GTTGG..', id="d1")
        d2 = DNASequence('TTACCGGT-GGCC', id="d2")
        d3 = DNASequence('.-ACC-GTTGC--', id="d3")
        a = Alignment([d1, d2, d3])

        phylip_str, id_map = a.to_phylip(map_labels=True, label_prefix="s")
        self.assertEqual(id_map, {'s1': 'd1', 's3': 'd3', 's2': 'd2'})
        expected = "\n".join([
            "3 13", "s1 ..ACC-GTTGG..", "s2 TTACCGGT-GGCC", "s3 .-ACC-GTTGC--"
        ])
        self.assertEqual(phylip_str, expected)
コード例 #8
0
    def test_subalignment(self):
        """subalignment functions as expected
        """
        # keep seqs by ids
        actual = self.a1.subalignment(seqs_to_keep=['d1', 'd3'])
        expected = Alignment([self.d1, self.d3])
        self.assertEqual(actual, expected)

        # keep seqs by indices
        actual = self.a1.subalignment(seqs_to_keep=[0, 2])
        expected = Alignment([self.d1, self.d3])
        self.assertEqual(actual, expected)

        # keep seqs by ids (invert)
        actual = self.a1.subalignment(seqs_to_keep=['d1', 'd3'],
                                      invert_seqs_to_keep=True)
        expected = Alignment([self.d2])
        self.assertEqual(actual, expected)

        # keep seqs by indices (invert)
        actual = self.a1.subalignment(seqs_to_keep=[0, 2],
                                      invert_seqs_to_keep=True)
        expected = Alignment([self.d2])
        self.assertEqual(actual, expected)

        # keep positions
        actual = self.a1.subalignment(positions_to_keep=[0, 2, 3])
        d1 = DNASequence('.AC', id="d1")
        d2 = DNASequence('TAC', id="d2")
        d3 = DNASequence('.AC', id="d3")
        expected = Alignment([d1, d2, d3])
        self.assertEqual(actual, expected)

        # keep positions (invert)
        actual = self.a1.subalignment(positions_to_keep=[0, 2, 3],
                                      invert_positions_to_keep=True)
        d1 = DNASequence('.C-GTTGG..', id="d1")
        d2 = DNASequence('TCGGT-GGCC', id="d2")
        d3 = DNASequence('-C-GTTGC--', id="d3")
        expected = Alignment([d1, d2, d3])
        self.assertEqual(actual, expected)

        # keep seqs and positions
        actual = self.a1.subalignment(seqs_to_keep=[0, 2],
                                      positions_to_keep=[0, 2, 3])
        d1 = DNASequence('.AC', id="d1")
        d3 = DNASequence('.AC', id="d3")
        expected = Alignment([d1, d3])
        self.assertEqual(actual, expected)

        # keep seqs and positions (invert)
        actual = self.a1.subalignment(seqs_to_keep=[0, 2],
                                      positions_to_keep=[0, 2, 3],
                                      invert_seqs_to_keep=True,
                                      invert_positions_to_keep=True)
        d2 = DNASequence('TCGGT-GGCC', id="d2")
        expected = Alignment([d2])
        self.assertEqual(actual, expected)
コード例 #9
0
    def test_is_valid(self):
        """is_valid functions as expected
        """
        self.assertTrue(self.a1.is_valid())
        self.assertTrue(self.a2.is_valid())
        self.assertTrue(self.empty.is_valid())

        # invalid because of length mismatch
        d1 = DNASequence('..ACC-GTTGG..', id="d1")
        d2 = DNASequence('TTACCGGT-GGC', id="d2")
        self.assertFalse(Alignment([d1, d2]).is_valid())

        # invalid because of invalid charaters
        d1 = DNASequence('..ACC-GTXGG..', id="d1")
        d2 = DNASequence('TTACCGGT-GGCC', id="d2")
        self.assertFalse(Alignment([d1, d2]).is_valid())
コード例 #10
0
def generate_lane_mask(infile, entropy_threshold, existing_mask=None):
    """ Generates lane mask dynamically by calculating base frequencies

    infile: open file object for aligned fasta file
    entropy_threshold:  float value that designates the percentage of entropic
     positions to be removed, i.e., 0.10 means the 10% most entropic positions
     are removed.

    """
    aln = Alignment.from_fasta_records(parse_fasta(infile), DNA)
    uncertainty = aln.position_entropies(nan_on_non_standard_chars=False)

    uncertainty_sorted = sorted(uncertainty)

    cutoff_index = int(round((len(uncertainty_sorted) - 1) *
                             (1 - entropy_threshold)))

    max_uncertainty = uncertainty_sorted[cutoff_index]

    # This correction is for small datasets with a small possible number of
    # uncertainty values.
    highest_certainty = min(uncertainty_sorted)

    lane_mask = ""

    for base in uncertainty:
        if base >= max_uncertainty and base != highest_certainty:
            lane_mask += "0"
        else:
            lane_mask += "1"

    return lane_mask
コード例 #11
0
ファイル: make_phylogeny.py プロジェクト: Bonder-MJ/qiime
    def getResult(self, aln_path, *args, **kwargs):
        """Returns alignment from sequences.

        Currently does not allow parameter tuning of program and uses
        default parameters -- this is bad and should be fixed.

        #TODO: allow command-line access to important aln params.
        """
        module = self.Params['Module']
        # standard qiime says we just consider the first word as the unique ID
        # the rest of the defline of the fasta alignment often doesn't match
        # the otu names in the otu table
        with open(aln_path) as aln_f:
            seqs = Alignment.from_fasta_records(
                parse_fasta(aln_f, label_to_name=lambda x: x.split()[0]),
                DNA)
        # This ugly little line of code lets us pass a skbio Alignment when a
        # a cogent alignment is expected.
        seqs.getIntMap = seqs.int_map
        result = module.build_tree_from_alignment(seqs, moltype=DNA_cogent)

        try:
            root_method = kwargs['root_method']
            if root_method == 'midpoint':
                result = root_midpt(result)
            elif root_method == 'tree_method_default':
                pass
        except KeyError:
            pass
        return result
コード例 #12
0
ファイル: test_alignment.py プロジェクト: teravest/scikit-bio
    def test_majority_consensus(self):
        """majority_consensus functions as expected
        """
        d1 = DNASequence('TTT', identifier="d1")
        d2 = DNASequence('TT-', identifier="d2")
        d3 = DNASequence('TC-', identifier="d3")
        a1 = Alignment([d1, d2, d3])
        self.assertEqual(a1.majority_consensus(), DNASequence('TT-'))

        d1 = DNASequence('T', identifier="d1")
        d2 = DNASequence('A', identifier="d2")
        a1 = Alignment([d1, d2])
        self.assertTrue(a1.majority_consensus() in
                        [DNASequence('T'), DNASequence('A')])

        self.assertEqual(self.empty.majority_consensus(), '')
コード例 #13
0
def remove_outliers(seqs, num_stds, fraction_seqs_for_stats=.95):
    """ remove sequences very different from the majority consensus

    given aligned sequences, will:
     1. calculate a majority consensus (most common symbol at each position
        of the alignment);
     2. compute the mean/std edit distance of each seq to the consensus;
     3. discard sequences whose edit dist is greater than the cutoff, which is
        defined as being `num_stds` greater than the mean.

    """
    # load the alignment and compute the consensus sequence
    aln = Alignment.from_fasta_records(parse_fasta(seqs), DNA)
    consensus_seq = aln.majority_consensus()
    # compute the hamming distance between all sequences in the alignment
    # and the consensus sequence
    dists_to_consensus = [s.distance(consensus_seq) for s in aln]
    # compute the average and standard deviation distance from the consensus
    average_distance = mean(dists_to_consensus)
    std_distance = std(dists_to_consensus)
    # compute the distance cutoff
    dist_cutoff = average_distance + num_stds * std_distance
    # for all sequences, determine if they're distance to the consensus
    # is less then or equal to the cutoff distance. if so, add the sequence's
    # identifier to the list of sequence identifiers to keep
    seqs_to_keep = []
    for seq_id, dist_to_consensus in izip(aln.ids(), dists_to_consensus):
        if dist_to_consensus <= dist_cutoff:
            seqs_to_keep.append(seq_id)
    # filter the alignment to only keep the sequences identified in the step
    # above
    filtered_aln = aln.subalignment(seqs_to_keep=seqs_to_keep)
    # and return the filtered alignment
    return filtered_aln
コード例 #14
0
ファイル: make_phylogeny.py プロジェクト: wilkox/qiime
    def getResult(self, aln_path, *args, **kwargs):
        """Returns alignment from sequences.

        Currently does not allow parameter tuning of program and uses
        default parameters -- this is bad and should be fixed.

        #TODO: allow command-line access to important aln params.
        """
        module = self.Params['Module']
        # standard qiime says we just consider the first word as the unique ID
        # the rest of the defline of the fasta alignment often doesn't match
        # the otu names in the otu table
        with open(aln_path) as aln_f:
            seqs = Alignment.from_fasta_records(
                parse_fasta(aln_f, label_to_name=lambda x: x.split()[0]), DNA)
        # This ugly little line of code lets us pass a skbio Alignment when a
        # a cogent alignment is expected.
        seqs.getIntMap = seqs.int_map
        result = module.build_tree_from_alignment(seqs, moltype=DNA_cogent)

        try:
            root_method = kwargs['root_method']
            if root_method == 'midpoint':
                result = root_midpt(result)
            elif root_method == 'tree_method_default':
                pass
        except KeyError:
            pass
        return result
コード例 #15
0
def generate_lane_mask(infile, entropy_threshold, existing_mask=None):
    """ Generates lane mask dynamically by calculating base frequencies

    infile: open file object for aligned fasta file
    entropy_threshold:  float value that designates the percentage of entropic
     positions to be removed, i.e., 0.10 means the 10% most entropic positions
     are removed.

    """
    aln = Alignment.from_fasta_records(parse_fasta(infile), DNA)
    uncertainty = aln.position_entropies(nan_on_non_standard_chars=False)

    uncertainty_sorted = sorted(uncertainty)

    cutoff_index = int(
        round((len(uncertainty_sorted) - 1) * (1 - entropy_threshold)))

    max_uncertainty = uncertainty_sorted[cutoff_index]

    # This correction is for small datasets with a small possible number of
    # uncertainty values.
    highest_certainty = min(uncertainty_sorted)

    lane_mask = ""

    for base in uncertainty:
        if base >= max_uncertainty and base != highest_certainty:
            lane_mask += "0"
        else:
            lane_mask += "1"

    return lane_mask
コード例 #16
0
def remove_outliers(seqs, num_stds, fraction_seqs_for_stats=.95):
    """ remove sequences very different from the majority consensus

    given aligned sequences, will:
     1. calculate a majority consensus (most common symbol at each position
        of the alignment);
     2. compute the mean/std edit distance of each seq to the consensus;
     3. discard sequences whose edit dist is greater than the cutoff, which is
        defined as being `num_stds` greater than the mean.

    """
    # load the alignment and compute the consensus sequence
    aln = Alignment.from_fasta_records(parse_fasta(seqs), DNA)
    consensus_seq = aln.majority_consensus()
    # compute the hamming distance between all sequences in the alignment
    # and the consensus sequence
    dists_to_consensus = [s.distance(consensus_seq) for s in aln]
    # compute the average and standard deviation distance from the consensus
    average_distance = mean(dists_to_consensus)
    std_distance = std(dists_to_consensus)
    # compute the distance cutoff
    dist_cutoff = average_distance + num_stds * std_distance
    # for all sequences, determine if they're distance to the consensus
    # is less then or equal to the cutoff distance. if so, add the sequence's
    # identifier to the list of sequence identifiers to keep
    seqs_to_keep = []
    for seq_id, dist_to_consensus in izip(aln.ids(), dists_to_consensus):
        if dist_to_consensus <= dist_cutoff:
            seqs_to_keep.append(seq_id)
    # filter the alignment to only keep the sequences identified in the step
    # above
    filtered_aln = aln.subalignment(seqs_to_keep=seqs_to_keep)
    # and return the filtered alignment
    return filtered_aln
コード例 #17
0
 def setUp(self):
     """Setup for Fasta tests."""
     self.strings = ['AAAA', 'CCCC', 'gggg', 'uuuu']
     self.fasta_no_label = '>0\nAAAA\n>1\nCCCC\n>2\ngggg\n>3\nuuuu'
     self.fasta_with_label =\
         '>1st\nAAAA\n>2nd\nCCCC\n>3rd\nGGGG\n>4th\nUUUU'
     self.fasta_with_label_lw2 =\
         '>1st\nAA\nAA\n>2nd\nCC\nCC\n>3rd\nGG\nGG\n>4th\nUU\nUU'
     self.alignment_dict = {
         '1st': 'AAAA',
         '2nd': 'CCCC',
         '3rd': 'GGGG',
         '4th': 'UUUU'
     }
     self.sequence_objects_a = [
         DNASequence('ACTCGAGATC', 'seq1'),
         DNASequence('GGCCT', 'seq2')
     ]
     self.sequence_objects_b = [
         BiologicalSequence('ACTCGAGATC', 'seq1'),
         BiologicalSequence('GGCCT', 'seq2')
     ]
     seqs = [
         DNASequence("ACC--G-GGTA..", id="seq1"),
         DNASequence("TCC--G-GGCA..", id="seqs2")
     ]
     self.alignment = Alignment(seqs)
コード例 #18
0
ファイル: test_align_seqs.py プロジェクト: Honglongwu/qiime
    def setUp(self):
        fd, self.infernal_test1_input_fp = mkstemp(prefix="InfernalAlignerTests_", suffix=".fasta")
        close(fd)
        with open(self.infernal_test1_input_fp, "w") as in_f:
            in_f.write("\n".join(infernal_test1_input_fasta))

        fd, self.infernal_test1_template_fp = mkstemp(prefix="InfernalAlignerTests_", suffix="template.sto")
        close(fd)
        with open(self.infernal_test1_template_fp, "w") as in_f:
            in_f.write(infernal_test1_template_stockholm)

        # create temp file names (and touch them so we can reliably
        # clean them up)
        fd, self.result_fp = mkstemp(prefix="InfernalAlignerTests_", suffix=".fasta")
        close(fd)
        open(self.result_fp, "w").close()

        fd, self.log_fp = mkstemp(prefix="InfernalAlignerTests_", suffix=".log")
        close(fd)
        open(self.log_fp, "w").close()

        self._paths_to_clean_up = [
            self.infernal_test1_input_fp,
            self.result_fp,
            self.log_fp,
            self.infernal_test1_template_fp,
        ]

        self.infernal_test1_aligner = InfernalAligner({"template_filepath": self.infernal_test1_template_fp})
        self.infernal_test1_expected_aln = Alignment.from_fasta_records(
            parse_fasta(infernal_test1_expected_alignment), DNA
        )
コード例 #19
0
ファイル: test_alignment.py プロジェクト: teravest/scikit-bio
    def test_to_phylip(self):
        """to_phylip functions as expected
        """
        d1 = DNASequence('..ACC-GTTGG..', identifier="d1")
        d2 = DNASequence('TTACCGGT-GGCC', identifier="d2")
        d3 = DNASequence('.-ACC-GTTGC--', identifier="d3")
        a = Alignment([d1, d2, d3])

        phylip_str, id_map = a.to_phylip(map_labels=False)
        self.assertEqual(id_map, {'d1': 'd1',
                                  'd3': 'd3',
                                  'd2': 'd2'})
        expected = "\n".join(["3 13",
                              "d1 ..ACC-GTTGG..",
                              "d2 TTACCGGT-GGCC",
                              "d3 .-ACC-GTTGC--"])
        self.assertEqual(phylip_str, expected)
コード例 #20
0
ファイル: test_alignment.py プロジェクト: teravest/scikit-bio
    def test_to_phylip_map_labels(self):
        """to_phylip functions as expected with label mapping
        """
        d1 = DNASequence('..ACC-GTTGG..', identifier="d1")
        d2 = DNASequence('TTACCGGT-GGCC', identifier="d2")
        d3 = DNASequence('.-ACC-GTTGC--', identifier="d3")
        a = Alignment([d1, d2, d3])

        phylip_str, id_map = a.to_phylip(map_labels=True, label_prefix="s")
        self.assertEqual(id_map, {'s1': 'd1',
                                  's3': 'd3',
                                  's2': 'd2'})
        expected = "\n".join(["3 13",
                              "s1 ..ACC-GTTGG..",
                              "s2 TTACCGGT-GGCC",
                              "s3 .-ACC-GTTGC--"])
        self.assertEqual(phylip_str, expected)
コード例 #21
0
ファイル: test_alignment.py プロジェクト: teravest/scikit-bio
    def setUp(self):
        self.d1 = DNASequence('..ACC-GTTGG..', identifier="d1")
        self.d2 = DNASequence('TTACCGGT-GGCC', identifier="d2")
        self.d3 = DNASequence('.-ACC-GTTGC--', identifier="d3")

        self.r1 = RNASequence('UUAU-', identifier="r1")
        self.r2 = RNASequence('ACGUU', identifier="r2")

        self.seqs1 = [self.d1, self.d2, self.d3]
        self.seqs2 = [self.r1, self.r2]

        self.seqs1_t = [('d1', '..ACC-GTTGG..'), ('d2', 'TTACCGGT-GGCC'),
                        ('d3', '.-ACC-GTTGC--')]
        self.seqs2_t = [('r1', 'UUAU-'), ('r2', 'ACGUU')]

        self.a1 = Alignment(self.seqs1)
        self.a2 = Alignment(self.seqs2)
        self.empty = Alignment([])
コード例 #22
0
ファイル: test_align_seqs.py プロジェクト: Honglongwu/qiime
    def setUp(self):
        fd, self.pynast_test1_input_fp = mkstemp(prefix="PyNastAlignerTests_", suffix=".fasta")
        close(fd)
        with open(self.pynast_test1_input_fp, "w") as f:
            f.write(pynast_test1_input_fasta)

        fd, self.pynast_test1_template_fp = mkstemp(prefix="PyNastAlignerTests_", suffix="template.fasta")
        close(fd)
        with open(self.pynast_test1_template_fp, "w") as f:
            f.write(pynast_test1_template_fasta)

        fd, self.pynast_test_template_w_dots_fp = mkstemp(prefix="PyNastAlignerTests_", suffix="template.fasta")
        close(fd)
        with open(self.pynast_test_template_w_dots_fp, "w") as f:
            f.write(pynast_test1_template_fasta.replace("-", "."))

        fd, self.pynast_test_template_w_u_fp = mkstemp(prefix="PyNastAlignerTests_", suffix="template.fasta")
        close(fd)
        with open(self.pynast_test_template_w_u_fp, "w") as f:
            f.write(pynast_test1_template_fasta.replace("T", "U"))

        fd, self.pynast_test_template_w_lower_fp = mkstemp(prefix="PyNastAlignerTests_", suffix="template.fasta")
        close(fd)
        with open(self.pynast_test_template_w_lower_fp, "w") as f:
            f.write(pynast_test1_template_fasta.lower())

        # create temp file names (and touch them so we can reliably
        # clean them up)
        fd, self.result_fp = mkstemp(prefix="PyNastAlignerTests_", suffix=".fasta")
        close(fd)
        open(self.result_fp, "w").close()
        fd, self.failure_fp = mkstemp(prefix="PyNastAlignerTests_", suffix=".fasta")
        close(fd)
        open(self.failure_fp, "w").close()
        fd, self.log_fp = mkstemp(prefix="PyNastAlignerTests_", suffix=".log")
        close(fd)
        open(self.log_fp, "w").close()

        self._paths_to_clean_up = [
            self.pynast_test1_input_fp,
            self.result_fp,
            self.failure_fp,
            self.log_fp,
            self.pynast_test1_template_fp,
            self.pynast_test_template_w_dots_fp,
            self.pynast_test_template_w_u_fp,
            self.pynast_test_template_w_lower_fp,
        ]

        self.pynast_test1_aligner = PyNastAligner({"template_filepath": self.pynast_test1_template_fp, "min_len": 15})

        self.pynast_test1_expected_aln = Alignment.from_fasta_records(parse_fasta(pynast_test1_expected_alignment), DNA)
        self.pynast_test1_expected_fail = SequenceCollection.from_fasta_records(
            parse_fasta(pynast_test1_expected_failure), DNA
        )
コード例 #23
0
    def test_omit_gap_positions(self):
        """omitting gap positions functions as expected
        """
        expected = self.a2
        self.assertEqual(self.a2.omit_gap_positions(1.0), expected)
        self.assertEqual(self.a2.omit_gap_positions(0.51), expected)

        r1 = RNASequence('UUAU', id="r1")
        r2 = RNASequence('ACGU', id="r2")
        expected = Alignment([r1, r2])
        self.assertEqual(self.a2.omit_gap_positions(0.49), expected)

        r1 = RNASequence('UUAU', id="r1")
        r2 = RNASequence('ACGU', id="r2")
        expected = Alignment([r1, r2])
        self.assertEqual(self.a2.omit_gap_positions(0.0), expected)

        self.assertEqual(self.empty.omit_gap_positions(0.0), self.empty)
        self.assertEqual(self.empty.omit_gap_positions(0.49), self.empty)
        self.assertEqual(self.empty.omit_gap_positions(1.0), self.empty)
コード例 #24
0
ファイル: test_align_seqs.py プロジェクト: wilkox/qiime
    def test_call_pynast_test1_alt_min_pct(self):
        """PyNastAligner: returns no result when min_pct too high
        """
        aligner = PyNastAligner({
            'template_filepath': self.pynast_test1_template_fp,
            'min_len': 15,
            'min_pct': 100.0})

        actual_aln = aligner(self.pynast_test1_input_fp)
        expected_aln = Alignment([])

        self.assertEqual(actual_aln, expected_aln)
コード例 #25
0
    def test_omit_gap_sequences(self):
        """omitting gap sequences functions as expected
        """
        expected = self.a2
        self.assertEqual(self.a2.omit_gap_sequences(1.0), expected)
        self.assertEqual(self.a2.omit_gap_sequences(0.20), expected)

        expected = Alignment([self.r2])
        self.assertEqual(self.a2.omit_gap_sequences(0.19), expected)

        self.assertEqual(self.empty.omit_gap_sequences(0.0), self.empty)
        self.assertEqual(self.empty.omit_gap_sequences(0.2), self.empty)
        self.assertEqual(self.empty.omit_gap_sequences(1.0), self.empty)
コード例 #26
0
ファイル: test_alignment.py プロジェクト: BANSHEE-/scikit-bio
    def setUp(self):
        self.d1 = DNASequence('..ACC-GTTGG..', id="d1")
        self.d2 = DNASequence('TTACCGGT-GGCC', id="d2")
        self.d3 = DNASequence('.-ACC-GTTGC--', id="d3")

        self.r1 = RNASequence('UUAU-', id="r1")
        self.r2 = RNASequence('ACGUU', id="r2")

        self.seqs1 = [self.d1, self.d2, self.d3]
        self.seqs2 = [self.r1, self.r2]

        self.seqs1_t = [('d1', '..ACC-GTTGG..'), ('d2', 'TTACCGGT-GGCC'),
                        ('d3', '.-ACC-GTTGC--')]
        self.seqs2_t = [('r1', 'UUAU-'), ('r2', 'ACGUU')]

        self.a1 = Alignment(self.seqs1)
        self.a2 = Alignment(self.seqs2)
        self.a3 = Alignment(self.seqs2, score=42.0,
                            start_end_positions=[(0, 3), (5, 9)])
        self.a4 = Alignment(self.seqs2, score=-42.0,
                            start_end_positions=[(1, 4), (6, 10)])
        self.empty = Alignment([])
コード例 #27
0
    def test_fasta_from_alignment_from_alignment(self):
        """should return correct fasta string for alignment object"""
        # alignment with a few sequences
        obs = fasta_from_alignment(self.alignment)
        self.assertEquals('>seq1\nACC--G-GGTA..\n>seqs2\nTCC--G-GGCA..', obs)

        # empty alginment
        obs = fasta_from_alignment(Alignment([]))
        self.assertEquals('', obs)

        # alignment with a few sequences
        obs = fasta_from_alignment(self.alignment, sort=False)
        self.assertEquals('>seq1\nACC--G-GGTA..\n>seqs2\nTCC--G-GGCA..', obs)
コード例 #28
0
ファイル: test_align_seqs.py プロジェクト: Honglongwu/qiime
    def test_call_infernal_test1_file_output(self):
        """InfernalAligner writes correct output files for infernal_test1 seqs
        """
        # do not collect results; check output files instead
        actual = self.infernal_test1_aligner(
            self.infernal_test1_input_fp, result_path=self.result_fp, log_path=self.log_fp
        )

        self.assertTrue(actual is None, "Result should be None when result path provided.")

        expected_aln = self.infernal_test1_expected_aln
        with open(self.result_fp) as result_f:
            actual_aln = Alignment.from_fasta_records(parse_fasta(result_f), DNA)
        self.assertEqual(actual_aln, expected_aln)
コード例 #29
0
ファイル: test_align_seqs.py プロジェクト: wilkox/qiime
    def test_call_infernal_test1_file_output(self):
        """InfernalAligner writes correct output files for infernal_test1 seqs
        """
        # do not collect results; check output files instead
        actual = self.infernal_test1_aligner(
            self.infernal_test1_input_fp, result_path=self.result_fp,
            log_path=self.log_fp)

        self.assertTrue(actual is None,
                        "Result should be None when result path provided.")

        expected_aln = self.infernal_test1_expected_aln
        with open(self.result_fp) as result_f:
            actual_aln = Alignment.from_fasta_records(parse_fasta(
                    result_f), DNA)
        self.assertEqual(actual_aln, expected_aln)
コード例 #30
0
    def test_ne(self):
        """inequality operator functions as expected
        """
        self.assertFalse(self.s1 != self.s1)
        self.assertTrue(self.s1 != self.s2)

        # SequenceCollections with different number of sequences are not equal
        self.assertTrue(self.s1 != SequenceCollection([self.d1]))

        class FakeSequenceCollection(SequenceCollection):
            pass

        # SequenceCollections of different types are not equal
        self.assertTrue(self.s1 != FakeSequenceCollection([self.d1, self.d2]))
        self.assertTrue(self.s1 != Alignment([self.d1, self.d2]))

        # SequenceCollections with different sequences are not equal
        self.assertTrue(self.s1 != SequenceCollection([self.d1, self.r1]))
コード例 #31
0
ファイル: test_align_seqs.py プロジェクト: Honglongwu/qiime
    def test_call_pynast_test1_file_output(self):
        """PyNastAligner writes correct output files for pynast_test1 seqs
        """
        # do not collect results; check output files instead
        actual = self.pynast_test1_aligner(
            self.pynast_test1_input_fp, result_path=self.result_fp, log_path=self.log_fp, failure_path=self.failure_fp
        )

        self.assertTrue(actual is None, "Result should be None when result path provided.")

        expected_aln = self.pynast_test1_expected_aln
        with open(self.result_fp) as result_f:
            actual_aln = Alignment.from_fasta_records(parse_fasta(result_f), DNA)
        self.assertEqual(actual_aln, expected_aln)

        with open(self.failure_fp) as failure_f:
            actual_fail = SequenceCollection.from_fasta_records(parse_fasta(failure_f), DNA)
        self.assertEqual(actual_fail.to_fasta(), self.pynast_test1_expected_fail.to_fasta())
コード例 #32
0
    def test_majority_consensus(self):
        """majority_consensus functions as expected
        """
        d1 = DNASequence('TTT', id="d1")
        d2 = DNASequence('TT-', id="d2")
        d3 = DNASequence('TC-', id="d3")
        a1 = Alignment([d1, d2, d3])
        self.assertEqual(a1.majority_consensus(), DNASequence('TT-'))

        d1 = DNASequence('T', id="d1")
        d2 = DNASequence('A', id="d2")
        a1 = Alignment([d1, d2])
        self.assertTrue(a1.majority_consensus() in
                        [DNASequence('T'), DNASequence('A')])

        self.assertEqual(self.empty.majority_consensus(), '')
コード例 #33
0
ファイル: test_align_seqs.py プロジェクト: wilkox/qiime
    def test_call_pynast_test1_file_output(self):
        """PyNastAligner writes correct output files for pynast_test1 seqs
        """
        # do not collect results; check output files instead
        actual = self.pynast_test1_aligner(
            self.pynast_test1_input_fp, result_path=self.result_fp,
            log_path=self.log_fp, failure_path=self.failure_fp)

        self.assertTrue(actual is None,
                        "Result should be None when result path provided.")

        expected_aln = self.pynast_test1_expected_aln
        with open(self.result_fp) as result_f:
            actual_aln = Alignment.from_fasta_records(parse_fasta(
                    result_f), DNA)
        self.assertEqual(actual_aln, expected_aln)

        with open(self.failure_fp) as failure_f:
            actual_fail = SequenceCollection.from_fasta_records(
                    parse_fasta(failure_f), DNA)
        self.assertEqual(actual_fail.to_fasta(),
                         self.pynast_test1_expected_fail.to_fasta())
コード例 #34
0
ファイル: test_align_seqs.py プロジェクト: wilkox/qiime
    def setUp(self):
        fd, self.infernal_test1_input_fp = mkstemp(
            prefix='InfernalAlignerTests_', suffix='.fasta')
        close(fd)
        with open(self.infernal_test1_input_fp, 'w') as in_f:
            in_f.write('\n'.join(infernal_test1_input_fasta))

        fd, self.infernal_test1_template_fp = mkstemp(
            prefix='InfernalAlignerTests_', suffix='template.sto')
        close(fd)
        with open(self.infernal_test1_template_fp, 'w') as in_f:
            in_f.write(infernal_test1_template_stockholm)

        # create temp file names (and touch them so we can reliably
        # clean them up)
        fd, self.result_fp = mkstemp(
            prefix='InfernalAlignerTests_', suffix='.fasta')
        close(fd)
        open(self.result_fp, 'w').close()

        fd, self.log_fp = mkstemp(
            prefix='InfernalAlignerTests_', suffix='.log')
        close(fd)
        open(self.log_fp, 'w').close()

        self._paths_to_clean_up = [
            self.infernal_test1_input_fp,
            self.result_fp,
            self.log_fp,
            self.infernal_test1_template_fp,
        ]

        self.infernal_test1_aligner = InfernalAligner({
            'template_filepath': self.infernal_test1_template_fp,
        })
        self.infernal_test1_expected_aln = Alignment.from_fasta_records(
                parse_fasta(infernal_test1_expected_alignment),
                DNA)
コード例 #35
0
    def setUp(self):
        self.d1 = DNASequence('..ACC-GTTGG..', id="d1")
        self.d2 = DNASequence('TTACCGGT-GGCC', id="d2")
        self.d3 = DNASequence('.-ACC-GTTGC--', id="d3")

        self.r1 = RNASequence('UUAU-', id="r1")
        self.r2 = RNASequence('ACGUU', id="r2")

        self.seqs1 = [self.d1, self.d2, self.d3]
        self.seqs2 = [self.r1, self.r2]

        self.seqs1_t = [('d1', '..ACC-GTTGG..'), ('d2', 'TTACCGGT-GGCC'),
                        ('d3', '.-ACC-GTTGC--')]
        self.seqs2_t = [('r1', 'UUAU-'), ('r2', 'ACGUU')]

        self.a1 = Alignment(self.seqs1)
        self.a2 = Alignment(self.seqs2)
        self.empty = Alignment([])
コード例 #36
0
    def test_init_validate(self):
        """initialization with validation functions as expected
        """
        Alignment(self.seqs1, validate=True)

        # invalid DNA character
        invalid_seqs1 = [
            self.d1, self.d2, self.d3,
            DNASequence('.-ACC-GTXGC--', id="i1")
        ]
        self.assertRaises(SequenceCollectionError,
                          Alignment,
                          invalid_seqs1,
                          validate=True)

        # invalid lengths (they're not all equal)
        invalid_seqs2 = [
            self.d1, self.d2, self.d3,
            DNASequence('.-ACC-GTGC--', id="i2")
        ]
        self.assertRaises(SequenceCollectionError,
                          Alignment,
                          invalid_seqs2,
                          validate=True)
コード例 #37
0
ファイル: test_align_seqs.py プロジェクト: wilkox/qiime
    def setUp(self):
        fd, self.pynast_test1_input_fp = mkstemp(
            prefix='PyNastAlignerTests_', suffix='.fasta')
        close(fd)
        with open(self.pynast_test1_input_fp, 'w') as f:
            f.write(pynast_test1_input_fasta)

        fd, self.pynast_test1_template_fp = mkstemp(
            prefix='PyNastAlignerTests_', suffix='template.fasta')
        close(fd)
        with open(self.pynast_test1_template_fp, 'w') as f:
            f.write(pynast_test1_template_fasta)

        fd, self.pynast_test_template_w_dots_fp = mkstemp(
            prefix='PyNastAlignerTests_', suffix='template.fasta')
        close(fd)
        with open(self.pynast_test_template_w_dots_fp, 'w') as f:
            f.write(pynast_test1_template_fasta.replace('-', '.'))

        fd, self.pynast_test_template_w_u_fp = mkstemp(
            prefix='PyNastAlignerTests_', suffix='template.fasta')
        close(fd)
        with open(self.pynast_test_template_w_u_fp, 'w') as f:
            f.write(pynast_test1_template_fasta.replace('T', 'U'))

        fd, self.pynast_test_template_w_lower_fp = mkstemp(
            prefix='PyNastAlignerTests_', suffix='template.fasta')
        close(fd)
        with open(self.pynast_test_template_w_lower_fp, 'w') as f:
            f.write(pynast_test1_template_fasta.lower())

        # create temp file names (and touch them so we can reliably
        # clean them up)
        fd, self.result_fp = mkstemp(
            prefix='PyNastAlignerTests_', suffix='.fasta')
        close(fd)
        open(self.result_fp, 'w').close()
        fd, self.failure_fp = mkstemp(
            prefix='PyNastAlignerTests_', suffix='.fasta')
        close(fd)
        open(self.failure_fp, 'w').close()
        fd, self.log_fp = mkstemp(
            prefix='PyNastAlignerTests_', suffix='.log')
        close(fd)
        open(self.log_fp, 'w').close()

        self._paths_to_clean_up = [
            self.pynast_test1_input_fp,
            self.result_fp,
            self.failure_fp,
            self.log_fp,
            self.pynast_test1_template_fp,
            self.pynast_test_template_w_dots_fp,
            self.pynast_test_template_w_u_fp,
            self.pynast_test_template_w_lower_fp
        ]

        self.pynast_test1_aligner = PyNastAligner({
            'template_filepath': self.pynast_test1_template_fp,
            'min_len': 15,
        })

        self.pynast_test1_expected_aln = Alignment.from_fasta_records(
                parse_fasta(pynast_test1_expected_alignment),
                    DNA)
        self.pynast_test1_expected_fail = SequenceCollection.from_fasta_records(
                parse_fasta(pynast_test1_expected_failure), DNA)
コード例 #38
0
class AlignmentTests(TestCase):
    def setUp(self):
        self.d1 = DNASequence('..ACC-GTTGG..', id="d1")
        self.d2 = DNASequence('TTACCGGT-GGCC', id="d2")
        self.d3 = DNASequence('.-ACC-GTTGC--', id="d3")

        self.r1 = RNASequence('UUAU-', id="r1")
        self.r2 = RNASequence('ACGUU', id="r2")

        self.seqs1 = [self.d1, self.d2, self.d3]
        self.seqs2 = [self.r1, self.r2]

        self.seqs1_t = [('d1', '..ACC-GTTGG..'), ('d2', 'TTACCGGT-GGCC'),
                        ('d3', '.-ACC-GTTGC--')]
        self.seqs2_t = [('r1', 'UUAU-'), ('r2', 'ACGUU')]

        self.a1 = Alignment(self.seqs1)
        self.a2 = Alignment(self.seqs2)
        self.empty = Alignment([])

    def test_degap(self):
        """degap functions as expected
        """
        expected = [(id_, seq.replace('.', '').replace('-', ''))
                    for id_, seq in self.seqs1_t]
        expected = SequenceCollection.from_fasta_records(expected, DNASequence)
        actual = self.a1.degap()
        self.assertEqual(actual, expected)

        expected = [(id_, seq.replace('.', '').replace('-', ''))
                    for id_, seq in self.seqs2_t]
        expected = SequenceCollection.from_fasta_records(expected, RNASequence)
        actual = self.a2.degap()
        self.assertEqual(actual, expected)

    def test_distances(self):
        """distances functions as expected
        """
        expected = [[0, 6. / 13, 4. / 13], [6. / 13, 0, 7. / 13],
                    [4. / 13, 7. / 13, 0]]
        expected = DistanceMatrix(expected, ['d1', 'd2', 'd3'])
        actual = self.a1.distances()
        self.assertEqual(actual, expected)

    def test_subalignment(self):
        """subalignment functions as expected
        """
        # keep seqs by ids
        actual = self.a1.subalignment(seqs_to_keep=['d1', 'd3'])
        expected = Alignment([self.d1, self.d3])
        self.assertEqual(actual, expected)

        # keep seqs by indices
        actual = self.a1.subalignment(seqs_to_keep=[0, 2])
        expected = Alignment([self.d1, self.d3])
        self.assertEqual(actual, expected)

        # keep seqs by ids (invert)
        actual = self.a1.subalignment(seqs_to_keep=['d1', 'd3'],
                                      invert_seqs_to_keep=True)
        expected = Alignment([self.d2])
        self.assertEqual(actual, expected)

        # keep seqs by indices (invert)
        actual = self.a1.subalignment(seqs_to_keep=[0, 2],
                                      invert_seqs_to_keep=True)
        expected = Alignment([self.d2])
        self.assertEqual(actual, expected)

        # keep positions
        actual = self.a1.subalignment(positions_to_keep=[0, 2, 3])
        d1 = DNASequence('.AC', id="d1")
        d2 = DNASequence('TAC', id="d2")
        d3 = DNASequence('.AC', id="d3")
        expected = Alignment([d1, d2, d3])
        self.assertEqual(actual, expected)

        # keep positions (invert)
        actual = self.a1.subalignment(positions_to_keep=[0, 2, 3],
                                      invert_positions_to_keep=True)
        d1 = DNASequence('.C-GTTGG..', id="d1")
        d2 = DNASequence('TCGGT-GGCC', id="d2")
        d3 = DNASequence('-C-GTTGC--', id="d3")
        expected = Alignment([d1, d2, d3])
        self.assertEqual(actual, expected)

        # keep seqs and positions
        actual = self.a1.subalignment(seqs_to_keep=[0, 2],
                                      positions_to_keep=[0, 2, 3])
        d1 = DNASequence('.AC', id="d1")
        d3 = DNASequence('.AC', id="d3")
        expected = Alignment([d1, d3])
        self.assertEqual(actual, expected)

        # keep seqs and positions (invert)
        actual = self.a1.subalignment(seqs_to_keep=[0, 2],
                                      positions_to_keep=[0, 2, 3],
                                      invert_seqs_to_keep=True,
                                      invert_positions_to_keep=True)
        d2 = DNASequence('TCGGT-GGCC', id="d2")
        expected = Alignment([d2])
        self.assertEqual(actual, expected)

    def test_init_validate(self):
        """initialization with validation functions as expected
        """
        Alignment(self.seqs1, validate=True)

        # invalid DNA character
        invalid_seqs1 = [
            self.d1, self.d2, self.d3,
            DNASequence('.-ACC-GTXGC--', id="i1")
        ]
        self.assertRaises(SequenceCollectionError,
                          Alignment,
                          invalid_seqs1,
                          validate=True)

        # invalid lengths (they're not all equal)
        invalid_seqs2 = [
            self.d1, self.d2, self.d3,
            DNASequence('.-ACC-GTGC--', id="i2")
        ]
        self.assertRaises(SequenceCollectionError,
                          Alignment,
                          invalid_seqs2,
                          validate=True)

    def test_is_valid(self):
        """is_valid functions as expected
        """
        self.assertTrue(self.a1.is_valid())
        self.assertTrue(self.a2.is_valid())
        self.assertTrue(self.empty.is_valid())

        # invalid because of length mismatch
        d1 = DNASequence('..ACC-GTTGG..', id="d1")
        d2 = DNASequence('TTACCGGT-GGC', id="d2")
        self.assertFalse(Alignment([d1, d2]).is_valid())

        # invalid because of invalid charaters
        d1 = DNASequence('..ACC-GTXGG..', id="d1")
        d2 = DNASequence('TTACCGGT-GGCC', id="d2")
        self.assertFalse(Alignment([d1, d2]).is_valid())

    def test_iter_positions(self):
        """iter_positions functions as expected
        """
        actual = list(self.a2.iter_positions())
        expected = [[RNASequence(j) for j in i]
                    for i in ['UA', 'UC', 'AG', 'UU', '-U']]
        self.seqs2_t = [('r1', 'UUAU-'), ('r2', 'ACGUU')]
        self.assertEqual(actual, expected)

        actual = list(self.a2.iter_positions(constructor=str))
        expected = [list('UA'), list('UC'), list('AG'), list('UU'), list('-U')]
        self.seqs2_t = [('r1', 'UUAU-'), ('r2', 'ACGUU')]
        self.assertEqual(actual, expected)

    def test_majority_consensus(self):
        """majority_consensus functions as expected
        """
        d1 = DNASequence('TTT', id="d1")
        d2 = DNASequence('TT-', id="d2")
        d3 = DNASequence('TC-', id="d3")
        a1 = Alignment([d1, d2, d3])
        self.assertEqual(a1.majority_consensus(), DNASequence('TT-'))

        d1 = DNASequence('T', id="d1")
        d2 = DNASequence('A', id="d2")
        a1 = Alignment([d1, d2])
        self.assertTrue(a1.majority_consensus() in
                        [DNASequence('T'), DNASequence('A')])

        self.assertEqual(self.empty.majority_consensus(), '')

    def test_omit_gap_positions(self):
        """omitting gap positions functions as expected
        """
        expected = self.a2
        self.assertEqual(self.a2.omit_gap_positions(1.0), expected)
        self.assertEqual(self.a2.omit_gap_positions(0.51), expected)

        r1 = RNASequence('UUAU', id="r1")
        r2 = RNASequence('ACGU', id="r2")
        expected = Alignment([r1, r2])
        self.assertEqual(self.a2.omit_gap_positions(0.49), expected)

        r1 = RNASequence('UUAU', id="r1")
        r2 = RNASequence('ACGU', id="r2")
        expected = Alignment([r1, r2])
        self.assertEqual(self.a2.omit_gap_positions(0.0), expected)

        self.assertEqual(self.empty.omit_gap_positions(0.0), self.empty)
        self.assertEqual(self.empty.omit_gap_positions(0.49), self.empty)
        self.assertEqual(self.empty.omit_gap_positions(1.0), self.empty)

    def test_omit_gap_sequences(self):
        """omitting gap sequences functions as expected
        """
        expected = self.a2
        self.assertEqual(self.a2.omit_gap_sequences(1.0), expected)
        self.assertEqual(self.a2.omit_gap_sequences(0.20), expected)

        expected = Alignment([self.r2])
        self.assertEqual(self.a2.omit_gap_sequences(0.19), expected)

        self.assertEqual(self.empty.omit_gap_sequences(0.0), self.empty)
        self.assertEqual(self.empty.omit_gap_sequences(0.2), self.empty)
        self.assertEqual(self.empty.omit_gap_sequences(1.0), self.empty)

    def test_position_counters(self):
        """position_counters functions as expected
        """
        expected = [
            Counter({
                'U': 1,
                'A': 1
            }),
            Counter({
                'U': 1,
                'C': 1
            }),
            Counter({
                'A': 1,
                'G': 1
            }),
            Counter({'U': 2}),
            Counter({
                '-': 1,
                'U': 1
            })
        ]
        self.assertEqual(self.a2.position_counters(), expected)

        self.assertEqual(self.empty.position_counters(), [])

    def test_position_frequencies(self):
        """computing position frequencies functions as expected
        """
        expected = [
            defaultdict(int, {
                'U': 0.5,
                'A': 0.5
            }),
            defaultdict(int, {
                'U': 0.5,
                'C': 0.5
            }),
            defaultdict(int, {
                'A': 0.5,
                'G': 0.5
            }),
            defaultdict(int, {'U': 1.0}),
            defaultdict(int, {
                '-': 0.5,
                'U': 0.5
            })
        ]
        self.assertEqual(self.a2.position_frequencies(), expected)

        self.assertEqual(self.empty.position_frequencies(), [])

    def test_position_entropies(self):
        """computing positional uncertainties functions as expected

        tested by calculating values as described in this post:
         http://stackoverflow.com/a/15476958/3424666
        """
        expected = [0.69314, 0.69314, 0.69314, 0.0, np.nan]
        np.testing.assert_almost_equal(self.a2.position_entropies(), expected,
                                       5)

        expected = [1.0, 1.0, 1.0, 0.0, np.nan]
        np.testing.assert_almost_equal(self.a2.position_entropies(base=2),
                                       expected, 5)

        np.testing.assert_almost_equal(self.empty.position_entropies(base=2),
                                       [])

    def test_k_word_frequencies(self):
        """k_word_frequencies functions as expected
        """
        expected = [
            defaultdict(int, {
                'U': 3 / 5,
                'A': 1 / 5,
                '-': 1 / 5
            }),
            defaultdict(int, {
                'A': 1 / 5,
                'C': 1 / 5,
                'G': 1 / 5,
                'U': 2 / 5
            })
        ]
        actual = self.a2.k_word_frequencies(k=1)
        for a, e in zip(actual, expected):
            self.assertEqual(sorted(a), sorted(e), 5)
            np.testing.assert_almost_equal(sorted(a.values()),
                                           sorted(e.values()), 5)

    def test_sequence_length(self):
        """sequence_length functions as expected
        """
        self.assertEqual(self.a1.sequence_length(), 13)
        self.assertEqual(self.a2.sequence_length(), 5)
        self.assertEqual(self.empty.sequence_length(), 0)

    def test_to_phylip(self):
        """to_phylip functions as expected
        """
        d1 = DNASequence('..ACC-GTTGG..', id="d1")
        d2 = DNASequence('TTACCGGT-GGCC', id="d2")
        d3 = DNASequence('.-ACC-GTTGC--', id="d3")
        a = Alignment([d1, d2, d3])

        phylip_str, id_map = a.to_phylip(map_labels=False)
        self.assertEqual(id_map, {'d1': 'd1', 'd3': 'd3', 'd2': 'd2'})
        expected = "\n".join([
            "3 13", "d1 ..ACC-GTTGG..", "d2 TTACCGGT-GGCC", "d3 .-ACC-GTTGC--"
        ])
        self.assertEqual(phylip_str, expected)

    def test_to_phylip_map_labels(self):
        """to_phylip functions as expected with label mapping
        """
        d1 = DNASequence('..ACC-GTTGG..', id="d1")
        d2 = DNASequence('TTACCGGT-GGCC', id="d2")
        d3 = DNASequence('.-ACC-GTTGC--', id="d3")
        a = Alignment([d1, d2, d3])

        phylip_str, id_map = a.to_phylip(map_labels=True, label_prefix="s")
        self.assertEqual(id_map, {'s1': 'd1', 's3': 'd3', 's2': 'd2'})
        expected = "\n".join([
            "3 13", "s1 ..ACC-GTTGG..", "s2 TTACCGGT-GGCC", "s3 .-ACC-GTTGC--"
        ])
        self.assertEqual(phylip_str, expected)

    def test_validate_lengths(self):
        """
        """
        self.assertTrue(self.a1._validate_lengths())
        self.assertTrue(self.a2._validate_lengths())
        self.assertTrue(self.empty._validate_lengths())

        self.assertTrue(
            Alignment([DNASequence('TTT', id="d1")])._validate_lengths())
        self.assertFalse(
            Alignment(
                [DNASequence('TTT', id="d1"),
                 DNASequence('TT', id="d2")])._validate_lengths())
コード例 #39
0
ファイル: test_alignment.py プロジェクト: teravest/scikit-bio
class AlignmentTests(TestCase):

    def setUp(self):
        self.d1 = DNASequence('..ACC-GTTGG..', identifier="d1")
        self.d2 = DNASequence('TTACCGGT-GGCC', identifier="d2")
        self.d3 = DNASequence('.-ACC-GTTGC--', identifier="d3")

        self.r1 = RNASequence('UUAU-', identifier="r1")
        self.r2 = RNASequence('ACGUU', identifier="r2")

        self.seqs1 = [self.d1, self.d2, self.d3]
        self.seqs2 = [self.r1, self.r2]

        self.seqs1_t = [('d1', '..ACC-GTTGG..'), ('d2', 'TTACCGGT-GGCC'),
                        ('d3', '.-ACC-GTTGC--')]
        self.seqs2_t = [('r1', 'UUAU-'), ('r2', 'ACGUU')]

        self.a1 = Alignment(self.seqs1)
        self.a2 = Alignment(self.seqs2)
        self.empty = Alignment([])

    def test_degap(self):
        """degap functions as expected
        """
        expected = [(id_, seq.replace('.', '').replace('-', ''))
                    for id_, seq in self.seqs1_t]
        expected = SequenceCollection.from_fasta_records(expected, DNASequence)
        actual = self.a1.degap()
        self.assertEqual(actual, expected)

        expected = [(id_, seq.replace('.', '').replace('-', ''))
                    for id_, seq in self.seqs2_t]
        expected = SequenceCollection.from_fasta_records(expected, RNASequence)
        actual = self.a2.degap()
        self.assertEqual(actual, expected)

    def test_distances(self):
        """distances functions as expected
        """
        expected = [[0, 6./13, 4./13],
                    [6./13, 0, 7./13],
                    [4./13, 7./13, 0]]
        expected = DistanceMatrix(expected, ['d1', 'd2', 'd3'])
        actual = self.a1.distances()
        self.assertEqual(actual, expected)

    def test_subalignment(self):
        """subalignment functions as expected
        """
        # keep seqs by identifiers
        actual = self.a1.subalignment(seqs_to_keep=['d1', 'd3'])
        expected = Alignment([self.d1, self.d3])
        self.assertEqual(actual, expected)

        # keep seqs by indices
        actual = self.a1.subalignment(seqs_to_keep=[0, 2])
        expected = Alignment([self.d1, self.d3])
        self.assertEqual(actual, expected)

        # keep seqs by identifiers (invert)
        actual = self.a1.subalignment(seqs_to_keep=['d1', 'd3'],
                                      invert_seqs_to_keep=True)
        expected = Alignment([self.d2])
        self.assertEqual(actual, expected)

        # keep seqs by indices (invert)
        actual = self.a1.subalignment(seqs_to_keep=[0, 2],
                                      invert_seqs_to_keep=True)
        expected = Alignment([self.d2])
        self.assertEqual(actual, expected)

        # keep positions
        actual = self.a1.subalignment(positions_to_keep=[0, 2, 3])
        d1 = DNASequence('.AC', identifier="d1")
        d2 = DNASequence('TAC', identifier="d2")
        d3 = DNASequence('.AC', identifier="d3")
        expected = Alignment([d1, d2, d3])
        self.assertEqual(actual, expected)

        # keep positions (invert)
        actual = self.a1.subalignment(positions_to_keep=[0, 2, 3],
                                      invert_positions_to_keep=True)
        d1 = DNASequence('.C-GTTGG..', identifier="d1")
        d2 = DNASequence('TCGGT-GGCC', identifier="d2")
        d3 = DNASequence('-C-GTTGC--', identifier="d3")
        expected = Alignment([d1, d2, d3])
        self.assertEqual(actual, expected)

        # keep seqs and positions
        actual = self.a1.subalignment(seqs_to_keep=[0, 2],
                                      positions_to_keep=[0, 2, 3])
        d1 = DNASequence('.AC', identifier="d1")
        d3 = DNASequence('.AC', identifier="d3")
        expected = Alignment([d1, d3])
        self.assertEqual(actual, expected)

        # keep seqs and positions (invert)
        actual = self.a1.subalignment(seqs_to_keep=[0, 2],
                                      positions_to_keep=[0, 2, 3],
                                      invert_seqs_to_keep=True,
                                      invert_positions_to_keep=True)
        d2 = DNASequence('TCGGT-GGCC', identifier="d2")
        expected = Alignment([d2])
        self.assertEqual(actual, expected)

    def test_init_validate(self):
        """initialization with validation functions as expected
        """
        Alignment(self.seqs1, validate=True)

        # invalid DNA character
        invalid_seqs1 = [self.d1, self.d2, self.d3,
                         DNASequence('.-ACC-GTXGC--', identifier="i1")]
        self.assertRaises(SequenceCollectionError, Alignment,
                          invalid_seqs1, validate=True)

        # invalid lengths (they're not all equal)
        invalid_seqs2 = [self.d1, self.d2, self.d3,
                         DNASequence('.-ACC-GTGC--', identifier="i2")]
        self.assertRaises(SequenceCollectionError, Alignment,
                          invalid_seqs2, validate=True)

    def test_is_valid(self):
        """is_valid functions as expected
        """
        self.assertTrue(self.a1.is_valid())
        self.assertTrue(self.a2.is_valid())
        self.assertTrue(self.empty.is_valid())

        # invalid because of length mismatch
        d1 = DNASequence('..ACC-GTTGG..', identifier="d1")
        d2 = DNASequence('TTACCGGT-GGC', identifier="d2")
        self.assertFalse(Alignment([d1, d2]).is_valid())

        # invalid because of invalid charaters
        d1 = DNASequence('..ACC-GTXGG..', identifier="d1")
        d2 = DNASequence('TTACCGGT-GGCC', identifier="d2")
        self.assertFalse(Alignment([d1, d2]).is_valid())

    def test_iter_positions(self):
        """iter_positions functions as expected
        """
        actual = list(self.a2.iter_positions())
        expected = [[RNASequence(j) for j in i] for i in
                    ['UA', 'UC', 'AG', 'UU', '-U']]
        self.seqs2_t = [('r1', 'UUAU-'), ('r2', 'ACGUU')]
        self.assertEqual(actual, expected)

        actual = list(self.a2.iter_positions(constructor=str))
        expected = [list('UA'),
                    list('UC'),
                    list('AG'),
                    list('UU'),
                    list('-U')]
        self.seqs2_t = [('r1', 'UUAU-'), ('r2', 'ACGUU')]
        self.assertEqual(actual, expected)

    def test_majority_consensus(self):
        """majority_consensus functions as expected
        """
        d1 = DNASequence('TTT', identifier="d1")
        d2 = DNASequence('TT-', identifier="d2")
        d3 = DNASequence('TC-', identifier="d3")
        a1 = Alignment([d1, d2, d3])
        self.assertEqual(a1.majority_consensus(), DNASequence('TT-'))

        d1 = DNASequence('T', identifier="d1")
        d2 = DNASequence('A', identifier="d2")
        a1 = Alignment([d1, d2])
        self.assertTrue(a1.majority_consensus() in
                        [DNASequence('T'), DNASequence('A')])

        self.assertEqual(self.empty.majority_consensus(), '')

    def test_omit_gap_positions(self):
        """omitting gap positions functions as expected
        """
        expected = self.a2
        self.assertEqual(self.a2.omit_gap_positions(1.0), expected)
        self.assertEqual(self.a2.omit_gap_positions(0.51), expected)

        r1 = RNASequence('UUAU', identifier="r1")
        r2 = RNASequence('ACGU', identifier="r2")
        expected = Alignment([r1, r2])
        self.assertEqual(self.a2.omit_gap_positions(0.49), expected)

        r1 = RNASequence('UUAU', identifier="r1")
        r2 = RNASequence('ACGU', identifier="r2")
        expected = Alignment([r1, r2])
        self.assertEqual(self.a2.omit_gap_positions(0.0), expected)

        self.assertEqual(self.empty.omit_gap_positions(0.0), self.empty)
        self.assertEqual(self.empty.omit_gap_positions(0.49), self.empty)
        self.assertEqual(self.empty.omit_gap_positions(1.0), self.empty)

    def test_omit_gap_sequences(self):
        """omitting gap sequences functions as expected
        """
        expected = self.a2
        self.assertEqual(self.a2.omit_gap_sequences(1.0), expected)
        self.assertEqual(self.a2.omit_gap_sequences(0.20), expected)

        expected = Alignment([self.r2])
        self.assertEqual(self.a2.omit_gap_sequences(0.19), expected)

        self.assertEqual(self.empty.omit_gap_sequences(0.0), self.empty)
        self.assertEqual(self.empty.omit_gap_sequences(0.2), self.empty)
        self.assertEqual(self.empty.omit_gap_sequences(1.0), self.empty)

    def test_position_counters(self):
        """position_counters functions as expected
        """
        expected = [Counter({'U': 1, 'A': 1}),
                    Counter({'U': 1, 'C': 1}),
                    Counter({'A': 1, 'G': 1}),
                    Counter({'U': 2}),
                    Counter({'-': 1, 'U': 1})]
        self.assertEqual(self.a2.position_counters(), expected)

        self.assertEqual(self.empty.position_counters(), [])

    def test_position_frequencies(self):
        """computing position frequencies functions as expected
        """
        expected = [defaultdict(int, {'U': 0.5, 'A': 0.5}),
                    defaultdict(int, {'U': 0.5, 'C': 0.5}),
                    defaultdict(int, {'A': 0.5, 'G': 0.5}),
                    defaultdict(int, {'U': 1.0}),
                    defaultdict(int, {'-': 0.5, 'U': 0.5})]
        self.assertEqual(self.a2.position_frequencies(), expected)

        self.assertEqual(self.empty.position_frequencies(), [])

    def test_position_entropies(self):
        """computing positional uncertainties functions as expected

        tested by calculating values as described in this post:
         http://stackoverflow.com/a/15476958/3424666
        """
        expected = [0.69314, 0.69314, 0.69314, 0.0, np.nan]
        np.testing.assert_almost_equal(self.a2.position_entropies(),
                                       expected, 5)

        expected = [1.0, 1.0, 1.0, 0.0, np.nan]
        np.testing.assert_almost_equal(self.a2.position_entropies(base=2),
                                       expected, 5)

        np.testing.assert_almost_equal(self.empty.position_entropies(base=2),
                                       [])

    def test_k_word_frequencies(self):
        """k_word_frequencies functions as expected
        """
        expected = [defaultdict(int, {'U': 3/5, 'A': 1/5, '-': 1/5}),
                    defaultdict(int, {'A': 1/5, 'C': 1/5, 'G': 1/5, 'U': 2/5})]
        actual = self.a2.k_word_frequencies(k=1)
        for a, e in zip(actual, expected):
            self.assertEqual(sorted(a), sorted(e), 5)
            np.testing.assert_almost_equal(sorted(a.values()),
                                           sorted(e.values()), 5)

    def test_sequence_length(self):
        """sequence_length functions as expected
        """
        self.assertEqual(self.a1.sequence_length(), 13)
        self.assertEqual(self.a2.sequence_length(), 5)
        self.assertEqual(self.empty.sequence_length(), 0)

    def test_to_phylip(self):
        """to_phylip functions as expected
        """
        d1 = DNASequence('..ACC-GTTGG..', identifier="d1")
        d2 = DNASequence('TTACCGGT-GGCC', identifier="d2")
        d3 = DNASequence('.-ACC-GTTGC--', identifier="d3")
        a = Alignment([d1, d2, d3])

        phylip_str, id_map = a.to_phylip(map_labels=False)
        self.assertEqual(id_map, {'d1': 'd1',
                                  'd3': 'd3',
                                  'd2': 'd2'})
        expected = "\n".join(["3 13",
                              "d1 ..ACC-GTTGG..",
                              "d2 TTACCGGT-GGCC",
                              "d3 .-ACC-GTTGC--"])
        self.assertEqual(phylip_str, expected)

    def test_to_phylip_map_labels(self):
        """to_phylip functions as expected with label mapping
        """
        d1 = DNASequence('..ACC-GTTGG..', identifier="d1")
        d2 = DNASequence('TTACCGGT-GGCC', identifier="d2")
        d3 = DNASequence('.-ACC-GTTGC--', identifier="d3")
        a = Alignment([d1, d2, d3])

        phylip_str, id_map = a.to_phylip(map_labels=True, label_prefix="s")
        self.assertEqual(id_map, {'s1': 'd1',
                                  's3': 'd3',
                                  's2': 'd2'})
        expected = "\n".join(["3 13",
                              "s1 ..ACC-GTTGG..",
                              "s2 TTACCGGT-GGCC",
                              "s3 .-ACC-GTTGC--"])
        self.assertEqual(phylip_str, expected)

    def test_validate_lengths(self):
        """
        """
        self.assertTrue(self.a1._validate_lengths())
        self.assertTrue(self.a2._validate_lengths())
        self.assertTrue(self.empty._validate_lengths())

        self.assertTrue(Alignment([
            DNASequence('TTT', identifier="d1")])._validate_lengths())
        self.assertFalse(Alignment([
            DNASequence('TTT', identifier="d1"),
            DNASequence('TT', identifier="d2")])._validate_lengths())
コード例 #40
0
    def __call__(self, seq_path, result_path=None, log_path=None,
                 failure_path=None, cmbuild_params=None, cmalign_params=None):

        log_params = []
        # load candidate sequences
        candidate_sequences = dict(parse_fasta(open(seq_path, 'U')))

        # load template sequences
        try:
            info, template_alignment, struct = list(MinimalRfamParser(open(
                self.Params['template_filepath'], 'U'),
                seq_constructor=ChangedSequence))[0]
        except RecordError:
            raise ValueError(
                "Template alignment must be in Stockholm format with corresponding secondary structure annotation when using InfernalAligner.")

        moltype = self.Params['moltype']

        # Need to make separate mapping for unaligned sequences
        unaligned = SequenceCollection.from_fasta_records(
            candidate_sequences.iteritems(), DNASequence)
        mapped_seqs, new_to_old_ids = unaligned.int_map(prefix='unaligned_')
        mapped_seq_tuples = [(k, str(v)) for k,v in mapped_seqs.iteritems()]

        # Turn on --gapthresh option in cmbuild to force alignment to full
        # model
        if cmbuild_params is None:
            cmbuild_params = {}
        cmbuild_params.update({'--gapthresh': 1.0})

        # record cmbuild parameters
        log_params.append('cmbuild parameters:')
        log_params.append(str(cmbuild_params))

        # Turn on --sub option in Infernal, since we know the unaligned sequences
        # are fragments.
        # Also turn on --gapthresh to use same gapthresh as was used to build
        # model
        if cmalign_params is None:
            cmalign_params = {}
        cmalign_params.update({'--sub': True, '--gapthresh': 1.0})

        # record cmalign parameters
        log_params.append('cmalign parameters:')
        log_params.append(str(cmalign_params))

        # Align sequences to alignment including alignment gaps.
        aligned, struct_string = cmalign_from_alignment(aln=template_alignment,
                                                        structure_string=struct,
                                                        seqs=mapped_seq_tuples,
                                                        moltype=moltype,
                                                        include_aln=True,
                                                        params=cmalign_params,
                                                        cmbuild_params=cmbuild_params)

        # Pull out original sequences from full alignment.
        infernal_aligned = []
        # Get a dict of the identifiers to sequences (note that this is a
        # cogent alignment object, hence the call to NamedSeqs)
        aligned_dict = aligned.NamedSeqs
        for n, o in new_to_old_ids.iteritems():
            aligned_seq = aligned_dict[n]
            infernal_aligned.append((o, aligned_seq))

        # Create an Alignment object from alignment dict
        infernal_aligned = Alignment.from_fasta_records(infernal_aligned, DNASequence)

        if log_path is not None:
            log_file = open(log_path, 'w')
            log_file.write('\n'.join(log_params))
            log_file.close()

        if result_path is not None:
            result_file = open(result_path, 'w')
            result_file.write(infernal_aligned.to_fasta())
            result_file.close()
            return None
        else:
            try:
                return infernal_aligned
            except ValueError:
                return {}