Example #1
0
 def test_align_without_gaps(self):
     """Dotplot of two gap-free sequences exposes aligned coordinates."""
     seq_data = {"seq1": "ACGG", "seq2": "CGCA", "seq3": "CCG-"}
     alignment = ArrayAlignment(seq_data, moltype="dna")
     # seq1 and seq2 contain no gap characters; only seq3 does
     plot = alignment.dotplot("seq1", "seq2")
     self.assertNotEqual(plot._aligned_coords, None)
    def test_trim_stop_codons_info(self):
        """The info attribute must survive a call to trim_stop_codons."""
        # the unaligned collection is checked first, then the aligned container
        cases = (
            (
                SequenceCollection,
                {"seq1": "ACGTAA", "seq2": "ACGACG", "seq3": "ACGCGT"},
            ),
            (
                ArrayAlignment,
                {"seq1": "ACGTAA", "seq2": "ACGTGA", "seq3": "ACGTAA"},
            ),
        )
        for container, raw in cases:
            source = container(data=raw, moltype=DNA, info={"key": "value"})
            trimmed = source.trim_stop_codons()
            self.assertEqual(trimmed.info["key"], "value")
Example #3
0
    def test_recode_dense_alignment(self):
        """recode_dense_alignment recodes an alignment into a reduced alphabet."""

        def as_aln(first, second, third):
            # every expected alignment uses the same three sequence names
            return ArrayAlignment(data={"1": first, "2": second, "3": third})

        expected_c2 = as_aln("AKKAKAK", "AKK-KAK", "AAAAAA-")
        expected_h3 = as_aln("PRRPRPR", "PRR-RPR", "PPPPYY-")
        expected_aa = as_aln("AAAAAAA", "AAA-AAA", "AAAAAA-")

        # charge_2: selected by id, then by explicit definition
        by_id = recode_dense_alignment(self.aln, alphabet_id="charge_2")
        self.assertEqual(by_id, expected_c2)
        by_def = recode_dense_alignment(self.aln, alphabet_def=self.charge_2)
        self.assertEqual(by_def, expected_c2)

        # hydropathy_3: selected by id, then by explicit definition
        by_id = recode_dense_alignment(self.aln, alphabet_id="hydropathy_3")
        self.assertEqual(by_id, expected_h3)
        by_def = recode_dense_alignment(
            self.aln, alphabet_def=self.hydropathy_3
        )
        self.assertEqual(by_def, expected_h3)

        # an alphabet that collapses every residue onto "A"
        collapsed = recode_dense_alignment(self.aln, alphabet_def=self.all_to_a)
        self.assertEqual(collapsed, expected_aa)

        # characters without a mapping are left in their original state
        untouched = recode_dense_alignment(
            self.aln, alphabet_def=[("a", "b")]
        )
        self.assertEqual(untouched, self.aln)

        # non-alphabetic characters are remapped just like alphabetic ones
        dotted = recode_dense_alignment(self.aln, alphabet_def=[(".", "-")])
        expected = as_aln("CDDFBXZ", "CDD.BXZ", "AAAASS.")
        self.assertEqual(dotted, expected)
Example #4
0
 def test_reverse_complement_info(self):
     """The info attribute must survive reverse complementing."""
     gapped = {
         "seq1": "--ACGT--GT---",
         "seq2": "TTACGTA-GT---",
         "seq3": "--ACGTA-GCC--",
     }
     # the same gapped data is checked as an alignment, then as a collection
     for container in (ArrayAlignment, SequenceCollection):
         source = container(data=gapped, moltype=DNA, info={"key": "value"})
         flipped = source.rc()
         self.assertEqual(flipped.info["key"], "value")
Example #5
0
    def concat(self, data):
        """Join a series of alignments end-to-end into one ArrayAlignment.

        Parameters
        ----------
        data
            series of alignment instances

        Returns
        -------
        An ArrayAlignment in which each sequence is the per-alignment pieces
        for that name joined with ``self._join_seq``.
        """
        per_aln_names = [member.names for member in data]
        names = self._name_callback(per_aln_names)
        pieces = defaultdict(list)
        for aln in data:
            assert isinstance(aln, (ArrayAlignment, Alignment))
            if not self._intersect:
                # a name missing from this alignment yields a run of "?"
                # as long as the alignment
                seqs = defaultdict(lambda: "?" * len(aln))
                seqs.update(aln.to_dict())
            else:
                seqs = aln.take_seqs(names).to_dict()

            for name in names:
                pieces[name].append(seqs[name])

        joined = {}
        for name in names:
            joined[name] = self._join_seq.join(pieces[name])
        return ArrayAlignment(data=joined, moltype=self._moltype)
Example #6
0
    def test_ArrayAlignment_without_moltype(self):
        """MolType should be inferred from the sequences when not supplied."""
        alpha = RNA.alphabets.degen_gapped
        members = [
            ArraySequence("UCAG", alphabet=alpha, name="rna1"),
            ArraySequence("CCCR", alphabet=alpha, name="rna2"),
        ]
        da = ArrayAlignment(members)
        # FASTA-style output: one header line and one sequence line per record
        expected = ">rna1\nUCAG\n>rna2\nCCCR\n"
        self.assertEqual(str(da), expected)
Example #7
0
    def setUp(self):
        """Build named and unnamed RNA fixtures for both alignment classes."""
        gapped = RNA.alphabets.degen_gapped
        raw = ("UCAGGG", "YCU-RG", "CAA-NR")
        labels = ("rna1", "rna2", "rna3")

        # named sequences
        self.rna1, self.rna2, self.rna3 = (
            RnaSequence(seq, name=label) for seq, label in zip(raw, labels)
        )
        self.model1, self.model2, self.model3 = (
            ArraySequence(seq, name=label, alphabet=gapped)
            for seq, label in zip(raw, labels)
        )
        self.aln = Alignment([self.rna1, self.rna2, self.rna3], moltype=RNA)
        self.da = ArrayAlignment(
            [self.model1, self.model2, self.model3],
            moltype=RNA,
            alphabet=gapped,
        )

        # the same sequences, constructed without names
        self.nn_rna1, self.nn_rna2, self.nn_rna3 = (
            RnaSequence(seq) for seq in raw
        )
        self.nn_model1, self.nn_model2, self.nn_model3 = (
            ArraySequence(seq, alphabet=gapped) for seq in raw
        )
        self.nn_aln = Alignment(
            [self.nn_rna1, self.nn_rna2, self.nn_rna3], moltype=RNA
        )
        self.nn_da = ArrayAlignment(
            [self.nn_model1, self.nn_model2, self.nn_model3],
            moltype=RNA,
            alphabet=gapped,
        )
Example #8
0
 def test_aln_equality(self):
     """Check which ArrayAlignment instances compare equal to ``self.da``."""
     # When does something compare equal?
     self.assertEqual(self.da == self.da, True)
     # one sequence less
     other_da1 = ArrayAlignment(
         [self.model1, self.model2], moltype=RNA, alphabet=RNA.alphabets.degen_gapped
     )
     self.assertEqual(self.da == other_da1, False)
     # seqs in different order -- doesn't matter
     other_da2 = ArrayAlignment(
         [self.model1, self.model3, self.model2],
         moltype=RNA,
         alphabet=RNA.alphabets.degen_gapped,
     )
     self.assertEqual(self.da == other_da2, True)
     # seqs in different encoding -- doesn't matter, only looks at data
     other_da3 = ArrayAlignment([self.model1, self.model2, self.model3])
     # Should this compare False even though the data is exactly the same?
     # The moltype is different...
     self.assertEqual(self.da == other_da3, True)
     # use a unittest assertion rather than a bare ``assert`` so the check
     # still runs under ``python -O`` and is reported as a test failure
     self.assertTrue(
         alltrue(list(map(alltrue, self.da.array_seqs == other_da3.array_seqs)))
     )
Example #9
0
    def setUp(self):
        """Prepare the alphabets and alignments shared by the tests."""
        canonical = "ACDEFGHIKLMNPQRSTVWY"
        ambiguous = "BXZ"
        self.canonical_abbrevs = canonical
        self.ambiguous_abbrevs = ambiguous

        # a one-state alphabet: every canonical and ambiguous code maps to "A"
        self.all_to_a = [("A", canonical + ambiguous)]
        for alphabet_name in ("charge_2", "hydropathy_3", "orig"):
            setattr(self, alphabet_name, alphabets[alphabet_name])

        # identical data, built once directly and once via the factory
        self.aln = ArrayAlignment(
            data={"1": "CDDFBXZ", "2": "CDD-BXZ", "3": "AAAASS-"}
        )
        self.aln2 = make_aligned_seqs(
            data={"1": "CDDFBXZ", "2": "CDD-BXZ", "3": "AAAASS-"}
        )
Example #10
0
    def test_subset_positions_ArrayAlignment(self):
        """take_positions and get_sub_alignment select the same columns."""
        alpha = RNA.alphabets.degen_gapped
        # Indices of the gap and ambiguity characters are looked up from the
        # alphabet rather than hard-coded. The full data comes from
        # 'UCAGGG'
        # 'YCU-RG'
        # 'CAA-NR'
        G, N, R, Y = (alpha.index(char) for char in "-NRY")
        full_data = array(
            [
                [0, 1, 2, 3, 3, 3],
                [Y, 1, 0, G, R, 3],
                [1, 2, 2, G, N, R],
            ]
        )

        # the expected subset: columns 0, 1 and 5 of the full data
        subset_members = [
            ArraySequence(seq, name=label, alphabet=alpha)
            for seq, label in (("UCG", "rna1"), ("YCG", "rna2"), ("CAR", "rna3"))
        ]
        sub_da = ArrayAlignment(subset_members, moltype=RNA, alphabet=alpha)
        sub_data = array([[0, 1, 3], [Y, 1, 3], [1, 2, R]])

        # First check the raw arrays of the fixtures themselves
        self.assertEqual(self.da.array_seqs, full_data)
        self.assertEqual(self.da.array_positions, transpose(full_data))
        self.assertEqual(sub_da.array_seqs, sub_data)
        self.assertEqual(sub_da.array_positions, transpose(sub_data))

        obs_sub_da_TP = self.da.take_positions([0, 1, 5])
        obs_sub_da_SA = self.da.get_sub_alignment(pos=[0, 1, 5])

        # get_sub_alignment is checked first, then take_positions; both must
        # equal the expected subset, differ from the full alignment and carry
        # the expected arrays
        for observed in (obs_sub_da_SA, obs_sub_da_TP):
            self.assertEqual(observed, sub_da)
            self.assertNotEqual(observed, self.da)
            self.assertEqual(observed.array_seqs, sub_data)
            self.assertEqual(observed.array_positions, transpose(sub_data))
Example #11
0
    def test_subset_seqs_ArrayAlignment(self):
        """take_seqs by name matches get_sub_alignment by sequence index."""
        alpha = RNA.alphabets.degen_gapped
        members = [
            ArraySequence(seq, name=label, alphabet=alpha)
            for seq, label in (("UCG", "rna1"), ("YCG", "rna2"), ("CAR", "rna3"))
        ]
        # constructed as in the original test, but not compared below
        sub_da = ArrayAlignment(members, moltype=RNA, alphabet=alpha)

        # selecting "rna1" by name versus selecting index 0
        by_name = self.da.take_seqs(["rna1"])
        by_index = self.da.get_sub_alignment(seqs=[0])

        # both the objects and their string forms must agree
        self.assertEqual(by_name, by_index)
        self.assertEqual(str(by_name), str(by_index))
Example #12
0
    def align_to_named_seq(self, seqs):
        """Align every sequence to the reference named ``self._ref_name``.

        Each non-reference sequence is globally aligned to the reference,
        columns where the reference has a gap are dropped, and the pairwise
        results are merged via ``add_from_ref_aln``. The merged result is
        returned as an ArrayAlignment carrying the moltype and info of the
        (moltype-converted) input.
        """
        seqs = seqs.to_moltype(self._moltype)
        ref_seq = seqs.get_seq(self._ref_name)
        kwargs = self._kwargs.copy()

        def gap_in_ref(gap_char):
            # builds the column predicate: keep a column only when its first
            # element (the reference state) is not the gap
            gap_char = gap_char[0]

            def array_ref_gap(x):
                return x.flatten()[0] != gap_char

            def standard_ref_gap(x):
                return x[0] != gap_char

            dispatch = {"-": standard_ref_gap}
            return dispatch.get(gap_char, array_ref_gap)

        merged = None
        no_ref_gap = None
        for index in range(seqs.num_seqs):
            seq = seqs.seqs[index]
            if seq.name == self._ref_name:
                continue

            result = global_pairwise(ref_seq, seq, **kwargs)
            if no_ref_gap is None:
                # the predicate is built once, from the first pairwise result
                gap = result.moltype.alphabet.to_indices(seqs.moltype.gap)
                no_ref_gap = gap_in_ref(gap)

            # as we're going to be using a pairwise distance that excludes
            # gaps, positions with deletions in the reference are eliminated
            pairwise = result.filtered(no_ref_gap).to_type(array_align=False)
            if merged is None:
                merged = pairwise
            else:
                merged = merged.add_from_ref_aln(pairwise)

        return ArrayAlignment(
            merged.to_dict(), moltype=seqs.moltype, info=seqs.info
        )
Example #13
0
    def likely_ancestral_seqs(self, locus=None) -> ArrayAlignment:
        """Return the most probable reconstructed ancestral sequences.

        Parameters
        ----------
        locus
            a named locus

        Returns
        -------
        An ArrayAlignment with one sequence per edge, each built from the
        highest-probability state at every position.
        """
        reconstructed = self.reconstruct_ancestral_seqs(locus=locus)
        moltype = self.model.moltype
        named_seqs = []
        for edge, probs in reconstructed.items():
            # comparing (probability, state) pairs means ties on probability
            # are broken by the state itself
            best_states = [
                max((p, state) for state, p in row.items())[1] for row in probs
            ]
            named_seqs.append((edge, moltype.make_seq("".join(best_states))))
        return ArrayAlignment(data=named_seqs, moltype=moltype)
Example #14
0
def recode_dense_alignment(aln, alphabet_id=None, alphabet_def=None):
    """Return a new ArrayAlignment recoded in a reduced-state alphabet.

    aln: the ArrayAlignment object to be recoded
    alphabet_id: string identifying an alphabet in
        cogent3.util.recode_alignment.alphabets.
        (See cogent3.util.recode_alignment.alphabets.keys()
        for valid alphabet_ids.)
    alphabet_def: list of two-element tuples where the first element is
        the new alphabet character and the second element is an iterable
        containing the old alphabet chars which should be mapped to
        the new char.
        e.g., [('A','CVILFMWAGSTPYH'),('B','QNDERKBZ')]
        (See cogent3.util.recode_alignment.alphabets.values()
        for more examples.)

    Note: either alphabet_id OR alphabet_def must be passed. Either
        provide the alphabet, or have it looked up. If both are provided
        the alphabet_id is ignored.

    """
    alphabet = aln.alphabet

    # character -> index of that character in the alignment's alphabet
    byte_map = {char: index for index, char in enumerate(alphabet)}

    # old character -> new character
    alphabet_map = build_alphabet_map(
        alphabet_id=alphabet_id, alphabet_def=alphabet_def
    )

    # Start from the identity mapping so that characters without an entry
    # in alphabet_map keep their own index, then redirect the mapped ones.
    new_indices = list(range(len(alphabet)))
    for old, new in alphabet_map.items():
        new_indices[byte_map[old]] = byte_map[new]

    # Look up the replacement index for every element of the sequence array.
    recoded = take(new_indices, aln.array_seqs).transpose()
    return ArrayAlignment(recoded, aln.names[:], moltype=aln.moltype)
Example #15
0
    def concat(self, data):
        """returns an alignment

        Parameters
        ----------
        data
            series of alignment instances

        Returns
        -------
        An ArrayAlignment in which each sequence is the per-alignment pieces
        for that name joined with ``self._join_seq``.

        Raises
        ------
        ValueError
            if ``data`` is empty
        TypeError
            if a member of ``data`` is not an ArrayAlignment or Alignment
        """
        if len(data) == 0:
            raise ValueError("no data")

        names = []
        for aln in data:
            if not isinstance(aln, (ArrayAlignment, Alignment)):
                raise TypeError(f"{type(aln)} invalid for concat")
            names.append(aln.names)

        names = self._name_callback(names)
        collated = defaultdict(list)
        if self._moltype is None:
            # No moltype was configured: adopt the one of the last alignment
            # in data. This is stored on the instance, so it also applies to
            # later calls.
            self._moltype = aln.moltype

        for aln in data:
            if self._moltype and aln.moltype != self._moltype:
                # try converting; uses the same private attribute that the
                # rest of this method reads and writes
                aln = aln.to_moltype(self._moltype)

            if self._intersect:
                seqs = aln.take_seqs(names).to_dict()
            else:
                # a name missing from this alignment yields a run of "?"
                # as long as the alignment
                seqs = defaultdict(lambda: "?" * len(aln))
                seqs.update(aln.to_dict())

            for name in names:
                collated[name].append(seqs[name])

        combined = {n: self._join_seq.join(collated[n]) for n in names}
        aln = ArrayAlignment(data=combined, moltype=self._moltype)
        return aln
Example #16
0
    def simulate_alignment(
        self,
        sequence_length=None,
        random_series=None,
        exclude_internal=True,
        locus=None,
        seed=None,
        root_sequence=None,
    ):
        """
        Returns an alignment of simulated sequences with keys corresponding
        to names from the current attached alignment.

        Parameters
        ----------
        sequence_length
            the length of the alignment to be simulated,
            default is the length of the attached alignment.
        random_series
            a random number generator. When omitted, a ``random.Random``
            seeded with ``seed`` is created.
        exclude_internal
            if True, only sequences for tips are returned.
        locus
            passed to every parameter and psub lookup.
        seed
            seed for the generator created when ``random_series`` is None.
        root_sequence
            a sequence from which all others evolve. A str is first
            converted to a sequence of the model's moltype.

        """
        # ambiguity positions are only collected when the length is taken
        # from the attached data
        orig_ambig = {}
        if sequence_length is None:
            lht = self.get_param_value("lht", locus=locus)
            sequence_length = len(lht.index)
            leaves = self.get_param_value("leaf_likelihoods", locus=locus)
            orig_ambig = {
                seq_name: leaf.get_ambiguous_positions()
                for seq_name, leaf in leaves.items()
            }

        if random_series is None:
            random_series = random.Random()
            random_series.seed(seed)

        def psub_for(edge, bin):
            return self.get_psub_for_edge(edge, bin=bin, locus=locus)

        if len(self.bin_names) > 1:
            # several bins: draw a bin for every site from "bdist"
            hmm = self.get_param_value("bdist", locus=locus)
            site_bins = hmm.emit(sequence_length, random_series)
        else:
            site_bins = numpy.zeros([sequence_length], int)

        evolver = AlignmentEvolver(
            random_series,
            orig_ambig,
            exclude_internal,
            self.bin_names,
            site_bins,
            psub_for,
            self._motifs,
        )

        if root_sequence is None:
            # no root supplied: draw one from the root motif probabilities
            root_probs = self.get_param_value(
                "mprobs", locus=locus, edge="root"
            )
            root_probs = self._model.calc_word_probs(root_probs)
            motif_probs = dict(zip(self._motifs, root_probs))
            root_sequence = random_sequence(
                random_series, motif_probs, sequence_length
            )
        else:
            # we convert to a vector of motifs
            if isinstance(root_sequence, str):
                root_sequence = self._model.moltype.make_seq(root_sequence)
            motif_len = self._model.get_alphabet().get_motif_len()
            root_sequence = root_sequence.get_in_motif_size(motif_len)

        simulated_sequences = evolver(self._tree, root_sequence)

        return ArrayAlignment(
            data=simulated_sequences, moltype=self._model.moltype
        )
Example #17
0
class AllTests(TestCase):
    """Compare Alignment and ArrayAlignment behaviour on the same RNA data."""

    def setUp(self):
        """setUp method for all tests"""
        gapped = RNA.alphabets.degen_gapped

        # named sequences
        self.rna1 = RnaSequence("UCAGGG", name="rna1")
        self.rna2 = RnaSequence("YCU-RG", name="rna2")
        self.rna3 = RnaSequence("CAA-NR", name="rna3")
        self.model1 = ArraySequence("UCAGGG", name="rna1", alphabet=gapped)
        self.model2 = ArraySequence("YCU-RG", name="rna2", alphabet=gapped)
        self.model3 = ArraySequence("CAA-NR", name="rna3", alphabet=gapped)

        self.aln = Alignment([self.rna1, self.rna2, self.rna3], moltype=RNA)
        self.da = ArrayAlignment(
            [self.model1, self.model2, self.model3],
            moltype=RNA,
            alphabet=gapped,
        )

        # seqs no name
        self.nn_rna1 = RnaSequence("UCAGGG")
        self.nn_rna2 = RnaSequence("YCU-RG")
        self.nn_rna3 = RnaSequence("CAA-NR")

        self.nn_model1 = ArraySequence("UCAGGG", alphabet=gapped)
        self.nn_model2 = ArraySequence("YCU-RG", alphabet=gapped)
        self.nn_model3 = ArraySequence("CAA-NR", alphabet=gapped)

        self.nn_aln = Alignment(
            [self.nn_rna1, self.nn_rna2, self.nn_rna3], moltype=RNA
        )
        self.nn_da = ArrayAlignment(
            [self.nn_model1, self.nn_model2, self.nn_model3],
            moltype=RNA,
            alphabet=gapped,
        )

    def test_printing_named_seqs(self):
        """Printing named seqs should work the same on Aln and DenseAln"""
        # Note: the newline trailing each sequence is intentional, because
        # we want each FASTA-format record to be separated.
        exp_lines_general = [
            ">rna1", "UCAGGG", ">rna2", "YCU-RG", ">rna3", "CAA-NR"
        ]
        expected = "\n".join(exp_lines_general) + "\n"
        self.assertEqual(str(self.aln), expected)
        self.assertEqual(str(self.da), expected)

    def test_printing_unnamed_seqs(self):
        """Printing unnamed sequences should work the same on Aln and DenseAln
        """
        # unnamed sequences are expected to print as seq_0, seq_1, ...
        exp_lines_gen = [
            ">seq_0", "UCAGGG", ">seq_1", "YCU-RG", ">seq_2", "CAA-NR"
        ]
        expected = "\n".join(exp_lines_gen) + "\n"
        self.assertEqual(str(self.nn_aln), expected)
        self.assertEqual(str(self.nn_da), expected)

    def test_ArrayAlignment_without_moltype(self):
        """Expect MolType to be picked up from the sequences."""
        gapped = RNA.alphabets.degen_gapped
        m1 = ArraySequence("UCAG", alphabet=gapped, name="rna1")
        m2 = ArraySequence("CCCR", alphabet=gapped, name="rna2")
        da = ArrayAlignment([m1, m2])
        exp_lines = [">rna1", "UCAG", ">rna2", "CCCR"]
        self.assertEqual(str(da), "\n".join(exp_lines) + "\n")

    def test_names(self):
        """Both alignment classes report the same names."""
        # Should both alignments handle names the same way?
        self.assertEqual(self.aln.names, ["rna1", "rna2", "rna3"])
        self.assertEqual(self.da.names, ["rna1", "rna2", "rna3"])
        # On unnamed sequences the behavior is now the same.
        self.assertEqual(self.nn_aln.names, ["seq_0", "seq_1", "seq_2"])
        self.assertEqual(self.nn_da.names, ["seq_0", "seq_1", "seq_2"])

    def test_seqFreqs(self):
        """seqFreqs should work the same on Alignment and ArrayAlignment"""
        # 'UCAGGG'
        # 'YCU-RG'
        # 'CAA-NR'

        # expected character counts, keyed by sequence index
        expected_counts = {
            0: {"U": 1, "C": 1, "A": 1, "G": 3},
            1: {"Y": 1, "C": 1, "U": 1, "-": 1, "R": 1, "G": 1},
            2: {"C": 1, "A": 2, "-": 1, "N": 1, "R": 1},
        }
        got1 = self.da.counts_per_seq(allow_gap=True, include_ambiguity=True)
        got2 = self.aln.counts_per_seq(allow_gap=True, include_ambiguity=True)
        for pos, counts in expected_counts.items():
            for char, count in counts.items():
                self.assertEqual(got1[pos, char], count)
                self.assertEqual(got2[pos, char], count)

    def test_subset_positions_ArrayAlignment(self):
        """take_positions and get_sub_alignment select the same columns."""
        # because dict order volatile, need to grab the
        # the index for ambig characters from the object
        # The full data comes from these seqs
        # 'UCAGGG'
        # 'YCU-RG'
        # 'CAA-NR'
        gapped = RNA.alphabets.degen_gapped
        get_index = gapped.index
        G = get_index("-")
        N = get_index("N")
        R = get_index("R")
        Y = get_index("Y")
        full_data = array(
            [[0, 1, 2, 3, 3, 3], [Y, 1, 0, G, R, 3], [1, 2, 2, G, N, R]]
        )

        model1 = ArraySequence("UCG", name="rna1", alphabet=gapped)
        model2 = ArraySequence("YCG", name="rna2", alphabet=gapped)
        model3 = ArraySequence("CAR", name="rna3", alphabet=gapped)
        sub_da = ArrayAlignment(
            [model1, model2, model3], moltype=RNA, alphabet=gapped
        )

        sub_data = array([[0, 1, 3], [Y, 1, 3], [1, 2, R]])

        # First check some data
        self.assertEqual(self.da.array_seqs, full_data)
        self.assertEqual(self.da.array_positions, transpose(full_data))
        self.assertEqual(sub_da.array_seqs, sub_data)
        self.assertEqual(sub_da.array_positions, transpose(sub_data))

        obs_sub_da_TP = self.da.take_positions([0, 1, 5])
        obs_sub_da_SA = self.da.get_sub_alignment(pos=[0, 1, 5])

        # When using the get_sub_alignment method the data is right
        self.assertEqual(obs_sub_da_SA, sub_da)
        self.assertNotEqual(obs_sub_da_SA, self.da)
        self.assertEqual(obs_sub_da_SA.array_seqs, sub_data)
        self.assertEqual(obs_sub_da_SA.array_positions, transpose(sub_data))

        # For the take_positions method: Why does this work
        self.assertEqual(obs_sub_da_TP, sub_da)
        self.assertNotEqual(obs_sub_da_TP, self.da)
        # If the data doesn't match?
        self.assertEqual(obs_sub_da_TP.array_seqs, sub_data)
        self.assertEqual(obs_sub_da_TP.array_positions, transpose(sub_data))
        # Shouldn't the __eq__ method check the data at least?

    def test_subset_positions_Alignment(self):
        """Alignment.take_positions returns the selected columns."""
        rna1 = RnaSequence("UCG", name="rna1")
        rna2 = RnaSequence("YCG", name="rna2")
        rna3 = RnaSequence("CAR", name="rna3")

        sub_aln = Alignment([rna1, rna2, rna3], moltype=RNA)

        obs_sub_aln = self.aln.take_positions([0, 1, 5])
        self.assertEqual(obs_sub_aln, sub_aln)
        self.assertNotEqual(obs_sub_aln, self.aln)
        # string representations should be the same. This fails right
        # now, because sequence order is not maintained. See separate test.
        self.assertEqual(str(obs_sub_aln), str(sub_aln))

    def test_take_positions_sequence_order(self):
        """Alignment take_positions should maintain seq order"""
        # This works
        self.assertEqual(self.da.names, ["rna1", "rna2", "rna3"])
        sub_da = self.da.get_sub_alignment(pos=[0, 1, 5])
        self.assertEqual(sub_da.names, ["rna1", "rna2", "rna3"])
        # seq order not maintained in Alignment
        self.assertEqual(self.aln.names, ["rna1", "rna2", "rna3"])
        sub_aln = self.aln.take_positions([0, 1, 5])
        self.assertEqual(sub_aln.names, ["rna1", "rna2", "rna3"])

    def test_subset_seqs_Alignment(self):
        """Alignment.take_seqs returns the named sequences in the given order."""
        rna1 = RnaSequence("UCG", name="rna1")
        rna2 = RnaSequence("YCG", name="rna2")
        rna3 = RnaSequence("CAR", name="rna3")

        sub_aln = Alignment([rna2, rna3], moltype=RNA)
        aln = Alignment([rna1, rna2, rna3], moltype=RNA)
        obs_sub_aln = aln.take_seqs(["rna2", "rna3"])

        self.assertEqual(obs_sub_aln, sub_aln)
        self.assertEqual(str(obs_sub_aln), str(sub_aln))

        # Selected sequences should be in specified order?
        obs_sub_aln_1 = self.aln.take_seqs(["rna3", "rna2"])
        obs_sub_aln_2 = self.aln.take_seqs(["rna2", "rna3"])
        self.assertNotEqual(str(obs_sub_aln_1), str(obs_sub_aln_2))

    def test_subset_seqs_ArrayAlignment(self):
        """take_seqs by name matches get_sub_alignment by sequence index."""
        gapped = RNA.alphabets.degen_gapped
        model1 = ArraySequence("UCG", name="rna1", alphabet=gapped)
        model2 = ArraySequence("YCG", name="rna2", alphabet=gapped)
        model3 = ArraySequence("CAR", name="rna3", alphabet=gapped)
        # constructed as in the original test, but not compared below
        sub_da = ArrayAlignment(
            [model1, model2, model3], moltype=RNA, alphabet=gapped
        )

        # take_seqs by name should have the same effect as
        # get_sub_alignment by seq idx?
        obs_sub_da_TS = self.da.take_seqs(["rna1"])
        obs_sub_da_SA = self.da.get_sub_alignment(seqs=[0])

        # These two are now the same. Fixed mapping of key to char array.
        self.assertEqual(obs_sub_da_TS, obs_sub_da_SA)
        self.assertEqual(str(obs_sub_da_TS), str(obs_sub_da_SA))

    def test_aln_equality(self):
        """Check which ArrayAlignment instances compare equal to ``self.da``."""
        gapped = RNA.alphabets.degen_gapped
        # When does something compare equal?
        self.assertEqual(self.da == self.da, True)
        # one sequence less
        other_da1 = ArrayAlignment(
            [self.model1, self.model2], moltype=RNA, alphabet=gapped
        )
        self.assertEqual(self.da == other_da1, False)
        # seqs in different order -- doesn't matter
        other_da2 = ArrayAlignment(
            [self.model1, self.model3, self.model2],
            moltype=RNA,
            alphabet=gapped,
        )
        self.assertEqual(self.da == other_da2, True)
        # seqs in different encoding -- doesn't matter, only looks at data
        other_da3 = ArrayAlignment([self.model1, self.model2, self.model3])
        # Should this compare False even though the data is exactly the same?
        # The moltype is different...
        self.assertEqual(self.da == other_da3, True)
        # use a unittest assertion rather than a bare ``assert`` so the check
        # still runs under ``python -O`` and is reported as a test failure
        self.assertTrue(
            alltrue(
                list(map(alltrue, self.da.array_seqs == other_da3.array_seqs))
            )
        )

    def test_seq_equality(self):
        """Two ArraySequences built from the same arguments compare equal."""
        gapped = RNA.alphabets.degen_gapped
        model1 = ArraySequence("UCG", name="rna1", alphabet=gapped)
        model2 = ArraySequence("UCG", name="rna1", alphabet=gapped)
        # Shouldn't the above two sequences be equal?
        self.assertEqual(model1, model2)
        # string comparison is True
        self.assertEqual(str(model1), str(model2))

    def test_seq_ungapping(self):
        """degap removes gap characters from both sequence classes."""
        rna1 = RnaSequence("U-C-A-G-", name="rna1")
        model1 = ArraySequence(
            "U-C-A-G-", name="rna1", alphabet=RNA.alphabets.degen_gapped
        )

        self.assertEqual(rna1, "U-C-A-G-")
        self.assertEqual(rna1.degap(), "UCAG")

        # check is produces the right string from the beginning
        self.assertEqual(str(model1), "U-C-A-G-")
        self.assertEqual(model1._data, [0, 4, 1, 4, 2, 4, 3, 4])
        # ArraySequence should maybe have the same degap method as normal seq
        self.assertEqual(str(model1.degap()), "UCAG")

    def test_the_rest_of_ModelSequence(self):
        """The class ArraySequence has 14 methods, but only 2 unittests.
        You might want to add some tests there..."""
        # note: mostly these are tested in derived classes, for convenience.
Example #18
0
def load_from_fasta(filename):
    """Load the FASTA file at ``filename`` as a DNA ArrayAlignment.

    Parameters
    ----------
    filename
        path handed to ``open_`` and opened in text mode

    Returns
    -------
    An ArrayAlignment built from the (name, sequence) pairs produced by
    MinimalFastaParser, with moltype DNA.
    """
    infile = open_(filename, mode='rt')
    try:
        seqs = [(n, s) for n, s in MinimalFastaParser(infile)]
    finally:
        # close the handle even when parsing fails part-way through
        infile.close()
    return ArrayAlignment(data=seqs, moltype=DNA)
Example #19
0
def _get_seq_array(data):
    """Build a DNA ArrayAlignment from ``data`` and return its ``array_seqs``.

    ``data`` is expected in the form [(n, seq), ...].
    """
    alignment = ArrayAlignment(data=data, moltype=DNA)
    return alignment.array_seqs