def test_subset_seqs_Alignment(self):
    """Check take_seqs on Alignment selects the requested sequences.

    Also checks that requesting the same names in a different order gives
    a different string representation.
    """
    named = (("rna1", "UCG"), ("rna2", "YCG"), ("rna3", "CAR"))
    seqs = [RnaSequence(raw, name=label) for label, raw in named]

    # expected result holds only rna2 and rna3
    expected = Alignment(seqs[1:], moltype=RNA)
    full = Alignment(seqs, moltype=RNA)

    observed = full.take_seqs(["rna2", "rna3"])
    self.assertEqual(observed, expected)
    self.assertEqual(str(observed), str(expected))

    # Selected sequences should be in specified order?
    reversed_pick = self.aln.take_seqs(["rna3", "rna2"])
    forward_pick = self.aln.take_seqs(["rna2", "rna3"])
    self.assertNotEqual(str(reversed_pick), str(forward_pick))
def setUp(self):
    """Build the sequences, FASTA strings and alignment used by Fasta tests."""
    self.strings = ["AAAA", "CCCC", "gggg", "uuuu"]
    self.labels = ["1st", "2nd", "3rd", "4th"]
    self.infos = ["Dog", "Cat", "Mouse", "Rat"]

    # two independent sets of Sequence objects: one tagged through
    # ``label``, the other through ``name``
    self.sequences_with_labels = [Sequence(raw) for raw in self.strings]
    self.sequences_with_names = [Sequence(raw) for raw in self.strings]
    for idx, tag in enumerate(self.labels):
        self.sequences_with_labels[idx].label = tag
        self.sequences_with_names[idx].name = tag

    # expected FASTA renderings
    self.fasta_no_label = ">0\nAAAA\n>1\nCCCC\n>2\ngggg\n>3\nuuuu\n"
    self.fasta_with_label = ">1st\nAAAA\n>2nd\nCCCC\n>3rd\nGGGG\n>4th\nUUUU\n"
    self.fasta_with_label_lw2 = (
        ">1st\nAA\nAA\n>2nd\nCC\nCC\n>3rd\nGG\nGG\n>4th\nUU\nUU\n"
    )
    self.fasta_with_label_species = (
        ">1st:Dog\nAAAA\n>2nd:Cat\nCCCC\n>3rd:Mouse\nGGGG\n>4th:Rat\nUUUU\n"
    )

    # alignment whose sequences each carry a species Info record
    self.alignment_dict = {
        "1st": "AAAA",
        "2nd": "CCCC",
        "3rd": "GGGG",
        "4th": "UUUU",
    }
    self.alignment_object = Alignment(self.alignment_dict)
    for tag, species in zip(self.labels, self.infos):
        record = self.alignment_object.named_seqs[tag]
        record.info = Info(species=species)
    self.alignment_object.RowOrder = ["1st", "2nd", "3rd", "4th"]
def alignment_traceback(seqs, aligned_positions, word_length):
    """Alignment object from state matrix and ending point.

    ``seqs`` is iterated as (name, seq) pairs alongside the starts, ends
    and maps produced by ``map_traceback``; each map and each slice bound
    is multiplied by ``word_length``.
    """
    starts, ends, maps = map_traceback(aligned_positions)
    named_aligned = [
        (
            name,
            Aligned(
                amap * word_length,
                seq[start * word_length : end * word_length],
            ),
        )
        for start, end, amap, (name, seq) in zip(starts, ends, maps, seqs)
    ]
    return Alignment(moltype=None, data=named_aligned)
def setUp(self):
    """Create named and unnamed RNA fixtures as Alignment and ArrayAlignment."""
    degen_gapped = RNA.alphabets.degen_gapped
    named = (("rna1", "UCAGGG"), ("rna2", "YCU-RG"), ("rna3", "CAA-NR"))
    unnamed = tuple(raw for _, raw in named)

    # named sequences
    self.rna1, self.rna2, self.rna3 = [
        RnaSequence(raw, name=label) for label, raw in named
    ]
    self.model1, self.model2, self.model3 = [
        ArraySequence(raw, name=label, alphabet=degen_gapped)
        for label, raw in named
    ]
    self.aln = Alignment([self.rna1, self.rna2, self.rna3], moltype=RNA)
    self.da = ArrayAlignment(
        [self.model1, self.model2, self.model3],
        moltype=RNA,
        alphabet=degen_gapped,
    )

    # seqs no name
    self.nn_rna1, self.nn_rna2, self.nn_rna3 = [
        RnaSequence(raw) for raw in unnamed
    ]
    self.nn_model1, self.nn_model2, self.nn_model3 = [
        ArraySequence(raw, alphabet=degen_gapped) for raw in unnamed
    ]
    self.nn_aln = Alignment(
        [self.nn_rna1, self.nn_rna2, self.nn_rna3], moltype=RNA
    )
    self.nn_da = ArrayAlignment(
        [self.nn_model1, self.nn_model2, self.nn_model3],
        moltype=RNA,
        alphabet=degen_gapped,
    )
def get_alignment(self, feature_types=None, where_feature=None, omit_redundant=True):
    """Return a DNA Alignment built from the aligned sequences of the members.

    Arguments:
        - feature_types: annotations to be applied to the returned
          sequences
        - where_feature: passed, together with feature_types, to each
          member's get_annotated_aligned
        - omit_redundant: exclude redundant gap positions

    Returns None when no member contributes an aligned sequence.
    """
    seqs = []
    for member in self.members:
        if feature_types:
            seq = member.get_annotated_aligned(feature_types, where_feature)
        else:
            seq = member.aligned_seq

        if seq is None:
            continue

        name = seq.name
        if self._rc:  # names should reflect change to strand
            loc = member.location.copy()
            loc.strand *= -1
            name = str(loc)

        seq.name = seq.data.name = name
        seqs.append((name, seq))

    # seqs is always a list, so emptiness (not identity with None) is the
    # condition that signals there is nothing to align
    if not seqs:
        return None

    aln = Alignment(data=seqs, moltype=DNA)
    if self._rc:
        aln = aln.rc()
    if omit_redundant:
        aln = aln.omit_gap_pos()
    return aln
def test_subset_positions_Alignment(self):
    """Check take_positions on Alignment against a hand-built alignment."""
    named = (("rna1", "UCG"), ("rna2", "YCG"), ("rna3", "CAR"))
    expected = Alignment(
        [RnaSequence(raw, name=label) for label, raw in named], moltype=RNA
    )

    observed = self.aln.take_positions([0, 1, 5])
    self.assertEqual(observed, expected)
    self.assertNotEqual(observed, self.aln)

    # the string forms must agree as well, which depends on sequence
    # order being maintained (covered by a separate test)
    self.assertEqual(str(observed), str(expected))
def setUp(self): """Setup for Clustal tests.""" self.unaligned_dict = { "1st": "AAA", "2nd": "CCCC", "3rd": "GGGG", "4th": "UUUU", } self.alignment_dict = { "1st": "AAAA", "2nd": "CCCC", "3rd": "GGGG", "4th": "UUUU", } # create alignment change order. self.alignment_object = Alignment(self.alignment_dict) self.alignment_order = ["2nd", "4th", "3rd", "1st"] self.alignment_object.RowOrder = self.alignment_order self.clustal_with_label = """CLUSTAL 1st AAAA 2nd CCCC 3rd GGGG 4th UUUU """ self.clustal_with_label_lw2 = """CLUSTAL 1st AA 2nd CC 3rd GG 4th UU 1st AA 2nd CC 3rd GG 4th UU """ self.clustal_with_label_reordered = """CLUSTAL 2nd CCCC 4th UUUU 3rd GGGG 1st AAAA """ self.clustal_with_label_lw2_reordered = """CLUSTAL
def test_user_function_multiple(self):
    """user defined composable functions should not interfere with each other"""
    from cogent3 import make_aligned_seqs
    from cogent3.core.alignment import Alignment

    # two independent wrappers, each around a different method
    first_app = user_function(self.foo, "aligned", "aligned")
    second_app = user_function(self.bar, "aligned", "pairwise_distances")

    first_input = make_aligned_seqs(
        data=[("a", "GCAAGCGTTTAT"), ("b", "GCTTTTGTCAAT")]
    )
    second_input = Alignment(
        data={"s1": "ACGTACGTA", "s2": "GTGTACGTA"}, moltype="dna"
    )

    first_result = first_app(first_input)
    second_result = second_app(second_input)

    self.assertEqual(first_result.to_dict(), {"a": "GCAA", "b": "GCTT"})
    self.assertEqual(second_result, {("s1", "s2"): 2.0, ("s2", "s1"): 2.0})
def pairwise_to_multiple(pwise, ref_seq, moltype, info=None):
    """
    merges pairwise alignments against one reference into a multiple alignment

    Parameters
    ----------
    pwise
        Series of pairwise alignments to ref_seq as
        [(non-refseq name, aligned pair), ...]
    ref_seq
        The sequence common in all pairwise alignments
    moltype
        molecular type for the returned alignment
    info
        info object

    Returns
    -------
    ArrayAlign

    Raises
    ------
    TypeError if ref_seq has no ``name`` attribute
    """
    if not hasattr(ref_seq, "name"):
        raise TypeError(
            f"ref_seq must be a cogent3 sequence, not {type(ref_seq)}")

    ref_name = ref_seq.name

    # collect the reference sequence as it appears in every pairwise alignment
    ref_copies = []
    for _, pair in pwise:
        for candidate in pair.seqs:
            if candidate.name == ref_name:
                ref_copies.append(candidate)

    # the reference is placed on the union of gaps from all of its copies
    union_gaps = _gap_union(ref_copies)
    ref_map = gap_coords_to_map(union_gaps, len(ref_seq))
    result = [Aligned(ref_map, ref_seq)]

    for other_name, pair in pwise:
        pair_ref = pair.named_seqs[ref_name]
        pair_ref_gaps = dict(pair_ref.map.get_gap_coordinates())
        other = pair.named_seqs[other_name]
        other_gaps = dict(other.map.get_gap_coordinates())
        missing = _combined_refseq_gaps(pair_ref_gaps, union_gaps)
        to_inject = _gaps_for_injection(other_gaps, missing, len(other.data))
        if to_inject:
            # rebuild this sequence on a map carrying the injected gaps
            other_map = gap_coords_to_map(to_inject, len(other.data))
            other = Aligned(other_map, other.data)
        result.append(other)

    # default to ArrayAlign
    combined = Alignment(result, moltype=moltype, info=info)
    return combined.to_type(array_align=True, moltype=moltype)
def get_align_for_phylip(data, id_map=None):
    """Convenience function to return an alignment object from phylip data.

    data: sequence of lines in phylip format (an open file, list, etc)
    id_map: optional id mapping from external ids to phylip labels; it is
        handed straight to MinimalPhylipParser

    returns Alignment object
    """
    records = list(MinimalPhylipParser(data, id_map))
    return Alignment(records)
class AllTests(TestCase):
    """Checks that Alignment and ArrayAlignment behave alike on the same RNA data."""

    def setUp(self):
        """setUp method for all tests"""
        # named sequences
        self.rna1 = RnaSequence("UCAGGG", name="rna1")
        self.rna2 = RnaSequence("YCU-RG", name="rna2")
        self.rna3 = RnaSequence("CAA-NR", name="rna3")
        self.model1 = ArraySequence(
            "UCAGGG", name="rna1", alphabet=RNA.alphabets.degen_gapped
        )
        self.model2 = ArraySequence(
            "YCU-RG", name="rna2", alphabet=RNA.alphabets.degen_gapped
        )
        self.model3 = ArraySequence(
            "CAA-NR", name="rna3", alphabet=RNA.alphabets.degen_gapped
        )

        self.aln = Alignment([self.rna1, self.rna2, self.rna3], moltype=RNA)
        self.da = ArrayAlignment(
            [self.model1, self.model2, self.model3],
            moltype=RNA,
            alphabet=RNA.alphabets.degen_gapped,
        )

        # seqs no name
        self.nn_rna1 = RnaSequence("UCAGGG")
        self.nn_rna2 = RnaSequence("YCU-RG")
        self.nn_rna3 = RnaSequence("CAA-NR")

        self.nn_model1 = ArraySequence(
            "UCAGGG", alphabet=RNA.alphabets.degen_gapped
        )
        self.nn_model2 = ArraySequence(
            "YCU-RG", alphabet=RNA.alphabets.degen_gapped
        )
        self.nn_model3 = ArraySequence(
            "CAA-NR", alphabet=RNA.alphabets.degen_gapped
        )

        self.nn_aln = Alignment(
            [self.nn_rna1, self.nn_rna2, self.nn_rna3], moltype=RNA
        )
        self.nn_da = ArrayAlignment(
            [self.nn_model1, self.nn_model2, self.nn_model3],
            moltype=RNA,
            alphabet=RNA.alphabets.degen_gapped,
        )

    def test_printing_named_seqs(self):
        """Printing named seqs should work the same on Aln and DenseAln"""
        # Note: the newline trailing each sequence is intentional, because
        # we want each FASTA-format record to be separated.
        exp_lines_general = [
            ">rna1", "UCAGGG", ">rna2", "YCU-RG", ">rna3", "CAA-NR"
        ]
        self.assertEqual(str(self.aln), "\n".join(exp_lines_general) + "\n")
        self.assertEqual(str(self.da), "\n".join(exp_lines_general) + "\n")

    def test_printing_unnamed_seqs(self):
        """Printing unnamed sequences should work the same on Aln and DenseAln"""
        # the trailing newline is appended after the join, matching the
        # named-sequence test above
        exp_lines_gen = [
            ">seq_0", "UCAGGG", ">seq_1", "YCU-RG", ">seq_2", "CAA-NR"
        ]
        self.assertEqual(str(self.nn_aln), "\n".join(exp_lines_gen) + "\n")
        self.assertEqual(str(self.nn_da), "\n".join(exp_lines_gen) + "\n")

    def test_ArrayAlignment_without_moltype(self):
        """Expect MolType to be picked up from the sequences."""
        m1 = ArraySequence(
            "UCAG", alphabet=RNA.alphabets.degen_gapped, name="rna1"
        )
        m2 = ArraySequence(
            "CCCR", alphabet=RNA.alphabets.degen_gapped, name="rna2"
        )
        da = ArrayAlignment([m1, m2])
        exp_lines = [">rna1", "UCAG", ">rna2", "CCCR"]
        self.assertEqual(str(da), "\n".join(exp_lines) + "\n")

    def test_names(self):
        """names should be the same on Alignment and ArrayAlignment"""
        # Should both alignments handle names the same way?
        self.assertEqual(self.aln.names, ["rna1", "rna2", "rna3"])
        self.assertEqual(self.da.names, ["rna1", "rna2", "rna3"])
        # On unnamed sequences the behavior is now the same.
        self.assertEqual(self.nn_aln.names, ["seq_0", "seq_1", "seq_2"])
        self.assertEqual(self.nn_da.names, ["seq_0", "seq_1", "seq_2"])

    def test_seqFreqs(self):
        """seqFreqs should work the same on Alignment and ArrayAlignment"""
        # expected per-sequence character counts for
        # 'UCAGGG'
        # 'YCU-RG'
        # 'CAA-NR'
        expected_counts = {
            0: {"U": 1, "C": 1, "A": 1, "G": 3},
            1: {"Y": 1, "C": 1, "U": 1, "-": 1, "R": 1, "G": 1},
            2: {"C": 1, "A": 2, "-": 1, "N": 1, "R": 1},
        }
        got1 = self.da.counts_per_seq(allow_gap=True, include_ambiguity=True)
        got2 = self.aln.counts_per_seq(allow_gap=True, include_ambiguity=True)
        for pos, counts in expected_counts.items():
            for char, expected in counts.items():
                self.assertEqual(got1[pos, char], expected)
                self.assertEqual(got2[pos, char], expected)

    def test_subset_positions_ArrayAlignment(self):
        """take_positions and get_sub_alignment should give the same columns"""
        # because dict order volatile, need to grab the
        # the index for ambig characters from the object
        # The full data comes from these seqs
        # 'UCAGGG'
        # 'YCU-RG'
        # 'CAA-NR'
        get_index = RNA.alphabets.degen_gapped.index
        G = get_index("-")
        N = get_index("N")
        R = get_index("R")
        Y = get_index("Y")
        full_data = array(
            [[0, 1, 2, 3, 3, 3], [Y, 1, 0, G, R, 3], [1, 2, 2, G, N, R]]
        )

        model1 = ArraySequence(
            "UCG", name="rna1", alphabet=RNA.alphabets.degen_gapped
        )
        model2 = ArraySequence(
            "YCG", name="rna2", alphabet=RNA.alphabets.degen_gapped
        )
        model3 = ArraySequence(
            "CAR", name="rna3", alphabet=RNA.alphabets.degen_gapped
        )
        sub_da = ArrayAlignment(
            [model1, model2, model3],
            moltype=RNA,
            alphabet=RNA.alphabets.degen_gapped,
        )
        sub_data = array([[0, 1, 3], [Y, 1, 3], [1, 2, R]])

        # First check some data
        self.assertEqual(self.da.array_seqs, full_data)
        self.assertEqual(self.da.array_positions, transpose(full_data))
        self.assertEqual(sub_da.array_seqs, sub_data)
        self.assertEqual(sub_da.array_positions, transpose(sub_data))

        obs_sub_da_TP = self.da.take_positions([0, 1, 5])
        obs_sub_da_SA = self.da.get_sub_alignment(pos=[0, 1, 5])

        # When using the get_sub_alignment method the data is right
        self.assertEqual(obs_sub_da_SA, sub_da)
        self.assertNotEqual(obs_sub_da_SA, self.da)
        self.assertEqual(obs_sub_da_SA.array_seqs, sub_data)
        self.assertEqual(obs_sub_da_SA.array_positions, transpose(sub_data))

        # For the take_positions method: Why does this work
        self.assertEqual(obs_sub_da_TP, sub_da)
        self.assertNotEqual(obs_sub_da_TP, self.da)
        # If the data doesn't match?
        self.assertEqual(obs_sub_da_TP.array_seqs, sub_data)
        self.assertEqual(obs_sub_da_TP.array_positions, transpose(sub_data))
        # Shouldn't the __eq__ method check the data at least?

    def test_subset_positions_Alignment(self):
        """take_positions on Alignment should match a hand-built alignment"""
        rna1 = RnaSequence("UCG", name="rna1")
        rna2 = RnaSequence("YCG", name="rna2")
        rna3 = RnaSequence("CAR", name="rna3")

        sub_aln = Alignment([rna1, rna2, rna3], moltype=RNA)

        obs_sub_aln = self.aln.take_positions([0, 1, 5])
        self.assertEqual(obs_sub_aln, sub_aln)
        self.assertNotEqual(obs_sub_aln, self.aln)
        # string representations should be the same, which requires the
        # sequence order to be maintained. See separate test.
        self.assertEqual(str(obs_sub_aln), str(sub_aln))

    def test_take_positions_sequence_order(self):
        """Alignment take_positions should maintain seq order"""
        # ArrayAlignment via get_sub_alignment
        self.assertEqual(self.da.names, ["rna1", "rna2", "rna3"])
        sub_da = self.da.get_sub_alignment(pos=[0, 1, 5])
        self.assertEqual(sub_da.names, ["rna1", "rna2", "rna3"])
        # Alignment via take_positions
        self.assertEqual(self.aln.names, ["rna1", "rna2", "rna3"])
        sub_aln = self.aln.take_positions([0, 1, 5])
        self.assertEqual(sub_aln.names, ["rna1", "rna2", "rna3"])

    def test_subset_seqs_Alignment(self):
        """take_seqs on Alignment should select the requested sequences"""
        rna1 = RnaSequence("UCG", name="rna1")
        rna2 = RnaSequence("YCG", name="rna2")
        rna3 = RnaSequence("CAR", name="rna3")

        sub_aln = Alignment([rna2, rna3], moltype=RNA)
        aln = Alignment([rna1, rna2, rna3], moltype=RNA)
        obs_sub_aln = aln.take_seqs(["rna2", "rna3"])

        self.assertEqual(obs_sub_aln, sub_aln)
        self.assertEqual(str(obs_sub_aln), str(sub_aln))

        # Selected sequences should be in specified order?
        obs_sub_aln_1 = self.aln.take_seqs(["rna3", "rna2"])
        obs_sub_aln_2 = self.aln.take_seqs(["rna2", "rna3"])
        self.assertNotEqual(str(obs_sub_aln_1), str(obs_sub_aln_2))

    def test_subset_seqs_ArrayAlignment(self):
        """take_seqs by name should agree with get_sub_alignment by index"""
        # NOTE: these objects are not compared below; they are retained so
        # that their construction is still exercised by this test
        model1 = ArraySequence(
            "UCG", name="rna1", alphabet=RNA.alphabets.degen_gapped
        )
        model2 = ArraySequence(
            "YCG", name="rna2", alphabet=RNA.alphabets.degen_gapped
        )
        model3 = ArraySequence(
            "CAR", name="rna3", alphabet=RNA.alphabets.degen_gapped
        )
        sub_da = ArrayAlignment(
            [model1, model2, model3],
            moltype=RNA,
            alphabet=RNA.alphabets.degen_gapped,
        )

        # take_seqs by name should have the same effect as
        # get_sub_alignment by seq idx?
        obs_sub_da_TS = self.da.take_seqs(["rna1"])
        obs_sub_da_SA = self.da.get_sub_alignment(seqs=[0])

        # These two are now the same. Fixed mapping of key to char array.
        self.assertEqual(obs_sub_da_TS, obs_sub_da_SA)
        self.assertEqual(str(obs_sub_da_TS), str(obs_sub_da_SA))

    def test_aln_equality(self):
        """ArrayAlignment equality under fewer seqs, reordering and no moltype"""
        # When does something compare equal?
        self.assertEqual(self.da == self.da, True)
        # one sequence less
        other_da1 = ArrayAlignment(
            [self.model1, self.model2],
            moltype=RNA,
            alphabet=RNA.alphabets.degen_gapped,
        )
        self.assertEqual(self.da == other_da1, False)
        # seqs in different order -- doesn't matter
        other_da2 = ArrayAlignment(
            [self.model1, self.model3, self.model2],
            moltype=RNA,
            alphabet=RNA.alphabets.degen_gapped,
        )
        self.assertEqual(self.da == other_da2, True)
        # seqs in different encoding -- doesn't matter, only looks at data
        other_da3 = ArrayAlignment([self.model1, self.model2, self.model3])
        # Should this compare False even though the data is exactly the same?
        # The moltype is different...
        self.assertEqual(self.da == other_da3, True)
        # use a unittest assertion (a bare assert is stripped under -O) and
        # the array's own all() for the elementwise comparison
        self.assertTrue((self.da.array_seqs == other_da3.array_seqs).all())

    def test_seq_equality(self):
        """ArraySequences built from identical input should compare equal"""
        model1 = ArraySequence(
            "UCG", name="rna1", alphabet=RNA.alphabets.degen_gapped
        )
        model2 = ArraySequence(
            "UCG", name="rna1", alphabet=RNA.alphabets.degen_gapped
        )
        # Shouldn't the above two sequences be equal?
        self.assertEqual(model1, model2)
        # string comparison is True
        self.assertEqual(str(model1), str(model2))

    def test_seq_ungapping(self):
        """degap should strip gaps from RnaSequence and ArraySequence"""
        rna1 = RnaSequence("U-C-A-G-", name="rna1")
        model1 = ArraySequence(
            "U-C-A-G-", name="rna1", alphabet=RNA.alphabets.degen_gapped
        )

        self.assertEqual(rna1, "U-C-A-G-")
        self.assertEqual(rna1.degap(), "UCAG")

        # check it produces the right string from the beginning
        self.assertEqual(str(model1), "U-C-A-G-")
        self.assertEqual(model1._data, [0, 4, 1, 4, 2, 4, 3, 4])
        # ArraySequence should maybe have the same degap method as normal seq
        self.assertEqual(str(model1.degap()), "UCAG")

    def test_the_rest_of_ModelSequence(self):
        """The class ArraySequence has 14 methods, but only 2 unittests.
        You might want to add some tests there..."""
        # note: mostly these are tested in derived classes, for convenience.
        pass