def test_omit_gap_positions(self):
    """Exercise ``omit_gap_positions`` with several different arguments."""
    # For these arguments the result is expected to equal self.a2 itself.
    for threshold in (1.0, 0.51):
        self.assertEqual(self.a2.omit_gap_positions(threshold), self.a2)

    # For these arguments the expected result is this pair of
    # four-character sequences.
    for threshold in (0.49, 0.0):
        trimmed = Alignment([RNA('UUAU', metadata={'id': "r1"}),
                             RNA('ACGU', metadata={'id': "r2"})])
        self.assertEqual(self.a2.omit_gap_positions(threshold), trimmed)

    # The empty alignment is expected to compare equal to itself afterwards.
    for threshold in (0.0, 0.49, 1.0):
        self.assertEqual(self.empty.omit_gap_positions(threshold),
                         self.empty)

    # Test to ensure floating point precision bug isn't present. See the
    # tests for Alignment.position_frequencies for more details.
    aln = Alignment(
        [DNA('-.', metadata={'id': str(i)}) for i in range(33)])
    self.assertEqual(
        aln.omit_gap_positions(1 - np.finfo(float).eps),
        Alignment([DNA('', metadata={'id': str(i)}) for i in range(33)]))
def test_update_ids_sequence_attributes_propagated(self):
    """Compare ``update_ids`` output with sequences built under new ids.

    The expected sequences carry the same description and quality values
    as the inputs; only the ids differ.
    """
    # 1 seq
    single = Alignment([
        DNA('ACGT', id="seq1", description='desc', quality=range(4))
    ])
    single_exp = Alignment([
        DNA('ACGT', id="abc", description='desc', quality=range(4))
    ])
    obs_sc, obs_id_map = single.update_ids(ids=('abc',))
    self._assert_sequence_collections_equal(obs_sc, single_exp)
    self.assertEqual(obs_id_map, {'abc': 'seq1'})

    # 2 seqs
    pair = Alignment([
        DNA('ACGT', id="seq1", description='desc1', quality=(0, 1, 2, 3)),
        DNA('TGCA', id="seq2", description='desc2', quality=(3, 2, 1, 0))
    ])
    pair_exp = Alignment([
        DNA('ACGT', id="abc", description='desc1', quality=range(4)),
        DNA('TGCA', id="def", description='desc2',
            quality=range(4)[::-1])
    ])
    obs_sc, obs_id_map = pair.update_ids(ids=('abc', 'def'))
    self._assert_sequence_collections_equal(obs_sc, pair_exp)
    self.assertEqual(obs_id_map, {'abc': 'seq1', 'def': 'seq2'})
def test_compute_score_and_traceback_matrices_invalid(self):
    """``ValueError`` is expected for the 'AWG' / 'ACGT' input pair."""
    # if the sequence contains a character that is not in the
    # substitution matrix, an informative error should be raised
    sub_matrix = make_identity_substitution_matrix(2, -1)
    # The inputs are built outside the ``with`` block so that only the
    # call under test is allowed to raise.
    first = Alignment([DNA('AWG')])
    second = Alignment([DNA('ACGT')])
    with self.assertRaises(ValueError):
        _compute_score_and_traceback_matrices(
            first, second, 5, 2, sub_matrix)
def compute_distance_matrix(msa_file, csvfile="distance_mat.csv"):
    """
    load up some aligned sequences, and compute a distance matrix

    Distances come from ``Alignment.distances()`` called with its default
    arguments; the original notes describe this as the hamming function
    (see also: scipy.spatial.distance.hamming).

    The CSV output has one header row holding the ids of the distance
    matrix, followed by the rows produced by iterating the matrix.

    @args msa_file: multiple sequence alignment in fasta format
    @type msa_file: str
    @args csvfile: output distance matrix file in csv format
    @type csvfile: str
    """
    records = [RNA(rec.seq, rec.id)
               for rec in SeqIO.parse(msa_file, "fasta")]
    master_dm = Alignment(records).distances()

    # The ``with`` block closes the file on exit, so no explicit close()
    # call is needed.
    with open(csvfile, "w") as output:
        writer = csv.writer(output, lineterminator="\n")
        writer.writerow(list(master_dm.ids))
        writer.writerows(master_dm)
def test_to_phylip_no_positions(self):
    """``to_phylip`` on zero-length sequences must raise."""
    empty_seqs = [DNASequence('', id="d1"), DNASequence('', id="d2")]
    aln = Alignment(empty_seqs)
    self.assertRaises(SequenceCollectionError, aln.to_phylip)
def test_init_matrices_sw(self):
    """Compare ``_init_matrices_sw`` output with hand-written matrices."""
    obs_scores, obs_tback = _init_matrices_sw(
        Alignment([DNA('AAA')]), Alignment([DNA('AAAA')]), 5, 2)

    # Expected score matrix: all zeros, shape (5, 4).
    exp_scores = np.zeros((5, 4))
    exp_tback = [
        [0, 0, 0, 0],
        [0, -1, -1, -1],
        [0, -1, -1, -1],
        [0, -1, -1, -1],
        [0, -1, -1, -1],
    ]
    np.testing.assert_array_equal(obs_scores, exp_scores)
    np.testing.assert_array_equal(obs_tback, exp_tback)
def test_to_phylip_unequal_sequence_lengths(self):
    """``to_phylip`` must raise when one sequence is shorter."""
    seqs = [
        DNASequence('A-CT', id="d1"),
        DNASequence('TTA', id="d2"),  # three characters; the others have four
        DNASequence('.-AC', id="d3"),
    ]
    aln = Alignment(seqs)
    self.assertRaises(SequenceCollectionError, aln.to_phylip)
def test_init_matrices_nw(self):
    """Compare ``_init_matrices_nw`` output with hand-written matrices."""
    obs_scores, obs_tback = _init_matrices_nw(
        Alignment([DNA('AAA')]), Alignment([DNA('AAAA')]), 5, 2)

    exp_scores = [
        [0, -5, -7, -9],
        [-5, 0, 0, 0],
        [-7, 0, 0, 0],
        [-9, 0, 0, 0],
        [-11, 0, 0, 0],
    ]
    exp_tback = [
        [0, 3, 3, 3],
        [2, -1, -1, -1],
        [2, -1, -1, -1],
        [2, -1, -1, -1],
        [2, -1, -1, -1],
    ]
    np.testing.assert_array_equal(obs_scores, exp_scores)
    np.testing.assert_array_equal(obs_tback, exp_tback)
def test_validate_lengths(self):
    """``_validate_lengths`` is truthy unless sequence lengths differ."""
    for aln in (self.a1, self.a2, self.empty):
        self.assertTrue(aln._validate_lengths())

    single = Alignment([DNASequence('TTT', id="d1")])
    self.assertTrue(single._validate_lengths())

    # Lengths 3 and 2 are expected to be rejected.
    ragged = Alignment([DNASequence('TTT', id="d1"),
                        DNASequence('TT', id="d2")])
    self.assertFalse(ragged._validate_lengths())
def test_to_phylip_map_labels(self):
    """to_phylip functions as expected with label mapping."""
    aln = Alignment([
        DNASequence("..ACC-GTTGG..", id="d1"),
        DNASequence("TTACCGGT-GGCC", id="d2"),
        DNASequence(".-ACC-GTTGC--", id="d3"),
    ])
    phylip_str, id_map = aln.to_phylip(map_labels=True, label_prefix="s")

    self.assertEqual(id_map, {"s1": "d1", "s2": "d2", "s3": "d3"})
    expected_lines = [
        "3 13",
        "s1 ..ACC-GTTGG..",
        "s2 TTACCGGT-GGCC",
        "s3 .-ACC-GTTGC--",
    ]
    self.assertEqual(phylip_str, "\n".join(expected_lines))
def test_to_phylip(self):
    """to_phylip functions as expected."""
    aln = Alignment([
        DNASequence("..ACC-GTTGG..", id="d1"),
        DNASequence("TTACCGGT-GGCC", id="d2"),
        DNASequence(".-ACC-GTTGC--", id="d3"),
    ])
    phylip_str, id_map = aln.to_phylip(map_labels=False)

    # Without label mapping each id is expected to map to itself.
    self.assertEqual(id_map, {"d1": "d1", "d2": "d2", "d3": "d3"})
    expected_lines = [
        "3 13",
        "d1 ..ACC-GTTGG..",
        "d2 TTACCGGT-GGCC",
        "d3 .-ACC-GTTGC--",
    ]
    self.assertEqual(phylip_str, "\n".join(expected_lines))
def test_to_phylip_map_labels(self):
    """to_phylip functions as expected with label mapping."""
    seqs = [DNASequence('..ACC-GTTGG..', id="d1"),
            DNASequence('TTACCGGT-GGCC', id="d2"),
            DNASequence('.-ACC-GTTGC--', id="d3")]
    result = Alignment(seqs).to_phylip(map_labels=True, label_prefix="s")
    phylip_str, id_map = result

    exp_id_map = {'s1': 'd1', 's2': 'd2', 's3': 'd3'}
    exp_phylip = ("3 13\n"
                  "s1 ..ACC-GTTGG..\n"
                  "s2 TTACCGGT-GGCC\n"
                  "s3 .-ACC-GTTGC--")
    self.assertEqual(id_map, exp_id_map)
    self.assertEqual(phylip_str, exp_phylip)
def test_majority_consensus(self):
    """Check ``majority_consensus`` on three small inputs."""
    # Three sequences; the expected consensus is 'TT-'.
    three = Alignment([DNASequence('TTT', id="d1"),
                       DNASequence('TT-', id="d2"),
                       DNASequence('TC-', id="d3")])
    consensus = three.majority_consensus()
    self.assertTrue(consensus.equals(DNASequence('TT-')))

    # Two different single-character sequences: either one is accepted.
    two = Alignment([DNASequence('T', id="d1"),
                     DNASequence('A', id="d2")])
    self.assertIn(two.majority_consensus(),
                  [DNASequence('T'), DNASequence('A')])

    self.assertEqual(self.empty.majority_consensus(), '')
def test_to_phylip(self):
    """to_phylip functions as expected."""
    seqs = [DNASequence('..ACC-GTTGG..', id="d1"),
            DNASequence('TTACCGGT-GGCC', id="d2"),
            DNASequence('.-ACC-GTTGC--', id="d3")]
    result = Alignment(seqs).to_phylip(map_labels=False)
    phylip_str, id_map = result

    exp_id_map = {'d1': 'd1', 'd2': 'd2', 'd3': 'd3'}
    exp_phylip = ("3 13\n"
                  "d1 ..ACC-GTTGG..\n"
                  "d2 TTACCGGT-GGCC\n"
                  "d3 .-ACC-GTTGC--")
    self.assertEqual(id_map, exp_id_map)
    self.assertEqual(phylip_str, exp_phylip)
def test_update_ids_sequence_attributes_propagated(self):
    """Compare ``update_ids`` output with sequences built under new ids.

    The expected sequences carry the same description and quality values
    as the inputs; only the ``'id'`` metadata differs.
    """
    # 1 seq
    single = Alignment([
        DNA('ACGT',
            metadata={'id': "seq1", 'description': 'desc'},
            positional_metadata={'quality': range(4)})
    ])
    single_exp = Alignment([
        DNA('ACGT',
            metadata={'id': "abc", 'description': 'desc'},
            positional_metadata={'quality': range(4)})
    ])
    obs_sc, obs_id_map = single.update_ids(ids=('abc', ))
    self.assertEqual(obs_sc, single_exp)
    self.assertEqual(obs_id_map, {'abc': 'seq1'})

    # 2 seqs
    pair = Alignment([
        DNA('ACGT',
            metadata={'id': "seq1", 'description': 'desc1'},
            positional_metadata={'quality': (0, 1, 2, 3)}),
        DNA('TGCA',
            metadata={'id': "seq2", 'description': 'desc2'},
            positional_metadata={'quality': (3, 2, 1, 0)})
    ])
    pair_exp = Alignment([
        DNA('ACGT',
            metadata={'id': "abc", 'description': 'desc1'},
            positional_metadata={'quality': range(4)}),
        DNA('TGCA',
            metadata={'id': "def", 'description': 'desc2'},
            positional_metadata={'quality': range(4)[::-1]})
    ])
    obs_sc, obs_id_map = pair.update_ids(ids=('abc', 'def'))
    self.assertEqual(obs_sc, pair_exp)
    self.assertEqual(obs_id_map, {'abc': 'seq1', 'def': 'seq2'})
def setUp(self):
    """Build the alignments, data-file paths and invalid cases for tests.

    Sets ``self.objs`` (writeable alignments), ``self.fps`` (one path per
    entry of ``self.objs``, in the same order) and ``self.invalid_objs``
    (``(alignment, error-regexp)`` pairs).
    """
    # ids all same length, seqs longer than 10 chars
    dna_3_seqs = Alignment([
        DNA('..ACC-GTTGG..', id="d1"),
        DNA('TTACCGGT-GGCC', id="d2"),
        DNA('.-ACC-GTTGC--', id="d3")
    ])

    # id lengths from 0 to 10, with mixes of numbers, characters, and
    # spaces. sequence characters are a mix of cases and gap characters.
    # sequences are shorter than 10 chars
    variable_length_ids = Alignment([
        RNA('.-ACGU'),
        RNA('UGCA-.', id='a'),
        RNA('.ACGU-', id='bb'),
        RNA('ugca-.', id='1'),
        RNA('AaAaAa', id='abcdefghij'),
        RNA('GGGGGG', id='ab def42ij')
    ])

    # sequences with 20 chars = exactly two chunks of size 10
    two_chunks = Alignment([
        DNA('..ACC-GTTGG..AATGC.C', id='foo'),
        DNA('TTACCGGT-GGCCTA-GCAT', id='bar')
    ])

    # single sequence with more than two chunks
    single_seq_long = Alignment(
        [DNA('..ACC-GTTGG..AATGC.C----', id='foo')])

    # single sequence with only a single character (minimal writeable
    # alignment)
    single_seq_short = Alignment([DNA('-')])

    # alignments that can be written in phylip format
    self.objs = [
        dna_3_seqs, variable_length_ids, two_chunks, single_seq_long,
        single_seq_short
    ]
    # A real list rather than ``map(...)``: on Python 3 ``map`` returns a
    # one-shot iterator, which would be empty after the first pass and
    # cannot be indexed or measured with len().
    self.fps = [get_data_path(name) for name in (
        'phylip_dna_3_seqs',
        'phylip_variable_length_ids',
        'phylip_two_chunks',
        'phylip_single_seq_long',
        'phylip_single_seq_short',
    )]

    # alignments that cannot be written in phylip format, paired with their
    # expected error message regexps
    self.invalid_objs = [
        # no seqs
        (Alignment([]), 'one sequence'),

        # no positions
        (Alignment([DNA('', id="d1"), DNA('', id="d2")]), 'one position'),

        # ids too long
        (Alignment(
            [RNA('ACGU', id="foo"), RNA('UGCA', id="alongsequenceid")]),
         '10.*alongsequenceid')
    ]
def test_majority_consensus(self):
    """majority_consensus functions as expected."""
    # Three sequences; the expected consensus is "TT-".
    three = Alignment([DNASequence("TTT", id="d1"),
                       DNASequence("TT-", id="d2"),
                       DNASequence("TC-", id="d3")])
    self.assertEqual(three.majority_consensus(), DNASequence("TT-"))

    # Two different single-character sequences: either one is accepted.
    two = Alignment([DNASequence("T", id="d1"),
                     DNASequence("A", id="d2")])
    self.assertIn(two.majority_consensus(),
                  [DNASequence("T"), DNASequence("A")])

    self.assertEqual(self.empty.majority_consensus(), "")
def test_subalignment(self):
    """subalignment functions as expected."""
    aln = self.a1

    # keep seqs by ids
    obs = aln.subalignment(seqs_to_keep=['d1', 'd3'])
    self.assertEqual(obs, Alignment([self.d1, self.d3]))

    # keep seqs by indices
    obs = aln.subalignment(seqs_to_keep=[0, 2])
    self.assertEqual(obs, Alignment([self.d1, self.d3]))

    # keep seqs by ids (invert)
    obs = aln.subalignment(seqs_to_keep=['d1', 'd3'],
                           invert_seqs_to_keep=True)
    self.assertEqual(obs, Alignment([self.d2]))

    # keep seqs by indices (invert)
    obs = aln.subalignment(seqs_to_keep=[0, 2],
                           invert_seqs_to_keep=True)
    self.assertEqual(obs, Alignment([self.d2]))

    # keep positions
    obs = aln.subalignment(positions_to_keep=[0, 2, 3])
    exp = Alignment([DNASequence('.AC', id="d1"),
                     DNASequence('TAC', id="d2"),
                     DNASequence('.AC', id="d3")])
    self.assertEqual(obs, exp)

    # keep positions (invert)
    obs = aln.subalignment(positions_to_keep=[0, 2, 3],
                           invert_positions_to_keep=True)
    exp = Alignment([DNASequence('.C-GTTGG..', id="d1"),
                     DNASequence('TCGGT-GGCC', id="d2"),
                     DNASequence('-C-GTTGC--', id="d3")])
    self.assertEqual(obs, exp)

    # keep seqs and positions
    obs = aln.subalignment(seqs_to_keep=[0, 2],
                           positions_to_keep=[0, 2, 3])
    exp = Alignment([DNASequence('.AC', id="d1"),
                     DNASequence('.AC', id="d3")])
    self.assertEqual(obs, exp)

    # keep seqs and positions (invert)
    obs = aln.subalignment(seqs_to_keep=[0, 2],
                           positions_to_keep=[0, 2, 3],
                           invert_seqs_to_keep=True,
                           invert_positions_to_keep=True)
    exp = Alignment([DNASequence('TCGGT-GGCC', id="d2")])
    self.assertEqual(obs, exp)
def test_is_valid(self):
    """is_valid functions as expected."""
    for aln in (self.a1, self.a2, self.empty):
        self.assertTrue(aln.is_valid())

    # invalid because of length mismatch
    mismatched = Alignment([DNASequence('..ACC-GTTGG..', id="d1"),
                            DNASequence('TTACCGGT-GGC', id="d2")])
    self.assertFalse(mismatched.is_valid())

    # invalid because of invalid characters
    bad_chars = Alignment([DNASequence('..ACC-GTXGG..', id="d1"),
                           DNASequence('TTACCGGT-GGCC', id="d2")])
    self.assertFalse(bad_chars.is_valid())
def test_fastq_to_alignment(self):
    """Check ``_fastq_to_alignment`` for every valid configuration.

    Each entry of ``self.valid_configurations`` unpacks into files, a
    collection of keyword-argument dicts, and components. The components
    are indexed as ``c[0]`` id, ``c[1]`` description, ``c[2]`` sequence
    and ``c[3]`` quality when the expected alignment is built.
    """
    for valid_files, kwargs, components in self.valid_configurations:
        for valid_file in valid_files:
            for observed_kwargs in kwargs:
                _drop_kwargs(observed_kwargs, 'seq_num')
                constructor = observed_kwargs.get('constructor', Sequence)

                # Both the expected sequences and the call under test get
                # the same ``lowercase`` setting.
                expected_kwargs = {'lowercase': 'introns'}
                observed_kwargs['lowercase'] = 'introns'

                expected_seqs = []
                for comp in components:
                    quality = np.array(comp[3], dtype=np.uint8)
                    expected_seqs.append(constructor(
                        comp[2],
                        metadata={'id': comp[0], 'description': comp[1]},
                        positional_metadata={'quality': quality},
                        **expected_kwargs))
                expected = Alignment(expected_seqs)

                observed = _fastq_to_alignment(valid_file,
                                               **observed_kwargs)
                self.assertEqual(observed, expected)
def test_generate_lane_mask(self):
    """Check ``generate_lane_mask`` for a range of second-argument values."""
    # The FASTA records are spelled out one line per element. This is
    # the list that splitting a newline-separated literal would produce,
    # without relying on newlines embedded in a triple-quoted string.
    sample_alignment = [
        '>1', 'AAAAT',
        '>2', 'AAAGG',
        '>3', 'AACCC',
        '>4', 'A----',
    ]
    aln = Alignment.from_fasta_records(parse_fasta(sample_alignment), DNA)

    # (second argument to generate_lane_mask, expected mask string)
    cases = [
        (0.00, "11111"),
        (0.10, "11100"),
        (0.20, "11100"),
        (0.40, "11000"),
        (0.60, "11000"),
        (0.80, "10000"),
        (1.00, "00000"),
    ]
    for level, expected_mask in cases:
        actual_lanemask = generate_lane_mask(aln, level)
        self.assertEqual(actual_lanemask, expected_mask)
def align_two_alignments(aln1_fp, aln2_fp, moltype, params=None):
    """Returns an Alignment object from two existing Alignments.

    Runs the Mafft app controller with its command set to
    ``mafft-profile`` on the two input files and reads the result from
    the application's standard output.

    Parameters
    ----------
    aln1_fp : string
        file path of 1st alignment
    aln2_fp : string
        file path of 2nd alignment
    moltype : sequence constructor
        passed to ``Alignment.read`` as ``constructor``
    params : dict of parameters to pass in to the Mafft app controller.

    Returns
    -------
    The aligned sequences.
    """
    # Create Mafft app.
    app = Mafft(InputHandler='_input_as_paths',
                params=params,
                SuppressStderr=False)
    app._command = 'mafft-profile'

    # Run the app on both alignment files.
    res = app([aln1_fp, aln2_fp])
    try:
        return Alignment.read(res['StdOut'], constructor=moltype)
    finally:
        # Clean up the result object even when reading fails.
        res.cleanUp()
def test_validate_lengths(self):
    """``_validate_lengths`` is truthy for every alignment checked here."""
    for aln in (self.a1, self.a2, self.empty):
        self.assertTrue(aln._validate_lengths())

    single = Alignment([DNA('TTT', metadata={'id': "d1"})])
    self.assertTrue(single._validate_lengths())
def test_fastq_to_alignment(self):
    """Check ``_fastq_to_alignment`` for every valid configuration.

    Each entry of ``self.valid_configurations`` unpacks into files, a
    collection of keyword-argument dicts, and components. The components
    are indexed as ``c[0]`` id, ``c[1]`` description, ``c[2]`` sequence
    and ``c[3]`` quality when the expected alignment is built.
    """
    for valid_files, kwargs, components in self.valid_configurations:
        for valid_file in valid_files:
            for observed_kwargs in kwargs:
                _drop_kwargs(observed_kwargs, 'seq_num')
                constructor = observed_kwargs.get('constructor', Sequence)

                # Can't use partials for this because the read
                # function below can't operate on partials
                expected_kwargs = {}
                if hasattr(constructor, 'lowercase'):
                    expected_kwargs['lowercase'] = 'introns'
                    observed_kwargs['lowercase'] = 'introns'

                expected_seqs = []
                for comp in components:
                    quality = np.array(comp[3], dtype=np.uint8)
                    expected_seqs.append(constructor(
                        comp[2],
                        metadata={'id': comp[0], 'description': comp[1]},
                        positional_metadata={'quality': quality},
                        **expected_kwargs))
                expected = Alignment(expected_seqs)

                observed = _fastq_to_alignment(valid_file,
                                               **observed_kwargs)
                self.assertEqual(observed, expected)
def setUp(self):
    """Setup for Fasta tests."""
    self.strings = ['AAAA', 'CCCC', 'gggg', 'uuuu']

    # FASTA text with numeric labels, with named labels, and with named
    # labels where each sequence is split across two lines.
    self.fasta_no_label = '>0\nAAAA\n>1\nCCCC\n>2\ngggg\n>3\nuuuu'
    self.fasta_with_label = (
        '>1st\nAAAA\n>2nd\nCCCC\n>3rd\nGGGG\n>4th\nUUUU')
    self.fasta_with_label_lw2 = (
        '>1st\nAA\nAA\n>2nd\nCC\nCC\n>3rd\nGG\nGG\n>4th\nUU\nUU')

    self.alignment_dict = {
        '1st': 'AAAA',
        '2nd': 'CCCC',
        '3rd': 'GGGG',
        '4th': 'UUUU',
    }

    # The same two sequences built with two different sequence classes.
    self.sequence_objects_a = [
        DNASequence('ACTCGAGATC', 'seq1'),
        DNASequence('GGCCT', 'seq2'),
    ]
    self.sequence_objects_b = [
        BiologicalSequence('ACTCGAGATC', 'seq1'),
        BiologicalSequence('GGCCT', 'seq2'),
    ]

    aligned = [
        DNASequence("ACC--G-GGTA..", id="seq1"),
        DNASequence("TCC--G-GGCA..", id="seqs2"),
    ]
    self.alignment = Alignment(aligned)
def test_to_phylip_no_positions(self):
    """``to_phylip`` on zero-length sequences must raise.

    The call goes through ``npt.assert_warns`` with
    ``DeprecationWarning``, inside the ``assertRaises`` block.
    """
    empty_seqs = [DNASequence('', id="d1"), DNASequence('', id="d2")]
    aln = Alignment(empty_seqs)
    with self.assertRaises(SequenceCollectionError):
        npt.assert_warns(DeprecationWarning, aln.to_phylip)
def test_subalignment(self):
    """Check ``subalignment`` sequence and position filters, with inverts."""

    def _dna(chars, seq_id):
        # Shorthand for a DNA sequence that carries only an id.
        return DNA(chars, metadata={'id': seq_id})

    aln = self.a1

    # keep seqs by ids
    obs = aln.subalignment(seqs_to_keep=['d1', 'd3'])
    self.assertEqual(obs, Alignment([self.d1, self.d3]))

    # keep seqs by indices
    obs = aln.subalignment(seqs_to_keep=[0, 2])
    self.assertEqual(obs, Alignment([self.d1, self.d3]))

    # keep seqs by ids (invert)
    obs = aln.subalignment(seqs_to_keep=['d1', 'd3'],
                           invert_seqs_to_keep=True)
    self.assertEqual(obs, Alignment([self.d2]))

    # keep seqs by indices (invert)
    obs = aln.subalignment(seqs_to_keep=[0, 2],
                           invert_seqs_to_keep=True)
    self.assertEqual(obs, Alignment([self.d2]))

    # keep positions
    obs = aln.subalignment(positions_to_keep=[0, 2, 3])
    exp = Alignment([_dna('.AC', "d1"),
                     _dna('TAC', "d2"),
                     _dna('.AC', "d3")])
    self.assertEqual(obs, exp)

    # keep positions (invert)
    obs = aln.subalignment(positions_to_keep=[0, 2, 3],
                           invert_positions_to_keep=True)
    exp = Alignment([_dna('.C-GTTGG..', "d1"),
                     _dna('TCGGT-GGCC', "d2"),
                     _dna('-C-GTTGC--', "d3")])
    self.assertEqual(obs, exp)

    # keep seqs and positions
    obs = aln.subalignment(seqs_to_keep=[0, 2],
                           positions_to_keep=[0, 2, 3])
    exp = Alignment([_dna('.AC', "d1"), _dna('.AC', "d3")])
    self.assertEqual(obs, exp)

    # keep seqs and positions (invert)
    obs = aln.subalignment(seqs_to_keep=[0, 2],
                           positions_to_keep=[0, 2, 3],
                           invert_seqs_to_keep=True,
                           invert_positions_to_keep=True)
    exp = Alignment([_dna('TCGGT-GGCC', "d2")])
    self.assertEqual(obs, exp)
def test_omit_gap_sequences(self):
    """Exercise ``omit_gap_sequences`` with several different arguments."""
    # For these arguments the result is expected to equal self.a2 itself.
    for threshold in (1.0, 0.20):
        self.assertEqual(self.a2.omit_gap_sequences(threshold), self.a2)

    # At 0.19 only self.r2 is expected to remain.
    only_r2 = Alignment([self.r2])
    self.assertEqual(self.a2.omit_gap_sequences(0.19), only_r2)

    # The empty alignment is expected to compare equal to itself afterwards.
    for threshold in (0.0, 0.2, 1.0):
        self.assertEqual(self.empty.omit_gap_sequences(threshold),
                         self.empty)

    # Test to ensure floating point precision bug isn't present. See the
    # tests for Alignment.position_frequencies for more details.
    all_gaps = Alignment([DNA('.' * 33, id='abc'),
                          DNA('-' * 33, id='def')])
    self.assertEqual(
        all_gaps.omit_gap_sequences(1 - np.finfo(float).eps),
        Alignment([]))
def test_majority_consensus_constructor(self):
    """``majority_consensus(constructor=str)`` must warn and return 'TT-'.

    The call goes through ``npt.assert_warns`` with ``UserWarning``.
    """
    aln = Alignment([
        DNASequence('TTT', id="d1"),
        DNASequence('TT-', id="d2"),
        DNASequence('TC-', id="d3"),
    ])
    obs = npt.assert_warns(
        UserWarning, aln.majority_consensus, constructor=str)
    self.assertEqual(obs, 'TT-')
def test_phylip_to_alignment_valid_files(self):
    """Check ``_phylip_to_alignment`` against its listed components.

    Each entry of ``self.valid_configurations`` unpacks into files and
    ``(sequence, id)`` pairs.
    """
    for valid_files, components in self.valid_configurations:
        for valid_file in valid_files:
            observed = _phylip_to_alignment(valid_file)

            expected_seqs = []
            for seq, seq_id in components:
                expected_seqs.append(
                    Sequence(seq, metadata={'id': seq_id}))
            expected = Alignment(expected_seqs)

            self.assertEqual(observed, expected)
def test_init_validate(self):
    """initialization with validation functions as expected."""
    # Construction with validation must not raise for these sequences.
    Alignment(self.seqs1, validate=True)

    # invalid DNA character
    bad_seq = DNASequence('.-ACC-GTXGC--', id="i1")
    invalid_seqs1 = [self.d1, self.d2, self.d3, bad_seq]
    with self.assertRaises(SequenceCollectionError):
        Alignment(invalid_seqs1, validate=True)
def setUp(self):
    """Create the sequences and alignments shared by the tests."""
    # (id, sequence) tuples; the sequence objects are built from them.
    self.seqs1_t = [("d1", "..ACC-GTTGG.."),
                    ("d2", "TTACCGGT-GGCC"),
                    ("d3", ".-ACC-GTTGC--")]
    self.seqs2_t = [("r1", "UUAU-"),
                    ("r2", "ACGUU")]

    self.seqs1 = [DNASequence(chars, id=seq_id)
                  for seq_id, chars in self.seqs1_t]
    self.seqs2 = [RNASequence(chars, id=seq_id)
                  for seq_id, chars in self.seqs2_t]
    self.d1, self.d2, self.d3 = self.seqs1
    self.r1, self.r2 = self.seqs2

    self.a1 = Alignment(self.seqs1)
    self.a2 = Alignment(self.seqs2)
    # Same sequences as a2, with a score and start/end positions set.
    self.a3 = Alignment(self.seqs2, score=42.0,
                        start_end_positions=[(0, 3), (5, 9)])
    self.a4 = Alignment(self.seqs2, score=-42.0,
                        start_end_positions=[(1, 4), (6, 10)])
    self.empty = Alignment([])
def test_omit_gap_positions(self):
    """omitting gap positions functions as expected."""
    # For these arguments the result is expected to equal self.a2 itself.
    for threshold in (1.0, 0.51):
        self.assertEqual(self.a2.omit_gap_positions(threshold), self.a2)

    # For these arguments the expected result is this pair of
    # four-character sequences.
    for threshold in (0.49, 0.0):
        trimmed = Alignment([RNASequence('UUAU', id="r1"),
                             RNASequence('ACGU', id="r2")])
        self.assertEqual(self.a2.omit_gap_positions(threshold), trimmed)

    # The empty alignment is expected to compare equal to itself afterwards.
    for threshold in (0.0, 0.49, 1.0):
        self.assertEqual(self.empty.omit_gap_positions(threshold),
                         self.empty)
def test_subalignment_filter_out_everything(self):
    """Inverting a ``None`` keep-list is expected to give an empty result."""
    nothing_left = Alignment([])

    # no sequences
    without_seqs = self.a1.subalignment(seqs_to_keep=None,
                                        invert_seqs_to_keep=True)
    self.assertEqual(without_seqs, nothing_left)

    # no positions
    without_positions = self.a1.subalignment(
        positions_to_keep=None, invert_positions_to_keep=True)
    self.assertEqual(without_positions, nothing_left)
def test_position_frequencies_floating_point_precision(self):
    """A position with no variation must have a frequency of exactly 1.0."""
    # Test that a position with no variation yields a frequency of exactly
    # 1.0. Note that it is important to use self.assertEqual here instead
    # of self.assertAlmostEqual because we want to test for exactly 1.0. A
    # previous implementation of Alignment.position_frequencies added
    # (1 / sequence_count) for each occurrence of a character in a position
    # to compute the frequencies (see
    # https://github.com/biocore/scikit-bio/issues/801). In certain cases,
    # this yielded a frequency slightly less than 1.0 due to roundoff
    # error. The test case here uses an alignment of 10 sequences with no
    # variation at a position. This test case exposes the roundoff error
    # present in the previous implementation because 1/10 added 10 times
    # yields a number slightly less than 1.0. This occurs because 1/10
    # cannot be represented exactly as a floating point number.
    aln = Alignment([DNA('A', id=str(i)) for i in range(10)])
    expected = [defaultdict(float, {'A': 1.0})]
    self.assertEqual(aln.position_frequencies(), expected)
def test_position_frequencies_floating_point_precision(self):
    """A position with no variation must have a frequency of exactly 1.0."""
    # Test that a position with no variation yields a frequency of exactly
    # 1.0. Note that it is important to use self.assertEqual here instead
    # of self.assertAlmostEqual because we want to test for exactly 1.0. A
    # previous implementation of Alignment.position_frequencies added
    # (1 / sequence_count) for each occurrence of a character in a position
    # to compute the frequencies (see
    # https://github.com/biocore/scikit-bio/issues/801). In certain cases,
    # this yielded a frequency slightly less than 1.0 due to roundoff
    # error. The test case here uses an alignment of 10 sequences with no
    # variation at a position. This test case exposes the roundoff error
    # present in the previous implementation because 1/10 added 10 times
    # yields a number slightly less than 1.0. This occurs because 1/10
    # cannot be represented exactly as a floating point number.
    aln = Alignment(
        [DNA('A', metadata={'id': str(i)}) for i in range(10)])
    expected = [defaultdict(float, {'A': 1.0})]
    self.assertEqual(aln.position_frequencies(), expected)
def test_filter_gap_high_entropy_low(self):
    """Check ``filter_positions`` against four expected 'A-' sequences.

    The inputs are the ``maximum_gap_frequency_100`` and
    ``maximum_position_entropy_10`` fixtures.
    """
    result = filter_positions(self.alignment_with_gaps,
                              self.maximum_gap_frequency_100,
                              self.maximum_position_entropy_10)
    # Four identical 'A-' sequences with ids seq1 .. seq4.
    expected = Alignment([
        BiologicalSequence('A-', id="seq%d" % number)
        for number in range(1, 5)
    ])
    self.assertEqual(result, expected)
def test_majority_consensus(self):
    """Check ``majority_consensus``: empty, identical, no-tie, tie."""
    # empty cases
    self.assertEqual(self.empty.majority_consensus(), Sequence(''))
    self.assertEqual(self.no_positions.majority_consensus(), RNA(''))

    # alignment where all sequences are the same
    identical = Alignment([DNA('AG', metadata={'id': 'a'}),
                           DNA('AG', metadata={'id': 'b'})])
    self.assertEqual(identical.majority_consensus(), DNA('AG'))

    # no ties
    no_ties = Alignment([DNA('TTT', metadata={'id': "d1"}),
                         DNA('TT-', metadata={'id': "d2"}),
                         DNA('TC-', metadata={'id': "d3"})])
    self.assertEqual(no_ties.majority_consensus(), DNA('TT-'))

    # ties
    tied = Alignment([DNA('T', metadata={'id': "d1"}),
                      DNA('A', metadata={'id': "d2"})])
    self.assertIn(tied.majority_consensus(), [DNA('T'), DNA('A')])
def setUp(self):
    """Create the sequences and alignments shared by the tests."""
    # (id, sequence) tuples; the sequence objects are built from them.
    self.seqs1_t = [('d1', '..ACC-GTTGG..'),
                    ('d2', 'TTACCGGT-GGCC'),
                    ('d3', '.-ACC-GTTGC--')]
    self.seqs2_t = [('r1', 'UUAU-'),
                    ('r2', 'ACGUU')]

    self.seqs1 = [DNASequence(chars, id=seq_id)
                  for seq_id, chars in self.seqs1_t]
    self.seqs2 = [RNASequence(chars, id=seq_id)
                  for seq_id, chars in self.seqs2_t]
    self.d1, self.d2, self.d3 = self.seqs1
    self.r1, self.r2 = self.seqs2

    self.a1 = Alignment(self.seqs1)
    self.a2 = Alignment(self.seqs2)
    # Same sequences as a2, with a score and start/end positions set.
    self.a3 = Alignment(self.seqs2, score=42.0,
                        start_end_positions=[(0, 3), (5, 9)])
    self.a4 = Alignment(self.seqs2, score=-42.0,
                        start_end_positions=[(1, 4), (6, 10)])

    # no sequences
    self.empty = Alignment([])

    # sequences, but no positions
    self.no_positions = Alignment([RNA('', id='a'), RNA('', id='b')])
def test_majority_consensus(self):
    """Check ``majority_consensus``: empty, identical, no-tie, tie."""
    # empty cases
    empty_consensus = self.empty.majority_consensus()
    self.assertTrue(empty_consensus.equals(Sequence('')))
    no_pos_consensus = self.no_positions.majority_consensus()
    self.assertTrue(no_pos_consensus.equals(RNA('')))

    # alignment where all sequences are the same
    identical = Alignment([DNA('AG', id='a'), DNA('AG', id='b')])
    self.assertTrue(identical.majority_consensus().equals(DNA('AG')))

    # no ties
    no_ties = Alignment([DNA('TTT', id="d1"),
                         DNA('TT-', id="d2"),
                         DNA('TC-', id="d3")])
    self.assertTrue(no_ties.majority_consensus().equals(DNA('TT-')))

    # ties
    tied = Alignment([DNA('T', id="d1"), DNA('A', id="d2")])
    self.assertIn(tied.majority_consensus(), [DNA('T'), DNA('A')])
def test_traceback(self):
    """Check ``_traceback`` results against hand-written expectations.

    Each expected value is a 5-tuple: two lists of sequences followed by
    three integers.
    """
    score_m = np.array([[0, -5, -7, -9],
                        [-5, 2, -3, -5],
                        [-7, -3, 4, -1],
                        [-9, -5, -1, 6],
                        [-11, -7, -3, 1]])
    tback_m = np.array([[0, 3, 3, 3],
                        [2, 1, 3, 3],
                        [2, 2, 1, 3],
                        [2, 2, 2, 1],
                        [2, 2, 2, 2]])
    # start at bottom-right
    expected = ([BiologicalSequence("ACG-")],
                [BiologicalSequence("ACGT")],
                1, 0, 0)
    actual = _traceback(tback_m, score_m,
                        Alignment([DNA('ACG')]),
                        Alignment([DNA('ACGT')]),
                        4, 3)
    self.assertEqual(actual, expected)

    # four sequences in two alignments
    score_m = np.array([[0, -5, -7, -9],
                        [-5, 2, -3, -5],
                        [-7, -3, 4, -1],
                        [-9, -5, -1, 6],
                        [-11, -7, -3, 1]])
    tback_m = np.array([[0, 3, 3, 3],
                        [2, 1, 3, 3],
                        [2, 2, 1, 3],
                        [2, 2, 2, 1],
                        [2, 2, 2, 2]])
    # start at bottom-right
    expected = ([BiologicalSequence("ACG-"),
                 BiologicalSequence("ACG-")],
                [BiologicalSequence("ACGT"),
                 BiologicalSequence("ACGT")],
                1, 0, 0)
    actual = _traceback(tback_m, score_m,
                        Alignment([DNA('ACG', 's1'), DNA('ACG', 's2')]),
                        Alignment([DNA('ACGT', 's3'), DNA('ACGT', 's4')]),
                        4, 3)
    self.assertEqual(actual, expected)

    # start at highest-score
    expected = ([BiologicalSequence("ACG")],
                [BiologicalSequence("ACG")],
                6, 0, 0)
    actual = _traceback(tback_m, score_m,
                        Alignment([DNA('ACG')]),
                        Alignment([DNA('ACGT')]),
                        3, 3)
    self.assertEqual(actual, expected)

    # terminate traceback before top-right
    tback_m = np.array([[0, 3, 3, 3],
                        [2, 1, 3, 3],
                        [2, 2, 0, 3],
                        [2, 2, 2, 1],
                        [2, 2, 2, 2]])
    # (A plain-string tuple was assigned to ``expected`` here and
    # immediately overwritten; that dead assignment has been removed.)
    expected = ([BiologicalSequence("G")],
                [BiologicalSequence("G")],
                6, 2, 2)
    actual = _traceback(tback_m, score_m,
                        Alignment([DNA('ACG')]),
                        Alignment([DNA('ACGT')]),
                        3, 3)
    self.assertEqual(actual, expected)
def align_unaligned_seqs(seqs_fp, moltype=DNA, params=None, accurate=False):
    """Aligns unaligned sequences

    Parameters
    ----------
    seqs_fp : string
        file path of the input fasta file
    moltype : {skbio.DNA, skbio.RNA, skbio.Protein}
        Used as a key into ``MOLTYPE_MAP`` to pick the Mafft parameter to
        switch on, and passed to ``Alignment.read`` as ``constructor``.
    params : dict-like type
        It pass the additional parameter settings to the application.
        Default is None.
    accurate : boolean
        Perform accurate alignment or not. It will sacrifice performance
        if set to True. Default is False.

    Returns
    -------
    Alignment object
        The aligned sequences.

    See Also
    --------
    skbio.Alignment
    skbio.DNA
    skbio.RNA
    skbio.Protein
    """
    # Create Mafft app.
    app = Mafft(InputHandler='_input_as_path', params=params)

    # Turn on correct sequence type
    app.Parameters[MOLTYPE_MAP[moltype]].on()

    # Do not report progress
    app.Parameters['--quiet'].on()

    # More accurate alignment, sacrificing performance.
    if accurate:
        app.Parameters['--globalpair'].on()
        app.Parameters['--maxiterate'].Value = 1000

    # Run the app on the input file.
    res = app(seqs_fp)
    try:
        # Read the alignment from the application's standard output.
        alignment = Alignment.read(res['StdOut'], constructor=moltype)
    finally:
        # Clean up the result object even when reading fails.
        res.cleanUp()

    return alignment
def parse_deblur_output(seqs_fp, derep_clusters):
    """ Parse deblur output file into an OTU map.

    Parameters
    ----------
    seqs_fp: string
        file path to deblurred sequences
    derep_clusters: dictionary
        dictionary of dereplicated sequences map

    Returns
    -------
    clusters: dictionary
        dictionary of clusters including dereplicated sequence labels

    Raises
    ------
    ValueError
        If the part of a sequence label before the first ';' is not a key
        of derep_clusters.

    Notes
    -----
    For each deblurred sequence in seqs_fp, use the sequence label to
    obtain all dereplicated sequence labels belonging to it
    (from derep_clusters) to create entries in a new dictionary where the
    keys are actual sequences (not the labels). Note not all sequences
    in derep_clusters will be in seqs_fp since they could have been removed
    in the artifact filtering step.
    """
    clusters = {}
    # Replace representative sequence name with actual sequence in cluster
    msa_fa = Alignment.read(seqs_fp, format='fasta')
    for label, seq in Alignment.iteritems(msa_fa):
        seed_id = label.split(';')[0]
        # Key the output on the degapped sequence string; identical
        # sequences accumulate into the same list.
        members = clusters.setdefault(str(seq.degap()), [])
        if seed_id not in derep_clusters:
            raise ValueError(
                'Seed ID %s does not exist in .uc file' % seed_id)
        members.extend(derep_clusters[seed_id])
    return clusters
def test_update_ids_sequence_attributes_propagated(self):
    """Compare ``update_ids`` output with sequences built under new ids.

    The expected sequences carry the same description and quality values
    as the inputs; only the ``'id'`` metadata differs.
    """

    def _dna(chars, seq_id, description, quality):
        # Shorthand for a DNA sequence with id/description metadata and a
        # 'quality' positional-metadata column.
        return DNA(chars,
                   metadata={'id': seq_id, 'description': description},
                   positional_metadata={'quality': quality})

    # 1 seq
    exp_sc = Alignment([_dna('ACGT', "abc", 'desc', range(4))])
    exp_id_map = {'abc': 'seq1'}
    obj = Alignment([_dna('ACGT', "seq1", 'desc', range(4))])

    obs_sc, obs_id_map = obj.update_ids(ids=('abc',))
    self.assertEqual(obs_sc, exp_sc)
    self.assertEqual(obs_id_map, exp_id_map)

    # 2 seqs
    exp_sc = Alignment([
        _dna('ACGT', "abc", 'desc1', range(4)),
        _dna('TGCA', "def", 'desc2', range(4)[::-1]),
    ])
    exp_id_map = {'abc': 'seq1', 'def': 'seq2'}
    obj = Alignment([
        _dna('ACGT', "seq1", 'desc1', (0, 1, 2, 3)),
        _dna('TGCA', "seq2", 'desc2', (3, 2, 1, 0)),
    ])

    obs_sc, obs_id_map = obj.update_ids(ids=('abc', 'def'))
    self.assertEqual(obs_sc, exp_sc)
    self.assertEqual(obs_id_map, exp_id_map)
def reformat_treepuzzle(gene_tree,
                        species_tree,
                        gene_msa_fa_fp,
                        output_tree_fp,
                        output_msa_phy_fp):
    """ Write the input trees and alignment in the form Tree-Puzzle accepts

    Parameters
    ----------
    gene_tree: skbio.TreeNode
        TreeNode instance for gene tree
    species_tree: skbio.TreeNode
        TreeNode instance for species tree
    gene_msa_fa_fp: string
        file path to gene alignments in FASTA format
    output_tree_fp: string
        file path to output trees (Nexus format)
    output_msa_phy_fp: string
        file path to output MSA in PHYLIP format

    Notes
    -----
    Both trees are modified in place: the branch length of any root node
    is set to None, and the gene tree is additionally passed through
    trim_gene_tree_leaves before the two trees are joined.

    See Also
    --------
    skbio.TreeNode
    """
    # remove the root branch length (output with ALF); the gene tree is
    # processed first, then the species tree
    for tree in (gene_tree, species_tree):
        for node in tree.postorder():
            if node.is_root():
                node.length = None
    # trim gene tree leaves to exclude '_GENENAME' (if exists)
    trim_gene_tree_leaves(gene_tree)
    join_trees(gene_tree, species_tree, output_tree_fp)
    # trim FASTA sequence labels to exclude '/GENENAME' (if exists)
    gene_msa = Alignment.read(gene_msa_fa_fp, format='fasta')
    # update_ids also returns the new-to-old ID mapping, which is not needed
    relabelled_msa, _new_to_old_ids = gene_msa.update_ids(func=id_mapper)
    relabelled_msa.write(output_msa_phy_fp, format='phylip')
def setUp(self):
    # Three gapped DNA sequences of equal length
    self.seqs1 = [
        DNA('..ACC-GTTGG..', metadata={'id': "d1"}),
        DNA('TTACCGGT-GGCC', metadata={'id': "d2"}),
        DNA('.-ACC-GTTGC--', metadata={'id': "d3"}),
    ]
    self.d1, self.d2, self.d3 = self.seqs1

    # Two RNA sequences of equal length, the first ending in a gap
    self.seqs2 = [
        RNA('UUAU-', metadata={'id': "r1"}),
        RNA('ACGUU', metadata={'id': "r2"}),
    ]
    self.r1, self.r2 = self.seqs2

    self.a1 = Alignment(self.seqs1)
    self.a2 = Alignment(self.seqs2)

    # Same RNA sequences, with a score and start/end positions attached
    self.a3 = Alignment(
        self.seqs2, score=42.0, start_end_positions=[(0, 3), (5, 9)])
    self.a4 = Alignment(
        self.seqs2, score=-42.0, start_end_positions=[(1, 4), (6, 10)])

    # no sequences
    self.empty = Alignment([])

    # sequences, but no positions
    zero_length = [RNA('', metadata={'id': 'a'}),
                   RNA('', metadata={'id': 'b'})]
    self.no_positions = Alignment(zero_length)
class AlignmentTests(TestCase):
    """Unit tests for Alignment built from DNASequence/RNASequence objects.

    The fixtures created in setUp are a three-sequence gapped DNA alignment
    (a1), a two-sequence RNA alignment (a2), two scored variants of the RNA
    alignment (a3, a4) and an alignment without sequences (empty).
    """

    def setUp(self):
        """Create the sequences and alignments shared by the tests."""
        self.d1 = DNASequence('..ACC-GTTGG..', id="d1")
        self.d2 = DNASequence('TTACCGGT-GGCC', id="d2")
        self.d3 = DNASequence('.-ACC-GTTGC--', id="d3")

        self.r1 = RNASequence('UUAU-', id="r1")
        self.r2 = RNASequence('ACGUU', id="r2")

        self.seqs1 = [self.d1, self.d2, self.d3]
        self.seqs2 = [self.r1, self.r2]

        # (id, sequence) tuples mirroring seqs1 / seqs2
        self.seqs1_t = [('d1', '..ACC-GTTGG..'), ('d2', 'TTACCGGT-GGCC'),
                        ('d3', '.-ACC-GTTGC--')]
        self.seqs2_t = [('r1', 'UUAU-'), ('r2', 'ACGUU')]

        self.a1 = Alignment(self.seqs1)
        self.a2 = Alignment(self.seqs2)
        self.a3 = Alignment(self.seqs2, score=42.0,
                            start_end_positions=[(0, 3), (5, 9)])
        self.a4 = Alignment(self.seqs2, score=-42.0,
                            start_end_positions=[(1, 4), (6, 10)])
        self.empty = Alignment([])

    def test_degap(self):
        """degap returns a SequenceCollection without '.' and '-'."""
        expected = [(id_, seq.replace('.', '').replace('-', ''))
                    for id_, seq in self.seqs1_t]
        expected = SequenceCollection.from_fasta_records(expected,
                                                         DNASequence)
        actual = self.a1.degap()
        self.assertEqual(actual, expected)

        expected = [(id_, seq.replace('.', '').replace('-', ''))
                    for id_, seq in self.seqs2_t]
        expected = SequenceCollection.from_fasta_records(expected,
                                                         RNASequence)
        actual = self.a2.degap()
        self.assertEqual(actual, expected)

    def test_distances(self):
        """distances builds a DistanceMatrix, optionally with a custom
        distance function.
        """
        expected = [[0, 6. / 13, 4. / 13],
                    [6. / 13, 0, 7. / 13],
                    [4. / 13, 7. / 13, 0]]
        expected = DistanceMatrix(expected, ['d1', 'd2', 'd3'])
        actual = self.a1.distances()
        self.assertEqual(actual, expected)

        # alt distance function provided
        def dumb_distance(s1, s2):
            return 42.

        expected = [[0, 42., 42.],
                    [42., 0, 42.],
                    [42., 42., 0]]
        expected = DistanceMatrix(expected, ['d1', 'd2', 'd3'])
        actual = self.a1.distances(dumb_distance)
        self.assertEqual(actual, expected)

    def test_score(self):
        """score returns the value given at construction."""
        self.assertEqual(self.a3.score(), 42.0)
        self.assertEqual(self.a4.score(), -42.0)

    def test_start_end_positions(self):
        """start_end_positions returns the value given at construction."""
        self.assertEqual(self.a3.start_end_positions(), [(0, 3), (5, 9)])
        self.assertEqual(self.a4.start_end_positions(), [(1, 4), (6, 10)])

    def test_subalignment(self):
        """subalignment filters by sequence and/or position, with
        optional inversion of either filter.
        """
        # keep seqs by ids
        actual = self.a1.subalignment(seqs_to_keep=['d1', 'd3'])
        expected = Alignment([self.d1, self.d3])
        self.assertEqual(actual, expected)

        # keep seqs by indices
        actual = self.a1.subalignment(seqs_to_keep=[0, 2])
        expected = Alignment([self.d1, self.d3])
        self.assertEqual(actual, expected)

        # keep seqs by ids (invert)
        actual = self.a1.subalignment(seqs_to_keep=['d1', 'd3'],
                                      invert_seqs_to_keep=True)
        expected = Alignment([self.d2])
        self.assertEqual(actual, expected)

        # keep seqs by indices (invert)
        actual = self.a1.subalignment(seqs_to_keep=[0, 2],
                                      invert_seqs_to_keep=True)
        expected = Alignment([self.d2])
        self.assertEqual(actual, expected)

        # keep positions
        actual = self.a1.subalignment(positions_to_keep=[0, 2, 3])
        d1 = DNASequence('.AC', id="d1")
        d2 = DNASequence('TAC', id="d2")
        d3 = DNASequence('.AC', id="d3")
        expected = Alignment([d1, d2, d3])
        self.assertEqual(actual, expected)

        # keep positions (invert)
        actual = self.a1.subalignment(positions_to_keep=[0, 2, 3],
                                      invert_positions_to_keep=True)
        d1 = DNASequence('.C-GTTGG..', id="d1")
        d2 = DNASequence('TCGGT-GGCC', id="d2")
        d3 = DNASequence('-C-GTTGC--', id="d3")
        expected = Alignment([d1, d2, d3])
        self.assertEqual(actual, expected)

        # keep seqs and positions
        actual = self.a1.subalignment(seqs_to_keep=[0, 2],
                                      positions_to_keep=[0, 2, 3])
        d1 = DNASequence('.AC', id="d1")
        d3 = DNASequence('.AC', id="d3")
        expected = Alignment([d1, d3])
        self.assertEqual(actual, expected)

        # keep seqs and positions (invert)
        actual = self.a1.subalignment(seqs_to_keep=[0, 2],
                                      positions_to_keep=[0, 2, 3],
                                      invert_seqs_to_keep=True,
                                      invert_positions_to_keep=True)
        d2 = DNASequence('TCGGT-GGCC', id="d2")
        expected = Alignment([d2])
        self.assertEqual(actual, expected)

    def test_subalignment_filter_out_everything(self):
        """Inverting a None filter yields an empty alignment."""
        exp = Alignment([])

        # no sequences
        obs = self.a1.subalignment(seqs_to_keep=None,
                                   invert_seqs_to_keep=True)
        self.assertEqual(obs, exp)

        # no positions
        obs = self.a1.subalignment(positions_to_keep=None,
                                   invert_positions_to_keep=True)
        self.assertEqual(obs, exp)

    def test_init_validate(self):
        """Construction with validate=True rejects invalid input."""
        Alignment(self.seqs1, validate=True)

        # invalid DNA character
        invalid_seqs1 = [self.d1, self.d2, self.d3,
                         DNASequence('.-ACC-GTXGC--', id="i1")]
        self.assertRaises(SequenceCollectionError, Alignment,
                          invalid_seqs1, validate=True)

        # invalid lengths (they're not all equal)
        invalid_seqs2 = [self.d1, self.d2, self.d3,
                         DNASequence('.-ACC-GTGC--', id="i2")]
        self.assertRaises(SequenceCollectionError, Alignment,
                          invalid_seqs2, validate=True)

    def test_is_valid(self):
        """is_valid is False on length mismatch or invalid characters."""
        self.assertTrue(self.a1.is_valid())
        self.assertTrue(self.a2.is_valid())
        self.assertTrue(self.empty.is_valid())

        # invalid because of length mismatch
        d1 = DNASequence('..ACC-GTTGG..', id="d1")
        d2 = DNASequence('TTACCGGT-GGC', id="d2")
        self.assertFalse(Alignment([d1, d2]).is_valid())

        # invalid because of invalid characters
        d1 = DNASequence('..ACC-GTXGG..', id="d1")
        d2 = DNASequence('TTACCGGT-GGCC', id="d2")
        self.assertFalse(Alignment([d1, d2]).is_valid())

    def test_iter_positions(self):
        """iter_positions yields one list per alignment column."""
        actual = list(self.a2.iter_positions())
        expected = [[RNASequence(j) for j in i]
                    for i in ['UA', 'UC', 'AG', 'UU', '-U']]
        self.assertEqual(actual, expected)

        # a custom constructor is applied to every character
        actual = list(self.a2.iter_positions(constructor=str))
        expected = [list('UA'),
                    list('UC'),
                    list('AG'),
                    list('UU'),
                    list('-U')]
        self.assertEqual(actual, expected)

    def test_majority_consensus(self):
        """majority_consensus picks the most common character per column."""
        d1 = DNASequence('TTT', id="d1")
        d2 = DNASequence('TT-', id="d2")
        d3 = DNASequence('TC-', id="d3")
        a1 = Alignment([d1, d2, d3])
        self.assertEqual(a1.majority_consensus(), DNASequence('TT-'))

        # on a tie either character is an acceptable result
        d1 = DNASequence('T', id="d1")
        d2 = DNASequence('A', id="d2")
        a1 = Alignment([d1, d2])
        self.assertIn(a1.majority_consensus(),
                      [DNASequence('T'), DNASequence('A')])

        self.assertEqual(self.empty.majority_consensus(), '')

    def test_omit_gap_positions(self):
        """omit_gap_positions drops columns whose gap fraction exceeds
        the threshold.
        """
        # the only gapped column of a2 is 50% gaps: kept at 1.0 and 0.51
        expected = self.a2
        self.assertEqual(self.a2.omit_gap_positions(1.0), expected)
        self.assertEqual(self.a2.omit_gap_positions(0.51), expected)

        # ... and removed at 0.49 and 0.0
        r1 = RNASequence('UUAU', id="r1")
        r2 = RNASequence('ACGU', id="r2")
        expected = Alignment([r1, r2])
        self.assertEqual(self.a2.omit_gap_positions(0.49), expected)

        r1 = RNASequence('UUAU', id="r1")
        r2 = RNASequence('ACGU', id="r2")
        expected = Alignment([r1, r2])
        self.assertEqual(self.a2.omit_gap_positions(0.0), expected)

        self.assertEqual(self.empty.omit_gap_positions(0.0), self.empty)
        self.assertEqual(self.empty.omit_gap_positions(0.49), self.empty)
        self.assertEqual(self.empty.omit_gap_positions(1.0), self.empty)

    def test_omit_gap_sequences(self):
        """omit_gap_sequences drops sequences whose gap fraction exceeds
        the threshold.
        """
        # r1 is 1/5 gaps: kept at 1.0 and 0.20, removed at 0.19
        expected = self.a2
        self.assertEqual(self.a2.omit_gap_sequences(1.0), expected)
        self.assertEqual(self.a2.omit_gap_sequences(0.20), expected)

        expected = Alignment([self.r2])
        self.assertEqual(self.a2.omit_gap_sequences(0.19), expected)

        self.assertEqual(self.empty.omit_gap_sequences(0.0), self.empty)
        self.assertEqual(self.empty.omit_gap_sequences(0.2), self.empty)
        self.assertEqual(self.empty.omit_gap_sequences(1.0), self.empty)

    def test_position_counters(self):
        """position_counters returns one Counter per column."""
        expected = [Counter({'U': 1, 'A': 1}),
                    Counter({'U': 1, 'C': 1}),
                    Counter({'A': 1, 'G': 1}),
                    Counter({'U': 2}),
                    Counter({'-': 1, 'U': 1})]
        self.assertEqual(self.a2.position_counters(), expected)

        self.assertEqual(self.empty.position_counters(), [])

    def test_position_frequencies(self):
        """position_frequencies returns per-column character frequencies."""
        expected = [defaultdict(int, {'U': 0.5, 'A': 0.5}),
                    defaultdict(int, {'U': 0.5, 'C': 0.5}),
                    defaultdict(int, {'A': 0.5, 'G': 0.5}),
                    defaultdict(int, {'U': 1.0}),
                    defaultdict(int, {'-': 0.5, 'U': 0.5})]
        self.assertEqual(self.a2.position_frequencies(), expected)

        self.assertEqual(self.empty.position_frequencies(), [])

    def test_position_entropies(self):
        """position_entropies returns per-column entropies (nan for the
        gapped column).

        Expected values were calculated as described in this post:
        http://stackoverflow.com/a/15476958/3424666
        """
        expected = [0.69314, 0.69314, 0.69314, 0.0, np.nan]
        np.testing.assert_almost_equal(self.a2.position_entropies(),
                                       expected, 5)

        expected = [1.0, 1.0, 1.0, 0.0, np.nan]
        np.testing.assert_almost_equal(
            self.a2.position_entropies(base=2), expected, 5)

        np.testing.assert_almost_equal(
            self.empty.position_entropies(base=2), [])

    def test_k_word_frequencies(self):
        """k_word_frequencies with k=1 returns one mapping per sequence."""
        # float literals keep the expectation independent of the
        # division semantics in effect for this module
        expected = [defaultdict(int, {'U': 3. / 5, 'A': 1. / 5,
                                      '-': 1. / 5}),
                    defaultdict(int, {'A': 1. / 5, 'C': 1. / 5,
                                      'G': 1. / 5, 'U': 2. / 5})]
        actual = self.a2.k_word_frequencies(k=1)
        for a, e in zip(actual, expected):
            # same characters observed ...
            self.assertEqual(sorted(a), sorted(e))
            # ... with the same frequencies, to 5 decimal places
            np.testing.assert_almost_equal(sorted(a.values()),
                                           sorted(e.values()), 5)

    def test_sequence_length(self):
        """sequence_length is the number of columns (0 when empty)."""
        self.assertEqual(self.a1.sequence_length(), 13)
        self.assertEqual(self.a2.sequence_length(), 5)
        self.assertEqual(self.empty.sequence_length(), 0)

    def test_to_phylip(self):
        """to_phylip without label mapping keeps the original ids."""
        d1 = DNASequence('..ACC-GTTGG..', id="d1")
        d2 = DNASequence('TTACCGGT-GGCC', id="d2")
        d3 = DNASequence('.-ACC-GTTGC--', id="d3")
        a = Alignment([d1, d2, d3])

        phylip_str, id_map = a.to_phylip(map_labels=False)
        self.assertEqual(id_map, {'d1': 'd1',
                                  'd3': 'd3',
                                  'd2': 'd2'})
        expected = "\n".join(["3 13",
                              "d1 ..ACC-GTTGG..",
                              "d2 TTACCGGT-GGCC",
                              "d3 .-ACC-GTTGC--"])
        self.assertEqual(phylip_str, expected)

    def test_to_phylip_map_labels(self):
        """to_phylip with label mapping numbers the labels after the
        given prefix.
        """
        d1 = DNASequence('..ACC-GTTGG..', id="d1")
        d2 = DNASequence('TTACCGGT-GGCC', id="d2")
        d3 = DNASequence('.-ACC-GTTGC--', id="d3")
        a = Alignment([d1, d2, d3])

        phylip_str, id_map = a.to_phylip(map_labels=True, label_prefix="s")
        self.assertEqual(id_map, {'s1': 'd1',
                                  's3': 'd3',
                                  's2': 'd2'})
        expected = "\n".join(["3 13",
                              "s1 ..ACC-GTTGG..",
                              "s2 TTACCGGT-GGCC",
                              "s3 .-ACC-GTTGC--"])
        self.assertEqual(phylip_str, expected)

    def test_to_phylip_unequal_sequence_lengths(self):
        """to_phylip raises when sequence lengths differ."""
        d1 = DNASequence('A-CT', id="d1")
        d2 = DNASequence('TTA', id="d2")
        d3 = DNASequence('.-AC', id="d3")
        a = Alignment([d1, d2, d3])

        with self.assertRaises(SequenceCollectionError):
            a.to_phylip()

    def test_to_phylip_no_sequences(self):
        """to_phylip raises on an alignment without sequences."""
        with self.assertRaises(SequenceCollectionError):
            Alignment([]).to_phylip()

    def test_to_phylip_no_positions(self):
        """to_phylip raises on an alignment of zero-length sequences."""
        d1 = DNASequence('', id="d1")
        d2 = DNASequence('', id="d2")
        a = Alignment([d1, d2])

        with self.assertRaises(SequenceCollectionError):
            a.to_phylip()

    def test_validate_lengths(self):
        """_validate_lengths is False only when sequence lengths differ."""
        self.assertTrue(self.a1._validate_lengths())
        self.assertTrue(self.a2._validate_lengths())
        self.assertTrue(self.empty._validate_lengths())

        self.assertTrue(Alignment([
            DNASequence('TTT', id="d1")])._validate_lengths())
        self.assertFalse(Alignment([
            DNASequence('TTT', id="d1"),
            DNASequence('TT', id="d2")])._validate_lengths())
class AlignmentTests(TestCase):
    """Unit tests for Alignment built from DNA/RNA objects with metadata.

    The fixtures created in setUp are a three-sequence gapped DNA alignment
    (a1), a two-sequence RNA alignment (a2), two scored variants of the RNA
    alignment (a3, a4), an alignment without sequences (empty) and an
    alignment of zero-length sequences (no_positions).
    """

    def setUp(self):
        self.d1 = DNA('..ACC-GTTGG..', metadata={'id': "d1"})
        self.d2 = DNA('TTACCGGT-GGCC', metadata={'id': "d2"})
        self.d3 = DNA('.-ACC-GTTGC--', metadata={'id': "d3"})

        self.r1 = RNA('UUAU-', metadata={'id': "r1"})
        self.r2 = RNA('ACGUU', metadata={'id': "r2"})

        self.seqs1 = [self.d1, self.d2, self.d3]
        self.seqs2 = [self.r1, self.r2]

        self.a1 = Alignment(self.seqs1)
        self.a2 = Alignment(self.seqs2)
        self.a3 = Alignment(self.seqs2, score=42.0,
                            start_end_positions=[(0, 3), (5, 9)])
        self.a4 = Alignment(self.seqs2, score=-42.0,
                            start_end_positions=[(1, 4), (6, 10)])

        # no sequences
        self.empty = Alignment([])

        # sequences, but no positions
        self.no_positions = Alignment([RNA('', metadata={'id': 'a'}),
                                       RNA('', metadata={'id': 'b'})])

    def test_degap(self):
        # degap returns a SequenceCollection with '.' and '-' removed and
        # the ids preserved
        expected = SequenceCollection([
            DNA('ACCGTTGG', metadata={'id': "d1"}),
            DNA('TTACCGGTGGCC', metadata={'id': "d2"}),
            DNA('ACCGTTGC', metadata={'id': "d3"})])
        actual = self.a1.degap()
        self.assertEqual(actual, expected)

        expected = SequenceCollection([
            RNA('UUAU', metadata={'id': "r1"}),
            RNA('ACGUU', metadata={'id': "r2"})])
        actual = self.a2.degap()
        self.assertEqual(actual, expected)

    def test_distances(self):
        # default distance function
        expected = [[0, 6. / 13, 4. / 13],
                    [6. / 13, 0, 7. / 13],
                    [4. / 13, 7. / 13, 0]]
        expected = DistanceMatrix(expected, ['d1', 'd2', 'd3'])
        actual = self.a1.distances()
        self.assertEqual(actual, expected)

        # alt distance function provided
        def dumb_distance(s1, s2):
            return 42.

        expected = [[0, 42., 42.],
                    [42., 0, 42.],
                    [42., 42., 0]]
        expected = DistanceMatrix(expected, ['d1', 'd2', 'd3'])
        actual = self.a1.distances(dumb_distance)
        self.assertEqual(actual, expected)

    def test_score(self):
        # score is returned exactly as given at construction
        self.assertEqual(self.a3.score(), 42.0)
        self.assertEqual(self.a4.score(), -42.0)

    def test_start_end_positions(self):
        # start/end positions are returned exactly as given at construction
        self.assertEqual(self.a3.start_end_positions(), [(0, 3), (5, 9)])
        self.assertEqual(self.a4.start_end_positions(), [(1, 4), (6, 10)])

    def test_subalignment(self):
        # keep seqs by ids
        actual = self.a1.subalignment(seqs_to_keep=['d1', 'd3'])
        expected = Alignment([self.d1, self.d3])
        self.assertEqual(actual, expected)

        # keep seqs by indices
        actual = self.a1.subalignment(seqs_to_keep=[0, 2])
        expected = Alignment([self.d1, self.d3])
        self.assertEqual(actual, expected)

        # keep seqs by ids (invert)
        actual = self.a1.subalignment(seqs_to_keep=['d1', 'd3'],
                                      invert_seqs_to_keep=True)
        expected = Alignment([self.d2])
        self.assertEqual(actual, expected)

        # keep seqs by indices (invert)
        actual = self.a1.subalignment(seqs_to_keep=[0, 2],
                                      invert_seqs_to_keep=True)
        expected = Alignment([self.d2])
        self.assertEqual(actual, expected)

        # keep positions
        actual = self.a1.subalignment(positions_to_keep=[0, 2, 3])
        d1 = DNA('.AC', metadata={'id': "d1"})
        d2 = DNA('TAC', metadata={'id': "d2"})
        d3 = DNA('.AC', metadata={'id': "d3"})
        expected = Alignment([d1, d2, d3])
        self.assertEqual(actual, expected)

        # keep positions (invert)
        actual = self.a1.subalignment(positions_to_keep=[0, 2, 3],
                                      invert_positions_to_keep=True)
        d1 = DNA('.C-GTTGG..', metadata={'id': "d1"})
        d2 = DNA('TCGGT-GGCC', metadata={'id': "d2"})
        d3 = DNA('-C-GTTGC--', metadata={'id': "d3"})
        expected = Alignment([d1, d2, d3])
        self.assertEqual(actual, expected)

        # keep seqs and positions
        actual = self.a1.subalignment(seqs_to_keep=[0, 2],
                                      positions_to_keep=[0, 2, 3])
        d1 = DNA('.AC', metadata={'id': "d1"})
        d3 = DNA('.AC', metadata={'id': "d3"})
        expected = Alignment([d1, d3])
        self.assertEqual(actual, expected)

        # keep seqs and positions (invert)
        actual = self.a1.subalignment(seqs_to_keep=[0, 2],
                                      positions_to_keep=[0, 2, 3],
                                      invert_seqs_to_keep=True,
                                      invert_positions_to_keep=True)
        d2 = DNA('TCGGT-GGCC', metadata={'id': "d2"})
        expected = Alignment([d2])
        self.assertEqual(actual, expected)

    def test_subalignment_filter_out_everything(self):
        exp = Alignment([])

        # no sequences
        obs = self.a1.subalignment(seqs_to_keep=None,
                                   invert_seqs_to_keep=True)
        self.assertEqual(obs, exp)

        # no positions
        obs = self.a1.subalignment(positions_to_keep=None,
                                   invert_positions_to_keep=True)
        self.assertEqual(obs, exp)

    def test_init_not_equal_lengths(self):
        # the extra sequence is one character shorter than the others
        invalid_seqs = [self.d1, self.d2, self.d3,
                        DNA('.-ACC-GTGC--', metadata={'id': "i2"})]
        self.assertRaises(AlignmentError, Alignment,
                          invalid_seqs)

    def test_init_equal_lengths(self):
        # passes as long as construction does not raise
        seqs = [self.d1, self.d2, self.d3]
        Alignment(seqs)

    def test_iter_positions(self):
        # by default every column is a list of single-character sequences
        # carrying the id of the sequence they came from
        actual = list(self.a2.iter_positions())
        expected = [
            [RNA('U', metadata={'id': 'r1'}),
             RNA('A', metadata={'id': 'r2'})],
            [RNA('U', metadata={'id': 'r1'}),
             RNA('C', metadata={'id': 'r2'})],
            [RNA('A', metadata={'id': 'r1'}),
             RNA('G', metadata={'id': 'r2'})],
            [RNA('U', metadata={'id': 'r1'}),
             RNA('U', metadata={'id': 'r2'})],
            [RNA('-', metadata={'id': 'r1'}),
             RNA('U', metadata={'id': 'r2'})]
        ]
        self.assertEqual(actual, expected)

        # a custom constructor is applied to every character
        actual = list(self.a2.iter_positions(constructor=str))
        expected = [list('UA'),
                    list('UC'),
                    list('AG'),
                    list('UU'),
                    list('-U')]
        self.assertEqual(actual, expected)

    def test_majority_consensus(self):
        # empty cases
        self.assertEqual(
            self.empty.majority_consensus(), Sequence(''))
        self.assertEqual(
            self.no_positions.majority_consensus(), RNA(''))

        # alignment where all sequences are the same
        aln = Alignment([DNA('AG', metadata={'id': 'a'}),
                         DNA('AG', metadata={'id': 'b'})])
        self.assertEqual(aln.majority_consensus(), DNA('AG'))

        # no ties
        d1 = DNA('TTT', metadata={'id': "d1"})
        d2 = DNA('TT-', metadata={'id': "d2"})
        d3 = DNA('TC-', metadata={'id': "d3"})
        a1 = Alignment([d1, d2, d3])
        self.assertEqual(a1.majority_consensus(), DNA('TT-'))

        # ties (either character is an acceptable result)
        d1 = DNA('T', metadata={'id': "d1"})
        d2 = DNA('A', metadata={'id': "d2"})
        a1 = Alignment([d1, d2])
        self.assertIn(a1.majority_consensus(), [DNA('T'), DNA('A')])

    def test_omit_gap_positions(self):
        # the only gapped column of a2 is 50% gaps: kept at 1.0 and 0.51
        expected = self.a2
        self.assertEqual(self.a2.omit_gap_positions(1.0), expected)
        self.assertEqual(self.a2.omit_gap_positions(0.51), expected)

        # ... and removed at 0.49 and 0.0
        r1 = RNA('UUAU', metadata={'id': "r1"})
        r2 = RNA('ACGU', metadata={'id': "r2"})
        expected = Alignment([r1, r2])
        self.assertEqual(self.a2.omit_gap_positions(0.49), expected)

        r1 = RNA('UUAU', metadata={'id': "r1"})
        r2 = RNA('ACGU', metadata={'id': "r2"})
        expected = Alignment([r1, r2])
        self.assertEqual(self.a2.omit_gap_positions(0.0), expected)

        self.assertEqual(self.empty.omit_gap_positions(0.0), self.empty)
        self.assertEqual(self.empty.omit_gap_positions(0.49), self.empty)
        self.assertEqual(self.empty.omit_gap_positions(1.0), self.empty)

        # Test to ensure floating point precision bug isn't present. See the
        # tests for Alignment.position_frequencies for more details.
        seqs = [DNA('-.', metadata={'id': str(i)}) for i in range(33)]
        aln = Alignment(seqs)
        self.assertEqual(
            aln.omit_gap_positions(1 - np.finfo(float).eps),
            Alignment([DNA('', metadata={'id': str(i)})
                       for i in range(33)]))

    def test_omit_gap_sequences(self):
        # r1 is 1/5 gaps: kept at 1.0 and 0.20, removed at 0.19
        expected = self.a2
        self.assertEqual(self.a2.omit_gap_sequences(1.0), expected)
        self.assertEqual(self.a2.omit_gap_sequences(0.20), expected)

        expected = Alignment([self.r2])
        self.assertEqual(self.a2.omit_gap_sequences(0.19), expected)

        self.assertEqual(self.empty.omit_gap_sequences(0.0), self.empty)
        self.assertEqual(self.empty.omit_gap_sequences(0.2), self.empty)
        self.assertEqual(self.empty.omit_gap_sequences(1.0), self.empty)

        # Test to ensure floating point precision bug isn't present. See the
        # tests for Alignment.position_frequencies for more details.
        aln = Alignment([DNA('.' * 33, metadata={'id': 'abc'}),
                         DNA('-' * 33, metadata={'id': 'def'})])
        self.assertEqual(aln.omit_gap_sequences(1 - np.finfo(float).eps),
                         Alignment([]))

    def test_position_counters(self):
        # no sequences and no positions both give an empty list
        self.assertEqual(self.empty.position_counters(), [])

        self.assertEqual(self.no_positions.position_counters(), [])

        # otherwise one Counter per column
        expected = [Counter({'U': 1, 'A': 1}),
                    Counter({'U': 1, 'C': 1}),
                    Counter({'A': 1, 'G': 1}),
                    Counter({'U': 2}),
                    Counter({'-': 1, 'U': 1})]
        self.assertEqual(self.a2.position_counters(), expected)

    def test_position_frequencies(self):
        # no sequences and no positions both give an empty list
        self.assertEqual(self.empty.position_frequencies(), [])

        self.assertEqual(self.no_positions.position_frequencies(), [])

        # otherwise one character -> frequency mapping per column
        expected = [defaultdict(float, {'U': 0.5, 'A': 0.5}),
                    defaultdict(float, {'U': 0.5, 'C': 0.5}),
                    defaultdict(float, {'A': 0.5, 'G': 0.5}),
                    defaultdict(float, {'U': 1.0}),
                    defaultdict(float, {'-': 0.5, 'U': 0.5})]
        self.assertEqual(self.a2.position_frequencies(), expected)

    def test_position_frequencies_floating_point_precision(self):
        # Test that a position with no variation yields a frequency of exactly
        # 1.0. Note that it is important to use self.assertEqual here instead
        # of self.assertAlmostEqual because we want to test for exactly 1.0. A
        # previous implementation of Alignment.position_frequencies added
        # (1 / sequence_count) for each occurrence of a character in a position
        # to compute the frequencies (see
        # https://github.com/biocore/scikit-bio/issues/801). In certain cases,
        # this yielded a frequency slightly less than 1.0 due to roundoff
        # error. The test case here uses an alignment of 10 sequences with no
        # variation at a position. This test case exposes the roundoff error
        # present in the previous implementation because 1/10 added 10 times
        # yields a number slightly less than 1.0. This occurs because 1/10
        # cannot be represented exactly as a floating point number.
        seqs = [DNA('A', metadata={'id': str(i)}) for i in range(10)]
        aln = Alignment(seqs)
        self.assertEqual(aln.position_frequencies(),
                         [defaultdict(float, {'A': 1.0})])

    def test_position_entropies(self):
        # tested by calculating values as described in this post:
        #  http://stackoverflow.com/a/15476958/3424666
        expected = [0.69314, 0.69314, 0.69314, 0.0, np.nan]
        np.testing.assert_almost_equal(self.a2.position_entropies(),
                                       expected, 5)

        expected = [1.0, 1.0, 1.0, 0.0, np.nan]
        np.testing.assert_almost_equal(
            self.a2.position_entropies(base=2), expected, 5)

        np.testing.assert_almost_equal(
            self.empty.position_entropies(base=2), [])

    def test_kmer_frequencies(self):
        # float literals keep the expectation independent of the
        # division semantics in effect for this module
        expected = [defaultdict(float, {'U': 3. / 5, 'A': 1. / 5,
                                        '-': 1. / 5}),
                    defaultdict(float, {'A': 1. / 5, 'C': 1. / 5,
                                        'G': 1. / 5, 'U': 2. / 5})]
        actual = self.a2.kmer_frequencies(k=1, relative=True)
        for a, e in zip(actual, expected):
            # same characters observed ...
            self.assertEqual(sorted(a), sorted(e))
            # ... with the same frequencies, to 5 decimal places
            np.testing.assert_almost_equal(sorted(a.values()),
                                           sorted(e.values()), 5)

    def test_sequence_length(self):
        # number of columns; 0 for an alignment without sequences
        self.assertEqual(self.a1.sequence_length(), 13)
        self.assertEqual(self.a2.sequence_length(), 5)
        self.assertEqual(self.empty.sequence_length(), 0)

    def test_validate_lengths(self):
        # every alignment that could be constructed reports valid lengths
        self.assertTrue(self.a1._validate_lengths())
        self.assertTrue(self.a2._validate_lengths())
        self.assertTrue(self.empty._validate_lengths())

        self.assertTrue(Alignment([
            DNA('TTT', metadata={'id': "d1"})])._validate_lengths())