def test_to_phylip_no_positions(self):
    """to_phylip raises when the sequences contain no positions"""
    zero_length_seqs = [DNASequence('', id="d1"), DNASequence('', id="d2")]
    aln = Alignment(zero_length_seqs)
    self.assertRaises(SequenceCollectionError, aln.to_phylip)
def __call__(self, seq_path, result_path=None, log_path=None,
             failure_path=None):
    """Align the candidate sequences in seq_path against the template.

    seq_path: path to the fasta file of candidate sequences
    result_path: if provided, the aligned sequences are written to this
     path as fasta and None is returned; otherwise the Alignment of
     aligned sequences is returned
    log_path: path passed to NastLogger
    failure_path: if provided, the sequences that failed to align are
     written to this path as fasta

    The template path, minimum percent identity, minimum length and
    pairwise alignment method are all read from self.Params.
    """
    # The candidate parser may read lazily, so the input file has to
    # stay open until pynast_seqs has consumed it.
    with open(seq_path, 'U') as seq_file:
        # load candidate sequences
        candidate_sequences = parse_fasta(seq_file)

        # load template sequences
        template_alignment_fp = self.Params['template_filepath']
        with open(template_alignment_fp) as template_file:
            # replace '.' characters with '-' characters
            template_records = [
                (seq_id, seq.replace('.', '-').upper())
                for seq_id, seq in parse_fasta(template_file)]
        template_alignment = Alignment.from_fasta_records(
            template_records, DNASequence, validate=True)

        # initialize_logger
        logger = NastLogger(log_path)

        # get function for pairwise alignment method
        pairwise_alignment_f = pairwise_alignment_methods[
            self.Params['pairwise_alignment_method']]

        pynast_aligned, pynast_failed = pynast_seqs(
            candidate_sequences,
            template_alignment,
            min_pct=self.Params['min_pct'],
            min_len=self.Params['min_len'],
            align_unaligned_seqs_f=pairwise_alignment_f,
            logger=logger,
            temp_dir=get_qiime_temp_dir())

    logger.record(str(self))

    # convert the PyNAST results into scikit-bio sequence objects; new
    # lists are built rather than rewriting the lists being iterated
    pynast_failed = SequenceCollection(
        [DNASequence(str(seq), id=seq.Name) for seq in pynast_failed])
    pynast_aligned = Alignment(
        [DNASequence(str(seq), id=seq.Name) for seq in pynast_aligned])

    if failure_path is not None:
        with open(failure_path, 'w') as fail_file:
            fail_file.write(pynast_failed.to_fasta())

    if result_path is not None:
        with open(result_path, 'w') as result_file:
            result_file.write(pynast_aligned.to_fasta())
        return None

    return pynast_aligned
def test_to_phylip_unequal_sequence_lengths(self):
    """to_phylip raises when the sequences differ in length"""
    ragged_seqs = [
        DNASequence('A-CT', id="d1"),
        DNASequence('TTA', id="d2"),
        DNASequence('.-AC', id="d3"),
    ]
    aln = Alignment(ragged_seqs)
    self.assertRaises(SequenceCollectionError, aln.to_phylip)
def __call__(self, seq_path, result_path=None, log_path=None,
             failure_path=None):
    """Align the candidate sequences in seq_path against the template.

    seq_path: path to the fasta file of candidate sequences
    result_path: if provided, the aligned sequences are written to this
     path as fasta and None is returned; otherwise the Alignment of
     aligned sequences is returned
    log_path: path passed to NastLogger
    failure_path: if provided, the sequences that failed to align are
     written to this path as fasta

    The template path, minimum percent identity, minimum length and
    pairwise alignment method are all read from self.Params.
    """
    # The candidate parser may read lazily, so the input file has to
    # stay open until pynast_seqs has consumed it.
    with open(seq_path, 'U') as seq_file:
        # load candidate sequences
        candidate_sequences = parse_fasta(seq_file)

        # load template sequences
        template_alignment_fp = self.Params['template_filepath']
        with open(template_alignment_fp) as template_file:
            # replace '.' characters with '-' characters
            template_records = [
                (seq_id, seq.replace('.', '-').upper())
                for seq_id, seq in parse_fasta(template_file)]
        template_alignment = Alignment.from_fasta_records(
            template_records, DNASequence, validate=True)

        # initialize_logger
        logger = NastLogger(log_path)

        # get function for pairwise alignment method
        pairwise_alignment_f = pairwise_alignment_methods[
            self.Params['pairwise_alignment_method']]

        pynast_aligned, pynast_failed = pynast_seqs(
            candidate_sequences,
            template_alignment,
            min_pct=self.Params['min_pct'],
            min_len=self.Params['min_len'],
            align_unaligned_seqs_f=pairwise_alignment_f,
            logger=logger,
            temp_dir=get_qiime_temp_dir())

    logger.record(str(self))

    # convert the PyNAST results into scikit-bio sequence objects; new
    # lists are built rather than rewriting the lists being iterated
    pynast_failed = SequenceCollection(
        [DNASequence(str(seq), identifier=seq.Name)
         for seq in pynast_failed])
    pynast_aligned = Alignment(
        [DNASequence(str(seq), identifier=seq.Name)
         for seq in pynast_aligned])

    if failure_path is not None:
        with open(failure_path, 'w') as fail_file:
            fail_file.write(pynast_failed.to_fasta())

    if result_path is not None:
        with open(result_path, 'w') as result_file:
            result_file.write(pynast_aligned.to_fasta())
        return None

    return pynast_aligned
def test_validate_lengths(self):
    """_validate_lengths reports whether all sequence lengths match"""
    # the fixture alignments (including the empty one) are consistent
    for aln in (self.a1, self.a2, self.empty):
        self.assertTrue(aln._validate_lengths())

    # a single sequence trivially has matching lengths
    single = Alignment([DNASequence('TTT', id="d1")])
    self.assertTrue(single._validate_lengths())

    # two sequences of different length do not
    ragged = Alignment([DNASequence('TTT', id="d1"),
                        DNASequence('TT', id="d2")])
    self.assertFalse(ragged._validate_lengths())
def test_to_phylip(self):
    """to_phylip returns the phylip string and an identity id map
    """
    seqs = [
        DNASequence('..ACC-GTTGG..', id="d1"),
        DNASequence('TTACCGGT-GGCC', id="d2"),
        DNASequence('.-ACC-GTTGC--', id="d3"),
    ]
    aln = Alignment(seqs)

    phylip_str, id_map = aln.to_phylip(map_labels=False)

    # without label mapping every id maps to itself
    self.assertEqual(id_map, {'d1': 'd1', 'd3': 'd3', 'd2': 'd2'})
    expected_lines = [
        "3 13",
        "d1 ..ACC-GTTGG..",
        "d2 TTACCGGT-GGCC",
        "d3 .-ACC-GTTGC--",
    ]
    self.assertEqual(phylip_str, "\n".join(expected_lines))
def test_to_phylip_map_labels(self):
    """to_phylip relabels sequences and reports the mapping when asked
    """
    seqs = [
        DNASequence('..ACC-GTTGG..', id="d1"),
        DNASequence('TTACCGGT-GGCC', id="d2"),
        DNASequence('.-ACC-GTTGC--', id="d3"),
    ]
    aln = Alignment(seqs)

    phylip_str, id_map = aln.to_phylip(map_labels=True, label_prefix="s")

    # new labels are the prefix plus a 1-based counter
    self.assertEqual(id_map, {'s1': 'd1', 's3': 'd3', 's2': 'd2'})
    expected_lines = [
        "3 13",
        "s1 ..ACC-GTTGG..",
        "s2 TTACCGGT-GGCC",
        "s3 .-ACC-GTTGC--",
    ]
    self.assertEqual(phylip_str, "\n".join(expected_lines))
def test_subalignment(self):
    """subalignment filters by sequence and/or position, with inversion
    """
    # sequences may be selected by id or by index
    for keep in (['d1', 'd3'], [0, 2]):
        obs = self.a1.subalignment(seqs_to_keep=keep)
        exp = Alignment([self.d1, self.d3])
        self.assertEqual(obs, exp)

    # inverting the sequence selection keeps everything else
    for keep in (['d1', 'd3'], [0, 2]):
        obs = self.a1.subalignment(seqs_to_keep=keep,
                                   invert_seqs_to_keep=True)
        exp = Alignment([self.d2])
        self.assertEqual(obs, exp)

    # keep positions
    obs = self.a1.subalignment(positions_to_keep=[0, 2, 3])
    exp = Alignment([
        DNASequence('.AC', id="d1"),
        DNASequence('TAC', id="d2"),
        DNASequence('.AC', id="d3"),
    ])
    self.assertEqual(obs, exp)

    # keep positions (invert)
    obs = self.a1.subalignment(positions_to_keep=[0, 2, 3],
                               invert_positions_to_keep=True)
    exp = Alignment([
        DNASequence('.C-GTTGG..', id="d1"),
        DNASequence('TCGGT-GGCC', id="d2"),
        DNASequence('-C-GTTGC--', id="d3"),
    ])
    self.assertEqual(obs, exp)

    # keep seqs and positions
    obs = self.a1.subalignment(seqs_to_keep=[0, 2],
                               positions_to_keep=[0, 2, 3])
    exp = Alignment([
        DNASequence('.AC', id="d1"),
        DNASequence('.AC', id="d3"),
    ])
    self.assertEqual(obs, exp)

    # keep seqs and positions (invert)
    obs = self.a1.subalignment(seqs_to_keep=[0, 2],
                               positions_to_keep=[0, 2, 3],
                               invert_seqs_to_keep=True,
                               invert_positions_to_keep=True)
    exp = Alignment([DNASequence('TCGGT-GGCC', id="d2")])
    self.assertEqual(obs, exp)
def test_is_valid(self):
    """is_valid accepts the fixtures and rejects malformed alignments
    """
    for aln in (self.a1, self.a2, self.empty):
        self.assertTrue(aln.is_valid())

    # invalid because of length mismatch
    short_second = Alignment([
        DNASequence('..ACC-GTTGG..', id="d1"),
        DNASequence('TTACCGGT-GGC', id="d2"),
    ])
    self.assertFalse(short_second.is_valid())

    # invalid because of invalid characters
    bad_character = Alignment([
        DNASequence('..ACC-GTXGG..', id="d1"),
        DNASequence('TTACCGGT-GGCC', id="d2"),
    ])
    self.assertFalse(bad_character.is_valid())
def generate_lane_mask(infile, entropy_threshold, existing_mask=None):
    """ Generates lane mask dynamically by calculating base frequencies

    infile: open file object for aligned fasta file
    entropy_threshold: float value that designates the percentage of
     entropic positions to be removed, i.e., 0.10 means the 10% most
     entropic positions are removed.
    existing_mask: accepted for interface compatibility; this function
     does not read it.

    Returns a string with one character per alignment position: "0" when
    the position's entropy is at or above the cutoff (and is not the
    lowest entropy observed), "1" otherwise.
    """
    aln = Alignment.from_fasta_records(parse_fasta(infile), DNA)
    uncertainty = aln.position_entropies(nan_on_non_standard_chars=False)

    uncertainty_sorted = sorted(uncertainty)
    cutoff_index = int(
        round((len(uncertainty_sorted) - 1) * (1 - entropy_threshold)))
    max_uncertainty = uncertainty_sorted[cutoff_index]

    # This correction is for small datasets with a small possible number of
    # uncertainty values.
    highest_certainty = min(uncertainty_sorted)

    # join once instead of growing the string one character at a time
    return "".join(
        "0" if (entropy >= max_uncertainty and
                entropy != highest_certainty) else "1"
        for entropy in uncertainty)
def getResult(self, aln_path, *args, **kwargs):
    """Returns alignment from sequences.

    Currently does not allow parameter tuning of program and uses
    default parameters -- this is bad and should be fixed.

    #TODO: allow command-line access to important aln params.

    aln_path: path to the aligned fasta file the tree is built from
    kwargs: only 'root_method' is consulted; 'midpoint' roots the tree
     with root_midpt, any other value (or its absence) leaves the tree
     as returned by the tree-building module
    """
    module = self.Params['Module']
    # standard qiime says we just consider the first word as the unique ID
    # the rest of the defline of the fasta alignment often doesn't match
    # the otu names in the otu table
    with open(aln_path) as aln_f:
        seqs = Alignment.from_fasta_records(
            parse_fasta(aln_f, label_to_name=lambda x: x.split()[0]),
            DNA)
    # This ugly little line of code lets us pass a skbio Alignment when a
    # a cogent alignment is expected.
    seqs.getIntMap = seqs.int_map
    result = module.build_tree_from_alignment(seqs, moltype=DNA_cogent)

    # Look the option up without a try/except so that a KeyError raised
    # inside root_midpt is no longer mistaken for a missing option.
    if kwargs.get('root_method') == 'midpoint':
        result = root_midpt(result)
    return result
def test_majority_consensus(self):
    """majority_consensus picks the most common symbol per position
    """
    three_seqs = Alignment([
        DNASequence('TTT', identifier="d1"),
        DNASequence('TT-', identifier="d2"),
        DNASequence('TC-', identifier="d3"),
    ])
    self.assertEqual(three_seqs.majority_consensus(), DNASequence('TT-'))

    # with a tie either symbol is an acceptable consensus
    tied = Alignment([
        DNASequence('T', identifier="d1"),
        DNASequence('A', identifier="d2"),
    ])
    acceptable = [DNASequence('T'), DNASequence('A')]
    self.assertTrue(tied.majority_consensus() in acceptable)

    self.assertEqual(self.empty.majority_consensus(), '')
def remove_outliers(seqs, num_stds, fraction_seqs_for_stats=.95):
    """ drop aligned sequences that are far from the majority consensus

    Steps:
     1. build the majority consensus of the alignment (most common symbol
        at each position);
     2. measure each sequence's distance to that consensus and take the
        mean and standard deviation of those distances;
     3. keep only the sequences whose distance is at most
        mean + `num_stds` * std.

    seqs: aligned fasta input handed to parse_fasta
    num_stds: number of standard deviations above the mean allowed
    fraction_seqs_for_stats: not used by this implementation

    Returns the filtered alignment.
    """
    alignment = Alignment.from_fasta_records(parse_fasta(seqs), DNA)
    consensus = alignment.majority_consensus()

    # distance of every aligned sequence to the consensus
    distances = [s.distance(consensus) for s in alignment]

    # cutoff is the mean plus the requested number of standard deviations
    cutoff = mean(distances) + num_stds * std(distances)

    # ids of the sequences that are within the cutoff
    ids_within_cutoff = [
        seq_id
        for seq_id, distance in izip(alignment.ids(), distances)
        if distance <= cutoff]

    return alignment.subalignment(seqs_to_keep=ids_within_cutoff)
def generate_lane_mask(infile, entropy_threshold, existing_mask=None):
    """ Generates lane mask dynamically by calculating base frequencies

    infile: open file object for aligned fasta file
    entropy_threshold: float value that designates the percentage of
     entropic positions to be removed, i.e., 0.10 means the 10% most
     entropic positions are removed.
    existing_mask: accepted for interface compatibility; this function
     does not read it.

    Returns a string with one character per alignment position: "0" when
    the position's entropy is at or above the cutoff (and is not the
    lowest entropy observed), "1" otherwise.
    """
    aln = Alignment.from_fasta_records(parse_fasta(infile), DNA)
    uncertainty = aln.position_entropies(nan_on_non_standard_chars=False)

    uncertainty_sorted = sorted(uncertainty)
    cutoff_index = int(
        round((len(uncertainty_sorted) - 1) * (1 - entropy_threshold)))
    max_uncertainty = uncertainty_sorted[cutoff_index]

    # This correction is for small datasets with a small possible number of
    # uncertainty values.
    highest_certainty = min(uncertainty_sorted)

    # join once instead of growing the string one character at a time
    return "".join(
        "0" if (entropy >= max_uncertainty and
                entropy != highest_certainty) else "1"
        for entropy in uncertainty)
def setUp(self):
    """Build the strings, sequence objects and alignment used by the tests."""
    self.strings = ['AAAA', 'CCCC', 'gggg', 'uuuu']

    self.fasta_no_label = (
        '>0\nAAAA\n'
        '>1\nCCCC\n'
        '>2\ngggg\n'
        '>3\nuuuu')
    self.fasta_with_label = (
        '>1st\nAAAA\n'
        '>2nd\nCCCC\n'
        '>3rd\nGGGG\n'
        '>4th\nUUUU')
    # same records as above, wrapped at a line width of 2
    self.fasta_with_label_lw2 = (
        '>1st\nAA\nAA\n'
        '>2nd\nCC\nCC\n'
        '>3rd\nGG\nGG\n'
        '>4th\nUU\nUU')

    self.alignment_dict = {
        '1st': 'AAAA',
        '2nd': 'CCCC',
        '3rd': 'GGGG',
        '4th': 'UUUU',
    }

    seq_and_id = [('ACTCGAGATC', 'seq1'), ('GGCCT', 'seq2')]
    self.sequence_objects_a = [
        DNASequence(seq, seq_id) for seq, seq_id in seq_and_id]
    self.sequence_objects_b = [
        BiologicalSequence(seq, seq_id) for seq, seq_id in seq_and_id]

    self.alignment = Alignment([
        DNASequence("ACC--G-GGTA..", id="seq1"),
        DNASequence("TCC--G-GGCA..", id="seqs2"),
    ])
def setUp(self):
    """Write the Infernal fixtures to temp files and build the aligner."""
    def new_temp_path(suffix):
        # mkstemp hands back an open descriptor; only the path is needed
        fd, path = mkstemp(prefix="InfernalAlignerTests_", suffix=suffix)
        close(fd)
        return path

    self.infernal_test1_input_fp = new_temp_path(".fasta")
    with open(self.infernal_test1_input_fp, "w") as fasta_out:
        fasta_out.write("\n".join(infernal_test1_input_fasta))

    self.infernal_test1_template_fp = new_temp_path("template.sto")
    with open(self.infernal_test1_template_fp, "w") as template_out:
        template_out.write(infernal_test1_template_stockholm)

    # create temp file names (and touch them so we can reliably
    # clean them up)
    self.result_fp = new_temp_path(".fasta")
    open(self.result_fp, "w").close()
    self.log_fp = new_temp_path(".log")
    open(self.log_fp, "w").close()

    self._paths_to_clean_up = [
        self.infernal_test1_input_fp,
        self.result_fp,
        self.log_fp,
        self.infernal_test1_template_fp,
    ]

    aligner_params = {"template_filepath": self.infernal_test1_template_fp}
    self.infernal_test1_aligner = InfernalAligner(aligner_params)

    expected_records = parse_fasta(infernal_test1_expected_alignment)
    self.infernal_test1_expected_aln = Alignment.from_fasta_records(
        expected_records, DNA)
def test_to_phylip(self):
    """to_phylip returns the phylip string and an identity id map
    """
    seqs = [
        DNASequence('..ACC-GTTGG..', identifier="d1"),
        DNASequence('TTACCGGT-GGCC', identifier="d2"),
        DNASequence('.-ACC-GTTGC--', identifier="d3"),
    ]
    aln = Alignment(seqs)

    phylip_str, id_map = aln.to_phylip(map_labels=False)

    # without label mapping every id maps to itself
    self.assertEqual(id_map, {'d1': 'd1', 'd3': 'd3', 'd2': 'd2'})
    expected_lines = [
        "3 13",
        "d1 ..ACC-GTTGG..",
        "d2 TTACCGGT-GGCC",
        "d3 .-ACC-GTTGC--",
    ]
    self.assertEqual(phylip_str, "\n".join(expected_lines))
def test_to_phylip_map_labels(self):
    """to_phylip relabels sequences and reports the mapping when asked
    """
    seqs = [
        DNASequence('..ACC-GTTGG..', identifier="d1"),
        DNASequence('TTACCGGT-GGCC', identifier="d2"),
        DNASequence('.-ACC-GTTGC--', identifier="d3"),
    ]
    aln = Alignment(seqs)

    phylip_str, id_map = aln.to_phylip(map_labels=True, label_prefix="s")

    # new labels are the prefix plus a 1-based counter
    self.assertEqual(id_map, {'s1': 'd1', 's3': 'd3', 's2': 'd2'})
    expected_lines = [
        "3 13",
        "s1 ..ACC-GTTGG..",
        "s2 TTACCGGT-GGCC",
        "s3 .-ACC-GTTGC--",
    ]
    self.assertEqual(phylip_str, "\n".join(expected_lines))
def setUp(self):
    """Create the DNA/RNA sequences and alignments shared by the tests."""
    # (id, sequence) records mirroring the sequence objects below
    self.seqs1_t = [('d1', '..ACC-GTTGG..'),
                    ('d2', 'TTACCGGT-GGCC'),
                    ('d3', '.-ACC-GTTGC--')]
    self.seqs2_t = [('r1', 'UUAU-'), ('r2', 'ACGUU')]

    self.d1 = DNASequence('..ACC-GTTGG..', identifier="d1")
    self.d2 = DNASequence('TTACCGGT-GGCC', identifier="d2")
    self.d3 = DNASequence('.-ACC-GTTGC--', identifier="d3")
    self.seqs1 = [self.d1, self.d2, self.d3]

    self.r1 = RNASequence('UUAU-', identifier="r1")
    self.r2 = RNASequence('ACGUU', identifier="r2")
    self.seqs2 = [self.r1, self.r2]

    self.a1 = Alignment(self.seqs1)
    self.a2 = Alignment(self.seqs2)
    self.empty = Alignment([])
def setUp(self):
    """Write the PyNAST fixtures to temp files and build the aligner."""
    def new_temp_path(suffix):
        # mkstemp hands back an open descriptor; only the path is needed
        fd, path = mkstemp(prefix="PyNastAlignerTests_", suffix=suffix)
        close(fd)
        return path

    self.pynast_test1_input_fp = new_temp_path(".fasta")
    with open(self.pynast_test1_input_fp, "w") as out_f:
        out_f.write(pynast_test1_input_fasta)

    self.pynast_test1_template_fp = new_temp_path("template.fasta")
    with open(self.pynast_test1_template_fp, "w") as out_f:
        out_f.write(pynast_test1_template_fasta)

    # template variants: '.' gaps, U instead of T, and lower case
    self.pynast_test_template_w_dots_fp = new_temp_path("template.fasta")
    with open(self.pynast_test_template_w_dots_fp, "w") as out_f:
        out_f.write(pynast_test1_template_fasta.replace("-", "."))

    self.pynast_test_template_w_u_fp = new_temp_path("template.fasta")
    with open(self.pynast_test_template_w_u_fp, "w") as out_f:
        out_f.write(pynast_test1_template_fasta.replace("T", "U"))

    self.pynast_test_template_w_lower_fp = new_temp_path("template.fasta")
    with open(self.pynast_test_template_w_lower_fp, "w") as out_f:
        out_f.write(pynast_test1_template_fasta.lower())

    # create temp file names (and touch them so we can reliably
    # clean them up)
    self.result_fp = new_temp_path(".fasta")
    open(self.result_fp, "w").close()
    self.failure_fp = new_temp_path(".fasta")
    open(self.failure_fp, "w").close()
    self.log_fp = new_temp_path(".log")
    open(self.log_fp, "w").close()

    self._paths_to_clean_up = [
        self.pynast_test1_input_fp,
        self.result_fp,
        self.failure_fp,
        self.log_fp,
        self.pynast_test1_template_fp,
        self.pynast_test_template_w_dots_fp,
        self.pynast_test_template_w_u_fp,
        self.pynast_test_template_w_lower_fp,
    ]

    aligner_params = {"template_filepath": self.pynast_test1_template_fp,
                      "min_len": 15}
    self.pynast_test1_aligner = PyNastAligner(aligner_params)

    expected_aln_records = parse_fasta(pynast_test1_expected_alignment)
    self.pynast_test1_expected_aln = Alignment.from_fasta_records(
        expected_aln_records, DNA)
    expected_fail_records = parse_fasta(pynast_test1_expected_failure)
    self.pynast_test1_expected_fail = SequenceCollection.from_fasta_records(
        expected_fail_records, DNA)
def test_omit_gap_positions(self):
    """omit_gap_positions drops positions above the allowed gap fraction
    """
    # thresholds that tolerate the half-gapped last position
    for max_gap_fraction in (1.0, 0.51):
        self.assertEqual(self.a2.omit_gap_positions(max_gap_fraction),
                         self.a2)

    # stricter thresholds remove the last position
    for max_gap_fraction in (0.49, 0.0):
        trimmed = Alignment([RNASequence('UUAU', id="r1"),
                             RNASequence('ACGU', id="r2")])
        self.assertEqual(self.a2.omit_gap_positions(max_gap_fraction),
                         trimmed)

    # an empty alignment is returned unchanged
    for max_gap_fraction in (0.0, 0.49, 1.0):
        self.assertEqual(self.empty.omit_gap_positions(max_gap_fraction),
                         self.empty)
def test_call_pynast_test1_alt_min_pct(self):
    """PyNastAligner: an empty alignment results when min_pct is too high
    """
    strict_params = {
        'template_filepath': self.pynast_test1_template_fp,
        'min_len': 15,
        'min_pct': 100.0,
    }
    strict_aligner = PyNastAligner(strict_params)
    observed = strict_aligner(self.pynast_test1_input_fp)
    self.assertEqual(observed, Alignment([]))
def test_omit_gap_sequences(self):
    """omit_gap_sequences drops sequences above the allowed gap fraction
    """
    # r1 has one gap in five positions, so 0.20 still tolerates it
    for max_gap_fraction in (1.0, 0.20):
        self.assertEqual(self.a2.omit_gap_sequences(max_gap_fraction),
                         self.a2)

    # just below 0.20 only the gapless r2 survives
    only_r2 = Alignment([self.r2])
    self.assertEqual(self.a2.omit_gap_sequences(0.19), only_r2)

    # an empty alignment is returned unchanged
    for max_gap_fraction in (0.0, 0.2, 1.0):
        self.assertEqual(self.empty.omit_gap_sequences(max_gap_fraction),
                         self.empty)
def setUp(self):
    """Create the sequences and alignments shared by the tests."""
    # (id, sequence) records mirroring the sequence objects below
    self.seqs1_t = [('d1', '..ACC-GTTGG..'),
                    ('d2', 'TTACCGGT-GGCC'),
                    ('d3', '.-ACC-GTTGC--')]
    self.seqs2_t = [('r1', 'UUAU-'), ('r2', 'ACGUU')]

    self.d1 = DNASequence('..ACC-GTTGG..', id="d1")
    self.d2 = DNASequence('TTACCGGT-GGCC', id="d2")
    self.d3 = DNASequence('.-ACC-GTTGC--', id="d3")
    self.seqs1 = [self.d1, self.d2, self.d3]

    self.r1 = RNASequence('UUAU-', id="r1")
    self.r2 = RNASequence('ACGUU', id="r2")
    self.seqs2 = [self.r1, self.r2]

    self.a1 = Alignment(self.seqs1)
    self.a2 = Alignment(self.seqs2)
    # same sequences as a2, with a score and start/end positions attached
    self.a3 = Alignment(self.seqs2,
                        score=42.0,
                        start_end_positions=[(0, 3), (5, 9)])
    self.a4 = Alignment(self.seqs2,
                        score=-42.0,
                        start_end_positions=[(1, 4), (6, 10)])
    self.empty = Alignment([])
def test_fasta_from_alignment_from_alignment(self):
    """should return correct fasta string for alignment object"""
    # assertEqual replaces the deprecated assertEquals alias throughout

    # alignment with a few sequences
    obs = fasta_from_alignment(self.alignment)
    self.assertEqual('>seq1\nACC--G-GGTA..\n>seqs2\nTCC--G-GGCA..', obs)

    # empty alignment
    obs = fasta_from_alignment(Alignment([]))
    self.assertEqual('', obs)

    # alignment with a few sequences, without sorting
    obs = fasta_from_alignment(self.alignment, sort=False)
    self.assertEqual('>seq1\nACC--G-GGTA..\n>seqs2\nTCC--G-GGCA..', obs)
def test_call_infernal_test1_file_output(self):
    """InfernalAligner writes the expected alignment to result_path
    """
    # with a result path the aligner writes to disk and returns nothing
    aligner = self.infernal_test1_aligner
    returned = aligner(self.infernal_test1_input_fp,
                       result_path=self.result_fp,
                       log_path=self.log_fp)
    self.assertTrue(returned is None,
                    "Result should be None when result path provided.")

    # read the written file back and compare with the expected alignment
    with open(self.result_fp) as result_f:
        written_aln = Alignment.from_fasta_records(
            parse_fasta(result_f), DNA)
    self.assertEqual(written_aln, self.infernal_test1_expected_aln)
def test_call_infernal_test1_file_output(self):
    """InfernalAligner writes the expected alignment to result_path
    """
    # with a result path the aligner writes to disk and returns nothing
    aligner = self.infernal_test1_aligner
    returned = aligner(self.infernal_test1_input_fp,
                       result_path=self.result_fp,
                       log_path=self.log_fp)
    self.assertTrue(returned is None,
                    "Result should be None when result path provided.")

    # read the written file back and compare with the expected alignment
    with open(self.result_fp) as result_f:
        written_aln = Alignment.from_fasta_records(
            parse_fasta(result_f), DNA)
    self.assertEqual(written_aln, self.infernal_test1_expected_aln)
def test_ne(self):
    """the != operator distinguishes collections as expected
    """
    class FakeSequenceCollection(SequenceCollection):
        pass

    # a collection is never unequal to itself
    self.assertFalse(self.s1 != self.s1)
    self.assertTrue(self.s1 != self.s2)

    # a different number of sequences makes collections unequal
    fewer_seqs = SequenceCollection([self.d1])
    self.assertTrue(self.s1 != fewer_seqs)

    # a different collection type makes collections unequal
    subclass_instance = FakeSequenceCollection([self.d1, self.d2])
    self.assertTrue(self.s1 != subclass_instance)
    as_alignment = Alignment([self.d1, self.d2])
    self.assertTrue(self.s1 != as_alignment)

    # different member sequences make collections unequal
    other_members = SequenceCollection([self.d1, self.r1])
    self.assertTrue(self.s1 != other_members)
def test_call_pynast_test1_file_output(self):
    """PyNastAligner writes the expected result and failure files
    """
    # with a result path the aligner writes to disk and returns nothing
    aligner = self.pynast_test1_aligner
    returned = aligner(self.pynast_test1_input_fp,
                       result_path=self.result_fp,
                       log_path=self.log_fp,
                       failure_path=self.failure_fp)
    self.assertTrue(returned is None,
                    "Result should be None when result path provided.")

    # aligned sequences written to the result file
    with open(self.result_fp) as result_f:
        written_aln = Alignment.from_fasta_records(
            parse_fasta(result_f), DNA)
    self.assertEqual(written_aln, self.pynast_test1_expected_aln)

    # failures are compared through their fasta serialisation
    with open(self.failure_fp) as failure_f:
        written_fail = SequenceCollection.from_fasta_records(
            parse_fasta(failure_f), DNA)
    self.assertEqual(written_fail.to_fasta(),
                     self.pynast_test1_expected_fail.to_fasta())
def test_majority_consensus(self):
    """majority_consensus picks the most common symbol per position
    """
    three_seqs = Alignment([
        DNASequence('TTT', id="d1"),
        DNASequence('TT-', id="d2"),
        DNASequence('TC-', id="d3"),
    ])
    self.assertEqual(three_seqs.majority_consensus(), DNASequence('TT-'))

    # with a tie either symbol is an acceptable consensus
    tied = Alignment([
        DNASequence('T', id="d1"),
        DNASequence('A', id="d2"),
    ])
    acceptable = [DNASequence('T'), DNASequence('A')]
    self.assertTrue(tied.majority_consensus() in acceptable)

    self.assertEqual(self.empty.majority_consensus(), '')
def test_call_pynast_test1_file_output(self):
    """PyNastAligner writes the expected result and failure files
    """
    # with a result path the aligner writes to disk and returns nothing
    aligner = self.pynast_test1_aligner
    returned = aligner(self.pynast_test1_input_fp,
                       result_path=self.result_fp,
                       log_path=self.log_fp,
                       failure_path=self.failure_fp)
    self.assertTrue(returned is None,
                    "Result should be None when result path provided.")

    # aligned sequences written to the result file
    with open(self.result_fp) as result_f:
        written_aln = Alignment.from_fasta_records(
            parse_fasta(result_f), DNA)
    self.assertEqual(written_aln, self.pynast_test1_expected_aln)

    # failures are compared through their fasta serialisation
    with open(self.failure_fp) as failure_f:
        written_fail = SequenceCollection.from_fasta_records(
            parse_fasta(failure_f), DNA)
    self.assertEqual(written_fail.to_fasta(),
                     self.pynast_test1_expected_fail.to_fasta())
def setUp(self):
    """Write the Infernal fixtures to temp files and build the aligner."""
    def new_temp_path(suffix):
        # mkstemp hands back an open descriptor; only the path is needed
        fd, path = mkstemp(prefix='InfernalAlignerTests_', suffix=suffix)
        close(fd)
        return path

    self.infernal_test1_input_fp = new_temp_path('.fasta')
    with open(self.infernal_test1_input_fp, 'w') as fasta_out:
        fasta_out.write('\n'.join(infernal_test1_input_fasta))

    self.infernal_test1_template_fp = new_temp_path('template.sto')
    with open(self.infernal_test1_template_fp, 'w') as template_out:
        template_out.write(infernal_test1_template_stockholm)

    # create temp file names (and touch them so we can reliably
    # clean them up)
    self.result_fp = new_temp_path('.fasta')
    open(self.result_fp, 'w').close()
    self.log_fp = new_temp_path('.log')
    open(self.log_fp, 'w').close()

    self._paths_to_clean_up = [
        self.infernal_test1_input_fp,
        self.result_fp,
        self.log_fp,
        self.infernal_test1_template_fp,
    ]

    aligner_params = {'template_filepath': self.infernal_test1_template_fp}
    self.infernal_test1_aligner = InfernalAligner(aligner_params)

    expected_records = parse_fasta(infernal_test1_expected_alignment)
    self.infernal_test1_expected_aln = Alignment.from_fasta_records(
        expected_records, DNA)
def setUp(self):
    """Create the DNA/RNA sequences and alignments shared by the tests."""
    # (id, sequence) records mirroring the sequence objects below
    self.seqs1_t = [('d1', '..ACC-GTTGG..'),
                    ('d2', 'TTACCGGT-GGCC'),
                    ('d3', '.-ACC-GTTGC--')]
    self.seqs2_t = [('r1', 'UUAU-'), ('r2', 'ACGUU')]

    self.d1 = DNASequence('..ACC-GTTGG..', id="d1")
    self.d2 = DNASequence('TTACCGGT-GGCC', id="d2")
    self.d3 = DNASequence('.-ACC-GTTGC--', id="d3")
    self.seqs1 = [self.d1, self.d2, self.d3]

    self.r1 = RNASequence('UUAU-', id="r1")
    self.r2 = RNASequence('ACGUU', id="r2")
    self.seqs2 = [self.r1, self.r2]

    self.a1 = Alignment(self.seqs1)
    self.a2 = Alignment(self.seqs2)
    self.empty = Alignment([])
def test_init_validate(self):
    """constructing with validate=True rejects malformed input
    """
    # a well-formed set of sequences must construct without error
    Alignment(self.seqs1, validate=True)

    # invalid DNA character
    bad_character = DNASequence('.-ACC-GTXGC--', id="i1")
    seqs_with_bad_character = [self.d1, self.d2, self.d3, bad_character]
    with self.assertRaises(SequenceCollectionError):
        Alignment(seqs_with_bad_character, validate=True)

    # invalid lengths (they're not all equal)
    too_short = DNASequence('.-ACC-GTGC--', id="i2")
    seqs_with_short_member = [self.d1, self.d2, self.d3, too_short]
    with self.assertRaises(SequenceCollectionError):
        Alignment(seqs_with_short_member, validate=True)
def setUp(self):
    """Write the PyNAST fixtures to temp files and build the aligner."""
    def new_temp_path(suffix):
        # mkstemp hands back an open descriptor; only the path is needed
        fd, path = mkstemp(prefix='PyNastAlignerTests_', suffix=suffix)
        close(fd)
        return path

    self.pynast_test1_input_fp = new_temp_path('.fasta')
    with open(self.pynast_test1_input_fp, 'w') as out_f:
        out_f.write(pynast_test1_input_fasta)

    self.pynast_test1_template_fp = new_temp_path('template.fasta')
    with open(self.pynast_test1_template_fp, 'w') as out_f:
        out_f.write(pynast_test1_template_fasta)

    # template variants: '.' gaps, U instead of T, and lower case
    self.pynast_test_template_w_dots_fp = new_temp_path('template.fasta')
    with open(self.pynast_test_template_w_dots_fp, 'w') as out_f:
        out_f.write(pynast_test1_template_fasta.replace('-', '.'))

    self.pynast_test_template_w_u_fp = new_temp_path('template.fasta')
    with open(self.pynast_test_template_w_u_fp, 'w') as out_f:
        out_f.write(pynast_test1_template_fasta.replace('T', 'U'))

    self.pynast_test_template_w_lower_fp = new_temp_path('template.fasta')
    with open(self.pynast_test_template_w_lower_fp, 'w') as out_f:
        out_f.write(pynast_test1_template_fasta.lower())

    # create temp file names (and touch them so we can reliably
    # clean them up)
    self.result_fp = new_temp_path('.fasta')
    open(self.result_fp, 'w').close()
    self.failure_fp = new_temp_path('.fasta')
    open(self.failure_fp, 'w').close()
    self.log_fp = new_temp_path('.log')
    open(self.log_fp, 'w').close()

    self._paths_to_clean_up = [
        self.pynast_test1_input_fp,
        self.result_fp,
        self.failure_fp,
        self.log_fp,
        self.pynast_test1_template_fp,
        self.pynast_test_template_w_dots_fp,
        self.pynast_test_template_w_u_fp,
        self.pynast_test_template_w_lower_fp,
    ]

    aligner_params = {
        'template_filepath': self.pynast_test1_template_fp,
        'min_len': 15,
    }
    self.pynast_test1_aligner = PyNastAligner(aligner_params)

    expected_aln_records = parse_fasta(pynast_test1_expected_alignment)
    self.pynast_test1_expected_aln = Alignment.from_fasta_records(
        expected_aln_records, DNA)
    expected_fail_records = parse_fasta(pynast_test1_expected_failure)
    self.pynast_test1_expected_fail = SequenceCollection.from_fasta_records(
        expected_fail_records, DNA)
class AlignmentTests(TestCase):
    """Tests of the ``Alignment`` class.

    The fixtures are a three-sequence DNA alignment (``a1``), a two-sequence
    RNA alignment (``a2``) and an alignment with no sequences (``empty``).
    """

    def setUp(self):
        """Create the sequences and alignments shared by the tests."""
        self.d1 = DNASequence('..ACC-GTTGG..', id="d1")
        self.d2 = DNASequence('TTACCGGT-GGCC', id="d2")
        self.d3 = DNASequence('.-ACC-GTTGC--', id="d3")

        self.r1 = RNASequence('UUAU-', id="r1")
        self.r2 = RNASequence('ACGUU', id="r2")

        self.seqs1 = [self.d1, self.d2, self.d3]
        self.seqs2 = [self.r1, self.r2]

        # the same records as plain (id, sequence) tuples
        self.seqs1_t = [('d1', '..ACC-GTTGG..'), ('d2', 'TTACCGGT-GGCC'),
                        ('d3', '.-ACC-GTTGC--')]
        self.seqs2_t = [('r1', 'UUAU-'), ('r2', 'ACGUU')]

        self.a1 = Alignment(self.seqs1)
        self.a2 = Alignment(self.seqs2)
        self.empty = Alignment([])

    def test_degap(self):
        """degap functions as expected
        """
        expected = [(id_, seq.replace('.', '').replace('-', ''))
                    for id_, seq in self.seqs1_t]
        expected = SequenceCollection.from_fasta_records(expected, DNASequence)
        actual = self.a1.degap()
        self.assertEqual(actual, expected)

        expected = [(id_, seq.replace('.', '').replace('-', ''))
                    for id_, seq in self.seqs2_t]
        expected = SequenceCollection.from_fasta_records(expected, RNASequence)
        actual = self.a2.degap()
        self.assertEqual(actual, expected)

    def test_distances(self):
        """distances functions as expected
        """
        expected = [[0, 6. / 13, 4. / 13],
                    [6. / 13, 0, 7. / 13],
                    [4. / 13, 7. / 13, 0]]
        expected = DistanceMatrix(expected, ['d1', 'd2', 'd3'])
        actual = self.a1.distances()
        self.assertEqual(actual, expected)

    def test_subalignment(self):
        """subalignment functions as expected
        """
        # keep seqs by ids
        actual = self.a1.subalignment(seqs_to_keep=['d1', 'd3'])
        expected = Alignment([self.d1, self.d3])
        self.assertEqual(actual, expected)

        # keep seqs by indices
        actual = self.a1.subalignment(seqs_to_keep=[0, 2])
        expected = Alignment([self.d1, self.d3])
        self.assertEqual(actual, expected)

        # keep seqs by ids (invert)
        actual = self.a1.subalignment(seqs_to_keep=['d1', 'd3'],
                                      invert_seqs_to_keep=True)
        expected = Alignment([self.d2])
        self.assertEqual(actual, expected)

        # keep seqs by indices (invert)
        actual = self.a1.subalignment(seqs_to_keep=[0, 2],
                                      invert_seqs_to_keep=True)
        expected = Alignment([self.d2])
        self.assertEqual(actual, expected)

        # keep positions
        actual = self.a1.subalignment(positions_to_keep=[0, 2, 3])
        d1 = DNASequence('.AC', id="d1")
        d2 = DNASequence('TAC', id="d2")
        d3 = DNASequence('.AC', id="d3")
        expected = Alignment([d1, d2, d3])
        self.assertEqual(actual, expected)

        # keep positions (invert)
        actual = self.a1.subalignment(positions_to_keep=[0, 2, 3],
                                      invert_positions_to_keep=True)
        d1 = DNASequence('.C-GTTGG..', id="d1")
        d2 = DNASequence('TCGGT-GGCC', id="d2")
        d3 = DNASequence('-C-GTTGC--', id="d3")
        expected = Alignment([d1, d2, d3])
        self.assertEqual(actual, expected)

        # keep seqs and positions
        actual = self.a1.subalignment(seqs_to_keep=[0, 2],
                                      positions_to_keep=[0, 2, 3])
        d1 = DNASequence('.AC', id="d1")
        d3 = DNASequence('.AC', id="d3")
        expected = Alignment([d1, d3])
        self.assertEqual(actual, expected)

        # keep seqs and positions (invert)
        actual = self.a1.subalignment(seqs_to_keep=[0, 2],
                                      positions_to_keep=[0, 2, 3],
                                      invert_seqs_to_keep=True,
                                      invert_positions_to_keep=True)
        d2 = DNASequence('TCGGT-GGCC', id="d2")
        expected = Alignment([d2])
        self.assertEqual(actual, expected)

    def test_init_validate(self):
        """initialization with validation functions as expected
        """
        Alignment(self.seqs1, validate=True)

        # invalid DNA character
        invalid_seqs1 = [self.d1, self.d2, self.d3,
                         DNASequence('.-ACC-GTXGC--', id="i1")]
        self.assertRaises(SequenceCollectionError, Alignment,
                          invalid_seqs1, validate=True)

        # invalid lengths (they're not all equal)
        invalid_seqs2 = [self.d1, self.d2, self.d3,
                         DNASequence('.-ACC-GTGC--', id="i2")]
        self.assertRaises(SequenceCollectionError, Alignment,
                          invalid_seqs2, validate=True)

    def test_is_valid(self):
        """is_valid functions as expected
        """
        self.assertTrue(self.a1.is_valid())
        self.assertTrue(self.a2.is_valid())
        self.assertTrue(self.empty.is_valid())

        # invalid because of length mismatch
        d1 = DNASequence('..ACC-GTTGG..', id="d1")
        d2 = DNASequence('TTACCGGT-GGC', id="d2")
        self.assertFalse(Alignment([d1, d2]).is_valid())

        # invalid because of invalid characters
        d1 = DNASequence('..ACC-GTXGG..', id="d1")
        d2 = DNASequence('TTACCGGT-GGCC', id="d2")
        self.assertFalse(Alignment([d1, d2]).is_valid())

    def test_iter_positions(self):
        """iter_positions functions as expected
        """
        # default constructor: each position is a list of RNASequence objects
        actual = list(self.a2.iter_positions())
        expected = [[RNASequence(j) for j in i]
                    for i in ['UA', 'UC', 'AG', 'UU', '-U']]
        self.assertEqual(actual, expected)

        # str constructor: each position is a list of one-character strings
        actual = list(self.a2.iter_positions(constructor=str))
        expected = [list('UA'), list('UC'), list('AG'), list('UU'),
                    list('-U')]
        self.assertEqual(actual, expected)

    def test_majority_consensus(self):
        """majority_consensus functions as expected
        """
        d1 = DNASequence('TTT', id="d1")
        d2 = DNASequence('TT-', id="d2")
        d3 = DNASequence('TC-', id="d3")
        a1 = Alignment([d1, d2, d3])
        self.assertEqual(a1.majority_consensus(), DNASequence('TT-'))

        # a tie may be resolved either way
        d1 = DNASequence('T', id="d1")
        d2 = DNASequence('A', id="d2")
        a1 = Alignment([d1, d2])
        self.assertIn(a1.majority_consensus(),
                      [DNASequence('T'), DNASequence('A')])

        self.assertEqual(self.empty.majority_consensus(), '')

    def test_omit_gap_positions(self):
        """omitting gap positions functions as expected
        """
        expected = self.a2
        self.assertEqual(self.a2.omit_gap_positions(1.0), expected)
        self.assertEqual(self.a2.omit_gap_positions(0.51), expected)

        r1 = RNASequence('UUAU', id="r1")
        r2 = RNASequence('ACGU', id="r2")
        expected = Alignment([r1, r2])
        self.assertEqual(self.a2.omit_gap_positions(0.49), expected)

        r1 = RNASequence('UUAU', id="r1")
        r2 = RNASequence('ACGU', id="r2")
        expected = Alignment([r1, r2])
        self.assertEqual(self.a2.omit_gap_positions(0.0), expected)

        self.assertEqual(self.empty.omit_gap_positions(0.0), self.empty)
        self.assertEqual(self.empty.omit_gap_positions(0.49), self.empty)
        self.assertEqual(self.empty.omit_gap_positions(1.0), self.empty)

    def test_omit_gap_sequences(self):
        """omitting gap sequences functions as expected
        """
        expected = self.a2
        self.assertEqual(self.a2.omit_gap_sequences(1.0), expected)
        self.assertEqual(self.a2.omit_gap_sequences(0.20), expected)

        expected = Alignment([self.r2])
        self.assertEqual(self.a2.omit_gap_sequences(0.19), expected)

        self.assertEqual(self.empty.omit_gap_sequences(0.0), self.empty)
        self.assertEqual(self.empty.omit_gap_sequences(0.2), self.empty)
        self.assertEqual(self.empty.omit_gap_sequences(1.0), self.empty)

    def test_position_counters(self):
        """position_counters functions as expected
        """
        expected = [Counter({'U': 1, 'A': 1}),
                    Counter({'U': 1, 'C': 1}),
                    Counter({'A': 1, 'G': 1}),
                    Counter({'U': 2}),
                    Counter({'-': 1, 'U': 1})]
        self.assertEqual(self.a2.position_counters(), expected)

        self.assertEqual(self.empty.position_counters(), [])

    def test_position_frequencies(self):
        """computing position frequencies functions as expected
        """
        expected = [defaultdict(int, {'U': 0.5, 'A': 0.5}),
                    defaultdict(int, {'U': 0.5, 'C': 0.5}),
                    defaultdict(int, {'A': 0.5, 'G': 0.5}),
                    defaultdict(int, {'U': 1.0}),
                    defaultdict(int, {'-': 0.5, 'U': 0.5})]
        self.assertEqual(self.a2.position_frequencies(), expected)

        self.assertEqual(self.empty.position_frequencies(), [])

    def test_position_entropies(self):
        """computing positional uncertainties functions as expected

        tested by calculating values as described in this post:
         http://stackoverflow.com/a/15476958/3424666
        """
        expected = [0.69314, 0.69314, 0.69314, 0.0, np.nan]
        np.testing.assert_almost_equal(self.a2.position_entropies(),
                                       expected, 5)

        expected = [1.0, 1.0, 1.0, 0.0, np.nan]
        np.testing.assert_almost_equal(self.a2.position_entropies(base=2),
                                       expected, 5)

        np.testing.assert_almost_equal(self.empty.position_entropies(base=2),
                                       [])

    def test_k_word_frequencies(self):
        """k_word_frequencies functions as expected
        """
        # Float literals keep the expected values correct whether or not the
        # module enables true division.
        expected = [defaultdict(int, {'U': 3. / 5, 'A': 1. / 5,
                                      '-': 1. / 5}),
                    defaultdict(int, {'A': 1. / 5, 'C': 1. / 5,
                                      'G': 1. / 5, 'U': 2. / 5})]
        actual = self.a2.k_word_frequencies(k=1)
        for a, e in zip(actual, expected):
            # The same words must be present. (A third positional argument
            # to assertEqual is the failure message, not a precision, so
            # none is passed here.)
            self.assertEqual(sorted(a), sorted(e))
            np.testing.assert_almost_equal(sorted(a.values()),
                                           sorted(e.values()), 5)

    def test_sequence_length(self):
        """sequence_length functions as expected
        """
        self.assertEqual(self.a1.sequence_length(), 13)
        self.assertEqual(self.a2.sequence_length(), 5)
        self.assertEqual(self.empty.sequence_length(), 0)

    def test_to_phylip(self):
        """to_phylip functions as expected
        """
        d1 = DNASequence('..ACC-GTTGG..', id="d1")
        d2 = DNASequence('TTACCGGT-GGCC', id="d2")
        d3 = DNASequence('.-ACC-GTTGC--', id="d3")
        a = Alignment([d1, d2, d3])

        phylip_str, id_map = a.to_phylip(map_labels=False)
        self.assertEqual(id_map, {'d1': 'd1', 'd3': 'd3', 'd2': 'd2'})
        expected = "\n".join(["3 13",
                              "d1 ..ACC-GTTGG..",
                              "d2 TTACCGGT-GGCC",
                              "d3 .-ACC-GTTGC--"])
        self.assertEqual(phylip_str, expected)

    def test_to_phylip_map_labels(self):
        """to_phylip functions as expected with label mapping
        """
        d1 = DNASequence('..ACC-GTTGG..', id="d1")
        d2 = DNASequence('TTACCGGT-GGCC', id="d2")
        d3 = DNASequence('.-ACC-GTTGC--', id="d3")
        a = Alignment([d1, d2, d3])

        phylip_str, id_map = a.to_phylip(map_labels=True, label_prefix="s")
        self.assertEqual(id_map, {'s1': 'd1', 's3': 'd3', 's2': 'd2'})
        expected = "\n".join(["3 13",
                              "s1 ..ACC-GTTGG..",
                              "s2 TTACCGGT-GGCC",
                              "s3 .-ACC-GTTGC--"])
        self.assertEqual(phylip_str, expected)

    def test_validate_lengths(self):
        """_validate_lengths functions as expected
        """
        self.assertTrue(self.a1._validate_lengths())
        self.assertTrue(self.a2._validate_lengths())
        self.assertTrue(self.empty._validate_lengths())

        self.assertTrue(
            Alignment([DNASequence('TTT', id="d1")])._validate_lengths())
        self.assertFalse(
            Alignment([DNASequence('TTT', id="d1"),
                       DNASequence('TT', id="d2")])._validate_lengths())
class AlignmentTests(TestCase):
    """Tests of the ``Alignment`` class.

    The fixtures are a three-sequence DNA alignment (``a1``), a two-sequence
    RNA alignment (``a2``) and an alignment with no sequences (``empty``).
    """

    def setUp(self):
        """Create the sequences and alignments shared by the tests."""
        self.d1 = DNASequence('..ACC-GTTGG..', identifier="d1")
        self.d2 = DNASequence('TTACCGGT-GGCC', identifier="d2")
        self.d3 = DNASequence('.-ACC-GTTGC--', identifier="d3")

        self.r1 = RNASequence('UUAU-', identifier="r1")
        self.r2 = RNASequence('ACGUU', identifier="r2")

        self.seqs1 = [self.d1, self.d2, self.d3]
        self.seqs2 = [self.r1, self.r2]

        # the same records as plain (identifier, sequence) tuples
        self.seqs1_t = [('d1', '..ACC-GTTGG..'), ('d2', 'TTACCGGT-GGCC'),
                        ('d3', '.-ACC-GTTGC--')]
        self.seqs2_t = [('r1', 'UUAU-'), ('r2', 'ACGUU')]

        self.a1 = Alignment(self.seqs1)
        self.a2 = Alignment(self.seqs2)
        self.empty = Alignment([])

    def test_degap(self):
        """degap functions as expected
        """
        expected = [(id_, seq.replace('.', '').replace('-', ''))
                    for id_, seq in self.seqs1_t]
        expected = SequenceCollection.from_fasta_records(expected, DNASequence)
        actual = self.a1.degap()
        self.assertEqual(actual, expected)

        expected = [(id_, seq.replace('.', '').replace('-', ''))
                    for id_, seq in self.seqs2_t]
        expected = SequenceCollection.from_fasta_records(expected, RNASequence)
        actual = self.a2.degap()
        self.assertEqual(actual, expected)

    def test_distances(self):
        """distances functions as expected
        """
        expected = [[0, 6./13, 4./13],
                    [6./13, 0, 7./13],
                    [4./13, 7./13, 0]]
        expected = DistanceMatrix(expected, ['d1', 'd2', 'd3'])
        actual = self.a1.distances()
        self.assertEqual(actual, expected)

    def test_subalignment(self):
        """subalignment functions as expected
        """
        # keep seqs by identifiers
        actual = self.a1.subalignment(seqs_to_keep=['d1', 'd3'])
        expected = Alignment([self.d1, self.d3])
        self.assertEqual(actual, expected)

        # keep seqs by indices
        actual = self.a1.subalignment(seqs_to_keep=[0, 2])
        expected = Alignment([self.d1, self.d3])
        self.assertEqual(actual, expected)

        # keep seqs by identifiers (invert)
        actual = self.a1.subalignment(seqs_to_keep=['d1', 'd3'],
                                      invert_seqs_to_keep=True)
        expected = Alignment([self.d2])
        self.assertEqual(actual, expected)

        # keep seqs by indices (invert)
        actual = self.a1.subalignment(seqs_to_keep=[0, 2],
                                      invert_seqs_to_keep=True)
        expected = Alignment([self.d2])
        self.assertEqual(actual, expected)

        # keep positions
        actual = self.a1.subalignment(positions_to_keep=[0, 2, 3])
        d1 = DNASequence('.AC', identifier="d1")
        d2 = DNASequence('TAC', identifier="d2")
        d3 = DNASequence('.AC', identifier="d3")
        expected = Alignment([d1, d2, d3])
        self.assertEqual(actual, expected)

        # keep positions (invert)
        actual = self.a1.subalignment(positions_to_keep=[0, 2, 3],
                                      invert_positions_to_keep=True)
        d1 = DNASequence('.C-GTTGG..', identifier="d1")
        d2 = DNASequence('TCGGT-GGCC', identifier="d2")
        d3 = DNASequence('-C-GTTGC--', identifier="d3")
        expected = Alignment([d1, d2, d3])
        self.assertEqual(actual, expected)

        # keep seqs and positions
        actual = self.a1.subalignment(seqs_to_keep=[0, 2],
                                      positions_to_keep=[0, 2, 3])
        d1 = DNASequence('.AC', identifier="d1")
        d3 = DNASequence('.AC', identifier="d3")
        expected = Alignment([d1, d3])
        self.assertEqual(actual, expected)

        # keep seqs and positions (invert)
        actual = self.a1.subalignment(seqs_to_keep=[0, 2],
                                      positions_to_keep=[0, 2, 3],
                                      invert_seqs_to_keep=True,
                                      invert_positions_to_keep=True)
        d2 = DNASequence('TCGGT-GGCC', identifier="d2")
        expected = Alignment([d2])
        self.assertEqual(actual, expected)

    def test_init_validate(self):
        """initialization with validation functions as expected
        """
        Alignment(self.seqs1, validate=True)

        # invalid DNA character
        invalid_seqs1 = [self.d1, self.d2, self.d3,
                         DNASequence('.-ACC-GTXGC--', identifier="i1")]
        self.assertRaises(SequenceCollectionError, Alignment,
                          invalid_seqs1, validate=True)

        # invalid lengths (they're not all equal)
        invalid_seqs2 = [self.d1, self.d2, self.d3,
                         DNASequence('.-ACC-GTGC--', identifier="i2")]
        self.assertRaises(SequenceCollectionError, Alignment,
                          invalid_seqs2, validate=True)

    def test_is_valid(self):
        """is_valid functions as expected
        """
        self.assertTrue(self.a1.is_valid())
        self.assertTrue(self.a2.is_valid())
        self.assertTrue(self.empty.is_valid())

        # invalid because of length mismatch
        d1 = DNASequence('..ACC-GTTGG..', identifier="d1")
        d2 = DNASequence('TTACCGGT-GGC', identifier="d2")
        self.assertFalse(Alignment([d1, d2]).is_valid())

        # invalid because of invalid characters
        d1 = DNASequence('..ACC-GTXGG..', identifier="d1")
        d2 = DNASequence('TTACCGGT-GGCC', identifier="d2")
        self.assertFalse(Alignment([d1, d2]).is_valid())

    def test_iter_positions(self):
        """iter_positions functions as expected
        """
        # default constructor: each position is a list of RNASequence objects
        actual = list(self.a2.iter_positions())
        expected = [[RNASequence(j) for j in i]
                    for i in ['UA', 'UC', 'AG', 'UU', '-U']]
        self.assertEqual(actual, expected)

        # str constructor: each position is a list of one-character strings
        actual = list(self.a2.iter_positions(constructor=str))
        expected = [list('UA'), list('UC'), list('AG'), list('UU'),
                    list('-U')]
        self.assertEqual(actual, expected)

    def test_majority_consensus(self):
        """majority_consensus functions as expected
        """
        d1 = DNASequence('TTT', identifier="d1")
        d2 = DNASequence('TT-', identifier="d2")
        d3 = DNASequence('TC-', identifier="d3")
        a1 = Alignment([d1, d2, d3])
        self.assertEqual(a1.majority_consensus(), DNASequence('TT-'))

        # a tie may be resolved either way
        d1 = DNASequence('T', identifier="d1")
        d2 = DNASequence('A', identifier="d2")
        a1 = Alignment([d1, d2])
        self.assertIn(a1.majority_consensus(),
                      [DNASequence('T'), DNASequence('A')])

        self.assertEqual(self.empty.majority_consensus(), '')

    def test_omit_gap_positions(self):
        """omitting gap positions functions as expected
        """
        expected = self.a2
        self.assertEqual(self.a2.omit_gap_positions(1.0), expected)
        self.assertEqual(self.a2.omit_gap_positions(0.51), expected)

        r1 = RNASequence('UUAU', identifier="r1")
        r2 = RNASequence('ACGU', identifier="r2")
        expected = Alignment([r1, r2])
        self.assertEqual(self.a2.omit_gap_positions(0.49), expected)

        r1 = RNASequence('UUAU', identifier="r1")
        r2 = RNASequence('ACGU', identifier="r2")
        expected = Alignment([r1, r2])
        self.assertEqual(self.a2.omit_gap_positions(0.0), expected)

        self.assertEqual(self.empty.omit_gap_positions(0.0), self.empty)
        self.assertEqual(self.empty.omit_gap_positions(0.49), self.empty)
        self.assertEqual(self.empty.omit_gap_positions(1.0), self.empty)

    def test_omit_gap_sequences(self):
        """omitting gap sequences functions as expected
        """
        expected = self.a2
        self.assertEqual(self.a2.omit_gap_sequences(1.0), expected)
        self.assertEqual(self.a2.omit_gap_sequences(0.20), expected)

        expected = Alignment([self.r2])
        self.assertEqual(self.a2.omit_gap_sequences(0.19), expected)

        self.assertEqual(self.empty.omit_gap_sequences(0.0), self.empty)
        self.assertEqual(self.empty.omit_gap_sequences(0.2), self.empty)
        self.assertEqual(self.empty.omit_gap_sequences(1.0), self.empty)

    def test_position_counters(self):
        """position_counters functions as expected
        """
        expected = [Counter({'U': 1, 'A': 1}),
                    Counter({'U': 1, 'C': 1}),
                    Counter({'A': 1, 'G': 1}),
                    Counter({'U': 2}),
                    Counter({'-': 1, 'U': 1})]
        self.assertEqual(self.a2.position_counters(), expected)

        self.assertEqual(self.empty.position_counters(), [])

    def test_position_frequencies(self):
        """computing position frequencies functions as expected
        """
        expected = [defaultdict(int, {'U': 0.5, 'A': 0.5}),
                    defaultdict(int, {'U': 0.5, 'C': 0.5}),
                    defaultdict(int, {'A': 0.5, 'G': 0.5}),
                    defaultdict(int, {'U': 1.0}),
                    defaultdict(int, {'-': 0.5, 'U': 0.5})]
        self.assertEqual(self.a2.position_frequencies(), expected)

        self.assertEqual(self.empty.position_frequencies(), [])

    def test_position_entropies(self):
        """computing positional uncertainties functions as expected

        tested by calculating values as described in this post:
         http://stackoverflow.com/a/15476958/3424666
        """
        expected = [0.69314, 0.69314, 0.69314, 0.0, np.nan]
        np.testing.assert_almost_equal(self.a2.position_entropies(),
                                       expected, 5)

        expected = [1.0, 1.0, 1.0, 0.0, np.nan]
        np.testing.assert_almost_equal(self.a2.position_entropies(base=2),
                                       expected, 5)

        np.testing.assert_almost_equal(self.empty.position_entropies(base=2),
                                       [])

    def test_k_word_frequencies(self):
        """k_word_frequencies functions as expected
        """
        # Float literals keep the expected values correct whether or not the
        # module enables true division.
        expected = [defaultdict(int, {'U': 3./5, 'A': 1./5, '-': 1./5}),
                    defaultdict(int, {'A': 1./5, 'C': 1./5, 'G': 1./5,
                                      'U': 2./5})]
        actual = self.a2.k_word_frequencies(k=1)
        for a, e in zip(actual, expected):
            # The same words must be present. (A third positional argument
            # to assertEqual is the failure message, not a precision, so
            # none is passed here.)
            self.assertEqual(sorted(a), sorted(e))
            np.testing.assert_almost_equal(sorted(a.values()),
                                           sorted(e.values()), 5)

    def test_sequence_length(self):
        """sequence_length functions as expected
        """
        self.assertEqual(self.a1.sequence_length(), 13)
        self.assertEqual(self.a2.sequence_length(), 5)
        self.assertEqual(self.empty.sequence_length(), 0)

    def test_to_phylip(self):
        """to_phylip functions as expected
        """
        d1 = DNASequence('..ACC-GTTGG..', identifier="d1")
        d2 = DNASequence('TTACCGGT-GGCC', identifier="d2")
        d3 = DNASequence('.-ACC-GTTGC--', identifier="d3")
        a = Alignment([d1, d2, d3])

        phylip_str, id_map = a.to_phylip(map_labels=False)
        self.assertEqual(id_map, {'d1': 'd1', 'd3': 'd3', 'd2': 'd2'})
        expected = "\n".join(["3 13",
                              "d1 ..ACC-GTTGG..",
                              "d2 TTACCGGT-GGCC",
                              "d3 .-ACC-GTTGC--"])
        self.assertEqual(phylip_str, expected)

    def test_to_phylip_map_labels(self):
        """to_phylip functions as expected with label mapping
        """
        d1 = DNASequence('..ACC-GTTGG..', identifier="d1")
        d2 = DNASequence('TTACCGGT-GGCC', identifier="d2")
        d3 = DNASequence('.-ACC-GTTGC--', identifier="d3")
        a = Alignment([d1, d2, d3])

        phylip_str, id_map = a.to_phylip(map_labels=True, label_prefix="s")
        self.assertEqual(id_map, {'s1': 'd1', 's3': 'd3', 's2': 'd2'})
        expected = "\n".join(["3 13",
                              "s1 ..ACC-GTTGG..",
                              "s2 TTACCGGT-GGCC",
                              "s3 .-ACC-GTTGC--"])
        self.assertEqual(phylip_str, expected)

    def test_validate_lengths(self):
        """_validate_lengths functions as expected
        """
        self.assertTrue(self.a1._validate_lengths())
        self.assertTrue(self.a2._validate_lengths())
        self.assertTrue(self.empty._validate_lengths())

        self.assertTrue(Alignment([
            DNASequence('TTT', identifier="d1")])._validate_lengths())
        self.assertFalse(Alignment([
            DNASequence('TTT', identifier="d1"),
            DNASequence('TT', identifier="d2")])._validate_lengths())
def __call__(self, seq_path, result_path=None, log_path=None,
             failure_path=None, cmbuild_params=None, cmalign_params=None):
    """Align the sequences in ``seq_path`` against the template alignment.

    Parameters
    ----------
    seq_path : str
        Path to the FASTA file of candidate (unaligned) sequences.
    result_path : str, optional
        If given, the alignment is written to this path as FASTA and
        ``None`` is returned.
    log_path : str, optional
        If given, the cmbuild/cmalign parameters that were used are written
        to this path.
    failure_path : str, optional
        Accepted but not used by this method.
    cmbuild_params : dict, optional
        Extra cmbuild parameters; ``--gapthresh`` is always set to 1.0.
    cmalign_params : dict, optional
        Extra cmalign parameters; ``--sub`` is always set to True and
        ``--gapthresh`` to 1.0.

    Returns
    -------
    Alignment or None
        The aligned candidate sequences when ``result_path`` is None,
        otherwise None.

    Raises
    ------
    ValueError
        If parsing the template file raises ``RecordError``.
    """
    log_params = []

    # load candidate sequences; the handle is closed once the records have
    # been read into the dict
    with open(seq_path, 'U') as seq_file:
        candidate_sequences = dict(parse_fasta(seq_file))

    # load template sequences; only the first record in the file is used
    try:
        with open(self.Params['template_filepath'], 'U') as template_file:
            _, template_alignment, struct = list(MinimalRfamParser(
                template_file, seq_constructor=ChangedSequence))[0]
    except RecordError:
        raise ValueError(
            "Template alignment must be in Stockholm format with "
            "corresponding secondary structure annotation when using "
            "InfernalAligner.")

    moltype = self.Params['moltype']

    # Need to make separate mapping for unaligned sequences
    unaligned = SequenceCollection.from_fasta_records(
        candidate_sequences.iteritems(), DNASequence)
    mapped_seqs, new_to_old_ids = unaligned.int_map(prefix='unaligned_')
    mapped_seq_tuples = [(k, str(v)) for k, v in mapped_seqs.iteritems()]

    # Turn on --gapthresh option in cmbuild to force alignment to full
    # model. Work on a copy so the caller's dict is not modified.
    cmbuild_params = {} if cmbuild_params is None else dict(cmbuild_params)
    cmbuild_params.update({'--gapthresh': 1.0})

    # record cmbuild parameters
    log_params.append('cmbuild parameters:')
    log_params.append(str(cmbuild_params))

    # Turn on --sub option in Infernal, since we know the unaligned sequences
    # are fragments.
    # Also turn on --gapthresh to use same gapthresh as was used to build
    # model. Again a copy, so the caller's dict is left untouched.
    cmalign_params = {} if cmalign_params is None else dict(cmalign_params)
    cmalign_params.update({'--sub': True, '--gapthresh': 1.0})

    # record cmalign parameters
    log_params.append('cmalign parameters:')
    log_params.append(str(cmalign_params))

    # Align sequences to alignment including alignment gaps.
    aligned, struct_string = cmalign_from_alignment(
        aln=template_alignment,
        structure_string=struct,
        seqs=mapped_seq_tuples,
        moltype=moltype,
        include_aln=True,
        params=cmalign_params,
        cmbuild_params=cmbuild_params)

    # Pull out original sequences from full alignment.
    # Get a dict of the identifiers to sequences (note that this is a
    # cogent alignment object, hence the call to NamedSeqs)
    aligned_dict = aligned.NamedSeqs
    infernal_aligned = [(old_id, aligned_dict[new_id])
                        for new_id, old_id in new_to_old_ids.iteritems()]

    # Create an Alignment object from alignment dict
    infernal_aligned = Alignment.from_fasta_records(infernal_aligned,
                                                    DNASequence)

    if log_path is not None:
        with open(log_path, 'w') as log_file:
            log_file.write('\n'.join(log_params))

    if result_path is not None:
        with open(result_path, 'w') as result_file:
            result_file.write(infernal_aligned.to_fasta())
        return None

    return infernal_aligned