def setUp(self):
    """Create the sequences and collections shared by the tests."""
    # individual DNA sequences (upper- and lower-case variants share ids)
    self.d1, self.d2 = (DNASequence('GATTACA', id="d1"),
                        DNASequence('TTG', id="d2"))
    self.d1_lower, self.d2_lower = (DNASequence('gattaca', id="d1"),
                                    DNASequence('ttg', id="d2"))

    # individual RNA sequences; r3 contains gap characters
    self.r1, self.r2, self.r3 = (RNASequence('GAUUACA', id="r1"),
                                 RNASequence('UUG', id="r2"),
                                 RNASequence('U-----UGCC--', id="r3"))

    # DNA sequence containing a character ('X') outside the DNA alphabet
    self.i1 = DNASequence('GATXACA', id="i1")

    # lists of sequence objects
    self.seqs1 = [self.d1, self.d2]
    self.seqs1_lower = [self.d1_lower, self.d2_lower]
    self.seqs2 = [self.r1, self.r2, self.r3]
    self.seqs3 = self.seqs1 + self.seqs2

    # the same data as (id, sequence string) tuples
    self.seqs1_t = [('d1', 'GATTACA'), ('d2', 'TTG')]
    self.seqs2_t = [('r1', 'GAUUACA'),
                    ('r2', 'UUG'),
                    ('r3', 'U-----UGCC--')]
    self.seqs3_t = self.seqs1_t + self.seqs2_t

    # collections built from the lists above
    self.s1 = SequenceCollection(self.seqs1)
    self.s1_lower = SequenceCollection(self.seqs1_lower)
    self.s2 = SequenceCollection(self.seqs2)
    self.s3 = SequenceCollection(self.seqs3)
    self.empty = SequenceCollection([])
    self.invalid_s1 = SequenceCollection([self.i1])
def test_reverse_complement(self):
    """reverse_complement returns the expected sequence or raises."""
    for seq, expected in ((self.b1, "TGTAATC"), (self.b2, "GGTACCGGT")):
        self.assertEqual(seq.reverse_complement(), DNASequence(expected))

    # b3 is expected to raise rather than return a sequence
    with self.assertRaises(BiologicalSequenceError):
        self.b3.reverse_complement()

    self.assertEqual(self.b4.reverse_complement(),
                     DNASequence("NVHDBMRSWYK"))
def setUp(self):
    """Create the strings, FASTA text and sequence objects for Fasta tests."""
    self.strings = ['AAAA', 'CCCC', 'gggg', 'uuuu']

    # FASTA text: numeric labels, named labels, and named labels wrapped
    # at a line width of two characters
    self.fasta_no_label = ('>0\nAAAA\n'
                           '>1\nCCCC\n'
                           '>2\ngggg\n'
                           '>3\nuuuu')
    self.fasta_with_label = ('>1st\nAAAA\n'
                             '>2nd\nCCCC\n'
                             '>3rd\nGGGG\n'
                             '>4th\nUUUU')
    self.fasta_with_label_lw2 = ('>1st\nAA\nAA\n'
                                 '>2nd\nCC\nCC\n'
                                 '>3rd\nGG\nGG\n'
                                 '>4th\nUU\nUU')

    self.alignment_dict = {
        '1st': 'AAAA',
        '2nd': 'CCCC',
        '3rd': 'GGGG',
        '4th': 'UUUU',
    }

    # the same two records as DNASequence and BiologicalSequence objects
    self.sequence_objects_a = [DNASequence('ACTCGAGATC', 'seq1'),
                               DNASequence('GGCCT', 'seq2')]
    self.sequence_objects_b = [BiologicalSequence('ACTCGAGATC', 'seq1'),
                               BiologicalSequence('GGCCT', 'seq2')]

    aligned_seqs = [DNASequence("ACC--G-GGTA..", id="seq1"),
                    DNASequence("TCC--G-GGCA..", id="seqs2")]
    self.alignment = Alignment(aligned_seqs)
def __call__(self, seq_path, result_path=None, log_path=None,
             failure_path=None):
    """Align the sequences in seq_path against the template alignment.

    seq_path: path to the FASTA file of candidate sequences
    result_path: if not None, the aligned sequences are written to this
     path as FASTA and None is returned; otherwise the resulting
     Alignment is returned
    log_path: path handed to NastLogger
    failure_path: if not None, the sequences that failed to align are
     written to this path as FASTA

    The template filepath, pairwise alignment method, min_pct and min_len
    are read from self.Params.
    """
    # The candidate file stays open until pynast_seqs has returned:
    # parse_fasta presumably reads lazily, so closing earlier could
    # truncate the input (TODO confirm against parse_fasta).
    with open(seq_path, 'U') as seq_file:
        # load candidate sequences
        candidate_sequences = parse_fasta(seq_file)

        # load template sequences, replacing '.' characters with '-'
        # characters and upper-casing them
        template_alignment_fp = self.Params['template_filepath']
        with open(template_alignment_fp) as template_file:
            template_alignment = [
                (seq_id, seq.replace('.', '-').upper())
                for seq_id, seq in parse_fasta(template_file)]
        template_alignment = Alignment.from_fasta_records(
            template_alignment, DNASequence, validate=True)

        # initialize_logger
        logger = NastLogger(log_path)

        # get function for pairwise alignment method
        pairwise_alignment_f = pairwise_alignment_methods[
            self.Params['pairwise_alignment_method']]

        pynast_aligned, pynast_failed = pynast_seqs(
            candidate_sequences,
            template_alignment,
            min_pct=self.Params['min_pct'],
            min_len=self.Params['min_len'],
            align_unaligned_seqs_f=pairwise_alignment_f,
            logger=logger,
            temp_dir=get_qiime_temp_dir())

    logger.record(str(self))

    # convert the PyNAST results into DNASequence-based collections
    pynast_failed = SequenceCollection(
        [DNASequence(str(seq), id=seq.Name) for seq in pynast_failed])
    pynast_aligned = Alignment(
        [DNASequence(str(seq), id=seq.Name) for seq in pynast_aligned])

    if failure_path is not None:
        with open(failure_path, 'w') as fail_file:
            fail_file.write(pynast_failed.to_fasta())

    if result_path is None:
        return pynast_aligned

    with open(result_path, 'w') as result_file:
        result_file.write(pynast_aligned.to_fasta())
    return None
def test_nondegenerates_gap_mixed_case(self):
    """nondegenerates of '-M.m' yields the four expected sequences."""
    exp = [DNASequence(seq) for seq in ('-A.a', '-A.c', '-C.a', '-C.c')]
    # sort by string form so the comparison is order independent
    obs = sorted(DNASequence('-M.m').nondegenerates(), key=str)
    self.assertEqual(obs, exp)
def test_nondegenerates_mixed_degens(self):
    """nondegenerates of 'RGY' yields the four expected sequences."""
    exp = [DNASequence(seq) for seq in ('AGC', 'AGT', 'GGC', 'GGT')]
    # sort by string form so the comparison is order independent
    obs = sorted(DNASequence('RGY').nondegenerates(), key=str)
    self.assertEqual(obs, exp)
def test_subalignment(self):
    """subalignment filters sequences and/or positions, optionally inverted."""
    aln = self.a1
    positions = [0, 2, 3]

    # keep seqs by ids
    self.assertEqual(aln.subalignment(seqs_to_keep=['d1', 'd3']),
                     Alignment([self.d1, self.d3]))

    # keep seqs by indices
    self.assertEqual(aln.subalignment(seqs_to_keep=[0, 2]),
                     Alignment([self.d1, self.d3]))

    # keep seqs by ids (invert)
    obs = aln.subalignment(seqs_to_keep=['d1', 'd3'],
                           invert_seqs_to_keep=True)
    self.assertEqual(obs, Alignment([self.d2]))

    # keep seqs by indices (invert)
    obs = aln.subalignment(seqs_to_keep=[0, 2],
                           invert_seqs_to_keep=True)
    self.assertEqual(obs, Alignment([self.d2]))

    # keep positions
    obs = aln.subalignment(positions_to_keep=positions)
    exp = Alignment([DNASequence('.AC', id="d1"),
                     DNASequence('TAC', id="d2"),
                     DNASequence('.AC', id="d3")])
    self.assertEqual(obs, exp)

    # keep positions (invert)
    obs = aln.subalignment(positions_to_keep=positions,
                           invert_positions_to_keep=True)
    exp = Alignment([DNASequence('.C-GTTGG..', id="d1"),
                     DNASequence('TCGGT-GGCC', id="d2"),
                     DNASequence('-C-GTTGC--', id="d3")])
    self.assertEqual(obs, exp)

    # keep seqs and positions
    obs = aln.subalignment(seqs_to_keep=[0, 2],
                           positions_to_keep=positions)
    exp = Alignment([DNASequence('.AC', id="d1"),
                     DNASequence('.AC', id="d3")])
    self.assertEqual(obs, exp)

    # keep seqs and positions (invert)
    obs = aln.subalignment(seqs_to_keep=[0, 2],
                           positions_to_keep=positions,
                           invert_seqs_to_keep=True,
                           invert_positions_to_keep=True)
    exp = Alignment([DNASequence('TCGGT-GGCC', id="d2")])
    self.assertEqual(obs, exp)
def test_validate_lengths(self):
    """_validate_lengths is False only for sequences of unequal length."""
    for aln in (self.a1, self.a2, self.empty):
        self.assertTrue(aln._validate_lengths())

    # a single sequence trivially has consistent length
    single = Alignment([DNASequence('TTT', id="d1")])
    self.assertTrue(single._validate_lengths())

    # lengths 3 and 2 do not match
    ragged = Alignment([DNASequence('TTT', id="d1"),
                        DNASequence('TT', id="d2")])
    self.assertFalse(ragged._validate_lengths())
def test_to_phylip_map_labels(self):
    """to_phylip with map_labels=True relabels ids using the prefix."""
    aln = Alignment([DNASequence('..ACC-GTTGG..', id="d1"),
                     DNASequence('TTACCGGT-GGCC', id="d2"),
                     DNASequence('.-ACC-GTTGC--', id="d3")])

    phylip_str, id_map = aln.to_phylip(map_labels=True, label_prefix="s")

    # new label -> original id
    self.assertEqual(id_map, {'s1': 'd1', 's2': 'd2', 's3': 'd3'})

    expected_lines = ["3 13",
                      "s1 ..ACC-GTTGG..",
                      "s2 TTACCGGT-GGCC",
                      "s3 .-ACC-GTTGC--"]
    self.assertEqual(phylip_str, "\n".join(expected_lines))
def test_to_phylip(self):
    """to_phylip with map_labels=False keeps the original ids as labels."""
    aln = Alignment([DNASequence('..ACC-GTTGG..', id="d1"),
                     DNASequence('TTACCGGT-GGCC', id="d2"),
                     DNASequence('.-ACC-GTTGC--', id="d3")])

    phylip_str, id_map = aln.to_phylip(map_labels=False)

    # every id maps to itself when labels are not remapped
    self.assertEqual(id_map, {'d1': 'd1', 'd2': 'd2', 'd3': 'd3'})

    expected_lines = ["3 13",
                      "d1 ..ACC-GTTGG..",
                      "d2 TTACCGGT-GGCC",
                      "d3 .-ACC-GTTGC--"]
    self.assertEqual(phylip_str, "\n".join(expected_lines))
def test_iupac_degenerate_characters(self):
    """iupac_degenerate_characters matches on the instance and the class."""
    # upper- and lower-case degenerate codes
    exp = set('BDHKMNRSVWY') | set('bdhkmnrsvwy')
    for source in (self.b1, DNASequence):
        self.assertEqual(source.iupac_degenerate_characters(), exp)
def test_iupac_degeneracies(self):
    """iupac_degeneracies matches on the instance and the class."""
    upper = {
        'B': set('CTG'),
        'D': set('ATG'),
        'H': set('ACT'),
        'K': set('TG'),
        'M': set('AC'),
        'N': set('ACTG'),
        'S': set('CG'),
        'R': set('AG'),
        'W': set('AT'),
        'V': set('ACG'),
        'Y': set('CT'),
    }
    # the expected lower-case entries mirror the upper-case ones exactly
    exp = dict(upper)
    for char, bases in upper.items():
        exp[char.lower()] = set(base.lower() for base in bases)

    for source in (self.b1, DNASequence):
        self.assertEqual(source.iupac_degeneracies(), exp)
def test_is_valid(self):
    """is_valid rejects length mismatches and invalid characters."""
    for aln in (self.a1, self.a2, self.empty):
        self.assertTrue(aln.is_valid())

    # invalid because of length mismatch (13 vs. 12 characters)
    mismatched = Alignment([DNASequence('..ACC-GTTGG..', id="d1"),
                            DNASequence('TTACCGGT-GGC', id="d2")])
    self.assertFalse(mismatched.is_valid())

    # invalid because of invalid characters ('X' in d1)
    bad_chars = Alignment([DNASequence('..ACC-GTXGG..', id="d1"),
                           DNASequence('TTACCGGT-GGCC', id="d2")])
    self.assertFalse(bad_chars.is_valid())
def test_iupac_characters(self):
    """iupac_characters matches on the instance and the class."""
    # standard plus degenerate codes, in both cases
    exp = set('ABCDGHKMNRSTVWY') | set('abcdghkmnrstvwy')
    for source in (self.b1, DNASequence):
        self.assertEqual(source.iupac_characters(), exp)
def setUp(self):
    """Create the aligned sequences and Alignment objects used in tests."""
    # three aligned DNA sequences of equal length
    self.d1, self.d2, self.d3 = (DNASequence('..ACC-GTTGG..', id="d1"),
                                 DNASequence('TTACCGGT-GGCC', id="d2"),
                                 DNASequence('.-ACC-GTTGC--', id="d3"))

    # two aligned RNA sequences of equal length
    self.r1, self.r2 = (RNASequence('UUAU-', id="r1"),
                        RNASequence('ACGUU', id="r2"))

    self.seqs1 = [self.d1, self.d2, self.d3]
    self.seqs2 = [self.r1, self.r2]

    # the same data as (id, sequence string) tuples
    self.seqs1_t = [('d1', '..ACC-GTTGG..'),
                    ('d2', 'TTACCGGT-GGCC'),
                    ('d3', '.-ACC-GTTGC--')]
    self.seqs2_t = [('r1', 'UUAU-'), ('r2', 'ACGUU')]

    self.a1 = Alignment(self.seqs1)
    self.a2 = Alignment(self.seqs2)
    self.empty = Alignment([])
def test_alphabet(self):
    """alphabet matches on the instance and the class."""
    # standard plus degenerate codes, in both cases
    exp = set('ABCDGHKMNRSTVWY') | set('abcdghkmnrstvwy')
    for source in (self.b1, DNASequence):
        self.assertEqual(source.alphabet(), exp)
def test_complement_map(self):
    """complement_map matches on the instance and the class."""
    upper = {
        'A': 'T', 'C': 'G', 'G': 'C', 'T': 'A',
        'B': 'V', 'D': 'H', 'H': 'D', 'K': 'M',
        'M': 'K', 'N': 'N', 'R': 'Y', 'S': 'S',
        'V': 'B', 'W': 'W', 'Y': 'R',
    }
    # gap characters map to themselves
    exp = {'-': '-', '.': '.'}
    exp.update(upper)
    # the expected lower-case entries mirror the upper-case ones exactly
    exp.update((char.lower(), comp.lower())
               for char, comp in upper.items())

    for source in (self.b1, DNASequence):
        self.assertEqual(source.complement_map(), exp)
def check_dna_chars_primers(header,
                            mapping_data,
                            errors,
                            disable_primer_check=False):
    """ Checks for valid DNA characters in primer fields

    The "ReversePrimer" field is always checked; "LinkerPrimerSequence" is
    checked unless disable_primer_check is True. Empty fields are flagged
    as errors, as are fields containing characters that are neither IUPAC
    DNA characters nor ','. Each offending field is reported once, however
    many invalid characters it contains.

    header:  list of header strings
    mapping_data:  list of lists of raw metadata mapping file data
    errors:  list of errors; appended to in place and also returned
    disable_primer_check:  If True, disables tests for valid
     LinkerPrimerSequence values.
    """
    # Copy before adding ',' so that a set shared by DNASequence (if the
    # method returns one) is never modified.
    valid_dna_chars = set(DNASequence.iupac_characters())
    valid_dna_chars.add(',')

    # Detect fields directly, in case user does not have fields in proper
    # order in the mapping file (this will generate error separately)
    header_fields_to_check = ["ReversePrimer"]
    if not disable_primer_check:
        header_fields_to_check.append("LinkerPrimerSequence")

    check_indices = [field_ix for field_ix, field in enumerate(header)
                     if field in header_fields_to_check]

    # Correction factor for header being the first line
    correction_ix = 1

    # Check for missing data
    for row_ix, row in enumerate(mapping_data):
        for curr_ix in check_indices:
            if len(row[curr_ix]) == 0:
                errors.append("Missing expected DNA sequence\t%d,%d" %
                              (row_ix + correction_ix, curr_ix))

    # Check for non-DNA characters; report each field at most once
    for row_ix, row in enumerate(mapping_data):
        for curr_ix in check_indices:
            curr_seq = row[curr_ix]
            if any(curr_nt not in valid_dna_chars for curr_nt in curr_seq):
                errors.append("Invalid DNA sequence detected: %s\t%d,%d" %
                              (curr_seq, row_ix + correction_ix, curr_ix))

    return errors
def test_majority_consensus(self):
    """majority_consensus returns the most common character per position."""
    three_seqs = Alignment([DNASequence('TTT', id="d1"),
                            DNASequence('TT-', id="d2"),
                            DNASequence('TC-', id="d3")])
    self.assertEqual(three_seqs.majority_consensus(), DNASequence('TT-'))

    # with a tie, either character is an acceptable result
    tied = Alignment([DNASequence('T', id="d1"),
                      DNASequence('A', id="d2")])
    self.assertIn(tied.majority_consensus(),
                  [DNASequence('T'), DNASequence('A')])

    self.assertEqual(self.empty.majority_consensus(), '')
def check_dna_chars_primers(header,
                            mapping_data,
                            errors,
                            disable_primer_check=False):
    """ Checks for valid DNA characters in primer fields

    The "ReversePrimer" field is always checked; "LinkerPrimerSequence" is
    checked unless disable_primer_check is True. Empty fields are flagged
    as errors, as are fields containing characters that are neither IUPAC
    DNA characters nor ','. Each offending field is reported once, however
    many invalid characters it contains.

    header:  list of header strings
    mapping_data:  list of lists of raw metadata mapping file data
    errors:  list of errors; appended to in place and also returned
    disable_primer_check:  If True, disables tests for valid
     LinkerPrimerSequence values.
    """
    # Copy before adding ',' so that a set shared by DNASequence (if the
    # method returns one) is never modified.
    valid_dna_chars = set(DNASequence.iupac_characters())
    valid_dna_chars.add(',')

    # Detect fields directly, in case user does not have fields in proper
    # order in the mapping file (this will generate error separately)
    header_fields_to_check = ["ReversePrimer"]
    if not disable_primer_check:
        header_fields_to_check.append("LinkerPrimerSequence")

    check_indices = [field_ix for field_ix, field in enumerate(header)
                     if field in header_fields_to_check]

    # Correction factor for header being the first line
    correction_ix = 1

    # Check for missing data
    for row_ix, row in enumerate(mapping_data):
        for curr_ix in check_indices:
            if len(row[curr_ix]) == 0:
                errors.append("Missing expected DNA sequence\t%d,%d" %
                              (row_ix + correction_ix, curr_ix))

    # Check for non-DNA characters; report each field at most once
    for row_ix, row in enumerate(mapping_data):
        for curr_ix in check_indices:
            curr_seq = row[curr_ix]
            if any(curr_nt not in valid_dna_chars for curr_nt in curr_seq):
                errors.append("Invalid DNA sequence detected: %s\t%d,%d" %
                              (curr_seq, row_ix + correction_ix, curr_ix))

    return errors
def test_init_validate(self):
    """Alignment(..., validate=True) raises on invalid input."""
    # valid input must construct without raising
    Alignment(self.seqs1, validate=True)

    valid_seqs = [self.d1, self.d2, self.d3]

    # invalid DNA character ('X')
    bad_character = valid_seqs + [DNASequence('.-ACC-GTXGC--', id="i1")]
    with self.assertRaises(SequenceCollectionError):
        Alignment(bad_character, validate=True)

    # invalid lengths (they're not all equal)
    bad_length = valid_seqs + [DNASequence('.-ACC-GTGC--', id="i2")]
    with self.assertRaises(SequenceCollectionError):
        Alignment(bad_length, validate=True)
def check_dna_chars_bcs(header,
                        mapping_data,
                        errors,
                        has_barcodes=True):
    """ Checks for valid DNA characters in barcode field

    When has_barcodes is True, every "BarcodeSequence" field is checked:
    an empty field is flagged as missing, and a field containing anything
    other than standard IUPAC DNA characters is flagged as invalid. Each
    offending field is reported once, however many invalid characters it
    contains.

    header:  list of header strings
    mapping_data:  list of lists of raw metadata mapping file data
    errors:  list of errors; appended to in place and also returned
    has_barcodes:  If True, the barcode field is checked; if False, no
     field is checked and errors is returned unchanged.
    """
    valid_dna_chars = DNASequence.iupac_standard_characters()

    # Detect fields directly, in case user does not have fields in proper
    # order in the mapping file (this will generate error separately)
    header_fields_to_check = []
    if has_barcodes:
        header_fields_to_check.append("BarcodeSequence")

    check_indices = [field_ix for field_ix, field in enumerate(header)
                     if field in header_fields_to_check]

    # Correction factor for header being the first line
    correction_ix = 1

    for row_ix, row in enumerate(mapping_data):
        for curr_ix in check_indices:
            curr_seq = row[curr_ix]

            # Check for missing data
            if len(curr_seq) == 0:
                errors.append("Missing expected DNA sequence\t%d,%d" %
                              (row_ix + correction_ix, curr_ix))
                continue

            # Check for non-DNA characters; report each field at most once
            if any(curr_nt not in valid_dna_chars for curr_nt in curr_seq):
                errors.append("Invalid DNA sequence detected: %s\t%d,%d" %
                              (curr_seq, row_ix + correction_ix, curr_ix))

    return errors
def setUp(self):
    """Create the DNASequence fixtures used by the tests."""
    self.empty = DNASequence('')
    self.b1 = DNASequence('GATTACA')

    # sequences carrying an id and a description
    self.b2 = DNASequence('ACCGGTACC',
                          id="test-seq-2",
                          description="A test sequence")
    # contains 'U'
    self.b3 = DNASequence('ACCGGUACC',
                          id="bad-seq-1",
                          description="Not a DNA sequence")
    self.b4 = DNASequence('MRWSYKVHDBN',
                          id="degen",
                          description="All of the degenerate bases")

    # sequence containing '.' and '-' gap characters
    self.b5 = DNASequence('.G--ATTAC-A...')
def setUp(self):
    """Create the DNASequence fixtures used by the tests."""
    self.empty = DNASequence('')
    self.b1 = DNASequence('GATTACA')

    # sequences carrying an identifier and a description
    self.b2 = DNASequence('ACCGGTACC',
                          identifier="test-seq-2",
                          description="A test sequence")
    # contains 'U'
    self.b3 = DNASequence('ACCGGUACC',
                          identifier="bad-seq-1",
                          description="Not a DNA sequence")
    self.b4 = DNASequence('MRWSYKVHDBN',
                          identifier="degen",
                          description="All of the degenerate bases")

    # sequence containing '.' and '-' gap characters
    self.b5 = DNASequence('.G--ATTAC-A...')
def test_is_reverse_complement(self):
    """is_reverse_complement is True only for the reverse complement."""
    # a sequence is not its own reverse complement here
    self.assertFalse(self.b1.is_reverse_complement(self.b1))

    for seq, rev_comp in ((self.b1, 'TGTAATC'), (self.b4, 'NVHDBMRSWYK')):
        self.assertTrue(seq.is_reverse_complement(DNASequence(rev_comp)))
def run_ampliconnoise(mapping_fp,
                      output_dir, command_handler, params, qiime_config,
                      logger=None, status_update_callback=print_to_stdout,
                      chimera_alpha=-3.8228, chimera_beta=0.6200,
                      sff_txt_fp=None, numnodes=2,
                      suppress_perseus=True, output_filepath=None,
                      platform='flx', seqnoise_resolution=None,
                      truncate_len=None):
    """ Run the ampliconnoise pipeline

    The steps performed by this function are:
    1. Split input sff.txt file into one file per sample
    2. Run scripts required for PyroNoise
    3. Run scripts required for SeqNoise
    4. Run scripts required for Perseus (chimera removal), unless
       suppress_perseus is True
    5. Merge output files into one file similar to the output of
       split_libraries.py

    mapping_fp: path to the metadata mapping file; must provide the
     SampleID, BarcodeSequence and LinkerPrimerSequence columns, and all
     samples must share a single primer (RuntimeError otherwise)
    output_dir: directory that is created, made the current working
     directory (and left so on return), and in which map.csv is written
    command_handler: called with the list of commands,
     status_update_callback, logger and close_logger_on_success
    logger: if None, a WorkflowLogger is created in output_dir and is
     closed on success
    platform: 'flx' or 'titanium'; any other value raises RuntimeError
    output_filepath should be absolute
    seqnoise_resolution and truncate_len should be strings; when None,
     platform-specific defaults are used

    environment variable PYRO_LOOKUP_FILE must be set correctly. Thus be
    careful passing command handlers that don't spawn child processes, as
    they may not inherit the correct environment variable setting

    NOTE(review): map.csv is opened via os.path.join(output_dir, ...)
    after changing into output_dir, so output_dir presumably needs to be
    an absolute path as well -- confirm against callers.
    NOTE(review): sample ids, barcodes and paths are interpolated into
    shell command strings without quoting.
    """
    with open(mapping_fp, 'U') as mapping_f:
        map_data, headers, comments = parse_mapping_file(mapping_f)
    create_dir(output_dir)

    if seqnoise_resolution is None:
        if platform == 'flx':
            seqnoise_resolution = '30.0'
        elif platform == 'titanium':
            seqnoise_resolution = '25.0'
        else:
            raise RuntimeError('seqnoise_resolution not set, and no' +
                               ' default for platform ' + platform)

    if truncate_len is None:
        if platform == 'flx':
            truncate_len = '220'
        elif platform == 'titanium':
            truncate_len = '400'
        else:
            raise RuntimeError('truncate_len not set, and no' +
                               ' default for platform ' + platform)

    # these are filenames minus extension, and are sample IDs
    sample_names = []
    primer_seqs = []  # same order as sample_names
    bc_seqs = []  # same order as sample_names
    degeneracies = DNASequence.iupac_degeneracies()
    for row in map_data:
        sample_names.append(row[headers.index('SampleID')])
        bc_seqs.append(row[headers.index('BarcodeSequence')])
        primer = row[headers.index('LinkerPrimerSequence')]
        # expand each degenerate character into a bracketed character
        # class, e.g. a code with bases A and G becomes '[AG]'
        for char, bases in degeneracies.iteritems():
            primer = primer.replace(char, '[' + ''.join(bases) + ']')
        primer_seqs.append(primer)

    if len(set(primer_seqs)) != 1:
        raise RuntimeError(
            'Error: only one primer per mapping file supported.')
    one_primer = primer_seqs[0]

    commands = []

    if logger is None:
        logger = WorkflowLogger(generate_log_fp(output_dir),
                                params=params,
                                qiime_config=qiime_config)
        close_logger_on_success = True
    else:
        close_logger_on_success = False
    log_input_md5s(logger, [mapping_fp, sff_txt_fp])

    # execute commands in output_dir
    called_dir = os.getcwd()
    os.chdir(output_dir)
    with open(os.path.join(output_dir, 'map.csv'), 'w') as fh:
        for sample_name, bc_seq in zip(sample_names, bc_seqs):
            fh.write(sample_name + ',' + bc_seq + '\n')

    # these are the fasta results, e.g. PC.636_Good.fa
    # later we merge them and copy to output file
    post_pyro_tail = '_' + truncate_len
    if suppress_perseus:
        fasta_result_names = [
            sample_name + post_pyro_tail + '_seqnoise_cd.fa'
            for sample_name in sample_names]
    else:
        fasta_result_names = [
            sample_name + '_Good.fa' for sample_name in sample_names]

    cmd = 'cd ' + output_dir  # see also os.chdir above
    commands.append([('change to output dir', cmd)])

    cmd = 'echo $PYRO_LOOKUP_FILE > pyro_lookup_filepath.txt'
    commands.append([('confirm pyro lookup filepath environment variable',
                      cmd)])

    cmd = ('SplitKeys.pl ' + one_primer + ' map.csv < ' +
           os.path.join(called_dir, sff_txt_fp) +
           ' > splitkeys_log.txt 2> unassigned.fna')
    commands.append([('split sff.txt via barcodes (keys)', cmd)])

    for i, sample_name in enumerate(sample_names):

        # Build the platform-specific flow cleaning command
        if platform == 'flx':
            cmd = ('Clean360.pl ' + one_primer + ' ' + sample_name +
                   ' < ' + sample_name + '.raw')
            commands.append([('clean flows ' + sample_name, cmd)])
        elif platform == 'titanium':
            cmd = ('CleanMinMax.pl ' + one_primer + ' ' + sample_name +
                   ' < ' + sample_name + '.raw')
            commands.append([('clean flows ' + sample_name, cmd)])
        else:
            raise RuntimeError("platform " + platform + " not supported")

        cmd = ("mpirun -np " + str(numnodes) + " PyroDist -in " +
               sample_name + ".dat -out " +
               sample_name + " > " + sample_name + ".pdout")
        commands.append([('pyrodist ' + sample_name, cmd)])

        cmd = ("FCluster -in " + sample_name + ".fdist -out " +
               sample_name + " > " + sample_name + ".fcout")
        commands.append([('fcluster pyrodist ' + sample_name, cmd)])

        # e.g.:
        # mpirun -np 2 PyroNoise -din PC.354.dat -out PC.354_pyronoise -lin
        # PC.354.list -s 60.0 -c 0.01 > PC.354_pyronoise.pnout
        cmd = ("mpirun -np " + str(numnodes) + " PyroNoise -din " +
               sample_name + ".dat -out " +
               sample_name + "_pyronoise " + "-lin " +
               sample_name + ".list -s 60.0 -c 0.01 > " +
               sample_name + "_pyronoise.pnout")
        commands.append([('pyronoise ' + sample_name, cmd)])

        cmd = ('Parse.pl ' + bc_seqs[i] + one_primer + ' ' +
               truncate_len + ' < ' +
               sample_name + '_pyronoise_cd.fa' + ' > ' +
               sample_name + '_' + truncate_len + '.fa')
        commands.append([('truncate ' + sample_name, cmd)])

        # now start with post_pyro_tail
        cmd = ("mpirun -np " + str(numnodes) + " SeqDist -in " +
               sample_name + post_pyro_tail +
               ".fa > " + sample_name + post_pyro_tail + ".seqdist")
        commands.append([('seqdist ' + sample_name, cmd)])

        cmd = ("FCluster -in " + sample_name + post_pyro_tail +
               ".seqdist -out " +
               sample_name + post_pyro_tail + "fcl > " +
               sample_name + post_pyro_tail + ".fcout")
        commands.append([('fcluster seqdist ' + sample_name, cmd)])

        # e.g.:
        # mpirun -np 2 SeqNoise -in PC.354_pyronoise_cd.fa -din
        # PC.354_pyronoise_cd.seqdist -out PC.354_pyronoise_cd_seqnoise
        # -lin PC.354_pyronoise_cdfcl.list -min PC.354_pyronoise.mapping
        # -s 30.0 -c 0.08 > PC.354_pyronoise_cd.snout
        cmd = ("mpirun -np " + str(numnodes) + " SeqNoise -in " +
               sample_name + post_pyro_tail +
               ".fa -din " + sample_name + post_pyro_tail +
               ".seqdist -out " +
               sample_name + post_pyro_tail +
               "_seqnoise -lin " + sample_name + post_pyro_tail +
               'fcl.list -min ' +
               sample_name + '_pyronoise' +
               '.mapping -s ' + seqnoise_resolution + ' -c 0.08 > ' +
               sample_name + post_pyro_tail + '.snout')
        commands.append([('seqnoise ' + sample_name, cmd)])

        if not suppress_perseus:
            cmd = ('Perseus -sin ' + sample_name + post_pyro_tail +
                   '_seqnoise_cd.fa > ' +
                   sample_name + '.per')
            commands.append([('Perseus ' + sample_name, cmd)])

            cmd = ('Class.pl ' + sample_name + '.per ' +
                   str(chimera_alpha) + ' ' + str(chimera_beta) +
                   ' > ' + sample_name + '.class')
            commands.append([('Class.pl ' + sample_name, cmd)])

            cmd = ('FilterGoodClass.pl ' + sample_name + post_pyro_tail +
                   '_seqnoise_cd.fa ' +
                   sample_name + '.class 0.5 > ' +
                   sample_name + '_Chi.fa 2> ' +
                   sample_name + '_Good.fa')
            commands.append([('FilterGoodClass ' + sample_name, cmd)])

        cmd = 'unweight_fasta.py -i %s -o %s -l %s' % (
            fasta_result_names[i], sample_name + '_unw.fna', sample_name)
        commands.append([('unweight fasta ' + sample_name, cmd)])

    cmd = ('cat ' +
           ' '.join([name + '_unw.fna' for name in sample_names]) +
           ' > ' + output_filepath)  # this should be an abs filepath
    commands.append([('cat into one fasta file', cmd)])

    # Call the command handler on the list of commands
    command_handler(commands,
                    status_update_callback,
                    logger=logger,
                    close_logger_on_success=close_logger_on_success)
def run_ampliconnoise(mapping_fp,
                      output_dir, command_handler, params, qiime_config,
                      logger=None, status_update_callback=print_to_stdout,
                      chimera_alpha=-3.8228, chimera_beta=0.6200,
                      sff_txt_fp=None, numnodes=2,
                      suppress_perseus=True, output_filepath=None,
                      platform='flx', seqnoise_resolution=None,
                      truncate_len=None):
    """ Run the ampliconnoise pipeline

    The steps performed by this function are:
    1. Split input sff.txt file into one file per sample
    2. Run scripts required for PyroNoise
    3. Run scripts required for SeqNoise
    4. Run scripts required for Perseus (chimera removal), unless
       suppress_perseus is True
    5. Merge output files into one file similar to the output of
       split_libraries.py

    mapping_fp: path to the metadata mapping file; must provide the
     SampleID, BarcodeSequence and LinkerPrimerSequence columns, and all
     samples must share a single primer (RuntimeError otherwise)
    output_dir: directory that is created, made the current working
     directory (and left so on return), and in which map.csv is written
    command_handler: called with the list of commands,
     status_update_callback, logger and close_logger_on_success
    logger: if None, a WorkflowLogger is created in output_dir and is
     closed on success
    platform: 'flx' or 'titanium'; any other value raises RuntimeError
    output_filepath should be absolute
    seqnoise_resolution and truncate_len should be strings; when None,
     platform-specific defaults are used

    environment variable PYRO_LOOKUP_FILE must be set correctly. Thus be
    careful passing command handlers that don't spawn child processes, as
    they may not inherit the correct environment variable setting

    NOTE(review): map.csv is opened via os.path.join(output_dir, ...)
    after changing into output_dir, so output_dir presumably needs to be
    an absolute path as well -- confirm against callers.
    NOTE(review): sample ids, barcodes and paths are interpolated into
    shell command strings without quoting.
    """
    with open(mapping_fp, 'U') as mapping_f:
        map_data, headers, comments = parse_mapping_file(mapping_f)
    create_dir(output_dir)

    if seqnoise_resolution is None:
        if platform == 'flx':
            seqnoise_resolution = '30.0'
        elif platform == 'titanium':
            seqnoise_resolution = '25.0'
        else:
            raise RuntimeError('seqnoise_resolution not set, and no' +
                               ' default for platform ' + platform)

    if truncate_len is None:
        if platform == 'flx':
            truncate_len = '220'
        elif platform == 'titanium':
            truncate_len = '400'
        else:
            raise RuntimeError('truncate_len not set, and no' +
                               ' default for platform ' + platform)

    # these are filenames minus extension, and are sample IDs
    sample_names = []
    primer_seqs = []  # same order as sample_names
    bc_seqs = []  # same order as sample_names
    degeneracies = DNASequence.iupac_degeneracies()
    for row in map_data:
        sample_names.append(row[headers.index('SampleID')])
        bc_seqs.append(row[headers.index('BarcodeSequence')])
        primer = row[headers.index('LinkerPrimerSequence')]
        # expand each degenerate character into a bracketed character
        # class, e.g. a code with bases A and G becomes '[AG]'
        for char, bases in degeneracies.iteritems():
            primer = primer.replace(char, '[' + ''.join(bases) + ']')
        primer_seqs.append(primer)

    if len(set(primer_seqs)) != 1:
        raise RuntimeError(
            'Error: only one primer per mapping file supported.')
    one_primer = primer_seqs[0]

    commands = []

    if logger is None:
        logger = WorkflowLogger(generate_log_fp(output_dir),
                                params=params,
                                qiime_config=qiime_config)
        close_logger_on_success = True
    else:
        close_logger_on_success = False
    log_input_md5s(logger, [mapping_fp, sff_txt_fp])

    # execute commands in output_dir
    called_dir = os.getcwd()
    os.chdir(output_dir)
    with open(os.path.join(output_dir, 'map.csv'), 'w') as fh:
        for sample_name, bc_seq in zip(sample_names, bc_seqs):
            fh.write(sample_name + ',' + bc_seq + '\n')

    # these are the fasta results, e.g. PC.636_Good.fa
    # later we merge them and copy to output file
    post_pyro_tail = '_' + truncate_len
    if suppress_perseus:
        fasta_result_names = [
            sample_name + post_pyro_tail + '_seqnoise_cd.fa'
            for sample_name in sample_names]
    else:
        fasta_result_names = [
            sample_name + '_Good.fa' for sample_name in sample_names]

    cmd = 'cd ' + output_dir  # see also os.chdir above
    commands.append([('change to output dir', cmd)])

    cmd = 'echo $PYRO_LOOKUP_FILE > pyro_lookup_filepath.txt'
    commands.append([('confirm pyro lookup filepath environment variable',
                      cmd)])

    cmd = ('SplitKeys.pl ' + one_primer + ' map.csv < ' +
           os.path.join(called_dir, sff_txt_fp) +
           ' > splitkeys_log.txt 2> unassigned.fna')
    commands.append([('split sff.txt via barcodes (keys)', cmd)])

    for i, sample_name in enumerate(sample_names):

        # Build the platform-specific flow cleaning command
        if platform == 'flx':
            cmd = ('Clean360.pl ' + one_primer + ' ' + sample_name +
                   ' < ' + sample_name + '.raw')
            commands.append([('clean flows ' + sample_name, cmd)])
        elif platform == 'titanium':
            cmd = ('CleanMinMax.pl ' + one_primer + ' ' + sample_name +
                   ' < ' + sample_name + '.raw')
            commands.append([('clean flows ' + sample_name, cmd)])
        else:
            raise RuntimeError("platform " + platform + " not supported")

        cmd = ("mpirun -np " + str(numnodes) + " PyroDist -in " +
               sample_name + ".dat -out " +
               sample_name + " > " + sample_name + ".pdout")
        commands.append([('pyrodist ' + sample_name, cmd)])

        cmd = ("FCluster -in " + sample_name + ".fdist -out " +
               sample_name + " > " + sample_name + ".fcout")
        commands.append([('fcluster pyrodist ' + sample_name, cmd)])

        # e.g.:
        # mpirun -np 2 PyroNoise -din PC.354.dat -out PC.354_pyronoise -lin
        # PC.354.list -s 60.0 -c 0.01 > PC.354_pyronoise.pnout
        cmd = ("mpirun -np " + str(numnodes) + " PyroNoise -din " +
               sample_name + ".dat -out " +
               sample_name + "_pyronoise " + "-lin " +
               sample_name + ".list -s 60.0 -c 0.01 > " +
               sample_name + "_pyronoise.pnout")
        commands.append([('pyronoise ' + sample_name, cmd)])

        cmd = ('Parse.pl ' + bc_seqs[i] + one_primer + ' ' +
               truncate_len + ' < ' +
               sample_name + '_pyronoise_cd.fa' + ' > ' +
               sample_name + '_' + truncate_len + '.fa')
        commands.append([('truncate ' + sample_name, cmd)])

        # now start with post_pyro_tail
        cmd = ("mpirun -np " + str(numnodes) + " SeqDist -in " +
               sample_name + post_pyro_tail +
               ".fa > " + sample_name + post_pyro_tail + ".seqdist")
        commands.append([('seqdist ' + sample_name, cmd)])

        cmd = ("FCluster -in " + sample_name + post_pyro_tail +
               ".seqdist -out " +
               sample_name + post_pyro_tail + "fcl > " +
               sample_name + post_pyro_tail + ".fcout")
        commands.append([('fcluster seqdist ' + sample_name, cmd)])

        # e.g.:
        # mpirun -np 2 SeqNoise -in PC.354_pyronoise_cd.fa -din
        # PC.354_pyronoise_cd.seqdist -out PC.354_pyronoise_cd_seqnoise
        # -lin PC.354_pyronoise_cdfcl.list -min PC.354_pyronoise.mapping
        # -s 30.0 -c 0.08 > PC.354_pyronoise_cd.snout
        cmd = ("mpirun -np " + str(numnodes) + " SeqNoise -in " +
               sample_name + post_pyro_tail +
               ".fa -din " + sample_name + post_pyro_tail +
               ".seqdist -out " +
               sample_name + post_pyro_tail +
               "_seqnoise -lin " + sample_name + post_pyro_tail +
               'fcl.list -min ' +
               sample_name + '_pyronoise' +
               '.mapping -s ' + seqnoise_resolution + ' -c 0.08 > ' +
               sample_name + post_pyro_tail + '.snout')
        commands.append([('seqnoise ' + sample_name, cmd)])

        if not suppress_perseus:
            cmd = ('Perseus -sin ' + sample_name + post_pyro_tail +
                   '_seqnoise_cd.fa > ' +
                   sample_name + '.per')
            commands.append([('Perseus ' + sample_name, cmd)])

            cmd = ('Class.pl ' + sample_name + '.per ' +
                   str(chimera_alpha) + ' ' + str(chimera_beta) +
                   ' > ' + sample_name + '.class')
            commands.append([('Class.pl ' + sample_name, cmd)])

            cmd = ('FilterGoodClass.pl ' + sample_name + post_pyro_tail +
                   '_seqnoise_cd.fa ' +
                   sample_name + '.class 0.5 > ' +
                   sample_name + '_Chi.fa 2> ' +
                   sample_name + '_Good.fa')
            commands.append([('FilterGoodClass ' + sample_name, cmd)])

        cmd = 'unweight_fasta.py -i %s -o %s -l %s' % (
            fasta_result_names[i], sample_name + '_unw.fna', sample_name)
        commands.append([('unweight fasta ' + sample_name, cmd)])

    cmd = ('cat ' +
           ' '.join([name + '_unw.fna' for name in sample_names]) +
           ' > ' + output_filepath)  # this should be an abs filepath
    commands.append([('cat into one fasta file', cmd)])

    # Call the command handler on the list of commands
    command_handler(commands,
                    status_update_callback,
                    logger=logger,
                    close_logger_on_success=close_logger_on_success)
def test_iupac_standard_characters(self):
    """iupac_standard_characters matches on the instance and the class."""
    # the four standard bases in both cases
    exp = set("ACGT") | set("acgt")
    for source in (self.b1, DNASequence):
        self.assertEqual(source.iupac_standard_characters(), exp)
class DNASequenceTests(TestCase):
    """Tests for DNASequence.

    Covers the alphabet accessors, the complement map, IUPAC degeneracy
    tables, (reverse) complementing, unsupported-character detection and
    expansion of degenerate sequences.
    """

    def setUp(self):
        """Create the sequence fixtures shared by the tests below."""
        self.empty = DNASequence('')
        self.b1 = DNASequence('GATTACA')
        self.b2 = DNASequence(
            'ACCGGTACC',
            id="test-seq-2",
            description="A test sequence")
        # Contains a 'U', which the tests expect to be flagged as
        # unsupported and to make complementing fail.
        self.b3 = DNASequence(
            'ACCGGUACC',
            id="bad-seq-1",
            description="Not a DNA sequence")
        self.b4 = DNASequence(
            'MRWSYKVHDBN',
            id="degen",
            description="All of the degenerate bases")
        # Gapped fixture mixing '.' and '-' characters.
        self.b5 = DNASequence('.G--ATTAC-A...')

    def test_alphabet(self):
        """alphabet() matches the expected set on instance and class."""
        upper = 'ACBDGHKMNSRTWVY'
        expected = set(upper) | set(upper.lower())
        for owner in (self.b1, DNASequence):
            self.assertEqual(owner.alphabet(), expected)

    def test_gap_alphabet(self):
        """gap_alphabet() is exactly '-' and '.'."""
        expected = {'-', '.'}
        self.assertEqual(self.b1.gap_alphabet(), expected)

    def test_complement_map(self):
        """complement_map() pairs each character with its complement."""
        bases = 'ACBDGHKMNSRTWVY'
        partners = 'TGVHCDMKNSYAWBR'
        # Gap characters map to themselves.
        expected = {'-': '-', '.': '.'}
        expected.update(zip(bases, partners))
        expected.update(zip(bases.lower(), partners.lower()))
        for owner in (self.b1, DNASequence):
            self.assertEqual(owner.complement_map(), expected)

    def test_iupac_standard_characters(self):
        """iupac_standard_characters() is A, C, G, T in both cases."""
        expected = set("ACGTacgt")
        for owner in (self.b1, DNASequence):
            self.assertEqual(owner.iupac_standard_characters(), expected)

    def test_iupac_degeneracies(self):
        """iupac_degeneracies() maps each degenerate code to its bases."""
        upper_table = {
            'B': 'CTG', 'D': 'ATG', 'H': 'ACT', 'K': 'TG', 'M': 'AC',
            'N': 'ACTG', 'S': 'CG', 'R': 'AG', 'W': 'AT', 'V': 'ACG',
            'Y': 'CT'
        }
        # The lower-case entries mirror the upper-case ones exactly.
        expected = {}
        for code, members in upper_table.items():
            expected[code] = set(members)
            expected[code.lower()] = set(members.lower())
        for owner in (self.b1, DNASequence):
            self.assertEqual(owner.iupac_degeneracies(), expected)

    def test_iupac_degenerate_characters(self):
        """iupac_degenerate_characters() lists every degenerate code."""
        upper = 'BDHKMNSRWVY'
        expected = set(upper + upper.lower())
        for owner in (self.b1, DNASequence):
            self.assertEqual(owner.iupac_degenerate_characters(), expected)

    def test_iupac_characters(self):
        """iupac_characters() matches the expected set in both cases."""
        upper = 'ACBDGHKMNSRTWVY'
        expected = set(upper) | set(upper.lower())
        for owner in (self.b1, DNASequence):
            self.assertEqual(owner.iupac_characters(), expected)

    def test_complement(self):
        """complement() swaps each base and keeps gaps in place."""
        for seq, comp in ((self.b1, "CTAATGT"), (self.b2, "TGGCCATGG")):
            self.assertEqual(seq.complement(), DNASequence(comp))
        # b3 holds a 'U' and is expected to be rejected.
        with self.assertRaises(BiologicalSequenceError):
            self.b3.complement()
        for seq, comp in ((self.b4, "KYWSRMBDHVN"),
                          (self.b5, ".C--TAATG-T...")):
            self.assertEqual(seq.complement(), DNASequence(comp))

    def test_reverse_complement(self):
        """reverse_complement() complements and reverses the sequence."""
        for seq, revcomp in ((self.b1, "TGTAATC"), (self.b2, "GGTACCGGT")):
            self.assertEqual(seq.reverse_complement(), DNASequence(revcomp))
        with self.assertRaises(BiologicalSequenceError):
            self.b3.reverse_complement()
        observed = self.b4.reverse_complement()
        self.assertEqual(observed, DNASequence("NVHDBMRSWYK"))

    def test_unsupported_characters(self):
        """unsupported_characters() reports only the 'U' found in b3."""
        cases = (
            (self.b1, set()),
            (self.b2, set()),
            (self.b3, {'U'}),
            (self.b4, set()),
        )
        for seq, expected in cases:
            self.assertEqual(seq.unsupported_characters(), expected)

    def test_has_unsupported_characters(self):
        """has_unsupported_characters() is true only for b3."""
        for seq in (self.b1, self.b2):
            self.assertFalse(seq.has_unsupported_characters())
        self.assertTrue(self.b3.has_unsupported_characters())
        self.assertFalse(self.b4.has_unsupported_characters())

    def test_is_reverse_complement(self):
        """is_reverse_complement() accepts only the true reverse complement."""
        self.assertFalse(self.b1.is_reverse_complement(self.b1))
        b1_revcomp = DNASequence('TGTAATC')
        self.assertTrue(self.b1.is_reverse_complement(b1_revcomp))
        b4_revcomp = DNASequence('NVHDBMRSWYK')
        self.assertTrue(self.b4.is_reverse_complement(b4_revcomp))

    def test_nondegenerates_invalid(self):
        """Expanding a sequence with an invalid character raises."""
        with self.assertRaises(BiologicalSequenceError):
            invalid = DNASequence('AZA')
            # list() forces the expansion to actually run.
            list(invalid.nondegenerates())

    def test_nondegenerates_empty(self):
        """An empty sequence expands to just itself."""
        observed = list(self.empty.nondegenerates())
        self.assertEqual(observed, [self.empty])

    def test_nondegenerates_no_degens(self):
        """A sequence without degenerate codes expands to just itself."""
        observed = list(self.b1.nondegenerates())
        self.assertEqual(observed, [self.b1])

    def test_nondegenerates_all_degens(self):
        """Sequences made only of degenerate codes expand fully."""
        # Same chars.
        expected = [DNASequence(s) for s in ('CC', 'CG', 'GC', 'GG')]
        # Sort based on sequence string, as order is not guaranteed.
        observed = sorted(DNASequence('SS').nondegenerates(), key=str)
        self.assertEqual(observed, expected)

        # Different chars.
        expected = [DNASequence(s) for s in ('AC', 'AG', 'GC', 'GG')]
        observed = sorted(DNASequence('RS').nondegenerates(), key=str)
        self.assertEqual(observed, expected)

        # Odd number of chars.
        observed = list(DNASequence('NNN').nondegenerates())
        self.assertEqual(len(observed), 4 ** 3)

    def test_nondegenerates_mixed_degens(self):
        """Only the degenerate positions vary in the expansion."""
        expected = [DNASequence(s) for s in ('AGC', 'AGT', 'GGC', 'GGT')]
        observed = sorted(DNASequence('RGY').nondegenerates(), key=str)
        self.assertEqual(observed, expected)

    def test_nondegenerates_gap_mixed_case(self):
        """Gaps are kept and the case of each degenerate code is preserved."""
        expected = [DNASequence(s)
                    for s in ('-A.a', '-A.c', '-C.a', '-C.c')]
        observed = sorted(DNASequence('-M.m').nondegenerates(), key=str)
        self.assertEqual(observed, expected)
class DNASequenceTests(TestCase):
    """Exercise the DNASequence API against hand-written expectations.

    The accessor tests check each method both on an instance and on the
    class itself; the remaining tests cover complementing, detection of
    unsupported characters and expansion of degenerate codes.
    """

    def setUp(self):
        """Prepare the fixtures used throughout this test case."""
        self.empty = DNASequence('')
        self.b1 = DNASequence('GATTACA')
        self.b2 = DNASequence('ACCGGTACC',
                              id="test-seq-2",
                              description="A test sequence")
        # The 'U' makes this the fixture for the error-path assertions.
        self.b3 = DNASequence('ACCGGUACC',
                              id="bad-seq-1",
                              description="Not a DNA sequence")
        self.b4 = DNASequence('MRWSYKVHDBN',
                              id="degen",
                              description="All of the degenerate bases")
        # Sequence with leading, internal and trailing gap characters.
        self.b5 = DNASequence('.G--ATTAC-A...')

    def _assert_on_instance_and_class(self, method_name, expected):
        """Call `method_name` on self.b1, then on DNASequence, and compare."""
        for owner in (self.b1, DNASequence):
            self.assertEqual(getattr(owner, method_name)(), expected)

    def test_alphabet(self):
        """alphabet() returns the expected characters in both cases."""
        codes = 'ACBDGHKMNSRTWVY'
        self._assert_on_instance_and_class(
            'alphabet', set(codes + codes.lower()))

    def test_gap_alphabet(self):
        """gap_alphabet() contains '-' and '.' only."""
        gaps = set('-.')
        self.assertEqual(self.b1.gap_alphabet(), gaps)

    def test_complement_map(self):
        """complement_map() matches the expected character pairing."""
        originals = '-.' + 'ACBDGHKMNSRTWVY' + 'acbdghkmnsrtwvy'
        complements = '-.' + 'TGVHCDMKNSYAWBR' + 'tgvhcdmknsyawbr'
        expected = dict(zip(originals, complements))
        self._assert_on_instance_and_class('complement_map', expected)

    def test_iupac_standard_characters(self):
        """iupac_standard_characters() returns A, C, G, T in both cases."""
        self._assert_on_instance_and_class(
            'iupac_standard_characters', set("ACGTacgt"))

    def test_iupac_degeneracies(self):
        """iupac_degeneracies() expands every degenerate code correctly."""
        pairs = (
            ('B', 'CTG'), ('D', 'ATG'), ('H', 'ACT'), ('K', 'TG'),
            ('M', 'AC'), ('N', 'ACTG'), ('S', 'CG'), ('R', 'AG'),
            ('W', 'AT'), ('V', 'ACG'), ('Y', 'CT'),
        )
        expected = dict((code, set(bases)) for code, bases in pairs)
        # Lower-case codes expand to the lower-cased bases.
        expected.update(
            (code.lower(), set(bases.lower())) for code, bases in pairs)
        self._assert_on_instance_and_class('iupac_degeneracies', expected)

    def test_iupac_degenerate_characters(self):
        """iupac_degenerate_characters() returns all degenerate codes."""
        expected = set('BDHKMNSRWVY') | set('bdhkmnsrwvy')
        self._assert_on_instance_and_class(
            'iupac_degenerate_characters', expected)

    def test_iupac_characters(self):
        """iupac_characters() returns the expected characters in both cases."""
        codes = 'ACBDGHKMNSRTWVY'
        self._assert_on_instance_and_class(
            'iupac_characters', set(codes + codes.lower()))

    def test_complement(self):
        """complement() works for plain, degenerate and gapped input."""
        observed = self.b1.complement()
        self.assertEqual(observed, DNASequence("CTAATGT"))
        observed = self.b2.complement()
        self.assertEqual(observed, DNASequence("TGGCCATGG"))
        # The 'U' in b3 must cause a failure.
        with self.assertRaises(BiologicalSequenceError):
            self.b3.complement()
        observed = self.b4.complement()
        self.assertEqual(observed, DNASequence("KYWSRMBDHVN"))
        observed = self.b5.complement()
        self.assertEqual(observed, DNASequence(".C--TAATG-T..."))

    def test_reverse_complement(self):
        """reverse_complement() works for plain and degenerate input."""
        observed = self.b1.reverse_complement()
        self.assertEqual(observed, DNASequence("TGTAATC"))
        observed = self.b2.reverse_complement()
        self.assertEqual(observed, DNASequence("GGTACCGGT"))
        with self.assertRaises(BiologicalSequenceError):
            self.b3.reverse_complement()
        observed = self.b4.reverse_complement()
        self.assertEqual(observed, DNASequence("NVHDBMRSWYK"))

    def test_unsupported_characters(self):
        """unsupported_characters() is empty except for b3's 'U'."""
        nothing = set()
        self.assertEqual(self.b1.unsupported_characters(), nothing)
        self.assertEqual(self.b2.unsupported_characters(), nothing)
        self.assertEqual(self.b3.unsupported_characters(), {'U'})
        self.assertEqual(self.b4.unsupported_characters(), nothing)

    def test_has_unsupported_characters(self):
        """has_unsupported_characters() flags b3 and nothing else."""
        b1_flag = self.b1.has_unsupported_characters()
        self.assertFalse(b1_flag)
        b2_flag = self.b2.has_unsupported_characters()
        self.assertFalse(b2_flag)
        b3_flag = self.b3.has_unsupported_characters()
        self.assertTrue(b3_flag)
        b4_flag = self.b4.has_unsupported_characters()
        self.assertFalse(b4_flag)

    def test_is_reverse_complement(self):
        """is_reverse_complement() rejects self and accepts the real one."""
        self.assertFalse(self.b1.is_reverse_complement(self.b1))
        for seq, revcomp in ((self.b1, 'TGTAATC'),
                             (self.b4, 'NVHDBMRSWYK')):
            self.assertTrue(seq.is_reverse_complement(DNASequence(revcomp)))

    def test_nondegenerates_invalid(self):
        """nondegenerates() raises for a sequence containing 'Z'."""
        with self.assertRaises(BiologicalSequenceError):
            expansion = DNASequence('AZA').nondegenerates()
            # Consume the result so the error surfaces inside the block.
            list(expansion)

    def test_nondegenerates_empty(self):
        """nondegenerates() of the empty sequence is the empty sequence."""
        expansions = list(self.empty.nondegenerates())
        self.assertEqual(expansions, [self.empty])

    def test_nondegenerates_no_degens(self):
        """nondegenerates() of a plain sequence is that sequence."""
        expansions = list(self.b1.nondegenerates())
        self.assertEqual(expansions, [self.b1])

    def test_nondegenerates_all_degens(self):
        """nondegenerates() enumerates every combination of bases."""
        # Same chars.
        wanted = list(map(DNASequence, ('CC', 'CG', 'GC', 'GG')))
        # Sort based on sequence string, as order is not guaranteed.
        got = sorted(DNASequence('SS').nondegenerates(), key=str)
        self.assertEqual(got, wanted)

        # Different chars.
        wanted = list(map(DNASequence, ('AC', 'AG', 'GC', 'GG')))
        got = sorted(DNASequence('RS').nondegenerates(), key=str)
        self.assertEqual(got, wanted)

        # Odd number of chars.
        got = list(DNASequence('NNN').nondegenerates())
        self.assertEqual(len(got), 4 ** 3)

    def test_nondegenerates_mixed_degens(self):
        """nondegenerates() leaves non-degenerate positions untouched."""
        wanted = list(map(DNASequence, ('AGC', 'AGT', 'GGC', 'GGT')))
        got = sorted(DNASequence('RGY').nondegenerates(), key=str)
        self.assertEqual(got, wanted)

    def test_nondegenerates_gap_mixed_case(self):
        """nondegenerates() keeps gaps and respects character case."""
        wanted = list(map(DNASequence, ('-A.a', '-A.c', '-C.a', '-C.c')))
        got = sorted(DNASequence('-M.m').nondegenerates(), key=str)
        self.assertEqual(got, wanted)
def setUp(self):
    """Create the DNASequence fixtures (empty, b1-b5) used by the tests."""
    self.empty = DNASequence('')
    self.b1 = DNASequence('GATTACA')
    self.b2 = DNASequence(
        'ACCGGTACC',
        id="test-seq-2",
        description="A test sequence")
    # Contains a 'U'; per its description this is the non-DNA fixture.
    self.b3 = DNASequence(
        'ACCGGUACC',
        id="bad-seq-1",
        description="Not a DNA sequence")
    self.b4 = DNASequence(
        'MRWSYKVHDBN',
        id="degen",
        description="All of the degenerate bases")
    # Gapped fixture mixing '.' and '-' characters.
    self.b5 = DNASequence('.G--ATTAC-A...')
def test_iupac_standard_characters(self):
    """Instance and class both report A, C, G, T in upper and lower case."""
    standard = {'A', 'C', 'G', 'T', 'a', 'c', 'g', 't'}
    from_instance = self.b1.iupac_standard_characters()
    self.assertEqual(from_instance, standard)
    from_class = DNASequence.iupac_standard_characters()
    self.assertEqual(from_class, standard)
def test_complement(self):
    """complement() swaps each base, keeps gaps, and raises for b3."""
    for seq, comp in ((self.b1, "CTAATGT"), (self.b2, "TGGCCATGG")):
        self.assertEqual(seq.complement(), DNASequence(comp))
    # b3 is the fixture expected to be rejected.
    with self.assertRaises(BiologicalSequenceError):
        self.b3.complement()
    for seq, comp in ((self.b4, "KYWSRMBDHVN"),
                      (self.b5, ".C--TAATG-T...")):
        self.assertEqual(seq.complement(), DNASequence(comp))
def test_nondegenerates_invalid(self):
    """Expanding a sequence that contains 'Z' raises an error."""
    with self.assertRaises(BiologicalSequenceError):
        invalid = DNASequence('AZA')
        # list() consumes the result so the expansion actually runs.
        list(invalid.nondegenerates())
def test_iupac_degenerate_characters(self):
    """Instance and class both report every degenerate code, both cases."""
    upper = 'BDHKMNSRWVY'
    expected = set(upper) | set(upper.lower())
    for owner in (self.b1, DNASequence):
        self.assertEqual(owner.iupac_degenerate_characters(), expected)
def test_nondegenerates_all_degens(self):
    """Sequences made only of degenerate codes expand to all combinations."""
    # Same chars.
    expected = [DNASequence(s) for s in ('CC', 'CG', 'GC', 'GG')]
    # Sort based on sequence string, as order is not guaranteed.
    observed = sorted(DNASequence('SS').nondegenerates(), key=str)
    self.assertEqual(observed, expected)

    # Different chars.
    expected = [DNASequence(s) for s in ('AC', 'AG', 'GC', 'GG')]
    observed = sorted(DNASequence('RS').nondegenerates(), key=str)
    self.assertEqual(observed, expected)

    # Odd number of chars.
    observed = list(DNASequence('NNN').nondegenerates())
    self.assertEqual(len(observed), 4 ** 3)