def test_empty_inputs(self):
    """Check indexdb output placement and SortMeRNA empty-path failures.

    (1) Indexdb should set output_dir to the same directory as where
        the input FASTA file is located;
    (2) SortMeRNA should fail if an empty result path is passed;
    (3) SortMeRNA should fail if an empty seq path is passed.
    """
    sortmerna_db, db_files_to_remove = build_database_sortmerna(
        abspath(self.file_reference_seq_fp),
        max_pos=250,
        output_dir=None)
    self.files_to_remove.extend(db_files_to_remove)
    # With output_dir=None the index must land next to the input FASTA
    fasta_dir = dirname(abspath(self.file_reference_seq_fp))
    out_dir = dirname(sortmerna_db)
    self.assertEqual(fasta_dir, out_dir)
    # Missing result path must raise
    self.assertRaises(ValueError,
                      sortmerna_ref_cluster,
                      seq_path=self.file_read_seqs_fp,
                      sortmerna_db=sortmerna_db,
                      refseqs_fp=self.file_reference_seq_fp,
                      result_path=None)
    # Missing sequence path must raise
    self.assertRaises(ValueError,
                      sortmerna_ref_cluster,
                      seq_path=None,
                      sortmerna_db=sortmerna_db,
                      refseqs_fp=self.file_reference_seq_fp,
                      result_path=join(self.output_dir,
                                       "sortmerna_otus.txt"))
def build_index_sortmerna(ref_fp, working_dir):
    """Build a SortMeRNA index for all reference databases.

    Parameters
    ----------
    ref_fp: tuple
        filepaths to FASTA reference databases
    working_dir: string
        working directory path

    Returns
    -------
    all_db: tuple
        filepaths to SortMeRNA indexed reference databases
    all_files_to_remove: list
        index files to remove
    """
    all_db = []
    all_files_to_remove = []
    # Index each reference database independently, collecting the
    # generated index files so the caller can clean them up later.
    for fasta_fp in ref_fp:
        indexed_db, index_files = build_database_sortmerna(
            fasta_path=fasta_fp,
            max_pos=10000,
            output_dir=working_dir)
        all_db.append(indexed_db)
        all_files_to_remove.extend(index_files)
    return tuple(all_db), all_files_to_remove
def test_remove_artifacts_seqs_index_prebuilt(self):
    """Filter non-reference sequences with a prebuilt SortMeRNA index.

    Test remove_artifacts_seqs() function for removing sequences not
    matching to a reference database using SortMeRNA. This test passes
    a built index.
    """
    seqs = [("seq1", "TACCCGCAGCTCAAGTGGTGGTCGCTATTATTGAGCCTAAAACGTCC"),
            ("seq2", "CCTAAAACGTCCGTAGTCGGCTTTGTAAATCCCTGGGTAAATCGGGT"),
            ("seq3", "TCGCTATTATTGAGCCTAAAACGTCCGTAGTCGGCTTTGTAAATCCC"),
            ("seq4", "TACCCGCAGCTCAAGTGGTGGTCGCTATTATTGAGCCTAAAACGTCC"),
            ("seq5", "CTAAAACGTCCGTAGTCGGCTTTGTAAATCCCTGGGTAAATAGGGTC"),
            ("seq6", "TTGAGCCTAAAACGTCCGTAGTCGGCTTTGTAAATCCCTGGGTAAAT"),
            ("phix1", "TCTAAAGGTAAAAAACGTTCTGGCGCTCGCCCTGGTCGTCCGCAGCC"),
            ("phix2", "CTGGCGCTCGCCCTGGTCGTCCGCAGCCGTTGCGAGGTACTAAAGGC"),
            ("phix3", "GCGCATAAATTTGAGCAGATTTGTCGTCACAGGTTGCGCCGCCAAAA")]
    # Only the 16S-like reads should survive filtering; phiX reads drop out
    exp_seqs = ["seq1", "seq2", "seq3", "seq4", "seq5", "seq6"]
    seqs_fp = join(self.working_dir, "seqs.fasta")
    with open(seqs_fp, 'w') as seqs_f:
        for seq in seqs:
            seqs_f.write(">%s\n%s\n" % seq)
    ref = [("ref1", "TACCCGCAGCTCAAGTGGTGGTCGCTATTATTGAGCCTAAAACGTCCGTA"
                    "GTCGGCTTTGTAAATCCCTGGGTAAATCGGGT"),
           ("ref2", "TACCCGCAGCTCAAGTGGTGGTCGCTATTATTGAGCCTAAAACGTCCGTAG"
                    "TCGGCTTTGTAAATCCCTGGGTAAATCGGGT"),
           ("ref3", "TACCCGCAGCTCAAGTGGTGGTCGCTATTATTGAGCCTAAAACGTCCGTAG"
                    "TCGGCTTTGTAAATCCCTGGGTAAATCGGGT"),
           ("ref4", "TACCCGCAGCTCAAGTGGTGGTCGCTATTATTGAGCCTAAAACGTCCGTAG"
                    "TCGGCTTTGTAAATCCCTGGGTAAATCGGGT"),
           ("ref5", "TACCCGCAGCTCAAGTGGTGGTCGCTATTATTGAGCCTAAAACGTCCGTAG"
                    "TCGGCTTTGTAAATCCCTGGGTAAATAGGGT"),
           ("ref6", "TACCCGCAGCTCAAGTGGTGGTCGCTATTATTGAGCCTAAAACGTCCGTAG"
                    "TCGGCTTTGTAAATCCCTGGGTAAATCGGGT")]
    ref_fp = join(self.working_dir, "ref3.fasta")
    with open(ref_fp, 'w') as ref_f:
        for seq in ref:
            ref_f.write(">%s\n%s\n" % seq)
    self.files_to_remove.append(ref_fp)
    # build index up front and hand it to remove_artifacts_seqs
    sortmerna_db, files_to_remove = build_database_sortmerna(
        fasta_path=ref_fp,
        max_pos=10000,
        output_dir=self.working_dir)
    self.files_to_remove.extend(files_to_remove)
    output_fp = join(self.working_dir, "seqs_filtered.fasta")
    output_fp = remove_artifacts_seqs(seqs_fp=seqs_fp,
                                      ref_fp=(ref_fp, ),
                                      working_dir=self.working_dir,
                                      ref_db_fp=(sortmerna_db, ),
                                      negate=False,
                                      threads=1)
    obs_seqs = []
    with open(output_fp, 'U') as output_f:
        for label, seq in parse_fasta(output_f):
            obs_seqs.append(label)
    self.assertEqual(obs_seqs, exp_seqs)
def _precommand_initiation(self, input_fp, output_dir, working_dir, params):
    """Ensure a SortMeRNA index exists before the command is launched.

    If no prebuilt database was supplied in ``params``, one is built from
    ``params['refseqs_fp']`` inside ``working_dir`` and recorded for cleanup.
    """
    if not params['sortmerna_db']:
        # Build the sortmerna database from the reference_seqs_fp
        sortmerna_db, db_files_to_remove = build_database_sortmerna(
            params['refseqs_fp'],
            max_pos=params['sortmerna_max_pos'],
            output_dir=working_dir)
        # Track index files so teardown can delete them
        self.files_to_remove += db_files_to_remove
        params['sortmerna_db'] = sortmerna_db
def _precommand_initiation(
        self, input_fp, output_dir, working_dir, params):
    """Build the SortMeRNA reference index on demand before running.

    When ``params['sortmerna_db']`` is empty, an index is created from
    ``params['refseqs_fp']`` in ``working_dir``; the generated files are
    appended to ``self.files_to_remove`` for later cleanup.
    """
    if not params['sortmerna_db']:
        # Build the sortmerna database from the reference_seqs_fp
        sortmerna_db, db_files_to_remove = build_database_sortmerna(
            params['refseqs_fp'],
            max_pos=params['sortmerna_max_pos'],
            output_dir=working_dir)
        self.files_to_remove += db_files_to_remove
        params['sortmerna_db'] = sortmerna_db
def test_sortmerna_map_sam_alignments(self):
    """Map reads with SortMeRNA 2.0, writing Blast and SAM alignments.

    SortMeRNA version 2.0 for mapping sequences onto a reference
    outputting Blast and SAM alignments.
    """
    # Rebuild the index
    sortmerna_db, db_files_to_remove = build_database_sortmerna(
        abspath(self.file_reference_seq_fp),
        max_pos=250,
        output_dir=self.output_dir)
    # Files created by indexdb_rna to be deleted
    self.files_to_remove.extend(db_files_to_remove)
    # Run SortMeRNA mapper
    app_result = sortmerna_map(seq_path=self.file_read_seqs_fp,
                               output_dir=self.output_dir,
                               refseqs_fp=self.file_reference_seq_fp,
                               sortmerna_db=sortmerna_db,
                               output_sam=True)
    # Check all sortmerna output files exist
    output_files = [join(self.output_dir, ext)
                    for ext in ['sortmerna_map.blast',
                                'sortmerna_map.sam',
                                'sortmerna_map.log']]
    for fp in output_files:
        self.assertTrue(exists(fp))
    sam_alignments_fp = app_result['SAMAlignments'].name
    # Check there are 30 alignments in the SAM output (1 per read)
    with open(sam_alignments_fp, 'U') as sam_actual:
        entries = (line.strip().split('\t') for line in sam_actual)
        actual_alignments = {r[0]: r[1:] for r in entries}
    # 30 alignments expected + 2 lines for @HD and @PG fields
    self.assertEqual(32, len(actual_alignments))
    # Check this alignment exists
    self.assertTrue("HMPMockV1.2.Staggered2.673827_47"
                    in actual_alignments)
    self.assertEqual("295053", actual_alignments[
        "HMPMockV1.2.Staggered2.673827_47"][1])
    self.assertEqual("AS:i:418", actual_alignments[
        "HMPMockV1.2.Staggered2.673827_47"][10])
    # Check alignment for random read is NULL
    self.assertTrue("simulated_random_reads.fa.000000000"
                    in actual_alignments)
    self.assertEqual("*", actual_alignments[
        "simulated_random_reads.fa.000000000"][1])
def test_sortmerna_map_num_alignments(self):
    """Map reads reporting the first INT alignments passing the E-value.

    SortMeRNA version 2.0 for mapping sequences onto a reference
    outputting first INT num_alignments passing the E-value threshold
    (rather than first INT best alignments).
    """
    # Rebuild the index
    sortmerna_db, db_files_to_remove = build_database_sortmerna(
        abspath(self.file_reference_seq_fp),
        max_pos=250,
        output_dir=self.output_dir)
    # Files created by indexdb_rna to be deleted
    self.files_to_remove.extend(db_files_to_remove)
    # Run SortMeRNA mapper
    app_result = sortmerna_map(seq_path=self.file_read_seqs_fp,
                               output_dir=self.output_dir,
                               refseqs_fp=self.file_reference_seq_fp,
                               sortmerna_db=sortmerna_db,
                               num_alignments=1)
    # Check all sortmerna output files exist
    output_files = [join(self.output_dir, ext)
                    for ext in ['sortmerna_map.blast',
                                'sortmerna_map.log']]
    for fp in output_files:
        self.assertTrue(exists(fp))
    blast_alignments_fp = app_result['BlastAlignments'].name
    # Check there are 30 alignments (1 per read)
    with open(blast_alignments_fp, 'U') as blast_actual:
        entries = (line.strip().split('\t') for line in blast_actual)
        actual_alignments = {r[0]: r[1:] for r in entries}
    self.assertEqual(30, len(actual_alignments))
    # Check this alignment exists
    self.assertTrue("HMPMockV1.2.Staggered2.673827_47"
                    in actual_alignments)
    self.assertEqual("97.3", actual_alignments[
        "HMPMockV1.2.Staggered2.673827_47"][1])
    self.assertEqual("100", actual_alignments[
        "HMPMockV1.2.Staggered2.673827_47"][12])
    # Check alignment for random read is NULL
    self.assertTrue("simulated_random_reads.fa.000000000"
                    in actual_alignments)
    self.assertEqual("*", actual_alignments[
        "simulated_random_reads.fa.000000000"][0])
def test_sortmerna_map_sam_alignments_with_tags(self):
    """Map reads writing SAM alignments that include @SQ header tags.

    SortMeRNA version 2.0 for mapping sequences onto a reference
    outputting SAM alignments with @SQ tags.
    """
    # Rebuild the index
    sortmerna_db, db_files_to_remove = build_database_sortmerna(
        abspath(self.file_reference_seq_fp),
        max_pos=250,
        output_dir=self.output_dir)
    # Files created by indexdb_rna to be deleted
    self.files_to_remove.extend(db_files_to_remove)
    # Run SortMeRNA mapper
    app_result = sortmerna_map(seq_path=self.file_read_seqs_fp,
                               output_dir=self.output_dir,
                               refseqs_fp=self.file_reference_seq_fp,
                               sortmerna_db=sortmerna_db,
                               output_sam=True,
                               sam_SQ_tags=True,
                               blast_format=None)
    # Check all sortmerna output files exist
    output_files = [join(self.output_dir, ext)
                    for ext in ['sortmerna_map.sam',
                                'sortmerna_map.log']]
    for fp in output_files:
        self.assertTrue(exists(fp))
    sam_alignments_fp = app_result['SAMAlignments'].name
    # Check there are 30 alignments in the SAM output (1 per read)
    with open(sam_alignments_fp, 'U') as sam_actual:
        actual_entries = [line.strip().split('\t')
                          for line in sam_actual]
    # 30 alignments expected + 2 lines for @HD and @PG fields + 5 lines
    # for the @SQ tags
    self.assertEqual(37, len(actual_entries))
    # Check all expected @SQ tags have been included
    SQ_array = [['@SQ', 'SN:42684', 'LN:1501'],
                ['@SQ', 'SN:342684', 'LN:1486'],
                ['@SQ', 'SN:426848', 'LN:1486'],
                ['@SQ', 'SN:295053', 'LN:1389'],
                ['@SQ', 'SN:879972', 'LN:1371']]
    for entry in SQ_array:
        self.assertTrue(entry in actual_entries)
def test_indexdb_default_param(self):
    """Index a reference database with SortMeRNA's indexdb_rna defaults."""
    sortmerna_db, db_files_to_remove = build_database_sortmerna(
        abspath(self.file_reference_seq_fp),
        max_pos=250,
        output_dir=self.output_dir)
    # indexdb_rna emits four companion files per database prefix
    expected_db_files = {sortmerna_db + ext
                         for ext in ['.bursttrie_0.dat',
                                     '.kmer_0.dat',
                                     '.pos_0.dat',
                                     '.stats']}
    # Make sure all db_files exist
    for fp in expected_db_files:
        self.assertTrue(exists(fp))
    # Add files to be remove
    self.files_to_remove.extend(db_files_to_remove)
def test_tabular_output(self):
    """SortMeRNA should output a BLAST tabular output."""
    sortmerna_db, db_files_to_remove = build_database_sortmerna(
        abspath(self.file_reference_seq_fp),
        max_pos=250,
        output_dir=self.output_dir)
    self.files_to_remove.extend(db_files_to_remove)
    # Run SortMeRNA with tabular=True and expect a .blast file
    clusters, failures, smr_files_to_remove = sortmerna_ref_cluster(
        seq_path=self.file_read_seqs_fp,
        sortmerna_db=sortmerna_db,
        refseqs_fp=self.file_reference_seq_fp,
        result_path=join(self.output_dir, "sortmerna_otus.txt"),
        tabular=True)
    self.assertTrue(exists(join(self.output_dir,
                                "sortmerna_otus.blast")))
def test_best_or_num_alignments(self):
    """SortMeRNA should fail with "best" and "num_alignments" both set to
       True.
    """
    # Rebuild the index
    sortmerna_db, db_files_to_remove = build_database_sortmerna(
        abspath(self.file_reference_seq_fp),
        max_pos=250,
        output_dir=self.output_dir)
    # Files created by indexdb_rna to be deleted
    self.files_to_remove.extend(db_files_to_remove)
    # Passing both mutually-exclusive reporting options must raise
    self.assertRaises(ValueError,
                      sortmerna_map,
                      seq_path=self.file_read_seqs_fp,
                      output_dir=self.output_dir,
                      refseqs_fp=self.file_reference_seq_fp,
                      sortmerna_db=sortmerna_db,
                      best=1,
                      num_alignments=1)
def test_blast_or_sam(self):
    """SortMeRNA should fail with output_sam and blast_format both set to
       False.
    """
    # Rebuild the index
    sortmerna_db, db_files_to_remove = build_database_sortmerna(
        abspath(self.file_reference_seq_fp),
        max_pos=250,
        output_dir=self.output_dir)
    # Files created by indexdb_rna to be deleted
    self.files_to_remove.extend(db_files_to_remove)
    # Disabling every output format must raise
    self.assertRaises(ValueError,
                      sortmerna_map,
                      seq_path=self.file_read_seqs_fp,
                      output_dir=self.output_dir,
                      refseqs_fp=self.file_reference_seq_fp,
                      sortmerna_db=sortmerna_db,
                      output_sam=False,
                      blast_format=None)
def remove_artifacts_seqs(seqs_fp, ref_fp, output_fp, ref_db_fp=None,
                          negate=False, threads=1):
    """Remove artifacts from FASTA file using SortMeRNA.

    Parameters
    ----------
    seqs_fp: string
        file path to FASTA input sequence file
    ref_fp: tuple
        file path(s) to FASTA database file
    output_fp: string
        file path to store output results
    ref_db_fp: string or tuple, optional
        file path(s) to indexed FASTA database
    negate: boolean, optional
        if True, discard all input sequences aligning
        to reference database
    threads: integer, optional
        number of threads to use for SortMeRNA

    Raises
    ------
    ValueError
        if SortMeRNA wrote anything to stderr
    """
    working_dir = join(dirname(output_fp), "working_dir")
    if not exists(working_dir):
        makedirs(working_dir)
    aligned_seq_ids = set()
    files_to_remove = []
    for i, db in enumerate(ref_fp):
        # create working directory for each
        # reference database
        db_dir_base = splitext(basename(db))[0]
        db_dir = join(working_dir, db_dir_base)
        if not exists(db_dir):
            makedirs(db_dir)
        if ref_db_fp:
            sortmerna_db = ref_db_fp[i]
        else:
            # build index; extend (not overwrite) files_to_remove so the
            # index files of *every* database get removed, not just the
            # last one's
            sortmerna_db, index_files = build_database_sortmerna(
                fasta_path=db,
                max_pos=10000,
                output_dir=db_dir)
            files_to_remove.extend(index_files)
        # run SortMeRNA
        app_result = sortmerna_map(seq_path=seqs_fp,
                                   output_dir=db_dir,
                                   refseqs_fp=db,
                                   sortmerna_db=sortmerna_db,
                                   threads=threads,
                                   best=1)
        # Print SortMeRNA errors
        stderr_fp = app_result['StdErr'].name
        if stat(stderr_fp).st_size != 0:
            with open(stderr_fp, 'U') as stderr_f:
                for line in stderr_f:
                    # print() form is valid on both Python 2 and 3
                    print(line)
            raise ValueError("Could not run SortMeRNA.")
        # collect ids of reads that aligned to this database
        for line in app_result['BlastAlignments']:
            line = line.strip().split('\t')
            if line[1] == '*':
                continue
            else:
                aligned_seq_ids.add(line[0])
    # remove indexed database files
    remove_files(files_to_remove, error_on_missing=False)
    if negate:
        def op(x):
            return x not in aligned_seq_ids
    else:
        def op(x):
            return x in aligned_seq_ids
    # if negate = False, only output sequences
    # matching to at least one of the databases
    with open(seqs_fp, 'U') as seqs_f:
        with open(output_fp, 'w') as out_f:
            for label, seq in parse_fasta(seqs_f):
                label = label.split()[0]
                if op(label):
                    out_f.write(">%s\n%s\n" % (label, seq))
def test_remove_artifacts_seqs_mismatch_ref_index(self):
    """Reject a reference FASTA paired with a non-matching index.

    Test remove_artifacts_seqs() function for removing sequences not
    matching to a reference database using SortMeRNA. A ValueError()
    should be raised when a user passes a reference sequence and an
    index database that do not match.
    """
    seqs = [("seq1", "TACCCGCAGCTCAAGTGGTGGTCGCTATTATTGAGCCTAAAACGTCC"),
            ("seq2", "CCTAAAACGTCCGTAGTCGGCTTTGTAAATCCCTGGGTAAATCGGGT"),
            ("seq3", "TCGCTATTATTGAGCCTAAAACGTCCGTAGTCGGCTTTGTAAATCCC"),
            ("seq4", "TACCCGCAGCTCAAGTGGTGGTCGCTATTATTGAGCCTAAAACGTCC"),
            ("seq5", "CTAAAACGTCCGTAGTCGGCTTTGTAAATCCCTGGGTAAATAGGGTC"),
            ("seq6", "TTGAGCCTAAAACGTCCGTAGTCGGCTTTGTAAATCCCTGGGTAAAT"),
            ("phix1", "TCTAAAGGTAAAAAACGTTCTGGCGCTCGCCCTGGTCGTCCGCAGCC"),
            ("phix2", "CTGGCGCTCGCCCTGGTCGTCCGCAGCCGTTGCGAGGTACTAAAGGC"),
            ("phix3", "GCGCATAAATTTGAGCAGATTTGTCGTCACAGGTTGCGCCGCCAAAA")]
    seqs_fp = join(self.working_dir, "seqs.fasta")
    with open(seqs_fp, 'w') as seqs_f:
        for seq in seqs:
            seqs_f.write(">%s\n%s\n" % seq)
    ref = [("ref1", "TACCCGCAGCTCAAGTGGTGGTCGCTATTATTGAGCCTAAAACGTCCGTA"
                    "GTCGGCTTTGTAAATCCCTGGGTAAATCGGGT"),
           ("ref2", "TACCCGCAGCTCAAGTGGTGGTCGCTATTATTGAGCCTAAAACGTCCGTAG"
                    "TCGGCTTTGTAAATCCCTGGGTAAATCGGGT"),
           ("ref3", "TACCCGCAGCTCAAGTGGTGGTCGCTATTATTGAGCCTAAAACGTCCGTAG"
                    "TCGGCTTTGTAAATCCCTGGGTAAATCGGGT"),
           ("ref4", "TACCCGCAGCTCAAGTGGTGGTCGCTATTATTGAGCCTAAAACGTCCGTAG"
                    "TCGGCTTTGTAAATCCCTGGGTAAATCGGGT"),
           ("ref5", "TACCCGCAGCTCAAGTGGTGGTCGCTATTATTGAGCCTAAAACGTCCGTAG"
                    "TCGGCTTTGTAAATCCCTGGGTAAATAGGGT"),
           ("ref6", "TACCCGCAGCTCAAGTGGTGGTCGCTATTATTGAGCCTAAAACGTCCGTAG"
                    "TCGGCTTTGTAAATCCCTGGGTAAATCGGGT")]
    ref_fp = join(self.working_dir, "ref5.fasta")
    with open(ref_fp, 'w') as ref_f:
        for seq in ref:
            ref_f.write(">%s\n%s\n" % seq)
    self.files_to_remove.append(ref_fp)
    # A second, unrelated reference set used only to build the index
    ref_bis = [("ref7", "attaaatcagttatcgtttatttgatagttcctttactacatgga"
                        "tatc"),
               ("ref8", "accttacgagaaatcaaagtctttgggttctggggggagtatggt"
                        "cgcaaggctgaaacttaaaggaattgacggaaggg"),
               ("ref9", "aattgcgataacgaacgagaccttaacctactaaatagtgctgct"
                        "agcatttgc"),
               ("ref10", "gacgggtgacggagaattagggttcgattccggagagggagcct"
                         "gagaaacggctaccacatccaag")]
    ref_bis_fp = join(self.working_dir, "ref_bis.fasta")
    with open(ref_bis_fp, 'w') as ref_bis_f:
        for seq in ref_bis:
            ref_bis_f.write(">%s\n%s\n" % seq)
    # build index from the mismatching reference
    sortmerna_db, files_to_remove = build_database_sortmerna(
        fasta_path=ref_bis_fp,
        max_pos=10000,
        output_dir=self.working_dir)
    self.files_to_remove.extend(files_to_remove)
    # Passing ref_fp with an index built from ref_bis_fp must raise
    self.assertRaises(ValueError,
                      remove_artifacts_seqs,
                      seqs_fp=seqs_fp,
                      ref_fp=(ref_fp, ),
                      working_dir=self.working_dir,
                      ref_db_fp=(sortmerna_db, ),
                      negate=False,
                      threads=1)
def remove_artifacts_seqs(seqs_fp, ref_fp, output_fp, ref_db_fp=None,
                          negate=False, threads=1):
    """Remove artifacts from FASTA file using SortMeRNA.

    Parameters
    ----------
    seqs_fp: string
        file path to FASTA input sequence file
    ref_fp: tuple
        file path(s) to FASTA database file
    output_fp: string
        file path to store output results
    ref_db_fp: string or tuple, optional
        file path(s) to indexed FASTA database
    negate: boolean, optional
        if True, discard all input sequences aligning
        to reference database
    threads: integer, optional
        number of threads to use for SortMeRNA

    Raises
    ------
    ValueError
        if SortMeRNA wrote anything to stderr
    """
    working_dir = join(dirname(output_fp), "working_dir")
    if not exists(working_dir):
        makedirs(working_dir)
    aligned_seq_ids = set()
    files_to_remove = []
    for i, db in enumerate(ref_fp):
        # create working directory for each
        # reference database
        db_dir_base = splitext(basename(db))[0]
        db_dir = join(working_dir, db_dir_base)
        if not exists(db_dir):
            makedirs(db_dir)
        if ref_db_fp:
            sortmerna_db = ref_db_fp[i]
        else:
            # build index; extend (not overwrite) files_to_remove so the
            # index files of *every* database get removed, not just the
            # last one's
            sortmerna_db, index_files = build_database_sortmerna(
                fasta_path=db,
                max_pos=10000,
                output_dir=db_dir)
            files_to_remove.extend(index_files)
        # run SortMeRNA
        app_result = sortmerna_map(
            seq_path=seqs_fp,
            output_dir=db_dir,
            refseqs_fp=db,
            sortmerna_db=sortmerna_db,
            threads=threads,
            best=1)
        # Print SortMeRNA errors
        stderr_fp = app_result['StdErr'].name
        if stat(stderr_fp).st_size != 0:
            with open(stderr_fp, 'U') as stderr_f:
                for line in stderr_f:
                    # print() form is valid on both Python 2 and 3
                    print(line)
            raise ValueError("Could not run SortMeRNA.")
        # collect ids of reads that aligned to this database
        for line in app_result['BlastAlignments']:
            line = line.strip().split('\t')
            if line[1] == '*':
                continue
            else:
                aligned_seq_ids.add(line[0])
    # remove indexed database files
    remove_files(files_to_remove, error_on_missing=False)
    if negate:
        def op(x):
            return x not in aligned_seq_ids
    else:
        def op(x):
            return x in aligned_seq_ids
    # if negate = False, only output sequences
    # matching to at least one of the databases
    with open(seqs_fp, 'U') as seqs_f:
        with open(output_fp, 'w') as out_f:
            for label, seq in parse_fasta(seqs_f):
                label = label.split()[0]
                if op(label):
                    out_f.write(">%s\n%s\n" % (label, seq))
def test_sortmerna_default_param(self):
    """SortMeRNA version 2.0 reference OTU picking works with default
       settings.
    """
    # rebuild the index
    sortmerna_db, db_files_to_remove = build_database_sortmerna(
        abspath(self.file_reference_seq_fp),
        max_pos=250,
        output_dir=self.output_dir)
    # Files created by indexdb_rna to be deleted
    self.files_to_remove.extend(db_files_to_remove)
    # Run SortMeRNA
    cluster_map, failures, smr_files_to_remove = sortmerna_ref_cluster(
        seq_path=self.file_read_seqs_fp,
        sortmerna_db=sortmerna_db,
        refseqs_fp=self.file_reference_seq_fp,
        result_path=join(self.output_dir, "sortmerna_otus.txt"))
    # Check all sortmerna output files exist
    output_files = [join(self.output_dir, ext)
                    for ext in ['sortmerna_otus_otus.txt',
                                'sortmerna_otus.log',
                                'sortmerna_otus_denovo.fasta',
                                'sortmerna_otus.fasta']]
    # Check output files exist
    for fp in output_files:
        self.assertTrue(exists(fp))
    # Files created sortmerna to be deleted (StdErr and StdOut were already
    # removed in sortmerna_ref_cluster)
    self.files_to_remove.extend(output_files)
    # Random reads that should not appear in any output file
    random_reads = ['simulated_random_reads.fa.000000000',
                    'simulated_random_reads.fa.000000001',
                    'simulated_random_reads.fa.000000002',
                    'simulated_random_reads.fa.000000003',
                    'simulated_random_reads.fa.000000004',
                    'simulated_random_reads.fa.000000005',
                    'simulated_random_reads.fa.000000006',
                    'simulated_random_reads.fa.000000007',
                    'simulated_random_reads.fa.000000008',
                    'simulated_random_reads.fa.000000009']
    # Reads passing E-value threshold and with similarity/coverage >=97%
    otu_reads = ['HMPMockV1.2.Staggered2.673827_47',
                 'HMPMockV1.2.Staggered2.673827_115',
                 'HMPMockV1.2.Staggered2.673827_122',
                 'HMPMockV1.2.Staggered2.673827_161',
                 'HMPMockV1.2.Staggered2.673827_180',
                 'HMPMockV1.2.Staggered2.673827_203',
                 'HMPMockV1.2.Staggered2.673827_207',
                 'HMPMockV1.2.Staggered2.673827_215',
                 'HMPMockV1.2.Staggered2.673827_218',
                 'HMPMockV1.2.Staggered2.673827_220']
    # Reads passing E-value threshold and with similarity/coverage <97%
    denovo_reads = ['HMPMockV1.2.Staggered2.673827_0',
                    'HMPMockV1.2.Staggered2.673827_1',
                    'HMPMockV1.2.Staggered2.673827_2',
                    'HMPMockV1.2.Staggered2.673827_3',
                    'HMPMockV1.2.Staggered2.673827_4',
                    'HMPMockV1.2.Staggered2.673827_5',
                    'HMPMockV1.2.Staggered2.673827_6',
                    'HMPMockV1.2.Staggered2.673827_7',
                    'HMPMockV1.2.Staggered2.673827_8',
                    'HMPMockV1.2.Staggered2.673827_9']
    # Check correct number of OTU clusters in file
    otu_clusters = ['295053']
    # Verify the aligned FASTA file (use `with` so handles always close;
    # avoid shadowing the builtin `id`)
    with open(output_files[3], "U") as f_aligned:
        for label, seq in parse_fasta(f_aligned):
            read_id = label.split()[0]
            # Read is not random
            self.assertNotIn(read_id, random_reads)
            # Read is either in otu_reads or denovo_reads
            self.assertIn(read_id, otu_reads + denovo_reads)
    # Verify the de novo reads FASTA file
    with open(output_files[2], "U") as f_denovo:
        for label, seq in parse_fasta(f_denovo):
            read_id = label.split()[0]
            # Read is not random
            self.assertNotIn(read_id, random_reads)
            # Read is not an OTU read
            self.assertNotIn(read_id, otu_reads)
            # Read is a de novo read
            self.assertIn(read_id, denovo_reads)
    # Check the OTU map
    with open(output_files[0], "U") as f_otumap:
        for line in f_otumap:
            otu_entry = line.split()
            # Cluster ID is correct
            self.assertIn(otu_entry[0], otu_clusters)
            # Each read in the cluster must exclusively be an OTU read
            for read in otu_entry[1:]:
                self.assertNotIn(read, random_reads)
                self.assertNotIn(read, denovo_reads)
                self.assertIn(read, otu_reads)
    # Check returned list of lists of clusters
    expected_cluster = ['HMPMockV1.2.Staggered2.673827_47',
                        'HMPMockV1.2.Staggered2.673827_115',
                        'HMPMockV1.2.Staggered2.673827_122',
                        'HMPMockV1.2.Staggered2.673827_161',
                        'HMPMockV1.2.Staggered2.673827_180',
                        'HMPMockV1.2.Staggered2.673827_203',
                        'HMPMockV1.2.Staggered2.673827_207',
                        'HMPMockV1.2.Staggered2.673827_215',
                        'HMPMockV1.2.Staggered2.673827_218',
                        'HMPMockV1.2.Staggered2.673827_220']
    # Should only have 1 cluster
    self.assertEqual(1, len(cluster_map))
    # .values() iterates identically on Python 2 and 3 (itervalues()
    # is Python-2-only)
    for actual_cluster in cluster_map.values():
        actual_cluster.sort()
        expected_cluster.sort()
        self.assertEqual(actual_cluster, expected_cluster)
    # Check log file number of clusters and failures corresponds to
    # the results in the output files
    num_clusters = 0
    num_failures = 0
    with open(output_files[1], "U") as f_log:
        for line in f_log:
            if line.startswith(" Total OTUs"):
                num_clusters = (re.split(' = ', line)[1]).strip()
            elif line.startswith(" Total reads for de novo clustering"):
                num_failures = (re.split(' = ', line)[1]).strip()
    self.assertEqual(int(num_clusters), len(otu_clusters))
    self.assertEqual(int(num_failures), len(denovo_reads))