def test_tabular_output(self):
    """ SortMeRNA should output a BLAST tabular output
    """
    # Build the SortMeRNA database from the reference sequences and
    # schedule the files it reports for deletion
    reference_fp = abspath(self.file_reference_seq_fp)
    index_db, index_files = build_database_sortmerna(
        reference_fp,
        max_pos=250,
        output_dir=self.output_dir)
    self.files_to_remove.extend(index_files)

    # Run SortMeRNA with the tabular report switched on; the returned
    # values are not inspected by this test
    otu_map_fp = join(self.output_dir, "sortmerna_otus.txt")
    otu_map, failed_reads, tmp_files = sortmerna_ref_cluster(
        seq_path=self.file_read_seqs_fp,
        sortmerna_db=index_db,
        refseqs_fp=self.file_reference_seq_fp,
        result_path=otu_map_fp,
        tabular=True)

    # The tabular report is expected next to the OTU map, with a
    # ".blast" extension
    blast_fp = join(self.output_dir, "sortmerna_otus.blast")
    self.assertTrue(exists(blast_fp))
def test_sortmerna_default_param(self):
    """ SortMeRNA version 2.0 reference OTU picking works with default
        settings

        Runs reference clustering on the fixture reads and checks that
        the aligned FASTA, de novo FASTA, OTU map, returned cluster map
        and log file all agree with the expected read assignments.
    """
    # rebuild the index
    sortmerna_db, db_files_to_remove = build_database_sortmerna(
        abspath(self.file_reference_seq_fp),
        max_pos=250,
        output_dir=self.output_dir)

    # Files created by indexdb_rna to be deleted
    self.files_to_remove.extend(db_files_to_remove)

    # Run SortMeRNA (only the cluster map is inspected below)
    cluster_map, _, _ = sortmerna_ref_cluster(
        seq_path=self.file_read_seqs_fp,
        sortmerna_db=sortmerna_db,
        refseqs_fp=self.file_reference_seq_fp,
        result_path=join(self.output_dir, "sortmerna_otus.txt"))

    # Name each expected output so later code does not depend on list
    # positions
    otumap_fp = join(self.output_dir, 'sortmerna_otus_otus.txt')
    log_fp = join(self.output_dir, 'sortmerna_otus.log')
    denovo_fp = join(self.output_dir, 'sortmerna_otus_denovo.fasta')
    aligned_fp = join(self.output_dir, 'sortmerna_otus.fasta')
    output_files = [otumap_fp, log_fp, denovo_fp, aligned_fp]

    # Check output files exist
    for fp in output_files:
        self.assertTrue(exists(fp))

    # Files created sortmerna to be deleted (StdErr and StdOut were already
    # removed in sortmerna_ref_cluster)
    self.files_to_remove.extend(output_files)

    # Random reads that should not appear in any output file
    # (simulated_random_reads.fa.000000000 .. 000000009)
    random_reads = ['simulated_random_reads.fa.%09d' % i
                    for i in range(10)]

    # Reads passing E-value threshold and with similarity/coverage >=97%
    otu_reads = ['HMPMockV1.2.Staggered2.673827_47',
                 'HMPMockV1.2.Staggered2.673827_115',
                 'HMPMockV1.2.Staggered2.673827_122',
                 'HMPMockV1.2.Staggered2.673827_161',
                 'HMPMockV1.2.Staggered2.673827_180',
                 'HMPMockV1.2.Staggered2.673827_203',
                 'HMPMockV1.2.Staggered2.673827_207',
                 'HMPMockV1.2.Staggered2.673827_215',
                 'HMPMockV1.2.Staggered2.673827_218',
                 'HMPMockV1.2.Staggered2.673827_220']

    # Reads passing E-value threshold and with similarity/coverage <97%
    # (HMPMockV1.2.Staggered2.673827_0 .. _9)
    denovo_reads = ['HMPMockV1.2.Staggered2.673827_%d' % i
                    for i in range(10)]

    # Check correct number of OTU clusters in file
    otu_clusters = ['295053']

    # Built once instead of on every loop iteration
    aligned_reads = otu_reads + denovo_reads

    # Verify the aligned FASTA file. Each file is opened in a ``with``
    # block so the handle is released even when an assertion fails.
    with open(aligned_fp, "U") as f_aligned:
        for label, _ in parse_fasta(f_aligned):
            read_id = label.split()[0]
            # Read is not random
            self.assertNotIn(read_id, random_reads)
            # Read is either in otu_reads or denovo_reads
            self.assertIn(read_id, aligned_reads)

    # Verify the de novo reads FASTA file
    with open(denovo_fp, "U") as f_denovo:
        for label, _ in parse_fasta(f_denovo):
            read_id = label.split()[0]
            # Read is not random
            self.assertNotIn(read_id, random_reads)
            # Read is not an OTU read
            self.assertNotIn(read_id, otu_reads)
            # Read is a de novo read
            self.assertIn(read_id, denovo_reads)

    # Check the OTU map
    with open(otumap_fp, "U") as f_otumap:
        for line in f_otumap:
            otu_entry = line.split()
            # Cluster ID is correct
            self.assertIn(otu_entry[0], otu_clusters)
            # Each read in the cluster must exclusively be an OTU read
            for read in otu_entry[1:]:
                self.assertNotIn(read, random_reads)
                self.assertNotIn(read, denovo_reads)
                self.assertIn(read, otu_reads)

    # Check returned list of lists of clusters: the single expected
    # cluster holds exactly the OTU reads (order-insensitive)
    expected_cluster = sorted(otu_reads)

    # Should only have 1 cluster
    self.assertEqual(1, len(cluster_map))
    for actual_cluster in cluster_map.itervalues():
        # sorted() leaves the returned cluster map untouched
        self.assertEqual(sorted(actual_cluster), expected_cluster)

    # Check log file number of clusters and failures corresponds to
    # the results in the output files. Counts stay 0 when the
    # corresponding log line is absent.
    num_clusters = 0
    num_failures = 0
    with open(log_fp, "U") as f_log:
        for line in f_log:
            if line.startswith(" Total OTUs"):
                num_clusters = int(line.split(' = ')[1].strip())
            elif line.startswith(" Total reads for de novo clustering"):
                num_failures = int(line.split(' = ')[1].strip())

    self.assertEqual(num_clusters, len(otu_clusters))
    self.assertEqual(num_failures, len(denovo_reads))