def test_sortmerna_map_sam_alignments(self):
    """ SortMeRNA version 2.0: map reads onto a reference and emit both
        Blast and SAM alignments.

        Builds the index, runs the mapper with ``output_sam=True`` and
        checks the output files plus selected records of the SAM file.
    """
    # Index the reference; the index files are queued for later deletion
    index_fp, index_files = build_database_sortmerna(
        abspath(self.file_reference_seq_fp),
        max_pos=250,
        output_dir=self.output_dir)
    self.files_to_remove.extend(index_files)

    # Map the reads against the freshly built index
    result = sortmerna_map(seq_path=self.file_read_seqs_fp,
                           output_dir=self.output_dir,
                           refseqs_fp=self.file_reference_seq_fp,
                           sortmerna_db=index_fp,
                           output_sam=True)

    # Blast, SAM and log outputs must all be present
    expected_paths = [join(self.output_dir, name)
                      for name in ['sortmerna_map.blast',
                                   'sortmerna_map.sam',
                                   'sortmerna_map.log']]
    for path in expected_paths:
        self.assertTrue(exists(path))

    # Key every SAM line by its first column; the remaining columns are
    # kept as the value
    sam_fp = result['SAMAlignments'].name
    alignments_by_id = {}
    with open(sam_fp, 'U') as sam_file:
        for raw_line in sam_file:
            columns = raw_line.strip().split('\t')
            alignments_by_id[columns[0]] = columns[1:]

    # 30 alignments expected (1 per read) + 2 lines for @HD and @PG fields
    self.assertEqual(32, len(alignments_by_id))

    # A read known to align: check two of its columns
    mapped_id = "HMPMockV1.2.Staggered2.673827_47"
    self.assertTrue(mapped_id in alignments_by_id)
    self.assertEqual("295053", alignments_by_id[mapped_id][1])
    self.assertEqual("AS:i:418", alignments_by_id[mapped_id][10])

    # A random read must be reported with a NULL ("*") alignment
    random_id = "simulated_random_reads.fa.000000000"
    self.assertTrue(random_id in alignments_by_id)
    self.assertEqual("*", alignments_by_id[random_id][1])
def test_sortmerna_map_num_alignments(self):
    """ SortMeRNA version 2.0: map reads onto a reference, outputting the
        first INT num_alignments passing the E-value threshold (rather
        than the first INT best alignments).

        Runs the mapper with ``num_alignments=1`` and checks the output
        files plus selected records of the Blast file.
    """
    # Index the reference; the index files are queued for later deletion
    index_fp, index_files = build_database_sortmerna(
        abspath(self.file_reference_seq_fp),
        max_pos=250,
        output_dir=self.output_dir)
    self.files_to_remove.extend(index_files)

    # Map the reads against the freshly built index
    result = sortmerna_map(seq_path=self.file_read_seqs_fp,
                           output_dir=self.output_dir,
                           refseqs_fp=self.file_reference_seq_fp,
                           sortmerna_db=index_fp,
                           num_alignments=1)

    # Blast and log outputs must both be present
    expected_paths = [join(self.output_dir, name)
                      for name in ['sortmerna_map.blast',
                                   'sortmerna_map.log']]
    for path in expected_paths:
        self.assertTrue(exists(path))

    # Key every Blast line by its first column; the remaining columns are
    # kept as the value
    blast_fp = result['BlastAlignments'].name
    alignments_by_id = {}
    with open(blast_fp, 'U') as blast_file:
        for raw_line in blast_file:
            columns = raw_line.strip().split('\t')
            alignments_by_id[columns[0]] = columns[1:]

    # 30 alignments expected (1 per read)
    self.assertEqual(30, len(alignments_by_id))

    # A read known to align: check two of its columns
    mapped_id = "HMPMockV1.2.Staggered2.673827_47"
    self.assertTrue(mapped_id in alignments_by_id)
    self.assertEqual("97.3", alignments_by_id[mapped_id][1])
    self.assertEqual("100", alignments_by_id[mapped_id][12])

    # A random read must be reported with a NULL ("*") alignment
    random_id = "simulated_random_reads.fa.000000000"
    self.assertTrue(random_id in alignments_by_id)
    self.assertEqual("*", alignments_by_id[random_id][0])
def test_sortmerna_map_sam_alignments_with_tags(self):
    """ SortMeRNA version 2.0: map reads onto a reference and emit SAM
        alignments that include @SQ tags.

        Runs the mapper with ``output_sam=True``, ``sam_SQ_tags=True`` and
        ``blast_format=None``, then checks the output files, the SAM line
        count and the presence of every expected @SQ record.
    """
    # Index the reference; the index files are queued for later deletion
    index_fp, index_files = build_database_sortmerna(
        abspath(self.file_reference_seq_fp),
        max_pos=250,
        output_dir=self.output_dir)
    self.files_to_remove.extend(index_files)

    # Map the reads against the freshly built index
    result = sortmerna_map(seq_path=self.file_read_seqs_fp,
                           output_dir=self.output_dir,
                           refseqs_fp=self.file_reference_seq_fp,
                           sortmerna_db=index_fp,
                           output_sam=True,
                           sam_SQ_tags=True,
                           blast_format=None)

    # SAM and log outputs must both be present
    expected_paths = [join(self.output_dir, name)
                      for name in ['sortmerna_map.sam',
                                   'sortmerna_map.log']]
    for path in expected_paths:
        self.assertTrue(exists(path))

    # Read every SAM line as a list of tab-separated columns
    sam_fp = result['SAMAlignments'].name
    sam_rows = []
    with open(sam_fp, 'U') as sam_file:
        for raw_line in sam_file:
            sam_rows.append(raw_line.strip().split('\t'))

    # 30 alignments expected + 2 lines for @HD and @PG fields + 5 lines
    # for the @SQ tags
    self.assertEqual(37, len(sam_rows))

    # Each expected @SQ record must appear as a row of the SAM output
    expected_sq_rows = [
        ['@SQ', 'SN:42684', 'LN:1501'],
        ['@SQ', 'SN:342684', 'LN:1486'],
        ['@SQ', 'SN:426848', 'LN:1486'],
        ['@SQ', 'SN:295053', 'LN:1389'],
        ['@SQ', 'SN:879972', 'LN:1371'],
    ]
    for sq_row in expected_sq_rows:
        self.assertTrue(sq_row in sam_rows)
def remove_artifacts_seqs(seqs_fp,
                          ref_fp,
                          output_fp,
                          ref_db_fp=None,
                          negate=False,
                          threads=1):
    """Remove artifacts from FASTA file using SortMeRNA.

    Every reference database is searched in turn; the IDs of all reads
    that align to at least one database are collected, and the input
    sequences are then filtered on that set and written to ``output_fp``.

    Parameters
    ----------
    seqs_fp: string
        file path to FASTA input sequence file
    ref_fp: tuple
        file path(s) to FASTA database file
    output_fp: string
        file path to store output results; a "working_dir" directory is
        created next to it for the per-database SortMeRNA output
    ref_db_fp: tuple, optional
        file path(s) to indexed FASTA database, one per entry of ref_fp
        and in the same order (entries are looked up by position, so a
        bare string would be indexed character by character); when
        omitted, an index is built for every database and deleted again
        before returning
    negate: boolean, optional
        if True, discard all input sequences aligning
        to reference database
    threads: integer, optional
        number of threads to use for SortMeRNA

    Raises
    ------
    ValueError
        if SortMeRNA wrote anything to its standard error file
    """
    working_dir = join(dirname(output_fp), "working_dir")
    if not exists(working_dir):
        makedirs(working_dir)

    aligned_seq_ids = set()
    # Index files built here, accumulated across ALL databases so none of
    # them is left behind
    files_to_remove = []

    try:
        for i, db in enumerate(ref_fp):
            # create working directory for each reference database
            db_dir = join(working_dir, splitext(basename(db))[0])
            if not exists(db_dir):
                makedirs(db_dir)

            if ref_db_fp:
                sortmerna_db = ref_db_fp[i]
            else:
                # build index and remember its files for cleanup
                sortmerna_db, index_files = build_database_sortmerna(
                    fasta_path=db,
                    max_pos=10000,
                    output_dir=db_dir)
                files_to_remove.extend(index_files)

            # run SortMeRNA
            app_result = sortmerna_map(seq_path=seqs_fp,
                                       output_dir=db_dir,
                                       refseqs_fp=db,
                                       sortmerna_db=sortmerna_db,
                                       threads=threads,
                                       best=1)

            # Print SortMeRNA errors; any stderr output is treated as fatal
            stderr_fp = app_result['StdErr'].name
            if stat(stderr_fp).st_size != 0:
                # NOTE(review): the 'U' mode flag is rejected by open() on
                # Python 3.11+; drop it once this module no longer needs
                # Python 2 universal-newline handling.
                with open(stderr_fp, 'U') as stderr_f:
                    for line in stderr_f:
                        print(line)
                raise ValueError("Could not run SortMeRNA.")

            # A '*' in the second column marks a read with no alignment
            for line in app_result['BlastAlignments']:
                fields = line.strip().split('\t')
                if fields[1] != '*':
                    aligned_seq_ids.add(fields[0])
    finally:
        # remove indexed database files, also when SortMeRNA failed
        remove_files(files_to_remove, error_on_missing=False)

    # if negate = False, only output sequences
    # matching to at least one of the databases;
    # if negate is true, only output the sequences matching none
    discard_aligned = bool(negate)
    with open(seqs_fp, 'U') as seqs_f:
        with open(output_fp, 'w') as out_f:
            for label, seq in parse_fasta(seqs_f):
                # only the first whitespace-delimited token of the label
                # is compared against the alignment IDs and written out
                label = label.split()[0]
                if (label in aligned_seq_ids) != discard_aligned:
                    out_f.write(">%s\n%s\n" % (label, seq))
def remove_artifacts_seqs(seqs_fp,
                          ref_fp,
                          output_fp,
                          ref_db_fp=None,
                          negate=False,
                          threads=1):
    """Remove artifacts from FASTA file using SortMeRNA.

    Every reference database is searched in turn; the IDs of all reads
    that align to at least one database are collected, and the input
    sequences are then filtered on that set and written to ``output_fp``.

    Parameters
    ----------
    seqs_fp: string
        file path to FASTA input sequence file
    ref_fp: tuple
        file path(s) to FASTA database file
    output_fp: string
        file path to store output results; a "working_dir" directory is
        created next to it for the per-database SortMeRNA output
    ref_db_fp: tuple, optional
        file path(s) to indexed FASTA database, one per entry of ref_fp
        and in the same order (entries are looked up by position, so a
        bare string would be indexed character by character); when
        omitted, an index is built for every database and deleted again
        before returning
    negate: boolean, optional
        if True, discard all input sequences aligning
        to reference database
    threads: integer, optional
        number of threads to use for SortMeRNA

    Raises
    ------
    ValueError
        if SortMeRNA wrote anything to its standard error file
    """
    working_dir = join(dirname(output_fp), "working_dir")
    if not exists(working_dir):
        makedirs(working_dir)

    aligned_seq_ids = set()
    # Index files built here, accumulated across ALL databases so none of
    # them is left behind
    files_to_remove = []

    try:
        for i, db in enumerate(ref_fp):
            # create working directory for each reference database
            db_dir = join(working_dir, splitext(basename(db))[0])
            if not exists(db_dir):
                makedirs(db_dir)

            if ref_db_fp:
                sortmerna_db = ref_db_fp[i]
            else:
                # build index and remember its files for cleanup
                sortmerna_db, index_files = build_database_sortmerna(
                    fasta_path=db,
                    max_pos=10000,
                    output_dir=db_dir)
                files_to_remove.extend(index_files)

            # run SortMeRNA
            app_result = sortmerna_map(seq_path=seqs_fp,
                                       output_dir=db_dir,
                                       refseqs_fp=db,
                                       sortmerna_db=sortmerna_db,
                                       threads=threads,
                                       best=1)

            # Print SortMeRNA errors; any stderr output is treated as fatal
            stderr_fp = app_result['StdErr'].name
            if stat(stderr_fp).st_size != 0:
                # NOTE(review): the 'U' mode flag is rejected by open() on
                # Python 3.11+; drop it once this module no longer needs
                # Python 2 universal-newline handling.
                with open(stderr_fp, 'U') as stderr_f:
                    for line in stderr_f:
                        print(line)
                raise ValueError("Could not run SortMeRNA.")

            # A '*' in the second column marks a read with no alignment
            for line in app_result['BlastAlignments']:
                fields = line.strip().split('\t')
                if fields[1] != '*':
                    aligned_seq_ids.add(fields[0])
    finally:
        # remove indexed database files, also when SortMeRNA failed
        remove_files(files_to_remove, error_on_missing=False)

    # if negate = False, only output sequences
    # matching to at least one of the databases;
    # if negate is true, only output the sequences matching none
    discard_aligned = bool(negate)
    with open(seqs_fp, 'U') as seqs_f:
        with open(output_fp, 'w') as out_f:
            for label, seq in parse_fasta(seqs_f):
                # only the first whitespace-delimited token of the label
                # is compared against the alignment IDs and written out
                label = label.split()[0]
                if (label in aligned_seq_ids) != discard_aligned:
                    out_f.write(">%s\n%s\n" % (label, seq))
def remove_artifacts_seqs(seqs_fp,
                          ref_fp,
                          working_dir,
                          ref_db_fp,
                          negate=False,
                          threads=1,
                          verbose=False):
    """Remove artifacts from FASTA file using SortMeRNA.

    Every reference database is searched in turn; the IDs of all reads
    that align to at least one database are collected, and the input
    sequences are then filtered on that set and written to
    ``<working_dir>/<basename of seqs_fp>.no_artifacts``.

    Parameters
    ----------
    seqs_fp: string
        file path to FASTA input sequence file
    ref_fp: tuple
        file path(s) to FASTA database file
    working_dir: string
        working directory path
    ref_db_fp: tuple
        file path(s) to indexed FASTA database, one per entry of ref_fp
        and in the same order
    negate: boolean, optional
        if True, discard all input sequences aligning
        to reference database
    threads: integer, optional
        number of threads to use for SortMeRNA
    verbose: boolean, optional
        If true, output SortMeRNA errors

    Returns
    -------
    string
        file path of the filtered FASTA file

    Raises
    ------
    ValueError
        if SortMeRNA wrote anything to its standard error file
        (regardless of ``verbose``)
    """
    output_fp = join(working_dir,
                     "%s.no_artifacts" % basename(seqs_fp))
    aligned_seq_ids = set()

    for i, db in enumerate(ref_fp):
        # run SortMeRNA
        app_result = sortmerna_map(seq_path=seqs_fp,
                                   output_dir=working_dir,
                                   refseqs_fp=db,
                                   sortmerna_db=ref_db_fp[i],
                                   threads=threads,
                                   best=1)

        # Any stderr output is treated as fatal; it is only echoed when
        # verbose is set
        stderr_fp = app_result['StdErr'].name
        if stat(stderr_fp).st_size != 0:
            if verbose:
                # Default text mode: the former 'U' flag is rejected by
                # open() on Python 3.11+.
                with open(stderr_fp) as stderr_f:
                    for line in stderr_f:
                        print(line)
            raise ValueError("Could not run SortMeRNA.")

        # A '*' in the second column marks a read with no alignment
        for line in app_result['BlastAlignments']:
            fields = line.strip().split('\t')
            if fields[1] != '*':
                aligned_seq_ids.add(fields[0])

    # if negate = False, only output sequences
    # matching to at least one of the databases;
    # if negate is true, only output the sequences matching none
    discard_aligned = bool(negate)
    with open(seqs_fp) as seqs_f, open(output_fp, 'w') as out_f:
        for label, seq in parse_fasta(seqs_f):
            # only the first whitespace-delimited token of the label is
            # compared against the alignment IDs and written out
            label = label.split()[0]
            if (label in aligned_seq_ids) != discard_aligned:
                out_f.write(">%s\n%s\n" % (label, seq))

    return output_fp