def test_deblur_with_non_default_error_profile(self):
    """Run deblur with a custom error profile, as a list and as an array.

    The same error profile is supplied first as a plain list and then as a
    numpy array; each run is checked against the same expected output.
    """
    error_dist = [
        1, 0.05, 0.000005, 0.000005, 0.000005, 0.000005,
        0.0000025, 0.0000025, 0.0000025, 0.0000025, 0.0000025,
        0.0000005, 0.0000005, 0.0000005, 0.0000005,
    ]
    seqs_f = StringIO(TEST_SEQS_2)

    obs = deblur(parse_fasta(seqs_f), error_dist=error_dist)
    exp = [
        Sequence(
            "E.Coli-999;size=720;",
            "tacggagggtgcaagcgttaatcggaattactgggcgtaaagcgcacgcaggcggt"
            "ttgttaagtcagatgtgaaatccccgggctcaacctgggaactgcatctgatactg"
            "gcaagcttgagtctcgtagaggggggcagaattccag")
    ]
    # Assert the list-based run here: `obs` is overwritten by the numpy run
    # below, so without this check the list code path is never verified.
    self.assertEqual(obs, exp)

    # Trying with a numpy array
    error_dist = np.array([
        1, 0.05, 0.000005, 0.000005, 0.000005, 0.000005,
        0.0000025, 0.0000025, 0.0000025, 0.0000025, 0.0000025,
        0.0000005, 0.0000005, 0.0000005, 0.0000005,
    ])
    # A fresh handle is needed because the first run consumed the previous one.
    seqs_f = StringIO(TEST_SEQS_2)
    obs = deblur(parse_fasta(seqs_f), error_dist=error_dist)
    self.assertEqual(obs, exp)
def test_deblur_with_non_default_error_profile(self):
    """Check deblur output for a non-default error profile.

    The profile is passed once as a Python list and once as a numpy array.
    Both runs must produce the same single expected sequence.
    """
    profile = [
        1,
        0.05,
        0.000005,
        0.000005,
        0.000005,
        0.000005,
        0.0000025,
        0.0000025,
        0.0000025,
        0.0000025,
        0.0000025,
        0.0000005,
        0.0000005,
        0.0000005,
        0.0000005,
    ]
    exp = [
        Sequence(
            "E.Coli-999;size=720;",
            "tacggagggtgcaagcgttaatcggaattactgggcgtaaagcgcacgcaggcggt"
            "ttgttaagtcagatgtgaaatccccgggctcaacctgggaactgcatctgatactg"
            "gcaagcttgagtctcgtagaggggggcagaattccag",
        )
    ]

    # First the plain list, then the numpy array. Each container type gets
    # its own assertion; previously only the numpy result was checked
    # because the list-based result was overwritten before being compared.
    for error_dist in (profile, np.array(profile)):
        # New handle per run: parse_fasta consumes the stream.
        seqs_f = StringIO(TEST_SEQS_2)
        obs = deblur(parse_fasta(seqs_f), error_dist=error_dist)
        self.assertEqual(obs, exp)
def test_deblur_indel(self):
    """Test if also removes indel sequences
    """
    records = list(sequence_generator(StringIO(TEST_SEQS_2)))

    # add the MSA for the indel: every read gets a gap column at position 10
    newseqs = [(label, seq[:10] + '-' + seq[10:]) for label, seq in records]

    # now add a sequence with an A insertion, built from the last record:
    # 'A' fills the gap column and a trailing '-' replaces the final base
    last_label, last_seq = records[-1]
    newseqs.append((last_label, last_seq[:10] + 'A' + last_seq[10:-1] + '-'))

    obs = deblur(newseqs)

    # remove the '-' (same as in launch_workflow)
    for deblurred in obs:
        deblurred.sequence = deblurred.sequence.replace('-', '')

    # the expected output
    exp = [
        Sequence(
            "E.Coli-999;size=720;",
            "tacggagggtgcaagcgttaatcggaattactgggcgtaaagcgcacgcaggcggt"
            "ttgttaagtcagatgtgaaatccccgggctcaacctgggaactgcatctgatactg"
            "gcaagcttgagtctcgtagaggggggcagaattccag")
    ]

    # make sure we get 1 sequence as output
    self.assertEqual(len(obs), 1)
    # and that it is the correct sequence
    self.assertEqual(obs[0].sequence, exp[0].sequence)
def test_deblur(self):
    """Deblur TEST_SEQS_2 with default arguments and compare the output."""
    expected_label = "E.Coli-999;size=720;"
    expected_sequence = (
        "tacggagggtgcaagcgttaatcggaattactgggcgtaaagcgcacgcaggcggt"
        "ttgttaagtcagatgtgaaatccccgggctcaacctgggaactgcatctgatactg"
        "gcaagcttgagtctcgtagaggggggcagaattccag"
    )

    observed = deblur(parse_fasta(StringIO(TEST_SEQS_2)))

    # exactly one sequence is expected back
    self.assertEqual(observed, [Sequence(expected_label, expected_sequence)])
def test_deblur_toy_example(self):
    """Deblur the TEST_SEQS_1 toy input and compare with the expected read."""
    expected_sequence = (
        "tacggagggtgcaagcgttaatcggaattactgggcgtaaagcgcacgcaggcggt"
        "ttgttaagtcagatgtgaaatccccgggctcaacctgggaactgcatctgatactg"
        "gcaagcttgagtctcgtagaggggggcagaattccag"
    )
    expected = [Sequence("E.Coli;size=1000;", expected_sequence)]

    toy_input = StringIO(TEST_SEQS_1)
    observed = deblur(sequence_generator(toy_input))

    self.assertEqual(observed, expected)
def test_deblur(self):
    """Run deblur on TEST_SEQS_2 and check the single expected sequence."""
    fasta_handle = StringIO(TEST_SEQS_2)
    result = deblur(parse_fasta(fasta_handle))

    # the one read expected in the deblur output
    surviving = Sequence(
        "E.Coli-999;size=720;",
        "tacggagggtgcaagcgttaatcggaattactgggcgtaaagcgcacgcaggcggt"
        "ttgttaagtcagatgtgaaatccccgggctcaacctgggaactgcatctgatactg"
        "gcaagcttgagtctcgtagaggggggcagaattccag",
    )
    self.assertEqual(result, [surviving])
def test_deblur_indel(self):
    """Test if also removes indel sequences
    """
    seqs_f = StringIO(TEST_SEQS_2)

    # add the MSA for the indel: a gap column at position 10 of every read
    newseqs = [(label, seq[:10] + '-' + seq[10:])
               for label, seq in sequence_generator(seqs_f)]

    # Both indel reads carry the same sequence, derived from the first
    # aligned read: 'A' replaces the gap column and the final base is
    # replaced by '-'.
    template = newseqs[0][1]
    indel_seq = template[:10] + 'A' + template[11:-1] + '-'

    # A insertion at the expected freq.
    # (30 < 0.02 * (720 / 0.47) where 0.47 is the mod_factor)
    # so should be removed
    newseqs.append(('>indel1-read;size=30;', indel_seq))

    # A insertion but at higher freq. (not expected by indel upper bound -
    # 31 > 0.02 * (720 / 0.47)) so should not be removed
    newseqs.append(('>indel2-read;size=31;', indel_seq))

    obs = deblur(newseqs)

    # remove the '-' (same as in launch_workflow)
    for deblurred in obs:
        deblurred.sequence = deblurred.sequence.replace('-', '')

    # the expected output
    exp = [
        Sequence(
            "E.Coli-999;size=720;",
            "tacggagggtgcaagcgttaatcggaattactgggcgtaaagcgcacgcaggcggt"
            "ttgttaagtcagatgtgaaatccccgggctcaacctgggaactgcatctgatactg"
            "gcaagcttgagtctcgtagaggggggcagaattccag")
    ]

    # make sure we get 2 sequences as output - the original and the indel2
    # (too many reads for the expected indel probability)
    self.assertEqual(len(obs), 2)
    # and that it is the correct sequence
    self.assertEqual(obs[0].sequence, exp[0].sequence)
    self.assertEqual(obs[1].label, '>indel2-read;size=31;')
def launch_workflow(seqs_fp, working_dir, mean_error, error_dist,
                    indel_prob, indel_max, trim_length, left_trim_length,
                    min_size, ref_fp, ref_db_fp, threads_per_sample=1,
                    sim_thresh=None, coverage_thresh=None):
    """Launch full deblur workflow for a single post split-libraries fasta file

    The steps are: trim, dereplicate, remove artifacts, multiple sequence
    alignment, deblur and de novo chimera removal. Intermediate files are
    written into ``working_dir``.

    Parameters
    ----------
    seqs_fp: string
        a post split library fasta file for debluring
    working_dir: string
        working directory path
    mean_error: float
        mean error for original sequence estimate
    error_dist: list
        list of error probabilities for each hamming distance
    indel_prob: float
        insertion/deletion (indel) probability
    indel_max: integer
        maximal indel number
    trim_length: integer
        sequence trim length
    left_trim_length: integer
        forwarded to trim_seqs as ``left_trim_len`` (presumably the number
        of leading positions to trim from each sequence - confirm against
        trim_seqs)
    min_size: integer
        lower limit on sequence abundance (discard sequences below limit)
    ref_fp: tuple
        filepath(s) to FASTA reference database for artifact removal
    ref_db_fp: tuple
        filepath(s) to SortMeRNA indexed database for artifact removal
    threads_per_sample: integer, optional
        number of threads to use for SortMeRNA/mafft/vsearch
        (0 for max available)
    sim_thresh: float, optional
        the minimal similarity for a sequence to the database.
        if None, take the defaults (0.65 for negate=False,
        0.95 for negate=True)
    coverage_thresh: float, optional
        the minimal coverage for alignment of a sequence to the database.
        if None, take the defaults (0.3 for negate=False, 0.95 for negate=True)
        NOTE: currently accepted but not forwarded to the artifact removal
        step (see the review note in the body).

    Return
    ------
    output_no_chimeras_fp : string
        filepath to fasta file with no chimeras, or None if an error was
        encountered in one of the steps
    """
    logger = logging.getLogger(__name__)
    logger.info('--------------------------------------------------------')
    logger.info('launch_workflow for file %s', seqs_fp)

    # Step 1: Trim sequences to specified length
    output_trim_fp = join(working_dir, "%s.trim" % basename(seqs_fp))
    with open(output_trim_fp, 'w') as out_f:
        for label, seq in trim_seqs(
                input_seqs=sequence_generator(seqs_fp),
                trim_len=trim_length,
                left_trim_len=left_trim_length):
            out_f.write(">%s\n%s\n" % (label, seq))

    # Step 2: Dereplicate sequences
    output_derep_fp = join(working_dir,
                           "%s.derep" % basename(output_trim_fp))
    dereplicate_seqs(seqs_fp=output_trim_fp,
                     output_fp=output_derep_fp,
                     min_size=min_size, threads=threads_per_sample)

    # Step 3: Remove artifacts
    # NOTE(review): coverage_thresh is part of this function's signature but
    # is not passed on here - confirm whether remove_artifacts_seqs accepts
    # it and whether it should be forwarded.
    output_artif_fp, num_seqs_left, _ = remove_artifacts_seqs(
        seqs_fp=output_derep_fp,
        ref_fp=ref_fp,
        working_dir=working_dir,
        ref_db_fp=ref_db_fp,
        negate=True,
        threads=threads_per_sample,
        sim_thresh=sim_thresh)
    if not output_artif_fp:
        warnings.warn('Problem removing artifacts from file %s' %
                      seqs_fp, UserWarning)
        logger.warning('remove artifacts failed, aborting')
        return None

    # Step 4: Multiple sequence alignment
    if num_seqs_left > 1:
        # NOTE(review): the alignment result is only checked for truthiness;
        # the next step reads output_msa_fp, so multiple_sequence_alignment
        # is presumably expected to write that file itself - confirm.
        output_msa_fp = join(working_dir,
                             "%s.msa" % basename(output_artif_fp))
        alignment = multiple_sequence_alignment(seqs_fp=output_artif_fp,
                                                threads=threads_per_sample)
        if not alignment:
            warnings.warn(
                'Problem performing multiple sequence alignment '
                'on file %s' % seqs_fp, UserWarning)
            logger.warning('msa failed. aborting')
            return None
    elif num_seqs_left == 1:
        # only one sequence after remove artifacts (but could be many reads)
        # no need to run MSA - just use the pre-msa file as input for next step
        output_msa_fp = output_artif_fp
    else:
        err_msg = ('No sequences left after artifact removal in '
                   'file %s' % seqs_fp)
        warnings.warn(err_msg, UserWarning)
        logger.warning(err_msg)
        return None

    # Step 5: Launch deblur
    output_deblur_fp = join(working_dir,
                            "%s.deblur" % basename(output_msa_fp))
    with open(output_deblur_fp, 'w') as f:
        seqs = deblur(sequence_generator(output_msa_fp), mean_error,
                      error_dist, indel_prob, indel_max)
        if seqs is None:
            warnings.warn(
                'multiple sequence alignment file %s contains '
                'no sequences' % output_msa_fp, UserWarning)
            # Logger.warn is a deprecated alias; use Logger.warning
            logger.warning('no sequences returned from deblur for file %s',
                           output_msa_fp)
            return None
        for s in seqs:
            # remove '-' from aligned sequences
            s.sequence = s.sequence.replace('-', '')
            f.write(s.to_fasta())

    # Step 6: Chimera removal
    output_no_chimeras_fp = remove_chimeras_denovo_from_seqs(
        output_deblur_fp, working_dir, threads=threads_per_sample)
    logger.info('finished processing file')
    return output_no_chimeras_fp
def test_deblur_noseqs(self):
    """If no sequences supplied, need to return None
    """
    res = deblur([])
    # assertIsNone checks identity with None rather than equality, which
    # is the intended contract and gives a clearer failure message.
    self.assertIsNone(res)
def launch_workflow(seqs_fp, working_dir, read_error, mean_error, error_dist,
                    indel_prob, indel_max, trim_length, min_size, ref_fp,
                    ref_db_fp, negate, threads=1, delim='_'):
    """Launch full deblur workflow.

    The steps are: trim, dereplicate, remove artifacts, multiple sequence
    alignment, deblur, de novo chimera removal and BIOM table generation.
    Intermediate files and the BIOM table are written into ``working_dir``.

    Parameters
    ----------
    seqs_fp: string
        post split library sequences for debluring
    working_dir: string
        working directory path
    read_error: float
        read error rate
    mean_error: float
        mean error for original sequence estimate
    error_dist: list
        list of error probabilities for each hamming distance
    indel_prob: float
        insertion/deletion (indel) probability
    indel_max: integer
        maximal indel number
    trim_length: integer
        sequence trim length
    min_size: integer
        lower limit on sequence abundance (discard sequences below limit)
    ref_fp: tuple
        filepath(s) to FASTA reference database for artifact removal
    ref_db_fp: tuple
        filepath(s) to SortMeRNA indexed database for artifact removal
    negate: boolean
        discard all sequences aligning to the ref_fp database
    threads: integer, optional
        number of threads to use for SortMeRNA
    delim: string, optional
        delimiter in FASTA labels to separate sample ID from sequence ID

    Return
    ------
    biom_fp: string
        filepath to BIOM table

    Raises
    ------
    ValueError
        If the generated BIOM table is empty.
    """
    # Step 1: Trim sequences to specified length
    output_trim_fp = join(working_dir, "%s.trim" % basename(seqs_fp))
    # The 'U' open mode was removed in Python 3.11 (ValueError); universal
    # newlines are already the default for text mode in Python 3.
    with open(seqs_fp) as in_f, open(output_trim_fp, 'w') as out_f:
        for label, seq in trim_seqs(
                input_seqs=parse_fasta(in_f), trim_len=trim_length):
            out_f.write(">%s\n%s\n" % (label, seq))

    # Step 2: Dereplicate sequences
    output_derep_fp = join(working_dir,
                           "%s.derep" % basename(output_trim_fp))
    dereplicate_seqs(seqs_fp=output_trim_fp,
                     output_fp=output_derep_fp,
                     min_size=min_size,
                     uc_output=True)

    # Step 3: Remove artifacts
    output_artif_fp = remove_artifacts_seqs(seqs_fp=output_derep_fp,
                                            ref_fp=ref_fp,
                                            working_dir=working_dir,
                                            ref_db_fp=ref_db_fp,
                                            negate=negate,
                                            threads=threads)

    # Step 4: Multiple sequence alignment
    output_msa_fp = join(working_dir,
                         "%s.msa" % basename(output_artif_fp))
    with open(output_msa_fp, 'w') as f:
        alignment = multiple_sequence_alignment(seqs_fp=output_artif_fp,
                                                threads=threads)
        f.write(alignment.to_fasta())

    # Step 5: Launch deblur
    output_deblur_fp = join(working_dir,
                            "%s.deblur" % basename(output_msa_fp))
    with open(output_deblur_fp, 'w') as f:
        seqs = deblur(parse_fasta(output_msa_fp), read_error, mean_error,
                      error_dist, indel_prob, indel_max)
        for s in seqs:
            # remove '-' from aligned sequences
            s.sequence = s.sequence.replace('-', '')
            f.write(s.to_fasta())

    # Step 6: Chimera removal
    output_no_chimeras_fp = remove_chimeras_denovo_from_seqs(
        output_deblur_fp, working_dir)

    # Step 7: Generate BIOM table
    # (the first element of the returned pair is not used here)
    _, table = generate_biom_table(seqs_fp=output_no_chimeras_fp,
                                   uc_fp="%s.uc" % output_trim_fp,
                                   delim=delim)

    # Step 8: Write BIOM table to file
    if table.is_empty():
        raise ValueError("Attempting to write an empty BIOM table.")
    biom_fp = join(working_dir, "%s.biom" % basename(seqs_fp))
    write_biom_table(table, biom_fp)

    return biom_fp
def launch_workflow(
    seqs_fp,
    output_fp,
    read_error,
    mean_error,
    error_dist,
    indel_prob,
    indel_max,
    trim_length,
    min_size,
    ref_fp,
    ref_db_fp,
    negate,
    threads,
    delim,
):
    """Launch full deblur workflow.

    The steps are: trim, dereplicate, remove artifacts, multiple sequence
    alignment, deblur, de novo chimera removal and BIOM table generation.
    Intermediate files are written next to ``output_fp`` (in its directory);
    the BIOM table itself is written to ``output_fp``.

    Parameters
    ----------
    seqs_fp: string
        post split library sequences for debluring
    output_fp: string
        filepath to output file
    read_error: float
        read error rate
    mean_error: float
        mean error for original sequence estimate
    error_dist: list
        list of error probabilities for each hamming distance
    indel_prob: float
        insertion/deletion (indel) probability
    indel_max: integer
        maximal indel number
    trim_length: integer
        sequence trim length
    min_size: integer
        lower limit on sequence abundance (discard sequences below limit)
    ref_fp: tuple
        filepath(s) to FASTA reference database for artifact removal
    ref_db_fp: tuple
        filepath(s) to SortMeRNA indexed database for artifact removal
    negate: boolean
        discard all sequences aligning to the ref_fp database
    threads: integer
        number of threads to use for SortMeRNA
    delim: string
        delimiter in FASTA labels to separate sample ID from sequence ID

    Raises
    ------
    ValueError
        If the generated BIOM table is empty.
    """
    out_dir = dirname(output_fp)

    # Step 1: Trim sequences to specified length
    output_trim_fp = join(out_dir, "%s.trim" % basename(seqs_fp))
    # The "U" open mode was removed in Python 3.11 (ValueError); universal
    # newlines are already the default for text mode in Python 3.
    with open(seqs_fp) as in_f, open(output_trim_fp, "w") as out_f:
        for label, seq in trim_seqs(input_seqs=parse_fasta(in_f),
                                    trim_len=trim_length):
            out_f.write(">%s\n%s\n" % (label, seq))

    # Step 2: Dereplicate sequences
    output_derep_fp = join(out_dir, "%s.derep" % basename(output_trim_fp))
    dereplicate_seqs(seqs_fp=output_trim_fp, output_fp=output_derep_fp,
                     min_size=min_size, uc_output=True)

    # Step 3: Remove artifacts
    output_artif_fp = join(out_dir,
                           "%s.no_artifacts" % basename(output_derep_fp))
    remove_artifacts_seqs(
        seqs_fp=output_derep_fp,
        ref_fp=ref_fp,
        output_fp=output_artif_fp,
        ref_db_fp=ref_db_fp,
        negate=negate,
        threads=threads,
    )

    # Step 4: Multiple sequence alignment
    output_msa_fp = join(out_dir, "%s.msa" % basename(output_artif_fp))
    with open(output_msa_fp, "w") as f:
        alignment = multiple_sequence_alignment(output_artif_fp)
        f.write(alignment.to_fasta())

    # Step 5: Launch deblur
    output_deblur_fp = join(out_dir, "%s.deblur" % basename(output_msa_fp))
    with open(output_deblur_fp, "w") as f:
        seqs = deblur(parse_fasta(output_msa_fp), read_error, mean_error,
                      error_dist, indel_prob, indel_max)
        for s in seqs:
            # remove '-' from aligned sequences
            s.sequence = s.sequence.replace("-", "")
            f.write(s.to_fasta())

    # Step 6: Chimera removal
    output_no_chimeras_fp = join(out_dir,
                                 "%s.no_chimeras" % basename(output_deblur_fp))
    remove_chimeras_denovo_from_seqs(output_deblur_fp, output_no_chimeras_fp)

    # Step 7: Generate BIOM table
    # (the first element of the returned pair is not used here)
    _, table = generate_biom_table(seqs_fp=output_no_chimeras_fp,
                                   uc_fp="%s.uc" % output_trim_fp,
                                   delim=delim)

    # Step 8: Write BIOM table to file
    if table.is_empty():
        raise ValueError("Attempting to write an empty BIOM table.")
    with biom_open(output_fp, "w") as f:
        # HDF5 output when HAVE_H5PY is set, JSON otherwise
        if HAVE_H5PY:
            table.to_hdf5(h5grp=f, generated_by="deblur")
        else:
            table.to_json(direct_io=f, generated_by="deblur")