Example #1
0
    def test_deblur_with_non_default_error_profile(self):
        """deblur accepts a caller-supplied error profile.

        The same profile is supplied first as a plain list and then as a
        numpy array; each run is checked against the expected output.
        """
        # Immutable master copy, so every run receives a fresh container.
        error_profile = (
            1, 0.05, 0.000005, 0.000005, 0.000005, 0.000005, 0.0000025,
            0.0000025, 0.0000025, 0.0000025, 0.0000025, 0.0000005, 0.0000005,
            0.0000005, 0.0000005
        )
        exp = [
            Sequence(
                "E.Coli-999;size=720;",
                "tacggagggtgcaagcgttaatcggaattactgggcgtaaagcgcacgcaggcggt"
                "ttgttaagtcagatgtgaaatccccgggctcaacctgggaactgcatctgatactg"
                "gcaagcttgagtctcgtagaggggggcagaattccag")
        ]

        # Trying with a plain list
        seqs_f = StringIO(TEST_SEQS_2)
        obs = deblur(parse_fasta(seqs_f), error_dist=list(error_profile))
        # Assert now: `obs` is rebound below, so without this check the
        # list-based run would never be verified.
        self.assertEqual(obs, exp)

        # Trying with a numpy array
        seqs_f = StringIO(TEST_SEQS_2)
        obs = deblur(parse_fasta(seqs_f), error_dist=np.array(error_profile))
        self.assertEqual(obs, exp)
Example #2
0
    def test_deblur_with_non_default_error_profile(self):
        """deblur accepts a caller-supplied error profile.

        The same profile is supplied first as a plain list and then as a
        numpy array; each run is checked against the expected output.
        """
        # Immutable master copy, so every run receives a fresh container.
        error_profile = (
            1,
            0.05,
            0.000005,
            0.000005,
            0.000005,
            0.000005,
            0.0000025,
            0.0000025,
            0.0000025,
            0.0000025,
            0.0000025,
            0.0000005,
            0.0000005,
            0.0000005,
            0.0000005,
        )
        exp = [
            Sequence(
                "E.Coli-999;size=720;",
                "tacggagggtgcaagcgttaatcggaattactgggcgtaaagcgcacgcaggcggt"
                "ttgttaagtcagatgtgaaatccccgggctcaacctgggaactgcatctgatactg"
                "gcaagcttgagtctcgtagaggggggcagaattccag",
            )
        ]

        # Trying with a plain list
        seqs_f = StringIO(TEST_SEQS_2)
        obs = deblur(parse_fasta(seqs_f), error_dist=list(error_profile))
        # Assert now: `obs` is rebound below, so without this check the
        # list-based run would never be verified.
        self.assertEqual(obs, exp)

        # Trying with a numpy array
        seqs_f = StringIO(TEST_SEQS_2)
        obs = deblur(parse_fasta(seqs_f), error_dist=np.array(error_profile))
        self.assertEqual(obs, exp)
Example #3
0
    def test_deblur_indel(self):
        """Check that deblur also discards a read carrying an insertion."""
        # Put a gap at position 10 of every record (stands in for the MSA)
        records = []
        for label, raw in sequence_generator(StringIO(TEST_SEQS_2)):
            gapped = raw[:10] + '-' + raw[10:]
            records.append((label, gapped))

        # `label` and `raw` still hold the last record read above; derive
        # the A-insertion variant from it and pad the tail with a gap
        with_insertion = raw[:10] + 'A' + raw[10:-1] + '-'
        records.append((label, with_insertion))

        obs = deblur(records)

        # strip the alignment gaps (same as in launch_workflow)
        for seq_obj in obs:
            seq_obj.sequence = seq_obj.sequence.replace('-', '')

        # built through Sequence so it is compared in the same form as obs
        expected = Sequence(
            "E.Coli-999;size=720;",
            "tacggagggtgcaagcgttaatcggaattactgggcgtaaagcgcacgcaggcggt"
            "ttgttaagtcagatgtgaaatccccgggctcaacctgggaactgcatctgatactg"
            "gcaagcttgagtctcgtagaggggggcagaattccag")

        # exactly one sequence must survive ...
        self.assertEqual(len(obs), 1)
        # ... and it must be the expected one
        self.assertEqual(obs[0].sequence, expected.sequence)
Example #4
0
    def test_deblur(self):
        """Run deblur with default parameters on TEST_SEQS_2."""
        observed = deblur(parse_fasta(StringIO(TEST_SEQS_2)))

        # a single sequence is expected in the output
        expected_seq = Sequence(
            "E.Coli-999;size=720;",
            "tacggagggtgcaagcgttaatcggaattactgggcgtaaagcgcacgcaggcggt"
            "ttgttaagtcagatgtgaaatccccgggctcaacctgggaactgcatctgatactg"
            "gcaagcttgagtctcgtagaggggggcagaattccag")

        self.assertEqual(observed, [expected_seq])
Example #5
0
    def test_deblur_toy_example(self):
        """Run deblur with default parameters on the TEST_SEQS_1 input."""
        records = sequence_generator(StringIO(TEST_SEQS_1))
        observed = deblur(records)

        # a single sequence is expected in the output
        expected_seq = Sequence(
            "E.Coli;size=1000;",
            "tacggagggtgcaagcgttaatcggaattactgggcgtaaagcgcacgcaggcggt"
            "ttgttaagtcagatgtgaaatccccgggctcaacctgggaactgcatctgatactg"
            "gcaagcttgagtctcgtagaggggggcagaattccag")

        self.assertEqual(observed, [expected_seq])
Example #6
0
    def test_deblur(self):
        """Run deblur with default parameters on TEST_SEQS_2."""
        fasta_records = parse_fasta(StringIO(TEST_SEQS_2))
        observed = deblur(fasta_records)

        # a single sequence is expected in the output
        expected_seq = Sequence(
            "E.Coli-999;size=720;",
            "tacggagggtgcaagcgttaatcggaattactgggcgtaaagcgcacgcaggcggt"
            "ttgttaagtcagatgtgaaatccccgggctcaacctgggaactgcatctgatactg"
            "gcaagcttgagtctcgtagaggggggcagaattccag",
        )

        self.assertEqual(observed, [expected_seq])
Example #7
0
    def test_deblur_indel(self):
        """Check indel removal around the expected indel frequency bound.

        Two reads with the same A insertion are added: one with size=30,
        which should be removed, and one with size=31, which should be kept.
        """
        # Put a gap at position 10 of every record (stands in for the MSA)
        records = [
            (label, raw[:10] + '-' + raw[10:])
            for label, raw in sequence_generator(StringIO(TEST_SEQS_2))
        ]

        # Both indel reads share one sequence: the first aligned record
        # with an 'A' in place of the gap and a gap replacing its last
        # character.
        template = records[0][1]
        with_insertion = template[:10] + 'A' + template[11:-1] + '-'

        # At the expected frequency
        # (30 < 0.02 * (720 / 0.47), where 0.47 is the mod_factor),
        # so this read should be removed.
        records.append(('>indel1-read;size=30;', with_insertion))

        # Above the indel upper bound (31 > 0.02 * (720 / 0.47)),
        # so this read should not be removed.
        records.append(('>indel2-read;size=31;', with_insertion))

        obs = deblur(records)

        # strip the alignment gaps (same as in launch_workflow)
        for seq_obj in obs:
            seq_obj.sequence = seq_obj.sequence.replace('-', '')

        # built through Sequence so it is compared in the same form as obs
        expected = Sequence(
            "E.Coli-999;size=720;",
            "tacggagggtgcaagcgttaatcggaattactgggcgtaaagcgcacgcaggcggt"
            "ttgttaagtcagatgtgaaatccccgggctcaacctgggaactgcatctgatactg"
            "gcaagcttgagtctcgtagaggggggcagaattccag")

        # two sequences are expected: the original and indel2 (too many
        # reads for the expected indel probability)
        self.assertEqual(len(obs), 2)
        # first the original sequence, then the retained indel read
        self.assertEqual(obs[0].sequence, expected.sequence)
        self.assertEqual(obs[1].label, '>indel2-read;size=31;')
Example #8
0
def launch_workflow(seqs_fp,
                    working_dir,
                    mean_error,
                    error_dist,
                    indel_prob,
                    indel_max,
                    trim_length,
                    left_trim_length,
                    min_size,
                    ref_fp,
                    ref_db_fp,
                    threads_per_sample=1,
                    sim_thresh=None,
                    coverage_thresh=None):
    """Launch full deblur workflow for a single post split-libraries fasta file

    The steps are: trim, dereplicate, remove artifacts, multiple sequence
    alignment (skipped when only one sequence is left), deblur and de novo
    chimera removal. Intermediate files are written into ``working_dir``.

    Parameters
    ----------
    seqs_fp: string
        a post split library fasta file for debluring
    working_dir: string
        working directory path
    mean_error: float
        mean error for original sequence estimate
    error_dist: list
        list of error probabilities for each hamming distance
    indel_prob: float
        insertion/deletion (indel) probability
    indel_max: integer
        maximal indel number
    trim_length: integer
        sequence trim length
    left_trim_length: integer
        left trim length (passed to trim_seqs as ``left_trim_len``)
    min_size: integer
        minimum sequence abundance (sequences below this limit are
        discarded)
    ref_fp: tuple
        filepath(s) to FASTA reference database for artifact removal
    ref_db_fp: tuple
        filepath(s) to SortMeRNA indexed database for artifact removal
    threads_per_sample: integer, optional
        number of threads to use for SortMeRNA/mafft/vsearch
        (0 for max available)
    sim_thresh: float, optional
        the minimal similarity for a sequence to the database.
        if None, take the defaults (0.65 for negate=False,
        0.95 for negate=True)
    coverage_thresh: float, optional
        the minimal coverage for alignment of a sequence to the database.
        if None, take the defaults (0.3 for negate=False, 0.95 for negate=True)

    Returns
    -------
    output_no_chimeras_fp : string
        filepath to fasta file with no chimeras, or None if an error was
        encountered
    """
    logger = logging.getLogger(__name__)
    logger.info('--------------------------------------------------------')
    logger.info('launch_workflow for file %s', seqs_fp)

    # Step 1: Trim sequences to specified length
    output_trim_fp = join(working_dir, "%s.trim" % basename(seqs_fp))
    with open(output_trim_fp, 'w') as out_f:
        for label, seq in trim_seqs(input_seqs=sequence_generator(seqs_fp),
                                    trim_len=trim_length,
                                    left_trim_len=left_trim_length):
            out_f.write(">%s\n%s\n" % (label, seq))

    # Step 2: Dereplicate sequences
    output_derep_fp = join(working_dir, "%s.derep" % basename(output_trim_fp))
    dereplicate_seqs(seqs_fp=output_trim_fp,
                     output_fp=output_derep_fp,
                     min_size=min_size,
                     threads=threads_per_sample)

    # Step 3: Remove artifacts
    # NOTE(review): coverage_thresh is accepted by this function but is not
    # forwarded here - confirm whether remove_artifacts_seqs should get it.
    output_artif_fp, num_seqs_left, _ = remove_artifacts_seqs(
        seqs_fp=output_derep_fp,
        ref_fp=ref_fp,
        working_dir=working_dir,
        ref_db_fp=ref_db_fp,
        negate=True,
        threads=threads_per_sample,
        sim_thresh=sim_thresh)
    if not output_artif_fp:
        warnings.warn('Problem removing artifacts from file %s' % seqs_fp,
                      UserWarning)
        logger.warning('remove artifacts failed, aborting')
        return None

    # Step 4: Multiple sequence alignment
    if num_seqs_left > 1:
        output_msa_fp = join(working_dir, "%s.msa" % basename(output_artif_fp))
        alignment = multiple_sequence_alignment(seqs_fp=output_artif_fp,
                                                threads=threads_per_sample)
        if not alignment:
            warnings.warn(
                'Problem performing multiple sequence alignment '
                'on file %s' % seqs_fp, UserWarning)
            logger.warning('msa failed. aborting')
            return None
    elif num_seqs_left == 1:
        # only one sequence after remove artifacts (but could be many reads)
        # no need to run MSA - just use the pre-msa file as input for next step
        output_msa_fp = output_artif_fp
    else:
        err_msg = ('No sequences left after artifact removal in '
                   'file %s' % seqs_fp)
        warnings.warn(err_msg, UserWarning)
        logger.warning(err_msg)
        return None

    # Step 5: Launch deblur
    output_deblur_fp = join(working_dir, "%s.deblur" % basename(output_msa_fp))
    with open(output_deblur_fp, 'w') as f:
        seqs = deblur(sequence_generator(output_msa_fp), mean_error,
                      error_dist, indel_prob, indel_max)
        if seqs is None:
            warnings.warn(
                'multiple sequence alignment file %s contains '
                'no sequences' % output_msa_fp, UserWarning)
            # Logger.warn is a deprecated alias; use warning() like the
            # other branches of this function
            logger.warning('no sequences returned from deblur for file %s',
                           output_msa_fp)
            return None
        for s in seqs:
            # remove '-' from aligned sequences
            s.sequence = s.sequence.replace('-', '')
            f.write(s.to_fasta())

    # Step 6: Chimera removal
    output_no_chimeras_fp = remove_chimeras_denovo_from_seqs(
        output_deblur_fp, working_dir, threads=threads_per_sample)
    logger.info('finished processing file')
    return output_no_chimeras_fp
Example #9
0
 def test_deblur_noseqs(self):
     """If no sequences supplied, need to return None
     """
     res = deblur([])
     # identity check: an object that merely compares equal to None
     # must not pass
     self.assertIsNone(res)
Example #10
0
def launch_workflow(seqs_fp,
                    working_dir,
                    read_error,
                    mean_error,
                    error_dist,
                    indel_prob,
                    indel_max,
                    trim_length,
                    min_size,
                    ref_fp,
                    ref_db_fp,
                    negate,
                    threads=1,
                    delim='_'):
    """Launch full deblur workflow.

    The steps are: trim, dereplicate, remove artifacts, multiple sequence
    alignment, deblur, de novo chimera removal and BIOM table generation.
    Intermediate files and the BIOM table are written into ``working_dir``.

    Parameters
    ----------
    seqs_fp: string
        post split library sequences for debluring
    working_dir: string
        working directory path
    read_error: float
        read error rate
    mean_error: float
        mean error for original sequence estimate
    error_dist: list
        list of error probabilities for each hamming distance
    indel_prob: float
        insertion/deletion (indel) probability
    indel_max: integer
        maximal indel number
    trim_length: integer
        sequence trim length
    min_size: integer
        minimum sequence abundance (sequences below this limit are
        discarded)
    ref_fp: tuple
        filepath(s) to FASTA reference database for artifact removal
    ref_db_fp: tuple
        filepath(s) to SortMeRNA indexed database for artifact removal
    negate: boolean
        discard all sequences aligning to the ref_fp database
    threads: integer, optional
        number of threads to use for SortMeRNA
    delim: string, optional
        delimiter in FASTA labels to separate sample ID from sequence ID

    Returns
    -------
    biom_fp: string
        filepath to BIOM table

    Raises
    ------
    ValueError
        If the generated BIOM table is empty.
    """

    # Step 1: Trim sequences to specified length
    output_trim_fp = join(working_dir, "%s.trim" % basename(seqs_fp))
    # Default text mode already does universal-newline translation on
    # Python 3; the explicit 'U' flag was removed in Python 3.11.
    with open(seqs_fp) as in_f, open(output_trim_fp, 'w') as out_f:
        for label, seq in trim_seqs(input_seqs=parse_fasta(in_f),
                                    trim_len=trim_length):
            out_f.write(">%s\n%s\n" % (label, seq))
    # Step 2: Dereplicate sequences
    output_derep_fp = join(working_dir, "%s.derep" % basename(output_trim_fp))
    dereplicate_seqs(seqs_fp=output_trim_fp,
                     output_fp=output_derep_fp,
                     min_size=min_size,
                     uc_output=True)
    # Step 3: Remove artifacts
    output_artif_fp = remove_artifacts_seqs(seqs_fp=output_derep_fp,
                                            ref_fp=ref_fp,
                                            working_dir=working_dir,
                                            ref_db_fp=ref_db_fp,
                                            negate=negate,
                                            threads=threads)
    # Step 4: Multiple sequence alignment
    output_msa_fp = join(working_dir, "%s.msa" % basename(output_artif_fp))
    with open(output_msa_fp, 'w') as f:
        alignment = multiple_sequence_alignment(seqs_fp=output_artif_fp,
                                                threads=threads)
        f.write(alignment.to_fasta())
    # Step 5: Launch deblur
    output_deblur_fp = join(working_dir, "%s.deblur" % basename(output_msa_fp))
    with open(output_deblur_fp, 'w') as f:
        seqs = deblur(parse_fasta(output_msa_fp), read_error, mean_error,
                      error_dist, indel_prob, indel_max)
        for s in seqs:
            # remove '-' from aligned sequences
            s.sequence = s.sequence.replace('-', '')
            f.write(s.to_fasta())
    # Step 6: Chimera removal
    output_no_chimeras_fp = remove_chimeras_denovo_from_seqs(
        output_deblur_fp, working_dir)
    # Step 7: Generate BIOM table
    # (the first element of the returned pair is not needed here)
    _, table = generate_biom_table(seqs_fp=output_no_chimeras_fp,
                                   uc_fp="%s.uc" % output_trim_fp,
                                   delim=delim)
    # Step 8: Write BIOM table to file
    if table.is_empty():
        raise ValueError("Attempting to write an empty BIOM table.")
    biom_fp = join(working_dir, "%s.biom" % basename(seqs_fp))
    write_biom_table(table, biom_fp)

    return biom_fp
Example #11
0
def launch_workflow(
    seqs_fp,
    output_fp,
    read_error,
    mean_error,
    error_dist,
    indel_prob,
    indel_max,
    trim_length,
    min_size,
    ref_fp,
    ref_db_fp,
    negate,
    threads,
    delim,
):
    """Launch full deblur workflow.

    The steps are: trim, dereplicate, remove artifacts, multiple sequence
    alignment, deblur, de novo chimera removal and BIOM table generation.
    Intermediate files are written next to ``output_fp``; the BIOM table is
    written to ``output_fp`` itself.

    Parameters
    ----------
    seqs_fp: string
        post split library sequences for debluring
    output_fp: string
        filepath to output file
    read_error: float
        read error rate
    mean_error: float
        mean error for original sequence estimate
    error_dist: list
        list of error probabilities for each hamming distance
    indel_prob: float
        insertion/deletion (indel) probability
    indel_max: integer
        maximal indel number
    trim_length: integer
        sequence trim length
    min_size: integer
        minimum sequence abundance (sequences below this limit are
        discarded)
    ref_fp: tuple
        filepath(s) to FASTA reference database for artifact removal
    ref_db_fp: tuple
        filepath(s) to SortMeRNA indexed database for artifact removal
    negate: boolean
        discard all sequences aligning to the ref_fp database
    threads: integer
        number of threads to use for SortMeRNA
    delim: string
        delimiter in FASTA labels to separate sample ID from sequence ID

    Raises
    ------
    ValueError
        If the generated BIOM table is empty.
    """
    out_dir = dirname(output_fp)

    # Step 1: Trim sequences to specified length
    output_trim_fp = join(out_dir, "%s.trim" % basename(seqs_fp))
    # Default text mode already does universal-newline translation on
    # Python 3; the explicit "U" flag was removed in Python 3.11.
    with open(seqs_fp) as in_f, open(output_trim_fp, "w") as out_f:
        for label, seq in trim_seqs(input_seqs=parse_fasta(in_f), trim_len=trim_length):
            out_f.write(">%s\n%s\n" % (label, seq))
    # Step 2: Dereplicate sequences
    output_derep_fp = join(out_dir, "%s.derep" % basename(output_trim_fp))
    dereplicate_seqs(seqs_fp=output_trim_fp, output_fp=output_derep_fp, min_size=min_size, uc_output=True)
    # Step 3: Remove artifacts
    output_artif_fp = join(out_dir, "%s.no_artifacts" % basename(output_derep_fp))
    remove_artifacts_seqs(
        seqs_fp=output_derep_fp,
        ref_fp=ref_fp,
        output_fp=output_artif_fp,
        ref_db_fp=ref_db_fp,
        negate=negate,
        threads=threads,
    )
    # Step 4: Multiple sequence alignment
    output_msa_fp = join(out_dir, "%s.msa" % basename(output_artif_fp))
    with open(output_msa_fp, "w") as f:
        alignment = multiple_sequence_alignment(output_artif_fp)
        f.write(alignment.to_fasta())
    # Step 5: Launch deblur
    output_deblur_fp = join(out_dir, "%s.deblur" % basename(output_msa_fp))
    with open(output_deblur_fp, "w") as f:
        seqs = deblur(parse_fasta(output_msa_fp), read_error, mean_error, error_dist, indel_prob, indel_max)
        for s in seqs:
            # remove '-' from aligned sequences
            s.sequence = s.sequence.replace("-", "")
            f.write(s.to_fasta())
    # Step 6: Chimera removal
    output_no_chimeras_fp = join(out_dir, "%s.no_chimeras" % basename(output_deblur_fp))
    remove_chimeras_denovo_from_seqs(output_deblur_fp, output_no_chimeras_fp)
    # Step 7: Generate BIOM table
    # (the first element of the returned pair is not needed here)
    _, table = generate_biom_table(seqs_fp=output_no_chimeras_fp, uc_fp="%s.uc" % output_trim_fp, delim=delim)
    # Step 8: Write BIOM table to file
    if table.is_empty():
        raise ValueError("Attempting to write an empty BIOM table.")
    with biom_open(output_fp, "w") as f:
        # HDF5 output when h5py is available, JSON otherwise
        if HAVE_H5PY:
            table.to_hdf5(h5grp=f, generated_by="deblur")
        else:
            table.to_json(direct_io=f, generated_by="deblur")