Ejemplo n.º 1
0
    def test_make_stats(self):
        """make_stats output for self.mapping equals the expected table."""
        # Header line followed by one "<size>:<tab><tab><count>" line per entry.
        expected = ("Clustersize\t#\n"
                    "1:\t\t2\n"
                    "2:\t\t1\n"
                    "5:\t\t1")
        observed = make_stats(self.mapping)

        self.assertEqual(observed, expected)
Ejemplo n.º 2
0
   def test_make_stats(self):
      """make_stats output for self.mapping equals the expected table."""
      # Header line followed by one "<size>:<tab><tab><count>" line per entry.
      expected = ("Clustersize\t#\n"
                  "1:\t\t2\n"
                  "2:\t\t1\n"
                  "5:\t\t1")
      observed = make_stats(self.mapping)

      self.assertEqual(observed, expected)
Ejemplo n.º 3
0
def preprocess(sff_fps, log_fh, fasta_fp=None, out_fp="/tmp/",
               verbose=False, squeeze=False,
               primer=STANDARD_BACTERIAL_PRIMER):
    """Quality filtering and truncation of flowgrams, followed by denoiser phase I.

    sff_fps: List of paths to flowgram files

    log_fh: writable handle for log messages. May be None, in which case
            nothing is logged even if verbose is set.

    fasta_fp: Path to fasta file, formatted as from split_libraries.py.
              This file is used to filter the flowgrams in sff_fps. Only reads in
              fasta_fp are pulled from sff_fps.

    out_fp: path to output directory

    verbose: if true, progress messages are written to log_fh

    squeeze: a flag that controls if sequences are squeezed before phase I.
             Squeezing means consecutive identical nucs are collapsed to one.

    primer: The primer sequences of the amplification process. This seq will be
            removed from all reads during the preprocessing

    Returns the tuple (averaged_sff_fp, l, mapping, seqs), where
    averaged_sff_fp and seqs come from build_averaged_flowgrams and
    l (number of remaining sequences) and mapping come from
    prefix_filter_flowgrams.

    Raises ValueError if no flowgrams are left after truncation.
    """
    # Only touch log_fh when logging was requested AND a handle was supplied.
    logging_on = verbose and log_fh is not None

    # Keep the input handles in a list so that each one can be closed once
    # the flowgram stream built on top of them has been consumed.
    sff_handles = []
    try:
        for sff_fp in sff_fps:
            sff_handles.append(open(sff_fp))
        flowgrams, header = cat_sff_files(sff_handles)

        if fasta_fp:
            # remove barcodes and sequences tossed by split_libraries, i.e.
            # not in fasta_fp
            with open(fasta_fp) as fasta_fh:
                labels = (record[0]
                          for record in MinimalFastaParser(fasta_fh))
                barcode_mapping = extract_barcodes_from_mapping(labels)
            (trunc_sff_fp, num_written) = truncate_flowgrams_in_SFF(
                flowgrams, header,
                outdir=out_fp,
                barcode_mapping=barcode_mapping,
                primer=primer)
            if logging_on:
                log_fh.write(
                    "Sequences in barcode mapping: %d\n" %
                    len(barcode_mapping))
                log_fh.write(
                    "Truncated flowgrams written: %d\n" % num_written)
        else:
            # just do a simple clean and truncate
            (clean_sff_fp, num_written) = cleanup_sff(flowgrams, header,
                                                      outdir=out_fp)
            if logging_on:
                log_fh.write(
                    "Cleaned flowgrams written: %d\n" % num_written)
            # Close the intermediate file before it is deleted below.
            with open(clean_sff_fp) as clean_fh:
                flowgrams, header = lazy_parse_sff_handle(clean_fh)
                (trunc_sff_fp, num_written) = truncate_flowgrams_in_SFF(
                    flowgrams, header,
                    outdir=out_fp, primer=primer)
            if logging_on:
                log_fh.write(
                    "Truncated flowgrams written: %d\n" % num_written)
            remove(clean_sff_fp)
    finally:
        for sff_handle in sff_handles:
            sff_handle.close()

    if num_written == 0:
        raise ValueError("No flowgrams left after preprocesing.\n" +
                         "Check your primer sequence")

    # Phase I - cluster seqs which are exact prefixes
    if logging_on:
        log_fh.write("Filter flowgrams by prefix matching\n")

    # The handle is only needed while the flowgrams are being filtered;
    # build_averaged_flowgrams below receives the path, not the handle.
    with open(trunc_sff_fp) as trunc_fh:
        (flowgrams, header) = lazy_parse_sff_handle(trunc_fh)
        num_remaining, num_original, mapping = \
            prefix_filter_flowgrams(flowgrams, squeeze=squeeze)

    # averaging produces too good flowgrams such that the greedy clustering
    # clusters too much. Use the cluster centroid instead by using
    # min_coverage 1
    averaged_sff_fp, seqs = build_averaged_flowgrams(
        mapping, trunc_sff_fp,
        min_coverage=1,
        out_fp=out_fp + "/prefix_dereplicated.sff.txt")
    remove(trunc_sff_fp)
    if logging_on:
        log_fh.write("Prefix matching: removed %d out of %d seqs\n"
                     % (num_original - num_remaining, num_original))
        log_fh.write("Remaining number of sequences: %d\n" % num_remaining)
        log_fh.write(make_stats(mapping) + "\n")

    # print representative sequences and mapping
    print_rep_seqs(mapping, seqs, out_fp)
    store_mapping(mapping, out_fp, "prefix")
    return (averaged_sff_fp, num_remaining, mapping, seqs)
Ejemplo n.º 4
0
def preprocess(sff_fps,
               log_fh,
               fasta_fp=None,
               out_fp="/tmp/",
               verbose=False,
               squeeze=False,
               primer=STANDARD_BACTERIAL_PRIMER):
    """Quality filtering and truncation of flowgrams, followed by denoiser phase I.

    sff_fps: List of paths to flowgram files

    log_fh: writable handle for log messages. May be None, in which case
            nothing is logged even if verbose is set.

    fasta_fp: Path to fasta file, formatted as from split_libraries.py.
              This file is used to filter the flowgrams in sff_fps. Only reads in
              fasta_fp are pulled from sff_fps.

    out_fp: path to output directory

    verbose: if true, progress messages are written to log_fh

    squeeze: a flag that controls if sequences are squeezed before phase I.
             Squeezing means consecutive identical nucs are collapsed to one.

    primer: The primer sequences of the amplification process. This seq will be
            removed from all reads during the preprocessing

    Returns the tuple (averaged_sff_fp, l, mapping, seqs), where
    averaged_sff_fp and seqs come from build_averaged_flowgrams and
    l (number of remaining sequences) and mapping come from
    prefix_filter_flowgrams.

    Raises ValueError if no flowgrams are left after truncation.
    """
    # Only touch log_fh when logging was requested AND a handle was supplied.
    logging_on = verbose and log_fh is not None

    # Keep the input handles in a list so that each one can be closed once
    # the flowgram stream built on top of them has been consumed.
    sff_handles = []
    try:
        for sff_fp in sff_fps:
            sff_handles.append(open(sff_fp))
        flowgrams, header = cat_sff_files(sff_handles)

        if fasta_fp:
            # remove barcodes and sequences tossed by split_libraries, i.e.
            # not in fasta_fp
            with open(fasta_fp) as fasta_fh:
                labels = (record[0] for record in parse_fasta(fasta_fh))
                barcode_mapping = extract_barcodes_from_mapping(labels)
            (trunc_sff_fp, num_written) = truncate_flowgrams_in_SFF(
                flowgrams,
                header,
                outdir=out_fp,
                barcode_mapping=barcode_mapping,
                primer=primer)
            if logging_on:
                log_fh.write("Sequences in barcode mapping: %d\n" %
                             len(barcode_mapping))
                log_fh.write(
                    "Truncated flowgrams written: %d\n" % num_written)
        else:
            # just do a simple clean and truncate
            (clean_sff_fp, num_written) = cleanup_sff(flowgrams,
                                                      header,
                                                      outdir=out_fp)
            if logging_on:
                log_fh.write(
                    "Cleaned flowgrams written: %d\n" % num_written)
            # Close the intermediate file before it is deleted below.
            with open(clean_sff_fp) as clean_fh:
                flowgrams, header = lazy_parse_sff_handle(clean_fh)
                (trunc_sff_fp, num_written) = truncate_flowgrams_in_SFF(
                    flowgrams,
                    header,
                    outdir=out_fp,
                    primer=primer)
            if logging_on:
                log_fh.write(
                    "Truncated flowgrams written: %d\n" % num_written)
            remove(clean_sff_fp)
    finally:
        for sff_handle in sff_handles:
            sff_handle.close()

    if num_written == 0:
        raise ValueError("No flowgrams left after preprocesing.\n" +
                         "Check your primer sequence")

    # Phase I - cluster seqs which are exact prefixes
    if logging_on:
        log_fh.write("Filter flowgrams by prefix matching\n")

    # The handle is only needed while the flowgrams are being filtered;
    # build_averaged_flowgrams below receives the path, not the handle.
    with open(trunc_sff_fp) as trunc_fh:
        (flowgrams, header) = lazy_parse_sff_handle(trunc_fh)
        num_remaining, num_original, mapping = \
            prefix_filter_flowgrams(flowgrams, squeeze=squeeze)

    # averaging produces too good flowgrams such that the greedy clustering
    # clusters too much. Use the cluster centroid instead by using
    # min_coverage 1
    averaged_sff_fp, seqs = build_averaged_flowgrams(
        mapping,
        trunc_sff_fp,
        min_coverage=1,
        out_fp=out_fp + "/prefix_dereplicated.sff.txt")
    remove(trunc_sff_fp)
    if logging_on:
        log_fh.write("Prefix matching: removed %d out of %d seqs\n" %
                     (num_original - num_remaining, num_original))
        log_fh.write("Remaining number of sequences: %d\n" % num_remaining)
        log_fh.write(make_stats(mapping) + "\n")

    # print representative sequences and mapping
    print_rep_seqs(mapping, seqs, out_fp)
    store_mapping(mapping, out_fp, "prefix")
    return (averaged_sff_fp, num_remaining, mapping, seqs)
Ejemplo n.º 5
0
def denoise_seqs(
        sff_fps, fasta_fp, tmpoutdir, preprocess_fp=None, cluster=False,
        num_cpus=1, squeeze=True, percent_id=0.97, bail=1, primer="",
        low_cutoff=3.75, high_cutoff=4.5, log_fp="denoiser.log",
        low_memory=False, verbose=False,
        error_profile=DENOISER_DATA_DIR + 'FLX_error_profile.dat',
        max_num_rounds=None, titanium=False, checkpoint_fp=None):
    """The main routine to denoise flowgrams.

    Phase I preprocesses the input (or loads already preprocessed data),
    phase II runs greedy_clustering with low_cutoff and phase III runs
    secondary_clustering with high_cutoff. The resulting clusters and
    mapping are written to tmpoutdir via store_clusters and store_mapping.

    sff_fps: list of paths to the flowgram files
    fasta_fp: fasta file path, forwarded to the preprocessing step
    tmpoutdir: directory receiving the log file and all output
    preprocess_fp: directory with preprocessed data; if set, preprocessing
                   is skipped and the data is read from there
    cluster: if true, preprocessing is done via preprocess_on_cluster and
             greedy_clustering is called with on_cluster=True
    num_cpus: forwarded to greedy_clustering
    squeeze: forwarded to the preprocessing step
    percent_id: forwarded to greedy_clustering as pair_id_thresh
    bail: forwarded to greedy_clustering as bail_out
    primer: forwarded to the preprocessing step
    low_cutoff: threshold used by greedy_clustering (phase II)
    high_cutoff: threshold used by secondary_clustering (phase III)
    log_fp: name of the log file, created inside tmpoutdir if verbose
    low_memory: if true, greedy_clustering is called with fast_method=False
    verbose: if true, a log file is written
    error_profile: forwarded to greedy_clustering
    max_num_rounds: forwarded to greedy_clustering
    titanium: if true, error_profile, low_cutoff and high_cutoff are
              overwritten with the Titanium error profile, 4 and 5
    checkpoint_fp: checkpoint to resume from, forwarded to
                   greedy_clustering; requires preprocess_fp

    Raises ApplicationError if checkpoint_fp is set without preprocess_fp.
    """

    # abort if binary is missing
    check_flowgram_ali_exe()

    log_path = tmpoutdir + "/" + log_fp
    log_fh = None
    if verbose:
        # switch off buffering for log file
        # NOTE(review): Python 3's open() rejects buffering=0 in text mode;
        # revisit if this module is ever run under Python 3.
        log_fh = open(log_path, "w", 0)

    # Everything below runs under try/finally so that the log file is
    # closed on success as well as on any error.
    try:
        # overwrite settings if titanium is set
        # This flag is only used from qiime. Remove after qiime integration
        if titanium:
            error_profile = DENOISER_DATA_DIR + "Titanium_error_profile.dat"
            low_cutoff = 4
            high_cutoff = 5

        if verbose:
            log_fh.write("Denoiser version: %s\n" % __version__)
            log_fh.write("SFF files: %s\n" % ', '.join(sff_fps))
            log_fh.write("Fasta file: %s\n" % fasta_fp)
            log_fh.write("Preprocess dir: %s\n" % preprocess_fp)
            if checkpoint_fp:
                log_fh.write("Resuming denoiser from %s\n" % checkpoint_fp)
            log_fh.write("Primer sequence: %s\n" % primer)
            log_fh.write("Running on cluster: %s\n" % cluster)
            log_fh.write("Num CPUs: %d\n" % num_cpus)
            log_fh.write("Squeeze Seqs: %s\n" % squeeze)
            log_fh.write("tmpdir: %s\n" % tmpoutdir)
            log_fh.write("percent_id threshold: %.2f\n" % percent_id)
            log_fh.write(
                "Minimal sequence coverage for first phase: %d\n" % bail)
            log_fh.write("Low cut-off: %.2f\n" % low_cutoff)
            log_fh.write("High cut-off: %.2f\n" % high_cutoff)
            log_fh.write("Error profile: %s\n" % error_profile)
            log_fh.write(
                "Maximal number of iteration: %s\n\n" % max_num_rounds)

        # here we go ...
        # Phase I - clean up and truncate input sff
        if checkpoint_fp and not preprocess_fp:
            raise ApplicationError(
                "Resuming from checkpoint requires --preprocess option")

        if preprocess_fp:
            # we already have preprocessed data, so use it
            (deprefixed_sff_fp, num_flowgrams, mapping,
             seqs) = read_preprocessed_data(preprocess_fp)
        elif cluster:
            preprocess_on_cluster(sff_fps, log_fp, fasta_fp=fasta_fp,
                                  out_fp=tmpoutdir, verbose=verbose,
                                  squeeze=squeeze, primer=primer)
            (deprefixed_sff_fp, num_flowgrams, mapping,
             seqs) = read_preprocessed_data(tmpoutdir)
        else:
            (deprefixed_sff_fp, num_flowgrams, mapping, seqs) = \
                preprocess(
                    sff_fps, log_fh, fasta_fp=fasta_fp, out_fp=tmpoutdir,
                    verbose=verbose, squeeze=squeeze, primer=primer)

        # preprocessor writes into same file, so better jump to end of file
        # (not done when resuming from a checkpoint)
        if verbose and not checkpoint_fp:
            log_fh.close()
            log_fh = open(log_path, "a", 0)

        # phase II:
        # use prefix map based clustering as initial centroids and greedily
        # add flowgrams to clusters with a low threshold
        (new_sff_file, bestscores, mapping) = \
            greedy_clustering(deprefixed_sff_fp, seqs, mapping, tmpoutdir,
                              num_flowgrams, log_fh, num_cpus=num_cpus,
                              on_cluster=cluster, bail_out=bail,
                              pair_id_thresh=percent_id,
                              threshold=low_cutoff, verbose=verbose,
                              fast_method=not low_memory,
                              error_profile=error_profile,
                              max_num_rounds=max_num_rounds,
                              checkpoint_fp=checkpoint_fp)

        # phase III:
        # Assign seqs to nearest existing centroid with high threshold
        secondary_clustering(new_sff_file, mapping, bestscores, log_fh,
                             verbose=verbose, threshold=high_cutoff)
        remove(new_sff_file)
        if verbose:
            log_fh.write("Finished clustering\n")
            log_fh.write("Writing Clusters\n")
            log_fh.write(make_stats(mapping) + "\n")
        store_clusters(mapping, deprefixed_sff_fp, tmpoutdir)
        store_mapping(mapping, tmpoutdir, "denoiser")
    finally:
        # close() on an already closed file is a no-op, so this is safe even
        # if reopening the log in append mode failed.
        if log_fh is not None:
            log_fh.close()
Ejemplo n.º 6
0
def denoise_seqs(sff_fps,
                 fasta_fp,
                 tmpoutdir,
                 preprocess_fp=None,
                 cluster=False,
                 num_cpus=1,
                 squeeze=True,
                 percent_id=0.97,
                 bail=1,
                 primer="",
                 low_cutoff=3.75,
                 high_cutoff=4.5,
                 log_fp="denoiser.log",
                 low_memory=False,
                 verbose=False,
                 error_profile=DENOISER_DATA_DIR + 'FLX_error_profile.dat',
                 max_num_rounds=None,
                 titanium=False,
                 checkpoint_fp=None):
    """The main routine to denoise flowgrams.

    Phase I preprocesses the input (or loads already preprocessed data),
    phase II runs greedy_clustering with low_cutoff and phase III runs
    secondary_clustering with high_cutoff. The resulting clusters and
    mapping are written to tmpoutdir via store_clusters and store_mapping.

    sff_fps: list of paths to the flowgram files
    fasta_fp: fasta file path, forwarded to the preprocessing step
    tmpoutdir: directory receiving the log file and all output
    preprocess_fp: directory with preprocessed data; if set, preprocessing
                   is skipped and the data is read from there
    cluster: if true, preprocessing is done via preprocess_on_cluster and
             greedy_clustering is called with on_cluster=True
    num_cpus: forwarded to greedy_clustering
    squeeze: forwarded to the preprocessing step
    percent_id: forwarded to greedy_clustering as pair_id_thresh
    bail: forwarded to greedy_clustering as bail_out
    primer: forwarded to the preprocessing step
    low_cutoff: threshold used by greedy_clustering (phase II)
    high_cutoff: threshold used by secondary_clustering (phase III)
    log_fp: name of the log file, created inside tmpoutdir if verbose
    low_memory: if true, greedy_clustering is called with fast_method=False
    verbose: if true, a log file is written
    error_profile: forwarded to greedy_clustering
    max_num_rounds: forwarded to greedy_clustering
    titanium: if true, error_profile, low_cutoff and high_cutoff are
              overwritten with the Titanium error profile, 4 and 5
    checkpoint_fp: checkpoint to resume from, forwarded to
                   greedy_clustering; requires preprocess_fp

    Raises ApplicationError if checkpoint_fp is set without preprocess_fp.
    """

    # abort if binary is missing
    check_flowgram_ali_exe()

    log_path = tmpoutdir + "/" + log_fp
    log_fh = None
    if verbose:
        # switch off buffering for log file
        # NOTE(review): Python 3's open() rejects buffering=0 in text mode;
        # revisit if this module is ever run under Python 3.
        log_fh = open(log_path, "w", 0)

    # Everything below runs under try/finally so that the log file is
    # closed on success as well as on any error.
    try:
        # overwrite settings if titanium is set
        # This flag is only used from qiime. Remove after qiime integration
        if titanium:
            error_profile = DENOISER_DATA_DIR + "Titanium_error_profile.dat"
            low_cutoff = 4
            high_cutoff = 5

        if verbose:
            log_fh.write("Denoiser version: %s\n" % __version__)
            log_fh.write("SFF files: %s\n" % ', '.join(sff_fps))
            log_fh.write("Fasta file: %s\n" % fasta_fp)
            log_fh.write("Preprocess dir: %s\n" % preprocess_fp)
            if checkpoint_fp:
                log_fh.write("Resuming denoiser from %s\n" % checkpoint_fp)
            log_fh.write("Primer sequence: %s\n" % primer)
            log_fh.write("Running on cluster: %s\n" % cluster)
            log_fh.write("Num CPUs: %d\n" % num_cpus)
            log_fh.write("Squeeze Seqs: %s\n" % squeeze)
            log_fh.write("tmpdir: %s\n" % tmpoutdir)
            log_fh.write("percent_id threshold: %.2f\n" % percent_id)
            log_fh.write(
                "Minimal sequence coverage for first phase: %d\n" % bail)
            log_fh.write("Low cut-off: %.2f\n" % low_cutoff)
            log_fh.write("High cut-off: %.2f\n" % high_cutoff)
            log_fh.write("Error profile: %s\n" % error_profile)
            log_fh.write(
                "Maximal number of iteration: %s\n\n" % max_num_rounds)

        # here we go ...
        # Phase I - clean up and truncate input sff
        if checkpoint_fp and not preprocess_fp:
            raise ApplicationError(
                "Resuming from checkpoint requires --preprocess option")

        if preprocess_fp:
            # we already have preprocessed data, so use it
            (deprefixed_sff_fp, num_flowgrams, mapping,
             seqs) = read_preprocessed_data(preprocess_fp)
        elif cluster:
            preprocess_on_cluster(sff_fps,
                                  log_fp,
                                  fasta_fp=fasta_fp,
                                  out_fp=tmpoutdir,
                                  verbose=verbose,
                                  squeeze=squeeze,
                                  primer=primer)
            (deprefixed_sff_fp, num_flowgrams, mapping,
             seqs) = read_preprocessed_data(tmpoutdir)
        else:
            (deprefixed_sff_fp, num_flowgrams, mapping, seqs) = \
                preprocess(
                    sff_fps, log_fh, fasta_fp=fasta_fp, out_fp=tmpoutdir,
                    verbose=verbose, squeeze=squeeze, primer=primer)

        # preprocessor writes into same file, so better jump to end of file
        # (not done when resuming from a checkpoint)
        if verbose and not checkpoint_fp:
            log_fh.close()
            log_fh = open(log_path, "a", 0)

        # phase II:
        # use prefix map based clustering as initial centroids and greedily
        # add flowgrams to clusters with a low threshold
        (new_sff_file, bestscores, mapping) = \
            greedy_clustering(deprefixed_sff_fp, seqs, mapping, tmpoutdir,
                              num_flowgrams, log_fh, num_cpus=num_cpus,
                              on_cluster=cluster, bail_out=bail,
                              pair_id_thresh=percent_id,
                              threshold=low_cutoff, verbose=verbose,
                              fast_method=not low_memory,
                              error_profile=error_profile,
                              max_num_rounds=max_num_rounds,
                              checkpoint_fp=checkpoint_fp)

        # phase III:
        # Assign seqs to nearest existing centroid with high threshold
        secondary_clustering(new_sff_file,
                             mapping,
                             bestscores,
                             log_fh,
                             verbose=verbose,
                             threshold=high_cutoff)
        remove(new_sff_file)
        if verbose:
            log_fh.write("Finished clustering\n")
            log_fh.write("Writing Clusters\n")
            log_fh.write(make_stats(mapping) + "\n")
        store_clusters(mapping, deprefixed_sff_fp, tmpoutdir)
        store_mapping(mapping, tmpoutdir, "denoiser")
    finally:
        # close() on an already closed file is a no-op, so this is safe even
        # if reopening the log in append mode failed.
        if log_fh is not None:
            log_fh.close()