def test_make_stats(self):
    """make_stats produces meaningful statistics."""
    mapping = self.mapping  # renamed from `map` to avoid shadowing the builtin
    stats = """Clustersize\t#
1:\t\t2
2:\t\t1
5:\t\t1"""

    self.assertEqual(make_stats(mapping), stats)
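# Illustrative sketch, not part of the original suite: one way to derive the
# cluster-size table the test above expects. It assumes `mapping` is a dict
# of {cluster_representative: [member_ids]} with the representative itself
# not listed among its members, as elsewhere in this module; the helper name
# is hypothetical and the real make_stats may differ in detail.
from collections import defaultdict


def _make_stats_sketch(mapping):
    """Tally cluster sizes (representative plus members) into a tab-separated table."""
    counts = defaultdict(int)
    for members in mapping.values():
        counts[len(members) + 1] += 1  # +1 counts the representative itself
    lines = ["Clustersize\t#"]
    for size in sorted(counts):
        lines.append("%d:\t\t%d" % (size, counts[size]))
    return "\n".join(lines)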
def preprocess(sff_fps, log_fh, fasta_fp=None, out_fp="/tmp/",
               verbose=False, squeeze=False,
               primer=STANDARD_BACTERIAL_PRIMER):
    """Quality filtering and truncation of flowgrams, followed by denoiser phase I.

    sff_fps: List of paths to flowgram files

    log_fh: log messages are written to log_fh if it is set to something
            other than None

    fasta_fp: Path to fasta file, formatted as from split_libraries.py.
              This file is used to filter the flowgrams in sff_fps. Only
              reads in fasta_fp are pulled from sff_fps.

    out_fp: path to output directory

    verbose: a binary verbose flag

    squeeze: a flag that controls if sequences are squeezed before phase I.
             Squeezing means consecutive identical nucs are collapsed to one.

    primer: The primer sequence of the amplification process. This sequence
            will be removed from all reads during preprocessing.
    """
    flowgrams, header = cat_sff_files(map(open, sff_fps))

    if fasta_fp:
        # remove barcodes and sequences tossed by split_libraries,
        # i.e. not in fasta_fp
        labels = imap(lambda a_b: a_b[0], parse_fasta(open(fasta_fp)))
        barcode_mapping = extract_barcodes_from_mapping(labels)
        (trunc_sff_fp, l) = truncate_flowgrams_in_SFF(
            flowgrams, header, outdir=out_fp,
            barcode_mapping=barcode_mapping, primer=primer)
        if verbose:
            log_fh.write("Sequences in barcode mapping: %d\n"
                         % len(barcode_mapping))
            log_fh.write("Truncated flowgrams written: %d\n" % l)
    else:
        # just do a simple clean and truncate
        (clean_sff_fp, l) = cleanup_sff(flowgrams, header, outdir=out_fp)
        if verbose:
            log_fh.write("Cleaned flowgrams written: %d\n" % l)
        flowgrams, header = lazy_parse_sff_handle(open(clean_sff_fp))
        (trunc_sff_fp, l) = truncate_flowgrams_in_SFF(
            flowgrams, header, outdir=out_fp, primer=primer)
        if verbose:
            log_fh.write("Truncated flowgrams written: %d\n" % l)
        remove(clean_sff_fp)

    if l == 0:
        raise ValueError("No flowgrams left after preprocessing.\n" +
                         "Check your primer sequence")

    # Phase I - cluster seqs which are exact prefixes
    if verbose:
        log_fh.write("Filter flowgrams by prefix matching\n")

    (flowgrams, header) = lazy_parse_sff_handle(open(trunc_sff_fp))
    l, orig_l, mapping = prefix_filter_flowgrams(flowgrams, squeeze=squeeze)

    averaged_sff_fp, seqs = build_averaged_flowgrams(
        mapping, trunc_sff_fp,
        # Averaging produces flowgrams that are "too good", such that the
        # greedy clustering clusters too much. Use the cluster centroid
        # instead by setting min_coverage to 1.
        min_coverage=1,
        out_fp=out_fp + "/prefix_dereplicated.sff.txt")
    remove(trunc_sff_fp)
    if verbose:
        log_fh.write("Prefix matching: removed %d out of %d seqs\n"
                     % (orig_l - l, orig_l))
        log_fh.write("Remaining number of sequences: %d\n" % l)
        log_fh.write(make_stats(mapping) + "\n")

    # print representative sequences and mapping
    print_rep_seqs(mapping, seqs, out_fp)
    store_mapping(mapping, out_fp, "prefix")
    return (averaged_sff_fp, l, mapping, seqs)
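# Illustrative usage sketch, not part of the original module: a typical
# stand-alone invocation of preprocess(). The file names are hypothetical
# placeholders; in the real pipeline preprocess() is driven by
# denoise_seqs() below.
def _example_preprocess_run():
    import sys

    averaged_sff_fp, n_seqs, mapping, seqs = preprocess(
        ["reads.sff.txt"],    # hypothetical flowgram file(s)
        log_fh=sys.stderr,    # send progress messages to stderr
        fasta_fp="seqs.fna",  # hypothetical split_libraries.py output
        out_fp="/tmp/",
        verbose=True,
        squeeze=False)
    return averaged_sff_fp, n_seqs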
def denoise_seqs(sff_fps, fasta_fp, tmpoutdir, preprocess_fp=None,
                 cluster=False, num_cpus=1, squeeze=True, percent_id=0.97,
                 bail=1, primer="", low_cutoff=3.75, high_cutoff=4.5,
                 log_fp="denoiser.log", low_memory=False, verbose=False,
                 error_profile=DENOISER_DATA_DIR + 'FLX_error_profile.dat',
                 max_num_rounds=None, titanium=False, checkpoint_fp=None):
    """The main routine to denoise flowgrams"""

    # abort if binary is missing
    check_flowgram_ali_exe()

    if verbose:
        # switch off buffering for the log file
        log_fh = open(tmpoutdir + "/" + log_fp, "w", 0)
    else:
        log_fh = None

    # overwrite settings if titanium is set
    # This flag is only used from qiime. Remove after qiime integration
    if titanium:
        error_profile = DENOISER_DATA_DIR + "Titanium_error_profile.dat"
        low_cutoff = 4
        high_cutoff = 5

    if verbose:
        log_fh.write("Denoiser version: %s\n" % __version__)
        log_fh.write("SFF files: %s\n" % ', '.join(sff_fps))
        log_fh.write("Fasta file: %s\n" % fasta_fp)
        log_fh.write("Preprocess dir: %s\n" % preprocess_fp)
        if checkpoint_fp:
            log_fh.write("Resuming denoiser from %s\n" % checkpoint_fp)
        log_fh.write("Primer sequence: %s\n" % primer)
        log_fh.write("Running on cluster: %s\n" % cluster)
        log_fh.write("Num CPUs: %d\n" % num_cpus)
        log_fh.write("Squeeze Seqs: %s\n" % squeeze)
        log_fh.write("tmpdir: %s\n" % tmpoutdir)
        log_fh.write("percent_id threshold: %.2f\n" % percent_id)
        log_fh.write("Minimal sequence coverage for first phase: %d\n" % bail)
        log_fh.write("Low cut-off: %.2f\n" % low_cutoff)
        log_fh.write("High cut-off: %.2f\n" % high_cutoff)
        log_fh.write("Error profile: %s\n" % error_profile)
        log_fh.write("Maximal number of iterations: %s\n\n" % max_num_rounds)

    # here we go ...
    # Phase I - clean up and truncate input sff
    if checkpoint_fp:
        if preprocess_fp:
            # skip preprocessing, as we already have preprocessed data
            (deprefixed_sff_fp, l, mapping, seqs) = \
                read_preprocessed_data(preprocess_fp)
        else:
            raise ApplicationError(
                "Resuming from checkpoint requires --preprocess option")
    else:
        if preprocess_fp:
            # we already have preprocessed data, so use it
            (deprefixed_sff_fp, l, mapping, seqs) = \
                read_preprocessed_data(preprocess_fp)
        elif cluster:
            preprocess_on_cluster(sff_fps, log_fp, fasta_fp=fasta_fp,
                                  out_fp=tmpoutdir, verbose=verbose,
                                  squeeze=squeeze, primer=primer)
            (deprefixed_sff_fp, l, mapping, seqs) = \
                read_preprocessed_data(tmpoutdir)
        else:
            (deprefixed_sff_fp, l, mapping, seqs) = \
                preprocess(sff_fps, log_fh, fasta_fp=fasta_fp,
                           out_fp=tmpoutdir, verbose=verbose,
                           squeeze=squeeze, primer=primer)

    # preprocessor writes into the same file, so better jump to end of file
    if verbose:
        log_fh.close()
        log_fh = open(tmpoutdir + "/" + log_fp, "a", 0)

    # Phase II:
    # use prefix map based clustering as initial centroids and greedily
    # add flowgrams to clusters with a low threshold
    (new_sff_file, bestscores, mapping) = \
        greedy_clustering(deprefixed_sff_fp, seqs, mapping, tmpoutdir, l,
                          log_fh, num_cpus=num_cpus, on_cluster=cluster,
                          bail_out=bail, pair_id_thresh=percent_id,
                          threshold=low_cutoff, verbose=verbose,
                          fast_method=not low_memory,
                          error_profile=error_profile,
                          max_num_rounds=max_num_rounds,
                          checkpoint_fp=checkpoint_fp)

    # Phase III:
    # Assign seqs to nearest existing centroid with high threshold
    secondary_clustering(new_sff_file, mapping, bestscores, log_fh,
                         verbose=verbose, threshold=high_cutoff)
    remove(new_sff_file)
    if verbose:
        log_fh.write("Finished clustering\n")
        log_fh.write("Writing Clusters\n")
        log_fh.write(make_stats(mapping) + "\n")

    store_clusters(mapping, deprefixed_sff_fp, tmpoutdir)
    store_mapping(mapping, tmpoutdir, "denoiser")
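# Illustrative usage sketch, not part of the original module: a minimal
# single-machine denoising run. The paths are hypothetical placeholders; in
# practice these arguments come from the command-line wrapper, and tmpoutdir
# must be an existing, writable directory.
def _example_denoise_run():
    denoise_seqs(
        ["reads.sff.txt"],    # hypothetical flowgram file(s)
        "seqs.fna",           # hypothetical split_libraries.py output
        "/tmp/denoiser_out",  # tmpoutdir for logs, clusters, and mapping
        num_cpus=1,           # run locally on a single CPU
        squeeze=True,         # collapse runs of identical nucs before phase I
        percent_id=0.97,      # pairwise identity threshold for clustering
        verbose=True)         # write a detailed denoiser.log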