def split_sff(sff_file_handles, map_file_handle, outdir="/tmp/"):
    """Splits a sff.txt file on barcode/mapping file."""
    try:
        (flowgrams, header) = cat_sff_files(sff_file_handles)
    except ValueError:
        # reading in the binary sff usually shows up as ValueError
        raise FileFormatError(
            'Wrong flowgram file format. Make sure you pass the sff.txt format '
            'produced by sffinfo. The binary .sff will not work here.')

    (inverse_map, map_count) = build_inverse_barcode_map(
        parse_fasta(map_file_handle))

    filenames = []
    # we might have many barcodes and reach the python open file limit,
    # therefore we go the slow way and open and close files each time.
    # First set up all files with the headers only
    for barcode_id in map_count.keys():
        fh = open(outdir + barcode_id, "w")
        write_sff_header(header, fh, map_count[barcode_id])
        fh.close()
        filenames.append(outdir + barcode_id)

    # Then direct each flowgram into its barcode file
    for f in flowgrams:
        if f.Name in inverse_map:
            barcode_id = inverse_map[f.Name]
            fh = open(outdir + barcode_id, "a")
            fh.write(f.createFlowHeader() + "\n")
            fh.close()

    return filenames
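# Hedged usage sketch (not part of the original module): how split_sff might
# be called. The file paths are hypothetical placeholders; the inputs are
# sff.txt handles as produced by sffinfo plus a FASTA from which read-id to
# barcode/sample assignments can be derived (as expected by
# build_inverse_barcode_map). Returns one per-barcode sff.txt file per sample.
def _example_split_sff():
    sff_handles = [open("run1.sff.txt"), open("run2.sff.txt")]
    with open("barcode_mapping.fna") as map_fh:
        per_barcode_files = split_sff(sff_handles, map_fh, outdir="/tmp/split/")
    for fh in sff_handles:
        fh.close()
    return per_barcode_files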
def filter_sff_file(flowgrams, header, filter_list, out_fh):
    """Filters all flowgrams with the filters in filter_list.

    flowgrams: a list of flowgrams (or something similar)
    header: the header for the flowgrams
    filter_list: list of filters to be applied on sff.txt file
    out_fh: output file handle

    returns: number of flowgrams written to the filtered output file
    """
    write_sff_header(header, out_fh)

    l = 0
    for f in flowgrams:
        passed = True
        for filt in filter_list:
            passed = passed and filt(f)
            if not passed:
                # bail out on the first failing filter
                break
        if passed:
            out_fh.write(f.createFlowHeader() + "\n")
            l += 1
    return l
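# Hedged usage sketch (assumption, not part of the original module): each
# entry in filter_list is treated as a predicate taking a flowgram and
# returning a bool, so simple lambdas over flowgram attributes can be
# combined. The minimum-length cutoff below is an illustrative value only.
def _example_filter_sff_file(flowgrams, header, out_fp="/tmp/filtered.sff.txt"):
    filters = [
        lambda f: 'N' not in f.toSeq(),   # drop reads with ambiguous bases
        lambda f: len(f.toSeq()) >= 150,  # drop very short reads
    ]
    with open(out_fp, "w") as out_fh:
        num_kept = filter_sff_file(flowgrams, header, filters, out_fh)
    return num_kept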
def truncate_flowgrams_in_SFF(flowgrams, header, outhandle=None, outdir="/tmp/",
                              barcode_mapping=None, primer=None,
                              allow_num_ambigous=4):
    """Truncate flowgrams at low quality 3' end and strip key+primers.

    flowgrams: a list of flowgrams (or something similar)
    header: the header for the flowgrams
    outhandle: output file handle, can be None
    outdir: directory where random file will be created if outhandle is None
    barcode_mapping: dictionary mapping of read ids to barcode seqs. The
                     barcode seq will be truncated off the 5' end of the read
    primer: primer sequence that will be truncated off the 5' end of the read
    allow_num_ambigous: number of 'N' allowed in flowgram
    """
    out_filename = ""
    if not outhandle:
        fd, out_filename = mkstemp(dir=outdir, prefix="trunc_sff",
                                   suffix=".sff.txt")
        close(fd)
        outhandle = open(out_filename, "w")

    write_sff_header(header, outhandle)

    l = 0
    for f in flowgrams:
        qual_trimmed_flowgram = f.getQualityTrimmedFlowgram()

        if barcode_mapping:
            if f.Name in barcode_mapping:
                trunc_flowgram = qual_trimmed_flowgram.getPrimerTrimmedFlowgram(
                    primerseq=DEFAULT_KEYSEQ + barcode_mapping[f.Name] + primer)
            else:
                continue
        else:
            prim = DEFAULT_KEYSEQ
            if primer:
                prim += primer
            trunc_flowgram = qual_trimmed_flowgram.getPrimerTrimmedFlowgram(
                primerseq=prim)

        if trunc_flowgram is not None:
            outhandle.write(trunc_flowgram.createFlowHeader() + "\n")
            l += 1

    return (out_filename, l)
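# Hedged usage sketch (assumption, not part of the original module): trims a
# parsed sff.txt stream and writes the result to a temp file in outdir.
# lazy_parse_sff_handle is the parser already used elsewhere in this module;
# the input path and primer sequence are hypothetical placeholders.
def _example_truncate_flowgrams(sff_fp="/tmp/reads.sff.txt"):
    (flowgrams, header) = lazy_parse_sff_handle(open(sff_fp))
    out_fp, num_written = truncate_flowgrams_in_SFF(
        flowgrams, header, outdir="/tmp/",
        primer="CATGCTGCCTCCCGTAGGAGT")  # hypothetical primer sequence
    return out_fp, num_written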
def build_averaged_flowgrams(mapping, sff_fp, min_coverage=50, out_fp=None):
    """Build averaged flowgrams for each cluster in mapping.

    mapping: a cluster mapping as dictionary of lists
    sff_fp: pointer to sff.txt file, must be consistent with mapping
    min_coverage: number of flowgrams to average over for each cluster
    out_fp: output file name

    NOTE: This function has no test code, since it is mostly IO around
          tested functions
    """
    l = len(mapping)
    (flowgrams, header) = lazy_parse_sff_handle(open(sff_fp))
    # update some values in the sff header
    header["# of Reads"] = l
    header["Index Length"] = "NA"

    if out_fp:
        out_filename = out_fp
    else:
        fd, out_filename = mkstemp(dir="/tmp/",
                                   prefix="prefix_dereplicated",
                                   suffix=".sff.txt")
        close(fd)

    outhandle = open(out_filename, "w")

    # write out reduced flowgram set
    write_sff_header(header, outhandle)

    seqs = {}
    # get a random sample for each cluster
    sample_keys = sample_mapped_keys(mapping, min_coverage)
    for ave_f, id in _average_flowgrams(mapping, flowgrams, sample_keys):
        outhandle.write(ave_f.createFlowHeader() + "\n")
        ave_f.Bases = ave_f.toSeq()
        seqs[id] = ave_f.Bases

    outhandle.close()
    return (out_filename, seqs)
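# Hedged usage sketch (assumption, not part of the original module): builds
# per-cluster averaged flowgrams from a phase-I cluster mapping. The mapping
# below is a hypothetical toy example; keys are cluster seed ids and values
# are the ids of the reads mapped to that seed, and sff_fp must contain the
# corresponding flowgrams.
def _example_build_averaged_flowgrams(sff_fp="/tmp/reads.sff.txt"):
    toy_mapping = {
        "read_1": ["read_4", "read_7"],  # cluster seeded by read_1
        "read_2": [],                    # singleton cluster
    }
    out_fp, seqs = build_averaged_flowgrams(toy_mapping, sff_fp,
                                            min_coverage=50)
    return out_fp, seqs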
def greedy_clustering(sff_fp, seqs, cluster_mapping, outdir, num_flows,
                      log_fh, num_cpus=1, on_cluster=False,
                      bail_out=1, pair_id_thresh=0.97, verbose=False,
                      threshold=3.75, fast_method=True,
                      error_profile=DENOISER_DATA_DIR + 'FLX_error_profile.dat',
                      max_num_rounds=None, checkpoint_fp=None):
    """second clustering phase of denoiser.

    sff_fp: flowgram file
    seqs: fasta seqs corresponding to sff_fp
    cluster_mapping: preliminary cluster mapping from phase I
    outdir: output directory
    num_flows: number of flowgrams in sff_fp (need to know before parsing sff_fp)
    log_fh: write verbose info to log_fh if set
    num_cpus: number of cpus to use if on_cluster == True
    on_cluster: run in parallel if True
    bail_out: stop clustering with first cluster having bail_out members
    pair_id_thresh: always cluster flowgrams whose flowgram alignment implies
                    a seq identity of pair_id_thresh or higher
    verbose: be verbose or not
    threshold: low clustering threshold for phase II
    fast_method: use more memory intensive but faster method
    error_profile: path to error profile *.dat file
    max_num_rounds: If set, will stop clustering after this many rounds
    checkpoint_fp: If set, resume clustering from this checkpoint file
    """
    (flowgrams, header) = lazy_parse_sff_handle(open(sff_fp))
    l = num_flows

    spread = [1.0 for x in range(num_cpus)]
    (client_sockets, workers) = (None, None)
    if on_cluster:
        (client_sockets, workers, server_socket) = \
            setup_cluster(num_cpus, outdir, verbose, error_profile)

    if checkpoint_fp:
        (checkpoint_key, round_ctr, cluster_mapping, ids, bestscores,
         sorted_keys) = read_checkpoint(checkpoint_fp)
        skipping = True
    else:
        # ids stores all the active sequences
        # we initialize it with the ids from the seqs dict here,
        # as it starts with all active flows.
        ids = dict.fromkeys(seqs)
        sorted_keys = sort_mapping_by_size(cluster_mapping)
        bestscores = {}
        round_ctr = 1

    # this is the main clustering loop, where most of the compute time is spent
    for key in sorted_keys:
        # skip until we reach the checkpoint
        if checkpoint_fp:
            if checkpoint_key == key:
                if log_fh:
                    log_fh.write("Resume denoising with %s\n" % key)
                skipping = False
            if skipping:
                continue

        if key not in cluster_mapping:
            # this guy already has been clustered
            continue

        if max_num_rounds and round_ctr > max_num_rounds:
            if log_fh:
                log_fh.write("Max number of rounds reached. " +
                             "Aborting clustering phase II and continuing "
                             "with phase III.\n")
            break

        prefix_clustersize = len(cluster_mapping[key])
        # abort greedy first phase
        if prefix_clustersize < bail_out:
            break

        # Do not take bad sequences as cluster seeds, as this will break the code
        if 'N' in seqs[key]:
            continue

        # check and delete workers if no longer needed
        if on_cluster:
            num_cpus = adjust_workers(l, num_cpus, client_sockets, log_fh)
            # check for dead workers
            check_workers(workers, client_sockets, log_fh)
            if num_cpus != len(spread):
                spread = [1.0 for x in range(num_cpus)]

        # write checkpoint right before expensive computation starts
        # Currently, write checkpoint every 50 rounds,
        # could easily be changed here or exposed to command line
        if (round_ctr % 50) == 0:
            write_checkpoint(key, round_ctr, cluster_mapping, ids, bestscores,
                             sorted_keys, outdir)

        if log_fh:
            log_fh.write("Round %d:\n" % round_ctr)
            log_remaining_rounds(ids, cluster_mapping, bail_out, log_fh)

        ideal_flow = seq_to_flow(seqs[key])
        (new_flowgrams, newl) = filter_with_flowgram(
            key, ideal_flow, flowgrams, header, ids, l, bestscores, log_fh,
            outdir, on_cluster=on_cluster, num_cpus=num_cpus,
            fast_method=fast_method, mapping=cluster_mapping, verbose=verbose,
            threshold=threshold, pair_id_thresh=pair_id_thresh,
            client_sockets=client_sockets, error_profile=error_profile,
            spread=spread)

        l = newl
        flowgrams = new_flowgrams
        round_ctr += 1
        if newl == 0:
            # all flowgrams clustered
            break

        # JR: I think this is too much info for the regular user, I leave it
        # in, so we can simply turn it on for debugging
        # if log_fh:
        #     log_fh.write("Throughput Spread %s\n" % str(spread))

    if on_cluster:
        stop_workers(client_sockets, log_fh)
        server_socket.close()

    # write all remaining flowgrams into file for next step
    # TODO: might use abstract FlowgramContainer here as well
    fd, non_clustered_filename = mkstemp(dir=outdir, prefix="ff",
                                         suffix=".sff.txt")
    close(fd)
    non_clustered_fh = open(non_clustered_filename, "w")
    write_sff_header(header, non_clustered_fh)
    for f in flowgrams:
        if f.Name in ids:
            non_clustered_fh.write(f.createFlowHeader() + "\n")

    return (non_clustered_filename, bestscores, cluster_mapping)