def lzw_compute_slice(slice_start):
    """Append LZW scores for one slice of the reads to that slice's temp file.

    Walks input_files in lockstep and keeps only the entries whose position
    satisfies ``position % slice_step == slice_start``.  For each kept entry
    (a single read or a read pair) one line is appended to
    ``temp_file_names[slice_start]``: the smallest lzw score among the reads
    in that entry.
    """
    score_of = PipelineStepRunLZW.lzw_score
    with open(temp_file_names[slice_start], "a") as out:
        for position, read_group in enumerate(fasta.synchronized_iterator(input_files)):
            if position % slice_step != slice_start:
                # This entry belongs to a different slice.
                continue
            lowest = min(
                score_of(read.sequence, threshold_readlength, cutoff)
                for read in read_group
            )
            out.write(str(lowest) + "\n")
def generate_lzw_filtered(fasta_files, output_files, cutoff_scores, threshold_readlength):
    """Filter reads by LZW score and write the survivors to output_files.

    Scores every read (or read pair) with PipelineStepRunLZW.lzw_compute,
    then bins each one under the highest cutoff its score strictly exceeds.
    The bin for the highest cutoff that received at least one read is moved
    onto output_files, and a summary line is written to the log.

    Args:
        fasta_files: input fasta paths, iterated in lockstep.
        output_files: destination paths, one per entry of fasta_files.
        cutoff_scores: candidate cutoffs; tried from highest to lowest.
            The caller's sequence is not modified.
        threshold_readlength: passed through to lzw_compute.

    Raises:
        ValueError: if cutoff_scores is empty.
        RuntimeError: if no read scores above the lowest cutoff.
    """
    assert len(fasta_files) == len(output_files)
    if not cutoff_scores:
        # Fail early with a clear message; previously this surfaced only at
        # the very end as a TypeError while formatting the error string.
        raise ValueError("generate_lzw_filtered requires at least one cutoff score")

    # This is the bulk of the computation.  Everything else below is just binning by cutoff score.
    coalesced_score_file = PipelineStepRunLZW.lzw_compute(fasta_files, threshold_readlength)

    # Highest cutoff first.  Sort a copy so the caller's list is left untouched.
    cutoffs = sorted(cutoff_scores, reverse=True)

    # One entry per cutoff: the bin's file names, its open streams, its read count.
    outfiles_list = [["%s-%f" % (f, cutoff) for f in output_files] for cutoff in cutoffs]
    outstreams_list = []
    readcount_list = [0] * len(cutoffs)
    total_reads = 0

    try:
        for outfiles in outfiles_list:
            outstreams = []
            # Register the list before opening so that a failure part-way
            # through still closes whatever was opened.
            outstreams_list.append(outstreams)
            for outfile_name in outfiles:
                outstreams.append(open(outfile_name, 'w'))
        outstreams_for_cutoff = list(zip(outstreams_list, cutoffs))

        with open(coalesced_score_file, "r") as score_file:
            scores = (float(line) for line in score_file)
            for reads, score in zip(fasta.synchronized_iterator(fasta_files), scores):
                total_reads += 1
                for i, (outstreams, cutoff) in enumerate(outstreams_for_cutoff):
                    if score > cutoff:
                        # First (highest) cutoff exceeded wins; each read lands in exactly one bin.
                        readcount_list[i] += 1
                        for ostr, r in zip(outstreams, reads):
                            ostr.write(r.header + "\n")
                            ostr.write(r.sequence + "\n")
                        break
    finally:
        # Release file handles and the temporary score file even on failure.
        for outstreams in outstreams_list:
            for ostr in outstreams:
                ostr.close()
        os.remove(coalesced_score_file)

    # get the right output file and metrics
    kept_count = 0
    cutoff_frac = cutoffs[-1]
    for cutoff_frac, readcount, outfiles in zip(cutoffs, readcount_list, outfiles_list):
        if readcount > 0:
            # found the right bin
            kept_count = readcount
            # move the output files over
            # NOTE(review): the remaining per-cutoff bin files are left on disk,
            # and the shell string is not quoted -- both unchanged from before.
            for outfile, output_file in zip(outfiles, output_files):
                command.execute("mv %s %s" % (outfile, output_file))
            break

    if kept_count == 0:
        raise RuntimeError("All the reads are filtered by LZW with lowest cutoff: %f" % cutoff_frac)

    filtered = total_reads - kept_count
    kept_ratio = float(kept_count) / float(total_reads)
    msg = "LZW filter: cutoff_frac: %f, total reads: %d, filtered reads: %d, " \
        "kept ratio: %f" % (cutoff_frac, total_reads, filtered, kept_ratio)
    log.write(msg)