Exemple #1
0
 def lzw_compute_slice(slice_start):
     """Append LZW scores for this slice's share of the reads to its temp file.

     Walks the synchronized reads of input_files and handles only the entries
     whose position satisfies read_index % slice_step == slice_start.  For each
     such entry (a read, or a read pair) one line is appended to
     temp_file_names[slice_start]: the minimum lzw score over the reads in
     that entry.

     Relies on names bound outside this function (input_files, slice_step,
     temp_file_names, threshold_readlength, cutoff).
     """
     score_read = PipelineStepRunLZW.lzw_score
     # Mode "a": scores are appended, never truncating what is already there.
     with open(temp_file_names[slice_start], "a") as out:
         for read_index, read_group in enumerate(fasta.synchronized_iterator(input_files)):
             # Entries belonging to other slices are skipped.
             if read_index % slice_step != slice_start:
                 continue
             scores = [
                 score_read(read.sequence, threshold_readlength, cutoff)
                 for read in read_group
             ]
             out.write(str(min(scores)) + "\n")
Exemple #2
0
    def generate_lzw_filtered(fasta_files, output_files, cutoff_scores, threshold_readlength):
        """Bin reads by LZW score and keep the bin of the highest cutoff that any read exceeds.

        The scores come from PipelineStepRunLZW.lzw_compute, which yields the path of a
        file holding one float per line; those lines are consumed in lockstep with
        fasta.synchronized_iterator(fasta_files).  Each read (or read pair) is written
        to the bin of the highest cutoff its score strictly exceeds; reads exceeding no
        cutoff are dropped.  The files of the first non-empty bin, scanning cutoffs from
        high to low, are then moved onto output_files and a summary is logged.

        Args:
            fasta_files: input fasta paths, one per mate; same length as output_files.
            output_files: destination paths, parallel to fasta_files.
            cutoff_scores: iterable of candidate cutoff scores, in any order.  It is
                not modified.
            threshold_readlength: passed through to PipelineStepRunLZW.lzw_compute.

        Raises:
            AssertionError: if fasta_files and output_files differ in length.
            RuntimeError: if no read exceeds any cutoff, including when cutoff_scores
                is empty.
        """
        assert len(fasta_files) == len(output_files)

        # This is the bulk of the computation.  Everything else below is just binning by cutoff score.
        coalesced_score_file = PipelineStepRunLZW.lzw_compute(fasta_files, threshold_readlength)

        # Work on a sorted copy (high to low) so the caller's sequence is left untouched.
        cutoffs = sorted(cutoff_scores, reverse=True)

        readcount_list = [0] * len(cutoffs)  # one item per cutoff
        outstreams_list = []  # one item per cutoff
        outfiles_list = []  # one item per cutoff

        total_reads = 0
        try:
            for cutoff in cutoffs:
                outfiles = ["%s-%f" % (f, cutoff) for f in output_files]
                outfiles_list.append(outfiles)
                outstreams = []
                # Register the list before opening anything so that a failure part-way
                # through still lets the finally clause close what was opened.
                outstreams_list.append(outstreams)
                for outfile_name in outfiles:
                    outstreams.append(open(outfile_name, 'w'))

            outstreams_for_cutoff = list(zip(outstreams_list, cutoffs))

            with open(coalesced_score_file, "r") as sf:
                scores = (float(line) for line in sf)
                for reads, score in zip(fasta.synchronized_iterator(fasta_files), scores):
                    total_reads += 1
                    for i, (outstreams, cutoff) in enumerate(outstreams_for_cutoff):
                        if score > cutoff:
                            readcount_list[i] += 1
                            for ostr, r in zip(outstreams, reads):
                                ostr.write(r.header + "\n")
                                ostr.write(r.sequence + "\n")
                            # Cutoffs are descending: the first match is the highest bin.
                            break
        finally:
            # Always release the output streams and drop the temporary score file,
            # even when binning fails part-way through.
            for outstreams in outstreams_list:
                for ostr in outstreams:
                    ostr.close()
            os.remove(coalesced_score_file)

        # get the right output file and metrics
        kept_count = 0
        filtered = total_reads
        cutoff_frac = None
        for cutoff_frac, readcount, outfiles in zip(cutoffs, readcount_list, outfiles_list):
            if readcount > 0:
                # found the right bin
                kept_count = readcount
                filtered = total_reads - kept_count
                # move the output files over
                for outfile, output_file in zip(outfiles, output_files):
                    command.execute("mv %s %s" % (outfile, output_file))
                break

        if kept_count == 0:
            if cutoff_frac is None:
                # No cutoffs at all: "%f" cannot format None, so report it explicitly.
                raise RuntimeError("All the reads are filtered by LZW: no cutoff scores were provided")
            # The loop ran to completion, so cutoff_frac is the lowest cutoff.
            raise RuntimeError("All the reads are filtered by LZW with lowest cutoff: %f" % cutoff_frac)

        # kept_count > 0 implies total_reads > 0, so the division is safe.
        kept_ratio = float(kept_count)/float(total_reads)
        msg = "LZW filter: cutoff_frac: %f, total reads: %d, filtered reads: %d, " \
              "kept ratio: %f" % (cutoff_frac, total_reads, filtered, kept_ratio)
        log.write(msg)