Example #1
    def lzw_compute(input_files, threshold_readlength, slice_step=NUM_SLICES):
        """Spawn subprocesses on NUM_SLICES of the input files, then coalesce the
        scores into a temp file, and return that file's name."""

        temp_file_names = [f"lzwslice_{slice_step}_{slice_start}.txt" for slice_start in range(slice_step + 1)]
        for tfn in temp_file_names:
            assert not os.path.exists(tfn)

        @run_in_subprocess
        def lzw_compute_slice(slice_start):
            """For each read, or read pair, in input_files, such that read_index % slice_step == slice_start,
            output the lzw score for the read, or the min lzw score for the pair."""
            lzw_score = PipelineStepRunLZW.lzw_score
            with open(temp_file_names[slice_start], "a") as slice_output:
                for i, reads in enumerate(fasta.synchronized_iterator(input_files)):
                    if i % slice_step == slice_start:
                        lzw_min_score = min(lzw_score(r.sequence, threshold_readlength) for r in reads)
                        slice_output.write(str(lzw_min_score) + "\n")

        # slices run in parallel
        mt_map(lzw_compute_slice, range(slice_step))

        slice_outputs = temp_file_names[:-1]
        coalesced_score_file = temp_file_names[-1]
        # paste -d '\n' interleaves the slice files line by line, restoring the
        # original read order; exhausted (shorter) slices produce blank lines,
        # which grep -v filters out.
        command.execute("paste -d '\n' " + " ".join(slice_outputs) + " | grep -v ^$ > " + coalesced_score_file)
        for tfn in slice_outputs:
            os.remove(tfn)
        return coalesced_score_file
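
Both examples follow the same slicing idiom: assign work by index modulo the number of slices, run each slice in its own subprocess, write one temp file per slice, then stitch the slices back together in input order. The sketch below is a minimal, self-contained rendering of that idiom using only the standard library; multiprocessing.Pool stands in for the project's mt_map and run_in_subprocess helpers, and score is a hypothetical placeholder for lzw_score.

    import os
    from multiprocessing import Pool

    NUM_SLICES = 4

    def score(record):
        # Hypothetical stand-in for lzw_score: fraction of distinct characters.
        return len(set(record)) / max(len(record), 1)

    def compute_slice(args):
        input_file, slice_idx = args
        slice_name = f"slice_{NUM_SLICES}_{slice_idx}.txt"
        with open(input_file) as records, open(slice_name, "w") as out:
            for i, record in enumerate(records):
                if i % NUM_SLICES == slice_idx:  # round-robin assignment
                    out.write(f"{score(record.strip())}\n")
        return slice_name

    def compute_all(input_file, output_file):
        # Pool.map plays the role of mt_map: one worker process per slice.
        with Pool(NUM_SLICES) as pool:
            slice_names = pool.map(compute_slice,
                                   [(input_file, s) for s in range(NUM_SLICES)])
        # Interleave the slice files line by line to restore input order,
        # mirroring the paste -d '\n' | grep -v ^$ step above.
        handles = [open(fn) for fn in slice_names]
        with open(output_file, "w") as out:
            while handles:
                for h in list(handles):
                    line = h.readline()
                    if line:
                        out.write(line)
                    else:
                        h.close()
                        handles.remove(h)
        for fn in slice_names:
            os.remove(fn)

    if __name__ == "__main__":
        compute_all("reads.txt", "scores.txt")  # file names are illustrative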
Example #2
    def calc_contig2coverage(bam_filename):
        # PySAM pileup is CPU-intensive. Each CPU core is assigned a slice of
        # the input BAM file on which to perform pileup. The slice contigs are
        # selected by contig_idx modulo num_slices. Each slice gets its own
        # pair of temporary output files, one in CSV format and one in JSON.
        # In the end, these slice outputs are concatenated. This is a similar
        # pattern to run_lzw above.
        # Halve the logical core count (rounding up), presumably to estimate
        # physical cores on machines with hyperthreading.
        num_physical_cpu = (cpu_count() + 1) // 2
        num_slices = num_physical_cpu
        # One temp file per slice, plus one extra slot each for the collated
        # outputs. Note slice_idx rather than the builtin name slice.
        output_csv_filenames = [
            f"tmp_slice_{num_slices}_{slice_idx}.csv"
            for slice_idx in range(num_slices + 1)
        ]
        output_json_filenames = [
            f"tmp_slice_{num_slices}_{slice_idx}.json"
            for slice_idx in range(num_slices + 1)
        ]
        for fn in output_csv_filenames + output_json_filenames:
            if os.path.exists(fn):
                os.remove(fn)

        @run_in_subprocess
        def compute_slice(slice_idx):
            with open(output_csv_filenames[slice_idx], "w") as output_csv, \
                 open(output_json_filenames[slice_idx], "w") as output_json, \
                 pysam.AlignmentFile(bam_filename, "rb") as input_bam:  # noqa: E126
                for contig_idx, contig_name in enumerate(input_bam.references):
                    if contig_idx % num_slices == slice_idx:
                        PipelineStepGenerateCoverageStats._process_contig(
                            input_bam, output_csv, output_json, contig_name)

        # Compute pileup for each slice
        with LongRunningCodeSection(
                "PipelineStepGenerateCoverageStats.calc_contig2coverage.mt_map"
        ):
            mt_map(compute_slice, range(num_slices))
        # Output CSV headers
        with open(output_csv_filenames[-1], "w") as ocsv:
            ocsv.write(",".join(COVERAGE_STATS_SCHEMA))
            ocsv.write("\n")
        # Output the JSON dict's opening brace
        with open(output_json_filenames[-1], "w") as ojson:
            ojson.write("{")
        # Collate CSV slices
        command.execute(
            command_patterns.ShellScriptCommand(
                script=
                r'''cat "${individual_slice_outputs[@]}" >> "${collated_csv}";''',  # note >> for appending
                named_args={
                    'collated_csv': output_csv_filenames[-1],
                    'individual_slice_outputs': output_csv_filenames[:-1]
                }))
        for tfn in output_csv_filenames[:-1]:
            os.remove(tfn)
        # Collate JSON slices, replacing final ", " with "}"
        command.execute(
            command_patterns.ShellScriptCommand(
                script=
                r'''cat "${individual_slice_outputs[@]}" | sed 's=, $=}=' >> "${collated_json}";''',  # note >> for appending
                named_args={
                    'collated_json': output_json_filenames[-1],
                    'individual_slice_outputs': output_json_filenames[:-1]
                }))
        for tfn in output_json_filenames[:-1]:
            os.remove(tfn)
        return (output_csv_filenames[-1], output_json_filenames[-1])
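
The JSON collation at the end hinges on a small formatting contract: each slice entry is written as `"contig_name": {...}, ` with a trailing comma-space and no newline, the extra temp file contributes the opening brace, and the sed substitution s=, $=}= turns the final trailing ", " into the closing brace. A stand-alone illustration of the trick, assuming single-line fragments and made-up contig data:

    import json

    # Pretend per-slice JSON fragments, each entry followed by ", ".
    slice_fragments = [
        '"contig_1": {"coverage": 12.5}, "contig_3": {"coverage": 4.0}, ',
        '"contig_2": {"coverage": 7.25}, ',
    ]

    collated = "{" + "".join(slice_fragments)  # header file supplies "{"
    assert collated.endswith(", ")
    collated = collated[:-2] + "}"             # the sed 's=, $=}=' substitution
    print(json.loads(collated))                # parses as a valid dict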