Example #1
    def picard_mark_duplicates(self):
        jobs = []
        for sample in self.samples:
            alignment_file_prefix = os.path.join("alignment", sample.name, sample.name + ".")
            input = alignment_file_prefix + "realigned.qsorted.bam"
            output = alignment_file_prefix + "sorted.dup.bam"
            metrics_file = alignment_file_prefix + "sorted.dup.metrics"

            job = picard.mark_duplicates([input], output, metrics_file)
            job.name = "picard_mark_duplicates." + sample.name
            jobs.append(job)
        return jobs
Example #2
    def picard_mark_duplicates(self):
        """
            Runs Picard mark duplicates on the sorted bam file.
        """
        jobs = []
        for readset in [readset for readset in self.readsets if readset.bam]:
            input_file_prefix = readset.bam + '.'
            input = input_file_prefix + "bam"
            output = input_file_prefix + "dup.bam"
            metrics_file = readset.bam + ".dup.metrics"

            job = picard.mark_duplicates([input], output, metrics_file)
            job.name = "picard_mark_duplicates." + readset.name + ".dup." + self.run_id + "." + str(self.lane_number)
            jobs.append(job)

        self.add_copy_job_inputs(jobs)
        return jobs
Example #3
    def picard_mark_duplicates(self):
        """
        Mark duplicates. Aligned reads per sample are duplicates if they have the same 5' alignment positions
        (for both mates in the case of paired-end reads). All but the best pair (based on alignment score)
        will be marked as a duplicate in the BAM file. Marking duplicates is done using [Picard](http://broadinstitute.github.io/picard/).
        """

        jobs = []
        for sample in self.samples:
            alignment_file_prefix = os.path.join("alignment", sample.name, sample.name + ".sorted.")

            job = picard.mark_duplicates(
                [alignment_file_prefix + "bam"],
                alignment_file_prefix + "mdup.bam",
                alignment_file_prefix + "mdup.metrics"
            )
            job.name = "picard_mark_duplicates." + sample.name
            jobs.append(job)
        return jobs
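Every example on this page delegates the actual command construction to a picard.mark_duplicates(inputs, output, metrics_file) helper that returns a Job object. That helper is not shown here; below is a minimal, hypothetical sketch of what such a factory could look like, assuming the Job class used in the later examples and Picard's legacy INPUT=/OUTPUT= argument syntax. The real picard wrapper module likely also handles module loading, Java memory and tmp-dir configuration.

# Hypothetical sketch only: not the pipeline's actual picard module.
def mark_duplicates(inputs, output, metrics_file, remove_duplicates="false"):
    return Job(
        inputs,
        [output, metrics_file],
        command="""\
java -jar picard.jar MarkDuplicates \\
  REMOVE_DUPLICATES={remove_duplicates} CREATE_INDEX=true \\
  {input_args} \\
  OUTPUT={output} \\
  METRICS_FILE={metrics_file}""".format(
            remove_duplicates=remove_duplicates,
            input_args=" ".join("INPUT=" + input_file for input_file in inputs),
            output=output,
            metrics_file=metrics_file
        )
    )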
Example #5
    def picard_mark_duplicates(self):
        """
        Mark duplicates. Aligned reads per sample are duplicates if they have the same 5' alignment positions
        (for both mates in the case of paired-end reads). All but the best pair (based on alignment score)
        will be marked as a duplicate in the BAM file. Marking duplicates is done using [Picard](http://broadinstitute.github.io/picard/).
        """

        jobs = []
        for sample in self.samples:
            alignment_file_prefix = os.path.join(self.output_dirs['alignment_output_directory'], sample.name, sample.name + ".")
            input = alignment_file_prefix + "merged.bam"
            output = alignment_file_prefix + "sorted.dup.bam"
            metrics_file = alignment_file_prefix + "sorted.dup.metrics"

            job = picard.mark_duplicates([input], output, metrics_file)
            job.name = "picard_mark_duplicates." + sample.name
            job.sample = [sample]
            jobs.append(job)

        report_file = os.path.join(self.output_dirs['report_output_directory'], "ChipSeq.picard_mark_duplicates.md")
        jobs.append(
            Job(
                [os.path.join(self.output_dirs['alignment_output_directory'], sample.name, sample.name + ".sorted.dup.bam") for sample in self.samples],
                [report_file],
                command="""\
mkdir -p {report_dir} && \\
cp \\
  {report_template_dir}/{basename_report_file} \\
  {report_file}""".format(
                    report_template_dir=self.report_template_dir,
                    basename_report_file=os.path.basename(report_file),
                    report_file=report_file,
                    report_dir=self.output_dirs['report_output_directory']
                ),
                report_files=[report_file],
                name="picard_mark_duplicates_report")
        )

        return jobs
Example #6
    def picard_mark_duplicates(self):
        """
        Mark duplicates. Aligned reads per sample are duplicates if they have the same 5' alignment positions
        (for both mates in the case of paired-end reads). All but the best pair (based on alignment score)
        will be marked as a duplicate in the BAM file. Marking duplicates is done using [Picard](http://broadinstitute.github.io/picard/).
        """

        jobs = []
        for sample in self.samples:
            alignment_file_prefix = os.path.join("alignment", sample.name, sample.name + ".")
            input = alignment_file_prefix + "merged.bam"
            output = alignment_file_prefix + "sorted.dup.bam"
            metrics_file = alignment_file_prefix + "sorted.dup.metrics"

            job = picard.mark_duplicates([input], output, metrics_file)
            job.name = "picard_mark_duplicates." + sample.name
            jobs.append(job)

        report_file = os.path.join("report", "ChipSeq.picard_mark_duplicates.md")
        jobs.append(
            Job(
                [os.path.join("alignment", sample.name, sample.name + ".sorted.dup.bam") for sample in self.samples],
                [report_file],
                command="""\
mkdir -p report && \\
cp \\
  {report_template_dir}/{basename_report_file} \\
  {report_file}""".format(
                    report_template_dir=self.report_template_dir,
                    basename_report_file=os.path.basename(report_file),
                    report_file=report_file
                ),
                report_files=[report_file],
                name="picard_mark_duplicates_report")
        )

        return jobs
Example #7
    def run_cicero(self):
        """
        Fusion detection specializing in internal tandem duplication (ITD)
        https://genomebiology.biomedcentral.com/articles/10.1186/s13059-020-02043-x
        https://github.com/stjude/Cicero

        This software runs as a Docker application, but it can also be installed manually.
        As of May 2021, versions 0.2.0, 0.3.0 and 1.4.2 are available as modules on the HPF.

        Also runs RNApeg, a complementary tool that generates the junctions file used by CICERO.
        Available on the HPF via RNApeg/20210226; it runs as a Singularity container.
        """
        jobs = []

        for sample in self.samples:
            # Get fastq files
            if len(sample.readsets) > 1:
                raise Exception("Error: only one read set per sample allowed")
            if sample.readsets[0].bam:  # .bam input
                fastq_dir = os.path.join("fusions", "picard_sam_to_fastq",
                                         sample.name)
                bam = sample.readsets[0].bam
                fq1 = os.path.join(
                    self._output_dir, fastq_dir,
                    os.path.basename(re.sub(r"\.bam$", ".pair1.fastq.gz",
                                            bam)))
                fq2 = os.path.join(
                    self._output_dir, fastq_dir,
                    os.path.basename(re.sub(r"\.bam$", ".pair2.fastq.gz",
                                            bam)))
            elif sample.readsets[0].fastq2 and sample.readsets[0].fastq2.split(
                    ".")[-1] == "gz":
                fq1 = sample.readsets[0].fastq1
                fq2 = sample.readsets[0].fastq2
            else:
                raise Exception(
                    "Error: only .bam and .fastq.gz inputs allowed")

            # Directories
            tmp_dir = "/localhd/${PBS_JOBID}"  # The variable should be unevaluated in the qsub script
            trim_dir = os.path.join(tmp_dir, "trimmomatic")
            align_dir = os.path.join(tmp_dir, "star")
            cicero_dir = os.path.join(tmp_dir, "cicero")
            rnapeg_dir = os.path.join(tmp_dir, "rnapeg")
            output_dir = os.path.join("fusions", "cicero", sample.name)

            # Files
            fq1_trimmed = os.path.join(
                trim_dir, "".join([sample.name, ".trimmed.R1.fq.gz"]))
            fq2_trimmed = os.path.join(
                trim_dir, "".join([sample.name, ".trimmed.R2.fq.gz"]))
            fq1_dropped = os.path.join(
                trim_dir, "".join([sample.name, ".filtered.R1.fq.gz"]))
            fq2_dropped = os.path.join(
                trim_dir, "".join([sample.name, ".filtered.R2.fq.gz"]))
            trim_log = os.path.join(trim_dir,
                                    "".join([sample.name, ".trim.log"]))
            star_bam = os.path.join(align_dir, "Aligned.sortedByCoord.out.bam")
            dedup_bam = os.path.join(align_dir,
                                     "Aligned.sortedByCoord.dedup.bam")
            dedup_metrics = os.path.join(
                align_dir, "Aligned.sortedByCoord.dedup.metrics")
            symlink_bam = os.path.join(cicero_dir, sample.name + ".bam")
            junction_file = os.path.join(
                rnapeg_dir, sample.name + ".bam.junctions.tab.shifted.tab")

            # Jobs
            trim = trimmomatic.trimmomatic(
                fq1, fq2, fq1_trimmed, fq1_dropped, fq2_trimmed, fq2_dropped,
                None, None,
                config.param("trimmomatic", "adapter_fasta",
                             required=False), trim_log)
            align = star.align(fq1_trimmed,
                               fq2_trimmed,
                               align_dir,
                               config.param("run_cicero", "genome_build"),
                               rg_id=sample.name,
                               rg_library=sample.name,
                               rg_sample=sample.name,
                               rg_platform="ILLUMINA",
                               sort_bam=True)
            index = samtools.index(star_bam)
            # mark_duplicates also creates the BAM index for us: idx_file=re.sub(r"\.bam$", ".bai", dedup_bam)
            dedup = picard.mark_duplicates([star_bam], dedup_bam,
                                           dedup_metrics)
            # RNApeg
            rna_peg = Job(
                input_files=[dedup_bam],
                output_files=[junction_file],
                module_entries=[("run_cicero", "module_rnapeg")],
                name="RNApeg",
                command="""ln -s \\\n{idx_file} \\\n{new_idx_file} && \\
ln -s {bamfile} \\\n{new_bamfile} && \\
singularity exec --cleanenv -B /hpf:/hpf -B /localhd:/localhd -B {outpath}:/results \\
$(which rnapeg.sif) RNApeg.sh -b {new_bamfile} \\\n   -f {ref} \\\n   -r {reflat}"""
                .format(bamfile=dedup_bam,
                        ref=config.param("run_cicero",
                                         "reference",
                                         required=True),
                        reflat=config.param("run_cicero",
                                            "reflat",
                                            required=True),
                        outpath=rnapeg_dir,
                        idx_file=re.sub(r"\.bam$", ".bai", dedup_bam),
                        new_bamfile=symlink_bam,
                        new_idx_file=symlink_bam + ".bai"))
            # Cicero
            cicero = Job(
                input_files=[dedup_bam, junction_file],
                output_files=[
                    os.path.join(cicero_dir, "CICERO_DATADIR", sample.name,
                                 "final_fusions.txt")
                ],
                module_entries=[("run_cicero", "module_cicero")],
                name="run_cicero" + sample.name,
                command=
                """singularity exec --cleanenv -B /hpf:/hpf -B /localhd:/localhd \\
                         $CICERO_PATH/CICERO_1.4.2.sif \\
Cicero.sh -n {threads} -b {bamfile} \\\n -g {genome} \\\n -r {reference} \\\n  -j {junction} -o {out_dir}"""
                .format(threads=config.param("run_cicero",
                                             "threads",
                                             required=True),
                        bamfile=symlink_bam,
                        genome=config.param("run_cicero",
                                            "genome",
                                            required=True),
                        reference=config.param("run_cicero",
                                               "cicero_data",
                                               required=True),
                        junction=junction_file,
                        out_dir=cicero_dir))
            save_out = Job(
                input_files=[
                    os.path.join(cicero_dir, "CICERO_DATADIR", sample.name,
                                 "final_fusions.txt")
                ],
                output_files=[os.path.join(output_dir, "final_fusions.txt")],
                name="save_cicero_results" + sample.name,
                command="""mv {files_to_keep} {target_dir}""".format(
                    files_to_keep=" ".join([
                        junction_file,
                        os.path.join(cicero_dir, "0*.{err,log}"),  # Logs
                        os.path.join(cicero_dir, "CICERO_DATADIR", sample.name,
                                     "*.{txt,frame.tab,html}")  #
                        # Result files
                    ]),
                    target_dir=output_dir)
            )  # the files in /localhd/ should be removed automatically upon job end

            job_mkdir = Job(
                command="mkdir -p {trim} {align} {cicero} {output} {rnapeg}".
                format(trim=trim_dir,
                       align=align_dir,
                       cicero=cicero_dir,
                       output=output_dir,
                       rnapeg=rnapeg_dir))
            combined_job = concat_jobs([
                job_mkdir, trim, align, index, dedup, rna_peg, cicero, save_out
            ],
                                       name="run_cicero." + sample.name)
            # Replace input and output specification
            combined_job._output_files = [
                os.path.join(output_dir, "final_fusions.txt")
            ]
            combined_job.input_files = [fq1, fq2]
            jobs.append(combined_job)
        return jobs
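Example #7 above (and Example #8 below) builds several small per-step jobs and fuses them with concat_jobs into a single job per sample. That helper is not shown on this page either; the sketch below illustrates the assumed behaviour, using the keyword Job constructor from Example #7. The real implementation likely also merges module entries and removable files and skips jobs without commands.

# Hypothetical sketch only: not the real concat_jobs implementation.
def concat_jobs(jobs, name=None):
    all_outputs = [f for job in jobs for f in job.output_files]
    return Job(
        # Keep as inputs only the files that no step in the chain produces itself.
        input_files=[f for job in jobs for f in job.input_files if f not in all_outputs],
        output_files=all_outputs,
        # Chain the per-step shell commands so they run inside one scheduler job.
        command=" && \\\n".join(job.command for job in jobs),
        name=name
    )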
Example #8
    def ihec_preprocess_files(self):
        """
        Generate IHEC's files.
        
        """
        output_dir=self.output_dirs['ihecA_output_directory']
        jobs = []
        for sample in self.samples:
            alignment_directory = os.path.join(self.output_dirs['alignment_output_directory'], sample.name)
            # Find input readset BAMs first from previous bwa_mem_picard_sort_sam job, then from original BAMs in the readset sheet.
            readset_bams = [os.path.join(alignment_directory, readset.name, readset.name + ".sorted.bam") for readset in sample.readsets]
            sample_merge_bam = os.path.join(output_dir, sample.name + ".merged.bam")
            sample_merge_mdup_bam = os.path.join(output_dir, sample.name + ".merged.mdup.bam")
            sample_merge_mdup_metrics_file = os.path.join(output_dir, sample.name + ".merged.mdup.metrics")

            mkdir_job = Job(command="mkdir -p " + output_dir)

            # If this sample has one readset only, create a sample BAM symlink to the readset BAM, along with its index.
            if len(sample.readsets) == 1:
                readset_bam = readset_bams[0]
                if os.path.isabs(readset_bam):
                    target_readset_bam = readset_bam
                else:
                    target_readset_bam = os.path.relpath(readset_bam, output_dir)

                job = concat_jobs([
                    mkdir_job,
                    Job([readset_bam], [sample_merge_bam], command="ln -s -f " + target_readset_bam + " " + sample_merge_bam, removable_files=[sample_merge_bam]),
                ], name="ihecs_preprocess_symlink." + sample.name)

            elif len(sample.readsets) > 1:
                job = concat_jobs([
                    mkdir_job,
                    picard.merge_sam_files(readset_bams, sample_merge_bam)
                ])
                job.name = "ihecs_preprocess_merge." + sample.name

            jobs.append(job)

            tmp_dir = config.param('ihec_preprocess_files', 'tmp_dir')
            job = concat_jobs([
                Job(command="export TMPDIR={tmp_dir}".format(tmp_dir=tmp_dir)),
                picard.mark_duplicates([sample_merge_bam], sample_merge_mdup_bam, sample_merge_mdup_metrics_file)
            ])
            job.name = "ihecs_preprocess_mark_duplicates." + sample.name
            jobs.append(job)
        return jobs
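All of these step methods share the same contract: each returns a list of Job objects carrying a name, input files and output files. A minimal, hypothetical driver loop illustrating that contract is sketched below; the real pipeline framework resolves dependencies between jobs from their input and output files before submitting them to a scheduler.

# Hypothetical driver sketch; `pipeline` is assumed to be an instance of the
# class that defines the step methods shown above.
all_jobs = []
for step in (pipeline.picard_mark_duplicates, pipeline.ihec_preprocess_files):
    all_jobs.extend(step())  # each step method returns a list of Job objects

for job in all_jobs:
    print(job.name)  # e.g. "picard_mark_duplicates.sampleA"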