def picard_mark_duplicates(self):
    """
    Runs Picard MarkDuplicates on the realigned, query-sorted BAM file of each sample.
    """
    jobs = []
    for sample in self.samples:
        alignment_file_prefix = os.path.join("alignment", sample.name, sample.name + ".")
        input = alignment_file_prefix + "realigned.qsorted.bam"
        output = alignment_file_prefix + "sorted.dup.bam"
        metrics_file = alignment_file_prefix + "sorted.dup.metrics"

        job = picard.mark_duplicates([input], output, metrics_file)
        job.name = "picard_mark_duplicates." + sample.name
        jobs.append(job)
    return jobs
def picard_mark_duplicates(self):
    """
    Runs Picard mark duplicates on the sorted bam file.
    """
    jobs = []
    for readset in [readset for readset in self.readsets if readset.bam]:
        input_file_prefix = readset.bam + '.'
        input = input_file_prefix + "bam"
        output = input_file_prefix + "dup.bam"
        metrics_file = readset.bam + ".dup.metrics"

        job = picard.mark_duplicates([input], output, metrics_file)
        job.name = "picard_mark_duplicates." + readset.name + ".dup." + self.run_id + "." + str(self.lane_number)
        jobs.append(job)

    self.add_copy_job_inputs(jobs)
    return jobs
def picard_mark_duplicates(self):
    """
    Mark duplicates. Aligned reads per sample are duplicates if they have the same 5' alignment positions
    (for both mates in the case of paired-end reads). All but the best pair (based on alignment score)
    will be marked as a duplicate in the BAM file. Marking duplicates is done using
    [Picard](http://broadinstitute.github.io/picard/).
    """
    jobs = []
    for sample in self.samples:
        alignment_file_prefix = os.path.join("alignment", sample.name, sample.name + ".sorted.")

        job = picard.mark_duplicates(
            [alignment_file_prefix + "bam"],
            alignment_file_prefix + "mdup.bam",
            alignment_file_prefix + "mdup.metrics"
        )
        job.name = "picard_mark_duplicates." + sample.name
        jobs.append(job)
    return jobs
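# The picard.mark_duplicates() wrapper used above returns a Job object that wraps the actual
# Picard MarkDuplicates invocation (the real wrapper also handles module loading, Java memory
# and tmp-dir settings from the pipeline config). As a rough, hypothetical sketch of the kind
# of command such a wrapper emits -- the `java_heap`, `tmp_dir` and `$PICARD_HOME` names below
# are illustrative assumptions, not the pipeline's actual parameters:

def sketch_mark_duplicates_command(input_bam, output_bam, metrics_file,
                                   java_heap="4G", tmp_dir="/tmp"):
    """Build a plausible Picard MarkDuplicates command line (illustration only)."""
    return (
        "java -Xmx{heap} -Djava.io.tmpdir={tmp} -jar $PICARD_HOME/picard.jar MarkDuplicates \\\n"
        " INPUT={inp} \\\n"
        " OUTPUT={out} \\\n"
        " METRICS_FILE={metrics} \\\n"
        " REMOVE_DUPLICATES=false CREATE_INDEX=true VALIDATION_STRINGENCY=SILENT"
    ).format(heap=java_heap, tmp=tmp_dir, inp=input_bam, out=output_bam, metrics=metrics_file)

# Example (hypothetical sample name):
# print(sketch_mark_duplicates_command("sampleA.sorted.bam",
#                                      "sampleA.sorted.mdup.bam",
#                                      "sampleA.sorted.mdup.metrics"))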
def picard_mark_duplicates(self):
    """
    Mark duplicates. Aligned reads per sample are duplicates if they have the same 5' alignment positions
    (for both mates in the case of paired-end reads). All but the best pair (based on alignment score)
    will be marked as a duplicate in the BAM file. Marking duplicates is done using
    [Picard](http://broadinstitute.github.io/picard/).
    """
    jobs = []
    for sample in self.samples:
        alignment_file_prefix = os.path.join(self.output_dirs['alignment_output_directory'], sample.name, sample.name + ".")
        input = alignment_file_prefix + "merged.bam"
        output = alignment_file_prefix + "sorted.dup.bam"
        metrics_file = alignment_file_prefix + "sorted.dup.metrics"

        job = picard.mark_duplicates([input], output, metrics_file)
        job.name = "picard_mark_duplicates." + sample.name
        job.sample = [sample]
        jobs.append(job)

    report_file = os.path.join(self.output_dirs['report_output_directory'], "ChipSeq.picard_mark_duplicates.md")
    jobs.append(
        Job(
            [os.path.join(self.output_dirs['alignment_output_directory'], sample.name, sample.name + ".sorted.dup.bam") for sample in self.samples],
            [report_file],
            command="""\
mkdir -p {report_dir} && \\
cp \\
  {report_template_dir}/{basename_report_file} \\
  {report_file}""".format(
                report_template_dir=self.report_template_dir,
                basename_report_file=os.path.basename(report_file),
                report_file=report_file,
                report_dir=self.output_dirs['report_output_directory']
            ),
            report_files=[report_file],
            name="picard_mark_duplicates_report"
        )
    )
    return jobs
def picard_mark_duplicates(self):
    """
    Mark duplicates. Aligned reads per sample are duplicates if they have the same 5' alignment positions
    (for both mates in the case of paired-end reads). All but the best pair (based on alignment score)
    will be marked as a duplicate in the BAM file. Marking duplicates is done using
    [Picard](http://broadinstitute.github.io/picard/).
    """
    jobs = []
    for sample in self.samples:
        alignment_file_prefix = os.path.join("alignment", sample.name, sample.name + ".")
        input = alignment_file_prefix + "merged.bam"
        output = alignment_file_prefix + "sorted.dup.bam"
        metrics_file = alignment_file_prefix + "sorted.dup.metrics"

        job = picard.mark_duplicates([input], output, metrics_file)
        job.name = "picard_mark_duplicates." + sample.name
        jobs.append(job)

    report_file = os.path.join("report", "ChipSeq.picard_mark_duplicates.md")
    jobs.append(
        Job(
            [os.path.join("alignment", sample.name, sample.name + ".sorted.dup.bam") for sample in self.samples],
            [report_file],
            command="""\
mkdir -p report && \\
cp \\
  {report_template_dir}/{basename_report_file} \\
  {report_file}""".format(
                report_template_dir=self.report_template_dir,
                basename_report_file=os.path.basename(report_file),
                report_file=report_file
            ),
            report_files=[report_file],
            name="picard_mark_duplicates_report"
        )
    )
    return jobs
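# The report Job above is nothing more than a formatted shell string. A minimal, self-contained
# way to preview what it renders to, using "/path/to/templates" as a purely hypothetical stand-in
# for self.report_template_dir:
import os

report_file = os.path.join("report", "ChipSeq.picard_mark_duplicates.md")
preview = """\
mkdir -p report && \\
cp \\
  {report_template_dir}/{basename_report_file} \\
  {report_file}""".format(
    report_template_dir="/path/to/templates",
    basename_report_file=os.path.basename(report_file),
    report_file=report_file
)
print(preview)
# mkdir -p report && \
# cp \
#   /path/to/templates/ChipSeq.picard_mark_duplicates.md \
#   report/ChipSeq.picard_mark_duplicates.md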
def run_cicero(self):
    """
    Fusion detection specializing in internal tandem duplication (ITD).
    https://genomebiology.biomedcentral.com/articles/10.1186/s13059-020-02043-x
    https://github.com/stjude/Cicero

    This software runs as a docker application; however, it can also be installed manually.
    As of May 2021, versions 0.2.0, 0.3.0 and 1.4.2 are available as modules on the HPF.

    Also runs RNApeg, a complementary tool that generates the junctions file used by CICERO.
    Available on the HPF via RNApeg/20210226 and runs as a singularity container.
    """
    jobs = []
    for sample in self.samples:
        # Get fastq files
        if len(sample.readsets) > 1:
            raise Exception("Error: only one read set per sample allowed")

        if sample.readsets[0].bam:
            # .bam input
            fastq_dir = os.path.join("fusions", "picard_sam_to_fastq", sample.name)
            bam = sample.readsets[0].bam
            fq1 = os.path.join(
                self._output_dir, fastq_dir,
                os.path.basename(re.sub(r"\.bam$", ".pair1.fastq.gz", bam)))
            fq2 = os.path.join(
                self._output_dir, fastq_dir,
                os.path.basename(re.sub(r"\.bam$", ".pair2.fastq.gz", bam)))
        elif sample.readsets[0].fastq2 and sample.readsets[0].fastq2.split(".")[-1] == "gz":
            fq1 = sample.readsets[0].fastq1
            fq2 = sample.readsets[0].fastq2
        else:
            raise Exception("Error: only .bam and .fastq.gz inputs allowed")

        # Directories
        tmp_dir = "/localhd/${PBS_JOBID}"  # The variable should be unevaluated in the qsub script
        trim_dir = os.path.join(tmp_dir, "trimmomatic")
        align_dir = os.path.join(tmp_dir, "star")
        cicero_dir = os.path.join(tmp_dir, "cicero")
        rnapeg_dir = os.path.join(tmp_dir, "rnapeg")
        output_dir = os.path.join("fusions", "cicero", sample.name)

        # Files
        fq1_trimmed = os.path.join(trim_dir, "".join([sample.name, ".trimmed.R1.fq.gz"]))
        fq2_trimmed = os.path.join(trim_dir, "".join([sample.name, ".trimmed.R2.fq.gz"]))
        fq1_dropped = os.path.join(trim_dir, "".join([sample.name, ".filtered.R1.fq.gz"]))
        fq2_dropped = os.path.join(trim_dir, "".join([sample.name, ".filtered.R2.fq.gz"]))
        trim_log = os.path.join(trim_dir, "".join([sample.name, ".trim.log"]))
        star_bam = os.path.join(align_dir, "Aligned.sortedByCoord.out.bam")
        dedup_bam = os.path.join(align_dir, "Aligned.sortedByCoord.dedup.bam")
        dedup_metrics = os.path.join(align_dir, "Aligned.sortedByCoord.dedup.metrics")
        symlink_bam = os.path.join(cicero_dir, sample.name + ".bam")
        junction_file = os.path.join(rnapeg_dir, sample.name + ".bam.junctions.tab.shifted.tab")

        # Jobs
        trim = trimmomatic.trimmomatic(
            fq1, fq2,
            fq1_trimmed, fq1_dropped,
            fq2_trimmed, fq2_dropped,
            None, None,
            config.param("trimmomatic", "adapter_fasta", required=False),
            trim_log)
        align = star.align(
            fq1_trimmed, fq2_trimmed, align_dir,
            config.param("run_cicero", "genome_build"),
            rg_id=sample.name,
            rg_library=sample.name,
            rg_sample=sample.name,
            rg_platform="ILLUMINA",
            sort_bam=True)
        index = samtools.index(star_bam)  # Also indexes for us!
        idx_file = re.sub(r"\.bam$", ".bai", dedup_bam)
        dedup = picard.mark_duplicates([star_bam], dedup_bam, dedup_metrics)

        # RNApeg
        rna_peg = Job(
            input_files=[dedup_bam],
            output_files=[junction_file],
            module_entries=[("run_cicero", "module_rnapeg")],
            name="RNApeg",
            command="""ln -s \\\n{idx_file} \\\n{new_idx_file} && \\
ln -s {bamfile} \\\n{new_bamfile} && \\
singularity exec --cleanenv -B /hpf:/hpf -B /localhd:/localhd -B {outpath}:/results \\
$(which rnapeg.sif) RNApeg.sh -b {new_bamfile} \\\n -f {ref} \\\n -r {reflat}""".format(
                bamfile=dedup_bam,
                ref=config.param("run_cicero", "reference", required=True),
                reflat=config.param("run_cicero", "reflat", required=True),
                outpath=rnapeg_dir,
                idx_file=idx_file,
                new_bamfile=symlink_bam,
                new_idx_file=symlink_bam + ".bai"))

        # Cicero
        cicero = Job(
            input_files=[dedup_bam, junction_file],
            output_files=[os.path.join(cicero_dir, "CICERO_DATADIR", sample.name, "final_fusions.txt")],
            module_entries=[("run_cicero", "module_cicero")],
            name="run_cicero" + sample.name,
            command="""singularity exec --cleanenv -B /hpf:/hpf -B /localhd:/localhd \\
$CICERO_PATH/CICERO_1.4.2.sif \\
Cicero.sh -n {threads} -b {bamfile} \\\n -g {genome} \\\n -r {reference} \\\n -j {junction} -o {out_dir}""".format(
                threads=config.param("run_cicero", "threads", required=True),
                bamfile=symlink_bam,
                genome=config.param("run_cicero", "genome", required=True),
                reference=config.param("run_cicero", "cicero_data", required=True),
                junction=junction_file,
                out_dir=cicero_dir))

        save_out = Job(
            input_files=[os.path.join(cicero_dir, "CICERO_DATADIR", sample.name, "final_fusions.txt")],
            output_files=[os.path.join(output_dir, "final_fusions.txt")],
            name="save_cicero_results" + sample.name,
            command="""mv {files_to_keep} {target_dir}""".format(
                files_to_keep=" ".join([
                    junction_file,
                    os.path.join(cicero_dir, "0*.{err,log}"),  # Logs
                    os.path.join(cicero_dir, "CICERO_DATADIR", sample.name, "*.{txt,frame.tab,html}")  # Result files
                ]),
                target_dir=output_dir))

        # The files in /localhd/ should be removed automatically upon job end
        job_mkdir = Job(
            command="mkdir -p {trim} {align} {cicero} {output} {rnapeg}".format(
                trim=trim_dir,
                align=align_dir,
                cicero=cicero_dir,
                output=output_dir,
                rnapeg=rnapeg_dir))

        combined_job = concat_jobs(
            [job_mkdir, trim, align, index, dedup, rna_peg, cicero, save_out],
            name="run_cicero." + sample.name)

        # Replace input and output specification
        combined_job._output_files = [os.path.join(output_dir, "final_fusions.txt")]
        combined_job.input_files = [fq1, fq2]

        jobs.append(combined_job)

    return jobs
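# run_cicero() derives several file names by substituting a ".bam" suffix with re.sub().
# A minimal, self-contained check of that pattern (the file names here are invented for
# illustration only, not taken from the pipeline):
import re

bam = "Aligned.sortedByCoord.dedup.bam"
print(re.sub(r"\.bam$", ".bai", bam))                        # Aligned.sortedByCoord.dedup.bai
print(re.sub(r"\.bam$", ".pair1.fastq.gz", "sampleA.bam"))   # sampleA.pair1.fastq.gz
print(re.sub(r"\.bam$", ".pair2.fastq.gz", "sampleA.bam"))   # sampleA.pair2.fastq.gz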
def ihec_preprocess_files(self):
    """
    Generate IHEC's files.
    """
    output_dir = self.output_dirs['ihecA_output_directory']
    jobs = []
    for sample in self.samples:
        alignment_directory = os.path.join(self.output_dirs['alignment_output_directory'], sample.name)
        # Find input readset BAMs first from previous bwa_mem_picard_sort_sam job,
        # then from original BAMs in the readset sheet.
        readset_bams = [os.path.join(alignment_directory, readset.name, readset.name + ".sorted.bam") for readset in sample.readsets]
        sample_merge_bam = os.path.join(output_dir, sample.name + ".merged.bam")
        sample_merge_mdup_bam = os.path.join(output_dir, sample.name + ".merged.mdup.bam")
        sample_merge_mdup_metrics_file = os.path.join(output_dir, sample.name + ".merged.mdup.metrics")

        mkdir_job = Job(command="mkdir -p " + output_dir)

        # If this sample has one readset only, create a sample BAM symlink to the readset BAM, along with its index.
        if len(sample.readsets) == 1:
            readset_bam = readset_bams[0]
            if os.path.isabs(readset_bam):
                target_readset_bam = readset_bam
            else:
                target_readset_bam = os.path.relpath(readset_bam, output_dir)

            job = concat_jobs([
                mkdir_job,
                Job([readset_bam], [sample_merge_bam],
                    command="ln -s -f " + target_readset_bam + " " + sample_merge_bam,
                    removable_files=[sample_merge_bam]),
            ], name="ihecs_preprocess_symlink." + sample.name)

        elif len(sample.readsets) > 1:
            job = concat_jobs([
                mkdir_job,
                picard.merge_sam_files(readset_bams, sample_merge_bam)
            ])
            job.name = "ihecs_preprocess_merge." + sample.name

        jobs.append(job)

        tmp_dir = config.param('ihec_preprocess_files', 'tmp_dir')
        job = concat_jobs([
            Job(command="export TMPDIR={tmp_dir}".format(tmp_dir=tmp_dir)),
            picard.mark_duplicates([sample_merge_bam], sample_merge_mdup_bam, sample_merge_mdup_metrics_file)
        ])
        job.name = "ihecs_preprocess_mark_duplicates." + sample.name
        jobs.append(job)
    return jobs
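# In the single-readset branch above, the symlink target is made relative to the IHEC output
# directory so the link still resolves if the project root is moved. A minimal sketch of that
# computation; the directory names "ihec_alignment", "sampleA" and "rs1" are assumptions used
# purely for illustration:
import os

readset_bam = os.path.join("alignment", "sampleA", "rs1", "rs1.sorted.bam")
output_dir = "ihec_alignment"
target_readset_bam = readset_bam if os.path.isabs(readset_bam) else os.path.relpath(readset_bam, output_dir)
print(target_readset_bam)  # ../alignment/sampleA/rs1/rs1.sorted.bam
print("ln -s -f " + target_readset_bam + " " + os.path.join(output_dir, "sampleA.merged.bam"))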