def merge_and_cov_scaffolds(self):
    """
    For each sample, merge the soft-clipped, one-end-anchored and orphan read
    BAMs into a single "readunmap" BAM, then compute its depth of coverage
    against the Ray scaffolds with BVATools.
    """
    jobs = []
    for sample in self.samples:
        ray_directory = os.path.join("scaffolds", sample.name, "ray", "ray" + config.param('ray', 'kmer'))
        cov_directory = os.path.join(ray_directory, "cov")

        # Merge every category of unmapped / partially mapped reads into one BAM.
        unmapped_bams = [
            os.path.join(cov_directory, bam_name) for bam_name in (
                "sclip.1.bam",
                "sclip.2.bam",
                "OEAUNMAP.1.bam",
                "OEAUNMAP.2.bam",
                "ORPHAN.bam",
            )
        ]
        merged_bam = os.path.join(cov_directory, "readunmap.bam")
        merge_job = picard.merge_sam_files(unmapped_bams, merged_bam)
        merge_job.name = "covSca_merge_" + sample.name
        jobs.append(merge_job)

        # Depth of coverage of the merged reads over the Ray scaffolds.
        # NOTE(review): "--ommitN" spelling kept as-is — presumably matches the
        # BVATools CLI option name; confirm against the tool's help output.
        coverage_options = (
            "--gc --ommitN --minMappingQuality "
            + config.param('DEFAULT', 'min_mapping_quality')
            + " --threads "
            + config.param('merge_and_cov_scaffolds', 'threads')
        )
        coverage_job = bvatools.depth_of_coverage(
            merged_bam,
            os.path.join(cov_directory, "readunmap.cov.txt"),
            [],
            os.path.join(ray_directory, "Scaffolds.fasta"),
            coverage_options
        )
        coverage_job.name = "covSca_" + sample.name
        jobs.append(coverage_job)
    return jobs
def picard_merge_sam_files(self):
    """
    BAM readset files are merged into one file per sample. Merge is done using [Picard](http://broadinstitute.github.io/picard/).

    This step takes as input files:

    1. Aligned and sorted BAM output files from previous bwa_mem_picard_sort_sam step if available
    2. Else, BAM files from the readset file
    """
    jobs = []
    for sample in self.samples:
        alignment_directory = os.path.join("alignment", sample.name)
        # Per-readset sorted and filtered BAMs feeding the merge.
        readset_bams = [
            os.path.join(alignment_directory, readset.name, readset.name + ".sorted.filtered.bam")
            for readset in sample.readsets
        ]
        sample_bam = os.path.join(alignment_directory, sample.name + ".merged.bam")
        mkdir_job = Job(command="mkdir -p " + os.path.dirname(sample_bam))

        if len(sample.readsets) == 1:
            # Single readset: a symlink to the readset BAM stands in for the merge.
            readset_bam = readset_bams[0]
            target_readset_bam = readset_bam if os.path.isabs(readset_bam) \
                else os.path.relpath(readset_bam, alignment_directory)
            job = concat_jobs([
                mkdir_job,
                Job(
                    [readset_bam],
                    [sample_bam],
                    command="ln -s -f " + target_readset_bam + " " + sample_bam,
                    removable_files=[sample_bam]
                ),
            ], name="symlink_readset_sample_bam." + sample.name)
        elif len(sample.readsets) > 1:
            job = concat_jobs([mkdir_job, picard.merge_sam_files(readset_bams, sample_bam)])
            job.name = "picard_merge_sam_files." + sample.name

        jobs.append(job)
    return jobs
def picard_merge_sam_files(self):
    """
    Merge the sorted readset BAMs into one sorted BAM per sample using Picard.
    Samples with a single readset are skipped, since a symlink to the readset
    BAM was already created at the align step.
    """
    jobs = []
    for sample in self.samples:
        # Nothing to merge for single-readset samples (symlinked at align step).
        if len(sample.readsets) <= 1:
            continue
        alignment_directory = os.path.join("alignment", sample.name)
        sorted_inputs = [
            os.path.join(alignment_directory, readset.name + ".sorted.bam")
            for readset in sample.readsets
        ]
        merged_output = os.path.join(alignment_directory, sample.name + ".sorted.bam")
        merge_job = picard.merge_sam_files(sorted_inputs, merged_output)
        merge_job.name = "picard_merge_sam_files." + sample.name
        jobs.append(merge_job)
    return jobs
def picard_merge_sam_files(self):
    """
    BAM readset files are merged into one file per sample. Merge is done using [Picard](http://broadinstitute.github.io/picard/).
    """
    jobs = []
    for sample in self.samples:
        # A symlink was created at the align step for single-readset samples,
        # so only samples with several readsets need an actual merge.
        if len(sample.readsets) > 1:
            alignment_directory = os.path.join("alignment", sample.name)
            star_bams = [
                os.path.join(alignment_directory, readset.name, "Aligned.sortedByCoord.out.bam")
                for readset in sample.readsets
            ]
            merged_bam = os.path.join(alignment_directory, sample.name + ".sorted.bam")
            merge_job = picard.merge_sam_files(star_bams, merged_bam)
            merge_job.name = "picard_merge_sam_files." + sample.name
            jobs.append(merge_job)
    return jobs
def ihec_preprocess_files(self):
    """
    Generate IHEC's files.

    Per sample: merge the sorted readset BAMs (or symlink, for single-readset
    samples) into the IHEC output directory, then mark duplicates on the
    merged BAM with Picard.
    """
    output_dir = self.output_dirs['ihecA_output_directory']
    jobs = []
    for sample in self.samples:
        alignment_directory = os.path.join(self.output_dirs['alignment_output_directory'], sample.name)
        # Readset BAMs come first from the previous alignment step, else from
        # the original BAMs in the readset sheet.
        readset_bams = [
            os.path.join(alignment_directory, readset.name, readset.name + ".sorted.bam")
            for readset in sample.readsets
        ]
        sample_merge_bam = os.path.join(output_dir, sample.name + ".merged.bam")
        sample_merge_mdup_bam = os.path.join(output_dir, sample.name + ".merged.mdup.bam")
        sample_merge_mdup_metrics_file = os.path.join(output_dir, sample.name + ".merged.mdup.metrics")
        mkdir_job = Job(command="mkdir -p " + output_dir)

        if len(sample.readsets) == 1:
            # One readset: a symlink to the readset BAM stands in for the merge.
            readset_bam = readset_bams[0]
            target_readset_bam = readset_bam if os.path.isabs(readset_bam) \
                else os.path.relpath(readset_bam, output_dir)
            job = concat_jobs([
                mkdir_job,
                Job(
                    [readset_bam],
                    [sample_merge_bam],
                    command="ln -s -f " + target_readset_bam + " " + sample_merge_bam,
                    removable_files=[sample_merge_bam]
                ),
            ], name="ihecs_preprocess_symlink." + sample.name)
        elif len(sample.readsets) > 1:
            job = concat_jobs([mkdir_job, picard.merge_sam_files(readset_bams, sample_merge_bam)])
            job.name = "ihecs_preprocess_merge." + sample.name
        jobs.append(job)

        # Mark duplicates on the merged BAM, redirecting temporary files first.
        tmp_dir = config.param('ihec_preprocess_files', 'tmp_dir')
        job = concat_jobs([
            Job(command="export TMPDIR={tmp_dir}".format(tmp_dir=tmp_dir)),
            picard.mark_duplicates([sample_merge_bam], sample_merge_mdup_bam, sample_merge_mdup_metrics_file)
        ])
        job.name = "ihecs_preprocess_mark_duplicates." + sample.name
        jobs.append(job)
    return jobs
def merge_realigned(self):
    """
    Merge the per-region realigned BAMs produced by gatk_indel_realigner into
    one queryname-sorted BAM per sample.
    """
    jobs = []
    nb_jobs = config.param('gatk_indel_realigner', 'nb_jobs', type='posint')

    for sample in self.samples:
        # If nb_jobs == 1, a symlink was created in indel_realigner and no
        # merge is necessary.
        if nb_jobs <= 1:
            continue
        alignment_directory = os.path.join("alignment", sample.name)
        realign_directory = os.path.join(alignment_directory, "realign")
        merged_realigned_bam = os.path.join(alignment_directory, sample.name + ".realigned.qsorted.bam")

        # One BAM per sequence for the first nb_jobs - 1 entries of the
        # sequence dictionary, plus a catch-all "others" BAM.
        sequence_count = min(nb_jobs - 1, len(self.sequence_dictionary))
        realigned_bams = [
            os.path.join(realign_directory, sequence['name'] + ".bam")
            for sequence in self.sequence_dictionary[0:sequence_count]
        ]
        realigned_bams.append(os.path.join(realign_directory, "others.bam"))

        merge_job = picard.merge_sam_files(realigned_bams, merged_realigned_bam)
        merge_job.name = "merge_realigned." + sample.name
        jobs.append(merge_job)
    return jobs
def picard_merge_sam_files(self):
    """
    BAM readset files are merged into one file per sample. Merge is done using [Picard](http://broadinstitute.github.io/picard/).

    This step takes as input files:

    1. Aligned and sorted BAM output files from previous bwa_mem_picard_sort_sam step if available
    2. Else, BAM files from the readset file
    """
    jobs = []
    for sample in self.samples:
        alignment_directory = os.path.join("alignment", sample.name)
        # Collect the sorted, filtered BAM of every readset of this sample.
        readset_bams = []
        for readset in sample.readsets:
            readset_bams.append(os.path.join(alignment_directory, readset.name, readset.name + ".sorted.filtered.bam"))
        sample_bam = os.path.join(alignment_directory, sample.name + ".merged.bam")
        mkdir_job = Job(command="mkdir -p " + os.path.dirname(sample_bam))

        nb_readsets = len(sample.readsets)
        if nb_readsets == 1:
            # Single readset: link the readset BAM in place of a merged BAM.
            readset_bam = readset_bams[0]
            if not os.path.isabs(readset_bam):
                link_target = os.path.relpath(readset_bam, alignment_directory)
            else:
                link_target = readset_bam
            symlink_job = Job(
                [readset_bam],
                [sample_bam],
                command="ln -s -f " + link_target + " " + sample_bam,
                removable_files=[sample_bam]
            )
            job = concat_jobs([mkdir_job, symlink_job], name="symlink_readset_sample_bam." + sample.name)
        elif nb_readsets > 1:
            job = concat_jobs([
                mkdir_job,
                picard.merge_sam_files(readset_bams, sample_bam)
            ])
            job.name = "picard_merge_sam_files." + sample.name
        jobs.append(job)
    return jobs
def wiggle(self):
    """
    Generate wiggle tracks suitable for multiple browsers.

    For strand-specific paired-end libraries, the sample BAM is first split
    into forward- and reverse-strand BAMs (via SAM flag filters) and one
    bedGraph/bigWig pair is produced per strand; otherwise a single
    bedGraph/bigWig pair is produced per sample.
    """
    jobs = []

    # Check the library status: PAIRED_END unless any readset of the sample
    # is SINGLE_END.
    library = {}
    for readset in self.readsets:
        # Fix: dict.has_key() is deprecated (and removed in Python 3);
        # the `in` operator is the equivalent, portable test.
        if readset.sample not in library:
            library[readset.sample] = "PAIRED_END"
        if readset.run_type == "SINGLE_END":
            library[readset.sample] = "SINGLE_END"

    for sample in self.samples:
        bam_file_prefix = os.path.join("alignment", sample.name, sample.name + ".sorted.mdup.")
        input_bam = bam_file_prefix + "bam"
        bed_graph_prefix = os.path.join("tracks", sample.name, sample.name)
        big_wig_prefix = os.path.join("tracks", "bigWig", sample.name)

        if (config.param('DEFAULT', 'strand_info') != 'fr-unstranded') and library[sample] == "PAIRED_END":
            # Strand-specific protocol: extract each strand with two SAM flag
            # filters (first/second mate), then merge the two pieces per strand.
            input_bam_f1 = bam_file_prefix + "tmp1.forward.bam"
            input_bam_f2 = bam_file_prefix + "tmp2.forward.bam"
            input_bam_r1 = bam_file_prefix + "tmp1.reverse.bam"
            input_bam_r2 = bam_file_prefix + "tmp2.reverse.bam"
            output_bam_f = bam_file_prefix + "forward.bam"
            output_bam_r = bam_file_prefix + "reverse.bam"

            bam_f_job = concat_jobs([
                samtools.view(input_bam, input_bam_f1, "-bh -F 256 -f 81"),
                samtools.view(input_bam, input_bam_f2, "-bh -F 256 -f 161"),
                picard.merge_sam_files([input_bam_f1, input_bam_f2], output_bam_f),
                Job(command="rm " + input_bam_f1 + " " + input_bam_f2)
            ], name="wiggle." + sample.name + ".forward_strandspec")
            # Remove temporary-then-deleted files from job output files,
            # otherwise job is never up to date.
            bam_f_job.output_files.remove(input_bam_f1)
            bam_f_job.output_files.remove(input_bam_f2)

            bam_r_job = concat_jobs([
                Job(command="mkdir -p " + os.path.join("tracks", sample.name) + " " + os.path.join("tracks", "bigWig")),
                samtools.view(input_bam, input_bam_r1, "-bh -F 256 -f 97"),
                samtools.view(input_bam, input_bam_r2, "-bh -F 256 -f 145"),
                picard.merge_sam_files([input_bam_r1, input_bam_r2], output_bam_r),
                Job(command="rm " + input_bam_r1 + " " + input_bam_r2)
            ], name="wiggle." + sample.name + ".reverse_strandspec")
            # Remove temporary-then-deleted files from job output files,
            # otherwise job is never up to date.
            bam_r_job.output_files.remove(input_bam_r1)
            bam_r_job.output_files.remove(input_bam_r2)

            jobs.extend([bam_f_job, bam_r_job])

            outputs = [
                [bed_graph_prefix + ".forward.bedGraph", big_wig_prefix + ".forward.bw"],
                [bed_graph_prefix + ".reverse.bedGraph", big_wig_prefix + ".reverse.bw"],
            ]
        else:
            outputs = [[bed_graph_prefix + ".bedGraph", big_wig_prefix + ".bw"]]

        for bed_graph_output, big_wig_output in outputs:
            job = concat_jobs([
                Job(command="mkdir -p " + os.path.join("tracks", sample.name) + " " + os.path.join("tracks", "bigWig"), removable_files=["tracks"]),
                bedtools.graph(input_bam, bed_graph_output, big_wig_output, library[sample])
            ], name="wiggle." + re.sub(".bedGraph", "", os.path.basename(bed_graph_output)))
            jobs.append(job)
    return jobs