Example #1
0
 def merge_and_cov_scaffolds(self):
     """
     Create, for each sample, the jobs that merge its unmapped-read BAMs and compute coverage on its scaffolds.

     Two jobs are queued per sample, in this order:

     1. "covSca_merge_<sample>": a picard.merge_sam_files job gathering the five BAMs
        found in the sample's "cov" directory into "readunmap.bam".
     2. "covSca_<sample>": a bvatools.depth_of_coverage job reading "readunmap.bam",
        writing "readunmap.cov.txt" and using the sample's "Scaffolds.fasta" as reference.

     Returns the list of all created jobs.
     """
     # File names (inside the "cov" directory) of the BAMs merged into readunmap.bam
     unmapped_bam_names = (
         "sclip.1.bam",
         "sclip.2.bam",
         "OEAUNMAP.1.bam",
         "OEAUNMAP.2.bam",
         "ORPHAN.bam",
     )
     all_jobs = []

     for sample in self.samples:
         # The Ray output directory is suffixed with the configured k-mer value
         scaffold_dir = os.path.join("scaffolds", sample.name, "ray", "ray" + config.param('ray', 'kmer'))
         coverage_dir = os.path.join(scaffold_dir, "cov")
         merged_bam = os.path.join(coverage_dir, "readunmap.bam")

         merge_job = picard.merge_sam_files(
             [os.path.join(coverage_dir, bam_name) for bam_name in unmapped_bam_names],
             merged_bam
         )
         merge_job.name = "covSca_merge_" + sample.name
         all_jobs.append(merge_job)

         coverage_options = (
             "--gc --ommitN --minMappingQuality " + config.param('DEFAULT', 'min_mapping_quality') +
             " --threads " + config.param('merge_and_cov_scaffolds', 'threads')
         )
         coverage_job = bvatools.depth_of_coverage(
             merged_bam,
             os.path.join(coverage_dir, "readunmap.cov.txt"),
             [],
             os.path.join(scaffold_dir, "Scaffolds.fasta"),
             coverage_options
         )
         coverage_job.name = "covSca_" + sample.name
         all_jobs.append(coverage_job)

     return all_jobs
Example #2
0
    def picard_merge_sam_files(self):
        """
        BAM readset files are merged into one file per sample. Merge is done using [Picard](http://broadinstitute.github.io/picard/).

        For each sample, the inputs are the readset BAMs
        `alignment/<sample>/<readset>/<readset>.sorted.filtered.bam` and the output is
        `alignment/<sample>/<sample>.merged.bam`:

        * one readset: the sample BAM is created as a symlink to the readset BAM;
        * several readsets: the readset BAMs are merged with Picard;
        * no readset: the sample is skipped since there is nothing to merge.

        Returns the list of created jobs (at most one per sample).
        """

        jobs = []
        for sample in self.samples:
            # Without this guard, no branch below would assign `job` and the append at the
            # end of the loop would either fail or re-append the previous sample's job.
            if not sample.readsets:
                continue

            alignment_directory = os.path.join("alignment", sample.name)
            readset_bams = [
                os.path.join(alignment_directory, readset.name,
                             readset.name + ".sorted.filtered.bam")
                for readset in sample.readsets
            ]
            sample_bam = os.path.join(alignment_directory,
                                      sample.name + ".merged.bam")

            mkdir_job = Job(command="mkdir -p " + os.path.dirname(sample_bam))

            if len(sample.readsets) == 1:
                # Single readset: no merge needed, symlink the sample BAM to the readset BAM.
                readset_bam = readset_bams[0]
                if os.path.isabs(readset_bam):
                    target_readset_bam = readset_bam
                else:
                    # A relative symlink target is resolved from the directory holding the link
                    target_readset_bam = os.path.relpath(
                        readset_bam, alignment_directory)

                job = concat_jobs([
                    mkdir_job,
                    Job([readset_bam], [sample_bam],
                        command="ln -s -f " + target_readset_bam + " " +
                        sample_bam,
                        removable_files=[sample_bam]),
                ],
                                  name="symlink_readset_sample_bam." +
                                  sample.name)
            else:
                # Several readsets: merge them into the sample BAM.
                job = concat_jobs([
                    mkdir_job,
                    picard.merge_sam_files(readset_bams, sample_bam)
                ])
                job.name = "picard_merge_sam_files." + sample.name

            jobs.append(job)

        return jobs
Example #3
0
 def picard_merge_sam_files(self):
     """
     Create one Picard merge job for every sample having more than one readset.

     The readset BAMs `alignment/<sample>/<readset>.sorted.bam` are merged into
     `alignment/<sample>/<sample>.sorted.bam`. Returns the list of created jobs.
     """
     merge_jobs = []
     for sample in self.samples:
         # Single-readset samples need no merge: their symlink has been created at align step
         if len(sample.readsets) <= 1:
             continue

         sample_dir = os.path.join("alignment", sample.name)
         readset_bams = []
         for readset in sample.readsets:
             readset_bams.append(os.path.join(sample_dir, readset.name + ".sorted.bam"))
         merged_bam = os.path.join(sample_dir, sample.name + ".sorted.bam")

         merge_job = picard.merge_sam_files(readset_bams, merged_bam)
         merge_job.name = "picard_merge_sam_files." + sample.name
         merge_jobs.append(merge_job)
     return merge_jobs
Example #4
0
    def picard_merge_sam_files(self):
        """
        BAM readset files are merged into one file per sample. Merge is done using [Picard](http://broadinstitute.github.io/picard/).

        Only samples with more than one readset get a job: their
        `alignment/<sample>/<readset>/Aligned.sortedByCoord.out.bam` files are merged into
        `alignment/<sample>/<sample>.sorted.bam`. Returns the list of created jobs.
        """

        # Single-readset samples are left out, since their symlink has been created at align step
        samples_to_merge = (sample for sample in self.samples if len(sample.readsets) > 1)

        merge_jobs = []
        for sample in samples_to_merge:
            sample_dir = os.path.join("alignment", sample.name)
            merged_bam = os.path.join(sample_dir, sample.name + ".sorted.bam")
            readset_bams = [
                os.path.join(sample_dir, readset.name, "Aligned.sortedByCoord.out.bam")
                for readset in sample.readsets
            ]

            merge_job = picard.merge_sam_files(readset_bams, merged_bam)
            merge_job.name = "picard_merge_sam_files." + sample.name
            merge_jobs.append(merge_job)
        return merge_jobs
Example #5
0
    def picard_merge_sam_files(self):
        """
        BAM readset files are merged into one file per sample. Merge is done using [Picard](http://broadinstitute.github.io/picard/).

        A job is created only for samples with several readsets; it merges each readset's
        `Aligned.sortedByCoord.out.bam` into `alignment/<sample>/<sample>.sorted.bam`.
        Returns the list of created jobs.
        """

        jobs = []
        for sample in self.samples:
            # Nothing to do for a sample with one readset only: symlink has been created at align step
            if not len(sample.readsets) > 1:
                continue

            alignment_directory = os.path.join("alignment", sample.name)
            output = os.path.join(alignment_directory, sample.name + ".sorted.bam")

            inputs = []
            for readset in sample.readsets:
                readset_directory = os.path.join(alignment_directory, readset.name)
                inputs.append(os.path.join(readset_directory, "Aligned.sortedByCoord.out.bam"))

            merge_job = picard.merge_sam_files(inputs, output)
            merge_job.name = "picard_merge_sam_files." + sample.name
            jobs.append(merge_job)
        return jobs
Example #6
0
    def ihec_preprocess_files(self):
        """
        Generate IHEC's files.

        For each sample, two jobs are created in the IHEC output directory
        (`self.output_dirs['ihecA_output_directory']`):

        1. `<sample>.merged.bam` is built from the readset BAMs
           `<alignment dir>/<sample>/<readset>/<readset>.sorted.bam`: a symlink when the sample
           has one readset only, a Picard merge otherwise.
        2. Picard mark duplicates is run on `<sample>.merged.bam` to produce
           `<sample>.merged.mdup.bam` and `<sample>.merged.mdup.metrics`.

        Samples without any readset are skipped. Returns the list of created jobs.
        """
        output_dir = self.output_dirs['ihecA_output_directory']
        jobs = []
        for sample in self.samples:
            # Without this guard, no branch below would assign `job` (the append would fail or
            # re-append the previous sample's job) and duplicates would be marked on a BAM
            # that no job produces.
            if not sample.readsets:
                continue

            alignment_directory = os.path.join(self.output_dirs['alignment_output_directory'], sample.name)
            readset_bams = [os.path.join(alignment_directory, readset.name, readset.name + ".sorted.bam") for readset in sample.readsets]
            sample_merge_bam = os.path.join(output_dir, sample.name + ".merged.bam")
            sample_merge_mdup_bam = os.path.join(output_dir, sample.name + ".merged.mdup.bam")
            sample_merge_mdup_metrics_file = os.path.join(output_dir, sample.name + ".merged.mdup.metrics")

            mkdir_job = Job(command="mkdir -p " + output_dir)

            if len(sample.readsets) == 1:
                # Single readset: no merge needed, symlink the merged BAM to the readset BAM.
                readset_bam = readset_bams[0]
                if os.path.isabs(readset_bam):
                    target_readset_bam = readset_bam
                else:
                    # A relative symlink target is resolved from the directory holding the link
                    target_readset_bam = os.path.relpath(readset_bam, output_dir)

                job = concat_jobs([
                    mkdir_job,
                    Job([readset_bam], [sample_merge_bam], command="ln -s -f " + target_readset_bam + " " + sample_merge_bam, removable_files=[sample_merge_bam]),
                ], name="ihecs_preprocess_symlink." + sample.name)
            else:
                # Several readsets: merge them into the sample BAM.
                job = concat_jobs([
                    mkdir_job,
                    picard.merge_sam_files(readset_bams, sample_merge_bam)
                ])
                job.name = "ihecs_preprocess_merge." + sample.name

            jobs.append(job)

            # TMPDIR is exported in the same job, right before marking duplicates
            tmp_dir = config.param('ihec_preprocess_files', 'tmp_dir')
            job = concat_jobs([
                Job(command="export TMPDIR={tmp_dir}".format(tmp_dir=tmp_dir)),
                picard.mark_duplicates([sample_merge_bam], sample_merge_mdup_bam, sample_merge_mdup_metrics_file)
            ])
            job.name = "ihecs_preprocess_mark_duplicates." + sample.name
            jobs.append(job)

        return jobs
Example #7
0
 def merge_realigned(self):
     """
     Create, for each sample, the Picard job merging its realigned BAM chunks.

     The chunks are the per-sequence BAMs of the first (nb_jobs - 1) entries of
     self.sequence_dictionary plus "others.bam", all read from `alignment/<sample>/realign`;
     the result is `alignment/<sample>/<sample>.realigned.qsorted.bam`.
     No job is created when the configured nb_jobs is 1. Returns the list of created jobs.
     """
     merge_jobs = []
     nb_jobs = config.param('gatk_indel_realigner', 'nb_jobs', type='posint')

     for sample in self.samples:
         sample_dir = os.path.join("alignment", sample.name)
         chunk_dir = os.path.join(sample_dir, "realign")
         output_bam = os.path.join(sample_dir, sample.name + ".realigned.qsorted.bam")

         # if nb_jobs == 1, symlink has been created in indel_realigner and merging is not necessary
         if not nb_jobs > 1:
             continue

         # One BAM per leading sequence, then the catch-all BAM holding everything else
         sequence_count = min(nb_jobs - 1, len(self.sequence_dictionary))
         chunk_bams = []
         for sequence in self.sequence_dictionary[0:sequence_count]:
             chunk_bams.append(os.path.join(chunk_dir, sequence['name'] + ".bam"))
         chunk_bams.append(os.path.join(chunk_dir, "others.bam"))

         merge_job = picard.merge_sam_files(chunk_bams, output_bam)
         merge_job.name = "merge_realigned." + sample.name
         merge_jobs.append(merge_job)
     return merge_jobs
Example #8
0
    def picard_merge_sam_files(self):
        """
        BAM readset files are merged into one file per sample. Merge is done using [Picard](http://broadinstitute.github.io/picard/).

        For each sample, the inputs are the readset BAMs
        `alignment/<sample>/<readset>/<readset>.sorted.filtered.bam` and the output is
        `alignment/<sample>/<sample>.merged.bam`:

        * one readset: the sample BAM is created as a symlink to the readset BAM;
        * several readsets: the readset BAMs are merged with Picard;
        * no readset: the sample is skipped since there is nothing to merge.

        Returns the list of created jobs (at most one per sample).
        """

        jobs = []
        for sample in self.samples:
            # Without this guard, no branch below would assign `job` and the append at the
            # end of the loop would either fail or re-append the previous sample's job.
            if not sample.readsets:
                continue

            alignment_directory = os.path.join("alignment", sample.name)
            readset_bams = [os.path.join(alignment_directory, readset.name, readset.name + ".sorted.filtered.bam") for readset in sample.readsets]
            sample_bam = os.path.join(alignment_directory, sample.name + ".merged.bam")

            mkdir_job = Job(command="mkdir -p " + os.path.dirname(sample_bam))

            if len(sample.readsets) == 1:
                # Single readset: no merge needed, symlink the sample BAM to the readset BAM.
                readset_bam = readset_bams[0]
                if os.path.isabs(readset_bam):
                    target_readset_bam = readset_bam
                else:
                    # A relative symlink target is resolved from the directory holding the link
                    target_readset_bam = os.path.relpath(readset_bam, alignment_directory)

                job = concat_jobs([
                    mkdir_job,
                    Job([readset_bam], [sample_bam], command="ln -s -f " + target_readset_bam + " " + sample_bam, removable_files=[sample_bam]),
                ], name="symlink_readset_sample_bam." + sample.name)
            else:
                # Several readsets: merge them into the sample BAM.
                job = concat_jobs([
                    mkdir_job,
                    picard.merge_sam_files(readset_bams, sample_bam)
                ])
                job.name = "picard_merge_sam_files." + sample.name

            jobs.append(job)

        return jobs
Example #9
0
    def wiggle(self):
        """
        Generate wiggle tracks suitable for multiple browsers.

        For each sample, a bedGraph (`tracks/<sample>/`) and a bigWig (`tracks/bigWig/`) track
        are generated from `alignment/<sample>/<sample>.sorted.mdup.bam`.

        When the configured `strand_info` is not 'fr-unstranded' and the sample is paired-end,
        the BAM is first split into a forward and a reverse BAM (two extra jobs), and one
        track is generated from each of them instead of a single track from the whole BAM.

        Returns the list of created jobs.
        """

        jobs = []

        # Library type per sample: PAIRED_END unless at least one of its readsets is SINGLE_END
        library = {}
        for readset in self.readsets:
            if readset.sample not in library:
                library[readset.sample] = "PAIRED_END"
            if readset.run_type == "SINGLE_END":
                library[readset.sample] = "SINGLE_END"

        for sample in self.samples:
            bam_file_prefix = os.path.join("alignment", sample.name, sample.name + ".sorted.mdup.")
            input_bam = bam_file_prefix + "bam"
            bed_graph_prefix = os.path.join("tracks", sample.name, sample.name)
            big_wig_prefix = os.path.join("tracks", "bigWig", sample.name)

            if (config.param('DEFAULT', 'strand_info') != 'fr-unstranded') and library[sample] == "PAIRED_END":
                input_bam_f1 = bam_file_prefix + "tmp1.forward.bam"
                input_bam_f2 = bam_file_prefix + "tmp2.forward.bam"
                input_bam_r1 = bam_file_prefix + "tmp1.reverse.bam"
                input_bam_r2 = bam_file_prefix + "tmp2.reverse.bam"
                output_bam_f = bam_file_prefix + "forward.bam"
                output_bam_r = bam_file_prefix + "reverse.bam"

                # SAM flags: -F 256 drops secondary alignments;
                # 81 = paired + read reverse + first in pair, 161 = paired + mate reverse + second in pair
                bam_f_job = concat_jobs([
                    samtools.view(input_bam, input_bam_f1, "-bh -F 256 -f 81"),
                    samtools.view(input_bam, input_bam_f2, "-bh -F 256 -f 161"),
                    picard.merge_sam_files([input_bam_f1, input_bam_f2], output_bam_f),
                    Job(command="rm " + input_bam_f1 + " " + input_bam_f2)
                ], name="wiggle." + sample.name + ".forward_strandspec")
                # Remove temporary-then-deleted files from job output files, otherwise job is never up to date
                bam_f_job.output_files.remove(input_bam_f1)
                bam_f_job.output_files.remove(input_bam_f2)

                # SAM flags: 97 = paired + mate reverse + first in pair, 145 = paired + read reverse + second in pair
                bam_r_job = concat_jobs([
                    Job(command="mkdir -p " + os.path.join("tracks", sample.name) + " " + os.path.join("tracks", "bigWig")),
                    samtools.view(input_bam, input_bam_r1, "-bh -F 256 -f 97"),
                    samtools.view(input_bam, input_bam_r2, "-bh -F 256 -f 145"),
                    picard.merge_sam_files([input_bam_r1, input_bam_r2], output_bam_r),
                    Job(command="rm " + input_bam_r1 + " " + input_bam_r2)
                ], name="wiggle." + sample.name + ".reverse_strandspec")
                # Remove temporary-then-deleted files from job output files, otherwise job is never up to date
                bam_r_job.output_files.remove(input_bam_r1)
                bam_r_job.output_files.remove(input_bam_r2)

                jobs.extend([bam_f_job, bam_r_job])

                # Each strand-specific track must be computed from its own strand-specific BAM;
                # reading the unsplit BAM would yield two identical tracks.
                tracks = [
                    [output_bam_f, bed_graph_prefix + ".forward.bedGraph", big_wig_prefix + ".forward.bw"],
                    [output_bam_r, bed_graph_prefix + ".reverse.bedGraph", big_wig_prefix + ".reverse.bw"],
                ]
            else:
                tracks = [[input_bam, bed_graph_prefix + ".bedGraph", big_wig_prefix + ".bw"]]

            for track_bam, bed_graph_output, big_wig_output in tracks:
                # The job is named after the bedGraph file without its extension; the pattern is
                # escaped and anchored so that only the trailing ".bedGraph" is stripped.
                track_name = re.sub(r"\.bedGraph$", "", os.path.basename(bed_graph_output))
                job = concat_jobs([
                    Job(command="mkdir -p " + os.path.join("tracks", sample.name) + " " + os.path.join("tracks", "bigWig"), removable_files=["tracks"]),
                    bedtools.graph(track_bam, bed_graph_output, big_wig_output, library[sample])
                ], name="wiggle." + track_name)
                jobs.append(job)

        return jobs