Example #1
0
    def estimate_ribosomal_rna(self):
        """
        Estimate the ribosomal RNA content of each readset.

        For every readset, the readset BAM file is passed through `bvatools.bam2fq` and streamed into
        [BWA](http://bio-bwa.sourceforge.net/) (algorithm: bwa mem) against the ribosomal reference fasta
        configured in the `bwa_mem_rRNA` section. The BWA output is sorted by coordinate using
        [Picard](http://broadinstitute.github.io/picard/) and the resulting rRNA BAM is handed to
        `tools.py_rrnaBAMcount` together with the configured GTF to produce a per-readset stats file.

        This step takes as input files:

        readset Bam files

        Returns the list of jobs, one per readset.
        """

        jobs = []
        for readset in self.readsets:
            source_bam = os.path.join("alignment", readset.sample.name, readset.name, "Aligned.sortedByCoord.out.bam")
            metrics_dir = os.path.join("metrics", readset.sample.name, readset.name)
            rrna_bam = os.path.join(metrics_dir, readset.name + "rRNA.bam")
            rrna_stats = os.path.join(metrics_dir, readset.name + "rRNA.stats.tsv")

            mkdir_job = Job(command="mkdir -p " + os.path.dirname(source_bam) + " " + metrics_dir)
            to_fastq_job = bvatools.bam2fq(source_bam)

            # Assemble the @RG header field by field; optional fields are left out when unset.
            rg_fields = [
                "'@RG",
                "\tID:" + readset.name,
                "\tSM:" + readset.sample.name
            ]
            if readset.library:
                rg_fields.append("\tLB:" + readset.library)
            if readset.run and readset.lane:
                rg_fields.append("\tPU:run" + readset.run + "_" + readset.lane)
            if config.param('bwa_mem_rRNA', 'sequencing_center', required=False):
                rg_fields.append("\tCN:" + config.param('bwa_mem_rRNA', 'sequencing_center'))
            rg_fields.append("\tPL:Illumina")
            rg_fields.append("'")

            # bwa mem reads the FASTQ stream from stdin; there is no second FASTQ file.
            align_job = bwa.mem(
                "/dev/stdin",
                None,
                read_group="".join(rg_fields),
                ref=config.param('bwa_mem_rRNA', 'ribosomal_fasta'),
                ini_section='bwa_mem_rRNA'
            )
            sort_job = picard.sort_sam(
                "/dev/stdin",
                rrna_bam,
                "coordinate",
                ini_section='picard_sort_sam_rrna'
            )
            alignment_pipe = pipe_jobs([to_fastq_job, align_job, sort_job])

            count_job = tools.py_rrnaBAMcount(
                bam=rrna_bam,
                gtf=config.param('bwa_mem_rRNA', 'gtf'),
                output=rrna_stats,
                typ="transcript"
            )

            job = concat_jobs([mkdir_job, alignment_pipe, count_job], name="bwa_mem_rRNA." + readset.name)

            # The rRNA BAM is registered as removable: only the stats file is the step's lasting output.
            job.removable_files = [rrna_bam]
            jobs.append(job)
        return jobs
Example #2
0
 def bwa_mem_picard_sort_sam(self):
     """
     Build one alignment job per readset: the trimmed FASTQ files are aligned with bwa mem
     and the output is piped to Picard to be sorted by coordinate.

     Paired-end readsets use the "pair1"/"pair2" trimmed FASTQ files; single-end readsets use
     the "single" one. The sorted BAM is written to alignment/<sample>/<readset>.sorted.bam.
     When a sample has exactly one readset, two symlink commands are appended to the job so
     that the sample-level BAM and index paths point to the readset files.

     Returns the list of jobs, one per readset.

     Raises Exception if a readset run type is neither PAIRED_END nor SINGLE_END.
     """
     jobs = []
     for readset in self.readsets:
         trim_file_prefix = os.path.join("trim", readset.sample.name, readset.name + ".trim.")
         alignment_directory = os.path.join("alignment", readset.sample.name)
         readset_bam = os.path.join(alignment_directory, readset.name + ".sorted.bam")

         if readset.run_type == "PAIRED_END":
             fastq1 = trim_file_prefix + "pair1.fastq.gz"
             fastq2 = trim_file_prefix + "pair2.fastq.gz"
         elif readset.run_type == "SINGLE_END":
             fastq1 = trim_file_prefix + "single.fastq.gz"
             fastq2 = None
         else:
             raise Exception("Error: run type \"" + readset.run_type +
             "\" is invalid for readset \"" + readset.name + "\" (should be PAIRED_END or SINGLE_END)!")

         job = concat_jobs([
             Job(command="mkdir -p " + alignment_directory),
             pipe_jobs([
                 bwa.mem(
                     fastq1,
                     fastq2,
                     read_group="'@RG" + \
                         "\tID:" + readset.name + \
                         "\tSM:" + readset.sample.name + \
                         "\tLB:" + readset.library + \
                         "\tPU:run" + readset.run + "_" + readset.lane + \
                         "\tCN:" + config.param('bwa_mem', 'sequencing_center') + \
                         "\tPL:Illumina" + \
                         "'"
                 ),
                 picard.sort_sam(
                     "/dev/stdin",
                     readset_bam,
                     "coordinate"
                 )
             ])
         ], name="bwa_mem_picard_sort_sam." + readset.name)

         # If this readset is unique for this sample, further BAM merging is not necessary.
         # Thus, create a sample BAM symlink to the readset BAM, along with its index.
         if len(readset.sample.readsets) == 1:
             # Raw strings keep "\." a regex escape instead of an invalid string escape.
             readset_index = re.sub(r"\.bam$", ".bai", readset_bam)
             sample_bam = os.path.join(alignment_directory, readset.sample.name + ".sorted.bam")
             sample_index = re.sub(r"\.bam$", ".bai", sample_bam)
             job = concat_jobs([
                 job,
                 Job([readset_bam], [sample_bam], command="ln -s -f " + os.path.relpath(readset_bam, os.path.dirname(sample_bam)) + " " + sample_bam),
                 # NOTE(review): the index symlink job declares the readset BAM (not the index) as its input;
                 # presumably the index is produced together with the BAM - confirm.
                 Job([readset_bam], [sample_index], command="ln -s -f " + os.path.relpath(readset_index, os.path.dirname(sample_index)) + " " + sample_index)
             ], name=job.name)

         jobs.append(job)
     return jobs
Example #3
0
    def picard_sort_sam(self):
        """
        Reorder each sample's alignment file by query name using [Picard](http://broadinstitute.github.io/picard/).
        The QueryName-sorted bam files will be used to determine raw read counts.

        Returns the list of jobs, one per sample, each named "picard_sort_sam.<sample name>".
        """

        sort_jobs = []
        for sample in self.samples:
            # Input and output BAM files share the same alignment/<sample>/<sample> prefix.
            bam_prefix = os.path.join("alignment", sample.name, sample.name)
            coordinate_sorted_bam = bam_prefix + ".sorted.bam"
            queryname_sorted_bam = bam_prefix + ".QueryNameSorted.bam"

            sort_job = picard.sort_sam(coordinate_sorted_bam, queryname_sorted_bam, "queryname")
            sort_job.name = "picard_sort_sam." + sample.name
            sort_jobs.append(sort_job)
        return sort_jobs
Example #4
0
    def picard_sort_sam(self):
        """
        Sort every sample's alignment file by query name with [Picard](http://broadinstitute.github.io/picard/).
        The QueryName-sorted bam files will be used to determine raw read counts.

        Returns one job per sample, in the order of `self.samples`.
        """

        def queryname_sort_job(sample):
            # Read alignment/<sample>/<sample>.sorted.bam, write the .QueryNameSorted.bam next to it.
            prefix = os.path.join("alignment", sample.name, sample.name)
            sort_job = picard.sort_sam(
                prefix + ".sorted.bam",
                prefix + ".QueryNameSorted.bam",
                "queryname"
            )
            sort_job.name = "picard_sort_sam." + sample.name
            return sort_job

        return [queryname_sort_job(sample) for sample in self.samples]
Example #5
0
    def get_alignment_job(self, readset):
        """
        Build the alignment job of a readset.

        The job creates the output directory, aligns the readset FASTQ files with bwa mem
        against the readset's aligner reference index, and pipes the result to Picard to be
        sorted by coordinate into "<readset.bam>.bam".

        Returns the single concatenated job, named after the readset name, run and lane.
        """
        sorted_bam = readset.bam + ".bam"

        mkdir_job = Job(command="mkdir -p " + os.path.dirname(sorted_bam))
        align_job = bwa.mem(
            readset.fastq1,
            readset.fastq2,
            read_group=RunProcessingAligner.get_rg_tag(readset, 'bwa_mem'),
            ref=readset.aligner_reference_index
        )
        # Picard reads the alignment stream produced by bwa mem from stdin.
        sort_job = picard.sort_sam("/dev/stdin", sorted_bam, "coordinate")
        alignment_pipe = pipe_jobs([align_job, sort_job])

        job_name = "bwa_mem_picard_sort_sam." + readset.name + "." + readset.run + "." + readset.lane
        return concat_jobs([mkdir_job, alignment_pipe], name=job_name)
    def _estimate_ribosomal_rna(readset):
        """
        Build the job estimating the ribosomal RNA content of a readset.

        The readset BAM file is passed through `bvatools.bam2fq` and streamed into
        [BWA](http://bio-bwa.sourceforge.net/) (algorithm: bwa mem) against the second annotation
        file of the readset. The BWA output is sorted by coordinate using
        [Picard](http://broadinstitute.github.io/picard/), then given to `tools.py_rrnaBAMcount`
        together with the first annotation file (passed as GTF).

        Returns a list holding that single job, or an empty list when the readset does not have
        two annotation files that both exist on disk.
        """
        # NOTE(review): no `self` parameter - presumably declared as a static method; the decorator is not visible here.

        # Nothing to do unless both annotation files are available.
        if len(readset.annotation_files) <= 1 \
                or not os.path.isfile(readset.annotation_files[0]) \
                or not os.path.isfile(readset.annotation_files[1]):
            return []

        source_bam = readset.bam + ".bam"
        rrna_bam = readset.bam + ".rRNA.bam"

        to_fastq_job = bvatools.bam2fq(source_bam)
        align_job = bwa.mem(
            "/dev/stdin",
            None,
            read_group=RunProcessingAligner.get_rg_tag(readset, 'bwa_mem_rRNA'),
            ref=readset.annotation_files[1],
            ini_section='bwa_mem_rRNA'
        )
        sort_job = picard.sort_sam(
            "/dev/stdin",
            rrna_bam,
            "coordinate",
            ini_section='picard_sort_sam_rrna'
        )
        alignment_pipe = pipe_jobs([to_fastq_job, align_job, sort_job])

        count_job = tools.py_rrnaBAMcount(
            bam=rrna_bam,
            gtf=readset.annotation_files[0],
            output=readset.bam + ".metrics.rRNA.tsv",
            typ="transcript"
        )

        job = concat_jobs(
            [alignment_pipe, count_job],
            name="bwa_mem_rRNA." + readset.name + ".rRNA" + "." + readset.run + "." + readset.lane
        )
        # The rRNA BAM is registered as removable: only the metrics file is the lasting output.
        job.removable_files = [rrna_bam]

        return [job]
Example #7
0
    def _estimate_ribosomal_rna(readset):
        """
        Align the reads of a readset on the rRNA reference and count the mapped reads.

        The readset BAM goes through `bvatools.bam2fq`, then
        [BWA](http://bio-bwa.sourceforge.net/) (algorithm: bwa mem) with the readset's second
        annotation file as reference, then a coordinate sort using
        [Picard](http://broadinstitute.github.io/picard/). `tools.py_rrnaBAMcount` finally
        receives the sorted rRNA BAM and the first annotation file (passed as GTF).

        Returns a one-job list, or an empty list if fewer than two annotation files are set or
        either of the first two is not an existing file.
        """
        # NOTE(review): no `self` parameter - presumably a static method; the decorator is not visible here.

        rrna_jobs = []

        has_rrna_references = (
            len(readset.annotation_files) > 1
            and os.path.isfile(readset.annotation_files[0])
            and os.path.isfile(readset.annotation_files[1])
        )
        if has_rrna_references:
            input_bam = readset.bam + ".bam"
            rrna_bam = readset.bam + ".rRNA.bam"

            # Pipeline stages, in streaming order: BAM -> FASTQ -> bwa mem -> coordinate sort.
            pipeline_stages = [
                bvatools.bam2fq(input_bam),
                bwa.mem(
                    "/dev/stdin",
                    None,
                    read_group=RunProcessingAligner.get_rg_tag(readset, 'bwa_mem_rRNA'),
                    ref=readset.annotation_files[1],
                    ini_section='bwa_mem_rRNA'
                ),
                picard.sort_sam(
                    "/dev/stdin",
                    rrna_bam,
                    "coordinate",
                    ini_section='picard_sort_sam_rrna'
                ),
            ]
            steps = [pipe_jobs(pipeline_stages)]
            steps.append(
                tools.py_rrnaBAMcount(
                    bam=rrna_bam,
                    gtf=readset.annotation_files[0],
                    output=readset.bam + ".metrics.rRNA.tsv",
                    typ="transcript"
                )
            )

            rrna_job = concat_jobs(
                steps,
                name="bwa_mem_rRNA." + readset.name + ".rRNA" + "." + readset.run + "." + readset.lane
            )
            rrna_job.removable_files = [rrna_bam]
            rrna_jobs.append(rrna_job)

        return rrna_jobs
    def get_alignment_jobs(self, readset):
        """
        Build the alignment jobs of a readset.

        A single job is produced: it creates the output directory, aligns the readset FASTQ
        files with bwa mem against the readset's aligner reference index, and pipes the result
        to Picard to be sorted by coordinate into "<readset.bam>.bam".

        Returns a list containing that job, named after the readset name, run and lane.
        """
        sorted_bam = readset.bam + ".bam"

        mkdir_job = Job(command="mkdir -p " + os.path.dirname(sorted_bam))
        align_job = bwa.mem(
            readset.fastq1,
            readset.fastq2,
            read_group=RunProcessingAligner.get_rg_tag(readset, 'bwa_mem'),
            ref=readset.aligner_reference_index
        )
        # Picard reads the alignment stream produced by bwa mem from stdin.
        sort_job = picard.sort_sam("/dev/stdin", sorted_bam, "coordinate")
        alignment_pipe = pipe_jobs([align_job, sort_job])

        job_name = "bwa_mem_picard_sort_sam." + readset.name + "_" + readset.run + "_" + readset.lane
        return [concat_jobs([mkdir_job, alignment_pipe], name=job_name)]
Example #9
0
 def map_on_scaffolds(self):
     """
     Build the jobs aligning each sample's extracted reads on the sample's Ray scaffolds.

     For every sample, five jobs are created, in this order: ORPHAN (two FASTQ files),
     OEA1, OEA2, sclip1 and sclip2 (one FASTQ file each). Each job makes sure the coverage
     directory exists, aligns the reads with bwa mem using the sample's Scaffolds.fasta as
     reference, and pipes the output to Picard to be sorted by coordinate into the
     coverage directory.

     Returns the list of jobs (five per sample).
     """
     # One row per job:
     # (job name label, FASTQ suffixes given to bwa mem, read group tag, output BAM name)
     read_categories = [
         ("ORPHAN", ("ORPHAN.1.fastq.gz", "ORPHAN.2.fastq.gz"), "orphan", "ORPHAN.bam"),
         ("OEA1", ("OEAUNMAP.1.equal.fastq.gz",), "scoea1", "OEAUNMAP.1.bam"),
         ("OEA2", ("OEAUNMAP.2.equal.fastq.gz",), "scoea2", "OEAUNMAP.2.bam"),
         ("sclip1", ("sclip.1.fastq.gz",), "sclip1", "sclip.1.bam"),
         ("sclip2", ("sclip.2.fastq.gz",), "sclip2", "sclip.2.bam"),
     ]

     jobs = []

     for sample in self.samples:
         ray_directory = os.path.join("scaffolds", sample.name, "ray", "ray" + config.param('ray', 'kmer'))
         cov_directory = os.path.join(ray_directory, "cov")
         scaffolds_file = os.path.join(ray_directory, "Scaffolds.fasta")
         extract_file_prefix = os.path.join("extract", sample.name, sample.name + ".")

         # Values shared by the five jobs of this sample.
         sequencing_center = config.param('bwa_mem', 'sequencing_center')
         mkdir_command = "if [ ! -d " + cov_directory + " ]; then mkdir -p " + cov_directory + "; fi"

         for label, fastq_suffixes, tag, bam_name in read_categories:
             # Every field is joined with an explicit "+", so no field can be merged
             # with its neighbour by accidental adjacent-literal concatenation.
             read_group = (
                 "'@RG" +
                 "\tID:" + sample.name + "_ray_" + tag +
                 "\tSM:" + sample.name +
                 "\tLB:" + sample.name +
                 "\tPU:" + tag +
                 "\tCN:" + sequencing_center +
                 "\tPL:Illumina" +
                 "'"
             )
             # One or two positional FASTQ arguments, depending on the read category.
             fastq_files = [extract_file_prefix + suffix for suffix in fastq_suffixes]

             job = concat_jobs([
                 Job(command=mkdir_command),
                 pipe_jobs([
                     bwa.mem(
                         *fastq_files,
                         read_group=read_group,
                         ref=scaffolds_file
                     ),
                     picard.sort_sam(
                         "/dev/stdin",
                         os.path.join(cov_directory, bam_name),
                         "coordinate"
                     )
                 ])
             ], name="bwa_mem_picard_sort_sam_" + label + "_" + sample.name)
             jobs.append(job)

     return jobs