Esempio n. 1
0
    def execute(self, inBam, refFasta, outBam, options=None, min_qual=0, JVMmemory=None):    # pylint: disable=W0221
        ''' Execute Novoalign on BAM inputs and outputs.
            If the BAM contains multiple read groups, break up
            the input and perform Novoalign separately on each one
            (because Novoalign mangles read groups).
            Use Picard to sort and index the output BAM.
            If min_qual>0, use Samtools to filter on mapping quality.

            inBam, refFasta, options, min_qual and JVMmemory are forwarded
            unchanged to align_one_rg_bam; options defaults to ["-r", "Random"].
            Raises InvalidBamHeaderError if inBam declares no read groups.
        '''
        options = options or ["-r", "Random"]

        samtools = tools.samtools.SamtoolsTool()

        # Fetch the read groups exactly once; the same mapping is handed to every
        # per-read-group call below so the header is not parsed again.
        rgs = samtools.getReadGroups(inBam)

        if len(rgs) == 0:
            # Can't do this
            raise InvalidBamHeaderError("{} lacks read groups".format(inBam))

        elif len(rgs) == 1:
            # Only one RG, keep it simple
            self.align_one_rg_bam(inBam, refFasta, outBam, rgs=rgs, options=options, min_qual=min_qual, JVMmemory=JVMmemory)

        else:
            # Multiple RGs, align one at a time and merge
            align_bams = []
            for rg in rgs:
                tmp_bam = util.file.mkstempfname('.{}.bam'.format(rg))
                self.align_one_rg_bam(
                    inBam,
                    refFasta,
                    tmp_bam,
                    rgid=rg,
                    rgs=rgs,
                    options=options,
                    min_qual=min_qual,
                    JVMmemory=JVMmemory
                )
                if os.path.getsize(tmp_bam) > 0:
                    align_bams.append(tmp_bam)
                else:
                    # Nothing was written for this read group; the zero-byte
                    # placeholder is excluded from the merge, so remove it here
                    # instead of leaving it behind.
                    os.unlink(tmp_bam)

            # Merge BAMs, sort, and index
            # NOTE(review): if every read group came back empty, align_bams is []
            # and is still handed to MergeSamFiles as before -- confirm whether
            # that case needs explicit handling.
            tools.picard.MergeSamFilesTool().execute(
                align_bams,
                outBam,
                picardOptions=['SORT_ORDER=coordinate', 'USE_THREADING=true', 'CREATE_INDEX=true'],
                JVMmemory=JVMmemory
            )
            for bam in align_bams:
                os.unlink(bam)
Esempio n. 2
0
    def align_mem_bam(self,
                      inBam,
                      refDb,
                      outBam,
                      options=None,
                      min_qual=30,
                      threads=None,
                      JVMmemory=None):
        """
            Align all read groups of inBam against refDb and write the result to outBam.

            A BAM with a single read group is passed straight to align_mem_one_rg.
            A BAM with several read groups is aligned one read group at a time into
            temporary BAMs; the non-empty ones are merged, coordinate-sorted and
            indexed with Picard MergeSamFiles (JVMmemory is only used for that merge).

            Raises InvalidBamHeaderError if inBam declares no read groups.
        """
        options = options or []

        samtools = tools.samtools.SamtoolsTool()
        read_group_ids = list(samtools.getReadGroups(inBam).keys())

        # Guard: there is nothing to align without read groups
        if not read_group_ids:
            raise InvalidBamHeaderError("{} lacks read groups".format(inBam))

        # keyword arguments common to every per-read-group call
        shared_kwargs = dict(options=options, min_qual=min_qual, threads=threads)

        # Single read group: align directly into the final output
        if len(read_group_ids) == 1:
            self.align_mem_one_rg(inBam, refDb, outBam, **shared_kwargs)
            return

        # Several read groups: one alignment per read group, keeping only the
        # outputs that actually contain data
        nonempty_bams = []
        for rg_id in read_group_ids:
            rg_bam = util.file.mkstempfname('.{}.bam'.format(rg_id))
            self.align_mem_one_rg(inBam, refDb, rg_bam, rgid=rg_id, **shared_kwargs)
            if os.path.getsize(rg_bam) > 0:
                nonempty_bams.append(rg_bam)

        # Merge, sort and index in a single Picard pass
        merge_options = ['SORT_ORDER=coordinate', 'USE_THREADING=true', 'CREATE_INDEX=true']
        tools.picard.MergeSamFilesTool().execute(
            nonempty_bams, outBam, picardOptions=merge_options, JVMmemory=JVMmemory)

        # the merged inputs are no longer needed
        for rg_bam in nonempty_bams:
            os.unlink(rg_bam)
Esempio n. 3
0
    def align_bam(self, inBam, refDb, outBam, options=None,
                      threads=None, JVMmemory=None):
        """
            Align all read groups of inBam against refDb and write the result to outBam.

            One read group: delegate directly to align_one_rg.
            Several read groups: align each into its own temporary BAM, then merge
            the non-empty ones with Picard MergeSamFiles (sorted and indexed).
            If no read group produced any alignment, outBam is produced by sorting
            a header-only dump of inBam.

            Raises InvalidBamHeaderError if inBam declares no read groups.
        """
        options = options or []

        samtools = tools.samtools.SamtoolsTool()
        threads = util.misc.sanitize_thread_count(threads)

        rg_ids = list(samtools.getReadGroups(inBam).keys())

        if not rg_ids:
            raise InvalidBamHeaderError("{} lacks read groups".format(inBam))

        if len(rg_ids) == 1:
            # nothing to split or merge
            self.align_one_rg(inBam, refDb, outBam, options=options, threads=threads)
            return

        # align read groups one by one, remembering which outputs hold data
        kept_bams = []
        for rg_id in rg_ids:
            rg_bam = util.file.mkstempfname('.{}.bam'.format(rg_id))
            self.align_one_rg(inBam, refDb, rg_bam, rgid=rg_id, options=options, threads=threads)
            if samtools.isEmpty(rg_bam):
                log.warning("No alignment output for RG %s in file %s against %s", rg_id, inBam, refDb)
                continue
            kept_bams.append(rg_bam)

        if not kept_bams:
            # no alignments at all: emit an output built from the input header only
            with util.file.tempfname('.empty.sam') as empty_sam:
                samtools.dumpHeader(inBam, empty_sam)
                samtools.sort(empty_sam, outBam)
            return

        # merge, sort and index the per-read-group outputs, then discard them
        tools.picard.MergeSamFilesTool().execute(
            kept_bams,
            outBam,
            picardOptions=['SORT_ORDER=coordinate', 'USE_THREADING=true', 'CREATE_INDEX=true'],
            JVMmemory=JVMmemory
        )
        for rg_bam in kept_bams:
            os.unlink(rg_bam)
Esempio n. 4
0
    def align_mem_bam(self, inBam, refDb, outBam, options=None, min_qual=30, threads=None, JVMmemory=None):
        """
            Align inBam to refDb read group by read group and write outBam.

            With exactly one read group the alignment goes directly to outBam.
            With more than one, each read group is aligned to a temporary BAM and
            the non-empty results are merged, sorted and indexed by Picard
            MergeSamFiles, which is the only step that receives JVMmemory.

            Raises InvalidBamHeaderError if inBam declares no read groups.
        """
        options = options or []

        samtools = tools.samtools.SamtoolsTool()

        def run_one(destination, **extra):
            # single place that spells out the arguments shared by every alignment
            self.align_mem_one_rg(
                inBam, refDb, destination,
                options=options, min_qual=min_qual, threads=threads, **extra)

        rg_names = list(samtools.getReadGroups(inBam).keys())
        rg_count = len(rg_names)

        if rg_count == 0:
            raise InvalidBamHeaderError("{} lacks read groups".format(inBam))

        if rg_count == 1:
            run_one(outBam)
        else:
            to_merge = []
            for rg_name in rg_names:
                part_bam = util.file.mkstempfname('.{}.bam'.format(rg_name))
                run_one(part_bam, rgid=rg_name)
                # zero-byte outputs are left out of the merge
                if os.path.getsize(part_bam) > 0:
                    to_merge.append(part_bam)

            merger = tools.picard.MergeSamFilesTool()
            merger.execute(
                to_merge,
                outBam,
                picardOptions=['SORT_ORDER=coordinate', 'USE_THREADING=true', 'CREATE_INDEX=true'],
                JVMmemory=JVMmemory
            )
            for part_bam in to_merge:
                os.unlink(part_bam)
Esempio n. 5
0
    def align_one_rg_bam(self, inBam, refFasta, outBam, rgid=None, rgs=None, options=None, min_qual=0, JVMmemory=None):
        ''' Execute Novoalign on one read group of a BAM input and write a BAM output.
            If inBam holds exactly one read group it is aligned as-is. If it holds
            several, rgid must name the one to align; the input is then subset to
            that read group and its header is reduced to that single @RG line.
            Use Picard to sort and index the output BAM.
            If min_qual>0, use Samtools to filter on mapping quality.

            rgs may carry the read groups of inBam (as returned by
            SamtoolsTool.getReadGroups) so the header is not parsed again.
            Returns None. When the selected read group has no reads the method
            returns early without writing anything to outBam.
            Raises InvalidBamHeaderError if inBam has no read groups, has several
            while rgid is not given, or does not contain rgid.
        '''
        options = options or ["-r", "Random"]

        samtools = tools.samtools.SamtoolsTool()

        # Require exactly one RG (either in the file, or selected via rgid)
        rgs = rgs if rgs is not None else samtools.getReadGroups(inBam)
        if len(rgs) == 0:
            raise InvalidBamHeaderError("{} lacks read groups".format(inBam))
        elif len(rgs) == 1:
            if not rgid:
                rgid = list(rgs.keys())[0]
        elif not rgid:
            raise InvalidBamHeaderError("{} has {} read groups, but we require exactly one".format(inBam, len(rgs)))
        if rgid not in rgs:
            raise InvalidBamHeaderError("{} has read groups, but not {}".format(inBam, rgid))

        # Strip inBam to just one RG (if necessary)
        remove_input = False    # True only when one_rg_inBam is a temp file created here
        if len(rgs) == 1:
            one_rg_inBam = inBam
        else:
            # strip inBam to one read group
            tmp_bam = util.file.mkstempfname('.onebam.bam')
            samtools.view(['-b', '-r', rgid], inBam, tmp_bam)
            # special exit if this file is empty
            if samtools.count(tmp_bam) == 0:
                # do not leave the empty subset behind
                os.unlink(tmp_bam)
                return

            # simplify BAM header otherwise Novoalign gets confused
            one_rg_inBam = util.file.mkstempfname('.{}.in.bam'.format(rgid))
            headerFile = util.file.mkstempfname('.{}.header.txt'.format(rgid))
            with open(headerFile, 'wt') as outf:
                for row in samtools.getHeader(inBam):
                    if len(row) > 0 and row[0] == '@RG':
                        if rgid != list(x[3:] for x in row if x.startswith('ID:'))[0]:
                            # skip all read groups that are not rgid
                            continue
                    outf.write('\t'.join(row) + '\n')
            samtools.reheader(tmp_bam, headerFile, one_rg_inBam)
            os.unlink(tmp_bam)
            os.unlink(headerFile)
            remove_input = True

        # Novoalign
        tmp_sam = util.file.mkstempfname('.novoalign.sam')
        cmd = [self.install_and_get_path(), '-f', one_rg_inBam] + list(map(str, options))
        cmd = cmd + ['-F', 'BAM', '-d', self._fasta_to_idx_name(refFasta), '-o', 'SAM']
        _log.debug(' '.join(cmd))
        with open(tmp_sam, 'wt') as outf:
            util.misc.run_and_save(cmd, outf=outf)

        # The single-RG copy of the input was only needed by Novoalign; the
        # caller's own inBam (remove_input False) is never touched.
        if remove_input:
            os.unlink(one_rg_inBam)

        # Samtools filter (optional)
        if min_qual:
            tmp_bam2 = util.file.mkstempfname('.filtered.bam')
            cmd = [samtools.install_and_get_path(), 'view', '-b', '-S', '-1', '-q', str(min_qual), tmp_sam]
            _log.debug('%s > %s', ' '.join(cmd), tmp_bam2)
            with open(tmp_bam2, 'wb') as outf:
                util.misc.run_and_save(cmd, outf=outf)
            os.unlink(tmp_sam)
            # from here on tmp_sam names the filtered BAM
            tmp_sam = tmp_bam2

        # Picard SortSam
        sorter = tools.picard.SortSamTool()
        sorter.execute(
            tmp_sam,
            outBam,
            sort_order='coordinate',
            picardOptions=['CREATE_INDEX=true', 'VALIDATION_STRINGENCY=SILENT'],
            JVMmemory=JVMmemory
        )

        # the unsorted intermediate has been consumed by the sort
        os.unlink(tmp_sam)
Esempio n. 6
0
    def align_mem_bam(self,
                      inBam,
                      refDb,
                      outBam,
                      options=None,
                      min_score_to_filter=None,
                      threads=None,
                      JVMmemory=None,
                      invert_filter=False,
                      should_index=True):
        """
            Align all read groups of inBam against refDb and write the result to outBam.

            One read group: delegate directly to align_mem_one_rg.
            Several read groups: submit one align_mem_one_rg call per read group to
            a thread pool (currently limited to a single worker), collect the
            non-empty outputs and merge them with Picard MergeSamFiles. The index
            is only requested from the merge when should_index is true. If no read
            group produced output, outBam is created via util.file.touch.

            Raises InvalidBamHeaderError if inBam declares no read groups.
        """
        options = options or []

        samtools = tools.samtools.SamtoolsTool()
        threads = util.misc.sanitize_thread_count(threads)

        rg_ids = list(samtools.getReadGroups(inBam).keys())

        if not rg_ids:
            raise InvalidBamHeaderError("{} lacks read groups".format(inBam))

        if len(rg_ids) == 1:
            # nothing to split or merge
            self.align_mem_one_rg(
                inBam, refDb, outBam,
                options=options,
                min_score_to_filter=min_score_to_filter,
                threads=threads,
                invert_filter=invert_filter)
            return

        # threads handed to each per-read-group alignment
        per_rg_share = min(max(threads / len(rg_ids), 1), threads)
        threads_for_chunk = int(round(per_rg_share, 0)) + 1

        # worker count limited to 1 for now to reduce in-memory index size resulting
        # from running multiple copies of bwa in parallel
        workers = 1

        mergeable = []
        with concurrent.futures.ThreadPoolExecutor(max_workers=workers) as executor:
            pending = []
            for rg_id in rg_ids:
                rg_bam = util.file.mkstempfname('.{}.bam'.format(rg_id))
                job = executor.submit(
                    self.align_mem_one_rg,
                    inBam,
                    refDb,
                    rg_bam,
                    rgid=rg_id,
                    options=options,
                    min_score_to_filter=min_score_to_filter,
                    threads=threads_for_chunk,
                    invert_filter=invert_filter)
                pending.append(job)

            for finished in concurrent.futures.as_completed(pending):
                outcome = finished.result()
                if not outcome:
                    # the worker returned nothing for this read group
                    continue
                rg, aln_bam = outcome
                if os.path.getsize(aln_bam) > 0:
                    mergeable.append(aln_bam)
                else:
                    log.warning(
                        "No alignment output for RG %s in file %s against %s",
                        rg, inBam, refDb)

        if not mergeable:
            util.file.touch(outBam)
            return

        # Merge BAMs, sort, and (optionally) index
        picardOptions = ['SORT_ORDER=coordinate', 'USE_THREADING=true']
        if should_index:
            picardOptions.append('CREATE_INDEX=true')
        tools.picard.MergeSamFilesTool().execute(
            mergeable, outBam, picardOptions=picardOptions, JVMmemory=JVMmemory)

        for aln_bam in mergeable:
            os.unlink(aln_bam)
Esempio n. 7
0
    def align_mem_one_rg(self,
                         inBam,
                         refDb,
                         outBam,
                         rgid=None,
                         options=None,
                         min_score_to_filter=None,
                         threads=None,
                         JVMmemory=None,
                         invert_filter=False,
                         should_index=True):
        """
            Performs an alignment of one read group in a bam file to a reference fasta file

            If inBam holds several read groups, rgid selects the one to align: the
            input is subset to that read group and re-headered so that only its
            @RG line remains. The @RG line is passed to self.mem via '-R' so the
            aligner writes the read group itself.

            Returns a (rgid, outBam) tuple, or None when the selected read group
            contains no reads (nothing is aligned in that case).
            Raises InvalidBamHeaderError if inBam has no read groups, has several
            while rgid is not given, or does not contain rgid.
            JVMmemory is accepted but not used by this method.

            TODO: With the addition of a third aligner to viral-ngs, the functionality
            common to this method and to the comparable method in the Novoalign wrapper should
            be broken out as an "aligner" superclass, capable of aligning bam or fastq files with an arbitrary
            aligner, while preserving read groups. 
        """
        options = options or []

        samtools = tools.samtools.SamtoolsTool()

        # Require exactly one RG (either in the file, or selected via rgid)
        rgs = samtools.getReadGroups(inBam)
        if len(rgs) == 0:
            raise InvalidBamHeaderError("{} lacks read groups".format(inBam))
        elif len(rgs) == 1:
            if not rgid:
                rgid = list(rgs.keys())[0]
        elif not rgid:
            raise InvalidBamHeaderError(
                "{} has {} read groups, but we require exactly one".format(
                    inBam, len(rgs)))
        if rgid not in rgs:
            raise InvalidBamHeaderError(
                "{} has read groups, but not {}".format(inBam, rgid))

        headerFile = util.file.mkstempfname('.{}.header.txt'.format(rgid))
        # Strip inBam to just one RG (if necessary)
        removeInput = False    # True only when one_rg_inBam is a temp file created here
        if len(rgs) == 1:
            one_rg_inBam = inBam
            samtools.dumpHeader(one_rg_inBam, headerFile)
        else:
            # strip inBam to one read group
            with util.file.tempfname('.onebam.bam') as tmp_bam:
                samtools.view(['-b', '-r', rgid], inBam, tmp_bam)
                # special exit if this file is empty
                if samtools.count(tmp_bam) == 0:
                    log.warning("No reads present for RG %s in file: %s", rgid,
                                inBam)
                    return
                # simplify BAM header otherwise Novoalign gets confused
                one_rg_inBam = util.file.mkstempfname(
                    '.{}.in.bam'.format(rgid))
                removeInput = True

                with open(headerFile, 'wt') as outf:
                    for row in samtools.getHeader(inBam):
                        if len(row) > 0 and row[0] == '@RG':
                            if rgid != list(x[3:] for x in row
                                            if x.startswith('ID:'))[0]:
                                # skip all read groups that are not rgid
                                continue
                        outf.write('\t'.join(row) + '\n')
                samtools.reheader(tmp_bam, headerFile, one_rg_inBam)

        # perform actual alignment

        # get the read group line to give to BWA
        readgroup_line = ""
        with open(headerFile) as inf:
            for line in inf:
                if line.startswith("@RG"):
                    readgroup_line = line
        # the header dump was only needed to extract the @RG line
        os.unlink(headerFile)

        assert len(readgroup_line) > 0

        # rather than reheader the alignment bam file later so it has the readgroup information
        # from the original bam file, we'll pass the RG line to bwa to write out
        try:
            self.mem(
                one_rg_inBam,
                refDb,
                outBam,
                options=options +
                ['-R', readgroup_line.rstrip("\r\n").replace('\t', '\\t')],
                min_score_to_filter=min_score_to_filter,
                threads=threads,
                invert_filter=invert_filter,
                should_index=should_index)
        finally:
            # if there was more than one RG in the input, we had to create a temporary file with the
            # one RG specified and we can safely delete it; if there was only one RG in the input,
            # we used it directly and must not delete it. This has to happen before returning --
            # placed after the return statement it would never run.
            if removeInput:
                os.unlink(one_rg_inBam)

        return (rgid, outBam)
Esempio n. 8
0
    def align_mem_one_rg(self, inBam, refDb, outBam, rgid=None, options=None,
                         min_score_to_filter=None, threads=None, JVMmemory=None):
        """
            Performs an alignment of one read group in a bam file to a reference fasta file

            If inBam holds several read groups, rgid selects the one to align: the
            input is subset to that read group and re-headered so that only its
            @RG line remains. The @RG line is passed to self.mem via '-R'. The
            aligned reads are coordinate-sorted and indexed into outBam with Picard
            SortSam; if the alignment contains no reads, outBam is created via
            util.file.touch instead.

            Returns None. When the selected read group contains no reads the method
            returns early without writing outBam.
            Raises InvalidBamHeaderError if inBam has no read groups, has several
            while rgid is not given, or does not contain rgid.

            TODO: With the addition of a third aligner to viral-ngs, the functionality
            common to this method and to the comparable method in the Novoalign wrapper should
            be broken out as an "aligner" superclass, capable of aligning bam or fastq files with an arbitrary
            aligner, while preserving read groups. 
        """
        options = options or []

        samtools = tools.samtools.SamtoolsTool()

        # Require exactly one RG (either in the file, or selected via rgid)
        rgs = samtools.getReadGroups(inBam)
        if len(rgs) == 0:
            raise InvalidBamHeaderError("{} lacks read groups".format(inBam))
        elif len(rgs) == 1:
            if not rgid:
                rgid = list(rgs.keys())[0]
        elif not rgid:
            raise InvalidBamHeaderError("{} has {} read groups, but we require exactly one".format(inBam, len(rgs)))
        if rgid not in rgs:
            raise InvalidBamHeaderError("{} has read groups, but not {}".format(inBam, rgid))

        headerFile = util.file.mkstempfname('.{}.header.txt'.format(rgid))
        # Strip inBam to just one RG (if necessary)
        removeInput = False    # True only when one_rg_inBam is a temp file created here
        if len(rgs) == 1:
            one_rg_inBam = inBam
            samtools.dumpHeader(one_rg_inBam, headerFile)
        else:
            # strip inBam to one read group
            tmp_bam = util.file.mkstempfname('.onebam.bam')
            samtools.view(['-b', '-r', rgid], inBam, tmp_bam)
            # special exit if this file is empty
            if samtools.count(tmp_bam) == 0:
                log.warning("No reads present for RG %s in file: %s", rgid, inBam)
                # do not leave the empty subset behind
                os.unlink(tmp_bam)
                return
            # simplify BAM header otherwise Novoalign gets confused
            one_rg_inBam = util.file.mkstempfname('.{}.in.bam'.format(rgid))
            removeInput = True

            with open(headerFile, 'wt') as outf:
                for row in samtools.getHeader(inBam):
                    if len(row) > 0 and row[0] == '@RG':
                        if rgid != list(x[3:] for x in row if x.startswith('ID:'))[0]:
                            # skip all read groups that are not rgid
                            continue
                    outf.write('\t'.join(row) + '\n')
            samtools.reheader(tmp_bam, headerFile, one_rg_inBam)
            os.unlink(tmp_bam)

        # perform actual alignment

        # get the read group line to give to BWA
        readgroup_line = ""
        with open(headerFile) as inf:
            for line in inf:
                if line.startswith("@RG"):
                    readgroup_line = line
        # the header dump was only needed to extract the @RG line
        os.unlink(headerFile)

        assert len(readgroup_line) > 0

        tmp_bam_aligned = util.file.mkstempfname('.aligned.bam')
        # rather than reheader the alignment bam file later so it has the readgroup information
        # from the original bam file, we'll pass the RG line to bwa to write out
        self.mem(one_rg_inBam, refDb, tmp_bam_aligned, options=options+['-R',
                 readgroup_line.rstrip("\r\n")],
                 min_score_to_filter=min_score_to_filter, threads=threads)

        # if there was more than one RG in the input, we had to create a temporary file with the one RG specified
        # and we can safely delete it this file
        # if there was only one RG in the input, we used it directly and should not delete it
        if removeInput:
            os.unlink(one_rg_inBam)

        # if the aligned bam file contains no reads after filtering
        # just create an empty file
        if samtools.count(tmp_bam_aligned) == 0:
            util.file.touch(outBam)
        else:
            # samtools reheader seems to segfault on some alignments created by bwa
            # so rather than reheader, BWA writes out the RG given to it via '-R'
            # and the aligned file only needs to be sorted and indexed here
            sorter = tools.picard.SortSamTool()
            sorter.execute(
                tmp_bam_aligned,
                outBam,
                sort_order='coordinate',
                picardOptions=['CREATE_INDEX=true', 'VALIDATION_STRINGENCY=SILENT'],
                JVMmemory=JVMmemory
            )

        # the unsorted alignment is an intermediate in both branches above
        os.unlink(tmp_bam_aligned)
Esempio n. 9
0
    def align_mem_one_rg(self, inBam, refDb, outBam, rgid=None, options=None, min_qual=30, threads=None, JVMmemory=None):
        """
            Performs an alignment of one read group in a bam file to a reference fasta file

            If inBam holds several read groups, rgid selects the one to align: the
            input is subset to that read group and re-headered so that only its
            @RG line remains. The @RG line is passed to self.mem via '-R'. When
            min_qual > 0 the alignment is additionally filtered with
            "samtools view -q min_qual"; otherwise the raw alignment is used as-is.
            The result is coordinate-sorted and indexed into outBam with Picard
            SortSam; if no reads remain, outBam is created via util.file.touch.

            Returns None. When the selected read group contains no reads the method
            returns early without writing outBam.
            Raises InvalidBamHeaderError if inBam has no read groups, has several
            while rgid is not given, or does not contain rgid.

            TODO: With the addition of a third aligner to viral-ngs, the functionality
            common to this method and to the comparable method in the Novoalign wrapper should
            be broken out as an "aligner" superclass, capable of aligning bam or fastq files with an arbitrary
            aligner, while preserving read groups. 
        """
        options = options or []

        samtools = tools.samtools.SamtoolsTool()

        # Require exactly one RG (either in the file, or selected via rgid)
        rgs = samtools.getReadGroups(inBam)
        if len(rgs) == 0:
            raise InvalidBamHeaderError("{} lacks read groups".format(inBam))
        elif len(rgs) == 1:
            if not rgid:
                rgid = list(rgs.keys())[0]
        elif not rgid:
            raise InvalidBamHeaderError("{} has {} read groups, but we require exactly one".format(inBam, len(rgs)))
        if rgid not in rgs:
            raise InvalidBamHeaderError("{} has read groups, but not {}".format(inBam, rgid))

        headerFile = util.file.mkstempfname('.{}.header.txt'.format(rgid))
        # Strip inBam to just one RG (if necessary)
        removeInput = False    # True only when one_rg_inBam is a temp file created here
        if len(rgs) == 1:
            one_rg_inBam = inBam
            samtools.dumpHeader(one_rg_inBam, headerFile)
        else:
            # strip inBam to one read group
            tmp_bam = util.file.mkstempfname('.onebam.bam')
            samtools.view(['-b', '-r', rgid], inBam, tmp_bam)
            # special exit if this file is empty
            if samtools.count(tmp_bam) == 0:
                # do not leave the empty subset behind
                os.unlink(tmp_bam)
                return
            # simplify BAM header otherwise Novoalign gets confused
            one_rg_inBam = util.file.mkstempfname('.{}.in.bam'.format(rgid))
            removeInput = True

            with open(headerFile, 'wt') as outf:
                for row in samtools.getHeader(inBam):
                    if len(row) > 0 and row[0] == '@RG':
                        if rgid != list(x[3:] for x in row if x.startswith('ID:'))[0]:
                            # skip all read groups that are not rgid
                            continue
                    outf.write('\t'.join(row) + '\n')
            samtools.reheader(tmp_bam, headerFile, one_rg_inBam)
            os.unlink(tmp_bam)

        # perform actual alignment

        # get the read group line to give to BWA
        readgroup_line = ""
        with open(headerFile) as inf:
            for line in inf:
                if line.startswith("@RG"):
                    readgroup_line = line
        # the header dump was only needed to extract the @RG line
        os.unlink(headerFile)

        assert len(readgroup_line) > 0

        aln_bam_prefilter = util.file.mkstempfname('.prefiltered.bam')
        # rather than reheader the alignment bam file later so it has the readgroup information
        # from the original bam file, we'll pass the RG line to bwa to write out
        self.mem(one_rg_inBam, refDb, aln_bam_prefilter, options=options+['-R', readgroup_line.rstrip("\n").rstrip("\r")], min_qual=min_qual, threads=threads)

        # if there was more than one RG in the input, we had to create a temporary file with the one RG specified
        # and we can safely delete it this file
        # if there was only one RG in the input, we used it directly and should not delete it
        if removeInput:
            os.unlink(one_rg_inBam)

        # @haydenm says: 
        # For some reason (particularly when the --sensitive option is on), bwa
        # doesn't listen to its '-T' flag and outputs alignments with score less
        # than the '-T 30' threshold. So filter these:
        if min_qual > 0:
            tmp_bam_aligned = util.file.mkstempfname('.aligned.bam')
            samtools.view(["-b", "-h", "-q", str(min_qual)], aln_bam_prefilter, tmp_bam_aligned)
            os.unlink(aln_bam_prefilter)
        else:
            # No filtering requested: the raw alignment already is the final
            # alignment. (Previously this branch referenced tmp_bam_aligned before
            # it was ever assigned and failed with UnboundLocalError.)
            tmp_bam_aligned = aln_bam_prefilter

        # if the aligned bam file contains no reads after filtering
        # just create an empty file
        if samtools.count(tmp_bam_aligned) == 0:
            util.file.touch(outBam)
        else:
            # samtools reheader seems to segfault on some alignments created by bwa
            # so rather than reheader, BWA writes out the RG given to it via '-R'
            # and the aligned file only needs to be sorted and indexed here
            sorter = tools.picard.SortSamTool()
            sorter.execute(
                tmp_bam_aligned,
                outBam,
                sort_order='coordinate',
                picardOptions=['CREATE_INDEX=true', 'VALIDATION_STRINGENCY=SILENT'],
                JVMmemory=JVMmemory
            )

        # the unsorted alignment is an intermediate in both branches above
        os.unlink(tmp_bam_aligned)
Esempio n. 10
0
    def align_one_rg(self, inBam, refDb, outBam, rgid=None, preset=None, options=None,
                         threads=None, JVMmemory=None):
        """
            Performs an alignment of one read group in a bam file to a reference fasta file using minimap2.
            Emits alignments in sorted, index bam files.
            inBam may contain more read groups, but we will subset input to the specified rgid.
            preset may be specified as a valid value for "minimap2 -x" which depends on the type of
                data (short accurate reads vs long noisy reads). If preset is set to None, we will autodetect
                based on the PL (platform) tag in the read group header (e.g. illumina, ont, pacbio)
        """
        options = list(options).copy() or []

        samtools = tools.samtools.SamtoolsTool()

        # Require exactly one RG
        rgs = samtools.getReadGroups(inBam)
        if len(rgs) == 0:
            raise InvalidBamHeaderError("{} lacks read groups".format(inBam))
        elif len(rgs) == 1:
            if not rgid:
                rgid = list(rgs.keys())[0]
        elif not rgid:
            raise InvalidBamHeaderError("{} has {} read groups, but we require exactly one".format(inBam, len(rgs)))
        if rgid not in rgs:
            raise InvalidBamHeaderError("{} has read groups, but not {}".format(inBam, rgid))

        headerFile = util.file.mkstempfname('.{}.header.txt'.format(rgid))
        # Strip inBam to just one RG (if necessary)
        removeInput = False
        if len(rgs) == 1:
            one_rg_inBam = inBam
            tools.samtools.SamtoolsTool().dumpHeader(one_rg_inBam, headerFile)
        else:
            # strip inBam to one read group
            with util.file.tempfname('.onebam.bam') as tmp_bam:
                samtools.view(['-1', '-r', rgid], inBam, tmp_bam)
                # special exit if this file is empty
                if samtools.isEmpty(tmp_bam):
                    log.warning("No reads present for RG %s in file: %s", rgid, inBam)
                    shutil.copyfile(tmp_bam, outBam)
                    return
                # simplify BAM header otherwise Novoalign gets confused
                one_rg_inBam = util.file.mkstempfname('.{}.in.bam'.format(rgid))
                removeInput = True
                
                with open(headerFile, 'wt') as outf:
                    for row in samtools.getHeader(inBam):
                        if len(row) > 0 and row[0] == '@RG':
                            if rgid != list(x[3:] for x in row if x.startswith('ID:'))[0]:
                                # skip all read groups that are not rgid
                                continue
                        outf.write('\t'.join(row) + '\n')
                samtools.reheader(tmp_bam, headerFile, one_rg_inBam)

        # get the read group line to give to mm2
        readgroup_line = ""
        with open(headerFile) as inf:
            for line in inf:
                if line.startswith("@RG"):
                    readgroup_line = line.rstrip("\r\n")
        if not readgroup_line:
            raise Exception()
        # rather than reheader the alignment bam file later so it has the readgroup information
        # from the original bam file, we'll pass the RG line to minimap2 to write out
        options.extend(('-R', readgroup_line.replace('\t','\\t')))

        # dynamically determine the mode of operation
        if '-x' not in options:
            if preset is None:
                platform = list(x for x in readgroup_line.split('\t') if x.startswith('PL:'))
                if len(platform) != 1:
                    raise Exception("cannot autodetect minimap2 aligner mode when PL: tag is not set in the read group header for {}: {}".format(inBam, readgroup_line))
                else:
                    platform = platform[0][3:].lower()
                    if platform == 'illumina':
                        preset = 'sr'
                    elif platform == 'ont':
                        preset = 'map-ont'
                    elif platform == 'pacbio':
                        preset = 'map-pb'
                    else:
                        raise Exception("PL: tag {} for read group {} in bam {} refers to a data type we do not know how to map with minimap2".format(platform, rgid, inBam))
            options.extend(('-x', preset))

        # perform actual alignment
        if samtools.isEmpty(one_rg_inBam):
            # minimap doesn't like empty inputs, so copy empty bam through
            samtools.sort(one_rg_inBam, outBam)
        else:
            self.align_cmd(one_rg_inBam, refDb, outBam, options=options, threads=threads)

        # if there was more than one RG in the input, we had to create a temporary file with the one RG specified
        # and we can safely delete it this file
        # if there was only one RG in the input, we used it directly and should not delete it
        if removeInput:
            os.unlink(one_rg_inBam)
Esempio n. 11
0
    def align_mem_bam(self,
                      inBam,
                      refDb,
                      outBam,
                      options=None,
                      min_score_to_filter=None,
                      threads=None,
                      JVMmemory=None,
                      invert_filter=False):
        """Align all read groups of ``inBam`` against ``refDb`` into ``outBam``.

        A BAM with a single read group is handed straight to
        ``align_mem_one_rg``. A BAM with several read groups is aligned one
        read group at a time into temporary BAMs, which are then merged,
        coordinate-sorted and indexed with Picard MergeSamFiles.

        Args:
            inBam: path of the input BAM; its header must declare at least
                one read group.
            refDb: reference passed through to ``align_mem_one_rg``.
            outBam: path of the BAM to write.
            options: extra aligner options forwarded to ``align_mem_one_rg``
                (defaults to an empty list).
            min_score_to_filter: forwarded to ``align_mem_one_rg``.
            threads: forwarded to ``align_mem_one_rg``.
            JVMmemory: forwarded to Picard MergeSamFiles; only used when the
                input has more than one read group.
            invert_filter: forwarded to ``align_mem_one_rg``.

        Raises:
            InvalidBamHeaderError: if ``inBam`` has no read groups.
        """
        options = options or []

        samtools = tools.samtools.SamtoolsTool()

        # fetch list of RGs
        rgs = list(samtools.getReadGroups(inBam).keys())

        if not rgs:
            # Can't do this
            raise InvalidBamHeaderError("{} lacks read groups".format(inBam))

        if len(rgs) == 1:
            # Only one RG, keep it simple
            self.align_mem_one_rg(inBam,
                                  refDb,
                                  outBam,
                                  options=options,
                                  min_score_to_filter=min_score_to_filter,
                                  threads=threads,
                                  invert_filter=invert_filter)
            return

        # Multiple RGs, align one at a time and merge.
        # Every temporary BAM is recorded so that it is removed on all exit
        # paths, including empty outputs and failures during align/merge.
        tmp_bams = []
        try:
            align_bams = []
            for rg in rgs:
                tmp_bam = util.file.mkstempfname('.{}.bam'.format(rg))
                tmp_bams.append(tmp_bam)
                self.align_mem_one_rg(inBam,
                                      refDb,
                                      tmp_bam,
                                      rgid=rg,
                                      options=options,
                                      min_score_to_filter=min_score_to_filter,
                                      threads=threads,
                                      invert_filter=invert_filter)
                if os.path.getsize(tmp_bam) > 0:
                    align_bams.append(tmp_bam)
                else:
                    log.warning(
                        "No alignment output for RG %s in file %s against %s",
                        rg, inBam, refDb)

            if not align_bams:
                # nothing aligned for any read group: emit an empty placeholder
                util.file.touch(outBam)
            else:
                # Merge BAMs, sort, and index
                tools.picard.MergeSamFilesTool().execute(
                    align_bams,
                    outBam,
                    picardOptions=[
                        'SORT_ORDER=coordinate', 'USE_THREADING=true',
                        'CREATE_INDEX=true'
                    ],
                    JVMmemory=JVMmemory)
                if outBam.endswith((".bam", ".cram")):
                    samtools.index(outBam)
        finally:
            for bam in tmp_bams:
                if os.path.exists(bam):
                    os.unlink(bam)
Esempio n. 12
0
    def align_mem_bam(self, inBam, refDb, outBam, options=None,
                      min_score_to_filter=None, threads=None, JVMmemory=None, invert_filter=False, should_index=True):
        """Align all read groups of ``inBam`` against ``refDb`` into ``outBam``.

        A BAM with a single read group is handed straight to
        ``align_mem_one_rg``. A BAM with several read groups is aligned one
        read group at a time (via a thread pool currently limited to one
        worker) into temporary BAMs, which are then merged and
        coordinate-sorted with Picard MergeSamFiles.

        Args:
            inBam: path of the input BAM; its header must declare at least
                one read group.
            refDb: reference passed through to ``align_mem_one_rg``.
            outBam: path of the BAM to write.
            options: extra aligner options forwarded to ``align_mem_one_rg``
                (defaults to an empty list).
            min_score_to_filter: forwarded to ``align_mem_one_rg``.
            threads: thread budget; normalised with
                ``util.misc.sanitize_thread_count`` before use.
            JVMmemory: forwarded to Picard MergeSamFiles; only used when the
                input has more than one read group.
            invert_filter: forwarded to ``align_mem_one_rg``.
            should_index: when true, the output is indexed. It is forwarded
                to ``align_mem_one_rg`` for single-read-group input and adds
                ``CREATE_INDEX=true`` to the Picard merge otherwise.

        Raises:
            InvalidBamHeaderError: if ``inBam`` has no read groups.
        """
        options = options or []

        samtools = tools.samtools.SamtoolsTool()
        threads = util.misc.sanitize_thread_count(threads)

        # fetch list of RGs
        rgs = list(samtools.getReadGroups(inBam).keys())

        if not rgs:
            # Can't do this
            raise InvalidBamHeaderError("{} lacks read groups".format(inBam))

        if len(rgs) == 1:
            # Only one RG, keep it simple; honour should_index here as well
            self.align_mem_one_rg(inBam, refDb, outBam, options=options,
                                  min_score_to_filter=min_score_to_filter,
                                  threads=threads, invert_filter=invert_filter,
                                  should_index=should_index)
            return

        # Multiple RGs, align one at a time and merge
        threads_for_chunk = int(round(min(max(threads / len(rgs), 1), threads), 0)) + 1
        # worker count limited to 1 for now to reduce in-memory index size resulting from
        # running multiple copies of bwa in parallel
        workers = 1

        # Every temporary BAM is recorded so that it is removed on all exit
        # paths, including empty outputs and failures during align/merge.
        tmp_bams = []
        try:
            align_bams = []
            with concurrent.futures.ThreadPoolExecutor(max_workers=workers) as executor:
                # Map each future to its read group and output path so we do
                # not depend on the callee's return value to identify it.
                pending = {}
                for rg in rgs:
                    tmp_bam = util.file.mkstempfname('.{}.bam'.format(rg))
                    tmp_bams.append(tmp_bam)
                    future = executor.submit(
                        self.align_mem_one_rg,
                        inBam,
                        refDb,
                        tmp_bam,
                        rgid=rg,
                        options=options,
                        min_score_to_filter=min_score_to_filter,
                        threads=threads_for_chunk,
                        invert_filter=invert_filter
                    )
                    pending[future] = (rg, tmp_bam)

                for future in concurrent.futures.as_completed(pending):
                    rg, tmp_bam = pending[future]
                    # re-raises any exception thrown inside the worker
                    future.result()
                    if os.path.getsize(tmp_bam) > 0:
                        align_bams.append(tmp_bam)
                    else:
                        log.warning("No alignment output for RG %s in file %s against %s", rg, inBam, refDb)

            if not align_bams:
                # nothing aligned for any read group: emit an empty placeholder
                util.file.touch(outBam)
            else:
                # Merge BAMs, sort, and (optionally) index
                picardOptions = ['SORT_ORDER=coordinate', 'USE_THREADING=true']
                if should_index:
                    picardOptions.append('CREATE_INDEX=true')
                tools.picard.MergeSamFilesTool().execute(
                    align_bams,
                    outBam,
                    picardOptions=picardOptions,
                    JVMmemory=JVMmemory
                )
        finally:
            for bam in tmp_bams:
                if os.path.exists(bam):
                    os.unlink(bam)
Esempio n. 13
0
    def align_mem_one_rg(self, inBam, refDb, outBam, rgid=None, options=None,
                         min_score_to_filter=None, threads=None, JVMmemory=None, invert_filter=False, should_index=True):
        """
            Performs an alignment of one read group in a bam file to a reference fasta file

            If inBam holds several read groups, the reads of ``rgid`` are first
            extracted into a temporary BAM whose header keeps only that read
            group. The @RG header line is passed to the aligner with ``-R`` so
            the output carries the original read group information.

            Args:
                inBam: input BAM path; must declare at least one read group.
                refDb: reference passed through to ``self.mem``.
                outBam: output BAM path passed through to ``self.mem``.
                rgid: read group ID to align. Optional when inBam has exactly
                    one read group, required otherwise.
                options: extra aligner options (defaults to an empty list);
                    the ``-R`` option is appended to a copy, not in place.
                min_score_to_filter, threads, invert_filter, should_index:
                    forwarded to ``self.mem``.
                JVMmemory: accepted but not used by this method.

            Returns:
                ``(rgid, outBam)`` after alignment, or ``None`` when inBam has
                several read groups and ``rgid`` contains no reads (in which
                case outBam is not written by this method).

            Raises:
                InvalidBamHeaderError: if inBam has no read groups, has several
                    and ``rgid`` was not given, does not contain ``rgid``, or no
                    @RG line could be recovered from the header.

            TODO: With the addition of a third aligner to viral-ngs, the functionality
            common to this method and to the comparable method in the Novoalign wrapper should
            be broken out as an "aligner" superclass, capable of aligning bam or fastq files with an arbitrary
            aligner, while preserving read groups.
        """
        options = options or []

        samtools = tools.samtools.SamtoolsTool()

        # Require exactly one RG
        rgs = samtools.getReadGroups(inBam)
        if len(rgs) == 0:
            raise InvalidBamHeaderError("{} lacks read groups".format(inBam))
        elif len(rgs) == 1:
            if not rgid:
                rgid = list(rgs.keys())[0]
        elif not rgid:
            raise InvalidBamHeaderError("{} has {} read groups, but we require exactly one".format(inBam, len(rgs)))
        if rgid not in rgs:
            raise InvalidBamHeaderError("{} has read groups, but not {}".format(inBam, rgid))

        headerFile = util.file.mkstempfname('.{}.header.txt'.format(rgid))
        # Strip inBam to just one RG (if necessary)
        removeInput = False
        one_rg_inBam = inBam
        try:
            if len(rgs) == 1:
                samtools.dumpHeader(one_rg_inBam, headerFile)
            else:
                # strip inBam to one read group
                with util.file.tempfname('.onebam.bam') as tmp_bam:
                    samtools.view(['-b', '-r', rgid], inBam, tmp_bam)
                    # special exit if this file is empty
                    if samtools.count(tmp_bam) == 0:
                        log.warning("No reads present for RG %s in file: %s", rgid, inBam)
                        return None
                    # simplify BAM header otherwise Novoalign gets confused
                    one_rg_inBam = util.file.mkstempfname('.{}.in.bam'.format(rgid))
                    removeInput = True

                    with open(headerFile, 'wt') as outf:
                        for row in samtools.getHeader(inBam):
                            if len(row) > 0 and row[0] == '@RG':
                                row_rgid = next((x[3:] for x in row if x.startswith('ID:')), None)
                                if row_rgid != rgid:
                                    # skip all read groups that are not rgid
                                    continue
                            outf.write('\t'.join(row) + '\n')
                    samtools.reheader(tmp_bam, headerFile, one_rg_inBam)

            # get the read group line to give to BWA
            readgroup_line = ""
            with open(headerFile) as inf:
                for line in inf:
                    if line.startswith("@RG"):
                        readgroup_line = line

            # explicit check rather than assert, which is stripped under -O
            if not readgroup_line:
                raise InvalidBamHeaderError("{} lacks read groups".format(inBam))

            # perform actual alignment
            # rather than reheader the alignment bam file later so it has the readgroup information
            # from the original bam file, we'll pass the RG line to bwa to write out
            rg_option = ['-R', readgroup_line.rstrip("\r\n").replace('\t', '\\t')]
            self.mem(one_rg_inBam, refDb, outBam, options=options + rg_option,
                     min_score_to_filter=min_score_to_filter, threads=threads,
                     invert_filter=invert_filter, should_index=should_index)
        finally:
            # if there was more than one RG in the input, we had to create a temporary file with the
            # one RG specified and we can safely delete it
            # if there was only one RG in the input, we used it directly and should not delete it
            if removeInput and os.path.exists(one_rg_inBam):
                os.unlink(one_rg_inBam)

        return (rgid, outBam)