Beispiel #1
0
    def test_run(self):
        """FilterService.run() should report a zero error code."""
        filterOptions = Opt(30, 70, 50, 1, None, "random")
        service = FilterService(
            self.alignedSam,
            self.targetFileName,
            self.filteredSam,
            "BlasrService",
            -1,
            filterOptions)

        # run() yields a 3-tuple; only the middle element (the error code)
        # is checked here.
        _stdout, exitCode, _stderr = service.run()
        self.assertEqual(exitCode, 0)
    def test_run(self):
        """Run a FilterService built with "random" options; expect code 0."""
        randomOpts = Opt(30, 70, 50, 1, None, "random")
        ctorArgs = (self.alignedSam, self.targetFileName, self.filteredSam,
                    "BlasrService", -1, randomOpts)
        filterService = FilterService(*ctorArgs)

        # Unpack the (output, errCode, errMsg) triple returned by run().
        _out, returnCode, _err = filterService.run()

        self.assertEqual(returnCode, 0)
Beispiel #3
0
    def test_run_without_scoreCutoff(self):
        """FilterService without a score cutoff: check cmd, then run it."""
        noCutoffOptions = Opt(40, 50, None, None, None, "allbest")
        service = FilterService(
            self.alignedSam,
            self.targetFileName,
            self.filteredSam,
            "BowtieService",
            1,
            noCutoffOptions)

        # The generated command must omit the seed and score cutoff flags
        # but still carry the score sign passed to the constructor.
        command = service.cmd
        self.assertNotIn("-seed", command)
        self.assertNotIn("-scoreCutoff", command)
        self.assertIn("-scoreSign 1", command)

        _stdout, exitCode, _stderr = service.run()
        self.assertEqual(exitCode, 0)
    def test_run_without_scoreCutoff(self):
        """Build and run FilterService with "allbest" options, no cutoff."""
        allbestOpts = Opt(40, 50, None, None, None, "allbest")
        ctorArgs = (self.alignedSam, self.targetFileName, self.filteredSam,
                    "BowtieService", 1, allbestOpts)
        filterService = FilterService(*ctorArgs)

        # Flags that must be absent from the command line ...
        self.assertNotIn("-seed", filterService.cmd)
        self.assertNotIn("-scoreCutoff", filterService.cmd)
        # ... and the one that must be present.
        self.assertIn("-scoreSign 1", filterService.cmd)

        # Unpack the (output, errCode, errMsg) triple returned by run().
        _out, returnCode, _err = filterService.run()

        self.assertEqual(returnCode, 0)
Beispiel #5
0
class PBAlignRunner(PBToolRunner):
    """Tool runner that drives a pbalign run from alignment to final output.

    run() creates an align service for the requested algorithm, validates
    the arguments, runs the aligner, filters its output, post-processes
    BAM output and finally writes the requested output file.
    """

    def __init__(self,
                 args=None,
                 argumentList=(),
                 output_dataset_type=AlignmentSet):
        """Initialize a PBAlignRunner object.

        Input:
            args               : parsed pbalign options; when None they are
                                 obtained by parsing argumentList.
            argumentList       : a list of arguments, such as:
                                 ['--debug', '--maxHits', '10', 'in.fasta',
                                  'ref.fasta', 'out.sam']
            output_dataset_type: dataset class instantiated by _output()
                                 when the output file is XML.
        """
        desc = "Utilities for aligning PacBio reads to reference sequences."
        if args is None:  # FIXME unit testing hack
            args = get_contract_parser().arg_parser.parser.parse_args(
                argumentList)
        self.args = args
        # args.verbosity is computed by counting # of 'v's in '-vv...'.
        # However in parseOptions, arguments are parsed twice to import config
        # options and then overwrite them with argumentList (e.g. command-line)
        # options.
        #self.args.verbosity = 1 if (self.args.verbosity is None) else \
        #    (int(self.args.verbosity) / 2 + 1)
        super(PBAlignRunner, self).__init__(desc)
        self._output_dataset_type = output_dataset_type
        self._alnService = None
        self._filterService = None
        self.fileNames = PBAlignFiles()
        self._tempFileManager = TempFileManager()

    def _setupParsers(self, description):
        """Do nothing.

        NOTE(review): presumably overrides a PBToolRunner hook because the
        options are already parsed in __init__ -- confirm against the base
        class.
        """
        pass

    def _addStandardArguments(self):
        """Do nothing.

        NOTE(review): presumably overrides a PBToolRunner hook -- confirm
        against the base class.
        """
        pass

    def getVersion(self):
        """Return version."""
        return get_version()

    def _createAlignService(self, name, args, fileNames, tempFileManager):
        """
        Create and return an AlignService by algorithm name.
        Input:
            name           : an algorithm name such as blasr
            fileNames      : an PBAlignFiles object
            args           : pbalign options
            tempFileManager: a temporary file manager
        Output:
            an object of AlignService subclass (such as BlasrService), on
            which checkAvailability() has already been called.
        Raises:
            ValueError if name is not in ALGORITHM_CANDIDATES, or is a
            candidate for which no service is implemented here.
        """
        if name not in ALGORITHM_CANDIDATES:
            errMsg = "ERROR: unrecognized algorithm {algo}".format(algo=name)
            logging.error(errMsg)
            raise ValueError(errMsg)

        service = None
        if name == "blasr":
            service = BlasrService(args, fileNames, tempFileManager)
        elif name == "bowtie":
            service = BowtieService(args, fileNames, tempFileManager)
        elif name == "gmap":
            service = GMAPService(args, fileNames, tempFileManager)
        else:
            # Only reachable when ALGORITHM_CANDIDATES lists a name that
            # has no branch above.
            errMsg = "Service for {algo} is not implemented.".\
                     format(algo=name)
            logging.error(errMsg)
            raise ValueError(errMsg)

        service.checkAvailability()
        return service

    def _makeSane(self, args, fileNames):
        """
        Check whether the input arguments make sense or not.

        Side effects:
            args.readType is set to "CCS" when args.useccs is
            "useccsdenovo" or the input file format is CCS.
        Raises:
            IOError   : the output file format is CMP.H5.
            ValueError: the output format is BAM or XML and either the
                        algorithm is not blasr or filterAdapterOnly is set.
        """
        if args.useccs == "useccsdenovo":
            args.readType = "CCS"

        if fileNames.inputFileFormat == FILE_FORMATS.CCS:
            args.readType = "CCS"

        if args.forQuiver:
            logging.warning("Option --forQuiver has been deprecated in 3.0")

        outFormat = getFileFormat(fileNames.outputFileName)

        if outFormat == FILE_FORMATS.CMP:
            raise IOError("pbalign no longer supports CMP.H5 Output in 3.0.")

        if outFormat == FILE_FORMATS.BAM or outFormat == FILE_FORMATS.XML:
            if args.algorithm != "blasr":
                raise ValueError(
                    "Must choose blasr in order to output a bam file.")
            if args.filterAdapterOnly:
                raise ValueError(
                    "-filterAdapter does not work when out format is BAM.")

    def _parseArgs(self):
        """Overwrite ToolRunner.parseArgs(self).

        Intentionally a no-op in this class.
        """
        pass

    def _output(self, inSam, refFile, outFile, readType=None):
        """Generate a SAM, BAM file.
        Input:
            inSam   : an input SAM/BAM file. (e.g. fileName.filteredSam)
            refFile : the reference file. (e.g. fileName.targetFileName)
            outFile : the output SAM/BAM file
                      (i.e. fileName.outputFileName)
            readType: standard or cDNA or CCS (can be None if not specified)
        Output:
            output, errCode, errMsg -- always ("", 0, "") when no exception
            is raised.
        Raises:
            RuntimeError: moving inSam to outFile raised shutil.Error.
            IOError     : outFile is in CMP.H5 format.
        """
        output, errCode, errMsg = "", 0, ""

        outFormat = getFileFormat(outFile)

        if outFormat == FILE_FORMATS.BAM:
            pass  # Nothing to be done
        elif outFormat == FILE_FORMATS.SAM:
            logging.info("OutputService: Genearte the output SAM file.")
            logging.debug("OutputService: Move %s as %s", inSam, outFile)
            try:
                shutil.move(real_ppath(inSam), real_ppath(outFile))
            except shutil.Error as e:
                errMsg = "Exited with error: " + str(e)
                logging.error(errMsg)
                raise RuntimeError(errMsg)
        elif outFormat == FILE_FORMATS.CMP:
            errMsg = "pbalign no longer supports CMP.H5 Output in 3.0."
            logging.error(errMsg)
            raise IOError(errMsg)
        elif outFormat == FILE_FORMATS.XML:
            logging.info("OutputService: Generating the output XML file %s %s",
                         inSam, outFile)
            # Create {out}.xml, given {out}.bam: swap the last three
            # characters of the output name for "bam".
            outBam = str(outFile[0:-3]) + "bam"
            # FIXME This should really be more automatic
            # NOTE: this replaces the dataset type stored on the instance,
            # so it also applies to any later _output() call.
            if readType == "CCS":
                self._output_dataset_type = ConsensusAlignmentSet
            aln = self._output_dataset_type(real_ppath(outBam))
            for res in aln.externalResources:
                res.reference = refFile
            aln.write(outFile)

        return output, errCode, errMsg

    def _cleanUp(self, realDelete=False):
        """ Clean up temporary files and intermediate results.

        realDelete is forwarded unchanged to TempFileManager.CleanUp().
        """
        logging.debug("Clean up temporary files and directories.")
        self._tempFileManager.CleanUp(realDelete)

    def run(self):
        """
        The main function, it is called by PBToolRunner.start().

        Returns 0 on success; failures of the individual steps propagate
        as exceptions. Temporary files are cleaned up whether or not the
        pipeline succeeds.
        """
        startTime = time.time()
        logging.info("pbalign version: %s", get_version())
        #logging.debug("Original arguments: " + str(self._argumentList))

        # Create an AlignService by algorithm name.
        self._alnService = self._createAlignService(self.args.algorithm,
                                                    self.args, self.fileNames,
                                                    self._tempFileManager)

        # Make sane.
        self._makeSane(self.args, self.fileNames)

        try:
            # Run align service.
            self._alnService.run()

            # Create a temporary filtered SAM/BAM file as output for
            # FilterService.
            outFormat = getFileFormat(self.fileNames.outputFileName)
            bamLike = outFormat in [FILE_FORMATS.BAM, FILE_FORMATS.XML]
            suffix = ".bam" if bamLike else ".sam"
            self.fileNames.filteredSam = self._tempFileManager.\
                RegisterNewTmpFile(suffix=suffix)

            # Call filter service on SAM or BAM file. The algorithm name
            # from the options is passed, not self._alnService.name.
            self._filterService = FilterService(
                self.fileNames.alignerSamOut,
                self.fileNames.targetFileName,
                self.fileNames.filteredSam,
                self.args.algorithm,
                self._alnService.scoreSign,
                self.args,
                self.fileNames.adapterGffFileName)
            self._filterService.run()

            # Sort bam before output
            if bamLike:
                # Sort/make index for BAM output.
                BamPostService(self.fileNames).run()

            # Output all hits in SAM, BAM.
            self._output(inSam=self.fileNames.filteredSam,
                         refFile=self.fileNames.targetFileName,
                         outFile=self.fileNames.outputFileName,
                         readType=self.args.readType)
        finally:
            # Clean up even when a step above raised, so that a failed run
            # does not leave temporary files behind. A real delete is
            # requested unless args.keepTmpFiles exists and is True.
            self._cleanUp(
                getattr(self.args, "keepTmpFiles", False) is not True)

        endTime = time.time()
        logging.info("Total time: {:.2f} s.".format(float(endTime -
                                                          startTime)))
        return 0
Beispiel #6
0
class PBAlignRunner(PBToolRunner):
    """Tool runner that drives a pbalign run producing SAM or CMP.H5 output.

    run() creates an align service for the requested algorithm, validates
    the arguments, runs the aligner, filters its output, writes the output
    file and optionally runs the post service for Quiver.
    """

    def __init__(self, argumentList):
        """Initialize a PBAlignRunner object.
           argumentList is a list of arguments, such as:
           ['--debug', '--maxHits', '10', 'in.fasta', 'ref.fasta', 'out.sam']
        """
        desc = "Utilities for aligning PacBio reads to reference sequences."
        super(PBAlignRunner, self).__init__(desc)
        self._argumentList = argumentList
        self._alnService = None
        self._filterService = None
        self.fileNames = PBAlignFiles()
        self._tempFileManager = TempFileManager()

        self.parser, self.args, _infoMsg = parseOptions(
            argumentList=self._argumentList, parser=self.parser)
        # args.verbosity is computed by counting # of 'v's in '-vv...'.
        # However in parseOptions, arguments are parsed twice to import config
        # options and then overwrite them with argumentList (e.g. command-line)
        # options. The count is therefore halved; floor division keeps the
        # result an integer under both Python 2 and Python 3.
        self.args.verbosity = 0 if (self.args.verbosity is None) else \
            int(self.args.verbosity) // 2

    def getVersion(self):
        """Return version."""
        return get_version()

    def _createAlignService(self, name, args, fileNames, tempFileManager):
        """
        Create and return an AlignService by algorithm name.
        Input:
            name           : an algorithm name such as blasr
            fileNames      : an PBAlignFiles object
            args           : pbalign options
            tempFileManager: a temporary file manager
        Output:
            an object of AlignService subclass (such as BlasrService), on
            which checkAvailability() has already been called.
        Raises:
            ValueError if name is not in ALGORITHM_CANDIDATES, or is a
            candidate for which no service is implemented here.
        """
        if name not in ALGORITHM_CANDIDATES:
            errMsg = "ERROR: unrecognized algorithm {algo}".format(algo=name)
            logging.error(errMsg)
            raise ValueError(errMsg)

        service = None
        if name == "blasr":
            service = BlasrService(args, fileNames, tempFileManager)
        elif name == "bowtie":
            service = BowtieService(args, fileNames, tempFileManager)
        elif name == "gmap":
            service = GMAPService(args, fileNames, tempFileManager)
        else:
            # Only reachable when ALGORITHM_CANDIDATES lists a name that
            # has no branch above.
            errMsg = "Service for {algo} is not implemented.".\
                     format(algo=name)
            logging.error(errMsg)
            raise ValueError(errMsg)

        service.checkAvailability()
        return service

    def _makeSane(self, args, fileNames):
        """
        Check whether the input arguments make sense or not.

        Side effects:
            args.readType is set to "CCS" when args.useccs is
            "useccsdenovo" or the input file format is CCS.
            args.loadQVs is set to True when args.forQuiver is set.
        Raises:
            ValueError: --forQuiver is combined with --useccs, or pulse QVs
                        are to be loaded but no pulse file is known and/or
                        the output file is not in cmp.h5 format.
        """
        if args.useccs == "useccsdenovo":
            args.readType = "CCS"

        if fileNames.inputFileFormat == FILE_FORMATS.CCS:
            args.readType = "CCS"

        if args.forQuiver:
            if args.useccs is not None:
                errMsg = "Options --forQuiver and --useccs should not " + \
                         "be used together, since Quiver is not designed to " + \
                         "polish ccs reads. if you want to align ccs reads " + \
                         "in cmp.h5 format with pulse QVs loaded, use " + \
                         "--loadQVs with --useccs instead."
                raise ValueError(errMsg)
            args.loadQVs = True

        if args.loadQVs:
            # Collect every violated requirement so that neither one hides
            # the other in the reported message.
            problems = []
            if fileNames.pulseFileName is None:
                problems.append(
                    "The input file has to be in bas/pls/ccs.h5 " +
                    "format, or --pulseFile needs to be specified, ")
            if getFileFormat(fileNames.outputFileName) != FILE_FORMATS.CMP:
                problems.append(
                    "The output file has to be in cmp.h5 format, ")
            if problems:
                errMsg = "".join(problems) + "in order to load pulse QVs."
                logging.error(errMsg)
                raise ValueError(errMsg)

    def _parseArgs(self):
        """Overwrite ToolRunner.parseArgs(self).

        Intentionally a no-op: the arguments in argumentList and in the
        config file are already parsed by parseOptions() in __init__.
        """
        pass

    def _output(self, inSam, refFile, outFile, readType=None, smrtTitle=False):
        """Generate a sam or a cmp.h5 file.
        Input:
            inSam    : an input SAM file. (e.g. fileName.filteredSam)
            refFile  : the reference file. (e.g. fileName.targetFileName)
            outFile  : the output SAM or CMP.H5 file.
                       (i.e. fileName.outputFileName)
            readType : standard or cDNA or CCS (can be None if not specified)
            smrtTitle: when true, " -smrtTitle " is appended to the samtoh5
                       command line.
        Output:
            output, errCode, errMsg
        Raises:
            RuntimeError: moving inSam to outFile raised shutil.Error, or
                          samtoh5 returned a non-zero exit status.
        """
        output, errCode, errMsg = "", 0, ""
        outFormat = getFileFormat(outFile)

        if outFormat == FILE_FORMATS.SAM:
            #`mv inSam outFile`
            logging.info("OutputService: Genearte the output SAM file.")
            logging.debug("OutputService: Move {src} as {dst}".format(
                src=inSam, dst=outFile))
            try:
                shutil.move(real_ppath(inSam), real_ppath(outFile))
            except shutil.Error as e:
                # Report the failure here; no external program is involved
                # in this branch, so there is no program name to blame.
                errMsg = "Exited with error: " + str(e)
                logging.error(errMsg)
                raise RuntimeError(errMsg)
        elif outFormat == FILE_FORMATS.CMP:
            #`samtoh5 inSam outFile -readType readType
            logging.info("OutputService: Genearte the output CMP.H5 " +
                         "file using samtoh5.")
            prog = "samtoh5"
            cmd = "samtoh5 {samFile} {refFile} {outFile}".format(
                samFile=inSam, refFile=refFile, outFile=outFile)
            if readType is not None:
                cmd += " -readType {0} ".format(readType)
            if smrtTitle:
                cmd += " -smrtTitle "
            # Execute the command line
            logging.debug("OutputService: Call \"{0}\"".format(cmd))
            output, errCode, errMsg = backticks(cmd)
            if errCode != 0:
                errMsg = prog + " returned a non-zero exit status." + errMsg
                logging.error(errMsg)
                raise RuntimeError(errMsg)

        return output, errCode, errMsg

    def _cleanUp(self, realDelete=False):
        """ Clean up temporary files and intermediate results.

        realDelete is forwarded unchanged to TempFileManager.CleanUp().
        """
        logging.debug("Clean up temporary files and directories.")
        self._tempFileManager.CleanUp(realDelete)

    def _runServices(self):
        """Run the align, filter, output and post services in order.

        Returns 0 when every step completed and 1 as soon as one of them
        raised RuntimeError.
        """
        # Run align service.
        try:
            self._alnService.run()
        except RuntimeError:
            return 1

        # Create a temporary filtered SAM file as output for FilterService.
        self.fileNames.filteredSam = self._tempFileManager.\
            RegisterNewTmpFile(suffix=".sam")

        # Call filter service.
        self._filterService = FilterService(self.fileNames.alignerSamOut,
                                            self.fileNames.targetFileName,
                                            self.fileNames.filteredSam,
                                            self._alnService.name,
                                            self._alnService.scoreSign,
                                            self.args,
                                            self.fileNames.adapterGffFileName)
        try:
            self._filterService.run()
        except RuntimeError:
            return 1

        # Output all hits either in SAM or CMP.H5.
        useSmrtTitle = (self.args.algorithm != "blasr" or
                        self.fileNames.inputFileFormat == FILE_FORMATS.FASTA)
        try:
            self._output(
                self.fileNames.filteredSam,
                self.fileNames.targetFileName,
                self.fileNames.outputFileName,
                self.args.readType,
                useSmrtTitle)
        except RuntimeError:
            return 1

        # Call post service for quiver.
        if self.args.forQuiver or self.args.loadQVs:
            postService = ForQuiverService(self.fileNames,
                                           self.args)
            try:
                postService.run()
            except RuntimeError:
                return 1

        return 0

    def run(self):
        """
        The main function, it is called by PBToolRunner.start().

        Returns 0 on success and 1 when a service raised RuntimeError.
        Temporary files are cleaned up in both cases.
        """
        startTime = time.time()
        logging.info("pbalign version: {version}".format(version=get_version()))
        logging.debug("Original arguments: " + str(self._argumentList))

        # Create an AlignService by algorithm name.
        self._alnService = self._createAlignService(self.args.algorithm,
                                                    self.args,
                                                    self.fileNames,
                                                    self._tempFileManager)

        # Make sane.
        self._makeSane(self.args, self.fileNames)

        try:
            errCode = self._runServices()
        finally:
            # Clean up on failure as well, so that a failed run does not
            # leave temporary files behind. A real delete is requested
            # unless args.keepTmpFiles exists and is True.
            self._cleanUp(
                getattr(self.args, "keepTmpFiles", False) is not True)

        if errCode != 0:
            return errCode

        endTime = time.time()
        logging.info("Total time: {:.2f} s.".format(float(endTime - startTime)))
        return 0
Beispiel #7
0
class PBAlignRunner(PBToolRunner):

    """Tool runner that drives a pbalign run from alignment to final output.

    run() creates an align service for the requested algorithm, validates
    the arguments, runs the aligner, filters its output, post-processes
    BAM output and finally writes the requested output file.
    """

    def __init__(self, args=None, argumentList=(),
                 output_dataset_type=AlignmentSet):
        """Initialize a PBAlignRunner object.

        Input:
            args               : parsed pbalign options; when None they are
                                 obtained by parsing argumentList.
            argumentList       : a list of arguments, such as:
                                 ['--debug', '--maxHits', '10', 'in.fasta',
                                  'ref.fasta', 'out.sam']
            output_dataset_type: dataset class instantiated by _output()
                                 when the output file is XML.
        """
        desc = "Utilities for aligning PacBio reads to reference sequences."
        if args is None: # FIXME unit testing hack
            args = get_contract_parser().arg_parser.parser.parse_args(argumentList)
        self.args = args
        # args.verbosity is computed by counting # of 'v's in '-vv...'.
        # However in parseOptions, arguments are parsed twice to import config
        # options and then overwrite them with argumentList (e.g. command-line)
        # options.
        #self.args.verbosity = 1 if (self.args.verbosity is None) else \
        #    (int(self.args.verbosity) / 2 + 1)
        super(PBAlignRunner, self).__init__(desc)
        self._output_dataset_type = output_dataset_type
        self._alnService = None
        self._filterService = None
        self.fileNames = PBAlignFiles()
        self._tempFileManager = TempFileManager()

    def _setupParsers(self, description):
        """Do nothing.

        NOTE(review): presumably overrides a PBToolRunner hook because the
        options are already parsed in __init__ -- confirm against the base
        class.
        """
        pass

    def _addStandardArguments(self):
        """Do nothing.

        NOTE(review): presumably overrides a PBToolRunner hook -- confirm
        against the base class.
        """
        pass

    def getVersion(self):
        """Return version."""
        return get_version()

    def _createAlignService(self, name, args, fileNames, tempFileManager):
        """
        Create and return an AlignService by algorithm name.
        Input:
            name           : an algorithm name such as blasr
            fileNames      : an PBAlignFiles object
            args           : pbalign options
            tempFileManager: a temporary file manager
        Output:
            an object of AlignService subclass (such as BlasrService), on
            which checkAvailability() has already been called.
        Raises:
            ValueError if name is not in ALGORITHM_CANDIDATES, or is a
            candidate for which no service is implemented here.
        """
        if name not in ALGORITHM_CANDIDATES:
            errMsg = "ERROR: unrecognized algorithm {algo}".format(algo=name)
            logging.error(errMsg)
            raise ValueError(errMsg)

        service = None
        if name == "blasr":
            service = BlasrService(args, fileNames, tempFileManager)
        elif name == "bowtie":
            service = BowtieService(args, fileNames, tempFileManager)
        elif name == "gmap":
            service = GMAPService(args, fileNames, tempFileManager)
        else:
            # Only reachable when ALGORITHM_CANDIDATES lists a name that
            # has no branch above.
            errMsg = "Service for {algo} is not implemented.".\
                     format(algo=name)
            logging.error(errMsg)
            raise ValueError(errMsg)

        service.checkAvailability()
        return service

    def _makeSane(self, args, fileNames):
        """
        Check whether the input arguments make sense or not.

        Side effects:
            args.readType is set to "CCS" when args.useccs is
            "useccsdenovo" or the input file format is CCS.
        Raises:
            IOError   : the output file format is CMP.H5.
            ValueError: the output format is BAM or XML and either the
                        algorithm is not blasr or filterAdapterOnly is set.
        """
        if args.useccs == "useccsdenovo":
            args.readType = "CCS"

        if fileNames.inputFileFormat == FILE_FORMATS.CCS:
            args.readType = "CCS"

        if args.forQuiver:
            logging.warning("Option --forQuiver has been deprecated in 3.0")

        outFormat = getFileFormat(fileNames.outputFileName)

        if outFormat == FILE_FORMATS.CMP:
            raise IOError("pbalign no longer supports CMP.H5 Output in 3.0.")

        if outFormat == FILE_FORMATS.BAM or outFormat == FILE_FORMATS.XML:
            if args.algorithm != "blasr":
                raise ValueError(
                    "Must choose blasr in order to output a bam file.")
            if args.filterAdapterOnly:
                raise ValueError(
                    "-filterAdapter does not work when out format is BAM.")

    def _parseArgs(self):
        """Overwrite ToolRunner.parseArgs(self).

        Intentionally a no-op in this class.
        """
        pass

    def _output(self, inSam, refFile, outFile, readType=None):
        """Generate a SAM, BAM file.
        Input:
            inSam   : an input SAM/BAM file. (e.g. fileName.filteredSam)
            refFile : the reference file. (e.g. fileName.targetFileName)
            outFile : the output SAM/BAM file
                      (i.e. fileName.outputFileName)
            readType: standard or cDNA or CCS (can be None if not specified)
        Output:
            output, errCode, errMsg -- always ("", 0, "") when no exception
            is raised.
        Raises:
            RuntimeError: moving inSam to outFile raised shutil.Error.
            IOError     : outFile is in CMP.H5 format.
        """
        output, errCode, errMsg = "", 0, ""

        outFormat = getFileFormat(outFile)

        if outFormat == FILE_FORMATS.BAM:
            pass # Nothing to be done
        elif outFormat == FILE_FORMATS.SAM:
            logging.info("OutputService: Genearte the output SAM file.")
            logging.debug("OutputService: Move %s as %s", inSam, outFile)
            try:
                shutil.move(real_ppath(inSam), real_ppath(outFile))
            except shutil.Error as e:
                errMsg = "Exited with error: " + str(e)
                logging.error(errMsg)
                raise RuntimeError(errMsg)
        elif outFormat == FILE_FORMATS.CMP:
            errMsg = "pbalign no longer supports CMP.H5 Output in 3.0."
            logging.error(errMsg)
            raise IOError(errMsg)
        elif outFormat == FILE_FORMATS.XML:
            logging.info("OutputService: Generating the output XML file %s %s",
                         inSam, outFile)
            # Create {out}.xml, given {out}.bam: swap the last three
            # characters of the output name for "bam".
            outBam = str(outFile[0:-3]) + "bam"
            # FIXME This should really be more automatic
            # NOTE: this replaces the dataset type stored on the instance,
            # so it also applies to any later _output() call.
            if readType == "CCS":
                self._output_dataset_type = ConsensusAlignmentSet
            aln = self._output_dataset_type(real_ppath(outBam))
            for res in aln.externalResources:
                res.reference = refFile
            aln.write(outFile)

        return output, errCode, errMsg

    def _cleanUp(self, realDelete=False):
        """ Clean up temporary files and intermediate results.

        realDelete is forwarded unchanged to TempFileManager.CleanUp().
        """
        logging.debug("Clean up temporary files and directories.")
        self._tempFileManager.CleanUp(realDelete)

    def run(self):
        """
        The main function, it is called by PBToolRunner.start().

        Returns 0 on success; failures of the individual steps propagate
        as exceptions. Temporary files are cleaned up whether or not the
        pipeline succeeds.
        """
        startTime = time.time()
        logging.info("pbalign version: %s", get_version())
        #logging.debug("Original arguments: " + str(self._argumentList))

        # Create an AlignService by algorithm name.
        self._alnService = self._createAlignService(self.args.algorithm,
                                                    self.args,
                                                    self.fileNames,
                                                    self._tempFileManager)

        # Make sane.
        self._makeSane(self.args, self.fileNames)

        try:
            # Run align service.
            self._alnService.run()

            # Create a temporary filtered SAM/BAM file as output for
            # FilterService.
            outFormat = getFileFormat(self.fileNames.outputFileName)
            bamLike = outFormat in [FILE_FORMATS.BAM, FILE_FORMATS.XML]
            suffix = ".bam" if bamLike else ".sam"
            self.fileNames.filteredSam = self._tempFileManager.\
                RegisterNewTmpFile(suffix=suffix)

            # Call filter service on SAM or BAM file. The algorithm name
            # from the options is passed, not self._alnService.name.
            self._filterService = FilterService(
                self.fileNames.alignerSamOut,
                self.fileNames.targetFileName,
                self.fileNames.filteredSam,
                self.args.algorithm,
                self._alnService.scoreSign,
                self.args,
                self.fileNames.adapterGffFileName)
            self._filterService.run()

            # Sort bam before output
            if bamLike:
                # Sort/make index for BAM output.
                BamPostService(self.fileNames).run()

            # Output all hits in SAM, BAM.
            self._output(
                inSam=self.fileNames.filteredSam,
                refFile=self.fileNames.targetFileName,
                outFile=self.fileNames.outputFileName,
                readType=self.args.readType)
        finally:
            # Clean up even when a step above raised, so that a failed run
            # does not leave temporary files behind. A real delete is
            # requested unless args.keepTmpFiles exists and is True.
            self._cleanUp(
                getattr(self.args, "keepTmpFiles", False) is not True)

        endTime = time.time()
        logging.info("Total time: {:.2f} s.".format(float(endTime - startTime)))
        return 0
Beispiel #8
0
class PBAlignRunner(PBToolRunner):

    """Tool runner that drives a pbalign job from alignment to final output."""

    def __init__(self, args=None, argumentList=(),
                 output_dataset_type=AlignmentSet):
        """Initialize a PBAlignRunner object.

        Input:
            args               : parsed options; when None, argumentList is
                                 parsed with the contract parser instead.
            argumentList       : a list of arguments, such as:
                ['--debug', '--maxHits', '10', 'in.fasta', 'ref.fasta',
                 'out.sam']
            output_dataset_type: dataset class instantiated by _output()
                                 when the output file is XML.
        """
        desc = "Utilities for aligning PacBio reads to reference sequences."
        if args is None:  # FIXME unit testing hack
            parser = get_contract_parser().arg_parser.parser
            args = parser.parse_args(argumentList)
        self.args = args
        # args.verbosity is computed by counting # of 'v's in '-vv...'.
        # However in parseOptions, arguments are parsed twice to import config
        # options and then overwrite them with argumentList (e.g. command-line)
        # options.
        #self.args.verbosity = 1 if (self.args.verbosity is None) else \
        #    (int(self.args.verbosity) / 2 + 1)
        super(PBAlignRunner, self).__init__(desc)
        self._output_dataset_type = output_dataset_type
        self._alnService = None
        self._filterService = None
        self.fileNames = PBAlignFiles()
        self._tempFileManager = TempFileManager()

    def _setupParsers(self, description):
        """Intentionally empty; presumably overrides a PBToolRunner hook."""
        pass

    def _addStandardArguments(self):
        """Intentionally empty; presumably overrides a PBToolRunner hook."""
        pass

    def getVersion(self):
        """Return version."""
        return get_version()

    def _createAlignService(self, name, args, fileNames, tempFileManager):
        """
        Create and return an AlignService by algorithm name.
        Input:
            name           : an algorithm name such as blasr
            fileNames      : an PBAlignFiles object
            args           : pbalign options
            tempFileManager: a temporary file manager
        Output:
            an object of AlignService subclass (such as BlasrService), after
            its checkAvailability() has been called.
        Raises:
            ValueError if name is not in ALGORITHM_CANDIDATES or has no
            service implemented here.
        """
        if name not in ALGORITHM_CANDIDATES:
            errMsg = "ERROR: unrecognized algorithm {algo}".format(algo=name)
            logging.error(errMsg)
            raise ValueError(errMsg)

        if name == "blasr":
            serviceClass = BlasrService
        elif name == "bowtie":
            serviceClass = BowtieService
        elif name == "gmap":
            serviceClass = GMAPService
        else:
            # A candidate algorithm for which no service is wired up here.
            errMsg = "Service for {algo} is not implemented.".\
                     format(algo=name)
            logging.error(errMsg)
            raise ValueError(errMsg)

        service = serviceClass(args, fileNames, tempFileManager)
        service.checkAvailability()
        return service

    def _makeSane(self, args, fileNames):
        """
        Check whether the input arguments make sense or not.

        May update args in place: readType is set to "CCS" for ccs input or
        --useccs useccsdenovo, and --forQuiver turns on loadQVs.
        Raises:
            ValueError (after logging the message) on an inconsistent
            combination of options and file formats.
        """
        if args.useccs == "useccsdenovo":
            args.readType = "CCS"

        if fileNames.inputFileFormat == FILE_FORMATS.CCS:
            args.readType = "CCS"

        if args.forQuiver:
            if args.useccs is not None:
                errMsg = "Options --forQuiver and --useccs should not " + \
                         "be used together, since Quiver is not designed to " + \
                         "polish ccs reads. if you want to align ccs reads " + \
                         "in cmp.h5 format with pulse QVs loaded, use " + \
                         "--loadQVs with --useccs instead."
                logging.error(errMsg)
                raise ValueError(errMsg)
            args.loadQVs = True

        outFormat = getFileFormat(fileNames.outputFileName)
        if args.loadQVs:
            # Collect every violated requirement so that all of them are
            # reported at once instead of only the last one found.
            problems = []
            if fileNames.pulseFileName is None:
                problems.append(
                    "The input file has to be in bas/pls/ccs.h5 " +
                    "format, or --pulseFile needs to be specified, ")
            if outFormat != FILE_FORMATS.CMP:
                problems.append(
                    "The output file has to be in cmp.h5 format, ")
            if problems:
                errMsg = "".join(problems) + "in order to load pulse QVs."
                logging.error(errMsg)
                raise ValueError(errMsg)

        if outFormat == FILE_FORMATS.BAM or outFormat == FILE_FORMATS.XML:
            if args.algorithm != "blasr":
                errMsg = "Must choose blasr in order to output a bam file."
                logging.error(errMsg)
                raise ValueError(errMsg)
            if args.filterAdapterOnly:
                errMsg = "-filterAdapter does not work when out format is BAM."
                logging.error(errMsg)
                raise ValueError(errMsg)

    def _parseArgs(self):
        """Do nothing.

        Arguments are already available as self.args, either passed to or
        parsed by __init__, so no further parsing happens here.
        """
        pass

    def _output(self, inSam, refFile, outFile, readType=None, smrtTitle=False):
        """Generate a SAM, BAM, CMP.H5 or XML output file.
        Input:
            inSam    : an input SAM/BAM file. (e.g. fileName.filteredSam)
            refFile  : the reference file. (e.g. fileName.targetFileName)
            outFile  : the output SAM/BAM, CMP.H5 or XML file.
                       (i.e. fileName.outputFileName)
            readType : standard or cDNA or CCS (can be None if not specified)
            smrtTitle: whether to pass -smrtTitle to samtoh5 (CMP.H5 only)
        Output:
            output, errCode, errMsg
        Raises:
            RuntimeError if the step generating the output reports a
            non-zero error code.
        """
        output, errCode, errMsg = "", 0, ""
        # Name of the step reported in the error message; every branch that
        # can set a non-zero errCode overrides it.
        prog = "OutputService"

        outFormat = getFileFormat(outFile)

        if outFormat == FILE_FORMATS.BAM:
            pass # Nothing to be done
        elif outFormat == FILE_FORMATS.SAM:
            prog = "shutil.move"
            logging.info("OutputService: Genearte the output SAM file.")
            logging.debug("OutputService: Move {src} as {dst}".format(
                src=inSam, dst=outFile))
            try:
                shutil.move(real_ppath(inSam), real_ppath(outFile))
            except shutil.Error as e:
                output, errCode, errMsg = "", 1, str(e)
        elif outFormat == FILE_FORMATS.CMP:
            #`samtoh5 inSam outFile -readType readType
            logging.info("OutputService: Genearte the output CMP.H5 " +
                         "file using samtoh5.")
            prog = "samtoh5"
            # NOTE(review): file names are interpolated unquoted; if
            # backticks() goes through a shell, paths with spaces or shell
            # metacharacters would break this command -- confirm.
            cmd = "samtoh5 {samFile} {refFile} {outFile}".format(
                samFile=inSam, refFile=refFile, outFile=outFile)
            if readType is not None:
                cmd += " -readType {0} ".format(readType)
            if smrtTitle:
                cmd += " -smrtTitle "
            # Execute the command line
            logging.debug("OutputService: Call \"{0}\"".format(cmd))
            output, errCode, errMsg = backticks(cmd)
        elif outFormat == FILE_FORMATS.XML:
            logging.info("OutputService: Generating the output XML file")
            # Create {out}.xml, given {out}.bam
            outBam = str(outFile[0:-3]) + "bam"
            # FIXME This should really be more automatic
            if self.args.readType == "CCS":
                self._output_dataset_type = ConsensusAlignmentSet
            aln = self._output_dataset_type(real_ppath(outBam))
            for res in aln.externalResources:
                res.reference = refFile
            aln.write(outFile)

        if errCode != 0:
            errMsg = prog + " returned a non-zero exit status. " + errMsg
            logging.error(errMsg)
            raise RuntimeError(errMsg)
        return output, errCode, errMsg

    def _cleanUp(self, realDelete=False):
        """ Clean up temporary files and intermediate results. """
        logging.debug("Clean up temporary files and directories.")
        self._tempFileManager.CleanUp(realDelete)

    def run(self):
        """
        The main function, it is called by PBToolRunner.start().

        Output:
            0 on success, 1 if a pipeline stage raises RuntimeError.
        Errors raised while creating the align service or checking the
        options (e.g. ValueError) are not caught here. Once the pipeline has
        started, temporary files are cleaned up whether it succeeds or fails.
        """
        startTime = time.time()
        logging.info("pbalign version: {version}".format(version=get_version()))
        # FIXME
        #logging.debug("Original arguments: " + str(self._argumentList))

        # Create an AlignService by algorithm name.
        self._alnService = self._createAlignService(self.args.algorithm,
                                                    self.args,
                                                    self.fileNames,
                                                    self._tempFileManager)

        # Make sane.
        self._makeSane(self.args, self.fileNames)

        try:
            # Run align service.
            self._alnService.run()

            # Create a temporary filtered SAM/BAM file as output for
            # FilterService.
            outFormat = getFileFormat(self.fileNames.outputFileName)
            isBamLike = outFormat in [FILE_FORMATS.BAM, FILE_FORMATS.XML]
            self.fileNames.filteredSam = \
                self._tempFileManager.RegisterNewTmpFile(
                    suffix=".bam" if isBamLike else ".sam")

            # Call filter service on SAM or BAM file.
            self._filterService = FilterService(
                self.fileNames.alignerSamOut,
                self.fileNames.targetFileName,
                self.fileNames.filteredSam,
                self.args.algorithm,
                #self._alnService.name,
                self._alnService.scoreSign,
                self.args,
                self.fileNames.adapterGffFileName)
            self._filterService.run()

            # Sort/make index for BAM output before writing it out.
            if isBamLike:
                BamPostService(self.fileNames).run()

            # Output all hits in SAM, BAM or CMP.H5.
            useSmrtTitle = (
                self.args.algorithm != "blasr" or
                self.fileNames.inputFileFormat == FILE_FORMATS.FASTA)
            self._output(
                inSam=self.fileNames.filteredSam,
                refFile=self.fileNames.targetFileName,
                outFile=self.fileNames.outputFileName,
                readType=self.args.readType,
                smrtTitle=useSmrtTitle)

            # Load QVs to cmp.h5 for Quiver. The parentheses matter: without
            # them the test parses as (CMP and forQuiver) or loadQVs.
            if outFormat == FILE_FORMATS.CMP and \
                    (self.args.forQuiver or self.args.loadQVs):
                # Call post service for quiver.
                ForQuiverService(self.fileNames, self.args).run()
        except RuntimeError as e:
            logging.error("pbalign failed: {0}".format(e))
            return 1
        finally:
            # Delete temporary files on every exit path unless the user
            # explicitly asked to keep them.
            keepTmpFiles = getattr(self.args, "keepTmpFiles", False) is True
            self._cleanUp(not keepTmpFiles)

        endTime = time.time()
        logging.info("Total time: {:.2f} s.".format(float(endTime - startTime)))
        return 0
Beispiel #9
0
class PBAlignRunner(PBToolRunner):
    """Tool runner that drives a pbalign job from alignment to final output."""
    def __init__(self,
                 args=None,
                 argumentList=(),
                 output_dataset_type=AlignmentSet):
        """Initialize a PBAlignRunner object.

        Input:
            args               : parsed options; when None, argumentList is
                                 parsed with the contract parser instead.
            argumentList       : a list of arguments, such as:
                ['--debug', '--maxHits', '10', 'in.fasta', 'ref.fasta',
                 'out.sam']
            output_dataset_type: dataset class instantiated by _output()
                                 when the output file is XML.
        """
        desc = "Utilities for aligning PacBio reads to reference sequences."
        if args is None:  # FIXME unit testing hack
            parser = get_contract_parser().arg_parser.parser
            args = parser.parse_args(argumentList)
        self.args = args
        # args.verbosity is computed by counting # of 'v's in '-vv...'.
        # However in parseOptions, arguments are parsed twice to import config
        # options and then overwrite them with argumentList (e.g. command-line)
        # options.
        #self.args.verbosity = 1 if (self.args.verbosity is None) else \
        #    (int(self.args.verbosity) / 2 + 1)
        super(PBAlignRunner, self).__init__(desc)
        self._output_dataset_type = output_dataset_type
        self._alnService = None
        self._filterService = None
        self.fileNames = PBAlignFiles()
        self._tempFileManager = TempFileManager()

    def _setupParsers(self, description):
        """Intentionally empty; presumably overrides a PBToolRunner hook."""
        pass

    def _addStandardArguments(self):
        """Intentionally empty; presumably overrides a PBToolRunner hook."""
        pass

    def getVersion(self):
        """Return version."""
        return get_version()

    def _createAlignService(self, name, args, fileNames, tempFileManager):
        """
        Create and return an AlignService by algorithm name.
        Input:
            name           : an algorithm name such as blasr
            fileNames      : an PBAlignFiles object
            args           : pbalign options
            tempFileManager: a temporary file manager
        Output:
            an object of AlignService subclass (such as BlasrService), after
            its checkAvailability() has been called.
        Raises:
            ValueError if name is not in ALGORITHM_CANDIDATES or has no
            service implemented here.
        """
        if name not in ALGORITHM_CANDIDATES:
            errMsg = "ERROR: unrecognized algorithm {algo}".format(algo=name)
            logging.error(errMsg)
            raise ValueError(errMsg)

        if name == "blasr":
            serviceClass = BlasrService
        elif name == "bowtie":
            serviceClass = BowtieService
        elif name == "gmap":
            serviceClass = GMAPService
        else:
            # A candidate algorithm for which no service is wired up here.
            errMsg = "Service for {algo} is not implemented.".\
                     format(algo=name)
            logging.error(errMsg)
            raise ValueError(errMsg)

        service = serviceClass(args, fileNames, tempFileManager)
        service.checkAvailability()
        return service

    def _makeSane(self, args, fileNames):
        """
        Check whether the input arguments make sense or not.

        May update args in place: readType is set to "CCS" for ccs input or
        --useccs useccsdenovo, and --forQuiver turns on loadQVs.
        Raises:
            ValueError (after logging the message) on an inconsistent
            combination of options and file formats.
        """
        if args.useccs == "useccsdenovo":
            args.readType = "CCS"

        if fileNames.inputFileFormat == FILE_FORMATS.CCS:
            args.readType = "CCS"

        if args.forQuiver:
            if args.useccs is not None:
                errMsg = "Options --forQuiver and --useccs should not " + \
                         "be used together, since Quiver is not designed to " + \
                         "polish ccs reads. if you want to align ccs reads " + \
                         "in cmp.h5 format with pulse QVs loaded, use " + \
                         "--loadQVs with --useccs instead."
                logging.error(errMsg)
                raise ValueError(errMsg)
            args.loadQVs = True

        outFormat = getFileFormat(fileNames.outputFileName)
        if args.loadQVs:
            # Collect every violated requirement so that all of them are
            # reported at once instead of only the last one found.
            problems = []
            if fileNames.pulseFileName is None:
                problems.append(
                    "The input file has to be in bas/pls/ccs.h5 " +
                    "format, or --pulseFile needs to be specified, ")
            if outFormat != FILE_FORMATS.CMP:
                problems.append(
                    "The output file has to be in cmp.h5 format, ")
            if problems:
                errMsg = "".join(problems) + "in order to load pulse QVs."
                logging.error(errMsg)
                raise ValueError(errMsg)

        if outFormat == FILE_FORMATS.BAM or outFormat == FILE_FORMATS.XML:
            if args.algorithm != "blasr":
                errMsg = "Must choose blasr in order to output a bam file."
                logging.error(errMsg)
                raise ValueError(errMsg)
            if args.filterAdapterOnly:
                errMsg = "-filterAdapter does not work when out format is BAM."
                logging.error(errMsg)
                raise ValueError(errMsg)

    def _parseArgs(self):
        """Do nothing.

        Arguments are already available as self.args, either passed to or
        parsed by __init__, so no further parsing happens here.
        """
        pass

    def _output(self, inSam, refFile, outFile, readType=None, smrtTitle=False):
        """Generate a SAM, BAM, CMP.H5 or XML output file.
        Input:
            inSam    : an input SAM/BAM file. (e.g. fileName.filteredSam)
            refFile  : the reference file. (e.g. fileName.targetFileName)
            outFile  : the output SAM/BAM, CMP.H5 or XML file.
                       (i.e. fileName.outputFileName)
            readType : standard or cDNA or CCS (can be None if not specified)
            smrtTitle: whether to pass -smrtTitle to samtoh5 (CMP.H5 only)
        Output:
            output, errCode, errMsg
        Raises:
            RuntimeError if the step generating the output reports a
            non-zero error code.
        """
        output, errCode, errMsg = "", 0, ""
        # Name of the step reported in the error message; every branch that
        # can set a non-zero errCode overrides it.
        prog = "OutputService"

        outFormat = getFileFormat(outFile)

        if outFormat == FILE_FORMATS.BAM:
            pass  # Nothing to be done
        elif outFormat == FILE_FORMATS.SAM:
            prog = "shutil.move"
            logging.info("OutputService: Genearte the output SAM file.")
            logging.debug("OutputService: Move {src} as {dst}".format(
                src=inSam, dst=outFile))
            try:
                shutil.move(real_ppath(inSam), real_ppath(outFile))
            except shutil.Error as e:
                output, errCode, errMsg = "", 1, str(e)
        elif outFormat == FILE_FORMATS.CMP:
            #`samtoh5 inSam outFile -readType readType
            logging.info("OutputService: Genearte the output CMP.H5 " +
                         "file using samtoh5.")
            prog = "samtoh5"
            # NOTE(review): file names are interpolated unquoted; if
            # backticks() goes through a shell, paths with spaces or shell
            # metacharacters would break this command -- confirm.
            cmd = "samtoh5 {samFile} {refFile} {outFile}".format(
                samFile=inSam, refFile=refFile, outFile=outFile)
            if readType is not None:
                cmd += " -readType {0} ".format(readType)
            if smrtTitle:
                cmd += " -smrtTitle "
            # Execute the command line
            logging.debug("OutputService: Call \"{0}\"".format(cmd))
            output, errCode, errMsg = backticks(cmd)
        elif outFormat == FILE_FORMATS.XML:
            logging.info("OutputService: Generating the output XML file")
            # Create {out}.xml, given {out}.bam
            outBam = str(outFile[0:-3]) + "bam"
            # FIXME This should really be more automatic
            if self.args.readType == "CCS":
                self._output_dataset_type = ConsensusAlignmentSet
            aln = self._output_dataset_type(real_ppath(outBam))
            for res in aln.externalResources:
                res.reference = refFile
            aln.write(outFile)

        if errCode != 0:
            errMsg = prog + " returned a non-zero exit status. " + errMsg
            logging.error(errMsg)
            raise RuntimeError(errMsg)
        return output, errCode, errMsg

    def _cleanUp(self, realDelete=False):
        """ Clean up temporary files and intermediate results. """
        logging.debug("Clean up temporary files and directories.")
        self._tempFileManager.CleanUp(realDelete)

    def run(self):
        """
        The main function, it is called by PBToolRunner.start().

        Output:
            0 on success, 1 if a pipeline stage raises RuntimeError.
        Errors raised while creating the align service or checking the
        options (e.g. ValueError) are not caught here. Once the pipeline has
        started, temporary files are cleaned up whether it succeeds or fails.
        """
        startTime = time.time()
        logging.info(
            "pbalign version: {version}".format(version=get_version()))
        # FIXME
        #logging.debug("Original arguments: " + str(self._argumentList))

        # Create an AlignService by algorithm name.
        self._alnService = self._createAlignService(self.args.algorithm,
                                                    self.args, self.fileNames,
                                                    self._tempFileManager)

        # Make sane.
        self._makeSane(self.args, self.fileNames)

        try:
            # Run align service.
            self._alnService.run()

            # Create a temporary filtered SAM/BAM file as output for
            # FilterService.
            outFormat = getFileFormat(self.fileNames.outputFileName)
            isBamLike = outFormat in [FILE_FORMATS.BAM, FILE_FORMATS.XML]
            self.fileNames.filteredSam = \
                self._tempFileManager.RegisterNewTmpFile(
                    suffix=".bam" if isBamLike else ".sam")

            # Call filter service on SAM or BAM file.
            self._filterService = FilterService(
                self.fileNames.alignerSamOut,
                self.fileNames.targetFileName,
                self.fileNames.filteredSam,
                self.args.algorithm,
                #self._alnService.name,
                self._alnService.scoreSign,
                self.args,
                self.fileNames.adapterGffFileName)
            self._filterService.run()

            # Sort/make index for BAM output before writing it out.
            if isBamLike:
                BamPostService(self.fileNames).run()

            # Output all hits in SAM, BAM or CMP.H5.
            useSmrtTitle = (
                self.args.algorithm != "blasr"
                or self.fileNames.inputFileFormat == FILE_FORMATS.FASTA)
            self._output(inSam=self.fileNames.filteredSam,
                         refFile=self.fileNames.targetFileName,
                         outFile=self.fileNames.outputFileName,
                         readType=self.args.readType,
                         smrtTitle=useSmrtTitle)

            # Load QVs to cmp.h5 for Quiver. The parentheses matter: without
            # them the test parses as (CMP and forQuiver) or loadQVs.
            if outFormat == FILE_FORMATS.CMP and \
                    (self.args.forQuiver or self.args.loadQVs):
                # Call post service for quiver.
                ForQuiverService(self.fileNames, self.args).run()
        except RuntimeError as e:
            logging.error("pbalign failed: {0}".format(e))
            return 1
        finally:
            # Delete temporary files on every exit path unless the user
            # explicitly asked to keep them.
            keepTmpFiles = getattr(self.args, "keepTmpFiles", False) is True
            self._cleanUp(not keepTmpFiles)

        endTime = time.time()
        logging.info("Total time: {:.2f} s.".format(float(endTime -
                                                          startTime)))
        return 0