Example #1
 def run(self, fileStore):
     blastResultsFile = fileStore.getLocalTempFile()
     seqFile = fileStore.readGlobalFile(self.seqFileID)
     runSelfLastz(seqFile,
                  blastResultsFile,
                  lastzArguments=self.blastOptions.lastzArguments,
                  gpuLastz=self.blastOptions.gpuLastz)
     if self.blastOptions.realign:
         realignResultsFile = fileStore.getLocalTempFile()
         runCactusSelfRealign(
             seqFile,
             inputAlignmentsFile=blastResultsFile,
             outputAlignmentsFile=realignResultsFile,
             realignArguments=self.blastOptions.realignArguments)
         blastResultsFile = realignResultsFile
     resultsFile = fileStore.getLocalTempFile()
     cactus_call(parameters=[
         "cactus_blast_convertCoordinates", blastResultsFile, resultsFile,
         str(self.blastOptions.roundsOfCoordinateConversion)
     ])
     if self.blastOptions.compressFiles:
         #TODO: This throws away the compressed file
         seqFile = compressFastaFile(seqFile)
     logger.info("Ran the self blast okay")
     return fileStore.writeGlobalFile(resultsFile)
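All of these job examples share the same Toil fileStore pattern: read a global file onto the worker, transform it in local temp space, and publish the result as a new global file ID. Below is a minimal, self-contained sketch of that pattern; the UppercaseJob class is hypothetical and not part of Cactus.

from toil.job import Job

class UppercaseJob(Job):
    """Hypothetical job: read a global file, transform it, write it back."""
    def __init__(self, inputFileID):
        Job.__init__(self, memory="100M", cores=1, disk="100M")
        self.inputFileID = inputFileID

    def run(self, fileStore):
        # Pull the global file into the worker's local temp space
        localPath = fileStore.readGlobalFile(self.inputFileID)
        outPath = fileStore.getLocalTempFile()
        with open(localPath) as fin, open(outPath, "w") as fout:
            fout.write(fin.read().upper())
        # The returned ID can be handed to downstream jobs as a promise
        return fileStore.writeGlobalFile(outPath)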
Example #2
    def readXML(self, path):
        xmlRoot = ET.parse(path).getroot()
        treeElem = xmlRoot.find("tree")
        self.mcTree = MultiCactusTree(NXNewick().parseString(
            treeElem.text, addImpliedRoots=False))
        self.expMap = dict()
        self.expIDMap = dict()
        cactusPathElemList = xmlRoot.findall("cactus")
        for cactusPathElem in cactusPathElemList:
            nameElem = cactusPathElem.attrib["name"]
            pathElem = cactusPathElem.attrib["experiment_path"]
            self.expMap[nameElem] = pathElem
            if "experiment_id" in cactusPathElem.attrib:
                self.expIDMap[nameElem] = cactusPathElem.attrib[
                    "experiment_id"]
        self.inputSequenceMap = dict(
            zip(xmlRoot.attrib["inputSequenceNames"].split(),
                xmlRoot.attrib["inputSequences"].split()))
        if "inputSequenceIDs" in xmlRoot.attrib:
            self.inputSequenceIDMap = dict(
                zip(xmlRoot.attrib["inputSequenceIDNames"].split(),
                    xmlRoot.attrib["inputSequenceIDs"].split()))
        if "outputSequenceIDs" in xmlRoot.attrib:
            self.outputSequenceIDMap = dict(
                zip(xmlRoot.attrib["outputSequenceNames"].split(),
                    xmlRoot.attrib["outputSequenceIDs"].split()))

        logger.info("xmlRoot = %s" % ET.tostring(xmlRoot))
        if "configID" in xmlRoot.attrib:
            self.configID = xmlRoot.attrib["configID"]

        self.mcTree.assignSubtreeRootNames(self.expMap)
Example #3
    def run(self, fileStore):
        self.configNode = ET.parse(fileStore.readGlobalFile(self.project.getConfigID())).getroot()
        self.configWrapper = ConfigWrapper(self.configNode)
        self.configWrapper.substituteAllPredefinedConstantsWithLiterals()

        # Log the stats for the un-preprocessed assemblies
        for name, sequence in self.project.getInputSequenceIDMap().items():
            self.addChildJobFn(logAssemblyStats, "Before preprocessing", name, sequence)

        # Create jobs to create the output sequences
        logger.info("Reading config file from: %s" % self.project.getConfigID())
        configFile = fileStore.readGlobalFile(self.project.getConfigID())
        configNode = ET.parse(configFile).getroot()
        ConfigWrapper(configNode).substituteAllPredefinedConstantsWithLiterals() # This is necessary.
        #Add the preprocessor child job. The output is a job promise value that will be
        #converted into a list of the IDs of the preprocessed sequences in the follow on job.
        preprocessorJob = self.addChild(CactusPreprocessor(self.project.getInputSequenceIDs(), configNode))
        self.project.setOutputSequenceIDs([preprocessorJob.rv(i) for i in range(len(self.project.getInputSequenceIDs()))])

        #Now build the progressive-down job
        schedule = Schedule()
        schedule.loadProject(self.project, fileStore=fileStore)
        schedule.compute()
        self.options.event = self.project.mcTree.getRootName()
        leafNames = [ self.project.mcTree.getName(i) for i in self.project.mcTree.getLeaves() ]
        fileStore.logToMaster("Leaf names = %s" % leafNames)
        self.options.globalLeafEventSet = set(leafNames)

        return self.addFollowOn(RunCactusPreprocessorThenProgressiveDown2(options=self.options, project=self.project, event=self.options.event, schedule=schedule, memory=self.configWrapper.getDefaultMemory())).rv()
Example #4
    def run(self, fileStore):
        seqFile1 = fileStore.readGlobalFile(self.seqFileID1)
        seqFile2 = fileStore.readGlobalFile(self.seqFileID2)
        if self.blastOptions.compressFiles:
            seqFile1 = decompressFastaFile(seqFile1,
                                           fileStore.getLocalTempFile())
            seqFile2 = decompressFastaFile(seqFile2,
                                           fileStore.getLocalTempFile())
        blastResultsFile = fileStore.getLocalTempFile()
        runLastz(seqFile1,
                 seqFile2,
                 blastResultsFile,
                 lastzArguments=self.blastOptions.lastzArguments,
                 gpuLastz=self.blastOptions.gpuLastz)
        if self.blastOptions.realign:
            realignResultsFile = fileStore.getLocalTempFile()
            runCactusRealign(
                seqFile1,
                seqFile2,
                inputAlignmentsFile=blastResultsFile,
                outputAlignmentsFile=realignResultsFile,
                realignArguments=self.blastOptions.realignArguments)
            blastResultsFile = realignResultsFile

        resultsFile = fileStore.getLocalTempFile()
        cactus_call(parameters=[
            "cactus_blast_convertCoordinates", blastResultsFile, resultsFile,
            str(self.blastOptions.roundsOfCoordinateConversion)
        ])
        logger.info("Ran the blast okay")
        return fileStore.writeGlobalFile(resultsFile)
Example #5
def importSingularityImage():
    """Import the Singularity image from Docker if using Singularity."""
    mode = os.environ.get("CACTUS_BINARIES_MODE", "docker")
    localImage = os.environ.get("CACTUS_USE_LOCAL_SINGULARITY_IMG", "0")
    if mode == "singularity":
        imgPath = os.environ["CACTUS_SINGULARITY_IMG"]
        # If not using local image, pull the docker image
        if localImage == "0":
            # Singularity will complain if the image file already exists. Remove it.
            try:
                os.remove(imgPath)
            except OSError:
                # File doesn't exist
                pass
            # Singularity 2.4 broke the functionality that let --name
            # point to a path instead of a name in the CWD. So we change
            # to the proper directory manually, then change back after the
            # image is pulled.
            # NOTE: singularity writes images in the current directory only
            #       when SINGULARITY_CACHEDIR is not set
            oldCWD = os.getcwd()
            os.chdir(os.path.dirname(imgPath))
            # --size is deprecated starting in 2.4, but is needed for 2.3 support. Keeping it in for now.
            try:
                check_call(["singularity", "pull", "--size", "2000", "--name", os.path.basename(imgPath),
                            "docker://" + getDockerImage()])
            except CalledProcessError:
                # Call failed, try without --size, required for singularity 3+
                check_call(["singularity", "pull", "--name", os.path.basename(imgPath),
                            "docker://" + getDockerImage()])
            os.chdir(oldCWD)
        else:
            logger.info("Using pre-built singularity image: '{}'".format(imgPath))
Example #6
def split_minimap_fallback(job, options, config, seqIDMap, output_id_map):
    """ take the output table from gather_fas, pull out the ambiguous sequences, remap them to the reference, and 
    add them to the events where possible"""

    # can't do anything without a reference
    if not options.reference:
        logger.info("Skipping minimap2 fallback as --reference was not specified")
        return None, None
    # todo: also skip if no ambiguous sequences
    
    ref_path, ref_id = seqIDMap[options.reference]
    mm_mem = ref_id.size * 5
    if seqIDMap[options.reference][0].endswith('.gz'):
        mm_mem *= 4
    mm_index_job = job.addChildJobFn(minimap_index, ref_path, ref_id, disk=ref_id.size * 5, memory=mm_mem)
    mm_map_root_job = Job()
    mm_index_job.addFollowOn(mm_map_root_job)
    
    amb_name = getOptionalAttrib(findRequiredNode(config.xmlRoot, "graphmap_split"), "ambiguousName", default="_AMBIGUOUS_")

    if amb_name not in output_id_map:
        logger.info("Skipping minmap2 fallback as no ambigious sequences found")
        return None, None

    # map every ambiguous sequence against the reference in parallel
    paf_ids = []
    ambiguous_seq_id_map = {}
    for event, fa_id in output_id_map[amb_name]['fa'].items():
        paf_job = mm_map_root_job.addChildJobFn(minimap_map, mm_index_job.rv(), event, fa_id, seqIDMap[event][0],
                                                disk=ref_id.size * 3, memory=mm_mem)
        paf_ids.append(paf_job.rv())
        ambiguous_seq_id_map[event] = (seqIDMap[event][0], fa_id)

    return paf_ids, ambiguous_seq_id_map
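The mm_index_job.rv() handed to minimap_map above is a Toil promise: a placeholder that is resolved to the child job's return value before the follow-on runs. A minimal sketch of the pattern, using hypothetical job functions:

def make_index(job, ref_fasta):
    return ref_fasta + ".idx"  # stand-in for real index construction

def map_reads(job, index, reads):
    return "%s mapped against %s" % (reads, index)

def root(job, ref_fasta, reads):
    index_job = job.addChildJobFn(make_index, ref_fasta)
    # index_job.rv() can be passed to map_reads before make_index has run
    return index_job.addFollowOnJobFn(map_reads, index_job.rv(), reads).rv()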
Example #7
    def run(self, fileStore):
        self.configNode = ET.parse(fileStore.readGlobalFile(self.project.getConfigID())).getroot()
        self.configWrapper = ConfigWrapper(self.configNode)
        self.configWrapper.substituteAllPredefinedConstantsWithLiterals()

        fileStore.logToMaster("Using the following configuration:\n%s" % ET.tostring(self.configNode))

        # Log the stats for the un-preprocessed assemblies
        for name, sequence in self.project.getInputSequenceIDMap().items():
            self.addChildJobFn(logAssemblyStats, "Before preprocessing", name, sequence)

        # Create jobs to create the output sequences
        logger.info("Reading config file from: %s" % self.project.getConfigID())
        configFile = fileStore.readGlobalFile(self.project.getConfigID())
        configNode = ET.parse(configFile).getroot()
        ConfigWrapper(configNode).substituteAllPredefinedConstantsWithLiterals() # This is necessary.
        #Add the preprocessor child job. The output is a job promise value that will be
        #converted into a list of the IDs of the preprocessed sequences in the follow on job.
        preprocessorJob = self.addChild(CactusPreprocessor(self.project.getInputSequenceIDs(), configNode))
        self.project.setOutputSequenceIDs([preprocessorJob.rv(i) for i in range(len(self.project.getInputSequenceIDs()))])

        #Now build the progressive-down job
        schedule = Schedule()
        schedule.loadProject(self.project, fileStore=fileStore)
        schedule.compute()
        self.options.event = self.project.mcTree.getRootName()
        leafNames = [ self.project.mcTree.getName(i) for i in self.project.mcTree.getLeaves() ]
        fileStore.logToMaster("Leaf names = %s" % leafNames)
        self.options.globalLeafEventSet = set(leafNames)

        return self.addFollowOn(RunCactusPreprocessorThenProgressiveDown2(options=self.options, project=self.project, event=self.options.event, schedule=schedule, memory=self.configWrapper.getDefaultMemory())).rv()
Example #8
    def run(self, fileStore):
        logger.info("Preparing sequence for preprocessing")

        inSequence = fileStore.readGlobalFile(self.inSequenceID)

        if self.prepOptions.chunkSize <= 0:
            # In this first case we don't need to break up the sequence
            chunked = False
            inChunkList = [inSequence]
        else:
            # chunk it up
            chunked = True
            inChunkDirectory = getTempDirectory(
                rootDir=fileStore.getLocalTempDir())
            inChunkList = runGetChunks(sequenceFiles=[inSequence],
                                       chunksDir=inChunkDirectory,
                                       chunkSize=self.prepOptions.chunkSize,
                                       overlapSize=0)
            inChunkList = [os.path.abspath(path) for path in inChunkList]
        logger.info("Chunks = %s" % inChunkList)

        inChunkIDList = [
            fileStore.writeGlobalFile(chunk, cleanup=True)
            for chunk in inChunkList
        ]
        outChunkIDList = []
        #For each input chunk we create an output chunk, it is the output chunks that get concatenated together.
        if not self.chunksToCompute:
            self.chunksToCompute = list(range(len(inChunkList)))
        for i in self.chunksToCompute:
            #Calculate the number of chunks to use
            inChunkNumber = int(
                max(
                    1,
                    math.ceil(
                        len(inChunkList) *
                        self.prepOptions.proportionToSample)))
            assert inChunkNumber <= len(inChunkList) and inChunkNumber > 0
            #Now get the list of chunks flanking and including the current chunk
            j = max(0, i - inChunkNumber // 2)
            inChunkIDs = inChunkIDList[j:j + inChunkNumber]
            if len(inChunkIDs) < inChunkNumber:  # This logic is like making the list circular
                inChunkIDs += inChunkIDList[:inChunkNumber - len(inChunkIDs)]
            assert len(inChunkIDs) == inChunkNumber
            outChunkIDList.append(
                self.addChild(
                    self.getChunkedJobForCurrentStage(
                        inChunkIDs,
                        float(inChunkNumber) / len(inChunkIDList),
                        inChunkIDList[i])).rv())

        if chunked:
            # Merge results of the chunking process back into a genome-wide file
            return self.addFollowOn(
                MergeChunks(self.prepOptions, outChunkIDList)).rv()
        else:
            # Didn't chunk--we have a genome-wide fasta file
            return outChunkIDList[0]
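The flanking-chunk selection above is easier to see in isolation. A self-contained sketch of the same circular-window logic, assuming only a chunk list, an index i, and a window size n:

def circular_window(chunks, i, n):
    """Return n chunks flanking (and including) index i, wrapping past the end."""
    j = max(0, i - n // 2)
    window = chunks[j:j + n]
    if len(window) < n:  # ran off the end of the list: wrap to the front
        window += chunks[:n - len(window)]
    return window

assert circular_window(list("abcde"), 4, 3) == ["d", "e", "a"]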
Example #9
    def run(self, fileStore):
        sequenceFiles1 = [
            fileStore.readGlobalFile(fileID)
            for fileID in self.sequenceFileIDs1
        ]
        chunks = runGetChunks(
            sequenceFiles=sequenceFiles1,
            chunksDir=getTempDirectory(rootDir=fileStore.getLocalTempDir()),
            chunkSize=self.blastOptions.chunkSize,
            overlapSize=self.blastOptions.overlapSize)
        assert len(chunks) > 0
        logger.info(
            "Broken up the sequence files into individual 'chunk' files")
        chunkIDs = [
            fileStore.writeGlobalFile(chunk, cleanup=True) for chunk in chunks
        ]

        diagonalResultsID = self.addChild(
            MakeSelfBlasts(self.blastOptions, chunkIDs)).rv()
        offDiagonalResultsID = self.addChild(
            MakeOffDiagonalBlasts(self.blastOptions, chunkIDs)).rv()
        logger.debug("Collating the blasts after blasting all-against-all")
        return self.addFollowOn(
            CollateBlasts(self.blastOptions,
                          [diagonalResultsID, offDiagonalResultsID])).rv()
Example #10
    def run(self, fileStore):
        sequenceFiles1 = [
            fileStore.readGlobalFile(fileID)
            for fileID in self.sequenceFileIDs1
        ]
        if self.blastOptions.gpuLastz:
            # wga-gpu has a 3G limit.
            self.blastOptions.chunkSize = 3000000000
        chunks = runGetChunks(
            sequenceFiles=sequenceFiles1,
            chunksDir=getTempDirectory(rootDir=fileStore.getLocalTempDir()),
            chunkSize=self.blastOptions.chunkSize,
            overlapSize=self.blastOptions.overlapSize)
        if len(chunks) == 0:
            raise Exception(
                "no chunks produced for files: {} ".format(sequenceFiles1))
        logger.info(
            "Broken up the sequence files into individual 'chunk' files")
        chunkIDs = [
            fileStore.writeGlobalFile(chunk, cleanup=True) for chunk in chunks
        ]

        diagonalResultsID = self.addChild(
            MakeSelfBlasts(self.blastOptions, chunkIDs)).rv()
        offDiagonalResultsID = self.addChild(
            MakeOffDiagonalBlasts(self.blastOptions, chunkIDs)).rv()
        logger.debug("Collating the blasts after blasting all-against-all")
        return self.addFollowOn(
            CollateBlasts(self.blastOptions,
                          [diagonalResultsID, offDiagonalResultsID])).rv()
Example #11
    def run(self, fileStore):
        prepXmlElems = self.configNode.findall("preprocessor")

        if len(prepXmlElems) == 0: #Just cp the file to the output file
            return self.inputSequenceID
        else:
            logger.info("Adding child batch_preprocessor target")
            return self.addChild(BatchPreprocessor(prepXmlElems, self.inputSequenceID, 0)).rv()
Example #13
 def run(self, fileStore):
     logger.info("Results IDs: %s" % self.resultsFileIDs)
     resultsFiles = [readGlobalFileWithoutCache(fileStore, fileID) for fileID in self.resultsFileIDs]
     collatedResultsFile = fileStore.getLocalTempFile()
     catFiles(resultsFiles, collatedResultsFile)
     logger.info("Collated the alignments to the file: %s",  collatedResultsFile)
     collatedResultsID = fileStore.writeGlobalFile(collatedResultsFile)
     for resultsFileID in self.resultsFileIDs:
         fileStore.deleteGlobalFile(resultsFileID)
     return collatedResultsID
Example #15
def calculateCoverage(sequenceFile, cigarFile, outputFile, fromGenome=None, depthById=False, work_dir=None):
    logger.info("Calculating coverage of cigar file %s on %s, writing to %s" % (
        cigarFile, sequenceFile, outputFile))
    args = [sequenceFile, cigarFile]
    if fromGenome is not None:
        args += ["--from", fromGenome]
    if depthById:
        args += ["--depthById"]
    cactus_call(outfile=outputFile, work_dir=work_dir,
                parameters=["cactus_coverage"] + args)
Example #17
    def run(self, fileStore):
        sequenceFiles1 = [fileStore.readGlobalFile(fileID) for fileID in self.sequenceFileIDs1]
        chunks = runGetChunks(sequenceFiles=sequenceFiles1, chunksDir=getTempDirectory(rootDir=fileStore.getLocalTempDir()), chunkSize = self.blastOptions.chunkSize, overlapSize=self.blastOptions.overlapSize)
        assert len(chunks) > 0
        logger.info("Broken up the sequence files into individual 'chunk' files")
        chunkIDs = [fileStore.writeGlobalFile(chunk, cleanup=True) for chunk in chunks]

        diagonalResultsID = self.addChild(MakeSelfBlasts(self.blastOptions, chunkIDs)).rv()
        offDiagonalResultsID = self.addChild(MakeOffDiagonalBlasts(self.blastOptions, chunkIDs)).rv()
        logger.debug("Collating the blasts after blasting all-against-all")
        return self.addFollowOn(CollateBlasts(self.blastOptions, [diagonalResultsID, offDiagonalResultsID])).rv()
Example #18
def main():
    parser = ArgumentParser()
    Job.Runner.addToilOptions(parser)
    addCactusWorkflowOptions(parser)

    parser.add_argument("seqFile", help = "Seq file (will be modified if necessary to include graph Fasta sequence)")
    parser.add_argument("minigraphGFA", help = "Minigraph-compatible reference graph in GFA format (can be gzipped)")
    parser.add_argument("outputPAF", type=str, help = "Output pairwise alignment file in PAF format")
    parser.add_argument("--outputFasta", type=str, help = "Output graph sequence file in FASTA format (required if not present in seqFile)")
    parser.add_argument("--maskFilter", type=int, help = "Ignore softmasked sequence intervals > Nbp (overrides config option of same name)")    
    parser.add_argument("--outputGAFDir", type=str, help = "Output GAF alignments (raw minigraph output before PAF conversion) to this directory")
    parser.add_argument("--refFromGFA", type=str, help = "Do not align given genome from seqfile, and instead extract its alignment from the rGFA tags (must have been used as reference for minigraph GFA construction)")

    #WDL hacks
    parser.add_argument("--pathOverrides", nargs="*", help="paths (multiple allowed) to override from seqFile")
    parser.add_argument("--pathOverrideNames", nargs="*", help="names (must be same number as --pathOverrides) of path overrides")

    #Progressive Cactus Options
    parser.add_argument("--configFile", dest="configFile",
                        help="Specify cactus configuration file",
                        default=os.path.join(cactusRootPath(), "cactus_progressive_config.xml"))
    parser.add_argument("--latest", dest="latest", action="store_true",
                        help="Use the latest version of the docker container "
                        "rather than pulling one matching this version of cactus")
    parser.add_argument("--containerImage", dest="containerImage", default=None,
                        help="Use the the specified pre-built containter image "
                        "rather than pulling one from quay.io")
    parser.add_argument("--binariesMode", choices=["docker", "local", "singularity"],
                        help="The way to run the Cactus binaries", default=None)

    options = parser.parse_args()

    setupBinaries(options)
    setLoggingFromOptions(options)
    enableDumpStack()

    if options.outputGAFDir:
        if not os.path.isdir(options.outputGAFDir):
            os.makedirs(options.outputGAFDir)

    if (options.pathOverrides or options.pathOverrideNames):
        if not options.pathOverrides or not options.pathOverrideNames or \
           len(options.pathOverrideNames) != len(options.pathOverrides):
            raise RuntimeError('same number of values must be passed to --pathOverrides and --pathOverrideNames')

    # Mess with some toil options to create useful defaults.
    cactus_override_toil_options(options)

    start_time = timeit.default_timer()
    runCactusGraphMap(options)
    end_time = timeit.default_timer()
    run_time = end_time - start_time
    logger.info("cactus-graphmap has finished after {} seconds".format(run_time))
Example #19
def stageWorkflow(outputSequenceDir,
                  configFile,
                  inputSequences,
                  toil,
                  restart=False,
                  outputSequences=None,
                  maskAlpha=False,
                  clipAlpha=None):
    #Replace any constants
    configNode = ET.parse(configFile).getroot()
    if not outputSequences:
        outputSequences = CactusPreprocessor.getOutputSequenceFiles(
            inputSequences, outputSequenceDir)
    else:
        assert len(outputSequences) == len(inputSequences)

    # Make sure we have the dna-brnn model in the filestore if we need it
    loadDnaBrnnModel(toil, ET.parse(configFile).getroot(), maskAlpha=maskAlpha)

    if configNode.find("constants") is not None:
        ConfigWrapper(
            configNode).substituteAllPredefinedConstantsWithLiterals()
    if maskAlpha or clipAlpha:
        ConfigWrapper(configNode).setPreprocessorActive(
            "lastzRepeatMask", False)
        ConfigWrapper(configNode).setPreprocessorActive("dna-brnn", True)
        if clipAlpha:
            for node in configNode.findall("preprocessor"):
                if getOptionalAttrib(node, "preprocessJob") == 'dna-brnn':
                    node.attrib["action"] = "clip"
                    node.attrib["minLength"] = clipAlpha
                    node.attrib["mergeLength"] = clipAlpha

    if not restart:
        inputSequenceIDs = []
        for seq in inputSequences:
            logger.info("Importing {}".format(seq))
            inputSequenceIDs.append(toil.importFile(makeURL(seq)))
        unzip_job = Job.wrapJobFn(unzip_then_pp, configNode, inputSequences,
                                  inputSequenceIDs)
        outputSequenceIDs = toil.start(unzip_job)
    else:
        outputSequenceIDs = toil.restart()
    for seqID, path in zip(outputSequenceIDs, outputSequences):
        try:
            iter(seqID)
            # dna-brnn will output a couple of bed files.  we scrape those out here
            toil.exportFile(seqID[0], makeURL(path))
            toil.exportFile(seqID[1], makeURL(path) + '.bed')
            toil.exportFile(seqID[2], makeURL(path) + '.mask.bed')
        except TypeError:
            # seqID was not iterable: a single sequence ID with no bed files
            toil.exportFile(seqID, makeURL(path))
Example #20
def main():
    parser = ArgumentParser()
    Job.Runner.addToilOptions(parser)
    addCactusWorkflowOptions(parser)

    parser.add_argument("--vg", required=True, nargs='+',  help = "Input vg files (PackedGraph or HashGraph format)")
    parser.add_argument("--outDir", required=True, type=str, help = "Output directory")
    parser.add_argument("--outName", required=True, type=str, help = "Basename of all output files")
    parser.add_argument("--reference", required=True, type=str, help = "Reference event name")
    parser.add_argument("--vcfReference", type=str, help = "Reference event for VCF (if different from --reference)")
    parser.add_argument("--rename", nargs='+', default = [], help = "Path renaming, each of form src>dest (see clip-vg -r)")
    parser.add_argument("--clipLength", type=int, default=None, help = "clip out unaligned sequences longer than this")
    parser.add_argument("--wlineSep", type=str, help = "wline separator for vg convert")
    parser.add_argument("--indexCores", type=int, default=1, help = "cores for indexing processes")
    parser.add_argument("--decoyGraph", help= "decoy sequences vg graph to add (PackedGraph or HashGraph format)")
    parser.add_argument("--hal", nargs='+', default = [], help = "Input hal files (for merging)")
    
    #Progressive Cactus Options
    parser.add_argument("--configFile", dest="configFile",
                        help="Specify cactus configuration file",
                        default=os.path.join(cactusRootPath(), "cactus_progressive_config.xml"))
    parser.add_argument("--latest", dest="latest", action="store_true",
                        help="Use the latest version of the docker container "
                        "rather than pulling one matching this version of cactus")
    parser.add_argument("--containerImage", dest="containerImage", default=None,
                        help="Use the the specified pre-built containter image "
                        "rather than pulling one from quay.io")
    parser.add_argument("--binariesMode", choices=["docker", "local", "singularity"],
                        help="The way to run the Cactus binaries", default=None)

    options = parser.parse_args()

    setupBinaries(options)
    setLoggingFromOptions(options)
    enableDumpStack()

    if options.outDir and not options.outDir.startswith('s3://'):
        if not os.path.isdir(options.outDir):
            os.makedirs(options.outDir)

    if options.hal and len(options.hal) != len(options.vg):
        raise RuntimeError("If --hal and --vg should specify the same number of files")
        
    # Mess with some toil options to create useful defaults.
    cactus_override_toil_options(options)

    start_time = timeit.default_timer()
    runCactusGraphMapJoin(options)
    end_time = timeit.default_timer()
    run_time = end_time - start_time
    logger.info("cactus-graphmap-join has finished after {} seconds".format(run_time))
Example #21
    def run(self, fileStore):
        self.configNode = ET.parse(fileStore.readGlobalFile(self.project.getConfigID())).getroot()
        self.configWrapper = ConfigWrapper(self.configNode)
        self.configWrapper.substituteAllPredefinedConstantsWithLiterals()

        if not self.schedule.isVirtual(self.event):
            tmpExp = fileStore.getLocalTempFile()
            self.eventExpWrapper.writeXML(tmpExp)
            self.project.expIDMap[self.event] = fileStore.writeGlobalFile(tmpExp)
        followOnEvent = self.schedule.followOn(self.event)
        if followOnEvent is not None:
            logger.info("Adding follow-on event %s" % followOnEvent)
            return self.addFollowOn(ProgressiveDown(self.options, self.project, followOnEvent,
                                                    self.schedule, memory=self.configWrapper.getDefaultMemory())).rv()

        return self.project
Example #22
 def run(self, fileStore):
     logger.info("Results IDs: %s" % self.resultsFileIDs)
     resultsFiles = [
         readGlobalFileWithoutCache(fileStore, fileID)
         for fileID in self.resultsFileIDs
     ]
     collatedResultsFile = fileStore.getLocalTempFile()
     catFiles(resultsFiles, collatedResultsFile)
     logger.info("Collated the alignments to the file: %s",
                 collatedResultsFile)
     collatedResultsID = fileStore.writeGlobalFile(collatedResultsFile)
     for i in range(0, len(self.resultsFileIDs), self.delete_batch_size):
         self.addChild(
             DeleteFileIDs(self.resultsFileIDs[i:i +
                                               self.delete_batch_size]))
     return collatedResultsID
Example #23
    def blockUntilServerIsRunning(self, createTimeout=1800):
        """Check status until it's successful, an error is found, or we timeout.

        Returns True if the redis-server is now running, False if something went wrong."""
        success = False
        for i in range(createTimeout):
            if self.isServerFailed():
                logger.critical('Error starting Redis server.')
                success = False
                break
            if self.isServerRunning():
                logger.info('Redis server running.')
                success = True
                break
            sleep(1)
        return success
Example #24
def blockUntilKtserverIsRunning(logPath, createTimeout=1800):
    """Check status until it's successful, an error is found, or we timeout.

    Returns True if the ktserver is now running, False if something went wrong."""
    success = False
    for i in range(createTimeout):
        if isKtServerFailed(logPath):
            logger.critical('Error starting ktserver.')
            success = False
            break
        if isKtServerRunning(logPath):
            logger.info('Ktserver running.')
            success = True
            break
        sleep(1)
    return success
Example #26
def main():
    parser = ArgumentParser()
    Job.Runner.addToilOptions(parser)
    addCactusWorkflowOptions(parser)

    parser.add_argument("seqFile", help = "Seq file (gzipped fastas supported)")
    parser.add_argument("minigraphGFA", help = "Minigraph-compatible reference graph in GFA format (can be gzipped)")
    parser.add_argument("graphmapPAF", type=str, help = "Output pairwise alignment file in PAF format (can be gzipped)")
    parser.add_argument("--outDir", required=True, type=str, help = "Output directory")
    parser.add_argument("--refContigs", nargs="*", help = "Subset to these reference contigs (multiple allowed)", default=[])
    parser.add_argument("--refContigsFile", type=str, help = "Subset to (newline-separated) reference contigs in this file")
    parser.add_argument("--otherContig", type=str, help = "Lump all reference contigs unselected by above options into single one with this name")
    parser.add_argument("--reference", type=str, help = "Name of reference (in seqFile).  Ambiguity filters will not be applied to it")
    parser.add_argument("--maskFilter", type=int, help = "Ignore softmasked sequence intervals > Nbp")
    
    #Progressive Cactus Options
    parser.add_argument("--configFile", dest="configFile",
                        help="Specify cactus configuration file",
                        default=os.path.join(cactusRootPath(), "cactus_progressive_config.xml"))
    parser.add_argument("--latest", dest="latest", action="store_true",
                        help="Use the latest version of the docker container "
                        "rather than pulling one matching this version of cactus")
    parser.add_argument("--containerImage", dest="containerImage", default=None,
                        help="Use the the specified pre-built containter image "
                        "rather than pulling one from quay.io")
    parser.add_argument("--binariesMode", choices=["docker", "local", "singularity"],
                        help="The way to run the Cactus binaries", default=None)

    options = parser.parse_args()

    setupBinaries(options)
    setLoggingFromOptions(options)
    enableDumpStack()

    if options.outDir and not options.outDir.startswith('s3://'):
        if not os.path.isdir(options.outDir):
            os.makedirs(options.outDir)
        
    # Mess with some toil options to create useful defaults.
    cactus_override_toil_options(options)

    start_time = timeit.default_timer()
    runCactusGraphMapSplit(options)
    end_time = timeit.default_timer()
    run_time = end_time - start_time
    logger.info("cactus-graphmap-split has finished after {} seconds".format(run_time))
Example #27
    def run(self, fileStore):
        self.configNode = ET.parse(fileStore.readGlobalFile(self.project.getConfigID())).getroot()
        self.configWrapper = ConfigWrapper(self.configNode)
        self.configWrapper.substituteAllPredefinedConstantsWithLiterals()
        logger.info("Progressive Down: " + self.event)

        depProjects = dict()
        deps = self.schedule.deps(self.event)
        fileStore.logToMaster("There are %i dependent projects" % len(deps))
        for child in deps:
            fileStore.logToMaster("Adding dependent project %s" % child)
            depProjects[child] = self.addChild(ProgressiveDown(self.options,
                                                               self.project, child,
                                                               self.schedule)).rv()

        return self.addFollowOn(ProgressiveNext(self.options, self.project, self.event,
                                                              self.schedule, depProjects, memory=self.configWrapper.getDefaultMemory())).rv()
Example #29
 def run(self, fileStore):
     sequenceFiles1 = [fileStore.readGlobalFile(fileID) for fileID in self.sequenceFileIDs1]
     sequenceFiles2 = [fileStore.readGlobalFile(fileID) for fileID in self.sequenceFileIDs2]
     chunks1 = runGetChunks(sequenceFiles=sequenceFiles1, chunksDir=getTempDirectory(rootDir=fileStore.getLocalTempDir()), chunkSize=self.blastOptions.chunkSize, overlapSize=self.blastOptions.overlapSize)
     chunks2 = runGetChunks(sequenceFiles=sequenceFiles2, chunksDir=getTempDirectory(rootDir=fileStore.getLocalTempDir()), chunkSize=self.blastOptions.chunkSize, overlapSize=self.blastOptions.overlapSize)
     chunkIDs1 = [fileStore.writeGlobalFile(chunk, cleanup=True) for chunk in chunks1]
     chunkIDs2 = [fileStore.writeGlobalFile(chunk, cleanup=True) for chunk in chunks2]
     resultsIDs = []
     #Make the list of blast jobs.
     for chunkID1 in chunkIDs1:
         for chunkID2 in chunkIDs2:
             #TODO: Make the compression work
             self.blastOptions.compressFiles = False
             resultsIDs.append(self.addChild(RunBlast(self.blastOptions, chunkID1, chunkID2)).rv())
     logger.info("Made the list of blasts")
     #Set up the job to collate all the results
     return self.addFollowOn(CollateBlasts(self.blastOptions, resultsIDs)).rv()
Example #31
    def run(self, fileStore):
        # Parse the "preprocessor" config xml element
        assert self.iteration < len(self.prepXmlElems)

        lastIteration = self.iteration == len(self.prepXmlElems) - 1

        prepNode = self.prepXmlElems[self.iteration]
        if getOptionalAttrib(prepNode, "active", typeFn = bool, default=True):
            prepOptions = PreprocessorOptions(chunkSize = int(prepNode.get("chunkSize", default="-1")),
                                              preprocessJob=prepNode.attrib["preprocessJob"],
                                              memory = int(prepNode.get("memory", default=0)),
                                              cpu = int(prepNode.get("cpu", default=1)),
                                              check = bool(int(prepNode.get("check", default="0"))),
                                              proportionToSample = getOptionalAttrib(prepNode, "proportionToSample", typeFn=float, default=1.0),
                                              unmask = getOptionalAttrib(prepNode, "unmask", typeFn=bool, default=False),
                                              lastzOptions = getOptionalAttrib(prepNode, "lastzOpts", default=""),
                                              minPeriod = getOptionalAttrib(prepNode, "minPeriod", typeFn=int, default=0),
                                              checkAssemblyHub = getOptionalAttrib(prepNode, "checkAssemblyHub", typeFn=bool, default=False),
                                              gpuLastz = getOptionalAttrib(prepNode, "gpuLastz", typeFn=bool, default=False),
                                              dnabrnnOpts = getOptionalAttrib(prepNode, "dna-brnnOpts", default=""),
                                              dnabrnnLength = getOptionalAttrib(prepNode, "minLength", typeFn=int, default=1),
                                              dnabrnnMerge = getOptionalAttrib(prepNode, "mergeLength", typeFn=int, default=0),
                                              dnabrnnAction = getOptionalAttrib(prepNode, "action", typeFn=str, default="softmask"),
                                              dnabrnnInputBedID = getOptionalAttrib(prepNode, "inputBedID", typeFn=str, default=None),
                                              dnabrnnEventName = getOptionalAttrib(prepNode, "eventName", typeFn=str, default=None),
                                              cutBefore = getOptionalAttrib(prepNode, "cutBefore", typeFn=str, default=None),
                                              cutAfter = getOptionalAttrib(prepNode, "cutAfter", typeFn=str, default=None))

            if prepOptions.unmask:
                inSequence = fileStore.readGlobalFile(self.inSequenceID)
                unmaskedInputFile = fileStore.getLocalTempFile()
                unmaskFasta(inSequence, unmaskedInputFile)
                self.inSequenceID = fileStore.writeGlobalFile(unmaskedInputFile)

            outSeqID = self.addChild(PreprocessSequence(prepOptions, self.inSequenceID)).rv()
        else:
            logger.info("Skipping inactive preprocessor {}".format(prepNode.attrib["preprocessJob"]))
            outSeqID = self.inSequenceID

        if not lastIteration:
            return self.addFollowOn(BatchPreprocessor(self.prepXmlElems, outSeqID, self.iteration + 1)).rv()
        else:
            return outSeqID
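The BatchPreprocessor run method above handles one <preprocessor> config element per job and chains the next element as a follow-on, so stages execute strictly in sequence while each stage's own children can still fan out. The recursion pattern in isolation, with a hypothetical list of stage callables:

from toil.job import Job

class StageJob(Job):
    """Run stage i on a file ID, then schedule stage i+1 as a follow-on."""
    def __init__(self, stages, dataID, i=0):
        Job.__init__(self)
        self.stages, self.dataID, self.i = stages, dataID, i

    def run(self, fileStore):
        outID = self.stages[self.i](self.dataID, fileStore)  # run this stage
        if self.i + 1 < len(self.stages):
            return self.addFollowOn(StageJob(self.stages, outID, self.i + 1)).rv()
        return outID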
Example #32
    def run(self, fileStore):
        logger.info("Preparing sequence for preprocessing")

        inSequence = fileStore.readGlobalFile(self.inSequenceID)

        if self.prepOptions.chunkSize <= 0:
            # In this first case we don't need to break up the sequence
            chunked = False
            inChunkList = [inSequence]
        else:
            # chunk it up
            chunked = True
            inChunkDirectory = getTempDirectory(rootDir=fileStore.getLocalTempDir())
            inChunkList = runGetChunks(sequenceFiles=[inSequence], chunksDir=inChunkDirectory,
                                       chunkSize=self.prepOptions.chunkSize,
                                       overlapSize=0)
            inChunkList = [os.path.abspath(path) for path in inChunkList]
        logger.info("Chunks = %s" % inChunkList)

        inChunkIDList = [fileStore.writeGlobalFile(chunk, cleanup=True) for chunk in inChunkList]
        outChunkIDList = []
        #For each input chunk we create an output chunk, it is the output chunks that get concatenated together.
        if not self.chunksToCompute:
            self.chunksToCompute = list(range(len(inChunkList)))
        for i in self.chunksToCompute:
            #Calculate the number of chunks to use
            inChunkNumber = int(max(1, math.ceil(len(inChunkList) * self.prepOptions.proportionToSample)))
            assert inChunkNumber <= len(inChunkList) and inChunkNumber > 0
            #Now get the list of chunks flanking and including the current chunk
            j = max(0, i - inChunkNumber//2)
            inChunkIDs = inChunkIDList[j:j+inChunkNumber]
            if len(inChunkIDs) < inChunkNumber: #This logic is like making the list circular
                inChunkIDs += inChunkIDList[:inChunkNumber-len(inChunkIDs)]
            assert len(inChunkIDs) == inChunkNumber
            outChunkIDList.append(self.addChild(self.getChunkedJobForCurrentStage(inChunkIDs, float(inChunkNumber)/len(inChunkIDList), inChunkIDList[i])).rv())

        if chunked:
            # Merge results of the chunking process back into a genome-wide file
            return self.addFollowOn(MergeChunks(self.prepOptions, outChunkIDList)).rv()
        else:
            # Didn't chunk--we have a genome-wide fasta file
            return outChunkIDList[0]
Example #33
 def run(self, fileStore):   
     blastResultsFile = fileStore.getLocalTempFile()
     seqFile = fileStore.readGlobalFile(self.seqFileID)
     runSelfLastz(seqFile, blastResultsFile, lastzArguments=self.blastOptions.lastzArguments)
     if self.blastOptions.realign:
         realignResultsFile = fileStore.getLocalTempFile()
         runCactusSelfRealign(seqFile, inputAlignmentsFile=blastResultsFile,
                              outputAlignmentsFile=realignResultsFile,
                              realignArguments=self.blastOptions.realignArguments)
         blastResultsFile = realignResultsFile
     resultsFile = fileStore.getLocalTempFile()
     cactus_call(parameters=["cactus_blast_convertCoordinates",
                             blastResultsFile,
                             resultsFile,
                             str(self.blastOptions.roundsOfCoordinateConversion)])
     if self.blastOptions.compressFiles:
         #TODO: This throws away the compressed file
         seqFile = compressFastaFile(seqFile)
     logger.info("Ran the self blast okay")
     return fileStore.writeGlobalFile(resultsFile)
Example #34
    def run(self, fileStore):
        self.configNode = ET.parse(fileStore.readGlobalFile(self.project.getConfigID())).getroot()
        self.configWrapper = ConfigWrapper(self.configNode)
        self.configWrapper.substituteAllPredefinedConstantsWithLiterals()

        fileStore.logToMaster("Project has %i dependencies" % len(self.depProjects))
        for projName in self.depProjects:
            depProject = self.depProjects[projName]
            for expName in depProject.expIDMap: 
                expID = depProject.expIDMap[expName]
                experiment = ExperimentWrapper(ET.parse(fileStore.readGlobalFile(expID)).getroot())
                fileStore.logToMaster("Reference ID for experiment %s: %s" % (expName, experiment.getReferenceID()))
                if experiment.getReferenceID():
                    self.project.expIDMap[expName] = expID
                    self.project.outputSequenceIDMap[expName] = experiment.getReferenceID()
                        
        eventExpWrapper = None
        logger.info("Progressive Next: " + self.event)
        if not self.schedule.isVirtual(self.event):
            eventExpWrapper = self.addChild(ProgressiveUp(self.options, self.project, self.event, memory=self.configWrapper.getDefaultMemory())).rv()
        return self.addFollowOn(ProgressiveOut(self.options, self.project, self.event, eventExpWrapper, self.schedule, memory=self.configWrapper.getDefaultMemory())).rv()
Example #36
    def run(self, fileStore):
        self.configNode = ET.parse(fileStore.readGlobalFile(self.project.getConfigID())).getroot()
        self.configWrapper = ConfigWrapper(self.configNode)
        self.configWrapper.substituteAllPredefinedConstantsWithLiterals()

        logger.info("Progressive Up: " + self.event)

        # open up the experiment
        # note that we copy the path into the options here
        experimentFile = fileStore.readGlobalFile(self.project.expIDMap[self.event])
        expXml = ET.parse(experimentFile).getroot()
        experiment = ExperimentWrapper(expXml)
        configPath = fileStore.readGlobalFile(experiment.getConfigID())
        configXml = ET.parse(configPath).getroot()

        seqIDMap = dict()
        tree = experiment.getTree()
        seqNames = []
        for node in tree.postOrderTraversal():
            if tree.isLeaf(node):
                name = tree.getName(node)
                seqIDMap[name] = self.project.outputSequenceIDMap[name]
                seqNames.append(name)
        logger.info("Sequences in progressive, %s: %s" % (self.event, seqNames))
            
        experimentFile = fileStore.getLocalTempFile()
        experiment.writeXML(experimentFile)
        self.options.experimentFileID = fileStore.writeGlobalFile(experimentFile)

        # take union of command line options and config options for hal and reference
        if self.options.buildReference == False:
            refNode = findRequiredNode(configXml, "reference")
            self.options.buildReference = getOptionalAttrib(refNode, "buildReference", bool, False)
        halNode = findRequiredNode(configXml, "hal")
        if self.options.buildHal == False:
            self.options.buildHal = getOptionalAttrib(halNode, "buildHal", bool, False)
        if self.options.buildFasta == False:
            self.options.buildFasta = getOptionalAttrib(halNode, "buildFasta", bool, False)

        # get parameters that cactus_workflow stuff wants
        configFile = fileStore.readGlobalFile(experiment.getConfigID())
        configNode = ET.parse(configFile).getroot()
        workFlowArgs = CactusWorkflowArguments(self.options, experimentFile=experimentFile, configNode=configNode, seqIDMap = seqIDMap)

        # copy over the options so we don't trail them around
        workFlowArgs.buildReference = self.options.buildReference
        workFlowArgs.buildHal = self.options.buildHal
        workFlowArgs.buildFasta = self.options.buildFasta
        workFlowArgs.globalLeafEventSet = self.options.globalLeafEventSet
        if self.options.intermediateResultsUrl is not None:
            # Give the URL prefix a special name for this particular
            # subproblem (by suffixing it with the name of the
            # internal node in the guide tree)
            workFlowArgs.intermediateResultsUrl = self.options.intermediateResultsUrl + '-' + self.event

        # Use the trimming strategy to blast ingroups vs outgroups.
        finalExpWrapper = self.addChild(CactusTrimmingBlastPhase(cactusWorkflowArguments=workFlowArgs, phaseName="trimBlast")).rv()
        logger.info("Going to create alignments and define the cactus tree")

        return finalExpWrapper
Example #38
def importSingularityImage(options):
    """Import the Singularity image from Docker if using Singularity."""
    mode = os.environ.get("CACTUS_BINARIES_MODE", "docker")
    localImage = os.environ.get("CACTUS_USE_LOCAL_SINGULARITY_IMG", "0")
    if mode == "singularity" and Toil.parseLocator(
            options.jobStore)[0] == "file":
        imgPath = os.environ["CACTUS_SINGULARITY_IMG"]
        # If not using local image, pull the docker image
        if localImage == "0":
            # Singularity will complain if the image file already exists. Remove it.
            try:
                os.remove(imgPath)
            except OSError:
                # File doesn't exist
                pass
            # Singularity 2.4 broke the functionality that let --name
            # point to a path instead of a name in the CWD. So we change
            # to the proper directory manually, then change back after the
            # image is pulled.
            # NOTE: singularity writes images in the current directory only
            #       when SINGULARITY_CACHEDIR is not set
            oldCWD = os.getcwd()
            os.chdir(os.path.dirname(imgPath))
            # --size is deprecated starting in 2.4, but is needed for 2.3 support. Keeping it in for now.
            try:
                check_call([
                    "singularity", "pull", "--size", "2000", "--name",
                    os.path.basename(imgPath), "docker://" + getDockerImage()
                ])
            except CalledProcessError:
                # Call failed, try without --size, required for singularity 3+
                check_call([
                    "singularity", "pull", "--name",
                    os.path.basename(imgPath), "docker://" + getDockerImage()
                ])
            os.chdir(oldCWD)
        else:
            logger.info(
                "Using pre-built singularity image: '{}'".format(imgPath))
Example #39
0
 def run(self, fileStore):
     logger.info("Blasting ingroup sequences to outgroup %s",
                 self.outgroupNames[self.outgroupNumber - 1])
     alignmentsID = self.addChild(BlastSequencesAgainstEachOther(
         self.sequenceIDs,
         [self.outgroupSequenceIDs[0]],
         self.blastOptions)).rv()
     trimRecurseJob = self.addFollowOn(TrimAndRecurseOnOutgroups(
         ingroupNames=self.ingroupNames,
         untrimmedSequenceIDs=self.untrimmedSequenceIDs,
         sequenceIDs=self.sequenceIDs,
         outgroupNames=self.outgroupNames,
         outgroupSequenceIDs=self.outgroupSequenceIDs,
         outgroupFragmentIDs=self.outgroupFragmentIDs,
         mostRecentResultsID=alignmentsID,
         outgroupResultsID=self.outgroupResultsID,
         blastOptions=self.blastOptions,
         outgroupNumber=self.outgroupNumber,
         ingroupCoverageIDs=self.ingroupCoverageIDs))
     outgroupAlignmentsID = trimRecurseJob.rv(0)
     outgroupFragmentIDs = trimRecurseJob.rv(1)
     ingroupCoverageIDs = trimRecurseJob.rv(2)
     return (outgroupAlignmentsID, outgroupFragmentIDs, ingroupCoverageIDs)
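The rv(0)/rv(1)/rv(2) calls above are indexed promises into the tuple that TrimAndRecurseOnOutgroups will eventually return. A toy illustration of the same Toil pattern, independent of cactus:

from toil.job import Job

def producer(job):
    return ("alignments", "fragments", "coverage")

def parent(job):
    child = job.addChildJobFn(producer)
    # rv(i) indexes into the eventual return value without waiting for it.
    return child.rv(0), child.rv(2)

# Job.wrapJobFn(parent) could then be started under a Toil workflow.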
Example #40
0
    def run(self, fileStore):
        seqFile1 = fileStore.readGlobalFile(self.seqFileID1)
        seqFile2 = fileStore.readGlobalFile(self.seqFileID2)
        if self.blastOptions.compressFiles:
            seqFile1 = decompressFastaFile(seqFile1, fileStore.getLocalTempFile())
            seqFile2 = decompressFastaFile(seqFile2, fileStore.getLocalTempFile())
        blastResultsFile = fileStore.getLocalTempFile()

        runLastz(seqFile1, seqFile2, blastResultsFile, lastzArguments=self.blastOptions.lastzArguments)
        if self.blastOptions.realign:
            realignResultsFile = fileStore.getLocalTempFile()
            runCactusRealign(seqFile1, seqFile2, inputAlignmentsFile=blastResultsFile,
                             outputAlignmentsFile=realignResultsFile,
                             realignArguments=self.blastOptions.realignArguments)
            blastResultsFile = realignResultsFile
            
        resultsFile = fileStore.getLocalTempFile()
        cactus_call(parameters=["cactus_blast_convertCoordinates",
                                blastResultsFile,
                                resultsFile,
                                str(self.blastOptions.roundsOfCoordinateConversion)])
        logger.info("Ran the blast okay")
        return fileStore.writeGlobalFile(resultsFile)
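decompressFastaFile is called above with a (compressed path, destination path) signature, and its return value replaces the sequence path. A plausible minimal stand-in, assuming plain gzip compression (the real helper may use a different codec):

import gzip
import shutil

def decompressFastaFile(fileName, tempFileName):
    # Assumption: gzip-compressed FASTA; write a decompressed copy and return its path.
    with gzip.open(fileName, "rb") as inF, open(tempFileName, "wb") as outF:
        shutil.copyfileobj(inF, outF)
    return tempFileName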
Example #42
0
 def run(self, fileStore):
     logger.info("Chunk IDs: %s" % self.chunkIDs)
     #Avoid compression when there are only one or two chunks
     self.blastOptions.compressFiles = self.blastOptions.compressFiles and len(self.chunkIDs) > 2
     resultsIDs = []
     for i in range(len(self.chunkIDs)):
         resultsIDs.append(self.addChild(RunSelfBlast(self.blastOptions, self.chunkIDs[i])).rv())
     logger.info("Made the list of self blasts")
     #Setup job to make all-against-all blasts
     logger.debug("Collating self blasts.")
     logger.info("Blast file IDs: %s" % resultsIDs)
     return self.addFollowOn(CollateBlasts(self.blastOptions, resultsIDs)).rv()
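resultsIDs above is a list of promises rather than concrete file IDs; they resolve only after the RunSelfBlast children finish, which is why the list is handed to a follow-on instead of being read directly. The same fan-out/fan-in shape with toy jobs:

from toil.job import Job

def square(job, x):
    return x * x

def collect(job, values):
    # Toil resolves any promises inside the argument list before this job runs.
    return sum(values)

def fanOut(job, xs):
    rvs = [job.addChildJobFn(square, x).rv() for x in xs]
    return job.addFollowOnJobFn(collect, rvs).rv()

# e.g. Job.wrapJobFn(fanOut, [1, 2, 3]) evaluates to 14 under a Toil workflow.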
Example #44
0
    def run(self, fileStore):
        logger.info("Preparing sequence for preprocessing")
        # chunk it up
        inSequence = fileStore.readGlobalFile(self.inSequenceID)
        inChunkDirectory = getTempDirectory(
            rootDir=fileStore.getLocalTempDir())
        inChunkList = runGetChunks(sequenceFiles=[inSequence],
                                   chunksDir=inChunkDirectory,
                                   chunkSize=self.prepOptions.chunkSize,
                                   overlapSize=0)
        inChunkList = [os.path.abspath(path) for path in inChunkList]
        logger.info("Chunks = %s" % inChunkList)
        logger.info("Chunks dir = %s" % os.listdir(inChunkDirectory))

        inChunkIDList = [
            fileStore.writeGlobalFile(chunk) for chunk in inChunkList
        ]
        outChunkIDList = []
        #For each input chunk we create an output chunk, it is the output chunks that get concatenated together.
        if not self.chunksToCompute:
            self.chunksToCompute = range(len(inChunkList))
        for i in self.chunksToCompute:
            #Calculate the size of the sampling window around the current chunk
            inChunkNumber = int(max(1, math.ceil(
                len(inChunkList) * self.prepOptions.proportionToSample)))
            assert 0 < inChunkNumber <= len(inChunkList)
            #Now get the list of chunks flanking and including the current chunk
            j = max(0, i - inChunkNumber // 2)  # floor division: slice indices must be ints
            inChunkIDs = inChunkIDList[j:j + inChunkNumber]
            if len(inChunkIDs) < inChunkNumber:  #This logic is like making the list circular
                inChunkIDs += inChunkIDList[:inChunkNumber - len(inChunkIDs)]
            assert len(inChunkIDs) == inChunkNumber
            outChunkIDList.append(
                self.addChild(
                    PreprocessChunk(self.prepOptions, inChunkIDs,
                                    float(inChunkNumber) / len(inChunkIDList),
                                    inChunkIDList[i])).rv())
        # follow on to merge chunks
        return self.addFollowOn(MergeChunks(self.prepOptions,
                                            outChunkIDList)).rv()
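The flanking-window arithmetic in the loop above is easiest to see with concrete numbers. With 5 chunks and a window of 3 centred near the end of the list, the slice comes up short and wraps around to the front:

inChunkIDList = ["c0", "c1", "c2", "c3", "c4"]
inChunkNumber = 3
i = 4                                             # current chunk index
j = max(0, i - inChunkNumber // 2)                # j = 3
inChunkIDs = inChunkIDList[j:j + inChunkNumber]   # ['c3', 'c4'] -- one short
if len(inChunkIDs) < inChunkNumber:               # treat the list as circular
    inChunkIDs += inChunkIDList[:inChunkNumber - len(inChunkIDs)]
assert inChunkIDs == ["c3", "c4", "c0"]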
Example #45
0
def runCactusGraphMapJoin(options):
    with Toil(options) as toil:
        importSingularityImage(options)
        #Run the workflow
        if options.restart:
            wf_output = toil.restart()
        else:
            options.cactusDir = getTempDirectory()

            #load cactus config
            configNode = ET.parse(options.configFile).getroot()
            config = ConfigWrapper(configNode)
            config.substituteAllPredefinedConstantsWithLiterals()

            # load up the vgs
            vg_ids = []
            for vg_path in options.vg:
                logger.info("Importing {}".format(vg_path))
                vg_ids.append(toil.importFile(makeURL(vg_path)))

            # tack on the decoys
            if options.decoyGraph:
                logger.info("Importing decoys {}".format(options.decoyGraph))
                vg_ids.append(toil.importFile(makeURL(options.decoyGraph)))
                # we'll treat it like any other graph downstream, except clipping
                # where we'll check first using the path name
                options.vg.append(options.decoyGraph)

            # load up the hals
            hal_ids = []
            for hal_path in options.hal:
                logger.info("Importing {}".format(hal_path))
                hal_ids.append(toil.importFile(makeURL(hal_path)))

            # run the workflow
            wf_output = toil.start(
                Job.wrapJobFn(graphmap_join_workflow, options, config, vg_ids,
                              hal_ids))

        #export the split data
        export_join_data(toil, options, wf_output[0], wf_output[1],
                         wf_output[2])
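makeURL is assumed here to turn bare filesystem paths into locators that toil.importFile accepts while passing real URLs through untouched; a plausible stand-in (the real helper may recognise more schemes):

import os

def makeURL(path):
    # Leave explicit schemes alone; promote bare paths to file:// locators.
    if path.startswith(("http://", "https://", "s3://", "file://")):
        return path
    return "file://" + os.path.abspath(path)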
Example #46
0
def main():
    parser = ArgumentParser()
    Job.Runner.addToilOptions(parser)
    addCactusWorkflowOptions(parser)

    parser.add_argument("seqFile", help="Seq file")
    parser.add_argument(
        "cigarsFile",
        nargs="+",
        help=
        "Pairiwse aliginments (from cactus-blast, cactus-refmap or cactus-graphmap)"
    )
    parser.add_argument("outputHal", type=str, help="Output HAL file")
    parser.add_argument(
        "--pathOverrides",
        nargs="*",
        help="paths (multiple allowd) to override from seqFile")
    parser.add_argument(
        "--pathOverrideNames",
        nargs="*",
        help="names (must be same number as --paths) of path overrides")

    #Progressive Cactus Options
    parser.add_argument("--configFile",
                        dest="configFile",
                        help="Specify cactus configuration file",
                        default=os.path.join(cactusRootPath(),
                                             "cactus_progressive_config.xml"))
    parser.add_argument(
        "--root",
        dest="root",
        help="Name of ancestral node (which"
        " must appear in NEWICK tree in <seqfile>) to use as a "
        "root for the alignment.  Any genomes not below this node "
        "in the tree may be used as outgroups but will never appear"
        " in the output.  If no root is specifed then the root"
        " of the tree is used. ",
        default=None,
        required=True)
    parser.add_argument(
        "--latest",
        dest="latest",
        action="store_true",
        help="Use the latest version of the docker container "
        "rather than pulling one matching this version of cactus")
    parser.add_argument(
        "--containerImage",
        dest="containerImage",
        default=None,
        help="Use the the specified pre-built containter image "
        "rather than pulling one from quay.io")
    parser.add_argument("--binariesMode",
                        choices=["docker", "local", "singularity"],
                        help="The way to run the Cactus binaries",
                        default=None)
    parser.add_argument(
        "--nonCactusInput",
        action="store_true",
        help=
        "Input lastz cigars do not come from cactus-blast or cactus-refmap: Prepend ids in cigars"
    )
    parser.add_argument(
        "--pangenome",
        action="store_true",
        help=
        "Override some CAF settings whose defaults are not suited to star trees"
    )
    parser.add_argument(
        "--pafInput",
        action="store_true",
        help="'cigarsFile' arugment is in PAF format, rather than lastz cigars."
    )
    parser.add_argument("--database",
                        choices=["kyoto_tycoon", "redis"],
                        help="The type of database",
                        default="kyoto_tycoon")

    options = parser.parse_args()

    setupBinaries(options)
    setLoggingFromOptions(options)
    enableDumpStack()

    if (options.pathOverrides or options.pathOverrideNames):
        if not options.pathOverrides or not options.pathOverrideNames or \
           len(options.pathOverrideNames) != len(options.pathOverrides):
            raise RuntimeError(
                'same number of values must be passed to --pathOverrides and --pathOverrideNames'
            )

    # cactus doesn't run with 1 core
    if options.batchSystem == 'singleMachine':
        if options.maxCores is not None:
            if int(options.maxCores) < 2:
                raise RuntimeError('Cactus requires --maxCores > 1')
        else:
            # is there a way to get this out of Toil?  That would be more consistent
            if cpu_count() < 2:
                raise RuntimeError(
                    'Only 1 CPU detected.  Cactus requires at least 2')

    if options.pafInput:
        # cactus-graphmap does not do any prepending to simplify interface with minigraph node names
        # so it must be done here
        options.nonCactusInput = True

    options.buildHal = True
    options.buildFasta = True

    # Mess with some toil options to create useful defaults.
    cactus_override_toil_options(options)

    start_time = timeit.default_timer()
    runCactusAfterBlastOnly(options)
    end_time = timeit.default_timer()
    run_time = end_time - start_time
    logger.info("cactus-align has finished after {} seconds".format(run_time))