def main():
    """cactus-graphmap entry point: map the seqFile genomes onto a minigraph GFA,
    writing pairwise alignments as PAF.

    Parses command-line options, validates them, then delegates to
    runCactusGraphMap().  Raises RuntimeError on inconsistent path-override
    options.
    """
    parser = ArgumentParser()
    Job.Runner.addToilOptions(parser)
    addCactusWorkflowOptions(parser)

    parser.add_argument("seqFile", help="Seq file (will be modified if necessary to include graph Fasta sequence)")
    parser.add_argument("minigraphGFA", help="Minigraph-compatible reference graph in GFA format (can be gzipped)")
    parser.add_argument("outputPAF", type=str, help="Output pairwise alignment file in PAF format")
    parser.add_argument("--outputFasta", type=str,
                        help="Output graph sequence file in FASTA format (required if not present in seqFile)")
    parser.add_argument("--maskFilter", type=int,
                        help="Ignore softmasked sequence intervals > Nbp (overrides config option of same name)")
    parser.add_argument("--outputGAFDir", type=str,
                        help="Output GAF alignments (raw minigraph output before PAF conversion) to this directory")
    parser.add_argument("--refFromGFA", type=str,
                        help="Do not align given genome from seqfile, and instead extract its alignment from the rGFA tags "
                        "(must have been used as reference for minigraph GFA construction)")

    # WDL hacks
    parser.add_argument("--pathOverrides", nargs="*", help="paths (multiple allowed) to override from seqFile")
    parser.add_argument("--pathOverrideNames", nargs="*",
                        help="names (must be same number as --pathOverrides) of path overrides")

    # Progressive Cactus Options
    parser.add_argument("--configFile", dest="configFile",
                        help="Specify cactus configuration file",
                        default=os.path.join(cactusRootPath(), "cactus_progressive_config.xml"))
    parser.add_argument("--latest", dest="latest", action="store_true",
                        help="Use the latest version of the docker container "
                        "rather than pulling one matching this version of cactus")
    parser.add_argument("--containerImage", dest="containerImage", default=None,
                        # fixed help typos: "the the" / "containter"
                        help="Use the specified pre-built container image "
                        "rather than pulling one from quay.io")
    parser.add_argument("--binariesMode", choices=["docker", "local", "singularity"],
                        help="The way to run the Cactus binaries", default=None)

    options = parser.parse_args()
    setupBinaries(options)
    setLoggingFromOptions(options)
    enableDumpStack()

    if options.outputGAFDir:
        if not os.path.isdir(options.outputGAFDir):
            os.makedirs(options.outputGAFDir)

    # Both override lists must be given together, with matching lengths.
    if options.pathOverrides or options.pathOverrideNames:
        if not options.pathOverrides or not options.pathOverrideNames or \
           len(options.pathOverrideNames) != len(options.pathOverrides):
            raise RuntimeError('same number of values must be passed to --pathOverrides and --pathOverrideNames')

    # Mess with some toil options to create useful defaults.
    cactus_override_toil_options(options)

    start_time = timeit.default_timer()
    runCactusGraphMap(options)
    end_time = timeit.default_timer()
    run_time = end_time - start_time
    logger.info("cactus-graphmap has finished after {} seconds".format(run_time))
def main():
    """Progressive Cactus entry point: align the genomes in seqFile and write a HAL file.

    Parses and validates command-line options, enforces the >1 core requirement
    for the singleMachine batch system, then delegates to runCactusProgressive().
    """
    parser = ArgumentParser()
    Job.Runner.addToilOptions(parser)
    addCactusWorkflowOptions(parser)

    parser.add_argument("seqFile", help="Seq file")
    parser.add_argument("outputHal", type=str, help="Output HAL file")

    # Progressive Cactus Options
    parser.add_argument("--configFile", dest="configFile",
                        help="Specify cactus configuration file",
                        default=os.path.join(cactusRootPath(), "cactus_progressive_config.xml"))
    parser.add_argument("--root", dest="root",
                        # fixed help typo: "specifed" -> "specified"
                        help="Name of ancestral node (which"
                        " must appear in NEWICK tree in <seqfile>) to use as a "
                        "root for the alignment. Any genomes not below this node "
                        "in the tree may be used as outgroups but will never appear"
                        " in the output. If no root is specified then the root"
                        " of the tree is used. ", default=None)
    parser.add_argument("--latest", dest="latest", action="store_true",
                        help="Use the latest version of the docker container "
                        "rather than pulling one matching this version of cactus")
    parser.add_argument("--containerImage", dest="containerImage", default=None,
                        # fixed help typos: "the the" / "containter"
                        help="Use the specified pre-built container image "
                        "rather than pulling one from quay.io")
    parser.add_argument("--binariesMode", choices=["docker", "local", "singularity"],
                        help="The way to run the Cactus binaries", default=None)
    parser.add_argument("--database", choices=["kyoto_tycoon", "redis"],
                        help="The type of database", default="kyoto_tycoon")

    options = parser.parse_args()
    setupBinaries(options)
    setLoggingFromOptions(options)
    enableDumpStack()

    # cactus doesn't run with 1 core
    if options.batchSystem == 'singleMachine':
        if options.maxCores is not None:
            if int(options.maxCores) < 2:
                raise RuntimeError('Cactus requires --maxCores > 1')
        else:
            # is there a way to get this out of Toil? That would be more consistent
            if cpu_count() < 2:
                raise RuntimeError('Only 1 CPU detected. Cactus requires at least 2')

    # Mess with some toil options to create useful defaults.
    cactus_override_toil_options(options)

    start_time = timeit.default_timer()
    runCactusProgressive(options)
    end_time = timeit.default_timer()
    run_time = end_time - start_time
    logger.info("Cactus has finished after {} seconds".format(run_time))
def main():
    """cactus-graphmap-join entry point: merge per-component vg graphs (and
    optionally hal files) into final indexed output.

    Validates that --hal (when given) pairs one-to-one with --vg, then
    delegates to runCactusGraphMapJoin().
    """
    parser = ArgumentParser()
    Job.Runner.addToilOptions(parser)
    addCactusWorkflowOptions(parser)

    parser.add_argument("--vg", required=True, nargs='+',
                        help="Input vg files (PackedGraph or HashGraph format)")
    parser.add_argument("--outDir", required=True, type=str, help="Output directory")
    parser.add_argument("--outName", required=True, type=str, help="Basename of all output files")
    parser.add_argument("--reference", required=True, type=str, help="Reference event name")
    parser.add_argument("--vcfReference", type=str,
                        help="Reference event for VCF (if different from --reference)")
    parser.add_argument("--rename", nargs='+', default=[],
                        help="Path renaming, each of form src>dest (see clip-vg -r)")
    parser.add_argument("--clipLength", type=int, default=None,
                        help="clip out unaligned sequences longer than this")
    parser.add_argument("--wlineSep", type=str, help="wline separator for vg convert")
    parser.add_argument("--indexCores", type=int, default=1, help="cores for indexing processes")
    parser.add_argument("--decoyGraph",
                        help="decoy sequences vg graph to add (PackedGraph or HashGraph format)")
    parser.add_argument("--hal", nargs='+', default=[], help="Input hal files (for merging)")

    # Progressive Cactus Options
    parser.add_argument("--configFile", dest="configFile",
                        help="Specify cactus configuration file",
                        default=os.path.join(cactusRootPath(), "cactus_progressive_config.xml"))
    parser.add_argument("--latest", dest="latest", action="store_true",
                        help="Use the latest version of the docker container "
                        "rather than pulling one matching this version of cactus")
    parser.add_argument("--containerImage", dest="containerImage", default=None,
                        # fixed help typos: "the the" / "containter"
                        help="Use the specified pre-built container image "
                        "rather than pulling one from quay.io")
    parser.add_argument("--binariesMode", choices=["docker", "local", "singularity"],
                        help="The way to run the Cactus binaries", default=None)

    options = parser.parse_args()
    setupBinaries(options)
    setLoggingFromOptions(options)
    enableDumpStack()

    # Local output directories are created eagerly; s3:// targets are handled downstream.
    if options.outDir and not options.outDir.startswith('s3://'):
        if not os.path.isdir(options.outDir):
            os.makedirs(options.outDir)

    if options.hal and len(options.hal) != len(options.vg):
        # fixed garbled error message ("If --hal and --vg should specify ...")
        raise RuntimeError("--hal and --vg must specify the same number of files")

    # Mess with some toil options to create useful defaults.
    cactus_override_toil_options(options)

    start_time = timeit.default_timer()
    runCactusGraphMapJoin(options)
    end_time = timeit.default_timer()
    run_time = end_time - start_time
    logger.info("cactus-graphmap-join has finished after {} seconds".format(run_time))
def main():
    """cactus-graphmap-split entry point: split a graph + PAF alignment by
    reference contig into per-component subproblems.

    Parses options, ensures the (local) output directory exists, then delegates
    to runCactusGraphMapSplit().
    """
    parser = ArgumentParser()
    Job.Runner.addToilOptions(parser)
    addCactusWorkflowOptions(parser)

    parser.add_argument("seqFile", help="Seq file (gzipped fastas supported)")
    parser.add_argument("minigraphGFA", help="Minigraph-compatible reference graph in GFA format (can be gzipped)")
    parser.add_argument("graphmapPAF", type=str, help="Output pairwise alignment file in PAF format (can be gzipped)")
    parser.add_argument("--outDir", required=True, type=str, help="Output directory")
    parser.add_argument("--refContigs", nargs="*", default=[],
                        help="Subset to these reference contigs (multiple allowed)")
    parser.add_argument("--refContigsFile", type=str,
                        help="Subset to (newline-separated) reference contigs in this file")
    parser.add_argument("--otherContig", type=str,
                        help="Lump all reference contigs unselected by above options into single one with this name")
    parser.add_argument("--reference", type=str,
                        help="Name of reference (in seqFile). Ambiguity filters will not be applied to it")
    parser.add_argument("--maskFilter", type=int, help="Ignore softmasked sequence intervals > Nbp")

    # Progressive Cactus Options
    parser.add_argument("--configFile", dest="configFile",
                        help="Specify cactus configuration file",
                        default=os.path.join(cactusRootPath(), "cactus_progressive_config.xml"))
    parser.add_argument("--latest", dest="latest", action="store_true",
                        help="Use the latest version of the docker container "
                        "rather than pulling one matching this version of cactus")
    parser.add_argument("--containerImage", dest="containerImage", default=None,
                        # fixed help typos: "the the" / "containter"
                        help="Use the specified pre-built container image "
                        "rather than pulling one from quay.io")
    parser.add_argument("--binariesMode", choices=["docker", "local", "singularity"],
                        help="The way to run the Cactus binaries", default=None)

    options = parser.parse_args()
    setupBinaries(options)
    setLoggingFromOptions(options)
    enableDumpStack()

    # Local output directories are created eagerly; s3:// targets are handled downstream.
    if options.outDir and not options.outDir.startswith('s3://'):
        if not os.path.isdir(options.outDir):
            os.makedirs(options.outDir)

    # Mess with some toil options to create useful defaults.
    cactus_override_toil_options(options)

    start_time = timeit.default_timer()
    runCactusGraphMapSplit(options)
    end_time = timeit.default_timer()
    run_time = end_time - start_time
    logger.info("cactus-graphmap-split has finished after {} seconds".format(run_time))
def main():
    """cactus-prepare entry point: decompose a progressive alignment into a
    suggested sequence of preprocess/blast/align commands.

    Builds a temporary progressive-cactus project from the seq file, then hands
    it to cactusPrepare() to emit the command plan.
    """
    parser = ArgumentParser()
    parser.add_argument("seqFile", help="Seq file")
    parser.add_argument("outSeqDir",
                        help='Directory where the processed leaf sequence and ancestral sequences will be placed')
    parser.add_argument("outSeqFile", help="Path for annotated Seq file output")
    parser.add_argument("outputHal", type=str, help="Output HAL file")
    parser.add_argument("--configFile",
                        default=os.path.join(cactusRootPath(), "cactus_progressive_config.xml"))
    parser.add_argument("--preprocessBatchSize", type=int, default=3,
                        help="size (number of genomes) of suggested preprocessing jobs")
    parser.add_argument("--jobStore", type=str, default="$JOBSTORE",
                        help="jobstore to use in suggested commands")
    parser.add_argument("--halOptions", type=str, default="--hdf5InMemory",
                        help="options for every hal command")
    parser.add_argument("--cactusOptions", type=str, default="--realTimeLogging --logInfo",
                        help="options for every cactus command")
    parser.add_argument("--preprocessOnly", action="store_true",
                        help="only decompose into preprocessor and cactus jobs")

    options = parser.parse_args()
    options.database = 'kyoto_tycoon'
    #todo support root option
    options.root = None

    # need to go through this garbage (copied from the main() in progressive_cactus) to
    # come up with the project
    options.cactusDir = getTempDirectory()

    #Create the progressive cactus project
    projWrapper = ProjectWrapper(options, options.configFile)
    projWrapper.writeXml()

    pjPath = os.path.join(options.cactusDir, ProjectWrapper.alignmentDirName,
                          '%s_project.xml' % ProjectWrapper.alignmentDirName)
    assert os.path.exists(pjPath)

    project = MultiCactusProject()
    if not os.path.isdir(options.cactusDir):
        os.makedirs(options.cactusDir)
    project.readXML(pjPath)

    enableDumpStack()
    cactusPrepare(options, project)
def main():
    """cactus-prepare (WDL-capable) entry point: emit either a command list or a
    WDL workflow that runs the decomposed progressive alignment.

    Parses/normalizes options (WDL mode overrides output-layout options),
    applies default resource settings, then builds a temporary project and
    delegates to cactusPrepare().  Raises RuntimeError on inconsistent options.
    """
    parser = ArgumentParser()
    parser.add_argument("seqFile", help="Seq file")
    parser.add_argument("--outDir",
                        help='Directory where the processed leaf sequence and ancestral sequences will be placed.'
                        ' Required when not using --wdl')
    parser.add_argument("--outSeqFile", help="Path for annotated Seq file output [default: outDir/seqFile]")
    parser.add_argument("--outHal", help="Output HAL file [default: outDir/out.hal]")
    parser.add_argument("--wdl", action="store_true", help="output wdl workflow instead of list of commands")
    parser.add_argument("--noLocalInputs", action="store_true",
                        # fixed help: apostrophe and unclosed parenthesis
                        help="don't embed local input paths in WDL script (as they will need"
                        " to be respecified when running on Terra)")
    parser.add_argument("--configFile",
                        default=os.path.join(cactusRootPath(), "cactus_progressive_config.xml"))
    parser.add_argument("--preprocessBatchSize", type=int, default=3,
                        help="size (number of genomes) of suggested preprocessing jobs")
    parser.add_argument("--jobStore", type=str, default="./jobstore",
                        help="base directory of jobStores to use in suggested commands")
    parser.add_argument("--halOptions", type=str, default="--hdf5InMemory", help="options for every hal command")
    parser.add_argument("--cactusOptions", type=str, default="--realTimeLogging --logInfo",
                        help="options for every cactus command")
    parser.add_argument("--preprocessOnly", action="store_true",
                        help="only decompose into preprocessor and cactus jobs")
    parser.add_argument("--dockerImage", type=str, help="docker image to use as wdl runtime")
    parser.add_argument("--gpu", action="store_true", help="use gpu-enabled lastz in cactus-blast")
    parser.add_argument("--gpuType", default="nvidia-tesla-v100", help="GPU type (to set in WDL runtime parameters)")
    # type=int added: the default is an int and the value is compared with "> 1",
    # so a string from the command line would raise TypeError
    parser.add_argument("--gpuCount", type=int, default=1, help="GPU count (to set in WDL runtime parameters)")
    parser.add_argument("--nvidiaDriver", default="440.64.00", help="Nvidia driver version")
    parser.add_argument("--gpuZone", default="us-central1-c", help="zone used for gpu task")
    parser.add_argument("--zone", default="us-west2-a", help="zone used for all but gpu tasks")
    parser.add_argument("--defaultCores", type=int, help="Number of cores for each job unless otherwise specified")
    parser.add_argument("--preprocessCores", type=int, help="Number of cores for each cactus-preprocess job")
    parser.add_argument("--blastCores", type=int, help="Number of cores for each cactus-blast job")
    parser.add_argument("--alignCores", type=int, help="Number of cores for each cactus-align job")
    parser.add_argument("--defaultMem", type=float, help="Memory in GB for each job unless otherwise specified")
    parser.add_argument("--preprocessMem", type=float, help="Memory in GB for each cactus-preprocess job")
    parser.add_argument("--blastMem", type=float, help="Memory in GB for each cactus-blast job")
    parser.add_argument("--alignMem", type=float, help="Memory in GB for each cactus-align job")
    parser.add_argument("--defaultDisk", type=int, help="Disk in GB for each job unless otherwise specified")
    parser.add_argument("--preprocessDisk", type=int, help="Disk in GB for each cactus-preprocess job")
    parser.add_argument("--blastDisk", type=int, help="Disk in GB for each cactus-blast job")
    parser.add_argument("--alignDisk", type=int, help="Disk in GB for each cactus-align job")
    parser.add_argument("--halAppendDisk", type=int, help="Disk in GB for each halAppendSubtree job")
    # help text corrected: these are preemptible attempt counts, not GB
    parser.add_argument("--preprocessPreemptible", type=int, default=2,
                        help="Preemptible attempt count for each cactus-preprocess job [default=2]")
    parser.add_argument("--blastPreemptible", type=int, default=1,
                        help="Preemptible attempt count for each cactus-blast job [default=1]")
    parser.add_argument("--alignPreemptible", type=int, default=1,
                        help="Preemptible attempt count for each cactus-align job [default=1]")
    parser.add_argument("--halAppendPreemptible", type=int, default=1,
                        help="Preemptible attempt count for each halAppendSubtree job [default=1]")

    options = parser.parse_args()
    options.database = 'kyoto_tycoon'
    #todo support root option
    options.root = None

    if not options.wdl:
        if not options.outDir:
            raise RuntimeError("--outDir option required when not using --wdl")
        if not options.outSeqFile:
            options.outSeqFile = os.path.join(options.outDir, os.path.basename(options.seqFile))
            # never clobber the input seq file
            if os.path.abspath(options.seqFile) == os.path.abspath(options.outSeqFile):
                options.outSeqFile += '.1'

    if (not options.wdl or not options.gpu) and \
       (options.gpuCount > 1 or options.gpuType != "nvidia-tesla-v100"):
        # fixed message: option is spelled --gpuCount
        raise RuntimeError("--gpuType and --gpuCount can only be used with --wdl --gpu")

    if not options.outHal:
        options.outHal = os.path.join(options.outDir if options.outDir else '', 'out.hal')

    if options.wdl:
        if options.preprocessBatchSize != 1:
            if options.preprocessBatchSize != 3:
                # hacky way to only warn for non-default
                sys.stderr.write("Warning: --preprocessBatchSize reset to 1 for --wdl support\n")
            options.preprocessBatchSize = 1
        # wdl handles output file structure
        if options.outDir:
            sys.stderr.write("Warning: --outDir option ignored with --wdl\n")
        options.outDir = "."
        if options.outSeqFile:
            sys.stderr.write("Warning: --outSeqFile option ignored with --wdl\n")
            options.outSeqFile = None
        if options.preprocessOnly:
            raise RuntimeError('--preprocessOnly cannot be used in conjunction with --wdl')
        if not options.dockerImage:
            options.dockerImage = getDockerImage()

    # apply defaults
    if options.defaultCores:
        if not options.preprocessCores:
            options.preprocessCores = options.defaultCores
        if not options.blastCores:
            options.blastCores = options.defaultCores
        if not options.alignCores:
            options.alignCores = options.defaultCores
    if options.defaultMem:
        if not options.preprocessMem:
            options.preprocessMem = options.defaultMem
        if not options.blastMem:
            options.blastMem = options.defaultMem
        if not options.alignMem:
            options.alignMem = options.defaultMem
    # cactus-align needs at least 2 cores
    if not options.alignCores or options.alignCores == 1:
        if options.alignCores == 1:
            sys.stderr.write("Warning: --alignCores changed from 1 to 2\n")
        options.alignCores = 2
    if options.defaultDisk:
        if not options.preprocessDisk:
            options.preprocessDisk = options.defaultDisk
        if not options.blastDisk:
            options.blastDisk = options.defaultDisk
        if not options.alignDisk:
            options.alignDisk = options.defaultDisk
        if not options.halAppendDisk:
            options.halAppendDisk = options.defaultDisk

    # https://cromwell.readthedocs.io/en/stable/RuntimeAttributes/#gpucount-gputype-and-nvidiadriverversion
    # note: k80 not included as WGA_GPU doesn't run on it.
    acceptable_gpus = ['nvidia-tesla-v100', 'nvidia-tesla-p100', 'nvidia-tesla-p4', 'nvidia-tesla-t4']
    if options.gpuType not in acceptable_gpus:
        raise RuntimeError('--gpuType {} not supported by Terra. Acceptable types are {}'.format(
            options.gpuType, acceptable_gpus))

    # need to go through this garbage (copied from the main() in progressive_cactus) to
    # come up with the project
    options.cactusDir = getTempDirectory()
    #Create the progressive cactus project
    projWrapper = ProjectWrapper(options, options.configFile)
    projWrapper.writeXml()
    # used to unique jobstore
    options.jobStoreCount = 0

    pjPath = os.path.join(options.cactusDir, ProjectWrapper.alignmentDirName,
                          '%s_project.xml' % ProjectWrapper.alignmentDirName)
    assert os.path.exists(pjPath)

    project = MultiCactusProject()
    if not os.path.isdir(options.cactusDir):
        os.makedirs(options.cactusDir)
    project.readXML(pjPath)

    enableDumpStack()
    cactusPrepare(options, project)
def main():
    """cactus-align entry point: run the base-level aligner on blast (or PAF)
    output and write a HAL file.

    Validates path-override and core-count options, forces HAL/FASTA output
    flags, then delegates to runCactusAfterBlastOnly().
    """
    parser = ArgumentParser()
    Job.Runner.addToilOptions(parser)
    addCactusWorkflowOptions(parser)

    parser.add_argument("seqFile", help="Seq file")
    parser.add_argument("blastOutput", nargs="+", help="Blast output (from cactus-blast)")
    parser.add_argument("outputHal", type=str, help="Output HAL file")
    # fixed help typos: "allowd" and stale "--paths" option name
    parser.add_argument("--pathOverrides", nargs="*", help="paths (multiple allowed) to override from seqFile")
    parser.add_argument("--pathOverrideNames", nargs="*",
                        help="names (must be same number as --pathOverrides) of path overrides")

    # Progressive Cactus Options
    parser.add_argument("--configFile", dest="configFile",
                        help="Specify cactus configuration file",
                        default=os.path.join(cactusRootPath(), "cactus_progressive_config.xml"))
    parser.add_argument("--root", dest="root",
                        help="Name of ancestral node (which"
                        " must appear in NEWICK tree in <seqfile>) to use as a "
                        "root for the alignment. Any genomes not below this node "
                        "in the tree may be used as outgroups but will never appear"
                        " in the output. If no root is specifed then the root"
                        " of the tree is used. ", default=None, required=True)
    parser.add_argument("--latest", dest="latest", action="store_true",
                        help="Use the latest version of the docker container "
                        "rather than pulling one matching this version of cactus")
    parser.add_argument("--containerImage", dest="containerImage", default=None,
                        # fixed help typos: "the the" / "containter"
                        help="Use the specified pre-built container image "
                        "rather than pulling one from quay.io")
    parser.add_argument("--binariesMode", choices=["docker", "local", "singularity"],
                        help="The way to run the Cactus binaries", default=None)
    parser.add_argument("--nonBlastInput", action="store_true",
                        help="Input does not come from cactus-blast: Do not append ids to fasta names")
    parser.add_argument("--nonBlastMegablockFilter", action="store_true",
                        # fixed missing space between concatenated string parts ("playnicely")
                        help="By default, the megablock filter is off for --nonBlastInput, as it does not play "
                        "nicely with reference-based alignments. This flag will turn it back on")
    parser.add_argument("--pafInput", action="store_true",
                        help="'blastOutput' input is in paf format, rather than lastz cigars.")

    options = parser.parse_args()
    setupBinaries(options)
    setLoggingFromOptions(options)
    enableDumpStack()

    # Both override lists must be given together, with matching lengths.
    if options.pathOverrides or options.pathOverrideNames:
        if not options.pathOverrides or not options.pathOverrideNames or \
           len(options.pathOverrideNames) != len(options.pathOverrides):
            raise RuntimeError('same number of values must be passed to --pathOverrides and --pathOverrideNames')

    # cactus doesn't run with 1 core
    if options.batchSystem == 'singleMachine':
        if options.maxCores is not None:
            if int(options.maxCores) < 2:
                raise RuntimeError('Cactus requires --maxCores > 1')
        else:
            # is there a way to get this out of Toil? That would be more consistent
            if cpu_count() < 2:
                raise RuntimeError('Only 1 CPU detected. Cactus requires at least 2')

    # tokyo_cabinet is no longer supported
    # (removed duplicate assignment of the same value)
    options.database = "kyoto_tycoon"

    options.buildHal = True
    options.buildFasta = True

    # Mess with some toil options to create useful defaults.
    cactus_override_toil_options(options)

    start_time = timeit.default_timer()
    runCactusAfterBlastOnly(options)
    end_time = timeit.default_timer()
    run_time = end_time - start_time
    logger.info("cactus-align has finished after {} seconds".format(run_time))
def main(toil_mode=False):
    """cactus-prepare / cactus-prepare-toil entry point.

    In the default mode, emits a WDL workflow or a command list for the
    decomposed progressive alignment; with toil_mode=True, runs the whole
    decomposition as a single Toil workflow instead.

    :param toil_mode: when True, add Toil options, require --outHal, and skip
        the WDL/command-list-only options.
    :raises RuntimeError: on inconsistent or unsupported option combinations.
    """
    parser = ArgumentParser()
    if toil_mode:
        Job.Runner.addToilOptions(parser)
        parser.add_argument("--latest", dest="latest", action="store_true",
                            help="Use the latest version of the docker container "
                            "rather than pulling one matching this version of cactus")
        parser.add_argument("--containerImage", dest="containerImage", default=None,
                            # fixed help typos: "the the" / "containter"
                            help="Use the specified pre-built container image "
                            "rather than pulling one from quay.io")
        parser.add_argument("--binariesMode", choices=["docker", "local", "singularity"],
                            help="The way to run the Cactus binaries (at top level; use --cactusOpts to set it in nested calls)",
                            default=None)
    parser.add_argument("seqFile", help="Seq file")
    parser.add_argument("--outDir",
                        help='Directory where the processed leaf sequence and ancestral sequences will be placed.'
                        ' Required when not using --wdl')
    parser.add_argument("--outSeqFile", help="Path for annotated Seq file output [default: outDir/seqFile]")
    parser.add_argument("--outHal", help="Output HAL file [default: outDir/out.hal]", required=toil_mode)
    if not toil_mode:
        parser.add_argument("--wdl", action="store_true", help="output wdl workflow instead of list of commands")
        parser.add_argument("--noLocalInputs", action="store_true",
                            # fixed help: apostrophe and unclosed parenthesis
                            help="don't embed local input paths in WDL script (as they will need"
                            " to be respecified when running on Terra)")
        parser.add_argument("--jobStore", type=str, default="./jobstore",
                            help="base directory of jobStores to use in suggested commands")
    parser.add_argument("--configFile",
                        default=os.path.join(cactusRootPath(), "cactus_progressive_config.xml"))
    parser.add_argument("--preprocessBatchSize", type=int, default=3,
                        help="size (number of genomes) of suggested preprocessing jobs")
    parser.add_argument("--halOptions", type=str, default="--hdf5InMemory", help="options for every hal command")
    parser.add_argument("--cactusOptions", type=str, default="--realTimeLogging --logInfo --retryCount 0",
                        help="options for every cactus command")
    parser.add_argument("--preprocessOnly", action="store_true",
                        help="only decompose into preprocessor and cactus jobs")
    parser.add_argument("--dockerImage", type=str, help="docker image to use as wdl runtime")
    parser.add_argument("--gpu", action="store_true", help="use gpu-enabled lastz in cactus-blast")
    parser.add_argument("--gpuType", default="nvidia-tesla-v100", help="GPU type (to set in WDL runtime parameters)")
    # type=int added: the default is an int and the value is compared with "> 1",
    # so a string from the command line would raise TypeError
    parser.add_argument("--gpuCount", type=int, default=1, help="GPU count (to set in WDL runtime parameters)")
    parser.add_argument("--nvidiaDriver", default="440.64.00", help="Nvidia driver version")
    parser.add_argument("--gpuZone", default="us-central1-c", help="zone used for gpu task")
    parser.add_argument("--zone", default="us-west2-a", help="zone used for all but gpu tasks")
    if not toil_mode:
        parser.add_argument("--defaultCores", type=int, help="Number of cores for each job unless otherwise specified")
    parser.add_argument("--preprocessCores", type=int, help="Number of cores for each cactus-preprocess job")
    parser.add_argument("--blastCores", type=int, help="Number of cores for each cactus-blast job")
    parser.add_argument("--alignCores", type=int, help="Number of cores for each cactus-align job")
    if not toil_mode:
        parser.add_argument("--defaultMemory", type=human2bytesN,
                            help="Memory for each job unless otherwise specified. "
                            "Standard suffixes like K, Ki, M, Mi, G or Gi are supported (default=bytes)")
    parser.add_argument("--preprocessMemory", type=human2bytesN,
                        help="Memory for each cactus-preprocess job. "
                        "Standard suffixes like K, Ki, M, Mi, G or Gi are supported (default=bytes)")
    parser.add_argument("--blastMemory", type=human2bytesN,
                        help="Memory for each cactus-blast job. "
                        "Standard suffixes like K, Ki, M, Mi, G or Gi are supported (default=bytes)")
    parser.add_argument("--alignMemory", type=human2bytesN,
                        help="Memory for each cactus-align job. "
                        "Standard suffixes like K, Ki, M, Mi, G or Gi are supported (default=bytes)")
    if not toil_mode:
        parser.add_argument("--defaultDisk", type=human2bytesN,
                            help="Disk for each job unless otherwise specified. "
                            "Standard suffixes like K, Ki, M, Mi, G or Gi are supported (default=bytes)")
    parser.add_argument("--preprocessDisk", type=human2bytesN,
                        help="Disk for each cactus-preprocess job. "
                        "Standard suffixes like K, Ki, M, Mi, G or Gi are supported (default=bytes)")
    parser.add_argument("--blastDisk", type=human2bytesN,
                        help="Disk for each cactus-blast job. "
                        "Standard suffixes like K, Ki, M, Mi, G or Gi are supported (default=bytes)")
    parser.add_argument("--alignDisk", type=human2bytesN,
                        help="Disk for each cactus-align job. "
                        "Standard suffixes like K, Ki, M, Mi, G or Gi are supported (default=bytes)")
    parser.add_argument("--halAppendDisk", type=human2bytesN,
                        help="Disk for each halAppendSubtree job. "
                        "Standard suffixes like K, Ki, M, Mi, G or Gi are supported (default=bytes)")
    parser.add_argument("--preprocessPreemptible", type=int, default=2,
                        help="Preemptible attempt count for each cactus-preprocess job [default=2]")
    parser.add_argument("--blastPreemptible", type=int, default=1,
                        help="Preemptible attempt count for each cactus-blast job [default=1]")
    parser.add_argument("--alignPreemptible", type=int, default=1,
                        help="Preemptible attempt count for each cactus-align job [default=1]")
    parser.add_argument("--halAppendPreemptible", type=int, default=1,
                        help="Preemptible attempt count for each halAppendSubtree job [default=1]")
    parser.add_argument("--database", choices=["kyoto_tycoon", "redis"],
                        help="The type of database", default="kyoto_tycoon")

    options = parser.parse_args()
    #todo support root option
    options.root = None

    if toil_mode:
        options.wdl = False
        options.noLocalInputs = False
        options.outDir = '.'
        setupBinaries(options)
        # need to avoid nested container calls, so set toil-inside-toil jobs to local by default
        if "--binariesMode" not in options.cactusOptions:
            options.cactusOptions += " --binariesMode local"
        if options.jobStore.startswith('aws'):
            if not options.outHal.startswith('s3://'):
                raise RuntimeError("--outHal must be s3:// address when using s3 job store")
            if not has_s3:
                raise RuntimeError("S3 support requires toil to be installed with [aws]")
    options.toil = toil_mode

    if not options.wdl and not options.toil:
        if not options.outDir:
            raise RuntimeError("--outDir option required when not using --wdl")
        if not options.outSeqFile:
            options.outSeqFile = os.path.join(options.outDir, os.path.basename(options.seqFile))
            # never clobber the input seq file
            if os.path.abspath(options.seqFile) == os.path.abspath(options.outSeqFile):
                options.outSeqFile += '.1'

    if (not options.wdl or not options.gpu) and \
       (options.gpuCount > 1 or options.gpuType != "nvidia-tesla-v100"):
        # fixed message: option is spelled --gpuCount
        raise RuntimeError("--gpuType and --gpuCount can only be used with --wdl --gpu")

    if not options.outHal:
        options.outHal = os.path.join(options.outDir if options.outDir else '', 'out.hal')

    if options.wdl:
        # wdl handles output file structure
        if options.outDir:
            sys.stderr.write("Warning: --outDir option ignored with --wdl\n")
        options.outDir = "."
        if options.outSeqFile:
            sys.stderr.write("Warning: --outSeqFile option ignored with --wdl\n")
            options.outSeqFile = None
        if options.preprocessOnly:
            raise RuntimeError('--preprocessOnly cannot be used in conjunction with --wdl')
        if not options.dockerImage:
            options.dockerImage = getDockerImage()

    # apply defaults (the default* options are only defined when not in toil mode,
    # so guard the attribute accesses accordingly)
    if not toil_mode and options.defaultCores:
        if not options.preprocessCores:
            options.preprocessCores = options.defaultCores
        if not options.blastCores:
            options.blastCores = options.defaultCores
        if not options.alignCores:
            options.alignCores = options.defaultCores
    if not toil_mode and options.defaultMemory:
        if not options.preprocessMemory:
            options.preprocessMemory = options.defaultMemory
        if not options.blastMemory:
            options.blastMemory = options.defaultMemory
        if not options.alignMemory:
            options.alignMemory = options.defaultMemory
    # cactus-align needs at least 2 cores
    if not options.alignCores or options.alignCores == 1:
        if options.alignCores == 1:
            sys.stderr.write("Warning: --alignCores changed from 1 to 2\n")
        options.alignCores = 2
    if not toil_mode and options.defaultDisk:
        if not options.preprocessDisk:
            options.preprocessDisk = options.defaultDisk
        if not options.blastDisk:
            options.blastDisk = options.defaultDisk
        if not options.alignDisk:
            options.alignDisk = options.defaultDisk
        if not options.halAppendDisk:
            options.halAppendDisk = options.defaultDisk

    # todo: no reason not to support non-1 batch size, but mirror wdl logic for now
    if options.toil:
        if options.preprocessBatchSize != 1:
            if options.preprocessBatchSize != 3:
                # hacky way to only warn for non-default
                sys.stderr.write("Warning: --preprocessBatchSize reset to 1 for --wdl support\n")
            options.preprocessBatchSize = 1
        # todo: could also support this
        assert not options.preprocessOnly

    # https://cromwell.readthedocs.io/en/stable/RuntimeAttributes/#gpucount-gputype-and-nvidiadriverversion
    # note: k80 not included as WGA_GPU doesn't run on it.
    acceptable_gpus = ['nvidia-tesla-v100', 'nvidia-tesla-p100', 'nvidia-tesla-p4', 'nvidia-tesla-t4']
    if options.gpuType not in acceptable_gpus:
        raise RuntimeError('--gpuType {} not supported by Terra. Acceptable types are {}'.format(
            options.gpuType, acceptable_gpus))

    # need to go through this garbage (copied from the main() in progressive_cactus) to
    # come up with the project
    options.cactusDir = getTempDirectory()
    #Create the progressive cactus project
    projWrapper = ProjectWrapper(options, options.configFile)
    projWrapper.writeXml()
    # used to unique jobstore
    options.jobStoreCount = 0

    pjPath = os.path.join(options.cactusDir, ProjectWrapper.alignmentDirName,
                          '%s_project.xml' % ProjectWrapper.alignmentDirName)
    assert os.path.exists(pjPath)

    project = MultiCactusProject()
    if not os.path.isdir(options.cactusDir):
        os.makedirs(options.cactusDir)
    project.readXML(pjPath)

    enableDumpStack()
    cactusPrepare(options, project)
def main():
    """Entry point for cactus-preprocess.

    Accepts input either as a pair of seqfiles (inSeqFile/outSeqFile, with
    optional --inputNames filtering) or as explicit parallel path lists
    (--inPaths/--outPaths), then stages the preprocessing workflow in Toil.

    Fix: the error raised for a genome missing from the seqfiles referenced
    an undefined name `inNmae` (NameError); corrected to `inName`.
    """
    parser = ArgumentParser()
    Job.Runner.addToilOptions(parser)
    parser.add_argument("inSeqFile", type=str, nargs='?', default=None, help="Input Seq file")
    parser.add_argument("outSeqFile", type=str, nargs='?', default=None,
                        help="Output Seq file (ex generated with cactus-prepare)")
    parser.add_argument("--configFile",
                        default=os.path.join(cactusRootPath(), "cactus_progressive_config.xml"))
    parser.add_argument("--inputNames", nargs='*',
                        help='input genome names (not paths) to preprocess (all leaves from Input Seq file if none specified)')
    parser.add_argument("--inPaths", nargs='*',
                        help='Space-separated list of input fasta paths (to be used in place of --inSeqFile')
    parser.add_argument("--outPaths", nargs='*',
                        help='Space-separated list of output fasta paths (one for each inPath, used in place of --outSeqFile)')
    parser.add_argument("--latest", dest="latest", action="store_true",
                        help="Use the latest version of the docker container "
                        "rather than pulling one matching this version of cactus")
    parser.add_argument("--containerImage", dest="containerImage", default=None,
                        help="Use the the specified pre-built containter image "
                        "rather than pulling one from quay.io")
    parser.add_argument("--binariesMode", choices=["docker", "local", "singularity"],
                        help="The way to run the Cactus binaries", default=None)

    options = parser.parse_args()
    setupBinaries(options)
    setLoggingFromOptions(options)
    enableDumpStack()

    # we have two modes: operate directly on paths or rely on the seqfiles. they cannot be mixed
    if options.inSeqFile or options.outSeqFile:
        if not options.inSeqFile or not options.outSeqFile or options.inPaths or options.outPaths:
            raise RuntimeError('--inSeqFile must be used in conjunction with --outSeqFile and not with --inPaths nor --outPaths')
    elif options.inPaths or options.outPaths:
        if not options.inPaths or not options.outPaths or options.inSeqFile or options.outSeqFile or options.inputNames:
            raise RuntimeError('--inPaths must be used in conjunction with --outPaths and not with --inSeqFile, --outSeqFile nor --inputNames')
        if len(options.inPaths) != len(options.outPaths):
            raise RuntimeError('--inPaths and --outPaths must have the same number of arguments')
    else:
        raise RuntimeError('--inSeqFile/--outSeqFile/--inputNames or --inPaths/--outPaths required to specify input')

    inSeqPaths = []
    outSeqPaths = []

    # mine the paths out of the seqfiles
    if options.inSeqFile:
        inSeqFile = SeqFile(options.inSeqFile)
        outSeqFile = SeqFile(options.outSeqFile)

        inNames = options.inputNames
        if not inNames:
            # default to every leaf genome in the input tree
            inNames = [inSeqFile.tree.getName(node) for node in inSeqFile.tree.getLeaves()]

        for inName in inNames:
            if inName not in inSeqFile.pathMap or inName not in outSeqFile.pathMap:
                # FIX: was .format(inNmae) — an undefined name that raised NameError
                # instead of the intended RuntimeError
                raise RuntimeError('{} not present in input and output Seq files'.format(inName))
            inPath = inSeqFile.pathMap[inName]
            outPath = outSeqFile.pathMap[inName]
            if os.path.isdir(inPath):
                # directory entry: mirror every sequence file it contains
                try:
                    os.makedirs(outPath)
                except OSError:
                    # output directory may already exist; anything worse trips the assert below
                    pass
                assert os.path.isdir(inPath) == os.path.isdir(outPath)
                inSeqPaths += [os.path.join(inPath, seqPath) for seqPath in os.listdir(inPath)]
                outSeqPaths += [os.path.join(outPath, seqPath) for seqPath in os.listdir(inPath)]
            else:
                inSeqPaths += [inPath]
                outSeqPaths += [outPath]

    # we got path names directly from the command line
    else:
        inSeqPaths = options.inPaths
        outSeqPaths = options.outPaths

    with Toil(options) as toil:
        stageWorkflow(outputSequenceDir=None, configFile=options.configFile,
                      inputSequences=inSeqPaths, toil=toil, restart=options.restart,
                      outputSequences=outSeqPaths)
def main_batch():
    """ this is a bit like cactus-align --batch except it will use toil-in-toil to assign each chromosome to a machine.
    pros: much less chance of a problem with one chromosome affecting anything else
          more forgiving for inexact resource specs
          could be ported to Terra
    cons: less efficient use of resources
    """
    parser = ArgumentParser()
    Job.Runner.addToilOptions(parser)
    addCactusWorkflowOptions(parser)
    parser.add_argument("chromFile", help="chroms file")
    parser.add_argument("outHal", type=str, help="Output directory (can be s3://)")
    parser.add_argument("--alignOptions", type=str,
                        help="Options to pass through to cactus-align (don't forget to wrap in quotes)")
    parser.add_argument("--alignCores", type=int, help="Number of cores per align job")
    parser.add_argument("--alignCoresOverrides", nargs="*",
                        help="Override align job cores for a chromosome. Space-separated list of chrom,cores pairse epxected")
    parser.add_argument("--configFile", dest="configFile",
                        help="Specify cactus configuration file",
                        default=os.path.join(cactusRootPath(), "cactus_progressive_config.xml"))

    options = parser.parse_args()

    # These attributes are expected downstream but have no CLI flags here,
    # so pin them to fixed values before setupBinaries/Toil read them.
    options.containerImage = None
    options.binariesMode = None
    options.root = None
    options.latest = None
    options.database = "kyoto_tycoon"

    setupBinaries(options)
    setLoggingFromOptions(options)
    enableDumpStack()

    # Mess with some toil options to create useful defaults.
    cactus_override_toil_options(options)

    # Turn the overrides into a dict (chrom name -> core count) for easy lookup per chromosome.
    cores_overrides = {}
    if options.alignCoresOverrides:
        for o in options.alignCoresOverrides:
            try:
                chrom, cores = o.split(',')
                cores_overrides[chrom] = int(cores)
            except:
                # any malformed "chrom,cores" token is reported back to the user
                raise RuntimeError("Error parsing alignCoresOverrides \"{}\"".format(o))
    options.alignCoresOverrides = cores_overrides

    start_time = timeit.default_timer()
    with Toil(options) as toil:
        importSingularityImage(options)
        if options.restart:
            # resume a previous run from the jobstore
            results_dict = toil.restart()
        else:
            config_id = toil.importFile(makeURL(options.configFile))
            # load the chromfile into memory: each non-empty line is "chrom seqfile alnFile";
            # both files are imported into the jobstore up front.
            chrom_dict = {}
            with open(options.chromFile, 'r') as chrom_file:
                for line in chrom_file:
                    toks = line.strip().split()
                    if len(toks):
                        assert len(toks) == 3
                        chrom, seqfile, alnFile = toks[0], toks[1], toks[2]
                        chrom_dict[chrom] = toil.importFile(makeURL(seqfile)), toil.importFile(makeURL(alnFile))
            # one child alignment job per chromosome (toil-in-toil)
            results_dict = toil.start(Job.wrapJobFn(align_toil_batch, chrom_dict, config_id, options))

        # when using s3 output urls, things get checkpointed as they're made so no reason to export
        # todo: make a more unified interface throughout cactus for this
        # (see toil-vg's outstore logic which, while not perfect, would be an improvement)
        if not options.outHal.startswith('s3://'):
            if options.batch:
                # results tuple layout (established by align_toil_batch):
                # [0]=HAL, [1]=VG (optional), [2]=GFA (optional), [3]=log
                for chrom, results in results_dict.items():
                    toil.exportFile(results[0], makeURL(os.path.join(options.outHal, '{}.hal'.format(chrom))))
                    if options.outVG:
                        toil.exportFile(results[1], makeURL(os.path.join(options.outHal, '{}.vg'.format(chrom))))
                    if options.outGFA:
                        toil.exportFile(results[2], makeURL(os.path.join(options.outHal, '{}.gfa.gz'.format(chrom))))
                    toil.exportFile(results[3], makeURL(os.path.join(options.outHal, '{}.hal.log'.format(chrom))))

    end_time = timeit.default_timer()
    run_time = end_time - start_time
    logger.info("cactus-align-batch has finished after {} seconds".format(run_time))
def main():
    """Entry point for cactus-preprocess (seqfile-only variant).

    Reads an input and an output seqfile, resolves the per-genome fasta paths
    (optionally restricted to --inputNames), and stages the preprocessing
    workflow in Toil.

    Fix: the error raised for a genome missing from the seqfiles referenced
    an undefined name `inNmae` (NameError); corrected to `inName`.
    """
    parser = ArgumentParser()
    Job.Runner.addToilOptions(parser)
    parser.add_argument("seqFile", help="Input Seq file")
    parser.add_argument("outSeqFile", help="Output Seq file (ex generated with cactus-prepare)")
    parser.add_argument("--configFile",
                        default=os.path.join(cactusRootPath(), "cactus_progressive_config.xml"))
    parser.add_argument("--inputNames", nargs='*',
                        help='input genome names (not paths) to preprocess (all leaves from Input Seq file if none specified)')
    parser.add_argument("--latest", dest="latest", action="store_true",
                        help="Use the latest version of the docker container "
                        "rather than pulling one matching this version of cactus")
    parser.add_argument("--containerImage", dest="containerImage", default=None,
                        help="Use the the specified pre-built containter image "
                        "rather than pulling one from quay.io")
    parser.add_argument("--binariesMode", choices=["docker", "local", "singularity"],
                        help="The way to run the Cactus binaries", default=None)

    options = parser.parse_args()
    setupBinaries(options)
    setLoggingFromOptions(options)
    enableDumpStack()

    inSeqFile = SeqFile(options.seqFile)
    outSeqFile = SeqFile(options.outSeqFile)

    inNames = options.inputNames
    if not inNames:
        # default to every leaf genome in the input tree
        inNames = [inSeqFile.tree.getName(node) for node in inSeqFile.tree.getLeaves()]

    inSeqPaths = []
    outSeqPaths = []

    for inName in inNames:
        if inName not in inSeqFile.pathMap or inName not in outSeqFile.pathMap:
            # FIX: was .format(inNmae) — an undefined name that raised NameError
            # instead of the intended RuntimeError
            raise RuntimeError('{} not present in input and output Seq files'.format(inName))
        inPath = inSeqFile.pathMap[inName]
        outPath = outSeqFile.pathMap[inName]
        if os.path.isdir(inPath):
            # directory entry: mirror every sequence file it contains
            try:
                os.makedirs(outPath)
            except OSError:
                # output directory may already exist; anything worse trips the assert below
                pass
            assert os.path.isdir(inPath) == os.path.isdir(outPath)
            inSeqPaths += [os.path.join(inPath, seqPath) for seqPath in os.listdir(inPath)]
            outSeqPaths += [os.path.join(outPath, seqPath) for seqPath in os.listdir(inPath)]
        else:
            inSeqPaths += [inPath]
            outSeqPaths += [outPath]

    with Toil(options) as toil:
        stageWorkflow(outputSequenceDir=None,
                      configFile=options.configFile,
                      inputSequences=inSeqPaths,
                      toil=toil,
                      restart=options.restart,
                      outputSequences=outSeqPaths)
def main():
    """Entry point for cactus-preprocess with dna-brnn masking/clipping options.

    Supports the same two input modes as the plain preprocessor (seqfiles or
    explicit --inPaths/--outPaths), plus --maskAlpha/--clipAlpha repeatmasking,
    --maskPAF coverage-gap masking, and --ignore to skip genomes.

    Fixes:
      * `--ignore` validation compared `options.clipAlpha is None`, but
        clipAlpha is a store_true flag whose default is False, so the check
        could never fire; corrected to `not options.clipAlpha`.
      * typo in the --maskPAF error message ("wither" -> "either").
    """
    parser = ArgumentParser()
    Job.Runner.addToilOptions(parser)
    parser.add_argument("inSeqFile", type=str, nargs='?', default=None, help="Input Seq file")
    parser.add_argument("outSeqFile", type=str, nargs='?', default=None,
                        help="Output Seq file (ex generated with cactus-prepare)")
    parser.add_argument("--configFile",
                        default=os.path.join(cactusRootPath(), "cactus_progressive_config.xml"))
    parser.add_argument("--inputNames", nargs='*',
                        help='input genome names (not paths) to preprocess (all leaves from Input Seq file if none specified)')
    parser.add_argument("--inPaths", nargs='*',
                        help='Space-separated list of input fasta paths (to be used in place of --inSeqFile')
    parser.add_argument("--outPaths", nargs='*',
                        help='Space-separated list of output fasta paths (one for each inPath, used in place of --outSeqFile)')
    parser.add_argument("--maskAlpha", action='store_true',
                        help='Use dna-brnn instead of lastz for repeatmasking')
    parser.add_argument("--clipAlpha", action='store_true',
                        help='use dna-brnn instead of lastz for repeatmasking. Also, clip sequence using given minimum length instead of softmasking')
    parser.add_argument("--ignore", nargs='*',
                        help='Space-separate list of genomes from inSeqFile to ignore', default=[])
    parser.add_argument("--maskPAF", type=str,
                        help='Incorporate coverage gaps from given PAF when masking. Only implemented for dna-brnn masking')
    parser.add_argument("--brnnCores", type=int,
                        help='Specify number of cores for each dna-brnn job (overriding default value from the config)')
    parser.add_argument("--latest", dest="latest", action="store_true",
                        help="Use the latest version of the docker container "
                        "rather than pulling one matching this version of cactus")
    parser.add_argument("--containerImage", dest="containerImage", default=None,
                        help="Use the the specified pre-built containter image "
                        "rather than pulling one from quay.io")
    parser.add_argument("--binariesMode", choices=["docker", "local", "singularity"],
                        help="The way to run the Cactus binaries", default=None)

    options = parser.parse_args()
    setupBinaries(options)
    setLoggingFromOptions(options)
    enableDumpStack()

    # Mess with some toil options to create useful defaults.
    cactus_override_toil_options(options)

    # we have two modes: operate directly on paths or rely on the seqfiles. they cannot be mixed
    if options.inSeqFile or options.outSeqFile:
        if not options.inSeqFile or not options.outSeqFile or options.inPaths or options.outPaths:
            raise RuntimeError('--inSeqFile must be used in conjunction with --outSeqFile and not with --inPaths nor --outPaths')
    elif options.inPaths or options.outPaths:
        if not options.inPaths or not options.outPaths or options.inSeqFile or options.outSeqFile:
            raise RuntimeError('--inPaths must be used in conjunction with --outPaths and not with --inSeqFile nor --outSeqFile')
        if len(options.inPaths) != len(options.outPaths):
            raise RuntimeError('--inPaths and --outPaths must have the same number of arguments')
    else:
        raise RuntimeError('--inSeqFile/--outSeqFile/--inputNames or --inPaths/--outPaths required to specify input')

    if options.maskAlpha and options.clipAlpha:
        raise RuntimeError('--maskAlpha and --clipAlpha cannot be used together')
    if options.clipAlpha:
        # clipping implies dna-brnn masking
        options.maskAlpha = True
    if options.maskPAF and not options.inputNames and not options.inSeqFile:
        raise RuntimeError('--maskPAF requires event names specified either with an input seqfile or with --inputNames')
    # FIX: was `options.clipAlpha is None` — clipAlpha is store_true (default False),
    # so the condition was always False and this guard never fired.
    if options.ignore and not options.clipAlpha:
        raise RuntimeError('--ignore can only be used with --clipAlpha')

    inSeqPaths = []
    outSeqPaths = []
    inNames = options.inputNames
    eventNames = []

    #load cactus config
    configNode = ET.parse(options.configFile).getroot()

    #we never want to preprocess minigraph sequences
    graph_event = getOptionalAttrib(findRequiredNode(configNode, "graphmap"), "assemblyName",
                                    default="_MINIGRAPH_")
    options.ignore.append(graph_event)

    # mine the paths out of the seqfiles
    if options.inSeqFile:
        inSeqFile = SeqFile(options.inSeqFile)
        outSeqFile = SeqFile(options.outSeqFile)

        if not inNames:
            # default to every leaf genome in the input tree
            inNames = [inSeqFile.tree.getName(node) for node in inSeqFile.tree.getLeaves()]

        for inName in inNames:
            if inName in options.ignore:
                # "convenience" functionality: we let the --ignore option update the output seqfile
                # to reflect the fact that we're not touching the original input
                outSeqFile.pathMap[inName] = inSeqFile.pathMap[inName]
                continue
            if inName not in inSeqFile.pathMap or inName not in outSeqFile.pathMap:
                raise RuntimeError('{} not present in input and output Seq files'.format(inName))
            inPath = inSeqFile.pathMap[inName]
            outPath = outSeqFile.pathMap[inName]
            if os.path.isdir(inPath):
                # directory entry: mirror every sequence file it contains
                try:
                    os.makedirs(outPath)
                except OSError:
                    # output directory may already exist; anything worse trips the assert below
                    pass
                assert os.path.isdir(inPath) == os.path.isdir(outPath)
                inSeqPaths += [os.path.join(inPath, seqPath) for seqPath in os.listdir(inPath)]
                outSeqPaths += [os.path.join(outPath, seqPath) for seqPath in os.listdir(inPath)]
            else:
                inSeqPaths += [inPath]
                outSeqPaths += [outPath]
            eventNames.append(inName)

        if options.ignore:
            # see comment above
            with open(options.outSeqFile, 'w') as outSF:
                outSF.write(str(outSeqFile))

    # we got path names directly from the command line
    else:
        inSeqPaths = options.inPaths
        outSeqPaths = options.outPaths

    with Toil(options) as toil:
        stageWorkflow(outputSequenceDir=None,
                      configFile=options.configFile,
                      inputSequences=inSeqPaths,
                      toil=toil,
                      restart=options.restart,
                      outputSequences=outSeqPaths,
                      maskAlpha=options.maskAlpha,
                      clipAlpha=options.clipAlpha,
                      maskPAF=options.maskPAF,
                      inputEventNames=eventNames,
                      brnnCores=options.brnnCores)
def main():
    """Entry point for the align-after-blast step: build a HAL from cactus-blast output.

    Takes a seqfile, the blast output, and an output HAL path, forces the
    kyoto_tycoon database and HAL/fasta building, tweaks Toil defaults, then
    runs runCactusAfterBlastOnly.

    Fix: the completion log message said "cactus-blast has finished", but this
    tool is the alignment step (it consumes blast output and runs
    runCactusAfterBlastOnly; the sibling align entry point logs "cactus-align").
    Corrected the message.
    """
    parser = ArgumentParser()
    Job.Runner.addToilOptions(parser)
    addCactusWorkflowOptions(parser)
    parser.add_argument("seqFile", help="Seq file")
    parser.add_argument("blastOutput", type=str, help="Blast output (from cactus-blast)")
    parser.add_argument("outputHal", type=str, help="Output HAL file")
    #Progressive Cactus Options
    parser.add_argument("--configFile", dest="configFile",
                        help="Specify cactus configuration file",
                        default=os.path.join(cactusRootPath(), "cactus_progressive_config.xml"))
    parser.add_argument("--root", dest="root", help="Name of ancestral node (which"
                        " must appear in NEWICK tree in <seqfile>) to use as a "
                        "root for the alignment. Any genomes not below this node "
                        "in the tree may be used as outgroups but will never appear"
                        " in the output. If no root is specifed then the root"
                        " of the tree is used. ", default=None, required=True)
    parser.add_argument("--latest", dest="latest", action="store_true",
                        help="Use the latest version of the docker container "
                        "rather than pulling one matching this version of cactus")
    parser.add_argument("--containerImage", dest="containerImage", default=None,
                        help="Use the the specified pre-built containter image "
                        "rather than pulling one from quay.io")
    parser.add_argument("--binariesMode", choices=["docker", "local", "singularity"],
                        help="The way to run the Cactus binaries", default=None)
    parser.add_argument("--nonBlastInput", action="store_true",
                        help="Input does not come from cactus-blast: Do not append ids to fasta names")

    options = parser.parse_args()
    setupBinaries(options)
    setLoggingFromOptions(options)
    enableDumpStack()

    # this step always builds the HAL + fasta, backed by kyoto_tycoon
    options.database = 'kyoto_tycoon'
    options.buildHal = True
    options.buildFasta = True

    # Mess with some toil options to create useful defaults.
    # Caching generally slows down the cactus workflow, plus some
    # methods like readGlobalFileStream don't support forced
    # reads directly from the job store rather than from cache.
    options.disableCaching = True
    # Job chaining breaks service termination timing, causing unused
    # databases to accumulate and waste memory for no reason.
    options.disableChaining = True
    if options.retryCount is None:
        # If the user didn't specify a retryCount value, make it 5
        # instead of Toil's default (1).
        options.retryCount = 5

    start_time = timeit.default_timer()
    runCactusAfterBlastOnly(options)
    end_time = timeit.default_timer()
    run_time = end_time - start_time
    # FIX: previously logged "cactus-blast has finished" (copy-paste from the blast entry point)
    logger.info("cactus-align has finished after {} seconds".format(run_time))
def main():
    """Command-line entry point for cactus-blast: compute pairwise alignments for a seqfile."""
    arg_parser = ArgumentParser()
    Job.Runner.addToilOptions(arg_parser)
    addCactusWorkflowOptions(arg_parser)

    # positional arguments
    arg_parser.add_argument("seqFile", help="Seq file")
    arg_parser.add_argument("outputFile", type=str, help="Output pairwise alignment file")

    # WDL-style path overrides (parallel lists, validated below)
    arg_parser.add_argument("--pathOverrides", nargs="*",
                            help="paths (multiple allowed) to override from seqFile")
    arg_parser.add_argument("--pathOverrideNames", nargs="*",
                            help="names (must be same number as --pathOverrides) of path overrides")

    #Progressive Cactus Options
    arg_parser.add_argument("--database", dest="database",
                            help="Database type: tokyo_cabinet or kyoto_tycoon"
                            " [default: %(default)s]",
                            default="kyoto_tycoon")
    arg_parser.add_argument("--configFile", dest="configFile",
                            help="Specify cactus configuration file",
                            default=os.path.join(cactusRootPath(), "cactus_progressive_config.xml"))
    arg_parser.add_argument("--root", dest="root", help="Name of ancestral node (which"
                            " must appear in NEWICK tree in <seqfile>) to use as a "
                            "root for the alignment. Any genomes not below this node "
                            "in the tree may be used as outgroups but will never appear"
                            " in the output. If no root is specifed then the root"
                            " of the tree is used. ", default=None, required=True)
    arg_parser.add_argument("--latest", dest="latest", action="store_true",
                            help="Use the latest version of the docker container "
                            "rather than pulling one matching this version of cactus")
    arg_parser.add_argument("--containerImage", dest="containerImage", default=None,
                            help="Use the the specified pre-built containter image "
                            "rather than pulling one from quay.io")
    arg_parser.add_argument("--binariesMode", choices=["docker", "local", "singularity"],
                            help="The way to run the Cactus binaries", default=None)

    options = arg_parser.parse_args()
    setupBinaries(options)
    setLoggingFromOptions(options)
    enableDumpStack()

    # --pathOverrides / --pathOverrideNames must be supplied together, same length
    overrides = options.pathOverrides
    override_names = options.pathOverrideNames
    if overrides or override_names:
        if not (overrides and override_names and len(override_names) == len(overrides)):
            raise RuntimeError('same number of values must be passed to --pathOverrides and --pathOverrideNames')

    # Mess with some toil options to create useful defaults.
    cactus_override_toil_options(options)

    t0 = timeit.default_timer()
    runCactusBlastOnly(options)
    elapsed = timeit.default_timer() - t0
    logger.info("cactus-blast has finished after {} seconds".format(elapsed))
def main():
    """Entry point for the full progressive Cactus alignment: seqfile in, HAL out.

    Parses the command line, enforces a >= 2 core requirement, forces the
    kyoto_tycoon database, adjusts several Toil defaults, then runs
    runCactusProgressive and logs the elapsed time.
    """
    parser = ArgumentParser()
    Job.Runner.addToilOptions(parser)
    addCactusWorkflowOptions(parser)
    parser.add_argument("seqFile", help="Seq file")
    parser.add_argument("outputHal", type=str, help="Output HAL file")
    #Progressive Cactus Options
    parser.add_argument("--configFile", dest="configFile",
                        help="Specify cactus configuration file",
                        default=os.path.join(cactusRootPath(), "cactus_progressive_config.xml"))
    parser.add_argument("--root", dest="root", help="Name of ancestral node (which"
                        " must appear in NEWICK tree in <seqfile>) to use as a "
                        "root for the alignment. Any genomes not below this node "
                        "in the tree may be used as outgroups but will never appear"
                        " in the output. If no root is specifed then the root"
                        " of the tree is used. ", default=None)
    parser.add_argument("--latest", dest="latest", action="store_true",
                        help="Use the latest version of the docker container "
                        "rather than pulling one matching this version of cactus")
    parser.add_argument("--containerImage", dest="containerImage", default=None,
                        help="Use the the specified pre-built containter image "
                        "rather than pulling one from quay.io")
    parser.add_argument("--binariesMode", choices=["docker", "local", "singularity"],
                        help="The way to run the Cactus binaries", default=None)

    options = parser.parse_args()
    setupBinaries(options)
    setLoggingFromOptions(options)
    enableDumpStack()

    # cactus doesn't run with 1 core
    if options.batchSystem == 'singleMachine':
        if options.maxCores is not None:
            if int(options.maxCores) < 2:
                raise RuntimeError('Cactus requires --maxCores > 1')
        else:
            # is there a way to get this out of Toil? That would be more consistent
            if cpu_count() < 2:
                raise RuntimeError('Only 1 CPU detected. Cactus requires at least 2')

    # tokyo_cabinet is no longer supported
    options.database = "kyoto_tycoon"

    # Mess with some toil options to create useful defaults.
    # Caching generally slows down the cactus workflow, plus some
    # methods like readGlobalFileStream don't support forced
    # reads directly from the job store rather than from cache.
    options.disableCaching = True
    # Job chaining breaks service termination timing, causing unused
    # databases to accumulate and waste memory for no reason.
    options.disableChaining = True
    # The default deadlockWait is currently 60 seconds. This can cause
    # issues if the database processes take a while to actually begin
    # after they're issued. Change it to at least an hour so that we
    # don't preemptively declare a deadlock.
    if options.deadlockWait is None or options.deadlockWait < 3600:
        options.deadlockWait = 3600
    if options.retryCount is None and options.batchSystem != 'singleMachine':
        # If the user didn't specify a retryCount value, make it 5
        # instead of Toil's default (1).
        options.retryCount = 5

    start_time = timeit.default_timer()
    runCactusProgressive(options)
    end_time = timeit.default_timer()
    run_time = end_time - start_time
    logger.info("Cactus has finished after {} seconds".format(run_time))
def main():
    """Entry point for cactus-align: build a HAL from a seqfile plus pairwise alignments.

    Consumes lastz cigars (or PAF with --pafInput) produced by cactus-blast,
    cactus-refmap or cactus-graphmap, validates core/override options, forces
    HAL + fasta building, and runs runCactusAfterBlastOnly.
    """
    parser = ArgumentParser()
    Job.Runner.addToilOptions(parser)
    addCactusWorkflowOptions(parser)
    parser.add_argument("seqFile", help="Seq file")
    parser.add_argument("cigarsFile", nargs="+",
                        help="Pairiwse aliginments (from cactus-blast, cactus-refmap or cactus-graphmap)")
    parser.add_argument("outputHal", type=str, help="Output HAL file")
    parser.add_argument("--pathOverrides", nargs="*",
                        help="paths (multiple allowd) to override from seqFile")
    parser.add_argument("--pathOverrideNames", nargs="*",
                        help="names (must be same number as --paths) of path overrides")
    #Progressive Cactus Options
    parser.add_argument("--configFile", dest="configFile",
                        help="Specify cactus configuration file",
                        default=os.path.join(cactusRootPath(), "cactus_progressive_config.xml"))
    parser.add_argument("--root", dest="root", help="Name of ancestral node (which"
                        " must appear in NEWICK tree in <seqfile>) to use as a "
                        "root for the alignment. Any genomes not below this node "
                        "in the tree may be used as outgroups but will never appear"
                        " in the output. If no root is specifed then the root"
                        " of the tree is used. ", default=None, required=True)
    parser.add_argument("--latest", dest="latest", action="store_true",
                        help="Use the latest version of the docker container "
                        "rather than pulling one matching this version of cactus")
    parser.add_argument("--containerImage", dest="containerImage", default=None,
                        help="Use the the specified pre-built containter image "
                        "rather than pulling one from quay.io")
    parser.add_argument("--binariesMode", choices=["docker", "local", "singularity"],
                        help="The way to run the Cactus binaries", default=None)
    parser.add_argument("--nonCactusInput", action="store_true",
                        help="Input lastz cigars do not come from cactus-blast or cactus-refmap: Prepend ids in cigars")
    parser.add_argument("--pangenome", action="store_true",
                        help="Override some CAF settings whose defaults are not suited to star trees")
    parser.add_argument("--pafInput", action="store_true",
                        help="'cigarsFile' arugment is in PAF format, rather than lastz cigars.")
    parser.add_argument("--database", choices=["kyoto_tycoon", "redis"],
                        help="The type of database", default="kyoto_tycoon")

    options = parser.parse_args()
    setupBinaries(options)
    setLoggingFromOptions(options)
    enableDumpStack()

    # --pathOverrides / --pathOverrideNames are parallel lists: both or neither, same length
    if (options.pathOverrides or options.pathOverrideNames):
        if not options.pathOverrides or not options.pathOverrideNames or \
           len(options.pathOverrideNames) != len(options.pathOverrides):
            raise RuntimeError('same number of values must be passed to --pathOverrides and --pathOverrideNames')

    # cactus doesn't run with 1 core
    if options.batchSystem == 'singleMachine':
        if options.maxCores is not None:
            if int(options.maxCores) < 2:
                raise RuntimeError('Cactus requires --maxCores > 1')
        else:
            # is there a way to get this out of Toil? That would be more consistent
            if cpu_count() < 2:
                raise RuntimeError('Only 1 CPU detected. Cactus requires at least 2')

    if options.pafInput:
        # cactus-graphmap does not do any prepending to simplify interface with minigraph node names
        # so it must be done here
        options.nonCactusInput = True

    # this step always builds the HAL + fasta
    options.buildHal = True
    options.buildFasta = True

    # Mess with some toil options to create useful defaults.
    cactus_override_toil_options(options)

    start_time = timeit.default_timer()
    runCactusAfterBlastOnly(options)
    end_time = timeit.default_timer()
    run_time = end_time - start_time
    logger.info("cactus-align has finished after {} seconds".format(run_time))
def main():
    """Entry point for cactus-align.

    Parses command-line options, validates them, then runs the alignment
    (optionally as a batch of per-chromosome jobs) under Toil and exports
    the resulting HAL (and optional VG/GFA) outputs.
    """
    parser = _make_arg_parser()
    options = parser.parse_args()

    setupBinaries(options)
    setLoggingFromOptions(options)
    enableDumpStack()

    _validate_and_prepare_options(options)

    # Mess with some toil options to create useful defaults.
    cactus_override_toil_options(options)

    # We set which type of unique ids to expect.  Numeric (from cactus-blast) or Eventname (cactus-refmap or cactus-graphmap)
    # This is a bit ugly, since we don't have a good way to differentiate refmap from blast, and use --pangenome as a proxy
    # But I don't think there's a real use case yet of making a separate parameter
    env_event_name = os.environ.get('CACTUS_EVENT_NAME_AS_UNIQUE_ID')
    if env_event_name is not None:
        # BUG FIX: the original referenced an undefined name `eventName` here,
        # raising NameError whenever the environment variable was set. The
        # env-var string itself is what gets parsed: empty or '0' means False.
        options.eventNameAsID = bool(env_event_name) and env_event_name != '0'
    else:
        options.eventNameAsID = options.pangenome or options.pafInput
    os.environ['CACTUS_EVENT_NAME_AS_UNIQUE_ID'] = str(int(options.eventNameAsID))

    start_time = timeit.default_timer()
    with Toil(options) as toil:
        importSingularityImage(options)
        if options.restart:
            results_dict = toil.restart()
        else:
            align_jobs = make_batch_align_jobs(options, toil)
            results_dict = toil.start(Job.wrapJobFn(run_batch_align_jobs, align_jobs))

        # when using s3 output urls, things get checkpointed as they're made so no reason to export
        # todo: make a more unified interface throughout cactus for this
        # (see toil-vg's outstore logic which, while not perfect, would be an improvement
        if not options.outHal.startswith('s3://'):
            _export_results(toil, options, results_dict)
    end_time = timeit.default_timer()

    run_time = end_time - start_time
    logger.info("cactus-align has finished after {} seconds".format(run_time))


def _make_arg_parser():
    """Build and return the ArgumentParser for cactus-align."""
    parser = ArgumentParser()
    Job.Runner.addToilOptions(parser)
    addCactusWorkflowOptions(parser)

    parser.add_argument("seqFile", help="Seq file")
    parser.add_argument("cigarsFile", nargs="*",
                        help="Pairwise alignments (from cactus-blast, cactus-refmap or cactus-graphmap)")
    parser.add_argument("outHal", type=str,
                        help="Output HAL file (or directory in --batch mode)")
    parser.add_argument("--pathOverrides", nargs="*",
                        help="paths (multiple allowed) to override from seqFile")
    parser.add_argument("--pathOverrideNames", nargs="*",
                        help="names (must be same number as --paths) of path overrides")

    # Pangenome Options
    parser.add_argument("--pangenome", action="store_true",
                        help="Activate pangenome mode (suitable for star trees of closely related samples) by overriding several configuration settings."
                             " The overridden configuration will be saved in <outHal>.pg-conf.xml")
    parser.add_argument("--pafInput", action="store_true",
                        help="'cigarsFile' argument is in PAF format, rather than lastz cigars.")
    parser.add_argument("--usePafSecondaries", action="store_true",
                        help="use the secondary alignments from the PAF input. They are ignored by default.")
    parser.add_argument("--singleCopySpecies", type=str,
                        help="Filter out all self-alignments in given species")
    parser.add_argument("--barMaskFilter", type=int, default=None,
                        help="BAR's POA aligner will ignore softmasked regions greater than this length. (overrides partialOrderAlignmentMaskFilter in config)")
    parser.add_argument("--outVG", action="store_true",
                        help="export pangenome graph in VG (.vg) in addition to HAL")
    parser.add_argument("--outGFA", action="store_true",
                        help="export pangenome graph in GFA (.gfa.gz) in addition to HAL")
    parser.add_argument("--batch", action="store_true",
                        help="Launch batch of alignments. Input seqfile is expected to be chromfile as generated by cactus-graphmap-split")
    parser.add_argument("--stagger", type=int, default=0,
                        help="Stagger alignment jobs in batch mode by this many seconds (to avoid starting all at once)")
    parser.add_argument("--acyclic", type=str,
                        help="Ensure that given genome is acyclic by deleting all paralogy edges in postprocessing")

    # Progressive Cactus Options
    parser.add_argument("--configFile", dest="configFile",
                        help="Specify cactus configuration file",
                        default=os.path.join(cactusRootPath(), "cactus_progressive_config.xml"))
    parser.add_argument("--root", dest="root", default=None,
                        help="Name of ancestral node (which"
                             " must appear in NEWICK tree in <seqfile>) to use as a "
                             "root for the alignment. Any genomes not below this node "
                             "in the tree may be used as outgroups but will never appear"
                             " in the output. If no root is specified then the root"
                             " of the tree is used. ")
    parser.add_argument("--latest", dest="latest", action="store_true",
                        help="Use the latest version of the docker container "
                             "rather than pulling one matching this version of cactus")
    parser.add_argument("--containerImage", dest="containerImage", default=None,
                        help="Use the specified pre-built container image "
                             "rather than pulling one from quay.io")
    parser.add_argument("--binariesMode", choices=["docker", "local", "singularity"],
                        help="The way to run the Cactus binaries", default=None)
    parser.add_argument("--nonCactusInput", action="store_true",
                        help="Input lastz cigars do not come from cactus-blast or cactus-refmap: Prepend ids in cigars")
    parser.add_argument("--database", choices=["kyoto_tycoon", "redis"],
                        help="The type of database", default="kyoto_tycoon")
    return parser


def _validate_and_prepare_options(options):
    """Sanity-check parsed options and fill in derived fields in place.

    Raises RuntimeError on invalid combinations; sets buildHal/buildFasta
    and checkpointInfo; creates the batch output directory if needed.
    """
    if options.pathOverrides or options.pathOverrideNames:
        if not options.pathOverrides or not options.pathOverrideNames or \
           len(options.pathOverrideNames) != len(options.pathOverrides):
            raise RuntimeError('same number of values must be passed to --pathOverrides and --pathOverrideNames')

    # cactus doesn't run with 1 core
    if options.batchSystem == 'singleMachine':
        if options.maxCores is not None:
            if int(options.maxCores) < 2:
                raise RuntimeError('Cactus requires --maxCores > 1')
        else:
            # is there a way to get this out of Toil? That would be more consistent
            if cpu_count() < 2:
                raise RuntimeError('Only 1 CPU detected. Cactus requires at least 2')

    options.buildHal = True
    options.buildFasta = True

    if options.outHal.startswith('s3://'):
        if not has_s3:
            raise RuntimeError("S3 support requires toil to be installed with [aws]")
        # write a little something to the bucket now to catch any glaring problems asap
        test_file = os.path.join(getTempDirectory(), 'check')
        with open(test_file, 'w') as test_o:
            test_o.write("\n")
        region = get_aws_region(options.jobStore) if options.jobStore.startswith('aws:') else None
        write_s3(test_file,
                 options.outHal if options.outHal.endswith('.hal') else os.path.join(options.outHal, 'test'),
                 region=region)
        options.checkpointInfo = (get_aws_region(options.jobStore), options.outHal)
    else:
        options.checkpointInfo = None

    if options.batch:
        # the output hal is a directory, make sure it's there
        if not os.path.isdir(options.outHal):
            os.makedirs(options.outHal)
        assert len(options.cigarsFile) == 0, "cigarsFile arguments are not allowed in --batch mode"
    else:
        assert len(options.cigarsFile) > 0, "at least one cigarsFile argument is required"


def _export_results(toil, options, results_dict):
    """Export HAL (and optional VG/GFA) outputs from the finished workflow.

    In --batch mode results_dict maps chromosome name -> (hal, vg, gfa) file
    IDs and each is written under the outHal directory; otherwise the single
    result (keyed by None) is written to the outHal path itself.
    """
    if options.batch:
        for chrom, results in results_dict.items():
            toil.exportFile(results[0], makeURL(os.path.join(options.outHal, '{}.hal'.format(chrom))))
            if options.outVG:
                toil.exportFile(results[1], makeURL(os.path.join(options.outHal, '{}.vg'.format(chrom))))
            if options.outGFA:
                toil.exportFile(results[2], makeURL(os.path.join(options.outHal, '{}.gfa.gz'.format(chrom))))
    else:
        assert len(results_dict) == 1 and None in results_dict
        halID, vgID, gfaID = results_dict[None][0], results_dict[None][1], results_dict[None][2]
        # export the hal
        toil.exportFile(halID, makeURL(options.outHal))
        # export the vg
        if options.outVG:
            toil.exportFile(vgID, makeURL(os.path.splitext(options.outHal)[0] + '.vg'))
        if options.outGFA:
            toil.exportFile(gfaID, makeURL(os.path.splitext(options.outHal)[0] + '.gfa.gz'))