def main():
    # Command-line entry point for cactus-graphmap: map the sequences in a seq
    # file onto a minigraph GFA and emit the alignments as a PAF file.
    arg_parser = ArgumentParser()
    Job.Runner.addToilOptions(arg_parser)
    addCactusWorkflowOptions(arg_parser)
    arg_parser.add_argument("seqFile", help="Seq file (will be modified if necessary to include graph Fasta sequence)")
    arg_parser.add_argument("minigraphGFA", help="Minigraph-compatible reference graph in GFA format (can be gzipped)")
    arg_parser.add_argument("outputPAF", type=str, help="Output pairwise alignment file in PAF format")
    arg_parser.add_argument("--outputFasta", type=str,
                            help="Output graph sequence file in FASTA format (required if not present in seqFile)")
    arg_parser.add_argument("--maskFilter", type=int,
                            help="Ignore softmasked sequence intervals > Nbp (overrides config option of same name)")
    arg_parser.add_argument("--outputGAFDir", type=str,
                            help="Output GAF alignments (raw minigraph output before PAF conversion) to this directory")
    arg_parser.add_argument("--refFromGFA", type=str,
                            help="Do not align given genome from seqfile, and instead extract its alignment from the rGFA tags (must have been used as reference for minigraph GFA construction)")
    # WDL hacks
    arg_parser.add_argument("--pathOverrides", nargs="*",
                            help="paths (multiple allowed) to override from seqFile")
    arg_parser.add_argument("--pathOverrideNames", nargs="*",
                            help="names (must be same number as --pathOverrides) of path overrides")
    # Progressive Cactus Options
    arg_parser.add_argument("--configFile", dest="configFile",
                            help="Specify cactus configuration file",
                            default=os.path.join(cactusRootPath(), "cactus_progressive_config.xml"))
    arg_parser.add_argument("--latest", dest="latest", action="store_true",
                            help="Use the latest version of the docker container "
                            "rather than pulling one matching this version of cactus")
    arg_parser.add_argument("--containerImage", dest="containerImage", default=None,
                            help="Use the the specified pre-built containter image "
                            "rather than pulling one from quay.io")
    arg_parser.add_argument("--binariesMode", choices=["docker", "local", "singularity"],
                            help="The way to run the Cactus binaries", default=None)

    opts = arg_parser.parse_args()
    setupBinaries(opts)
    setLoggingFromOptions(opts)
    enableDumpStack()

    # Create the raw-GAF output directory up front so the workflow can write into it.
    if opts.outputGAFDir and not os.path.isdir(opts.outputGAFDir):
        os.makedirs(opts.outputGAFDir)

    # The two override lists must be supplied together, one name per path.
    if opts.pathOverrides or opts.pathOverrideNames:
        if not opts.pathOverrides or not opts.pathOverrideNames or \
           len(opts.pathOverrideNames) != len(opts.pathOverrides):
            raise RuntimeError('same number of values must be passed to --pathOverrides and --pathOverrideNames')

    # Mess with some toil options to create useful defaults.
    cactus_override_toil_options(opts)

    t0 = timeit.default_timer()
    runCactusGraphMap(opts)
    elapsed = timeit.default_timer() - t0
    logger.info("cactus-graphmap has finished after {} seconds".format(elapsed))
def main():
    # Command-line entry point for progressive cactus: parse options, verify the
    # machine has enough cores, then run the full progressive alignment.
    arg_parser = ArgumentParser()
    Job.Runner.addToilOptions(arg_parser)
    addCactusWorkflowOptions(arg_parser)
    arg_parser.add_argument("seqFile", help="Seq file")
    arg_parser.add_argument("outputHal", type=str, help="Output HAL file")
    # Progressive Cactus Options
    arg_parser.add_argument("--configFile", dest="configFile",
                            help="Specify cactus configuration file",
                            default=os.path.join(cactusRootPath(), "cactus_progressive_config.xml"))
    arg_parser.add_argument("--root", dest="root",
                            help="Name of ancestral node (which"
                            " must appear in NEWICK tree in <seqfile>) to use as a "
                            "root for the alignment. Any genomes not below this node "
                            "in the tree may be used as outgroups but will never appear"
                            " in the output. If no root is specifed then the root"
                            " of the tree is used. ", default=None)
    arg_parser.add_argument("--latest", dest="latest", action="store_true",
                            help="Use the latest version of the docker container "
                            "rather than pulling one matching this version of cactus")
    arg_parser.add_argument("--containerImage", dest="containerImage", default=None,
                            help="Use the the specified pre-built containter image "
                            "rather than pulling one from quay.io")
    arg_parser.add_argument("--binariesMode", choices=["docker", "local", "singularity"],
                            help="The way to run the Cactus binaries", default=None)
    arg_parser.add_argument("--database", choices=["kyoto_tycoon", "redis"],
                            help="The type of database", default="kyoto_tycoon")

    opts = arg_parser.parse_args()
    setupBinaries(opts)
    setLoggingFromOptions(opts)
    enableDumpStack()

    # cactus doesn't run with 1 core
    if opts.batchSystem == 'singleMachine':
        if opts.maxCores is not None:
            if int(opts.maxCores) < 2:
                raise RuntimeError('Cactus requires --maxCores > 1')
        # is there a way to get this out of Toil? That would be more consistent
        elif cpu_count() < 2:
            raise RuntimeError('Only 1 CPU detected. Cactus requires at least 2')

    # Mess with some toil options to create useful defaults.
    cactus_override_toil_options(opts)

    t0 = timeit.default_timer()
    runCactusProgressive(opts)
    elapsed = timeit.default_timer() - t0
    logger.info("Cactus has finished after {} seconds".format(elapsed))
def main():
    """Entry point for cactus-graphmap-join.

    Post-processes and merges per-chromosome vg graphs (and optionally their HAL
    files) into the final pangenome outputs via runCactusGraphMapJoin.
    """
    parser = ArgumentParser()
    Job.Runner.addToilOptions(parser)
    addCactusWorkflowOptions(parser)
    parser.add_argument("--vg", required=True, nargs='+',
                        help="Input vg files (PackedGraph or HashGraph format)")
    parser.add_argument("--outDir", required=True, type=str, help="Output directory")
    parser.add_argument("--outName", required=True, type=str, help="Basename of all output files")
    parser.add_argument("--reference", required=True, type=str, help="Reference event name")
    parser.add_argument("--vcfReference", type=str,
                        help="Reference event for VCF (if different from --reference)")
    parser.add_argument("--rename", nargs='+', default=[],
                        help="Path renaming, each of form src>dest (see clip-vg -r)")
    parser.add_argument("--clipLength", type=int, default=None,
                        help="clip out unaligned sequences longer than this")
    parser.add_argument("--wlineSep", type=str, help="wline separator for vg convert")
    parser.add_argument("--indexCores", type=int, default=1, help="cores for indexing processes")
    parser.add_argument("--decoyGraph",
                        help="decoy sequences vg graph to add (PackedGraph or HashGraph format)")
    parser.add_argument("--hal", nargs='+', default=[],
                        help="Input hal files (for merging)")
    # Progressive Cactus Options
    parser.add_argument("--configFile", dest="configFile", help="Specify cactus configuration file",
                        default=os.path.join(cactusRootPath(), "cactus_progressive_config.xml"))
    parser.add_argument("--latest", dest="latest", action="store_true",
                        help="Use the latest version of the docker container "
                        "rather than pulling one matching this version of cactus")
    parser.add_argument("--containerImage", dest="containerImage", default=None,
                        help="Use the specified pre-built container image "
                        "rather than pulling one from quay.io")
    parser.add_argument("--binariesMode", choices=["docker", "local", "singularity"],
                        help="The way to run the Cactus binaries", default=None)

    options = parser.parse_args()
    setupBinaries(options)
    setLoggingFromOptions(options)
    enableDumpStack()

    # Local (non-S3) output directories must exist before anything is exported into them.
    if options.outDir and not options.outDir.startswith('s3://'):
        if not os.path.isdir(options.outDir):
            os.makedirs(options.outDir)

    # When HAL files are supplied, they must pair up one-to-one with the vg files.
    # (Fixed garbled error message: was "If --hal and --vg should specify ...".)
    if options.hal and len(options.hal) != len(options.vg):
        raise RuntimeError("--hal and --vg must specify the same number of files")

    # Mess with some toil options to create useful defaults.
    cactus_override_toil_options(options)

    start_time = timeit.default_timer()
    runCactusGraphMapJoin(options)
    run_time = timeit.default_timer() - start_time
    logger.info("cactus-graphmap-join has finished after {} seconds".format(run_time))
def main():
    # Command-line entry point for cactus-graphmap-split: split a graphmap PAF
    # and its sequences by reference contig via runCactusGraphMapSplit.
    arg_parser = ArgumentParser()
    Job.Runner.addToilOptions(arg_parser)
    addCactusWorkflowOptions(arg_parser)
    arg_parser.add_argument("seqFile", help="Seq file (gzipped fastas supported)")
    arg_parser.add_argument("minigraphGFA", help="Minigraph-compatible reference graph in GFA format (can be gzipped)")
    arg_parser.add_argument("graphmapPAF", type=str,
                            help="Output pairwise alignment file in PAF format (can be gzipped)")
    arg_parser.add_argument("--outDir", required=True, type=str, help="Output directory")
    arg_parser.add_argument("--refContigs", nargs="*", default=[],
                            help="Subset to these reference contigs (multiple allowed)")
    arg_parser.add_argument("--refContigsFile", type=str,
                            help="Subset to (newline-separated) reference contigs in this file")
    arg_parser.add_argument("--otherContig", type=str,
                            help="Lump all reference contigs unselected by above options into single one with this name")
    arg_parser.add_argument("--reference", type=str,
                            help="Name of reference (in seqFile). Ambiguity filters will not be applied to it")
    arg_parser.add_argument("--maskFilter", type=int,
                            help="Ignore softmasked sequence intervals > Nbp")
    # Progressive Cactus Options
    arg_parser.add_argument("--configFile", dest="configFile",
                            help="Specify cactus configuration file",
                            default=os.path.join(cactusRootPath(), "cactus_progressive_config.xml"))
    arg_parser.add_argument("--latest", dest="latest", action="store_true",
                            help="Use the latest version of the docker container "
                            "rather than pulling one matching this version of cactus")
    arg_parser.add_argument("--containerImage", dest="containerImage", default=None,
                            help="Use the the specified pre-built containter image "
                            "rather than pulling one from quay.io")
    arg_parser.add_argument("--binariesMode", choices=["docker", "local", "singularity"],
                            help="The way to run the Cactus binaries", default=None)

    opts = arg_parser.parse_args()
    setupBinaries(opts)
    setLoggingFromOptions(opts)
    enableDumpStack()

    # Make sure a local output directory exists (S3 URLs need no local directory).
    if opts.outDir and not opts.outDir.startswith('s3://') and not os.path.isdir(opts.outDir):
        os.makedirs(opts.outDir)

    # Mess with some toil options to create useful defaults.
    cactus_override_toil_options(opts)

    t0 = timeit.default_timer()
    runCactusGraphMapSplit(opts)
    elapsed = timeit.default_timer() - t0
    logger.info("cactus-graphmap-split has finished after {} seconds".format(elapsed))
def main():
    """Entry point for cactus-align.

    Converts pairwise alignments (lastz cigars or PAF) plus a seq file into a
    HAL alignment via runCactusAfterBlastOnly.
    """
    parser = ArgumentParser()
    Job.Runner.addToilOptions(parser)
    addCactusWorkflowOptions(parser)
    parser.add_argument("seqFile", help="Seq file")
    # Typos fixed in user-facing help text ("Pairiwse aliginments", "allowd",
    # "arugment"); references to a nonexistent "--paths" option corrected.
    parser.add_argument("cigarsFile", nargs="+",
                        help="Pairwise alignments (from cactus-blast, cactus-refmap or cactus-graphmap)")
    parser.add_argument("outputHal", type=str, help="Output HAL file")
    parser.add_argument("--pathOverrides", nargs="*",
                        help="paths (multiple allowed) to override from seqFile")
    parser.add_argument("--pathOverrideNames", nargs="*",
                        help="names (must be same number as --pathOverrides) of path overrides")
    # Progressive Cactus Options
    parser.add_argument("--configFile", dest="configFile",
                        help="Specify cactus configuration file",
                        default=os.path.join(cactusRootPath(), "cactus_progressive_config.xml"))
    parser.add_argument("--root", dest="root",
                        help="Name of ancestral node (which"
                        " must appear in NEWICK tree in <seqfile>) to use as a "
                        "root for the alignment. Any genomes not below this node "
                        "in the tree may be used as outgroups but will never appear"
                        " in the output. If no root is specified then the root"
                        " of the tree is used. ", default=None, required=True)
    parser.add_argument("--latest", dest="latest", action="store_true",
                        help="Use the latest version of the docker container "
                        "rather than pulling one matching this version of cactus")
    parser.add_argument("--containerImage", dest="containerImage", default=None,
                        help="Use the specified pre-built container image "
                        "rather than pulling one from quay.io")
    parser.add_argument("--binariesMode", choices=["docker", "local", "singularity"],
                        help="The way to run the Cactus binaries", default=None)
    parser.add_argument("--nonCactusInput", action="store_true",
                        help="Input lastz cigars do not come from cactus-blast or cactus-refmap: Prepend ids in cigars")
    parser.add_argument("--pangenome", action="store_true",
                        help="Override some CAF settings whose defaults are not suited to star trees")
    parser.add_argument("--pafInput", action="store_true",
                        help="'cigarsFile' argument is in PAF format, rather than lastz cigars.")
    parser.add_argument("--database", choices=["kyoto_tycoon", "redis"],
                        help="The type of database", default="kyoto_tycoon")

    options = parser.parse_args()
    setupBinaries(options)
    setLoggingFromOptions(options)
    enableDumpStack()

    # --pathOverrides/--pathOverrideNames must be supplied together, one name per path.
    if options.pathOverrides or options.pathOverrideNames:
        if not options.pathOverrides or not options.pathOverrideNames or \
           len(options.pathOverrideNames) != len(options.pathOverrides):
            raise RuntimeError('same number of values must be passed to --pathOverrides and --pathOverrideNames')

    # cactus doesn't run with 1 core
    if options.batchSystem == 'singleMachine':
        if options.maxCores is not None:
            if int(options.maxCores) < 2:
                raise RuntimeError('Cactus requires --maxCores > 1')
        else:
            # is there a way to get this out of Toil? That would be more consistent
            if cpu_count() < 2:
                raise RuntimeError('Only 1 CPU detected. Cactus requires at least 2')

    if options.pafInput:
        # cactus-graphmap does not do any prepending to simplify interface with minigraph node names
        # so it must be done here
        options.nonCactusInput = True

    options.buildHal = True
    options.buildFasta = True

    # Mess with some toil options to create useful defaults.
    cactus_override_toil_options(options)

    start_time = timeit.default_timer()
    runCactusAfterBlastOnly(options)
    run_time = timeit.default_timer() - start_time
    logger.info("cactus-align has finished after {} seconds".format(run_time))
def main():
    """Entry point for cactus-align (blast-output variant).

    Converts cactus-blast output (lastz cigars or PAF) plus a seq file into a
    HAL alignment via runCactusAfterBlastOnly.
    """
    parser = ArgumentParser()
    Job.Runner.addToilOptions(parser)
    addCactusWorkflowOptions(parser)
    parser.add_argument("seqFile", help="Seq file")
    parser.add_argument("blastOutput", nargs="+", help="Blast output (from cactus-blast)")
    parser.add_argument("outputHal", type=str, help="Output HAL file")
    # Typos fixed in help text ("allowd"); references to a nonexistent "--paths"
    # option corrected to --pathOverrides.
    parser.add_argument("--pathOverrides", nargs="*",
                        help="paths (multiple allowed) to override from seqFile")
    parser.add_argument("--pathOverrideNames", nargs="*",
                        help="names (must be same number as --pathOverrides) of path overrides")
    # Progressive Cactus Options
    parser.add_argument("--configFile", dest="configFile",
                        help="Specify cactus configuration file",
                        default=os.path.join(cactusRootPath(), "cactus_progressive_config.xml"))
    parser.add_argument("--root", dest="root",
                        help="Name of ancestral node (which"
                        " must appear in NEWICK tree in <seqfile>) to use as a "
                        "root for the alignment. Any genomes not below this node "
                        "in the tree may be used as outgroups but will never appear"
                        " in the output. If no root is specified then the root"
                        " of the tree is used. ", default=None, required=True)
    parser.add_argument("--latest", dest="latest", action="store_true",
                        help="Use the latest version of the docker container "
                        "rather than pulling one matching this version of cactus")
    parser.add_argument("--containerImage", dest="containerImage", default=None,
                        help="Use the specified pre-built container image "
                        "rather than pulling one from quay.io")
    parser.add_argument("--binariesMode", choices=["docker", "local", "singularity"],
                        help="The way to run the Cactus binaries", default=None)
    parser.add_argument("--nonBlastInput", action="store_true",
                        help="Input does not come from cactus-blast: Do not append ids to fasta names")
    parser.add_argument("--nonBlastMegablockFilter", action="store_true",
                        help="By default, the megablock filter is off for --nonBlastInput, as it does not play"
                        "nicely with reference-based alignments. This flag will turn it back on")
    parser.add_argument("--pafInput", action="store_true",
                        help="'blastOutput' input is in paf format, rather than lastz cigars.")

    options = parser.parse_args()
    setupBinaries(options)
    setLoggingFromOptions(options)
    enableDumpStack()

    # --pathOverrides/--pathOverrideNames must be supplied together, one name per path.
    if options.pathOverrides or options.pathOverrideNames:
        if not options.pathOverrides or not options.pathOverrideNames or \
           len(options.pathOverrideNames) != len(options.pathOverrides):
            raise RuntimeError('same number of values must be passed to --pathOverrides and --pathOverrideNames')

    # cactus doesn't run with 1 core
    if options.batchSystem == 'singleMachine':
        if options.maxCores is not None:
            if int(options.maxCores) < 2:
                raise RuntimeError('Cactus requires --maxCores > 1')
        else:
            # is there a way to get this out of Toil? That would be more consistent
            if cpu_count() < 2:
                raise RuntimeError('Only 1 CPU detected. Cactus requires at least 2')

    # tokyo_cabinet is no longer supported
    # (removed a duplicate assignment of the same value that appeared twice)
    options.database = "kyoto_tycoon"
    options.buildHal = True
    options.buildFasta = True

    # Mess with some toil options to create useful defaults.
    cactus_override_toil_options(options)

    start_time = timeit.default_timer()
    runCactusAfterBlastOnly(options)
    run_time = timeit.default_timer() - start_time
    logger.info("cactus-align has finished after {} seconds".format(run_time))
def main_batch():
    """ this is a bit like cactus-align --batch except it will use toil-in-toil to assign each chromosome to a machine.
    pros: much less chance of a problem with one chromosome affecting anything else
          more forgiving for inexact resource specs
          could be ported to Terra
    cons: less efficient use of resources
    """
    parser = ArgumentParser()
    Job.Runner.addToilOptions(parser)
    addCactusWorkflowOptions(parser)
    parser.add_argument("chromFile", help="chroms file")
    parser.add_argument("outHal", type=str, help="Output directory (can be s3://)")
    parser.add_argument("--alignOptions", type=str,
                        help="Options to pass through to cactus-align (don't forget to wrap in quotes)")
    parser.add_argument("--alignCores", type=int, help="Number of cores per align job")
    parser.add_argument("--alignCoresOverrides", nargs="*",
                        help="Override align job cores for a chromosome. Space-separated list of chrom,cores pairs expected")
    parser.add_argument("--configFile", dest="configFile",
                        help="Specify cactus configuration file",
                        default=os.path.join(cactusRootPath(), "cactus_progressive_config.xml"))

    options = parser.parse_args()

    # These options are not exposed on this command line but are expected by the
    # shared machinery downstream, so fill in neutral defaults.
    options.containerImage = None
    options.binariesMode = None
    options.root = None
    options.latest = None
    options.database = "kyoto_tycoon"

    setupBinaries(options)
    setLoggingFromOptions(options)
    enableDumpStack()

    # Mess with some toil options to create useful defaults.
    cactus_override_toil_options(options)

    # Turn the overrides into a dict
    cores_overrides = {}
    if options.alignCoresOverrides:
        for o in options.alignCoresOverrides:
            try:
                chrom, cores = o.split(',')
                cores_overrides[chrom] = int(cores)
            except ValueError as e:
                # FIX: was a bare "except:", which also swallowed SystemExit /
                # KeyboardInterrupt; only malformed "chrom,cores" pairs raise ValueError.
                raise RuntimeError("Error parsing alignCoresOverrides \"{}\"".format(o)) from e
    options.alignCoresOverrides = cores_overrides

    start_time = timeit.default_timer()
    with Toil(options) as toil:
        importSingularityImage(options)
        if options.restart:
            results_dict = toil.restart()
        else:
            config_id = toil.importFile(makeURL(options.configFile))
            # load the chromfile into memory
            chrom_dict = {}
            with open(options.chromFile, 'r') as chrom_file:
                for line in chrom_file:
                    toks = line.strip().split()
                    if len(toks):
                        assert len(toks) == 3
                        chrom, seqfile, alnFile = toks[0], toks[1], toks[2]
                        chrom_dict[chrom] = toil.importFile(makeURL(seqfile)), toil.importFile(makeURL(alnFile))
            results_dict = toil.start(Job.wrapJobFn(align_toil_batch, chrom_dict, config_id, options))

        # when using s3 output urls, things get checkpointed as they're made so no reason to export
        # todo: make a more unified interface throughout cactus for this
        # (see toil-vg's outstore logic which, while not perfect, would be an improvement
        if not options.outHal.startswith('s3://'):
            # NOTE(review): options.batch / options.outVG / options.outGFA are never
            # assigned in this function -- presumably they arrive via the parsed
            # --alignOptions; verify against align_toil_batch before relying on this.
            if options.batch:
                for chrom, results in results_dict.items():
                    toil.exportFile(results[0], makeURL(os.path.join(options.outHal, '{}.hal'.format(chrom))))
                    if options.outVG:
                        toil.exportFile(results[1], makeURL(os.path.join(options.outHal, '{}.vg'.format(chrom))))
                    if options.outGFA:
                        toil.exportFile(results[2], makeURL(os.path.join(options.outHal, '{}.gfa.gz'.format(chrom))))
                    toil.exportFile(results[3], makeURL(os.path.join(options.outHal, '{}.hal.log'.format(chrom))))

    run_time = timeit.default_timer() - start_time
    logger.info("cactus-align-batch has finished after {} seconds".format(run_time))
def main():
    """Entry point for cactus-align (batch-capable pangenome variant).

    Aligns pairwise cigars/PAF into HAL (and optionally vg/GFA), either for a
    single alignment or, with --batch, one alignment per chromosome.
    """
    parser = ArgumentParser()
    Job.Runner.addToilOptions(parser)
    addCactusWorkflowOptions(parser)
    parser.add_argument("seqFile", help="Seq file")
    # Help-text typos fixed throughout ("Pairiwse aliginments", "allowd",
    # "arugment", "grpah", "graphmap-slit", "is cyclic" -> "is acyclic").
    parser.add_argument("cigarsFile", nargs="*",
                        help="Pairwise alignments (from cactus-blast, cactus-refmap or cactus-graphmap)")
    parser.add_argument("outHal", type=str, help="Output HAL file (or directory in --batch mode)")
    parser.add_argument("--pathOverrides", nargs="*",
                        help="paths (multiple allowed) to override from seqFile")
    parser.add_argument("--pathOverrideNames", nargs="*",
                        help="names (must be same number as --pathOverrides) of path overrides")
    # Pangenome Options
    parser.add_argument("--pangenome", action="store_true",
                        help="Activate pangenome mode (suitable for star trees of closely related samples) by overriding several configuration settings."
                        " The overridden configuration will be saved in <outHal>.pg-conf.xml")
    parser.add_argument("--pafInput", action="store_true",
                        help="'cigarsFile' argument is in PAF format, rather than lastz cigars.")
    parser.add_argument("--usePafSecondaries", action="store_true",
                        help="use the secondary alignments from the PAF input. They are ignored by default.")
    parser.add_argument("--singleCopySpecies", type=str,
                        help="Filter out all self-alignments in given species")
    parser.add_argument("--barMaskFilter", type=int, default=None,
                        help="BAR's POA aligner will ignore softmasked regions greater than this length. (overrides partialOrderAlignmentMaskFilter in config)")
    parser.add_argument("--outVG", action="store_true",
                        help="export pangenome graph in VG (.vg) in addition to HAL")
    parser.add_argument("--outGFA", action="store_true",
                        help="export pangenome graph in GFA (.gfa.gz) in addition to HAL")
    parser.add_argument("--batch", action="store_true",
                        help="Launch batch of alignments. Input seqfile is expected to be chromfile as generated by cactus-graphmap-split")
    parser.add_argument("--stagger", type=int, default=0,
                        help="Stagger alignment jobs in batch mode by this many seconds (to avoid starting all at once)")
    parser.add_argument("--acyclic", type=str,
                        help="Ensure that given genome is acyclic by deleting all paralogy edges in postprocessing")
    # Progressive Cactus Options
    parser.add_argument("--configFile", dest="configFile",
                        help="Specify cactus configuration file",
                        default=os.path.join(cactusRootPath(), "cactus_progressive_config.xml"))
    parser.add_argument("--root", dest="root",
                        help="Name of ancestral node (which"
                        " must appear in NEWICK tree in <seqfile>) to use as a "
                        "root for the alignment. Any genomes not below this node "
                        "in the tree may be used as outgroups but will never appear"
                        " in the output. If no root is specified then the root"
                        " of the tree is used. ", default=None)
    parser.add_argument("--latest", dest="latest", action="store_true",
                        help="Use the latest version of the docker container "
                        "rather than pulling one matching this version of cactus")
    parser.add_argument("--containerImage", dest="containerImage", default=None,
                        help="Use the specified pre-built container image "
                        "rather than pulling one from quay.io")
    parser.add_argument("--binariesMode", choices=["docker", "local", "singularity"],
                        help="The way to run the Cactus binaries", default=None)
    parser.add_argument("--nonCactusInput", action="store_true",
                        help="Input lastz cigars do not come from cactus-blast or cactus-refmap: Prepend ids in cigars")
    parser.add_argument("--database", choices=["kyoto_tycoon", "redis"],
                        help="The type of database", default="kyoto_tycoon")

    options = parser.parse_args()
    setupBinaries(options)
    setLoggingFromOptions(options)
    enableDumpStack()

    # --pathOverrides/--pathOverrideNames must be supplied together, one name per path.
    if options.pathOverrides or options.pathOverrideNames:
        if not options.pathOverrides or not options.pathOverrideNames or \
           len(options.pathOverrideNames) != len(options.pathOverrides):
            raise RuntimeError('same number of values must be passed to --pathOverrides and --pathOverrideNames')

    # cactus doesn't run with 1 core
    if options.batchSystem == 'singleMachine':
        if options.maxCores is not None:
            if int(options.maxCores) < 2:
                raise RuntimeError('Cactus requires --maxCores > 1')
        else:
            # is there a way to get this out of Toil? That would be more consistent
            if cpu_count() < 2:
                raise RuntimeError('Only 1 CPU detected. Cactus requires at least 2')

    options.buildHal = True
    options.buildFasta = True

    if options.outHal.startswith('s3://'):
        if not has_s3:
            raise RuntimeError("S3 support requires toil to be installed with [aws]")
        # write a little something to the bucket now to catch any glaring problems asap
        test_file = os.path.join(getTempDirectory(), 'check')
        with open(test_file, 'w') as test_o:
            test_o.write("\n")
        region = get_aws_region(options.jobStore) if options.jobStore.startswith('aws:') else None
        write_s3(test_file, options.outHal if options.outHal.endswith('.hal') else os.path.join(options.outHal, 'test'), region=region)
        options.checkpointInfo = (get_aws_region(options.jobStore), options.outHal)
    else:
        options.checkpointInfo = None

    if options.batch:
        # the output hal is a directory, make sure it's there
        if not os.path.isdir(options.outHal):
            os.makedirs(options.outHal)
        assert len(options.cigarsFile) == 0
    else:
        assert len(options.cigarsFile) > 0

    # Mess with some toil options to create useful defaults.
    cactus_override_toil_options(options)

    # We set which type of unique ids to expect: Numeric (from cactus-blast) or
    # Eventname (cactus-refmap or cactus-graphmap).
    # This is a bit ugly, since we don't have a good way to differentiate refmap from blast, and use --pangenome as a proxy
    # But I don't think there's a real use case yet of making a separate parameter
    event_name_env = os.environ.get('CACTUS_EVENT_NAME_AS_UNIQUE_ID')
    if event_name_env is not None:
        # BUG FIX: the original referenced an undefined name ("eventName") here,
        # raising NameError whenever CACTUS_EVENT_NAME_AS_UNIQUE_ID was set.
        # The env value is falsy/'0' -> False, anything else -> True.
        options.eventNameAsID = bool(event_name_env) and event_name_env != '0'
    else:
        options.eventNameAsID = options.pangenome or options.pafInput
    os.environ['CACTUS_EVENT_NAME_AS_UNIQUE_ID'] = str(int(options.eventNameAsID))

    start_time = timeit.default_timer()
    with Toil(options) as toil:
        importSingularityImage(options)
        if options.restart:
            results_dict = toil.restart()
        else:
            align_jobs = make_batch_align_jobs(options, toil)
            results_dict = toil.start(Job.wrapJobFn(run_batch_align_jobs, align_jobs))

        # when using s3 output urls, things get checkpointed as they're made so no reason to export
        # todo: make a more unified interface throughout cactus for this
        # (see toil-vg's outstore logic which, while not perfect, would be an improvement
        if not options.outHal.startswith('s3://'):
            if options.batch:
                for chrom, results in results_dict.items():
                    toil.exportFile(results[0], makeURL(os.path.join(options.outHal, '{}.hal'.format(chrom))))
                    if options.outVG:
                        toil.exportFile(results[1], makeURL(os.path.join(options.outHal, '{}.vg'.format(chrom))))
                    if options.outGFA:
                        toil.exportFile(results[2], makeURL(os.path.join(options.outHal, '{}.gfa.gz'.format(chrom))))
            else:
                assert len(results_dict) == 1 and None in results_dict
                halID, vgID, gfaID = results_dict[None][0], results_dict[None][1], results_dict[None][2]
                # export the hal
                toil.exportFile(halID, makeURL(options.outHal))
                # export the vg
                if options.outVG:
                    toil.exportFile(vgID, makeURL(os.path.splitext(options.outHal)[0] + '.vg'))
                if options.outGFA:
                    toil.exportFile(gfaID, makeURL(os.path.splitext(options.outHal)[0] + '.gfa.gz'))

    run_time = timeit.default_timer() - start_time
    logger.info("cactus-align has finished after {} seconds".format(run_time))
def main():
    """Entry point for cactus-preprocess.

    Repeat-masks (lastz or dna-brnn) the input genomes, operating either on an
    input/output seqfile pair or directly on --inPaths/--outPaths lists, then
    runs the preprocessing workflow via stageWorkflow.
    """
    parser = ArgumentParser()
    Job.Runner.addToilOptions(parser)
    parser.add_argument("inSeqFile", type=str, nargs='?', default=None, help="Input Seq file")
    parser.add_argument("outSeqFile", type=str, nargs='?', default=None,
                        help="Output Seq file (ex generated with cactus-prepare)")
    parser.add_argument("--configFile",
                        default=os.path.join(cactusRootPath(), "cactus_progressive_config.xml"))
    parser.add_argument("--inputNames", nargs='*',
                        help='input genome names (not paths) to preprocess (all leaves from Input Seq file if none specified)')
    parser.add_argument("--inPaths", nargs='*',
                        help='Space-separated list of input fasta paths (to be used in place of --inSeqFile')
    parser.add_argument("--outPaths", nargs='*',
                        help='Space-separated list of output fasta paths (one for each inPath, used in place of --outSeqFile)')
    parser.add_argument("--maskAlpha", action='store_true',
                        help='Use dna-brnn instead of lastz for repeatmasking')
    parser.add_argument("--clipAlpha", action='store_true',
                        help='use dna-brnn instead of lastz for repeatmasking. Also, clip sequence using given minimum length instead of softmasking')
    parser.add_argument("--ignore", nargs='*', default=[],
                        help='Space-separated list of genomes from inSeqFile to ignore')
    parser.add_argument("--maskPAF", type=str,
                        help='Incorporate coverage gaps from given PAF when masking. Only implemented for dna-brnn masking')
    parser.add_argument("--brnnCores", type=int,
                        help='Specify number of cores for each dna-brnn job (overriding default value from the config)')
    parser.add_argument("--latest", dest="latest", action="store_true",
                        help="Use the latest version of the docker container "
                        "rather than pulling one matching this version of cactus")
    parser.add_argument("--containerImage", dest="containerImage", default=None,
                        help="Use the specified pre-built container image "
                        "rather than pulling one from quay.io")
    parser.add_argument("--binariesMode", choices=["docker", "local", "singularity"],
                        help="The way to run the Cactus binaries", default=None)

    options = parser.parse_args()
    setupBinaries(options)
    setLoggingFromOptions(options)
    enableDumpStack()

    # Mess with some toil options to create useful defaults.
    cactus_override_toil_options(options)

    # we have two modes: operate directly on paths or rely on the seqfiles. they cannot be mixed
    if options.inSeqFile or options.outSeqFile:
        if not options.inSeqFile or not options.outSeqFile or options.inPaths or options.outPaths:
            raise RuntimeError('--inSeqFile must be used in conjunction with --outSeqFile and not with --inPaths nor --outPaths')
    elif options.inPaths or options.outPaths:
        if not options.inPaths or not options.outPaths or options.inSeqFile or options.outSeqFile:
            raise RuntimeError('--inPaths must be used in conjunction with --outPaths and not with --inSeqFile nor --outSeqFile')
        if len(options.inPaths) != len(options.outPaths):
            raise RuntimeError('--inPaths and --outPaths must have the same number of arguments')
    else:
        raise RuntimeError('--inSeqFile/--outSeqFile/--inputNames or --inPaths/--outPaths required to specify input')

    if options.maskAlpha and options.clipAlpha:
        raise RuntimeError('--maskAlpha and --clipAlpha cannot be used together')
    if options.clipAlpha:
        # clipping implies dna-brnn masking
        options.maskAlpha = True
    if options.maskPAF and not options.inputNames and not options.inSeqFile:
        raise RuntimeError('--maskPAF requires event names specified either with an input seqfile or with --inputNames')
    # BUG FIX: clipAlpha is a store_true flag (False by default, never None), so
    # the original "options.clipAlpha is None" check could never trigger.
    if options.ignore and not options.clipAlpha:
        raise RuntimeError('--ignore can only be used with --clipAlpha')

    inSeqPaths = []
    outSeqPaths = []
    inNames = options.inputNames
    eventNames = []

    # load cactus config
    configNode = ET.parse(options.configFile).getroot()
    # we never want to preprocess minigraph sequences
    graph_event = getOptionalAttrib(findRequiredNode(configNode, "graphmap"), "assemblyName", default="_MINIGRAPH_")
    options.ignore.append(graph_event)

    # mine the paths out of the seqfiles
    if options.inSeqFile:
        inSeqFile = SeqFile(options.inSeqFile)
        outSeqFile = SeqFile(options.outSeqFile)
        if not inNames:
            inNames = [inSeqFile.tree.getName(node) for node in inSeqFile.tree.getLeaves()]
        for inName in inNames:
            if inName in options.ignore:
                # "convenience" functionality: we let the --ignore option update the output seqfile
                # to reflect the fact that we're not touching the original input
                outSeqFile.pathMap[inName] = inSeqFile.pathMap[inName]
                continue
            if inName not in inSeqFile.pathMap or inName not in outSeqFile.pathMap:
                raise RuntimeError('{} not present in input and output Seq files'.format(inName))
            inPath = inSeqFile.pathMap[inName]
            outPath = outSeqFile.pathMap[inName]
            if os.path.isdir(inPath):
                try:
                    os.makedirs(outPath)
                except OSError:
                    # FIX: narrowed from a bare "except:"; directory may already exist
                    pass
                assert os.path.isdir(inPath) == os.path.isdir(outPath)
                inSeqPaths += [os.path.join(inPath, seqPath) for seqPath in os.listdir(inPath)]
                outSeqPaths += [os.path.join(outPath, seqPath) for seqPath in os.listdir(inPath)]
            else:
                inSeqPaths += [inPath]
                outSeqPaths += [outPath]
            eventNames.append(inName)
        if options.ignore:
            # see comment above
            with open(options.outSeqFile, 'w') as outSF:
                outSF.write(str(outSeqFile))
    # we got path names directly from the command line
    else:
        inSeqPaths = options.inPaths
        outSeqPaths = options.outPaths

    with Toil(options) as toil:
        stageWorkflow(outputSequenceDir=None,
                      configFile=options.configFile,
                      inputSequences=inSeqPaths,
                      toil=toil,
                      restart=options.restart,
                      outputSequences=outSeqPaths,
                      maskAlpha=options.maskAlpha,
                      clipAlpha=options.clipAlpha,
                      maskPAF=options.maskPAF,
                      inputEventNames=eventNames,
                      brnnCores=options.brnnCores)
def main():
    """Command-line entry point for cactus-blast.

    Parses Toil + Cactus workflow options, validates the --pathOverrides /
    --pathOverrideNames pairing, then runs the blast-only alignment stage
    and logs the elapsed wall-clock time.
    """
    parser = ArgumentParser()
    Job.Runner.addToilOptions(parser)
    addCactusWorkflowOptions(parser)

    parser.add_argument("seqFile", help="Seq file")
    parser.add_argument("outputFile", type=str,
                        help="Output pairwise alignment file")
    parser.add_argument("--pathOverrides", nargs="*",
                        help="paths (multiple allowed) to override from seqFile")
    parser.add_argument(
        "--pathOverrideNames", nargs="*",
        help="names (must be same number as --pathOverrides) of path overrides")

    # Progressive Cactus Options
    parser.add_argument("--database", dest="database",
                        help="Database type: tokyo_cabinet or kyoto_tycoon"
                        " [default: %(default)s]",
                        default="kyoto_tycoon")
    parser.add_argument("--configFile", dest="configFile",
                        help="Specify cactus configuration file",
                        default=os.path.join(cactusRootPath(),
                                             "cactus_progressive_config.xml"))
    # NOTE(review): --root is required=True, so the original help text's claim
    # that "if no root is specifed then the root of the tree is used" could
    # never apply; that misleading (and misspelled) sentence has been removed.
    parser.add_argument(
        "--root", dest="root",
        help="Name of ancestral node (which"
        " must appear in NEWICK tree in <seqfile>) to use as a "
        "root for the alignment. Any genomes not below this node "
        "in the tree may be used as outgroups but will never appear"
        " in the output.",
        default=None, required=True)
    parser.add_argument(
        "--latest", dest="latest", action="store_true",
        help="Use the latest version of the docker container "
        "rather than pulling one matching this version of cactus")
    parser.add_argument(
        "--containerImage", dest="containerImage", default=None,
        help="Use the specified pre-built container image "
        "rather than pulling one from quay.io")
    parser.add_argument("--binariesMode",
                        choices=["docker", "local", "singularity"],
                        help="The way to run the Cactus binaries",
                        default=None)

    options = parser.parse_args()

    setupBinaries(options)
    setLoggingFromOptions(options)
    enableDumpStack()

    # --pathOverrides and --pathOverrideNames must be given together, with the
    # same number of values.
    if (options.pathOverrides or options.pathOverrideNames):
        if not options.pathOverrides or not options.pathOverrideNames or \
           len(options.pathOverrideNames) != len(options.pathOverrides):
            raise RuntimeError(
                'same number of values must be passed to --pathOverrides and --pathOverrideNames'
            )

    # Mess with some toil options to create useful defaults.
    cactus_override_toil_options(options)

    start_time = timeit.default_timer()
    runCactusBlastOnly(options)
    end_time = timeit.default_timer()
    run_time = end_time - start_time
    logger.info("cactus-blast has finished after {} seconds".format(run_time))
def main():
    """Command-line entry point for cactus-preprocess.

    Two mutually exclusive input modes:
      * seqfile mode: --inSeqFile/--outSeqFile (optionally filtered by
        --inputNames) — input/output fasta paths are mined from the seqfiles;
      * path mode: explicit --inPaths/--outPaths lists.
    The collected sequence paths are then handed to stageWorkflow inside a
    Toil context.
    """
    parser = ArgumentParser()
    Job.Runner.addToilOptions(parser)
    parser.add_argument("inSeqFile", type=str, nargs='?', default=None,
                        help="Input Seq file")
    parser.add_argument(
        "outSeqFile", type=str, nargs='?', default=None,
        help="Output Seq file (ex generated with cactus-prepare)")
    parser.add_argument("--configFile",
                        default=os.path.join(cactusRootPath(),
                                             "cactus_progressive_config.xml"))
    parser.add_argument(
        "--inputNames", nargs='*',
        help='input genome names (not paths) to preprocess (all leaves from Input Seq file if none specified)'
    )
    parser.add_argument(
        "--inPaths", nargs='*',
        help='Space-separated list of input fasta paths (to be used in place of --inSeqFile'
    )
    parser.add_argument(
        "--outPaths", nargs='*',
        help='Space-separated list of output fasta paths (one for each inPath, used in place of --outSeqFile)'
    )
    parser.add_argument("--maskAlpha", action='store_true',
                        help='Use dna-brnn instead of lastz for repeatmasking')
    parser.add_argument(
        "--latest", dest="latest", action="store_true",
        help="Use the latest version of the docker container "
        "rather than pulling one matching this version of cactus")
    parser.add_argument(
        "--containerImage", dest="containerImage", default=None,
        help="Use the specified pre-built container image "
        "rather than pulling one from quay.io")
    parser.add_argument("--binariesMode",
                        choices=["docker", "local", "singularity"],
                        help="The way to run the Cactus binaries",
                        default=None)

    options = parser.parse_args()

    setupBinaries(options)
    setLoggingFromOptions(options)
    enableDumpStack()

    # Mess with some toil options to create useful defaults.
    cactus_override_toil_options(options)

    # we have two modes: operate directly on paths or rely on the seqfiles.
    # they cannot be mixed
    if options.inSeqFile or options.outSeqFile:
        if not options.inSeqFile or not options.outSeqFile or options.inPaths or options.outPaths:
            raise RuntimeError(
                '--inSeqFile must be used in conjunction with --outSeqFile and not with --inPaths nor --outPaths'
            )
    elif options.inPaths or options.outPaths:
        if not options.inPaths or not options.outPaths or options.inSeqFile or options.outSeqFile or options.inputNames:
            raise RuntimeError(
                '--inPaths must be used in conjunction with --outPaths and not with --inSeqFile, --outSeqFile nor --inputNames'
            )
        if len(options.inPaths) != len(options.outPaths):
            raise RuntimeError(
                '--inPaths and --outPaths must have the same number of arguments'
            )
    else:
        raise RuntimeError(
            '--inSeqFile/--outSeqFile/--inputNames or --inPaths/--outPaths required to specify input'
        )

    inSeqPaths = []
    outSeqPaths = []

    # mine the paths out of the seqfiles
    if options.inSeqFile:
        inSeqFile = SeqFile(options.inSeqFile)
        outSeqFile = SeqFile(options.outSeqFile)

        # default to preprocessing every leaf genome of the input tree
        inNames = options.inputNames
        if not inNames:
            inNames = [
                inSeqFile.tree.getName(node)
                for node in inSeqFile.tree.getLeaves()
            ]

        for inName in inNames:
            if inName not in inSeqFile.pathMap or inName not in outSeqFile.pathMap:
                # BUG FIX: the original formatted this message with the
                # undefined name `inNmae`, raising NameError instead of the
                # intended RuntimeError.
                raise RuntimeError(
                    '{} not present in input and output Seq files'.format(
                        inName))
            inPath = inSeqFile.pathMap[inName]
            outPath = outSeqFile.pathMap[inName]
            if os.path.isdir(inPath):
                # best-effort creation of the output directory; an already
                # existing directory is fine (narrowed from a bare except)
                try:
                    os.makedirs(outPath)
                except OSError:
                    pass
                assert os.path.isdir(inPath) == os.path.isdir(outPath)
                # a directory entry expands to one in/out pair per file in it
                inSeqPaths += [
                    os.path.join(inPath, seqPath)
                    for seqPath in os.listdir(inPath)
                ]
                outSeqPaths += [
                    os.path.join(outPath, seqPath)
                    for seqPath in os.listdir(inPath)
                ]
            else:
                inSeqPaths += [inPath]
                outSeqPaths += [outPath]

    # we got path names directly from the command line
    else:
        inSeqPaths = options.inPaths
        outSeqPaths = options.outPaths

    with Toil(options) as toil:
        stageWorkflow(outputSequenceDir=None,
                      configFile=options.configFile,
                      inputSequences=inSeqPaths,
                      toil=toil,
                      restart=options.restart,
                      outputSequences=outSeqPaths,
                      maskAlpha=options.maskAlpha)