def runCactusProgressive(options):
    with Toil(options) as toil:
        importSingularityImage()
        #Run the workflow
        if options.restart:
            halID = toil.restart()
        else:
            options.cactusDir = getTempDirectory()
            #Create the progressive cactus project
            projWrapper = ProjectWrapper(options)
            projWrapper.writeXml()

            pjPath = os.path.join(options.cactusDir, ProjectWrapper.alignmentDirName,
                                  '%s_project.xml' % ProjectWrapper.alignmentDirName)
            assert os.path.exists(pjPath)

            project = MultiCactusProject()

            if not os.path.isdir(options.cactusDir):
                os.makedirs(options.cactusDir)

            project.readXML(pjPath)
            #import the sequences
            for genome, seq in project.inputSequenceMap.items():
                if os.path.isdir(seq):
                    tmpSeq = getTempFile()
                    catFiles([os.path.join(seq, subSeq) for subSeq in os.listdir(seq)], tmpSeq)
                    seq = tmpSeq
                seq = makeURL(seq)
                project.inputSequenceIDMap[genome] = toil.importFile(seq)

            #import cactus config
            if options.configFile:
                cactusConfigID = toil.importFile(makeURL(options.configFile))
            else:
                cactusConfigID = toil.importFile(makeURL(project.getConfigPath()))
            project.setConfigID(cactusConfigID)

            project.syncToFileStore(toil)
            configNode = ET.parse(project.getConfigPath()).getroot()
            configWrapper = ConfigWrapper(configNode)
            configWrapper.substituteAllPredefinedConstantsWithLiterals()

            project.writeXML(pjPath)
            halID = toil.start(RunCactusPreprocessorThenProgressiveDown(options, project,
                                                                        memory=configWrapper.getDefaultMemory()))

        toil.exportFile(halID, makeURL(options.outputHal))
def main():
    parser = ArgumentParser()
    parser.add_argument("seqFile", help="Seq file")
    parser.add_argument("outSeqDir", help='Directory where the processed leaf sequence and ancestral sequences will be placed')
    parser.add_argument("outSeqFile", help="Path for annotated Seq file output")
    parser.add_argument("outputHal", type=str, help="Output HAL file")
    parser.add_argument("--configFile", default=os.path.join(cactusRootPath(), "cactus_progressive_config.xml"))
    parser.add_argument("--preprocessBatchSize", type=int, default=3, help="size (number of genomes) of suggested preprocessing jobs")
    parser.add_argument("--jobStore", type=str, default="$JOBSTORE", help="jobstore to use in suggested commands")
    parser.add_argument("--halOptions", type=str, default="--hdf5InMemory", help="options for every hal command")
    parser.add_argument("--cactusOptions", type=str, default="--realTimeLogging --logInfo", help="options for every cactus command")
    parser.add_argument("--preprocessOnly", action="store_true", help="only decompose into preprocessor and cactus jobs")

    options = parser.parse_args()
    options.database = 'kyoto_tycoon'
    #todo support root option
    options.root = None

    # need to go through this garbage (copied from the main() in progressive_cactus) to
    # come up with the project
    options.cactusDir = getTempDirectory()
    #Create the progressive cactus project
    projWrapper = ProjectWrapper(options, options.configFile)
    projWrapper.writeXml()

    pjPath = os.path.join(options.cactusDir, ProjectWrapper.alignmentDirName,
                          '%s_project.xml' % ProjectWrapper.alignmentDirName)
    assert os.path.exists(pjPath)

    project = MultiCactusProject()

    if not os.path.isdir(options.cactusDir):
        os.makedirs(options.cactusDir)

    project.readXML(pjPath)

    enableDumpStack()
    cactusPrepare(options, project)
def runCactusAfterBlastOnly(options):
    with Toil(options) as toil:
        importSingularityImage(options)
        #Run the workflow
        if options.restart:
            halID = toil.restart()
        else:
            options.cactusDir = getTempDirectory()

            # apply path overrides. this was necessary for wdl which doesn't take kindly to
            # text files of local paths (ie seqfile). one way to fix would be to add support
            # for s3 paths and force wdl to use it. a better way would be a more fundamental
            # interface shift away from files of paths throughout all of cactus
            if options.pathOverrides:
                seqFile = SeqFile(options.seqFile)
                configNode = ET.parse(options.configFile).getroot()
                config = ConfigWrapper(configNode)
                tree = MultiCactusTree(seqFile.tree)
                tree.nameUnlabeledInternalNodes(prefix=config.getDefaultInternalNodePrefix())
                for name, override in zip(options.pathOverrideNames, options.pathOverrides):
                    seqFile.pathMap[name] = override
                override_seq = os.path.join(options.cactusDir, 'seqFile.override')
                with open(override_seq, 'w') as out_sf:
                    out_sf.write(str(seqFile))
                options.seqFile = override_seq

            #to be consistent with all-in-one cactus, we make sure the project
            #isn't limiting itself to the subtree (todo: parameterize so root can
            #be passed through from prepare to blast/align)
            proj_options = copy.deepcopy(options)
            proj_options.root = None
            #Create the progressive cactus project (as we do in runCactusProgressive)
            projWrapper = ProjectWrapper(proj_options, proj_options.configFile, ignoreSeqPaths=options.root)
            projWrapper.writeXml()

            pjPath = os.path.join(options.cactusDir, ProjectWrapper.alignmentDirName,
                                  '%s_project.xml' % ProjectWrapper.alignmentDirName)
            assert os.path.exists(pjPath)

            project = MultiCactusProject()

            if not os.path.isdir(options.cactusDir):
                os.makedirs(options.cactusDir)

            project.readXML(pjPath)

            # open up the experiment (as we do in ProgressiveUp.run)
            # note that we copy the path into the options here
            experimentFile = project.expMap[options.root]
            expXml = ET.parse(experimentFile).getroot()
            experiment = ExperimentWrapper(expXml)
            configPath = experiment.getConfigPath()
            configXml = ET.parse(configPath).getroot()
            seqIDMap = dict()
            tree = MultiCactusTree(experiment.getTree()).extractSubTree(options.root)
            leaves = [tree.getName(leaf) for leaf in tree.getLeaves()]
            outgroups = experiment.getOutgroupGenomes()
            genome_set = set(leaves + outgroups)

            # this is a hack to allow specifying all the input on the command line, rather than using suffix lookups
            def get_input_path(suffix=''):
                base_path = options.cigarsFile[0]
                for input_path in options.cigarsFile:
                    if suffix and input_path.endswith(suffix):
                        return input_path
                    if os.path.basename(base_path).startswith(os.path.basename(input_path)):
                        base_path = input_path
                return base_path + suffix

            # import the outgroups
            outgroupIDs = []
            outgroup_fragment_found = False
            for i, outgroup in enumerate(outgroups):
                try:
                    outgroupID = toil.importFile(makeURL(get_input_path('.og_fragment_{}'.format(i))))
                    outgroupIDs.append(outgroupID)
                    experiment.setSequenceID(outgroup, outgroupID)
                    outgroup_fragment_found = True
                    assert not options.pangenome
                except:
                    # we assume that input is not coming from cactus blast, so we'll treat output
                    # sequences normally and not go looking for fragments
                    outgroupIDs = []
                    break

            #import the sequences (that we need to align for the given event, ie leaves and outgroups)
            for genome, seq in list(project.inputSequenceMap.items()):
                if genome in leaves or (not outgroup_fragment_found and genome in outgroups):
                    if os.path.isdir(seq):
                        tmpSeq = getTempFile()
                        catFiles([os.path.join(seq, subSeq) for subSeq in os.listdir(seq)], tmpSeq)
                        seq = tmpSeq
                    seq = makeURL(seq)
                    experiment.setSequenceID(genome, toil.importFile(seq))

            if not outgroup_fragment_found:
                outgroupIDs = [experiment.getSequenceID(outgroup) for outgroup in outgroups]

            # write back the experiment, as CactusWorkflowArguments wants a path
            experiment.writeXML(experimentFile)

            #import cactus config
            if options.configFile:
                cactusConfigID = toil.importFile(makeURL(options.configFile))
            else:
                cactusConfigID = toil.importFile(makeURL(project.getConfigPath()))
            project.setConfigID(cactusConfigID)

            project.syncToFileStore(toil)
            configNode = ET.parse(project.getConfigPath()).getroot()
            configWrapper = ConfigWrapper(configNode)
            configWrapper.substituteAllPredefinedConstantsWithLiterals()

            if options.pangenome:
                # turn off the megablock filter as it ruins non-all-to-all alignments
                configWrapper.disableCafMegablockFilter()
                # the recoverable chains parameter does not seem to play nicely with star-like alignments either
                #configWrapper.disableRecoverableChains()

            workFlowArgs = CactusWorkflowArguments(options, experimentFile=experimentFile,
                                                   configNode=configNode, seqIDMap=project.inputSequenceIDMap)

            #import the files that cactus-blast made
            workFlowArgs.alignmentsID = toil.importFile(makeURL(get_input_path()))
            workFlowArgs.secondaryAlignmentsID = None
            if not options.pafInput:
                try:
                    workFlowArgs.secondaryAlignmentsID = toil.importFile(makeURL(get_input_path('.secondary')))
                except:
                    pass
            workFlowArgs.outgroupFragmentIDs = outgroupIDs
            workFlowArgs.ingroupCoverageIDs = []
            if outgroup_fragment_found and len(outgroups) > 0:
                for i in range(len(leaves)):
                    workFlowArgs.ingroupCoverageIDs.append(
                        toil.importFile(makeURL(get_input_path('.ig_coverage_{}'.format(i)))))

            halID = toil.start(Job.wrapJobFn(run_cactus_align, configWrapper, workFlowArgs, project,
                                             doRenaming=options.nonCactusInput, pafInput=options.pafInput))

        # export the hal
        toil.exportFile(halID, makeURL(options.outputHal))
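# Illustrative note (not part of the original source): the suffix lookup performed by
# get_input_path() above can be pictured with hypothetical inputs. If the command were
# invoked with --cigarsFile out/blast.cigars out/blast.cigars.secondary, then:
#
#   get_input_path()                  -> 'out/blast.cigars'                 (the path whose basename is a prefix of the others becomes the base)
#   get_input_path('.secondary')      -> 'out/blast.cigars.secondary'       (explicit suffix match)
#   get_input_path('.og_fragment_0')  -> 'out/blast.cigars.og_fragment_0'   (fallback: base_path + suffix)
#
# The file names here are assumptions used only for illustration.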
def main():
    parser = ArgumentParser()
    parser.add_argument("seqFile", help="Seq file")
    parser.add_argument("--outDir", help='Directory where the processed leaf sequence and ancestral sequences will be placed.'
                        ' Required when not using --wdl')
    parser.add_argument("--outSeqFile", help="Path for annotated Seq file output [default: outDir/seqFile]")
    parser.add_argument("--outHal", help="Output HAL file [default: outDir/out.hal]")
    parser.add_argument("--wdl", action="store_true", help="output wdl workflow instead of list of commands")
    parser.add_argument("--noLocalInputs", action="store_true", help="don't embed local input paths in WDL script (as they will need"
                        " to be respecified when running on Terra)")
    parser.add_argument("--configFile", default=os.path.join(cactusRootPath(), "cactus_progressive_config.xml"))
    parser.add_argument("--preprocessBatchSize", type=int, default=3, help="size (number of genomes) of suggested preprocessing jobs")
    parser.add_argument("--jobStore", type=str, default="./jobstore", help="base directory of jobStores to use in suggested commands")
    parser.add_argument("--halOptions", type=str, default="--hdf5InMemory", help="options for every hal command")
    parser.add_argument("--cactusOptions", type=str, default="--realTimeLogging --logInfo", help="options for every cactus command")
    parser.add_argument("--preprocessOnly", action="store_true", help="only decompose into preprocessor and cactus jobs")
    parser.add_argument("--dockerImage", type=str, help="docker image to use as wdl runtime")
    parser.add_argument("--gpu", action="store_true", help="use gpu-enabled lastz in cactus-blast")
    parser.add_argument("--gpuType", default="nvidia-tesla-v100", help="GPU type (to set in WDL runtime parameters)")
    parser.add_argument("--gpuCount", default=1, help="GPU count (to set in WDL runtime parameters)")
    parser.add_argument("--nvidiaDriver", default="440.64.00", help="Nvidia driver version")
    parser.add_argument("--gpuZone", default="us-central1-c", help="zone used for gpu task")
    parser.add_argument("--zone", default="us-west2-a", help="zone used for all but gpu tasks")
    parser.add_argument("--defaultCores", type=int, help="Number of cores for each job unless otherwise specified")
    parser.add_argument("--preprocessCores", type=int, help="Number of cores for each cactus-preprocess job")
    parser.add_argument("--blastCores", type=int, help="Number of cores for each cactus-blast job")
    parser.add_argument("--alignCores", type=int, help="Number of cores for each cactus-align job")
    parser.add_argument("--defaultMem", type=float, help="Memory in GB for each job unless otherwise specified")
    parser.add_argument("--preprocessMem", type=float, help="Memory in GB for each cactus-preprocess job")
    parser.add_argument("--blastMem", type=float, help="Memory in GB for each cactus-blast job")
    parser.add_argument("--alignMem", type=float, help="Memory in GB for each cactus-align job")
    parser.add_argument("--defaultDisk", type=int, help="Disk in GB for each job unless otherwise specified")
    parser.add_argument("--preprocessDisk", type=int, help="Disk in GB for each cactus-preprocess job")
    parser.add_argument("--blastDisk", type=int, help="Disk in GB for each cactus-blast job")
    parser.add_argument("--alignDisk", type=int, help="Disk in GB for each cactus-align job")
    parser.add_argument("--halAppendDisk", type=int, help="Disk in GB for each halAppendSubtree job")
    parser.add_argument("--preprocessPreemptible", type=int, help="Preemptible attempt count for each cactus-preprocess job [default=2]", default=2)
    parser.add_argument("--blastPreemptible", type=int, help="Preemptible attempt count for each cactus-blast job [default=1]", default=1)
    parser.add_argument("--alignPreemptible", type=int, help="Preemptible attempt count for each cactus-align job [default=1]", default=1)
    parser.add_argument("--halAppendPreemptible", type=int, help="Preemptible attempt count for each halAppendSubtree job [default=1]", default=1)

    options = parser.parse_args()
    options.database = 'kyoto_tycoon'
    #todo support root option
    options.root = None

    if not options.wdl:
        if not options.outDir:
            raise RuntimeError("--outDir option required when not using --wdl")
        if not options.outSeqFile:
            options.outSeqFile = os.path.join(options.outDir, os.path.basename(options.seqFile))
            if os.path.abspath(options.seqFile) == os.path.abspath(options.outSeqFile):
                options.outSeqFile += '.1'

    if (not options.wdl or not options.gpu) and (options.gpuCount > 1 or options.gpuType != "nvidia-tesla-v100"):
        raise RuntimeError("--gpuType and gpuCount can only be used with --wdl --gpu")

    if not options.outHal:
        options.outHal = os.path.join(options.outDir if options.outDir else '', 'out.hal')

    if options.wdl:
        if options.preprocessBatchSize != 1:
            if options.preprocessBatchSize != 3:
                # hacky way to only warn for non-default
                sys.stderr.write("Warning: --preprocessBatchSize reset to 1 for --wdl support\n")
            options.preprocessBatchSize = 1
        # wdl handles output file structure
        if options.outDir:
            sys.stderr.write("Warning: --outDir option ignored with --wdl\n")
        options.outDir = "."
        if options.outSeqFile:
            sys.stderr.write("Warning: --outSeqFile option ignored with --wdl\n")
            options.outSeqFile = None
        if options.preprocessOnly:
            raise RuntimeError('--preprocessOnly cannot be used in conjunction with --wdl')
        if not options.dockerImage:
            options.dockerImage = getDockerImage()

    # apply defaults
    if options.defaultCores:
        if not options.preprocessCores:
            options.preprocessCores = options.defaultCores
        if not options.blastCores:
            options.blastCores = options.defaultCores
        if not options.alignCores:
            options.alignCores = options.defaultCores
    if options.defaultMem:
        if not options.preprocessMem:
            options.preprocessMem = options.defaultMem
        if not options.blastMem:
            options.blastMem = options.defaultMem
        if not options.alignMem:
            options.alignMem = options.defaultMem
    if not options.alignCores or options.alignCores == 1:
        if options.alignCores == 1:
            sys.stderr.write("Warning: --alignCores changed from 1 to 2\n")
        options.alignCores = 2
    if options.defaultDisk:
        if not options.preprocessDisk:
            options.preprocessDisk = options.defaultDisk
        if not options.blastDisk:
            options.blastDisk = options.defaultDisk
        if not options.alignDisk:
            options.alignDisk = options.defaultDisk
        if not options.halAppendDisk:
            options.halAppendDisk = options.defaultDisk

    # https://cromwell.readthedocs.io/en/stable/RuntimeAttributes/#gpucount-gputype-and-nvidiadriverversion
    # note: k80 not included as WGA_GPU doesn't run on it.
    acceptable_gpus = ['nvidia-tesla-v100', 'nvidia-tesla-p100', 'nvidia-tesla-p4', 'nvidia-tesla-t4']
    if options.gpuType not in acceptable_gpus:
        raise RuntimeError('--gpuType {} not supported by Terra. Acceptable types are {}'.format(
            options.gpuType, acceptable_gpus))

    # need to go through this garbage (copied from the main() in progressive_cactus) to
    # come up with the project
    options.cactusDir = getTempDirectory()
    #Create the progressive cactus project
    projWrapper = ProjectWrapper(options, options.configFile)
    projWrapper.writeXml()

    # used to unique jobstore
    options.jobStoreCount = 0

    pjPath = os.path.join(options.cactusDir, ProjectWrapper.alignmentDirName,
                          '%s_project.xml' % ProjectWrapper.alignmentDirName)
    assert os.path.exists(pjPath)

    project = MultiCactusProject()

    if not os.path.isdir(options.cactusDir):
        os.makedirs(options.cactusDir)

    project.readXML(pjPath)

    enableDumpStack()
    cactusPrepare(options, project)
def main(toil_mode=False):
    parser = ArgumentParser()
    if toil_mode:
        Job.Runner.addToilOptions(parser)
        parser.add_argument("--latest", dest="latest", action="store_true",
                            help="Use the latest version of the docker container "
                            "rather than pulling one matching this version of cactus")
        parser.add_argument("--containerImage", dest="containerImage", default=None,
                            help="Use the specified pre-built container image "
                            "rather than pulling one from quay.io")
        parser.add_argument("--binariesMode", choices=["docker", "local", "singularity"],
                            help="The way to run the Cactus binaries (at top level; use --cactusOpts to set it in nested calls)",
                            default=None)
    parser.add_argument("seqFile", help="Seq file")
    parser.add_argument("--outDir", help='Directory where the processed leaf sequence and ancestral sequences will be placed.'
                        ' Required when not using --wdl')
    parser.add_argument("--outSeqFile", help="Path for annotated Seq file output [default: outDir/seqFile]")
    parser.add_argument("--outHal", help="Output HAL file [default: outDir/out.hal]", required=toil_mode)
    if not toil_mode:
        parser.add_argument("--wdl", action="store_true", help="output wdl workflow instead of list of commands")
        parser.add_argument("--noLocalInputs", action="store_true", help="don't embed local input paths in WDL script (as they will need"
                            " to be respecified when running on Terra)")
        parser.add_argument("--jobStore", type=str, default="./jobstore", help="base directory of jobStores to use in suggested commands")
    parser.add_argument("--configFile", default=os.path.join(cactusRootPath(), "cactus_progressive_config.xml"))
    parser.add_argument("--preprocessBatchSize", type=int, default=3, help="size (number of genomes) of suggested preprocessing jobs")
    parser.add_argument("--halOptions", type=str, default="--hdf5InMemory", help="options for every hal command")
    parser.add_argument("--cactusOptions", type=str, default="--realTimeLogging --logInfo --retryCount 0", help="options for every cactus command")
    parser.add_argument("--preprocessOnly", action="store_true", help="only decompose into preprocessor and cactus jobs")
    parser.add_argument("--dockerImage", type=str, help="docker image to use as wdl runtime")
    parser.add_argument("--gpu", action="store_true", help="use gpu-enabled lastz in cactus-blast")
    parser.add_argument("--gpuType", default="nvidia-tesla-v100", help="GPU type (to set in WDL runtime parameters)")
    parser.add_argument("--gpuCount", default=1, help="GPU count (to set in WDL runtime parameters)")
    parser.add_argument("--nvidiaDriver", default="440.64.00", help="Nvidia driver version")
    parser.add_argument("--gpuZone", default="us-central1-c", help="zone used for gpu task")
    parser.add_argument("--zone", default="us-west2-a", help="zone used for all but gpu tasks")
    if not toil_mode:
        parser.add_argument("--defaultCores", type=int, help="Number of cores for each job unless otherwise specified")
    parser.add_argument("--preprocessCores", type=int, help="Number of cores for each cactus-preprocess job")
    parser.add_argument("--blastCores", type=int, help="Number of cores for each cactus-blast job")
    parser.add_argument("--alignCores", type=int, help="Number of cores for each cactus-align job")
    if not toil_mode:
        parser.add_argument("--defaultMemory", type=human2bytesN,
                            help="Memory for each job unless otherwise specified. "
                            "Standard suffixes like K, Ki, M, Mi, G or Gi are supported (default=bytes)")
    parser.add_argument("--preprocessMemory", type=human2bytesN,
                        help="Memory for each cactus-preprocess job. "
                        "Standard suffixes like K, Ki, M, Mi, G or Gi are supported (default=bytes)")
    parser.add_argument("--blastMemory", type=human2bytesN,
                        help="Memory for each cactus-blast job. "
                        "Standard suffixes like K, Ki, M, Mi, G or Gi are supported (default=bytes)")
    parser.add_argument("--alignMemory", type=human2bytesN,
                        help="Memory for each cactus-align job. "
                        "Standard suffixes like K, Ki, M, Mi, G or Gi are supported (default=bytes)")
    if not toil_mode:
        parser.add_argument("--defaultDisk", type=human2bytesN,
                            help="Disk for each job unless otherwise specified. "
                            "Standard suffixes like K, Ki, M, Mi, G or Gi are supported (default=bytes)")
    parser.add_argument("--preprocessDisk", type=human2bytesN,
                        help="Disk for each cactus-preprocess job. "
                        "Standard suffixes like K, Ki, M, Mi, G or Gi are supported (default=bytes)")
    parser.add_argument("--blastDisk", type=human2bytesN,
                        help="Disk for each cactus-blast job. "
                        "Standard suffixes like K, Ki, M, Mi, G or Gi are supported (default=bytes)")
    parser.add_argument("--alignDisk", type=human2bytesN,
                        help="Disk for each cactus-align job. "
                        "Standard suffixes like K, Ki, M, Mi, G or Gi are supported (default=bytes)")
    parser.add_argument("--halAppendDisk", type=human2bytesN,
                        help="Disk for each halAppendSubtree job. "
                        "Standard suffixes like K, Ki, M, Mi, G or Gi are supported (default=bytes)")
    parser.add_argument("--preprocessPreemptible", type=int,
                        help="Preemptible attempt count for each cactus-preprocess job [default=2]", default=2)
    parser.add_argument("--blastPreemptible", type=int,
                        help="Preemptible attempt count for each cactus-blast job [default=1]", default=1)
    parser.add_argument("--alignPreemptible", type=int,
                        help="Preemptible attempt count for each cactus-align job [default=1]", default=1)
    parser.add_argument("--halAppendPreemptible", type=int,
                        help="Preemptible attempt count for each halAppendSubtree job [default=1]", default=1)
    parser.add_argument("--database", choices=["kyoto_tycoon", "redis"],
                        help="The type of database", default="kyoto_tycoon")

    options = parser.parse_args()
    #todo support root option
    options.root = None

    if toil_mode:
        options.wdl = False
        options.noLocalInputs = False
        options.outDir = '.'
        setupBinaries(options)
        # need to avoid nested container calls, so set toil-inside-toil jobs to local by default
        if "--binariesMode" not in options.cactusOptions:
            options.cactusOptions += " --binariesMode local"
        if options.jobStore.startswith('aws'):
            if not options.outHal.startswith('s3://'):
                raise RuntimeError("--outHal must be s3:// address when using s3 job store")
            if not has_s3:
                raise RuntimeError("S3 support requires toil to be installed with [aws]")
    options.toil = toil_mode

    if not options.wdl and not options.toil:
        if not options.outDir:
            raise RuntimeError("--outDir option required when not using --wdl")
        if not options.outSeqFile:
            options.outSeqFile = os.path.join(options.outDir, os.path.basename(options.seqFile))
            if os.path.abspath(options.seqFile) == os.path.abspath(options.outSeqFile):
                options.outSeqFile += '.1'

    if (not options.wdl or not options.gpu) and (options.gpuCount > 1 or options.gpuType != "nvidia-tesla-v100"):
        raise RuntimeError("--gpuType and gpuCount can only be used with --wdl --gpu")

    if not options.outHal:
        options.outHal = os.path.join(options.outDir if options.outDir else '', 'out.hal')

    if options.wdl:
        # wdl handles output file structure
        if options.outDir:
            sys.stderr.write("Warning: --outDir option ignored with --wdl\n")
        options.outDir = "."
        if options.outSeqFile:
            sys.stderr.write("Warning: --outSeqFile option ignored with --wdl\n")
            options.outSeqFile = None
        if options.preprocessOnly:
            raise RuntimeError('--preprocessOnly cannot be used in conjunction with --wdl')
        if not options.dockerImage:
            options.dockerImage = getDockerImage()

    # apply defaults
    if options.defaultCores:
        if not options.preprocessCores:
            options.preprocessCores = options.defaultCores
        if not options.blastCores:
            options.blastCores = options.defaultCores
        if not options.alignCores:
            options.alignCores = options.defaultCores
    if options.defaultMemory:
        if not options.preprocessMemory:
            options.preprocessMemory = options.defaultMemory
        if not options.blastMemory:
            options.blastMemory = options.defaultMemory
        if not options.alignMemory:
            options.alignMemory = options.defaultMemory
    if not options.alignCores or options.alignCores == 1:
        if options.alignCores == 1:
            sys.stderr.write("Warning: --alignCores changed from 1 to 2\n")
        options.alignCores = 2
    if options.defaultDisk:
        if not options.preprocessDisk:
            options.preprocessDisk = options.defaultDisk
        if not options.blastDisk:
            options.blastDisk = options.defaultDisk
        if not options.alignDisk:
            options.alignDisk = options.defaultDisk
        if not options.halAppendDisk:
            options.halAppendDisk = options.defaultDisk

    # todo: no reason not to support non-1 batch size, but mirror wdl logic for now
    if options.toil:
        if options.preprocessBatchSize != 1:
            if options.preprocessBatchSize != 3:
                # hacky way to only warn for non-default
                sys.stderr.write("Warning: --preprocessBatchSize reset to 1 for --wdl support\n")
            options.preprocessBatchSize = 1
        # todo: could also support this
        assert not options.preprocessOnly

    # https://cromwell.readthedocs.io/en/stable/RuntimeAttributes/#gpucount-gputype-and-nvidiadriverversion
    # note: k80 not included as WGA_GPU doesn't run on it.
    acceptable_gpus = ['nvidia-tesla-v100', 'nvidia-tesla-p100', 'nvidia-tesla-p4', 'nvidia-tesla-t4']
    if options.gpuType not in acceptable_gpus:
        raise RuntimeError('--gpuType {} not supported by Terra. Acceptable types are {}'.format(
            options.gpuType, acceptable_gpus))

    # need to go through this garbage (copied from the main() in progressive_cactus) to
    # come up with the project
    options.cactusDir = getTempDirectory()
    #Create the progressive cactus project
    projWrapper = ProjectWrapper(options, options.configFile)
    projWrapper.writeXml()

    # used to unique jobstore
    options.jobStoreCount = 0

    pjPath = os.path.join(options.cactusDir, ProjectWrapper.alignmentDirName,
                          '%s_project.xml' % ProjectWrapper.alignmentDirName)
    assert os.path.exists(pjPath)

    project = MultiCactusProject()

    if not os.path.isdir(options.cactusDir):
        os.makedirs(options.cactusDir)

    project.readXML(pjPath)

    enableDumpStack()
    cactusPrepare(options, project)
def make_align_job(options, toil):
    options.cactusDir = getTempDirectory()

    # apply path overrides. this was necessary for wdl which doesn't take kindly to
    # text files of local paths (ie seqfile). one way to fix would be to add support
    # for s3 paths and force wdl to use it. a better way would be a more fundamental
    # interface shift away from files of paths throughout all of cactus
    if options.pathOverrides:
        seqFile = SeqFile(options.seqFile)
        configNode = ET.parse(options.configFile).getroot()
        config = ConfigWrapper(configNode)
        tree = MultiCactusTree(seqFile.tree)
        tree.nameUnlabeledInternalNodes(prefix=config.getDefaultInternalNodePrefix())
        for name, override in zip(options.pathOverrideNames, options.pathOverrides):
            seqFile.pathMap[name] = override
        override_seq = os.path.join(options.cactusDir, 'seqFile.override')
        with open(override_seq, 'w') as out_sf:
            out_sf.write(str(seqFile))
        options.seqFile = override_seq

    if not options.root:
        seqFile = SeqFile(options.seqFile)
        configNode = ET.parse(options.configFile).getroot()
        config = ConfigWrapper(configNode)
        mcTree = MultiCactusTree(seqFile.tree)
        mcTree.nameUnlabeledInternalNodes(prefix=config.getDefaultInternalNodePrefix())
        options.root = mcTree.getRootName()

    if options.acyclic:
        seqFile = SeqFile(options.seqFile)
        tree = MultiCactusTree(seqFile.tree)
        leaves = [tree.getName(leaf) for leaf in tree.getLeaves()]
        if options.acyclic not in leaves:
            raise RuntimeError("Genome specified with --acyclic, {}, not found in tree leaves".format(options.acyclic))

    #to be consistent with all-in-one cactus, we make sure the project
    #isn't limiting itself to the subtree (todo: parameterize so root can
    #be passed through from prepare to blast/align)
    proj_options = copy.deepcopy(options)
    proj_options.root = None
    #Create the progressive cactus project (as we do in runCactusProgressive)
    projWrapper = ProjectWrapper(proj_options, proj_options.configFile, ignoreSeqPaths=options.root)
    projWrapper.writeXml()

    pjPath = os.path.join(options.cactusDir, ProjectWrapper.alignmentDirName,
                          '%s_project.xml' % ProjectWrapper.alignmentDirName)
    assert os.path.exists(pjPath)

    project = MultiCactusProject()

    if not os.path.isdir(options.cactusDir):
        os.makedirs(options.cactusDir)

    project.readXML(pjPath)

    # open up the experiment (as we do in ProgressiveUp.run)
    # note that we copy the path into the options here
    experimentFile = project.expMap[options.root]
    expXml = ET.parse(experimentFile).getroot()
    experiment = ExperimentWrapper(expXml)
    configPath = experiment.getConfigPath()
    configXml = ET.parse(configPath).getroot()
    seqIDMap = dict()
    tree = MultiCactusTree(experiment.getTree()).extractSubTree(options.root)
    leaves = [tree.getName(leaf) for leaf in tree.getLeaves()]
    outgroups = experiment.getOutgroupGenomes()
    genome_set = set(leaves + outgroups)

    # this is a hack to allow specifying all the input on the command line, rather than using suffix lookups
    def get_input_path(suffix=''):
        base_path = options.cigarsFile[0]
        for input_path in options.cigarsFile:
            if suffix and input_path.endswith(suffix):
                return input_path
            if os.path.basename(base_path).startswith(os.path.basename(input_path)):
                base_path = input_path
        return base_path + suffix

    # import the outgroups
    outgroupIDs = []
    outgroup_fragment_found = False
    for i, outgroup in enumerate(outgroups):
        try:
            outgroupID = toil.importFile(makeURL(get_input_path('.og_fragment_{}'.format(i))))
            outgroupIDs.append(outgroupID)
            experiment.setSequenceID(outgroup, outgroupID)
            outgroup_fragment_found = True
            assert not options.pangenome
        except:
            # we assume that input is not coming from cactus blast, so we'll treat output
            # sequences normally and not go looking for fragments
            outgroupIDs = []
            break

    #import the sequences (that we need to align for the given event, ie leaves and outgroups)
    for genome, seq in list(project.inputSequenceMap.items()):
        if genome in leaves or (not outgroup_fragment_found and genome in outgroups):
            if os.path.isdir(seq):
                tmpSeq = getTempFile()
                catFiles([os.path.join(seq, subSeq) for subSeq in os.listdir(seq)], tmpSeq)
                seq = tmpSeq
            seq = makeURL(seq)
            logger.info("Importing {}".format(seq))
            experiment.setSequenceID(genome, toil.importFile(seq))

    if not outgroup_fragment_found:
        outgroupIDs = [experiment.getSequenceID(outgroup) for outgroup in outgroups]

    # write back the experiment, as CactusWorkflowArguments wants a path
    experiment.writeXML(experimentFile)

    #import cactus config
    if options.configFile:
        cactusConfigID = toil.importFile(makeURL(options.configFile))
    else:
        cactusConfigID = toil.importFile(makeURL(project.getConfigPath()))
    project.setConfigID(cactusConfigID)

    project.syncToFileStore(toil)
    configNode = ET.parse(project.getConfigPath()).getroot()
    configWrapper = ConfigWrapper(configNode)
    configWrapper.substituteAllPredefinedConstantsWithLiterals()

    if options.singleCopySpecies:
        findRequiredNode(configWrapper.xmlRoot, "caf").attrib["alignmentFilter"] = "singleCopyEvent:{}".format(options.singleCopySpecies)

    if options.barMaskFilter:
        findRequiredNode(configWrapper.xmlRoot, "bar").attrib["partialOrderAlignmentMaskFilter"] = str(options.barMaskFilter)

    if options.pangenome:
        # turn off the megablock filter as it ruins non-all-to-all alignments
        findRequiredNode(configWrapper.xmlRoot, "caf").attrib["minimumBlockHomologySupport"] = "0"
        findRequiredNode(configWrapper.xmlRoot, "caf").attrib["minimumBlockDegreeToCheckSupport"] = "9999999999"
        # turn off mapq filtering
        findRequiredNode(configWrapper.xmlRoot, "caf").attrib["runMapQFiltering"] = "0"
        # more iterations here helps quite a bit to reduce underalignment
        findRequiredNode(configWrapper.xmlRoot, "caf").attrib["maxRecoverableChainsIterations"] = "50"
        # turn down minimum block degree to get a fat ancestor
        findRequiredNode(configWrapper.xmlRoot, "bar").attrib["minimumBlockDegree"] = "1"
        # turn on POA
        findRequiredNode(configWrapper.xmlRoot, "bar").attrib["partialOrderAlignment"] = "1"
        # save it
        if not options.batch:
            pg_file = options.outHal + ".pg-conf.xml"
            if pg_file.startswith('s3://'):
                pg_temp_file = getTempFile()
            else:
                pg_temp_file = pg_file
            configWrapper.writeXML(pg_temp_file)
            if pg_file.startswith('s3://'):
                write_s3(pg_temp_file, pg_file, region=get_aws_region(options.jobStore))
            logger.info("pangenome configuration overrides saved in {}".format(pg_file))

    workFlowArgs = CactusWorkflowArguments(options, experimentFile=experimentFile,
                                           configNode=configNode, seqIDMap=project.inputSequenceIDMap)

    #import the files that cactus-blast made
    workFlowArgs.alignmentsID = toil.importFile(makeURL(get_input_path()))
    workFlowArgs.secondaryAlignmentsID = None
    if not options.pafInput:
        try:
            workFlowArgs.secondaryAlignmentsID = toil.importFile(makeURL(get_input_path('.secondary')))
        except:
            pass
    workFlowArgs.outgroupFragmentIDs = outgroupIDs
    workFlowArgs.ingroupCoverageIDs = []
    if outgroup_fragment_found and len(outgroups) > 0:
        for i in range(len(leaves)):
            workFlowArgs.ingroupCoverageIDs.append(
                toil.importFile(makeURL(get_input_path('.ig_coverage_{}'.format(i)))))

    align_job = Job.wrapJobFn(run_cactus_align,
                              configWrapper,
                              workFlowArgs,
                              project,
                              checkpointInfo=options.checkpointInfo,
                              doRenaming=options.nonCactusInput,
                              pafInput=options.pafInput,
                              pafSecondaries=options.usePafSecondaries,
                              doVG=options.outVG,
                              doGFA=options.outGFA,
                              delay=options.stagger,
                              eventNameAsID=options.eventNameAsID,
                              acyclicEvent=options.acyclic)
    return align_job
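# Illustrative sketch (an assumption, not part of the original module): one way the job
# returned by make_align_job() might be driven, mirroring the Toil pattern used in
# runCactusAfterBlastOnly() above. The attribute names (options.restart, options.outHal)
# follow the conventions already used in this file; treat this helper as hypothetical.
def _example_run_align(options):
    with Toil(options) as toil:
        importSingularityImage(options)
        if options.restart:
            halID = toil.restart()
        else:
            align_job = make_align_job(options, toil)
            halID = toil.start(align_job)
        # export the resulting HAL alignment
        toil.exportFile(halID, makeURL(options.outHal))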
def main():
    parser = ArgumentParser()
    Job.Runner.addToilOptions(parser)
    addCactusWorkflowOptions(parser)

    parser.add_argument("seqFile", help="Seq file")
    parser.add_argument("outputHal", type=str, help="Output HAL file")

    #Progressive Cactus Options
    parser.add_argument("--database", dest="database",
                        help="Database type: tokyo_cabinet or kyoto_tycoon"
                        " [default: %(default)s]",
                        default="kyoto_tycoon")
    parser.add_argument("--configFile", dest="configFile",
                        help="Specify cactus configuration file",
                        default=None)
    parser.add_argument("--root", dest="root", help="Name of ancestral node (which"
                        " must appear in NEWICK tree in <seqfile>) to use as a "
                        "root for the alignment. Any genomes not below this node "
                        "in the tree may be used as outgroups but will never appear"
                        " in the output. If no root is specified then the root"
                        " of the tree is used. ", default=None)
    parser.add_argument("--latest", dest="latest", action="store_true",
                        help="Use the latest, locally-built docker container "
                        "rather than pulling from quay.io")
    parser.add_argument("--binariesMode", choices=["docker", "local", "singularity"],
                        help="The way to run the Cactus binaries", default=None)

    options = parser.parse_args()
    options.cactusDir = getTempDirectory()

    setupBinaries(options)
    setLoggingFromOptions(options)

    # Mess with some toil options to create useful defaults.

    # Caching generally slows down the cactus workflow, plus some
    # methods like readGlobalFileStream don't support forced
    # reads directly from the job store rather than from cache.
    options.disableCaching = True
    # Job chaining breaks service termination timing, causing unused
    # databases to accumulate and waste memory for no reason.
    options.disableChaining = True
    # The default deadlockWait is currently 60 seconds. This can cause
    # issues if the database processes take a while to actually begin
    # after they're issued. Change it to at least an hour so that we
    # don't preemptively declare a deadlock.
    if options.deadlockWait is None or options.deadlockWait < 3600:
        options.deadlockWait = 3600
    if options.retryCount is None:
        # If the user didn't specify a retryCount value, make it 5
        # instead of Toil's default (1).
        options.retryCount = 5

    #Create the progressive cactus project
    projWrapper = ProjectWrapper(options)
    projWrapper.writeXml()

    pjPath = os.path.join(options.cactusDir, ProjectWrapper.alignmentDirName,
                          '%s_project.xml' % ProjectWrapper.alignmentDirName)
    assert os.path.exists(pjPath)

    project = MultiCactusProject()

    if not os.path.isdir(options.cactusDir):
        os.makedirs(options.cactusDir)

    with Toil(options) as toil:
        importSingularityImage()
        #Run the workflow
        if options.restart:
            halID = toil.restart()
        else:
            project.readXML(pjPath)
            #import the sequences
            seqIDs = []
            for seq in project.getInputSequencePaths():
                if os.path.isdir(seq):
                    tmpSeq = getTempFile()
                    catFiles([os.path.join(seq, subSeq) for subSeq in os.listdir(seq)], tmpSeq)
                    seq = tmpSeq
                seq = makeURL(seq)
                seqIDs.append(toil.importFile(seq))
            project.setInputSequenceIDs(seqIDs)

            #import cactus config
            if options.configFile:
                cactusConfigID = toil.importFile(makeURL(options.configFile))
            else:
                cactusConfigID = toil.importFile(makeURL(project.getConfigPath()))
            logger.info("Setting config id to: %s" % cactusConfigID)
            project.setConfigID(cactusConfigID)

            project.syncToFileStore(toil)
            configNode = ET.parse(project.getConfigPath()).getroot()
            configWrapper = ConfigWrapper(configNode)
            configWrapper.substituteAllPredefinedConstantsWithLiterals()

            project.writeXML(pjPath)
            halID = toil.start(RunCactusPreprocessorThenProgressiveDown(options, project,
                                                                        memory=configWrapper.getDefaultMemory()))

        toil.exportFile(halID, makeURL(options.outputHal))
def runCactusAfterBlastOnly(options):
    with Toil(options) as toil:
        importSingularityImage(options)
        #Run the workflow
        if options.restart:
            alignmentID = toil.restart()
        else:
            options.cactusDir = getTempDirectory()
            #Create the progressive cactus project (as we do in runCactusProgressive)
            projWrapper = ProjectWrapper(options, options.configFile, ignoreSeqPaths=options.root)
            projWrapper.writeXml()

            pjPath = os.path.join(options.cactusDir, ProjectWrapper.alignmentDirName,
                                  '%s_project.xml' % ProjectWrapper.alignmentDirName)
            assert os.path.exists(pjPath)

            project = MultiCactusProject()

            if not os.path.isdir(options.cactusDir):
                os.makedirs(options.cactusDir)

            project.readXML(pjPath)

            # open up the experiment (as we do in ProgressiveUp.run)
            # note that we copy the path into the options here
            experimentFile = project.expMap[options.root]
            expXml = ET.parse(experimentFile).getroot()
            experiment = ExperimentWrapper(expXml)
            configPath = experiment.getConfigPath()
            configXml = ET.parse(configPath).getroot()
            seqIDMap = dict()
            tree = MultiCactusTree(experiment.getTree()).extractSubTree(options.root)
            leaves = [tree.getName(leaf) for leaf in tree.getLeaves()]
            outgroups = experiment.getOutgroupGenomes()
            genome_set = set(leaves + outgroups)

            # import the outgroups
            outgroupIDs = []
            cactus_blast_input = not options.nonBlastInput
            for i, outgroup in enumerate(outgroups):
                try:
                    outgroupID = toil.importFile(makeURL(options.blastOutput) + '.og_fragment_{}'.format(i))
                    outgroupIDs.append(outgroupID)
                    experiment.setSequenceID(outgroup, outgroupID)
                except:
                    if cactus_blast_input:
                        raise
                    # we assume that input is not coming from cactus blast, so we'll treat output
                    # sequences normally and not go looking for fragments
                    outgroupIDs = []
                    break

            #import the sequences (that we need to align for the given event, ie leaves and outgroups)
            for genome, seq in list(project.inputSequenceMap.items()):
                if genome in leaves or (not cactus_blast_input and genome in outgroups):
                    if os.path.isdir(seq):
                        tmpSeq = getTempFile()
                        catFiles([os.path.join(seq, subSeq) for subSeq in os.listdir(seq)], tmpSeq)
                        seq = tmpSeq
                    seq = makeURL(seq)
                    experiment.setSequenceID(genome, toil.importFile(seq))

            if not cactus_blast_input:
                outgroupIDs = [experiment.getSequenceID(outgroup) for outgroup in outgroups]

            # write back the experiment, as CactusWorkflowArguments wants a path
            experiment.writeXML(experimentFile)

            #import cactus config
            if options.configFile:
                cactusConfigID = toil.importFile(makeURL(options.configFile))
            else:
                cactusConfigID = toil.importFile(makeURL(project.getConfigPath()))
            project.setConfigID(cactusConfigID)

            project.syncToFileStore(toil)
            configNode = ET.parse(project.getConfigPath()).getroot()
            configWrapper = ConfigWrapper(configNode)
            configWrapper.substituteAllPredefinedConstantsWithLiterals()

            workFlowArgs = CactusWorkflowArguments(options, experimentFile=experimentFile,
                                                   configNode=configNode, seqIDMap=project.inputSequenceIDMap)

            #import the files that cactus-blast made
            workFlowArgs.alignmentsID = toil.importFile(makeURL(options.blastOutput))
            try:
                workFlowArgs.secondaryAlignmentsID = toil.importFile(makeURL(options.blastOutput) + '.secondary')
            except:
                workFlowArgs.secondaryAlignmentsID = None
            workFlowArgs.outgroupFragmentIDs = outgroupIDs
            workFlowArgs.ingroupCoverageIDs = []
            if cactus_blast_input and len(outgroups) > 0:
                for i in range(len(leaves)):
                    workFlowArgs.ingroupCoverageIDs.append(
                        toil.importFile(makeURL(options.blastOutput) + '.ig_coverage_{}'.format(i)))

            halID = toil.start(Job.wrapJobFn(run_cactus_align, configWrapper, workFlowArgs, project,
                                             cactus_blast_input))

        # export the hal
        toil.exportFile(halID, makeURL(options.outputHal))
def runCactusBlastOnly(options):
    with Toil(options) as toil:
        importSingularityImage(options)
        #Run the workflow
        if options.restart:
            alignmentID = toil.restart()
        else:
            options.cactusDir = getTempDirectory()

            # apply path overrides. this was necessary for wdl which doesn't take kindly to
            # text files of local paths (ie seqfile). one way to fix would be to add support
            # for s3 paths and force wdl to use it. a better way would be a more fundamental
            # interface shift away from files of paths throughout all of cactus
            if options.pathOverrides:
                seqFile = SeqFile(options.seqFile)
                configNode = ET.parse(options.configFile).getroot()
                config = ConfigWrapper(configNode)
                tree = MultiCactusTree(seqFile.tree)
                tree.nameUnlabeledInternalNodes(prefix=config.getDefaultInternalNodePrefix())
                for name, override in zip(options.pathOverrideNames, options.pathOverrides):
                    seqFile.pathMap[name] = override
                override_seq = os.path.join(options.cactusDir, 'seqFile.override')
                with open(override_seq, 'w') as out_sf:
                    out_sf.write(str(seqFile))
                options.seqFile = override_seq

            #to be consistent with all-in-one cactus, we make sure the project
            #isn't limiting itself to the subtree (todo: parameterize so root can
            #be passed through from prepare to blast/align)
            proj_options = copy.deepcopy(options)
            proj_options.root = None
            #Create the progressive cactus project (as we do in runCactusProgressive)
            projWrapper = ProjectWrapper(proj_options, proj_options.configFile, ignoreSeqPaths=options.root)
            projWrapper.writeXml()

            pjPath = os.path.join(options.cactusDir, ProjectWrapper.alignmentDirName,
                                  '%s_project.xml' % ProjectWrapper.alignmentDirName)
            assert os.path.exists(pjPath)

            project = MultiCactusProject()

            if not os.path.isdir(options.cactusDir):
                os.makedirs(options.cactusDir)

            project.readXML(pjPath)

            # open up the experiment (as we do in ProgressiveUp.run)
            # note that we copy the path into the options here
            experimentFile = project.expMap[options.root]
            expXml = ET.parse(experimentFile).getroot()
            logger.info("Experiment {}".format(ET.tostring(expXml)))
            experiment = ExperimentWrapper(expXml)
            configPath = experiment.getConfigPath()
            configXml = ET.parse(configPath).getroot()
            seqIDMap = dict()
            tree = MultiCactusTree(experiment.getTree()).extractSubTree(options.root)
            leaves = tree.getChildNames(tree.getRootName())
            outgroups = experiment.getOutgroupGenomes()
            genome_set = set(leaves + outgroups)
            logger.info("Genomes in blastonly, {}: {}".format(options.root, list(genome_set)))
            print(str(project.inputSequenceMap))

            #import the sequences (that we need to align for the given event, ie leaves and outgroups)
            for genome, seq in list(project.inputSequenceMap.items()):
                if genome in genome_set:
                    if os.path.isdir(seq):
                        tmpSeq = getTempFile()
                        catFiles([os.path.join(seq, subSeq) for subSeq in os.listdir(seq)], tmpSeq)
                        seq = tmpSeq
                    seq = makeURL(seq)
                    project.inputSequenceIDMap[genome] = toil.importFile(seq)
                else:
                    # out-of-scope sequences will only cause trouble later on
                    del project.inputSequenceMap[genome]

            #import cactus config
            if options.configFile:
                cactusConfigID = toil.importFile(makeURL(options.configFile))
            else:
                cactusConfigID = toil.importFile(makeURL(project.getConfigPath()))
            project.setConfigID(cactusConfigID)

            project.syncToFileStore(toil)
            configNode = ET.parse(project.getConfigPath()).getroot()
            configWrapper = ConfigWrapper(configNode)
            configWrapper.substituteAllPredefinedConstantsWithLiterals()

            workFlowArgs = CactusWorkflowArguments(options, experimentFile=experimentFile,
                                                   configNode=configNode, seqIDMap=project.inputSequenceIDMap)

            outWorkFlowArgs = toil.start(CactusTrimmingBlastPhase(standAlone=True,
                                                                  cactusWorkflowArguments=workFlowArgs,
                                                                  phaseName="trimBlast"))

            # export the alignments
            toil.exportFile(outWorkFlowArgs.alignmentsID, makeURL(options.outputFile))
            # optional secondary alignments
            if outWorkFlowArgs.secondaryAlignmentsID:
                toil.exportFile(outWorkFlowArgs.secondaryAlignmentsID, makeURL(options.outputFile) + '.secondary')
            # outgroup fragments and coverage are necessary for cactus-align, as the sequence names got changed in the above alignments
            for i, outgroupFragmentID in enumerate(outWorkFlowArgs.outgroupFragmentIDs):
                toil.exportFile(outgroupFragmentID, makeURL(options.outputFile) + '.og_fragment_{}'.format(i))
            # cactus-align can recompute coverage on the fly, but we save them because we have them
            for i, ingroupCoverageID in enumerate(outWorkFlowArgs.ingroupCoverageIDs):
                toil.exportFile(ingroupCoverageID, makeURL(options.outputFile) + '.ig_coverage_{}'.format(i))
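# Illustrative note (not part of the original source): for a hypothetical --outputFile of
# out/blast.cigars, the exports above produce a family of files that cactus-align later
# resolves by suffix:
#
#   out/blast.cigars                   primary alignments
#   out/blast.cigars.secondary         secondary alignments (only if produced)
#   out/blast.cigars.og_fragment_0 ..  trimmed outgroup fragments, one per outgroup
#   out/blast.cigars.ig_coverage_0 ..  ingroup coverage files, one per ingroup leaf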
def main():
    parser = ArgumentParser()
    Job.Runner.addToilOptions(parser)
    addCactusWorkflowOptions(parser)

    parser.add_argument("seqFile", help="Seq file")
    parser.add_argument("outputHal", type=str, help="Output HAL file")

    #Progressive Cactus Options
    parser.add_argument("--database", dest="database",
                        help="Database type: tokyo_cabinet or kyoto_tycoon"
                        " [default: %(default)s]",
                        default="kyoto_tycoon")
    parser.add_argument("--configFile", dest="configFile",
                        help="Specify cactus configuration file",
                        default=None)
    parser.add_argument("--root", dest="root", help="Name of ancestral node (which"
                        " must appear in NEWICK tree in <seqfile>) to use as a "
                        "root for the alignment. Any genomes not below this node "
                        "in the tree may be used as outgroups but will never appear"
                        " in the output. If no root is specified then the root"
                        " of the tree is used. ", default=None)
    parser.add_argument("--latest", dest="latest", action="store_true",
                        help="Use the latest version of the docker container "
                        "rather than pulling one matching this version of cactus")
    parser.add_argument("--containerImage", dest="containerImage", default=None,
                        help="Use the specified pre-built container image "
                        "rather than pulling one from quay.io")
    parser.add_argument("--binariesMode", choices=["docker", "local", "singularity"],
                        help="The way to run the Cactus binaries", default=None)

    options = parser.parse_args()

    setupBinaries(options)
    setLoggingFromOptions(options)

    # Mess with some toil options to create useful defaults.

    # Caching generally slows down the cactus workflow, plus some
    # methods like readGlobalFileStream don't support forced
    # reads directly from the job store rather than from cache.
    options.disableCaching = True
    # Job chaining breaks service termination timing, causing unused
    # databases to accumulate and waste memory for no reason.
    options.disableChaining = True
    # The default deadlockWait is currently 60 seconds. This can cause
    # issues if the database processes take a while to actually begin
    # after they're issued. Change it to at least an hour so that we
    # don't preemptively declare a deadlock.
    if options.deadlockWait is None or options.deadlockWait < 3600:
        options.deadlockWait = 3600
    if options.retryCount is None:
        # If the user didn't specify a retryCount value, make it 5
        # instead of Toil's default (1).
        options.retryCount = 5

    with Toil(options) as toil:
        importSingularityImage()
        #Run the workflow
        if options.restart:
            halID = toil.restart()
        else:
            options.cactusDir = getTempDirectory()
            #Create the progressive cactus project
            projWrapper = ProjectWrapper(options)
            projWrapper.writeXml()

            pjPath = os.path.join(options.cactusDir, ProjectWrapper.alignmentDirName,
                                  '%s_project.xml' % ProjectWrapper.alignmentDirName)
            assert os.path.exists(pjPath)

            project = MultiCactusProject()

            if not os.path.isdir(options.cactusDir):
                os.makedirs(options.cactusDir)

            project.readXML(pjPath)
            #import the sequences
            seqIDs = []
            print("Importing %s sequences" % (len(project.getInputSequencePaths())))
            for seq in project.getInputSequencePaths():
                if os.path.isdir(seq):
                    tmpSeq = getTempFile()
                    catFiles([os.path.join(seq, subSeq) for subSeq in os.listdir(seq)], tmpSeq)
                    seq = tmpSeq
                seq = makeURL(seq)
                seqIDs.append(toil.importFile(seq))
            project.setInputSequenceIDs(seqIDs)

            #import cactus config
            if options.configFile:
                cactusConfigID = toil.importFile(makeURL(options.configFile))
            else:
                cactusConfigID = toil.importFile(makeURL(project.getConfigPath()))
            project.setConfigID(cactusConfigID)

            project.syncToFileStore(toil)
            configNode = ET.parse(project.getConfigPath()).getroot()
            configWrapper = ConfigWrapper(configNode)
            configWrapper.substituteAllPredefinedConstantsWithLiterals()

            project.writeXML(pjPath)
            halID = toil.start(RunCactusPreprocessorThenProgressiveDown(options, project,
                                                                        memory=configWrapper.getDefaultMemory()))

        toil.exportFile(halID, makeURL(options.outputHal))
def runCactusBlastOnly(options):
    with Toil(options) as toil:
        importSingularityImage(options)
        #Run the workflow
        if options.restart:
            alignmentID = toil.restart()
        else:
            options.cactusDir = getTempDirectory()
            #Create the progressive cactus project (as we do in runCactusProgressive)
            projWrapper = ProjectWrapper(options, options.configFile, ignoreSeqPaths=options.root)
            projWrapper.writeXml()

            pjPath = os.path.join(options.cactusDir, ProjectWrapper.alignmentDirName,
                                  '%s_project.xml' % ProjectWrapper.alignmentDirName)
            assert os.path.exists(pjPath)

            project = MultiCactusProject()

            if not os.path.isdir(options.cactusDir):
                os.makedirs(options.cactusDir)

            project.readXML(pjPath)

            # open up the experiment (as we do in ProgressiveUp.run)
            # note that we copy the path into the options here
            experimentFile = project.expMap[options.root]
            expXml = ET.parse(experimentFile).getroot()
            logger.info("Experiment {}".format(ET.tostring(expXml)))
            experiment = ExperimentWrapper(expXml)
            configPath = experiment.getConfigPath()
            configXml = ET.parse(configPath).getroot()
            seqIDMap = dict()
            tree = MultiCactusTree(experiment.getTree()).extractSubTree(options.root)
            leaves = tree.getChildNames(tree.getRootName())
            outgroups = experiment.getOutgroupGenomes()
            genome_set = set(leaves + outgroups)
            logger.info("Genomes in blastonly, {}: {}".format(options.root, list(genome_set)))

            #import the sequences (that we need to align for the given event, ie leaves and outgroups)
            for genome, seq in list(project.inputSequenceMap.items()):
                if genome in genome_set:
                    if os.path.isdir(seq):
                        tmpSeq = getTempFile()
                        catFiles([os.path.join(seq, subSeq) for subSeq in os.listdir(seq)], tmpSeq)
                        seq = tmpSeq
                    seq = makeURL(seq)
                    project.inputSequenceIDMap[genome] = toil.importFile(seq)
                else:
                    # out-of-scope sequences will only cause trouble later on
                    del project.inputSequenceMap[genome]

            #import cactus config
            if options.configFile:
                cactusConfigID = toil.importFile(makeURL(options.configFile))
            else:
                cactusConfigID = toil.importFile(makeURL(project.getConfigPath()))
            project.setConfigID(cactusConfigID)

            project.syncToFileStore(toil)
            configNode = ET.parse(project.getConfigPath()).getroot()
            configWrapper = ConfigWrapper(configNode)
            configWrapper.substituteAllPredefinedConstantsWithLiterals()

            workFlowArgs = CactusWorkflowArguments(options, experimentFile=experimentFile,
                                                   configNode=configNode, seqIDMap=project.inputSequenceIDMap)

            outWorkFlowArgs = toil.start(CactusTrimmingBlastPhase(standAlone=True,
                                                                  cactusWorkflowArguments=workFlowArgs,
                                                                  phaseName="trimBlast"))

            # export the alignments
            toil.exportFile(outWorkFlowArgs.alignmentsID, makeURL(options.outputFile))
            # optional secondary alignments
            if outWorkFlowArgs.secondaryAlignmentsID:
                toil.exportFile(outWorkFlowArgs.secondaryAlignmentsID, makeURL(options.outputFile) + '.secondary')
            # outgroup fragments and coverage are necessary for cactus-align, as the sequence names got changed in the above alignments
            for i, outgroupFragmentID in enumerate(outWorkFlowArgs.outgroupFragmentIDs):
                toil.exportFile(outgroupFragmentID, makeURL(options.outputFile) + '.og_fragment_{}'.format(i))
            # cactus-align can recompute coverage on the fly, but we save them because we have them
            for i, ingroupCoverageID in enumerate(outWorkFlowArgs.ingroupCoverageIDs):
                toil.exportFile(ingroupCoverageID, makeURL(options.outputFile) + '.ig_coverage_{}'.format(i))