def toil_call_hal_append_subtrees(job, options, project, root_name, root_hal_id, event_names, *event_ids): work_dir = job.fileStore.getLocalTempDir() # donload the root hal file root_file = os.path.join(work_dir, '{}.hal'.format(root_name)) job.fileStore.readGlobalFile(root_hal_id, root_file, mutable=True) # download the hal files from the file store hal_files = [] for event_name, event_id in zip(event_names, event_ids): hal_files.append(os.path.join(work_dir, '{}.hal'.format(event_name))) job.fileStore.readGlobalFile(event_id, hal_files[-1]) # append to the root cactus_call(parameters=['halAppendSubtree', root_file, hal_files[-1], event_name, event_name, '--merge'] + options.halOptions.strip().split(' ')) # bypassing toil.exportFile for now as it only works on promises returned by the # start job, which isn't how this is set up. also in practice it's often more convenient # to output to s3 # todo: can we just use job.fileStore? if options.outHal.startswith('s3://'): # write it directly to s3 write_s3(root_file, options.outHal, region=get_aws_region(options.jobStore)) else: # write the output to disk shutil.copy2(root_file, options.outHal) return job.fileStore.writeGlobalFile(root_file)
def exportHal(job, project, event=None, cacheBytes=None, cacheMDC=None, cacheRDC=None, cacheW0=None, chunk=None, deflate=None, inMemory=True, checkpointInfo=None): HALPath = "tmp_alignment.hal" # traverse tree to make sure we are going breadth-first tree = project.mcTree # find subtree if event specified rootNode = None if event is not None: assert event in tree.nameToId and not tree.isLeaf(tree.nameToId[event]) rootNode = tree.nameToId[event] for node in tree.breadthFirstTraversal(rootNode): genomeName = tree.getName(node) if genomeName in project.expMap: experimentFilePath = job.fileStore.readGlobalFile(project.expIDMap[genomeName]) experiment = ExperimentWrapper(ET.parse(experimentFilePath).getroot()) outgroups = experiment.getOutgroupGenomes() experiment.setConfigPath(job.fileStore.readGlobalFile(experiment.getConfigID())) expTreeString = NXNewick().writeString(experiment.getTree(onlyThisSubtree=True)) assert len(expTreeString) > 1 assert experiment.getHalID() is not None assert experiment.getHalFastaID() is not None subHALPath = job.fileStore.readGlobalFile(experiment.getHalID()) halFastaPath = job.fileStore.readGlobalFile(experiment.getHalFastaID()) args = [os.path.basename(subHALPath), os.path.basename(halFastaPath), expTreeString, os.path.basename(HALPath)] if len(outgroups) > 0: args += ["--outgroups", ",".join(outgroups)] if cacheBytes is not None: args += ["--cacheBytes", cacheBytes] if cacheMDC is not None: args += ["--cacheMDC", cacheMDC] if cacheRDC is not None: args += ["--cacheRDC", cacheRDC] if cacheW0 is not None: args += ["--cacheW0", cacheW0] if chunk is not None: args += ["--chunk", chunk] if deflate is not None: args += ["--deflate", deflate] if inMemory is True: args += ["--inMemory"] cactus_call(parameters=["halAppendCactusSubtree"] + args) cactus_call(parameters=["halSetMetadata", HALPath, "CACTUS_COMMIT", cactus_commit]) with job.fileStore.readGlobalFileStream(project.configID) as configFile: cactus_call(parameters=["halSetMetadata", HALPath, "CACTUS_CONFIG", b64encode(configFile.read()).decode()]) if checkpointInfo: write_s3(HALPath, checkpointInfo[1], region=checkpointInfo[0]) return job.fileStore.writeGlobalFile(HALPath)
def export_vg(job, hal_id, configWrapper, doVG, doGFA, checkpointInfo=None, resource_spec = False): """ use hal2vg to convert the HAL to vg format """ if not resource_spec: # caller couldn't figure out the resrouces from hal_id promise. do that # now and try again return job.addChildJobFn(export_vg, hal_id, configWrapper, doVG, doGFA, checkpointInfo, resource_spec = True, disk=hal_id.size * 3, memory=hal_id.size * 10).rv() work_dir = job.fileStore.getLocalTempDir() hal_path = os.path.join(work_dir, "out.hal") job.fileStore.readGlobalFile(hal_id, hal_path) graph_event = getOptionalAttrib(findRequiredNode(configWrapper.xmlRoot, "graphmap"), "assemblyName", default="_MINIGRAPH_") hal2vg_opts = getOptionalAttrib(findRequiredNode(configWrapper.xmlRoot, "hal2vg"), "hal2vgOptions", default="") if hal2vg_opts: hal2vg_opts = hal2vg_opts.split(' ') else: hal2vg_opts = [] ignore_events = [] if not getOptionalAttrib(findRequiredNode(configWrapper.xmlRoot, "hal2vg"), "includeMinigraph", typeFn=bool, default=False): ignore_events.append(graph_event) if not getOptionalAttrib(findRequiredNode(configWrapper.xmlRoot, "hal2vg"), "includeAncestor", typeFn=bool, default=False): ignore_events.append(configWrapper.getDefaultInternalNodePrefix() + '0') if ignore_events: hal2vg_opts += ['--ignoreGenomes', ','.join(ignore_events)] if not getOptionalAttrib(findRequiredNode(configWrapper.xmlRoot, "hal2vg"), "prependGenomeNames", typeFn=bool, default=True): hal2vg_opts += ['--onlySequenceNames'] vg_path = os.path.join(work_dir, "out.vg") cmd = ['hal2vg', hal_path] + hal2vg_opts cactus_call(parameters=cmd, outfile=vg_path) if checkpointInfo: write_s3(vg_path, os.path.splitext(checkpointInfo[1])[0] + '.vg', region=checkpointInfo[0]) gfa_path = os.path.join(work_dir, "out.gfa.gz") if doGFA: gfa_cmd = [ ['vg', 'view', '-g', vg_path], ['gzip'] ] cactus_call(parameters=gfa_cmd, outfile=gfa_path) if checkpointInfo: write_s3(gfa_path, os.path.splitext(checkpointInfo[1])[0] + '.gfa.gz', region=checkpointInfo[0]) vg_id = job.fileStore.writeGlobalFile(vg_path) if doVG else None gfa_id = job.fileStore.writeGlobalFile(gfa_path) if doGFA else None return vg_id, gfa_id
def align_toil(job, chrom, seq_file_id, paf_file_id, config_id, options): """ run cactus-align """ work_dir = job.fileStore.getLocalTempDir() config_file = os.path.join(work_dir, 'config.xml') job.fileStore.readGlobalFile(config_id, config_file) seq_file = os.path.join(work_dir, '{}_seq_file.txt'.format(chrom)) job.fileStore.readGlobalFile(seq_file_id, seq_file) paf_file = os.path.join(work_dir, '{}.paf'.format(chrom)) job.fileStore.readGlobalFile(paf_file_id, paf_file) js = os.path.join(work_dir, 'js') if options.outHal.startswith('s3://'): out_file = os.path.join(options.outHal, '{}.hal'.format(chrom)) else: out_file = os.path.join(work_dir, '{}.hal'.format(chrom)) log_file = os.path.join(work_dir, '{}.hal.log'.format(chrom)) cmd = [ 'cactus-align', js, seq_file, paf_file, out_file, '--logFile', log_file ] + options.alignOptions.split() cactus_call(parameters=cmd) ret_ids = [None, None, None, None] if not options.outHal.startswith('s3://'): # we're not checkpoint directly to s3, so we return ret_ids[0] = job.fileStore.writeGlobalFile(out_file) out_vg = os.path.splitext(out_file)[0] + '.vg' if os.path.exists(out_vg): ret_ids[1] = job.fileStore.writeGlobalFile(out_vg) out_gfa = os.path.splitext(out_file)[0] + '.gfa.gz' if os.path.exists(out_gfa): ret_ids[2] = job.fileStore.writeGlobalFile(out_gfa) ret_ids[3] = job.fileStore.writeGlobalFile(log_file) else: write_s3(log_file, out_file + '.log', region=get_aws_region(options.jobStore)) return ret_ids
def export_split_data(toil, input_seq_id_map, output_id_map, split_log_ids, output_dir, config): """ download all the split data locally """ amb_name = getOptionalAttrib(findRequiredNode(config.xmlRoot, "graphmap_split"), "ambiguousName", default="_AMBIGUOUS_") chrom_file_map = {} for ref_contig in output_id_map.keys(): ref_contig_path = os.path.join(output_dir, ref_contig) if not os.path.isdir( ref_contig_path) and not ref_contig_path.startswith('s3://'): os.makedirs(ref_contig_path) # GFA: <output_dir>/<contig>/<contig>.gfa if 'gfa' in output_id_map[ref_contig]: # we do this check because no gfa made for ambiguous sequences "contig" toil.exportFile( output_id_map[ref_contig]['gfa'], makeURL( os.path.join(ref_contig_path, '{}.gfa'.format(ref_contig)))) # PAF: <output_dir>/<contig>/<contig>.paf paf_path = os.path.join(ref_contig_path, '{}.paf'.format(ref_contig)) toil.exportFile(output_id_map[ref_contig]['paf'], makeURL(paf_path)) # Fasta: <output_dir>/<contig>/fasta/<event>_<contig>.fa .. seq_file_map = {} for event, ref_contig_fa_id in output_id_map[ref_contig]['fa'].items(): fa_base = os.path.join(ref_contig_path, 'fasta') if not os.path.isdir(fa_base) and not fa_base.startswith('s3://'): os.makedirs(fa_base) fa_path = makeURL( os.path.join(fa_base, '{}_{}.fa'.format(event, ref_contig))) if input_seq_id_map[event][0].endswith('.gz'): fa_path += '.gz' seq_file_map[event] = fa_path toil.exportFile(ref_contig_fa_id, fa_path) # Seqfile: <output_dir>/seqfiles/<contig>.seqfile seq_file_path = os.path.join(output_dir, 'seqfiles', '{}.seqfile'.format(ref_contig)) if seq_file_path.startswith('s3://'): seq_file_temp_path = getTempFile() else: seq_file_temp_path = seq_file_path if not os.path.isdir(os.path.dirname(seq_file_path)): os.makedirs(os.path.dirname(seq_file_path)) with open(seq_file_temp_path, 'w') as seq_file: for event, fa_path in seq_file_map.items(): # cactus can't handle empty fastas. if there are no sequences for a sample for this # contig, just don't add it. if output_id_map[ref_contig]['fa'][event].size > 0: seq_file.write('{}\t{}\n'.format(event, fa_path)) if seq_file_path.startswith('s3://'): write_s3(seq_file_temp_path, seq_file_path) # Top-level seqfile chrom_file_map[ref_contig] = seq_file_path, paf_path # Chromfile : <coutput_dir>/chromfile.txt chrom_file_path = os.path.join(output_dir, 'chromfile.txt') if chrom_file_path.startswith('s3://'): chrom_file_temp_path = getTempFile() else: chrom_file_temp_path = chrom_file_path with open(chrom_file_temp_path, 'w') as chromfile: for ref_contig, seqfile_paf in chrom_file_map.items(): if ref_contig != amb_name: seqfile, paf = seqfile_paf[0], seqfile_paf[1] if seqfile.startswith('s3://'): # no use to have absolute s3 reference as cactus-align requires seqfiles passed locally seqfile = 'seqfiles/{}'.format(os.path.basename(seqfile)) chromfile.write('{}\t{}\t{}\n'.format(ref_contig, seqfile, paf)) if chrom_file_path.startswith('s3://'): write_s3(chrom_file_temp_path, chrom_file_path) toil.exportFile(split_log_ids[0], makeURL(os.path.join(output_dir, 'minigraph.split.log'))) if split_log_ids[1]: toil.exportFile( split_log_ids[1], makeURL(os.path.join(output_dir, 'minimap2.ambiguous.split.log')))
def main(): parser = ArgumentParser() Job.Runner.addToilOptions(parser) addCactusWorkflowOptions(parser) parser.add_argument("seqFile", help="Seq file") parser.add_argument( "cigarsFile", nargs="*", help= "Pairiwse aliginments (from cactus-blast, cactus-refmap or cactus-graphmap)" ) parser.add_argument("outHal", type=str, help="Output HAL file (or directory in --batch mode)") parser.add_argument( "--pathOverrides", nargs="*", help="paths (multiple allowd) to override from seqFile") parser.add_argument( "--pathOverrideNames", nargs="*", help="names (must be same number as --paths) of path overrides") #Pangenome Options parser.add_argument( "--pangenome", action="store_true", help= "Activate pangenome mode (suitable for star trees of closely related samples) by overriding several configuration settings." " The overridden configuration will be saved in <outHal>.pg-conf.xml") parser.add_argument( "--pafInput", action="store_true", help="'cigarsFile' arugment is in PAF format, rather than lastz cigars." ) parser.add_argument( "--usePafSecondaries", action="store_true", help= "use the secondary alignments from the PAF input. They are ignored by default." ) parser.add_argument("--singleCopySpecies", type=str, help="Filter out all self-alignments in given species") parser.add_argument( "--barMaskFilter", type=int, default=None, help= "BAR's POA aligner will ignore softmasked regions greater than this length. (overrides partialOrderAlignmentMaskFilter in config)" ) parser.add_argument( "--outVG", action="store_true", help="export pangenome graph in VG (.vg) in addition to HAL") parser.add_argument( "--outGFA", action="store_true", help="export pangenome grpah in GFA (.gfa.gz) in addition to HAL") parser.add_argument( "--batch", action="store_true", help= "Launch batch of alignments. Input seqfile is expected to be chromfile as generated by cactus-graphmap-slit" ) parser.add_argument( "--stagger", type=int, help= "Stagger alignment jobs in batch mode by this many seconds (to avoid starting all at once)", default=0) parser.add_argument( "--acyclic", type=str, help= "Ensure that given genome is cyclic by deleting all paralogy edges in postprocessing" ) #Progressive Cactus Options parser.add_argument("--configFile", dest="configFile", help="Specify cactus configuration file", default=os.path.join(cactusRootPath(), "cactus_progressive_config.xml")) parser.add_argument( "--root", dest="root", help="Name of ancestral node (which" " must appear in NEWICK tree in <seqfile>) to use as a " "root for the alignment. Any genomes not below this node " "in the tree may be used as outgroups but will never appear" " in the output. If no root is specifed then the root" " of the tree is used. ", default=None) parser.add_argument( "--latest", dest="latest", action="store_true", help="Use the latest version of the docker container " "rather than pulling one matching this version of cactus") parser.add_argument( "--containerImage", dest="containerImage", default=None, help="Use the the specified pre-built containter image " "rather than pulling one from quay.io") parser.add_argument("--binariesMode", choices=["docker", "local", "singularity"], help="The way to run the Cactus binaries", default=None) parser.add_argument( "--nonCactusInput", action="store_true", help= "Input lastz cigars do not come from cactus-blast or cactus-refmap: Prepend ids in cigars" ) parser.add_argument("--database", choices=["kyoto_tycoon", "redis"], help="The type of database", default="kyoto_tycoon") options = parser.parse_args() setupBinaries(options) setLoggingFromOptions(options) enableDumpStack() if (options.pathOverrides or options.pathOverrideNames): if not options.pathOverrides or not options.pathOverrideNames or \ len(options.pathOverrideNames) != len(options.pathOverrides): raise RuntimeError( 'same number of values must be passed to --pathOverrides and --pathOverrideNames' ) # cactus doesn't run with 1 core if options.batchSystem == 'singleMachine': if options.maxCores is not None: if int(options.maxCores) < 2: raise RuntimeError('Cactus requires --maxCores > 1') else: # is there a way to get this out of Toil? That would be more consistent if cpu_count() < 2: raise RuntimeError( 'Only 1 CPU detected. Cactus requires at least 2') options.buildHal = True options.buildFasta = True if options.outHal.startswith('s3://'): if not has_s3: raise RuntimeError( "S3 support requires toil to be installed with [aws]") # write a little something to the bucket now to catch any glaring problems asap test_file = os.path.join(getTempDirectory(), 'check') with open(test_file, 'w') as test_o: test_o.write("\n") region = get_aws_region( options.jobStore) if options.jobStore.startswith('aws:') else None write_s3(test_file, options.outHal if options.outHal.endswith('.hal') else os.path.join(options.outHal, 'test'), region=region) options.checkpointInfo = (get_aws_region(options.jobStore), options.outHal) else: options.checkpointInfo = None if options.batch: # the output hal is a directory, make sure it's there if not os.path.isdir(options.outHal): os.makedirs(options.outHal) assert len(options.cigarsFile) == 0 else: assert len(options.cigarsFile) > 0 # Mess with some toil options to create useful defaults. cactus_override_toil_options(options) # We set which type of unique ids to expect. Numeric (from cactus-blast) or Eventname (cactus-refmap or cactus-grpahmap) # This is a bit ugly, since we don't have a good way to differentiate refmap from blast, and use --pangenome as a proxy # But I don't think there's a real use case yet of making a separate parameter options.eventNameAsID = os.environ.get('CACTUS_EVENT_NAME_AS_UNIQUE_ID') if options.eventNameAsID is not None: options.eventNameAsID = False if not bool( eventName) or eventName == '0' else True else: options.eventNameAsID = options.pangenome or options.pafInput os.environ['CACTUS_EVENT_NAME_AS_UNIQUE_ID'] = str( int(options.eventNameAsID)) start_time = timeit.default_timer() with Toil(options) as toil: importSingularityImage(options) if options.restart: results_dict = toil.restart() else: align_jobs = make_batch_align_jobs(options, toil) results_dict = toil.start( Job.wrapJobFn(run_batch_align_jobs, align_jobs)) # when using s3 output urls, things get checkpointed as they're made so no reason to export # todo: make a more unified interface throughout cactus for this # (see toil-vg's outstore logic which, while not perfect, would be an improvement if not options.outHal.startswith('s3://'): if options.batch: for chrom, results in results_dict.items(): toil.exportFile( results[0], makeURL( os.path.join(options.outHal, '{}.hal'.format(chrom)))) if options.outVG: toil.exportFile( results[1], makeURL( os.path.join(options.outHal, '{}.vg'.format(chrom)))) if options.outGFA: toil.exportFile( results[2], makeURL( os.path.join(options.outHal, '{}.gfa.gz'.format(chrom)))) else: assert len(results_dict) == 1 and None in results_dict halID, vgID, gfaID = results_dict[None][0], results_dict[None][ 1], results_dict[None][2] # export the hal toil.exportFile(halID, makeURL(options.outHal)) # export the vg if options.outVG: toil.exportFile( vgID, makeURL(os.path.splitext(options.outHal)[0] + '.vg')) if options.outGFA: toil.exportFile( gfaID, makeURL( os.path.splitext(options.outHal)[0] + '.gfa.gz')) end_time = timeit.default_timer() run_time = end_time - start_time logger.info("cactus-align has finished after {} seconds".format(run_time))
def make_align_job(options, toil): options.cactusDir = getTempDirectory() # apply path overrides. this was necessary for wdl which doesn't take kindly to # text files of local paths (ie seqfile). one way to fix would be to add support # for s3 paths and force wdl to use it. a better way would be a more fundamental # interface shift away from files of paths throughout all of cactus if options.pathOverrides: seqFile = SeqFile(options.seqFile) configNode = ET.parse(options.configFile).getroot() config = ConfigWrapper(configNode) tree = MultiCactusTree(seqFile.tree) tree.nameUnlabeledInternalNodes( prefix=config.getDefaultInternalNodePrefix()) for name, override in zip(options.pathOverrideNames, options.pathOverrides): seqFile.pathMap[name] = override override_seq = os.path.join(options.cactusDir, 'seqFile.override') with open(override_seq, 'w') as out_sf: out_sf.write(str(seqFile)) options.seqFile = override_seq if not options.root: seqFile = SeqFile(options.seqFile) configNode = ET.parse(options.configFile).getroot() config = ConfigWrapper(configNode) mcTree = MultiCactusTree(seqFile.tree) mcTree.nameUnlabeledInternalNodes( prefix=config.getDefaultInternalNodePrefix()) options.root = mcTree.getRootName() if options.acyclic: seqFile = SeqFile(options.seqFile) tree = MultiCactusTree(seqFile.tree) leaves = [tree.getName(leaf) for leaf in tree.getLeaves()] if options.acyclic not in leaves: raise RuntimeError( "Genome specified with --acyclic, {}, not found in tree leaves" .format(options.acyclic)) #to be consistent with all-in-one cactus, we make sure the project #isn't limiting itself to the subtree (todo: parameterize so root can #be passed through from prepare to blast/align) proj_options = copy.deepcopy(options) proj_options.root = None #Create the progressive cactus project (as we do in runCactusProgressive) projWrapper = ProjectWrapper(proj_options, proj_options.configFile, ignoreSeqPaths=options.root) projWrapper.writeXml() pjPath = os.path.join(options.cactusDir, ProjectWrapper.alignmentDirName, '%s_project.xml' % ProjectWrapper.alignmentDirName) assert os.path.exists(pjPath) project = MultiCactusProject() if not os.path.isdir(options.cactusDir): os.makedirs(options.cactusDir) project.readXML(pjPath) # open up the experiment (as we do in ProgressiveUp.run) # note that we copy the path into the options here experimentFile = project.expMap[options.root] expXml = ET.parse(experimentFile).getroot() experiment = ExperimentWrapper(expXml) configPath = experiment.getConfigPath() configXml = ET.parse(configPath).getroot() seqIDMap = dict() tree = MultiCactusTree(experiment.getTree()).extractSubTree(options.root) leaves = [tree.getName(leaf) for leaf in tree.getLeaves()] outgroups = experiment.getOutgroupGenomes() genome_set = set(leaves + outgroups) # this is a hack to allow specifying all the input on the command line, rather than using suffix lookups def get_input_path(suffix=''): base_path = options.cigarsFile[0] for input_path in options.cigarsFile: if suffix and input_path.endswith(suffix): return input_path if os.path.basename(base_path).startswith( os.path.basename(input_path)): base_path = input_path return base_path + suffix # import the outgroups outgroupIDs = [] outgroup_fragment_found = False for i, outgroup in enumerate(outgroups): try: outgroupID = toil.importFile( makeURL(get_input_path('.og_fragment_{}'.format(i)))) outgroupIDs.append(outgroupID) experiment.setSequenceID(outgroup, outgroupID) outgroup_fragment_found = True assert not options.pangenome except: # we assume that input is not coming from cactus blast, so we'll treat output # sequences normally and not go looking for fragments outgroupIDs = [] break #import the sequences (that we need to align for the given event, ie leaves and outgroups) for genome, seq in list(project.inputSequenceMap.items()): if genome in leaves or (not outgroup_fragment_found and genome in outgroups): if os.path.isdir(seq): tmpSeq = getTempFile() catFiles( [os.path.join(seq, subSeq) for subSeq in os.listdir(seq)], tmpSeq) seq = tmpSeq seq = makeURL(seq) logger.info("Importing {}".format(seq)) experiment.setSequenceID(genome, toil.importFile(seq)) if not outgroup_fragment_found: outgroupIDs = [ experiment.getSequenceID(outgroup) for outgroup in outgroups ] # write back the experiment, as CactusWorkflowArguments wants a path experiment.writeXML(experimentFile) #import cactus config if options.configFile: cactusConfigID = toil.importFile(makeURL(options.configFile)) else: cactusConfigID = toil.importFile(makeURL(project.getConfigPath())) project.setConfigID(cactusConfigID) project.syncToFileStore(toil) configNode = ET.parse(project.getConfigPath()).getroot() configWrapper = ConfigWrapper(configNode) configWrapper.substituteAllPredefinedConstantsWithLiterals() if options.singleCopySpecies: findRequiredNode( configWrapper.xmlRoot, "caf").attrib["alignmentFilter"] = "singleCopyEvent:{}".format( options.singleCopySpecies) if options.barMaskFilter: findRequiredNode( configWrapper.xmlRoot, "bar").attrib["partialOrderAlignmentMaskFilter"] = str( options.barMaskFilter) if options.pangenome: # turn off the megablock filter as it ruins non-all-to-all alignments findRequiredNode(configWrapper.xmlRoot, "caf").attrib["minimumBlockHomologySupport"] = "0" findRequiredNode( configWrapper.xmlRoot, "caf").attrib["minimumBlockDegreeToCheckSupport"] = "9999999999" # turn off mapq filtering findRequiredNode(configWrapper.xmlRoot, "caf").attrib["runMapQFiltering"] = "0" # more iterations here helps quite a bit to reduce underalignment findRequiredNode(configWrapper.xmlRoot, "caf").attrib["maxRecoverableChainsIterations"] = "50" # turn down minimum block degree to get a fat ancestor findRequiredNode(configWrapper.xmlRoot, "bar").attrib["minimumBlockDegree"] = "1" # turn on POA findRequiredNode(configWrapper.xmlRoot, "bar").attrib["partialOrderAlignment"] = "1" # save it if not options.batch: pg_file = options.outHal + ".pg-conf.xml" if pg_file.startswith('s3://'): pg_temp_file = getTempFile() else: pg_temp_file = pg_file configWrapper.writeXML(pg_temp_file) if pg_file.startswith('s3://'): write_s3(pg_temp_file, pg_file, region=get_aws_region(options.jobStore)) logger.info("pangenome configuration overrides saved in {}".format( pg_file)) workFlowArgs = CactusWorkflowArguments(options, experimentFile=experimentFile, configNode=configNode, seqIDMap=project.inputSequenceIDMap) #import the files that cactus-blast made workFlowArgs.alignmentsID = toil.importFile(makeURL(get_input_path())) workFlowArgs.secondaryAlignmentsID = None if not options.pafInput: try: workFlowArgs.secondaryAlignmentsID = toil.importFile( makeURL(get_input_path('.secondary'))) except: pass workFlowArgs.outgroupFragmentIDs = outgroupIDs workFlowArgs.ingroupCoverageIDs = [] if outgroup_fragment_found and len(outgroups) > 0: for i in range(len(leaves)): workFlowArgs.ingroupCoverageIDs.append( toil.importFile( makeURL(get_input_path('.ig_coverage_{}'.format(i))))) align_job = Job.wrapJobFn(run_cactus_align, configWrapper, workFlowArgs, project, checkpointInfo=options.checkpointInfo, doRenaming=options.nonCactusInput, pafInput=options.pafInput, pafSecondaries=options.usePafSecondaries, doVG=options.outVG, doGFA=options.outGFA, delay=options.stagger, eventNameAsID=options.eventNameAsID, acyclicEvent=options.acyclic) return align_job