def toil_call_hal_append_subtrees(job, options, project, root_name, root_hal_id, event_names, *event_ids):
    """Toil job: merge each event's HAL file into the root HAL with halAppendSubtree,
    then write the merged result to s3 or local disk."""
    work_dir = job.fileStore.getLocalTempDir()
    # download the root hal file
    root_file = os.path.join(work_dir, '{}.hal'.format(root_name))
    job.fileStore.readGlobalFile(root_hal_id, root_file, mutable=True)
    # download the hal files from the file store
    hal_files = []
    for event_name, event_id in zip(event_names, event_ids):
        hal_files.append(os.path.join(work_dir, '{}.hal'.format(event_name)))
        job.fileStore.readGlobalFile(event_id, hal_files[-1])
        # append to the root
        cactus_call(parameters=['halAppendSubtree', root_file, hal_files[-1], event_name, event_name, '--merge']
                    + options.halOptions.strip().split(' '))

    # bypassing toil.exportFile for now as it only works on promises returned by the
    # start job, which isn't how this is set up. also in practice it's often more convenient
    # to output to s3
    # todo: can we just use job.fileStore?
    if options.outHal.startswith('s3://'):
        # write it directly to s3
        write_s3(root_file, options.outHal, region=get_aws_region(options.jobStore))
    else:
        # write the output to disk
        shutil.copy2(root_file, options.outHal)

    return job.fileStore.writeGlobalFile(root_file)
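# A minimal sketch (hypothetical wiring; the real scheduling lives in
# make_batch_align_jobs / run_batch_align_jobs, not shown here) of how
# toil_call_hal_append_subtrees is meant to be used: run the per-chromosome
# alignments as children, then merge their HAL outputs in a follow-on that
# consumes the children's promised file ids.
def _example_schedule_hal_append(root_job, options, project, root_name,
                                 root_hal_promise, chrom_to_hal_promise):
    """Illustrative only: attach a hal-append follow-on to root_job.

    chrom_to_hal_promise maps chromosome (event) names to promised HAL file
    ids, e.g. the first entry of the return value of each align_toil child.
    """
    event_names = list(chrom_to_hal_promise.keys())
    event_ids = [chrom_to_hal_promise[name] for name in event_names]
    return root_job.addFollowOnJobFn(toil_call_hal_append_subtrees, options,
                                     project, root_name, root_hal_promise,
                                     event_names, *event_ids)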
def align_toil(job, chrom, seq_file_id, paf_file_id, config_id, options):
    """ run cactus-align """
    work_dir = job.fileStore.getLocalTempDir()

    config_file = os.path.join(work_dir, 'config.xml')
    job.fileStore.readGlobalFile(config_id, config_file)

    seq_file = os.path.join(work_dir, '{}_seq_file.txt'.format(chrom))
    job.fileStore.readGlobalFile(seq_file_id, seq_file)

    paf_file = os.path.join(work_dir, '{}.paf'.format(chrom))
    job.fileStore.readGlobalFile(paf_file_id, paf_file)

    # jobstore for the cactus-align sub-workflow
    js = os.path.join(work_dir, 'js')

    if options.outHal.startswith('s3://'):
        out_file = os.path.join(options.outHal, '{}.hal'.format(chrom))
    else:
        out_file = os.path.join(work_dir, '{}.hal'.format(chrom))
    log_file = os.path.join(work_dir, '{}.hal.log'.format(chrom))

    cmd = ['cactus-align', js, seq_file, paf_file, out_file, '--logFile', log_file] + options.alignOptions.split()
    cactus_call(parameters=cmd)

    ret_ids = [None, None, None, None]
    if not options.outHal.startswith('s3://'):
        # we're not checkpointing directly to s3, so we return the output file ids
        ret_ids[0] = job.fileStore.writeGlobalFile(out_file)
        out_vg = os.path.splitext(out_file)[0] + '.vg'
        if os.path.exists(out_vg):
            ret_ids[1] = job.fileStore.writeGlobalFile(out_vg)
        out_gfa = os.path.splitext(out_file)[0] + '.gfa.gz'
        if os.path.exists(out_gfa):
            ret_ids[2] = job.fileStore.writeGlobalFile(out_gfa)
        ret_ids[3] = job.fileStore.writeGlobalFile(log_file)
    else:
        write_s3(log_file, out_file + '.log', region=get_aws_region(options.jobStore))

    return ret_ids
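# A minimal sketch (hypothetical helper; file paths are placeholders) of driving
# a single align_toil job directly: import the inputs into the job store, wrap
# the job function, and collect [halID, vgID, gfaID, logID] (entries are None
# when the corresponding output wasn't produced, or when checkpointing straight
# to s3).
def _example_run_single_chrom(options, chrom, seq_path, paf_path, config_path):
    with Toil(options) as toil:
        seq_id = toil.importFile(makeURL(seq_path))
        paf_id = toil.importFile(makeURL(paf_path))
        config_id = toil.importFile(makeURL(config_path))
        return toil.start(Job.wrapJobFn(align_toil, chrom, seq_id, paf_id,
                                        config_id, options))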
def main():
    """ entry point for cactus-align """
    parser = ArgumentParser()
    Job.Runner.addToilOptions(parser)
    addCactusWorkflowOptions(parser)

    parser.add_argument("seqFile", help="Seq file")
    parser.add_argument("cigarsFile", nargs="*",
                        help="Pairwise alignments (from cactus-blast, cactus-refmap or cactus-graphmap)")
    parser.add_argument("outHal", type=str, help="Output HAL file (or directory in --batch mode)")
    parser.add_argument("--pathOverrides", nargs="*",
                        help="paths (multiple allowed) to override from seqFile")
    parser.add_argument("--pathOverrideNames", nargs="*",
                        help="names (must be same number as --pathOverrides) of path overrides")

    # Pangenome Options
    parser.add_argument("--pangenome", action="store_true",
                        help="Activate pangenome mode (suitable for star trees of closely related samples)"
                        " by overriding several configuration settings."
                        " The overridden configuration will be saved in <outHal>.pg-conf.xml")
    parser.add_argument("--pafInput", action="store_true",
                        help="'cigarsFile' argument is in PAF format, rather than lastz cigars.")
    parser.add_argument("--usePafSecondaries", action="store_true",
                        help="use the secondary alignments from the PAF input. They are ignored by default.")
    parser.add_argument("--singleCopySpecies", type=str,
                        help="Filter out all self-alignments in given species")
    parser.add_argument("--barMaskFilter", type=int, default=None,
                        help="BAR's POA aligner will ignore softmasked regions greater than this length."
                        " (overrides partialOrderAlignmentMaskFilter in config)")
    parser.add_argument("--outVG", action="store_true",
                        help="export pangenome graph in VG (.vg) in addition to HAL")
    parser.add_argument("--outGFA", action="store_true",
                        help="export pangenome graph in GFA (.gfa.gz) in addition to HAL")
    parser.add_argument("--batch", action="store_true",
                        help="Launch batch of alignments. Input seqfile is expected to be chromfile"
                        " as generated by cactus-graphmap-split")
    parser.add_argument("--stagger", type=int, default=0,
                        help="Stagger alignment jobs in batch mode by this many seconds"
                        " (to avoid starting all at once)")
    parser.add_argument("--acyclic", type=str,
                        help="Ensure that given genome is acyclic by deleting all paralogy edges"
                        " in postprocessing")

    # Progressive Cactus Options
    parser.add_argument("--configFile", dest="configFile",
                        help="Specify cactus configuration file",
                        default=os.path.join(cactusRootPath(), "cactus_progressive_config.xml"))
    parser.add_argument("--root", dest="root", default=None,
                        help="Name of ancestral node (which must appear in NEWICK tree"
                        " in <seqfile>) to use as a root for the alignment. Any genomes"
                        " not below this node in the tree may be used as outgroups but"
                        " will never appear in the output. If no root is specified then"
                        " the root of the tree is used.")
    parser.add_argument("--latest", dest="latest", action="store_true",
                        help="Use the latest version of the docker container"
                        " rather than pulling one matching this version of cactus")
    parser.add_argument("--containerImage", dest="containerImage", default=None,
                        help="Use the specified pre-built container image"
                        " rather than pulling one from quay.io")
    parser.add_argument("--binariesMode", choices=["docker", "local", "singularity"],
                        help="The way to run the Cactus binaries", default=None)
    parser.add_argument("--nonCactusInput", action="store_true",
                        help="Input lastz cigars do not come from cactus-blast or cactus-refmap:"
                        " Prepend ids in cigars")
    parser.add_argument("--database", choices=["kyoto_tycoon", "redis"],
                        help="The type of database", default="kyoto_tycoon")

    options = parser.parse_args()

    setupBinaries(options)
    setLoggingFromOptions(options)
    enableDumpStack()

    if options.pathOverrides or options.pathOverrideNames:
        if not options.pathOverrides or not options.pathOverrideNames or \
           len(options.pathOverrideNames) != len(options.pathOverrides):
            raise RuntimeError('same number of values must be passed to --pathOverrides and --pathOverrideNames')

    # cactus doesn't run with 1 core
    if options.batchSystem == 'singleMachine':
        if options.maxCores is not None:
            if int(options.maxCores) < 2:
                raise RuntimeError('Cactus requires --maxCores > 1')
        else:
            # is there a way to get this out of Toil? That would be more consistent
            if cpu_count() < 2:
                raise RuntimeError('Only 1 CPU detected. Cactus requires at least 2')

    options.buildHal = True
    options.buildFasta = True

    if options.outHal.startswith('s3://'):
        if not has_s3:
            raise RuntimeError("S3 support requires toil to be installed with [aws]")
        # write a little something to the bucket now to catch any glaring problems asap
        test_file = os.path.join(getTempDirectory(), 'check')
        with open(test_file, 'w') as test_o:
            test_o.write("\n")
        region = get_aws_region(options.jobStore) if options.jobStore.startswith('aws:') else None
        write_s3(test_file,
                 options.outHal if options.outHal.endswith('.hal') else os.path.join(options.outHal, 'test'),
                 region=region)
        options.checkpointInfo = (get_aws_region(options.jobStore), options.outHal)
    else:
        options.checkpointInfo = None

    if options.batch:
        # the output hal is a directory, make sure it's there
        if not os.path.isdir(options.outHal):
            os.makedirs(options.outHal)
        assert len(options.cigarsFile) == 0
    else:
        assert len(options.cigarsFile) > 0

    # Mess with some toil options to create useful defaults.
    cactus_override_toil_options(options)

    # We set which type of unique ids to expect: Numeric (from cactus-blast) or
    # Eventname (cactus-refmap or cactus-graphmap).
    # This is a bit ugly, since we don't have a good way to differentiate refmap from
    # blast, and use --pangenome as a proxy. But I don't think there's a real use case
    # yet of making a separate parameter.
    env_event_name_as_id = os.environ.get('CACTUS_EVENT_NAME_AS_UNIQUE_ID')
    if env_event_name_as_id is not None:
        # treat an empty string or '0' as False, anything else as True
        options.eventNameAsID = bool(env_event_name_as_id) and env_event_name_as_id != '0'
    else:
        options.eventNameAsID = options.pangenome or options.pafInput
    os.environ['CACTUS_EVENT_NAME_AS_UNIQUE_ID'] = str(int(options.eventNameAsID))

    start_time = timeit.default_timer()
    with Toil(options) as toil:
        importSingularityImage(options)
        if options.restart:
            results_dict = toil.restart()
        else:
            align_jobs = make_batch_align_jobs(options, toil)
            results_dict = toil.start(Job.wrapJobFn(run_batch_align_jobs, align_jobs))

        # when using s3 output urls, things get checkpointed as they're made so no reason to export
        # todo: make a more unified interface throughout cactus for this
        # (see toil-vg's outstore logic which, while not perfect, would be an improvement)
        if not options.outHal.startswith('s3://'):
            if options.batch:
                for chrom, results in results_dict.items():
                    toil.exportFile(results[0], makeURL(os.path.join(options.outHal, '{}.hal'.format(chrom))))
                    if options.outVG:
                        toil.exportFile(results[1], makeURL(os.path.join(options.outHal, '{}.vg'.format(chrom))))
                    if options.outGFA:
                        toil.exportFile(results[2], makeURL(os.path.join(options.outHal, '{}.gfa.gz'.format(chrom))))
            else:
                assert len(results_dict) == 1 and None in results_dict
                halID, vgID, gfaID = results_dict[None][0], results_dict[None][1], results_dict[None][2]
                # export the hal
                toil.exportFile(halID, makeURL(options.outHal))
                # export the vg
                if options.outVG:
                    toil.exportFile(vgID, makeURL(os.path.splitext(options.outHal)[0] + '.vg'))
                if options.outGFA:
                    toil.exportFile(gfaID, makeURL(os.path.splitext(options.outHal)[0] + '.gfa.gz'))

    end_time = timeit.default_timer()
    run_time = end_time - start_time
    logger.info("cactus-align has finished after {} seconds".format(run_time))
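# Example invocations (all paths hypothetical). In batch mode the seqFile is a
# chromfile from cactus-graphmap-split, no cigarsFile is given, and outHal is a
# directory:
#
#   cactus-align ./jobstore ./chroms/chromfile.txt ./out-dir --batch --pangenome \
#       --outVG --stagger 30
#
# In the default single-alignment mode, the pairwise alignments and output HAL
# are passed explicitly:
#
#   cactus-align ./jobstore ./seqfile.txt ./blast-output.cigar ./out.hal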
def make_align_job(options, toil):
    """Set up the progressive cactus project for a single alignment and return
    the (unscheduled) run_cactus_align job."""
    options.cactusDir = getTempDirectory()

    # apply path overrides. this was necessary for wdl which doesn't take kindly to
    # text files of local paths (ie seqfile). one way to fix would be to add support
    # for s3 paths and force wdl to use it. a better way would be a more fundamental
    # interface shift away from files of paths throughout all of cactus
    if options.pathOverrides:
        seqFile = SeqFile(options.seqFile)
        configNode = ET.parse(options.configFile).getroot()
        config = ConfigWrapper(configNode)
        tree = MultiCactusTree(seqFile.tree)
        tree.nameUnlabeledInternalNodes(prefix=config.getDefaultInternalNodePrefix())
        for name, override in zip(options.pathOverrideNames, options.pathOverrides):
            seqFile.pathMap[name] = override
        override_seq = os.path.join(options.cactusDir, 'seqFile.override')
        with open(override_seq, 'w') as out_sf:
            out_sf.write(str(seqFile))
        options.seqFile = override_seq

    if not options.root:
        seqFile = SeqFile(options.seqFile)
        configNode = ET.parse(options.configFile).getroot()
        config = ConfigWrapper(configNode)
        mcTree = MultiCactusTree(seqFile.tree)
        mcTree.nameUnlabeledInternalNodes(prefix=config.getDefaultInternalNodePrefix())
        options.root = mcTree.getRootName()

    if options.acyclic:
        seqFile = SeqFile(options.seqFile)
        tree = MultiCactusTree(seqFile.tree)
        leaves = [tree.getName(leaf) for leaf in tree.getLeaves()]
        if options.acyclic not in leaves:
            raise RuntimeError("Genome specified with --acyclic, {}, not found in tree leaves".format(options.acyclic))

    # to be consistent with all-in-one cactus, we make sure the project
    # isn't limiting itself to the subtree (todo: parameterize so root can
    # be passed through from prepare to blast/align)
    proj_options = copy.deepcopy(options)
    proj_options.root = None
    # Create the progressive cactus project (as we do in runCactusProgressive)
    projWrapper = ProjectWrapper(proj_options, proj_options.configFile, ignoreSeqPaths=options.root)
    projWrapper.writeXml()

    pjPath = os.path.join(options.cactusDir, ProjectWrapper.alignmentDirName,
                          '%s_project.xml' % ProjectWrapper.alignmentDirName)
    assert os.path.exists(pjPath)

    project = MultiCactusProject()

    if not os.path.isdir(options.cactusDir):
        os.makedirs(options.cactusDir)

    project.readXML(pjPath)

    # open up the experiment (as we do in ProgressiveUp.run)
    # note that we copy the path into the options here
    experimentFile = project.expMap[options.root]
    expXml = ET.parse(experimentFile).getroot()
    experiment = ExperimentWrapper(expXml)
    configPath = experiment.getConfigPath()
    configXml = ET.parse(configPath).getroot()

    seqIDMap = dict()
    tree = MultiCactusTree(experiment.getTree()).extractSubTree(options.root)
    leaves = [tree.getName(leaf) for leaf in tree.getLeaves()]
    outgroups = experiment.getOutgroupGenomes()
    genome_set = set(leaves + outgroups)

    # this is a hack to allow specifying all the input on the command line, rather than using suffix lookups
    def get_input_path(suffix=''):
        base_path = options.cigarsFile[0]
        for input_path in options.cigarsFile:
            if suffix and input_path.endswith(suffix):
                return input_path
            if os.path.basename(base_path).startswith(os.path.basename(input_path)):
                base_path = input_path
        return base_path + suffix

    # import the outgroups
    outgroupIDs = []
    outgroup_fragment_found = False
    for i, outgroup in enumerate(outgroups):
        try:
            outgroupID = toil.importFile(makeURL(get_input_path('.og_fragment_{}'.format(i))))
            outgroupIDs.append(outgroupID)
            experiment.setSequenceID(outgroup, outgroupID)
            outgroup_fragment_found = True
            assert not options.pangenome
        except:
            # we assume that input is not coming from cactus blast, so we'll treat output
            # sequences normally and not go looking for fragments
            outgroupIDs = []
            break

    # import the sequences (that we need to align for the given event, ie leaves and outgroups)
    for genome, seq in list(project.inputSequenceMap.items()):
        if genome in leaves or (not outgroup_fragment_found and genome in outgroups):
            if os.path.isdir(seq):
                tmpSeq = getTempFile()
                catFiles([os.path.join(seq, subSeq) for subSeq in os.listdir(seq)], tmpSeq)
                seq = tmpSeq
            seq = makeURL(seq)
            logger.info("Importing {}".format(seq))
            experiment.setSequenceID(genome, toil.importFile(seq))

    if not outgroup_fragment_found:
        outgroupIDs = [experiment.getSequenceID(outgroup) for outgroup in outgroups]

    # write back the experiment, as CactusWorkflowArguments wants a path
    experiment.writeXML(experimentFile)

    # import cactus config
    if options.configFile:
        cactusConfigID = toil.importFile(makeURL(options.configFile))
    else:
        cactusConfigID = toil.importFile(makeURL(project.getConfigPath()))
    project.setConfigID(cactusConfigID)

    project.syncToFileStore(toil)
    configNode = ET.parse(project.getConfigPath()).getroot()
    configWrapper = ConfigWrapper(configNode)
    configWrapper.substituteAllPredefinedConstantsWithLiterals()

    if options.singleCopySpecies:
        findRequiredNode(configWrapper.xmlRoot, "caf").attrib["alignmentFilter"] = \
            "singleCopyEvent:{}".format(options.singleCopySpecies)

    if options.barMaskFilter:
        findRequiredNode(configWrapper.xmlRoot, "bar").attrib["partialOrderAlignmentMaskFilter"] = \
            str(options.barMaskFilter)

    if options.pangenome:
        # turn off the megablock filter as it ruins non-all-to-all alignments
        findRequiredNode(configWrapper.xmlRoot, "caf").attrib["minimumBlockHomologySupport"] = "0"
        findRequiredNode(configWrapper.xmlRoot, "caf").attrib["minimumBlockDegreeToCheckSupport"] = "9999999999"
        # turn off mapq filtering
        findRequiredNode(configWrapper.xmlRoot, "caf").attrib["runMapQFiltering"] = "0"
        # more iterations here helps quite a bit to reduce underalignment
        findRequiredNode(configWrapper.xmlRoot, "caf").attrib["maxRecoverableChainsIterations"] = "50"
        # turn down minimum block degree to get a fat ancestor
        findRequiredNode(configWrapper.xmlRoot, "bar").attrib["minimumBlockDegree"] = "1"
        # turn on POA
        findRequiredNode(configWrapper.xmlRoot, "bar").attrib["partialOrderAlignment"] = "1"
        # save it
        if not options.batch:
            pg_file = options.outHal + ".pg-conf.xml"
            if pg_file.startswith('s3://'):
                pg_temp_file = getTempFile()
            else:
                pg_temp_file = pg_file
            configWrapper.writeXML(pg_temp_file)
            if pg_file.startswith('s3://'):
                write_s3(pg_temp_file, pg_file, region=get_aws_region(options.jobStore))
            logger.info("pangenome configuration overrides saved in {}".format(pg_file))

    workFlowArgs = CactusWorkflowArguments(options, experimentFile=experimentFile,
                                           configNode=configNode, seqIDMap=project.inputSequenceIDMap)

    # import the files that cactus-blast made
    workFlowArgs.alignmentsID = toil.importFile(makeURL(get_input_path()))
    workFlowArgs.secondaryAlignmentsID = None
    if not options.pafInput:
        try:
            workFlowArgs.secondaryAlignmentsID = toil.importFile(makeURL(get_input_path('.secondary')))
        except:
            pass
    workFlowArgs.outgroupFragmentIDs = outgroupIDs
    workFlowArgs.ingroupCoverageIDs = []
    if outgroup_fragment_found and len(outgroups) > 0:
        for i in range(len(leaves)):
            workFlowArgs.ingroupCoverageIDs.append(
                toil.importFile(makeURL(get_input_path('.ig_coverage_{}'.format(i)))))

    align_job = Job.wrapJobFn(run_cactus_align,
                              configWrapper,
                              workFlowArgs,
                              project,
                              checkpointInfo=options.checkpointInfo,
                              doRenaming=options.nonCactusInput,
                              pafInput=options.pafInput,
                              pafSecondaries=options.usePafSecondaries,
                              doVG=options.outVG,
                              doGFA=options.outGFA,
                              delay=options.stagger,
                              eventNameAsID=options.eventNameAsID,
                              acyclicEvent=options.acyclic)
    return align_job
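# A minimal sketch (assumes options were already validated and defaulted as in
# main()) of running the job returned by make_align_job outside of batch mode:
def _example_start_alignment(options):
    with Toil(options) as toil:
        align_job = make_align_job(options, toil)
        # returns the file ids produced by run_cactus_align (hal/vg/gfa/...)
        return toil.start(align_job)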