Esempio n. 1
0
def get_plan(options, project, inSeqFile, outSeqFile):
    """Build the text of an execution plan (shell commands or WDL).

    The plan has three sections: preprocessing of the leaf genomes, rounds of
    blast/align jobs for the remaining events (each round only depends on
    events resolved in earlier rounds), and merging of the per-event HAL
    files into the root HAL.

    Args:
        options: parsed options; this function reads ``wdl``,
            ``preprocessBatchSize``, ``preprocessOnly``, ``seqFile``,
            ``outSeqFile``, ``outHal``, ``outDir``, ``cactusOptions`` and
            ``halOptions`` (helpers it calls may read more).
        project: project object handed to ``Schedule.loadProject`` and used
            for ``project.mcTree.getRootName()``.
        inSeqFile: input seqfile object, forwarded to the WDL helpers.
        outSeqFile: output seqfile object; its ``tree`` and ``pathMap`` define
            the leaves and the events to align.

    Returns:
        The plan as a single string.

    Note:
        Calls ``sys.exit(1)`` after writing a diagnostic to stderr when no
        remaining event has all of its dependencies resolved.
    """
    plan = get_generation_info() + '\n'

    if options.wdl:
        plan += wdl_workflow_start(options, inSeqFile)

    # preprocessing
    plan += '\n## Preprocessor\n'
    leaves = [outSeqFile.tree.getName(leaf) for leaf in outSeqFile.tree.getLeaves()]
    for i in range(0, len(leaves), options.preprocessBatchSize):
        pre_batch = leaves[i:i+options.preprocessBatchSize]
        if options.wdl:
            # the WDL preprocess call takes exactly one genome
            assert len(pre_batch) == 1
            plan += wdl_call_preprocess(options, inSeqFile, outSeqFile, leaves[i])
        else:
            plan += 'cactus-preprocess {} {} {} --inputNames {} {} {}\n'.format(
                get_jobstore(options), options.seqFile, options.outSeqFile, ' '.join(pre_batch),
                options.cactusOptions, get_toil_resource_opts(options, 'preprocess'))

    if options.preprocessOnly:
        # if we're only making preprocess jobs, we can end early
        plan += '\n## Cactus\n'
        plan += 'cactus {} {} {} {}\n'.format(get_jobstore(options), options.outSeqFile,
                                              options.outHal, options.cactusOptions)
        return plan

    # schedule up the alignments
    schedule = Schedule()
    schedule.loadProject(project)
    schedule.compute()

    # set of all jobs, as genome names from the (fully resolved, output) seqfile
    events = set(outSeqFile.pathMap.keys()) - set(leaves)
    resolved = set(leaves)

    # convert follow-ons to dependencies: a follow-on depends on every event
    # it follows (kept as a set so events sharing a follow-on are all retained)
    follow_on_deps = {}
    for event in events:
        fo = schedule.followOn(event)
        if fo:
            follow_on_deps.setdefault(fo, set()).add(event)

    def get_deps(event):
        """Return the set of event names that must finish before *event*."""
        deps = set(schedule.deps(event))
        # add the event names themselves (wrapping a name in set() would
        # split it into individual characters)
        deps.update(follow_on_deps.get(event, ()))
        # I don't know why the schedule doesn't always give the children
        # todo: understand!
        try:
            node_id = outSeqFile.tree.getNodeId(event)
        except Exception:
            # event is not a node of the tree: no tree children to add
            node_id = None
        if node_id is not None:
            for node in outSeqFile.tree.getChildren(node_id):
                if not outSeqFile.tree.isLeaf(node):
                    deps.add(outSeqFile.tree.getName(node))
        return deps

    # there can be chains of virtual events not caught by a single round
    # of get_deps, so we iterate till we have them all
    events_and_virtuals = set(events)
    frontier = set(events)
    while frontier:
        next_frontier = set()
        for event in frontier:
            for dep in get_deps(event):
                if dep not in events_and_virtuals:
                    events_and_virtuals.add(dep)
                    next_frontier.add(dep)
        frontier = next_frontier

    # group jobs into rounds.  where all jobs of round i can be run in parallel
    groups = []
    while events_and_virtuals:
        ready = [event for event in events_and_virtuals
                 if all(dep in resolved for dep in get_deps(event))]
        if not ready:
            sys.stderr.write("schedule deadlock:\n")
            for event in events_and_virtuals:
                sys.stderr.write("{} has deps {}\n".format(event, ["{}(resolved={})".format(d, d in resolved) for d in get_deps(event)]))
            sys.exit(1)
        resolved.update(ready)
        events_and_virtuals.difference_update(ready)
        # virtual events are resolved like any other but produce no job
        groups.append([event for event in ready if not schedule.isVirtual(event)])

    root = project.mcTree.getRootName()

    def halPath(event):
        """Return the HAL output path for *event* (the root uses outHal)."""
        if event == root:
            return options.outHal
        return os.path.join(options.outDir, event + '.hal')

    def cigarPath(event):
        """Return the cigar output path for *event*."""
        return os.path.join(options.outDir, event + '.cigar')

    # alignment groups
    plan += '\n## Alignment\n'
    for i, group in enumerate(groups):
        plan += '\n### Round {}'.format(i)
        for event in sorted(group):
            plan += '\n'
            if options.wdl:
                plan += wdl_call_blast(options, project, event, cigarPath(event))
                plan += wdl_call_align(options, project, event, cigarPath(event), halPath(event), outSeqFile.pathMap[event])
            else:
                # todo: support cactus interface (it's easy enough here, but cactus_progressive.py needs changes to handle)
                plan += 'cactus-blast {} {} {} --root {} {} {}\n'.format(
                    get_jobstore(options), options.outSeqFile, cigarPath(event), event,
                    options.cactusOptions, get_toil_resource_opts(options, 'blast'))
                plan += 'cactus-align {} {} {} {} --root {} {} {}\n'.format(
                    get_jobstore(options), options.outSeqFile, cigarPath(event), halPath(event), event,
                    options.cactusOptions, get_toil_resource_opts(options, 'align'))
                # todo: just output the fasta in cactus-align.
                plan += 'hal2fasta {} {} {} > {}\n'.format(halPath(event), event, options.halOptions, outSeqFile.pathMap[event])

    # stitch together the final tree, walking the rounds from last to first
    plan += '\n## HAL merging\n'
    prev_event = None
    append_count = 0
    for group in reversed(groups):
        for event in group:
            if event != root:
                if options.wdl:
                    plan += wdl_call_hal_append(options, project, event, prev_event)
                else:
                    plan += 'halAppendSubtree {} {} {} {} --merge {}\n'.format(
                        halPath(root), halPath(event), event, event, options.halOptions)
                append_count += 1
            prev_event = event

    if options.wdl:
        plan += wdl_workflow_end(options, prev_event, append_count > 1)

    return plan
Esempio n. 2
0
def get_plan(options, project, inSeqFile, outSeqFile, toil):
    """Build an execution plan as text and, in toil mode, run it as a job graph.

    The plan has three sections: preprocessing of the leaf genomes, rounds of
    blast/align jobs for the remaining events (each round only depends on
    events resolved in earlier rounds), and merging of the per-event HAL
    files into the root HAL.  Depending on ``options`` the steps are emitted
    as WDL (``options.wdl``), attached to a toil job graph (``options.toil``)
    or written as plain shell commands.

    Args:
        options: parsed options; this function reads ``wdl``, ``toil``,
            ``preprocessBatchSize``, ``preprocessOnly``, ``seqFile``,
            ``outSeqFile``, ``outHal``, ``outDir``, ``cactusOptions``,
            ``halOptions``, ``database`` and the per-stage ``*Cores``,
            ``*Memory`` and ``*Disk`` values.  In WDL mode ``options.pp_map``
            is reset to an empty dict.
        project: project object handed to ``Schedule.loadProject`` and used
            for ``project.mcTree.getRootName()``.
        inSeqFile: input seqfile object, forwarded to the preprocess helpers.
        outSeqFile: output seqfile object; its ``tree`` and ``pathMap`` define
            the leaves and the events to align.
        toil: object whose ``start(job)`` is called with the root job when
            ``options.toil`` is set.

    Returns:
        The plan as a single string.

    Note:
        Calls ``sys.exit(1)`` after writing a diagnostic to stderr when no
        remaining event has all of its dependencies resolved.
    """
    plan = get_generation_info() + '\n'

    if options.wdl:
        plan += wdl_workflow_start(options, inSeqFile)
        options.pp_map = {}

    if options.toil:
        # kick things off with an empty job which we will hook subsequent jobs onto
        # (using RoundedJob because root job must be subclass of Job,
        #  https://github.com/ComparativeGenomicsToolkit/cactus/pull/284#issuecomment-684125478)
        start_job = RoundedJob()
        parent_job = start_job
        job_idx = {}

    # preprocessing
    plan += '\n## Preprocessor\n'
    leaves = [outSeqFile.tree.getName(leaf) for leaf in outSeqFile.tree.getLeaves()]
    for i in range(0, len(leaves), options.preprocessBatchSize):
        pre_batch = leaves[i:i+options.preprocessBatchSize]
        if options.wdl:
            plan += wdl_call_preprocess(options, inSeqFile, outSeqFile, pre_batch)
        elif options.toil:
            # one job per leaf: later rounds look up ("preprocess", leaf) for
            # every leaf dependency, so no member of the batch may be skipped
            for leaf in pre_batch:
                job_idx[("preprocess", leaf)] = parent_job.addChildJobFn(toil_call_preprocess, options, inSeqFile, outSeqFile, leaf,
                                                                         cores=options.preprocessCores,
                                                                         memory=options.preprocessMemory,
                                                                         disk=options.preprocessDisk)
        else:
            plan += 'cactus-preprocess {} {} {} --inputNames {} {} {}\n'.format(
                get_jobstore(options), options.seqFile, options.outSeqFile, ' '.join(pre_batch),
                options.cactusOptions, get_toil_resource_opts(options, 'preprocess'))

    if options.preprocessOnly:
        # NOTE(review): in toil mode this returns before toil.start() is
        # called, so the preprocess jobs added above are never run - confirm
        # that this combination is rejected by the caller.
        plan += '\n## Cactus\n'
        plan += 'cactus {} {} {} {}\n'.format(get_jobstore(options), options.outSeqFile,
                                              options.outHal, options.cactusOptions)
        return plan

    # schedule up the alignments
    schedule = Schedule()
    schedule.loadProject(project)
    schedule.compute()

    # set of all jobs, as genome names from the (fully resolved, output) seqfile
    events = set(outSeqFile.pathMap.keys()) - set(leaves)
    resolved = set(leaves)

    # convert follow-ons to dependencies: a follow-on depends on every event
    # it follows (kept as a set so events sharing a follow-on are all retained)
    follow_on_deps = {}
    for event in events:
        fo = schedule.followOn(event)
        if fo:
            follow_on_deps.setdefault(fo, set()).add(event)

    def get_deps(event):
        """Return the set of event names that must finish before *event*."""
        deps = set(schedule.deps(event))
        # add the event names themselves (wrapping a name in set() would
        # split it into individual characters)
        deps.update(follow_on_deps.get(event, ()))
        # I don't know why the schedule doesn't always give the children
        # todo: understand!
        try:
            node_id = outSeqFile.tree.getNodeId(event)
        except Exception:
            # event is not a node of the tree: no tree children to add
            node_id = None
        if node_id is not None:
            for node in outSeqFile.tree.getChildren(node_id):
                if not outSeqFile.tree.isLeaf(node):
                    deps.add(outSeqFile.tree.getName(node))
        return deps

    # add all events, potentially looping through virtual dependency chains
    # (hence the double loop)
    events_and_virtuals = set(events)
    batch = set(events_and_virtuals)
    while batch:
        next_batch = set()
        for event in batch:
            for dep in get_deps(event):
                if dep not in events_and_virtuals:
                    next_batch.add(dep)
                    events_and_virtuals.add(dep)
        batch = next_batch

    # group jobs into rounds.  where all jobs of round i can be run in parallel
    groups = []
    while events_and_virtuals:
        ready = [event for event in events_and_virtuals
                 if all(dep in resolved for dep in get_deps(event))]
        if not ready:
            sys.stderr.write("schedule deadlock:\n")
            for event in events_and_virtuals:
                sys.stderr.write("{} has deps {}\n".format(event, get_deps(event)))
            sys.exit(1)
        resolved.update(ready)
        events_and_virtuals.difference_update(ready)
        # virtual events are resolved like any other but produce no job
        groups.append([event for event in ready if not schedule.isVirtual(event)])

    root = project.mcTree.getRootName()

    def halPath(event):
        """Return the HAL output path for *event* (the root uses outHal)."""
        if event == root:
            return options.outHal
        return os.path.join(options.outDir, event + '.hal')

    def cigarPath(event):
        """Return the cigar output path for *event*."""
        return os.path.join(options.outDir, event + '.cigar')

    # alignment groups
    plan += '\n## Alignment\n'
    for i, group in enumerate(groups):
        plan += '\n### Round {}'.format(i)
        if options.toil:
            # advance toil phase
            # todo: recapitulate exact dependencies
            parent_job = parent_job.addFollowOn(Job())
        for event in sorted(group):
            plan += '\n'
            if options.wdl:
                plan += wdl_call_blast(options, project, event, cigarPath(event))
                plan += wdl_call_align(options, project, event, cigarPath(event), halPath(event), outSeqFile.pathMap[event])
            elif options.toil:
                # promises only get fulfilled if they are passed directly as arguments to the toil job, so we pull out the ones we need here
                leaf_deps, anc_deps = get_dep_names(options, project, event)
                fa_promises = [job_idx[("preprocess", dep)].rv() for dep in leaf_deps] + [job_idx[("align", dep)].rv(0) for dep in anc_deps]
                # NOTE(review): the blast job is given options.preprocessDisk
                # while cores/memory use the blast-specific options - confirm
                # this is intended (no blast disk option is visible here).
                job_idx[("blast", event)] = parent_job.addChildJobFn(toil_call_blast,
                                                                     options,
                                                                     outSeqFile,
                                                                     project,
                                                                     event,
                                                                     cigarPath(event),
                                                                     leaf_deps + anc_deps,
                                                                     *fa_promises,
                                                                     cores=options.blastCores,
                                                                     memory=options.blastMemory,
                                                                     disk=options.preprocessDisk)
                # the align job follows its blast job and consumes its result
                job_idx[("align", event)] = job_idx[("blast", event)].addFollowOnJobFn(toil_call_align,
                                                                                       options, outSeqFile,
                                                                                       project,
                                                                                       event,
                                                                                       cigarPath(event),
                                                                                       halPath(event),
                                                                                       outSeqFile.pathMap[event],
                                                                                       job_idx[("blast", event)].rv(),
                                                                                       leaf_deps + anc_deps, *fa_promises,
                                                                                       cores=options.alignCores,
                                                                                       memory=options.alignMemory,
                                                                                       disk=options.alignDisk)
            else:
                # todo: support cactus interface (it's easy enough here, but cactus_progressive.py needs changes to handle)
                plan += 'cactus-blast {} {} {} --root {} {} {}\n'.format(
                    get_jobstore(options), options.outSeqFile, cigarPath(event), event,
                    options.cactusOptions, get_toil_resource_opts(options, 'blast'))
                plan += 'cactus-align {} {} {} {} --root {} {} {} --database {}\n'.format(
                    get_jobstore(options), options.outSeqFile, cigarPath(event), halPath(event), event,
                    options.cactusOptions, get_toil_resource_opts(options, 'align'), options.database)
                # todo: just output the fasta in cactus-align.
                plan += 'hal2fasta {} {} {} > {}\n'.format(halPath(event), event, options.halOptions, outSeqFile.pathMap[event])

    # advance toil phase
    if options.toil:
        parent_job = parent_job.addFollowOn(Job())

    # stitch together the final tree, walking the rounds from last to first
    plan += '\n## HAL merging\n'
    prev_event = None
    append_count = 0
    event_list = []
    for group in reversed(groups):
        for event in group:
            if event != root:
                if options.wdl:
                    plan += wdl_call_hal_append(options, project, event, prev_event)
                elif not options.toil:
                    plan += 'halAppendSubtree {} {} {} {} --merge {}\n'.format(
                        halPath(root), halPath(event), event, event, options.halOptions)
                append_count += 1
                event_list.append(event)
            prev_event = event

    if options.toil:
        # a single toil job appends every non-root subtree; the align results
        # are passed as direct arguments so their promises are fulfilled
        job_idx['hal_append'] = parent_job.addChildJobFn(toil_call_hal_append_subtrees,
                                                         options,
                                                         project,
                                                         root,
                                                         job_idx[('align', root)].rv(1),
                                                         event_list,
                                                         *[job_idx[('align', e)].rv(1) for e in event_list],
                                                         cores=1,
                                                         memory=options.alignMemory,
                                                         disk=options.halAppendDisk)

    if options.wdl:
        plan += wdl_workflow_end(options, prev_event, append_count > 1)

    if options.toil:
        # run the job graph built above and report the wall-clock time
        start_time = timeit.default_timer()
        toil.start(start_job)
        end_time = timeit.default_timer()
        run_time = end_time - start_time
        logger.info("cactus-prepare-toil has finished after {} seconds".format(run_time))

    return plan