Ejemplo n.º 1
0
 def __init__(self, options, project, event, schedule, depProjects, memory=None, cores=None):
     """Initialise a preemptable job and record its inputs.

     ``memory`` and ``cores`` are forwarded unchanged to
     ``RoundedJob.__init__``; every other argument is stored on the
     instance under the same name.
     """
     RoundedJob.__init__(
         self,
         memory=memory,
         cores=cores,
         preemptable=True,
     )
     self.options, self.project, self.event = options, project, event
     self.schedule, self.depProjects = schedule, depProjects
Ejemplo n.º 2
0
 def __init__(self, options, project, event, eventExpWrapper, schedule, memory=None, cores=None):
     """Initialise a preemptable job and record its inputs.

     ``memory`` and ``cores`` are forwarded unchanged to
     ``RoundedJob.__init__``; every other argument is stored on the
     instance under the same name.
     """
     RoundedJob.__init__(
         self,
         memory=memory,
         cores=cores,
         preemptable=True,
     )
     self.options, self.project, self.event = options, project, event
     self.eventExpWrapper, self.schedule = eventExpWrapper, schedule
Ejemplo n.º 3
0
 def __init__(self, options, project, event, eventExpWrapper, schedule, memory=None, cores=None):
     """Create the job as preemptable and keep the arguments as attributes.

     The optional ``memory`` and ``cores`` values are handed straight to
     ``RoundedJob.__init__``.
     """
     RoundedJob.__init__(
         self,
         memory=memory,
         cores=cores,
         preemptable=True,
     )
     # Plain same-named storage of the constructor arguments.
     self.options = options
     self.project = project
     self.event = event
     self.eventExpWrapper = eventExpWrapper
     self.schedule = schedule
Ejemplo n.º 4
0
 def __init__(self, prepOptions, inSequenceID, chunksToCompute=None):
     """Initialise a preemptable job for one input sequence.

     Cores and memory come from ``prepOptions.cpu`` and
     ``prepOptions.memory``.  The disk value is three times
     ``inSequenceID.size`` when that attribute exists, otherwise ``None``.
     """
     if hasattr(inSequenceID, "size"):
         disk = 3 * inSequenceID.size
     else:
         # No size information available on the ID.
         disk = None
     RoundedJob.__init__(
         self,
         cores=prepOptions.cpu,
         memory=prepOptions.memory,
         disk=disk,
         preemptable=True,
     )
     self.prepOptions = prepOptions
     self.inSequenceID = inSequenceID
     self.chunksToCompute = chunksToCompute
Ejemplo n.º 5
0
 def __init__(self, prepOptions, inSequenceID, chunksToCompute=None):
     """Create a preemptable job and store its three arguments.

     ``disk`` is ``3 * inSequenceID.size`` if the ID has a ``size``
     attribute and ``None`` otherwise; cores and memory are read from
     ``prepOptions``.
     """
     disk = None
     if hasattr(inSequenceID, "size"):
         disk = 3 * inSequenceID.size
     RoundedJob.__init__(
         self,
         cores=prepOptions.cpu,
         memory=prepOptions.memory,
         disk=disk,
         preemptable=True,
     )
     self.prepOptions = prepOptions
     self.inSequenceID = inSequenceID
     self.chunksToCompute = chunksToCompute
Ejemplo n.º 6
0
 def __init__(self, options, project, event, schedule, depProjects, memory=None, cores=None):
     """Create the job as preemptable and keep the arguments as attributes.

     The optional ``memory`` and ``cores`` values are handed straight to
     ``RoundedJob.__init__``.
     """
     RoundedJob.__init__(
         self,
         memory=memory,
         cores=cores,
         preemptable=True,
     )
     # Plain same-named storage of the constructor arguments.
     self.options = options
     self.project = project
     self.event = event
     self.schedule = schedule
     self.depProjects = depProjects
Ejemplo n.º 7
0
 def __init__(self, repeatMaskOptions, queryID, targetIDs):
     """Initialise a preemptable job for a query and its targets.

     Memory is fixed at 4 GiB.  Disk is twice the combined ``size`` of
     ``queryID`` and all of ``targetIDs``.
     """
     targetsSize = sum([target.size for target in targetIDs])
     memory = 4 * 1024 ** 3
     disk = 2 * (queryID.size + targetsSize)
     RoundedJob.__init__(
         self,
         memory=memory,
         disk=disk,
         preemptable=True,
     )
     self.repeatMaskOptions = repeatMaskOptions
     self.queryID = queryID
     self.targetIDs = targetIDs
Ejemplo n.º 8
0
 def __init__(self, prepOptions, inChunkID):
     """Initialise a preemptable job for a single chunk.

     Memory and cores come from ``prepOptions``; the disk value is
     ``inChunkID.size``.
     """
     chunkSize = inChunkID.size
     RoundedJob.__init__(self, memory=prepOptions.memory, cores=prepOptions.cpu,
                         disk=chunkSize, preemptable=True)
     self.prepOptions, self.inChunkID = prepOptions, inChunkID
Ejemplo n.º 9
0
 def __init__(self, prepOptions, chunkIDList):
     """Initialise a preemptable job over a list of chunks.

     Cores and memory come from ``prepOptions``; the disk value is twice
     the summed ``size`` of the entries in ``chunkIDList``.
     """
     totalChunkSize = sum(chunk.size for chunk in chunkIDList)
     disk = 2 * totalChunkSize
     RoundedJob.__init__(self, cores=prepOptions.cpu, memory=prepOptions.memory,
                         disk=disk, preemptable=True)
     self.prepOptions, self.chunkIDList = prepOptions, chunkIDList
Ejemplo n.º 10
0
 def __init__(self, inputSequenceIDs, configNode):
     """Initialise a preemptable job over several input sequences.

     The disk value is the summed ``size`` of those entries of
     ``inputSequenceIDs`` that have a ``size`` attribute; entries without
     one are skipped.
     """
     knownSizes = [seqID.size for seqID in inputSequenceIDs if hasattr(seqID, 'size')]
     RoundedJob.__init__(self, disk=sum(knownSizes), preemptable=True)
     self.inputSequenceIDs, self.configNode = inputSequenceIDs, configNode
Ejemplo n.º 11
0
 def __init__(self, prepOptions, seqIDs, proportionSampled, inChunkID):
     """Initialise a preemptable job for one chunk and a set of sequences.

     Memory and cores come from ``prepOptions``.  The disk value is the
     summed ``size`` of ``seqIDs`` plus three times ``inChunkID.size``.

     All four arguments are stored on the instance under the same name.
     ``proportionSampled`` was previously accepted but discarded; it is
     now kept so the value passed by the caller is not silently lost.
     """
     disk = sum(seqID.size for seqID in seqIDs) + 3 * inChunkID.size
     RoundedJob.__init__(self,
                         memory=prepOptions.memory,
                         cores=prepOptions.cpu,
                         disk=disk,
                         preemptable=True)
     self.prepOptions = prepOptions
     self.seqIDs = seqIDs
     self.proportionSampled = proportionSampled
     self.inChunkID = inChunkID
Ejemplo n.º 12
0
 def __init__(self, repeatMaskOptions, fragmentsID, targetIDs):
     """Initialise a preemptable job for fragments and their targets.

     When ``fragmentsID`` has a ``size`` attribute, memory is 3500000000
     and disk is twice the combined size of the fragments and all
     targets.  Otherwise both are passed as ``None``.
     """
     if not hasattr(fragmentsID, "size"):
         memory = disk = None
     else:
         targetsSize = sum([target.size for target in targetIDs])
         memory = 3500000000
         disk = 2 * (fragmentsID.size + targetsSize)
     RoundedJob.__init__(
         self,
         memory=memory,
         disk=disk,
         preemptable=True,
     )
     self.repeatMaskOptions = repeatMaskOptions
     self.fragmentsID = fragmentsID
     self.targetIDs = targetIDs
Ejemplo n.º 13
0
 def __init__(self, fastaID, inputBedID=None, eventName=None, minLength=None):
     """Initialise a preemptable job for one FASTA file.

     Disk is twice ``fastaID.size`` and memory equals ``fastaID.size``.
     All four arguments are stored on the instance under the same name.
     """
     RoundedJob.__init__(
         self,
         disk=2 * (fastaID.size),
         memory=fastaID.size,
         preemptable=True,
     )
     self.fastaID, self.minLength = fastaID, minLength
     self.inputBedID, self.eventName = inputBedID, eventName
Ejemplo n.º 14
0
 def __init__(self, fastaID, minLength, dnabrnnOpts):
     """Initialise a preemptable job for one FASTA file.

     Memory is fixed at 4 GiB, disk is twice ``fastaID.size`` and the
     core count is whatever ``cpu_count()`` returns.
     """
     memory = 4 * 1024 ** 3
     disk = 2 * fastaID.size
     # todo: clean up
     cores = cpu_count()
     RoundedJob.__init__(self, memory=memory, disk=disk, cores=cores, preemptable=True)
     self.fastaID, self.minLength, self.dnabrnnOpts = fastaID, minLength, dnabrnnOpts
Ejemplo n.º 15
0
 def __init__(self, repeatMaskOptions, queryID, targetIDs):
     """Initialise a preemptable job for a query and its targets.

     Memory is fixed at 4 GiB and disk is twice the combined ``size`` of
     ``queryID`` and all of ``targetIDs``.  ``cores`` is ``cpu_count()``
     when ``repeatMaskOptions.gpuLastz`` is truthy and ``None`` otherwise.
     """
     targetsSize = sum([target.size for target in targetIDs])
     memory = 4 * 1024 ** 3
     disk = 2 * (queryID.size + targetsSize)
     # gpu jobs get the whole node (same hack as used in blast phase)
     cores = cpu_count() if repeatMaskOptions.gpuLastz else None
     RoundedJob.__init__(self, memory=memory, disk=disk, cores=cores, preemptable=True)
     self.repeatMaskOptions = repeatMaskOptions
     self.queryID = queryID
     self.targetIDs = targetIDs
Ejemplo n.º 16
0
 def __init__(self, fastaID, dnabrnnOpts, cpu, minLength=None, mergeLength=None, action=None):
     """Initialise a preemptable job for one FASTA file.

     Memory is fixed at 4 GiB, disk is twice ``fastaID.size`` and the
     core count is the smaller of ``cpu_count()`` and ``cpu``.  ``cpu``
     itself is not stored; the other arguments are kept as attributes.
     """
     memory = 4 * 1024 ** 3
     disk = 2 * fastaID.size
     cores = min(cpu_count(), cpu)
     RoundedJob.__init__(self, memory=memory, disk=disk, cores=cores, preemptable=True)
     self.fastaID = fastaID
     self.minLength, self.mergeLength = minLength, mergeLength
     self.action = action
     self.dnabrnnOpts = dnabrnnOpts
Ejemplo n.º 17
0
 def __init__(self, fastaID, dnabrnnOpts, cpu, minLength=None, action=None,
              inputBedID=None, eventName=None):
     """Initialise a preemptable job for one FASTA file.

     Memory is fixed at 4 GiB, disk is twice ``fastaID.size`` and the
     core count is the smaller of ``cpu_count()`` and ``cpu``.  ``cpu``
     itself is not stored; the other arguments are kept as attributes.
     """
     memory = 4 * 1024 ** 3
     disk = 2 * fastaID.size
     cores = min(cpu_count(), cpu)
     RoundedJob.__init__(self, memory=memory, disk=disk, cores=cores, preemptable=True)
     self.fastaID, self.minLength = fastaID, minLength
     self.action, self.dnabrnnOpts = action, dnabrnnOpts
     # todo: moved to fileMasking --> remove from here
     self.inputBedID = inputBedID
     self.eventName = eventName
Ejemplo n.º 18
0
 def __init__(self, inputSequenceID, configNode):
     """Create a preemptable job holding a sequence ID and a config node."""
     RoundedJob.__init__(self, preemptable=True)
     self.inputSequenceID, self.configNode = inputSequenceID, configNode
Ejemplo n.º 19
0
def get_plan(options, project, inSeqFile, outSeqFile, toil):
    """Build the cactus execution plan and, in toil mode, run it.

    The plan is accumulated as text in three sections: preprocessing, one
    alignment section per scheduling round, and HAL merging.  With
    ``options.wdl`` the text comes from the ``wdl_*`` helpers; with
    ``options.toil`` the equivalent jobs are attached to a Toil job graph
    (and run via ``toil.start`` before returning); otherwise plain command
    lines are emitted.

    Args:
        options: parsed options object.  Read for mode flags (``wdl``,
            ``toil``, ``preprocessOnly``), batch size, output paths, extra
            command-line option strings and per-phase resource settings.
            ``options.pp_map`` is reset to ``{}`` in wdl mode.
        project: loaded into the ``Schedule`` and queried for the root name
            through ``project.mcTree``.
        inSeqFile: input seqfile object, forwarded to the preprocess helpers.
        outSeqFile: output seqfile object; its ``tree`` and ``pathMap`` are
            used to find leaves, ancestors and output fasta paths.
        toil: object whose ``start`` method is called with the root job when
            ``options.toil`` is set.

    Returns:
        The plan text.

    Exits the process with status 1 (after listing the unresolved events on
    stderr) if a scheduling round cannot resolve any event.
    """

    plan = get_generation_info() + '\n'

    if options.wdl:
        plan += wdl_workflow_start(options, inSeqFile)
        options.pp_map = {}

    if options.toil:
        # kick things off with an empty job which we will hook subsequent jobs onto
        # (using RoundedJob because root job must be subclass of Job,
        #  https://github.com/ComparativeGenomicsToolkit/cactus/pull/284#issuecomment-684125478)
        start_job = RoundedJob()
        parent_job = start_job
        job_idx = {}

    # preprocessing
    plan += '\n## Preprocessor\n'
    leaves = [outSeqFile.tree.getName(leaf) for leaf in outSeqFile.tree.getLeaves()]
    for i in range(0, len(leaves), options.preprocessBatchSize):
        pre_batch = leaves[i:i+options.preprocessBatchSize]
        if options.wdl:
            plan += wdl_call_preprocess(options, inSeqFile, outSeqFile, pre_batch)
        elif options.toil:
            # NOTE(review): only leaves[i] (not the whole pre_batch) gets a job here, so this
            # looks like it assumes preprocessBatchSize == 1 in toil mode -- confirm with caller
            job_idx[("preprocess", leaves[i])] = parent_job.addChildJobFn(toil_call_preprocess, options, inSeqFile, outSeqFile, leaves[i],
                                                                          cores=options.preprocessCores,
                                                                          memory=options.preprocessMemory,
                                                                          disk=options.preprocessDisk)
        else:
            plan += 'cactus-preprocess {} {} {} --inputNames {} {} {}\n'.format(
                get_jobstore(options), options.seqFile, options.outSeqFile, ' '.join(pre_batch),
                options.cactusOptions, get_toil_resource_opts(options, 'preprocess'))

    if options.preprocessOnly:
        plan += '\n## Cactus\n'
        plan += 'cactus {} {} {} {}\n'.format(get_jobstore(options), options.outSeqFile,
                                              options.outHal, options.cactusOptions)
        return plan

    # schedule up the alignments
    schedule = Schedule()
    schedule.loadProject(project)
    schedule.compute()

    # set of all jobs, as genome names from the (fully resolved, output) seqfile
    events = set(outSeqFile.pathMap.keys()) - set(leaves)
    resolved = set(leaves)

    # convert follow-ons to dependencies
    follow_on_deps = {}
    for event in events:
        fo = schedule.followOn(event)
        if fo:
            follow_on_deps[fo] = event

    def get_deps(event):
        deps = set(schedule.deps(event))
        if event in follow_on_deps:
            # the stored value is a single event name; add it whole
            # (set(name) would split the string into its characters)
            deps.add(follow_on_deps[event])
        # I don't know why the schedule doesn't always give the children
        # todo: understand!
        try:
            has_name = outSeqFile.tree.getNodeId(event) is not None
        except Exception:
            # treat any lookup failure as "not a named tree node", but let
            # KeyboardInterrupt / SystemExit propagate
            has_name = False
        if has_name:
            for node in outSeqFile.tree.getChildren(outSeqFile.tree.getNodeId(event)):
                if not outSeqFile.tree.isLeaf(node):
                    deps.add(outSeqFile.tree.getName(node))
        return deps

    events_and_virtuals = set(events)
    # add all events, potentially looping through virtual dependency chains
    # (hence the double loop)
    batch = set(events_and_virtuals)
    while len(batch) > 0:
        next_batch = set()
        for event in batch:
            for dep in get_deps(event):
                if dep not in events_and_virtuals:
                    next_batch.add(dep)
                    events_and_virtuals.add(dep)
        batch = next_batch

    # group jobs into rounds.  where all jobs of round i can be run in parallel
    groups = []
    while len(events_and_virtuals) > 0:
        group = []
        to_remove = []
        for event in events_and_virtuals:
            if all(dep in resolved for dep in get_deps(event)):
                # virtual events are resolved but never emitted as work
                if not schedule.isVirtual(event):
                    group.append(event)
                to_remove.append(event)
        if not to_remove:
            # nothing could be resolved this round: no further progress is possible
            sys.stderr.write("schedule deadlock:\n")
            for event in events_and_virtuals:
                sys.stderr.write("{} has deps {}\n".format(event, get_deps(event)))
            sys.exit(1)
        for tr in to_remove:
            resolved.add(tr)
            events_and_virtuals.remove(tr)
        groups.append(group)

    def halPath(event):
        if event == project.mcTree.getRootName():
            return options.outHal
        else:
            return os.path.join(options.outDir, event + '.hal')
    def cigarPath(event):
        return os.path.join(options.outDir, event + '.cigar')

    # alignment groups
    plan += '\n## Alignment\n'
    for i, group in enumerate(groups):
        plan += '\n### Round {}'.format(i)
        if options.toil:
            # advance toil phase
            # todo: recapitulate exact dependencies
            parent_job = parent_job.addFollowOn(Job())
        for event in sorted(group):
            plan += '\n'
            if options.wdl:
                plan += wdl_call_blast(options, project, event, cigarPath(event))
                plan += wdl_call_align(options, project, event, cigarPath(event), halPath(event), outSeqFile.pathMap[event])
            elif options.toil:
                # promises only get fulfilled if they are passed directly as arguments to the toil job, so we pull out the ones we need here
                leaf_deps, anc_deps = get_dep_names(options, project, event)
                fa_promises = [job_idx[("preprocess", dep)].rv() for dep in leaf_deps] + [job_idx[("align", dep)].rv(0) for dep in anc_deps]
                # NOTE(review): the blast job is given options.preprocessDisk; this may be
                # intentional or a copy/paste slip -- confirm whether a blast-specific disk option exists
                job_idx[("blast", event)] = parent_job.addChildJobFn(toil_call_blast,
                                                                     options,
                                                                     outSeqFile,
                                                                     project,
                                                                     event,
                                                                     cigarPath(event),
                                                                     leaf_deps + anc_deps,
                                                                     *fa_promises,
                                                                     cores=options.blastCores,
                                                                     memory=options.blastMemory,
                                                                     disk=options.preprocessDisk)
                job_idx[("align", event)] = job_idx[("blast", event)].addFollowOnJobFn(toil_call_align,
                                                                                       options, outSeqFile,
                                                                                       project,
                                                                                       event,
                                                                                       cigarPath(event),
                                                                                       halPath(event),
                                                                                       outSeqFile.pathMap[event],
                                                                                       job_idx[("blast", event)].rv(),
                                                                                       leaf_deps + anc_deps, *fa_promises,
                                                                                       cores=options.alignCores,
                                                                                       memory=options.alignMemory,
                                                                                       disk=options.alignDisk)
            else:
                # todo: support cactus interface (it's easy enough here, but cactus_progressive.py needs changes to handle)
                plan += 'cactus-blast {} {} {} --root {} {} {}\n'.format(
                    get_jobstore(options), options.outSeqFile, cigarPath(event), event,
                    options.cactusOptions, get_toil_resource_opts(options, 'blast'))
                plan += 'cactus-align {} {} {} {} --root {} {} {} --database {}\n'.format(
                    get_jobstore(options), options.outSeqFile, cigarPath(event), halPath(event), event,
                    options.cactusOptions, get_toil_resource_opts(options, 'align'), options.database)
                # todo: just output the fasta in cactus-align.
                plan += 'hal2fasta {} {} {} > {}\n'.format(halPath(event), event, options.halOptions, outSeqFile.pathMap[event])

    # advance toil phase
    if options.toil:
        parent_job = parent_job.addFollowOn(Job())

    # stitch together the final tree
    plan += '\n## HAL merging\n'
    root = project.mcTree.getRootName()
    prev_event = None
    append_count = 0
    event_list = []
    # walk the rounds last-to-first
    for group in reversed(groups):
        for event in group:
            if event != root:
                if options.wdl:
                    plan += wdl_call_hal_append(options, project, event, prev_event)
                elif not options.toil:
                    plan += 'halAppendSubtree {} {} {} {} --merge {}\n'.format(
                        halPath(root), halPath(event), event, event, options.halOptions)
                append_count += 1
                event_list.append(event)
            prev_event = event

    if options.toil:
        # a single toil job receives every non-root event and its align promise
        job_idx['hal_append'] = parent_job.addChildJobFn(toil_call_hal_append_subtrees,
                                                         options,
                                                         project,
                                                         root,
                                                         job_idx[('align', root)].rv(1),
                                                         event_list,
                                                         *[job_idx[('align', e)].rv(1) for e in event_list],
                                                         cores=1,
                                                         memory=options.alignMemory,
                                                         disk=options.halAppendDisk)

    if options.wdl:
        plan += wdl_workflow_end(options, prev_event, append_count > 1)

    if options.toil:
        start_time = timeit.default_timer()
        toil.start(start_job)
        end_time = timeit.default_timer()
        run_time = end_time - start_time
        logger.info("cactus-prepare-toil has finished after {} seconds".format(run_time))

    return plan
Ejemplo n.º 20
0
 def __init__(self, prepOptions, inChunkID):
     """Create a preemptable job for one chunk.

     ``prepOptions`` supplies memory and cores; ``inChunkID.size`` is
     used as the disk value.
     """
     chunkSize = inChunkID.size
     RoundedJob.__init__(
         self,
         memory=prepOptions.memory,
         cores=prepOptions.cpu,
         disk=chunkSize,
         preemptable=True,
     )
     self.prepOptions = prepOptions
     self.inChunkID = inChunkID
Ejemplo n.º 21
0
 def __init__(self, prepOptions, chunkIDList):
     """Create a preemptable job holding the options and the chunk ID list."""
     RoundedJob.__init__(self, preemptable=True)
     self.prepOptions, self.chunkIDList = prepOptions, chunkIDList
Ejemplo n.º 22
0
 def __init__(self, inputSequenceID, configNode):
     """Initialise as a preemptable job and remember both arguments."""
     RoundedJob.__init__(
         self,
         preemptable=True,
     )
     self.inputSequenceID, self.configNode = inputSequenceID, configNode
Ejemplo n.º 23
0
 def __init__(self, repeatMaskOptions, alignmentsID, queryID):
     """Create a preemptable job and store the three arguments as attributes."""
     RoundedJob.__init__(self, preemptable=True)
     self.repeatMaskOptions, self.alignmentsID, self.queryID = (
         repeatMaskOptions,
         alignmentsID,
         queryID,
     )
Ejemplo n.º 24
0
 def __init__(self, prepOptions, chunkIDList):
     """Initialise as a preemptable job and remember both arguments."""
     RoundedJob.__init__(
         self,
         preemptable=True,
     )
     self.prepOptions, self.chunkIDList = prepOptions, chunkIDList
Ejemplo n.º 25
0
 def __init__(self, repeatMaskOptions, queryID, targetIDs):
     """Create a preemptable job and store the three arguments as attributes."""
     RoundedJob.__init__(self, preemptable=True)
     self.repeatMaskOptions, self.queryID, self.targetIDs = (
         repeatMaskOptions,
         queryID,
         targetIDs,
     )
Ejemplo n.º 26
0
 def __init__(self, prepOptions, chunkIDList):
     """Create a preemptable job over a list of chunks.

     ``prepOptions`` supplies cores and memory; the disk value is twice
     the summed ``size`` of the entries in ``chunkIDList``.
     """
     totalChunkSize = sum(chunk.size for chunk in chunkIDList)
     disk = 2 * totalChunkSize
     RoundedJob.__init__(
         self,
         cores=prepOptions.cpu,
         memory=prepOptions.memory,
         disk=disk,
         preemptable=True,
     )
     self.prepOptions = prepOptions
     self.chunkIDList = chunkIDList
Ejemplo n.º 27
0
 def __init__(self, options, project, memory=None, cores=None):
     """Create a preemptable job holding ``options`` and ``project``.

     ``memory`` and ``cores`` are forwarded unchanged to
     ``RoundedJob.__init__``.
     """
     RoundedJob.__init__(
         self,
         memory=memory,
         cores=cores,
         preemptable=True,
     )
     self.options, self.project = options, project
Ejemplo n.º 28
0
 def __init__(self, fastaID, cutBefore, cutAfter):
     """Create a preemptable job for one FASTA file.

     The disk value is twice ``fastaID.size``; all three arguments are
     stored on the instance under the same name.
     """
     RoundedJob.__init__(self, disk=2 * fastaID.size, preemptable=True)
     self.fastaID, self.cutBefore, self.cutAfter = fastaID, cutBefore, cutAfter
Ejemplo n.º 29
0
 def __init__(self, inputSequenceIDs, configNode):
     """Create a preemptable job over several input sequences.

     The disk value is the summed ``size`` of every entry in
     ``inputSequenceIDs``.
     """
     totalSize = sum(seqID.size for seqID in inputSequenceIDs)
     RoundedJob.__init__(self, disk=totalSize, preemptable=True)
     self.inputSequenceIDs, self.configNode = inputSequenceIDs, configNode
Ejemplo n.º 30
0
 def __init__(self, options, project, memory=None, cores=None):
     """Initialise as a preemptable job and remember ``options`` and ``project``.

     The optional ``memory`` and ``cores`` values are handed straight to
     ``RoundedJob.__init__``.
     """
     RoundedJob.__init__(
         self,
         memory=memory,
         cores=cores,
         preemptable=True,
     )
     # Plain same-named storage of the constructor arguments.
     self.options = options
     self.project = project
Ejemplo n.º 31
0
 def __init__(self, prepXmlElems, inSequenceID, iteration=0):
     """Create a preemptable job and store the three arguments as attributes.

     The base-class initializer runs first, so nothing it sets can
     overwrite the attributes assigned here.

     ``iteration`` defaults to 0.
     """
     RoundedJob.__init__(self, preemptable=True)
     self.prepXmlElems = prepXmlElems
     self.inSequenceID = inSequenceID
     self.iteration = iteration
Ejemplo n.º 32
0
 def __init__(self, prepXmlElems, inSequenceID, iteration=0):
     """Initialise as a preemptable job and remember the three arguments.

     ``RoundedJob.__init__`` is called before any attribute is assigned,
     so the base class cannot overwrite the values stored here.

     ``iteration`` defaults to 0.
     """
     RoundedJob.__init__(self, preemptable=True)
     self.prepXmlElems = prepXmlElems
     self.inSequenceID = inSequenceID
     self.iteration = iteration