def dispatch_submission(self, raw_msg):
        """Dispatch job submission to appropriate handlers."""
        # ensure targets up to date:
        self.notifier_stream.flush()
        try:
            idents, msg = self.session.feed_identities(raw_msg, copy=False)
            msg = self.session.deserialize(msg, content=False, copy=False)
        except Exception:
            self.log.error("task::Invaid task msg: %r" % raw_msg,
                           exc_info=True)
            return

        # send to monitor
        self.mon_stream.send_multipart([b'intask'] + raw_msg, copy=False)

        header = msg['header']
        md = msg['metadata']
        msg_id = header['msg_id']
        self.all_ids.add(msg_id)

        # get targets as a set of bytes objects
        # from a list of unicode objects
        targets = md.get('targets', [])
        targets = set(map(cast_bytes, targets))

        retries = md.get('retries', 0)
        self.retries[msg_id] = retries

        # time dependencies
        after = md.get('after', None)
        if after:
            after = Dependency(after)
            if after.all:
                if after.success:
                    after = Dependency(
                        after.difference(self.all_completed),
                        success=after.success,
                        failure=after.failure,
                        all=after.all,
                    )
                if after.failure:
                    after = Dependency(
                        after.difference(self.all_failed),
                        success=after.success,
                        failure=after.failure,
                        all=after.all,
                    )
            if after.check(self.all_completed, self.all_failed):
                # recast as empty set, if `after` already met,
                # to prevent unnecessary set comparisons
                after = MET
        else:
            after = MET

        # location dependencies
        follow = Dependency(md.get('follow', []))

        timeout = md.get('timeout', None)
        if timeout:
            timeout = float(timeout)

        job = Job(
            msg_id=msg_id,
            raw_msg=raw_msg,
            idents=idents,
            msg=msg,
            header=header,
            targets=targets,
            after=after,
            follow=follow,
            timeout=timeout,
            metadata=md,
        )
        # validate and reduce dependencies:
        for dep in after, follow:
            if not dep:  # empty dependency
                continue
            # check valid:
            if msg_id in dep or dep.difference(self.all_ids):
                self.queue_map[msg_id] = job
                return self.fail_unreachable(msg_id, error.InvalidDependency)
            # check if unreachable:
            if dep.unreachable(self.all_completed, self.all_failed):
                self.queue_map[msg_id] = job
                return self.fail_unreachable(msg_id)

        if after.check(self.all_completed, self.all_failed):
            # time deps already met, try to run
            if not self.maybe_run(job):
                # can't run yet
                if msg_id not in self.all_failed:
                    # could have failed as unreachable
                    self.save_unmet(job)
        else:
            self.save_unmet(job)
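

# A minimal sketch (not part of the scheduler source) of the client-side
# submission that produces the metadata this method consumes: 'after',
# 'retries', and 'timeout' all arrive as view flags. `rc` and `view` are
# illustrative names, and a running ipyparallel cluster is assumed.
from os import getpid

import ipyparallel as ipp

rc = ipp.Client()
view = rc.load_balanced_view()
first = view.apply_async(getpid)                 # an ordinary task
with view.temp_flags(after=first, retries=2, timeout=30.0):
    second = view.apply_async(getpid)            # runs only after `first`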


# ---------------------------------------------------------------------
# Example 2
# ---------------------------------------------------------------------

from os import getpid

from ipyparallel import Client, Dependency

# `Client()` assumes a running cluster; the original snippet used `client`
# and `view` without defining them.
client = Client()
view = client.load_balanced_view()


# succeed after some time (referenced below but missing from the snippet;
# the engine-side import keeps the function self-contained when shipped):
def wait(t):
    import time

    time.sleep(t)
    return t


# fail after some time:
def wait_and_fail(t):
    import time

    time.sleep(t)
    return 1 / 0


successes = [view.apply_async(wait, 1).msg_ids[0] for i in range(len(client.ids))]
failures = [
    view.apply_async(wait_and_fail, 1).msg_ids[0] for i in range(len(client.ids))
]

mixed = [failures[0], successes[0]]
d1a = Dependency(mixed, all=False, failure=True)  # yes
d1b = Dependency(mixed, all=False)  # yes
d2a = Dependency(mixed, all=True, failure=True)  # yes after / no follow
d2b = Dependency(mixed, all=True)  # no
d3 = Dependency(failures, all=False)  # no
d4 = Dependency(failures, all=False, failure=True)  # yes
d5 = Dependency(failures, all=True, failure=True)  # yes after / no follow
d6 = Dependency(successes, all=True, failure=True)  # yes after / no follow
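
# Flag semantics: all=True requires every listed msg_id to finish, all=False
# any one of them; success/failure select which final states satisfy the
# dependency (defaults are success=True, failure=False).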

view.block = False
flags = view.temp_flags
with flags(after=d1a):
    r1a = view.apply(getpid)
with flags(follow=d1b):
    r1b = view.apply(getpid)
with flags(after=d2b, follow=d2a):
    # the snippet is truncated here; body reconstructed from the
    # r1a/r1b pattern above
    r2a = view.apply(getpid)


# ---------------------------------------------------------------------
# Example 3
# ---------------------------------------------------------------------

def leastload(loads):
    """Always choose the lowest load.

    If the lowest load occurs more than once, the first
    occurrence will be used.  If loads has LRU ordering, this means
    the LRU of those with the lowest load is chosen.
    """
    return loads.index(min(loads))
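
# For instance (illustrative values only):
#     leastload([2, 0, 3, 0])  # -> 1, the index of the first minimum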


# ---------------------------------------------------------------------
# Classes
# ---------------------------------------------------------------------

# store empty default dependency:
MET = Dependency([])
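
# Dependency subclasses set, so MET is empty and falsy: dispatch_submission
# can skip it with `if not dep`, and MET.check(...) is trivially satisfied.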


class Job(object):
    """Simple container for a job"""
    def __init__(
        self,
        msg_id,
        raw_msg,
        idents,
        msg,
        header,
        metadata,
        targets,
        after,
        follow,
        timeout,
    ):
        # truncated in the original; the remaining parameter and attribute
        # assignments mirror the Job(...) call site in Example 1
        self.msg_id = msg_id
        self.raw_msg = raw_msg
        self.idents = idents
        self.msg = msg
        self.header = header
        self.metadata = metadata
        self.targets = targets
        self.after = after
        self.follow = follow
        self.timeout = timeout


# ---------------------------------------------------------------------
# Example 4
# ---------------------------------------------------------------------

def apply_jobs(data, samples, ipyclient, noreverse, force, preview):
    """ Pass the samples to N engines to execute run_full on each.

    :param data: An Assembly object
    :param samples: one or more samples selected from data
    :param ipyclient: ipyparallel load_balanced_view client
    :param noreverse: toggle revcomp clustering despite datatype default
    :param force: force
    :param preview: run preview

    :returns: None
    """
    ## make directories
    setup_dirs(data)

    ## Create threaded_view of engines by grouping only ids that are threaded
    hostdict = get_threaded_view(ipyclient)
    threaded_views = {}
    for key, val in hostdict.items():
        ## e.g., client.load_balanced_view([1,3])
        threaded_views[key] = ipyclient.load_balanced_view(val)

    ## A single load-balanced view used for the remaining steps
    lbview = ipyclient.load_balanced_view()

    ## If doing reference sequence mapping in any fashion then init
    ## the samples. This is a very fast operation.
    if "reference" in data.paramsdict["assembly_method"]:
        samples = [refmap_init([data, s]) for s in samples]


    ## FUNC 1: derep ---------------------------------------------------
    res_derep = {}
    done_derep = 0
    for sample in samples:
        args = [data, sample]
        res_derep[sample] = lbview.apply(derep_concat_split, args)


    ## FUNC 3: mapreads ----------------------------------------------------
    ## Submit samples to reference map else null
    res_ref = {}
    mcfunc = null_func
    done_ref = 1
    if "reference" in data.paramsdict["assembly_method"]:
        done_ref = 0
        mcfunc = mapreads
    for sample in samples:
        ## Create a dependency for each sample. To run, the result from the
        ## previous step must not have failed.
        check_deps = Dependency(res_derep[sample], failure=False, success=True)
        args = [data, sample, noreverse, 1]
        with lbview.temp_flags(after=check_deps):
            res_ref[sample] = lbview.apply(mcfunc, args)
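
    ## The chaining pattern above, in isolation (illustrative names only):
    ##     prev = lbview.apply(step_one, args)
    ##     dep = Dependency(prev, success=True, failure=False)
    ##     with lbview.temp_flags(after=dep):
    ##         nxt = lbview.apply(step_two, args)
    ## If step_one fails, `nxt` is unreachable and the scheduler fails it too.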
    ## test just up to this point [comment this out when done]
    #[res_ref[i].get() for i in res_ref]
    #for i in res_ref:
    #    print(res_ref[i].metadata.status)
    #return 1


    ## FUNC 4: clustering ---------------------------------------------------
    ## Cluster reads for all assembly methods except 'reference' since
    ## the refmapping clusters them for us and we don't care about the reads
    ## that don't map.
    mcfunc = null_func
    done_clust = 1
    if data.paramsdict["assembly_method"] != "reference":
        done_clust = 0
        mcfunc = clust_and_build
    ## require that the sample successfully finished previous step
    res_clust = {}
    for sample in samples:
        check_deps = Dependency(res_ref[sample], failure=False, success=True)
        args = [data, sample, noreverse, 1] 
        with lbview.temp_flags(after=check_deps):
            res_clust[sample] = lbview.apply(mcfunc, args)
    ## test just up to this point [comment this out when done]
    #[res_clust[i].get() for i in res_clust]
    #for i in res_clust:
    #    print(res_clust[i].metadata.status)
    #return 1


    ## FUNC 5: reference cleanup -------------------------------------------
    ## Pull in alignments from mapped bam files and write them to the clust.gz 
    ## to fold them back into the pipeline. If we are doing "denovo" then 
    ## don't call this; less obviously, "denovo-reference" intentionally
    ## doesn't call this to effectively discard reference mapped reads.
    mcfunc = null_func
    if data.paramsdict["assembly_method"] in ["reference", "denovo+reference"]: 
        mcfunc = ref_muscle_chunker
    ## requires sample to have finished the previous step before running
    res_clean = {}
    for sample in samples:
        check_deps = Dependency(res_clust[sample], failure=False, success=True)
        with lbview.temp_flags(after=check_deps):
            res_clean[sample] = lbview.apply(mcfunc, [data, sample])
    ## test just up to this point [comment this out when done]
    #[res_clean[i].get() for i in res_clean]
    #return 1


    ## FUNC 6: split up clusters into chunks -------------------------------
    #mcfunc = null_func
    #if data.paramsdict["assembly_method"] in ["denovo", "denovo+reference"]: 
    mcfunc = muscle_chunker
    res_chunk = {}
    tmpdir = os.path.join(data.dirs.project, data.name+'-tmpalign')    
    for sample in samples:
        check_deps = Dependency(res_clean[sample], failure=False, success=True)
        with lbview.temp_flags(after=check_deps):
            args = [data, sample, tmpdir]
            res_chunk[sample] = lbview.apply(mcfunc, args)
    ## test just up to this point [comment this out when done]
    #[res_chunk[i].get() for i in res_chunk]
    #return 1


    ## FUNC 7: align chunks -------------------------------------------------
    res_align = {sample:[] for sample in samples}
    for sample in samples:
        ## get all chunks for this sample
        check_deps = Dependency(res_chunk[sample], failure=False, success=True)
        with lbview.temp_flags(after=check_deps):
            for i in range(10):
                chunk = os.path.join(tmpdir, sample.name+"_chunk_{}.ali".format(i))
                res_align[sample].append(lbview.apply(muscle_align, [data, chunk]))
    ## test just up to this point [comment this out when done]
    align_asyncs = list(itertools.chain(*res_align.values()))
    #[i.get() for i in align_asyncs]

    ## FUNC 8: concat chunks -------------------------------------------------
    res_concat = {}
    for sample in samples:
        #LOGGER.info('resalign[sample] %s', res_align[sample])
        tmpids = list(itertools.chain(*[i.msg_ids for i in res_align[sample]]))
        check_deps = Dependency(tmpids, failure=False, success=True)
        #LOGGER.info('tmpids %s', tmpids)
        with lbview.temp_flags(after=check_deps):
            res_concat[sample] = lbview.apply(reconcat, [data, sample])
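
    ## Note: `tmpids` flattens the msg_ids of all ten chunk alignments, so
    ## each reconcat job waits for every chunk of its own sample to finish.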
    ## test just up to this point [comment this out when done]
    #[res_concat[i].get() for i in res_concat]


    ## wait func
    tmpids = list(itertools.chain(*[i.msg_ids for i in res_concat.values()]))
    with lbview.temp_flags(after=tmpids):
        res = lbview.apply(time.sleep, 0.1)    
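
    ## `res` acts as a barrier: it depends on every reconcat job, so
    ## res.ready() in the loop below flips to True only once the entire
    ## pipeline has finished.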

    ## print progress bars
    all_derep = len(res_derep)
    clusttotal = len(res_clust)
    all_refmap = len(res_ref)
    all_aligns = list(itertools.chain(*res_align.values()))
    aligntotal = len(all_aligns)

    while 1:
        if not res.ready():
            if not done_derep:
                ## prints a progress bar
                fderep = sum([res_derep[i].ready() for i in res_derep])
                elapsed = datetime.timedelta(seconds=int(res.elapsed))
                progressbar(all_derep, fderep,
                            " dereplicating     | {}".format(elapsed))
                ## go to next print row when done
                if fderep == all_derep:
                    done_derep = 1
                    print("")
                    try:
                        failed_samples = check_results(res_derep)
                    except IPyradError as inst:
                        print("All samples failed dereplicating. - {}".format(inst))
                        raise

            elif not done_ref:
                ## prints a progress bar
                fref = sum([res_ref[i].ready() for i in res_ref])
                elapsed = datetime.timedelta(seconds=int(res.elapsed))                
                progressbar(all_refmap, fref, 
                            " mapping reads     | {}".format(elapsed))
                ## go to next print row when done
                if fref == all_refmap:
                    done_ref = 1
                    print("")
                
                    ## When all refmap results are done check them.
                    ## If all samples failed this step check_results raises
                    ## an IPyradError, otherwise it returns a dict of failed
                    ## samples and error messages
                    try:
                        failed_samples = check_results(res_ref)
                    except IPyradError as inst:
                        print("All samples failed read mapping - {}".format(inst))
                        raise

            elif not done_clust:
                ## prints a progress bar
                fclust = sum([res_clust[i].ready() for i in res_clust])
                elapsed = datetime.timedelta(seconds=int(res.elapsed))
                progressbar(clusttotal, fclust, 
                            " clustering reads  | {}".format(elapsed))
                ## go to next print row when done
                if fclust == clusttotal:
                    done_clust = 1
                    print("")
                    try:
                        failed_samples = check_results(res_clust)
                    except IPyradError as inst:
                        print("All samples failed clustering - {}".format(inst))
                        raise

            else:
                falign = sum([i.ready() for i in all_aligns])
                elapsed = datetime.timedelta(seconds=int(res.elapsed))
                progressbar(aligntotal, falign, 
                    " aligning clusters | {}".format(elapsed))
            sys.stdout.flush()
            time.sleep(1)

        else:
            ## print final progress bar
            elapsed = datetime.timedelta(seconds=int(res.elapsed))                            
            progressbar(20, 20,
                " aligning clusters | {}".format(elapsed))
            if data._headers:
                print("")

            try:
                failed_samples = check_results_alignment(res_align)
                print("Samples failed the aligning step: {}".format(failed_samples))
            except IPyradError as inst:
                print("All samples failed aligning - {}".format(inst))
                raise IPyradError("Failed during alignment.")

            ## store returned badalign values from muscle_align
            for sample in samples:
                ## Helpful debug for seeing which samples had bad alignments
                #for i in res_align[sample]:
                #    print(sample, i.get())
                badaligns = sum([i.get() for i in res_align[sample]])
                sample.stats_dfs.s3.filtered_bad_align = badaligns
            break


    ## Cleanup -------------------------------------------------------
    for sample in samples:
        sample_cleanup(data, sample)

    data_cleanup(data)