Example #1
def on_running(self):
    """
    Log memory consumption as the computation goes on; this only works
    when the environment variable OQ_NO_DISTRIBUTE is set, since it
    is intended for debugging purposes.
    """
    if no_distribute():
        logs.LOG.warn('PyMem: %d mb, PgMem: %d mb' % self.mem_peaks)
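
The example gates its logging on no_distribute(), which is not shown on this page. Below is a minimal sketch, assuming the helper simply checks the OQ_NO_DISTRIBUTE environment variable mentioned in the docstrings; the exact parsing in OpenQuake may differ.

import os

def no_distribute():
    # Hypothetical sketch: report whether OQ_NO_DISTRIBUTE is set to a
    # truthy value. The real OpenQuake helper may parse the variable
    # differently.
    return os.environ.get('OQ_NO_DISTRIBUTE', '').lower() in ('1', 'true', 'yes')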
Example #2
def map_reduce(task, task_args, agg, acc):
    """
    Given a task and an iterable of positional arguments, apply the
    task function to the arguments in parallel and return an aggregate
    result depending on the initial value of the accumulator
    and on the aggregation function. To save memory, the order is
    not preserved and there is no list of intermediate results:
    the accumulator is updated as soon as a task result arrives.

    NB: if the environment variable OQ_NO_DISTRIBUTE is set the
    tasks are run sequentially in the current process and then
    map_reduce(task, task_args, agg, acc) is the same as
    reduce(agg, itertools.starmap(task, task_args), acc).
    Users of map_reduce should be aware that when thousands of
    tasks are spawned and large arguments are passed or large
    results are returned, they may run into memory issues: this is
    why the calculators limit the queue with the `concurrent_task`
    concept.

    :param task: a `celery` task callable.
    :param task_args: an iterable over positional arguments
    :param agg: the aggregation function, (acc, val) -> new acc
    :param acc: the initial value of the accumulator
    :returns: the final value of the accumulator
    """
    if no_distribute():
        for the_args in task_args:
            result, exctype = safely_call(task.task_func, the_args)
            if exctype:
                raise RuntimeError(result)
            acc = agg(acc, result)
    else:
        backend = current_app().backend
        unpik = 0
        job_id = task_args[0][0]
        taskname = task.__name__
        mon = LightMonitor("unpickling %s" % taskname, job_id, task)
        to_send = 0
        pickled_args = []
        for args in task_args:
            piks = pickle_sequence(args)
            pickled_args.append(piks)
            to_send += sum(len(p) for p in piks)
        logs.LOG.info("Sending %dM", to_send / ONE_MB)
        taskset = TaskSet(tasks=map(task.subtask, pickled_args))
        for task_id, result_dict in taskset.apply_async().iter_native():
            check_mem_usage()  # log a warning if too much memory is used
            result_pik = result_dict["result"]
            with mon:
                result, exctype = result_pik.unpickle()
            if exctype:
                raise RuntimeError(result)
            unpik += len(result_pik)
            acc = agg(acc, result)
            del backend._cache[task_id]  # work around a celery bug
        logs.LOG.info("Unpickled %dM of received data in %s seconds", unpik / ONE_MB, mon.duration)
    return acc
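
The docstring states that with OQ_NO_DISTRIBUTE set, map_reduce(task, task_args, agg, acc) is equivalent to reduce(agg, itertools.starmap(task, task_args), acc). A hypothetical usage sketch of that equivalence, with add_squares standing in for a real celery task:

import itertools
import operator
from functools import reduce

def add_squares(x, y):
    # toy stand-in for a celery task callable
    return x ** 2 + y ** 2

task_args = [(1, 2), (3, 4), (5, 6)]
expected = reduce(operator.add, itertools.starmap(add_squares, task_args), 0)  # 91
# map_reduce(add_squares, task_args, operator.add, 0) would return the same
# value when the tasks are run sequentially in the current process.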
Example #3
def _map_reduce(task_func, task_args, agg, acc):
    """
    Given a callable and an iterable of positional arguments, apply the
    callable to the arguments in parallel and return an aggregate
    result depending on the initial value of the accumulator
    and on the aggregation function. To save memory, the order is
    not preserved and there is no list of intermediate results:
    the accumulator is updated as soon as a task result arrives.

    :param task_func: a `celery` task callable.
    :param task_args: an iterable over positional arguments
    :param agg: the aggregation function, (acc, val) -> new acc
    :param acc: the initial value of the accumulator
    :returns: the final value of the accumulator

    NB: if the environment variable OQ_NO_DISTRIBUTE is set the
    tasks are run sequentially in the current process and then
    map_reduce(task_func, task_args, agg, acc) is the same as
    reduce(agg, itertools.starmap(task_func, task_args), acc).
    Users of map_reduce should be aware that when thousands of
    tasks are spawned and large arguments are passed or large
    results are returned, they may run into memory issues: this is
    why the calculators limit the queue with the `concurrent_task`
    concept.
    """
    if no_distribute():
        for the_args in task_args:
            acc = agg(acc, task_func(*the_args))
    else:
        taskset = TaskSet(tasks=map(task_func.subtask, task_args))
        for result in taskset.apply_async():
            if isinstance(result, Exception):
                # TODO: kill all the other tasks
                raise result
            acc = agg(acc, result)
    return acc
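
The else branch distributes the work through a celery TaskSet and folds each result into the accumulator as soon as it arrives, without keeping an intermediate list. The same aggregation scheme can be illustrated with the standard library alone; this is not OpenQuake code, just a sketch of the pattern:

from concurrent.futures import ProcessPoolExecutor, as_completed

def map_reduce_local(task_func, task_args, agg, acc):
    # Sketch of the "aggregate as results arrive" scheme: completion
    # order is not preserved and no list of intermediate results is kept.
    with ProcessPoolExecutor() as pool:
        futures = [pool.submit(task_func, *args) for args in task_args]
        for future in as_completed(futures):
            acc = agg(acc, future.result())
    return acc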