Esempio n. 1
0
def result_iterator(results,
                    notifier=None,
                    reader=func.chain_reader,
                    input_stream=(func.map_input_stream, ),
                    params=None,
                    ddfs=None,
                    tempdir=None):
    """
    Iterates the key-value pairs in job results. *results* is a list of
    results, as returned by :meth:`Disco.wait`.

    This is a generator: nothing is read until iteration starts.

    :param notifier: a function called when the iterator moves to the
                     next result file::

                      def notifier(url):
                          ...

                     *url* may be a list if results are replicated.

    :param reader: a custom reader function.
                   Specify this to match with a custom *map_writer* or *reduce_writer*.
                   By default, *reader* is ``func.chain_reader``. If *reader*
                   is false (e.g. ``None``), no reader is appended to the
                   input stream.

    :param input_stream: a sequence of input stream functions. It is copied
                         into a list and, when *reader* is given, the wrapped
                         reader is appended to the end of that list.

    :param params: assigned to the ``params`` attribute of the task object
                   that is used to open the inputs.

    :param ddfs: passed through to ``util.urllist`` when expanding each result
                 into urls.

    :param tempdir: if results are replicated, *result_iterator* ensures that only
                    valid replicas are used. By default, this is done by downloading
                    and parsing results first to a temporary file. If the temporary
                    file was created successfully, the results are returned,
                    otherwise an alternative replica is used.

                    If *tempdir=None* (default), the system default temporary
                    directory is used (typically ``/tmp``). An alternative path
                    can be set with *tempdir="path"*. Temporary files can be disabled
                    with *tempdir=False*, in which case results are read in memory.
    """
    from disco.task import Task
    task = Task()
    task.params = params
    # Copy so that the caller's sequence (a tuple by default) is never mutated.
    task.input_stream = list(input_stream)
    if reader:
        task.input_stream.append(func.reader_wrapper(reader))
    task.insert_globals(task.input_stream)
    for result in results:
        for url in util.urllist(result, ddfs=ddfs):
            if notifier:
                notifier(url)
            # A list of urls means the result is replicated; isinstance also
            # accepts list subclasses, unlike an exact type comparison.
            if isinstance(url, list):
                entries = process_url_safe(url, tempdir, task)
            else:
                # connect_input returns a 3-tuple; only the iterator is needed.
                entries, _size, _url = task.connect_input(url)
            for entry in entries:
                yield entry
Esempio n. 2
0
def init(mode, host, master, job_name, id, inputs):
    """
    Set up the module-level ``Task`` environment.

    Builds a ``TaskEnvironment`` from the given arguments and binds it to the
    global name ``Task``, makes sure the directory containing the task's
    out-of-band files exists, and finally changes the current working
    directory to the task's ``'CHDIR_PATH'`` path.
    """
    global Task
    Task = TaskEnvironment(mode, host, master, job_name, id, inputs)

    # oob_file('') names a file in the OOB directory; its dirname is that directory.
    oob_directory = os.path.dirname(Task.oob_file(''))
    ensure_path(oob_directory)

    working_directory = Task.path('CHDIR_PATH')
    os.chdir(working_directory)
Esempio n. 3
0
 def get_task(cls):
     """
     Request the task description with ``cls.send('TASK')`` and build a
     :class:`disco.task.Task` from it.

     Every key of the returned mapping is converted with ``str`` so that the
     items can be passed to ``Task`` as keyword arguments.
     """
     from disco.task import Task
     task_kwargs = {}
     for key, value in cls.send('TASK').items():
         task_kwargs[str(key)] = value
     return Task(**task_kwargs)