Example #1
0
def mr_map_parallel(
    processor,
    fd=STDIN,
    workers=NCPUS,
    chunk_size=1000,
    out=STDOUT,
):
    """Run a Mapper over an input stream with a pool of worker processes.

    `processor` must be a :py:class:`Mapper` instance that promises it is
    safe to execute in a fork()d child process.  Result ordering is not
    preserved — but relying on result ordering breaks the mapreduce
    contract anyway.  As with many of the mr_tools functions, we break on
    newlines in the emitted output.

    :param processor: a multiprocessing-safe instance of :py:class:`Mapper`
    :param int workers: Number of concurrent workers.
    :param int chunk_size: job size per worker
    :param file fd: Input data stream (default is stdin)
    :param file out: Output data stream (default is stdout)
    :type processor: :py:class:`Mapper`
    """
    # With a single worker there is nothing to parallelize; run serially.
    if workers == 1:
        return mr_map(processor, fd=fd, out=out)

    pool = multiprocessing.Pool(workers)

    # imap_unordered yields each worker's result as soon as it finishes,
    # which is where the (permitted) loss of ordering comes from.
    for result in pool.imap_unordered(processor, fd, chunk_size):
        for item in result:
            emit(item, out=out)
Example #2
0
def mr_map_parallel(
    processor,
    fd=STDIN,
    workers=NCPUS,
    chunk_size=1000,
    out=STDOUT,
):
    """Apply a Mapper to an input stream across multiple processes.

    `processor` must be an instance of Mapper and promise that it is safe
    to execute in a fork()d process.  Note that the result ordering is
    scrambled here; relying on result ordering breaks the mapreduce
    contract anyway.  Like many of the mr_tools functions, this breaks on
    newlines in the emitted output.

    :param processor: an multiprocessing-safe instance of :py:class:`Mapper`
    :param int workers: Number of concurrent workers.
    :param int chunk_size: job size per worker
    :param file fd: Input data stream (default is stdin)
    :param file out: Output data stream (default is stdout)
    :type processor: :py:class:`Mapper`
    """
    # Serial fast path: no pool overhead when only one worker is requested.
    if workers == 1:
        return mr_map(processor, fd=fd, out=out)

    pool = multiprocessing.Pool(workers)

    # Emit each batch's values as workers complete, in arbitrary order.
    for batch in pool.imap_unordered(processor, fd, chunk_size):
        for emitted in batch:
            emit(emitted, out=out)
Example #3
0
def mr_map_parallel(processor, fd=stdin, workers=multiprocessing.cpu_count(), chunk_size=1000):
    """Map `processor` over the input stream `fd` in parallel.

    `processor` must be an instance of Mapper and promise that it is
    safe to execute in a fork()d process.  Also note that we scramble
    the result ordering, but relying on result ordering breaks the
    mapreduce contract anyway.  Note also that like many of the
    mr_tools functions, we break on newlines in the emitted output.

    :param processor: a multiprocessing-safe instance of Mapper
    :param file fd: input data stream (default is stdin)
    :param int workers: number of concurrent worker processes
    :param int chunk_size: job size handed to each worker
    """
    if workers == 1:
        # Bug fix: this previously called mr_map(process, ...), but no
        # name `process` exists — the parameter is `processor`, so the
        # serial path raised NameError.
        return mr_map(processor, fd=fd)

    pool = multiprocessing.Pool(workers)

    for res in pool.imap_unordered(processor, fd, chunk_size):
        for subres in res:
            emit(subres)
Example #4
0
def mr_map_parallel(processor, fd=stdin,
                    workers=multiprocessing.cpu_count(),
                    chunk_size=1000):
    """Map `processor` over the input stream `fd` in parallel.

    `processor` must be an instance of Mapper and promise that it is
    safe to execute in a fork()d process.  Also note that we scramble
    the result ordering, but relying on result ordering breaks the
    mapreduce contract anyway.  Note also that like many of the
    mr_tools functions, we break on newlines in the emitted output.

    :param processor: a multiprocessing-safe instance of Mapper
    :param file fd: input data stream (default is stdin)
    :param int workers: number of concurrent worker processes
    :param int chunk_size: job size handed to each worker
    """
    if workers == 1:
        # Bug fix: this previously called mr_map(process, ...), but no
        # name `process` exists — the parameter is `processor`, so the
        # serial path raised NameError.  Also normalized PEP 8 spacing
        # in the keyword defaults (no spaces around `=`).
        return mr_map(processor, fd=fd)

    pool = multiprocessing.Pool(workers)

    for res in pool.imap_unordered(processor, fd, chunk_size):
        for subres in res:
            emit(subres)