def test_mr_map(self):
    """mr_map over tab-separated rows should emit only the first field of each."""
    rows = ["foo\tbar\tbar1", "baz\tbad\tbad1"]
    source = StringIO("\n".join(rows))
    sink = StringIO()
    # The mapper receives each row as a field list; keep just the leading field.
    mr_map(lambda fields: [fields[:1]], fd=source, out=sink)
    self.assertEqual(sink.getvalue(), "foo\nbaz\n")
def test_mr_map(self):
    """mr_map over tab-separated rows should emit only the first field of each."""
    source = StringIO(
        "\n".join(
            [
                "foo\tbar\tbar1",
                "baz\tbad\tbad1",
            ]
        )
    )
    sink = StringIO()
    # The mapper receives each row as a field list; keep just the leading field.
    mr_map(lambda fields: [fields[:1]], fd=source, out=sink)
    self.assertEqual(sink.getvalue(), "foo\nbaz\n")
def mr_map_parallel(
    processor,
    fd=STDIN,
    workers=NCPUS,
    chunk_size=1000,
    out=STDOUT,
):
    """Map in parallel.

    `processor` must be an instance of Mapper and promise that it is
    safe to execute in a fork()d process. Also note that we f**k
    up the result ordering, but relying on result ordering breaks
    the mapreduce contract anyway. Note also that like many of the
    mr_tools functions, we break on newlines in the emitted output

    :param processor: an multiprocessing-safe instance of
        :py:class:`Mapper`
    :param int workers: Number of concurrent workers.
    :param int chunk_size: job size per worker
    :param file fd: Input data stream (default is stdin)
    :param file out: Output data stream (default is stdout)
    :type processor: :py:class:`Mapper`
    """
    # Single-worker case needs no pool: fall through to the serial mapper.
    if workers == 1:
        return mr_map(processor, fd=fd, out=out)
    pool = multiprocessing.Pool(workers)
    # imap_unordered trades result ordering for throughput; each call to
    # `processor` may yield several records, all of which get emitted.
    for batch in pool.imap_unordered(processor, fd, chunk_size):
        for record in batch:
            emit(record, out=out)
def mr_map_parallel(processor, fd=stdin, workers=multiprocessing.cpu_count(), chunk_size=1000):
    """Map in parallel over the lines of `fd`, emitting each result.

    `processor` must be an instance of Mapper and promise that it is
    safe to execute in a fork()d process. Also note that we f**k
    up the result ordering, but relying on result ordering breaks
    the mapreduce contract anyway. Note also that like many of the
    mr_tools functions, we break on newlines in the emitted output.

    :param processor: a multiprocessing-safe instance of Mapper
    :param file fd: input data stream (default is stdin)
    :param int workers: number of concurrent workers
    :param int chunk_size: job size per worker
    """
    if workers == 1:
        # BUG FIX: this branch previously called mr_map(process, fd=fd),
        # but no name `process` exists — the parameter is `processor`.
        return mr_map(processor, fd=fd)
    pool = multiprocessing.Pool(workers)
    # Unordered iteration keeps workers saturated; ordering is not part
    # of the mapreduce contract anyway.
    for res in pool.imap_unordered(processor, fd, chunk_size):
        for subres in res:
            emit(subres)
def mr_map_parallel(processor, fd=stdin, workers=multiprocessing.cpu_count(), chunk_size=1000):
    """Map in parallel over the lines of `fd`, emitting each result.

    `processor` must be an instance of Mapper and promise that it is
    safe to execute in a fork()d process. Also note that we f**k
    up the result ordering, but relying on result ordering breaks
    the mapreduce contract anyway. Note also that like many of the
    mr_tools functions, we break on newlines in the emitted output.

    :param processor: a multiprocessing-safe instance of Mapper
    :param file fd: input data stream (default is stdin)
    :param int workers: number of concurrent workers
    :param int chunk_size: job size per worker
    """
    if workers == 1:
        # BUG FIX: this branch previously called mr_map(process, fd=fd),
        # but no name `process` exists — the parameter is `processor`.
        return mr_map(processor, fd=fd)
    pool = multiprocessing.Pool(workers)
    # Unordered iteration keeps workers saturated; ordering is not part
    # of the mapreduce contract anyway.
    for res in pool.imap_unordered(processor, fd, chunk_size):
        for subres in res:
            emit(subres)