def test_uniq_combiner(self):
    """A shell command ('uniq') can serve as the combiner of a step."""
    # put data in a .gz to force a single map task
    x_gz_path = join(self.tmp_dir, 'data.gz')
    with gzip.open(x_gz_path, 'wb') as x_gz:
        x_gz.write(b'x\nx\nx\nx\nx\nx\n')

    job = MRCmdJob(['--combiner-cmd=uniq', '--runner=local', x_gz_path])
    job.sandbox()

    with job.make_runner() as r:
        # mapper falls back to 'script' since only a combiner cmd was given
        self.assertEqual(r._get_steps(), [{
            'type': 'streaming',
            'mapper': {
                'type': 'script',
            },
            'combiner': {
                'type': 'command',
                'command': 'uniq'
            }
        }])

        r.run()

        # there is only one map task, thus only one combiner,
        # thus there should only be one value
        self.assertEqual(b''.join(r.cat_output()), b'x\n')
def test_multiple(self):
    """Run a step whose mapper, combiner, and reducer are all commands."""
    stdin_data = b'x\nx\nx\nx\nx\nx\n'
    mapper_cmd = 'cat -e'
    reducer_cmd = _bash_wrap('wc -l | tr -Cd "[:digit:]"')

    job = MRCmdJob([
        '--runner', 'local',
        '--mapper-cmd', mapper_cmd,
        '--combiner-cmd', 'uniq',
        '--reducer-cmd', reducer_cmd,
    ])
    job.sandbox(stdin=BytesIO(stdin_data))

    with job.make_runner() as runner:
        expected_steps = [{
            'type': 'streaming',
            'mapper': {'type': 'command', 'command': mapper_cmd},
            'combiner': {'type': 'command', 'command': 'uniq'},
            'reducer': {'type': 'command', 'command': reducer_cmd},
        }]
        self.assertEqual(runner._get_steps(), expected_steps)

        runner.run()

        self.assertEqual(list(runner.stream_output()), [b'2'])
def test_multiple(self):
    """Command mapper/combiner/reducer all together on a single map task."""
    # put data in a .gz to force a single map task
    gz_path = join(self.tmp_dir, 'data.gz')
    with gzip.open(gz_path, 'wb') as gz_file:
        gz_file.write(b'x\nx\nx\nx\nx\nx\n')

    reducer_cmd = '/bin/sh -c \'wc -l | tr -Cd "[:digit:]"\''

    job = MRCmdJob([
        '--runner', 'local',
        '--mapper-cmd', 'cat -e',
        '--combiner-cmd', 'uniq',
        '--reducer-cmd', reducer_cmd,
        gz_path])
    job.sandbox()

    with job.make_runner() as runner:
        expected_step = {
            'type': 'streaming',
            'mapper': {'type': 'command', 'command': 'cat -e'},
            'combiner': {'type': 'command', 'command': 'uniq'},
            'reducer': {'type': 'command', 'command': reducer_cmd},
        }
        self.assertEqual(runner._get_steps(), [expected_step])

        runner.run()

        # single map task -> uniq collapses to one line -> wc -l prints 1
        total = sum(
            int(value)
            for _, value in job.parse_output(runner.cat_output()))
        self.assertEqual(total, 1)
def test_uniq_combiner(self):
    """A shell command ('uniq') can serve as the combiner of a step."""
    # put data in a .gz to force a single map task
    x_gz_path = join(self.tmp_dir, 'data.gz')
    with gzip.open(x_gz_path, 'wb') as x_gz:
        x_gz.write(b'x\nx\nx\nx\nx\nx\n')

    job = MRCmdJob(['--combiner-cmd=uniq', '--runner=local', x_gz_path])
    job.sandbox()

    with job.make_runner() as r:
        # mapper falls back to 'script' since only a combiner cmd was given
        self.assertEqual(
            r._get_steps(),
            [{
                'type': 'streaming',
                'mapper': {
                    'type': 'script',
                },
                'combiner': {
                    'type': 'command',
                    'command': 'uniq'}}])

        r.run()

        # there is only one map task, thus only one combiner,
        # thus there should only be one value
        self.assertEqual(b''.join(r.cat_output()), b'x\n')
def test_multiple_2(self):
    """A per-step command option (--reducer-cmd-2) applies to step 2."""
    input_lines = b'x\ny\nz\n'

    job = MRCmdJob(['--mapper-cmd=cat', '--reducer-cmd-2', 'wc -l',
                    '--runner=local', '--no-conf'])
    job.sandbox(stdin=BytesIO(input_lines))

    with job.make_runner() as runner:
        runner.run()
        line_counts = [int(line) for line in to_lines(runner.cat_output())]
        self.assertEqual(sum(line_counts), 3)
def test_cat_mapper(self):
    """'cat' as the mapper command passes input through unchanged."""
    input_data = b'x\ny\nz\n'

    job = MRCmdJob(['--mapper-cmd=cat', '--runner=local'])
    job.sandbox(stdin=BytesIO(input_data))

    with job.make_runner() as runner:
        expected_step = {
            'type': 'streaming',
            'mapper': {
                'type': 'command',
                'command': 'cat'
            }
        }
        self.assertEqual(runner._get_steps(), [expected_step])

        runner.run()

        # order isn't guaranteed, so compare sorted lines
        output_lines = sorted(
            line.strip() for line in runner.stream_output())
        self.assertEqual(output_lines, sorted(input_data.split()))
def test_cat_mapper(self):
    """'cat' as the mapper command passes input through unchanged."""
    input_data = b'x\ny\nz\n'

    job = MRCmdJob(['--mapper-cmd=cat', '--runner=local'])
    job.sandbox(stdin=BytesIO(input_data))

    with job.make_runner() as runner:
        expected_step = {
            'type': 'streaming',
            'mapper': {
                'type': 'command',
                'command': 'cat'}}
        self.assertEqual(runner._get_steps(), [expected_step])

        runner.run()

        # order isn't guaranteed, so compare sorted lines
        output_lines = sorted(
            line.strip() for line in to_lines(runner.cat_output()))
        self.assertEqual(output_lines, sorted(input_data.split()))
def test_command_streaming_step_without_mr_job_script(self):
    """A runner can execute command steps with no MRJob script at all.

    Fix: cleanup() was previously skipped when run() raised, leaking the
    runner's temporary directories; try/finally guarantees it runs.
    """
    # you don't need a script to run commands
    steps = MRCmdJob(['--mapper-cmd', 'cat'])._steps_desc()

    runner = LocalMRJobRunner(steps=steps, stdin=BytesIO(b'dog\n'))
    try:
        runner.run()
    finally:
        # always release the runner's temp dirs, even on failure
        runner.cleanup()
def test_cat_reducer(self):
    """'cat -e' as the reducer command marks each line end with '$'."""
    input_data = b'x\ny\nz\n'

    job = MRCmdJob(['--reducer-cmd', 'cat -e', '--runner=local'])
    job.sandbox(stdin=BytesIO(input_data))

    with job.make_runner() as runner:
        # mapper falls back to 'script' since only a reducer cmd was given
        expected_step = {
            'type': 'streaming',
            'mapper': {
                'type': 'script',
            },
            'reducer': {
                'type': 'command',
                'command': 'cat -e'
            }
        }
        self.assertEqual(runner._get_steps(), [expected_step])

        runner.run()

        output_lines = sorted(runner.stream_output())
        self.assertEqual(output_lines, [b'x$\n', b'y$\n', b'z$\n'])
def test_cat_reducer(self):
    """'cat -e' as the reducer command marks each line end with '$'."""
    input_data = b'x\ny\nz\n'

    job = MRCmdJob(['--reducer-cmd', 'cat -e', '--runner=local'])
    job.sandbox(stdin=BytesIO(input_data))

    with job.make_runner() as runner:
        # mapper falls back to 'script' since only a reducer cmd was given
        expected_step = {
            'type': 'streaming',
            'mapper': {
                'type': 'script',
            },
            'reducer': {
                'type': 'command',
                'command': 'cat -e'}}
        self.assertEqual(runner._get_steps(), [expected_step])

        runner.run()

        output_lines = sorted(to_lines(runner.cat_output()))
        self.assertEqual(output_lines, [b'x$\n', b'y$\n', b'z$\n'])
def test_multiple(self):
    """Command mapper/combiner/reducer all together on a single map task."""
    # put data in a .gz to force a single map task
    gz_path = join(self.tmp_dir, 'data.gz')
    with gzip.open(gz_path, 'wb') as gz_file:
        gz_file.write(b'x\nx\nx\nx\nx\nx\n')

    reducer_cmd = _bash_wrap('wc -l | tr -Cd "[:digit:]"')

    job = MRCmdJob([
        '--runner', 'local',
        '--mapper-cmd', 'cat -e',
        '--combiner-cmd', 'uniq',
        '--reducer-cmd', reducer_cmd,
        gz_path
    ])
    job.sandbox()

    with job.make_runner() as runner:
        expected_step = {
            'type': 'streaming',
            'mapper': {
                'type': 'command',
                'command': 'cat -e'
            },
            'combiner': {
                'type': 'command',
                'command': 'uniq'
            },
            'reducer': {
                'type': 'command',
                'command': reducer_cmd
            },
        }
        self.assertEqual(runner._get_steps(), [expected_step])

        runner.run()

        # single map task -> uniq collapses to one line -> wc -l prints 1
        total = sum(
            int(value)
            for _, value in job.parse_output(runner.cat_output()))
        self.assertEqual(total, 1)
def test_uniq_combiner(self):
    """A shell command ('uniq') can serve as the combiner of a step."""
    input_data = b'x\nx\nx\nx\nx\nx\n'

    job = MRCmdJob(['--combiner-cmd=uniq', '--runner=local'])
    job.sandbox(stdin=BytesIO(input_data))

    with job.make_runner() as runner:
        # mapper falls back to 'script' since only a combiner cmd was given
        expected_step = {
            'type': 'streaming',
            'mapper': {
                'type': 'script',
            },
            'combiner': {
                'type': 'command',
                'command': 'uniq'
            }
        }
        self.assertEqual(runner._get_steps(), [expected_step])

        runner.run()

        # there are 2 map tasks, each of which has 1 combiner, and all rows
        # are the same, so we should end up with just 2 values
        self.assertEqual(b''.join(runner.stream_output()), b'x\nx\n')
def test_passthrough_options(self):
    """--help exits 0 and lists the per-step passthrough options."""
    MRCmdJob(['--help'])

    self.exit.assert_called_once_with(0)
    self.assertIn('--reducer-cmd-2', self.stdout.getvalue())
def test_no_command_steps(self):
    """The inline runner refuses jobs that contain command steps."""
    job = MRCmdJob(['-r', 'inline', '--mapper-cmd', 'cat'])
    job.sandbox()

    with self.assertRaises(NotImplementedError):
        job.make_runner()