def _hadoop_streaming_commands(self, step_num): version = self.get_hadoop_version() # Hadoop streaming stuff mapper, bash_wrap_mapper = self._render_substep( step_num, 'mapper') combiner, bash_wrap_combiner = self._render_substep( step_num, 'combiner') reducer, bash_wrap_reducer = self._render_substep( step_num, 'reducer') if (combiner is not None and not supports_combiners_in_hadoop_streaming(version)): # krazy hack to support combiners on hadoop <0.20 bash_wrap_mapper = True mapper = "%s | sort | %s" % (mapper, combiner) # take the combiner away, hadoop will just be confused combiner = None bash_wrap_combiner = False if bash_wrap_mapper: mapper = bash_wrap(mapper) if bash_wrap_combiner: combiner = bash_wrap(combiner) if bash_wrap_reducer: reducer = bash_wrap(reducer) return mapper, combiner, reducer
def steps(self):
    """Build this job's steps from the parsed ``--*-cmd`` options,
    bash-wrapping each substep command that was supplied."""
    result = []
    for step in self.options.steps:
        # only pass along the substeps actually present in this step
        kwargs = {
            name + '_cmd': bash_wrap(step[name])
            for name in ('mapper', 'combiner', 'reducer')
            if name in step
        }
        result.append(self.mr(**kwargs))
    return result
def steps(self):
    """Translate each configured step's mapper/combiner/reducer
    commands into an :py:class:`MRStep`, bash-wrapping every command
    that is present."""
    return [
        MRStep(**{
            name + '_cmd': bash_wrap(step[name])
            for name in ('mapper', 'combiner', 'reducer')
            if name in step
        })
        for step in self.options.steps
    ]
def test_multiple(self):
    # pipeline is cat -e | uniq | (bash-wrapped line count); six
    # identical input lines collapse to one distinct line after the
    # mapper adds `$` markers, so the reducer should emit b'2'... no,
    # whatever the local runner produces for this pipeline: b'2'
    stdin_data = b'x\nx\nx\nx\nx\nx\n'
    mapper_cmd = 'cat -e'
    reducer_cmd = bash_wrap('wc -l | tr -Cd "[:digit:]"')

    job = CmdJob([
        '--runner', 'local',
        '--mapper-cmd', mapper_cmd,
        '--combiner-cmd', 'uniq',
        '--reducer-cmd', reducer_cmd,
    ])
    job.sandbox(stdin=BytesIO(stdin_data))

    expected_steps = [{
        'type': 'streaming',
        'mapper': {'type': 'command', 'command': mapper_cmd},
        'combiner': {'type': 'command', 'command': 'uniq'},
        'reducer': {'type': 'command', 'command': reducer_cmd},
    }]

    with job.make_runner() as runner:
        self.assertEqual(runner._get_steps(), expected_steps)
        runner.run()
        self.assertEqual(list(runner.stream_output()), [b'2'])
def _hadoop_streaming_commands(self, step_num): # Hadoop streaming stuff mapper, bash_wrap_mapper = self._render_substep(step_num, "mapper") combiner, bash_wrap_combiner = self._render_substep(step_num, "combiner") reducer, bash_wrap_reducer = self._render_substep(step_num, "reducer") if bash_wrap_mapper: mapper = bash_wrap(mapper) if bash_wrap_combiner: combiner = bash_wrap(combiner) if bash_wrap_reducer: reducer = bash_wrap(reducer) return mapper, combiner, reducer
def _hadoop_streaming_commands(self, step_num): # Hadoop streaming stuff mapper, bash_wrap_mapper = self._render_substep(step_num, 'mapper') combiner, bash_wrap_combiner = self._render_substep( step_num, 'combiner') reducer, bash_wrap_reducer = self._render_substep(step_num, 'reducer') if bash_wrap_mapper: mapper = bash_wrap(mapper) if bash_wrap_combiner: combiner = bash_wrap(combiner) if bash_wrap_reducer: reducer = bash_wrap(reducer) return mapper, combiner, reducer
def test_pre_filter_escaping(self):
    # ESCAPE ALL THE THINGS!!!
    # the pre-filter is bash-wrapped once by bash_wrap() and again by
    # the runner, so the quotes are escaped twice over
    step = {
        "type": "streaming",
        "mapper": {
            "type": "script",
            "pre_filter": bash_wrap("grep 'anything'"),
        },
    }
    expected_mapper = (
        "bash -c 'bash -c '\\''grep"
        " '\\''\\'\\'''\\''anything'\\''\\'\\'''\\'''\\'' |"
        " python my_job.py --step-num=0 --mapper'")

    self._assert_streaming_step(
        step,
        ["-mapper", expected_mapper,
         "-jobconf", "mapred.reduce.tasks=0"])
def _hadoop_streaming_commands(self, step_num): version = self.get_hadoop_version() # Hadoop streaming stuff mapper, bash_wrap_mapper = self._render_substep( step_num, 'mapper') combiner, bash_wrap_combiner = self._render_substep( step_num, 'combiner') reducer, bash_wrap_reducer = self._render_substep( step_num, 'reducer') if bash_wrap_mapper: mapper = bash_wrap(mapper) if bash_wrap_combiner: combiner = bash_wrap(combiner) if bash_wrap_reducer: reducer = bash_wrap(reducer) return mapper, combiner, reducer
def test_pre_filter_escaping(self):
    # ESCAPE ALL THE THINGS!!!
    pre_filter = bash_wrap("grep 'anything'")
    expected_args = [
        '-mapper',
        "bash -c 'bash -c '\\''grep"
        " '\\''\\'\\'''\\''anything'\\''\\'\\'''\\'''\\'' |"
        " python my_job.py --step-num=0 --mapper'",
        '-jobconf',
        'mapred.reduce.tasks=0',
    ]

    self._assert_streaming_step(
        {
            'type': 'streaming',
            'mapper': {'type': 'script', 'pre_filter': pre_filter},
        },
        expected_args)
def test_pre_filter_escaping(self):
    # ESCAPE ALL THE THINGS!!!
    # bash_wrap() quotes once; the runner quotes again when it builds
    # the -mapper argument, hence the doubly-escaped quotes below
    mapper = {
        'type': 'script',
        'pre_filter': bash_wrap("grep 'anything'"),
    }
    escaped = ("bash -c 'bash -c '\\''grep"
               " '\\''\\'\\'''\\''anything'\\''\\'\\'''\\'''\\'' |"
               " python my_job.py --step-num=0 --mapper'")

    self._assert_streaming_step(
        {'type': 'streaming', 'mapper': mapper},
        ['-mapper', escaped, '-jobconf', 'mapred.reduce.tasks=0'])
def test_pre_filter_escaping(self):
    # ESCAPE ALL THE THINGS!!!
    self.runner._steps = [{
        'type': 'streaming',
        'mapper': {
            'type': 'script',
            'pre_filter': bash_wrap("grep 'anything'"),
        },
    }]

    escaped_mapper = (
        "bash -c 'bash -c '\\''grep"
        " '\\''\\'\\'''\\''anything'\\''\\'\\'''\\'''\\'' | " +
        PYTHON_BIN +
        " my_job.py --step-num=0 --mapper'")
    expected = (
        self.BASIC_HADOOP_ARGS +
        ['-D', 'mapreduce.job.reduces=0'] +
        self.BASIC_JOB_ARGS +
        ['-mapper', escaped_mapper])

    self.assertEqual(self.runner._args_for_streaming_step(0), expected)
def test_pre_filter_escaping(self):
    # ESCAPE ALL THE THINGS!!!
    step = {
        'type': 'streaming',
        'mapper': {
            'type': 'script',
            'pre_filter': bash_wrap("grep 'anything'"),
        },
    }
    self.runner._steps = [step]

    mapper_arg = ("bash -c 'bash -c '\\''grep"
                  " '\\''\\'\\'''\\''anything'\\''\\'\\'''\\'''\\'' | " +
                  PYTHON_BIN + " my_job.py --step-num=0 --mapper'")

    self.assertEqual(
        self.runner._args_for_streaming_step(0),
        self.BASIC_HADOOP_ARGS + ['-D', 'mapreduce.job.reduces=0'] +
        self.BASIC_JOB_ARGS + ['-mapper', mapper_arg])
def test_multiple(self):
    # cat -e | uniq | (bash-wrapped line counter) over six identical
    # lines should produce the string '2'
    input_lines = 'x\nx\nx\nx\nx\nx\n'
    map_cmd = 'cat -e'
    red_cmd = bash_wrap('wc -l | tr -Cd "[:digit:]"')

    job = CmdJob([
        '--runner', 'local',
        '--mapper-cmd', map_cmd,
        '--combiner-cmd', 'uniq',
        '--reducer-cmd', red_cmd,
    ])
    job.sandbox(stdin=StringIO(input_lines))

    with job.make_runner() as runner:
        self.assertEqual(runner._get_steps(), [{
            'type': 'streaming',
            'mapper': {'type': 'command', 'command': map_cmd},
            'combiner': {'type': 'command', 'command': 'uniq'},
            'reducer': {'type': 'command', 'command': red_cmd},
        }])
        runner.run()
        self.assertEqual(list(runner.stream_output()), ['2'])
def test_multiple(self):
    # mapper/combiner/reducer commands should be recorded in the step
    # description and executed in order by the local runner
    reducer_cmd = bash_wrap('wc -l | tr -Cd "[:digit:]"')
    mapper_cmd = "cat -e"

    args = [
        "--runner", "local",
        "--mapper-cmd", mapper_cmd,
        "--combiner-cmd", "uniq",
        "--reducer-cmd", reducer_cmd,
    ]
    job = CmdJob(args)
    job.sandbox(stdin=BytesIO(b"x\nx\nx\nx\nx\nx\n"))

    step = {
        "type": "streaming",
        "mapper": {"type": "command", "command": mapper_cmd},
        "combiner": {"type": "command", "command": "uniq"},
        "reducer": {"type": "command", "command": reducer_cmd},
    }
    with job.make_runner() as r:
        self.assertEqual(r._get_steps(), [step])
        r.run()
        self.assertEqual(list(r.stream_output()), [b"2"])
def mapper_cmd(self):
    """Return the bash-wrapped shell command to use as the mapper."""
    command = './wordcount.sh mapper'
    return bash_wrap(command)
def reducer_cmd(self):
    """Return the bash-wrapped shell command to use as the reducer."""
    command = './wordcount.sh reducer'
    return bash_wrap(command)