def test_update_counters(self):
    counters = {'Foo': {'Bar': 3, 'Baz': 1}}

    parse_mr_job_stderr(
        StringIO('reporter:counter:Foo,Baz,1\n'),
        counters=counters)

    assert_equal(counters, {'Foo': {'Bar': 3, 'Baz': 2}})
def test_update_counters(self):
    counters = {'Foo': {'Bar': 3, 'Baz': 1}}

    parse_mr_job_stderr(
        BytesIO(b'reporter:counter:Foo,Baz,1\n'),
        counters=counters)

    self.assertEqual(counters, {'Foo': {'Bar': 3, 'Baz': 2}})
def _run_step(self, step_num, step_type, input_path, output_path,
              working_dir, env, child_stdin=None):
    step = self._get_step(step_num)

    # if no mapper, just pass the data through (see #1141)
    if step_type == 'mapper' and not step.get('mapper'):
        copyfile(input_path, output_path)
        return

    # Passing local=False ensures the job uses proper names for file
    # options (see issue #851 on github)
    common_args = (['--step-num=%d' % step_num] +
                   self._mr_job_extra_args(local=False))

    if step_type == 'mapper':
        child_args = (
            ['--mapper'] + [input_path] + common_args)
    elif step_type == 'reducer':
        child_args = (
            ['--reducer'] + [input_path] + common_args)
    elif step_type == 'combiner':
        child_args = ['--combiner'] + common_args + ['-']

    has_combiner = (step_type == 'mapper' and 'combiner' in step)

    try:
        # Use custom stdout
        if has_combiner:
            child_stdout = BytesIO()
        else:
            child_stdout = open(output_path, 'wb')

        with save_current_environment():
            with save_cwd():
                os.environ.update(env)
                os.chdir(working_dir)

                child_instance = self._mrjob_cls(args=child_args)
                child_instance.sandbox(stdin=child_stdin,
                                       stdout=child_stdout)
                child_instance.execute()

        if has_combiner:
            sorted_lines = sorted(child_stdout.getvalue().splitlines())
            combiner_stdin = BytesIO(b'\n'.join(sorted_lines))
        else:
            child_stdout.flush()
    finally:
        child_stdout.close()

    while len(self._counters) <= step_num:
        self._counters.append({})
    parse_mr_job_stderr(child_instance.stderr.getvalue(),
                        counters=self._counters[step_num])

    if has_combiner:
        self._run_step(step_num, 'combiner', None, output_path,
                       working_dir, env, child_stdin=combiner_stdin)

        combiner_stdin.close()
def _parse_task_counters(self, task_type, step_num):
    """Parse all stderr files from the given task (if any)."""
    # don't disable if read_logs=False; parsing counters is
    # internal to Hadoop, not something that happens in log files
    stderr_paths = self.fs.ls(self._task_stderr_paths_glob(
        task_type, step_num))

    for stderr_path in stderr_paths:
        with open(stderr_path, 'rb') as stderr:
            parse_mr_job_stderr(stderr,
                                counters=self._counters[step_num])
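A minimal standalone sketch of the same pattern: parse_mr_job_stderr() reads counter lines from a bytes buffer (or any iterable of lines) and folds them into whatever dict is passed via counters=, so repeated calls accumulate across files. This assumes the public import path mrjob.parse, which the parse_counters() docstring below also references.

from io import BytesIO

from mrjob.parse import parse_mr_job_stderr

counters = {}

# each buffer stands in for one task's stderr file
for data in (b'reporter:counter:Foo,Bar,1\n',
             b'reporter:counter:Foo,Bar,2\n'):
    parse_mr_job_stderr(BytesIO(data), counters=counters)

assert counters == {'Foo': {'Bar': 3}}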
def _run_step(self, step_num, step_type, input_path, output_path,
              working_dir, env, child_stdin=None):
    step = self._get_step(step_num)

    common_args = (['--step-num=%d' % step_num] +
                   self._mr_job_extra_args(local=True))

    if step_type == 'mapper':
        child_args = (
            ['--mapper'] + [input_path] + common_args)
    elif step_type == 'reducer':
        child_args = (
            ['--reducer'] + [input_path] + common_args)
    elif step_type == 'combiner':
        child_args = ['--combiner'] + common_args + ['-']

    child_instance = self._mrjob_cls(args=child_args)

    has_combiner = (step_type == 'mapper' and 'combiner' in step)

    # Use custom stdout
    if has_combiner:
        child_stdout = StringIO()
    else:
        child_stdout = open(output_path, 'w')

    with save_current_environment():
        with save_cwd():
            os.environ.update(env)
            os.chdir(working_dir)

            child_instance.sandbox(stdin=child_stdin, stdout=child_stdout)
            child_instance.execute()

    if has_combiner:
        sorted_lines = sorted(child_stdout.getvalue().splitlines())
        combiner_stdin = StringIO('\n'.join(sorted_lines))
    else:
        child_stdout.flush()

    child_stdout.close()

    while len(self._counters) <= step_num:
        self._counters.append({})
    parse_mr_job_stderr(child_instance.stderr.getvalue(),
                        counters=self._counters[step_num])

    if has_combiner:
        self._run_step(step_num, 'combiner', None, output_path,
                       working_dir, env, child_stdin=combiner_stdin)

        combiner_stdin.close()
def test_negative_counters(self):
    # kind of poor practice to use negative counters, but Hadoop
    # Streaming supports it (negative numbers are integers too!)
    self.assertEqual(
        parse_mr_job_stderr([b'reporter:counter:Foo,Bar,-2\n']),
        {'counters': {'Foo': {'Bar': -2}},
         'statuses': [], 'other': []})
def test_counters_and_status(self):
    mr_job = MRJob().sandbox()

    mr_job.increment_counter('Foo', 'Bar')
    mr_job.set_status('Initializing qux gradients...')
    mr_job.increment_counter('Foo', 'Bar')
    mr_job.increment_counter('Foo', 'Baz', 20)
    mr_job.set_status('Sorting metasyntactic variables...')

    parsed_stderr = parse_mr_job_stderr(mr_job.stderr.getvalue())

    self.assertEqual(
        parsed_stderr,
        {'counters': {'Foo': {'Bar': 2, 'Baz': 20}},
         'statuses': ['Initializing qux gradients...',
                      'Sorting metasyntactic variables...'],
         'other': []})

    # make sure parse_counters() works
    self.assertEqual(mr_job.parse_counters(), parsed_stderr['counters'])
def _run_step(self, step_num, step_type, input_path, output_path,
              working_dir, env, child_stdin=None):
    step = self._get_step(step_num)

    # Passing local=False ensures the job uses proper names for file
    # options (see issue #851 on github)
    common_args = (["--step-num=%d" % step_num] +
                   self._mr_job_extra_args(local=False))

    if step_type == "mapper":
        child_args = ["--mapper"] + [input_path] + common_args
    elif step_type == "reducer":
        child_args = ["--reducer"] + [input_path] + common_args
    elif step_type == "combiner":
        child_args = ["--combiner"] + common_args + ["-"]

    has_combiner = step_type == "mapper" and "combiner" in step

    # Use custom stdout
    if has_combiner:
        child_stdout = BytesIO()
    else:
        child_stdout = open(output_path, "wb")

    with save_current_environment():
        with save_cwd():
            os.environ.update(env)
            os.chdir(working_dir)

            child_instance = self._mrjob_cls(args=child_args)
            child_instance.sandbox(stdin=child_stdin, stdout=child_stdout)
            child_instance.execute()

    if has_combiner:
        sorted_lines = sorted(child_stdout.getvalue().splitlines())
        combiner_stdin = BytesIO(b"\n".join(sorted_lines))
    else:
        child_stdout.flush()

    child_stdout.close()

    while len(self._counters) <= step_num:
        self._counters.append({})
    parse_mr_job_stderr(child_instance.stderr.getvalue(),
                        counters=self._counters[step_num])

    if has_combiner:
        self._run_step(step_num, "combiner", None, output_path,
                       working_dir, env, child_stdin=combiner_stdin)

        combiner_stdin.close()
def test_commas_in_counters(self): # commas should be replaced with semicolons mr_job = MRJob().sandbox() mr_job.increment_counter("Bad items", "a, b, c") mr_job.increment_counter("girl, interrupted", "movie") parsed_stderr = parse_mr_job_stderr(mr_job.stderr.getvalue()) self.assertEqual(parsed_stderr["counters"], {"Bad items": {"a; b; c": 1}, "girl; interrupted": {"movie": 1}})
def test_negative_and_zero_counters(self): mr_job = MRJob().sandbox() mr_job.increment_counter("Foo", "Bar", -1) mr_job.increment_counter("Foo", "Baz") mr_job.increment_counter("Foo", "Baz", -1) mr_job.increment_counter("Qux", "Quux", 0) parsed_stderr = parse_mr_job_stderr(mr_job.stderr.getvalue()) self.assertEqual(parsed_stderr["counters"], {"Foo": {"Bar": -1, "Baz": 0}, "Qux": {"Quux": 0}})
def test_negative_and_zero_counters(self):
    mr_job = MRJob().sandbox()

    mr_job.increment_counter('Foo', 'Bar', -1)
    mr_job.increment_counter('Foo', 'Baz')
    mr_job.increment_counter('Foo', 'Baz', -1)
    mr_job.increment_counter('Qux', 'Quux', 0)

    parsed_stderr = parse_mr_job_stderr(mr_job.stderr.getvalue())
    self.assertEqual(parsed_stderr['counters'],
                     {'Foo': {'Bar': -1, 'Baz': 0},
                      'Qux': {'Quux': 0}})
def test_commas_in_counters(self):
    # commas should be replaced with semicolons
    mr_job = MRJob().sandbox()

    mr_job.increment_counter('Bad items', 'a, b, c')
    mr_job.increment_counter('girl, interrupted', 'movie')

    parsed_stderr = parse_mr_job_stderr(mr_job.stderr.getvalue())

    self.assertEqual(parsed_stderr['counters'],
                     {'Bad items': {'a; b; c': 1},
                      'girl; interrupted': {'movie': 1}})
def test_garbled_counters(self):
    # we should be able to do something graceful with
    # garbled counters and status messages
    BAD_LINES = [
        'reporter:counter:Foo,Bar,Baz,1\n',  # too many items
        'reporter:counter:Foo,1\n',  # too few items
        'reporter:counter:Foo,Bar,a million\n',  # not a number
        'reporter:counter:Foo,Bar,1.0\n',  # not an int
        'reporter:crounter:Foo,Bar,1\n',  # not a valid reporter
        'reporter,counter:Foo,Bar,1\n',  # wrong format!
    ]

    self.assertEqual(parse_mr_job_stderr(BAD_LINES),
                     {'counters': {}, 'statuses': [], 'other': BAD_LINES})
def test_parsing(self):
    INPUT = BytesIO(
        b'reporter:counter:Foo,Bar,2\n' +
        b'reporter:status:Baz\n' +
        b'reporter:status:Baz\n' +
        b'reporter:counter:Foo,Bar,1\n' +
        b'reporter:counter:Foo,Baz,1\n' +
        b'reporter:counter:Quux Subsystem,Baz,42\n' +
        b'Warning: deprecated metasyntactic variable: garply\n')

    self.assertEqual(
        parse_mr_job_stderr(INPUT),
        {'counters': {'Foo': {'Bar': 3, 'Baz': 1},
                      'Quux Subsystem': {'Baz': 42}},
         'statuses': ['Baz', 'Baz'],
         'other': ['Warning: deprecated metasyntactic variable: garply\n']
         })
def parse_counters(self, counters=None):
    """.. deprecated:: 0.4.2

    Parse the counters from the given sandboxed job's ``self.stderr``;
    superseded by :py:func:`mrjob.parse.parse_mr_job_stderr`.

    This was only useful for testing individual mappers/reducers
    without a runner; normally you'd just use
    :py:meth:`runner.counters() <mrjob.runner.MRJobRunner.counters()>`.
    """
    if self.stderr == sys.stderr:
        raise AssertionError('You must call sandbox() first;'
                             ' parse_counters() is for testing only.')

    log.warning(
        'parse_counters() is deprecated and will be removed in v0.5.0')

    stderr_results = parse_mr_job_stderr(self.stderr.getvalue(), counters)
    return stderr_results['counters']
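A hedged usage sketch for the deprecated method above, mirroring the sandboxed tests in this file: sandbox() swaps in an in-memory stderr, increment_counter() writes a reporter line to it, and parse_counters() reads the counters back (after logging the deprecation warning).

from mrjob.job import MRJob

mr_job = MRJob().sandbox()
mr_job.increment_counter('Foo', 'Bar', 2)

# logs the deprecation warning, then returns just the 'counters' dict
assert mr_job.parse_counters() == {'Foo': {'Bar': 2}}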
def test_counters_and_status(self): mr_job = MRJob().sandbox() mr_job.increment_counter("Foo", "Bar") mr_job.set_status("Initializing qux gradients...") mr_job.increment_counter("Foo", "Bar") mr_job.increment_counter("Foo", "Baz", 20) mr_job.set_status("Sorting metasyntactic variables...") parsed_stderr = parse_mr_job_stderr(mr_job.stderr.getvalue()) self.assertEqual( parsed_stderr, { "counters": {"Foo": {"Bar": 2, "Baz": 20}}, "statuses": ["Initializing qux gradients...", "Sorting metasyntactic variables..."], "other": [], }, )
def _process_stderr_from_script(self, stderr):
    """Handle stderr a line at a time:

    - for counter lines, store counters
    - for status messages, log the status change
    - for all other lines, log an error, and yield the line
    """
    for line in stderr:
        # just pass one line at a time to parse_mr_job_stderr(),
        # so we can print error and status messages in realtime
        parsed = parse_mr_job_stderr([line], counters=self._counters)

        # in practice there's only going to be at most one line in
        # one of these lists, but the code is cleaner this way
        for status in parsed['statuses']:
            log.info('status: %s' % status)

        for line in parsed['other']:
            log.error('STDERR: %s' % line.rstrip('\n'))
            yield line
def _process_stderr_from_script(self, stderr, step_num=0):
    """Handle stderr a line at a time:

    * for counter lines, store counters
    * for status messages, log the status change
    * for all other lines, log an error, and yield the line
    """
    for line in stderr:
        # just pass one line at a time to parse_mr_job_stderr(),
        # so we can print error and status messages in realtime
        parsed = parse_mr_job_stderr(
            [line], counters=self._counters[step_num])

        # in practice there's only going to be at most one line in
        # one of these lists, but the code is cleaner this way
        for status in parsed['statuses']:
            log.info('Status: %s' % status)

        for line in parsed['other']:
            log.debug('STDERR: %s' % line.rstrip('\r\n'))
            yield line
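The one-line-at-a-time contract both generators above rely on can be shown directly: each call gets a single-element list, counters accumulate in the shared dict passed via counters=, and anything unrecognized comes back under 'other'. A sketch, again assuming the mrjob.parse import path.

from mrjob.parse import parse_mr_job_stderr

counters = {}
lines = [b'reporter:counter:Foo,Bar,2\n',
         b'reporter:status:sorting\n',
         b'not a reporter line\n']

for line in lines:
    parsed = parse_mr_job_stderr([line], counters=counters)
    # at most one entry per single-line call
    for status in parsed['statuses']:
        print('Status: %s' % status)

assert counters == {'Foo': {'Bar': 2}}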
def test_parsing(self):
    INPUT = StringIO(
        'reporter:counter:Foo,Bar,2\n' +
        'reporter:status:Baz\n' +
        'reporter:status:Baz\n' +
        'reporter:counter:Foo,Bar,1\n' +
        'reporter:counter:Foo,Baz,1\n' +
        'reporter:counter:Quux Subsystem,Baz,42\n' +
        'Warning: deprecated metasyntactic variable: garply\n')

    self.assertEqual(
        parse_mr_job_stderr(INPUT),
        {'counters': {'Foo': {'Bar': 3, 'Baz': 1},
                      'Quux Subsystem': {'Baz': 42}},
         'statuses': ['Baz', 'Baz'],
         'other': ['Warning: deprecated metasyntactic variable: garply\n']
         })
def test_empty(self):
    assert_equal(parse_mr_job_stderr(StringIO()),
                 {'counters': {}, 'statuses': [], 'other': []})
def test_read_multiple_lines_from_buffer(self):
    self.assertEqual(
        parse_mr_job_stderr(b'reporter:counter:Foo,Bar,2\nwoot\n'),
        {'counters': {'Foo': {'Bar': 2}},
         'statuses': [], 'other': ['woot\n']})
def test_read_single_line(self):
    # LocalMRJobRunner runs parse_mr_job_stderr on one line at a time.
    self.assertEqual(parse_mr_job_stderr(b'reporter:counter:Foo,Bar,2\n'),
                     {'counters': {'Foo': {'Bar': 2}},
                      'statuses': [], 'other': []})
def _run_step(self, step_num, step_type, input_path, output_path,
              working_dir, env, child_stdin=None):
    step = self._get_step(step_num)

    # Passing local=False ensures the job uses proper names for file
    # options (see issue #851 on github)
    common_args = (['--step-num=%d' % step_num] +
                   self._mr_job_extra_args(local=False))

    if step_type == 'mapper':
        child_args = (['--mapper'] + [input_path] + common_args)
    elif step_type == 'reducer':
        child_args = (['--reducer'] + [input_path] + common_args)
    elif step_type == 'combiner':
        child_args = ['--combiner'] + common_args + ['-']

    has_combiner = (step_type == 'mapper' and 'combiner' in step)

    try:
        # Use custom stdout
        if has_combiner:
            child_stdout = BytesIO()
        else:
            child_stdout = open(output_path, 'wb')

        with save_current_environment():
            with save_cwd():
                os.environ.update(env)
                os.chdir(working_dir)

                child_instance = self._mrjob_cls(args=child_args)
                child_instance.sandbox(stdin=child_stdin,
                                       stdout=child_stdout)
                child_instance.execute()

        if has_combiner:
            sorted_lines = sorted(child_stdout.getvalue().splitlines())
            combiner_stdin = BytesIO(b'\n'.join(sorted_lines))
        else:
            child_stdout.flush()
    finally:
        child_stdout.close()

    while len(self._counters) <= step_num:
        self._counters.append({})
    parse_mr_job_stderr(child_instance.stderr.getvalue(),
                        counters=self._counters[step_num])

    if has_combiner:
        self._run_step(step_num, 'combiner', None, output_path,
                       working_dir, env, child_stdin=combiner_stdin)

        combiner_stdin.close()
def test_empty(self):
    self.assertEqual(parse_mr_job_stderr(BytesIO()),
                     {'counters': {}, 'statuses': [], 'other': []})