def setUp(self):
    """Stub out the runner's task-log helpers.

    After the parent fixture runs, ``_ls_task_syslogs`` and
    ``get_hadoop_version`` on ``self.runner`` are replaced with mocks
    (the latter returning ``'2.7.1'``), and
    ``mrjob.logs.mixin._interpret_task_logs`` is patched through
    ``self.start()``; the resulting mock is kept on the test case.
    """
    super(InterpretTaskLogsTestCase, self).setUp()

    runner = self.runner
    runner._ls_task_syslogs = Mock()

    interpret_patcher = patch('mrjob.logs.mixin._interpret_task_logs')
    self._interpret_task_logs = self.start(interpret_patcher)

    runner.get_hadoop_version = Mock(return_value='2.7.1')
def _test_run_combiner(self, sort_values=False, num_reducers=None):
    """Run ``_run_combiner()`` on a mock RDD and inspect the calls made.

    Checks that the RDD returned is the mock itself, that
    ``combineByKey()`` was called, and that every call recorded after it
    either is ``mapValues()`` with a size-preserving function or passes
    ``preservesPartitioning=True``.
    """
    rdd = self.mock_rdd()

    combiner_job = Mock()
    combiner_job.pick_protocols.return_value = (Mock(), Mock())

    final_rdd = _run_combiner(
        combiner_job, rdd,
        sort_values=sort_values,
        num_reducers=num_reducers)
    self.assertEqual(final_rdd, rdd)  # mock RDD's methods return it

    # check that we preserve partitions after calling combineByKey()
    #
    # Python 3.4 and 3.5's mock modules have slightly different ways
    # of tracking function calls. to work around this, we avoid calling
    # assert_called() and just inspect `method_calls` directly
    seen_combine = False

    for name, args, kwargs in rdd.method_calls:
        if not seen_combine:
            # nothing to verify until combineByKey() shows up
            if name == 'combineByKey':
                seen_combine = True
            continue

        if name == 'mapValues':
            # mapValues() doesn't have to use preservesPartitioning
            # because it's just encoding the list of all values for a key
            self._assert_maps_list_to_list_of_same_size(args[0])
        else:
            self.assertEqual(kwargs.get('preservesPartitioning'), True)

    # check that combineByKey() was actually called
    self.assertTrue(seen_combine)
def setUp(self):
    """Patch the step-log helpers so tests only deal with paths.

    Tests populate ``self.mock_paths`` (paths that "exist") and
    ``self.path_to_mock_result`` (path -> parsed result). The patched
    ``_cat_log()`` records each known path it is asked for in
    ``self.mock_paths_catted`` and returns the path, which the patched
    ``_parse_task_stderr()`` then looks up in ``path_to_mock_result``
    (``None`` when absent).
    """
    super(InterpretEMRStepStderrTestCase, self).setUp()

    # instead of mocking out contents of files, just mock out
    # what _parse_task_stderr() should return, and have
    # _cat_log() just pass through the path
    self.mock_paths = []
    self.path_to_mock_result = {}

    self.mock_paths_catted = []

    def mock_cat_log(fs, path):
        if path in self.mock_paths:
            self.mock_paths_catted.append(path)
        return path

    def mock_parse_task_stderr(path_from_mock_cat_log):
        return self.path_to_mock_result.get(path_from_mock_cat_log)

    # ls() ignores the directory asked for and returns every mock path
    def mock_ls(log_dir):
        return self.mock_paths

    self.mock_fs = Mock()
    self.mock_fs.ls = Mock(side_effect=mock_ls)

    self.mock_cat_log = self.start(
        patch('mrjob.logs.step._cat_log', side_effect=mock_cat_log))

    self.start(patch('mrjob.logs.step._parse_task_stderr',
                     side_effect=mock_parse_task_stderr))
def setUp(self):
    """Build a mock filesystem backed by ``self.mock_paths``.

    ``mock_fs.ls(log_dir)`` yields the entries of ``self.mock_paths``
    that fall under *log_dir*, raises any ``Exception`` instance it
    encounters in that list, and raises ``IOError`` once exhausted if
    nothing matched. ``mock_fs.exists(log_dir)`` is ``any()`` over that
    same listing. ``self.mock_matcher`` returns the keyword arguments it
    is given as a dict.
    """
    super(LsLogsTestCase, self).setUp()

    self.mock_fs = Mock()
    self.mock_paths = []

    def mock_fs_ls(log_dir):
        dir_prefix = log_dir.rstrip('/') + '/'
        found_any = False

        for entry in self.mock_paths:
            if isinstance(entry, Exception):
                raise entry
            if entry.startswith(dir_prefix):
                yield entry
                found_any = True

        if not found_any:
            raise IOError

    def mock_fs_exists(log_dir):
        listing = mock_fs_ls(log_dir)
        return any(listing)

    self.mock_fs.ls = Mock(side_effect=mock_fs_ls)
    self.mock_fs.exists = Mock(side_effect=mock_fs_exists)

    # a matcher that cheerfully passes through kwargs
    def mock_matcher(path, **kwargs):
        return dict(kwargs)

    self.mock_matcher = Mock(side_effect=mock_matcher)
def _test_run_reducer(self, num_reducers=None):
    """Run ``_run_reducer()`` on a mock RDD and inspect the calls made.

    Every call recorded before ``mapPartitions()`` must pass
    ``preservesPartitioning=True``; ``mapPartitions()`` itself and every
    call after it must pass ``preservesPartitioning=bool(num_reducers)``.
    """
    rdd = self.mock_rdd()

    reducer_job = Mock()
    reducer_job.pick_protocols.return_value = (Mock(), Mock())

    final_rdd = _run_reducer(reducer_job, rdd, num_reducers=num_reducers)
    self.assertEqual(final_rdd, rdd)  # mock RDD's methods return it

    reached_map_partitions = False

    for name, args, kwargs in rdd.method_calls:
        if name == 'mapPartitions':
            reached_map_partitions = True

        if reached_map_partitions:
            # From mapPartitions() onward, we only care about keeping
            # the same partitions if the number of partitions is fixed
            expected = bool(num_reducers)
        else:
            # We want to make sure we keep the original partitions
            # before reaching mapPartitions()
            expected = True

        self.assertEqual(kwargs.get('preservesPartitioning'), expected)

    # sanity-check that mapPartitions() was actually called
    self.assertTrue(reached_map_partitions)
def setUp(self):
    """Mock the runner's log interpreters and patch ``_pick_error``.

    The three ``_interpret_*`` methods on ``self.runner`` are replaced
    with fresh mocks, and ``mrjob.logs.mixin._pick_error`` is patched
    through ``self.start()``.
    """
    super(PickErrorTestCase, self).setUp()

    interpreter_names = (
        '_interpret_history_log',
        '_interpret_step_logs',
        '_interpret_task_logs',
    )
    for method_name in interpreter_names:
        setattr(self.runner, method_name, Mock())

    pick_error_patcher = patch('mrjob.logs.mixin._pick_error')
    self._pick_error = self.start(pick_error_patcher)
def test_try_till_success(self):
    """With ``max_tries=0``, two IOErrors are retried until a call works.

    The wrapped ``f`` fails twice and then returns ``None``; it should
    end up having been called three times.
    """
    def always_retry(x):
        return True

    flaky = Mock()
    # __name__ is set explicitly because a bare Mock doesn't have one
    flaky.f = Mock(__name__='f', side_effect=[IOError, IOError, None])

    wrapper = RetryWrapper(
        flaky,
        retry_if=always_retry,
        backoff=0.0001,
        max_tries=0)

    wrapper.f()

    self.assertEqual(flaky.f.call_count, 3)
def test_failure_raises_if_all_tries_fail(self):
    """With ``max_tries=2`` and two failures, the IOError propagates.

    The wrapped ``f`` should have been called exactly twice.
    """
    def always_retry(x):
        return True

    broken = Mock()
    broken.f = Mock(__name__='f', side_effect=[IOError, IOError])

    wrapper = RetryWrapper(
        broken,
        retry_if=always_retry,
        backoff=0.0001,
        max_tries=2)

    self.assertRaises(IOError, wrapper.f)

    self.assertEqual(broken.f.call_count, 2)
def test_failure(self):
    """A single IOError is retried and the second call's result returned.

    The wrapped ``f`` raises once, then returns ``1``; the wrapper should
    return ``1`` after calling it twice.
    """
    def always_retry(x):
        return True

    flaky = Mock()
    flaky.f = Mock(__name__='f', side_effect=[IOError, 1])

    wrapper = RetryWrapper(
        flaky,
        retry_if=always_retry,
        backoff=0.0001,
        max_tries=2)

    result = wrapper.f()

    self.assertEqual(result, 1)
    self.assertEqual(flaky.f.call_count, 2)
def test_success(self):
    """A call that succeeds immediately is made once, with no arguments."""
    def always_retry(x):
        return True

    reliable = Mock()
    reliable.f = Mock(__name__='f', side_effect=None)

    wrapper = RetryWrapper(
        reliable,
        retry_if=always_retry,
        backoff=0.0001,
        max_tries=2)

    wrapper.f()

    reliable.f.assert_called_once_with()
def setUp(self):
    """Set up a mock filesystem and patch the ``mrjob.logs.wrap`` logger.

    ``self.mock_fs.cat()`` returns an empty tuple and
    ``self.mock_fs.exists()`` returns ``True`` by default;
    ``self.mock_data`` starts out as ``None``. The patched logger is
    available as ``self.mock_log``.
    """
    # actually invoke the parent fixture; a bare super(...) expression
    # builds the proxy object but never runs the parent's setUp()
    super(CatLogsTestCase, self).setUp()

    self.mock_data = None

    self.mock_fs = Mock()
    self.mock_fs.cat = Mock(return_value=())
    self.mock_fs.exists = Mock(return_value=True)

    self.mock_log = self.start(patch('mrjob.logs.wrap.log'))
def setUp(self):
    """Create a ``HadoopJobRunner`` with log-dir lookup and fs mocked.

    ``mrjob.hadoop.log`` is patched before the runner is built; the
    runner's ``_hadoop_log_dirs()`` returns ``[]`` and its
    ``fs.exists()`` returns ``True``. The log mock is reset at the end
    so tests only see messages logged after set-up.
    """
    super(StreamingLogDirsTestCase, self).setUp()

    log_patcher = patch('mrjob.hadoop.log')
    self.log = self.start(log_patcher)

    runner = HadoopJobRunner()
    runner._hadoop_log_dirs = Mock(return_value=[])
    runner.fs.exists = Mock(return_value=True)
    self.runner = runner

    # ignore logging from HadoopJobRunner init
    self.log.reset_mock()
def test_wrapping(self):
    """Arguments are forwarded to each alternative; ``__name__`` is kept.

    The first alternative raises IOError, so the second is tried with
    the same positional and keyword arguments and its result returned.
    """
    def is_io_error(ex):
        return isinstance(ex, IOError)

    first = Mock()
    first.f = Mock(__name__='f', side_effect=IOError)

    second = Mock()
    second.f = Mock(__name__='f', return_value=2)

    go_round = RetryGoRound([first, second], is_io_error)

    result = go_round.f('foo', bar='baz')
    self.assertEqual(result, 2)

    for alternative in (first, second):
        alternative.f.assert_called_once_with('foo', bar='baz')

    self.assertEqual(go_round.f.__name__, 'f')
def test_multiple_files(self):
    """``cat()`` over three paths separates each file's chunks with ``b''``.

    ``ls()`` is mocked to return three paths and ``_cat_file()`` to
    return the same two chunks for each of them.
    """
    per_file_chunks = [b'chunk1\n', b'chunk2']

    fs = Filesystem()
    fs.ls = Mock(return_value=['path1', 'path2', 'path3'])
    fs._cat_file = Mock(return_value=list(per_file_chunks))

    # an empty bytestring goes between files, but not after the last one
    expected = (
        per_file_chunks + [b''] +
        per_file_chunks + [b''] +
        per_file_chunks
    )

    self.assertEqual(list(fs.cat('whatever')), expected)
def test_can_wrap_around(self):
    """After the second alternative fails, the first is tried again.

    First call: alternative 1 raises, alternative 2 returns ``2``.
    Second call: alternative 2 raises, alternative 1 returns ``1``.
    Each alternative's ``f`` ends up called twice.
    """
    def is_io_error(ex):
        return isinstance(ex, IOError)

    first = Mock()
    first.f = Mock(__name__='f', side_effect=[IOError, 1])

    second = Mock()
    second.f = Mock(__name__='f', side_effect=[2, IOError])

    go_round = RetryGoRound([first, second], is_io_error)

    for expected_result in (2, 1):
        self.assertEqual(go_round.f(), expected_result)

    self.assertEqual(first.f.call_count, 2)
    self.assertEqual(second.f.call_count, 2)
def test_unrecoverable_error(self):
    """An error the predicate rejects is raised without trying others.

    The first alternative raises ValueError, which is not an IOError,
    so both calls propagate it and the second alternative is never used.
    """
    def is_io_error(ex):
        return isinstance(ex, IOError)

    first = Mock()
    first.f = Mock(__name__='f', side_effect=ValueError)

    second = Mock()
    second.f = Mock(__name__='f', return_value=2)

    go_round = RetryGoRound([first, second], is_io_error)

    for _ in range(2):
        with self.assertRaises(ValueError):
            go_round.f()

    self.assertEqual(first.f.call_count, 2)
    self.assertEqual(second.f.call_count, 0)
def test_success(self):
    """When the first alternative works, the second is never called."""
    def is_io_error(ex):
        return isinstance(ex, IOError)

    first = Mock()
    # need __name__ so wraps() will work
    first.f = Mock(__name__='f', return_value=1)

    second = Mock()
    second.f = Mock(__name__='f', return_value=2)

    go_round = RetryGoRound([first, second], is_io_error)

    result = go_round.f()

    self.assertEqual(result, 1)
    self.assertEqual(first.f.call_count, 1)
    # never needed to try second.f()
    self.assertEqual(second.f.call_count, 0)
def setUp(self):
    """Patch the task-log helpers so tests only deal with paths.

    Tests populate ``self.mock_paths`` and ``self.path_to_mock_result``
    (path -> parsed result). The patched ``_cat_log_lines()`` records
    each known path in ``self.mock_paths_catted`` and returns the path;
    the patched parsers look that path up in ``path_to_mock_result``,
    defaulting to ``{}`` for syslogs and ``None`` for stderr.
    ``self.mock_fs.exists()`` is true for any mock path and for
    ``'MOCK_LOG_DIR'``; ``self.mock_fs.ls()`` returns all mock paths.
    """
    super(InterpretTaskLogsTestCase, self).setUp()

    # rather than faking file contents, fake the results of
    # _parse_task_{syslog,stderr}() and let _cat_log_lines() hand the
    # path straight through to them
    self.mock_paths = []
    self.path_to_mock_result = {}
    self.mock_log_callback = Mock()
    self.mock_paths_catted = []

    def mock_cat_log_lines(fs, path):
        if path in self.mock_paths:
            self.mock_paths_catted.append(path)
        return path

    # (the real parsers receive lines from the log; here they receive
    # whatever mock_cat_log_lines() returned, i.e. the path)

    def mock_parse_task_syslog(path_from_mock_cat_log_lines):
        # default is {}
        results = self.path_to_mock_result
        return results.get(path_from_mock_cat_log_lines, {})

    def mock_parse_task_stderr(path_from_mock_cat_log_lines):
        # default is None
        results = self.path_to_mock_result
        return results.get(path_from_mock_cat_log_lines)

    def mock_exists(path):
        return path == 'MOCK_LOG_DIR' or path in self.mock_paths

    # need to mock ls so that _ls_task_logs() can work
    def mock_ls(log_dir):
        return self.mock_paths

    fs = Mock()
    fs.exists = Mock(side_effect=mock_exists)
    fs.ls = Mock(side_effect=mock_ls)
    self.mock_fs = fs

    self.mock_cat_log_lines = self.start(
        patch('mrjob.logs.task._cat_log_lines',
              side_effect=mock_cat_log_lines))

    parser_patches = (
        ('mrjob.logs.task._parse_task_syslog', mock_parse_task_syslog),
        ('mrjob.logs.task._parse_task_stderr', mock_parse_task_stderr),
    )
    for target, fake_parser in parser_patches:
        self.start(patch(target, side_effect=fake_parser))
def setUp(self):
    """Build a mock filesystem whose ``ls()`` replays ``self.mock_paths``.

    Each entry of ``self.mock_paths`` is yielded in order, except that
    an ``Exception`` instance in the list is raised instead.
    """
    super(LsLogsTestCase, self).setUp()

    self.mock_fs = Mock()
    self.mock_paths = []

    def mock_fs_ls(path):
        # we just ignore path, keeping it simple
        for entry in self.mock_paths:
            if isinstance(entry, Exception):
                raise entry
            yield entry

    self.mock_fs.ls = Mock(side_effect=mock_fs_ls)
def setUp(self):
    """Build a mock filesystem backed by ``self.mock_paths``.

    ``ls(log_dir)`` returns the mock paths that start with
    ``log_dir + '/'``, ``join()`` joins its arguments with ``'/'``, and
    ``exists()`` always returns ``True``.
    """
    super(LsTaskLogsTestCase, self).setUp()

    self.mock_paths = []

    def mock_ls(log_dir):
        dir_prefix = log_dir + '/'
        return [
            candidate for candidate in self.mock_paths
            if candidate.startswith(dir_prefix)
        ]

    def mock_join(path, *paths):
        return '/'.join((path,) + paths)

    fs = Mock()
    fs.exists = Mock(return_value=True)
    fs.join = Mock(side_effect=mock_join)
    fs.ls = Mock(side_effect=mock_ls)
    self.mock_fs = fs
def setUp(self):
    """Create three mock filesystems with distinct path predicates.

    ``hadoop_fs`` handles whatever ``is_uri()`` accepts, ``local_fs``
    handles whatever it rejects, and ``s3_fs`` handles whatever
    ``is_s3_uri()`` accepts. ``hadoop_fs`` and ``s3_fs`` each get one
    extra mocked method (``get_hadoop_version`` and ``create_bucket``).
    ``mrjob.fs.composite.log`` is patched and kept as ``self.log``.
    """
    super(CompositeFilesystemTestCase, self).setUp()

    self.log = self.start(patch('mrjob.fs.composite.log'))

    def is_not_uri(p):
        return not is_uri(p)

    hadoop_fs = Mock(spec=Filesystem)
    hadoop_fs.get_hadoop_version = Mock()
    hadoop_fs.can_handle_path.side_effect = is_uri
    self.hadoop_fs = hadoop_fs

    local_fs = Mock(spec=Filesystem)
    local_fs.can_handle_path.side_effect = is_not_uri
    self.local_fs = local_fs

    s3_fs = Mock(spec=Filesystem)
    s3_fs.create_bucket = Mock()
    s3_fs.can_handle_path.side_effect = is_s3_uri
    self.s3_fs = s3_fs
def test_syslog_with_corresponding_stderr(self):
    """A syslog error is paired with the error from the matching stderr.

    Both logs belong to the same attempt; the interpreted result should
    carry the hadoop error and the task error, each tagged with its
    path, and the stderr callback should be called once with the stderr
    path.
    """
    syslog_path = '/userlogs/attempt_201512232143_0008_m_000001_3/syslog'
    stderr_path = '/userlogs/attempt_201512232143_0008_m_000001_3/stderr'

    mock_stderr_callback = Mock()

    self.mock_paths = [syslog_path, stderr_path]
    self.path_to_mock_result = {
        syslog_path: {'hadoop_error': {'message': 'BOOM'}},
        stderr_path: {'message': 'because, exploding code'},
    }

    expected_error = {
        'attempt_id': 'attempt_201512232143_0008_m_000001_3',
        'hadoop_error': {
            'message': 'BOOM',
            'path': syslog_path,
        },
        'task_error': {
            'message': 'because, exploding code',
            'path': stderr_path,
        },
        'task_id': 'task_201512232143_0008_m_000001',
    }

    interpretation = self.interpret_task_logs(
        stderr_callback=mock_stderr_callback)

    self.assertEqual(
        interpretation,
        {'errors': [expected_error], 'partial': True})

    mock_stderr_callback.assert_called_once_with(stderr_path)
def test_syslog_with_empty_corresponding_stderr(self):
    """A syslog error stands alone when stderr yields no task error.

    Only the syslog path has a mock result, so the interpreted error has
    no ``task_error``. The log callback should have been called with the
    stderr path and then the syslog path.
    """
    syslog_path = '/userlogs/attempt_201512232143_0008_m_000001_3/syslog'
    stderr_path = '/userlogs/attempt_201512232143_0008_m_000001_3/stderr'

    mock_log_callback = Mock()

    self.mock_paths = [syslog_path, stderr_path]
    self.path_to_mock_result = {
        syslog_path: {'hadoop_error': {'message': 'BOOM'}},
    }

    expected_error = {
        'attempt_id': 'attempt_201512232143_0008_m_000001_3',
        'hadoop_error': {
            'message': 'BOOM',
            'path': syslog_path,
        },
        'task_id': 'task_201512232143_0008_m_000001',
    }

    interpretation = self.interpret_task_logs(
        log_callback=mock_log_callback)

    self.assertEqual(
        interpretation,
        {'errors': [expected_error], 'partial': True})

    expected_calls = [call(stderr_path), call(syslog_path)]
    self.assertEqual(mock_log_callback.call_args_list, expected_calls)
def _mock_runner_class(runner_alias):
    """Return a mock runner class that produces ``self.runner``.

    The mock copies ``alias`` and ``OPT_NAMES`` from the class that
    ``_runner_class(runner_alias)`` returns, and is also stored as
    ``self.runner_class`` (``self`` comes from the enclosing scope).
    """
    real_class = _runner_class(runner_alias)

    fake_class = Mock()
    fake_class.return_value = self.runner
    fake_class.alias = real_class.alias
    fake_class.OPT_NAMES = real_class.OPT_NAMES

    self.runner_class = fake_class
    return fake_class
def setUp(self):
    """Patch both task-log listers and mock the runner's log-dir stream.

    ``mrjob.logs.mixin._ls_task_logs`` and
    ``mrjob.logs.mixin._ls_spark_task_logs`` are patched through
    ``self.start()``; ``self.runner._stream_task_log_dirs`` becomes a
    plain mock.
    """
    super(LsTaskLogsTestCase, self).setUp()

    task_logs_patcher = patch('mrjob.logs.mixin._ls_task_logs')
    self._ls_task_logs = self.start(task_logs_patcher)

    spark_logs_patcher = patch('mrjob.logs.mixin._ls_spark_task_logs')
    self._ls_spark_task_logs = self.start(spark_logs_patcher)

    self.runner._stream_task_log_dirs = Mock()
def test_set_credentials_and_project_id(self):
    """Explicit credentials and project ID reach the client, warning-free.

    The filesystem's ``client`` should equal what ``self.Client``
    returns for the same ``project`` and ``credentials``.
    """
    credentials = Mock()
    project = 'alan-parsons'

    fs = GCSFilesystem(credentials=credentials, project_id=project)

    self.assertFalse(self.log.warning.called)

    actual_client = fs.client
    expected_client = self.Client(project=project, credentials=credentials)
    self.assertEqual(actual_client, expected_client)
def test_one_failure(self):
    """After a failover, the working alternative is used from then on.

    Attribute access starts on the first alternative (``x == 100``).
    Its ``f`` raises IOError, so the second alternative answers; after
    that, ``x`` comes from the second alternative and the first one's
    ``f`` is not called again.
    """
    def is_io_error(ex):
        return isinstance(ex, IOError)

    first = Mock()
    first.f = Mock(__name__='f', side_effect=IOError)
    first.x = 100

    second = Mock()
    second.f = Mock(__name__='f', return_value=2)
    second.x = 200

    go_round = RetryGoRound([first, second], is_io_error)

    self.assertEqual(go_round.x, 100)

    self.assertEqual(go_round.f(), 2)
    # second was the last alternative that worked, so now we get x from it
    self.assertEqual(go_round.x, 200)

    # this time we should skip calling first.f() entirely
    self.assertEqual(go_round.f(), 2)

    self.assertEqual(first.f.call_count, 1)
    self.assertEqual(second.f.call_count, 2)
def test_no_output(self):
    """With ``--no-output``, nothing is written to stdout or stderr.

    The mocked runner would stream one line of output, but the
    launcher's sandboxed stdout and stderr should both stay empty.
    """
    cli_args = ['--no-conf', '--no-output', '']
    launcher = MRJobLauncher(args=cli_args)
    launcher.sandbox()

    with patch.object(launcher, 'make_runner') as make_runner_mock:
        fake_runner = Mock()
        _mock_context_mgr(make_runner_mock, fake_runner)
        fake_runner.stream_output.return_value = ['a line']

        launcher.run_job()

        for stream in (launcher.stdout, launcher.stderr):
            self.assertEqual(stream.getvalue(), b'')
def setUp(self):
    """Patch the bootstrap-log helpers and create a mock filesystem.

    ``mrjob.logs.bootstrap._parse_task_stderr`` is patched to return
    ``dict(message='BOOM!\\n')`` and ``mrjob.logs.bootstrap._cat_log`` is
    patched with a default mock; both mocks are kept on the test case.
    """
    super(InterpretEMRBootstrapStderrTestCase, self).setUp()

    self.mock_fs = Mock()

    parse_patcher = patch(
        'mrjob.logs.bootstrap._parse_task_stderr',
        return_value={'message': 'BOOM!\n'})
    self.mock_parse_task_stderr = self.start(parse_patcher)

    cat_patcher = patch('mrjob.logs.bootstrap._cat_log')
    self.mock_cat_log = self.start(cat_patcher)
def test_all_fail(self):
    """When every alternative fails, the IOError propagates.

    Attribute access stays on the first alternative (``x == 100``)
    after the failure, and a second call tries both alternatives again.
    """
    def is_io_error(ex):
        return isinstance(ex, IOError)

    first = Mock()
    first.f = Mock(__name__='f', side_effect=IOError)
    first.x = 100

    second = Mock()
    second.f = Mock(__name__='f', side_effect=IOError)
    second.x = 200

    go_round = RetryGoRound([first, second], is_io_error)

    self.assertEqual(go_round.x, 100)

    # ran out of alternatives
    with self.assertRaises(IOError):
        go_round.f()

    # nothing worked, so we're still pointing at first
    self.assertEqual(go_round.x, 100)

    # yep, still broken
    with self.assertRaises(IOError):
        go_round.f()

    self.assertEqual(first.f.call_count, 2)
    self.assertEqual(second.f.call_count, 2)