def setUp(self):
    super(StreamingArgsTestCase, self).setUp()
    self.runner = HadoopJobRunner(
        hadoop_bin='hadoop', hadoop_streaming_jar='<streaming jar>',
        mr_job_script='my_job.py', stdin=BytesIO())
    self.runner._add_job_files_for_upload()

    self.start(patch.object(
        self.runner, '_upload_args', return_value=['<upload args>']))
    self.start(patch.object(
        self.runner, '_hadoop_args_for_step',
        return_value=['<hadoop args for step>']))
    self.start(patch.object(
        self.runner, '_hdfs_step_input_files',
        return_value=['<hdfs step input files>']))
    self.start(patch.object(
        self.runner, '_hdfs_step_output_dir',
        return_value='<hdfs step output dir>'))
    self.start(patch.object(
        HadoopFilesystem, 'get_hadoop_version', return_value='2.7.1'))

    self.runner._script_path = 'my_job.py'
def setUp(self):
    # patch boto3
    self.mock_emr_failures = set()
    self.mock_emr_self_termination = set()
    self.mock_emr_clusters = {}
    self.mock_emr_output = {}
    self.mock_iam_instance_profiles = {}
    self.mock_iam_role_attached_policies = {}
    self.mock_iam_roles = {}
    self.mock_s3_fs = {}

    self.emr_client = None  # used by simulate_emr_progress()
    self.emr_client_counter = itertools.repeat(None, self.MAX_EMR_CLIENTS)

    self.start(patch.object(boto3, 'client', self.client))
    self.start(patch.object(boto3, 'resource', self.resource))

    super(MockBoto3TestCase, self).setUp()

    # patch slow things
    self.mrjob_zip_path = None

    def fake_create_mrjob_zip(mocked_runner, *args, **kwargs):
        if not self.mrjob_zip_path:
            self.mrjob_zip_path = self.makefile('fake_mrjob.zip')
        mocked_runner._mrjob_zip_path = self.mrjob_zip_path
        return self.mrjob_zip_path

    self.start(patch.object(
        EMRJobRunner, '_create_mrjob_zip', fake_create_mrjob_zip))

    self.start(patch.object(time, 'sleep'))
def setUp(self):
    super(StreamingArgsTestCase, self).setUp()
    self.runner = HadoopJobRunner(
        hadoop_bin='hadoop', hadoop_streaming_jar='streaming.jar',
        mr_job_script='my_job.py', stdin=BytesIO())
    self.runner._add_job_files_for_upload()

    self.runner._hadoop_version = '0.20.204'
    self.start(patch.object(
        self.runner, '_upload_args', return_value=['new_upload_args']))
    self.start(patch.object(
        self.runner, '_pre_0_20_upload_args',
        return_value=['old_upload_args']))
    self.start(patch.object(
        self.runner, '_hadoop_args_for_step',
        return_value=['hadoop_args_for_step']))
    self.start(patch.object(
        self.runner, '_hdfs_step_input_files',
        return_value=['hdfs_step_input_files']))
    self.start(patch.object(
        self.runner, '_hdfs_step_output_dir',
        return_value='hdfs_step_output_dir'))

    self.runner._script_path = 'my_job.py'

    self._new_basic_args = [
        'hadoop', 'jar', 'streaming.jar',
        'new_upload_args', 'hadoop_args_for_step',
        '-input', 'hdfs_step_input_files',
        '-output', 'hdfs_step_output_dir']

    self._old_basic_args = [
        'hadoop', 'jar', 'streaming.jar',
        'hadoop_args_for_step',
        '-input', 'hdfs_step_input_files',
        '-output', 'hdfs_step_output_dir',
        'old_upload_args']
def setUp(self):
    super(MockGoogleAPITestCase, self).setUp()

    self._dataproc_client = MockDataprocClient(self)

    self.start(patch.object(
        DataprocJobRunner, 'api_client', self._dataproc_client))

    self.start(patch('mrjob.dataproc._read_gcloud_config',
                     lambda: _GCLOUD_CONFIG))

    # patch slow things
    self.mrjob_zip_path = None

    def fake_create_mrjob_zip(runner, *args, **kwargs):
        if not self.mrjob_zip_path:
            self.mrjob_zip_path = self.makefile('fake_mrjob.zip')
        runner._mrjob_zip_path = self.mrjob_zip_path
        return self.mrjob_zip_path

    self.start(patch.object(
        DataprocJobRunner, '_create_mrjob_zip', fake_create_mrjob_zip))

    self.start(patch.object(time, 'sleep'))
def test_find_hadoop_streaming_jar(self):
    # os.walk() yields (dirpath, dirnames, filenames), with filenames a list

    # not just any jar will do
    with patch.object(os, 'walk', return_value=[
            ('/some_dir', None, ['mason.jar'])]):
        self.assertEqual(find_hadoop_streaming_jar('/some_dir'), None)

    # should match streaming jar and return its full path
    with patch.object(os, 'walk', return_value=[
            ('/some_dir', None, ['hadoop-0.20.2-streaming.jar'])]):
        self.assertEqual(find_hadoop_streaming_jar('/some_dir'),
                         '/some_dir/hadoop-0.20.2-streaming.jar')

    # shouldn't find anything in an empty dir
    with patch.object(os, 'walk', return_value=[]):
        self.assertEqual(find_hadoop_streaming_jar('/some_dir'), None)
def test_works_with_built_in_json_module(self):
    # regression test: make sure we're not trying to serialize dict_items
    self.start(patch.object(
        MRTextClassifier, 'INTERNAL_PROTOCOL', StandardJSONProtocol))
    self.start(patch.object(
        MRTextClassifier, 'OUTPUT_PROTOCOL', StandardJSONProtocol))

    docs_paths = glob(join(
        dirname(mrjob.__file__), 'examples', 'docs-to-classify', '*'))

    # use --min-df 1 because we have so few documents
    job_args = ['--min-df', '1'] + docs_paths

    run_job(MRTextClassifier(job_args))
def test_kill_persistent_cluster(self):
    with no_handlers_for_logger("mrjob.dataproc"):
        r = self._quick_runner()
        with patch.object(mrjob.dataproc.DataprocJobRunner,
                          "_api_cluster_delete") as m:
            r._opts["cluster_id"] = "j-MOCKCLUSTER0"
            r._cleanup_cluster()
            self.assertTrue(m.called)
def mrjob_conf_patcher(substitute_conf=EMPTY_MRJOB_CONF):
    def mock_load_opts_from_mrjob_confs(runner_alias, conf_paths=None):
        return [(None, substitute_conf.get(
            'runners', {}).get(runner_alias, {}))]

    return patch.object(runner, 'load_opts_from_mrjob_confs',
                        mock_load_opts_from_mrjob_confs)
def simple_patch(self, obj, attr, side_effect=None, return_value=None):
    patcher = patch.object(obj, attr, side_effect=side_effect,
                           return_value=return_value)
    patcher.start()
    self.addCleanup(patcher.stop)
def test_hadoop_runner_option_store(self):
    stderr = StringIO()

    with no_handlers_for_logger('mrjob.conf'):
        log_to_stream('mrjob.conf', stderr)

        # HadoopRunnerOptionStore really wants to find the streaming jar
        with patch.object(mrjob.hadoop, 'find_hadoop_streaming_jar',
                          return_value='found'):
            opts = HadoopRunnerOptionStore(
                'hadoop',
                dict(base_tmp_dir='/scratch',
                     hadoop_home='required',
                     hdfs_scratch_dir='hdfs:///scratch'),
                [])

            self.assertEqual(opts['local_tmp_dir'], '/scratch')
            self.assertNotIn('base_tmp_dir', opts)
            self.assertIn(
                'Deprecated option base_tmp_dir has been renamed'
                ' to local_tmp_dir', stderr.getvalue())

            self.assertEqual(opts['hadoop_tmp_dir'], 'hdfs:///scratch')
            self.assertNotIn('hdfs_scratch_dir', opts)
            self.assertIn(
                'Deprecated option hdfs_scratch_dir has been renamed'
                ' to hadoop_tmp_dir', stderr.getvalue())
def test_kill_cluster(self):
    with no_handlers_for_logger('mrjob.dataproc'):
        r = self._quick_runner()
        with patch.object(mrjob.dataproc.DataprocJobRunner,
                          '_api_cluster_delete') as m:
            r._cleanup_cluster()
            self.assertTrue(m.called)
def test_kill_persistent_cluster(self):
    with no_handlers_for_logger('mrjob.dataproc'):
        r = self._quick_runner()
        with patch.object(mrjob.dataproc.DataprocJobRunner,
                          '_api_cluster_delete') as m:
            r._opts['cluster_id'] = 'j-MOCKCLUSTER0'
            r._cleanup_cluster()
            self.assertTrue(m.called)
def test_verbose(self):
    with patch.object(sys, 'stderr', StringIO()) as stderr:
        MRJob.set_up_logging(verbose=True)
        log = logging.getLogger('__main__')
        log.info('INFO')
        log.debug('DEBUG')
        self.assertEqual(stderr.getvalue(), 'INFO\nDEBUG\n')
def test_kill_persistent_cluster(self):
    with no_handlers_for_logger('mrjob.dataproc'):
        r = self._quick_runner()
        with patch.object(mrjob.dataproc.DataprocJobRunner,
                          '_delete_cluster') as m:
            r._opts['cluster_id'] = 'j-MOCKCLUSTER0'
            r._cleanup_cluster()
            self.assertTrue(m.called)
def test_put_part_size_mb(self):
    local_path = self.makefile('foo', contents=b'bar')
    dest = 'gs://bar-files/foo'
    self.storage_client().bucket('bar-files').create()

    with patch.object(GCSFilesystem, '_blob') as blob_meth:
        self.fs.put(local_path, dest, part_size_mb=99999)
        blob_meth.assert_called_once_with(dest, chunk_size=99999)
def test_default_options(self):
    with no_handlers_for_logger('__main__'):
        with patch.object(sys, 'stderr', StringIO()) as stderr:
            MRJob.set_up_logging()
            log = logging.getLogger('__main__')
            log.info('INFO')
            log.debug('DEBUG')
            self.assertEqual(stderr.getvalue(), 'INFO\n')
def test_find_hadoop_streaming_jar(self):
    # os.walk() yields (dirpath, dirnames, filenames), with filenames a list

    # not just any jar will do
    with patch.object(os, 'walk', return_value=[
            ('/some_dir', None, ['mason.jar'])]):
        self.assertEqual(find_hadoop_streaming_jar('/some_dir'), None)

    # should match streaming jar and return its full path
    with patch.object(os, 'walk', return_value=[
            ('/some_dir', None, ['hadoop-0.20.2-streaming.jar'])]):
        self.assertEqual(find_hadoop_streaming_jar('/some_dir'),
                         '/some_dir/hadoop-0.20.2-streaming.jar')

    # shouldn't find anything in an empty dir
    with patch.object(os, 'walk', return_value=[]):
        self.assertEqual(find_hadoop_streaming_jar('/some_dir'), None)
def test_path_join(self):
    fs = Filesystem()

    with patch.object(fs, 'join'):
        with no_handlers_for_logger('mrjob.fs.base'):
            fs.path_join('foo', 'bar')

        fs.join.assert_called_once_with('foo', 'bar')
def test_path_join(self):
    fs = Filesystem()

    with patch.object(fs, "join"):
        with no_handlers_for_logger("mrjob.fs.base"):
            fs.path_join("foo", "bar")

        fs.join.assert_called_once_with("foo", "bar")
def test_path_exists(self):
    fs = Filesystem()

    with patch.object(fs, "exists"):
        with no_handlers_for_logger("mrjob.fs.base"):
            fs.path_exists("foo")

        fs.exists.assert_called_once_with("foo")
def test_path_exists(self):
    fs = Filesystem()

    with patch.object(fs, 'exists'):
        with no_handlers_for_logger('mrjob.fs.base'):
            fs.path_exists('foo')

        fs.exists.assert_called_once_with('foo')
def test_too_many_jobs_on_the_dance_floor(self):
    def fake_popen(*args, **kwargs):
        m = Mock()
        m.communicate.return_value = (b"2 jobs currently running\n", b'')
        return m

    with patch.object(ssh, 'Popen', side_effect=fake_popen):
        self.assertRaises(IOError, ssh.ssh_terminate_single_job,
                          ['ssh_bin'], 'address', 'key.pem')
def test_kill_cluster_if_successful(self):
    # If they are setting up the cleanup to kill the cluster, mrjob should
    # kill the cluster independent of job success.
    with no_handlers_for_logger('mrjob.dataproc'):
        r = self._quick_runner()
        with patch.object(mrjob.dataproc.DataprocJobRunner,
                          '_api_cluster_delete') as m:
            r._ran_job = True
            r._cleanup_cluster()
            self.assertTrue(m.called)
def test_junk_list_output(self):
    def fake_popen(*args, **kwargs):
        m = Mock()
        m.communicate.return_value = (b"yah output, its gahbage\n", b'')
        return m

    with patch.object(ssh, 'Popen', side_effect=fake_popen):
        self.assertRaises(IOError, ssh.ssh_terminate_single_job,
                          ['ssh_bin'], 'address', 'key.pem')
def test_libjars_attr_relative_path(self):
    job_dir = os.path.dirname(MRJob.mr_job_script())

    with patch.object(MRJob, "LIBJARS", ["cookie.jar", "/left/dora.jar"]):
        job = MRJob()

        self.assertEqual(
            job.job_runner_kwargs()["libjars"],
            [os.path.join(job_dir, "cookie.jar"), "/left/dora.jar"])
def test_libjars_attr_relative_path(self):
    job_dir = os.path.dirname(MRJob.mr_job_script())

    with patch.object(MRJob, 'LIBJARS', ['cookie.jar', '/left/dora.jar']):
        job = MRJob()

        self.assertEqual(
            job._runner_kwargs()['libjars'],
            [os.path.join(job_dir, 'cookie.jar'), '/left/dora.jar'])
def setUp(self):
    super(StreamingArgsTestCase, self).setUp()
    self.runner = HadoopJobRunner(
        hadoop_bin='hadoop', hadoop_streaming_jar='<streaming jar>',
        mr_job_script='my_job.py', stdin=BytesIO())
    self.runner._add_job_files_for_upload()

    self.start(patch.object(
        self.runner, '_upload_args', return_value=['<upload args>']))
    self.start(patch.object(
        self.runner, '_hadoop_args_for_step',
        return_value=['<hadoop args for step>']))
    self.start(patch.object(
        self.runner, '_hdfs_step_input_files',
        return_value=['<hdfs step input files>']))
    self.start(patch.object(
        self.runner, '_hdfs_step_output_dir',
        return_value='<hdfs step output dir>'))
    self.start(patch.object(
        HadoopFilesystem, 'get_hadoop_version', return_value='2.7.1'))

    self.runner._script_path = 'my_job.py'
def test_no_output(self):
    launcher = MRJobLauncher(args=['--no-conf', '--no-output', ''])
    launcher.sandbox()

    with patch.object(launcher, 'make_runner') as m_make_runner:
        runner = Mock()
        _mock_context_mgr(m_make_runner, runner)
        runner.stream_output.return_value = ['a line']

        launcher.run_job()

        self.assertEqual(launcher.stdout.getvalue(), b'')
        self.assertEqual(launcher.stderr.getvalue(), b'')
def test_no_mrjob_confs(self):
    with patch.object(conf, 'real_mrjob_conf_path', return_value=None):
        mr_job = MRIncrementerJob(['-r', 'inline', '--times', '2'])
        mr_job.sandbox(stdin=BytesIO(b'0\n1\n2\n'))

        with mr_job.make_runner() as runner:
            runner.run()
            output = sorted(mr_job.parse_output_line(line)[1]
                            for line in runner.stream_output())
            self.assertEqual(output, [2, 3, 4])
def setUp(self):
    def error(msg=None):
        if msg:
            raise ValueError(msg)
        else:
            raise ValueError

    p = patch.object(cmd, 'error', side_effect=error)
    p.start()
    self.addCleanup(p.stop)
def test_configuration_translation(self):
    job = MRWordCount(
        ["--jobconf", "mapred.jobtracker.maxtasks.per.job=1"])

    with job.make_runner() as runner:
        with no_handlers_for_logger("mrjob.runner"):
            with patch.object(runner, "get_hadoop_version",
                              return_value="2.7.1"):
                self.assertEqual(
                    runner._hadoop_args_for_step(0),
                    ["-D", "mapred.jobtracker.maxtasks.per.job=1",
                     "-D", "mapreduce.jobtracker.maxtasks.perjob=1"])
def setUp(self):
    super(StreamingArgsTestCase, self).setUp()
    self.runner = HadoopJobRunner(
        hadoop_bin='hadoop', hadoop_streaming_jar='streaming.jar',
        mr_job_script='my_job.py', stdin=BytesIO())
    self.runner._add_job_files_for_upload()

    self.runner._hadoop_version = '0.20.204'
    self.start(patch.object(
        self.runner, '_upload_args', return_value=['new_upload_args']))
    self.start(patch.object(
        self.runner, '_pre_0_20_upload_args',
        return_value=['old_upload_args']))
    self.start(patch.object(
        self.runner, '_hadoop_args_for_step',
        return_value=['hadoop_args_for_step']))
    self.start(patch.object(
        self.runner, '_hdfs_step_input_files',
        return_value=['hdfs_step_input_files']))
    self.start(patch.object(
        self.runner, '_hdfs_step_output_dir',
        return_value='hdfs_step_output_dir'))

    self.runner._script_path = 'my_job.py'

    self._new_basic_args = [
        'hadoop', 'jar', 'streaming.jar',
        'new_upload_args', 'hadoop_args_for_step',
        '-input', 'hdfs_step_input_files',
        '-output', 'hdfs_step_output_dir']

    self._old_basic_args = [
        'hadoop', 'jar', 'streaming.jar',
        'hadoop_args_for_step',
        '-input', 'hdfs_step_input_files',
        '-output', 'hdfs_step_output_dir',
        'old_upload_args']
def test_dance_floor_is_empty(self):
    def fake_popen(*args, **kwargs):
        m = Mock()
        m.communicate.return_value = (b"0 jobs currently running\n", b'')
        return m

    with patch.object(ssh, 'Popen', side_effect=fake_popen):
        self.assertEqual(
            None,
            ssh.ssh_terminate_single_job(
                ['ssh_bin'], 'address', 'key.pem'))
def test_dance_floor_is_empty(self):
    def fake_popen(*args, **kwargs):
        m = Mock()
        m.communicate.return_value = (b"0 jobs currently running\n", b'')
        return m

    with patch.object(ssh, 'Popen', side_effect=fake_popen):
        self.assertEqual(
            None,
            ssh.ssh_terminate_single_job(
                ['ssh_bin'], 'address', 'key.pem'))
def test_put_chunk_size(self):
    local_path = self.makefile('foo', contents=b'bar')
    dest = 'gs://bar-files/foo'
    self.storage_client().bucket('bar-files').create()

    with patch.object(GCSFilesystem, '_blob') as blob_meth:
        with patch('mrjob.fs.gcs.log') as log:
            self.fs.put(local_path, dest, chunk_size=99999)
            blob_meth.assert_called_once_with(dest, chunk_size=99999)
            self.assertTrue(log.warning.called)
def test_libjars_environment_variables(self):
    job_dir = os.path.dirname(MRJob.mr_job_script())

    with patch.dict("os.environ", A="/path/to/a", B="b"):
        with patch.object(MRJob, "LIBJARS",
                          ["$A/cookie.jar", "$B/honey.jar"]):
            job = MRJob()

            # libjars() peeks into envvars to figure out if the path
            # is relative or absolute
            self.assertEqual(
                job.job_runner_kwargs()["libjars"],
                ["$A/cookie.jar", os.path.join(job_dir, "$B/honey.jar")])
def setUp(self):
    self._dataproc_client = MockDataprocClient(self)
    self._gcs_client = MockGCSClient(self)
    self._gcs_fs = self._gcs_client._fs

    self.start(patch.object(
        DataprocJobRunner, 'api_client', self._dataproc_client))

    self.gcs_patch_api_client = patch.object(
        GCSFilesystem, 'api_client', self._gcs_client)
    self.gcs_patch_download_io = patch.object(
        GCSFilesystem, '_download_io', self._gcs_client.download_io)
    self.gcs_patch_upload_io = patch.object(
        GCSFilesystem, '_upload_io', self._gcs_client.upload_io)
    self.start(self.gcs_patch_api_client)
    self.start(self.gcs_patch_download_io)
    self.start(self.gcs_patch_upload_io)

    self.start(patch('mrjob.dataproc._read_gcloud_config',
                     lambda: _GCLOUD_CONFIG))

    super(MockGoogleAPITestCase, self).setUp()

    # patch slow things
    def fake_create_mrjob_tar_gz(mocked_self, *args, **kwargs):
        mocked_self._mrjob_tar_gz_path = self.fake_mrjob_tgz_path
        return self.fake_mrjob_tgz_path

    self.start(patch.object(
        DataprocJobRunner, '_create_mrjob_tar_gz',
        fake_create_mrjob_tar_gz))

    self.start(patch.object(time, 'sleep'))
def setUp(self):
    self._dataproc_client = MockDataprocClient(self)
    self._gcs_client = MockGCSClient(self)
    self._gcs_fs = self._gcs_client._fs

    self.start(patch.object(
        DataprocJobRunner, 'api_client', self._dataproc_client))

    self.gcs_patch_api_client = patch.object(
        GCSFilesystem, 'api_client', self._gcs_client)
    self.gcs_patch_download_io = patch.object(
        GCSFilesystem, '_download_io', self._gcs_client.download_io)
    self.gcs_patch_upload_io = patch.object(
        GCSFilesystem, '_upload_io', self._gcs_client.upload_io)
    self.start(self.gcs_patch_api_client)
    self.start(self.gcs_patch_download_io)
    self.start(self.gcs_patch_upload_io)

    self.start(patch('mrjob.dataproc._read_gcloud_config',
                     lambda: _GCLOUD_CONFIG))

    super(MockGoogleAPITestCase, self).setUp()

    # patch slow things
    def fake_create_mrjob_zip(mocked_self, *args, **kwargs):
        mocked_self._mrjob_zip_path = self.fake_mrjob_zip_path
        return self.fake_mrjob_zip_path

    self.start(patch.object(
        DataprocJobRunner, '_create_mrjob_zip', fake_create_mrjob_zip))

    self.start(patch.object(time, 'sleep'))
def setUp(self):
    """disable all logging handlers"""
    # Extra logging messages were cluttering Travis CI. See #1793
    super(BasicTestCase, self).setUp()

    for name in ['', '__main__', 'mrjob']:
        log = logging.getLogger(name)
        self.start(patch.object(log, 'handlers', []))

        if not name:
            # add a dummy handler to the root logger
            log.addHandler(NullHandler())
def test_libjars_environment_variables(self):
    job_dir = os.path.dirname(MRJob.mr_job_script())

    with patch.dict('os.environ', A='/path/to/a', B='b'):
        with patch.object(MRJob, 'LIBJARS',
                          ['$A/cookie.jar', '$B/honey.jar']):
            job = MRJob()

            # libjars() peeks into envvars to figure out if the path
            # is relative or absolute
            self.assertEqual(
                job._runner_kwargs()['libjars'],
                ['$A/cookie.jar', os.path.join(job_dir, '$B/honey.jar')])
def test_configuration_translation(self):
    job = MRWordCount(
        ['--jobconf', 'mapred.jobtracker.maxtasks.per.job=1'])

    with job.make_runner() as runner:
        with no_handlers_for_logger('mrjob.runner'):
            with patch.object(runner, 'get_hadoop_version',
                              return_value='2.7.1'):
                self.assertEqual(
                    runner._hadoop_args_for_step(0),
                    ['-D', 'mapred.jobtracker.maxtasks.per.job=1',
                     '-D', 'mapreduce.jobtracker.maxtasks.perjob=1'])
def test_junk_kill_output(self):
    values = [self.GOOD_LIST_OUTPUT, b"yah output, its gahbage\n"]

    def fake_popen(*args, **kwargs):
        m = Mock()
        m.communicate.return_value = (values.pop(0), b'')
        return m

    with patch.object(ssh, 'Popen', side_effect=fake_popen):
        self.assertEqual(
            ssh.ssh_terminate_single_job(
                ['ssh_bin'], 'address', 'key.pem'),
            'yah output, its gahbage\n')
def test_junk_kill_output(self):
    values = [self.GOOD_LIST_OUTPUT, b"yah output, its gahbage\n"]

    def fake_popen(*args, **kwargs):
        m = Mock()
        m.communicate.return_value = (values.pop(0), b'')
        return m

    with patch.object(ssh, 'Popen', side_effect=fake_popen):
        self.assertEqual(
            ssh.ssh_terminate_single_job(
                ['ssh_bin'], 'address', 'key.pem'),
            'yah output, its gahbage\n')