Example #1
    def setUp(self):
        super(StreamingArgsTestCase, self).setUp()
        self.runner = HadoopJobRunner(hadoop_bin='hadoop',
                                      hadoop_streaming_jar='<streaming jar>',
                                      mr_job_script='my_job.py',
                                      stdin=BytesIO())
        self.runner._add_job_files_for_upload()

        self.start(
            patch.object(self.runner,
                         '_upload_args',
                         return_value=['<upload args>']))
        self.start(
            patch.object(self.runner,
                         '_hadoop_args_for_step',
                         return_value=['<hadoop args for step>']))
        self.start(
            patch.object(self.runner,
                         '_hdfs_step_input_files',
                         return_value=['<hdfs step input files>']))
        self.start(
            patch.object(self.runner,
                         '_hdfs_step_output_dir',
                         return_value='<hdfs step output dir>'))
        self.start(
            patch.object(HadoopFilesystem,
                         'get_hadoop_version',
                         return_value='2.7.1'))
        self.runner._script_path = 'my_job.py'
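
This and many of the following examples call self.start(...), a small helper in mrjob's test base class that starts a patcher and registers its stop() as a cleanup, so every patch is undone at teardown. A minimal sketch of the idea (not mrjob's exact source):

    import unittest

    class PatcherTestCase(unittest.TestCase):

        def start(self, patcher):
            """Start a patcher, schedule its stop() for teardown,
            and return the mock it created."""
            self.addCleanup(patcher.stop)
            return patcher.start()
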
Example #2
    def setUp(self):
        # patch boto3
        self.mock_emr_failures = set()
        self.mock_emr_self_termination = set()
        self.mock_emr_clusters = {}
        self.mock_emr_output = {}
        self.mock_iam_instance_profiles = {}
        self.mock_iam_role_attached_policies = {}
        self.mock_iam_roles = {}
        self.mock_s3_fs = {}

        self.emr_client = None  # used by simulate_emr_progress()
        self.emr_client_counter = itertools.repeat(None, self.MAX_EMR_CLIENTS)

        self.start(patch.object(boto3, 'client', self.client))
        self.start(patch.object(boto3, 'resource', self.resource))

        super(MockBoto3TestCase, self).setUp()

        # patch slow things
        self.mrjob_zip_path = None

        def fake_create_mrjob_zip(mocked_runner, *args, **kwargs):
            if not self.mrjob_zip_path:
                self.mrjob_zip_path = self.makefile('fake_mrjob.zip')

            mocked_runner._mrjob_zip_path = self.mrjob_zip_path

            return self.mrjob_zip_path

        self.start(
            patch.object(EMRJobRunner, '_create_mrjob_zip',
                         fake_create_mrjob_zip))

        self.start(patch.object(time, 'sleep'))
Example #3
    def setUp(self):
        super(StreamingArgsTestCase, self).setUp()
        self.runner = HadoopJobRunner(
            hadoop_bin='hadoop', hadoop_streaming_jar='streaming.jar',
            mr_job_script='my_job.py', stdin=BytesIO())
        self.runner._add_job_files_for_upload()

        self.runner._hadoop_version = '0.20.204'
        self.start(patch.object(self.runner, '_upload_args',
                                return_value=['new_upload_args']))
        self.start(patch.object(self.runner, '_pre_0_20_upload_args',
                                return_value=['old_upload_args']))
        self.start(patch.object(self.runner, '_hadoop_args_for_step',
                                return_value=['hadoop_args_for_step']))
        self.start(patch.object(self.runner, '_hdfs_step_input_files',
                                return_value=['hdfs_step_input_files']))
        self.start(patch.object(self.runner, '_hdfs_step_output_dir',
                                return_value='hdfs_step_output_dir'))
        self.runner._script_path = 'my_job.py'

        self._new_basic_args = [
            'hadoop', 'jar', 'streaming.jar',
            'new_upload_args', 'hadoop_args_for_step',
            '-input', 'hdfs_step_input_files',
            '-output', 'hdfs_step_output_dir']

        self._old_basic_args = [
            'hadoop', 'jar', 'streaming.jar',
            'hadoop_args_for_step',
            '-input', 'hdfs_step_input_files',
            '-output', 'hdfs_step_output_dir',
            'old_upload_args']
Example #4
    def setUp(self):
        super(MockGoogleAPITestCase, self).setUp()

        self._dataproc_client = MockDataprocClient(self)

        self.start(patch.object(
            DataprocJobRunner, 'api_client', self._dataproc_client))

        self.start(patch('mrjob.dataproc._read_gcloud_config',
                         lambda: _GCLOUD_CONFIG))

        # patch slow things
        self.mrjob_zip_path = None

        def fake_create_mrjob_zip(runner, *args, **kwargs):
            if not self.mrjob_zip_path:
                self.mrjob_zip_path = self.makefile('fake_mrjob.zip')

            runner._mrjob_zip_path = self.mrjob_zip_path
            return self.mrjob_zip_path

        self.start(patch.object(
            DataprocJobRunner, '_create_mrjob_zip',
            fake_create_mrjob_zip))

        self.start(patch.object(time, 'sleep'))
Example #5
    def test_find_hadoop_streaming_jar(self):
        # not just any jar will do
        with patch.object(os, 'walk', return_value=[
                ('/some_dir', None, ['mason.jar'])]):
            self.assertEqual(find_hadoop_streaming_jar('/some_dir'), None)

        # should match the streaming jar and return its full path
        with patch.object(os, 'walk', return_value=[
                ('/some_dir', None, ['hadoop-0.20.2-streaming.jar'])]):
            self.assertEqual(find_hadoop_streaming_jar('/some_dir'),
                             '/some_dir/hadoop-0.20.2-streaming.jar')

        # shouldn't find anything in an empty dir
        with patch.object(os, 'walk', return_value=[]):
            self.assertEqual(find_hadoop_streaming_jar('/some_dir'), None)
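
For context, find_hadoop_streaming_jar() walks a directory tree and returns the path of the first file whose name looks like a streaming jar, or None if there is none; that is why the patched os.walk() return values above mimic (dirpath, dirnames, filenames) tuples. A rough sketch of the function under test (not mrjob's exact source; the regex is an approximation):

    import os
    import re

    # approximate pattern; mrjob's real regex may differ
    _STREAMING_JAR_RE = re.compile(r'^hadoop.*streaming.*\.jar$')

    def find_hadoop_streaming_jar(path):
        for dirpath, _, filenames in os.walk(path):
            for filename in filenames:
                if _STREAMING_JAR_RE.match(filename):
                    return os.path.join(dirpath, filename)
        return None
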
Example #6
    def test_works_with_built_in_json_module(self):
        # regression test: make sure we're not trying to serialize dict_items
        self.start(patch.object(MRTextClassifier,
                                'INTERNAL_PROTOCOL', StandardJSONProtocol))
        self.start(patch.object(MRTextClassifier,
                                'OUTPUT_PROTOCOL', StandardJSONProtocol))

        docs_paths = glob(join(
            dirname(mrjob.__file__), 'examples', 'docs-to-classify', '*'))

        # use --min-df 1 because we have so few documents
        job_args = ['--min-df', '1'] + docs_paths

        run_job(MRTextClassifier(job_args))
Example #7
    def test_kill_persistent_cluster(self):
        with no_handlers_for_logger("mrjob.dataproc"):
            r = self._quick_runner()
            with patch.object(mrjob.dataproc.DataprocJobRunner,
                              "_api_cluster_delete") as m:
                r._opts["cluster_id"] = "j-MOCKCLUSTER0"
                r._cleanup_cluster()
                self.assertTrue(m.called)
Example #8
def mrjob_conf_patcher(substitute_conf=EMPTY_MRJOB_CONF):
    def mock_load_opts_from_mrjob_confs(runner_alias, conf_paths=None):
        return [(None, substitute_conf.get('runners',
                                           {}).get(runner_alias, {}))]

    return patch.object(runner, 'load_opts_from_mrjob_confs',
                        mock_load_opts_from_mrjob_confs)
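
Note that mrjob_conf_patcher() returns the patcher without starting it, so the caller decides how to apply it. For reference, a self-contained sketch of the three ways a patch.object() patcher can be used:

    import os
    from unittest.mock import patch

    # 1. as a context manager: the attribute is restored on exit
    with patch.object(os, 'getcwd', return_value='/fake/dir'):
        assert os.getcwd() == '/fake/dir'

    # 2. as a decorator: the replacement mock is passed in as an argument
    @patch.object(os, 'getcwd', return_value='/fake/dir')
    def check(mock_getcwd):
        assert os.getcwd() == '/fake/dir'
        assert mock_getcwd.called

    check()

    # 3. via start()/stop(), as in the setUp() examples on this page
    patcher = patch.object(os, 'getcwd', return_value='/fake/dir')
    patcher.start()
    assert os.getcwd() == '/fake/dir'
    patcher.stop()
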
Example #9
    def simple_patch(self, obj, attr, side_effect=None, return_value=None):
        patcher = patch.object(obj,
                               attr,
                               side_effect=side_effect,
                               return_value=return_value)
        patcher.start()
        self.addCleanup(patcher.stop)
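
A self-contained sketch of how a helper like simple_patch() is used in practice (the Config class and its fetch() method are made up for illustration):

    import unittest
    from unittest.mock import patch

    class Config:
        def fetch(self):
            return 'real value'

    class SimplePatchExample(unittest.TestCase):

        def simple_patch(self, obj, attr, side_effect=None, return_value=None):
            patcher = patch.object(obj, attr,
                                   side_effect=side_effect,
                                   return_value=return_value)
            patcher.start()
            # undo the patch automatically when this test finishes
            self.addCleanup(patcher.stop)

        def test_fetch_is_patched(self):
            self.simple_patch(Config, 'fetch', return_value='fake value')
            self.assertEqual(Config().fetch(), 'fake value')

    if __name__ == '__main__':
        unittest.main()
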
Example #10
    def test_hadoop_runner_option_store(self):
        stderr = StringIO()
        with no_handlers_for_logger('mrjob.conf'):
            log_to_stream('mrjob.conf', stderr)

            # HadoopRunnerOptionStore really wants to find the streaming jar
            with patch.object(mrjob.hadoop,
                              'find_hadoop_streaming_jar',
                              return_value='found'):
                opts = HadoopRunnerOptionStore(
                    'hadoop',
                    dict(base_tmp_dir='/scratch',
                         hadoop_home='required',
                         hdfs_scratch_dir='hdfs:///scratch'), [])

            self.assertEqual(opts['local_tmp_dir'], '/scratch')
            self.assertNotIn('base_tmp_dir', opts)
            self.assertIn(
                'Deprecated option base_tmp_dir has been renamed'
                ' to local_tmp_dir', stderr.getvalue())

            self.assertEqual(opts['hadoop_tmp_dir'], 'hdfs:///scratch')
            self.assertNotIn('hdfs_scratch_dir', opts)
            self.assertIn(
                'Deprecated option hdfs_scratch_dir has been renamed'
                ' to hadoop_tmp_dir', stderr.getvalue())
Example #11
    def test_kill_cluster(self):
        with no_handlers_for_logger('mrjob.dataproc'):
            r = self._quick_runner()
            with patch.object(mrjob.dataproc.DataprocJobRunner,
                              '_api_cluster_delete') as m:
                r._cleanup_cluster()
                self.assertTrue(m.called)
Example #12
    def test_kill_persistent_cluster(self):
        with no_handlers_for_logger('mrjob.dataproc'):
            r = self._quick_runner()
            with patch.object(mrjob.dataproc.DataprocJobRunner,
                              '_api_cluster_delete') as m:
                r._opts['cluster_id'] = 'j-MOCKCLUSTER0'
                r._cleanup_cluster()
                self.assertTrue(m.called)
Example #13
    def test_verbose(self):
        with patch.object(sys, 'stderr', StringIO()) as stderr:
            MRJob.set_up_logging(verbose=True)
            log = logging.getLogger('__main__')
            log.info('INFO')
            log.debug('DEBUG')
            self.assertEqual(stderr.getvalue(), 'INFO\nDEBUG\n')
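
Here the third argument to patch.object() is an explicit replacement object (a StringIO), so sys.stderr is swapped for that exact object rather than an auto-created MagicMock, and the as clause binds the replacement. A stdlib-only sketch of the same idiom:

    import sys
    from io import StringIO
    from unittest.mock import patch

    with patch.object(sys, 'stderr', StringIO()) as stderr:
        print('captured', file=sys.stderr)

    # outside the block the real sys.stderr is back
    assert stderr.getvalue() == 'captured\n'
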
Example #14
    def test_kill_persistent_cluster(self):
        with no_handlers_for_logger('mrjob.dataproc'):
            r = self._quick_runner()
            with patch.object(mrjob.dataproc.DataprocJobRunner,
                              '_delete_cluster') as m:
                r._opts['cluster_id'] = 'j-MOCKCLUSTER0'
                r._cleanup_cluster()
                self.assertTrue(m.called)
Example #15
    def test_put_part_size_mb(self):
        local_path = self.makefile('foo', contents=b'bar')
        dest = 'gs://bar-files/foo'
        self.storage_client().bucket('bar-files').create()

        with patch.object(GCSFilesystem, '_blob') as blob_meth:
            self.fs.put(local_path, dest, part_size_mb=99999)
            blob_meth.assert_called_once_with(dest, chunk_size=99999)
Example #16
    def test_default_options(self):
        with no_handlers_for_logger('__main__'):
            with patch.object(sys, 'stderr', StringIO()) as stderr:
                MRJob.set_up_logging()
                log = logging.getLogger('__main__')
                log.info('INFO')
                log.debug('DEBUG')
                self.assertEqual(stderr.getvalue(), 'INFO\n')
Example #17
    def test_path_join(self):
        fs = Filesystem()

        with patch.object(fs, 'join'):
            with no_handlers_for_logger('mrjob.fs.base'):
                fs.path_join('foo', 'bar')

            fs.join.assert_called_once_with('foo', 'bar')
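
patch.object() also works on a single instance, as here: the join attribute is replaced by a MagicMock on that one Filesystem object, which is what makes fs.join.assert_called_once_with(...) available afterwards. A self-contained sketch (Greeter is a made-up class):

    from unittest.mock import patch

    class Greeter:
        def greet(self, name):
            return 'hello ' + name

    g = Greeter()
    with patch.object(g, 'greet', return_value='hi') as mock_greet:
        assert g.greet('bob') == 'hi'
        mock_greet.assert_called_once_with('bob')

    # the patch is removed on exit; the real method is back
    assert g.greet('bob') == 'hello bob'
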
Example #18
    def test_path_join(self):
        fs = Filesystem()

        with patch.object(fs, "join"):
            with no_handlers_for_logger("mrjob.fs.base"):
                fs.path_join("foo", "bar")

            fs.join.assert_called_once_with("foo", "bar")
Example #19
    def test_path_exists(self):
        fs = Filesystem()

        with patch.object(fs, "exists"):
            with no_handlers_for_logger("mrjob.fs.base"):
                fs.path_exists("foo")

            fs.exists.assert_called_once_with("foo")
Example #20
    def test_path_exists(self):
        fs = Filesystem()

        with patch.object(fs, 'exists'):
            with no_handlers_for_logger('mrjob.fs.base'):
                fs.path_exists('foo')

            fs.exists.assert_called_once_with('foo')
Example #21
    def test_too_many_jobs_on_the_dance_floor(self):
        def fake_popen(*args, **kwargs):
            m = Mock()
            m.communicate.return_value = (b"2 jobs currently running\n", b'')
            return m

        with patch.object(ssh, 'Popen', side_effect=fake_popen):
            self.assertRaises(IOError, ssh.ssh_terminate_single_job,
                              ['ssh_bin'], 'address', 'key.pem')
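
With side_effect set to a callable, every call to the patched Popen runs fake_popen() and returns its result: a fresh Mock whose communicate() yields canned (stdout, stderr) bytes, mirroring the real subprocess.Popen interface. A stdlib-only sketch of the pattern:

    import subprocess
    from unittest.mock import Mock, patch

    def fake_popen(*args, **kwargs):
        m = Mock()
        m.communicate.return_value = (b'2 jobs currently running\n', b'')
        return m

    with patch.object(subprocess, 'Popen', side_effect=fake_popen):
        proc = subprocess.Popen(['some', 'command'])
        stdout, stderr = proc.communicate()

    assert stdout == b'2 jobs currently running\n'
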
Example #22
    def test_kill_cluster_if_successful(self):
        # If cleanup is configured to kill the cluster, mrjob should
        # kill the cluster regardless of job success.
        with no_handlers_for_logger('mrjob.dataproc'):
            r = self._quick_runner()
            with patch.object(mrjob.dataproc.DataprocJobRunner,
                              '_api_cluster_delete') as m:
                r._ran_job = True
                r._cleanup_cluster()
                self.assertTrue(m.called)
Example #23
    def test_junk_list_output(self):
        def fake_popen(*args, **kwargs):
            m = Mock()
            m.communicate.return_value = (b"yah output, its gahbage\n", b'')
            return m

        with patch.object(ssh, 'Popen', side_effect=fake_popen):
            self.assertRaises(IOError, ssh.ssh_terminate_single_job,
                              ['ssh_bin'], 'address', 'key.pem')
Example #24
    def test_libjars_attr_relative_path(self):
        job_dir = os.path.dirname(MRJob.mr_job_script())

        with patch.object(MRJob, "LIBJARS", ["cookie.jar", "/left/dora.jar"]):
            job = MRJob()

            self.assertEqual(
                job.job_runner_kwargs()["libjars"], [os.path.join(job_dir, "cookie.jar"), "/left/dora.jar"]
            )
Example #25
    def test_libjars_attr_relative_path(self):
        job_dir = os.path.dirname(MRJob.mr_job_script())

        with patch.object(MRJob, 'LIBJARS', ['cookie.jar', '/left/dora.jar']):
            job = MRJob()

            self.assertEqual(
                job._runner_kwargs()['libjars'],
                [os.path.join(job_dir, 'cookie.jar'), '/left/dora.jar'])
Example #26
    def test_no_output(self):
        launcher = MRJobLauncher(args=['--no-conf', '--no-output', ''])
        launcher.sandbox()
        with patch.object(launcher, 'make_runner') as m_make_runner:
            runner = Mock()
            _mock_context_mgr(m_make_runner, runner)
            runner.stream_output.return_value = ['a line']
            launcher.run_job()
            self.assertEqual(launcher.stdout.getvalue(), b'')
            self.assertEqual(launcher.stderr.getvalue(), b'')
Example #27
    def test_no_mrjob_confs(self):
        with patch.object(conf, 'real_mrjob_conf_path', return_value=None):
            mr_job = MRIncrementerJob(['-r', 'inline', '--times', '2'])
            mr_job.sandbox(stdin=BytesIO(b'0\n1\n2\n'))

            with mr_job.make_runner() as runner:
                runner.run()
                output = sorted(mr_job.parse_output_line(line)[1]
                                for line in runner.stream_output())
                self.assertEqual(output, [2, 3, 4])
Example #28
    def setUp(self):
        def error(msg=None):
            if msg:
                raise ValueError(msg)
            else:
                raise ValueError

        p = patch.object(cmd, 'error', side_effect=error)
        p.start()
        self.addCleanup(p.stop)
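
When side_effect is a callable, the mock invokes it with the same arguments it was called with and propagates whatever it returns or raises; here that turns cmd.error(...) into a catchable ValueError. A minimal sketch:

    from unittest.mock import Mock

    def error(msg=None):
        if msg:
            raise ValueError(msg)
        else:
            raise ValueError

    mock_error = Mock(side_effect=error)

    try:
        mock_error('bad args')
    except ValueError as exc:
        assert str(exc) == 'bad args'
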
Example #29
    def test_configuration_translation(self):
        job = MRWordCount(["--jobconf", "mapred.jobtracker.maxtasks.per.job=1"])

        with job.make_runner() as runner:
            with no_handlers_for_logger("mrjob.runner"):
                with patch.object(runner, "get_hadoop_version", return_value="2.7.1"):
                    self.assertEqual(
                        runner._hadoop_args_for_step(0),
                        ["-D", "mapred.jobtracker.maxtasks.per.job=1", "-D", "mapreduce.jobtracker.maxtasks.perjob=1"],
                    )
Example #30
    def test_dance_floor_is_empty(self):
        def fake_popen(*args, **kwargs):
            m = Mock()
            m.communicate.return_value = (b"0 jobs currently running\n", b'')
            return m

        with patch.object(ssh, 'Popen', side_effect=fake_popen):
            self.assertEqual(
                None,
                ssh.ssh_terminate_single_job(['ssh_bin'], 'address',
                                             'key.pem'))
Example #31
    def test_put_chunk_size(self):
        local_path = self.makefile('foo', contents=b'bar')
        dest = 'gs://bar-files/foo'
        self.storage_client().bucket('bar-files').create()

        with patch.object(GCSFilesystem, '_blob') as blob_meth:
            with patch('mrjob.fs.gcs.log') as log:

                self.fs.put(local_path, dest, chunk_size=99999)
                blob_meth.assert_called_once_with(dest, chunk_size=99999)

                self.assertTrue(log.warning.called)
Example #32
    def test_libjars_environment_variables(self):
        job_dir = os.path.dirname(MRJob.mr_job_script())

        with patch.dict("os.environ", A="/path/to/a", B="b"):
            with patch.object(MRJob, "LIBJARS", ["$A/cookie.jar", "$B/honey.jar"]):
                job = MRJob()

                # libjars() peeks into envvars to figure out if the path
                # is relative or absolute
                self.assertEqual(
                    job.job_runner_kwargs()["libjars"], ["$A/cookie.jar", os.path.join(job_dir, "$B/honey.jar")]
                )
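
patch.dict() is the companion helper for mappings: it adds or overrides keys in os.environ for the duration of the block and restores the original contents afterwards. A stdlib-only sketch (the variable name is made up and assumed not to be set):

    import os
    from unittest.mock import patch

    with patch.dict('os.environ', MY_FAKE_VAR='42'):
        assert os.environ['MY_FAKE_VAR'] == '42'

    # the key added inside the block is gone again
    assert 'MY_FAKE_VAR' not in os.environ
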
Example #33
    def setUp(self):
        self._dataproc_client = MockDataprocClient(self)
        self._gcs_client = MockGCSClient(self)
        self._gcs_fs = self._gcs_client._fs

        self.start(patch.object(
            DataprocJobRunner, 'api_client', self._dataproc_client))

        self.gcs_patch_api_client = patch.object(
            GCSFilesystem, 'api_client', self._gcs_client)
        self.gcs_patch_download_io = patch.object(
            GCSFilesystem, '_download_io', self._gcs_client.download_io)
        self.gcs_patch_upload_io = patch.object(
            GCSFilesystem, '_upload_io', self._gcs_client.upload_io)
        self.start(self.gcs_patch_api_client)
        self.start(self.gcs_patch_download_io)
        self.start(self.gcs_patch_upload_io)

        self.start(patch('mrjob.dataproc._read_gcloud_config',
                         lambda: _GCLOUD_CONFIG))

        super(MockGoogleAPITestCase, self).setUp()

        # patch slow things
        def fake_create_mrjob_tar_gz(mocked_self, *args, **kwargs):
            mocked_self._mrjob_tar_gz_path = self.fake_mrjob_tgz_path
            return self.fake_mrjob_tgz_path

        self.start(patch.object(
            DataprocJobRunner, '_create_mrjob_tar_gz',
            fake_create_mrjob_tar_gz))

        self.start(patch.object(time, 'sleep'))
Example #34
    def setUp(self):
        self._dataproc_client = MockDataprocClient(self)
        self._gcs_client = MockGCSClient(self)
        self._gcs_fs = self._gcs_client._fs

        self.start(patch.object(
            DataprocJobRunner, 'api_client', self._dataproc_client))

        self.gcs_patch_api_client = patch.object(
            GCSFilesystem, 'api_client', self._gcs_client)
        self.gcs_patch_download_io = patch.object(
            GCSFilesystem, '_download_io', self._gcs_client.download_io)
        self.gcs_patch_upload_io = patch.object(
            GCSFilesystem, '_upload_io', self._gcs_client.upload_io)
        self.start(self.gcs_patch_api_client)
        self.start(self.gcs_patch_download_io)
        self.start(self.gcs_patch_upload_io)

        self.start(patch('mrjob.dataproc._read_gcloud_config',
                         lambda: _GCLOUD_CONFIG))

        super(MockGoogleAPITestCase, self).setUp()

        # patch slow things
        def fake_create_mrjob_zip(mocked_self, *args, **kwargs):
            mocked_self._mrjob_zip_path = self.fake_mrjob_zip_path
            return self.fake_mrjob_zip_path

        self.start(patch.object(
            DataprocJobRunner, '_create_mrjob_zip',
            fake_create_mrjob_zip))

        self.start(patch.object(time, 'sleep'))
Example #35
    def setUp(self):
        """disable all logging handlers
        """
        # Extra logging messages were cluttering Travis CI. See #1793
        super(BasicTestCase, self).setUp()

        for name in ['', '__main__', 'mrjob']:
            log = logging.getLogger(name)
            self.start(patch.object(log, 'handlers', []))

            if not name:
                # add a dummy handler to the root logger
                log.addHandler(NullHandler())
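
patch.object() is not limited to callables; here it swaps each logger's handlers list (a plain data attribute) for an empty list, and the registered cleanup puts the original handlers back at teardown. A stdlib-only sketch:

    import logging
    from unittest.mock import patch

    log = logging.getLogger('patch_example')
    log.addHandler(logging.StreamHandler())

    with patch.object(log, 'handlers', []):
        assert log.handlers == []   # silenced while patched

    assert len(log.handlers) == 1   # original handler restored
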
Example #36
    def test_libjars_environment_variables(self):
        job_dir = os.path.dirname(MRJob.mr_job_script())

        with patch.dict('os.environ', A='/path/to/a', B='b'):
            with patch.object(MRJob, 'LIBJARS',
                              ['$A/cookie.jar', '$B/honey.jar']):
                job = MRJob()

                # libjars() peeks into envvars to figure out if the path
                # is relative or absolute
                self.assertEqual(
                    job._runner_kwargs()['libjars'],
                    ['$A/cookie.jar', os.path.join(job_dir, '$B/honey.jar')])
Example #37
    def test_configuration_translation(self):
        job = MRWordCount(
            ['--jobconf', 'mapred.jobtracker.maxtasks.per.job=1'])

        with job.make_runner() as runner:
            with no_handlers_for_logger('mrjob.runner'):
                with patch.object(runner,
                                  'get_hadoop_version', return_value='2.7.1'):
                    self.assertEqual(
                        runner._hadoop_args_for_step(0),
                        ['-D', 'mapred.jobtracker.maxtasks.per.job=1',
                         '-D', 'mapreduce.jobtracker.maxtasks.perjob=1'
                         ])
Example #38
    def test_junk_kill_output(self):
        values = [self.GOOD_LIST_OUTPUT, b"yah output, its gahbage\n"]

        def fake_popen(*args, **kwargs):
            m = Mock()
            m.communicate.return_value = (values.pop(0), b'')
            return m

        with patch.object(ssh, 'Popen', side_effect=fake_popen):
            self.assertEqual(
                ssh.ssh_terminate_single_job(
                    ['ssh_bin'], 'address', 'key.pem'),
                'yah output, its gahbage\n')
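
Popping from values gives a different communicate() result on each call: the good list output first, junk on the kill step. The same per-call sequencing is built into mock itself when side_effect is an iterable; a minimal sketch:

    from unittest.mock import Mock

    # an iterable side_effect yields one result per call, in order
    m = Mock(side_effect=[b'good output\n', b'junk\n'])
    assert m() == b'good output\n'
    assert m() == b'junk\n'
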