Example 1
    def setUp(self):
        super(StreamingArgsTestCase, self).setUp()
        self.runner = HadoopJobRunner(hadoop_bin='hadoop',
                                      hadoop_streaming_jar='<streaming jar>',
                                      mr_job_script='my_job.py',
                                      stdin=BytesIO())
        self.runner._add_job_files_for_upload()

        self.start(
            patch.object(self.runner,
                         '_upload_args',
                         return_value=['<upload args>']))
        self.start(
            patch.object(self.runner,
                         '_hadoop_args_for_step',
                         return_value=['<hadoop args for step>']))
        self.start(
            patch.object(self.runner,
                         '_hdfs_step_input_files',
                         return_value=['<hdfs step input files>']))
        self.start(
            patch.object(self.runner,
                         '_hdfs_step_output_dir',
                         return_value='<hdfs step output dir>'))
        self.start(
            patch.object(HadoopFilesystem,
                         'get_hadoop_version',
                         return_value='2.7.1'))
        self.runner._script_path = 'my_job.py'
Example 2
    def make_runner(self):
        """Make a runner based on command-line arguments, so we can
        launch this job on EMR, on Hadoop, or locally.

        :rtype: :py:class:`mrjob.runner.MRJobRunner`
        """
        if self.options.runner == 'emr':
            # avoid requiring dependencies (such as boto3) for other runners
            from mrjob.emr import EMRJobRunner
            return EMRJobRunner(**self.emr_job_runner_kwargs())

        elif self.options.runner == 'dataproc':
            from mrjob.dataproc import DataprocJobRunner
            return DataprocJobRunner(**self.dataproc_job_runner_kwargs())

        elif self.options.runner == 'hadoop':
            from mrjob.hadoop import HadoopJobRunner
            return HadoopJobRunner(**self.hadoop_job_runner_kwargs())

        elif self.options.runner == 'inline':
            raise ValueError("inline is not supported in the multi-lingual"
                             " launcher.")

        else:
            # run locally by default
            from mrjob.local import LocalMRJobRunner
            return LocalMRJobRunner(**self.local_job_runner_kwargs())
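For context, a minimal sketch (not part of the original listing) of how make_runner() is typically consumed: the returned runner is used as a context manager so its temporary and upload directories are cleaned up even if the job fails. The method name run_my_job is hypothetical; the with-block itself is the standard mrjob idiom.

    def run_my_job(self):
        # Hypothetical wrapper (assumed standard mrjob usage, not from the
        # source): whichever branch of make_runner() is taken (-r emr,
        # -r dataproc, -r hadoop, or local by default), the with-block
        # guarantees cleanup when the run finishes or raises.
        with self.make_runner() as runner:
            runner.run()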
Example 3
    def test_hadoop_mapred_home_beats_infer_from_hadoop_bin(self):
        self.runner = HadoopJobRunner(
            hadoop_bin=['/ha/do/op/bin-parent/bin/hadoop'])

        self.mock_paths.append('/ha/do/op/bin-parent/hadoop-streaming.jar')

        self.test_hadoop_mapred_home()
Example 4
    def test_infer_from_hadoop_bin_parent_dir(self):
        self.runner = HadoopJobRunner(
            hadoop_bin=['/ha/do/op/bin-parent/bin/hadoop'])

        self.mock_paths.append('/ha/do/op/bin-parent/hadoop-streaming.jar')
        self.assertEqual(self.runner._find_hadoop_streaming_jar(),
                         '/ha/do/op/bin-parent/hadoop-streaming.jar')
Example 5
    def make_runner(self):
        """Make a runner based on command-line arguments, so we can
        launch this job on EMR, on Hadoop, or locally.

        :rtype: :py:class:`mrjob.runner.MRJobRunner`
        """
        # have to import here so that we can still run the MRJob
        # without importing boto
        from mrjob.emr import EMRJobRunner
        from mrjob.hadoop import HadoopJobRunner
        from mrjob.local import LocalMRJobRunner

        if self.options.runner == 'emr':
            return EMRJobRunner(**self.emr_job_runner_kwargs())

        elif self.options.runner == 'hadoop':
            return HadoopJobRunner(**self.hadoop_job_runner_kwargs())

        elif self.options.runner == 'inline':
            raise ValueError("inline is not supported in the multi-lingual"
                             " launcher.")

        else:
            # run locally by default
            return LocalMRJobRunner(**self.local_job_runner_kwargs())
Example 6
    def setUp(self):
        super(HadoopLogDirsTestCase, self).setUp()

        os.environ.clear()

        self.mock_hadoop_version = '2.7.0'
        # the result of _hadoop_dirs(). This handles non-log-specific
        # environment variables, such as $HADOOP_PREFIX, and also guesses
        # based on the path of the Hadoop binary
        self.mock_hadoop_dirs = []

        def mock_get_hadoop_version():
            return self.mock_hadoop_version

        def mock_hadoop_dirs_method():
            return (d for d in self.mock_hadoop_dirs)

        self.start(
            patch('mrjob.hadoop.HadoopJobRunner.get_hadoop_version',
                  side_effect=mock_get_hadoop_version))
        self.start(
            patch('mrjob.hadoop.HadoopJobRunner._hadoop_dirs',
                  side_effect=mock_hadoop_dirs_method))

        self.runner = HadoopJobRunner()
Example 7
    def test_hadoop_runner_cluster_mode(self):
        runner = HadoopJobRunner(spark_deploy_mode='cluster')

        self.assertEqual(runner._logs_needed_to_pick_error('streaming'),
                         ('step', 'history', 'task'))
        self.assertEqual(runner._logs_needed_to_pick_error('spark'),
                         ('step', 'task'))
Example 8
    def setUp(self):
        super(StreamingArgsTestCase, self).setUp()
        self.runner = HadoopJobRunner(
            hadoop_bin='hadoop', hadoop_streaming_jar='streaming.jar',
            mr_job_script='my_job.py', stdin=StringIO())
        self.runner._add_job_files_for_upload()

        self.runner._hadoop_version = '0.20.204'
        self.simple_patch(self.runner, '_new_upload_args',
                          return_value=['new_upload_args'])
        self.simple_patch(self.runner, '_old_upload_args',
                          return_value=['old_upload_args'])
        self.simple_patch(self.runner, '_hadoop_args_for_step',
                          return_value=['hadoop_args_for_step'])
        self.simple_patch(self.runner, '_hdfs_step_input_files',
                          return_value=['hdfs_step_input_files'])
        self.simple_patch(self.runner, '_hdfs_step_output_dir',
                          return_value='hdfs_step_output_dir')
        self.runner._script_path = 'my_job.py'

        self._new_basic_args = [
            'hadoop', 'jar', 'streaming.jar',
            'new_upload_args', 'hadoop_args_for_step',
            '-input', 'hdfs_step_input_files',
            '-output', 'hdfs_step_output_dir']

        self._old_basic_args = [
            'hadoop', 'jar', 'streaming.jar',
            'hadoop_args_for_step',
            '-input', 'hdfs_step_input_files',
            '-output', 'hdfs_step_output_dir',
            'old_upload_args']
Example 9
    def setUp(self):
        super(StreamingArgsTestCase, self).setUp()
        self.runner = HadoopJobRunner(
            hadoop_bin='hadoop', hadoop_streaming_jar='streaming.jar',
            mr_job_script='my_job.py', stdin=BytesIO())
        self.runner._add_job_files_for_upload()

        self.start(patch.object(self.runner, '_upload_args',
                                return_value=['new_upload_args']))
        self.start(patch.object(self.runner, '_pre_0_20_upload_args',
                                return_value=['old_upload_args']))
        self.start(patch.object(self.runner, '_hadoop_args_for_step',
                                return_value=['hadoop_args_for_step']))
        self.start(patch.object(self.runner, '_hdfs_step_input_files',
                                return_value=['hdfs_step_input_files']))
        self.start(patch.object(self.runner, '_hdfs_step_output_dir',
                                return_value='hdfs_step_output_dir'))
        self.start(patch.object(HadoopFilesystem, 'get_hadoop_version',
                                return_value='1.2.0'))
        self.runner._script_path = 'my_job.py'

        self._new_basic_args = [
            'hadoop', 'jar', 'streaming.jar',
            'new_upload_args', 'hadoop_args_for_step',
            '-input', 'hdfs_step_input_files',
            '-output', 'hdfs_step_output_dir']

        self._old_basic_args = [
            'hadoop', 'jar', 'streaming.jar',
            'hadoop_args_for_step',
            '-input', 'hdfs_step_input_files',
            '-output', 'hdfs_step_output_dir',
            'old_upload_args']
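The two lists above encode the expected shape of the generated hadoop streaming command line: on newer Hadoop the upload args follow the jar directly, while pre-0.20 upload args are appended at the end. As a hedged illustration of how such a fixture is typically exercised, here is a sketch of an assertion; the internal method name _args_for_streaming_step and the step dict are assumptions, not taken from the source.

    def test_streaming_args_sketch(self):
        # Hypothetical test body: _args_for_streaming_step is an ASSUMED
        # internal name, and the mapper-only step dict is illustrative.
        self.runner._steps = [
            {'type': 'streaming', 'mapper': {'type': 'script'}}]
        self.assertEqual(self.runner._args_for_streaming_step(0),
                         self._new_basic_args)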
Example 10
    def test_hadoop_runner_client_mode(self):
        runner = HadoopJobRunner()

        self.assertEqual(runner._logs_needed_to_pick_error('streaming'),
                         ('step', 'history', 'task'))
        self.assertEqual(runner._logs_needed_to_pick_error('spark'),
                         ('step',))
Example 11
    def test_infer_from_hadoop_bin_realpath(self):
        with patch('posixpath.realpath', return_value='/ha/do/op/bin'):
            self.runner = HadoopJobRunner(hadoop_bin=['/usr/bin/hadoop'])
            self.mock_paths.append('/ha/do/op/hadoop-streaming.jar')

            self.assertEqual(self.runner._find_hadoop_streaming_jar(),
                             '/ha/do/op/hadoop-streaming.jar')
Example 12
    def test_hadoop_log_dirs_opt(self):
        self.runner = HadoopJobRunner(hadoop_log_dirs=['/logs1', '/logs2'])

        os.environ['HADOOP_LOG_DIR'] = '/path/to/hadoop-log-dir'

        # setting hadoop_log_dirs short-circuits automatic discovery of logs
        self.assertEqual(list(self.runner._hadoop_log_dirs()),
                         ['/logs1', '/logs2'])
Example 13
    def setUp(self):
        super(StreamingLogDirsTestCase, self).setUp()

        self.log = self.start(patch('mrjob.hadoop.log'))

        self.runner = HadoopJobRunner()
        self.runner._hadoop_log_dirs = Mock(return_value=[])
        self.runner.fs.exists = Mock(return_value=True)

        self.log.reset_mock()  # ignore logging from HadoopJobRunner init
Example 14
    def test_hadoop_home_regression(self):
        # kill $HADOOP_HOME if it exists
        try:
            del os.environ['HADOOP_HOME']
        except KeyError:
            pass

        with patch('mrjob.hadoop.find_hadoop_streaming_jar',
                   return_value='some.jar'):
            HadoopJobRunner(hadoop_home=self.tmp_dir, conf_paths=[])
Example 15
    def test_uris(self):
        runner = HadoopJobRunner()
        list(runner.ls('hdfs://tmp/waffles'))
        list(runner.ls('lego://my/ego'))
        list(runner.ls('/tmp'))

        with open(os.environ['MOCK_HADOOP_LOG']) as mock_log:
            hadoop_cmd_args = [shlex.split(line) for line in mock_log]

        assert_equal(hadoop_cmd_args, [
            ['fs', '-lsr', 'hdfs://tmp/waffles'],
            ['fs', '-lsr', 'lego://my/ego'],
        ])
Example 16
    def setUp(self):
        super(HadoopStreamingJarTestCase, self).setUp()

        self.mock_paths = []

        def mock_ls(path):  # don't bother to support globs
            return (p for p in sorted(self.mock_paths) if p.startswith(path))

        self.start(patch('mrjob.fs.local.LocalFilesystem.ls',
                         side_effect=mock_ls))

        os.environ.clear()

        self.runner = HadoopJobRunner()
Example 17
    def test_cat_compressed(self):
        input_gz_path = os.path.join(self.tmp_dir, 'input.gz')
        input_gz = gzip.GzipFile(input_gz_path, 'w')
        input_gz.write('foo\nbar\n')
        input_gz.close()

        with HadoopJobRunner(cleanup=['NONE']) as runner:
            output = []
            for line in runner.cat(input_gz_path):
                output.append(line)

        assert_equal(output, ['foo\n', 'bar\n'])

        input_bz2_path = os.path.join(self.tmp_dir, 'input.bz2')
        input_bz2 = bz2.BZ2File(input_bz2_path, 'w')
        input_bz2.write('bar\nbar\nfoo\n')
        input_bz2.close()

        with HadoopJobRunner(cleanup=['NONE']) as runner:
            output = []
            for line in runner.cat(input_bz2_path):
                output.append(line)

        assert_equal(output, ['bar\n', 'bar\n', 'foo\n'])
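Note that this snippet is Python 2 era: on Python 3, GzipFile and BZ2File opened for writing require bytes, and runner.cat() semantics vary across mrjob versions. A minimal sketch of just the file-writing half under Python 3 (an assumption layered on top of the original, not a drop-in fix for the whole test):

        # Python 3: compressed file objects want bytes, hence the b'' literals.
        with gzip.GzipFile(input_gz_path, 'wb') as input_gz:
            input_gz.write(b'foo\nbar\n')
        with bz2.BZ2File(input_bz2_path, 'wb') as input_bz2:
            input_bz2.write(b'bar\nbar\nfoo\n')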
Example 18
    def test_pass_through_fields(self):
        # TODO: currently can't initialize HadoopRunner without setting these
        runner = HadoopJobRunner(hadoop_bin='hadoooooooooop',
                                 hadoop_home='kansas',
                                 hadoop_streaming_jar='streaming.jar')

        with no_handlers_for_logger('mrjob.runner'):
            stderr = StringIO()
            log_to_stream('mrjob.runner', stderr)

            self.assertEqual(runner._hadoop_bin, runner.fs._hadoop_bin)

            # deprecation warning is different for non-functions
            self.assertIn(
                'deprecated: access HadoopJobRunner.fs._hadoop_bin directly',
                stderr.getvalue())
Example 19
    def test_prefer_own_methods(self):
        # TODO: currently can't initialize HadoopRunner without setting these
        runner = HadoopJobRunner(hadoop_bin='hadoop',
                                 hadoop_home='kansas',
                                 hadoop_streaming_jar='streaming.jar')

        with no_handlers_for_logger('mrjob.runner'):
            stderr = StringIO()
            log_to_stream('mrjob.runner', stderr)

            self.assertEqual(runner.ls, runner.fs.ls)

            # Hadoop Runner has its own version
            self.assertNotEqual(runner.get_hadoop_version,
                                runner.fs.get_hadoop_version)

            self.assertIn('deprecated: call HadoopJobRunner.fs.ls() directly',
                          stderr.getvalue())
            self.assertNotIn('get_hadoop_version', stderr.getvalue())
Example 20
    def test_cat_uncompressed(self):
        local_input_path = os.path.join(self.tmp_dir, 'input')
        with open(local_input_path, 'w') as input_file:
            input_file.write('bar\nfoo\n')

        input_to_upload = os.path.join(self.tmp_dir, 'remote_input')
        with open(input_to_upload, 'w') as input_to_upload_file:
            input_to_upload_file.write('foo\nfoo\n')
        remote_input_path = 'hdfs:///data/foo'
        check_call([
            self.hadoop_bin, 'fs', '-put', input_to_upload, remote_input_path
        ])

        with HadoopJobRunner(cleanup=['NONE']) as runner:
            local_output = []
            for line in runner.cat(local_input_path):
                local_output.append(line)

            remote_output = []
            for line in runner.cat(remote_input_path):
                remote_output.append(line)

        assert_equal(local_output, ['bar\n', 'foo\n'])
        assert_equal(remote_output, ['foo\n', 'foo\n'])
Example 21
    def setUp(self):
        super(PickErrorTestCase, self).setUp()

        os.environ['MOCK_HADOOP_VERSION'] = '2.7.0'

        self.runner = HadoopJobRunner()
Example 22
    def setUp(self):
        super(FindProbableCauseOfFailureTestCase, self).setUp()

        os.environ['MOCK_HADOOP_VERSION'] = '2.7.0'

        self.runner = HadoopJobRunner()
Example 23
    def test_missing_hadoop_version(self):
        with patch.dict('os.environ', MOCK_HADOOP_VERSION=''):
            runner = HadoopJobRunner()
            self.assertRaises(Exception, runner.get_hadoop_version)
Example 24
    def test_get_hadoop_version(self):
        runner = HadoopJobRunner()
        self.assertEqual(runner.get_hadoop_version(), '1.2.0')
Example 25
    def test_dont_infer_from_usr_local_bin_hadoop(self):
        self.runner = HadoopJobRunner(hadoop_bin=['/usr/local/bin/hadoop'])
        self.mock_paths.append('/usr/local/hadoop-streaming.jar')

        self.assertEqual(self.runner._find_hadoop_streaming_jar(), None)
Example 26
from mrjob.hadoop import HadoopJobRunner

#x = HadoopJobRunner(conf_path="/nfs/ruby/calvin/.mrjob", mr_job_script="mr_sha1.py", hadoop_input_format="org.apache.hadoop.mapred.SequenceFileAsTextInputFormat")
x = HadoopJobRunner(conf_path="/nfs/ruby/calvin/.mrjob",
                    mr_job_script="mr_sha1.py",
                    hadoop_input_format=
                    "org.apache.hadoop.mapred.SequenceFileAsBinaryInputFormat")
#x = HadoopJobRunner(hadoop_input_format="org.apache.hadoop.mapred.SequenceFileAsTextInputFormat")
x.run()
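A hedged variant of the same call (assumptions: the plural conf_paths spelling used by newer mrjob, and running the runner as a context manager so temp files are cleaned up; the script name, conf path, and input format are the ones from the snippet above):

runner = HadoopJobRunner(
    conf_paths=['/nfs/ruby/calvin/.mrjob'],  # newer, plural spelling of conf_path
    mr_job_script='mr_sha1.py',
    hadoop_input_format=(
        'org.apache.hadoop.mapred.SequenceFileAsBinaryInputFormat'))

with runner:  # runners are context managers; cleanup happens on exit
    runner.run()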
Example 27
    def test_deprecated_hadoop_home_option(self):
        self.runner = HadoopJobRunner(hadoop_home='/ha/do/op/home-option')

        self.mock_paths.append('/ha/do/op/home-option/hadoop-streaming.jar')
        self.assertEqual(self.runner._find_hadoop_streaming_jar(),
                         '/ha/do/op/home-option/hadoop-streaming.jar')